Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 112 additions & 11 deletions ext/cgi/escape/escape.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "ruby.h"
#include "ruby/encoding.h"
#include <stdbool.h>

RUBY_EXTERN unsigned long ruby_scan_digits(const char *str, ssize_t len, int base, size_t *retlen, int *overflow);
RUBY_EXTERN const char ruby_hexdigits[];
Expand Down Expand Up @@ -33,21 +34,21 @@ preserve_original_state(VALUE orig, VALUE dest)
}

static inline long
escaped_length(VALUE str)
escaped_length(VALUE str, long escape_max_len)
{
const long len = RSTRING_LEN(str);
if (len >= LONG_MAX / HTML_ESCAPE_MAX_LEN) {
ruby_malloc_size_overflow(len, HTML_ESCAPE_MAX_LEN);
if (len >= LONG_MAX / escape_max_len) {
ruby_malloc_size_overflow(len, escape_max_len);
}
return len * HTML_ESCAPE_MAX_LEN;
return len * escape_max_len;
}

static VALUE
optimized_escape_html(VALUE str)
{
VALUE escaped;
VALUE vbuf;
char *buf = ALLOCV_N(char, vbuf, escaped_length(str));
char *buf = ALLOCV_N(char, vbuf, escaped_length(str, HTML_ESCAPE_MAX_LEN));
const char *cstr = RSTRING_PTR(str);
const char *end = cstr + RSTRING_LEN(str);

Expand Down Expand Up @@ -75,6 +76,93 @@ optimized_escape_html(VALUE str)
return escaped;
}

/*
 * State passed (cast to VALUE) from dynamic_escape_html() to the
 * build_escape_table_i() hash-iteration callback.
 */
struct build_escape_table_args {
/* Byte length of the longest replacement string seen so far. */
long max_escape_length;
/* Lookup table indexed by the key's single byte; filled in by the callback. */
VALUE *escape_table;
/* Set to true when any replacement value is not pure 7-bit ASCII. */
bool non_ascii_value;
};

/*
 * rb_hash_foreach() callback: validates one key/value pair of the
 * user-supplied escape table and records it in the lookup table.
 *
 * key  - must be a String made of exactly one byte below 0x80.
 * val  - must be a String; it becomes the replacement for that byte.
 * _arg - a struct build_escape_table_args * cast to VALUE.
 *
 * Raises TypeError (via Check_Type) for non-String keys/values and
 * ArgumentError for keys that are not a single ASCII character.
 * Always returns ST_CONTINUE so that every pair is visited.
 */
static int
build_escape_table_i(VALUE key, VALUE val, VALUE _arg)
{
    struct build_escape_table_args *const args = (struct build_escape_table_args *)_arg;
    unsigned char byte;
    long replacement_len;

    Check_Type(key, T_STRING);
    Check_Type(val, T_STRING);

    /* The first byte is only inspected once the length is known to be 1. */
    if (RSTRING_LEN(key) != 1 || (unsigned char)RSTRING_PTR(key)[0] >= 0x80) {
        rb_raise(rb_eArgError, "CGI.escapeHTML keys must be single ASCII characters");
    }
    byte = (unsigned char)RSTRING_PTR(key)[0];

    /* Remember that at least one replacement is not plain 7-bit ASCII. */
    if (rb_enc_str_coderange(val) != ENC_CODERANGE_7BIT) {
        args->non_ascii_value = true;
    }

    args->escape_table[byte] = val;

    /* Track the longest replacement seen so far. */
    replacement_len = RSTRING_LEN(val);
    if (replacement_len > args->max_escape_length) {
        args->max_escape_length = replacement_len;
    }

    return ST_CONTINUE;
}

/*
 * Escapes +str+ byte by byte using a caller-supplied Hash that maps
 * single ASCII characters to replacement strings.
 *
 * str             - the String to escape.
 * rb_escape_table - a Hash of { single ASCII char => replacement String }.
 *
 * Returns a new String. Returns Qundef when any replacement value is not
 * 7-bit ASCII, so the caller can choose another code path.
 * Raises (from build_escape_table_i) TypeError/ArgumentError for invalid
 * table entries.
 */
static VALUE
dynamic_escape_html(VALUE str, VALUE rb_escape_table)
{
    VALUE escape_table[UCHAR_MAX+1] = {0};
    VALUE vbuf, escaped;
    char *buf, *dest;
    const char *cstr, *end;
    bool modified = false;

    struct build_escape_table_args arg = {
        /*
         * Bytes without a table entry are copied through verbatim, so each
         * input byte can need one output byte even when the table is empty
         * or every replacement is shorter than that.  Starting at 1 (not 0)
         * keeps the buffer large enough and avoids dividing by zero in
         * escaped_length().
         */
        .max_escape_length = 1,
        .escape_table = escape_table,
    };
    rb_hash_foreach(rb_escape_table, build_escape_table_i, (VALUE)&arg);

    if (arg.non_ascii_value) {
        return Qundef;
    }

    buf = ALLOCV_N(char, vbuf, escaped_length(str, arg.max_escape_length));
    cstr = RSTRING_PTR(str);
    end = cstr + RSTRING_LEN(str);

    dest = buf;
    while (cstr < end) {
        const unsigned char c = *cstr++;
        VALUE escaped_character = escape_table[c];
        if (escaped_character) {
            const char *ptr;
            long len;
            RSTRING_GETMEM(escaped_character, ptr, len);
            MEMCPY(dest, ptr, char, len);
            dest += len;
            modified = true;
        }
        else {
            *dest++ = c;
        }
    }

    /*
     * Decide by whether a replacement happened, not by comparing lengths:
     * a replacement that is as long as (or shorter than) its key changes
     * the content without making the result longer.
     */
    if (modified) {
        escaped = rb_str_new(buf, dest - buf);
        preserve_original_state(str, escaped);
    }
    else {
        escaped = rb_str_dup(str);
    }
    ALLOCV_END(vbuf);
    return escaped;
}

static VALUE
optimized_unescape_html(VALUE str)
{
Expand Down Expand Up @@ -331,16 +419,29 @@ optimized_unescape(VALUE str, VALUE encoding, int unescape_plus)
*
*/
static VALUE
cgiesc_escape_html(VALUE self, VALUE str)
cgiesc_escape_html(int argc, VALUE *argv, VALUE self)
{
VALUE str;
rb_check_arity(argc, 1, 2);

str = argv[0];
StringValue(str);

if (rb_enc_str_asciicompat_p(str)) {
return optimized_escape_html(str);
}
else {
return rb_call_super(1, &str);
if (argc == 1) {
return optimized_escape_html(str);
}
else {
VALUE result;
Check_Type(argv[1], T_HASH);
result = dynamic_escape_html(str, argv[1]);
if (result != Qundef) {
return result;
}
}
}

return rb_call_super(argc, argv);
}

/*
Expand Down Expand Up @@ -474,7 +575,7 @@ InitVM_escape(void)
rb_cCGI = rb_define_class("CGI", rb_cObject);
rb_mEscapeExt = rb_define_module_under(rb_cCGI, "EscapeExt");
rb_mEscape = rb_define_module_under(rb_cCGI, "Escape");
rb_define_method(rb_mEscapeExt, "escapeHTML", cgiesc_escape_html, 1);
rb_define_method(rb_mEscapeExt, "escapeHTML", cgiesc_escape_html, -1);
rb_define_method(rb_mEscapeExt, "unescapeHTML", cgiesc_unescape_html, 1);
rb_define_method(rb_mEscapeExt, "escapeURIComponent", cgiesc_escape_uri_component, 1);
rb_define_alias(rb_mEscapeExt, "escape_uri_component", "escapeURIComponent");
Expand Down
30 changes: 26 additions & 4 deletions lib/cgi/escape.rb
Original file line number Diff line number Diff line change
Expand Up @@ -80,21 +80,43 @@ def unescapeURIComponent(string, encoding = @@accept_charset)
# Escape special characters in HTML, namely '&\"<>
# CGI.escapeHTML('Usage: foo "bar" <baz>')
# # => "Usage: foo &quot;bar&quot; &lt;baz&gt;"
def escapeHTML(string)
def escapeHTML(string, escape_table = nil)
enc = string.encoding

if escape_table
escape_table.each_key do |key|
if key.bytesize != 1 || !key.ascii_only?
raise ArgumentError, "CGI.escapeHTML keys must be single ASCII characters"
end
end
end

unless enc.ascii_compatible?
if enc.dummy?
origenc = enc
enc = Encoding::Converter.asciicompat_encoding(enc)
string = enc ? string.encode(enc) : string.b
end
table = Hash[TABLE_FOR_ESCAPE_HTML__.map {|pair|pair.map {|s|s.encode(enc)}}]
string = string.gsub(/#{"['&\"<>]".encode(enc)}/, table)

table = Hash[(escape_table || TABLE_FOR_ESCAPE_HTML__).map {|pair| pair.map {|s|s.encode(enc)}}]

if escape_table
pattern = Regexp.union(table.keys)
string = string.gsub(pattern, table)
else
string = string.gsub(/#{"['&\"<>]".encode(enc)}/, table)
end

string.encode!(origenc) if origenc
string
else
string = string.b
string.gsub!(/['&\"<>]/, TABLE_FOR_ESCAPE_HTML__)
if escape_table
table = Hash[escape_table.map {|pair| pair.map {|s|s.encode(enc)}}]
string.gsub!(Regexp.union(table.keys), table)
else
string.gsub!(/['&\"<>]/, TABLE_FOR_ESCAPE_HTML__)
end
string.force_encoding(enc)
end
end
Expand Down
45 changes: 45 additions & 0 deletions test/cgi/test_cgi_escape.rb
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,39 @@ def test_cgi_escapeHTML
assert_equal("&#39;&amp;&quot;&gt;&lt;", CGI.escapeHTML("'&\"><"))
end

# Exercises CGI.escapeHTML with a caller-supplied escape table.
def test_dynamic_cgi_escapeHTML
# Only "<" is in the table, so every other character passes through unchanged.
assert_equal("'&\">&lt;", CGI.escapeHTML("'&\"><", { "<" => "&lt;" }))
# The single-quoted values are the literal six characters \uXXXX, not code points.
assert_equal("'\\u0026\"\\u003e\\u003c", CGI.escapeHTML("'&\"><", {
">" => '\u003e',
"<" => '\u003c',
"&" => '\u0026',
}))

# Keys must be exactly one ASCII character: two characters, a non-ASCII
# character and the empty string are all rejected.
assert_raise(ArgumentError) { CGI.escapeHTML(" ", { "12" => "&lt;" }) }
assert_raise(ArgumentError) { CGI.escapeHTML(" ", { "€" => "&lt;" }) }
assert_raise(ArgumentError) { CGI.escapeHTML(" ", { "" => "&lt;" }) }

# Replacement values, unlike keys, may contain non-ASCII characters.
assert_equal("'Û\"€™", CGI.escapeHTML("'&\"><", {
">" => '€',
"<" => '™',
"&" => 'Û',
}))
end

# Exercises CGI.escapeHTML when the input string and the escape table are in
# different encodings.
def test_dynamic_cgi_escapeHTML_mixed_encodings
# Table keys and values are ISO-8859-1 while the input is a plain literal
# (presumably UTF-8, the usual source encoding - confirm against the file's
# magic comment); the expected result is compared as a plain literal.
assert_equal("'î\"éà", CGI.escapeHTML("'&\"><", {
">".encode(Encoding::ISO_8859_1) => 'é'.encode(Encoding::ISO_8859_1),
"<".encode(Encoding::ISO_8859_1) => 'à'.encode(Encoding::ISO_8859_1),
"&".encode(Encoding::ISO_8859_1) => 'î'.encode(Encoding::ISO_8859_1),
}))

# The reverse case: the input is ISO-8859-1 and the table uses plain literals;
# the expected result is in the input's encoding.
assert_equal("'î\"éà".encode(Encoding::ISO_8859_1), CGI.escapeHTML("'&\"><".encode(Encoding::ISO_8859_1), {
">" => 'é',
"<" => 'à',
"&" => 'î',
}))
end

def test_cgi_escape_html_duplicated
orig = "Ruby".dup.force_encoding("US-ASCII")
str = CGI.escapeHTML(orig)
Expand Down Expand Up @@ -260,9 +293,21 @@ def test_cgi_unescapeHTML_charref_preserve_encoding
define_method("test_cgi_escapeHTML:#{enc.name}") do
assert_equal(escaped, CGI.escapeHTML(unescaped))
end

define_method("test_cgi_unescapeHTML:#{enc.name}") do
assert_equal(unescaped, CGI.unescapeHTML(escaped))
end

# Passing a custom table with the same five entries as the default escapes
# must produce the same escaped string as the table-less call, for each
# encoding under test. The test is named after escapeHTML because that is
# the method it exercises.
define_method("test_cgi_dynamic_escapeHTML:#{enc.name}") do
  table = {
    "'" => '&#39;',
    '&' => '&amp;',
    '"' => '&quot;',
    '<' => '&lt;',
    '>' => '&gt;',
  }
  assert_equal(escaped, CGI.escapeHTML(unescaped, table))
end
end
end

Expand Down
Loading