From 51256f6296105e522c9f18c004f6ccb17b8bb978 Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Wed, 26 Feb 2025 19:32:02 +0100 Subject: [PATCH] Allow CGI.escapeHTML to take a custom escape table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In some cases you may want to escape a string in a different way than the default behavior. For instance, if you are trying to make some JSON safe to include in a `<script>` tag: ```ruby CGI.escapeHTML('Hello </script>', ">" => '\u003e', "<" => '\u003c', "&" => '\u0026') => "Hello \\u003c/script\\u003e" ``` Of course you can always use `gsub` for that, but `CGI.escapeHTML` being specialized is able to be very significantly faster: ``` ruby 3.4.1 (2024-12-25 revision 48d4efcb85) +YJIT +PRISM [arm64-darwin23] Warming up -------------------------------------- gsub 82.135k i/100ms escapeHTML 221.405k i/100ms Calculating ------------------------------------- gsub 821.890k (± 2.2%) i/s (1.22 μs/i) - 4.189M in 5.099152s escapeHTML 2.330M (± 0.5%) i/s (429.22 ns/i) - 11.734M in 5.036770s Comparison: escapeHTML: 2329816.5 i/s gsub: 821889.7 i/s - 2.83x slower ruby 3.4.1 (2024-12-25 revision 48d4efcb85) +YJIT +PRISM [arm64-darwin23] Warming up -------------------------------------- gsub 36.235k i/100ms escapeHTML 171.347k i/100ms Calculating ------------------------------------- gsub 359.528k (± 1.5%) i/s (2.78 μs/i) - 1.812M in 5.040422s escapeHTML 1.812M (± 0.7%) i/s (551.84 ns/i) - 9.081M in 5.011762s Comparison: escapeHTML: 1812105.3 i/s gsub: 359527.5 i/s - 5.04x slower ``` ```ruby require "benchmark/ips" require "cgi" ESCAPE = { ">" => '\u003e', "<" => '\u003c', "&" => '\u0026', } ESCAPE_B = { ">".b => '\u003e'.b, "<".b => '\u003c'.b, "&".b => '\u0026'.b, } ESCAPE_REGEX = Regexp.union(ESCAPE_B.keys) str = ("a" * 1024).freeze Benchmark.ips do |x| x.report("gsub") do b = str.b b.gsub!(ESCAPE_REGEX, ESCAPE_B) b.force_encoding(str.encoding) end x.report("escapeHTML") do CGI.escapeHTML(str, ESCAPE) end x.compare! 
end str = (("a" * 1023) + "<").freeze Benchmark.ips do |x| x.report("gsub") do b = str.b b.gsub!(ESCAPE_REGEX, ESCAPE_B) b.force_encoding(str.encoding) end x.report("escapeHTML") do CGI.escapeHTML(str, ESCAPE) end x.compare! end ``` --- ext/cgi/escape/escape.c | 123 ++++++++++++++++++++++++++++++++---- lib/cgi/escape.rb | 30 +++++++-- test/cgi/test_cgi_escape.rb | 45 +++++++++++++ 3 files changed, 183 insertions(+), 15 deletions(-) diff --git a/ext/cgi/escape/escape.c b/ext/cgi/escape/escape.c index 4773186..94d72b9 100644 --- a/ext/cgi/escape/escape.c +++ b/ext/cgi/escape/escape.c @@ -1,5 +1,6 @@ #include "ruby.h" #include "ruby/encoding.h" +#include RUBY_EXTERN unsigned long ruby_scan_digits(const char *str, ssize_t len, int base, size_t *retlen, int *overflow); RUBY_EXTERN const char ruby_hexdigits[]; @@ -33,13 +34,13 @@ preserve_original_state(VALUE orig, VALUE dest) } static inline long -escaped_length(VALUE str) +escaped_length(VALUE str, long escape_max_len) { const long len = RSTRING_LEN(str); - if (len >= LONG_MAX / HTML_ESCAPE_MAX_LEN) { - ruby_malloc_size_overflow(len, HTML_ESCAPE_MAX_LEN); + if (len >= LONG_MAX / escape_max_len) { + ruby_malloc_size_overflow(len, escape_max_len); } - return len * HTML_ESCAPE_MAX_LEN; + return len * escape_max_len; } static VALUE @@ -47,7 +48,7 @@ optimized_escape_html(VALUE str) { VALUE escaped; VALUE vbuf; - char *buf = ALLOCV_N(char, vbuf, escaped_length(str)); + char *buf = ALLOCV_N(char, vbuf, escaped_length(str, HTML_ESCAPE_MAX_LEN)); const char *cstr = RSTRING_PTR(str); const char *end = cstr + RSTRING_LEN(str); @@ -75,6 +76,93 @@ optimized_escape_html(VALUE str) return escaped; } +struct build_escape_table_args { + long max_escape_length; + VALUE *escape_table; + bool non_ascii_value; +}; + +static int +build_escape_table_i(VALUE key, VALUE val, VALUE _arg) +{ + struct build_escape_table_args *arg = (struct build_escape_table_args *)_arg; + long escape_length; + unsigned char c; + + Check_Type(key, T_STRING); + 
Check_Type(val, T_STRING); + + if (RSTRING_LEN(key) != 1) { + rb_raise(rb_eArgError, "CGI.escapeHTML keys must be single ASCII characters"); + } + + c = RSTRING_PTR(key)[0]; + if (c >= 0x80) { + rb_raise(rb_eArgError, "CGI.escapeHTML keys must be single ASCII characters"); + } + + if (rb_enc_str_coderange(val) != ENC_CODERANGE_7BIT) { + arg->non_ascii_value = true; + } + + arg->escape_table[c] = val; + + escape_length = RSTRING_LEN(val); + if (arg->max_escape_length < escape_length) { + arg->max_escape_length = escape_length; + } + + return ST_CONTINUE; +} + +static VALUE +dynamic_escape_html(VALUE str, VALUE rb_escape_table) +{ + VALUE escape_table[UCHAR_MAX+1] = {0}; + VALUE vbuf, escaped; + char *buf, *dest; + const char *cstr, *end; + + struct build_escape_table_args arg = { + .escape_table = escape_table, + }; + rb_hash_foreach(rb_escape_table, build_escape_table_i, (VALUE)&arg); + + if (arg.non_ascii_value) { + return Qundef; + } + + buf = ALLOCV_N(char, vbuf, escaped_length(str, arg.max_escape_length)); + cstr = RSTRING_PTR(str); + end = cstr + RSTRING_LEN(str); + + dest = buf; + while (cstr < end) { + const unsigned char c = *cstr++; + VALUE escaped_character = escape_table[c]; + if (escaped_character) { + const char *ptr; + long len; + RSTRING_GETMEM(escaped_character, ptr, len); + MEMCPY(dest, ptr, char, len); + dest += len; + } + else { + *dest++ = c; + } + } + + if (RSTRING_LEN(str) < (dest - buf)) { + escaped = rb_str_new(buf, dest - buf); + preserve_original_state(str, escaped); + } + else { + escaped = rb_str_dup(str); + } + ALLOCV_END(vbuf); + return escaped; +} + static VALUE optimized_unescape_html(VALUE str) { @@ -331,16 +419,29 @@ optimized_unescape(VALUE str, VALUE encoding, int unescape_plus) * */ static VALUE -cgiesc_escape_html(VALUE self, VALUE str) +cgiesc_escape_html(int argc, VALUE *argv, VALUE self) { + VALUE str; + rb_check_arity(argc, 1, 2); + + str = argv[0]; StringValue(str); if (rb_enc_str_asciicompat_p(str)) { - return 
optimized_escape_html(str); - } - else { - return rb_call_super(1, &str); + if (argc == 1) { + return optimized_escape_html(str); + } + else { + VALUE result; + Check_Type(argv[1], T_HASH); + result = dynamic_escape_html(str, argv[1]); + if (result != Qundef) { + return result; + } + } } + + return rb_call_super(argc, argv); } /* @@ -474,7 +575,7 @@ InitVM_escape(void) rb_cCGI = rb_define_class("CGI", rb_cObject); rb_mEscapeExt = rb_define_module_under(rb_cCGI, "EscapeExt"); rb_mEscape = rb_define_module_under(rb_cCGI, "Escape"); - rb_define_method(rb_mEscapeExt, "escapeHTML", cgiesc_escape_html, 1); + rb_define_method(rb_mEscapeExt, "escapeHTML", cgiesc_escape_html, -1); rb_define_method(rb_mEscapeExt, "unescapeHTML", cgiesc_unescape_html, 1); rb_define_method(rb_mEscapeExt, "escapeURIComponent", cgiesc_escape_uri_component, 1); rb_define_alias(rb_mEscapeExt, "escape_uri_component", "escapeURIComponent"); diff --git a/lib/cgi/escape.rb b/lib/cgi/escape.rb index 8668ab4..7d544b8 100644 --- a/lib/cgi/escape.rb +++ b/lib/cgi/escape.rb @@ -80,21 +80,43 @@ def unescapeURIComponent(string, encoding = @@accept_charset) # Escape special characters in HTML, namely '&\"<> # CGI.escapeHTML('Usage: foo "bar" <baz>') # # => "Usage: foo &quot;bar&quot; &lt;baz&gt;" - def escapeHTML(string) + def escapeHTML(string, escape_table = nil) enc = string.encoding + + if escape_table + escape_table.each_key do |key| + if key.bytesize != 1 || !key.ascii_only? + raise ArgumentError, "CGI.escapeHTML keys must be single ASCII characters" + end + end + end + unless enc.ascii_compatible? if enc.dummy? origenc = enc enc = Encoding::Converter.asciicompat_encoding(enc) string = enc ? 
string.encode(enc) : string.b end - table = Hash[TABLE_FOR_ESCAPE_HTML__.map {|pair|pair.map {|s|s.encode(enc)}}] - string = string.gsub(/#{"['&\"<>]".encode(enc)}/, table) + + table = Hash[(escape_table || TABLE_FOR_ESCAPE_HTML__).map {|pair| pair.map {|s|s.encode(enc)}}] + + if escape_table + pattern = Regexp.union(table.keys) + string = string.gsub(pattern, table) + else + string = string.gsub(/#{"['&\"<>]".encode(enc)}/, table) + end + string.encode!(origenc) if origenc string else string = string.b - string.gsub!(/['&\"<>]/, TABLE_FOR_ESCAPE_HTML__) + if escape_table + table = Hash[escape_table.map {|pair| pair.map {|s|s.encode(enc)}}] + string.gsub!(Regexp.union(table.keys), table) + else + string.gsub!(/['&\"<>]/, TABLE_FOR_ESCAPE_HTML__) + end string.force_encoding(enc) end end diff --git a/test/cgi/test_cgi_escape.rb b/test/cgi/test_cgi_escape.rb index ddd692b..17aa7b6 100644 --- a/test/cgi/test_cgi_escape.rb +++ b/test/cgi/test_cgi_escape.rb @@ -130,6 +130,39 @@ def test_cgi_escapeHTML assert_equal("'&"><", CGI.escapeHTML("'&\"><")) end + def test_dynamic_cgi_escapeHTML + assert_equal("'&\"><", CGI.escapeHTML("'&\"><", { "<" => "<" })) + assert_equal("'\\u0026\"\\u003e\\u003c", CGI.escapeHTML("'&\"><", { + ">" => '\u003e', + "<" => '\u003c', + "&" => '\u0026', + })) + + assert_raise(ArgumentError) { CGI.escapeHTML(" ", { "12" => "<" }) } + assert_raise(ArgumentError) { CGI.escapeHTML(" ", { "€" => "<" }) } + assert_raise(ArgumentError) { CGI.escapeHTML(" ", { "" => "<" }) } + + assert_equal("'Û\"€™", CGI.escapeHTML("'&\"><", { + ">" => '€', + "<" => '™', + "&" => 'Û', + })) + end + + def test_dynamic_cgi_escapeHTML_mixed_encodings + assert_equal("'î\"éà", CGI.escapeHTML("'&\"><", { + ">".encode(Encoding::ISO_8859_1) => 'é'.encode(Encoding::ISO_8859_1), + "<".encode(Encoding::ISO_8859_1) => 'à'.encode(Encoding::ISO_8859_1), + "&".encode(Encoding::ISO_8859_1) => 'î'.encode(Encoding::ISO_8859_1), + })) + + assert_equal("'î\"éà".encode(Encoding::ISO_8859_1), 
CGI.escapeHTML("'&\"><".encode(Encoding::ISO_8859_1), { + ">" => 'é', + "<" => 'à', + "&" => 'î', + })) + end + def test_cgi_escape_html_duplicated orig = "Ruby".dup.force_encoding("US-ASCII") str = CGI.escapeHTML(orig) @@ -260,9 +293,21 @@ def test_cgi_unescapeHTML_charref_preserve_encoding define_method("test_cgi_escapeHTML:#{enc.name}") do assert_equal(escaped, CGI.escapeHTML(unescaped)) end + define_method("test_cgi_unescapeHTML:#{enc.name}") do assert_equal(unescaped, CGI.unescapeHTML(escaped)) end + + define_method("test_cgi_dynamic_unescapeHTML:#{enc.name}") do + table = { + "'" => ''', + '&' => '&', + '"' => '"', + '<' => '<', + '>' => '>', + } + assert_equal(escaped, CGI.escapeHTML(unescaped, table)) + end end end