From 51256f6296105e522c9f18c004f6ccb17b8bb978 Mon Sep 17 00:00:00 2001
From: Jean Boussier <jean.boussier@gmail.com>
Date: Wed, 26 Feb 2025 19:32:02 +0100
Subject: [PATCH] Allow CGI.escapeHTML to take a custom escape table
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In some cases you may want to escape a string in a different way than
the default behavior.

For instance, if you are trying to make some JSON safe to include
in a `<script>` tag, you may want to escape fewer characters, and to use
JavaScript Unicode escape sequences instead of HTML entities:

```ruby
>> CGI.escapeHTML('Hello </script>', ">" => '\u003e', "<" => '\u003c', "&" => '\u0026')
=> "Hello \\u003c/script\\u003e"
```

Of course you can always use `gsub` for that, but because `CGI.escapeHTML`
is specialized for this task, it can be significantly faster:

```
ruby 3.4.1 (2024-12-25 revision 48d4efcb85) +YJIT +PRISM [arm64-darwin23]
Warming up --------------------------------------
                gsub    82.135k i/100ms
          escapeHTML   221.405k i/100ms
Calculating -------------------------------------
                gsub    821.890k (± 2.2%) i/s    (1.22 μs/i) -      4.189M in   5.099152s
          escapeHTML      2.330M (± 0.5%) i/s  (429.22 ns/i) -     11.734M in   5.036770s

Comparison:
          escapeHTML:  2329816.5 i/s
                gsub:   821889.7 i/s - 2.83x  slower

ruby 3.4.1 (2024-12-25 revision 48d4efcb85) +YJIT +PRISM [arm64-darwin23]
Warming up --------------------------------------
                gsub    36.235k i/100ms
          escapeHTML   171.347k i/100ms
Calculating -------------------------------------
                gsub    359.528k (± 1.5%) i/s    (2.78 μs/i) -      1.812M in   5.040422s
          escapeHTML      1.812M (± 0.7%) i/s  (551.84 ns/i) -      9.081M in   5.011762s

Comparison:
          escapeHTML:  1812105.3 i/s
                gsub:   359527.5 i/s - 5.04x  slower
```

```ruby

require "benchmark/ips"
require "cgi"

ESCAPE = {
  ">" => '\u003e', "<" => '\u003c', "&" => '\u0026',
}

ESCAPE_B = {
  ">".b => '\u003e'.b,
  "<".b => '\u003c'.b,
  "&".b => '\u0026'.b,
}
ESCAPE_REGEX = Regexp.union(ESCAPE_B.keys)

str = ("a" * 1024).freeze
Benchmark.ips do |x|
  x.report("gsub") do
    b = str.b
    b.gsub!(ESCAPE_REGEX, ESCAPE_B)
    b.force_encoding(str.encoding)
  end

  x.report("escapeHTML") do
    CGI.escapeHTML(str, ESCAPE)
  end

  x.compare!
end

str = (("a" * 1023) + "<").freeze
Benchmark.ips do |x|
  x.report("gsub") do
    b = str.b
    b.gsub!(ESCAPE_REGEX, ESCAPE_B)
    b.force_encoding(str.encoding)
  end

  x.report("escapeHTML") do
    CGI.escapeHTML(str, ESCAPE)
  end

  x.compare!
end
```
---
 ext/cgi/escape/escape.c     | 123 ++++++++++++++++++++++++++++++++----
 lib/cgi/escape.rb           |  30 +++++++--
 test/cgi/test_cgi_escape.rb |  45 +++++++++++++
 3 files changed, 183 insertions(+), 15 deletions(-)
diff --git a/ext/cgi/escape/escape.c b/ext/cgi/escape/escape.c
index 4773186..94d72b9 100644
--- a/ext/cgi/escape/escape.c
+++ b/ext/cgi/escape/escape.c
@@ -1,5 +1,6 @@
 #include "ruby.h"
 #include "ruby/encoding.h"
+#include <stdbool.h>
 
 RUBY_EXTERN unsigned long ruby_scan_digits(const char *str, ssize_t len, int base, size_t *retlen, int *overflow);
 RUBY_EXTERN const char ruby_hexdigits[];
@@ -33,13 +34,13 @@ preserve_original_state(VALUE orig, VALUE dest)
 }
 
 static inline long
-escaped_length(VALUE str)
+escaped_length(VALUE str, long escape_max_len)
 {
     const long len = RSTRING_LEN(str);
-    if (len >= LONG_MAX / HTML_ESCAPE_MAX_LEN) {
-        ruby_malloc_size_overflow(len, HTML_ESCAPE_MAX_LEN);
+    if (len >= LONG_MAX / escape_max_len) {
+        ruby_malloc_size_overflow(len, escape_max_len);
     }
-    return len * HTML_ESCAPE_MAX_LEN;
+    return len * escape_max_len;
 }
 
 static VALUE
@@ -47,7 +48,7 @@ optimized_escape_html(VALUE str)
 {
     VALUE escaped;
     VALUE vbuf;
-    char *buf = ALLOCV_N(char, vbuf, escaped_length(str));
+    char *buf = ALLOCV_N(char, vbuf, escaped_length(str, HTML_ESCAPE_MAX_LEN));
     const char *cstr = RSTRING_PTR(str);
     const char *end = cstr + RSTRING_LEN(str);
 
@@ -75,6 +76,93 @@ optimized_escape_html(VALUE str)
     return escaped;
 }
 
+struct build_escape_table_args {
+    long max_escape_length;
+    VALUE *escape_table;
+    bool non_ascii_value;
+};
+
+static int
+build_escape_table_i(VALUE key, VALUE val, VALUE _arg)
+{
+    struct build_escape_table_args *arg = (struct build_escape_table_args *)_arg;
+    long escape_length;
+    unsigned char c;
+
+    Check_Type(key, T_STRING);
+    Check_Type(val, T_STRING);
+
+    if (RSTRING_LEN(key) != 1) {
+        rb_raise(rb_eArgError, "CGI.escapeHTML keys must be single ASCII characters");
+    }
+
+    c = RSTRING_PTR(key)[0];
+    if (c >= 0x80) {
+        rb_raise(rb_eArgError, "CGI.escapeHTML keys must be single ASCII characters");
+    }
+
+    if (rb_enc_str_coderange(val) != ENC_CODERANGE_7BIT) {
+        arg->non_ascii_value = true;
+    }
+
+    arg->escape_table[c] = val;
+
+    escape_length = RSTRING_LEN(val);
+    if (arg->max_escape_length < escape_length) {
+        arg->max_escape_length = escape_length;
+    }
+
+    return ST_CONTINUE;
+}
+
+/* Escape +str+ with a caller-supplied table (single ASCII byte => String).
+ * Returns Qundef if any replacement is not 7-bit, so the caller falls back. */
+static VALUE
+dynamic_escape_html(VALUE str, VALUE rb_escape_table)
+{
+    VALUE escape_table[UCHAR_MAX+1] = {0};
+    VALUE vbuf, escaped;
+    char *buf, *dest;
+    const char *cstr, *end;
+    bool replaced = false; /* length can't tell: a replacement may be <= 1 byte */
+
+    /* Floor of 1: escaped_length() divides by this value, and bytes without a
+     * table entry are copied verbatim, so every input byte needs >= 1 byte. */
+    struct build_escape_table_args arg = { .max_escape_length = 1, .escape_table = escape_table };
+    rb_hash_foreach(rb_escape_table, build_escape_table_i, (VALUE)&arg);
+    if (arg.non_ascii_value) return Qundef;
+
+    buf = ALLOCV_N(char, vbuf, escaped_length(str, arg.max_escape_length));
+    cstr = RSTRING_PTR(str);
+    end = cstr + RSTRING_LEN(str);
+    dest = buf;
+    while (cstr < end) {
+        const unsigned char c = *cstr++;
+        VALUE escaped_character = escape_table[c];
+        if (escaped_character) {
+            const char *ptr;
+            long len;
+            RSTRING_GETMEM(escaped_character, ptr, len);
+            MEMCPY(dest, ptr, char, len);
+            dest += len;
+            replaced = true;
+        }
+        else {
+            *dest++ = c;
+        }
+    }
+
+    if (replaced) {
+        escaped = rb_str_new(buf, dest - buf);
+        preserve_original_state(str, escaped);
+    }
+    else {
+        escaped = rb_str_dup(str);
+    }
+    ALLOCV_END(vbuf);
+    return escaped;
+}
+
 static VALUE
 optimized_unescape_html(VALUE str)
 {
@@ -331,16 +419,29 @@ optimized_unescape(VALUE str, VALUE encoding, int unescape_plus)
  *
  */
 static VALUE
-cgiesc_escape_html(VALUE self, VALUE str)
+cgiesc_escape_html(int argc, VALUE *argv, VALUE self)
 {
+    VALUE str, args[2]; /* args: arguments forwarded to the super (Ruby) method */
+    rb_check_arity(argc, 1, 2);
+
+    str = argv[0];
     StringValue(str);
 
     if (rb_enc_str_asciicompat_p(str)) {
-        return optimized_escape_html(str);
-    }
-    else {
-        return rb_call_super(1, &str);
+        if (argc == 1 || NIL_P(argv[1])) { /* nil table means the default escapes */
+            return optimized_escape_html(str);
+        }
+        else {
+            VALUE result;
+            Check_Type(argv[1], T_HASH);
+            result = dynamic_escape_html(str, argv[1]);
+            if (result != Qundef) { /* Qundef: table has a non-7-bit replacement */
+                return result;
+            }
+        }
     }
+    args[0] = str; args[1] = (argc > 1) ? argv[1] : Qnil; /* forward the converted String */
+    return rb_call_super(argc, args);
 }
 
 /*
@@ -474,7 +575,7 @@ InitVM_escape(void)
     rb_cCGI       = rb_define_class("CGI", rb_cObject);
     rb_mEscapeExt = rb_define_module_under(rb_cCGI, "EscapeExt");
     rb_mEscape    = rb_define_module_under(rb_cCGI, "Escape");
-    rb_define_method(rb_mEscapeExt, "escapeHTML", cgiesc_escape_html, 1);
+    rb_define_method(rb_mEscapeExt, "escapeHTML", cgiesc_escape_html, -1);
     rb_define_method(rb_mEscapeExt, "unescapeHTML", cgiesc_unescape_html, 1);
     rb_define_method(rb_mEscapeExt, "escapeURIComponent", cgiesc_escape_uri_component, 1);
     rb_define_alias(rb_mEscapeExt, "escape_uri_component", "escapeURIComponent");
diff --git a/lib/cgi/escape.rb b/lib/cgi/escape.rb
index 8668ab4..7d544b8 100644
--- a/lib/cgi/escape.rb
+++ b/lib/cgi/escape.rb
@@ -80,21 +80,43 @@ def unescapeURIComponent(string, encoding = @@accept_charset)
   # Escape special characters in HTML, namely '&\"<>
   #   CGI.escapeHTML('Usage: foo "bar" <baz>')
   #      # => "Usage: foo &quot;bar&quot; &lt;baz&gt;"
-  def escapeHTML(string)
+  def escapeHTML(string, escape_table = nil) # escape_table: optional Hash of "c" => "replacement"
     enc = string.encoding
+
+    if escape_table
+      escape_table.each_key do |key| # reject keys that are not one ASCII byte
+        if key.bytesize != 1 || !key.ascii_only?
+          raise ArgumentError, "CGI.escapeHTML keys must be single ASCII characters"
+        end
+      end
+    end
+
     unless enc.ascii_compatible?
       if enc.dummy?
         origenc = enc
         enc = Encoding::Converter.asciicompat_encoding(enc)
         string = enc ? string.encode(enc) : string.b
       end
-      table = Hash[TABLE_FOR_ESCAPE_HTML__.map {|pair|pair.map {|s|s.encode(enc)}}]
-      string = string.gsub(/#{"['&\"<>]".encode(enc)}/, table)
+
+      table = Hash[(escape_table || TABLE_FOR_ESCAPE_HTML__).map {|pair| pair.map {|s|s.encode(enc)}}]
+
+      if escape_table
+        pattern = Regexp.union(table.keys)
+        string = string.gsub(pattern, table)
+      else
+        string = string.gsub(/#{"['&\"<>]".encode(enc)}/, table)
+      end
+
       string.encode!(origenc) if origenc
       string
     else
       string = string.b
-      string.gsub!(/['&\"<>]/, TABLE_FOR_ESCAPE_HTML__)
+      if escape_table # string is binary here, so the table must be binary too
+        table = Hash[escape_table.map {|pair| pair.map {|s|s.encode(enc).b}}]
+        string.gsub!(Regexp.union(table.keys), table)
+      else
+        string.gsub!(/['&\"<>]/, TABLE_FOR_ESCAPE_HTML__)
+      end
       string.force_encoding(enc)
     end
   end
diff --git a/test/cgi/test_cgi_escape.rb b/test/cgi/test_cgi_escape.rb
index ddd692b..17aa7b6 100644
--- a/test/cgi/test_cgi_escape.rb
+++ b/test/cgi/test_cgi_escape.rb
@@ -130,6 +130,39 @@ def test_cgi_escapeHTML
     assert_equal("&#39;&amp;&quot;&gt;&lt;", CGI.escapeHTML("'&\"><"))
   end
 
+  def test_dynamic_cgi_escapeHTML
+    assert_equal("'&\">&lt;", CGI.escapeHTML("'&\"><", { "<" => "&lt;" }))
+    assert_equal("'\\u0026\"\\u003e\\u003c", CGI.escapeHTML("'&\"><", {
+      ">" => '\u003e',
+      "<" => '\u003c',
+      "&" => '\u0026',
+    }))
+
+    assert_raise(ArgumentError) { CGI.escapeHTML("   ", { "12" => "&lt;" }) }
+    assert_raise(ArgumentError) { CGI.escapeHTML("   ", { "€" => "&lt;" }) }
+    assert_raise(ArgumentError) { CGI.escapeHTML("   ", { "" => "&lt;" }) }
+
+    assert_equal("'Û\"€™", CGI.escapeHTML("'&\"><", {
+      ">" => '€',
+      "<" => '™',
+      "&" => 'Û',
+    }))
+  end
+
+  def test_dynamic_cgi_escapeHTML_mixed_encodings
+    assert_equal("'î\"éà", CGI.escapeHTML("'&\"><", {
+      ">".encode(Encoding::ISO_8859_1) => 'é'.encode(Encoding::ISO_8859_1),
+      "<".encode(Encoding::ISO_8859_1) => 'à'.encode(Encoding::ISO_8859_1),
+      "&".encode(Encoding::ISO_8859_1) => 'î'.encode(Encoding::ISO_8859_1),
+    }))
+
+    assert_equal("'î\"éà".encode(Encoding::ISO_8859_1), CGI.escapeHTML("'&\"><".encode(Encoding::ISO_8859_1), {
+      ">" => 'é',
+      "<" => 'à',
+      "&" => 'î',
+    }))
+  end
+
   def test_cgi_escape_html_duplicated
     orig = "Ruby".dup.force_encoding("US-ASCII")
     str = CGI.escapeHTML(orig)
@@ -260,9 +293,21 @@ def test_cgi_unescapeHTML_charref_preserve_encoding
       define_method("test_cgi_escapeHTML:#{enc.name}") do
         assert_equal(escaped, CGI.escapeHTML(unescaped))
       end
+
       define_method("test_cgi_unescapeHTML:#{enc.name}") do
         assert_equal(unescaped, CGI.unescapeHTML(escaped))
       end
+
+      define_method("test_cgi_dynamic_escapeHTML:#{enc.name}") do
+        table = { # custom table passed explicitly; expected output is the same +escaped+
+          "'" => '&#39;',
+          '&' => '&amp;',
+          '"' => '&quot;',
+          '<' => '&lt;',
+          '>' => '&gt;',
+        }
+        assert_equal(escaped, CGI.escapeHTML(unescaped, table))
+      end
     end
   end