ruby · hsbt · Jun 23, 2026 · Jun 23, 2026
diff --git a/lib/cgi/escape.rb b/lib/cgi/escape.rb
@@ -132,29 +132,27 @@ def unescapeHTML(string)
                 end
     string = string.b
     string.gsub!(/&(apos|amp|quot|gt|lt|\#[0-9]+|\#[xX][0-9A-Fa-f]+);/) do
-      match = $1.dup
+      match = $1
       case match
       when 'apos'                then "'"
       when 'amp'                 then '&'
       when 'quot'                then '"'
       when 'gt'                  then '>'
       when 'lt'                  then '<'
-      when /\A#0*(\d+)\z/
-        n = $1.to_i
-        if n < charlimit
-          n.chr(enc)
-        else
-          "&##{$1};"
-        end
-      when /\A#x([0-9a-f]+)\z/i
-        n = $1.hex
-        if n < charlimit
-          n.chr(enc)
+      else
+        # Numeric character reference. Decode into the binary buffer so that a
+        # non-ASCII byte already present in it never triggers an encoding
+        # compatibility error on concatenation; the trailing force_encoding
+        # re-tags the whole buffer. This mirrors the C extension's
+        # optimized_unescape_html.
+        n = match.start_with?('#x', '#X') ? match[2..-1].hex : match[1..-1].to_i
+        if n >= charlimit
+          "&#{match};"            # out of range: keep the reference verbatim
+        elsif charlimit > 256
+          [n].pack('U').b         # UTF-8: code point bytes, surrogates included (like rb_enc_mbcput)
         else
-          "&#x#{$1};"
+          n.chr.b                 # ISO-8859-1 / ASCII: single byte
         end
-      else
-        "&#{match};"
       end
     end
     string.force_encoding enc

diff --git a/test/cgi/test_cgi_escape.rb b/test/cgi/test_cgi_escape.rb
@@ -193,6 +193,59 @@ def test_cgi_unescapeHTML_following_number_sign
     def test_cgi_unescapeHTML_following_invalid_numeric
       assert_equal('&#1114112>&#x110000>', CGI.unescapeHTML('&#1114112&gt;&#x110000&gt;'))
     end
+
+    # Regression test for https://github.com/ruby/cgi/issues/103
+    # A numeric character reference must decode without raising even when the
+    # surrounding text already contains non-ASCII bytes (here "☆" and U+1F984).
+    def test_cgi_unescapeHTML_nonascii_with_numeric_charref
+      assert_equal("☆ α", CGI.unescapeHTML("☆ &#945;")) # decimal reference (945 == 0x3B1)
+      assert_equal("☆ α", CGI.unescapeHTML("☆ &#x3B1;")) # hex reference, lowercase "x"
+      assert_equal("☆ α", CGI.unescapeHTML("☆ &#X3B1;")) # hex reference, uppercase "X"
+      assert_equal("\u{1F984} α", CGI.unescapeHTML("\u{1F984} &#945;")) # neighbour outside the BMP
+      result = CGI.unescapeHTML("☆ &#945;")
+      assert_equal(Encoding::UTF_8, result.encoding) # result is expected to be tagged UTF-8
+      assert_predicate(result, :valid_encoding?) # and to be well-formed in that encoding
+    end
+
+    def test_cgi_unescapeHTML_nonascii_with_named_charref # named references following a non-ASCII character
+      assert_equal("☆ & < > \" '", CGI.unescapeHTML("☆ &amp; &lt; &gt; &quot; &apos;")) # amp, lt, gt, quot, apos
+    end
+
+    def test_cgi_unescapeHTML_nonascii_charref_iso_8859_1 # same scenario with an ISO-8859-1 tagged input
+      input    = "\xE9 &#233; &#xE9;".dup.force_encoding("ISO-8859-1") # raw 0xE9 byte, then 233 / 0xE9 as references
+      expected = "\xE9 \xE9 \xE9".dup.force_encoding("ISO-8859-1") # both references become the single byte 0xE9
+      assert_equal(expected, CGI.unescapeHTML(input))
+      assert_equal(Encoding::ISO_8859_1, CGI.unescapeHTML(input).encoding) # encoding tag must be preserved
+      # A code point outside Latin-1 (945) exceeds the charlimit and is kept verbatim.
+      verbatim = "\xE9 &#945;".dup.force_encoding("ISO-8859-1")
+      assert_equal(verbatim, CGI.unescapeHTML(verbatim)) # output is expected to equal the input
+    end
+
+    def test_cgi_unescapeHTML_charref_out_of_range # references that are too large must come back unchanged
+      assert_equal("&#1114112;", CGI.unescapeHTML("&#1114112;")) # 1114112 == 0x110000, decimal form
+      assert_equal("&#x110000;", CGI.unescapeHTML("&#x110000;")) # same value, hex form
+      # Leading zeros are preserved when kept verbatim, matching the C extension.
+      assert_equal("&#0011141112;", CGI.unescapeHTML("&#0011141112;")) # NOTE(review): this is 11141112, not 1114112 — still out of range; confirm intended
+      assert_equal("☆ &#1114112;", CGI.unescapeHTML("☆ &#1114112;")) # also with non-ASCII text in front
+    end
+
+    def test_cgi_unescapeHTML_surrogate_charref
+      # Surrogate code points are below the UTF-8 limit, so they are emitted as
+      # raw (intentionally invalid) UTF-8 bytes rather than raising, matching the
+      # C extension's rb_enc_mbcput. Only the bytes are compared; validity is not asserted.
+      assert_equal([0xED, 0xA0, 0x80], CGI.unescapeHTML("&#xD800;").bytes) # U+D800, hex form
+      assert_equal([0xED, 0xA0, 0x80], CGI.unescapeHTML("&#55296;").bytes) # U+D800, decimal form (55296 == 0xD800)
+      assert_equal([0xED, 0xBF, 0xBF], CGI.unescapeHTML("&#xDFFF;").bytes) # U+DFFF
+    end
+
+    def test_cgi_unescapeHTML_charref_preserve_encoding # an ASCII reference must decode under each listed encoding
+      ["UTF-8", "EUC-JP", "Windows-31J", "ISO-8859-1", "US-ASCII"].each do |name|
+        enc = Encoding.find(name)
+        result = CGI.unescapeHTML("a&#65;b".encode(enc)) # &#65; is "A"
+        assert_equal("aAb".encode(enc), result, name) # encoding name is passed as the failure message
+        assert_equal(enc, result.encoding, name) # the input's encoding tag must survive
+      end
+    end
   end
 
   include UnescapeHTMLTests