diff --git a/lib/cgi/escape.rb b/lib/cgi/escape.rb
index 6748198..8668ab4 100644
--- a/lib/cgi/escape.rb
+++ b/lib/cgi/escape.rb
@@ -132,29 +132,27 @@ def unescapeHTML(string)
     end
     string = string.b
     string.gsub!(/&(apos|amp|quot|gt|lt|\#[0-9]+|\#[xX][0-9A-Fa-f]+);/) do
-      match = $1.dup
+      match = $1
       case match
       when 'apos' then "'"
       when 'amp' then '&'
       when 'quot' then '"'
       when 'gt' then '>'
       when 'lt' then '<'
-      when /\A#0*(\d+)\z/
-        n = $1.to_i
-        if n < charlimit
-          n.chr(enc)
-        else
-          "&##{$1};"
-        end
-      when /\A#x([0-9a-f]+)\z/i
-        n = $1.hex
-        if n < charlimit
-          n.chr(enc)
+      else
+        # Numeric character reference. Decode into the binary buffer so that a
+        # non-ASCII byte already present in it never triggers an encoding
+        # compatibility error on concatenation; the trailing force_encoding
+        # re-tags the whole buffer. This mirrors the C extension's
+        # optimized_unescape_html.
+        n = match.start_with?('#x', '#X') ? match[2..-1].hex : match[1..-1].to_i
+        if n >= charlimit
+          "&#{match};" # out of range: keep the reference verbatim
+        elsif charlimit > 256
+          [n].pack('U').b # UTF-8: code point bytes, surrogates included (like rb_enc_mbcput)
         else
-          "&#x#{$1};"
+          n.chr.b # ISO-8859-1 / ASCII: single byte
         end
-      else
-        "&#{match};"
       end
     end
     string.force_encoding enc
diff --git a/test/cgi/test_cgi_escape.rb b/test/cgi/test_cgi_escape.rb
index 4bf0d45..ddd692b 100644
--- a/test/cgi/test_cgi_escape.rb
+++ b/test/cgi/test_cgi_escape.rb
@@ -193,6 +193,59 @@ def test_cgi_unescapeHTML_following_number_sign
     def test_cgi_unescapeHTML_following_invalid_numeric
       assert_equal('&#1234567890;>&#x1234567890;>', CGI.unescapeHTML('&#1234567890;&gt;&#x1234567890;&gt;'))
     end
+
+    # https://github.com/ruby/cgi/issues/103
+    # A numeric character reference must decode without raising even when the
+    # surrounding text already contains non-ASCII bytes.
+    def test_cgi_unescapeHTML_nonascii_with_numeric_charref
+      assert_equal("☆ α", CGI.unescapeHTML("☆ &#945;"))
+      assert_equal("☆ α", CGI.unescapeHTML("☆ &#x3b1;"))
+      assert_equal("☆ α", CGI.unescapeHTML("☆ &#X3B1;"))
+      assert_equal("\u{1F984} α", CGI.unescapeHTML("\u{1F984} &#945;"))
+      result = CGI.unescapeHTML("☆ &#945;")
+      assert_equal(Encoding::UTF_8, result.encoding)
+      assert_predicate(result, :valid_encoding?)
+    end
+
+    def test_cgi_unescapeHTML_nonascii_with_named_charref
+      assert_equal("☆ & < > \" '", CGI.unescapeHTML("☆ &amp; &lt; &gt; &quot; &apos;"))
+    end
+
+    def test_cgi_unescapeHTML_nonascii_charref_iso_8859_1
+      input = "\xE9 &#233; &#xE9;".dup.force_encoding("ISO-8859-1")
+      expected = "\xE9 \xE9 \xE9".dup.force_encoding("ISO-8859-1")
+      assert_equal(expected, CGI.unescapeHTML(input))
+      assert_equal(Encoding::ISO_8859_1, CGI.unescapeHTML(input).encoding)
+      # A code point outside Latin-1 exceeds the charlimit and is kept verbatim.
+      verbatim = "\xE9 &#945;".dup.force_encoding("ISO-8859-1")
+      assert_equal(verbatim, CGI.unescapeHTML(verbatim))
+    end
+
+    def test_cgi_unescapeHTML_charref_out_of_range
+      assert_equal("&#x110000;", CGI.unescapeHTML("&#x110000;"))
+      assert_equal("&#1114112;", CGI.unescapeHTML("&#1114112;"))
+      # Leading zeros are preserved when kept verbatim, matching the C extension.
+      assert_equal("&#0001114112;", CGI.unescapeHTML("&#0001114112;"))
+      assert_equal("☆ &#x110000;", CGI.unescapeHTML("☆ &#x110000;"))
+    end
+
+    def test_cgi_unescapeHTML_surrogate_charref
+      # Surrogate code points are below the UTF-8 limit, so they are emitted as
+      # raw (intentionally invalid) UTF-8 bytes rather than raising, matching the
+      # C extension's rb_enc_mbcput. The result is not validated.
+      assert_equal([0xED, 0xA0, 0x80], CGI.unescapeHTML("&#xD800;").bytes)
+      assert_equal([0xED, 0xA0, 0x80], CGI.unescapeHTML("&#55296;").bytes)
+      assert_equal([0xED, 0xBF, 0xBF], CGI.unescapeHTML("&#xDFFF;").bytes)
+    end
+
+    def test_cgi_unescapeHTML_charref_preserve_encoding
+      ["UTF-8", "EUC-JP", "Windows-31J", "ISO-8859-1", "US-ASCII"].each do |name|
+        enc = Encoding.find(name)
+        result = CGI.unescapeHTML("a&#65;b".encode(enc))
+        assert_equal("aAb".encode(enc), result, name)
+        assert_equal(enc, result.encoding, name)
+      end
+    end
   end
 
   include UnescapeHTMLTests