Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 13 additions & 15 deletions lib/cgi/escape.rb
Original file line number Diff line number Diff line change
Expand Up @@ -132,29 +132,27 @@ def unescapeHTML(string)
end
string = string.b
string.gsub!(/&(apos|amp|quot|gt|lt|\#[0-9]+|\#[xX][0-9A-Fa-f]+);/) do
match = $1.dup
match = $1
case match
when 'apos' then "'"
when 'amp' then '&'
when 'quot' then '"'
when 'gt' then '>'
when 'lt' then '<'
when /\A#0*(\d+)\z/
n = $1.to_i
if n < charlimit
n.chr(enc)
else
"&##{$1};"
end
when /\A#x([0-9a-f]+)\z/i
n = $1.hex
if n < charlimit
n.chr(enc)
else
# Numeric character reference. Decode into the binary buffer so that a
# non-ASCII byte already present in it never triggers an encoding
# compatibility error on concatenation; the trailing force_encoding
# re-tags the whole buffer. This mirrors the C extension's
# optimized_unescape_html.
n = match.start_with?('#x', '#X') ? match[2..-1].hex : match[1..-1].to_i
if n >= charlimit
"&#{match};" # out of range: keep the reference verbatim
elsif charlimit > 256
[n].pack('U').b # UTF-8: code point bytes, surrogates included (like rb_enc_mbcput)
else
"&#x#{$1};"
n.chr.b # ISO-8859-1 / ASCII: single byte
end
else
"&#{match};"
end
end
string.force_encoding enc
Expand Down
53 changes: 53 additions & 0 deletions test/cgi/test_cgi_escape.rb
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,59 @@ def test_cgi_unescapeHTML_following_number_sign
# A named reference that directly follows an unterminated numeric prefix
# ("&#1114112" / "&#x110000" without a closing ";") is still decoded, and
# the prefix itself is expected back untouched.
def test_cgi_unescapeHTML_following_invalid_numeric
  escaped = '&#1114112&gt;&#x110000&gt;'
  assert_equal('&#1114112>&#x110000>', CGI.unescapeHTML(escaped))
end

# Regression test for https://github.com/ruby/cgi/issues/103.
# Decoding a numeric character reference must not raise when the
# surrounding text already holds non-ASCII characters, and the result must
# come back as valid UTF-8.
def test_cgi_unescapeHTML_nonascii_with_numeric_charref
  # Decimal, lower-case hex and upper-case hex spellings of U+03B1.
  ["&#945;", "&#x3B1;", "&#X3B1;"].each do |charref|
    assert_equal("☆ α", CGI.unescapeHTML("☆ #{charref}"))
  end
  assert_equal("\u{1F984} α", CGI.unescapeHTML("\u{1F984} &#945;"))

  decoded = CGI.unescapeHTML("☆ &#945;")
  assert_equal(Encoding::UTF_8, decoded.encoding)
  assert_predicate(decoded, :valid_encoding?)
end

# Named references must decode as usual when the input also contains a
# non-ASCII character.
def test_cgi_unescapeHTML_nonascii_with_named_charref
  escaped = "☆ &amp; &lt; &gt; &quot; &apos;"
  unescaped = "☆ & < > \" '"
  assert_equal(unescaped, CGI.unescapeHTML(escaped))
end

# Numeric references in an ISO-8859-1 string that already contains a
# high byte must decode to single bytes and keep the ISO-8859-1 encoding.
def test_cgi_unescapeHTML_nonascii_charref_iso_8859_1
  # Builds a fresh, ISO-8859-1 tagged copy of the given literal.
  latin1 = ->(literal) { literal.dup.force_encoding("ISO-8859-1") }

  input = latin1.call("\xE9 &#233; &#xE9;")
  assert_equal(latin1.call("\xE9 \xE9 \xE9"), CGI.unescapeHTML(input))
  assert_equal(Encoding::ISO_8859_1, CGI.unescapeHTML(input).encoding)

  # Code point 945 lies outside Latin-1, so the reference is expected back
  # verbatim rather than decoded.
  untouched = latin1.call("\xE9 &#945;")
  assert_equal(untouched, CGI.unescapeHTML(untouched))
end

# References whose value is beyond the last Unicode code point must be
# returned exactly as written.
def test_cgi_unescapeHTML_charref_out_of_range
  [
    "&#1114112;",
    "&#x110000;",
    # Leading zeros are preserved when kept verbatim, matching the C extension.
    "&#0011141112;",
    "☆ &#1114112;",
  ].each do |charref|
    # The expected value is copied before the call so it is a separate object
    # from the input.
    assert_equal(charref.dup, CGI.unescapeHTML(charref))
  end
end

# Surrogate code points are below the UTF-8 limit, so they are expected to
# come out as raw (intentionally invalid) UTF-8 bytes instead of raising.
# Only the bytes are compared; the result's validity is not checked.
def test_cgi_unescapeHTML_surrogate_charref
  {
    "&#xD800;" => [0xED, 0xA0, 0x80],
    "&#55296;" => [0xED, 0xA0, 0x80],
    "&#xDFFF;" => [0xED, 0xBF, 0xBF],
  }.each do |charref, raw_bytes|
    assert_equal(raw_bytes, CGI.unescapeHTML(charref).bytes)
  end
end

# For each listed encoding, decoding an ASCII-range numeric reference must
# give the expected text and leave the string's encoding unchanged.
def test_cgi_unescapeHTML_charref_preserve_encoding
  %w[UTF-8 EUC-JP Windows-31J ISO-8859-1 US-ASCII].each do |label|
    encoding = Encoding.find(label)
    decoded = CGI.unescapeHTML("a&#65;b".encode(encoding))

    # The encoding name is passed as the failure message to identify the case.
    assert_equal("aAb".encode(encoding), decoded, label)
    assert_equal(encoding, decoded.encoding, label)
  end
end
end

include UnescapeHTMLTests
Expand Down
Loading