Merge pull request #75 from mprobst/sanitize_test

Avoid raw mode parsing so that tags like <script> don't cause escaping
2025-02-23 00:28:25 +00:00 · 2014-05-03 15:11:41 +03:00 · 2014-05-03 15:11:41 +03:00 · 643477a051
commit 643477a051
parent 50b8e0370b 11e042f6c1
2 changed files with 20 additions and 8 deletions
--- a/inline_test.go
+++ b/inline_test.go
@@ -135,7 +135,7 @@ func TestRawHtmlTag(t *testing.T) {
 		"<p><a>xss link</a></p>\n",

 		`<IMG """><SCRIPT>alert("XSS")</SCRIPT>">`,
-		"<p><img>&lt;script&gt;alert(&amp;quot;XSS&amp;quot;)&lt;/script&gt;&#34;&gt;</p>\n",
+		"<p><img>&lt;script&gt;alert(&#34;XSS&#34;)&lt;/script&gt;&#34;&gt;</p>\n",

 		"<IMG SRC=javascript:alert(String.fromCharCode(88,83,83))>",
 		"<p><img></p>\n",
@@ -182,18 +182,14 @@ func TestRawHtmlTag(t *testing.T) {
 		`<SCRIPT/SRC="http://ha.ckers.org/xss.js"></SCRIPT>`,
 		"<p>&lt;script/SRC=&#34;http://ha.ckers.org/xss.js&#34;&gt;&lt;/script&gt;</p>\n",

-		// HTML5 interprets the <script> tag contents as raw test, thus the end
-		// result has double-escaped &amp;quot;
 		`<<SCRIPT>alert("XSS");//<</SCRIPT>`,
-		"<p>&lt;&lt;script&gt;alert(&amp;quot;XSS&amp;quot;);//&amp;lt;&lt;/script&gt;</p>\n",
+		"<p>&lt;&lt;script&gt;alert(&#34;XSS&#34;);//&lt;&lt;/script&gt;</p>\n",

-		// HTML5 parses the </p> within an unclosed <script> tag as text.
-		// Same for the following tests.
 		"<SCRIPT SRC=http://ha.ckers.org/xss.js?< B >",
-		"<p>&lt;script SRC=http://ha.ckers.org/xss.js?&lt; B &gt;&lt;/p&gt;\n",
+		"<p>&lt;script SRC=http://ha.ckers.org/xss.js?&lt; B &gt;</p>\n",

 		"<SCRIPT SRC=//ha.ckers.org/.j>",
-		"<p>&lt;script SRC=//ha.ckers.org/.j&gt;&lt;/p&gt;\n",
+		"<p>&lt;script SRC=//ha.ckers.org/.j&gt;</p>\n",

 		`<IMG SRC="javascript:alert('XSS')"`,
 		"<p>&lt;IMG SRC=&#34;javascript:alert(&#39;XSS&#39;)&#34;</p>\n",
@@ -220,11 +216,23 @@ func TestRawHtmlTag(t *testing.T) {

 func TestQuoteEscaping(t *testing.T) {
 	tests := []string{
+		// Make sure quotes are transported correctly (different entities or
+		// unicode, but correct semantics)
 		"<p>Here are some &quot;quotes&quot;.</p>\n",
 		"<p>Here are some &#34;quotes&#34;.</p>\n",

 		"<p>Here are some &ldquo;quotes&rdquo;.</p>\n",
 		"<p>Here are some \u201Cquotes\u201D.</p>\n",
+
+		// Within a <script> tag, content gets parsed by the raw text parsing rules.
+		// This test makes sure we correctly disable those parsing rules and do not
+		// escape e.g. the closing </p>.
+		`Here are <script> some "quotes".`,
+		"<p>Here are &lt;script&gt; some &#34;quotes&#34;.</p>\n",
+
+		// Same test for an unknown element that does not switch into raw mode.
+		`Here are <eviltag> some "quotes".`,
+		"<p>Here are &lt;eviltag&gt; some &#34;quotes&#34;.</p>\n",
 	}
 	doTestsInlineParam(t, tests, 0, HTML_SKIP_STYLE|HTML_SANITIZE_OUTPUT)
 }
--- a/sanitize.go
+++ b/sanitize.go
@@ -107,6 +107,10 @@ func sanitizeHtmlSafe(input []byte) []byte {
 			} else {
 				wr.WriteString(html.EscapeString(string(tokenizer.Raw())))
 			}
+			// Make sure that tags like <script> that switch the parser into raw mode
+			// do not destroy the parse mode for following HTML text (the point is to
+			// escape them anyway). For that, switch off raw mode in the tokenizer.
+			tokenizer.NextIsNotRawText()
 		case html.EndTagToken:
 			// Whitelisted tokens can be written in raw.
 			tag, _ := tokenizer.TagName()