markdown: ignore references in HTML code elements (#907)

Summary:
Fixes #903. We already ignore Markdown code syntax (backticks), but
prior to this commit we treated the contents of all HTML elements,
including `<code>`, as normal text. As of this commit, `<code>` elements
are stripped entirely. Other HTML elements, like `<em>`, are unaffected.

Test Plan:
Unit tests added. Also, load data for `ipfs/js-ipfs-block-service`, and
observe in the UI that PR `#36` (Update aegir to version 9.0.0) no
longer has any outward references.

wchargin-branch: markdown-html-code
This commit is contained in:
William Chargin 2018-10-02 20:34:49 -07:00 committed by GitHub
parent 3e49466ad5
commit 1b09a7f61b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 92 additions and 2 deletions

View File

@ -1,6 +1,7 @@
// @flow
import {Node, Parser} from "commonmark";
import {OPENTAG, CLOSETAG} from "commonmark/lib/common";
/**
* Extract maximal contiguous blocks of text from a Markdown string, in
@ -66,9 +67,35 @@ type NodeType =
export function deformat(ast: Node): void {
const walker = ast.walker();
// We ignore the contents of HTML "code" elements and their subtrees.
// This variable tracks how deep we are in such a tree. It is 0 if we
// are not in such a tree, 1 if we are in a "code" element, 2 if we
// are in an element inside a "code" element, etc.
let htmlDepth: number = 0;
const reOpenCodeTag = /^<code(?:$|[ >])/i;
const reOpenTag = new RegExp(`^(?:${OPENTAG})`);
const reCloseTag = new RegExp(`^(?:${CLOSETAG})`);
for (let step; (step = walker.next()); ) {
const node: Node = step.node;
const type: NodeType = node.type;
if (htmlDepth > 0) {
if (type === "html_inline") {
if (reOpenTag.test(node.literal)) {
htmlDepth++;
} else if (reCloseTag.test(node.literal)) {
htmlDepth--;
}
}
// The AST walker gets into a broken state if you unlink a node
// that has children before those children have been visited. We
// only unlink when leaving a node, or when entering a node that
// has no children.
if (!step.entering || node.firstChild == null) {
node.unlink();
continue;
}
}
switch (type) {
case "text":
break;
@ -94,6 +121,11 @@ export function deformat(ast: Node): void {
}
break;
case "html_inline":
if (reOpenCodeTag.test(node.literal)) {
htmlDepth++; // should have been 0 previously
}
node.unlink();
break;
case "code":
case "document":
case "paragraph":

View File

@ -50,6 +50,54 @@ describe("plugins/github/parseMarkdown", () => {
];
expect(textBlocks(input)).toEqual(expected);
});
it("includes text inside of non-code HTML elements", () => {
  // Tags of ordinary elements such as "strong" are dropped, but the
  // text they wrap is kept as part of the surrounding block.
  const blocks = textBlocks("My <strong>#1</strong> pal");
  expect(blocks).toEqual(["My #1 pal"]);
});
it('strips HTML "code" elements', () => {
  // Only the "code" element (its tags and contents) is removed. The
  // space before it and the space after it belong to the surrounding
  // text, so the expected block contains two consecutive spaces.
  const input = "My <code>#1</code> pal";
  expect(textBlocks(input)).toEqual(["My  pal"]);
});
it('strips subtrees rooted at HTML "code" elements', () => {
  // Everything nested inside the "code" element goes away with it,
  // including the "strong" element and the Markdown emphasis. The
  // spaces on either side of the element are outside of it and are
  // both kept, hence the two consecutive spaces in the expectation.
  const input = "My <code>#1 <strong>*and* #2</strong></code> pals";
  expect(textBlocks(input)).toEqual(["My  pals"]);
});
it('strips "code" elements within "code" elements', () => {
  // The inner closing tag must not end the outer element: the text
  // " okay" still sits inside the outer "code" and is stripped too.
  const blocks = textBlocks("see <code>#1 and <code>#2</code> okay</code>");
  expect(blocks).toEqual(["see "]);
});
it('handles comments and CDATA within "code" elements', () => {
  // Comments and CDATA sections are "html_inline" nodes without being
  // HTML elements, so a closing-tag sequence appearing inside one of
  // them does not actually close the enclosing "code" element.
  const input =
    "note" +
    "<code>alpha " +
    "<!-- bravo </code> charlie --> " +
    "<![CDATA[delta </code> echo]]> " +
    "foxtrot</code> " +
    "well";
  expect(textBlocks(input)).toEqual(["note well"]);
});
it('strips HTML "pre" blocks and subtrees', () => {
  // No special-casing of "pre" is involved here: every HTML block is
  // skipped, and this one just happens to be a "pre".
  const blocks = textBlocks(
    "Hello\n\n<pre>some pre-formatted <code>code</code></pre>\n\nworld"
  );
  expect(blocks).toEqual(["Hello", "world"]);
});
it('strips non-"pre" blocks and subtrees', () => {
  // A "div" block, together with the "code" element nested in it, is
  // expected to vanish, leaving only the two surrounding paragraphs.
  const blocks = textBlocks(
    "Hello\n\n<div>some pre-formatted <code>code</code></div>\n\nworld"
  );
  expect(blocks).toEqual(["Hello", "world"]);
});
});
describe("coalesceText", () => {
@ -157,7 +205,7 @@ describe("plugins/github/parseMarkdown", () => {
"much wow",
"```",
"",
"Here's a list: <!-- it's a secret -->",
"Here's a list:",
" - important things",
" - also important stuff",
" - abc versus `a*b*c`",

View File

@ -38,6 +38,16 @@ describe("plugins/github/parseReferences", () => {
expect(parseReferences(input)).toHaveLength(0);
});
it("finds references in normal HTML elements", () => {
  // Text wrapped in an ordinary element such as "em" is still scanned,
  // so all three numeric references are expected to be found.
  const references = parseReferences("see <em>#1, #2, and #3</em> for context");
  expect(references).toHaveLength(3);
});
it('does not find references in HTML "code" elements', () => {
  // Same text as an ordinary-element case, but wrapped in "code": no
  // references should be reported from inside the element.
  const references = parseReferences(
    "see <code>#1, #2, and #3</code> for context"
  );
  expect(references).toHaveLength(0);
});
it("does not find references in inline code with lots of backticks", () => {
// An attempt to evade inline code with regular expressions might
// well fail here, because an even number of backticks appears on