markdown: ignore references in HTML code elements (#907)

Summary: Fixes #903. We already ignore Markdown code syntax (backticks), but prior to this commit we treated the contents of all HTML elements, including `<code>`, as normal text. As of this commit, `<code>` elements are stripped entirely. Other HTML elements, like `<em>`, are unaffected. Test Plan: Unit tests added. Also, load data for `ipfs/js-ipfs-block-service`, and observe in the UI that PR `#36` (Update aegir to version 9.0.0) no longer has any outward references. wchargin-branch: markdown-html-code
2018-10-02 20:34:49 -07:00 · 2018-10-02 20:34:49 -07:00 · 1b09a7f61b
parent 3e49466ad5
commit 1b09a7f61b
3 changed files with 92 additions and 2 deletions
--- a/src/plugins/github/parseMarkdown.js
+++ b/src/plugins/github/parseMarkdown.js
@@ -1,6 +1,7 @@
 // @flow

 import {Node, Parser} from "commonmark";
+import {OPENTAG, CLOSETAG} from "commonmark/lib/common";

 /**
 * Extract maximal contiguous blocks of text from a Markdown string, in
@@ -66,9 +67,35 @@ type NodeType =

 export function deformat(ast: Node): void {
  const walker = ast.walker();
+  // We ignore the contents of HTML "code" elements and their subtrees.
+  // This variable tracks how deep we are in such a tree. It is 0 if we
+  // are not in such a tree, 1 if we are in a "code" element, 2 if we
+  // are in an element inside a "code" element, etc.
+  let htmlDepth: number = 0;
+  const reOpenCodeTag = /^<code(?:$|[ >])/i;
+  const reOpenTag = new RegExp(`^(?:${OPENTAG})`);
+  const reCloseTag = new RegExp(`^(?:${CLOSETAG})`);
+
  for (let step; (step = walker.next()); ) {
    const node: Node = step.node;
    const type: NodeType = node.type;
+    if (htmlDepth > 0) {
+      if (type === "html_inline") {
+        if (reOpenTag.test(node.literal)) {
+          htmlDepth++;
+        } else if (reCloseTag.test(node.literal)) {
+          htmlDepth--;
+        }
+      }
+      // The AST walker gets into a broken state if you unlink a node
+      // that has children before those children have been visited. We
+      // only unlink when leaving a node, or when entering a node that
+      // has no children.
+      if (!step.entering || node.firstChild == null) {
+        node.unlink();
+        continue;
+      }
+    }
    switch (type) {
      case "text":
        break;
@@ -94,6 +121,11 @@ export function deformat(ast: Node): void {
        }
        break;
      case "html_inline":
+        if (reOpenCodeTag.test(node.literal)) {
+          htmlDepth++; // should have been 0 previously
+        }
+        node.unlink();
+        break;
      case "code":
      case "document":
      case "paragraph":
--- a/src/plugins/github/parseMarkdown.test.js
+++ b/src/plugins/github/parseMarkdown.test.js
@@ -50,6 +50,54 @@ describe("plugins/github/parseMarkdown", () => {
      ];
      expect(textBlocks(input)).toEqual(expected);
    });
+
+    it("includes text inside of non-code HTML elements", () => {
+      const input = "My <strong>#1</strong> pal";
+      expect(textBlocks(input)).toEqual(["My #1 pal"]);
+    });
+
+    it('strips HTML "code" elements', () => {
+      const input = "My <code>#1</code> pal";
+      expect(textBlocks(input)).toEqual(["My  pal"]);
+    });
+
+    it('strips subtrees rooted at HTML "code" elements', () => {
+      const input = "My <code>#1 <strong>*and* #2</strong></code> pals";
+      expect(textBlocks(input)).toEqual(["My  pals"]);
+    });
+
+    it('strips "code" elements within "code" elements', () => {
+      const input = "see <code>#1 and <code>#2</code> okay</code>";
+      expect(textBlocks(input)).toEqual(["see "]);
+    });
+
+    it('handles comments and CDATA within "code" elements', () => {
+      // These are "html_inline" nodes, but are not HTML elements. They
+      // may contain closing-tag sequences, but these do not actually
+      // close a tag.
+      const input = [
+        "note",
+        "<code>alpha ",
+        "<!-- bravo </code> charlie --> ",
+        "<![CDATA[delta </code> echo]]> ",
+        "foxtrot</code> ",
+        "well",
+      ].join("");
+      expect(textBlocks(input)).toEqual(["note well"]);
+    });
+
+    it('strips HTML "pre" blocks and subtrees', () => {
+      // "pre" is not handled specially; all blocks are skipped.
+      const input =
+        "Hello\n\n<pre>some pre-formatted <code>code</code></pre>\n\nworld";
+      expect(textBlocks(input)).toEqual(["Hello", "world"]);
+    });
+
+    it('strips non-"pre" blocks and subtrees', () => {
+      const input =
+        "Hello\n\n<div>some pre-formatted <code>code</code></div>\n\nworld";
+      expect(textBlocks(input)).toEqual(["Hello", "world"]);
+    });
  });

  describe("coalesceText", () => {
@@ -157,7 +205,7 @@ describe("plugins/github/parseMarkdown", () => {
          "much wow",
          "```",
          "",
-          "Here's a list: <!-- it's a secret -->",
+          "Here's a list:",
          "  - important things",
          "  - also important stuff",
          "  - abc versus `a*b*c`",
--- a/src/plugins/github/parseReferences.test.js
+++ b/src/plugins/github/parseReferences.test.js
@@ -38,6 +38,16 @@ describe("plugins/github/parseReferences", () => {
    expect(parseReferences(input)).toHaveLength(0);
  });

+  it("finds references in normal HTML elements", () => {
+    const input = "see <em>#1, #2, and #3</em> for context";
+    expect(parseReferences(input)).toHaveLength(3);
+  });
+
+  it('does not find references in HTML "code" elements', () => {
+    const input = "see <code>#1, #2, and #3</code> for context";
+    expect(parseReferences(input)).toHaveLength(0);
+  });
+
  it("does not find references in inline code with lots of backticks", () => {
    // An attempt to evade inline code with regular expressions might
    // well fail here, because an even number of backticks appears on