From 4ee1ed54c8c23657f90b465a2c5355a76639ba8c Mon Sep 17 00:00:00 2001
From: William Chargin <wchargin@gmail.com>
Date: Thu, 28 Jun 2018 17:30:59 -0700
Subject: [PATCH] Transform Markdown AST to strip formatting (#441)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
This makes progress on #432. We’d like to look for GitHub references
only within each text node of the Markdown AST. But there are two
complications:

  - Text nodes split across formatting, and it’s valid for someone to
    write `*Paired* with @decentralion, but *tested* independently`, or
    `**Closes** #12345`, or something.

  - Sometimes contiguous blocks of text expand to multiple text nodes,
    because of how CommonMark approaches smart punctuation. For
    instance: the document `It's got "punctuation" and stuff!` has eight
    text nodes ([demo][1]).

In this commit, we introduce functions `deformat` and `coalesceText` to
solve these problems. (They go together because `coalesceText` is useful
for testing `deformat`.)

[1]: https://spec.commonmark.org/dingus/?text=It%27s%20got%20%22punctuation%22%20and%20stuff!

wchargin-branch: markdown-deformat
---
 src/v3/plugins/github/parseMarkdown.js      | 100 ++++++++++++++
 src/v3/plugins/github/parseMarkdown.test.js | 137 ++++++++++++++++++++
 2 files changed, 237 insertions(+)
 create mode 100644 src/v3/plugins/github/parseMarkdown.js
 create mode 100644 src/v3/plugins/github/parseMarkdown.test.js

diff --git a/src/v3/plugins/github/parseMarkdown.js b/src/v3/plugins/github/parseMarkdown.js
new file mode 100644
index 0000000..9a5c7c2
--- /dev/null
+++ b/src/v3/plugins/github/parseMarkdown.js
@@ -0,0 +1,100 @@
+// @flow
+
+import {Node} from "commonmark";
+
+// The values that `Node.type` is expected to take. Copied from:
+// https://github.com/DefinitelyTyped/DefinitelyTyped/blob/bd35c127a6fd869ab2844082ae41047668178b7f/types/commonmark/index.d.ts#L14-L15
+type NodeType =
+  | "text"
+  | "softbreak"
+  | "linebreak"
+  | "emph"
+  | "strong"
+  | "html_inline"
+  | "link"
+  | "image"
+  | "code"
+  | "document"
+  | "paragraph"
+  | "block_quote"
+  | "item"
+  | "list"
+  | "heading"
+  | "code_block"
+  | "html_block"
+  | "thematic_break"
+  | "custom_inline"
+  | "custom_block";
+
+export function deformat(ast: Node): void {
+  const walker = ast.walker();
+  for (let step; (step = walker.next()); ) {
+    const node: Node = step.node;
+    const type: NodeType = node.type;
+    switch (type) {
+      case "text":
+        break;
+      case "softbreak": {
+        const space = new Node("text", node.sourcepos);
+        space.literal = " ";
+        node.insertBefore(space);
+        node.unlink();
+        break;
+      }
+      case "linebreak":
+        break;
+      case "emph":
+      case "strong":
+      case "link":
+      case "image":
+        if (!step.entering) {
+          // Splice out the node.
+          while (node.firstChild) {
+            node.insertBefore(node.firstChild);
+          }
+          node.unlink();
+        }
+        break;
+      case "html_inline":
+      case "code":
+      case "document":
+      case "paragraph":
+      case "block_quote":
+      case "item":
+      case "list":
+      case "heading":
+      case "code_block":
+      case "html_block":
+      case "thematic_break":
+      case "custom_inline":
+      case "custom_block":
+        break;
+      default:
+        // eslint-disable-next-line no-unused-expressions
+        (type: empty);
+        throw new Error("unexpected type: " + type);
+    }
+  }
+}
+
+export function coalesceText(ast: Node): void {
+  const walker = ast.walker();
+  let acc = [];
+  let firstTextNode = null;
+  for (let step; (step = walker.next()); ) {
+    const node: Node = step.node;
+    const type: NodeType = node.type;
+    if (type === "text") {
+      acc.push(node.literal);
+      if (firstTextNode == null) {
+        firstTextNode = node;
+      } else {
+        node.unlink();
+      }
+    } else if (firstTextNode != null) {
+      firstTextNode.literal = acc.join("");
+      acc = [];
+      firstTextNode = null;
+    }
+  }
+}
diff --git a/src/v3/plugins/github/parseMarkdown.test.js b/src/v3/plugins/github/parseMarkdown.test.js
new file mode 100644
index 0000000..95b732f
--- /dev/null
+++ b/src/v3/plugins/github/parseMarkdown.test.js
@@ -0,0 +1,137 @@
+// @flow
+
+import {Node, Parser, XmlRenderer} from "commonmark";
+
+import {deformat, coalesceText} from "./parseMarkdown";
+
+describe("plugins/github/parseMarkdown", () => {
+  function astContents(ast) {
+    // The ASTs may differ in their `sourcepos` values, so we can't
+    // directly compare them for equality. Instead, we compare through
+    // the XML-rendered version of the tree. This has the side-effect
+    // that the Jest diffs are much more readable.
+    return new XmlRenderer().render(ast);
+  }
+
+  describe("coalesceText", () => {
+    it("coalesces adjacent text blocks", () => {
+      // This string parses to a paragraph with eight text nodes: one
+      // for each apostrophe, quote, and exclamation mark, and one for
+      // each run of text between them. `ast2` merges them by hand.
+      const inputString = 'It\'s got "punctuation" and stuff!';
+      const ast1 = new Parser().parse(inputString);
+      const ast2 = new Parser().parse(inputString);
+      {
+        const para = ast2.firstChild;
+        expect(para.type).toBe("paragraph");
+        const text = para.firstChild;
+        expect(text.type).toBe("text");
+        while (text.next) {
+          text.next.unlink();
+        }
+        text.literal = inputString;
+      }
+      expect(astContents(ast1)).not.toEqual(astContents(ast2));
+      coalesceText(ast1);
+      expect(astContents(ast1)).toEqual(astContents(ast2));
+    });
+
+    it("doesn't coalesce across soft breaks, hard breaks, or blocks", () => {
+      const inputString = "Hello\nworld  \nfriends\n\nand\n\n> foes\n";
+      const ast1 = new Parser().parse(inputString);
+      coalesceText(ast1);
+      const ast2 = new Parser().parse(inputString);
+      expect(ast1).toEqual(ast2); // even sourcepos should be the same
+    });
+  });
+
+  describe("deformat", () => {
+    // The output AST of `deformat` usually includes consecutive `text`
+    // nodes, and therefore may not be possible to generate by directly
+    // parsing a given input document. For instance, deformatting the
+    // input `hello *world*` yields two text nodes `"hello "` and
+    // `"world"`, but no Markdown document parses to this same tree.
+    // Therefore, we include two test cases: one that directly
+    // constructs the expected AST (which is tedious but foolproof), and
+    // one that sends both the actual deformatted AST and the expected
+    // AST through `coalesceText`, which is easier to read and write but
+    // not quite as convincing a test because the output is
+    // post-processed.
+
+    it("works on a simple example", () => {
+      const ast1 = new Parser().parse("hello *world* and **f*r*iends**");
+      const ast2 = (() => {
+        const root = new Node("document");
+        let cursor = root;
+        cursor.appendChild(new Node("paragraph"));
+        cursor = cursor.firstChild;
+        cursor.appendChild(new Node("text"));
+        cursor = cursor.firstChild;
+        cursor.literal = "hello ";
+        for (const lit of ["world", " and ", "f", "r", "iends"]) {
+          cursor.insertAfter(new Node("text"));
+          cursor = cursor.next;
+          cursor.literal = lit;
+        }
+        return root;
+      })();
+      expect(astContents(ast1)).not.toEqual(astContents(ast2));
+      deformat(ast1);
+      expect(astContents(ast1)).toEqual(astContents(ast2));
+    });
+
+    it("works on a full example", () => {
+      const ast = new Parser().parse(
+        [
+          "Hello *dear **world** of* friends",
+          "and everyone else, too.",
+          "",
+          "Some `code` for [you][1]:",
+          "",
+          "```markdown",
+          "# such *meta*",
+          "much wow",
+          "```",
+          "",
+          "[1]: https://example.com/",
+          "",
+          "Here's a list: <!-- it's a secret -->",
+          "  - **important** things",
+          "  - *also **important*** stuff",
+          "  - a*b*c versus `a*b*c`",
+          "",
+          "> idea: ![lightbulb icon] never mind I forgot",
+          "",
+          "[lightbulb icon]: https://example.com/lightbulb.png",
+          "",
+        ].join("\n")
+      );
+      coalesceText(ast);
+      const expected = new Parser().parse(
+        [
+          "Hello dear world of friends and everyone else, too.",
+          "",
+          "Some `code` for you:",
+          "",
+          "```markdown",
+          "# such *meta*",
+          "much wow",
+          "```",
+          "",
+          "Here's a list: <!-- it's a secret -->",
+          "  - important things",
+          "  - also important stuff",
+          "  - abc versus `a*b*c`",
+          "",
+          "> idea: lightbulb icon never mind I forgot",
+          "",
+        ].join("\n")
+      );
+      coalesceText(expected);
+      expect(astContents(ast)).not.toEqual(astContents(expected));
+      deformat(ast);
+      coalesceText(ast);
+      expect(astContents(ast)).toEqual(astContents(expected));
+    });
+  });
+});