Transform Markdown AST to strip formatting (#441)

Summary: This makes progress on #432. We’d like to look for GitHub references only within each text node of the Markdown AST. But there are two complications: - Text nodes split across formatting, and it’s valid for someone to write `*Paired* with @decentralion, but *tested* independently`, or `**Closes** #12345`, or something. - Sometimes contiguous blocks of text expand to multiple text nodes, because of how CommonMark approaches smart punctuation. For instance: the document `It's got "punctuation" and stuff!` has eight text nodes ([demo][1]). In this commit, we introduce functions `deformat` and `coalesceText` to solve these problems. (They go together because `coalesceText` is useful for testing `deformat`.) [1]: https://spec.commonmark.org/dingus/?text=It%27s%20got%20%22punctuation%22%20and%20stuff! wchargin-branch: markdown-deformat
2018-06-28 17:30:59 -07:00 · 2018-06-28 17:30:59 -07:00 · 4ee1ed54c8
parent 0cc2907e9e
commit 4ee1ed54c8
2 changed files with 237 additions and 0 deletions
--- a/src/v3/plugins/github/parseMarkdown.js
+++ b/src/v3/plugins/github/parseMarkdown.js
@ -0,0 +1,100 @@
 // @flow
 import {Node} from "commonmark";
 // The set of string tags that may appear as a node's `type`. It is
 // spelled out as a closed union so that code switching on a node type
 // (see `deformat`) can be checked for exhaustiveness.
 //
 // Copied from:
 // https://github.com/DefinitelyTyped/DefinitelyTyped/blob/bd35c127a6fd869ab2844082ae41047668178b7f/types/commonmark/index.d.ts#L14-L15
 type NodeType =
  | "text"
  | "softbreak"
  | "linebreak"
  | "emph"
  | "strong"
  | "html_inline"
  | "link"
  | "image"
  | "code"
  | "document"
  | "paragraph"
  | "block_quote"
  | "item"
  | "list"
  | "heading"
  | "code_block"
  | "html_block"
  | "thematic_break"
  | "custom_inline"
  | "custom_block";
 export function deformat(ast: Node): void {
  const walker = ast.walker();
  for (let step; (step = walker.next()); ) {
    const node: Node = step.node;
    const type: NodeType = node.type;
    switch (type) {
      case "text":
        break;
      case "softbreak": {
        const space = new Node("text", node.sourcepos);
        space.literal = " ";
        node.insertBefore(space);
        node.unlink();
        break;
      }
      case "linebreak":
        break;
      case "emph":
      case "strong":
      case "link":
      case "image":
        if (!step.entering) {
          // Splice out the node.
          while (node.firstChild) {
            node.insertBefore(node.firstChild);
          }
          node.unlink();
        }
        break;
      case "html_inline":
      case "code":
      case "document":
      case "paragraph":
      case "block_quote":
      case "item":
      case "list":
      case "heading":
      case "code_block":
      case "html_block":
      case "thematic_break":
      case "custom_inline":
      case "custom_block":
        break;
      default:
        // eslint-disable-next-line no-unused-expressions
        (type: empty);
        throw new Error("unexpected type: " + type);
    }
  }
 }
 export function coalesceText(ast: Node): void {
  const walker = ast.walker();
  let acc = [];
  let firstTextNode = null;
  for (let step; (step = walker.next()); ) {
    const node: Node = step.node;
    const type: NodeType = node.type;
    if (type === "text") {
      acc.push(node.literal);
      if (firstTextNode == null) {
        firstTextNode = node;
      } else {
        node.unlink();
      }
    } else if (firstTextNode != null) {
      firstTextNode.literal = acc.join("");
      acc = [];
      firstTextNode = null;
    }
  }
 }
--- a/src/v3/plugins/github/parseMarkdown.test.js
+++ b/src/v3/plugins/github/parseMarkdown.test.js
@ -0,0 +1,137 @@
 // @flow
 import {Node, Parser, XmlRenderer} from "commonmark";
 import {deformat, coalesceText} from "./parseMarkdown";
 // Tests for the Markdown AST helpers `coalesceText` and `deformat`.
 describe("plugins/github/parseMarkdown", () => {
  // Serializes an AST to XML so that two trees can be compared by
  // content.
  function astContents(ast) {
    // The ASTs may differ in their `sourcepos` values, so we can't
    // directly compare them for equality. Instead, we compare through
    // the XML-rendered version of the tree. This has the side-effect
    // that the Jest diffs are much more readable.
    return new XmlRenderer().render(ast);
  }
  describe("coalesceText", () => {
    it("coalesces adjacent text blocks", () => {
      // This string will parse to a paragraph with eight text nodes:
      // one for each apostrophe, quote, and exclamation mark, and one
      // for each other contiguous block of text.
      const inputString = 'It\'s got "punctuation" and stuff!';
      const ast1 = new Parser().parse(inputString);
      const ast2 = new Parser().parse(inputString);
      // Build the expected tree by hand from a second parse: keep only
      // the paragraph's first text node, drop all of its later
      // siblings, and give it the entire input as its literal.
      {
        const para = ast2.firstChild;
        expect(para.type).toBe("paragraph");
        const text = para.firstChild;
        expect(text.type).toBe("text");
        while (text.next) {
          text.next.unlink();
        }
        text.literal = inputString;
      }
      // Sanity check: the trees must differ before coalescing, or the
      // final assertion would pass trivially.
      expect(astContents(ast1)).not.toEqual(astContents(ast2));
      coalesceText(ast1);
      expect(astContents(ast1)).toEqual(astContents(ast2));
    });
    it("doesn't coalesce across soft breaks, hard breaks, or blocks", () => {
      const inputString = "Hello\nworld  \nfriends\n\nand\n\n> foes\n";
      // A fresh, untouched parse of the same input is the expected
      // result: coalescing should change nothing here.
      const ast1 = new Parser().parse(inputString);
      coalesceText(ast1);
      const ast2 = new Parser().parse(inputString);
      expect(ast1).toEqual(ast2); // even sourcepos should be the same
    });
  });
  describe("deformat", () => {
    // The output AST of `deformat` usually includes consecutive `text`
    // nodes, and therefore may not be possible to generate by directly
    // parsing a given input document. For instance, deformatting the
    // input `hello *world*` yields two text nodes `"hello "` and
    // `"world"`, but no Markdown document parses to this same tree.
    // Therefore, we include two test cases: one that directly
    // constructs the expected AST (which is tedious but foolproof), and
    // one that sends both the actual deformatted AST and the expected
    // AST through `coalesceText`, which is easier to read and write but
    // not quite as convincing a test because the output is
    // post-processed.
    it("works on a simple example", () => {
      const ast1 = new Parser().parse("hello *world* and **f*r*iends**");
      // Hand-build the expected tree: a document with one paragraph
      // that holds six sibling text nodes, one per unformatted
      // fragment of the input.
      const ast2 = (() => {
        const root = new Node("document");
        let cursor = root;
        cursor.appendChild(new Node("paragraph"));
        cursor = cursor.firstChild;
        cursor.appendChild(new Node("text"));
        cursor = cursor.firstChild;
        cursor.literal = "hello ";
        for (const lit of ["world", " and ", "f", "r", "iends"]) {
          cursor.insertAfter(new Node("text"));
          cursor = cursor.next;
          cursor.literal = lit;
        }
        return root;
      })();
      // Sanity check: the trees must differ before deformatting.
      expect(astContents(ast1)).not.toEqual(astContents(ast2));
      deformat(ast1);
      expect(astContents(ast1)).toEqual(astContents(ast2));
    });
    it("works on a full example", () => {
      const ast = new Parser().parse(
        [
          "Hello *dear **world** of* friends",
          "and everyone else, too.",
          "",
          "Some `code` for [you][1]:",
          "",
          "```markdown",
          "# such *meta*",
          "much wow",
          "```",
          "",
          "[1]: https://example.com/",
          "",
          "Here's a list: <!-- it's a secret -->",
          "  - **important** things",
          "  - *also **important*** stuff",
          "  - a*b*c versus `a*b*c`",
          "",
          "> idea: ![lightbulb icon] never mind I forgot",
          "",
          "[lightbulb icon]: https://example.com/lightbulb.png",
          "",
        ].join("\n")
      );
      // Coalesce before deformatting as well, presumably so that the
      // inequality asserted below reflects the formatting nodes rather
      // than mere differences in how text is split into nodes.
      coalesceText(ast);
      // The expected tree comes from parsing the same document written
      // without emphasis, links, images, or soft breaks.
      const expected = new Parser().parse(
        [
          "Hello dear world of friends and everyone else, too.",
          "",
          "Some `code` for you:",
          "",
          "```markdown",
          "# such *meta*",
          "much wow",
          "```",
          "",
          "Here's a list: <!-- it's a secret -->",
          "  - important things",
          "  - also important stuff",
          "  - abc versus `a*b*c`",
          "",
          "> idea: lightbulb icon never mind I forgot",
          "",
        ].join("\n")
      );
      coalesceText(expected);
      // Sanity check: the trees must differ before deformatting.
      expect(astContents(ast)).not.toEqual(astContents(expected));
      deformat(ast);
      coalesceText(ast);
      expect(astContents(ast)).toEqual(astContents(expected));
    });
  });
 });