From 4ee1ed54c8c23657f90b465a2c5355a76639ba8c Mon Sep 17 00:00:00 2001 From: William Chargin Date: Thu, 28 Jun 2018 17:30:59 -0700 Subject: [PATCH] Transform Markdown AST to strip formatting (#441) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: This makes progress on #432. We’d like to look for GitHub references only within each text node of the Markdown AST. But there are two complications: - Text nodes split across formatting, and it’s valid for someone to write `*Paired* with @decentralion, but *tested* independently`, or `**Closes** #12345`, or something. - Sometimes contiguous blocks of text expand to multiple text nodes, because of how CommonMark approaches smart punctuation. For instance: the document `It's got "punctuation" and stuff!` has eight text nodes ([demo][1]). In this commit, we introduce functions `deformat` and `coalesceText` to solve these problems. (They go together because `coalesceText` is useful for testing `deformat`.) [1]: https://spec.commonmark.org/dingus/?text=It%27s%20got%20%22punctuation%22%20and%20stuff! 
wchargin-branch: markdown-deformat --- src/v3/plugins/github/parseMarkdown.js | 100 ++++++++++++++ src/v3/plugins/github/parseMarkdown.test.js | 137 ++++++++++++++++++++ 2 files changed, 237 insertions(+) create mode 100644 src/v3/plugins/github/parseMarkdown.js create mode 100644 src/v3/plugins/github/parseMarkdown.test.js diff --git a/src/v3/plugins/github/parseMarkdown.js b/src/v3/plugins/github/parseMarkdown.js new file mode 100644 index 0000000..9a5c7c2 --- /dev/null +++ b/src/v3/plugins/github/parseMarkdown.js @@ -0,0 +1,100 @@ +// @flow + +import {Node} from "commonmark"; + +// Copied from: +// https://github.com/DefinitelyTyped/DefinitelyTyped/blob/bd35c127a6fd869ab2844082ae41047668178b7f/types/commonmark/index.d.ts#L14-L15 +type NodeType = + | "text" + | "softbreak" + | "linebreak" + | "emph" + | "strong" + | "html_inline" + | "link" + | "image" + | "code" + | "document" + | "paragraph" + | "block_quote" + | "item" + | "list" + | "heading" + | "code_block" + | "html_block" + | "thematic_break" + | "custom_inline" + | "custom_block"; + +export function deformat(ast: Node): void { + const walker = ast.walker(); + for (let step; (step = walker.next()); ) { + const node: Node = step.node; + const type: NodeType = node.type; + switch (type) { + case "text": + break; + case "softbreak": { + const space = new Node("text", node.sourcepos); + space.literal = " "; + node.insertBefore(space); + node.unlink(); + break; + } + case "linebreak": + break; + case "emph": + case "strong": + case "link": + case "image": + if (!step.entering) { + // Splice out the node. 
+ while (node.firstChild) { + node.insertBefore(node.firstChild); + } + node.unlink(); + } + break; + case "html_inline": + case "code": + case "document": + case "paragraph": + case "block_quote": + case "item": + case "list": + case "heading": + case "code_block": + case "html_block": + case "thematic_break": + case "custom_inline": + case "custom_block": + break; + default: + // eslint-disable-next-line no-unused-expressions + (type: empty); + throw new Error("unexpected type: " + type); + } + } +} + +export function coalesceText(ast: Node): void { + const walker = ast.walker(); + let acc = []; + let firstTextNode = null; + for (let step; (step = walker.next()); ) { + const node: Node = step.node; + const type: NodeType = node.type; + if (type === "text") { + acc.push(node.literal); + if (firstTextNode == null) { + firstTextNode = node; + } else { + node.unlink(); + } + } else if (firstTextNode != null) { + firstTextNode.literal = acc.join(""); + acc = []; + firstTextNode = null; + } + } +} diff --git a/src/v3/plugins/github/parseMarkdown.test.js b/src/v3/plugins/github/parseMarkdown.test.js new file mode 100644 index 0000000..95b732f --- /dev/null +++ b/src/v3/plugins/github/parseMarkdown.test.js @@ -0,0 +1,137 @@ +// @flow + +import {Node, Parser, XmlRenderer} from "commonmark"; + +import {deformat, coalesceText} from "./parseMarkdown"; + +describe("plugins/github/parseMarkdown", () => { + function astContents(ast) { + // The ASTs may differ in their `sourcepos` values, so we can't + // directly compare them for equality. Instead, we compare through + // the XML-rendered version of the tree. This has the side-effect + // that the Jest diffs are much more readable. 
+ return new XmlRenderer().render(ast); + } + + describe("coalesceText", () => { + it("coalesces adjacent text blocks", () => { + // This string will parse to a paragraph with eight text nodes: + // one for each apostrophe, quote, and exclamation mark, and one + // for each other contiguous block of text. + const inputString = 'It\'s got "punctuation" and stuff!'; + const ast1 = new Parser().parse(inputString); + const ast2 = new Parser().parse(inputString); + { + const para = ast2.firstChild; + expect(para.type).toBe("paragraph"); + const text = para.firstChild; + expect(text.type).toBe("text"); + while (text.next) { + text.next.unlink(); + } + text.literal = inputString; + } + expect(astContents(ast1)).not.toEqual(astContents(ast2)); + coalesceText(ast1); + expect(astContents(ast1)).toEqual(astContents(ast2)); + }); + + it("doesn't coalesce across soft breaks, hard breaks, or blocks", () => { + const inputString = "Hello\nworld \nfriends\n\nand\n\n> foes\n"; + const ast1 = new Parser().parse(inputString); + coalesceText(ast1); + const ast2 = new Parser().parse(inputString); + expect(ast1).toEqual(ast2); // even sourcepos should be the same + }); + }); + + describe("deformat", () => { + // The output AST of `deformat` usually includes consecutive `text` + // nodes, and therefore may not be possible to generate by directly + // parsing a given input document. For instance, deformatting the + // input `hello *world*` yields two text nodes `"hello "` and + // `"world"`, but no Markdown document parses to this same tree. + // Therefore, we include two test cases: one that directly + // constructs the expected AST (which is tedious but foolproof), and + // one that sends both the actual deformatted AST and the expected + // AST through `coalesceText`, which is easier to read and write but + // not quite as convincing a test because the output is + // post-processed. 
+ + it("works on a simple example", () => { + const ast1 = new Parser().parse("hello *world* and **f*r*iends**"); + const ast2 = (() => { + const root = new Node("document"); + let cursor = root; + cursor.appendChild(new Node("paragraph")); + cursor = cursor.firstChild; + cursor.appendChild(new Node("text")); + cursor = cursor.firstChild; + cursor.literal = "hello "; + for (const lit of ["world", " and ", "f", "r", "iends"]) { + cursor.insertAfter(new Node("text")); + cursor = cursor.next; + cursor.literal = lit; + } + return root; + })(); + expect(astContents(ast1)).not.toEqual(astContents(ast2)); + deformat(ast1); + expect(astContents(ast1)).toEqual(astContents(ast2)); + }); + + it("works on a full example", () => { + const ast = new Parser().parse( + [ + "Hello *dear **world** of* friends", + "and everyone else, too.", + "", + "Some `code` for [you][1]:", + "", + "```markdown", + "# such *meta*", + "much wow", + "```", + "", + "[1]: https://example.com/", + "", + "Here's a list: ", + " - **important** things", + " - *also **important*** stuff", + " - a*b*c versus `a*b*c`", + "", + "> idea: ![lightbulb icon] never mind I forgot", + "", + "[lightbulb icon]: https://example.com/lightbulb.png", + "", + ].join("\n") + ); + coalesceText(ast); + const expected = new Parser().parse( + [ + "Hello dear world of friends and everyone else, too.", + "", + "Some `code` for you:", + "", + "```markdown", + "# such *meta*", + "much wow", + "```", + "", + "Here's a list: ", + " - important things", + " - also important stuff", + " - abc versus `a*b*c`", + "", + "> idea: lightbulb icon never mind I forgot", + "", + ].join("\n") + ); + coalesceText(expected); + expect(astContents(ast)).not.toEqual(astContents(expected)); + deformat(ast); + coalesceText(ast); + expect(astContents(ast)).toEqual(astContents(expected)); + }); + }); +});