Transform Markdown AST to strip formatting (#441)

Summary:
This makes progress on #432. We’d like to look for GitHub references
only within each text node of the Markdown AST. But there are two
complications:

  - Text nodes are split wherever inline formatting begins or ends,
    yet it’s perfectly valid for someone to write
    `*Paired* with @decentralion, but *tested* independently`, or
    `**Closes** #12345`, or something similar.

  - Sometimes contiguous blocks of text expand to multiple text nodes,
    because of how CommonMark approaches smart punctuation. For
    instance: the document `It's got "punctuation" and stuff!` has eight
    text nodes ([demo][1]).

In this commit, we introduce functions `deformat` and `coalesceText` to
solve these problems. (They go together because `coalesceText` is useful
for testing `deformat`.)

[1]: https://spec.commonmark.org/dingus/?text=It%27s%20got%20%22punctuation%22%20and%20stuff!

wchargin-branch: markdown-deformat
This commit is contained in:
William Chargin 2018-06-28 17:30:59 -07:00 committed by GitHub
parent 0cc2907e9e
commit 4ee1ed54c8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 237 additions and 0 deletions

View File

@ -0,0 +1,100 @@
// @flow
import {Node} from "commonmark";
// Closed union of the strings that may appear in a commonmark node's
// `type` field. `deformat` switches over this union and casts the
// leftover value to `empty` in its `default` branch, which is intended
// to make Flow flag any member that is added here without being handled
// there.
//
// Copied from:
// https://github.com/DefinitelyTyped/DefinitelyTyped/blob/bd35c127a6fd869ab2844082ae41047668178b7f/types/commonmark/index.d.ts#L14-L15
type NodeType =
| "text"
| "softbreak"
| "linebreak"
| "emph"
| "strong"
| "html_inline"
| "link"
| "image"
| "code"
| "document"
| "paragraph"
| "block_quote"
| "item"
| "list"
| "heading"
| "code_block"
| "html_block"
| "thematic_break"
| "custom_inline"
| "custom_block";
export function deformat(ast: Node): void {
const walker = ast.walker();
for (let step; (step = walker.next()); ) {
const node: Node = step.node;
const type: NodeType = node.type;
switch (type) {
case "text":
break;
case "softbreak": {
const space = new Node("text", node.sourcepos);
space.literal = " ";
node.insertBefore(space);
node.unlink();
break;
}
case "linebreak":
break;
case "emph":
case "strong":
case "link":
case "image":
if (!step.entering) {
// Splice out the node.
while (node.firstChild) {
node.insertBefore(node.firstChild);
}
node.unlink();
}
break;
case "html_inline":
case "code":
case "document":
case "paragraph":
case "block_quote":
case "item":
case "list":
case "heading":
case "code_block":
case "html_block":
case "thematic_break":
case "custom_inline":
case "custom_block":
break;
default:
// eslint-disable-next-line no-unused-expressions
(type: empty);
throw new Error("unexpected type: " + type);
}
}
}
export function coalesceText(ast: Node): void {
const walker = ast.walker();
let acc = [];
let firstTextNode = null;
for (let step; (step = walker.next()); ) {
const node: Node = step.node;
const type: NodeType = node.type;
if (type === "text") {
acc.push(node.literal);
if (firstTextNode == null) {
firstTextNode = node;
} else {
node.unlink();
}
} else if (firstTextNode != null) {
firstTextNode.literal = acc.join("");
acc = [];
firstTextNode = null;
}
}
}

View File

@ -0,0 +1,137 @@
// @flow
import {Node, Parser, XmlRenderer} from "commonmark";
import {deformat, coalesceText} from "./parseMarkdown";
describe("plugins/github/parseMarkdown", () => {
// Serialize an AST to XML so that two trees can be compared as strings.
// Comparing the trees directly is not an option because they may differ
// in their `sourcepos` values. As a bonus, the Jest diffs of the
// rendered XML are much easier to read.
function astContents(ast) {
  const renderer = new XmlRenderer();
  return renderer.render(ast);
}
describe("coalesceText", () => {
  it("coalesces adjacent text blocks", () => {
    // This string will parse to a paragraph with eight text nodes:
    // one for each apostrophe, quote, and exclamation mark, and one
    // for each other contiguous block of text.
    const inputString = 'It\'s got "punctuation" and stuff!';
    const ast1 = new Parser().parse(inputString);
    const ast2 = new Parser().parse(inputString);
    {
      // Build the expected tree by hand from a second parse: keep only
      // the first text node of the paragraph and give it the whole
      // input as its literal.
      const para = ast2.firstChild;
      expect(para.type).toBe("paragraph");
      const text = para.firstChild;
      expect(text.type).toBe("text");
      while (text.next) {
        text.next.unlink();
      }
      text.literal = inputString;
    }
    // Sanity check: the trees must differ before coalescing, otherwise
    // the final assertion would pass vacuously.
    expect(astContents(ast1)).not.toEqual(astContents(ast2));
    coalesceText(ast1);
    expect(astContents(ast1)).toEqual(astContents(ast2));
  });
  it("doesn't coalesce across soft breaks, hard breaks, or blocks", () => {
    // "Hello\nworld" is separated by a soft break; "world  \nfriends"
    // by a hard break. CommonMark only produces a hard break when the
    // line ending is preceded by two or more spaces (or a backslash),
    // so the two trailing spaces after "world" are significant. The
    // blank lines and the block quote separate block-level nodes.
    const inputString = "Hello\nworld  \nfriends\n\nand\n\n> foes\n";
    const ast1 = new Parser().parse(inputString);
    coalesceText(ast1);
    const ast2 = new Parser().parse(inputString);
    expect(ast1).toEqual(ast2); // even sourcepos should be the same
  });
});
describe("deformat", () => {
// The output AST of `deformat` usually includes consecutive `text`
// nodes, and therefore may not be possible to generate by directly
// parsing a given input document. For instance, deformatting the
// input `hello *world*` yields two text nodes `"hello "` and
// `"world"`, but no Markdown document parses to this same tree.
// Therefore, we include two test cases: one that directly
// constructs the expected AST (which is tedious but foolproof), and
// one that sends both the actual deformatted AST and the expected
// AST through `coalesceText`, which is easier to read and write but
// not quite as convincing a test because the output is
// post-processed.
it("works on a simple example", () => {
const ast1 = new Parser().parse("hello *world* and **f*r*iends**");
// Hand-build the expected tree: a document with one paragraph that
// holds six sibling text nodes ("hello " plus the five literals in
// the loop below).
const ast2 = (() => {
const root = new Node("document");
let cursor = root;
cursor.appendChild(new Node("paragraph"));
cursor = cursor.firstChild;
cursor.appendChild(new Node("text"));
cursor = cursor.firstChild;
cursor.literal = "hello ";
for (const lit of ["world", " and ", "f", "r", "iends"]) {
// Append each further text node right after the previous one.
cursor.insertAfter(new Node("text"));
cursor = cursor.next;
cursor.literal = lit;
}
return root;
})();
// Sanity check: the parsed tree still has its formatting nodes, so it
// must not match the expected tree yet.
expect(astContents(ast1)).not.toEqual(astContents(ast2));
deformat(ast1);
expect(astContents(ast1)).toEqual(astContents(ast2));
});
it("works on a full example", () => {
// Input document: nested emphasis, a reference-style link, inline
// code, a fenced code block, inline HTML, a list, and an image inside
// a block quote.
const ast = new Parser().parse(
[
"Hello *dear **world** of* friends",
"and everyone else, too.",
"",
"Some `code` for [you][1]:",
"",
"```markdown",
"# such *meta*",
"much wow",
"```",
"",
"[1]: https://example.com/",
"",
"Here's a list: <!-- it's a secret -->",
" - **important** things",
" - *also **important*** stuff",
" - a*b*c versus `a*b*c`",
"",
"> idea: ![lightbulb icon] never mind I forgot",
"",
"[lightbulb icon]: https://example.com/lightbulb.png",
"",
].join("\n")
);
coalesceText(ast);
// Expected document: the same content written without emphasis,
// links, or images, and with the first paragraph's line break
// written as a space. Code spans, the code block, and the inline
// HTML comment are kept verbatim.
const expected = new Parser().parse(
[
"Hello dear world of friends and everyone else, too.",
"",
"Some `code` for you:",
"",
"```markdown",
"# such *meta*",
"much wow",
"```",
"",
"Here's a list: <!-- it's a secret -->",
" - important things",
" - also important stuff",
" - abc versus `a*b*c`",
"",
"> idea: lightbulb icon never mind I forgot",
"",
].join("\n")
);
coalesceText(expected);
// Sanity check: before deformatting, the two documents must differ.
expect(astContents(ast)).not.toEqual(astContents(expected));
deformat(ast);
// `deformat` leaves adjacent text nodes behind; coalesce them so the
// tree can be compared with the parsed-and-coalesced expectation.
coalesceText(ast);
expect(astContents(ast)).toEqual(astContents(expected));
});
});
});