Transform Markdown AST to strip formatting (#441)
Summary: This makes progress on #432. We’d like to look for GitHub references only within each text node of the Markdown AST. But there are two complications: - Text nodes split across formatting, and it’s valid for someone to write `*Paired* with @decentralion, but *tested* independently`, or `**Closes** #12345`, or something. - Sometimes contiguous blocks of text expand to multiple text nodes, because of how CommonMark approaches smart punctuation. For instance: the document `It's got "punctuation" and stuff!` has eight text nodes ([demo][1]). In this commit, we introduce functions `deformat` and `coalesceText` to solve these problems. (They go together because `coalesceText` is useful for testing `deformat`.) [1]: https://spec.commonmark.org/dingus/?text=It%27s%20got%20%22punctuation%22%20and%20stuff! wchargin-branch: markdown-deformat
This commit is contained in:
parent
0cc2907e9e
commit
4ee1ed54c8
|
@ -0,0 +1,100 @@
|
||||||
|
// @flow
|
||||||
|
|
||||||
|
import {Node} from "commonmark";
|
||||||
|
|
||||||
|
// The string tags that a commonmark AST node's `type` field may take.
// Declared locally so that a `switch` over `node.type` can be checked
// for exhaustiveness by Flow (see the `empty` cast in `deformat`).
// Copied from:
|
||||||
|
// https://github.com/DefinitelyTyped/DefinitelyTyped/blob/bd35c127a6fd869ab2844082ae41047668178b7f/types/commonmark/index.d.ts#L14-L15
|
||||||
|
type NodeType =
|
||||||
|
| "text"
|
||||||
|
| "softbreak"
|
||||||
|
| "linebreak"
|
||||||
|
| "emph"
|
||||||
|
| "strong"
|
||||||
|
| "html_inline"
|
||||||
|
| "link"
|
||||||
|
| "image"
|
||||||
|
| "code"
|
||||||
|
| "document"
|
||||||
|
| "paragraph"
|
||||||
|
| "block_quote"
|
||||||
|
| "item"
|
||||||
|
| "list"
|
||||||
|
| "heading"
|
||||||
|
| "code_block"
|
||||||
|
| "html_block"
|
||||||
|
| "thematic_break"
|
||||||
|
| "custom_inline"
|
||||||
|
| "custom_block";
|
||||||
|
|
||||||
|
export function deformat(ast: Node): void {
|
||||||
|
const walker = ast.walker();
|
||||||
|
for (let step; (step = walker.next()); ) {
|
||||||
|
const node: Node = step.node;
|
||||||
|
const type: NodeType = node.type;
|
||||||
|
switch (type) {
|
||||||
|
case "text":
|
||||||
|
break;
|
||||||
|
case "softbreak": {
|
||||||
|
const space = new Node("text", node.sourcepos);
|
||||||
|
space.literal = " ";
|
||||||
|
node.insertBefore(space);
|
||||||
|
node.unlink();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case "linebreak":
|
||||||
|
break;
|
||||||
|
case "emph":
|
||||||
|
case "strong":
|
||||||
|
case "link":
|
||||||
|
case "image":
|
||||||
|
if (!step.entering) {
|
||||||
|
// Splice out the node.
|
||||||
|
while (node.firstChild) {
|
||||||
|
node.insertBefore(node.firstChild);
|
||||||
|
}
|
||||||
|
node.unlink();
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case "html_inline":
|
||||||
|
case "code":
|
||||||
|
case "document":
|
||||||
|
case "paragraph":
|
||||||
|
case "block_quote":
|
||||||
|
case "item":
|
||||||
|
case "list":
|
||||||
|
case "heading":
|
||||||
|
case "code_block":
|
||||||
|
case "html_block":
|
||||||
|
case "thematic_break":
|
||||||
|
case "custom_inline":
|
||||||
|
case "custom_block":
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
// eslint-disable-next-line no-unused-expressions
|
||||||
|
(type: empty);
|
||||||
|
throw new Error("unexpected type: " + type);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
export function coalesceText(ast: Node): void {
|
||||||
|
const walker = ast.walker();
|
||||||
|
let acc = [];
|
||||||
|
let firstTextNode = null;
|
||||||
|
for (let step; (step = walker.next()); ) {
|
||||||
|
const node: Node = step.node;
|
||||||
|
const type: NodeType = node.type;
|
||||||
|
if (type === "text") {
|
||||||
|
acc.push(node.literal);
|
||||||
|
if (firstTextNode == null) {
|
||||||
|
firstTextNode = node;
|
||||||
|
} else {
|
||||||
|
node.unlink();
|
||||||
|
}
|
||||||
|
} else if (firstTextNode != null) {
|
||||||
|
firstTextNode.literal = acc.join("");
|
||||||
|
acc = [];
|
||||||
|
firstTextNode = null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,137 @@
|
||||||
|
// @flow
|
||||||
|
|
||||||
|
import {Node, Parser, XmlRenderer} from "commonmark";
|
||||||
|
|
||||||
|
import {deformat, coalesceText} from "./parseMarkdown";
|
||||||
|
|
||||||
|
describe("plugins/github/parseMarkdown", () => {
|
||||||
|
function astContents(ast) {
  // Two ASTs with the same content can still carry different
  // `sourcepos` values, which rules out a direct equality check.
  // Comparing their XML renderings sidesteps that, and as a bonus
  // gives far more legible Jest diffs on failure.
  const renderer = new XmlRenderer();
  return renderer.render(ast);
}
|
||||||
|
|
||||||
|
describe("coalesceText", () => {
|
||||||
|
it("coalesces adjacent text blocks", () => {
|
||||||
|
// This string will parse to a paragraph with eight text nodes:
|
||||||
|
// one for each apostrophe, quote, and exclamation mark, and one
|
||||||
|
// for each other contiguous block of text.
|
||||||
|
const inputString = 'It\'s got "punctuation" and stuff!';
|
||||||
|
const ast1 = new Parser().parse(inputString);
|
||||||
|
const ast2 = new Parser().parse(inputString);
|
||||||
|
{
|
||||||
|
const para = ast2.firstChild;
|
||||||
|
expect(para.type).toBe("paragraph");
|
||||||
|
const text = para.firstChild;
|
||||||
|
expect(text.type).toBe("text");
|
||||||
|
while (text.next) {
|
||||||
|
text.next.unlink();
|
||||||
|
}
|
||||||
|
text.literal = inputString;
|
||||||
|
}
|
||||||
|
expect(astContents(ast1)).not.toEqual(astContents(ast2));
|
||||||
|
coalesceText(ast1);
|
||||||
|
expect(astContents(ast1)).toEqual(astContents(ast2));
|
||||||
|
});
|
||||||
|
|
||||||
|
it("doesn't coalesce across soft breaks, hard breaks, or blocks", () => {
|
||||||
|
const inputString = "Hello\nworld \nfriends\n\nand\n\n> foes\n";
|
||||||
|
const ast1 = new Parser().parse(inputString);
|
||||||
|
coalesceText(ast1);
|
||||||
|
const ast2 = new Parser().parse(inputString);
|
||||||
|
expect(ast1).toEqual(ast2); // even sourcepos should be the same
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe("deformat", () => {
|
||||||
|
// The output AST of `deformat` usually includes consecutive `text`
|
||||||
|
// nodes, and therefore may not be possible to generate by directly
|
||||||
|
// parsing a given input document. For instance, deformatting the
|
||||||
|
// input `hello *world*` yields two text nodes `"hello "` and
|
||||||
|
// `"world"`, but no Markdown document parses to this same tree.
|
||||||
|
// Therefore, we include two test cases: one that directly
|
||||||
|
// constructs the expected AST (which is tedious but foolproof), and
|
||||||
|
// one that sends both the actual deformatted AST and the expected
|
||||||
|
// AST through `coalesceText`, which is easier to read and write but
|
||||||
|
// not quite as convincing a test because the output is
|
||||||
|
// post-processed.
|
||||||
|
|
||||||
|
it("works on a simple example", () => {
|
||||||
|
const ast1 = new Parser().parse("hello *world* and **f*r*iends**");
|
||||||
|
const ast2 = (() => {
|
||||||
|
const root = new Node("document");
|
||||||
|
let cursor = root;
|
||||||
|
cursor.appendChild(new Node("paragraph"));
|
||||||
|
cursor = cursor.firstChild;
|
||||||
|
cursor.appendChild(new Node("text"));
|
||||||
|
cursor = cursor.firstChild;
|
||||||
|
cursor.literal = "hello ";
|
||||||
|
for (const lit of ["world", " and ", "f", "r", "iends"]) {
|
||||||
|
cursor.insertAfter(new Node("text"));
|
||||||
|
cursor = cursor.next;
|
||||||
|
cursor.literal = lit;
|
||||||
|
}
|
||||||
|
return root;
|
||||||
|
})();
|
||||||
|
expect(astContents(ast1)).not.toEqual(astContents(ast2));
|
||||||
|
deformat(ast1);
|
||||||
|
expect(astContents(ast1)).toEqual(astContents(ast2));
|
||||||
|
});
|
||||||
|
|
||||||
|
it("works on a full example", () => {
|
||||||
|
const ast = new Parser().parse(
|
||||||
|
[
|
||||||
|
"Hello *dear **world** of* friends",
|
||||||
|
"and everyone else, too.",
|
||||||
|
"",
|
||||||
|
"Some `code` for [you][1]:",
|
||||||
|
"",
|
||||||
|
"```markdown",
|
||||||
|
"# such *meta*",
|
||||||
|
"much wow",
|
||||||
|
"```",
|
||||||
|
"",
|
||||||
|
"[1]: https://example.com/",
|
||||||
|
"",
|
||||||
|
"Here's a list: <!-- it's a secret -->",
|
||||||
|
" - **important** things",
|
||||||
|
" - *also **important*** stuff",
|
||||||
|
" - a*b*c versus `a*b*c`",
|
||||||
|
"",
|
||||||
|
"> idea: ![lightbulb icon] never mind I forgot",
|
||||||
|
"",
|
||||||
|
"[lightbulb icon]: https://example.com/lightbulb.png",
|
||||||
|
"",
|
||||||
|
].join("\n")
|
||||||
|
);
|
||||||
|
coalesceText(ast);
|
||||||
|
const expected = new Parser().parse(
|
||||||
|
[
|
||||||
|
"Hello dear world of friends and everyone else, too.",
|
||||||
|
"",
|
||||||
|
"Some `code` for you:",
|
||||||
|
"",
|
||||||
|
"```markdown",
|
||||||
|
"# such *meta*",
|
||||||
|
"much wow",
|
||||||
|
"```",
|
||||||
|
"",
|
||||||
|
"Here's a list: <!-- it's a secret -->",
|
||||||
|
" - important things",
|
||||||
|
" - also important stuff",
|
||||||
|
" - abc versus `a*b*c`",
|
||||||
|
"",
|
||||||
|
"> idea: lightbulb icon never mind I forgot",
|
||||||
|
"",
|
||||||
|
].join("\n")
|
||||||
|
);
|
||||||
|
coalesceText(expected);
|
||||||
|
expect(astContents(ast)).not.toEqual(astContents(expected));
|
||||||
|
deformat(ast);
|
||||||
|
coalesceText(ast);
|
||||||
|
expect(astContents(ast)).toEqual(astContents(expected));
|
||||||
|
});
|
||||||
|
});
|
||||||
|
});
|
Loading…
Reference in New Issue