Extract contiguous blocks of text from Markdown (#479)

Summary:
This commit exposes a function of type `(string): string[]` to
encapsulate the whole Markdown pipeline, from parsing to AST
transformation to text node extraction. Clients of this module do not
need to know about `commonmark`.

Test Plan:
A single comprehensive test case has been added.

wchargin-branch: text-blocks
This commit is contained in:
William Chargin 2018-07-03 11:47:35 -07:00 committed by GitHub
parent bc9e94b2a1
commit a9600d0379
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 79 additions and 2 deletions

View File

@ -1,6 +1,44 @@
// @flow
import {Node} from "commonmark";
import {Node, Parser} from "commonmark";
/**
* Extract maximal contiguous blocks of text from a Markdown string, in
* source-appearance order.
*
* For the purposes of this method, code (of both the inline and block
* varieties) is not considered text, and will not be included in the
* output at all. HTML contents are similarly excluded.
*
* Normal text, emphasized/strong text, link text, and image alt text
* all count as text and will be included. A block of text is not
* required to have the same formatting: e.g., the Markdown document
* given by `hello *there* [you](https://example.com)` without the
* backticks has one contiguous block of text: `"hello there you"`.
*
* Softbreaks count as normal text, and render as a single space.
* Hardbreaks break a contiguous block of text.
*
* Block-level elements, such as paragraphs, lists, and block quotes,
* break contiguous blocks of text.
*
* See test cases for examples.
*/
export function textBlocks(string: string): string[] {
const ast = new Parser().parse(string);
deformat(ast);
coalesceText(ast);
const walker = ast.walker();
const results = [];
for (let step; (step = walker.next()); ) {
const node: Node = step.node;
const type: NodeType = node.type;
if (type === "text") {
results.push(node.literal);
}
}
return results;
}
// Copied from:
// https://github.com/DefinitelyTyped/DefinitelyTyped/blob/bd35c127a6fd869ab2844082ae41047668178b7f/types/commonmark/index.d.ts#L14-L15

View File

@ -2,7 +2,7 @@
import {Node, Parser, XmlRenderer} from "commonmark";
import {deformat, coalesceText} from "./parseMarkdown";
import {coalesceText, deformat, textBlocks} from "./parseMarkdown";
describe("plugins/github/parseMarkdown", () => {
function astContents(ast) {
@ -13,6 +13,45 @@ describe("plugins/github/parseMarkdown", () => {
return new XmlRenderer().render(ast);
}
// End-to-end check of `textBlocks`: one Markdown document that mixes
// several constructs is fed in, and the exact list of extracted text
// blocks is compared against `expected`.
describe("textBlocks", () => {
it("works on a full example", () => {
// The fixture document, one array entry per source line. It contains,
// in order: a two-line paragraph with nested emphasis; a paragraph with
// inline code and a reference-style link; a fenced code block; a link
// reference definition; a paragraph with an HTML comment followed by a
// list with emphasis and inline code; and a block quote with a
// reference-style image plus its reference definition.
const input = [
"Hello *dear **world** of* friends",
"and everyone else, too.",
"",
"Some `code` for [you][1]:",
"",
"```markdown",
"# such *meta*",
"much wow",
"```",
"",
"[1]: https://example.com/",
"",
"Here's a list: <!-- it's a secret -->",
" - **important** things",
" - *also **important*** stuff",
" - a*b*c versus `a*b*c`",
"",
"> idea: ![lightbulb icon] never mind I forgot",
"",
"[lightbulb icon]: https://example.com/lightbulb.png",
"",
].join("\n");
// Expected blocks, in source order. Points worth noticing:
//   - the line break inside the first paragraph shows up as a single
//     space, and the nested emphasis markers vanish without splitting
//     the block;
//   - inline code is dropped and splits its paragraph in two
//     ("Some " / " for you:"), while the link text "you" is kept;
//   - nothing from the fenced code block or from either reference
//     definition appears;
//   - the HTML comment is dropped, leaving the trailing space in
//     "Here's a list: ";
//   - each list item is its own block;
//   - the image contributes its alt text "lightbulb icon" inline.
const expected = [
"Hello dear world of friends and everyone else, too.",
"Some ",
" for you:",
"Here's a list: ",
"important things",
"also important stuff",
"abc versus ",
"idea: lightbulb icon never mind I forgot",
];
expect(textBlocks(input)).toEqual(expected);
});
});
describe("coalesceText", () => {
it("coalesces adjacent text blocks", () => {
// This string will parse to a paragraph with eight text nodes: