Extract contiguous blocks of text from Markdown (#479)
Summary:
This commit exposes a function of type `(string): string[]` to encapsulate the whole Markdown pipeline, from parsing to AST transformation to text node extraction. Clients of this module do not need to know about `commonmark`.

Test Plan:
A single comprehensive test case has been added.

wchargin-branch: text-blocks
This commit is contained in:
parent
bc9e94b2a1
commit
a9600d0379
|
@ -1,6 +1,44 @@
|
|||
// @flow
|
||||
|
||||
import {Node} from "commonmark";
|
||||
import {Node, Parser} from "commonmark";
|
||||
|
||||
/**
|
||||
* Extract maximal contiguous blocks of text from a Markdown string, in
|
||||
* source-appearance order.
|
||||
*
|
||||
* For the purposes of this method, code (of both the inline and block
|
||||
* varieties) is not considered text, and will not be included in the
|
||||
* output at all. HTML contents are similarly excluded.
|
||||
*
|
||||
* Normal text, emphasized/strong text, link text, and image alt text
|
||||
* all count as text and will be included. A block of text is not
|
||||
* required to have the same formatting: e.g., the Markdown document
|
||||
* given by `hello *there* [you](https://example.com)` without the
|
||||
* backticks has one contiguous block of text: `"hello there you"`.
|
||||
*
|
||||
* Softbreaks count as normal text, and render as a single space.
|
||||
* Hardbreaks break a contiguous block of text.
|
||||
*
|
||||
* Block-level elements, such as paragraphs, lists, and block quotes,
|
||||
* break contiguous blocks of text.
|
||||
*
|
||||
* See test cases for examples.
|
||||
*/
|
||||
export function textBlocks(string: string): string[] {
|
||||
const ast = new Parser().parse(string);
|
||||
deformat(ast);
|
||||
coalesceText(ast);
|
||||
const walker = ast.walker();
|
||||
const results = [];
|
||||
for (let step; (step = walker.next()); ) {
|
||||
const node: Node = step.node;
|
||||
const type: NodeType = node.type;
|
||||
if (type === "text") {
|
||||
results.push(node.literal);
|
||||
}
|
||||
}
|
||||
return results;
|
||||
}
|
||||
|
||||
// Copied from:
|
||||
// https://github.com/DefinitelyTyped/DefinitelyTyped/blob/bd35c127a6fd869ab2844082ae41047668178b7f/types/commonmark/index.d.ts#L14-L15
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
import {Node, Parser, XmlRenderer} from "commonmark";
|
||||
|
||||
import {deformat, coalesceText} from "./parseMarkdown";
|
||||
import {coalesceText, deformat, textBlocks} from "./parseMarkdown";
|
||||
|
||||
describe("plugins/github/parseMarkdown", () => {
|
||||
function astContents(ast) {
|
||||
|
@ -13,6 +13,45 @@ describe("plugins/github/parseMarkdown", () => {
|
|||
return new XmlRenderer().render(ast);
|
||||
}
|
||||
|
||||
describe("textBlocks", () => {
  it("works on a full example", () => {
    // A single document that mixes nested emphasis, a softbreak, inline
    // code, a reference link, a fenced code block, an HTML comment, a
    // list, and a block quote containing an image.
    const markdownLines = [
      "Hello *dear **world** of* friends",
      "and everyone else, too.",
      "",
      "Some `code` for [you][1]:",
      "",
      "```markdown",
      "# such *meta*",
      "much wow",
      "```",
      "",
      "[1]: https://example.com/",
      "",
      "Here's a list: <!-- it's a secret -->",
      " - **important** things",
      " - *also **important*** stuff",
      " - a*b*c versus `a*b*c`",
      "",
      "> idea: ![lightbulb icon] never mind I forgot",
      "",
      "[lightbulb icon]: https://example.com/lightbulb.png",
      "",
    ];

    const expectedBlocks = [
      // Emphasis markers vanish and the softbreak becomes one space.
      "Hello dear world of friends and everyone else, too.",
      // The inline code is dropped, leaving the text on either side of
      // it as two separate blocks; the link contributes only its text.
      "Some ",
      " for you:",
      // The fenced code block and the link reference definition produce
      // no blocks. The HTML comment is dropped; the space before it stays.
      "Here's a list: ",
      // Each list item is its own block.
      "important things",
      "also important stuff",
      "abc versus ",
      // The image contributes its alt text inside the block quote.
      "idea: lightbulb icon never mind I forgot",
    ];

    const actualBlocks = textBlocks(markdownLines.join("\n"));
    expect(actualBlocks).toEqual(expectedBlocks);
  });
});
|
||||
|
||||
describe("coalesceText", () => {
|
||||
it("coalesces adjacent text blocks", () => {
|
||||
// This string will parse to a paragraph with eight text nodes:
|
||||
|
|
Loading…
Reference in New Issue