Extract contiguous blocks of text from Markdown (#479)

Summary: This commit exposes a function of type `(string): string[]` to encapsulate the whole Markdown pipeline, from parsing to AST transformation to text node extraction. Clients of this module do not need to know about `commonmark`. Test Plan: A single comprehensive test case has been added. wchargin-branch: text-blocks
2025-03-03 21:50:45 +00:00 · 2018-07-03 11:47:35 -07:00 · 2018-07-03 11:47:35 -07:00 · a9600d0379
commit a9600d0379
parent bc9e94b2a1
2 changed files with 79 additions and 2 deletions
--- a/src/plugins/github/parseMarkdown.js
+++ b/src/plugins/github/parseMarkdown.js
@@ -1,6 +1,44 @@
 // @flow

-import {Node} from "commonmark";
+import {Node, Parser} from "commonmark";
+
+/**
+ * Extract maximal contiguous blocks of text from a Markdown string, in
+ * source-appearance order.
+ *
+ * For the purposes of this method, code (of both the inline and block
+ * varieties) is not considered text, and will not be included in the
+ * output at all. HTML contents are similarly excluded.
+ *
+ * Normal text, emphasized/strong text, link text, and image alt text
+ * all count as text and will be included. A block of text is not
+ * required to have the same formatting: e.g., the Markdown document
+ * given by `hello *there* [you](https://example.com)` without the
+ * backticks has one contiguous block of text: `"hello there you"`.
+ *
+ * Softbreaks count as normal text, and render as a single space.
+ * Hardbreaks break a contiguous block of text.
+ *
+ * Block-level elements, such as paragraphs, lists, and block quotes,
+ * break contiguous blocks of text.
+ *
+ * See test cases for examples.
+ */
+export function textBlocks(string: string): string[] {
+  const ast = new Parser().parse(string); // parse the Markdown source into a commonmark AST
+  deformat(ast); // return value unused: relies on `ast` being rewritten in place (presumably strips formatting wrappers; see its definition)
+  coalesceText(ast); // likewise in place; per its tests, coalesces adjacent text nodes
+  const walker = ast.walker();
+  const results = [];
+  for (let step; (step = walker.next()); ) { // loop ends once `next()` yields a falsy step
+    const node: Node = step.node;
+    const type: NodeType = node.type;
+    if (type === "text") { // every other node type is skipped
+      results.push(node.literal); // each remaining text node's literal is one output block
+    }
+  }
+  return results; // in the order the walker visited the text nodes
+}

 // Copied from:
 // https://github.com/DefinitelyTyped/DefinitelyTyped/blob/bd35c127a6fd869ab2844082ae41047668178b7f/types/commonmark/index.d.ts#L14-L15
--- a/src/plugins/github/parseMarkdown.test.js
+++ b/src/plugins/github/parseMarkdown.test.js
@@ -2,7 +2,7 @@

 import {Node, Parser, XmlRenderer} from "commonmark";

-import {deformat, coalesceText} from "./parseMarkdown";
+import {coalesceText, deformat, textBlocks} from "./parseMarkdown";

 describe("plugins/github/parseMarkdown", () => {
  function astContents(ast) {
@@ -13,6 +13,45 @@ describe("plugins/github/parseMarkdown", () => {
    return new XmlRenderer().render(ast);
  }

+  describe("textBlocks", () => {
+    it("works on a full example", () => {
+      const input = [ // lines of one Markdown document, joined with "\n" below
+        "Hello *dear **world** of* friends", // nested emphasis and strong
+        "and everyone else, too.", // same paragraph, so preceded by a softbreak
+        "",
+        "Some `code` for [you][1]:", // inline code plus a reference-style link
+        "",
+        "```markdown", // fenced code block: its contents must not be reported
+        "# such *meta*",
+        "much wow",
+        "```",
+        "",
+        "[1]: https://example.com/", // link reference definition
+        "",
+        "Here's a list: <!-- it's a secret -->", // inline HTML comment
+        "  - **important** things",
+        "  - *also **important*** stuff",
+        "  - a*b*c versus `a*b*c`", // intraword emphasis vs. the same characters as code
+        "",
+        "> idea: ![lightbulb icon] never mind I forgot", // block quote containing an image
+        "",
+        "[lightbulb icon]: https://example.com/lightbulb.png",
+        "",
+      ].join("\n");
+      const expected = [ // no entries for the fenced code block or the reference definitions
+        "Hello dear world of friends and everyone else, too.", // formatting dropped; softbreak became a space
+        "Some ", // text before the inline code; the code is omitted and splits the block
+        " for you:", // text after the inline code, including the link text
+        "Here's a list: ", // HTML comment omitted; the space before it remains
+        "important things", // one block per list item
+        "also important stuff",
+        "abc versus ", // emphasized "b" kept as text; trailing inline code omitted
+        "idea: lightbulb icon never mind I forgot", // image alt text counts as text
+      ];
+      expect(textBlocks(input)).toEqual(expected);
+    });
+  });
+
  describe("coalesceText", () => {
    it("coalesces adjacent text blocks", () => {
      // This string will parse to a paragraph with eight text nodes: