markdown: ignore references in HTML code elements (#907)
Summary:
Fixes #903. We already ignore Markdown code syntax (backticks), but prior to this commit we treated the contents of all HTML elements, including `<code>`, as normal text. As of this commit, `<code>` elements are stripped entirely, along with everything nested inside them. The text inside other HTML elements, like `<em>`, is unaffected and is still treated as normal text.

Test Plan:
Unit tests added. Also, load data for `ipfs/js-ipfs-block-service`, and observe in the UI that PR `#36` (Update aegir to version 9.0.0) no longer has any outward references.

wchargin-branch: markdown-html-code
This commit is contained in:
parent
3e49466ad5
commit
1b09a7f61b
|
@ -1,6 +1,7 @@
|
|||
// @flow
|
||||
|
||||
import {Node, Parser} from "commonmark";
|
||||
import {OPENTAG, CLOSETAG} from "commonmark/lib/common";
|
||||
|
||||
/**
|
||||
* Extract maximal contiguous blocks of text from a Markdown string, in
|
||||
|
@ -66,9 +67,35 @@ type NodeType =
|
|||
|
||||
export function deformat(ast: Node): void {
|
||||
const walker = ast.walker();
|
||||
// We ignore the contents of HTML "code" elements and their subtrees.
|
||||
// This variable tracks how deep we are in such a tree. It is 0 if we
|
||||
// are not in such a tree, 1 if we are in a "code" element, 2 if we
|
||||
// are in an element inside a "code" element, etc.
|
||||
let htmlDepth: number = 0;
|
||||
const reOpenCodeTag = /^<code(?:$|[ >])/i;
|
||||
const reOpenTag = new RegExp(`^(?:${OPENTAG})`);
|
||||
const reCloseTag = new RegExp(`^(?:${CLOSETAG})`);
|
||||
|
||||
for (let step; (step = walker.next()); ) {
|
||||
const node: Node = step.node;
|
||||
const type: NodeType = node.type;
|
||||
if (htmlDepth > 0) {
|
||||
if (type === "html_inline") {
|
||||
if (reOpenTag.test(node.literal)) {
|
||||
htmlDepth++;
|
||||
} else if (reCloseTag.test(node.literal)) {
|
||||
htmlDepth--;
|
||||
}
|
||||
}
|
||||
// The AST walker gets into a broken state if you unlink a node
|
||||
// that has children before those children have been visited. We
|
||||
// only unlink when leaving a node, or when entering a node that
|
||||
// has no children.
|
||||
if (!step.entering || node.firstChild == null) {
|
||||
node.unlink();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
switch (type) {
|
||||
case "text":
|
||||
break;
|
||||
|
@ -94,6 +121,11 @@ export function deformat(ast: Node): void {
|
|||
}
|
||||
break;
|
||||
case "html_inline":
|
||||
if (reOpenCodeTag.test(node.literal)) {
|
||||
htmlDepth++; // should have been 0 previously
|
||||
}
|
||||
node.unlink();
|
||||
break;
|
||||
case "code":
|
||||
case "document":
|
||||
case "paragraph":
|
||||
|
|
|
@ -50,6 +50,54 @@ describe("plugins/github/parseMarkdown", () => {
|
|||
];
|
||||
expect(textBlocks(input)).toEqual(expected);
|
||||
});
|
||||
|
||||
it("includes text inside of non-code HTML elements", () => {
|
||||
const input = "My <strong>#1</strong> pal";
|
||||
expect(textBlocks(input)).toEqual(["My #1 pal"]);
|
||||
});
|
||||
|
||||
it('strips HTML "code" elements', () => {
|
||||
const input = "My <code>#1</code> pal";
|
||||
expect(textBlocks(input)).toEqual(["My pal"]);
|
||||
});
|
||||
|
||||
it('strips subtrees rooted at HTML "code" elements', () => {
|
||||
const input = "My <code>#1 <strong>*and* #2</strong></code> pals";
|
||||
expect(textBlocks(input)).toEqual(["My pals"]);
|
||||
});
|
||||
|
||||
it('strips "code" elements within "code" elements', () => {
|
||||
const input = "see <code>#1 and <code>#2</code> okay</code>";
|
||||
expect(textBlocks(input)).toEqual(["see "]);
|
||||
});
|
||||
|
||||
it('handles comments and CDATA within "code" elements', () => {
|
||||
// These are "html_inline" nodes, but are not HTML elements. They
|
||||
// may contain closing-tag sequences, but these do not actually
|
||||
// close a tag.
|
||||
const input = [
|
||||
"note",
|
||||
"<code>alpha ",
|
||||
"<!-- bravo </code> charlie --> ",
|
||||
"<![CDATA[delta </code> echo]]> ",
|
||||
"foxtrot</code> ",
|
||||
"well",
|
||||
].join("");
|
||||
expect(textBlocks(input)).toEqual(["note well"]);
|
||||
});
|
||||
|
||||
it('strips HTML "pre" blocks and subtrees', () => {
|
||||
// "pre" is not handled specially; all HTML blocks are skipped.
|
||||
const input =
|
||||
"Hello\n\n<pre>some pre-formatted <code>code</code></pre>\n\nworld";
|
||||
expect(textBlocks(input)).toEqual(["Hello", "world"]);
|
||||
});
|
||||
|
||||
it('strips non-"pre" blocks and subtrees', () => {
|
||||
const input =
|
||||
"Hello\n\n<div>some pre-formatted <code>code</code></div>\n\nworld";
|
||||
expect(textBlocks(input)).toEqual(["Hello", "world"]);
|
||||
});
|
||||
});
|
||||
|
||||
describe("coalesceText", () => {
|
||||
|
@ -134,7 +182,7 @@ describe("plugins/github/parseMarkdown", () => {
|
|||
"",
|
||||
"[1]: https://example.com/",
|
||||
"",
|
||||
"Here's a list: <!-- it's a secret -->",
|
||||
"Here's a list:<!-- it's a secret -->",
|
||||
" - **important** things",
|
||||
" - *also **important*** stuff",
|
||||
" - a*b*c versus `a*b*c`",
|
||||
|
@ -157,7 +205,7 @@ describe("plugins/github/parseMarkdown", () => {
|
|||
"much wow",
|
||||
"```",
|
||||
"",
|
||||
"Here's a list: <!-- it's a secret -->",
|
||||
"Here's a list:",
|
||||
" - important things",
|
||||
" - also important stuff",
|
||||
" - abc versus `a*b*c`",
|
||||
|
|
|
@ -38,6 +38,16 @@ describe("plugins/github/parseReferences", () => {
|
|||
expect(parseReferences(input)).toHaveLength(0);
|
||||
});
|
||||
|
||||
it("finds references in normal HTML elements", () => {
|
||||
const input = "see <em>#1, #2, and #3</em> for context";
|
||||
expect(parseReferences(input)).toHaveLength(3);
|
||||
});
|
||||
|
||||
it('does not find references in HTML "code" elements', () => {
|
||||
const input = "see <code>#1, #2, and #3</code> for context";
|
||||
expect(parseReferences(input)).toHaveLength(0);
|
||||
});
|
||||
|
||||
it("does not find references in inline code with lots of backticks", () => {
|
||||
// An attempt to evade inline code with regular expressions might
|
||||
// well fail here, because an even number of backticks appears on
|
||||
|
|
Loading…
Reference in New Issue