Parse GitHub references from Markdown text nodes (#480)

Summary:
See #432. This commit switches the GitHub Markdown parser from matching
regular expressions against the raw post body to matching the same
regular expressions against the semantic text of a Markdown document.
See test cases for `parseReferences` and `parseMarkdown` for more
details.

There are no changes to snapshots or cached GitHub data because the
Markdown in the example repository is simple enough to be properly
parsed by a simple approach that treats everything as literal text.

The change to the “finds numeric references in a multiline string” test
case is to strip leading indentation from the string in question. As
previously written, the test had four spaces of leading indentation on
each line, causing the Markdown parser to interpret it as a code block,
in turn causing our logic to (correctly) omit it entirely. The revised
version of the test has no leading indentation.

Test Plan:
Light unit tests included; these tests are intended to verify that the
actual Markdown logic from `parseMarkdown` is being used, and that file
has more extensive tests. Additionally, `yarn travis --full` passes.

wchargin-branch: github-parse-markdown
This commit is contained in:
William Chargin 2018-07-03 11:52:27 -07:00 committed by GitHub
parent a9600d0379
commit 56d48e254c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 62 additions and 17 deletions

View File

@ -1,48 +1,61 @@
// @flow
import {textBlocks} from "./parseMarkdown";
export type ParsedReference = {|
// "@user" or "#123" or "https://github.com/owner/name/..."
+ref: string,
+refType: "BASIC" | "PAIRED_WITH",
|};
/**
* Parse GitHub references from a Markdown document, such as an issue or
* comment body. This will include references that span multiple lines
* (across softbreaks), and exclude references that occur within code
* blocks.
*/
export function parseReferences(body: string): ParsedReference[] {
// Note to maintainer: If it becomes necessary to encode references in a
// richer format, consider implementing the type signature described in
// https://github.com/sourcecred/sourcecred/pull/130#pullrequestreview-113849998
const blocks = textBlocks(body);
return [].concat.apply([], blocks.map(parseReferencesFromRawString));
}
function parseReferencesFromRawString(textBlock: string): ParsedReference[] {
return [
...findNumericReferences(body),
...findRepoNumericReferences(body),
...findGithubUrlReferences(body),
...findUsernameReferences(body),
...findNumericReferences(textBlock),
...findRepoNumericReferences(textBlock),
...findGithubUrlReferences(textBlock),
...findUsernameReferences(textBlock),
];
}
function findRepoNumericReferences(body: string): ParsedReference[] {
function findRepoNumericReferences(textBlock: string): ParsedReference[] {
return findAllMatches(
/(?:\W|^)([a-zA-Z0-9-]+\/[a-zA-Z0-9-]+#\d+)(?=\W|$)/gm,
body
textBlock
).map((x) => ({
refType: "BASIC",
ref: x[1],
}));
}
function findNumericReferences(body: string): ParsedReference[] {
return findAllMatches(/(?:\W|^)(#\d+)(?=\W|$)/gm, body).map((x) => ({
function findNumericReferences(textBlock: string): ParsedReference[] {
return findAllMatches(/(?:\W|^)(#\d+)(?=\W|$)/gm, textBlock).map((x) => ({
refType: "BASIC",
ref: x[1],
}));
}
function findUsernameReferences(body: string): ParsedReference[] {
function findUsernameReferences(textBlock: string): ParsedReference[] {
const pairedWithRefs = findAllMatches(
/(?:\W|^)(?:P|p)aired(?:-| )(?:w|W)ith:? (@[a-zA-Z0-9-]+)(?=\W|$)/gm,
body
textBlock
).map((x) => ({ref: x[1], refType: "PAIRED_WITH"}));
const basicRefs = findAllMatches(
/(?:\W|^)(@[a-zA-Z0-9-]+)(?=\W|$)/gm,
body
textBlock
).map((x) => ({ref: x[1], refType: "BASIC"}));
for (const {ref} of pairedWithRefs) {
const basicRefIndexToRemove = basicRefs.findIndex((x) => x.ref === ref);
@ -54,7 +67,7 @@ function findUsernameReferences(body: string): ParsedReference[] {
return [...pairedWithRefs, ...basicRefs];
}
function findGithubUrlReferences(body: string): ParsedReference[] {
function findGithubUrlReferences(textBlock: string): ParsedReference[] {
const githubNamePart = /(?:[a-zA-Z0-9_-]+)/.source;
const urlRegex = new RegExp(
"" +
@ -74,7 +87,7 @@ function findGithubUrlReferences(body: string): ParsedReference[] {
/(?=[^\w/]|$)/.source,
"gm"
);
return findAllMatches(urlRegex, body).map((match) => ({
return findAllMatches(urlRegex, textBlock).map((match) => ({
refType: "BASIC",
ref: match[1],
}));

View File

@ -17,10 +17,11 @@ describe("parseReferences", () => {
});
it("finds numeric references in a multiline string", () => {
const example = `
This is a multiline string.
It refers to #1. Oh, and to #2 too.
(#42 might be included too - who knows?)`;
const example = [
"This is a multiline string.",
"It refers to #1. Oh, and to #2 too.",
"(#42 might be included too - who knows?)",
].join("\n");
expect(parseReferences(example)).toEqual([
{refType: "BASIC", ref: "#1"},
{refType: "BASIC", ref: "#2"},
@ -32,6 +33,37 @@ describe("parseReferences", () => {
expect(parseReferences("foo#123 #124bar")).toHaveLength(0);
});
it("does not find references in inline code", () => {
const input = "text like `#1` means issue";
expect(parseReferences(input)).toHaveLength(0);
});
it("does not find references in inline code with lots of backticks", () => {
// An attempt to evade inline code with regular expressions might
// well fail here, because an even number of backticks appears on
// each side of the would-be reference.
const input = "text like ````#1```` means issue";
expect(parseReferences(input)).toHaveLength(0);
});
it("does not find references in indented-block code", () => {
const input = "text like\n\n #1\n\nmeans issue";
expect(parseReferences(input)).toHaveLength(0);
});
it("does not find references in fenced-block code", () => {
const input = "text like\n\n```\n#1\n```\n\nmeans issue";
expect(parseReferences(input)).toHaveLength(0);
});
it("finds references with mixed formatting", () => {
const input = "*Paired* with @alphonse, but *reviewed* by @betty.";
expect(parseReferences(input)).toEqual([
{refType: "PAIRED_WITH", ref: "@alphonse"},
{refType: "BASIC", ref: "@betty"},
]);
});
describe("cross-repo links", () => {
const repoRef = "sourcecred/sourcecred#12";
it("a bare link", () => {