Initiatives: implement "cooked HTML" template parsing (#1478)

Will return a partial Initiative, or throw when the template isn't matched.
This commit is contained in:
Robin van Boven 2020-01-07 14:15:26 +01:00 committed by GitHub
parent b05cc84f2e
commit 4ab58a09b5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 526 additions and 0 deletions

View File

@ -25,6 +25,7 @@ test_expect_success "application components must use <Link> instead of <a>" '
":(exclude,top)*/snapshots/*" \
":(exclude,top)src/plugins/discourse/references.test.js" \
":(exclude,top)src/plugins/discourse/createGraph.test.js" \
":(exclude,top)src/plugins/initiatives/htmlTemplate.test.js" \
":(exclude,top)src/webutil/Link.js" \
;
'

View File

@ -0,0 +1,215 @@
// @flow
import {DomHandler, DomUtils, Parser} from "htmlparser2";
import {type URL} from "./initiative";
/*
All headers are case-insensitive and can be h1-h6.
Headers can appear in any order.
A matching header for each field must appear exactly once.
The expected pattern for a cooked HTML template:
## Status: complete
Status value must be in the header, prefixed by "Status:".
Either "complete" or "completed". A missing status value,
or any other value is considered incomplete.
## Champions:
- [@Beanow](/u/beanow)
Any URLs that appear in the content below the "Champion" or "Champions" header.
No filters on user-like types applied here, that's left for after reference detection.
## Dependencies:
- [Dependency](/t/topic/123)
Any URLs that appear in the content below the "Dependency" or "Dependencies" header.
## References:
- [Reference](/t/topic/123)
Any URLs that appear in the content below the "Reference" or "References" header.
## Contributions:
- [Contribution](/t/topic/123)
Any URLs that appear in the content below the "Contribution" or "Contributions" header.
*/
/**
* A mapping from an HTML header, to any URLs in the body that follows it.
*
* Keys are the header's text content exactly as extracted from the DOM (not
* trimmed or case-normalized; consumers in this file trim before matching).
* Values are the href values found below that header, in document order.
* A header without any links below it maps to an empty array.
*/
type HeaderToURLsMap = Map<string, $ReadOnlyArray<URL>>;
/**
* A partial Initiative object, parsed from the Cooked HTML template.
*
* Only contains the fields that can be derived from the template headers.
*/
export type HtmlTemplateInitiativePartial = {|
// True when the "Status:" header value is "complete" or "completed".
+completed: boolean,
// URLs found below the "Dependency" / "Dependencies" header.
+dependencies: $ReadOnlyArray<URL>,
// URLs found below the "Reference" / "References" header.
+references: $ReadOnlyArray<URL>,
// URLs found below the "Contribution" / "Contributions" header.
+contributions: $ReadOnlyArray<URL>,
// URLs found below the "Champion" / "Champions" header.
+champions: $ReadOnlyArray<URL>,
|};
/**
* Attempts to parse a cooked HTML body for Initiative data.
*
* Throws when it doesn't match the template.
*/
export function parseCookedHtml(
cookedHTML: string
): HtmlTemplateInitiativePartial {
const htu: HeaderToURLsMap = groupURLsByHeader(cookedHTML);
const completed = findCompletionStatus(htu);
const champions = singleMatch(htu, new RegExp(/^Champions?/i));
const contributions = singleMatch(htu, new RegExp(/^Contributions?/i));
const dependencies = singleMatch(htu, new RegExp(/^Dependenc(y|ies)/i));
const references = singleMatch(htu, new RegExp(/^References?/i));
const missing = [];
if (completed === null) missing.push("status");
if (!champions) missing.push("champions");
if (!contributions) missing.push("contributions");
if (!dependencies) missing.push("dependencies");
if (!references) missing.push("references");
if (
completed == null ||
champions == null ||
contributions == null ||
dependencies == null ||
references == null
) {
missing.sort();
throw new Error(`Missing or malformed headers ${JSON.stringify(missing)}`);
}
return {
completed,
dependencies,
references,
contributions,
champions,
};
}
/**
* Takes cooked HTML and creates a HeaderToURLsMap.
*
* Cooked HTML being HTML rendered from Markdown. We're assuming this behaves
* a lot like a subset of HTML, even though the option to write HTML manually
* exists. For the purpose of parsing Initiative data, we can require just
* using Markdown.
*
* Will throw when there are exact duplicate headers, as this would otherwise
* silently merge by header in unexpected ways.
*/
export function groupURLsByHeader(cookedHTML: string): HeaderToURLsMap {
const map: HeaderToURLsMap = new Map();
const dom = toDOM(cookedHTML);
let currentHeader: ?string;
for (const rootEl of dom) {
switch (rootEl.name) {
case "h1":
case "h2":
case "h3":
case "h4":
case "h5":
case "h6":
currentHeader = DomUtils.getText(rootEl);
if (map.has(currentHeader)) {
throw new Error(
`Unsupported duplicate header "${currentHeader}" found`
);
}
// We're also interested in just headers, so make sure an entry exists.
map.set(currentHeader, []);
break;
case "p":
case "ul":
case "ol":
if (currentHeader === undefined) break;
const existing = map.get(currentHeader) || [];
const anchors = DomUtils.findAll((el) => el.name === "a", [rootEl]).map(
(a) => a.attribs.href
);
map.set(currentHeader, [...existing, ...anchors]);
break;
}
}
return map;
}
/**
* Finds one "Status:" header, where the value is included in the header itself.
*
* Returns true when "Status:" is followed by "completed" in the header.
* Returns false when "Status:" is followed by any other value.
* Returns null when 0 or >1 headers start with "Status:".
*/
function findCompletionStatus(map: HeaderToURLsMap): boolean | null {
const pattern = new RegExp(/^Status:(.*)/i);
const headers = Array.from(map.keys())
.map((k) => k.trim())
.filter((k) => pattern.test(k));
if (headers.length !== 1) {
return null;
}
const matches = headers[0].match(pattern);
if (matches == null) {
return null;
}
const completedRE = new RegExp(/^completed?$/i);
return completedRE.test(matches[1].trim());
}
/**
* Finds one header to match the given RegExp.
*
* Returns the associated URL[] when exactly 1 header matches.
* Returns null when it matches 0 or >1 headers.
*/
function singleMatch(
map: HeaderToURLsMap,
pattern: RegExp
): $ReadOnlyArray<URL> | null {
const headers = Array.from(map.keys()).filter((k) => pattern.test(k.trim()));
if (headers.length !== 1) {
return null;
}
return map.get(headers[0]) || null;
}
function toDOM(cookedHTML: string): Object {
// Note: DomHandler is actually synchronous, in spite of the nodeback signature.
let dom;
const domHandler = new DomHandler((err, result) => {
if (err) throw err;
dom = result;
});
const htmlParser = new Parser(domHandler);
htmlParser.write(cookedHTML);
htmlParser.end();
// The .end() forces data to be flushed, so we know DomHandler calls the callback.
// But in case some implementation detail changes, add this error.
if (dom === undefined) {
throw new Error("DomHandler callback wasn't called after htmlParser.end()");
}
return dom;
}

View File

@ -0,0 +1,310 @@
// @flow
import {groupURLsByHeader, parseCookedHtml} from "./htmlTemplate";
describe("plugins/initiatives/htmlTemplate", () => {
describe("parseCookedHtml", () => {
// Shared cooked HTML fragments, one per template section. Tests compose
// these to build complete (or deliberately incomplete) templates.

// Status header with a value other than "complete"/"completed".
const sampleStatusIncomplete = `<h2>Status: Testing</h2>`;
// Status header with a completed value, capitalized.
const sampleStatusComplete = `<h2>Status: Completed</h2>`;
// Champion section. Note the header itself contains a link; its URL is
// expected to be left out of the champions.
const sampleChampion = `
<h2>Champion<a href="https://foo.bar/t/dont-include/10"><sup>?</sup></a>:</h2>
<p>
<a class="mention" href="/u/ChampUser">@ChampUser</a>
</p>
`;
// Dependencies section with three links in an unordered list.
const sampleDependencies = `
<h2>Dependencies:</h2>
<ul>
<li><a href="https://foo.bar/t/dependency/1">Thing we need</a></li>
<li><a href="https://foo.bar/t/dependency/2">Thing we need</a></li>
<li><a href="https://foo.bar/t/dependency/3">Thing we need</a></li>
</ul>
`;
// References section with three links in an unordered list.
const sampleReferences = `
<h2>References:</h2>
<ul>
<li><a href="https://foo.bar/t/reference/4">Some reference</a></li>
<li><a href="https://foo.bar/t/reference/5/2">Some reference</a></li>
<li><a href="https://foo.bar/t/reference/6/4">Some reference</a></li>
</ul>
`;
// Contributions section, mixing forum links with a GitHub pull request link.
const sampleContributions = `
<h2>Contributions:</h2>
<ul>
<li><a href="https://foo.bar/t/contribution/7">Some contribution</a></li>
<li><a href="https://foo.bar/t/contribution/8/2">Some contribution</a></li>
<li><a href="https://github.com/sourcecred/sourcecred/pull/1416">Some contribution</a></li>
</ul>
`;
// Happy path: all five sections present exactly once. The status value
// "Testing" yields completed: false, and the link inside the Champion header
// is not part of the champions.
it("handles an example text", () => {
// Given
const sample = `
${sampleStatusIncomplete}
${sampleChampion}
${sampleDependencies}
${sampleReferences}
${sampleContributions}
`;
// When
const partial = parseCookedHtml(sample);
// Then
expect(partial).toMatchInlineSnapshot(`
Object {
"champions": Array [
"/u/ChampUser",
],
"completed": false,
"contributions": Array [
"https://foo.bar/t/contribution/7",
"https://foo.bar/t/contribution/8/2",
"https://github.com/sourcecred/sourcecred/pull/1416",
],
"dependencies": Array [
"https://foo.bar/t/dependency/1",
"https://foo.bar/t/dependency/2",
"https://foo.bar/t/dependency/3",
],
"references": Array [
"https://foo.bar/t/reference/4",
"https://foo.bar/t/reference/5/2",
"https://foo.bar/t/reference/6/4",
],
}
`);
});
// A "Status:" header without a value still counts as the status header;
// it just isn't a completed one.
it("considers blank status incomplete", () => {
  // Given
  const sample = `
    <h1>Example initiative</h1>
    <h2>Status:</h2>
    <h2>Champion:</h2>
    <h2>Dependencies:</h2>
    <h2>References:</h2>
    <h2>Contributions:</h2>
  `;
  // When
  const partial = parseCookedHtml(sample);
  // Then
  expect(partial.completed).toEqual(false);
});
// The positive case: without this, nothing verifies that a status can ever
// be reported as completed.
it("considers a 'Completed' status completed", () => {
  // Given
  const sample = `
    ${sampleStatusComplete}
    ${sampleChampion}
    ${sampleDependencies}
    ${sampleReferences}
    ${sampleContributions}
  `;
  // When
  const partial = parseCookedHtml(sample);
  // Then
  expect(partial.completed).toEqual(true);
});
// Both spellings are documented as valid, and headers are case-insensitive.
it("considers a case-insensitive 'complete' status completed", () => {
  // Given
  const sample = `
    <h2>status: COMPLETE</h2>
    <h2>Champion:</h2>
    <h2>Dependencies:</h2>
    <h2>References:</h2>
    <h2>Contributions:</h2>
  `;
  // When
  const partial = parseCookedHtml(sample);
  // Then
  expect(partial.completed).toEqual(true);
});
// With none of the template headers present, every field is reported, in
// alphabetical order.
it("throws for missing all headers", () => {
// Given
const sample = `
<h1>Example initiative</h1>
`;
// When
const fn = () => parseCookedHtml(sample);
// Then
expect(fn).toThrow(
`Missing or malformed headers ["champions","contributions","dependencies","references","status"]`
);
});
// The tests below each leave out exactly one section and expect only that
// field to be reported.
it("throws for missing status header", () => {
// Given
const sample = `
${sampleChampion}
${sampleDependencies}
${sampleReferences}
${sampleContributions}
`;
// When
const fn = () => parseCookedHtml(sample);
// Then
expect(fn).toThrow(`Missing or malformed headers ["status"]`);
});
it("throws for missing champions header", () => {
// Given
const sample = `
${sampleStatusIncomplete}
${sampleDependencies}
${sampleReferences}
${sampleContributions}
`;
// When
const fn = () => parseCookedHtml(sample);
// Then
expect(fn).toThrow(`Missing or malformed headers ["champions"]`);
});
it("throws for missing dependencies header", () => {
// Given
const sample = `
${sampleStatusIncomplete}
${sampleChampion}
${sampleReferences}
${sampleContributions}
`;
// When
const fn = () => parseCookedHtml(sample);
// Then
expect(fn).toThrow(`Missing or malformed headers ["dependencies"]`);
});
it("throws for missing references header", () => {
// Given
const sample = `
${sampleStatusIncomplete}
${sampleChampion}
${sampleDependencies}
${sampleContributions}
`;
// When
const fn = () => parseCookedHtml(sample);
// Then
expect(fn).toThrow(`Missing or malformed headers ["references"]`);
});
it("throws for missing contributions header", () => {
// Given
const sample = `
${sampleStatusIncomplete}
${sampleChampion}
${sampleDependencies}
${sampleReferences}
`;
// When
const fn = () => parseCookedHtml(sample);
// Then
expect(fn).toThrow(`Missing or malformed headers ["contributions"]`);
});
// Two status headers with different text are not exact duplicates, so they
// are reported as a malformed "status" field rather than as a duplicate header.
it("throws for conflicting status headers", () => {
// Given
const sample = `
${sampleStatusIncomplete}
${sampleStatusComplete}
${sampleChampion}
${sampleDependencies}
${sampleReferences}
${sampleContributions}
`;
// When
const fn = () => parseCookedHtml(sample);
// Then
expect(fn).toThrow(`Missing or malformed headers ["status"]`);
});
// Two headers with identical text are rejected with the duplicate header
// error instead of the missing/malformed one.
it("throws for duplicate headers", () => {
// Given
const sample = `
${sampleStatusIncomplete}
${sampleChampion}
${sampleDependencies}
${sampleDependencies}
${sampleReferences}
${sampleContributions}
`;
// When
const fn = () => parseCookedHtml(sample);
// Then
expect(fn).toThrow(`Unsupported duplicate header "Dependencies:" found`);
});
});
describe("groupURLsByHeader", () => {
// Covers the grouping rules in one sample:
// - links from several paragraphs below one header end up in the same entry,
// - nested markup in a header is flattened to its text for the key,
// - a link inside a header is not collected as a URL,
// - a root-level link outside of a paragraph or list is ignored,
// - both unordered and ordered lists are searched for links.
it("handles an example text", () => {
// Given
const sample = `
<h1>This is a title</h1>
<p>
Things to talk about.
<a href="https://foo.bar/1">With links</a>
</p>
<a href="https://foo.bar/baz">Seems unmarkdownly formatted</a>
<h2>Some <i>funky</i> section:</h2>
<p>
<a href="https://foo.bar/2">With</a>
<strong><a href="https://foo.bar/3">More</a></strong>
</p>
<p>
<a href="https://foo.bar/4">Links</a>
</p>
<h2>Listed things<a href="https://foo.bar/t/dont-include/10"><sup>?</sup></a>:</h2>
<ul>
<li><a href="https://foo.bar/5">Yet</a></li>
<li><a href="https://foo.bar/6">More</a></li>
<li><a href="https://foo.bar/7">Links</a></li>
</ul>
<h2>Ordered things:</h2>
<ol>
<li><a href="https://foo.bar/8">Yet</a></li>
<li><a href="https://foo.bar/9">More</a></li>
<li><a href="https://foo.bar/10">Links</a></li>
</ol>
`;
// When
const map = groupURLsByHeader(sample);
// Then
expect(map).toMatchInlineSnapshot(`
Map {
"This is a title" => Array [
"https://foo.bar/1",
],
"Some funky section:" => Array [
"https://foo.bar/2",
"https://foo.bar/3",
"https://foo.bar/4",
],
"Listed things?:" => Array [
"https://foo.bar/5",
"https://foo.bar/6",
"https://foo.bar/7",
],
"Ordered things:" => Array [
"https://foo.bar/8",
"https://foo.bar/9",
"https://foo.bar/10",
],
}
`);
});
// Headers with identical text would share one map key, so they are rejected.
it("throws for duplicate headers", () => {
// Given
const sample = `
<h1>This is a title</h1>
<h1>This is a title</h1>
`;
// When
const fn = () => groupURLsByHeader(sample);
// Then
expect(fn).toThrow(
`Unsupported duplicate header "This is a title" found`
);
});
});
});