From 4ab58a09b56c1b90a85fd976e916bf241abd0990 Mon Sep 17 00:00:00 2001 From: Robin van Boven <497556+Beanow@users.noreply.github.com> Date: Tue, 7 Jan 2020 14:15:26 +0100 Subject: [PATCH] Initiatives: implement "cooked HTML" template parsing (#1478) Will return a partial Initiative, or throw when the template isn't matched. --- sharness/test_no_raw_anchor_elements.t | 1 + src/plugins/initiatives/htmlTemplate.js | 215 +++++++++++++ src/plugins/initiatives/htmlTemplate.test.js | 310 +++++++++++++++++++ 3 files changed, 526 insertions(+) create mode 100644 src/plugins/initiatives/htmlTemplate.js create mode 100644 src/plugins/initiatives/htmlTemplate.test.js diff --git a/sharness/test_no_raw_anchor_elements.t b/sharness/test_no_raw_anchor_elements.t index bb66bd6..92857be 100755 --- a/sharness/test_no_raw_anchor_elements.t +++ b/sharness/test_no_raw_anchor_elements.t @@ -25,6 +25,7 @@ test_expect_success "application components must use instead of " ' ":(exclude,top)*/snapshots/*" \ ":(exclude,top)src/plugins/discourse/references.test.js" \ ":(exclude,top)src/plugins/discourse/createGraph.test.js" \ + ":(exclude,top)src/plugins/initiatives/htmlTemplate.test.js" \ ":(exclude,top)src/webutil/Link.js" \ ; ' diff --git a/src/plugins/initiatives/htmlTemplate.js b/src/plugins/initiatives/htmlTemplate.js new file mode 100644 index 0000000..c24e524 --- /dev/null +++ b/src/plugins/initiatives/htmlTemplate.js @@ -0,0 +1,215 @@ +// @flow + +import {DomHandler, DomUtils, Parser} from "htmlparser2"; +import {type URL} from "./initiative"; + +/* +All headers are case-insensitive and can be h1-h6. +Headers can appear in any order. +A matching header for each field must appear exactly once. +The expected pattern for a cooked HTML template: + + ## Status: complete + + Status value must be in the header, prefixed by "Status:". + Either "complete" or "completed". A missing status value, + or any other value is considered incomplete. 
+ + ## Champions: + + - [@Beanow](/u/beanow) + + Any URLs that appear in the content below the "Champion" or "Champions" header. + No filters on user-like types applied here, that's left for after reference detection. + + ## Dependencies: + + - [Dependency](/t/topic/123) + + Any URLs that appear in the content below the "Dependency" or "Dependencies" header. + + ## References: + + - [Reference](/t/topic/123) + + Any URLs that appear in the content below the "Reference" or "References" header. + + ## Contributions: + + - [Contribution](/t/topic/123) + + Any URLs that appear in the content below the "Contribution" or "Contributions" header. +*/ + +/** + * A mapping from an HTML header, to any URLs in the body that follows it. + */ +type HeaderToURLsMap = Map>; + +/** + * A partial Initiative object, parsed from the Cooked HTML template. + */ +export type HtmlTemplateInitiativePartial = {| + +completed: boolean, + +dependencies: $ReadOnlyArray, + +references: $ReadOnlyArray, + +contributions: $ReadOnlyArray, + +champions: $ReadOnlyArray, +|}; + +/** + * Attempts to parse a cooked HTML body for Initiative data. + * + * Throws when it doesn't match the template. 
+ */ +export function parseCookedHtml( + cookedHTML: string +): HtmlTemplateInitiativePartial { + const htu: HeaderToURLsMap = groupURLsByHeader(cookedHTML); + const completed = findCompletionStatus(htu); + const champions = singleMatch(htu, new RegExp(/^Champions?/i)); + const contributions = singleMatch(htu, new RegExp(/^Contributions?/i)); + const dependencies = singleMatch(htu, new RegExp(/^Dependenc(y|ies)/i)); + const references = singleMatch(htu, new RegExp(/^References?/i)); + + const missing = []; + if (completed === null) missing.push("status"); + if (!champions) missing.push("champions"); + if (!contributions) missing.push("contributions"); + if (!dependencies) missing.push("dependencies"); + if (!references) missing.push("references"); + + if ( + completed == null || + champions == null || + contributions == null || + dependencies == null || + references == null + ) { + missing.sort(); + throw new Error(`Missing or malformed headers ${JSON.stringify(missing)}`); + } + + return { + completed, + dependencies, + references, + contributions, + champions, + }; +} + +/** + * Takes cooked HTML and creates a HeaderToURLsMap. + * + * Cooked HTML being HTML rendered from Markdown. We're assuming this behaves + * a lot like a subset of HTML, even though the option to write HTML manually + * exists. For the purpose of parsing Initiative data, we can require just + * using Markdown. + * + * Will throw when there are exact duplicate headers, as this would otherwise + * silently merge by header in unexpected ways. 
+ */ +export function groupURLsByHeader(cookedHTML: string): HeaderToURLsMap { + const map: HeaderToURLsMap = new Map(); + const dom = toDOM(cookedHTML); + + let currentHeader: ?string; + for (const rootEl of dom) { + switch (rootEl.name) { + case "h1": + case "h2": + case "h3": + case "h4": + case "h5": + case "h6": + currentHeader = DomUtils.getText(rootEl); + if (map.has(currentHeader)) { + throw new Error( + `Unsupported duplicate header "${currentHeader}" found` + ); + } + // We're also interested in just headers, so make sure an entry exists. + map.set(currentHeader, []); + break; + case "p": + case "ul": + case "ol": + if (currentHeader === undefined) break; + const existing = map.get(currentHeader) || []; + const anchors = DomUtils.findAll((el) => el.name === "a", [rootEl]).map( + (a) => a.attribs.href + ); + map.set(currentHeader, [...existing, ...anchors]); + break; + } + } + + return map; +} + +/** + * Finds one "Status:" header, where the value is included in the header itself. + * + * Returns true when "Status:" is followed by "completed" in the header. + * Returns false when "Status:" is followed by any other value. + * Returns null when 0 or >1 headers start with "Status:". + */ +function findCompletionStatus(map: HeaderToURLsMap): boolean | null { + const pattern = new RegExp(/^Status:(.*)/i); + const headers = Array.from(map.keys()) + .map((k) => k.trim()) + .filter((k) => pattern.test(k)); + + if (headers.length !== 1) { + return null; + } + + const matches = headers[0].match(pattern); + if (matches == null) { + return null; + } + + const completedRE = new RegExp(/^completed?$/i); + return completedRE.test(matches[1].trim()); +} + +/** + * Finds one header to match the given RegExp. + * + * Returns the associated URL[] when exactly 1 header matches. + * Returns null when it matches 0 or >1 headers. 
+ */ +function singleMatch( + map: HeaderToURLsMap, + pattern: RegExp +): $ReadOnlyArray | null { + const headers = Array.from(map.keys()).filter((k) => pattern.test(k.trim())); + + if (headers.length !== 1) { + return null; + } + + return map.get(headers[0]) || null; +} + +function toDOM(cookedHTML: string): Object { + // Note: DomHandler is actually synchronous, in spite of the nodeback signature. + let dom; + const domHandler = new DomHandler((err, result) => { + if (err) throw err; + dom = result; + }); + + const htmlParser = new Parser(domHandler); + htmlParser.write(cookedHTML); + htmlParser.end(); + + // The .end() forces data to be flushed, so we know DomHandler calls the callback. + // But in case some implementation detail changes, add this error. + if (dom === undefined) { + throw new Error("DomHandler callback wasn't called after htmlParser.end()"); + } + + return dom; +} diff --git a/src/plugins/initiatives/htmlTemplate.test.js b/src/plugins/initiatives/htmlTemplate.test.js new file mode 100644 index 0000000..9d28c62 --- /dev/null +++ b/src/plugins/initiatives/htmlTemplate.test.js @@ -0,0 +1,310 @@ +// @flow + +import {groupURLsByHeader, parseCookedHtml} from "./htmlTemplate"; + +describe("plugins/initiatives/htmlTemplate", () => { + describe("parseCookedHtml", () => { + const sampleStatusIncomplete = `

Status: Testing

`; + const sampleStatusComplete = `

Status: Completed

`; + const sampleChampion = ` +

Champion?:

+

+ @ChampUser +

+ `; + const sampleDependencies = ` +

Dependencies:

+ + `; + const sampleReferences = ` +

References:

+ + `; + const sampleContributions = ` +

Contributions:

+ + `; + + it("handles an example text", () => { + // Given + const sample = ` + ${sampleStatusIncomplete} + ${sampleChampion} + ${sampleDependencies} + ${sampleReferences} + ${sampleContributions} + `; + + // When + const partial = parseCookedHtml(sample); + + // Then + expect(partial).toMatchInlineSnapshot(` + Object { + "champions": Array [ + "/u/ChampUser", + ], + "completed": false, + "contributions": Array [ + "https://foo.bar/t/contribution/7", + "https://foo.bar/t/contribution/8/2", + "https://github.com/sourcecred/sourcecred/pull/1416", + ], + "dependencies": Array [ + "https://foo.bar/t/dependency/1", + "https://foo.bar/t/dependency/2", + "https://foo.bar/t/dependency/3", + ], + "references": Array [ + "https://foo.bar/t/reference/4", + "https://foo.bar/t/reference/5/2", + "https://foo.bar/t/reference/6/4", + ], + } + `); + }); + + it("considers blank status incomplete", () => { + // Given + const sample = ` +

Example initiative

+

Status:

+

Champion:

+

Dependencies:

+

References:

+

Contributions:

+ `; + + // When + const partial = parseCookedHtml(sample); + + // Then + expect(partial.completed).toEqual(false); + }); + + it("throws for missing all headers", () => { + // Given + const sample = ` +

Example initiative

+ `; + + // When + const fn = () => parseCookedHtml(sample); + + // Then + expect(fn).toThrow( + `Missing or malformed headers ["champions","contributions","dependencies","references","status"]` + ); + }); + + it("throws for missing status header", () => { + // Given + const sample = ` + + ${sampleChampion} + ${sampleDependencies} + ${sampleReferences} + ${sampleContributions} + `; + + // When + const fn = () => parseCookedHtml(sample); + + // Then + expect(fn).toThrow(`Missing or malformed headers ["status"]`); + }); + + it("throws for missing champions header", () => { + // Given + const sample = ` + ${sampleStatusIncomplete} + + ${sampleDependencies} + ${sampleReferences} + ${sampleContributions} + `; + + // When + const fn = () => parseCookedHtml(sample); + + // Then + expect(fn).toThrow(`Missing or malformed headers ["champions"]`); + }); + + it("throws for missing dependencies header", () => { + // Given + const sample = ` + ${sampleStatusIncomplete} + ${sampleChampion} + + ${sampleReferences} + ${sampleContributions} + `; + + // When + const fn = () => parseCookedHtml(sample); + + // Then + expect(fn).toThrow(`Missing or malformed headers ["dependencies"]`); + }); + + it("throws for missing references header", () => { + // Given + const sample = ` + ${sampleStatusIncomplete} + ${sampleChampion} + ${sampleDependencies} + + ${sampleContributions} + `; + + // When + const fn = () => parseCookedHtml(sample); + + // Then + expect(fn).toThrow(`Missing or malformed headers ["references"]`); + }); + + it("throws for missing contributions header", () => { + // Given + const sample = ` + ${sampleStatusIncomplete} + ${sampleChampion} + ${sampleDependencies} + ${sampleReferences} + + `; + + // When + const fn = () => parseCookedHtml(sample); + + // Then + expect(fn).toThrow(`Missing or malformed headers ["contributions"]`); + }); + + it("throws for conflicting status headers", () => { + // Given + const sample = ` + ${sampleStatusIncomplete} + ${sampleStatusComplete} + 
${sampleChampion} + ${sampleDependencies} + ${sampleReferences} + ${sampleContributions} + `; + + // When + const fn = () => parseCookedHtml(sample); + + // Then + expect(fn).toThrow(`Missing or malformed headers ["status"]`); + }); + + it("throws for duplicate headers", () => { + // Given + const sample = ` + ${sampleStatusIncomplete} + ${sampleChampion} + ${sampleDependencies} + ${sampleDependencies} + ${sampleReferences} + ${sampleContributions} + `; + + // When + const fn = () => parseCookedHtml(sample); + + // Then + expect(fn).toThrow(`Unsupported duplicate header "Dependencies:" found`); + }); + }); + + describe("groupURLsByHeader", () => { + it("handles an example text", () => { + // Given + const sample = ` +

This is a title

+

+ Things to talk about. + With links +

+ Seems unmarkdownly formatted +

Some funky section:

+

+ With + More +

+

+ Links +

+

Listed things?:

+ +

Ordered things:

+
    +
  1. Yet
  2. +
  3. More
  4. +
  5. Links
  6. +
+ `; + + // When + const map = groupURLsByHeader(sample); + + // Then + expect(map).toMatchInlineSnapshot(` + Map { + "This is a title" => Array [ + "https://foo.bar/1", + ], + "Some funky section:" => Array [ + "https://foo.bar/2", + "https://foo.bar/3", + "https://foo.bar/4", + ], + "Listed things?:" => Array [ + "https://foo.bar/5", + "https://foo.bar/6", + "https://foo.bar/7", + ], + "Ordered things:" => Array [ + "https://foo.bar/8", + "https://foo.bar/9", + "https://foo.bar/10", + ], + } + `); + }); + + it("throws for duplicate headers", () => { + // Given + const sample = ` +

This is a title

+

This is a title

+ `; + + // When + const fn = () => groupURLsByHeader(sample); + + // Then + expect(fn).toThrow( + `Unsupported duplicate header "This is a title" found` + ); + }); + }); +});