Parse Discourse references from hyperlinks (#1405)

The `discourse/references` module now exports a `linksToReferences`
function, which takes an array of hyperlinks and returns the Discourse
references (topics, posts, and user mentions) parsed from them. The
function is covered by unit tests.

Test plan: Unit tests added; `yarn test` passes.

This is progress towards [Discourse reference and mention detection][1].

[1]: https://discourse.sourcecred.io/t/discourse-reference-mention-detection/270
This commit is contained in:
Dandelion Mané 2019-10-16 18:39:46 -06:00 committed by GitHub
parent f725f7c47a
commit 78c34b5a36
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 181 additions and 3 deletions

View File

@ -2,9 +2,34 @@
const htmlparser2 = require("htmlparser2");
export type Hyperlink = string;
import {type TopicId} from "./fetch";
export function parseLinks(cookedHtml: string): Hyperlink[] {
/**
 * A reference to a specific post inside a topic.
 *
 * `linksToReferences` produces these from URLs shaped like
 * `<serverUrl>/t/<slug>/<topicId>/<postIndex>`.
 */
export type DiscoursePostReference = {|
+type: "POST",
// Numeric topic id taken from the URL path.
+topicId: TopicId,
// The number that follows the topic id in the URL path.
+postIndex: number,
// Scheme plus host of the Discourse instance, e.g. "https://discourse.sourcecred.io".
+serverUrl: string,
|};
/**
 * A reference to a topic as a whole.
 *
 * `linksToReferences` produces these from URLs shaped like
 * `<serverUrl>/t/<slug>/<topicId>`.
 */
export type DiscourseTopicReference = {|
+type: "TOPIC",
// Numeric topic id taken from the URL path.
+topicId: TopicId,
// Scheme plus host of the Discourse instance, e.g. "https://discourse.sourcecred.io".
+serverUrl: string,
|};
/**
 * A reference to a user (a mention).
 *
 * `linksToReferences` produces these from URLs shaped like
 * `<serverUrl>/u/<username>`.
 */
export type DiscourseUserReference = {|
+type: "USER",
// The username segment of the URL, kept as a string.
+username: string,
// Scheme plus host of the Discourse instance, e.g. "https://discourse.sourcecred.io".
+serverUrl: string,
|};
/**
 * Any reference that `linksToReferences` can produce. The `type` field
 * ("POST" | "TOPIC" | "USER") tells the variants apart.
 */
export type DiscourseReference =
| DiscoursePostReference
| DiscourseTopicReference
| DiscourseUserReference;
// A string intended to hold a URL. This is a plain alias of `string`, so the
// type checker does not verify that the value is actually a well-formed URL.
export type UrlString = string;
export function parseLinks(cookedHtml: string): UrlString[] {
const links = [];
const httpRegex = /^https?:\/\//;
const parser = new htmlparser2.Parser({
@ -23,3 +48,38 @@ export function parseLinks(cookedHtml: string): Hyperlink[] {
parser.end();
return links;
}
export function linksToReferences(
links: $ReadOnlyArray<UrlString>
): DiscourseReference[] {
const server = "(https://[\\w.-]+)";
const topic = `(?:${server})/t/[\\w-]+/(\\d+)`;
const post = `(?:${topic})/(\\d+)`;
const params = "(?:\\?[\\w-=]+)?";
const topicRegex = new RegExp(`^(?:${topic})(?:${params})/?$`);
const postRegex = new RegExp(`^(?:${post})(?:${params})/?$`);
const userRegex = new RegExp(`^(?:${server})/u/([\\w-]+)(?:${params})/?$`);
const references: DiscourseReference[] = [];
for (const link of links) {
let match = null;
const decoded = decodeURI(link);
if ((match = decoded.match(postRegex))) {
references.push({
type: "POST",
topicId: +match[2],
serverUrl: match[1],
postIndex: +match[3],
});
} else if ((match = decoded.match(topicRegex))) {
references.push({type: "TOPIC", topicId: +match[2], serverUrl: match[1]});
} else if ((match = decoded.match(userRegex))) {
references.push({
type: "USER",
username: match[2],
serverUrl: match[1],
});
}
}
return references;
}

View File

@ -1,6 +1,6 @@
// @flow
import {parseLinks} from "./references";
import {parseLinks, linksToReferences} from "./references";
describe("plugins/discourse/references", () => {
describe("parseLinks", () => {
@ -27,4 +27,122 @@ describe("plugins/discourse/references", () => {
expect(parseLinks(`<a href="#foo">A Link</a>`)).toEqual([]);
});
});
// Unit tests for linksToReferences: one case per reference kind, one for
// inputs that must be rejected, and a snapshot over a sample of real links.
describe("linksToReferences", () => {
it("works for topics", () => {
// The bare URL, the trailing-slash form, and the form with a query
// parameter are all expected to yield the same topic reference.
const hyperlinks = [
"https://sourcecred-test.discourse.group/t/123-a-post-with-numbers-in-slug/20",
"https://sourcecred-test.discourse.group/t/123-a-post-with-numbers-in-slug/20/",
"https://sourcecred-test.discourse.group/t/123-a-post-with-numbers-in-slug/20?u=d11",
];
const reference = {
type: "TOPIC",
topicId: 20,
serverUrl: "https://sourcecred-test.discourse.group",
};
expect(linksToReferences(hyperlinks)).toEqual([
reference,
reference,
reference,
]);
});
it("works for posts", () => {
// Same three URL variants as the topic case, with a post index appended
// after the topic id.
const hyperlinks = [
"https://sourcecred-test.discourse.group/t/my-first-test-post/11/2?u=d11",
"https://sourcecred-test.discourse.group/t/my-first-test-post/11/2/",
"https://sourcecred-test.discourse.group/t/my-first-test-post/11/2",
];
const reference = {
type: "POST",
topicId: 11,
postIndex: 2,
serverUrl: "https://sourcecred-test.discourse.group",
};
expect(linksToReferences(hyperlinks)).toEqual([
reference,
reference,
reference,
]);
});
it("works for mentions", () => {
// A /u/<username> link is expected to yield a USER reference.
const hyperlinks = ["https://sourcecred-test.discourse.group/u/d11"];
const reference = {
type: "USER",
username: "d11",
serverUrl: "https://sourcecred-test.discourse.group",
};
expect(linksToReferences(hyperlinks)).toEqual([reference]);
});
it("doesn't find bad or malformed references", () => {
const hyperlinks = [
// Not a reference to anything in particular.
"https://sourcecred-test.discourse.group",
// No https == no go. We can be more permissive if needed.
"sourcecred-test.discourse.group/t/foo/120",
// There's a space at the front.
" https://sourcecred-test.discourse.group/t/foo/120",
// Unexpected trailing stuff.
"https://sourcecred-test.discourse.group/t/foo/120$$",
];
expect(linksToReferences(hyperlinks)).toEqual([]);
});
it("works on a snapshot corpus", () => {
const hyperlinks = [
"https://discourse.sourcecred.io/t/experiment-sourcecred-stack-lookup/287/4",
"https://discourse.sourcecred.io/t/experiment-sourcecred-stack-lookup/287/4?u=decentralion",
"https://talk.observablehq.com/t/having-some-trouble-with-d3-dragging/776",
"https://talk.observablehq.com/t/package-integrity-and-yarn-lock-package-lock-json/2300/6",
// This topic has non-ASCII characters in the topic name; it seems that
// this particular Discourse instance filtered them out, leaving a
// neutral topic slug.
"https://forums.eveonline.com/t/topic/195153",
// Shouldn't necessarily get a reference, since @-references generate
// links that do not have the /summary suffix.
"https://forums.eveonline.com/u/dorian_neil/summary",
"https://discourse.sourcecred.io/u/decentralion",
];
// Parse each link on its own so the snapshot maps every URL to the
// reference found for it, or to undefined when none is found.
const hyperlinkToReference = {};
for (const hyperlink of hyperlinks) {
hyperlinkToReference[hyperlink] = linksToReferences([hyperlink])[0];
}
expect(hyperlinkToReference).toMatchInlineSnapshot(`
Object {
"https://discourse.sourcecred.io/t/experiment-sourcecred-stack-lookup/287/4": Object {
"postIndex": 4,
"serverUrl": "https://discourse.sourcecred.io",
"topicId": 287,
"type": "POST",
},
"https://discourse.sourcecred.io/t/experiment-sourcecred-stack-lookup/287/4?u=decentralion": Object {
"postIndex": 4,
"serverUrl": "https://discourse.sourcecred.io",
"topicId": 287,
"type": "POST",
},
"https://discourse.sourcecred.io/u/decentralion": Object {
"serverUrl": "https://discourse.sourcecred.io",
"type": "USER",
"username": "decentralion",
},
"https://forums.eveonline.com/t/topic/195153": Object {
"serverUrl": "https://forums.eveonline.com",
"topicId": 195153,
"type": "TOPIC",
},
"https://forums.eveonline.com/u/dorian_neil/summary": undefined,
"https://talk.observablehq.com/t/having-some-trouble-with-d3-dragging/776": Object {
"serverUrl": "https://talk.observablehq.com",
"topicId": 776,
"type": "TOPIC",
},
"https://talk.observablehq.com/t/package-integrity-and-yarn-lock-package-lock-json/2300/6": Object {
"postIndex": 6,
"serverUrl": "https://talk.observablehq.com",
"topicId": 2300,
"type": "POST",
},
}
`);
});
});
});