Add logic for plucking hyperlinks from cooked html (#1403)

This commit adds a `parseLinks` function to a new module,
`plugins/discourse/references`. `parseLinks` allows us to extract the
http(s) hyperlinks from the `href` of `<a>` tags in "cooked" html.

I added `htmlparser2` as a dependency to parse the html. There were a
lot of options to choose from; I chose htmlparser2 because it is widely
used, has reasonable performance, and suits our needs. We use this
dependency in a lightweight and local way, so we can always change it
later if needed.

One thing which was a bit odd: I wasn't able to import it using
`import`, and needed a `require` statement instead.

Test plan: Unit tests added; `yarn test` passes.

This is progress towards [Discourse reference and mention detection][1].

[1]: https://discourse.sourcecred.io/t/discourse-reference-mention-detection/270
This commit is contained in:
Dandelion Mané 2019-10-11 13:36:31 -06:00 committed by GitHub
parent f82c1bfbbe
commit 5e02a2caeb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 101 additions and 0 deletions

View File

@ -19,6 +19,7 @@
"express": "^4.16.3",
"fs-extra": "8.1.0",
"history": "^3.0.0",
"htmlparser2": "^4.0.0",
"isomorphic-fetch": "^2.2.1",
"json-stable-stringify": "^1.0.1",
"lodash.clonedeep": "^4.5.0",

View File

@ -23,6 +23,7 @@ test_expect_success "application components must use <Link> instead of <a>" '
":/src/*.js" \
":(exclude,top)*/__snapshots__/*" \
":(exclude,top)*/snapshots/*" \
":(exclude,top)src/plugins/discourse/references.test.js" \
":(exclude,top)src/webutil/Link.js" \
;
'

View File

@ -0,0 +1,25 @@
// @flow
const htmlparser2 = require("htmlparser2");
// A hyperlink as returned by `parseLinks`: the raw `href` value of an
// `<a>` tag, represented as a plain string.
export type Hyperlink = string;
export function parseLinks(cookedHtml: string): Hyperlink[] {
const links = [];
const httpRegex = /^https?:\/\//;
const parser = new htmlparser2.Parser({
onopentag(name, attribs) {
if (name === "a") {
const href = attribs.href;
if (href != null) {
if (href.match(httpRegex)) {
links.push(href);
}
}
}
},
});
parser.write(cookedHtml);
parser.end();
return links;
}

View File

@ -0,0 +1,30 @@
// @flow
import {parseLinks} from "./references";
describe("plugins/discourse/references", () => {
  describe("parseLinks", () => {
    // Inputs that contain no <a> tags at all must yield no links.
    it("does not error on empty string", () => {
      const links = parseLinks("");
      expect(links).toEqual([]);
    });
    it("does not error on non-html", () => {
      const links = parseLinks("foo bar");
      expect(links).toEqual([]);
    });
    it("does not pick up raw urls", () => {
      const links = parseLinks("https://www.google.com");
      expect(links).toEqual([]);
    });

    // Both http and https hrefs are reported verbatim.
    it("picks up a (https://) hyperlink in href", () => {
      const html = `<a href="https://www.google.com">A Link</a>`;
      const links = parseLinks(html);
      expect(links).toEqual(["https://www.google.com"]);
    });
    it("picks up a (http://) hyperlink in href", () => {
      const html = `<a href="http://www.google.com">A Link</a>`;
      const links = parseLinks(html);
      expect(links).toEqual(["http://www.google.com"]);
    });

    // An href that is only an in-page anchor is not an http(s) link.
    it("doesn't pick up anchor hrefs", () => {
      const html = `<a href="#foo">A Link</a>`;
      const links = parseLinks(html);
      expect(links).toEqual([]);
    });
  });
});

View File

@ -2953,6 +2953,14 @@ dom-serializer@0, dom-serializer@~0.1.0, dom-serializer@~0.1.1:
domelementtype "^1.3.0"
entities "^1.1.1"
dom-serializer@^0.2.1:
version "0.2.1"
resolved "https://registry.yarnpkg.com/dom-serializer/-/dom-serializer-0.2.1.tgz#13650c850daffea35d8b626a4cfc4d3a17643fdb"
integrity sha512-sK3ujri04WyjwQXVoK4PU3y8ula1stq10GJZpqHIUgoGZdsGzAGu65BnU3d08aTVSvO7mGPZUc0wTEDL+qGE0Q==
dependencies:
domelementtype "^2.0.1"
entities "^2.0.0"
domain-browser@^1.1.1:
version "1.2.0"
resolved "https://registry.yarnpkg.com/domain-browser/-/domain-browser-1.2.0.tgz#3d31f50191a6749dd1375a7f522e823d42e54eda"
@ -2963,6 +2971,11 @@ domelementtype@1, domelementtype@^1.3.0, domelementtype@^1.3.1:
resolved "https://registry.yarnpkg.com/domelementtype/-/domelementtype-1.3.1.tgz#d048c44b37b0d10a7f2a3d5fee3f4333d790481f"
integrity sha512-BSKB+TSpMpFI/HOxCNr1O8aMOTZ8hT3pM3GQ0w/mWRmkhEDSFJkkyzz4XQsBV44BChwGkrDfMyjVD0eA2aFV3w==
domelementtype@^2.0.1:
version "2.0.1"
resolved "https://registry.yarnpkg.com/domelementtype/-/domelementtype-2.0.1.tgz#1f8bdfe91f5a78063274e803b4bdcedf6e94f94d"
integrity sha512-5HOHUDsYZWV8FGWN0Njbr/Rn7f/eWSQi1v7+HsUVwXgn8nWWlL64zKDkS0n8ZmQ3mlWOMuXOnR+7Nx/5tMO5AQ==
domexception@^1.0.1:
version "1.0.1"
resolved "https://registry.yarnpkg.com/domexception/-/domexception-1.0.1.tgz#937442644ca6a31261ef36e3ec677fe805582c90"
@ -2977,6 +2990,13 @@ domhandler@^2.3.0, domhandler@^2.4.2:
dependencies:
domelementtype "1"
domhandler@^3.0.0:
version "3.0.0"
resolved "https://registry.yarnpkg.com/domhandler/-/domhandler-3.0.0.tgz#51cd13efca31da95bbb0c5bee3a48300e333b3e9"
integrity sha512-eKLdI5v9m67kbXQbJSNn1zjh0SDzvzWVWtX+qEI3eMjZw8daH9k8rlj1FZY9memPwjiskQFbe7vHVVJIAqoEhw==
dependencies:
domelementtype "^2.0.1"
domutils@1.5.1:
version "1.5.1"
resolved "https://registry.yarnpkg.com/domutils/-/domutils-1.5.1.tgz#dcd8488a26f563d61079e48c9f7b7e32373682cf"
@ -2993,6 +3013,15 @@ domutils@^1.5.1:
dom-serializer "0"
domelementtype "1"
domutils@^2.0.0:
version "2.0.0"
resolved "https://registry.yarnpkg.com/domutils/-/domutils-2.0.0.tgz#15b8278e37bfa8468d157478c58c367718133c08"
integrity sha512-n5SelJ1axbO636c2yUtOGia/IcJtVtlhQbFiVDBZHKV5ReJO1ViX7sFEemtuyoAnBxk5meNSYgA8V4s0271efg==
dependencies:
dom-serializer "^0.2.1"
domelementtype "^2.0.1"
domhandler "^3.0.0"
dotenv-expand@5.1.0:
version "5.1.0"
resolved "https://registry.yarnpkg.com/dotenv-expand/-/dotenv-expand-5.1.0.tgz#3fbaf020bfd794884072ea26b1e9791d45a629f0"
@ -3092,6 +3121,11 @@ entities@^1.1.1, "entities@~ 1.1.1", entities@~1.1.1:
resolved "https://registry.yarnpkg.com/entities/-/entities-1.1.2.tgz#bdfa735299664dfafd34529ed4f8522a275fea56"
integrity sha512-f2LZMYl1Fzu7YSBKg+RoROelpOaNrcGmE9AZubeDfrCEia483oW4MI4VyFd5VNHIgQ/7qm1I0wUHK1eJnn2y2w==
entities@^2.0.0:
version "2.0.0"
resolved "https://registry.yarnpkg.com/entities/-/entities-2.0.0.tgz#68d6084cab1b079767540d80e56a39b423e4abf4"
integrity sha512-D9f7V0JSRwIxlRI2mjMqufDrRDnx8p+eEOz7aUM9SuvF8gsBzra0/6tbjl1m8eQHrZlYj6PxqE00hZ1SAIKPLw==
enzyme-adapter-react-16@^1.1.1:
version "1.14.0"
resolved "https://registry.yarnpkg.com/enzyme-adapter-react-16/-/enzyme-adapter-react-16-1.14.0.tgz#204722b769172bcf096cb250d33e6795c1f1858f"
@ -4221,6 +4255,16 @@ htmlparser2@^3.10.0, htmlparser2@^3.9.1:
inherits "^2.0.1"
readable-stream "^3.1.1"
htmlparser2@^4.0.0:
version "4.0.0"
resolved "https://registry.yarnpkg.com/htmlparser2/-/htmlparser2-4.0.0.tgz#6034658db65b7713a572a9ebf79f650832dceec8"
integrity sha512-cChwXn5Vam57fyXajDtPXL1wTYc8JtLbr2TN76FYu05itVVVealxLowe2B3IEznJG4p9HAYn/0tJaRlGuEglFQ==
dependencies:
domelementtype "^2.0.1"
domhandler "^3.0.0"
domutils "^2.0.0"
entities "^2.0.0"
http-deceiver@^1.2.7:
version "1.2.7"
resolved "https://registry.yarnpkg.com/http-deceiver/-/http-deceiver-1.2.7.tgz#fa7168944ab9a519d337cb0bec7284dc3e723d87"