Add logic for plucking hyperlinks from cooked html (#1403)
This commit adds a `parseLinks` function to a new module, `plugins/discourse/references`. `parseLinks` lets us extract the hyperlinks from `<a>` tags in "cooked" HTML. I added `htmlparser2` as a dependency to parse the HTML. There were a lot of options to choose from; I chose htmlparser2 because it is widely used, has reasonable performance, and suits our needs. We use this dependency in a lightweight and local way, so we can always change it later if needed. One thing that was a bit odd: I wasn't able to import it using `import`, and needed a `require` statement instead. Test plan: Unit tests added; `yarn test` passes. This is progress towards [Discourse reference and mention detection][1]. [1]: https://discourse.sourcecred.io/t/discourse-reference-mention-detection/270
This commit is contained in:
parent
f82c1bfbbe
commit
5e02a2caeb
|
@ -19,6 +19,7 @@
|
|||
"express": "^4.16.3",
|
||||
"fs-extra": "8.1.0",
|
||||
"history": "^3.0.0",
|
||||
"htmlparser2": "^4.0.0",
|
||||
"isomorphic-fetch": "^2.2.1",
|
||||
"json-stable-stringify": "^1.0.1",
|
||||
"lodash.clonedeep": "^4.5.0",
|
||||
|
|
|
@ -23,6 +23,7 @@ test_expect_success "application components must use <Link> instead of <a>" '
|
|||
":/src/*.js" \
|
||||
":(exclude,top)*/__snapshots__/*" \
|
||||
":(exclude,top)*/snapshots/*" \
|
||||
":(exclude,top)src/plugins/discourse/references.test.js" \
|
||||
":(exclude,top)src/webutil/Link.js" \
|
||||
;
|
||||
'
|
||||
|
|
|
@ -0,0 +1,25 @@
|
|||
// @flow
|
||||
|
||||
const htmlparser2 = require("htmlparser2");
|
||||
|
||||
export type Hyperlink = string;
|
||||
|
||||
export function parseLinks(cookedHtml: string): Hyperlink[] {
|
||||
const links = [];
|
||||
const httpRegex = /^https?:\/\//;
|
||||
const parser = new htmlparser2.Parser({
|
||||
onopentag(name, attribs) {
|
||||
if (name === "a") {
|
||||
const href = attribs.href;
|
||||
if (href != null) {
|
||||
if (href.match(httpRegex)) {
|
||||
links.push(href);
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
});
|
||||
parser.write(cookedHtml);
|
||||
parser.end();
|
||||
return links;
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
// @flow
|
||||
|
||||
import {parseLinks} from "./references";
|
||||
|
||||
describe("plugins/discourse/references", () => {
|
||||
describe("parseLinks", () => {
|
||||
it("does not error on empty string", () => {
|
||||
expect(parseLinks("")).toEqual([]);
|
||||
});
|
||||
it("does not error on non-html", () => {
|
||||
expect(parseLinks("foo bar")).toEqual([]);
|
||||
});
|
||||
it("does not pick up raw urls", () => {
|
||||
expect(parseLinks("https://www.google.com")).toEqual([]);
|
||||
});
|
||||
it("picks up a (https://) hyperlink in href", () => {
|
||||
expect(parseLinks(`<a href="https://www.google.com">A Link</a>`)).toEqual(
|
||||
["https://www.google.com"]
|
||||
);
|
||||
});
|
||||
it("picks up a (http://) hyperlink in href", () => {
|
||||
expect(parseLinks(`<a href="http://www.google.com">A Link</a>`)).toEqual([
|
||||
"http://www.google.com",
|
||||
]);
|
||||
});
|
||||
it("doesn't pick up anchor hrefs", () => {
|
||||
expect(parseLinks(`<a href="#foo">A Link</a>`)).toEqual([]);
|
||||
});
|
||||
});
|
||||
});
|
44
yarn.lock
44
yarn.lock
|
@ -2953,6 +2953,14 @@ dom-serializer@0, dom-serializer@~0.1.0, dom-serializer@~0.1.1:
|
|||
domelementtype "^1.3.0"
|
||||
entities "^1.1.1"
|
||||
|
||||
dom-serializer@^0.2.1:
|
||||
version "0.2.1"
|
||||
resolved "https://registry.yarnpkg.com/dom-serializer/-/dom-serializer-0.2.1.tgz#13650c850daffea35d8b626a4cfc4d3a17643fdb"
|
||||
integrity sha512-sK3ujri04WyjwQXVoK4PU3y8ula1stq10GJZpqHIUgoGZdsGzAGu65BnU3d08aTVSvO7mGPZUc0wTEDL+qGE0Q==
|
||||
dependencies:
|
||||
domelementtype "^2.0.1"
|
||||
entities "^2.0.0"
|
||||
|
||||
domain-browser@^1.1.1:
|
||||
version "1.2.0"
|
||||
resolved "https://registry.yarnpkg.com/domain-browser/-/domain-browser-1.2.0.tgz#3d31f50191a6749dd1375a7f522e823d42e54eda"
|
||||
|
@ -2963,6 +2971,11 @@ domelementtype@1, domelementtype@^1.3.0, domelementtype@^1.3.1:
|
|||
resolved "https://registry.yarnpkg.com/domelementtype/-/domelementtype-1.3.1.tgz#d048c44b37b0d10a7f2a3d5fee3f4333d790481f"
|
||||
integrity sha512-BSKB+TSpMpFI/HOxCNr1O8aMOTZ8hT3pM3GQ0w/mWRmkhEDSFJkkyzz4XQsBV44BChwGkrDfMyjVD0eA2aFV3w==
|
||||
|
||||
domelementtype@^2.0.1:
|
||||
version "2.0.1"
|
||||
resolved "https://registry.yarnpkg.com/domelementtype/-/domelementtype-2.0.1.tgz#1f8bdfe91f5a78063274e803b4bdcedf6e94f94d"
|
||||
integrity sha512-5HOHUDsYZWV8FGWN0Njbr/Rn7f/eWSQi1v7+HsUVwXgn8nWWlL64zKDkS0n8ZmQ3mlWOMuXOnR+7Nx/5tMO5AQ==
|
||||
|
||||
domexception@^1.0.1:
|
||||
version "1.0.1"
|
||||
resolved "https://registry.yarnpkg.com/domexception/-/domexception-1.0.1.tgz#937442644ca6a31261ef36e3ec677fe805582c90"
|
||||
|
@ -2977,6 +2990,13 @@ domhandler@^2.3.0, domhandler@^2.4.2:
|
|||
dependencies:
|
||||
domelementtype "1"
|
||||
|
||||
domhandler@^3.0.0:
|
||||
version "3.0.0"
|
||||
resolved "https://registry.yarnpkg.com/domhandler/-/domhandler-3.0.0.tgz#51cd13efca31da95bbb0c5bee3a48300e333b3e9"
|
||||
integrity sha512-eKLdI5v9m67kbXQbJSNn1zjh0SDzvzWVWtX+qEI3eMjZw8daH9k8rlj1FZY9memPwjiskQFbe7vHVVJIAqoEhw==
|
||||
dependencies:
|
||||
domelementtype "^2.0.1"
|
||||
|
||||
domutils@1.5.1:
|
||||
version "1.5.1"
|
||||
resolved "https://registry.yarnpkg.com/domutils/-/domutils-1.5.1.tgz#dcd8488a26f563d61079e48c9f7b7e32373682cf"
|
||||
|
@ -2993,6 +3013,15 @@ domutils@^1.5.1:
|
|||
dom-serializer "0"
|
||||
domelementtype "1"
|
||||
|
||||
domutils@^2.0.0:
|
||||
version "2.0.0"
|
||||
resolved "https://registry.yarnpkg.com/domutils/-/domutils-2.0.0.tgz#15b8278e37bfa8468d157478c58c367718133c08"
|
||||
integrity sha512-n5SelJ1axbO636c2yUtOGia/IcJtVtlhQbFiVDBZHKV5ReJO1ViX7sFEemtuyoAnBxk5meNSYgA8V4s0271efg==
|
||||
dependencies:
|
||||
dom-serializer "^0.2.1"
|
||||
domelementtype "^2.0.1"
|
||||
domhandler "^3.0.0"
|
||||
|
||||
dotenv-expand@5.1.0:
|
||||
version "5.1.0"
|
||||
resolved "https://registry.yarnpkg.com/dotenv-expand/-/dotenv-expand-5.1.0.tgz#3fbaf020bfd794884072ea26b1e9791d45a629f0"
|
||||
|
@ -3092,6 +3121,11 @@ entities@^1.1.1, "entities@~ 1.1.1", entities@~1.1.1:
|
|||
resolved "https://registry.yarnpkg.com/entities/-/entities-1.1.2.tgz#bdfa735299664dfafd34529ed4f8522a275fea56"
|
||||
integrity sha512-f2LZMYl1Fzu7YSBKg+RoROelpOaNrcGmE9AZubeDfrCEia483oW4MI4VyFd5VNHIgQ/7qm1I0wUHK1eJnn2y2w==
|
||||
|
||||
entities@^2.0.0:
|
||||
version "2.0.0"
|
||||
resolved "https://registry.yarnpkg.com/entities/-/entities-2.0.0.tgz#68d6084cab1b079767540d80e56a39b423e4abf4"
|
||||
integrity sha512-D9f7V0JSRwIxlRI2mjMqufDrRDnx8p+eEOz7aUM9SuvF8gsBzra0/6tbjl1m8eQHrZlYj6PxqE00hZ1SAIKPLw==
|
||||
|
||||
enzyme-adapter-react-16@^1.1.1:
|
||||
version "1.14.0"
|
||||
resolved "https://registry.yarnpkg.com/enzyme-adapter-react-16/-/enzyme-adapter-react-16-1.14.0.tgz#204722b769172bcf096cb250d33e6795c1f1858f"
|
||||
|
@ -4221,6 +4255,16 @@ htmlparser2@^3.10.0, htmlparser2@^3.9.1:
|
|||
inherits "^2.0.1"
|
||||
readable-stream "^3.1.1"
|
||||
|
||||
htmlparser2@^4.0.0:
|
||||
version "4.0.0"
|
||||
resolved "https://registry.yarnpkg.com/htmlparser2/-/htmlparser2-4.0.0.tgz#6034658db65b7713a572a9ebf79f650832dceec8"
|
||||
integrity sha512-cChwXn5Vam57fyXajDtPXL1wTYc8JtLbr2TN76FYu05itVVVealxLowe2B3IEznJG4p9HAYn/0tJaRlGuEglFQ==
|
||||
dependencies:
|
||||
domelementtype "^2.0.1"
|
||||
domhandler "^3.0.0"
|
||||
domutils "^2.0.0"
|
||||
entities "^2.0.0"
|
||||
|
||||
http-deceiver@^1.2.7:
|
||||
version "1.2.7"
|
||||
resolved "https://registry.yarnpkg.com/http-deceiver/-/http-deceiver-1.2.7.tgz#fa7168944ab9a519d337cb0bec7284dc3e723d87"
|
||||
|
|
Loading…
Reference in New Issue