From 86c121e3c62edbef4339e16321e91f378f425f78 Mon Sep 17 00:00:00 2001 From: William Chargin Date: Mon, 2 Mar 2020 21:25:07 -0800 Subject: [PATCH] github: use plaintext owner and name in cache key (#1685) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: This is mostly a QoL improvement for maintainers. GraphQL mirrors are stored in the SourceCred cache directory, but until now have been hard to tell apart. All IDs looked like long, similar hex strings; e.g.: ``` github_736f75726365637265642d746573742f6578616d706c652d676974687562.db github_736f75726365637265642f646f6373.db github_736f75726365637265642f736f7572636563726564.db ``` It’s too hard to tell that these are `sourcecred-test/example-github`, `sourcecred/docs`, and `sourcecred/sourcecred` in that order. (Yes, you can work it out if you try; that’s not good enough.) With dozens of caches loaded, finding the right one to poke at for debugging or progress-checking takes way too much scripting. The purpose of this abstruse encoding was portable filename safety, but we can actually achieve this with human-readable names like these: ``` github_sourcecred-test_example_github.db github_sourcecred_docs.db github_sourcecred_sourcecred.db ``` This suffices because (a) login names cannot contain underscores, so we can safely use that as a separator, and (b) GitHub disallows collisions on names that are equal ignoring case, so we can convert all names to lowercase without introducing collisions. This change orphans existing caches. Running `sourcecred clear --cache` will clean those up. Test Plan: Try to create a GitHub repository with the same name as a repository that you already have, but with inverted case; note that this is disallowed. Then, note that loading `sourcecred-test/example-github` still works and produces a database with nicely readable name. wchargin-branch: github-readable-db-names --- src/plugins/github/cacheId.js | 27 ++++++++++-- src/plugins/github/cacheId.test.js | 71 +++++++++++++++++------------- 2 files changed, 63 insertions(+), 35 deletions(-) diff --git a/src/plugins/github/cacheId.js b/src/plugins/github/cacheId.js index 164af8d..5724a7c 100644 --- a/src/plugins/github/cacheId.js +++ b/src/plugins/github/cacheId.js @@ -9,14 +9,33 @@ export opaque type CacheId: string = string; /** * Derives the CacheId for a RepoId. - * Returned CacheId's will be: + * + * Returned `CacheId`s will be: * - Deterministic * - Unique for this plugin * - Lowercase * - Safe to use for filenames + * - Distinct for semantically distinct inputs (input IDs that differ + * only in case may map to the same output, because GitHub does not + * permit collisions-modulo-case) */ export function cacheIdForRepoId(repoId: RepoId): CacheId { - const repoString = repoIdToString(repoId); - const repoStringHex = Buffer.from(repoString).toString("hex"); - return `github_${repoStringHex}`.toLowerCase(); + // GitHub owner (user/organization) and repository names may be in + // mixed case, but GitHub prevents mixed-case collisions: e.g., if + // `foo` is a user then `Foo` cannot also be a user, and likewise for + // repositories. GitHub login names are DNS-safe (`[0-9A-Za-z-]` only) + // and so are filename-safe. Repository names have a slightly larger + // character set, including underscore, but are also filename-safe. + // Because login may not contain an underscore, it thus suffices to + // use an underscore as the delimiter. The resulting filename uses + // only `[0-9A-Za-z_.-]` and so is valid on all major filesystems. + const owner = repoId.owner.toLowerCase(); + const name = repoId.name.toLowerCase(); + if (owner.includes("_")) { + throw new Error( + "unexpected underscore in GitHub owner name would be ambiguous: " + + repoIdToString(repoId) + ); + } + return `github_${owner}_${name}`; } diff --git a/src/plugins/github/cacheId.test.js b/src/plugins/github/cacheId.test.js index 74a94bb..290a39b 100644 --- a/src/plugins/github/cacheId.test.js +++ b/src/plugins/github/cacheId.test.js @@ -1,6 +1,6 @@ // @flow -import {makeRepoId, repoIdToString} from "./repoId"; +import {makeRepoId, type RepoId} from "./repoId"; import {type CacheId, cacheIdForRepoId} from "./cacheId"; describe("plugins/github/cacheId", () => { @@ -24,19 +24,6 @@ describe("plugins/github/cacheId", () => { const id = cacheIdForRepoId(sampleRepo); expect(id).toMatch(/^github_/); }); - it("should contain a hex encoding of 'repoIdToString'", () => { - const id = cacheIdForRepoId(sampleRepo); - const repoIdString = repoIdToString(sampleRepo); - const expectedHex = Buffer.from(repoIdString).toString("hex"); - expect(id).toContain(expectedHex); - }); - it("should treat repoId as case sensitive", () => { - const lowercaseId = cacheIdForRepoId(makeRepoId("foo", "bar")); - const upperOwnerId = cacheIdForRepoId(makeRepoId("FOO", "bar")); - const upperNameId = cacheIdForRepoId(makeRepoId("foo", "BAR")); - expect(upperOwnerId).not.toEqual(lowercaseId); - expect(upperNameId).not.toEqual(lowercaseId); - }); it("should have lowercase output", () => { const lowercaseId = cacheIdForRepoId(makeRepoId("foo", "bar")); const upperOwnerId = cacheIdForRepoId(makeRepoId("FOO", "bar")); @@ -46,24 +33,46 @@ describe("plugins/github/cacheId", () => { expect(upperNameId).not.toMatch(/[A-Z]/); }); it("should deterministically match examples", () => { - const ids = [ - cacheIdForRepoId(makeRepoId("sourcecred", "sourcecred")), - cacheIdForRepoId(makeRepoId("sourcecred", "sourcecred.github.io")), - cacheIdForRepoId(makeRepoId("foo", "Something-Good")), - cacheIdForRepoId(makeRepoId("foo", "still_good")), - cacheIdForRepoId(makeRepoId("fooolio", "foo-bar.bar-99_x")), - cacheIdForRepoId(makeRepoId("FOOolio", "foo-bar.bar-99_x")), + const expected = [ + { + repoId: makeRepoId("sourcecred", "sourcecred"), + cacheId: "github_sourcecred_sourcecred", + }, + { + repoId: makeRepoId("sourcecred", "sourcecred.github.io"), + cacheId: "github_sourcecred_sourcecred.github.io", + }, + { + repoId: makeRepoId("foo", "Something-Good"), + cacheId: "github_foo_something-good", + }, + { + repoId: makeRepoId("foo", "still_good"), + cacheId: "github_foo_still_good", + }, + { + repoId: makeRepoId("fooolio", "foo-bar.bar-99_x"), + cacheId: "github_fooolio_foo-bar.bar-99_x", + }, + { + repoId: makeRepoId("FOOolio", "foo-bar.bar-99_x"), + cacheId: "github_fooolio_foo-bar.bar-99_x", + }, ]; - expect(ids).toMatchInlineSnapshot(` - Array [ - "github_736f75726365637265642f736f7572636563726564", - "github_736f75726365637265642f736f75726365637265642e6769746875622e696f", - "github_666f6f2f536f6d657468696e672d476f6f64", - "github_666f6f2f7374696c6c5f676f6f64", - "github_666f6f6f6c696f2f666f6f2d6261722e6261722d39395f78", - "github_464f4f6f6c696f2f666f6f2d6261722e6261722d39395f78", - ] - `); + const output = expected.map(({repoId}) => ({ + repoId, + cacheId: cacheIdForRepoId(repoId), + })); + expect(output).toEqual(expected); + }); + it("should error if the owner name contains an undescore", () => { + // GitHub doesn't allow such names because they're not valid DNS + // labels. The `RepoId` type thus prevents constructing such + // invalid instances; we fabricate one as a defensive check. + const badRepoId: RepoId = ({owner: "h_mmm", name: "ok"}: any); + expect(() => { + cacheIdForRepoId(badRepoId); + }).toThrow(/ambiguous/); }); }); });