github: use plaintext owner and name in cache key (#1685)

Summary:
This is mostly a QoL improvement for maintainers. GraphQL mirrors are
stored in the SourceCred cache directory, but until now have been hard
to tell apart. All IDs looked like long, similar hex strings; e.g.:

```
github_736f75726365637265642d746573742f6578616d706c652d676974687562.db
github_736f75726365637265642f646f6373.db
github_736f75726365637265642f736f7572636563726564.db
```

It’s too hard to tell that these are `sourcecred-test/example-github`,
`sourcecred/docs`, and `sourcecred/sourcecred` in that order. (Yes, you
can work it out if you try; that’s not good enough.) With dozens of
caches loaded, finding the right one to poke at for debugging or
progress-checking takes way too much scripting.

The purpose of this abstruse encoding was portable filename safety, but
we can actually achieve this with human-readable names like these:

```
github_sourcecred-test_example_github.db
github_sourcecred_docs.db
github_sourcecred_sourcecred.db
```

This suffices because (a) login names cannot contain underscores, so we
can safely use that as a separator, and (b) GitHub disallows collisions
on names that are equal ignoring case, so we can convert all names to
lowercase without introducing collisions.

This change orphans existing caches. Running `sourcecred clear --cache`
will clean those up.

Test Plan:
Try to create a GitHub repository with the same name as a repository
that you already have, but with inverted case; note that this is
disallowed. Then, note that loading `sourcecred-test/example-github`
still works and produces a database with nicely readable name.

wchargin-branch: github-readable-db-names
This commit is contained in:
William Chargin 2020-03-02 21:25:07 -08:00 committed by GitHub
parent 09739512ab
commit 86c121e3c6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 63 additions and 35 deletions

View File

@ -9,14 +9,33 @@ export opaque type CacheId: string = string;
/**
* Derives the CacheId for a RepoId.
* Returned CacheId's will be:
*
* Returned `CacheId`s will be:
* - Deterministic
* - Unique for this plugin
* - Lowercase
* - Safe to use for filenames
* - Distinct for semantically distinct inputs (input IDs that differ
* only in case may map to the same output, because GitHub does not
* permit collisions-modulo-case)
*/
export function cacheIdForRepoId(repoId: RepoId): CacheId {
const repoString = repoIdToString(repoId);
const repoStringHex = Buffer.from(repoString).toString("hex");
return `github_${repoStringHex}`.toLowerCase();
// GitHub owner (user/organization) and repository names may be in
// mixed case, but GitHub prevents mixed-case collisions: e.g., if
// `foo` is a user then `Foo` cannot also be a user, and likewise for
// repositories. GitHub login names are DNS-safe (`[0-9A-Za-z-]` only)
// and so are filename-safe. Repository names have a slightly larger
// character set, including underscore, but are also filename-safe.
// Because login may not contain an underscore, it thus suffices to
// use an underscore as the delimiter. The resulting filename uses
// only `[0-9A-Za-z_.-]` and so is valid on all major filesystems.
const owner = repoId.owner.toLowerCase();
const name = repoId.name.toLowerCase();
if (owner.includes("_")) {
throw new Error(
"unexpected underscore in GitHub owner name would be ambiguous: " +
repoIdToString(repoId)
);
}
return `github_${owner}_${name}`;
}

View File

@ -1,6 +1,6 @@
// @flow
import {makeRepoId, repoIdToString} from "./repoId";
import {makeRepoId, type RepoId} from "./repoId";
import {type CacheId, cacheIdForRepoId} from "./cacheId";
describe("plugins/github/cacheId", () => {
@ -24,19 +24,6 @@ describe("plugins/github/cacheId", () => {
const id = cacheIdForRepoId(sampleRepo);
expect(id).toMatch(/^github_/);
});
it("should contain a hex encoding of 'repoIdToString'", () => {
const id = cacheIdForRepoId(sampleRepo);
const repoIdString = repoIdToString(sampleRepo);
const expectedHex = Buffer.from(repoIdString).toString("hex");
expect(id).toContain(expectedHex);
});
it("should treat repoId as case sensitive", () => {
const lowercaseId = cacheIdForRepoId(makeRepoId("foo", "bar"));
const upperOwnerId = cacheIdForRepoId(makeRepoId("FOO", "bar"));
const upperNameId = cacheIdForRepoId(makeRepoId("foo", "BAR"));
expect(upperOwnerId).not.toEqual(lowercaseId);
expect(upperNameId).not.toEqual(lowercaseId);
});
it("should have lowercase output", () => {
const lowercaseId = cacheIdForRepoId(makeRepoId("foo", "bar"));
const upperOwnerId = cacheIdForRepoId(makeRepoId("FOO", "bar"));
@ -46,24 +33,46 @@ describe("plugins/github/cacheId", () => {
expect(upperNameId).not.toMatch(/[A-Z]/);
});
it("should deterministically match examples", () => {
const ids = [
cacheIdForRepoId(makeRepoId("sourcecred", "sourcecred")),
cacheIdForRepoId(makeRepoId("sourcecred", "sourcecred.github.io")),
cacheIdForRepoId(makeRepoId("foo", "Something-Good")),
cacheIdForRepoId(makeRepoId("foo", "still_good")),
cacheIdForRepoId(makeRepoId("fooolio", "foo-bar.bar-99_x")),
cacheIdForRepoId(makeRepoId("FOOolio", "foo-bar.bar-99_x")),
const expected = [
{
repoId: makeRepoId("sourcecred", "sourcecred"),
cacheId: "github_sourcecred_sourcecred",
},
{
repoId: makeRepoId("sourcecred", "sourcecred.github.io"),
cacheId: "github_sourcecred_sourcecred.github.io",
},
{
repoId: makeRepoId("foo", "Something-Good"),
cacheId: "github_foo_something-good",
},
{
repoId: makeRepoId("foo", "still_good"),
cacheId: "github_foo_still_good",
},
{
repoId: makeRepoId("fooolio", "foo-bar.bar-99_x"),
cacheId: "github_fooolio_foo-bar.bar-99_x",
},
{
repoId: makeRepoId("FOOolio", "foo-bar.bar-99_x"),
cacheId: "github_fooolio_foo-bar.bar-99_x",
},
];
expect(ids).toMatchInlineSnapshot(`
Array [
"github_736f75726365637265642f736f7572636563726564",
"github_736f75726365637265642f736f75726365637265642e6769746875622e696f",
"github_666f6f2f536f6d657468696e672d476f6f64",
"github_666f6f2f7374696c6c5f676f6f64",
"github_666f6f6f6c696f2f666f6f2d6261722e6261722d39395f78",
"github_464f4f6f6c696f2f666f6f2d6261722e6261722d39395f78",
]
`);
const output = expected.map(({repoId}) => ({
repoId,
cacheId: cacheIdForRepoId(repoId),
}));
expect(output).toEqual(expected);
});
it("should error if the owner name contains an undescore", () => {
// GitHub doesn't allow such names because they're not valid DNS
// labels. The `RepoId` type thus prevents constructing such
// invalid instances; we fabricate one as a defensive check.
const badRepoId: RepoId = ({owner: "h_mmm", name: "ok"}: any);
expect(() => {
cacheIdForRepoId(badRepoId);
}).toThrow(/ambiguous/);
});
});
});