fetchGithubRepo: use Mirror pipeline (#937)

Summary:
As of this commit, `node ./bin/sourcecred.js load` uses the Mirror code,
and the legacy continuation-fetching code is not included in the
`sourcecred.js` bundle.

We do not yet perform the commit prefetching described in #923. The code
should be plenty fast for repositories that merge pull requests at least
occasionally.

Test Plan:
Running `yarn test --full` passes. Loading `sourcecred/sourcecred` works
and generates a reasonable credit attribution. Loading it again
completes immediately.

wchargin-branch: fetchGithubRepo-mirror
This commit is contained in:
William Chargin 2018-10-28 12:03:06 -07:00 committed by GitHub
parent e2c99c418b
commit 08219f98bf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 1102 additions and 2549 deletions

View File

@ -29,9 +29,6 @@ module.exports = {
backendEntryPoints: { backendEntryPoints: {
sourcecred: resolveApp("src/cli/main.js"), sourcecred: resolveApp("src/cli/main.js"),
// //
testContinuations: resolveApp(
"src/plugins/github/bin/testContinuations.js"
),
generateGithubGraphqlFlowTypes: resolveApp( generateGithubGraphqlFlowTypes: resolveApp(
"src/plugins/github/bin/generateGraphqlFlowTypes.js" "src/plugins/github/bin/generateGraphqlFlowTypes.js"
), ),

View File

@ -12,8 +12,10 @@
* from https://github.com/settings/tokens/new. * from https://github.com/settings/tokens/new.
*/ */
import fetchGithubRepo from "../fetchGithubRepo";
import stringify from "json-stable-stringify"; import stringify from "json-stable-stringify";
import tmp from "tmp";
import fetchGithubRepo from "../fetchGithubRepo";
import {makeRepoId} from "../../../core/repoId"; import {makeRepoId} from "../../../core/repoId";
function parseArgs() { function parseArgs() {
@ -36,7 +38,8 @@ function parseArgs() {
function main() { function main() {
const args = parseArgs(); const args = parseArgs();
const repoId = makeRepoId(args.owner, args.name); const repoId = makeRepoId(args.owner, args.name);
fetchGithubRepo(repoId, args.githubToken) const options = {token: args.githubToken, cacheDirectory: tmp.dirSync().name};
fetchGithubRepo(repoId, options)
.then((data) => { .then((data) => {
console.log(stringify(data, {space: 4})); console.log(stringify(data, {space: 4}));
}) })

View File

@ -1,106 +0,0 @@
// @flow
// Ad hoc testing script for RelationalView input format consistency.
import Database from "better-sqlite3";
import fs from "fs-extra";
import stringify from "json-stable-stringify";
import deepEqual from "lodash.isequal";
import {makeRepoId} from "../../../core/repoId";
import {Mirror} from "../../../graphql/mirror";
import fetchGithubRepo, {postQuery} from "../fetchGithubRepo";
import type {Repository} from "../graphqlTypes";
import {RelationalView, type RelationalViewJSON} from "../relationalView";
import githubSchema from "../schema";
async function test(options: {|
+token: string,
+owner: string,
+name: string,
+graphqlId: string,
+outputFilepaths: {|
+continuations: string,
+mirror: string,
|},
|}) {
async function fetchViaContinuations(): Promise<RelationalViewJSON> {
const raw = await fetchGithubRepo(
makeRepoId(options.owner, options.name),
options.token
);
const rv = new RelationalView();
rv.addData(raw);
return rv.toJSON();
}
async function fetchViaMirror(): Promise<RelationalViewJSON> {
const mirror = new Mirror(new Database(":memory:"), githubSchema());
mirror.registerObject({typename: "Repository", id: options.graphqlId});
await mirror.update((payload) => postQuery(payload, options.token), {
nodesLimit: 100,
nodesOfTypeLimit: 100,
connectionPageSize: 100,
connectionLimit: 100,
since: new Date(0),
now: () => new Date(),
});
const repository = ((mirror.extract(options.graphqlId): any): Repository);
const rv = new RelationalView();
rv.addRepository(repository);
return rv.toJSON();
}
function saveTo(filename: string, repo: RelationalViewJSON): Promise<void> {
return fs.writeFile(filename, stringify(repo));
}
const [viaContinuations, viaMirror] = await Promise.all([
fetchViaContinuations(),
fetchViaMirror(),
]);
if (deepEqual(viaContinuations, viaMirror)) {
console.log("Identical. Saving to disk...");
} else {
console.log("Different. Saving to disk...");
}
await Promise.all([
saveTo(options.outputFilepaths.continuations, viaContinuations),
saveTo(options.outputFilepaths.mirror, viaMirror),
]);
}
/**
 * Command-line entry point.
 *
 * Expects exactly five positional arguments (repository owner,
 * repository name, GraphQL ID, and the two output filenames) and a
 * GitHub token in the `SOURCECRED_GITHUB_TOKEN` environment variable.
 * If either requirement is unmet, prints a usage line to stderr, sets
 * the process exit code to 1, and returns without fetching anything.
 * Otherwise runs `test` with the parsed options; the returned promise
 * rejects if `test` rejects.
 */
async function main() {
  const args = process.argv.slice(2);
  const token = process.env.SOURCECRED_GITHUB_TOKEN;
  if (args.length !== 5 || token == null) {
    const invocation = [
      "SOURCECRED_GITHUB_TOKEN=<token>",
      "node",
      "test.js",
      "REPO_OWNER",
      "REPO_NAME",
      "GRAPHQL_ID",
      "CONTINUATIONS_OUTPUT_FILENAME",
      "MIRROR_OUTPUT_FILENAME",
    ];
    console.error("usage: " + invocation.join(" "));
    process.exitCode = 1;
    return;
  }
  const [owner, name, graphqlId, continuations, mirror] = args;
  const options = {
    token,
    owner,
    name,
    graphqlId,
    outputFilepaths: {
      continuations,
      mirror,
    },
  };
  await test(options);
}

// Do not drop the promise: a failed fetch or file write would otherwise
// surface only as an unhandled rejection, which may leave the exit status
// at 0. Log the error and report failure to the calling shell instead.
main().catch((e) => {
  console.error(e);
  process.exitCode = 1;
});

File diff suppressed because it is too large Load Diff

View File

@ -1,18 +1,18 @@
// @flow // @flow
import {RelationalView} from "../relationalView"; import {RelationalView} from "../relationalView";
import type {GithubResponseJSON} from "../graphql"; import type {Repository} from "../graphqlTypes";
import {Graph} from "../../../core/graph"; import {Graph} from "../../../core/graph";
import cloneDeep from "lodash.clonedeep"; import cloneDeep from "lodash.clonedeep";
import {createGraph} from "../createGraph"; import {createGraph} from "../createGraph";
export function exampleData(): GithubResponseJSON { export function exampleRepository(): Repository {
return cloneDeep(require("./example-github")); return cloneDeep(require("./example-github"));
} }
export function exampleRelationalView(): RelationalView { export function exampleRelationalView(): RelationalView {
const rv = new RelationalView(); const rv = new RelationalView();
rv.addData(exampleData()); rv.addRepository(exampleRepository());
return rv; return rv;
} }

View File

@ -4,13 +4,18 @@
* docstring of the default export for more details. * docstring of the default export for more details.
*/ */
import Database from "better-sqlite3";
import fetch from "isomorphic-fetch"; import fetch from "isomorphic-fetch";
import path from "path";
import retry from "retry"; import retry from "retry";
import {type RepoId, repoIdToString} from "../../core/repoId";
import {Mirror} from "../../graphql/mirror";
import * as Queries from "../../graphql/queries";
import {stringify, inlineLayout, type Body} from "../../graphql/queries"; import {stringify, inlineLayout, type Body} from "../../graphql/queries";
import {createQuery, createVariables, postQueryExhaustive} from "./graphql"; import * as Schema from "../../graphql/schema";
import type {GithubResponseJSON} from "./graphql"; import schema from "./schema";
import type {RepoId} from "../../core/repoId"; import type {Repository} from "./graphqlTypes";
/** /**
* Scrape data from a GitHub repo using the GitHub API. * Scrape data from a GitHub repo using the GitHub API.
@ -25,27 +30,49 @@ import type {RepoId} from "../../core/repoId";
* scraped from the repository, with data format to be specified * scraped from the repository, with data format to be specified
* later * later
*/ */
export default function fetchGithubRepo( export default async function fetchGithubRepo(
repoId: RepoId, repoId: RepoId,
token: string options: {|+token: string, +cacheDirectory: string|}
): Promise<GithubResponseJSON> { ): Promise<Repository> {
token = String(token); const {token, cacheDirectory} = options;
const validToken = /^[A-Fa-f0-9]{40}$/; const validToken = /^[A-Fa-f0-9]{40}$/;
if (!validToken.test(token)) { if (!validToken.test(token)) {
throw new Error(`Invalid token: ${token}`); throw new Error(`Invalid token: ${token}`);
} }
const postQueryWithToken = (payload) => postQuery(payload, token);
const body = createQuery(); const resolvedId: Schema.ObjectId = await resolveRepositoryGraphqlId(
const variables = createVariables(repoId); postQueryWithToken,
const payload = {body, variables}; repoId
return postQueryExhaustive( );
(somePayload) => postQuery(somePayload, token),
payload // Key the cache file against the GraphQL ID, but make sure that the
).then((x: GithubResponseJSON) => { // name is valid and uniquely identifying even on case-insensitive
ensureNoMorePages(x); // filesystems (HFS, HFS+, APFS, NTFS) or filesystems preventing
return x; // equals signs in file names.
const dbFilename = `mirror_${Buffer.from(resolvedId).toString("hex")}.db`;
const db = new Database(path.join(cacheDirectory, dbFilename));
const mirror = new Mirror(db, schema());
mirror.registerObject({typename: "Repository", id: resolvedId});
// These are arbitrary tuning parameters.
// TODO(#638): Design a configuration system for plugins.
const ttlSeconds = 86400;
const nodesLimit = 100;
const connectionLimit = 100;
await mirror.update(postQueryWithToken, {
since: new Date(Date.now() - ttlSeconds * 1000),
now: () => new Date(),
// These properties are arbitrary tuning parameters.
nodesLimit,
connectionLimit,
// These values are the maxima allowed by GitHub.
nodesOfTypeLimit: 100,
connectionPageSize: 100,
}); });
return ((mirror.extract(resolvedId): any): Repository);
} }
const GITHUB_GRAPHQL_SERVER = "https://api.github.com/graphql"; const GITHUB_GRAPHQL_SERVER = "https://api.github.com/graphql";
@ -185,23 +212,35 @@ export async function postQuery(
); );
} }
function ensureNoMorePages(result: any, path = []) { async function resolveRepositoryGraphqlId(
if (result == null) { postQuery: ({+body: Body, +variables: mixed}) => Promise<any>,
return; repoId: RepoId
} ): Promise<Schema.ObjectId> {
if (result.pageInfo) { const b = Queries.build;
if (result.pageInfo.hasNextPage) { const payload = {
console.error(result); body: [
throw new Error(`More pages at: ${path.join()}`); b.query(
} "ResolveRepositoryId",
} [b.param("owner", "String!"), b.param("name", "String!")],
if (Array.isArray(result)) { [
result.forEach((item, i) => { b.field(
ensureNoMorePages(item, [...path, i]); "repository",
}); {owner: b.variable("owner"), name: b.variable("name")},
} else if (typeof result === "object") { [b.field("id")]
Object.keys(result).forEach((k) => { ),
ensureNoMorePages(result[k], [...path, k]); ]
}); ),
],
variables: {owner: repoId.owner, name: repoId.name},
};
const data: {|+repository: null | {|+id: string|}|} = await postQuery(
payload
);
if (data.repository == null) {
throw new Error(
`No such repository: ${repoIdToString(repoId)} ` +
`(response data: ${JSON.stringify(data)})`
);
} }
return data.repository.id;
} }

View File

@ -24,11 +24,16 @@ export async function loadGithubData(options: Options): Promise<void> {
// > make requests for a single user or client ID concurrently. // > make requests for a single user or client ID concurrently.
const responses = []; const responses = [];
for (const repoId of options.repoIds) { for (const repoId of options.repoIds) {
responses.push(await fetchGithubRepo(repoId, options.token)); responses.push(
await fetchGithubRepo(repoId, {
token: options.token,
cacheDirectory: options.cacheDirectory,
})
);
} }
const view = new RelationalView(); const view = new RelationalView();
for (const response of responses) { for (const response of responses) {
view.addData(response); view.addRepository(response);
} }
view.compressByRemovingBody(); view.compressByRemovingBody();
const blob: Uint8Array = pako.gzip(JSON.stringify(view)); const blob: Uint8Array = pako.gzip(JSON.stringify(view));

View File

@ -2,7 +2,7 @@
import * as R from "./relationalView"; import * as R from "./relationalView";
import * as N from "./nodes"; import * as N from "./nodes";
import {exampleData, exampleRelationalView} from "./example/example"; import {exampleRepository, exampleRelationalView} from "./example/example";
import * as MapUtil from "../../util/map"; import * as MapUtil from "../../util/map";
describe("plugins/github/relationalView", () => { describe("plugins/github/relationalView", () => {
@ -276,8 +276,7 @@ describe("plugins/github/relationalView", () => {
describe("reaction detection", () => { describe("reaction detection", () => {
it("set of all reactions matches snapshot", () => { it("set of all reactions matches snapshot", () => {
const view = new R.RelationalView(); const view = exampleRelationalView();
view.addData(exampleData());
const urlToReactions = new Map(); const urlToReactions = new Map();
for (const reactable of view.reactableEntities()) { for (const reactable of view.reactableEntities()) {
const url = reactable.url(); const url = reactable.url();
@ -289,28 +288,26 @@ describe("plugins/github/relationalView", () => {
}); });
}); });
it("addData is idempotent", () => { it("addRepository is idempotent", () => {
const rv1 = new R.RelationalView(); const rv1 = new R.RelationalView();
rv1.addData(exampleData()); rv1.addRepository(exampleRepository());
const rv2 = new R.RelationalView(); const rv2 = new R.RelationalView();
rv2.addData(exampleData()); rv2.addRepository(exampleRepository());
rv2.addData(exampleData()); rv2.addRepository(exampleRepository());
// may be fragile // may be fragile
expect(rv1).toEqual(rv2); expect(rv1).toEqual(rv2);
}); });
describe("compressByRemovingBody", () => { describe("compressByRemovingBody", () => {
it("doesn't mutate the original entries", () => { it("doesn't mutate the original entries", () => {
const rv = new R.RelationalView(); const rv = exampleRelationalView();
rv.addData(exampleData());
const issue0 = Array.from(rv.issues())[0]; const issue0 = Array.from(rv.issues())[0];
expect(issue0.body()).not.toEqual(""); expect(issue0.body()).not.toEqual("");
rv.compressByRemovingBody(); rv.compressByRemovingBody();
expect(issue0.body()).not.toEqual(""); expect(issue0.body()).not.toEqual("");
}); });
it("removes bodies from all posts", () => { it("removes bodies from all posts", () => {
const rv = new R.RelationalView(); const rv = exampleRelationalView();
rv.addData(exampleData());
function somePostsHaveBodies() { function somePostsHaveBodies() {
for (const posts of [ for (const posts of [
rv.issues(), rv.issues(),
@ -331,8 +328,7 @@ describe("plugins/github/relationalView", () => {
expect(somePostsHaveBodies()).toBe(false); expect(somePostsHaveBodies()).toBe(false);
}); });
it("removes messages from all commits", () => { it("removes messages from all commits", () => {
const rv = new R.RelationalView(); const rv = exampleRelationalView();
rv.addData(exampleData());
function someCommitsHaveMessages() { function someCommitsHaveMessages() {
for (const commit of rv.commits()) { for (const commit of rv.commits()) {
if (commit.message() !== "") { if (commit.message() !== "") {

View File

@ -1,15 +1,9 @@
// @flow // @flow
import {exampleData} from "./example/example";
import translateContinuations from "./translateContinuations"; import translateContinuations from "./translateContinuations";
describe("plugins/github/translateContinuations", () => { describe("plugins/github/translateContinuations", () => {
describe("translateContinuations", () => { describe("translateContinuations", () => {
it("works on the example data", () => {
expect(translateContinuations(exampleData())).toMatchSnapshot();
});
it("raises a warning if the defaultBranchRef is not a commit", () => { it("raises a warning if the defaultBranchRef is not a commit", () => {
const exampleData = { const exampleData = {
repository: { repository: {