fetchGithubRepo: use Mirror pipeline (#937)

Summary:
As of this commit, `node ./bin/sourcecred.js load` uses the Mirror code,
and the legacy continuation-fetching code is not included in the
`sourcecred.js` bundle.

We do not yet perform the commit prefetching described in #923. The code
should be plenty fast for repositories that merge pull requests at least
occasionally.

Test Plan:
Running `yarn test --full` passes. Loading `sourcecred/sourcecred` works
and generates a reasonable credit attribution. Loading it again
completes immediately.

wchargin-branch: fetchGithubRepo-mirror
This commit is contained in:
William Chargin 2018-10-28 12:03:06 -07:00 committed by GitHub
parent e2c99c418b
commit 08219f98bf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 1102 additions and 2549 deletions

View File

@ -29,9 +29,6 @@ module.exports = {
backendEntryPoints: {
sourcecred: resolveApp("src/cli/main.js"),
//
testContinuations: resolveApp(
"src/plugins/github/bin/testContinuations.js"
),
generateGithubGraphqlFlowTypes: resolveApp(
"src/plugins/github/bin/generateGraphqlFlowTypes.js"
),

View File

@ -12,8 +12,10 @@
* from https://github.com/settings/tokens/new.
*/
import fetchGithubRepo from "../fetchGithubRepo";
import stringify from "json-stable-stringify";
import tmp from "tmp";
import fetchGithubRepo from "../fetchGithubRepo";
import {makeRepoId} from "../../../core/repoId";
function parseArgs() {
@ -36,7 +38,8 @@ function parseArgs() {
function main() {
const args = parseArgs();
const repoId = makeRepoId(args.owner, args.name);
fetchGithubRepo(repoId, args.githubToken)
const options = {token: args.githubToken, cacheDirectory: tmp.dirSync().name};
fetchGithubRepo(repoId, options)
.then((data) => {
console.log(stringify(data, {space: 4}));
})

View File

@ -1,106 +0,0 @@
// @flow
// Ad hoc testing script for RelationalView input format consistency.
import Database from "better-sqlite3";
import fs from "fs-extra";
import stringify from "json-stable-stringify";
import deepEqual from "lodash.isequal";
import {makeRepoId} from "../../../core/repoId";
import {Mirror} from "../../../graphql/mirror";
import fetchGithubRepo, {postQuery} from "../fetchGithubRepo";
import type {Repository} from "../graphqlTypes";
import {RelationalView, type RelationalViewJSON} from "../relationalView";
import githubSchema from "../schema";
async function test(options: {|
+token: string,
+owner: string,
+name: string,
+graphqlId: string,
+outputFilepaths: {|
+continuations: string,
+mirror: string,
|},
|}) {
async function fetchViaContinuations(): Promise<RelationalViewJSON> {
const raw = await fetchGithubRepo(
makeRepoId(options.owner, options.name),
options.token
);
const rv = new RelationalView();
rv.addData(raw);
return rv.toJSON();
}
async function fetchViaMirror(): Promise<RelationalViewJSON> {
const mirror = new Mirror(new Database(":memory:"), githubSchema());
mirror.registerObject({typename: "Repository", id: options.graphqlId});
await mirror.update((payload) => postQuery(payload, options.token), {
nodesLimit: 100,
nodesOfTypeLimit: 100,
connectionPageSize: 100,
connectionLimit: 100,
since: new Date(0),
now: () => new Date(),
});
const repository = ((mirror.extract(options.graphqlId): any): Repository);
const rv = new RelationalView();
rv.addRepository(repository);
return rv.toJSON();
}
function saveTo(filename: string, repo: RelationalViewJSON): Promise<void> {
return fs.writeFile(filename, stringify(repo));
}
const [viaContinuations, viaMirror] = await Promise.all([
fetchViaContinuations(),
fetchViaMirror(),
]);
if (deepEqual(viaContinuations, viaMirror)) {
console.log("Identical. Saving to disk...");
} else {
console.log("Different. Saving to disk...");
}
await Promise.all([
saveTo(options.outputFilepaths.continuations, viaContinuations),
saveTo(options.outputFilepaths.mirror, viaMirror),
]);
}
/**
 * Command-line entry point.
 *
 * Expects exactly five positional arguments (repository owner,
 * repository name, GraphQL ID, and the two output filenames) plus a
 * GitHub token in the `SOURCECRED_GITHUB_TOKEN` environment variable.
 * If either is missing, prints a usage line to stderr and sets a
 * failing exit code without doing any work. Otherwise, delegates to
 * `test`.
 */
async function main() {
  const args = process.argv.slice(2);
  const token = process.env.SOURCECRED_GITHUB_TOKEN;
  if (args.length !== 5 || token == null) {
    const invocation = [
      "SOURCECRED_GITHUB_TOKEN=<token>",
      "node",
      "test.js",
      "REPO_OWNER",
      "REPO_NAME",
      "GRAPHQL_ID",
      "CONTINUATIONS_OUTPUT_FILENAME",
      "MIRROR_OUTPUT_FILENAME",
    ];
    console.error("usage: " + invocation.join(" "));
    process.exitCode = 1;
    return;
  }
  const [owner, name, graphqlId, continuations, mirror] = args;
  const options = {
    token,
    owner,
    name,
    graphqlId,
    outputFilepaths: {
      continuations,
      mirror,
    },
  };
  await test(options);
}

// Surface failures explicitly: without a rejection handler, an error
// from `test` (network failure, rejected token, unwritable output
// path) would be an unhandled rejection, which may leave the process
// exiting with status 0.
main().catch((e) => {
  console.error(e);
  process.exitCode = 1;
});

File diff suppressed because it is too large Load Diff

View File

@ -1,18 +1,18 @@
// @flow
import {RelationalView} from "../relationalView";
import type {GithubResponseJSON} from "../graphql";
import type {Repository} from "../graphqlTypes";
import {Graph} from "../../../core/graph";
import cloneDeep from "lodash.clonedeep";
import {createGraph} from "../createGraph";
export function exampleData(): GithubResponseJSON {
export function exampleRepository(): Repository {
return cloneDeep(require("./example-github"));
}
export function exampleRelationalView(): RelationalView {
const rv = new RelationalView();
rv.addData(exampleData());
rv.addRepository(exampleRepository());
return rv;
}

View File

@ -4,13 +4,18 @@
* docstring of the default export for more details.
*/
import Database from "better-sqlite3";
import fetch from "isomorphic-fetch";
import path from "path";
import retry from "retry";
import {type RepoId, repoIdToString} from "../../core/repoId";
import {Mirror} from "../../graphql/mirror";
import * as Queries from "../../graphql/queries";
import {stringify, inlineLayout, type Body} from "../../graphql/queries";
import {createQuery, createVariables, postQueryExhaustive} from "./graphql";
import type {GithubResponseJSON} from "./graphql";
import type {RepoId} from "../../core/repoId";
import * as Schema from "../../graphql/schema";
import schema from "./schema";
import type {Repository} from "./graphqlTypes";
/**
* Scrape data from a GitHub repo using the GitHub API.
@ -25,27 +30,49 @@ import type {RepoId} from "../../core/repoId";
* scraped from the repository, with data format to be specified
* later
*/
export default function fetchGithubRepo(
export default async function fetchGithubRepo(
repoId: RepoId,
token: string
): Promise<GithubResponseJSON> {
token = String(token);
options: {|+token: string, +cacheDirectory: string|}
): Promise<Repository> {
const {token, cacheDirectory} = options;
const validToken = /^[A-Fa-f0-9]{40}$/;
if (!validToken.test(token)) {
throw new Error(`Invalid token: ${token}`);
}
const postQueryWithToken = (payload) => postQuery(payload, token);
const body = createQuery();
const variables = createVariables(repoId);
const payload = {body, variables};
return postQueryExhaustive(
(somePayload) => postQuery(somePayload, token),
payload
).then((x: GithubResponseJSON) => {
ensureNoMorePages(x);
return x;
const resolvedId: Schema.ObjectId = await resolveRepositoryGraphqlId(
postQueryWithToken,
repoId
);
// Key the cache file against the GraphQL ID, but make sure that the
// name is valid and uniquely identifying even on case-insensitive
// filesystems (HFS, HFS+, APFS, NTFS) or filesystems preventing
// equals signs in file names.
const dbFilename = `mirror_${Buffer.from(resolvedId).toString("hex")}.db`;
const db = new Database(path.join(cacheDirectory, dbFilename));
const mirror = new Mirror(db, schema());
mirror.registerObject({typename: "Repository", id: resolvedId});
// These are arbitrary tuning parameters.
// TODO(#638): Design a configuration system for plugins.
const ttlSeconds = 86400;
const nodesLimit = 100;
const connectionLimit = 100;
await mirror.update(postQueryWithToken, {
since: new Date(Date.now() - ttlSeconds * 1000),
now: () => new Date(),
// These properties are arbitrary tuning parameters.
nodesLimit,
connectionLimit,
// These values are the maxima allowed by GitHub.
nodesOfTypeLimit: 100,
connectionPageSize: 100,
});
return ((mirror.extract(resolvedId): any): Repository);
}
const GITHUB_GRAPHQL_SERVER = "https://api.github.com/graphql";
@ -185,23 +212,35 @@ export async function postQuery(
);
}
function ensureNoMorePages(result: any, path = []) {
if (result == null) {
return;
}
if (result.pageInfo) {
if (result.pageInfo.hasNextPage) {
console.error(result);
throw new Error(`More pages at: ${path.join()}`);
}
}
if (Array.isArray(result)) {
result.forEach((item, i) => {
ensureNoMorePages(item, [...path, i]);
});
} else if (typeof result === "object") {
Object.keys(result).forEach((k) => {
ensureNoMorePages(result[k], [...path, k]);
});
/**
 * Look up the GraphQL object ID of the repository identified by
 * `repoId` (its `owner` and `name`).
 *
 * Sends a single query named "ResolveRepositoryId" through the
 * provided `postQuery` callback, selecting only the `id` field of
 * `repository(owner:, name:)`.
 *
 * Resolves to the repository's `id`. Throws an `Error` whose message
 * includes the stringified repo ID and the raw response data if the
 * response's `repository` field is null or undefined.
 *
 * Note that the `postQuery` parameter shadows any module-level binding
 * of the same name inside this function.
 */
async function resolveRepositoryGraphqlId(
postQuery: ({+body: Body, +variables: mixed}) => Promise<any>,
repoId: RepoId
): Promise<Schema.ObjectId> {
const b = Queries.build;
// Query with two required string variables, `owner` and `name`,
// requesting just `repository { id }`.
const payload = {
body: [
b.query(
"ResolveRepositoryId",
[b.param("owner", "String!"), b.param("name", "String!")],
[
b.field(
"repository",
{owner: b.variable("owner"), name: b.variable("name")},
[b.field("id")]
),
]
),
],
variables: {owner: repoId.owner, name: repoId.name},
};
const data: {|+repository: null | {|+id: string|}|} = await postQuery(
payload
);
// A nullish `repository` is reported as "no such repository"; the raw
// response is included in the message to aid debugging.
if (data.repository == null) {
throw new Error(
`No such repository: ${repoIdToString(repoId)} ` +
`(response data: ${JSON.stringify(data)})`
);
}
return data.repository.id;
}

View File

@ -24,11 +24,16 @@ export async function loadGithubData(options: Options): Promise<void> {
// > make requests for a single user or client ID concurrently.
const responses = [];
for (const repoId of options.repoIds) {
responses.push(await fetchGithubRepo(repoId, options.token));
responses.push(
await fetchGithubRepo(repoId, {
token: options.token,
cacheDirectory: options.cacheDirectory,
})
);
}
const view = new RelationalView();
for (const response of responses) {
view.addData(response);
view.addRepository(response);
}
view.compressByRemovingBody();
const blob: Uint8Array = pako.gzip(JSON.stringify(view));

View File

@ -2,7 +2,7 @@
import * as R from "./relationalView";
import * as N from "./nodes";
import {exampleData, exampleRelationalView} from "./example/example";
import {exampleRepository, exampleRelationalView} from "./example/example";
import * as MapUtil from "../../util/map";
describe("plugins/github/relationalView", () => {
@ -276,8 +276,7 @@ describe("plugins/github/relationalView", () => {
describe("reaction detection", () => {
it("set of all reactions matches snapshot", () => {
const view = new R.RelationalView();
view.addData(exampleData());
const view = exampleRelationalView();
const urlToReactions = new Map();
for (const reactable of view.reactableEntities()) {
const url = reactable.url();
@ -289,28 +288,26 @@ describe("plugins/github/relationalView", () => {
});
});
it("addData is idempotent", () => {
it("addRepository is idempotent", () => {
const rv1 = new R.RelationalView();
rv1.addData(exampleData());
rv1.addRepository(exampleRepository());
const rv2 = new R.RelationalView();
rv2.addData(exampleData());
rv2.addData(exampleData());
rv2.addRepository(exampleRepository());
rv2.addRepository(exampleRepository());
// may be fragile
expect(rv1).toEqual(rv2);
});
describe("compressByRemovingBody", () => {
it("doesn't mutate the original entries", () => {
const rv = new R.RelationalView();
rv.addData(exampleData());
const rv = exampleRelationalView();
const issue0 = Array.from(rv.issues())[0];
expect(issue0.body()).not.toEqual("");
rv.compressByRemovingBody();
expect(issue0.body()).not.toEqual("");
});
it("removes bodies from all posts", () => {
const rv = new R.RelationalView();
rv.addData(exampleData());
const rv = exampleRelationalView();
function somePostsHaveBodies() {
for (const posts of [
rv.issues(),
@ -331,8 +328,7 @@ describe("plugins/github/relationalView", () => {
expect(somePostsHaveBodies()).toBe(false);
});
it("removes messages from all commits", () => {
const rv = new R.RelationalView();
rv.addData(exampleData());
const rv = exampleRelationalView();
function someCommitsHaveMessages() {
for (const commit of rv.commits()) {
if (commit.message() !== "") {

View File

@ -1,15 +1,9 @@
// @flow
import {exampleData} from "./example/example";
import translateContinuations from "./translateContinuations";
describe("plugins/github/translateContinuations", () => {
describe("translateContinuations", () => {
it("works on the example data", () => {
expect(translateContinuations(exampleData())).toMatchSnapshot();
});
it("raises a warning if the defaultBranchRef is not a commit", () => {
const exampleData = {
repository: {