Implement exhaustive fetching for our GitHub query (#123)

Summary:
This commit completes the ad hoc pagination solution described in #117:
we implement pagination specifically for our current query against the
GitHub API. This is done in such a way that reasonable additions to the
query will not be hard to implement—for instance, if we want to fetch
a new kind of field, the marginal cost is at most a bit of extra
copy-and-paste and some modifications to tests. However, we should
certainly still plan to implement the fully automatic pagination system
described in #117.
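
For concreteness, here is a hypothetical sketch of such an addition,
using the query-builder helpers from `github/graphql.js` (shown in the
diff below). The `reactions` field, its page-size constant, and its
subfields are invented for illustration:

    // Hypothetical sketch: inside the issue fragment in
    // `createFragments`, add a paginated `reactions` connection by
    // copying the existing connection pattern...
    b.field("reactions", {first: b.literal(REACTIONS_PAGE_SIZE)}, [
      makePageInfo(),
      b.field("nodes", {}, [b.field("id"), b.field("content")]),
    ]),
    // ...and extend the matching continuations generator to emit a
    // continuation whenever `reactions.pageInfo.hasNextPage` is true.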

Running on TensorBoard with the default page limits takes 30–33 seconds
over 7 queries, uses 103 GitHub API points (out of 5000 for any given
hour), and produces a JSON file that is 8.7MB (when pretty-printed).
This all seems quite reasonable to me.

Test Plan:
Extensive unit tests added. The snapshots are quite readable, by design.

For a real-world test, you can `yarn start` the artifact viewer and use
the GUI to fetch the data for tensorflow/tensorboard.

To demonstrate that the fetching process gives the same results
regardless of the page size, follow these steps:

 1. In `fetchGithubRepo.js`’s `postQuery` function, insert a new
    statement `console.error("Posting query...")` at the beginning. (It
    is important to print to stderr rather than stdout, so that the log
    lines do not perturb the output whose checksum we take in step 2;
    see the sketch after this list.)
 2. Run `yarn backend` and then invoke `fetchGithubRepo.js` on a repo
    large enough to require pagination, like SourceCred or TensorBoard.
    Pipe the result to `shasum -a 256` and note the SHA.
 3. In `github/graphql.js`, change the page size constants near the top
    of the file. Re-run step 2. The number of queries that are posted
    will vary significantly as the page size constants vary, but the
    checksum should remain the same.
 4. Repeat until satisfied. (I tried three sets of values: the standard
    values, the standard values but all divided by 10, and all 5s.)
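
For reference, after step 1 the top of `postQuery` (the same function
shown in the `fetchGithubRepo.js` diff below) would read:

    function postQuery({body, variables}, token) {
      // Log to stderr so that stdout, which is piped to `shasum` in
      // step 2, stays byte-for-byte identical across runs.
      console.error("Posting query...");
      const payload = {
        query: stringify.body(body, inlineLayout()),
        variables: variables,
      };
      // ... rest of the function is unchanged ...
    }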

wchargin-branch: ad-hoc-pagination
William Chargin, 2018-04-06 07:29:55 -07:00 (committed by GitHub)
parent e82b56e52c
commit 9d1200275e
4 changed files with 698 additions and 9 deletions

github/__snapshots__/graphql.test.js.snap

@@ -0,0 +1,202 @@
// Jest Snapshot v1, https://goo.gl/fbAQLP
exports[`graphql #postQueryExhaustive resolves a representative query 1`] = `
Object {
"repository": Object {
"id": "opaque-repo",
"issues": Object {
"nodes": Array [
Object {
"author": Object {
"__typename": "User",
"id": "opaque-user-decentralion",
"login": "decentralion",
},
"body": "Like it says, please comment!",
"comments": Object {
"nodes": Array [
Object {
"author": Object {
"__typename": "User",
"id": "opaque-user-decentralion",
"login": "decentralion",
},
"body": "Here: I'll start.",
"id": "opaque-issue1comment1",
"url": "opaque://issue/1/comment/1",
},
Object {
"author": Object {
"__typename": "User",
"id": "opaque-user-wchargin",
"login": "wchargin",
},
"body": "Closing due to no fun allowed.",
"id": "opaque-issue1comment2",
"url": "opaque://issue/1/comment/2",
},
Object {
"author": Object {
"__typename": "User",
"id": "opaque-user-decentralion",
"login": "decentralion",
},
"body": "That is not very nice.",
"id": "opaque-issue1comment3",
"url": "opaque://issue/1/comment/3",
},
],
"pageInfo": Object {
"endCursor": "opaque-cursor-issue1comments-v2",
"hasNextPage": false,
},
},
"id": "opaque-issue1",
"number": 1,
"title": "Request for comments",
},
Object {
"author": Object {
"__typename": "User",
"id": "opaque-user-wchargin",
"login": "wchargin",
},
"body": "You can comment here, too.",
"comments": Object {
"nodes": Array [
Object {
"author": Object {
"__typename": "User",
"id": "opaque-user-decentralion",
"login": "decentralion",
},
"body": "What fun!",
"id": "opaque-issue3comment1",
"url": "opaque://issue/3/comment/1",
},
Object {
"author": Object {
"__typename": "User",
"id": "opaque-user-decentralion",
"login": "decentralion",
},
"body": "I will comment on this issue for a second time.",
"id": "opaque-issue3comment2",
"url": "opaque://issue/1/comment/3",
},
],
"pageInfo": Object {
"endCursor": "opaque-cursor-issue3comments-v2",
"hasNextPage": false,
},
},
"id": "opaque-issue3",
"number": 2,
"title": "Another",
},
Object {
"author": Object {
"__typename": "User",
"id": "opaque-user-wchargin",
"login": "wchargin",
},
"body": "My mailbox is out of space",
"comments": Object {
"nodes": Array [
Object {
"author": Object {
"__typename": "User",
"id": "opaque-user-decentralion",
"login": "decentralion",
},
"body": "But you posted the last issue",
"id": "opaque-issue4comment1",
"url": "opaque://issue/4/comment/1",
},
],
"pageInfo": Object {
"endCursor": "opaque-cursor-issue4comments-v2",
"hasNextPage": false,
},
},
"id": "opaque-issue4",
"number": 4,
"title": "Please stop making issues",
},
],
"pageInfo": Object {
"endCursor": "opaque-cursor-issues-v2",
"hasNextPage": false,
},
},
"pullRequests": Object {
"nodes": Array [
Object {
"author": Object {
"__typename": "User",
"id": "opaque-user-wchargin",
"login": "wchargin",
},
"body": "Surely this deserves much cred.",
"comments": Object {
"nodes": Array [],
"pageInfo": Object {
"endCursor": null,
"hasNextPage": false,
},
},
"id": "opaque-pr2",
"number": 2,
"reviews": Object {
"nodes": Array [
Object {
"author": Object {
"__typename": "User",
"id": "opaque-user-decentralion",
"login": "decentralion",
},
"body": "You actually introduced a new typo instead.",
"comments": Object {
"nodes": Array [],
"pageInfo": Object {
"endCursor": null,
"hasNextPage": false,
},
},
"id": "opaque-pr2review1",
"state": "CHANGES_REQUESTED",
},
Object {
"author": Object {
"__typename": "User",
"id": "opaque-user-decentralion",
"login": "decentralion",
},
"body": "Looks godo to me.",
"comments": Object {
"nodes": Array [],
"pageInfo": Object {
"endCursor": null,
"hasNextPage": false,
},
},
"id": "opaque-pr2review2",
"state": "APPROVED",
},
],
"pageInfo": Object {
"endCursor": "opaque-cursor-pr2reviews-v1",
"hasNextPage": false,
},
},
"title": "Fix typo in README",
},
],
"pageInfo": Object {
"endCursor": "opaque-cursor-prs-v0",
"hasNextPage": false,
},
},
},
}
`;

github/fetchGithubRepo.js

@@ -7,7 +7,7 @@
 import fetch from "isomorphic-fetch";
 
 import {stringify, inlineLayout} from "../../graphql/queries";
-import {createQuery, createVariables} from "./graphql";
+import {createQuery, createVariables, postQueryExhaustive} from "./graphql";
 
 /**
  * Scrape data from a GitHub repo using the GitHub API.
@@ -45,15 +45,28 @@ export default function fetchGithubRepo(
     throw new Error(`Invalid token: ${token}`);
   }
 
-  const query = stringify.body(createQuery(), inlineLayout());
+  const body = createQuery();
   const variables = createVariables(repoOwner, repoName);
-  const payload = {query, variables};
-  return postQuery(payload, token);
+  const payload = {body, variables};
+  return postQueryExhaustive(
+    (somePayload) => postQuery(somePayload, token),
+    payload
+  ).then((x) => {
+    ensureNoMorePages(x);
+    // TODO: We wrap back up in the "data" object to maintain
+    // compatibility. At some point, let's strip this out and modify
+    // clients of `example-data.json`.
+    return {data: x};
+  });
 }
 
 const GITHUB_GRAPHQL_SERVER = "https://api.github.com/graphql";
 
-function postQuery(payload, token) {
+function postQuery({body, variables}, token) {
+  const payload = {
+    query: stringify.body(body, inlineLayout()),
+    variables: variables,
+  };
   return fetch(GITHUB_GRAPHQL_SERVER, {
     method: "POST",
     body: JSON.stringify(payload),
@@ -66,8 +79,7 @@ function postQuery(payload, token) {
       if (x.errors) {
         return Promise.reject(x);
       }
-      ensureNoMorePages(x);
-      return Promise.resolve(x);
+      return Promise.resolve(x.data);
     });
 }

github/graphql.js

@@ -1,6 +1,11 @@
 // @flow
 
-import type {Body, FragmentDefinition, Selection} from "../../graphql/queries";
+import type {
+  Body,
+  FragmentDefinition,
+  Selection,
+  QueryDefinition,
+} from "../../graphql/queries";
 import {build} from "../../graphql/queries";
 
 /**
@@ -340,6 +345,156 @@ function* continuationsFromReview(
}
}
/**
* Execute the given query, returning all pages of data throughout the
* query. That is: post the query, then find any entities that require
* more pages of data, fetch those additional pages, and merge the
* results. The `postQuery` function may be called multiple times.
*/
export async function postQueryExhaustive(
postQuery: ({body: Body, variables: {[string]: any}}) => Promise<any>,
payload: {body: Body, variables: {[string]: any}}
) {
const originalResult = await postQuery(payload);
return resolveContinuations(
postQuery,
originalResult,
Array.from(continuationsFromQuery(originalResult))
);
}
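// Illustrative usage, assuming a transport bound to an API token in
// the style of `postQuery` from `fetchGithubRepo.js` above:
//
//   const result = await postQueryExhaustive(
//     (somePayload) => postQuery(somePayload, token),
//     {body: createQuery(), variables: createVariables(repoOwner, repoName)}
//   );
//
// The resolved `result` contains every page of issues, comments, and
// reviews, with no `pageInfo.hasNextPage` left `true`.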
/**
* Given the result of a query and the continuations for that query,
* resolve the continuations and return the merged results.
*/
async function resolveContinuations(
postQuery: ({body: Body, variables: {[string]: any}}) => Promise<any>,
originalResult: any,
continuations: $ReadOnlyArray<Continuation>
): Promise<any> {
if (!continuations.length) {
return originalResult;
}
// Assign each continuation a nonce (unique name) so that we can refer
// to it unambiguously, and embed all the continuations into a query.
const embeddings = continuations.map((c, i) => ({
continuation: c,
nonce: `_n${String(i)}`,
}));
const b = build;
const query = b.query(
"Continuations",
[],
embeddings.map(({continuation, nonce}) =>
b.alias(
nonce,
b.field(
"node",
{id: b.literal(continuation.enclosingNodeId)},
continuation.selections.slice()
)
)
)
);
const body = [query, ...requiredFragments(query)];
const payload = {body, variables: {}};
// Send the continuation query, then merge these results into the
// original results---then recur, because the new results may
// themselves be incomplete.
const continuationResult = await postQuery(payload);
const mergedResults = embeddings.reduce((acc, {continuation, nonce}) => {
return merge(acc, continuationResult[nonce], continuation.destinationPath);
}, originalResult);
return resolveContinuations(
postQuery,
mergedResults,
Array.from(continuationsFromQuery(mergedResults))
);
}
/**
* A GraphQL query includes a query body and some fragment definitions.
* It is an error to include unneeded fragment definitions. Therefore,
* given a standard set of fragments and an arbitrary query body, we
* need to be able to select just the right set of fragments for our
* particular query.
*
* This function finds all fragments that are transitively used by the
* given query. That is, it finds all fragments used by the query, all
* fragments used by those fragments, and so on. Note that the universe
* of fragments is considered to be the result of `createFragments`; it
* is an error to use a fragment not defined in the result of that
* function.
*
* Equivalently, construct a graph where the nodes are (a) the query and
* (b) all possible fragments, and there is an edge from `a` to `b` if
* `a` references `b`. Then, this function finds the set of vertices
* reachable from the query node.
*/
export function requiredFragments(
query: QueryDefinition
): FragmentDefinition[] {
const fragmentsByName = {};
createFragments().forEach((fd) => {
fragmentsByName[fd.name] = fd;
});
// This function implements a BFS on the graph specified in the
// docstring, with the provisos that the nodes for fragments are the
// fragment names, not the fragments themselves, and that the query
// node is implicit (in the initial value of the `frontier`).
const requiredFragmentNames: Set<string> = new Set();
let frontier: Set<string> = new Set();
query.selections.forEach((selection) => {
for (const fragment of usedFragmentNames(selection)) {
frontier.add(fragment);
}
});
while (frontier.size > 0) {
frontier.forEach((name) => {
requiredFragmentNames.add(name);
});
const newFrontier: Set<string> = new Set();
for (const name of frontier) {
const fragment = fragmentsByName[name];
if (fragment == null) {
throw new Error(`Unknown fragment: "${name}"`);
}
fragment.selections.forEach((selection) => {
for (const fragment of usedFragmentNames(selection)) {
newFrontier.add(fragment);
}
});
}
frontier = newFrontier;
}
return createFragments().filter((fd) => requiredFragmentNames.has(fd.name));
}
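// Example, mirroring the test in `graphql.test.js` below: the
// "reviews" fragment transitively depends on "reviewComments" and
// "whoami", so for a hypothetical `queryUsingReviews` that spreads
// only "reviews":
//
//   requiredFragments(queryUsingReviews).map((fd) => fd.name).sort();
//   // => ["reviewComments", "reviews", "whoami"]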
/**
* Find all fragment names directly referenced by the given selection.
* This does not include transitive fragment references.
*/
function* usedFragmentNames(selection: Selection): Iterator<string> {
switch (selection.type) {
case "FRAGMENT_SPREAD":
yield selection.fragmentName;
break;
case "FIELD":
case "INLINE_FRAGMENT":
for (const subselection of selection.selections) {
yield* usedFragmentNames(subselection);
}
break;
default:
throw new Error(`Unknown selection type: ${selection.type}`);
}
}
/**
* Merge structured data from the given `source` into a given subpath of
* the `destination`. The original inputs are not modified.
@@ -456,7 +611,7 @@ function mergeDirect<T>(destination: T, source: any): T {
  * These fragments are used to construct the root query, and also to
  * fetch more pages of specific entity types.
  */
-function createFragments(): FragmentDefinition[] {
+export function createFragments(): FragmentDefinition[] {
   const b = build;
   const makePageInfo = () =>
     b.field("pageInfo", {}, [b.field("hasNextPage"), b.field("endCursor")]);

github/graphql.test.js

@@ -2,11 +2,17 @@
import type {Continuation} from "./graphql";
import {build} from "../../graphql/queries";
import {stringify, multilineLayout} from "../../graphql/queries";
import {
PAGE_LIMIT,
createQuery,
createVariables,
continuationsFromQuery,
continuationsFromContinuation,
createFragments,
merge,
postQueryExhaustive,
requiredFragments,
} from "./graphql";
describe("graphql", () => {
@@ -631,4 +637,318 @@ describe("graphql", () => {
});
});
});
describe("#postQueryExhaustive", () => {
it("finds no fragments in an empty query", () => {
const b = build;
const query = b.query("Noop", [], []);
expect(requiredFragments(query)).toEqual([]);
});
it("finds a fragment with no dependencies", () => {
const b = build;
const query = b.query(
"FindReviewComments",
[],
[
b.field("node", {id: b.literal("some-user")}, [
b.inlineFragment("Actor", [b.fragmentSpread("whoami")]),
]),
]
);
const result = requiredFragments(query);
expect(result.map((fd) => fd.name).sort()).toEqual(["whoami"]);
result.forEach((fd) => expect(createFragments()).toContainEqual(fd));
});
it("transitively finds dependent fragments", () => {
const b = build;
const query = b.query(
"FindReviewComments",
[],
[
b.field("node", {id: b.literal("some-pull-request")}, [
b.inlineFragment("PullRequest", [
b.field(
"reviews",
{
first: b.literal(1),
},
[b.fragmentSpread("reviews")]
),
]),
]),
]
);
const result = requiredFragments(query);
expect(result.map((fd) => fd.name).sort()).toEqual([
"reviewComments",
"reviews",
"whoami",
]);
result.forEach((fd) => expect(createFragments()).toContainEqual(fd));
});
});
describe("#postQueryExhaustive", () => {
it("resolves a representative query", async () => {
const makeAuthor = (name) => ({
__typename: "User",
login: name,
id: `opaque-user-${name}`,
});
// We'll have three stages:
// - The original result will need more issues, and more
// comments for issue 1, and more reviews for PR 2.
// - The next result will need more issues, and comments for
// issues 1 (original issue) and 3 (new issue).
// - The final result will need no more data.
// We obey the contract pretty much exactly, except that we return
// far fewer results than are asked for by the query.
//
// Here is the response to the initial query.
const response0 = {
repository: {
id: "opaque-repo",
issues: {
pageInfo: {
hasNextPage: true,
endCursor: "opaque-cursor-issues-v0",
},
nodes: [
{
id: "opaque-issue1",
title: "Request for comments",
body: "Like it says, please comment!",
number: 1,
author: makeAuthor("decentralion"),
comments: {
pageInfo: {
hasNextPage: true,
endCursor: "opaque-cursor-issue1comments-v0",
},
nodes: [
{
id: "opaque-issue1comment1",
body: "Here: I'll start.",
url: "opaque://issue/1/comment/1",
author: makeAuthor("decentralion"),
},
],
},
},
],
},
pullRequests: {
pageInfo: {
hasNextPage: false,
endCursor: "opaque-cursor-prs-v0",
},
nodes: [
{
id: "opaque-pr2",
title: "Fix typo in README",
body: "Surely this deserves much cred.",
number: 2,
author: makeAuthor("wchargin"),
comments: {
pageInfo: {
hasNextPage: false,
endCursor: null,
},
nodes: [],
},
reviews: {
pageInfo: {
hasNextPage: true,
endCursor: "opaque-cursor-pr2reviews-v0",
},
nodes: [
{
id: "opaque-pr2review1",
body: "You actually introduced a new typo instead.",
author: makeAuthor("decentralion"),
state: "CHANGES_REQUESTED",
comments: {
pageInfo: {
hasNextPage: false,
endCursor: null,
},
nodes: [],
},
},
],
},
},
],
},
},
};
// Here is the response to the continuations generated from the
// first query.
const response1 = {
_n0: {
// Requested more issues.
issues: {
pageInfo: {
hasNextPage: true,
endCursor: "opaque-cursor-issues-v1",
},
nodes: [
{
id: "opaque-issue3",
title: "Another",
body: "You can comment here, too.",
number: 3,
author: makeAuthor("wchargin"),
comments: {
pageInfo: {
hasNextPage: true,
endCursor: "opaque-cursor-issue3comments-v1",
},
nodes: [
{
id: "opaque-issue3comment1",
body: "What fun!",
url: "opaque://issue/3/comment/1",
author: makeAuthor("decentralion"),
},
],
},
},
],
},
},
_n1: {
// Requested more comments for issue 1.
comments: {
pageInfo: {
hasNextPage: true,
endCursor: "opaque-cursor-issue1comments-v1",
},
nodes: [
{
id: "opaque-issue1comment2",
body: "Closing due to no fun allowed.",
url: "opaque://issue/1/comment/2",
author: makeAuthor("wchargin"),
},
],
},
},
_n2: {
// Requested more reviews for PR 2.
reviews: {
pageInfo: {
hasNextPage: false,
endCursor: "opaque-cursor-pr2reviews-v1",
},
nodes: [
{
id: "opaque-pr2review2",
body: "Looks godo to me.",
author: makeAuthor("decentralion"),
state: "APPROVED",
comments: {
pageInfo: {
hasNextPage: false,
endCursor: null,
},
nodes: [],
},
},
],
},
},
};
// Here is the response to the continuations generated from the
// second query.
const response2 = {
_n0: {
// Requested more issues.
issues: {
pageInfo: {
hasNextPage: false,
endCursor: "opaque-cursor-issues-v2",
},
nodes: [
{
id: "opaque-issue4",
title: "Please stop making issues",
body: "My mailbox is out of space",
number: 4,
author: makeAuthor("wchargin"),
comments: {
pageInfo: {
hasNextPage: false,
endCursor: "opaque-cursor-issue4comments-v2",
},
nodes: [
{
id: "opaque-issue4comment1",
body: "But you posted the last issue",
url: "opaque://issue/4/comment/1",
author: makeAuthor("decentralion"),
},
],
},
},
],
},
},
_n1: {
// Requested more comments for issue 1.
comments: {
pageInfo: {
hasNextPage: false,
endCursor: "opaque-cursor-issue1comments-v2",
},
nodes: [
{
id: "opaque-issue1comment3",
body: "That is not very nice.",
url: "opaque://issue/1/comment/3",
author: makeAuthor("decentralion"),
},
],
},
},
_n2: {
// Requested more comments for issue 3.
comments: {
pageInfo: {
hasNextPage: false,
endCursor: "opaque-cursor-issue3comments-v2",
},
nodes: [
{
id: "opaque-issue3comment2",
body: "I will comment on this issue for a second time.",
url: "opaque://issue/1/comment/3",
author: makeAuthor("decentralion"),
},
],
},
},
};
const postQuery = jest
.fn()
.mockReturnValueOnce(Promise.resolve(response0))
.mockReturnValueOnce(Promise.resolve(response1))
.mockReturnValueOnce(Promise.resolve(response2));
const result = await postQueryExhaustive(postQuery, {
body: createQuery(),
variables: createVariables("sourcecred", "discussion"),
});
expect(postQuery).toHaveBeenCalledTimes(3);
// Save the result snapshot for inspection. In particular, there
// shouldn't be any nodes in the snapshot that have more pages.
expect(result).toMatchSnapshot();
});
});
});