github: translate old format to structured format (#930)

Summary:
This implements the translation module described in #923. See that issue
for context.

Test Plan:
This is a mostly straightforward translation from one strongly typed
data structure to another, so Flow handles most of it.

As a check on the snapshot, run:

```
$ grep -e oid -e target -e mergeCommit \
> src/plugins/github/__snapshots__/translateContinuations.test.js.snap
      "target": Object {
        "oid": "6bd1b4c0b719c22c688a74863be07a699b7b9b34",
            "oid": "c430bd74455105f77215ece51945094ceeee6c86",
                "oid": "6d5b3aa31ebb68a06ceb46bbd6cf49b6ccd6f5e6",
                    "oid": "0a223346b4e6dec0127b1e6aa892c4ee0424b66a",
                        "oid": "ec91adb718a6045b492303f00d8e8beb957dc780",
                        "oid": "ecc889dc94cf6da17ae6eab5bb7b7155f577519d",
                            "oid": "ec91adb718a6045b492303f00d8e8beb957dc780",
        "mergeCommit": Object {
          "oid": "0a223346b4e6dec0127b1e6aa892c4ee0424b66a",
              "oid": "ec91adb718a6045b492303f00d8e8beb957dc780",
              "oid": "ecc889dc94cf6da17ae6eab5bb7b7155f577519d",
                  "oid": "ec91adb718a6045b492303f00d8e8beb957dc780",
        "mergeCommit": Object {
          "oid": "6d5b3aa31ebb68a06ceb46bbd6cf49b6ccd6f5e6",
              "oid": "0a223346b4e6dec0127b1e6aa892c4ee0424b66a",
                  "oid": "ec91adb718a6045b492303f00d8e8beb957dc780",
                  "oid": "ecc889dc94cf6da17ae6eab5bb7b7155f577519d",
                      "oid": "ec91adb718a6045b492303f00d8e8beb957dc780",
        "mergeCommit": null,
```

Cross-check this against [the example-github commits][commits] thus:

  - Note that commit `6bd1b4c` is the head commit, and is thus the root
    commit of the `target` chain.
  - Note that commits `0a22334` and `6d5b3aa`, which were merged via
    pull request, appear twice each: once in the history from head, and
    once as the merge commit of a pull request.
  - Note that commit `0a22334` has two parents at each occurrence.
  - Note that the unmerged pull request’s merge commit is `null`.

[commits]: https://github.com/sourcecred/example-github/commits/master

To run this on real-world data, apply the following patch:

```diff
diff --git a/src/plugins/github/fetchGithubRepo.js b/src/plugins/github/fetchGithubRepo.js
index 6ac201af..b14ca760 100644
--- a/src/plugins/github/fetchGithubRepo.js
+++ b/src/plugins/github/fetchGithubRepo.js
@@ -11,6 +11,7 @@ import {stringify, inlineLayout, type Body} from "../../graphql/queries";
 import {createQuery, createVariables, postQueryExhaustive} from "./graphql";
 import type {GithubResponseJSON} from "./graphql";
 import type {RepoId} from "../../core/repoId";
+import translateContinuations from "./translateContinuations";

 /**
  * Scrape data from a GitHub repo using the GitHub API.
@@ -44,6 +45,11 @@ export default function fetchGithubRepo(
     payload
   ).then((x: GithubResponseJSON) => {
     ensureNoMorePages(x);
+    console.warn("Translating continuations...");
+    for (const w of translateContinuations(x).warnings) {
+      console.warn(w);
+    }
+    console.warn("Done.");
     return x;
   });
 }
```

Then run:

```
$ yarn backend >/dev/null 2>/dev/null; echo $?
0
$ node ./bin/sourcecred.js load sourcecred/sourcecred --plugin github 2>&1 |
> ts -s '%.s'
55.015740 Translating continuations...
55.037217 { type: 'UNKNOWN_PARENT_OID',
55.037273   child: '0d38dde23a6de831315f3643a7d2bc15e8df7678',
55.037290   parent: 'cb8ba0eaa1abc1f921e7165bb19e29b40723ce65' }
55.037309 { type: 'UNKNOWN_PARENT_OID',
55.037336   child: 'd152f48ce4c2ed1d046bf6ed4f139e7e393ea660',
55.037359   parent: 'de7a8723963d9cd0437ef34f5942a071b850c0e7' }
55.037383 Done.
```

Note that the two commits in question were each merged into a non-master
branch, in #28 and #329 respectively. Note also that translating these
continuations took just 22 milliseconds.

wchargin-branch: github-translate-continuations
This commit is contained in:
William Chargin 2018-10-22 10:01:49 -07:00 committed by GitHub
parent 6499df6b6b
commit 993de9303a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 1673 additions and 0 deletions

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,331 @@
// @flow
// Temporary module to translate GraphQL results from the old format
// with manually resolved continuations to the format emitted by the
// Mirror module. See issue #923 for context.
import type {
AuthorJSON,
BotJSON,
CommentJSON,
CommitJSON,
GitObjectJSON,
GithubResponseJSON,
IssueJSON,
OrganizationJSON,
PullJSON,
ReactionJSON,
RefJSON,
RepositoryJSON,
ReviewCommentJSON,
ReviewJSON,
UserJSON,
} from "./graphql";
import type {
Actor,
Blob,
Bot,
Commit,
GitObject,
GitObjectID,
Issue,
IssueComment,
Organization,
PullRequest,
PullRequestReview,
PullRequestReviewComment,
Reaction,
Ref,
Repository,
RepositoryOwner,
Tag,
Tree,
User,
} from "./graphqlTypes";
// Non-fatal diagnostics collected while translating a response. They
// are returned to the caller alongside the translated repository
// rather than being thrown.
export type Warning =
// The default branch ref points at a Git object that is not a commit.
// We've never seen it happen, and don't know how it could. But the
// GitHub schema says that it can. This warning is more of a
// diagnostic to the SourceCred maintainers (if it comes up on a real
// repository, we can learn something!) than an indication that
// something has gone wrong.
| {|+type: "NON_COMMIT_REF_TARGET", +target: GitObjectJSON|}
// This can happen if a commit has a parent that we did not fetch. We
// only fetch commits that are Git-reachable from HEAD or are the direct
// merge commit of a pull request. We may therefore omit commits that
// disappeared from master after a force-push, or were an ancestor of a
// pull request that was merged into a branch other than master. See
// issue #923 for more context. When this warning is emitted, the
// offending parent commit is simply left out of the child commit's
// `parents` list.
| {|+type: "UNKNOWN_PARENT_OID", +child: GitObjectID, +parent: GitObjectID|};
export default function translate(
json: GithubResponseJSON
): {|
+result: Repository,
+warnings: $ReadOnlyArray<Warning>,
|} {
const repositoryJson = json.repository;
const warnings: Array<Warning> = [];
// Most of the work that this function does is exploding connections
// into lists of nodes. But commits require some special attention,
// because we have to resolve parent OIDs to actual parent commits.
// This means that it is most convenient to start by discovering all
// commits in the data.
const commits: Map<
GitObjectID,
{|
...Commit,
parents: Array<null | Commit>, // mutable: we build this incrementally
|}
> = new Map();
// First, create all the commit objects, initializing them with empty
// parent arrays. We put these temporarily into a map keyed by OID for
// deduplication: a commit may appear both in the linearized history
// from HEAD and also as the merge commit of a pull request, and we
// want to process it just once.
const commitJsons: $ReadOnlyArray<CommitJSON> = Array.from(
new Map(
Array.from(
(function*() {
if (repositoryJson.defaultBranchRef) {
const target = repositoryJson.defaultBranchRef.target;
switch (target.__typename) {
case "Commit":
yield* target.history.nodes;
break;
case "Tree":
case "Blob":
case "Tag":
warnings.push({type: "NON_COMMIT_REF_TARGET", target});
break;
// istanbul ignore next: unreachable per Flow
default:
throw new Error((target.type: empty));
}
}
for (const pull of repositoryJson.pulls.nodes) {
if (pull.mergeCommit) {
yield pull.mergeCommit;
}
}
})()
).map((json) => [json.oid, json])
).values()
);
for (const commitJson of commitJsons) {
const commit = {
__typename: "Commit",
author: {...commitJson.author},
id: commitJson.id,
message: commitJson.message,
oid: commitJson.oid,
parents: [],
url: commitJson.url,
};
commits.set(commit.oid, commit);
}
// Then, once all the objects have been created, we can set up the
// parents.
for (const commitJson of commitJsons) {
const commit = commits.get(commitJson.oid);
// istanbul ignore next: should not be possible
if (commit == null) {
throw new Error(
"invariant violation: commit came out of nowhere: " + commitJson.oid
);
}
for (const {oid: parentOid} of commitJson.parents.nodes) {
const parentCommit = commits.get(parentOid);
if (parentCommit == null) {
warnings.push({
type: "UNKNOWN_PARENT_OID",
child: commitJson.oid,
parent: parentOid,
});
} else {
commit.parents.push(parentCommit);
}
}
}
// The rest is mostly mechanical. The pattern is: we pull off and
// recursively translate the non-primitive fields of each object, and
// then add a typename and put back the primitives. For union types,
// we switch on the __typename and dispatch to the appropriate object
// translators.
function translateRepository(json: RepositoryJSON): Repository {
const {defaultBranchRef, issues, owner, pulls, ...rest} = json;
return {
__typename: "Repository",
defaultBranchRef:
defaultBranchRef == null
? null
: translateDefaultBranchRef(defaultBranchRef),
issues: issues.nodes.map(translateIssue),
owner: translateRepositoryOwner(owner),
pullRequests: pulls.nodes.map(translatePullRequest),
...rest,
};
}
function translateDefaultBranchRef(json: RefJSON): Ref {
const {target, ...rest} = json;
return {
__typename: "Ref",
target: translateDefaultBranchRefTarget(target),
...rest,
};
}
// This one is a bit wonky, because our `GitObjectJSON` type is not a
// good representation of the GitHub schema. In particular, a
// `GitObjectJSON` can represent a commit, but in a different form
// than our `CommitJSON`! This function _only_ applies to
// `GitObjectJSON`s that we fetched as the `target` of the
// `defaultBranchRef` of a repository. But these are the only
// `GitObjectJSON`s that we fetch, so it's okay.
function translateDefaultBranchRefTarget(json: GitObjectJSON): GitObject {
switch (json.__typename) {
case "Commit":
// The default branch ref is `null` if there are no commits, so
// the history must include at least one commit (the HEAD
// commit).
return lookUpCommit(json.history.nodes[0].oid);
case "Blob":
return ({...json}: Blob);
case "Tag":
return ({...json}: Tag);
case "Tree":
return ({...json}: Tree);
// istanbul ignore next: unreachable per Flow
default:
throw new Error((json.__typename: empty));
}
}
function lookUpCommit(oid: GitObjectID): Commit {
const commit = commits.get(oid);
// istanbul ignore if: unreachable: we explored all commits in
// the response, including this one.
if (commit == null) {
throw new Error("invariant violation: unknown commit: " + oid);
}
return commit;
}
function translateCommit(json: CommitJSON): Commit {
return lookUpCommit(json.oid);
}
function translateIssue(json: IssueJSON): Issue {
const {author, comments, reactions, ...rest} = json;
return {
__typename: "Issue",
author: author == null ? null : translateActor(author),
comments: comments.nodes.map(translateIssueComment),
reactions: reactions.nodes.map(translateReaction),
...rest,
};
}
function translateIssueComment(json: CommentJSON): IssueComment {
const {author, reactions, ...rest} = json;
return {
__typename: "IssueComment",
author: author == null ? null : translateActor(author),
reactions: reactions.nodes.map(translateReaction),
...rest,
};
}
function translateReaction(json: ReactionJSON): Reaction {
const {user, ...rest} = json;
return {
__typename: "Reaction",
user: user == null ? null : translateUser(user),
...rest,
};
}
function translateRepositoryOwner(
json: UserJSON | OrganizationJSON
): RepositoryOwner {
switch (json.__typename) {
case "User":
return translateUser(json);
case "Organization":
return translateOrganization(json);
// istanbul ignore next: unreachable per Flow
default:
throw new Error((json.__typename: empty));
}
}
function translateActor(json: AuthorJSON): Actor {
switch (json.__typename) {
case "User":
return translateUser(json);
case "Organization":
return translateOrganization(json);
case "Bot":
return translateBot(json);
// istanbul ignore next: unreachable per Flow
default:
throw new Error((json.__typename: empty));
}
}
function translateUser(json: UserJSON): User {
return {...json};
}
function translateOrganization(json: OrganizationJSON): Organization {
return {...json};
}
function translateBot(json: BotJSON): Bot {
return {...json};
}
function translatePullRequest(json: PullJSON): PullRequest {
const {author, comments, mergeCommit, reactions, reviews, ...rest} = json;
return {
__typename: "PullRequest",
author: author == null ? null : translateActor(author),
comments: comments.nodes.map(translateIssueComment),
mergeCommit: mergeCommit == null ? null : translateCommit(mergeCommit),
reactions: reactions.nodes.map(translateReaction),
reviews: reviews.nodes.map(translatePullRequestReview),
...rest,
};
}
function translatePullRequestReview(json: ReviewJSON): PullRequestReview {
const {author, comments, ...rest} = json;
return {
__typename: "PullRequestReview",
author: author == null ? null : translateActor(author),
comments: comments.nodes.map(translatePullRequestReviewComment),
...rest,
};
}
function translatePullRequestReviewComment(
json: ReviewCommentJSON
): PullRequestReviewComment {
const {author, reactions, ...rest} = json;
return {
__typename: "PullRequestReviewComment",
author: author == null ? null : translateActor(author),
reactions: reactions.nodes.map(translateReaction),
...rest,
};
}
const result = translateRepository(repositoryJson);
return {result, warnings};
}

View File

@ -0,0 +1,150 @@
// @flow
import {exampleData} from "./example/example";
import translateContinuations from "./translateContinuations";
// Tests for the default export of `./translateContinuations`: one
// snapshot test over the example data, and one hand-built input for
// each kind of warning.
describe("plugins/github/translateContinuations", () => {
describe("translateContinuations", () => {
// Snapshot test over the shared example data.
it("works on the example data", () => {
expect(translateContinuations(exampleData())).toMatchSnapshot();
});
// A repository with no issues or pull requests, whose default branch
// ref targets a Tree rather than a Commit. The target should pass
// through unchanged, and a single warning should be raised.
it("raises a warning if the defaultBranchRef is not a commit", () => {
// NOTE: this local shadows the imported `exampleData` function.
const exampleData = {
repository: {
defaultBranchRef: {
id: "ref-id",
target: {
__typename: "Tree",
id: "tree-id",
oid: "123",
},
},
id: "repo-id",
issues: {
nodes: [],
pageInfo: {hasNextPage: false, endCursor: null},
},
name: "bar",
owner: {
__typename: "User",
id: "user-id",
login: "foo",
url: "https://github.com/foo",
},
pulls: {
nodes: [],
pageInfo: {hasNextPage: false, endCursor: null},
},
url: "https://github.com/foo/bar",
},
};
const {result, warnings} = translateContinuations(exampleData);
expect(result.defaultBranchRef).toEqual({
__typename: "Ref",
id: "ref-id",
target: {__typename: "Tree", id: "tree-id", oid: "123"},
});
expect(warnings).toEqual([
{
type: "NON_COMMIT_REF_TARGET",
target: {__typename: "Tree", id: "tree-id", oid: "123"},
},
]);
});
// A repository with no default branch and a single pull request whose
// merge commit ("456") lists a parent ("789") that appears nowhere
// else in the input. The merge commit should be translated with an
// empty parent list, and a single warning should be raised.
it("raises a warning if there is an unknown commit", () => {
// NOTE: this local shadows the imported `exampleData` function.
const exampleData = {
repository: {
defaultBranchRef: null,
id: "repo-id",
issues: {
nodes: [],
pageInfo: {hasNextPage: false, endCursor: null},
},
name: "bar",
owner: {
__typename: "User",
id: "user-id",
login: "foo",
url: "https://github.com/foo",
},
pulls: {
nodes: [
{
id: "pr-id",
number: 1,
author: {
__typename: "Bot",
id: "bot-id",
login: "baz",
url: "https://github.com/baz",
},
additions: 7,
deletions: 9,
comments: {
nodes: [],
pageInfo: {hasNextPage: false, endCursor: null},
},
reviews: {
nodes: [],
pageInfo: {hasNextPage: false, endCursor: null},
},
reactions: {
nodes: [],
pageInfo: {hasNextPage: false, endCursor: null},
},
mergeCommit: {
id: "commit-id",
author: {
date: "2001-02-03T04:05:06",
user: null,
},
message: "where are my parents?",
oid: "456",
parents: {
nodes: [{oid: "789"}],
pageInfo: {hasNextPage: false, endCursor: "cursor-parents"},
},
url: "https://github.com/foo/bar/commit/456",
},
title: "something",
body: "whatever",
url: "https://github.com/foo/bar/pull/1",
},
],
pageInfo: {hasNextPage: false, endCursor: "cursor-pulls"},
},
url: "https://github.com/foo/bar",
},
};
const {result, warnings} = translateContinuations(exampleData);
const pr = result.pullRequests[0];
// Fail with an explicit error if the pull request is missing from
// the result, before inspecting its merge commit.
if (pr == null) {
throw new Error(String(pr));
}
expect(pr.mergeCommit).toEqual({
__typename: "Commit",
id: "commit-id",
author: {
date: "2001-02-03T04:05:06",
user: null,
},
message: "where are my parents?",
oid: "456",
parents: [
/* empty! */
],
url: "https://github.com/foo/bar/commit/456",
});
expect(warnings).toEqual([
{
type: "UNKNOWN_PARENT_OID",
child: "456",
parent: "789",
},
]);
});
});
});