Create script to scrape data from GitHub repos (#36)

Summary:
This tool grabs all the information that we think will be relevant for a
first-pass implementation of the SourceCred project graph. It includes a
tool to save the results to disk so that we avoid needlessly hitting the
GitHub API over and over.

Paired with @dandelionmane.

Test Plan:
The API doesn’t have tests, because we didn’t think that they would
provide much marginal value. But here’s how you invoke it:

    node bin/fetchAndPrintGitHubRepo.js sourcecred sourcecred "${TOKEN}" >/tmp/out

to crawl the repository `sourcecred/sourcecred` with the given API
token.

wchargin-branch: grab-github-data
This commit is contained in:
William Chargin 2018-02-26 17:11:57 -08:00 committed by GitHub
parent d41872b7b7
commit c5be6eceda
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 180 additions and 0 deletions

View File

@ -0,0 +1,50 @@
/*
* Command-line utility to fetch GitHub data using the API in
* ../fetchGitHubRepo, and print it to stdout. Useful for testing or
* saving some data to disk.
*
* Usage:
*
* node bin/fetchAndPrintGitHubRepo.js REPO_OWNER REPO_NAME [TOKEN]
*
* where TOKEN is an optional GitHub authentication token, as generated
* from https://github.com/settings/tokens/new.
*/
const fetchGitHubRepo = require("../fetchGitHubRepo");
/**
 * Read the command-line arguments of the current process.
 *
 * Expects two or three positional arguments after the interpreter and
 * script paths: REPO_OWNER, REPO_NAME, and optionally TOKEN.
 *
 * @return {{repoOwner: String, repoName: String, token: (String|undefined)}}
 *   the parsed arguments; the `token` key is present only when a token
 *   was supplied on the command line
 * @throws {Error}
 *   with a usage message when the argument count is wrong, or with a
 *   token-specific message when the token is not 40 characters long
 */
function parseArgs() {
  const usageError = () => {
    const invocation = process.argv.slice(0, 2).join(" ");
    return new Error(`Usage: ${invocation} REPO_OWNER REPO_NAME [TOKEN]`);
  };

  const positional = process.argv.slice(2);
  if (positional.length < 2 || positional.length > 3) {
    throw usageError();
  }

  const [repoOwner, repoName, token] = positional;
  if (positional.length === 2) {
    return {repoOwner, repoName};
  }

  // Only the length is verified; the contents of the token are taken on
  // trust.
  if (token.length !== 40) {
    throw new Error("Token, when provided, must be a 40-character hex string");
  }
  return {repoOwner, repoName, token};
}
/**
 * Entry point: parse the command line, fetch the requested repository's
 * data, and print it to stdout as a single line of JSON.
 *
 * Argument errors are thrown synchronously by `parseArgs`. Failures of
 * the fetch itself are reported on stderr and turn the process exit
 * status into a failure, instead of being left as an unhandled promise
 * rejection.
 */
function main() {
  const args = parseArgs();
  fetchGitHubRepo(args.repoOwner, args.repoName, args.token)
    .then((data) => {
      console.log(JSON.stringify(data));
    })
    .catch((err) => {
      console.error(err);
      // Set the exit code rather than calling process.exit() so that
      // pending output is still flushed.
      process.exitCode = 1;
    });
}
main();

130
backend/fetchGitHubRepo.js Normal file
View File

@ -0,0 +1,130 @@
/*
* API to scrape data from a GitHub repo using the GitHub API. See the
* docstring of the default export for more details.
*/
const octokitFactory = require("@octokit/rest");
/**
* Scrape data from a GitHub repo using the GitHub API.
*
* @param {String} repoOwner
* the GitHub username of the owner of the repository to be scraped
* @param {String} repoName
* the name of the repository to be scraped
* @param {String?} token
* optional authentication token to be used for the GitHub API (used
* to get around rate limits); generate a token at:
* https://github.com/settings/tokens
* @return {Promise<object>}
* a promise that resolves to a JSON object containing the data
* scraped from the repository, with the following keys and data from
* the corresponding GitHub v3 endpoints:
* - issues: `/repos/:owner/:repo/issues`
* - pullRequests: `/repos/:owner/:repo/pulls`
* - issueComments: `/repos/:owner/:repo/issues/comments`
* - pullRequestComments: `/repos/:owner/:repo/pulls/comments`
* - pullRequestReviews: `/repos/:owner/:repo/pulls/:pull/reviews`,
* but concatenated over `:pull` so that the result is an array
* of objects
*/
module.exports = function fetchGitHubRepo(repoOwner, repoName, token) {
const authOptions = token ? {type: "token", token} : null;
return new Fetcher(repoOwner, repoName, authOptions).fetchAll();
};
/**
 * Scrapes a single GitHub repository through an `@octokit/rest` client.
 *
 * Each `fetch*` method returns a promise for the complete (all pages)
 * array of results from one endpoint; `fetchAll` combines them into one
 * object.
 */
class Fetcher {
  /**
   * @param {String} repoOwner
   *   owner of the repository to scrape
   * @param {String} repoName
   *   name of the repository to scrape
   * @param {Object?} authOptions
   *   when truthy, handed to the client's `authenticate` method;
   *   otherwise the client is left unauthenticated
   */
  constructor(repoOwner, repoName, authOptions = undefined) {
    this._repoOwner = repoOwner;
    this._repoName = repoName;
    // Create the client with its default options. (Previously the
    // factory function itself was passed in as the options argument.)
    this._octokit = octokitFactory();
    if (authOptions) {
      this._octokit.authenticate(authOptions);
    }
  }

  /**
   * Build the request parameters shared by every endpoint call: the
   * repository coordinates and a page size of 100.
   *
   * @param {Object} extra
   *   endpoint-specific parameters to merge in
   * @return {Object} the merged request parameters
   */
  _requestOptions(extra = {}) {
    return Object.assign(
      {owner: this._repoOwner, repo: this._repoName, per_page: 100},
      extra
    );
  }

  /**
   * Follow pagination links until every page has been retrieved.
   *
   * Adapted from:
   * https://www.npmjs.com/package/@octokit/rest#pagination
   *
   * @param {Promise<object>} promise
   *   promise for the first page's response
   * @return {Promise<Array>}
   *   the `data` of every page, concatenated in page order
   */
  async paginate(promise) {
    let response = await promise;
    // Collect the pages and join them once at the end, rather than
    // re-copying the whole accumulated array for every page.
    const pages = [response.data];
    while (this._octokit.hasNextPage(response)) {
      response = await this._octokit.getNextPage(response);
      pages.push(response.data);
    }
    return pages.length === 1 ? pages[0] : [].concat(...pages);
  }

  /**
   * Fetch everything this scraper knows about.
   *
   * Pull requests are fetched first because their numbers are needed to
   * request the per-pull-request reviews. The endpoint groups are
   * requested one after another.
   *
   * @return {Promise<object>}
   *   object with keys `issues`, `pullRequests`, `issueComments`,
   *   `pullRequestComments`, and `pullRequestReviews`
   */
  async fetchAll() {
    const pullRequests = await this.fetchPullRequests();
    return {
      issues: await this.fetchIssues(),
      pullRequests,
      issueComments: await this.fetchIssueComments(),
      pullRequestComments: await this.fetchPullRequestComments(),
      pullRequestReviews: await this.fetchPullRequestReviews(pullRequests),
    };
  }

  /** @return {Promise<Array>} all pull requests, in any state */
  fetchPullRequests() {
    return this.paginate(
      this._octokit.pullRequests.getAll(this._requestOptions({state: "all"}))
    );
  }

  /** @return {Promise<Array>} all issues, in any state */
  fetchIssues() {
    return this.paginate(
      this._octokit.issues.getForRepo(this._requestOptions({state: "all"}))
    );
  }

  /** @return {Promise<Array>} all pull request comments in the repository */
  fetchPullRequestComments() {
    return this.paginate(
      this._octokit.pullRequests.getCommentsForRepo(this._requestOptions())
    );
  }

  /** @return {Promise<Array>} all issue comments in the repository */
  fetchIssueComments() {
    return this.paginate(
      this._octokit.issues.getCommentsForRepo(this._requestOptions())
    );
  }

  /**
   * @param {Number} number
   *   the `number` of the pull request whose reviews are wanted
   * @return {Promise<Array>} all reviews of that pull request
   */
  fetchIndividualPullRequestReviews(number) {
    return this.paginate(
      this._octokit.pullRequests.getReviews(this._requestOptions({number}))
    );
  }

  /**
   * Fetch the reviews of every given pull request. The per-pull-request
   * requests are all started up front and awaited together.
   *
   * @param {Array<object>} allPullRequests
   *   pull request objects; only their `number` property is read
   * @return {Promise<Array>}
   *   one flat array of reviews, grouped in the order of the input
   */
  async fetchPullRequestReviews(allPullRequests) {
    const reviewsPerPullRequest = await Promise.all(
      allPullRequests.map((pr) =>
        this.fetchIndividualPullRequestReviews(pr.number)
      )
    );
    return [].concat(...reviewsPerPullRequest);
  }
}