Create script to scrape data from GitHub repos (#36)
Summary: This tool grabs all the information that we think will be relevant for a first-pass implementation of the SourceCred project graph. It includes a tool to save the results to disk so that we avoid needlessly hitting the GitHub API over and over. Paired with @dandelionmane. Test Plan: The API doesn’t have tests, because we didn’t think that they would provide much marginal value. But here’s how you invoke it: node bin/fetchAndPrintGitHubRepo.js sourcecred sourcecred "${TOKEN}" >/tmp/out to crawl the repository `sourcecred/sourcecred` with the given API token. wchargin-branch: grab-github-data
This commit is contained in:
parent
d41872b7b7
commit
c5be6eceda
|
@ -0,0 +1,50 @@
|
|||
/*
|
||||
* Command-line utility to fetch GitHub data using the API in
|
||||
* ../fetchGitHubRepo, and print it to stdout. Useful for testing or
|
||||
* saving some data to disk.
|
||||
*
|
||||
* Usage:
|
||||
*
|
||||
* node bin/fetchAndPrintGitHubRepo.js REPO_OWNER REPO_NAME [TOKEN]
|
||||
*
|
||||
* where TOKEN is an optional GitHub authentication token, as generated
|
||||
* from https://github.com/settings/tokens/new.
|
||||
*/
|
||||
|
||||
const fetchGitHubRepo = require("../fetchGitHubRepo");
|
||||
|
||||
/**
 * Parse this script's command-line arguments from `process.argv`.
 *
 * Everything after the interpreter and script paths is expected to be
 * `REPO_OWNER REPO_NAME [TOKEN]`.
 *
 * @return {{repoOwner: string, repoName: string, token: (string|undefined)}}
 *   the parsed arguments; the `token` key is present only when a token
 *   was supplied on the command line
 * @throws {Error}
 *   with a usage message if fewer than two or more than three
 *   arguments are given, or with a token-specific message if the token
 *   is not exactly 40 characters long
 */
function parseArgs() {
  const exitWithUsage = () => {
    const invocation = process.argv.slice(0, 2).join(" ");
    throw new Error(`Usage: ${invocation} REPO_OWNER REPO_NAME [TOKEN]`);
  };

  const positional = process.argv.slice(2);
  if (positional.length < 2 || positional.length > 3) {
    exitWithUsage();
  }

  const repoOwner = positional[0];
  const repoName = positional[1];
  if (positional.length === 2) {
    return {repoOwner, repoName};
  }

  const token = positional[2];
  // Only the length is verified; the characters themselves are assumed
  // to be hexadecimal and are not inspected.
  if (token.length !== 40) {
    throw new Error(
      "Token, when provided, must be a 40-character hex string"
    );
  }
  return {repoOwner, repoName, token};
}
|
||||
|
||||
/**
 * Entry point: parse the command-line arguments, fetch the repository
 * data, and print it to stdout as a single JSON document.
 *
 * If the fetch fails, the error is written to stderr and the process
 * exit code is set to 1, so that callers redirecting stdout to a file
 * can tell that the output is not usable.
 */
function main() {
  const args = parseArgs();
  fetchGitHubRepo(args.repoOwner, args.repoName, args.token)
    .then((data) => {
      console.log(JSON.stringify(data));
    })
    .catch((error) => {
      // Without this handler a rejected fetch is never reported by this
      // script itself and the exit status may not reflect the failure.
      console.error(error);
      process.exitCode = 1;
    });
}
|
||||
|
||||
main();
|
|
@ -0,0 +1,130 @@
|
|||
/*
|
||||
* API to scrape data from a GitHub repo using the GitHub API. See the
|
||||
* docstring of the default export for more details.
|
||||
*/
|
||||
|
||||
const octokitFactory = require("@octokit/rest");
|
||||
|
||||
/**
|
||||
* Scrape data from a GitHub repo using the GitHub API.
|
||||
*
|
||||
* @param {String} repoOwner
|
||||
* the GitHub username of the owner of the repository to be scraped
|
||||
* @param {String} repoName
|
||||
* the name of the repository to be scraped
|
||||
* @param {String?} token
|
||||
* optional authentication token to be used for the GitHub API (used
|
||||
* to get around rate limits); generate a token at:
|
||||
* https://github.com/settings/tokens
|
||||
* @return {Promise<object>}
|
||||
* a promise that resolves to a JSON object containing the data
|
||||
* scraped from the repository, with the following keys and data from
|
||||
* the corresponding GitHub v3 endpoints:
|
||||
* - issues: `/repos/:owner/:repo/issues`
|
||||
* - pullRequests: `/repos/:owner/:repo/pulls`
|
||||
* - issueComments: `/repos/:owner/:repo/issues/comments`
|
||||
* - pullRequestComments: `/repos/:owner/:repo/pulls/comments`
|
||||
* - pullRequestReviews: `/repos/:owner/:repo/pulls/:pull/reviews`,
|
||||
* but concatenated over `:pull` so that the result is an array
|
||||
* of objects
|
||||
*/
|
||||
module.exports = function fetchGitHubRepo(repoOwner, repoName, token) {
|
||||
const authOptions = token ? {type: "token", token} : null;
|
||||
return new Fetcher(repoOwner, repoName, authOptions).fetchAll();
|
||||
};
|
||||
|
||||
/**
 * Helper that holds an Octokit client bound to a single repository and
 * knows how to fetch each kind of data that `fetchAll` returns.
 */
class Fetcher {
  /**
   * @param {String} repoOwner
   *   the GitHub username of the owner of the repository
   * @param {String} repoName
   *   the name of the repository
   * @param {object?} authOptions
   *   options passed verbatim to the client's `authenticate` method;
   *   when falsy, the client is left unauthenticated
   */
  constructor(repoOwner, repoName, authOptions = undefined) {
    this._repoOwner = repoOwner;
    this._repoName = repoName;
    // Create the client with default options. (The factory must not be
    // handed itself as its options argument.)
    this._octokit = octokitFactory();
    if (authOptions) {
      this._octokit.authenticate(authOptions);
    }
  }

  /**
   * Follow a paginated response to its end and collect every page.
   *
   * Adapted from:
   * https://www.npmjs.com/package/@octokit/rest#pagination
   *
   * @param {Promise<object>} promise
   *   a promise for the first page of results, as returned by an
   *   Octokit endpoint method
   * @return {Promise<Array>}
   *   the `data` entries of all pages, concatenated in page order
   */
  async paginate(promise) {
    let response = await promise;
    // Copy the first page so that appending later pages never mutates
    // the response object, and so that each page is copied only once.
    const items = [...response.data];
    while (this._octokit.hasNextPage(response)) {
      response = await this._octokit.getNextPage(response);
      items.push(...response.data);
    }
    return items;
  }

  /**
   * Fetch everything this class knows how to fetch.
   *
   * Pull requests are fetched first because their numbers are needed
   * to fetch the reviews; the remaining requests are issued one after
   * another.
   *
   * @return {Promise<object>}
   *   an object with keys `issues`, `pullRequests`, `issueComments`,
   *   `pullRequestComments`, and `pullRequestReviews`
   */
  async fetchAll() {
    const pullRequests = await this.fetchPullRequests();
    return {
      issues: await this.fetchIssues(),
      pullRequests,
      issueComments: await this.fetchIssueComments(),
      pullRequestComments: await this.fetchPullRequestComments(),
      pullRequestReviews: await this.fetchPullRequestReviews(pullRequests),
    };
  }

  /**
   * @return {Promise<Array>} all pull requests, in any state
   */
  fetchPullRequests() {
    return this.paginate(
      this._octokit.pullRequests.getAll({
        owner: this._repoOwner,
        repo: this._repoName,
        state: "all",
        per_page: 100,
      })
    );
  }

  /**
   * @return {Promise<Array>} all issues, in any state
   */
  fetchIssues() {
    return this.paginate(
      this._octokit.issues.getForRepo({
        owner: this._repoOwner,
        repo: this._repoName,
        state: "all",
        per_page: 100,
      })
    );
  }

  /**
   * @return {Promise<Array>} all pull request comments in the repository
   */
  fetchPullRequestComments() {
    return this.paginate(
      this._octokit.pullRequests.getCommentsForRepo({
        owner: this._repoOwner,
        repo: this._repoName,
        per_page: 100,
      })
    );
  }

  /**
   * @return {Promise<Array>} all issue comments in the repository
   */
  fetchIssueComments() {
    return this.paginate(
      this._octokit.issues.getCommentsForRepo({
        owner: this._repoOwner,
        repo: this._repoName,
        per_page: 100,
      })
    );
  }

  /**
   * @param {number} number
   *   the number of the pull request whose reviews should be fetched
   * @return {Promise<Array>} all reviews of that pull request
   */
  fetchIndividualPullRequestReviews(number) {
    return this.paginate(
      this._octokit.pullRequests.getReviews({
        owner: this._repoOwner,
        repo: this._repoName,
        number,
        per_page: 100,
      })
    );
  }

  /**
   * Fetch the reviews of every given pull request and flatten them.
   *
   * One request chain per pull request is started, all at once.
   *
   * @param {Array<object>} allPullRequests
   *   pull request objects; only the `number` property of each is read
   * @return {Promise<Array>}
   *   a single array of reviews, ordered by the position of their pull
   *   request in `allPullRequests`
   */
  fetchPullRequestReviews(allPullRequests) {
    const reviewsPerPullRequest = Promise.all(
      allPullRequests.map((pr) =>
        this.fetchIndividualPullRequestReviews(pr.number)
      )
    );
    return reviewsPerPullRequest.then((reviewLists) =>
      [].concat(...reviewLists)
    );
  }
}
|
Loading…
Reference in New Issue