From c5be6eceda9d695e3cb3fc47f76a9bc85ed1995b Mon Sep 17 00:00:00 2001 From: William Chargin Date: Mon, 26 Feb 2018 17:11:57 -0800 Subject: [PATCH] Create script to scrape data from GitHub repos (#36) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: This tool grabs all the information that we think will be relevant for a first-pass implementation of the SourceCred project graph. It includes a tool to save the results to disk so that we avoid needlessly hitting the GitHub API over and over. Paired with @dandelionmane. Test Plan: The API doesn’t have tests, because we didn’t think that they would provide much marginal value. But here’s how you invoke it: node bin/fetchAndPrintGitHubRepo.js sourcecred sourcecred "${TOKEN}" >/tmp/out to crawl the repository `sourcecred/sourcecred` with the given API token. wchargin-branch: grab-github-data --- backend/bin/fetchAndPrintGitHubRepo.js | 50 ++++++++++ backend/fetchGitHubRepo.js | 130 +++++++++++++++++++++++++ 2 files changed, 180 insertions(+) create mode 100644 backend/bin/fetchAndPrintGitHubRepo.js create mode 100644 backend/fetchGitHubRepo.js diff --git a/backend/bin/fetchAndPrintGitHubRepo.js b/backend/bin/fetchAndPrintGitHubRepo.js new file mode 100644 index 0000000..2033844 --- /dev/null +++ b/backend/bin/fetchAndPrintGitHubRepo.js @@ -0,0 +1,50 @@ +/* + * Command-line utility to fetch GitHub data using the API in + * ../fetchGitHubRepo, and print it to stdout. Useful for testing or + * saving some data to disk. + * + * Usage: + * + * node bin/fetchAndPrintGitHubRepo.js REPO_OWNER REPO_NAME [TOKEN] + * + * where TOKEN is an optional GitHub authentication token, as generated + * from https://github.com/settings/tokens/new. 
/*
 * backend/bin/fetchAndPrintGitHubRepo.js (continued): the usage notes for
 * this script are in the header comment that immediately precedes this
 * point in the file.
 */

const fetchGitHubRepo = require("../fetchGitHubRepo");

/**
 * Parse the command-line arguments of this script.
 *
 * Reads `process.argv` and expects `REPO_OWNER REPO_NAME [TOKEN]` after
 * the interpreter and script paths.
 *
 * @return {Object}
 *   an object with the string keys `repoOwner` and `repoName`, plus
 *   `token` when (and only when) a third argument was supplied
 * @throws {Error}
 *   a usage error when fewer than two or more than three arguments are
 *   given, or an error when the token is not exactly 40 characters long
 */
function parseArgs() {
  const argv = process.argv.slice(2);
  const fail = () => {
    const invocation = process.argv.slice(0, 2).join(" ");
    throw new Error(`Usage: ${invocation} REPO_OWNER REPO_NAME [TOKEN]`);
  };
  if (argv.length < 2 || argv.length > 3) {
    fail();
  }
  const [repoOwner, repoName, ...rest] = argv;
  const result = {repoOwner, repoName};
  if (rest.length === 1) {
    const token = rest[0];
    // Sanity check on the token structure. Only the length is verified;
    // that the characters are hexadecimal is assumed, not checked.
    if (token.length !== 40) {
      throw new Error(
        "Token, when provided, must be a 40-character hex string"
      );
    }
    result.token = token;
  }
  return result;
}

/**
 * Entry point: parse the arguments, fetch the repository data, and print
 * it to stdout as a single JSON document.
 *
 * If the fetch fails, the error is written to stderr and the process exit
 * code is set to 1, so that a caller redirecting stdout to a file can tell
 * that the (empty) output is not a valid result.
 */
function main() {
  const args = parseArgs();
  fetchGitHubRepo(args.repoOwner, args.repoName, args.token)
    .then((data) => {
      console.log(JSON.stringify(data));
    })
    .catch((err) => {
      console.error(err);
      // Set the exit code rather than calling process.exit() so that any
      // pending writes to stdout/stderr are flushed before the exit.
      process.exitCode = 1;
    });
}

main();

// diff --git a/backend/fetchGitHubRepo.js b/backend/fetchGitHubRepo.js
// new file mode 100644
// index 0000000..80e55c6
// --- /dev/null
// +++ b/backend/fetchGitHubRepo.js
// @@ -0,0 +1,130 @@

/*
 * API to scrape data from a GitHub repo using the GitHub API. See the
 * docstring of the default export for more details.
 */

const octokitFactory = require("@octokit/rest");

/**
 * Scrape data from a GitHub repo using the GitHub API.
 */
+ * + * @param {String} repoOwner + * the GitHub username of the owner of the repository to be scraped + * @param {String} repoName + * the name of the repository to be scraped + * @param {String?} token + * optional authentication token to be used for the GitHub API (used + * to get around rate limits); generate a token at: + * https://github.com/settings/tokens + * @return {Promise} + * a promise that resolves to a JSON object containing the data + * scraped from the repository, with the following keys and data from + * the corresponding GitHub v3 endpoints: + * - issues: `/repos/:owner/:repo/issues` + * - pullRequests: `/repos/:owner/:repo/pulls` + * - issueComments: `/repos/:owner/:repo/issues/comments` + * - pullRequestComments: `/repos/:owner/:repo/pulls/comments` + * - pullRequestReviews: `/repos/:owner/:repo/pulls/:pull/reviews`, + * but concatenated over `:pull` so that the result is an array + * of objects + */ +module.exports = function fetchGitHubRepo(repoOwner, repoName, token) { + const authOptions = token ? 
{type: "token", token} : null; + return new Fetcher(repoOwner, repoName, authOptions).fetchAll(); +}; + +class Fetcher { + constructor(repoOwner, repoName, authOptions = undefined) { + this._repoOwner = repoOwner; + this._repoName = repoName; + this._octokit = octokitFactory(octokitFactory); + if (authOptions) { + this._octokit.authenticate(authOptions); + } + } + + // Adapted from: + // https://www.npmjs.com/package/@octokit/rest#pagination + async paginate(promise) { + let response = await promise; + let {data} = response; + while (this._octokit.hasNextPage(response)) { + response = await this._octokit.getNextPage(response); + data = [...data, ...response.data]; + } + return data; + } + + async fetchAll() { + const pullRequests = await this.fetchPullRequests(); + return { + issues: await this.fetchIssues(), + pullRequests, + issueComments: await this.fetchIssueComments(), + pullRequestComments: await this.fetchPullRequestComments(), + pullRequestReviews: await this.fetchPullRequestReviews(pullRequests), + }; + } + + fetchPullRequests() { + return this.paginate( + this._octokit.pullRequests.getAll({ + owner: this._repoOwner, + repo: this._repoName, + state: "all", + per_page: 100, + }) + ); + } + + fetchIssues() { + return this.paginate( + this._octokit.issues.getForRepo({ + owner: this._repoOwner, + repo: this._repoName, + state: "all", + per_page: 100, + }) + ); + } + + fetchPullRequestComments() { + return this.paginate( + this._octokit.pullRequests.getCommentsForRepo({ + owner: this._repoOwner, + repo: this._repoName, + per_page: 100, + }) + ); + } + + fetchIssueComments() { + return this.paginate( + this._octokit.issues.getCommentsForRepo({ + owner: this._repoOwner, + repo: this._repoName, + per_page: 100, + }) + ); + } + + fetchIndividualPullRequestReviews(number) { + return this.paginate( + this._octokit.pullRequests.getReviews({ + owner: this._repoOwner, + repo: this._repoName, + number: number, + per_page: 100, + }) + ); + } + + 
fetchPullRequestReviews(allPullRequests) { + const reviewses = Promise.all( + allPullRequests.map((pr) => + this.fetchIndividualPullRequestReviews(pr.number) + ) + ); + return reviewses.then((xss) => [].concat(...xss)); + } +}