mirror of
https://github.com/status-im/sourcecred.git
synced 2025-02-17 06:56:36 +00:00
Create script to scrape data from GitHub repos (#36)
Summary: This tool grabs all the information that we think will be relevant for a first-pass implementation of the SourceCred project graph. It includes a tool to save the results to disk so that we avoid needlessly hitting the GitHub API over and over. Paired with @dandelionmane. Test Plan: The API doesn’t have tests, because we didn’t think that they would provide much marginal value. But here’s how you invoke it: node bin/fetchAndPrintGitHubRepo.js sourcecred sourcecred "${TOKEN}" >/tmp/out to crawl the repository `sourcecred/sourcecred` with the given API token. wchargin-branch: grab-github-data
This commit is contained in:
parent
d41872b7b7
commit
c5be6eceda
50
backend/bin/fetchAndPrintGitHubRepo.js
Normal file
50
backend/bin/fetchAndPrintGitHubRepo.js
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
/*
|
||||||
|
* Command-line utility to fetch GitHub data using the API in
|
||||||
|
* ../fetchGitHubRepo, and print it to stdout. Useful for testing or
|
||||||
|
* saving some data to disk.
|
||||||
|
*
|
||||||
|
* Usage:
|
||||||
|
*
|
||||||
|
* node bin/fetchAndPrintGitHubRepo.js REPO_OWNER REPO_NAME [TOKEN]
|
||||||
|
*
|
||||||
|
* where TOKEN is an optional GitHub authentication token, as generated
|
||||||
|
* from https://github.com/settings/tokens/new.
|
||||||
|
*/
|
||||||
|
|
||||||
|
const fetchGitHubRepo = require("../fetchGitHubRepo");
|
||||||
|
|
||||||
|
/**
 * Read the positional command-line arguments from `process.argv`.
 *
 * Expected form: REPO_OWNER REPO_NAME [TOKEN].
 *
 * @return {{repoOwner: string, repoName: string, token: (string|undefined)}}
 *   the parsed arguments; the `token` key is only present when a token
 *   was supplied on the command line
 * @throws {Error}
 *   with a usage message when fewer than two or more than three
 *   positional arguments are given, or with a token-specific message
 *   when the supplied token is not exactly 40 characters long
 */
function parseArgs() {
  const positional = process.argv.slice(2);

  // Built lazily so the usage string is only computed on failure.
  const usageError = () => {
    const invocation = process.argv.slice(0, 2).join(" ");
    return new Error(`Usage: ${invocation} REPO_OWNER REPO_NAME [TOKEN]`);
  };

  // Too few or too many positionals: both are usage errors.
  if (positional.length < 2 || positional.length > 3) {
    throw usageError();
  }

  const [repoOwner, repoName, token] = positional;
  if (positional.length === 2) {
    return {repoOwner, repoName};
  }

  // Sanity check on the token structure. Only the length is verified;
  // the characters themselves are assumed to be hexadecimal.
  if (token.length !== 40) {
    throw new Error("Token, when provided, must be a 40-character hex string");
  }
  return {repoOwner, repoName, token};
}
|
||||||
|
|
||||||
|
/**
 * Entry point: parse the command line, fetch the repository data, and
 * print it to stdout as a single line of JSON.
 *
 * Argument errors from `parseArgs` are thrown synchronously. Failures
 * of the fetch itself are reported on stderr and turn the exit code
 * non-zero, so that a caller redirecting stdout to a file can tell a
 * failed crawl from a successful one.
 */
function main() {
  const args = parseArgs();
  fetchGitHubRepo(args.repoOwner, args.repoName, args.token)
    .then((data) => {
      console.log(JSON.stringify(data));
    })
    .catch((err) => {
      // Without this handler the rejection would be left unhandled.
      console.error(err);
      process.exitCode = 1;
    });
}

main();
|
130
backend/fetchGitHubRepo.js
Normal file
130
backend/fetchGitHubRepo.js
Normal file
@ -0,0 +1,130 @@
|
|||||||
|
/*
|
||||||
|
* API to scrape data from a GitHub repo using the GitHub API. See the
|
||||||
|
* docstring of the default export for more details.
|
||||||
|
*/
|
||||||
|
|
||||||
|
const octokitFactory = require("@octokit/rest");
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Scrape data from a GitHub repo using the GitHub API.
|
||||||
|
*
|
||||||
|
* @param {String} repoOwner
|
||||||
|
* the GitHub username of the owner of the repository to be scraped
|
||||||
|
* @param {String} repoName
|
||||||
|
* the name of the repository to be scraped
|
||||||
|
* @param {String?} token
|
||||||
|
* optional authentication token to be used for the GitHub API (used
|
||||||
|
* to get around rate limits); generate a token at:
|
||||||
|
* https://github.com/settings/tokens
|
||||||
|
* @return {Promise<object>}
|
||||||
|
* a promise that resolves to a JSON object containing the data
|
||||||
|
* scraped from the repository, with the following keys and data from
|
||||||
|
* the corresponding GitHub v3 endpoints:
|
||||||
|
* - issues: `/repos/:owner/:repo/issues`
|
||||||
|
* - pullRequests: `/repos/:owner/:repo/pulls`
|
||||||
|
* - issueComments: `/repos/:owner/:repo/issues/comments`
|
||||||
|
* - pullRequestComments: `/repos/:owner/:repo/pulls/comments`
|
||||||
|
* - pullRequestReviews: `/repos/:owner/:repo/pulls/:pull/reviews`,
|
||||||
|
* but concatenated over `:pull` so that the result is an array
|
||||||
|
* of objects
|
||||||
|
*/
|
||||||
|
module.exports = function fetchGitHubRepo(repoOwner, repoName, token) {
|
||||||
|
const authOptions = token ? {type: "token", token} : null;
|
||||||
|
return new Fetcher(repoOwner, repoName, authOptions).fetchAll();
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
 * Helper that holds an API client bound to one repository and knows how
 * to fetch each kind of data that `fetchAll` returns.
 */
class Fetcher {
  /**
   * @param {String} repoOwner
   *    owner of the repository to query
   * @param {String} repoName
   *    name of the repository to query
   * @param {object?} authOptions
   *    options passed verbatim to the client's `authenticate` method;
   *    when falsy, the client is left unauthenticated
   */
  constructor(repoOwner, repoName, authOptions = undefined) {
    this._repoOwner = repoOwner;
    this._repoName = repoName;
    // Create the client with its default options. (The factory itself
    // must not be passed as the options object.)
    this._octokit = octokitFactory();
    if (authOptions) {
      this._octokit.authenticate(authOptions);
    }
  }

  /**
   * Follow pagination links until the last page, collecting the `data`
   * of every page.
   *
   * Adapted from:
   * https://www.npmjs.com/package/@octokit/rest#pagination
   *
   * @param {Promise<object>} promise
   *    promise for the first page's response
   * @return {Promise<Array>}
   *    the first page's `data` when there is only one page; otherwise a
   *    new array with the items of all pages, in page order
   */
  async paginate(promise) {
    let response = await promise;
    if (!this._octokit.hasNextPage(response)) {
      return response.data;
    }
    // Append each page to a single accumulator instead of re-copying
    // everything collected so far on every iteration.
    const data = [...response.data];
    do {
      response = await this._octokit.getNextPage(response);
      data.push(...response.data);
    } while (this._octokit.hasNextPage(response));
    return data;
  }

  /**
   * Fetch everything this class knows about, one category after
   * another.
   *
   * @return {Promise<object>}
   *    object with keys `issues`, `pullRequests`, `issueComments`,
   *    `pullRequestComments`, and `pullRequestReviews`
   */
  async fetchAll() {
    // Pull requests come first: their numbers are needed to look up
    // the reviews.
    const pullRequests = await this.fetchPullRequests();
    return {
      issues: await this.fetchIssues(),
      pullRequests,
      issueComments: await this.fetchIssueComments(),
      pullRequestComments: await this.fetchPullRequestComments(),
      pullRequestReviews: await this.fetchPullRequestReviews(pullRequests),
    };
  }

  /** Fetch all pull requests of the repository, in any state. */
  fetchPullRequests() {
    return this.paginate(
      this._octokit.pullRequests.getAll({
        owner: this._repoOwner,
        repo: this._repoName,
        state: "all",
        per_page: 100,
      })
    );
  }

  /** Fetch all issues of the repository, in any state. */
  fetchIssues() {
    return this.paginate(
      this._octokit.issues.getForRepo({
        owner: this._repoOwner,
        repo: this._repoName,
        state: "all",
        per_page: 100,
      })
    );
  }

  /** Fetch all pull request comments of the repository. */
  fetchPullRequestComments() {
    return this.paginate(
      this._octokit.pullRequests.getCommentsForRepo({
        owner: this._repoOwner,
        repo: this._repoName,
        per_page: 100,
      })
    );
  }

  /** Fetch all issue comments of the repository. */
  fetchIssueComments() {
    return this.paginate(
      this._octokit.issues.getCommentsForRepo({
        owner: this._repoOwner,
        repo: this._repoName,
        per_page: 100,
      })
    );
  }

  /**
   * Fetch all reviews of a single pull request.
   *
   * @param {number} number
   *    the pull request number
   */
  fetchIndividualPullRequestReviews(number) {
    return this.paginate(
      this._octokit.pullRequests.getReviews({
        owner: this._repoOwner,
        repo: this._repoName,
        number: number,
        per_page: 100,
      })
    );
  }

  /**
   * Fetch the reviews of every given pull request and concatenate them
   * into one flat array, in the order of `allPullRequests`.
   *
   * The per-pull-request fetches are all started at once and awaited
   * together.
   *
   * @param {Array<object>} allPullRequests
   *    pull request objects; only their `number` property is read
   * @return {Promise<Array>}
   */
  fetchPullRequestReviews(allPullRequests) {
    const reviewses = Promise.all(
      allPullRequests.map((pr) =>
        this.fetchIndividualPullRequestReviews(pr.number)
      )
    );
    return reviewses.then((xss) => [].concat(...xss));
  }
}
|
Loading…
x
Reference in New Issue
Block a user