From 418b745d7ca5c0d9521792b7c0b1133bd6e90d57 Mon Sep 17 00:00:00 2001 From: William Chargin Date: Tue, 24 Apr 2018 13:57:10 -0700 Subject: [PATCH] Load Git repositories into memory (#139) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: In this newly added module, we load the structural state of a git repository into memory. We do not load into memory the contents of any blobs, so this is not enough information to perform any analysis requiring file diffing. However, it is sufficient to develop a notion of “this file was changed in this commit”, by simply diffing the trees. Test Plan: Unit tests added; `yarn test` suffices. Reading these snapshots is pretty easy, even though they’re filled with hashes: - First, read over the commit specifications on lines 69–83 of `loadRepository.test.js`, so you know what to expect. - In the snapshot file, keep handy the time-ordered list of commit SHAs at the bottom of the file, so that you know which commit SHA is which. - To verify that the large snapshot is correct: for each commit, read the corresponding tree object and make sure that the structure is correct. - To verify the small snapshot, just check that it’s the correct subset of the large snapshot. - If you want to verify that the SHA for a blob is correct, open a terminal and run `git hash-object -t blob --stdin`; then, enter the content of the blob and press `Ctrl-D` to signal end of input. The result is the blob SHA. To run a sanity-check on a large repository: apply the following patch:
Patch to print out statistics about loaded repository ```diff diff --git a/config/paths.js b/config/paths.js index d2f25fb..8fa2023 100644 --- a/config/paths.js +++ b/config/paths.js @@ -62,5 +62,6 @@ module.exports = { fetchAndPrintGithubRepo: resolveApp( "src/plugins/github/bin/fetchAndPrintGithubRepo.js" ), + loadRepository: resolveApp("src/plugins/git/loadRepository.js"), }, }; diff --git a/src/plugins/git/loadRepository.js b/src/plugins/git/loadRepository.js index a76b66c..9380941 100644 --- a/src/plugins/git/loadRepository.js +++ b/src/plugins/git/loadRepository.js @@ -106,3 +106,7 @@ function findTrees(git: GitDriver, rootTrees: Set): Tree[] { } return result; } + +const result = loadRepository(...process.argv.slice(2)); +console.log("commits", result.commits.size); +console.log("trees", result.trees.size); ```
Then, run `yarn backend` and put the following script in `test.sh`:
Contents for `test.sh` ```shell #!/bin/bash set -eu repo="$1" ref="$2" via_node() { node bin/loadRepository.js "${repo}" "${ref}" } via_git() ( cd "${repo}" printf 'commits ' git rev-list "${ref}" | wc -l printf 'trees ' git rev-list "${ref}" | while read -r commit; do git rev-parse "${commit}^{tree}" git ls-tree -rt "${commit}" \ | grep ' tree ' \ | cut -f 1 | cut -d ' ' -f 3 done | sort | uniq | wc -l ) echo printf 'Running directly via git...\n' time a="$(via_git)" echo printf 'Running Node script...\n' time b="$(via_node)" diff -u <(cat <<<"${a}") <(cat <<<"${b}") ```
Finally, run `./test.sh /path/to/some/repo origin/master`, and verify that it exits successfully (zero diff). Here are some timing results on SourceCred and TensorBoard: - SourceCred: 0.973s via Node, 0.327s via git. - TensorBoard: 30.836s via Node, 6.895s via git. For TensorFlow, running via git takes 7m33.995s. Running via Node fails with an out-of-memory error after 39 minutes, with 10GB RAM and 4GB swap. See details below.
Full timing details, commit SHAs, and OOM error message ``` + ./test.sh /home/wchargin/git/sourcecred 01634aabcca3756b38e13aaf2f451cfbda2ad5ea Running directly via git... real 0m0.327s user 0m0.016s sys 0m0.052s Running Node script... real 0m0.973s user 0m0.268s sys 0m0.176s + ./test.sh /home/wchargin/git/tensorboard 7aa1ab9d60671056b8811b7099eec08650f2e4fd Running directly via git... real 0m6.895s user 0m0.600s sys 0m0.832s Running Node script... real 0m30.836s user 0m3.216s sys 0m10.588s + ./test.sh /home/wchargin/git/tensorflow 968addadfd4e4f5688eedc31f92a9066329ff6a7 Running directly via git... real 7m33.995s user 5m21.124s sys 1m5.476s Running Node script... FATAL ERROR: CALL_AND_RETRY_LAST Allocation failed - JavaScript heap out of memory 1: node::Abort() [node] 2: 0x121a2cc [node] 3: v8::Utils::ReportOOMFailure(char const*, bool) [node] 4: v8::internal::V8::FatalProcessOutOfMemory(char const*, bool) [node] 5: v8::internal::Factory::NewFixedArray(int, v8::internal::PretenureFlag) [node] 6: v8::internal::DeoptimizationInputData::New(v8::internal::Isolate*, int, v8::internal::PretenureFlag) [node] 7: v8::internal::compiler::CodeGenerator::PopulateDeoptimizationData(v8::internal::Handle) [node] 8: v8::internal::compiler::CodeGenerator::FinalizeCode() [node] 9: v8::internal::compiler::PipelineImpl::FinalizeCode() [node] 10: v8::internal::compiler::PipelineCompilationJob::FinalizeJobImpl() [node] 11: v8::internal::Compiler::FinalizeCompilationJob(v8::internal::CompilationJob*) [node] 12: v8::internal::OptimizingCompileDispatcher::InstallOptimizedFunctions() [node] 13: v8::internal::Runtime_TryInstallOptimizedCode(int, v8::internal::Object**, v8::internal::Isolate*) [node] 14: 0x12dc8b08463d ```
wchargin-branch: load-git-repositories # Please enter the commit message for your changes. Lines starting # with '#' will be kept; you may remove them yourself if you want to. # An empty message aborts the commit. # # Date: Mon Apr 23 23:02:14 2018 -0700 # # HEAD detached at origin/wchargin-load-git-repositories # Changes to be committed: # modified: package.json # new file: src/plugins/git/__snapshots__/loadRepository.test.js.snap # new file: src/plugins/git/loadRepository.js # new file: src/plugins/git/loadRepository.test.js # # Untracked files: # out # runtests.sh # src/plugins/artifact/editor/ArtifactSetInput.js # src/plugins/git/repository.js # test.sh # todo # --- package.json | 4 +- .../__snapshots__/loadRepository.test.js.snap | 144 +++++++++++++++++ src/plugins/git/loadRepository.js | 121 ++++++++++++++ src/plugins/git/loadRepository.test.js | 153 ++++++++++++++++++ 4 files changed, 421 insertions(+), 1 deletion(-) create mode 100644 src/plugins/git/__snapshots__/loadRepository.test.js.snap create mode 100644 src/plugins/git/loadRepository.js create mode 100644 src/plugins/git/loadRepository.test.js diff --git a/package.json b/package.json index 2306bb9..f1bfa69 100644 --- a/package.json +++ b/package.json @@ -116,6 +116,8 @@ "babel-plugin-flow-react-proptypes": "^18.0.0", "enzyme": "^3.3.0", "enzyme-adapter-react-16": "^1.1.1", - "enzyme-to-json": "^3.3.3" + "enzyme-to-json": "^3.3.3", + "mkdirp": "^0.5.1", + "tmp": "^0.0.33" } } diff --git a/src/plugins/git/__snapshots__/loadRepository.test.js.snap b/src/plugins/git/__snapshots__/loadRepository.test.js.snap new file mode 100644 index 0000000..428f574 --- /dev/null +++ b/src/plugins/git/__snapshots__/loadRepository.test.js.snap @@ -0,0 +1,144 @@ +// Jest Snapshot v1, https://goo.gl/fbAQLP + +exports[`loadRepository loads from HEAD 1`] = ` +Object { + "commits": Map { + "677b340674bde17fdaac3b5f5eef929139ef2a52" => Object { + "hash": "677b340674bde17fdaac3b5f5eef929139ef2a52", + "treeHash": 
"6152a37dba8aa54dc4bc2d59c1f01c2afeba74b0", + }, + "4be43f1cda04e51e42fec0cfe8e1e2dff116e839" => Object { + "hash": "4be43f1cda04e51e42fec0cfe8e1e2dff116e839", + "treeHash": "93642dbd1793e84a6f529a1e1b1b4f87a4f5c878", + }, + "cbb26b570d1eed3c681b8f03ff31231c1bffd6d6" => Object { + "hash": "cbb26b570d1eed3c681b8f03ff31231c1bffd6d6", + "treeHash": "f6736d27cd7eb7e35ae22a906854c700eb5cf6c1", + }, + "301749e9af8cd6e9aee3a49a64029b98a4695e34" => Object { + "hash": "301749e9af8cd6e9aee3a49a64029b98a4695e34", + "treeHash": "4d5f2603a4b63aa68b8e51facf542a62e4c1d065", + }, + }, + "trees": Map { + "6152a37dba8aa54dc4bc2d59c1f01c2afeba74b0" => Object { + "entries": Map { + "README.txt" => Object { + "hash": "f1f2514ca6d7a6a1a0511957021b1995bf9ace1c", + "name": "README.txt", + "type": "blob", + }, + "src" => Object { + "hash": "78fc9c83023386854c6bfdc5761c0e58f68e226f", + "name": "src", + "type": "tree", + }, + }, + "hash": "6152a37dba8aa54dc4bc2d59c1f01c2afeba74b0", + }, + "93642dbd1793e84a6f529a1e1b1b4f87a4f5c878" => Object { + "entries": Map { + "README.txt" => Object { + "hash": "f1f2514ca6d7a6a1a0511957021b1995bf9ace1c", + "name": "README.txt", + "type": "blob", + }, + "TODOS.txt" => Object { + "hash": "ddec7477206c30c31b81482e56b877a0b3c2638b", + "name": "TODOS.txt", + "type": "blob", + }, + "src" => Object { + "hash": "78fc9c83023386854c6bfdc5761c0e58f68e226f", + "name": "src", + "type": "tree", + }, + }, + "hash": "93642dbd1793e84a6f529a1e1b1b4f87a4f5c878", + }, + "f6736d27cd7eb7e35ae22a906854c700eb5cf6c1" => Object { + "entries": Map { + "README.txt" => Object { + "hash": "f1f2514ca6d7a6a1a0511957021b1995bf9ace1c", + "name": "README.txt", + "type": "blob", + }, + "TODOS.txt" => Object { + "hash": "ddec7477206c30c31b81482e56b877a0b3c2638b", + "name": "TODOS.txt", + "type": "blob", + }, + "src" => Object { + "hash": "7b79d579b62994faba3b69fdf8aa442586c32681", + "name": "src", + "type": "tree", + }, + }, + "hash": "f6736d27cd7eb7e35ae22a906854c700eb5cf6c1", + }, + 
"4d5f2603a4b63aa68b8e51facf542a62e4c1d065" => Object { + "entries": Map { + "README.txt" => Object { + "hash": "f1f2514ca6d7a6a1a0511957021b1995bf9ace1c", + "name": "README.txt", + "type": "blob", + }, + }, + "hash": "4d5f2603a4b63aa68b8e51facf542a62e4c1d065", + }, + "78fc9c83023386854c6bfdc5761c0e58f68e226f" => Object { + "entries": Map { + "index.py" => Object { + "hash": "674b0b476989384510304846248b3acd16206782", + "name": "index.py", + "type": "blob", + }, + "quantum_gravity.py" => Object { + "hash": "aea4f28abb23abde151b0ead4063227f8bf6c0b0", + "name": "quantum_gravity.py", + "type": "blob", + }, + }, + "hash": "78fc9c83023386854c6bfdc5761c0e58f68e226f", + }, + "7b79d579b62994faba3b69fdf8aa442586c32681" => Object { + "entries": Map { + "index.py" => Object { + "hash": "674b0b476989384510304846248b3acd16206782", + "name": "index.py", + "type": "blob", + }, + "quantum_gravity.py" => Object { + "hash": "887ad856bbc1373da146106c86cb581ad78cdafe", + "name": "quantum_gravity.py", + "type": "blob", + }, + }, + "hash": "7b79d579b62994faba3b69fdf8aa442586c32681", + }, + }, +} +`; + +exports[`loadRepository processes an old commit 1`] = ` +Object { + "commits": Set { + "cbb26b570d1eed3c681b8f03ff31231c1bffd6d6", + "301749e9af8cd6e9aee3a49a64029b98a4695e34", + }, + "trees": Set { + "f6736d27cd7eb7e35ae22a906854c700eb5cf6c1", + "4d5f2603a4b63aa68b8e51facf542a62e4c1d065", + "7b79d579b62994faba3b69fdf8aa442586c32681", + }, +} +`; + +exports[`we create a deterministic repository 1`] = ` +Array [ + "301749e9af8cd6e9aee3a49a64029b98a4695e34", + "cbb26b570d1eed3c681b8f03ff31231c1bffd6d6", + "4be43f1cda04e51e42fec0cfe8e1e2dff116e839", + "677b340674bde17fdaac3b5f5eef929139ef2a52", +] +`; diff --git a/src/plugins/git/loadRepository.js b/src/plugins/git/loadRepository.js new file mode 100644 index 0000000..2503cd6 --- /dev/null +++ b/src/plugins/git/loadRepository.js @@ -0,0 +1,121 @@ +/* + * Load a git repository into memory. 
This dumps the commit and tree + * data into a structured form. Contents of blobs are not loaded. + * + * If the repository contains file names that are not valid UTF-8 + * strings, the result is undefined. + * + * Note: git(1) is a runtime dependency of this module. + */ +// @flow + +import {execFileSync} from "child_process"; + +export type GitDriver = (args: string[], options?: ExecOptions) => string; +type ExecOptions = Object; // close enough +export function localGit(repositoryPath: string): GitDriver { + return function git(args: string[], options?: ExecOptions): string { + // Throws an Error on shell failure. + return execFileSync( + "git", + ["-C", repositoryPath, ...args], + options + ).toString(); + }; +} + +export type Repository = {| + +commits: Map, + +trees: Map, +|}; +export type Hash = string; +export type Commit = {| + +hash: Hash, + +treeHash: Hash, +|}; +export type Tree = {| + +hash: Hash, + +entries: Map, // map from name +|}; +export type TreeEntry = {| + +type: "blob" | "commit" | "tree", + +name: string, + +hash: Hash, +|}; + +/** + * Load a Git repository from disk into memory. The `rootRef` should be + * a revision reference as accepted by `git rev-parse`: "HEAD" and + * "origin/master" will be common, while a specific SHA or tag might be + * used to fix a particular state of a repository. 
+ */ +export function loadRepository( + repositoryPath: string, + rootRef: string +): Repository { + const git = localGit(repositoryPath); + const commits = findCommits(git, rootRef); + const trees = findTrees(git, new Set(commits.map((x) => x.treeHash))); + return {commits: hashMap(commits), trees: hashMap(trees)}; +} + +function hashMap(ts: $ReadOnlyArray): Map { + const result = new Map(); + ts.forEach((t) => { + result.set(t.hash, t); + }); + return result; +} + +function findCommits(git: GitDriver, rootRef: string): Commit[] { + return git(["log", "--oneline", "--pretty=%H %T", rootRef]) + .split("\n") + .filter((line) => line.length > 0) + .map((line) => { + const [hash, treeHash] = line.split(" "); + return {hash, treeHash}; + }); +} + +function findTrees(git: GitDriver, rootTrees: Set): Tree[] { + const result: Tree[] = []; + const visited: Set = new Set(); + const frontier: Set = new Set(rootTrees); + while (frontier.size > 0) { + const next = frontier.values().next(); + if (next.done) { + // Flow doesn't know that this is impossible, but it is. + throw new Error("Impossible! `frontier` had positive size."); + } + const treeHash: Hash = next.value; + visited.add(treeHash); + frontier.delete(treeHash); + const tree = loadTree(git, treeHash); + result.push(tree); + for (const entry of tree.entries.values()) { + if (entry.type === "tree" && !visited.has(entry.hash)) { + frontier.add(entry.hash); + } + } + } + return result; +} + +function loadTree(git: GitDriver, treeHash: Hash): Tree { + const entries: TreeEntry[] = git(["ls-tree", "--full-tree", "-z", treeHash]) + .split("\0") + .filter((line) => line.length > 0) + .map((line) => { + // See `git help ls-tree`, section OUTPUT FORMAT, for details. 
+ const [metadata, name] = line.split("\t"); + const [mode, type, hash] = metadata.split(" "); + if (type !== "blob" && type !== "commit" && type !== "tree") { + throw new Error( + `entry ${treeHash}[${JSON.stringify(name)}] ` + + `has unexpected type "${type}"` + ); + } + return {name, type, hash}; + }); + return {hash: treeHash, entries: new Map(entries.map((e) => [e.name, e]))}; +} diff --git a/src/plugins/git/loadRepository.test.js b/src/plugins/git/loadRepository.test.js new file mode 100644 index 0000000..abebba9 --- /dev/null +++ b/src/plugins/git/loadRepository.test.js @@ -0,0 +1,153 @@ +// @flow + +import fs from "fs"; +import mkdirp from "mkdirp"; +import path from "path"; +import tmp from "tmp"; + +import type {GitDriver} from "./loadRepository"; +import {localGit, loadRepository} from "./loadRepository"; + +const cleanups: (() => void)[] = []; +afterAll(() => { + cleanups.forEach((f) => { + f(); + }); +}); + +function mkdtemp() { + const result = tmp.dirSync(); + cleanups.push(() => result.removeCallback()); + return result.name; +} + +function deterministicCommit(git: GitDriver, message: string): void { + git( + [ + "-c", + "user.name=Test Runner", + "-c", + "user.email=nobody@example.com", + "commit", + "-m", + message, + ], + { + env: { + TZ: "UTC", + GIT_AUTHOR_DATE: "2001-02-03T04:05:06", + GIT_COMMITTER_DATE: "2002-03-04T05:06:07", + }, + } + ); +} + +function createRepository(): {path: string, commits: string[]} { + const repositoryPath = mkdtemp(); + const git = localGit(repositoryPath); + + git(["init"]); + + function makeChangesAndCommit( + message: string, + changes: {[filename: string]: ?string} + ): string /* commit SHA */ { + Object.keys(changes).forEach((filename) => { + const filepath = path.join(repositoryPath, filename); + const dirpath = path.join(repositoryPath, path.dirname(filename)); + if (changes[filename] == null) { + fs.unlinkSync(filepath); + git(["rm", filename]); + } else { + const change = changes[filename]; + 
mkdirp.sync(dirpath); + fs.writeFileSync(filepath, change); + git(["add", filename]); + } + }); + deterministicCommit(git, message); + return git(["rev-parse", "HEAD"]).trim(); + } + + const commit1 = makeChangesAndCommit("Initial commit", { + "README.txt": "Amazing physics going on...\n", + }); + const commit2 = makeChangesAndCommit("Discover gravity", { + "src/index.py": "import antigravity\n", + "src/quantum_gravity.py": 'raise NotImplementedError("TODO(physicists)")\n', + "TODOS.txt": "1. Resolve quantum gravity\n", + }); + const commit3 = makeChangesAndCommit("Solve quantum gravity", { + "src/quantum_gravity.py": + "import random\nif random.random() < 0.5:\n import antigravity\n", + }); + const commit4 = makeChangesAndCommit("Clean up TODOS", { + "TODOS.txt": null, + }); + + return { + path: repositoryPath, + commits: [commit1, commit2, commit3, commit4], + }; +} + +test("we create a deterministic repository", () => { + expect(createRepository().commits).toMatchSnapshot(); +}); + +describe("loadRepository", () => { + it("loads from HEAD", () => { + const repository = createRepository(); + expect(loadRepository(repository.path, "HEAD")).toMatchSnapshot(); + }); + + it("processes an old commit", () => { + const repository = createRepository(); + const whole = loadRepository(repository.path, "HEAD"); + const part = loadRepository(repository.path, repository.commits[1]); + + // Check that `part` is a subset of `whole`... + for (const hash of part.commits.keys()) { + expect(part.commits.get(hash)).toEqual(whole.commits.get(hash)); + } + for (const hash of part.trees.keys()) { + expect(part.trees.get(hash)).toEqual(whole.trees.get(hash)); + } + + // ...and that it's the right subset. 
+ expect({ + commits: new Set(part.commits.keys()), + trees: new Set(part.trees.keys()), + }).toMatchSnapshot(); + }); + + it("works with submodules", () => { + const repositoryPath = mkdtemp(); + const git = localGit(repositoryPath); + + const subproject = createRepository(); + + git(["init"]); + git(["submodule", "--quiet", "add", subproject.path, "physics"]); + deterministicCommit(git, "Initial commit"); + + const head = git(["rev-parse", "HEAD"]).trim(); + + const repository = loadRepository(repositoryPath, "HEAD"); + const commit = repository.commits.get(head); + expect(commit).toEqual(expect.anything()); + if (commit == null) { + throw new Error("Unreachable"); + } + const tree = repository.trees.get(commit.treeHash); + expect(tree).toEqual(expect.anything()); + if (tree == null) { + throw new Error("Unreachable"); + } + expect(tree.entries.get("physics")).toEqual({ + type: "commit", + name: "physics", + hash: subproject.commits[subproject.commits.length - 1], + }); + }); +});