From ad2470e5c67b9e237f75a6b48b636db4bf13913c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dandelion=20Man=C3=A9?= Date: Thu, 30 May 2019 17:15:15 +0300 Subject: [PATCH] Aggregate timestamp information on sourcecred load (#1162) This modifies `sourcecred load` so that it saves timestamp information for all of the loaded plugins in a single aggregated map. This is quite convenient, as it saves consumers of timestamp information from needing to worry about the (rather hacky) implementation whereby the data is fed from each adapter. Instead, consumers can just load the timestamp map. This will also make it much easier to use timestamp info in the research codebase. Test plan: The timestampMap module has testing around generating the map from the adapter and nodes, writing it, and reading it. I haven't added any testing to the `load` CLI command. I think it would be redundant as the updated snapshot test reveals that the map is getting serialized properly. Tests pass, and I have inspected the snapshot --- .../sourcecred/example-github/timestamps.json | 1 + src/analysis/temporal/timestampMap.js | 57 ++++++++++++ src/analysis/temporal/timestampMap.test.js | 87 +++++++++++++++++++ src/cli/load.js | 31 +++++++ src/cli/pagerank.js | 2 + 5 files changed, 178 insertions(+) create mode 100644 sharness/__snapshots__/example-github-load/data/sourcecred/example-github/timestamps.json create mode 100644 src/analysis/temporal/timestampMap.js create mode 100644 src/analysis/temporal/timestampMap.test.js diff --git a/sharness/__snapshots__/example-github-load/data/sourcecred/example-github/timestamps.json b/sharness/__snapshots__/example-github-load/data/sourcecred/example-github/timestamps.json new file mode 100644 index 0000000..8d01ced --- /dev/null +++ b/sharness/__snapshots__/example-github-load/data/sourcecred/example-github/timestamps.json @@ -0,0 +1 @@ 
+{"N\u0000sourcecred\u0000git\u0000COMMIT\u00000a223346b4e6dec0127b1e6aa892c4ee0424b66a\u0000":1519807427000,"N\u0000sourcecred\u0000git\u0000COMMIT\u00006bd1b4c0b719c22c688a74863be07a699b7b9b34\u0000":1536806901000,"N\u0000sourcecred\u0000git\u0000COMMIT\u00006d5b3aa31ebb68a06ceb46bbd6cf49b6ccd6f5e6\u0000":1519878354000,"N\u0000sourcecred\u0000git\u0000COMMIT\u0000c430bd74455105f77215ece51945094ceeee6c86\u0000":1536788634000,"N\u0000sourcecred\u0000git\u0000COMMIT\u0000ec91adb718a6045b492303f00d8e8beb957dc780\u0000":1519807271000,"N\u0000sourcecred\u0000git\u0000COMMIT\u0000ecc889dc94cf6da17ae6eab5bb7b7155f577519d\u0000":1519807329000,"N\u0000sourcecred\u0000github\u0000COMMENT\u0000ISSUE\u0000sourcecred\u0000example-github\u000011\u0000420811872\u0000":1536789545000,"N\u0000sourcecred\u0000github\u0000COMMENT\u0000ISSUE\u0000sourcecred\u0000example-github\u000011\u0000420813013\u0000":1536789813000,"N\u0000sourcecred\u0000github\u0000COMMENT\u0000ISSUE\u0000sourcecred\u0000example-github\u000011\u0000420813206\u0000":1536789858000,"N\u0000sourcecred\u0000github\u0000COMMENT\u0000ISSUE\u0000sourcecred\u0000example-github\u000011\u0000420813621\u0000":1536789965000,"N\u0000sourcecred\u0000github\u0000COMMENT\u0000ISSUE\u0000sourcecred\u0000example-github\u00002\u0000373768703\u0000":1521217693000,"N\u0000sourcecred\u0000github\u0000COMMENT\u0000ISSUE\u0000sourcecred\u0000example-github\u00002\u0000373768850\u0000":1521217725000,"N\u0000sourcecred\u0000github\u0000COMMENT\u0000ISSUE\u0000sourcecred\u0000example-github\u00002\u0000385576185\u0000":1525137909000,"N\u0000sourcecred\u0000github\u0000COMMENT\u0000ISSUE\u0000sourcecred\u0000example-github\u00002\u0000385576220\u0000":1525137925000,"N\u0000sourcecred\u0000github\u0000COMMENT\u0000ISSUE\u0000sourcecred\u0000example-github\u00002\u0000385576248\u0000":1525137939000,"N\u0000sourcecred\u0000github\u0000COMMENT\u0000ISSUE\u0000sourcecred\u0000example-github\u00002\u0000385576273\u0000":1525137951000,"N\u0000sour
cecred\u0000github\u0000COMMENT\u0000ISSUE\u0000sourcecred\u0000example-github\u00002\u0000385576920\u0000":1525138231000,"N\u0000sourcecred\u0000github\u0000COMMENT\u0000ISSUE\u0000sourcecred\u0000example-github\u00002\u0000385576936\u0000":1525138238000,"N\u0000sourcecred\u0000github\u0000COMMENT\u0000ISSUE\u0000sourcecred\u0000example-github\u00006\u0000373768442\u0000":1521217642000,"N\u0000sourcecred\u0000github\u0000COMMENT\u0000ISSUE\u0000sourcecred\u0000example-github\u00006\u0000373768538\u0000":1521217661000,"N\u0000sourcecred\u0000github\u0000COMMENT\u0000ISSUE\u0000sourcecred\u0000example-github\u00006\u0000385223316\u0000":1524973307000,"N\u0000sourcecred\u0000github\u0000COMMENT\u0000ISSUE\u0000sourcecred\u0000example-github\u00006\u0000417104047\u0000":1535576390000,"N\u0000sourcecred\u0000github\u0000COMMENT\u0000PULL\u0000sourcecred\u0000example-github\u00003\u0000369162222\u0000":1519807420000,"N\u0000sourcecred\u0000github\u0000COMMENT\u0000PULL\u0000sourcecred\u0000example-github\u00005\u0000396430464\u0000":1528764380000,"N\u0000sourcecred\u0000github\u0000COMMENT\u0000REVIEW\u0000sourcecred\u0000example-github\u00005\u0000100313899\u0000171460198\u0000":1519878210000,"N\u0000sourcecred\u0000github\u0000ISSUE\u0000sourcecred\u0000example-github\u00001\u0000":1519807088000,"N\u0000sourcecred\u0000github\u0000ISSUE\u0000sourcecred\u0000example-github\u000010\u0000":1530297021000,"N\u0000sourcecred\u0000github\u0000ISSUE\u0000sourcecred\u0000example-github\u000011\u0000":1536789479000,"N\u0000sourcecred\u0000github\u0000ISSUE\u0000sourcecred\u0000example-github\u000012\u0000":1536878086000,"N\u0000sourcecred\u0000github\u0000ISSUE\u0000sourcecred\u0000example-github\u000013\u0000":1536878137000,"N\u0000sourcecred\u0000github\u0000ISSUE\u0000sourcecred\u0000example-github\u00002\u0000":1519807129000,"N\u0000sourcecred\u0000github\u0000ISSUE\u0000sourcecred\u0000example-github\u00004\u0000":1519807454000,"N\u0000sourcecred\u0000github\u0000ISSUE\u000
0sourcecred\u0000example-github\u00006\u0000":1521217624000,"N\u0000sourcecred\u0000github\u0000ISSUE\u0000sourcecred\u0000example-github\u00007\u0000":1521569949000,"N\u0000sourcecred\u0000github\u0000ISSUE\u0000sourcecred\u0000example-github\u00008\u0000":1521570243000,"N\u0000sourcecred\u0000github\u0000PULL\u0000sourcecred\u0000example-github\u00003\u0000":1519807399000,"N\u0000sourcecred\u0000github\u0000PULL\u0000sourcecred\u0000example-github\u00005\u0000":1519807636000,"N\u0000sourcecred\u0000github\u0000PULL\u0000sourcecred\u0000example-github\u00009\u0000":1525373595000,"N\u0000sourcecred\u0000github\u0000REPO\u0000sourcecred\u0000example-github\u0000":null,"N\u0000sourcecred\u0000github\u0000REVIEW\u0000sourcecred\u0000example-github\u00005\u0000100313899\u0000":1519878210000,"N\u0000sourcecred\u0000github\u0000REVIEW\u0000sourcecred\u0000example-github\u00005\u0000100314038\u0000":1519878296000,"N\u0000sourcecred\u0000github\u0000USERLIKE\u0000BOT\u0000credbot\u0000":null,"N\u0000sourcecred\u0000github\u0000USERLIKE\u0000USER\u0000decentralion\u0000":null,"N\u0000sourcecred\u0000github\u0000USERLIKE\u0000USER\u0000wchargin\u0000":null} \ No newline at end of file diff --git a/src/analysis/temporal/timestampMap.js b/src/analysis/temporal/timestampMap.js new file mode 100644 index 0000000..953f48b --- /dev/null +++ b/src/analysis/temporal/timestampMap.js @@ -0,0 +1,57 @@ +// @flow + +import path from "path"; +import fs from "fs-extra"; +import stringify from "json-stable-stringify"; +import * as MapUtil from "../../util/map"; +import {type RepoId, repoIdToString} from "../../core/repoId"; +import {type NodeAddressT, NodeAddress} from "../../core/graph"; +import {type IAnalysisAdapter, type MsSinceEpoch} from "../analysisAdapter"; +import {NodeTrie} from "../../core/trie"; + +export type TimestampMap = Map; + +export function createTimestampMap( + nodes: Iterable, + adapters: $ReadOnlyArray +): TimestampMap { + const adapterTrie: NodeTrie = new NodeTrie(); 
+ for (const adapter of adapters) { + adapterTrie.add(adapter.declaration().nodePrefix, adapter); + } + const result = new Map(); + for (const node of nodes) { + const adapter = adapterTrie.getLast(node); + if (adapter == null) { + throw new Error(`No adapter for ${NodeAddress.toString(node)}`); + } + result.set(node, adapter.createdAt(node)); + } + return result; +} + +const TIMESTAMP_FILE = "timestamps.json"; +function basepath(sourcecredDirectory: string, repoId: RepoId) { + return path.join(sourcecredDirectory, "data", repoIdToString(repoId)); +} +function filepath(sourcecredDirectory: string, repoId: RepoId) { + return path.join(basepath(sourcecredDirectory, repoId), TIMESTAMP_FILE); +} + +export function writeTimestampMap( + stamps: TimestampMap, + sourcecredDirectory: string, + repoId: RepoId +) { + fs.ensureDirSync(basepath(sourcecredDirectory, repoId)); + const jsonString = stringify(MapUtil.toObject(stamps)); + fs.writeFileSync(filepath(sourcecredDirectory, repoId), jsonString); +} + +export function readTimestampMap( + sourcecredDirectory: string, + repoId: RepoId +): TimestampMap { + const contents = fs.readFileSync(filepath(sourcecredDirectory, repoId)); + return MapUtil.fromObject(JSON.parse(contents.toString())); +} diff --git a/src/analysis/temporal/timestampMap.test.js b/src/analysis/temporal/timestampMap.test.js new file mode 100644 index 0000000..957a4d3 --- /dev/null +++ b/src/analysis/temporal/timestampMap.test.js @@ -0,0 +1,87 @@ +// @flow + +import tmp from "tmp"; +import { + Graph, + type NodeAddressT, + NodeAddress, + EdgeAddress, +} from "../../core/graph"; +import {makeRepoId} from "../../core/repoId"; +import { + createTimestampMap, + readTimestampMap, + writeTimestampMap, +} from "./timestampMap"; + +describe("src/analysis/temporal/timestampMap", () => { + const foo = NodeAddress.fromParts(["foo"]); + const bar = NodeAddress.fromParts(["bar"]); + + describe("createTimestampMap", () => { + const declarationForPrefix = (prefixParts: 
string[]) => ({ + name: NodeAddress.fromParts(prefixParts), + nodePrefix: NodeAddress.fromParts(prefixParts), + edgePrefix: EdgeAddress.fromParts(prefixParts), + nodeTypes: [], + edgeTypes: [], + }); + const adapterForPrefix = ( + prefixParts: string[], + createdAt: (NodeAddressT) => number | null + ) => { + class Adapter { + declaration() { + return declarationForPrefix(prefixParts); + } + graph() { + return new Graph(); + } + createdAt(n: NodeAddressT) { + return createdAt(n); + } + } + return new Adapter(); + }; + it("matches the most specific adapter", () => { + const fooAdapter = adapterForPrefix(["foo"], (_) => 1); + const fallbackAdapter = adapterForPrefix([], (_) => null); + const nodes = [foo, bar]; + const tsMap = createTimestampMap(nodes, [fooAdapter, fallbackAdapter]); + // foo got its timestamp from the fooAdapter, not from the fallbackAdapter, + // even though it matched both. + expect(tsMap.get(foo)).toEqual(1); + // Bar matched the fallback adapter. + expect(tsMap.get(bar)).toEqual(null); + }); + it("throws an error if there is no matching adapter", () => { + const foo = NodeAddress.fromParts(["foo"]); + expect(() => createTimestampMap([foo], [])).toThrowError( + `No adapter for NodeAddress["foo"]` + ); + }); + }); + describe("{write,read}TimestampMap", () => { + const repo = makeRepoId("foo", "bar"); + it("throws an error if there is no timestamp map to read", () => { + const dir = tmp.dirSync().name; + expect(() => readTimestampMap(dir, repo)).toThrowError( + "ENOENT: no such file or directory" + ); + }); + it("can write/read the empty registry", () => { + const dir = tmp.dirSync().name; + const map = new Map(); + writeTimestampMap(map, dir, repo); + const map2 = readTimestampMap(dir, repo); + expect(map2).toEqual(map); + }); + it("can write/read a non-empty registry", () => { + const dir = tmp.dirSync().name; + const map = new Map([[foo, null], [bar, 3]]); + writeTimestampMap(map, dir, repo); + const map2 = readTimestampMap(dir, repo); + 
+ expect(map2).toEqual(map); + }); + }); +}); diff --git a/src/cli/load.js b/src/cli/load.js index 0cdfc52..6d114ec 100644 --- a/src/cli/load.js +++ b/src/cli/load.js @@ -6,15 +6,23 @@ import path from "path"; import * as NullUtil from "../util/null"; +import stringify from "json-stable-stringify"; import * as RepoIdRegistry from "../core/repoIdRegistry"; import {repoIdToString, stringToRepoId, type RepoId} from "../core/repoId"; import dedent from "../util/dedent"; import type {Command} from "./command"; import * as Common from "./common"; +import {loadGraph, type LoadGraphResult} from "../analysis/loadGraph"; +import {type IBackendAdapterLoader} from "../analysis/analysisAdapter"; +import { + createTimestampMap, + writeTimestampMap, +} from "../analysis/temporal/timestampMap"; import execDependencyGraph from "../tools/execDependencyGraph"; import {loadGithubData} from "../plugins/github/loadGithubData"; import {loadGitData} from "../plugins/git/loadGitData"; +import {defaultAdapterLoaders} from "./pagerank"; function usage(print: (string) => void): void { print( @@ -193,6 +201,7 @@ export const loadDefaultPlugins = async (options: LoadOptions) => { throw new Error("Load tasks failed."); } addToRepoIdRegistry(options.output); + await saveTimestamps(defaultAdapterLoaders(), options.output); // HACK: Logically, we should have the PagerankTask be included in the // first execDependencyGraph run, depending on the other tasks completing. 
// @@ -261,6 +270,28 @@ function addToRepoIdRegistry(repoId) { RepoIdRegistry.writeRegistry(newRegistry, Common.sourcecredDirectory()); } +async function saveTimestamps( + adapterLoaders: $ReadOnlyArray, + repoId: RepoId +) { + const loadGraphResult: LoadGraphResult = await loadGraph( + Common.sourcecredDirectory(), + adapterLoaders, + repoId + ); + if (loadGraphResult.status !== "SUCCESS") { + throw new Error(`Unable to load graph: ${stringify(loadGraphResult)}`); + } + const {graph} = loadGraphResult; + // We load all the adapters twice (once in loadGraph, once here). + // Could de-duplicate, but it's marginal overhead compared to loading the data. + const adapters = await Promise.all( + adapterLoaders.map((a) => a.load(Common.sourcecredDirectory(), repoId)) + ); + const timestampMap = createTimestampMap(graph.nodes(), adapters); + writeTimestampMap(timestampMap, Common.sourcecredDirectory(), repoId); +} + export const help: Command = async (args, std) => { if (args.length === 0) { usage(std.out); diff --git a/src/cli/pagerank.js b/src/cli/pagerank.js index 64ecfd0..c64dfc2 100644 --- a/src/cli/pagerank.js +++ b/src/cli/pagerank.js @@ -169,6 +169,8 @@ export async function savePagerankGraph( await fs.writeFile(pgFile, stringify(pgJSON)); } +// TODO(#1120): This should be canonicalized somewhere more appropriate, +// e.g. in src/plugins/defaultPlugins.js export const defaultAdapterLoaders = () => [ new GithubAdapterLoader(), new GitAdapterLoader(),