Add sourcecred pagerank for backend pagerank (#1114)

This commit adds a new CLI command, `pagerank`, which runs PageRank on a given repository. At present, the command only ever uses the default weights, although I plan to make this configurable in the future. The command then saves the resultant pagerank graph in the SourceCred directory. On its own, this command is not yet very compelling, as it doesn't present any easily-consumed information (e.g. users' scores). However, it is the first step for building other commands which do just that. My intention is to make running this command the last step of `sourcecred load`, so that future commands may assume the existence of pagerank scores for any loaded repository. Test plan: The new command is thoroughly tested; see `cli/pagerank.test.js`. It also has nearly perfect code coverage (one line missing, the dependency-injected real function for loading graphs). Additionally, the following sequence of commands works: ``` $ yarn backend $ node bin/sourcecred.js load sourcecred/pm $ node bin/sourcecred.js pagerank sourcecred/pm $ cat $SOURCECRED_DIRECTORY/data/sourcecred/pm/pagerankGraph.json ``` Material progress on #967.
2025-01-13 14:14:57 +00:00 · 2019-03-25 18:05:58 -07:00 · 2019-03-25 18:05:58 -07:00 · 012c4f3eb7
commit 012c4f3eb7
parent 669f34d009
6 changed files with 521 additions and 0 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,6 +1,7 @@
 # Changelog

 ## [Unreleased]
+- Add the `pagerank` command (#1114)
 - Add description tooltips for node and edge types in the weight configuration UI (#1081)
 - Add the `export-graph` command (#1110)
 - Enable loading private repositories (#1085)
--- a/src/cli/help.js
+++ b/src/cli/help.js
@ -6,6 +6,7 @@ import dedent from "../util/dedent";

 import {help as loadHelp} from "./load";
 import {help as analyzeHelp} from "./analyze";
+import {help as pagerankHelp} from "./pagerank";
 import {help as exportGraphHelp} from "./exportGraph";

 const help: Command = async (args, std) => {
@ -18,6 +19,7 @@ const help: Command = async (args, std) => {
    help: metaHelp,
    load: loadHelp,
    analyze: analyzeHelp,
+    pagerank: pagerankHelp,
    "export-graph": exportGraphHelp,
  };
  if (subHelps[command] !== undefined) {
@ -40,6 +42,7 @@ function usage(print: (string) => void): void {
      load          load repository data into SourceCred
      analyze       analyze cred for a loaded repository
      export-graph  print a raw SourceCred graph
+      pagerank      recompute cred scores
      help          show this help message

    Use 'sourcecred help COMMAND' for help about an individual command.
--- a/src/cli/pagerank.js
+++ b/src/cli/pagerank.js
@ -0,0 +1,210 @@
+// @flow
+// Implementation of `sourcecred pagerank`.
+
+import fs from "fs-extra";
+import path from "path";
+
+import {Graph} from "../core/graph";
+import {
+  PagerankGraph,
+  DEFAULT_SYNTHETIC_LOOP_WEIGHT,
+  DEFAULT_CONVERGENCE_THRESHOLD,
+  DEFAULT_MAX_ITERATIONS,
+} from "../core/pagerankGraph";
+import {repoIdToString, stringToRepoId, type RepoId} from "../core/repoId";
+import dedent from "../util/dedent";
+import type {Command} from "./command";
+import * as Common from "./common";
+import stringify from "json-stable-stringify";
+import {loadGraph, type LoadGraphResult} from "../analysis/loadGraph";
+
+import {
+  type WeightedTypes,
+  combineWeights,
+  defaultWeightsForDeclaration,
+} from "../analysis/weights";
+import {weightsToEdgeEvaluator} from "../analysis/weightsToEdgeEvaluator";
+import type {IAnalysisAdapter} from "../analysis/analysisAdapter";
+import {AnalysisAdapter as GithubAnalysisAdapter} from "../plugins/github/analysisAdapter";
+import {AnalysisAdapter as GitAnalysisAdapter} from "../plugins/git/analysisAdapter";
+import {FallbackAdapter} from "../analysis/fallbackAdapter";
+
+/**
+ * Print the usage message for `sourcecred pagerank`.
+ *
+ * The whole message is passed to `print` in a single call, after being
+ * dedented and having trailing whitespace trimmed. Callers choose the
+ * destination (stdout or stderr) by choosing `print`.
+ */
+function usage(print: (string) => void): void {
+  print(
+    dedent`\
+    usage: sourcecred pagerank REPO_ID [--help]
+
+    Runs PageRank for a given REPO_ID, and saves the resultant
+    PagerankGraph to the SOURCECRED_DIRECTORY. Data must already
+    be loaded for the given REPO_ID, using 'sourcecred load REPO_ID'.
+
+    PageRank is always run with the default plugin weights. We expect
+    to make the weights configurable in the future.
+
+    REPO_ID refers to a GitHub repository in the form OWNER/NAME: for
+    example, torvalds/linux. The REPO_ID may be a "combined" repo as
+    created by the --output flag to sourcecred load.
+
+    Running this command may take a lot of heap. If it fails with an
+    out of memory (OOM) issue, try manually increasing the heap size
+    by passing the following argument to node process:
+    '--max_old_space_size=8192'
+    as in:
+    'node --max_old_space_size=8192 bin/sourcecred.js pagerank ...'
+
+    Arguments:
+        REPO_ID
+            Already-loaded repository for which to run PageRank.
+
+        --help
+            Show this help message and exit, as 'sourcecred help pagerank'.
+
+    Environment Variables:
+        SOURCECRED_DIRECTORY
+            Directory owned by SourceCred, in which data, caches,
+            registries, etc. are stored. Optional: defaults to a
+            directory 'sourcecred' under your OS's temporary directory;
+            namely:
+                ${Common.defaultSourcecredDirectory()}
+    `.trimRight()
+  );
+}
+
+/**
+ * Report a fatal error on `std.err`, followed by a pointer to the help
+ * command, and hand back the failing exit code (1) so callers can write
+ * `return die(std, "...")`.
+ */
+function die(std, message) {
+  const report = [
+    `fatal: ${message}`,
+    "fatal: run 'sourcecred help pagerank' for help",
+  ];
+  for (const line of report) {
+    std.err(line);
+  }
+  return 1;
+}
+
+/**
+ * Harness to create a Pagerank CLI command.
+ * It's factored so as to make it easy to test the CLI bits, separately
+ * from the core logic.
+ * It takes `loadGraph`, which loads the graph corresponding to a RepoId,
+ * `runPagerank`, which runs pagerank on that graph, and
+ * `savePagerankGraph`, which is responsible for saving the resultant
+ * PagerankGraph to disk.
+ *
+ * Note that these parameters shadow the module-level `loadGraph` import
+ * and the exported `runPagerank` / `savePagerankGraph` functions: inside
+ * this function the names always refer to the injected callbacks.
+ *
+ * The returned command accepts exactly one REPO_ID argument (or
+ * `--help`). It returns 0 after printing usage or after a successful
+ * load/run/save, and 1 on a usage error or when the loader reports
+ * REPO_NOT_LOADED or PLUGIN_FAILURE.
+ */
+export function makePagerankCommand(
+  loadGraph: (RepoId) => Promise<LoadGraphResult>,
+  runPagerank: (Graph) => Promise<PagerankGraph>,
+  savePagerankGraph: (RepoId, PagerankGraph) => Promise<void>
+): Command {
+  return async function pagerank(args, std) {
+    let repoId: RepoId | null = null;
+    for (let i = 0; i < args.length; i++) {
+      switch (args[i]) {
+        case "--help": {
+          // Usage goes to stdout here because it was explicitly requested.
+          usage(std.out);
+          return 0;
+        }
+        default: {
+          // Any non-flag argument is treated as the repository ID; a
+          // second one is a usage error.
+          if (repoId != null) {
+            return die(std, "multiple repository IDs provided");
+          }
+          // Should be a repository.
+          // NOTE(review): presumably stringToRepoId throws on a malformed
+          // ID, which would surface as a rejected promise rather than a
+          // 'fatal:' message -- confirm against core/repoId.
+          repoId = stringToRepoId(args[i]);
+          break;
+        }
+      }
+    }
+
+    if (repoId == null) {
+      return die(std, "no repository ID provided");
+    }
+
+    const result: LoadGraphResult = await loadGraph(repoId);
+
+    switch (result.status) {
+      case "REPO_NOT_LOADED": {
+        const repoIdStr = repoIdToString(repoId);
+        std.err(`fatal: repository ID ${repoIdStr} not loaded`);
+        std.err(`Try running \`sourcecred load ${repoIdStr}\` first.`);
+        return 1;
+      }
+      case "PLUGIN_FAILURE": {
+        std.err(
+          `fatal: plugin "${result.pluginName}" errored: ${
+            result.error.message
+          }`
+        );
+        return 1;
+      }
+      case "SUCCESS": {
+        // Nothing is printed on success; the only output is the saved file.
+        const pagerankGraph = await runPagerank(result.graph);
+        await savePagerankGraph(repoId, pagerankGraph);
+        return 0;
+      }
+      // The `empty` cast below makes Flow verify that every status has
+      // been handled above.
+      // istanbul ignore next: unreachable per Flow
+      default: {
+        std.err(`Unexpected status: ${(result.status: empty)}`);
+        return 1;
+      }
+    }
+  };
+}
+
+/**
+ * Build a PagerankGraph over `graph`, using an edge evaluator derived
+ * from `weights` and the default synthetic loop weight, then run
+ * PageRank on it with the default iteration cap and convergence
+ * threshold. Resolves to that PagerankGraph once the run has finished.
+ */
+export async function runPagerank(
+  weights: WeightedTypes,
+  graph: Graph
+): Promise<PagerankGraph> {
+  const result = new PagerankGraph(
+    graph,
+    weightsToEdgeEvaluator(weights),
+    DEFAULT_SYNTHETIC_LOOP_WEIGHT
+  );
+  await result.runPagerank({
+    maxIterations: DEFAULT_MAX_ITERATIONS,
+    convergenceThreshold: DEFAULT_CONVERGENCE_THRESHOLD,
+  });
+  return result;
+}
+
+/**
+ * Serialize `pg` and write it to
+ * `<directory>/data/<repoId as string>/pagerankGraph.json`, creating the
+ * containing directory if needed. The JSON is produced with a stable
+ * stringifier.
+ */
+export async function savePagerankGraph(
+  directory: string,
+  repoId: RepoId,
+  pg: PagerankGraph
+): Promise<void> {
+  const serializable = pg.toJSON();
+  const outputDir = path.join(directory, "data", repoIdToString(repoId));
+  await fs.ensureDir(outputDir);
+  await fs.writeFile(
+    path.join(outputDir, "pagerankGraph.json"),
+    stringify(serializable)
+  );
+}
+
+/**
+ * Compute the combined default weights for a list of adapters: take each
+ * adapter's declaration, derive that declaration's default weights, and
+ * merge the results into a single WeightedTypes.
+ */
+function weightsForAdapters(
+  adapters: $ReadOnlyArray<IAnalysisAdapter>
+): WeightedTypes {
+  const pluginDeclarations = adapters.map((adapter) => adapter.declaration());
+  const perPluginWeights = pluginDeclarations.map((declaration) =>
+    defaultWeightsForDeclaration(declaration)
+  );
+  return combineWeights(perPluginWeights);
+}
+
+// Production wiring for the command. Each helper is a function so that
+// adapters are constructed, and the SourceCred directory is looked up,
+// when the helper is called rather than when this module is imported.
+
+// Fresh instances of the GitHub, Git, and fallback analysis adapters.
+export const defaultAdapters = () => [
+  new GithubAnalysisAdapter(),
+  new GitAnalysisAdapter(),
+  new FallbackAdapter(),
+];
+// Loads the graph for a repo from the SourceCred directory, using the
+// default adapters. Not exported.
+const defaultLoader = (r: RepoId) =>
+  loadGraph(Common.sourcecredDirectory(), defaultAdapters(), r);
+// Combined default weights of every default adapter's declaration.
+export const defaultWeights = () => weightsForAdapters(defaultAdapters());
+// Runs PageRank with the default weights (recomputed on every call).
+export const defaultPagerank = (g: Graph) => runPagerank(defaultWeights(), g);
+// Saves a PagerankGraph under the SourceCred directory.
+export const defaultSaver = (r: RepoId, pg: PagerankGraph) =>
+  savePagerankGraph(Common.sourcecredDirectory(), r, pg);
+
+// The real `sourcecred pagerank` command: the harness above, supplied
+// with the default loader, runner, and saver.
+export const pagerankCommand = makePagerankCommand(
+  defaultLoader,
+  defaultPagerank,
+  defaultSaver
+);
+
+/**
+ * Implementation of `sourcecred help pagerank`. With no arguments it
+ * prints usage to stdout and succeeds (0); with any arguments it prints
+ * usage to stderr and fails (1).
+ */
+export const help: Command = async (args, std) => {
+  const calledWithoutArgs = args.length === 0;
+  usage(calledWithoutArgs ? std.out : std.err);
+  return calledWithoutArgs ? 0 : 1;
+};
+
+// The default export is the command built from the default loader,
+// PageRank runner, and saver.
+export default pagerankCommand;
--- a/src/cli/pagerank.test.js
+++ b/src/cli/pagerank.test.js
@ -0,0 +1,295 @@
+// @flow
+
+import tmp from "tmp";
+import path from "path";
+import fs from "fs-extra";
+
+import {run} from "./testUtil";
+import * as NullUtil from "../util/null";
+import {
+  help,
+  makePagerankCommand,
+  savePagerankGraph,
+  runPagerank,
+  defaultWeights,
+  defaultPagerank,
+  defaultAdapters,
+  defaultSaver,
+} from "./pagerank";
+import {Graph, NodeAddress, EdgeAddress} from "../core/graph";
+import {advancedGraph} from "../core/graphTestUtil";
+import {
+  PagerankGraph,
+  DEFAULT_SYNTHETIC_LOOP_WEIGHT,
+  DEFAULT_CONVERGENCE_THRESHOLD,
+  DEFAULT_MAX_ITERATIONS,
+} from "../core/pagerankGraph";
+import type {NodeType, EdgeType} from "../analysis/types";
+import {fallbackDeclaration} from "../analysis/fallbackDeclaration";
+import {
+  defaultWeightsForDeclaration,
+  combineWeights,
+} from "../analysis/weights";
+
+import {weightsToEdgeEvaluator} from "../analysis/weightsToEdgeEvaluator";
+
+import {makeRepoId, repoIdToString} from "../core/repoId";
+
+describe("cli/pagerank", () => {
+  describe("'help' command", () => {
+    // Matcher for an output stream that includes the usage banner line.
+    const containsUsage = () =>
+      expect.arrayContaining([
+        expect.stringMatching(/^usage: sourcecred pagerank/),
+      ]);
+
+    it("prints usage when given no arguments", async () => {
+      const result = await run(help, []);
+      expect(result).toEqual({
+        exitCode: 0,
+        stdout: containsUsage(),
+        stderr: [],
+      });
+    });
+    it("fails when given arguments", async () => {
+      const result = await run(help, ["foo/bar"]);
+      expect(result).toEqual({
+        exitCode: 1,
+        stdout: [],
+        stderr: containsUsage(),
+      });
+    });
+  });
+
+  // Tests for the command harness. The loader, PageRank runner, and saver
+  // are all injected, so nothing here touches the disk or runs PageRank.
+  describe("'pagerank' command", () => {
+    it("prints usage with '--help'", async () => {
+      const pagerank = makePagerankCommand(jest.fn(), jest.fn(), jest.fn());
+      expect(await run(pagerank, ["--help"])).toEqual({
+        exitCode: 0,
+        stdout: expect.arrayContaining([
+          expect.stringMatching(/^usage: sourcecred pagerank/),
+        ]),
+        stderr: [],
+      });
+    });
+
+    it("errors if no repoId is provided", async () => {
+      const pagerank = makePagerankCommand(jest.fn(), jest.fn(), jest.fn());
+      expect(await run(pagerank, [])).toEqual({
+        exitCode: 1,
+        stdout: [],
+        stderr: expect.arrayContaining([
+          "fatal: no repository ID provided",
+          "fatal: run 'sourcecred help pagerank' for help",
+        ]),
+      });
+    });
+
+    it("errors if multiple repos are provided", async () => {
+      const pagerank = makePagerankCommand(jest.fn(), jest.fn(), jest.fn());
+      expect(await run(pagerank, ["foo/bar", "zod/zoink"])).toEqual({
+        exitCode: 1,
+        stdout: [],
+        stderr: [
+          "fatal: multiple repository IDs provided",
+          "fatal: run 'sourcecred help pagerank' for help",
+        ],
+      });
+    });
+
+    it("errors if the repoId was not loaded first", async () => {
+      const loadResult = {status: "REPO_NOT_LOADED"};
+      const loader = () => new Promise((resolve) => resolve(loadResult));
+      const pagerank = makePagerankCommand(loader, jest.fn(), jest.fn());
+      const result = run(pagerank, ["zod/zoink"]);
+      expect(await result).toEqual({
+        exitCode: 1,
+        stdout: [],
+        stderr: [
+          "fatal: repository ID zod/zoink not loaded",
+          "Try running `sourcecred load zod/zoink` first.",
+        ],
+      });
+    });
+
+    it("passes the right arguments to loadGraph", async () => {
+      // The command reads `.status` off whatever the loader resolves to,
+      // so the mock must resolve to a well-formed result; a bare
+      // `jest.fn()` would resolve to `undefined`.
+      const mockLoader = jest.fn(() =>
+        Promise.resolve({status: "REPO_NOT_LOADED"})
+      );
+      const pagerank = makePagerankCommand(mockLoader, jest.fn(), jest.fn());
+      const repoId = makeRepoId("foo", "bar");
+      await run(pagerank, ["foo/bar"]);
+      expect(mockLoader).toHaveBeenCalledWith(repoId);
+    });
+
+    it("prints a message if there was a plugin failure", async () => {
+      const failure = {
+        status: "PLUGIN_FAILURE",
+        pluginName: "foo",
+        error: new Error("FooError"),
+      };
+      const loader = (_unused_repoId) =>
+        new Promise((resolve) => resolve(failure));
+      const command = makePagerankCommand(loader, jest.fn(), jest.fn());
+      const result = await run(command, ["foo/bar"]);
+      expect(result).toEqual({
+        exitCode: 1,
+        stdout: [],
+        stderr: ['fatal: plugin "foo" errored: FooError'],
+      });
+    });
+
+    describe("on successful load", () => {
+      // Fixtures are factories so each use gets a fresh, equal-by-value
+      // instance to compare against.
+      const graph = () => new Graph().addNode(NodeAddress.empty);
+      const graphResult = () => ({status: "SUCCESS", graph: graph()});
+      const loader = (_unused_repoId) =>
+        new Promise((resolve) => resolve(graphResult()));
+      const evaluator = (_unused_edge) => ({toWeight: 1, froWeight: 1});
+      const pagerankGraph = () => new PagerankGraph(graph(), evaluator, 0.001);
+      const mockPagerankRunner = (_unused_graph) =>
+        new Promise((resolve) => resolve(pagerankGraph()));
+
+      it("passes the loaded graph to the pagerank runner", async () => {
+        const mock = jest.fn();
+        const command = makePagerankCommand(loader, mock, jest.fn());
+        await run(command, ["foo/bar"]);
+        expect(mock).toHaveBeenCalledWith(graph());
+      });
+
+      it("passes the resultant pagerankGraph to the saver", async () => {
+        const mock = jest.fn();
+        const command = makePagerankCommand(loader, mockPagerankRunner, mock);
+        await run(command, ["foo/bar"]);
+        const repoId = makeRepoId("foo", "bar");
+        expect(mock).toHaveBeenCalledWith(repoId, pagerankGraph());
+      });
+
+      it("returns with exit code 0 and nothing printed to stdout/stderr", async () => {
+        const command = makePagerankCommand(
+          loader,
+          mockPagerankRunner,
+          jest.fn()
+        );
+        const result = await run(command, ["foo/bar"]);
+        expect(result).toEqual({
+          exitCode: 0,
+          stdout: [],
+          stderr: [],
+        });
+      });
+    });
+  });
+
+  describe("savePagerankGraph", () => {
+    it("saves the PagerankGraphJSON to the right filepath", async () => {
+      // Fixture: a single-node graph with an arbitrary constant evaluator.
+      const prg = new PagerankGraph(
+        new Graph().addNode(NodeAddress.empty),
+        (_unused_edge) => ({toWeight: 1, froWeight: 2})
+      );
+      const repoId = makeRepoId("foo", "bar");
+      const sourcecredDir = tmp.dirSync().name;
+
+      await savePagerankGraph(sourcecredDir, repoId, prg);
+
+      // Read the file back from where the saver is expected to put it.
+      const savedFile = path.join(
+        sourcecredDir,
+        "data",
+        repoIdToString(repoId),
+        "pagerankGraph.json"
+      );
+      const parsed = JSON.parse(fs.readFileSync(savedFile).toString());
+      expect(parsed).toEqual(prg.toJSON());
+    });
+  });
+
+  describe("runPagerank", () => {
+    it("computes pagerank with the given weights", async () => {
+      // Hand-built plugin declaration with one node type and one edge
+      // type, so the weights under test are not just the fallback ones.
+      const nodeType: NodeType = {
+        name: "foo",
+        pluralName: "foos",
+        prefix: NodeAddress.fromParts(["src"]),
+        defaultWeight: 3,
+        description: "an example node type",
+      };
+      const edgeType: EdgeType = {
+        forwardName: "bars",
+        backwardName: "barred by",
+        defaultForwardWeight: 5,
+        defaultBackwardWeight: 3,
+        prefix: EdgeAddress.fromParts(["hom"]),
+        description: "an example edge type",
+      };
+      const exampleDeclaration = {
+        name: "example",
+        nodePrefix: NodeAddress.fromParts(["src"]),
+        edgePrefix: EdgeAddress.fromParts(["hom"]),
+        nodeTypes: [nodeType],
+        edgeTypes: [edgeType],
+      };
+
+      const exampleWeightedTypes = defaultWeightsForDeclaration(
+        exampleDeclaration
+      );
+      const fallbackWeightedTypes = defaultWeightsForDeclaration(
+        fallbackDeclaration
+      );
+      const weightedTypes = combineWeights([
+        exampleWeightedTypes,
+        fallbackWeightedTypes,
+      ]);
+
+      // The expected graph is built by repeating, by hand, the steps that
+      // runPagerank is supposed to perform with the default parameters.
+      const graph = advancedGraph().graph1();
+      const actualPagerankGraph = await runPagerank(weightedTypes, graph);
+      const expectedPagerankGraph = new PagerankGraph(
+        graph,
+        weightsToEdgeEvaluator(weightedTypes),
+        DEFAULT_SYNTHETIC_LOOP_WEIGHT
+      );
+      await expectedPagerankGraph.runPagerank({
+        convergenceThreshold: DEFAULT_CONVERGENCE_THRESHOLD,
+        maxIterations: DEFAULT_MAX_ITERATIONS,
+      });
+      expect(actualPagerankGraph.equals(expectedPagerankGraph)).toBe(true);
+    });
+    it("default pageRank is robust to nodes that are not owned by any plugin", async () => {
+      // A node and a self-loop edge at the empty addresses. The test
+      // passes as long as defaultPagerank resolves without throwing.
+      const graph = new Graph().addNode(NodeAddress.empty).addEdge({
+        address: EdgeAddress.empty,
+        src: NodeAddress.empty,
+        dst: NodeAddress.empty,
+      });
+      await defaultPagerank(graph);
+    });
+  });
+  // For every node and edge type declared by a default adapter, the
+  // default weights must hold an entry keyed by that type's prefix whose
+  // weight(s) equal the type's declared defaults.
+  it("default weights contain every node and edge type from adapters", () => {
+    const ws = defaultWeights();
+    for (const adapter of defaultAdapters()) {
+      const declaration = adapter.declaration();
+      for (const nodeType of declaration.nodeTypes) {
+        const weightedNodeType = NullUtil.get(ws.nodes.get(nodeType.prefix));
+        expect(weightedNodeType.weight).toEqual(nodeType.defaultWeight);
+        expect(weightedNodeType.type).toEqual(nodeType);
+      }
+      for (const edgeType of declaration.edgeTypes) {
+        const weightedEdgeType = NullUtil.get(ws.edges.get(edgeType.prefix));
+        expect(weightedEdgeType.forwardWeight).toEqual(
+          edgeType.defaultForwardWeight
+        );
+        expect(weightedEdgeType.backwardWeight).toEqual(
+          edgeType.defaultBackwardWeight
+        );
+        expect(weightedEdgeType.type).toEqual(edgeType);
+      }
+    }
+  });
+  // defaultSaver takes no directory argument, so the test points
+  // SOURCECRED_DIRECTORY at a temp dir and checks the file lands there.
+  it("defaultSaver saves to sourcecred directory", async () => {
+    const dirname = tmp.dirSync().name;
+    // Remember the previous setting so the override below does not leak
+    // into tests that run after this one.
+    const previousDirectory = process.env.SOURCECRED_DIRECTORY;
+    process.env.SOURCECRED_DIRECTORY = dirname;
+    try {
+      const repoId = makeRepoId("foo", "bar");
+      const prg = new PagerankGraph(
+        new Graph().addNode(NodeAddress.empty),
+        (_unused_edge) => ({toWeight: 1, froWeight: 2})
+      );
+      await defaultSaver(repoId, prg);
+      const expectedPath = path.join(
+        dirname,
+        "data",
+        "foo/bar",
+        "pagerankGraph.json"
+      );
+      const blob = await fs.readFile(expectedPath);
+      const actualJSON = JSON.parse(blob.toString());
+      expect(actualJSON).toEqual(prg.toJSON());
+    } finally {
+      // Assigning `undefined` to an environment variable stores the
+      // string "undefined", so a previously-unset variable is deleted.
+      if (previousDirectory === undefined) {
+        delete process.env.SOURCECRED_DIRECTORY;
+      } else {
+        process.env.SOURCECRED_DIRECTORY = previousDirectory;
+      }
+    }
+  });
+});
--- a/src/cli/sourcecred.js
+++ b/src/cli/sourcecred.js
@ -9,6 +9,7 @@ import help from "./help";
 import load from "./load";
 import analyze from "./analyze";
 import exportGraph from "./exportGraph";
+import pagerank from "./pagerank";

 const sourcecred: Command = async (args, std) => {
  if (args.length === 0) {
@ -28,6 +29,8 @@ const sourcecred: Command = async (args, std) => {
      return analyze(args.slice(1), std);
    case "export-graph":
      return exportGraph(args.slice(1), std);
+    case "pagerank":
+      return pagerank(args.slice(1), std);
    default:
      std.err("fatal: unknown command: " + JSON.stringify(args[0]));
      std.err("fatal: run 'sourcecred help' for commands and usage");
--- a/src/cli/sourcecred.test.js
+++ b/src/cli/sourcecred.test.js
@ -15,6 +15,7 @@ jest.mock("./help", () => mockCommand("help"));
 jest.mock("./load", () => mockCommand("load"));
 jest.mock("./analyze", () => mockCommand("analyze"));
 jest.mock("./exportGraph", () => mockCommand("export-graph"));
+jest.mock("./pagerank", () => mockCommand("pagerank"));

 describe("cli/sourcecred", () => {
  it("fails with usage when invoked with no arguments", async () => {
@ -75,6 +76,14 @@ describe("cli/sourcecred", () => {
    });
  });

+  it("responds to 'pagerank'", async () => {
+    expect(await run(sourcecred, ["pagerank", "foo/bar", "foo/baz"])).toEqual({
+      exitCode: 2,
+      stdout: ['out(pagerank): ["foo/bar","foo/baz"]'],
+      stderr: ["err(pagerank)"],
+    });
+  });
+
  it("fails given an unknown command", async () => {
    expect(await run(sourcecred, ["wat"])).toEqual({
      exitCode: 1,