Factor out _intervalResult in timelinePagerank (#1799)

As part of work for #1773, I want to add a lot more complexity to the
logic that computes individual time-slices of PageRank scores, so that
we can trace how much score flowed along each individual edge. That
means growing the _computeTimelineDistribution function; however, that
function is an untested wrapper, so I'm hesitant to add more complexity
to it directly.

Instead, I'm first factoring out an _intervalResult function and a
corresponding IntervalResult type, which compute the scores for a given
timeline interval. I've also added sanity-checking tests for this
function. In a follow-on commit, I'll add the logic for tracking
edge-level score flows.
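
In sketch form, the new control flow looks like this (condensed from the
diff below; the per-interval bookkeeping that produces nodeWeights and
nodeToConnections is elided, and `intervals` stands in for the real
iteration):

    let pi0 = null;
    for (const interval of intervals) {
      // Each slice's PageRank run now lives behind _intervalResult...
      const result = await _intervalResult(
        nodeWeights,
        nodeToConnections,
        nodeOrder,
        interval,
        pi0, // warm-start from the previous slice, if any
        alpha
      );
      results.push(result);
      // ...and the converged distribution seeds the next slice.
      pi0 = result.distribution;
    }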

Test plan: This is just a refactor, maintaining existing behavior and
adding tests. `yarn test --full` passes. Since our sharness tests
include doing a full load (including timeline cred computation) on
realistic data from GitHub, this gives us confidence that there hasn't
been any change to cred semantics.
Dandelion Mané 2020-05-30 14:14:32 -07:00 committed by GitHub
parent 71c0b0d66d
commit 27aeacc906
2 changed files with 109 additions and 25 deletions

src/core/algorithm/timelinePagerank.js

@@ -24,10 +24,7 @@ import {
 } from "./graphToMarkovChain";
 import {findStationaryDistribution, type PagerankParams} from "./markovChain";
 
-/**
- * Represents raw PageRank distributions on a graph over time.
- */
-export type TimelineDistributions = $ReadOnlyArray<{|
+export type IntervalResult = {|
   // The interval for this slice
   +interval: Interval,
   // The total node weight within this interval (normalized to account for the
@@ -37,7 +34,11 @@ export type TimelineDistributions = $ReadOnlyArray<{|
   // The raw score distribution over nodes for this interval (i.e. sums to 1).
   // Uses the canonical graph node order.
   +distribution: Distribution,
-|}>;
+|};
+
+/**
+ * Represents raw PageRank distributions on a graph over time.
+ */
+export type TimelineDistributions = $ReadOnlyArray<IntervalResult>;
 
 export const SYNTHETIC_LOOP_WEIGHT = 1e-3;
@@ -206,28 +207,47 @@ export async function _computeTimelineDistribution(
     const nodeToConnections = NullUtil.get(
       nodeToConnectionsIterator.next().value
     );
-    const {chain} = createOrderedSparseMarkovChain(nodeToConnections);
-
-    const seed = weightedDistribution(nodeOrder, nodeWeights);
-    if (pi0 == null) {
-      pi0 = seed;
-    }
-    const params: PagerankParams = {chain, alpha, seed, pi0};
-    const distributionResult = await findStationaryDistribution(params, {
-      verbose: false,
-      convergenceThreshold: 1e-7,
-      maxIterations: 255,
-      yieldAfterMs: 30,
-    });
-    const intervalWeight = sum(nodeWeights.values());
-    results.push({
+    const result = await _intervalResult(
+      nodeWeights,
+      nodeToConnections,
+      nodeOrder,
       interval,
-      intervalWeight,
-      distribution: distributionResult.pi,
-    });
+      pi0,
+      alpha
+    );
+    results.push(result);
     // Use the latest convergence results as the starting point for the next run
     // of PageRank
-    pi0 = distributionResult.pi;
+    pi0 = result.distribution;
   }
   return results;
 }
+
+export async function _intervalResult(
+  nodeWeights: Map<NodeAddressT, number>,
+  nodeToConnections: NodeToConnections,
+  nodeOrder: $ReadOnlyArray<NodeAddressT>,
+  interval: Interval,
+  pi0: Distribution | null,
+  alpha: number
+): Promise<IntervalResult> {
+  const {chain} = createOrderedSparseMarkovChain(nodeToConnections);
+
+  const seed = weightedDistribution(nodeOrder, nodeWeights);
+  if (pi0 == null) {
+    pi0 = seed;
+  }
+  const params: PagerankParams = {chain, alpha, seed, pi0};
+  const distributionResult = await findStationaryDistribution(params, {
+    verbose: false,
+    convergenceThreshold: 1e-7,
+    maxIterations: 255,
+    yieldAfterMs: 30,
+  });
+  const intervalWeight = sum(nodeWeights.values());
+  return {
+    interval,
+    intervalWeight,
+    distribution: distributionResult.pi,
+  };
+}

src/core/algorithm/timelinePagerank.test.js

@@ -2,12 +2,13 @@
 import {sum} from "d3-array";
 import * as NullUtil from "../../util/null";
-import {node, edge} from "../graphTestUtil";
+import {node, edge, advancedGraph} from "../graphTestUtil";
 import {Graph, type EdgeAddressT, type Edge} from "../graph";
 import {
   _timelineNodeWeights,
   _timelineNodeToConnections,
   SYNTHETIC_LOOP_WEIGHT,
+  _intervalResult,
 } from "./timelinePagerank";
 import {
   createConnections,
@@ -107,4 +108,67 @@ describe("src/core/algorithm/timelinePagerank", () => {
       expect(chains[3]).toEqual(chain4);
     });
   });
+  describe("_intervalResult", () => {
+    async function example() {
+      const {graph1, nodes} = advancedGraph();
+      const g = graph1();
+      const nodeWeights = new Map()
+        .set(nodes.src.address, 1)
+        .set(nodes.isolated.address, 2);
+      const edgeFn = (_unused_edge) => ({forwards: 1, backwards: 0.5});
+      const nodeToConnections = createConnections(g, edgeFn, 1e-3);
+      const nodeOrder = Array.from(g.nodes()).map((x) => x.address);
+      const interval = {endTimeMs: 1000, startTimeMs: 0};
+      const pi0 = null;
+      const alpha = 0.05;
+      const result = await _intervalResult(
+        nodeWeights,
+        nodeToConnections,
+        nodeOrder,
+        interval,
+        pi0,
+        alpha
+      );
+      return {
+        graph: g,
+        nodes,
+        nodeOrder,
+        nodeWeights,
+        edgeFn,
+        nodeToConnections,
+        interval,
+        pi0,
+        alpha,
+        result,
+      };
+    }
+    it("passes through the interval", async () => {
+      const {result, interval} = await example();
+      expect(result.interval).toEqual(interval);
+    });
+    it("computes the summed nodeWeight", async () => {
+      const {result, nodeWeights} = await example();
+      const actualIntervalWeight = sum(nodeWeights.values());
+      expect(result.intervalWeight).toEqual(actualIntervalWeight);
+    });
+    it("produces sane score distribution on an example graph", async () => {
+      const {result, nodes, nodeOrder} = await example();
+      function getScore(a) {
+        const idx = nodeOrder.indexOf(a.address);
+        if (idx === -1) {
+          throw new Error("bad address");
+        }
+        return result.distribution[idx];
+      }
+      const isoScore = getScore(nodes.isolated);
+      const srcScore = getScore(nodes.src);
+      const dstScore = getScore(nodes.dst);
+      expect(isoScore + srcScore + dstScore).toBeCloseTo(1);
+      // It has 2/3rds of the weight, and is isolated, so it's simple
+      expect(isoScore).toBeCloseTo(2 / 3);
+      // src has the weight, and dst doesn't, so it should have a higher score
+      expect(srcScore).toBeGreaterThan(dstScore);
+    });
+  });
 });