PagerankGraph reuses existing distribution (#1160)

This commit modifies `PagerankGraph.runPagerank` so that rather than always starting from a uniform distribution, it starts with the PagerankGraph's existing score distribution. The PagerankGraph initializes with a uniform score over nodes, so it has the exact same behavior the first time that runPagerank is called. On subsequent calls, PageRank will likely converge a lot faster, because it's starting from converged scores. (It should still be a lot faster in cases where e.g. the user has tweaked the weights.) In certain degenerate cases, this could change the resultant scores. Specifically, if there are isolated nodes in the graph and alpha=0, then the isolated nodes' final scores depend on the initial scores. In general, I think this won't be an issue as we expect alpha > 0 in normal usage. Test plan: I added a unit test to verify this property, by checking that running PageRank with maxIterations==0 on an already-converged graph results in a still-converged graph. Also, existing tests pass. I think we can now close #1020.
2019-05-29 20:29:28 +03:00 · 2019-05-29 20:29:28 +03:00 · 4dc97fcc57
parent 14eee06799
commit 4dc97fcc57
2 changed files with 28 additions and 5 deletions
--- a/src/core/pagerankGraph.js
+++ b/src/core/pagerankGraph.js
@ -26,7 +26,6 @@ import {
  findStationaryDistribution,
  type PagerankParams,
  type PagerankOptions as CorePagerankOptions,
  uniformDistribution,
 } from "../core/attribution/markovChain";
 import * as NullUtil from "../util/null";
@ -456,9 +455,13 @@ export class PagerankGraph {
   * individual delta in a node's score between the present and previous
   * iteration is less than or equal to `options.convergenceThreshold`.
   *
-   * TODO(#1020): Make `runPagerank` use the current nodes' scores as a
+   * Note that if runPagerank is called multiple times on the same
-   * starting point for computation, rather than re-generating from
+   * PagerankGraph, it will re-use the last stationary distribution as the
-   * scratch every time `runPagerank` is called.
+   * starting point for running PageRank again. In general, this will result in
   * improved performance, and it will not usually affect the outcome from
   * PageRank. However, in certain circumstances, it could result in different
   * outputs. For example, if there are isolated nodes and no seed vector, then
   * the initial distribution may matter.
   */
  async runPagerank(
    options?: PagerankOptions
@ -476,11 +479,15 @@ export class PagerankGraph {
      this._syntheticLoopWeight
    );
    const osmc = createOrderedSparseMarkovChain(connections);
    const pi0 = new Float64Array(osmc.chain.length);
    osmc.nodeOrder.forEach(
      (n: NodeAddressT, i) => (pi0[i] = NullUtil.get(this.node(n)).score)
    );
    const params: PagerankParams = {
      chain: osmc.chain,
      alpha: fullOptions.alpha,
      seed: weightedDistribution(osmc.nodeOrder, fullOptions.seed),
-      pi0: uniformDistribution(osmc.chain.length),
+      pi0,
    };
    const coreOptions: CorePagerankOptions = {
      verbose: false,
--- a/src/core/pagerankGraph.test.js
+++ b/src/core/pagerankGraph.test.js
@ -609,6 +609,22 @@ describe("core/pagerankGraph", () => {
      expect(results.convergenceDelta).toBeLessThan(convergenceThreshold);
      checkProbabilityDistribution(pg);
    });
    // Checks that a second `runPagerank` call on the same PagerankGraph starts
    // from the scores left behind by the first call, rather than starting over.
    it("re-uses existing scores as a starting point", async () => {
      const pg = examplePagerankGraph();
      const convergenceThreshold = 0.001;
      // First run: allow up to 170 iterations so the scores can reach the
      // convergence threshold.
      const results1 = await pg.runPagerank({
        maxIterations: 170,
        convergenceThreshold,
      });
      // Precondition for the rest of the test: the first run converged.
      expect(results1.convergenceDelta).toBeLessThan(convergenceThreshold);
      // It should still converge without any iterations, because it uses the
      // final distribution as a starting point
      const results2 = await pg.runPagerank({
        maxIterations: 0,
        convergenceThreshold,
      });
      // The second run is expected to report the same delta as the first.
      // NOTE(review): presumably this holds because the starting scores are
      // unchanged from the first run's result; how the delta is computed when
      // maxIterations is 0 is not visible here, so confirm against
      // findStationaryDistribution.
      expect(results2.convergenceDelta).toEqual(results1.convergenceDelta);
    });
  });
  describe("equals", () => {