Add support for seed vectors to PagerankGraph (#1135)
This commit modifies `PagerankGraph.runPagerank` so that the user can
provide an alpha and a seed vector. The seed vector is specified via a map
of weights, which is normalized into a probability distribution over all
the nodes in the graph. If the map is empty (or the total weight is
otherwise 0), a uniform distribution is used instead.
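As a usage sketch (the graph setup and the node variables here are
hypothetical; the option names match this commit's API):

    const pg = new PagerankGraph(graph, edgeEvaluator);
    // Weights of 1 and 3 normalize to probabilities 0.25 and 0.75.
    await pg.runPagerank({
      alpha: 0.1,
      seed: new Map().set(nodeA, 1).set(nodeB, 3),
    });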
To effect this change, a helper function called `weightedDistribution`
has been added (and thoroughly tested) in the `graphToMarkovChain`
module. That function is then used in `PagerankGraph.runPagerank`
(with lighter testing at that level).
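For a concrete sense of `weightedDistribution`'s behavior (the node
addresses are illustrative; the results are taken from the unit tests
in the diff below):

    const order = [a, b, c, d];
    weightedDistribution(order, new Map().set(b, 1).set(c, 3));
    // => Float64Array [0, 0.25, 0.75, 0]
    weightedDistribution(order, new Map());
    // => Float64Array [0.25, 0.25, 0.25, 0.25] (uniform fallback)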
Currently, the default alpha is set to 0, to ensure consistency with the
legacy pagerank implementation in `analysis/pagerank`. Once that has
been replaced with `PagerankGraph`, we can consider changing the default
alpha to a non-zero value (thus removing the need for synthetic self-loops).
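For context, the update rule that alpha plugs into is the standard
teleporting PageRank iteration (formulation mine, not copied from this
commit):

    pi' = (1 - alpha) * (pi * M) + alpha * seed

With alpha = 0 the seed term vanishes, matching the legacy behavior;
with alpha > 0, every node receives some probability mass from the seed
on each step, which is what would make the synthetic self-loops
unnecessary.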
This takes a different approach than the one in the
[odyssey-hackathon repo][commit]. That previous approach had a much more
complicated (and fairly redundant) API, which allowed specifying
"NO_SEED", "UNIFORM_SEED", "SELECTED_SEED", and "SPECIFIED_SEED".
I'm much happier with this API and implementation.
[commit]: ed07861073
Test plan: Unit tests included; run `yarn test`.
src/core/attribution/graphToMarkovChain.js
@@ -1,7 +1,11 @@
 // @flow

-import {type Edge, type Graph, type NodeAddressT} from "../graph";
-import type {Distribution, SparseMarkovChain} from "./markovChain";
+import {type Edge, type Graph, type NodeAddressT, NodeAddress} from "../graph";
+import {
+  type Distribution,
+  type SparseMarkovChain,
+  uniformDistribution,
+} from "./markovChain";
 import * as MapUtil from "../../util/map";
 import * as NullUtil from "../../util/null";
@@ -31,6 +35,54 @@ export function adjacencySource(target: NodeAddressT, adjacency: Adjacency) {
   }
 }

+/**
+ * Create a Distribution using provided node weights.
+ *
+ * weightedDistribution takes in a node order (as a read only array of NodeAddressT),
+ * and a map providing weights for a subset of those nodes. It returns a Distribution
+ * with the invariant that every node's weight is proportional to its relative weight
+ * in the weights map. For example, in a case where there were three nodes and they
+ * had weights of 0, 1, and 3 respectively, the distribution would be [0, 0.25, 0.75].
+ *
+ * If a node address is not present in the weight map, its weight is assumed to be 0.
+ * If any weight is negative or non-finite, an error will be thrown.
+ * If the sum of all weights is 0, then a uniform distribution will be returned.
+ * If the weight map assigned weight to nodes which are not in the node order, an error
+ * will be thrown.
+ */
+export function weightedDistribution(
+  nodeOrder: $ReadOnlyArray<NodeAddressT>,
+  weights: Map<NodeAddressT, number>
+): Distribution {
+  let totalWeight = 0;
+  for (const [address, weight] of weights.entries()) {
+    if (weight < 0 || !isFinite(weight)) {
+      throw new Error(
+        `Invalid weight ${weight} associated with address ${NodeAddress.toString(
+          address
+        )}`
+      );
+    }
+    totalWeight += weight;
+  }
+  if (totalWeight === 0) {
+    return uniformDistribution(nodeOrder.length);
+  }
+  let numEncounteredWeights = 0;
+  const distribution = new Float64Array(nodeOrder.length);
+  for (let i = 0; i < distribution.length; i++) {
+    const weight = weights.get(nodeOrder[i]);
+    if (weight != null) {
+      numEncounteredWeights++;
+      distribution[i] = weight / totalWeight;
+    }
+  }
+  if (numEncounteredWeights !== weights.size) {
+    throw new Error("weights included nodes not present in the nodeOrder");
+  }
+  return distribution;
+}
+
 export type NodeDistribution = Map<NodeAddressT, Probability>;

 export type NodeToConnections = Map<NodeAddressT, $ReadOnlyArray<Connection>>;
src/core/attribution/graphToMarkovChain.test.js
@@ -10,6 +10,7 @@ import {
   normalize,
   normalizeNeighbors,
   permute,
+  weightedDistribution,
 } from "./graphToMarkovChain";
 import * as MapUtil from "../../util/map";
@@ -288,4 +289,72 @@ describe("core/attribution/graphToMarkovChain", () => {
       );
     });
   });
+
+  describe("weightedDistribution", () => {
+    const a = NodeAddress.fromParts(["a"]);
+    const b = NodeAddress.fromParts(["b"]);
+    const c = NodeAddress.fromParts(["c"]);
+    const d = NodeAddress.fromParts(["d"]);
+    const order = () => [a, b, c, d];
+    it("gives a uniform distribution for an empty map", () => {
+      expect(weightedDistribution(order(), new Map())).toEqual(
+        new Float64Array([0.25, 0.25, 0.25, 0.25])
+      );
+    });
+    it("gives a uniform distribution for a map with 0 weight", () => {
+      const map = new Map().set(a, 0);
+      expect(weightedDistribution(order(), map)).toEqual(
+        new Float64Array([0.25, 0.25, 0.25, 0.25])
+      );
+    });
+    it("can put all weight on one node", () => {
+      const map = new Map().set(b, 0.1);
+      expect(weightedDistribution(order(), map)).toEqual(
+        new Float64Array([0, 1, 0, 0])
+      );
+    });
+    it("can split weight unequally", () => {
+      const map = new Map().set(b, 1).set(c, 3);
+      expect(weightedDistribution(order(), map)).toEqual(
+        new Float64Array([0, 0.25, 0.75, 0])
+      );
+    });
+    it("can create a uniform distribution if all weights are equal", () => {
+      const map = new Map()
+        .set(a, 1)
+        .set(b, 1)
+        .set(c, 1)
+        .set(d, 1);
+      expect(weightedDistribution(order(), map)).toEqual(
+        new Float64Array([0.25, 0.25, 0.25, 0.25])
+      );
+    });
+    describe("errors if", () => {
+      it("has a weighted node that is not in the order", () => {
+        const z = NodeAddress.fromParts(["z"]);
+        const map = new Map().set(z, 1);
+        expect(() => weightedDistribution(order(), map)).toThrowError(
+          "weights included nodes not present in the nodeOrder"
+        );
+      });
+      it("has a node with negative weight", () => {
+        const map = new Map().set(a, -1);
+        expect(() => weightedDistribution(order(), map)).toThrowError(
+          "Invalid weight -1"
+        );
+      });
+      it("has a node with NaN weight", () => {
+        const map = new Map().set(a, NaN);
+        expect(() => weightedDistribution(order(), map)).toThrowError(
+          "Invalid weight NaN"
+        );
+      });
+      it("has a node with infinite weight", () => {
+        const map = new Map().set(a, Infinity);
+        expect(() => weightedDistribution(order(), map)).toThrowError(
+          "Invalid weight Infinity"
+        );
+      });
+    });
+  });
 });
src/core/pagerankGraph.js
@@ -20,6 +20,7 @@ import {
   createConnections,
   createOrderedSparseMarkovChain,
   type EdgeWeight,
+  weightedDistribution,
 } from "./attribution/graphToMarkovChain";
 import {
   findStationaryDistribution,
@@ -75,10 +76,30 @@ export type PagerankOptions = {|
   // Maximum number of iterations before we give up on PageRank Convergence
   // Defaults to DEFAULT_MAX_ITERATIONS if not provided.
   +maxIterations?: number,

   // PageRank will stop running once the diff between the previous iteration
   // and the latest is less than this threshold.
   // Defaults to DEFAULT_CONVERGENCE_THRESHOLD if not provided.
   +convergenceThreshold?: number,
+
+  // Specifies a seed vector for PageRank "teleportation".
+  // At every step, some proportion `alpha` of the weight will
+  // teleport to the seed.
+  //
+  // The seed is specified as a map from node addresses to weights.
+  // The resultant seed will be a proper distribution over all the graph's available
+  // nodes, with each node's weight proportional to its weight in the seed. In the case
+  // that the total weight in the seed is 0 (e.g. an empty map was passed), then the
+  // seed vector will be a uniform distribution.
+  //
+  // Specifying any negative, NaN, or infinite weights is an error.
+  // Specifying weights for nodes that are not in the graph is also an error.
+  +seed?: Map<NodeAddressT, number>,
+
+  // Specifies the probability with which score 'teleports' to the seed vector.
+  // If alpha=0, then the teleportation never happens. If alpha=1, then PageRank
+  // always converges to precisely the seed vector. Defaults to DEFAULT_ALPHA.
+  +alpha?: number,
 |};

 export type PagerankConvergenceReport = {|
@@ -92,11 +113,17 @@ export type PagerankConvergenceReport = {|
 export const DEFAULT_SYNTHETIC_LOOP_WEIGHT = 1e-3;
 export const DEFAULT_MAX_ITERATIONS = 255;
 export const DEFAULT_CONVERGENCE_THRESHOLD = 1e-7;
+// TODO(@decentralion): Change default alpha to be a small non-zero value
+// once we choose an appropriate value.
+export const DEFAULT_ALPHA = 0;
+export const DEFAULT_SEED: () => Map<NodeAddressT, number> = () => new Map();

 function defaultOptions(): PagerankOptions {
   return {
     maxIterations: DEFAULT_MAX_ITERATIONS,
     convergenceThreshold: DEFAULT_CONVERGENCE_THRESHOLD,
+    alpha: DEFAULT_ALPHA,
+    seed: DEFAULT_SEED(),
   };
 }
@@ -441,8 +468,8 @@ export class PagerankGraph {
     const osmc = createOrderedSparseMarkovChain(connections);
     const params: PagerankParams = {
       chain: osmc.chain,
-      alpha: 0,
-      seed: uniformDistribution(osmc.chain.length),
+      alpha: fullOptions.alpha,
+      seed: weightedDistribution(osmc.nodeOrder, fullOptions.seed),
       pi0: uniformDistribution(osmc.chain.length),
     };
     const coreOptions: CorePagerankOptions = {
src/core/pagerankGraph.test.js
@@ -14,6 +14,8 @@ import {
   Direction,
   DEFAULT_MAX_ITERATIONS,
   DEFAULT_CONVERGENCE_THRESHOLD,
+  DEFAULT_ALPHA,
+  DEFAULT_SEED,
 } from "./pagerankGraph";
 import {advancedGraph} from "./graphTestUtil";
 import * as NullUtil from "../util/null";
@@ -500,11 +502,38 @@ describe("core/pagerankGraph", () => {
       pg3.runPagerank({
         maxIterations: DEFAULT_MAX_ITERATIONS,
         convergenceThreshold: DEFAULT_CONVERGENCE_THRESHOLD,
+        alpha: DEFAULT_ALPHA,
+        seed: DEFAULT_SEED(),
       });
       expect(pg1.equals(pg2)).toBe(true);
       expect(pg1.equals(pg3)).toBe(true);
     });

+    describe("alpha and seed parameters", () => {
+      // Seeded PageRank (and the generation of the seed distribution from
+      // weights) are both thoroughly unit-tested. These tests only sanity-check
+      // that the parameters are consumed properly, via easily tested properties.
+      it("seed is irrelevant if alpha is 0", async () => {
+        const pg1 = examplePagerankGraph();
+        const pg2 = examplePagerankGraph();
+        const {nodes} = advancedGraph();
+        const seed1 = new Map().set(nodes.src(), 1);
+        const seed2 = new Map().set(nodes.dst(), 1);
+        await pg1.runPagerank({seed: seed1, alpha: 0});
+        await pg2.runPagerank({seed: seed2, alpha: 0});
+        expect(pg1.equals(pg2)).toBe(true);
+      });
+
+      it("seed is returned directly if alpha is 1", async () => {
+        const pg = examplePagerankGraph();
+        const src = advancedGraph().nodes.src;
+        const seed = new Map().set(src(), 1);
+        await pg.runPagerank({seed, alpha: 1});
+        const score = NullUtil.get(pg.node(src())).score;
+        expect(score).toBe(1);
+      });
+    });
+
     it("promise rejects if the graph was modified", async () => {
       const pg = examplePagerankGraph();
       pg.graph().addNode(NodeAddress.empty);