Expose contributions structure of Markov chains (#490)

Summary:
When we convert a graph to a Markov chain, each cell in the transition
matrix is a sum of edge weights from the given `src` to the given `dst`,
plus the synthetic self-loop needed for stability. Performing this sum
loses information: given the transition matrix, a client cannot
determine how much a particular edge contributed to the score of a node
without redoing the relevant computations. In this commit, we expose the
structure of these contributions (i.e., edges and synthetic loops).

This changes the API of `graphToMarkovChain.js`, but it does not change
the resulting Markov chains. It also does not change the API of
`pagerank.js`. In particular, clients of `pagerank.js` will not have
access to the contributions structure that we have just created.

Test Plan:
Existing unit tests have been updated to use the new API, and pass
without change. An additional test is added for a newly exposed
function, even though it is also tested extensively by downstream
tests.

In one snapshot, one value changes from `0.25` to `0.25 + 1.7e-16`. The
other values in the enclosing distribution do not change, so I think
that it is more likely that this is due to floating-point instability
than an actual bug. (I’m not sure exactly where I commuted or
associated an operation, but it’s quite possible that I did so.) To
compensate, I added an additional check that the values in the
stationary distribution sum to `1.0` within `1e-9` tolerance; this check
passes.

wchargin-branch: expose-contributions
This commit is contained in:
William Chargin 2018-07-05 16:08:46 -07:00 committed by GitHub
parent 8921b5b942
commit 761a44c561
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 213 additions and 44 deletions

View File

@ -47,7 +47,7 @@ Array [
"parts": Array [
"loop",
],
"probability": 0.25,
"probability": 0.25000000000000017,
},
Object {
"parts": Array [

View File

@ -1,12 +1,48 @@
// @flow
import {type Edge, type Graph, type NodeAddressT, NodeAddress} from "../graph";
import {
type Edge,
type Graph,
type Neighbor,
type NodeAddressT,
NodeAddress,
} from "../graph";
import type {Distribution, SparseMarkovChain} from "./markovChain";
export type Probability = number;
export type Contributor =
| {|+type: "SYNTHETIC_LOOP"|}
| {|+type: "NEIGHBOR", +neighbor: Neighbor|};
export type Contribution = {|
+contributor: Contributor,
// This `weight` is a conditional probability: given that you're at
// the source of this contribution's contributor, what's the
// probability that you travel along this contribution to the target?
+weight: Probability,
|};
export function contributorSource(
target: NodeAddressT,
contributor: Contributor
) {
switch (contributor.type) {
case "SYNTHETIC_LOOP":
return target;
case "NEIGHBOR":
return contributor.neighbor.node;
default:
throw new Error((contributor.type: empty));
}
}
export type PagerankResult = Map<NodeAddressT, Probability>;
type AddressMapMarkovChain = Map<
export type NodeToContributions = Map<
NodeAddressT,
$ReadOnlyArray<Contribution>
>;
type NodeAddressMarkovChain = Map<
NodeAddressT,
/* in-neighbors */ Map<NodeAddressT, Probability>
>;
@ -21,63 +57,101 @@ export type EdgeWeight = {|
+froWeight: number, // weight from dst to src
|};
function graphToAddressMapMarkovChain(
export function createContributions(
graph: Graph,
edgeWeight: (Edge) => EdgeWeight,
selfLoopEdgeWeight: number
): AddressMapMarkovChain {
const inNeighbors: AddressMapMarkovChain = new Map();
syntheticLoopWeight: number
): NodeToContributions {
const result = new Map();
const totalOutWeight: Map<NodeAddressT, number> = new Map();
for (const node of graph.nodes()) {
inNeighbors.set(node, new Map());
result.set(node, []);
totalOutWeight.set(node, 0);
}
function moreWeight(src, dst, weight) {
const neighbors = inNeighbors.get(dst);
if (neighbors == null) {
function processContribution(
target: NodeAddressT,
contribution: Contribution
) {
const contributions = result.get(target);
if (contributions == null) {
// Should be impossible based on graph invariants.
throw new Error("missing dst: " + NodeAddress.toString(dst));
throw new Error("missing target: " + NodeAddress.toString(target));
}
neighbors.set(src, weight + (neighbors.get(src) || 0));
(((contributions: $ReadOnlyArray<Contribution>): any): Contribution[]).push(
contribution
);
const priorOutWeight = totalOutWeight.get(src);
const source = contributorSource(target, contribution.contributor);
const priorOutWeight = totalOutWeight.get(source);
if (priorOutWeight == null) {
// Should be impossible based on graph invariants.
throw new Error("missing src: " + NodeAddress.toString(src));
throw new Error("missing source: " + NodeAddress.toString(source));
}
totalOutWeight.set(src, priorOutWeight + weight);
totalOutWeight.set(source, priorOutWeight + contribution.weight);
}
// Add self-loops.
for (const node of graph.nodes()) {
moreWeight(node, node, selfLoopEdgeWeight);
processContribution(node, {
contributor: {type: "SYNTHETIC_LOOP"},
weight: syntheticLoopWeight,
});
}
// Process edges.
for (const edge of graph.edges()) {
const {toWeight, froWeight} = edgeWeight(edge);
const {src, dst} = edge;
moreWeight(src, dst, toWeight);
moreWeight(dst, src, froWeight);
processContribution(dst, {
contributor: {type: "NEIGHBOR", neighbor: {node: src, edge}},
weight: toWeight,
});
processContribution(src, {
contributor: {type: "NEIGHBOR", neighbor: {node: dst, edge}},
weight: froWeight,
});
}
// Normalize in-weights.
for (const neighbors of inNeighbors.values()) {
for (const [neighbor, weight] of neighbors.entries()) {
const normalization = totalOutWeight.get(neighbor);
for (const [target, contributions] of result.entries()) {
for (const contribution of contributions) {
const source = contributorSource(target, contribution.contributor);
const normalization = totalOutWeight.get(source);
if (normalization == null) {
// Should be impossible.
throw new Error("missing node: " + NodeAddress.toString(neighbor));
throw new Error("missing node: " + NodeAddress.toString(source));
}
neighbors.set(neighbor, weight / normalization);
const newWeight: typeof contribution.weight =
contribution.weight / normalization;
// (any-cast because property is not writable)
(contribution: any).weight = newWeight;
}
}
return inNeighbors;
return result;
}
function addressMapMarkovChainToOrderedSparseMarkovChain(
chain: AddressMapMarkovChain
function createNodeAddressMarkovChain(
ntc: NodeToContributions
): NodeAddressMarkovChain {
const result: NodeAddressMarkovChain = new Map();
for (const [target, contributions] of ntc.entries()) {
const inNeighbors = new Map();
result.set(target, inNeighbors);
for (const contribution of contributions) {
const source = contributorSource(target, contribution.contributor);
inNeighbors.set(
source,
contribution.weight + (inNeighbors.get(source) || 0)
);
}
}
return result;
}
function nodeAddressMarkovChainToOrderedSparseMarkovChain(
chain: NodeAddressMarkovChain
): OrderedSparseMarkovChain {
const nodeOrder = Array.from(chain.keys());
const addressToIndex: Map<NodeAddressT, number> = new Map();
@ -112,14 +186,11 @@ function addressMapMarkovChainToOrderedSparseMarkovChain(
};
}
export function graphToOrderedSparseMarkovChain(
graph: Graph,
edgeWeight: (Edge) => EdgeWeight,
selfLoopEdgeWeight: number
export function createOrderedSparseMarkovChain(
contributions: NodeToContributions
): OrderedSparseMarkovChain {
return addressMapMarkovChainToOrderedSparseMarkovChain(
graphToAddressMapMarkovChain(graph, edgeWeight, selfLoopEdgeWeight)
);
const chain = createNodeAddressMarkovChain(contributions);
return nodeAddressMarkovChainToOrderedSparseMarkovChain(chain);
}
/**
@ -147,7 +218,10 @@ export function permute(
);
newChain.push({neighbor: newNeighbors, weight});
}
return {nodeOrder: newOrder, chain: newChain};
return {
nodeOrder: newOrder,
chain: newChain,
};
}
/**

View File

@ -1,9 +1,12 @@
// @flow
import sortBy from "lodash.sortby";
import {EdgeAddress, Graph, NodeAddress} from "../graph";
import {
distributionToPagerankResult,
graphToOrderedSparseMarkovChain,
createContributions,
createOrderedSparseMarkovChain,
normalize,
normalizeNeighbors,
permute,
@ -77,14 +80,96 @@ describe("core/attribution/graphToMarkovChain", () => {
expect(actual).toEqual(expected);
});
describe("graphToOrderedSparseMarkovChain", () => {
describe("createContributions", () => {
// The tests for `createOrderedSparseMarkovChain` also must invoke
// `createContributions`, so we add only light testing separately.
it("works on a simple asymmetric chain", () => {
const n1 = NodeAddress.fromParts(["n1"]);
const n2 = NodeAddress.fromParts(["n2"]);
const n3 = NodeAddress.fromParts(["sink"]);
const e1 = {src: n1, dst: n2, address: EdgeAddress.fromParts(["e1"])};
const e2 = {src: n2, dst: n3, address: EdgeAddress.fromParts(["e2"])};
const e3 = {src: n1, dst: n3, address: EdgeAddress.fromParts(["e3"])};
const e4 = {src: n3, dst: n3, address: EdgeAddress.fromParts(["e4"])};
const g = new Graph()
.addNode(n1)
.addNode(n2)
.addNode(n3)
.addEdge(e1)
.addEdge(e2)
.addEdge(e3)
.addEdge(e4);
const edgeWeight = () => ({toWeight: 6.0, froWeight: 3.0});
const actual = createContributions(g, edgeWeight, 1.0);
// Total out-weights (for normalization factors):
// - for `n1`: 2 out, 0 in, 1 synthetic: 12 + 0 + 1 = 13
// - for `n2`: 1 out, 1 in, 1 synthetic: 6 + 3 + 1 = 10
// - for `n3`: 1 out, 3 in, 1 synthetic: 6 + 9 + 1 = 16
const expected = new Map()
.set(n1, [
{contributor: {type: "SYNTHETIC_LOOP"}, weight: 1 / 13},
{
contributor: {type: "NEIGHBOR", neighbor: {node: n2, edge: e1}},
weight: 3 / 10,
},
{
contributor: {type: "NEIGHBOR", neighbor: {node: n3, edge: e3}},
weight: 3 / 16,
},
])
.set(n2, [
{contributor: {type: "SYNTHETIC_LOOP"}, weight: 1 / 10},
{
contributor: {type: "NEIGHBOR", neighbor: {node: n1, edge: e1}},
weight: 6 / 13,
},
{
contributor: {type: "NEIGHBOR", neighbor: {node: n3, edge: e2}},
weight: 3 / 16,
},
])
.set(n3, [
{contributor: {type: "SYNTHETIC_LOOP"}, weight: 1 / 16},
{
contributor: {type: "NEIGHBOR", neighbor: {node: n2, edge: e2}},
weight: 6 / 10,
},
{
contributor: {type: "NEIGHBOR", neighbor: {node: n1, edge: e3}},
weight: 6 / 13,
},
{
contributor: {type: "NEIGHBOR", neighbor: {node: n3, edge: e4}},
// this loop, as an out-edge
weight: 3 / 16,
},
{
contributor: {type: "NEIGHBOR", neighbor: {node: n3, edge: e4}},
// this loop, as an in-edge
weight: 6 / 16,
},
]);
const canonicalize = (map) =>
new Map(
Array.from(map.entries()).map(([k, v]) => [
k,
sortBy(v, (x) => JSON.stringify(x)),
])
);
expect(canonicalize(actual)).toEqual(canonicalize(expected));
});
});
describe("createOrderedSparseMarkovChain", () => {
it("works on a trivial one-node chain with no edge", () => {
const n = NodeAddress.fromParts(["foo"]);
const g = new Graph().addNode(n);
const edgeWeight = (_unused_edge) => {
throw new Error("Don't even look at me");
};
const osmc = graphToOrderedSparseMarkovChain(g, edgeWeight, 1e-3);
const osmc = createOrderedSparseMarkovChain(
createContributions(g, edgeWeight, 1e-3)
);
const expected = {
nodeOrder: [n],
chain: [
@ -94,7 +179,7 @@ describe("core/attribution/graphToMarkovChain", () => {
expect(normalize(osmc)).toEqual(normalize(expected));
});
it("works on a simple asymmetric two-node chain", () => {
it("works on a simple asymmetric chain", () => {
const n1 = NodeAddress.fromParts(["n1"]);
const n2 = NodeAddress.fromParts(["n2"]);
const n3 = NodeAddress.fromParts(["sink"]);
@ -111,7 +196,9 @@ describe("core/attribution/graphToMarkovChain", () => {
.addEdge(e3)
.addEdge(e4);
const edgeWeight = () => ({toWeight: 1, froWeight: 0});
const osmc = graphToOrderedSparseMarkovChain(g, edgeWeight, 0.0);
const osmc = createOrderedSparseMarkovChain(
createContributions(g, edgeWeight, 0.0)
);
const expected = {
nodeOrder: [n1, n2, n3],
chain: [
@ -147,7 +234,9 @@ describe("core/attribution/graphToMarkovChain", () => {
.addEdge(e2)
.addEdge(e3);
const edgeWeight = () => ({toWeight: 1, froWeight: 1});
const osmc = graphToOrderedSparseMarkovChain(g, edgeWeight, 0.0);
const osmc = createOrderedSparseMarkovChain(
createContributions(g, edgeWeight, 0.0)
);
const expected = {
nodeOrder: [n1, n2, n3],
chain: [
@ -177,7 +266,9 @@ describe("core/attribution/graphToMarkovChain", () => {
// arithmetic simple.
return {toWeight: 4 - epsilon / 2, froWeight: 1 - epsilon / 2};
}
const osmc = graphToOrderedSparseMarkovChain(g, edgeWeight, epsilon);
const osmc = createOrderedSparseMarkovChain(
createContributions(g, edgeWeight, epsilon)
);
// Edges from `src`:
// - to `src` with weight `epsilon`
// - to `dst` with weight `4 - epsilon / 2`

View File

@ -4,7 +4,8 @@ import {type Edge, Graph} from "../graph";
import {
type PagerankResult,
distributionToPagerankResult,
graphToOrderedSparseMarkovChain,
createContributions,
createOrderedSparseMarkovChain,
type EdgeWeight,
} from "./graphToMarkovChain";
@ -39,11 +40,12 @@ export function pagerank(
...defaultOptions(),
...(options || {}),
};
const osmc = graphToOrderedSparseMarkovChain(
const contributions = createContributions(
graph,
edgeWeight,
fullOptions.selfLoopWeight
);
const osmc = createOrderedSparseMarkovChain(contributions);
const distribution = findStationaryDistribution(osmc.chain, {
verbose: fullOptions.verbose,
convergenceThreshold: fullOptions.convergenceThreshold,

View File

@ -5,6 +5,8 @@ import {NodeAddress} from "../graph";
import {advancedGraph} from "../graphTestUtil";
function snapshotPagerankResult(result) {
const prTotal = Array.from(result.values()).reduce((a, b) => a + b, 0);
expect(prTotal).toBeCloseTo(1.0, 1e-9);
const partsToProbability = [];
const sortedKeys = Array.from(result.keys()).sort();
for (const key of sortedKeys) {