Expose contributions structure of Markov chains (#490)

Summary:
When we convert a graph to a Markov chain, each cell in the transition
matrix is a sum of edge weights from the given `src` to the given `dst`,
plus the synthetic self-loop needed for stability. Performing this sum
loses information: given the transition matrix, a client cannot
determine how much a particular edge contributed to the score of a node
without redoing the relevant computations. In this commit, we expose the
structure of these contributions (i.e., edges and synthetic loops).

This changes the API of `graphToMarkovChain.js`, but it does not change
the resulting Markov chains. It also does not change the API of
`pagerank.js`. In particular, clients of `pagerank.js` will not have
access to the contributions structure that we have just created.

Test Plan:
Existing unit tests have been updated to use the new API, and pass
without change. An additional test is added for a newly exposed
function, even though it is also tested extensively by downstream
tests.

In one snapshot, one value changes from `0.25` to `0.25 + 1.7e-16`. The
other values in the enclosing distribution do not change, so I think
that it is more likely that this is due to floating-point instability
than an actual bug. (I’m not sure exactly where I commuted or
associated an operation, but it’s quite possible that I did so.) To
compensate, I added an additional check that the values in the
stationary distribution sum to `1.0` within `1e-9` tolerance; this check
passes.

wchargin-branch: expose-contributions
This commit is contained in:
William Chargin 2018-07-05 16:08:46 -07:00 committed by GitHub
parent 8921b5b942
commit 761a44c561
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 213 additions and 44 deletions

View File

@ -47,7 +47,7 @@ Array [
"parts": Array [
"loop",
],
"probability": 0.25,
"probability": 0.25000000000000017,
},
Object {
"parts": Array [

View File

@ -1,12 +1,48 @@
// @flow
import {type Edge, type Graph, type NodeAddressT, NodeAddress} from "../graph";
import {
type Edge,
type Graph,
type Neighbor,
type NodeAddressT,
NodeAddress,
} from "../graph";
import type {Distribution, SparseMarkovChain} from "./markovChain";
export type Probability = number;
export type Contributor =
| {|+type: "SYNTHETIC_LOOP"|}
| {|+type: "NEIGHBOR", +neighbor: Neighbor|};
export type Contribution = {|
+contributor: Contributor,
// This `weight` is a conditional probability: given that you're at
// the source of this contribution's contributor, what's the
// probability that you travel along this contribution to the target?
+weight: Probability,
|};
export function contributorSource(
target: NodeAddressT,
contributor: Contributor
) {
switch (contributor.type) {
case "SYNTHETIC_LOOP":
return target;
case "NEIGHBOR":
return contributor.neighbor.node;
default:
throw new Error((contributor.type: empty));
}
}
export type PagerankResult = Map<NodeAddressT, Probability>;
type AddressMapMarkovChain = Map<
export type NodeToContributions = Map<
NodeAddressT,
$ReadOnlyArray<Contribution>
>;
type NodeAddressMarkovChain = Map<
NodeAddressT,
/* in-neighbors */ Map<NodeAddressT, Probability>
>;
@ -21,63 +57,101 @@ export type EdgeWeight = {|
+froWeight: number, // weight from dst to src
|};
function graphToAddressMapMarkovChain(
export function createContributions(
graph: Graph,
edgeWeight: (Edge) => EdgeWeight,
selfLoopEdgeWeight: number
): AddressMapMarkovChain {
const inNeighbors: AddressMapMarkovChain = new Map();
syntheticLoopWeight: number
): NodeToContributions {
const result = new Map();
const totalOutWeight: Map<NodeAddressT, number> = new Map();
for (const node of graph.nodes()) {
inNeighbors.set(node, new Map());
result.set(node, []);
totalOutWeight.set(node, 0);
}
function moreWeight(src, dst, weight) {
const neighbors = inNeighbors.get(dst);
if (neighbors == null) {
function processContribution(
target: NodeAddressT,
contribution: Contribution
) {
const contributions = result.get(target);
if (contributions == null) {
// Should be impossible based on graph invariants.
throw new Error("missing dst: " + NodeAddress.toString(dst));
throw new Error("missing target: " + NodeAddress.toString(target));
}
neighbors.set(src, weight + (neighbors.get(src) || 0));
(((contributions: $ReadOnlyArray<Contribution>): any): Contribution[]).push(
contribution
);
const priorOutWeight = totalOutWeight.get(src);
const source = contributorSource(target, contribution.contributor);
const priorOutWeight = totalOutWeight.get(source);
if (priorOutWeight == null) {
// Should be impossible based on graph invariants.
throw new Error("missing src: " + NodeAddress.toString(src));
throw new Error("missing source: " + NodeAddress.toString(source));
}
totalOutWeight.set(src, priorOutWeight + weight);
totalOutWeight.set(source, priorOutWeight + contribution.weight);
}
// Add self-loops.
for (const node of graph.nodes()) {
moreWeight(node, node, selfLoopEdgeWeight);
processContribution(node, {
contributor: {type: "SYNTHETIC_LOOP"},
weight: syntheticLoopWeight,
});
}
// Process edges.
for (const edge of graph.edges()) {
const {toWeight, froWeight} = edgeWeight(edge);
const {src, dst} = edge;
moreWeight(src, dst, toWeight);
moreWeight(dst, src, froWeight);
processContribution(dst, {
contributor: {type: "NEIGHBOR", neighbor: {node: src, edge}},
weight: toWeight,
});
processContribution(src, {
contributor: {type: "NEIGHBOR", neighbor: {node: dst, edge}},
weight: froWeight,
});
}
// Normalize in-weights.
for (const neighbors of inNeighbors.values()) {
for (const [neighbor, weight] of neighbors.entries()) {
const normalization = totalOutWeight.get(neighbor);
for (const [target, contributions] of result.entries()) {
for (const contribution of contributions) {
const source = contributorSource(target, contribution.contributor);
const normalization = totalOutWeight.get(source);
if (normalization == null) {
// Should be impossible.
throw new Error("missing node: " + NodeAddress.toString(neighbor));
throw new Error("missing node: " + NodeAddress.toString(source));
}
neighbors.set(neighbor, weight / normalization);
const newWeight: typeof contribution.weight =
contribution.weight / normalization;
// (any-cast because property is not writable)
(contribution: any).weight = newWeight;
}
}
return inNeighbors;
return result;
}
function addressMapMarkovChainToOrderedSparseMarkovChain(
chain: AddressMapMarkovChain
function createNodeAddressMarkovChain(
ntc: NodeToContributions
): NodeAddressMarkovChain {
const result: NodeAddressMarkovChain = new Map();
for (const [target, contributions] of ntc.entries()) {
const inNeighbors = new Map();
result.set(target, inNeighbors);
for (const contribution of contributions) {
const source = contributorSource(target, contribution.contributor);
inNeighbors.set(
source,
contribution.weight + (inNeighbors.get(source) || 0)
);
}
}
return result;
}
function nodeAddressMarkovChainToOrderedSparseMarkovChain(
chain: NodeAddressMarkovChain
): OrderedSparseMarkovChain {
const nodeOrder = Array.from(chain.keys());
const addressToIndex: Map<NodeAddressT, number> = new Map();
@ -112,14 +186,11 @@ function addressMapMarkovChainToOrderedSparseMarkovChain(
};
}
export function graphToOrderedSparseMarkovChain(
graph: Graph,
edgeWeight: (Edge) => EdgeWeight,
selfLoopEdgeWeight: number
export function createOrderedSparseMarkovChain(
contributions: NodeToContributions
): OrderedSparseMarkovChain {
return addressMapMarkovChainToOrderedSparseMarkovChain(
graphToAddressMapMarkovChain(graph, edgeWeight, selfLoopEdgeWeight)
);
const chain = createNodeAddressMarkovChain(contributions);
return nodeAddressMarkovChainToOrderedSparseMarkovChain(chain);
}
/**
@ -147,7 +218,10 @@ export function permute(
);
newChain.push({neighbor: newNeighbors, weight});
}
return {nodeOrder: newOrder, chain: newChain};
return {
nodeOrder: newOrder,
chain: newChain,
};
}
/**

View File

@ -1,9 +1,12 @@
// @flow
import sortBy from "lodash.sortby";
import {EdgeAddress, Graph, NodeAddress} from "../graph";
import {
distributionToPagerankResult,
graphToOrderedSparseMarkovChain,
createContributions,
createOrderedSparseMarkovChain,
normalize,
normalizeNeighbors,
permute,
@ -77,14 +80,96 @@ describe("core/attribution/graphToMarkovChain", () => {
expect(actual).toEqual(expected);
});
describe("graphToOrderedSparseMarkovChain", () => {
describe("createContributions", () => {
// The tests for `createOrderedSparseMarkovChain` also must invoke
// `createContributions`, so we add only light testing separately.
it("works on a simple asymmetric chain", () => {
const n1 = NodeAddress.fromParts(["n1"]);
const n2 = NodeAddress.fromParts(["n2"]);
const n3 = NodeAddress.fromParts(["sink"]);
const e1 = {src: n1, dst: n2, address: EdgeAddress.fromParts(["e1"])};
const e2 = {src: n2, dst: n3, address: EdgeAddress.fromParts(["e2"])};
const e3 = {src: n1, dst: n3, address: EdgeAddress.fromParts(["e3"])};
const e4 = {src: n3, dst: n3, address: EdgeAddress.fromParts(["e4"])};
const g = new Graph()
.addNode(n1)
.addNode(n2)
.addNode(n3)
.addEdge(e1)
.addEdge(e2)
.addEdge(e3)
.addEdge(e4);
const edgeWeight = () => ({toWeight: 6.0, froWeight: 3.0});
const actual = createContributions(g, edgeWeight, 1.0);
// Total out-weights (for normalization factors):
// - for `n1`: 2 out, 0 in, 1 synthetic: 12 + 0 + 1 = 13
// - for `n2`: 1 out, 1 in, 1 synthetic: 6 + 3 + 1 = 10
// - for `n3`: 1 out, 3 in, 1 synthetic: 6 + 9 + 1 = 16
const expected = new Map()
.set(n1, [
{contributor: {type: "SYNTHETIC_LOOP"}, weight: 1 / 13},
{
contributor: {type: "NEIGHBOR", neighbor: {node: n2, edge: e1}},
weight: 3 / 10,
},
{
contributor: {type: "NEIGHBOR", neighbor: {node: n3, edge: e3}},
weight: 3 / 16,
},
])
.set(n2, [
{contributor: {type: "SYNTHETIC_LOOP"}, weight: 1 / 10},
{
contributor: {type: "NEIGHBOR", neighbor: {node: n1, edge: e1}},
weight: 6 / 13,
},
{
contributor: {type: "NEIGHBOR", neighbor: {node: n3, edge: e2}},
weight: 3 / 16,
},
])
.set(n3, [
{contributor: {type: "SYNTHETIC_LOOP"}, weight: 1 / 16},
{
contributor: {type: "NEIGHBOR", neighbor: {node: n2, edge: e2}},
weight: 6 / 10,
},
{
contributor: {type: "NEIGHBOR", neighbor: {node: n1, edge: e3}},
weight: 6 / 13,
},
{
contributor: {type: "NEIGHBOR", neighbor: {node: n3, edge: e4}},
// this loop, as an out-edge
weight: 3 / 16,
},
{
contributor: {type: "NEIGHBOR", neighbor: {node: n3, edge: e4}},
// this loop, as an in-edge
weight: 6 / 16,
},
]);
const canonicalize = (map) =>
new Map(
Array.from(map.entries()).map(([k, v]) => [
k,
sortBy(v, (x) => JSON.stringify(x)),
])
);
expect(canonicalize(actual)).toEqual(canonicalize(expected));
});
});
describe("createOrderedSparseMarkovChain", () => {
it("works on a trivial one-node chain with no edge", () => {
const n = NodeAddress.fromParts(["foo"]);
const g = new Graph().addNode(n);
const edgeWeight = (_unused_edge) => {
throw new Error("Don't even look at me");
};
const osmc = graphToOrderedSparseMarkovChain(g, edgeWeight, 1e-3);
const osmc = createOrderedSparseMarkovChain(
createContributions(g, edgeWeight, 1e-3)
);
const expected = {
nodeOrder: [n],
chain: [
@ -94,7 +179,7 @@ describe("core/attribution/graphToMarkovChain", () => {
expect(normalize(osmc)).toEqual(normalize(expected));
});
it("works on a simple asymmetric two-node chain", () => {
it("works on a simple asymmetric chain", () => {
const n1 = NodeAddress.fromParts(["n1"]);
const n2 = NodeAddress.fromParts(["n2"]);
const n3 = NodeAddress.fromParts(["sink"]);
@ -111,7 +196,9 @@ describe("core/attribution/graphToMarkovChain", () => {
.addEdge(e3)
.addEdge(e4);
const edgeWeight = () => ({toWeight: 1, froWeight: 0});
const osmc = graphToOrderedSparseMarkovChain(g, edgeWeight, 0.0);
const osmc = createOrderedSparseMarkovChain(
createContributions(g, edgeWeight, 0.0)
);
const expected = {
nodeOrder: [n1, n2, n3],
chain: [
@ -147,7 +234,9 @@ describe("core/attribution/graphToMarkovChain", () => {
.addEdge(e2)
.addEdge(e3);
const edgeWeight = () => ({toWeight: 1, froWeight: 1});
const osmc = graphToOrderedSparseMarkovChain(g, edgeWeight, 0.0);
const osmc = createOrderedSparseMarkovChain(
createContributions(g, edgeWeight, 0.0)
);
const expected = {
nodeOrder: [n1, n2, n3],
chain: [
@ -177,7 +266,9 @@ describe("core/attribution/graphToMarkovChain", () => {
// arithmetic simple.
return {toWeight: 4 - epsilon / 2, froWeight: 1 - epsilon / 2};
}
const osmc = graphToOrderedSparseMarkovChain(g, edgeWeight, epsilon);
const osmc = createOrderedSparseMarkovChain(
createContributions(g, edgeWeight, epsilon)
);
// Edges from `src`:
// - to `src` with weight `epsilon`
// - to `dst` with weight `4 - epsilon / 2`

View File

@ -4,7 +4,8 @@ import {type Edge, Graph} from "../graph";
import {
type PagerankResult,
distributionToPagerankResult,
graphToOrderedSparseMarkovChain,
createContributions,
createOrderedSparseMarkovChain,
type EdgeWeight,
} from "./graphToMarkovChain";
@ -39,11 +40,12 @@ export function pagerank(
...defaultOptions(),
...(options || {}),
};
const osmc = graphToOrderedSparseMarkovChain(
const contributions = createContributions(
graph,
edgeWeight,
fullOptions.selfLoopWeight
);
const osmc = createOrderedSparseMarkovChain(contributions);
const distribution = findStationaryDistribution(osmc.chain, {
verbose: fullOptions.verbose,
convergenceThreshold: fullOptions.convergenceThreshold,

View File

@ -5,6 +5,8 @@ import {NodeAddress} from "../graph";
import {advancedGraph} from "../graphTestUtil";
function snapshotPagerankResult(result) {
const prTotal = Array.from(result.values()).reduce((a, b) => a + b, 0);
expect(prTotal).toBeCloseTo(1.0, 1e-9);
const partsToProbability = [];
const sortedKeys = Array.from(result.keys()).sort();
for (const key of sortedKeys) {