Add `analysis/timeline/timelinePagerank`
As the name would suggest, this module allows computing timeline PageRank on a graph. See documentation in the module for details on the algorithm. Test plan: The module has incomplete testing. The timelinePagerank function calls out to iterators for getting the time-decayed node weights and the time-decayed markov chain; these iterators are tested. However, the wrapper logic that composes these pieces together, calculates seed vectors, and runs PageRank is not tested. I felt it would be a pain to test and settled for reviewing the code carefully, and putting a cautionary note at the top of the function.
This commit is contained in:
parent
cb236eff5d
commit
87720c4868
|
@ -0,0 +1,243 @@
|
||||||
|
// @flow
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Core logic for computing timeline PageRank on a graph.
|
||||||
|
*/
|
||||||
|
import {sum} from "d3-array";
|
||||||
|
import * as NullUtil from "../../util/null";
|
||||||
|
import {Graph, type NodeAddressT, type Edge, type Node} from "../../core/graph";
|
||||||
|
import {type NodeAndEdgeTypes} from "../types";
|
||||||
|
import {type Weights} from "../weights";
|
||||||
|
import {type Interval, partitionGraph} from "./interval";
|
||||||
|
import {
|
||||||
|
nodeWeightEvaluator,
|
||||||
|
edgeWeightEvaluator,
|
||||||
|
type NodeWeightEvaluator,
|
||||||
|
type EdgeWeightEvaluator,
|
||||||
|
} from "../weightEvaluator";
|
||||||
|
import {weightedDistribution} from "../../core/attribution/nodeDistribution";
|
||||||
|
import {type Distribution} from "../../core/attribution/distribution";
|
||||||
|
import {
|
||||||
|
createOrderedSparseMarkovChain,
|
||||||
|
createConnections,
|
||||||
|
} from "../../core/attribution/graphToMarkovChain";
|
||||||
|
import {
|
||||||
|
findStationaryDistribution,
|
||||||
|
type PagerankParams,
|
||||||
|
type SparseMarkovChain,
|
||||||
|
} from "../../core/attribution/markovChain";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Represents raw PageRank distributions on a graph over time.
|
||||||
|
*/
|
||||||
|
export type TimelineDistributions = $ReadOnlyArray<{|
|
||||||
|
// The interval for this slice
|
||||||
|
+interval: Interval,
|
||||||
|
// The total node weight within this interval (normalized to account for the
|
||||||
|
// exponential decay). We can use this to normalize the amount of cred
|
||||||
|
// created in this interval.
|
||||||
|
+intervalWeight: number,
|
||||||
|
// The raw score distribution over nodes for this interval (i.e. sums to 1).
|
||||||
|
// Uses the canonical graph node order.
|
||||||
|
+distribution: Distribution,
|
||||||
|
|}>;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Runs timeline PageRank on a graph.
|
||||||
|
*
|
||||||
|
* The graph is partitioned into weeklong intervals. For each interval, we run
|
||||||
|
* PageRank, assigning seed weight to nodes which have already been created,
|
||||||
|
* and considering edges which have already been created. Node weights and edge
|
||||||
|
* weights both decay by the `intervalDecay` every week.
|
||||||
|
*
|
||||||
|
* Detailed description:
|
||||||
|
*
|
||||||
|
* First, we partition the graph into week-long intervals using the `interval`
|
||||||
|
* module. For each such interval, we will produce a score distribution over
|
||||||
|
* all the nodes in the graph, and a total weight for the interval, based on
|
||||||
|
* the decayed weights of all the nodes that had been created as of that
|
||||||
|
* interval.
|
||||||
|
*
|
||||||
|
* To get the score distribution, we create a node weight map for each
|
||||||
|
* interval, and a markov chain for each interval.
|
||||||
|
*
|
||||||
|
* In the node weight map, every node is initially assigned full weight in the
|
||||||
|
* interval of its creation, and in every interval thereafter its weight decays
|
||||||
|
* by `intervalDecay`. However, note that the 'full weight' is normalized based
|
||||||
|
* on the interval decay. To make this concrete, suppose that we have an
|
||||||
|
* interval decay of 0.5, and a node with a base weight of 8. We want the total
|
||||||
|
* weight given to this node (across all intervals) to equal 8, so that
|
||||||
|
* weight-based cred normalization will be independent of the interval decay.
|
||||||
|
* So we give it a normalized weight of 4 in the first interval, 2 in the
|
||||||
|
* second, and so forth.
|
||||||
|
*
|
||||||
|
* For each interval, we also create a markov chain which describes the graph
|
||||||
|
* state as of that interval. That markov chain only contains weight for edges
|
||||||
|
* which were already created as of the interval; any edges that are not yet
|
||||||
|
* created effectively have a weight of 0. Like the node weights, edge weights
|
||||||
|
* are created with full weight, and then decay by `intervalDecay` each
|
||||||
|
* interval. Unlike the node weights, we don't need to normalize based on
|
||||||
|
* intervalDecay, since markov chains always normalize the edge weights.
|
||||||
|
*
|
||||||
|
* Given the markov chain and node weights for the interval, we compute the
|
||||||
|
* score distribution by running finding the stationary distribution for that
|
||||||
|
* markov chain, using the node weights as the seed vector. As ever, the
|
||||||
|
* `alpha` parameter determines how much probability mass returns to the seed
|
||||||
|
* vector. (See the docs in markovChain.js for more details.) The score
|
||||||
|
* distribution uses the Graph's canonical node ordering.
|
||||||
|
*
|
||||||
|
* The output array contains an element for each interval in the graph
|
||||||
|
* partitioning. Each element contains the interval, the distribution, as well
|
||||||
|
* as the sum of all node weights for that interval (so that we can normalize
|
||||||
|
* cred).
|
||||||
|
*
|
||||||
|
* The method is factored apart into a wrapper function which does validation,
|
||||||
|
* and iterators that produce the node weights and the markov chains. This
|
||||||
|
* facilitates testing the timelime logic separately. Finally, we compose all
|
||||||
|
* the pieces and run PageRank for each interval.
|
||||||
|
*/
|
||||||
|
export async function timelinePagerank(
|
||||||
|
graph: Graph,
|
||||||
|
types: NodeAndEdgeTypes,
|
||||||
|
weights: Weights,
|
||||||
|
intervalDecay: number,
|
||||||
|
alpha: number
|
||||||
|
): Promise<TimelineDistributions> {
|
||||||
|
if (intervalDecay < 0 || intervalDecay > 1 || !isFinite(intervalDecay)) {
|
||||||
|
throw new Error(`invalid intervalDecay: ${intervalDecay}`);
|
||||||
|
}
|
||||||
|
if (alpha < 0 || alpha > 1 || !isFinite(alpha)) {
|
||||||
|
throw new Error(`invalid alpha: ${alpha}`);
|
||||||
|
}
|
||||||
|
// Produce the evaluators we will use to get the baseline weight for each
|
||||||
|
// node and edge
|
||||||
|
const nodeEvaluator = nodeWeightEvaluator(types.nodeTypes, weights);
|
||||||
|
const edgeEvaluator = edgeWeightEvaluator(types.edgeTypes, weights);
|
||||||
|
|
||||||
|
const graphPartitionSlices = partitionGraph(graph);
|
||||||
|
if (graphPartitionSlices.length === 0) {
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
const intervals = graphPartitionSlices.map((x) => x.interval);
|
||||||
|
const nodeCreationHistory = graphPartitionSlices.map((x) => x.nodes);
|
||||||
|
const edgeCreationHistory = graphPartitionSlices.map((x) => x.edges);
|
||||||
|
const nodeOrder = Array.from(graph.nodes()).map((x) => x.address);
|
||||||
|
const nodeWeightIterator = _timelineNodeWeights(
|
||||||
|
nodeCreationHistory,
|
||||||
|
nodeEvaluator,
|
||||||
|
intervalDecay
|
||||||
|
);
|
||||||
|
const markovChainIterator = _timelineMarkovChain(
|
||||||
|
graph,
|
||||||
|
edgeCreationHistory,
|
||||||
|
edgeEvaluator,
|
||||||
|
intervalDecay
|
||||||
|
);
|
||||||
|
return _computeTimelineDistribution(
|
||||||
|
nodeOrder,
|
||||||
|
intervals,
|
||||||
|
nodeWeightIterator,
|
||||||
|
markovChainIterator,
|
||||||
|
alpha
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
export function* _timelineNodeWeights(
|
||||||
|
nodeCreationHistory: $ReadOnlyArray<$ReadOnlyArray<Node>>,
|
||||||
|
nodeEvaluator: NodeWeightEvaluator,
|
||||||
|
intervalDecay: number
|
||||||
|
): Iterator<Map<NodeAddressT, number>> {
|
||||||
|
let lastNodeWeights = new Map();
|
||||||
|
for (const nodes of nodeCreationHistory) {
|
||||||
|
const nodeWeights = new Map();
|
||||||
|
// Decay all the previous weights.
|
||||||
|
for (const [address, weight] of lastNodeWeights.entries()) {
|
||||||
|
nodeWeights.set(address, weight * intervalDecay);
|
||||||
|
}
|
||||||
|
// Add new nodes at full weight.
|
||||||
|
for (const {address} of nodes) {
|
||||||
|
// Normalize by (1 - intervalDecay) so that the total weight of a node across
|
||||||
|
// intervals converges to the full base weight
|
||||||
|
const normalizedWeight = nodeEvaluator(address) * (1 - intervalDecay);
|
||||||
|
nodeWeights.set(address, normalizedWeight);
|
||||||
|
}
|
||||||
|
yield nodeWeights;
|
||||||
|
lastNodeWeights = nodeWeights;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
export function* _timelineMarkovChain(
|
||||||
|
graph: Graph,
|
||||||
|
edgeCreationHistory: $ReadOnlyArray<$ReadOnlyArray<Edge>>,
|
||||||
|
edgeEvaluator: EdgeWeightEvaluator,
|
||||||
|
intervalDecay: number
|
||||||
|
): Iterator<SparseMarkovChain> {
|
||||||
|
const edgeWeights = new Map();
|
||||||
|
for (const edges of edgeCreationHistory) {
|
||||||
|
for (const [address, {forwards, backwards}] of edgeWeights.entries()) {
|
||||||
|
edgeWeights.set(address, {
|
||||||
|
forwards: forwards * intervalDecay,
|
||||||
|
backwards: backwards * intervalDecay,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
for (const {address} of edges) {
|
||||||
|
edgeWeights.set(address, edgeEvaluator(address));
|
||||||
|
}
|
||||||
|
const defaultEdgeWeight = Object.freeze({forwards: 0, backwards: 0});
|
||||||
|
const currentEdgeWeight = (e: Edge) => {
|
||||||
|
return NullUtil.orElse(edgeWeights.get(e.address), defaultEdgeWeight);
|
||||||
|
};
|
||||||
|
// Construct a new Markov chain corresponding to the current weights
|
||||||
|
// of the edges.
|
||||||
|
// TODO: Rather than constructing a markov chain from scratch, we can
|
||||||
|
// update the markov chain in-place. This should result in a significant
|
||||||
|
// performance improvement. We will need to change the markov chain
|
||||||
|
// representation to do so (we should add a `totalOutWeight` array to the
|
||||||
|
// chain, so that we can efficiently update the total weight as we add new
|
||||||
|
// connections, rather than needing to re-normalize the whole chain for
|
||||||
|
// each interval).
|
||||||
|
const chain = createOrderedSparseMarkovChain(
|
||||||
|
createConnections(graph, currentEdgeWeight, 1e-3)
|
||||||
|
).chain;
|
||||||
|
yield chain;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Warning: This function is untested.
|
||||||
|
// Modify with care.
|
||||||
|
export async function _computeTimelineDistribution(
|
||||||
|
nodeOrder: $ReadOnlyArray<NodeAddressT>,
|
||||||
|
intervals: $ReadOnlyArray<Interval>,
|
||||||
|
nodeWeightIterator: Iterator<Map<NodeAddressT, number>>,
|
||||||
|
markovChainIterator: Iterator<SparseMarkovChain>,
|
||||||
|
alpha: number
|
||||||
|
): Promise<TimelineDistributions> {
|
||||||
|
const results = [];
|
||||||
|
let pi0: Distribution | null = null;
|
||||||
|
for (const interval of intervals) {
|
||||||
|
const nodeWeights = NullUtil.get(nodeWeightIterator.next().value);
|
||||||
|
const chain = NullUtil.get(markovChainIterator.next().value);
|
||||||
|
|
||||||
|
const seed = weightedDistribution(nodeOrder, nodeWeights);
|
||||||
|
if (pi0 == null) {
|
||||||
|
pi0 = seed;
|
||||||
|
}
|
||||||
|
const params: PagerankParams = {chain, alpha, seed, pi0};
|
||||||
|
const distributionResult = await findStationaryDistribution(params, {
|
||||||
|
verbose: false,
|
||||||
|
convergenceThreshold: 1e-7,
|
||||||
|
maxIterations: 255,
|
||||||
|
yieldAfterMs: 30,
|
||||||
|
});
|
||||||
|
const intervalWeight = sum(nodeWeights.values());
|
||||||
|
results.push({
|
||||||
|
interval,
|
||||||
|
intervalWeight,
|
||||||
|
distribution: distributionResult.pi,
|
||||||
|
});
|
||||||
|
// Use the latest convergce results as the starting point for the next run
|
||||||
|
// of PageRank
|
||||||
|
pi0 = distributionResult.pi;
|
||||||
|
}
|
||||||
|
return results;
|
||||||
|
}
|
|
@ -0,0 +1,101 @@
|
||||||
|
// @flow
|
||||||
|
|
||||||
|
import {sum} from "d3-array";
|
||||||
|
import * as NullUtil from "../../util/null";
|
||||||
|
import {node, edge} from "../../core/graphTestUtil";
|
||||||
|
import {Graph, type EdgeAddressT, type Edge} from "../../core/graph";
|
||||||
|
import {_timelineNodeWeights, _timelineMarkovChain} from "./timelinePagerank";
|
||||||
|
import {
|
||||||
|
createConnections,
|
||||||
|
createOrderedSparseMarkovChain,
|
||||||
|
type EdgeWeight,
|
||||||
|
} from "../../core/attribution/graphToMarkovChain";
|
||||||
|
import {type SparseMarkovChain} from "../../core/attribution/markovChain";
|
||||||
|
|
||||||
|
describe("src/analysis/timeline/timelinePagerank", () => {
|
||||||
|
describe("_timelineNodeWeights", () => {
|
||||||
|
it("works in a simple case", () => {
|
||||||
|
const foo = node("foo");
|
||||||
|
const bar = node("bar");
|
||||||
|
const nodeCreationHistory = [[foo], [], [bar]];
|
||||||
|
const evaluator = (n) => (n === foo.address ? 4 : 1);
|
||||||
|
const weights = Array.from(
|
||||||
|
_timelineNodeWeights(nodeCreationHistory, evaluator, 0.5)
|
||||||
|
);
|
||||||
|
const expected = [
|
||||||
|
new Map([[foo.address, 2]]),
|
||||||
|
new Map([[foo.address, 1]]),
|
||||||
|
new Map([[foo.address, 0.5], [bar.address, 0.5]]),
|
||||||
|
];
|
||||||
|
expect(weights).toEqual(expected);
|
||||||
|
});
|
||||||
|
it("properly normalizes based on the interval decay", () => {
|
||||||
|
const foo = node("foo");
|
||||||
|
const history = new Array(100).fill([]);
|
||||||
|
history[0] = [foo];
|
||||||
|
const evaluator = (_) => 42;
|
||||||
|
const sumWeight = (decay: number) => {
|
||||||
|
const weightsIterator = _timelineNodeWeights(history, evaluator, decay);
|
||||||
|
return sum(weightsIterator, (x) => x.get(foo.address));
|
||||||
|
};
|
||||||
|
expect(sumWeight(0.5)).toBeCloseTo(42);
|
||||||
|
expect(sumWeight(0.9)).toBeCloseTo(42);
|
||||||
|
expect(sumWeight(0.1)).toBeCloseTo(42);
|
||||||
|
});
|
||||||
|
it("handles an empty history", () => {
|
||||||
|
const weights = _timelineNodeWeights([], (_) => 1, 0.5);
|
||||||
|
expect(Array.from(weights)).toHaveLength(0);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe("_timelineMarkovChain", () => {
|
||||||
|
it("works for a simple case", () => {
|
||||||
|
const a = node("a");
|
||||||
|
const b = node("b");
|
||||||
|
const e1 = edge("e1", a, b);
|
||||||
|
const e2 = edge("e2", a, a);
|
||||||
|
const graph = new Graph()
|
||||||
|
.addNode(a)
|
||||||
|
.addNode(b)
|
||||||
|
.addEdge(e1)
|
||||||
|
.addEdge(e2);
|
||||||
|
|
||||||
|
function weightsToChain(
|
||||||
|
w: Map<EdgeAddressT, EdgeWeight>
|
||||||
|
): SparseMarkovChain {
|
||||||
|
const edgeWeight = (e: Edge) =>
|
||||||
|
NullUtil.orElse(w.get(e.address), {forwards: 0, backwards: 0});
|
||||||
|
const connections = createConnections(graph, edgeWeight, 1e-3);
|
||||||
|
return createOrderedSparseMarkovChain(connections).chain;
|
||||||
|
}
|
||||||
|
|
||||||
|
const edgeCreationHistory = [[], [e1], [], [e2]];
|
||||||
|
const edgeEvaluator = (_) => ({forwards: 1, backwards: 0});
|
||||||
|
const chainIterator = _timelineMarkovChain(
|
||||||
|
graph,
|
||||||
|
edgeCreationHistory,
|
||||||
|
edgeEvaluator,
|
||||||
|
0.5
|
||||||
|
);
|
||||||
|
const chains = Array.from(chainIterator);
|
||||||
|
|
||||||
|
const w1 = new Map();
|
||||||
|
const chain1 = weightsToChain(w1);
|
||||||
|
expect(chains[0]).toEqual(chain1);
|
||||||
|
|
||||||
|
const w2 = new Map().set(e1.address, {forwards: 1, backwards: 0});
|
||||||
|
const chain2 = weightsToChain(w2);
|
||||||
|
expect(chains[1]).toEqual(chain2);
|
||||||
|
|
||||||
|
const w3 = new Map().set(e1.address, {forwards: 1 / 2, backwards: 0});
|
||||||
|
const chain3 = weightsToChain(w3);
|
||||||
|
expect(chains[2]).toEqual(chain3);
|
||||||
|
|
||||||
|
const w4 = new Map()
|
||||||
|
.set(e1.address, {forwards: 1 / 4, backwards: 0})
|
||||||
|
.set(e2.address, {forwards: 1, backwards: 0});
|
||||||
|
const chain4 = weightsToChain(w4);
|
||||||
|
expect(chains[3]).toEqual(chain4);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
});
|
Loading…
Reference in New Issue