Factor out _intervalResult in timelinePagerank (#1799)
As part of the work for #1773, I want to extend the logic that computes individual time slices of PageRank scores, so that we can trace how much score flowed along each individual edge. That would mean adding complexity to the _computeTimelineDistribution function; however, that function is an untested wrapper, so I'm hesitant to grow it directly. Instead, I'm first factoring out an _intervalResult method and a corresponding type, which computes the scores for a given timeline interval, and I've added sanity-checking tests for it. In a follow-on commit, I'll add the logic for tracking edge-level score flows.

Test plan: This is a pure refactor, maintaining existing behavior and adding tests. `yarn test --full` passes. Since our sharness tests do a full load (including timeline cred computation) on realistic data from GitHub, this gives us confidence that cred semantics haven't changed.
parent 71c0b0d66d
commit 27aeacc906
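For orientation, here is a sketch of the control flow after this refactor, assembled from the hunks below. The loop structure and bindings are simplified from the real _computeTimelineDistribution (which pulls nodeWeights and nodeToConnections from iterators inside an async function), so treat it as illustrative rather than exact:

    let pi0 = null;
    const results = [];
    for (const interval of intervals) {
      // Each time slice is now computed by the extracted, testable helper.
      const result = await _intervalResult(
        nodeWeights,
        nodeToConnections,
        nodeOrder,
        interval,
        pi0,
        alpha
      );
      results.push(result);
      // The slice's converged distribution seeds the next slice's PageRank
      // run, so successive intervals converge quickly.
      pi0 = result.distribution;
    }
    return results;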
src/core/algorithm/timelinePagerank.js

@@ -24,10 +24,7 @@ import {
 } from "./graphToMarkovChain";
 import {findStationaryDistribution, type PagerankParams} from "./markovChain";
 
-/**
- * Represents raw PageRank distributions on a graph over time.
- */
-export type TimelineDistributions = $ReadOnlyArray<{|
+export type IntervalResult = {|
   // The interval for this slice
   +interval: Interval,
   // The total node weight within this interval (normalized to account for the
@@ -37,7 +34,11 @@ export type TimelineDistributions = $ReadOnlyArray<{|
   // The raw score distribution over nodes for this interval (i.e. sums to 1).
   // Uses the canonical graph node order.
   +distribution: Distribution,
-|}>;
+|};
+/**
+ * Represents raw PageRank distributions on a graph over time.
+ */
+export type TimelineDistributions = $ReadOnlyArray<IntervalResult>;
 
 export const SYNTHETIC_LOOP_WEIGHT = 1e-3;
 
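To make the new type concrete, here is a hypothetical IntervalResult value. The Float64Array is an assumption about the Distribution type exported by the markovChain module, and all numbers are invented for illustration:

    // Illustrative only; not data from the codebase.
    const example: IntervalResult = {
      interval: {startTimeMs: 0, endTimeMs: 604800000}, // a one-week slice
      intervalWeight: 3, // total (decayed) node weight in this slice
      // One entry per node, in canonical node order; entries sum to 1.
      distribution: new Float64Array([0.5, 0.25, 0.25]),
    };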
@@ -206,6 +207,30 @@ export async function _computeTimelineDistribution(
     const nodeToConnections = NullUtil.get(
       nodeToConnectionsIterator.next().value
     );
+    const result = await _intervalResult(
+      nodeWeights,
+      nodeToConnections,
+      nodeOrder,
+      interval,
+      pi0,
+      alpha
+    );
+    results.push(result);
+    // Use the latest convergence results as the starting point for the next run
+    // of PageRank
+    pi0 = result.distribution;
+  }
+  return results;
+}
+
+export async function _intervalResult(
+  nodeWeights: Map<NodeAddressT, number>,
+  nodeToConnections: NodeToConnections,
+  nodeOrder: $ReadOnlyArray<NodeAddressT>,
+  interval: Interval,
+  pi0: Distribution | null,
+  alpha: number
+): Promise<IntervalResult> {
   const {chain} = createOrderedSparseMarkovChain(nodeToConnections);
 
   const seed = weightedDistribution(nodeOrder, nodeWeights);
@@ -220,14 +245,9 @@ export async function _computeTimelineDistribution(
     yieldAfterMs: 30,
   });
   const intervalWeight = sum(nodeWeights.values());
-  results.push({
+  return {
     interval,
     intervalWeight,
     distribution: distributionResult.pi,
-  });
-  // Use the latest convergce results as the starting point for the next run
-  // of PageRank
-  pi0 = distributionResult.pi;
-  }
-  return results;
 }
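Since TimelineDistributions is now just $ReadOnlyArray<IntervalResult>, the array's shape is unchanged and downstream consumers are unaffected. A hypothetical consumer, to show how the fields relate (the rescaling comment reflects my reading of timeline cred, not code in this diff):

    for (const {interval, intervalWeight, distribution} of timelineDistributions) {
      // distribution[i] is the raw score of nodeOrder[i]; each slice sums to 1.
      // Rescaling by intervalWeight spreads the slice's total node weight
      // across nodes in proportion to their scores.
      const credForNode0 = distribution[0] * intervalWeight;
      console.log(interval.startTimeMs, credForNode0);
    }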
src/core/algorithm/timelinePagerank.test.js

@@ -2,12 +2,13 @@
 
 import {sum} from "d3-array";
 import * as NullUtil from "../../util/null";
-import {node, edge} from "../graphTestUtil";
+import {node, edge, advancedGraph} from "../graphTestUtil";
 import {Graph, type EdgeAddressT, type Edge} from "../graph";
 import {
   _timelineNodeWeights,
   _timelineNodeToConnections,
   SYNTHETIC_LOOP_WEIGHT,
+  _intervalResult,
 } from "./timelinePagerank";
 import {
   createConnections,
@@ -107,4 +108,67 @@ describe("src/core/algorithm/timelinePagerank", () => {
       expect(chains[3]).toEqual(chain4);
     });
   });
+
+  describe("_intervalResult", () => {
+    async function example() {
+      const {graph1, nodes} = advancedGraph();
+      const g = graph1();
+      const nodeWeights = new Map()
+        .set(nodes.src.address, 1)
+        .set(nodes.isolated.address, 2);
+      const edgeFn = (_unused_edge) => ({forwards: 1, backwards: 0.5});
+      const nodeToConnections = createConnections(g, edgeFn, 1e-3);
+      const nodeOrder = Array.from(g.nodes()).map((x) => x.address);
+      const interval = {endTimeMs: 1000, startTimeMs: 0};
+      const pi0 = null;
+      const alpha = 0.05;
+      const result = await _intervalResult(
+        nodeWeights,
+        nodeToConnections,
+        nodeOrder,
+        interval,
+        pi0,
+        alpha
+      );
+      return {
+        graph: g,
+        nodes,
+        nodeOrder,
+        nodeWeights,
+        edgeFn,
+        nodeToConnections,
+        interval,
+        pi0,
+        alpha,
+        result,
+      };
+    }
+    it("passes through the interval", async () => {
+      const {result, interval} = await example();
+      expect(result.interval).toEqual(interval);
+    });
+    it("computes the summed nodeWeight", async () => {
+      const {result, nodeWeights} = await example();
+      const actualIntervalWeight = sum(nodeWeights.values());
+      expect(result.intervalWeight).toEqual(actualIntervalWeight);
+    });
+    it("produces sane score distribution on an example graph", async () => {
+      const {result, nodes, nodeOrder} = await example();
+      function getScore(a) {
+        const idx = nodeOrder.indexOf(a.address);
+        if (idx === -1) {
+          throw new Error("bad address");
+        }
+        return result.distribution[idx];
+      }
+      const isoScore = getScore(nodes.isolated);
+      const srcScore = getScore(nodes.src);
+      const dstScore = getScore(nodes.dst);
+      expect(isoScore + srcScore + dstScore).toBeCloseTo(1);
+      // The isolated node has 2/3 of the seed weight and no edges, so its
+      // score is simply its seed share
+      expect(isoScore).toBeCloseTo(2 / 3);
+      // src has seed weight and dst doesn't, so src should score higher
+      expect(srcScore).toBeGreaterThan(dstScore);
+    });
+  });
 });
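A note on why isoScore comes out to exactly 2/3 (my derivation; the test only asserts the value): the isolated node's sole connection is its synthetic self-loop, so all of its PageRank mass returns to itself. With seed share s_iso = 2 / (1 + 2) = 2/3, the stationary condition for that node is

    pi_iso = (1 - alpha) * pi_iso + alpha * s_iso

which forces pi_iso = s_iso = 2/3 for any alpha > 0, independent of the rest of the graph.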