Factor out _intervalResult in timelinePagerank (#1799)

As part of work for #1773, I want to add a lot more complexity to the
logic that computes individual time-slices of PageRank scores, so that
we can trace how much score flowed along each individual edge. That
means growing the _computeTimelineDistribution function; however, that
function is an untested wrapper, so I'm hesitant to add more complexity
to it directly.

Instead, I'm first factoring out an _intervalResult function and a
corresponding IntervalResult type, which compute the scores for a given
timeline interval. I've also added sanity-checking tests for this
function. In a follow-on commit, I'll add the logic for tracking
edge-level score flows.
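
In sketch form, the new control flow looks like this (condensed from the
diff below; the per-interval bookkeeping that produces nodeWeights and
nodeToConnections is elided, and `intervals` stands in for the real
iteration):

    let pi0 = null;
    for (const interval of intervals) {
      // Each slice's PageRank run now lives behind _intervalResult...
      const result = await _intervalResult(
        nodeWeights,
        nodeToConnections,
        nodeOrder,
        interval,
        pi0, // warm-start from the previous slice, if any
        alpha
      );
      results.push(result);
      // ...and the converged distribution seeds the next slice.
      pi0 = result.distribution;
    }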

Test plan: This is just a refactor, maintaining existing behavior and
adding tests. `yarn test --full` passes. Since our sharness tests
include doing a full load (including timeline cred computation) on
realistic data from GitHub, this gives us confidence that there hasn't
been any change to cred semantics.
Dandelion Mané 2020-05-30 14:14:32 -07:00 committed by GitHub
parent 71c0b0d66d
commit 27aeacc906
2 changed files with 109 additions and 25 deletions

src/core/algorithm/timelinePagerank.js

@@ -24,10 +24,7 @@ import {
 } from "./graphToMarkovChain";
 import {findStationaryDistribution, type PagerankParams} from "./markovChain";
 
-/**
- * Represents raw PageRank distributions on a graph over time.
- */
-export type TimelineDistributions = $ReadOnlyArray<{|
+export type IntervalResult = {|
   // The interval for this slice
   +interval: Interval,
   // The total node weight within this interval (normalized to account for the
@@ -37,7 +34,11 @@ export type TimelineDistributions = $ReadOnlyArray<{|
   // The raw score distribution over nodes for this interval (i.e. sums to 1).
   // Uses the canonical graph node order.
   +distribution: Distribution,
-|}>;
+|};
+
+/**
+ * Represents raw PageRank distributions on a graph over time.
+ */
+export type TimelineDistributions = $ReadOnlyArray<IntervalResult>;
 
 export const SYNTHETIC_LOOP_WEIGHT = 1e-3;
@@ -206,28 +207,47 @@ export async function _computeTimelineDistribution(
     const nodeToConnections = NullUtil.get(
       nodeToConnectionsIterator.next().value
     );
-    const {chain} = createOrderedSparseMarkovChain(nodeToConnections);
-
-    const seed = weightedDistribution(nodeOrder, nodeWeights);
-    if (pi0 == null) {
-      pi0 = seed;
-    }
-    const params: PagerankParams = {chain, alpha, seed, pi0};
-    const distributionResult = await findStationaryDistribution(params, {
-      verbose: false,
-      convergenceThreshold: 1e-7,
-      maxIterations: 255,
-      yieldAfterMs: 30,
-    });
-    const intervalWeight = sum(nodeWeights.values());
-    results.push({
+    const result = await _intervalResult(
+      nodeWeights,
+      nodeToConnections,
+      nodeOrder,
       interval,
-      intervalWeight,
-      distribution: distributionResult.pi,
-    });
+      pi0,
+      alpha
+    );
+    results.push(result);
     // Use the latest convergence results as the starting point for the next run
     // of PageRank
-    pi0 = distributionResult.pi;
+    pi0 = result.distribution;
   }
   return results;
 }
+
+export async function _intervalResult(
+  nodeWeights: Map<NodeAddressT, number>,
+  nodeToConnections: NodeToConnections,
+  nodeOrder: $ReadOnlyArray<NodeAddressT>,
+  interval: Interval,
+  pi0: Distribution | null,
+  alpha: number
+): Promise<IntervalResult> {
+  const {chain} = createOrderedSparseMarkovChain(nodeToConnections);
+
+  const seed = weightedDistribution(nodeOrder, nodeWeights);
+  if (pi0 == null) {
+    pi0 = seed;
+  }
+  const params: PagerankParams = {chain, alpha, seed, pi0};
+  const distributionResult = await findStationaryDistribution(params, {
+    verbose: false,
+    convergenceThreshold: 1e-7,
+    maxIterations: 255,
+    yieldAfterMs: 30,
+  });
+  const intervalWeight = sum(nodeWeights.values());
+  return {
+    interval,
+    intervalWeight,
+    distribution: distributionResult.pi,
+  };
+}

src/core/algorithm/timelinePagerank.test.js

@@ -2,12 +2,13 @@
 import {sum} from "d3-array";
 import * as NullUtil from "../../util/null";
-import {node, edge} from "../graphTestUtil";
+import {node, edge, advancedGraph} from "../graphTestUtil";
 import {Graph, type EdgeAddressT, type Edge} from "../graph";
 import {
   _timelineNodeWeights,
   _timelineNodeToConnections,
   SYNTHETIC_LOOP_WEIGHT,
+  _intervalResult,
 } from "./timelinePagerank";
 import {
   createConnections,
@@ -107,4 +108,67 @@ describe("src/core/algorithm/timelinePagerank", () => {
       expect(chains[3]).toEqual(chain4);
     });
   });
+  describe("_intervalResult", () => {
+    async function example() {
+      const {graph1, nodes} = advancedGraph();
+      const g = graph1();
+      const nodeWeights = new Map()
+        .set(nodes.src.address, 1)
+        .set(nodes.isolated.address, 2);
+      const edgeFn = (_unused_edge) => ({forwards: 1, backwards: 0.5});
+      const nodeToConnections = createConnections(g, edgeFn, 1e-3);
+      const nodeOrder = Array.from(g.nodes()).map((x) => x.address);
+      const interval = {endTimeMs: 1000, startTimeMs: 0};
+      const pi0 = null;
+      const alpha = 0.05;
+      const result = await _intervalResult(
+        nodeWeights,
+        nodeToConnections,
+        nodeOrder,
+        interval,
+        pi0,
+        alpha
+      );
+      return {
+        graph: g,
+        nodes,
+        nodeOrder,
+        nodeWeights,
+        edgeFn,
+        nodeToConnections,
+        interval,
+        pi0,
+        alpha,
+        result,
+      };
+    }
+    it("passes through the interval", async () => {
+      const {result, interval} = await example();
+      expect(result.interval).toEqual(interval);
+    });
+    it("computes the summed nodeWeight", async () => {
+      const {result, nodeWeights} = await example();
+      const actualIntervalWeight = sum(nodeWeights.values());
+      expect(result.intervalWeight).toEqual(actualIntervalWeight);
+    });
+    it("produces sane score distribution on an example graph", async () => {
+      const {result, nodes, nodeOrder} = await example();
+      function getScore(a) {
+        const idx = nodeOrder.indexOf(a.address);
+        if (idx === -1) {
+          throw new Error("bad address");
+        }
+        return result.distribution[idx];
+      }
+      const isoScore = getScore(nodes.isolated);
+      const srcScore = getScore(nodes.src);
+      const dstScore = getScore(nodes.dst);
+      expect(isoScore + srcScore + dstScore).toBeCloseTo(1);
+      // It has 2/3rds of the weight, and is isolated, so it's simple
+      expect(isoScore).toBeCloseTo(2 / 3);
+      // src has the weight, and dst doesn't, so it should have a higher score
+      expect(srcScore).toBeGreaterThan(dstScore);
+    });
+  });
 });