Remove core/pagerankGraph (#1533)

The `pagerankGraph` module was an attempt to better coordinate the data
needed to run PageRank by wrapping the Graph class together with context
such as edge weights. However, it was obsoleted
by work on TimelineCred. Thus, we can remove it entirely. I intend to
make another attempt at collecting all the data needed for cred analysis
in a way that doesn't couple with plugin code, and this time it will be
timeline-aware.

Test plan: `yarn test`
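
For reference, a minimal sketch of how the removed module was used, reconstructed from the deleted source below (the `demo` wrapper, the import paths, and the uniform edge evaluator are illustrative):

import {Graph} from "../core/graph";
import {PagerankGraph} from "../core/pagerankGraph";

async function demo(graph: Graph) {
  // `graph` must be non-empty; this evaluator gives every edge uniform weight.
  const prg = new PagerankGraph(graph, (_edge) => ({forwards: 1, backwards: 1}));
  await prg.runPagerank({maxIterations: 255, convergenceThreshold: 1e-7});
  for (const {node, score} of prg.nodes()) {
    console.log(node.description, score); // scores sum to 1 across all nodes
  }
}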
Dandelion Mané 2020-01-09 12:23:51 -08:00 committed by GitHub
parent 6b85296e55
commit be7b465f98
4 changed files with 4 additions and 1442 deletions

@@ -1,11 +1,6 @@
// @flow
import {type Edge, Graph, NodeAddress, type NodeAddressT} from "../core/graph";
import {
DEFAULT_MAX_ITERATIONS,
DEFAULT_CONVERGENCE_THRESHOLD,
DEFAULT_SYNTHETIC_LOOP_WEIGHT,
} from "../core/pagerankGraph";
import {
distributionToNodeDistribution,
createConnections,
@@ -43,6 +38,10 @@ export type PagerankOptions = $Shape<FullPagerankOptions>;
export type {EdgeWeight} from "../core/attribution/graphToMarkovChain";
export type EdgeEvaluator = (Edge) => EdgeWeight;
export const DEFAULT_SYNTHETIC_LOOP_WEIGHT = 1e-3;
export const DEFAULT_MAX_ITERATIONS = 255;
export const DEFAULT_CONVERGENCE_THRESHOLD = 1e-7;
function defaultOptions(): PagerankOptions {
return {
verbose: false,

@@ -1,129 +0,0 @@
// Jest Snapshot v1, https://goo.gl/fbAQLP
exports[`core/pagerankGraph to/from JSON matches expected snapshot 1`] = `
Array [
Object {
"type": "sourcecred/pagerankGraph",
"version": "0.2.0",
},
Object {
"backwardsWeights": Array [
0,
0,
0,
],
"forwardsWeights": Array [
1,
1,
1,
],
"graphJSON": Array [
Object {
"type": "sourcecred/graph",
"version": "0.8.0",
},
Object {
"edges": Array [
Object {
"address": Array [
"full-dangling",
],
"dstIndex": 4,
"srcIndex": 4,
"timestampMs": 0,
},
Object {
"address": Array [
"half-dangling",
],
"dstIndex": 4,
"srcIndex": 1,
"timestampMs": 0,
},
Object {
"address": Array [
"hom",
"1",
],
"dstIndex": 0,
"srcIndex": 5,
"timestampMs": 0,
},
Object {
"address": Array [
"hom",
"2",
],
"dstIndex": 0,
"srcIndex": 5,
"timestampMs": 0,
},
Object {
"address": Array [
"loop",
],
"dstIndex": 3,
"srcIndex": 3,
"timestampMs": 0,
},
],
"nodes": Array [
Object {
"description": "dst",
"index": 0,
"timestampMs": null,
},
Object {
"description": "halfIsolated",
"index": 1,
"timestampMs": null,
},
Object {
"description": "isolated",
"index": 2,
"timestampMs": null,
},
Object {
"description": "loop",
"index": 3,
"timestampMs": null,
},
Object {
"description": "src",
"index": 5,
"timestampMs": null,
},
],
"sortedNodeAddresses": Array [
Array [
"dst",
],
Array [
"halfIsolated",
],
Array [
"isolated",
],
Array [
"loop",
],
Array [
"phantom",
],
Array [
"src",
],
],
},
],
"scores": Array [
0.2,
0.2,
0.2,
0.2,
0.2,
],
"syntheticLoopWeight": 0.001,
},
]
`;

@@ -1,632 +0,0 @@
// @flow
import deepEqual from "lodash.isequal";
import {toCompat, fromCompat, type Compatible} from "../util/compat";
import {
Graph,
type Node,
type Edge,
type NodeAddressT,
type EdgeAddressT,
type GraphJSON,
NodeAddress,
type NeighborsOptions,
} from "./graph";
import {
createConnections,
createOrderedSparseMarkovChain,
type EdgeWeight,
} from "./attribution/graphToMarkovChain";
import {
distributionToNodeDistribution,
weightedDistribution,
} from "./attribution/nodeDistribution";
import {
findStationaryDistribution,
type PagerankParams,
type PagerankOptions as CorePagerankOptions,
} from "../core/attribution/markovChain";
import * as NullUtil from "../util/null";
export {Direction} from "./graph";
export type {DirectionT, NeighborsOptions} from "./graph";
export type {EdgeWeight} from "./attribution/graphToMarkovChain";
export type EdgeEvaluator = (Edge) => EdgeWeight;
export type ScoredNode = {|
+node: Node,
+score: number,
|};
export type WeightedEdge = {|
+edge: Edge,
+weight: EdgeWeight,
|};
export type PagerankGraphEdgesOptions = {|
+addressPrefix?: EdgeAddressT,
+srcPrefix?: NodeAddressT,
+dstPrefix?: NodeAddressT,
|};
export type ScoredNeighbor = {|
// The neighbor node, with its score
+scoredNode: ScoredNode,
// The edge connecting the target to its neighbor node, with its weight
+weightedEdge: WeightedEdge,
// How much score (in absolute terms) was provided to the target by
// the neighbor node through this weightedEdge
+scoreContribution: number,
|};
export opaque type PagerankGraphJSON = Compatible<{|
+graphJSON: GraphJSON,
// Score for every node, ordered by the sorted node address.
+scores: $ReadOnlyArray<number>,
// Weights for every edge, ordered by sorted edge address.
// We could save the EdgeWeights directly rather than having separate arrays
// for forwardsWeights and backwardsWeights, but this would lead to an
// inflated JSON representation because we would be needlessly duplicating
// the keys "forwards" and "backwards" themselves.
+forwardsWeights: $ReadOnlyArray<number>,
+backwardsWeights: $ReadOnlyArray<number>,
+syntheticLoopWeight: number,
|}>;
/**
* Options to control how PageRank runs and when it stops
*/
export type FullPagerankOptions = {|
// Maximum number of iterations before we give up on PageRank convergence.
// Defaults to DEFAULT_MAX_ITERATIONS if not provided.
+maxIterations: number,
// PageRank will stop running once the diff between the previous iteration
// and the latest is less than this threshold.
// Defaults to DEFAULT_CONVERGENCE_THRESHOLD if not provided.
+convergenceThreshold: number,
// Specifies a seed vector for PageRank "teleportation".
// At every step, some proportion `alpha` of the weight will
// teleport to the seed.
//
// The seed is specified as a map from node addresses to weights.
// The resultant seed will be a proper distribution over all the graph's available
// nodes, with each node's weight proportional to its weight in the seed. In the case
// that the total weight in the seed is 0 (e.g. an empty map was passed), then the
// seed vector will be a uniform distribution.
//
// Specifying any negative, NaN, or infinite weights is an error.
// Specifying weights for nodes that are not in the graph is also an error.
+seed: Map<NodeAddressT, number>,
// Specifies the probability with which score 'teleports' to the seed vector.
// If alpha=0, then the teleportation never happens. If alpha=1, then PageRank
// always converges to precisely the seed vector. Defaults to DEFAULT_ALPHA.
+alpha: number,
|};
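// Illustrative example of the seed normalization described above: a seed of
// Map {a => 1, b => 3} normalizes to the distribution {a: 0.25, b: 0.75},
// while an empty seed map yields the uniform distribution over all nodes.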
export type PagerankOptions = $Shape<FullPagerankOptions>;
export type PagerankConvergenceReport = {|
// A quantitative measure of how close to convergence the final distribution was.
// Ideally, this value should be near zero.
// It shows the maximum absolute-valued change of any entry in the distribution
// if one more Markov action is taken.
+convergenceDelta: number,
|};
export const DEFAULT_SYNTHETIC_LOOP_WEIGHT = 1e-3;
export const DEFAULT_MAX_ITERATIONS = 255;
export const DEFAULT_CONVERGENCE_THRESHOLD = 1e-7;
// TODO(@decentralion): Change default alpha to be a small non-zero value
// once we choose an appropriate value.
export const DEFAULT_ALPHA = 0;
export const DEFAULT_SEED: () => Map<NodeAddressT, number> = () => new Map();
function defaultOptions(): FullPagerankOptions {
return {
maxIterations: DEFAULT_MAX_ITERATIONS,
convergenceThreshold: DEFAULT_CONVERGENCE_THRESHOLD,
alpha: DEFAULT_ALPHA,
seed: DEFAULT_SEED(),
};
}
const COMPAT_INFO = {type: "sourcecred/pagerankGraph", version: "0.2.0"};
/**
* PagerankGraph is a wrapper over the Graph class, which adds
* the ability to run PageRank to compute scores on the Graph.
*
* Every node in the underlying Graph is assigned a numerical score in
* the range [0, 1]. Provided that there are any nodes, the sum of all
* the scores will be 1 (i.e. the scores are a probability
* distribution). The scores are assigned by the [PageRank] algorithm;
* i.e. a node receives score in proportion to the score of its
* neighbors. When the PagerankGraph is first constructed, the scores
* are initialized to a uniform distribution.
*
* [PageRank]: https://en.wikipedia.org/wiki/PageRank
*
* Every edge in the Graph is assigned an `EdgeWeight`, which includes a
* `forwards` (weight from the `src` to the `dst`) and a `backwards`
* (weight from the `dst` back to the `src`). Both `forwards` and
* `backwards` must be nonnegative numbers. The weights influence how
* score flows from node to node. For example, if the node `root` is
* connected to `a` with a weight of `1` and to `b` with a weight of `2`,
* then `b` will receive twice as much score from `root` as `a` does.
*
* Every node in the PagerankGraph has an associated `score`. Provided
* that the graph has at least one node, the scores are a probability
* distribution over the nodes; i.e. every score is in the range [0,1]
* and the scores sum to 1.
*
* This class is intended to closely mirror the Graph API so as to
* present a consistent and familiar interface.
*
* At present, PagerankGraph does not support any modification to the
* underlying Graph; doing so will invalidate PagerankGraph and cause
* its methods to throw errors.
*/
export class PagerankGraph {
// The Graph backing this PagerankGraph
_graph: Graph;
// The score for each Node in the Graph
_scores: Map<NodeAddressT, number>;
// The EdgeWeight for each Edge in the Graph
_edgeWeights: Map<EdgeAddressT, EdgeWeight>;
// Weight used to connect nodes to themselves, to avoid isolated
// nodes.
_syntheticLoopWeight: number;
// Modification count of the underlying Graph. Used to determine
// when this PagerankGraph is in an invalid state (due to changes
// to the graph backing it).
_graphModificationCount: number;
// Sum of all outWeights for a node, including the synthetic weight
_totalOutWeight: Map<NodeAddressT, number>;
/**
* Constructs a new PagerankGraph.
*
* Note that constructing a PagerankGraph around an empty graph is illegal,
* as it is impossible to define a probability distribution over zero
* nodes.
*/
constructor(
// The Graph backing this PagerankGraph. Must not be empty.
graph: Graph,
// Provides the initial EdgeWeight for every edge
edgeEvaluator: EdgeEvaluator,
// The weight we use to connect every node to itself
// to ensure there are no isolated nodes. Defaults to
// DEFAULT_SYNTHETIC_LOOP_WEIGHT.
syntheticLoopWeight: ?number
): void {
if (graph.equals(new Graph())) {
throw new Error("Cannot construct PagerankGraph with empty graph.");
}
this._graph = graph;
this._graphModificationCount = graph.modificationCount();
this._syntheticLoopWeight = NullUtil.orElse(
syntheticLoopWeight,
DEFAULT_SYNTHETIC_LOOP_WEIGHT
);
if (this._syntheticLoopWeight <= 0) {
throw new Error("syntheticLoopWeight must be > 0");
}
// Initialize scores to the uniform distribution over every node
this._scores = new Map();
const graphNodes = Array.from(this._graph.nodes());
for (const node of graphNodes) {
this._scores.set(node.address, 1 / graphNodes.length);
}
this.setEdgeEvaluator(edgeEvaluator);
}
/**
* Changes all of the PagerankGraph's edge weights
* by applying the new EdgeEvaluator.
*/
setEdgeEvaluator(edgeEvaluator: EdgeEvaluator): this {
this._totalOutWeight = new Map();
this._edgeWeights = new Map();
for (const node of this._graph.nodes()) {
this._totalOutWeight.set(node.address, this._syntheticLoopWeight);
}
const addOutWeight = (node: NodeAddressT, weight: number) => {
const previousWeight = NullUtil.get(this._totalOutWeight.get(node));
const newWeight = previousWeight + weight;
this._totalOutWeight.set(node, newWeight);
};
for (const edge of this._graph.edges({showDangling: false})) {
const weights = edgeEvaluator(edge);
this._edgeWeights.set(edge.address, weights);
addOutWeight(edge.src, weights.forwards);
addOutWeight(edge.dst, weights.backwards);
}
return this;
}
/**
* Retrieves the Graph backing this PagerankGraph.
*/
graph(): Graph {
this._verifyGraphNotModified();
return this._graph;
}
/**
* Returns the PagerankGraph's synthetic loop weight.
*
* The synthetic loop weight simulates a "phantom loop" connecting
* every node to itself. This ensures that every node has at least
* one outgoing connection, so that the corresponding markov chain
* used for PageRank is well-defined.
*
* In general, the synthetic loop weight should be quite small.
* By default, we set it to 1e-3.
*/
syntheticLoopWeight(): number {
return this._syntheticLoopWeight;
}
*_nodesIterator(iterator: Iterator<Node>): Iterator<ScoredNode> {
for (const node of iterator) {
const score = NullUtil.get(this._scores.get(node.address));
yield {node, score};
}
}
/**
* Provides node and score for every node in the underlying graph.
*
* Optionally, provide a node prefix to return an iterator containing
* only node/score objects whose nodes match the provided node prefix.
* See Graph.nodes and Address.hasPrefix for details.
*/
nodes(options?: {|+prefix: NodeAddressT|}): Iterator<ScoredNode> {
this._verifyGraphNotModified();
const iterator = this._graph.nodes(options);
return this._nodesIterator(iterator);
}
/**
* Retrieve a node from the graph, along with its score.
*
* TODO(#1020): Allow optional filtering, as in Graph.node.
*/
node(x: NodeAddressT): ?ScoredNode {
this._verifyGraphNotModified();
const score = this._scores.get(x);
if (score == null) {
return undefined;
} else {
const node = NullUtil.get(this._graph.node(x));
return {node, score};
}
}
/**
* Provides edge and weight for every edge in the underlying graph.
*
* Optionally, provide an EdgesOptions parameter to return an
* iterator containing edges matching the EdgesOptions prefix
* filter parameters. See Graph.edges for details.
*
* In contrast to Graph.edges, dangling edges will never be included,
* as we do not assign weights to dangling edges.
*/
edges(options?: PagerankGraphEdgesOptions): Iterator<WeightedEdge> {
this._verifyGraphNotModified();
const graphOptions = {
showDangling: false,
addressPrefix: undefined,
srcPrefix: undefined,
dstPrefix: undefined,
};
if (options != null) {
graphOptions.addressPrefix = options.addressPrefix;
graphOptions.srcPrefix = options.srcPrefix;
graphOptions.dstPrefix = options.dstPrefix;
}
const iterator = this._graph.edges(graphOptions);
return this._edgesIterator(iterator);
}
*_edgesIterator(iterator: Iterator<Edge>): Iterator<WeightedEdge> {
for (const edge of iterator) {
const weight = NullUtil.get(this._edgeWeights.get(edge.address));
yield {edge, weight};
}
}
/**
* Provides the edge and weight for a particular edge, if present.
*
* TODO(#1020): Allow optional filtering, as in Graph.edge.
*/
edge(a: EdgeAddressT): ?WeightedEdge {
this._verifyGraphNotModified();
const edge = this._graph.edge(a);
if (edge != null && this._graph.isDanglingEdge(a) === false) {
const weight = NullUtil.get(this._edgeWeights.get(edge.address));
return {edge, weight};
}
return undefined;
}
/**
* Provides the total out weight for a node, i.e. every edge weight pointed
* away from the node, plus the syntheticLoopWeight.
*
* The total out weight is needed to interpret the actual significance of any
* particular edge's weight, as edge weights are normalized by the totalOutWeight
* so that the normalized weights going out of a node always sum to 1.
*/
totalOutWeight(node: NodeAddressT): number {
this._verifyGraphNotModified();
const weight = this._totalOutWeight.get(node);
if (weight == null) {
throw new Error(
`Tried to get outWeight for non-existent node ${NodeAddress.toString(
node
)}`
);
}
return weight;
}
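// Worked example (illustrative): with syntheticLoopWeight 1e-3 and two
// out-edges whose forwards weights are 1 and 2, totalOutWeight is 3.001, so
// the heavier edge carries a normalized weight of 2 / 3.001 of this node's
// score at each step.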
/**
* Provides the Neighbors to a target node, along with how those
* neighbors contributed to the node's score.
*
* See the docs on `Graph.neighbors` for the semantics of what a `Neighbor`
* is. This call augments the Neighbors from graph, so that for each neighbor
* we also have the neighbor node's score, the EdgeWeight for the edge, and a
* scoreContribution, which shows how much score was contributed to the
* target node from that Neighbor.
*
* When the PagerankGraph is well-converged, it will be the case that a
* node's score is equal to the score contribution from each neighbor plus
* the synthetic loop's score contribution.
*
* When the PagerankGraph is not well-converged, the score contributions are
* meaningless.
*/
neighbors(
target: NodeAddressT,
options: NeighborsOptions
): Iterator<ScoredNeighbor> {
this._verifyGraphNotModified();
if (!this.graph().hasNode(target)) {
throw new Error(
`Tried to find neighbors of non-existent node ${NodeAddress.toString(
target
)}`
);
}
return this._neighborsIterator(target, options);
}
*_neighborsIterator(
target: NodeAddressT,
options: NeighborsOptions
): Iterator<ScoredNeighbor> {
const graphNeighbors = this.graph().neighbors(target, options);
for (const {node, edge} of graphNeighbors) {
const scoredNode = NullUtil.get(this.node(node.address));
const weightedEdge = NullUtil.get(this.edge(edge.address));
// We compute how much of target's score is attributable to the neighbor.
// First, we find out how much edge weight there was from node to target,
// based on whether it was an IN-edge or OUT-edge or loop.
let relevantEdgeWeight = 0;
if (edge.src === target) {
relevantEdgeWeight += weightedEdge.weight.backwards;
}
if (edge.dst === target) {
relevantEdgeWeight += weightedEdge.weight.forwards;
}
// We normalize this edge weight by the total outWeight for `node`.
const normalizedEdgeWeight =
relevantEdgeWeight / this.totalOutWeight(node.address);
// Then we directly compute the score contribution
const scoreContribution = scoredNode.score * normalizedEdgeWeight;
yield {scoredNode, weightedEdge, scoreContribution};
}
}
/**
* Returns how much of a node's score came from its synthetic loop.
* For most nodes, this should be near zero. However, if the node has no
* outgoing edge weight (e.g. it is isolated), then this value
* may be larger.
*
* The results of syntheticLoopScoreContribution are not meaningful if the
* PagerankGraph is not converged.
*/
syntheticLoopScoreContribution(node: NodeAddressT): number {
this._verifyGraphNotModified();
const scoredNode = this.node(node);
if (scoredNode == null) {
throw new Error(
"Cannot get syntheticLoopScoreContribution for non-existent node"
);
}
return (
(scoredNode.score * this._syntheticLoopWeight) / this.totalOutWeight(node)
);
}
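// Illustrative invariant (holds once runPagerank has converged): for any node,
// score(node) ≈ syntheticLoopScoreContribution(node) plus the sum of
// scoreContribution over all of that node's neighbors().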
/**
* Asynchronously run PageRank to re-compute scores.
*
* Calling this method constructs a [Markov Chain] corresponding
* to the underlying graph and its associated edge weights,
* and then iteratively converges to the stationary distribution
* of that chain, according to the [PageRank algorithm].
*
* [Markov Chain]: https://brilliant.org/wiki/markov-chains/
* [PageRank algorithm]: https://en.wikipedia.org/wiki/PageRank
*
* The `PagerankOptions` argument gives guidance on how to run
* PageRank. PageRank will continue running until either
* `options.maxIterations` has been exceeded, or until the largest
* individual delta in a node's score between the present and previous
* iteration is less than or equal to `options.convergenceThreshold`.
*
* Note that if runPagerank is called multiple times on the same
* PagerankGraph, it will re-use the last stationary distribution as the
* starting point for running PageRank again. In general, this will result in
* improved performance, and it will not usually affect the outcome from
* PageRank. However, in certain circumstances, it could result in different
* outputs. For example, if there are isolated nodes and no seed vector, then
* the initial distribution may matter.
*/
async runPagerank(
options?: PagerankOptions
): Promise<PagerankConvergenceReport> {
this._verifyGraphNotModified();
const fullOptions = {
...defaultOptions(),
...(options || {}),
};
const edgeEvaluator = (x: Edge) =>
NullUtil.get(this._edgeWeights.get(x.address));
const connections = createConnections(
this._graph,
edgeEvaluator,
this._syntheticLoopWeight
);
const osmc = createOrderedSparseMarkovChain(connections);
const pi0 = new Float64Array(osmc.chain.length);
osmc.nodeOrder.forEach(
(n: NodeAddressT, i) => (pi0[i] = NullUtil.get(this.node(n)).score)
);
const params: PagerankParams = {
chain: osmc.chain,
alpha: fullOptions.alpha,
seed: weightedDistribution(osmc.nodeOrder, fullOptions.seed),
pi0,
};
const coreOptions: CorePagerankOptions = {
verbose: false,
convergenceThreshold: fullOptions.convergenceThreshold,
maxIterations: fullOptions.maxIterations,
yieldAfterMs: 30,
};
const distributionResult = await findStationaryDistribution(
params,
coreOptions
);
this._scores = distributionToNodeDistribution(
osmc.nodeOrder,
distributionResult.pi
);
return {
convergenceDelta: distributionResult.convergenceDelta,
};
}
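// Illustrative call, assuming `prg` is a PagerankGraph (the values shown are
// the documented defaults):
//   const {convergenceDelta} = await prg.runPagerank({
//     maxIterations: DEFAULT_MAX_ITERATIONS, // 255
//     convergenceThreshold: DEFAULT_CONVERGENCE_THRESHOLD, // 1e-7
//   });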
/**
* Returns whether another PagerankGraph is equal to this one.
*
* PagerankGraphs are considered equal if they have the same nodes with
* the same scores, and the same edges with the same weights, and the same
* syntheticLoopWeight.
*
* The modification history of the underlying Graph is irrelevant to
* equality.
*/
equals(that: PagerankGraph): boolean {
if (!(that instanceof PagerankGraph)) {
throw new Error(`Expected PagerankGraph, got ${String(that)}`);
}
this._verifyGraphNotModified();
return (
this.graph().equals(that.graph()) &&
deepEqual(this._scores, that._scores) &&
deepEqual(this._edgeWeights, that._edgeWeights) &&
this._syntheticLoopWeight === that._syntheticLoopWeight
);
}
/**
* Serialize this graph into a PagerankGraphJSON object.
*
* Returns a plain JavaScript object.
*
* For space efficiency, we store the node scores as an array of numbers in
* node-address-sorted order, and we store the edge weights as two arrays of
* numbers in edge-address-sorted-order.
*/
toJSON(): PagerankGraphJSON {
this._verifyGraphNotModified();
const graphJSON = this.graph().toJSON();
const scores = Array.from(this.nodes()).map((x) => x.score);
const edgeWeights = Array.from(this.edges()).map((x) => x.weight);
const forwardsWeights: number[] = edgeWeights.map((x) => x.forwards);
const backwardsWeights: number[] = edgeWeights.map((x) => x.backwards);
const rawJSON = {
graphJSON,
scores,
forwardsWeights,
backwardsWeights,
syntheticLoopWeight: this.syntheticLoopWeight(),
};
return toCompat(COMPAT_INFO, rawJSON);
}
static fromJSON(json: PagerankGraphJSON): PagerankGraph {
const {
forwardsWeights,
backwardsWeights,
scores,
graphJSON,
syntheticLoopWeight,
} = fromCompat(COMPAT_INFO, json);
const graph = Graph.fromJSON(graphJSON);
const nodeAddresses = Array.from(graph.nodes()).map((x) => x.address);
const scoreMap: Map<NodeAddressT, number> = new Map();
for (let i = 0; i < nodeAddresses.length; i++) {
scoreMap.set(nodeAddresses[i], scores[i]);
}
const edgeAddresses = Array.from(graph.edges({showDangling: false})).map(
(x) => x.address
);
const edgeWeights: Map<EdgeAddressT, EdgeWeight> = new Map();
for (let i = 0; i < edgeAddresses.length; i++) {
const forwards = forwardsWeights[i];
const backwards = backwardsWeights[i];
edgeWeights.set(edgeAddresses[i], {forwards, backwards});
}
function evaluator(e: Edge): EdgeWeight {
return NullUtil.get(edgeWeights.get(e.address));
}
const prg = new PagerankGraph(graph, evaluator, syntheticLoopWeight);
// TODO(#1020): It's a little hacky to force the scores in like this;
// consider adding an optional constructor argument to allow manually
// setting the scores at construction time, if we ever find a use case
// that needs it.
prg._scores = scoreMap;
return prg;
}
_verifyGraphNotModified() {
if (this._graph.modificationCount() !== this._graphModificationCount) {
throw new Error(
"Error: The PagerankGraph's underlying Graph has been modified."
);
}
}
}
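// Illustrative JSON round-trip, mirroring the "to/from JSON" tests in the
// deleted test file below (assumes an existing PagerankGraph `prg`):
//   const restored = PagerankGraph.fromJSON(prg.toJSON());
//   restored.equals(prg); // => true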

@@ -1,676 +0,0 @@
// @flow
import sortBy from "lodash.sortby";
import {
Graph,
NodeAddress,
EdgeAddress,
type NodeAddressT,
type Edge,
type EdgesOptions,
} from "./graph";
import {
PagerankGraph,
Direction,
DEFAULT_MAX_ITERATIONS,
DEFAULT_CONVERGENCE_THRESHOLD,
DEFAULT_ALPHA,
DEFAULT_SEED,
type PagerankGraphEdgesOptions,
} from "./pagerankGraph";
import {advancedGraph, node, partsNode, partsEdge} from "./graphTestUtil";
import * as NullUtil from "../util/null";
describe("core/pagerankGraph", () => {
const defaultEvaluator = (_unused_edge) => ({forwards: 1, backwards: 0});
const nonEmptyGraph = () => new Graph().addNode(node("hi"));
function examplePagerankGraph(
edgeEvaluator = defaultEvaluator
): PagerankGraph {
const g = advancedGraph().graph1();
return new PagerankGraph(g, edgeEvaluator);
}
async function convergedPagerankGraph(): Promise<PagerankGraph> {
const pg = examplePagerankGraph();
await pg.runPagerank({maxIterations: 100, convergenceThreshold: 1e-4});
return pg;
}
it("cannot construct PagerankGraph with empty Graph", () => {
const eg1 = new Graph();
const eg2 = new Graph().addNode(node("hi")).removeNode(node("hi").address);
expect(() => new PagerankGraph(eg1, defaultEvaluator)).toThrowError(
"empty graph"
);
expect(() => new PagerankGraph(eg2, defaultEvaluator)).toThrowError(
"empty graph"
);
});
describe("setEdgeEvaluator", () => {
it("is idempotent", () => {
const e1 = examplePagerankGraph(defaultEvaluator);
const e2 = examplePagerankGraph(defaultEvaluator);
e2.setEdgeEvaluator(defaultEvaluator);
expect(e1.equals(e2)).toBe(true);
});
it("graphs with changed edge weights are not equal", () => {
const e1 = examplePagerankGraph();
const e2 = examplePagerankGraph();
e2.setEdgeEvaluator(() => ({forwards: 3, backwards: 9}));
expect(e1.equals(e2)).toBe(false);
});
it("graphs are distinct but with identical scores if evaluators are the same modulo multiplication", async () => {
// Think of this test as a bit more of an "e2e sanity check", verifying
// a few properties at once.
// We start with two example graphs whose edge evaluators are identical, except
// that all the edge weights differ by a factor of 3.
// So we know the scores should all turn out the same, but the graphs will be different,
// because the edge weights are nominally distinct.
const e1 = examplePagerankGraph(() => ({forwards: 3, backwards: 6}));
const e2 = examplePagerankGraph(() => ({forwards: 1, backwards: 2}));
expect(e1.equals(e2)).toBe(false);
await e1.runPagerank();
await e2.runPagerank();
for (const {node, score} of e1.nodes()) {
const otherScore = NullUtil.get(e2.node(node.address)).score;
expect(otherScore).toBeCloseTo(score);
}
});
});
describe("node / nodes", () => {
it("node returns undefined for node not in the graph", () => {
const g = nonEmptyGraph();
const pg = new PagerankGraph(g, defaultEvaluator);
expect(pg.node(NodeAddress.empty)).toBe(undefined);
});
it("nodes yields the same nodes as are in the graph", () => {
const g = advancedGraph().graph1();
const pg = new PagerankGraph(g, defaultEvaluator);
const graphNodes = Array.from(g.nodes());
const pgNodes = Array.from(pg.nodes()).map((x) => x.node);
expect(graphNodes).toEqual(pgNodes);
});
it("node and nodes both return consistent scores", async () => {
const pg = await convergedPagerankGraph();
for (const {node, score} of pg.nodes()) {
expect(score).toEqual(NullUtil.get(pg.node(node.address)).score);
}
});
it("node and nodes both throw an error if underlying graph is modified", () => {
const pg = new PagerankGraph(nonEmptyGraph(), defaultEvaluator);
pg.graph().addNode(node("foo"));
expect(() => pg.nodes()).toThrowError(
"underlying Graph has been modified"
);
expect(() => pg.node(NodeAddress.empty)).toThrowError(
"underlying Graph has been modified"
);
});
});
describe("node prefix filter matches graph filter", () => {
const n1 = partsNode([]);
const n2 = partsNode(["foo"]);
const n3 = partsNode(["foo", "bar"]);
const n4 = partsNode(["zod", "bar"]);
const g = () =>
new Graph()
.addNode(n1)
.addNode(n2)
.addNode(n3)
.addNode(n4);
const pg = () => new PagerankGraph(g(), defaultEvaluator);
function expectPagerankGraphToEqualGraph(
options: {|+prefix: NodeAddressT|} | void
) {
const pagerankGraphNodes = Array.from(pg().nodes(options)).sort();
const graphNodes = Array.from(g().nodes(options)).sort();
pagerankGraphNodes.forEach(
(pgNode, i) =>
expect(pgNode.node).toEqual(graphNodes[i]) &&
expect(pgNode.score).toBe(0.25)
);
}
it("with no options object", () => {
expectPagerankGraphToEqualGraph(undefined);
});
it("with prefix filter", () => {
expectPagerankGraphToEqualGraph({prefix: n2.address});
});
it("with empty prefix", () => {
expectPagerankGraphToEqualGraph({prefix: NodeAddress.empty});
});
it("with prefix that matches nothing", () => {
expectPagerankGraphToEqualGraph({prefix: NodeAddress.fromParts(["2"])});
});
});
describe("node prefix filter", () => {
it("requires a prefix when options are specified", () => {
const pg = new PagerankGraph(nonEmptyGraph(), defaultEvaluator);
// $ExpectFlowError
expect(() => pg.nodes({})).toThrow("prefix");
});
});
describe("edge/edges", () => {
it("edges returns the non-dangling edges in the base graph", () => {
const g = advancedGraph().graph1();
const pg = new PagerankGraph(g, defaultEvaluator);
const graphEdges = Array.from(g.edges({showDangling: false}));
const pgEdges = Array.from(pg.edges()).map((x) => x.edge);
expect(graphEdges.length).toEqual(pgEdges.length);
const addressAccessor = (x: Edge) => x.address;
const sortedGraphEdges = sortBy(graphEdges, addressAccessor);
const sortedPagerankEdges = sortBy(pgEdges, addressAccessor);
expect(sortedGraphEdges).toEqual(sortedPagerankEdges);
});
it("edge/edges both correctly return the edge weights", () => {
const edgeEvaluator = ({address, src, dst}) => {
return {
forwards: address.length + src.length,
backwards: address.length + dst.length,
};
};
const g = advancedGraph().graph1();
const pg = new PagerankGraph(g, edgeEvaluator);
for (const {edge, weight} of pg.edges()) {
expect(edgeEvaluator(edge)).toEqual(weight);
expect(NullUtil.get(pg.edge(edge.address)).weight).toEqual(weight);
}
});
it("edge returns undefined for address not in the graph", () => {
const pg = new PagerankGraph(nonEmptyGraph(), defaultEvaluator);
expect(pg.edge(EdgeAddress.empty)).toBe(undefined);
});
it("edge returns null for dangling edge", () => {
const {graph1, edges} = advancedGraph();
const pg = new PagerankGraph(graph1(), defaultEvaluator);
expect(pg.edge(edges.halfDanglingEdge.address)).toEqual(undefined);
});
it("edge and edges both throw an error if underlying graph is modified", () => {
const pg = new PagerankGraph(nonEmptyGraph(), defaultEvaluator);
pg.graph().addNode(node("foo"));
expect(() => pg.edges()).toThrowError(
"underlying Graph has been modified"
);
expect(() => pg.edge(EdgeAddress.empty)).toThrowError(
"underlying Graph has been modified"
);
});
});
describe("totalOutWeight", () => {
it("errors on a modified graph", () => {
const eg = examplePagerankGraph();
const badNode = node("badNode");
eg.graph().addNode(badNode);
expect(() => eg.totalOutWeight(badNode.address)).toThrowError(
"has been modified"
);
});
it("errors on nonexistent node", () => {
const eg = examplePagerankGraph();
expect(() =>
eg.totalOutWeight(NodeAddress.fromParts(["nonexistent"]))
).toThrowError("non-existent node");
});
function verifyOutWeights(pg: PagerankGraph) {
const outWeight: Map<NodeAddressT, number> = new Map();
for (const node of pg.graph().nodes()) {
outWeight.set(node.address, pg.syntheticLoopWeight());
}
const addOutWeight = (node: NodeAddressT, weight: number) => {
const previousWeight = NullUtil.get(outWeight.get(node));
const newWeight = previousWeight + weight;
outWeight.set(node, newWeight);
};
for (const {edge, weight} of pg.edges()) {
addOutWeight(edge.src, weight.forwards);
addOutWeight(edge.dst, weight.backwards);
}
for (const node of pg.graph().nodes()) {
expect(pg.totalOutWeight(node.address)).toEqual(
outWeight.get(node.address)
);
}
}
it("computes outWeight correctly on the example graph", () => {
const edgeEvaluator = (_unused_edge) => ({forwards: 1, backwards: 2});
const eg = examplePagerankGraph(edgeEvaluator);
verifyOutWeights(eg);
});
it("outWeight is always the syntheticLoopWeight when edges have no weight", () => {
const zeroEvaluator = (_unused_edge) => ({forwards: 0, backwards: 0});
const syntheticLoopWeight = 0.1337;
const pg = new PagerankGraph(
advancedGraph().graph1(),
zeroEvaluator,
syntheticLoopWeight
);
for (const {node} of pg.nodes()) {
expect(pg.totalOutWeight(node.address)).toEqual(syntheticLoopWeight);
}
});
it("outWeight is computed correctly after JSON deserialization", () => {
// I added this test because the outWeight map is a cache that is computed
// once, in the constructor, and since the JSON deserialization invokes
// the constructor and then hacks variables around a bit, I want to ensure the
// outWeight cache is still generated properly.
const eg = examplePagerankGraph();
const eg_ = PagerankGraph.fromJSON(eg.toJSON());
verifyOutWeights(eg_);
});
});
describe("edge filtering", () => {
const src1 = partsNode(["src", "1"]);
const src2 = partsNode(["src", "2"]);
const dst1 = partsNode(["dst", "1"]);
const dst2 = partsNode(["dst", "2"]);
const e11 = partsEdge(["e", "1", "1"], src1, dst1);
const e12 = partsEdge(["e", "1", "2"], src1, dst2);
const e21 = partsEdge(["e", "2", "1"], src2, dst1);
const e22 = partsEdge(["e", "2", "2"], src2, dst2);
const graph = () => {
const g = new Graph();
[src1, src2, dst1, dst2].forEach((n) => g.addNode(n));
[e11, e12, e21, e22].forEach((e) => g.addEdge(e));
return g;
};
const pagerankGraph = () => new PagerankGraph(graph(), defaultEvaluator);
function expectConsistentEdges(options: PagerankGraphEdgesOptions | void) {
const pagerankGraphEdges = Array.from(pagerankGraph().edges(options));
pagerankGraphEdges.forEach((e) => {
expect(e.weight.backwards).toBe(0);
expect(e.weight.forwards).toBe(1);
});
const graphOptions: EdgesOptions =
options == null
? {showDangling: false}
: {...options, showDangling: false};
const graphEdges = Array.from(graph().edges(graphOptions));
expect(pagerankGraphEdges.map((e) => e.edge)).toEqual(graphEdges);
}
describe("edge filter matches graph edge filter", () => {
it("finds all edges when no options are specified", () => {
expectConsistentEdges(undefined);
});
it("finds all edges when all-inclusive filters are specified", () => {
expectConsistentEdges({
addressPrefix: EdgeAddress.fromParts(["e"]),
srcPrefix: NodeAddress.fromParts(["src"]),
dstPrefix: NodeAddress.fromParts(["dst"]),
});
});
it("finds edges by address prefix", () => {
expectConsistentEdges({
addressPrefix: EdgeAddress.fromParts(["e", "1"]),
});
});
it("finds edges by src prefix", () => {
expectConsistentEdges({
srcPrefix: NodeAddress.fromParts(["src", "1"]),
});
});
it("finds edges by dst prefix", () => {
expectConsistentEdges({
dstPrefix: NodeAddress.fromParts(["dst", "1"]),
});
});
it("yields nothing for disjoint filters", () => {
expectConsistentEdges({
addressPrefix: EdgeAddress.fromParts(["e", "1"]),
srcPrefix: NodeAddress.fromParts(["src", "2"]),
});
});
it("yields appropriate filter intersection", () => {
expectConsistentEdges({
srcPrefix: NodeAddress.fromParts(["src", "1"]),
dstPrefix: NodeAddress.fromParts(["dst", "2"]),
});
});
});
});
describe("neighbors", () => {
const allNeighbors = () => ({
direction: Direction.ANY,
nodePrefix: NodeAddress.empty,
edgePrefix: EdgeAddress.empty,
});
it("is an error to call neighbors after modifying the underlying graph", () => {
const pg = examplePagerankGraph();
pg.graph().addNode(partsNode(["foomfazzle"]));
expect(() =>
pg.neighbors(NodeAddress.fromParts(["src"]), allNeighbors())
).toThrowError("has been modified");
});
it("it is an error to call neighbors on a non-existent node", () => {
const pg = examplePagerankGraph();
expect(() =>
pg.neighbors(NodeAddress.fromParts(["foomfazzle"]), allNeighbors())
).toThrowError("non-existent node");
});
it("neighbors returns results consistent with Graph.neighbors", () => {
const directions = [Direction.IN, Direction.ANY, Direction.OUT];
const nodePrefixes = [
NodeAddress.empty,
NodeAddress.fromParts(["src"]),
NodeAddress.fromParts(["nonexistent"]),
];
const edgePrefixes = [
EdgeAddress.empty,
EdgeAddress.fromParts(["hom"]),
EdgeAddress.fromParts(["nonexistent"]),
];
const targets = [
NodeAddress.fromParts(["src"]),
NodeAddress.fromParts(["loop"]),
];
const graph = advancedGraph().graph1();
const pagerankGraph = new PagerankGraph(graph, defaultEvaluator);
for (const direction of directions) {
for (const nodePrefix of nodePrefixes) {
for (const edgePrefix of edgePrefixes) {
for (const target of targets) {
const options = {direction, nodePrefix, edgePrefix};
const prgNeighbors = Array.from(
pagerankGraph.neighbors(target, options)
);
const gNeighbors = Array.from(graph.neighbors(target, options));
const reducedPrgNeighbors = prgNeighbors.map((s) => ({
node: s.scoredNode.node,
edge: s.weightedEdge.edge,
}));
expect(gNeighbors).toEqual(reducedPrgNeighbors);
}
}
}
}
});
});
describe("score decomposition", () => {
const allNeighbors = () => ({
direction: Direction.ANY,
nodePrefix: NodeAddress.empty,
edgePrefix: EdgeAddress.empty,
});
it("neighbor's scored contributions are computed correctly", async () => {
const pg = await convergedPagerankGraph();
for (const {node} of pg.nodes()) {
const target = node.address;
for (const {
scoredNode,
weightedEdge,
scoreContribution,
} of pg.neighbors(target, allNeighbors())) {
let rawWeight = 0;
if (weightedEdge.edge.dst === target) {
rawWeight += weightedEdge.weight.forwards;
}
if (weightedEdge.edge.src === target) {
rawWeight += weightedEdge.weight.backwards;
}
const normalizedWeight =
rawWeight / pg.totalOutWeight(scoredNode.node.address);
expect(scoreContribution).toEqual(
scoredNode.score * normalizedWeight
);
}
}
});
it("synthetic score contributions are computed correctly", async () => {
const pg = await convergedPagerankGraph();
for (const {node, score} of pg.nodes()) {
const {address} = node;
expect(pg.syntheticLoopScoreContribution(address)).toEqual(
(score * pg.syntheticLoopWeight()) / pg.totalOutWeight(address)
);
}
});
it("neighbors score contributions + synthetic score contribution == node score", async () => {
// Note: I've verified that this test fails if we don't properly handle loop
// neighbors (need to add the edge forwards and backwards if the neighbor
// is a loop).
const pg = await convergedPagerankGraph();
for (const {node, score} of pg.nodes()) {
// We need to include the score that came from the synthetic loop edge
// (should be near zero for non-isolated nodes)
let summedScoreContributions: number = pg.syntheticLoopScoreContribution(
node.address
);
for (const scoredNeighbor of pg.neighbors(
node.address,
allNeighbors()
)) {
summedScoreContributions += scoredNeighbor.scoreContribution;
}
expect(summedScoreContributions).toBeCloseTo(score);
}
});
});
describe("runPagerank", () => {
// The mathematical semantics of PageRank are thoroughly tested
// in the markovChain module. The goal for these tests is just
// to make sure that the API calls are glued together properly,
// so it's mostly option + sanity checking
function checkUniformDistribution(pg: PagerankGraph) {
const nodes = Array.from(pg.nodes());
for (const {score} of nodes) {
expect(score).toEqual(1 / nodes.length);
}
}
function checkProbabilityDistribution(pg: PagerankGraph) {
let total = 0;
for (const {score} of pg.nodes()) {
expect(score).toBeGreaterThanOrEqual(0);
expect(score).toBeLessThanOrEqual(1);
total += score;
}
expect(total).toBeCloseTo(1);
}
it("runs PageRank with default options if not specified", () => {
const pg1 = examplePagerankGraph();
const pg2 = examplePagerankGraph();
const pg3 = examplePagerankGraph();
pg1.runPagerank();
pg2.runPagerank({});
pg3.runPagerank({
maxIterations: DEFAULT_MAX_ITERATIONS,
convergenceThreshold: DEFAULT_CONVERGENCE_THRESHOLD,
alpha: DEFAULT_ALPHA,
seed: DEFAULT_SEED(),
});
expect(pg1.equals(pg2)).toBe(true);
expect(pg1.equals(pg3)).toBe(true);
});
describe("alpha and seed parameters", () => {
// The logic for seeded PageRank (and for generating the seed distribution via weights)
// are both thoroughly unit-tested. Therefore, these tests only sanity check that the
// parameters are getting consumed properly based on easily tested properties.
it("seed is irrelevant if alpha is 0", async () => {
const pg1 = examplePagerankGraph();
const pg2 = examplePagerankGraph();
const {nodes} = advancedGraph();
const seed1 = new Map().set(nodes.src.address, 1);
const seed2 = new Map().set(nodes.dst.address, 1);
await pg1.runPagerank({seed: seed1, alpha: 0});
await pg2.runPagerank({seed: seed2, alpha: 0});
expect(pg1.equals(pg2)).toBe(true);
});
it("seed is returned directly if alpha is 1", async () => {
const pg = examplePagerankGraph();
const src = advancedGraph().nodes.src;
const seed = new Map().set(src.address, 1);
await pg.runPagerank({seed, alpha: 1});
const score = NullUtil.get(pg.node(src.address)).score;
expect(score).toBe(1);
});
});
it("promise rejects if the graph was modified", async () => {
const pg = examplePagerankGraph();
pg.graph().addNode(node("foo"));
expect(
pg.runPagerank({maxIterations: 1, convergenceThreshold: 1})
).rejects.toThrow("underlying Graph has been modified");
// It's possible that you could avoid the rejection if you
// make the modification after calling runPagerank (but before
// promise resolves). However, since every getter also checks
// for modification, this is not a serious issue.
});
it("scores are a uniform distribution prior to running PageRank", () => {
checkUniformDistribution(examplePagerankGraph());
});
it("respects maxIterations==0", async () => {
const pg = examplePagerankGraph();
const results = await pg.runPagerank({
maxIterations: 0,
convergenceThreshold: 0,
});
expect(results.convergenceDelta).toBeGreaterThan(0);
checkUniformDistribution(pg);
});
it("will limit at max iterations when convergence threshld is low", async () => {
const pg = examplePagerankGraph();
const convergenceThreshold = 1e-18;
const results = await pg.runPagerank({
maxIterations: 17,
convergenceThreshold,
});
expect(results.convergenceDelta).toBeGreaterThan(convergenceThreshold);
checkProbabilityDistribution(pg);
});
it("will converge when threshold is high", async () => {
const pg = examplePagerankGraph();
const convergenceThreshold = 0.01;
const results = await pg.runPagerank({
maxIterations: 170,
convergenceThreshold,
});
expect(results.convergenceDelta).toBeLessThan(convergenceThreshold);
checkProbabilityDistribution(pg);
});
it("re-uses existing scores as a starting point", async () => {
const pg = examplePagerankGraph();
const convergenceThreshold = 0.001;
const results1 = await pg.runPagerank({
maxIterations: 170,
convergenceThreshold,
});
expect(results1.convergenceDelta).toBeLessThan(convergenceThreshold);
// It should still converge without any iterations, because it uses the
// final distribution as a starting point
const results2 = await pg.runPagerank({
maxIterations: 0,
convergenceThreshold,
});
expect(results2.convergenceDelta).toEqual(results1.convergenceDelta);
});
});
describe("equals", () => {
it("PagerankGraph is equal to itself", () => {
const pg = examplePagerankGraph();
expect(pg.equals(pg)).toBe(true);
});
it("two identicalPagerankGraphs are equal", () => {
const pg1 = examplePagerankGraph();
const pg2 = examplePagerankGraph();
expect(pg1.equals(pg2)).toBe(true);
});
it("unequal syntheticLoopWeight => unequal", () => {
const pg1 = new PagerankGraph(nonEmptyGraph(), defaultEvaluator, 0.1);
const pg2 = new PagerankGraph(nonEmptyGraph(), defaultEvaluator, 0.2);
expect(pg1.equals(pg2)).toBe(false);
});
it("unequal graph => unequal", () => {
const pg1 = new PagerankGraph(nonEmptyGraph(), defaultEvaluator, 0.1);
const g2 = nonEmptyGraph().addNode(node("foo"));
const pg2 = new PagerankGraph(g2, defaultEvaluator, 0.1);
expect(pg1.equals(pg2)).toBe(false);
});
it("unequal scores => unequal", async () => {
const pg1 = examplePagerankGraph();
const pg2 = examplePagerankGraph();
await pg1.runPagerank({maxIterations: 2, convergenceThreshold: 0.001});
expect(pg1.equals(pg2)).toBe(false);
});
it("unequal edge weights => unequal", () => {
const evaluator1 = (_unused_edge) => ({forwards: 1, backwards: 1});
const evaluator2 = (_unused_edge) => ({forwards: 0, backwards: 1});
const pg1 = new PagerankGraph(advancedGraph().graph1(), evaluator1);
const pg2 = new PagerankGraph(advancedGraph().graph1(), evaluator2);
expect(pg1.equals(pg2)).toBe(false);
});
it("different modification history => still equal", () => {
// advancedGraph.graph1 and graph2 are identical except for their
// construction history
const pg1 = new PagerankGraph(advancedGraph().graph1(), defaultEvaluator);
const pg2 = new PagerankGraph(advancedGraph().graph2(), defaultEvaluator);
expect(pg1.equals(pg2)).toBe(true);
});
it("throws an error if comparing PagerankGraph to non-PagerankGraph", () => {
const pg = examplePagerankGraph();
const g = new Graph();
// $ExpectFlowError
expect(() => pg.equals(g)).toThrowError("Expected PagerankGraph");
});
it("throws an error if the underlying graph is modified", () => {
const pg = examplePagerankGraph();
pg.graph().addNode(node("modification"));
expect(() => pg.equals(pg)).toThrowError("has been modified");
});
});
describe("to/from JSON", () => {
it("to->fro is identity", async () => {
const pg = await convergedPagerankGraph();
const pgJSON = pg.toJSON();
const pg_ = PagerankGraph.fromJSON(pgJSON);
expect(pg.equals(pg_)).toBe(true);
});
it("fro->to is identity", async () => {
const pg = await convergedPagerankGraph();
const pgJSON = pg.toJSON();
const pg_ = PagerankGraph.fromJSON(pgJSON);
const pgJSON_ = pg_.toJSON();
expect(pgJSON).toEqual(pgJSON_);
});
it("is canonical with respect to the graph's history", async () => {
const pg1 = new PagerankGraph(advancedGraph().graph1(), defaultEvaluator);
const pg2 = new PagerankGraph(advancedGraph().graph2(), defaultEvaluator);
const pg1JSON = pg1.toJSON();
const pg2JSON = pg2.toJSON();
expect(pg1JSON).toEqual(pg2JSON);
});
it("matches expected snapshot", () => {
const pgJSON = examplePagerankGraph().toJSON();
expect(pgJSON).toMatchSnapshot();
});
});
});