Start work on the PagerankGraph (#1057)

* Start work on the PagerankGraph

This commit begins work on the `PagerankGraph` class, as described in
[#1020]. As of this commit, the `PagerankGraph` has basic functionality
like retrieving nodes and edges, and running PageRank. However, it is
missing utility functionality like equality testing and serialization,
and doesn't yet have score decomposition logic.

This was mostly produced during a [live coding session]. Thanks to
@BrianLitwin, @anthrocypher, and @wchargin for participating.

Test plan:
The new code is thoroughly unit tested. Please review the test coverage,
and also the quality of the documentation.

[#1020]: https://github.com/sourcecred/sourcecred/issues/1020
[live coding session]: https://github.com/sourcecred/mission/issues/14

* Improvements from self-review

- Don't allow PRG around empty graph, as there's no way to make it
a valid probability distribution

* Add issue ref in TODOs
This commit is contained in:
Dandelion Mané 2019-02-14 11:24:35 -07:00 committed by GitHub
parent dcda8bde1d
commit b51491ce1a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 459 additions and 0 deletions

281
src/core/pagerankGraph.js Normal file
View File

@ -0,0 +1,281 @@
// @flow
import {Graph, type Edge, type NodeAddressT, type EdgeAddressT} from "./graph";
import {
distributionToNodeDistribution,
createConnections,
createOrderedSparseMarkovChain,
type EdgeWeight,
} from "./attribution/graphToMarkovChain";
import {findStationaryDistribution} from "../core/attribution/markovChain";
import * as NullUtil from "../util/null";
export type {EdgeWeight} from "./attribution/graphToMarkovChain";
// A function that assigns an EdgeWeight to an Edge. The PagerankGraph
// constructor calls it once for every edge in the graph to obtain that
// edge's initial weight.
export type EdgeEvaluator = (Edge) => EdgeWeight;
// A node address paired with the score that a PagerankGraph holds for it.
// This is the shape returned by `PagerankGraph.node` and yielded by
// `PagerankGraph.nodes`.
export type ScoredNode = {|
+node: NodeAddressT,
+score: number,
|};
// An edge paired with the EdgeWeight that a PagerankGraph holds for it.
// This is the shape returned by `PagerankGraph.edge` and yielded by
// `PagerankGraph.edges`.
export type WeightedEdge = {|
+edge: Edge,
+weight: EdgeWeight,
|};
/**
* Options to control how PageRank runs and when it stops.
*
* `PagerankGraph.runPagerank` forwards both fields unchanged to
* `findStationaryDistribution`.
*/
export type PagerankConvergenceOptions = {|
// Maximum number of iterations before we give up on PageRank convergence
+maxIterations: number,
// PageRank will stop running once the diff between the previous iteration
// and the latest is less than this threshold
+convergenceThreshold: number,
|};
/**
* Report produced by `PagerankGraph.runPagerank` (delivered through the
* returned Promise), describing how close the run came to convergence.
*/
export type PagerankConvergenceReport = {|
// A quantitative measure of how close to convergence the final distribution was.
// Ideally, this value should be near zero.
// It shows the maximum absolute-valued change of any entry in the distribution
// if one more Markov action is taken.
+convergenceDelta: number,
|};
/**
* The synthetic loop weight that the PagerankGraph constructor falls back
* to when the caller does not provide one.
*/
export const DEFAULT_SYNTHETIC_LOOP_WEIGHT = 1e-3;
/**
* PagerankGraph is a wrapper over the Graph class, which adds
* the ability to run PageRank to compute scores on the Graph.
*
* Every node in the underlying Graph is assigned a numerical score in
* the range [0, 1]. Provided that there are any nodes, the sum of all
* the scores will be 1 (i.e. the scores are a probability
* distribution). The scores are assigned by the [PageRank] algorithm;
* i.e. a node receives score in proportion to the score of its
* neighbors. When the PagerankGraph is first constructed, the scores
* are initialized to a uniform distribution.
*
* [PageRank]: https://en.wikipedia.org/wiki/PageRank
*
* Every edge in the Graph is assigned an `EdgeWeight`, which includes a
* `toWeight` (weight from the `src` to the `dst`) and a `froWeight`
* (weight from the `dst` back to the `src`). Both `toWeight` and
* `froWeight` must be nonnegative numbers. The weights influence how
* score flows from node to node. For example, if the node `root` is
* connected to `a` with a weight of `1` and to `b` with a weight of `2`,
* then `b` will receive twice as much score from `root` as `a` does.
*
* Every node in the PagerankGraph has an associated `score`. Provided
* that the graph has at least one node, the scores are a probability
* distribution over the nodes; i.e. every score is in the range [0,1]
* and the scores sum to 1.
*
* This class is intended to closely mirror the Graph API so as to
* present a consistent and familiar interface.
*
* At present, PagerankGraph does not support any modification to the
* underlying Graph; doing so will invalidate PagerankGraph and cause
* its methods to throw errors.
*/
export class PagerankGraph {
// The Graph backing this PagerankGraph
_graph: Graph;
// The score for each Node in the Graph
_scores: Map<NodeAddressT, number>;
// The EdgeWeight for each Edge in the Graph
_edgeWeights: Map<EdgeAddressT, EdgeWeight>;
// Weight used to connect nodes to themselves, to avoid isolated
// nodes.
_syntheticLoopWeight: number;
// Modification count of the underlying Graph. Used to determine
// when this PageRankGraph is in an invalid state (due to changes
// to the graph backing it).
_graphModificationCount: number;
/**
* Constructs a new PagerankGraph.
*
* Note that constructing a PagerankGraph around an empty graph is illegal,
* as it is impossible to define a probability distribution over zero
* nodes.
*/
constructor(
// The Graph backing this PagerankGraph. Must not be empty.
graph: Graph,
// Provides the initial EdgeWeight for every edge
edgeEvaluator: EdgeEvaluator,
// The weight we use to connect every node to itself
// to ensure there are no isolated nodes. Defaults to
// DEFAULT_SYNTHETIC_LOOP_WEIGHT.
syntheticLoopWeight: ?number
): void {
if (graph.equals(new Graph())) {
throw new Error("Cannot construct PagerankGraph with empty graph.");
}
this._graph = graph;
this._graphModificationCount = graph.modificationCount();
this._syntheticLoopWeight = NullUtil.orElse(
syntheticLoopWeight,
DEFAULT_SYNTHETIC_LOOP_WEIGHT
);
if (this._syntheticLoopWeight <= 0) {
throw new Error("syntheticLoopWeight must be > 0");
}
// Initialize scores to the uniform distribution over every node
this._scores = new Map();
const graphNodes = Array.from(this._graph.nodes());
for (const node of graphNodes) {
this._scores.set(node, 1 / graphNodes.length);
}
this._edgeWeights = new Map();
for (const edge of this._graph.edges()) {
this._edgeWeights.set(edge.address, edgeEvaluator(edge));
}
}
/**
* Retrieves the Graph backing this PagerankGraph.
*/
graph(): Graph {
this._verifyGraphNotModified();
return this._graph;
}
/**
* Returns the PagerankGraph's synthetic loop weight.
*
* The synthetic loop weight simulates a "phantom loop" connecting
* every node to itself. This ensures that every node has at least
* one outgoing connection, so that the corresponding markov chain
* used for PageRank is well-defined.
*
* In general, the synthetic loop weight should be quite small.
* By default, we set it to 1e-3.
*/
syntheticLoopWeight(): number {
return this._syntheticLoopWeight;
}
*_nodesIterator(): Iterator<ScoredNode> {
for (const node of this._graph.nodes()) {
const score = NullUtil.get(this._scores.get(node));
yield {node, score};
}
}
/**
* Provides node and score for every node in the underlying graph.
*
* TODO(#1020): Allow optional filtering, as in Graph.nodes.
*/
nodes(): Iterator<ScoredNode> {
this._verifyGraphNotModified();
return this._nodesIterator();
}
/**
* Retrieve a node from the graph, along with its score.
*
* TODO(#1020): Allow optional filtering, as in Graph.node.
*/
node(x: NodeAddressT): ?ScoredNode {
this._verifyGraphNotModified();
const score = this._scores.get(x);
if (score == null) {
return null;
} else {
return {node: x, score};
}
}
/**
* Provides edge and weight for every edge in the underlying graph.
*
* TODO(#1020): Allow optional filtering, as in Graph.edges.
*/
edges(): Iterator<WeightedEdge> {
this._verifyGraphNotModified();
return this._edgesIterator();
}
*_edgesIterator(): Iterator<WeightedEdge> {
for (const edge of this._graph.edges()) {
const weight = NullUtil.get(this._edgeWeights.get(edge.address));
yield {edge, weight};
}
}
/**
* Provides the edge and weight for a particular edge, if present.
*
* TODO(#1020): Allow optional filtering, as in Graph.edge.
*/
edge(a: EdgeAddressT): ?WeightedEdge {
this._verifyGraphNotModified();
const edge = this._graph.edge(a);
if (edge != null) {
const weight = NullUtil.get(this._edgeWeights.get(edge.address));
return {edge, weight};
}
return null;
}
/**
* Asynchronously run PageRank to re-compute scores.
*
* Calling this method constructs a [Markov Chain] corresponding
* to the underlying graph and its associated edge weights,
* and then iteratively converges to the stationary distribution
* of that chain, according to the [PageRank algorithm].
*
* [Markov Chain]: https://brilliant.org/wiki/markov-chains/
* [PageRank algorithm]: https://en.wikipedia.org/wiki/PageRank
*
* The `PagerankConvergenceOptions` gives guidance on how to run
* PageRank. PageRank will continue running until either
* `options.maxIterations` has been exceeded, or until the largest
* individual delta in a node's score between the present and previous
* iteration is less than or equal to `options.convergenceThreshold`.
*
* TODO(#1020): Make `runPagerank` use the current nodes' scores as a
* starting point for computation, rather than re-generating from
* scratch every time `runPagerank` is called.
*/
async runPagerank(
options: PagerankConvergenceOptions
): Promise<PagerankConvergenceReport> {
this._verifyGraphNotModified();
const edgeEvaluator = (x: Edge) =>
NullUtil.get(this._edgeWeights.get(x.address));
const connections = createConnections(
this._graph,
edgeEvaluator,
this._syntheticLoopWeight
);
const osmc = createOrderedSparseMarkovChain(connections);
const distributionResult = await findStationaryDistribution(osmc.chain, {
verbose: false,
convergenceThreshold: options.convergenceThreshold,
maxIterations: options.maxIterations,
yieldAfterMs: 30,
});
this._scores = distributionToNodeDistribution(
osmc.nodeOrder,
distributionResult.pi
);
return {
convergenceDelta: distributionResult.convergenceDelta,
};
}
_verifyGraphNotModified() {
if (this._graph.modificationCount() !== this._graphModificationCount) {
throw new Error(
"Error: The PagerankGraph's underlying Graph has been modified."
);
}
}
}

View File

@ -0,0 +1,178 @@
// @flow
import sortBy from "lodash.sortby";
import {Graph, NodeAddress, EdgeAddress, type Edge} from "./graph";
import {PagerankGraph} from "./pagerankGraph";
import {advancedGraph} from "./graphTestUtil";
import * as NullUtil from "../util/null";
describe("core/pagerankGraph", () => {
// Edge evaluator shared by these tests: whatever edge it is given, it
// answers with a forward weight of 1 and a backward weight of 0.
const defaultEvaluator = (_unused_edge) => {
  return {toWeight: 1, froWeight: 0};
};
// Builds a fresh Graph holding a single node, for tests that only need
// some graph that the PagerankGraph constructor will accept.
const nonEmptyGraph = () => {
  const graph = new Graph();
  return graph.addNode(NodeAddress.fromParts(["hi"]));
};
it("cannot construct PagerankGraph with empty Graph", () => {
  // One graph that never had any nodes...
  const neverPopulated = new Graph();
  // ...and one that is empty again after a node was added and removed.
  const emptiedOut = new Graph()
    .addNode(NodeAddress.empty)
    .removeNode(NodeAddress.empty);
  for (const emptyGraph of [neverPopulated, emptiedOut]) {
    const construct = () => new PagerankGraph(emptyGraph, defaultEvaluator);
    expect(construct).toThrowError("empty graph");
  }
});
// Tests for the node-level accessors: `node` (single lookup) and `nodes`
// (iteration over every scored node).
describe("node / nodes", () => {
  it("node returns null for node not in the graph", () => {
    const pagerankGraph = new PagerankGraph(nonEmptyGraph(), defaultEvaluator);
    const lookup = pagerankGraph.node(NodeAddress.empty);
    expect(lookup).toEqual(null);
  });
  it("nodes yields the same nodes as are in the graph", () => {
    const graph = advancedGraph().graph1();
    const pagerankGraph = new PagerankGraph(graph, defaultEvaluator);
    const expectedAddresses = Array.from(graph.nodes());
    const actualAddresses = [];
    for (const scored of pagerankGraph.nodes()) {
      actualAddresses.push(scored.node);
    }
    // Compare both the count and the set, so duplicates would be caught.
    expect(expectedAddresses.length).toEqual(actualAddresses.length);
    expect(new Set(expectedAddresses)).toEqual(new Set(actualAddresses));
  });
  it("node and nodes both return consistent scores", async () => {
    const pagerankGraph = new PagerankGraph(
      advancedGraph().graph1(),
      defaultEvaluator
    );
    // Run one iteration so the scores are no longer the initial uniform ones.
    await pagerankGraph.runPagerank({
      maxIterations: 1,
      convergenceThreshold: 0.001,
    });
    for (const fromIteration of pagerankGraph.nodes()) {
      const fromLookup = NullUtil.get(pagerankGraph.node(fromIteration.node));
      expect(fromIteration.score).toEqual(fromLookup.score);
    }
  });
  it("node and nodes both throw an error if underlying graph is modified", () => {
    const pagerankGraph = new PagerankGraph(nonEmptyGraph(), defaultEvaluator);
    pagerankGraph.graph().addNode(NodeAddress.empty);
    const expectedMessage = "underlying Graph has been modified";
    expect(() => pagerankGraph.nodes()).toThrowError(expectedMessage);
    expect(() => pagerankGraph.node(NodeAddress.empty)).toThrowError(
      expectedMessage
    );
  });
});
describe("edge/edges", () => {
it("edges returns the same edges as are in the graph", () => {
const g = advancedGraph().graph1();
const pg = new PagerankGraph(g, defaultEvaluator);
const graphEdges = Array.from(g.edges());
const pgEdges = Array.from(pg.edges()).map((x) => x.edge);
expect(graphEdges.length).toEqual(pgEdges.length);
const addressAccessor = (x: Edge) => x.address;
const sortedGraphEdges = sortBy(graphEdges, addressAccessor);
const sortedPagerankEdges = sortBy(pgEdges, addressAccessor);
expect(sortedGraphEdges).toEqual(sortedPagerankEdges);
});
it("edge/edges both correctly return the edge weights", () => {
const edgeEvaluator = ({address, src, dst}) => {
return {
toWeight: address.length + src.length,
froWeight: address.length + dst.length,
};
};
const g = advancedGraph().graph1();
const pg = new PagerankGraph(g, edgeEvaluator);
for (const {edge, weight} of pg.edges()) {
expect(edgeEvaluator(edge)).toEqual(weight);
expect(NullUtil.get(pg.edge(edge.address)).weight).toEqual(weight);
}
});
it("edge returns null for address not in the graph", () => {
const pg = new PagerankGraph(nonEmptyGraph(), defaultEvaluator);
expect(pg.edge(EdgeAddress.empty)).toEqual(null);
});
it("edge and edges both throw an error if underlying graph is modified", () => {
const pg = new PagerankGraph(nonEmptyGraph(), defaultEvaluator);
pg.graph().addNode(NodeAddress.empty);
expect(() => pg.edges()).toThrowError(
"underlying Graph has been modified"
);
expect(() => pg.edge(EdgeAddress.empty)).toThrowError(
"underlying Graph has been modified"
);
});
});
describe("runPagerank", () => {
// The mathematical semantics of PageRank are thoroughly tested
// in the markovChain module. The goal for these tests is just
// to make sure that the API calls are glued together properly,
// so it's mostly option + sanity checking
function checkUniformDistribution(pg: PagerankGraph) {
const nodes = Array.from(pg.nodes());
for (const {score} of nodes) {
expect(score).toEqual(1 / nodes.length);
}
}
function checkProbabilityDistribution(pg: PagerankGraph) {
let total = 0;
for (const {score} of pg.nodes()) {
expect(score).toBeGreaterThanOrEqual(0);
expect(score).toBeLessThanOrEqual(1);
total += score;
}
expect(total).toBeCloseTo(1);
}
function examplePagerankGraph() {
const g = advancedGraph().graph1();
return new PagerankGraph(g, defaultEvaluator);
}
it("promise rejects if the graph was modified", async () => {
const pg = examplePagerankGraph();
pg.graph().addNode(NodeAddress.empty);
expect(
pg.runPagerank({maxIterations: 1, convergenceThreshold: 1})
).rejects.toThrow("underlying Graph has been modified");
// It's possible that you could avoid the rejection if you
// make the modification after calling runPagerank (but before
// promise resolves). However, since every getter also checks
// for modification, this is not a serious issue.
});
it("scores are a uniform distribution prior to running PageRank", () => {
checkUniformDistribution(examplePagerankGraph());
});
it("respects maxIterations==0", async () => {
const pg = examplePagerankGraph();
const results = await pg.runPagerank({
maxIterations: 0,
convergenceThreshold: 0,
});
expect(results.convergenceDelta).toBeGreaterThan(0);
checkUniformDistribution(pg);
});
it("will limit at max iterations when convergence threshld is low", async () => {
const pg = examplePagerankGraph();
const convergenceThreshold = 1e-18;
const results = await pg.runPagerank({
maxIterations: 17,
convergenceThreshold,
});
expect(results.convergenceDelta).toBeGreaterThan(convergenceThreshold);
checkProbabilityDistribution(pg);
});
it("will converge when threshold is high", async () => {
const pg = examplePagerankGraph();
const convergenceThreshold = 0.01;
const results = await pg.runPagerank({
maxIterations: 170,
convergenceThreshold,
});
expect(results.convergenceDelta).toBeLessThan(convergenceThreshold);
checkProbabilityDistribution(pg);
});
});
});