From 79017a477b5ae802995126ceab0e8e2b4c677edc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dandelion=20Man=C3=A9?= Date: Sun, 5 May 2019 18:57:41 +0300 Subject: [PATCH] Add support for seed vectors to PagerankGraph (#1135) This commit modifies `PagerankGraph.runPagerank` so that the user can provide an alpha and seed vector. The seed vector is specified via a map of weights, which will be normalized into a probability distribution over all the nodes in the graph. In the event that the map is empty (or the total weight is otherwise 0), a uniform distribution is created. To effect this change, a helper function called `weightedDistribution` has been added (and thoroughly tested) in the `graphToMarkovChain` module. Then, that function is used in `pagerankGraph.runPagerank` (along with light testing). Currently, the default alpha is set to 0, to ensure consistency with the legacy pagerank implementation in `analysis/pagerank`. Once that has been replaced with `PagerankGraph`, we can consider changing the default alpha to non-zero (thus removing the need for synthetic self-loops). I took a different approach in the [odyssey-hackathon repo][commit]. The previous approach was a much more complicated (and fairly redundant) API, that allowed specifying "NO_SEED", "UNIFORM_SEED", "SELECTED_SEED", and "SPECIFIED_SEED". I'm much happier with this API and implementation. [commit]: https://github.com/sourcecred/odyssey-hackathon/commit/ed07861073f1a5c803a378b39515b338b80693eb Test plan: Unit tests included; run `yarn test`. 
--- src/core/attribution/graphToMarkovChain.js | 56 ++++++++++++++- .../attribution/graphToMarkovChain.test.js | 69 +++++++++++++++++++ src/core/pagerankGraph.js | 31 ++++++++- src/core/pagerankGraph.test.js | 29 ++++++++ 4 files changed, 181 insertions(+), 4 deletions(-) diff --git a/src/core/attribution/graphToMarkovChain.js b/src/core/attribution/graphToMarkovChain.js index 3788adc..d5ee40e 100644 --- a/src/core/attribution/graphToMarkovChain.js +++ b/src/core/attribution/graphToMarkovChain.js @@ -1,7 +1,11 @@ // @flow -import {type Edge, type Graph, type NodeAddressT} from "../graph"; -import type {Distribution, SparseMarkovChain} from "./markovChain"; +import {type Edge, type Graph, type NodeAddressT, NodeAddress} from "../graph"; +import { + type Distribution, + type SparseMarkovChain, + uniformDistribution, +} from "./markovChain"; import * as MapUtil from "../../util/map"; import * as NullUtil from "../../util/null"; @@ -31,6 +35,54 @@ export function adjacencySource(target: NodeAddressT, adjacency: Adjacency) { } } +/** + * Create a Distribution using provided node weights. + * + * weightedDistribution takes in a node order (as a read only array of NodeAddressT), + * and a map providing weights for a subset of those nodes. It returns a Distribution + * with the invariant that every node's weight is proportional to its relative weight + * in the weights map. For example, in a case where there were three nodes and they + * had weights of 0, 1, and 3 respectively, the distribution would be [0, 0.25, 0.75]. + * + * If a node address is not present in the weight map, its weight is assumed to be 0. + * If any weight is negative or non-finite, an error will be thrown. + * If the sum of all weights is 0, then a uniform distribution will be returned. + * If the weight map assigned weight to nodes which are not in the node order, an error + * will be thrown. 
+ */ +export function weightedDistribution( + nodeOrder: $ReadOnlyArray, + weights: Map +): Distribution { + let totalWeight = 0; + for (const [address, weight] of weights.entries()) { + if (weight < 0 || !isFinite(weight)) { + throw new Error( + `Invalid weight ${weight} associated with address ${NodeAddress.toString( + address + )}` + ); + } + totalWeight += weight; + } + if (totalWeight === 0) { + return uniformDistribution(nodeOrder.length); + } + let numEncounteredWeights = 0; + const distribution = new Float64Array(nodeOrder.length); + for (let i = 0; i < distribution.length; i++) { + const weight = weights.get(nodeOrder[i]); + if (weight != null) { + numEncounteredWeights++; + distribution[i] = weight / totalWeight; + } + } + if (numEncounteredWeights !== weights.size) { + throw new Error("weights included nodes not present in the nodeOrder"); + } + return distribution; +} + export type NodeDistribution = Map; export type NodeToConnections = Map>; diff --git a/src/core/attribution/graphToMarkovChain.test.js b/src/core/attribution/graphToMarkovChain.test.js index d5ffc03..d4dab90 100644 --- a/src/core/attribution/graphToMarkovChain.test.js +++ b/src/core/attribution/graphToMarkovChain.test.js @@ -10,6 +10,7 @@ import { normalize, normalizeNeighbors, permute, + weightedDistribution, } from "./graphToMarkovChain"; import * as MapUtil from "../../util/map"; @@ -288,4 +289,72 @@ describe("core/attribution/graphToMarkovChain", () => { ); }); }); + + describe("weightedDistribution", () => { + const a = NodeAddress.fromParts(["a"]); + const b = NodeAddress.fromParts(["b"]); + const c = NodeAddress.fromParts(["c"]); + const d = NodeAddress.fromParts(["d"]); + const order = () => [a, b, c, d]; + it("gives a uniform distribution for an empty map", () => { + expect(weightedDistribution(order(), new Map())).toEqual( + new Float64Array([0.25, 0.25, 0.25, 0.25]) + ); + }); + it("gives a uniform distribution for a map with 0 weight", () => { + const map = new Map().set(a, 
0); + expect(weightedDistribution(order(), map)).toEqual( + new Float64Array([0.25, 0.25, 0.25, 0.25]) + ); + }); + it("can put all weight on one node", () => { + const map = new Map().set(b, 0.1); + expect(weightedDistribution(order(), map)).toEqual( + new Float64Array([0, 1, 0, 0]) + ); + }); + it("can split weight unequally", () => { + const map = new Map().set(b, 1).set(c, 3); + expect(weightedDistribution(order(), map)).toEqual( + new Float64Array([0, 0.25, 0.75, 0]) + ); + }); + it("can create a uniform distribution if all weights are equal", () => { + const map = new Map() + .set(a, 1) + .set(b, 1) + .set(c, 1) + .set(d, 1); + expect(weightedDistribution(order(), map)).toEqual( + new Float64Array([0.25, 0.25, 0.25, 0.25]) + ); + }); + describe("errors if", () => { + it("has a weighted node that is not in the order", () => { + const z = NodeAddress.fromParts(["z"]); + const map = new Map().set(z, 1); + expect(() => weightedDistribution(order(), map)).toThrowError( + "weights included nodes not present in the nodeOrder" + ); + }); + it("has a node with negative weight", () => { + const map = new Map().set(a, -1); + expect(() => weightedDistribution(order(), map)).toThrowError( + "Invalid weight -1" + ); + }); + it("has a node with NaN weight", () => { + const map = new Map().set(a, NaN); + expect(() => weightedDistribution(order(), map)).toThrowError( + "Invalid weight NaN" + ); + }); + it("has a node with infinite weight", () => { + const map = new Map().set(a, Infinity); + expect(() => weightedDistribution(order(), map)).toThrowError( + "Invalid weight Infinity" + ); + }); + }); + }); }); diff --git a/src/core/pagerankGraph.js b/src/core/pagerankGraph.js index 1ae75ca..2fcbf87 100644 --- a/src/core/pagerankGraph.js +++ b/src/core/pagerankGraph.js @@ -20,6 +20,7 @@ import { createConnections, createOrderedSparseMarkovChain, type EdgeWeight, + weightedDistribution, } from "./attribution/graphToMarkovChain"; import { findStationaryDistribution, @@ -75,10 +76,30 
@@ export type PagerankOptions = {| // Maximum number of iterations before we give up on PageRank Convergence // Defaults to DEFAULT_MAX_ITERATIONS if not provided. +maxIterations?: number, + // PageRank will stop running once the diff between the previous iteration // and the latest is less than this threshold. // Defaults to DEFAULT_CONVERGENCE_THRESHOLD if not provided. +convergenceThreshold?: number, + + // Specifies a seed vector for PageRank "teleportation". + // At every step, some proportion `alpha` of the weight will + // teleport to the seed. + // + // The seed is specified as a map from node addresses to weights. + // The resultant seed will be a proper distribution over all the graph's available + // nodes, with each node's weight proportional to its weight in the seed. In the case + // that the total weight in the seed is 0 (e.g. an empty map was passed), then the + // seed vector will be a uniform distribution. + // + // Specifying any negative, NaN, or infinite weights is an error. + // Specifying weights for nodes that are not in the graph is also an error. + +seed?: Map, + + // Specifies the probability with which score 'teleports' to the seed vector. + // If alpha=0, then the teleportation never happens. If alpha=1, then PageRank + // always converges to precisely the seed vector. Defaults to DEFAULT_ALPHA. + +alpha?: number, |}; export type PagerankConvergenceReport = {| @@ -92,11 +113,17 @@ export type PagerankConvergenceReport = {| export const DEFAULT_SYNTHETIC_LOOP_WEIGHT = 1e-3; export const DEFAULT_MAX_ITERATIONS = 255; export const DEFAULT_CONVERGENCE_THRESHOLD = 1e-7; +// TODO(@decentralion): Change default alpha to be a small non-zero value +// once we choose an appropriate value. 
+export const DEFAULT_ALPHA = 0; +export const DEFAULT_SEED: () => Map = () => new Map(); function defaultOptions(): PagerankOptions { return { maxIterations: DEFAULT_MAX_ITERATIONS, convergenceThreshold: DEFAULT_CONVERGENCE_THRESHOLD, + alpha: DEFAULT_ALPHA, + seed: DEFAULT_SEED(), }; } @@ -441,8 +468,8 @@ export class PagerankGraph { const osmc = createOrderedSparseMarkovChain(connections); const params: PagerankParams = { chain: osmc.chain, - alpha: 0, - seed: uniformDistribution(osmc.chain.length), + alpha: fullOptions.alpha, + seed: weightedDistribution(osmc.nodeOrder, fullOptions.seed), pi0: uniformDistribution(osmc.chain.length), }; const coreOptions: CorePagerankOptions = { diff --git a/src/core/pagerankGraph.test.js b/src/core/pagerankGraph.test.js index 9f76e97..2e76305 100644 --- a/src/core/pagerankGraph.test.js +++ b/src/core/pagerankGraph.test.js @@ -14,6 +14,8 @@ import { Direction, DEFAULT_MAX_ITERATIONS, DEFAULT_CONVERGENCE_THRESHOLD, + DEFAULT_ALPHA, + DEFAULT_SEED, } from "./pagerankGraph"; import {advancedGraph} from "./graphTestUtil"; import * as NullUtil from "../util/null"; @@ -500,11 +502,38 @@ describe("core/pagerankGraph", () => { pg3.runPagerank({ maxIterations: DEFAULT_MAX_ITERATIONS, convergenceThreshold: DEFAULT_CONVERGENCE_THRESHOLD, + alpha: DEFAULT_ALPHA, + seed: DEFAULT_SEED(), }); expect(pg1.equals(pg2)).toBe(true); expect(pg1.equals(pg3)).toBe(true); }); + describe("alpha and seed parameters", () => { + // The logic for seeded PageRank (and for generating the seed distribution via weights) + // are both thoroughly unit-tested. Therefore, these tests only sanity check that the + // parameters are getting consumed properly based on easily tested properties. 
+ it("seed is irrelevant if alpha is 0", async () => { + const pg1 = examplePagerankGraph(); + const pg2 = examplePagerankGraph(); + const {nodes} = advancedGraph(); + const seed1 = new Map().set(nodes.src(), 1); + const seed2 = new Map().set(nodes.dst(), 1); + await pg1.runPagerank({seed: seed1, alpha: 0}); + await pg2.runPagerank({seed: seed2, alpha: 0}); + expect(pg1.equals(pg2)).toBe(true); + }); + + it("seed is returned directly if alpha is 1", async () => { + const pg = examplePagerankGraph(); + const src = advancedGraph().nodes.src; + const seed = new Map().set(src(), 1); + await pg.runPagerank({seed, alpha: 1}); + const score = NullUtil.get(pg.node(src())).score; + expect(score).toBe(1); + }); + }); + it("promise rejects if the graph was modified", async () => { const pg = examplePagerankGraph(); pg.graph().addNode(NodeAddress.empty);