Add support for seed vectors to PagerankGraph (#1135)

This commit modifies `PagerankGraph.runPagerank` so that the user can
provide an alpha and seed vector. The seed vector is specified via a map
of weights, which will be normalized into a probability distribution
over all the nodes in the graph. In the event that the map is empty (or
the total weight is otherwise 0), a uniform distribution is created.

To effect this change, a helper function called `weightedDistribution`
has been added (and thoroughly tested) in the `graphToMarkovChain`
module. Then, that function is used in `pagerankGraph.runPagerank`
(along with light testing).

Currently, the default alpha is set to 0, to ensure consistency with the
legacy pagerank implementation in `analysis/pagerank`. Once that has
been replaced with `PagerankGraph`, we can consider changing the default
alpha to non-zero (thus removing the need for synthetic self-loops).

I took a different approach in the [odyssey-hackathon repo][commit].
The previous approach was a much more complicated (and fairly redundant)
API that allowed specifying "NO_SEED", "UNIFORM_SEED", "SELECTED_SEED",
and "SPECIFIED_SEED". I'm much happier with this API and implementation.

[commit]: ed07861073

Test plan: Unit tests included; run `yarn test`.
This commit is contained in:
Dandelion Mané 2019-05-05 18:57:41 +03:00 committed by GitHub
parent e7bc025379
commit 79017a477b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 181 additions and 4 deletions

View File

@ -1,7 +1,11 @@
// @flow // @flow
import {type Edge, type Graph, type NodeAddressT} from "../graph"; import {type Edge, type Graph, type NodeAddressT, NodeAddress} from "../graph";
import type {Distribution, SparseMarkovChain} from "./markovChain"; import {
type Distribution,
type SparseMarkovChain,
uniformDistribution,
} from "./markovChain";
import * as MapUtil from "../../util/map"; import * as MapUtil from "../../util/map";
import * as NullUtil from "../../util/null"; import * as NullUtil from "../../util/null";
@ -31,6 +35,54 @@ export function adjacencySource(target: NodeAddressT, adjacency: Adjacency) {
} }
} }
/**
* Create a Distribution using provided node weights.
*
* weightedDistribution takes in a node order (as a read only array of NodeAddressT),
* and a map providing weights for a subset of those nodes. It returns a Distribution
* with the invariant that every node's weight is proportional to its relative weight
* in the weights map. For example, in a case where there were three nodes and they
* had weights of 0, 1, and 3 respectively, the distribution would be [0, 0.25, 0.75].
*
* If a node address is not present in the weight map, its weight is assumed to be 0.
* If any weight is negative or non-finite, an error will be thrown.
* If the sum of all weights is 0, then a uniform distribution will be returned.
* If the weight map assigned weight to nodes which are not in the node order, an error
* will be thrown.
*/
export function weightedDistribution(
nodeOrder: $ReadOnlyArray<NodeAddressT>,
weights: Map<NodeAddressT, number>
): Distribution {
let totalWeight = 0;
for (const [address, weight] of weights.entries()) {
if (weight < 0 || !isFinite(weight)) {
throw new Error(
`Invalid weight ${weight} associated with address ${NodeAddress.toString(
address
)}`
);
}
totalWeight += weight;
}
if (totalWeight === 0) {
return uniformDistribution(nodeOrder.length);
}
let numEncounteredWeights = 0;
const distribution = new Float64Array(nodeOrder.length);
for (let i = 0; i < distribution.length; i++) {
const weight = weights.get(nodeOrder[i]);
if (weight != null) {
numEncounteredWeights++;
distribution[i] = weight / totalWeight;
}
}
if (numEncounteredWeights !== weights.size) {
throw new Error("weights included nodes not present in the nodeOrder");
}
return distribution;
}
export type NodeDistribution = Map<NodeAddressT, Probability>; export type NodeDistribution = Map<NodeAddressT, Probability>;
export type NodeToConnections = Map<NodeAddressT, $ReadOnlyArray<Connection>>; export type NodeToConnections = Map<NodeAddressT, $ReadOnlyArray<Connection>>;

View File

@ -10,6 +10,7 @@ import {
normalize, normalize,
normalizeNeighbors, normalizeNeighbors,
permute, permute,
weightedDistribution,
} from "./graphToMarkovChain"; } from "./graphToMarkovChain";
import * as MapUtil from "../../util/map"; import * as MapUtil from "../../util/map";
@ -288,4 +289,72 @@ describe("core/attribution/graphToMarkovChain", () => {
); );
}); });
}); });
describe("weightedDistribution", () => {
  // Builds a single-part node address from `name`.
  const addr = (name) => NodeAddress.fromParts([name]);
  const a = addr("a");
  const b = addr("b");
  const c = addr("c");
  const d = addr("d");

  // Returns a fresh node-order array on every call, so tests never share one.
  const nodeOrder = () => [a, b, c, d];
  // The expected result whenever all four nodes end up equally weighted.
  const uniform = () => new Float64Array([0.25, 0.25, 0.25, 0.25]);
  // Runs weightedDistribution against the shared four-node order.
  const distributionFor = (weights) =>
    weightedDistribution(nodeOrder(), weights);

  it("gives a uniform distribution for an empty map", () => {
    expect(distributionFor(new Map())).toEqual(uniform());
  });

  it("gives a uniform distribution for a map with 0 weight", () => {
    expect(distributionFor(new Map([[a, 0]]))).toEqual(uniform());
  });

  it("can put all weight on one node", () => {
    const result = distributionFor(new Map([[b, 0.1]]));
    expect(result).toEqual(new Float64Array([0, 1, 0, 0]));
  });

  it("can split weight unequally", () => {
    const result = distributionFor(new Map([[b, 1], [c, 3]]));
    expect(result).toEqual(new Float64Array([0, 0.25, 0.75, 0]));
  });

  it("can create a uniform distribution if all weights are equal", () => {
    const allOnes = new Map([[a, 1], [b, 1], [c, 1], [d, 1]]);
    expect(distributionFor(allOnes)).toEqual(uniform());
  });

  describe("errors if", () => {
    // Asserts that building a distribution from `weights` throws an error
    // whose message matches `message`.
    const expectFailure = (weights, message) => {
      expect(() => distributionFor(weights)).toThrowError(message);
    };

    it("has a weighted node that is not in the order", () => {
      const z = addr("z");
      expectFailure(
        new Map([[z, 1]]),
        "weights included nodes not present in the nodeOrder"
      );
    });

    it("has a node with negative weight", () => {
      expectFailure(new Map([[a, -1]]), "Invalid weight -1");
    });

    it("has a node with NaN weight", () => {
      expectFailure(new Map([[a, NaN]]), "Invalid weight NaN");
    });

    it("has a node with infinite weight", () => {
      expectFailure(new Map([[a, Infinity]]), "Invalid weight Infinity");
    });
  });
});
}); });

View File

@ -20,6 +20,7 @@ import {
createConnections, createConnections,
createOrderedSparseMarkovChain, createOrderedSparseMarkovChain,
type EdgeWeight, type EdgeWeight,
weightedDistribution,
} from "./attribution/graphToMarkovChain"; } from "./attribution/graphToMarkovChain";
import { import {
findStationaryDistribution, findStationaryDistribution,
@ -75,10 +76,30 @@ export type PagerankOptions = {|
// Maximum number of iterations before we give up on PageRank Convergence // Maximum number of iterations before we give up on PageRank Convergence
// Defaults to DEFAULT_MAX_ITERATIONS if not provided. // Defaults to DEFAULT_MAX_ITERATIONS if not provided.
+maxIterations?: number, +maxIterations?: number,
// PageRank will stop running once the diff between the previous iteration // PageRank will stop running once the diff between the previous iteration
// and the latest is less than this threshold. // and the latest is less than this threshold.
// Defaults to DEFAULT_CONVERGENCE_THRESHOLD if not provided. // Defaults to DEFAULT_CONVERGENCE_THRESHOLD if not provided.
+convergenceThreshold?: number, +convergenceThreshold?: number,
// Specifies a seed vector for PageRank "teleportation".
// At every step, some proportion `alpha` of the weight will
// teleport to the seed.
//
// The seed is specified as a map from node addresses to weights.
// The resultant seed will be a proper distribution over all the graph's available
// nodes, with each node's weight proportional to its weight in the seed. In the case
// that the total weight in the seed is 0 (e.g. an empty map was passed), then the
// seed vector will be a uniform distribution.
//
// Specifying any negative, NaN, or infinite weights is an error.
// Specifying weights for nodes that are not in the graph is also an error.
+seed?: Map<NodeAddressT, number>,
// Specifies the probability with which score 'teleports' to the seed vector.
// If alpha=0, then the teleportation never happens. If alpha=1, then PageRank
// always converges to precisely the seed vector. Defaults to DEFAULT_ALPHA.
+alpha?: number,
|}; |};
export type PagerankConvergenceReport = {| export type PagerankConvergenceReport = {|
@ -92,11 +113,17 @@ export type PagerankConvergenceReport = {|
export const DEFAULT_SYNTHETIC_LOOP_WEIGHT = 1e-3; export const DEFAULT_SYNTHETIC_LOOP_WEIGHT = 1e-3;
export const DEFAULT_MAX_ITERATIONS = 255; export const DEFAULT_MAX_ITERATIONS = 255;
export const DEFAULT_CONVERGENCE_THRESHOLD = 1e-7; export const DEFAULT_CONVERGENCE_THRESHOLD = 1e-7;
// TODO(@decentralion): Change default alpha to be a small non-zero value
// once we choose an appropriate value.
// Teleportation probability that defaultOptions() supplies for `alpha`.
export const DEFAULT_ALPHA = 0;
// Factory for the seed that defaultOptions() supplies: every call returns a
// new, empty Map rather than a shared instance.
export const DEFAULT_SEED: () => Map<NodeAddressT, number> = () => new Map();
function defaultOptions(): PagerankOptions { function defaultOptions(): PagerankOptions {
return { return {
maxIterations: DEFAULT_MAX_ITERATIONS, maxIterations: DEFAULT_MAX_ITERATIONS,
convergenceThreshold: DEFAULT_CONVERGENCE_THRESHOLD, convergenceThreshold: DEFAULT_CONVERGENCE_THRESHOLD,
alpha: DEFAULT_ALPHA,
seed: DEFAULT_SEED(),
}; };
} }
@ -441,8 +468,8 @@ export class PagerankGraph {
const osmc = createOrderedSparseMarkovChain(connections); const osmc = createOrderedSparseMarkovChain(connections);
const params: PagerankParams = { const params: PagerankParams = {
chain: osmc.chain, chain: osmc.chain,
alpha: 0, alpha: fullOptions.alpha,
seed: uniformDistribution(osmc.chain.length), seed: weightedDistribution(osmc.nodeOrder, fullOptions.seed),
pi0: uniformDistribution(osmc.chain.length), pi0: uniformDistribution(osmc.chain.length),
}; };
const coreOptions: CorePagerankOptions = { const coreOptions: CorePagerankOptions = {

View File

@ -14,6 +14,8 @@ import {
Direction, Direction,
DEFAULT_MAX_ITERATIONS, DEFAULT_MAX_ITERATIONS,
DEFAULT_CONVERGENCE_THRESHOLD, DEFAULT_CONVERGENCE_THRESHOLD,
DEFAULT_ALPHA,
DEFAULT_SEED,
} from "./pagerankGraph"; } from "./pagerankGraph";
import {advancedGraph} from "./graphTestUtil"; import {advancedGraph} from "./graphTestUtil";
import * as NullUtil from "../util/null"; import * as NullUtil from "../util/null";
@ -500,11 +502,38 @@ describe("core/pagerankGraph", () => {
pg3.runPagerank({ pg3.runPagerank({
maxIterations: DEFAULT_MAX_ITERATIONS, maxIterations: DEFAULT_MAX_ITERATIONS,
convergenceThreshold: DEFAULT_CONVERGENCE_THRESHOLD, convergenceThreshold: DEFAULT_CONVERGENCE_THRESHOLD,
alpha: DEFAULT_ALPHA,
seed: DEFAULT_SEED(),
}); });
expect(pg1.equals(pg2)).toBe(true); expect(pg1.equals(pg2)).toBe(true);
expect(pg1.equals(pg3)).toBe(true); expect(pg1.equals(pg3)).toBe(true);
}); });
describe("alpha and seed parameters", () => {
  // Seeded PageRank and the weights-to-seed-distribution conversion are each
  // thoroughly unit-tested on their own. These tests therefore only sanity
  // check, via easily verified properties, that `runPagerank` actually
  // consumes the two parameters.
  it("seed is irrelevant if alpha is 0", async () => {
    const first = examplePagerankGraph();
    const second = examplePagerankGraph();
    const {nodes} = advancedGraph();
    // Two seeds that put all of their weight on different nodes.
    const seedOnSrc = new Map([[nodes.src(), 1]]);
    const seedOnDst = new Map([[nodes.dst(), 1]]);

    await first.runPagerank({seed: seedOnSrc, alpha: 0});
    await second.runPagerank({seed: seedOnDst, alpha: 0});

    expect(first.equals(second)).toBe(true);
  });

  it("seed is returned directly if alpha is 1", async () => {
    const graph = examplePagerankGraph();
    const getSrc = advancedGraph().nodes.src;
    // All of the seed weight sits on the single `src` node.
    const seed = new Map([[getSrc(), 1]]);

    await graph.runPagerank({seed, alpha: 1});

    const {score} = NullUtil.get(graph.node(getSrc()));
    expect(score).toBe(1);
  });
});
it("promise rejects if the graph was modified", async () => { it("promise rejects if the graph was modified", async () => {
const pg = examplePagerankGraph(); const pg = examplePagerankGraph();
pg.graph().addNode(NodeAddress.empty); pg.graph().addNode(NodeAddress.empty);