Add support for seed vectors to PagerankGraph (#1135)
This commit modifies `PagerankGraph.runPagerank` so that the user can
provide an alpha and seed vector. The seed vector is specified via a map
of weights, which will be normalized into a probability distribution
over all the nodes in the graph. In the event that the map is empty (or
the total weight is otherwise 0), a uniform distribution is created.
To effect this change, a helper function called `weightedDistribution`
has been added (and thoroughly tested) in the `graphToMarkovChain`
module. Then, that function is used in `pagerankGraph.runPagerank`
(along with light testing).
Currently, the default alpha is set to 0, to ensure consistency with the
legacy pagerank implementation in `analysis/pagerank`. Once that has
been replaced with `PagerankGraph`, we can consider changing the default
alpha to non-zero (thus removing the need for synthetic self-loops).
I took a different approach in the [odyssey-hackathon repo][commit].
The previous approach was a much more complicated (and fairly redundant)
API, that allowed specifying "NO_SEED", "UNIFORM_SEED", "SELECTED_SEED",
and "SPECIFIED_SEED". I'm much happier with this API and implementation.
[commit]: ed07861073
Test plan: Unit tests included; run `yarn test`.
This commit is contained in:
parent
e7bc025379
commit
79017a477b
|
@ -1,7 +1,11 @@
|
||||||
// @flow
|
// @flow
|
||||||
|
|
||||||
import {type Edge, type Graph, type NodeAddressT} from "../graph";
|
import {type Edge, type Graph, type NodeAddressT, NodeAddress} from "../graph";
|
||||||
import type {Distribution, SparseMarkovChain} from "./markovChain";
|
import {
|
||||||
|
type Distribution,
|
||||||
|
type SparseMarkovChain,
|
||||||
|
uniformDistribution,
|
||||||
|
} from "./markovChain";
|
||||||
import * as MapUtil from "../../util/map";
|
import * as MapUtil from "../../util/map";
|
||||||
import * as NullUtil from "../../util/null";
|
import * as NullUtil from "../../util/null";
|
||||||
|
|
||||||
|
@ -31,6 +35,54 @@ export function adjacencySource(target: NodeAddressT, adjacency: Adjacency) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a Distribution using provided node weights.
|
||||||
|
*
|
||||||
|
* weightedDistribution takes in a node order (as a read only array of NodeAddressT),
|
||||||
|
* and a map providing weights for a subset of those nodes. It returns a Distribution
|
||||||
|
* with the invariant that every node's weight is proportional to its relative weight
|
||||||
|
* in the weights map. For example, in a case where there were three nodes and they
|
||||||
|
* had weights of 0, 1, and 3 respectively, the distribution would be [0, 0.25, 0.75].
|
||||||
|
*
|
||||||
|
* If a node address is not present in the weight map, its weight is assumed to be 0.
|
||||||
|
* If any weight is negative or non-finite, an error will be thrown.
|
||||||
|
* If the sum of all weights is 0, then a uniform distribution will be returned.
|
||||||
|
* If the weight map assigned weight to nodes which are not in the node order, an error
|
||||||
|
* will be thrown.
|
||||||
|
*/
|
||||||
|
export function weightedDistribution(
|
||||||
|
nodeOrder: $ReadOnlyArray<NodeAddressT>,
|
||||||
|
weights: Map<NodeAddressT, number>
|
||||||
|
): Distribution {
|
||||||
|
let totalWeight = 0;
|
||||||
|
for (const [address, weight] of weights.entries()) {
|
||||||
|
if (weight < 0 || !isFinite(weight)) {
|
||||||
|
throw new Error(
|
||||||
|
`Invalid weight ${weight} associated with address ${NodeAddress.toString(
|
||||||
|
address
|
||||||
|
)}`
|
||||||
|
);
|
||||||
|
}
|
||||||
|
totalWeight += weight;
|
||||||
|
}
|
||||||
|
if (totalWeight === 0) {
|
||||||
|
return uniformDistribution(nodeOrder.length);
|
||||||
|
}
|
||||||
|
let numEncounteredWeights = 0;
|
||||||
|
const distribution = new Float64Array(nodeOrder.length);
|
||||||
|
for (let i = 0; i < distribution.length; i++) {
|
||||||
|
const weight = weights.get(nodeOrder[i]);
|
||||||
|
if (weight != null) {
|
||||||
|
numEncounteredWeights++;
|
||||||
|
distribution[i] = weight / totalWeight;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (numEncounteredWeights !== weights.size) {
|
||||||
|
throw new Error("weights included nodes not present in the nodeOrder");
|
||||||
|
}
|
||||||
|
return distribution;
|
||||||
|
}
|
||||||
|
|
||||||
export type NodeDistribution = Map<NodeAddressT, Probability>;
|
export type NodeDistribution = Map<NodeAddressT, Probability>;
|
||||||
|
|
||||||
export type NodeToConnections = Map<NodeAddressT, $ReadOnlyArray<Connection>>;
|
export type NodeToConnections = Map<NodeAddressT, $ReadOnlyArray<Connection>>;
|
||||||
|
|
|
@ -10,6 +10,7 @@ import {
|
||||||
normalize,
|
normalize,
|
||||||
normalizeNeighbors,
|
normalizeNeighbors,
|
||||||
permute,
|
permute,
|
||||||
|
weightedDistribution,
|
||||||
} from "./graphToMarkovChain";
|
} from "./graphToMarkovChain";
|
||||||
import * as MapUtil from "../../util/map";
|
import * as MapUtil from "../../util/map";
|
||||||
|
|
||||||
|
@ -288,4 +289,72 @@ describe("core/attribution/graphToMarkovChain", () => {
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
describe("weightedDistribution", () => {
|
||||||
|
const a = NodeAddress.fromParts(["a"]);
|
||||||
|
const b = NodeAddress.fromParts(["b"]);
|
||||||
|
const c = NodeAddress.fromParts(["c"]);
|
||||||
|
const d = NodeAddress.fromParts(["d"]);
|
||||||
|
const order = () => [a, b, c, d];
|
||||||
|
it("gives a uniform distribution for an empty map", () => {
|
||||||
|
expect(weightedDistribution(order(), new Map())).toEqual(
|
||||||
|
new Float64Array([0.25, 0.25, 0.25, 0.25])
|
||||||
|
);
|
||||||
|
});
|
||||||
|
it("gives a uniform distribution for a map with 0 weight", () => {
|
||||||
|
const map = new Map().set(a, 0);
|
||||||
|
expect(weightedDistribution(order(), map)).toEqual(
|
||||||
|
new Float64Array([0.25, 0.25, 0.25, 0.25])
|
||||||
|
);
|
||||||
|
});
|
||||||
|
it("can put all weight on one node", () => {
|
||||||
|
const map = new Map().set(b, 0.1);
|
||||||
|
expect(weightedDistribution(order(), map)).toEqual(
|
||||||
|
new Float64Array([0, 1, 0, 0])
|
||||||
|
);
|
||||||
|
});
|
||||||
|
it("can split weight unequally", () => {
|
||||||
|
const map = new Map().set(b, 1).set(c, 3);
|
||||||
|
expect(weightedDistribution(order(), map)).toEqual(
|
||||||
|
new Float64Array([0, 0.25, 0.75, 0])
|
||||||
|
);
|
||||||
|
});
|
||||||
|
it("can create a uniform distribution if all weights are equal", () => {
|
||||||
|
const map = new Map()
|
||||||
|
.set(a, 1)
|
||||||
|
.set(b, 1)
|
||||||
|
.set(c, 1)
|
||||||
|
.set(d, 1);
|
||||||
|
expect(weightedDistribution(order(), map)).toEqual(
|
||||||
|
new Float64Array([0.25, 0.25, 0.25, 0.25])
|
||||||
|
);
|
||||||
|
});
|
||||||
|
describe("errors if", () => {
|
||||||
|
it("has a weighted node that is not in the order", () => {
|
||||||
|
const z = NodeAddress.fromParts(["z"]);
|
||||||
|
const map = new Map().set(z, 1);
|
||||||
|
expect(() => weightedDistribution(order(), map)).toThrowError(
|
||||||
|
"weights included nodes not present in the nodeOrder"
|
||||||
|
);
|
||||||
|
});
|
||||||
|
it("has a node with negative weight", () => {
|
||||||
|
const map = new Map().set(a, -1);
|
||||||
|
expect(() => weightedDistribution(order(), map)).toThrowError(
|
||||||
|
"Invalid weight -1"
|
||||||
|
);
|
||||||
|
});
|
||||||
|
it("has a node with NaN weight", () => {
|
||||||
|
const map = new Map().set(a, NaN);
|
||||||
|
expect(() => weightedDistribution(order(), map)).toThrowError(
|
||||||
|
"Invalid weight NaN"
|
||||||
|
);
|
||||||
|
});
|
||||||
|
it("has a node with infinite weight", () => {
|
||||||
|
const map = new Map().set(a, Infinity);
|
||||||
|
expect(() => weightedDistribution(order(), map)).toThrowError(
|
||||||
|
"Invalid weight Infinity"
|
||||||
|
);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
|
@ -20,6 +20,7 @@ import {
|
||||||
createConnections,
|
createConnections,
|
||||||
createOrderedSparseMarkovChain,
|
createOrderedSparseMarkovChain,
|
||||||
type EdgeWeight,
|
type EdgeWeight,
|
||||||
|
weightedDistribution,
|
||||||
} from "./attribution/graphToMarkovChain";
|
} from "./attribution/graphToMarkovChain";
|
||||||
import {
|
import {
|
||||||
findStationaryDistribution,
|
findStationaryDistribution,
|
||||||
|
@ -75,10 +76,30 @@ export type PagerankOptions = {|
|
||||||
// Maximum number of iterations before we give up on PageRank Convergence
|
// Maximum number of iterations before we give up on PageRank Convergence
|
||||||
// Defaults to DEFAULT_MAX_ITERATIONS if not provided.
|
// Defaults to DEFAULT_MAX_ITERATIONS if not provided.
|
||||||
+maxIterations?: number,
|
+maxIterations?: number,
|
||||||
|
|
||||||
// PageRank will stop running once the diff between the previous iteration
|
// PageRank will stop running once the diff between the previous iteration
|
||||||
// and the latest is less than this threshold.
|
// and the latest is less than this threshold.
|
||||||
// Defaults to DEFAULT_CONVERGENCE_THRESHOLD if not provided.
|
// Defaults to DEFAULT_CONVERGENCE_THRESHOLD if not provided.
|
||||||
+convergenceThreshold?: number,
|
+convergenceThreshold?: number,
|
||||||
|
|
||||||
|
// Specifies a seed vector for PageRank "teleportation".
|
||||||
|
// At every step, some proportion `alpha` of the weight will
|
||||||
|
// teleport to the seed.
|
||||||
|
//
|
||||||
|
// The seed is specified as a map from node addresses to weights.
|
||||||
|
// The resultant seed will be a proper distribution over all the graph's available
|
||||||
|
// nodes, with each node's weight proportional to its weight in the seed. In the case
|
||||||
|
// that the total weight in the seed is 0 (e.g. an empty map was passed), then the
|
||||||
|
// seed vector will be a uniform distribution.
|
||||||
|
//
|
||||||
|
// Specifying any negative, NaN, or infinite weights is an error.
|
||||||
|
// Specifying weights for nodes that are not in the graph is also an error.
|
||||||
|
+seed?: Map<NodeAddressT, number>,
|
||||||
|
|
||||||
|
// Specifies the probability with which score 'teleports' to the seed vector.
|
||||||
|
// If alpha=0, then the teleportation never happens. If alpha=1, then PageRank
|
||||||
|
// always converges to precisely the seed vector. Defaults to DEFAULT_ALPHA.
|
||||||
|
+alpha?: number,
|
||||||
|};
|
|};
|
||||||
|
|
||||||
export type PagerankConvergenceReport = {|
|
export type PagerankConvergenceReport = {|
|
||||||
|
@ -92,11 +113,17 @@ export type PagerankConvergenceReport = {|
|
||||||
export const DEFAULT_SYNTHETIC_LOOP_WEIGHT = 1e-3;
|
export const DEFAULT_SYNTHETIC_LOOP_WEIGHT = 1e-3;
|
||||||
export const DEFAULT_MAX_ITERATIONS = 255;
|
export const DEFAULT_MAX_ITERATIONS = 255;
|
||||||
export const DEFAULT_CONVERGENCE_THRESHOLD = 1e-7;
|
export const DEFAULT_CONVERGENCE_THRESHOLD = 1e-7;
|
||||||
|
// TODO(@decentralion): Change default alpha to be a small non-zero value
|
||||||
|
// once we choose an appropriate value.
|
||||||
|
export const DEFAULT_ALPHA = 0;
|
||||||
|
export const DEFAULT_SEED: () => Map<NodeAddressT, number> = () => new Map();
|
||||||
|
|
||||||
function defaultOptions(): PagerankOptions {
|
function defaultOptions(): PagerankOptions {
|
||||||
return {
|
return {
|
||||||
maxIterations: DEFAULT_MAX_ITERATIONS,
|
maxIterations: DEFAULT_MAX_ITERATIONS,
|
||||||
convergenceThreshold: DEFAULT_CONVERGENCE_THRESHOLD,
|
convergenceThreshold: DEFAULT_CONVERGENCE_THRESHOLD,
|
||||||
|
alpha: DEFAULT_ALPHA,
|
||||||
|
seed: DEFAULT_SEED(),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -441,8 +468,8 @@ export class PagerankGraph {
|
||||||
const osmc = createOrderedSparseMarkovChain(connections);
|
const osmc = createOrderedSparseMarkovChain(connections);
|
||||||
const params: PagerankParams = {
|
const params: PagerankParams = {
|
||||||
chain: osmc.chain,
|
chain: osmc.chain,
|
||||||
alpha: 0,
|
alpha: fullOptions.alpha,
|
||||||
seed: uniformDistribution(osmc.chain.length),
|
seed: weightedDistribution(osmc.nodeOrder, fullOptions.seed),
|
||||||
pi0: uniformDistribution(osmc.chain.length),
|
pi0: uniformDistribution(osmc.chain.length),
|
||||||
};
|
};
|
||||||
const coreOptions: CorePagerankOptions = {
|
const coreOptions: CorePagerankOptions = {
|
||||||
|
|
|
@ -14,6 +14,8 @@ import {
|
||||||
Direction,
|
Direction,
|
||||||
DEFAULT_MAX_ITERATIONS,
|
DEFAULT_MAX_ITERATIONS,
|
||||||
DEFAULT_CONVERGENCE_THRESHOLD,
|
DEFAULT_CONVERGENCE_THRESHOLD,
|
||||||
|
DEFAULT_ALPHA,
|
||||||
|
DEFAULT_SEED,
|
||||||
} from "./pagerankGraph";
|
} from "./pagerankGraph";
|
||||||
import {advancedGraph} from "./graphTestUtil";
|
import {advancedGraph} from "./graphTestUtil";
|
||||||
import * as NullUtil from "../util/null";
|
import * as NullUtil from "../util/null";
|
||||||
|
@ -500,11 +502,38 @@ describe("core/pagerankGraph", () => {
|
||||||
pg3.runPagerank({
|
pg3.runPagerank({
|
||||||
maxIterations: DEFAULT_MAX_ITERATIONS,
|
maxIterations: DEFAULT_MAX_ITERATIONS,
|
||||||
convergenceThreshold: DEFAULT_CONVERGENCE_THRESHOLD,
|
convergenceThreshold: DEFAULT_CONVERGENCE_THRESHOLD,
|
||||||
|
alpha: DEFAULT_ALPHA,
|
||||||
|
seed: DEFAULT_SEED(),
|
||||||
});
|
});
|
||||||
expect(pg1.equals(pg2)).toBe(true);
|
expect(pg1.equals(pg2)).toBe(true);
|
||||||
expect(pg1.equals(pg3)).toBe(true);
|
expect(pg1.equals(pg3)).toBe(true);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
describe("alpha and seed parameters", () => {
|
||||||
|
// The logic for seeded PageRank (and for generating the seed distribution via weights)
|
||||||
|
// are both thoroughly unit-tested. Therefore, these tests only sanity check that the
|
||||||
|
// parameters are getting consumed properly based on easily tested properties.
|
||||||
|
it("seed is irrelevant if alpha is 0", async () => {
|
||||||
|
const pg1 = examplePagerankGraph();
|
||||||
|
const pg2 = examplePagerankGraph();
|
||||||
|
const {nodes} = advancedGraph();
|
||||||
|
const seed1 = new Map().set(nodes.src(), 1);
|
||||||
|
const seed2 = new Map().set(nodes.dst(), 1);
|
||||||
|
await pg1.runPagerank({seed: seed1, alpha: 0});
|
||||||
|
await pg2.runPagerank({seed: seed2, alpha: 0});
|
||||||
|
expect(pg1.equals(pg2)).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("seed is returned directly if alpha is 1", async () => {
|
||||||
|
const pg = examplePagerankGraph();
|
||||||
|
const src = advancedGraph().nodes.src;
|
||||||
|
const seed = new Map().set(src(), 1);
|
||||||
|
await pg.runPagerank({seed, alpha: 1});
|
||||||
|
const score = NullUtil.get(pg.node(src())).score;
|
||||||
|
expect(score).toBe(1);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
it("promise rejects if the graph was modified", async () => {
|
it("promise rejects if the graph was modified", async () => {
|
||||||
const pg = examplePagerankGraph();
|
const pg = examplePagerankGraph();
|
||||||
pg.graph().addNode(NodeAddress.empty);
|
pg.graph().addNode(NodeAddress.empty);
|
||||||
|
|
Loading…
Reference in New Issue