Add support for seed vectors to PagerankGraph (#1135)
This commit modifies `PagerankGraph.runPagerank` so that the user can
provide an alpha and a seed vector. The seed vector is specified via a map
of weights, which is normalized into a probability distribution over all
the nodes in the graph. If the map is empty (or the total weight is
otherwise 0), a uniform distribution is used instead.
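As a usage sketch (the graph setup and the node variables here are
hypothetical; the option names match this commit's API):

    const pg = new PagerankGraph(graph, edgeEvaluator);
    // Weights of 1 and 3 normalize to probabilities 0.25 and 0.75.
    await pg.runPagerank({
      alpha: 0.1,
      seed: new Map().set(nodeA, 1).set(nodeB, 3),
    });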
To effect this change, a helper function called `weightedDistribution`
has been added (and thoroughly tested) in the `graphToMarkovChain`
module. That function is then used in `PagerankGraph.runPagerank`
(with lighter testing at that level).
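For a concrete sense of `weightedDistribution`'s behavior (the node
addresses are illustrative; the results are taken from the unit tests
in the diff below):

    const order = [a, b, c, d];
    weightedDistribution(order, new Map().set(b, 1).set(c, 3));
    // => Float64Array [0, 0.25, 0.75, 0]
    weightedDistribution(order, new Map());
    // => Float64Array [0.25, 0.25, 0.25, 0.25] (uniform fallback)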
Currently, the default alpha is set to 0, to ensure consistency with the
legacy pagerank implementation in `analysis/pagerank`. Once that has
been replaced with `PagerankGraph`, we can consider changing the default
alpha to a non-zero value (thus removing the need for synthetic self-loops).
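For context, the update rule that alpha plugs into is the standard
teleporting PageRank iteration (formulation mine, not copied from this
commit):

    pi' = (1 - alpha) * (pi * M) + alpha * seed

With alpha = 0 the seed term vanishes, matching the legacy behavior;
with alpha > 0, every node receives some probability mass from the seed
on each step, which is what would make the synthetic self-loops
unnecessary.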
This takes a different approach than the one in the
[odyssey-hackathon repo][commit]. That previous approach had a much more
complicated (and fairly redundant) API, which allowed specifying
"NO_SEED", "UNIFORM_SEED", "SELECTED_SEED", and "SPECIFIED_SEED".
I'm much happier with this API and implementation.
[commit]: ed07861073
Test plan: Unit tests included; run `yarn test`.
src/core/attribution/graphToMarkovChain.js
@@ -1,7 +1,11 @@
 // @flow

-import {type Edge, type Graph, type NodeAddressT} from "../graph";
-import type {Distribution, SparseMarkovChain} from "./markovChain";
+import {type Edge, type Graph, type NodeAddressT, NodeAddress} from "../graph";
+import {
+  type Distribution,
+  type SparseMarkovChain,
+  uniformDistribution,
+} from "./markovChain";
 import * as MapUtil from "../../util/map";
 import * as NullUtil from "../../util/null";
@@ -31,6 +35,54 @@ export function adjacencySource(target: NodeAddressT, adjacency: Adjacency) {
   }
 }

+/**
+ * Create a Distribution using provided node weights.
+ *
+ * weightedDistribution takes in a node order (as a read only array of NodeAddressT),
+ * and a map providing weights for a subset of those nodes. It returns a Distribution
+ * with the invariant that every node's weight is proportional to its relative weight
+ * in the weights map. For example, in a case where there were three nodes and they
+ * had weights of 0, 1, and 3 respectively, the distribution would be [0, 0.25, 0.75].
+ *
+ * If a node address is not present in the weight map, its weight is assumed to be 0.
+ * If any weight is negative or non-finite, an error will be thrown.
+ * If the sum of all weights is 0, then a uniform distribution will be returned.
+ * If the weight map assigned weight to nodes which are not in the node order, an error
+ * will be thrown.
+ */
+export function weightedDistribution(
+  nodeOrder: $ReadOnlyArray<NodeAddressT>,
+  weights: Map<NodeAddressT, number>
+): Distribution {
+  let totalWeight = 0;
+  for (const [address, weight] of weights.entries()) {
+    if (weight < 0 || !isFinite(weight)) {
+      throw new Error(
+        `Invalid weight ${weight} associated with address ${NodeAddress.toString(
+          address
+        )}`
+      );
+    }
+    totalWeight += weight;
+  }
+  if (totalWeight === 0) {
+    return uniformDistribution(nodeOrder.length);
+  }
+  let numEncounteredWeights = 0;
+  const distribution = new Float64Array(nodeOrder.length);
+  for (let i = 0; i < distribution.length; i++) {
+    const weight = weights.get(nodeOrder[i]);
+    if (weight != null) {
+      numEncounteredWeights++;
+      distribution[i] = weight / totalWeight;
+    }
+  }
+  if (numEncounteredWeights !== weights.size) {
+    throw new Error("weights included nodes not present in the nodeOrder");
+  }
+  return distribution;
+}
+
 export type NodeDistribution = Map<NodeAddressT, Probability>;

 export type NodeToConnections = Map<NodeAddressT, $ReadOnlyArray<Connection>>;
src/core/attribution/graphToMarkovChain.test.js
@@ -10,6 +10,7 @@ import {
   normalize,
   normalizeNeighbors,
   permute,
+  weightedDistribution,
 } from "./graphToMarkovChain";
 import * as MapUtil from "../../util/map";
@@ -288,4 +289,72 @@ describe("core/attribution/graphToMarkovChain", () => {
       );
     });
   });
+
+  describe("weightedDistribution", () => {
+    const a = NodeAddress.fromParts(["a"]);
+    const b = NodeAddress.fromParts(["b"]);
+    const c = NodeAddress.fromParts(["c"]);
+    const d = NodeAddress.fromParts(["d"]);
+    const order = () => [a, b, c, d];
+    it("gives a uniform distribution for an empty map", () => {
+      expect(weightedDistribution(order(), new Map())).toEqual(
+        new Float64Array([0.25, 0.25, 0.25, 0.25])
+      );
+    });
+    it("gives a uniform distribution for a map with 0 weight", () => {
+      const map = new Map().set(a, 0);
+      expect(weightedDistribution(order(), map)).toEqual(
+        new Float64Array([0.25, 0.25, 0.25, 0.25])
+      );
+    });
+    it("can put all weight on one node", () => {
+      const map = new Map().set(b, 0.1);
+      expect(weightedDistribution(order(), map)).toEqual(
+        new Float64Array([0, 1, 0, 0])
+      );
+    });
+    it("can split weight unequally", () => {
+      const map = new Map().set(b, 1).set(c, 3);
+      expect(weightedDistribution(order(), map)).toEqual(
+        new Float64Array([0, 0.25, 0.75, 0])
+      );
+    });
+    it("can create a uniform distribution if all weights are equal", () => {
+      const map = new Map()
+        .set(a, 1)
+        .set(b, 1)
+        .set(c, 1)
+        .set(d, 1);
+      expect(weightedDistribution(order(), map)).toEqual(
+        new Float64Array([0.25, 0.25, 0.25, 0.25])
+      );
+    });
+    describe("errors if", () => {
+      it("has a weighted node that is not in the order", () => {
+        const z = NodeAddress.fromParts(["z"]);
+        const map = new Map().set(z, 1);
+        expect(() => weightedDistribution(order(), map)).toThrowError(
+          "weights included nodes not present in the nodeOrder"
+        );
+      });
+      it("has a node with negative weight", () => {
+        const map = new Map().set(a, -1);
+        expect(() => weightedDistribution(order(), map)).toThrowError(
+          "Invalid weight -1"
+        );
+      });
+      it("has a node with NaN weight", () => {
+        const map = new Map().set(a, NaN);
+        expect(() => weightedDistribution(order(), map)).toThrowError(
+          "Invalid weight NaN"
+        );
+      });
+      it("has a node with infinite weight", () => {
+        const map = new Map().set(a, Infinity);
+        expect(() => weightedDistribution(order(), map)).toThrowError(
+          "Invalid weight Infinity"
+        );
+      });
+    });
+  });
 });
src/core/pagerankGraph.js
@@ -20,6 +20,7 @@ import {
   createConnections,
   createOrderedSparseMarkovChain,
   type EdgeWeight,
+  weightedDistribution,
 } from "./attribution/graphToMarkovChain";
 import {
   findStationaryDistribution,
@@ -75,10 +76,30 @@ export type PagerankOptions = {|
   // Maximum number of iterations before we give up on PageRank Convergence
   // Defaults to DEFAULT_MAX_ITERATIONS if not provided.
   +maxIterations?: number,

   // PageRank will stop running once the diff between the previous iteration
   // and the latest is less than this threshold.
   // Defaults to DEFAULT_CONVERGENCE_THRESHOLD if not provided.
   +convergenceThreshold?: number,
+
+  // Specifies a seed vector for PageRank "teleportation".
+  // At every step, some proportion `alpha` of the weight will
+  // teleport to the seed.
+  //
+  // The seed is specified as a map from node addresses to weights.
+  // The resultant seed will be a proper distribution over all the graph's available
+  // nodes, with each node's weight proportional to its weight in the seed. In the case
+  // that the total weight in the seed is 0 (e.g. an empty map was passed), then the
+  // seed vector will be a uniform distribution.
+  //
+  // Specifying any negative, NaN, or infinite weights is an error.
+  // Specifying weights for nodes that are not in the graph is also an error.
+  +seed?: Map<NodeAddressT, number>,
+
+  // Specifies the probability with which score 'teleports' to the seed vector.
+  // If alpha=0, then the teleportation never happens. If alpha=1, then PageRank
+  // always converges to precisely the seed vector. Defaults to DEFAULT_ALPHA.
+  +alpha?: number,
 |};

 export type PagerankConvergenceReport = {|
@@ -92,11 +113,17 @@ export type PagerankConvergenceReport = {|
 export const DEFAULT_SYNTHETIC_LOOP_WEIGHT = 1e-3;
 export const DEFAULT_MAX_ITERATIONS = 255;
 export const DEFAULT_CONVERGENCE_THRESHOLD = 1e-7;
+// TODO(@decentralion): Change default alpha to be a small non-zero value
+// once we choose an appropriate value.
+export const DEFAULT_ALPHA = 0;
+export const DEFAULT_SEED: () => Map<NodeAddressT, number> = () => new Map();

 function defaultOptions(): PagerankOptions {
   return {
     maxIterations: DEFAULT_MAX_ITERATIONS,
     convergenceThreshold: DEFAULT_CONVERGENCE_THRESHOLD,
+    alpha: DEFAULT_ALPHA,
+    seed: DEFAULT_SEED(),
   };
 }
@@ -441,8 +468,8 @@ export class PagerankGraph {
     const osmc = createOrderedSparseMarkovChain(connections);
     const params: PagerankParams = {
       chain: osmc.chain,
-      alpha: 0,
-      seed: uniformDistribution(osmc.chain.length),
+      alpha: fullOptions.alpha,
+      seed: weightedDistribution(osmc.nodeOrder, fullOptions.seed),
       pi0: uniformDistribution(osmc.chain.length),
     };
     const coreOptions: CorePagerankOptions = {
src/core/pagerankGraph.test.js
@@ -14,6 +14,8 @@ import {
   Direction,
   DEFAULT_MAX_ITERATIONS,
   DEFAULT_CONVERGENCE_THRESHOLD,
+  DEFAULT_ALPHA,
+  DEFAULT_SEED,
 } from "./pagerankGraph";
 import {advancedGraph} from "./graphTestUtil";
 import * as NullUtil from "../util/null";
@@ -500,11 +502,38 @@ describe("core/pagerankGraph", () => {
       pg3.runPagerank({
         maxIterations: DEFAULT_MAX_ITERATIONS,
         convergenceThreshold: DEFAULT_CONVERGENCE_THRESHOLD,
+        alpha: DEFAULT_ALPHA,
+        seed: DEFAULT_SEED(),
       });
       expect(pg1.equals(pg2)).toBe(true);
       expect(pg1.equals(pg3)).toBe(true);
     });

+    describe("alpha and seed parameters", () => {
+      // Seeded PageRank (and the generation of the seed distribution from
+      // weights) are both thoroughly unit-tested. These tests only sanity-check
+      // that the parameters are consumed properly, via easily tested properties.
+      it("seed is irrelevant if alpha is 0", async () => {
+        const pg1 = examplePagerankGraph();
+        const pg2 = examplePagerankGraph();
+        const {nodes} = advancedGraph();
+        const seed1 = new Map().set(nodes.src(), 1);
+        const seed2 = new Map().set(nodes.dst(), 1);
+        await pg1.runPagerank({seed: seed1, alpha: 0});
+        await pg2.runPagerank({seed: seed2, alpha: 0});
+        expect(pg1.equals(pg2)).toBe(true);
+      });
+
+      it("seed is returned directly if alpha is 1", async () => {
+        const pg = examplePagerankGraph();
+        const src = advancedGraph().nodes.src;
+        const seed = new Map().set(src(), 1);
+        await pg.runPagerank({seed, alpha: 1});
+        const score = NullUtil.get(pg.node(src())).score;
+        expect(score).toBe(1);
+      });
+    });
+
     it("promise rejects if the graph was modified", async () => {
       const pg = examplePagerankGraph();
       pg.graph().addNode(NodeAddress.empty);