From 402945809824e633ace5df26ac0b2c724cff1cb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dandelion=20Man=C3=A9?= Date: Thu, 13 Jun 2019 23:24:37 +0300 Subject: [PATCH] Factor out distribution modules (#1182) This pulls distribution related code out of `markovChain.js` into the new `distribution.js` module, and from `graphToMarkovChain.js` into `nodeDistribution.js`. Since the `computeDelta` method is now exported, I've added some unit tests. Test plan: `yarn test` passes. --- src/analysis/nodeScore.js | 2 +- src/analysis/pagerank.js | 4 +- .../pagerankNodeDecomposition.test.js | 2 +- src/core/attribution/distribution.js | 35 ++++++++ src/core/attribution/distribution.test.js | 52 +++++++++++ src/core/attribution/graphToMarkovChain.js | 61 +------------ .../attribution/graphToMarkovChain.test.js | 6 +- src/core/attribution/markovChain.js | 32 +------ src/core/attribution/markovChain.test.js | 9 +- src/core/attribution/nodeDistribution.js | 67 ++++++++++++++ src/core/attribution/nodeDistribution.test.js | 89 +++++++++++++++++++ src/core/pagerankGraph.js | 6 +- 12 files changed, 266 insertions(+), 99 deletions(-) create mode 100644 src/core/attribution/distribution.js create mode 100644 src/core/attribution/distribution.test.js create mode 100644 src/core/attribution/nodeDistribution.js create mode 100644 src/core/attribution/nodeDistribution.test.js diff --git a/src/analysis/nodeScore.js b/src/analysis/nodeScore.js index 347e2f7..7642ea2 100644 --- a/src/analysis/nodeScore.js +++ b/src/analysis/nodeScore.js @@ -1,7 +1,7 @@ // @flow import {NodeAddress, type NodeAddressT} from "../core/graph"; -import type {NodeDistribution} from "../core/attribution/graphToMarkovChain"; +import type {NodeDistribution} from "../core/attribution/nodeDistribution"; export type NodeScore = Map; diff --git a/src/analysis/pagerank.js b/src/analysis/pagerank.js index 94c431a..cde7d5c 100644 --- a/src/analysis/pagerank.js +++ b/src/analysis/pagerank.js @@ -23,10 +23,10 @@ import { 
findStationaryDistribution, type PagerankParams, type PagerankOptions as CorePagerankOptions, - uniformDistribution, } from "../core/attribution/markovChain"; +import {uniformDistribution} from "../core/attribution/distribution"; -export type {NodeDistribution} from "../core/attribution/graphToMarkovChain"; +export type {NodeDistribution} from "../core/attribution/nodeDistribution"; export type {PagerankNodeDecomposition} from "./pagerankNodeDecomposition"; export type PagerankOptions = {| +selfLoopWeight?: number, diff --git a/src/analysis/pagerankNodeDecomposition.test.js b/src/analysis/pagerankNodeDecomposition.test.js index c0a30a2..f5feafd 100644 --- a/src/analysis/pagerankNodeDecomposition.test.js +++ b/src/analysis/pagerankNodeDecomposition.test.js @@ -9,8 +9,8 @@ import { import { findStationaryDistribution, type PagerankParams, - uniformDistribution, } from "../core/attribution/markovChain"; +import {uniformDistribution} from "../core/attribution/distribution"; import { decompose, type PagerankNodeDecomposition, diff --git a/src/core/attribution/distribution.js b/src/core/attribution/distribution.js new file mode 100644 index 0000000..394d62a --- /dev/null +++ b/src/core/attribution/distribution.js @@ -0,0 +1,35 @@ +// @flow + +/** + * A distribution over the integers `0` through `n - 1`, where `n` is + * the length of the array. The value at index `i` is the probability of + * `i` in the distribution. The values should sum to 1. + */ +export type Distribution = Float64Array; + +export function uniformDistribution(n: number): Distribution { + if (isNaN(n) || !isFinite(n) || n !== Math.floor(n) || n <= 0) { + throw new Error("expected positive integer, but got: " + n); + } + return new Float64Array(n).fill(1 / n); +} + +/** + * Compute the maximum difference (in absolute value) between components in two + * distributions. + * + * Equivalent to $\norm{pi0 - pi1}_\infty$. 
+ */ +export function computeDelta(pi0: Distribution, pi1: Distribution) { + if (pi0.length === 0 || pi0.length !== pi1.length) { + throw new Error("invalid input"); + } + let maxDelta = -Infinity; + // Here, we assume that `pi0` and `pi1` are indexed in the same + // node order (i.e., there has been no permutation). + pi0.forEach((x, i) => { + const delta = Math.abs(x - pi1[i]); + maxDelta = Math.max(delta, maxDelta); + }); + return maxDelta; +} diff --git a/src/core/attribution/distribution.test.js b/src/core/attribution/distribution.test.js new file mode 100644 index 0000000..095b66c --- /dev/null +++ b/src/core/attribution/distribution.test.js @@ -0,0 +1,52 @@ +// @flow + +import {uniformDistribution, computeDelta} from "./distribution"; + +describe("core/attribution/distribution", () => { + describe("uniformDistribution", () => { + describe("errors for: ", () => { + [ + [NaN, "NaN"], + [-1, "negatives"], + [0, "zero"], + [1.337, "non-integer"], + ].forEach(([value, name]) => { + it(name, () => { + expect(() => uniformDistribution(value)).toThrowError( + "expected positive integer" + ); + }); + }); + }); + it("returns a uniform distribution of size 1", () => { + expect(uniformDistribution(1)).toEqual(new Float64Array([1])); + }); + it("returns a uniform distribution of size 2", () => { + expect(uniformDistribution(2)).toEqual(new Float64Array([0.5, 0.5])); + }); + }); + + describe("computeDelta", () => { + const u = uniformDistribution; + it("errors on empty array", () => { + expect(() => + computeDelta(new Float64Array([]), new Float64Array([])) + ).toThrowError("invalid input"); + }); + it("works on size-1 array", () => { + expect(computeDelta(u(1), u(1))).toEqual(0); + }); + it("errors on mismatched sizes", () => { + expect(() => computeDelta(u(1), u(2))).toThrowError("invalid input"); + }); + it("correctly computes max delta", () => { + const pi = new Float64Array([0.5, 0.0, 0.5]); + expect(computeDelta(u(3), pi)).toEqual(1 / 3); + }); + it("doesn't depend on 
argument order", () => { + // implies that it uses Math.abs for delta computation + const pi = new Float64Array([0.5, 0.0, 0.5]); + expect(computeDelta(u(3), pi)).toEqual(computeDelta(pi, u(3))); + }); + }); +}); diff --git a/src/core/attribution/graphToMarkovChain.js b/src/core/attribution/graphToMarkovChain.js index 8b7ba2e..44a001b 100644 --- a/src/core/attribution/graphToMarkovChain.js +++ b/src/core/attribution/graphToMarkovChain.js @@ -1,15 +1,12 @@ // @flow -import {type Edge, type Graph, type NodeAddressT, NodeAddress} from "../graph"; -import { - type Distribution, - type SparseMarkovChain, - uniformDistribution, -} from "./markovChain"; +import {type Edge, type Graph, type NodeAddressT} from "../graph"; +import {type Distribution} from "./distribution"; +import {type Probability, type NodeDistribution} from "./nodeDistribution"; +import {type SparseMarkovChain} from "./markovChain"; import * as MapUtil from "../../util/map"; import * as NullUtil from "../../util/null"; -export type Probability = number; export type Adjacency = | {|+type: "SYNTHETIC_LOOP"|} | {|+type: "IN_EDGE", +edge: Edge|} @@ -35,56 +32,6 @@ export function adjacencySource(target: NodeAddressT, adjacency: Adjacency) { } } -/** - * Create a Distribution using provided node weights. - * - * weightedDistribution takes in a node order (as a read only array of NodeAddressT), - * and a map providing weights for a subset of those nodes. It returns a Distribution - * with the invariant that every node's weight is proportional to its relative weight - * in the weights map. For example, in a case where there were three nodes and they - * had weights of 0, 1, and 3 respectively, the distribution would be [0, 0.25, 0.75]. - * - * If a node address is not present in the weight map, its weight is assumed to be 0. - * If any weight is negative or non-finite, an error will be thrown. - * If the sum of all weights is 0, then a uniform distribution will be returned. 
- * If the weight map assigned weight to nodes which are not in the node order, an error - * will be thrown. - */ -export function weightedDistribution( - nodeOrder: $ReadOnlyArray, - weights: Map -): Distribution { - let totalWeight = 0; - for (const [address, weight] of weights.entries()) { - if (weight < 0 || !isFinite(weight)) { - throw new Error( - `Invalid weight ${weight} associated with address ${NodeAddress.toString( - address - )}` - ); - } - totalWeight += weight; - } - if (totalWeight === 0) { - return uniformDistribution(nodeOrder.length); - } - let numEncounteredWeights = 0; - const distribution = new Float64Array(nodeOrder.length); - for (let i = 0; i < distribution.length; i++) { - const weight = weights.get(nodeOrder[i]); - if (weight != null) { - numEncounteredWeights++; - distribution[i] = weight / totalWeight; - } - } - if (numEncounteredWeights !== weights.size) { - throw new Error("weights included nodes not present in the nodeOrder"); - } - return distribution; -} - -export type NodeDistribution = Map; - export type NodeToConnections = Map>; type NodeAddressMarkovChain = Map< diff --git a/src/core/attribution/graphToMarkovChain.test.js b/src/core/attribution/graphToMarkovChain.test.js index 22e54f9..eafcd0e 100644 --- a/src/core/attribution/graphToMarkovChain.test.js +++ b/src/core/attribution/graphToMarkovChain.test.js @@ -4,14 +4,16 @@ import sortBy from "lodash.sortby"; import {Graph, NodeAddress} from "../graph"; import { - distributionToNodeDistribution, createConnections, createOrderedSparseMarkovChain, normalize, normalizeNeighbors, permute, - weightedDistribution, } from "./graphToMarkovChain"; +import { + distributionToNodeDistribution, + weightedDistribution, +} from "./nodeDistribution"; import * as MapUtil from "../../util/map"; import {node, advancedGraph, edge} from "../graphTestUtil"; diff --git a/src/core/attribution/markovChain.js b/src/core/attribution/markovChain.js index 2ec0846..c868b73 100644 --- 
a/src/core/attribution/markovChain.js +++ b/src/core/attribution/markovChain.js @@ -1,12 +1,6 @@ // @flow -/** - * A distribution over the integers `0` through `n - 1`, where `n` is - * the length of the array. The value at index `i` is the probability of - * `i` in the distribution. The values should sum to 1. - */ -export type Distribution = Float64Array; - +import {computeDelta, type Distribution} from "./distribution"; /** * The data inputs to running PageRank. * @@ -123,13 +117,6 @@ export function sparseMarkovChainFromTransitionMatrix( }); } -export function uniformDistribution(n: number): Distribution { - if (isNaN(n) || !isFinite(n) || n !== Math.floor(n) || n <= 0) { - throw new Error("expected positive integer, but got: " + n); - } - return new Float64Array(n).fill(1 / n); -} - function sparseMarkovChainActionInto( chain: SparseMarkovChain, seed: Distribution, @@ -159,23 +146,6 @@ export function sparseMarkovChainAction( return result; } -/** - * Compute the maximum difference (in absolute value) between components in two - * distributions. - * - * Equivalent to $\norm{pi0 - pi1}_\infty$. - */ -export function computeDelta(pi0: Distribution, pi1: Distribution) { - let maxDelta = -Infinity; - // Here, we assume that `pi0.nodeOrder` and `pi1.nodeOrder` are the - // same (i.e., there has been no permutation). 
- pi0.forEach((x, i) => { - const delta = Math.abs(x - pi1[i]); - maxDelta = Math.max(delta, maxDelta); - }); - return maxDelta; -} - function* findStationaryDistributionGenerator( params: PagerankParams, options: {| diff --git a/src/core/attribution/markovChain.test.js b/src/core/attribution/markovChain.test.js index 22a973a..0a71bde 100644 --- a/src/core/attribution/markovChain.test.js +++ b/src/core/attribution/markovChain.test.js @@ -1,12 +1,15 @@ // @flow -import type {Distribution, SparseMarkovChain} from "./markovChain"; import { + type Distribution, + uniformDistribution, + computeDelta, +} from "./distribution"; +import { + type SparseMarkovChain, findStationaryDistribution, sparseMarkovChainAction, sparseMarkovChainFromTransitionMatrix, - uniformDistribution, - computeDelta, type StationaryDistributionResult, type PagerankParams, } from "./markovChain"; diff --git a/src/core/attribution/nodeDistribution.js b/src/core/attribution/nodeDistribution.js new file mode 100644 index 0000000..27133d5 --- /dev/null +++ b/src/core/attribution/nodeDistribution.js @@ -0,0 +1,67 @@ +// @flow + +import {type NodeAddressT, NodeAddress} from "../graph"; +import {type Distribution, uniformDistribution} from "./distribution"; + +export type Probability = number; +export type NodeDistribution = Map; + +export function distributionToNodeDistribution( + nodeOrder: $ReadOnlyArray, + pi: Distribution +): NodeDistribution { + const result = new Map(); + nodeOrder.forEach((node, i) => { + const probability = pi[i]; + result.set(node, probability); + }); + return result; +} + +/** + * Create a Distribution using provided node weights. + * + * weightedDistribution takes in a node order (as a read only array of NodeAddressT), + * and a map providing weights for a subset of those nodes. It returns a Distribution + * with the invariant that every node's weight is proportional to its relative weight + * in the weights map. 
For example, in a case where there were three nodes and they + * had weights of 0, 1, and 3 respectively, the distribution would be [0, 0.25, 0.75]. + * + * If a node address is not present in the weight map, its weight is assumed to be 0. + * If any weight is negative or non-finite, an error will be thrown. + * If the sum of all weights is 0, then a uniform distribution will be returned. + * If the weight map assigned weight to nodes which are not in the node order, an error + * will be thrown. + */ +export function weightedDistribution( + nodeOrder: $ReadOnlyArray, + weights: Map +): Distribution { + let totalWeight = 0; + for (const [address, weight] of weights.entries()) { + if (weight < 0 || !isFinite(weight)) { + throw new Error( + `Invalid weight ${weight} associated with address ${NodeAddress.toString( + address + )}` + ); + } + totalWeight += weight; + } + if (totalWeight === 0) { + return uniformDistribution(nodeOrder.length); + } + let numEncounteredWeights = 0; + const distribution = new Float64Array(nodeOrder.length); + for (let i = 0; i < distribution.length; i++) { + const weight = weights.get(nodeOrder[i]); + if (weight != null) { + numEncounteredWeights++; + distribution[i] = weight / totalWeight; + } + } + if (numEncounteredWeights !== weights.size) { + throw new Error("weights included nodes not present in the nodeOrder"); + } + return distribution; +} diff --git a/src/core/attribution/nodeDistribution.test.js b/src/core/attribution/nodeDistribution.test.js new file mode 100644 index 0000000..864e189 --- /dev/null +++ b/src/core/attribution/nodeDistribution.test.js @@ -0,0 +1,89 @@ +// @flow + +import {NodeAddress} from "../../core/graph"; +import { + weightedDistribution, + distributionToNodeDistribution, +} from "./nodeDistribution"; + +describe("core/attribution/nodeDistribution", () => { + const n1 = NodeAddress.fromParts(["n1"]); + const n2 = NodeAddress.fromParts(["n2"]); + + describe("distributionToNodeDistribution", () => { + it("works", 
() => { + const pi = new Float64Array([0.25, 0.75]); + expect(distributionToNodeDistribution([n1, n2], pi)).toEqual( + new Map().set(n1, 0.25).set(n2, 0.75) + ); + }); + }); + + describe("weightedDistribution", () => { + const a = NodeAddress.fromParts(["a"]); + const b = NodeAddress.fromParts(["b"]); + const c = NodeAddress.fromParts(["c"]); + const d = NodeAddress.fromParts(["d"]); + const order = () => [a, b, c, d]; + it("gives a uniform distribution for an empty map", () => { + expect(weightedDistribution(order(), new Map())).toEqual( + new Float64Array([0.25, 0.25, 0.25, 0.25]) + ); + }); + it("gives a uniform distribution for a map with 0 weight", () => { + const map = new Map().set(a, 0); + expect(weightedDistribution(order(), map)).toEqual( + new Float64Array([0.25, 0.25, 0.25, 0.25]) + ); + }); + it("can put all weight on one node", () => { + const map = new Map().set(b, 0.1); + expect(weightedDistribution(order(), map)).toEqual( + new Float64Array([0, 1, 0, 0]) + ); + }); + it("can split weight unequally", () => { + const map = new Map().set(b, 1).set(c, 3); + expect(weightedDistribution(order(), map)).toEqual( + new Float64Array([0, 0.25, 0.75, 0]) + ); + }); + it("can create a uniform distribution if all weights are equal", () => { + const map = new Map() + .set(a, 1) + .set(b, 1) + .set(c, 1) + .set(d, 1); + expect(weightedDistribution(order(), map)).toEqual( + new Float64Array([0.25, 0.25, 0.25, 0.25]) + ); + }); + describe("errors if", () => { + it("has a weighted node that is not in the order", () => { + const z = NodeAddress.fromParts(["z"]); + const map = new Map().set(z, 1); + expect(() => weightedDistribution(order(), map)).toThrowError( + "weights included nodes not present in the nodeOrder" + ); + }); + it("has a node with negative weight", () => { + const map = new Map().set(a, -1); + expect(() => weightedDistribution(order(), map)).toThrowError( + "Invalid weight -1" + ); + }); + it("has a node with NaN weight", () => { + const map = new 
Map().set(a, NaN); + expect(() => weightedDistribution(order(), map)).toThrowError( + "Invalid weight NaN" + ); + }); + it("has a node with infinite weight", () => { + const map = new Map().set(a, Infinity); + expect(() => weightedDistribution(order(), map)).toThrowError( + "Invalid weight Infinity" + ); + }); + }); + }); +}); diff --git a/src/core/pagerankGraph.js b/src/core/pagerankGraph.js index e4afcba..1b6b1cb 100644 --- a/src/core/pagerankGraph.js +++ b/src/core/pagerankGraph.js @@ -16,12 +16,14 @@ import { type NeighborsOptions, } from "./graph"; import { - distributionToNodeDistribution, createConnections, createOrderedSparseMarkovChain, type EdgeWeight, - weightedDistribution, } from "./attribution/graphToMarkovChain"; +import { + distributionToNodeDistribution, + weightedDistribution, +} from "./attribution/nodeDistribution"; import { findStationaryDistribution, type PagerankParams,