PagerankGraph reuses existing distribution (#1160)

This commit modifies `PagerankGraph.runPagerank` so that rather than
always starting from a uniform distribution, it starts with the
PagerankGraph's existing score distribution. The PagerankGraph
initializes with a uniform score over nodes, so it has the exact same
behavior on the first time that runPagerank is called. On subsequent
calls, PageRank will likely converge a lot faster, because it's starting
from converged scores. (It should still be a lot faster in cases where
e.g. the user has tweaked the weights.)

In certain degenerate cases, this could change the resultant scores.
Specifically, if there are isolated nodes in the graph and alpha=0, then
the isolated nodes' final scores depend on the initial score. In
general, I think this won't be an issue as we expect alpha > 0 in normal
usage.

Test plan: I added a unit test to verify this property, by checking that
running PageRank with maxIterations==0 on an already-converged graph
results in a still-converged graph. Also, existing tests pass.

I think we can now close #1020.
This commit is contained in:
Dandelion Mané 2019-05-29 20:29:28 +03:00 committed by GitHub
parent 14eee06799
commit 4dc97fcc57
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 28 additions and 5 deletions

View File

@ -26,7 +26,6 @@ import {
findStationaryDistribution, findStationaryDistribution,
type PagerankParams, type PagerankParams,
type PagerankOptions as CorePagerankOptions, type PagerankOptions as CorePagerankOptions,
uniformDistribution,
} from "../core/attribution/markovChain"; } from "../core/attribution/markovChain";
import * as NullUtil from "../util/null"; import * as NullUtil from "../util/null";
@ -456,9 +455,13 @@ export class PagerankGraph {
* individual delta in a node's score between the present and previous * individual delta in a node's score between the present and previous
* iteration is less than or equal to `options.convergenceThreshold`. * iteration is less than or equal to `options.convergenceThreshold`.
* *
* TODO(#1020): Make `runPagerank` use the current nodes' scores as a * Note that if runPagerank is called multiple times on the same
* starting point for computation, rather than re-generating from * PagerankGraph, it will re-use the last stationary distribution as the
* scratch every time `runPagerank` is called. * starting point for running PageRank again. In general, this will result in
* improved performance, and it will not usually affect the outcome from
* PageRank. However, in certain circumstances, it could result in different
* outputs. For example, if there are isolated nodes and no seed vector, then
* the initial distribution may matter.
*/ */
async runPagerank( async runPagerank(
options?: PagerankOptions options?: PagerankOptions
@ -476,11 +479,15 @@ export class PagerankGraph {
this._syntheticLoopWeight this._syntheticLoopWeight
); );
const osmc = createOrderedSparseMarkovChain(connections); const osmc = createOrderedSparseMarkovChain(connections);
const pi0 = new Float64Array(osmc.chain.length);
osmc.nodeOrder.forEach(
(n: NodeAddressT, i) => (pi0[i] = NullUtil.get(this.node(n)).score)
);
const params: PagerankParams = { const params: PagerankParams = {
chain: osmc.chain, chain: osmc.chain,
alpha: fullOptions.alpha, alpha: fullOptions.alpha,
seed: weightedDistribution(osmc.nodeOrder, fullOptions.seed), seed: weightedDistribution(osmc.nodeOrder, fullOptions.seed),
pi0: uniformDistribution(osmc.chain.length), pi0,
}; };
const coreOptions: CorePagerankOptions = { const coreOptions: CorePagerankOptions = {
verbose: false, verbose: false,

View File

@ -609,6 +609,22 @@ describe("core/pagerankGraph", () => {
expect(results.convergenceDelta).toBeLessThan(convergenceThreshold); expect(results.convergenceDelta).toBeLessThan(convergenceThreshold);
checkProbabilityDistribution(pg); checkProbabilityDistribution(pg);
}); });
// Checks that a second `runPagerank` call on the same graph starts from the
// scores produced by the first call instead of starting over.
it("re-uses existing scores as a starting point", async () => {
const pg = examplePagerankGraph();
const convergenceThreshold = 0.001;
// First run: allow up to 170 iterations so the scores can converge.
const results1 = await pg.runPagerank({
maxIterations: 170,
convergenceThreshold,
});
// Precondition for the rest of the test: the first run really converged.
expect(results1.convergenceDelta).toBeLessThan(convergenceThreshold);
// It should still converge without any iterations, because it uses the
// final distribution as a starting point
const results2 = await pg.runPagerank({
maxIterations: 0,
convergenceThreshold,
});
// NOTE(review): exact equality presumably relies on a zero-iteration run
// reporting the delta of the (already converged) starting distribution;
// confirm against findStationaryDistribution.
expect(results2.convergenceDelta).toEqual(results1.convergenceDelta);
});
}); });
describe("equals", () => { describe("equals", () => {