Add lossy cred compression strategy (#1832)

This adds a new `compressByThreshold` function in the credResult module
(with the core logic in the credData module), which compresses a CredResult
by removing interval-level cred data for cred flows that sum to less than a
threshold.

Test plan: Unit tests included; `yarn test` passes.
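
As an illustrative sketch (not part of this diff): the snippet below shows
roughly how the new credData-level export behaves. The CredData shape and
values mirror the unit test included in this commit, and the threshold of 10
is arbitrary.

import {compressByThreshold} from "./credData";

const credData = {
  intervalEnds: [100, 200],
  nodeSummaries: [{cred: 1, seedFlow: 1, syntheticLoopFlow: 0}],
  nodeOverTime: [{cred: [5, 0], seedFlow: [0, 0], syntheticLoopFlow: [0, 0]}],
  edgeSummaries: [{forwardFlow: 1, backwardFlow: 1}],
  edgeOverTime: [{forwardFlow: [1, 0], backwardFlow: [0, 1]}],
};

// Every flow here sums to less than 10, so the interval-level arrays are
// dropped entirely, while the summaries and interval ends stay intact:
const compressed = compressByThreshold(credData, 10);
// compressed.nodeOverTime => [null]
// compressed.edgeOverTime => [null]
// compressed.nodeSummaries => unchanged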
This commit is contained in:
Dandelion Mané 2020-06-01 18:29:30 -07:00 committed by GitHub
parent f615ec89a3
commit d556776cca
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 156 additions and 2 deletions

@@ -122,3 +122,84 @@ export function computeCredData(scores: TimelineCredScores): CredData {
intervalEnds,
};
}
/**
* Compress the cred data by removing all time-level info on
* flows/accumulations that sum to less than the threshold.
*
* E.g. if we set the threshold to 10 and a node has only 9 cred, we store its
* summary info but not how those flows split across time.
*
* If the node had 11 cred but only 1 cred from seed and 0 from synthetic loop,
* then we store the timing info for its cred, but not for its seed or
* synthetic loop flows.
*
* Likewise for edges, we separately decide whether to store the forward flow
* and the backward flow.
*/
export function compressByThreshold(x: CredData, threshold: number): CredData {
const {
nodeSummaries,
nodeOverTime,
edgeSummaries,
edgeOverTime,
intervalEnds,
} = x;
const newNodeOverTime = nodeOverTime.map((d, i) => {
if (d == null) {
// It might be null if the data was already compressed. The function
// should be idempotent. This way we can chain compression strategies
// later on.
return null;
}
const s = nodeSummaries[i];
if (s.cred < threshold) {
// If the cred is below threshold, then we know both the seed flow and
// the synthetic loop flow are below threshold, since the cred is the sum
// of those flows plus the flows from edges. So we can shortcut straight
// to returning null.
return null;
}
return {
// We get a space efficiency boost here, since for the majority of nodes,
// even though they have material cred, they have little seed or
// synthetic loop flow. So we can save ourselves from storing large
// arrays of near-zero values.
cred: d.cred,
seedFlow: s.seedFlow < threshold ? null : d.seedFlow,
syntheticLoopFlow:
s.syntheticLoopFlow < threshold ? null : d.syntheticLoopFlow,
};
});
const newEdgeOverTime = edgeOverTime.map((d, i) => {
if (d == null) {
// It might be null if the data was already compressed. The function
// should be idempotent. This way we can chain compression strategies
// later on.
return null;
}
const {forwardFlow, backwardFlow} = edgeSummaries[i];
const checkF = forwardFlow >= threshold;
const checkB = backwardFlow >= threshold;
if (checkF || checkB) {
// The edge might be effectively unidirectional. In that case, let's not
// waste space storing data for the direction that had very little cred
// flow.
return {
forwardFlow: checkF ? d.forwardFlow : null,
backwardFlow: checkB ? d.backwardFlow : null,
};
}
return null;
});
return {
nodeOverTime: newNodeOverTime,
edgeOverTime: newEdgeOverTime,
nodeSummaries,
edgeSummaries,
intervalEnds,
};
}

@@ -1,6 +1,6 @@
// @flow
import {computeCredData} from "./credData";
import {computeCredData, compressByThreshold} from "./credData";
import type {TimelineCredScores} from "../core/algorithm/distributionToCred";
describe("src/analysis/credData", () => {
@@ -52,4 +52,56 @@ describe("src/analysis/credData", () => {
};
expect(computeCredData(scores)).toEqual(expected);
});
it("compresses by threshold correctly", () => {
const intervalEnds = [100, 200];
const nodeSummaries = [
{cred: 14, seedFlow: 0, syntheticLoopFlow: 0.2},
{cred: 20, seedFlow: 20, syntheticLoopFlow: 0},
{cred: 1, seedFlow: 1, syntheticLoopFlow: 0},
];
const nodeOverTime = [
{cred: [4, 10], seedFlow: [0, 0], syntheticLoopFlow: [0.1, 0.1]},
{cred: [10, 10], seedFlow: [10, 10], syntheticLoopFlow: [0, 0]},
{cred: [5, 0], seedFlow: [0, 0], syntheticLoopFlow: [0, 0]},
];
const edgeSummaries = [
{forwardFlow: 20, backwardFlow: 2},
{forwardFlow: 10, backwardFlow: 10},
{forwardFlow: 1, backwardFlow: 1},
];
const edgeOverTime = [
{forwardFlow: [19, 1], backwardFlow: [2, 0]},
{forwardFlow: [5, 5], backwardFlow: [9, 1]},
{forwardFlow: [1, 0], backwardFlow: [0, 1]},
];
const input = {
intervalEnds,
nodeSummaries,
nodeOverTime,
edgeSummaries,
edgeOverTime,
};
const expected = {
intervalEnds,
nodeSummaries,
nodeOverTime: [
{cred: [4, 10], seedFlow: null, syntheticLoopFlow: null},
{cred: [10, 10], seedFlow: [10, 10], syntheticLoopFlow: null},
null,
],
edgeSummaries,
edgeOverTime: [
{forwardFlow: [19, 1], backwardFlow: null},
{forwardFlow: [5, 5], backwardFlow: [9, 1]},
null,
],
};
const result = compressByThreshold(input, 10);
expect(result).toEqual(expected);
// Check that it's idempotent too.
expect(compressByThreshold(result, 10)).toEqual(result);
});
});

@@ -20,7 +20,11 @@ import {
fromJSON as pluginsFromJSON,
} from "./pluginDeclaration";
import {timelinePagerank} from "../core/algorithm/timelinePagerank";
import {type CredData, computeCredData} from "./credData";
import {
type CredData,
computeCredData,
compressByThreshold as _compressByThreshold,
} from "./credData";
import {distributionToCred} from "../core/algorithm/distributionToCred";
/**
@@ -56,6 +60,23 @@ export async function compute(
return {weightedGraph: wg, credData, params, plugins};
}
// Lossily compress a CredResult by throwing away time-specific cred data
// for any flows that summed to less than `threshold` cred.
// We may want to implement more sophisticated and context-aware strategies
// in the future.
export function compressByThreshold(
x: CredResult,
threshold: number
): CredResult {
const {params, plugins, weightedGraph, credData} = x;
return {
params,
plugins,
weightedGraph,
credData: _compressByThreshold(credData, threshold),
};
}
const COMPAT_INFO = {type: "sourcecred/credResult", version: "0.1.0"};
export type CredResultJSON = Compatible<{|