Add lossy cred compression strategy (#1832)
This adds a new `compressByThreshold` function in the credResult module, which compresses a CredResult by removing interval-level cred data for any cred flow whose total is below a threshold. Test plan: unit tests included; `yarn test` passes.
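As an illustrative usage sketch (not part of the diff below): given an already-computed CredResult, the new top-level function can be applied before serializing results to shrink what gets written to disk. The `credResult` variable and the threshold of 10 are hypothetical placeholders; only `compressByThreshold` itself (exported from the credResult module under src/analysis) comes from this commit.

// Hypothetical usage; assumes `compressByThreshold` from the credResult
// module is in scope and `credResult` is a previously computed CredResult.
const compressed = compressByThreshold(credResult, 10);
// Summary totals are preserved; per-interval arrays are dropped for any
// node/edge flow whose summed cred is below 10.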
parent f615ec89a3
commit d556776cca
@@ -122,3 +122,84 @@ export function computeCredData(scores: TimelineCredScores): CredData {
     intervalEnds,
   };
 }
+
+/**
+ * Compress the cred data by removing all time-level info on
+ * flows/accumulations that sum to less than the threshold.
+ *
+ * E.g. if we set the threshold to 10 and a node has only 9 cred, we store its
+ * summary info but not how those flows split across time.
+ *
+ * If the node had 11 cred but only 1 cred from seed and 0 from synthetic loop,
+ * then we store the timing info for its cred, but not for its seed or
+ * synthetic loop flows.
+ *
+ * Likewise for edges, we separately decide whether to store the forward flow
+ * and the backward flow.
+ */
+export function compressByThreshold(x: CredData, threshold: number): CredData {
+  const {
+    nodeSummaries,
+    nodeOverTime,
+    edgeSummaries,
+    edgeOverTime,
+    intervalEnds,
+  } = x;
+
+  const newNodeOverTime = nodeOverTime.map((d, i) => {
+    if (d == null) {
+      // It might be null if the data was already compressed. The function
+      // should be idempotent. This way we can chain compression strategies
+      // later on.
+      return null;
+    }
+    const s = nodeSummaries[i];
+    if (s.cred < threshold) {
+      // If the cred is below threshold, then we know both the seed flow and
+      // the synthetic loop flow are below threshold, since the cred is the sum
+      // of those flows plus the flows from edges. So we can shortcut straight
+      // to returning null.
+      return null;
+    }
+    return {
+      // We get a space efficiency boost here, since for the majority of nodes,
+      // even though they have material cred, they have little seed or
+      // synthetic loop flow. So we can save ourselves from storing large
+      // arrays of near-zero values.
+      cred: d.cred,
+      seedFlow: s.seedFlow < threshold ? null : d.seedFlow,
+      syntheticLoopFlow:
+        s.syntheticLoopFlow < threshold ? null : d.syntheticLoopFlow,
+    };
+  });
+
+  const newEdgeOverTime = edgeOverTime.map((d, i) => {
+    if (d == null) {
+      // It might be null if the data was already compressed. The function
+      // should be idempotent. This way we can chain compression strategies
+      // later on.
+      return null;
+    }
+    const {forwardFlow, backwardFlow} = edgeSummaries[i];
+    const checkF = forwardFlow >= threshold;
+    const checkB = backwardFlow >= threshold;
+    if (checkF || checkB) {
+      // The edge might be effectively unidirectional--in that case let's not
+      // waste space storing data for the direction that had very little cred
+      // flow.
+      return {
+        forwardFlow: checkF ? d.forwardFlow : null,
+        backwardFlow: checkB ? d.backwardFlow : null,
+      };
+    }
+    return null;
+  });
+
+  return {
+    nodeOverTime: newNodeOverTime,
+    edgeOverTime: newEdgeOverTime,
+    nodeSummaries,
+    edgeSummaries,
+    intervalEnds,
+  };
+}
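To make the node-level rule from the doc comment above concrete, here is a small illustrative sketch. It is not part of the diff; the numbers echo the doc comment's "11 cred, 1 from seed" example, and it assumes the `compressByThreshold` function shown in the hunk above.

// Illustrative only: a minimal single-node CredData matching the doc
// comment's example (11 total cred, 1 from seed, 0 from the synthetic loop).
const data = {
  intervalEnds: [100, 200],
  nodeSummaries: [{cred: 11, seedFlow: 1, syntheticLoopFlow: 0}],
  nodeOverTime: [{cred: [5, 6], seedFlow: [1, 0], syntheticLoopFlow: [0, 0]}],
  edgeSummaries: [],
  edgeOverTime: [],
};
// With a threshold of 10, the node's cred timeline survives, but the seed
// and synthetic-loop timelines (totals 1 and 0, both under threshold) are
// replaced by null:
// compressByThreshold(data, 10).nodeOverTime[0]
//   => {cred: [5, 6], seedFlow: null, syntheticLoopFlow: null}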
@@ -1,6 +1,6 @@
 // @flow
 
-import {computeCredData} from "./credData";
+import {computeCredData, compressByThreshold} from "./credData";
 import type {TimelineCredScores} from "../core/algorithm/distributionToCred";
 
 describe("src/analysis/credData", () => {
@@ -52,4 +52,56 @@ describe("src/analysis/credData", () => {
     };
     expect(computeCredData(scores)).toEqual(expected);
   });
+  it("compresses by threshold correctly", () => {
+    const intervalEnds = [100, 200];
+    const nodeSummaries = [
+      {cred: 14, seedFlow: 0, syntheticLoopFlow: 0.2},
+      {cred: 20, seedFlow: 20, syntheticLoopFlow: 0},
+      {cred: 1, seedFlow: 1, syntheticLoopFlow: 0},
+    ];
+    const nodeOverTime = [
+      {cred: [4, 10], seedFlow: [0, 0], syntheticLoopFlow: [0.1, 0.1]},
+      {cred: [10, 10], seedFlow: [10, 10], syntheticLoopFlow: [0, 0]},
+      {cred: [5, 0], seedFlow: [0, 0], syntheticLoopFlow: [0, 0]},
+    ];
+    const edgeSummaries = [
+      {forwardFlow: 20, backwardFlow: 2},
+      {
+        forwardFlow: 10,
+        backwardFlow: 10,
+      },
+      {forwardFlow: 1, backwardFlow: 1},
+    ];
+    const edgeOverTime = [
+      {forwardFlow: [19, 1], backwardFlow: [2, 0]},
+      {forwardFlow: [5, 5], backwardFlow: [9, 1]},
+      {forwardFlow: [1, 0], backwardFlow: [0, 1]},
+    ];
+    const input = {
+      intervalEnds,
+      nodeSummaries,
+      nodeOverTime,
+      edgeSummaries,
+      edgeOverTime,
+    };
+    const expected = {
+      intervalEnds,
+      nodeSummaries,
+      nodeOverTime: [
+        {cred: [4, 10], seedFlow: null, syntheticLoopFlow: null},
+        {cred: [10, 10], seedFlow: [10, 10], syntheticLoopFlow: null},
+        null,
+      ],
+      edgeSummaries,
+      edgeOverTime: [
+        {forwardFlow: [19, 1], backwardFlow: null},
+        {forwardFlow: [5, 5], backwardFlow: [9, 1]},
+        null,
+      ],
+    };
+    const result = compressByThreshold(input, 10);
+    expect(result).toEqual(expected);
+    // Check that it's idempotent too.
+    expect(compressByThreshold(result, 10)).toEqual(result);
+  });
 });
@@ -20,7 +20,11 @@ import {
   fromJSON as pluginsFromJSON,
 } from "./pluginDeclaration";
 import {timelinePagerank} from "../core/algorithm/timelinePagerank";
-import {type CredData, computeCredData} from "./credData";
+import {
+  type CredData,
+  computeCredData,
+  compressByThreshold as _compressByThreshold,
+} from "./credData";
 import {distributionToCred} from "../core/algorithm/distributionToCred";
 
 /**
@@ -56,6 +60,23 @@ export async function compute(
   return {weightedGraph: wg, credData, params, plugins};
 }
 
+// Lossily compress a CredResult, by throwing away time-specific cred
+// data for any flows that summed to less than `threshold` cred.
+// We may want to implement more sophisticated and context-aware strategies
+// in the future.
+export function compressByThreshold(
+  x: CredResult,
+  threshold: number
+): CredResult {
+  const {params, plugins, weightedGraph, credData} = x;
+  return {
+    params,
+    plugins,
+    weightedGraph,
+    credData: _compressByThreshold(credData, threshold),
+  };
+}
+
 const COMPAT_INFO = {type: "sourcecred/credResult", version: "0.1.0"};
 
 export type CredResultJSON = Compatible<{|
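Because the underlying CredData compression is idempotent (already-null timelines pass through unchanged), the CredResult-level wrapper can be re-applied, or later chained with the more sophisticated strategies the comment above anticipates. A brief illustrative sketch, with `result` standing in for a computed CredResult; none of this appears in the diff:

// Illustrative: compressing an already-compressed CredResult at the same
// threshold changes nothing, so strategies can be layered safely.
const once = compressByThreshold(result, 10);
const twice = compressByThreshold(once, 10);
// `twice` is deep-equal to `once`; the thresholded timelines stay null.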