diff --git a/src/analysis/credData.js b/src/analysis/credData.js index 4f6a484..c69d1dd 100644 --- a/src/analysis/credData.js +++ b/src/analysis/credData.js @@ -122,3 +122,84 @@ export function computeCredData(scores: TimelineCredScores): CredData { intervalEnds, }; } + +/** + * Compress the cred data by removing all time-level info on + * flows/accumulations that sum to less than the threshold. + * + * E.g. if we set the threshold to 10 and a node has only 9 cred, we store its + * summary info but not how those flows split across time. + * + * If the node had 11 cred but only 1 cred from seed and 0 from synthetic loop, + * then we store the timing info for its cred, but not for its seed or + * synthetic loop flows. + * + * Likewise for edges, we separately decide whether to store the forward flow + * and the backward flow. + */ +export function compressByThreshold(x: CredData, threshold: number): CredData { + const { + nodeSummaries, + nodeOverTime, + edgeSummaries, + edgeOverTime, + intervalEnds, + } = x; + + const newNodeOverTime = nodeOverTime.map((d, i) => { + if (d == null) { + // It might be null if the data was already compressed. The function + // should be idempotent. This way we can chain compression strategies + // later on. + return null; + } + const s = nodeSummaries[i]; + if (s.cred < threshold) { + // If the cred is below threshold, then we know both the seed flow and + // the synthetic loop flow are below threshold, since the cred is the sum + // of those flows plus the flows from edges. So we can shortcut straight + // to returning null. + return null; + } + return { + // We get a space efficiency boost here, since for the majority of nodes, + // even though they have material cred, they have little seed or + // synthetic loop flow. So we can save ourselves from storing large + // arrays of near-zero values. + cred: d.cred, + seedFlow: s.seedFlow < threshold ? null : d.seedFlow, + syntheticLoopFlow: + s.syntheticLoopFlow < threshold ? null : d.syntheticLoopFlow, + }; + }); + + const newEdgeOverTime = edgeOverTime.map((d, i) => { + if (d == null) { + // It might be null if the data was already compressed. The function + // should be idempotent. This way we can chain compression strategies + // later on. + return null; + } + const {forwardFlow, backwardFlow} = edgeSummaries[i]; + const checkF = forwardFlow >= threshold; + const checkB = backwardFlow >= threshold; + if (checkF || checkB) { + // The edge might be effectively unidrectional--in that case let's not + // waste space storing data for the direction that had very little cred + // flow. + return { + forwardFlow: checkF ? d.forwardFlow : null, + backwardFlow: checkB ? d.backwardFlow : null, + }; + } + return null; + }); + + return { + nodeOverTime: newNodeOverTime, + edgeOverTime: newEdgeOverTime, + nodeSummaries, + edgeSummaries, + intervalEnds, + }; +} diff --git a/src/analysis/credData.test.js b/src/analysis/credData.test.js index 4efad05..7b22e2b 100644 --- a/src/analysis/credData.test.js +++ b/src/analysis/credData.test.js @@ -1,6 +1,6 @@ // @flow -import {computeCredData} from "./credData"; +import {computeCredData, compressByThreshold} from "./credData"; import type {TimelineCredScores} from "../core/algorithm/distributionToCred"; describe("src/analysis/credData", () => { @@ -52,4 +52,56 @@ describe("src/analysis/credData", () => { }; expect(computeCredData(scores)).toEqual(expected); }); + it("compresses by threshold correctly", () => { + const intervalEnds = [100, 200]; + const nodeSummaries = [ + {cred: 14, seedFlow: 0, syntheticLoopFlow: 0.2}, + {cred: 20, seedFlow: 20, syntheticLoopFlow: 0}, + {cred: 1, seedFlow: 1, syntheticLoopFlow: 0}, + ]; + const nodeOverTime = [ + {cred: [4, 10], seedFlow: [0, 0], syntheticLoopFlow: [0.1, 0.1]}, + {cred: [10, 10], seedFlow: [10, 10], syntheticLoopFlow: [0, 0]}, + {cred: [5, 0], seedFlow: [0, 0], syntheticLoopFlow: [0, 0]}, + ]; + const edgeSummaries = [ + {forwardFlow: 20, backwardFlow: 2}, + { + forwardFlow: 10, + backwardFlow: 10, + }, + {forwardFlow: 1, backwardFlow: 1}, + ]; + const edgeOverTime = [ + {forwardFlow: [19, 1], backwardFlow: [2, 0]}, + {forwardFlow: [5, 5], backwardFlow: [9, 1]}, + {forwardFlow: [1, 0], backwardFlow: [0, 1]}, + ]; + const input = { + intervalEnds, + nodeSummaries, + nodeOverTime, + edgeSummaries, + edgeOverTime, + }; + const expected = { + intervalEnds, + nodeSummaries, + nodeOverTime: [ + {cred: [4, 10], seedFlow: null, syntheticLoopFlow: null}, + {cred: [10, 10], seedFlow: [10, 10], syntheticLoopFlow: null}, + null, + ], + edgeSummaries, + edgeOverTime: [ + {forwardFlow: [19, 1], backwardFlow: null}, + {forwardFlow: [5, 5], backwardFlow: [9, 1]}, + null, + ], + }; + const result = compressByThreshold(input, 10); + expect(result).toEqual(expected); + // Check that it's idempotent too. + expect(compressByThreshold(result, 10)).toEqual(result); + }); }); diff --git a/src/analysis/credResult.js b/src/analysis/credResult.js index 7775455..08f7156 100644 --- a/src/analysis/credResult.js +++ b/src/analysis/credResult.js @@ -20,7 +20,11 @@ import { fromJSON as pluginsFromJSON, } from "./pluginDeclaration"; import {timelinePagerank} from "../core/algorithm/timelinePagerank"; -import {type CredData, computeCredData} from "./credData"; +import { + type CredData, + computeCredData, + compressByThreshold as _compressByThreshold, +} from "./credData"; import {distributionToCred} from "../core/algorithm/distributionToCred"; /** @@ -56,6 +60,23 @@ export async function compute( return {weightedGraph: wg, credData, params, plugins}; } +// Lossily compress a CredResult, by throwing away time-specific cred +// data for any flows that summed to less than `threshold` cred. +// We may want to implement more sophisticated and context-aware strategies +// in the future. +export function compressByThreshold( + x: CredResult, + threshold: number +): CredResult { + const {params, plugins, weightedGraph, credData} = x; + return { + params, + plugins, + weightedGraph, + credData: _compressByThreshold(credData, threshold), + }; +} + const COMPAT_INFO = {type: "sourcecred/credResult", version: "0.1.0"}; export type CredResultJSON = Compatible<{|