Add lossy cred compression strategy (#1832)
This adds a new `compressByThreshold` function in the credResult module, which compresses a CredResult by removing interval-level cred data for any cred flow whose total is below a threshold. Test plan: unit tests included; `yarn test` passes.
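As an illustrative usage sketch (not part of the diff below): given an already-computed CredResult, the new top-level function can be applied before serializing results to shrink what gets written to disk. The `credResult` variable and the threshold of 10 are hypothetical placeholders; only `compressByThreshold` itself (exported from the credResult module under src/analysis) comes from this commit.

// Hypothetical usage; assumes `compressByThreshold` from the credResult
// module is in scope and `credResult` is a previously computed CredResult.
const compressed = compressByThreshold(credResult, 10);
// Summary totals are preserved; per-interval arrays are dropped for any
// node/edge flow whose summed cred is below 10.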
parent f615ec89a3
commit d556776cca
@@ -122,3 +122,84 @@ export function computeCredData(scores: TimelineCredScores): CredData {
     intervalEnds,
   };
 }
+
+/**
+ * Compress the cred data by removing all time-level info on
+ * flows/accumulations that sum to less than the threshold.
+ *
+ * E.g. if we set the threshold to 10 and a node has only 9 cred, we store its
+ * summary info but not how those flows split across time.
+ *
+ * If the node had 11 cred but only 1 cred from seed and 0 from synthetic loop,
+ * then we store the timing info for its cred, but not for its seed or
+ * synthetic loop flows.
+ *
+ * Likewise for edges, we separately decide whether to store the forward flow
+ * and the backward flow.
+ */
+export function compressByThreshold(x: CredData, threshold: number): CredData {
+  const {
+    nodeSummaries,
+    nodeOverTime,
+    edgeSummaries,
+    edgeOverTime,
+    intervalEnds,
+  } = x;
+
+  const newNodeOverTime = nodeOverTime.map((d, i) => {
+    if (d == null) {
+      // It might be null if the data was already compressed. The function
+      // should be idempotent. This way we can chain compression strategies
+      // later on.
+      return null;
+    }
+    const s = nodeSummaries[i];
+    if (s.cred < threshold) {
+      // If the cred is below threshold, then we know both the seed flow and
+      // the synthetic loop flow are below threshold, since the cred is the sum
+      // of those flows plus the flows from edges. So we can shortcut straight
+      // to returning null.
+      return null;
+    }
+    return {
+      // We get a space efficiency boost here, since for the majority of nodes,
+      // even though they have material cred, they have little seed or
+      // synthetic loop flow. So we can save ourselves from storing large
+      // arrays of near-zero values.
+      cred: d.cred,
+      seedFlow: s.seedFlow < threshold ? null : d.seedFlow,
+      syntheticLoopFlow:
+        s.syntheticLoopFlow < threshold ? null : d.syntheticLoopFlow,
+    };
+  });
+
+  const newEdgeOverTime = edgeOverTime.map((d, i) => {
+    if (d == null) {
+      // It might be null if the data was already compressed. The function
+      // should be idempotent. This way we can chain compression strategies
+      // later on.
+      return null;
+    }
+    const {forwardFlow, backwardFlow} = edgeSummaries[i];
+    const checkF = forwardFlow >= threshold;
+    const checkB = backwardFlow >= threshold;
+    if (checkF || checkB) {
+      // The edge might be effectively unidirectional--in that case let's not
+      // waste space storing data for the direction that had very little cred
+      // flow.
+      return {
+        forwardFlow: checkF ? d.forwardFlow : null,
+        backwardFlow: checkB ? d.backwardFlow : null,
+      };
+    }
+    return null;
+  });
+
+  return {
+    nodeOverTime: newNodeOverTime,
+    edgeOverTime: newEdgeOverTime,
+    nodeSummaries,
+    edgeSummaries,
+    intervalEnds,
+  };
+}
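To make the node-level rule from the doc comment above concrete, here is a small illustrative sketch. It is not part of the diff; the numbers echo the doc comment's "11 cred, 1 from seed" example, and it assumes the `compressByThreshold` function shown in the hunk above.

// Illustrative only: a minimal single-node CredData matching the doc
// comment's example (11 total cred, 1 from seed, 0 from the synthetic loop).
const data = {
  intervalEnds: [100, 200],
  nodeSummaries: [{cred: 11, seedFlow: 1, syntheticLoopFlow: 0}],
  nodeOverTime: [{cred: [5, 6], seedFlow: [1, 0], syntheticLoopFlow: [0, 0]}],
  edgeSummaries: [],
  edgeOverTime: [],
};
// With a threshold of 10, the node's cred timeline survives, but the seed
// and synthetic-loop timelines (totals 1 and 0, both under threshold) are
// replaced by null:
// compressByThreshold(data, 10).nodeOverTime[0]
//   => {cred: [5, 6], seedFlow: null, syntheticLoopFlow: null}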
@@ -1,6 +1,6 @@
 // @flow
 
-import {computeCredData} from "./credData";
+import {computeCredData, compressByThreshold} from "./credData";
 import type {TimelineCredScores} from "../core/algorithm/distributionToCred";
 
 describe("src/analysis/credData", () => {
@@ -52,4 +52,56 @@ describe("src/analysis/credData", () => {
     };
     expect(computeCredData(scores)).toEqual(expected);
   });
+  it("compresses by threshold correctly", () => {
+    const intervalEnds = [100, 200];
+    const nodeSummaries = [
+      {cred: 14, seedFlow: 0, syntheticLoopFlow: 0.2},
+      {cred: 20, seedFlow: 20, syntheticLoopFlow: 0},
+      {cred: 1, seedFlow: 1, syntheticLoopFlow: 0},
+    ];
+    const nodeOverTime = [
+      {cred: [4, 10], seedFlow: [0, 0], syntheticLoopFlow: [0.1, 0.1]},
+      {cred: [10, 10], seedFlow: [10, 10], syntheticLoopFlow: [0, 0]},
+      {cred: [5, 0], seedFlow: [0, 0], syntheticLoopFlow: [0, 0]},
+    ];
+    const edgeSummaries = [
+      {forwardFlow: 20, backwardFlow: 2},
+      {
+        forwardFlow: 10,
+        backwardFlow: 10,
+      },
+      {forwardFlow: 1, backwardFlow: 1},
+    ];
+    const edgeOverTime = [
+      {forwardFlow: [19, 1], backwardFlow: [2, 0]},
+      {forwardFlow: [5, 5], backwardFlow: [9, 1]},
+      {forwardFlow: [1, 0], backwardFlow: [0, 1]},
+    ];
+    const input = {
+      intervalEnds,
+      nodeSummaries,
+      nodeOverTime,
+      edgeSummaries,
+      edgeOverTime,
+    };
+    const expected = {
+      intervalEnds,
+      nodeSummaries,
+      nodeOverTime: [
+        {cred: [4, 10], seedFlow: null, syntheticLoopFlow: null},
+        {cred: [10, 10], seedFlow: [10, 10], syntheticLoopFlow: null},
+        null,
+      ],
+      edgeSummaries,
+      edgeOverTime: [
+        {forwardFlow: [19, 1], backwardFlow: null},
+        {forwardFlow: [5, 5], backwardFlow: [9, 1]},
+        null,
+      ],
+    };
+    const result = compressByThreshold(input, 10);
+    expect(result).toEqual(expected);
+    // Check that it's idempotent too.
+    expect(compressByThreshold(result, 10)).toEqual(result);
+  });
 });
@@ -20,7 +20,11 @@ import {
   fromJSON as pluginsFromJSON,
 } from "./pluginDeclaration";
 import {timelinePagerank} from "../core/algorithm/timelinePagerank";
-import {type CredData, computeCredData} from "./credData";
+import {
+  type CredData,
+  computeCredData,
+  compressByThreshold as _compressByThreshold,
+} from "./credData";
 import {distributionToCred} from "../core/algorithm/distributionToCred";
 
 /**
@@ -56,6 +60,23 @@ export async function compute(
   return {weightedGraph: wg, credData, params, plugins};
 }
 
+// Lossily compress a CredResult, by throwing away time-specific cred
+// data for any flows that summed to less than `threshold` cred.
+// We may want to implement more sophisticated and context-aware strategies
+// in the future.
+export function compressByThreshold(
+  x: CredResult,
+  threshold: number
+): CredResult {
+  const {params, plugins, weightedGraph, credData} = x;
+  return {
+    params,
+    plugins,
+    weightedGraph,
+    credData: _compressByThreshold(credData, threshold),
+  };
+}
+
 const COMPAT_INFO = {type: "sourcecred/credResult", version: "0.1.0"};
 
 export type CredResultJSON = Compatible<{|
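Because the underlying CredData compression is idempotent (already-null timelines pass through unchanged), the CredResult-level wrapper can be re-applied, or later chained with the more sophisticated strategies the comment above anticipates. A brief illustrative sketch, with `result` standing in for a computed CredResult; none of this appears in the diff:

// Illustrative: compressing an already-compressed CredResult at the same
// threshold changes nothing, so strategies can be layered safely.
const once = compressByThreshold(result, 10);
const twice = compressByThreshold(once, 10);
// `twice` is deep-equal to `once`; the thresholded timelines stay null.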