From 7f81337d74ec0a34686e02d89e6754d4f80eb2ae Mon Sep 17 00:00:00 2001 From: William Chargin Date: Sat, 1 Sep 2018 10:42:30 -0700 Subject: [PATCH] Store GitHub data gzipped at rest (#751) Summary: We store the relational view in `view.json.gz` instead of `view.json`, taking advantage of the isomorphic `pako` library for gzip encoding and decoding. Sample space savings (note that post bodies are included; i.e., #747 has not been applied): SAVE OLD (B) NEW (B) REPO 89.7% 25326 2617 sourcecred/example-github 82.9% 3257576 555948 sourcecred/sourcecred 85.2% 11287621 1665884 ipfs/js-ipfs 88.0% 20953425 2520358 gitcoinco/web 84.4% 38196825 5951459 ipfs/go-ipfs 84.9% 205770642 31101452 tensorflow/tensorflow
Script to generate space savings output ```shell savings() { printf '% 7s % 11s % 11s %s\n' 'SAVE' 'OLD (B)' 'NEW (B)' 'REPO' for repo; do file="${SOURCECRED_DIRECTORY}/data/${repo}/github/view.json.gz" if ! [ -f "${file}" ]; then printf >&2 'warn: no such file %s\n' "${file}" continue fi script="$(sed -e 's/^ *//' < [remainder of script lost in extraction] ``` Closes #750. Test Plan: Comparing the raw old version with the decompressed new version shows that they are identical: ``` $ <~/tmp/sourcecred/data/sourcecred/example-github/github/view.json \ > shasum -a 256 - 63853b9d3f918274aafacf5198787e18185a61b9c95faf640a1e61f5d11fa19f - $ <~/tmp/sourcecred/data/sourcecred/example-github/github/view.json.gz \ > gzip -dc | shasum -a 256 63853b9d3f918274aafacf5198787e18185a61b9c95faf640a1e61f5d11fa19f - ``` Additionally, `yarn test --full` passes, and `yarn start` still loads data and runs PageRank properly. wchargin-branch: gzip-relational-view --- CHANGELOG.md | 1 + package.json | 1 + sharness/test_build_static_site.t | 4 ++-- src/plugins/github/loadGithubData.js | 5 +++-- src/plugins/github/pluginAdapter.js | 8 ++++++-- yarn.lock | 2 +- 6 files changed, 14 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6c80696..620c672 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ # Changelog ## [Unreleased] +- Store GitHub data compressed at rest, reducing space usage by 6–8× (#750) - Improve weight sliders display (#736) - Separate bots from users in the UI (#720) - Add a feedback link to the prototype (#715) diff --git a/package.json b/package.json index e6763d6..e8ca6bb 100644 --- a/package.json +++ b/package.json @@ -19,6 +19,7 @@ "lodash.sortby": "^4.7.0", "mkdirp": "^0.5.1", "object-assign": "4.1.1", + "pako": "^1.0.6", "promise": "8.0.1", "react": "^16.4.1", "react-dom": "^16.4.1", diff --git a/sharness/test_build_static_site.t b/sharness/test_build_static_site.t index c54c90e..1c5c19a 100755 --- a/sharness/test_build_static_site.t +++ b/sharness/test_build_static_site.t @@ 
-219,7 +219,7 @@ test_expect_success TWO_REPOS \ test_expect_success TWO_REPOS \ "TWO_REPOS: should have data for the two repositories" ' for repo in sourcecred/example-git sourcecred/example-github; do - for file in github/view.json; do + for file in github/view.json.gz; do test -s "${data_dir}/${repo}/${file}" || return done done @@ -253,7 +253,7 @@ test_expect_success NO_REPOS \ test_expect_success NO_REPOS \ "NO_REPOS: should not have repository data" ' for repo in sourcecred/example-git sourcecred/example-github; do - for file in git/graph.json github/view.json; do + for file in git/graph.json github/view.json.gz; do test_must_fail test -f "${data_dir}/${repo}/${file}" || return done done diff --git a/src/plugins/github/loadGithubData.js b/src/plugins/github/loadGithubData.js index 752d35e..f9626a5 100644 --- a/src/plugins/github/loadGithubData.js +++ b/src/plugins/github/loadGithubData.js @@ -2,6 +2,7 @@ import fs from "fs-extra"; import path from "path"; +import pako from "pako"; import fetchGithubRepo from "./fetchGithubRepo"; import {RelationalView} from "./relationalView"; @@ -29,7 +30,7 @@ export async function loadGithubData(options: Options): Promise { for (const response of responses) { view.addData(response); } - const blob = JSON.stringify(view); - const outputFilename = path.join(options.outputDirectory, "view.json"); + const blob: Uint8Array = pako.gzip(JSON.stringify(view)); + const outputFilename = path.join(options.outputDirectory, "view.json.gz"); return fs.writeFile(outputFilename, blob); } diff --git a/src/plugins/github/pluginAdapter.js b/src/plugins/github/pluginAdapter.js index 1a4bd08..db05beb 100644 --- a/src/plugins/github/pluginAdapter.js +++ b/src/plugins/github/pluginAdapter.js @@ -1,4 +1,6 @@ // @flow +import pako from "pako"; + import type { StaticPluginAdapter as IStaticPluginAdapter, DynamicPluginAdapter as IDynamicPluginAdapater, @@ -94,13 +96,15 @@ export class StaticPluginAdapter implements IStaticPluginAdapter { } async 
load(assets: Assets, repo: Repo): Promise { const url = assets.resolve( - `/api/v1/data/data/${repo.owner}/${repo.name}/github/view.json` + `/api/v1/data/data/${repo.owner}/${repo.name}/github/view.json.gz` ); const response = await fetch(url); if (!response.ok) { return Promise.reject(response); } - const json = await response.json(); + const arrayBuffer = await response.arrayBuffer(); + const blob = new Uint8Array(arrayBuffer); + const json = JSON.parse(pako.ungzip(blob, {to: "string"})); const view = RelationalView.fromJSON(json); const graph = createGraph(view); return new DynamicPluginAdapter(view, graph); diff --git a/yarn.lock b/yarn.lock index 36bc56a..18868fc 100644 --- a/yarn.lock +++ b/yarn.lock @@ -5658,7 +5658,7 @@ p-try@^1.0.0: version "1.0.0" resolved "https://registry.yarnpkg.com/p-try/-/p-try-1.0.0.tgz#cbc79cdbaf8fd4228e13f621f2b1a237c1b207b3" -pako@~1.0.5: +pako@^1.0.6, pako@~1.0.5: version "1.0.6" resolved "https://registry.yarnpkg.com/pako/-/pako-1.0.6.tgz#0101211baa70c4bca4a0f63f2206e97b7dfaf258"