Store GitHub data gzipped at rest (#751)

Summary:
We store the relational view in `view.json.gz` instead of `view.json`,
taking advantage of the isomorphic `pako` library for gzip encoding and
decoding.

Sample space savings (note that post bodies are included; i.e., #747 has
not been applied):

       SAVE     OLD (B)     NEW (B) REPO
      89.7%       25326        2617 sourcecred/example-github
      82.9%     3257576      555948 sourcecred/sourcecred
      85.2%    11287621     1665884 ipfs/js-ipfs
      88.0%    20953425     2520358 gitcoinco/web
      84.4%    38196825     5951459 ipfs/go-ipfs
      84.9%   205770642    31101452 tensorflow/tensorflow

<details>
<summary>Script to generate space savings output</summary>

```shell
# Print a table of gzip space savings for the GitHub view of each repository.
#
# Usage: savings OWNER/NAME...
#
# Reads "${SOURCECRED_DIRECTORY}/data/<repo>/github/view.json.gz" for every
# repository given as an argument and prints one row per repository with the
# percentage saved, the decompressed size in bytes, the compressed size in
# bytes, and the repository name. Repositories whose file is missing are
# reported on stderr and skipped.
savings() {
    # Header row; column widths match the per-repository format string below.
    printf '% 7s % 11s % 11s %s\n' 'SAVE' 'OLD (B)' 'NEW (B)' 'REPO'
    # `for repo` with no `in` list iterates over the function's arguments.
    for repo; do
        file="${SOURCECRED_DIRECTORY}/data/${repo}/github/view.json.gz"
        if ! [ -f "${file}" ]; then
            printf >&2 'warn: no such file %s\n' "${file}"
            continue
        fi
        # Build a small Python program as a string. The heredoc delimiter is
        # unquoted, so the shell expands ${repo} and both $(...) byte counts
        # before Python sees the text: pre_size is the size of the gunzipped
        # stream and post_size is the size of the .gz file itself. The sed
        # filter strips the leading spaces from each line so that the result
        # is not indented when handed to Python.
        script="$(sed -e 's/^ *//' <<EOF
            repo = '${repo}'
            pre_size = $(<"${file}" gzip -dc | wc -c)
            post_size = $(<"${file}" wc -c)
            percentage = '%0.1f%%' % (100 * (1 - post_size / pre_size))
            p = '% 7s % 11d % 11d %s' % (percentage, pre_size, post_size, repo)
            print(p)
EOF
        )"
        # Python does the floating-point percentage and the row formatting.
        python3 -c "${script}"
    done
}
```

</details>

Closes #750.

Test Plan:
Comparing the raw old version with the decompressed new version shows
that they are identical:

```
$ <~/tmp/sourcecred/data/sourcecred/example-github/github/view.json \
> shasum -a 256 -
63853b9d3f918274aafacf5198787e18185a61b9c95faf640a1e61f5d11fa19f  -
$ <~/tmp/sourcecred/data/sourcecred/example-github/github/view.json.gz \
> gzip -dc | shasum -a 256
63853b9d3f918274aafacf5198787e18185a61b9c95faf640a1e61f5d11fa19f  -
```

Additionally, `yarn test --full` passes, and `yarn start` still loads
data and runs PageRank properly.

wchargin-branch: gzip-relational-view
This commit is contained in:
William Chargin 2018-09-01 10:42:30 -07:00 committed by GitHub
parent f1a6b37524
commit 7f81337d74
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 14 additions and 7 deletions

View File

@ -1,6 +1,7 @@
# Changelog
## [Unreleased]
- Store GitHub data compressed at rest, reducing space usage by 6–8× (#750)
- Improve weight sliders display (#736)
- Separate bots from users in the UI (#720)
- Add a feedback link to the prototype (#715)

View File

@ -19,6 +19,7 @@
"lodash.sortby": "^4.7.0",
"mkdirp": "^0.5.1",
"object-assign": "4.1.1",
"pako": "^1.0.6",
"promise": "8.0.1",
"react": "^16.4.1",
"react-dom": "^16.4.1",

View File

@ -219,7 +219,7 @@ test_expect_success TWO_REPOS \
test_expect_success TWO_REPOS \
"TWO_REPOS: should have data for the two repositories" '
for repo in sourcecred/example-git sourcecred/example-github; do
for file in github/view.json; do
for file in github/view.json.gz; do
test -s "${data_dir}/${repo}/${file}" || return
done
done
@ -253,7 +253,7 @@ test_expect_success NO_REPOS \
test_expect_success NO_REPOS \
"NO_REPOS: should not have repository data" '
for repo in sourcecred/example-git sourcecred/example-github; do
for file in git/graph.json github/view.json; do
for file in git/graph.json github/view.json.gz; do
test_must_fail test -f "${data_dir}/${repo}/${file}" || return
done
done

View File

@ -2,6 +2,7 @@
import fs from "fs-extra";
import path from "path";
import pako from "pako";
import fetchGithubRepo from "./fetchGithubRepo";
import {RelationalView} from "./relationalView";
@ -29,7 +30,7 @@ export async function loadGithubData(options: Options): Promise<void> {
for (const response of responses) {
view.addData(response);
}
const blob = JSON.stringify(view);
const outputFilename = path.join(options.outputDirectory, "view.json");
const blob: Uint8Array = pako.gzip(JSON.stringify(view));
const outputFilename = path.join(options.outputDirectory, "view.json.gz");
return fs.writeFile(outputFilename, blob);
}

View File

@ -1,4 +1,6 @@
// @flow
import pako from "pako";
import type {
StaticPluginAdapter as IStaticPluginAdapter,
DynamicPluginAdapter as IDynamicPluginAdapater,
@ -94,13 +96,15 @@ export class StaticPluginAdapter implements IStaticPluginAdapter {
}
async load(assets: Assets, repo: Repo): Promise<IDynamicPluginAdapater> {
const url = assets.resolve(
`/api/v1/data/data/${repo.owner}/${repo.name}/github/view.json`
`/api/v1/data/data/${repo.owner}/${repo.name}/github/view.json.gz`
);
const response = await fetch(url);
if (!response.ok) {
return Promise.reject(response);
}
const json = await response.json();
const arrayBuffer = await response.arrayBuffer();
const blob = new Uint8Array(arrayBuffer);
const json = JSON.parse(pako.ungzip(blob, {to: "string"}));
const view = RelationalView.fromJSON(json);
const graph = createGraph(view);
return new DynamicPluginAdapter(view, graph);

View File

@ -5658,7 +5658,7 @@ p-try@^1.0.0:
version "1.0.0"
resolved "https://registry.yarnpkg.com/p-try/-/p-try-1.0.0.tgz#cbc79cdbaf8fd4228e13f621f2b1a237c1b207b3"
pako@~1.0.5:
pako@^1.0.6, pako@~1.0.5:
version "1.0.6"
resolved "https://registry.yarnpkg.com/pako/-/pako-1.0.6.tgz#0101211baa70c4bca4a0f63f2206e97b7dfaf258"