first working version generating graphs
Signed-off-by: Jakub Sokołowski <jakub@status.im>
This commit is contained in:
parent
7a8063839f
commit
3a4f3ffc4a
|
@ -0,0 +1,25 @@
|
|||
import numpy as np
|
||||
import pandas as pd
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt; plt.rcdefaults()
|
||||
import matplotlib.colors as mcolors
|
||||
from operator import attrgetter
|
||||
|
||||
class PDGraphPeers():
    """Pandas/Seaborn helpers for plotting per-peer activity data.

    Expects `data` to be a list of dicts with at least 'Peer' and 'Date'
    keys, as produced by ESQueryPeers.get_peers().
    """

    def __init__(self, data):
        # One row per (Date, Peer, Count) record.
        self.df = pd.DataFrame(data)

    def unique_peers_counts(self):
        """Return a Series mapping each peer to the number of distinct
        days (unique 'Date' values) on which it was seen."""
        return self.df.groupby(['Peer'])['Date'].nunique()

    def number_of_days(self, exclude=20):
        """Plot a histogram of days-seen per peer.

        Peers seen on `exclude` days or fewer are dropped so the
        low-activity tail doesn't dominate the plot.

        Returns the matplotlib Figure containing the plot.
        """
        nu_peers = self.unique_peers_counts()
        ex_day = nu_peers[nu_peers > exclude]
        ax = sns.distplot(ex_day, kde=False, hist=True)
        ax.set(
            # BUGFIX: the title used to hard-code "20 days" even when a
            # different `exclude` threshold was passed.
            title='Distribution of number of days per peers excluding'
                  ' {} days'.format(exclude),
            xlabel='# of days',
            ylabel='# of peers'
        )
        return ax.get_figure()
|
|
@ -0,0 +1,46 @@
|
|||
#!/usr/bin/env python3
|
||||
from optparse import OptionParser
|
||||
|
||||
from query import ESQueryPeers
|
||||
from graph import PDGraphPeers
|
||||
|
||||
# One-line summary shown at the top of the `--help` output.
HELP_DESCRIPTION='This generates a CSV with buckets of peer_ids for every day.'
# Example invocation appended to the end of the help output (epilog).
HELP_EXAMPLE='Example: ./unique_count.py -i "logstash-2019.11.*" -f peer_id'
|
||||
|
||||
def parse_opts():
    """Parse command-line options for the script.

    Returns:
        (opts, args) tuple from OptionParser.parse_args().

    Exits via parser.error() if the field name is empty.
    """
    parser = OptionParser(description=HELP_DESCRIPTION, epilog=HELP_EXAMPLE)
    parser.add_option('-H', '--host', dest='es_host', default='localhost',
                      help='ElasticSearch host.')
    # type='int' so a port given on the CLI isn't passed on as a string.
    parser.add_option('-P', '--port', dest='es_port', type='int', default=9200,
                      help='ElasticSearch port.')
    parser.add_option('-i', '--index-pattern', default='logstash-*',
                      help='Pattern for matching indices.')
    parser.add_option('-f', '--field', type='str', default='peer_id',
                      help='Name of the field to count.')
    # type='int' for the same reason as --port.
    parser.add_option('-m', '--max-size', type='int', default=10000,
                      help='Max number of counts to find.')
    (opts, args) = parser.parse_args()

    if not opts.field:
        parser.error('No field name specified!')

    return (opts, args)
|
||||
|
||||
def main():
    """Query ElasticSearch for per-index peer counts and plot them."""
    opts, _ = parse_opts()

    esq = ESQueryPeers(opts.es_host, opts.es_port)

    # One record per (index, peer) pair, across every matching index.
    data = []
    for index in esq.get_indices(opts.index_pattern):
        print('Index: {}'.format(index))
        peers = esq.get_peers(index, opts.field, opts.max_size)
        data.extend(peers)

    pdg = PDGraphPeers(data)

    print(pdg.unique_peers_counts())
    figure = pdg.number_of_days()
    figure.savefig("output.png")


if __name__ == '__main__':
    main()
|
Binary file not shown.
After Width: | Height: | Size: 20 KiB |
|
@ -0,0 +1,47 @@
|
|||
import hashlib
|
||||
from elasticsearch import Elasticsearch
|
||||
|
||||
def remove_prefix(text, prefix):
    """Return `text` with `prefix` stripped from the front, if present;
    otherwise return `text` unchanged."""
    if text.startswith(prefix):
        return text[len(prefix):]
    return text
|
||||
|
||||
def hash_string(text):
    """Return the hex SHA-256 digest of `text` (UTF-8 encoded)."""
    digest = hashlib.sha256()
    digest.update(text.encode('utf-8'))
    return digest.hexdigest()
|
||||
|
||||
class ESQueryPeers():
    """Thin ElasticSearch client for extracting per-index peer counts."""

    def __init__(self, host='localhost', port=9200, timeout=1200):
        # Long timeout plus retry: terms aggregations over large
        # logstash indices can be slow.
        self.client = Elasticsearch(
            [{ 'host': host,
               'port': port, }],
            timeout=timeout,
            retry_on_timeout=True
        )
        self.cluster = self.client.info().get('cluster_name')

    def get_indices(self, pattern='logstash-*'):
        """Return the names of all indices matching `pattern`."""
        return self.client.indices.get(index=pattern).keys()

    def get_peers(self, index, field='peer_id', max_query=10000):
        """Aggregate document counts per `field` value in `index`.

        Returns a list of dicts with keys:
            'Date'  - index name with the 'logstash-' prefix removed,
            'Peer'  - SHA-256 hash of the field value (anonymized),
            'Count' - number of documents with that value.
        """
        body = {
            'size': 0, # Don't return actual values
            'aggs': { 'peers': {
                'terms': {
                    'field': field,
                    # BUGFIX: honour the max_query parameter instead of
                    # a hard-coded 10000 bucket limit.
                    'size': max_query,
                },
            }, },
        }
        # Query
        resp = self.client.search(index=index, body=body)
        aggs = resp.get('aggregations')

        # Collect results as list of dicts
        rval = []
        for bucket in aggs['peers']['buckets']:
            rval.append({
                'Date': remove_prefix(index, 'logstash-'),
                'Peer': hash_string(bucket['key']),
                'Count': bucket['doc_count'],
            })

        return rval
|
|
@ -0,0 +1,11 @@
|
|||
cycler==0.10.0
|
||||
kiwisolver==1.2.0
|
||||
matplotlib==3.2.2
|
||||
numpy==1.19.0
|
||||
pandas==1.0.5
|
||||
pyparsing==2.4.7
|
||||
python-dateutil==2.8.1
|
||||
pytz==2020.1
|
||||
scipy==1.5.1
|
||||
seaborn==0.10.1
|
||||
six==1.15.0
|
Loading…
Reference in New Issue