first working version generating graphs

Signed-off-by: Jakub Sokołowski <jakub@status.im>
This commit is contained in:
Jakub Sokołowski 2020-07-08 14:15:47 +02:00
parent 7a8063839f
commit 3a4f3ffc4a
No known key found for this signature in database
GPG Key ID: 4EF064D0E6D63020
5 changed files with 129 additions and 0 deletions

25
graph.py Normal file
View File

@@ -0,0 +1,25 @@
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt; plt.rcdefaults()
import matplotlib.colors as mcolors
from operator import attrgetter
class PDGraphPeers():
    """Wraps peer/date records in a pandas DataFrame and renders graphs."""

    def __init__(self, data):
        # data: iterable of dicts with at least 'Peer' and 'Date' keys
        # (as produced by ESQueryPeers.get_peers).
        self.df = pd.DataFrame(data)

    def unique_peers_counts(self):
        """Return a Series mapping each peer to its number of distinct dates."""
        return self.df.groupby(['Peer'])['Date'].nunique()

    def number_of_days(self, exclude=20):
        """Plot a histogram of days-seen-per-peer.

        Peers seen on `exclude` days or fewer are dropped from the plot.
        Returns the matplotlib Figure so the caller can save it.
        """
        nu_peers = self.unique_peers_counts()
        filtered = nu_peers[nu_peers > exclude]
        ax = sns.distplot(filtered, kde=False, hist=True)
        ax.set(
            # Reflect the actual threshold instead of hard-coding "20 days",
            # so the title stays correct when exclude is overridden.
            title='Distribution of number of days per peers excluding {} days'.format(exclude),
            xlabel='# of days',
            ylabel='# of peers'
        )
        return ax.get_figure()

46
main.py Executable file
View File

@@ -0,0 +1,46 @@
#!/usr/bin/env python3
from optparse import OptionParser
from query import ESQueryPeers
from graph import PDGraphPeers
HELP_DESCRIPTION='This generates a CSV with buckets of peer_ids for every day.'
HELP_EXAMPLE='Example: ./unique_count.py -i "logstash-2019.11.*" -f peer_id'

def parse_opts():
    """Parse command-line options.

    Returns the (opts, args) pair from OptionParser; exits with a usage
    error when --field is empty.
    """
    parser = OptionParser(description=HELP_DESCRIPTION, epilog=HELP_EXAMPLE)
    parser.add_option('-H', '--host', dest='es_host', default='localhost',
                      help='ElasticSearch host.')
    # type='int' so a user-supplied port is parsed, not passed as a string.
    parser.add_option('-P', '--port', dest='es_port', type='int', default=9200,
                      help='ElasticSearch port.')
    parser.add_option('-i', '--index-pattern', default='logstash-*',
                      help='Pattern for matching indices.')
    parser.add_option('-f', '--field', type='str', default='peer_id',
                      help='Name of the field to count.')
    # type='int' so the value stays numeric, as get_peers' size expects.
    parser.add_option('-m', '--max-size', type='int', default=10000,
                      help='Max number of counts to find.')
    (opts, args) = parser.parse_args()
    if not opts.field:
        parser.error('No field name specified!')
    return (opts, args)
def main():
    """Entry point: aggregate peers from every matching index and plot them."""
    (opts, _args) = parse_opts()
    esq = ESQueryPeers(opts.es_host, opts.es_port)

    # Collect peer buckets from each index matching the pattern.
    data = []
    for index in esq.get_indices(opts.index_pattern):
        print('Index: {}'.format(index))
        data.extend(esq.get_peers(index, opts.field, opts.max_size))

    pdg = PDGraphPeers(data)
    print(pdg.unique_peers_counts())

    fig = pdg.number_of_days()
    fig.savefig("output.png")

if __name__ == '__main__':
    main()

BIN
output.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 20 KiB

47
query.py Normal file
View File

@@ -0,0 +1,47 @@
import hashlib
from elasticsearch import Elasticsearch
def remove_prefix(text, prefix):
    """Return `text` with a leading `prefix` removed, if present."""
    if text.startswith(prefix):
        return text[len(prefix):]
    return text
def hash_string(text):
    """Return the hex SHA-256 digest of `text` (UTF-8 encoded)."""
    digest = hashlib.sha256()
    digest.update(text.encode('utf-8'))
    return digest.hexdigest()
class ESQueryPeers():
    """Thin Elasticsearch wrapper: list indices and aggregate peer counts."""

    def __init__(self, host='localhost', port=9200, timeout=1200):
        self.client = Elasticsearch(
            [{ 'host': host,
               'port': port, }],
            timeout=timeout,
            retry_on_timeout=True
        )
        # Cached for reference/debugging; also verifies connectivity up front.
        self.cluster = self.client.info().get('cluster_name')

    def get_indices(self, pattern='logstash-*'):
        """Return the names of indices matching `pattern`."""
        return self.client.indices.get(index=pattern).keys()

    def get_peers(self, index, field='peer_id', max_query=10000):
        """Aggregate up to `max_query` distinct values of `field` in `index`.

        Returns a list of dicts with keys 'Date' (index name minus the
        'logstash-' prefix), 'Peer' (SHA-256 of the field value), and
        'Count' (document count for that value).
        """
        body = {
            'size': 0, # Don't return actual values
            'aggs': { 'peers': {
                'terms': {
                    'field': field,
                    # BUG FIX: was hard-coded to 10000, silently ignoring
                    # the max_query parameter (and the -m CLI option).
                    'size': max_query,
                },
            }, },
        }
        # Query
        resp = self.client.search(index=index, body=body)
        aggs = resp.get('aggregations')
        # Collect results as list of dicts
        rval = []
        for bucket in aggs['peers']['buckets']:
            rval.append({
                'Date': remove_prefix(index, 'logstash-'),
                'Peer': hash_string(bucket['key']),
                'Count': bucket['doc_count'],
            })
        return rval

11
requirements.txt Normal file
View File

@@ -0,0 +1,11 @@
cycler==0.10.0
elasticsearch==7.8.0
kiwisolver==1.2.0
matplotlib==3.2.2
numpy==1.19.0
pandas==1.0.5
pyparsing==2.4.7
python-dateutil==2.8.1
pytz==2020.1
scipy==1.5.1
seaborn==0.10.1
six==1.15.0