infra-utils/elasticsearch/unique_count.py

#!/usr/bin/env python3
import csv
import hashlib
from optparse import OptionParser

from elasticsearch import Elasticsearch

HELP_DESCRIPTION = 'This generates a CSV with buckets of peer_ids for every day.'
HELP_EXAMPLE = 'Example: ./unique_count.py -i "logstash-2019.11.*" -f peer_id'


def parse_opts():
    parser = OptionParser(description=HELP_DESCRIPTION, epilog=HELP_EXAMPLE)
    parser.add_option('-H', '--host', dest='es_host', default='localhost',
                      help='ElasticSearch host.')
    parser.add_option('-P', '--port', dest='es_port', type='int', default=9200,
                      help='ElasticSearch port.')
    parser.add_option('-i', '--index-pattern', default='logstash-*',
                      help='Pattern for matching indices.')
    parser.add_option('-f', '--field', type='str', default='peer_id',
                      help='Name of the field to count.')
    parser.add_option('-o', '--out-file', type='str', default='out.csv',
                      help='Filename of CSV to write to.')
    parser.add_option('-m', '--max-size', type='int', default=10000,
                      help='Max number of counts to find.')
    (opts, args) = parser.parse_args()
    if not opts.field:
        parser.error('No field name specified!')
    return (opts, args)


def remove_prefix(text, prefix):
    # Strip prefix from text if present; otherwise return text unchanged.
    return text[text.startswith(prefix) and len(prefix):]


def hash_string(text):
    # Anonymize a value as its SHA-256 hex digest (64 hex characters).
    return hashlib.sha256(text.encode('utf-8')).hexdigest()
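
# Illustrative behavior of the two helpers above (the index name is an
# assumed example matching the default 'logstash-*' pattern):
#   remove_prefix('logstash-2019.11.04', 'logstash-')  -> '2019.11.04'
#   hash_string('some-peer-id')  -> a 64-char hex digest, stable across runs,
#   so the same peer_id always maps to the same anonymized 'peer' value.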


def main():
    (opts, args) = parse_opts()
    es = Elasticsearch(
        [{'host': opts.es_host,
          'port': opts.es_port}],
        timeout=1200,
        retry_on_timeout=True
    )
    print('Cluster: {}'.format(es.info().get('cluster_name')))
    # Resolve the pattern to the concrete (daily) indices it matches.
    indices = es.indices.get(index=opts.index_pattern).keys()
    body = {
        'size': 0,
        'aggs': {
            'peers': {
                'terms': {
                    'field': opts.field,
                    'size': opts.max_size,
                },
            },
        },
    }
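    # For reference, each es.search() response is expected to look roughly
    # like this sketch (key and doc_count values are illustrative only):
    #   {'aggregations': {'peers': {'buckets': [
    #       {'key': '<peer_id value>', 'doc_count': 123}, ...]}}}
    # One CSV row is written per bucket, per index.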
    csv_field_names = ['date', 'peer', 'count']
    # newline='' prevents the csv module from writing blank lines on Windows.
    with open(opts.out_file, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=csv_field_names)
        writer.writeheader()
        for index in indices:
            resp = es.search(index=index, body=body)
            aggs = resp.get('aggregations')
            print('{:22} count: {:6}'.format(index, len(aggs['peers']['buckets'])))
            for bucket in aggs['peers']['buckets']:
                writer.writerow({
                    'date': remove_prefix(index, 'logstash-'),
                    'peer': hash_string(bucket['key']),
                    'count': bucket['doc_count'],
                })


if __name__ == '__main__':
    main()
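
# Example run (invocation taken from HELP_EXAMPLE; the CSV rows are only a
# sketch of the output format, with made-up values):
#
#   ./unique_count.py -i "logstash-2019.11.*" -f peer_id -o out.csv
#
#   date,peer,count
#   2019.11.01,<64-char sha256 hex>,42
#   2019.11.01,<64-char sha256 hex>,17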