diff --git a/elasticsearch/unique_count.py b/elasticsearch/unique_count.py new file mode 100755 index 0000000..1296b62 --- /dev/null +++ b/elasticsearch/unique_count.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python3 +import os +import csv +from optparse import OptionParser +from elasticsearch import Elasticsearch + +HELP_DESCRIPTION='This generates a CSV with buckets of peer_ids for every day.' +HELP_EXAMPLE='Example: ./unique_count.py -i "logstash-2019.11.*" -f peer_id' + +def parse_opts(): + parser = OptionParser(description=HELP_DESCRIPTION, epilog=HELP_EXAMPLE) + parser.add_option('-H', '--host', dest='es_host', default='localhost', + help='ElasticSearch host.') + parser.add_option('-P', '--port', dest='es_port', default=9200, + help='ElasticSearch port.') + parser.add_option('-i', '--index-pattern', default='logstash-*', + help='Patter for matching indices.') + parser.add_option('-f', '--field', type='str', default='peer_id', + help='Name of the field to count.') + parser.add_option('-o', '--out-file', type='str', default='out.csv', + help='Filename of CSV to write to.') + parser.add_option('-m', '--max-size', default=10000, + help='Max number of counts to find.') + (opts, args) = parser.parse_args() + + if not opts.field: + parser.error('No field name specified!') + + return (opts, args) + +def remove_prefix(text, prefix): + return text[text.startswith(prefix) and len(prefix):] + +def main(): + (opts, args) = parse_opts() + + es = Elasticsearch( + [{ 'host': opts.es_host, + 'port': opts.es_port }], + timeout=1200, + retry_on_timeout=True + ) + + print('Cluster: {}'.format(es.info().get('cluster_name'))) + + indices = es.indices.get(index=opts.index_pattern).keys() + + body = { + "size": 0, + "aggs": { "peers": { + "terms": { + "field": opts.field, + "size": 10000, + #"min_doc_count": 100, + }, + }, }, + } + + csv_field_names = ["date", "peer", "count"] + + with open(opts.out_file, 'w') as f: + writer = csv.DictWriter(f, fieldnames=csv_field_names) + writer.writeheader() + + for index in indices: + resp = es.search(index=index, body=body) + aggs = resp.get('aggregations') + print('{:22} count: {:6}'.format(index, len(aggs["peers"]["buckets"]))) + + for bucket in aggs["peers"]["buckets"]: + writer.writerow({ + "date": remove_prefix(index, 'logstash-'), + "peer": bucket["key"], + "count": bucket["doc_count"], + }) + +if __name__ == '__main__': + main()