add get_present_days() and use it to skip indices

Signed-off-by: Jakub Sokołowski <jakub@status.im>
Jakub Sokołowski 2020-07-14 12:04:10 +02:00
parent 7d025dc40d
commit 0642e52d26
4 changed files with 62 additions and 11 deletions


@@ -8,9 +8,27 @@ The script queries an ElasticSearch endpoint for `logstash-*` indices and aggreg
This data is pushed to a PostgreSQL database in the following format:
```
peers=> \d peers;
Table "public.peers"
┌────────┬───────────────────────┬───────────┬──────────┬─────────┐
│ Column │ Type │ Collation │ Nullable │ Default │
├────────┼───────────────────────┼───────────┼──────────┼─────────┤
│ date │ date │ │ │ │
│ peer │ character varying(64) │ │ │ │
│ count │ integer │ │ │ │
└────────┴───────────────────────┴───────────┴──────────┴─────────┘
```
# Example
```
peers=> select * from peers limit 3;
┌────────────┬──────────────────────────────────────────────────────────────────┬───────┐
│    date    │                               peer                               │ count │
├────────────┼──────────────────────────────────────────────────────────────────┼───────┤
│ 2020-06-01 │ a18d4417b1d2fbddd7f9474250f703ba20472be5e1131bc09e35e9b18c1a5bf7 │  1300 │
│ 2020-06-01 │ 7dba96249159cef53fbb5ec010c2d7799fec7dcaf8b1d9754559ce9fbd463328 │   652 │
│ 2020-06-01 │ 3a13adfa4799f9505c83fab18d49a47f6de09344db3d96e18678c5d3c92f717e │   632 │
└────────────┴──────────────────────────────────────────────────────────────────┴───────┘
(3 rows)
```
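
The `peers` table itself is created by the script from its `_SCHEMA` constant, which `PGDatabase` executes on startup (see below); the constant's body is not part of this diff, but a definition along these lines would reproduce the `\d` output above (a sketch, not the actual schema):
```
_SCHEMA = """
CREATE TABLE IF NOT EXISTS peers (
    date  DATE,                  -- day covered by the logstash-* index
    peer  CHARACTER VARYING(64), -- sha256 hex digest of the peer identifier
    count INTEGER                -- documents counted for that peer on that day
);
"""
```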

main.py

@@ -54,14 +54,22 @@ def main():
        opts.db_port
    )
    data = []
    days = psg.get_present_days()
    present_indices = ['logstash-{}'.format(d.replace('-', '.')) for d in days]
    peers = []
    for index in esq.get_indices(opts.index_pattern):
        if index in present_indices:
            continue
        print('Index: {}'.format(index))
        data.extend(esq.get_peers(index, opts.field, opts.max_size))
        peers.extend(esq.get_peers(index, opts.field, opts.max_size))
    rval = psg.get_most_recent_day()
    if len(peers) == 0:
        print('Nothing to insert into database.')
        exit(0)
    rval = psg.inject_peers(peers)
    print(rval)


if __name__ == '__main__':
    main()
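
To make the skip visible in isolation, here is a small self-contained sketch of the new logic: dates already present in PostgreSQL are mapped back to `logstash-*` index names and filtered out before querying (the sample dates and index list are made up):
```
# Hypothetical dates already stored in the database, as returned by get_present_days().
days = ['2020-06-01', '2020-06-02']

# '2020-06-01' -> 'logstash-2020.06.01', matching the ElasticSearch index naming.
present_indices = ['logstash-{}'.format(d.replace('-', '.')) for d in days]

# Hypothetical list of all indices matching the pattern.
all_indices = ['logstash-2020.06.01', 'logstash-2020.06.02', 'logstash-2020.06.03']

# Only indices not yet in the database get queried.
to_query = [i for i in all_indices if i not in present_indices]
print(to_query)  # ['logstash-2020.06.03']
```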


@@ -1,4 +1,5 @@
import psycopg2
from datetime import datetime

class PGDatabase:
    _SCHEMA = """
@@ -20,6 +21,21 @@ class PGDatabase:
        self.c.execute(self._SCHEMA)
        self.db.commit()

    def get_most_recent_day(self):
        rval = self.c.execute('SELECT date FROM peers ORDER BY date LIMIT 1;')

    def get_last_day(self):
        self.c.execute('SELECT date FROM peers ORDER BY date DESC LIMIT 1;')
        return self.c.fetchone()

    def get_present_days(self):
        self.c.execute('SELECT DISTINCT date FROM peers;')
        return [d[0].strftime('%Y-%m-%d') for d in self.c.fetchall()]

    def inject_peers(self, peers):
        args = ','.join(
            self.c.mogrify('(%s,%s,%s)', peer.to_tuple()).decode('utf-8')
            for peer in peers
        )
        rval = self.c.execute(
            'INSERT INTO peers(date, peer, count) VALUES {}'.format(args)
        )
        self.db.commit()
        return rval
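
`inject_peers()` builds a single multi-row `INSERT` by letting `cursor.mogrify()` render each `(date, peer, count)` tuple as a properly escaped values literal and joining the results. Note that psycopg2's `execute()` returns `None`, so the `rval` printed by `main()` only signals completion. The sketch below mimics the string assembly with plain formatting to show the shape of the final statement (peer hashes shortened, values made up):
```
# Plain-string stand-in for what mogrify() produces per Peer.to_tuple().
rows = [
    ('2020-06-01', 'a18d4417', 1300),
    ('2020-06-01', '7dba9624', 652),
]
args = ','.join("('{}','{}',{})".format(*row) for row in rows)
query = 'INSERT INTO peers(date, peer, count) VALUES {}'.format(args)
print(query)
# INSERT INTO peers(date, peer, count) VALUES ('2020-06-01','a18d4417',1300),('2020-06-01','7dba9624',652)
```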


@@ -9,6 +9,15 @@ def remove_prefix(text, prefix):
def hash_string(text):
    return hashlib.sha256(text.encode('utf-8')).hexdigest()

class Peer:
    def __init__(self, date, peer, count):
        self.date = date
        self.peer = peer
        self.count = count

    def to_tuple(self):
        return (self.date, self.peer, self.count)

class ESQueryPeers():
    def __init__(self, host='localhost', port=9200, timeout=1200):
@@ -40,10 +49,10 @@ class ESQueryPeers():
        # Collect results as list of dicts
        rval = []
        for bucket in aggs['peers']['buckets']:
            rval.append({
                'Date': remove_prefix(index, 'logstash-'),
                'Peer': hash_string(bucket['key']),
                'Count': bucket['doc_count'],
            })
            rval.append(Peer(
                date=remove_prefix(index, 'logstash-'),
                peer=hash_string(bucket['key']),
                count=bucket['doc_count']
            ))
        return rval
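
Each bucket from the `peers` aggregation is now wrapped in a `Peer` object instead of a dict, so `PGDatabase.inject_peers()` can expand it via `to_tuple()`. A minimal sketch of the conversion, using the module's `Peer`, `remove_prefix` and `hash_string` with a made-up bucket:
```
# Hypothetical bucket as returned by the aggregation on the configured peer field.
bucket = {'key': 'enode://example-peer@10.0.0.1:30303', 'doc_count': 42}
index = 'logstash-2020.06.01'

peer = Peer(
    date=remove_prefix(index, 'logstash-'),  # '2020.06.01'
    peer=hash_string(bucket['key']),         # sha256 hex digest, 64 chars
    count=bucket['doc_count'],
)
print(peer.to_tuple())
```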