add get_present_days() and use it to skip indices

Signed-off-by: Jakub Sokołowski <jakub@status.im>
This commit is contained in:
Jakub Sokołowski 2020-07-14 12:04:10 +02:00
parent 7d025dc40d
commit 0642e52d26
No known key found for this signature in database
GPG Key ID: 4EF064D0E6D63020
4 changed files with 62 additions and 11 deletions

View File

@ -8,9 +8,27 @@ The script queries an ElasticSearch endpoint for `logstash-*` indices and aggreg
This data is pushed to a PostgreSQL database in the following format:
``` ```
peers=> \d peers;
Table "public.peers"
┌────────┬───────────────────────┬───────────┬──────────┬─────────┐
│ Column │ Type │ Collation │ Nullable │ Default │
├────────┼───────────────────────┼───────────┼──────────┼─────────┤
│ date │ date │ │ │ │
│ peer │ character varying(64) │ │ │ │
│ count │ integer │ │ │ │
└────────┴───────────────────────┴───────────┴──────────┴─────────┘
``` ```
# Example
``` ```
peers=> select * from peers limit 3;
┌────────────┬──────────────────────────────────────────────────────────────────┬───────┐
│ date │ peer │ count │
├────────────┼──────────────────────────────────────────────────────────────────┼───────┤
│ 2020-06-01 │ a18d4417b1d2fbddd7f9474250f703ba20472be5e1131bc09e35e9b18c1a5bf7 │ 1300 │
│ 2020-06-01 │ 7dba96249159cef53fbb5ec010c2d7799fec7dcaf8b1d9754559ce9fbd463328 │ 652 │
│ 2020-06-01 │ 3a13adfa4799f9505c83fab18d49a47f6de09344db3d96e18678c5d3c92f717e │ 632 │
└────────────┴──────────────────────────────────────────────────────────────────┴───────┘
(3 rows)
``` ```

16
main.py
View File

@ -54,14 +54,22 @@ def main():
opts.db_port opts.db_port
) )
data = [] days = psg.get_present_days()
present_indices = ['logstash-{}'.format(d.replace('-', '.')) for d in days]
peers = []
for index in esq.get_indices(opts.index_pattern): for index in esq.get_indices(opts.index_pattern):
if index in present_indices:
continue
print('Index: {}'.format(index)) print('Index: {}'.format(index))
data.extend(esq.get_peers(index, opts.field, opts.max_size)) peers.extend(esq.get_peers(index, opts.field, opts.max_size))
rval = psg.get_most_recent_day() if len(peers) == 0:
print('Nothing to insert into database.')
exit(0)
rval = psg.inject_peers(peers)
print(rval) print(rval)
if __name__ == '__main__': if __name__ == '__main__':
main() main()

View File

@ -1,4 +1,5 @@
import psycopg2 import psycopg2
from datetime import datetime
class PGDatabase: class PGDatabase:
_SCHEMA = """ _SCHEMA = """
@ -20,6 +21,21 @@ class PGDatabase:
self.c.execute(self._SCHEMA) self.c.execute(self._SCHEMA)
self.db.commit() self.db.commit()
def get_last_day(self):
    """Return the most recent date stored in the peers table.

    Returns the raw single-column row from the cursor (a one-element
    tuple holding a datetime.date), or None when the table is empty.
    """
    # DESC + LIMIT 1 picks the newest day; execute() itself returns None
    # in psycopg2, so the row must be read via fetchone().
    self.c.execute('SELECT date FROM peers ORDER BY date DESC LIMIT 1;')
    return self.c.fetchone()
def get_present_days(self):
    """List the distinct dates already present in the peers table.

    Each date is rendered as a 'YYYY-MM-DD' string so callers can map it
    back to a logstash index name suffix.
    """
    self.c.execute('SELECT DISTINCT date FROM peers;')
    rows = self.c.fetchall()
    # Each row is a single-column tuple holding a datetime.date.
    return [day.strftime('%Y-%m-%d') for (day,) in rows]
def inject_peers(self, peers):
args = ','.join(
self.c.mogrify('(%s,%s,%s)', peer.to_tuple()).decode('utf-8')
for peer in peers
)
rval = self.c.execute(
'INSERT INTO peers(date, peer, count) VALUES {}'.format(args)
)
self.db.commit()
return rval return rval

View File

@ -9,6 +9,15 @@ def remove_prefix(text, prefix):
def hash_string(text): def hash_string(text):
return hashlib.sha256(text.encode('utf-8')).hexdigest() return hashlib.sha256(text.encode('utf-8')).hexdigest()
class Peer:
    """One aggregated peer-count record for a single day.

    Plain value holder: a date string, the hashed peer identifier, and
    how many log documents mentioned that peer on that date.
    """

    def __init__(self, date, peer, count):
        # Assign the three fields in one go; no validation is performed.
        self.date, self.peer, self.count = date, peer, count

    def to_tuple(self):
        """Return the record as a (date, peer, count) tuple for DB insertion."""
        return self.date, self.peer, self.count
class ESQueryPeers(): class ESQueryPeers():
def __init__(self, host='localhost', port=9200, timeout=1200): def __init__(self, host='localhost', port=9200, timeout=1200):
@ -40,10 +49,10 @@ class ESQueryPeers():
# Collect results as list of dicts # Collect results as list of dicts
rval = [] rval = []
for bucket in aggs['peers']['buckets']: for bucket in aggs['peers']['buckets']:
rval.append({ rval.append(Peer(
'Date': remove_prefix(index, 'logstash-'), date = remove_prefix(index, 'logstash-'),
'Peer': hash_string(bucket['key']), peer = hash_string(bucket['key']),
'Count': bucket['doc_count'], count = bucket['doc_count']
}) ))
return rval return rval