research/whisper_scalability/whisper.py

# Util and format functions
#-----------------------------------------------------------

class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

# https://web.archive.org/web/20111010015624/http://blogmag.net/blog/read/38/Print_human_readable_file_size
# TODO: Get rid of bytes and KB, always print as as MB and above, then %3.1f
def sizeof_fmt(num):
    for x in ['bytes','KB','MB','GB','TB']:
        if num < 1024.0:
            return "%6.1f%s" % (num, x)
        num /= 1024.0

def magnitude_fmt(num):
    for x in ['','k','m']:
        if num < 1000:
            return "%2d%s" % (num, x)
        num /= 1000

# Color format based on daily bandwidth usage
# <10mb/d = good, <30mb/d ok, <100mb/d bad, 100mb/d+ fail.
def load_color_prefix(load):
    if load < (1024 * 1000 * 10):
        color_level = bcolors.OKBLUE
    elif load < (1024 * 1000 * 30):
        color_level = bcolors.OKGREEN
    elif load < (1024 * 1000 * 100):
        color_level = bcolors.WARNING
    else:
        color_level = bcolors.FAIL
    return color_level

def load_color_fmt(load, string):
    return load_color_prefix(load) + string + bcolors.ENDC

def print_header(string):
    print bcolors.HEADER + string + bcolors.ENDC + "\n"

def print_assumptions(xs):
    print "Assumptions:"
    for x in xs:
        print x
    print ""

def usage_str(load_users_fn, n_users):
    load = load_users_fn(n_users)
    return load_color_fmt(load, "For " + magnitude_fmt(n_users) + " users, receiving bandwidth is " + sizeof_fmt(load_users_fn(n_users)) + "/day")

def print_usage(load_users):
    print usage_str(load_users, 100)
    print usage_str(load_users, 100 * 100)
    print usage_str(load_users, 100 * 100 * 100)
    print ""

# Assumptions
#-----------------------------------------------------------

# We assume a node is not relaying messages, but only sending
#
# Goal:
# - make it user-bound, not network-bound
# - reasonable bw and fetch time
# ~1GB per month, ~ 30 mb per day, ~1 mb per hour

envelope_size = 1024 # 1kb

# Due to negotiation, data sync, etc
# Rough assumed overhead, constant factor
envelopes_per_message = 10

received_messages_per_day = 100

# Assume half of all messages are in 1:1 and group chat
# XXX: Implicitly assume message/envelope ratio same for 1:1 and public,
# probably not true due to things like key negotiation and data sync
private_message_proportion = 0.5

# Number of partitions for partition topic
n_partitions = 5000

# On Bloom filter, false positive rate:
#
# Bloom logic
# f: in_set?(s, x) => (maybe, no)
# if false_positive high => lots of maybe => direct hits
# test happens at routing node and depends on what filter preference peer has,
# OR what request mailserver receives

bloom_size = 512     # size of filter, m
bloom_hash_fns = 3   # number of hash functions, k

# This correspond to topics in bloom filter
# Might be a tad too high, assuming roughly maps to conversations
# I.e. public chat + contact code + partition topic (1 topic per convo)
bloom_elements = 100 # elements in set, n

# Assuming optimal number of hash functions, i.e. k=(m/n)ln 2
# (512/100)*math.log(2) ~ 3.46
# Note that this is very sensitive, so if 200 element you want 1 hash fn, and
# if 50 topics you want 7. Understanding the implications using a suboptimal
# number of hash function is left as an exercise to the reader.
#
# Implied false positive rate (https://hur.st/bloomfilter/?n=100&p=&m=512&k=3)
# p=~0.087, roughly.
bloom_false_positive = 0.1 # false positive rate, p
# Sensitivity to n:
# n=50 => p=1%, n=100 => p=10%, n=200 => 30%
#
# Note that false positivity has two faces, one is in terms of extra bandwidth usage
# The other is in terms of anonymity / plausible deniability for listening on topic
# I.e. N envelopes go to node => 1% false positive rate => 1% of N goes to recipient node
# Even if they only wanted 1 message!
#
# The false positive is a factor of total network traffic

# If you are connected to two peers, you often get same message from both peers
# Even though both are acting according to protocol
# E.g. see https://our.status.im/whisper-pss-comparison/
# With mailservers and non perfect queries this might be higher
# On the other hand, with one mailserver it might be lower
benign_duplicate_receives = 2

# Assumption strings
a1 = "- A1. Envelope size (static): " + str(envelope_size) + "kb"
a2 = "- A2. Envelopes / message (static): " + str(envelopes_per_message)
a3 = "- A3. Received messages / day (static): " + str(received_messages_per_day)
a4 = "- A4. Only receiving messages meant for you."
a5 = "- A5. Received messages for everyone."
a6 = "- A6. Proportion of private messages (static): " + str(private_message_proportion)
a7 = "- A7. Public messages only received by relevant recipients (static)."
a8 = "- A8. All private messages are received by everyone (same topic) (static)."
a9 = "- A9. Private messages are partitioned evenly across partition shards (static), n=" + str(n_partitions)
a10 = "- A10. Bloom filter size (m) (static): " + str(bloom_size)
a11 = "- A11. Bloom filter hash functions (k) (static): " + str(bloom_hash_fns)
a12 = "- A12. Bloom filter elements, i.e. topics, (n) (static): " + str(bloom_elements)
a13 = "- A13. Bloom filter assuming optimal k choice (sensitive to m, n)."
a14 = "- A14. Bloom filter false positive proportion of full traffic, p=" + str(bloom_false_positive)
a15 = "- A15. Benign duplicate receives factor (static): " + str(benign_duplicate_receives)
a16 = "- A16. Assuming no bad envelopes, bad PoW, expired, etc (static)."

# Cases
#-----------------------------------------------------------

# Case 1: only receiving messages meant for you
def case1():
    def load_users(n_users):
        return envelope_size * envelopes_per_message * \
            received_messages_per_day

    print_header("Case 1. Only receiving messages meant for you")
    print_assumptions([a1, a2, a3, a4])
    print_usage(load_users)
    print("------------------------------------------------------------")

# Case 2: receiving all messages
def case2():

    def load_users(n_users):
        return envelope_size * envelopes_per_message * \
            received_messages_per_day * n_users

    print_header("Case 2. Receiving messages for everyone")
    print_assumptions([a1, a2, a3, a5])
    print_usage(load_users)
    print("------------------------------------------------------------")

# Case 3: all private messages go over one discovery topic
def case3():

    # Public scales per usage, all private messages are received
    # over one discovery topic
    def load_users(n_users):
        load_private = envelope_size * envelopes_per_message * \
            received_messages_per_day * n_users
        load_public = envelope_size * envelopes_per_message * \
            received_messages_per_day
        total_load = load_private * private_message_proportion + \
            load_public * (1 - private_message_proportion)
        return total_load

    print_header("Case 3. All private messages go over one discovery topic")
    print_assumptions([a1, a2, a3, a6, a7, a8])
    print_usage(load_users)
    print("------------------------------------------------------------")

# Case 4: all private messages are partitioned into shards
def case4():

    def load_users(n_users):
        if n_users < n_partitions:
            # Assume spread out, not colliding
            factor_load = 1
        else:
            # Assume spread out evenly, collides proportional to users
            factor_load = n_users / n_partitions
        load_private = envelope_size * envelopes_per_message * \
            received_messages_per_day * factor_load
        load_public = envelope_size * envelopes_per_message * \
            received_messages_per_day
        total_load = load_private * private_message_proportion + \
            load_public * (1 - private_message_proportion)
        return total_load

    print_header("Case 4. All private messages are partitioned into shards")
    print_assumptions([a1, a2, a3, a6, a7, a9])
    print_usage(load_users)
    print("------------------------------------------------------------")

# Case 5: all messages are passed through a bloom filter with a certain false positive rate
def case5():

    def load_users(n_users):
        if n_users < n_partitions:
            # Assume spread out, not colliding
            factor_load = 1
        else:
            # Assume spread out evenly, collides proportional to users
            factor_load = n_users / n_partitions
        load_private = envelope_size * envelopes_per_message * \
            received_messages_per_day * factor_load
        load_public = envelope_size * envelopes_per_message * \
            received_messages_per_day
        total_load = load_private * private_message_proportion + \
            load_public * (1 - private_message_proportion)

        # false positive total network traffic, assuming full node relaying
        network_load = envelope_size * envelopes_per_message * \
            received_messages_per_day * n_users
        false_positive_load = network_load * bloom_false_positive

        return total_load + false_positive_load

    print_header("Case 5. Case 4 + All messages are passed through bloom filter with false positive rate")
    print_assumptions([a1, a2, a3, a6, a7, a9, a10, a11, a12, a13, a14])
    print_usage(load_users)
    print("------------------------------------------------------------")


# Case 6: Same as case 5 but with duplicate receives
def case6():

    def load_users(n_users):
        if n_users < n_partitions:
            # Assume spread out, not colliding
            factor_load = 1
        else:
            # Assume spread out evenly, collides proportional to users
            factor_load = n_users / n_partitions
        load_private = envelope_size * envelopes_per_message * \
            received_messages_per_day * factor_load
        load_public = envelope_size * envelopes_per_message * \
            received_messages_per_day
        total_load = load_private * private_message_proportion + \
            load_public * (1 - private_message_proportion)

        # false positive total network traffic, assuming full node relaying
        network_load = envelope_size * envelopes_per_message * \
            received_messages_per_day * n_users
        false_positive_load = network_load * bloom_false_positive

        return (total_load + false_positive_load) * benign_duplicate_receives

    print_header("Case 6. Case 5 + Benign duplicate receives")
    print_assumptions([a1, a2, a3, a6, a7, a9, a10, a11, a12, a13, a14, a15, a16])
    print_usage(load_users)
    print("------------------------------------------------------------")

# Case 7: Waka mode - like Infura but for chat, no metadata connection
def case7():

    def load_users(n_users):
        if n_users < n_partitions:
            # Assume spread out, not colliding
            factor_load = 1
        else:
            # Assume spread out evenly, collides proportional to users
            factor_load = n_users / n_partitions
        load_private = envelope_size * envelopes_per_message * \
            received_messages_per_day * factor_load
        load_public = envelope_size * envelopes_per_message * \
            received_messages_per_day
        total_load = load_private * private_message_proportion + \
            load_public * (1 - private_message_proportion)

        return total_load

    print_header("Case 7. Waka mode - no metadata protection with bloom filter and one node connected; still static shard")
    print("Next step up is to either only use contact code, or shard more aggressively.")
    print("Note that this requires change of other nodes behavior, not just local node.")
    print("")
    print_assumptions([a1, a2, a3, a6, a7, a9])
    print_usage(load_users)
    print("------------------------------------------------------------")

# Run cases
#-----------------------------------------------------------


# Print goals
print("")
print(bcolors.HEADER + "Whisper theoretical model. Attempts to encode characteristics of it.")
print("")
print("Goals:")
print("1. Ensure network scales by being user or usage bound, as opposed to bandwidth growing in proportion to network size.")
print("2. Staying with in a reasonable bandwidth limit for limited data plans.")
print("3. Do the above without materially impacting existing nodes.")
print("" + bcolors.ENDC)

case1()
case2()
case3()
case4()
case5()
case6()

print("")
print("Assumptions not covered so far:")
print("- Offline case (impacts duplicates, bloom filter if rotated, bad envelopes)")
print("")

case7()

# Notes
#-----------------------------------------------------------

# What did I observe? I observed 15GB/m = 500mb per day. This was with
# discovery topic. After case 6, with case 3 discovery multiplier (x50, and
# maybe tiny bit fewer bloom_n), this roughly checks out. Also heavy user +
# envelope size. And number of users?

# Things left to encode:
# - Bugs / invalid / bad envelopes
# - Offline case dominant
# - percentage_offline
#   - impacts mailservers
#   - and also data sync
# - Unknowns?

# Feedback:
# Which of these assumptions are false?
# Any assumptions or conditions not accurately captured?
# Which are most interesting to you?
# Which do we want to verify, and what metrics do we need to verify?

# Misc:
# - If we x100 users tomorrow, how can we move the partition topic?
# - Show: path we are on today, and alternative path
# - Also not captured: fallover of relaying node, if it exceeds bandwidth link
# - It'd be neat if you could encode assumptions set
# - Get secondary out of model confirmation
# - How many unique public keys have we seen in common chats the last month?