Fix/better handling of no-stats-tracking rules
[flowspec-xdp] / collision_prob.py
1 # Approximate collisions in an N-bucket hash table
2 # Using suggested approximate formula from https://stats.stackexchange.com/questions/1308/extending-the-birthday-paradox-to-more-than-2-people
3
4 import math
5
6 def poisson_pdf(avg, k):
7         return (avg ** k) / (math.factorial(k) * math.e**avg)
8
9 def poisson_cdf(avg, k):
10         # Lazy version of inclusive cdf from 0 through k
11         it = 0
12         for i in range(0, k+1):
13                 it += poisson_pdf(avg, i)
14         return it
15
16 def poisson_pow(n, b, k):
17         if k == 0:
18                 return poisson_cdf(n/b, 0)**b
19         sub = 0
20         for i in range(0, k):
21                 sub += poisson_pow(n, b, i)
22         return poisson_cdf(n/b, k)**b - sub
23
24 def inv_poisson_pow(n, b, k):
25         it = 0
26         # This loop is equivalent to the below two-step:
27         #for i in range(1, k):
28         #       it += poisson_pow(n, b, i)
29         it -= poisson_cdf(n/b, 0)**b
30         it += poisson_cdf(n/b, k-1)**b
31         return 1 - it
32
33 def print_entry(e, t, b):
34         # We have t/b buckets, and want the probability that no bucket has >= b+1 entries
35         print("\t%ldK entries, %ld-buckets: %.20f" % (e/1000, b, inv_poisson_pow(e, t/b, b+1)))
36
37 # Base cases which can be compared with WolframAlpha
38 # Note that we have to multiply by b to undo the division in print_entry
39 # https://www.wolframalpha.com/input/?i=birthday+problem+calculator&assumption=%7B%22F%22%2C+%22BirthdayProblem%22%2C+%22pbds%22%7D+-%3E%2220000%22&assumption=%7B%22F%22%2C+%22BirthdayProblem%22%2C+%22n%22%7D+-%3E%22300%22&assumption=%22FSelect%22+-%3E+%7B%7B%22BirthdayProblem%22%7D%7D
40 #print_entry(300, 20000, 1)
41 #print_entry(300, 20000*2, 2)
42
43 print("Table and bucket sizes mapped to rough element count which has a 1% bucket-overflow probability")
44 print("Note that we currently have a hard-coded bucket size of 16 elements")
45 print()
46
47 print("128K table * 16 bytes = %dMiB." % (128*16/1024))
48 print_entry(4000, 128*1024, 4)
49 print_entry(15000, 128*1024, 8)
50 print_entry(33000, 128*1024, 16)
51 print_entry(53000, 128*1024, 32)
52
53 print("256K table * 16 bytes = %dMiB." % (256*16/1024))
54 print_entry(7000, 256*1024, 4)
55 print_entry(28000, 256*1024, 8)
56 print_entry(63000, 256*1024, 16)
57 print_entry(104000, 256*1024, 32)
58
59 print("512K table * 16 bytes = %dMiB." % (512*16/1024))
60 print_entry(13000, 512*1024, 4)
61 print_entry(52000, 512*1024, 8)
62 print_entry(119000, 512*1024, 16)
63 print_entry(200000, 512*1024, 32)
64
65 print("1M table * 16 bytes = %dMiB." % (1024*16/1024))
66 print_entry(23000, 1024*1024, 4)
67 print_entry(95000, 1024*1024, 8)
68 print_entry(227000, 1024*1024, 16)
69 print_entry(387000, 1024*1024, 32)
70
71 print("2M table * 16 bytes = %dMiB." % (2*1024*16/1024))
72 print_entry(40000, 2*1024*1024, 4)
73 print_entry(175000, 2*1024*1024, 8)
74 print_entry(431000, 2*1024*1024, 16)
75 print_entry(749000, 2*1024*1024, 32)