Stop dumping match count before update, users can get it if they want
[flowspec-xdp] / genrules.py
index 37c533ea6832919253fdbd04d6d96887cfd8b7e4..a9f3d16db50c54ee04830757a4d5482fd8d7f39c 100755 (executable)
@@ -12,6 +12,10 @@ IP_PROTO_ICMPV6 = 58
 IP_PROTO_TCP = 6
 IP_PROTO_UDP = 17
 
+ORD_LESS = 0
+ORD_GREATER = 1
+ORD_EQUAL = 2
+
 class ASTAction(Enum):
     OR = 1
     AND = 2
@@ -175,28 +179,67 @@ def parse_bit_expr(expr):
     return ASTNode(ASTAction.EXPR, BitExpr(expr))
 
 
+class IpRule:
+    def __init__(self, ty, offset, net, proto):
+        self.ty = ty
+        self.offset = offset
+        if offset is None:
+            self.offset = 0
+        self.net = net
+        self.proto = proto
+
+    def ord(self, other):
+        assert self.ty == other.ty
+        assert self.proto == other.proto
+        if self.offset < other.offset:
+            return ORD_LESS
+        if self.offset > other.offset:
+            return ORD_GREATER
+
+        if self.net.overlaps(other.net):
+            if self.net.prefixlen > other.net.prefixlen:
+                return ORD_LESS
+            elif self.net.prefixlen < other.net.prefixlen:
+                return ORD_GREATER
+        else:
+            if self.net < other.net:
+                return ORD_LESS
+            else:
+                assert self.net > other.net
+                return ORD_GREATER
+
+        return ORD_EQUAL
+
+    def __lt__(self, other):
+        return self.ord(other) == ORD_LESS
+
+    def __eq__(self, other):
+        return type(other) == IpRule and self.ty == other.ty and self.offset == other.offset and self.net == other.net and self.proto == other.proto
+
+    def __str__(self):
+        if self.proto == 4:
+            assert self.offset == 0
+            return f"""if ((ip->{self.ty} & MASK4({self.net.prefixlen})) != BIGEND32({int(self.net.network_address)}ULL))
+       break;"""
+        else:
+            u32s = [(int(self.net.network_address) >> (3*32)) & 0xffffffff,
+                    (int(self.net.network_address) >> (2*32)) & 0xffffffff,
+                    (int(self.net.network_address) >> (1*32)) & 0xffffffff,
+                    (int(self.net.network_address) >> (0*32)) & 0xffffffff]
+            if self.offset == 0:
+                mask = f"MASK6({self.net.prefixlen})"
+            else:
+                mask = f"MASK6_OFFS({self.offset}, {self.net.prefixlen})"
+            return f"""if ((ip6->{self.ty} & {mask}) != (BIGEND128({u32s[0]}ULL, {u32s[1]}ULL, {u32s[2]}ULL, {u32s[3]}ULL) & {mask}))
+       break;"""
 def ip_to_rule(proto, inip, ty, offset):
     if proto == 4:
         assert offset is None
         net = ipaddress.IPv4Network(inip.strip())
-        if net.prefixlen == 0:
-            return ""
-        return f"""if ((ip->{ty} & MASK4({net.prefixlen})) != BIGEND32({int(net.network_address)}ULL))
-       break;"""
+        return IpRule(ty, offset, net, 4)
     else:
         net = ipaddress.IPv6Network(inip.strip())
-        if net.prefixlen == 0:
-            return ""
-        u32s = [(int(net.network_address) >> (3*32)) & 0xffffffff,
-                (int(net.network_address) >> (2*32)) & 0xffffffff,
-                (int(net.network_address) >> (1*32)) & 0xffffffff,
-                (int(net.network_address) >> (0*32)) & 0xffffffff]
-        if offset is None:
-            mask = f"MASK6({net.prefixlen})"
-        else:
-            mask = f"MASK6_OFFS({offset}, {net.prefixlen})"
-        return f"""if ((ip6->{ty} & {mask}) != (BIGEND128({u32s[0]}ULL, {u32s[1]}ULL, {u32s[2]}ULL, {u32s[3]}ULL) & {mask}))
-       break;"""
+        return IpRule(ty, offset, net, 6)
 
 def fragment_to_rule(ipproto, rules):
     ast = parse_ast(rules, parse_frag_expr, False)
@@ -239,10 +282,10 @@ def dscp_to_rule(proto, rules):
 def port_to_rule(ty, rules):
     if ty == "port" :
         ast = parse_ast(rules, parse_numbers_expr, True)
-        return "if (!ports_valid) break;\nif (!( " + ast.write("sport", "dport") + " )) break;"
+        return "if (sport == -1 || dport == -1) break;\nif (!( " + ast.write("sport", "dport") + " )) break;"
 
     ast = parse_ast(rules, parse_numbers_expr, True)
-    return "if (!ports_valid) break;\nif (!( " + ast.write(ty) + " )) break;"
+    return "if (" + ty + " == -1) break;\nif (!( " + ast.write(ty) + " )) break;"
 
 def tcp_flags_to_rule(rules):
     ast = parse_ast(rules, parse_bit_expr, False)
@@ -256,6 +299,100 @@ def flow_label_to_rule(rules):
     return f"""if (ip6 == NULL) break;
 if (!( {ast.write("((((uint32_t)(ip6->flow_lbl[0] & 0xf)) << 2*8) | (((uint32_t)ip6->flow_lbl[1]) << 1*8) | (uint32_t)ip6->flow_lbl[0])")} )) break;"""
 
+class RuleAction(Enum):
+    CONDITIONS = 1
+    ACTION = 2
+    LIST = 3
+class RuleNode:
+    def __init__(self, ty, action, inner):
+        self.ty = ty
+        self.action = action
+        self.inner = inner
+        if ty == RuleAction.ACTION:
+            assert inner is None
+            assert type(action) == str
+        elif ty == RuleAction.LIST:
+            assert type(inner) == list
+            assert action is None
+            for item in inner:
+                assert type(item) == RuleNode
+        else:
+            assert ty == RuleAction.CONDITIONS
+            assert type(action) == list
+            assert type(inner) == RuleNode
+
+    def __lt__(self, other):
+        assert self.ty == RuleAction.CONDITIONS
+        assert other.ty == RuleAction.CONDITIONS
+
+        o = ORD_EQUAL
+
+        # RFC first has us sort by dest, then source, then other conditions. We don't implement the
+        # other conditions because doing so requires re-implementing the Flowspec wire format,
+        # which isn't trivial. However, we do implement the source/dest sorting in the hopes it
+        # allows us to group rules according to source/dest IP and hopefully LLVM optimizes out
+        # later rules.
+
+        selfdest = next(filter(lambda a : type(a) == IpRule and a.ty == "daddr", self.action), None)
+        otherdest = next(filter(lambda a : type(a) == IpRule and a.ty == "daddr", self.action), None)
+        if o == ORD_EQUAL and selfdest is not None and otherdest is not None:
+            o = selfdest.ord(otherdest)
+
+        if o == ORD_EQUAL:
+            selfsrc = next(filter(lambda a : type(a) == IpRule and a.ty == "saddr", self.action), None)
+            othersrc = next(filter(lambda a : type(a) == IpRule and a.ty == "saddr", self.action), None)
+            if selfsrc is not None and othersrc is None:
+                return True
+            elif selfsrc is None and othersrc is not None:
+                return False
+            elif selfsrc is not None and othersrc is not None:
+                o = selfsrc.ord(othersrc)
+
+        if o == ORD_LESS:
+            return True
+        return self.action < other.action
+
+    def maybe_join(self, neighbor):
+        if self.ty == RuleAction.CONDITIONS and neighbor.ty == RuleAction.CONDITIONS:
+            overlapping_conditions = [x for x in self.action if x in neighbor.action]
+            if len(overlapping_conditions) != 0:
+                us = RuleNode(RuleAction.CONDITIONS, [x for x in self.action if x not in overlapping_conditions], self.inner)
+                them = RuleNode(RuleAction.CONDITIONS, [x for x in neighbor.action if x not in overlapping_conditions], neighbor.inner)
+                self.action = overlapping_conditions
+                if self.inner.ty == RuleAction.LIST and us.action == []:
+                    self.inner.inner.append(them)
+                else:
+                    self.inner = RuleNode(RuleAction.LIST, None, [us, them])
+                self.inner.flatten()
+                return True
+        return False
+
+    def flatten(self):
+        # LLVM can be pretty bad at optimizing out common subexpressions. Thus, we have to do a
+        # pass here to toptimize out common subexpressions in back-to-back rules.
+        # See https://bugs.llvm.org/show_bug.cgi?id=52455
+        assert self.ty == RuleAction.LIST
+        did_update = True
+        while did_update:
+            did_update = False
+            for i in range(0, len(self.inner) - 1):
+                if self.inner[i].maybe_join(self.inner[i + 1]):
+                    del self.inner[i + 1]
+                    did_update = True
+                    break
+
+    def write(self, out, pfx="\t"):
+        if self.ty == RuleAction.CONDITIONS:
+            out.write(pfx + "do {\\\n")
+            for cond in self.action:
+                out.write("\t" + pfx + str(cond).replace("\n", " \\\n\t" + pfx) + " \\\n")
+            self.inner.write(out, pfx)
+            out.write(pfx + "} while(0);\\\n")
+        elif self.ty == RuleAction.LIST:
+            for item in self.inner:
+                item.write(out, pfx + "\t")
+        else:
+            out.write("\t" + pfx + self.action.strip().replace("\n", " \\\n\t" + pfx) + " \\\n")
 
 with open("rules.h", "w") as out:
     parse = argparse.ArgumentParser()
@@ -291,12 +428,13 @@ with open("rules.h", "w") as out:
             assert False
         out.write("#define REQ_8021Q " + args.vlan_tag + "\n")
 
-    rules6 = ""
-    rules4 = ""
+    rules6 = []
+    rules4 = []
     use_v6_frags = False
-    rulecnt = 0
+    stats_rulecnt = 0
     ratelimitcnt = 0
     v4persrcratelimits = []
+    v5persrcratelimits = []
     v6persrcratelimits = []
 
     lastrule = None
@@ -312,19 +450,15 @@ with open("rules.h", "w") as out:
             t = lastrule.split("{")
             if t[0].strip() == "flow4":
                 proto = 4
-                rules4 += "\tdo {\\\n"
             elif t[0].strip() == "flow6":
                 proto = 6
-                rules6 += "\tdo {\\\n"
             else:
                 continue
 
+            conditions = []
             def write_rule(r):
-                global rules4, rules6
-                if proto == 6:
-                    rules6 += "\t\t" + r.replace("\n", " \\\n\t\t") + " \\\n"
-                else:
-                    rules4 += "\t\t" + r.replace("\n", " \\\n\t\t") + " \\\n"
+                global conditions
+                conditions.append(r)
 
             rule = t[1].split("}")[0].strip()
             for step in rule.split(";"):
@@ -366,9 +500,14 @@ with open("rules.h", "w") as out:
                 else:
                     assert False
 
+            actions = ""
+            def write_rule(r):
+                global actions
+                actions += r + "\n"
+
             # Now write the match handling!
             first_action = None
-            stats_action = None
+            stats_action = ""
             last_action = None
             for community in line.split("("):
                 if not community.startswith("generic, "):
@@ -394,80 +533,57 @@ with open("rules.h", "w") as out:
                     elif exp == 0xff:
                         # NaN/INF. Just treat as INF and accept
                         first_action = None
-                    elif exp <= 127: # < 1
+                    elif exp < 127: # < 1
                         first_action = "{stats_replace}\nreturn XDP_DROP;"
-                    elif exp >= 127 + 63: # The count won't even fit in 64-bits, just accept
+                    elif exp >= 127 + 29: # We can't handle the precision required with ns this high
                         first_action = None
                     else:
                         mantissa = low_bytes & ((1 << 23) - 1)
                         value = 1.0 + mantissa / (2**23)
                         value *= 2**(exp-127)
+
+                        first_action =   "int64_t time_masked = bpf_ktime_get_ns() & RATE_TIME_MASK;\n"
+                        first_action += f"int64_t per_pkt_ns = (1000000000LL << RATE_BUCKET_INTEGER_BITS) / {math.floor(value)};\n"
                         if ty == "0x8006" or ty == "0x8306":
-                            accessor = "rate->rate.sent_bytes"
+                            first_action += "uint64_t amt = data_end - pktdata;\n"
                         else:
-                            accessor = "rate->rate.sent_packets"
-                        # Note that int64_t will overflow after 292 years of uptime
-                        first_action = "int64_t time = bpf_ktime_get_ns();\n"
-                        first_action +=  "uint64_t allowed_since_last = 0;\n"
+                            first_action += "uint64_t amt = 1;\n"
                         if ty == "0x8006" or ty == "0x800c":
-                            spin_lock = "bpf_spin_lock(&rate->lock);"
-                            spin_unlock = "bpf_spin_unlock(&rate->lock);"
                             first_action += f"const uint32_t ratelimitidx = {ratelimitcnt};\n"
                             first_action += "struct ratelimit *rate = bpf_map_lookup_elem(&rate_map, &ratelimitidx);\n"
                             ratelimitcnt += 1
+                            first_action +=  "int matched = 0;\n"
+                            first_action += "DO_RATE_LIMIT(bpf_spin_lock(&rate->lock), rate, time_masked, amt, per_pkt_ns, matched);\n"
+                            first_action += "if (rate) { bpf_spin_unlock(&rate->lock); }\n"
                         else:
-                            spin_lock = "/* No locking as we're per-CPU */"
-                            spin_unlock = "/* No locking as we're per-CPU */"
                             if proto == 4:
                                 if mid_byte > 32:
                                     continue
                                 first_action += f"const uint32_t srcip = ip->saddr & MASK4({mid_byte});\n"
                                 first_action += f"void *rate_map = &v4_src_rate_{len(v4persrcratelimits)};\n"
-                                v4persrcratelimits.append((high_byte + 1) * 1024)
+                                first_action += f"int matched = check_v4_persrc_ratelimit(srcip, rate_map, {(high_byte + 1) * 4096}, time_masked, amt, per_pkt_ns);\n"
+                                v4persrcratelimits.append((high_byte + 1) * 4096)
+                            elif mid_byte <= 64:
+                                first_action += f"const uint64_t srcip = BE128BEHIGH64(ip6->saddr & MASK6({mid_byte}));\n"
+                                first_action += f"void *rate_map = &v5_src_rate_{len(v5persrcratelimits)};\n"
+                                first_action += f"int matched = check_v5_persrc_ratelimit(srcip, rate_map, {(high_byte + 1) * 4096}, time_masked, amt, per_pkt_ns);\n"
+                                v5persrcratelimits.append((high_byte + 1) * 4096)
                             else:
                                 if mid_byte > 128:
                                     continue
                                 first_action += f"const uint128_t srcip = ip6->saddr & MASK6({mid_byte});\n"
                                 first_action += f"void *rate_map = &v6_src_rate_{len(v6persrcratelimits)};\n"
-                                v6persrcratelimits.append((high_byte + 1) * 1024)
-                            first_action += f"struct percpu_ratelimit *rate = bpf_map_lookup_elem(rate_map, &srcip);\n"
-                        first_action +=  "if (rate) {\n"
-                        first_action += f"\t{spin_lock}\n"
-                        first_action += f"\tif (likely({accessor} > 0))" + " {\n"
-                        first_action +=  "\t\tint64_t diff = time - rate->sent_time;\n"
-                        # Unlikely or not, if the flow is slow, take a perf hit (though with the else if branch it doesn't matter)
-                        first_action +=  "\t\tif (unlikely(diff > 1000000000))\n"
-                        first_action += f"\t\t\t{accessor} = 0;\n"
-                        first_action +=  "\t\telse if (likely(diff > 0))\n"
-                        first_action += f"\t\t\tallowed_since_last = ((uint64_t)diff) * {math.floor(value)} / 1000000000;\n"
-                        first_action +=  "\t}\n"
-                        first_action += f"\tif ({accessor} - ((int64_t)allowed_since_last) <= 0)" + " {\n"
-                        if ty == "0x8006" or ty == "0x8306":
-                            first_action += f"\t\t{accessor} = data_end - pktdata;\n"
-                        else:
-                            first_action += f"\t\t{accessor} = 1;\n"
-                        first_action +=  "\t\trate->sent_time = time;\n"
-                        first_action += f"\t\t{spin_unlock}\n"
-                        first_action +=  "\t} else {\n"
-                        first_action += f"\t\t{spin_unlock}\n"
-                        first_action +=  "\t\t{stats_replace}\n"
-                        first_action +=  "\t\treturn XDP_DROP;\n"
-                        first_action +=  "\t}\n"
-                        if ty == "0x8306" or ty == "0x830c":
-                            first_action +=  "} else {\n"
-                            first_action +=  "\tstruct percpu_ratelimit new_rate = { .sent_time = time, };\n"
-                            first_action +=  "\trate = &new_rate;\n"
-                            if ty == "0x8006" or ty == "0x8306":
-                                first_action += f"\t\t{accessor} = data_end - pktdata;\n"
-                            else:
-                                first_action += f"\t\t{accessor} = 1;\n"
-                            first_action +=  "\tbpf_map_update_elem(rate_map, &srcip, rate, BPF_ANY);\n"
+                                first_action += f"int matched = check_v6_persrc_ratelimit(srcip, rate_map, {(high_byte + 1) * 4096}, time_masked, amt, per_pkt_ns);\n"
+                                v6persrcratelimits.append((high_byte + 1) * 4096)
+                        first_action +=  "if (matched) {\n"
+                        first_action +=  "\t{stats_replace}\n"
+                        first_action +=  "\treturn XDP_DROP;\n"
                         first_action +=  "}\n"
                 elif ty == "0x8007":
                     if low_bytes & 1 == 0:
                         last_action = "return XDP_PASS;"
                     if low_bytes & 2 == 2:
-                        stats_action = f"const uint32_t ruleidx = STATIC_RULE_CNT + {rulecnt};\n"
+                        stats_action = f"const uint32_t ruleidx = STATIC_RULE_CNT + {stats_rulecnt};\n"
                         stats_action += "INCREMENT_MATCH(ruleidx);"
                 elif ty == "0x8008":
                     assert False # We do not implement the redirect action
@@ -487,32 +603,53 @@ with open("rules.h", "w") as out:
                         write_rule("ip6->flow_lbl[0] = (ip6->flow_lbl[0] & 0x3f) | " + str((low_bytes & 3) << 6) + ";")
             if first_action is not None:
                 write_rule(first_action.replace("{stats_replace}", stats_action))
-            if stats_action is not None and (first_action is None or "{stats_replace}" not in first_action):
+            if stats_action != "" and (first_action is None or "{stats_replace}" not in first_action):
                 write_rule(stats_action)
             if last_action is not None:
                 write_rule(last_action)
             if proto == 6:
-                rules6 += "\t} while(0);\\\n"
+                rules6.append(RuleNode(RuleAction.CONDITIONS, conditions, RuleNode(RuleAction.ACTION, actions, None)))
             else:
-                rules4 += "\t} while(0);\\\n"
-            rulecnt += 1
+                rules4.append(RuleNode(RuleAction.CONDITIONS, conditions, RuleNode(RuleAction.ACTION, actions, None)))
+            if stats_action != "":
+                print(rule)
+                stats_rulecnt += 1
             lastrule = None
 
     out.write("\n")
-    out.write(f"#define RULECNT {rulecnt}\n")
+    out.write(f"#define STATS_RULECNT {stats_rulecnt}\n")
     if ratelimitcnt != 0:
         out.write(f"#define RATE_CNT {ratelimitcnt}\n")
-    if rules4 != "":
+
+    if len(rules4) != 0:
         out.write("#define NEED_V4_PARSE\n")
-        out.write("#define RULES4 {\\\n" + rules4 + "}\n")
-    if rules6:
+        out.write("#define RULES4 {\\\n")
+        # First sort the rules according to the RFC, then make it a single
+        # LIST rule and call flatten() to unify redundant conditions
+        rules4.sort()
+        rules4 = RuleNode(RuleAction.LIST, None, rules4)
+        rules4.flatten()
+        rules4.write(out)
+        out.write("}\n")
+
+    if len(rules6) != 0:
         out.write("#define NEED_V6_PARSE\n")
-        out.write("#define RULES6 {\\\n" + rules6 + "}\n")
+        out.write("#define RULES6 {\\\n")
+        # First sort the rules according to the RFC, then make it a single
+        # LIST rule and call flatten() to unify redundant conditions
+        rules6.sort()
+        rules6 = RuleNode(RuleAction.LIST, None, rules6)
+        rules6.flatten()
+        rules6.write(out)
+        out.write("}\n")
+
     if args.v6frag == "ignore-parse-if-rule":
         if use_v6_frags:
             out.write("#define PARSE_V6_FRAG PARSE\n")
     with open("maps.h", "w") as out:
         for idx, limit in enumerate(v4persrcratelimits):
-            out.write(f"V4_SRC_RATE_DEFINE({idx}, {limit})\n")
+            out.write(f"SRC_RATE_DEFINE(4, {idx}, {limit})\n")
+        for idx, limit in enumerate(v5persrcratelimits):
+            out.write(f"SRC_RATE_DEFINE(5, {idx}, {limit})\n")
         for idx, limit in enumerate(v6persrcratelimits):
-            out.write(f"V6_SRC_RATE_DEFINE({idx}, {limit})\n")
+            out.write(f"SRC_RATE_DEFINE(6, {idx}, {limit})\n")