Move the data structures and helper functions for the TCP hash table to flow.c, making it a general hash table indexing sides of flows. This is largely code motion and straightforward renames. There are two semantic changes: * flow_hash_lookup() now needs to verify that the entry has a matching protocol as well as matching addresses, ports and interface * When moving entries in the flow table tcp_tap_conn_update() could previously assume that only the TAP side of the TCP connection was hashed and might need an update. For the general flow hash table either side of a flow could be hashed, so when we move an entry in the flow table we need to allow for updating both sides in the hash table. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- flow.c | 128 +++++++++++++++++++++++++++++++++++++++++++++++++- flow.h | 11 +++-- tcp.c | 146 +++++---------------------------------------------------- 3 files changed, 147 insertions(+), 138 deletions(-) diff --git a/flow.c b/flow.c index 70036fc..3d016c3 100644 --- a/flow.c +++ b/flow.c @@ -40,6 +40,16 @@ union flow flowtab[FLOW_MAX]; /* Last time the flow timers ran */ static struct timespec flow_timer_run; +/* Hash table to index it */ +#define FLOW_HASH_LOAD 70 /* % */ +#define FLOW_HASH_SIZE ((2 * FLOW_MAX * 100 / FLOW_HASH_LOAD)) + +/* Table for lookup from flowside information */ +static flow_sidx_t flow_hashtab[FLOW_HASH_SIZE]; + +static_assert(ARRAY_SIZE(flow_hashtab) >= 2 * FLOW_MAX, +"Safe linear probing requires hash table with more entries than the number of sides in the flow table"); + /** flow_log_ - Log flow-related message * @f: flow the message is related to * @pri: Log priority @@ -244,8 +254,8 @@ void flow_alloc_cancel(union flow *flow) * * Return: hash value */ -uint64_t flow_hash(const struct ctx *c, uint8_t proto, - const struct flowside *fside) +static uint64_t flow_hash(const struct ctx *c, uint8_t proto, + const struct flowside *fside) { struct siphash_state state = 
SIPHASH_INIT(c->hash_secret); @@ -257,6 +267,115 @@ uint64_t flow_hash(const struct ctx *c, uint8_t proto, fside->fport << 16 | fside->eport); } +/** + * flow_sidx_hash() - Calculate hash value for given side of a given flow + * @c: Execution context + * @sidx: Flow & side index to get hash for + * + * Return: hash value of the flow & side represented by @sidx + */ +static uint64_t flow_sidx_hash(const struct ctx *c, flow_sidx_t sidx) +{ + const struct flow_common *f = &flow_at_sidx(sidx)->f; + return flow_hash(c, FLOW_PROTO(f), &f->side[sidx.side]); +} + +/** + * flow_hash_probe() - Find hash bucket for a flow + * @c: Execution context + * @sidx: Flow and side to find bucket for + * + * Return: If @sidx is in the hash table, its current bucket, otherwise a + * suitable free bucket for it. + */ +static inline unsigned flow_hash_probe(const struct ctx *c, flow_sidx_t sidx) +{ + unsigned b = flow_sidx_hash(c, sidx) % FLOW_HASH_SIZE; + + /* Linear probing */ + while (!flow_sidx_eq(flow_hashtab[b], FLOW_SIDX_NONE) && + !flow_sidx_eq(flow_hashtab[b], sidx)) + b = mod_sub(b, 1, FLOW_HASH_SIZE); + + return b; +} + +/** + * flow_hash_insert() - Insert side of a flow into the hash table + * @c: Execution context + * @sidx: Flow & side index + */ +void flow_hash_insert(const struct ctx *c, flow_sidx_t sidx) +{ + unsigned b = flow_hash_probe(c, sidx); + + flow_hashtab[b] = sidx; + flow_dbg(flow_at_sidx(sidx), "hash table insert: bucket: %u", b); +} + +/** + * flow_hash_remove() - Drop side of a flow from the hash table + * @c: Execution context + * @sidx: Side of flow to remove + */ +void flow_hash_remove(const struct ctx *c, flow_sidx_t sidx) +{ + unsigned b = flow_hash_probe(c, sidx), s; + + if (flow_sidx_eq(flow_hashtab[b], FLOW_SIDX_NONE)) + return; /* Redundant remove */ + + flow_dbg(flow_at_sidx(sidx), "hash table remove: bucket: %u", b); + + /* Scan the remainder of the cluster */ + for (s = mod_sub(b, 1, FLOW_HASH_SIZE); + !flow_sidx_eq(flow_hashtab[s], 
FLOW_SIDX_NONE); + s = mod_sub(s, 1, FLOW_HASH_SIZE)) { + unsigned h = flow_sidx_hash(c, flow_hashtab[s]) % FLOW_HASH_SIZE; + + if (!mod_between(h, s, b, FLOW_HASH_SIZE)) { + /* flow_hashtab[s] can live in flow_hashtab[b]'s slot */ + debug("hash table remove: shuffle %u -> %u", s, b); + flow_hashtab[b] = flow_hashtab[s]; + b = s; + } + } + + flow_hashtab[b] = FLOW_SIDX_NONE; +} + +/** + * flow_hash_lookup() - Look up a flow given addressing information + * @c: Execution context + * @proto: Protocol of the flow (IP L4 protocol number) + * @pif: Interface of the flow + * @af: Address family, AF_INET or AF_INET6 + * @eaddr: Guest side endpoint address (guest local address) + * @faddr: Guest side forwarding address (guest remote address) + * @eport: Guest side endpoint port (guest local port) + * @fport: Guest side forwarding port (guest remote port) + * + * Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found + */ +flow_sidx_t flow_hash_lookup(const struct ctx *c, uint8_t proto, uint8_t pif, + int af, const void *eaddr, const void *faddr, + in_port_t eport, in_port_t fport) +{ + struct flowside fside; + union flow *flow; + unsigned b; + + flowside_from_af(&fside, pif, af, faddr, fport, eaddr, eport); + + b = flow_hash(c, proto, &fside) % FLOW_HASH_SIZE; + while ((flow = flow_at_sidx(flow_hashtab[b])) && + !(FLOW_PROTO(&flow->f) == proto && + flowside_eq(&flow->f.side[flow_hashtab[b].side], &fside))) + b = mod_sub(b, 1, FLOW_HASH_SIZE); /* as flow_hash_probe() */ + + return flow_hashtab[b]; +} + /** * flow_defer_handler() - Handler for per-flow deferred and timed tasks * @c: Execution context @@ -334,7 +453,12 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) */ void flow_init(void) { + unsigned b; + /* Initial state is a single free block containing the whole table */ flowtab[0].free.n = FLOW_MAX; flowtab[0].free.next = FLOW_MAX; + + for (b = 0; b < FLOW_HASH_SIZE; b++) + flow_hashtab[b] = FLOW_SIDX_NONE; } diff --git a/flow.h b/flow.h index ab831d1..adb5d44 
100644 --- a/flow.h +++ b/flow.h @@ -133,9 +133,6 @@ void flow_new_dbg(const struct flow_common *f, unsigned side); void flow_fwd_dbg(const struct flow_common *f, unsigned side); #define FLOW_FWD_DBG(flow, side) (flow_fwd_dbg(&(flow)->f, (side))) -uint64_t flow_hash(const struct ctx *c, uint8_t proto, - const struct flowside *fside); - /** * struct flow_sidx - ID for one side of a specific flow * @side: Side referenced (0 or 1) @@ -161,6 +158,12 @@ static inline bool flow_sidx_eq(flow_sidx_t a, flow_sidx_t b) return (a.flow == b.flow) && (a.side == b.side); } +void flow_hash_insert(const struct ctx *c, flow_sidx_t sidx); +void flow_hash_remove(const struct ctx *c, flow_sidx_t sidx); +flow_sidx_t flow_hash_lookup(const struct ctx *c, uint8_t proto, uint8_t pif, + int af, const void *eaddr, const void *faddr, + in_port_t eport, in_port_t fport); + union flow; void flow_init(void); @@ -180,4 +183,6 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) flow_dbg((f), __VA_ARGS__); \ } while (0) +void flow_init(void); + #endif /* FLOW_H */ diff --git a/tcp.c b/tcp.c index a7907f2..b6d046f 100644 --- a/tcp.c +++ b/tcp.c @@ -308,9 +308,6 @@ #define TCP_FRAMES \ (c->mode == MODE_PASST ? 
TCP_FRAMES_MEM : 1) -#define TCP_HASH_TABLE_LOAD 70 /* % */ -#define TCP_HASH_TABLE_SIZE (FLOW_MAX * 100 / TCP_HASH_TABLE_LOAD) - #define MAX_WS 8 #define MAX_WINDOW (1 << (16 + (MAX_WS))) @@ -575,12 +572,6 @@ static unsigned int tcp6_l2_flags_buf_used; #define CONN(idx) (&(FLOW(idx)->tcp)) -/* Table for lookup from flowside information */ -static flow_sidx_t tc_hash[TCP_HASH_TABLE_SIZE]; - -static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX, - "Safe linear probing requires hash table larger than connection table"); - /* Pools for pre-opened sockets (in init) */ int init_sock_pool4 [TCP_SOCK_POOL_SIZE]; int init_sock_pool6 [TCP_SOCK_POOL_SIZE]; @@ -773,9 +764,6 @@ static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, tcp_timer_ctl(c, conn); } -static void tcp_hash_remove(const struct ctx *c, - const struct tcp_tap_conn *conn); - /** * conn_event_do() - Set and log connection events, update epoll state * @c: Execution context @@ -821,7 +809,7 @@ static void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, num == -1 ? "CLOSED" : tcp_event_str[num]); if (event == CLOSED) - tcp_hash_remove(c, conn); + flow_hash_remove(c, FLOW_SIDX(conn, TAPSIDE)); else if ((event == TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_RCVD)) conn_flag(c, conn, ACTIVE_CLOSE); else @@ -1134,116 +1122,6 @@ static int tcp_opt_get(const char *opts, size_t len, uint8_t type_find, return -1; } -/** - * tcp_conn_hash() - Calculate hash bucket of an existing connection - * @c: Execution context - * @conn: Connection - * - * Return: hash value, needs to be adjusted for table size - */ -static uint64_t tcp_conn_hash(const struct ctx *c, - const struct tcp_tap_conn *conn) -{ - return flow_hash(c, IPPROTO_TCP, TAPFSIDE(conn)); -} - -/** - * tcp_hash_probe() - Find hash bucket for a connection - * @c: Execution context - * @conn: Connection to find bucket for - * - * Return: If @conn is in the table, its current bucket, otherwise a suitable - * free bucket for it. 
- */ -static inline unsigned tcp_hash_probe(const struct ctx *c, - const struct tcp_tap_conn *conn) -{ - flow_sidx_t sidx = FLOW_SIDX(conn, TAPSIDE); - unsigned b = tcp_conn_hash(c, conn) % TCP_HASH_TABLE_SIZE; - - /* Linear probing */ - while (!flow_sidx_eq(tc_hash[b], FLOW_SIDX_NONE) && - !flow_sidx_eq(tc_hash[b], sidx)) - b = mod_sub(b, 1, TCP_HASH_TABLE_SIZE); - - return b; -} - -/** - * tcp_hash_insert() - Insert connection into hash table, chain link - * @c: Execution context - * @conn: Connection pointer - */ -static void tcp_hash_insert(const struct ctx *c, struct tcp_tap_conn *conn) -{ - unsigned b = tcp_hash_probe(c, conn); - - tc_hash[b] = FLOW_SIDX(conn, TAPSIDE); - flow_dbg(conn, "hash table insert: sock %i, bucket: %u", conn->sock, b); -} - -/** - * tcp_hash_remove() - Drop connection from hash table, chain unlink - * @c: Execution context - * @conn: Connection pointer - */ -static void tcp_hash_remove(const struct ctx *c, - const struct tcp_tap_conn *conn) -{ - unsigned b = tcp_hash_probe(c, conn), s; - union flow *flow = flow_at_sidx(tc_hash[b]); - - if (!flow) - return; /* Redundant remove */ - - flow_dbg(conn, "hash table remove: sock %i, bucket: %u", conn->sock, b); - - /* Scan the remainder of the cluster */ - for (s = mod_sub(b, 1, TCP_HASH_TABLE_SIZE); - (flow = flow_at_sidx(tc_hash[s])); - s = mod_sub(s, 1, TCP_HASH_TABLE_SIZE)) { - unsigned h = tcp_conn_hash(c, &flow->tcp) % TCP_HASH_TABLE_SIZE; - - if (!mod_between(h, s, b, TCP_HASH_TABLE_SIZE)) { - /* tc_hash[s] can live in tc_hash[b]'s slot */ - debug("hash table remove: shuffle %u -> %u", s, b); - tc_hash[b] = tc_hash[s]; - b = s; - } - } - - tc_hash[b] = FLOW_SIDX_NONE; -} - -/** - * tcp_hash_lookup() - Look up connection given remote address and ports - * @c: Execution context - * @af: Address family, AF_INET or AF_INET6 - * @eaddr: Guest side endpoint address (guest local address) - * @faddr: Guest side forwarding address (guest remote address) - * @eport: Guest side endpoint port 
(guest local port) - * @fport: Guest side forwarding port (guest remote port) - * - * Return: connection pointer, if found, -ENOENT otherwise - */ -static struct tcp_tap_conn *tcp_hash_lookup(const struct ctx *c, int af, - const void *eaddr, const void *faddr, - in_port_t eport, in_port_t fport) -{ - struct flowside fside; - union flow *flow; - unsigned b; - - flowside_from_af(&fside, PIF_TAP, af, faddr, fport, eaddr, eport); - - b = flow_hash(c, IPPROTO_TCP, &fside) % TCP_HASH_TABLE_SIZE; - while ((flow = flow_at_sidx(tc_hash[b])) && - !flowside_eq(&flow->f.side[TAPSIDE], &fside)) - b = mod_sub(b, 1, TCP_HASH_TABLE_SIZE); - - return &flow->tcp; -} - /** * tcp_flow_defer() - Deferred per-flow handling (clean up closed connections) * @flow: Flow table entry for this connection @@ -1949,7 +1827,7 @@ static void tcp_conn_from_tap(struct ctx *c, tcp_seq_init(c, conn, now); conn->seq_ack_from_tap = conn->seq_to_tap; - tcp_hash_insert(c, conn); + flow_hash_insert(c, FLOW_SIDX(conn, TAPSIDE)); if (!bind(s, sa, sl)) { tcp_rst(c, conn); /* Nobody is listening then */ @@ -2452,6 +2330,8 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, int af, struct tcp_tap_conn *conn; size_t optlen, len; struct tcphdr *th; + union flow *flow; + flow_sidx_t sidx; int ack_due = 0; char *opts; int count; @@ -2468,17 +2348,22 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, int af, optlen = MIN(optlen, ((1UL << 4) /* from doff width */ - 6) * 4UL); opts = packet_get(p, idx, sizeof(*th), optlen, NULL); - conn = tcp_hash_lookup(c, af, saddr, daddr, - ntohs(th->source), ntohs(th->dest)); + sidx = flow_hash_lookup(c, IPPROTO_TCP, PIF_TAP, af, saddr, daddr, + ntohs(th->source), ntohs(th->dest)); + flow = flow_at_sidx(sidx); /* New connection from tap */ - if (!conn) { + if (!flow) { if (opts && th->syn && !th->ack) tcp_conn_from_tap(c, af, saddr, daddr, th, opts, optlen, now); return 1; } + ASSERT(flow->f.type == FLOW_TCP); + ASSERT(sidx.side == TAPSIDE); + conn = &flow->tcp; + flow_trace(conn, 
"packet length %zu from tap", len); if (th->rst) { @@ -2660,7 +2545,7 @@ static void tcp_tap_conn_from_sock(struct ctx *c, ASSERT(flow_complete(&conn->f)); tcp_seq_init(c, conn, now); - tcp_hash_insert(c, conn); + flow_hash_insert(c, FLOW_SIDX(conn, TAPSIDE)); conn->seq_ack_from_tap = conn->seq_to_tap; @@ -3032,11 +2917,6 @@ static void tcp_sock_refill_init(const struct ctx *c) */ int tcp_init(struct ctx *c) { - unsigned b; - - for (b = 0; b < TCP_HASH_TABLE_SIZE; b++) - tc_hash[b] = FLOW_SIDX_NONE; - if (c->ifi4) tcp_sock4_iov_init(c); -- 2.43.0