I now have an in-progress draft of a unified hash table to go with the unified flow table. This turns out to be easier if we first make some preliminary changes to the structure of the TCP hash table. So, here those are for review. This is based on my already posted series introducing the rudimentary version of the unified flow table. David Gibson (3): tcp: Switch hash table to linear probing instead of chaining tcp: Implement hash table with indices rather than pointers tcp: Don't account for hash table size in tcp_hash() flow.h | 11 ++++ tcp.c | 144 ++++++++++++++++++++++++++++------------------------- tcp_conn.h | 2 - util.h | 13 +++++ 4 files changed, 101 insertions(+), 69 deletions(-) -- 2.43.0
Currently we deal with hash collisions by letting a hash bucket contain multiple entries, forming a linked list using an index in the connection structure. That's a pretty standard and simple approach, but in our case we can use an even simpler one: linear probing. Here if a hash bucket is occupied we just move onto the next one until we find a free one. This slightly simplifies lookup and more importantly saves some precious bytes in the connection structure by removing the need for a link. It does require some additional complexity for hash removal. This approach can perform poorly when hash table load is high. However, we already size our hash table of pointers larger than the connection table, which puts an upper bound on the load. It's relatively cheap to decrease that bound if we find we need to. I adapted the linear probing operations from Knuth's The Art of Computer Programming, Volume 3, 2nd Edition. Specifically Algorithm L and Algorithm R in Section 6.4. Note that there is an error in Algorithm R as printed, see errata at [0]. 
[0] https://www-cs-faculty.stanford.edu/~knuth/all3-prepre.ps.gz Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 111 +++++++++++++++++++++++++++-------------------------- tcp_conn.h | 2 - util.h | 13 +++++++ 3 files changed, 69 insertions(+), 57 deletions(-) diff --git a/tcp.c b/tcp.c index 17c7cba..09acf7f 100644 --- a/tcp.c +++ b/tcp.c @@ -573,22 +573,12 @@ static unsigned int tcp6_l2_flags_buf_used; #define CONN(idx) (&(FLOW(idx)->tcp)) -/** conn_at_idx() - Find a connection by index, if present - * @idx: Index of connection to lookup - * - * Return: pointer to connection, or NULL if @idx is out of bounds - */ -static inline struct tcp_tap_conn *conn_at_idx(unsigned idx) -{ - if (idx >= FLOW_MAX) - return NULL; - ASSERT(CONN(idx)->f.type == FLOW_TCP); - return CONN(idx); -} - /* Table for lookup from remote address, local port, remote port */ static struct tcp_tap_conn *tc_hash[TCP_HASH_TABLE_SIZE]; +static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX, + "Safe linear probing requires hash table larger than connection table"); + /* Pools for pre-opened sockets (in init) */ int init_sock_pool4 [TCP_SOCK_POOL_SIZE]; int init_sock_pool6 [TCP_SOCK_POOL_SIZE]; @@ -1196,6 +1186,27 @@ static unsigned int tcp_conn_hash(const struct ctx *c, return tcp_hash(c, &conn->faddr, conn->eport, conn->fport); } +/** + * tcp_hash_probe() - Find hash bucket for a connection + * @c: Execution context + * @conn: Connection to find bucket for + * + * Return: If @conn is in the table, its current bucket, otherwise a suitable + * free bucket for it. 
+ */ +static inline unsigned tcp_hash_probe(const struct ctx *c, + const struct tcp_tap_conn *conn) +{ + unsigned b; + + /* Linear probing */ + for (b = tcp_conn_hash(c, conn); tc_hash[b] && tc_hash[b] != conn; + b = (b + 1) % TCP_HASH_TABLE_SIZE) + ; + + return b; +} + /** * tcp_hash_insert() - Insert connection into hash table, chain link * @c: Execution context @@ -1203,14 +1214,10 @@ static unsigned int tcp_conn_hash(const struct ctx *c, */ static void tcp_hash_insert(const struct ctx *c, struct tcp_tap_conn *conn) { - int b; + unsigned b = tcp_hash_probe(c, conn); - b = tcp_hash(c, &conn->faddr, conn->eport, conn->fport); - conn->next_index = tc_hash[b] ? FLOW_IDX(tc_hash[b]) : -1U; tc_hash[b] = conn; - - flow_dbg(conn, "hash table insert: sock %i, bucket: %i, next: %p", - conn->sock, b, (void *)conn_at_idx(conn->next_index)); + flow_dbg(conn, "hash table insert: sock %i, bucket: %u", conn->sock, b); } /** @@ -1221,23 +1228,27 @@ static void tcp_hash_insert(const struct ctx *c, struct tcp_tap_conn *conn) static void tcp_hash_remove(const struct ctx *c, const struct tcp_tap_conn *conn) { - struct tcp_tap_conn *entry, *prev = NULL; - int b = tcp_conn_hash(c, conn); + unsigned b = tcp_hash_probe(c, conn), s; - for (entry = tc_hash[b]; entry; - prev = entry, entry = conn_at_idx(entry->next_index)) { - if (entry == conn) { - if (prev) - prev->next_index = conn->next_index; - else - tc_hash[b] = conn_at_idx(conn->next_index); - break; + if (!tc_hash[b]) + return; /* Redundant remove */ + + flow_dbg(conn, "hash table remove: sock %i, bucket: %u", conn->sock, b); + + /* Scan the remainder of the cluster */ + for (s = (b + 1) % TCP_HASH_TABLE_SIZE; tc_hash[s]; + s = (s + 1) % TCP_HASH_TABLE_SIZE) { + unsigned h = tcp_conn_hash(c, tc_hash[s]); + + if (in_mod_range(h, b, s, TCP_HASH_TABLE_SIZE)) { + /* tc_hash[s] can live in tc_hash[b]'s slot */ + debug("hash table remove: shuffle %u -> %u", s, b); + tc_hash[b] = tc_hash[s]; + b = s; } } - flow_dbg(conn, "hash table 
remove: sock %i, bucket: %i, new: %p", - conn->sock, b, - (void *)(prev ? conn_at_idx(prev->next_index) : tc_hash[b])); + tc_hash[b] = NULL; } /** @@ -1250,24 +1261,15 @@ void tcp_tap_conn_update(const struct ctx *c, struct tcp_tap_conn *old, struct tcp_tap_conn *new) { - struct tcp_tap_conn *entry, *prev = NULL; - int b = tcp_conn_hash(c, old); + unsigned b = tcp_hash_probe(c, old); - for (entry = tc_hash[b]; entry; - prev = entry, entry = conn_at_idx(entry->next_index)) { - if (entry == old) { - if (prev) - prev->next_index = FLOW_IDX(new); - else - tc_hash[b] = new; - break; - } - } + if (!tc_hash[b]) + return; /* Not in hash table, nothing to update */ + + tc_hash[b] = new; debug("TCP: hash table update: old index %u, new index %u, sock %i, " - "bucket: %i, old: %p, new: %p", - FLOW_IDX(old), FLOW_IDX(new), new->sock, b, - (void *)old, (void *)new); + "bucket: %u", FLOW_IDX(old), FLOW_IDX(new), new->sock, b); tcp_epoll_ctl(c, new); } @@ -1287,17 +1289,16 @@ static struct tcp_tap_conn *tcp_hash_lookup(const struct ctx *c, in_port_t eport, in_port_t fport) { union inany_addr aany; - struct tcp_tap_conn *conn; - int b; + unsigned b; inany_from_af(&aany, af, faddr); - b = tcp_hash(c, &aany, eport, fport); - for (conn = tc_hash[b]; conn; conn = conn_at_idx(conn->next_index)) { - if (tcp_hash_match(conn, &aany, eport, fport)) - return conn; - } - return NULL; + for (b = tcp_hash(c, &aany, eport, fport); + tc_hash[b] && !tcp_hash_match(tc_hash[b], &aany, eport, fport); + b = (b + 1) % TCP_HASH_TABLE_SIZE) + ; + + return tc_hash[b]; } /** diff --git a/tcp_conn.h b/tcp_conn.h index 3900305..e3400bb 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -13,7 +13,6 @@ * struct tcp_tap_conn - Descriptor for a TCP connection (not spliced) * @f: Generic flow information * @in_epoll: Is the connection in the epoll set? 
- * @next_index: Connection index of next item in hash chain, -1 for none * @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS * @sock: Socket descriptor number * @events: Connection events, implying connection states @@ -40,7 +39,6 @@ struct tcp_tap_conn { struct flow_common f; bool in_epoll :1; - unsigned next_index :FLOW_INDEX_BITS + 2; #define TCP_RETRANS_BITS 3 unsigned int retrans :TCP_RETRANS_BITS; diff --git a/util.h b/util.h index b1106e8..4550a35 100644 --- a/util.h +++ b/util.h @@ -225,6 +225,19 @@ int __daemon(int pidfile_fd, int devnull_fd); int fls(unsigned long x); int write_file(const char *path, const char *buf); +/** + * in_mod_range() - Determine if a value is in a cyclic range + * @x, @i, @j: Unsigned values < @m + * @m: Modulus + * + * Returns true iff @x is in the cyclic range of values from @i..@j (mod @m), + * inclusive of @i, exclusive of @j. + */ +static inline bool in_mod_range(unsigned x, unsigned i, unsigned j, unsigned m) +{ + return ((x - i) % m) < ((j - i) % m); +} + /* * Workarounds for https://github.com/llvm/llvm-project/issues/58992 * -- 2.43.0
On Mon, 4 Dec 2023 14:16:09 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:Currently we deal with hash collisions by letting a hash bucket contain multiple entries, forming a linked list using an index in the connection structure. That's a pretty standard and simple approach, but in our case we can use an even simpler one: linear probing. Here if a hash bucket is occupied we just move onto the next one until we find a free one. This slightly simplifies lookup and more importantly saves some precious bytes in the connection structure by removing the need for a link. It does require some additional complexity for hash removal. This approach can perform poorly when hash table load is high. However, we already size our hash table of pointers larger than the connection table, which puts an upper bound on the load. It's relatively cheap to decrease that bound if we find we need to. I adapted the linear probing operations from Knuth's The Art of Computer Programming, Volume 3, 2nd Edition. Specifically Algorithm L and Algorithm R in Section 6.4. Note that there is an error in Algorithm R as printed, see errata at [0]. 
[0] https://www-cs-faculty.stanford.edu/~knuth/all3-prepre.ps.gz Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 111 +++++++++++++++++++++++++++-------------------------- tcp_conn.h | 2 - util.h | 13 +++++++ 3 files changed, 69 insertions(+), 57 deletions(-) diff --git a/tcp.c b/tcp.c index 17c7cba..09acf7f 100644 --- a/tcp.c +++ b/tcp.c @@ -573,22 +573,12 @@ static unsigned int tcp6_l2_flags_buf_used; #define CONN(idx) (&(FLOW(idx)->tcp)) -/** conn_at_idx() - Find a connection by index, if present - * @idx: Index of connection to lookup - * - * Return: pointer to connection, or NULL if @idx is out of bounds - */ -static inline struct tcp_tap_conn *conn_at_idx(unsigned idx) -{ - if (idx >= FLOW_MAX) - return NULL; - ASSERT(CONN(idx)->f.type == FLOW_TCP); - return CONN(idx); -} - /* Table for lookup from remote address, local port, remote port */ static struct tcp_tap_conn *tc_hash[TCP_HASH_TABLE_SIZE]; +static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX, + "Safe linear probing requires hash table larger than connection table"); + /* Pools for pre-opened sockets (in init) */ int init_sock_pool4 [TCP_SOCK_POOL_SIZE]; int init_sock_pool6 [TCP_SOCK_POOL_SIZE]; @@ -1196,6 +1186,27 @@ static unsigned int tcp_conn_hash(const struct ctx *c, return tcp_hash(c, &conn->faddr, conn->eport, conn->fport); } +/** + * tcp_hash_probe() - Find hash bucket for a connection + * @c: Execution context + * @conn: Connection to find bucket for + * + * Return: If @conn is in the table, its current bucket, otherwise a suitable + * free bucket for it. 
+ */ +static inline unsigned tcp_hash_probe(const struct ctx *c, + const struct tcp_tap_conn *conn) +{ + unsigned b; + + /* Linear probing */ + for (b = tcp_conn_hash(c, conn); tc_hash[b] && tc_hash[b] != conn; + b = (b + 1) % TCP_HASH_TABLE_SIZE) + ; + + return b; +} + /** * tcp_hash_insert() - Insert connection into hash table, chain link * @c: Execution context @@ -1203,14 +1214,10 @@ static unsigned int tcp_conn_hash(const struct ctx *c, */ static void tcp_hash_insert(const struct ctx *c, struct tcp_tap_conn *conn) { - int b; + unsigned b = tcp_hash_probe(c, conn); - b = tcp_hash(c, &conn->faddr, conn->eport, conn->fport); - conn->next_index = tc_hash[b] ? FLOW_IDX(tc_hash[b]) : -1U; tc_hash[b] = conn; - - flow_dbg(conn, "hash table insert: sock %i, bucket: %i, next: %p", - conn->sock, b, (void *)conn_at_idx(conn->next_index)); + flow_dbg(conn, "hash table insert: sock %i, bucket: %u", conn->sock, b); } /** @@ -1221,23 +1228,27 @@ static void tcp_hash_insert(const struct ctx *c, struct tcp_tap_conn *conn) static void tcp_hash_remove(const struct ctx *c, const struct tcp_tap_conn *conn) { - struct tcp_tap_conn *entry, *prev = NULL; - int b = tcp_conn_hash(c, conn); + unsigned b = tcp_hash_probe(c, conn), s; - for (entry = tc_hash[b]; entry; - prev = entry, entry = conn_at_idx(entry->next_index)) { - if (entry == conn) { - if (prev) - prev->next_index = conn->next_index; - else - tc_hash[b] = conn_at_idx(conn->next_index); - break; + if (!tc_hash[b]) + return; /* Redundant remove */ + + flow_dbg(conn, "hash table remove: sock %i, bucket: %u", conn->sock, b); + + /* Scan the remainder of the cluster */ + for (s = (b + 1) % TCP_HASH_TABLE_SIZE; tc_hash[s]; + s = (s + 1) % TCP_HASH_TABLE_SIZE) { + unsigned h = tcp_conn_hash(c, tc_hash[s]); + + if (in_mod_range(h, b, s, TCP_HASH_TABLE_SIZE)) { + /* tc_hash[s] can live in tc_hash[b]'s slot */ + debug("hash table remove: shuffle %u -> %u", s, b); + tc_hash[b] = tc_hash[s]; + b = s; } }This makes intuitively sense to 
me, but I can't wrap my head around the fact that it corresponds to algorithm R. Step R3 implies that, if h *is* (cyclically) between b and s, you should skip the move and go back to R2 right away. The condition here seems to be reversed, though. What am I missing? -- Stefano
On Wed, Dec 06, 2023 at 11:43:29PM +0100, Stefano Brivio wrote:On Mon, 4 Dec 2023 14:16:09 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:Ugh... this is doing my head in a bit, because there are a bunch of stacked negatives. Ok, so the original is: "If r lies cyclically between i and j, go back to R2" Or equivalently a loop body of if (in_mod_range(r, i, j)) continue; /* Step R4/R1 stuff */ Now in this version we have r => h, i => s and j => b, so if (in_mod_range(h, s, b)) continue; /* Step R4/R1 stuff */ Or equivalently if (!in_mod_range(h, s, b)) /* Step R4/R1 stuff */; And because of how "cyclically between" works, that becomes: if (in_mod_range(h, b, s)) /* Step R4/R1 stuff */; Which is what I have. But... the original is probing backwards through buckets whereas I'm probing forwards, which I think reverses it again. Yeah.. I'm pretty sure my version is wrong. If h == (s-1) > b, for example, it definitely can't live in slot b, because probing would start at slot s-1, and never reach b. Hrm.. I'm going to switch mine to stepping backwards, like the Knuth, just so I'm less confused. -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibsonCurrently we deal with hash collisions by letting a hash bucket contain multiple entries, forming a linked list using an index in the connection structure. That's a pretty standard and simple approach, but in our case we can use an even simpler one: linear probing. Here if a hash bucket is occupied we just move onto the next one until we find a feww one. This slightly simplifies lookup and more importantly saves some precious bytes in the connection structure by removing the need for a link. It does require some additional complexity for hash removal. This approach can perform poorly with hash table load is high. 
However, we already size our hash table of pointers larger than the connection table, which puts an upper bound on the load. It's relatively cheap to decrease that bound if we find we need to. I adapted the linear probing operations from Knuth's The Art of Computer Programming, Volume 3, 2nd Edition. Specifically Algorithm L and Algorithm R in Section 6.4. Note that there is an error in Algorithm R as printed, see errata at [0]. [0] https://www-cs-faculty.stanford.edu/~knuth/all3-prepre.ps.gz Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 111 +++++++++++++++++++++++++++-------------------------- tcp_conn.h | 2 - util.h | 13 +++++++ 3 files changed, 69 insertions(+), 57 deletions(-) diff --git a/tcp.c b/tcp.c index 17c7cba..09acf7f 100644 --- a/tcp.c +++ b/tcp.c @@ -573,22 +573,12 @@ static unsigned int tcp6_l2_flags_buf_used; #define CONN(idx) (&(FLOW(idx)->tcp)) -/** conn_at_idx() - Find a connection by index, if present - * @idx: Index of connection to lookup - * - * Return: pointer to connection, or NULL if @idx is out of bounds - */ -static inline struct tcp_tap_conn *conn_at_idx(unsigned idx) -{ - if (idx >= FLOW_MAX) - return NULL; - ASSERT(CONN(idx)->f.type == FLOW_TCP); - return CONN(idx); -} - /* Table for lookup from remote address, local port, remote port */ static struct tcp_tap_conn *tc_hash[TCP_HASH_TABLE_SIZE]; +static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX, + "Safe linear probing requires hash table larger than connection table"); + /* Pools for pre-opened sockets (in init) */ int init_sock_pool4 [TCP_SOCK_POOL_SIZE]; int init_sock_pool6 [TCP_SOCK_POOL_SIZE]; @@ -1196,6 +1186,27 @@ static unsigned int tcp_conn_hash(const struct ctx *c, return tcp_hash(c, &conn->faddr, conn->eport, conn->fport); } +/** + * tcp_hash_probe() - Find hash bucket for a connection + * @c: Execution context + * @conn: Connection to find bucket for + * + * Return: If @conn is in the table, its current bucket, otherwise a suitable + * free bucket for 
it. + */ +static inline unsigned tcp_hash_probe(const struct ctx *c, + const struct tcp_tap_conn *conn) +{ + unsigned b; + + /* Linear probing */ + for (b = tcp_conn_hash(c, conn); tc_hash[b] && tc_hash[b] != conn; + b = (b + 1) % TCP_HASH_TABLE_SIZE) + ; + + return b; +} + /** * tcp_hash_insert() - Insert connection into hash table, chain link * @c: Execution context @@ -1203,14 +1214,10 @@ static unsigned int tcp_conn_hash(const struct ctx *c, */ static void tcp_hash_insert(const struct ctx *c, struct tcp_tap_conn *conn) { - int b; + unsigned b = tcp_hash_probe(c, conn); - b = tcp_hash(c, &conn->faddr, conn->eport, conn->fport); - conn->next_index = tc_hash[b] ? FLOW_IDX(tc_hash[b]) : -1U; tc_hash[b] = conn; - - flow_dbg(conn, "hash table insert: sock %i, bucket: %i, next: %p", - conn->sock, b, (void *)conn_at_idx(conn->next_index)); + flow_dbg(conn, "hash table insert: sock %i, bucket: %u", conn->sock, b); } /** @@ -1221,23 +1228,27 @@ static void tcp_hash_insert(const struct ctx *c, struct tcp_tap_conn *conn) static void tcp_hash_remove(const struct ctx *c, const struct tcp_tap_conn *conn) { - struct tcp_tap_conn *entry, *prev = NULL; - int b = tcp_conn_hash(c, conn); + unsigned b = tcp_hash_probe(c, conn), s; - for (entry = tc_hash[b]; entry; - prev = entry, entry = conn_at_idx(entry->next_index)) { - if (entry == conn) { - if (prev) - prev->next_index = conn->next_index; - else - tc_hash[b] = conn_at_idx(conn->next_index); - break; + if (!tc_hash[b]) + return; /* Redundant remove */ + + flow_dbg(conn, "hash table remove: sock %i, bucket: %u", conn->sock, b); + + /* Scan the remainder of the cluster */ + for (s = (b + 1) % TCP_HASH_TABLE_SIZE; tc_hash[s]; + s = (s + 1) % TCP_HASH_TABLE_SIZE) { + unsigned h = tcp_conn_hash(c, tc_hash[s]); + + if (in_mod_range(h, b, s, TCP_HASH_TABLE_SIZE)) { + /* tc_hash[s] can live in tc_hash[b]'s slot */ + debug("hash table remove: shuffle %u -> %u", s, b); + tc_hash[b] = tc_hash[s]; + b = s; } }This makes intuitively sense 
to me, but I can't wrap my head around the fact that it corresponds to algorithm R. Step R3 implies that, if h *is* (cyclically) between b and s, you should skip the move and go back to R2 right away. The condition here seems to be reversed, though. What am I missing?
On Thu, 7 Dec 2023 15:11:42 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:On Wed, Dec 06, 2023 at 11:43:29PM +0100, Stefano Brivio wrote:Hah, this ^^^On Mon, 4 Dec 2023 14:16:09 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:Ugh... this is doing my head in a bit, because there are a bunch of stacked negatives. Ok, so the original is: "If r lies cyclically between i and j, go back to R2" Or equivalently a loop body of if (in_mod_range(r, i, j)) continue; /* Step R4/R1 stuff */ Now in this version we have r => h, i => s and j => b, so if (in_mod_range(h, s, b)) continue; /* Step R4/R1 stuff */ Or equivalently if (!in_mod_range(h, s, b)) /* Step R4/R1 stuff */; And because of how "cyclically between" works, that becomes:Currently we deal with hash collisions by letting a hash bucket contain multiple entries, forming a linked list using an index in the connection structure. That's a pretty standard and simple approach, but in our case we can use an even simpler one: linear probing. Here if a hash bucket is occupied we just move onto the next one until we find a feww one. This slightly simplifies lookup and more importantly saves some precious bytes in the connection structure by removing the need for a link. It does require some additional complexity for hash removal. This approach can perform poorly with hash table load is high. However, we already size our hash table of pointers larger than the connection table, which puts an upper bound on the load. It's relatively cheap to decrease that bound if we find we need to. I adapted the linear probing operations from Knuth's The Art of Computer Programming, Volume 3, 2nd Edition. Specifically Algorithm L and Algorithm R in Section 6.4. Note that there is an error in Algorithm R as printed, see errata at [0]. 
[0] https://www-cs-faculty.stanford.edu/~knuth/all3-prepre.ps.gz Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 111 +++++++++++++++++++++++++++-------------------------- tcp_conn.h | 2 - util.h | 13 +++++++ 3 files changed, 69 insertions(+), 57 deletions(-) diff --git a/tcp.c b/tcp.c index 17c7cba..09acf7f 100644 --- a/tcp.c +++ b/tcp.c @@ -573,22 +573,12 @@ static unsigned int tcp6_l2_flags_buf_used; #define CONN(idx) (&(FLOW(idx)->tcp)) -/** conn_at_idx() - Find a connection by index, if present - * @idx: Index of connection to lookup - * - * Return: pointer to connection, or NULL if @idx is out of bounds - */ -static inline struct tcp_tap_conn *conn_at_idx(unsigned idx) -{ - if (idx >= FLOW_MAX) - return NULL; - ASSERT(CONN(idx)->f.type == FLOW_TCP); - return CONN(idx); -} - /* Table for lookup from remote address, local port, remote port */ static struct tcp_tap_conn *tc_hash[TCP_HASH_TABLE_SIZE]; +static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX, + "Safe linear probing requires hash table larger than connection table"); + /* Pools for pre-opened sockets (in init) */ int init_sock_pool4 [TCP_SOCK_POOL_SIZE]; int init_sock_pool6 [TCP_SOCK_POOL_SIZE]; @@ -1196,6 +1186,27 @@ static unsigned int tcp_conn_hash(const struct ctx *c, return tcp_hash(c, &conn->faddr, conn->eport, conn->fport); } +/** + * tcp_hash_probe() - Find hash bucket for a connection + * @c: Execution context + * @conn: Connection to find bucket for + * + * Return: If @conn is in the table, its current bucket, otherwise a suitable + * free bucket for it. 
+ */ +static inline unsigned tcp_hash_probe(const struct ctx *c, + const struct tcp_tap_conn *conn) +{ + unsigned b; + + /* Linear probing */ + for (b = tcp_conn_hash(c, conn); tc_hash[b] && tc_hash[b] != conn; + b = (b + 1) % TCP_HASH_TABLE_SIZE) + ; + + return b; +} + /** * tcp_hash_insert() - Insert connection into hash table, chain link * @c: Execution context @@ -1203,14 +1214,10 @@ static unsigned int tcp_conn_hash(const struct ctx *c, */ static void tcp_hash_insert(const struct ctx *c, struct tcp_tap_conn *conn) { - int b; + unsigned b = tcp_hash_probe(c, conn); - b = tcp_hash(c, &conn->faddr, conn->eport, conn->fport); - conn->next_index = tc_hash[b] ? FLOW_IDX(tc_hash[b]) : -1U; tc_hash[b] = conn; - - flow_dbg(conn, "hash table insert: sock %i, bucket: %i, next: %p", - conn->sock, b, (void *)conn_at_idx(conn->next_index)); + flow_dbg(conn, "hash table insert: sock %i, bucket: %u", conn->sock, b); } /** @@ -1221,23 +1228,27 @@ static void tcp_hash_insert(const struct ctx *c, struct tcp_tap_conn *conn) static void tcp_hash_remove(const struct ctx *c, const struct tcp_tap_conn *conn) { - struct tcp_tap_conn *entry, *prev = NULL; - int b = tcp_conn_hash(c, conn); + unsigned b = tcp_hash_probe(c, conn), s; - for (entry = tc_hash[b]; entry; - prev = entry, entry = conn_at_idx(entry->next_index)) { - if (entry == conn) { - if (prev) - prev->next_index = conn->next_index; - else - tc_hash[b] = conn_at_idx(conn->next_index); - break; + if (!tc_hash[b]) + return; /* Redundant remove */ + + flow_dbg(conn, "hash table remove: sock %i, bucket: %u", conn->sock, b); + + /* Scan the remainder of the cluster */ + for (s = (b + 1) % TCP_HASH_TABLE_SIZE; tc_hash[s]; + s = (s + 1) % TCP_HASH_TABLE_SIZE) { + unsigned h = tcp_conn_hash(c, tc_hash[s]); + + if (in_mod_range(h, b, s, TCP_HASH_TABLE_SIZE)) { + /* tc_hash[s] can live in tc_hash[b]'s slot */ + debug("hash table remove: shuffle %u -> %u", s, b); + tc_hash[b] = tc_hash[s]; + b = s; } }This makes intuitively sense to 
me, but I can't wrap my head around the fact that it corresponds to algorithm R. Step R3 implies that, if h *is* (cyclically) between b and s, you should skip the move and go back to R2 right away. The condition here seems to be reversed, though. What am I missing?if (in_mod_range(h, b, s)) /* Step R4/R1 stuff */;is what I was missing. -- Stefano
We implement our hash table with pointers to the entry for each bucket (or NULL). However, the entries are always allocated within the flow table, meaning that a flow index will suffice, halving the size of the hash table. For TCP, just a flow index would be enough, but future uses will want to expand the hash table to cover indexing either side of a flow, so use a flow_sidx_t as the type for each hash bucket. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- flow.h | 11 +++++++++++ tcp.c | 34 +++++++++++++++++++++++----------- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/flow.h b/flow.h index c2a5190..959b461 100644 --- a/flow.h +++ b/flow.h @@ -53,6 +53,17 @@ static_assert(sizeof(flow_sidx_t) <= sizeof(uint32_t), #define FLOW_SIDX_NONE ((flow_sidx_t){ .flow = FLOW_MAX }) +/** + * flow_sidx_eq() - Test if two sidx values are equal + * @a, @b: sidx values + * + * Return: true iff @a and @b refer to the same side of the same flow + */ +static inline bool flow_sidx_eq(flow_sidx_t a, flow_sidx_t b) +{ + return (a.flow == b.flow) && (a.side == b.side); +} + union flow; void flow_table_compact(struct ctx *c, union flow *hole); diff --git a/tcp.c b/tcp.c index 09acf7f..7e438b7 100644 --- a/tcp.c +++ b/tcp.c @@ -574,7 +574,7 @@ static unsigned int tcp6_l2_flags_buf_used; #define CONN(idx) (&(FLOW(idx)->tcp)) /* Table for lookup from remote address, local port, remote port */ -static struct tcp_tap_conn *tc_hash[TCP_HASH_TABLE_SIZE]; +static flow_sidx_t tc_hash[TCP_HASH_TABLE_SIZE]; static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX, "Safe linear probing requires hash table larger than connection table"); @@ -1197,10 +1197,13 @@ static unsigned int tcp_conn_hash(const struct ctx *c, static inline unsigned tcp_hash_probe(const struct ctx *c, const struct tcp_tap_conn *conn) { + flow_sidx_t sidx = FLOW_SIDX(conn, TAPSIDE); unsigned b; /* Linear probing */ - for (b = tcp_conn_hash(c, conn); tc_hash[b] && tc_hash[b] != conn; + for (b = 
tcp_conn_hash(c, conn); + !flow_sidx_eq(tc_hash[b], FLOW_SIDX_NONE) && + !flow_sidx_eq(tc_hash[b], sidx); b = (b + 1) % TCP_HASH_TABLE_SIZE) ; @@ -1216,7 +1219,7 @@ static void tcp_hash_insert(const struct ctx *c, struct tcp_tap_conn *conn) { unsigned b = tcp_hash_probe(c, conn); - tc_hash[b] = conn; + tc_hash[b] = FLOW_SIDX(conn, TAPSIDE); flow_dbg(conn, "hash table insert: sock %i, bucket: %u", conn->sock, b); } @@ -1229,16 +1232,18 @@ static void tcp_hash_remove(const struct ctx *c, const struct tcp_tap_conn *conn) { unsigned b = tcp_hash_probe(c, conn), s; + union flow *flow = flow_at_sidx(tc_hash[b]); - if (!tc_hash[b]) + if (!flow) return; /* Redundant remove */ flow_dbg(conn, "hash table remove: sock %i, bucket: %u", conn->sock, b); /* Scan the remainder of the cluster */ - for (s = (b + 1) % TCP_HASH_TABLE_SIZE; tc_hash[s]; + for (s = (b + 1) % TCP_HASH_TABLE_SIZE; + (flow = flow_at_sidx(tc_hash[s])); s = (s + 1) % TCP_HASH_TABLE_SIZE) { - unsigned h = tcp_conn_hash(c, tc_hash[s]); + unsigned h = tcp_conn_hash(c, &flow->tcp); if (in_mod_range(h, b, s, TCP_HASH_TABLE_SIZE)) { /* tc_hash[s] can live in tc_hash[b]'s slot */ @@ -1248,7 +1253,7 @@ static void tcp_hash_remove(const struct ctx *c, } } - tc_hash[b] = NULL; + tc_hash[b] = FLOW_SIDX_NONE; } /** @@ -1263,10 +1268,10 @@ void tcp_tap_conn_update(const struct ctx *c, struct tcp_tap_conn *old, { unsigned b = tcp_hash_probe(c, old); - if (!tc_hash[b]) + if (!flow_at_sidx(tc_hash[b])) return; /* Not in hash table, nothing to update */ - tc_hash[b] = new; + tc_hash[b] = FLOW_SIDX(new, TAPSIDE); debug("TCP: hash table update: old index %u, new index %u, sock %i, " "bucket: %u", FLOW_IDX(old), FLOW_IDX(new), new->sock, b); @@ -1289,16 +1294,18 @@ static struct tcp_tap_conn *tcp_hash_lookup(const struct ctx *c, in_port_t eport, in_port_t fport) { union inany_addr aany; + union flow *flow; unsigned b; inany_from_af(&aany, af, faddr); for (b = tcp_hash(c, &aany, eport, fport); - tc_hash[b] && 
!tcp_hash_match(tc_hash[b], &aany, eport, fport); + (flow = flow_at_sidx(tc_hash[b])) + && !tcp_hash_match(&flow->tcp, &aany, eport, fport); b = (b + 1) % TCP_HASH_TABLE_SIZE) ; - return tc_hash[b]; + return &flow->tcp; } /** @@ -3090,6 +3097,11 @@ static void tcp_sock_refill_init(const struct ctx *c) */ int tcp_init(struct ctx *c) { + unsigned b; + + for (b = 0; b < TCP_HASH_TABLE_SIZE; b++) + tc_hash[b] = FLOW_SIDX_NONE; + if (c->ifi4) tcp_sock4_iov_init(c); -- 2.43.0
On Mon, 4 Dec 2023 14:16:10 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:We implement our hash table with pointers to the entry for each bucket (or NULL). However, the entries are always allocated within the flow table, meaning that a flow index will suffice, halving the size of the hash table. For TCP, just a flow index would be enough, but future uses will want to expand the hash table to cover indexing either side of a flow, so use a flow_sidx_t as the type for each hash bucket. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- flow.h | 11 +++++++++++ tcp.c | 34 +++++++++++++++++++++++----------- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/flow.h b/flow.h index c2a5190..959b461 100644 --- a/flow.h +++ b/flow.h @@ -53,6 +53,17 @@ static_assert(sizeof(flow_sidx_t) <= sizeof(uint32_t), #define FLOW_SIDX_NONE ((flow_sidx_t){ .flow = FLOW_MAX })In hindsight, while reviewing the functions below: FLOW_MAX should probably be MAX_FROM_BITS(FLOW_INDEX_BITS) - 1 instead (and those >= comparisons would happily become >), so that we don't need to have a "maximum" value that's also not allowed (then, strictly speaking, it's more than the maximum).+/** + * flow_sidx_eq() - Test if two sidx values are equal + * @a, @b: sidx values + * + * Return: true iff @a and @b refer to the same side of the same flow + */ +static inline bool flow_sidx_eq(flow_sidx_t a, flow_sidx_t b) +{ + return (a.flow == b.flow) && (a.side == b.side); +} + union flow; void flow_table_compact(struct ctx *c, union flow *hole); diff --git a/tcp.c b/tcp.c index 09acf7f..7e438b7 100644 --- a/tcp.c +++ b/tcp.c @@ -574,7 +574,7 @@ static unsigned int tcp6_l2_flags_buf_used; #define CONN(idx) (&(FLOW(idx)->tcp)) /* Table for lookup from remote address, local port, remote port */ -static struct tcp_tap_conn *tc_hash[TCP_HASH_TABLE_SIZE]; +static flow_sidx_t tc_hash[TCP_HASH_TABLE_SIZE]; static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX, "Safe linear probing requires hash 
table larger than connection table"); @@ -1197,10 +1197,13 @@ static unsigned int tcp_conn_hash(const struct ctx *c, static inline unsigned tcp_hash_probe(const struct ctx *c, const struct tcp_tap_conn *conn) { + flow_sidx_t sidx = FLOW_SIDX(conn, TAPSIDE); unsigned b; /* Linear probing */ - for (b = tcp_conn_hash(c, conn); tc_hash[b] && tc_hash[b] != conn; + for (b = tcp_conn_hash(c, conn); + !flow_sidx_eq(tc_hash[b], FLOW_SIDX_NONE) &&Do we actually need to check for FLOW_SIDX_NONE explicitly? That is, sidx we get as input here should never be FLOW_SIDX_NONE. I wonder if it makes sense to take care of the possible "overflow" outcome from step L4. of algorithm L you mentioned in 1/3. It *shouldn't* because you're enforcing the minimum size of the hash table, I wonder if it's a good idea anyway.+ !flow_sidx_eq(tc_hash[b], sidx); b = (b + 1) % TCP_HASH_TABLE_SIZE) ;I respect the fact that this is fundamentally a for loop. :) On the other hand: unsigned b = tcp_conn_hash(c, conn); while (!flow_sidx_eq(tc_hash[b], sidx)) b = (b + 1) % TCP_HASH_TABLE_SIZE); ...would be a bit more readable?@@ -1216,7 +1219,7 @@ static void tcp_hash_insert(const struct ctx *c, struct tcp_tap_conn *conn) { unsigned b = tcp_hash_probe(c, conn); - tc_hash[b] = conn; + tc_hash[b] = FLOW_SIDX(conn, TAPSIDE); flow_dbg(conn, "hash table insert: sock %i, bucket: %u", conn->sock, b); } @@ -1229,16 +1232,18 @@ static void tcp_hash_remove(const struct ctx *c, const struct tcp_tap_conn *conn) { unsigned b = tcp_hash_probe(c, conn), s; + union flow *flow = flow_at_sidx(tc_hash[b]); - if (!tc_hash[b]) + if (!flow) return; /* Redundant remove */ flow_dbg(conn, "hash table remove: sock %i, bucket: %u", conn->sock, b); /* Scan the remainder of the cluster */ - for (s = (b + 1) % TCP_HASH_TABLE_SIZE; tc_hash[s]; + for (s = (b + 1) % TCP_HASH_TABLE_SIZE; + (flow = flow_at_sidx(tc_hash[s])); s = (s + 1) % TCP_HASH_TABLE_SIZE) { - unsigned h = tcp_conn_hash(c, tc_hash[s]); + unsigned h = tcp_conn_hash(c, 
&flow->tcp); if (in_mod_range(h, b, s, TCP_HASH_TABLE_SIZE)) { /* tc_hash[s] can live in tc_hash[b]'s slot */ @@ -1248,7 +1253,7 @@ static void tcp_hash_remove(const struct ctx *c, } } - tc_hash[b] = NULL; + tc_hash[b] = FLOW_SIDX_NONE; } /** @@ -1263,10 +1268,10 @@ void tcp_tap_conn_update(const struct ctx *c, struct tcp_tap_conn *old, { unsigned b = tcp_hash_probe(c, old); - if (!tc_hash[b]) + if (!flow_at_sidx(tc_hash[b])) return; /* Not in hash table, nothing to update */ - tc_hash[b] = new; + tc_hash[b] = FLOW_SIDX(new, TAPSIDE); debug("TCP: hash table update: old index %u, new index %u, sock %i, " "bucket: %u", FLOW_IDX(old), FLOW_IDX(new), new->sock, b); @@ -1289,16 +1294,18 @@ static struct tcp_tap_conn *tcp_hash_lookup(const struct ctx *c, in_port_t eport, in_port_t fport) { union inany_addr aany; + union flow *flow; unsigned b; inany_from_af(&aany, af, faddr); for (b = tcp_hash(c, &aany, eport, fport); - tc_hash[b] && !tcp_hash_match(tc_hash[b], &aany, eport, fport); + (flow = flow_at_sidx(tc_hash[b])) + && !tcp_hash_match(&flow->tcp, &aany, eport, fport);Same as above about readability (somehow clashing with correctness).b = (b + 1) % TCP_HASH_TABLE_SIZE) ; - return tc_hash[b]; + return &flow->tcp; } /** @@ -3090,6 +3097,11 @@ static void tcp_sock_refill_init(const struct ctx *c) */ int tcp_init(struct ctx *c) { + unsigned b; + + for (b = 0; b < TCP_HASH_TABLE_SIZE; b++) + tc_hash[b] = FLOW_SIDX_NONE; + if (c->ifi4) tcp_sock4_iov_init(c);-- Stefano
On Wed, Dec 06, 2023 at 08:37:27PM +0100, Stefano Brivio wrote:On Mon, 4 Dec 2023 14:16:10 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:Right, either that or name the variable MAX_NUM_FLOWS or something. Eh, whatever.We implement our hash table with pointers to the entry for each bucket (or NULL). However, the entries are always allocated within the flow table, meaning that a flow index will suffice, halving the size of the hash table. For TCP, just a flow index would be enough, but future uses will want to expand the hash table to cover indexing either side of a flow, so use a flow_sidx_t as the type for each hash bucket. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- flow.h | 11 +++++++++++ tcp.c | 34 +++++++++++++++++++++++----------- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/flow.h b/flow.h index c2a5190..959b461 100644 --- a/flow.h +++ b/flow.h @@ -53,6 +53,17 @@ static_assert(sizeof(flow_sidx_t) <= sizeof(uint32_t), #define FLOW_SIDX_NONE ((flow_sidx_t){ .flow = FLOW_MAX })In hindsight, while reviewing the functions below: FLOW_MAX should probably be MAX_FROM_BITS(FLOW_INDEX_BITS) - 1 instead (and those >= comparisons would happily become >), so that we don't need to have a "maximum" value that's also not allowed (then, strictly speaking, it's more than the maximum).Yes: we need to stop when we reach something matching @sidx *or* we hit an empty entry. 
Otherwise we'll never terminate if the entry isn't in there.+/** + * flow_sidx_eq() - Test if two sidx values are equal + * @a, @b: sidx values + * + * Return: true iff @a and @b refer to the same side of the same flow + */ +static inline bool flow_sidx_eq(flow_sidx_t a, flow_sidx_t b) +{ + return (a.flow == b.flow) && (a.side == b.side); +} + union flow; void flow_table_compact(struct ctx *c, union flow *hole); diff --git a/tcp.c b/tcp.c index 09acf7f..7e438b7 100644 --- a/tcp.c +++ b/tcp.c @@ -574,7 +574,7 @@ static unsigned int tcp6_l2_flags_buf_used; #define CONN(idx) (&(FLOW(idx)->tcp)) /* Table for lookup from remote address, local port, remote port */ -static struct tcp_tap_conn *tc_hash[TCP_HASH_TABLE_SIZE]; +static flow_sidx_t tc_hash[TCP_HASH_TABLE_SIZE]; static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX, "Safe linear probing requires hash table larger than connection table"); @@ -1197,10 +1197,13 @@ static unsigned int tcp_conn_hash(const struct ctx *c, static inline unsigned tcp_hash_probe(const struct ctx *c, const struct tcp_tap_conn *conn) { + flow_sidx_t sidx = FLOW_SIDX(conn, TAPSIDE); unsigned b; /* Linear probing */ - for (b = tcp_conn_hash(c, conn); tc_hash[b] && tc_hash[b] != conn; + for (b = tcp_conn_hash(c, conn); + !flow_sidx_eq(tc_hash[b], FLOW_SIDX_NONE) &&Do we actually need to check for FLOW_SIDX_NONE explicitly? That is, sidx we get as input here should never be FLOW_SIDX_NONE.I wonder if it makes sense to take care of the possible "overflow" outcome from step L4. of algorithm L you mentioned in 1/3. It *shouldn't* because you're enforcing the minimum size of the hash table, I wonder if it's a good idea anyway.Yeah, I wondered that too, it's probably a good idea for safety. I'll look at implementing that.Hm, fair point. 
I think the while looked uglier in some earlier versions before I added the _probe() helper so it was duplicated in several places.+ !flow_sidx_eq(tc_hash[b], sidx); b = (b + 1) % TCP_HASH_TABLE_SIZE) ;I respect the fact that this is fundamentally a for loop. :) On the other hand: unsigned b = tcp_conn_hash(c, conn); while (!flow_sidx_eq(tc_hash[b], sidx)) b = (b + 1) % TCP_HASH_TABLE_SIZE); ...would be a bit more readable?-- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson@@ -1216,7 +1219,7 @@ static void tcp_hash_insert(const struct ctx *c, struct tcp_tap_conn *conn) { unsigned b = tcp_hash_probe(c, conn); - tc_hash[b] = conn; + tc_hash[b] = FLOW_SIDX(conn, TAPSIDE); flow_dbg(conn, "hash table insert: sock %i, bucket: %u", conn->sock, b); } @@ -1229,16 +1232,18 @@ static void tcp_hash_remove(const struct ctx *c, const struct tcp_tap_conn *conn) { unsigned b = tcp_hash_probe(c, conn), s; + union flow *flow = flow_at_sidx(tc_hash[b]); - if (!tc_hash[b]) + if (!flow) return; /* Redundant remove */ flow_dbg(conn, "hash table remove: sock %i, bucket: %u", conn->sock, b); /* Scan the remainder of the cluster */ - for (s = (b + 1) % TCP_HASH_TABLE_SIZE; tc_hash[s]; + for (s = (b + 1) % TCP_HASH_TABLE_SIZE; + (flow = flow_at_sidx(tc_hash[s])); s = (s + 1) % TCP_HASH_TABLE_SIZE) { - unsigned h = tcp_conn_hash(c, tc_hash[s]); + unsigned h = tcp_conn_hash(c, &flow->tcp); if (in_mod_range(h, b, s, TCP_HASH_TABLE_SIZE)) { /* tc_hash[s] can live in tc_hash[b]'s slot */ @@ -1248,7 +1253,7 @@ static void tcp_hash_remove(const struct ctx *c, } } - tc_hash[b] = NULL; + tc_hash[b] = FLOW_SIDX_NONE; } /** @@ -1263,10 +1268,10 @@ void tcp_tap_conn_update(const struct ctx *c, struct tcp_tap_conn *old, { unsigned b = tcp_hash_probe(c, old); - if (!tc_hash[b]) + if (!flow_at_sidx(tc_hash[b])) return; /* Not in hash table, nothing to update */ - tc_hash[b] = 
new; + tc_hash[b] = FLOW_SIDX(new, TAPSIDE); debug("TCP: hash table update: old index %u, new index %u, sock %i, " "bucket: %u", FLOW_IDX(old), FLOW_IDX(new), new->sock, b); @@ -1289,16 +1294,18 @@ static struct tcp_tap_conn *tcp_hash_lookup(const struct ctx *c, in_port_t eport, in_port_t fport) { union inany_addr aany; + union flow *flow; unsigned b; inany_from_af(&aany, af, faddr); for (b = tcp_hash(c, &aany, eport, fport); - tc_hash[b] && !tcp_hash_match(tc_hash[b], &aany, eport, fport); + (flow = flow_at_sidx(tc_hash[b])) + && !tcp_hash_match(&flow->tcp, &aany, eport, fport);Same as above about readability (somehow clashing with correctness).b = (b + 1) % TCP_HASH_TABLE_SIZE) ; - return tc_hash[b]; + return &flow->tcp; } /** @@ -3090,6 +3097,11 @@ static void tcp_sock_refill_init(const struct ctx *c) */ int tcp_init(struct ctx *c) { + unsigned b; + + for (b = 0; b < TCP_HASH_TABLE_SIZE; b++) + tc_hash[b] = FLOW_SIDX_NONE; + if (c->ifi4) tcp_sock4_iov_init(c);
On Thu, Dec 07, 2023 at 12:04:18PM +1100, David Gibson wrote: On Wed, Dec 06, 2023 at 08:37:27PM +0100, Stefano Brivio wrote: > On Mon, 4 Dec 2023 14:16:10 +1100 > David Gibson <david(a)gibson.dropbear.id.au> wrote: [snip] Hrm.. so this turns out to be trickier than I thought. The difficulty is that it means hash_probe() now needs to be able to return a failure for the "table full" case. That makes the signature much uglier to deal with. I can still do it if you think it's worth it, but I'll post v2 without that change. -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson I wonder if it makes sense to take care of the possible "overflow" outcome from step L4. of algorithm L you mentioned in 1/3. It *shouldn't* because you're enforcing the minimum size of the hash table, I wonder if it's a good idea anyway. Yeah, I wondered that too, it's probably a good idea for safety. I'll look at implementing that.
On Thu, 7 Dec 2023 12:04:18 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:On Wed, Dec 06, 2023 at 08:37:27PM +0100, Stefano Brivio wrote:Ah, right, sorry, for a moment I read this as: !flow_sidx_eq(tc_hash[b], FLOW_SIDX_NONE) && flow_sidx_eq(tc_hash[b], sidx); where sidx != FLOW_SIDX_NONE would have the first comparison redundant. But it's not the case, of course. -- StefanoOn Mon, 4 Dec 2023 14:16:10 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:Right, either that or name the variable MAX_NUM_FLOWS or something. Eh, whatever.We implement our hash table with pointers to the entry for each bucket (or NULL). However, the entries are always allocated within the flow table, meaning that a flow index will suffice, halving the size of the hash table. For TCP, just a flow index would be enough, but future uses will want to expand the hash table to cover indexing either side of a flow, so use a flow_sidx_t as the type for each hash bucket. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- flow.h | 11 +++++++++++ tcp.c | 34 +++++++++++++++++++++++----------- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/flow.h b/flow.h index c2a5190..959b461 100644 --- a/flow.h +++ b/flow.h @@ -53,6 +53,17 @@ static_assert(sizeof(flow_sidx_t) <= sizeof(uint32_t), #define FLOW_SIDX_NONE ((flow_sidx_t){ .flow = FLOW_MAX })In hindsight, while reviewing the functions below: FLOW_MAX should probably be MAX_FROM_BITS(FLOW_INDEX_BITS) - 1 instead (and those >= comparisons would happily become >), so that we don't need to have a "maximum" value that's also not allowed (then, strictly speaking, it's more than the maximum).Yes: we need to stop when we reach something matching @sidx *or* we hit an empty entry. 
Otherwise we'll never terminate if the entry isn't in there.+/** + * flow_sidx_eq() - Test if two sidx values are equal + * @a, @b: sidx values + * + * Return: true iff @a and @b refer to the same side of the same flow + */ +static inline bool flow_sidx_eq(flow_sidx_t a, flow_sidx_t b) +{ + return (a.flow == b.flow) && (a.side == b.side); +} + union flow; void flow_table_compact(struct ctx *c, union flow *hole); diff --git a/tcp.c b/tcp.c index 09acf7f..7e438b7 100644 --- a/tcp.c +++ b/tcp.c @@ -574,7 +574,7 @@ static unsigned int tcp6_l2_flags_buf_used; #define CONN(idx) (&(FLOW(idx)->tcp)) /* Table for lookup from remote address, local port, remote port */ -static struct tcp_tap_conn *tc_hash[TCP_HASH_TABLE_SIZE]; +static flow_sidx_t tc_hash[TCP_HASH_TABLE_SIZE]; static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX, "Safe linear probing requires hash table larger than connection table"); @@ -1197,10 +1197,13 @@ static unsigned int tcp_conn_hash(const struct ctx *c, static inline unsigned tcp_hash_probe(const struct ctx *c, const struct tcp_tap_conn *conn) { + flow_sidx_t sidx = FLOW_SIDX(conn, TAPSIDE); unsigned b; /* Linear probing */ - for (b = tcp_conn_hash(c, conn); tc_hash[b] && tc_hash[b] != conn; + for (b = tcp_conn_hash(c, conn); + !flow_sidx_eq(tc_hash[b], FLOW_SIDX_NONE) &&Do we actually need to check for FLOW_SIDX_NONE explicitly? That is, sidx we get as input here should never be FLOW_SIDX_NONE.
Currently tcp_hash() returns the hash bucket for a value, that is the hash modulo the size of the hash table. Usually it's a bit more flexible to have hash functions return a "raw" hash value and perform the modulus in the callers. That allows the same hash function to be used for multiple tables of different sizes, or to re-use the hash for other purposes. We don't do anything like that with tcp_hash() at present, but we have some plans to do so. Prepare for that by making tcp_hash() and tcp_conn_hash() return raw hash values. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/tcp.c b/tcp.c index 7e438b7..6b12feb 100644 --- a/tcp.c +++ b/tcp.c @@ -1159,18 +1159,15 @@ static int tcp_hash_match(const struct tcp_tap_conn *conn, * @eport: Guest side endpoint port * @fport: Guest side forwarding port * - * Return: hash value, already modulo size of the hash table + * Return: hash value, needs to be adjusted for table size */ -static unsigned int tcp_hash(const struct ctx *c, const union inany_addr *faddr, - in_port_t eport, in_port_t fport) +static uint64_t tcp_hash(const struct ctx *c, const union inany_addr *faddr, + in_port_t eport, in_port_t fport) { struct siphash_state state = SIPHASH_INIT(c->hash_secret); - uint64_t hash; inany_siphash_feed(&state, faddr); - hash = siphash_final(&state, 20, (uint64_t)eport << 16 | fport); - - return (unsigned int)(hash % TCP_HASH_TABLE_SIZE); + return siphash_final(&state, 20, (uint64_t)eport << 16 | fport); } /** @@ -1178,10 +1175,10 @@ static unsigned int tcp_hash(const struct ctx *c, const union inany_addr *faddr, * @c: Execution context * @conn: Connection * - * Return: hash value, already modulo size of the hash table + * Return: hash value, needs to be adjusted for table size */ -static unsigned int tcp_conn_hash(const struct ctx *c, - const struct tcp_tap_conn *conn) +static uint64_t tcp_conn_hash(const struct 
ctx *c, + const struct tcp_tap_conn *conn) { return tcp_hash(c, &conn->faddr, conn->eport, conn->fport); } @@ -1201,7 +1198,7 @@ static inline unsigned tcp_hash_probe(const struct ctx *c, unsigned b; /* Linear probing */ - for (b = tcp_conn_hash(c, conn); + for (b = tcp_conn_hash(c, conn) % TCP_HASH_TABLE_SIZE; !flow_sidx_eq(tc_hash[b], FLOW_SIDX_NONE) && !flow_sidx_eq(tc_hash[b], sidx); b = (b + 1) % TCP_HASH_TABLE_SIZE) @@ -1243,7 +1240,7 @@ static void tcp_hash_remove(const struct ctx *c, for (s = (b + 1) % TCP_HASH_TABLE_SIZE; (flow = flow_at_sidx(tc_hash[s])); s = (s + 1) % TCP_HASH_TABLE_SIZE) { - unsigned h = tcp_conn_hash(c, &flow->tcp); + unsigned h = tcp_conn_hash(c, &flow->tcp) % TCP_HASH_TABLE_SIZE; if (in_mod_range(h, b, s, TCP_HASH_TABLE_SIZE)) { /* tc_hash[s] can live in tc_hash[b]'s slot */ @@ -1299,7 +1296,7 @@ static struct tcp_tap_conn *tcp_hash_lookup(const struct ctx *c, inany_from_af(&aany, af, faddr); - for (b = tcp_hash(c, &aany, eport, fport); + for (b = tcp_hash(c, &aany, eport, fport) % TCP_HASH_TABLE_SIZE; (flow = flow_at_sidx(tc_hash[b])) && !tcp_hash_match(&flow->tcp, &aany, eport, fport); b = (b + 1) % TCP_HASH_TABLE_SIZE) -- 2.43.0