Here's a series of what I hope will be the last preliminaries for the flow table which can be applied before we take the cost of increasing the size of entries with generic address information. Changes since "Unified flow table v5": * Small fixes to state verification * New patch passing specific typed structures into deferred/timer callbacks * Terminology change from "forwarded side" to "target side" * Assorted wording and style tweaks based on Stefano's review David Gibson (6): flow: Properly type callbacks to protocol specific handlers inany: Better helpers for using inany and specific family addrs together flow: Clarify and enforce flow state transitions flow: Make side 0 always be the initiating side flow: Record the pifs for each side of each flow tcp: Remove interim 'tapside' field from connection flow.c | 209 ++++++++++++++++++++++++++++++++++----------------- flow.h | 118 +++++++++++++++++++++++++++-- flow_table.h | 10 +++ icmp.c | 18 ++--- icmp_flow.h | 2 +- inany.c | 17 +---- inany.h | 97 +++++++++++++++++++++++- pif.h | 1 - tcp.c | 68 ++++++++--------- tcp_conn.h | 7 +- tcp_splice.c | 25 +++--- 11 files changed, 417 insertions(+), 155 deletions(-) -- 2.45.1
The flow dispatches deferred and timer handling for flows centrally, but needs to call into protocol specific code for the handling of individual flows. Currently this passes a general union flow *. It makes more sense to pass the specific relevant flow type structure. That brings the check on the flow type adjacent to casting to the union variant which it tags. Arguably, this is a slight abstraction violation since it involves the generic flow code using protocol specific types. It's already calling into protocol specific functions, so I don't think this really makes any difference. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- flow.c | 8 ++++---- icmp.c | 6 ++---- icmp_flow.h | 2 +- tcp.c | 10 ++++------ tcp_conn.h | 6 +++--- tcp_splice.c | 12 +++++------- 6 files changed, 19 insertions(+), 25 deletions(-) diff --git a/flow.c b/flow.c index 80dd269e..e257f890 100644 --- a/flow.c +++ b/flow.c @@ -292,17 +292,17 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) ASSERT(false); break; case FLOW_TCP: - closed = tcp_flow_defer(flow); + closed = tcp_flow_defer(&flow->tcp); break; case FLOW_TCP_SPLICE: - closed = tcp_splice_flow_defer(flow); + closed = tcp_splice_flow_defer(&flow->tcp_splice); if (!closed && timer) - tcp_splice_timer(c, flow); + tcp_splice_timer(c, &flow->tcp_splice); break; case FLOW_PING4: case FLOW_PING6: if (timer) - closed = icmp_ping_timer(c, flow, now); + closed = icmp_ping_timer(c, &flow->ping, now); break; default: /* Assume other flow types don't need any handling */ diff --git a/icmp.c b/icmp.c index 1c5cf84b..eb559c18 100644 --- a/icmp.c +++ b/icmp.c @@ -289,16 +289,14 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, /** * icmp_ping_timer() - Handler for timed events related to a given flow * @c: Execution context - * @flow: flow table entry to check for timeout + * @pingf: Ping flow to check for timeout * @now: Current timestamp * * Return: true if the flow is ready to free, false otherwise */ -bool icmp_ping_timer(const struct ctx *c, union flow *flow, +bool icmp_ping_timer(const struct ctx *c, const struct icmp_ping_flow *pingf, const struct timespec *now) { - const struct icmp_ping_flow *pingf = &flow->ping; - if (now->tv_sec - pingf->ts <= ICMP_ECHO_TIMEOUT) return false; diff --git a/icmp_flow.h b/icmp_flow.h index 5a2eed9c..c9847eae 100644 --- a/icmp_flow.h +++ b/icmp_flow.h @@ -25,7 +25,7 @@ struct icmp_ping_flow { uint16_t id; }; -bool icmp_ping_timer(const struct ctx *c, union flow *flow, +bool icmp_ping_timer(const struct ctx *c, const struct icmp_ping_flow *pingf, const struct timespec *now); #endif /* ICMP_FLOW_H */ diff --git a/tcp.c b/tcp.c index 21d0af06..a8ba5858 100644 --- a/tcp.c +++ b/tcp.c @@ -1221,15 +1221,13 @@ static struct tcp_tap_conn *tcp_hash_lookup(const struct ctx *c, /** * tcp_flow_defer() - Deferred per-flow handling (clean up closed connections) - * @flow: Flow table entry for this connection + * @conn: Connection to handle * - * Return: true if the flow is ready to free, false otherwise + * Return: true if the connection is ready to free, false otherwise */ -bool tcp_flow_defer(union flow *flow) +bool tcp_flow_defer(const struct tcp_tap_conn *conn) { - const struct tcp_tap_conn *conn = &flow->tcp; - - if (flow->tcp.events != CLOSED) + if (conn->events != CLOSED) return false; close(conn->sock); diff --git a/tcp_conn.h b/tcp_conn.h index d280b222..52bd8156 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -155,9 +155,9 @@ struct tcp_splice_conn { extern int init_sock_pool4 [TCP_SOCK_POOL_SIZE]; extern int init_sock_pool6 [TCP_SOCK_POOL_SIZE]; -bool tcp_flow_defer(union flow *flow); -bool tcp_splice_flow_defer(union flow *flow); -void tcp_splice_timer(const struct ctx *c, union flow *flow); +bool tcp_flow_defer(const struct tcp_tap_conn *conn); +bool tcp_splice_flow_defer(struct tcp_splice_conn *conn); +void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn); int tcp_conn_pool_sock(int pool[]); int tcp_conn_sock(const struct ctx *c, sa_family_t af); int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af); diff --git a/tcp_splice.c b/tcp_splice.c index 4c36b721..fb00bc22 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -235,16 +235,15 @@ static void conn_event_do(const struct ctx *c, struct tcp_splice_conn *conn, /** * tcp_splice_flow_defer() - Deferred per-flow handling (clean up closed) - * @flow: Flow table entry for this connection + * @conn: Connection entry to handle * * Return: true if the flow is ready to free, false otherwise */ -bool tcp_splice_flow_defer(union flow *flow) +bool tcp_splice_flow_defer(struct tcp_splice_conn *conn) { - struct tcp_splice_conn *conn = &flow->tcp_splice; unsigned side; - if (!(flow->tcp_splice.flags & CLOSING)) + if (!(conn->flags & CLOSING)) return false; for (side = 0; side < SIDES; side++) { @@ -786,11 +785,10 @@ void tcp_splice_init(struct ctx *c) /** * tcp_splice_timer() - Timer for spliced connections * @c: Execution context - * @flow: Flow table entry + * @conn: Connection to handle */ -void tcp_splice_timer(const struct ctx *c, union flow *flow) +void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn) { - struct tcp_splice_conn *conn = &flow->tcp_splice; int side; ASSERT(!(conn->flags & CLOSING)); -- 2.45.1
This adds some extra inany helpers for comparing an inany address to addresses of a specific family (including special addresses), and building an inany from an IPv4 address (either statically or at runtime). Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- inany.c | 17 ++-------- inany.h | 97 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--- tcp.c | 29 +++++++---------- 3 files changed, 106 insertions(+), 37 deletions(-) diff --git a/inany.c b/inany.c index c8479a75..5e391dc7 100644 --- a/inany.c +++ b/inany.c @@ -17,21 +17,8 @@ #include "siphash.h" #include "inany.h" -const union inany_addr inany_loopback4 = { - .v4mapped = { - .zero = { 0 }, - .one = { 0xff, 0xff, }, - .a4 = IN4ADDR_LOOPBACK_INIT, - }, -}; - -const union inany_addr inany_any4 = { - .v4mapped = { - .zero = { 0 }, - .one = { 0xff, 0xff, }, - .a4 = IN4ADDR_ANY_INIT, - }, -}; +const union inany_addr inany_loopback4 = INANY_INIT4(IN4ADDR_LOOPBACK_INIT); +const union inany_addr inany_any4 = INANY_INIT4(IN4ADDR_ANY_INIT); /** inany_ntop - Convert an IPv[46] address to text format * @src: IPv[46] address diff --git a/inany.h b/inany.h index 407690e2..47b66fa9 100644 --- a/inany.h +++ b/inany.h @@ -43,6 +43,17 @@ extern const union inany_addr inany_any4; #define in4addr_loopback (inany_loopback4.v4mapped.a4) #define in4addr_any (inany_any4.v4mapped.a4) +#define INANY_INIT4(a4init) { \ + .v4mapped = { \ + .zero = { 0 }, \ + .one = { 0xff, 0xff }, \ + .a4 = a4init, \ + }, \ + } + +#define inany_from_v4(a4) \ + ((union inany_addr)INANY_INIT4((a4))) + /** union sockaddr_inany - Either a sockaddr_in or a sockaddr_in6 * @sa_family: Address family, AF_INET or AF_INET6 * @sa: Plain struct sockaddr (useful to avoid casts) @@ -79,16 +90,84 @@ static inline bool inany_equals(const union inany_addr *a, return IN6_ARE_ADDR_EQUAL(&a->a6, &b->a6); } +/** inany_equals4 - Compare an IPv[46] address to an IPv4 address + * @a: IPv[46] addresses + * @b: IPv4 address + * + * Return: true if @a and @b are the same address + */ +static inline bool inany_equals4(const union inany_addr *a, + const struct in_addr *b) +{ + const struct in_addr *a4 = inany_v4(a); + + return a4 && IN4_ARE_ADDR_EQUAL(a4, b); +} + +/** inany_equals6 - Compare an IPv[46] address to an IPv6 address + * @a: IPv[46] addresses + * @b: IPv6 address + * + * Return: true if @a and @b are the same address + */ +static inline bool inany_equals6(const union inany_addr *a, + const struct in6_addr *b) +{ + return IN6_ARE_ADDR_EQUAL(&a->a6, b); +} + +/** inany_is_loopback4() - Check if address is IPv4 loopback + * @a: IPv[46] address + * + * Return: true if @a is in 127.0.0.1/8 + */ +static inline bool inany_is_loopback4(const union inany_addr *a) +{ + const struct in_addr *v4 = inany_v4(a); + + return v4 && IN4_IS_ADDR_LOOPBACK(v4); +} + +/** inany_is_loopback6() - Check if address is IPv6 loopback + * @a: IPv[46] address + * + * Return: true if @a is in ::1 + */ +static inline bool inany_is_loopback6(const union inany_addr *a) +{ + return IN6_IS_ADDR_LOOPBACK(&a->a6); +} + /** inany_is_loopback() - Check if address is loopback * @a: IPv[46] address * * Return: true if @a is either ::1 or in 127.0.0.1/8 */ static inline bool inany_is_loopback(const union inany_addr *a) +{ + return inany_is_loopback4(a) || inany_is_loopback6(a); +} + +/** inany_is_unspecified4() - Check if address is unspecified IPv4 + * @a: IPv[46] address + * + * Return: true if @a is 0.0.0.0 + */ +static inline bool inany_is_unspecified4(const union inany_addr *a) { const struct in_addr *v4 = inany_v4(a); - return IN6_IS_ADDR_LOOPBACK(&a->a6) || (v4 && IN4_IS_ADDR_LOOPBACK(v4)); + return v4 && IN4_IS_ADDR_UNSPECIFIED(v4); +} + +/** inany_is_unspecified6() - Check if address is unspecified IPv6 + * @a: IPv[46] address + * + * Return: true if @a is :: + */ +static inline bool inany_is_unspecified6(const union inany_addr *a) +{ + return IN6_IS_ADDR_UNSPECIFIED(&a->a6); } /** inany_is_unspecified() - Check if address is unspecified @@ -98,10 +177,20 @@ static inline bool inany_is_loopback(const union inany_addr *a) */ static inline bool inany_is_unspecified(const union inany_addr *a) { - const struct in_addr *v4 = inany_v4(a); + return inany_is_unspecified4(a) || inany_is_unspecified6(a); +} - return IN6_IS_ADDR_UNSPECIFIED(&a->a6) || - (v4 && IN4_IS_ADDR_UNSPECIFIED(v4)); +/* FIXME: consider handling of IPv4 link-local addresses */ + +/** inany_is_linklocal6() - Check if address is link-local IPv6 + * @a: IPv[46] address + * + * Return: true if @a is in fe80::/10 (IPv6 link local unicast) + */ +/* cppcheck-suppress unusedFunction */ +static inline bool inany_is_linklocal6(const union inany_addr *a) +{ + return IN6_IS_ADDR_LINKLOCAL(&a->a6); } /** inany_is_multicast() - Check if address is multicast or broadcast diff --git a/tcp.c b/tcp.c index a8ba5858..4512af0b 100644 --- a/tcp.c +++ b/tcp.c @@ -2687,24 +2687,17 @@ static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn) */ static void tcp_snat_inbound(const struct ctx *c, union inany_addr *addr) { - struct in_addr *addr4 = inany_v4(addr); - - if (addr4) { - if (IN4_IS_ADDR_LOOPBACK(addr4) || - IN4_IS_ADDR_UNSPECIFIED(addr4) || - IN4_ARE_ADDR_EQUAL(addr4, &c->ip4.addr_seen)) - *addr4 = c->ip4.gw; - } else { - struct in6_addr *addr6 = &addr->a6; - - if (IN6_IS_ADDR_LOOPBACK(addr6) || - IN6_ARE_ADDR_EQUAL(addr6, &c->ip6.addr_seen) || - IN6_ARE_ADDR_EQUAL(addr6, &c->ip6.addr)) { - if (IN6_IS_ADDR_LINKLOCAL(&c->ip6.gw)) - *addr6 = c->ip6.gw; - else - *addr6 = c->ip6.addr_ll; - } + if (inany_is_loopback4(addr) || + inany_is_unspecified4(addr) || + inany_equals4(addr, &c->ip4.addr_seen)) { + *addr = inany_from_v4(c->ip4.gw); + } else if (inany_is_loopback6(addr) || + inany_equals6(addr, &c->ip6.addr_seen) || + inany_equals6(addr, &c->ip6.addr)) { + if (IN6_IS_ADDR_LINKLOCAL(&c->ip6.gw)) + addr->a6 = c->ip6.gw; + else + addr->a6 = c->ip6.addr_ll; } } -- 2.45.1
Flows move over several different states in their lifetime. The rules for these are documented in comments, but they're pretty complex and a number of the transitions are implicit, which makes this pretty fragile and error prone. Change the code to explicitly track the states in a field. Make all transitions explicit and logged. To the extent that it's practical in C, enforce what can and can't be done in various states with ASSERT()s. While we're at it, tweak the docs to clarify the restrictions on each state a bit. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- flow.c | 146 ++++++++++++++++++++++++++++++--------------------- flow.h | 81 ++++++++++++++++++++++++++-- flow_table.h | 9 ++++ icmp.c | 4 +- tcp.c | 8 ++- tcp_splice.c | 4 +- 6 files changed, 183 insertions(+), 69 deletions(-) diff --git a/flow.c b/flow.c index e257f890..da71fe10 100644 --- a/flow.c +++ b/flow.c @@ -18,6 +18,15 @@ #include "flow.h" #include "flow_table.h" +const char *flow_state_str[] = { + [FLOW_STATE_FREE] = "FREE", + [FLOW_STATE_NEW] = "NEW", + [FLOW_STATE_TYPED] = "TYPED", + [FLOW_STATE_ACTIVE] = "ACTIVE", +}; +static_assert(ARRAY_SIZE(flow_state_str) == FLOW_NUM_STATES, + "flow_state_str[] doesn't match enum flow_state"); + const char *flow_type_str[] = { [FLOW_TYPE_NONE] = "<none>", [FLOW_TCP] = "TCP connection", @@ -39,46 +48,6 @@ static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES, /* Global Flow Table */ -/** - * DOC: Theory of Operation - flow entry life cycle - * - * An individual flow table entry moves through these logical states, usually in - * this order. - * - * FREE - Part of the general pool of free flow table entries - * Operations: - * - flow_alloc() finds an entry and moves it to ALLOC state - * - * ALLOC - A tentatively allocated entry - * Operations: - * - flow_alloc_cancel() returns the entry to FREE state - * - FLOW_START() set the entry's type and moves to START state - * Caveats: - * - It's not safe to write fields in the flow entry - * - It's not safe to allocate further entries with flow_alloc() - * - It's not safe to return to the main epoll loop (use FLOW_START() - * to move to START state before doing so) - * - It's not safe to use flow_*() logging functions - * - * START - An entry being prepared by flow type specific code - * Operations: - * - Flow type specific fields may be accessed - * - flow_*() logging functions - * - flow_alloc_cancel() returns the entry to FREE state - * Caveats: - * - Returning to the main epoll loop or allocating another entry - * with flow_alloc() implicitly moves the entry to ACTIVE state. - * - * ACTIVE - An active flow entry managed by flow type specific code - * Operations: - * - Flow type specific fields may be accessed - * - flow_*() logging functions - * - Flow may be expired by returning 'true' from flow type specific - * deferred or timer handler. This will return it to FREE state. - * Caveats: - * - It's not safe to call flow_alloc_cancel() - */ - /** * DOC: Theory of Operation - allocating and freeing flow entries * @@ -132,6 +101,7 @@ static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES, unsigned flow_first_free; union flow flowtab[FLOW_MAX]; +static const union flow *flow_new_entry; /* = NULL */ /* Last time the flow timers ran */ static struct timespec flow_timer_run; @@ -144,6 +114,7 @@ static struct timespec flow_timer_run; */ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) { + const char *type_or_state; char msg[BUFSIZ]; va_list args; @@ -151,40 +122,65 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) (void)vsnprintf(msg, sizeof(msg), fmt, args); va_end(args); - logmsg(pri, "Flow %u (%s): %s", flow_idx(f), FLOW_TYPE(f), msg); + /* Show type if it's set, otherwise the state */ + if (f->state < FLOW_STATE_TYPED) + type_or_state = FLOW_STATE(f); + else + type_or_state = FLOW_TYPE(f); + + logmsg(pri, "Flow %u (%s): %s", flow_idx(f), type_or_state, msg); +} + +/** + * flow_set_state() - Change flow's state + * @f: Flow changing state + * @state: New state + */ +static void flow_set_state(struct flow_common *f, enum flow_state state) +{ + uint8_t oldstate = f->state; + + ASSERT(state < FLOW_NUM_STATES); + ASSERT(oldstate < FLOW_NUM_STATES); + + f->state = state; + flow_log_(f, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate], + FLOW_STATE(f)); } /** - * flow_start() - Set flow type for new flow and log - * @flow: Flow to set type for + * flow_set_type() - Set type and move to TYPED + * @flow: Flow to change state * @type: Type for new flow * @iniside: Which side initiated the new flow * * Return: @flow - * - * Should be called before setting any flow type specific fields in the flow - * table entry. */ -union flow *flow_start(union flow *flow, enum flow_type type, - unsigned iniside) +union flow *flow_set_type(union flow *flow, enum flow_type type, + unsigned iniside) { + struct flow_common *f = &flow->f; + + ASSERT(type != FLOW_TYPE_NONE); + ASSERT(flow_new_entry == flow && f->state == FLOW_STATE_NEW); + ASSERT(f->type == FLOW_TYPE_NONE); + (void)iniside; - flow->f.type = type; - flow_dbg(flow, "START %s", flow_type_str[flow->f.type]); + f->type = type; + flow_set_state(f, FLOW_STATE_TYPED); return flow; } /** - * flow_end() - Clear flow type for finished flow and log - * @flow: Flow to clear + * flow_activate() - Move flow to ACTIVE + * @f: Flow to change state */ -static void flow_end(union flow *flow) +void flow_activate(struct flow_common *f) { - if (flow->f.type == FLOW_TYPE_NONE) - return; /* Nothing to do */ + ASSERT(&flow_new_entry->f == f && f->state == FLOW_STATE_TYPED); - flow_dbg(flow, "END %s", flow_type_str[flow->f.type]); - flow->f.type = FLOW_TYPE_NONE; + flow_set_state(f, FLOW_STATE_ACTIVE); + flow_new_entry = NULL; } /** @@ -196,9 +192,12 @@ union flow *flow_alloc(void) { union flow *flow = &flowtab[flow_first_free]; + ASSERT(!flow_new_entry); + if (flow_first_free >= FLOW_MAX) return NULL; + ASSERT(flow->f.state == FLOW_STATE_FREE); ASSERT(flow->f.type == FLOW_TYPE_NONE); ASSERT(flow->free.n >= 1); ASSERT(flow_first_free + flow->free.n <= FLOW_MAX); @@ -221,7 +220,10 @@ union flow *flow_alloc(void) flow_first_free = flow->free.next; } + flow_new_entry = flow; memset(flow, 0, sizeof(*flow)); + flow_set_state(&flow->f, FLOW_STATE_NEW); + return flow; } @@ -233,15 +235,21 @@ union flow *flow_alloc(void) */ void flow_alloc_cancel(union flow *flow) { + ASSERT(flow_new_entry == flow); + ASSERT(flow->f.state == FLOW_STATE_NEW || + flow->f.state == FLOW_STATE_TYPED); ASSERT(flow_first_free > FLOW_IDX(flow)); - flow_end(flow); + flow_set_state(&flow->f, FLOW_STATE_FREE); + memset(flow, 0, sizeof(*flow)); + /* Put it back in a length 1 free cluster, don't attempt to fully * reverse flow_alloc()s steps. This will get folded together the next * time flow_defer_handler runs anyway() */ flow->free.n = 1; flow->free.next = flow_first_free; flow_first_free = FLOW_IDX(flow); + flow_new_entry = NULL; } /** @@ -261,11 +269,14 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) flow_timer_run = *now; } + ASSERT(!flow_new_entry); /* Incomplete flow at end of cycle */ + for (idx = 0; idx < FLOW_MAX; idx++) { union flow *flow = &flowtab[idx]; bool closed = false; - if (flow->f.type == FLOW_TYPE_NONE) { + switch (flow->f.state) { + case FLOW_STATE_FREE: { unsigned skip = flow->free.n; /* First entry of a free cluster must have n >= 1 */ @@ -287,6 +298,20 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) continue; } + case FLOW_STATE_NEW: + case FLOW_STATE_TYPED: + /* Incomplete flow at end of cycle */ + ASSERT(false); + break; + + case FLOW_STATE_ACTIVE: + /* Nothing to do */ + break; + + default: + ASSERT(false); + } + switch (flow->f.type) { case FLOW_TYPE_NONE: ASSERT(false); @@ -310,7 +335,8 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) } if (closed) { - flow_end(flow); + flow_set_state(&flow->f, FLOW_STATE_FREE); + memset(flow, 0, sizeof(*flow)); if (free_head) { /* Add slot to current free cluster */ diff --git a/flow.h b/flow.h index c943c441..e61d35e8 100644 --- a/flow.h +++ b/flow.h @@ -9,6 +9,68 @@ #define FLOW_TIMER_INTERVAL 1000 /* ms */ +/** + * enum flow_state - States of a flow table entry + * + * An individual flow table entry moves through these states, usually in this + * order. + * General rules: + * - Code outside flow.c should never write common fields of union flow. + * - The state field may always be read. + * + * FREE - Part of the general pool of free flow table entries + * Operations: + * - flow_alloc() finds an entry and moves it to NEW + * + * NEW - Freshly allocated, uninitialised entry + * Operations: + * - flow_alloc_cancel() returns the entry to FREE + * - FLOW_SET_TYPE() sets the entry's type and moves to TYPED + * Caveats: + * - No fields other than state may be accessed + * - At most one entry may be NEW or TYPED at a time, so it's unsafe + * to use flow_alloc() again until this entry moves to ACTIVE or + * FREE + * - You may not return to the main epoll loop while any flow is NEW + * + * TYPED - Generic info initialised, type specific initialisation underway + * Operations: + * - All common fields may be read + * - Type specific fields may be read and written + * - flow_alloc_cancel() returns the entry to FREE + * - FLOW_ACTIVATE() moves the entry to ACTIVE + * Caveats: + * - At most one entry may be NEW or TYPED at a time, so it's unsafe + * to use flow_alloc() again until this entry moves to ACTIVE or + * FREE + * - You may not return to the main epoll loop while any flow is + * TYPED + * + * ACTIVE - An active, fully-initialised flow entry + * Operations: + * - All common fields may be read + * - Type specific fields may be read and written + * - Flow returns to FREE when it expires, signalled by returning + * 'true' from flow type specific deferred or timer handler + * Caveats: + * - flow_alloc_cancel() may not be called on it + */ +enum flow_state { + FLOW_STATE_FREE, + FLOW_STATE_NEW, + FLOW_STATE_TYPED, + FLOW_STATE_ACTIVE, + + FLOW_NUM_STATES, +}; +#define FLOW_STATE_BITS 8 +static_assert(FLOW_NUM_STATES <= (1 << FLOW_STATE_BITS), + "Too many flow states for FLOW_STATE_BITS"); + +extern const char *flow_state_str[]; +#define FLOW_STATE(f) \ + ((f)->state < FLOW_NUM_STATES ? flow_state_str[(f)->state] : "?") + /** * enum flow_type - Different types of packet flows we track */ @@ -26,6 +88,9 @@ enum flow_type { FLOW_NUM_TYPES, }; +#define FLOW_TYPE_BITS 8 +static_assert(FLOW_NUM_TYPES <= (1 << FLOW_TYPE_BITS), + "Too many flow types for FLOW_TYPE_BITS"); extern const char *flow_type_str[]; #define FLOW_TYPE(f) \ @@ -37,10 +102,21 @@ extern const uint8_t flow_proto[]; /** * struct flow_common - Common fields for packet flows + * @state: State of the flow table entry * @type: Type of packet flow */ struct flow_common { +#ifdef __GNUC__ + enum flow_state state:FLOW_STATE_BITS; + enum flow_type type:FLOW_TYPE_BITS; +#else + uint8_t state; + static_assert(sizeof(uint8_t) * 8 >= FLOW_STATE_BITS, + "Not enough bits for state field"); uint8_t type; + static_assert(sizeof(uint8_t) * 8 >= FLOW_TYPE_BITS, + "Not enough bits for type field"); +#endif }; #define FLOW_INDEX_BITS 17 /* 128k - 1 */ @@ -49,11 +125,6 @@ struct flow_common { #define FLOW_TABLE_PRESSURE 30 /* % of FLOW_MAX */ #define FLOW_FILE_PRESSURE 30 /* % of c->nofile */ -union flow *flow_start(union flow *flow, enum flow_type type, - unsigned iniside); -#define FLOW_START(flow_, t_, var_, i_) \ - (&flow_start((flow_), (t_), (i_))->var_) - /** * struct flow_sidx - ID for one side of a specific flow * @side: Side referenced (0 or 1) diff --git a/flow_table.h b/flow_table.h index b7e5529a..17b9ea70 100644 --- a/flow_table.h +++ b/flow_table.h @@ -107,4 +107,13 @@ static inline flow_sidx_t flow_sidx(const struct flow_common *f, union flow *flow_alloc(void); void flow_alloc_cancel(union flow *flow); +union flow *flow_set_type(union flow *flow, enum flow_type type, + unsigned iniside); +#define FLOW_SET_TYPE(flow_, t_, var_, i_) \ + (&flow_set_type((flow_), (t_), (i_))->var_) + +void flow_activate(struct flow_common *f); +#define FLOW_ACTIVATE(flow_) \ + (flow_activate(&(flow_)->f)) + #endif /* FLOW_TABLE_H */ diff --git a/icmp.c b/icmp.c index eb559c18..fc729557 100644 --- a/icmp.c +++ b/icmp.c @@ -167,7 +167,7 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c, if (!flow) return NULL; - pingf = FLOW_START(flow, flowtype, ping, TAPSIDE); + pingf = FLOW_SET_TYPE(flow, flowtype, ping, TAPSIDE); pingf->seq = -1; pingf->id = id; @@ -198,6 +198,8 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c, *id_sock = pingf; + FLOW_ACTIVATE(pingf); + return pingf; cancel: diff --git a/tcp.c b/tcp.c index 4512af0b..64fe46e0 100644 --- a/tcp.c +++ b/tcp.c @@ -2004,7 +2004,7 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, goto cancel; } - conn = FLOW_START(flow, FLOW_TCP, tcp, TAPSIDE); + conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp, TAPSIDE); conn->sock = s; conn->timer = -1; conn_event(c, conn, TAP_SYN_RCVD); @@ -2075,6 +2075,7 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, } tcp_epoll_ctl(c, conn); + FLOW_ACTIVATE(conn); return; cancel: @@ -2715,7 +2716,8 @@ static void tcp_tap_conn_from_sock(struct ctx *c, in_port_t dstport, const union sockaddr_inany *sa, const struct timespec *now) { - struct tcp_tap_conn *conn = FLOW_START(flow, FLOW_TCP, tcp, SOCKSIDE); + struct tcp_tap_conn *conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp, + SOCKSIDE); conn->sock = s; conn->timer = -1; @@ -2738,6 +2740,8 @@ static void tcp_tap_conn_from_sock(struct ctx *c, in_port_t dstport, conn_flag(c, conn, ACK_FROM_TAP_DUE); tcp_get_sndbuf(conn); + + FLOW_ACTIVATE(conn); } /** diff --git a/tcp_splice.c b/tcp_splice.c index fb00bc22..852a93b4 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -471,7 +471,7 @@ bool tcp_splice_conn_from_sock(const struct ctx *c, return false; } - conn = FLOW_START(flow, FLOW_TCP_SPLICE, tcp_splice, 0); + conn = FLOW_SET_TYPE(flow, FLOW_TCP_SPLICE, tcp_splice, 0); conn->flags = af == AF_INET ? 0 : SPLICE_V6; conn->s[0] = s0; @@ -485,6 +485,8 @@ bool tcp_splice_conn_from_sock(const struct ctx *c, if (tcp_splice_connect(c, conn, af, pif1, dstport)) conn_flag(c, conn, CLOSING); + FLOW_ACTIVATE(conn); + return true; } -- 2.45.1
Each flow in the flow table has two sides, 0 and 1, representing the two interfaces between which passt/pasta will forward data for that flow. Which side is which is currently up to the protocol specific code: TCP uses side 0 for the host/"sock" side and 1 for the guest/"tap" side, except for spliced connections where it uses 0 for the initiating side and 1 for the target side. ICMP also uses 0 for the host/"sock" side and 1 for the guest/"tap" side, but in its case the latter is always also the initiating side. Make this generically consistent by always using side 0 for the initiating side and 1 for the target side. This doesn't simplify a lot for now, and arguably makes TCP slightly more complex, since we add an extra field to the connection structure to record which is the guest facing side. This is an interim change, which we'll be able to remove later. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au>q --- flow.c | 5 +---- flow.h | 5 +++++ flow_table.h | 6 ++---- icmp.c | 8 ++------ tcp.c | 19 ++++++++----------- tcp_conn.h | 3 ++- tcp_splice.c | 2 +- 7 files changed, 21 insertions(+), 27 deletions(-) diff --git a/flow.c b/flow.c index da71fe10..91c00465 100644 --- a/flow.c +++ b/flow.c @@ -152,12 +152,10 @@ static void flow_set_state(struct flow_common *f, enum flow_state state) * flow_set_type() - Set type and move to TYPED * @flow: Flow to change state * @type: Type for new flow - * @iniside: Which side initiated the new flow * * Return: @flow */ -union flow *flow_set_type(union flow *flow, enum flow_type type, - unsigned iniside) +union flow *flow_set_type(union flow *flow, enum flow_type type) { struct flow_common *f = &flow->f; @@ -165,7 +163,6 @@ union flow *flow_set_type(union flow *flow, enum flow_type type, ASSERT(flow_new_entry == flow && f->state == FLOW_STATE_NEW); ASSERT(f->type == FLOW_TYPE_NONE); - (void)iniside; f->type = type; flow_set_state(f, FLOW_STATE_TYPED); return flow; diff --git a/flow.h b/flow.h index e61d35e8..95309389 100644 --- a/flow.h +++ b/flow.h @@ -100,6 +100,11 @@ extern const uint8_t flow_proto[]; #define FLOW_PROTO(f) \ ((f)->type < FLOW_NUM_TYPES ? flow_proto[(f)->type] : 0) +#define SIDES 2 + +#define INISIDE 0 /* Initiating side */ +#define TGTSIDE 1 /* Target side */ + /** * struct flow_common - Common fields for packet flows * @state: State of the flow table entry diff --git a/flow_table.h b/flow_table.h index 17b9ea70..28e36b9a 100644 --- a/flow_table.h +++ b/flow_table.h @@ -107,10 +107,8 @@ static inline flow_sidx_t flow_sidx(const struct flow_common *f, union flow *flow_alloc(void); void flow_alloc_cancel(union flow *flow); -union flow *flow_set_type(union flow *flow, enum flow_type type, - unsigned iniside); -#define FLOW_SET_TYPE(flow_, t_, var_, i_) \ - (&flow_set_type((flow_), (t_), (i_))->var_) +union flow *flow_set_type(union flow *flow, enum flow_type type); +#define FLOW_SET_TYPE(flow_, t_, var_) (&flow_set_type((flow_), (t_))->var_) void flow_activate(struct flow_common *f); #define FLOW_ACTIVATE(flow_) \ diff --git a/icmp.c b/icmp.c index fc729557..3567ad7e 100644 --- a/icmp.c +++ b/icmp.c @@ -45,10 +45,6 @@ #define ICMP_ECHO_TIMEOUT 60 /* s, timeout for ICMP socket activity */ #define ICMP_NUM_IDS (1U << 16) -/* Sides of a flow as we use them for ping streams */ -#define SOCKSIDE 0 -#define TAPSIDE 1 - #define PINGF(idx) (&(FLOW(idx)->ping)) /* Indexed by ICMP echo identifier */ @@ -167,7 +163,7 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c, if (!flow) return NULL; - pingf = FLOW_SET_TYPE(flow, flowtype, ping, TAPSIDE); + pingf = FLOW_SET_TYPE(flow, flowtype, ping); pingf->seq = -1; pingf->id = id; @@ -180,7 +176,7 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c, bind_if = c->ip6.ifname_out; } - ref.flowside = FLOW_SIDX(flow, SOCKSIDE); + ref.flowside = FLOW_SIDX(flow, TGTSIDE); pingf->sock = sock_l4(c, af, flow_proto[flowtype], bind_addr, bind_if, 0, ref.data); diff --git a/tcp.c b/tcp.c index 64fe46e0..0313dcb9 100644 --- a/tcp.c +++ b/tcp.c @@ -303,10 +303,6 @@ #include "flow_table.h" -/* Sides of a flow as we use them in "tap" connections */ -#define SOCKSIDE 0 -#define TAPSIDE 1 - #define TCP_FRAMES_MEM 128 #define TCP_FRAMES \ (c->mode == MODE_PASST ? TCP_FRAMES_MEM : 1) @@ -581,7 +577,7 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn) { int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; union epoll_ref ref = { .type = EPOLL_TYPE_TCP, .fd = conn->sock, - .flowside = FLOW_SIDX(conn, SOCKSIDE) }; + .flowside = FLOW_SIDX(conn, !conn->tapside), }; struct epoll_event ev = { .data.u64 = ref.u64 }; if (conn->events == CLOSED) { @@ -1134,7 +1130,7 @@ static uint64_t tcp_conn_hash(const struct ctx *c, static inline unsigned tcp_hash_probe(const struct ctx *c, const struct tcp_tap_conn *conn) { - flow_sidx_t sidx = FLOW_SIDX(conn, TAPSIDE); + flow_sidx_t sidx = FLOW_SIDX(conn, conn->tapside); unsigned b = tcp_conn_hash(c, conn) % TCP_HASH_TABLE_SIZE; /* Linear probing */ @@ -1154,7 +1150,7 @@ static void tcp_hash_insert(const struct ctx *c, struct tcp_tap_conn *conn) { unsigned b = tcp_hash_probe(c, conn); - tc_hash[b] = FLOW_SIDX(conn, TAPSIDE); + tc_hash[b] = FLOW_SIDX(conn, conn->tapside); flow_dbg(conn, "hash table insert: sock %i, bucket: %u", conn->sock, b); } @@ -2004,7 +2000,8 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, goto cancel; } - conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp, TAPSIDE); + conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp); + conn->tapside = INISIDE; conn->sock = s; conn->timer = -1; conn_event(c, conn, TAP_SYN_RCVD); @@ -2716,9 +2713,9 @@ static void tcp_tap_conn_from_sock(struct ctx *c, in_port_t dstport, const union sockaddr_inany *sa, const struct timespec *now) { - struct tcp_tap_conn *conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp, - SOCKSIDE); + struct tcp_tap_conn *conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp); + conn->tapside = TGTSIDE; conn->sock = s; conn->timer = -1; conn->ws_to_tap = conn->ws_from_tap = 0; @@ -2875,7 +2872,7 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events) struct tcp_tap_conn *conn = CONN(ref.flowside.flow); ASSERT(conn->f.type == FLOW_TCP); - ASSERT(ref.flowside.side == SOCKSIDE); + ASSERT(ref.flowside.side == !conn->tapside); if (conn->events == CLOSED) return; diff --git a/tcp_conn.h b/tcp_conn.h index 52bd8156..e8c51c34 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -13,6 +13,7 @@ * struct tcp_tap_conn - Descriptor for a TCP connection (not spliced) * @f: Generic flow information * @in_epoll: Is the connection in the epoll set? + * @tapside: Which side of the flow faces the tap/guest interface * @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS * @sock: Socket descriptor number * @events: Connection events, implying connection states @@ -39,6 +40,7 @@ struct tcp_tap_conn { struct flow_common f; bool in_epoll :1; + unsigned tapside :1; #define TCP_RETRANS_BITS 3 unsigned int retrans :TCP_RETRANS_BITS; @@ -106,7 +108,6 @@ struct tcp_tap_conn { uint32_t seq_init_from_tap; }; -#define SIDES 2 /** * struct tcp_splice_conn - Descriptor for a spliced TCP connection * @f: Generic flow information diff --git a/tcp_splice.c b/tcp_splice.c index 852a93b4..31e21732 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -471,7 +471,7 @@ bool tcp_splice_conn_from_sock(const struct ctx *c, return false; } - conn = FLOW_SET_TYPE(flow, FLOW_TCP_SPLICE, tcp_splice, 0); + conn = FLOW_SET_TYPE(flow, FLOW_TCP_SPLICE, tcp_splice); conn->flags = af == AF_INET ? 0 : SPLICE_V6; conn->s[0] = s0; -- 2.45.1
Currently we have no generic information flows apart from the type and state, everything else is specific to the flow type. Start introducing generic flow information by recording the pifs which the flow connects. To keep track of what information is valid, introduce new flow states: INI for when the initiating side information is complete, and TGT for when both sides information is complete, but we haven't chosen the flow type yet. For now, these states don't do an awful lot, but they'll become more important as we add more generic information. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- flow.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++---- flow.h | 44 +++++++++++++++++++++++++++++++++++------ flow_table.h | 3 +++ icmp.c | 2 ++ pif.h | 1 - tcp.c | 10 +++++++++- tcp_splice.c | 9 +++++---- 7 files changed, 109 insertions(+), 16 deletions(-) diff --git a/flow.c b/flow.c index 91c00465..d05aa495 100644 --- a/flow.c +++ b/flow.c @@ -21,6 +21,8 @@ const char *flow_state_str[] = { [FLOW_STATE_FREE] = "FREE", [FLOW_STATE_NEW] = "NEW", + [FLOW_STATE_INI] = "INI", + [FLOW_STATE_TGT] = "TGT", [FLOW_STATE_TYPED] = "TYPED", [FLOW_STATE_ACTIVE] = "ACTIVE", }; @@ -146,22 +148,63 @@ static void flow_set_state(struct flow_common *f, enum flow_state state) f->state = state; flow_log_(f, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate], FLOW_STATE(f)); + + if (MAX(state, oldstate) >= FLOW_STATE_TGT) + flow_log_(f, LOG_DEBUG, "%s => %s", pif_name(f->pif[INISIDE]), + pif_name(f->pif[TGTSIDE])); + else if (MAX(state, oldstate) >= FLOW_STATE_INI) + flow_log_(f, LOG_DEBUG, "%s => ?", pif_name(f->pif[INISIDE])); +} + +/** + * flow_initiate() - Move flow to INI, setting INISIDE details + * @flow: Flow to change state + * @pif: pif of the initiating side + */ +void flow_initiate(union flow *flow, uint8_t pif) +{ + struct flow_common *f = &flow->f; + + ASSERT(pif != PIF_NONE); + ASSERT(flow_new_entry == flow && f->state == FLOW_STATE_NEW); + ASSERT(f->type == FLOW_TYPE_NONE); + ASSERT(f->pif[INISIDE] == PIF_NONE && f->pif[TGTSIDE] == PIF_NONE); + + f->pif[INISIDE] = pif; + flow_set_state(f, FLOW_STATE_INI); +} + +/** + * flow_target() - Move flow to TGT, setting TGTSIDE details + * @flow: Flow to change state + * @pif: pif of the target side + */ +void flow_target(union flow *flow, uint8_t pif) +{ + struct flow_common *f = &flow->f; + + ASSERT(pif != PIF_NONE); + ASSERT(flow_new_entry == flow && f->state == FLOW_STATE_INI); + ASSERT(f->type == FLOW_TYPE_NONE); + ASSERT(f->pif[INISIDE] != PIF_NONE && f->pif[TGTSIDE] == PIF_NONE); + + f->pif[TGTSIDE] = pif; + flow_set_state(f, FLOW_STATE_TGT); } /** * flow_set_type() - Set type and move to TYPED * @flow: Flow to change state - * @type: Type for new flow - * - * Return: @flow + * @pif: pif of the initiating side */ union flow *flow_set_type(union flow *flow, enum flow_type type) { struct flow_common *f = &flow->f; ASSERT(type != FLOW_TYPE_NONE); - ASSERT(flow_new_entry == flow && f->state == FLOW_STATE_NEW); + ASSERT(flow_new_entry == flow && f->state == FLOW_STATE_TGT); ASSERT(f->type == FLOW_TYPE_NONE); + ASSERT(f->pif[INISIDE] != PIF_NONE && f->pif[TGTSIDE] != PIF_NONE); f->type = type; flow_set_state(f, FLOW_STATE_TYPED); @@ -175,6 +218,7 @@ union flow *flow_set_type(union flow *flow, enum flow_type type) void flow_activate(struct flow_common *f) { ASSERT(&flow_new_entry->f == f && f->state == FLOW_STATE_TYPED); + ASSERT(f->pif[INISIDE] != PIF_NONE && f->pif[TGTSIDE] != PIF_NONE); flow_set_state(f, FLOW_STATE_ACTIVE); flow_new_entry = NULL; @@ -234,6 +278,8 @@ void flow_alloc_cancel(union flow *flow) { ASSERT(flow_new_entry == flow); ASSERT(flow->f.state == FLOW_STATE_NEW || + flow->f.state == FLOW_STATE_INI || + flow->f.state == FLOW_STATE_TGT || flow->f.state == FLOW_STATE_TYPED); ASSERT(flow_first_free > FLOW_IDX(flow)); @@ -296,6 +342,8 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) } case FLOW_STATE_NEW: + case FLOW_STATE_INI: + case FLOW_STATE_TGT: case FLOW_STATE_TYPED: /* Incomplete flow at end of cycle */ ASSERT(false); diff --git a/flow.h b/flow.h index 95309389..29ef9f12 100644 --- a/flow.h +++ b/flow.h @@ -25,14 +25,42 @@ * NEW - Freshly allocated, uninitialised entry * Operations: * - flow_alloc_cancel() returns the entry to FREE + * - flow_initiate() sets the entry's INISIDE details and moves to + * INI * - FLOW_SET_TYPE() sets the entry's type and moves to TYPED * Caveats: * - No fields other than state may be accessed - * - At most one entry may be NEW or TYPED at a time, so it's unsafe - * to use flow_alloc() again until this entry moves to ACTIVE or - * FREE + * - At most one entry may be NEW, INI, TGT or TYPED at a time, so + * it's unsafe to use flow_alloc() again until this entry moves to + * ACTIVE or FREE * - You may not return to the main epoll loop while any flow is NEW * + * INI - An entry with INISIDE common information completed + * Operations: + * - Common fields related to INISIDE may be read + * - flow_alloc_cancel() returns the entry to FREE + * - flow_target() sets the entry's TGTSIDE details and moves to TGT + * Caveats: + * - Other common fields may not be read + * - Type specific fields may not be read or written + * - At most one entry may be NEW, INI, TGT or TYPED at a time, so + * it's unsafe to use flow_alloc() again until this entry moves to + * ACTIVE or FREE + * - You may not return to the main epoll loop while any flow is INI + * + * TGT - An entry with only INISIDE and TGTSIDE common information completed + * Operations: + * - Common fields related to INISIDE & TGTSIDE may be read + * - flow_alloc_cancel() returns the entry to FREE + * - FLOW_SET_TYPE() sets the entry's type and moves to TYPED + * Caveats: + * - Other common fields may not be read + * - Type specific fields may not be read or written + * - At most one entry may be NEW, INI, TGT or TYPED at a time, so + * it's unsafe to use flow_alloc() again until this entry moves to + * ACTIVE or FREE + * - You may not return to the main epoll loop while any flow is TGT + * * TYPED - Generic info initialised, type specific initialisation underway * Operations: * - All common fields may be read @@ -40,9 +68,9 @@ * - flow_alloc_cancel() returns the entry to FREE * - FLOW_ACTIVATE() moves the entry to ACTIVE * Caveats: - * - At most one entry may be NEW or TYPED at a time, so it's unsafe - * to use flow_alloc() again until this entry moves to ACTIVE or - * FREE + * - At most one entry may be NEW, INI, TGT or TYPED at a time, so + * it's unsafe to use flow_alloc() again until this entry moves to + * ACTIVE or FREE * - You may not return to the main epoll loop while any flow is * TYPED * @@ -58,6 +86,8 @@ enum flow_state { FLOW_STATE_FREE, FLOW_STATE_NEW, + FLOW_STATE_INI, + FLOW_STATE_TGT, FLOW_STATE_TYPED, FLOW_STATE_ACTIVE, @@ -109,6 +139,7 @@ extern const uint8_t flow_proto[]; * struct flow_common - Common fields for packet flows * @state: State of the flow table entry * @type: Type of packet flow + * @pif[]: Interface for each side of the flow */ struct flow_common { #ifdef __GNUC__ @@ -122,6 +153,7 @@ struct flow_common { static_assert(sizeof(uint8_t) * 8 >= FLOW_TYPE_BITS, "Not enough bits for type field"); #endif + uint8_t pif[SIDES]; }; #define FLOW_INDEX_BITS 17 /* 128k - 1 */ diff --git a/flow_table.h b/flow_table.h index 28e36b9a..1b163491 100644 --- a/flow_table.h +++ b/flow_table.h @@ -107,6 +107,9 @@ static inline flow_sidx_t flow_sidx(const struct flow_common *f, union flow *flow_alloc(void); void flow_alloc_cancel(union flow *flow); +void flow_initiate(union flow *flow, uint8_t pif); +void flow_target(union flow *flow, uint8_t pif); + union flow *flow_set_type(union flow *flow, enum flow_type type); #define FLOW_SET_TYPE(flow_, t_, var_) (&flow_set_type((flow_), (t_))->var_) diff --git a/icmp.c b/icmp.c index 3567ad7e..80330f6f 100644 --- a/icmp.c +++ b/icmp.c @@ -163,6 +163,8 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c, if (!flow) return NULL; + flow_initiate(flow, PIF_TAP); + flow_target(flow, PIF_HOST); pingf = FLOW_SET_TYPE(flow, flowtype, ping); pingf->seq = -1; diff --git a/pif.h b/pif.h index bd529364..ca85b349 100644 --- a/pif.h +++ b/pif.h @@ -38,7 +38,6 @@ static inline const char *pif_type(enum pif_type pt) return "?"; } -/* cppcheck-suppress unusedFunction */ static inline const char *pif_name(uint8_t pif) { return pif_type(pif); diff --git a/tcp.c b/tcp.c index 0313dcb9..dad425cb 100644 --- a/tcp.c +++ b/tcp.c @@ -1948,6 +1948,8 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, if (!(flow = flow_alloc())) return; + flow_initiate(flow, PIF_TAP); + if (af == AF_INET) { if (IN4_IS_ADDR_UNSPECIFIED(saddr) || IN4_IS_ADDR_BROADCAST(saddr) || @@ -2000,6 +2002,7 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, goto cancel; } + flow_target(flow, PIF_HOST); conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp); conn->tapside = INISIDE; conn->sock = s; @@ -2713,7 +2716,10 @@ static void tcp_tap_conn_from_sock(struct ctx *c, in_port_t dstport, const union sockaddr_inany *sa, const struct timespec *now) { - struct tcp_tap_conn *conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp); + struct tcp_tap_conn *conn; + + flow_target(flow, PIF_TAP); + conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp); conn->tapside = TGTSIDE; conn->sock = s; @@ -2762,6 +2768,8 @@ void tcp_listen_handler(struct ctx *c, union epoll_ref ref, if (s < 0) goto cancel; + flow_initiate(flow, ref.tcp_listen.pif); + if (sa.sa_family == AF_INET) { const struct in_addr *addr = &sa.sa4.sin_addr; in_port_t port = sa.sa4.sin_port; diff --git a/tcp_splice.c b/tcp_splice.c index 31e21732..b8f64a95 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -431,7 +431,7 @@ bool tcp_splice_conn_from_sock(const struct ctx *c, union inany_addr src; in_port_t srcport; sa_family_t af; - uint8_t pif1; + uint8_t tgtpif; if (c->mode != MODE_PASTA) return false; @@ -455,7 +455,7 @@ bool tcp_splice_conn_from_sock(const struct ctx *c, return true; } - pif1 = PIF_HOST; + tgtpif = PIF_HOST; dstport += c->tcp.fwd_out.delta[dstport]; break; @@ -463,7 +463,7 @@ bool tcp_splice_conn_from_sock(const struct ctx *c, if (!inany_is_loopback(&src)) return false; - pif1 = PIF_SPLICE; + tgtpif = PIF_SPLICE; dstport += c->tcp.fwd_in.delta[dstport]; break; @@ -471,6 +471,7 @@ bool tcp_splice_conn_from_sock(const struct ctx *c, return false; } + flow_target(flow, tgtpif); conn = FLOW_SET_TYPE(flow, FLOW_TCP_SPLICE, tcp_splice); conn->flags = af == AF_INET ? 0 : SPLICE_V6; @@ -482,7 +483,7 @@ bool tcp_splice_conn_from_sock(const struct ctx *c, if (setsockopt(s0, SOL_TCP, TCP_QUICKACK, &((int){ 1 }), sizeof(int))) flow_trace(conn, "failed to set TCP_QUICKACK on %i", s0); - if (tcp_splice_connect(c, conn, af, pif1, dstport)) + if (tcp_splice_connect(c, conn, af, tgtpif, dstport)) conn_flag(c, conn, CLOSING); FLOW_ACTIVATE(conn); -- 2.45.1
We recently introduced this field to keep track of which side of a TCP flow is the guest/tap facing one. Now that we generically record which pif each side of each flow is connected to, we can easily derive that, and no longer need to keep track of it explicitly. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 12 ++++++------ tcp_conn.h | 2 -- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/tcp.c b/tcp.c index dad425cb..25fcf020 100644 --- a/tcp.c +++ b/tcp.c @@ -368,6 +368,8 @@ #define OPT_SACK 5 #define OPT_TS 8 +#define TAPSIDE(conn_) ((conn_)->f.pif[1] == PIF_TAP) + #define CONN_V4(conn) (!!inany_v4(&(conn)->faddr)) #define CONN_V6(conn) (!CONN_V4(conn)) #define CONN_IS_CLOSING(conn) \ @@ -577,7 +579,7 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn) { int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; union epoll_ref ref = { .type = EPOLL_TYPE_TCP, .fd = conn->sock, - .flowside = FLOW_SIDX(conn, !conn->tapside), }; + .flowside = FLOW_SIDX(conn, !TAPSIDE(conn)), }; struct epoll_event ev = { .data.u64 = ref.u64 }; if (conn->events == CLOSED) { @@ -1130,8 +1132,8 @@ static uint64_t tcp_conn_hash(const struct ctx *c, static inline unsigned tcp_hash_probe(const struct ctx *c, const struct tcp_tap_conn *conn) { - flow_sidx_t sidx = FLOW_SIDX(conn, conn->tapside); unsigned b = tcp_conn_hash(c, conn) % TCP_HASH_TABLE_SIZE; + flow_sidx_t sidx = FLOW_SIDX(conn, TAPSIDE(conn)); /* Linear probing */ while (!flow_sidx_eq(tc_hash[b], FLOW_SIDX_NONE) && @@ -1150,7 +1152,7 @@ static void tcp_hash_insert(const struct ctx *c, struct tcp_tap_conn *conn) { unsigned b = tcp_hash_probe(c, conn); - tc_hash[b] = FLOW_SIDX(conn, conn->tapside); + tc_hash[b] = FLOW_SIDX(conn, TAPSIDE(conn)); flow_dbg(conn, "hash table insert: sock %i, bucket: %u", conn->sock, b); } @@ -2004,7 +2006,6 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, flow_target(flow, PIF_HOST); conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp); - conn->tapside = INISIDE; conn->sock = s; conn->timer = -1; conn_event(c, conn, TAP_SYN_RCVD); @@ -2721,7 +2722,6 @@ static void tcp_tap_conn_from_sock(struct ctx *c, in_port_t dstport, flow_target(flow, PIF_TAP); conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp); - conn->tapside = TGTSIDE; conn->sock = s; conn->timer = -1; conn->ws_to_tap = conn->ws_from_tap = 0; @@ -2880,7 +2880,7 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events) struct tcp_tap_conn *conn = CONN(ref.flowside.flow); ASSERT(conn->f.type == FLOW_TCP); - ASSERT(ref.flowside.side == !conn->tapside); + ASSERT(conn->f.pif[ref.flowside.side] != PIF_TAP); if (conn->events == CLOSED) return; diff --git a/tcp_conn.h b/tcp_conn.h index e8c51c34..5f8c8fb6 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -13,7 +13,6 @@ * struct tcp_tap_conn - Descriptor for a TCP connection (not spliced) * @f: Generic flow information * @in_epoll: Is the connection in the epoll set? - * @tapside: Which side of the flow faces the tap/guest interface * @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS * @sock: Socket descriptor number * @events: Connection events, implying connection states @@ -40,7 +39,6 @@ struct tcp_tap_conn { struct flow_common f; bool in_epoll :1; - unsigned tapside :1; #define TCP_RETRANS_BITS 3 unsigned int retrans :TCP_RETRANS_BITS; -- 2.45.1
On Tue, 21 May 2024 15:57:02 +1000 David Gibson <david(a)gibson.dropbear.id.au> wrote:Here's a series of what I hope will be the last preliminaries for the flow table which can be applied before we take the cost of increasing the size of entries with generic address information. Changes since "Unified flow table v5": * Small fixes to state verification * New patch passing specific typed structures into deferred/timer callbacks * Terminology change from "forwarded side" to "target side" * Assorted wording and style tweaks based on Stefano's review David Gibson (6): flow: Properly type callbacks to protocol specific handlers inany: Better helpers for using inany and specific family addrs together flow: Clarify and enforce flow state transitions flow: Make side 0 always be the initiating side flow: Record the pifs for each side of each flow tcp: Remove interim 'tapside' field from connectionApplied. -- Stefano