Flows move through several different states over their lifetime.  The
rules for these are documented in comments, but they're pretty complex
and a number of the transitions are implicit, which makes this fragile
and error-prone.

Change the code to explicitly track the state in a field.  Make all
transitions explicit and logged.  To the extent that it's practical in
C, enforce what can and can't be done in various states with ASSERT()s.

While we're at it, tweak the documentation to clarify the restrictions
on each state a bit.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 flow.c       | 146 ++++++++++++++++++++++++++++++---------------------
 flow.h       |  81 ++++++++++++++++++++++++++--
 flow_table.h |   9 ++++
 icmp.c       |   4 +-
 tcp.c        |   8 ++-
 tcp_splice.c |   4 +-
 6 files changed, 183 insertions(+), 69 deletions(-)
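As an illustration of the intended lifecycle (this snippet is not part of
the series; 's' and the 'setup_failed' check are placeholders), a flow
creation path under the new state machine looks roughly like the tcp and
icmp changes below:

	union flow *flow = flow_alloc();	/* FREE -> NEW */
	struct tcp_tap_conn *conn;

	if (!flow)
		return;			/* table full, nothing to undo */

	/* NEW -> TYPED: sets the flow type, transition is logged */
	conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp, TAPSIDE);

	/* TYPED: type specific fields may now be written */
	conn->sock = s;
	conn->timer = -1;

	if (setup_failed)
		flow_alloc_cancel(flow);	/* TYPED -> FREE */
	else
		FLOW_ACTIVATE(conn);		/* TYPED -> ACTIVE */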
diff --git a/flow.c b/flow.c
index e257f890..da71fe10 100644
--- a/flow.c
+++ b/flow.c
@@ -18,6 +18,15 @@
 #include "flow.h"
 #include "flow_table.h"
 
+const char *flow_state_str[] = {
+	[FLOW_STATE_FREE]	= "FREE",
+	[FLOW_STATE_NEW]	= "NEW",
+	[FLOW_STATE_TYPED]	= "TYPED",
+	[FLOW_STATE_ACTIVE]	= "ACTIVE",
+};
+static_assert(ARRAY_SIZE(flow_state_str) == FLOW_NUM_STATES,
+	      "flow_state_str[] doesn't match enum flow_state");
+
 const char *flow_type_str[] = {
 	[FLOW_TYPE_NONE]	= "<none>",
 	[FLOW_TCP]		= "TCP connection",
@@ -39,46 +48,6 @@ static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
 
 /* Global Flow Table */
 
-/**
- * DOC: Theory of Operation - flow entry life cycle
- *
- * An individual flow table entry moves through these logical states, usually in
- * this order.
- *
- *    FREE - Part of the general pool of free flow table entries
- *        Operations:
- *            - flow_alloc() finds an entry and moves it to ALLOC state
- *
- *    ALLOC - A tentatively allocated entry
- *        Operations:
- *            - flow_alloc_cancel() returns the entry to FREE state
- *            - FLOW_START() set the entry's type and moves to START state
- *        Caveats:
- *            - It's not safe to write fields in the flow entry
- *            - It's not safe to allocate further entries with flow_alloc()
- *            - It's not safe to return to the main epoll loop (use FLOW_START()
- *              to move to START state before doing so)
- *            - It's not safe to use flow_*() logging functions
- *
- *    START - An entry being prepared by flow type specific code
- *        Operations:
- *            - Flow type specific fields may be accessed
- *            - flow_*() logging functions
- *            - flow_alloc_cancel() returns the entry to FREE state
- *        Caveats:
- *            - Returning to the main epoll loop or allocating another entry
- *              with flow_alloc() implicitly moves the entry to ACTIVE state.
- *
- *    ACTIVE - An active flow entry managed by flow type specific code
- *        Operations:
- *            - Flow type specific fields may be accessed
- *            - flow_*() logging functions
- *            - Flow may be expired by returning 'true' from flow type specific
- *              deferred or timer handler.  This will return it to FREE state.
- *        Caveats:
- *            - It's not safe to call flow_alloc_cancel()
- */
-
 /**
  * DOC: Theory of Operation - allocating and freeing flow entries
  *
@@ -132,6 +101,7 @@ static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
 
 unsigned flow_first_free;
 union flow flowtab[FLOW_MAX];
+static const union flow *flow_new_entry; /* = NULL */
 
 /* Last time the flow timers ran */
 static struct timespec flow_timer_run;
@@ -144,6 +114,7 @@ static struct timespec flow_timer_run;
  */
 void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
 {
+	const char *type_or_state;
 	char msg[BUFSIZ];
 	va_list args;
 
@@ -151,40 +122,65 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
 	(void)vsnprintf(msg, sizeof(msg), fmt, args);
 	va_end(args);
 
-	logmsg(pri, "Flow %u (%s): %s", flow_idx(f), FLOW_TYPE(f), msg);
+	/* Show type if it's set, otherwise the state */
+	if (f->state < FLOW_STATE_TYPED)
+		type_or_state = FLOW_STATE(f);
+	else
+		type_or_state = FLOW_TYPE(f);
+
+	logmsg(pri, "Flow %u (%s): %s", flow_idx(f), type_or_state, msg);
+}
+
+/**
+ * flow_set_state() - Change flow's state
+ * @f:		Flow changing state
+ * @state:	New state
+ */
+static void flow_set_state(struct flow_common *f, enum flow_state state)
+{
+	uint8_t oldstate = f->state;
+
+	ASSERT(state < FLOW_NUM_STATES);
+	ASSERT(oldstate < FLOW_NUM_STATES);
+
+	f->state = state;
+	flow_log_(f, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate],
+		  FLOW_STATE(f));
 }
 
 /**
- * flow_start() - Set flow type for new flow and log
- * @flow:	Flow to set type for
+ * flow_set_type() - Set type and move to TYPED
+ * @flow:	Flow to change state
  * @type:	Type for new flow
  * @iniside:	Which side initiated the new flow
  *
  * Return: @flow
- *
- * Should be called before setting any flow type specific fields in the flow
- * table entry.
  */
-union flow *flow_start(union flow *flow, enum flow_type type,
-		       unsigned iniside)
+union flow *flow_set_type(union flow *flow, enum flow_type type,
+			  unsigned iniside)
 {
+	struct flow_common *f = &flow->f;
+
+	ASSERT(type != FLOW_TYPE_NONE);
+	ASSERT(flow_new_entry == flow && f->state == FLOW_STATE_NEW);
+	ASSERT(f->type == FLOW_TYPE_NONE);
+
 	(void)iniside;
-	flow->f.type = type;
-	flow_dbg(flow, "START %s", flow_type_str[flow->f.type]);
+	f->type = type;
+	flow_set_state(f, FLOW_STATE_TYPED);
 	return flow;
 }
 
 /**
- * flow_end() - Clear flow type for finished flow and log
- * @flow:	Flow to clear
+ * flow_activate() - Move flow to ACTIVE
+ * @f:		Flow to change state
  */
-static void flow_end(union flow *flow)
+void flow_activate(struct flow_common *f)
 {
-	if (flow->f.type == FLOW_TYPE_NONE)
-		return; /* Nothing to do */
+	ASSERT(&flow_new_entry->f == f && f->state == FLOW_STATE_TYPED);
 
-	flow_dbg(flow, "END %s", flow_type_str[flow->f.type]);
-	flow->f.type = FLOW_TYPE_NONE;
+	flow_set_state(f, FLOW_STATE_ACTIVE);
+	flow_new_entry = NULL;
 }
 
 /**
@@ -196,9 +192,12 @@ union flow *flow_alloc(void)
 {
 	union flow *flow = &flowtab[flow_first_free];
 
+	ASSERT(!flow_new_entry);
+
 	if (flow_first_free >= FLOW_MAX)
 		return NULL;
 
+	ASSERT(flow->f.state == FLOW_STATE_FREE);
 	ASSERT(flow->f.type == FLOW_TYPE_NONE);
 	ASSERT(flow->free.n >= 1);
 	ASSERT(flow_first_free + flow->free.n <= FLOW_MAX);
@@ -221,7 +220,10 @@ union flow *flow_alloc(void)
 		flow_first_free = flow->free.next;
 	}
 
+	flow_new_entry = flow;
 	memset(flow, 0, sizeof(*flow));
+	flow_set_state(&flow->f, FLOW_STATE_NEW);
+
 	return flow;
 }
 
@@ -233,15 +235,21 @@ union flow *flow_alloc(void)
  */
 void flow_alloc_cancel(union flow *flow)
 {
+	ASSERT(flow_new_entry == flow);
+	ASSERT(flow->f.state == FLOW_STATE_NEW ||
+	       flow->f.state == FLOW_STATE_TYPED);
 	ASSERT(flow_first_free > FLOW_IDX(flow));
 
-	flow_end(flow);
+	flow_set_state(&flow->f, FLOW_STATE_FREE);
+	memset(flow, 0, sizeof(*flow));
+
 	/* Put it back in a length 1 free cluster, don't attempt to fully
	 * reverse flow_alloc()s steps.  This will get folded together the next
	 * time flow_defer_handler runs anyway() */
 	flow->free.n = 1;
 	flow->free.next = flow_first_free;
 	flow_first_free = FLOW_IDX(flow);
+	flow_new_entry = NULL;
 }
 
 /**
@@ -261,11 +269,14 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
 		flow_timer_run = *now;
 	}
 
+	ASSERT(!flow_new_entry); /* Incomplete flow at end of cycle */
+
 	for (idx = 0; idx < FLOW_MAX; idx++) {
 		union flow *flow = &flowtab[idx];
 		bool closed = false;
 
-		if (flow->f.type == FLOW_TYPE_NONE) {
+		switch (flow->f.state) {
+		case FLOW_STATE_FREE: {
 			unsigned skip = flow->free.n;
 
 			/* First entry of a free cluster must have n >= 1 */
@@ -287,6 +298,20 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
 			continue;
 		}
 
+		case FLOW_STATE_NEW:
+		case FLOW_STATE_TYPED:
+			/* Incomplete flow at end of cycle */
+			ASSERT(false);
+			break;
+
+		case FLOW_STATE_ACTIVE:
+			/* Nothing to do */
+			break;
+
+		default:
+			ASSERT(false);
+		}
+
 		switch (flow->f.type) {
 		case FLOW_TYPE_NONE:
 			ASSERT(false);
@@ -310,7 +335,8 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
 		}
 
 		if (closed) {
-			flow_end(flow);
+			flow_set_state(&flow->f, FLOW_STATE_FREE);
+			memset(flow, 0, sizeof(*flow));
 
 			if (free_head) {
 				/* Add slot to current free cluster */
This will get folded together the next * time flow_defer_handler runs anyway() */ flow->free.n = 1; flow->free.next = flow_first_free; flow_first_free = FLOW_IDX(flow); + flow_new_entry = NULL; } /** @@ -261,11 +269,14 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) flow_timer_run = *now; } + ASSERT(!flow_new_entry); /* Incomplete flow at end of cycle */ + for (idx = 0; idx < FLOW_MAX; idx++) { union flow *flow = &flowtab[idx]; bool closed = false; - if (flow->f.type == FLOW_TYPE_NONE) { + switch (flow->f.state) { + case FLOW_STATE_FREE: { unsigned skip = flow->free.n; /* First entry of a free cluster must have n >= 1 */ @@ -287,6 +298,20 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) continue; } + case FLOW_STATE_NEW: + case FLOW_STATE_TYPED: + /* Incomplete flow at end of cycle */ + ASSERT(false); + break; + + case FLOW_STATE_ACTIVE: + /* Nothing to do */ + break; + + default: + ASSERT(false); + } + switch (flow->f.type) { case FLOW_TYPE_NONE: ASSERT(false); @@ -310,7 +335,8 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) } if (closed) { - flow_end(flow); + flow_set_state(&flow->f, FLOW_STATE_FREE); + memset(flow, 0, sizeof(*flow)); if (free_head) { /* Add slot to current free cluster */ diff --git a/flow.h b/flow.h index c943c441..e61d35e8 100644 --- a/flow.h +++ b/flow.h @@ -9,6 +9,68 @@ #define FLOW_TIMER_INTERVAL 1000 /* ms */ +/** + * enum flow_state - States of a flow table entry + * + * An individual flow table entry moves through these states, usually in this + * order. + * General rules: + * - Code outside flow.c should never write common fields of union flow. + * - The state field may always be read. + * + * FREE - Part of the general pool of free flow table entries + * Operations: + * - flow_alloc() finds an entry and moves it to NEW + * + * NEW - Freshly allocated, uninitialised entry + * Operations: + * - flow_alloc_cancel() returns the entry to FREE + * - FLOW_SET_TYPE() sets the entry's type and moves to TYPED + * Caveats: + * - No fields other than state may be accessed + * - At most one entry may be NEW or TYPED at a time, so it's unsafe + * to use flow_alloc() again until this entry moves to ACTIVE or + * FREE + * - You may not return to the main epoll loop while any flow is NEW + * + * TYPED - Generic info initialised, type specific initialisation underway + * Operations: + * - All common fields may be read + * - Type specific fields may be read and written + * - flow_alloc_cancel() returns the entry to FREE + * - FLOW_ACTIVATE() moves the entry to ACTIVE + * Caveats: + * - At most one entry may be NEW or TYPED at a time, so it's unsafe + * to use flow_alloc() again until this entry moves to ACTIVE or + * FREE + * - You may not return to the main epoll loop while any flow is + * TYPED + * + * ACTIVE - An active, fully-initialised flow entry + * Operations: + * - All common fields may be read + * - Type specific fields may be read and written + * - Flow returns to FREE when it expires, signalled by returning + * 'true' from flow type specific deferred or timer handler + * Caveats: + * - flow_alloc_cancel() may not be called on it + */ +enum flow_state { + FLOW_STATE_FREE, + FLOW_STATE_NEW, + FLOW_STATE_TYPED, + FLOW_STATE_ACTIVE, + + FLOW_NUM_STATES, +}; +#define FLOW_STATE_BITS 8 +static_assert(FLOW_NUM_STATES <= (1 << FLOW_STATE_BITS), + "Too many flow states for FLOW_STATE_BITS"); + +extern const char *flow_state_str[]; +#define FLOW_STATE(f) \ + ((f)->state < FLOW_NUM_STATES ? 
@@ -49,11 +125,6 @@ struct flow_common {
 #define FLOW_TABLE_PRESSURE	30	/* % of FLOW_MAX */
 #define FLOW_FILE_PRESSURE	30	/* % of c->nofile */
 
-union flow *flow_start(union flow *flow, enum flow_type type,
-		       unsigned iniside);
-#define FLOW_START(flow_, t_, var_, i_)				\
-	(&flow_start((flow_), (t_), (i_))->var_)
-
 /**
  * struct flow_sidx - ID for one side of a specific flow
  * @side:	Side referenced (0 or 1)
diff --git a/flow_table.h b/flow_table.h
index b7e5529a..17b9ea70 100644
--- a/flow_table.h
+++ b/flow_table.h
@@ -107,4 +107,13 @@ static inline flow_sidx_t flow_sidx(const struct flow_common *f,
 union flow *flow_alloc(void);
 void flow_alloc_cancel(union flow *flow);
 
+union flow *flow_set_type(union flow *flow, enum flow_type type,
+			  unsigned iniside);
+#define FLOW_SET_TYPE(flow_, t_, var_, i_)	\
+	(&flow_set_type((flow_), (t_), (i_))->var_)
+
+void flow_activate(struct flow_common *f);
+#define FLOW_ACTIVATE(flow_)			\
+	(flow_activate(&(flow_)->f))
+
 #endif /* FLOW_TABLE_H */
diff --git a/icmp.c b/icmp.c
index eb559c18..fc729557 100644
--- a/icmp.c
+++ b/icmp.c
@@ -167,7 +167,7 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c,
 	if (!flow)
 		return NULL;
 
-	pingf = FLOW_START(flow, flowtype, ping, TAPSIDE);
+	pingf = FLOW_SET_TYPE(flow, flowtype, ping, TAPSIDE);
 
 	pingf->seq = -1;
 	pingf->id = id;
@@ -198,6 +198,8 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c,
 
 	*id_sock = pingf;
 
+	FLOW_ACTIVATE(pingf);
+
 	return pingf;
 
 cancel:
diff --git a/tcp.c b/tcp.c
index 4512af0b..64fe46e0 100644
--- a/tcp.c
+++ b/tcp.c
@@ -2004,7 +2004,7 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
 		goto cancel;
 	}
 
-	conn = FLOW_START(flow, FLOW_TCP, tcp, TAPSIDE);
+	conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp, TAPSIDE);
 	conn->sock = s;
 	conn->timer = -1;
 	conn_event(c, conn, TAP_SYN_RCVD);
@@ -2075,6 +2075,7 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
 	}
 
 	tcp_epoll_ctl(c, conn);
+	FLOW_ACTIVATE(conn);
 	return;
 
 cancel:
@@ -2715,7 +2716,8 @@ static void tcp_tap_conn_from_sock(struct ctx *c, in_port_t dstport,
 				   const union sockaddr_inany *sa,
 				   const struct timespec *now)
 {
-	struct tcp_tap_conn *conn = FLOW_START(flow, FLOW_TCP, tcp, SOCKSIDE);
+	struct tcp_tap_conn *conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp,
+						  SOCKSIDE);
 
 	conn->sock = s;
 	conn->timer = -1;
@@ -2738,6 +2740,8 @@ static void tcp_tap_conn_from_sock(struct ctx *c, in_port_t dstport,
 	conn_flag(c, conn, ACK_FROM_TAP_DUE);
 
 	tcp_get_sndbuf(conn);
+
+	FLOW_ACTIVATE(conn);
 }
 
 /**
diff --git a/tcp_splice.c b/tcp_splice.c
index fb00bc22..852a93b4 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -471,7 +471,7 @@ bool tcp_splice_conn_from_sock(const struct ctx *c,
 		return false;
 	}
 
-	conn = FLOW_START(flow, FLOW_TCP_SPLICE, tcp_splice, 0);
+	conn = FLOW_SET_TYPE(flow, FLOW_TCP_SPLICE, tcp_splice, 0);
 
 	conn->flags = af == AF_INET ? 0 : SPLICE_V6;
 	conn->s[0] = s0;
@@ -485,6 +485,8 @@ bool tcp_splice_conn_from_sock(const struct ctx *c,
 	if (tcp_splice_connect(c, conn, af, pif1, dstport))
 		conn_flag(c, conn, CLOSING);
 
+	FLOW_ACTIVATE(conn);
+
 	return true;
 }
 
-- 
2.45.1
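A note on the resulting diagnostics, since the message above promises
logged transitions: flow_set_state() reports each change through
flow_log_(), which prints the flow's type once it has reached TYPED and
its state name otherwise.  For a TCP connection in slot 0, the lifecycle
sketched near the top should therefore produce debug messages roughly
like the following (log prefix omitted; not taken from an actual run):

	Flow 0 (NEW): FREE -> NEW
	Flow 0 (TCP connection): NEW -> TYPED
	Flow 0 (TCP connection): TYPED -> ACTIVE
	Flow 0 (FREE): ACTIVE -> FREE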