On Tue, Aug 19, 2025 at 11:10:05PM -0400, Jon Maloy wrote:
We add a cache table to keep partial contents of the kernel ARP/NDP tables. This way, we drastically reduce the number of netlink calls to read those tables.
Do you have data to suggest this is necessary? It's a lot of code to optimise something only needed for some pretty uncommon cases.
We create dummy cache entries representing non-(not-yet)-existing ARP/NDP entries when needed. We add a short expiration time to each such entry, so that we can know when to make repeated calls to the kernel tables in the beginning. We also add an access counter to the entries, to ensure that the timer becomes longer and the call frequency abates over time if no ARP/NDP entry is created.
For regular entries we use a much longer timer, with the purpose to update the entry in the rare case that a remote host changes its MAC address.
Signed-off-by: Jon Maloy
--- arp.c | 3 +- conf.c | 2 + flow.c | 5 +- fwd.c | 206 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ fwd.h | 4 ++ ndp.c | 3 +- tcp.c | 3 +- 7 files changed, 218 insertions(+), 8 deletions(-) diff --git a/arp.c b/arp.c index c37867a..040d4fe 100644 --- a/arp.c +++ b/arp.c @@ -29,7 +29,6 @@ #include "dhcp.h" #include "passt.h" #include "tap.h" -#include "netlink.h"
/** * arp() - Check if this is a supported ARP message, reply as needed @@ -79,7 +78,7 @@ int arp(const struct ctx *c, const struct pool *p) */ inany_from_af(&tgt, AF_INET, am->tip); if (!fwd_inany_nat(c, &tgt)) - nl_neigh_mac_get(nl_sock, &tgt, c->ifi4, am->sha); + fwd_neigh_mac_get(c, &tgt, c->ifi4, am->sha);
memcpy(swap, am->tip, sizeof(am->tip)); memcpy(am->tip, am->sip, sizeof(am->tip)); diff --git a/conf.c b/conf.c index f47f48e..0abdbf7 100644 --- a/conf.c +++ b/conf.c @@ -2122,6 +2122,8 @@ void conf(struct ctx *c, int argc, char **argv) c->udp.fwd_out.mode = fwd_default;
fwd_scan_ports_init(c); + if (fwd_mac_cache_init()) + die("Failed to initiate neighnor MAC cache");
"neighnor"
if (!c->quiet) conf_print(c); diff --git a/flow.c b/flow.c index d7b3fd1..3ca30f1 100644 --- a/flow.c +++ b/flow.c @@ -459,7 +459,7 @@ struct flowside *flow_target(const struct ctx *c, union flow *flow, */ ifi = inany_v4(&ini->oaddr) ? c->ifi4 : c->ifi6; if (!fwd_inany_nat(c, &ini->oaddr)) - nl_neigh_mac_get(nl_sock, &ini->oaddr, ifi, f->tap_omac); + fwd_neigh_mac_get(c, &ini->oaddr, ifi, f->tap_omac); break;
case PIF_SPLICE: @@ -474,7 +474,7 @@ struct flowside *flow_target(const struct ctx *c, union flow *flow, */ ifi = inany_v4(&ini->eaddr) ? c->ifi4 : c->ifi6; if (!fwd_inany_nat(c, &ini->eaddr)) - nl_neigh_mac_get(nl_sock, &ini->eaddr, ifi, f->tap_omac); + fwd_neigh_mac_get(c, &ini->eaddr, ifi, f->tap_omac); break;
default: @@ -491,6 +491,7 @@ struct flowside *flow_target(const struct ctx *c, union flow *flow,
f->pif[TGTSIDE] = tgtpif; flow_set_state(f, FLOW_STATE_TGT); + return tgt; }
diff --git a/fwd.c b/fwd.c index 55bf5f2..6647764 100644 --- a/fwd.c +++ b/fwd.c @@ -19,6 +19,8 @@ #include
#include #include +#include +#include #include "util.h" #include "ip.h" @@ -26,6 +28,8 @@ #include "passt.h" #include "lineread.h" #include "flow_table.h" +#include "inany.h" +#include "netlink.h"
/* Empheral port range: values from RFC 6335 */ static in_port_t fwd_ephemeral_min = (1 << 15) + (1 << 14); @@ -33,6 +37,208 @@ static in_port_t fwd_ephemeral_max = NUM_PORTS - 1;
#define PORT_RANGE_SYSCTL "/proc/sys/net/ipv4/ip_local_port_range"
+#define MAC_CACHE_BUCKETS 1024 +#define MAC_CACHE_RENEWAL 3600 /* Refresh entry from ARP/NDP every hour */ + +/* Partial cache of ARP/NDP table contents */ +struct mac_cache_entry { + union inany_addr key; + unsigned char mac[ETH_ALEN]; + struct timespec expiry; + uint32_t count; + struct mac_cache_entry *next; +}; + +struct mac_cache_table { + struct mac_cache_entry **buckets; + size_t nbuckets; +}; + +static struct mac_cache_table mac_cache; + +/** + * timespec_before() - Check the relation between two pints in time + * @a: Point in time to be tested + * @b: Point in time test a against + * Return: True if a comes before b, otherwise b + */ +static inline bool timespec_before(const struct timespec *a, + const struct timespec *b) +{ + return (a->tv_sec < b->tv_sec) || + (a->tv_sec == b->tv_sec && a->tv_nsec < b->tv_nsec); +} + +/** + * mac_entry_is_dummy() - Check if a cache entry is a placeholder + * @c: Execution context + * @e: Cache entry + * + * Return: True if the entry is a placeholder, false otherwise + */ +bool mac_entry_is_dummy(const struct ctx *c, const struct mac_cache_entry *e) +{ + return !memcmp(c->our_tap_mac, e->mac, ETH_ALEN); +} + +/** + * mac_entry_expired() - Check if a cache entry has expired + * @e: Cache entry + * + * Return: True if the entry has expired, false otherwise + */ +static bool mac_entry_expired(const struct mac_cache_entry *e) +{ + struct timespec now; + + clock_gettime(CLOCK_MONOTONIC, &now); + return timespec_before(&e->expiry, &now); +} + +/** + * mac_entry_set_expiry() - Set the time for a cache entry to expire + * @e: Cache entry + * @expiry: Expiration time, in seconds from current moment. + * + * Return: The result of the hash + */ +static void mac_entry_set_expiry(struct mac_cache_entry *e, int expiry) +{ + clock_gettime(CLOCK_MONOTONIC, &e->expiry); + e->expiry.tv_sec += expiry; +} + +/** + * inany_hash32() - Hash the contenst of key into an integer + * @key: IPv4 or IPv6 address, used as key + * + * Return: The result of the hash + */ +static inline uint32_t inany_hash32(const struct ctx *c, + const union inany_addr *key) +{ + struct siphash_state st = SIPHASH_INIT(c->hash_secret); + + inany_siphash_feed(&st, key); + + return (uint32_t)siphash_final(&st, sizeof(*key), 0); +} + +/** + * fwd_mac_cache_bucket_idx() - Find the table index of an entry + * @c: Execution context + * @key: IPv4 or IPv6 address, used as key for the hash lookup + * @nbuckets: Number of buckets in the table + * + * Return: The index found + */ +static inline size_t fwd_mac_cache_bucket_idx(const struct ctx *c, + const union inany_addr *key, + size_t nbuckets) +{ + uint32_t h = inany_hash32(c, key); + + return (nbuckets & (nbuckets - 1)) ? (h % nbuckets) : (h & (nbuckets - 1));
I don't think this is worth the complexity: I wouldn't be confident the conditional branch is less expensive than an integer divmod. And if it is, it wouldn't suprise me if the compiler did that optimisation itself.
+} + +/** + * fwd_mac_cache_find() - Find an entry in the ARP/NDP cache table + * @c: Execution context + * @key: IPv4 or IPv6 address, used as key for the hash lookup + * + * Return: Pointer to the entry on success, NULL on failure. + */ +static struct mac_cache_entry *fwd_mac_cache_find(const struct ctx *c, + const union inany_addr *key) +{ + const struct mac_cache_table *t = &mac_cache; + struct mac_cache_entry *e; + size_t idx; + + idx = fwd_mac_cache_bucket_idx(c, key, t->nbuckets); + for (e = t->buckets[idx]; e; e = e->next) + if (inany_equals(&e->key, key)) + return e; + return NULL; +} + +/** + * fwd_mac_cache_add() - Add a new entry to the ARP/NDP cache table + * @c: Execution context + * @key: IPv4 or IPv6 address, used as key for the hash lookup + * @mac: Buffer for Ethernet MAC, left unchanged if not found/usable + * + * Return: Pointer to the new entry on success, NULL on failure. + */ +static struct mac_cache_entry *fwd_mac_cache_add(const struct ctx *c, + const union inany_addr *key, + const unsigned char *mac) +{ + struct mac_cache_table *t = &mac_cache; + size_t idx = fwd_mac_cache_bucket_idx(c, key, t->nbuckets); + struct mac_cache_entry *e; + + e = calloc(1, sizeof(*e)); + if (!e) + return NULL; + e->key = *key; + memcpy(e->mac, mac, ETH_ALEN); + e->count = 0; + e->next = t->buckets[idx]; + t->buckets[idx] = e; + + return e; +} + +/** + * fwd_neigh_mac_get() - Find a MAC address the ARP/NDP cache table + * @c: Execution context + * @addr: IPv4 or IPv6 address + * @ifi: Interface index + * @mac: Buffer for Ethernet MAC, left unchanged if not found/usable + * + * Return: 0 on success, -1 on failure. + */ +int fwd_neigh_mac_get(const struct ctx *c, const union inany_addr *addr, + int ifi, unsigned char *mac) +{ + struct mac_cache_entry *e = fwd_mac_cache_find(c, addr); + bool refresh = false; + + if (e) + refresh = mac_entry_expired(e); + else if ((e = fwd_mac_cache_add(c, addr, mac))) + refresh = true; + else + return -1; + + if (refresh) { + nl_neigh_mac_get(nl_sock, addr, ifi, e->mac); + if (mac_entry_is_dummy(c, e)) + mac_entry_set_expiry(e, e->count++); + else + mac_entry_set_expiry(e, MAC_CACHE_RENEWAL); + } + memcpy(mac, e->mac, ETH_ALEN); + + return 0; +} + +/** + * fwd_mac_cache_init() - Initiate ARP/NDP cache table + * + * Return: 0 on success, -1 on failure. + */ +int fwd_mac_cache_init(void) +{ + struct mac_cache_table *t = &mac_cache; + + t->nbuckets = MAC_CACHE_BUCKETS; + t->buckets = calloc(t->nbuckets, sizeof(*t->buckets)); + + return t->buckets ? 0 : -1; +} + /** fwd_probe_ephemeral() - Determine what ports this host considers ephemeral * * Work out what ports the host thinks are emphemeral and record it for later diff --git a/fwd.h b/fwd.h index c8d485d..ebfc114 100644 --- a/fwd.h +++ b/fwd.h @@ -58,4 +58,8 @@ uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto, const struct flowside *ini, struct flowside *tgt); bool fwd_inany_nat(const struct ctx *c, const union inany_addr *addr);
+int fwd_neigh_mac_get(const struct ctx *c, const union inany_addr *addr, + int ifi, unsigned char *mac); +int fwd_mac_cache_init(void); + #endif /* FWD_H */ diff --git a/ndp.c b/ndp.c index 19b9b28..af957c9 100644 --- a/ndp.c +++ b/ndp.c @@ -32,7 +32,6 @@ #include "passt.h" #include "tap.h" #include "log.h" -#include "netlink.h"
#define RT_LIFETIME 65535
@@ -222,7 +221,7 @@ static void ndp_na(const struct ctx *c, const struct in6_addr *dst, */ inany_from_af(&tgt, AF_INET6, addr); if (!fwd_inany_nat(c, &tgt)) - nl_neigh_mac_get(nl_sock, &tgt, c->ifi6, na.target_l2_addr.mac); + fwd_neigh_mac_get(c, &tgt, c->ifi6, na.target_l2_addr.mac);
ndp_send(c, dst, &na, sizeof(na)); } diff --git a/tcp.c b/tcp.c index 28f9ef5..6b5b4ff 100644 --- a/tcp.c +++ b/tcp.c @@ -309,7 +309,6 @@ #include "tcp_internal.h" #include "tcp_buf.h" #include "tcp_vu.h" -#include "netlink.h"
#ifndef __USE_MISC /* From Linux UAPI, missing in netinet/tcp.h provided by musl */ @@ -1908,7 +1907,7 @@ static void tcp_rst_no_conn(const struct ctx *c, int af, memcpy(src_mac, c->our_tap_mac, ETH_ALEN); inany_from_af(&tgt, af, daddr); if (!fwd_inany_nat(c, &tgt)) - nl_neigh_mac_get(nl_sock, &tgt, ifi, src_mac); + fwd_neigh_mac_get(c, &tgt, ifi, src_mac);
if (af == AF_INET) { struct iphdr *ip4h = tap_push_l2h(c, buf, src_mac, ETH_P_IP);
-- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson