We add a cache table to keep partial contents of the kernel ARP/NDP
tables. This way, we drastically reduce the number of netlink calls
to read those tables.
We create placeholder cache entries representing nonexistent or
not-yet-existing ARP/NDP entries when needed. Each such entry gets a
short expiration time, so that we know when to query the kernel tables
again early on. We also add an access counter to the entries, so that
the timer becomes longer and the call frequency abates over time if no
ARP/NDP entry shows up.
For regular entries we use a much longer timer, in order to update
the entry in the rare case that a remote host changes its MAC
address.
Signed-off-by: Jon Maloy
---
v5: - Moved to earlier in series to reduce rebase conflicts
---
conf.c | 2 +
fwd.c | 206 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
fwd.h | 11 +++
netlink.c | 2 -
4 files changed, 219 insertions(+), 2 deletions(-)
diff --git a/conf.c b/conf.c
index f47f48e..11d9d63 100644
--- a/conf.c
+++ b/conf.c
@@ -2122,6 +2122,8 @@ void conf(struct ctx *c, int argc, char **argv)
c->udp.fwd_out.mode = fwd_default;
fwd_scan_ports_init(c);
+ if (fwd_mac_cache_init())
+ die("Failed to initiate neighbour MAC cache");
if (!c->quiet)
conf_print(c);
diff --git a/fwd.c b/fwd.c
index 250cf56..236e58e 100644
--- a/fwd.c
+++ b/fwd.c
@@ -19,6 +19,8 @@
#include
#include
#include
+#include
+#include
#include "util.h"
#include "ip.h"
@@ -26,6 +28,8 @@
#include "passt.h"
#include "lineread.h"
#include "flow_table.h"
+#include "inany.h"
+#include "netlink.h"
/* Empheral port range: values from RFC 6335 */
static in_port_t fwd_ephemeral_min = (1 << 15) + (1 << 14);
@@ -33,6 +37,208 @@ static in_port_t fwd_ephemeral_max = NUM_PORTS - 1;
#define PORT_RANGE_SYSCTL "/proc/sys/net/ipv4/ip_local_port_range"
+#define MAC_CACHE_BUCKETS 1024 /* Must be power of two */
+#define MAC_CACHE_RENEWAL 3600 /* Refresh entry from ARP/NDP every hour */
+
+/* Partial cache of ARP/NDP table contents */
+struct mac_cache_entry {
+ union inany_addr key; /* IPv4 or IPv6 address, hash lookup key */
+ unsigned char mac[ETH_ALEN]; /* MAC address, all zeroes in a placeholder */
+ struct timespec expiry; /* CLOCK_MONOTONIC time of next refresh */
+ uint32_t count; /* Accesses without MAC found, back-off in seconds */
+
+ /* Hash bucket chain */
+ struct mac_cache_entry *next;
+};
+
+struct mac_cache_table {
+ struct mac_cache_entry **buckets; /* Array of bucket chain heads */
+ size_t nbuckets; /* Number of buckets, must be a power of two */
+ size_t size; /* Presumably the number of cached entries */
+};
+
+static struct mac_cache_table mac_cache; /* Set up by fwd_mac_cache_init() */
+const uint8_t undefined_mac[6] = {0, 0, 0, 0, 0, 0}; /* All-zero MAC address */
+
+/**
+ * fwd_mac_cache_bucket_idx() - Find the table index of an entry
+ * @c: Execution context, provides the secret for the keyed hash
+ * @key: IPv4 or IPv6 address, used as key for the hash lookup
+ * @nbuckets: Number of buckets in the table, must be a power of two
+ *
+ * Return: The bucket index, i.e. the hash of @key masked by @nbuckets - 1
+ */
+static inline size_t fwd_mac_cache_bucket_idx(const struct ctx *c,
+ const union inany_addr *key,
+ size_t nbuckets)
+{
+ struct siphash_state st = SIPHASH_INIT(c->hash_secret);
+ uint32_t h;
+
+ inany_siphash_feed(&st, key);
+ h = siphash_final(&st, sizeof(*key), 0);
+
+ return ((size_t)h) & (nbuckets - 1); /* Mask instead of modulo */
+}
+
+/**
+ * timespec_before() - Check the relation between two points in time
+ * @a: Point in time to be tested
+ * @b: Point in time to test @a against
+ * Return: True if @a comes strictly before @b, false otherwise
+ */
+static inline bool timespec_before(const struct timespec *a,
+ const struct timespec *b)
+{
+ return (a->tv_sec < b->tv_sec) ||
+ (a->tv_sec == b->tv_sec && a->tv_nsec < b->tv_nsec);
+}
+
+/**
+ * mac_entry_placeholder() - Check if a cache entry is a placeholder
+ * @e:		Cache entry
+ *
+ * Return: true if the entry's MAC address is all zeroes, false otherwise
+ */
+static bool mac_entry_placeholder(const struct mac_cache_entry *e)
+{
+	return mac_undefined(e->mac);
+}
+
+/**
+ * mac_entry_expired() - Check if a cache entry has expired
+ * @e: Cache entry
+ *
+ * Return: True if the entry's expiry time lies before now, false otherwise
+ */
+static bool mac_entry_expired(const struct mac_cache_entry *e)
+{
+ struct timespec now;
+
+ clock_gettime(CLOCK_MONOTONIC, &now); /* Same clock as the expiry stamp */
+ return timespec_before(&e->expiry, &now);
+}
+
+/**
+ * mac_entry_set_expiry() - Set the time for a cache entry to expire
+ * @e: Cache entry
+ * @expiry: Expiration time, in seconds from current moment.
+ *
+ * The expiry is an absolute CLOCK_MONOTONIC timestamp; nothing is returned.
+ */
+static void mac_entry_set_expiry(struct mac_cache_entry *e, int expiry)
+{
+ clock_gettime(CLOCK_MONOTONIC, &e->expiry);
+ e->expiry.tv_sec += expiry;
+}
+
+/**
+ * fwd_mac_cache_find() - Find an entry in the ARP/NDP cache table
+ * @c: Execution context
+ * @key: IPv4 or IPv6 address, used as key for the hash lookup
+ *
+ * Return: Pointer to the first entry matching @key, NULL if none is cached.
+ */
+static struct mac_cache_entry *fwd_mac_cache_find(const struct ctx *c,
+ const union inany_addr *key)
+{
+ const struct mac_cache_table *t = &mac_cache;
+ struct mac_cache_entry *e;
+ size_t idx;
+
+ idx = fwd_mac_cache_bucket_idx(c, key, t->nbuckets);
+ for (e = t->buckets[idx]; e; e = e->next) /* Walk the bucket chain */
+ if (inany_equals(&e->key, key))
+ return e;
+ return NULL;
+}
+
+/**
+ * fwd_mac_cache_add() - Add a new entry to the ARP/NDP cache table
+ * @c:		Execution context
+ * @key:	IPv4 or IPv6 address, used as key for the hash lookup
+ * @mac:	Initial MAC address for the entry, all zeroes for a placeholder
+ *
+ * Return: pointer to new entry (zero expiry and count), NULL if calloc() fails
+ */
+static struct mac_cache_entry *fwd_mac_cache_add(const struct ctx *c,
+						 const union inany_addr *key,
+						 const unsigned char *mac)
+{
+	struct mac_cache_table *t = &mac_cache;
+	size_t idx = fwd_mac_cache_bucket_idx(c, key, t->nbuckets);
+	struct mac_cache_entry *e;
+
+	e = calloc(1, sizeof(*e));	/* Zeroed: no need to reset ->count */
+	if (!e)
+		return NULL;
+
+	e->key = *key;
+	memcpy(e->mac, mac, ETH_ALEN);
+	e->next = t->buckets[idx];	/* Push onto head of bucket chain */
+	t->buckets[idx] = e;
+	t->size++;		/* NOTE(review): no eviction, grows unbounded */
+
+	return e;
+}
+
+/**
+ * fwd_neigh_mac_get() - Find a MAC address in the ARP/NDP cache table
+ * @c:		Execution context
+ * @addr:	IPv4 or IPv6 address
+ * @mac:	Buffer for Ethernet MAC to return: the neighbour's MAC if
+ *		found, c->our_tap_mac if not, untouched if allocation fails
+ *
+ * Return: true if real MAC found, false if not found or failure
+ */
+bool fwd_neigh_mac_get(const struct ctx *c, const union inany_addr *addr,
+		       uint8_t *mac)
+{
+	struct mac_cache_entry *e = fwd_mac_cache_find(c, addr);
+	int ifi = inany_v4(addr) ? c->ifi4 : c->ifi6;
+	bool refresh = false, ret = false;
+
+	/* @mac is output only: a new entry must start out as a placeholder */
+	if (e)
+		refresh = mac_entry_expired(e);
+	else if ((e = fwd_mac_cache_add(c, addr, undefined_mac)))
+		refresh = true;
+	else
+		return false;
+
+	if (!refresh) {
+		ret = !mac_entry_placeholder(e);
+	} else {
+		ret = nl_neigh_mac_get(nl_sock, addr, ifi, e->mac);
+		mac_entry_set_expiry(e, MAC_CACHE_RENEWAL);
+	}
+
+	if (ret) {
+		memcpy(mac, e->mac, ETH_ALEN);
+		return true;
+	}
+
+	/* Do linear back-off of new netlink calls if nothing found */
+	mac_entry_set_expiry(e, e->count++);
+	memcpy(mac, c->our_tap_mac, ETH_ALEN);
+	return false;
+}
+
+/**
+ * fwd_mac_cache_init() - Initiate ARP/NDP cache table
+ *
+ * Return: 0 on success, -1 if the bucket array could not be allocated.
+ */
+int fwd_mac_cache_init(void)
+{
+ struct mac_cache_table *t = &mac_cache;
+
+ t->nbuckets = MAC_CACHE_BUCKETS;
+ t->buckets = calloc(t->nbuckets, sizeof(*t->buckets)); /* Zeroed array */
+ t->size = 0;
+ return t->buckets ? 0 : -1;
+}
+
/** fwd_probe_ephemeral() - Determine what ports this host considers ephemeral
*
* Work out what ports the host thinks are emphemeral and record it for later
diff --git a/fwd.h b/fwd.h
index 65c7c96..80da4b1 100644
--- a/fwd.h
+++ b/fwd.h
@@ -42,6 +42,13 @@ struct fwd_ports {
in_port_t delta[NUM_PORTS];
};
+extern const unsigned char undefined_mac[];
+
+static inline bool mac_undefined(const uint8_t *mac)
+{
+ return !memcmp(mac, undefined_mac, 6); /* True if @mac is all zeroes */
+}
+
void fwd_scan_ports_tcp(struct fwd_ports *fwd, const struct fwd_ports *rev);
void fwd_scan_ports_udp(struct fwd_ports *fwd, const struct fwd_ports *rev,
const struct fwd_ports *tcp_fwd,
@@ -57,4 +64,8 @@ uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto,
const struct flowside *ini, struct flowside *tgt);
+bool fwd_neigh_mac_get(const struct ctx *c, const union inany_addr *addr,
+ uint8_t *mac);
+int fwd_mac_cache_init(void);
+
#endif /* FWD_H */
diff --git a/netlink.c b/netlink.c
index 1ca2c9a..3ba0597 100644
--- a/netlink.c
+++ b/netlink.c
@@ -878,8 +878,6 @@ bool nl_neigh_mac_get(int s, const union inany_addr *addr,
}
}
}
- if (status < 0)
- warn("netlink: RTM_NEWNEIGH failed: %s", strerror_(-status));
return found;
}
--
2.50.1