This series of patches adds vhost-user support to passt, allowing passt to
connect to QEMU's network backend using virtqueues rather than a stream
socket.

On the QEMU side, rather than connecting with:

  -netdev stream,id=s,server=off,addr.type=unix,addr.path=/tmp/passt_1.socket

we will use:

  -chardev socket,id=chr0,path=/tmp/passt_1.socket
  -netdev vhost-user,id=netdev0,chardev=chr0
  -device virtio-net,netdev=netdev0
  -object memory-backend-memfd,id=memfd0,share=on,size=$RAMSIZE
  -numa node,memdev=memfd0

The memory backend is needed to share data between passt and QEMU.

Performance comparison between "-netdev stream" and "-netdev vhost-user":

$ iperf3 -c localhost -p 10001 -t 60 -6 -u -b 50G

socket:
[ 5]   0.00-60.05 sec  95.6 GBytes  13.7 Gbits/sec  0.017 ms  6998988/10132413 (69%)    receiver
vhost-user:
[ 5]   0.00-60.04 sec   237 GBytes  33.9 Gbits/sec  0.006 ms  53673/7813770 (0.69%)     receiver

$ iperf3 -c localhost -p 10001 -t 60 -4 -u -b 50G

socket:
[ 5]   0.00-60.05 sec  98.9 GBytes  14.1 Gbits/sec  0.018 ms  6260735/9501832 (66%)     receiver
vhost-user:
[ 5]   0.00-60.05 sec   235 GBytes  33.7 Gbits/sec  0.008 ms  37581/7752699 (0.48%)     receiver

$ iperf3 -c localhost -p 10001 -t 60 -6

socket:
[ 5]   0.00-60.00 sec  17.3 GBytes  2.48 Gbits/sec    0  sender
[ 5]   0.00-60.06 sec  17.3 GBytes  2.48 Gbits/sec       receiver
vhost-user:
[ 5]   0.00-60.00 sec   191 GBytes  27.4 Gbits/sec    0  sender
[ 5]   0.00-60.05 sec   191 GBytes  27.3 Gbits/sec       receiver

$ iperf3 -c localhost -p 10001 -t 60 -4

socket:
[ 5]   0.00-60.00 sec  15.6 GBytes  2.24 Gbits/sec    0  sender
[ 5]   0.00-60.06 sec  15.6 GBytes  2.24 Gbits/sec       receiver
vhost-user:
[ 5]   0.00-60.00 sec   189 GBytes  27.1 Gbits/sec    0  sender
[ 5]   0.00-60.04 sec   189 GBytes  27.0 Gbits/sec       receiver

Laurent Vivier (5):
  packet: replace struct desc by struct iovec
  vhost-user: introduce virtio API
  vhost-user: introduce vhost-user API
  iov: add iov_count()
  vhost-user: add vhost-user

 Makefile       |    4 +-
 checksum.c     |    1 -
 conf.c         |   18 +-
 iov.c          |   35 +-
 iov.h          |    2 +
 packet.c       |   81 ++--
 packet.h       |   16 +-
 passt.c        |   14 +-
 passt.h        |   10 +
 pcap.c         |    1 -
 tap.c          |  106 ++++-
 tap.h          |    5 +-
 tcp.c          |   17 +-
 tcp_vu.c       |  547 ++++++++++++++++++++++++
 tcp_vu.h       |    9 +
 udp.c          |   54 ++-
 udp_internal.h |   39 ++
 udp_vu.c       |  237 +++++++++++
 udp_vu.h       |    8 +
 util.h         |   11 +
 vhost_user.c   | 1077 ++++++++++++++++++++++++++++++++++++++++++++++++
 vhost_user.h   |  137 ++++++
 virtio.c       |  442 ++++++++++++++++++++
 virtio.h       |  122 ++++++
 24 files changed, 2886 insertions(+), 107 deletions(-)
 create mode 100644 tcp_vu.c
 create mode 100644 tcp_vu.h
 create mode 100644 udp_internal.h
 create mode 100644 udp_vu.c
 create mode 100644 udp_vu.h
 create mode 100644 vhost_user.c
 create mode 100644 vhost_user.h
 create mode 100644 virtio.c
 create mode 100644 virtio.h

-- 
2.45.2
Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- packet.c | 75 +++++++++++++++++++++++++++++++------------------------- packet.h | 14 ++--------- 2 files changed, 43 insertions(+), 46 deletions(-) diff --git a/packet.c b/packet.c index ccfc84607709..af2a539a1794 100644 --- a/packet.c +++ b/packet.c @@ -22,6 +22,36 @@ #include "util.h" #include "log.h" +static int packet_check_range(const struct pool *p, size_t offset, size_t len, + const char *start, const char *func, int line) +{ + if (start < p->buf) { + if (func) { + trace("add packet start %p before buffer start %p, " + "%s:%i", (void *)start, (void *)p->buf, func, line); + } + return -1; + } + + if (start + len + offset > p->buf + p->buf_size) { + if (func) { + trace("packet offset plus length %lu from size %lu, " + "%s:%i", start - p->buf + len + offset, + p->buf_size, func, line); + } + return -1; + } + +#if UINTPTR_MAX == UINT64_MAX + if ((uintptr_t)start - (uintptr_t)p->buf > UINT32_MAX) { + trace("add packet start %p, buffer start %p, %s:%i", + (void *)start, (void *)p->buf, func, line); + return -1; + } +#endif + + return 0; +} /** * packet_add_do() - Add data as packet descriptor to given pool * @p: Existing pool @@ -41,34 +71,16 @@ void packet_add_do(struct pool *p, size_t len, const char *start, return; } - if (start < p->buf) { - trace("add packet start %p before buffer start %p, %s:%i", - (void *)start, (void *)p->buf, func, line); + if (packet_check_range(p, 0, len, start, func, line)) return; - } - - if (start + len > p->buf + p->buf_size) { - trace("add packet start %p, length: %zu, buffer end %p, %s:%i", - (void *)start, len, (void *)(p->buf + p->buf_size), - func, line); - return; - } if (len > UINT16_MAX) { trace("add packet length %zu, %s:%i", len, func, line); return; } -#if UINTPTR_MAX == UINT64_MAX - if ((uintptr_t)start - (uintptr_t)p->buf > UINT32_MAX) { - trace("add packet start %p, buffer start %p, %s:%i", - (void *)start, (void *)p->buf, func, line); - return; - } -#endif - - p->pkt[idx].offset = start - p->buf; - p->pkt[idx].len = len; + p->pkt[idx].iov_base = (void *)start; + p->pkt[idx].iov_len = len; p->count++; } @@ -104,28 +116,23 @@ void *packet_get_do(const struct pool *p, size_t idx, size_t offset, return NULL; } - if (p->pkt[idx].offset + len + offset > p->buf_size) { + if (len + offset > p->pkt[idx].iov_len) { if (func) { - trace("packet offset plus length %zu from size %zu, " - "%s:%i", p->pkt[idx].offset + len + offset, - p->buf_size, func, line); + trace("data length %zu, offset %zu from length %zu, " + "%s:%i", len, offset, p->pkt[idx].iov_len, + func, line); } return NULL; } - if (len + offset > p->pkt[idx].len) { - if (func) { - trace("data length %zu, offset %zu from length %u, " - "%s:%i", len, offset, p->pkt[idx].len, - func, line); - } + if (packet_check_range(p, offset, len, p->pkt[idx].iov_base, + func, line)) return NULL; - } if (left) - *left = p->pkt[idx].len - offset - len; + *left = p->pkt[idx].iov_len - offset - len; - return p->buf + p->pkt[idx].offset + offset; + return (char *)p->pkt[idx].iov_base + offset; } /** diff --git a/packet.h b/packet.h index a784b07bbed5..8377dcf678bb 100644 --- a/packet.h +++ b/packet.h @@ -6,16 +6,6 @@ #ifndef PACKET_H #define PACKET_H -/** - * struct desc - Generic offset-based descriptor within buffer - * @offset: Offset of descriptor relative to buffer start, 32-bit limit - * @len: Length of descriptor, host order, 16-bit limit - */ -struct desc { - uint32_t offset; - uint16_t len; -}; - /** * struct pool - Generic pool of packets stored 
in a buffer * @buf: Buffer storing packet descriptors @@ -29,7 +19,7 @@ struct pool { size_t buf_size; size_t size; size_t count; - struct desc pkt[1]; + struct iovec pkt[1]; }; void packet_add_do(struct pool *p, size_t len, const char *start, @@ -54,7 +44,7 @@ struct _name ## _t { \ size_t buf_size; \ size_t size; \ size_t count; \ - struct desc pkt[_size]; \ + struct iovec pkt[_size]; \ } #define PACKET_POOL_INIT_NOCAST(_size, _buf, _buf_size) \ -- 2.45.2
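For context, a minimal usage sketch of the pool API this patch touches
(not part of the patch; the pool is assumed to have been set up elsewhere,
and the prototype of packet_get_do() is taken from the definition quoted
above). Callers keep retrieving data the same way; only the descriptor
behind p->pkt[] changes from an offset/length pair to a base-pointer/length
struct iovec:

static void example_read(const struct pool *p)
{
	size_t left;
	char *data = packet_get_do(p, 0 /* idx */, 0 /* offset */,
				   64 /* len */, &left,
				   __func__, __LINE__);

	if (data) {
		/* 64 bytes are valid at "data"; "left" bytes of this
		 * packet remain after them */
	}
}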
On Fri, Jun 21, 2024 at 04:56:36PM +0200, Laurent Vivier wrote: Needs a commit message.Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- packet.c | 75 +++++++++++++++++++++++++++++++------------------------- packet.h | 14 ++--------- 2 files changed, 43 insertions(+), 46 deletions(-) diff --git a/packet.c b/packet.c index ccfc84607709..af2a539a1794 100644 --- a/packet.c +++ b/packet.c @@ -22,6 +22,36 @@ #include "util.h" #include "log.h"Function comment, please.+static int packet_check_range(const struct pool *p, size_t offset, size_t len, + const char *start, const char *func, int line) +{ + if (start < p->buf) { + if (func) { + trace("add packet start %p before buffer start %p, " + "%s:%i", (void *)start, (void *)p->buf, func, line); + } + return -1;Pre-existing, but I wonder if these should be assert()s. Are there any cases where we'd hit this path that don't indicate a bug in the caller?+ } + + if (start + len + offset > p->buf + p->buf_size) {Also pre-existing, but I wonder if we should check for overflow of (Start + len + offset).+ if (func) { + trace("packet offset plus length %lu from size %lu, " + "%s:%i", start - p->buf + len + offset, + p->buf_size, func, line); + } + return -1; + } + +#if UINTPTR_MAX == UINT64_MAX + if ((uintptr_t)start - (uintptr_t)p->buf > UINT32_MAX) {I don't think this check is relevant any more if we're going to iovecs - this was just because the offset in struct desc was only 32-bit.+ trace("add packet start %p, buffer start %p, %s:%i", + (void *)start, (void *)p->buf, func, line); + return -1; + } +#endif + + return 0; +} /** * packet_add_do() - Add data as packet descriptor to given pool * @p: Existing pool @@ -41,34 +71,16 @@ void packet_add_do(struct pool *p, size_t len, const char *start, return; } - if (start < p->buf) { - trace("add packet start %p before buffer start %p, %s:%i", - (void *)start, (void *)p->buf, func, line); + if (packet_check_range(p, 0, len, start, func, line)) return; - } - - if (start + len > p->buf + p->buf_size) { - trace("add packet start %p, length: %zu, buffer end %p, %s:%i", - (void *)start, len, (void *)(p->buf + p->buf_size), - func, line); - return; - } if (len > UINT16_MAX) { trace("add packet length %zu, %s:%i", len, func, line); return; } -#if UINTPTR_MAX == UINT64_MAX - if ((uintptr_t)start - (uintptr_t)p->buf > UINT32_MAX) { - trace("add packet start %p, buffer start %p, %s:%i", - (void *)start, (void *)p->buf, func, line); - return; - } -#endif - - p->pkt[idx].offset = start - p->buf; - p->pkt[idx].len = len; + p->pkt[idx].iov_base = (void *)start; + p->pkt[idx].iov_len = len; p->count++; } @@ -104,28 +116,23 @@ void *packet_get_do(const struct pool *p, size_t idx, size_t offset, return NULL; } - if (p->pkt[idx].offset + len + offset > p->buf_size) { + if (len + offset > p->pkt[idx].iov_len) { if (func) { - trace("packet offset plus length %zu from size %zu, " - "%s:%i", p->pkt[idx].offset + len + offset, - p->buf_size, func, line); + trace("data length %zu, offset %zu from length %zu, " + "%s:%i", len, offset, p->pkt[idx].iov_len, + func, line);I'm not sure either the old or new message is particularly descriptive here :/} return NULL; } - if (len + offset > p->pkt[idx].len) { - if (func) { - trace("data length %zu, offset %zu from length %u, " - "%s:%i", len, offset, p->pkt[idx].len, - func, line); - } + if (packet_check_range(p, offset, len, p->pkt[idx].iov_base, + func, line))Ah.. right.. in this case we certainly don't want ASSERT()s in packet_check_range(). 
Still wonder if that would make more sense for the packet add case, however. A couple of other points: * You've effectively switched the order of the two different tests here (one range checking against the entire buffer, one range checking against a single packet). Any reason for that? * Do we actually need the entire-buffer check here on the _get() side? Isn't it enough to ensure that packets lie within the buffer when they're inserted? Pre-existing, again, AFAICT.return NULL; - } if (left) - *left = p->pkt[idx].len - offset - len; + *left = p->pkt[idx].iov_len - offset - len; - return p->buf + p->pkt[idx].offset + offset; + return (char *)p->pkt[idx].iov_base + offset; } /** diff --git a/packet.h b/packet.h index a784b07bbed5..8377dcf678bb 100644 --- a/packet.h +++ b/packet.h @@ -6,16 +6,6 @@ #ifndef PACKET_H #define PACKET_H -/** - * struct desc - Generic offset-based descriptor within buffer - * @offset: Offset of descriptor relative to buffer start, 32-bit limit - * @len: Length of descriptor, host order, 16-bit limit - */ -struct desc { - uint32_t offset; - uint16_t len; -}; - /** * struct pool - Generic pool of packets stored in a buffer * @buf: Buffer storing packet descriptors @@ -29,7 +19,7 @@ struct pool { size_t buf_size; size_t size; size_t count; - struct desc pkt[1]; + struct iovec pkt[1]; }; void packet_add_do(struct pool *p, size_t len, const char *start, @@ -54,7 +44,7 @@ struct _name ## _t { \ size_t buf_size; \ size_t size; \ size_t count; \ - struct desc pkt[_size]; \ + struct iovec pkt[_size]; \ } #define PACKET_POOL_INIT_NOCAST(_size, _buf, _buf_size) \-- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson
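To make the overflow concern raised above concrete, one possible shape for
a range check that cannot wrap is sketched below (illustration only, not
what the patch does; the tracing and func/line parameters are left out for
brevity). Rather than forming start + len + offset, it compares against
the space left between start and the end of the buffer:

static int packet_check_range_safe(const struct pool *p, size_t offset,
				   size_t len, const char *start)
{
	size_t left_in_buf;

	if (start < p->buf || start > p->buf + p->buf_size)
		return -1;

	/* bytes available from "start" up to the end of the buffer */
	left_in_buf = p->buf + p->buf_size - start;

	/* same as start + offset + len <= buf + buf_size, without the
	 * possibility of wrapping */
	if (len > left_in_buf || offset > left_in_buf - len)
		return -1;

	return 0;
}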
On 24/06/2024 04:48, David Gibson wrote:On Fri, Jun 21, 2024 at 04:56:36PM +0200, Laurent Vivier wrote: Needs a commit message. > Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> > --- > packet.c | 75 +++++++++++++++++++++++++++++++------------------------- > packet.h | 14 ++--------- > 2 files changed, 43 insertions(+), 46 deletions(-) > > diff --git a/packet.c b/packet.c > index ccfc84607709..af2a539a1794 100644 > --- a/packet.c > +++ b/packet.c...Originally, I didn't want to change the existing behaviour. Only to move code, and to use a common function for packet_add_do() and packet_get_do(). But if you think it should be better I can update the code for that:+ } + + if (start + len + offset > p->buf + p->buf_size) {Also pre-existing, but I wonder if we should check for overflow of (Start + len + offset).I agree.+ if (func) { + trace("packet offset plus length %lu from size %lu, " + "%s:%i", start - p->buf + len + offset, + p->buf_size, func, line); + } + return -1; + } + +#if UINTPTR_MAX == UINT64_MAX + if ((uintptr_t)start - (uintptr_t)p->buf > UINT32_MAX) {I don't think this check is relevant any more if we're going to iovecs - this was just because the offset in struct desc was only 32-bit.I think the func and line parameters will help to understand the problem, and the others why the trace is triggered.+ trace("add packet start %p, buffer start %p, %s:%i", + (void *)start, (void *)p->buf, func, line); + return -1; + } +#endif + + return 0; +} /** * packet_add_do() - Add data as packet descriptor to given pool * @p: Existing pool @@ -41,34 +71,16 @@ void packet_add_do(struct pool *p, size_t len, const char *start, return; } - if (start < p->buf) { - trace("add packet start %p before buffer start %p, %s:%i", - (void *)start, (void *)p->buf, func, line); + if (packet_check_range(p, 0, len, start, func, line)) return; - } - - if (start + len > p->buf + p->buf_size) { - trace("add packet start %p, length: %zu, buffer end %p, %s:%i", - (void *)start, len, (void *)(p->buf + p->buf_size), - func, line); - return; - } if (len > UINT16_MAX) { trace("add packet length %zu, %s:%i", len, func, line); return; } -#if UINTPTR_MAX == UINT64_MAX - if ((uintptr_t)start - (uintptr_t)p->buf > UINT32_MAX) { - trace("add packet start %p, buffer start %p, %s:%i", - (void *)start, (void *)p->buf, func, line); - return; - } -#endif - - p->pkt[idx].offset = start - p->buf; - p->pkt[idx].len = len; + p->pkt[idx].iov_base = (void *)start; + p->pkt[idx].iov_len = len; p->count++; } @@ -104,28 +116,23 @@ void *packet_get_do(const struct pool *p, size_t idx, size_t offset, return NULL; } - if (p->pkt[idx].offset + len + offset > p->buf_size) { + if (len + offset > p->pkt[idx].iov_len) { if (func) { - trace("packet offset plus length %zu from size %zu, " - "%s:%i", p->pkt[idx].offset + len + offset, - p->buf_size, func, line); + trace("data length %zu, offset %zu from length %zu, " + "%s:%i", len, offset, p->pkt[idx].iov_len, + func, line);I'm not sure either the old or new message is particularly descriptive here :/The idea is to check the parameters are valid before checking the buffer is valid.} return NULL; } - if (len + offset > p->pkt[idx].len) { - if (func) { - trace("data length %zu, offset %zu from length %u, " - "%s:%i", len, offset, p->pkt[idx].len, - func, line); - } + if (packet_check_range(p, offset, len, p->pkt[idx].iov_base, + func, line))Ah.. right.. in this case we certainly don't want ASSERT()s in packet_check_range(). 
Still wonder if that would make more sense for the packet add case, however. A couple of other points: * You've effectively switched the order of the two different tests here (one range checking against the entire buffer, one range checking against a single packet). Any reason for that?* Do we actually need the entire-buffer check here on the _get() side? Isn't it enough to ensure that packets lie within the buffer when they're inserted? Pre-existing, again, AFAICT.I wanted to keep the idea introduced in bb708111833e ("treewide: Packet abstraction with mandatory boundary checks") and checking we don't read outside of the buffer. Thanks, Laurent
On Thu, Jul 04, 2024 at 05:52:09PM +0200, Laurent Vivier wrote:On 24/06/2024 04:48, David Gibson wrote:Well, I think we should be more careful here, but as you say I don't think it necessarily belongs as part of this series.On Fri, Jun 21, 2024 at 04:56:36PM +0200, Laurent Vivier wrote: Needs a commit message. > Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> > --- > packet.c | 75 +++++++++++++++++++++++++++++++------------------------- > packet.h | 14 ++--------- > 2 files changed, 43 insertions(+), 46 deletions(-) > > diff --git a/packet.c b/packet.c > index ccfc84607709..af2a539a1794 100644 > --- a/packet.c > +++ b/packet.c...Originally, I didn't want to change the existing behaviour. Only to move code, and to use a common function for packet_add_do() and packet_get_do(). But if you think it should be better I can update the code for that:+ } + + if (start + len + offset > p->buf + p->buf_size) {Also pre-existing, but I wonder if we should check for overflow of (Start + len + offset).Hmm, yeah, I guess so.I agree.+ if (func) { + trace("packet offset plus length %lu from size %lu, " + "%s:%i", start - p->buf + len + offset, + p->buf_size, func, line); + } + return -1; + } + +#if UINTPTR_MAX == UINT64_MAX + if ((uintptr_t)start - (uintptr_t)p->buf > UINT32_MAX) {I don't think this check is relevant any more if we're going to iovecs - this was just because the offset in struct desc was only 32-bit.I think the func and line parameters will help to understand the problem, and the others why the trace is triggered.+ trace("add packet start %p, buffer start %p, %s:%i", + (void *)start, (void *)p->buf, func, line); + return -1; + } +#endif + + return 0; +} /** * packet_add_do() - Add data as packet descriptor to given pool * @p: Existing pool @@ -41,34 +71,16 @@ void packet_add_do(struct pool *p, size_t len, const char *start, return; } - if (start < p->buf) { - trace("add packet start %p before buffer start %p, %s:%i", - (void *)start, (void *)p->buf, func, line); + if (packet_check_range(p, 0, len, start, func, line)) return; - } - - if (start + len > p->buf + p->buf_size) { - trace("add packet start %p, length: %zu, buffer end %p, %s:%i", - (void *)start, len, (void *)(p->buf + p->buf_size), - func, line); - return; - } if (len > UINT16_MAX) { trace("add packet length %zu, %s:%i", len, func, line); return; } -#if UINTPTR_MAX == UINT64_MAX - if ((uintptr_t)start - (uintptr_t)p->buf > UINT32_MAX) { - trace("add packet start %p, buffer start %p, %s:%i", - (void *)start, (void *)p->buf, func, line); - return; - } -#endif - - p->pkt[idx].offset = start - p->buf; - p->pkt[idx].len = len; + p->pkt[idx].iov_base = (void *)start; + p->pkt[idx].iov_len = len; p->count++; } @@ -104,28 +116,23 @@ void *packet_get_do(const struct pool *p, size_t idx, size_t offset, return NULL; } - if (p->pkt[idx].offset + len + offset > p->buf_size) { + if (len + offset > p->pkt[idx].iov_len) { if (func) { - trace("packet offset plus length %zu from size %zu, " - "%s:%i", p->pkt[idx].offset + len + offset, - p->buf_size, func, line); + trace("data length %zu, offset %zu from length %zu, " + "%s:%i", len, offset, p->pkt[idx].iov_len, + func, line);I'm not sure either the old or new message is particularly descriptive here :/Ok, makes sense.The idea is to check the parameters are valid before checking the buffer is valid.} return NULL; } - if (len + offset > p->pkt[idx].len) { - if (func) { - trace("data length %zu, offset %zu from length %u, " - "%s:%i", len, offset, p->pkt[idx].len, - func, line); - } + if 
(packet_check_range(p, offset, len, p->pkt[idx].iov_base, + func, line))Ah.. right.. in this case we certainly don't want ASSERT()s in packet_check_range(). Still wonder if that would make more sense for the packet add case, however. A couple of other points: * You've effectively switched the order of the two different tests here (one range checking against the entire buffer, one range checking against a single packet). Any reason for that?Hm, ok. -- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson* Do we actually need the entire-buffer check here on the _get() side? Isn't it enough to ensure that packets lie within the buffer when they're inserted? Pre-existing, again, AFAICT.I wanted to keep the idea introduced in bb708111833e ("treewide: Packet abstraction with mandatory boundary checks") and checking we don't read outside of the buffer.
Add virtio.c and virtio.h that define the functions needed to manage virtqueues. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- Makefile | 4 +- util.h | 11 ++ virtio.c | 446 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ virtio.h | 123 +++++++++++++++ 4 files changed, 582 insertions(+), 2 deletions(-) create mode 100644 virtio.c create mode 100644 virtio.h diff --git a/Makefile b/Makefile index 09fc461d087e..39613a7cf1f2 100644 --- a/Makefile +++ b/Makefile @@ -47,7 +47,7 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS) PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \ icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \ ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \ - tcp_buf.c tcp_splice.c udp.c util.c + tcp_buf.c tcp_splice.c udp.c util.c virtio.c QRAP_SRCS = qrap.c SRCS = $(PASST_SRCS) $(QRAP_SRCS) @@ -57,7 +57,7 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \ lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \ siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \ - udp.h util.h + udp.h util.h virtio.h HEADERS = $(PASST_HEADERS) seccomp.h C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 }; diff --git a/util.h b/util.h index eebb027be487..56c4e2e7b4fe 100644 --- a/util.h +++ b/util.h @@ -48,6 +48,9 @@ #define ROUND_DOWN(x, y) ((x) & ~((y) - 1)) #define ROUND_UP(x, y) (((x) + (y) - 1) & ~((y) - 1)) +#define ALIGN_DOWN(n, m) ((n) / (m) * (m)) +#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m)) + #define MAX_FROM_BITS(n) (((1U << (n)) - 1)) #define BIT(n) (1UL << (n)) @@ -116,6 +119,14 @@ #define htonl_constant(x) (__bswap_constant_32(x)) #endif +static inline void barrier(void) { __asm__ __volatile__("" ::: "memory"); } +#define smp_mb() do { barrier(); __atomic_thread_fence(__ATOMIC_SEQ_CST); } while (0) +#define smp_mb_release() do { barrier(); __atomic_thread_fence(__ATOMIC_RELEASE); } while (0) +#define smp_mb_acquire() do { barrier(); __atomic_thread_fence(__ATOMIC_ACQUIRE); } while (0) + +#define smp_wmb() smp_mb_release() +#define smp_rmb() smp_mb_acquire() + #define NS_FN_STACK_SIZE (RLIMIT_STACK_VAL * 1024 / 8) int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags, void *arg); diff --git a/virtio.c b/virtio.c new file mode 100644 index 000000000000..50ec8b5119ed --- /dev/null +++ b/virtio.c @@ -0,0 +1,446 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* some parts copied from QEMU subprojects/libvhost-user/libvhost-user.c */ + +#include <stddef.h> +#include <endian.h> +#include <string.h> +#include <errno.h> +#include <sys/eventfd.h> +#include <sys/socket.h> + +#include "util.h" +#include "virtio.h" + +#define VIRTQUEUE_MAX_SIZE 1024 + +/* Translate guest physical address to our virtual address. */ +static void *vu_gpa_to_va(VuDev *dev, uint64_t *plen, uint64_t guest_addr) +{ + unsigned int i; + + if (*plen == 0) + return NULL; + + /* Find matching memory region. 
*/ + for (i = 0; i < dev->nregions; i++) { + const VuDevRegion *r = &dev->regions[i]; + + if ((guest_addr >= r->gpa) && (guest_addr < (r->gpa + r->size))) { + if ((guest_addr + *plen) > (r->gpa + r->size)) + *plen = r->gpa + r->size - guest_addr; + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + return (void *)(guest_addr - r->gpa + r->mmap_addr + r->mmap_offset); + } + } + + return NULL; +} + +static inline uint16_t vring_avail_flags(const VuVirtq *vq) +{ + return le16toh(vq->vring.avail->flags); +} + +static inline uint16_t vring_avail_idx(VuVirtq *vq) +{ + vq->shadow_avail_idx = le16toh(vq->vring.avail->idx); + + return vq->shadow_avail_idx; +} + +static inline uint16_t vring_avail_ring(const VuVirtq *vq, int i) +{ + return le16toh(vq->vring.avail->ring[i]); +} + +static inline uint16_t vring_get_used_event(const VuVirtq *vq) +{ + return vring_avail_ring(vq, vq->vring.num); +} + +static bool virtqueue_get_head(VuDev *dev, const VuVirtq *vq, + unsigned int idx, unsigned int *head) +{ + /* Grab the next descriptor number they're advertising, and increment + * the index we've seen. + */ + *head = vring_avail_ring(vq, idx % vq->vring.num); + + /* If their number is silly, that's a fatal mistake. */ + if (*head >= vq->vring.num) { + vu_panic(dev, "Guest says index %u is available", *head); + return false; + } + + return true; +} + +static int +virtqueue_read_indirect_desc(VuDev *dev, struct vring_desc *desc, + uint64_t addr, size_t len) +{ + uint64_t read_len; + + if (len > (VIRTQUEUE_MAX_SIZE * sizeof(struct vring_desc))) + return -1; + + if (len == 0) + return -1; + + while (len) { + const struct vring_desc *ori_desc; + + read_len = len; + ori_desc = vu_gpa_to_va(dev, &read_len, addr); + if (!ori_desc) + return -1; + + memcpy(desc, ori_desc, read_len); + len -= read_len; + addr += read_len; + desc += read_len; + } + + return 0; +} + +enum { + VIRTQUEUE_READ_DESC_ERROR = -1, + VIRTQUEUE_READ_DESC_DONE = 0, /* end of chain */ + VIRTQUEUE_READ_DESC_MORE = 1, /* more buffers in chain */ +}; + +static int +virtqueue_read_next_desc(VuDev *dev, const struct vring_desc *desc, + int i, unsigned int max, unsigned int *next) +{ + /* If this descriptor says it doesn't chain, we're done. */ + if (!(le16toh(desc[i].flags) & VRING_DESC_F_NEXT)) + return VIRTQUEUE_READ_DESC_DONE; + + /* Check they're not leading us off end of descriptors. */ + *next = le16toh(desc[i].next); + /* Make sure compiler knows to grab that: we don't want it changing! */ + smp_wmb(); + + if (*next >= max) { + vu_panic(dev, "Desc next is %u", *next); + return VIRTQUEUE_READ_DESC_ERROR; + } + + return VIRTQUEUE_READ_DESC_MORE; +} + +bool vu_queue_empty(const VuDev *dev, VuVirtq *vq) +{ + if (dev->broken || + !vq->vring.avail) { + return true; + } + + if (vq->shadow_avail_idx != vq->last_avail_idx) + return false; + + return vring_avail_idx(vq) == vq->last_avail_idx; +} + +static bool vring_notify(const VuDev *dev, VuVirtq *vq) +{ + uint16_t old, new; + bool v; + + /* We need to expose used array entries before checking used event. 
*/ + smp_mb(); + + /* Always notify when queue is empty (when feature acknowledge) */ + if (vu_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) && + !vq->inuse && vu_queue_empty(dev, vq)) { + return true; + } + + if (!vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) + return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT); + + v = vq->signalled_used_valid; + vq->signalled_used_valid = true; + old = vq->signalled_used; + new = vq->signalled_used = vq->used_idx; + return !v || vring_need_event(vring_get_used_event(vq), new, old); +} + +/* cppcheck-suppress unusedFunction */ +void vu_queue_notify(VuDev *dev, VuVirtq *vq) +{ + if (dev->broken || !vq->vring.avail) + return; + + if (!vring_notify(dev, vq)) { + debug("skipped notify..."); + return; + } + + if (eventfd_write(vq->call_fd, 1) < 0) + vu_panic(dev, "Error writing eventfd: %s", strerror(errno)); +} + +static inline void vring_set_avail_event(VuVirtq *vq, uint16_t val) +{ + uint16_t val_le = htole16(val); + + if (!vq->notification) + return; + + memcpy(&vq->vring.used->ring[vq->vring.num], &val_le, sizeof(uint16_t)); +} + +static bool virtqueue_map_desc(VuDev *dev, + unsigned int *p_num_sg, struct iovec *iov, + unsigned int max_num_sg, + uint64_t pa, size_t sz) +{ + unsigned int num_sg = *p_num_sg; + + ASSERT(num_sg <= max_num_sg); + + if (!sz) { + vu_panic(dev, "virtio: zero sized buffers are not allowed"); + return false; + } + + while (sz) { + uint64_t len = sz; + + if (num_sg == max_num_sg) { + vu_panic(dev, "virtio: too many descriptors in indirect table"); + return false; + } + + iov[num_sg].iov_base = vu_gpa_to_va(dev, &len, pa); + if (iov[num_sg].iov_base == NULL) { + vu_panic(dev, "virtio: invalid address for buffers"); + return false; + } + iov[num_sg].iov_len = len; + num_sg++; + sz -= len; + pa += len; + } + + *p_num_sg = num_sg; + return true; +} + +static int +vu_queue_map_desc(VuDev *dev, VuVirtq *vq, unsigned int idx, + VuVirtqElement *elem) +{ + const struct vring_desc *desc = vq->vring.desc; + struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE]; + unsigned int out_num = 0, in_num = 0; + unsigned int max = vq->vring.num; + unsigned int i = idx; + uint64_t read_len; + int rc; + + if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) { + unsigned int desc_len; + uint64_t desc_addr; + + if (le32toh(desc[i].len) % sizeof(struct vring_desc)) { + vu_panic(dev, "Invalid size for indirect buffer table"); + return -1; + } + + /* loop over the indirect descriptor table */ + desc_addr = le64toh(desc[i].addr); + desc_len = le32toh(desc[i].len); + max = desc_len / sizeof(struct vring_desc); + read_len = desc_len; + desc = vu_gpa_to_va(dev, &read_len, desc_addr); + if (desc && read_len != desc_len) { + /* Failed to use zero copy */ + desc = NULL; + if (!virtqueue_read_indirect_desc(dev, desc_buf, desc_addr, desc_len)) + desc = desc_buf; + } + if (!desc) { + vu_panic(dev, "Invalid indirect buffer table"); + return -1; + } + i = 0; + } + + /* Collect all the descriptors */ + do { + if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) { + if (!virtqueue_map_desc(dev, &in_num, elem->in_sg, + elem->in_num, + le64toh(desc[i].addr), + le32toh(desc[i].len))) { + return -1; + } + } else { + if (in_num) { + vu_panic(dev, "Incorrect order for descriptors"); + return -1; + } + if (!virtqueue_map_desc(dev, &out_num, elem->out_sg, + elem->out_num, + le64toh(desc[i].addr), + le32toh(desc[i].len))) { + return -1; + } + } + + /* If we've got too many, that implies a descriptor loop. 
*/ + if ((in_num + out_num) > max) { + vu_panic(dev, "Looped descriptor"); + return -1; + } + rc = virtqueue_read_next_desc(dev, desc, i, max, &i); + } while (rc == VIRTQUEUE_READ_DESC_MORE); + + if (rc == VIRTQUEUE_READ_DESC_ERROR) { + vu_panic(dev, "read descriptor error"); + return -1; + } + + elem->index = idx; + elem->in_num = in_num; + elem->out_num = out_num; + + return 0; +} + +/* cppcheck-suppress unusedFunction */ +int vu_queue_pop(VuDev *dev, VuVirtq *vq, VuVirtqElement *elem) +{ + unsigned int head; + int ret; + + if (dev->broken || !vq->vring.avail) + return -1; + + if (vu_queue_empty(dev, vq)) + return -1; + + /* + * Needed after virtio_queue_empty(), see comment in + * virtqueue_num_heads(). + */ + smp_rmb(); + + if (vq->inuse >= vq->vring.num) { + vu_panic(dev, "Virtqueue size exceeded"); + return -1; + } + + if (!virtqueue_get_head(dev, vq, vq->last_avail_idx++, &head)) + return -1; + + if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) + vring_set_avail_event(vq, vq->last_avail_idx); + + ret = vu_queue_map_desc(dev, vq, head, elem); + + if (ret < 0) + return ret; + + vq->inuse++; + + return 0; +} + +void vu_queue_detach_element(VuDev *dev, VuVirtq *vq, + unsigned int index, size_t len) +{ + (void)dev; + (void)index; + (void)len; + + vq->inuse--; + /* unmap, when DMA support is added */ +} + +/* cppcheck-suppress unusedFunction */ +void vu_queue_unpop(VuDev *dev, VuVirtq *vq, unsigned int index, size_t len) +{ + vq->last_avail_idx--; + vu_queue_detach_element(dev, vq, index, len); +} + +/* cppcheck-suppress unusedFunction */ +bool vu_queue_rewind(VuDev *dev, VuVirtq *vq, unsigned int num) +{ + (void)dev; + if (num > vq->inuse) + return false; + + vq->last_avail_idx -= num; + vq->inuse -= num; + return true; +} + +static inline void vring_used_write(VuVirtq *vq, + const struct vring_used_elem *uelem, int i) +{ + struct vring_used *used = vq->vring.used; + + used->ring[i] = *uelem; +} + +void vu_queue_fill_by_index(const VuDev *dev, VuVirtq *vq, unsigned int index, + unsigned int len, unsigned int idx) +{ + struct vring_used_elem uelem; + + if (dev->broken || !vq->vring.avail) + return; + + idx = (idx + vq->used_idx) % vq->vring.num; + + uelem.id = htole32(index); + uelem.len = htole32(len); + vring_used_write(vq, &uelem, idx); +} + +void vu_queue_fill(const VuDev *dev, VuVirtq *vq, const VuVirtqElement *elem, + unsigned int len, unsigned int idx) +{ + vu_queue_fill_by_index(dev, vq, elem->index, len, idx); +} + +static inline void vring_used_idx_set(VuVirtq *vq, uint16_t val) +{ + vq->vring.used->idx = htole16(val); + + vq->used_idx = val; +} + +void vu_queue_flush(const VuDev *dev, VuVirtq *vq, unsigned int count) +{ + uint16_t old, new; + + if (dev->broken || !vq->vring.avail) + return; + + /* Make sure buffer is written before we update index. 
*/ + smp_wmb(); + + old = vq->used_idx; + new = old + count; + vring_used_idx_set(vq, new); + vq->inuse -= count; + if ((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old)) + vq->signalled_used_valid = false; +} + +/* cppcheck-suppress unusedFunction */ +void vu_queue_push(const VuDev *dev, VuVirtq *vq, + const VuVirtqElement *elem, unsigned int len) +{ + vu_queue_fill(dev, vq, elem, len, 0); + vu_queue_flush(dev, vq, 1); +} + diff --git a/virtio.h b/virtio.h new file mode 100644 index 000000000000..61398bb432bc --- /dev/null +++ b/virtio.h @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +// +/* come parts copied from QEMU subprojects/libvhost-user/libvhost-user.h */ + +#ifndef VIRTIO_H +#define VIRTIO_H + +#include <stdbool.h> +#include <linux/vhost_types.h> + +#define VIRTQUEUE_MAX_SIZE 1024 + +#define vu_panic(vdev, ...) \ + do { \ + (vdev)->broken = true; \ + err( __VA_ARGS__ ); \ + } while (0) + +typedef struct VuRing { + unsigned int num; + struct vring_desc *desc; + struct vring_avail *avail; + struct vring_used *used; + uint64_t log_guest_addr; + uint32_t flags; +} VuRing; + +typedef struct VuVirtq { + VuRing vring; + + /* Next head to pop */ + uint16_t last_avail_idx; + + /* Last avail_idx read from VQ. */ + uint16_t shadow_avail_idx; + + uint16_t used_idx; + + /* Last used index value we have signalled on */ + uint16_t signalled_used; + + /* Last used index value we have signalled on */ + bool signalled_used_valid; + + bool notification; + + unsigned int inuse; + + int call_fd; + int kick_fd; + int err_fd; + unsigned int enable; + bool started; + + /* Guest addresses of our ring */ + struct vhost_vring_addr vra; +} VuVirtq; + +typedef struct VuDevRegion { + uint64_t gpa; + uint64_t size; + uint64_t qva; + uint64_t mmap_offset; + uint64_t mmap_addr; +} VuDevRegion; + +#define VHOST_USER_MAX_QUEUES 2 + +/* + * Set a reasonable maximum number of ram slots, which will be supported by + * any architecture. 
+ */ +#define VHOST_USER_MAX_RAM_SLOTS 32 + +typedef struct VuDev { + uint32_t nregions; + VuDevRegion regions[VHOST_USER_MAX_RAM_SLOTS]; + VuVirtq vq[VHOST_USER_MAX_QUEUES]; + uint64_t features; + uint64_t protocol_features; + bool broken; + int hdrlen; +} VuDev; + +typedef struct VuVirtqElement { + unsigned int index; + unsigned int out_num; + unsigned int in_num; + struct iovec *in_sg; + struct iovec *out_sg; +} VuVirtqElement; + +static inline bool has_feature(uint64_t features, unsigned int fbit) +{ + return !!(features & (1ULL << fbit)); +} + +static inline bool vu_has_feature(const VuDev *vdev, unsigned int fbit) +{ + return has_feature(vdev->features, fbit); +} + +/* cppcheck-suppress unusedFunction */ +static inline bool vu_has_protocol_feature(const VuDev *vdev, unsigned int fbit) +{ + return has_feature(vdev->protocol_features, fbit); +} + +bool vu_queue_empty(const VuDev *dev, VuVirtq *vq); +void vu_queue_notify(VuDev *dev, VuVirtq *vq); +int vu_queue_pop(VuDev *dev, VuVirtq *vq, VuVirtqElement *elem); +void vu_queue_detach_element(VuDev *dev, VuVirtq *vq, unsigned int index, size_t len); +void vu_queue_unpop(VuDev *dev, VuVirtq *vq, unsigned int index, size_t len); +bool vu_queue_rewind(VuDev *dev, VuVirtq *vq, unsigned int num); + +void vu_queue_fill_by_index(const VuDev *dev, VuVirtq *vq, unsigned int index, + unsigned int len, unsigned int idx); +void vu_queue_fill(const VuDev *dev, VuVirtq *vq, const VuVirtqElement *elem, + unsigned int len, unsigned int idx); +void vu_queue_flush(const VuDev *dev, VuVirtq *vq, unsigned int count); +void vu_queue_push(const VuDev *dev, VuVirtq *vq, const VuVirtqElement *elem, + unsigned int len); +#endif /* VIRTIO_H */ -- 2.45.2
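As a reading aid for the API introduced here, a rough sketch of how a
caller might drain a queue and hand the buffers back to the guest follows
(illustration only: process_element() is hypothetical, everything else is
declared in virtio.h above, and in_num/out_num are used as array
capacities on input, as vu_queue_map_desc() expects):

static void drain_queue(VuDev *vdev, VuVirtq *vq)
{
	struct iovec in_sg[VIRTQUEUE_MAX_SIZE], out_sg[VIRTQUEUE_MAX_SIZE];

	while (!vu_queue_empty(vdev, vq)) {
		VuVirtqElement elem = {
			.in_num  = VIRTQUEUE_MAX_SIZE,	/* capacity of in_sg */
			.out_num = VIRTQUEUE_MAX_SIZE,	/* capacity of out_sg */
			.in_sg   = in_sg,
			.out_sg  = out_sg,
		};
		unsigned int len;

		if (vu_queue_pop(vdev, vq, &elem) < 0)
			break;

		/* elem.in_sg[]/elem.out_sg[] now map the guest buffers of
		 * this descriptor chain */
		len = process_element(&elem);		/* hypothetical */

		/* mark the chain as used: fill one entry and flush it */
		vu_queue_push(vdev, vq, &elem, len);
	}

	vu_queue_notify(vdev, vq);	/* eventfd to the guest, if needed */
}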
On Fri, Jun 21, 2024 at 04:56:37PM +0200, Laurent Vivier wrote:Add virtio.c and virtio.h that define the functions needed to manage virtqueues. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- Makefile | 4 +- util.h | 11 ++ virtio.c | 446 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ virtio.h | 123 +++++++++++++++ 4 files changed, 582 insertions(+), 2 deletions(-) create mode 100644 virtio.c create mode 100644 virtio.h diff --git a/Makefile b/Makefile index 09fc461d087e..39613a7cf1f2 100644 --- a/Makefile +++ b/Makefile @@ -47,7 +47,7 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS) PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \ icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \ ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \ - tcp_buf.c tcp_splice.c udp.c util.c + tcp_buf.c tcp_splice.c udp.c util.c virtio.c QRAP_SRCS = qrap.c SRCS = $(PASST_SRCS) $(QRAP_SRCS) @@ -57,7 +57,7 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \ lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \ siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \ - udp.h util.h + udp.h util.h virtio.h HEADERS = $(PASST_HEADERS) seccomp.h C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 }; diff --git a/util.h b/util.h index eebb027be487..56c4e2e7b4fe 100644 --- a/util.h +++ b/util.h @@ -48,6 +48,9 @@ #define ROUND_DOWN(x, y) ((x) & ~((y) - 1)) #define ROUND_UP(x, y) (((x) + (y) - 1) & ~((y) - 1)) +#define ALIGN_DOWN(n, m) ((n) / (m) * (m)) +#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m)) + #define MAX_FROM_BITS(n) (((1U << (n)) - 1)) #define BIT(n) (1UL << (n)) @@ -116,6 +119,14 @@ #define htonl_constant(x) (__bswap_constant_32(x)) #endif +static inline void barrier(void) { __asm__ __volatile__("" ::: "memory"); } +#define smp_mb() do { barrier(); __atomic_thread_fence(__ATOMIC_SEQ_CST); } while (0) +#define smp_mb_release() do { barrier(); __atomic_thread_fence(__ATOMIC_RELEASE); } while (0) +#define smp_mb_acquire() do { barrier(); __atomic_thread_fence(__ATOMIC_ACQUIRE); } while (0) + +#define smp_wmb() smp_mb_release() +#define smp_rmb() smp_mb_acquire() + #define NS_FN_STACK_SIZE (RLIMIT_STACK_VAL * 1024 / 8) int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags, void *arg); diff --git a/virtio.c b/virtio.c new file mode 100644 index 000000000000..50ec8b5119ed --- /dev/null +++ b/virtio.c @@ -0,0 +1,446 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +Needs an actual "Copyright" invocation as well as the SPDX stuff. Which, yes, is a bit fiddly given that it's largely taken from qemu.+/* some parts copied from QEMU subprojects/libvhost-user/libvhost-user.c */So, there are obvious stylistic differences between this and the rest of the passt code for that reason. As I think I said on an earlier draft, I think we need to go fully one way or the other: either a) rewrite this entirely in passt style or b) change the whole thing so little that it's trivial to pull in new versions from qemu. This seems to be somewhere in the middle.+ +#include <stddef.h> +#include <endian.h> +#include <string.h> +#include <errno.h> +#include <sys/eventfd.h> +#include <sys/socket.h> + +#include "util.h" +#include "virtio.h" + +#define VIRTQUEUE_MAX_SIZE 1024 + +/* Translate guest physical address to our virtual address. 
*/ +static void *vu_gpa_to_va(VuDev *dev, uint64_t *plen, uint64_t guest_addr) +{ + unsigned int i; + + if (*plen == 0) + return NULL; + + /* Find matching memory region. */ + for (i = 0; i < dev->nregions; i++) { + const VuDevRegion *r = &dev->regions[i]; + + if ((guest_addr >= r->gpa) && (guest_addr < (r->gpa + r->size))) { + if ((guest_addr + *plen) > (r->gpa + r->size)) + *plen = r->gpa + r->size - guest_addr; + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + return (void *)(guest_addr - r->gpa + r->mmap_addr + r->mmap_offset); + } + } + + return NULL; +} + +static inline uint16_t vring_avail_flags(const VuVirtq *vq) +{ + return le16toh(vq->vring.avail->flags); +} + +static inline uint16_t vring_avail_idx(VuVirtq *vq) +{ + vq->shadow_avail_idx = le16toh(vq->vring.avail->idx); + + return vq->shadow_avail_idx; +} + +static inline uint16_t vring_avail_ring(const VuVirtq *vq, int i) +{ + return le16toh(vq->vring.avail->ring[i]); +} + +static inline uint16_t vring_get_used_event(const VuVirtq *vq) +{ + return vring_avail_ring(vq, vq->vring.num); +} + +static bool virtqueue_get_head(VuDev *dev, const VuVirtq *vq, + unsigned int idx, unsigned int *head) +{ + /* Grab the next descriptor number they're advertising, and increment + * the index we've seen. + */ + *head = vring_avail_ring(vq, idx % vq->vring.num); + + /* If their number is silly, that's a fatal mistake. */ + if (*head >= vq->vring.num) { + vu_panic(dev, "Guest says index %u is available", *head); + return false; + } + + return true; +} + +static int +virtqueue_read_indirect_desc(VuDev *dev, struct vring_desc *desc, + uint64_t addr, size_t len) +{ + uint64_t read_len; + + if (len > (VIRTQUEUE_MAX_SIZE * sizeof(struct vring_desc))) + return -1; + + if (len == 0) + return -1; + + while (len) { + const struct vring_desc *ori_desc; + + read_len = len; + ori_desc = vu_gpa_to_va(dev, &read_len, addr); + if (!ori_desc) + return -1; + + memcpy(desc, ori_desc, read_len); + len -= read_len; + addr += read_len; + desc += read_len; + } + + return 0; +} + +enum { + VIRTQUEUE_READ_DESC_ERROR = -1, + VIRTQUEUE_READ_DESC_DONE = 0, /* end of chain */ + VIRTQUEUE_READ_DESC_MORE = 1, /* more buffers in chain */ +}; + +static int +virtqueue_read_next_desc(VuDev *dev, const struct vring_desc *desc, + int i, unsigned int max, unsigned int *next) +{ + /* If this descriptor says it doesn't chain, we're done. */ + if (!(le16toh(desc[i].flags) & VRING_DESC_F_NEXT)) + return VIRTQUEUE_READ_DESC_DONE; + + /* Check they're not leading us off end of descriptors. */ + *next = le16toh(desc[i].next); + /* Make sure compiler knows to grab that: we don't want it changing! */ + smp_wmb(); + + if (*next >= max) { + vu_panic(dev, "Desc next is %u", *next); + return VIRTQUEUE_READ_DESC_ERROR; + } + + return VIRTQUEUE_READ_DESC_MORE; +} + +bool vu_queue_empty(const VuDev *dev, VuVirtq *vq) +{ + if (dev->broken || + !vq->vring.avail) { + return true; + } + + if (vq->shadow_avail_idx != vq->last_avail_idx) + return false; + + return vring_avail_idx(vq) == vq->last_avail_idx; +} + +static bool vring_notify(const VuDev *dev, VuVirtq *vq) +{ + uint16_t old, new; + bool v; + + /* We need to expose used array entries before checking used event. 
*/ + smp_mb(); + + /* Always notify when queue is empty (when feature acknowledge) */ + if (vu_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) && + !vq->inuse && vu_queue_empty(dev, vq)) { + return true; + } + + if (!vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) + return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT); + + v = vq->signalled_used_valid; + vq->signalled_used_valid = true; + old = vq->signalled_used; + new = vq->signalled_used = vq->used_idx; + return !v || vring_need_event(vring_get_used_event(vq), new, old); +} + +/* cppcheck-suppress unusedFunction */ +void vu_queue_notify(VuDev *dev, VuVirtq *vq) +{ + if (dev->broken || !vq->vring.avail) + return; + + if (!vring_notify(dev, vq)) { + debug("skipped notify..."); + return; + } + + if (eventfd_write(vq->call_fd, 1) < 0) + vu_panic(dev, "Error writing eventfd: %s", strerror(errno)); +} + +static inline void vring_set_avail_event(VuVirtq *vq, uint16_t val) +{ + uint16_t val_le = htole16(val); + + if (!vq->notification) + return; + + memcpy(&vq->vring.used->ring[vq->vring.num], &val_le, sizeof(uint16_t)); +} + +static bool virtqueue_map_desc(VuDev *dev, + unsigned int *p_num_sg, struct iovec *iov, + unsigned int max_num_sg, + uint64_t pa, size_t sz) +{ + unsigned int num_sg = *p_num_sg; + + ASSERT(num_sg <= max_num_sg); + + if (!sz) { + vu_panic(dev, "virtio: zero sized buffers are not allowed"); + return false; + } + + while (sz) { + uint64_t len = sz; + + if (num_sg == max_num_sg) { + vu_panic(dev, "virtio: too many descriptors in indirect table"); + return false; + } + + iov[num_sg].iov_base = vu_gpa_to_va(dev, &len, pa); + if (iov[num_sg].iov_base == NULL) { + vu_panic(dev, "virtio: invalid address for buffers"); + return false; + } + iov[num_sg].iov_len = len; + num_sg++; + sz -= len; + pa += len; + } + + *p_num_sg = num_sg; + return true; +} + +static int +vu_queue_map_desc(VuDev *dev, VuVirtq *vq, unsigned int idx, + VuVirtqElement *elem) +{ + const struct vring_desc *desc = vq->vring.desc; + struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE]; + unsigned int out_num = 0, in_num = 0; + unsigned int max = vq->vring.num; + unsigned int i = idx; + uint64_t read_len; + int rc; + + if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) { + unsigned int desc_len; + uint64_t desc_addr; + + if (le32toh(desc[i].len) % sizeof(struct vring_desc)) { + vu_panic(dev, "Invalid size for indirect buffer table"); + return -1; + } + + /* loop over the indirect descriptor table */ + desc_addr = le64toh(desc[i].addr); + desc_len = le32toh(desc[i].len); + max = desc_len / sizeof(struct vring_desc); + read_len = desc_len; + desc = vu_gpa_to_va(dev, &read_len, desc_addr); + if (desc && read_len != desc_len) { + /* Failed to use zero copy */ + desc = NULL; + if (!virtqueue_read_indirect_desc(dev, desc_buf, desc_addr, desc_len)) + desc = desc_buf; + } + if (!desc) { + vu_panic(dev, "Invalid indirect buffer table"); + return -1; + } + i = 0; + } + + /* Collect all the descriptors */ + do { + if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) { + if (!virtqueue_map_desc(dev, &in_num, elem->in_sg, + elem->in_num, + le64toh(desc[i].addr), + le32toh(desc[i].len))) { + return -1; + } + } else { + if (in_num) { + vu_panic(dev, "Incorrect order for descriptors"); + return -1; + } + if (!virtqueue_map_desc(dev, &out_num, elem->out_sg, + elem->out_num, + le64toh(desc[i].addr), + le32toh(desc[i].len))) { + return -1; + } + } + + /* If we've got too many, that implies a descriptor loop. 
*/ + if ((in_num + out_num) > max) { + vu_panic(dev, "Looped descriptor"); + return -1; + } + rc = virtqueue_read_next_desc(dev, desc, i, max, &i); + } while (rc == VIRTQUEUE_READ_DESC_MORE); + + if (rc == VIRTQUEUE_READ_DESC_ERROR) { + vu_panic(dev, "read descriptor error"); + return -1; + } + + elem->index = idx; + elem->in_num = in_num; + elem->out_num = out_num; + + return 0; +} + +/* cppcheck-suppress unusedFunction */ +int vu_queue_pop(VuDev *dev, VuVirtq *vq, VuVirtqElement *elem) +{ + unsigned int head; + int ret; + + if (dev->broken || !vq->vring.avail) + return -1; + + if (vu_queue_empty(dev, vq)) + return -1; + + /* + * Needed after virtio_queue_empty(), see comment in + * virtqueue_num_heads(). + */ + smp_rmb(); + + if (vq->inuse >= vq->vring.num) { + vu_panic(dev, "Virtqueue size exceeded"); + return -1; + } + + if (!virtqueue_get_head(dev, vq, vq->last_avail_idx++, &head)) + return -1; + + if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) + vring_set_avail_event(vq, vq->last_avail_idx); + + ret = vu_queue_map_desc(dev, vq, head, elem); + + if (ret < 0) + return ret; + + vq->inuse++; + + return 0; +} + +void vu_queue_detach_element(VuDev *dev, VuVirtq *vq, + unsigned int index, size_t len) +{ + (void)dev; + (void)index; + (void)len; + + vq->inuse--; + /* unmap, when DMA support is added */ +} + +/* cppcheck-suppress unusedFunction */ +void vu_queue_unpop(VuDev *dev, VuVirtq *vq, unsigned int index, size_t len) +{ + vq->last_avail_idx--; + vu_queue_detach_element(dev, vq, index, len); +} + +/* cppcheck-suppress unusedFunction */ +bool vu_queue_rewind(VuDev *dev, VuVirtq *vq, unsigned int num) +{ + (void)dev; + if (num > vq->inuse) + return false; + + vq->last_avail_idx -= num; + vq->inuse -= num; + return true; +} + +static inline void vring_used_write(VuVirtq *vq, + const struct vring_used_elem *uelem, int i) +{ + struct vring_used *used = vq->vring.used; + + used->ring[i] = *uelem; +} + +void vu_queue_fill_by_index(const VuDev *dev, VuVirtq *vq, unsigned int index, + unsigned int len, unsigned int idx) +{ + struct vring_used_elem uelem; + + if (dev->broken || !vq->vring.avail) + return; + + idx = (idx + vq->used_idx) % vq->vring.num; + + uelem.id = htole32(index); + uelem.len = htole32(len); + vring_used_write(vq, &uelem, idx); +} + +void vu_queue_fill(const VuDev *dev, VuVirtq *vq, const VuVirtqElement *elem, + unsigned int len, unsigned int idx) +{ + vu_queue_fill_by_index(dev, vq, elem->index, len, idx); +} + +static inline void vring_used_idx_set(VuVirtq *vq, uint16_t val) +{ + vq->vring.used->idx = htole16(val); + + vq->used_idx = val; +} + +void vu_queue_flush(const VuDev *dev, VuVirtq *vq, unsigned int count) +{ + uint16_t old, new; + + if (dev->broken || !vq->vring.avail) + return; + + /* Make sure buffer is written before we update index. 
*/ + smp_wmb(); + + old = vq->used_idx; + new = old + count; + vring_used_idx_set(vq, new); + vq->inuse -= count; + if ((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old)) + vq->signalled_used_valid = false; +} + +/* cppcheck-suppress unusedFunction */ +void vu_queue_push(const VuDev *dev, VuVirtq *vq, + const VuVirtqElement *elem, unsigned int len) +{ + vu_queue_fill(dev, vq, elem, len, 0); + vu_queue_flush(dev, vq, 1); +} + diff --git a/virtio.h b/virtio.h new file mode 100644 index 000000000000..61398bb432bc --- /dev/null +++ b/virtio.h @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +// +/* come parts copied from QEMU subprojects/libvhost-user/libvhost-user.h */ + +#ifndef VIRTIO_H +#define VIRTIO_H + +#include <stdbool.h> +#include <linux/vhost_types.h> + +#define VIRTQUEUE_MAX_SIZE 1024 + +#define vu_panic(vdev, ...) \ + do { \ + (vdev)->broken = true; \ + err( __VA_ARGS__ ); \Wouldn't it be simpler to just use die() in place of vu_panic(). This is trying to keep the program running even if the vu device is broken, but if our channel to the guest is broken, I don't think passt is really worth saving.+ } while (0) + +typedef struct VuRing { + unsigned int num; + struct vring_desc *desc; + struct vring_avail *avail; + struct vring_used *used; + uint64_t log_guest_addr; + uint32_t flags; +} VuRing; + +typedef struct VuVirtq { + VuRing vring; + + /* Next head to pop */ + uint16_t last_avail_idx; + + /* Last avail_idx read from VQ. */ + uint16_t shadow_avail_idx; + + uint16_t used_idx; + + /* Last used index value we have signalled on */ + uint16_t signalled_used; + + /* Last used index value we have signalled on */ + bool signalled_used_valid; + + bool notification; + + unsigned int inuse; + + int call_fd; + int kick_fd; + int err_fd; + unsigned int enable; + bool started; + + /* Guest addresses of our ring */ + struct vhost_vring_addr vra; +} VuVirtq; + +typedef struct VuDevRegion { + uint64_t gpa; + uint64_t size; + uint64_t qva; + uint64_t mmap_offset; + uint64_t mmap_addr; +} VuDevRegion; + +#define VHOST_USER_MAX_QUEUES 2 + +/* + * Set a reasonable maximum number of ram slots, which will be supported by + * any architecture. 
+ */ +#define VHOST_USER_MAX_RAM_SLOTS 32 + +typedef struct VuDev { + uint32_t nregions; + VuDevRegion regions[VHOST_USER_MAX_RAM_SLOTS]; + VuVirtq vq[VHOST_USER_MAX_QUEUES]; + uint64_t features; + uint64_t protocol_features; + bool broken; + int hdrlen; +} VuDev; + +typedef struct VuVirtqElement { + unsigned int index; + unsigned int out_num; + unsigned int in_num; + struct iovec *in_sg; + struct iovec *out_sg; +} VuVirtqElement; + +static inline bool has_feature(uint64_t features, unsigned int fbit) +{ + return !!(features & (1ULL << fbit)); +} + +static inline bool vu_has_feature(const VuDev *vdev, unsigned int fbit) +{ + return has_feature(vdev->features, fbit); +} + +/* cppcheck-suppress unusedFunction */ +static inline bool vu_has_protocol_feature(const VuDev *vdev, unsigned int fbit) +{ + return has_feature(vdev->protocol_features, fbit); +} + +bool vu_queue_empty(const VuDev *dev, VuVirtq *vq); +void vu_queue_notify(VuDev *dev, VuVirtq *vq); +int vu_queue_pop(VuDev *dev, VuVirtq *vq, VuVirtqElement *elem); +void vu_queue_detach_element(VuDev *dev, VuVirtq *vq, unsigned int index, size_t len); +void vu_queue_unpop(VuDev *dev, VuVirtq *vq, unsigned int index, size_t len); +bool vu_queue_rewind(VuDev *dev, VuVirtq *vq, unsigned int num); + +void vu_queue_fill_by_index(const VuDev *dev, VuVirtq *vq, unsigned int index, + unsigned int len, unsigned int idx); +void vu_queue_fill(const VuDev *dev, VuVirtq *vq, const VuVirtqElement *elem, + unsigned int len, unsigned int idx); +void vu_queue_flush(const VuDev *dev, VuVirtq *vq, unsigned int count); +void vu_queue_push(const VuDev *dev, VuVirtq *vq, const VuVirtqElement *elem, + unsigned int len); +#endif /* VIRTIO_H */-- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson
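For reference, the suggestion above would roughly amount to the sketch
below (assuming passt's existing die() helper, which logs the error and
exits); the trade-off is exactly the one raised here, i.e. giving up on
keeping the process alive with a broken device:

#define vu_panic(vdev, ...)						\
	do {								\
		(void)(vdev);	/* device no longer marked broken */	\
		die(__VA_ARGS__);					\
	} while (0)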
On 24/06/2024 04:56, David Gibson wrote:On Fri, Jun 21, 2024 at 04:56:37PM +0200, Laurent Vivier wrote: > Add virtio.c and virtio.h that define the functions needed > to manage virtqueues. > > Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> > --- > Makefile | 4 +- > util.h | 11 ++ > virtio.c | 446 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ > virtio.h | 123 +++++++++++++++ > 4 files changed, 582 insertions(+), 2 deletions(-) > create mode 100644 virtio.c > create mode 100644 virtio.h >...I'm updating thatdiff --git a/virtio.c b/virtio.c new file mode 100644 index 000000000000..50ec8b5119ed --- /dev/null +++ b/virtio.c @@ -0,0 +1,446 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +Needs an actual "Copyright" invocation as well as the SPDX stuff. Which, yes, is a bit fiddly given that it's largely taken from qemu.As the orignal code in QEMU don't change a lot I chose to update it to match passt coding style. I'm adding the function headers. What else is missing? ...+/* some parts copied from QEMU subprojects/libvhost-user/libvhost-user.c */So, there are obvious stylistic differences between this and the rest of the passt code for that reason. As I think I said on an earlier draft, I think we need to go fully one way or the other: either a) rewrite this entirely in passt style or b) change the whole thing so little that it's trivial to pull in new versions from qemu. This seems to be somewhere in the middle.I agree. Thanks, Laurentdiff --git a/virtio.h b/virtio.h new file mode 100644 index 000000000000..61398bb432bc --- /dev/null +++ b/virtio.h @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +// +/* come parts copied from QEMU subprojects/libvhost-user/libvhost-user.h */ + +#ifndef VIRTIO_H +#define VIRTIO_H + +#include <stdbool.h> +#include <linux/vhost_types.h> + +#define VIRTQUEUE_MAX_SIZE 1024 + +#define vu_panic(vdev, ...) \ + do { \ + (vdev)->broken = true; \ + err( __VA_ARGS__ ); \Wouldn't it be simpler to just use die() in place of vu_panic(). This is trying to keep the program running even if the vu device is broken, but if our channel to the guest is broken, I don't think passt is really worth saving.
On Fri, Jul 05, 2024 at 05:06:12PM +0200, Laurent Vivier wrote:On 24/06/2024 04:56, David Gibson wrote:Ok.On Fri, Jun 21, 2024 at 04:56:37PM +0200, Laurent Vivier wrote: > Add virtio.c and virtio.h that define the functions needed > to manage virtqueues. > > Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> > --- > Makefile | 4 +- > util.h | 11 ++ > virtio.c | 446 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ > virtio.h | 123 +++++++++++++++ > 4 files changed, 582 insertions(+), 2 deletions(-) > create mode 100644 virtio.c > create mode 100644 virtio.h >...I'm updating thatdiff --git a/virtio.c b/virtio.c new file mode 100644 index 000000000000..50ec8b5119ed --- /dev/null +++ b/virtio.c @@ -0,0 +1,446 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +Needs an actual "Copyright" invocation as well as the SPDX stuff. Which, yes, is a bit fiddly given that it's largely taken from qemu.From memory: - No braces for one line blocks - snake_case instead of CamelCase - Don't use typedefs for structures or unions - Tabs instead of spacesAs the orignal code in QEMU don't change a lot I chose to update it to match passt coding style. I'm adding the function headers. What else is missing?+/* some parts copied from QEMU subprojects/libvhost-user/libvhost-user.c */So, there are obvious stylistic differences between this and the rest of the passt code for that reason. As I think I said on an earlier draft, I think we need to go fully one way or the other: either a) rewrite this entirely in passt style or b) change the whole thing so little that it's trivial to pull in new versions from qemu. This seems to be somewhere in the middle....-- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibsonI agree. Thanks, Laurentdiff --git a/virtio.h b/virtio.h new file mode 100644 index 000000000000..61398bb432bc --- /dev/null +++ b/virtio.h @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +// +/* come parts copied from QEMU subprojects/libvhost-user/libvhost-user.h */ + +#ifndef VIRTIO_H +#define VIRTIO_H + +#include <stdbool.h> +#include <linux/vhost_types.h> + +#define VIRTQUEUE_MAX_SIZE 1024 + +#define vu_panic(vdev, ...) \ + do { \ + (vdev)->broken = true; \ + err( __VA_ARGS__ ); \Wouldn't it be simpler to just use die() in place of vu_panic(). This is trying to keep the program running even if the vu device is broken, but if our channel to the guest is broken, I don't think passt is really worth saving.
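To make the style gap concrete, here is how one of the declarations from
this patch could look once those rules are applied (an illustration only,
not code from the series):

/* QEMU / libvhost-user style, as currently posted: */
typedef struct VuDevRegion {
    uint64_t gpa;
    uint64_t size;
    uint64_t qva;
    uint64_t mmap_offset;
    uint64_t mmap_addr;
} VuDevRegion;

/* passt style: tabs, snake_case, no typedef, the struct used by its tag: */
struct vu_dev_region {
	uint64_t gpa;
	uint64_t size;
	uint64_t qva;
	uint64_t mmap_offset;
	uint64_t mmap_addr;
};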
Add vhost_user.c and vhost_user.h that define the functions needed to implement vhost-user backend. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- Makefile | 4 +- iov.c | 1 - passt.c | 2 + passt.h | 8 + tap.c | 19 +- tap.h | 2 + vhost_user.c | 1083 ++++++++++++++++++++++++++++++++++++++++++++++++++ vhost_user.h | 137 +++++++ virtio.c | 3 - virtio.h | 1 - 10 files changed, 1252 insertions(+), 8 deletions(-) create mode 100644 vhost_user.c create mode 100644 vhost_user.h diff --git a/Makefile b/Makefile index 39613a7cf1f2..b2da6ad62103 100644 --- a/Makefile +++ b/Makefile @@ -47,7 +47,7 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS) PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \ icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \ ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \ - tcp_buf.c tcp_splice.c udp.c util.c virtio.c + tcp_buf.c tcp_splice.c udp.c util.c vhost_user.c virtio.c QRAP_SRCS = qrap.c SRCS = $(PASST_SRCS) $(QRAP_SRCS) @@ -57,7 +57,7 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \ lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \ siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \ - udp.h util.h virtio.h + udp.h util.h vhost_user.h virtio.h HEADERS = $(PASST_HEADERS) seccomp.h C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 }; diff --git a/iov.c b/iov.c index 3f9e229a305f..3741db21790f 100644 --- a/iov.c +++ b/iov.c @@ -68,7 +68,6 @@ size_t iov_skip_bytes(const struct iovec *iov, size_t n, * * Returns: The number of bytes successfully copied. */ -/* cppcheck-suppress unusedFunction */ size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt, size_t offset, const void *buf, size_t bytes) { diff --git a/passt.c b/passt.c index a5e2c5a8e151..9d21c545b9cf 100644 --- a/passt.c +++ b/passt.c @@ -73,6 +73,8 @@ char *epoll_type_str[] = { [EPOLL_TYPE_TAP_PASTA] = "/dev/net/tun device", [EPOLL_TYPE_TAP_PASST] = "connected qemu socket", [EPOLL_TYPE_TAP_LISTEN] = "listening qemu socket", + [EPOLL_TYPE_VHOST_CMD] = "vhost-user command socket", + [EPOLL_TYPE_VHOST_KICK] = "vhost-user kick socket", }; static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES, "epoll_type_str[] doesn't match enum epoll_type"); diff --git a/passt.h b/passt.h index 46d073a2a6fd..af10d0bfe4ef 100644 --- a/passt.h +++ b/passt.h @@ -22,6 +22,7 @@ union epoll_ref; #include "fwd.h" #include "tcp.h" #include "udp.h" +#include "vhost_user.h" /** * enum epoll_type - Different types of fds we poll over @@ -51,6 +52,10 @@ enum epoll_type { EPOLL_TYPE_TAP_PASST, /* socket listening for qemu socket connections */ EPOLL_TYPE_TAP_LISTEN, + /* vhost-user command socket */ + EPOLL_TYPE_VHOST_CMD, + /* vhost-user kick event socket */ + EPOLL_TYPE_VHOST_KICK, EPOLL_NUM_TYPES, }; @@ -224,6 +229,7 @@ struct ip6_ctx { * @no_map_gw: Don't map connections, untracked UDP to gateway to host * @low_wmem: Low probed net.core.wmem_max * @low_rmem: Low probed net.core.rmem_max + * @vdev: vhost-user device */ struct ctx { enum passt_modes mode; @@ -288,6 +294,8 @@ struct ctx { int low_wmem; int low_rmem; + + struct VuDev vdev; }; void proto_update_l2_buf(const unsigned char *eth_d, diff --git a/tap.c b/tap.c index c9aeff19f177..be272d25b642 100644 --- a/tap.c +++ b/tap.c @@ -977,7 +977,7 @@ void tap_add_packet(struct ctx *c, ssize_t l2len, char *p) * tap_sock_reset() - Handle closing or 
failure of connect AF_UNIX socket * @c: Execution context */ -static void tap_sock_reset(struct ctx *c) +void tap_sock_reset(struct ctx *c) { if (c->one_off) { info("Client closed connection, exiting"); @@ -1296,6 +1296,23 @@ static void tap_sock_tun_init(struct ctx *c) epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev); } +void tap_sock_update_buf(void *base, size_t size) +{ + int i; + + pool_tap4_storage.buf = base; + pool_tap4_storage.buf_size = size; + pool_tap6_storage.buf = base; + pool_tap6_storage.buf_size = size; + + for (i = 0; i < TAP_SEQS; i++) { + tap4_l4[i].p.buf = base; + tap4_l4[i].p.buf_size = size; + tap6_l4[i].p.buf = base; + tap6_l4[i].p.buf_size = size; + } +} + /** * tap_sock_init() - Create and set up AF_UNIX socket or tuntap file descriptor * @c: Execution context diff --git a/tap.h b/tap.h index d496bd0e4b99..3b2dde41ae8d 100644 --- a/tap.h +++ b/tap.h @@ -69,6 +69,8 @@ void tap_handler_pasta(struct ctx *c, uint32_t events, void tap_handler_passt(struct ctx *c, uint32_t events, const struct timespec *now); int tap_sock_unix_open(char *sock_path); +void tap_sock_reset(struct ctx *c); +void tap_sock_update_buf(void *base, size_t size); void tap_sock_init(struct ctx *c); void tap_flush_pools(void); void tap_handler(struct ctx *c, const struct timespec *now); diff --git a/vhost_user.c b/vhost_user.c new file mode 100644 index 000000000000..4ac0a3e53499 --- /dev/null +++ b/vhost_user.c @@ -0,0 +1,1083 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* some parts from QEMU subprojects/libvhost-user/libvhost-user.c */ + +#include <errno.h> +#include <fcntl.h> +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <stddef.h> +#include <string.h> +#include <assert.h> +#include <stdbool.h> +#include <inttypes.h> +#include <time.h> +#include <net/ethernet.h> +#include <netinet/in.h> +#include <sys/epoll.h> +#include <sys/eventfd.h> +#include <sys/mman.h> +#include <linux/vhost_types.h> +#include <linux/virtio_net.h> + +#include "util.h" +#include "passt.h" +#include "tap.h" +#include "vhost_user.h" + +#define VHOST_USER_VERSION 1 + +/* cppcheck-suppress unusedFunction */ +void vu_print_capabilities(void) +{ + printf("{\n"); + printf(" \"type\": \"net\"\n"); + printf("}\n"); + exit(EXIT_SUCCESS); +} + +static const char * +vu_request_to_string(unsigned int req) +{ + if (req < VHOST_USER_MAX) { +#define REQ(req) [req] = #req + static const char * const vu_request_str[] = { + REQ(VHOST_USER_NONE), + REQ(VHOST_USER_GET_FEATURES), + REQ(VHOST_USER_SET_FEATURES), + REQ(VHOST_USER_SET_OWNER), + REQ(VHOST_USER_RESET_OWNER), + REQ(VHOST_USER_SET_MEM_TABLE), + REQ(VHOST_USER_SET_LOG_BASE), + REQ(VHOST_USER_SET_LOG_FD), + REQ(VHOST_USER_SET_VRING_NUM), + REQ(VHOST_USER_SET_VRING_ADDR), + REQ(VHOST_USER_SET_VRING_BASE), + REQ(VHOST_USER_GET_VRING_BASE), + REQ(VHOST_USER_SET_VRING_KICK), + REQ(VHOST_USER_SET_VRING_CALL), + REQ(VHOST_USER_SET_VRING_ERR), + REQ(VHOST_USER_GET_PROTOCOL_FEATURES), + REQ(VHOST_USER_SET_PROTOCOL_FEATURES), + REQ(VHOST_USER_GET_QUEUE_NUM), + REQ(VHOST_USER_SET_VRING_ENABLE), + REQ(VHOST_USER_SEND_RARP), + REQ(VHOST_USER_NET_SET_MTU), + REQ(VHOST_USER_SET_BACKEND_REQ_FD), + REQ(VHOST_USER_IOTLB_MSG), + REQ(VHOST_USER_SET_VRING_ENDIAN), + REQ(VHOST_USER_GET_CONFIG), + REQ(VHOST_USER_SET_CONFIG), + REQ(VHOST_USER_POSTCOPY_ADVISE), + REQ(VHOST_USER_POSTCOPY_LISTEN), + REQ(VHOST_USER_POSTCOPY_END), + REQ(VHOST_USER_GET_INFLIGHT_FD), + REQ(VHOST_USER_SET_INFLIGHT_FD), + REQ(VHOST_USER_GPU_SET_SOCKET), + REQ(VHOST_USER_VRING_KICK), + 
REQ(VHOST_USER_GET_MAX_MEM_SLOTS), + REQ(VHOST_USER_ADD_MEM_REG), + REQ(VHOST_USER_REM_MEM_REG), + REQ(VHOST_USER_MAX), + }; +#undef REQ + return vu_request_str[req]; + } + + return "unknown"; +} + +/* Translate qemu virtual address to our virtual address. */ +static void *qva_to_va(VuDev *dev, uint64_t qemu_addr) +{ + unsigned int i; + + /* Find matching memory region. */ + for (i = 0; i < dev->nregions; i++) { + const VuDevRegion *r = &dev->regions[i]; + + if ((qemu_addr >= r->qva) && (qemu_addr < (r->qva + r->size))) { + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + return (void *)(qemu_addr - r->qva + r->mmap_addr + + r->mmap_offset); + } + } + + return NULL; +} + +static void +vmsg_close_fds(const VhostUserMsg *vmsg) +{ + int i; + + for (i = 0; i < vmsg->fd_num; i++) + close(vmsg->fds[i]); +} + +static void vu_remove_watch(VuDev *vdev, int fd) +{ + const struct ctx *c = (struct ctx *) ((char *)vdev - + offsetof(struct ctx, vdev)); + + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, fd, NULL); +} + +/* Set reply payload.u64 and clear request flags and fd_num */ +static void vmsg_set_reply_u64(struct VhostUserMsg *vmsg, uint64_t val) +{ + vmsg->hdr.flags = 0; /* defaults will be set by vu_send_reply() */ + vmsg->hdr.size = sizeof(vmsg->payload.u64); + vmsg->payload.u64 = val; + vmsg->fd_num = 0; +} + +static ssize_t vu_message_read_default(VuDev *dev, int conn_fd, struct VhostUserMsg *vmsg) +{ + char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * + sizeof(int))] = { 0 }; + struct iovec iov = { + .iov_base = (char *)vmsg, + .iov_len = VHOST_USER_HDR_SIZE, + }; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = control, + .msg_controllen = sizeof(control), + }; + size_t fd_size; + struct cmsghdr *cmsg; + ssize_t ret, sz_payload; + + ret = recvmsg(conn_fd, &msg, MSG_DONTWAIT); + if (ret < 0) { + if (errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK) + return 0; + vu_panic(dev, "Error while recvmsg: %s", strerror(errno)); + goto out; + } + + vmsg->fd_num = 0; + for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL; + cmsg = CMSG_NXTHDR(&msg, cmsg)) { + if (cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_RIGHTS) { + fd_size = cmsg->cmsg_len - CMSG_LEN(0); + vmsg->fd_num = fd_size / sizeof(int); + memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size); + break; + } + } + + sz_payload = vmsg->hdr.size; + if ((size_t)sz_payload > sizeof(vmsg->payload)) { + vu_panic(dev, + "Error: too big message request: %d, size: vmsg->size: %zd, " + "while sizeof(vmsg->payload) = %zu", + vmsg->hdr.request, sz_payload, sizeof(vmsg->payload)); + goto out; + } + + if (sz_payload) { + do { + ret = recv(conn_fd, &vmsg->payload, sz_payload, 0); + } while (ret < 0 && (errno == EINTR || errno == EAGAIN)); + + if (ret < sz_payload) { + vu_panic(dev, "Error while reading: %s", strerror(errno)); + goto out; + } + } + + return 1; +out: + vmsg_close_fds(vmsg); + + return -ECONNRESET; +} + +static int vu_message_write(VuDev *dev, int conn_fd, struct VhostUserMsg *vmsg) +{ + int rc; + const uint8_t *p = (uint8_t *)vmsg; + char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * sizeof(int))] = { 0 }; + struct iovec iov = { + .iov_base = (char *)vmsg, + .iov_len = VHOST_USER_HDR_SIZE, + }; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = control, + }; + + memset(control, 0, sizeof(control)); + assert(vmsg->fd_num <= VHOST_MEMORY_BASELINE_NREGIONS); + if (vmsg->fd_num > 0) { + size_t fdsize = vmsg->fd_num * sizeof(int); + struct cmsghdr *cmsg; + + msg.msg_controllen = 
CMSG_SPACE(fdsize); + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_len = CMSG_LEN(fdsize); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + memcpy(CMSG_DATA(cmsg), vmsg->fds, fdsize); + } else { + msg.msg_controllen = 0; + } + + do { + rc = sendmsg(conn_fd, &msg, 0); + } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); + + if (vmsg->hdr.size) { + do { + if (vmsg->data) + rc = write(conn_fd, vmsg->data, vmsg->hdr.size); + else + rc = write(conn_fd, p + VHOST_USER_HDR_SIZE, vmsg->hdr.size); + } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); + } + + if (rc <= 0) { + vu_panic(dev, "Error while writing: %s", strerror(errno)); + return false; + } + + return true; +} + +static int vu_send_reply(VuDev *dev, int conn_fd, struct VhostUserMsg *msg) +{ + msg->hdr.flags &= ~VHOST_USER_VERSION_MASK; + msg->hdr.flags |= VHOST_USER_VERSION; + msg->hdr.flags |= VHOST_USER_REPLY_MASK; + + return vu_message_write(dev, conn_fd, msg); +} + +static bool vu_get_features_exec(struct VhostUserMsg *msg) +{ + uint64_t features = + 1ULL << VIRTIO_F_VERSION_1 | + 1ULL << VIRTIO_NET_F_MRG_RXBUF | + 1ULL << VHOST_USER_F_PROTOCOL_FEATURES; + + vmsg_set_reply_u64(msg, features); + + debug("Sending back to guest u64: 0x%016"PRIx64, msg->payload.u64); + + return true; +} + +static void +vu_set_enable_all_rings(VuDev *vdev, bool enabled) +{ + uint16_t i; + + for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) + vdev->vq[i].enable = enabled; +} + +static bool +vu_set_features_exec(VuDev *vdev, struct VhostUserMsg *msg) +{ + debug("u64: 0x%016"PRIx64, msg->payload.u64); + + vdev->features = msg->payload.u64; + if (!vu_has_feature(vdev, VIRTIO_F_VERSION_1)) { + /* + * We only support devices conforming to VIRTIO 1.0 or + * later + */ + vu_panic(vdev, "virtio legacy devices aren't supported by passt"); + return false; + } + + if (!vu_has_feature(vdev, VHOST_USER_F_PROTOCOL_FEATURES)) + vu_set_enable_all_rings(vdev, true); + + /* virtio-net features */ + + if (vu_has_feature(vdev, VIRTIO_F_VERSION_1) || + vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) { + vdev->hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf); + } else { + vdev->hdrlen = sizeof(struct virtio_net_hdr); + } + + return false; +} + +static bool +vu_set_owner_exec(void) +{ + return false; +} + +static bool map_ring(VuDev *vdev, VuVirtq *vq) +{ + vq->vring.desc = qva_to_va(vdev, vq->vra.desc_user_addr); + vq->vring.used = qva_to_va(vdev, vq->vra.used_user_addr); + vq->vring.avail = qva_to_va(vdev, vq->vra.avail_user_addr); + + debug("Setting virtq addresses:"); + debug(" vring_desc at %p", (void *)vq->vring.desc); + debug(" vring_used at %p", (void *)vq->vring.used); + debug(" vring_avail at %p", (void *)vq->vring.avail); + + return !(vq->vring.desc && vq->vring.used && vq->vring.avail); +} + +/* cppcheck-suppress unusedFunction */ +int vu_packet_check_range(void *buf, size_t offset, size_t len, const char *start, + const char *func, int line) +{ + VuDevRegion *dev_region; + + for (dev_region = buf; dev_region->mmap_addr; dev_region++) { + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + char *m = (char *)dev_region->mmap_addr; + + if (m <= start && + start + offset + len < m + dev_region->mmap_offset + + dev_region->size) + return 0; + } + if (func) + trace("cannot find region, %s:%i", func, line); + + return -1; +} + +/* + * #syscalls:passt mmap munmap + */ + +static bool vu_set_mem_table_exec(VuDev *vdev, + struct VhostUserMsg *msg) +{ + unsigned int i; + struct VhostUserMemory m = msg->payload.memory, *memory = &m; + + for (i = 0; i < 
vdev->nregions; i++) { + VuDevRegion *r = &vdev->regions[i]; + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + void *mm = (void *)r->mmap_addr; + + if (mm) + munmap(mm, r->size + r->mmap_offset); + } + vdev->nregions = memory->nregions; + + debug("Nregions: %u", memory->nregions); + for (i = 0; i < vdev->nregions; i++) { + void *mmap_addr; + VhostUserMemory_region *msg_region = &memory->regions[i]; + VuDevRegion *dev_region = &vdev->regions[i]; + + debug("Region %d", i); + debug(" guest_phys_addr: 0x%016"PRIx64, + msg_region->guest_phys_addr); + debug(" memory_size: 0x%016"PRIx64, + msg_region->memory_size); + debug(" userspace_addr 0x%016"PRIx64, + msg_region->userspace_addr); + debug(" mmap_offset 0x%016"PRIx64, + msg_region->mmap_offset); + + dev_region->gpa = msg_region->guest_phys_addr; + dev_region->size = msg_region->memory_size; + dev_region->qva = msg_region->userspace_addr; + dev_region->mmap_offset = msg_region->mmap_offset; + + /* We don't use offset argument of mmap() since the + * mapped address has to be page aligned, and we use huge + * pages. + */ + mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_NORESERVE, + msg->fds[i], 0); + + if (mmap_addr == MAP_FAILED) { + vu_panic(vdev, "region mmap error: %s", strerror(errno)); + } else { + dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; + debug(" mmap_addr: 0x%016"PRIx64, + dev_region->mmap_addr); + } + + close(msg->fds[i]); + } + + for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) { + if (vdev->vq[i].vring.desc) { + if (map_ring(vdev, &vdev->vq[i])) + vu_panic(vdev, "remapping queue %d during setmemtable", i); + } + } + + /* XXX */ + ASSERT(vdev->nregions < VHOST_USER_MAX_RAM_SLOTS - 1); + vdev->regions[vdev->nregions].mmap_addr = 0; /* mark EOF for vu_packet_check_range() */ + + tap_sock_update_buf(vdev->regions, 0); + + return false; +} + +static bool vu_set_vring_num_exec(VuDev *vdev, + struct VhostUserMsg *msg) +{ + unsigned int idx = msg->payload.state.index; + unsigned int num = msg->payload.state.num; + + debug("State.index: %u", idx); + debug("State.num: %u", num); + vdev->vq[idx].vring.num = num; + + return false; +} + +static bool vu_set_vring_addr_exec(VuDev *vdev, + struct VhostUserMsg *msg) +{ + struct vhost_vring_addr addr = msg->payload.addr, *vra = &addr; + VuVirtq *vq = &vdev->vq[vra->index]; + + debug("vhost_vring_addr:"); + debug(" index: %d", vra->index); + debug(" flags: %d", vra->flags); + debug(" desc_user_addr: 0x%016" PRIx64, (uint64_t)vra->desc_user_addr); + debug(" used_user_addr: 0x%016" PRIx64, (uint64_t)vra->used_user_addr); + debug(" avail_user_addr: 0x%016" PRIx64, (uint64_t)vra->avail_user_addr); + debug(" log_guest_addr: 0x%016" PRIx64, (uint64_t)vra->log_guest_addr); + + vq->vra = *vra; + vq->vring.flags = vra->flags; + vq->vring.log_guest_addr = vra->log_guest_addr; + + if (map_ring(vdev, vq)) { + vu_panic(vdev, "Invalid vring_addr message"); + return false; + } + + vq->used_idx = le16toh(vq->vring.used->idx); + + if (vq->last_avail_idx != vq->used_idx) { + debug("Last avail index != used index: %u != %u", + vq->last_avail_idx, vq->used_idx); + } + + return false; +} + +static bool vu_set_vring_base_exec(VuDev *vdev, + struct VhostUserMsg *msg) +{ + unsigned int idx = msg->payload.state.index; + unsigned int num = msg->payload.state.num; + + debug("State.index: %u", idx); + debug("State.num: %u", num); + vdev->vq[idx].shadow_avail_idx = vdev->vq[idx].last_avail_idx = num; + + return false; +} + +static bool 
vu_get_vring_base_exec(VuDev *vdev, + struct VhostUserMsg *msg) +{ + unsigned int idx = msg->payload.state.index; + + debug("State.index: %u", idx); + msg->payload.state.num = vdev->vq[idx].last_avail_idx; + msg->hdr.size = sizeof(msg->payload.state); + + vdev->vq[idx].started = false; + + if (vdev->vq[idx].call_fd != -1) { + close(vdev->vq[idx].call_fd); + vdev->vq[idx].call_fd = -1; + } + if (vdev->vq[idx].kick_fd != -1) { + vu_remove_watch(vdev, vdev->vq[idx].kick_fd); + close(vdev->vq[idx].kick_fd); + vdev->vq[idx].kick_fd = -1; + } + + return true; +} + +static void vu_set_watch(VuDev *vdev, int fd) +{ + const struct ctx *c = (struct ctx *) + ((char *)vdev - offsetof(struct ctx, vdev)); + union epoll_ref ref = { .type = EPOLL_TYPE_VHOST_KICK, .fd = fd }; + struct epoll_event ev = { 0 }; + + ev.data.u64 = ref.u64; + ev.events = EPOLLIN; + epoll_ctl(c->epollfd, EPOLL_CTL_ADD, fd, &ev); +} + +static int vu_wait_queue(const VuVirtq *vq) +{ + eventfd_t kick_data; + ssize_t rc; + int status; + + /* wait the kernel to put new entries in the queue */ + + status = fcntl(vq->kick_fd, F_GETFL); + if (status == -1) + return -1; + + fcntl(vq->kick_fd, F_SETFL, status & ~O_NONBLOCK); + rc = eventfd_read(vq->kick_fd, &kick_data); + fcntl(vq->kick_fd, F_SETFL, status); + if (rc == -1) + return -1; + + return 0; +} + +/* cppcheck-suppress unusedFunction */ +int vu_send(const struct ctx *c, const void *buf, size_t size) +{ + VuDev *vdev = (VuDev *)&c->vdev; + size_t hdrlen = vdev->hdrlen; + VuVirtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; + VuVirtqElement elem[VIRTQUEUE_MAX_SIZE]; + struct iovec in_sg[VIRTQUEUE_MAX_SIZE]; + size_t lens[VIRTQUEUE_MAX_SIZE]; + size_t offset; + int i, j; + __virtio16 *num_buffers_ptr; + int in_sg_count; + + debug("vu_send size %zu hdrlen %zu", size, hdrlen); + + if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) { + err("Got packet, but no available descriptors on RX virtq."); + return 0; + } + + offset = 0; + i = 0; + num_buffers_ptr = NULL; + in_sg_count = 0; + while (offset < size) { + size_t len; + int total; + int ret; + + total = 0; + + if (i == ARRAY_SIZE(elem) || + in_sg_count == ARRAY_SIZE(in_sg)) { + err("virtio-net unexpected long buffer chain"); + goto err; + } + + elem[i].out_num = 0; + elem[i].out_sg = NULL; + elem[i].in_num = ARRAY_SIZE(in_sg) - in_sg_count; + elem[i].in_sg = &in_sg[in_sg_count]; + + ret = vu_queue_pop(vdev, vq, &elem[i]); + if (ret < 0) { + if (!vdev->broken) { + if (vu_wait_queue(vq) != -1) + continue; + } + if (i) { + err("virtio-net unexpected empty queue: " + "i %d mergeable %d offset %zd, size %zd, " + "features 0x%" PRIx64, + i, vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF), + offset, size, vdev->features); + } + offset = -1; + goto err; + } + in_sg_count += elem[i].in_num; + + if (elem[i].in_num < 1) { + err("virtio-net receive queue contains no in buffers"); + vu_queue_detach_element(vdev, vq, elem[i].index, 0); + offset = -1; + goto err; + } + + if (i == 0) { + struct virtio_net_hdr hdr = { + .flags = VIRTIO_NET_HDR_F_DATA_VALID, + .gso_type = VIRTIO_NET_HDR_GSO_NONE, + }; + + ASSERT(offset == 0); + ASSERT(elem[i].in_sg[0].iov_len >= hdrlen); + + len = iov_from_buf(elem[i].in_sg, elem[i].in_num, 0, + &hdr, sizeof(hdr)); + + num_buffers_ptr = (__virtio16 *)((char *)elem[i].in_sg[0].iov_base + + len); + + total += hdrlen; + } + + len = iov_from_buf(elem[i].in_sg, elem[i].in_num, total, (char *)buf + offset, + size - offset); + + total += len; + offset += len; + + /* If buffers can't be merged, at this point we + * must have consumed 
the complete packet. + * Otherwise, drop it. + */ + if (!vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF) && offset < size) { + vu_queue_unpop(vdev, vq, elem[i].index, total); + goto err; + } + + lens[i] = total; + i++; + } + + if (num_buffers_ptr && vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) + *num_buffers_ptr = htole16(i); + + for (j = 0; j < i; j++) { + debug("filling total %zd idx %d", lens[j], j); + vu_queue_fill(vdev, vq, &elem[j], lens[j], j); + } + + vu_queue_flush(vdev, vq, i); + vu_queue_notify(vdev, vq); + + debug("sent %zu", offset); + + return offset; +err: + for (j = 0; j < i; j++) + vu_queue_detach_element(vdev, vq, elem[j].index, lens[j]); + + return offset; +} + +static void vu_handle_tx(VuDev *vdev, int index) +{ + struct ctx *c = (struct ctx *) ((char *)vdev - offsetof(struct ctx, vdev)); + VuVirtq *vq = &vdev->vq[index]; + int hdrlen = vdev->hdrlen; + struct timespec now; + VuVirtqElement elem[VIRTQUEUE_MAX_SIZE]; + struct iovec out_sg[VIRTQUEUE_MAX_SIZE]; + int out_sg_count; + + int count; + + if (index % 2 != VHOST_USER_TX_QUEUE) { + debug("index %d is not an TX queue", index); + return; + } + + clock_gettime(CLOCK_MONOTONIC, &now); + + tap_flush_pools(); + + count = 0; + out_sg_count = 0; + while (1) { + int ret; + + ASSERT(index == VHOST_USER_TX_QUEUE); + + elem[count].out_num = 1; + elem[count].out_sg = &out_sg[out_sg_count]; + elem[count].in_num = 0; + elem[count].in_sg = NULL; + ret = vu_queue_pop(vdev, vq, &elem[count]); + if (ret < 0) + break; + out_sg_count += elem[count].out_num; + + if (elem[count].out_num < 1) { + debug("virtio-net header not in first element"); + break; + } + ASSERT(elem[count].out_num == 1); + + tap_add_packet(c, elem[count].out_sg[0].iov_len - hdrlen, + (char *)elem[count].out_sg[0].iov_base + hdrlen); + count++; + } + tap_handler(c, &now); + + if (count) { + int i; + + for (i = 0; i < count; i++) + vu_queue_fill(vdev, vq, &elem[i], 0, i); + vu_queue_flush(vdev, vq, count); + vu_queue_notify(vdev, vq); + } +} + +/* cppcheck-suppress unusedFunction */ +void vu_kick_cb(struct ctx *c, union epoll_ref ref) +{ + VuDev *vdev = &c->vdev; + eventfd_t kick_data; + ssize_t rc; + int idx; + + for (idx = 0; idx < VHOST_USER_MAX_QUEUES; idx++) + if (c->vdev.vq[idx].kick_fd == ref.fd) + break; + + if (idx == VHOST_USER_MAX_QUEUES) + return; + + rc = eventfd_read(ref.fd, &kick_data); + if (rc == -1) { + vu_panic(vdev, "kick eventfd_read(): %s", strerror(errno)); + vu_remove_watch(vdev, ref.fd); + } else { + debug("Got kick_data: %016"PRIx64" idx:%d", + kick_data, idx); + if (idx % 2 == VHOST_USER_TX_QUEUE) + vu_handle_tx(vdev, idx); + } +} + +static bool vu_check_queue_msg_file(VuDev *vdev, struct VhostUserMsg *msg) +{ + int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; + + if (idx >= VHOST_USER_MAX_QUEUES) { + vmsg_close_fds(msg); + vu_panic(vdev, "Invalid queue index: %u", idx); + return false; + } + + if (nofd) { + vmsg_close_fds(msg); + return true; + } + + if (msg->fd_num != 1) { + vmsg_close_fds(msg); + vu_panic(vdev, "Invalid fds in request: %d", msg->hdr.request); + return false; + } + + return true; +} + +static bool vu_set_vring_kick_exec(VuDev *vdev, + struct VhostUserMsg *msg) +{ + int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; + + debug("u64: 0x%016"PRIx64, msg->payload.u64); + + if (!vu_check_queue_msg_file(vdev, msg)) + return false; + + if (vdev->vq[idx].kick_fd != -1) { + vu_remove_watch(vdev, 
vdev->vq[idx].kick_fd); + close(vdev->vq[idx].kick_fd); + vdev->vq[idx].kick_fd = -1; + } + + /* cppcheck-suppress redundantAssignment */ + vdev->vq[idx].kick_fd = nofd ? -1 : msg->fds[0]; + debug("Got kick_fd: %d for vq: %d", vdev->vq[idx].kick_fd, idx); + + vdev->vq[idx].started = true; + + if (vdev->vq[idx].kick_fd != -1 && idx % 2 == VHOST_USER_TX_QUEUE) { + vu_set_watch(vdev, vdev->vq[idx].kick_fd); + debug("Waiting for kicks on fd: %d for vq: %d", + vdev->vq[idx].kick_fd, idx); + } + + return false; +} + +static bool vu_set_vring_call_exec(VuDev *vdev, + struct VhostUserMsg *msg) +{ + int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; + + debug("u64: 0x%016"PRIx64, msg->payload.u64); + + if (!vu_check_queue_msg_file(vdev, msg)) + return false; + + if (vdev->vq[idx].call_fd != -1) { + close(vdev->vq[idx].call_fd); + vdev->vq[idx].call_fd = -1; + } + + /* cppcheck-suppress redundantAssignment */ + vdev->vq[idx].call_fd = nofd ? -1 : msg->fds[0]; + + /* in case of I/O hang after reconnecting */ + if (vdev->vq[idx].call_fd != -1) + eventfd_write(msg->fds[0], 1); + + debug("Got call_fd: %d for vq: %d", vdev->vq[idx].call_fd, idx); + + return false; +} + +static bool vu_set_vring_err_exec(VuDev *vdev, + struct VhostUserMsg *msg) +{ + int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; + + debug("u64: 0x%016"PRIx64, msg->payload.u64); + + if (!vu_check_queue_msg_file(vdev, msg)) + return false; + + if (vdev->vq[idx].err_fd != -1) { + close(vdev->vq[idx].err_fd); + vdev->vq[idx].err_fd = -1; + } + + /* cppcheck-suppress redundantAssignment */ + vdev->vq[idx].err_fd = nofd ? -1 : msg->fds[0]; + + return false; +} + +static bool vu_get_protocol_features_exec(struct VhostUserMsg *msg) +{ + uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK; + + vmsg_set_reply_u64(msg, features); + + return true; +} + +static bool vu_set_protocol_features_exec(VuDev *vdev, struct VhostUserMsg *msg) +{ + uint64_t features = msg->payload.u64; + + debug("u64: 0x%016"PRIx64, features); + + vdev->protocol_features = msg->payload.u64; + + if (vu_has_protocol_feature(vdev, + VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS) && + (!vu_has_protocol_feature(vdev, VHOST_USER_PROTOCOL_F_BACKEND_REQ) || + !vu_has_protocol_feature(vdev, VHOST_USER_PROTOCOL_F_REPLY_ACK))) { + /* + * The use case for using messages for kick/call is simulation, to make + * the kick and call synchronous. To actually get that behaviour, both + * of the other features are required. + * Theoretically, one could use only kick messages, or do them without + * having F_REPLY_ACK, but too many (possibly pending) messages on the + * socket will eventually cause the master to hang, to avoid this in + * scenarios where not desired enforce that the settings are in a way + * that actually enables the simulation case. 
+ */ + vu_panic(vdev, + "F_IN_BAND_NOTIFICATIONS requires F_BACKEND_REQ && F_REPLY_ACK"); + return false; + } + + return false; +} + + +static bool vu_get_queue_num_exec(struct VhostUserMsg *msg) +{ + vmsg_set_reply_u64(msg, VHOST_USER_MAX_QUEUES); + return true; +} + +static bool vu_set_vring_enable_exec(VuDev *vdev, struct VhostUserMsg *msg) +{ + unsigned int idx = msg->payload.state.index; + unsigned int enable = msg->payload.state.num; + + debug("State.index: %u", idx); + debug("State.enable: %u", enable); + + if (idx >= VHOST_USER_MAX_QUEUES) { + vu_panic(vdev, "Invalid vring_enable index: %u", idx); + return false; + } + + vdev->vq[idx].enable = enable; + return false; +} + +/* cppcheck-suppress unusedFunction */ +void vu_init(struct ctx *c) +{ + int i; + + c->vdev.hdrlen = 0; + for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) + c->vdev.vq[i] = (VuVirtq){ + .call_fd = -1, + .kick_fd = -1, + .err_fd = -1, + .notification = true, + }; +} + +static void vu_cleanup(VuDev *vdev) +{ + unsigned int i; + + for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) { + VuVirtq *vq = &vdev->vq[i]; + + vq->started = false; + vq->notification = true; + + if (vq->call_fd != -1) { + close(vq->call_fd); + vq->call_fd = -1; + } + if (vq->err_fd != -1) { + close(vq->err_fd); + vq->err_fd = -1; + } + if (vq->kick_fd != -1) { + vu_remove_watch(vdev, vq->kick_fd); + close(vq->kick_fd); + vq->kick_fd = -1; + } + + vq->vring.desc = 0; + vq->vring.used = 0; + vq->vring.avail = 0; + } + vdev->hdrlen = 0; + + for (i = 0; i < vdev->nregions; i++) { + const VuDevRegion *r = &vdev->regions[i]; + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + void *m = (void *)r->mmap_addr; + + if (m) + munmap(m, r->size + r->mmap_offset); + } + vdev->nregions = 0; +} + +/** + * tap_handler_vu() - Packet handler for vhost-user + * @c: Execution context + * @events: epoll events + */ +/* cppcheck-suppress unusedFunction */ +void tap_handler_vu(struct ctx *c, uint32_t events) +{ + VuDev *dev = &c->vdev; + struct VhostUserMsg msg = { 0 }; + bool need_reply, reply_requested; + int ret; + + if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR)) { + tap_sock_reset(c); + return; + } + + + ret = vu_message_read_default(dev, c->fd_tap, &msg); + if (ret <= 0) { + if (errno != EINTR && errno != EAGAIN && errno != EWOULDBLOCK) + tap_sock_reset(c); + return; + } + debug("================ Vhost user message ================"); + debug("Request: %s (%d)", vu_request_to_string(msg.hdr.request), + msg.hdr.request); + debug("Flags: 0x%x", msg.hdr.flags); + debug("Size: %u", msg.hdr.size); + + need_reply = msg.hdr.flags & VHOST_USER_NEED_REPLY_MASK; + switch (msg.hdr.request) { + case VHOST_USER_GET_FEATURES: + reply_requested = vu_get_features_exec(&msg); + break; + case VHOST_USER_SET_FEATURES: + reply_requested = vu_set_features_exec(dev, &msg); + break; + case VHOST_USER_GET_PROTOCOL_FEATURES: + reply_requested = vu_get_protocol_features_exec(&msg); + break; + case VHOST_USER_SET_PROTOCOL_FEATURES: + reply_requested = vu_set_protocol_features_exec(dev, &msg); + break; + case VHOST_USER_GET_QUEUE_NUM: + reply_requested = vu_get_queue_num_exec(&msg); + break; + case VHOST_USER_SET_OWNER: + reply_requested = vu_set_owner_exec(); + break; + case VHOST_USER_SET_MEM_TABLE: + reply_requested = vu_set_mem_table_exec(dev, &msg); + break; + case VHOST_USER_SET_VRING_NUM: + reply_requested = vu_set_vring_num_exec(dev, &msg); + break; + case VHOST_USER_SET_VRING_ADDR: + reply_requested = vu_set_vring_addr_exec(dev, &msg); + break; + case VHOST_USER_SET_VRING_BASE: + 
reply_requested = vu_set_vring_base_exec(dev, &msg); + break; + case VHOST_USER_GET_VRING_BASE: + reply_requested = vu_get_vring_base_exec(dev, &msg); + break; + case VHOST_USER_SET_VRING_KICK: + reply_requested = vu_set_vring_kick_exec(dev, &msg); + break; + case VHOST_USER_SET_VRING_CALL: + reply_requested = vu_set_vring_call_exec(dev, &msg); + break; + case VHOST_USER_SET_VRING_ERR: + reply_requested = vu_set_vring_err_exec(dev, &msg); + break; + case VHOST_USER_SET_VRING_ENABLE: + reply_requested = vu_set_vring_enable_exec(dev, &msg); + break; + case VHOST_USER_NONE: + vu_cleanup(dev); + return; + default: + vu_panic(dev, "Unhandled request: %d", msg.hdr.request); + return; + } + + if (!reply_requested && need_reply) { + msg.payload.u64 = 0; + msg.hdr.flags = 0; + msg.hdr.size = sizeof(msg.payload.u64); + msg.fd_num = 0; + reply_requested = true; + } + + if (reply_requested) + vu_send_reply(dev, c->fd_tap, &msg); + free(msg.data); +} diff --git a/vhost_user.h b/vhost_user.h new file mode 100644 index 000000000000..c6edc49a3bb9 --- /dev/null +++ b/vhost_user.h @@ -0,0 +1,137 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* some parts from subprojects/libvhost-user/libvhost-user.h */ + +#ifndef VHOST_USER_H +#define VHOST_USER_H + +#include "virtio.h" +#include "iov.h" + +#define VHOST_USER_F_PROTOCOL_FEATURES 30 + +#define VHOST_MEMORY_BASELINE_NREGIONS 8 + +enum vhost_user_protocol_feature { + VHOST_USER_PROTOCOL_F_MQ = 0, + VHOST_USER_PROTOCOL_F_LOG_SHMFD = 1, + VHOST_USER_PROTOCOL_F_RARP = 2, + VHOST_USER_PROTOCOL_F_REPLY_ACK = 3, + VHOST_USER_PROTOCOL_F_NET_MTU = 4, + VHOST_USER_PROTOCOL_F_BACKEND_REQ = 5, + VHOST_USER_PROTOCOL_F_CROSS_ENDIAN = 6, + VHOST_USER_PROTOCOL_F_CRYPTO_SESSION = 7, + VHOST_USER_PROTOCOL_F_PAGEFAULT = 8, + VHOST_USER_PROTOCOL_F_CONFIG = 9, + VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10, + VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11, + VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12, + VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS = 14, + VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS = 15, + + VHOST_USER_PROTOCOL_F_MAX +}; + +enum vhost_user_request { + VHOST_USER_NONE = 0, + VHOST_USER_GET_FEATURES = 1, + VHOST_USER_SET_FEATURES = 2, + VHOST_USER_SET_OWNER = 3, + VHOST_USER_RESET_OWNER = 4, + VHOST_USER_SET_MEM_TABLE = 5, + VHOST_USER_SET_LOG_BASE = 6, + VHOST_USER_SET_LOG_FD = 7, + VHOST_USER_SET_VRING_NUM = 8, + VHOST_USER_SET_VRING_ADDR = 9, + VHOST_USER_SET_VRING_BASE = 10, + VHOST_USER_GET_VRING_BASE = 11, + VHOST_USER_SET_VRING_KICK = 12, + VHOST_USER_SET_VRING_CALL = 13, + VHOST_USER_SET_VRING_ERR = 14, + VHOST_USER_GET_PROTOCOL_FEATURES = 15, + VHOST_USER_SET_PROTOCOL_FEATURES = 16, + VHOST_USER_GET_QUEUE_NUM = 17, + VHOST_USER_SET_VRING_ENABLE = 18, + VHOST_USER_SEND_RARP = 19, + VHOST_USER_NET_SET_MTU = 20, + VHOST_USER_SET_BACKEND_REQ_FD = 21, + VHOST_USER_IOTLB_MSG = 22, + VHOST_USER_SET_VRING_ENDIAN = 23, + VHOST_USER_GET_CONFIG = 24, + VHOST_USER_SET_CONFIG = 25, + VHOST_USER_CREATE_CRYPTO_SESSION = 26, + VHOST_USER_CLOSE_CRYPTO_SESSION = 27, + VHOST_USER_POSTCOPY_ADVISE = 28, + VHOST_USER_POSTCOPY_LISTEN = 29, + VHOST_USER_POSTCOPY_END = 30, + VHOST_USER_GET_INFLIGHT_FD = 31, + VHOST_USER_SET_INFLIGHT_FD = 32, + VHOST_USER_GPU_SET_SOCKET = 33, + VHOST_USER_VRING_KICK = 35, + VHOST_USER_GET_MAX_MEM_SLOTS = 36, + VHOST_USER_ADD_MEM_REG = 37, + VHOST_USER_REM_MEM_REG = 38, + VHOST_USER_MAX +}; + +typedef struct { + enum vhost_user_request request; + +#define VHOST_USER_VERSION_MASK 0x3 +#define VHOST_USER_REPLY_MASK (0x1 << 2) +#define 
VHOST_USER_NEED_REPLY_MASK (0x1 << 3) + uint32_t flags; + uint32_t size; /* the following payload size */ +} __attribute__ ((__packed__)) vhost_user_header; + +typedef struct VhostUserMemory_region { + uint64_t guest_phys_addr; + uint64_t memory_size; + uint64_t userspace_addr; + uint64_t mmap_offset; +} VhostUserMemory_region; + +struct VhostUserMemory { + uint32_t nregions; + uint32_t padding; + struct VhostUserMemory_region regions[VHOST_MEMORY_BASELINE_NREGIONS]; +}; + +typedef union { +#define VHOST_USER_VRING_IDX_MASK 0xff +#define VHOST_USER_VRING_NOFD_MASK (0x1 << 8) + uint64_t u64; + struct vhost_vring_state state; + struct vhost_vring_addr addr; + struct VhostUserMemory memory; +} vhost_user_payload; + +typedef struct VhostUserMsg { + vhost_user_header hdr; + vhost_user_payload payload; + + int fds[VHOST_MEMORY_BASELINE_NREGIONS]; + int fd_num; + uint8_t *data; +} __attribute__ ((__packed__)) VhostUserMsg; +#define VHOST_USER_HDR_SIZE sizeof(vhost_user_header) + +#define VHOST_USER_RX_QUEUE 0 +#define VHOST_USER_TX_QUEUE 1 + +static inline bool vu_queue_enabled(const VuVirtq *vq) +{ + return vq->enable; +} + +static inline bool vu_queue_started(const VuVirtq *vq) +{ + return vq->started; +} + +int vu_send(const struct ctx *c, const void *buf, size_t size); +void vu_print_capabilities(void); +void vu_init(struct ctx *c); +void vu_kick_cb(struct ctx *c, union epoll_ref ref); +void tap_handler_vu(struct ctx *c, uint32_t events); +#endif /* VHOST_USER_H */ diff --git a/virtio.c b/virtio.c index 50ec8b5119ed..5d58e56204b3 100644 --- a/virtio.c +++ b/virtio.c @@ -169,7 +169,6 @@ static bool vring_notify(const VuDev *dev, VuVirtq *vq) return !v || vring_need_event(vring_get_used_event(vq), new, old); } -/* cppcheck-suppress unusedFunction */ void vu_queue_notify(VuDev *dev, VuVirtq *vq) { if (dev->broken || !vq->vring.avail) @@ -313,7 +312,6 @@ vu_queue_map_desc(VuDev *dev, VuVirtq *vq, unsigned int idx, return 0; } -/* cppcheck-suppress unusedFunction */ int vu_queue_pop(VuDev *dev, VuVirtq *vq, VuVirtqElement *elem) { unsigned int head; @@ -363,7 +361,6 @@ void vu_queue_detach_element(VuDev *dev, VuVirtq *vq, /* unmap, when DMA support is added */ } -/* cppcheck-suppress unusedFunction */ void vu_queue_unpop(VuDev *dev, VuVirtq *vq, unsigned int index, size_t len) { vq->last_avail_idx--; diff --git a/virtio.h b/virtio.h index 61398bb432bc..f4a9f0c23804 100644 --- a/virtio.h +++ b/virtio.h @@ -100,7 +100,6 @@ static inline bool vu_has_feature(const VuDev *vdev, unsigned int fbit) return has_feature(vdev->features, fbit); } -/* cppcheck-suppress unusedFunction */ static inline bool vu_has_protocol_feature(const VuDev *vdev, unsigned int fbit) { return has_feature(vdev->protocol_features, fbit); -- 2.45.2
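One pattern in this patch that is easy to miss: vhost_user.c never keeps a
back-pointer from the vhost-user device to the execution context. Instead,
struct VuDev is embedded in struct ctx, and helpers such as vu_remove_watch(),
vu_set_watch() and vu_handle_tx() recover the context by subtracting
offsetof(struct ctx, vdev) from the vdev pointer. Below is a standalone sketch
of that container-of idiom, using placeholder structures (demo_dev, demo_ctx)
rather than the real ones:

#include <stddef.h>
#include <stdio.h>

/* Placeholder stand-ins for struct VuDev embedded in struct ctx */
struct demo_dev {
	int broken;
};

struct demo_ctx {
	int epollfd;
	struct demo_dev vdev;	/* embedded by value, not a pointer */
};

/* Recover the containing context from a pointer to the embedded device,
 * the same way vu_remove_watch() and friends do in vhost_user.c.
 */
static struct demo_ctx *dev_to_ctx(struct demo_dev *vdev)
{
	return (struct demo_ctx *)((char *)vdev - offsetof(struct demo_ctx, vdev));
}

int main(void)
{
	struct demo_ctx c = { .epollfd = 42 };
	struct demo_dev *vdev = &c.vdev;

	/* The kick/command callbacks only receive the device pointer;
	 * this prints 42, showing the context is reachable from it.
	 */
	printf("%d\n", dev_to_ctx(vdev)->epollfd);
	return 0;
}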
On Fri, Jun 21, 2024 at 04:56:38PM +0200, Laurent Vivier wrote:Add vhost_user.c and vhost_user.h that define the functions needed to implement vhost-user backend. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com>I'm a little confused by this patch. The commit message seems to suggest that like the previous patch it's basically just a code import from qemu. However... [snip]diff --git a/passt.c b/passt.c index a5e2c5a8e151..9d21c545b9cf 100644 --- a/passt.c +++ b/passt.c @@ -73,6 +73,8 @@ char *epoll_type_str[] = { [EPOLL_TYPE_TAP_PASTA] = "/dev/net/tun device", [EPOLL_TYPE_TAP_PASST] = "connected qemu socket", [EPOLL_TYPE_TAP_LISTEN] = "listening qemu socket", + [EPOLL_TYPE_VHOST_CMD] = "vhost-user command socket", + [EPOLL_TYPE_VHOST_KICK] = "vhost-user kick socket",... we also have real changes to passt specific code. It's not very obvious to me what the boundaries of that are.}; static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES, "epoll_type_str[] doesn't match enum epoll_type"); diff --git a/passt.h b/passt.h index 46d073a2a6fd..af10d0bfe4ef 100644 --- a/passt.h +++ b/passt.h @@ -22,6 +22,7 @@ union epoll_ref; #include "fwd.h" #include "tcp.h" #include "udp.h" +#include "vhost_user.h" /** * enum epoll_type - Different types of fds we poll over @@ -51,6 +52,10 @@ enum epoll_type { EPOLL_TYPE_TAP_PASST, /* socket listening for qemu socket connections */ EPOLL_TYPE_TAP_LISTEN, + /* vhost-user command socket */ + EPOLL_TYPE_VHOST_CMD, + /* vhost-user kick event socket */ + EPOLL_TYPE_VHOST_KICK, EPOLL_NUM_TYPES, }; @@ -224,6 +229,7 @@ struct ip6_ctx { * @no_map_gw: Don't map connections, untracked UDP to gateway to host * @low_wmem: Low probed net.core.wmem_max * @low_rmem: Low probed net.core.rmem_max + * @vdev: vhost-user device */ struct ctx { enum passt_modes mode; @@ -288,6 +294,8 @@ struct ctx { int low_wmem; int low_rmem; + + struct VuDev vdev; }; void proto_update_l2_buf(const unsigned char *eth_d, diff --git a/tap.c b/tap.c index c9aeff19f177..be272d25b642 100644 --- a/tap.c +++ b/tap.c @@ -977,7 +977,7 @@ void tap_add_packet(struct ctx *c, ssize_t l2len, char *p) * tap_sock_reset() - Handle closing or failure of connect AF_UNIX socket * @c: Execution context */ -static void tap_sock_reset(struct ctx *c) +void tap_sock_reset(struct ctx *c) { if (c->one_off) { info("Client closed connection, exiting"); @@ -1296,6 +1296,23 @@ static void tap_sock_tun_init(struct ctx *c) epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev); } +void tap_sock_update_buf(void *base, size_t size) +{ + int i; + + pool_tap4_storage.buf = base; + pool_tap4_storage.buf_size = size; + pool_tap6_storage.buf = base; + pool_tap6_storage.buf_size = size; + + for (i = 0; i < TAP_SEQS; i++) { + tap4_l4[i].p.buf = base; + tap4_l4[i].p.buf_size = size; + tap6_l4[i].p.buf = base; + tap6_l4[i].p.buf_size = size; + } +} + /** * tap_sock_init() - Create and set up AF_UNIX socket or tuntap file descriptor * @c: Execution context diff --git a/tap.h b/tap.h index d496bd0e4b99..3b2dde41ae8d 100644 --- a/tap.h +++ b/tap.h @@ -69,6 +69,8 @@ void tap_handler_pasta(struct ctx *c, uint32_t events, void tap_handler_passt(struct ctx *c, uint32_t events, const struct timespec *now); int tap_sock_unix_open(char *sock_path); +void tap_sock_reset(struct ctx *c); +void tap_sock_update_buf(void *base, size_t size); void tap_sock_init(struct ctx *c); void tap_flush_pools(void); void tap_handler(struct ctx *c, const struct timespec *now); diff --git a/vhost_user.c b/vhost_user.c new file mode 100644 index 
000000000000..4ac0a3e53499 --- /dev/null +++ b/vhost_user.c @@ -0,0 +1,1083 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* some parts from QEMU subprojects/libvhost-user/libvhost-user.c */ + +#include <errno.h> +#include <fcntl.h> +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <stddef.h> +#include <string.h> +#include <assert.h> +#include <stdbool.h> +#include <inttypes.h> +#include <time.h> +#include <net/ethernet.h> +#include <netinet/in.h> +#include <sys/epoll.h> +#include <sys/eventfd.h> +#include <sys/mman.h> +#include <linux/vhost_types.h> +#include <linux/virtio_net.h> + +#include "util.h" +#include "passt.h" +#include "tap.h" +#include "vhost_user.h" + +#define VHOST_USER_VERSION 1 + +/* cppcheck-suppress unusedFunction */ +void vu_print_capabilities(void) +{ + printf("{\n"); + printf(" \"type\": \"net\"\n"); + printf("}\n"); + exit(EXIT_SUCCESS); +} + +static const char * +vu_request_to_string(unsigned int req) +{ + if (req < VHOST_USER_MAX) { +#define REQ(req) [req] = #req + static const char * const vu_request_str[] = { + REQ(VHOST_USER_NONE), + REQ(VHOST_USER_GET_FEATURES), + REQ(VHOST_USER_SET_FEATURES), + REQ(VHOST_USER_SET_OWNER), + REQ(VHOST_USER_RESET_OWNER), + REQ(VHOST_USER_SET_MEM_TABLE), + REQ(VHOST_USER_SET_LOG_BASE), + REQ(VHOST_USER_SET_LOG_FD), + REQ(VHOST_USER_SET_VRING_NUM), + REQ(VHOST_USER_SET_VRING_ADDR), + REQ(VHOST_USER_SET_VRING_BASE), + REQ(VHOST_USER_GET_VRING_BASE), + REQ(VHOST_USER_SET_VRING_KICK), + REQ(VHOST_USER_SET_VRING_CALL), + REQ(VHOST_USER_SET_VRING_ERR), + REQ(VHOST_USER_GET_PROTOCOL_FEATURES), + REQ(VHOST_USER_SET_PROTOCOL_FEATURES), + REQ(VHOST_USER_GET_QUEUE_NUM), + REQ(VHOST_USER_SET_VRING_ENABLE), + REQ(VHOST_USER_SEND_RARP), + REQ(VHOST_USER_NET_SET_MTU), + REQ(VHOST_USER_SET_BACKEND_REQ_FD), + REQ(VHOST_USER_IOTLB_MSG), + REQ(VHOST_USER_SET_VRING_ENDIAN), + REQ(VHOST_USER_GET_CONFIG), + REQ(VHOST_USER_SET_CONFIG), + REQ(VHOST_USER_POSTCOPY_ADVISE), + REQ(VHOST_USER_POSTCOPY_LISTEN), + REQ(VHOST_USER_POSTCOPY_END), + REQ(VHOST_USER_GET_INFLIGHT_FD), + REQ(VHOST_USER_SET_INFLIGHT_FD), + REQ(VHOST_USER_GPU_SET_SOCKET), + REQ(VHOST_USER_VRING_KICK), + REQ(VHOST_USER_GET_MAX_MEM_SLOTS), + REQ(VHOST_USER_ADD_MEM_REG), + REQ(VHOST_USER_REM_MEM_REG), + REQ(VHOST_USER_MAX), + }; +#undef REQ + return vu_request_str[req]; + } + + return "unknown"; +} + +/* Translate qemu virtual address to our virtual address. */Now that this code is not in qemu, it's not very clear what either of these "virtual addresses" is.+static void *qva_to_va(VuDev *dev, uint64_t qemu_addr) +{ + unsigned int i; + + /* Find matching memory region. 
*/ + for (i = 0; i < dev->nregions; i++) { + const VuDevRegion *r = &dev->regions[i]; + + if ((qemu_addr >= r->qva) && (qemu_addr < (r->qva + r->size))) { + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + return (void *)(qemu_addr - r->qva + r->mmap_addr + + r->mmap_offset); + } + } + + return NULL; +} + +static void +vmsg_close_fds(const VhostUserMsg *vmsg) +{ + int i; + + for (i = 0; i < vmsg->fd_num; i++) + close(vmsg->fds[i]); +} + +static void vu_remove_watch(VuDev *vdev, int fd) +{ + const struct ctx *c = (struct ctx *) ((char *)vdev - + offsetof(struct ctx, vdev)); + + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, fd, NULL); +} + +/* Set reply payload.u64 and clear request flags and fd_num */ +static void vmsg_set_reply_u64(struct VhostUserMsg *vmsg, uint64_t val) +{ + vmsg->hdr.flags = 0; /* defaults will be set by vu_send_reply() */ + vmsg->hdr.size = sizeof(vmsg->payload.u64); + vmsg->payload.u64 = val; + vmsg->fd_num = 0; +} + +static ssize_t vu_message_read_default(VuDev *dev, int conn_fd, struct VhostUserMsg *vmsg) +{ + char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * + sizeof(int))] = { 0 }; + struct iovec iov = { + .iov_base = (char *)vmsg, + .iov_len = VHOST_USER_HDR_SIZE, + }; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = control, + .msg_controllen = sizeof(control), + }; + size_t fd_size; + struct cmsghdr *cmsg; + ssize_t ret, sz_payload; + + ret = recvmsg(conn_fd, &msg, MSG_DONTWAIT); + if (ret < 0) { + if (errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK) + return 0; + vu_panic(dev, "Error while recvmsg: %s", strerror(errno)); + goto out; + } + + vmsg->fd_num = 0; + for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL; + cmsg = CMSG_NXTHDR(&msg, cmsg)) { + if (cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_RIGHTS) { + fd_size = cmsg->cmsg_len - CMSG_LEN(0); + vmsg->fd_num = fd_size / sizeof(int); + memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size); + break; + } + } + + sz_payload = vmsg->hdr.size; + if ((size_t)sz_payload > sizeof(vmsg->payload)) { + vu_panic(dev, + "Error: too big message request: %d, size: vmsg->size: %zd, " + "while sizeof(vmsg->payload) = %zu", + vmsg->hdr.request, sz_payload, sizeof(vmsg->payload)); + goto out; + } + + if (sz_payload) { + do { + ret = recv(conn_fd, &vmsg->payload, sz_payload, 0); + } while (ret < 0 && (errno == EINTR || errno == EAGAIN)); + + if (ret < sz_payload) { + vu_panic(dev, "Error while reading: %s", strerror(errno)); + goto out; + } + } + + return 1; +out: + vmsg_close_fds(vmsg); + + return -ECONNRESET; +} + +static int vu_message_write(VuDev *dev, int conn_fd, struct VhostUserMsg *vmsg) +{ + int rc; + const uint8_t *p = (uint8_t *)vmsg; + char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS * sizeof(int))] = { 0 }; + struct iovec iov = { + .iov_base = (char *)vmsg, + .iov_len = VHOST_USER_HDR_SIZE, + }; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = control, + }; + + memset(control, 0, sizeof(control)); + assert(vmsg->fd_num <= VHOST_MEMORY_BASELINE_NREGIONS); + if (vmsg->fd_num > 0) { + size_t fdsize = vmsg->fd_num * sizeof(int); + struct cmsghdr *cmsg; + + msg.msg_controllen = CMSG_SPACE(fdsize); + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_len = CMSG_LEN(fdsize); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + memcpy(CMSG_DATA(cmsg), vmsg->fds, fdsize); + } else { + msg.msg_controllen = 0; + } + + do { + rc = sendmsg(conn_fd, &msg, 0); + } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); + + if (vmsg->hdr.size) { + do { + 
if (vmsg->data) + rc = write(conn_fd, vmsg->data, vmsg->hdr.size); + else + rc = write(conn_fd, p + VHOST_USER_HDR_SIZE, vmsg->hdr.size); + } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); + } + + if (rc <= 0) { + vu_panic(dev, "Error while writing: %s", strerror(errno)); + return false; + } + + return true; +} + +static int vu_send_reply(VuDev *dev, int conn_fd, struct VhostUserMsg *msg) +{ + msg->hdr.flags &= ~VHOST_USER_VERSION_MASK; + msg->hdr.flags |= VHOST_USER_VERSION; + msg->hdr.flags |= VHOST_USER_REPLY_MASK; + + return vu_message_write(dev, conn_fd, msg); +} + +static bool vu_get_features_exec(struct VhostUserMsg *msg) +{ + uint64_t features = + 1ULL << VIRTIO_F_VERSION_1 | + 1ULL << VIRTIO_NET_F_MRG_RXBUF | + 1ULL << VHOST_USER_F_PROTOCOL_FEATURES; + + vmsg_set_reply_u64(msg, features); + + debug("Sending back to guest u64: 0x%016"PRIx64, msg->payload.u64); + + return true; +} + +static void +vu_set_enable_all_rings(VuDev *vdev, bool enabled) +{ + uint16_t i; + + for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) + vdev->vq[i].enable = enabled; +} + +static bool +vu_set_features_exec(VuDev *vdev, struct VhostUserMsg *msg) +{ + debug("u64: 0x%016"PRIx64, msg->payload.u64); + + vdev->features = msg->payload.u64; + if (!vu_has_feature(vdev, VIRTIO_F_VERSION_1)) { + /* + * We only support devices conforming to VIRTIO 1.0 or + * later + */ + vu_panic(vdev, "virtio legacy devices aren't supported by passt"); + return false; + } + + if (!vu_has_feature(vdev, VHOST_USER_F_PROTOCOL_FEATURES)) + vu_set_enable_all_rings(vdev, true); + + /* virtio-net features */ + + if (vu_has_feature(vdev, VIRTIO_F_VERSION_1) || + vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) { + vdev->hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf); + } else { + vdev->hdrlen = sizeof(struct virtio_net_hdr); + } + + return false; +} + +static bool +vu_set_owner_exec(void) +{ + return false; +} + +static bool map_ring(VuDev *vdev, VuVirtq *vq) +{ + vq->vring.desc = qva_to_va(vdev, vq->vra.desc_user_addr); + vq->vring.used = qva_to_va(vdev, vq->vra.used_user_addr); + vq->vring.avail = qva_to_va(vdev, vq->vra.avail_user_addr); + + debug("Setting virtq addresses:"); + debug(" vring_desc at %p", (void *)vq->vring.desc); + debug(" vring_used at %p", (void *)vq->vring.used); + debug(" vring_avail at %p", (void *)vq->vring.avail); + + return !(vq->vring.desc && vq->vring.used && vq->vring.avail); +} + +/* cppcheck-suppress unusedFunction */ +int vu_packet_check_range(void *buf, size_t offset, size_t len, const char *start, + const char *func, int line) +{ + VuDevRegion *dev_region; + + for (dev_region = buf; dev_region->mmap_addr; dev_region++) { + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + char *m = (char *)dev_region->mmap_addr; + + if (m <= start && + start + offset + len < m + dev_region->mmap_offset + + dev_region->size) + return 0; + } + if (func) + trace("cannot find region, %s:%i", func, line); + + return -1; +} + +/* + * #syscalls:passt mmap munmap + */ + +static bool vu_set_mem_table_exec(VuDev *vdev, + struct VhostUserMsg *msg) +{ + unsigned int i; + struct VhostUserMemory m = msg->payload.memory, *memory = &m; + + for (i = 0; i < vdev->nregions; i++) { + VuDevRegion *r = &vdev->regions[i]; + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + void *mm = (void *)r->mmap_addr; + + if (mm) + munmap(mm, r->size + r->mmap_offset); + } + vdev->nregions = memory->nregions; + + debug("Nregions: %u", memory->nregions); + for (i = 0; i < vdev->nregions; i++) { + void *mmap_addr; + VhostUserMemory_region *msg_region = 
&memory->regions[i]; + VuDevRegion *dev_region = &vdev->regions[i]; + + debug("Region %d", i); + debug(" guest_phys_addr: 0x%016"PRIx64, + msg_region->guest_phys_addr); + debug(" memory_size: 0x%016"PRIx64, + msg_region->memory_size); + debug(" userspace_addr 0x%016"PRIx64, + msg_region->userspace_addr); + debug(" mmap_offset 0x%016"PRIx64, + msg_region->mmap_offset); + + dev_region->gpa = msg_region->guest_phys_addr; + dev_region->size = msg_region->memory_size; + dev_region->qva = msg_region->userspace_addr; + dev_region->mmap_offset = msg_region->mmap_offset; + + /* We don't use offset argument of mmap() since the + * mapped address has to be page aligned, and we use huge + * pages. + */ + mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_NORESERVE, + msg->fds[i], 0); + + if (mmap_addr == MAP_FAILED) { + vu_panic(vdev, "region mmap error: %s", strerror(errno)); + } else { + dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; + debug(" mmap_addr: 0x%016"PRIx64, + dev_region->mmap_addr); + } + + close(msg->fds[i]); + } + + for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) { + if (vdev->vq[i].vring.desc) { + if (map_ring(vdev, &vdev->vq[i])) + vu_panic(vdev, "remapping queue %d during setmemtable", i); + } + } + + /* XXX */ + ASSERT(vdev->nregions < VHOST_USER_MAX_RAM_SLOTS - 1); + vdev->regions[vdev->nregions].mmap_addr = 0; /* mark EOF for vu_packet_check_range() */ + + tap_sock_update_buf(vdev->regions, 0); + + return false; +} + +static bool vu_set_vring_num_exec(VuDev *vdev, + struct VhostUserMsg *msg) +{ + unsigned int idx = msg->payload.state.index; + unsigned int num = msg->payload.state.num; + + debug("State.index: %u", idx); + debug("State.num: %u", num); + vdev->vq[idx].vring.num = num; + + return false; +} + +static bool vu_set_vring_addr_exec(VuDev *vdev, + struct VhostUserMsg *msg) +{ + struct vhost_vring_addr addr = msg->payload.addr, *vra = &addr; + VuVirtq *vq = &vdev->vq[vra->index]; + + debug("vhost_vring_addr:"); + debug(" index: %d", vra->index); + debug(" flags: %d", vra->flags); + debug(" desc_user_addr: 0x%016" PRIx64, (uint64_t)vra->desc_user_addr); + debug(" used_user_addr: 0x%016" PRIx64, (uint64_t)vra->used_user_addr); + debug(" avail_user_addr: 0x%016" PRIx64, (uint64_t)vra->avail_user_addr); + debug(" log_guest_addr: 0x%016" PRIx64, (uint64_t)vra->log_guest_addr); + + vq->vra = *vra; + vq->vring.flags = vra->flags; + vq->vring.log_guest_addr = vra->log_guest_addr; + + if (map_ring(vdev, vq)) { + vu_panic(vdev, "Invalid vring_addr message"); + return false; + } + + vq->used_idx = le16toh(vq->vring.used->idx); + + if (vq->last_avail_idx != vq->used_idx) { + debug("Last avail index != used index: %u != %u", + vq->last_avail_idx, vq->used_idx); + } + + return false; +} + +static bool vu_set_vring_base_exec(VuDev *vdev, + struct VhostUserMsg *msg) +{ + unsigned int idx = msg->payload.state.index; + unsigned int num = msg->payload.state.num; + + debug("State.index: %u", idx); + debug("State.num: %u", num); + vdev->vq[idx].shadow_avail_idx = vdev->vq[idx].last_avail_idx = num; + + return false; +} + +static bool vu_get_vring_base_exec(VuDev *vdev, + struct VhostUserMsg *msg) +{ + unsigned int idx = msg->payload.state.index; + + debug("State.index: %u", idx); + msg->payload.state.num = vdev->vq[idx].last_avail_idx; + msg->hdr.size = sizeof(msg->payload.state); + + vdev->vq[idx].started = false; + + if (vdev->vq[idx].call_fd != -1) { + close(vdev->vq[idx].call_fd); + vdev->vq[idx].call_fd = -1; + } + if 
(vdev->vq[idx].kick_fd != -1) { + vu_remove_watch(vdev, vdev->vq[idx].kick_fd); + close(vdev->vq[idx].kick_fd); + vdev->vq[idx].kick_fd = -1; + } + + return true; +} + +static void vu_set_watch(VuDev *vdev, int fd) +{ + const struct ctx *c = (struct ctx *) + ((char *)vdev - offsetof(struct ctx, vdev)); + union epoll_ref ref = { .type = EPOLL_TYPE_VHOST_KICK, .fd = fd }; + struct epoll_event ev = { 0 }; + + ev.data.u64 = ref.u64; + ev.events = EPOLLIN; + epoll_ctl(c->epollfd, EPOLL_CTL_ADD, fd, &ev); +} + +static int vu_wait_queue(const VuVirtq *vq) +{ + eventfd_t kick_data; + ssize_t rc; + int status; + + /* wait the kernel to put new entries in the queue */ + + status = fcntl(vq->kick_fd, F_GETFL); + if (status == -1) + return -1; + + fcntl(vq->kick_fd, F_SETFL, status & ~O_NONBLOCK); + rc = eventfd_read(vq->kick_fd, &kick_data); + fcntl(vq->kick_fd, F_SETFL, status); + if (rc == -1) + return -1; + + return 0; +} + +/* cppcheck-suppress unusedFunction */ +int vu_send(const struct ctx *c, const void *buf, size_t size) +{ + VuDev *vdev = (VuDev *)&c->vdev; + size_t hdrlen = vdev->hdrlen; + VuVirtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; + VuVirtqElement elem[VIRTQUEUE_MAX_SIZE]; + struct iovec in_sg[VIRTQUEUE_MAX_SIZE]; + size_t lens[VIRTQUEUE_MAX_SIZE]; + size_t offset; + int i, j; + __virtio16 *num_buffers_ptr; + int in_sg_count; + + debug("vu_send size %zu hdrlen %zu", size, hdrlen); + + if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) { + err("Got packet, but no available descriptors on RX virtq."); + return 0; + } + + offset = 0; + i = 0; + num_buffers_ptr = NULL; + in_sg_count = 0; + while (offset < size) { + size_t len; + int total; + int ret; + + total = 0; + + if (i == ARRAY_SIZE(elem) || + in_sg_count == ARRAY_SIZE(in_sg)) { + err("virtio-net unexpected long buffer chain"); + goto err; + } + + elem[i].out_num = 0; + elem[i].out_sg = NULL; + elem[i].in_num = ARRAY_SIZE(in_sg) - in_sg_count; + elem[i].in_sg = &in_sg[in_sg_count]; + + ret = vu_queue_pop(vdev, vq, &elem[i]); + if (ret < 0) { + if (!vdev->broken) { + if (vu_wait_queue(vq) != -1) + continue; + } + if (i) { + err("virtio-net unexpected empty queue: " + "i %d mergeable %d offset %zd, size %zd, " + "features 0x%" PRIx64, + i, vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF), + offset, size, vdev->features); + } + offset = -1; + goto err; + } + in_sg_count += elem[i].in_num; + + if (elem[i].in_num < 1) { + err("virtio-net receive queue contains no in buffers"); + vu_queue_detach_element(vdev, vq, elem[i].index, 0); + offset = -1; + goto err; + } + + if (i == 0) { + struct virtio_net_hdr hdr = { + .flags = VIRTIO_NET_HDR_F_DATA_VALID, + .gso_type = VIRTIO_NET_HDR_GSO_NONE, + }; + + ASSERT(offset == 0); + ASSERT(elem[i].in_sg[0].iov_len >= hdrlen); + + len = iov_from_buf(elem[i].in_sg, elem[i].in_num, 0, + &hdr, sizeof(hdr)); + + num_buffers_ptr = (__virtio16 *)((char *)elem[i].in_sg[0].iov_base + + len); + + total += hdrlen; + } + + len = iov_from_buf(elem[i].in_sg, elem[i].in_num, total, (char *)buf + offset, + size - offset); + + total += len; + offset += len; + + /* If buffers can't be merged, at this point we + * must have consumed the complete packet. + * Otherwise, drop it. 
+ */ + if (!vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF) && offset < size) { + vu_queue_unpop(vdev, vq, elem[i].index, total); + goto err; + } + + lens[i] = total; + i++; + } + + if (num_buffers_ptr && vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) + *num_buffers_ptr = htole16(i); + + for (j = 0; j < i; j++) { + debug("filling total %zd idx %d", lens[j], j); + vu_queue_fill(vdev, vq, &elem[j], lens[j], j); + } + + vu_queue_flush(vdev, vq, i); + vu_queue_notify(vdev, vq); + + debug("sent %zu", offset); + + return offset; +err: + for (j = 0; j < i; j++) + vu_queue_detach_element(vdev, vq, elem[j].index, lens[j]); + + return offset; +} + +static void vu_handle_tx(VuDev *vdev, int index) +{ + struct ctx *c = (struct ctx *) ((char *)vdev - offsetof(struct ctx, vdev)); + VuVirtq *vq = &vdev->vq[index]; + int hdrlen = vdev->hdrlen; + struct timespec now; + VuVirtqElement elem[VIRTQUEUE_MAX_SIZE]; + struct iovec out_sg[VIRTQUEUE_MAX_SIZE]; + int out_sg_count; + + int count; + + if (index % 2 != VHOST_USER_TX_QUEUE) { + debug("index %d is not an TX queue", index); + return; + } + + clock_gettime(CLOCK_MONOTONIC, &now); + + tap_flush_pools(); + + count = 0; + out_sg_count = 0; + while (1) { + int ret; + + ASSERT(index == VHOST_USER_TX_QUEUE); + + elem[count].out_num = 1; + elem[count].out_sg = &out_sg[out_sg_count]; + elem[count].in_num = 0; + elem[count].in_sg = NULL; + ret = vu_queue_pop(vdev, vq, &elem[count]); + if (ret < 0) + break; + out_sg_count += elem[count].out_num; + + if (elem[count].out_num < 1) { + debug("virtio-net header not in first element"); + break; + } + ASSERT(elem[count].out_num == 1); + + tap_add_packet(c, elem[count].out_sg[0].iov_len - hdrlen, + (char *)elem[count].out_sg[0].iov_base + hdrlen); + count++; + } + tap_handler(c, &now); + + if (count) { + int i; + + for (i = 0; i < count; i++) + vu_queue_fill(vdev, vq, &elem[i], 0, i); + vu_queue_flush(vdev, vq, count); + vu_queue_notify(vdev, vq); + } +} + +/* cppcheck-suppress unusedFunction */ +void vu_kick_cb(struct ctx *c, union epoll_ref ref) +{ + VuDev *vdev = &c->vdev; + eventfd_t kick_data; + ssize_t rc; + int idx; + + for (idx = 0; idx < VHOST_USER_MAX_QUEUES; idx++) + if (c->vdev.vq[idx].kick_fd == ref.fd) + break; + + if (idx == VHOST_USER_MAX_QUEUES) + return; + + rc = eventfd_read(ref.fd, &kick_data); + if (rc == -1) { + vu_panic(vdev, "kick eventfd_read(): %s", strerror(errno)); + vu_remove_watch(vdev, ref.fd); + } else { + debug("Got kick_data: %016"PRIx64" idx:%d", + kick_data, idx); + if (idx % 2 == VHOST_USER_TX_QUEUE) + vu_handle_tx(vdev, idx); + } +} + +static bool vu_check_queue_msg_file(VuDev *vdev, struct VhostUserMsg *msg) +{ + int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; + + if (idx >= VHOST_USER_MAX_QUEUES) { + vmsg_close_fds(msg); + vu_panic(vdev, "Invalid queue index: %u", idx); + return false; + } + + if (nofd) { + vmsg_close_fds(msg); + return true; + } + + if (msg->fd_num != 1) { + vmsg_close_fds(msg); + vu_panic(vdev, "Invalid fds in request: %d", msg->hdr.request); + return false; + } + + return true; +} + +static bool vu_set_vring_kick_exec(VuDev *vdev, + struct VhostUserMsg *msg) +{ + int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; + + debug("u64: 0x%016"PRIx64, msg->payload.u64); + + if (!vu_check_queue_msg_file(vdev, msg)) + return false; + + if (vdev->vq[idx].kick_fd != -1) { + vu_remove_watch(vdev, vdev->vq[idx].kick_fd); + 
close(vdev->vq[idx].kick_fd); + vdev->vq[idx].kick_fd = -1; + } + + /* cppcheck-suppress redundantAssignment */ + vdev->vq[idx].kick_fd = nofd ? -1 : msg->fds[0]; + debug("Got kick_fd: %d for vq: %d", vdev->vq[idx].kick_fd, idx); + + vdev->vq[idx].started = true; + + if (vdev->vq[idx].kick_fd != -1 && idx % 2 == VHOST_USER_TX_QUEUE) { + vu_set_watch(vdev, vdev->vq[idx].kick_fd); + debug("Waiting for kicks on fd: %d for vq: %d", + vdev->vq[idx].kick_fd, idx); + } + + return false; +} + +static bool vu_set_vring_call_exec(VuDev *vdev, + struct VhostUserMsg *msg) +{ + int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; + + debug("u64: 0x%016"PRIx64, msg->payload.u64); + + if (!vu_check_queue_msg_file(vdev, msg)) + return false; + + if (vdev->vq[idx].call_fd != -1) { + close(vdev->vq[idx].call_fd); + vdev->vq[idx].call_fd = -1; + } + + /* cppcheck-suppress redundantAssignment */ + vdev->vq[idx].call_fd = nofd ? -1 : msg->fds[0]; + + /* in case of I/O hang after reconnecting */ + if (vdev->vq[idx].call_fd != -1) + eventfd_write(msg->fds[0], 1); + + debug("Got call_fd: %d for vq: %d", vdev->vq[idx].call_fd, idx); + + return false; +} + +static bool vu_set_vring_err_exec(VuDev *vdev, + struct VhostUserMsg *msg) +{ + int idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + bool nofd = msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; + + debug("u64: 0x%016"PRIx64, msg->payload.u64); + + if (!vu_check_queue_msg_file(vdev, msg)) + return false; + + if (vdev->vq[idx].err_fd != -1) { + close(vdev->vq[idx].err_fd); + vdev->vq[idx].err_fd = -1; + } + + /* cppcheck-suppress redundantAssignment */ + vdev->vq[idx].err_fd = nofd ? -1 : msg->fds[0]; + + return false; +} + +static bool vu_get_protocol_features_exec(struct VhostUserMsg *msg) +{ + uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK; + + vmsg_set_reply_u64(msg, features); + + return true; +} + +static bool vu_set_protocol_features_exec(VuDev *vdev, struct VhostUserMsg *msg) +{ + uint64_t features = msg->payload.u64; + + debug("u64: 0x%016"PRIx64, features); + + vdev->protocol_features = msg->payload.u64; + + if (vu_has_protocol_feature(vdev, + VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS) && + (!vu_has_protocol_feature(vdev, VHOST_USER_PROTOCOL_F_BACKEND_REQ) || + !vu_has_protocol_feature(vdev, VHOST_USER_PROTOCOL_F_REPLY_ACK))) { + /* + * The use case for using messages for kick/call is simulation, to make + * the kick and call synchronous. To actually get that behaviour, both + * of the other features are required. + * Theoretically, one could use only kick messages, or do them without + * having F_REPLY_ACK, but too many (possibly pending) messages on the + * socket will eventually cause the master to hang, to avoid this in + * scenarios where not desired enforce that the settings are in a way + * that actually enables the simulation case. 
+ */ + vu_panic(vdev, + "F_IN_BAND_NOTIFICATIONS requires F_BACKEND_REQ && F_REPLY_ACK"); + return false; + } + + return false; +} + + +static bool vu_get_queue_num_exec(struct VhostUserMsg *msg) +{ + vmsg_set_reply_u64(msg, VHOST_USER_MAX_QUEUES); + return true; +} + +static bool vu_set_vring_enable_exec(VuDev *vdev, struct VhostUserMsg *msg) +{ + unsigned int idx = msg->payload.state.index; + unsigned int enable = msg->payload.state.num; + + debug("State.index: %u", idx); + debug("State.enable: %u", enable); + + if (idx >= VHOST_USER_MAX_QUEUES) { + vu_panic(vdev, "Invalid vring_enable index: %u", idx); + return false; + } + + vdev->vq[idx].enable = enable; + return false; +} + +/* cppcheck-suppress unusedFunction */ +void vu_init(struct ctx *c) +{ + int i; + + c->vdev.hdrlen = 0; + for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) + c->vdev.vq[i] = (VuVirtq){ + .call_fd = -1, + .kick_fd = -1, + .err_fd = -1, + .notification = true, + }; +} + +static void vu_cleanup(VuDev *vdev) +{ + unsigned int i; + + for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) { + VuVirtq *vq = &vdev->vq[i]; + + vq->started = false; + vq->notification = true; + + if (vq->call_fd != -1) { + close(vq->call_fd); + vq->call_fd = -1; + } + if (vq->err_fd != -1) { + close(vq->err_fd); + vq->err_fd = -1; + } + if (vq->kick_fd != -1) { + vu_remove_watch(vdev, vq->kick_fd); + close(vq->kick_fd); + vq->kick_fd = -1; + } + + vq->vring.desc = 0; + vq->vring.used = 0; + vq->vring.avail = 0; + } + vdev->hdrlen = 0; + + for (i = 0; i < vdev->nregions; i++) { + const VuDevRegion *r = &vdev->regions[i]; + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + void *m = (void *)r->mmap_addr; + + if (m) + munmap(m, r->size + r->mmap_offset); + } + vdev->nregions = 0; +} + +/** + * tap_handler_vu() - Packet handler for vhost-user + * @c: Execution context + * @events: epoll events + */ +/* cppcheck-suppress unusedFunction */ +void tap_handler_vu(struct ctx *c, uint32_t events) +{ + VuDev *dev = &c->vdev; + struct VhostUserMsg msg = { 0 }; + bool need_reply, reply_requested; + int ret; + + if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR)) { + tap_sock_reset(c); + return; + } + + + ret = vu_message_read_default(dev, c->fd_tap, &msg); + if (ret <= 0) { + if (errno != EINTR && errno != EAGAIN && errno != EWOULDBLOCK) + tap_sock_reset(c); + return; + } + debug("================ Vhost user message ================"); + debug("Request: %s (%d)", vu_request_to_string(msg.hdr.request), + msg.hdr.request); + debug("Flags: 0x%x", msg.hdr.flags); + debug("Size: %u", msg.hdr.size); + + need_reply = msg.hdr.flags & VHOST_USER_NEED_REPLY_MASK; + switch (msg.hdr.request) { + case VHOST_USER_GET_FEATURES: + reply_requested = vu_get_features_exec(&msg); + break; + case VHOST_USER_SET_FEATURES: + reply_requested = vu_set_features_exec(dev, &msg); + break; + case VHOST_USER_GET_PROTOCOL_FEATURES: + reply_requested = vu_get_protocol_features_exec(&msg); + break; + case VHOST_USER_SET_PROTOCOL_FEATURES: + reply_requested = vu_set_protocol_features_exec(dev, &msg); + break; + case VHOST_USER_GET_QUEUE_NUM: + reply_requested = vu_get_queue_num_exec(&msg); + break; + case VHOST_USER_SET_OWNER: + reply_requested = vu_set_owner_exec(); + break; + case VHOST_USER_SET_MEM_TABLE: + reply_requested = vu_set_mem_table_exec(dev, &msg); + break; + case VHOST_USER_SET_VRING_NUM: + reply_requested = vu_set_vring_num_exec(dev, &msg); + break; + case VHOST_USER_SET_VRING_ADDR: + reply_requested = vu_set_vring_addr_exec(dev, &msg); + break; + case VHOST_USER_SET_VRING_BASE: + 
reply_requested = vu_set_vring_base_exec(dev, &msg); + break; + case VHOST_USER_GET_VRING_BASE: + reply_requested = vu_get_vring_base_exec(dev, &msg); + break; + case VHOST_USER_SET_VRING_KICK: + reply_requested = vu_set_vring_kick_exec(dev, &msg); + break; + case VHOST_USER_SET_VRING_CALL: + reply_requested = vu_set_vring_call_exec(dev, &msg); + break; + case VHOST_USER_SET_VRING_ERR: + reply_requested = vu_set_vring_err_exec(dev, &msg); + break; + case VHOST_USER_SET_VRING_ENABLE: + reply_requested = vu_set_vring_enable_exec(dev, &msg); + break; + case VHOST_USER_NONE: + vu_cleanup(dev); + return; + default: + vu_panic(dev, "Unhandled request: %d", msg.hdr.request); + return; + } + + if (!reply_requested && need_reply) { + msg.payload.u64 = 0; + msg.hdr.flags = 0; + msg.hdr.size = sizeof(msg.payload.u64); + msg.fd_num = 0; + reply_requested = true; + } + + if (reply_requested) + vu_send_reply(dev, c->fd_tap, &msg); + free(msg.data); +} diff --git a/vhost_user.h b/vhost_user.h new file mode 100644 index 000000000000..c6edc49a3bb9 --- /dev/null +++ b/vhost_user.h @@ -0,0 +1,137 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* some parts from subprojects/libvhost-user/libvhost-user.h */ + +#ifndef VHOST_USER_H +#define VHOST_USER_H + +#include "virtio.h" +#include "iov.h" + +#define VHOST_USER_F_PROTOCOL_FEATURES 30 + +#define VHOST_MEMORY_BASELINE_NREGIONS 8 + +enum vhost_user_protocol_feature { + VHOST_USER_PROTOCOL_F_MQ = 0, + VHOST_USER_PROTOCOL_F_LOG_SHMFD = 1, + VHOST_USER_PROTOCOL_F_RARP = 2, + VHOST_USER_PROTOCOL_F_REPLY_ACK = 3, + VHOST_USER_PROTOCOL_F_NET_MTU = 4, + VHOST_USER_PROTOCOL_F_BACKEND_REQ = 5, + VHOST_USER_PROTOCOL_F_CROSS_ENDIAN = 6, + VHOST_USER_PROTOCOL_F_CRYPTO_SESSION = 7, + VHOST_USER_PROTOCOL_F_PAGEFAULT = 8, + VHOST_USER_PROTOCOL_F_CONFIG = 9, + VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10, + VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11, + VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12, + VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS = 14, + VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS = 15, + + VHOST_USER_PROTOCOL_F_MAX +}; + +enum vhost_user_request { + VHOST_USER_NONE = 0, + VHOST_USER_GET_FEATURES = 1, + VHOST_USER_SET_FEATURES = 2, + VHOST_USER_SET_OWNER = 3, + VHOST_USER_RESET_OWNER = 4, + VHOST_USER_SET_MEM_TABLE = 5, + VHOST_USER_SET_LOG_BASE = 6, + VHOST_USER_SET_LOG_FD = 7, + VHOST_USER_SET_VRING_NUM = 8, + VHOST_USER_SET_VRING_ADDR = 9, + VHOST_USER_SET_VRING_BASE = 10, + VHOST_USER_GET_VRING_BASE = 11, + VHOST_USER_SET_VRING_KICK = 12, + VHOST_USER_SET_VRING_CALL = 13, + VHOST_USER_SET_VRING_ERR = 14, + VHOST_USER_GET_PROTOCOL_FEATURES = 15, + VHOST_USER_SET_PROTOCOL_FEATURES = 16, + VHOST_USER_GET_QUEUE_NUM = 17, + VHOST_USER_SET_VRING_ENABLE = 18, + VHOST_USER_SEND_RARP = 19, + VHOST_USER_NET_SET_MTU = 20, + VHOST_USER_SET_BACKEND_REQ_FD = 21, + VHOST_USER_IOTLB_MSG = 22, + VHOST_USER_SET_VRING_ENDIAN = 23, + VHOST_USER_GET_CONFIG = 24, + VHOST_USER_SET_CONFIG = 25, + VHOST_USER_CREATE_CRYPTO_SESSION = 26, + VHOST_USER_CLOSE_CRYPTO_SESSION = 27, + VHOST_USER_POSTCOPY_ADVISE = 28, + VHOST_USER_POSTCOPY_LISTEN = 29, + VHOST_USER_POSTCOPY_END = 30, + VHOST_USER_GET_INFLIGHT_FD = 31, + VHOST_USER_SET_INFLIGHT_FD = 32, + VHOST_USER_GPU_SET_SOCKET = 33, + VHOST_USER_VRING_KICK = 35, + VHOST_USER_GET_MAX_MEM_SLOTS = 36, + VHOST_USER_ADD_MEM_REG = 37, + VHOST_USER_REM_MEM_REG = 38, + VHOST_USER_MAX +}; + +typedef struct { + enum vhost_user_request request; + +#define VHOST_USER_VERSION_MASK 0x3 +#define VHOST_USER_REPLY_MASK (0x1 << 2) +#define 
VHOST_USER_NEED_REPLY_MASK (0x1 << 3) + uint32_t flags; + uint32_t size; /* the following payload size */ +} __attribute__ ((__packed__)) vhost_user_header; + +typedef struct VhostUserMemory_region { + uint64_t guest_phys_addr; + uint64_t memory_size; + uint64_t userspace_addr; + uint64_t mmap_offset; +} VhostUserMemory_region; + +struct VhostUserMemory { + uint32_t nregions; + uint32_t padding; + struct VhostUserMemory_region regions[VHOST_MEMORY_BASELINE_NREGIONS]; +}; + +typedef union { +#define VHOST_USER_VRING_IDX_MASK 0xff +#define VHOST_USER_VRING_NOFD_MASK (0x1 << 8) + uint64_t u64; + struct vhost_vring_state state; + struct vhost_vring_addr addr; + struct VhostUserMemory memory; +} vhost_user_payload; + +typedef struct VhostUserMsg { + vhost_user_header hdr; + vhost_user_payload payload; + + int fds[VHOST_MEMORY_BASELINE_NREGIONS]; + int fd_num; + uint8_t *data; +} __attribute__ ((__packed__)) VhostUserMsg; +#define VHOST_USER_HDR_SIZE sizeof(vhost_user_header) + +#define VHOST_USER_RX_QUEUE 0 +#define VHOST_USER_TX_QUEUE 1 + +static inline bool vu_queue_enabled(const VuVirtq *vq) +{ + return vq->enable; +} + +static inline bool vu_queue_started(const VuVirtq *vq) +{ + return vq->started; +} + +int vu_send(const struct ctx *c, const void *buf, size_t size); +void vu_print_capabilities(void); +void vu_init(struct ctx *c); +void vu_kick_cb(struct ctx *c, union epoll_ref ref); +void tap_handler_vu(struct ctx *c, uint32_t events); +#endif /* VHOST_USER_H */ diff --git a/virtio.c b/virtio.c index 50ec8b5119ed..5d58e56204b3 100644 --- a/virtio.c +++ b/virtio.c @@ -169,7 +169,6 @@ static bool vring_notify(const VuDev *dev, VuVirtq *vq) return !v || vring_need_event(vring_get_used_event(vq), new, old); } -/* cppcheck-suppress unusedFunction */ void vu_queue_notify(VuDev *dev, VuVirtq *vq) { if (dev->broken || !vq->vring.avail) @@ -313,7 +312,6 @@ vu_queue_map_desc(VuDev *dev, VuVirtq *vq, unsigned int idx, return 0; } -/* cppcheck-suppress unusedFunction */ int vu_queue_pop(VuDev *dev, VuVirtq *vq, VuVirtqElement *elem) { unsigned int head; @@ -363,7 +361,6 @@ void vu_queue_detach_element(VuDev *dev, VuVirtq *vq, /* unmap, when DMA support is added */ } -/* cppcheck-suppress unusedFunction */ void vu_queue_unpop(VuDev *dev, VuVirtq *vq, unsigned int index, size_t len) { vq->last_avail_idx--; diff --git a/virtio.h b/virtio.h index 61398bb432bc..f4a9f0c23804 100644 --- a/virtio.h +++ b/virtio.h @@ -100,7 +100,6 @@ static inline bool vu_has_feature(const VuDev *vdev, unsigned int fbit) return has_feature(vdev->features, fbit); } -/* cppcheck-suppress unusedFunction */ static inline bool vu_has_protocol_feature(const VuDev *vdev, unsigned int fbit) { return has_feature(vdev->protocol_features, fbit);-- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson
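
For reference, the queue-index convention the code above relies on (VHOST_USER_RX_QUEUE 0, VHOST_USER_TX_QUEUE 1, and the "idx % 2 == VHOST_USER_TX_QUEUE" checks in vu_kick_cb() and vu_set_vring_kick()) can be sketched as below. This is only an illustration of the even/odd pairing used for virtio-net queue pairs, not code from the series:

#include <stdbool.h>

#define VHOST_USER_RX_QUEUE	0
#define VHOST_USER_TX_QUEUE	1

/* Even indices are RX (host-to-guest) queues, odd indices are TX
 * (guest-to-host) queues, one pair per virtio-net queue pair.
 */
static inline bool vq_is_tx(unsigned int idx)
{
	return idx % 2 == VHOST_USER_TX_QUEUE;
}

static inline bool vq_is_rx(unsigned int idx)
{
	return idx % 2 == VHOST_USER_RX_QUEUE;
}
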
On 24/06/2024 05:02, David Gibson wrote:
> On Fri, Jun 21, 2024 at 04:56:38PM +0200, Laurent Vivier wrote:
> > Add vhost_user.c and vhost_user.h that define the functions needed to
> > implement vhost-user backend.
> >
> > Signed-off-by: Laurent Vivier <lvivier(a)redhat.com>
>
> I'm a little confused by this patch. The commit message seems to suggest
> that like the previous patch it's basically just a code import from qemu.
> However...

[snip]

> > diff --git a/passt.c b/passt.c
> > index a5e2c5a8e151..9d21c545b9cf 100644
> > --- a/passt.c
> > +++ b/passt.c
> > @@ -73,6 +73,8 @@ char *epoll_type_str[] = {
> >  	[EPOLL_TYPE_TAP_PASTA]		= "/dev/net/tun device",
> >  	[EPOLL_TYPE_TAP_PASST]		= "connected qemu socket",
> >  	[EPOLL_TYPE_TAP_LISTEN]		= "listening qemu socket",
> > +	[EPOLL_TYPE_VHOST_CMD]		= "vhost-user command socket",
> > +	[EPOLL_TYPE_VHOST_KICK]		= "vhost-user kick socket",
>
> ... we also have real changes to passt specific code. It's not very
> obvious to me what the boundaries of that are.

I have moved all of this to the last patch.

> > +/* Translate qemu virtual address to our virtual address. */
>
> Now that this code is not in qemu, it's not very clear what either of
> these "virtual addresses" is.

It's actually the QEMU virtual address (QEMU or any other vhost-user
client). It's also called the userspace address in the vhost data
structures, but I don't like that term because it doesn't say whether it
means our userspace or the userspace of the vhost-user client. Our own
userspace address is called the mmap address. We also have the guest
physical address, which is the address as seen from inside the guest.

The vring addresses are provided as QEMU userspace addresses (information
from the vhost-user level):
https://qemu-project.gitlab.io/qemu/interop/vhost-user.html#a-vring-address…

The descriptor ring addresses are provided from the guest address space, so
they are guest physical addresses (information from the virtio level):
https://docs.oasis-open.org/virtio/virtio/v1.1/cs01/virtio-v1.1-cs01.html#x…

> > +static void *qva_to_va(VuDev *dev, uint64_t qemu_addr)
> > +{
> > +	unsigned int i;
> > +
> > +	/* Find matching memory region. */
> > +	for (i = 0; i < dev->nregions; i++) {
> > +		const VuDevRegion *r = &dev->regions[i];
> > +
> > +		if ((qemu_addr >= r->qva) && (qemu_addr < (r->qva + r->size))) {
> > +			/* NOLINTNEXTLINE(performance-no-int-to-ptr) */
> > +			return (void *)(qemu_addr - r->qva + r->mmap_addr +
> > +					r->mmap_offset);
> > +		}
> > +	}
> > +
> > +	return NULL;
> > +}

This function translates the QEMU userspace address to our process
userspace address (mmapped memory). I'm updating all the function comment
headers to describe this kind of information.

Thanks,
Laurent
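
To make the three address spaces concrete: each memory region set up in vu_set_mem_table_exec() carries a guest physical address (gpa), a client/QEMU virtual address (qva) and our own mmap address. A minimal sketch of the guest-physical counterpart of qva_to_va(), assuming the same VuDevRegion layout; the function name here is illustrative, not necessarily what the series ends up using:

static void *vu_gpa_to_va(VuDev *dev, uint64_t guest_addr)
{
	unsigned int i;

	/* Find the memory region containing the guest physical address */
	for (i = 0; i < dev->nregions; i++) {
		const VuDevRegion *r = &dev->regions[i];

		if (guest_addr >= r->gpa && guest_addr < r->gpa + r->size) {
			/* Translate to our process (mmap) address space */
			return (void *)(uintptr_t)(guest_addr - r->gpa +
						   r->mmap_addr +
						   r->mmap_offset);
		}
	}

	return NULL;
}

In other words, the buffer addresses found inside the descriptors are guest physical and need this kind of translation, while the vring base addresses themselves arrive as client virtual addresses and go through qva_to_va(), per the two links above.
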
Add a function that count how many buffers from a given iovec list we need to contain a given number of bytes. It also provides how many bytes are used in the last buffer if it is not fully filled. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- iov.c | 35 +++++++++++++++++++++++++++++++++++ iov.h | 2 ++ 2 files changed, 37 insertions(+) diff --git a/iov.c b/iov.c index 3741db21790f..793788b5d2bc 100644 --- a/iov.c +++ b/iov.c @@ -155,3 +155,38 @@ size_t iov_size(const struct iovec *iov, size_t iov_cnt) return len; } + +/** + * iov_count - Calculate the number of I/O vectors and the size of + * the last one to store a given number of bytes. + * + * @iov: Pointer to the array of struct iovec describing the + * scatter/gather I/O vector. + * @iov_cnt: Number of elements in the iov array. + * @size: number of bytes we need to store in iovec + * @last_iov_length: output parameter, length used in the last iovec + * if return value is 0, this output parameter is + * undefined. + * + * Returns: The number of iovec needed to store @size bytes. + */ +/* cppcheck-suppress unusedFunction */ +size_t iov_count(const struct iovec *iov, size_t iov_cnt, + size_t size, size_t *last_iov_length) +{ + size_t n = 0; + + while (size && n < iov_cnt) { + if (size <= iov[n].iov_len) { + *last_iov_length = size; + return n + 1; + } + size -= iov[n].iov_len; + n++; + } + + if (n > 0) + *last_iov_length = iov[n - 1].iov_len; + + return n; +} diff --git a/iov.h b/iov.h index a9e1722713b3..0fa456d7051b 100644 --- a/iov.h +++ b/iov.h @@ -28,4 +28,6 @@ size_t iov_from_buf(const struct iovec *iov, size_t iov_cnt, size_t iov_to_buf(const struct iovec *iov, size_t iov_cnt, size_t offset, void *buf, size_t bytes); size_t iov_size(const struct iovec *iov, size_t iov_cnt); +size_t iov_count(const struct iovec *iov, size_t iov_cnt, + size_t size, size_t *last_iov_length); #endif /* IOVEC_H */ -- 2.45.2
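
A short usage sketch of the new helper, with hypothetical buffer sizes, purely for illustration (links against iov.c from this series):

#include <stdio.h>
#include <sys/uio.h>

/* prototype from iov.h in this series */
size_t iov_count(const struct iovec *iov, size_t iov_cnt,
		 size_t size, size_t *last_iov_length);

int main(void)
{
	char a[1000], b[1000], c[1000];
	struct iovec iov[] = {
		{ .iov_base = a, .iov_len = sizeof(a) },
		{ .iov_base = b, .iov_len = sizeof(b) },
		{ .iov_base = c, .iov_len = sizeof(c) },
	};
	size_t last_len, n;

	/* 2500 bytes span three 1000-byte buffers, the last one only
	 * half filled: n == 3, last_len == 500
	 */
	n = iov_count(iov, 3, 2500, &last_len);
	printf("%zu iovecs, %zu bytes used in the last one\n", n, last_len);

	return 0;
}
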
On Fri, Jun 21, 2024 at 04:56:39PM +0200, Laurent Vivier wrote:
> Add a function that count how many buffers from a given iovec list we
> need to contain a given number of bytes.
> It also provides how many bytes are used in the last buffer if it is
> not fully filled.

Isn't this equivalent to the iov_skip_bytes() function we already have?

> Signed-off-by: Laurent Vivier <lvivier(a)redhat.com>
> ---

[snip]

--
David Gibson (he or they)			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au		| minimalist, thank you, not the other way
					| around.
http://www.ozlabs.org/~dgibson
On 24/06/2024 05:03, David Gibson wrote:
> On Fri, Jun 21, 2024 at 04:56:39PM +0200, Laurent Vivier wrote:
> > Add a function that count how many buffers from a given iovec list we
> > need to contain a given number of bytes.
> > It also provides how many bytes are used in the last buffer if it is
> > not fully filled.
>
> Isn't this equivalent to the iov_skip_bytes() function we already have?

Yes, it's equivalent. I'll use it in next version.

Thanks,
Laurent
add virtio and vhost-user functions to connect with QEMU. $ ./passt --vhost-user and # qemu-system-x86_64 ... -m 4G \ -object memory-backend-memfd,id=memfd0,share=on,size=4G \ -numa node,memdev=memfd0 \ -chardev socket,id=chr0,path=/tmp/passt_1.socket \ -netdev vhost-user,id=netdev0,chardev=chr0 \ -device virtio-net,mac=9a:2b:2c:2d:2e:2f,netdev=netdev0 \ ... Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- Makefile | 4 +- checksum.c | 1 - conf.c | 18 +- iov.c | 1 - packet.c | 6 + packet.h | 2 + passt.c | 12 +- passt.h | 2 + pcap.c | 1 - tap.c | 87 ++++++-- tap.h | 3 +- tcp.c | 17 +- tcp_vu.c | 547 +++++++++++++++++++++++++++++++++++++++++++++++++ tcp_vu.h | 9 + udp.c | 54 +++-- udp_internal.h | 39 ++++ udp_vu.c | 237 +++++++++++++++++++++ udp_vu.h | 8 + vhost_user.c | 6 - virtio.c | 1 - 20 files changed, 988 insertions(+), 67 deletions(-) create mode 100644 tcp_vu.c create mode 100644 tcp_vu.h create mode 100644 udp_internal.h create mode 100644 udp_vu.c create mode 100644 udp_vu.h diff --git a/Makefile b/Makefile index b2da6ad62103..d22388726099 100644 --- a/Makefile +++ b/Makefile @@ -47,7 +47,7 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS) PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \ icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \ ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \ - tcp_buf.c tcp_splice.c udp.c util.c vhost_user.c virtio.c + tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_vu.c util.c vhost_user.c virtio.c QRAP_SRCS = qrap.c SRCS = $(PASST_SRCS) $(QRAP_SRCS) @@ -57,7 +57,7 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \ lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \ siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \ - udp.h util.h vhost_user.h virtio.h + tcp_vu.h udp.h udp_internal.h udp_vu.h util.h vhost_user.h virtio.h HEADERS = $(PASST_HEADERS) seccomp.h C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 }; diff --git a/checksum.c b/checksum.c index 006614fcbb28..aa5b7ae1cb66 100644 --- a/checksum.c +++ b/checksum.c @@ -501,7 +501,6 @@ uint16_t csum(const void *buf, size_t len, uint32_t init) * * Return: 16-bit folded, complemented checksum */ -/* cppcheck-suppress unusedFunction */ uint16_t csum_iov(const struct iovec *iov, size_t n, uint32_t init) { unsigned int i; diff --git a/conf.c b/conf.c index 94b3ed6fa659..2c9a6da05666 100644 --- a/conf.c +++ b/conf.c @@ -45,6 +45,7 @@ #include "lineread.h" #include "isolation.h" #include "log.h" +#include "vhost_user.h" /** * next_chunk - Return the next piece of a string delimited by a character @@ -751,6 +752,9 @@ static void usage(const char *name, FILE *f, int status) " -s, --socket PATH UNIX domain socket path\n" " default: probe free path starting from " UNIX_SOCK_PATH "\n", 1); + info( " --vhost-user Enable vhost-user mode"); + info( " UNIX domain socket is provided by -s option"); + info( " --print-capabilities print back-end capabilities in JSON format"); } fprintf(f, @@ -1175,6 +1179,7 @@ void conf(struct ctx *c, int argc, char **argv) {"help", no_argument, NULL, 'h' }, {"socket", required_argument, NULL, 's' }, {"fd", required_argument, NULL, 'F' }, + {"socket-path", required_argument, NULL, 's' }, /* vhost-user mandatory */ {"ns-ifname", required_argument, NULL, 'I' }, {"pcap", required_argument, NULL, 'p' }, {"pid", required_argument, NULL, 'P' }, @@ -1221,6 +1226,8 @@ 
void conf(struct ctx *c, int argc, char **argv) {"config-net", no_argument, NULL, 17 }, {"no-copy-routes", no_argument, NULL, 18 }, {"no-copy-addrs", no_argument, NULL, 19 }, + {"vhost-user", no_argument, NULL, 20 }, + {"print-capabilities", no_argument, NULL, 21 }, /* vhost-user mandatory */ { 0 }, }; char userns[PATH_MAX] = { 0 }, netns[PATH_MAX] = { 0 }; @@ -1373,7 +1380,6 @@ void conf(struct ctx *c, int argc, char **argv) sizeof(c->ip6.ifname_out), "%s", optarg); if (ret <= 0 || ret >= (int)sizeof(c->ip6.ifname_out)) die("Invalid interface name: %s", optarg); - break; case 17: if (c->mode != MODE_PASTA) @@ -1395,6 +1401,16 @@ void conf(struct ctx *c, int argc, char **argv) warn("--no-copy-addrs will be dropped soon"); c->no_copy_addrs = copy_addrs_opt = true; break; + case 20: + if (c->mode == MODE_PASTA) { + err("--vhost-user is for passt mode only"); + usage(argv[0], stdout, EXIT_SUCCESS); + } + c->mode = MODE_VU; + break; + case 21: + vu_print_capabilities(); + break; case 'd': if (c->debug) die("Multiple --debug options given"); diff --git a/iov.c b/iov.c index 793788b5d2bc..4215baf7c3b9 100644 --- a/iov.c +++ b/iov.c @@ -170,7 +170,6 @@ size_t iov_size(const struct iovec *iov, size_t iov_cnt) * * Returns: The number of iovec needed to store @size bytes. */ -/* cppcheck-suppress unusedFunction */ size_t iov_count(const struct iovec *iov, size_t iov_cnt, size_t size, size_t *last_iov_length) { diff --git a/packet.c b/packet.c index af2a539a1794..3c5fc39df6d7 100644 --- a/packet.c +++ b/packet.c @@ -25,6 +25,12 @@ static int packet_check_range(const struct pool *p, size_t offset, size_t len, const char *start, const char *func, int line) { + ASSERT(p->buf); + + if (p->buf_size == 0) + return vu_packet_check_range((void *)p->buf, offset, len, start, + func, line); + if (start < p->buf) { if (func) { trace("add packet start %p before buffer start %p, " diff --git a/packet.h b/packet.h index 8377dcf678bb..0aec6d9410aa 100644 --- a/packet.h +++ b/packet.h @@ -22,6 +22,8 @@ struct pool { struct iovec pkt[1]; }; +int vu_packet_check_range(void *buf, size_t offset, size_t len, + const char *start, const char *func, int line); void packet_add_do(struct pool *p, size_t len, const char *start, const char *func, int line); void *packet_get_do(const struct pool *p, const size_t idx, diff --git a/passt.c b/passt.c index 9d21c545b9cf..8c0490782a7f 100644 --- a/passt.c +++ b/passt.c @@ -274,6 +274,7 @@ int main(int argc, char **argv) pasta_netns_quit_init(&c); tap_sock_init(&c); + vu_init(&c); secret_init(&c); @@ -367,11 +368,20 @@ loop: tcp_timer_handler(&c, ref); break; case EPOLL_TYPE_UDP: - udp_buf_sock_handler(&c, ref, eventmask, &now); + if (c.mode == MODE_VU) + udp_vu_sock_handler(&c, ref, eventmask, &now); + else + udp_buf_sock_handler(&c, ref, eventmask, &now); break; case EPOLL_TYPE_PING: icmp_sock_handler(&c, ref); break; + case EPOLL_TYPE_VHOST_CMD: + tap_handler_vu(&c, eventmask); + break; + case EPOLL_TYPE_VHOST_KICK: + vu_kick_cb(&c, ref); + break; default: /* Can't happen */ ASSERT(0); diff --git a/passt.h b/passt.h index af10d0bfe4ef..f15f28c89d39 100644 --- a/passt.h +++ b/passt.h @@ -22,6 +22,7 @@ union epoll_ref; #include "fwd.h" #include "tcp.h" #include "udp.h" +#include "udp_vu.h" #include "vhost_user.h" /** @@ -122,6 +123,7 @@ struct fqdn { enum passt_modes { MODE_PASST, MODE_PASTA, + MODE_VU, }; /** diff --git a/pcap.c b/pcap.c index 507be2ac1edf..d4d0ec62b944 100644 --- a/pcap.c +++ b/pcap.c @@ -142,7 +142,6 @@ void pcap_multiple(const struct iovec *iov, size_t 
frame_parts, unsigned int n, * containing packet data to write, including L2 header * @iovcnt: Number of buffers (@iov entries) */ -/* cppcheck-suppress unusedFunction */ void pcap_iov(const struct iovec *iov, size_t iovcnt) { struct timespec now; diff --git a/tap.c b/tap.c index be272d25b642..e3274d39131a 100644 --- a/tap.c +++ b/tap.c @@ -58,6 +58,7 @@ #include "packet.h" #include "tap.h" #include "log.h" +#include "vhost_user.h" /* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */ static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS, pkt_buf); @@ -78,16 +79,22 @@ void tap_send_single(const struct ctx *c, const void *data, size_t l2len) struct iovec iov[2]; size_t iovcnt = 0; - if (c->mode == MODE_PASST) { + switch (c->mode) { + case MODE_PASST: iov[iovcnt] = IOV_OF_LVALUE(vnet_len); iovcnt++; - } - - iov[iovcnt].iov_base = (void *)data; - iov[iovcnt].iov_len = l2len; - iovcnt++; + /* fall through */ + case MODE_PASTA: + iov[iovcnt].iov_base = (void *)data; + iov[iovcnt].iov_len = l2len; + iovcnt++; - tap_send_frames(c, iov, iovcnt, 1); + tap_send_frames(c, iov, iovcnt, 1); + break; + case MODE_VU: + vu_send(c, data, l2len); + break; + } } /** @@ -416,10 +423,19 @@ size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, if (!nframes) return 0; - if (c->mode == MODE_PASTA) + switch (c->mode) { + case MODE_PASTA: m = tap_send_frames_pasta(c, iov, bufs_per_frame, nframes); - else + break; + case MODE_PASST: m = tap_send_frames_passt(c, iov, bufs_per_frame, nframes); + break; + case MODE_VU: + ASSERT(0); + default: + m = 0; + break; + } if (m < nframes) debug("tap: failed to send %zu frames of %zu", @@ -1180,11 +1196,17 @@ static void tap_sock_unix_init(struct ctx *c) ev.data.u64 = ref.u64; epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap_listen, &ev); - info("\nYou can now start qemu (>= 7.2, with commit 13c6be96618c):"); - info(" kvm ... -device virtio-net-pci,netdev=s -netdev stream,id=s,server=off,addr.type=unix,addr.path=%s", - c->sock_path); - info("or qrap, for earlier qemu versions:"); - info(" ./qrap 5 kvm ... -net socket,fd=5 -net nic,model=virtio"); + if (c->mode == MODE_VU) { + info("You can start qemu with:"); + info(" kvm ... -chardev socket,id=chr0,path=%s -netdev vhost-user,id=netdev0,chardev=chr0 -device virtio-net,netdev=netdev0 -object memory-backend-memfd,id=memfd0,share=on,size=$RAMSIZE -numa node,memdev=memfd0\n", + c->sock_path); + } else { + info("\nYou can now start qemu (>= 7.2, with commit 13c6be96618c):"); + info(" kvm ... -device virtio-net-pci,netdev=s -netdev stream,id=s,server=off,addr.type=unix,addr.path=%s", + c->sock_path); + info("or qrap, for earlier qemu versions:"); + info(" ./qrap 5 kvm ... 
-net socket,fd=5 -net nic,model=virtio"); + } } /** @@ -1194,8 +1216,8 @@ static void tap_sock_unix_init(struct ctx *c) */ void tap_listen_handler(struct ctx *c, uint32_t events) { - union epoll_ref ref = { .type = EPOLL_TYPE_TAP_PASST }; struct epoll_event ev = { 0 }; + union epoll_ref ref; int v = INT_MAX / 2; struct ucred ucred; socklen_t len; @@ -1235,7 +1257,13 @@ void tap_listen_handler(struct ctx *c, uint32_t events) trace("tap: failed to set SO_SNDBUF to %i", v); ref.fd = c->fd_tap; - ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP; + if (c->mode == MODE_VU) { + ref.type = EPOLL_TYPE_VHOST_CMD; + ev.events = EPOLLIN | EPOLLRDHUP; + } else { + ref.type = EPOLL_TYPE_TAP_PASST; + ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET; + } ev.data.u64 = ref.u64; epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev); } @@ -1324,10 +1352,22 @@ void tap_sock_init(struct ctx *c) pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS, pkt_buf, sz); pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS, pkt_buf, sz); + if (c->mode == MODE_VU) { + pool_tap4_storage.buf = NULL; + pool_tap4_storage.buf_size = 0; + pool_tap6_storage.buf = NULL; + pool_tap6_storage.buf_size = 0; + } for (i = 0; i < TAP_SEQS; i++) { tap4_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, pkt_buf, sz); tap6_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, pkt_buf, sz); + if (c->mode == MODE_VU) { + tap4_l4[i].p.buf = NULL; + tap4_l4[i].p.buf_size = 0; + tap6_l4[i].p.buf = NULL; + tap6_l4[i].p.buf_size = 0; + } } if (c->fd_tap != -1) { /* Passed as --fd */ @@ -1336,12 +1376,21 @@ void tap_sock_init(struct ctx *c) ASSERT(c->one_off); ref.fd = c->fd_tap; - if (c->mode == MODE_PASST) + switch (c->mode) { + case MODE_PASST: ref.type = EPOLL_TYPE_TAP_PASST; - else + ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP; + break; + case MODE_PASTA: ref.type = EPOLL_TYPE_TAP_PASTA; + ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP; + break; + case MODE_VU: + ref.type = EPOLL_TYPE_VHOST_CMD; + ev.events = EPOLLIN | EPOLLRDHUP; + break; + } - ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP; ev.data.u64 = ref.u64; epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev); return; diff --git a/tap.h b/tap.h index 3b2dde41ae8d..d9c6d4f57093 100644 --- a/tap.h +++ b/tap.h @@ -40,7 +40,8 @@ static inline struct iovec tap_hdr_iov(const struct ctx *c, */ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len) { - thdr->vnet_len = htonl(l2len); + if (thdr) + thdr->vnet_len = htonl(l2len); } struct in_addr tap_ip4_daddr(const struct ctx *c); diff --git a/tcp.c b/tcp.c index 68524235347c..8709dd6d97bb 100644 --- a/tcp.c +++ b/tcp.c @@ -304,6 +304,7 @@ #include "flow_table.h" #include "tcp_internal.h" #include "tcp_buf.h" +#include "tcp_vu.h" #define TCP_HASH_TABLE_LOAD 70 /* % */ #define TCP_HASH_TABLE_SIZE (FLOW_MAX * 100 / TCP_HASH_TABLE_LOAD) @@ -1049,7 +1050,10 @@ static size_t tcp_fill_headers4(const struct ctx *c, tcp_fill_header(th, conn, seq); - tcp_update_check_tcp4(iph, th); + if (c->mode != MODE_VU) + tcp_update_check_tcp4(iph, th); + else + th->check = 0; tap_hdr_update(taph, l3len + sizeof(struct ethhdr)); @@ -1094,7 +1098,10 @@ static size_t tcp_fill_headers6(const struct ctx *c, tcp_fill_header(th, conn, seq); - tcp_update_check_tcp6(ip6h, th); + if (c->mode != MODE_VU) + tcp_update_check_tcp6(ip6h, th); + else + th->check = 0; tap_hdr_update(taph, l4len + sizeof(*ip6h) + sizeof(struct ethhdr)); @@ -1362,6 +1369,9 @@ int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn, */ int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) { + if (c->mode 
== MODE_VU) + return tcp_vu_send_flag(c, conn, flags); + return tcp_buf_send_flag(c, conn, flags); } @@ -1808,6 +1818,9 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq) */ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) { + if (c->mode == MODE_VU) + return tcp_vu_data_from_sock(c, conn); + return tcp_buf_data_from_sock(c, conn); } diff --git a/tcp_vu.c b/tcp_vu.c new file mode 100644 index 000000000000..f27890f63c0e --- /dev/null +++ b/tcp_vu.c @@ -0,0 +1,547 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <errno.h> +#include <stddef.h> +#include <stdint.h> + +#include <netinet/ip.h> + +#include <sys/socket.h> + +#include <linux/tcp.h> +#include <linux/virtio_net.h> + +#include "util.h" +#include "ip.h" +#include "passt.h" +#include "siphash.h" +#include "inany.h" +#include "vhost_user.h" +#include "tcp.h" +#include "pcap.h" +#include "flow.h" +#include "tcp_conn.h" +#include "flow_table.h" +#include "tcp_vu.h" +#include "tcp_internal.h" +#include "checksum.h" + +#define CONN_V4(conn) (!!inany_v4(&(conn)->faddr)) +#define CONN_V6(conn) (!CONN_V4(conn)) + +/** + * struct tcp_payload_t - TCP header and data to send segments with payload + * @th: TCP header + * @data: TCP data + */ +struct tcp_payload_t { + struct tcphdr th; + uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)]; +}; + +/** + * struct tcp_flags_t - TCP header and data to send zero-length + * segments (flags) + * @th: TCP header + * @opts TCP options + */ +struct tcp_flags_t { + struct tcphdr th; + char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; +}; + +/* vhost-user */ +static const struct virtio_net_hdr vu_header = { + .flags = VIRTIO_NET_HDR_F_DATA_VALID, + .gso_type = VIRTIO_NET_HDR_GSO_NONE, +}; + +int tcp_vu_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) +{ + VuDev *vdev = (VuDev *)&c->vdev; + VuVirtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; + size_t tlen, vnet_hdrlen, l4len, optlen; + struct virtio_net_hdr_mrg_rxbuf *vh; + struct iovec l2_iov[TCP_NUM_IOVS]; + VuVirtqElement elem; + struct iovec in_sg; + struct ethhdr *eh; + int nb_ack; + int ret; + + elem.out_num = 0; + elem.out_sg = NULL; + elem.in_num = 1; + elem.in_sg = &in_sg; + ret = vu_queue_pop(vdev, vq, &elem); + if (ret < 0) + return 0; + + if (elem.in_num < 1) { + err("virtio-net receive queue contains no in buffers"); + vu_queue_rewind(vdev, vq, 1); + return 0; + } + + vh = elem.in_sg[0].iov_base; + + vh->hdr = vu_header; + if (vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) { + vnet_hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf); + vh->num_buffers = htole16(1); + } else { + vnet_hdrlen = sizeof(struct virtio_net_hdr); + } + + l2_iov[TCP_IOV_TAP].iov_base = NULL; + l2_iov[TCP_IOV_TAP].iov_len = 0; + l2_iov[TCP_IOV_ETH].iov_base = (char *)elem.in_sg[0].iov_base + vnet_hdrlen; + l2_iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr); + + eh = l2_iov[TCP_IOV_ETH].iov_base; + + memcpy(eh->h_dest, c->mac_guest, sizeof(eh->h_dest)); + memcpy(eh->h_source, c->mac, sizeof(eh->h_source)); + + if (CONN_V4(conn)) { + struct tcp_flags_t *payload; + struct iphdr *iph; + uint32_t seq; + + l2_iov[TCP_IOV_IP].iov_base = (char *)l2_iov[TCP_IOV_ETH].iov_base + + l2_iov[TCP_IOV_ETH].iov_len; + l2_iov[TCP_IOV_IP].iov_len = sizeof(struct iphdr); + l2_iov[TCP_IOV_PAYLOAD].iov_base = (char *)l2_iov[TCP_IOV_IP].iov_base + + l2_iov[TCP_IOV_IP].iov_len; + + eh->h_proto = htons(ETH_P_IP); + + iph = l2_iov[TCP_IOV_IP].iov_base; + *iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP); + + payload = 
l2_iov[TCP_IOV_PAYLOAD].iov_base; + payload->th = (struct tcphdr){ + .doff = offsetof(struct tcp_flags_t, opts) / 4, + .ack = 1 + }; + + seq = conn->seq_to_tap; + ret = tcp_prepare_flags(c, conn, flags, &payload->th, payload->opts, &optlen); + if (ret <= 0) { + vu_queue_rewind(vdev, vq, 1); + return ret; + } + + l4len = tcp_l2_buf_fill_headers(c, conn, l2_iov, optlen, NULL, + seq); + /* cppcheck-suppress unreadVariable */ + l2_iov[TCP_IOV_PAYLOAD].iov_len = l4len; + + tlen = l4len + sizeof(*iph) + sizeof(struct ethhdr); + + if (*c->pcap) { + uint32_t sum = proto_ipv4_header_psum(l4len, + IPPROTO_TCP, + /* cppcheck-suppress unknownEvaluationOrder */ + (struct in_addr){ .s_addr = iph->saddr }, + (struct in_addr){ .s_addr = iph->daddr }); + + payload->th.check = 0; + payload->th.check = csum(&payload->th, optlen + sizeof(struct tcphdr), sum); + } + } else { + struct tcp_flags_t *payload; + struct ipv6hdr *ip6h; + uint32_t seq; + + l2_iov[TCP_IOV_IP].iov_base = (char *)l2_iov[TCP_IOV_ETH].iov_base + + l2_iov[TCP_IOV_ETH].iov_len; + l2_iov[TCP_IOV_IP].iov_len = sizeof(struct ipv6hdr); + l2_iov[TCP_IOV_PAYLOAD].iov_base = (char *)l2_iov[TCP_IOV_IP].iov_base + + l2_iov[TCP_IOV_IP].iov_len; + + eh->h_proto = htons(ETH_P_IPV6); + + ip6h = l2_iov[TCP_IOV_IP].iov_base; + *ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP); + + payload = l2_iov[TCP_IOV_PAYLOAD].iov_base; + payload->th = (struct tcphdr){ + .doff = offsetof(struct tcp_flags_t, opts) / 4, + .ack = 1 + }; + + seq = conn->seq_to_tap; + ret = tcp_prepare_flags(c, conn, flags, &payload->th, payload->opts, &optlen); + if (ret <= 0) { + vu_queue_rewind(vdev, vq, 1); + return ret; + } + + l4len = tcp_l2_buf_fill_headers(c, conn, l2_iov, optlen, NULL, + seq); + /* cppcheck-suppress unreadVariable */ + l2_iov[TCP_IOV_PAYLOAD].iov_len = l4len; + + tlen = l4len + sizeof(*ip6h) + sizeof(struct ethhdr); + + if (*c->pcap) { + uint32_t sum = proto_ipv6_header_psum(l4len, + IPPROTO_TCP, + &ip6h->saddr, + &ip6h->daddr); + + payload->th.check = 0; + payload->th.check = csum(&payload->th, optlen + sizeof(struct tcphdr), sum); + } + } + + pcap((void *)eh, tlen); + + tlen += vnet_hdrlen; + vu_queue_fill(vdev, vq, &elem, tlen, 0); + nb_ack = 1; + + if (flags & DUP_ACK) { + VuVirtqElement elem_dup; + struct iovec in_sg_dup; + + elem_dup.out_num = 0; + elem_dup.out_sg = NULL; + elem_dup.in_num = 1; + elem_dup.in_sg = &in_sg_dup; + ret = vu_queue_pop(vdev, vq, &elem_dup); + if (ret == 0) { + if (elem_dup.in_num < 1 || elem_dup.in_sg[0].iov_len < tlen) { + vu_queue_rewind(vdev, vq, 1); + } else { + memcpy(elem_dup.in_sg[0].iov_base, vh, tlen); + nb_ack++; + } + } + } + + vu_queue_flush(vdev, vq, nb_ack); + vu_queue_notify(vdev, vq); + + return 0; +} + +int tcp_vu_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) +{ + uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; + static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE]; + static VuVirtqElement elem[VIRTQUEUE_MAX_SIZE]; + static struct iovec in_sg[VIRTQUEUE_MAX_SIZE]; + VuDev *vdev = (VuDev *)&c->vdev; + VuVirtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; + size_t l2_hdrlen, vnet_hdrlen, fillsize; + int s = conn->sock, v4 = CONN_V4(conn); + struct iovec l2_iov[TCP_NUM_IOVS]; + int i, ret, iov_cnt, iov_used; + struct msghdr mh_sock = { 0 }; + uint16_t mss = MSS_GET(conn); + static int in_sg_count; + uint32_t already_sent; + const uint16_t *check; + struct iovec *first; + bool has_mrg_rxbuf; + int segment_size; + int num_buffers; + ssize_t len; + + if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) { + 
err("Got packet, but no available descriptors on RX virtq."); + return 0; + } + + already_sent = conn->seq_to_tap - conn->seq_ack_from_tap; + + if (SEQ_LT(already_sent, 0)) { + /* RFC 761, section 2.1. */ + flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u", + conn->seq_ack_from_tap, conn->seq_to_tap); + conn->seq_to_tap = conn->seq_ack_from_tap; + already_sent = 0; + } + + if (!wnd_scaled || already_sent >= wnd_scaled) { + conn_flag(c, conn, STALLED); + conn_flag(c, conn, ACK_FROM_TAP_DUE); + return 0; + } + + /* Set up buffer descriptors we'll fill completely and partially. */ + + fillsize = wnd_scaled; + + iov_vu[0].iov_base = tcp_buf_discard; + iov_vu[0].iov_len = already_sent; + fillsize -= already_sent; + + has_mrg_rxbuf = vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF); + if (has_mrg_rxbuf) + vnet_hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf); + else + vnet_hdrlen = sizeof(struct virtio_net_hdr); + l2_hdrlen = vnet_hdrlen + sizeof(struct ethhdr) + sizeof(struct tcphdr); + if (v4) + l2_hdrlen += sizeof(struct iphdr); + else + l2_hdrlen += sizeof(struct ipv6hdr); + + iov_cnt = 0; + in_sg_count = 0; + segment_size = 0; + while (fillsize > 0 && iov_cnt < VIRTQUEUE_MAX_SIZE - 1 && + in_sg_count < ARRAY_SIZE(in_sg)) { + + elem[iov_cnt].out_num = 0; + elem[iov_cnt].out_sg = NULL; + elem[iov_cnt].in_num = ARRAY_SIZE(in_sg) - in_sg_count; + elem[iov_cnt].in_sg = &in_sg[in_sg_count]; + ret = vu_queue_pop(vdev, vq, &elem[iov_cnt]); + if (ret < 0) + break; + + if (elem[iov_cnt].in_num < 1) { + err("virtio-net receive queue contains no in buffers"); + goto err; + } + in_sg_count += elem[iov_cnt].in_num; + + ASSERT(elem[iov_cnt].in_num == 1); + ASSERT(elem[iov_cnt].in_sg[0].iov_len >= l2_hdrlen); + + if (segment_size == 0) { + iov_vu[iov_cnt + 1].iov_base = + (char *)elem[iov_cnt].in_sg[0].iov_base + l2_hdrlen; + iov_vu[iov_cnt + 1].iov_len = + elem[iov_cnt].in_sg[0].iov_len - l2_hdrlen; + } else { + iov_vu[iov_cnt + 1].iov_base = elem[iov_cnt].in_sg[0].iov_base; + iov_vu[iov_cnt + 1].iov_len = elem[iov_cnt].in_sg[0].iov_len; + } + + if (iov_vu[iov_cnt + 1].iov_len > fillsize) + iov_vu[iov_cnt + 1].iov_len = fillsize; + + segment_size += iov_vu[iov_cnt + 1].iov_len; + if (!has_mrg_rxbuf) { + segment_size = 0; + } else if (segment_size >= mss) { + iov_vu[iov_cnt + 1].iov_len -= segment_size - mss; + segment_size = 0; + } + fillsize -= iov_vu[iov_cnt + 1].iov_len; + + iov_cnt++; + } + if (iov_cnt == 0) + return 0; + + ret = 0; + mh_sock.msg_iov = iov_vu; + mh_sock.msg_iovlen = iov_cnt + 1; + + do + len = recvmsg(s, &mh_sock, MSG_PEEK); + while (len < 0 && errno == EINTR); + + if (len < 0) + goto err; + + if (!len) { + vu_queue_rewind(vdev, vq, iov_cnt); + if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) { + ret = tcp_vu_send_flag(c, conn, FIN | ACK); + if (ret) { + tcp_rst(c, conn); + return ret; + } + + conn_event(c, conn, TAP_FIN_SENT); + } + + return 0; + } + + len -= already_sent; + if (len <= 0) { + conn_flag(c, conn, STALLED); + vu_queue_rewind(vdev, vq, iov_cnt); + return 0; + } + + conn_flag(c, conn, ~STALLED); + + /* Likely, some new data was acked too. 
*/ + tcp_update_seqack_wnd(c, conn, 0, NULL); + + /* initialize headers */ + iov_used = 0; + num_buffers = 0; + check = NULL; + segment_size = 0; + for (i = 0; i < iov_cnt && len; i++) { + + if (segment_size == 0) + first = &iov_vu[i + 1]; + + if (iov_vu[i + 1].iov_len > (size_t)len) + iov_vu[i + 1].iov_len = len; + + len -= iov_vu[i + 1].iov_len; + iov_used++; + + segment_size += iov_vu[i + 1].iov_len; + num_buffers++; + + if (segment_size >= mss || len == 0 || + i + 1 == iov_cnt || !has_mrg_rxbuf) { + char *base = (char *)first->iov_base - l2_hdrlen; + size_t size = first->iov_len + l2_hdrlen; + struct virtio_net_hdr_mrg_rxbuf *vh; + struct ethhdr *eh; + size_t l4len; + + vh = (struct virtio_net_hdr_mrg_rxbuf *)base; + + vh->hdr = vu_header; + if (has_mrg_rxbuf) + vh->num_buffers = htole16(num_buffers); + + l2_iov[TCP_IOV_TAP].iov_base = NULL; + l2_iov[TCP_IOV_TAP].iov_len = 0; + l2_iov[TCP_IOV_ETH].iov_base = base + vnet_hdrlen; + l2_iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr); + + eh = l2_iov[TCP_IOV_ETH].iov_base; + + memcpy(eh->h_dest, c->mac_guest, sizeof(eh->h_dest)); + memcpy(eh->h_source, c->mac, sizeof(eh->h_source)); + + /* initialize header */ + if (v4) { + struct tcp_payload_t *payload; + struct iphdr *iph; + + l2_iov[TCP_IOV_IP].iov_base = (char *)l2_iov[TCP_IOV_ETH].iov_base + + l2_iov[TCP_IOV_ETH].iov_len; + l2_iov[TCP_IOV_IP].iov_len = sizeof(struct iphdr); + l2_iov[TCP_IOV_PAYLOAD].iov_base = (char *)l2_iov[TCP_IOV_IP].iov_base + + l2_iov[TCP_IOV_IP].iov_len; + + + eh->h_proto = htons(ETH_P_IP); + + iph = l2_iov[TCP_IOV_IP].iov_base; + *iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP); + payload = l2_iov[TCP_IOV_PAYLOAD].iov_base; + payload->th = (struct tcphdr){ + .doff = offsetof(struct tcp_payload_t, data) / 4, + .ack = 1 + }; + + l4len = tcp_l2_buf_fill_headers(c, conn, l2_iov, + segment_size, + len ? 
check : NULL, + conn->seq_to_tap); + l2_iov[TCP_IOV_PAYLOAD].iov_len = l4len; + + if (*c->pcap) { + uint32_t sum = proto_ipv4_header_psum(l4len, + IPPROTO_TCP, + /* cppcheck-suppress unknownEvaluationOrder */ + (struct in_addr){ .s_addr = iph->saddr }, + (struct in_addr){ .s_addr = iph->daddr }); + + first->iov_base = &payload->th; + first->iov_len = size - l2_hdrlen + sizeof(struct tcphdr); + payload->th.check = 0; + payload->th.check = csum_iov(first, num_buffers, sum); + } + + check = &iph->check; + } else { + struct tcp_payload_t *payload; + struct ipv6hdr *ip6h; + + l2_iov[TCP_IOV_IP].iov_base = (char *)l2_iov[TCP_IOV_ETH].iov_base + + l2_iov[TCP_IOV_ETH].iov_len; + l2_iov[TCP_IOV_IP].iov_len = sizeof(struct ipv6hdr); + l2_iov[TCP_IOV_PAYLOAD].iov_base = (char *)l2_iov[TCP_IOV_IP].iov_base + + l2_iov[TCP_IOV_IP].iov_len; + + + eh->h_proto = htons(ETH_P_IPV6); + + ip6h = l2_iov[TCP_IOV_IP].iov_base; + *ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP); + + payload = l2_iov[TCP_IOV_PAYLOAD].iov_base; + payload->th = (struct tcphdr){ + .doff = offsetof(struct tcp_payload_t, data) / 4, + .ack = 1 + }; +; + l4len = tcp_l2_buf_fill_headers(c, conn, l2_iov, + segment_size, + NULL, conn->seq_to_tap); + l2_iov[TCP_IOV_PAYLOAD].iov_len = l4len; + + if (*c->pcap) { + uint32_t sum = proto_ipv6_header_psum(l4len, + IPPROTO_TCP, + &ip6h->saddr, + &ip6h->daddr); + + first->iov_base = &payload->th; + first->iov_len = size - l2_hdrlen + sizeof(struct tcphdr); + + payload->th.check = 0; + payload->th.check = csum_iov(first, num_buffers, sum); + } + } + + /* set iov for pcap logging */ + first->iov_base = eh; + first->iov_len = size - vnet_hdrlen; + + pcap_iov(first, num_buffers); + + /* set iov_len for vu_queue_fill_by_index(); */ + + first->iov_base = base; + first->iov_len = size; + + conn->seq_to_tap += segment_size; + + segment_size = 0; + num_buffers = 0; + } + } + + /* release unused buffers */ + vu_queue_rewind(vdev, vq, iov_cnt - iov_used); + + /* send packets */ + for (i = 0; i < iov_used; i++) + vu_queue_fill(vdev, vq, &elem[i], iov_vu[i + 1].iov_len, i); + + vu_queue_flush(vdev, vq, iov_used); + vu_queue_notify(vdev, vq); + + conn_flag(c, conn, ACK_FROM_TAP_DUE); + + return 0; +err: + vu_queue_rewind(vdev, vq, iov_cnt); + + if (errno != EAGAIN && errno != EWOULDBLOCK) { + ret = -errno; + tcp_rst(c, conn); + } + + return ret; +} diff --git a/tcp_vu.h b/tcp_vu.h new file mode 100644 index 000000000000..b8c57a543ed5 --- /dev/null +++ b/tcp_vu.h @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#ifndef TCP_VU_H +#define TCP_VU_H + +int tcp_vu_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags); +int tcp_vu_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn); + +#endif /*TCP_VU_H */ diff --git a/udp.c b/udp.c index dba75d7fecbd..90d58b691c83 100644 --- a/udp.c +++ b/udp.c @@ -121,9 +121,7 @@ #include "tap.h" #include "pcap.h" #include "log.h" - -#define UDP_CONN_TIMEOUT 180 /* s, timeout for ephemeral or local bind */ -#define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */ +#include "udp_internal.h" /** * struct udp_tap_port - Port tracking based on tap-facing source port @@ -171,20 +169,8 @@ static uint8_t udp_act[IP_VERSIONS][UDP_ACT_TYPE_MAX][DIV_ROUND_UP(NUM_PORTS, 8) /* Static buffers */ -/** - * struct udp_payload_t - UDP header and data for inbound messages - * @uh: UDP header - * @data: UDP data - */ -static struct udp_payload_t { - struct udphdr uh; - char data[USHRT_MAX - sizeof(struct udphdr)]; -#ifdef __AVX2__ -} __attribute__ ((packed, 
aligned(32))) -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) -#endif -udp_payload[UDP_MAX_FRAMES]; +/* UDP header and data for inbound messages */ +static struct udp_payload_t udp_payload[UDP_MAX_FRAMES]; /* Ethernet header for IPv4 frames */ static struct ethhdr udp4_eth_hdr; @@ -239,11 +225,11 @@ static struct mmsghdr udp6_l2_mh_sock [UDP_MAX_FRAMES]; /* recvmmsg()/sendmmsg() data for "spliced" connections */ static struct iovec udp_iov_splice [UDP_MAX_FRAMES]; -static struct sockaddr_in udp4_localname = { +struct sockaddr_in udp4_localname = { .sin_family = AF_INET, .sin_addr = IN4ADDR_LOOPBACK_INIT, }; -static struct sockaddr_in6 udp6_localname = { +struct sockaddr_in6 udp6_localname = { .sin6_family = AF_INET6, .sin6_addr = IN6ADDR_LOOPBACK_INIT, }; @@ -564,11 +550,11 @@ static void udp_splice_sendfrom(const struct ctx *c, unsigned start, unsigned n, * * Return: size of IPv4 payload (UDP header + data) */ -static size_t udp_update_hdr4(const struct ctx *c, - struct iphdr *ip4h, const struct sockaddr_in *s_in, - struct udp_payload_t *bp, - in_port_t dstport, size_t dlen, - const struct timespec *now) +size_t udp_update_hdr4(const struct ctx *c, + struct iphdr *ip4h, const struct sockaddr_in *s_in, + struct udp_payload_t *bp, + in_port_t dstport, size_t dlen, + const struct timespec *now) { const struct in_addr dst = c->ip4.addr_seen; in_port_t srcport = ntohs(s_in->sin_port); @@ -603,7 +589,10 @@ static size_t udp_update_hdr4(const struct ctx *c, bp->uh.source = s_in->sin_port; bp->uh.dest = htons(dstport); bp->uh.len = htons(l4len); - csum_udp4(&bp->uh, src, dst, bp->data, dlen); + if (c->mode != MODE_VU) + csum_udp4(&bp->uh, src, dst, bp->data, dlen); + else + bp->uh.check = 0; return l4len; } @@ -620,11 +609,11 @@ static size_t udp_update_hdr4(const struct ctx *c, * * Return: size of IPv6 payload (UDP header + data) */ -static size_t udp_update_hdr6(const struct ctx *c, - struct ipv6hdr *ip6h, struct sockaddr_in6 *s_in6, - struct udp_payload_t *bp, - in_port_t dstport, size_t dlen, - const struct timespec *now) +size_t udp_update_hdr6(const struct ctx *c, + struct ipv6hdr *ip6h, struct sockaddr_in6 *s_in6, + struct udp_payload_t *bp, + in_port_t dstport, size_t dlen, + const struct timespec *now) { const struct in6_addr *src = &s_in6->sin6_addr; const struct in6_addr *dst = &c->ip6.addr_seen; @@ -675,7 +664,10 @@ static size_t udp_update_hdr6(const struct ctx *c, bp->uh.source = s_in6->sin6_port; bp->uh.dest = htons(dstport); bp->uh.len = ip6h->payload_len; - csum_udp6(&bp->uh, src, dst, bp->data, dlen); + if (c->mode != MODE_VU) + csum_udp6(&bp->uh, src, dst, bp->data, dlen); + else + bp->uh.check = 0xffff; /* zero checksum is invalid with IPv6 */ return l4len; } diff --git a/udp_internal.h b/udp_internal.h new file mode 100644 index 000000000000..898d1e103cb8 --- /dev/null +++ b/udp_internal.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright (c) 2021 Red Hat GmbH + * Author: Stefano Brivio <sbrivio(a)redhat.com> + */ + +#ifndef UDP_INTERNAL_H +#define UDP_INTERNAL_H + +#define UDP_CONN_TIMEOUT 180 /* s, timeout for ephemeral or local bind */ +#define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */ + +extern struct sockaddr_in udp4_localname; +extern struct sockaddr_in6 udp6_localname; + +/** + * struct udp_payload_t - UDP header and data for inbound messages + * @uh: UDP header + * @data: UDP data + */ +struct udp_payload_t { + struct udphdr uh; + char data[USHRT_MAX - sizeof(struct udphdr)]; +#ifdef __AVX2__ +} 
__attribute__ ((packed, aligned(32))); +#else +} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); +#endif + +size_t udp_update_hdr4(const struct ctx *c, + struct iphdr *ip4h, const struct sockaddr_in *s_in, + struct udp_payload_t *bp, + in_port_t dstport, size_t dlen, + const struct timespec *now); +size_t udp_update_hdr6(const struct ctx *c, + struct ipv6hdr *ip6h, struct sockaddr_in6 *s_in6, + struct udp_payload_t *bp, + in_port_t dstport, size_t dlen, + const struct timespec *now); +#endif /* UDP_INTERNAL_H */ diff --git a/udp_vu.c b/udp_vu.c new file mode 100644 index 000000000000..deb649028153 --- /dev/null +++ b/udp_vu.c @@ -0,0 +1,237 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <unistd.h> +#include <net/ethernet.h> +#include <net/if.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/udp.h> +#include <stdint.h> +#include <stddef.h> +#include <sys/uio.h> +#include <linux/virtio_net.h> + +#include "checksum.h" +#include "util.h" +#include "ip.h" +#include "passt.h" +#include "pcap.h" +#include "log.h" +#include "vhost_user.h" +#include "udp_internal.h" +#include "udp_vu.h" + +/* vhost-user */ +static const struct virtio_net_hdr vu_header = { + .flags = VIRTIO_NET_HDR_F_DATA_VALID, + .gso_type = VIRTIO_NET_HDR_GSO_NONE, +}; + +static struct iovec iov_vu [VIRTQUEUE_MAX_SIZE]; +static VuVirtqElement elem [VIRTQUEUE_MAX_SIZE]; +static struct iovec in_sg[VIRTQUEUE_MAX_SIZE]; +static int in_sg_count; + +void udp_vu_sock_handler(const struct ctx *c, union epoll_ref ref, + uint32_t events, const struct timespec *now) +{ + VuDev *vdev = (VuDev *)&c->vdev; + VuVirtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; + bool has_mrg_rxbuf, v6 = ref.udp.v6; + in_port_t dstport = ref.udp.port; + size_t l2_hdrlen, vnet_hdrlen; + struct msghdr msg; + int i, virtqueue_max; + + if (c->no_udp || !(events & EPOLLIN)) + return; + + has_mrg_rxbuf = vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF); + if (has_mrg_rxbuf) { + vnet_hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf); + virtqueue_max = VIRTQUEUE_MAX_SIZE; + } else { + vnet_hdrlen = sizeof(struct virtio_net_hdr); + virtqueue_max = 1; + } + l2_hdrlen = vnet_hdrlen + sizeof(struct ethhdr) + sizeof(struct udphdr); + + if (v6) { + l2_hdrlen += sizeof(struct ipv6hdr); + + udp6_localname.sin6_port = htons(dstport); + msg.msg_name = &udp6_localname; + msg.msg_namelen = sizeof(udp6_localname); + } else { + l2_hdrlen += sizeof(struct iphdr); + + udp4_localname.sin_port = htons(dstport); + msg.msg_name = &udp4_localname; + msg.msg_namelen = sizeof(udp4_localname); + } + + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + for (i = 0; i < UDP_MAX_FRAMES; i++) { + struct virtio_net_hdr_mrg_rxbuf *vh; + size_t size, fillsize, remaining; + int iov_cnt, iov_used; + struct ethhdr *eh; + ssize_t data_len; + size_t l4len; + char *base; + + fillsize = USHRT_MAX; + iov_cnt = 0; + in_sg_count = 0; + while (fillsize && iov_cnt < virtqueue_max && + in_sg_count < ARRAY_SIZE(in_sg)) { + int ret; + + elem[iov_cnt].out_num = 0; + elem[iov_cnt].out_sg = NULL; + elem[iov_cnt].in_num = ARRAY_SIZE(in_sg) - in_sg_count; + elem[iov_cnt].in_sg = &in_sg[in_sg_count]; + ret = vu_queue_pop(vdev, vq, &elem[iov_cnt]); + if (ret < 0) + break; + in_sg_count += elem[iov_cnt].in_num; + + if (elem[iov_cnt].in_num < 1) { + err("virtio-net receive queue contains no in buffers"); + vu_queue_rewind(vdev, vq, iov_cnt); + return; + } + ASSERT(elem[iov_cnt].in_num == 1); + ASSERT(elem[iov_cnt].in_sg[0].iov_len >= l2_hdrlen); + + if (iov_cnt == 
0) { + base = elem[iov_cnt].in_sg[0].iov_base; + size = elem[iov_cnt].in_sg[0].iov_len; + + /* keep space for the headers */ + iov_vu[0].iov_base = base + l2_hdrlen; + iov_vu[0].iov_len = size - l2_hdrlen; + } else { + iov_vu[iov_cnt].iov_base = elem[iov_cnt].in_sg[0].iov_base; + iov_vu[iov_cnt].iov_len = elem[iov_cnt].in_sg[0].iov_len; + } + + if (iov_vu[iov_cnt].iov_len > fillsize) + iov_vu[iov_cnt].iov_len = fillsize; + + fillsize -= iov_vu[iov_cnt].iov_len; + + iov_cnt++; + } + if (iov_cnt == 0) + break; + + msg.msg_iov = iov_vu; + msg.msg_iovlen = iov_cnt; + + data_len = recvmsg(ref.fd, &msg, 0); + if (data_len < 0) { + vu_queue_rewind(vdev, vq, iov_cnt); + return; + } + + /* restore original values */ + iov_vu[0].iov_base = base; + iov_vu[0].iov_len = size; + + /* count the numbers of buffer filled by recvmsg() */ + iov_used = iov_count(iov_vu, iov_cnt, l2_hdrlen + data_len, + &remaining); + ASSERT(iov_used <= iov_cnt); + if (iov_used > 0) { + ASSERT(iov_vu[iov_used - 1].iov_len >= remaining); + iov_vu[iov_used - 1].iov_len = remaining; + /* update size */ + if (iov_used - 1 == 0) + size = iov_vu[0].iov_len; + } + + /* release unused buffers */ + vu_queue_rewind(vdev, vq, iov_cnt - iov_used); + + /* vnet_header */ + vh = (struct virtio_net_hdr_mrg_rxbuf *)base; + vh->hdr = vu_header; + if (has_mrg_rxbuf) + vh->num_buffers = htole16(iov_used); + + /* ethernet header */ + eh = (struct ethhdr *)(base + vnet_hdrlen); + + memcpy(eh->h_dest, c->mac_guest, sizeof(eh->h_dest)); + memcpy(eh->h_source, c->mac, sizeof(eh->h_source)); + + /* initialize header */ + if (v6) { + struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1); + struct udp_payload_t *bp = (struct udp_payload_t *)(ip6h + 1); + + eh->h_proto = htons(ETH_P_IPV6); + + *ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_UDP); + + l4len = udp_update_hdr6(c, ip6h, &udp6_localname, bp, + dstport, data_len, now); + if (*c->pcap) { + uint32_t sum; + + sum = proto_ipv6_header_psum(l4len, IPPROTO_UDP, + &ip6h->saddr, + &ip6h->daddr); + + iov_vu[0].iov_base = &bp->uh; + iov_vu[0].iov_len = size - l2_hdrlen + + sizeof(bp->uh); + bp->uh.check = 0; /* by default, set to 0xffff */ + bp->uh.check = csum_iov(iov_vu, iov_used, sum); + } + } else { + struct iphdr *iph = (struct iphdr *)(eh + 1); + struct udp_payload_t *bp = (struct udp_payload_t *)(iph + 1); + + eh->h_proto = htons(ETH_P_IP); + + *iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_UDP); + + l4len = udp_update_hdr4(c, iph, &udp4_localname, bp, + dstport, data_len, now); + if (*c->pcap) { + uint32_t sum; + + sum = proto_ipv4_header_psum(l4len, IPPROTO_UDP, + /* cppcheck-suppress unknownEvaluationOrder */ + (struct in_addr){ .s_addr = iph->saddr }, + (struct in_addr){ .s_addr = iph->daddr }); + + iov_vu[0].iov_base = &bp->uh; + iov_vu[0].iov_len = size - l2_hdrlen + + sizeof(bp->uh); + bp->uh.check = csum_iov(iov_vu, iov_used, sum); + } + } + + /* set iov for pcap logging */ + iov_vu[0].iov_base = base + vnet_hdrlen; + iov_vu[0].iov_len = size - vnet_hdrlen; + pcap_iov(iov_vu, iov_used); + + /* set iov_len for vu_queue_fill_by_index(); */ + iov_vu[0].iov_base = base; + iov_vu[0].iov_len = size; + + /* send packets */ + for (i = 0; i < iov_used; i++) + vu_queue_fill(vdev, vq, &elem[i], iov_vu[i].iov_len, i); + + vu_queue_flush(vdev, vq, iov_used); + vu_queue_notify(vdev, vq); + } +} diff --git a/udp_vu.h b/udp_vu.h new file mode 100644 index 000000000000..e01ce047ee0a --- /dev/null +++ b/udp_vu.h @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#ifndef UDP_VU_H +#define UDP_VU_H + 
+void udp_vu_sock_handler(const struct ctx *c, union epoll_ref ref, + uint32_t events, const struct timespec *now); +#endif /* UDP_VU_H */ diff --git a/vhost_user.c b/vhost_user.c index 4ac0a3e53499..a3d156558359 100644 --- a/vhost_user.c +++ b/vhost_user.c @@ -28,7 +28,6 @@ #define VHOST_USER_VERSION 1 -/* cppcheck-suppress unusedFunction */ void vu_print_capabilities(void) { printf("{\n"); @@ -332,7 +331,6 @@ static bool map_ring(VuDev *vdev, VuVirtq *vq) return !(vq->vring.desc && vq->vring.used && vq->vring.avail); } -/* cppcheck-suppress unusedFunction */ int vu_packet_check_range(void *buf, size_t offset, size_t len, const char *start, const char *func, int line) { @@ -545,7 +543,6 @@ static int vu_wait_queue(const VuVirtq *vq) return 0; } -/* cppcheck-suppress unusedFunction */ int vu_send(const struct ctx *c, const void *buf, size_t size) { VuDev *vdev = (VuDev *)&c->vdev; @@ -730,7 +727,6 @@ static void vu_handle_tx(VuDev *vdev, int index) } } -/* cppcheck-suppress unusedFunction */ void vu_kick_cb(struct ctx *c, union epoll_ref ref) { VuDev *vdev = &c->vdev; @@ -927,7 +923,6 @@ static bool vu_set_vring_enable_exec(VuDev *vdev, struct VhostUserMsg *msg) return false; } -/* cppcheck-suppress unusedFunction */ void vu_init(struct ctx *c) { int i; @@ -988,7 +983,6 @@ static void vu_cleanup(VuDev *vdev) * @c: Execution context * @events: epoll events */ -/* cppcheck-suppress unusedFunction */ void tap_handler_vu(struct ctx *c, uint32_t events) { VuDev *dev = &c->vdev; diff --git a/virtio.c b/virtio.c index 5d58e56204b3..8c651070bba5 100644 --- a/virtio.c +++ b/virtio.c @@ -367,7 +367,6 @@ void vu_queue_unpop(VuDev *dev, VuVirtq *vq, unsigned int index, size_t len) vu_queue_detach_element(dev, vq, index, len); } -/* cppcheck-suppress unusedFunction */ bool vu_queue_rewind(VuDev *dev, VuVirtq *vq, unsigned int num) { (void)dev; -- 2.45.2
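Note on the receive path above: after recvmsg(), udp_vu_sock_handler() needs to know how many of the queued guest buffers were actually filled, and how many bytes ended up in the last one, so it can trim that buffer and rewind the rest to the queue. The standalone sketch below mirrors that accounting; it is a simplified stand-in for the iov_count() helper added earlier in this series, not the real implementation, and the function name is made up for the example.

#include <stdio.h>
#include <sys/uio.h>

/* Count how many iovec entries hold the first 'size' bytes of data, and
 * report how many bytes land in the last used entry, so that entry can be
 * shortened before handing the buffers back to the virtqueue.
 */
static size_t count_used(const struct iovec *iov, size_t iov_cnt,
                         size_t size, size_t *last_len)
{
        size_t used = 0;

        while (used < iov_cnt && size > iov[used].iov_len) {
                size -= iov[used].iov_len;
                used++;
        }

        if (used < iov_cnt && size) {
                *last_len = size;       /* bytes used in the last buffer */
                used++;
        } else {
                *last_len = used ? iov[used - 1].iov_len : 0;
        }

        return used;
}

int main(void)
{
        char a[1500], b[1500], c[1500];
        struct iovec iov[] = {
                { .iov_base = a, .iov_len = sizeof(a) },
                { .iov_base = b, .iov_len = sizeof(b) },
                { .iov_base = c, .iov_len = sizeof(c) },
        };
        size_t last;
        size_t used = count_used(iov, 3, 2000, &last);

        printf("buffers used: %zu, bytes in last one: %zu\n", used, last);
        return 0;
}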
On Fri, Jun 21, 2024 at 04:56:40PM +0200, Laurent Vivier wrote:add virtio and vhost-user functions to connect with QEMU. $ ./passt --vhost-user and # qemu-system-x86_64 ... -m 4G \ -object memory-backend-memfd,id=memfd0,share=on,size=4G \ -numa node,memdev=memfd0 \ -chardev socket,id=chr0,path=/tmp/passt_1.socket \ -netdev vhost-user,id=netdev0,chardev=chr0 \ -device virtio-net,mac=9a:2b:2c:2d:2e:2f,netdev=netdev0 \ ... Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- Makefile | 4 +- checksum.c | 1 - conf.c | 18 +- iov.c | 1 - packet.c | 6 + packet.h | 2 + passt.c | 12 +- passt.h | 2 + pcap.c | 1 - tap.c | 87 ++++++-- tap.h | 3 +- tcp.c | 17 +- tcp_vu.c | 547 +++++++++++++++++++++++++++++++++++++++++++++++++ tcp_vu.h | 9 + udp.c | 54 +++-- udp_internal.h | 39 ++++ udp_vu.c | 237 +++++++++++++++++++++ udp_vu.h | 8 + vhost_user.c | 6 - virtio.c | 1 - 20 files changed, 988 insertions(+), 67 deletions(-) create mode 100644 tcp_vu.c create mode 100644 tcp_vu.h create mode 100644 udp_internal.h create mode 100644 udp_vu.c create mode 100644 udp_vu.h diff --git a/Makefile b/Makefile index b2da6ad62103..d22388726099 100644 --- a/Makefile +++ b/Makefile @@ -47,7 +47,7 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS) PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \ icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \ ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \ - tcp_buf.c tcp_splice.c udp.c util.c vhost_user.c virtio.c + tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_vu.c util.c vhost_user.c virtio.c QRAP_SRCS = qrap.c SRCS = $(PASST_SRCS) $(QRAP_SRCS) @@ -57,7 +57,7 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \ lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \ siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \ - udp.h util.h vhost_user.h virtio.h + tcp_vu.h udp.h udp_internal.h udp_vu.h util.h vhost_user.h virtio.h HEADERS = $(PASST_HEADERS) seccomp.h C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 }; diff --git a/checksum.c b/checksum.c index 006614fcbb28..aa5b7ae1cb66 100644 --- a/checksum.c +++ b/checksum.c @@ -501,7 +501,6 @@ uint16_t csum(const void *buf, size_t len, uint32_t init) * * Return: 16-bit folded, complemented checksum */ -/* cppcheck-suppress unusedFunction */ uint16_t csum_iov(const struct iovec *iov, size_t n, uint32_t init) { unsigned int i; diff --git a/conf.c b/conf.c index 94b3ed6fa659..2c9a6da05666 100644 --- a/conf.c +++ b/conf.c @@ -45,6 +45,7 @@ #include "lineread.h" #include "isolation.h" #include "log.h" +#include "vhost_user.h" /** * next_chunk - Return the next piece of a string delimited by a character @@ -751,6 +752,9 @@ static void usage(const char *name, FILE *f, int status) " -s, --socket PATH UNIX domain socket path\n" " default: probe free path starting from " UNIX_SOCK_PATH "\n", 1); + info( " --vhost-user Enable vhost-user mode"); + info( " UNIX domain socket is provided by -s option"); + info( " --print-capabilities print back-end capabilities in JSON format");Probably worth nothing this is only meaningful for vhost-user mode.} fprintf(f, @@ -1175,6 +1179,7 @@ void conf(struct ctx *c, int argc, char **argv) {"help", no_argument, NULL, 'h' }, {"socket", required_argument, NULL, 's' }, {"fd", required_argument, NULL, 'F' }, + {"socket-path", required_argument, NULL, 's' }, /* vhost-user mandatory */Maybe put this next to 
the --socket option to make it clearer it's an alias for it. "vhost-user mandatory" isn't that clear to me - initially I though it meant the user had to supply this option for vhost-user mode, rather than that the vhost-user interface mandates this option exists.{"ns-ifname", required_argument, NULL, 'I' }, {"pcap", required_argument, NULL, 'p' }, {"pid", required_argument, NULL, 'P' }, @@ -1221,6 +1226,8 @@ void conf(struct ctx *c, int argc, char **argv) {"config-net", no_argument, NULL, 17 }, {"no-copy-routes", no_argument, NULL, 18 }, {"no-copy-addrs", no_argument, NULL, 19 }, + {"vhost-user", no_argument, NULL, 20 }, + {"print-capabilities", no_argument, NULL, 21 }, /* vhost-user mandatory */ { 0 }, }; char userns[PATH_MAX] = { 0 }, netns[PATH_MAX] = { 0 }; @@ -1373,7 +1380,6 @@ void conf(struct ctx *c, int argc, char **argv) sizeof(c->ip6.ifname_out), "%s", optarg); if (ret <= 0 || ret >= (int)sizeof(c->ip6.ifname_out)) die("Invalid interface name: %s", optarg); -Unrelated whitespace change.break; case 17: if (c->mode != MODE_PASTA) @@ -1395,6 +1401,16 @@ void conf(struct ctx *c, int argc, char **argv) warn("--no-copy-addrs will be dropped soon"); c->no_copy_addrs = copy_addrs_opt = true; break; + case 20: + if (c->mode == MODE_PASTA) { + err("--vhost-user is for passt mode only"); + usage(argv[0], stdout, EXIT_SUCCESS); + } + c->mode = MODE_VU; + break; + case 21: + vu_print_capabilities(); + break; case 'd': if (c->debug) die("Multiple --debug options given"); diff --git a/iov.c b/iov.c index 793788b5d2bc..4215baf7c3b9 100644 --- a/iov.c +++ b/iov.c @@ -170,7 +170,6 @@ size_t iov_size(const struct iovec *iov, size_t iov_cnt) * * Returns: The number of iovec needed to store @size bytes. */ -/* cppcheck-suppress unusedFunction */ size_t iov_count(const struct iovec *iov, size_t iov_cnt, size_t size, size_t *last_iov_length) { diff --git a/packet.c b/packet.c index af2a539a1794..3c5fc39df6d7 100644 --- a/packet.c +++ b/packet.c @@ -25,6 +25,12 @@ static int packet_check_range(const struct pool *p, size_t offset, size_t len, const char *start, const char *func, int line) { + ASSERT(p->buf); + + if (p->buf_size == 0) + return vu_packet_check_range((void *)p->buf, offset, len, start, + func, line); + if (start < p->buf) { if (func) { trace("add packet start %p before buffer start %p, " diff --git a/packet.h b/packet.h index 8377dcf678bb..0aec6d9410aa 100644 --- a/packet.h +++ b/packet.h @@ -22,6 +22,8 @@ struct pool { struct iovec pkt[1]; }; +int vu_packet_check_range(void *buf, size_t offset, size_t len, + const char *start, const char *func, int line); void packet_add_do(struct pool *p, size_t len, const char *start, const char *func, int line); void *packet_get_do(const struct pool *p, const size_t idx, diff --git a/passt.c b/passt.c index 9d21c545b9cf..8c0490782a7f 100644 --- a/passt.c +++ b/passt.c @@ -274,6 +274,7 @@ int main(int argc, char **argv) pasta_netns_quit_init(&c); tap_sock_init(&c); + vu_init(&c); secret_init(&c); @@ -367,11 +368,20 @@ loop: tcp_timer_handler(&c, ref); break; case EPOLL_TYPE_UDP: - udp_buf_sock_handler(&c, ref, eventmask, &now); + if (c.mode == MODE_VU) + udp_vu_sock_handler(&c, ref, eventmask, &now); + else + udp_buf_sock_handler(&c, ref, eventmask, &now); break; case EPOLL_TYPE_PING: icmp_sock_handler(&c, ref); break; + case EPOLL_TYPE_VHOST_CMD: + tap_handler_vu(&c, eventmask); + break; + case EPOLL_TYPE_VHOST_KICK: + vu_kick_cb(&c, ref); + break; default: /* Can't happen */ ASSERT(0); diff --git a/passt.h b/passt.h index af10d0bfe4ef..f15f28c89d39 
100644 --- a/passt.h +++ b/passt.h @@ -22,6 +22,7 @@ union epoll_ref; #include "fwd.h" #include "tcp.h" #include "udp.h" +#include "udp_vu.h" #include "vhost_user.h" /** @@ -122,6 +123,7 @@ struct fqdn { enum passt_modes { MODE_PASST, MODE_PASTA, + MODE_VU, }; /** diff --git a/pcap.c b/pcap.c index 507be2ac1edf..d4d0ec62b944 100644 --- a/pcap.c +++ b/pcap.c @@ -142,7 +142,6 @@ void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n, * containing packet data to write, including L2 header * @iovcnt: Number of buffers (@iov entries) */ -/* cppcheck-suppress unusedFunction */ void pcap_iov(const struct iovec *iov, size_t iovcnt) { struct timespec now; diff --git a/tap.c b/tap.c index be272d25b642..e3274d39131a 100644 --- a/tap.c +++ b/tap.c @@ -58,6 +58,7 @@ #include "packet.h" #include "tap.h" #include "log.h" +#include "vhost_user.h" /* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */ static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS, pkt_buf); @@ -78,16 +79,22 @@ void tap_send_single(const struct ctx *c, const void *data, size_t l2len) struct iovec iov[2]; size_t iovcnt = 0; - if (c->mode == MODE_PASST) { + switch (c->mode) { + case MODE_PASST: iov[iovcnt] = IOV_OF_LVALUE(vnet_len); iovcnt++; - } - - iov[iovcnt].iov_base = (void *)data; - iov[iovcnt].iov_len = l2len; - iovcnt++; + /* fall through */ + case MODE_PASTA: + iov[iovcnt].iov_base = (void *)data; + iov[iovcnt].iov_len = l2len; + iovcnt++; - tap_send_frames(c, iov, iovcnt, 1); + tap_send_frames(c, iov, iovcnt, 1); + break; + case MODE_VU: + vu_send(c, data, l2len); + break; + } } /** @@ -416,10 +423,19 @@ size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, if (!nframes) return 0; - if (c->mode == MODE_PASTA) + switch (c->mode) { + case MODE_PASTA: m = tap_send_frames_pasta(c, iov, bufs_per_frame, nframes); - else + break; + case MODE_PASST: m = tap_send_frames_passt(c, iov, bufs_per_frame, nframes); + break; + case MODE_VU: + ASSERT(0); + default:This should be an ASSERT(0) as well, yes? We shouldn't be able to get here without a mode being set.+ m = 0; + break; + } if (m < nframes) debug("tap: failed to send %zu frames of %zu", @@ -1180,11 +1196,17 @@ static void tap_sock_unix_init(struct ctx *c) ev.data.u64 = ref.u64; epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap_listen, &ev); - info("\nYou can now start qemu (>= 7.2, with commit 13c6be96618c):"); - info(" kvm ... -device virtio-net-pci,netdev=s -netdev stream,id=s,server=off,addr.type=unix,addr.path=%s", - c->sock_path); - info("or qrap, for earlier qemu versions:"); - info(" ./qrap 5 kvm ... -net socket,fd=5 -net nic,model=virtio"); + if (c->mode == MODE_VU) { + info("You can start qemu with:"); + info(" kvm ... -chardev socket,id=chr0,path=%s -netdev vhost-user,id=netdev0,chardev=chr0 -device virtio-net,netdev=netdev0 -object memory-backend-memfd,id=memfd0,share=on,size=$RAMSIZE -numa node,memdev=memfd0\n", + c->sock_path); + } else { + info("\nYou can now start qemu (>= 7.2, with commit 13c6be96618c):"); + info(" kvm ... -device virtio-net-pci,netdev=s -netdev stream,id=s,server=off,addr.type=unix,addr.path=%s", + c->sock_path); + info("or qrap, for earlier qemu versions:"); + info(" ./qrap 5 kvm ... 
-net socket,fd=5 -net nic,model=virtio"); + } } /** @@ -1194,8 +1216,8 @@ static void tap_sock_unix_init(struct ctx *c) */ void tap_listen_handler(struct ctx *c, uint32_t events) { - union epoll_ref ref = { .type = EPOLL_TYPE_TAP_PASST }; struct epoll_event ev = { 0 }; + union epoll_ref ref; int v = INT_MAX / 2; struct ucred ucred; socklen_t len; @@ -1235,7 +1257,13 @@ void tap_listen_handler(struct ctx *c, uint32_t events) trace("tap: failed to set SO_SNDBUF to %i", v); ref.fd = c->fd_tap; - ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP; + if (c->mode == MODE_VU) { + ref.type = EPOLL_TYPE_VHOST_CMD; + ev.events = EPOLLIN | EPOLLRDHUP; + } else { + ref.type = EPOLL_TYPE_TAP_PASST; + ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET; + } ev.data.u64 = ref.u64; epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev); } @@ -1324,10 +1352,22 @@ void tap_sock_init(struct ctx *c) pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS, pkt_buf, sz); pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS, pkt_buf, sz); + if (c->mode == MODE_VU) { + pool_tap4_storage.buf = NULL; + pool_tap4_storage.buf_size = 0; + pool_tap6_storage.buf = NULL; + pool_tap6_storage.buf_size = 0;It seems a bit of a layering violation to initialize the pool with PACKET_INIT() then mangle its internals in the vhost-user case. Could we use a different PACKET_INIT invocation for the VU case instead?+ } for (i = 0; i < TAP_SEQS; i++) { tap4_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, pkt_buf, sz); tap6_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, pkt_buf, sz); + if (c->mode == MODE_VU) { + tap4_l4[i].p.buf = NULL; + tap4_l4[i].p.buf_size = 0; + tap6_l4[i].p.buf = NULL; + tap6_l4[i].p.buf_size = 0;Same here, of course.+ } } if (c->fd_tap != -1) { /* Passed as --fd */ @@ -1336,12 +1376,21 @@ void tap_sock_init(struct ctx *c) ASSERT(c->one_off); ref.fd = c->fd_tap; - if (c->mode == MODE_PASST) + switch (c->mode) { + case MODE_PASST: ref.type = EPOLL_TYPE_TAP_PASST; - else + ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP; + break; + case MODE_PASTA: ref.type = EPOLL_TYPE_TAP_PASTA; + ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP; + break; + case MODE_VU: + ref.type = EPOLL_TYPE_VHOST_CMD; + ev.events = EPOLLIN | EPOLLRDHUP; + break;I suspect one of our static checkers will complain at some point if we don't put a default case with an ASSERT here.+ } - ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP; ev.data.u64 = ref.u64; epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev); return; diff --git a/tap.h b/tap.h index 3b2dde41ae8d..d9c6d4f57093 100644 --- a/tap.h +++ b/tap.h @@ -40,7 +40,8 @@ static inline struct iovec tap_hdr_iov(const struct ctx *c, */ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len) { - thdr->vnet_len = htonl(l2len); + if (thdr) + thdr->vnet_len = htonl(l2len); } struct in_addr tap_ip4_daddr(const struct ctx *c); diff --git a/tcp.c b/tcp.c index 68524235347c..8709dd6d97bb 100644 --- a/tcp.c +++ b/tcp.c @@ -304,6 +304,7 @@ #include "flow_table.h" #include "tcp_internal.h" #include "tcp_buf.h" +#include "tcp_vu.h" #define TCP_HASH_TABLE_LOAD 70 /* % */ #define TCP_HASH_TABLE_SIZE (FLOW_MAX * 100 / TCP_HASH_TABLE_LOAD) @@ -1049,7 +1050,10 @@ static size_t tcp_fill_headers4(const struct ctx *c, tcp_fill_header(th, conn, seq); - tcp_update_check_tcp4(iph, th); + if (c->mode != MODE_VU) + tcp_update_check_tcp4(iph, th); + else + th->check = 0; tap_hdr_update(taph, l3len + sizeof(struct ethhdr)); @@ -1094,7 +1098,10 @@ static size_t tcp_fill_headers6(const struct ctx *c, tcp_fill_header(th, conn, seq); - 
tcp_update_check_tcp6(ip6h, th); + if (c->mode != MODE_VU) + tcp_update_check_tcp6(ip6h, th); + else + th->check = 0; tap_hdr_update(taph, l4len + sizeof(*ip6h) + sizeof(struct ethhdr)); @@ -1362,6 +1369,9 @@ int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn, */ int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) { + if (c->mode == MODE_VU) + return tcp_vu_send_flag(c, conn, flags); + return tcp_buf_send_flag(c, conn, flags); } @@ -1808,6 +1818,9 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq) */ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) { + if (c->mode == MODE_VU) + return tcp_vu_data_from_sock(c, conn); + return tcp_buf_data_from_sock(c, conn); } diff --git a/tcp_vu.c b/tcp_vu.c new file mode 100644 index 000000000000..f27890f63c0e --- /dev/null +++ b/tcp_vu.c @@ -0,0 +1,547 @@ +// SPDX-License-Identifier: GPL-2.0-or-laterNeeds Copyright notice, author information and general description here.+ +#include <errno.h> +#include <stddef.h> +#include <stdint.h> + +#include <netinet/ip.h> + +#include <sys/socket.h> + +#include <linux/tcp.h> +#include <linux/virtio_net.h> + +#include "util.h" +#include "ip.h" +#include "passt.h" +#include "siphash.h" +#include "inany.h" +#include "vhost_user.h" +#include "tcp.h" +#include "pcap.h" +#include "flow.h" +#include "tcp_conn.h" +#include "flow_table.h" +#include "tcp_vu.h" +#include "tcp_internal.h" +#include "checksum.h" + +#define CONN_V4(conn) (!!inany_v4(&(conn)->faddr)) +#define CONN_V6(conn) (!CONN_V4(conn)) + +/** + * struct tcp_payload_t - TCP header and data to send segments with payload + * @th: TCP header + * @data: TCP data + */ +struct tcp_payload_t { + struct tcphdr th; + uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)]; +};This could be common with tcp_buf.c, couldn't it?+ +/** + * struct tcp_flags_t - TCP header and data to send zero-length + * segments (flags) + * @th: TCP header + * @opts TCP options + */ +struct tcp_flags_t { + struct tcphdr th; + char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; +};Likewise here.+ +/* vhost-user */ +static const struct virtio_net_hdr vu_header = { + .flags = VIRTIO_NET_HDR_F_DATA_VALID, + .gso_type = VIRTIO_NET_HDR_GSO_NONE, +}; + +int tcp_vu_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) +{ + VuDev *vdev = (VuDev *)&c->vdev; + VuVirtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; + size_t tlen, vnet_hdrlen, l4len, optlen; + struct virtio_net_hdr_mrg_rxbuf *vh; + struct iovec l2_iov[TCP_NUM_IOVS]; + VuVirtqElement elem; + struct iovec in_sg; + struct ethhdr *eh; + int nb_ack; + int ret; + + elem.out_num = 0; + elem.out_sg = NULL; + elem.in_num = 1; + elem.in_sg = &in_sg; + ret = vu_queue_pop(vdev, vq, &elem); + if (ret < 0) + return 0; + + if (elem.in_num < 1) { + err("virtio-net receive queue contains no in buffers"); + vu_queue_rewind(vdev, vq, 1); + return 0; + } + + vh = elem.in_sg[0].iov_base;AFAICT, the code below requires that in_sg[0] be large enough to contain the frame, plus a virtio_net_hdr_mrg_rxbuf. Seems like that we should ASSERT() that somewhere. 
If I'm understanding correctly that the virtio_net_hdr_mrg_rxbuf is a kind of pseudo-header you need for each frame, I'm wondering if it could be integrated into the tap_hdr mechanisms.+ + vh->hdr = vu_header; + if (vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) { + vnet_hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);sizeof(*vh) maybe?+ vh->num_buffers = htole16(1); + } else { + vnet_hdrlen = sizeof(struct virtio_net_hdr); + } + + l2_iov[TCP_IOV_TAP].iov_base = NULL; + l2_iov[TCP_IOV_TAP].iov_len = 0; + l2_iov[TCP_IOV_ETH].iov_base = (char *)elem.in_sg[0].iov_base + vnet_hdrlen; + l2_iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr); + + eh = l2_iov[TCP_IOV_ETH].iov_base;You could initialise eh first, then set l2_iov[TCP_IOV_ETH] using IOV_OF_LVALUE().+ + memcpy(eh->h_dest, c->mac_guest, sizeof(eh->h_dest)); + memcpy(eh->h_source, c->mac, sizeof(eh->h_source)); + + if (CONN_V4(conn)) { + struct tcp_flags_t *payload; + struct iphdr *iph; + uint32_t seq; + + l2_iov[TCP_IOV_IP].iov_base = (char *)l2_iov[TCP_IOV_ETH].iov_base + + l2_iov[TCP_IOV_ETH].iov_len; + l2_iov[TCP_IOV_IP].iov_len = sizeof(struct iphdr); + l2_iov[TCP_IOV_PAYLOAD].iov_base = (char *)l2_iov[TCP_IOV_IP].iov_base + + l2_iov[TCP_IOV_IP].iov_len; +Similar thing for iph and TCP_IOV_IP.+ eh->h_proto = htons(ETH_P_IP); + + iph = l2_iov[TCP_IOV_IP].iov_base; + *iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);Hrm... once l2_iov[TCP_IOV_IP] is set up like this, couldn't you share the actual initialisation of the header fields with the tcp_buf code?+ payload = l2_iov[TCP_IOV_PAYLOAD].iov_base; + payload->th = (struct tcphdr){ + .doff = offsetof(struct tcp_flags_t, opts) / 4, + .ack = 1 + };Similarly the TCP header.+ seq = conn->seq_to_tap; + ret = tcp_prepare_flags(c, conn, flags, &payload->th, payload->opts, &optlen); + if (ret <= 0) { + vu_queue_rewind(vdev, vq, 1); + return ret; + } + + l4len = tcp_l2_buf_fill_headers(c, conn, l2_iov, optlen, NULL, + seq);Hrm, I guess you sort of are, but I wonder if we can make a longer stretch of code common here.+ /* cppcheck-suppress unreadVariable */ + l2_iov[TCP_IOV_PAYLOAD].iov_len = l4len; + + tlen = l4len + sizeof(*iph) + sizeof(struct ethhdr);'l2len' would be the preferred name for this quantity now.+ + if (*c->pcap) { + uint32_t sum = proto_ipv4_header_psum(l4len, + IPPROTO_TCP, + /* cppcheck-suppress unknownEvaluationOrder */ + (struct in_addr){ .s_addr = iph->saddr },I think using locals of type struct in_addr would be cleaner here, and avoid the cppcheck warning more elegantly. 
> + (struct in_addr){ .s_addr = iph->daddr }); > + > + payload->th.check = 0; > + payload->th.check = csum(&payload->th, optlen + sizeof(struct tcphdr), sum); > + } > + } else { > + struct tcp_flags_t *payload; > + struct ipv6hdr *ip6h; > + uint32_t seq; > + > + l2_iov[TCP_IOV_IP].iov_base = (char *)l2_iov[TCP_IOV_ETH].iov_base + > + l2_iov[TCP_IOV_ETH].iov_len; > + l2_iov[TCP_IOV_IP].iov_len = sizeof(struct ipv6hdr); > + l2_iov[TCP_IOV_PAYLOAD].iov_base = (char *)l2_iov[TCP_IOV_IP].iov_base + > + l2_iov[TCP_IOV_IP].iov_len; > + > + eh->h_proto = htons(ETH_P_IPV6); > + > + ip6h = l2_iov[TCP_IOV_IP].iov_base; > + *ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP); > ++ payload = l2_iov[TCP_IOV_PAYLOAD].iov_base; + payload->th = (struct tcphdr){ + .doff = offsetof(struct tcp_flags_t, opts) / 4, + .ack = 1 + };> ++ seq = conn->seq_to_tap; + ret = tcp_prepare_flags(c, conn, flags, &payload->th, payload->opts, &optlen); + if (ret <= 0) { + vu_queue_rewind(vdev, vq, 1); + return ret; + } + + l4len = tcp_l2_buf_fill_headers(c, conn, l2_iov, optlen, NULL, + seq);> + /* cppcheck-suppress unreadVariable */ > + l2_iov[TCP_IOV_PAYLOAD].iov_len = l4len; > + > + tlen = l4len + sizeof(*ip6h) + sizeof(struct ethhdr); > + > + if (*c->pcap) { > + uint32_t sum = proto_ipv6_header_psum(l4len, > + IPPROTO_TCP, > + &ip6h->saddr, > + &ip6h->daddr); > + > + payload->th.check = 0; > + payload->th.check = csum(&payload->th, optlen + sizeof(struct tcphdr), sum); > + } > + } > + > + pcap((void *)eh, tlen); > + > + tlen += vnet_hdrlen; > + vu_queue_fill(vdev, vq, &elem, tlen, 0); > + nb_ack = 1; > + > + if (flags & DUP_ACK) { > + VuVirtqElement elem_dup; > + struct iovec in_sg_dup; > + > + elem_dup.out_num = 0; > + elem_dup.out_sg = NULL; > + elem_dup.in_num = 1; > + elem_dup.in_sg = &in_sg_dup; > + ret = vu_queue_pop(vdev, vq, &elem_dup); > + if (ret == 0) { > + if (elem_dup.in_num < 1 || elem_dup.in_sg[0].iov_len < tlen) { > + vu_queue_rewind(vdev, vq, 1); > + } else { > + memcpy(elem_dup.in_sg[0].iov_base, vh, tlen); > + nb_ack++; > + } > + } > + } > + > + vu_queue_flush(vdev, vq, nb_ack); > + vu_queue_notify(vdev, vq); > + > + return 0; > +} > + > +int tcp_vu_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) > +{ > + uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; > + static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE]; > + static VuVirtqElement elem[VIRTQUEUE_MAX_SIZE]; > + static struct iovec in_sg[VIRTQUEUE_MAX_SIZE]; > + VuDev *vdev = (VuDev *)&c->vdev; > + VuVirtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; > + size_t l2_hdrlen, vnet_hdrlen, fillsize; > + int s = conn->sock, v4 = CONN_V4(conn); > + struct iovec l2_iov[TCP_NUM_IOVS]; > + int i, ret, iov_cnt, iov_used; > + struct msghdr mh_sock = { 0 }; > + uint16_t mss = MSS_GET(conn); > + static int in_sg_count; > + uint32_t already_sent; > + const uint16_t *check; > + struct iovec *first; > + bool has_mrg_rxbuf; > + int segment_size; > + int num_buffers; > + ssize_t len; > + > + if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) { > + err("Got packet, but no available descriptors on RX virtq."); flow_err() to give more useful information here.+ return 0; + } + + already_sent = conn->seq_to_tap - conn->seq_ack_from_tap; + + if (SEQ_LT(already_sent, 0)) { + /* RFC 761, section 2.1. 
*/ + flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u", + conn->seq_ack_from_tap, conn->seq_to_tap); + conn->seq_to_tap = conn->seq_ack_from_tap; + already_sent = 0; + } + + if (!wnd_scaled || already_sent >= wnd_scaled) { + conn_flag(c, conn, STALLED); + conn_flag(c, conn, ACK_FROM_TAP_DUE); + return 0; + }Duplicating some of this subtle TCP core logic between the buf and vu paths worries me quite a bit :/.+ + /* Set up buffer descriptors we'll fill completely and partially. */ + + fillsize = wnd_scaled; + + iov_vu[0].iov_base = tcp_buf_discard; + iov_vu[0].iov_len = already_sent; + fillsize -= already_sent; + + has_mrg_rxbuf = vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF); + if (has_mrg_rxbuf) + vnet_hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf); + else + vnet_hdrlen = sizeof(struct virtio_net_hdr); + l2_hdrlen = vnet_hdrlen + sizeof(struct ethhdr) + sizeof(struct tcphdr);"l2_hdrlen" is a bad name, since this is including headers both "above" L2 (tcp, IP) and "below" L2 (vnet_hdrlen).+ if (v4) + l2_hdrlen += sizeof(struct iphdr); + else + l2_hdrlen += sizeof(struct ipv6hdr); + + iov_cnt = 0; + in_sg_count = 0; + segment_size = 0; + while (fillsize > 0 && iov_cnt < VIRTQUEUE_MAX_SIZE - 1 && + in_sg_count < ARRAY_SIZE(in_sg)) { + + elem[iov_cnt].out_num = 0; + elem[iov_cnt].out_sg = NULL; + elem[iov_cnt].in_num = ARRAY_SIZE(in_sg) - in_sg_count; + elem[iov_cnt].in_sg = &in_sg[in_sg_count]; + ret = vu_queue_pop(vdev, vq, &elem[iov_cnt]); + if (ret < 0) + break; + + if (elem[iov_cnt].in_num < 1) { + err("virtio-net receive queue contains no in buffers"); + goto err; + } + in_sg_count += elem[iov_cnt].in_num; + + ASSERT(elem[iov_cnt].in_num == 1); + ASSERT(elem[iov_cnt].in_sg[0].iov_len >= l2_hdrlen); + + if (segment_size == 0) { + iov_vu[iov_cnt + 1].iov_base = + (char *)elem[iov_cnt].in_sg[0].iov_base + l2_hdrlen; + iov_vu[iov_cnt + 1].iov_len = + elem[iov_cnt].in_sg[0].iov_len - l2_hdrlen;Do we need to verify somehwere that this buffer is large enough for all the headers?+ } else { + iov_vu[iov_cnt + 1].iov_base = elem[iov_cnt].in_sg[0].iov_base; + iov_vu[iov_cnt + 1].iov_len = elem[iov_cnt].in_sg[0].iov_len; + } + + if (iov_vu[iov_cnt + 1].iov_len > fillsize) + iov_vu[iov_cnt + 1].iov_len = fillsize; + + segment_size += iov_vu[iov_cnt + 1].iov_len; + if (!has_mrg_rxbuf) { + segment_size = 0; + } else if (segment_size >= mss) { + iov_vu[iov_cnt + 1].iov_len -= segment_size - mss; + segment_size = 0; + }If I'm understanding this correctly, we're adjusting the size of each TCP packet we generate to match the size of the buffer that the guest provides. Is that correct? In practice, how large are these buffers going to me - could this dramatically our typical segment size in practice?+ fillsize -= iov_vu[iov_cnt + 1].iov_len; + + iov_cnt++; + } + if (iov_cnt == 0) + return 0; + + ret = 0; + mh_sock.msg_iov = iov_vu; + mh_sock.msg_iovlen = iov_cnt + 1; + + do + len = recvmsg(s, &mh_sock, MSG_PEEK); + while (len < 0 && errno == EINTR); + + if (len < 0) + goto err; + + if (!len) { + vu_queue_rewind(vdev, vq, iov_cnt); + if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) { + ret = tcp_vu_send_flag(c, conn, FIN | ACK); + if (ret) { + tcp_rst(c, conn); + return ret; + } + + conn_event(c, conn, TAP_FIN_SENT); + } + + return 0; + } + + len -= already_sent; + if (len <= 0) { + conn_flag(c, conn, STALLED); + vu_queue_rewind(vdev, vq, iov_cnt); + return 0; + } + + conn_flag(c, conn, ~STALLED); + + /* Likely, some new data was acked too. 
*/ + tcp_update_seqack_wnd(c, conn, 0, NULL); + + /* initialize headers */ + iov_used = 0; + num_buffers = 0; + check = NULL; + segment_size = 0; + for (i = 0; i < iov_cnt && len; i++) { + + if (segment_size == 0) + first = &iov_vu[i + 1]; + + if (iov_vu[i + 1].iov_len > (size_t)len) + iov_vu[i + 1].iov_len = len; + + len -= iov_vu[i + 1].iov_len; + iov_used++; + + segment_size += iov_vu[i + 1].iov_len; + num_buffers++; + + if (segment_size >= mss || len == 0 || + i + 1 == iov_cnt || !has_mrg_rxbuf) { + char *base = (char *)first->iov_base - l2_hdrlen; + size_t size = first->iov_len + l2_hdrlen; + struct virtio_net_hdr_mrg_rxbuf *vh; + struct ethhdr *eh; + size_t l4len; + + vh = (struct virtio_net_hdr_mrg_rxbuf *)base; + + vh->hdr = vu_header; + if (has_mrg_rxbuf) + vh->num_buffers = htole16(num_buffers); + + l2_iov[TCP_IOV_TAP].iov_base = NULL; + l2_iov[TCP_IOV_TAP].iov_len = 0; + l2_iov[TCP_IOV_ETH].iov_base = base + vnet_hdrlen; + l2_iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr); + + eh = l2_iov[TCP_IOV_ETH].iov_base;Again, this could be done a bit more neatly using IOV_OF_LVALUE(). Also... IIUC, the only purpose of l2_iov[] here is to communicate the various pieces of the packet to the header initiailizing functions. That seems like a poor choice compared to specifically typed pointers for each piece (which, yes, I realise would mean a bunch of changes in the existing code).+ memcpy(eh->h_dest, c->mac_guest, sizeof(eh->h_dest)); + memcpy(eh->h_source, c->mac, sizeof(eh->h_source)); + + /* initialize header */ + if (v4) { + struct tcp_payload_t *payload; + struct iphdr *iph; + + l2_iov[TCP_IOV_IP].iov_base = (char *)l2_iov[TCP_IOV_ETH].iov_base + + l2_iov[TCP_IOV_ETH].iov_len; + l2_iov[TCP_IOV_IP].iov_len = sizeof(struct iphdr); + l2_iov[TCP_IOV_PAYLOAD].iov_base = (char *)l2_iov[TCP_IOV_IP].iov_base + + l2_iov[TCP_IOV_IP].iov_len; + + + eh->h_proto = htons(ETH_P_IP); + + iph = l2_iov[TCP_IOV_IP].iov_base; + *iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP); + payload = l2_iov[TCP_IOV_PAYLOAD].iov_base; + payload->th = (struct tcphdr){ + .doff = offsetof(struct tcp_payload_t, data) / 4, + .ack = 1 + }; + + l4len = tcp_l2_buf_fill_headers(c, conn, l2_iov, + segment_size, + len ? 
check : NULL, + conn->seq_to_tap); + l2_iov[TCP_IOV_PAYLOAD].iov_len = l4len; + + if (*c->pcap) { + uint32_t sum = proto_ipv4_header_psum(l4len, + IPPROTO_TCP, + /* cppcheck-suppress unknownEvaluationOrder */ + (struct in_addr){ .s_addr = iph->saddr }, + (struct in_addr){ .s_addr = iph->daddr }); + + first->iov_base = &payload->th; + first->iov_len = size - l2_hdrlen + sizeof(struct tcphdr); + payload->th.check = 0; + payload->th.check = csum_iov(first, num_buffers, sum); + } + + check = &iph->check; + } else { + struct tcp_payload_t *payload; + struct ipv6hdr *ip6h; + + l2_iov[TCP_IOV_IP].iov_base = (char *)l2_iov[TCP_IOV_ETH].iov_base + + l2_iov[TCP_IOV_ETH].iov_len; + l2_iov[TCP_IOV_IP].iov_len = sizeof(struct ipv6hdr); + l2_iov[TCP_IOV_PAYLOAD].iov_base = (char *)l2_iov[TCP_IOV_IP].iov_base + + l2_iov[TCP_IOV_IP].iov_len; + + + eh->h_proto = htons(ETH_P_IPV6); + + ip6h = l2_iov[TCP_IOV_IP].iov_base; + *ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP); + + payload = l2_iov[TCP_IOV_PAYLOAD].iov_base; + payload->th = (struct tcphdr){ + .doff = offsetof(struct tcp_payload_t, data) / 4, + .ack = 1 + }; +; + l4len = tcp_l2_buf_fill_headers(c, conn, l2_iov, + segment_size, + NULL, conn->seq_to_tap); + l2_iov[TCP_IOV_PAYLOAD].iov_len = l4len; + + if (*c->pcap) { + uint32_t sum = proto_ipv6_header_psum(l4len, + IPPROTO_TCP, + &ip6h->saddr, + &ip6h->daddr); + + first->iov_base = &payload->th; + first->iov_len = size - l2_hdrlen + sizeof(struct tcphdr); + + payload->th.check = 0; + payload->th.check = csum_iov(first, num_buffers, sum); + } + } + + /* set iov for pcap logging */ + first->iov_base = eh; + first->iov_len = size - vnet_hdrlen; + + pcap_iov(first, num_buffers); + + /* set iov_len for vu_queue_fill_by_index(); */ + + first->iov_base = base; + first->iov_len = size; + + conn->seq_to_tap += segment_size; + + segment_size = 0; + num_buffers = 0; + } + } + + /* release unused buffers */ + vu_queue_rewind(vdev, vq, iov_cnt - iov_used); + + /* send packets */ + for (i = 0; i < iov_used; i++) + vu_queue_fill(vdev, vq, &elem[i], iov_vu[i + 1].iov_len, i); + + vu_queue_flush(vdev, vq, iov_used); + vu_queue_notify(vdev, vq); + + conn_flag(c, conn, ACK_FROM_TAP_DUE); + + return 0; +err: + vu_queue_rewind(vdev, vq, iov_cnt); + + if (errno != EAGAIN && errno != EWOULDBLOCK) { + ret = -errno; + tcp_rst(c, conn); + } + + return ret; +} diff --git a/tcp_vu.h b/tcp_vu.h new file mode 100644 index 000000000000..b8c57a543ed5 --- /dev/null +++ b/tcp_vu.h @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#ifndef TCP_VU_H +#define TCP_VU_H + +int tcp_vu_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags); +int tcp_vu_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn); + +#endif /*TCP_VU_H */ diff --git a/udp.c b/udp.c index dba75d7fecbd..90d58b691c83 100644 --- a/udp.c +++ b/udp.c @@ -121,9 +121,7 @@ #include "tap.h" #include "pcap.h" #include "log.h" - -#define UDP_CONN_TIMEOUT 180 /* s, timeout for ephemeral or local bind */ -#define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */ +#include "udp_internal.h" /** * struct udp_tap_port - Port tracking based on tap-facing source port @@ -171,20 +169,8 @@ static uint8_t udp_act[IP_VERSIONS][UDP_ACT_TYPE_MAX][DIV_ROUND_UP(NUM_PORTS, 8) /* Static buffers */ -/** - * struct udp_payload_t - UDP header and data for inbound messages - * @uh: UDP header - * @data: UDP data - */ -static struct udp_payload_t { - struct udphdr uh; - char data[USHRT_MAX - sizeof(struct udphdr)]; -#ifdef __AVX2__ -} __attribute__ ((packed, 
aligned(32))) -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) -#endif -udp_payload[UDP_MAX_FRAMES]; +/* UDP header and data for inbound messages */ +static struct udp_payload_t udp_payload[UDP_MAX_FRAMES]; /* Ethernet header for IPv4 frames */ static struct ethhdr udp4_eth_hdr; @@ -239,11 +225,11 @@ static struct mmsghdr udp6_l2_mh_sock [UDP_MAX_FRAMES]; /* recvmmsg()/sendmmsg() data for "spliced" connections */ static struct iovec udp_iov_splice [UDP_MAX_FRAMES]; -static struct sockaddr_in udp4_localname = { +struct sockaddr_in udp4_localname = { .sin_family = AF_INET, .sin_addr = IN4ADDR_LOOPBACK_INIT, }; -static struct sockaddr_in6 udp6_localname = { +struct sockaddr_in6 udp6_localname = { .sin6_family = AF_INET6, .sin6_addr = IN6ADDR_LOOPBACK_INIT, }; @@ -564,11 +550,11 @@ static void udp_splice_sendfrom(const struct ctx *c, unsigned start, unsigned n, * * Return: size of IPv4 payload (UDP header + data) */ -static size_t udp_update_hdr4(const struct ctx *c, - struct iphdr *ip4h, const struct sockaddr_in *s_in, - struct udp_payload_t *bp, - in_port_t dstport, size_t dlen, - const struct timespec *now) +size_t udp_update_hdr4(const struct ctx *c, + struct iphdr *ip4h, const struct sockaddr_in *s_in, + struct udp_payload_t *bp, + in_port_t dstport, size_t dlen, + const struct timespec *now) { const struct in_addr dst = c->ip4.addr_seen; in_port_t srcport = ntohs(s_in->sin_port); @@ -603,7 +589,10 @@ static size_t udp_update_hdr4(const struct ctx *c, bp->uh.source = s_in->sin_port; bp->uh.dest = htons(dstport); bp->uh.len = htons(l4len); - csum_udp4(&bp->uh, src, dst, bp->data, dlen); + if (c->mode != MODE_VU) + csum_udp4(&bp->uh, src, dst, bp->data, dlen); + else + bp->uh.check = 0; return l4len; } @@ -620,11 +609,11 @@ static size_t udp_update_hdr4(const struct ctx *c, * * Return: size of IPv6 payload (UDP header + data) */ -static size_t udp_update_hdr6(const struct ctx *c, - struct ipv6hdr *ip6h, struct sockaddr_in6 *s_in6, - struct udp_payload_t *bp, - in_port_t dstport, size_t dlen, - const struct timespec *now) +size_t udp_update_hdr6(const struct ctx *c, + struct ipv6hdr *ip6h, struct sockaddr_in6 *s_in6, + struct udp_payload_t *bp, + in_port_t dstport, size_t dlen, + const struct timespec *now) { const struct in6_addr *src = &s_in6->sin6_addr; const struct in6_addr *dst = &c->ip6.addr_seen; @@ -675,7 +664,10 @@ static size_t udp_update_hdr6(const struct ctx *c, bp->uh.source = s_in6->sin6_port; bp->uh.dest = htons(dstport); bp->uh.len = ip6h->payload_len; - csum_udp6(&bp->uh, src, dst, bp->data, dlen); + if (c->mode != MODE_VU) + csum_udp6(&bp->uh, src, dst, bp->data, dlen); + else + bp->uh.check = 0xffff; /* zero checksum is invalid with IPv6 */ return l4len; } diff --git a/udp_internal.h b/udp_internal.h new file mode 100644 index 000000000000..898d1e103cb8 --- /dev/null +++ b/udp_internal.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright (c) 2021 Red Hat GmbH + * Author: Stefano Brivio <sbrivio(a)redhat.com> + */ + +#ifndef UDP_INTERNAL_H +#define UDP_INTERNAL_H + +#define UDP_CONN_TIMEOUT 180 /* s, timeout for ephemeral or local bind */ +#define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */ + +extern struct sockaddr_in udp4_localname; +extern struct sockaddr_in6 udp6_localname; + +/** + * struct udp_payload_t - UDP header and data for inbound messages + * @uh: UDP header + * @data: UDP data + */ +struct udp_payload_t { + struct udphdr uh; + char data[USHRT_MAX - sizeof(struct udphdr)]; +#ifdef __AVX2__ +} 
__attribute__ ((packed, aligned(32))); +#else +} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); +#endif + +size_t udp_update_hdr4(const struct ctx *c, + struct iphdr *ip4h, const struct sockaddr_in *s_in, + struct udp_payload_t *bp, + in_port_t dstport, size_t dlen, + const struct timespec *now); +size_t udp_update_hdr6(const struct ctx *c, + struct ipv6hdr *ip6h, struct sockaddr_in6 *s_in6, + struct udp_payload_t *bp, + in_port_t dstport, size_t dlen, + const struct timespec *now); +#endif /* UDP_INTERNAL_H */ diff --git a/udp_vu.c b/udp_vu.c new file mode 100644 index 000000000000..deb649028153 --- /dev/null +++ b/udp_vu.c @@ -0,0 +1,237 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <unistd.h> +#include <net/ethernet.h> +#include <net/if.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/udp.h> +#include <stdint.h> +#include <stddef.h> +#include <sys/uio.h> +#include <linux/virtio_net.h> + +#include "checksum.h" +#include "util.h" +#include "ip.h" +#include "passt.h" +#include "pcap.h" +#include "log.h" +#include "vhost_user.h" +#include "udp_internal.h" +#include "udp_vu.h" + +/* vhost-user */ +static const struct virtio_net_hdr vu_header = { + .flags = VIRTIO_NET_HDR_F_DATA_VALID, + .gso_type = VIRTIO_NET_HDR_GSO_NONE, +}; + +static struct iovec iov_vu [VIRTQUEUE_MAX_SIZE]; +static VuVirtqElement elem [VIRTQUEUE_MAX_SIZE]; +static struct iovec in_sg[VIRTQUEUE_MAX_SIZE]; +static int in_sg_count; + +void udp_vu_sock_handler(const struct ctx *c, union epoll_ref ref, + uint32_t events, const struct timespec *now) +{ + VuDev *vdev = (VuDev *)&c->vdev; + VuVirtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; + bool has_mrg_rxbuf, v6 = ref.udp.v6; + in_port_t dstport = ref.udp.port; + size_t l2_hdrlen, vnet_hdrlen; + struct msghdr msg; + int i, virtqueue_max;As with TCP, it kind of feels like we should be able to share more of the skeleton of this path. 
I'm worried about the amount of logic duplication we have, in terms of maintainability.+ + if (c->no_udp || !(events & EPOLLIN)) + return; + + has_mrg_rxbuf = vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF); + if (has_mrg_rxbuf) { + vnet_hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf); + virtqueue_max = VIRTQUEUE_MAX_SIZE; + } else { + vnet_hdrlen = sizeof(struct virtio_net_hdr); + virtqueue_max = 1; + } + l2_hdrlen = vnet_hdrlen + sizeof(struct ethhdr) + sizeof(struct udphdr); + + if (v6) { + l2_hdrlen += sizeof(struct ipv6hdr); + + udp6_localname.sin6_port = htons(dstport); + msg.msg_name = &udp6_localname; + msg.msg_namelen = sizeof(udp6_localname); + } else { + l2_hdrlen += sizeof(struct iphdr); + + udp4_localname.sin_port = htons(dstport); + msg.msg_name = &udp4_localname; + msg.msg_namelen = sizeof(udp4_localname); + } + + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + for (i = 0; i < UDP_MAX_FRAMES; i++) { + struct virtio_net_hdr_mrg_rxbuf *vh; + size_t size, fillsize, remaining; + int iov_cnt, iov_used; + struct ethhdr *eh; + ssize_t data_len; + size_t l4len; + char *base; + + fillsize = USHRT_MAX; + iov_cnt = 0; + in_sg_count = 0; + while (fillsize && iov_cnt < virtqueue_max && + in_sg_count < ARRAY_SIZE(in_sg)) { + int ret; + + elem[iov_cnt].out_num = 0; + elem[iov_cnt].out_sg = NULL; + elem[iov_cnt].in_num = ARRAY_SIZE(in_sg) - in_sg_count; + elem[iov_cnt].in_sg = &in_sg[in_sg_count]; + ret = vu_queue_pop(vdev, vq, &elem[iov_cnt]); + if (ret < 0) + break; + in_sg_count += elem[iov_cnt].in_num; + + if (elem[iov_cnt].in_num < 1) { + err("virtio-net receive queue contains no in buffers"); + vu_queue_rewind(vdev, vq, iov_cnt); + return; + } + ASSERT(elem[iov_cnt].in_num == 1); + ASSERT(elem[iov_cnt].in_sg[0].iov_len >= l2_hdrlen); + + if (iov_cnt == 0) { + base = elem[iov_cnt].in_sg[0].iov_base; + size = elem[iov_cnt].in_sg[0].iov_len; + + /* keep space for the headers */ + iov_vu[0].iov_base = base + l2_hdrlen; + iov_vu[0].iov_len = size - l2_hdrlen; + } else { + iov_vu[iov_cnt].iov_base = elem[iov_cnt].in_sg[0].iov_base; + iov_vu[iov_cnt].iov_len = elem[iov_cnt].in_sg[0].iov_len; + } + + if (iov_vu[iov_cnt].iov_len > fillsize) + iov_vu[iov_cnt].iov_len = fillsize; + + fillsize -= iov_vu[iov_cnt].iov_len; + + iov_cnt++; + } + if (iov_cnt == 0) + break; + + msg.msg_iov = iov_vu; + msg.msg_iovlen = iov_cnt; + + data_len = recvmsg(ref.fd, &msg, 0); + if (data_len < 0) { + vu_queue_rewind(vdev, vq, iov_cnt); + return; + } + + /* restore original values */ + iov_vu[0].iov_base = base; + iov_vu[0].iov_len = size; + + /* count the numbers of buffer filled by recvmsg() */ + iov_used = iov_count(iov_vu, iov_cnt, l2_hdrlen + data_len, + &remaining); + ASSERT(iov_used <= iov_cnt); + if (iov_used > 0) { + ASSERT(iov_vu[iov_used - 1].iov_len >= remaining); + iov_vu[iov_used - 1].iov_len = remaining; + /* update size */ + if (iov_used - 1 == 0) + size = iov_vu[0].iov_len; + } + + /* release unused buffers */ + vu_queue_rewind(vdev, vq, iov_cnt - iov_used); + + /* vnet_header */ + vh = (struct virtio_net_hdr_mrg_rxbuf *)base; + vh->hdr = vu_header; + if (has_mrg_rxbuf) + vh->num_buffers = htole16(iov_used); + + /* ethernet header */ + eh = (struct ethhdr *)(base + vnet_hdrlen); + + memcpy(eh->h_dest, c->mac_guest, sizeof(eh->h_dest)); + memcpy(eh->h_source, c->mac, sizeof(eh->h_source)); + + /* initialize header */ + if (v6) { + struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1); + struct udp_payload_t *bp = (struct udp_payload_t *)(ip6h + 1); + + eh->h_proto = 
htons(ETH_P_IPV6); + + *ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_UDP); + + l4len = udp_update_hdr6(c, ip6h, &udp6_localname, bp, + dstport, data_len, now); + if (*c->pcap) { + uint32_t sum; + + sum = proto_ipv6_header_psum(l4len, IPPROTO_UDP, + &ip6h->saddr, + &ip6h->daddr); + + iov_vu[0].iov_base = &bp->uh; + iov_vu[0].iov_len = size - l2_hdrlen + + sizeof(bp->uh); + bp->uh.check = 0; /* by default, set to 0xffff */ + bp->uh.check = csum_iov(iov_vu, iov_used, sum); + } + } else { + struct iphdr *iph = (struct iphdr *)(eh + 1); + struct udp_payload_t *bp = (struct udp_payload_t *)(iph + 1); + + eh->h_proto = htons(ETH_P_IP); + + *iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_UDP); + + l4len = udp_update_hdr4(c, iph, &udp4_localname, bp, + dstport, data_len, now); + if (*c->pcap) { + uint32_t sum; + + sum = proto_ipv4_header_psum(l4len, IPPROTO_UDP, + /* cppcheck-suppress unknownEvaluationOrder */ + (struct in_addr){ .s_addr = iph->saddr }, + (struct in_addr){ .s_addr = iph->daddr }); + + iov_vu[0].iov_base = &bp->uh; + iov_vu[0].iov_len = size - l2_hdrlen + + sizeof(bp->uh); + bp->uh.check = csum_iov(iov_vu, iov_used, sum); + } + } + + /* set iov for pcap logging */ + iov_vu[0].iov_base = base + vnet_hdrlen; + iov_vu[0].iov_len = size - vnet_hdrlen; + pcap_iov(iov_vu, iov_used); + + /* set iov_len for vu_queue_fill_by_index(); */ + iov_vu[0].iov_base = base; + iov_vu[0].iov_len = size; + + /* send packets */ + for (i = 0; i < iov_used; i++) + vu_queue_fill(vdev, vq, &elem[i], iov_vu[i].iov_len, i); + + vu_queue_flush(vdev, vq, iov_used); + vu_queue_notify(vdev, vq); + } +} diff --git a/udp_vu.h b/udp_vu.h new file mode 100644 index 000000000000..e01ce047ee0a --- /dev/null +++ b/udp_vu.h @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#ifndef UDP_VU_H +#define UDP_VU_H + +void udp_vu_sock_handler(const struct ctx *c, union epoll_ref ref, + uint32_t events, const struct timespec *now); +#endif /* UDP_VU_H */ diff --git a/vhost_user.c b/vhost_user.c index 4ac0a3e53499..a3d156558359 100644 --- a/vhost_user.c +++ b/vhost_user.c @@ -28,7 +28,6 @@ #define VHOST_USER_VERSION 1 -/* cppcheck-suppress unusedFunction */ void vu_print_capabilities(void) { printf("{\n"); @@ -332,7 +331,6 @@ static bool map_ring(VuDev *vdev, VuVirtq *vq) return !(vq->vring.desc && vq->vring.used && vq->vring.avail); } -/* cppcheck-suppress unusedFunction */ int vu_packet_check_range(void *buf, size_t offset, size_t len, const char *start, const char *func, int line) { @@ -545,7 +543,6 @@ static int vu_wait_queue(const VuVirtq *vq) return 0; } -/* cppcheck-suppress unusedFunction */ int vu_send(const struct ctx *c, const void *buf, size_t size) { VuDev *vdev = (VuDev *)&c->vdev; @@ -730,7 +727,6 @@ static void vu_handle_tx(VuDev *vdev, int index) } } -/* cppcheck-suppress unusedFunction */ void vu_kick_cb(struct ctx *c, union epoll_ref ref) { VuDev *vdev = &c->vdev; @@ -927,7 +923,6 @@ static bool vu_set_vring_enable_exec(VuDev *vdev, struct VhostUserMsg *msg) return false; } -/* cppcheck-suppress unusedFunction */ void vu_init(struct ctx *c) { int i; @@ -988,7 +983,6 @@ static void vu_cleanup(VuDev *vdev) * @c: Execution context * @events: epoll events */ -/* cppcheck-suppress unusedFunction */ void tap_handler_vu(struct ctx *c, uint32_t events) { VuDev *dev = &c->vdev; diff --git a/virtio.c b/virtio.c index 5d58e56204b3..8c651070bba5 100644 --- a/virtio.c +++ b/virtio.c @@ -367,7 +367,6 @@ void vu_queue_unpop(VuDev *dev, VuVirtq *vq, unsigned int index, size_t len) vu_queue_detach_element(dev, vq, 
index, len);
 }
 
-/* cppcheck-suppress unusedFunction */
 bool vu_queue_rewind(VuDev *dev, VuVirtq *vq, unsigned int num)
 {
 	(void)dev;

-- 
David Gibson (he or they)			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au		| minimalist, thank you, not the other way
						| around.
http://www.ozlabs.org/~dgibson
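A recurring point in the review above is the space reserved at the front of the first guest buffer (the quantity the patch calls l2_hdrlen) before any payload is written: vnet header, Ethernet, IP and L4 headers. The standalone sketch below just adds up those sizes for the TCP case; the function name is invented for the example, and the real code derives the same value inline.

#include <stdio.h>
#include <stdbool.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <linux/if_ether.h>
#include <linux/virtio_net.h>

/* Space reserved at the front of the first guest buffer before the payload:
 * virtio-net ("vnet") header, then Ethernet, IP and TCP headers.
 */
static size_t vu_reserved_hdrlen(bool v6, bool mrg_rxbuf)
{
        size_t len = mrg_rxbuf ? sizeof(struct virtio_net_hdr_mrg_rxbuf)
                               : sizeof(struct virtio_net_hdr);

        len += sizeof(struct ethhdr);
        len += v6 ? sizeof(struct ip6_hdr) : sizeof(struct iphdr);
        len += sizeof(struct tcphdr);

        return len;
}

int main(void)
{
        printf("IPv4, no MRG_RXBUF: %zu bytes\n", vu_reserved_hdrlen(false, false));
        printf("IPv6, MRG_RXBUF:    %zu bytes\n", vu_reserved_hdrlen(true, true));
        return 0;
}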
On 24/06/2024 07:05, David Gibson wrote:On Fri, Jun 21, 2024 at 04:56:40PM +0200, Laurent Vivier wrote: > add virtio and vhost-user functions to connect with QEMU. > > $ ./passt --vhost-user > > and > > # qemu-system-x86_64 ... -m 4G \ > -object memory-backend-memfd,id=memfd0,share=on,size=4G \ > -numa node,memdev=memfd0 \ > -chardev socket,id=chr0,path=/tmp/passt_1.socket \ > -netdev vhost-user,id=netdev0,chardev=chr0 \ > -device virtio-net,mac=9a:2b:2c:2d:2e:2f,netdev=netdev0 \ > ... > > Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> > --- > Makefile | 4 +- > checksum.c | 1 - > conf.c | 18 +- > iov.c | 1 - > packet.c | 6 + > packet.h | 2 + > passt.c | 12 +- > passt.h | 2 + > pcap.c | 1 - > tap.c | 87 ++++++-- > tap.h | 3 +- > tcp.c | 17 +- > tcp_vu.c | 547 +++++++++++++++++++++++++++++++++++++++++++++++++ > tcp_vu.h | 9 + > udp.c | 54 +++-- > udp_internal.h | 39 ++++ > udp_vu.c | 237 +++++++++++++++++++++ > udp_vu.h | 8 + > vhost_user.c | 6 - > virtio.c | 1 - > 20 files changed, 988 insertions(+), 67 deletions(-) > create mode 100644 tcp_vu.c > create mode 100644 tcp_vu.h > create mode 100644 udp_internal.h > create mode 100644 udp_vu.c > create mode 100644 udp_vu.h >...In fact, no: because we cannot have the ((aligned)) attribute as the address is provided by the guest.diff --git a/tcp_vu.c b/tcp_vu.c new file mode 100644 index 000000000000..f27890f63c0e --- /dev/null +++ b/tcp_vu.c @@ -0,0 +1,547 @@ +// SPDX-License-Identifier: GPL-2.0-or-laterNeeds Copyright notice, author information and general description here.+ +#include <errno.h> +#include <stddef.h> +#include <stdint.h> + +#include <netinet/ip.h> + +#include <sys/socket.h> + +#include <linux/tcp.h> +#include <linux/virtio_net.h> + +#include "util.h" +#include "ip.h" +#include "passt.h" +#include "siphash.h" +#include "inany.h" +#include "vhost_user.h" +#include "tcp.h" +#include "pcap.h" +#include "flow.h" +#include "tcp_conn.h" +#include "flow_table.h" +#include "tcp_vu.h" +#include "tcp_internal.h" +#include "checksum.h" + +#define CONN_V4(conn) (!!inany_v4(&(conn)->faddr)) +#define CONN_V6(conn) (!CONN_V4(conn)) + +/** + * struct tcp_payload_t - TCP header and data to send segments with payload + * @th: TCP header + * @data: TCP data + */ +struct tcp_payload_t { + struct tcphdr th; + uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)]; +};This could be common with tcp_buf.c, couldn't it?I agree+ +/** + * struct tcp_flags_t - TCP header and data to send zero-length + * segments (flags) + * @th: TCP header + * @opts TCP options + */ +struct tcp_flags_t { + struct tcphdr th; + char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; +};Likewise here.+ +/* vhost-user */ +static const struct virtio_net_hdr vu_header = { + .flags = VIRTIO_NET_HDR_F_DATA_VALID, + .gso_type = VIRTIO_NET_HDR_GSO_NONE, +}; + +int tcp_vu_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) +{ + VuDev *vdev = (VuDev *)&c->vdev; + VuVirtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; + size_t tlen, vnet_hdrlen, l4len, optlen; + struct virtio_net_hdr_mrg_rxbuf *vh; + struct iovec l2_iov[TCP_NUM_IOVS]; + VuVirtqElement elem; + struct iovec in_sg; + struct ethhdr *eh; + int nb_ack; + int ret; + + elem.out_num = 0; + elem.out_sg = NULL; + elem.in_num = 1; + elem.in_sg = &in_sg; + ret = vu_queue_pop(vdev, vq, &elem); + if (ret < 0) + return 0; + + if (elem.in_num < 1) { + err("virtio-net receive queue contains no in buffers"); + vu_queue_rewind(vdev, vq, 1); + return 0; + } + + vh = elem.in_sg[0].iov_base;AFAICT, the code below requires that in_sg[0] be large enough to 
contain the frame, plus a virtio_net_hdr_mrg_rxbuf. Seems like we should ASSERT() that somewhere.

If I'm understanding correctly that the virtio_net_hdr_mrg_rxbuf is a kind of pseudo-header you need for each frame, I'm wondering if it could be integrated into the tap_hdr mechanisms.

I tried, but we use a contiguous buffer rather than the TCP iovec, so it's simpler to initialize the header in place.

Thanks,
Laurent
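To illustrate the in-place initialization Laurent describes: the vnet header is simply written at the very start of the first guest buffer, ahead of the Ethernet frame. The sketch below is a standalone illustration with a local array standing in for the guest buffer; it is not code from the series.

#include <stdio.h>
#include <string.h>
#include <endian.h>
#include <linux/virtio_net.h>

int main(void)
{
        unsigned char guest_buf[2048];  /* stand-in for in_sg[0].iov_base */
        struct virtio_net_hdr_mrg_rxbuf *vh = (void *)guest_buf;

        memset(guest_buf, 0, sizeof(guest_buf));

        /* same constant header the series uses for every frame */
        vh->hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
        vh->hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
        vh->num_buffers = htole16(1);   /* only meaningful with VIRTIO_NET_F_MRG_RXBUF */

        /* the Ethernet header then starts right after the vnet header */
        printf("Ethernet header starts at offset %zu\n", sizeof(*vh));
        return 0;
}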
On Fri, Jul 12, 2024 at 04:49:07PM +0200, Laurent Vivier wrote:
> On 24/06/2024 07:05, David Gibson wrote:
> > On Fri, Jun 21, 2024 at 04:56:40PM +0200, Laurent Vivier wrote:
[snip]
> > > +/**
> > > + * struct tcp_payload_t - TCP header and data to send segments with payload
> > > + * @th:	TCP header
> > > + * @data:	TCP data
> > > + */
> > > +struct tcp_payload_t {
> > > +	struct tcphdr th;
> > > +	uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)];
> > > +};
> >
> > This could be common with tcp_buf.c, couldn't it?
>
> In fact, no: because we cannot have the ((aligned)) attribute as the
> address is provided by the guest.

Ah, right. It still concerns me a bit, having two structures with the same
name in different parts of the code, when the difference between them is so
subtle.

Could we use a common struct for both "buf" and vu, and put the alignment
constraint on the actual array in the buf code, rather than on the type?

-- 
David Gibson (he or they)			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au		| minimalist, thank you, not the other way
						| around.
http://www.ozlabs.org/~dgibson
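A minimal standalone sketch of that idea (not from the series): the shared structure drops the alignment attribute, and the constraint moves to the array that the buf path allocates itself, since guest-provided vhost-user buffers carry no alignment guarantee. The IP_MAX_MTU and TCP_FRAMES_MEM values and the file placement (tcp_internal.h / tcp_buf.c) are assumptions made for this example.

#include <stdio.h>
#include <stdint.h>
#include <linux/tcp.h>

#define IP_MAX_MTU      65535   /* assumed value, for illustration */
#define TCP_FRAMES_MEM  128     /* assumed value, for illustration */

/* Shared definition (e.g. in tcp_internal.h): no alignment on the type, so
 * it can also describe guest-provided, unaligned vhost-user buffers.
 */
struct tcp_payload_t {
        struct tcphdr th;
        uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)];
} __attribute__ ((packed));

/* In tcp_buf.c only, the alignment constraint moves to the object that
 * passt allocates itself:
 */
#ifdef __AVX2__
static struct tcp_payload_t tcp_payload[TCP_FRAMES_MEM]
        __attribute__ ((aligned(32)));
#else
static struct tcp_payload_t tcp_payload[TCP_FRAMES_MEM]
        __attribute__ ((aligned(__alignof__(unsigned int))));
#endif

int main(void)
{
        printf("sizeof(struct tcp_payload_t): %zu\n",
               sizeof(struct tcp_payload_t));
        printf("tcp_payload[] alignment: %zu\n",
               (size_t)__alignof__(tcp_payload));
        return 0;
}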