For vhost-user, we will need to spread the TCP payload over several buffers. To re-use tcp_update_check_tcp[4|6](), provide an iovec rather than a pointer to a buffer. This series also updates csum_iov() and pcap_iov() to add an offset, in bytes, to skip in the iovec array. It's based on top of "tcp: Use tcp_payload_t rather than tcphdr", which is included in this series for convenience. Laurent Vivier (4): tcp: Use tcp_payload_t rather than tcphdr pcap: Add an offset argument in pcap_iov() checksum: Add an offset argument in csum_iov() tcp: Update TCP checksum using an iovec array checksum.c | 14 ++++-- checksum.h | 3 +- pcap.c | 5 ++- pcap.h | 2 +- tcp.c | 118 ++++++++++++++++++++++++++++++++++--------------- tcp_buf.c | 29 ------------ tcp_internal.h | 29 ++++++++++++ 7 files changed, 128 insertions(+), 72 deletions(-) -- 2.46.0
As tcp_update_check_tcp4() and tcp_update_check_tcp6() compute the checksum using the TCP header and the TCP payload, it is clearer to use a pointer to tcp_payload_t that includes tcphdr and payload rather than a pointer to tcphdr (and guessing TCP header is followed by the payload). Move tcp_payload_t and tcp_flags_t to tcp_internal.h. (They will be used also by vhost-user). Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> Reviewed-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 42 ++++++++++++++++++++++-------------------- tcp_buf.c | 29 ----------------------------- tcp_internal.h | 29 +++++++++++++++++++++++++++++ 3 files changed, 51 insertions(+), 49 deletions(-) diff --git a/tcp.c b/tcp.c index 1962fcc469ed..c9472d905520 100644 --- a/tcp.c +++ b/tcp.c @@ -757,32 +757,34 @@ static void tcp_sock_set_bufsize(const struct ctx *c, int s) /** * tcp_update_check_tcp4() - Update TCP checksum from stored one * @iph: IPv4 header - * @th: TCP header followed by TCP payload + * @bp: TCP header followed by TCP payload */ -static void tcp_update_check_tcp4(const struct iphdr *iph, struct tcphdr *th) +static void tcp_update_check_tcp4(const struct iphdr *iph, + struct tcp_payload_t *bp) { uint16_t l4len = ntohs(iph->tot_len) - sizeof(struct iphdr); struct in_addr saddr = { .s_addr = iph->saddr }; struct in_addr daddr = { .s_addr = iph->daddr }; uint32_t sum = proto_ipv4_header_psum(l4len, IPPROTO_TCP, saddr, daddr); - th->check = 0; - th->check = csum(th, l4len, sum); + bp->th.check = 0; + bp->th.check = csum(bp, l4len, sum); } /** * tcp_update_check_tcp6() - Calculate TCP checksum for IPv6 * @ip6h: IPv6 header - * @th: TCP header followed by TCP payload + * @bp: TCP header followed by TCP payload */ -static void tcp_update_check_tcp6(struct ipv6hdr *ip6h, struct tcphdr *th) +static void tcp_update_check_tcp6(const struct ipv6hdr *ip6h, + struct tcp_payload_t *bp) { uint16_t l4len = ntohs(ip6h->payload_len); uint32_t sum = proto_ipv6_header_psum(l4len, 
IPPROTO_TCP, &ip6h->saddr, &ip6h->daddr); - th->check = 0; - th->check = csum(th, l4len, sum); + bp->th.check = 0; + bp->th.check = csum(bp, l4len, sum); } /** @@ -902,7 +904,7 @@ static void tcp_fill_header(struct tcphdr *th, * @conn: Connection pointer * @taph: tap backend specific header * @iph: Pointer to IPv4 header - * @th: Pointer to TCP header + * @bp: Pointer to TCP header followed by TCP payload * @dlen: TCP payload length * @check: Checksum, if already known * @seq: Sequence number for this segment @@ -912,14 +914,14 @@ static void tcp_fill_header(struct tcphdr *th, */ static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn, struct tap_hdr *taph, - struct iphdr *iph, struct tcphdr *th, + struct iphdr *iph, struct tcp_payload_t *bp, size_t dlen, const uint16_t *check, uint32_t seq, bool no_tcp_csum) { const struct flowside *tapside = TAPFLOW(conn); const struct in_addr *src4 = inany_v4(&tapside->oaddr); const struct in_addr *dst4 = inany_v4(&tapside->eaddr); - size_t l4len = dlen + sizeof(*th); + size_t l4len = dlen + sizeof(bp->th); size_t l3len = l4len + sizeof(*iph); ASSERT(src4 && dst4); @@ -931,12 +933,12 @@ static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn, iph->check = check ? 
*check : csum_ip4_header(l3len, IPPROTO_TCP, *src4, *dst4); - tcp_fill_header(th, conn, seq); + tcp_fill_header(&bp->th, conn, seq); if (no_tcp_csum) - th->check = 0; + bp->th.check = 0; else - tcp_update_check_tcp4(iph, th); + tcp_update_check_tcp4(iph, bp); tap_hdr_update(taph, l3len + sizeof(struct ethhdr)); @@ -948,7 +950,7 @@ static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn, * @conn: Connection pointer * @taph: tap backend specific header * @ip6h: Pointer to IPv6 header - * @th: Pointer to TCP header + * @bp: Pointer to TCP header followed by TCP payload * @dlen: TCP payload length * @check: Checksum, if already known * @seq: Sequence number for this segment @@ -958,11 +960,11 @@ static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn, */ static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn, struct tap_hdr *taph, - struct ipv6hdr *ip6h, struct tcphdr *th, + struct ipv6hdr *ip6h, struct tcp_payload_t *bp, size_t dlen, uint32_t seq, bool no_tcp_csum) { const struct flowside *tapside = TAPFLOW(conn); - size_t l4len = dlen + sizeof(*th); + size_t l4len = dlen + sizeof(bp->th); ip6h->payload_len = htons(l4len); ip6h->saddr = tapside->oaddr.a6; @@ -976,12 +978,12 @@ static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn, ip6h->flow_lbl[1] = (conn->sock >> 8) & 0xff; ip6h->flow_lbl[2] = (conn->sock >> 0) & 0xff; - tcp_fill_header(th, conn, seq); + tcp_fill_header(&bp->th, conn, seq); if (no_tcp_csum) - th->check = 0; + bp->th.check = 0; else - tcp_update_check_tcp6(ip6h, th); + tcp_update_check_tcp6(ip6h, bp); tap_hdr_update(taph, l4len + sizeof(*ip6h) + sizeof(struct ethhdr)); diff --git a/tcp_buf.c b/tcp_buf.c index ffbff5e4b485..238827b01d90 100644 --- a/tcp_buf.c +++ b/tcp_buf.c @@ -38,35 +38,6 @@ (c->mode == MODE_PASTA ? 
1 : TCP_FRAMES_MEM) /* Static buffers */ -/** - * struct tcp_payload_t - TCP header and data to send segments with payload - * @th: TCP header - * @data: TCP data - */ -struct tcp_payload_t { - struct tcphdr th; - uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)]; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))); /* For AVX2 checksum routines */ -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); -#endif - -/** - * struct tcp_flags_t - TCP header and data to send zero-length - * segments (flags) - * @th: TCP header - * @opts TCP options - */ -struct tcp_flags_t { - struct tcphdr th; - char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; -#ifdef __AVX2__ -} __attribute__ ((packed, aligned(32))); -#else -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); -#endif - /* Ethernet header for IPv4 frames */ static struct ethhdr tcp4_eth_src; diff --git a/tcp_internal.h b/tcp_internal.h index de06db1438d6..2f74ffeff8f3 100644 --- a/tcp_internal.h +++ b/tcp_internal.h @@ -63,6 +63,35 @@ enum tcp_iov_parts { TCP_NUM_IOVS }; +/** + * struct tcp_payload_t - TCP header and data to send segments with payload + * @th: TCP header + * @data: TCP data + */ +struct tcp_payload_t { + struct tcphdr th; + uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)]; +#ifdef __AVX2__ +} __attribute__ ((packed, aligned(32))); /* For AVX2 checksum routines */ +#else +} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); +#endif + +/** + * struct tcp_flags_t - TCP header and data to send zero-length + * segments (flags) + * @th: TCP header + * @opts TCP options + */ +struct tcp_flags_t { + struct tcphdr th; + char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; +#ifdef __AVX2__ +} __attribute__ ((packed, aligned(32))); +#else +} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); +#endif + extern char tcp_buf_discard [MAX_WINDOW]; void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, -- 2.46.0
The offset is passed directly to pcap_frame() and allows any headers that are not part of the frame to capture to be skipped. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- pcap.c | 5 +++-- pcap.h | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pcap.c b/pcap.c index e6b5ced4a9f8..6ee6cdfd261a 100644 --- a/pcap.c +++ b/pcap.c @@ -138,9 +138,10 @@ void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n, * @iov: Pointer to the array of struct iovec describing the I/O vector * containing packet data to write, including L2 header * @iovcnt: Number of buffers (@iov entries) + * @offset: Offset of the L2 frame within the full data length */ /* cppcheck-suppress unusedFunction */ -void pcap_iov(const struct iovec *iov, size_t iovcnt) +void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset) { struct timespec now; @@ -148,7 +149,7 @@ void pcap_iov(const struct iovec *iov, size_t iovcnt) return; clock_gettime(CLOCK_REALTIME, &now); - pcap_frame(iov, iovcnt, 0, &now); + pcap_frame(iov, iovcnt, offset, &now); } /** diff --git a/pcap.h b/pcap.h index 533923749222..9795f2e8adc5 100644 --- a/pcap.h +++ b/pcap.h @@ -9,7 +9,7 @@ void pcap(const char *pkt, size_t l2len); void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n, size_t offset); -void pcap_iov(const struct iovec *iov, size_t iovcnt); +void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset); void pcap_init(struct ctx *c); #endif /* PCAP_H */ -- 2.46.0
On Tue, Sep 24, 2024 at 05:46:40PM +0200, Laurent Vivier wrote:The offset is passed directly to pcap_frame() and allows any headers that are not part of the frame to capture to be skipped. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com>Reviewed-by: David Gibson <david(a)gibson.dropbear.id.au>--- pcap.c | 5 +++-- pcap.h | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pcap.c b/pcap.c index e6b5ced4a9f8..6ee6cdfd261a 100644 --- a/pcap.c +++ b/pcap.c @@ -138,9 +138,10 @@ void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n, * @iov: Pointer to the array of struct iovec describing the I/O vector * containing packet data to write, including L2 header * @iovcnt: Number of buffers (@iov entries) + * @offset: Offset of the L2 frame within the full data length */ /* cppcheck-suppress unusedFunction */ -void pcap_iov(const struct iovec *iov, size_t iovcnt) +void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset) { struct timespec now; @@ -148,7 +149,7 @@ void pcap_iov(const struct iovec *iov, size_t iovcnt) return; clock_gettime(CLOCK_REALTIME, &now); - pcap_frame(iov, iovcnt, 0, &now); + pcap_frame(iov, iovcnt, offset, &now); } /** diff --git a/pcap.h b/pcap.h index 533923749222..9795f2e8adc5 100644 --- a/pcap.h +++ b/pcap.h @@ -9,7 +9,7 @@ void pcap(const char *pkt, size_t l2len); void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n, size_t offset); -void pcap_iov(const struct iovec *iov, size_t iovcnt); +void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset); void pcap_init(struct ctx *c); #endif /* PCAP_H */-- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson
The offset allows any headers that are not part of the data to checksum to be skipped. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- checksum.c | 13 +++++++++++-- checksum.h | 3 ++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/checksum.c b/checksum.c index 006614fcbb28..f80db4d309a2 100644 --- a/checksum.c +++ b/checksum.c @@ -59,6 +59,7 @@ #include "util.h" #include "ip.h" #include "checksum.h" +#include "iov.h" /* Checksums are optional for UDP over IPv4, so we usually just set * them to 0. Change this to 1 to calculate real UDP over IPv4 @@ -498,15 +499,23 @@ uint16_t csum(const void *buf, size_t len, uint32_t init) * @iov Pointer to the array of IO vectors * @n Length of the array * @init Initial 32-bit checksum, 0 for no pre-computed checksum + * @offset: Offset of the data to checksum within the full data length * * Return: 16-bit folded, complemented checksum */ /* cppcheck-suppress unusedFunction */ -uint16_t csum_iov(const struct iovec *iov, size_t n, uint32_t init) +uint16_t csum_iov(const struct iovec *iov, size_t n, size_t offset, + uint32_t init) { unsigned int i; + size_t first; - for (i = 0; i < n; i++) + i = iov_skip_bytes(iov, n, offset, &first); + init = csum_unfolded((char *)iov[i].iov_base + first, + iov[i].iov_len, init); + i++; + + for (; i < n; i++) init = csum_unfolded(iov[i].iov_base, iov[i].iov_len, init); return (uint16_t)~csum_fold(init); diff --git a/checksum.h b/checksum.h index c5964ac78921..49f7472dd1b6 100644 --- a/checksum.h +++ b/checksum.h @@ -32,6 +32,7 @@ void csum_icmp6(struct icmp6hdr *icmp6hr, const void *payload, size_t dlen); uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init); uint16_t csum(const void *buf, size_t len, uint32_t init); -uint16_t csum_iov(const struct iovec *iov, size_t n, uint32_t init); +uint16_t csum_iov(const struct iovec *iov, size_t n, size_t offset, + uint32_t init); #endif /* CHECKSUM_H */ -- 2.46.0
On Tue, Sep 24, 2024 at 05:46:41PM +0200, Laurent Vivier wrote:The offset allows any headers are that are not part of the data to checksum to be skipped. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- checksum.c | 13 +++++++++++-- checksum.h | 3 ++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/checksum.c b/checksum.c index 006614fcbb28..f80db4d309a2 100644 --- a/checksum.c +++ b/checksum.c @@ -59,6 +59,7 @@ #include "util.h" #include "ip.h" #include "checksum.h" +#include "iov.h" /* Checksums are optional for UDP over IPv4, so we usually just set * them to 0. Change this to 1 to calculate real UDP over IPv4 @@ -498,15 +499,23 @@ uint16_t csum(const void *buf, size_t len, uint32_t init) * @iov Pointer to the array of IO vectors * @n Length of the array * @init Initial 32-bit checksum, 0 for no pre-computed checksum + * @offset: Offset of the data to checksum within the full data length * * Return: 16-bit folded, complemented checksum */ /* cppcheck-suppress unusedFunction */ -uint16_t csum_iov(const struct iovec *iov, size_t n, uint32_t init) +uint16_t csum_iov(const struct iovec *iov, size_t n, size_t offset, + uint32_t init) { unsigned int i; + size_t first; - for (i = 0; i < n; i++) + i = iov_skip_bytes(iov, n, offset, &first); Just to be safe, you should probably check for i >= n here: if offset is larger than the total length of the vector, iov_skip_bytes() will return that. + init = csum_unfolded((char *)iov[i].iov_base + first, + iov[i].iov_len, init); + i++; + + for (; i < n; i++) init = csum_unfolded(iov[i].iov_base, iov[i].iov_len, init); return (uint16_t)~csum_fold(init); diff --git a/checksum.h b/checksum.h index c5964ac78921..49f7472dd1b6 100644 --- a/checksum.h +++ b/checksum.h @@ -32,6 +32,7 @@ void csum_icmp6(struct icmp6hdr *icmp6hr, const void *payload, size_t dlen); uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init); uint16_t csum(const void *buf, size_t len, uint32_t init); -uint16_t csum_iov(const 
struct iovec *iov, size_t n, uint32_t init); +uint16_t csum_iov(const struct iovec *iov, size_t n, size_t offset, + uint32_t init); #endif /* CHECKSUM_H */-- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson
TCP header and payload are supposed to be in the same buffer, and tcp_update_check_tcp4()/tcp_update_check_tcp6() compute the checksum from the base address of the header using the length of the IP payload. In the future (for vhost-user) we need to dispatch the TCP header and the TCP payload through several buffers. To be able to manage that, we provide an iovec array that points to the data of the TCP frame. We also provide an offset, so that the array may contain the TCP frame embedded in a lower-level frame; this offset points to the TCP header inside the iovec array. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- checksum.c | 1 - tcp.c | 100 ++++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 72 insertions(+), 29 deletions(-) diff --git a/checksum.c b/checksum.c index f80db4d309a2..96ccfe2af50b 100644 --- a/checksum.c +++ b/checksum.c @@ -503,7 +503,6 @@ uint16_t csum(const void *buf, size_t len, uint32_t init) * * Return: 16-bit folded, complemented checksum */ -/* cppcheck-suppress unusedFunction */ uint16_t csum_iov(const struct iovec *iov, size_t n, size_t offset, uint32_t init) { diff --git a/tcp.c b/tcp.c index c9472d905520..efd4037ed008 100644 --- a/tcp.c +++ b/tcp.c @@ -755,36 +755,65 @@ static void tcp_sock_set_bufsize(const struct ctx *c, int s) } /** - * tcp_update_check_tcp4() - Update TCP checksum from stored one - * @iph: IPv4 header - * @bp: TCP header followed by TCP payload - */ -static void tcp_update_check_tcp4(const struct iphdr *iph, - struct tcp_payload_t *bp) + * tcp_update_check_tcp4() - Calculate TCP checksum for IPv4 + * @src: IPv4 source address + * @dst: IPv4 destination address + * @iov: Pointer to the array of IO vectors + * @iov_cnt: Length of the array + * @payload_offset: IPv4 payload offset in the iovec array + */ +void tcp_update_check_tcp4(struct in_addr src, + struct in_addr dst, + const struct iovec *iov, int iov_cnt, + size_t payload_offset) { - uint16_t l4len = 
ntohs(iph->tot_len) - sizeof(struct iphdr); - struct in_addr saddr = { .s_addr = iph->saddr }; - struct in_addr daddr = { .s_addr = iph->daddr }; - uint32_t sum = proto_ipv4_header_psum(l4len, IPPROTO_TCP, saddr, daddr); + size_t check_ofs; + __sum16 *check; + int check_idx; + uint32_t sum; + + sum = proto_ipv4_header_psum(iov_size(iov, iov_cnt) - payload_offset, + IPPROTO_TCP, src, dst); + + check_idx = iov_skip_bytes(iov, iov_cnt, + payload_offset + offsetof(struct tcphdr, check), + &check_ofs); + + check = (__sum16 *)((char *)iov[check_idx].iov_base + check_ofs); - bp->th.check = 0; - bp->th.check = csum(bp, l4len, sum); + *check = 0; + *check = csum_iov(iov, iov_cnt, payload_offset, sum); } /** * tcp_update_check_tcp6() - Calculate TCP checksum for IPv6 - * @ip6h: IPv6 header - * @bp: TCP header followed by TCP payload - */ -static void tcp_update_check_tcp6(const struct ipv6hdr *ip6h, - struct tcp_payload_t *bp) + * @src: IPv6 source address + * @dst: IPv6 destination address + * @iov: Pointer to the array of IO vectors + * @iov_cnt: Length of the array + * @payload_offset: IPv6 payload offset in the iovec array + */ +void tcp_update_check_tcp6(const struct in6_addr *src, + const struct in6_addr *dst, + const struct iovec *iov, int iov_cnt, + size_t payload_offset) { - uint16_t l4len = ntohs(ip6h->payload_len); - uint32_t sum = proto_ipv6_header_psum(l4len, IPPROTO_TCP, - &ip6h->saddr, &ip6h->daddr); + size_t check_ofs; + __sum16 *check; + int check_idx; + uint32_t sum; + + sum = proto_ipv6_header_psum(iov_size(iov, iov_cnt) - payload_offset, + IPPROTO_TCP, src, dst); + + check_idx = iov_skip_bytes(iov, iov_cnt, + payload_offset + offsetof(struct tcphdr, check), + &check_ofs); + + check = (__sum16 *)((char *)iov[check_idx].iov_base + check_ofs); - bp->th.check = 0; - bp->th.check = csum(bp, l4len, sum); + *check = 0; + *check = csum_iov(iov, iov_cnt, payload_offset, sum); } /** @@ -935,10 +964,18 @@ static size_t tcp_fill_headers4(const struct tcp_tap_conn 
*conn, tcp_fill_header(&bp->th, conn, seq); - if (no_tcp_csum) + if (no_tcp_csum) { bp->th.check = 0; - else - tcp_update_check_tcp4(iph, bp); + } else { + const struct iovec iov = { + .iov_base = bp, + .iov_len = ntohs(iph->tot_len) - sizeof(struct iphdr), + }; + struct in_addr saddr = { .s_addr = iph->saddr }; + struct in_addr daddr = { .s_addr = iph->daddr }; + + tcp_update_check_tcp4(saddr, daddr, &iov, 1, 0); + } tap_hdr_update(taph, l3len + sizeof(struct ethhdr)); @@ -980,10 +1017,17 @@ static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn, tcp_fill_header(&bp->th, conn, seq); - if (no_tcp_csum) + if (no_tcp_csum) { bp->th.check = 0; - else - tcp_update_check_tcp6(ip6h, bp); + } else { + const struct iovec iov = { + .iov_base = bp, + .iov_len = ntohs(ip6h->payload_len) + }; + + tcp_update_check_tcp6(&ip6h->saddr, &ip6h->daddr, + &iov, 1, 0); + } tap_hdr_update(taph, l4len + sizeof(*ip6h) + sizeof(struct ethhdr)); -- 2.46.0
On Tue, Sep 24, 2024 at 05:46:42PM +0200, Laurent Vivier wrote:TCP header and payload are supposed to be in the same buffer, and tcp_update_check_tcp4()/tcp_update_check_tcp6() compute the checksum from the base address of the header using the length of the IP payload. In the future (for vhost-user) we need to dispatch the TCP header and the TCP payload through several buffers. To be able to manage that, we provide an iovec array that points to the data of the TCP frame. We provide also an offset to be able to provide an array that contains the TCP frame embedded in an lower level frame, and this offset points to the TCP header inside the iovec array. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- checksum.c | 1 - tcp.c | 100 ++++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 72 insertions(+), 29 deletions(-) diff --git a/checksum.c b/checksum.c index f80db4d309a2..96ccfe2af50b 100644 --- a/checksum.c +++ b/checksum.c @@ -503,7 +503,6 @@ uint16_t csum(const void *buf, size_t len, uint32_t init) * * Return: 16-bit folded, complemented checksum */ -/* cppcheck-suppress unusedFunction */ uint16_t csum_iov(const struct iovec *iov, size_t n, size_t offset, uint32_t init) { diff --git a/tcp.c b/tcp.c index c9472d905520..efd4037ed008 100644 --- a/tcp.c +++ b/tcp.c @@ -755,36 +755,65 @@ static void tcp_sock_set_bufsize(const struct ctx *c, int s) } /** - * tcp_update_check_tcp4() - Update TCP checksum from stored one - * @iph: IPv4 header - * @bp: TCP header followed by TCP payload - */ -static void tcp_update_check_tcp4(const struct iphdr *iph, - struct tcp_payload_t *bp) + * tcp_update_check_tcp4() - Calculate TCP checksum for IPv6 + * @src: IPv4 source address + * @dst: IPv4 destination address + * @iov: Pointer to the array of IO vectors + * @iov_cnt: Length of the array + * @payload_offset: IPv4 payload offset in the iovec arrayYou explain it here, but "payload_offset" is a bit unclear if you're not sure which layer it's talking about. 
"l4offset" maybe?+ */ +void tcp_update_check_tcp4(struct in_addr src, + struct in_addr dst, + const struct iovec *iov, int iov_cnt, + size_t payload_offset) { - uint16_t l4len = ntohs(iph->tot_len) - sizeof(struct iphdr); - struct in_addr saddr = { .s_addr = iph->saddr }; - struct in_addr daddr = { .s_addr = iph->daddr }; - uint32_t sum = proto_ipv4_header_psum(l4len, IPPROTO_TCP, saddr, daddr); + size_t check_ofs; + __sum16 *check;What's a __sum16?+ int check_idx; + uint32_t sum; + + sum = proto_ipv4_header_psum(iov_size(iov, iov_cnt) - payload_offset, + IPPROTO_TCP, src, dst); + + check_idx = iov_skip_bytes(iov, iov_cnt, + payload_offset + offsetof(struct tcphdr, check), + &check_ofs); + + check = (__sum16 *)((char *)iov[check_idx].iov_base + check_ofs);So.. it's not likely, but it's possible for the first byte of the checksum to be in one iovec and the second byte in another. This whole construction is a bit awkward too. I think we want another helper on top of iov_skip_bytes(). It would retreive a pointer to a field of a given length and offset within the IOV, returning NULL if that can't be found contiguously. It could have a macro wrapper that fills in some of the details based on a type. For now I'd imagine we just give up if it returns NULL, but that's enough to reduce a potential out of bounds memory access to merely breaking one connection. If we ever need it, we can add a slow path to handle that case. There are a couple of other curly cases to consider too, alas: what if the field you request does exist contiguously, but isn't properly aligned for the type we want to access it as? 
Then there's the question of whether doing this will run afoul of the type-based aliasing rules.- bp->th.check = 0; - bp->th.check = csum(bp, l4len, sum); + *check = 0; + *check = csum_iov(iov, iov_cnt, payload_offset, sum); } /** * tcp_update_check_tcp6() - Calculate TCP checksum for IPv6 - * @ip6h: IPv6 header - * @bp: TCP header followed by TCP payload - */ -static void tcp_update_check_tcp6(const struct ipv6hdr *ip6h, - struct tcp_payload_t *bp) + * @src: IPv6 source address + * @dst: IPv6 destination address + * @iov: Pointer to the array of IO vectors + * @iov_cnt: Length of the array + * @payload_offset: IPv6 payload offset in the iovec array + */ +void tcp_update_check_tcp6(const struct in6_addr *src, + const struct in6_addr *dst, + const struct iovec *iov, int iov_cnt, + size_t payload_offset) { - uint16_t l4len = ntohs(ip6h->payload_len); - uint32_t sum = proto_ipv6_header_psum(l4len, IPPROTO_TCP, - &ip6h->saddr, &ip6h->daddr); + size_t check_ofs; + __sum16 *check; + int check_idx; + uint32_t sum; + + sum = proto_ipv6_header_psum(iov_size(iov, iov_cnt) - payload_offset, + IPPROTO_TCP, src, dst); + + check_idx = iov_skip_bytes(iov, iov_cnt, + payload_offset + offsetof(struct tcphdr, check), + &check_ofs); + + check = (__sum16 *)((char *)iov[check_idx].iov_base + check_ofs); - bp->th.check = 0; - bp->th.check = csum(bp, l4len, sum); + *check = 0; + *check = csum_iov(iov, iov_cnt, payload_offset, sum); } /** @@ -935,10 +964,18 @@ static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn, tcp_fill_header(&bp->th, conn, seq); - if (no_tcp_csum) + if (no_tcp_csum) { bp->th.check = 0; - else - tcp_update_check_tcp4(iph, bp); + } else { + const struct iovec iov = { + .iov_base = bp, + .iov_len = ntohs(iph->tot_len) - sizeof(struct iphdr), + }; + struct in_addr saddr = { .s_addr = iph->saddr }; + struct in_addr daddr = { .s_addr = iph->daddr }; + + tcp_update_check_tcp4(saddr, daddr, &iov, 1, 0); + } tap_hdr_update(taph, l3len + sizeof(struct ethhdr)); @@ 
-980,10 +1017,17 @@ static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn, tcp_fill_header(&bp->th, conn, seq); - if (no_tcp_csum) + if (no_tcp_csum) { bp->th.check = 0; - else - tcp_update_check_tcp6(ip6h, bp); + } else { + const struct iovec iov = { + .iov_base = bp, + .iov_len = ntohs(ip6h->payload_len) + }; + + tcp_update_check_tcp6(&ip6h->saddr, &ip6h->daddr, + &iov, 1, 0); + } tap_hdr_update(taph, l4len + sizeof(*ip6h) + sizeof(struct ethhdr));-- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson
On 25/09/2024 03:12, David Gibson wrote:On Tue, Sep 24, 2024 at 05:46:42PM +0200, Laurent Vivier wrote:It's the type of "check" in struct tcphdr.TCP header and payload are supposed to be in the same buffer, and tcp_update_check_tcp4()/tcp_update_check_tcp6() compute the checksum from the base address of the header using the length of the IP payload. In the future (for vhost-user) we need to dispatch the TCP header and the TCP payload through several buffers. To be able to manage that, we provide an iovec array that points to the data of the TCP frame. We provide also an offset to be able to provide an array that contains the TCP frame embedded in an lower level frame, and this offset points to the TCP header inside the iovec array. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- checksum.c | 1 - tcp.c | 100 ++++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 72 insertions(+), 29 deletions(-) diff --git a/checksum.c b/checksum.c index f80db4d309a2..96ccfe2af50b 100644 --- a/checksum.c +++ b/checksum.c @@ -503,7 +503,6 @@ uint16_t csum(const void *buf, size_t len, uint32_t init) * * Return: 16-bit folded, complemented checksum */ -/* cppcheck-suppress unusedFunction */ uint16_t csum_iov(const struct iovec *iov, size_t n, size_t offset, uint32_t init) { diff --git a/tcp.c b/tcp.c index c9472d905520..efd4037ed008 100644 --- a/tcp.c +++ b/tcp.c @@ -755,36 +755,65 @@ static void tcp_sock_set_bufsize(const struct ctx *c, int s) } /** - * tcp_update_check_tcp4() - Update TCP checksum from stored one - * @iph: IPv4 header - * @bp: TCP header followed by TCP payload - */ -static void tcp_update_check_tcp4(const struct iphdr *iph, - struct tcp_payload_t *bp) + * tcp_update_check_tcp4() - Calculate TCP checksum for IPv6 + * @src: IPv4 source address + * @dst: IPv4 destination address + * @iov: Pointer to the array of IO vectors + * @iov_cnt: Length of the array + * @payload_offset: IPv4 payload offset in the iovec arrayYou explain it here, but 
"payload_offset" is a bit unclear if you're not sure which layer it's talking about. "l4offset" maybe?+ */ +void tcp_update_check_tcp4(struct in_addr src, + struct in_addr dst, + const struct iovec *iov, int iov_cnt, + size_t payload_offset) { - uint16_t l4len = ntohs(iph->tot_len) - sizeof(struct iphdr); - struct in_addr saddr = { .s_addr = iph->saddr }; - struct in_addr daddr = { .s_addr = iph->daddr }; - uint32_t sum = proto_ipv4_header_psum(l4len, IPPROTO_TCP, saddr, daddr); + size_t check_ofs; + __sum16 *check;What's a __sum16?I'm guessing iov_base and iov_len are 32bit aligned, and address of "check" too (as it is from tcphdr in memory, that should be 32bit aligned, and offset of "check" is 16), so a 16 bit value cannot be shared between two iovecs. I'm guessing that any 32bit value we take from a structure will not cross boundary of an iovec.+ int check_idx; + uint32_t sum; + + sum = proto_ipv4_header_psum(iov_size(iov, iov_cnt) - payload_offset, + IPPROTO_TCP, src, dst); + + check_idx = iov_skip_bytes(iov, iov_cnt, + payload_offset + offsetof(struct tcphdr, check), + &check_ofs); + + check = (__sum16 *)((char *)iov[check_idx].iov_base + check_ofs);So.. it's not likely, but it's possible for the first byte of the checksum to be in one iovec and the second byte in another. This whole construction is a bit awkward too.I think we want another helper on top of iov_skip_bytes(). It would retreive a pointer to a field of a given length and offset within the IOV, returning NULL if that can't be found contiguously. It could have a macro wrapper that fills in some of the details based on a type. For now I'd imagine we just give up if it returns NULL, but that's enough to reduce a potential out of bounds memory access to merely breaking one connection. If we ever need it, we can add a slow path to handle that case. 
There are a couple of other curly cases to consider too, alas: what if the field you request does exist contiguously, but isn't properly aligned for the type we want to access it as? Then there's the question of whether doing this will run afoul of the type-based aliasing rules. In our case, "check" is a field of "struct tcphdr", so I think it's sane to assume it is correctly aligned. I don't want to write complicated code only to write the checksum of the TCP header. Thanks, Laurent
On Wed, Sep 25, 2024 at 08:40:28AM +0200, Laurent Vivier wrote:On 25/09/2024 03:12, David Gibson wrote:Huh, ok.On Tue, Sep 24, 2024 at 05:46:42PM +0200, Laurent Vivier wrote:It's the type of "check" in struct tcphdr.TCP header and payload are supposed to be in the same buffer, and tcp_update_check_tcp4()/tcp_update_check_tcp6() compute the checksum from the base address of the header using the length of the IP payload. In the future (for vhost-user) we need to dispatch the TCP header and the TCP payload through several buffers. To be able to manage that, we provide an iovec array that points to the data of the TCP frame. We provide also an offset to be able to provide an array that contains the TCP frame embedded in an lower level frame, and this offset points to the TCP header inside the iovec array. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- checksum.c | 1 - tcp.c | 100 ++++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 72 insertions(+), 29 deletions(-) diff --git a/checksum.c b/checksum.c index f80db4d309a2..96ccfe2af50b 100644 --- a/checksum.c +++ b/checksum.c @@ -503,7 +503,6 @@ uint16_t csum(const void *buf, size_t len, uint32_t init) * * Return: 16-bit folded, complemented checksum */ -/* cppcheck-suppress unusedFunction */ uint16_t csum_iov(const struct iovec *iov, size_t n, size_t offset, uint32_t init) { diff --git a/tcp.c b/tcp.c index c9472d905520..efd4037ed008 100644 --- a/tcp.c +++ b/tcp.c @@ -755,36 +755,65 @@ static void tcp_sock_set_bufsize(const struct ctx *c, int s) } /** - * tcp_update_check_tcp4() - Update TCP checksum from stored one - * @iph: IPv4 header - * @bp: TCP header followed by TCP payload - */ -static void tcp_update_check_tcp4(const struct iphdr *iph, - struct tcp_payload_t *bp) + * tcp_update_check_tcp4() - Calculate TCP checksum for IPv6 + * @src: IPv4 source address + * @dst: IPv4 destination address + * @iov: Pointer to the array of IO vectors + * @iov_cnt: Length of the array + * 
@payload_offset: IPv4 payload offset in the iovec arrayYou explain it here, but "payload_offset" is a bit unclear if you're not sure which layer it's talking about. "l4offset" maybe?+ */ +void tcp_update_check_tcp4(struct in_addr src, + struct in_addr dst, + const struct iovec *iov, int iov_cnt, + size_t payload_offset) { - uint16_t l4len = ntohs(iph->tot_len) - sizeof(struct iphdr); - struct in_addr saddr = { .s_addr = iph->saddr }; - struct in_addr daddr = { .s_addr = iph->daddr }; - uint32_t sum = proto_ipv4_header_psum(l4len, IPPROTO_TCP, saddr, daddr); + size_t check_ofs; + __sum16 *check;What's a __sum16?That's probably true.. but I think we should actually verify / assert that somewhere.I'm guessing iov_base and iov_len are 32bit aligned, and address of "check" too (as it is from tcphdr in memory, that should be 32bit aligned, and offset of "check" is 16), so a 16 bit value cannot be shared between two iovecs. I'm guessing that any 32bit value we take from a structure will not cross boundary of an iovec.+ int check_idx; + uint32_t sum; + + sum = proto_ipv4_header_psum(iov_size(iov, iov_cnt) - payload_offset, + IPPROTO_TCP, src, dst); + + check_idx = iov_skip_bytes(iov, iov_cnt, + payload_offset + offsetof(struct tcphdr, check), + &check_ofs); + + check = (__sum16 *)((char *)iov[check_idx].iov_base + check_ofs);So.. it's not likely, but it's possible for the first byte of the checksum to be in one iovec and the second byte in another. This whole construction is a bit awkward too.It will be correctly aligned as long as the struct tcphdr itself is correctly aligned. We can probably count on that being 2-byte aligned w.r.t. the start of the frame, but probably no more since the L2 header is 14 bytes long. If there were odd-length iovs, and the TCP header wasn't in the first, that could destroy its alignment though. Actually... 
the same thing could happen in the first IOV if iov_base wasn't aligned, which is admittedly probably even less likely. I think we want another helper on top of iov_skip_bytes(). It would retrieve a pointer to a field of a given length and offset within the IOV, returning NULL if that can't be found contiguously. It could have a macro wrapper that fills in some of the details based on a type. For now I'd imagine we just give up if it returns NULL, but that's enough to reduce a potential out of bounds memory access to merely breaking one connection. If we ever need it, we can add a slow path to handle that case. There are a couple of other curly cases to consider too, alas: what if the field you request does exist contiguously, but isn't properly aligned for the type we want to access it as? Then there's the question of whether doing this will run afoul of the type-based aliasing rules. In our case, "check" is a field of "struct tcphdr", I think it's sane to think it is correctly aligned. I don't want to write complicated code only to write the checksum of the tcp header. I agree, but I think we should at least test and bail with an error message if our assumptions about the alignments of the IOVs we're given aren't true. -- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson
On 25/09/2024 09:01, David Gibson wrote: Do you think an ASSERT() is enough? Thanks, Laurent I don't want to write complicated code only to write the checksum of the tcp header. I agree, but I think we should at least test and bail with an error message if our assumptions about the alignments of the IOVs we're given aren't true.
On Wed, Sep 25, 2024 at 09:27:03AM +0200, Laurent Vivier wrote: On 25/09/2024 09:01, David Gibson wrote: It probably shouldn't be an ASSERT(); that would indicate hitting it was a bug in passt, whereas IIUC here it would be triggered by strange behaviour from the guest or qemu. I think a die() would suffice, though. Of course, just resetting the tap connection or even just dropping that flow would be even better, but given the unlikeliness of the event, I think die() would be fine. -- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson Do you think an ASSERT() is enough? I don't want to write complicated code only to write the checksum of the tcp header. I agree, but I think we should at least test and bail with an error message if our assumptions about the alignments of the IOVs we're given aren't true.