On Wed, 7 Feb 2024 15:02:42 +0100 Laurent Vivier <lvivier(a)redhat.com> wrote:On 2/6/24 17:10, Stefano Brivio wrote:Well, those are two different aspects, but anyway, yes, both. Having it in a header file implies that the compiler considers the problem separately for every compilation unit (roughly every .c file, here). If you move it in a source file, the compiler will instead apply some heuristics to decide if it makes sense to inline and, if, yes, you end up with essentially the same result. If I apply this on top of your series: --- diff --git a/iov.c b/iov.c index 38a8e75..cabd6d0 100644 --- a/iov.c +++ b/iov.c @@ -76,3 +76,27 @@ unsigned iov_copy(struct iovec *dst_iov, unsigned int dst_iov_cnt, } return j; } + +size_t iov_from_buf(const struct iovec *iov, unsigned int iov_cnt, + size_t offset, const void *buf, size_t bytes) +{ + if (__builtin_constant_p(bytes) && iov_cnt && + offset <= iov[0].iov_len && bytes <= iov[0].iov_len - offset) { + memcpy((char *)iov[0].iov_base + offset, buf, bytes); + return bytes; + } else { + return iov_from_buf_full(iov, iov_cnt, offset, buf, bytes); + } +} + +size_t iov_to_buf(const struct iovec *iov, const unsigned int iov_cnt, + size_t offset, void *buf, size_t bytes) +{ + if (__builtin_constant_p(bytes) && iov_cnt && + offset <= iov[0].iov_len && bytes <= iov[0].iov_len - offset) { + memcpy(buf, (char *)iov[0].iov_base + offset, bytes); + return bytes; + } else { + return iov_to_buf_full(iov, iov_cnt, offset, buf, bytes); + } +} diff --git a/iov.h b/iov.h index 31fbf6d..598c2ba 100644 --- a/iov.h +++ b/iov.h @@ -12,35 +12,14 @@ size_t iov_from_buf_full(const struct iovec *iov, unsigned int iov_cnt, size_t offset, const void *buf, size_t bytes); size_t iov_to_buf_full(const struct iovec *iov, const unsigned int iov_cnt, size_t offset, void *buf, size_t bytes); - -static inline size_t iov_from_buf(const struct iovec *iov, - unsigned int iov_cnt, size_t offset, - const void *buf, size_t bytes) -{ - if (__builtin_constant_p(bytes) && iov_cnt && - offset <= iov[0].iov_len && bytes <= iov[0].iov_len - offset) { - memcpy((char *)iov[0].iov_base + offset, buf, bytes); - return bytes; - } else { - return iov_from_buf_full(iov, iov_cnt, offset, buf, bytes); - } -} - -static inline size_t iov_to_buf(const struct iovec *iov, - const unsigned int iov_cnt, size_t offset, - void *buf, size_t bytes) -{ - if (__builtin_constant_p(bytes) && iov_cnt && - offset <= iov[0].iov_len && bytes <= iov[0].iov_len - offset) { - memcpy(buf, (char *)iov[0].iov_base + offset, bytes); - return bytes; - } else { - return iov_to_buf_full(iov, iov_cnt, offset, buf, bytes); - } -} - size_t iov_size(const struct iovec *iov, const unsigned int iov_cnt); unsigned iov_copy(struct iovec *dst_iov, unsigned int dst_iov_cnt, const struct iovec *iov, unsigned int iov_cnt, size_t offset, size_t bytes); + +size_t iov_from_buf(const struct iovec *iov, unsigned int iov_cnt, + size_t offset, const void *buf, size_t bytes); + +size_t iov_to_buf(const struct iovec *iov, const unsigned int iov_cnt, + size_t offset, void *buf, size_t bytes); #endif --- and have a look at it: $ CFLAGS="-g" make $ objdump -DxSs passt | less you'll see it's not inlined, but the function simply resolves to a jump to iov_from_buf_full(): 0000000000020ac0 <iov_from_buf>: return iov_from_buf_full(iov, iov_cnt, offset, buf, bytes); 20ac0: e9 db fd ff ff jmp 208a0 <iov_from_buf_full> 20ac5: 66 66 2e 0f 1f 84 00 data16 cs nopw 0x0(%rax,%rax,1) 20acc: 00 00 00 00 because in the usage you make of it (vu_send()), elem->in_sg is never a constant. If we look at the AVX2 version instead: $ objdump -DxSs passt.avx2 | less iov_from_buf() directly becomes iov_from_buf_full() (inlined): 000000000002dfa0 <iov_from_buf>: { 2dfa0: 41 57 push %r15 2dfa2: 41 56 push %r14 2dfa4: 41 89 f6 mov %esi,%r14d for (i = 0, done = 0; (offset || done < bytes) && i < iov_cnt; i++) { 2dfa7: 4c 89 c6 mov %r8,%rsi and it's still called from vu_send() -- not inlined there, probably because iov_from_buf_full() is too big. In the end, the compiler decides to inline iov_from_buf_full() into iov_from_buf(), to drop the "constant" implementation from it (because it wouldn't be used), and I guess that makes sense. -- StefanoOn Fri, 2 Feb 2024 15:11:28 +0100 Laurent Vivier <lvivier(a)redhat.com> wrote: ... diff --git a/iov.h b/iov.h new file mode 100644 index 000000000000..31fbf6d0e1cf --- /dev/null +++ b/iov.h @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* some parts copied from QEMU include/qemu/iov.h */ + +#ifndef IOVEC_H +#define IOVEC_H + +#include <unistd.h> +#include <string.h> + +size_t iov_from_buf_full(const struct iovec *iov, unsigned int iov_cnt, + size_t offset, const void *buf, size_t bytes); +size_t iov_to_buf_full(const struct iovec *iov, const unsigned int iov_cnt, + size_t offset, void *buf, size_t bytes); + +static inline size_t iov_from_buf(const struct iovec *iov, + unsigned int iov_cnt, size_t offset, + const void *buf, size_t bytes) +{ Is there a particular reason to include these two in a header? The compiler will inline as needed if they are in a source file.This code has been introduced in QEMU by: commit ad523bca56a7202d2498c550a41be5c986c4d33c Author: Paolo Bonzini <pbonzini(a)redhat.com> Date: Tue Dec 22 12:03:33 2015 +0100 iov: avoid memcpy for "simple" iov_from_buf/iov_to_buf memcpy can take a large amount of time for small reads and writes. For virtio it is a common case that the first iovec can satisfy the whole read or write. In that case, and if bytes is a constant to avoid excessive growth of code, inline the first iteration into the caller. Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com> Message-id: 1450782213-14227-1-git-send-email-pbonzini(a)redhat.com Signed-off-by: Stefan Hajnoczi <stefanha(a)redhat.com> Is the compiler able to check "bytes" is a constant and inline the function if the definition is in a .c file and not in a .h ?