Re: [PATCH 07/10] pcap: Pass explicit L2 length to pcap_iov()

3 Apr 2026

On 4/3/26 08:20, Stefano Brivio wrote:
...
On Wed,  1 Apr 2026 21:18:23 +0200
Laurent Vivier  wrote:
...
With vhost-user multibuffer frames, the iov can be larger than the
actual L2 frame. The previous approach of computing L2 length as
iov_size() - offset would overcount and write extra bytes into the
pcap file.
Pass the L2 frame length explicitly to pcap_frame() and pcap_iov(),
and write exactly that many bytes instead of the full iov remainder.
Signed-off-by: Laurent Vivier 
---
  pcap.c      | 37 ++++++++++++++++++++++++++++---------
  pcap.h      |  2 +-
  tap.c       |  2 +-
  tcp_vu.c    |  9 ++++++---
  udp_vu.c    |  4 +++-
  vu_common.c |  2 +-
  6 files changed, 40 insertions(+), 16 deletions(-)

diff --git a/pcap.c b/pcap.c
index a026f17e7974..dfe1c61add9a 100644
--- a/pcap.c
+++ b/pcap.c
@@ -52,22 +52,38 @@ struct pcap_pkthdr {
   * @iov:	IO vector containing frame (with L2 headers and tap headers)
   * @iovcnt:	Number of buffers (@iov entries) in frame
   * @offset:	Byte offset of the L2 headers within @iov
+ * @l2len:	Length of L2 frame data to capture
   * @now:	Timestamp
   */
  static void pcap_frame(const struct iovec *iov, size_t iovcnt,
-		       size_t offset, const struct timespec *now)
+		       size_t offset, size_t l2len, const struct timespec *now)
  {
-	size_t l2len = iov_size(iov, iovcnt) - offset;
  struct pcap_pkthdr h = {
      .tv_sec = now->tv_sec,
      .tv_usec = DIV_ROUND_CLOSEST(now->tv_nsec, 1000),
      .caplen = l2len,
      .len = l2len
  };
+	size_t i;
-	if (write_all_buf(pcap_fd, &h, sizeof(h)) < 0 ||
-	    write_remainder(pcap_fd, iov, iovcnt, offset) < 0)
-		debug_perror("Cannot log packet, length %zu", l2len);
+	if (write_all_buf(pcap_fd, &h, sizeof(h)) < 0) {
+		debug_perror("Cannot log packet, packet header error");
+		return;
+	}
+
+	for (i = iov_skip_bytes(iov, iovcnt, offset, &offset);
+	     i < iovcnt && l2len; i++) {
+		size_t n = MIN(l2len, iov[i].iov_len - offset);
+
+		if (write_all_buf(pcap_fd, (char *)iov[i].iov_base + offset,
+				  n) < 0) {
This could be written as:
size_t l = MIN(l2len, iov[i].iov_len - offset), n;
n = write_all_buf(pcap_fd, (char *)iov[i].iov_base + offset, l);
      if (n < 0) {
same number of lines, but slightly clearer I think.
...
+			debug_perror("Cannot log packet");
Should we add back the ", length %zu" part, perhaps, using this 'l'
variable? I think it makes a difference if we see we're failing to
capture a reasonably sized 64k packet (disk full or something) compared
to a 100G-length buffer coming from some calculation gone wrong.
...
+			return;
+		}
+
+		offset = 0;
+		l2len -= n;
+	}
  }
/**
@@ -87,7 +103,7 @@ void pcap(const char *pkt, size_t l2len)
  if (clock_gettime(CLOCK_REALTIME, &now))
      err_perror("Failed to get CLOCK_REALTIME time");
-	pcap_frame(&iov, 1, 0, &now);
+	pcap_frame(&iov, 1, 0, l2len, &now);
  }
/**
@@ -110,7 +126,9 @@ void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n,
      err_perror("Failed to get CLOCK_REALTIME time");
for (i = 0; i < n; i++)
-		pcap_frame(iov + i * frame_parts, frame_parts, offset, &now);
+		pcap_frame(iov + i * frame_parts, frame_parts, offset,
Nit: curly brackets for coding style.
...
+			   iov_size(iov + i * frame_parts, frame_parts) - offset,
+			   &now);
  }
/**
@@ -120,8 +138,9 @@ void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n,
   *		containing packet data to write, including L2 header
   * @iovcnt:	Number of buffers (@iov entries)
   * @offset:	Offset of the L2 frame within the full data length
+ * @l2len:	Length of L2 frame data to capture
   */
-void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset)
+void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset, size_t l2len)
  {
  struct timespec now = { 0 };
@@ -131,7 +150,7 @@ void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset)
  if (clock_gettime(CLOCK_REALTIME, &now))
      err_perror("Failed to get CLOCK_REALTIME time");
-	pcap_frame(iov, iovcnt, offset, &now);
+	pcap_frame(iov, iovcnt, offset, l2len, &now);
  }
/**
diff --git a/pcap.h b/pcap.h
index dface5df4ee6..c171257cbd73 100644
--- a/pcap.h
+++ b/pcap.h
@@ -13,7 +13,7 @@ extern int pcap_fd;
  void pcap(const char *pkt, size_t l2len);
  void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n,
         size_t offset);
-void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset);
+void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset, size_t l2len);
  void pcap_init(struct ctx *c);
#endif /* PCAP_H */
diff --git a/tap.c b/tap.c
index b61199dd699d..007c91864b4e 100644
--- a/tap.c
+++ b/tap.c
@@ -1105,7 +1105,7 @@ void tap_add_packet(struct ctx *c, struct iov_tail *data,
  struct ethhdr eh_storage;
  const struct ethhdr *eh;
-	pcap_iov(data->iov, data->cnt, data->off);
+	pcap_iov(data->iov, data->cnt, data->off, iov_tail_size(data));
This leads to a warning from Coverity Scan:
/home/sbrivio/passt/tap.c:1108:2:
   Type: Evaluation order violation (EVALUATION_ORDER)
/home/sbrivio/passt/tap.c:1108:2:
   read_write_order: In argument #4 of "pcap_iov(data->iov, data->cnt, data->off, iov_tail_size(data))", a call is made to "iov_tail_size(data)".  In argument #1 of this function, the object "data->off" is modified.  This object is also used in "data->off", the argument #3 of the outer function call.  The order in which these arguments are evaluated is not specified, and will vary between platforms.
I'm not sure if we can ever reach this point with a non-pruned iov_tail
so that iov_tail_prune() will actually modify 'off', but just to be
sure I guess we should call iov_tail_size() first, assign its result to
a temporary variable, and use that.
In fact, iov_size() will not change after pruning. The couple (cnt, off) will always point 
to the same offset (but cnt can be higher and off lower).
...
...
eh = IOV_PEEK_HEADER(data, eh_storage);
  if (!eh)
diff --git a/tcp_vu.c b/tcp_vu.c
index 0cd01190d612..329fa969fca1 100644
--- a/tcp_vu.c
+++ b/tcp_vu.c
@@ -143,7 +143,8 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
  vu_flush(vdev, vq, flags_elem, 1);
if (*c->pcap)
-		pcap_iov(&flags_elem[0].in_sg[0], 1, VNET_HLEN);
+		pcap_iov(&flags_elem[0].in_sg[0], 1, VNET_HLEN,
+			 hdrlen + optlen - VNET_HLEN);
Same here, curly brackets.
...
if (flags & DUP_ACK) {
      elem_cnt = vu_collect(vdev, vq, &flags_elem[1], 1,
@@ -159,7 +160,8 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
      	vu_flush(vdev, vq, &flags_elem[1], 1);
if (*c->pcap)
-				pcap_iov(&flags_elem[1].in_sg[0], 1, VNET_HLEN);
+				pcap_iov(&flags_elem[1].in_sg[0], 1, VNET_HLEN,
+					 hdrlen + optlen - VNET_HLEN);
And here.
...
}
  }
  vu_queue_notify(vdev, vq);
@@ -464,7 +466,8 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
      vu_flush(vdev, vq, &elem[head[i]], buf_cnt);
if (*c->pcap)
-			pcap_iov(iov, buf_cnt, VNET_HLEN);
+			pcap_iov(iov, buf_cnt, VNET_HLEN,
+				 dlen + hdrlen - VNET_HLEN);
And here too.
...
conn->seq_to_tap += dlen;
  }
diff --git a/udp_vu.c b/udp_vu.c
index 5421a7d71a19..81491afa7e6a 100644
--- a/udp_vu.c
+++ b/udp_vu.c
@@ -185,6 +185,7 @@ void udp_vu_sock_to_tap(const struct ctx *c, int s, int n, flow_sidx_t tosidx)
  static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE];
  struct vu_dev *vdev = c->vdev;
  struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
+	size_t hdrlen = udp_vu_hdrlen(v6);
  int i;
assert(!c->no_udp);
@@ -230,7 +231,8 @@ void udp_vu_sock_to_tap(const struct ctx *c, int s, int n, flow_sidx_t tosidx)
      	size_t l4len = udp_vu_prepare(c, iov_vu, toside, dlen);
      	if (*c->pcap) {
      		udp_vu_csum(toside, iov_vu, iov_cnt, l4len);
-				pcap_iov(iov_vu, iov_cnt, VNET_HLEN);
+				pcap_iov(iov_vu, iov_cnt, VNET_HLEN,
+					 hdrlen + dlen - VNET_HLEN);
And here as well. Or maybe, in all those cases, we could have a 'size'
temporary variable which should make things a bit more obvious.
I agree

Thanks,
Laurent