On Fri, 11 Oct 2024 20:07:30 +0200
Stefano Brivio <sbrivio(a)redhat.com> wrote:

> [...]
>
> ...maybe I can try out a kernel with a version of that as
> clear_page_rep() and see what happens.
>
> ...so I tried, it looks like this, but it doesn't boot for some reason:

I played with this a bit more. If I select the AVX2-based page clearing
with:

	if (system_state >= SYSTEM_RUNNING && irq_fpu_usable()) {

instead of just irq_fpu_usable(), the kernel boots and everything works
(also after init).

I tested this in a VM where I can't really get a baseline throughput
that's comparable to the host: iperf3 to iperf3 via loopback gives me
about 50 Gbps (instead of 70 as I get on the host), and the same iperf3
vhost-user test with outbound traffic from the nested, L2 guest yields
about 20 Gbps (instead of 25).

1. The VMOVDQA version I was originally trying looks like this:

Samples: 39K of event 'cycles:P', Event count (approx.): 34909065261
  Children      Self  Command     Shared Object      Symbol
-   94.32%     0.87%  passt.avx2  [kernel.kallsyms]  [k] entry_SYSCALL_64
   - 93.45% entry_SYSCALL_64
      - 93.29% do_syscall_64
         - 79.66% __sys_sendmsg
            - 79.52% ___sys_sendmsg
               - 78.88% ____sys_sendmsg
                  - 78.46% tcp_sendmsg
                     - 66.75% tcp_sendmsg_locked
                        - 25.81% sk_page_frag_refill
                           - 25.73% skb_page_frag_refill
                              - 25.34% alloc_pages_mpol_noprof
                                 - 25.17% __alloc_pages_noprof
                                    - 24.91% get_page_from_freelist
                                       - 23.38% kernel_init_pages
                                            0.88% kernel_fpu_begin_mask
                        - 15.37% tcp_write_xmit
                           - 14.14% __tcp_transmit_skb
                              - 13.31% __ip_queue_xmit
                                 - 11.06% ip_finish_output2
                                    - 10.89% __dev_queue_xmit
                                       - 10.00% __local_bh_enable_ip
                                          - do_softirq.part.0
                                             - handle_softirqs
                                                - 9.86% net_rx_action
                                                   - 7.95% __napi_poll
                                                      + process_backlog
                                                   + 1.17% napi_consume_skb
                                       + 0.61% dev_hard_start_xmit
                                 - 1.56% ip_local_out
                                    - __ip_local_out
                                       - 1.29% nf_hook_slow
                                            1.00% nf_conntrack_in
                        + 14.60% _copy_from_iter
                        + 3.97% __tcp_push_pending_frames
                        + 2.42% tcp_stream_alloc_skb
                        + 2.08% tcp_wmem_schedule
                          0.64% __check_object_size
                     + 11.08% release_sock
         + 4.48% ksys_write
         + 3.57% __x64_sys_epoll_wait
         + 2.26% __x64_sys_getsockopt
           1.09% syscall_exit_to_user_mode
         + 0.90% ksys_read
           0.64% syscall_trace_enter

...that's 24.91% of clock cycles spent on get_page_from_freelist()
instead of the 25.61% I was getting with the original clear_page()
implementation. Checking the annotated output, it doesn't look very...
superscalar:

Samples: 39K of event 'cycles:P', 4000 Hz, Event count (approx.): 34909065261
Percent│     {
       │         return page_to_virt(page);
       │ 32:     mov   %r12,%rbx
       │         sub   vmemmap_base,%rbx
  0.32 │         sar   $0x6,%rbx
  0.02 │         shl   $0xc,%rbx
  0.02 │         add   page_offset_base,%rbx
       │     clear_page():
       │     if (system_state >= SYSTEM_RUNNING && irq_fpu_usable()) {
  0.05 │         cmpl  $0x2,system_state
  0.47 │         jbe   21
  0.01 │         call  irq_fpu_usable
  0.20 │         test  %al,%al
  0.56 │         je    21
       │     kernel_fpu_begin_mask(0);
  0.07 │         xor   %edi,%edi
       │         call  kernel_fpu_begin_mask
       │     MEMSET_AVX2_ZERO(0);
  0.06 │         vpxor %ymm0,%ymm0,%ymm0
       │     for (i = 0; i < BYTES_TO_YMM(PAGE_SIZE); i++)
  0.58 │         lea   0x1000(%rbx),%rax
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * i], 0);
  4.96 │ 6f:     vmovdqa %ymm0,(%rbx)
 71.38 │         vmovdqa %ymm0,0x20(%rbx)
       │     for (i = 0; i < BYTES_TO_YMM(PAGE_SIZE); i++)
  2.81 │         add   $0x40,%rbx
  0.06 │         cmp   %rbx,%rax
 17.22 │         jne   6f
       │     kernel_fpu_end();
       │         call  kernel_fpu_end
       │     kernel_init_pages():
  0.44 │         add   $0x40,%r12
  0.55 │         cmp   %r12,%rbp
  0.07 │         jne   32
       │     clear_highpage_kasan_tagged(page + i);
       │     kasan_enable_current();
       │     }
       │ 8f:     pop   %rbx
  0.11 │         pop   %rbp
       │         pop   %r12
  0.01 │         jmp   __x86_return_thunk
       │ 98:     jmp   __x86_return_thunk
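For reference, the C shape that the annotated output above corresponds to
is roughly the following. This is just a sketch: only the guard, the loop,
and the MEMSET_AVX2_*() / BYTES_TO_YMM() / YMM_BYTES names are visible in
the listing, so the macro bodies, includes, and the exact fallback are my
reconstruction; the real code is in the attached patch.

	#include <linux/kernel.h>	/* system_state, SYSTEM_RUNNING */
	#include <asm/fpu/api.h>	/* irq_fpu_usable(), kernel_fpu_*() */
	#include <asm/page.h>		/* PAGE_SIZE, clear_page_rep() */

	#define YMM_BYTES		(256 / 8)
	#define BYTES_TO_YMM(x)		((x) / YMM_BYTES)

	/* Illustrative macro bodies: the attached patch probably also tells
	 * the compiler about the full 32-byte store, not just one byte.
	 */
	#define MEMSET_AVX2_ZERO(reg)					\
		asm volatile("vpxor %%ymm" #reg ", %%ymm" #reg		\
			     ", %%ymm" #reg ::: "memory")
	#define MEMSET_AVX2_STORE(loc, reg)				\
		asm volatile("vmovdqa %%ymm" #reg ", %0" : "=m" (loc))

	void clear_page(void *page)
	{
		unsigned long i;

		/* The FPU isn't usable in every context, and touching it
		 * too early in boot hangs the kernel: fall back to the
		 * existing implementation in those cases.
		 */
		if (system_state >= SYSTEM_RUNNING && irq_fpu_usable()) {
			kernel_fpu_begin_mask(0);

			MEMSET_AVX2_ZERO(0);
			for (i = 0; i < BYTES_TO_YMM(PAGE_SIZE); i++)
				MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * i], 0);

			kernel_fpu_end();
			return;
		}

		clear_page_rep(page);
	}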
2. Let's try to unroll it:

Samples: 39K of event 'cycles:P', Event count (approx.): 33598124504
  Children      Self  Command     Shared Object      Symbol
+   92.49%     0.33%  passt.avx2  [kernel.kallsyms]  [k] entry_SYSCALL_64_a
-   92.01%     0.47%  passt.avx2  [kernel.kallsyms]  [k] do_syscall_64
   - 91.54% do_syscall_64
      - 75.04% __sys_sendmsg
         - 74.85% ___sys_sendmsg
            - 74.26% ____sys_sendmsg
               - 73.68% tcp_sendmsg
                  - 62.69% tcp_sendmsg_locked
                     - 22.26% sk_page_frag_refill
                        - 22.14% skb_page_frag_refill
                           - 21.74% alloc_pages_mpol_noprof
                              - 21.52% __alloc_pages_noprof
                                 - 21.25% get_page_from_freelist
                                    - 20.04% prep_new_page
                                       - 19.57% clear_page
                                            0.55% kernel_fpu_begin_mask
                     + 15.04% tcp_write_xmit
                     + 13.77% _copy_from_iter
                     + 5.12% __tcp_push_pending_frames
                     + 2.05% tcp_wmem_schedule
                     + 1.86% tcp_stream_alloc_skb
                       0.73% __check_object_size
                  + 10.15% release_sock
                  + 0.62% lock_sock_nested
      + 5.63% ksys_write
      + 4.65% __x64_sys_epoll_wait
      + 2.61% __x64_sys_getsockopt
        1.21% syscall_exit_to_user_mode
      + 1.16% ksys_read
      + 0.84% syscall_trace_enter

annotated:

Samples: 39K of event 'cycles:P', 4000 Hz, Event count (approx.): 33598124504
clear_page  /proc/kcore [Percent: local period]
Percent│
       │    ffffffffb5198480 <load0>:
  0.06 │      push  %rbx
  0.27 │      cmpl  $0x2,0x1ab243c(%rip)
  0.07 │      mov   %rdi,%rbx
       │      ja    1b
       │  d:  mov   %rbx,%rdi
       │      call  clear_page_rep
       │      pop   %rbx
       │      jmp   srso_return_thunk
  0.03 │ 1b:  call  irq_fpu_usable
  0.14 │      test  %al,%al
  0.64 │      je    d
  0.04 │      xor   %edi,%edi
       │      call  kernel_fpu_begin_mask
  0.05 │      vpxor %ymm0,%ymm0,%ymm0
  0.80 │      vmovdqa %ymm0,(%rbx)
  1.12 │      vmovdqa %ymm0,0x20(%rbx)
  0.06 │      vmovdqa %ymm0,0x40(%rbx)
  1.39 │      vmovdqa %ymm0,0x60(%rbx)
  0.24 │      vmovdqa %ymm0,0x80(%rbx)
  0.58 │      vmovdqa %ymm0,0xa0(%rbx)
  0.21 │      vmovdqa %ymm0,0xc0(%rbx)
  0.77 │      vmovdqa %ymm0,0xe0(%rbx)
  0.38 │      vmovdqa %ymm0,0x100(%rbx)
  7.60 │      vmovdqa %ymm0,0x120(%rbx)
  0.26 │      vmovdqa %ymm0,0x140(%rbx)
  1.38 │      vmovdqa %ymm0,0x160(%rbx)
  0.42 │      vmovdqa %ymm0,0x180(%rbx)
  1.25 │      vmovdqa %ymm0,0x1a0(%rbx)
  0.26 │      vmovdqa %ymm0,0x1c0(%rbx)
  0.73 │      vmovdqa %ymm0,0x1e0(%rbx)
  0.33 │      vmovdqa %ymm0,0x200(%rbx)
  1.72 │      vmovdqa %ymm0,0x220(%rbx)
  0.16 │      vmovdqa %ymm0,0x240(%rbx)
  0.61 │      vmovdqa %ymm0,0x260(%rbx)
  0.19 │      vmovdqa %ymm0,0x280(%rbx)
  0.68 │      vmovdqa %ymm0,0x2a0(%rbx)
  0.22 │      vmovdqa %ymm0,0x2c0(%rbx)
  0.66 │      vmovdqa %ymm0,0x2e0(%rbx)
  0.50 │      vmovdqa %ymm0,0x300(%rbx)
  0.67 │      vmovdqa %ymm0,0x320(%rbx)
  0.29 │      vmovdqa %ymm0,0x340(%rbx)
  0.31 │      vmovdqa %ymm0,0x360(%rbx)
  0.14 │      vmovdqa %ymm0,0x380(%rbx)
  0.55 │      vmovdqa %ymm0,0x3a0(%rbx)
  0.35 │      vmovdqa %ymm0,0x3c0(%rbx)
  0.82 │      vmovdqa %ymm0,0x3e0(%rbx)
  0.25 │      vmovdqa %ymm0,0x400(%rbx)
  0.49 │      vmovdqa %ymm0,0x420(%rbx)
  0.18 │      vmovdqa %ymm0,0x440(%rbx)
  1.05 │      vmovdqa %ymm0,0x460(%rbx)
  0.08 │      vmovdqa %ymm0,0x480(%rbx)
  2.22 │      vmovdqa %ymm0,0x4a0(%rbx)
  0.20 │      vmovdqa %ymm0,0x4c0(%rbx)
  2.33 │      vmovdqa %ymm0,0x4e0(%rbx)
  0.03 │      vmovdqa %ymm0,0x500(%rbx)
  2.87 │      vmovdqa %ymm0,0x520(%rbx)
  0.08 │      vmovdqa %ymm0,0x540(%rbx)
  1.60 │      vmovdqa %ymm0,0x560(%rbx)
  0.01 │      vmovdqa %ymm0,0x580(%rbx)
  7.03 │      vmovdqa %ymm0,0x5a0(%rbx)
  0.42 │      vmovdqa %ymm0,0x5c0(%rbx)
  2.74 │      vmovdqa %ymm0,0x5e0(%rbx)
  0.69 │      vmovdqa %ymm0,0x600(%rbx)
  2.34 │      vmovdqa %ymm0,0x620(%rbx)
  0.37 │      vmovdqa %ymm0,0x640(%rbx)
  1.21 │      vmovdqa %ymm0,0x660(%rbx)
  0.22 │      vmovdqa %ymm0,0x680(%rbx)
  1.16 │      vmovdqa %ymm0,0x6a0(%rbx)
  0.29 │      vmovdqa %ymm0,0x6c0(%rbx)
  0.98 │      vmovdqa %ymm0,0x6e0(%rbx)
  0.19 │      vmovdqa %ymm0,0x700(%rbx)
  0.81 │      vmovdqa %ymm0,0x720(%rbx)
  0.47 │      vmovdqa %ymm0,0x740(%rbx)
  0.69 │      vmovdqa %ymm0,0x760(%rbx)
  0.23 │      vmovdqa %ymm0,0x780(%rbx)
  0.68 │      vmovdqa %ymm0,0x7a0(%rbx)
  0.30 │      vmovdqa %ymm0,0x7c0(%rbx)
  0.68 │      vmovdqa %ymm0,0x7e0(%rbx)
  0.25 │      vmovdqa %ymm0,0x800(%rbx)
  0.58 │      vmovdqa %ymm0,0x820(%rbx)
  0.19 │      vmovdqa %ymm0,0x840(%rbx)
  0.83 │      vmovdqa %ymm0,0x860(%rbx)
  0.27 │      vmovdqa %ymm0,0x880(%rbx)
  1.01 │      vmovdqa %ymm0,0x8a0(%rbx)
  0.16 │      vmovdqa %ymm0,0x8c0(%rbx)
  0.89 │      vmovdqa %ymm0,0x8e0(%rbx)
  0.24 │      vmovdqa %ymm0,0x900(%rbx)
  0.98 │      vmovdqa %ymm0,0x920(%rbx)
  0.28 │      vmovdqa %ymm0,0x940(%rbx)
  0.86 │      vmovdqa %ymm0,0x960(%rbx)
  0.23 │      vmovdqa %ymm0,0x980(%rbx)
  1.19 │      vmovdqa %ymm0,0x9a0(%rbx)
  0.28 │      vmovdqa %ymm0,0x9c0(%rbx)
  1.04 │      vmovdqa %ymm0,0x9e0(%rbx)
  0.33 │      vmovdqa %ymm0,0xa00(%rbx)
  0.90 │      vmovdqa %ymm0,0xa20(%rbx)
  0.35 │      vmovdqa %ymm0,0xa40(%rbx)
  0.87 │      vmovdqa %ymm0,0xa60(%rbx)
  0.25 │      vmovdqa %ymm0,0xa80(%rbx)
  0.89 │      vmovdqa %ymm0,0xaa0(%rbx)
  0.28 │      vmovdqa %ymm0,0xac0(%rbx)
  0.92 │      vmovdqa %ymm0,0xae0(%rbx)
  0.23 │      vmovdqa %ymm0,0xb00(%rbx)
  1.39 │      vmovdqa %ymm0,0xb20(%rbx)
  0.29 │      vmovdqa %ymm0,0xb40(%rbx)
  1.15 │      vmovdqa %ymm0,0xb60(%rbx)
  0.26 │      vmovdqa %ymm0,0xb80(%rbx)
  1.33 │      vmovdqa %ymm0,0xba0(%rbx)
  0.29 │      vmovdqa %ymm0,0xbc0(%rbx)
  1.05 │      vmovdqa %ymm0,0xbe0(%rbx)
  0.25 │      vmovdqa %ymm0,0xc00(%rbx)
  0.89 │      vmovdqa %ymm0,0xc20(%rbx)
  0.34 │      vmovdqa %ymm0,0xc40(%rbx)
  0.78 │      vmovdqa %ymm0,0xc60(%rbx)
  0.40 │      vmovdqa %ymm0,0xc80(%rbx)
  0.99 │      vmovdqa %ymm0,0xca0(%rbx)
  0.44 │      vmovdqa %ymm0,0xcc0(%rbx)
  1.06 │      vmovdqa %ymm0,0xce0(%rbx)
  0.35 │      vmovdqa %ymm0,0xd00(%rbx)
  0.85 │      vmovdqa %ymm0,0xd20(%rbx)
  0.46 │      vmovdqa %ymm0,0xd40(%rbx)
  0.88 │      vmovdqa %ymm0,0xd60(%rbx)
  0.38 │      vmovdqa %ymm0,0xd80(%rbx)
  0.82 │      vmovdqa %ymm0,0xda0(%rbx)
  0.40 │      vmovdqa %ymm0,0xdc0(%rbx)
  0.98 │      vmovdqa %ymm0,0xde0(%rbx)
  0.27 │      vmovdqa %ymm0,0xe00(%rbx)
  1.10 │      vmovdqa %ymm0,0xe20(%rbx)
  0.25 │      vmovdqa %ymm0,0xe40(%rbx)
  0.89 │      vmovdqa %ymm0,0xe60(%rbx)
  0.32 │      vmovdqa %ymm0,0xe80(%rbx)
  0.87 │      vmovdqa %ymm0,0xea0(%rbx)
  0.22 │      vmovdqa %ymm0,0xec0(%rbx)
  0.94 │      vmovdqa %ymm0,0xee0(%rbx)
  0.27 │      vmovdqa %ymm0,0xf00(%rbx)
  0.90 │      vmovdqa %ymm0,0xf20(%rbx)
  0.28 │      vmovdqa %ymm0,0xf40(%rbx)
  0.79 │      vmovdqa %ymm0,0xf60(%rbx)
  0.31 │      vmovdqa %ymm0,0xf80(%rbx)
  1.11 │      vmovdqa %ymm0,0xfa0(%rbx)
  0.25 │      vmovdqa %ymm0,0xfc0(%rbx)
  0.99 │      vmovdqa %ymm0,0xfe0(%rbx)
  0.10 │      pop   %rbx
       │      jmp   0xffffffffb4e4b050

...that looks like progress: we now spend 21.25% of the clock cycles on
get_page_from_freelist() (non-AVX: 25.61%). But still, there seem to be
(somewhat unexpected) stalls. For example, after 8 VMOVDQA instructions:

  7.60 │      vmovdqa %ymm0,0x120(%rbx)

we have one where we spend/wait a long time, and there are more later.

3. ...what if we use a non-temporal hint, that is, if we clear the page
without making it cache hot ("stream" instead of "store")?

That's vmovntdq m256, ymm ("nt" meaning non-temporal). It's not
vmovntdqa (where "a" stands for "aligned"), as one might expect from the
vmovdqa above, because there's no unaligned version of the store. The
only vmovntdq_a_ instruction is vmovntdqa ymm, m256 (memory to register,
a "stream load"), because there's an unaligned equivalent in that case.
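Concretely, reusing the macros from the sketch above, the change would
look something like this. Again just a sketch: the attached patch redefines
MEMSET_AVX2_STORE() to the non-temporal form and fully unrolls all 128
stores, while this shows an unroll by eight to keep it short; the helper
name and the fence are my additions, not something taken from the patch.

	/* MEMSET_AVX2_STORE() redefined to use the non-temporal store */
	#undef MEMSET_AVX2_STORE
	#define MEMSET_AVX2_STORE(loc, reg)				\
		asm volatile("vmovntdq %%ymm" #reg ", %0" : "=m" (loc))

	/* Hypothetical helper: assume the same guard as above, falling
	 * back to clear_page_rep() when the FPU isn't usable.
	 */
	static void clear_page_avx2_nt(void *page)
	{
		unsigned long i;

		kernel_fpu_begin_mask(0);
		MEMSET_AVX2_ZERO(0);

		for (i = 0; i < BYTES_TO_YMM(PAGE_SIZE); i += 8) {
			MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * (i + 0)], 0);
			MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * (i + 1)], 0);
			MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * (i + 2)], 0);
			MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * (i + 3)], 0);
			MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * (i + 4)], 0);
			MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * (i + 5)], 0);
			MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * (i + 6)], 0);
			MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * (i + 7)], 0);
		}

		/* Non-temporal stores are weakly ordered, so presumably a
		 * store fence is needed before the page is handed out;
		 * whether and where the actual patch does this isn't shown
		 * in the annotated output below.
		 */
		asm volatile("sfence" ::: "memory");

		kernel_fpu_end();
	}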
Anyway, perf output:

Samples: 39K of event 'cycles:P', Event count (approx.): 33890710610
  Children      Self  Command     Shared Object     Symbol
-   92.62%     0.88%  passt.avx2  [kernel.vmlinux]  [k] entry_SYSCALL
   - 91.74% entry_SYSCALL_64
      - 91.60% do_syscall_64
         - 75.05% __sys_sendmsg
            - 74.88% ___sys_sendmsg
               - 74.22% ____sys_sendmsg
                  - 73.65% tcp_sendmsg
                     - 61.71% tcp_sendmsg_locked
                        - 24.82% _copy_from_iter
                             24.40% rep_movs_alternative
                        - 14.69% sk_page_frag_refill
                           - 14.57% skb_page_frag_refill
                              - 14.19% alloc_pages_mpol_noprof
                                 - 14.03% __alloc_pages_noprof
                                    - 13.77% get_page_from_freelist
                                       - 12.56% prep_new_page
                                          - 12.19% clear_page
                                               0.68% kernel_fpu_begin_mask
                        - 11.12% tcp_write_xmit
                           - 10.17% __tcp_transmit_skb
                              - 9.62% __ip_queue_xmit
                                 - 8.08% ip_finish_output2
                                    - 7.96% __dev_queue_xmit
                                       - 7.26% __local_bh_enable_ip
                                          - 7.24% do_softirq.part.0
                                             - handle_softirqs
                                                - net_rx_action
                                                   + 5.80% __napi_poll
                                                   + 0.87% napi_consume_skb
                                 - 1.06% ip_local_out
                                    - 1.05% __ip_local_out
                                       - 0.90% nf_hook_slow
                                            0.66% nf_conntrack_in
                        + 4.22% __tcp_push_pending_frames
                        + 2.51% tcp_wmem_schedule
                        + 1.99% tcp_stream_alloc_skb
                          0.59% __check_object_size
                     + 11.21% release_sock
                       0.52% lock_sock_nested
         + 5.32% ksys_write
         + 4.75% __x64_sys_epoll_wait
         + 2.45% __x64_sys_getsockopt
           1.29% syscall_exit_to_user_mode
         + 1.25% ksys_read
         + 0.70% syscall_trace_enter

...finally we cut down significantly on the cycles spent clearing pages,
with get_page_from_freelist() taking 13.77% of the cycles instead of
25.61%. That's about half the overhead.

This makes _copy_from_iter() the biggest consumer of cycles under
tcp_sendmsg_locked(), which is what I expected. Does this mean that we
increase the overhead there because we're now causing more cache misses,
or are we simply more efficient? I'm not sure yet.
For completeness, the annotated version of clear_page():

       │     if (system_state >= SYSTEM_RUNNING && irq_fpu_usable()) {
  0.09 │ 1b:   call  irq_fpu_usable
  0.11 │       test  %al,%al
  0.51 │       je    d
       │     kernel_fpu_begin_mask(0);
  0.16 │       xor   %edi,%edi
       │       call  kernel_fpu_begin_mask
       │     MEMSET_AVX2_ZERO(0);
  0.05 │       vpxor %ymm0,%ymm0,%ymm0
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x00], 0);
  0.79 │       vmovntdq %ymm0,(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x01], 0);
  2.46 │       vmovntdq %ymm0,0x20(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x02], 0);
  0.07 │       vmovntdq %ymm0,0x40(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x03], 0);
  1.35 │       vmovntdq %ymm0,0x60(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x04], 0);
  0.18 │       vmovntdq %ymm0,0x80(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x05], 0);
  1.40 │       vmovntdq %ymm0,0xa0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x06], 0);
  0.11 │       vmovntdq %ymm0,0xc0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x07], 0);
  0.81 │       vmovntdq %ymm0,0xe0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x08], 0);
  0.07 │       vmovntdq %ymm0,0x100(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x09], 0);
  1.25 │       vmovntdq %ymm0,0x120(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x0a], 0);
  0.08 │       vmovntdq %ymm0,0x140(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x0b], 0);
  1.36 │       vmovntdq %ymm0,0x160(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x0c], 0);
  0.11 │       vmovntdq %ymm0,0x180(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x0d], 0);
  1.73 │       vmovntdq %ymm0,0x1a0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x0e], 0);
  0.09 │       vmovntdq %ymm0,0x1c0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x0f], 0);
  0.97 │       vmovntdq %ymm0,0x1e0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x10], 0);
  0.07 │       vmovntdq %ymm0,0x200(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x11], 0);
  1.25 │       vmovntdq %ymm0,0x220(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x12], 0);
  0.14 │       vmovntdq %ymm0,0x240(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x13], 0);
  0.79 │       vmovntdq %ymm0,0x260(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x14], 0);
  0.09 │       vmovntdq %ymm0,0x280(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x15], 0);
  1.19 │       vmovntdq %ymm0,0x2a0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x16], 0);
  0.07 │       vmovntdq %ymm0,0x2c0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x17], 0);
  1.45 │       vmovntdq %ymm0,0x2e0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x18], 0);
       │       vmovntdq %ymm0,0x300(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x19], 0);
  1.45 │       vmovntdq %ymm0,0x320(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x1a], 0);
  0.05 │       vmovntdq %ymm0,0x340(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x1b], 0);
  1.49 │       vmovntdq %ymm0,0x360(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x1c], 0);
  0.14 │       vmovntdq %ymm0,0x380(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x1d], 0);
  1.34 │       vmovntdq %ymm0,0x3a0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x1e], 0);
  0.09 │       vmovntdq %ymm0,0x3c0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x1f], 0);
  1.69 │       vmovntdq %ymm0,0x3e0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x20], 0);
  0.16 │       vmovntdq %ymm0,0x400(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x21], 0);
  1.15 │       vmovntdq %ymm0,0x420(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x22], 0);
  0.13 │       vmovntdq %ymm0,0x440(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x23], 0);
  1.36 │       vmovntdq %ymm0,0x460(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x24], 0);
  0.07 │       vmovntdq %ymm0,0x480(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x25], 0);
  1.01 │       vmovntdq %ymm0,0x4a0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x26], 0);
  0.09 │       vmovntdq %ymm0,0x4c0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x27], 0);
  1.53 │       vmovntdq %ymm0,0x4e0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x28], 0);
  0.12 │       vmovntdq %ymm0,0x500(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x29], 0);
  1.45 │       vmovntdq %ymm0,0x520(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x2a], 0);
  0.13 │       vmovntdq %ymm0,0x540(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x2b], 0);
  0.97 │       vmovntdq %ymm0,0x560(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x2c], 0);
  0.12 │       vmovntdq %ymm0,0x580(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x2d], 0);
  1.21 │       vmovntdq %ymm0,0x5a0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x2e], 0);
  0.15 │       vmovntdq %ymm0,0x5c0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x2f], 0);
  1.42 │       vmovntdq %ymm0,0x5e0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x30], 0);
  0.19 │       vmovntdq %ymm0,0x600(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x31], 0);
  1.12 │       vmovntdq %ymm0,0x620(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x32], 0);
  0.04 │       vmovntdq %ymm0,0x640(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x33], 0);
  1.59 │       vmovntdq %ymm0,0x660(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x34], 0);
  0.07 │       vmovntdq %ymm0,0x680(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x35], 0);
  1.65 │       vmovntdq %ymm0,0x6a0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x36], 0);
  0.14 │       vmovntdq %ymm0,0x6c0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x37], 0);
  1.00 │       vmovntdq %ymm0,0x6e0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x38], 0);
  0.14 │       vmovntdq %ymm0,0x700(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x39], 0);
  1.31 │       vmovntdq %ymm0,0x720(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x3a], 0);
  0.10 │       vmovntdq %ymm0,0x740(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x3b], 0);
  1.21 │       vmovntdq %ymm0,0x760(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x3c], 0);
  0.07 │       vmovntdq %ymm0,0x780(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x3d], 0);
  1.27 │       vmovntdq %ymm0,0x7a0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x3e], 0);
  0.09 │       vmovntdq %ymm0,0x7c0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x3f], 0);
  1.28 │       vmovntdq %ymm0,0x7e0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x40], 0);
  0.11 │       vmovntdq %ymm0,0x800(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x41], 0);
  1.32 │       vmovntdq %ymm0,0x820(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x42], 0);
  0.09 │       vmovntdq %ymm0,0x840(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x43], 0);
  1.43 │       vmovntdq %ymm0,0x860(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x44], 0);
  0.11 │       vmovntdq %ymm0,0x880(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x45], 0);
  1.21 │       vmovntdq %ymm0,0x8a0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x46], 0);
  0.11 │       vmovntdq %ymm0,0x8c0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x47], 0);
  1.09 │       vmovntdq %ymm0,0x8e0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x48], 0);
  0.07 │       vmovntdq %ymm0,0x900(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x49], 0);
  1.26 │       vmovntdq %ymm0,0x920(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x4a], 0);
  0.16 │       vmovntdq %ymm0,0x940(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x4b], 0);
  1.58 │       vmovntdq %ymm0,0x960(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x4c], 0);
  0.05 │       vmovntdq %ymm0,0x980(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x4d], 0);
  1.54 │       vmovntdq %ymm0,0x9a0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x4e], 0);
  0.07 │       vmovntdq %ymm0,0x9c0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x4f], 0);
  1.66 │       vmovntdq %ymm0,0x9e0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x50], 0);
  0.16 │       vmovntdq %ymm0,0xa00(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x51], 0);
  1.31 │       vmovntdq %ymm0,0xa20(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x52], 0);
  0.20 │       vmovntdq %ymm0,0xa40(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x53], 0);
  1.44 │       vmovntdq %ymm0,0xa60(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x54], 0);
  0.05 │       vmovntdq %ymm0,0xa80(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x55], 0);
  1.52 │       vmovntdq %ymm0,0xaa0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x56], 0);
  0.21 │       vmovntdq %ymm0,0xac0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x57], 0);
  1.09 │       vmovntdq %ymm0,0xae0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x58], 0);
  0.22 │       vmovntdq %ymm0,0xb00(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x59], 0);
  1.58 │       vmovntdq %ymm0,0xb20(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x5a], 0);
  0.12 │       vmovntdq %ymm0,0xb40(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x5b], 0);
  1.46 │       vmovntdq %ymm0,0xb60(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x5c], 0);
  0.04 │       vmovntdq %ymm0,0xb80(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x5d], 0);
  1.62 │       vmovntdq %ymm0,0xba0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x5e], 0);
  0.07 │       vmovntdq %ymm0,0xbc0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x5f], 0);
  1.71 │       vmovntdq %ymm0,0xbe0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x60], 0);
  0.19 │       vmovntdq %ymm0,0xc00(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x61], 0);
  1.89 │       vmovntdq %ymm0,0xc20(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x62], 0);
  0.11 │       vmovntdq %ymm0,0xc40(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x63], 0);
  1.98 │       vmovntdq %ymm0,0xc60(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x64], 0);
  0.16 │       vmovntdq %ymm0,0xc80(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x65], 0);
  1.58 │       vmovntdq %ymm0,0xca0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x66], 0);
  0.13 │       vmovntdq %ymm0,0xcc0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x67], 0);
  1.16 │       vmovntdq %ymm0,0xce0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x68], 0);
  0.09 │       vmovntdq %ymm0,0xd00(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x69], 0);
  1.67 │       vmovntdq %ymm0,0xd20(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x6a], 0);
  0.11 │       vmovntdq %ymm0,0xd40(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x6b], 0);
  1.82 │       vmovntdq %ymm0,0xd60(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x6c], 0);
  0.07 │       vmovntdq %ymm0,0xd80(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x6d], 0);
  1.57 │       vmovntdq %ymm0,0xda0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x6e], 0);
  0.02 │       vmovntdq %ymm0,0xdc0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x6f], 0);
  1.27 │       vmovntdq %ymm0,0xde0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x70], 0);
       │       vmovntdq %ymm0,0xe00(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x71], 0);
  1.48 │       vmovntdq %ymm0,0xe20(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x72], 0);
  0.11 │       vmovntdq %ymm0,0xe40(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x73], 0);
  1.87 │       vmovntdq %ymm0,0xe60(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x74], 0);
  0.16 │       vmovntdq %ymm0,0xe80(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x75], 0);
  1.45 │       vmovntdq %ymm0,0xea0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x76], 0);
  0.07 │       vmovntdq %ymm0,0xec0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x77], 0);
  1.65 │       vmovntdq %ymm0,0xee0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x78], 0);
  0.10 │       vmovntdq %ymm0,0xf00(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x79], 0);
  1.53 │       vmovntdq %ymm0,0xf20(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x7a], 0);
  0.07 │       vmovntdq %ymm0,0xf40(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x7b], 0);
  1.51 │       vmovntdq %ymm0,0xf60(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x7c], 0);
  0.12 │       vmovntdq %ymm0,0xf80(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x7d], 0);
  1.62 │       vmovntdq %ymm0,0xfa0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x7e], 0);
  0.08 │       vmovntdq %ymm0,0xfc0(%rbx)
       │     MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x7f], 0);
  1.62 │       vmovntdq %ymm0,0xfe0(%rbx)
       │     }
  0.13 │       pop   %rbx
       │     kernel_fpu_end();
       │       jmp   ffffffff8104b050 <kernel_fpu_end>

...no stalls on any particular store. Current patch attached if you want
to fry^W test on your laptop.

-- 
Stefano