On Fri, 11 Oct 2024 20:07:30 +0200
Stefano Brivio wrote:

[...]
> ...maybe I can try out a kernel with a version of that as clear_page_rep() and see what happens.
> ...so I tried, it looks like this, but it doesn't boot for some reason:
I played with this a bit more. If I select the AVX2-based page clearing
with:
if (system_state >= SYSTEM_RUNNING && irq_fpu_usable()) {
instead of just irq_fpu_usable(), the kernel boots, and everything
works (also after init).
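
For reference, the whole thing looks more or less like the sketch below. The
BYTES_TO_YMM()/YMM_BYTES helpers and the MEMSET_AVX2_*() wrappers are only
sketched here as plain inline asm (the actual definitions might differ a bit),
but the structure matches the annotated output further down:

/* Needs <linux/kernel.h> for system_state, <asm/fpu/api.h> for
 * irq_fpu_usable() / kernel_fpu_begin_mask() / kernel_fpu_end().
 */
#define YMM_BYTES		(256 / 8)
#define BYTES_TO_YMM(x)		((x) / YMM_BYTES)

/* Sketched wrappers: zero a ymm register, aligned 32-byte store from it */
#define MEMSET_AVX2_ZERO(reg)						\
	asm volatile("vpxor %ymm" #reg ", %ymm" #reg ", %ymm" #reg)
#define MEMSET_AVX2_STORE(loc, reg)					\
	asm volatile("vmovdqa %%ymm" #reg ", %0" : "=m" (loc))

void clear_page(void *page)
{
	unsigned int i;

	/* Don't touch AVX registers before the kernel is far enough into
	 * boot (it doesn't boot otherwise), and only where the FPU state
	 * can actually be saved; otherwise fall back to the string-based
	 * implementation.
	 */
	if (system_state >= SYSTEM_RUNNING && irq_fpu_usable()) {
		kernel_fpu_begin_mask(0);
		MEMSET_AVX2_ZERO(0);
		for (i = 0; i < BYTES_TO_YMM(PAGE_SIZE); i++)
			MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * i], 0);
		kernel_fpu_end();
	} else {
		clear_page_rep(page);
	}
}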
I tested this in a VM, where I can't really get a baseline throughput
comparable to the host's: iperf3 to iperf3 via loopback gives me about
50 Gbps (instead of the 70 I get on the host), and the same iperf3
vhost-user test with outbound traffic from the nested, L2 guest yields
about 20 Gbps (instead of 25).
1. With the VMOVDQA version I was originally trying, the perf output looks like this:
Samples: 39K of event 'cycles:P', Event count (approx.): 34909065261
Children Self Command Shared Object Symbol
- 94.32% 0.87% passt.avx2 [kernel.kallsyms] [k] entry_SYSCALL_64 ◆
- 93.45% entry_SYSCALL_64 ▒
- 93.29% do_syscall_64 ▒
- 79.66% __sys_sendmsg ▒
- 79.52% ___sys_sendmsg ▒
- 78.88% ____sys_sendmsg ▒
- 78.46% tcp_sendmsg ▒
- 66.75% tcp_sendmsg_locked ▒
- 25.81% sk_page_frag_refill ▒
- 25.73% skb_page_frag_refill ▒
- 25.34% alloc_pages_mpol_noprof ▒
- 25.17% __alloc_pages_noprof ▒
- 24.91% get_page_from_freelist ▒
- 23.38% kernel_init_pages ▒
0.88% kernel_fpu_begin_mask ▒
- 15.37% tcp_write_xmit ▒
- 14.14% __tcp_transmit_skb ▒
- 13.31% __ip_queue_xmit ▒
- 11.06% ip_finish_output2 ▒
- 10.89% __dev_queue_xmit ▒
- 10.00% __local_bh_enable_ip ▒
- do_softirq.part.0 ▒
- handle_softirqs ▒
- 9.86% net_rx_action ▒
- 7.95% __napi_poll ▒
+ process_backlog ▒
+ 1.17% napi_consume_skb ▒
+ 0.61% dev_hard_start_xmit ▒
- 1.56% ip_local_out ▒
- __ip_local_out ▒
- 1.29% nf_hook_slow ▒
1.00% nf_conntrack_in ▒
+ 14.60% _copy_from_iter ▒
+ 3.97% __tcp_push_pending_frames ▒
+ 2.42% tcp_stream_alloc_skb ▒
+ 2.08% tcp_wmem_schedule ▒
0.64% __check_object_size ▒
+ 11.08% release_sock ▒
+ 4.48% ksys_write ▒
+ 3.57% __x64_sys_epoll_wait ▒
+ 2.26% __x64_sys_getsockopt ▒
1.09% syscall_exit_to_user_mode ▒
+ 0.90% ksys_read ▒
0.64% syscall_trace_enter ▒
...that's 24.91% of clock cycles spent on get_page_from_freelist(), instead of
the 25.61% I was getting with the original clear_page() implementation.
Checking the annotated output, it doesn't look very... superscalar:
Samples: 39K of event 'cycles:P', 4000 Hz, Event count (approx.): 34909065261
Percent│ { ▒
│ return page_to_virt(page); ▒
│32: mov %r12,%rbx ▒
│ sub vmemmap_base,%rbx ▒
0.32 │ sar $0x6,%rbx ▒
0.02 │ shl $0xc,%rbx ▒
0.02 │ add page_offset_base,%rbx ▒
│ clear_page(): ▒
│ if (system_state >= SYSTEM_RUNNING && irq_fpu_usable()) { ▒
0.05 │ cmpl $0x2,system_state ▒
0.47 │ jbe 21 ▒
0.01 │ call irq_fpu_usable ▒
0.20 │ test %al,%al ▒
0.56 │ je 21 ▒
│ kernel_fpu_begin_mask(0); ▒
0.07 │ xor %edi,%edi ▒
│ call kernel_fpu_begin_mask ▒
│ MEMSET_AVX2_ZERO(0); ▒
0.06 │ vpxor %ymm0,%ymm0,%ymm0 ▒
│ for (i = 0; i < BYTES_TO_YMM(PAGE_SIZE); i++) ▒
0.58 │ lea 0x1000(%rbx),%rax ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * i], 0); ▒
4.96 │6f: vmovdqa %ymm0,(%rbx) ▒
71.38 │ vmovdqa %ymm0,0x20(%rbx) ▒
│ for (i = 0; i < BYTES_TO_YMM(PAGE_SIZE); i++) ▒
2.81 │ add $0x40,%rbx ▒
0.06 │ cmp %rbx,%rax ▒
17.22 │ jne 6f ▒
│ kernel_fpu_end(); ▒
│ call kernel_fpu_end ▒
│ kernel_init_pages(): ▒
0.44 │ add $0x40,%r12 ▒
0.55 │ cmp %r12,%rbp ▒
0.07 │ jne 32 ▒
│ clear_highpage_kasan_tagged(page + i); ▒
│ kasan_enable_current(); ▒
│ } ◆
│8f: pop %rbx ▒
0.11 │ pop %rbp ▒
│ pop %r12 ▒
0.01 │ jmp __x86_return_thunk ▒
│98: jmp __x86_return_thunk ▒
2. Let's try to unroll it:
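
That is, replace the loop with one MEMSET_AVX2_STORE() per 32-byte chunk,
BYTES_TO_YMM(PAGE_SIZE) = 128 of them in total. Just to give the idea (a
sketch, with the middle elided):

	kernel_fpu_begin_mask(0);
	MEMSET_AVX2_ZERO(0);
	MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x00], 0);
	MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x01], 0);
	/* ...one store per 32-byte chunk, all the way up to: */
	MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x7f], 0);
	kernel_fpu_end();

perf output: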
Samples: 39K of event 'cycles:P', Event count (approx.): 33598124504
Children Self Command Shared Object Symbol
+ 92.49% 0.33% passt.avx2 [kernel.kallsyms] [k] entry_SYSCALL_64_a◆
- 92.01% 0.47% passt.avx2 [kernel.kallsyms] [k] do_syscall_64 ▒
- 91.54% do_syscall_64 ▒
- 75.04% __sys_sendmsg ▒
- 74.85% ___sys_sendmsg ▒
- 74.26% ____sys_sendmsg ▒
- 73.68% tcp_sendmsg ▒
- 62.69% tcp_sendmsg_locked ▒
- 22.26% sk_page_frag_refill ▒
- 22.14% skb_page_frag_refill ▒
- 21.74% alloc_pages_mpol_noprof ▒
- 21.52% __alloc_pages_noprof ▒
- 21.25% get_page_from_freelist ▒
- 20.04% prep_new_page ▒
- 19.57% clear_page ▒
0.55% kernel_fpu_begin_mask ▒
+ 15.04% tcp_write_xmit ▒
+ 13.77% _copy_from_iter ▒
+ 5.12% __tcp_push_pending_frames ▒
+ 2.05% tcp_wmem_schedule ▒
+ 1.86% tcp_stream_alloc_skb ▒
0.73% __check_object_size ▒
+ 10.15% release_sock ▒
+ 0.62% lock_sock_nested ▒
+ 5.63% ksys_write ▒
+ 4.65% __x64_sys_epoll_wait ▒
+ 2.61% __x64_sys_getsockopt ▒
1.21% syscall_exit_to_user_mode ▒
+ 1.16% ksys_read ▒
+ 0.84% syscall_trace_enter ▒
annotated:
Samples: 39K of event 'cycles:P', 4000 Hz, Event count (approx.): 33598124504
clear_page /proc/kcore [Percent: local period]
Percent│
│ ffffffffb5198480 <load0>:
0.06 │ push %rbx
0.27 │ cmpl $0x2,0x1ab243c(%rip)
0.07 │ mov %rdi,%rbx
│ ja 1b
│ d: mov %rbx,%rdi
│ call clear_page_rep
│ pop %rbx
│ jmp srso_return_thunk
0.03 │ 1b: call irq_fpu_usable
0.14 │ test %al,%al
0.64 │ je d
0.04 │ xor %edi,%edi
│ call kernel_fpu_begin_mask
0.05 │ vpxor %ymm0,%ymm0,%ymm0
0.80 │ vmovdqa %ymm0,(%rbx)
1.12 │ vmovdqa %ymm0,0x20(%rbx)
0.06 │ vmovdqa %ymm0,0x40(%rbx)
1.39 │ vmovdqa %ymm0,0x60(%rbx)
0.24 │ vmovdqa %ymm0,0x80(%rbx)
0.58 │ vmovdqa %ymm0,0xa0(%rbx)
0.21 │ vmovdqa %ymm0,0xc0(%rbx)
0.77 │ vmovdqa %ymm0,0xe0(%rbx)
0.38 │ vmovdqa %ymm0,0x100(%rbx)
7.60 │ vmovdqa %ymm0,0x120(%rbx)
0.26 │ vmovdqa %ymm0,0x140(%rbx)
1.38 │ vmovdqa %ymm0,0x160(%rbx)
0.42 │ vmovdqa %ymm0,0x180(%rbx)
1.25 │ vmovdqa %ymm0,0x1a0(%rbx)
0.26 │ vmovdqa %ymm0,0x1c0(%rbx)
0.73 │ vmovdqa %ymm0,0x1e0(%rbx)
0.33 │ vmovdqa %ymm0,0x200(%rbx)
1.72 │ vmovdqa %ymm0,0x220(%rbx)
0.16 │ vmovdqa %ymm0,0x240(%rbx)
0.61 │ vmovdqa %ymm0,0x260(%rbx)
0.19 │ vmovdqa %ymm0,0x280(%rbx)
0.68 │ vmovdqa %ymm0,0x2a0(%rbx)
0.22 │ vmovdqa %ymm0,0x2c0(%rbx)
0.66 │ vmovdqa %ymm0,0x2e0(%rbx)
0.50 │ vmovdqa %ymm0,0x300(%rbx)
0.67 │ vmovdqa %ymm0,0x320(%rbx)
0.29 │ vmovdqa %ymm0,0x340(%rbx)
0.31 │ vmovdqa %ymm0,0x360(%rbx)
0.14 │ vmovdqa %ymm0,0x380(%rbx)
0.55 │ vmovdqa %ymm0,0x3a0(%rbx)
0.35 │ vmovdqa %ymm0,0x3c0(%rbx)
0.82 │ vmovdqa %ymm0,0x3e0(%rbx)
0.25 │ vmovdqa %ymm0,0x400(%rbx)
0.49 │ vmovdqa %ymm0,0x420(%rbx) ▒
0.18 │ vmovdqa %ymm0,0x440(%rbx) ▒
1.05 │ vmovdqa %ymm0,0x460(%rbx) ▒
0.08 │ vmovdqa %ymm0,0x480(%rbx) ▒
2.22 │ vmovdqa %ymm0,0x4a0(%rbx) ▒
0.20 │ vmovdqa %ymm0,0x4c0(%rbx) ▒
2.33 │ vmovdqa %ymm0,0x4e0(%rbx) ▒
0.03 │ vmovdqa %ymm0,0x500(%rbx) ▒
2.87 │ vmovdqa %ymm0,0x520(%rbx) ▒
0.08 │ vmovdqa %ymm0,0x540(%rbx) ▒
1.60 │ vmovdqa %ymm0,0x560(%rbx) ▒
0.01 │ vmovdqa %ymm0,0x580(%rbx) ▒
7.03 │ vmovdqa %ymm0,0x5a0(%rbx) ▒
0.42 │ vmovdqa %ymm0,0x5c0(%rbx) ▒
2.74 │ vmovdqa %ymm0,0x5e0(%rbx) ▒
0.69 │ vmovdqa %ymm0,0x600(%rbx) ▒
2.34 │ vmovdqa %ymm0,0x620(%rbx) ▒
0.37 │ vmovdqa %ymm0,0x640(%rbx) ▒
1.21 │ vmovdqa %ymm0,0x660(%rbx) ▒
0.22 │ vmovdqa %ymm0,0x680(%rbx) ▒
1.16 │ vmovdqa %ymm0,0x6a0(%rbx) ▒
0.29 │ vmovdqa %ymm0,0x6c0(%rbx) ▒
0.98 │ vmovdqa %ymm0,0x6e0(%rbx) ▒
0.19 │ vmovdqa %ymm0,0x700(%rbx) ▒
0.81 │ vmovdqa %ymm0,0x720(%rbx) ▒
0.47 │ vmovdqa %ymm0,0x740(%rbx) ▒
0.69 │ vmovdqa %ymm0,0x760(%rbx) ▒
0.23 │ vmovdqa %ymm0,0x780(%rbx) ▒
0.68 │ vmovdqa %ymm0,0x7a0(%rbx) ▒
0.30 │ vmovdqa %ymm0,0x7c0(%rbx) ▒
0.68 │ vmovdqa %ymm0,0x7e0(%rbx) ▒
0.25 │ vmovdqa %ymm0,0x800(%rbx) ◆
0.58 │ vmovdqa %ymm0,0x820(%rbx) ▒
0.19 │ vmovdqa %ymm0,0x840(%rbx) ▒
0.83 │ vmovdqa %ymm0,0x860(%rbx) ▒
0.27 │ vmovdqa %ymm0,0x880(%rbx) ▒
1.01 │ vmovdqa %ymm0,0x8a0(%rbx) ▒
0.16 │ vmovdqa %ymm0,0x8c0(%rbx) ▒
0.89 │ vmovdqa %ymm0,0x8e0(%rbx) ▒
0.24 │ vmovdqa %ymm0,0x900(%rbx) ▒
0.98 │ vmovdqa %ymm0,0x920(%rbx) ▒
0.28 │ vmovdqa %ymm0,0x940(%rbx) ▒
0.86 │ vmovdqa %ymm0,0x960(%rbx) ▒
0.23 │ vmovdqa %ymm0,0x980(%rbx) ▒
1.19 │ vmovdqa %ymm0,0x9a0(%rbx) ▒
0.28 │ vmovdqa %ymm0,0x9c0(%rbx) ▒
1.04 │ vmovdqa %ymm0,0x9e0(%rbx) ▒
0.33 │ vmovdqa %ymm0,0xa00(%rbx) ▒
0.90 │ vmovdqa %ymm0,0xa20(%rbx) ▒
0.35 │ vmovdqa %ymm0,0xa40(%rbx) ▒
0.87 │ vmovdqa %ymm0,0xa60(%rbx) ▒
0.25 │ vmovdqa %ymm0,0xa80(%rbx) ▒
0.89 │ vmovdqa %ymm0,0xaa0(%rbx) ▒
0.28 │ vmovdqa %ymm0,0xac0(%rbx) ▒
0.92 │ vmovdqa %ymm0,0xae0(%rbx) ▒
0.23 │ vmovdqa %ymm0,0xb00(%rbx) ▒
1.39 │ vmovdqa %ymm0,0xb20(%rbx) ▒
0.29 │ vmovdqa %ymm0,0xb40(%rbx) ▒
1.15 │ vmovdqa %ymm0,0xb60(%rbx) ▒
0.26 │ vmovdqa %ymm0,0xb80(%rbx) ▒
1.33 │ vmovdqa %ymm0,0xba0(%rbx) ▒
0.29 │ vmovdqa %ymm0,0xbc0(%rbx) ▒
1.05 │ vmovdqa %ymm0,0xbe0(%rbx) ▒
0.25 │ vmovdqa %ymm0,0xc00(%rbx) ▒
0.89 │ vmovdqa %ymm0,0xc20(%rbx) ▒
0.34 │ vmovdqa %ymm0,0xc40(%rbx) ▒
0.78 │ vmovdqa %ymm0,0xc60(%rbx) ▒
0.40 │ vmovdqa %ymm0,0xc80(%rbx) ▒
0.99 │ vmovdqa %ymm0,0xca0(%rbx) ▒
0.44 │ vmovdqa %ymm0,0xcc0(%rbx) ▒
1.06 │ vmovdqa %ymm0,0xce0(%rbx) ▒
0.35 │ vmovdqa %ymm0,0xd00(%rbx) ▒
0.85 │ vmovdqa %ymm0,0xd20(%rbx) ▒
0.46 │ vmovdqa %ymm0,0xd40(%rbx) ▒
0.88 │ vmovdqa %ymm0,0xd60(%rbx) ▒
0.38 │ vmovdqa %ymm0,0xd80(%rbx) ▒
0.82 │ vmovdqa %ymm0,0xda0(%rbx) ▒
0.40 │ vmovdqa %ymm0,0xdc0(%rbx) ▒
0.98 │ vmovdqa %ymm0,0xde0(%rbx) ▒
0.27 │ vmovdqa %ymm0,0xe00(%rbx) ▒
1.10 │ vmovdqa %ymm0,0xe20(%rbx) ▒
0.25 │ vmovdqa %ymm0,0xe40(%rbx) ▒
0.89 │ vmovdqa %ymm0,0xe60(%rbx) ▒
0.32 │ vmovdqa %ymm0,0xe80(%rbx) ▒
0.87 │ vmovdqa %ymm0,0xea0(%rbx) ▒
0.22 │ vmovdqa %ymm0,0xec0(%rbx) ▒
0.94 │ vmovdqa %ymm0,0xee0(%rbx) ▒
0.27 │ vmovdqa %ymm0,0xf00(%rbx) ▒
0.90 │ vmovdqa %ymm0,0xf20(%rbx) ▒
0.28 │ vmovdqa %ymm0,0xf40(%rbx) ▒
0.79 │ vmovdqa %ymm0,0xf60(%rbx) ▒
0.31 │ vmovdqa %ymm0,0xf80(%rbx) ▒
1.11 │ vmovdqa %ymm0,0xfa0(%rbx) ▒
0.25 │ vmovdqa %ymm0,0xfc0(%rbx) ▒
0.99 │ vmovdqa %ymm0,0xfe0(%rbx) ▒
0.10 │ pop %rbx ▒
│ jmp 0xffffffffb4e4b050 ▒
...that looks like progress: we now spend 21.25% of the clock cycles on
get_page_from_freelist() (non-AVX: 25.61%). But there still seem to be
(somewhat unexpected) stalls. For example, after 8 VMOVDQA instructions:

7.60 │       vmovdqa %ymm0,0x120(%rbx)

we hit one where we spend (or wait) a long time, and there are more later.
3. ...what if we use a non-temporal hint, that is, if we clear the page
without making it cache hot ("stream" instead of "store")?

That's vmovntdq m256, ymm ("nt" meaning non-temporal). It's not vmovntdqa
(where "a" would stand for "aligned"), as one could expect from the vmovdqa
above: the non-temporal store only exists in an aligned form, so there's no
unaligned version to distinguish it from, and the mnemonic doesn't carry the
suffix.

The only vmovntdq_a_ instruction is the load form, vmovntdqa ymm, m256
(memory to register, "stream load"), presumably named that way to tell it
apart from the store.
Anyway, perf output:
Samples: 39K of event 'cycles:P', Event count (approx.): 33890710610
Children Self Command Shared Object Symbol
- 92.62% 0.88% passt.avx2 [kernel.vmlinux] [k] entry_SYSCALL◆
- 91.74% entry_SYSCALL_64 ▒
- 91.60% do_syscall_64 ▒
- 75.05% __sys_sendmsg ▒
- 74.88% ___sys_sendmsg ▒
- 74.22% ____sys_sendmsg ▒
- 73.65% tcp_sendmsg ▒
- 61.71% tcp_sendmsg_locked ▒
- 24.82% _copy_from_iter ▒
24.40% rep_movs_alternative ▒
- 14.69% sk_page_frag_refill ▒
- 14.57% skb_page_frag_refill ▒
- 14.19% alloc_pages_mpol_noprof ▒
- 14.03% __alloc_pages_noprof ▒
- 13.77% get_page_from_freelist ▒
- 12.56% prep_new_page ▒
- 12.19% clear_page ▒
0.68% kernel_fpu_begin_mask ▒
- 11.12% tcp_write_xmit ▒
- 10.17% __tcp_transmit_skb ▒
- 9.62% __ip_queue_xmit ▒
- 8.08% ip_finish_output2 ▒
- 7.96% __dev_queue_xmit ▒
- 7.26% __local_bh_enable_ip ▒
- 7.24% do_softirq.part.0 ▒
- handle_softirqs ▒
- net_rx_action ▒
+ 5.80% __napi_poll ▒
+ 0.87% napi_consume_skb ▒
- 1.06% ip_local_out ▒
- 1.05% __ip_local_out ▒
- 0.90% nf_hook_slow ▒
0.66% nf_conntrack_in ▒
+ 4.22% __tcp_push_pending_frames ▒
+ 2.51% tcp_wmem_schedule ▒
+ 1.99% tcp_stream_alloc_skb ▒
0.59% __check_object_size ▒
+ 11.21% release_sock ▒
0.52% lock_sock_nested ▒
+ 5.32% ksys_write ▒
+ 4.75% __x64_sys_epoll_wait ▒
+ 2.45% __x64_sys_getsockopt ▒
1.29% syscall_exit_to_user_mode ▒
+ 1.25% ksys_read ▒
+ 0.70% syscall_trace_enter ▒
...finally we cut down significantly on the cycles spent clearing pages, with
get_page_from_freelist() taking 13.77% of the cycles instead of 25.61%.
That's about half the overhead.

This makes _copy_from_iter() the biggest consumer of cycles under
tcp_sendmsg_locked(), which is what I expected. Does this mean that we're
increasing the overhead there, because we now cause more cache misses on the
copy, or are we simply more efficient? I'm not sure yet.
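One way to check would be to compare cache miss counters between this kernel
and the non-AVX one on the same test, with something like
'perf stat -e cache-misses,cache-references'.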
For completeness, annotated version of clear_page():
│ if (system_state >= SYSTEM_RUNNING && irq_fpu_usable()) {
0.09 │ 1b: call irq_fpu_usable
0.11 │ test %al,%al
0.51 │ je d
│ kernel_fpu_begin_mask(0);
0.16 │ xor %edi,%edi
│ call kernel_fpu_begin_mask
│ MEMSET_AVX2_ZERO(0);
0.05 │ vpxor %ymm0,%ymm0,%ymm0
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x00], 0);
0.79 │ vmovntdq %ymm0,(%rbx)
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x01], 0);
2.46 │ vmovntdq %ymm0,0x20(%rbx)
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x02], 0);
0.07 │ vmovntdq %ymm0,0x40(%rbx)
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x03], 0);
1.35 │ vmovntdq %ymm0,0x60(%rbx)
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x04], 0);
0.18 │ vmovntdq %ymm0,0x80(%rbx)
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x05], 0);
1.40 │ vmovntdq %ymm0,0xa0(%rbx)
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x06], 0);
0.11 │ vmovntdq %ymm0,0xc0(%rbx)
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x07], 0);
0.81 │ vmovntdq %ymm0,0xe0(%rbx)
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x08], 0);
0.07 │ vmovntdq %ymm0,0x100(%rbx)
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x09], 0);
1.25 │ vmovntdq %ymm0,0x120(%rbx)
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x0a], 0);
0.08 │ vmovntdq %ymm0,0x140(%rbx)
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x0b], 0);
1.36 │ vmovntdq %ymm0,0x160(%rbx)
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x0c], 0);
0.11 │ vmovntdq %ymm0,0x180(%rbx)
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x0d], 0);
1.73 │ vmovntdq %ymm0,0x1a0(%rbx)
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x0e], 0);
0.09 │ vmovntdq %ymm0,0x1c0(%rbx)
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x0f], 0);
0.97 │ vmovntdq %ymm0,0x1e0(%rbx)
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x10], 0);
0.07 │ vmovntdq %ymm0,0x200(%rbx)
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x11], 0);
1.25 │ vmovntdq %ymm0,0x220(%rbx)
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x12], 0);
0.14 │ vmovntdq %ymm0,0x240(%rbx)
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x13], 0);
0.79 │ vmovntdq %ymm0,0x260(%rbx)
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x14], 0); ▒
0.09 │ vmovntdq %ymm0,0x280(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x15], 0); ▒
1.19 │ vmovntdq %ymm0,0x2a0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x16], 0); ▒
0.07 │ vmovntdq %ymm0,0x2c0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x17], 0); ▒
1.45 │ vmovntdq %ymm0,0x2e0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x18], 0); ▒
│ vmovntdq %ymm0,0x300(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x19], 0); ▒
1.45 │ vmovntdq %ymm0,0x320(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x1a], 0); ▒
0.05 │ vmovntdq %ymm0,0x340(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x1b], 0); ▒
1.49 │ vmovntdq %ymm0,0x360(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x1c], 0); ▒
0.14 │ vmovntdq %ymm0,0x380(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x1d], 0); ▒
1.34 │ vmovntdq %ymm0,0x3a0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x1e], 0); ▒
0.09 │ vmovntdq %ymm0,0x3c0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x1f], 0); ▒
1.69 │ vmovntdq %ymm0,0x3e0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x20], 0); ▒
0.16 │ vmovntdq %ymm0,0x400(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x21], 0); ▒
1.15 │ vmovntdq %ymm0,0x420(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x22], 0); ▒
0.13 │ vmovntdq %ymm0,0x440(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x23], 0); ▒
1.36 │ vmovntdq %ymm0,0x460(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x24], 0); ▒
0.07 │ vmovntdq %ymm0,0x480(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x25], 0); ▒
1.01 │ vmovntdq %ymm0,0x4a0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x26], 0); ▒
0.09 │ vmovntdq %ymm0,0x4c0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x27], 0); ◆
1.53 │ vmovntdq %ymm0,0x4e0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x28], 0); ▒
0.12 │ vmovntdq %ymm0,0x500(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x29], 0); ▒
1.45 │ vmovntdq %ymm0,0x520(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x2a], 0); ▒
0.13 │ vmovntdq %ymm0,0x540(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x2b], 0); ▒
0.97 │ vmovntdq %ymm0,0x560(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x2c], 0); ▒
0.12 │ vmovntdq %ymm0,0x580(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x2d], 0); ▒
1.21 │ vmovntdq %ymm0,0x5a0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x2e], 0); ▒
0.15 │ vmovntdq %ymm0,0x5c0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x2f], 0); ▒
1.42 │ vmovntdq %ymm0,0x5e0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x30], 0); ▒
0.19 │ vmovntdq %ymm0,0x600(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x31], 0); ▒
1.12 │ vmovntdq %ymm0,0x620(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x32], 0); ▒
0.04 │ vmovntdq %ymm0,0x640(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x33], 0); ▒
1.59 │ vmovntdq %ymm0,0x660(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x34], 0); ▒
0.07 │ vmovntdq %ymm0,0x680(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x35], 0); ▒
1.65 │ vmovntdq %ymm0,0x6a0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x36], 0); ▒
0.14 │ vmovntdq %ymm0,0x6c0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x37], 0); ▒
1.00 │ vmovntdq %ymm0,0x6e0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x38], 0); ▒
0.14 │ vmovntdq %ymm0,0x700(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x39], 0); ▒
1.31 │ vmovntdq %ymm0,0x720(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x3a], 0); ▒
0.10 │ vmovntdq %ymm0,0x740(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x3b], 0); ▒
1.21 │ vmovntdq %ymm0,0x760(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x3c], 0); ▒
0.07 │ vmovntdq %ymm0,0x780(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x3d], 0); ▒
1.27 │ vmovntdq %ymm0,0x7a0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x3e], 0); ▒
0.09 │ vmovntdq %ymm0,0x7c0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x3f], 0); ▒
1.28 │ vmovntdq %ymm0,0x7e0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x40], 0); ▒
0.11 │ vmovntdq %ymm0,0x800(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x41], 0); ▒
1.32 │ vmovntdq %ymm0,0x820(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x42], 0); ▒
0.09 │ vmovntdq %ymm0,0x840(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x43], 0); ▒
1.43 │ vmovntdq %ymm0,0x860(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x44], 0); ▒
0.11 │ vmovntdq %ymm0,0x880(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x45], 0); ▒
1.21 │ vmovntdq %ymm0,0x8a0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x46], 0); ▒
0.11 │ vmovntdq %ymm0,0x8c0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x47], 0); ▒
1.09 │ vmovntdq %ymm0,0x8e0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x48], 0); ▒
0.07 │ vmovntdq %ymm0,0x900(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x49], 0); ▒
1.26 │ vmovntdq %ymm0,0x920(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x4a], 0); ▒
0.16 │ vmovntdq %ymm0,0x940(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x4b], 0); ▒
1.58 │ vmovntdq %ymm0,0x960(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x4c], 0); ▒
0.05 │ vmovntdq %ymm0,0x980(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x4d], 0); ▒
1.54 │ vmovntdq %ymm0,0x9a0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x4e], 0); ▒
0.07 │ vmovntdq %ymm0,0x9c0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x4f], 0); ▒
1.66 │ vmovntdq %ymm0,0x9e0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x50], 0); ▒
0.16 │ vmovntdq %ymm0,0xa00(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x51], 0); ▒
1.31 │ vmovntdq %ymm0,0xa20(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x52], 0); ▒
0.20 │ vmovntdq %ymm0,0xa40(%rbx) ◆
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x53], 0); ▒
1.44 │ vmovntdq %ymm0,0xa60(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x54], 0); ▒
0.05 │ vmovntdq %ymm0,0xa80(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x55], 0); ▒
1.52 │ vmovntdq %ymm0,0xaa0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x56], 0); ▒
0.21 │ vmovntdq %ymm0,0xac0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x57], 0); ▒
1.09 │ vmovntdq %ymm0,0xae0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x58], 0); ▒
0.22 │ vmovntdq %ymm0,0xb00(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x59], 0); ▒
1.58 │ vmovntdq %ymm0,0xb20(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x5a], 0); ▒
0.12 │ vmovntdq %ymm0,0xb40(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x5b], 0); ▒
1.46 │ vmovntdq %ymm0,0xb60(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x5c], 0); ▒
0.04 │ vmovntdq %ymm0,0xb80(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x5d], 0); ▒
1.62 │ vmovntdq %ymm0,0xba0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x5e], 0); ▒
0.07 │ vmovntdq %ymm0,0xbc0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x5f], 0); ▒
1.71 │ vmovntdq %ymm0,0xbe0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x60], 0); ▒
0.19 │ vmovntdq %ymm0,0xc00(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x61], 0); ▒
1.89 │ vmovntdq %ymm0,0xc20(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x62], 0); ▒
0.11 │ vmovntdq %ymm0,0xc40(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x63], 0); ▒
1.98 │ vmovntdq %ymm0,0xc60(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x64], 0); ▒
0.16 │ vmovntdq %ymm0,0xc80(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x65], 0); ▒
1.58 │ vmovntdq %ymm0,0xca0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x66], 0); ▒
0.13 │ vmovntdq %ymm0,0xcc0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x67], 0); ▒
1.16 │ vmovntdq %ymm0,0xce0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x68], 0); ▒
0.09 │ vmovntdq %ymm0,0xd00(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x69], 0); ▒
1.67 │ vmovntdq %ymm0,0xd20(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x6a], 0); ▒
0.11 │ vmovntdq %ymm0,0xd40(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x6b], 0); ▒
1.82 │ vmovntdq %ymm0,0xd60(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x6c], 0); ▒
0.07 │ vmovntdq %ymm0,0xd80(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x6d], 0); ▒
1.57 │ vmovntdq %ymm0,0xda0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x6e], 0); ▒
0.02 │ vmovntdq %ymm0,0xdc0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x6f], 0); ▒
1.27 │ vmovntdq %ymm0,0xde0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x70], 0); ▒
│ vmovntdq %ymm0,0xe00(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x71], 0); ▒
1.48 │ vmovntdq %ymm0,0xe20(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x72], 0); ▒
0.11 │ vmovntdq %ymm0,0xe40(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x73], 0); ▒
1.87 │ vmovntdq %ymm0,0xe60(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x74], 0); ▒
0.16 │ vmovntdq %ymm0,0xe80(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x75], 0); ▒
1.45 │ vmovntdq %ymm0,0xea0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x76], 0); ▒
0.07 │ vmovntdq %ymm0,0xec0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x77], 0); ▒
1.65 │ vmovntdq %ymm0,0xee0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x78], 0); ▒
0.10 │ vmovntdq %ymm0,0xf00(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x79], 0); ▒
1.53 │ vmovntdq %ymm0,0xf20(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x7a], 0); ▒
0.07 │ vmovntdq %ymm0,0xf40(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x7b], 0); ▒
1.51 │ vmovntdq %ymm0,0xf60(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x7c], 0); ▒
0.12 │ vmovntdq %ymm0,0xf80(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x7d], 0); ▒
1.62 │ vmovntdq %ymm0,0xfa0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x7e], 0); ▒
0.08 │ vmovntdq %ymm0,0xfc0(%rbx) ▒
│ MEMSET_AVX2_STORE(((unsigned char *)page)[YMM_BYTES * 0x7f], 0); ▒
1.62 │ vmovntdq %ymm0,0xfe0(%rbx) ▒
│ } ▒
0.13 │ pop %rbx ▒
│ kernel_fpu_end(); ▒
│ jmp ffffffff8104b050