ememos / GiantVM


DSM is too slow for real application.. #1

Open solemnify opened 3 years ago

solemnify commented 3 years ago

The major performance problem of GiantVM is the slow performance of its DSM layer. There are many known and unknown reasons for this slowness.

Known issues are...

Further investigation is needed to identify more of the reasons for the slow performance.

YWHyuk commented 2 years ago

DSM EPT fault profile log

1) fast path

262817.007583 |    1)               |  kvm_dsm_vcpu_acquire_page [kvm]() {
262817.007583 |    1)   0.093 us    |    kvm_vcpu_gfn_to_memslot [kvm]();
262817.007584 |    1)               |    __kvm_dsm_acquire_page [kvm]() {
262817.007584 |    1)               |      dsm_lock [kvm]() {
262817.007584 |    1)   0.086 us    |        mutex_trylock();
262817.007585 |    1)   0.791 us    |      }
262817.007586 |    1)               |      kvm_dsm_page_fault [kvm]() {
262817.007586 |    1)   0.154 us    |        ktime_get_real_ts64();
262817.007587 |    1)               |        ivy_kvm_dsm_page_fault [kvm]() {
262817.007587 |    1)               |          mutex_lock() {
262817.007588 |    1)   0.073 us    |            rcu_all_qs();
262817.007588 |    1)   0.827 us    |          }
262817.007589 |    1)   1.599 us    |        }
262817.007589 |    1)   0.130 us    |        ktime_get_real_ts64();
262817.007590 |    1)   4.091 us    |      }
262817.007590 |    1)   6.297 us    |    }
262817.007591 |    1)   7.802 us    |  }

2) slow path

262817.007681 |    1)               |  kvm_dsm_vcpu_acquire_page [kvm]() {
262817.007682 |    1)   0.167 us    |    kvm_vcpu_gfn_to_memslot [kvm]();
262817.007683 |    1)               |    __kvm_dsm_acquire_page [kvm]() {
262817.007684 |    1)               |      dsm_lock [kvm]() {
262817.007685 |    1)   0.266 us    |        mutex_trylock();
262817.007686 |    1)   1.802 us    |      }
262817.007687 |    1)               |      kvm_dsm_page_fault [kvm]() {
262817.007687 |    1)   0.210 us    |        ktime_get_real_ts64();
262817.007689 |    1)               |        ivy_kvm_dsm_page_fault [kvm]() {
262817.007689 |    1)   2.533 us    |          kmem_cache_alloc_trace()
262817.007693 |    1)               |          kvm_dsm_fetch [kvm]() {
262817.007693 |    1)               |            krdma_send [kvm]() {
262817.007693 |    1)               |              mutex_lock() {
262817.007694 |    1)   0.077 us    |                rcu_all_qs();
262817.007694 |    1)   0.791 us    |              }
262817.007695 |    1)               |              mlx5_ib_post_send [mlx5_ib]() {
262817.007695 |    1)   0.076 us    |                _raw_spin_lock_irqsave();
262817.007696 |    1)               |                begin_wqe [mlx5_ib]() {
262817.007696 |    1)   0.076 us    |                  mlx5_wq_overflow [mlx5_ib]();
262817.007697 |    1)   0.885 us    |                }
262817.007698 |    1)   0.094 us    |                finish_wqe [mlx5_ib]();
262817.007699 |    1)   3.380 us    |              }
262817.007699 |    1)               |              krdma_poll [kvm]() {
262817.007699 |    1)   0.074 us    |                rcu_all_qs();
262817.007700 |    1)               |                mlx5_ib_poll_cq [mlx5_ib]() {
262817.007701 |    1)   0.086 us    |                  _raw_spin_lock_irqsave();
262817.007701 |    1)   0.197 us    |                  get_sw_cqe [mlx5_ib]();
262817.007703 |    1)   2.006 us    |                }
262817.007703 |    1)   3.614 us    |              }
262817.007703 |    1)   0.086 us    |              mutex_unlock();
262817.007704 |    1) + 10.809 us   |            }
262817.007705 |    1)               |            krdma_receive [kvm]() {
262817.007705 |    1)               |              mutex_lock() {
262817.007705 |    1)   0.077 us    |                rcu_all_qs();
262817.007706 |    1)   0.771 us    |              }
262817.007706 |    1)               |              krdma_poll [kvm]() {
262817.007707 |    1)   0.076 us    |                rcu_all_qs();
262817.007708 |    1)               |                mlx5_ib_poll_cq [mlx5_ib]() {
262817.007708 |    1)   0.080 us    |                  _raw_spin_lock_irqsave();
262817.007709 |    1)   0.220 us    |                  get_sw_cqe [mlx5_ib]();
262817.007710 |    1)   1.792 us    |                }
262817.007710 |    1)   3.251 us    |              }
262817.007710 |    1)   0.074 us    |              mutex_unlock();
262817.007711 |    1)   0.143 us    |              ktime_get();
262817.007712 |    1) + 20.236 us   |              schedule_hrtimeout_range();
262817.007733 |    1)               |              mutex_lock() {
262817.007734 |    1)   0.081 us    |                rcu_all_qs();
262817.007735 |    1)   0.874 us    |              }
262817.007735 |    1)               |              krdma_poll [kvm]() {
262817.007735 |    1)   0.077 us    |                rcu_all_qs();
262817.007736 |    1)               |                mlx5_ib_poll_cq [mlx5_ib]() {
262817.007736 |    1)   0.080 us    |                  _raw_spin_lock_irqsave();
262817.007737 |    1)   0.174 us    |                  get_sw_cqe [mlx5_ib]();
262817.007738 |    1)               |                  rdma_port_get_link_layer [ib_core]() {
262817.007739 |    1)   0.090 us    |                    mlx5_ib_port_link_layer [mlx5_ib]();
262817.007739 |    1)   0.924 us    |                  }
262817.007740 |    1)   3.461 us    |                }
262817.007740 |    1)               |                krdma_post_recv [kvm]() {
262817.007741 |    1)               |                  mlx5_ib_post_recv [mlx5_ib]() {
262817.007741 |    1)   0.077 us    |                    _raw_spin_lock_irqsave();
262817.007742 |    1)   0.076 us    |                    mlx5_wq_overflow [mlx5_ib]();
262817.007743 |    1)   1.515 us    |                  }
262817.007743 |    1)   2.496 us    |                }
262817.007743 |    1)   8.223 us    |              }
262817.007744 |    1)   0.080 us    |              mutex_unlock();
262817.007745 |    1) + 39.896 us   |            }
262817.007745 |    1) + 52.254 us   |          }
262817.007746 |    1)               |          dsm_decode_diff [kvm]() {
262817.007746 |    1)   2.222 us    |            kmem_cache_alloc_trace();
262817.007749 |    1)   5.099 us    |            __kvm_read_guest_page [kvm]()
262817.007755 |    1)   0.096 us    |            xbzrle_decode_buffer [kvm]();
262817.007756 |    1)   0.104 us    |            kfree();
262817.007756 |    1) + 10.485 us   |          }
262817.007757 |    1)   4.018 us    |          __kvm_write_guest_page [kvm]()
262817.007761 |    1)   0.094 us    |          kvm_dsm_pf_trace [kvm]();
262817.007762 |    1)   0.104 us    |          kfree();
262817.007763 |    1) + 74.009 us   |        }
262817.007763 |    1)   0.160 us    |        ktime_get_real_ts64();
262817.007764 |    1) + 77.066 us   |      }
262817.007765 |    1) + 80.850 us   |    }
262817.007765 |    1) + 83.196 us   |  }
ememos commented 2 years ago

That's a great tracing result!

The biggest bottleneck in GiantVM is the DSM. ivy_kvm_dsm_page_fault() is a core function that implements the DSM and is called during page fault handling. Take a look at the ivy_kvm_dsm_page_fault() code.

If you can find out what causes the frequent calls to ivy_kvm_dsm_page_fault(), and in particular the communication between nodes, you are taking the first step towards finding the cause of GiantVM's slow performance.

Below are the main points you should know about at the DSM, guest OS, and application levels in order to improve GiantVM performance.

  1. DSM protocol : There is overhead inherent in the current DSM protocol implementation. Some examples are as follows (a minimal sketch of the write-invalidate behavior appears after this list).

    • Communication is done in units of 4 KB pages.
    • GiantVM implements the Ivy protocol, which invalidates the remote node's copy of a page on every write. The overhead of this behavior is large.
    • In addition, the read operation is implemented by modifying the original Ivy protocol: for example, a read steals page ownership from the remote node, which also generates network traffic.
  2. Application or guest OS kernel code that does not fit the logic of the currently implemented DSM : For example, if multiple threads of an application continuously update a common page, performance degradation becomes severe.

    • If multiple threads are running, data in the same page may be continuously updated by different threads. Suppose a thread running on node A writes data d to a common page P. If a remote thread running on node B wants to update d to d', it invalidates node A's copy of P, brings P to node B, and updates d to d'. However, when the thread on node A then tries to update d' to d'', it must invalidate the page on node B and bring P back to A. There are application patterns in which this process repeats over and over again.

    • Even at the guest OS kernel level, if a data structure such as a spinlock or semaphore is shared by multiple threads and continuously updated by them, the performance problem becomes serious. In this case the invalidate-write sequence ping-pongs between GiantVM nodes, which degrades performance.

    • For your information, due to the structure of the current GiantVM, I/O is concentrated on the master node. Even so, the degradation caused by the kernel data structures used for I/O (page cache, inode, dentry, etc.) ping-ponging between nodes is expected to be quite serious.

    • There may also be code that causes "false sharing" in the application or guest OS.

    • Thread T1 running on node A accesses data1, and thread T2 running on node B accesses data2. data1 and data2 are unrelated but live on the same page, so when T1 updates data1 it invalidates the common page on node B and fetches it to node A, and when T2 then updates data2 it invalidates the page on A and fetches it back to B. There may be applications or guest OS kernels with such false-sharing access patterns (see the C sketch after this list).

    • In summary, data structures, data placement, and so on can be adjusted to fit the DSM logic so that communication with remote nodes occurs as little as possible.

  3. The implementation of the network functions : There is also room for optimizing the InfiniBand-based communication implementation, the lowest layer of the DSM.

    • For example, in krdma_receive() you can see code that degrades performance due to repeated poll/sleep cycles. However, if you change this to pure busy-waiting, CPU usage rises sharply. You can either find a good compromise between the two or propose a new API that improves on this (a spin-then-sleep sketch appears below).
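
For point 1, here is a minimal, hedged sketch of an Ivy-style write-invalidate handler at 4 KB granularity. It is not the GiantVM/kvm-dsm code: the state names, the copyset bookkeeping, and the dsm_fetch_page()/dsm_send_invalidate() helpers are hypothetical and only illustrate why a single guest write can cost one fetch round trip plus one invalidate message per sharer.

/* Hypothetical Ivy-style write-fault handler (illustration only). */
#define DSM_PAGE_SIZE 4096
#define DSM_MAX_NODES 8

enum dsm_state { DSM_INVALID, DSM_SHARED, DSM_OWNED };

struct dsm_page {
	enum dsm_state state;
	unsigned char  copyset[DSM_MAX_NODES];  /* which nodes hold a copy */
	int            owner;                   /* current owner node */
};

/* Assumed networking helpers: each call is at least one message on the wire. */
void dsm_fetch_page(int owner, void *buf);
void dsm_send_invalidate(int node, struct dsm_page *pg);

int dsm_handle_write_fault(struct dsm_page *pg, int self, void *buf)
{
	int node;

	if (pg->state == DSM_OWNED)
		return 0;                       /* fast path: already exclusive */

	/* Slow path: one round trip to fetch the current 4 KB copy ... */
	dsm_fetch_page(pg->owner, buf);

	/* ... plus one invalidate message for every other node with a copy. */
	for (node = 0; node < DSM_MAX_NODES; node++) {
		if (node != self && pg->copyset[node]) {
			dsm_send_invalidate(node, pg);
			pg->copyset[node] = 0;
		}
	}

	pg->copyset[self] = 1;
	pg->owner = self;
	pg->state = DSM_OWNED;
	return 0;                               /* the guest write can now proceed */
}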
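
For point 2, here is a minimal userspace sketch of the page ping-pong / false-sharing pattern. counter_a and counter_b are logically independent, but in bad_layout they share one 4 KB page, so on a page-granularity DSM every increment by one thread invalidates the other node's copy; aligning each counter to its own page (good_layout) removes the sharing. The layouts and names are illustrative only.

/* False sharing at DSM page granularity (illustration only). */
#include <pthread.h>
#include <stdio.h>

#define PAGE_SIZE 4096
#define ITERS (1L << 22)

/* "Bad" layout: unrelated counters share one 4 KB page. */
struct bad_layout {
	volatile long counter_a;	/* updated only by thread 1 */
	volatile long counter_b;	/* updated only by thread 2 */
};

/* "Good" layout: each counter sits on its own page, so the page never
 * has to bounce between the two writers. */
struct good_layout {
	volatile long counter_a __attribute__((aligned(PAGE_SIZE)));
	volatile long counter_b __attribute__((aligned(PAGE_SIZE)));
};

struct bad_layout  bad;
struct good_layout good;

static void *bump(void *arg)
{
	volatile long *c = arg;

	for (long i = 0; i < ITERS; i++)
		(*c)++;
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	/* Swap &bad.counter_* for &good.counter_* to compare the layouts. */
	pthread_create(&t1, NULL, bump, (void *)&bad.counter_a);
	pthread_create(&t2, NULL, bump, (void *)&bad.counter_b);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);

	printf("a=%ld b=%ld\n", bad.counter_a, bad.counter_b);
	return 0;
}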
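
For point 3, the slow-path trace above shows a single schedule_hrtimeout_range() costing about 20 us inside krdma_receive(). One common compromise between pure sleeping and pure busy-waiting is to spin on the completion queue for a small budget before yielding. The sketch below only conveys that shape: struct krdma_cb, krdma_poll_once(), and its return convention are assumptions, not the existing krdma interface.

/* Hypothetical hybrid completion wait: spin briefly, then sleep in short
 * slices. krdma_poll_once() and struct krdma_cb are assumed, not the real
 * krdma_* API in GiantVM. Must be called from a sleepable context. */
#include <linux/ktime.h>
#include <linux/delay.h>

#define KRDMA_SPIN_BUDGET_NS	5000	/* ~5 us of busy polling, tunable */

struct krdma_cb;
static int krdma_poll_once(struct krdma_cb *cb);	/* assumed: >0 when a completion arrived */

static int krdma_wait_for_completion(struct krdma_cb *cb)
{
	ktime_t deadline = ktime_add_ns(ktime_get(), KRDMA_SPIN_BUDGET_NS);
	int n;

	/* Phase 1: busy-poll while the reply is likely already in flight. */
	do {
		n = krdma_poll_once(cb);
		if (n > 0)
			return 0;
		cpu_relax();
	} while (ktime_before(ktime_get(), deadline));

	/* Phase 2: give up the CPU in short slices instead of spinning forever. */
	while ((n = krdma_poll_once(cb)) == 0)
		usleep_range(10, 20);

	return n > 0 ? 0 : n;
}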

You can try to improve GiantVM performance from the various viewpoints described above.

The key point is to find out what causes the frequent inter-node communication inside the DSM, triggered via page faults, and to think about how to reduce it.

YWHyuk commented 2 years ago

In the x86 config, there is CONFIG_X86_INTERNODE_CACHE_SHIFT. It is used as follows:

#define INTERNODE_CACHE_SHIFT CONFIG_X86_INTERNODE_CACHE_SHIFT
#define INTERNODE_CACHE_BYTES (1 << INTERNODE_CACHE_SHIFT)

#ifdef CONFIG_X86_VSMP
#ifdef CONFIG_SMP
#define __cacheline_aligned_in_smp                  \
    __attribute__((__aligned__(INTERNODE_CACHE_BYTES)))     \
    __page_aligned_data
#endif
#endif

Also, __cacheline_aligned_in_smp is used in various places:

static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock);
atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS] __cacheline_aligned_in_smp;
...
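
For context (from arch/x86/Kconfig.cpu, as far as I understand it): CONFIG_X86_INTERNODE_CACHE_SHIFT defaults to 12 when CONFIG_X86_VSMP is set and otherwise falls back to X86_L1_CACHE_SHIFT (usually 6). Under the vSMP-style setting, every __cacheline_aligned_in_smp symbol is therefore aligned and padded to a full page rather than a cache line:

INTERNODE_CACHE_BYTES = 1 << 12 = 4096 bytes   (CONFIG_X86_VSMP default)
L1_CACHE_BYTES        = 1 <<  6 =   64 bytes   (typical x86 cache line)

So each aligned symbol can cost up to 64x more padding than in an ordinary SMP kernel.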

Let's look at some symbols in each kernel.

This is the memcpy benchmark result; on each kernel it was measured 100 times with perf.

Available memory has been reduced by 31MB.

baiksong commented 2 years ago

Great job! Looks like quite meaningful results. I would like to see how the amount of memory consumed by the guest kernel changes after you modify the parameter.

YWHyuk commented 2 years ago

Impact report (work in progress)

Relevant sections

$ git grep INTERNODE_CACHE_BYTES
arch/x86/kernel/vmlinux.lds.S:  READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES)
arch/x86/kernel/vmlinux.lds.S:  PERCPU_VADDR(INTERNODE_CACHE_BYTES, 0, :percpu)
arch/x86/kernel/vmlinux.lds.S:  PERCPU_SECTION(INTERNODE_CACHE_BYTES)

Relevant kernel data structures

1. ARCH_MIN_TASKALIGN

# define ARCH_MIN_TASKALIGN            (1 << INTERNODE_CACHE_SHIFT)
void __init fork_init(void)
{
        int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);

        /* create a slab on which task_structs can be allocated */
        task_struct_cachep = kmem_cache_create_usercopy("task_struct",
                        arch_task_struct_size, align, <--------------------- Impact
                        SLAB_PANIC|SLAB_ACCOUNT,
                        useroffset, usersize, NULL);
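
As a rough, back-of-the-envelope illustration of the "Impact" arrow above (all numbers below are made-up example values, not measurements from the GiantVM guest): with INTERNODE_CACHE_SHIFT = 12 the slab alignment for task_struct becomes 4096, so each object is rounded up to the next page multiple, and a few thousand live tasks can add several megabytes of slab usage.

/* Illustrative arithmetic only; the size and task count are hypothetical. */
#include <stdio.h>

#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((unsigned long)(a) - 1))

int main(void)
{
	unsigned long task_size = 9216;   /* hypothetical arch_task_struct_size */
	unsigned long ntasks    = 2000;   /* hypothetical number of live tasks  */

	unsigned long per_obj_64   = ALIGN_UP(task_size, 64);    /*  9216 bytes */
	unsigned long per_obj_4096 = ALIGN_UP(task_size, 4096);  /* 12288 bytes */

	printf("extra slab memory: %lu KiB\n",
	       (per_obj_4096 - per_obj_64) * ntasks / 1024);     /* 6000 KiB */
	return 0;
}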

2. __cacheline_aligned_in_smp

#ifdef CONFIG_X86_VSMP
#ifdef CONFIG_SMP
#define __cacheline_aligned_in_smp                  \
    __attribute__((__aligned__(INTERNODE_CACHE_BYTES)))     \
    __page_aligned_data
#endif
#endif
block/blk-mq.h

struct blk_mq_ctx {
    struct {
        spinlock_t      lock;
        struct list_head    rq_list;
    }  ____cacheline_aligned_in_smp;

    unsigned int        cpu;
    unsigned int        index_hw;

    /* incremented at dispatch time */
    unsigned long       rq_dispatched[2];
    unsigned long       rq_merged;

    /* incremented at completion time */
    unsigned long       ____cacheline_aligned_in_smp rq_completed[2];

    struct request_queue    *queue;
    struct kobject      kobj;
} ____cacheline_aligned_in_smp;
struct list_lru_node {
        /* protects all lists on the node, including per cgroup */
        spinlock_t              lock;
        /* global list, used for the root cgroup in cgroup aware lrus */
        struct list_lru_one     lru;
#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
        /* for cgroup aware lrus points to per cgroup lists, otherwise NULL */
        struct list_lru_memcg   __rcu *memcg_lrus;
#endif
        long nr_items;
} ____cacheline_aligned_in_smp;
fs/aio.c

struct kioctx {
        struct {
                atomic_t        reqs_available;
        } ____cacheline_aligned_in_smp;

        struct {
                spinlock_t      ctx_lock;
                struct list_head active_reqs;   /* used for cancellation */
        } ____cacheline_aligned_in_smp;

        struct {
                struct mutex    ring_lock;
                wait_queue_head_t wait;
        } ____cacheline_aligned_in_smp;

        struct {
                unsigned        tail;
                unsigned        completed_events;
                spinlock_t      completion_lock;
        } ____cacheline_aligned_in_smp;
}
struct super_block {
        ...
        struct list_lru         s_dentry_lru ____cacheline_aligned_in_smp;
        struct list_lru         s_inode_lru ____cacheline_aligned_in_smp;
        ...

        /* s_inode_list_lock protects s_inodes */
        spinlock_t              s_inode_list_lock ____cacheline_aligned_in_smp;
        ...
}
struct zone {
    /* Read-mostly fields */

        ...
    /* Write-intensive fields used from the page allocator */
    ZONE_PADDING(_pad1_)

    /* free areas of different sizes */
    struct free_area    free_area[MAX_ORDER];

    /* zone flags, see below */
    unsigned long       flags;

    /* Primarily protects free_area */
    spinlock_t      lock;

    /* Write-intensive fields used by compaction and vmstats. */
    ZONE_PADDING(_pad2_)
}

Hot executable area