ememos / GiantVM


DSM is too slow for real application.. #1

Open solemnify opened 3 years ago

solemnify commented 3 years ago

The major performance problem of GiantVM is the slow performance of its DSM layer. There are many known and unknown reasons for this slowness.

Known issues are...

Further investigation is needed to identify more of the reasons for the slow performance.

YWHyuk commented 2 years ago

DSM EPT fault profile log

1) fast path

262817.007583 |    1)               |  kvm_dsm_vcpu_acquire_page [kvm]() {
262817.007583 |    1)   0.093 us    |    kvm_vcpu_gfn_to_memslot [kvm]();
262817.007584 |    1)               |    __kvm_dsm_acquire_page [kvm]() {
262817.007584 |    1)               |      dsm_lock [kvm]() {
262817.007584 |    1)   0.086 us    |        mutex_trylock();
262817.007585 |    1)   0.791 us    |      }
262817.007586 |    1)               |      kvm_dsm_page_fault [kvm]() {
262817.007586 |    1)   0.154 us    |        ktime_get_real_ts64();
262817.007587 |    1)               |        ivy_kvm_dsm_page_fault [kvm]() {
262817.007587 |    1)               |          mutex_lock() {
262817.007588 |    1)   0.073 us    |            rcu_all_qs();
262817.007588 |    1)   0.827 us    |          }
262817.007589 |    1)   1.599 us    |        }
262817.007589 |    1)   0.130 us    |        ktime_get_real_ts64();
262817.007590 |    1)   4.091 us    |      }
262817.007590 |    1)   6.297 us    |    }
262817.007591 |    1)   7.802 us    |  }

2) slow path

262817.007681 |    1)               |  kvm_dsm_vcpu_acquire_page [kvm]() {
262817.007682 |    1)   0.167 us    |    kvm_vcpu_gfn_to_memslot [kvm]();
262817.007683 |    1)               |    __kvm_dsm_acquire_page [kvm]() {
262817.007684 |    1)               |      dsm_lock [kvm]() {
262817.007685 |    1)   0.266 us    |        mutex_trylock();
262817.007686 |    1)   1.802 us    |      }
262817.007687 |    1)               |      kvm_dsm_page_fault [kvm]() {
262817.007687 |    1)   0.210 us    |        ktime_get_real_ts64();
262817.007689 |    1)               |        ivy_kvm_dsm_page_fault [kvm]() {
262817.007689 |    1)   2.533 us    |          kmem_cache_alloc_trace()
262817.007693 |    1)               |          kvm_dsm_fetch [kvm]() {
262817.007693 |    1)               |            krdma_send [kvm]() {
262817.007693 |    1)               |              mutex_lock() {
262817.007694 |    1)   0.077 us    |                rcu_all_qs();
262817.007694 |    1)   0.791 us    |              }
262817.007695 |    1)               |              mlx5_ib_post_send [mlx5_ib]() {
262817.007695 |    1)   0.076 us    |                _raw_spin_lock_irqsave();
262817.007696 |    1)               |                begin_wqe [mlx5_ib]() {
262817.007696 |    1)   0.076 us    |                  mlx5_wq_overflow [mlx5_ib]();
262817.007697 |    1)   0.885 us    |                }
262817.007698 |    1)   0.094 us    |                finish_wqe [mlx5_ib]();
262817.007699 |    1)   3.380 us    |              }
262817.007699 |    1)               |              krdma_poll [kvm]() {
262817.007699 |    1)   0.074 us    |                rcu_all_qs();
262817.007700 |    1)               |                mlx5_ib_poll_cq [mlx5_ib]() {
262817.007701 |    1)   0.086 us    |                  _raw_spin_lock_irqsave();
262817.007701 |    1)   0.197 us    |                  get_sw_cqe [mlx5_ib]();
262817.007703 |    1)   2.006 us    |                }
262817.007703 |    1)   3.614 us    |              }
262817.007703 |    1)   0.086 us    |              mutex_unlock();
262817.007704 |    1) + 10.809 us   |            }
262817.007705 |    1)               |            krdma_receive [kvm]() {
262817.007705 |    1)               |              mutex_lock() {
262817.007705 |    1)   0.077 us    |                rcu_all_qs();
262817.007706 |    1)   0.771 us    |              }
262817.007706 |    1)               |              krdma_poll [kvm]() {
262817.007707 |    1)   0.076 us    |                rcu_all_qs();
262817.007708 |    1)               |                mlx5_ib_poll_cq [mlx5_ib]() {
262817.007708 |    1)   0.080 us    |                  _raw_spin_lock_irqsave();
262817.007709 |    1)   0.220 us    |                  get_sw_cqe [mlx5_ib]();
262817.007710 |    1)   1.792 us    |                }
262817.007710 |    1)   3.251 us    |              }
262817.007710 |    1)   0.074 us    |              mutex_unlock();
262817.007711 |    1)   0.143 us    |              ktime_get();
262817.007712 |    1) + 20.236 us   |              schedule_hrtimeout_range();
262817.007733 |    1)               |              mutex_lock() {
262817.007734 |    1)   0.081 us    |                rcu_all_qs();
262817.007735 |    1)   0.874 us    |              }
262817.007735 |    1)               |              krdma_poll [kvm]() {
262817.007735 |    1)   0.077 us    |                rcu_all_qs();
262817.007736 |    1)               |                mlx5_ib_poll_cq [mlx5_ib]() {
262817.007736 |    1)   0.080 us    |                  _raw_spin_lock_irqsave();
262817.007737 |    1)   0.174 us    |                  get_sw_cqe [mlx5_ib]();
262817.007738 |    1)               |                  rdma_port_get_link_layer [ib_core]() {
262817.007739 |    1)   0.090 us    |                    mlx5_ib_port_link_layer [mlx5_ib]();
262817.007739 |    1)   0.924 us    |                  }
262817.007740 |    1)   3.461 us    |                }
262817.007740 |    1)               |                krdma_post_recv [kvm]() {
262817.007741 |    1)               |                  mlx5_ib_post_recv [mlx5_ib]() {
262817.007741 |    1)   0.077 us    |                    _raw_spin_lock_irqsave();
262817.007742 |    1)   0.076 us    |                    mlx5_wq_overflow [mlx5_ib]();
262817.007743 |    1)   1.515 us    |                  }
262817.007743 |    1)   2.496 us    |                }
262817.007743 |    1)   8.223 us    |              }
262817.007744 |    1)   0.080 us    |              mutex_unlock();
262817.007745 |    1) + 39.896 us   |            }
262817.007745 |    1) + 52.254 us   |          }
262817.007746 |    1)               |          dsm_decode_diff [kvm]() {
262817.007746 |    1)   2.222 us    |            kmem_cache_alloc_trace();
262817.007749 |    1)   5.099 us    |            __kvm_read_guest_page [kvm]()
262817.007755 |    1)   0.096 us    |            xbzrle_decode_buffer [kvm]();
262817.007756 |    1)   0.104 us    |            kfree();
262817.007756 |    1) + 10.485 us   |          }
262817.007757 |    1)   4.018 us    |          __kvm_write_guest_page [kvm]()
262817.007761 |    1)   0.094 us    |          kvm_dsm_pf_trace [kvm]();
262817.007762 |    1)   0.104 us    |          kfree();
262817.007763 |    1) + 74.009 us   |        }
262817.007763 |    1)   0.160 us    |        ktime_get_real_ts64();
262817.007764 |    1) + 77.066 us   |      }
262817.007765 |    1) + 80.850 us   |    }
262817.007765 |    1) + 83.196 us   |  }
ememos commented 2 years ago

That's a great tracing result!

The biggest bottleneck in GiantVM is the DSM. ivy_kvm_dsm_page_fault() is a core function that implements the DSM and is called during page fault handling. Take a look at the ivy_kvm_dsm_page_fault() code.

If you can find out what causes the frequent calls to ivy_kvm_dsm_page_fault(), and in particular the communication between nodes, you are taking the first step towards finding the cause of GiantVM's slow performance.

Below are the main points you should know about at the DSM, guest OS, and application levels in order to improve GiantVM performance.

  1. DSM protocol : There is overhead inherent in the current DSM protocol implementation. Some examples are as follows (a minimal sketch of the write-invalidate behavior appears after this list).

    • Communication is done in units of 4 KB pages.
    • GiantVM implements the Ivy protocol, which invalidates the remote node's copy of a page on every write. The overhead of this behavior is large.
    • In addition, the read operation is implemented by modifying the original Ivy protocol: for example, a read steals page ownership from the remote node, which also generates network traffic.
  2. Application or guest OS kernel code that does not fit the logic of the currently implemented DSM : For example, if multiple threads of an application continuously update a common page, performance degradation becomes severe.

    • If multiple threads are running, data in the same page may be continuously updated by different threads. Suppose a thread running on node A writes data d to a common page P. If a remote thread running on node B wants to update d to d', it invalidates node A's copy of P, brings P to node B, and updates d to d'. However, when the thread on node A then tries to update d' to d'', it must invalidate the page on node B and bring P back to A. There are application patterns in which this process repeats over and over again.

    • Even at the guest OS kernel level, if a data structure such as a spinlock or semaphore is shared by multiple threads and continuously updated by them, the performance problem becomes serious. In this case the invalidate-write sequence ping-pongs between GiantVM nodes, which degrades performance.

    • For your information, due to the structure of the current GiantVM, I/O is concentrated on the master node. Even so, the degradation caused by the kernel data structures used for I/O (page cache, inode, dentry, etc.) ping-ponging between nodes is expected to be quite serious.

    • There may also be code that causes "false sharing" in the application or guest OS.

    • Thread T1 running on node A accesses data1, and thread T2 running on node B accesses data2. data1 and data2 are unrelated but live on the same page, so when T1 updates data1 it invalidates the common page on node B and fetches it to node A, and when T2 then updates data2 it invalidates the page on A and fetches it back to B. There may be applications or guest OS kernels with such false-sharing access patterns (see the C sketch after this list).

    • In summary, data structures, data placement, and so on can be adjusted to fit the DSM logic so that communication with remote nodes occurs as little as possible.

  3. The implementation of the network functions : There is also room for optimizing the InfiniBand-based communication implementation, the lowest layer of the DSM.

    • For example, in krdma_receive() you can see code that degrades performance due to repeated poll/sleep cycles. However, if you change this to pure busy-waiting, CPU usage rises sharply. You can either find a good compromise between the two or propose a new API that improves on this (a spin-then-sleep sketch appears below).
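
For point 1, here is a minimal, hedged sketch of an Ivy-style write-invalidate handler at 4 KB granularity. It is not the GiantVM/kvm-dsm code: the state names, the copyset bookkeeping, and the dsm_fetch_page()/dsm_send_invalidate() helpers are hypothetical and only illustrate why a single guest write can cost one fetch round trip plus one invalidate message per sharer.

/* Hypothetical Ivy-style write-fault handler (illustration only). */
#define DSM_PAGE_SIZE 4096
#define DSM_MAX_NODES 8

enum dsm_state { DSM_INVALID, DSM_SHARED, DSM_OWNED };

struct dsm_page {
	enum dsm_state state;
	unsigned char  copyset[DSM_MAX_NODES];  /* which nodes hold a copy */
	int            owner;                   /* current owner node */
};

/* Assumed networking helpers: each call is at least one message on the wire. */
void dsm_fetch_page(int owner, void *buf);
void dsm_send_invalidate(int node, struct dsm_page *pg);

int dsm_handle_write_fault(struct dsm_page *pg, int self, void *buf)
{
	int node;

	if (pg->state == DSM_OWNED)
		return 0;                       /* fast path: already exclusive */

	/* Slow path: one round trip to fetch the current 4 KB copy ... */
	dsm_fetch_page(pg->owner, buf);

	/* ... plus one invalidate message for every other node with a copy. */
	for (node = 0; node < DSM_MAX_NODES; node++) {
		if (node != self && pg->copyset[node]) {
			dsm_send_invalidate(node, pg);
			pg->copyset[node] = 0;
		}
	}

	pg->copyset[self] = 1;
	pg->owner = self;
	pg->state = DSM_OWNED;
	return 0;                               /* the guest write can now proceed */
}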
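
For point 2, here is a minimal userspace sketch of the page ping-pong / false-sharing pattern. counter_a and counter_b are logically independent, but in bad_layout they share one 4 KB page, so on a page-granularity DSM every increment by one thread invalidates the other node's copy; aligning each counter to its own page (good_layout) removes the sharing. The layouts and names are illustrative only.

/* False sharing at DSM page granularity (illustration only). */
#include <pthread.h>
#include <stdio.h>

#define PAGE_SIZE 4096
#define ITERS (1L << 22)

/* "Bad" layout: unrelated counters share one 4 KB page. */
struct bad_layout {
	volatile long counter_a;	/* updated only by thread 1 */
	volatile long counter_b;	/* updated only by thread 2 */
};

/* "Good" layout: each counter sits on its own page, so the page never
 * has to bounce between the two writers. */
struct good_layout {
	volatile long counter_a __attribute__((aligned(PAGE_SIZE)));
	volatile long counter_b __attribute__((aligned(PAGE_SIZE)));
};

struct bad_layout  bad;
struct good_layout good;

static void *bump(void *arg)
{
	volatile long *c = arg;

	for (long i = 0; i < ITERS; i++)
		(*c)++;
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	/* Swap &bad.counter_* for &good.counter_* to compare the layouts. */
	pthread_create(&t1, NULL, bump, (void *)&bad.counter_a);
	pthread_create(&t2, NULL, bump, (void *)&bad.counter_b);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);

	printf("a=%ld b=%ld\n", bad.counter_a, bad.counter_b);
	return 0;
}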
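
For point 3, the slow-path trace above shows a single schedule_hrtimeout_range() costing about 20 us inside krdma_receive(). One common compromise between pure sleeping and pure busy-waiting is to spin on the completion queue for a small budget before yielding. The sketch below only conveys that shape: struct krdma_cb, krdma_poll_once(), and its return convention are assumptions, not the existing krdma interface.

/* Hypothetical hybrid completion wait: spin briefly, then sleep in short
 * slices. krdma_poll_once() and struct krdma_cb are assumed, not the real
 * krdma_* API in GiantVM. Must be called from a sleepable context. */
#include <linux/ktime.h>
#include <linux/delay.h>

#define KRDMA_SPIN_BUDGET_NS	5000	/* ~5 us of busy polling, tunable */

struct krdma_cb;
static int krdma_poll_once(struct krdma_cb *cb);	/* assumed: >0 when a completion arrived */

static int krdma_wait_for_completion(struct krdma_cb *cb)
{
	ktime_t deadline = ktime_add_ns(ktime_get(), KRDMA_SPIN_BUDGET_NS);
	int n;

	/* Phase 1: busy-poll while the reply is likely already in flight. */
	do {
		n = krdma_poll_once(cb);
		if (n > 0)
			return 0;
		cpu_relax();
	} while (ktime_before(ktime_get(), deadline));

	/* Phase 2: give up the CPU in short slices instead of spinning forever. */
	while ((n = krdma_poll_once(cb)) == 0)
		usleep_range(10, 20);

	return n > 0 ? 0 : n;
}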

You can try to improve GiantVM performance from the various viewpoints described above.

The key point is to find out what causes the frequent inter-node communication inside the DSM, triggered via page faults, and to think about how to reduce it.

YWHyuk commented 2 years ago

In the x86 config, there is CONFIG_X86_INTERNODE_CACHE_SHIFT. It is used as follows:

#define INTERNODE_CACHE_SHIFT CONFIG_X86_INTERNODE_CACHE_SHIFT
#define INTERNODE_CACHE_BYTES (1 << INTERNODE_CACHE_SHIFT)

#ifdef CONFIG_X86_VSMP
#ifdef CONFIG_SMP
#define __cacheline_aligned_in_smp                  \
    __attribute__((__aligned__(INTERNODE_CACHE_BYTES)))     \
    __page_aligned_data
#endif
#endif

Also, __cacheline_aligned_in_smp is used in various places:

static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock);
atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS] __cacheline_aligned_in_smp;
...
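
For context (from arch/x86/Kconfig.cpu, as far as I understand it): CONFIG_X86_INTERNODE_CACHE_SHIFT defaults to 12 when CONFIG_X86_VSMP is set and otherwise falls back to X86_L1_CACHE_SHIFT (usually 6). Under the vSMP-style setting, every __cacheline_aligned_in_smp symbol is therefore aligned and padded to a full page rather than a cache line:

INTERNODE_CACHE_BYTES = 1 << 12 = 4096 bytes   (CONFIG_X86_VSMP default)
L1_CACHE_BYTES        = 1 <<  6 =   64 bytes   (typical x86 cache line)

So each aligned symbol can cost up to 64x more padding than in an ordinary SMP kernel.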

Let's look at some symbols in each kernel.

This is the memcpy benchmark result; on each kernel it was measured 100 times with perf.

Available memory has been reduced by 31MB.

baiksong commented 2 years ago

Great job! Looks like quite meaningful results. I would like to see how the amount of memory consumed by the guest kernel changes after you modify the parameter.

YWHyuk commented 2 years ago

Impact report (work in progress)

Relevant sections

$ git grep INTERNODE_CACHE_BYTES
arch/x86/kernel/vmlinux.lds.S:  READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES)
arch/x86/kernel/vmlinux.lds.S:  PERCPU_VADDR(INTERNODE_CACHE_BYTES, 0, :percpu)
arch/x86/kernel/vmlinux.lds.S:  PERCPU_SECTION(INTERNODE_CACHE_BYTES)

Relevant kernel data structures

1. ARCH_MIN_TASKALIGN

# define ARCH_MIN_TASKALIGN            (1 << INTERNODE_CACHE_SHIFT)
void __init fork_init(void)
{
        int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);

        /* create a slab on which task_structs can be allocated */
        task_struct_cachep = kmem_cache_create_usercopy("task_struct",
                        arch_task_struct_size, align, <--------------------- Impact
                        SLAB_PANIC|SLAB_ACCOUNT,
                        useroffset, usersize, NULL);
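
As a rough, back-of-the-envelope illustration of the "Impact" arrow above (all numbers below are made-up example values, not measurements from the GiantVM guest): with INTERNODE_CACHE_SHIFT = 12 the slab alignment for task_struct becomes 4096, so each object is rounded up to the next page multiple, and a few thousand live tasks can add several megabytes of slab usage.

/* Illustrative arithmetic only; the size and task count are hypothetical. */
#include <stdio.h>

#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((unsigned long)(a) - 1))

int main(void)
{
	unsigned long task_size = 9216;   /* hypothetical arch_task_struct_size */
	unsigned long ntasks    = 2000;   /* hypothetical number of live tasks  */

	unsigned long per_obj_64   = ALIGN_UP(task_size, 64);    /*  9216 bytes */
	unsigned long per_obj_4096 = ALIGN_UP(task_size, 4096);  /* 12288 bytes */

	printf("extra slab memory: %lu KiB\n",
	       (per_obj_4096 - per_obj_64) * ntasks / 1024);     /* 6000 KiB */
	return 0;
}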

2. __cacheline_aligned_in_smp

#ifdef CONFIG_X86_VSMP
#ifdef CONFIG_SMP
#define __cacheline_aligned_in_smp                  \
    __attribute__((__aligned__(INTERNODE_CACHE_BYTES)))     \
    __page_aligned_data
#endif
#endif
block/blk-mq.h

struct blk_mq_ctx {
    struct {
        spinlock_t      lock;
        struct list_head    rq_list;
    }  ____cacheline_aligned_in_smp;

    unsigned int        cpu;
    unsigned int        index_hw;

    /* incremented at dispatch time */
    unsigned long       rq_dispatched[2];
    unsigned long       rq_merged;

    /* incremented at completion time */
    unsigned long       ____cacheline_aligned_in_smp rq_completed[2];

    struct request_queue    *queue;
    struct kobject      kobj;
} ____cacheline_aligned_in_smp;
struct list_lru_node {
        /* protects all lists on the node, including per cgroup */
        spinlock_t              lock;
        /* global list, used for the root cgroup in cgroup aware lrus */
        struct list_lru_one     lru;
#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
        /* for cgroup aware lrus points to per cgroup lists, otherwise NULL */
        struct list_lru_memcg   __rcu *memcg_lrus;
#endif
        long nr_items;
} ____cacheline_aligned_in_smp;
fs/aio.c

struct kioctx {
        struct {
                atomic_t        reqs_available;
        } ____cacheline_aligned_in_smp;

        struct {
                spinlock_t      ctx_lock;
                struct list_head active_reqs;   /* used for cancellation */
        } ____cacheline_aligned_in_smp;

        struct {
                struct mutex    ring_lock;
                wait_queue_head_t wait;
        } ____cacheline_aligned_in_smp;

        struct {
                unsigned        tail;
                unsigned        completed_events;
                spinlock_t      completion_lock;
        } ____cacheline_aligned_in_smp;
}
struct super_block {
        ...
        struct list_lru         s_dentry_lru ____cacheline_aligned_in_smp;
        struct list_lru         s_inode_lru ____cacheline_aligned_in_smp;
        ...

        /* s_inode_list_lock protects s_inodes */
        spinlock_t              s_inode_list_lock ____cacheline_aligned_in_smp;
        ...
}
struct zone {
    /* Read-mostly fields */

        ...
    /* Write-intensive fields used from the page allocator */
    ZONE_PADDING(_pad1_)

    /* free areas of different sizes */
    struct free_area    free_area[MAX_ORDER];

    /* zone flags, see below */
    unsigned long       flags;

    /* Primarily protects free_area */
    spinlock_t      lock;

    /* Write-intensive fields used by compaction and vmstats. */
    ZONE_PADDING(_pad2_)
}

Hot executable area