junlon2006 / linux-c

computer system & Linux

Linux Memory Management #17

Open junlon2006 opened 4 years ago

junlon2006 commented 4 years ago

Based on Linux kernel 4.13.

junlon2006 commented 4 years ago

[Figure: 32-bit and 64-bit virtual address space layout]

As the figure shows, a 32-bit system has a 4 GB virtual address space: the top 1 GB (3 GB to 4 GB) is the kernel mapping area, and the lower 3 GB (0 to 3 GB) is accessible from user space. On a 64-bit system only 48 address bits are used (the high 16 bits are zeroed by default), giving 2^48 bytes = 256 TB of virtual space, split into 128 TB for the kernel mapping area and 128 TB for user space.

junlon2006 commented 4 years ago

[Figure: user-space virtual address space layout]

As the figure shows, the user-space virtual address layout, from low addresses to high, is:

- text: the code segment
- data: initialized global variables and initialized static variables (the initial value must be non-zero, otherwise the variable is optimized into BSS)
- BSS: uninitialized global variables and uninitialized static variables
- heap: grows toward higher addresses
- mmap: the mmap mapping area, holding shared objects, mapped files, and large-bin allocations; grows toward lower addresses
- stack: grows toward lower addresses

Note: data-segment contents are stored in the executable image itself, so they consume flash/storage space.
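To see this layout directly, here is a minimal sketch (the variable names are illustrative, not from the original) that prints one address from each region; the ordering should match the figure above:

#include <stdio.h>
#include <stdlib.h>

int global_initialized = 42;   /* data: initialized, non-zero global */
int global_uninitialized;      /* BSS: uninitialized global */

int main(void) {
  int local_on_stack = 0;                  /* stack */
  char *on_heap = (char *)malloc(64);      /* heap: small request, served from the brk heap */

  printf("text : %p\n", (void *)main);
  printf("data : %p\n", (void *)&global_initialized);
  printf("bss  : %p\n", (void *)&global_uninitialized);
  printf("heap : %p\n", (void *)on_heap);
  printf("stack: %p\n", (void *)&local_on_stack);

  free(on_heap);
  return 0;
}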

With glibc's ptmalloc, run the code below and compare the values of p1, p2, p3, and p4. Explain the ordering.

#include <stdio.h>
#include <stdlib.h>

#define SMALL_BINS   (1024 * 16)    /* 16 KiB: below glibc's default mmap threshold (128 KiB) */
#define BIG_BINS     (1024 * 256)   /* 256 KiB: above the mmap threshold */

int main() {
  char *p1 = (char *)malloc(SMALL_BINS);
  printf("p1=%p\n", p1);

  char *p2 = (char *)malloc(BIG_BINS);
  printf("p2=%p\n", p2);

  char *p3 = (char *)malloc(SMALL_BINS);
  printf("p3=%p\n", p3);

  char *p4 = (char *)malloc(BIG_BINS);
  printf("p4=%p\n", p4);

  free(p1);
  free(p2);
  free(p3);
  free(p4);

  return 0;
}
junlon2006 commented 4 years ago

The malloc allocator:

1. malloc, through glibc's various wrappers, ends up in the brk and mmap system calls. Small requests are served via brk, which extends the heap toward higher addresses (when the current break is reached, brk is moved further up to keep growing); large requests are served via mmap, and the mmap area grows toward lower addresses. The code in the previous comment illustrates this: comparing p1, p2, p3, and p4, p1 and p3 are allocated from the heap via brk, so their addresses are low (p1 < p3, growing upward), while p2 and p4 are mapped via mmap, so their addresses are high (p2 > p4, growing downward).
2. In glibc, malloc uses ptmalloc by default.
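The small/large boundary mentioned above is glibc's mmap threshold (M_MMAP_THRESHOLD, 128 KiB by default). A minimal sketch, assuming glibc, that lowers the threshold via mallopt() so that even a 16 KiB request is served by mmap rather than brk:

#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>

int main(void) {
  /* glibc-specific knob: any request of 4 KiB or more now goes to mmap
   * instead of the brk heap (value is in bytes). */
  mallopt(M_MMAP_THRESHOLD, 4 * 1024);

  char *p = (char *)malloc(1024 * 16);   /* same size as SMALL_BINS above */
  printf("p=%p\n", p);                   /* now lands in the mmap area, not the heap */

  free(p);                               /* released with munmap(), not returned to the heap */
  return 0;
}

Note that setting M_MMAP_THRESHOLD explicitly also disables glibc's dynamic adjustment of the threshold, so this knob is mainly useful for experiments like this one.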

junlon2006 commented 4 years ago

Each process has its own independent virtual address space; virtual memory is mapped onto physical memory through paging and segmentation, so from the process's point of view it appears to own physical memory exclusively. For the kernel there is only one copy across all processes, not only of its physical memory but also of its virtual address space. Kernel code, global variables, and the BSS are all in the (kernel) code segment.
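One way to see each process's private view is /proc/self/maps. A minimal sketch that dumps the calling process's VMA list (the text segment, [heap], mapped libraries, [stack], and so on all appear here):

#include <stdio.h>

int main(void) {
  /* Each line of /proc/self/maps is one VMA: address range, permissions,
   * offset, device, inode, and the backing file (or [heap]/[stack]). */
  FILE *fp = fopen("/proc/self/maps", "r");
  if (fp == NULL) {
    perror("fopen");
    return 1;
  }

  char line[512];
  while (fgets(line, sizeof(line), fp) != NULL) {
    fputs(line, stdout);
  }

  fclose(fp);
  return 0;
}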

junlon2006 commented 4 years ago

The kernel's virtual address space is not tied to any particular process: once any process enters the kernel through a system call, the kernel virtual address space it sees is the same.

In other words, the kernel could modify the memory of any process, but it does not: it can, but there is no need to.

junlon2006 commented 4 years ago

The structure definition in the kernel:

struct mm_struct {
    struct vm_area_struct *mmap;        /* list of VMAs */
    struct rb_root mm_rb;
    u32 vmacache_seqnum;                   /* per-thread vmacache */
#ifdef CONFIG_MMU
    unsigned long (*get_unmapped_area) (struct file *filp,
                unsigned long addr, unsigned long len,
                unsigned long pgoff, unsigned long flags);
#endif
    unsigned long mmap_base;        /* base of mmap area */
    unsigned long mmap_legacy_base;         /* base of mmap area in bottom-up allocations */
#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
    /* Base adresses for compatible mmap() */
    unsigned long mmap_compat_base;
    unsigned long mmap_compat_legacy_base;
#endif
    unsigned long task_size;        /* size of task vm space */
    unsigned long highest_vm_end;       /* highest vma end address */
    pgd_t * pgd;

    /**
     * @mm_users: The number of users including userspace.
     *
     * Use mmget()/mmget_not_zero()/mmput() to modify. When this drops
     * to 0 (i.e. when the task exits and there are no other temporary
     * reference holders), we also release a reference on @mm_count
     * (which may then free the &struct mm_struct if @mm_count also
     * drops to 0).
     */
    atomic_t mm_users;

    /**
     * @mm_count: The number of references to &struct mm_struct
     * (@mm_users count as 1).
     *
     * Use mmgrab()/mmdrop() to modify. When this drops to 0, the
     * &struct mm_struct is freed.
     */
    atomic_t mm_count;

    atomic_long_t nr_ptes;          /* PTE page table pages */
#if CONFIG_PGTABLE_LEVELS > 2
    atomic_long_t nr_pmds;          /* PMD page table pages */
#endif
    int map_count;              /* number of VMAs */

    spinlock_t page_table_lock;     /* Protects page tables and some counters */
    struct rw_semaphore mmap_sem;

    struct list_head mmlist;        /* List of maybe swapped mm's.  These are globally strung
                         * together off init_mm.mmlist, and are protected
                         * by mmlist_lock
                         */

    unsigned long hiwater_rss;  /* High-watermark of RSS usage */
    unsigned long hiwater_vm;   /* High-water virtual memory usage */

    unsigned long total_vm;     /* Total pages mapped */
    unsigned long locked_vm;    /* Pages that have PG_mlocked set */
    unsigned long pinned_vm;    /* Refcount permanently increased */
    unsigned long data_vm;      /* VM_WRITE & ~VM_SHARED & ~VM_STACK */
    unsigned long exec_vm;      /* VM_EXEC & ~VM_WRITE & ~VM_STACK */
    unsigned long stack_vm;     /* VM_STACK */
    unsigned long def_flags;
    unsigned long start_code, end_code, start_data, end_data;
    unsigned long start_brk, brk, start_stack;
    unsigned long arg_start, arg_end, env_start, env_end;

    unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */

    /*
     * Special counters, in some configurations protected by the
     * page_table_lock, in other configurations by being atomic.
     */
    struct mm_rss_stat rss_stat;

    struct linux_binfmt *binfmt;

    cpumask_var_t cpu_vm_mask_var;

    /* Architecture-specific MM context */
    mm_context_t context;

    unsigned long flags; /* Must use atomic bitops to access the bits */

    struct core_state *core_state; /* coredumping support */
#ifdef CONFIG_AIO
    spinlock_t          ioctx_lock;
    struct kioctx_table __rcu   *ioctx_table;
#endif
#ifdef CONFIG_MEMCG
    /*
     * "owner" points to a task that is regarded as the canonical
     * user/owner of this mm. All of the following must be true in
     * order for it to be changed:
     *
     * current == mm->owner
     * current->mm != mm
     * new_owner->mm == mm
     * new_owner->alloc_lock is held
     */
    struct task_struct __rcu *owner;
#endif
    struct user_namespace *user_ns;

    /* store ref to file /proc/<pid>/exe symlink points to */
    struct file __rcu *exe_file;
#ifdef CONFIG_MMU_NOTIFIER
    struct mmu_notifier_mm *mmu_notifier_mm;
#endif
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
    pgtable_t pmd_huge_pte; /* protected by page_table_lock */
#endif
#ifdef CONFIG_CPUMASK_OFFSTACK
    struct cpumask cpumask_allocation;
#endif
#ifdef CONFIG_NUMA_BALANCING
    /*
     * numa_next_scan is the next time that the PTEs will be marked
     * pte_numa. NUMA hinting faults will gather statistics and migrate
     * pages to new nodes if necessary.
     */
    unsigned long numa_next_scan;

    /* Restart point for scanning and setting pte_numa */
    unsigned long numa_scan_offset;

    /* numa_scan_seq prevents two threads setting pte_numa */
    int numa_scan_seq;
#endif
    /*
     * An operation with batched TLB flushing is going on. Anything that
     * can move process memory needs to flush the TLB when moving a
     * PROT_NONE or PROT_NUMA mapped page.
     */
    atomic_t tlb_flush_pending;
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
    /* See flush_tlb_batched_pending() */
    bool tlb_flush_batched;
#endif
    struct uprobes_state uprobes_state;
#ifdef CONFIG_HUGETLB_PAGE
    atomic_long_t hugetlb_usage;
#endif
    struct work_struct async_put_work;
} __randomize_layout;
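The start_code/end_code, start_data/end_data, start_brk, and brk fields above delimit the regions discussed earlier. A minimal user-space sketch, relying on the traditional Linux linker symbols etext/edata/end and on sbrk(0), that prints the corresponding boundaries for the current process:

#define _DEFAULT_SOURCE   /* for sbrk() */
#include <stdio.h>
#include <unistd.h>

/* Linker-provided symbols: end of text, end of initialized data,
 * end of BSS (the latter is also the initial program break). */
extern char etext, edata, end;

int main(void) {
  printf("end of text (etext): %p\n", (void *)&etext);
  printf("end of data (edata): %p\n", (void *)&edata);
  printf("end of bss  (end)  : %p\n", (void *)&end);
  printf("current break      : %p\n", sbrk(0));   /* mm->brk, moved as malloc grows the heap */
  return 0;
}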
junlon2006 commented 4 years ago

32-bit kernel-space virtual memory layout:

[Figure: 32-bit kernel virtual address space layout]

    #define __va(x)      ((void *)((unsigned long)(x)+PAGE_OFFSET))  
    #define __pa(x)    __phys_addr((unsigned long)(x))  
    #define __phys_addr(x)    __phys_addr_nodebug(x)  
    #define __phys_addr_nodebug(x)  ((x) - PAGE_OFFSET)
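These macros describe the kernel's direct (linear) mapping: on 32-bit x86, PAGE_OFFSET defaults to 0xC0000000, so physical and virtual addresses in this range differ by a constant. A minimal user-space sketch of the same arithmetic (PAGE_OFFSET is hard-coded here purely for illustration; VA/PA mirror __va/__pa):

#include <stdio.h>

#define PAGE_OFFSET 0xC0000000UL   /* default user/kernel split on 32-bit x86: kernel starts at 3 GB */

/* Same arithmetic as the kernel's __va()/__pa() for the direct-mapped range:
 * virtual = physical + PAGE_OFFSET, physical = virtual - PAGE_OFFSET. */
#define VA(x) ((void *)((unsigned long)(x) + PAGE_OFFSET))
#define PA(x) ((unsigned long)(x) - PAGE_OFFSET)

int main(void) {
  unsigned long phys = 0x00100000UL;                     /* physical address 1 MB */
  void *virt = VA(phys);

  printf("phys 0x%08lx -> virt %p\n", phys, virt);       /* 0xc0100000 */
  printf("virt %p -> phys 0x%08lx\n", virt, PA(virt));
  return 0;
}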