Linux memory management + linear addresses + non-contiguous areas


Related data structures

Node data structure (NUMA)

  • The head of the pglist_data (pg_data_t) node list is pgdat_list (a traversal sketch follows the structure below).
typedef struct pglist_data {
    zone_t node_zones[MAX_NR_ZONES];        /* The node's zones: each is ZONE_DMA, ZONE_NORMAL or ZONE_HIGHMEM. */
    zonelist_t node_zonelists[NR_GFPINDEX]; /* Zones in allocation-preference order; built by build_zonelists() in page_alloc.c. */
    struct page *node_mem_map;              /* First struct page of this node, located inside the global mem_map array. */
    unsigned long *valid_addr_bitmap;       /* Bitmap of holes in the node, i.e. address ranges with no real memory behind them. */
    struct bootmem_data *bdata;             /* Points to the boot memory allocator data. */
    unsigned long node_start_paddr;         /* Starting physical address of the node. */
    unsigned long node_start_mapnr;         /* Page offset of this node within the global mem_map, i.e. the number of pages between mem_map and the node's local map, lmem_map. */
    unsigned long node_size;                /* Total number of pages in this node. */
    int node_id;                            /* Node ID (NID), starting from 0. */
    struct pglist_data *node_next;          /* Next node; all nodes are linked together on the pgdat_list list. */
} pg_data_t;
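
All nodes are chained together on pgdat_list through node_next, so visiting every node is a plain list walk. A minimal sketch using only the 2.4-era fields shown above:

/* Minimal sketch: walk every memory node via pgdat_list.
 * Uses only the fields shown in the structure above. */
static void print_all_nodes(void)
{
    pg_data_t *pgdat;

    for (pgdat = pgdat_list; pgdat != NULL; pgdat = pgdat->node_next)
        printk("node %d: start paddr %#lx, %lu pages\n",
               pgdat->node_id, pgdat->node_start_paddr, pgdat->node_size);
}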

ZONE data structure (memory zones)

On x86 machines the zones are laid out as follows:
  • ZONE_DMA: the first 16MB
  • ZONE_NORMAL: 16MB–896MB
  • ZONE_HIGHMEM: 896MB to the end of physical memory
typedef struct zone_struct {
    spinlock_t lock;                    /* Spinlock protecting the zone from concurrent access. */
    unsigned long offset;
    unsigned long free_pages;           /* Total number of free pages in the zone. */
    unsigned long inactive_clean_pages;
    unsigned long inactive_dirty_pages;
    unsigned long pages_min, pages_low, pages_high; /* Zone watermarks; pages_min is the reserved page-frame pool. */
    struct list_head inactive_clean_list;
    free_area_t free_area[MAX_ORDER];   /* Free-area bitmaps used by the buddy allocator. */
    char *name;                         /* Zone name string: "DMA", "Normal" or "HighMem". */
    wait_queue_head_t *wait_table;      /* Hash table of wait queues. */
    unsigned long wait_table_size;
    unsigned long wait_table_shift;
    unsigned long size;                 /* Size of the zone in pages. */
    struct pglist_data *zone_pgdat;     /* Parent pg_data_t. */
    unsigned long zone_start_paddr;     /* Starting physical address of the zone. */
    unsigned long zone_start_mapnr;     /* Page offset of the zone within the global mem_map. */
    struct page *zone_mem_map;          /* First page of the zone within the global mem_map. */
} zone_t;
  • unsigned long pages_min, pages_low, pages_high; — the zone watermarks
    pages_min most likely lies between 20 pages (80KB) and 255 pages (1MB).
    When the number of free pages falls to pages_low, the buddy allocator wakes kswapd (the page-out daemon) to start freeing pages; pages_low defaults to twice pages_min. When the free page count falls to pages_min, the allocator starts kswapd synchronously. Once kswapd has been woken and begins freeing pages, the zone is not considered "balanced" until pages_high pages have been freed; when pages_high is reached, kswapd goes back to sleep. pages_high defaults to three times pages_min.
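
The sketch below illustrates this watermark policy using the zone_t fields above; reclaim_synchronously() and wakeup_kswapd_for() are hypothetical placeholders for illustration only, not real kernel functions.

/* Illustration of the watermark policy described above.
 * reclaim_synchronously() and wakeup_kswapd_for() are hypothetical
 * placeholders, not real kernel functions. */
static void check_zone_watermarks(zone_t *zone)
{
    if (zone->free_pages <= zone->pages_min) {
        /* Critically low: reclaim pages synchronously on behalf of the caller. */
        reclaim_synchronously(zone);
    } else if (zone->free_pages <= zone->pages_low) {
        /* Low: wake kswapd, which keeps freeing pages until the zone
         * has pages_high free pages again, then goes back to sleep. */
        wakeup_kswapd_for(zone);
    }
}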

PAGE

All struct page instances are stored in a global mem_map array, which is usually placed at the start of ZONE_NORMAL or, on small-memory systems, just after the area reserved for the loaded kernel image.
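
Because mem_map is a flat array indexed by page frame number (PFN), converting between a struct page and its PFN is simple pointer arithmetic. A minimal sketch for the flat memory model; the kernel's own macros for this are pfn_to_page() and page_to_pfn(), and the my_ prefixed names below are only for illustration.

/* Flat-memory-model sketch: mem_map is indexed by page frame number.
 * The kernel provides pfn_to_page()/page_to_pfn() for this; the my_
 * prefixed helpers below are illustrative only. */
static inline struct page *my_pfn_to_page(unsigned long pfn)
{
    return mem_map + pfn;                   /* &mem_map[pfn] */
}

static inline unsigned long my_page_to_pfn(struct page *page)
{
    return (unsigned long)(page - mem_map);
}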
/*
* Each physical page in the system has a struct page associated with
* it to keep track of whatever it is we are using the page for at the
* moment. Note that we have no way to track which tasks are using
* a page, though if it is a pagecache page, rmap structures can tell us
* who is mapping it.
*
* The objects in struct page are organized in double word blocks in
* order to allows us to use atomic double word operations on portions
* of struct page. That is currently only used by slub but the arrangement
* allows the use of atomic double word operations on the flags/mapping
* and lru list pointers also.
*/
struct page {
/* First double word block */
unsigned long flags; /* Atomic flags, some possibly updated asynchronously.
Describes the state of the page and other information. */
union
{
struct address_space *mapping; /* If low bit clear, points to
* inode address_space, or NULL.
* If page mapped as anonymous
* memory, low bit is set, and
* it points to anon_vma object:
* see PAGE_MAPPING_ANON below.
*/
void *s_mem; /* slab first object */
atomic_t compound_mapcount; /* first tail page */
/* page_deferred_list().next -- second tail page */
};

/* Second double word */
struct {
union {
pgoff_t index; /* Our offset within mapping.
Offset within the mapped virtual area (vm_area);
a file may be mapped only in part: if 1MB of it is mapped,
index is the offset within that 1MB window, not within the whole file. */
void *freelist; /* sl[aou]b first free object */
/* page_deferred_list().prev -- second tail page */
};

union {
#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
/* Used for cmpxchg_double in slub */
unsigned long counters;
#else
/*
* Keep _refcount separate from slub cmpxchg_double
* data. As the rest of the double word is protected by
* slab_lock but _refcount is not.
*/
unsigned counters;
#endif

struct {

union {
/*
* Count of ptes mapped in mms, to show
* when page is mapped & limit reverse
* map searches.
* Page mapping counter.
*/
atomic_t _mapcount;

struct { /* SLUB */
unsigned inuse:16;
unsigned objects:15;
unsigned frozen:1;
};
int units; /* SLOB */
};
/*
* Usage count, *USE WRAPPER FUNCTION*
* when manual accounting. See page_ref.h
* Page reference counter.
*/
atomic_t _refcount;
};
unsigned int active; /* SLAB */
};
};

/*
* Third double word block
*
* WARNING: bit 0 of the first word encode PageTail(). That means
* the rest users of the storage space MUST NOT use the bit to
* avoid collision and false-positive PageTail().
*/
union {
struct list_head lru; /* Pageout list, eg. active_list
* protected by zone->lru_lock !
* Can be used as a generic list
* by the page owner.
*/
struct dev_pagemap *pgmap; /* ZONE_DEVICE pages are never on an
* lru or handled by a slab
* allocator, this points to the
* hosting device page map.
*/
struct { /* slub per cpu partial pages */
struct page *next; /* Next partial slab */
#ifdef CONFIG_64BIT
int pages; /* Nr of partial slabs left */
int pobjects; /* Approximate # of objects */
#else
short int pages;
short int pobjects;
#endif
};

struct rcu_head rcu_head; /* Used by SLAB
* when destroying via RCU
*/
/* Tail pages of compound page */
struct {
unsigned long compound_head; /* If bit zero is set */

/* First tail page only */
#ifdef CONFIG_64BIT
/*
* On 64 bit system we have enough space in struct page
* to encode compound_dtor and compound_order with
* unsigned int. It can help compiler generate better or
* smaller code on some architectures.
*/
unsigned int compound_dtor;
unsigned int compound_order;
#else
unsigned short int compound_dtor;
unsigned short int compound_order;
#endif
};

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS
struct {
unsigned long __pad; /* do not overlay pmd_huge_pte
* with compound_head to avoid
* possible bit 0 collision.
*/
pgtable_t pmd_huge_pte; /* protected by page->ptl */
};
#endif
};

/* Remainder is not double word aligned */
union {
unsigned long private; /* Mapping-private opaque data:
* usually used for buffer_heads
* if PagePrivate set; used for
* swp_entry_t if PageSwapCache;
* indicates order in the buddy
* system if PG_buddy is set.
* Private data pointer; its exact meaning depends on how the page is used.
*/
#if USE_SPLIT_PTE_PTLOCKS
#if ALLOC_SPLIT_PTLOCKS
spinlock_t *ptl;
#else
spinlock_t ptl;
#endif
#endif
struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */
};

#ifdef CONFIG_MEMCG
struct mem_cgroup *mem_cgroup;
#endif

/*
* On machines where all RAM is mapped into kernel address space,
* we can simply calculate the virtual address. On machines with
* highmem some memory is mapped into kernel virtual memory
* dynamically, so we need a place to store that address.
* Note that this field could be 16 bits on x86 ... ;)
*
* Architectures with slow multiplication can define
* WANT_PAGE_VIRTUAL in asm/page.h
*/
#if defined(WANT_PAGE_VIRTUAL)
void *virtual; /* Kernel virtual address (NULL if
not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */

#ifdef CONFIG_KMEMCHECK
/*
* kmemcheck wants to track the status of each byte in a page; this
* is a pointer to such a status block. NULL if not tracked.
*/
void *shadow;
#endif

#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
int _last_cpupid;
#endif
}
/*
* The struct page can be forced to be double word aligned so that atomic ops
* on double words work. The SLUB allocator can make use of such a feature.
*/
#ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE
__aligned(2 * sizeof(unsigned long))
#endif
;
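
As the comments above note, bit 0 of page->mapping distinguishes an anonymous page (mapping points to an anon_vma) from a page-cache page (mapping points to an inode's address_space). Below is a minimal sketch of that encoding; the kernel's real helpers are PageAnon() and page_mapping(), and the MY_ and my_ prefixed names are illustrative only.

/* Sketch of the page->mapping low-bit encoding described above.
 * The kernel's own helpers are PageAnon() and page_mapping();
 * the MY_ and my_ prefixed names here are illustrative only. */
#define MY_PAGE_MAPPING_ANON 1UL

static inline int my_page_is_anon(struct page *page)
{
    return ((unsigned long)page->mapping & MY_PAGE_MAPPING_ANON) != 0;
}

static inline struct address_space *my_page_mapping(struct page *page)
{
    if (my_page_is_anon(page))
        return NULL;          /* anonymous: the pointer actually refers to an anon_vma */
    return page->mapping;     /* page cache: points to the inode's address_space */
}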

mm_struct


struct mm_struct {
//mmap points to the linked list of VMAs
struct vm_area_struct * mmap; /* list of VMAs */
//root of the red-black tree of VMAs
struct rb_root mm_rb;
//the most recently accessed VMA (cached result of find_vma)
struct vm_area_struct * mmap_cache; /* last find_vma result */
//hook for finding an unmapped address range for a new mapping
unsigned long (*get_unmapped_area) (struct file *filp,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags);
void (*unmap_area) (struct mm_struct *mm, unsigned long addr);
unsigned long mmap_base; /* base of mmap area */
unsigned long task_size; /* size of task vm space */
unsigned long cached_hole_size; /* if non-zero, the largest hole below free_area_cache */
unsigned long free_area_cache; /* first hole of size cached_hole_size or larger */
//points to the process's page global directory (PGD)
pgd_t * pgd;
//how many users share this address space
atomic_t mm_users; /* How many users with user space? */
//reference count: how many references point to this mm_struct
atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */
//number of VMAs
int map_count; /* number of VMAs */
struct rw_semaphore mmap_sem;
//protects the task's page tables
spinlock_t page_table_lock; /* Protects page tables and some counters */
//list linking all mm_structs together
struct list_head mmlist; /* List of maybe swapped mm's. These are globally strung
* together off init_mm.mmlist, and are protected
* by mmlist_lock
*/

/* Special counters, in some configurations protected by the
* page_table_lock, in other configurations by being atomic.
*/
mm_counter_t _file_rss;
mm_counter_t _anon_rss;

unsigned long hiwater_rss; /* High-watermark of RSS usage */
unsigned long hiwater_vm; /* High-water virtual memory usage */

unsigned long total_vm, locked_vm, shared_vm, exec_vm;
unsigned long stack_vm, reserved_vm, def_flags, nr_ptes;
//start_code: start address of the code segment
//end_code: end address of the code segment
//start_data: start address of the data segment
//end_data: end address of the data segment
unsigned long start_code, end_code, start_data, end_data;
//start_brk: start address of the heap
//brk: end address of the heap
//start_stack: start address of the stack
unsigned long start_brk, brk, start_stack;
//arg_start, arg_end: start and end addresses of the argument area
//env_start, env_end: start and end addresses of the environment area
unsigned long arg_start, arg_end, env_start, env_end;

unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */

struct linux_binfmt *binfmt;

cpumask_t cpu_vm_mask;

/* Architecture-specific MM context */
mm_context_t context;

/* Swap token stuff */
/*
* Last value of global fault stamp as seen by this process.
* In other words, this value gives an indication of how long
* it has been since this task got the token.
* Look at mm/thrash.c
*/
unsigned int faultstamp;
unsigned int token_priority;
unsigned int last_interval;

unsigned long flags; /* Must use atomic bitops to access the bits */

struct core_state *core_state; /* coredumping support */
#ifdef CONFIG_AIO
spinlock_t ioctx_lock;
struct hlist_head ioctx_list;
#endif
#ifdef CONFIG_MM_OWNER
/*
* "owner" points to a task that is regarded as the canonical
* user/owner of this mm. All of the following must be true in
* order for it to be changed:
*
* current == mm->owner
* current->mm != mm
* new_owner->mm == mm
* new_owner->alloc_lock is held
*/
struct task_struct *owner;
#endif

#ifdef CONFIG_PROC_FS
/* store ref to file /proc/<pid>/exe symlink points to */
struct file *exe_file;
unsigned long num_exe_file_vmas;
#endif
#ifdef CONFIG_MMU_NOTIFIER
struct mmu_notifier_mm *mmu_notifier_mm;
#endif
};
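
A minimal sketch of walking a task's VMAs through the sorted mmap list described above; real code must hold mmap_sem at least for reading while it walks.

/* Minimal sketch: walk every VMA of an mm via the sorted mmap list.
 * mmap_sem must be held (here for reading) while walking the list. */
static void dump_vmas(struct mm_struct *mm)
{
    struct vm_area_struct *vma;

    down_read(&mm->mmap_sem);
    for (vma = mm->mmap; vma != NULL; vma = vma->vm_next)
        printk("vma %#lx-%#lx flags %#lx\n",
               vma->vm_start, vma->vm_end, vma->vm_flags);
    up_read(&mm->mmap_sem);
}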

vm_area_struct

struct vm_area_struct {
struct mm_struct * vm_mm; /* The address space we belong to. */
unsigned long vm_start; /* Our start address within vm_mm. */
unsigned long vm_end; /* The first byte after our end address
within vm_mm. */

/* linked list of VM areas per task, sorted by address */
struct vm_area_struct *vm_next;

pgprot_t vm_page_prot; /* Access permissions of this VMA. */
unsigned long vm_flags; /* Flags, see mm.h. */

struct rb_node vm_rb;

/*
* For areas with an address space and backing store,
* linkage into the address_space->i_mmap prio tree, or
* linkage to the list of like vmas hanging off its node, or
* linkage of vma in the address_space->i_mmap_nonlinear list.
*/
union {
struct {
struct list_head list;
void *parent; /* aligns with prio_tree_node parent */
struct vm_area_struct *head;
} vm_set;

struct raw_prio_tree_node prio_tree_node;
} shared;

/*
* A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
* list, after a COW of one of the file pages. A MAP_SHARED vma
* can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack
* or brk vma (with NULL file) can only be in an anon_vma list.
*/
struct list_head anon_vma_node; /* Serialized by anon_vma->lock */
struct anon_vma *anon_vma; /* Serialized by page_table_lock */

/* Function pointers to deal with this struct. */
const struct vm_operations_struct *vm_ops;

/* Information about our backing store: */
unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE
units, *not* PAGE_CACHE_SIZE */
struct file * vm_file; /* File we map to (can be NULL). */
void * vm_private_data; /* was vm_pte (shared mem) */
unsigned long vm_truncate_count;/* truncate_count or restart_addr */

#ifndef CONFIG_MMU
struct vm_region *vm_region; /* NOMMU mapping region */
#endif
#ifdef CONFIG_NUMA
struct mempolicy *vm_policy; /* NUMA policy for the VMA */
#endif
};
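
Since vm_start is inclusive and vm_end is the first byte after the area, checking whether an address falls inside a VMA is a simple range test, sketched below; the kernel's find_vma() performs the real lookup through the mm_rb red-black tree and returns the first VMA whose vm_end is greater than the address.

/* Sketch: does addr fall inside this VMA?  vm_end is exclusive. */
static inline int my_addr_in_vma(struct vm_area_struct *vma, unsigned long addr)
{
    return addr >= vma->vm_start && addr < vma->vm_end;
}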

Non-contiguous area memory management