Linux memory management + linear addresses + non-contiguous areas


Related data structures

Node data structure (NUMA)

  • The head of the pglist_data (pg_data_t) node list is pgdat_list (a traversal sketch follows the structure below).
typedef struct pglist_data {
    zone_t node_zones[MAX_NR_ZONES];        /* The node's zones: each is ZONE_DMA, ZONE_NORMAL or ZONE_HIGHMEM. */
    zonelist_t node_zonelists[NR_GFPINDEX]; /* Zones in allocation-preference order; built by build_zonelists() in page_alloc.c. */
    struct page *node_mem_map;              /* First struct page of this node, located inside the global mem_map array. */
    unsigned long *valid_addr_bitmap;       /* Bitmap of holes in the node, i.e. address ranges with no real memory behind them. */
    struct bootmem_data *bdata;             /* Points to the boot memory allocator data. */
    unsigned long node_start_paddr;         /* Starting physical address of the node. */
    unsigned long node_start_mapnr;         /* Page offset of this node within the global mem_map, i.e. the number of pages between mem_map and the node's local map, lmem_map. */
    unsigned long node_size;                /* Total number of pages in this node. */
    int node_id;                            /* Node ID (NID), starting from 0. */
    struct pglist_data *node_next;          /* Next node; all nodes are linked together on the pgdat_list list. */
} pg_data_t;
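
All nodes are chained together on pgdat_list through node_next, so visiting every node is a plain list walk. A minimal sketch using only the 2.4-era fields shown above:

/* Minimal sketch: walk every memory node via pgdat_list.
 * Uses only the fields shown in the structure above. */
static void print_all_nodes(void)
{
    pg_data_t *pgdat;

    for (pgdat = pgdat_list; pgdat != NULL; pgdat = pgdat->node_next)
        printk("node %d: start paddr %#lx, %lu pages\n",
               pgdat->node_id, pgdat->node_start_paddr, pgdat->node_size);
}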

ZONE data structure (memory zones)

On x86 machines the zones are laid out as follows:
  • ZONE_DMA: the first 16MB
  • ZONE_NORMAL: 16MB–896MB
  • ZONE_HIGHMEM: 896MB to the end of physical memory
typedef struct zone_struct {
    spinlock_t lock;                    /* Spinlock protecting the zone from concurrent access. */
    unsigned long offset;
    unsigned long free_pages;           /* Total number of free pages in the zone. */
    unsigned long inactive_clean_pages;
    unsigned long inactive_dirty_pages;
    unsigned long pages_min, pages_low, pages_high; /* Zone watermarks; pages_min is the reserved page-frame pool. */
    struct list_head inactive_clean_list;
    free_area_t free_area[MAX_ORDER];   /* Free-area bitmaps used by the buddy allocator. */
    char *name;                         /* Zone name string: "DMA", "Normal" or "HighMem". */
    wait_queue_head_t *wait_table;      /* Hash table of wait queues. */
    unsigned long wait_table_size;
    unsigned long wait_table_shift;
    unsigned long size;                 /* Size of the zone in pages. */
    struct pglist_data *zone_pgdat;     /* Parent pg_data_t. */
    unsigned long zone_start_paddr;     /* Starting physical address of the zone. */
    unsigned long zone_start_mapnr;     /* Page offset of the zone within the global mem_map. */
    struct page *zone_mem_map;          /* First page of the zone within the global mem_map. */
} zone_t;
  • unsigned long pages_min, pages_low, pages_high; — the zone watermarks
    pages_min most likely lies between 20 pages (80KB) and 255 pages (1MB).
    When the number of free pages falls to pages_low, the buddy allocator wakes kswapd (the page-out daemon) to start freeing pages; pages_low defaults to twice pages_min. When the free page count falls to pages_min, the allocator starts kswapd synchronously. Once kswapd has been woken and begins freeing pages, the zone is not considered "balanced" until pages_high pages have been freed; when pages_high is reached, kswapd goes back to sleep. pages_high defaults to three times pages_min.
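
The sketch below illustrates this watermark policy using the zone_t fields above; reclaim_synchronously() and wakeup_kswapd_for() are hypothetical placeholders for illustration only, not real kernel functions.

/* Illustration of the watermark policy described above.
 * reclaim_synchronously() and wakeup_kswapd_for() are hypothetical
 * placeholders, not real kernel functions. */
static void check_zone_watermarks(zone_t *zone)
{
    if (zone->free_pages <= zone->pages_min) {
        /* Critically low: reclaim pages synchronously on behalf of the caller. */
        reclaim_synchronously(zone);
    } else if (zone->free_pages <= zone->pages_low) {
        /* Low: wake kswapd, which keeps freeing pages until the zone
         * has pages_high free pages again, then goes back to sleep. */
        wakeup_kswapd_for(zone);
    }
}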

PAGE

All struct page instances are stored in a global mem_map array, which is usually placed at the start of ZONE_NORMAL or, on small-memory systems, just after the area reserved for the loaded kernel image.
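
Because mem_map is a flat array indexed by page frame number (PFN), converting between a struct page and its PFN is simple pointer arithmetic. A minimal sketch for the flat memory model; the kernel's own macros for this are pfn_to_page() and page_to_pfn(), and the my_ prefixed names below are only for illustration.

/* Flat-memory-model sketch: mem_map is indexed by page frame number.
 * The kernel provides pfn_to_page()/page_to_pfn() for this; the my_
 * prefixed helpers below are illustrative only. */
static inline struct page *my_pfn_to_page(unsigned long pfn)
{
    return mem_map + pfn;                   /* &mem_map[pfn] */
}

static inline unsigned long my_page_to_pfn(struct page *page)
{
    return (unsigned long)(page - mem_map);
}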
/*
* Each physical page in the system has a struct page associated with
* it to keep track of whatever it is we are using the page for at the
* moment. Note that we have no way to track which tasks are using
* a page, though if it is a pagecache page, rmap structures can tell us
* who is mapping it.
*
* The objects in struct page are organized in double word blocks in
* order to allows us to use atomic double word operations on portions
* of struct page. That is currently only used by slub but the arrangement
* allows the use of atomic double word operations on the flags/mapping
* and lru list pointers also.
*/
struct page {
/* First double word block */
unsigned long flags; /* Atomic flags, some possibly updated asynchronously.
Describes the state of the page and other information. */
union
{
struct address_space *mapping; /* If low bit clear, points to
* inode address_space, or NULL.
* If page mapped as anonymous
* memory, low bit is set, and
* it points to anon_vma object:
* see PAGE_MAPPING_ANON below.
*/
void *s_mem; /* slab first object */
atomic_t compound_mapcount; /* first tail page */
/* page_deferred_list().next -- second tail page */
};

/* Second double word */
struct {
union {
pgoff_t index; /* Our offset within mapping.
Offset within the mapped virtual area (vm_area);
a file may be mapped only in part: if 1MB of it is mapped,
index is the offset within that 1MB window, not within the whole file. */
void *freelist; /* sl[aou]b first free object */
/* page_deferred_list().prev -- second tail page */
};

union {
#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
/* Used for cmpxchg_double in slub */
unsigned long counters;
#else
/*
* Keep _refcount separate from slub cmpxchg_double
* data. As the rest of the double word is protected by
* slab_lock but _refcount is not.
*/
unsigned counters;
#endif

struct {

union {
/*
* Count of ptes mapped in mms, to show
* when page is mapped & limit reverse
* map searches.
* Page mapping counter.
*/
atomic_t _mapcount;

struct { /* SLUB */
unsigned inuse:16;
unsigned objects:15;
unsigned frozen:1;
};
int units; /* SLOB */
};
/*
* Usage count, *USE WRAPPER FUNCTION*
* when manual accounting. See page_ref.h
* Page reference counter.
*/
atomic_t _refcount;
};
unsigned int active; /* SLAB */
};
};

/*
* Third double word block
*
* WARNING: bit 0 of the first word encode PageTail(). That means
* the rest users of the storage space MUST NOT use the bit to
* avoid collision and false-positive PageTail().
*/
union {
struct list_head lru; /* Pageout list, eg. active_list
* protected by zone->lru_lock !
* Can be used as a generic list
* by the page owner.
*/
struct dev_pagemap *pgmap; /* ZONE_DEVICE pages are never on an
* lru or handled by a slab
* allocator, this points to the
* hosting device page map.
*/
struct { /* slub per cpu partial pages */
struct page *next; /* Next partial slab */
#ifdef CONFIG_64BIT
int pages; /* Nr of partial slabs left */
int pobjects; /* Approximate # of objects */
#else
short int pages;
short int pobjects;
#endif
};

struct rcu_head rcu_head; /* Used by SLAB
* when destroying via RCU
*/
/* Tail pages of compound page */
struct {
unsigned long compound_head; /* If bit zero is set */

/* First tail page only */
#ifdef CONFIG_64BIT
/*
* On 64 bit system we have enough space in struct page
* to encode compound_dtor and compound_order with
* unsigned int. It can help compiler generate better or
* smaller code on some architectures.
*/
unsigned int compound_dtor;
unsigned int compound_order;
#else
unsigned short int compound_dtor;
unsigned short int compound_order;
#endif
};

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS
struct {
unsigned long __pad; /* do not overlay pmd_huge_pte
* with compound_head to avoid
* possible bit 0 collision.
*/
pgtable_t pmd_huge_pte; /* protected by page->ptl */
};
#endif
};

/* Remainder is not double word aligned */
union {
unsigned long private; /* Mapping-private opaque data:
* usually used for buffer_heads
* if PagePrivate set; used for
* swp_entry_t if PageSwapCache;
* indicates order in the buddy
* system if PG_buddy is set.
* Private data pointer; its exact meaning depends on how the page is used.
*/
#if USE_SPLIT_PTE_PTLOCKS
#if ALLOC_SPLIT_PTLOCKS
spinlock_t *ptl;
#else
spinlock_t ptl;
#endif
#endif
struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */
};

#ifdef CONFIG_MEMCG
struct mem_cgroup *mem_cgroup;
#endif

/*
* On machines where all RAM is mapped into kernel address space,
* we can simply calculate the virtual address. On machines with
* highmem some memory is mapped into kernel virtual memory
* dynamically, so we need a place to store that address.
* Note that this field could be 16 bits on x86 ... ;)
*
* Architectures with slow multiplication can define
* WANT_PAGE_VIRTUAL in asm/page.h
*/
#if defined(WANT_PAGE_VIRTUAL)
void *virtual; /* Kernel virtual address (NULL if
not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */

#ifdef CONFIG_KMEMCHECK
/*
* kmemcheck wants to track the status of each byte in a page; this
* is a pointer to such a status block. NULL if not tracked.
*/
void *shadow;
#endif

#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
int _last_cpupid;
#endif
}
/*
* The struct page can be forced to be double word aligned so that atomic ops
* on double words work. The SLUB allocator can make use of such a feature.
*/
#ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE
__aligned(2 * sizeof(unsigned long))
#endif
;
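
As the comments above note, bit 0 of page->mapping distinguishes an anonymous page (mapping points to an anon_vma) from a page-cache page (mapping points to an inode's address_space). Below is a minimal sketch of that encoding; the kernel's real helpers are PageAnon() and page_mapping(), and the MY_ and my_ prefixed names are illustrative only.

/* Sketch of the page->mapping low-bit encoding described above.
 * The kernel's own helpers are PageAnon() and page_mapping();
 * the MY_ and my_ prefixed names here are illustrative only. */
#define MY_PAGE_MAPPING_ANON 1UL

static inline int my_page_is_anon(struct page *page)
{
    return ((unsigned long)page->mapping & MY_PAGE_MAPPING_ANON) != 0;
}

static inline struct address_space *my_page_mapping(struct page *page)
{
    if (my_page_is_anon(page))
        return NULL;          /* anonymous: the pointer actually refers to an anon_vma */
    return page->mapping;     /* page cache: points to the inode's address_space */
}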

mm_struct


struct mm_struct {
//mmap points to the linked list of VMAs
struct vm_area_struct * mmap; /* list of VMAs */
//root of the red-black tree of VMAs
struct rb_root mm_rb;
//the most recently accessed VMA (cached result of find_vma)
struct vm_area_struct * mmap_cache; /* last find_vma result */
//hook for finding an unmapped address range for a new mapping
unsigned long (*get_unmapped_area) (struct file *filp,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags);
void (*unmap_area) (struct mm_struct *mm, unsigned long addr);
unsigned long mmap_base; /* base of mmap area */
unsigned long task_size; /* size of task vm space */
unsigned long cached_hole_size; /* if non-zero, the largest hole below free_area_cache */
unsigned long free_area_cache; /* first hole of size cached_hole_size or larger */
//points to the process's page global directory (PGD)
pgd_t * pgd;
//how many users share this address space
atomic_t mm_users; /* How many users with user space? */
//reference count: how many references point to this mm_struct
atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */
//number of VMAs
int map_count; /* number of VMAs */
struct rw_semaphore mmap_sem;
//protects the task's page tables
spinlock_t page_table_lock; /* Protects page tables and some counters */
//list linking all mm_structs together
struct list_head mmlist; /* List of maybe swapped mm's. These are globally strung
* together off init_mm.mmlist, and are protected
* by mmlist_lock
*/

/* Special counters, in some configurations protected by the
* page_table_lock, in other configurations by being atomic.
*/
mm_counter_t _file_rss;
mm_counter_t _anon_rss;

unsigned long hiwater_rss; /* High-watermark of RSS usage */
unsigned long hiwater_vm; /* High-water virtual memory usage */

unsigned long total_vm, locked_vm, shared_vm, exec_vm;
unsigned long stack_vm, reserved_vm, def_flags, nr_ptes;
//start_code: start address of the code segment
//end_code: end address of the code segment
//start_data: start address of the data segment
//end_data: end address of the data segment
unsigned long start_code, end_code, start_data, end_data;
//start_brk: start address of the heap
//brk: end address of the heap
//start_stack: start address of the stack
unsigned long start_brk, brk, start_stack;
//arg_start, arg_end: start and end addresses of the argument area
//env_start, env_end: start and end addresses of the environment area
unsigned long arg_start, arg_end, env_start, env_end;

unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */

struct linux_binfmt *binfmt;

cpumask_t cpu_vm_mask;

/* Architecture-specific MM context */
mm_context_t context;

/* Swap token stuff */
/*
* Last value of global fault stamp as seen by this process.
* In other words, this value gives an indication of how long
* it has been since this task got the token.
* Look at mm/thrash.c
*/
unsigned int faultstamp;
unsigned int token_priority;
unsigned int last_interval;

unsigned long flags; /* Must use atomic bitops to access the bits */

struct core_state *core_state; /* coredumping support */
#ifdef CONFIG_AIO
spinlock_t ioctx_lock;
struct hlist_head ioctx_list;
#endif
#ifdef CONFIG_MM_OWNER
/*
* "owner" points to a task that is regarded as the canonical
* user/owner of this mm. All of the following must be true in
* order for it to be changed:
*
* current == mm->owner
* current->mm != mm
* new_owner->mm == mm
* new_owner->alloc_lock is held
*/
struct task_struct *owner;
#endif

#ifdef CONFIG_PROC_FS
/* store ref to file /proc/<pid>/exe symlink points to */
struct file *exe_file;
unsigned long num_exe_file_vmas;
#endif
#ifdef CONFIG_MMU_NOTIFIER
struct mmu_notifier_mm *mmu_notifier_mm;
#endif
};
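
A minimal sketch of walking a task's VMAs through the sorted mmap list described above; real code must hold mmap_sem at least for reading while it walks.

/* Minimal sketch: walk every VMA of an mm via the sorted mmap list.
 * mmap_sem must be held (here for reading) while walking the list. */
static void dump_vmas(struct mm_struct *mm)
{
    struct vm_area_struct *vma;

    down_read(&mm->mmap_sem);
    for (vma = mm->mmap; vma != NULL; vma = vma->vm_next)
        printk("vma %#lx-%#lx flags %#lx\n",
               vma->vm_start, vma->vm_end, vma->vm_flags);
    up_read(&mm->mmap_sem);
}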

vm_area_struct

struct vm_area_struct {
struct mm_struct * vm_mm; /* The address space we belong to. */
unsigned long vm_start; /* Our start address within vm_mm. */
unsigned long vm_end; /* The first byte after our end address
within vm_mm. */

/* linked list of VM areas per task, sorted by address */
struct vm_area_struct *vm_next;

pgprot_t vm_page_prot; /* Access permissions of this VMA. */
unsigned long vm_flags; /* Flags, see mm.h. */

struct rb_node vm_rb;

/*
* For areas with an address space and backing store,
* linkage into the address_space->i_mmap prio tree, or
* linkage to the list of like vmas hanging off its node, or
* linkage of vma in the address_space->i_mmap_nonlinear list.
*/
union {
struct {
struct list_head list;
void *parent; /* aligns with prio_tree_node parent */
struct vm_area_struct *head;
} vm_set;

struct raw_prio_tree_node prio_tree_node;
} shared;

/*
* A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
* list, after a COW of one of the file pages. A MAP_SHARED vma
* can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack
* or brk vma (with NULL file) can only be in an anon_vma list.
*/
struct list_head anon_vma_node; /* Serialized by anon_vma->lock */
struct anon_vma *anon_vma; /* Serialized by page_table_lock */

/* Function pointers to deal with this struct. */
const struct vm_operations_struct *vm_ops;

/* Information about our backing store: */
unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE
units, *not* PAGE_CACHE_SIZE */
struct file * vm_file; /* File we map to (can be NULL). */
void * vm_private_data; /* was vm_pte (shared mem) */
unsigned long vm_truncate_count;/* truncate_count or restart_addr */

#ifndef CONFIG_MMU
struct vm_region *vm_region; /* NOMMU mapping region */
#endif
#ifdef CONFIG_NUMA
struct mempolicy *vm_policy; /* NUMA policy for the VMA */
#endif
};
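
Since vm_start is inclusive and vm_end is the first byte after the area, checking whether an address falls inside a VMA is a simple range test, sketched below; the kernel's find_vma() performs the real lookup through the mm_rb red-black tree and returns the first VMA whose vm_end is greater than the address.

/* Sketch: does addr fall inside this VMA?  vm_end is exclusive. */
static inline int my_addr_in_vma(struct vm_area_struct *vma, unsigned long addr)
{
    return addr >= vma->vm_start && addr < vma->vm_end;
}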

Non-contiguous area memory management