Source: include/linux/vmalloc.h
The call graph for this function is shown in Figure 7.2. The following functions differ only in the GFP_ flags they pass down (See Section 6.4). The size parameter is page aligned by __vmalloc() (See Section G.1.2).
static inline void * vmalloc (unsigned long size)
{
        return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
}

static inline void * vmalloc_dma (unsigned long size)
{
        return __vmalloc(size, GFP_KERNEL|GFP_DMA, PAGE_KERNEL);
}

static inline void * vmalloc_32(unsigned long size)
{
        return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
}
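As a usage illustration only (the buffer name, the 128KiB size and the init/exit function names below are not from the kernel source), a driver needing a large, virtually contiguous buffer would typically pair vmalloc() with vfree():

#include <linux/vmalloc.h>
#include <linux/string.h>
#include <linux/errno.h>

#define EXAMPLE_BUF_SIZE (128 * 1024)   /* arbitrary size for illustration */

static void *example_buf;

static int example_init(void)
{
        /* vmalloc() may sleep, so this is only safe in process context */
        example_buf = vmalloc(EXAMPLE_BUF_SIZE);
        if (!example_buf)
                return -ENOMEM;
        memset(example_buf, 0, EXAMPLE_BUF_SIZE);
        return 0;
}

static void example_exit(void)
{
        vfree(example_buf);
}

vmalloc() is normally reserved for large requests because the returned memory is only virtually contiguous, which makes it unsuitable for devices that need physically contiguous buffers.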
This function has three tasks. It page aligns the size request, asks get_vm_area() to find an area for the request and uses __vmalloc_area_pages() to allocate the PTEs and the backing pages.
void * __vmalloc (unsigned long size, int gfp_mask, pgprot_t prot)
{
        void * addr;
        struct vm_struct *area;

        size = PAGE_ALIGN(size);
        if (!size || (size >> PAGE_SHIFT) > num_physpages)
                return NULL;
        area = get_vm_area(size, VM_ALLOC);
        if (!area)
                return NULL;
        addr = area->addr;
        if (__vmalloc_area_pages(VMALLOC_VMADDR(addr), size, gfp_mask,
                                 prot, NULL)) {
                vfree(addr);
                return NULL;
        }
        return addr;
}
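To make the size rounding concrete, the following standalone userspace sketch reproduces the effect of the kernel's PAGE_ALIGN() macro, assuming 4KiB pages; the macros are redefined locally so the program compiles on its own.

#include <stdio.h>

#define PAGE_SHIFT 12                           /* assumption: 4KiB pages */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))
/* Round up to the next page boundary, as the kernel's PAGE_ALIGN() does */
#define PAGE_ALIGN(size) (((size) + PAGE_SIZE - 1) & PAGE_MASK)

int main(void)
{
        printf("%lu\n", PAGE_ALIGN(10000UL));   /* 12288: three full pages */
        printf("%lu\n", PAGE_ALIGN(4096UL));    /* already aligned: 4096 */
        return 0;
}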
To allocate an area for the vm_struct, the slab allocator is asked to provide the necessary memory with kmalloc(). The function then searches the vm_struct list linearly, looking for a hole in the vmalloc address space large enough to satisfy the request, including one extra page at the end of the area that is left unmapped as a guard against overruns.
struct vm_struct * get_vm_area(unsigned long size, unsigned long flags)
{
        unsigned long addr, next;
        struct vm_struct **p, *tmp, *area;

        area = (struct vm_struct *) kmalloc(sizeof(*area), GFP_KERNEL);
        if (!area)
                return NULL;

        size += PAGE_SIZE;
        if(!size) {
                kfree (area);
                return NULL;
        }

        addr = VMALLOC_START;
        write_lock(&vmlist_lock);
        for (p = &vmlist; (tmp = *p) ; p = &tmp->next) {
                if ((size + addr) < addr)
                        goto out;
                if (size + addr <= (unsigned long) tmp->addr)
                        break;
                next = tmp->size + (unsigned long) tmp->addr;
                if (next > addr)
                        addr = next;
                if (addr > VMALLOC_END-size)
                        goto out;
        }
        area->flags = flags;
        area->addr = (void *)addr;
        area->size = size;
        area->next = *p;
        *p = area;
        write_unlock(&vmlist_lock);
        return area;

out:
        write_unlock(&vmlist_lock);
        kfree(area);
        return NULL;
}
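For reference, the descriptor that gets linked into vmlist is essentially the following small structure (as defined in include/linux/vmalloc.h for 2.4):

struct vm_struct {
        unsigned long flags;
        void * addr;
        unsigned long size;
        struct vm_struct * next;
};

The list is kept sorted by address, which is why the linear scan can stop at the first gap large enough for the request.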
This is just a wrapper around __vmalloc_area_pages() and exists for compatibility with older kernels. The name change was made to reflect that the new function, __vmalloc_area_pages(), can take an array of pages to insert into the page tables.
int vmalloc_area_pages(unsigned long address, unsigned long size,
                       int gfp_mask, pgprot_t prot)
{
        return __vmalloc_area_pages(address, size, gfp_mask, prot, NULL);
}
This is the top level of a standard page table walk. It steps through every PGD entry covering the address range and, for each one, calls pmd_alloc() to allocate a PMD directory and then alloc_area_pmd() to fill in that directory.
static inline int __vmalloc_area_pages (unsigned long address,
                                        unsigned long size,
                                        int gfp_mask,
                                        pgprot_t prot,
                                        struct page ***pages)
{
        pgd_t * dir;
        unsigned long end = address + size;
        int ret;

        dir = pgd_offset_k(address);
        spin_lock(&init_mm.page_table_lock);
        do {
                pmd_t *pmd;

                pmd = pmd_alloc(&init_mm, dir, address);
                ret = -ENOMEM;
                if (!pmd)
                        break;

                ret = -ENOMEM;
                if (alloc_area_pmd(pmd, address, end - address, gfp_mask, prot, pages))
                        break;

                address = (address + PGDIR_SIZE) & PGDIR_MASK;
                dir++;

                ret = 0;
        } while (address && (address < end));
        spin_unlock(&init_mm.page_table_lock);
        flush_cache_all();
        return ret;
}
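The stepping expression (address + PGDIR_SIZE) & PGDIR_MASK advances to the start of the next PGD entry, and the address && test in the loop condition stops the walk if that addition wraps past the top of the address space. The userspace sketch below demonstrates the stepping arithmetic; the PGDIR_SHIFT value and the addresses are illustrative (they correspond to a two-level x86 layout) rather than taken from the kernel headers.

#include <stdio.h>

#define PGDIR_SHIFT 22                          /* assumption: 4MiB per PGD entry */
#define PGDIR_SIZE  (1UL << PGDIR_SHIFT)
#define PGDIR_MASK  (~(PGDIR_SIZE - 1))

int main(void)
{
        unsigned long address = 0xFF4FE000UL;   /* arbitrary vmalloc-style range */
        unsigned long end     = 0xFFC00000UL;

        do {
                printf("visit PGD entry covering %#lx\n", address);
                /* jump to the start of the next PGD entry */
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
        } while (address && (address < end));

        return 0;
}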
This is the second stage of the standard page table walk that allocates PTE entries for an address range. For every PMD within the given address range on a PGD, pte_alloc() creates a PTE directory and alloc_area_pte() is then called to allocate the physical pages.
static inline int alloc_area_pmd(pmd_t * pmd, unsigned long address,
                                 unsigned long size, int gfp_mask,
                                 pgprot_t prot, struct page ***pages)
{
        unsigned long end;

        address &= ~PGDIR_MASK;
        end = address + size;
        if (end > PGDIR_SIZE)
                end = PGDIR_SIZE;
        do {
                pte_t * pte = pte_alloc(&init_mm, pmd, address);
                if (!pte)
                        return -ENOMEM;
                if (alloc_area_pte(pte, address, end - address,
                                   gfp_mask, prot, pages))
                        return -ENOMEM;
                address = (address + PMD_SIZE) & PMD_MASK;
                pmd++;
        } while (address < end);
        return 0;
}
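The address &= ~PGDIR_MASK at the top converts the linear address into an offset within the current PGD entry, and the clamp to PGDIR_SIZE keeps the loop inside that entry. A tiny userspace sketch of the same arithmetic, again with an assumed two-level x86 layout and arbitrary values:

#include <stdio.h>

#define PGDIR_SHIFT 22                          /* assumption: 4MiB per PGD entry */
#define PGDIR_SIZE  (1UL << PGDIR_SHIFT)
#define PGDIR_MASK  (~(PGDIR_SIZE - 1))

int main(void)
{
        unsigned long address = 0xFF4FE000UL;   /* arbitrary */
        unsigned long size    = 0x00800000UL;   /* 8MiB: spills past this PGD entry */
        unsigned long end;

        address &= ~PGDIR_MASK;                 /* offset within the PGD entry */
        end = address + size;
        if (end > PGDIR_SIZE)
                end = PGDIR_SIZE;               /* clamp to the end of this entry */

        printf("offset %#lx, walk up to %#lx\n", address, end);
        return 0;
}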
This is the last stage of the page table walk. For every PTE in the given PTE directory and address range, a page is allocated and associated with the PTE. If a pages array was supplied by the caller (as vmap() does), pages are taken from it and an extra reference is taken so that vfree() can release them later; otherwise each page comes from alloc_page(), with the pagetable spinlock dropped around the call because the allocation may sleep.
static inline int alloc_area_pte (pte_t * pte, unsigned long address,
                                  unsigned long size, int gfp_mask,
                                  pgprot_t prot, struct page ***pages)
{
        unsigned long end;

        address &= ~PMD_MASK;
        end = address + size;
        if (end > PMD_SIZE)
                end = PMD_SIZE;
        do {
                struct page * page;

                if (!pages) {
                        spin_unlock(&init_mm.page_table_lock);
                        page = alloc_page(gfp_mask);
                        spin_lock(&init_mm.page_table_lock);
                } else {
                        page = (**pages);
                        (*pages)++;

                        /* Add a reference to the page so we can free later */
                        if (page)
                                atomic_inc(&page->count);

                }
                if (!pte_none(*pte))
                        printk(KERN_ERR "alloc_area_pte: page already exists\n");
                if (!page)
                        return -ENOMEM;
                set_pte(pte, mk_pte(page, prot));
                address += PAGE_SIZE;
                pte++;
        } while (address < end);
        return 0;
}
This function allows a caller-supplied array of pages to be inserted into the vmalloc address space. This is unused in 2.4.22 and I suspect it is an accidental backport from 2.6.x where it is used by the sound subsystem core.
void * vmap(struct page **pages, int count,
            unsigned long flags, pgprot_t prot)
{
        void * addr;
        struct vm_struct *area;
        unsigned long size = count << PAGE_SHIFT;

        if (!size || size > (max_mapnr << PAGE_SHIFT))
                return NULL;
        area = get_vm_area(size, flags);
        if (!area) {
                return NULL;
        }
        addr = area->addr;
        if (__vmalloc_area_pages(VMALLOC_VMADDR(addr), size, 0,
                                 prot, &pages)) {
                vfree(addr);
                return NULL;
        }
        return addr;
}
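Although nothing in 2.4.22 calls it, a caller would be expected to allocate the pages itself and then hand the array to vmap(). The sketch below is illustrative only: the page count, the use of VM_ALLOC as the flag and the helper name are assumptions, not something taken from the kernel.

#include <linux/mm.h>
#include <linux/vmalloc.h>

#define EXAMPLE_NR_PAGES 4                      /* arbitrary */

/* Hypothetical helper: allocate pages and map them virtually contiguously */
static void *example_map_pages(struct page **pages)
{
        void *addr;
        int i;

        for (i = 0; i < EXAMPLE_NR_PAGES; i++) {
                pages[i] = alloc_page(GFP_KERNEL);
                if (!pages[i])
                        goto fail;
        }

        /* vmap() takes its own reference on each page via
         * alloc_area_pte(), so the caller keeps these references */
        addr = vmap(pages, EXAMPLE_NR_PAGES, VM_ALLOC, PAGE_KERNEL);
        if (!addr)
                goto fail;
        return addr;

fail:
        while (--i >= 0)
                __free_page(pages[i]);
        return NULL;
}

Because of the extra reference taken in alloc_area_pte(), a later vfree() only drops the mapping's references; the caller would still have to release its own with __free_page() on each page.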
The call graph for this function is shown in Figure 7.4. This is the top level function responsible for freeing a non-contiguous area of memory. It performs basic sanity checks before searching the vmlist for the vm_struct matching the requested addr. Once found, the area is unlinked from the list, vmfree_area_pages() is called to tear down the mappings and the vm_struct is freed with kfree().
void vfree(void * addr)
{
        struct vm_struct **p, *tmp;

        if (!addr)
                return;
        if ((PAGE_SIZE-1) & (unsigned long) addr) {
                printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
                return;
        }
        write_lock(&vmlist_lock);
        for (p = &vmlist ; (tmp = *p) ; p = &tmp->next) {
                if (tmp->addr == addr) {
                        *p = tmp->next;
                        vmfree_area_pages(VMALLOC_VMADDR(tmp->addr), tmp->size);
                        write_unlock(&vmlist_lock);
                        kfree(tmp);
                        return;
                }
        }
        write_unlock(&vmlist_lock);
        printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", addr);
}
This is the first stage of the page table walk that frees all pages and PTEs associated with an address range. It steps through the relevant PGD entries, calling free_area_pmd() for each one, and is responsible for flushing the CPU cache before the teardown and the TLB after it.
void vmfree_area_pages(unsigned long address, unsigned long size)
{
        pgd_t * dir;
        unsigned long end = address + size;

        dir = pgd_offset_k(address);
        flush_cache_all();
        do {
                free_area_pmd(dir, address, end - address);
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
                dir++;
        } while (address && (address < end));
        flush_tlb_all();
}
This is the second stage of the page table walk. For every PMD in the given directory, free_area_pte() is called to free the pages and PTEs.
static inline void free_area_pmd(pgd_t * dir, unsigned long address, unsigned long size)
{
        pmd_t * pmd;
        unsigned long end;

        if (pgd_none(*dir))
                return;
        if (pgd_bad(*dir)) {
                pgd_ERROR(*dir);
                pgd_clear(dir);
                return;
        }
        pmd = pmd_offset(dir, address);
        address &= ~PGDIR_MASK;
        end = address + size;
        if (end > PGDIR_SIZE)
                end = PGDIR_SIZE;
        do {
                free_area_pte(pmd, address, end - address);
                address = (address + PMD_SIZE) & PMD_MASK;
                pmd++;
        } while (address < end);
}
This is the final stage of the page table walk. For every PTE in the given PMD within the address range, the PTE is cleared and the associated page is freed unless it is a reserved page.
static inline void free_area_pte(pmd_t * pmd, unsigned long address, unsigned long size)
{
        pte_t * pte;
        unsigned long end;

        if (pmd_none(*pmd))
                return;
        if (pmd_bad(*pmd)) {
                pmd_ERROR(*pmd);
                pmd_clear(pmd);
                return;
        }
        pte = pte_offset(pmd, address);
        address &= ~PMD_MASK;
        end = address + size;
        if (end > PMD_SIZE)
                end = PMD_SIZE;
        do {
                pte_t page;
                page = ptep_get_and_clear(pte);
                address += PAGE_SIZE;
                pte++;
                if (pte_none(page))
                        continue;
                if (pte_present(page)) {
                        struct page *ptpage = pte_page(page);
                        if (VALID_PAGE(ptpage) && (!PageReserved(ptpage)))
                                __free_page(ptpage);
                        continue;
                }
                printk(KERN_CRIT "Whee.. Swapped out page in kernel page table\n");
        } while (address < end);
}