This section covers the functions used to allocate, initialise, copy and destroy memory descriptors.
The initial mm_struct in the system is called init_mm and is statically initialised at compile time using the macro INIT_MM().
238 #define INIT_MM(name) \ 239 { \ 240 mm_rb: RB_ROOT, \ 241 pgd: swapper_pg_dir, \ 242 mm_users: ATOMIC_INIT(2), \ 243 mm_count: ATOMIC_INIT(1), \ 244 mmap_sem: __RWSEM_INITIALIZER(name.mmap_sem),\ 245 page_table_lock: SPIN_LOCK_UNLOCKED, \ 246 mmlist: LIST_HEAD_INIT(name.mmlist), \ 247 }
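For reference, the macro is invoked at compile time to define the real init_mm instance. A minimal sketch of how that declaration looks (typically in the architecture's init_task.c):

struct mm_struct init_mm = INIT_MM(init_mm);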
Once init_mm is established, new mm_structs are created as copies of their parent mm_struct. They are copied using copy_mm(), with the process-specific fields initialised by mm_init().
This function makes a copy of the mm_struct for the given task. This is only called from do_fork() after a new process has been created and needs its own mm_struct.
315 static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) 316 { 317 struct mm_struct * mm, *oldmm; 318 int retval; 319 320 tsk->min_flt = tsk->maj_flt = 0; 321 tsk->cmin_flt = tsk->cmaj_flt = 0; 322 tsk->nswap = tsk->cnswap = 0; 323 324 tsk->mm = NULL; 325 tsk->active_mm = NULL; 326 327 /* 328 * Are we cloning a kernel thread? 330 * We need to steal a active VM for that.. 331 */ 332 oldmm = current->mm; 333 if (!oldmm) 334 return 0; 335 336 if (clone_flags & CLONE_VM) { 337 atomic_inc(&oldmm->mm_users); 338 mm = oldmm; 339 goto good_mm; 340 }
Reset fields that are not inherited by a child mm_struct and find a mm to copy from.
342 retval = -ENOMEM; 343 mm = allocate_mm(); 344 if (!mm) 345 goto fail_nomem; 346 347 /* Copy the current MM stuff.. */ 348 memcpy(mm, oldmm, sizeof(*mm)); 349 if (!mm_init(mm)) 350 goto fail_nomem; 351 352 if (init_new_context(tsk,mm)) 353 goto free_pt; 354 355 down_write(&oldmm->mmap_sem); 356 retval = dup_mmap(mm); 357 up_write(&oldmm->mmap_sem); 358
359 if (retval) 360 goto free_pt; 361 362 /* 363 * child gets a private LDT (if there was an LDT in the parent) 364 */ 365 copy_segments(tsk, mm); 366 367 good_mm: 368 tsk->mm = mm; 369 tsk->active_mm = mm; 370 return 0; 371 372 free_pt: 373 mmput(mm); 374 fail_nomem: 375 return retval; 376 }
This function initialises process specific mm fields.
230 static struct mm_struct * mm_init(struct mm_struct * mm) 231 { 232 atomic_set(&mm->mm_users, 1); 233 atomic_set(&mm->mm_count, 1); 234 init_rwsem(&mm->mmap_sem); 235 mm->page_table_lock = SPIN_LOCK_UNLOCKED; 236 mm->pgd = pgd_alloc(mm); 237 mm->def_flags = 0; 238 if (mm->pgd) 239 return mm; 240 free_mm(mm); 241 return NULL; 242 }
Two functions are provided for allocating an mm_struct. To be slightly confusing, they are essentially the same. allocate_mm() will allocate an mm_struct from the slab allocator. mm_alloc() will allocate the struct and then call the function mm_init() to initialise it.
227 #define allocate_mm() (kmem_cache_alloc(mm_cachep, SLAB_KERNEL))
248 struct mm_struct * mm_alloc(void) 249 { 250 struct mm_struct * mm; 251 252 mm = allocate_mm(); 253 if (mm) { 254 memset(mm, 0, sizeof(*mm)); 255 return mm_init(mm); 256 } 257 return NULL; 258 }
A new user of an mm increments the usage count with a simple call:
atomic_inc(&mm->mm_users);
It is decremented with a call to mmput(). If the mm_users count reaches zero, all the mapped regions are deleted with exit_mmap() and the page tables are destroyed as there are no longer any users of the userspace portions. The mm_count reference count is decremented with mmdrop() as all the users of the page tables and VMAs are counted as one mm_struct user. When mm_count reaches zero, the mm_struct will be destroyed.
276 void mmput(struct mm_struct *mm) 277 { 278 if (atomic_dec_and_lock(&mm->mm_users, &mmlist_lock)) { 279 extern struct mm_struct *swap_mm; 280 if (swap_mm == mm) 281 swap_mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist); 282 list_del(&mm->mmlist); 283 mmlist_nr--; 284 spin_unlock(&mmlist_lock); 285 exit_mmap(mm); 286 mmdrop(mm); 287 } 288 }
765 static inline void mmdrop(struct mm_struct * mm) 766 { 767 if (atomic_dec_and_test(&mm->mm_count)) 768 __mmdrop(mm); 769 }
265 inline void __mmdrop(struct mm_struct *mm) 266 { 267 BUG_ON(mm == &init_mm); 268 pgd_free(mm->pgd); 269 destroy_context(mm); 270 free_mm(mm); 271 }
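To make the two reference counts concrete, the following is a minimal sketch (not taken from the kernel source) of how a hypothetical user of an mm would behave. Users of the userspace portion take mm_users and drop it with mmput(); users that only need the mm_struct itself, such as the lazy-TLB reference held through active_mm, take mm_count and drop it with mmdrop().

/* Sketch: pin the userspace portion (VMAs and page tables) of mm */
atomic_inc(&mm->mm_users);
/* ... operate on the address space ... */
mmput(mm);     /* if this was the last user, exit_mmap() runs and mmdrop() is called */

/* Sketch: keep only the mm_struct itself alive */
atomic_inc(&mm->mm_count);
/* ... the userspace portion may be torn down, but the struct remains valid ... */
mmdrop(mm);    /* frees the mm_struct once mm_count reaches zero */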
This large section deals with the creation, deletion and manipulation of memory regions.
The main call graph for creating a memory region is shown in Figure 4.4.
This is a very simple wrapper function around do_mmap_pgoff() which performs most of the work.
557 static inline unsigned long do_mmap(struct file *file, unsigned long addr, 558 unsigned long len, unsigned long prot, 559 unsigned long flag, unsigned long offset) 560 { 561 unsigned long ret = -EINVAL; 562 if ((offset + PAGE_ALIGN(len)) < offset) 563 goto out; 564 if (!(offset & ~PAGE_MASK)) 565 ret = do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); 566 out: 567 return ret; 568 }
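Both do_mmap() and do_mmap_pgoff() expect the caller to hold the mm's mmap_sem for writing. As a hedged sketch (the variables file, addr, len, prot, flags and pgoff are assumed to have been prepared by the caller), a system call entry point in the style of the x86 mmap() implementations would invoke it roughly as follows:

unsigned long ret;

down_write(&current->mm->mmap_sem);
ret = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
up_write(&current->mm->mmap_sem);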
This function is very large and so is broken up into a number of sections. Broadly speaking, the sections are: sanity check the parameters; find a free region in the address space with get_unmapped_area(); calculate the VM flags and check them against the file access permissions and locking limits; clear any old mappings in the requested range; check the address space limits and, for anonymous mappings, try to merge with an adjacent VMA; allocate and fill in a new vm_area_struct; complete the file mapping or set up anonymous shared memory; and finally link the new VMA into the address space and update the statistics.
393 unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned long len, unsigned long prot, 394 unsigned long flags, unsigned long pgoff) 395 { 396 struct mm_struct * mm = current->mm; 397 struct vm_area_struct * vma, * prev; 398 unsigned int vm_flags; 399 int correct_wcount = 0; 400 int error; 401 rb_node_t ** rb_link, * rb_parent; 402 403 if (file && (!file->f_op || !file->f_op->mmap)) 404 return -ENODEV; 405 406 if (!len) 407 return addr; 408 409 len = PAGE_ALIGN(len); 410 if (len > TASK_SIZE || len == 0) return -EINVAL; 413 414 /* offset overflow? */ 415 if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) 416 return -EINVAL; 417 418 /* Too many mappings? */ 419 if (mm->map_count > max_map_count) 420 return -ENOMEM; 421
422 /* Obtain the address to map to. we verify (or select) it and 423 * ensure that it represents a valid section of the address space. 424 */ 425 addr = get_unmapped_area(file, addr, len, pgoff, flags); 426 if (addr & ~PAGE_MASK) 427 return addr; 428
429 /* Do simple checking here so the lower-level routines won't have 430 * to. we assume access permissions have been handled by the open 431 * of the memory object, so we don't do any here. 432 */ 433 vm_flags = calc_vm_flags(prot,flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; 434 435 /* mlock MCL_FUTURE? */ 436 if (vm_flags & VM_LOCKED) { 437 unsigned long locked = mm->locked_vm << PAGE_SHIFT; 438 locked += len; 439 if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur) 440 return -EAGAIN; 441 } 442
443 if (file) { 444 switch (flags & MAP_TYPE) { 445 case MAP_SHARED: 446 if ((prot & PROT_WRITE) && !(file->f_mode & FMODE_WRITE)) 447 return -EACCES; 448 449 /* Make sure we don't allow writing to an append-only file.. */ 450 if (IS_APPEND(file->f_dentry->d_inode) && (file->f_mode & FMODE_WRITE)) 451 return -EACCES; 452 453 /* make sure there are no mandatory locks on the file. */ 454 if (locks_verify_locked(file->f_dentry->d_inode)) 455 return -EAGAIN; 456 457 vm_flags |= VM_SHARED | VM_MAYSHARE; 458 if (!(file->f_mode & FMODE_WRITE)) 459 vm_flags &= ~(VM_MAYWRITE | VM_SHARED); 460 461 /* fall through */ 462 case MAP_PRIVATE: 463 if (!(file->f_mode & FMODE_READ)) 464 return -EACCES; 465 break; 466 467 default: 468 return -EINVAL; 469 }
470 } else { 471 vm_flags |= VM_SHARED | VM_MAYSHARE; 472 switch (flags & MAP_TYPE) { 473 default: 474 return -EINVAL; 475 case MAP_PRIVATE: 476 vm_flags &= ~(VM_SHARED | VM_MAYSHARE); 477 /* fall through */ 478 case MAP_SHARED: 479 break; 480 } 481 }
483 /* Clear old maps */ 484 munmap_back: 485 vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); 486 if (vma && vma->vm_start < addr + len) { 487 if (do_munmap(mm, addr, len)) 488 return -ENOMEM; 489 goto munmap_back; 490 }
491 492 /* Check against address space limit. */ 493 if ((mm->total_vm << PAGE_SHIFT) + len 494 > current->rlim[RLIMIT_AS].rlim_cur) 495 return -ENOMEM; 496 497 /* Private writable mapping? Check memory availability.. */ 498 if ((vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE && 499 !(flags & MAP_NORESERVE) && 500 !vm_enough_memory(len >> PAGE_SHIFT)) 501 return -ENOMEM; 502 503 /* Can we just expand an old anonymous mapping? */ 504 if (!file && !(vm_flags & VM_SHARED) && rb_parent) 505 if (vma_merge(mm, prev, rb_parent, addr, addr + len, vm_flags)) 506 goto out; 507
508 /* Determine the object being mapped and call the appropriate 509 * specific mapper. the address has already been validated, but 510 * not unmapped, but the maps are removed from the list. 511 */ 512 vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); 513 if (!vma) 514 return -ENOMEM; 515 516 vma->vm_mm = mm; 517 vma->vm_start = addr; 518 vma->vm_end = addr + len; 519 vma->vm_flags = vm_flags; 520 vma->vm_page_prot = protection_map[vm_flags & 0x0f]; 521 vma->vm_ops = NULL; 522 vma->vm_pgoff = pgoff; 523 vma->vm_file = NULL; 524 vma->vm_private_data = NULL; 525 vma->vm_raend = 0;
527 if (file) { 528 error = -EINVAL; 529 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) 530 goto free_vma; 531 if (vm_flags & VM_DENYWRITE) { 532 error = deny_write_access(file); 533 if (error) 534 goto free_vma; 535 correct_wcount = 1; 536 } 537 vma->vm_file = file; 538 get_file(file); 539 error = file->f_op->mmap(file, vma); 540 if (error) 541 goto unmap_and_free_vma;
542 } else if (flags & MAP_SHARED) { 543 error = shmem_zero_setup(vma); 544 if (error) 545 goto free_vma; 546 } 547
548 /* Can addr have changed?? 549 * 550 * Answer: Yes, several device drivers can do it in their 551 * f_op->mmap method. -DaveM 552 */ 553 if (addr != vma->vm_start) { 554 /* 555 * It is a bit too late to pretend changing the virtual 556 * area of the mapping, we just corrupted userspace 557 * in the do_munmap, so FIXME (not in 2.4 to avoid 558 * breaking the driver API). 559 */ 560 struct vm_area_struct * stale_vma; 561 /* Since addr changed, we rely on the mmap op to prevent 562 * collisions with existing vmas and just use 563 * find_vma_prepare to update the tree pointers. 564 */ 565 addr = vma->vm_start; 566 stale_vma = find_vma_prepare(mm, addr, &prev, 567 &rb_link, &rb_parent); 568 /* 569 * Make sure the lowlevel driver did its job right. 570 */ 571 if (unlikely(stale_vma && stale_vma->vm_start < vma->vm_end)) { 572 printk(KERN_ERR "buggy mmap operation: [<%p>]\n", 573 file ? file->f_op->mmap : NULL); 574 BUG(); 575 } 576 } 577 578 vma_link(mm, vma, prev, rb_link, rb_parent); 579 if (correct_wcount) 580 atomic_inc(&file->f_dentry->d_inode->i_writecount); 581
582 out: 583 mm->total_vm += len >> PAGE_SHIFT; 584 if (vm_flags & VM_LOCKED) { 585 mm->locked_vm += len >> PAGE_SHIFT; 586 make_pages_present(addr, addr + len); 587 } 588 return addr; 589 590 unmap_and_free_vma: 591 if (correct_wcount) 592 atomic_inc(&file->f_dentry->d_inode->i_writecount); 593 vma->vm_file = NULL; 594 fput(file); 595 596 /* Undo any partial mapping done by a device driver. */ 597 zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start); 598 free_vma: 599 kmem_cache_free(vm_area_cachep, vma); 600 return error; 601 }
The call graph for insert_vm_struct() is shown in Figure 4.6.
This is the top level function for inserting a new vma into an address space. There is a second function like it called simply insert_vm_struct() that is not described in detail here as the only difference is the one line of code increasing the map_count.
1174 void __insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) 1175 { 1176 struct vm_area_struct * __vma, * prev; 1177 rb_node_t ** rb_link, * rb_parent; 1178 1179 __vma = find_vma_prepare(mm, vma->vm_start, &prev, &rb_link, &rb_parent); 1180 if (__vma && __vma->vm_start < vma->vm_end) 1181 BUG(); 1182 __vma_link(mm, vma, prev, rb_link, rb_parent); 1183 mm->map_count++; 1184 validate_mm(mm); 1185 }
This is responsible for finding the correct place to insert a VMA at the supplied address. It returns a number of pieces of information via its return value and its pointer arguments. The VMA following the insertion point is given by the return value. pprev is the previous node, which is required because the linear list is a singly linked list. rb_link and rb_parent are the parent and leaf node that the new VMA will be inserted between.
246 static struct vm_area_struct * find_vma_prepare( struct mm_struct * mm, unsigned long addr, 247 struct vm_area_struct ** pprev, 248 rb_node_t *** rb_link, rb_node_t ** rb_parent) 249 { 250 struct vm_area_struct * vma; 251 rb_node_t ** __rb_link, * __rb_parent, * rb_prev; 252 253 __rb_link = &mm->mm_rb.rb_node; 254 rb_prev = __rb_parent = NULL; 255 vma = NULL; 256 257 while (*__rb_link) { 258 struct vm_area_struct *vma_tmp; 259 260 __rb_parent = *__rb_link; 261 vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); 262 263 if (vma_tmp->vm_end > addr) { 264 vma = vma_tmp; 265 if (vma_tmp->vm_start <= addr) 266 return vma; 267 __rb_link = &__rb_parent->rb_left; 268 } else { 269 rb_prev = __rb_parent; 270 __rb_link = &__rb_parent->rb_right; 271 } 272 } 273 274 *pprev = NULL; 275 if (rb_prev) 276 *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); 277 *rb_link = __rb_link; 278 *rb_parent = __rb_parent; 279 return vma; 280 }
This is the top-level function for linking a VMA into the proper lists. It is responsible for acquiring the necessary locks to make a safe insertion.
337 static inline void vma_link(struct mm_struct * mm, struct vm_area_struct * vma, struct vm_area_struct * prev, 338 rb_node_t ** rb_link, rb_node_t * rb_parent) 339 { 340 lock_vma_mappings(vma); 341 spin_lock(&mm->page_table_lock); 342 __vma_link(mm, vma, prev, rb_link, rb_parent); 343 spin_unlock(&mm->page_table_lock); 344 unlock_vma_mappings(vma); 345 346 mm->map_count++; 347 validate_mm(mm); 348 }
This simply calls three helper functions which are responsible for linking the VMA into the linear list, the red-black tree and the file's shared mapping list respectively.
329 static void __vma_link(struct mm_struct * mm, struct vm_area_struct * vma, struct vm_area_struct * prev, 330 rb_node_t ** rb_link, rb_node_t * rb_parent) 331 { 332 __vma_link_list(mm, vma, prev, rb_parent); 333 __vma_link_rb(mm, vma, rb_link, rb_parent); 334 __vma_link_file(vma); 335 }
282 static inline void __vma_link_list(struct mm_struct * mm, struct vm_area_struct * vma, struct vm_area_struct * prev, 283 rb_node_t * rb_parent) 284 { 285 if (prev) { 286 vma->vm_next = prev->vm_next; 287 prev->vm_next = vma; 288 } else { 289 mm->mmap = vma; 290 if (rb_parent) 291 vma->vm_next = rb_entry(rb_parent, struct vm_area_struct, vm_rb); 292 else 293 vma->vm_next = NULL; 294 } 295 }
The principal workings of this function are stored within <linux/rbtree.h> and will not be discussed in detail in this book.
297 static inline void __vma_link_rb(struct mm_struct * mm, struct vm_area_struct * vma, 298 rb_node_t ** rb_link, rb_node_t * rb_parent) 299 { 300 rb_link_node(&vma->vm_rb, rb_parent, rb_link); 301 rb_insert_color(&vma->vm_rb, &mm->mm_rb); 302 }
This function links the VMA into a linked list of shared file mappings.
304 static inline void __vma_link_file(struct vm_area_struct * vma) 305 { 306 struct file * file; 307 308 file = vma->vm_file; 309 if (file) { 310 struct inode * inode = file->f_dentry->d_inode; 311 struct address_space *mapping = inode->i_mapping; 312 struct vm_area_struct **head; 313 314 if (vma->vm_flags & VM_DENYWRITE) 315 atomic_dec(&inode->i_writecount); 316 317 head = &mapping->i_mmap; 318 if (vma->vm_flags & VM_SHARED) 319 head = &mapping->i_mmap_shared; 320 321 /* insert vma into inode's share list */ 322 if((vma->vm_next_share = *head) != NULL) 323 (*head)->vm_pprev_share = &vma->vm_next_share; 324 *head = vma; 325 vma->vm_pprev_share = head; 326 } 327 }
This function checks to see if the region pointed to by prev may be expanded forwards to cover the area from addr to end instead of allocating a new VMA. If it cannot, the VMA ahead is checked to see if it can be expanded backwards instead.
350 static int vma_merge(struct mm_struct * mm, struct vm_area_struct * prev, 351 rb_node_t * rb_parent, unsigned long addr, unsigned long end, unsigned long vm_flags) 352 { 353 spinlock_t * lock = &mm->page_table_lock; 354 if (!prev) { 355 prev = rb_entry(rb_parent, struct vm_area_struct, vm_rb); 356 goto merge_next; 357 }
358 if (prev->vm_end == addr && can_vma_merge(prev, vm_flags)) { 359 struct vm_area_struct * next; 360 361 spin_lock(lock); 362 prev->vm_end = end; 363 next = prev->vm_next; 364 if (next && prev->vm_end == next->vm_start && can_vma_merge(next, vm_flags)) { 365 prev->vm_end = next->vm_end; 366 __vma_unlink(mm, next, prev); 367 spin_unlock(lock); 368 369 mm->map_count--; 370 kmem_cache_free(vm_area_cachep, next); 371 return 1; 372 } 373 spin_unlock(lock); 374 return 1; 375 } 376 377 prev = prev->vm_next; 378 if (prev) { 379 merge_next: 380 if (!can_vma_merge(prev, vm_flags)) 381 return 0; 382 if (end == prev->vm_start) { 383 spin_lock(lock); 384 prev->vm_start = addr; 385 spin_unlock(lock); 386 return 1; 387 } 388 } 389 390 return 0; 391 }
This trivial function checks to see if the permissions of the supplied VMA match the permissions in vm_flags.
582 static inline int can_vma_merge(struct vm_area_struct * vma, unsigned long vm_flags) 583 { 584 if (!vma->vm_file && vma->vm_flags == vm_flags) 585 return 1; 586 else 587 return 0; 588 }
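The effect of vma_merge() can be observed from userspace. As an illustrative, non-kernel example (not taken from the source discussed here), two anonymous private mappings placed back to back with identical protection are candidates for merging, so /proc/<pid>/maps will normally show them as a single region:

#include <sys/mman.h>

void anon_merge_example(void)
{
        /* first anonymous mapping */
        char *a = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        /* second mapping placed directly after the first with the same flags;
         * the kernel may simply extend the existing VMA rather than create
         * a new one */
        mmap(a + 4096, 4096, PROT_READ | PROT_WRITE,
             MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
}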
The call graph for this function is shown in Figure 4.7. This is the system call to remap a memory region.
347 asmlinkage unsigned long sys_mremap(unsigned long addr,
348         unsigned long old_len, unsigned long new_len,
349         unsigned long flags, unsigned long new_addr)
350 {
351         unsigned long ret;
352
353         down_write(&current->mm->mmap_sem);
354         ret = do_mremap(addr, old_len, new_len, flags, new_addr);
355         up_write(&current->mm->mmap_sem);
356         return ret;
357 }
This function does most of the actual “work” required to remap, resize and move a memory region. It is quite long but can be broken up into distinct parts which will be dealt with separately here. The tasks are, broadly speaking: perform sanity checks on the flags and the address; handle the case where MREMAP_FIXED specifies exactly where the region is to be moved to; shrink the region if the new length is smaller; check that it is safe to grow or move the region; try to grow the region in place; and, failing that, find a new location and move the region with move_vma().
219 unsigned long do_mremap(unsigned long addr, 220 unsigned long old_len, unsigned long new_len, 221 unsigned long flags, unsigned long new_addr) 222 { 223 struct vm_area_struct *vma; 224 unsigned long ret = -EINVAL; 225 226 if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) 227 goto out; 228 229 if (addr & ~PAGE_MASK) 230 goto out; 231 232 old_len = PAGE_ALIGN(old_len); 233 new_len = PAGE_ALIGN(new_len); 234
236 if (flags & MREMAP_FIXED) { 237 if (new_addr & ~PAGE_MASK) 238 goto out; 239 if (!(flags & MREMAP_MAYMOVE)) 240 goto out; 241 242 if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) 243 goto out; 244 245 /* Check if the location we're moving into overlaps the 246 * old location at all, and fail if it does. 247 */ 248 if ((new_addr <= addr) && (new_addr+new_len) > addr) 249 goto out; 250 251 if ((addr <= new_addr) && (addr+old_len) > new_addr) 252 goto out; 253 254 do_munmap(current->mm, new_addr, new_len); 255 }
This block handles the condition where the region location is fixed and must be fully moved. It ensures the area being moved to is safe and definitely unmapped.
261 ret = addr; 262 if (old_len >= new_len) { 263 do_munmap(current->mm, addr+new_len, old_len - new_len); 264 if (!(flags & MREMAP_FIXED) || (new_addr == addr)) 265 goto out; 266 }
271 ret = -EFAULT; 272 vma = find_vma(current->mm, addr); 273 if (!vma || vma->vm_start > addr) 274 goto out; 275 /* We can't remap across vm area boundaries */ 276 if (old_len > vma->vm_end - addr) 277 goto out; 278 if (vma->vm_flags & VM_DONTEXPAND) { 279 if (new_len > old_len) 280 goto out; 281 } 282 if (vma->vm_flags & VM_LOCKED) { 283 unsigned long locked = current->mm->locked_vm << PAGE_SHIFT; 284 locked += new_len - old_len; 285 ret = -EAGAIN; 286 if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur) 287 goto out; 288 } 289 ret = -ENOMEM; 290 if ((current->mm->total_vm << PAGE_SHIFT) + (new_len - old_len) 291 > current->rlim[RLIMIT_AS].rlim_cur) 292 goto out; 293 /* Private writable mapping? Check memory availability.. */ 294 if ((vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE && 295 !(flags & MAP_NORESERVE) && 296 !vm_enough_memory((new_len - old_len) >> PAGE_SHIFT)) 297 goto out;
Do a number of checks to make sure it is safe to grow or move the region.
302 if (old_len == vma->vm_end - addr && 303 !((flags & MREMAP_FIXED) && (addr != new_addr)) && 304 (old_len != new_len || !(flags & MREMAP_MAYMOVE))) { 305 unsigned long max_addr = TASK_SIZE; 306 if (vma->vm_next) 307 max_addr = vma->vm_next->vm_start; 308 /* can we just expand the current mapping? */ 309 if (max_addr - addr >= new_len) { 310 int pages = (new_len - old_len) >> PAGE_SHIFT; 311 spin_lock(&vma->vm_mm->page_table_lock); 312 vma->vm_end = addr + new_len; 313 spin_unlock(&vma->vm_mm->page_table_lock); 314 current->mm->total_vm += pages; 315 if (vma->vm_flags & VM_LOCKED) { 316 current->mm->locked_vm += pages; 317 make_pages_present(addr + old_len, 318 addr + new_len); 319 } 320 ret = addr; 321 goto out; 322 } 323 }
Handle the case where the region is being expanded and cannot be moved.
329 ret = -ENOMEM; 330 if (flags & MREMAP_MAYMOVE) { 331 if (!(flags & MREMAP_FIXED)) { 332 unsigned long map_flags = 0; 333 if (vma->vm_flags & VM_SHARED) 334 map_flags |= MAP_SHARED; 335 336 new_addr = get_unmapped_area(vma->vm_file, 0, new_len, vma->vm_pgoff, map_flags); 337 ret = new_addr; 338 if (new_addr & ~PAGE_MASK) 339 goto out; 340 } 341 ret = move_vma(vma, addr, old_len, new_len, new_addr); 342 } 343 out: 344 return ret; 345 }
To expand the region, a new one has to be allocated and the old one moved to it.
The call graph for this function is shown in Figure 4.8. This function is responsible for moving all the page table entries from one VMA to another region. If necessary, a new VMA will be allocated for the region being moved to. Just like the function above, it is very long but may be broken up into the following distinct parts: checking whether the region being moved to can be merged with the VMAs adjacent to the new location; allocating a new VMA if no merge is possible; and moving the page table entries with move_page_tables() before removing the old region and updating the process address space statistics.
125 static inline unsigned long move_vma(struct vm_area_struct * vma, 126 unsigned long addr, unsigned long old_len, unsigned long new_len, 127 unsigned long new_addr) 128 { 129 struct mm_struct * mm = vma->vm_mm; 130 struct vm_area_struct * new_vma, * next, * prev; 131 int allocated_vma; 132 133 new_vma = NULL; 134 next = find_vma_prev(mm, new_addr, &prev);
135 if (next) { 136 if (prev && prev->vm_end == new_addr && 137 can_vma_merge(prev, vma->vm_flags) && !vma->vm_file && !(vma->vm_flags & VM_SHARED)) { 138 spin_lock(&mm->page_table_lock); 139 prev->vm_end = new_addr + new_len; 140 spin_unlock(&mm->page_table_lock); 141 new_vma = prev; 142 if (next != prev->vm_next) 143 BUG(); 144 if (prev->vm_end == next->vm_start && can_vma_merge(next, prev->vm_flags)) { 145 spin_lock(&mm->page_table_lock); 146 prev->vm_end = next->vm_end; 147 __vma_unlink(mm, next, prev); 148 spin_unlock(&mm->page_table_lock); 149 150 mm->map_count--; 151 kmem_cache_free(vm_area_cachep, next); 152 } 153 } else if (next->vm_start == new_addr + new_len && 154 can_vma_merge(next, vma->vm_flags) && !vma->vm_file && !(vma->vm_flags & VM_SHARED)) { 155 spin_lock(&mm->page_table_lock); 156 next->vm_start = new_addr; 157 spin_unlock(&mm->page_table_lock); 158 new_vma = next; 159 } 160 } else {
In this block, the new location is between two existing VMAs. Checks are made to see if the preceding region can be expanded to cover the new mapping and then if it can be expanded to cover the next VMA as well. If it cannot be expanded, the next region is checked to see if it can be expanded backwards.
161 prev = find_vma(mm, new_addr-1); 162 if (prev && prev->vm_end == new_addr && 163 can_vma_merge(prev, vma->vm_flags) && !vma->vm_file && !(vma->vm_flags & VM_SHARED)) { 164 spin_lock(&mm->page_table_lock); 165 prev->vm_end = new_addr + new_len; 166 spin_unlock(&mm->page_table_lock); 167 new_vma = prev; 168 } 169 }
This block is for the case where the newly mapped region is the last VMA (next is NULL), so a check is made to see if the preceding region can be expanded.
170 171 allocated_vma = 0; 172 if (!new_vma) { 173 new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); 174 if (!new_vma) 175 goto out; 176 allocated_vma = 1; 177 } 178
179 if (!move_page_tables(current->mm, new_addr, addr, old_len)) { 180 unsigned long vm_locked = vma->vm_flags & VM_LOCKED; 181 182 if (allocated_vma) { 183 *new_vma = *vma; 184 new_vma->vm_start = new_addr; 185 new_vma->vm_end = new_addr+new_len; 186 new_vma->vm_pgoff += (addr-vma->vm_start) >> PAGE_SHIFT; 187 new_vma->vm_raend = 0; 188 if (new_vma->vm_file) 189 get_file(new_vma->vm_file); 190 if (new_vma->vm_ops && new_vma->vm_ops->open) 191 new_vma->vm_ops->open(new_vma); 192 insert_vm_struct(current->mm, new_vma); 193 } do_munmap(current->mm, addr, old_len); 197 current->mm->total_vm += new_len >> PAGE_SHIFT; 198 if (new_vma->vm_flags & VM_LOCKED) { 199 current->mm->locked_vm += new_len >> PAGE_SHIFT; 200 make_pages_present(new_vma->vm_start, 201 new_vma->vm_end); 202 } 203 return new_addr; 204 } 205 if (allocated_vma) 206 kmem_cache_free(vm_area_cachep, new_vma); 207 out: 208 return -ENOMEM; 209 }
This function makes all pages between addr and end present. It assumes that the two addresses are within the one VMA.
1460 int make_pages_present(unsigned long addr, unsigned long end) 1461 { 1462 int ret, len, write; 1463 struct vm_area_struct * vma; 1464 1465 vma = find_vma(current->mm, addr); 1466 write = (vma->vm_flags & VM_WRITE) != 0; 1467 if (addr >= end) 1468 BUG(); 1469 if (end > vma->vm_end) 1470 BUG(); 1471 len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE; 1472 ret = get_user_pages(current, current->mm, addr, 1473 len, write, 0, NULL, NULL); 1474 return ret == len ? 0 : -1; 1475 }
This function is used to fault in user pages and may be used to fault in pages belonging to another process, which is required by ptrace() for example.
454 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, 455 int len, int write, int force, struct page **pages, struct vm_area_struct **vmas) 456 { 457 int i; 458 unsigned int flags; 459 460 /* 461 * Require read or write permissions. 462 * If 'force' is set, we only require the "MAY" flags. 463 */ 464 flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); 465 flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); 466 i = 0; 467
468 do { 469 struct vm_area_struct * vma; 470 471 vma = find_extend_vma(mm, start); 472 473 if ( !vma || (pages && vma->vm_flags & VM_IO) || !(flags & vma->vm_flags) ) 474 return i ? : -EFAULT; 475 476 spin_lock(&mm->page_table_lock); 477 do { 478 struct page *map; 479 while (!(map = follow_page(mm, start, write))) { 480 spin_unlock(&mm->page_table_lock); 481 switch (handle_mm_fault(mm, vma, start, write)) { 482 case 1: 483 tsk->min_flt++; 484 break; 485 case 2: 486 tsk->maj_flt++; 487 break; 488 case 0: 489 if (i) return i; 490 return -EFAULT; 491 default: 492 if (i) return i; 493 return -ENOMEM; 494 } 495 spin_lock(&mm->page_table_lock); 496 } 497 if (pages) { 498 pages[i] = get_page_map(map); 499 /* FIXME: call the correct function, 500 * depending on the type of the found page 501 */ 502 if (!pages[i]) 503 goto bad_page; 504 page_cache_get(pages[i]); 505 } 506 if (vmas) 507 vmas[i] = vma; 508 i++; 509 start += PAGE_SIZE; 510 len--; 511 } while(len && start < vma->vm_end); 512 spin_unlock(&mm->page_table_lock); 513 } while(len); 514 out: 515 return i;
516 517 /* 518 * We found an invalid page in the VMA. Release all we have 519 * so far and fail. 520 */ 521 bad_page: 522 spin_unlock(&mm->page_table_lock); 523 while (i--) 524 page_cache_release(pages[i]); 525 i = -EFAULT; 526 goto out; 527 }
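As a usage sketch (modelled on the pattern used by callers such as ptrace's access_process_vm(), not code reproduced from it; tsk, mm and addr are assumed context), a caller pinning a single page of another process takes the mmap_sem for reading, calls get_user_pages() and later drops the extra reference that was taken on the page:

struct page *page;
struct vm_area_struct *vma;
int ret;

down_read(&mm->mmap_sem);
ret = get_user_pages(tsk, mm, addr & PAGE_MASK, 1, 0 /* read */, 0 /* no force */,
                     &page, &vma);
up_read(&mm->mmap_sem);

if (ret == 1) {
        void *kaddr = kmap(page);        /* make the page addressable */
        /* ... copy data from kaddr + (addr & ~PAGE_MASK) ... */
        kunmap(page);
        page_cache_release(page);        /* drop the reference taken by get_user_pages() */
}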
The call graph for this function is shown in Figure 4.9. This function is responsible for copying all the page table entries from the region pointed to by old_addr to new_addr. It works by literally copying page table entries one at a time. When it is finished, it deletes all the entries from the old area. This is not the most efficient way to perform the operation, but it is very easy to recover from errors.
90 static int move_page_tables(struct mm_struct * mm, 91 unsigned long new_addr, unsigned long old_addr, unsigned long len) 92 { 93 unsigned long offset = len; 94 95 flush_cache_range(mm, old_addr, old_addr + len); 96 102 while (offset) { 103 offset -= PAGE_SIZE; 104 if (move_one_page(mm, old_addr + offset, new_addr + offset)) 105 goto oops_we_failed; 106 } 107 flush_tlb_range(mm, old_addr, old_addr + len); 108 return 0; 109 117 oops_we_failed: 118 flush_cache_range(mm, new_addr, new_addr + len); 119 while ((offset += PAGE_SIZE) < len) 120 move_one_page(mm, new_addr + offset, old_addr + offset); 121 zap_page_range(mm, new_addr, len); 122 return -1; 123 }
This function is responsible for acquiring the spinlock before finding the correct PTE with get_one_pte() and copying it with copy_one_pte().
77 static int move_one_page(struct mm_struct *mm, unsigned long old_addr, unsigned long new_addr) 78 { 79 int error = 0; 80 pte_t * src; 81 82 spin_lock(&mm->page_table_lock); 83 src = get_one_pte(mm, old_addr); 84 if (src) 85 error = copy_one_pte(mm, src, alloc_one_pte(mm, new_addr)); 86 spin_unlock(&mm->page_table_lock); 87 return error; 88 }
This is a very simple page table walk.
18 static inline pte_t *get_one_pte(struct mm_struct *mm, unsigned long addr) 19 { 20 pgd_t * pgd; 21 pmd_t * pmd; 22 pte_t * pte = NULL; 23 24 pgd = pgd_offset(mm, addr); 25 if (pgd_none(*pgd)) 26 goto end; 27 if (pgd_bad(*pgd)) { 28 pgd_ERROR(*pgd); 29 pgd_clear(pgd); 30 goto end; 31 } 32 33 pmd = pmd_offset(pgd, addr); 34 if (pmd_none(*pmd)) 35 goto end; 36 if (pmd_bad(*pmd)) { 37 pmd_ERROR(*pmd); 38 pmd_clear(pmd); 39 goto end; 40 } 41 42 pte = pte_offset(pmd, addr); 43 if (pte_none(*pte)) 44 pte = NULL; 45 end: 46 return pte; 47 }
Trivial function to allocate what is necessary for one PTE in a region.
49 static inline pte_t *alloc_one_pte(struct mm_struct *mm, unsigned long addr) 50 { 51 pmd_t * pmd; 52 pte_t * pte = NULL; 53 54 pmd = pmd_alloc(mm, pgd_offset(mm, addr), addr); 55 if (pmd) 56 pte = pte_alloc(mm, pmd, addr); 57 return pte; 58 }
Copies the contents of one PTE to another.
60 static inline int copy_one_pte(struct mm_struct *mm, pte_t * src, pte_t * dst) 61 { 62 int error = 0; 63 pte_t pte; 64 65 if (!pte_none(*src)) { 66 pte = ptep_get_and_clear(src); 67 if (!dst) { 68 /* No dest? We must put it back. */ 69 dst = src; 70 error++; 71 } 72 set_pte(dst, pte); 73 } 74 return error; 75 }
The call graph for this function is shown in Figure 4.11. This function is responsible for unmapping a region. If necessary, the unmapping can span multiple VMAs, and it can partially unmap one. Hence the full unmapping operation is divided into two major operations: this function is responsible for finding what VMAs are affected, and unmap_fixup() is responsible for fixing up the remaining VMAs.
This function is divided up into a number of small sections which will be dealt with in turn. They are, broadly speaking: perform sanity checks, find the first affected VMA and allocate a spare VMA (extra) in case a region has to be split in two; remove all affected VMAs from the linear list and the red-black tree and place them on a private list headed by free; for each VMA on that list, release the pages with zap_page_range() and fix up the region with unmap_fixup(); and finally free the spare VMA if it was not used and free the page tables for the unmapped range.
924 int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) 925 { 926 struct vm_area_struct *mpnt, *prev, **npp, *free, *extra; 927 928 if ((addr & ~PAGE_MASK) || addr > TASK_SIZE || len > TASK_SIZE-addr) 929 return -EINVAL; 930 931 if ((len = PAGE_ALIGN(len)) == 0) 932 return -EINVAL; 933 939 mpnt = find_vma_prev(mm, addr, &prev); 940 if (!mpnt) 941 return 0; 942 /* we have addr < mpnt->vm_end */ 943 944 if (mpnt->vm_start >= addr+len) 945 return 0; 946 948 if ((mpnt->vm_start < addr && mpnt->vm_end > addr+len) 949 && mm->map_count >= max_map_count) 950 return -ENOMEM; 951 956 extra = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); 957 if (!extra) 958 return -ENOMEM;
960 npp = (prev ? &prev->vm_next : &mm->mmap); 961 free = NULL; 962 spin_lock(&mm->page_table_lock); 963 for ( ; mpnt && mpnt->vm_start < addr+len; mpnt = *npp) { 964 *npp = mpnt->vm_next; 965 mpnt->vm_next = free; 966 free = mpnt; 967 rb_erase(&mpnt->vm_rb, &mm->mm_rb); 968 } 969 mm->mmap_cache = NULL; /* Kill the cache. */ 970 spin_unlock(&mm->page_table_lock);
This section takes all the VMAs affected by the unmapping and places them on a separate linked list headed by a variable called free. This makes the fixup of the regions much easier.
971 972 /* Ok - we have the memory areas we should free on the 973 * 'free' list, so release them, and unmap the page range.. 974 * If the one of the segments is only being partially unmapped, 975 * it will put new vm_area_struct(s) into the address space. 976 * In that case we have to be careful with VM_DENYWRITE. 977 */ 978 while ((mpnt = free) != NULL) { 979 unsigned long st, end, size; 980 struct file *file = NULL; 981 982 free = free->vm_next; 983 984 st = addr < mpnt->vm_start ? mpnt->vm_start : addr; 985 end = addr+len; 986 end = end > mpnt->vm_end ? mpnt->vm_end : end; 987 size = end - st; 988 989 if (mpnt->vm_flags & VM_DENYWRITE && 990 (st != mpnt->vm_start || end != mpnt->vm_end) && 991 (file = mpnt->vm_file) != NULL) { 992 atomic_dec(&file->f_dentry->d_inode->i_writecount); 993 } 994 remove_shared_vm_struct(mpnt); 995 mm->map_count--; 996 997 zap_page_range(mm, st, size); 998 999 /* 1000 * Fix the mapping, and free the old area * if it wasn't reused. 1001 */ 1002 extra = unmap_fixup(mm, mpnt, st, size, extra); 1003 if (file) 1004 atomic_inc(&file->f_dentry->d_inode->i_writecount); 1005 }
1006 validate_mm(mm); 1007 1008 /* Release the extra vma struct if it wasn't used */ 1009 if (extra) 1010 kmem_cache_free(vm_area_cachep, extra); 1011 1012 free_pgtables(mm, prev, addr, addr+len); 1013 1014 return 0; 1015 }
This function fixes up the regions after a block has been unmapped. It is passed the VMA that is affected by the unmapping, the address and length of the region to be unmapped and a spare VMA that may be required to fix up the region if a hole is created. There are four principal cases it handles: the unmapping of the whole region, partial unmapping from the start to somewhere in the middle, partial unmapping from somewhere in the middle to the end and the creation of a hole in the middle of the region. Each case will be taken in turn.
787 static struct vm_area_struct * unmap_fixup(struct mm_struct *mm, 788 struct vm_area_struct *area, unsigned long addr, size_t len, 789 struct vm_area_struct *extra) 790 { 791 struct vm_area_struct *mpnt; 792 unsigned long end = addr + len; 793 794 area->vm_mm->total_vm -= len >> PAGE_SHIFT; 795 if (area->vm_flags & VM_LOCKED) 796 area->vm_mm->locked_vm -= len >> PAGE_SHIFT; 797
Function preamble.
798 /* Unmapping the whole area. */ 799 if (addr == area->vm_start && end == area->vm_end) { 800 if (area->vm_ops && area->vm_ops->close) 801 area->vm_ops->close(area); 802 if (area->vm_file) 803 fput(area->vm_file); 804 kmem_cache_free(vm_area_cachep, area); 805 return extra; 806 }
The first, and easiest, case is where the full region is being unmapped.
809 if (end == area->vm_end) { 810 /* 811 * here area isn't visible to the semaphore-less readers 812 * so we don't need to update it under the spinlock. 813 */ 814 area->vm_end = addr; 815 lock_vma_mappings(area); 816 spin_lock(&mm->page_table_lock); 817 }
Handle the case where the region is being unmapped from somewhere in the middle to the end.
817 else if (addr == area->vm_start) { 818 area->vm_pgoff += (end - area->vm_start) >> PAGE_SHIFT; 819 /* same locking considerations of the above case */ 820 area->vm_start = end; 821 lock_vma_mappings(area); 822 spin_lock(&mm->page_table_lock); 823 } else {
Handle the case where the VMA is being unmapped from the start to some point in the middle.
823 } else { 825 /* Add end mapping -- leave beginning for below */ 826 mpnt = extra; 827 extra = NULL; 828 829 mpnt->vm_mm = area->vm_mm; 830 mpnt->vm_start = end; 831 mpnt->vm_end = area->vm_end; 832 mpnt->vm_page_prot = area->vm_page_prot; 833 mpnt->vm_flags = area->vm_flags; 834 mpnt->vm_raend = 0; 835 mpnt->vm_ops = area->vm_ops; 836 mpnt->vm_pgoff = area->vm_pgoff + ((end - area->vm_start) >> PAGE_SHIFT); 837 mpnt->vm_file = area->vm_file; 838 mpnt->vm_private_data = area->vm_private_data; 839 if (mpnt->vm_file) 840 get_file(mpnt->vm_file); 841 if (mpnt->vm_ops && mpnt->vm_ops->open) 842 mpnt->vm_ops->open(mpnt); 843 area->vm_end = addr; /* Truncate area */ 844 845 /* Because mpnt->vm_file == area->vm_file this locks 846 * things correctly. 847 */ 848 lock_vma_mappings(area); 849 spin_lock(&mm->page_table_lock); 850 __insert_vm_struct(mm, mpnt); 851 }
Handle the case where a hole is being created by a partial unmapping. In this case, the extra VMA is required to create a new mapping from the end of the unmapped region to the end of the old VMA.
852 853 __insert_vm_struct(mm, area); 854 spin_unlock(&mm->page_table_lock); 855 unlock_vma_mappings(area); 856 return extra; 857 }
This function simply steps through all VMAs associated with the supplied mm and unmaps them.
1127 void exit_mmap(struct mm_struct * mm) 1128 { 1129 struct vm_area_struct * mpnt; 1130 1131 release_segments(mm); 1132 spin_lock(&mm->page_table_lock); 1133 mpnt = mm->mmap; 1134 mm->mmap = mm->mmap_cache = NULL; 1135 mm->mm_rb = RB_ROOT; 1136 mm->rss = 0; 1137 spin_unlock(&mm->page_table_lock); 1138 mm->total_vm = 0; 1139 mm->locked_vm = 0; 1140 1141 flush_cache_mm(mm); 1142 while (mpnt) { 1143 struct vm_area_struct * next = mpnt->vm_next; 1144 unsigned long start = mpnt->vm_start; 1145 unsigned long end = mpnt->vm_end; 1146 unsigned long size = end - start; 1147 1148 if (mpnt->vm_ops) { 1149 if (mpnt->vm_ops->close) 1150 mpnt->vm_ops->close(mpnt); 1151 } 1152 mm->map_count--; 1153 remove_shared_vm_struct(mpnt); 1154 zap_page_range(mm, start, size); 1155 if (mpnt->vm_file) 1156 fput(mpnt->vm_file); 1157 kmem_cache_free(vm_area_cachep, mpnt); 1158 mpnt = next; 1159 } 1160 flush_tlb_mm(mm); 1161 1162 /* This is just debugging */ 1163 if (mm->map_count) 1164 BUG(); 1165 1166 clear_page_tables(mm, FIRST_USER_PGD_NR, USER_PTRS_PER_PGD); 1167 }
This is the top-level function used to tear down and free the page tables within a region. It is used when page tables need to be torn down, such as when the process exits or a region is unmapped.
146 void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr) 147 { 148 pgd_t * page_dir = mm->pgd; 149 150 spin_lock(&mm->page_table_lock); 151 page_dir += first; 152 do { 153 free_one_pgd(page_dir); 154 page_dir++; 155 } while (--nr); 156 spin_unlock(&mm->page_table_lock); 157 158 /* keep the page table cache within bounds */ 159 check_pgt_cache(); 160 }
This function tears down one PGD. For each PMD in this PGD, free_one_pmd() will be called.
109 static inline void free_one_pgd(pgd_t * dir) 110 { 111 int j; 112 pmd_t * pmd; 113 114 if (pgd_none(*dir)) 115 return; 116 if (pgd_bad(*dir)) { 117 pgd_ERROR(*dir); 118 pgd_clear(dir); 119 return; 120 } 121 pmd = pmd_offset(dir, 0); 122 pgd_clear(dir); 123 for (j = 0; j < PTRS_PER_PMD ; j++) { 124 prefetchw(pmd+j+(PREFETCH_STRIDE/16)); 125 free_one_pmd(pmd+j); 126 } 127 pmd_free(pmd); 128 }
93 static inline void free_one_pmd(pmd_t * dir) 94 { 95 pte_t * pte; 96 97 if (pmd_none(*dir)) 98 return; 99 if (pmd_bad(*dir)) { 100 pmd_ERROR(*dir); 101 pmd_clear(dir); 102 return; 103 } 104 pte = pte_offset(dir, 0); 105 pmd_clear(dir); 106 pte_free(pte); 107 }
The functions in this section deal with searching the virtual address space for mapped and free regions.
661 struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr) 662 { 663 struct vm_area_struct *vma = NULL; 664 665 if (mm) { 666 /* Check the cache first. */ 667 /* (Cache hit rate is typically around 35%.) */ 668 vma = mm->mmap_cache; 669 if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { 670 rb_node_t * rb_node; 671 672 rb_node = mm->mm_rb.rb_node; 673 vma = NULL; 674 675 while (rb_node) { 676 struct vm_area_struct * vma_tmp; 677 678 vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); 679 680 if (vma_tmp->vm_end > addr) { 681 vma = vma_tmp; 682 if (vma_tmp->vm_start <= addr) 683 break; 684 rb_node = rb_node->rb_left; 685 } else 686 rb_node = rb_node->rb_right; 687 } 688 if (vma) 689 mm->mmap_cache = vma; 690 } 691 } 692 return vma; 693 }
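One subtlety is worth illustrating: find_vma() returns the first VMA whose vm_end lies above the supplied address, which is not necessarily a VMA that contains the address. A caller that needs a VMA actually covering the address must check vm_start itself, as the page fault handler does later in this section. A minimal sketch:

struct vm_area_struct *vma = find_vma(mm, addr);

if (vma && vma->vm_start <= addr) {
        /* addr lies inside this VMA */
} else {
        /* addr falls in a hole below vma, or above all mappings if vma is NULL */
}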
696 struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, 697 struct vm_area_struct **pprev) 698 { 699 if (mm) { 700 /* Go through the RB tree quickly. */ 701 struct vm_area_struct * vma; 702 rb_node_t * rb_node, * rb_last_right, * rb_prev; 703 704 rb_node = mm->mm_rb.rb_node; 705 rb_last_right = rb_prev = NULL; 706 vma = NULL; 707 708 while (rb_node) { 709 struct vm_area_struct * vma_tmp; 710 711 vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); 712 713 if (vma_tmp->vm_end > addr) { 714 vma = vma_tmp; 715 rb_prev = rb_last_right; 716 if (vma_tmp->vm_start <= addr) 717 break; 718 rb_node = rb_node->rb_left; 719 } else { 720 rb_last_right = rb_node; 721 rb_node = rb_node->rb_right; 722 } 723 } 724 if (vma) { 725 if (vma->vm_rb.rb_left) { 726 rb_prev = vma->vm_rb.rb_left; 727 while (rb_prev->rb_right) 728 rb_prev = rb_prev->rb_right; 729 } 730 *pprev = NULL; 731 if (rb_prev) 732 *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); 733 if ((rb_prev ? (*pprev)->vm_next : mm->mmap) != vma) 734 BUG(); 735 return vma; 736 } 737 } 738 *pprev = NULL; 739 return NULL; 740 }
673 static inline struct vm_area_struct * find_vma_intersection( struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr) 674 { 675 struct vm_area_struct * vma = find_vma(mm,start_addr); 676 677 if (vma && end_addr <= vma->vm_start) 678 vma = NULL; 679 return vma; 680 }
The call graph for this function is shown at Figure 4.5.
644 unsigned long get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) 645 { 646 if (flags & MAP_FIXED) { 647 if (addr > TASK_SIZE - len) 648 return -ENOMEM; 649 if (addr & ~PAGE_MASK) 650 return -EINVAL; 651 return addr; 652 } 653 654 if (file && file->f_op && file->f_op->get_unmapped_area) 655 return file->f_op->get_unmapped_area(file, addr, len, pgoff, flags); 656 657 return arch_get_unmapped_area(file, addr, len, pgoff, flags); 658 }
Architectures have the option of specifying this function for themselves by defining HAVE_ARCH_UNMAPPED_AREA. If the architecture does not supply one, this version is used.
614 #ifndef HAVE_ARCH_UNMAPPED_AREA 615 static inline unsigned long arch_get_unmapped_area( struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) 616 { 617 struct vm_area_struct *vma; 618 619 if (len > TASK_SIZE) 620 return -ENOMEM; 621 622 if (addr) { 623 addr = PAGE_ALIGN(addr); 624 vma = find_vma(current->mm, addr); 625 if (TASK_SIZE - len >= addr && 626 (!vma || addr + len <= vma->vm_start)) 627 return addr; 628 } 629 addr = PAGE_ALIGN(TASK_UNMAPPED_BASE); 630 631 for (vma = find_vma(current->mm, addr); ; vma = vma->vm_next) { 632 /* At this point: (!vma || addr < vma->vm_end). */ 633 if (TASK_SIZE - len < addr) 634 return -ENOMEM; 635 if (!vma || addr + len <= vma->vm_start) 636 return addr; 637 addr = vma->vm_end; 638 } 639 } 640 #else 641 extern unsigned long arch_get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); 642 #endif
This section contains the functions related to locking and unlocking a region. The main complexity in them is how the regions need to be fixed up after the operation takes place.
The call graph for this function is shown in Figure 4.10. This is the system call mlock() for locking a region of memory into physical memory. This function simply checks to make sure that process and user limits are not exceeded and that the region to lock is page aligned.
195 asmlinkage long sys_mlock(unsigned long start, size_t len)
196 {
197         unsigned long locked;
198         unsigned long lock_limit;
199         int error = -ENOMEM;
200
201         down_write(&current->mm->mmap_sem);
202         len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
203         start &= PAGE_MASK;
204
205         locked = len >> PAGE_SHIFT;
206         locked += current->mm->locked_vm;
207
208         lock_limit = current->rlim[RLIMIT_MEMLOCK].rlim_cur;
209         lock_limit >>= PAGE_SHIFT;
210
211         /* check against resource limits */
212         if (locked > lock_limit)
213                 goto out;
214
215         /* we may lock at most half of physical memory... */
216         /* (this check is pretty bogus, but doesn't hurt) */
217         if (locked > num_physpages/2)
218                 goto out;
219
220         error = do_mlock(start, len, 1);
221 out:
222         up_write(&current->mm->mmap_sem);
223         return error;
224 }
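From userspace, this path is reached through the mlock() library call. As an illustrative, non-kernel example, an application holding sensitive data might lock a buffer so that it can never be written out to swap:

#include <sys/mman.h>

static char secret[4096];

void pin_secret(void)
{
        if (mlock(secret, sizeof(secret)) == 0) {
                /* the pages backing secret are resident and locked in memory */
                /* ... use the buffer ... */
                munlock(secret, sizeof(secret));
        }
}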
This is the system call mlockall() which attempts to lock all pages in the calling process in memory. If MCL_CURRENT is specified, all current pages will be locked. If MCL_FUTURE is specified, all future mappings will be locked. The flags may be or-ed together. This function makes sure that the flags and process limits are ok before calling do_mlockall().
266 asmlinkage long sys_mlockall(int flags)
267 {
268         unsigned long lock_limit;
269         int ret = -EINVAL;
270
271         down_write(&current->mm->mmap_sem);
272         if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE)))
273                 goto out;
274
275         lock_limit = current->rlim[RLIMIT_MEMLOCK].rlim_cur;
276         lock_limit >>= PAGE_SHIFT;
277
278         ret = -ENOMEM;
279         if (current->mm->total_vm > lock_limit)
280                 goto out;
281
282         /* we may lock at most half of physical memory... */
283         /* (this check is pretty bogus, but doesn't hurt) */
284         if (current->mm->total_vm > num_physpages/2)
285                 goto out;
286
287         ret = do_mlockall(flags);
288 out:
289         up_write(&current->mm->mmap_sem);
290         return ret;
291 }
238 static int do_mlockall(int flags) 239 { 240 int error; 241 unsigned int def_flags; 242 struct vm_area_struct * vma; 243 244 if (!capable(CAP_IPC_LOCK)) 245 return -EPERM; 246 247 def_flags = 0; 248 if (flags & MCL_FUTURE) 249 def_flags = VM_LOCKED; 250 current->mm->def_flags = def_flags; 251 252 error = 0; 253 for (vma = current->mm->mmap; vma ; vma = vma->vm_next) { 254 unsigned int newflags; 255 256 newflags = vma->vm_flags | VM_LOCKED; 257 if (!(flags & MCL_CURRENT)) 258 newflags &= ~VM_LOCKED; 259 error = mlock_fixup(vma, vma->vm_start, vma->vm_end, newflags); 260 if (error) 261 break; 262 } 263 return error; 264 }
This function is responsible for starting the work needed to either lock or unlock a region, depending on the value of the on parameter. It is broken up into two sections. The first makes sure the region is page aligned (despite the fact that the only two callers of this function do the same thing) before finding the VMA that is to be adjusted. The second part then sets the appropriate flags before calling mlock_fixup() for each VMA that is affected by this locking.
148 static int do_mlock(unsigned long start, size_t len, int on) 149 { 150 unsigned long nstart, end, tmp; 151 struct vm_area_struct * vma, * next; 152 int error; 153 154 if (on && !capable(CAP_IPC_LOCK)) 155 return -EPERM; 156 len = PAGE_ALIGN(len); 157 end = start + len; 158 if (end < start) 159 return -EINVAL; 160 if (end == start) 161 return 0; 162 vma = find_vma(current->mm, start); 163 if (!vma || vma->vm_start > start) 164 return -ENOMEM;
Page align the request and find the VMA.
166 for (nstart = start ; ; ) { 167 unsigned int newflags; 168 170 171 newflags = vma->vm_flags | VM_LOCKED; 172 if (!on) 173 newflags &= ~VM_LOCKED; 174 175 if (vma->vm_end >= end) { 176 error = mlock_fixup(vma, nstart, end, newflags); 177 break; 178 } 179 180 tmp = vma->vm_end; 181 next = vma->vm_next; 182 error = mlock_fixup(vma, nstart, tmp, newflags); 183 if (error) 184 break; 185 nstart = tmp; 186 vma = next; 187 if (!vma || vma->vm_start != nstart) { 188 error = -ENOMEM; 189 break; 190 } 191 } 192 return error; 193 }
Walk through the VMAs affected by this locking and call mlock_fixup() for each of them.
Page align the request before calling do_mlock() which begins the real work of fixing up the regions.
226 asmlinkage long sys_munlock(unsigned long start, size_t len)
227 {
228         int ret;
229
230         down_write(&current->mm->mmap_sem);
231         len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
232         start &= PAGE_MASK;
233         ret = do_mlock(start, len, 0);
234         up_write(&current->mm->mmap_sem);
235         return ret;
236 }
Trivial function. Passing flags of 0 to do_mlockall() is translated as meaning that none of the current pages are to remain locked and no future mappings should be locked either, which means the VM_LOCKED flag will be removed from all VMAs.
293 asmlinkage long sys_munlockall(void)
294 {
295         int ret;
296
297         down_write(&current->mm->mmap_sem);
298         ret = do_mlockall(0);
299         up_write(&current->mm->mmap_sem);
300         return ret;
301 }
This function identifies four separate types of locking that must be addressed. The first is where the full VMA is to be locked, in which case mlock_fixup_all() is called. The second is where only the beginning portion of the VMA is affected, handled by mlock_fixup_start(). The third is the locking of a region at the end, handled by mlock_fixup_end(), and the last is the locking of a region in the middle of the VMA, handled by mlock_fixup_middle().
117 static int mlock_fixup(struct vm_area_struct * vma, 118 unsigned long start, unsigned long end, unsigned int newflags) 119 { 120 int pages, retval; 121 122 if (newflags == vma->vm_flags) 123 return 0; 124 125 if (start == vma->vm_start) { 126 if (end == vma->vm_end) 127 retval = mlock_fixup_all(vma, newflags); 128 else 129 retval = mlock_fixup_start(vma, end, newflags); 130 } else { 131 if (end == vma->vm_end) 132 retval = mlock_fixup_end(vma, start, newflags); 133 else 134 retval = mlock_fixup_middle(vma, start, end, newflags); 135 } 136 if (!retval) { 137 /* keep track of amount of locked VM */ 138 pages = (end - start) >> PAGE_SHIFT; 139 if (newflags & VM_LOCKED) { 140 pages = -pages; 141 make_pages_present(start, end); 142 } 143 vma->vm_mm->locked_vm -= pages; 144 } 145 return retval; 146 }
15 static inline int mlock_fixup_all(struct vm_area_struct * vma, int newflags) 16 { 17 spin_lock(&vma->vm_mm->page_table_lock); 18 vma->vm_flags = newflags; 19 spin_unlock(&vma->vm_mm->page_table_lock); 20 return 0; 21 }
Slightly more complicated. A new VMA is required to represent the affected region. The start of the old VMA is moved forward.
23 static inline int mlock_fixup_start(struct vm_area_struct * vma, 24 unsigned long end, int newflags) 25 { 26 struct vm_area_struct * n; 27 28 n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); 29 if (!n) 30 return -EAGAIN; 31 *n = *vma; 32 n->vm_end = end; 33 n->vm_flags = newflags; 34 n->vm_raend = 0; 35 if (n->vm_file) 36 get_file(n->vm_file); 37 if (n->vm_ops && n->vm_ops->open) 38 n->vm_ops->open(n); 39 vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT; 40 lock_vma_mappings(vma); 41 spin_lock(&vma->vm_mm->page_table_lock); 42 vma->vm_start = end; 43 __insert_vm_struct(current->mm, n); 44 spin_unlock(&vma->vm_mm->page_table_lock); 45 unlock_vma_mappings(vma); 46 return 0; 47 }
Essentially the same as mlock_fixup_start() except the affected region is at the end of the VMA.
49 static inline int mlock_fixup_end(struct vm_area_struct * vma, 50 unsigned long start, int newflags) 51 { 52 struct vm_area_struct * n; 53 54 n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); 55 if (!n) 56 return -EAGAIN; 57 *n = *vma; 58 n->vm_start = start; 59 n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT; 60 n->vm_flags = newflags; 61 n->vm_raend = 0; 62 if (n->vm_file) 63 get_file(n->vm_file); 64 if (n->vm_ops && n->vm_ops->open) 65 n->vm_ops->open(n); 66 lock_vma_mappings(vma); 67 spin_lock(&vma->vm_mm->page_table_lock); 68 vma->vm_end = start; 69 __insert_vm_struct(current->mm, n); 70 spin_unlock(&vma->vm_mm->page_table_lock); 71 unlock_vma_mappings(vma); 72 return 0; 73 }
Similar to the previous two fixup functions except that two new regions are required to fix up the mapping.
75 static inline int mlock_fixup_middle(struct vm_area_struct * vma, 76 unsigned long start, unsigned long end, int newflags) 77 { 78 struct vm_area_struct * left, * right; 79 80 left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); 81 if (!left) 82 return -EAGAIN; 83 right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); 84 if (!right) { 85 kmem_cache_free(vm_area_cachep, left); 86 return -EAGAIN; 87 } 88 *left = *vma; 89 *right = *vma; 90 left->vm_end = start; 91 right->vm_start = end; 92 right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT; 93 vma->vm_flags = newflags; 94 left->vm_raend = 0; 95 right->vm_raend = 0; 96 if (vma->vm_file) 97 atomic_add(2, &vma->vm_file->f_count); 98 99 if (vma->vm_ops && vma->vm_ops->open) { 100 vma->vm_ops->open(left); 101 vma->vm_ops->open(right); 102 } 103 vma->vm_raend = 0; 104 vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT; 105 lock_vma_mappings(vma); 106 spin_lock(&vma->vm_mm->page_table_lock); 107 vma->vm_start = start; 108 vma->vm_end = end; 109 vma->vm_flags = newflags; 110 __insert_vm_struct(current->mm, left); 111 __insert_vm_struct(current->mm, right); 112 spin_unlock(&vma->vm_mm->page_table_lock); 113 unlock_vma_mappings(vma); 114 return 0; 115 }
This section deals with the page fault handler. It begins with the architecture specific function for the x86 and then moves to the architecture independent layer. The architecture specific functions all have the same responsibilities.
The call graph for this function is shown in Figure 4.12. This is the x86 architecture-dependent function for handling page fault exceptions. Each architecture registers its own handler but all of them have similar responsibilities.
140 asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code) 141 { 142 struct task_struct *tsk; 143 struct mm_struct *mm; 144 struct vm_area_struct * vma; 145 unsigned long address; 146 unsigned long page; 147 unsigned long fixup; 148 int write; 149 siginfo_t info; 150 151 /* get the address */ 152 __asm__("movl %%cr2,%0":"=r" (address)); 153 154 /* It's safe to allow irq's after cr2 has been saved */ 155 if (regs->eflags & X86_EFLAGS_IF) 156 local_irq_enable(); 157 158 tsk = current; 159
Function preamble. Get the faulting address and enable interrupts.
173 if (address >= TASK_SIZE && !(error_code & 5)) 174 goto vmalloc_fault; 175 176 mm = tsk->mm; 177 info.si_code = SEGV_MAPERR; 178 183 if (in_interrupt() || !mm) 184 goto no_context; 185
Check for exceptional faults: kernel faults, faults in interrupt context and faults with no memory context.
186 down_read(&mm->mmap_sem); 187 188 vma = find_vma(mm, address); 189 if (!vma) 190 goto bad_area; 191 if (vma->vm_start <= address) 192 goto good_area; 193 if (!(vma->vm_flags & VM_GROWSDOWN)) 194 goto bad_area; 195 if (error_code & 4) { 196 /* 197 * accessing the stack below %esp is always a bug. 198 * The "+ 32" is there due to some instructions (like 199 * pusha) doing post-decrement on the stack and that 200 * doesn't show up until later.. 201 */ 202 if (address + 32 < regs->esp) 203 goto bad_area; 204 } 205 if (expand_stack(vma, address)) 206 goto bad_area;
If the fault occurred in userspace, find the VMA for the faulting address and determine whether it is a good area, a bad area or whether the fault occurred near a region that can be expanded, such as the stack.
211 good_area: 212 info.si_code = SEGV_ACCERR; 213 write = 0; 214 switch (error_code & 3) { 215 default: /* 3: write, present */ 216 #ifdef TEST_VERIFY_AREA 217 if (regs->cs == KERNEL_CS) 218 printk("WP fault at %08lx\n", regs->eip); 219 #endif 220 /* fall through */ 221 case 2: /* write, not present */ 222 if (!(vma->vm_flags & VM_WRITE)) 223 goto bad_area; 224 write++; 225 break; 226 case 1: /* read, present */ 227 goto bad_area; 228 case 0: /* read, not present */ 229 if (!(vma->vm_flags & (VM_READ | VM_EXEC))) 230 goto bad_area; 231 }
Here the first part of the good area case is handled. The permissions need to be checked in case this is a protection fault.
233 survive: 239 switch (handle_mm_fault(mm, vma, address, write)) { 240 case 1: 241 tsk->min_flt++; 242 break; 243 case 2: 244 tsk->maj_flt++; 245 break; 246 case 0: 247 goto do_sigbus; 248 default: 249 goto out_of_memory; 250 } 251 252 /* 253 * Did it hit the DOS screen memory VA from vm86 mode? 254 */ 255 if (regs->eflags & VM_MASK) { 256 unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; 257 if (bit < 32) 258 tsk->thread.screen_bitmap |= 1 << bit; 259 } 260 up_read(&mm->mmap_sem); 261 return;
At this point, an attempt is going to be made to handle the fault gracefully with handle_mm_fault().
267 bad_area: 268 up_read(&mm->mmap_sem); 269 270 /* User mode accesses just cause a SIGSEGV */ 271 if (error_code & 4) { 272 tsk->thread.cr2 = address; 273 tsk->thread.error_code = error_code; 274 tsk->thread.trap_no = 14; 275 info.si_signo = SIGSEGV; 276 info.si_errno = 0; 277 /* info.si_code has been set above */ 278 info.si_addr = (void *)address; 279 force_sig_info(SIGSEGV, &info, tsk); 280 return; 281 } 282 283 /* 284 * Pentium F0 0F C7 C8 bug workaround. 285 */ 286 if (boot_cpu_data.f00f_bug) { 287 unsigned long nr; 288 289 nr = (address - idt) >> 3; 290 291 if (nr == 6) { 292 do_invalid_op(regs, 0); 293 return; 294 } 295 }
This is the bad area handler, used, for example, when memory with no vm_area_struct managing it is accessed. If the fault was not caused by a user process or by the f00f bug, the code falls through to the no_context label.
296 297 no_context: 298 /* Are we prepared to handle this kernel fault? */ 299 if ((fixup = search_exception_table(regs->eip)) != 0) { 300 regs->eip = fixup; 301 return; 302 }
304 /* 305 * Oops. The kernel tried to access some bad page. We'll have to 306 * terminate things with extreme prejudice. 307 */ 308 309 bust_spinlocks(1); 310 311 if (address < PAGE_SIZE) 312 printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference"); 313 else 314 printk(KERN_ALERT "Unable to handle kernel paging request"); 315 printk(" at virtual address %08lx\n",address); 316 printk(" printing eip:\n"); 317 printk("%08lx\n", regs->eip); 318 asm("movl %%cr3,%0":"=r" (page)); 319 page = ((unsigned long *) __va(page))[address >> 22]; 320 printk(KERN_ALERT "*pde = %08lx\n", page); 321 if (page & 1) { 322 page &= PAGE_MASK; 323 address &= 0x003ff000; 324 page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT]; 325 printk(KERN_ALERT "*pte = %08lx\n", page); 326 } 327 die("Oops", regs, error_code); 328 bust_spinlocks(0); 329 do_exit(SIGKILL);
This is the no_context handler. Some bad exception occurred which is going to end with the process being terminated in all likelihood. Otherwise the kernel faulted when it definitely should not have and an OOPS report is generated.
335 out_of_memory: 336 if (tsk->pid == 1) { 337 yield(); 338 goto survive; 339 } 340 up_read(&mm->mmap_sem); 341 printk("VM: killing process %s\n", tsk->comm); 342 if (error_code & 4) 343 do_exit(SIGKILL); 344 goto no_context;
The out of memory handler. It usually ends with the faulting process being killed unless it is init, in which case the CPU is yielded and the fault retried.
345 346 do_sigbus: 347 up_read(&mm->mmap_sem); 348 353 tsk->thread.cr2 = address; 354 tsk->thread.error_code = error_code; 355 tsk->thread.trap_no = 14; 356 info.si_signo = SIGBUS; 357 info.si_errno = 0; 358 info.si_code = BUS_ADRERR; 359 info.si_addr = (void *)address; 360 force_sig_info(SIGBUS, &info, tsk); 361 362 /* Kernel mode? Handle exceptions or die */ 363 if (!(error_code & 4)) 364 goto no_context; 365 return;
367 vmalloc_fault: 368 { 376 int offset = __pgd_offset(address); 377 pgd_t *pgd, *pgd_k; 378 pmd_t *pmd, *pmd_k; 379 pte_t *pte_k; 380 381 asm("movl %%cr3,%0":"=r" (pgd)); 382 pgd = offset + (pgd_t *)__va(pgd); 383 pgd_k = init_mm.pgd + offset; 384 385 if (!pgd_present(*pgd_k)) 386 goto no_context; 387 set_pgd(pgd, *pgd_k); 388 389 pmd = pmd_offset(pgd, address); 390 pmd_k = pmd_offset(pgd_k, address); 391 if (!pmd_present(*pmd_k)) 392 goto no_context; 393 set_pmd(pmd, *pmd_k); 394 395 pte_k = pte_offset(pmd_k, address); 396 if (!pte_present(*pte_k)) 397 goto no_context; 398 return; 399 } 400 }
This is the vmalloc fault handler. When pages are mapped in the vmalloc space, only the reference page table is updated. As each process references this area, a fault will be trapped and the process page tables will be synchronised with the reference page table here.
This function is called by the architecture-dependent page fault handler. The VMA supplied is guaranteed to be one that can grow to cover the address.
640 static inline int expand_stack(struct vm_area_struct * vma, unsigned long address) 641 { 642 unsigned long grow; 643 644 /* 645 * vma->vm_start/vm_end cannot change under us because * the caller is required 646 * to hold the mmap_sem in write mode. We need to get the 647 * spinlock only before relocating the vma range ourself. 648 */ 649 address &= PAGE_MASK; 650 spin_lock(&vma->vm_mm->page_table_lock); 651 grow = (vma->vm_start - address) >> PAGE_SHIFT; 652 if (vma->vm_end - address > current->rlim[RLIMIT_STACK].rlim_cur || 653 ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > current->rlim[RLIMIT_AS].rlim_cur) { 654 spin_unlock(&vma->vm_mm->page_table_lock); 655 return -ENOMEM; 656 } 657 vma->vm_start = address; 658 vma->vm_pgoff -= grow; 659 vma->vm_mm->total_vm += grow; 660 if (vma->vm_flags & VM_LOCKED) 661 vma->vm_mm->locked_vm += grow; 662 spin_unlock(&vma->vm_mm->page_table_lock); 663 return 0; 664 }
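To see expand_stack() being exercised from user space, consider the following sketch. It is illustrative only; the 16KiB frame size and the recursion depth are arbitrary choices that simply ensure each call touches stack pages below the current bottom of the stack VMA, each first touch being resolved by expand_stack() growing the region downwards.

#include <stdio.h>
#include <sys/resource.h>

static void recurse(int depth)
{
        volatile char frame[16384];     /* large frame so each call needs new pages */

        frame[0] = (char)depth;
        if (depth == 0) {
                printf("deepest frame at %p\n", (void *)frame);
                return;
        }
        recurse(depth - 1);
}

int main(void)
{
        struct rlimit rl;

        getrlimit(RLIMIT_STACK, &rl);
        printf("RLIMIT_STACK soft limit: %lu bytes\n",
               (unsigned long)rl.rlim_cur);

        /* About 1.6MiB of stack in total, comfortably below the usual
         * soft limit. Growing past RLIMIT_STACK would make expand_stack()
         * return -ENOMEM and the process would receive SIGSEGV instead. */
        recurse(100);
        return 0;
}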
This is the top-level pair of functions for the architecture-independent page fault handler.
The call graph for this function is shown in Figure 4.14. This function allocates the PMD and PTE necessary for this new PTE that is about to be allocated. It takes the necessary locks to protect the page tables before calling handle_pte_fault() to fault in the page itself.
1364 int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, 1365 unsigned long address, int write_access) 1366 { 1367 pgd_t *pgd; 1368 pmd_t *pmd; 1369 1370 current->state = TASK_RUNNING; 1371 pgd = pgd_offset(mm, address); 1372 1373 /* 1374 * We need the page table lock to synchronize with kswapd 1375 * and the SMP-safe atomic PTE updates. 1376 */ 1377 spin_lock(&mm->page_table_lock); 1378 pmd = pmd_alloc(mm, pgd, address); 1379 1380 if (pmd) { 1381 pte_t * pte = pte_alloc(mm, pmd, address); 1382 if (pte) 1383 return handle_pte_fault(mm, vma, address, write_access, pte); 1384 } 1385 spin_unlock(&mm->page_table_lock); 1386 return -1; 1387 }
This function decides what type of fault this is and which function should handle it. do_no_page() is called if this is the first time a page is to be allocated. do_swap_page() handles the case where the page was swapped out to disk with the exception of pages swapped out from tmpfs. do_wp_page() breaks COW pages. If none of them are appropriate, the PTE entry is simply updated. If it was written to, it is marked dirty and it is marked accessed to show it is a young page.
1331 static inline int handle_pte_fault(struct mm_struct *mm, 1332 struct vm_area_struct * vma, unsigned long address, 1333 int write_access, pte_t * pte) 1334 { 1335 pte_t entry; 1336 1337 entry = *pte; 1338 if (!pte_present(entry)) { 1339 /* 1340 * If it truly wasn't present, we know that kswapd 1341 * and the PTE updates will not touch it later. So 1342 * drop the lock. 1343 */ 1344 if (pte_none(entry)) 1345 return do_no_page(mm, vma, address, write_access, pte); 1346 return do_swap_page(mm, vma, address, pte, entry, write_access); 1347 } 1348 1349 if (write_access) { 1350 if (!pte_write(entry)) 1351 return do_wp_page(mm, vma, address, pte, entry); 1352 1353 entry = pte_mkdirty(entry); 1354 } 1355 entry = pte_mkyoung(entry); 1356 establish_pte(vma, address, pte, entry); 1357 spin_unlock(&mm->page_table_lock); 1358 return 1; 1359 }
The call graph for this function is shown in Figure 4.15. This function is called the first time a page is referenced so that it may be allocated and filled with data if necessary. If it is an anonymous page, determined by the lack of a vm_ops available to the VMA or the lack of a nopage() function, then do_anonymous_page() is called. Otherwise the supplied nopage() function is called to allocate a page and it is inserted into the page tables here. The tasks of the function are discussed with the code blocks that follow.
1245 static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma, 1246 unsigned long address, int write_access, pte_t *page_table) 1247 { 1248 struct page * new_page; 1249 pte_t entry; 1250 1251 if (!vma->vm_ops || !vma->vm_ops->nopage) 1252 return do_anonymous_page(mm, vma, page_table, write_access, address); 1253 spin_unlock(&mm->page_table_lock); 1254 1255 new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0); 1256 1257 if (new_page == NULL) /* no page was available -- SIGBUS */ 1258 return 0; 1259 if (new_page == NOPAGE_OOM) 1260 return -1;
1265 if (write_access && !(vma->vm_flags & VM_SHARED)) { 1266 struct page * page = alloc_page(GFP_HIGHUSER); 1267 if (!page) { 1268 page_cache_release(new_page); 1269 return -1; 1270 } 1271 copy_user_highpage(page, new_page, address); 1272 page_cache_release(new_page); 1273 lru_cache_add(page); 1274 new_page = page; 1275 }
Break COW early in this block if appropriate. COW is broken if the fault is a write fault and the region is not shared (VM_SHARED is not set). If COW was not broken in this case, a second fault would occur immediately upon return.
1277 spin_lock(&mm->page_table_lock); 1288 /* Only go through if we didn't race with anybody else... */ 1289 if (pte_none(*page_table)) { 1290 ++mm->rss; 1291 flush_page_to_ram(new_page); 1292 flush_icache_page(vma, new_page); 1293 entry = mk_pte(new_page, vma->vm_page_prot); 1294 if (write_access) 1295 entry = pte_mkwrite(pte_mkdirty(entry)); 1296 set_pte(page_table, entry); 1297 } else { 1298 /* One of our sibling threads was faster, back out. */ 1299 page_cache_release(new_page); 1300 spin_unlock(&mm->page_table_lock); 1301 return 1; 1302 } 1303 1304 /* no need to invalidate: a not-present page shouldn't * be cached */ 1305 update_mmu_cache(vma, address, entry); 1306 spin_unlock(&mm->page_table_lock); 1307 return 2; /* Major fault */ 1308 }
This function allocates a new page for a process accessing a page for the first time. If it is a read access, a system-wide page containing only zeros is mapped into the process. If it is a write access, a zero-filled page is allocated and placed within the page tables.
1190 static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr) 1191 { 1192 pte_t entry; 1193 1194 /* Read-only mapping of ZERO_PAGE. */ 1195 entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); 1196 1197 /* ..except if it's a write access */ 1198 if (write_access) { 1199 struct page *page; 1200 1201 /* Allocate our own private page. */ 1202 spin_unlock(&mm->page_table_lock); 1203 1204 page = alloc_page(GFP_HIGHUSER); 1205 if (!page) 1206 goto no_mem; 1207 clear_user_highpage(page, addr); 1208 1209 spin_lock(&mm->page_table_lock); 1210 if (!pte_none(*page_table)) { 1211 page_cache_release(page); 1212 spin_unlock(&mm->page_table_lock); 1213 return 1; 1214 } 1215 mm->rss++; 1216 flush_page_to_ram(page); 1217 entry = pte_mkwrite( pte_mkdirty(mk_pte(page, vma->vm_page_prot))); 1218 lru_cache_add(page); 1219 mark_page_accessed(page); 1220 } 1221 1222 set_pte(page_table, entry); 1223 1224 /* No need to invalidate - it was non-present before */ 1225 update_mmu_cache(vma, addr, entry); 1226 spin_unlock(&mm->page_table_lock); 1227 return 1; /* Minor fault */ 1228 1229 no_mem: 1230 return -1; 1231 }
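The behaviour of do_anonymous_page() can be observed from user space. The sketch below is illustrative only and assumes a 4KiB page size: the first read of an untouched anonymous page maps the system-wide zero page write-protected, the first write to an untouched page takes the zero-filled private page path and a later write to the zero-page mapping is dealt with by do_wp_page(), covered below.

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
        /* Two anonymous pages that have never been touched */
        char *p = mmap(NULL, 8192, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
                return 1;

        char c = p[0];  /* read fault: ZERO_PAGE mapped write-protected */
        p[4096] = 'x';  /* write fault: private zero-filled page allocated */
        p[0] = 'y';     /* write to the zero-page mapping: handled by do_wp_page() */

        printf("%d %c %c\n", c, p[4096], p[0]);
        munmap(p, 8192);
        return 0;
}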
The call graph for this function is shown in Figure 4.16. This function handles the case where a page has been swapped out. A swapped out page may exist in the swap cache if it is shared between a number of processes or recently swapped in during readahead. This function is broken up into three parts
1117 static int do_swap_page(struct mm_struct * mm, 1118 struct vm_area_struct * vma, unsigned long address, 1119 pte_t * page_table, pte_t orig_pte, int write_access) 1120 { 1121 struct page *page; 1122 swp_entry_t entry = pte_to_swp_entry(orig_pte); 1123 pte_t pte; 1124 int ret = 1; 1125 1126 spin_unlock(&mm->page_table_lock); 1127 page = lookup_swap_cache(entry);
Function preamble, check for the page in the swap cache
1128 if (!page) { 1129 swapin_readahead(entry); 1130 page = read_swap_cache_async(entry); 1131 if (!page) { 1136 int retval; 1137 spin_lock(&mm->page_table_lock); 1138 retval = pte_same(*page_table, orig_pte) ? -1 : 1; 1139 spin_unlock(&mm->page_table_lock); 1140 return retval; 1141 } 1142 1143 /* Had to read the page from swap area: Major fault */ 1144 ret = 2; 1145 }
If the page did not exist in the swap cache, then read it from backing storage with swapin_readahead(), which reads in the requested page and a number of pages after it. Once it completes, read_swap_cache_async() should be able to return the page.
1147 mark_page_accessed(page); 1148 1149 lock_page(page); 1150 1151 /* 1152 * Back out if somebody else faulted in this pte while we 1153 * released the page table lock. 1154 */ 1155 spin_lock(&mm->page_table_lock); 1156 if (!pte_same(*page_table, orig_pte)) { 1157 spin_unlock(&mm->page_table_lock); 1158 unlock_page(page); 1159 page_cache_release(page); 1160 return 1; 1161 } 1162 1163 /* The page isn't present yet, go ahead with the fault. */ 1164 1165 swap_free(entry); 1166 if (vm_swap_full()) 1167 remove_exclusive_swap_page(page); 1168 1169 mm->rss++; 1170 pte = mk_pte(page, vma->vm_page_prot); 1171 if (write_access && can_share_swap_page(page)) 1172 pte = pte_mkdirty(pte_mkwrite(pte)); 1173 unlock_page(page); 1174 1175 flush_page_to_ram(page); 1176 flush_icache_page(vma, page); 1177 set_pte(page_table, pte); 1178 1179 /* No need to invalidate - it was non-present before */ 1180 update_mmu_cache(vma, address, pte); 1181 spin_unlock(&mm->page_table_lock); 1182 return ret; 1183 }
Place the page in the process page tables
This function determines whether the swap cache entry for this page may be used or not. It may be used if there are no other references to it. Most of the work is performed by exclusive_swap_page() but this function first makes a few basic checks to avoid having to acquire too many locks.
259 int can_share_swap_page(struct page *page) 260 { 261 int retval = 0; 262 263 if (!PageLocked(page)) 264 BUG(); 265 switch (page_count(page)) { 266 case 3: 267 if (!page->buffers) 268 break; 269 /* Fallthrough */ 270 case 2: 271 if (!PageSwapCache(page)) 272 break; 273 retval = exclusive_swap_page(page); 274 break; 275 case 1: 276 if (PageReserved(page)) 277 break; 278 retval = 1; 279 } 280 return retval; 281 }
This function checks if the process is the only user of a locked swap page.
229 static int exclusive_swap_page(struct page *page) 230 { 231 int retval = 0; 232 struct swap_info_struct * p; 233 swp_entry_t entry; 234 235 entry.val = page->index; 236 p = swap_info_get(entry); 237 if (p) { 238 /* Is the only swap cache user the cache itself? */ 239 if (p->swap_map[SWP_OFFSET(entry)] == 1) { 240 /* Recheck the page count with the pagecache * lock held.. */ 241 spin_lock(&pagecache_lock); 242 if (page_count(page) - !!page->buffers == 2) 243 retval = 1; 244 spin_unlock(&pagecache_lock); 245 } 246 swap_info_put(p); 247 } 248 return retval; 249 }
The call graph for this function is shown in Figure 4.17. This function handles the case where a user tries to write to a private page shared among processes, such as what happens after fork(). Basically, a new page is allocated, the contents are copied to it and the reference count on the old page is decremented.
948 static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, 949 unsigned long address, pte_t *page_table, pte_t pte) 950 { 951 struct page *old_page, *new_page; 952 953 old_page = pte_page(pte); 954 if (!VALID_PAGE(old_page)) 955 goto bad_wp_page; 956
957 if (!TryLockPage(old_page)) { 958 int reuse = can_share_swap_page(old_page); 959 unlock_page(old_page); 960 if (reuse) { 961 flush_cache_page(vma, address); 962 establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte)))); 963 spin_unlock(&mm->page_table_lock); 964 return 1; /* Minor fault */ 965 } 966 }
968 /* 969 * Ok, we need to copy. Oh, well.. 970 */ 971 page_cache_get(old_page); 972 spin_unlock(&mm->page_table_lock); 973 974 new_page = alloc_page(GFP_HIGHUSER); 975 if (!new_page) 976 goto no_mem; 977 copy_cow_page(old_page,new_page,address); 978
982 spin_lock(&mm->page_table_lock); 983 if (pte_same(*page_table, pte)) { 984 if (PageReserved(old_page)) 985 ++mm->rss; 986 break_cow(vma, new_page, address, page_table); 987 lru_cache_add(new_page); 988 989 /* Free the old page.. */ 990 new_page = old_page; 991 } 992 spin_unlock(&mm->page_table_lock); 993 page_cache_release(new_page); 994 page_cache_release(old_page); 995 return 1; /* Minor fault */
996 997 bad_wp_page: 998 spin_unlock(&mm->page_table_lock); 999 printk("do_wp_page: bogus page at address %08lx (page 0x%lx)\n", address,(unsigned long)old_page); 1000 return -1; 1001 no_mem: 1002 page_cache_release(old_page); 1003 return -1; 1004 }
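A user-space sketch of the situation do_wp_page() deals with, illustrative only: after fork(), parent and child share all anonymous pages write-protected and the first write by either process triggers the copy.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        char *buf = malloc(4096);
        strcpy(buf, "parent data");

        pid_t pid = fork();     /* the page holding buf is now shared read-only */
        if (pid == 0) {
                /* This write faults and do_wp_page() gives the child its
                 * own copy before the modification is made. */
                strcpy(buf, "child data");
                printf("child sees:  %s\n", buf);
                exit(0);
        }
        waitpid(pid, NULL, 0);
        printf("parent sees: %s\n", buf);       /* still "parent data" */
        free(buf);
        return 0;
}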
This is more the domain of the IO manager than the VM but because it performs the operations via the page cache, we will cover it briefly. The operation of generic_file_write() is essentially the same although it is not covered by this book. However, if you understand how the read takes place, the write function will pose no problem to you.
This is the generic file read function used by any filesystem that reads pages through the page cache. For normal IO, it is responsible for building a read_descriptor_t for use with do_generic_file_read() and file_read_actor(). For direct IO, this function is basically a wrapper around generic_file_direct_IO().
1695 ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos) 1696 { 1697 ssize_t retval; 1698 1699 if ((ssize_t) count < 0) 1700 return -EINVAL; 1701 1702 if (filp->f_flags & O_DIRECT) 1703 goto o_direct; 1704 1705 retval = -EFAULT; 1706 if (access_ok(VERIFY_WRITE, buf, count)) { 1707 retval = 0; 1708 1709 if (count) { 1710 read_descriptor_t desc; 1711 1712 desc.written = 0; 1713 desc.count = count; 1714 desc.buf = buf; 1715 desc.error = 0; 1716 do_generic_file_read(filp, ppos, &desc, file_read_actor); 1717 1718 retval = desc.written; 1719 if (!retval) 1720 retval = desc.error; 1721 } 1722 } 1723 out: 1724 return retval;
This block is concerned with normal file IO.
1725 1726 o_direct: 1727 { 1728 loff_t pos = *ppos, size; 1729 struct address_space *mapping = filp->f_dentry->d_inode->i_mapping; 1730 struct inode *inode = mapping->host; 1731 1732 retval = 0; 1733 if (!count) 1734 goto out; /* skip atime */ 1735 down_read(&inode->i_alloc_sem); 1736 down(&inode->i_sem); 1737 size = inode->i_size; 1738 if (pos < size) { 1739 retval = generic_file_direct_IO(READ, filp, buf, count, pos); 1740 if (retval > 0) 1741 *ppos = pos + retval; 1742 } 1743 UPDATE_ATIME(filp->f_dentry->d_inode); 1744 goto out; 1745 } 1746 }
This block is concerned with direct IO. It is largely responsible for extracting the parameters required for generic_file_direct_IO().
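From user space, the two paths through generic_file_read() are selected simply by whether O_DIRECT was used when opening the file. The sketch below is illustrative only; the file name is arbitrary and the 4096-byte alignment is an assumption that satisfies most block devices.

#define _GNU_SOURCE             /* for O_DIRECT */
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
        char buf[4096];
        ssize_t n;

        /* Ordinary read: serviced through the page cache by
         * do_generic_file_read() and file_read_actor() */
        int fd = open("/etc/hostname", O_RDONLY);
        if (fd < 0)
                return 1;
        n = read(fd, buf, sizeof(buf));
        close(fd);

        /* O_DIRECT read: generic_file_read() branches to o_direct and
         * calls generic_file_direct_IO(). The buffer, offset and length
         * must be suitably aligned for the underlying device */
        void *dbuf;
        if (posix_memalign(&dbuf, 4096, 4096) != 0)
                return 1;
        fd = open("/etc/hostname", O_RDONLY | O_DIRECT);
        if (fd >= 0) {
                n = read(fd, dbuf, 4096);
                close(fd);
        }
        free(dbuf);
        return n < 0;
}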
This is the core part of the generic file read operation. It is responsible for allocating a page if it doesn't already exist in the page cache. If it does, it must make sure the page is up-to-date and finally, it is responsible for making sure that the appropriate readahead window is set.
1349 void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor) 1350 { 1351 struct address_space *mapping = filp->f_dentry->d_inode->i_mapping; 1352 struct inode *inode = mapping->host; 1353 unsigned long index, offset; 1354 struct page *cached_page; 1355 int reada_ok; 1356 int error; 1357 int max_readahead = get_max_readahead(inode); 1358 1359 cached_page = NULL; 1360 index = *ppos >> PAGE_CACHE_SHIFT; 1361 offset = *ppos & ~PAGE_CACHE_MASK; 1362
1363 /* 1364 * If the current position is outside the previous read-ahead 1365 * window, we reset the current read-ahead context and set read 1366 * ahead max to zero (will be set to just needed value later), 1367 * otherwise, we assume that the file accesses are sequential 1368 * enough to continue read-ahead. 1369 */ 1370 if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) { 1371 reada_ok = 0; 1372 filp->f_raend = 0; 1373 filp->f_ralen = 0; 1374 filp->f_ramax = 0; 1375 filp->f_rawin = 0; 1376 } else { 1377 reada_ok = 1; 1378 } 1379 /* 1380 * Adjust the current value of read-ahead max. 1381 * If the read operation stay in the first half page, force no 1382 * readahead. Otherwise try to increase read ahead max just * enough to do the read request. 1383 * Then, at least MIN_READAHEAD if read ahead is ok, 1384 * and at most MAX_READAHEAD in all cases. 1385 */ 1386 if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) { 1387 filp->f_ramax = 0; 1388 } else { 1389 unsigned long needed; 1390 1391 needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1; 1392 1393 if (filp->f_ramax < needed) 1394 filp->f_ramax = needed; 1395 1396 if (reada_ok && filp->f_ramax < vm_min_readahead) 1397 filp->f_ramax = vm_min_readahead; 1398 if (filp->f_ramax > max_readahead) 1399 filp->f_ramax = max_readahead; 1400 }
1402 for (;;) { 1403 struct page *page, **hash; 1404 unsigned long end_index, nr, ret; 1405 1406 end_index = inode->i_size >> PAGE_CACHE_SHIFT; 1407 1408 if (index > end_index) 1409 break; 1410 nr = PAGE_CACHE_SIZE; 1411 if (index == end_index) { 1412 nr = inode->i_size & ~PAGE_CACHE_MASK; 1413 if (nr <= offset) 1414 break; 1415 } 1416 1417 nr = nr - offset; 1418 1419 /* 1420 * Try to find the data in the page cache.. 1421 */ 1422 hash = page_hash(mapping, index); 1423 1424 spin_lock(&pagecache_lock); 1425 page = __find_page_nolock(mapping, index, *hash); 1426 if (!page) 1427 goto no_cached_page;
1428 found_page: 1429 page_cache_get(page); 1430 spin_unlock(&pagecache_lock); 1431 1432 if (!Page_Uptodate(page)) 1433 goto page_not_up_to_date; 1434 generic_file_readahead(reada_ok, filp, inode, page);
In this block, the page was found in the page cache.
1435 page_ok: 1436 /* If users can be writing to this page using arbitrary 1437 * virtual addresses, take care about potential aliasing 1438 * before reading the page on the kernel side. 1439 */ 1440 if (mapping->i_mmap_shared != NULL) 1441 flush_dcache_page(page); 1442 1443 /* 1444 * Mark the page accessed if we read the 1445 * beginning or we just did an lseek. 1446 */ 1447 if (!offset || !filp->f_reada) 1448 mark_page_accessed(page); 1449 1450 /* 1451 * Ok, we have the page, and it's up-to-date, so 1452 * now we can copy it to user space... 1453 * 1454 * The actor routine returns how many bytes were actually used.. 1455 * NOTE! This may not be the same as how much of a user buffer 1456 * we filled up (we may be padding etc), so we can only update 1457 * "pos" here (the actor routine has to update the user buffer 1458 * pointers and the remaining count). 1459 */ 1460 ret = actor(desc, page, offset, nr); 1461 offset += ret; 1462 index += offset >> PAGE_CACHE_SHIFT; 1463 offset &= ~PAGE_CACHE_MASK; 1464 1465 page_cache_release(page); 1466 if (ret == nr && desc->count) 1467 continue; 1468 break;
In this block, the page is present in the page cache and ready to be read by the file read actor function.
1470 /* 1471 * Ok, the page was not immediately readable, so let's try to read * ahead while we're at it.. 1472 */ 1473 page_not_up_to_date: 1474 generic_file_readahead(reada_ok, filp, inode, page); 1475 1476 if (Page_Uptodate(page)) 1477 goto page_ok; 1478 1479 /* Get exclusive access to the page ... */ 1480 lock_page(page); 1481 1482 /* Did it get unhashed before we got the lock? */ 1483 if (!page->mapping) { 1484 UnlockPage(page); 1485 page_cache_release(page); 1486 continue; 1487 } 1488 1489 /* Did somebody else fill it already? */ 1490 if (Page_Uptodate(page)) { 1491 UnlockPage(page); 1492 goto page_ok; 1493 }
In this block, the page being read was not up-to-date with information on the disk. generic_file_readahead() is called to update the current page and readahead as IO is required anyway.
1495 readpage: 1496 /* ... and start the actual read. The read will * unlock the page. */ 1497 error = mapping->a_ops->readpage(filp, page); 1498 1499 if (!error) { 1500 if (Page_Uptodate(page)) 1501 goto page_ok; 1502 1503 /* Again, try some read-ahead while waiting for * the page to finish.. */ 1504 generic_file_readahead(reada_ok, filp, inode, page); 1505 wait_on_page(page); 1506 if (Page_Uptodate(page)) 1507 goto page_ok; 1508 error = -EIO; 1509 } 1510 1511 /* UHHUH! A synchronous read error occurred. Report it */ 1512 desc->error = error; 1513 page_cache_release(page); 1514 break;
At this block, readahead failed so the page is read synchronously with the readpage() function supplied by the address_space.
1516 no_cached_page: 1517 /* 1518 * Ok, it wasn't cached, so we need to create a new 1519 * page.. 1520 * 1521 * We get here with the page cache lock held. 1522 */ 1523 if (!cached_page) { 1524 spin_unlock(&pagecache_lock); 1525 cached_page = page_cache_alloc(mapping); 1526 if (!cached_page) { 1527 desc->error = -ENOMEM; 1528 break; 1529 } 1530 1531 /* 1532 * Somebody may have added the page while we 1533 * dropped the page cache lock. Check for that. 1534 */ 1535 spin_lock(&pagecache_lock); 1536 page = __find_page_nolock(mapping, index, *hash); 1537 if (page) 1538 goto found_page; 1539 } 1540 1541 /* 1542 * Ok, add the new page to the hash-queues... 1543 */ 1544 page = cached_page; 1545 __add_to_page_cache(page, mapping, index, hash); 1546 spin_unlock(&pagecache_lock); 1547 lru_cache_add(page); 1548 cached_page = NULL; 1549 1550 goto readpage; 1551 }
In this block, the page does not exist in the page cache so allocate one and add it.
1552 1553 *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; 1554 filp->f_reada = 1; 1555 if (cached_page) 1556 page_cache_release(cached_page); 1557 UPDATE_ATIME(inode); 1558 }
This function performs generic file read-ahead. Readahead is one of the few areas that is very heavily commented upon in the code. It is highly recommended that you read the comments in mm/filemap.c marked with “Read-ahead context”.
1222 static void generic_file_readahead(int reada_ok, 1223 struct file * filp, struct inode * inode, 1224 struct page * page) 1225 { 1226 unsigned long end_index; 1227 unsigned long index = page->index; 1228 unsigned long max_ahead, ahead; 1229 unsigned long raend; 1230 int max_readahead = get_max_readahead(inode); 1231 1232 end_index = inode->i_size >> PAGE_CACHE_SHIFT; 1233 1234 raend = filp->f_raend; 1235 max_ahead = 0;
1236 1237 /* 1238 * The current page is locked. 1239 * If the current position is inside the previous read IO request, 1240 * do not try to reread previously read ahead pages. 1241 * Otherwise decide or not to read ahead some pages synchronously. 1242 * If we are not going to read ahead, set the read ahead context 1243 * for this page only. 1244 */ 1245 if (PageLocked(page)) { 1246 if (!filp->f_ralen || index >= raend || index + filp->f_rawin < raend) { 1247 raend = index; 1248 if (raend < end_index) 1249 max_ahead = filp->f_ramax; 1250 filp->f_rawin = 0; 1251 filp->f_ralen = 1; 1252 if (!max_ahead) { 1253 filp->f_raend = index + filp->f_ralen; 1254 filp->f_rawin += filp->f_ralen; 1255 } 1256 } 1257 }
This block has encountered a page that is locked so it must decide whether to temporarily disable readahead.
1258 /* 1259 * The current page is not locked. 1260 * If we were reading ahead and, 1261 * if the current max read ahead size is not zero and, 1262 * if the current position is inside the last read-ahead IO 1263 * request, it is the moment to try to read ahead asynchronously. 1264 * We will later force unplug device in order to force * asynchronous read IO. 1265 */ 1266 else if (reada_ok && filp->f_ramax && raend >= 1 && 1267 index <= raend && index + filp->f_ralen >= raend) { 1268 /* 1269 * Add ONE page to max_ahead in order to try to have about the 1270 * same IO maxsize as synchronous read-ahead * (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE. 1271 * Compute the position of the last page we have tried to read 1272 * in order to begin to read ahead just at the next page. 1273 */ 1274 raend -= 1; 1275 if (raend < end_index) 1276 max_ahead = filp->f_ramax + 1; 1277 1278 if (max_ahead) { 1279 filp->f_rawin = filp->f_ralen; 1280 filp->f_ralen = 0; 1281 reada_ok = 2; 1282 } 1283 }
This is one of the rare cases where the in-code commentary makes the code as clear as it possibly could be. Basically, it is saying that if the current page is not locked for IO, then extend the readahead window slightly and remember that readahead is currently going well.
1284 /* 1285 * Try to read ahead pages. 1286 * We hope that ll_rw_blk() plug/unplug, coalescence, requests 1287 * sort and the scheduler, will work enough for us to avoid too * bad actuals IO requests. 1288 */ 1289 ahead = 0; 1290 while (ahead < max_ahead) { 1291 ahead ++; 1292 if ((raend + ahead) >= end_index) 1293 break; 1294 if (page_cache_read(filp, raend + ahead) < 0) 1295 break; 1296 }
This block performs the actual readahead by calling page_cache_read() for each of the pages in the readahead window. Note here how ahead is incremented for each page that is read ahead.
1297 /* 1298 * If we tried to read ahead some pages, 1299 * If we tried to read ahead asynchronously, 1300 * Try to force unplug of the device in order to start an 1301 * asynchronous read IO request. 1302 * Update the read-ahead context. 1303 * Store the length of the current read-ahead window. 1304 * Double the current max read ahead size. 1305 * That heuristic avoid to do some large IO for files that are 1306 * not really accessed sequentially. 1307 */ 1308 if (ahead) { 1309 filp->f_ralen += ahead; 1310 filp->f_rawin += filp->f_ralen; 1311 filp->f_raend = raend + ahead + 1; 1312 1313 filp->f_ramax += filp->f_ramax; 1314 1315 if (filp->f_ramax > max_readahead) 1316 filp->f_ramax = max_readahead; 1317 1318 #ifdef PROFILE_READAHEAD 1319 profile_readahead((reada_ok == 2), filp); 1320 #endif 1321 } 1322 1323 return; 1324 }
If readahead was successful, then update the readahead fields in the struct file to mark the progress. This basically grows the readahead context but it can be reset by do_generic_file_read() if the readahead is found to be ineffective.
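For instance, suppose f_ramax was 4 pages and all 4 readahead pages were submitted successfully: ahead ends up as 4, so f_ralen grows by 4, f_raend is moved to just past the new window and f_ramax doubles to 8, subject to the max_readahead clamp.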
This is the generic mmap() function used by many filesystems as the mmap() entry in their struct file_operations. It is mainly responsible for ensuring the appropriate address_space functions exist and setting what VMA operations to use.
2249 int generic_file_mmap(struct file * file, struct vm_area_struct * vma) 2250 { 2251 struct address_space *mapping = file->f_dentry->d_inode->i_mapping; 2252 struct inode *inode = mapping->host; 2253 2254 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) { 2255 if (!mapping->a_ops->writepage) 2256 return -EINVAL; 2257 } 2258 if (!mapping->a_ops->readpage) 2259 return -ENOEXEC; 2260 UPDATE_ATIME(inode); 2261 vma->vm_ops = &generic_file_vm_ops; 2262 return 0; 2263 }
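From user space, this is the path set up by an ordinary mmap() of a regular file: generic_file_vm_ops supplies filemap_nopage() as the nopage() handler, so each first touch of a mapped page is a fault serviced by the page cache. The sketch below is illustrative only and the file name is arbitrary.

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/etc/hostname", O_RDONLY);
        struct stat st;

        if (fd < 0 || fstat(fd, &st) < 0 || st.st_size == 0)
                return 1;

        /* generic_file_mmap() verifies a readpage() function exists and
         * installs generic_file_vm_ops on the VMA */
        char *p = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
        if (p == MAP_FAILED)
                return 1;

        /* Each page is faulted in on first access via filemap_nopage() */
        fwrite(p, 1, st.st_size, stdout);

        munmap(p, st.st_size);
        close(fd);
        return 0;
}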
This section covers the path where a file is being truncated. The actual system call truncate() is implemented by sys_truncate() in fs/open.c. By the time the top-level function in the VM is called (vmtruncate()), the dentry information for the file has been updated and the inode's semaphore has been acquired.
This is the top-level VM function responsible for truncating a file. When it completes, all page table entries mapping pages that have been truncated have been unmapped and reclaimed if possible.
1042 int vmtruncate(struct inode * inode, loff_t offset) 1043 { 1044 unsigned long pgoff; 1045 struct address_space *mapping = inode->i_mapping; 1046 unsigned long limit; 1047 1048 if (inode->i_size < offset) 1049 goto do_expand; 1050 inode->i_size = offset; 1051 spin_lock(&mapping->i_shared_lock); 1052 if (!mapping->i_mmap && !mapping->i_mmap_shared) 1053 goto out_unlock; 1054 1055 pgoff = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1056 if (mapping->i_mmap != NULL) 1057 vmtruncate_list(mapping->i_mmap, pgoff); 1058 if (mapping->i_mmap_shared != NULL) 1059 vmtruncate_list(mapping->i_mmap_shared, pgoff); 1060 1061 out_unlock: 1062 spin_unlock(&mapping->i_shared_lock); 1063 truncate_inode_pages(mapping, offset); 1064 goto out_truncate; 1065 1066 do_expand: 1067 limit = current->rlim[RLIMIT_FSIZE].rlim_cur; 1068 if (limit != RLIM_INFINITY && offset > limit) 1069 goto out_sig; 1070 if (offset > inode->i_sb->s_maxbytes) 1071 goto out; 1072 inode->i_size = offset; 1073 1074 out_truncate: 1075 if (inode->i_op && inode->i_op->truncate) { 1076 lock_kernel(); 1077 inode->i_op->truncate(inode); 1078 unlock_kernel(); 1079 } 1080 return 0; 1081 out_sig: 1082 send_sig(SIGXFSZ, current, 0); 1083 out: 1084 return -EFBIG; 1085 }
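The effect of the truncation path is visible from user space. Once vmtruncate() has zapped the page table entries beyond the new end of file, the next access to that part of a mapping faults again, filemap_nopage() finds the offset past i_size and the process receives SIGBUS. The sketch below is illustrative only and assumes a 4KiB page size and a writable /tmp.

#include <fcntl.h>
#include <setjmp.h>
#include <signal.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

static sigjmp_buf env;

static void on_sigbus(int sig)
{
        (void)sig;
        siglongjmp(env, 1);
}

int main(void)
{
        int fd = open("/tmp/trunc-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);
        if (fd < 0)
                return 1;
        ftruncate(fd, 8192);            /* two pages */

        char *p = mmap(NULL, 8192, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED)
                return 1;
        p[4096] = 'x';                  /* second page is within the file: fine */

        signal(SIGBUS, on_sigbus);
        ftruncate(fd, 4096);            /* vmtruncate() unmaps the second page */

        if (sigsetjmp(env, 1) == 0) {
                char c = p[4096];       /* beyond the new end of file */
                printf("unexpected: %c\n", c);
        } else {
                printf("SIGBUS after truncation, as expected\n");
        }

        munmap(p, 8192);
        close(fd);
        unlink("/tmp/trunc-demo");
        return 0;
}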
This function cycles through all VMAs in an address_space's list and calls zap_page_range() for the range of addresses which map a file that is being truncated.
1006 static void vmtruncate_list(struct vm_area_struct *mpnt, unsigned long pgoff) 1007 { 1008 do { 1009 struct mm_struct *mm = mpnt->vm_mm; 1010 unsigned long start = mpnt->vm_start; 1011 unsigned long end = mpnt->vm_end; 1012 unsigned long len = end - start; 1013 unsigned long diff; 1014 1015 /* mapping wholly truncated? */ 1016 if (mpnt->vm_pgoff >= pgoff) { 1017 zap_page_range(mm, start, len); 1018 continue; 1019 } 1020 1021 /* mapping wholly unaffected? */ 1022 len = len >> PAGE_SHIFT; 1023 diff = pgoff - mpnt->vm_pgoff; 1024 if (diff >= len) 1025 continue; 1026 1027 /* Ok, partially affected.. */ 1028 start += diff << PAGE_SHIFT; 1029 len = (len - diff) << PAGE_SHIFT; 1030 zap_page_range(mm, start, len); 1031 } while ((mpnt = mpnt->vm_next_share) != NULL); 1032 }
This function is the top-level pagetable-walk function which unmaps user pages in the specified range from a mm_struct.
360 void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size) 361 { 362 mmu_gather_t *tlb; 363 pgd_t * dir; 364 unsigned long start = address, end = address + size; 365 int freed = 0; 366 367 dir = pgd_offset(mm, address); 368 369 /* 370 * This is a long-lived spinlock. That's fine. 371 * There's no contention, because the page table 372 * lock only protects against kswapd anyway, and 373 * even if kswapd happened to be looking at this 374 * process we _want_ it to get stuck. 375 */ 376 if (address >= end) 377 BUG(); 378 spin_lock(&mm->page_table_lock); 379 flush_cache_range(mm, address, end); 380 tlb = tlb_gather_mmu(mm); 381 382 do { 383 freed += zap_pmd_range(tlb, dir, address, end - address); 384 address = (address + PGDIR_SIZE) & PGDIR_MASK; 385 dir++; 386 } while (address && (address < end)); 387 388 /* this will flush any remaining tlb entries */ 389 tlb_finish_mmu(tlb, start, end); 390 391 /* 392 * Update rss for the mm_struct (not necessarily current->mm) 393 * Notice that rss is an unsigned long. 394 */ 395 if (mm->rss > freed) 396 mm->rss -= freed; 397 else 398 mm->rss = 0; 399 spin_unlock(&mm->page_table_lock); 400 }
This function is unremarkable. It steps through the PMDs that are affected by the requested range and calls zap_pte_range() for each one.
331 static inline int zap_pmd_range(mmu_gather_t *tlb, pgd_t * dir, unsigned long address, unsigned long size) 332 { 333 pmd_t * pmd; 334 unsigned long end; 335 int freed; 336 337 if (pgd_none(*dir)) 338 return 0; 339 if (pgd_bad(*dir)) { 340 pgd_ERROR(*dir); 341 pgd_clear(dir); 342 return 0; 343 } 344 pmd = pmd_offset(dir, address); 345 end = address + size; 346 if (end > ((address + PGDIR_SIZE) & PGDIR_MASK)) 347 end = ((address + PGDIR_SIZE) & PGDIR_MASK); 348 freed = 0; 349 do { 350 freed += zap_pte_range(tlb, pmd, address, end - address); 351 address = (address + PMD_SIZE) & PMD_MASK; 352 pmd++; 353 } while (address < end); 354 return freed; 355 }
This function calls tlb_remove_page() for each PTE in the requested pmd within the requested address range.
294 static inline int zap_pte_range(mmu_gather_t *tlb, pmd_t * pmd, unsigned long address, unsigned long size) 295 { 296 unsigned long offset; 297 pte_t * ptep; 298 int freed = 0; 299 300 if (pmd_none(*pmd)) 301 return 0; 302 if (pmd_bad(*pmd)) { 303 pmd_ERROR(*pmd); 304 pmd_clear(pmd); 305 return 0; 306 } 307 ptep = pte_offset(pmd, address); 308 offset = address & ~PMD_MASK; 309 if (offset + size > PMD_SIZE) 310 size = PMD_SIZE - offset; 311 size &= PAGE_MASK; 312 for (offset=0; offset < size; ptep++, offset += PAGE_SIZE) { 313 pte_t pte = *ptep; 314 if (pte_none(pte)) 315 continue; 316 if (pte_present(pte)) { 317 struct page *page = pte_page(pte); 318 if (VALID_PAGE(page) && !PageReserved(page)) 319 freed ++; 320 /* This will eventually call __free_pte on the pte. */ 321 tlb_remove_page(tlb, ptep, address + offset); 322 } else { 323 free_swap_and_cache(pte_to_swp_entry(pte)); 324 pte_clear(ptep); 325 } 326 } 327 328 return freed; 329 }
This is the top-level function responsible for truncating all pages from the page cache that occur after lstart in a mapping.
327 void truncate_inode_pages(struct address_space * mapping, loff_t lstart) 328 { 329 unsigned long start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 330 unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); 331 int unlocked; 332 333 spin_lock(&pagecache_lock); 334 do { 335 unlocked = truncate_list_pages(&mapping->clean_pages, start, &partial); 336 unlocked |= truncate_list_pages(&mapping->dirty_pages, start, &partial); 337 unlocked |= truncate_list_pages(&mapping->locked_pages, start, &partial); 338 } while (unlocked); 339 /* Traversed all three lists without dropping the lock */ 340 spin_unlock(&pagecache_lock); 341 }
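As a worked example, assume a PAGE_CACHE_SIZE of 4096 bytes and a file truncated to 10000 bytes: start = (10000 + 4095) >> 12 = 3 and partial = 10000 & 4095 = 1808. Pages with index 3 and above are removed completely while page 2, which now contains the new end of the file, has its bytes from offset 1808 onwards zeroed by truncate_partial_page().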
This function searches the requested list (head) which is part of an address_space. If pages are found after start, they will be truncated.
259 static int truncate_list_pages(struct list_head *head, unsigned long start, unsigned *partial) 260 { 261 struct list_head *curr; 262 struct page * page; 263 int unlocked = 0; 264 265 restart: 266 curr = head->prev; 267 while (curr != head) { 268 unsigned long offset; 269 270 page = list_entry(curr, struct page, list); 271 offset = page->index; 272 273 /* Is one of the pages to truncate? */ 274 if ((offset >= start) || (*partial && (offset + 1) == start)) { 275 int failed; 276 277 page_cache_get(page); 278 failed = TryLockPage(page); 279 280 list_del(head); 281 if (!failed) 282 /* Restart after this page */ 283 list_add_tail(head, curr); 284 else 285 /* Restart on this page */ 286 list_add(head, curr); 287 288 spin_unlock(&pagecache_lock); 289 unlocked = 1; 290 291 if (!failed) { 292 if (*partial && (offset + 1) == start) { 293 truncate_partial_page(page, *partial); 294 *partial = 0; 295 } else 296 truncate_complete_page(page); 297 298 UnlockPage(page); 299 } else 300 wait_on_page(page); 301 302 page_cache_release(page); 303 304 if (current->need_resched) { 305 __set_current_state(TASK_RUNNING); 306 schedule(); 307 } 308 309 spin_lock(&pagecache_lock); 310 goto restart; 311 } 312 curr = curr->prev; 313 } 314 return unlocked; 315 }
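This function truncates a complete page. Any buffers associated with the page are flushed, the page is removed from the LRU lists (unless it has been converted into anonymous buffers) and from the page cache, and its page cache reference is dropped.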
239 static void truncate_complete_page(struct page *page) 240 { 241 /* Leave it on the LRU if it gets converted into * anonymous buffers */ 242 if (!page->buffers || do_flushpage(page, 0)) 243 lru_cache_del(page); 244 245 /* 246 * We remove the page from the page cache _after_ we have 247 * destroyed all buffer-cache references to it. Otherwise some 248 * other process might think this inode page is not in the 249 * page cache and creates a buffer-cache alias to it causing 250 * all sorts of fun problems ... 251 */ 252 ClearPageDirty(page); 253 ClearPageUptodate(page); 254 remove_inode_page(page); 255 page_cache_release(page); 256 }
This function is responsible for flushing all buffers associated with a page.
223 static int do_flushpage(struct page *page, unsigned long offset) 224 { 225 int (*flushpage) (struct page *, unsigned long); 226 flushpage = page->mapping->a_ops->flushpage; 227 if (flushpage) 228 return (*flushpage)(page, offset); 229 return block_flushpage(page, offset); 230 }
This function partially truncates a page by zeroing out the higher bytes no longer in use and flushing any associated buffers.
232 static inline void truncate_partial_page(struct page *page, unsigned partial) 233 { 234 memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); 235 if (page->buffers) 236 do_flushpage(page, partial); 237 }
This is the generic nopage() function used by many VMAs. This loops around itself with a large number of gotos which can be difficult to trace but there is nothing novel here. It is principally responsible for fetching the faulting page from either the page cache or reading it from disk. If appropriate it will also perform file read-ahead.
1994 struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address, int unused) 1995 { 1996 int error; 1997 struct file *file = area->vm_file; 1998 struct address_space *mapping = file->f_dentry->d_inode->i_mapping; 1999 struct inode *inode = mapping->host; 2000 struct page *page, **hash; 2001 unsigned long size, pgoff, endoff; 2002 2003 pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; 2004 endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; 2005
This block acquires the struct file, address_space and inode which are important for this page fault. It then acquires the starting offset within the file needed for this fault and the offset that corresponds to the end of this VMA. The offset is the end of the VMA instead of the end of the page in case file read-ahead is performed.
2006 retry_all: 2007 /* 2008 * An external ptracer can access pages that normally aren't 2009 * accessible.. 2010 */ 2011 size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 2012 if ((pgoff >= size) && (area->vm_mm == current->mm)) 2013 return NULL; 2014 2015 /* The "size" of the file, as far as mmap is concerned, isn't bigger than the mapping */ 2016 if (size > endoff) 2017 size = endoff; 2018 2019 /* 2020 * Do we have something in the page cache already? 2021 */ 2022 hash = page_hash(mapping, pgoff); 2023 retry_find: 2024 page = __find_get_page(mapping, pgoff, hash); 2025 if (!page) 2026 goto no_cached_page; 2027 2028 /* 2029 * Ok, found a page in the page cache, now we need to check 2030 * that it's up-to-date. 2031 */ 2032 if (!Page_Uptodate(page)) 2033 goto page_not_uptodate;
2035 success: 2036 /* 2037 * Try read-ahead for sequential areas. 2038 */ 2039 if (VM_SequentialReadHint(area)) 2040 nopage_sequential_readahead(area, pgoff, size); 2041 2042 /* 2043 * Found the page and have a reference on it, need to check sharing 2044 * and possibly copy it over to another page.. 2045 */ 2046 mark_page_accessed(page); 2047 flush_page_to_ram(page); 2048 return page; 2049
2050 no_cached_page: 2051 /* 2052 * If the requested offset is within our file, try to read 2053 * a whole cluster of pages at once. 2054 * 2055 * Otherwise, we're off the end of a privately mapped file, 2056 * so we need to map a zero page. 2057 */ 2058 if ((pgoff < size) && !VM_RandomReadHint(area)) 2059 error = read_cluster_nonblocking(file, pgoff, size); 2060 else 2061 error = page_cache_read(file, pgoff); 2062 2063 /* 2064 * The page we want has now been added to the page cache. 2065 * In the unlikely event that someone removed it in the 2066 * meantime, we'll just come back here and read it again. 2067 */ 2068 if (error >= 0) 2069 goto retry_find; 2070 2071 /* 2072 * An error return from page_cache_read can result if the 2073 * system is low on memory, or a problem occurs while trying 2074 * to schedule I/O. 2075 */ 2076 if (error == -ENOMEM) 2077 return NOPAGE_OOM; 2078 return NULL;
2080 page_not_uptodate: 2081 lock_page(page); 2082 2083 /* Did it get unhashed while we waited for it? */ 2084 if (!page->mapping) { 2085 UnlockPage(page); 2086 page_cache_release(page); 2087 goto retry_all; 2088 } 2089 2090 /* Did somebody else get it up-to-date? */ 2091 if (Page_Uptodate(page)) { 2092 UnlockPage(page); 2093 goto success; 2094 } 2095 2096 if (!mapping->a_ops->readpage(file, page)) { 2097 wait_on_page(page); 2098 if (Page_Uptodate(page)) 2099 goto success; 2100 }
In this block, the page was found but it was not up-to-date so the reasons for the page not being up to date are checked. If it looks ok, the appropriate readpage() function is called to resync the page.
2101 2102 /* 2103 * Umm, take care of errors if the page isn't up-to-date. 2104 * Try to re-read it _once_. We do this synchronously, 2105 * because there really aren't any performance issues here 2106 * and we need to check for errors. 2107 */ 2108 lock_page(page); 2109 2110 /* Somebody truncated the page on us? */ 2111 if (!page->mapping) { 2112 UnlockPage(page); 2113 page_cache_release(page); 2114 goto retry_all; 2115 } 2116 2117 /* Somebody else successfully read it in? */ 2118 if (Page_Uptodate(page)) { 2119 UnlockPage(page); 2120 goto success; 2121 } 2122 ClearPageError(page); 2123 if (!mapping->a_ops->readpage(file, page)) { 2124 wait_on_page(page); 2125 if (Page_Uptodate(page)) 2126 goto success; 2127 } 2128 2129 /* 2130 * Things didn't work out. Return zero to tell the 2131 * mm layer so, possibly freeing the page cache page first. 2132 */ 2133 page_cache_release(page); 2134 return NULL; 2135 }
In this path, the page is not up-to-date due to some IO error. A second attempt is made to read the page data and, if it fails, NULL is returned.
This function adds the page corresponding to the offset within the file to the page cache if it does not exist there already.
702 static int page_cache_read(struct file * file, unsigned long offset) 703 { 704 struct address_space *mapping = file->f_dentry->d_inode->i_mapping; 705 struct page **hash = page_hash(mapping, offset); 706 struct page *page; 707 708 spin_lock(&pagecache_lock); 709 page = __find_page_nolock(mapping, offset, *hash); 710 spin_unlock(&pagecache_lock); 711 if (page) 712 return 0; 713 714 page = page_cache_alloc(mapping); 715 if (!page) 716 return -ENOMEM; 717 718 if (!add_to_page_cache_unique(page, mapping, offset, hash)) { 719 int error = mapping->a_ops->readpage(file, page); 720 page_cache_release(page); 721 return error; 722 } 723 /* 724 * We arrive here in the unlikely event that someone 725 * raced with us and added our page to the cache first. 726 */ 727 page_cache_release(page); 728 return 0; 729 }
This function is only called by filemap_nopage() when the VM_SEQ_READ flag has been specified in the VMA. When half of the current readahead window has been faulted in, the next readahead window is scheduled for IO and pages from the previous window are freed.
1936 static void nopage_sequential_readahead( struct vm_area_struct * vma, 1937 unsigned long pgoff, unsigned long filesize) 1938 { 1939 unsigned long ra_window; 1940 1941 ra_window = get_max_readahead(vma->vm_file->f_dentry->d_inode); 1942 ra_window = CLUSTER_OFFSET(ra_window + CLUSTER_PAGES - 1); 1943 1944 /* vm_raend is zero if we haven't read ahead * in this area yet. */ 1945 if (vma->vm_raend == 0) 1946 vma->vm_raend = vma->vm_pgoff + ra_window; 1947
1948 /* 1949 * If we've just faulted the page half-way through our window, 1950 * then schedule reads for the next window, and release the 1951 * pages in the previous window. 1952 */ 1953 if ((pgoff + (ra_window >> 1)) == vma->vm_raend) { 1954 unsigned long start = vma->vm_pgoff + vma->vm_raend; 1955 unsigned long end = start + ra_window; 1956 1957 if (end > ((vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff)) 1958 end = (vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff; 1959 if (start > end) 1960 return; 1961 1962 while ((start < end) && (start < filesize)) { 1963 if (read_cluster_nonblocking(vma->vm_file, 1964 start, filesize) < 0) 1965 break; 1966 start += CLUSTER_PAGES; 1967 } 1968 run_task_queue(&tq_disk); 1969 1970 /* if we're far enough past the beginning of this area, 1971 recycle pages that are in the previous window. */ 1972 if (vma->vm_raend > (vma->vm_pgoff + ra_window + ra_window)) { 1973 unsigned long window = ra_window << PAGE_SHIFT; 1974 1975 end = vma->vm_start + (vma->vm_raend << PAGE_SHIFT); 1976 end -= window + window; 1977 filemap_sync(vma, end - window, window, MS_INVALIDATE); 1978 } 1979 1980 vma->vm_raend += ra_window; 1981 } 1982 1983 return; 1984 }
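User space requests the sequential hint that enables this path with madvise(). The sketch below is illustrative only; the file name is arbitrary and a 4KiB page size is assumed.

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/etc/services", O_RDONLY);
        struct stat st;

        if (fd < 0 || fstat(fd, &st) < 0 || st.st_size == 0)
                return 1;

        char *p = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
        if (p == MAP_FAILED)
                return 1;

        /* MADV_SEQUENTIAL sets VM_SEQ_READ on the VMA so faults through
         * filemap_nopage() call nopage_sequential_readahead() */
        madvise(p, st.st_size, MADV_SEQUENTIAL);

        long sum = 0;
        for (off_t i = 0; i < st.st_size; i += 4096)
                sum += p[i];            /* touch each page in order */

        printf("touched %ld bytes, checksum %ld\n", (long)st.st_size, sum);
        munmap(p, st.st_size);
        close(fd);
        return 0;
}

read_cluster_nonblocking(), shown next, submits the cluster of reads: it rounds the offset down to a cluster boundary and calls page_cache_read() for up to CLUSTER_PAGES pages that lie within the file.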
737 static int read_cluster_nonblocking(struct file * file, unsigned long offset, 738 unsigned long filesize) 739 { 740 unsigned long pages = CLUSTER_PAGES; 741 742 offset = CLUSTER_OFFSET(offset); 743 while ((pages-- > 0) && (offset < filesize)) { 744 int error = page_cache_read(file, offset); 745 if (error < 0) 746 return error; 747 offset ++; 748 } 749 750 return 0; 751 }
This function will fault in a number of pages after the current entry. It will stop when either CLUSTER_PAGES have been swapped in or an unused swap entry is found.
1093 void swapin_readahead(swp_entry_t entry) 1094 { 1095 int i, num; 1096 struct page *new_page; 1097 unsigned long offset; 1098 1099 /* 1100 * Get the number of handles we should do readahead io to. 1101 */ 1102 num = valid_swaphandles(entry, &offset); 1103 for (i = 0; i < num; offset++, i++) { 1104 /* Ok, do the async read-ahead now */ 1105 new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset)); 1106 if (!new_page) 1107 break; 1108 page_cache_release(new_page); 1109 } 1110 return; 1111 }
This function determines how many pages should be read ahead from swap, starting from offset. It will read ahead up to the next unused swap slot but, at most, it will return CLUSTER_PAGES (1 << page_cluster).
1238 int valid_swaphandles(swp_entry_t entry, unsigned long *offset) 1239 { 1240 int ret = 0, i = 1 << page_cluster; 1241 unsigned long toff; 1242 struct swap_info_struct *swapdev = SWP_TYPE(entry) + swap_info; 1243 1244 if (!page_cluster) /* no readahead */ 1245 return 0; 1246 toff = (SWP_OFFSET(entry) >> page_cluster) << page_cluster; 1247 if (!toff) /* first page is swap header */ 1248 toff++, i--; 1249 *offset = toff; 1250 1251 swap_device_lock(swapdev); 1252 do { 1253 /* Don't read-ahead past the end of the swap area */ 1254 if (toff >= swapdev->max) 1255 break; 1256 /* Don't read in free or bad pages */ 1257 if (!swapdev->swap_map[toff]) 1258 break; 1259 if (swapdev->swap_map[toff] == SWAP_MAP_BAD) 1260 break; 1261 toff++; 1262 ret++; 1263 } while (--i); 1264 swap_device_unlock(swapdev); 1265 return ret; 1266 }
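As a worked example, assume page_cluster is 4, so i starts at 16. If the faulting entry has swap offset 35, toff is rounded down to (35 >> 4) << 4 = 32 and the loop walks offsets 32 to 47, stopping early at the end of the swap area, at the first free slot or at the first slot marked SWAP_MAP_BAD. The number of valid slots found is returned and swapin_readahead() reads in that many pages starting from offset 32.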