Source: arch/i386/kernel/setup.c
The call graph for this function is shown in Figure 2.3. This function gathers the information needed by the boot memory allocator to initialise itself. It is broken up into a number of distinct tasks.
 991 static unsigned long __init setup_memory(void)
 992 {
 993         unsigned long bootmap_size, start_pfn, max_low_pfn;
 994 
 995         /*
 996          * partially used pages are not usable - thus
 997          * we are rounding upwards:
 998          */
 999         start_pfn = PFN_UP(__pa(&_end));
1000 
1001         find_max_pfn();
1002 
1003         max_low_pfn = find_max_low_pfn();
1004 
1005 #ifdef CONFIG_HIGHMEM
1006         highstart_pfn = highend_pfn = max_pfn;
1007         if (max_pfn > max_low_pfn) {
1008                 highstart_pfn = max_low_pfn;
1009         }
1010         printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
1011                 pages_to_mb(highend_pfn - highstart_pfn));
1012 #endif
1013         printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
1014                 pages_to_mb(max_low_pfn));
1018         bootmap_size = init_bootmem(start_pfn, max_low_pfn);
1019 
1020         register_bootmem_low_pages(max_low_pfn);
1021 
1028         reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(start_pfn) +
1029                 bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY));
1030 
1035         reserve_bootmem(0, PAGE_SIZE);
1036 
1037 #ifdef CONFIG_SMP
1043         reserve_bootmem(PAGE_SIZE, PAGE_SIZE);
1044 #endif
1045 #ifdef CONFIG_ACPI_SLEEP
1046         /*
1047          * Reserve low memory region for sleep support.
1048          */
1049         acpi_reserve_bootmem();
1050 #endif
1051 #ifdef CONFIG_X86_LOCAL_APIC
1052         /*
1053          * Find and reserve possible boot-time SMP configuration:
1054          */
1055         find_smp_config();
1056 #endif
1057 #ifdef CONFIG_BLK_DEV_INITRD
1058         if (LOADER_TYPE && INITRD_START) {
1059                 if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
1060                         reserve_bootmem(INITRD_START, INITRD_SIZE);
1061                         initrd_start =
1062                                 INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
1063                         initrd_end = initrd_start+INITRD_SIZE;
1064                 }
1065                 else {
1066                         printk(KERN_ERR "initrd extends beyond end of memory "
1067                                 "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
1068                                 INITRD_START + INITRD_SIZE,
1069                                 max_low_pfn << PAGE_SHIFT);
1070                         initrd_start = 0;
1071                 }
1072         }
1073 #endif
1074 
1075         return max_low_pfn;
1076 }
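To make the page frame arithmetic concrete, such as the rounding at line 999 and the PFN_PHYS() conversion at line 1028, the following is a minimal standalone sketch of PFN_UP()-style macros. The macro bodies and the physical address used are assumptions modelled on the usual 2.4 i386 definitions, not copied from the kernel source.

/*
 * Standalone sketch of the page frame number (PFN) arithmetic used by
 * setup_memory(). The macro definitions are assumed reimplementations.
 */
#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)

/* Round an address up to the next page frame number */
#define PFN_UP(x)       (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
/* Convert a PFN back to a physical address */
#define PFN_PHYS(x)     ((x) << PAGE_SHIFT)

int main(void)
{
        /* Hypothetical physical address of the end of the kernel image */
        unsigned long pa_end = 0x2f5123;

        /* A partially used page is not usable, so round upwards */
        unsigned long start_pfn = PFN_UP(pa_end);

        printf("kernel ends at 0x%lx, first free PFN is %lu (0x%lx)\n",
               pa_end, start_pfn, PFN_PHYS(start_pfn));
        return 0;
}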
This is the top-level function which is used to initialise each of the zones. The size of the zones in PFNs was discovered during setup_memory() (See Section B.1.1). This function populates an array of zone sizes for passing to free_area_init().
323 static void __init zone_sizes_init(void)
324 {
325         unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
326         unsigned int max_dma, high, low;
327 
328         max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
329         low = max_low_pfn;
330         high = highend_pfn;
331 
332         if (low < max_dma)
333                 zones_size[ZONE_DMA] = low;
334         else {
335                 zones_size[ZONE_DMA] = max_dma;
336                 zones_size[ZONE_NORMAL] = low - max_dma;
337 #ifdef CONFIG_HIGHMEM
338                 zones_size[ZONE_HIGHMEM] = high - low;
339 #endif
340         }
341         free_area_init(zones_size);
342 }
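As a rough worked example of this split, the standalone sketch below divides a hypothetical memory layout into the three zone sizes. It assumes the conventional i386 values of a 16MiB DMA limit, 896MiB of low memory on a 2GiB machine and 4KiB pages; it mirrors the logic of zone_sizes_init() but is not kernel code.

/*
 * Standalone illustration of how zone_sizes_init() splits memory into
 * zones. The constants are assumptions, not values read from a kernel.
 */
#include <stdio.h>

#define PAGE_SHIFT      12
#define ZONE_DMA        0
#define ZONE_NORMAL     1
#define ZONE_HIGHMEM    2
#define MAX_NR_ZONES    3

int main(void)
{
        unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
        unsigned long max_dma = (16UL << 20) >> PAGE_SHIFT;   /* 4096 PFNs  */
        unsigned long low     = (896UL << 20) >> PAGE_SHIFT;  /* low memory */
        unsigned long high    = (2048UL << 20) >> PAGE_SHIFT; /* end of RAM */

        if (low < max_dma)
                zones_size[ZONE_DMA] = low;
        else {
                zones_size[ZONE_DMA] = max_dma;
                zones_size[ZONE_NORMAL] = low - max_dma;
                zones_size[ZONE_HIGHMEM] = high - low;
        }

        printf("DMA %lu, NORMAL %lu, HIGHMEM %lu pages\n",
               zones_size[ZONE_DMA], zones_size[ZONE_NORMAL],
               zones_size[ZONE_HIGHMEM]);
        return 0;
}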
This is the architecture-independent function for setting up a UMA architecture. It simply calls the core function passing the static contig_page_data as the node. NUMA architectures will use free_area_init_node() instead.
838 void __init free_area_init(unsigned long *zones_size)
839 {
840         free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 0, 0, 0);
841 }
There are two versions of this function. The first is almost identical to free_area_init(), except that it takes a different starting physical address. It exists for architectures that have only one node (so they use contig_page_data) but whose physical memory does not start at address 0.
This version of the function, called after the pagetable initialisation, is for initialising each pgdat in the system. The caller has the option of allocating its own local portion of the mem_map and passing it in as a parameter if it wants to optimise its location for the architecture. If it chooses not to, the map will be allocated later by free_area_init_core().
61 void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap,
62         unsigned long *zones_size, unsigned long zone_start_paddr,
63         unsigned long *zholes_size)
64 {
65         int i, size = 0;
66         struct page *discard;
67 
68         if (mem_map == (mem_map_t *)NULL)
69                 mem_map = (mem_map_t *)PAGE_OFFSET;
70 
71         free_area_init_core(nid, pgdat, &discard, zones_size, zone_start_paddr,
72                 zholes_size, pmap);
73         pgdat->node_id = nid;
74 
75         /*
76          * Get space for the valid bitmap.
77          */
78         for (i = 0; i < MAX_NR_ZONES; i++)
79                 size += zones_size[i];
80         size = LONG_ALIGN((size + 7) >> 3);
81         pgdat->valid_addr_bitmap = (unsigned long *)alloc_bootmem_node(pgdat, size);
82         memset(pgdat->valid_addr_bitmap, 0, size);
83 }
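The bitmap sizing at line 80 reserves one bit per page in the node, rounded up to whole bytes and then to a long boundary. The sketch below reproduces that arithmetic for a hypothetical node; LONG_ALIGN() here is an assumed reimplementation rather than the kernel's definition.

/*
 * Sketch of the valid_addr_bitmap size calculation: one bit per page,
 * rounded up to bytes, then aligned to a long.
 */
#include <stdio.h>

#define LONG_ALIGN(x)   (((x) + sizeof(long) - 1) & ~(sizeof(long) - 1))

int main(void)
{
        /* Hypothetical node with 128MiB of 4KiB pages */
        unsigned long pages = (128UL << 20) >> 12;      /* 32768 pages */
        unsigned long size  = LONG_ALIGN((pages + 7) >> 3);

        printf("%lu pages need a %lu byte valid bitmap\n", pages, size);
        return 0;
}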
This function is responsible for initialising all zones and allocating their local lmem_map within a node. In UMA architectures, this function is called in a way that will initialise the global mem_map array. In NUMA architectures, the array is treated as a virtual array that is sparsely populated.
684 void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
685         unsigned long *zones_size, unsigned long zone_start_paddr,
686         unsigned long *zholes_size, struct page *lmem_map)
687 {
688         unsigned long i, j;
689         unsigned long map_size;
690         unsigned long totalpages, offset, realtotalpages;
691         const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
692 
693         if (zone_start_paddr & ~PAGE_MASK)
694                 BUG();
695 
696         totalpages = 0;
697         for (i = 0; i < MAX_NR_ZONES; i++) {
698                 unsigned long size = zones_size[i];
699                 totalpages += size;
700         }
701         realtotalpages = totalpages;
702         if (zholes_size)
703                 for (i = 0; i < MAX_NR_ZONES; i++)
704                         realtotalpages -= zholes_size[i];
705 
706         printk("On node %d totalpages: %lu\n", nid, realtotalpages);
This block is mainly responsible for calculating the total number of pages in the node by summing the sizes of each zone and subtracting the sizes of any holes.
708         /*
709          * Some architectures (with lots of mem and discontinous memory
710          * maps) have to search for a good mem_map area:
711          * For discontigmem, the conceptual mem map array starts from
712          * PAGE_OFFSET, we need to align the actual array onto a mem map
713          * boundary, so that MAP_NR works.
714          */
715         map_size = (totalpages + 1)*sizeof(struct page);
716         if (lmem_map == (struct page *)0) {
717                 lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size);
718                 lmem_map = (struct page *)(PAGE_OFFSET +
719                         MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET));
720         }
721         *gmap = pgdat->node_mem_map = lmem_map;
722         pgdat->node_size = totalpages;
723         pgdat->node_start_paddr = zone_start_paddr;
724         pgdat->node_start_mapnr = (lmem_map - mem_map);
725         pgdat->nr_zones = 0;
726 
727         offset = lmem_map - mem_map;
This block allocates the local lmem_map if necessary and sets gmap. In UMA architectures, gmap is actually mem_map, so this is where the memory for it is allocated.
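The comment at line 712 says the array must be aligned onto a "mem map boundary" so that indexing by page frame number works. The sketch below shows the kind of rounding MAP_ALIGN() is presumed to perform, aligning the offset of the freshly allocated map to a multiple of sizeof(struct page); the macro body and struct layout here are assumptions for illustration, not the kernel's definitions.

/*
 * Illustration of aligning a mem_map offset onto a struct page boundary.
 * MAP_ALIGN() below is an assumed round-up; the real macro may differ.
 */
#include <stdio.h>

struct page_stub { unsigned long flags; void *lru[2]; };   /* stand-in */

#define MAP_ALIGN(x, sz) (((x) + (sz) - 1) / (sz) * (sz))

int main(void)
{
        unsigned long page_offset = 0xc0000000UL;  /* PAGE_OFFSET on i386 */
        unsigned long lmem_map    = 0xc1000013UL;  /* hypothetical alloc  */

        unsigned long aligned = page_offset +
                MAP_ALIGN(lmem_map - page_offset, sizeof(struct page_stub));

        printf("lmem_map 0x%lx aligned to 0x%lx\n", lmem_map, aligned);
        return 0;
}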
728         for (j = 0; j < MAX_NR_ZONES; j++) {
729                 zone_t *zone = pgdat->node_zones + j;
730                 unsigned long mask;
731                 unsigned long size, realsize;
732 
733                 zone_table[nid * MAX_NR_ZONES + j] = zone;
734                 realsize = size = zones_size[j];
735                 if (zholes_size)
736                         realsize -= zholes_size[j];
737 
738                 printk("zone(%lu): %lu pages.\n", j, size);
739                 zone->size = size;
740                 zone->name = zone_names[j];
741                 zone->lock = SPIN_LOCK_UNLOCKED;
742                 zone->zone_pgdat = pgdat;
743                 zone->free_pages = 0;
744                 zone->need_balance = 0;
745                 if (!size)
746                         continue;
This block starts a loop which initialises every zone_t within the node. The initialisation starts by setting the simpler fields for which values already exist.
752                 zone->wait_table_size = wait_table_size(size);
753                 zone->wait_table_shift =
754                         BITS_PER_LONG - wait_table_bits(zone->wait_table_size);
755                 zone->wait_table = (wait_queue_head_t *)
756                         alloc_bootmem_node(pgdat, zone->wait_table_size
757                                                 * sizeof(wait_queue_head_t));
758 
759                 for(i = 0; i < zone->wait_table_size; ++i)
760                         init_waitqueue_head(zone->wait_table + i);
Initialise the wait table for this zone. Processes waiting on pages in the zone hash into this table to select a queue to wait on. This means that not all processes waiting on pages in the zone have to be woken when a page is unlocked, only the smaller subset sharing the same hash queue.
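The sketch below illustrates the general idea of hashing a page into a small power-of-two table of wait queues. The table size and hash function are stand-ins for illustration only; the kernel sizes the table with wait_table_size() and performs its own hash inside page_waitqueue().

/*
 * Standalone illustration of hashing pages into a per-zone wait table.
 */
#include <stdio.h>

#define WAIT_TABLE_SIZE 16          /* must be a power of two */

/* Crude multiplicative hash on the page's address (illustrative only) */
static unsigned int hash_page(const void *page)
{
        unsigned long h = (unsigned long)page;
        h ^= h >> 7;
        h *= 2654435761UL;          /* Knuth's multiplicative constant */
        return (h >> 16) & (WAIT_TABLE_SIZE - 1);
}

int main(void)
{
        int pages[4];               /* stand-ins for struct page entries */
        for (int i = 0; i < 4; i++)
                printf("page %d waits on queue %u\n", i, hash_page(&pages[i]));
        return 0;
}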
762                 pgdat->nr_zones = j+1;
763 
764                 mask = (realsize / zone_balance_ratio[j]);
765                 if (mask < zone_balance_min[j])
766                         mask = zone_balance_min[j];
767                 else if (mask > zone_balance_max[j])
768                         mask = zone_balance_max[j];
769                 zone->pages_min = mask;
770                 zone->pages_low = mask*2;
771                 zone->pages_high = mask*3;
772 
773                 zone->zone_mem_map = mem_map + offset;
774                 zone->zone_start_mapnr = offset;
775                 zone->zone_start_paddr = zone_start_paddr;
776 
777                 if ((zone_start_paddr >> PAGE_SHIFT) & (zone_required_alignment-1))
778                         printk("BUG: wrong zone alignment, it will crash\n");
779 
Calculate the watermarks for the zone and record the location of the zone. The watermarks are calculated as ratios of the zone size.
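As a worked example of the watermark calculation, the sketch below applies the same bounds to a hypothetical 500MiB zone. It assumes the 2.4 defaults of zone_balance_ratio = 128, zone_balance_min = 20 and zone_balance_max = 255; treat those defaults as assumptions of this illustration.

/*
 * Worked example of the pages_min/pages_low/pages_high calculation.
 */
#include <stdio.h>

int main(void)
{
        unsigned long realsize = (500UL << 20) >> 12;   /* 128000 pages */
        unsigned long ratio = 128, min = 20, max = 255;

        unsigned long mask = realsize / ratio;          /* 1000 */
        if (mask < min)
                mask = min;
        else if (mask > max)
                mask = max;                             /* clamped to 255 */

        printf("pages_min=%lu pages_low=%lu pages_high=%lu\n",
               mask, mask * 2, mask * 3);
        return 0;
}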
780                 /*
781                  * Initially all pages are reserved - free ones are freed
782                  * up by free_all_bootmem() once the early boot process is
783                  * done. Non-atomic initialization, single-pass.
784                  */
785                 for (i = 0; i < size; i++) {
786                         struct page *page = mem_map + offset + i;
787                         set_page_zone(page, nid * MAX_NR_ZONES + j);
788                         set_page_count(page, 0);
789                         SetPageReserved(page);
790                         INIT_LIST_HEAD(&page->list);
791                         if (j != ZONE_HIGHMEM)
792                                 set_page_address(page, __va(zone_start_paddr));
793                         zone_start_paddr += PAGE_SIZE;
794                 }
795 
796                 offset += size;
797                 for (i = 0; ; i++) {
798                         unsigned long bitmap_size;
799 
800                         INIT_LIST_HEAD(&zone->free_area[i].free_list);
801                         if (i == MAX_ORDER-1) {
802                                 zone->free_area[i].map = NULL;
803                                 break;
804                         }
805 
829                         bitmap_size = (size-1) >> (i+4);
830                         bitmap_size = LONG_ALIGN(bitmap_size+1);
831                         zone->free_area[i].map =
832                                 (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
833                 }
834         }
835         build_zonelists(pgdat);
836 }
This block initialises the free lists for the zone and allocates the bitmap used by the buddy allocator to record the state of page buddies.
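The bitmap size at lines 829-830 follows from the fact that one bit covers a pair of buddies: a zone of size pages has roughly size >> (i+1) buddy pairs at order i, which is size >> (i+4) bytes. The sketch below reproduces that calculation for a hypothetical zone; MAX_ORDER and LONG_ALIGN() mirror their usual 2.4 values and definitions but are assumptions here.

/*
 * Sketch of the buddy bitmap sizing: one bit per pair of buddies at each
 * order, converted to bytes and long-aligned.
 */
#include <stdio.h>

#define MAX_ORDER       10
#define LONG_ALIGN(x)   (((x) + sizeof(long) - 1) & ~(sizeof(long) - 1))

int main(void)
{
        unsigned long size = (896UL << 20) >> 12;       /* pages in zone */

        for (int i = 0; i < MAX_ORDER - 1; i++) {
                unsigned long bitmap_size = (size - 1) >> (i + 4);
                bitmap_size = LONG_ALIGN(bitmap_size + 1);
                printf("order %d: %lu byte bitmap\n", i, bitmap_size);
        }
        return 0;
}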
This builds the list of fallback zones for each zone in the requested node. This is for when an allocation cannot be satisfied and another zone is consulted. When this is finished, allocations from ZONE_HIGHMEM will fall back to ZONE_NORMAL. Allocations from ZONE_NORMAL will fall back to ZONE_DMA, which in turn has nothing to fall back on.
589 static inline void build_zonelists(pg_data_t *pgdat)
590 {
591         int i, j, k;
592 
593         for (i = 0; i <= GFP_ZONEMASK; i++) {
594                 zonelist_t *zonelist;
595                 zone_t *zone;
596 
597                 zonelist = pgdat->node_zonelists + i;
598                 memset(zonelist, 0, sizeof(*zonelist));
599 
600                 j = 0;
601                 k = ZONE_NORMAL;
602                 if (i & __GFP_HIGHMEM)
603                         k = ZONE_HIGHMEM;
604                 if (i & __GFP_DMA)
605                         k = ZONE_DMA;
606 
607                 switch (k) {
608                         default:
609                                 BUG();
610                         /*
611                          * fallthrough:
612                          */
613                         case ZONE_HIGHMEM:
614                                 zone = pgdat->node_zones + ZONE_HIGHMEM;
615                                 if (zone->size) {
616 #ifndef CONFIG_HIGHMEM
617                                         BUG();
618 #endif
619                                         zonelist->zones[j++] = zone;
620                                 }
621                         case ZONE_NORMAL:
622                                 zone = pgdat->node_zones + ZONE_NORMAL;
623                                 if (zone->size)
624                                         zonelist->zones[j++] = zone;
625                         case ZONE_DMA:
626                                 zone = pgdat->node_zones + ZONE_DMA;
627                                 if (zone->size)
628                                         zonelist->zones[j++] = zone;
629                 }
630                 zonelist->zones[j++] = NULL;
631         }
632 }
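To show how the resulting fallback lists are consumed, the sketch below builds a simplified zonelist and walks it the way the allocator would, trying each zone in turn until one can satisfy the request or the NULL terminator is reached. The types and the sufficiency check are simplified stand-ins for the kernel structures, not the real allocation path.

/*
 * Simplified stand-in showing how a zonelist built by build_zonelists()
 * is walked: zones are tried in order until one has enough free pages or
 * the NULL terminator ends the list.
 */
#include <stdio.h>
#include <stddef.h>

struct zone_stub {
        const char *name;
        unsigned long free_pages;
};

int main(void)
{
        struct zone_stub highmem = { "HighMem", 0 };     /* exhausted */
        struct zone_stub normal  = { "Normal",  32 };    /* has pages */
        struct zone_stub dma     = { "DMA",     128 };

        /* Fallback order a __GFP_HIGHMEM request would see */
        struct zone_stub *zonelist[] = { &highmem, &normal, &dma, NULL };

        unsigned long want = 8;
        for (struct zone_stub **z = zonelist; *z != NULL; z++) {
                if ((*z)->free_pages >= want) {
                        printf("allocated %lu pages from %s\n",
                               want, (*z)->name);
                        return 0;
                }
                printf("%s exhausted, falling back\n", (*z)->name);
        }
        printf("no zone could satisfy the request\n");
        return 0;
}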
B.2 Page Operations
  B.2.1 Locking Pages
    B.2.1.1 Function: lock_page()
    B.2.1.2 Function: __lock_page()
    B.2.1.3 Function: sync_page()
  B.2.2 Unlocking Pages
    B.2.2.1 Function: unlock_page()
  B.2.3 Waiting on Pages
    B.2.3.1 Function: wait_on_page()
    B.2.3.2 Function: ___wait_on_page()
This function tries to lock a page. If the page cannot be locked, it will cause the process to sleep until the page is available.
921 void lock_page(struct page *page)
922 {
923         if (TryLockPage(page))
924                 __lock_page(page);
925 }
This is called after TryLockPage() has failed. It will locate the waitqueue for this page and sleep on it until the lock can be acquired.
897 static void __lock_page(struct page *page)
898 {
899         wait_queue_head_t *waitqueue = page_waitqueue(page);
900         struct task_struct *tsk = current;
901         DECLARE_WAITQUEUE(wait, tsk);
902 
903         add_wait_queue_exclusive(waitqueue, &wait);
904         for (;;) {
905                 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
906                 if (PageLocked(page)) {
907                         sync_page(page);
908                         schedule();
909                 }
910                 if (!TryLockPage(page))
911                         break;
912         }
913         __set_task_state(tsk, TASK_RUNNING);
914         remove_wait_queue(waitqueue, &wait);
915 }
This calls the filesystem-specific sync_page() operation to synchronise the page with its backing storage.
140 static inline int sync_page(struct page *page)
141 {
142         struct address_space *mapping = page->mapping;
143 
144         if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
145                 return mapping->a_ops->sync_page(page);
146         return 0;
147 }
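The function relies on the common "optional operation in an ops table" pattern: the hook is only called if the mapping provides one. The standalone sketch below demonstrates that pattern with simplified stand-in types and names; it is an illustration, not the kernel's address_space definition.

/*
 * Illustration of calling an optional hook through an operations table,
 * as sync_page() does with a_ops->sync_page.
 */
#include <stdio.h>

struct page_stub;

struct address_space_ops_stub {
        int (*sync_page)(struct page_stub *page);   /* may be NULL */
};

struct address_space_stub {
        const struct address_space_ops_stub *a_ops;
};

struct page_stub {
        struct address_space_stub *mapping;         /* NULL for anon pages */
};

static int my_fs_sync_page(struct page_stub *page)
{
        (void)page;
        printf("filesystem-specific sync ran\n");
        return 0;
}

static int sync_page_like(struct page_stub *page)
{
        struct address_space_stub *mapping = page->mapping;

        if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
                return mapping->a_ops->sync_page(page);
        return 0;               /* nothing to do */
}

int main(void)
{
        const struct address_space_ops_stub ops = { my_fs_sync_page };
        struct address_space_stub mapping = { &ops };
        struct page_stub page = { &mapping };
        struct page_stub anon_page = { NULL };

        sync_page_like(&page);       /* calls the hook   */
        sync_page_like(&anon_page);  /* silently returns */
        return 0;
}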
This function unlocks a page and wakes up any processes that may be waiting on it.
874 void unlock_page(struct page *page)
875 {
876         wait_queue_head_t *waitqueue = page_waitqueue(page);
877         ClearPageLaunder(page);
878         smp_mb__before_clear_bit();
879         if (!test_and_clear_bit(PG_locked, &(page)->flags))
880                 BUG();
881         smp_mb__after_clear_bit();
882 
883         /*
884          * Although the default semantics of wake_up() are
885          * to wake all, here the specific function is used
886          * to make it even more explicit that a number of
887          * pages are being waited on here.
888          */
889         if (waitqueue_active(waitqueue))
890                 wake_up_all(waitqueue);
891 }
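The lock/unlock pair follows a familiar pattern: try to set a bit, sleep on a queue if it is already set, and wake the waiters when the bit is cleared so they can retry. The userspace sketch below is only an analogue of that pattern built on pthreads; it is not kernel code and none of the names correspond to kernel symbols.

/*
 * Userspace analogue of the lock_page()/unlock_page() pattern: a bit is
 * set with a try-lock, waiters sleep on a queue, and the unlocker wakes
 * everyone so they can retry (compare wake_up_all() above).
 */
#include <pthread.h>
#include <stdio.h>

static int page_locked;                         /* stands in for PG_locked */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t waitqueue = PTHREAD_COND_INITIALIZER;

static void lock_page_like(void)
{
        pthread_mutex_lock(&lock);
        while (page_locked)                     /* retry after every wake-up */
                pthread_cond_wait(&waitqueue, &lock);
        page_locked = 1;                        /* acquired the "page lock" */
        pthread_mutex_unlock(&lock);
}

static void unlock_page_like(void)
{
        pthread_mutex_lock(&lock);
        page_locked = 0;
        pthread_cond_broadcast(&waitqueue);     /* like wake_up_all() */
        pthread_mutex_unlock(&lock);
}

static void *worker(void *arg)
{
        lock_page_like();
        printf("thread %ld holds the page lock\n", (long)arg);
        unlock_page_like();
        return NULL;
}

int main(void)
{
        pthread_t t[3];
        for (long i = 0; i < 3; i++)
                pthread_create(&t[i], NULL, worker, (void *)i);
        for (int i = 0; i < 3; i++)
                pthread_join(t[i], NULL);
        return 0;
}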
Source: include/linux/pagemap.h

This trivial function checks whether the page is locked. If it is, ___wait_on_page() is called to sleep until the page is unlocked.
94 static inline void wait_on_page(struct page * page)
95 {
96         if (PageLocked(page))
97                 ___wait_on_page(page);
98 }
This function is called after PageLocked() has been used to determine the page is locked. The calling process will probably sleep until the page is unlocked.
849 void ___wait_on_page(struct page *page)
850 {
851         wait_queue_head_t *waitqueue = page_waitqueue(page);
852         struct task_struct *tsk = current;
853         DECLARE_WAITQUEUE(wait, tsk);
854 
855         add_wait_queue(waitqueue, &wait);
856         do {
857                 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
858                 if (!PageLocked(page))
859                         break;
860                 sync_page(page);
861                 schedule();
862         } while (PageLocked(page));
863         __set_task_state(tsk, TASK_RUNNING);
864         remove_wait_queue(waitqueue, &wait);
865 }