The call graph for this function is shown in Figure 11.2. This is the high-level API function for searching the swap areas for a free swap slot and returning the resulting swp_entry_t.
99 swp_entry_t get_swap_page(void) 100 { 101 struct swap_info_struct * p; 102 unsigned long offset; 103 swp_entry_t entry; 104 int type, wrapped = 0; 105 106 entry.val = 0; /* Out of memory */ 107 swap_list_lock(); 108 type = swap_list.next; 109 if (type < 0) 110 goto out; 111 if (nr_swap_pages <= 0) 112 goto out; 113 114 while (1) { 115 p = &swap_info[type]; 116 if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) { 117 swap_device_lock(p); 118 offset = scan_swap_map(p); 119 swap_device_unlock(p); 120 if (offset) { 121 entry = SWP_ENTRY(type,offset); 122 type = swap_info[type].next; 123 if (type < 0 || 124 p->prio != swap_info[type].prio) { 125 swap_list.next = swap_list.head; 126 } else { 127 swap_list.next = type; 128 } 129 goto out; 130 } 131 } 132 type = p->next; 133 if (!wrapped) { 134 if (type < 0 || p->prio != swap_info[type].prio) { 135 type = swap_list.head; 136 wrapped = 1; 137 } 138 } else 139 if (type < 0) 140 goto out; /* out of swap space */ 141 } 142 out: 143 swap_list_unlock(); 144 return entry; 145 }
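A swp_entry_t is just an unsigned long with the swap area index (the type) and the slot offset packed into it by SWP_ENTRY(), and unpacked again by SWP_TYPE() and SWP_OFFSET(). The exact bit layout is architecture specific and lives in asm/pgtable.h; the sketch below uses assumed shift values purely for illustration and is not the definitive encoding.

/* Illustrative sketch of how a swap entry packs a (type, offset) pair.
 * The real macros are per-architecture; the shifts and the type mask
 * below are assumptions made only for this example. */
#include <stdio.h>

typedef struct { unsigned long val; } swp_entry_t;

#define SWP_TYPE_SHIFT   1    /* assumed position of the swap area index */
#define SWP_OFFSET_SHIFT 8    /* assumed position of the slot offset */

static swp_entry_t swp_entry(unsigned long type, unsigned long offset)
{
        swp_entry_t entry;

        entry.val = (type << SWP_TYPE_SHIFT) | (offset << SWP_OFFSET_SHIFT);
        return entry;
}

static unsigned long swp_type(swp_entry_t entry)
{
        return (entry.val >> SWP_TYPE_SHIFT) & 0x3f;
}

static unsigned long swp_offset(swp_entry_t entry)
{
        return entry.val >> SWP_OFFSET_SHIFT;
}

int main(void)
{
        swp_entry_t entry = swp_entry(1, 4000);   /* area 1, slot 4000 */

        printf("val=%#lx type=%lu offset=%lu\n",
               entry.val, swp_type(entry), swp_offset(entry));
        return 0;
}

get_swap_page() above uses an entry with val 0 to mean that no slot could be allocated; this works because slot 0 of every area holds the header page and is always marked SWAP_MAP_BAD.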
This function, scan_swap_map(), tries to allocate SWAPFILE_CLUSTER pages sequentially in swap. When it has allocated that many, it searches for another block of free slots of size SWAPFILE_CLUSTER. If it fails to find one, it resorts to allocating the first free slot. This clustering attempts to make sure that slots are allocated and freed in SWAPFILE_CLUSTER-sized chunks.
36 static inline int scan_swap_map(struct swap_info_struct *si) 37 { 38 unsigned long offset; 47 if (si->cluster_nr) { 48 while (si->cluster_next <= si->highest_bit) { 49 offset = si->cluster_next++; 50 if (si->swap_map[offset]) 51 continue; 52 si->cluster_nr--; 53 goto got_page; 54 } 55 }
Allocate SWAPFILE_CLUSTER pages sequentially. cluster_nr is initialised to SWAPFILE_CLUSTER and is decremented with each sequential allocation.
56 si->cluster_nr = SWAPFILE_CLUSTER; 57 58 /* try to find an empty (even not aligned) cluster. */ 59 offset = si->lowest_bit; 60 check_next_cluster: 61 if (offset+SWAPFILE_CLUSTER-1 <= si->highest_bit) 62 { 63 int nr; 64 for (nr = offset; nr < offset+SWAPFILE_CLUSTER; nr++) 65 if (si->swap_map[nr]) 66 { 67 offset = nr+1; 68 goto check_next_cluster; 69 } 70 /* We found a completly empty cluster, so start 71 * using it. 72 */ 73 goto got_page; 74 }
At this stage, SWAPFILE_CLUSTER pages have been allocated sequentially, so search for the next completely free block of SWAPFILE_CLUSTER pages.
75 /* No luck, so now go finegrined as usual. -Andrea */ 76 for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) { 77 if (si->swap_map[offset]) 78 continue; 79 si->lowest_bit = offset+1;
This unusual for loop extract scans for the first free slot, starting from lowest_bit. It is unusual because the got_page block below sits inside the loop body and is also jumped to directly by the cluster-allocation paths above.
80 got_page: 81 if (offset == si->lowest_bit) 82 si->lowest_bit++; 83 if (offset == si->highest_bit) 84 si->highest_bit--; 85 if (si->lowest_bit > si->highest_bit) { 86 si->lowest_bit = si->max; 87 si->highest_bit = 0; 88 } 89 si->swap_map[offset] = 1; 90 nr_swap_pages--; 91 si->cluster_next = offset+1; 92 return offset; 93 } 94 si->lowest_bit = si->max; 95 si->highest_bit = 0; 96 return 0; 97 }
A slot has been found, so do some housekeeping and return it.
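Stripped of the locking and swap_info_struct bookkeeping, the clustering policy is easier to see in isolation. The following is a user-space sketch of the same algorithm under stated assumptions; scan_map(), CLUSTER, NSLOTS and the map[] array are hypothetical stand-ins for scan_swap_map(), SWAPFILE_CLUSTER, the size of the area and swap_map respectively.

/* Hypothetical user-space sketch of the clustering policy in
 * scan_swap_map(): keep handing out slots sequentially until CLUSTER
 * of them have been allocated, then look for a completely free run of
 * CLUSTER slots, and only fall back to the first free slot anywhere
 * if no such run exists. */
#include <stdio.h>

#define CLUSTER 8       /* stand-in for SWAPFILE_CLUSTER */
#define NSLOTS  1024    /* stand-in for the size of the swap area */

static unsigned char map[NSLOTS];   /* 0 == free, non-zero == in use */
static int cluster_next;            /* next slot of the current run */
static int cluster_nr;              /* slots left in the current run */

static int scan_map(void)
{
        int offset, nr;

        /* Continue the current sequential run if one is in progress */
        if (cluster_nr) {
                while (cluster_next < NSLOTS) {
                        offset = cluster_next++;
                        if (map[offset])
                                continue;
                        cluster_nr--;
                        goto got_slot;
                }
        }
        cluster_nr = CLUSTER;

        /* Look for a completely empty (not necessarily aligned) run */
        for (offset = 0; offset + CLUSTER <= NSLOTS; offset++) {
                for (nr = offset; nr < offset + CLUSTER; nr++)
                        if (map[nr])
                                break;
                if (nr == offset + CLUSTER)
                        goto got_slot;          /* empty run starts here */
        }

        /* No luck, fall back to the first free slot anywhere */
        for (offset = 0; offset < NSLOTS; offset++)
                if (!map[offset])
                        goto got_slot;

        return -1;      /* the whole map is in use */

got_slot:
        map[offset] = 1;
        cluster_next = offset + 1;
        return offset;
}

int main(void)
{
        int i;

        /* Slots come out sequentially, in runs of CLUSTER */
        for (i = 0; i < 2 * CLUSTER; i++)
                printf("allocated slot %d\n", scan_map());
        return 0;
}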
The call graph for this function, add_to_swap_cache(), is shown in Figure 11.3. It wraps around the normal page cache handler. It first verifies the entry with swap_duplicate() and, if the entry is valid, adds the page to the swap cache with add_to_page_cache_unique().
70 int add_to_swap_cache(struct page *page, swp_entry_t entry) 71 { 72 if (page->mapping) 73 BUG(); 74 if (!swap_duplicate(entry)) { 75 INC_CACHE_INFO(noent_race); 76 return -ENOENT; 77 } 78 if (add_to_page_cache_unique(page, &swapper_space, entry.val, 79 page_hash(&swapper_space, entry.val)) != 0) { 80 swap_free(entry); 81 INC_CACHE_INFO(exist_race); 82 return -EEXIST; 83 } 84 if (!PageLocked(page)) 85 BUG(); 86 if (!PageSwapCache(page)) 87 BUG(); 88 INC_CACHE_INFO(add_total); 89 return 0; 90 }
This function verifies that a swap entry is valid and, if so, increments its swap_map count.
1161 int swap_duplicate(swp_entry_t entry) 1162 { 1163 struct swap_info_struct * p; 1164 unsigned long offset, type; 1165 int result = 0; 1166 1167 type = SWP_TYPE(entry); 1168 if (type >= nr_swapfiles) 1169 goto bad_file; 1170 p = type + swap_info; 1171 offset = SWP_OFFSET(entry); 1172 1173 swap_device_lock(p); 1174 if (offset < p->max && p->swap_map[offset]) { 1175 if (p->swap_map[offset] < SWAP_MAP_MAX - 1) { 1176 p->swap_map[offset]++; 1177 result = 1; 1178 } else if (p->swap_map[offset] <= SWAP_MAP_MAX) { 1179 if (swap_overflow++ < 5) 1180 printk(KERN_WARNING "swap_dup: swap entry overflow\n"); 1181 p->swap_map[offset] = SWAP_MAP_MAX; 1182 result = 1; 1183 } 1184 } 1185 swap_device_unlock(p); 1186 out: 1187 return result; 1188 1189 bad_file: 1190 printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); 1191 goto out; 1192 }
This function decrements the swap_map entry corresponding to the swp_entry_t.
214 void swap_free(swp_entry_t entry) 215 { 216 struct swap_info_struct * p; 217 218 p = swap_info_get(entry); 219 if (p) { 220 swap_entry_free(p, SWP_OFFSET(entry)); 221 swap_info_put(p); 222 } 223 }
192 static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) 193 { 194 int count = p->swap_map[offset]; 195 196 if (count < SWAP_MAP_MAX) { 197 count--; 198 p->swap_map[offset] = count; 199 if (!count) { 200 if (offset < p->lowest_bit) 201 p->lowest_bit = offset; 202 if (offset > p->highest_bit) 203 p->highest_bit = offset; 204 nr_swap_pages++; 205 } 206 } 207 return count; 208 }
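Between them, swap_duplicate() and swap_entry_free() maintain a simple per-slot reference count: one reference for each PTE holding the swp_entry_t, plus one held by the swap cache while the page resides there. The toy model below is purely illustrative and ignores locking and the SWAP_MAP_MAX overflow case; the slot_* helpers are hypothetical names.

/* Toy model of swap_map reference counting.  Ignores locking and the
 * SWAP_MAP_MAX overflow handling; for illustration only. */
#include <stdio.h>

static unsigned short swap_map[1024];

/* scan_swap_map() sets the count to 1 when a slot is first allocated */
static void slot_allocate(unsigned long offset)
{
        swap_map[offset] = 1;            /* reference held by the swap cache */
}

/* swap_duplicate() takes another reference, e.g. for each PTE that is
 * set to the swp_entry_t when the page is unmapped or a process forks */
static void slot_duplicate(unsigned long offset)
{
        swap_map[offset]++;
}

/* swap_entry_free() drops a reference; at 0 the slot is free again */
static unsigned short slot_free(unsigned long offset)
{
        return --swap_map[offset];
}

int main(void)
{
        unsigned long slot = 42;

        slot_allocate(slot);             /* swap cache:       count == 1 */
        slot_duplicate(slot);            /* one mapping PTE:  count == 2 */
        slot_duplicate(slot);            /* forked child PTE: count == 3 */

        slot_free(slot);                 /* child exits:      count == 2 */
        slot_free(slot);                 /* parent faults in: count == 1 */
        printf("only the swap cache uses the slot now: count == %u\n",
               swap_map[slot]);

        slot_free(slot);                 /* cache page freed: count == 0 */
        return 0;
}

A count of 1 is exactly the condition remove_exclusive_swap_page() and free_swap_and_cache() test for below when deciding whether the swap cache is the only remaining user of a slot.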
This function, swap_info_get(), finds the swap_info_struct for the given entry, performs some basic checking and then locks the device.
147 static struct swap_info_struct * swap_info_get(swp_entry_t entry) 148 { 149 struct swap_info_struct * p; 150 unsigned long offset, type; 151 152 if (!entry.val) 153 goto out; 154 type = SWP_TYPE(entry); 155 if (type >= nr_swapfiles) 156 goto bad_nofile; 157 p = & swap_info[type]; 158 if (!(p->flags & SWP_USED)) 159 goto bad_device; 160 offset = SWP_OFFSET(entry); 161 if (offset >= p->max) 162 goto bad_offset; 163 if (!p->swap_map[offset]) 164 goto bad_free; 165 swap_list_lock(); 166 if (p->prio > swap_info[swap_list.next].prio) 167 swap_list.next = type; 168 swap_device_lock(p); 169 return p; 170 171 bad_free: 172 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val); 173 goto out; 174 bad_offset: 175 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val); 176 goto out; 177 bad_device: 178 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val); 179 goto out; 180 bad_nofile: 181 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); 182 out: 183 return NULL; 184 }
This function simply unlocks the area and the swap list.
186 static void swap_info_put(struct swap_info_struct * p) 187 { 188 swap_device_unlock(p); 189 swap_list_unlock(); 190 }
This is the top-level function for finding a page in the swap cache.
161 struct page * lookup_swap_cache(swp_entry_t entry) 162 { 163 struct page *found; 164 165 found = find_get_page(&swapper_space, entry.val); 166 /* 167 * Unsafe to assert PageSwapCache and mapping on page found: 168 * if SMP nothing prevents swapoff from deleting this page from 169 * the swap cache at this moment. find_lock_page would prevent 170 * that, but no need to change: we _have_ got the right page. 171 */ 172 INC_CACHE_INFO(find_total); 173 if (found) 174 INC_CACHE_INFO(find_success); 175 return found; 176 }
This function returns the requested page from the swap cache if it exists. If it does not, a page is allocated, placed in the swap cache and the data is scheduled to be read from disk with rw_swap_page().
184 struct page * read_swap_cache_async(swp_entry_t entry) 185 { 186 struct page *found_page, *new_page = NULL; 187 int err; 188 189 do { 196 found_page = find_get_page(&swapper_space, entry.val); 197 if (found_page) 198 break; 199 200 /* 201 * Get a new page to read into from swap. 202 */ 203 if (!new_page) { 204 new_page = alloc_page(GFP_HIGHUSER); 205 if (!new_page) 206 break; /* Out of memory */ 207 } 208 209 /* 210 * Associate the page with swap entry in the swap cache. 211 * May fail (-ENOENT) if swap entry has been freed since 212 * our caller observed it. May fail (-EEXIST) if there 213 * is already a page associated with this entry in the 214 * swap cache: added by a racing read_swap_cache_async, 215 * or by try_to_swap_out (or shmem_writepage) re-using 216 * the just freed swap entry for an existing page. 217 */ 218 err = add_to_swap_cache(new_page, entry); 219 if (!err) { 220 /* 221 * Initiate read into locked page and return. 222 */ 223 rw_swap_page(READ, new_page); 224 return new_page; 225 } 226 } while (err != -ENOENT); 227 228 if (new_page) 229 page_cache_release(new_page); 230 return found_page; 231 }
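As a usage illustration, the fault path for a swapped-out page combines lookup_swap_cache() and read_swap_cache_async() in roughly the following way. This is only a simplified sketch in the spirit of do_swap_page(), not the real fault handler; locking, readahead and the PTE update are omitted and swap_fault_sketch() is a hypothetical name.

#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/pagemap.h>

/* Simplified sketch of how a fault on a swapped-out page uses the
 * swap cache API.  Error handling and readahead are omitted. */
static struct page *swap_fault_sketch(swp_entry_t entry)
{
        struct page *page;

        /* Fast path: the page may still (or already) be in the swap cache */
        page = lookup_swap_cache(entry);
        if (page)
                return page;

        /*
         * Slow path: allocate a page, add it to the swap cache and start
         * the read from the swap area.  The data cannot be trusted until
         * the IO completes and the page is unlocked.
         */
        page = read_swap_cache_async(entry);
        if (!page)
                return NULL;    /* out of memory, or the entry was freed */

        wait_on_page(page);
        return page;
}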
swap_writepage() is the function registered in swap_aops for writing out pages. Its operation is straightforward. First it calls remove_exclusive_swap_page() to try to free the page. If the page was freed, it is unlocked here before returning, as there is no IO pending on it. Otherwise rw_swap_page() is called to write the page out to backing storage.
24 static int swap_writepage(struct page *page) 25 { 26 if (remove_exclusive_swap_page(page)) { 27 UnlockPage(page); 28 return 0; 29 } 30 rw_swap_page(WRITE, page); 31 return 0; 32 }
This function tries to work out whether any other processes are sharing this page. If not, the page is removed from the swap cache and freed. Once it has been removed from the swap cache, swap_free() is called to decrement the swap_map count, indicating that the swap cache is no longer using the slot; the count then reflects only the number of PTEs that contain a swp_entry_t for this slot.
287 int remove_exclusive_swap_page(struct page *page) 288 { 289 int retval; 290 struct swap_info_struct * p; 291 swp_entry_t entry; 292 293 if (!PageLocked(page)) 294 BUG(); 295 if (!PageSwapCache(page)) 296 return 0; 297 if (page_count(page) - !!page->buffers != 2) /* 2: us + cache */ 298 return 0; 299 300 entry.val = page->index; 301 p = swap_info_get(entry); 302 if (!p) 303 return 0; 304 305 /* Is the only swap cache user the cache itself? */ 306 retval = 0; 307 if (p->swap_map[SWP_OFFSET(entry)] == 1) { 308 /* Recheck the page count with the pagecache lock held.. */ 309 spin_lock(&pagecache_lock); 310 if (page_count(page) - !!page->buffers == 2) { 311 __delete_from_swap_cache(page); 312 SetPageDirty(page); 313 retval = 1; 314 } 315 spin_unlock(&pagecache_lock); 316 } 317 swap_info_put(p); 318 319 if (retval) { 320 block_flushpage(page, 0); 321 swap_free(entry); 322 page_cache_release(page); 323 } 324 325 return retval; 326 }
This function drops a reference to a swap entry and tries to reclaim the corresponding page from the swap cache. Note that this function only applies to pages in the swap cache.
332 void free_swap_and_cache(swp_entry_t entry) 333 { 334 struct swap_info_struct * p; 335 struct page *page = NULL; 336 337 p = swap_info_get(entry); 338 if (p) { 339 if (swap_entry_free(p, SWP_OFFSET(entry)) == 1) 340 page = find_trylock_page(&swapper_space, entry.val); 341 swap_info_put(p); 342 } 343 if (page) { 344 page_cache_get(page); 345 /* Only cache user (+us), or swap space full? Free it! */ 346 if (page_count(page) - !!page->buffers == 2 || vm_swap_full()) { 347 delete_from_swap_cache(page); 348 SetPageDirty(page); 349 } 350 UnlockPage(page); 351 page_cache_release(page); 352 } 353 }
This is the main function used for reading data from backing storage into a page or writing data from a page to backing storage. Which operation it performs depends on the first parameter rw. It is basically a wrapper around the core function rw_swap_page_base() and simply enforces that the operations are only performed on pages in the swap cache.
85 void rw_swap_page(int rw, struct page *page) 86 { 87 swp_entry_t entry; 88 89 entry.val = page->index; 90 91 if (!PageLocked(page)) 92 PAGE_BUG(page); 93 if (!PageSwapCache(page)) 94 PAGE_BUG(page); 95 if (!rw_swap_page_base(rw, entry, page)) 96 UnlockPage(page); 97 }
This is the core function for reading or writing data to the backing storage. Whether it is writing to a partition or a file, the block layer brw_page() function is used to perform the actual IO. This function sets up the necessary buffer information for the block layer to do its job. brw_page() performs asynchronous IO, so it is likely to return with the page still locked; the page will be unlocked when the IO completes.
36 static int rw_swap_page_base(int rw, swp_entry_t entry, struct page *page) 37 { 38 unsigned long offset; 39 int zones[PAGE_SIZE/512]; 40 int zones_used; 41 kdev_t dev = 0; 42 int block_size; 43 struct inode *swapf = 0; 44 45 if (rw == READ) { 46 ClearPageUptodate(page); 47 kstat.pswpin++; 48 } else 49 kstat.pswpout++; 50
51 get_swaphandle_info(entry, &offset, &dev, &swapf); 52 if (dev) { 53 zones[0] = offset; 54 zones_used = 1; 55 block_size = PAGE_SIZE; 56 } else if (swapf) { 57 int i, j; 58 unsigned int block = 59 offset << (PAGE_SHIFT - swapf->i_sb->s_blocksize_bits); 60 61 block_size = swapf->i_sb->s_blocksize; 62 for (i=0, j=0; j< PAGE_SIZE ; i++, j += block_size) 63 if (!(zones[i] = bmap(swapf,block++))) { 64 printk("rw_swap_page: bad swap file\n"); 65 return 0; 66 } 67 zones_used = i; 68 dev = swapf->i_dev; 69 } else { 70 return 0; 71 } 72 73 /* block_size == PAGE_SIZE/zones_used */ 74 brw_page(rw, page, dev, zones, block_size); 75 return 1; 76 }
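For a swap file, the loop above maps one page onto several filesystem blocks with bmap(). As a worked example, assume PAGE_SIZE is 4096 and the filesystem block size is 1024 so that each page needs four blocks; the user-space sketch below reproduces the arithmetic, with the hypothetical file_bmap() standing in for bmap() and simply pretending the swap file is contiguous on disk.

/* Worked example of the block calculation in rw_swap_page_base() for a
 * file-backed swap area.  PAGE_SIZE and BLOCK_SIZE are assumed values
 * and file_bmap() is a stub; only the arithmetic mirrors the kernel. */
#include <stdio.h>

#define PAGE_SIZE   4096
#define PAGE_SHIFT  12
#define BLOCK_SIZE  1024
#define BLOCK_SHIFT 10

/* Pretend the swap file is laid out contiguously from disk block 5000 */
static unsigned int file_bmap(unsigned int file_block)
{
        return 5000 + file_block;
}

int main(void)
{
        unsigned long offset = 3;               /* swap slot 3 in the area */
        unsigned int zones[PAGE_SIZE / 512];
        unsigned int block = offset << (PAGE_SHIFT - BLOCK_SHIFT);
        int i, j;

        /* One zone per filesystem block needed to cover the page */
        for (i = 0, j = 0; j < PAGE_SIZE; i++, j += BLOCK_SIZE)
                zones[i] = file_bmap(block++);

        for (j = 0; j < i; j++)
                printf("zone[%d] = disk block %u\n", j, zones[j]);
        return 0;
}

With a block device, by contrast, the whole page is a single PAGE_SIZE-sized zone at the slot offset, as the dev branch above shows.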
This function, get_swaphandle_info(), is responsible for returning either the kdev_t or the struct inode that manages the swap area the entry belongs to.
1197 void get_swaphandle_info(swp_entry_t entry, unsigned long *offset, 1198 kdev_t *dev, struct inode **swapf) 1199 { 1200 unsigned long type; 1201 struct swap_info_struct *p; 1202 1203 type = SWP_TYPE(entry); 1204 if (type >= nr_swapfiles) { 1205 printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_file, entry.val); 1206 return; 1207 } 1208 1209 p = &swap_info[type]; 1210 *offset = SWP_OFFSET(entry); 1211 if (*offset >= p->max && *offset != 0) { 1212 printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_offset, entry.val); 1213 return; 1214 } 1215 if (p->swap_map && !p->swap_map[*offset]) { 1216 printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_offset, entry.val); 1217 return; 1218 } 1219 if (!(p->flags & SWP_USED)) { 1220 printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_file, entry.val); 1221 return; 1222 } 1223 1224 if (p->swap_device) { 1225 *dev = p->swap_device; 1226 } else if (p->swap_file) { 1227 *swapf = p->swap_file->d_inode; 1228 } else { 1229 printk(KERN_ERR "rw_swap_page: no swap file or device\n"); 1230 } 1231 return; 1232 }
This quite large function is responsible for activating swap space. Broadly speaking, the tasks it performs are as follows, with each step described in the commentary accompanying the blocks of code below.
855 asmlinkage long sys_swapon(const char * specialfile, int swap_flags) 856 { 857 struct swap_info_struct * p; 858 struct nameidata nd; 859 struct inode * swap_inode; 860 unsigned int type; 861 int i, j, prev; 862 int error; 863 static int least_priority = 0; 864 union swap_header *swap_header = 0; 865 int swap_header_version; 866 int nr_good_pages = 0; 867 unsigned long maxpages = 1; 868 int swapfilesize; 869 struct block_device *bdev = NULL; 870 unsigned short *swap_map; 871 872 if (!capable(CAP_SYS_ADMIN)) 873 return -EPERM; 874 lock_kernel(); 875 swap_list_lock(); 876 p = swap_info;
877 for (type = 0 ; type < nr_swapfiles ; type++,p++) 878 if (!(p->flags & SWP_USED)) 879 break; 880 error = -EPERM; 881 if (type >= MAX_SWAPFILES) { 882 swap_list_unlock(); 883 goto out; 884 } 885 if (type >= nr_swapfiles) 886 nr_swapfiles = type+1; 887 p->flags = SWP_USED; 888 p->swap_file = NULL; 889 p->swap_vfsmnt = NULL; 890 p->swap_device = 0; 891 p->swap_map = NULL; 892 p->lowest_bit = 0; 893 p->highest_bit = 0; 894 p->cluster_nr = 0; 895 p->sdev_lock = SPIN_LOCK_UNLOCKED; 896 p->next = -1; 897 if (swap_flags & SWAP_FLAG_PREFER) { 898 p->prio = 899 (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT; 900 } else { 901 p->prio = --least_priority; 902 } 903 swap_list_unlock();
Find a free swap_info_struct and initialise it with default values.
904 error = user_path_walk(specialfile, &nd); 905 if (error) 906 goto bad_swap_2; 907 908 p->swap_file = nd.dentry; 909 p->swap_vfsmnt = nd.mnt; 910 swap_inode = nd.dentry->d_inode; 911 error = -EINVAL; 912
Traverse the VFS and get some information about the special file.
913 if (S_ISBLK(swap_inode->i_mode)) { 914 kdev_t dev = swap_inode->i_rdev; 915 struct block_device_operations *bdops; 916 devfs_handle_t de; 917 918 p->swap_device = dev; 919 set_blocksize(dev, PAGE_SIZE); 920 921 bd_acquire(swap_inode); 922 bdev = swap_inode->i_bdev; 923 de = devfs_get_handle_from_inode(swap_inode); 924 bdops = devfs_get_ops(de); 925 if (bdops) bdev->bd_op = bdops; 926 927 error = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_SWAP); 928 devfs_put_ops(de);/* Decrement module use count * now we're safe*/ 929 if (error) 930 goto bad_swap_2; 931 set_blocksize(dev, PAGE_SIZE); 932 error = -ENODEV; 933 if (!dev || (blk_size[MAJOR(dev)] && 934 !blk_size[MAJOR(dev)][MINOR(dev)])) 935 goto bad_swap; 936 swapfilesize = 0; 937 if (blk_size[MAJOR(dev)]) 938 swapfilesize = blk_size[MAJOR(dev)][MINOR(dev)] 939 >> (PAGE_SHIFT - 10); 940 } else if (S_ISREG(swap_inode->i_mode)) 941 swapfilesize = swap_inode->i_size >> PAGE_SHIFT; 942 else 943 goto bad_swap;
If a partition, configure the block device before calculating the size of the area, else obtain it from the inode for the file.
945 error = -EBUSY; 946 for (i = 0 ; i < nr_swapfiles ; i++) { 947 struct swap_info_struct *q = &swap_info[i]; 948 if (i == type || !q->swap_file) 949 continue; 950 if (swap_inode->i_mapping == q->swap_file->d_inode->i_mapping) 951 goto bad_swap; 952 } 953 954 swap_header = (void *) __get_free_page(GFP_USER); 955 if (!swap_header) { 956 printk("Unable to start swapping: out of memory :-)\n"); 957 error = -ENOMEM; 958 goto bad_swap; 959 } 960 961 lock_page(virt_to_page(swap_header)); 962 rw_swap_page_nolock(READ, SWP_ENTRY(type,0), (char *) swap_header); 963 964 if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10)) 965 swap_header_version = 1; 966 else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10)) 967 swap_header_version = 2; 968 else { 969 printk("Unable to find swap-space signature\n"); 970 error = -EINVAL; 971 goto bad_swap; 972 }
974 switch (swap_header_version) { 975 case 1: 976 memset(((char *) swap_header)+PAGE_SIZE-10,0,10); 977 j = 0; 978 p->lowest_bit = 0; 979 p->highest_bit = 0; 980 for (i = 1 ; i < 8*PAGE_SIZE ; i++) { 981 if (test_bit(i,(char *) swap_header)) { 982 if (!p->lowest_bit) 983 p->lowest_bit = i; 984 p->highest_bit = i; 985 maxpages = i+1; 986 j++; 987 } 988 } 989 nr_good_pages = j; 990 p->swap_map = vmalloc(maxpages * sizeof(short)); 991 if (!p->swap_map) { 992 error = -ENOMEM; 993 goto bad_swap; 994 } 995 for (i = 1 ; i < maxpages ; i++) { 996 if (test_bit(i,(char *) swap_header)) 997 p->swap_map[i] = 0; 998 else 999 p->swap_map[i] = SWAP_MAP_BAD; 1000 } 1001 break; 1002
Read in the information needed to populate the swap_map when the swap area is version 1.
1003 case 2: 1006 if (swap_header->info.version != 1) { 1007 printk(KERN_WARNING 1008 "Unable to handle swap header version %d\n", 1009 swap_header->info.version); 1010 error = -EINVAL; 1011 goto bad_swap; 1012 } 1013 1014 p->lowest_bit = 1; 1015 maxpages = SWP_OFFSET(SWP_ENTRY(0,~0UL)) - 1; 1016 if (maxpages > swap_header->info.last_page) 1017 maxpages = swap_header->info.last_page; 1018 p->highest_bit = maxpages - 1; 1019 1020 error = -EINVAL; 1021 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) 1022 goto bad_swap; 1023 1025 if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) { 1026 error = -ENOMEM; 1027 goto bad_swap; 1028 } 1029 1030 error = 0; 1031 memset(p->swap_map, 0, maxpages * sizeof(short)); 1032 for (i=0; i<swap_header->info.nr_badpages; i++) { 1033 int page = swap_header->info.badpages[i]; 1034 if (page <= 0 || page >= swap_header->info.last_page) 1035 error = -EINVAL; 1036 else 1037 p->swap_map[page] = SWAP_MAP_BAD; 1038 } 1039 nr_good_pages = swap_header->info.last_page - 1040 swap_header->info.nr_badpages - 1041 1 /* header page */; 1042 if (error) 1043 goto bad_swap; 1044 }
Read the header information when the file format is version 2.
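The fields read in this case come from the header page written by mkswap. For reference, the union swap_header declaration in include/linux/swap.h looks roughly like the following; treat the exact amount of padding as illustrative rather than definitive.

/* Layout of the swap area header page (approximate).  The last 10
 * bytes of the page carry the signature; for version 2 the info
 * structure at the start of the page describes the area. */
union swap_header {
        struct {
                char reserved[PAGE_SIZE - 10];
                char magic[10];                 /* "SWAP-SPACE" or "SWAPSPACE2" */
        } magic;
        struct {
                char         bootbits[1024];    /* space for a disklabel etc. */
                unsigned int version;           /* header version, 1 here */
                unsigned int last_page;         /* last usable page in the area */
                unsigned int nr_badpages;       /* number of bad page slots */
                unsigned int padding[125];
                unsigned int badpages[1];       /* list of bad page slots */
        } info;
};

MAX_SWAP_BADPAGES, checked above, is derived from how many badpages entries fit between the start of the badpages array and the magic signature at the end of the page.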
1045 1046 if (swapfilesize && maxpages > swapfilesize) { 1047 printk(KERN_WARNING 1048 "Swap area shorter than signature indicates\n"); 1049 error = -EINVAL; 1050 goto bad_swap; 1051 } 1052 if (!nr_good_pages) { 1053 printk(KERN_WARNING "Empty swap-file\n"); 1054 error = -EINVAL; 1055 goto bad_swap; 1056 } 1057 p->swap_map[0] = SWAP_MAP_BAD; 1058 swap_list_lock(); 1059 swap_device_lock(p); 1060 p->max = maxpages; 1061 p->flags = SWP_WRITEOK; 1062 p->pages = nr_good_pages; 1063 nr_swap_pages += nr_good_pages; 1064 total_swap_pages += nr_good_pages; 1065 printk(KERN_INFO "Adding Swap: %dk swap-space (priority %d)\n", 1066 nr_good_pages<<(PAGE_SHIFT-10), p->prio);
1068 /* insert swap space into swap_list: */ 1069 prev = -1; 1070 for (i = swap_list.head; i >= 0; i = swap_info[i].next) { 1071 if (p->prio >= swap_info[i].prio) { 1072 break; 1073 } 1074 prev = i; 1075 } 1076 p->next = i; 1077 if (prev < 0) { 1078 swap_list.head = swap_list.next = p - swap_info; 1079 } else { 1080 swap_info[prev].next = p - swap_info; 1081 } 1082 swap_device_unlock(p); 1083 swap_list_unlock(); 1084 error = 0; 1085 goto out;
1086 bad_swap: 1087 if (bdev) 1088 blkdev_put(bdev, BDEV_SWAP); 1089 bad_swap_2: 1090 swap_list_lock(); 1091 swap_map = p->swap_map; 1092 nd.mnt = p->swap_vfsmnt; 1093 nd.dentry = p->swap_file; 1094 p->swap_device = 0; 1095 p->swap_file = NULL; 1096 p->swap_vfsmnt = NULL; 1097 p->swap_map = NULL; 1098 p->flags = 0; 1099 if (!(swap_flags & SWAP_FLAG_PREFER)) 1100 ++least_priority; 1101 swap_list_unlock(); 1102 if (swap_map) 1103 vfree(swap_map); 1104 path_release(&nd); 1105 out: 1106 if (swap_header) 1107 free_page((long) swap_header); 1108 unlock_kernel(); 1109 return error; 1110 }
This function is called during the initialisation of kswapd to set the size of page_cluster. This variable determines how many pages to read ahead from files and from backing storage when paging in data; the readahead window used is 2^page_cluster pages.
100 void __init swap_setup(void) 101 { 102 unsigned long megs = num_physpages >> (20 - PAGE_SHIFT); 103 104 /* Use a smaller cluster for small-memory machines */ 105 if (megs < 16) 106 page_cluster = 2; 107 else 108 page_cluster = 3; 109 /* 110 * Right now other parts of the system means that we 111 * _really_ don't want to cluster much more 112 */ 113 }
This function is principally concerned with updating the swap_info_struct and the swap lists. The main task of paging in all pages in the area is the responsibility of try_to_unuse(). The function's tasks are broadly as described in the commentary on each block of code below.
720 asmlinkage long sys_swapoff(const char * specialfile) 721 { 722 struct swap_info_struct * p = NULL; 723 unsigned short *swap_map; 724 struct nameidata nd; 725 int i, type, prev; 726 int err; 727 728 if (!capable(CAP_SYS_ADMIN)) 729 return -EPERM; 730 731 err = user_path_walk(specialfile, &nd); 732 if (err) 733 goto out; 734
735 lock_kernel(); 736 prev = -1; 737 swap_list_lock(); 738 for (type = swap_list.head; type >= 0; type = swap_info[type].next) { 739 p = swap_info + type; 740 if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) { 741 if (p->swap_file == nd.dentry) 742 break; 743 } 744 prev = type; 745 } 746 err = -EINVAL; 747 if (type < 0) { 748 swap_list_unlock(); 749 goto out_dput; 750 } 751 752 if (prev < 0) { 753 swap_list.head = p->next; 754 } else { 755 swap_info[prev].next = p->next; 756 } 757 if (type == swap_list.next) { 758 /* just pick something that's safe... */ 759 swap_list.next = swap_list.head; 760 } 761 nr_swap_pages -= p->pages; 762 total_swap_pages -= p->pages; 763 p->flags = SWP_USED;
Acquire the BKL, find the swap_info_struct for the area to be deactivated and remove it from the swap list.
764 swap_list_unlock(); 765 unlock_kernel(); 766 err = try_to_unuse(type);
767 lock_kernel(); 768 if (err) { 769 /* re-insert swap space back into swap_list */ 770 swap_list_lock(); 771 for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next) 772 if (p->prio >= swap_info[i].prio) 773 break; 774 p->next = i; 775 if (prev < 0) 776 swap_list.head = swap_list.next = p - swap_info; 777 else 778 swap_info[prev].next = p - swap_info; 779 nr_swap_pages += p->pages; 780 total_swap_pages += p->pages; 781 p->flags = SWP_WRITEOK; 782 swap_list_unlock(); 783 goto out_dput; 784 }
Reacquire the BKL. If try_to_unuse() failed to page in all the pages, reinsert the area into the swap list.
785 if (p->swap_device) 786 blkdev_put(p->swap_file->d_inode->i_bdev, BDEV_SWAP); 787 path_release(&nd); 788 789 swap_list_lock(); 790 swap_device_lock(p); 791 nd.mnt = p->swap_vfsmnt; 792 nd.dentry = p->swap_file; 793 p->swap_vfsmnt = NULL; 794 p->swap_file = NULL; 795 p->swap_device = 0; 796 p->max = 0; 797 swap_map = p->swap_map; 798 p->swap_map = NULL; 799 p->flags = 0; 800 swap_device_unlock(p); 801 swap_list_unlock(); 802 vfree(swap_map); 803 err = 0; 804 805 out_dput: 806 unlock_kernel(); 807 path_release(&nd); 808 out: 809 return err; 810 }
Otherwise the swap area was successfully deactivated, so close the block device and mark the swap_info_struct as unused.
This function is heavily commented in the source code, although parts of the commentary are speculative or slightly inaccurate. The comments are omitted here for brevity.
513 static int try_to_unuse(unsigned int type) 514 { 515 struct swap_info_struct * si = &swap_info[type]; 516 struct mm_struct *start_mm; 517 unsigned short *swap_map; 518 unsigned short swcount; 519 struct page *page; 520 swp_entry_t entry; 521 int i = 0; 522 int retval = 0; 523 int reset_overflow = 0; 525 540 start_mm = &init_mm; 541 atomic_inc(&init_mm.mm_users); 542
556 while ((i = find_next_to_unuse(si, i))) { 557 /* 558 * Get a page for the entry, using the existing swap 559 * cache page if there is one. Otherwise, get a clean 560 * page and read the swap into it. 561 */ 562 swap_map = &si->swap_map[i]; 563 entry = SWP_ENTRY(type, i); 564 page = read_swap_cache_async(entry); 565 if (!page) { 572 if (!*swap_map) 573 continue; 574 retval = -ENOMEM; 575 break; 576 } 577 578 /* 579 * Don't hold on to start_mm if it looks like exiting. 580 */ 581 if (atomic_read(&start_mm->mm_users) == 1) { 582 mmput(start_mm); 583 start_mm = &init_mm; 584 atomic_inc(&init_mm.mm_users); 585 }
587 /* 588 * Wait for and lock page. When do_swap_page races with 589 * try_to_unuse, do_swap_page can handle the fault much 590 * faster than try_to_unuse can locate the entry. This 591 * apparently redundant "wait_on_page" lets try_to_unuse 592 * defer to do_swap_page in such a case - in some tests, 593 * do_swap_page and try_to_unuse repeatedly compete. 594 */ 595 wait_on_page(page); 596 lock_page(page); 597 598 /* 599 * Remove all references to entry, without blocking. 600 * Whenever we reach init_mm, there's no address space 601 * to search, but use it as a reminder to search shmem. 602 */ 603 shmem = 0; 604 swcount = *swap_map; 605 if (swcount > 1) { 606 flush_page_to_ram(page); 607 if (start_mm == &init_mm) 608 shmem = shmem_unuse(entry, page); 609 else 610 unuse_process(start_mm, entry, page); 611 }
612 if (*swap_map > 1) { 613 int set_start_mm = (*swap_map >= swcount); 614 struct list_head *p = &start_mm->mmlist; 615 struct mm_struct *new_start_mm = start_mm; 616 struct mm_struct *mm; 617 618 spin_lock(&mmlist_lock); 619 while (*swap_map > 1 && 620 (p = p->next) != &start_mm->mmlist) { 621 mm = list_entry(p, struct mm_struct, mmlist); 622 swcount = *swap_map; 623 if (mm == &init_mm) { 624 set_start_mm = 1; 625 spin_unlock(&mmlist_lock); 626 shmem = shmem_unuse(entry, page); 627 spin_lock(&mmlist_lock); 628 } else 629 unuse_process(mm, entry, page); 630 if (set_start_mm && *swap_map < swcount) { 631 new_start_mm = mm; 632 set_start_mm = 0; 633 } 634 } 635 atomic_inc(&new_start_mm->mm_users); 636 spin_unlock(&mmlist_lock); 637 mmput(start_mm); 638 start_mm = new_start_mm; 639 }
654 if (*swap_map == SWAP_MAP_MAX) { 655 swap_list_lock(); 656 swap_device_lock(si); 657 nr_swap_pages++; 658 *swap_map = 1; 659 swap_device_unlock(si); 660 swap_list_unlock(); 661 reset_overflow = 1; 662 }
683 if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { 684 rw_swap_page(WRITE, page); 685 lock_page(page); 686 } 687 if (PageSwapCache(page)) { 688 if (shmem) 689 swap_duplicate(entry); 690 else 691 delete_from_swap_cache(page); 692 }
699 SetPageDirty(page); 700 UnlockPage(page); 701 page_cache_release(page);
708 if (current->need_resched) 714 schedule(); 715 } 716 717 mmput(start_mm); 718 if (reset_overflow) { 714 printk(KERN_WARNING "swapoff: cleared swap entry overflow\n"); 715 swap_overflow = 0; 716 } 717 return retval; 718 }
This function begins the page table walk required to remove the requested page and entry from the process page tables managed by mm. This is only required when a swap area is being deactivated so, while expensive, it is a very rare operation. This set of functions should be instantly recognisable as a standard page-table walk.
454 static void unuse_process(struct mm_struct * mm, 455 swp_entry_t entry, struct page* page) 456 { 457 struct vm_area_struct* vma; 458 459 /* 460 * Go through process' page directory. 461 */ 462 spin_lock(&mm->page_table_lock); 463 for (vma = mm->mmap; vma; vma = vma->vm_next) { 464 pgd_t * pgd = pgd_offset(mm, vma->vm_start); 465 unuse_vma(vma, pgd, entry, page); 466 } 467 spin_unlock(&mm->page_table_lock); 468 return; 469 }
This function searches the requested VMA for page table entries mapping the page and using the given swap entry. It calls unuse_pgd() for every PGD this VMA maps.
440 static void unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir, 441 swp_entry_t entry, struct page* page) 442 { 443 unsigned long start = vma->vm_start, end = vma->vm_end; 444 445 if (start >= end) 446 BUG(); 447 do { 448 unuse_pgd(vma, pgdir, start, end - start, entry, page); 449 start = (start + PGDIR_SIZE) & PGDIR_MASK; 450 pgdir++; 451 } while (start && (start < end)); 452 }
This function searches the requested PGD for page table entries mapping the page and using the given swap entry. It calls unuse_pmd() for every PMD this PGD maps.
409 static inline void unuse_pgd(struct vm_area_struct * vma, pgd_t *dir, 410 unsigned long address, unsigned long size, 411 swp_entry_t entry, struct page* page) 412 { 413 pmd_t * pmd; 414 unsigned long offset, end; 415 416 if (pgd_none(*dir)) 417 return; 418 if (pgd_bad(*dir)) { 419 pgd_ERROR(*dir); 420 pgd_clear(dir); 421 return; 422 } 423 pmd = pmd_offset(dir, address); 424 offset = address & PGDIR_MASK; 425 address &= ~PGDIR_MASK; 426 end = address + size; 427 if (end > PGDIR_SIZE) 428 end = PGDIR_SIZE; 429 if (address >= end) 430 BUG(); 431 do { 432 unuse_pmd(vma, pmd, address, end - address, offset, entry, 433 page); 434 address = (address + PMD_SIZE) & PMD_MASK; 435 pmd++; 436 } while (address && (address < end)); 437 }
This function searches the requested PMD for page table entries mapping the page and using the given swap entry. It calls unuse_pte() for every PTE this PMD maps.
381 static inline void unuse_pmd(struct vm_area_struct * vma, pmd_t *dir, 382 unsigned long address, unsigned long size, unsigned long offset, 383 swp_entry_t entry, struct page* page) 384 { 385 pte_t * pte; 386 unsigned long end; 387 388 if (pmd_none(*dir)) 389 return; 390 if (pmd_bad(*dir)) { 391 pmd_ERROR(*dir); 392 pmd_clear(dir); 393 return; 394 } 395 pte = pte_offset(dir, address); 396 offset += address & PMD_MASK; 397 address &= ~PMD_MASK; 398 end = address + size; 399 if (end > PMD_SIZE) 400 end = PMD_SIZE; 401 do { 402 unuse_pte(vma, offset+address-vma->vm_start, pte, entry, page); 403 address += PAGE_SIZE; 404 pte++; 405 } while (address && (address < end)); 406 }
This function checks if the PTE at dir matches the entry we are searching for. If it does, a reference is taken to the page, the PTE is updated to map it and the swap entry is freed.
365 static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address, 366 pte_t *dir, swp_entry_t entry, struct page* page) 367 { 368 pte_t pte = *dir; 369 370 if (likely(pte_to_swp_entry(pte).val != entry.val)) 371 return; 372 if (unlikely(pte_none(pte) || pte_present(pte))) 373 return; 374 get_page(page); 375 set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot))); 376 swap_free(entry); 377 ++vma->vm_mm->rss; 378 }