Appendix L Shared Memory Virtual Filesystem
L.1 Initialising shmfs
L.1.1 Function: init_tmpfs
Source: mm/shmem.c
This function is responsible for registering and mounting the
tmpfs and shmemfs filesystems.
1451 #ifdef CONFIG_TMPFS
1453 static DECLARE_FSTYPE(shmem_fs_type, "shm",
shmem_read_super, FS_LITTER);
1454 static DECLARE_FSTYPE(tmpfs_fs_type, "tmpfs",
shmem_read_super, FS_LITTER);
1455 #else
1456 static DECLARE_FSTYPE(tmpfs_fs_type, "tmpfs",
shmem_read_super, FS_LITTER|FS_NOMOUNT);
1457 #endif
1560 static int __init init_tmpfs(void)
1561 {
1562 int error;
1563
1564 error = register_filesystem(&tmpfs_fs_type);
1565 if (error) {
1566 printk(KERN_ERR "Could not register tmpfs\n");
1567 goto out3;
1568 }
1569 #ifdef CONFIG_TMPFS
1570 error = register_filesystem(&shmem_fs_type);
1571 if (error) {
1572 printk(KERN_ERR "Could not register shm fs\n");
1573 goto out2;
1574 }
1575 devfs_mk_dir(NULL, "shm", NULL);
1576 #endif
1577 shm_mnt = kern_mount(&tmpfs_fs_type);
1578 if (IS_ERR(shm_mnt)) {
1579 error = PTR_ERR(shm_mnt);
1580 printk(KERN_ERR "Could not kern_mount tmpfs\n");
1581 goto out1;
1582 }
1583
1584 /* The internal instance should not do size checking */
1585 shmem_set_size(SHMEM_SB(shm_mnt->mnt_sb), ULONG_MAX, ULONG_MAX);
1586 return 0;
1587
1588 out1:
1589 #ifdef CONFIG_TMPFS
1590 unregister_filesystem(&shmem_fs_type);
1591 out2:
1592 #endif
1593 unregister_filesystem(&tmpfs_fs_type);
1594 out3:
1595 shm_mnt = ERR_PTR(error);
1596 return error;
1597 }
1598 module_init(init_tmpfs)
- 1451-1457The shm filesystem is only mountable if CONFIG_TMPFS
is defined at compile time. Even if it is not specified, a tmpfs will still
be set up for anonymous shared memory resulting from a fork()
- 1453-1454DECLARE_FSTYPE(), declared in
<linux/fs.h>, declares tmpfs_fs_type
as type struct file_system_type and fills in
four fields. “tmpfs” is its human-readable name.
shmem_read_super() is the function which is used to read the
superblock for the filesystem (a detailed description of superblocks
and how they pertain to filesystems is beyond the scope of this book).
FS_LITTER is a flag that indicates the filesystem tree should
be maintained in the dcache. Finally, the macro sets the module owner of
the filesystem to be the module loading the filesystem
- 1560__init places this function in the init section.
This means that after the kernel has finished bootstrapping, the code for the
function will be removed
- 1564-1568Register the filesystem tmpfs_fs_type which was
declared at lines 1453-1456. If it fails, goto out3 where the appropriate
error will be returned
- 1569-1574If tmpfs is specified at configure time, register the shmem
filesystem. If it fails, goto out2 where tmpfs_fs_type
will be unregistered before returning the error
- 1575If /dev/ is being managed by the device filesystem
(devfs), then create a new shm directory. If the kernel does not
use devfs, then the system administrator must manually create the directory
- 1577kern_mount() mounts a filesystem internally. In other
words, the filesystem is mounted and active but it is not visible to the user
anywhere in the VFS. The mount point is stored in shm_mnt which is local
to the shmem.c file and of type struct vfsmount. This
variable is needed for searching the filesystem and for unmounting it later
- 1578-1582Ensure the filesystem mounted correctly but if it didn't,
goto out1 where the filesystems will be unregistered before
returning the error
- 1585The function shmem_set_size()
(See Section L.1.3) is responsible for setting the maximum number
of blocks and inodes that may be created in this filesystem
- 1598module_init() in this instance indicates that
init_tmpfs() should be called when the module is loaded. If it is
compiled directly into the kernel, the function will be called on system
startup
L.1.2 Function: shmem_read_super
Source: mm/shmem.c
This is the callback function provided for the filesystem which “reads” the
superblock. With an ordinary filesystem, this would entail reading the
information from the disk but as this is a RAM-based filesystem, it instead
populates a struct super_block.
1452 static struct super_block *shmem_read_super(struct super_block *sb,
void* data, int silent)
1453 {
1454 struct inode *inode;
1455 struct dentry *root;
1456 unsigned long blocks, inodes;
1457 int mode = S_IRWXUGO | S_ISVTX;
1458 uid_t uid = current->fsuid;
1459 gid_t gid = current->fsgid;
1460 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
1461 struct sysinfo si;
1462
1463 /*
1464 * Per default we only allow half of the physical ram per
1465 * tmpfs instance
1466 */
1467 si_meminfo(&si);
1468 blocks = inodes = si.totalram / 2;
1469
1470 #ifdef CONFIG_TMPFS
1471 if (shmem_parse_options(data, &mode, &uid,
&gid, &blocks, &inodes))
1472 return NULL;
1473 #endif
1474
1475 spin_lock_init(&sbinfo->stat_lock);
1476 sbinfo->max_blocks = blocks;
1477 sbinfo->free_blocks = blocks;
1478 sbinfo->max_inodes = inodes;
1479 sbinfo->free_inodes = inodes;
1480 sb->s_maxbytes = SHMEM_MAX_BYTES;
1481 sb->s_blocksize = PAGE_CACHE_SIZE;
1482 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
1483 sb->s_magic = TMPFS_MAGIC;
1484 sb->s_op = &shmem_ops;
1485 inode = shmem_get_inode(sb, S_IFDIR | mode, 0);
1486 if (!inode)
1487 return NULL;
1488
1489 inode->i_uid = uid;
1490 inode->i_gid = gid;
1491 root = d_alloc_root(inode);
1492 if (!root) {
1493 iput(inode);
1494 return NULL;
1495 }
1496 sb->s_root = root;
1497 return sb;
1498 }
- 1452The parameters are:
- sb is the super_block to populate
- data contains the mount arguments
- silent is unused in this function
- 1457-1459Set the default mode, uid and gid. These may be overridden
with the parameters passed as mount options
- 1460Each super_block is allowed to have a
filesystem specific struct that is contained within a union called
super_block→u. The macro SHMEM_SB() returns
the struct shmem_sb_info contained within this union
- 1467si_meminfo() populates struct sysinfo
with total memory, available memory and usage statistics. The function is
defined in arch/i386/mm/init.c and is architecture dependent
- 1468By default, only allow the filesystem to consume half of total
available physical memory
- 1471-1472If tmpfs is available, parse the mount options allowing them
to override the defaults
- 1475Initialise the spinlock protecting sbinfo which is the
struct shmem_sb_info in the super_block
- 1476-1484Populate the sb and sbinfo fields
- 1484The shmem_ops is a struct of function pointers for
super block operations such as remounting the filesystem and deleting an
inode
- 1485-1487This block allocates a special inode which represents the
root of the filesystem
- 1489-1490Set the uid and gid of the root of the new filesystem
- 1496Set the root dentry of the new filesystem into the super_block
- 1497Return the populated superblock
L.1.3 Function: shmem_set_size
Source: mm/shmem.c
This function updates the number of available blocks and inodes in the
filesystem. It is set while the filesystem is being mounted or remounted.
861 static int shmem_set_size(struct shmem_sb_info *info,
862 unsigned long max_blocks,
unsigned long max_inodes)
863 {
864 int error;
865 unsigned long blocks, inodes;
866
867 spin_lock(&info->stat_lock);
868 blocks = info->max_blocks - info->free_blocks;
869 inodes = info->max_inodes - info->free_inodes;
870 error = -EINVAL;
871 if (max_blocks < blocks)
872 goto out;
873 if (max_inodes < inodes)
874 goto out;
875 error = 0;
876 info->max_blocks = max_blocks;
877 info->free_blocks = max_blocks - blocks;
878 info->max_inodes = max_inodes;
879 info->free_inodes = max_inodes - inodes;
880 out:
881 spin_unlock(&info->stat_lock);
882 return error;
883 }
- 861The parameters are the info representing the filesystem
superblock, the maximum number of blocks (max_blocks) and the
maximum number of inodes (max_inodes)
- 867Lock the superblock info spinlock
- 868Calculate the number of blocks currently in use by the
filesystem. On initial mount, this is unimportant, but if the filesystem is
being remounted, the function must make sure that the new filesystem is not
too small
- 869Calculate the number of inodes currently in use
- 871-872If the remounted filesystem would have too few blocks to store
the current information, goto out to return -EINVAL
- 873-874Similarly, make sure there are enough available inodes or
return -EINVAL
- 875It is safe to mount the filesystem so set error to 0
indicating that this operation will be successful
- 876-877Set the maximum number of blocks and number of available blocks
in the filesystems superblock info struct
- 878-879Set the maximum and available number of inodes
- 881Unlock the filesystems superblock info struct
- 882Return 0 if successful or -EINVAL if not
L.2 Creating Files in tmpfs
L.2.1 Function: shmem_create
Source: mm/shmem.c
This is the top-level function called when creating a new file.
1164 static int shmem_create(struct inode *dir,
struct dentry *dentry,
int mode)
1165 {
1166 return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
1167 }
- 1164The parameters are:
-
- dir is the inode of the directory the new file is being created in
- dentry is the dentry of the new file being created
- mode is the flags passed to the open system call
- 1166Call shmem_mknod()(See Section L.2.2) adding the
S_IFREG flag to the mode flags so a regular file will be created
L.2.2 Function: shmem_mknod
Source: mm/shmem.c
1139 static int shmem_mknod(struct inode *dir,
struct dentry *dentry,
int mode, int dev)
1140 {
1141 struct inode *inode = shmem_get_inode(dir->i_sb, mode, dev);
1142 int error = -ENOSPC;
1143
1144 if (inode) {
1145 dir->i_size += BOGO_DIRENT_SIZE;
1146 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1147 d_instantiate(dentry, inode);
1148 dget(dentry); /* Extra count - pin the dentry in core */
1149 error = 0;
1150 }
1151 return error;
1152 }
- 1141Call shmem_get_inode() (See Section L.2.3)
to create a new inode
- 1144If the inode was successfully created, update the directory
statistics and instantiate the new file
- 1145Update the size of the directory
- 1146Update the ctime and mtime fields
- 1147Instantiate the inode
- 1148Take a reference to the dentry so that it will be pinned and not
accidentally reclaimed during pageout. Unlike normal files, there is no
automatic way of recreating dentries once they are deleted
- 1149Indicate the call ended successfully
- 1151Return success or -ENOSPC on error
L.2.3 Function: shmem_get_inode
Source: mm/shmem.c
809 struct inode *shmem_get_inode(struct super_block *sb,
int mode,
int dev)
810 {
811 struct inode *inode;
812 struct shmem_inode_info *info;
813 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
814
815 spin_lock(&sbinfo->stat_lock);
816 if (!sbinfo->free_inodes) {
817 spin_unlock(&sbinfo->stat_lock);
818 return NULL;
819 }
820 sbinfo->free_inodes--;
821 spin_unlock(&sbinfo->stat_lock);
822
823 inode = new_inode(sb);
This preamble section is responsible for updating the free inode count and
allocating an inode with new_inode().
- 815Acquire the sbinfo spinlock as it is about to be updated
- 816-819Make sure there are free inodes and if not, return NULL
- 820-821Update the free inode count and release the lock
- 823new_inode() is part of the filesystem layer and declared
in <linux/fs.h>. Exactly how it works is beyond the scope of this
document but the summary is simple. It allocates an inode from the slab
allocator, zeros most fields and populates inode→i_sb,
inode→i_dev and inode→i_blkbits based
on information in the super block
824 if (inode) {
825 inode->i_mode = mode;
826 inode->i_uid = current->fsuid;
827 inode->i_gid = current->fsgid;
828 inode->i_blksize = PAGE_CACHE_SIZE;
829 inode->i_blocks = 0;
830 inode->i_rdev = NODEV;
831 inode->i_mapping->a_ops = &shmem_aops;
832 inode->i_atime = inode->i_mtime
= inode->i_ctime
= CURRENT_TIME;
833 info = SHMEM_I(inode);
834 info->inode = inode;
835 spin_lock_init(&info->lock);
836 switch (mode & S_IFMT) {
837 default:
838 init_special_inode(inode, mode, dev);
839 break;
840 case S_IFREG:
841 inode->i_op = &shmem_inode_operations;
842 inode->i_fop = &shmem_file_operations;
843 spin_lock(&shmem_ilock);
844 list_add_tail(&info->list, &shmem_inodes);
845 spin_unlock(&shmem_ilock);
846 break;
847 case S_IFDIR:
848 inode->i_nlink++;
849 /* Some things misbehave if size == 0 on a directory */
850 inode->i_size = 2 * BOGO_DIRENT_SIZE;
851 inode->i_op = &shmem_dir_inode_operations;
852 inode->i_fop = &dcache_dir_ops;
853 break;
854 case S_IFLNK:
855 break;
856 }
857 }
858 return inode;
859 }
- 824-858Fill in the inode fields if created successfully
- 825-830Fill in the basic inode information
- 831Set the address_space_operations
to use shmem_aops which sets up the function
shmem_writepage()(See Section L.6.1) to be used as a
page writeback callback for the address_space
- 832-834Fill in more basic information
- 835Initialise the spinlock protecting the inode's private information
- 836-856Determine how to fill the remaining fields based on the mode
flags passed in
- 838In this case, a special inode is being created. Specifically,
this is for device files, FIFOs and sockets created with mknod()
- 840-846Create an inode for a regular file. The main point to note here
is that the inode→i_op and inode→i_fop
fields are set to shmem_inode_operations and
shmem_file_operations respectively
- 847-852Create an inode for a new directory. The i_nlink
and i_size fields are updated to show the increased number of
files and the size of the directory. The main point to note here is that the
inode→i_op and inode→i_fop fields are
set to shmem_dir_inode_operations and dcache_dir_ops
respectively
- 854-855If linking a file, do nothing for now as it is handled by the
parent function shmem_link()
- 858Return the new inode or NULL if it could not be created
L.3 File Operations in tmpfs
L.3.1 Memory Mapping
The tasks for memory mapping a virtual file are simple. The only changes that
need to be made is to update the VMAs vm_operations_struct
field (vma→vm_ops) to use the shmfs equivalents for
faulting.
L.3.1.1 Function: shmem_mmap
Source: mm/shmem.c
796 static int shmem_mmap(struct file * file, struct vm_area_struct * vma)
797 {
798 struct vm_operations_struct *ops;
799 struct inode *inode = file->f_dentry->d_inode;
800
801 ops = &shmem_vm_ops;
802 if (!S_ISREG(inode->i_mode))
803 return -EACCES;
804 UPDATE_ATIME(inode);
805 vma->vm_ops = ops;
806 return 0;
807 }
- 801ops is now the vm_operations_struct to be
used for the virtual filesystem
- 802Make sure that the inode being mapped is a regular file. If
not, return -EACCES
- 804Update the atime for the inode to show it was accessed
- 805Update vma→vm_ops so that
shmem_nopage() (See Section L.5.1.1) will be used to handle
page faults within the mapping
L.3.2 Reading Files
L.3.2.1 Function: shmem_file_read
Source: mm/shmem.c
This is the top-level function called for read()ing a tmpfs file.
1088 static ssize_t shmem_file_read(struct file *filp, char *buf,
size_t count, loff_t *ppos)
1089 {
1090 read_descriptor_t desc;
1091
1092 if ((ssize_t) count < 0)
1093 return -EINVAL;
1094 if (!access_ok(VERIFY_WRITE, buf, count))
1095 return -EFAULT;
1096 if (!count)
1097 return 0;
1098
1099 desc.written = 0;
1100 desc.count = count;
1101 desc.buf = buf;
1102 desc.error = 0;
1103
1104 do_shmem_file_read(filp, ppos, &desc);
1105 if (desc.written)
1106 return desc.written;
1107 return desc.error;
1108 }
- 1088The parameters are:
-
- filp is a pointer to the struct file being read
- buf is the buffer that should be filled
- count is the number of bytes that should be read
- ppos is the current position
- 1092-1093count cannot be negative
- 1094-1095access_ok() ensures that it is safe to write
count number of bytes to the userspace buffer. If it can't,
-EFAULT will be returned
- 1099-1102Initialise a read_descriptor_t struct which will
eventually be passed to file_read_actor()(See Section L.3.2.3)
- 1104Call do_shmem_file_read() to start performing the
actual read
- 1105-1106Return the number of bytes that were written to the userspace
buffer
- 1107If none were written, return the error
L.3.2.2 Function: do_shmem_file_read
Source: mm/shmem.c
This function retrieves the pages needed for the file read with
shmem_getpage() and calls file_read_actor() to copy the
data to userspace.
1003 static void do_shmem_file_read(struct file *filp,
loff_t *ppos,
read_descriptor_t *desc)
1004 {
1005 struct inode *inode = filp->f_dentry->d_inode;
1006 struct address_space *mapping = inode->i_mapping;
1007 unsigned long index, offset;
1008
1009 index = *ppos >> PAGE_CACHE_SHIFT;
1010 offset = *ppos & ~PAGE_CACHE_MASK;
1011
1012 for (;;) {
1013 struct page *page = NULL;
1014 unsigned long end_index, nr, ret;
1015
1016 end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1017 if (index > end_index)
1018 break;
1019 if (index == end_index) {
1020 nr = inode->i_size & ~PAGE_CACHE_MASK;
1021 if (nr <= offset)
1022 break;
1023 }
1024
1025 desc->error = shmem_getpage(inode, index, &page, SGP_READ);
1026 if (desc->error) {
1027 if (desc->error == -EINVAL)
1028 desc->error = 0;
1029 break;
1030 }
1031
1036 nr = PAGE_CACHE_SIZE;
1037 end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1038 if (index == end_index) {
1039 nr = inode->i_size & ~PAGE_CACHE_MASK;
1040 if (nr <= offset) {
1041 page_cache_release(page);
1042 break;
1043 }
1044 }
1045 nr -= offset;
1046
1047 if (page != ZERO_PAGE(0)) {
1053 if (mapping->i_mmap_shared != NULL)
1054 flush_dcache_page(page);
1055 /*
1056 * Mark the page accessed if we read the
1057 * beginning or we just did an lseek.
1058 */
1059 if (!offset || !filp->f_reada)
1060 mark_page_accessed(page);
1061 }
1062
1073 ret = file_read_actor(desc, page, offset, nr);
1074 offset += ret;
1075 index += offset >> PAGE_CACHE_SHIFT;
1076 offset &= ~PAGE_CACHE_MASK;
1077
1078 page_cache_release(page);
1079 if (ret != nr || !desc->count)
1080 break;
1081 }
1082
1083 *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
1084 filp->f_reada = 1;
1085 UPDATE_ATIME(inode);
1086 }
- 1005-1006Retrieve the inode and mapping using the
struct file
- 1009index is the page index within the file that contains the
data
- 1010offset is the offset within the page that is currently
being read
- 1012-1081Loop until the requested number of bytes has been read.
nr is the number of bytes that are still to be read within the
current page. desc→count starts as the number of bytes to
read and is decremented by file_read_actor() (See Section L.3.2.3)
- 1016-1018end_index is the index of the last page in the file.
Break when the end of the file is reached
- 1019-1023When the last page is reached, set nr to be
the number of bytes to be read within this page. If the file pointer is after
nr, break as there is no more data to be read. This could happen
after the file was truncated
- 1025-1030shmem_getpage()(See Section L.5.1.2) will
locate the requested page in the page cache, swap cache or page it in. If an
error occurs, record it in desc→error and return
- 1036nr is the number of bytes that must be read from the page
so initialise it to the size of a page as this full page is being read
- 1037Initialise end_index which is index of the page at the
end of the file
- 1038-1044If this is the last page in the file, update nr
to be the number of bytes in the page. If nr is currently after
the end of the file (could happen after truncate), then release the reference
to the page (taken by shmem_getpage()) and exit the loop
- 1045Update the number of bytes to be read. Remember that
offset is where the file reader is currently within the page
- 1047-1061If the page being read is not the global zero page, take care
of potential aliasing problems by calling flush_dcache_page(). If
the page is being read the first time or an lseek() just occurred
(f_reada is zero), then mark the page accessed with
mark_page_accessed()
- 1073Call file_read_actor()(See Section L.3.2.3)
to copy the data to userspace. It returns the number of bytes that were
copied and updates the user buffer pointers and remaining count
- 1074Update the offset within the page being read
- 1075Move the index to the next page if necessary
- 1076Ensure that offset is an offset within a page
- 1078Release the reference to the page being copied. The reference was
taken by shmem_getpage()
- 1079-1080If the requested bytes have been read, return
- 1083Update the file pointer
- 1084Enable file readahead
- 1085Update the access time for the inode as it has just been read from
L.3.2.3 Function: file_read_actor
Source: mm/filemap.c
This function is responsible for copying data from a page to a userspace
buffer. It is ultimately called by a number of functions including
generic_file_read(), generic_file_write() and
shmem_file_read().
1669 int file_read_actor(read_descriptor_t * desc,
struct page *page,
unsigned long offset,
unsigned long size)
1670 {
1671 char *kaddr;
1672 unsigned long left, count = desc->count;
1673
1674 if (size > count)
1675 size = count;
1676
1677 kaddr = kmap(page);
1678 left = __copy_to_user(desc->buf, kaddr + offset, size);
1679 kunmap(page);
1680
1681 if (left) {
1682 size -= left;
1683 desc->error = -EFAULT;
1684 }
1685 desc->count = count - size;
1686 desc->written += size;
1687 desc->buf += size;
1688 return size;
1689 }
- 1669The parameters are:
-
- desc is a structure containing information about the read, including
the buffer and the total number of bytes that are to be read from this file
- page is the page containing file data that is to be copied to userspace
- offset is the offset within the page that is being copied
- size is the number of bytes to be read from page
- 1672count is now the number of bytes that are to be read
from the file
- 1674-1675Make sure to not read more bytes than are requested
- 1677Map the page into low memory with kmap(). See Section
I.1.0.5
- 1678Copy the data from the kernel page to the userspace buffer
- 1679Unmap the page. See Section I.3.1
- 1681-1684If
all the bytes were not copied, it must be
because the buffer was not accessible. Update size so that
desc→count will reflect how many bytes are still to be
copied by the read. -EFAULT will be returned to the process
performing the read
- 1685-1687Update the desc struct to show the current status of
the read
- 1688Return the number of bytes that were written to the userspace
buffer
L.3.3.1 Function: shmem_file_write
Source: mm/shmem.c
925 shmem_file_write(struct file *file, const char *buf,
size_t count, loff_t *ppos)
926 {
927 struct inode *inode = file->f_dentry->d_inode;
928 loff_t pos;
929 unsigned long written;
930 int err;
931
932 if ((ssize_t) count < 0)
933 return -EINVAL;
934
935 if (!access_ok(VERIFY_READ, buf, count))
936 return -EFAULT;
937
938 down(&inode->i_sem);
939
940 pos = *ppos;
941 written = 0;
942
943 err = precheck_file_write(file, inode, &count, &pos);
944 if (err || !count)
945 goto out;
946
947 remove_suid(inode);
948 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
949
Function preamble.
- 927Get the inode that represents the file being written
- 932-933Return -EINVAL if the user tries to write a
negative number of bytes
- 935-936Return -EFAULT if the userspace buffer is
inaccessible
- 938Acquire the semaphore protecting the inode
- 940Record the beginning of where the write is taking place
- 941Initialise the written number of bytes to 0
- 943precheck_file_write() performs a number of checks to
make sure the write is ok to proceed. This includes updating pos to
be the end of the file if opened in append mode and checking that the process
limits will not be exceeded
- 944-945If the write cannot proceed, goto out
- 947Clear the SUID bit if it is set
- 948Update the inodes ctime and mtime
950 do {
951 struct page *page = NULL;
952 unsigned long bytes, index, offset;
953 char *kaddr;
954 int left;
955
956 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
957 index = pos >> PAGE_CACHE_SHIFT;
958 bytes = PAGE_CACHE_SIZE - offset;
959 if (bytes > count)
960 bytes = count;
961
962 /*
963 * We don't hold page lock across copy from user -
964 * what would it guard against? - so no deadlock here.
965 */
966
967 err = shmem_getpage(inode, index, &page, SGP_WRITE);
968 if (err)
969 break;
970
971 kaddr = kmap(page);
972 left = __copy_from_user(kaddr + offset, buf, bytes);
973 kunmap(page);
974
975 written += bytes;
976 count -= bytes;
977 pos += bytes;
978 buf += bytes;
979 if (pos > inode->i_size)
980 inode->i_size = pos;
981
982 flush_dcache_page(page);
983 SetPageDirty(page);
984 SetPageReferenced(page);
985 page_cache_release(page);
986
987 if (left) {
988 pos -= left;
989 written -= left;
990 err = -EFAULT;
991 break;
992 }
993 } while (count);
994
995 *ppos = pos;
996 if (written)
997 err = written;
998 out:
999 up(&inode->i_sem);
1000 return err;
1001 }
- 950-993Loop until all the requested bytes have been written
- 956Set offset to be the offset within the current page being
written
- 957index is the page index within the file current being
written
- 958bytes is the number of bytes within the current page
remaining to be written
- 959-960If bytes indicates that more bytes should be written
than was requested (count), set bytes to count
- 967-969Locate the page to be written to. The SGP_WRITE
flag indicates that a page should be allocated if one does not already exist.
If the page could not be found or allocated, break out of the loop
- 971-973Map the page to be written to and copy the bytes from the
userspace buffer before unmapping the page again
- 975Update the number of bytes written
- 976Update the number of bytes remaining to write
- 977Update the position within the file
- 978Update the pointer within the userspace buffer
- 979-980If the file is now bigger, update
inode→i_size
- 982Flush the dcache to avoid aliasing problems
- 983-984Set the page dirty and referenced
- 985Release the reference to the page taken by
shmem_getpage()
- 987-992If all the requested bytes were not read from the userspace
buffer, update the written statistics and the position within the file and
buffer
- 995Update the file pointer
- 996-997If all the requested bytes were not written, set the error
return variable
- 999Release the inodes semaphore
- 1000Return success or else return the number of bytes remaining to be
written
L.3.4 Symbolic Linking
L.3.4.1 Function: shmem_symlink
Source: mm/shmem.c
This function is responsible for creating a symbolic link symname
and deciding where to store the information. The name of the link will be
stored in the inode if the name is small enough and in a page frame otherwise.
1272 static int shmem_symlink(struct inode * dir,
struct dentry *dentry,
const char * symname)
1273 {
1274 int error;
1275 int len;
1276 struct inode *inode;
1277 struct page *page = NULL;
1278 char *kaddr;
1279 struct shmem_inode_info *info;
1280
1281 len = strlen(symname) + 1;
1282 if (len > PAGE_CACHE_SIZE)
1283 return -ENAMETOOLONG;
1284
1285 inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0);
1286 if (!inode)
1287 return -ENOSPC;
1288
1289 info = SHMEM_I(inode);
1290 inode->i_size = len-1;
This block performs basic sanity checks and creates a new inode for the
symbolic link.
- 1272The parameter symname is the name of the link to create
- 1281Calculate the length (len) of the link
- 1282-1283If the name is larger than a page, return
-ENAMETOOLONG
- 1285-1287Allocate a new inode. Return -ENOSPC
if it fails
- 1289Get the private information struct
- 1290The size of the inode is the length of the link
1291 if (len <= sizeof(struct shmem_inode_info)) {
1292 /* do it inline */
1293 memcpy(info, symname, len);
1294 inode->i_op = &shmem_symlink_inline_operations;
1295 } else {
1296 error = shmem_getpage(inode, 0, &page, SGP_WRITE);
1297 if (error) {
1298 iput(inode);
1299 return error;
1300 }
1301 inode->i_op = &shmem_symlink_inode_operations;
1302 spin_lock(&shmem_ilock);
1303 list_add_tail(&info->list, &shmem_inodes);
1304 spin_unlock(&shmem_ilock);
1305 kaddr = kmap(page);
1306 memcpy(kaddr, symname, len);
1307 kunmap(page);
1308 SetPageDirty(page);
1309 page_cache_release(page);
1310 }
This block is responsible for storing the link information.
- 1291-1295If the length of the name is smaller than the space used for
the shmem_inode_info, then copy the name into the space
reserved for the private struct
- 1294Set the inode→i_op to
shmem_symlink_inline_operations which has functions which know
the link name is in the inode
- 1295-1310Allocate a page to store the link in
- 1296Allocate a page with shmem_getpage(), which will create one
if it does not already exist
- 1297-1300If an error occurred, drop the reference to the inode and
return the error
- 1301Use shmem_symlink_inode_operations which understands
that the link information is contained within a page
- 1302shmem_ilock is a global spinlock which protects a
global linked list of inodes which are linked via the private information
structs info→list field
- 1303Add the new inode to the global list
- 1304Release shmem_ilock
- 1305Map the page
- 1306Copy in the link information
- 1307Unmap the page
- 1308Set the page dirty
- 1309Release our reference to the page taken by
shmem_getpage()
1311 dir->i_size += BOGO_DIRENT_SIZE;
1312 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1313 d_instantiate(dentry, inode);
1314 dget(dentry);
1315 return 0;
1316 }
- 1311Increment the size of the directory as a new inode has been
added. BOGO_DIRENT_SIZE is just a pseudo size of inodes so that
ls output looks nice
- 1312Update the i_ctime and i_mtime
- 1313-1314Instantiate the inode
- 1315Return success
L.3.4.2 Function: shmem_readlink_inline
Source: mm/shmem.c
1318 static int shmem_readlink_inline(struct dentry *dentry,
char *buffer, int buflen)
1319 {
1320 return vfs_readlink(dentry, buffer, buflen,
(const char *)SHMEM_I(dentry->d_inode));
1321 }
- 1320The link name is contained within the inode so pass it as a
parameter to the VFS layer with vfs_readlink()
L.3.4.3 Function: shmem_follow_link_inline
Source: mm/shmem.c
1323 static int shmem_follow_link_inline(struct dentry *dentry,
struct nameidata *nd)
1324 {
1325 return vfs_follow_link(nd,
(const char *)SHMEM_I(dentry->d_inode));
1326 }
- 1325The link name is contained within the inode so pass it as a
parameter to the VFS layer with vfs_follow_link()
L.3.4.4 Function: shmem_readlink
Source: mm/shmem.c
1328 static int shmem_readlink(struct dentry *dentry,
char *buffer, int buflen)
1329 {
1330 struct page *page = NULL;
1331 int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ);
1332 if (res)
1333 return res;
1334 res = vfs_readlink(dentry,buffer,buflen, kmap(page));
1335 kunmap(page);
1336 mark_page_accessed(page);
1337 page_cache_release(page);
1338 return res;
1339 }
- 1331The link name is contained in a page associated with the symlink so
call shmem_getpage()(See Section L.5.1.2) to get a pointer
to it
- 1332-1333If an error occurred, return the error
- 1334Map the page with kmap() (See Section I.1.0.5) and pass
it as a pointer to vfs_readlink(). The link is at the beginning
of the page
- 1335Unmap the page
- 1336Mark the page accessed
- 1337Drop our reference to the page taken by shmem_getpage()
- 1338Return the link
L.3.4.5 Function: shmem_follow_link
Source: mm/shmem.c
1231 static int shmem_follow_link(struct dentry *dentry,
struct nameidata *nd)
1232 {
1233 struct page * page;
1234 int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ);
1235 if (res)
1236 return res;
1237
1238 res = vfs_follow_link(nd, kmap(page));
1239 kunmap(page);
1240 page_cache_release(page);
1241 return res;
1242 }
- 1234The link name is within a page so get the page with
shmem_getpage()
- 1235-1236Return the error if one occurred
- 1238Map the page and pass it as a pointer to
vfs_follow_link()
- 1239Unmap the page
- 1240Drop our reference to the page
- 1241Return success
L.3.5 Synchronising
L.3.5.1 Function: shmem_sync_file
Source: mm/shmem.c
This function simply returns 0 as the file exists only in memory and does not
need to be synchronised with a file on disk.
1446 static int shmem_sync_file(struct file * file,
struct dentry *dentry,
int datasync)
1447 {
1448 return 0;
1449 }
L.4 Inode Operations in tmpfs
L.4.1 Truncating
L.4.1.1 Function: shmem_truncate
Source: mm/shmem.c
By the time this function has been called, the inode→i_size
has been set to the new size by vmtruncate(). It is the job of
this function to either create or remove pages as necessary to set the size
of the file.
351 static void shmem_truncate(struct inode *inode)
352 {
353 struct shmem_inode_info *info = SHMEM_I(inode);
354 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
355 unsigned long freed = 0;
356 unsigned long index;
357
358 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
359 index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
360 if (index >= info->next_index)
361 return;
362
363 spin_lock(&info->lock);
364 while (index < info->next_index)
365 freed += shmem_truncate_indirect(info, index);
366 BUG_ON(info->swapped > info->next_index);
367 spin_unlock(&info->lock);
368
369 spin_lock(&sbinfo->stat_lock);
370 sbinfo->free_blocks += freed;
371 inode->i_blocks -= freed*BLOCKS_PER_PAGE;
372 spin_unlock(&sbinfo->stat_lock);
373 }
- 353Get the private filesystem information for this inode with
SHMEM_I()
- 354Get the superblock private information
- 358Update the ctime and mtime for the inode
- 359Get the index of the page that is the new end of the file. The old
size is stored in info→next_index
- 360-361If the file is being expanded, just return as the global zero
page will be used to represent the expanded region
- 363Acquire the private info spinlock
- 364-365Continually call shmem_truncate_indirect() until the
file is truncated to the desired size
- 366It is a bug if the shmem_inode_info struct indicates
that there are more pages swapped out than there are pages in the file
- 367release the private info spinlock
- 369Acquire the superblock private info spinlock
- 370Update the number of free blocks available
- 371Update the number of blocks being used by this inode
- 372Release the superblock private info spinlock
L.4.1.2 Function: shmem_truncate_indirect
Source: mm/shmem.c
This function locates the last doubly-indirect block in the inode and calls
shmem_truncate_direct() to truncate it.
308 static inline unsigned long
309 shmem_truncate_indirect(struct shmem_inode_info *info,
unsigned long index)
310 {
311 swp_entry_t ***base;
312 unsigned long baseidx, start;
313 unsigned long len = info->next_index;
314 unsigned long freed;
315
316 if (len <= SHMEM_NR_DIRECT) {
317 info->next_index = index;
318 if (!info->swapped)
319 return 0;
320 freed = shmem_free_swp(info->i_direct + index,
321 info->i_direct + len);
322 info->swapped -= freed;
323 return freed;
324 }
325
326 if (len <= ENTRIES_PER_PAGEPAGE/2 + SHMEM_NR_DIRECT) {
327 len -= SHMEM_NR_DIRECT;
328 base = (swp_entry_t ***) &info->i_indirect;
329 baseidx = SHMEM_NR_DIRECT;
330 } else {
331 len -= ENTRIES_PER_PAGEPAGE/2 + SHMEM_NR_DIRECT;
332 BUG_ON(len > ENTRIES_PER_PAGEPAGE*ENTRIES_PER_PAGE/2);
333 baseidx = len - 1;
334 baseidx -= baseidx % ENTRIES_PER_PAGEPAGE;
335 base = (swp_entry_t ***) info->i_indirect +
336 ENTRIES_PER_PAGE/2 + baseidx/ENTRIES_PER_PAGEPAGE;
337 len -= baseidx;
338 baseidx += ENTRIES_PER_PAGEPAGE/2 + SHMEM_NR_DIRECT;
339 }
340
341 if (index > baseidx) {
342 info->next_index = index;
343 start = index - baseidx;
344 } else {
345 info->next_index = baseidx;
346 start = 0;
347 }
348 return *base? shmem_truncate_direct(info, base, start, len): 0;
349 }
- 313len is the second last page that is currently in use by
the file
- 316-324If the file is small and all entries are stored in the direct
block information, simply call shmem_free_swp() passing it the first
swap entry in info→i_direct and the number of entries to
truncate
- 326-339The pages to be truncated are in the indirect blocks
somewhere. This section of code is dedicated to calculating three variables,
base, baseidx and len. base is the
beginning of the page that contains pointers to swap entries to be truncated.
baseidx is the page index of the first entry within the indirect
block being used and len is the number of entries to be truncated
from in this pass
- 326-330This calculates the variables for a doubly indirect
block. The base is then set to the swap entry at the
beginning of info→i_indirect. baseidx
is SHMEM_NR_DIRECT which is the page index at the beginning
of info→i_indirect. At this point, len is the
number of pages in the file so the number of direct blocks is subtracted to
leave the remaining number of pages
- 330-339Else this is a triply indirect block so the next level must
be traversed before the base, baseidx and len
are calculated
- 341-344If the file is going to be bigger after the truncation,
update next_index to the new end of file and make start
the beginning of the indirect block
- 344-347If the file is being made smaller, move the current end of
the file to the beginning of this indirect block that is about to be truncated
- 348If there is a block at base, call
shmem_truncate_direct() to truncate pages in it
L.4.1.3 Function: shmem_truncate_direct
Source: mm/shmem.c
This function is responsible for cycling through an indirect block and calling
shmem_free_swp for each page that contains swap vectors which are
to be truncated.
264 static inline unsigned long
265 shmem_truncate_direct(struct shmem_inode_info *info,
swp_entry_t ***dir,
unsigned long start, unsigned long len)
266 {
267 swp_entry_t **last, **ptr;
268 unsigned long off, freed_swp, freed = 0;
269
270 last = *dir + (len + ENTRIES_PER_PAGE - 1) / ENTRIES_PER_PAGE;
271 off = start % ENTRIES_PER_PAGE;
272
273 for (ptr = *dir + start/ENTRIES_PER_PAGE; ptr < last; ptr++, off = 0) {
274 if (!*ptr)
275 continue;
276
277 if (info->swapped) {
278 freed_swp = shmem_free_swp(*ptr + off,
279 *ptr + ENTRIES_PER_PAGE);
280 info->swapped -= freed_swp;
281 freed += freed_swp;
282 }
283
284 if (!off) {
285 freed++;
286 free_page((unsigned long) *ptr);
287 *ptr = 0;
288 }
289 }
290
291 if (!start) {
292 freed++;
293 free_page((unsigned long) *dir);
294 *dir = 0;
295 }
296 return freed;
297 }
- 270last is the last page within the indirect block that is
to be truncated
- 271off is the offset within the page that the truncation is
to if this is a partial truncation rather than a full page truncation
- 273-289Beginning with the startth block in dir,
truncate pages until last is reached
- 274-275If no page is here, continue to the next one
- 277-282If the info struct indicates that there are pages
swapped out belonging to this inode, call shmem_free_swp() to free
any swap slot associated with this page. If one was freed, update
info→swapped and increment the count of the freed
number of pages
- 284-288If this is not a partial truncate, free the page
- 291-295If this whole indirect block is now free, reclaim the page
- 296Return the number of pages freed
L.4.1.4 Function: shmem_free_swp
Source: mm/shmem.c
This frees count number of swap entries starting with the entry at
dir.
240 static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir)
241 {
242 swp_entry_t *ptr;
243 int freed = 0;
244
245 for (ptr = dir; ptr < edir; ptr++) {
246 if (ptr->val) {
247 free_swap_and_cache(*ptr);
248 *ptr = (swp_entry_t){0};
249 freed++;
250 }
251 }
252 return freed;
254 }
- 245-251Loop through each of the swap entries to be freed
- 246-250If a swap entry exists, free it with
free_swap_and_cache() and set the swap entry to 0. Increment the
number of pages freed
- 252Return the total number of pages freed
L.4.2.1 Function: shmem_link
Source: mm/shmem.c
This function creates a hard link with dentry to
old_dentry.
1172 static int shmem_link(struct dentry *old_dentry,
struct inode *dir,
struct dentry *dentry)
1173 {
1174 struct inode *inode = old_dentry->d_inode;
1175
1176 if (S_ISDIR(inode->i_mode))
1177 return -EPERM;
1178
1179 dir->i_size += BOGO_DIRENT_SIZE;
1180 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1181 inode->i_nlink++;
1182 atomic_inc(&inode->i_count);
1183 dget(dentry);
1184 d_instantiate(dentry, inode);
1185 return 0;
1186 }
- 1174Get the inode corresponding to old_dentry
- 1176-1177If it is linking to a directory, return -EPERM.
Strictly speaking, root should be allowed to hard-link directories although it
is not recommended because of the possibility of creating a loop within the
filesystem which utilities like find get lost in. tmpfs
simply does not allow the hard-linking of directories
- 1179Increment the size of the directory with the new link
- 1180Update the directory's mtime and ctime. Update
the inode's ctime
- 1181Increment the number of links leading to inode
- 1183Get an extra reference to the new dentry with
dget()
- 1184Instantiate the new dentry
- 1185Return success
L.4.3.1 Function: shmem_unlink
Source: mm/shmem.c
1221 static int shmem_unlink(struct inode* dir,
struct dentry *dentry)
1222 {
1223 struct inode *inode = dentry->d_inode;
1224
1225 dir->i_size -= BOGO_DIRENT_SIZE;
1226 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1227 inode->i_nlink--;
1228 dput(dentry);
1229 return 0;
1230 }
- 1223Get the inode for the dentry being unlinked
- 1225Update the directory inodes size
- 1226Update the various ctime and mtime variables
- 1227Decrement the number of links to the inode
- 1228Call dput() to decrement the reference to the
dentry. This function will also call iput() to clear up the
inode if its reference count reaches zero
L.4.4 Making Directories
L.4.4.1 Function: shmem_mkdir
Source: mm/shmem.c
1154 static int shmem_mkdir(struct inode *dir,
struct dentry *dentry,
int mode)
1155 {
1156 int error;
1157
1158 if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0)))
1159 return error;
1160 dir->i_nlink++;
1161 return 0;
1162 }
- 1158Call shmem_mknod()(See Section L.2.2) to create
a special file. By specifying the S_IFDIR flag, a directory
will be created
- 1160Increment the parent directory's i_nlink field
L.4.5 Removing Directories
L.4.5.1 Function: shmem_rmdir
Source: mm/shmem.c
1232 static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
1233 {
1234 if (!shmem_empty(dentry))
1235 return -ENOTEMPTY;
1236
1237 dir->i_nlink--;
1238 return shmem_unlink(dir, dentry);
1239 }
- 1234-1235Check to see if the directory is empty with
shmem_empty() (See Section L.4.5.2). If it is not, return
-ENOTEMPTY
- 1237Decrement the parent directory's i_nlink field
- 1238Return the result of shmem_unlink()(See Section L.4.3.1)
which should delete the directory
L.4.5.2 Function: shmem_empty
Source: mm/shmem.c
This function checks to see if a directory is empty or not.
1201 static int shmem_empty(struct dentry *dentry)
1202 {
1203 struct list_head *list;
1204
1205 spin_lock(&dcache_lock);
1206 list = dentry->d_subdirs.next;
1207
1208 while (list != &dentry->d_subdirs) {
1209 struct dentry *de = list_entry(list,
struct dentry, d_child);
1210
1211 if (shmem_positive(de)) {
1212 spin_unlock(&dcache_lock);
1213 return 0;
1214 }
1215 list = list->next;
1216 }
1217 spin_unlock(&dcache_lock);
1218 return 1;
1219 }
- 1205The dcache_lock protects many things but it mainly
protects dcache lookups which is what will be required for this function so
acquire it
- 1208Cycle through the subdirs list, which contains all child
dentries, and see if one active dentry can be found. If one is, 0 will be
returned, indicating the directory is not empty
- 1209Get the dentry for this child
- 1211shmem_positive()(See Section L.4.5.3) returns
if the dentry has a valid inode associated with it and is currently hashed. If
it's hashed, it means that the dentry is active and the directory is not empty
- 1212-1213If the directory is not empty, free the spinlock and return
- 1215Move to the next child
- 1217-1218The directory is empty. Free the spinlock and return
L.4.5.3 Function: shmem_positive
Source: mm/shmem.c
1188 static inline int shmem_positive(struct dentry *dentry)
1189 {
1190 return dentry->d_inode && !d_unhashed(dentry);
1191 }
- 1190Return true if the dentry has a valid inode and is currently
hashed
L.5 Page Faulting within a Virtual File
L.5.1 Reading Pages during Page Fault
L.5.1.1 Function: shmem_nopage
Source: mm/shmem.c
This is the toplevel nopage() function that is called by
do_no_page() when faulting in a page. This is called regardless of
the fault being the first fault or if it is being faulted in from backing
storage.
763 struct page * shmem_nopage(struct vm_area_struct *vma,
unsigned long address,
int unused)
764 {
765 struct inode *inode = vma->vm_file->f_dentry->d_inode;
766 struct page *page = NULL;
767 unsigned long idx;
768 int error;
769
770 idx = (address - vma->vm_start) >> PAGE_SHIFT;
771 idx += vma->vm_pgoff;
772 idx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
773
774 error = shmem_getpage(inode, idx, &page, SGP_CACHE);
775 if (error)
776 return (error == -ENOMEM)? NOPAGE_OOM: NOPAGE_SIGBUS;
777
778 mark_page_accessed(page);
779 flush_page_to_ram(page);
780 return page;
781 }
- 763The two parameters of relevance are the VMA the fault occurred in
and the faulting address
- 765Record the inode the fault occurred in
- 770-772Calculate the idx as the offset in counts of
PAGE_SIZE within the virtual file
- 772This adjustment takes into account the possibility that an entry in
the page cache is a different size to a page. At the moment, there is no
difference
- 774-775shmem_getpage()(See Section L.5.1.2) is
responsible for locating the page at idx
- 775-776If an error occured, decide whether to return an OOM error or
an invalid faulting address error
- 778Mark the page accessed so it will be moved to the top of the LRU
lists
- 779flush_page_to_ram() is responsible for avoiding d-cache
aliasing problems
- 780Return the faulted-in page
L.5.1.2 Function: shmem_getpage
Source: mm/shmem.c
583 static int shmem_getpage(struct inode *inode,
unsigned long idx,
struct page **pagep,
enum sgp_type sgp)
584 {
585 struct address_space *mapping = inode->i_mapping;
586 struct shmem_inode_info *info = SHMEM_I(inode);
587 struct shmem_sb_info *sbinfo;
588 struct page *filepage = *pagep;
589 struct page *swappage;
590 swp_entry_t *entry;
591 swp_entry_t swap;
592 int error = 0;
593
594 if (idx >= SHMEM_MAX_INDEX)
595 return -EFBIG;
596 /*
597 * Normally, filepage is NULL on entry, and either found
598 * uptodate immediately, or allocated and zeroed, or read
599 * in under swappage, which is then assigned to filepage.
600 * But shmem_readpage and shmem_prepare_write pass in a locked
601 * filepage, which may be found not uptodate by other callers
602 * too, and may need to be copied from the swappage read in.
603 */
604 repeat:
605 if (!filepage)
606 filepage = find_lock_page(mapping, idx);
607 if (filepage && Page_Uptodate(filepage))
608 goto done;
609
610 spin_lock(&info->lock);
611 entry = shmem_swp_alloc(info, idx, sgp);
612 if (IS_ERR(entry)) {
613 spin_unlock(&info->lock);
614 error = PTR_ERR(entry);
615 goto failed;
616 }
617 swap = *entry;
- 583The parameters are:
-
- inode is the inode that the fault is occuring in
- idx is the index of the page within the file that is being faulted
- pagep if NULL will become the faulted page if successful. If a valid
page is passed in, this function will make sure it is uptodate
- sgp indicates what type of access this is which determines how a page
will be located and returned
- 586SHMEM_I() returns the shmem_inode_info
contained with the filesystem-specific information within the superblock
information
- 594-595Make sure the index is not beyond the end of the file
- 605-606If no page was passed in with the pagep parameter,
then try and locate the page and lock it with find_lock_page()
(See Section J.1.4.4)
- 607-608If the page was found and is up to date, then goto
done as this function has nothing more to do
- 610Lock the inode private information struct
- 611Search for the swap entry for this idx with
shmem_swp_alloc(). If one did not previously exist, it will be
allocated
- 612-616If an error occured, release the spinlock and return the error
619 if (swap.val) {
620 /* Look it up and read it in.. */
621 swappage = lookup_swap_cache(swap);
622 if (!swappage) {
623 spin_unlock(&info->lock);
624 swapin_readahead(swap);
625 swappage = read_swap_cache_async(swap);
626 if (!swappage) {
627 spin_lock(&info->lock);
628 entry = shmem_swp_alloc(info, idx, sgp);
629 if (IS_ERR(entry))
630 error = PTR_ERR(entry);
631 else if (entry->val == swap.val)
632 error = -ENOMEM;
633 spin_unlock(&info->lock);
634 if (error)
635 goto failed;
636 goto repeat;
637 }
638 wait_on_page(swappage);
639 page_cache_release(swappage);
640 goto repeat;
641 }
642
643 /* We have to do this with page locked to prevent races */
644 if (TryLockPage(swappage)) {
645 spin_unlock(&info->lock);
646 wait_on_page(swappage);
647 page_cache_release(swappage);
648 goto repeat;
649 }
650 if (!Page_Uptodate(swappage)) {
651 spin_unlock(&info->lock);
652 UnlockPage(swappage);
653 page_cache_release(swappage);
654 error = -EIO;
655 goto failed;
656 }
In this block, a valid swap entry exists for the page. The page will be first
searched for in the swap cache and if it does not exist there, it will be read
in from backing storage.
- 619-690This block of lines deal with the case where a valid swap entry
exists
- 621Search for swappage in the swap cache with
lookup_swap_cache() (See Section K.2.4.1)
- 622-641If the page does not exist in the swap cache, read it in from
backing storage with read_swap_cache_async(). Note that in line
638, wait_on_page() is called to wait until the IO completes. Once
the IO completes, the reference to the page is released and the
repeat label is jumped to reacquire the spinlocks and try again
- 644-649Try and lock the page. If it fails, wait until it can be locked
and jump to repeat to try again
- 650-656If the page is not up-to-date, the IO failed for some reason so
return the error
658 delete_from_swap_cache(swappage);
659 if (filepage) {
660 entry->val = 0;
661 info->swapped--;
662 spin_unlock(&info->lock);
663 flush_page_to_ram(swappage);
664 copy_highpage(filepage, swappage);
665 UnlockPage(swappage);
666 page_cache_release(swappage);
667 flush_dcache_page(filepage);
668 SetPageUptodate(filepage);
669 SetPageDirty(filepage);
670 swap_free(swap);
671 } else if (add_to_page_cache_unique(swappage,
672 mapping, idx, page_hash(mapping, idx)) == 0) {
673 entry->val = 0;
674 info->swapped--;
675 spin_unlock(&info->lock);
676 filepage = swappage;
677 SetPageUptodate(filepage);
678 SetPageDirty(filepage);
679 swap_free(swap);
680 } else {
681 if (add_to_swap_cache(swappage, swap) != 0)
682 BUG();
683 spin_unlock(&info->lock);
684 SetPageUptodate(swappage);
685 SetPageDirty(swappage);
686 UnlockPage(swappage);
687 page_cache_release(swappage);
688 goto repeat;
689 }
At this point, the page exists in the swap cache
- 658Delete the page from the swap cache so we can attempt to add it to
the page cache
- 659-670If the caller supplied a page with the pagep
parameter, then update pagep with the data in swappage
- 671-680Else try and add swappage to the page cache. Note that
info→swapped is updated and the page is marked uptodate
before the swap entry is freed with swap_free()
- 681-689If we failed to add the page to the page cache, add it back to
the swap cache with add_to_swap_cache(). The page is marked
uptodate before being unlocked and goto repeat to try again
690 } else if (sgp == SGP_READ && !filepage) {
691 filepage = find_get_page(mapping, idx);
692 if (filepage &&
693 (!Page_Uptodate(filepage) || TryLockPage(filepage))) {
694 spin_unlock(&info->lock);
695 wait_on_page(filepage);
696 page_cache_release(filepage);
697 filepage = NULL;
698 goto repeat;
699 }
700 spin_unlock(&info->lock);
In this block, a valid swap entry does not exist for the idx. If the
page is being read and the pagep is NULL, then locate the page in the
page cache.
- 691Call find_get_page() (See Section J.1.4.1) to find
the page in the page cache
- 692-699If the page was found but was not up to date or could not be
locked, release the spinlock and wait until the page is unlocked. Then goto
repeat to reacquire the spinlock and try again
- 700Release the spinlock
701 } else {
702 sbinfo = SHMEM_SB(inode->i_sb);
703 spin_lock(&sbinfo->stat_lock);
704 if (sbinfo->free_blocks == 0) {
705 spin_unlock(&sbinfo->stat_lock);
706 spin_unlock(&info->lock);
707 error = -ENOSPC;
708 goto failed;
709 }
710 sbinfo->free_blocks--;
711 inode->i_blocks += BLOCKS_PER_PAGE;
712 spin_unlock(&sbinfo->stat_lock);
713
714 if (!filepage) {
715 spin_unlock(&info->lock);
716 filepage = page_cache_alloc(mapping);
717 if (!filepage) {
718 shmem_free_block(inode);
719 error = -ENOMEM;
720 goto failed;
721 }
722
723 spin_lock(&info->lock);
724 entry = shmem_swp_alloc(info, idx, sgp);
725 if (IS_ERR(entry))
726 error = PTR_ERR(entry);
727 if (error || entry->val ||
728 add_to_page_cache_unique(filepage,
729 mapping, idx, page_hash(mapping, idx)) != 0) {
730 spin_unlock(&info->lock);
731 page_cache_release(filepage);
732 shmem_free_block(inode);
733 filepage = NULL;
734 if (error)
735 goto failed;
736 goto repeat;
737 }
738 }
739
740 spin_unlock(&info->lock);
741 clear_highpage(filepage);
742 flush_dcache_page(filepage);
743 SetPageUptodate(filepage);
744 }
Else a page that is not in the page cache is being written to. It will need to
be allocated.
- 702Get the superblock info with SHMEM_SB()
- 703Acquire the superblock info spinlock
- 704-709If there are no free blocks left in the filesystem, release the
spinlocks, set the return error to -ENOSPC and goto
failed;
- 710Decrement the number of available blocks
- 711Increment the block usage count for the inode
- 712Release the superblock private information spinlock
- 714-715If a page was not supplied via pagep, then allocate a
page and swap entry for the new page
- 715Release the info spinlock as page_cache_alloc() may
sleep
- 716Allocate a new page
- 717-721If the allocation failed, free the block with
shmem_free_block() and set the return error to -ENOMEM
before gotoing failed
- 723Reacquire the info spinlock
- 724shmem_swp_alloc() locates a swap entry for the page.
If one does not already exist (which it likely will not for this page), one
will be allocated and returned
- 725-726If no swap entry was found or allocated, set the return error
- 728-729If no error occured, add the page to the page cache
- 730-732If the page was not added to the page cache (because we raced
and another process inserted the page while we had the spinlock released for
example), then drop the reference to the new page and free the block
- 734-735If an error occured, goto failed to report the error
- 736Otherwise, goto repeat where the desired page will be
searched for within the page cache again
- 740Release the info spinlock
- 741Zero-fill the new page
- 742Flush the dcache to avoid possible CPU dcache aliasing
- 743Mark the page as being uptodate
745 done:
746 if (!*pagep) {
747 if (filepage) {
748 UnlockPage(filepage);
749 *pagep = filepage;
750 } else
751 *pagep = ZERO_PAGE(0);
752 }
753 return 0;
754
755 failed:
756 if (*pagep != filepage) {
757 UnlockPage(filepage);
758 page_cache_release(filepage);
759 }
760 return error;
761 }
- 746-752If a page was not passed in via pagep, decide what to
return. If a page was allocated for writing, unlock and return
filepage. Otherwise, the caller is just a reader, so return the
global zero-filled page
- 753Return success
- 755This is the failure path
- 756If a page was allocated by this function and stored in
filepage, unlock it and drop the reference to it which will free it
- 760Return the error code
L.5.2 Locating Swapped Pages
L.5.2.1 Function: shmem_alloc_entry
Source: mm/shmem.c
This function is a top-level function that returns the swap entry
corresponding to a particular page index within a file. If the swap entry does
not exist, one will be allocated.
183 static inline swp_entry_t * shmem_alloc_entry (
struct shmem_inode_info *info,
unsigned long index)
184 {
185 unsigned long page = 0;
186 swp_entry_t * res;
187
188 if (index >= SHMEM_MAX_INDEX)
189 return ERR_PTR(-EFBIG);
190
191 if (info->next_index <= index)
192 info->next_index = index + 1;
193
194 while ((res = shmem_swp_entry(info,index,&page)) ==
ERR_PTR(-ENOMEM)) {
195 page = get_zeroed_page(GFP_USER);
196 if (!page)
197 break;
198 }
199 return res;
200 }
- 188-189SHMEM_MAX_INDEX is calculated at compile-time
and it indicates the largest possible virtual file in pages. If the
index is beyond the maximum possible file size, return
-EFBIG
- 191-192next_index records the index of the page at the
end of the file. inode→i_size alone is insufficient as
the next_index field is needed for file truncation
- 194-198Call shmem_swp_entry() to locate the
swp_entry_t for the requested index.
While searching, shmem_swp_entry() may need a number of pages. If
it does, it returns -ENOMEM which indicates that
get_zeroed_page() should be called before trying again
- 199Return the swp_entry_t
L.5.2.2 Function: shmem_swp_entry
Source: mm/shmem.c
This function uses information within the inode to locate the
swp_entry_t for a given index. The inode itself is able to
store SHMEM_NR_DIRECT swap vectors. After that indirect
blocks are used.
127 static swp_entry_t *shmem_swp_entry (struct shmem_inode_info *info,
unsigned long index,
unsigned long *page)
128 {
129 unsigned long offset;
130 void **dir;
131
132 if (index < SHMEM_NR_DIRECT)
133 return info->i_direct+index;
134 if (!info->i_indirect) {
135 if (page) {
136 info->i_indirect = (void **) *page;
137 *page = 0;
138 }
139 return NULL;
140 }
141
142 index -= SHMEM_NR_DIRECT;
143 offset = index % ENTRIES_PER_PAGE;
144 index /= ENTRIES_PER_PAGE;
145 dir = info->i_indirect;
146
147 if (index >= ENTRIES_PER_PAGE/2) {
148 index -= ENTRIES_PER_PAGE/2;
149 dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE;
150 index %= ENTRIES_PER_PAGE;
151 if (!*dir) {
152 if (page) {
153 *dir = (void *) *page;
154 *page = 0;
155 }
156 return NULL;
157 }
158 dir = ((void **)*dir);
159 }
160
161 dir += index;
162 if (!*dir) {
163 if (!page || !*page)
164 return NULL;
165 *dir = (void *) *page;
166 *page = 0;
167 }
168 return (swp_entry_t *) *dir + offset;
169 }
- 132-133If the index is below SHMEM_NR_DIRECT,
then the swap vector is contained within the direct block so return it
- 134-140If a page does not exist at this indirect block, install the
page that was passed in with the page parameter and return NULL. This
tells the caller to allocate a new page and call the function again
- 142Treat the indirect blocks as starting from index 0
- 143ENTRIES_PER_PAGE is the number of swap vectors
contained within each page in the indirect block. offset is now the
index of the desired swap vector within the indirect block page when it is
found
- 144index is now the directory number within the indirect
block list that must be found
- 145Get a pointer to the first indirect block we are interested in
- 147-159If the required directory (index) is greater than
ENTRIES_PER_PAGE/2, then it is a triple indirect block so the
next block must be traversed
- 148Pointers to the next set of directory blocks is in the second half
of the current block so calculate index as an offset within the
second half of the current block
- 149Calculate dir as a pointer to the next directory
block
- 150index is now a pointer within dir to a page
containing the swap vectors we are interested in
- 151-156If dir has not been allocated, install the page
supplied with the page parameter and return NULL so the caller will
allocate a new page and call the function again
- 158dir is now the base of the page of swap vectors containing
the one we are interested in
- 161Move dir forward to the entry we want
- 162-167If an entry does not exist, install the page supplied
as a parameter if available. If not, return NULL so that one will be allocated
and the function called again
- 168Return the found swap vector
L.6 Swap Space Interaction
L.6.1 Function: shmem_writepage
Source: mm/shmem.c
This function is responsible for moving a page from the page cache to the
swap cache.
522 static int shmem_writepage(struct page *page)
523 {
524 struct shmem_inode_info *info;
525 swp_entry_t *entry, swap;
526 struct address_space *mapping;
527 unsigned long index;
528 struct inode *inode;
529
530 BUG_ON(!PageLocked(page));
531 if (!PageLaunder(page))
532 return fail_writepage(page);
533
534 mapping = page->mapping;
535 index = page->index;
536 inode = mapping->host;
537 info = SHMEM_I(inode);
538 if (info->flags & VM_LOCKED)
539 return fail_writepage(page);
This block is function preamble to make sure the operation is possible.
- 522The parameter is the page to move to the swap cache
- 530It is a bug if the page is already locked for IO
- 531-532If the launder bit has not been set, call
fail_writepage(). fail_writepage() is used by
in-memory filesystems to mark the page dirty and re-activate it so that the
page reclaimer does not repeatedly attempt to write the same page
- 534-537Records variables that are needed as parameters later in the
function
- 538-539If the inode filesystem information is locked, fail
540 getswap:
541 swap = get_swap_page();
542 if (!swap.val)
543 return fail_writepage(page);
544
545 spin_lock(&info->lock);
546 BUG_ON(index >= info->next_index);
547 entry = shmem_swp_entry(info, index, NULL);
548 BUG_ON(!entry);
549 BUG_ON(entry->val);
550
This block is responsible for allocating a swap slot from the backing storage
and a swp_entry_t within the inode.
- 541-543Locate a free swap slot with get_swap_page()
(See Section K.1.1). If it fails, call fail_writepage()
- 545Lock the inode information
- 547Get a free swp_entry_t from the filesystem-specific
private inode information with shmem_swp_entry()
551 /* Remove it from the page cache */
552 remove_inode_page(page);
553 page_cache_release(page);
554
555 /* Add it to the swap cache */
556 if (add_to_swap_cache(page, swap) != 0) {
557 /*
558 * Raced with "speculative" read_swap_cache_async.
559 * Add page back to page cache, unref swap, try again.
560 */
561 add_to_page_cache_locked(page, mapping, index);
562 spin_unlock(&info->lock);
563 swap_free(swap);
564 goto getswap;
565 }
566
567 *entry = swap;
568 info->swapped++;
569 spin_unlock(&info->lock);
570 SetPageUptodate(page);
571 set_page_dirty(page);
572 UnlockPage(page);
573 return 0;
574 }
Move from the page cache to the swap cache and update statistics.
- 552remove_inode_page()(See Section J.1.2.1)
removes the page from the inode and hash lists the page is a member of
- 553page_cache_release() drops the local reference to the
page taken for the writepage() operation
- 556Add the page to the swap cache. After this returns, the
page→mapping will now be swapper_space
- 561The operation failed so add the page back to the page cache
- 562Unlock the private information
- 563-564free the swap slot and try again
- 567Here, the page has successfully become part of the swap cache.
Update the inode information to point to the swap slot in backing storage
- 568Increment the counter recording the number of pages belonging to
this inode that are in swap
- 569Release the private inode information spinlock
- 570-571Move the page to the address_space dirty pages
list so that it will be written to backing storage
- 573Return success
L.6.2 Function: shmem_unuse
Source: mm/shmem.c
This function will search the shmem_inodes list for the inode that
holds the information for the requested entry and page. It
is a very expensive operation but it is only called when a swap area is being
deactivated so it is not a significant problem. On return, the swap entry
will be freed and the page will be moved from the swap cache to the page cache.
498 int shmem_unuse(swp_entry_t entry, struct page *page)
499 {
500 struct list_head *p;
501 struct shmem_inode_info * info;
502 int found = 0;
503 spin_lock(&shmem_ilock);
504 list_for_each(p, &shmem_inodes) {
505 info = list_entry(p, struct shmem_inode_info, list);
506
507 if (info->swapped && shmem_unuse_inode(info, entry, page)) {
508 /* move head to start search for next from here */
509 list_move_tail(&shmem_inodes, &info->list);
510 found = 1;
511 break;
512 }
513 }
514 spin_unlock(&shmem_ilock);
515 return found;
516 }
- 503Acquire the shmem_ilock spinlock protecting the inode
list
- 504Cycle through each entry in the shmem_inodes
list searching for the inode holding the requested entry and
page
- 509Move the inode to the top of the list. In the event that we are
reclaiming many pages, the next search will find the inode of interest at the
top of the list
- 510Indicate that the page was found
- 511This page and entry have been found to break out
of the loop
- 514Release the shmem_ilock spinlock
- 515Return if the page was found or not by
shmem_unuse_inode()
L.6.3 Function: shmem_unuse_inode
Source: mm/shmem.c
This function searches the inode information in info to determine
if the entry and page belong to it. If they do, the
entry will be cleared and the page will be removed from the swap
cache and moved to the page cache instead.
436 static int shmem_unuse_inode(struct shmem_inode_info *info,
swp_entry_t entry,
struct page *page)
437 {
438 struct inode *inode;
439 struct address_space *mapping;
440 swp_entry_t *ptr;
441 unsigned long idx;
442 int offset;
443
444 idx = 0;
445 ptr = info->i_direct;
446 spin_lock(&info->lock);
447 offset = info->next_index;
448 if (offset > SHMEM_NR_DIRECT)
449 offset = SHMEM_NR_DIRECT;
450 offset = shmem_find_swp(entry, ptr, ptr + offset);
451 if (offset >= 0)
452 goto found;
453
454 for (idx = SHMEM_NR_DIRECT; idx < info->next_index;
455 idx += ENTRIES_PER_PAGE) {
456 ptr = shmem_swp_entry(info, idx, NULL);
457 if (!ptr)
458 continue;
459 offset = info->next_index - idx;
460 if (offset > ENTRIES_PER_PAGE)
461 offset = ENTRIES_PER_PAGE;
462 offset = shmem_find_swp(entry, ptr, ptr + offset);
463 if (offset >= 0)
464 goto found;
465 }
466 spin_unlock(&info->lock);
467 return 0;
468 found:
470 idx += offset;
471 inode = info->inode;
472 mapping = inode->i_mapping;
473 delete_from_swap_cache(page);
474
475 /* Racing against delete or truncate?
* Must leave out of page cache */
476 limit = (inode->i_state & I_FREEING)? 0:
477 (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
478
479 if (idx >= limit || add_to_page_cache_unique(page,
480 mapping, idx, page_hash(mapping, idx)) == 0) {
481 ptr[offset].val = 0;
482 info->swapped--;
483 } else if (add_to_swap_cache(page, entry) != 0)
484 BUG();
485 spin_unlock(&info->lock);
486 SetPageUptodate(page);
487 /*
488 * Decrement swap count even when the entry is left behind:
489 * try_to_unuse will skip over mms, then reincrement count.
490 */
491 swap_free(entry);
492 return 1;
493 }
- 445Initialise ptr to start at the beginning of the direct
block for the inode being searched
- 446Lock the inode private information
- 447Initialise offset to be the last page index in the file
- 448-449If offset is beyond the end of the direct block, set
it to the end of the direct block for the moment
- 450Use shmem_find_swap()(See Section L.6.4) to
search the direct block for the entry
- 451-452If the entry was in the direct block, goto found,
otherwise we have to search the indirect blocks
- 454-465Search each of the indirect blocks for the entry
- 456shmem_swp_entry()(See Section L.5.2.2) returns
the swap vector at the current idx within the inode. As idx
is incremented in ENTRIES_PER_PAGE sized strides, this will
return the beginning of the next indirect block being searched
- 457-458If an error occurred, the indirect block does not exist so
continue, which probably will exit the loop
- 459Calculate how many pages are left in the end of the file to see if
we only have to search a partially filled indirect block
- 460-461If offset is greater than the size of an indirect
block, set offset to ENTRIES_PER_PAGE so this full indirect block
will be searched by shmem_find_swp()
- 462Search the entire of the current indirect block for entry
with shmem_find_swp()(See Section L.6.4)
- 463-467If the entry was found, goto found,
otherwise the next indirect block will be searched. If the entry is never
found, the info struct will be unlocked and 0 returned indicating
that this inode did not contain the entry and page
- 469The entry was found, so free it with
swap_free()
- 470Move idx to the location of the swap vector within the
block
- 471-472Get the inode and mapping
- 473Delete the page from the swap cache
- 476-477Check if the inode is currently being deleted or truncated by
examining inode→i_state. If it is, set limit to
the index of the last page in the adjusted file size
- 479-482If the page is not being truncated or deleted, add it to the
page cache with add_to_page_cache_unique(). If successful, clear
the swap entry and decrement info→swapped
- 483-484Else add the page back to the swap cache where it will be
reclaimed later
- 485Release the info spinlock
- 486Mark the page uptodate
- 491Decrement the swap count
- 492Return success
L.6.4 Function: shmem_find_swp
Source: mm/shmem.c
This function searches an indirect block between the two pointers
ptr and eptr for the requested entry. Note that
the two pointers must be in the same indirect block.
425 static inline int shmem_find_swp(swp_entry_t entry,
swp_entry_t *dir,
swp_entry_t *edir)
426 {
427 swp_entry_t *ptr;
428
429 for (ptr = dir; ptr < edir; ptr++) {
430 if (ptr->val == entry.val)
431 return ptr - dir;
432 }
433 return -1;
434 }
- 429Loop between the dir and edir pointers
- 430If the current ptr entry matches the requested
entry then return the offset from dir. As
shmem_unuse_inode() is the only user of this function, this will
result in the offset within the indirect block being returned
- 433Return indicating that the entry was not found
L.7 Setting up Shared Regions
L.7.1 Function: shmem_zero_setup
Source: mm/shmem.c
This function is called to setup a VMA that is a shared region backed by
anonymous pages. The call graph which shows this function is in Figure
12.5. This occurs when mmap() creates an anonymous region with the MAP_SHARED
flag.
1664 int shmem_zero_setup(struct vm_area_struct *vma)
1665 {
1666 struct file *file;
1667 loff_t size = vma->vm_end - vma->vm_start;
1668
1669 file = shmem_file_setup("dev/zero", size);
1670 if (IS_ERR(file))
1671 return PTR_ERR(file);
1672
1673 if (vma->vm_file)
1674 fput(vma->vm_file);
1675 vma->vm_file = file;
1676 vma->vm_ops = &shmem_vm_ops;
1677 return 0;
1678 }
- 1667Calculate the size
- 1669Call shmem_file_setup()(See Section L.7.2)
to create a file called dev/zero and of the calculated size. We will
see in the function's code commentary why the name does not have to be unique
- 1673-1674If a file already exists for this virtual area, call
fput() to drop its reference
- 1675Record the new file pointer
- 1676Set the vm_ops so that shmem_nopage()
(See Section L.5.1.1) will be called when a page needs to be faulted
in for this VMA
L.7.2 Function: shmem_file_setup
Source: mm/shmem.c
This function is called to create a new file in shmfs, the internal
filesystem. As the filesystem is internal, the supplied name
does not have to be unique within each directory. Hence, every file that is
created by an anonymous region with shmem_zero_setup() will simply
be called “dev/zero” and regions created with shmget()
will be called “SYSVNN” where NN is the key that is passed as
the first argument to shmget().
1607 struct file *shmem_file_setup(char *name, loff_t size)
1608 {
1609 int error;
1610 struct file *file;
1611 struct inode *inode;
1612 struct dentry *dentry, *root;
1613 struct qstr this;
1614 int vm_enough_memory(long pages);
1615
1616 if (IS_ERR(shm_mnt))
1617 return (void *)shm_mnt;
1618
1619 if (size > SHMEM_MAX_BYTES)
1620 return ERR_PTR(-EINVAL);
1621
1622 if (!vm_enough_memory(VM_ACCT(size)))
1623 return ERR_PTR(-ENOMEM);
1624
1625 this.name = name;
1626 this.len = strlen(name);
1627 this.hash = 0; /* will go */
- 1607The parameters are the name of the file to create
and its expected size
- 1614vm_enough_memory()(See Section M.1.1)
checks to make sure there is enough memory to satisfy the mapping
- 1616-1617If there is an error with the mount point, return the error
- 1619-1620Do not create a file greater than
SHMEM_MAX_BYTES which is calculated at top of
mm/shmem.c
- 1622-1623Make sure there is enough memory to satisfy the mapping
- 1625-1627Populate the struct qstr which is the string
type used for dnodes
1628 root = shm_mnt->mnt_root;
1629 dentry = d_alloc(root, &this);
1630 if (!dentry)
1631 return ERR_PTR(-ENOMEM);
1632
1633 error = -ENFILE;
1634 file = get_empty_filp();
1635 if (!file)
1636 goto put_dentry;
1637
1638 error = -ENOSPC;
1639 inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
1640 if (!inode)
1641 goto close_file;
1642
1643 d_instantiate(dentry, inode);
1644 inode->i_size = size;
1645 inode->i_nlink = 0; /* It is unlinked */
1646 file->f_vfsmnt = mntget(shm_mnt);
1647 file->f_dentry = dentry;
1648 file->f_op = &shmem_file_operations;
1649 file->f_mode = FMODE_WRITE | FMODE_READ;
1650 return file;
1651
1652 close_file:
1653 put_filp(file);
1654 put_dentry:
1655 dput(dentry);
1656 return ERR_PTR(error);
1657 }
- 1628root is assigned to be the dnode representing the root of
shmfs
- 1629Allocate a new dentry with d_alloc()
- 1630-1631Return -ENOMEM if one could not be allocated
- 1634Get an empty struct file from the file table. If one
couldn't be found, -ENFILE will be returned indicating a file
table overflow
- 1639-1641Create a new inode which is a regular file
(S_IFREG) and globally readable, writable and executable. If
it fails, return -ENOSPC indicating no space is left in the
filesystem
- 1643d_instantiate() fills in the inode information for a
dentry. It is defined in fs/dcache.c
- 1644-1649Fill in the remaining inode and file information
- 1650Return the newly created struct file
- 1653Error path when an inode could not be created. put_filp()
will free up the struct file entry in the file table
- 1655dput() will drop the reference to the dentry, destroying
it
- 1656Return the error code
L.8 System V IPC
L.8.1 Creating a SYSV shared region
L.8.1.1 Function: sys_shmget
Source: ipc/shm.c
229 asmlinkage long sys_shmget (key_t key, size_t size, int shmflg)
230 {
231 struct shmid_kernel *shp;
232 int err, id = 0;
233
234 down(&shm_ids.sem);
235 if (key == IPC_PRIVATE) {
236 err = newseg(key, shmflg, size);
237 } else if ((id = ipc_findkey(&shm_ids, key)) == -1) {
238 if (!(shmflg & IPC_CREAT))
239 err = -ENOENT;
240 else
241 err = newseg(key, shmflg, size);
242 } else if ((shmflg & IPC_CREAT) && (shmflg & IPC_EXCL)) {
243 err = -EEXIST;
244 } else {
245 shp = shm_lock(id);
246 if(shp==NULL)
247 BUG();
248 if (shp->shm_segsz < size)
249 err = -EINVAL;
250 else if (ipcperms(&shp->shm_perm, shmflg))
251 err = -EACCES;
252 else
253 err = shm_buildid(id, shp->shm_perm.seq);
254 shm_unlock(id);
255 }
256 up(&shm_ids.sem);
257 return err;
258 }
- 234Acquire the semaphore protecting shared memory IDs
- 235-236If IPC_PRIVATE is specified, most of the flags
are ignored and the region is created with newseg(). This flag is
intended to provide exclusive access to a shared region but Linux does not
guarantee exclusive access
- 237Else search to see if the key already exists with
ipc_findkey()
- 238-239If it does not and IPC_CREAT was not specified,
then return -ENOENT
- 241Else, create a new region with newseg()
- 242-243If the region already exists and the process requested a new
region that did not previously exist to be created, return
-EEXIST
- 244-255Else we are accessing an existing region, so lock it, make sure
we have the required permissions, build a segment identifier with
shm_buildid() and unlock the region again. The segment identifier
will be returned back to userspace
- 256Release the semaphore protecting IDs
- 257Return either the error or the segment identifier
L.8.1.2 Function: newseg
Source: ipc/shm.c
This function creates a new shared segment.
178 static int newseg (key_t key, int shmflg, size_t size)
179 {
180 int error;
181 struct shmid_kernel *shp;
182 int numpages = (size + PAGE_SIZE -1) >> PAGE_SHIFT;
183 struct file * file;
184 char name[13];
185 int id;
186
187 if (size < SHMMIN || size > shm_ctlmax)
188 return -EINVAL;
189
190 if (shm_tot + numpages >= shm_ctlall)
191 return -ENOSPC;
192
193 shp = (struct shmid_kernel *) kmalloc (sizeof (*shp), GFP_USER);
194 if (!shp)
195 return -ENOMEM;
196 sprintf (name, "SYSV%08x", key);
This block allocates the segment descriptor.
- 182Calculate the number of pages the region will occupy
- 187-188Ensure the size of the region does not break limits
- 190-191Make sure the total number of pages required for the segment
will not break limits
- 193Allocate the descriptor with kmalloc()(See Section H.4.2.1)
- 196Print the name of the file to be created in shmfs. The
name is SYSVNN where NN is the key identifier of the
region
197 file = shmem_file_setup(name, size);
198 error = PTR_ERR(file);
199 if (IS_ERR(file))
200 goto no_file;
201
202 error = -ENOSPC;
203 id = shm_addid(shp);
204 if(id == -1)
205 goto no_id;
206 shp->shm_perm.key = key;
207 shp->shm_flags = (shmflg & S_IRWXUGO);
208 shp->shm_cprid = current->pid;
209 shp->shm_lprid = 0;
210 shp->shm_atim = shp->shm_dtim = 0;
211 shp->shm_ctim = CURRENT_TIME;
212 shp->shm_segsz = size;
213 shp->shm_nattch = 0;
214 shp->id = shm_buildid(id,shp->shm_perm.seq);
215 shp->shm_file = file;
216 file->f_dentry->d_inode->i_ino = shp->id;
217 file->f_op = &shm_file_operations;
218 shm_tot += numpages;
219 shm_unlock (id);
220 return shp->id;
221
222 no_id:
223 fput(file);
224 no_file:
225 kfree(shp);
226 return error;
227 }
- 197Create a new file in shmfs with
shmem_file_setup()(See Section L.7.2)
- 198-200Make sure no error occurred with the file creation
- 202By default, the error to return indicates that there is no shared
memory identifiers available or that the size of the request is too large
- 206-213Fill in fields in the segment descriptor
- 214Build a segment identifier which is what is returned to the caller
of shmget()
- 215-217Set the file pointers and file operations structure
- 218Update shm_tot to the total number of pages used by
shared segments
- 220Return the identifier
L.8.2 Attaching a SYSV Shared Region
L.8.2.1 Function: sys_shmat
Source: ipc/shm.c
568 asmlinkage long sys_shmat (int shmid, char *shmaddr,
int shmflg, ulong *raddr)
569 {
570 struct shmid_kernel *shp;
571 unsigned long addr;
572 unsigned long size;
573 struct file * file;
574 int err;
575 unsigned long flags;
576 unsigned long prot;
577 unsigned long o_flags;
578 int acc_mode;
579 void *user_addr;
580
581 if (shmid < 0)
582 return -EINVAL;
583
584 if ((addr = (ulong)shmaddr)) {
585 if (addr & (SHMLBA-1)) {
586 if (shmflg & SHM_RND)
587 addr &= ~(SHMLBA-1); /* round down */
588 else
589 return -EINVAL;
590 }
591 flags = MAP_SHARED | MAP_FIXED;
592 } else {
593 if ((shmflg & SHM_REMAP))
594 return -EINVAL;
595
596 flags = MAP_SHARED;
597 }
598
599 if (shmflg & SHM_RDONLY) {
600 prot = PROT_READ;
601 o_flags = O_RDONLY;
602 acc_mode = S_IRUGO;
603 } else {
604 prot = PROT_READ | PROT_WRITE;
605 o_flags = O_RDWR;
606 acc_mode = S_IRUGO | S_IWUGO;
607 }
This section ensures the parameters to shmat() are valid.
- 581-582Negative identifiers are not allowed so return
-EINVAL if one is supplied
- 584-591If the caller supplied an address, make sure it is ok
- 585SHMLBA is the segment boundary address multiple. In
Linux, this is always PAGE_SIZE. If the address is not page
aligned, then check if the caller specified SHM_RND which
allows the address to be changed. If specified, round the address down to the
nearest page boundary, otherwise return -EINVAL
- 591Set the flags to use with the VMA to create a shared region
(MAP_SHARED) with a fixed address (MAP_FIXED)
- 593-596If an address was not supplied, make sure the
SHM_REMAP flag was not specified (it requires an address) and only use
the MAP_SHARED
flag with the VMA. This means that do_mmap() (See Section D.2.1.1)
will find a suitable address to attach the shared region
613 shp = shm_lock(shmid);
614 if(shp == NULL)
615 return -EINVAL;
616 err = shm_checkid(shp,shmid);
617 if (err) {
618 shm_unlock(shmid);
619 return err;
620 }
621 if (ipcperms(&shp->shm_perm, acc_mode)) {
622 shm_unlock(shmid);
623 return -EACCES;
624 }
625 file = shp->shm_file;
626 size = file->f_dentry->d_inode->i_size;
627 shp->shm_nattch++;
628 shm_unlock(shmid);
This block ensures the IPC permissions are valid
- 613shm_lock() locks the descriptor corresponding to
shmid and returns a pointer to the descriptor
- 614-615Make sure the descriptor exists
- 616-620Make sure the ID matches the descriptor
- 621-624Make sure the caller has the correct permissions
- 625Get a pointer to the struct file which
do_mmap() requires
- 626Get the size of the shared region so do_mmap() knows what
size of VMA to create
- 627Temporarily increment shm_nattch which normally
indicates how many VMAs are using the segment. This is to prevent the segment
been freed prematurely. The real counter will be incremented by
shm_open() which is the open() callback used by the
vm_operations_struct used for shared regions
- 628Release the descriptor
630 down_write(&current->mm->mmap_sem);
631 if (addr && !(shmflg & SHM_REMAP)) {
632 user_addr = ERR_PTR(-EINVAL);
633 if (find_vma_intersection(current->mm, addr, addr + size))
634 goto invalid;
635 /*
636 * If shm segment goes below stack, make sure there is some
637 * space left for the stack to grow (at least 4 pages).
638 */
639 if (addr < current->mm->start_stack &&
640 addr > current->mm->start_stack - size - PAGE_SIZE * 5)
641 goto invalid;
642 }
643
644 user_addr = (void*) do_mmap (file, addr, size, prot, flags, 0);
This block is where do_mmap() will be called to attach the region to
the calling process.
- 630Acquire the semaphore protecting the mm_struct
- 632-634If an address was specified, call
find_vma_intersection() (See Section D.3.1.3)
to ensure no VMA overlaps the region we are trying to use
- 639-641Make sure there is at least a 4 page gap between the end of the
shared region and the stack
- 644Call do_mmap()(See Section D.2.1.1) which will allocate
the VMA and map it into the process address space
646 invalid:
647 up_write(&current->mm->mmap_sem);
648
649 down (&shm_ids.sem);
650 if(!(shp = shm_lock(shmid)))
651 BUG();
652 shp->shm_nattch--;
653 if(shp->shm_nattch == 0 &&
654 shp->shm_flags & SHM_DEST)
655 shm_destroy (shp);
656 else
657 shm_unlock(shmid);
658 up (&shm_ids.sem);
659
660 *raddr = (unsigned long) user_addr;
661 err = 0;
662 if (IS_ERR(user_addr))
663 err = PTR_ERR(user_addr);
664 return err;
665
666 }
- 647Release the mm_struct semaphore
- 649Acquire the region IDs semaphore
- 650-651Lock the segment descriptor
- 652Decrement the temporary shm_nattch counter. This will
have been properly incremented by the vm_ops→open callback
- 653-655If the number of users reaches 0 and the SHM_DEST flag has
been specified, the region is destroyed as it is no longer required
- 657Otherwise, just unlock the segment
- 660Set the address to return to the caller
- 661-663If an error occurred, set the error to return to the caller
- 664Return