内存映射
do_mmap
此函数传入的参数都是用户层
static inline unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flag, unsigned long offset)
{
unsigned long ret = -EINVAL;
if ((offset + PAGE_ALIGN(len)) < offset)
//页对齐len,检测传入参数是否有误。
goto out;
if (!(offset & ~PAGE_MASK))
//检测offset是否页对齐。映射时只能映射页对齐的长度。
ret = do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
out:
return ret;
}
do_mmap的参数情况: file: 如果新的线性区将要把一个文件映射到内存,则要用文件描述符file和文件偏移offset,如不需要,则file和offset不考虑都为空; addr: 指定从哪里开始查找空闲区间,一般都是NULL即由内核指定; len: 要求的线性地址空间长度; prot: 指定线性区下的页的访问权限; flag: 指定线性区的其他标志;
do_mmap它的核心函数是do_mmap_pgoff。
do_mmap_pgoff
unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flags, unsigned long pgoff)
{
/*当前进程的mm*/
struct mm_struct * mm = current->mm;
//为当前进程创建mm_struct内存描述符
struct inode *inode;
//索引节点
unsigned int vm_flags;
int error;
unsigned long reqprot = prot;
/*
* Does the application expect PROT_READ to imply PROT_EXEC?
*
* (the exception is when the underlying filesystem is noexec
* mounted, in which case we dont add PROT_EXEC.)
*/
/*是否隐藏了可执行属性*/
/*prot : 映射区域的保护方式。可以为以下几种方式的组合:
PROT_EXEC 映射区域可被执行
PROT_READ 映射区域可被读取
PROT_WRITE 映射区域可被写入
PROT_NONE 映射区域不能存取*/
if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
prot |= PROT_EXEC;
if (!len)
return -EINVAL;
/*判断输入的欲映射的起始地址是否小于最小映射地址,如果小于,将addr修改为最小地址,不过前提是MAP_FIXED旗标没有设置*/
if (!(flags & MAP_FIXED))
/*flags : 影响映射区域的各种特性。在调用mmap()时必须要指定MAP_SHARED 或MAP_PRIVATE。
MAP_FIXED 如果参数start所指的地址无法成功建立映射时,则放弃映射,不对地址做修正。通常不鼓励用此旗标。
MAP_SHARED 对映射区域的写入数据会复制回文件内,而且允许其他映射该文件的进程共享。
MAP_PRIVATE 对映射区域的写入操作会产生一个映射文件的复制,即私人的“写入时复制”(copy on write)对此区域作的任何修改都不会写回原来的文件内容。
MAP_ANONYMOUS建立匿名映射。此时会忽略参数fd,不涉及文件,而且映射区域无法和其他进程共享。
MAP_DENYWRITE只允许对映射区域的写入操作,其他对文件直接写入的操作将会被拒绝。
MAP_LOCKED 将映射区域锁定住,这表示该区域不会被置换(swap)。*/
//如果调用都没有要求MAP_FIXED,即可以由内核来决定将文件映射到哪个内存地址上。
addr = round_hint_to_min(addr);
//预指定addr,通过调用函数round_hint_to_min实现
/* Careful about overflows.. */
/*检测len是否为0*/
len = PAGE_ALIGN(len);
//PAGE_ALIGN宏表示将物理地址addr修整为页边界地址(页的上边界),这里暂不明确
if (!len)
return -ENOMEM;
/* offset overflow? */
/*再次检测是否越界*/
if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
return -EOVERFLOW;
/* Too many mappings? */
/*在一个进程中对于mmap个数是有限制的。超出了还是nomem的错误*/
if (mm->map_count > sysctl_max_map_count)
return -ENOMEM;
/* Obtain the address to map to. we verify (or select) it and ensure
* that it represents a valid section of the address space.
/*创建新的vma区域之前先要寻找一块足够大小的空闲区域,本函数就是用于查找没有映射过的空洞内存区,返回值addr就是这段空洞的起始地址,这个是查询mm中空闲的内存地址*/
addr = get_unmapped_area(file, addr, len, pgoff, flags);
if (addr & ~PAGE_MASK)
//判断地址是否对齐
return addr;
/* Do simple checking here so the lower-level routines won't have
* to. we assume access permissions have been handled by the open
* of the memory object, so we don't do any here.
*/
/*设置vm_flags,根据传入的port和flags以及mm本身自有的旗标来设置*/
vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
/* 假如flags设置MAP_LOCKED,即类似于mlock()将申请的地址空间锁定在内存中, 检查是否可以进行lock*/
if (flags & MAP_LOCKED)
if (!can_do_mlock())
return -EPERM;
/* mlock MCL_FUTURE? */
if (vm_flags & VM_LOCKED) {
unsigned long locked, lock_limit;
locked = len >> PAGE_SHIFT;
locked += mm->locked_vm;
lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
lock_limit >>= PAGE_SHIFT;
if (locked > lock_limit && !capable(CAP_IPC_LOCK))
return -EAGAIN;
}
/*判断是文件映射还是匿名映射,如果是文件映射则赋值inode*/
inode = file ? file->f_path.dentry->d_inode : NULL;
/*对vm_flags进行设置,由参数flags确定vma线性区的flags,是共享还是私有*/
/*文件映射*/
if (file) {
switch (flags & MAP_TYPE) {
/*共享*/
case MAP_SHARED:
if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
//映射区域可被打开写入
return -EACCES;
/*
* Make sure we don't allow writing to an append-only file..
*/
if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
//确保我们不允许写入一个只允许写追加的文件
return -EACCES;
/*
* Make sure there are no mandatory locks on the file.
*/
if (locks_verify_locked(inode))
//确保文件没有被强制锁定。
return -EAGAIN;
vm_flags |= VM_SHARED | VM_MAYSHARE;
//尝试允许其他进程共享。
if (!(file->f_mode & FMODE_WRITE))
//如果file不允许写就算了,共享也没有意义。
vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
/* fall through */
/*私有*/
case MAP_PRIVATE:
if (!(file->f_mode & FMODE_READ))
//如果file不可读,
return -EACCES;
if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
if (vm_flags & VM_EXEC)
return -EPERM;
vm_flags &= ~VM_MAYEXEC;
}
if (!file->f_op || !file->f_op->mmap)
return -ENODEV;
break;
default:
return -EINVAL;
}
}
/*匿名映射*/
else {
switch (flags & MAP_TYPE) {
/*共享,对应共享内存*/
case MAP_SHARED:
/*
* Ignore pgoff.
pgoff = 0;
vm_flags |= VM_SHARED | VM_MAYSHARE;
break;
/*私有*/
case MAP_PRIVATE:
/*
* Set pgoff according to addr for anon_vma.
pgoff = addr >> PAGE_SHIFT;
break;
default:
return -EINVAL;
}
}
error = security_file_mmap(file, reqprot, prot, flags, addr, 0);
if (error)
return error;
error = ima_file_mmap(file, prot);
if (error)
return error;
/*实际创建vma*/
//mmap_region是mmap实现的另一个关键点,它实现对设备文件或普通文件的file->f_op->mmap(file, vma)函数的调用
return mmap_region(file, addr, len, flags, vm_flags, pgoff);
}
get_unmapped_area
这个是获取没有被映射的内存区
unsigned long get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,unsigned long pgoff, unsigned long flags)
{
unsigned long (*get_area)(struct file *, unsigned long,unsigned long, unsigned long, unsigned long);
get_area = current->mm->get_unmapped_area;
//在mmap区域找到一块没有被映射过的vm_area_struct对象
if (file && file->f_op && file->f_op->get_unmapped_area)
//如果驱动程序的file_operations定义了get_unmapped_area,则会使用驱动程序定义的方法
// file->f_op->get_unmapped_area,但是通常都不会有驱动程序定义这个方法,所以一般都是使用
// current->mm->get_unmapped_area 这个方法。目的是在mmap区域找到一块没有被映射过的vm_area_struct对象 */
get_area = file->f_op->get_unmapped_area;
addr = get_area(file, addr, len, pgoff, flags);
//获取地址
if (IS_ERR_VALUE(addr))
//IS_ERR_VALUE宏既可以用来判断返回错误码(负值)的函数,又可以用来判断返回错误指针的函数。
return addr;
if (addr > TASK_SIZE - len)
////TASK_SIZE是内核空间和用户空间的分界线,一般等于PAGE_OFFSET,判断地址是否超出范围
return -ENOMEM;
if (addr & ~PAGE_MASK)//判断地址是否超出页面大小
return -EINVAL;
return arch_rebalance_pgtables(addr, len);
}
mmap_region
map_region是mmap实现的另一个关键点,它实现对设备文件或普通文件的file->f_op->mmap(file, vma)函数的调用
unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long len, unsigned long flags,
unsigned int vm_flags, unsigned long pgoff)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev;
int correct_wcount = 0;
int error;
struct rb_node **rb_link, *rb_parent;
unsigned long charged = 0;
struct inode *inode = file ? file->f_path.dentry->d_inode : NULL;
/* Clear old maps */
error = -ENOMEM;
munmap_back:
vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
//函数find_vma_prepare()与find_vma()基本相同,它扫描当前进程地址空间的
//vm_area_struct结构所形成的红黑树,试图找到结束地址高于addr的第一个区间;如果找到了一个虚拟区,
//说明addr所在的虚拟区已经在使用,也就是已经有映射存在,因此要调用do_munmap()把这个老的虚拟区从
//进程地址空间中撤销,如果撤销不成功,就返回一个负数;如果撤销成功,就继续查找,直到在红黑树中找不到addr所在的虚拟区
if (vma && vma->vm_start < addr + len) {
//调用删除操作
if (do_munmap(mm, addr, len))
return -ENOMEM;
goto munmap_back;
}
/* Check against address space limit. */
//检查进程对线性区大小的限制,是否允许插入新的线性区。
if (!may_expand_vm(mm, len >> PAGE_SHIFT))
//may_expand_vm判断能否增加 len >> PAGE_SHIFT个页面
//页数和超过限定值返回 0 ,不超过返回1
return -ENOMEM;
/*
* Set 'VM_NORESERVE' if we should not account for the
* memory use of this mapping.
*/
if ((flags & MAP_NORESERVE)) {
//如果flags参数中没有设置MAP_NORESERVE标志,新的虚拟区含有私有的可写页,
/* We honor MAP_NORESERVE if allowed to overcommit */
if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
vm_flags |= VM_NORESERVE;
/* hugetlb applies strict overcommit unless MAP_NORESERVE */
if (file && is_file_hugepages(file))
vm_flags |= VM_NORESERVE;
}
/*
* Private writable mapping: check memory availability
*/
if (accountable_mapping(file, vm_flags)) {
charged = len >> PAGE_SHIFT;
if (security_vm_enough_memory(charged))
//空闲页面数小于要映射的虚拟区的大小;则函数终止并返回一个负数;其中函数security_vm_enough_memory()
//用来检查一个进程的地址空间中是否有足够的内存来进行一个新的映射
return -ENOMEM;
vm_flags |= VM_ACCOUNT;
}
/*
* Can we just expand an old mapping?
*/
vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL);
虚拟区的合并是由vma_merge()函数实现的。如果合并成功,则转out处。
if (vma)
goto out;
/*
* Determine the object being mapped and call the appropriate
* specific mapper. the address has already been validated, but
* not unmapped, but the maps are removed from the list.
*/
确定要映射的对象并调用适当的特定映射器。地址已被验证,但是
不是未映射,但映射将从列表中移除。
vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
if (!vma) {
error = -ENOMEM;
goto unacct_error;
}
vma->vm_mm = mm;
vma->vm_start = addr;
vma->vm_end = addr + len;
vma->vm_flags = vm_flags;
vma->vm_page_prot = vm_get_page_prot(vm_flags);
vma->vm_pgoff = pgoff;
INIT_LIST_HEAD(&vma->anon_vma_chain);
if (file) {
error = -EINVAL;
if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
goto free_vma;
if (vm_flags & VM_DENYWRITE) {
error = deny_write_access(file);
if (error)
goto free_vma;
correct_wcount = 1;
}
vma->vm_file = file;
get_file(file);
error = file->f_op->mmap(file, vma);
if (error)
goto unmap_and_free_vma;
if (vm_flags & VM_EXECUTABLE)
added_exe_file_vma(mm);
/* Can addr have changed??
*
* Answer: Yes, several device drivers can do it in their
f_op->mmap method. -DaveM
*/
addr = vma->vm_start;
pgoff = vma->vm_pgoff;
vm_flags = vma->vm_flags;
} else if (vm_flags & VM_SHARED) {
error = shmem_zero_setup(vma);
if (error)
goto free_vma;
}
if (vma_wants_writenotify(vma)) {
pgprot_t pprot = vma->vm_page_prot;
/* Can vma->vm_page_prot have changed??
*
* Answer: Yes, drivers may have changed it in their
f_op->mmap method.
*
* Ensures that vmas marked as uncached stay that way.
/
vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);
if (pgprot_val(pprot) == pgprot_val(pgprot_noncached(pprot)))
vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
}
vma_link(mm, vma, prev, rb_link, rb_parent);
file = vma->vm_file;
/* Once vma denies write, undo our temporary denial count */
if (correct_wcount)
atomic_inc(&inode->i_writecount);
out:
//虚存区合并成功
perf_event_mmap(vma);
mm->total_vm += len >> PAGE_SHIFT;
vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
if (vm_flags & VM_LOCKED) {
//映射区是否可被置换
if (!mlock_vma_pages_range(vma, addr, addr + len))
mm->locked_vm += (len >> PAGE_SHIFT);
} else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
make_pages_present(addr, addr + len);
return addr;
unmap_and_free_vma:
//删除和释放
if (correct_wcount)
atomic_inc(&inode->i_writecount);
vma->vm_file = NULL;
fput(file);
/* Undo any partial mapping done by a device driver. */
//撤消设备驱动程序所做的任何部分映射。
unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
charged = 0;
free_vma:
kmem_cache_free(vm_area_cachep, vma);
unacct_error:
if (charged)
vm_unacct_memory(charged);
return error;
}