锁定¶

下面的文本描述了 VFS 相关方法的锁定规则。它（据信）是最新的。如果您更改了原型或锁定协议中的任何内容，请务必更新此文件，并同时更新内核树中的相关实例，不要把这项工作留给文件系统/设备等的维护者。至少，请将可疑案例列表放在此文件的末尾。不要把此文件写成更改日志——树外代码的维护者应该能够使用 diff(1)。

目前这里缺少的东西：套接字操作。Alexey？

dentry_operations¶

原型

int (*d_revalidate)(struct inode *, const struct qstr *,
                    struct dentry *, unsigned int);
int (*d_weak_revalidate)(struct dentry *, unsigned int);
int (*d_hash)(const struct dentry *, struct qstr *);
int (*d_compare)(const struct dentry *,
                unsigned int, const char *, const struct qstr *);
int (*d_delete)(struct dentry *);
int (*d_init)(struct dentry *);
void (*d_release)(struct dentry *);
void (*d_prune)(struct dentry *);
void (*d_iput)(struct dentry *, struct inode *);
char *(*d_dname)(struct dentry *dentry, char *buffer, int buflen);
struct vfsmount *(*d_automount)(struct path *path);
int (*d_manage)(const struct path *, bool);
struct dentry *(*d_real)(struct dentry *, enum d_real_type type);
bool (*d_unalias_trylock)(const struct dentry *);
void (*d_unalias_unlock)(const struct dentry *);

锁定规则

操作	rename_lock	->d_lock	可能阻塞	rcu-walk
d_revalidate	否	否	是 (ref-walk)	可能
d_weak_revalidate	否	否	是	否
d_hash	否	否	否	可能
d_compare	是	否	否	可能
d_delete	否	是	否	否
d_init	否	否	是	否
d_release	否	否	是	否
d_prune	否	是	否	否
d_iput	否	否	是	否
d_dname	否	否	否	否
d_automount	否	否	是	否
d_manage	否	否	是 (ref-walk)	可能
d_real	否	否	是	否
d_unalias_trylock	是	否	否	否
d_unalias_unlock	是	否	否	否

inode_operations¶

原型

int (*create) (struct mnt_idmap *, struct inode *,struct dentry *,umode_t, bool);
struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
int (*link) (struct dentry *,struct inode *,struct dentry *);
int (*unlink) (struct inode *,struct dentry *);
int (*symlink) (struct mnt_idmap *, struct inode *,struct dentry *,const char *);
struct dentry *(*mkdir) (struct mnt_idmap *, struct inode *,struct dentry *,umode_t);
int (*rmdir) (struct inode *,struct dentry *);
int (*mknod) (struct mnt_idmap *, struct inode *,struct dentry *,umode_t,dev_t);
int (*rename) (struct mnt_idmap *, struct inode *, struct dentry *,
                struct inode *, struct dentry *, unsigned int);
int (*readlink) (struct dentry *, char __user *,int);
const char *(*get_link) (struct dentry *, struct inode *, struct delayed_call *);
void (*truncate) (struct inode *);
int (*permission) (struct mnt_idmap *, struct inode *, int, unsigned int);
struct posix_acl * (*get_inode_acl)(struct inode *, int, bool);
int (*setattr) (struct mnt_idmap *, struct dentry *, struct iattr *);
int (*getattr) (struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int);
ssize_t (*listxattr) (struct dentry *, char *, size_t);
int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len);
void (*update_time)(struct inode *, struct timespec *, int);
int (*atomic_open)(struct inode *, struct dentry *,
                        struct file *, unsigned open_flag,
                        umode_t create_mode);
int (*tmpfile) (struct mnt_idmap *, struct inode *,
                struct file *, umode_t);
int (*fileattr_set)(struct mnt_idmap *idmap,
                    struct dentry *dentry, struct fileattr *fa);
int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa);
struct posix_acl * (*get_acl)(struct mnt_idmap *, struct dentry *, int);
struct offset_ctx *(*get_offset_ctx)(struct inode *inode);

锁定规则: 全部可能阻塞

操作	i_rwsem(inode)
lookup	共享
create	独占
link	独占（两者）
mknod	独占
symlink	独占
mkdir	独占
unlink	独占（两者）
rmdir	独占（两者）（见下文）
rename	独占（两个父目录，一些子目录）（见下文）
readlink	否
get_link	否
setattr	独占
permission	否（如果在 rcu-walk 模式下调用，则不得阻塞）
get_inode_acl	否
get_acl	否
getattr	否
listxattr	否
fiemap	否
update_time	否
atomic_open	共享（如果在打开标志中设置了 O_CREAT，则独占）
tmpfile	否
fileattr_get	否或独占
fileattr_set	独占
get_offset_ctx	否

此外，->rmdir()、->unlink() 和 ->rename() 在受害者上具有 ->i_rwsem 独占。跨目录 ->rename() 具有（每个超级块）->s_vfs_rename_sem。 ->unlink() 和 ->rename() 在所有涉及的非目录上具有 ->i_rwsem 独占。 ->rename() 在任何更改父目录的子目录上具有 ->i_rwsem 独占。

有关目录操作锁定方案的更详细讨论，请参阅目录锁定。

xattr_handler 操作¶

原型

bool (*list)(struct dentry *dentry);
int (*get)(const struct xattr_handler *handler, struct dentry *dentry,
           struct inode *inode, const char *name, void *buffer,
           size_t size);
int (*set)(const struct xattr_handler *handler,
           struct mnt_idmap *idmap,
           struct dentry *dentry, struct inode *inode, const char *name,
           const void *buffer, size_t size, int flags);

锁定规则: 全部可能阻塞

操作	i_rwsem(inode)
list	否
get	否
set	独占

super_operations¶

原型

struct inode *(*alloc_inode)(struct super_block *sb);
void (*free_inode)(struct inode *);
void (*destroy_inode)(struct inode *);
void (*dirty_inode) (struct inode *, int flags);
int (*write_inode) (struct inode *, struct writeback_control *wbc);
int (*drop_inode) (struct inode *);
void (*evict_inode) (struct inode *);
void (*put_super) (struct super_block *);
int (*sync_fs)(struct super_block *sb, int wait);
int (*freeze_fs) (struct super_block *);
int (*unfreeze_fs) (struct super_block *);
int (*statfs) (struct dentry *, struct kstatfs *);
int (*remount_fs) (struct super_block *, int *, char *);
void (*umount_begin) (struct super_block *);
int (*show_options)(struct seq_file *, struct dentry *);
ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);

锁定规则: 全部可能阻塞 [不正确，见下文]

操作	s_umount	注意
alloc_inode
free_inode		从 RCU 回调调用
destroy_inode
dirty_inode
write_inode
drop_inode		!!!inode->i_lock!!!
evict_inode
put_super	write
sync_fs	read
freeze_fs	write
unfreeze_fs	write
statfs	可能(read)	（见下文）
remount_fs	write
umount_begin	否
show_options	否	(namespace_sem)
quota_read	否	（见下文）
quota_write	否	（见下文）

当 ustat(2)（本机或兼容）调用 ->statfs() 时，它具有 s_umount（共享），但这只是一个糟糕的 API 的意外；当我们只有用户空间给我们的 dev_t 来识别超级块时，s_umount 用于固定超级块。其他所有内容（statfs()、fstatfs() 等）在调用 ->statfs() 时不持有它 - 超级块通过解析传递给 syscall 的路径名来固定。

->quota_read() 和 ->quota_write() 函数都由配额代码（通过 dqio_sem）保证是唯一在配额文件上操作的函数（除非管理员真的想搞砸一些东西，在启用配额的情况下写入配额文件）。有关锁定的其他详细信息，另请参阅 dquot_operations 部分。

file_system_type¶

原型

struct dentry *(*mount) (struct file_system_type *, int,
               const char *, void *);
void (*kill_sb) (struct super_block *);

锁定规则

操作	可能阻塞
mount	是
kill_sb	是

->mount() 返回 ERR_PTR 或根 dentry；其超级块应在返回时锁定。

->kill_sb() 接受一个写锁定的超级块，对其执行所有关闭工作，解锁并删除引用。

address_space_operations¶

原型

int (*read_folio)(struct file *, struct folio *);
int (*writepages)(struct address_space *, struct writeback_control *);
bool (*dirty_folio)(struct address_space *, struct folio *folio);
void (*readahead)(struct readahead_control *);
int (*write_begin)(struct file *, struct address_space *mapping,
                        loff_t pos, unsigned len,
                        struct folio **foliop, void **fsdata);
int (*write_end)(struct file *, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned copied,
                        struct folio *folio, void *fsdata);
sector_t (*bmap)(struct address_space *, sector_t);
void (*invalidate_folio) (struct folio *, size_t start, size_t len);
bool (*release_folio)(struct folio *, gfp_t);
void (*free_folio)(struct folio *);
int (*direct_IO)(struct kiocb *, struct iov_iter *iter);
int (*migrate_folio)(struct address_space *, struct folio *dst,
                struct folio *src, enum migrate_mode);
int (*launder_folio)(struct folio *);
bool (*is_partially_uptodate)(struct folio *, size_t from, size_t count);
int (*error_remove_folio)(struct address_space *, struct folio *);
int (*swap_activate)(struct swap_info_struct *sis, struct file *f, sector_t *span);
int (*swap_deactivate)(struct file *);
int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter);

锁定规则: 除 dirty_folio 和 free_folio 外，全部可能阻塞

操作	folio 已锁定	i_rwsem	invalidate_lock
read_folio	是，解锁		共享
writepages
dirty_folio	可能
readahead	是，解锁		共享
write_begin	锁定 folio	独占
write_end	是，解锁	独占
bmap
invalidate_folio	是		独占
release_folio	是
free_folio	是
direct_IO
migrate_folio	是（两者）
launder_folio	是
is_partially_uptodate	是
error_remove_folio	是
swap_activate	否
swap_deactivate	否
swap_rw	是，解锁

->write_begin()、->write_end() 和 ->read_folio() 可以从请求处理程序 (/dev/loop) 调用。

->read_folio() 解锁 folio，同步或通过 I/O 完成。

->readahead() 解锁尝试 I/O 的 folio，如 ->read_folio()。

->writepages() 用于定期回写和 syscall 发起的同步操作。address_space 至少应针对 *nr_to_write 个页面启动 I/O。*nr_to_write 必须为写入的每个页面递减。address_space 实现可能会写入比 *nr_to_write 要求的更多（或更少）页面，但它应该尽量接近。如果 nr_to_write 为 NULL，则必须写入所有脏页。

writepages 应该 _仅_ 写入 mapping->i_pages 中存在的页面。

->dirty_folio() 在内核中的各个位置调用，当目标 folio 被标记为需要回写时。folio 不能被截断，因为调用者持有 folio 锁，或者调用者在持有页表锁时找到了 folio，这将阻止截断。

->bmap() 当前由某些文件系统提供的旧 ioctl() (FIBMAP) 和交换器使用。后者最终会消失。请保持这种方式，不要产生新的调用者。

->invalidate_folio() 在页面被截断、文件系统必须尝试从页面中删除部分或全部缓冲区时调用。成功时返回零。文件系统必须先独占获取 invalidate_lock，然后才能在截断/空洞打孔路径中使页面缓存无效（从而调用 ->invalidate_folio），以阻止页面缓存无效化与页面缓存填充函数（fault、read 等）之间的竞争。

->release_folio() 在 MM 想要对 folio 做出会使文件系统私有数据失效的更改时调用。例如，该 folio 可能即将从 address_space 中删除或被拆分。folio 已锁定且未进行回写。它可能是脏的。gfp 参数通常不用于分配，而是指示文件系统可以做什么来尝试释放私有数据。文件系统可以返回 false 以指示无法释放 folio 的私有数据。如果它返回 true，则应已从 folio 中删除私有数据。如果文件系统未提供 ->release_folio 方法，则 pagecache 将假定私有数据是 buffer_heads 并调用 try_to_free_buffers()。

->free_folio() 在内核从页面缓存中删除 folio 时调用。

->launder_folio() 可以在释放 folio 之前调用，如果发现它仍然是脏的。如果 folio 已成功清理，则返回零，否则返回错误值。请注意，为了防止 folio 被重新映射并再次变脏，需要在整个操作期间保持其锁定。

->swap_activate() 将被调用以准备给定的文件进行交换。它应执行任何必要的验证和准备，以确保可以以最小的内存分配执行写入。它应调用 add_swap_extent() 或 helper iomap_swapfile_activate()，并返回添加的范围数。如果 IO 应通过 ->swap_rw() 提交，则应设置 SWP_FS_OPS，否则 IO 将直接提交到块设备 sis->bdev。

->swap_deactivate() 将在 sys_swapoff() 路径中调用，在 ->swap_activate() 返回成功后。

如果 ->swap_activate() 设置了 SWP_FS_OPS，则将为交换 IO 调用 ->swap_rw。

file_lock_operations¶

原型

void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
void (*fl_release_private)(struct file_lock *);

锁定规则

操作	inode->i_lock	可能阻塞
fl_copy_lock	是	否
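fl_release_private	可能	可能 [1]

[1] 对于 flock 或 POSIX 锁，->fl_release_private 目前允许阻塞。但租约仍可能在持有 i_lock 时被释放，因此针对租约调用的 fl_release_private 不应阻塞。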

lock_manager_operations¶

原型

void (*lm_notify)(struct file_lock *);  /* unblock callback */
int (*lm_grant)(struct file_lock *, struct file_lock *, int);
void (*lm_break)(struct file_lock *); /* break_lease callback */
int (*lm_change)(struct file_lock **, int);
bool (*lm_breaker_owns_lease)(struct file_lock *);
bool (*lm_lock_expirable)(struct file_lock *);
void (*lm_expire_lock)(void);

锁定规则

操作	flc_lock	blocked_lock_lock	可能阻塞
lm_notify	否	是	否
lm_grant	否	否	否
lm_break	是	否	否
lm_change	是	否	否
lm_breaker_owns_lease	是	否	否
lm_lock_expirable	是	否	否
lm_expire_lock	否	否	是

buffer_head¶

原型

void (*b_end_io)(struct buffer_head *bh, int uptodate);

锁定规则

从中断调用。换句话说，这里需要格外小心。bh 已锁定，但这就是我们在这里拥有的所有保证。目前只有 RAID1、highmem、fs/buffer.c 和 fs/ntfs/aops.c 提供这些。块设备在 IO 完成后调用此方法。

block_device_operations¶

原型

int (*open) (struct block_device *, fmode_t);
int (*release) (struct gendisk *, fmode_t);
int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
int (*direct_access) (struct block_device *, sector_t, void **,
                        unsigned long *);
void (*unlock_native_capacity) (struct gendisk *);
int (*getgeo)(struct block_device *, struct hd_geometry *);
void (*swap_slot_free_notify) (struct block_device *, unsigned long);

锁定规则

操作	open_mutex
open	是
release	是
ioctl	否
compat_ioctl	否
direct_access	否
unlock_native_capacity	否
getgeo	否
swap_slot_free_notify	否（见下文）

调用 swap_slot_free_notify 时会持有 swap_lock，有时也会持有页面锁。

file_operations¶

原型

loff_t (*llseek) (struct file *, loff_t, int);
ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
int (*iopoll) (struct kiocb *kiocb, bool spin);
int (*iterate_shared) (struct file *, struct dir_context *);
__poll_t (*poll) (struct file *, struct poll_table_struct *);
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
int (*mmap) (struct file *, struct vm_area_struct *);
int (*open) (struct inode *, struct file *);
int (*flush) (struct file *);
int (*release) (struct inode *, struct file *);
int (*fsync) (struct file *, loff_t start, loff_t end, int datasync);
int (*fasync) (int, struct file *, int);
int (*lock) (struct file *, int, struct file_lock *);
unsigned long (*get_unmapped_area)(struct file *, unsigned long,
                unsigned long, unsigned long, unsigned long);
int (*check_flags)(int);
int (*flock) (struct file *, int, struct file_lock *);
ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *,
                size_t, unsigned int);
ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *,
                size_t, unsigned int);
int (*setlease)(struct file *, long, struct file_lock **, void **);
long (*fallocate)(struct file *, int, loff_t, loff_t);
void (*show_fdinfo)(struct seq_file *m, struct file *f);
unsigned (*mmap_capabilities)(struct file *);
ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
                loff_t, size_t, unsigned int);
loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in,
                struct file *file_out, loff_t pos_out,
                loff_t len, unsigned int remap_flags);
int (*fadvise)(struct file *, loff_t, loff_t, int);

锁定规则: 全部可能阻塞。

->llseek() 锁定已从 llseek 移动到各个 llseek 实现。如果您的 fs 未使用 generic_file_llseek，则需要在 ->llseek() 中获取和释放相应的锁。对于许多文件系统，获取 inode 互斥锁或仅使用 i_size_read() 可能是安全的。注意：这不能保护 file->f_pos 免受并发修改，因为这是用户空间必须处理的事情。

调用 ->iterate_shared() 时会持有 i_rwsem 以进行读取，并独占持有文件 f_pos_lock。

->fasync() 负责维护 filp->f_flags 中的 FASYNC 位。大多数实例调用 fasync_helper()，它执行该维护，因此通常不需要担心。大于 0 的返回值将在 VFS 层中被映射为零。

必须更改目录上的 ->readdir() 和 ->ioctl()。理想情况下，我们会将 ->readdir() 移动到 inode_operations 并使用单独的方法用于目录 ->ioctl() 或完全杀死后者。其中一个问题是，对于任何类似于联合挂载的东西，我们都不会为所有组件提供 struct file。还有其他原因导致当前接口一团糟...

目录上的 ->read 可能必须消失 - 我们应该在 sys_read() 及相关函数中强制返回 -EISDIR。

->setlease 操作应在个别文件系统中设置租约之前或之后调用 generic_setlease()，以记录操作结果。

->fallocate 实现必须非常小心，以在打孔或执行其他使页面缓存内容无效的操作时保持页面缓存一致性。通常，文件系统需要调用 truncate_inode_pages_range() 以使页面缓存的相关范围无效。但是，文件系统通常还需要更新其内部（和磁盘上）文件偏移量 -> 磁盘块映射的视图。在此更新完成之前，文件系统需要阻止缺页异常和读取操作从磁盘重新加载现已过时的页面缓存内容。由于 VFS 在从磁盘加载页面（filemap_fault()、filemap_read()、readahead 路径）时以共享模式获取 mapping->invalidate_lock，因此 fallocate 实现必须获取 invalidate_lock 以防止重新加载。

->copy_file_range 和 ->remap_file_range 实现需要针对操作运行时的文件数据修改进行序列化。要阻止通过 write(2) 及类似操作进行的更改，可以使用 inode->i_rwsem。要阻止在操作期间通过内存映射对文件内容进行的更改，文件系统必须获取 mapping->invalidate_lock 以与 ->page_mkwrite 协调。

dquot_operations¶

原型

int (*write_dquot) (struct dquot *);
int (*acquire_dquot) (struct dquot *);
int (*release_dquot) (struct dquot *);
int (*mark_dirty) (struct dquot *);
int (*write_info) (struct super_block *, int);

这些操作旨在或多或少地充当包装函数，以确保相对于文件系统的正确锁定，并调用通用配额操作。

文件系统应该从通用配额函数中获得什么

操作	FS 递归	调用时持有的锁
write_dquot	是	dqonoff_sem 或 dqptr_sem
acquire_dquot	是	dqonoff_sem 或 dqptr_sem
release_dquot	是	dqonoff_sem 或 dqptr_sem
mark_dirty	否
write_info	是	dqonoff_sem

FS 递归意味着从超级块操作调用 ->quota_read() 和 ->quota_write()。

有关配额锁定的更多详细信息，请参见 fs/dquot.c。

vm_operations_struct¶

原型

void (*open)(struct vm_area_struct *);
void (*close)(struct vm_area_struct *);
vm_fault_t (*fault)(struct vm_fault *);
vm_fault_t (*huge_fault)(struct vm_fault *, unsigned int order);
vm_fault_t (*map_pages)(struct vm_fault *, pgoff_t start, pgoff_t end);
vm_fault_t (*page_mkwrite)(struct vm_area_struct *, struct vm_fault *);
vm_fault_t (*pfn_mkwrite)(struct vm_area_struct *, struct vm_fault *);
int (*access)(struct vm_area_struct *, unsigned long, void*, int, int);

锁定规则

操作	mmap_lock	PageLocked(page)
open	write
close	read/write
fault	read	可以在页面锁定的状态下返回
huge_fault	可能-read
map_pages	可能-read
page_mkwrite	read	可以在页面锁定的状态下返回
pfn_mkwrite	read
access	read

->fault() 在即将对先前不存在的 pte 进行缺页处理（fault in）时调用。文件系统必须在 vm_fault 结构中找到并返回与传入的 “pgoff” 关联的页面。如果该页面可能被截断和/或无效，则文件系统必须锁定 invalidate_lock，然后确保该页面尚未被截断（invalidate_lock 将阻止后续截断），然后以 VM_FAULT_LOCKED 和页面锁定状态返回。VM 将解锁页面。

->huge_fault() 在不存在 PUD 或 PMD 条目时调用。这使文件系统有机会安装 PUD 或 PMD 大小的页面。文件系统还可以使用 ->fault 方法返回 PMD 大小的页面，因此可能不需要实现此函数。特别是，文件系统不应从 ->huge_fault() 调用 filemap_fault()。调用此方法时可能未持有 mmap_lock。

->map_pages() 在 VM 请求映射易于访问的页面时调用。文件系统应找到并映射与从 “start_pgoff” 到 “end_pgoff” 的偏移量关联的页面。调用 ->map_pages() 时会持有 RCU 锁，并且不得阻塞。如果无法在不阻塞的情况下访问页面，则文件系统应跳过它。文件系统应使用 set_pte_range() 设置页表条目。与页面关联的条目的指针在 vm_fault 结构中的 “pte” 字段中传递。其他偏移量的条目的指针应相对于 “pte” 进行计算。

->page_mkwrite() 在先前只读的 pte 即将变为可写时调用。文件系统同样必须确保不存在与截断/无效化的竞争，也不存在与 ->remap_file_range 或 ->copy_file_range 等操作的竞争，然后以页面锁定状态返回。通常，mapping->invalidate_lock 适合用于正确的序列化。如果页面已被截断，则文件系统不应像 ->fault() 处理程序那样查找新页面，而只需返回 VM_FAULT_NOPAGE，这将导致 VM 重试该缺页。

->pfn_mkwrite() 与 page_mkwrite 相同，但用于 pte 属于 VM_PFNMAP 或 VM_MIXEDMAP 且条目没有对应页面的情况。预期的返回值是 VM_FAULT_NOPAGE，或 VM_FAULT_ERROR 类型之一。在此调用之后的默认行为是使 pte 可读写，除非 pfn_mkwrite 返回错误。

->access() 在 access_process_vm() 中的 get_user_pages() 失败时调用，通常用于通过 /proc/pid/mem 或 ptrace 调试进程。此函数仅适用于 VM_IO | VM_PFNMAP VMA。

可疑的东西

（如果您破坏了某些东西或注意到它已损坏并且不自己修复它 - 至少将其放在此处）

Linux 内核

目录

本页

锁定¶

dentry_operations¶

inode_operations¶

xattr_handler 操作¶

super_operations¶

file_system_type¶

address_space_operations¶

file_lock_operations¶

lock_manager_operations¶

buffer_head¶

block_device_operations¶

file_operations¶

dquot_operations¶

vm_operations_struct¶