核心数据结构
FGP Flags
/* fgf_t flags controlling __filemap_get_folio() behaviour. */
#define FGP_ACCESSED ((__force fgf_t)0x00000001)	/* mark folio accessed (LRU) */
#define FGP_LOCK ((__force fgf_t)0x00000002)	/* return the folio locked */
#define FGP_CREAT ((__force fgf_t)0x00000004)	/* allocate a folio on cache miss */
#define FGP_WRITE ((__force fgf_t)0x00000008)	/* the folio will be written to */
#define FGP_NOFS ((__force fgf_t)0x00000010)	/* clear __GFP_FS (avoid fs recursion) */
#define FGP_NOWAIT ((__force fgf_t)0x00000020)	/* don't block; lock failure -> -EAGAIN */
#define FGP_FOR_MMAP ((__force fgf_t)0x00000040)	/* mmap path: unlock after creation */
#define FGP_STABLE ((__force fgf_t)0x00000080)	/* wait for the folio to become stable (writeback done) */
/* The top 6 bits of the fgf_t encode the folio allocation order. */
#define FGF_GET_ORDER(fgf) (((__force unsigned)fgf) >> 26) /* top 6 bits */
#define FGP_WRITEBEGIN (FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE)
- FGP_ACCESSED: 标记页面为已访问(用于LRU)
- FGP_LOCK: 返回时页面已加锁
- FGP_CREAT: 未命中时创建新页面
- FGP_WRITE: 用于写入操作
- FGP_NOFS: 清除
__GFP_FS(避免递归文件系统操作)
- FGP_NOWAIT: 非阻塞模式,加锁失败返回
-EAGAIN
- FGP_FOR_MMAP: mmap场景,创建后解锁
- FGP_STABLE: 等待页面稳定(写回完成)
address_space
/* Per-file page-cache container, embedded in the inode as i_data. */
struct address_space {
struct inode *host; // owning inode (back pointer)
struct xarray i_pages; // XArray holding the cached pages (radix-tree based)
struct rw_semaphore invalidate_lock; // serialises page invalidation against I/O
gfp_t gfp_mask; // gfp mask used for page allocation
atomic_t i_mmap_writable; // count of writable memory mappings
struct rb_root_cached i_mmap; // rb-tree of VMAs mapping this file
unsigned long nrpages; // total number of pages in the cache
pgoff_t writeback_index; // index where cyclic writeback resumes
const struct address_space_operations *a_ops; // filesystem-provided operations
unsigned long flags; // AS_* flag bits
errseq_t wb_err; // writeback error sequence number
spinlock_t i_private_lock; // protects the private data below
struct list_head i_private_list; // filesystem private data list
struct rw_semaphore i_mmap_rwsem; // protects i_mmap
void * i_private_data; // filesystem private pointer
};
- host: 指向所属 inode,建立双向关联
- i_pages: XArray,存储页缓存页面的 radix tree
- 键:页面索引(pgoff_t)
- 值:folio/page 指针
- 标志:XA_FLAGS_LOCK_IRQ(IRQ 安全)、XA_FLAGS_ACCOUNT(内存统计)
- invalidate_lock: 保护页面失效操作,防止与 I/O 并发
- i_mmap: RB 树,记录该文件的所有内存映射(VMA)
- a_ops: 地址空间操作函数集,文件系统可自定义
- nrpages: 页缓存页面总数,用于统计和限制
- writeback_index: 写回起始索引,支持断点续写
Xarray
i_pages 的数据结构,用于管理 Pages Cache。XArray 是基于 Radix Tree(基数树)的实现,在 Linux 内核中体验更像一个无限增长的数组。
Xarray 根结构
/* Root of an XArray; embedded in address_space as i_pages. */
struct xarray {
spinlock_t xa_lock; /* protects the tree */
/* private: The rest of the data structure is not to be used directly. */
gfp_t xa_flags; /* XA_FLAGS_* behaviour flags */
void __rcu * xa_head; /* root node, or a single direct entry */
};
- xa_lock: 保护 XArray 的锁(IRQ 安全)
- xa_flags: 标志(如 XA_FLAGS_LOCK_IRQ、XA_FLAGS_ACCOUNT)
- xa_head: 指向根节点或直接条目(RCU 指针)
常量
/* Slots per xa_node: 2^4 = 16 with CONFIG_BASE_SMALL, else 2^6 = 64. */
#ifndef XA_CHUNK_SHIFT
#define XA_CHUNK_SHIFT (IS_ENABLED(CONFIG_BASE_SMALL) ? 4 : 6)
#endif
#define XA_CHUNK_SIZE (1UL << XA_CHUNK_SHIFT)	/* number of slots per node */
#define XA_CHUNK_MASK (XA_CHUNK_SIZE - 1)	/* mask for the offset within a node */
- XA_CHUNK_SHIFT: 通常为 6(64 位系统)
- XA_CHUNK_SIZE: 64(表示了一个 xa_node 节点可以表示的最大数量槽位)
- XA_CHUNK_MASK: 63(0x3F,用于取模)
数据结构
XArray树结构(3级示例):
xa_head
│
▼
[xa_node (shift=12)]
│
┌─────────────┼─────────────┐
│ │ │
slots[0] slots[1] slots[2]
│ │ │
▼ ▼ ▼
[xa_node] [xa_node] NULL
(shift=6) (shift=6)
│ │
┌───┼───┐ ┌───┼───┐
│ │ │ │ │ │
slots[0] slots[1] slots[2] slots[3]
│ │ │ │ │ │
▼ ▼ ▼ ▼ ▼ ▼
folio folio folio folio folio
(0) (1) (2) (3) (4)
索引计算:
- index = 0: 0 >> 12 = 0, 0 >> 6 = 0, 0 & 63 = 0 → slots[0][0]
- index = 1: 1 >> 12 = 0, 1 >> 6 = 0, 1 & 63 = 1 → slots[0][1]
- index = 64: 64 >> 12 = 0, 64 >> 6 = 1, 64 & 63 = 0 → slots[1][0]
Node 结构
/*
 * Interior node of the XArray.  Each node holds XA_CHUNK_SIZE slots;
 * 'shift' is the number of index bits consumed by the levels below it.
 */
struct xa_node {
unsigned char shift; /* Bits remaining in each slot */
unsigned char offset; /* Slot offset in parent */
unsigned char count; /* Total entry count */
unsigned char nr_values; /* Value entry count */
struct xa_node __rcu *parent; /* NULL at top of tree */
struct xarray *array; /* The array we belong to */
union {
struct list_head private_list; /* For tree user */
struct rcu_head rcu_head; /* Used when freeing node */
};
void __rcu *slots[XA_CHUNK_SIZE]; /* child nodes or entries */
union {
unsigned long tags[XA_MAX_MARKS][XA_MARK_LONGS]; /* alias of marks (radix-tree era name) */
unsigned long marks[XA_MAX_MARKS][XA_MARK_LONGS]; /* per-mark bitmaps (e.g. DIRTY, WRITEBACK) */
};
};
- shift: 该层级剩余的位数(决定该节点覆盖的索引范围)
- shift = 0: 叶子节点,直接存储条目
- shift = 6: 覆盖 64 个索引(2^6 = 64)
- shift = 12: 覆盖 4096 个索引(2^12 = 4096)
- shift = 18: 覆盖 262144 个索引(2^18 = 262144)
- offset: 在父节点中的槽位偏移
- count: 非 NULL 槽位总数
- nr_values: 值条目数量
- parent: 父节点指针(根节点为 NULL)
- array: 所属的 XArray
- slots[]: 64 个槽位数组(每个槽位可指向子节点或条目)
- marks[]: 标记位图(用于 DIRTY、WRITEBACK 等)
Entry
Node 中 slots 存放的 Entry,有三种基本类型。
Entry (void *)
│
├─ [00] Pointer Entry (普通指针条目)
│ ├─ NULL Entry (空指针)
│ └─ 普通指针 (指向实际数据对象)
│
├─ [10] Internal Entry (内部条目)
│ ├─ Node Pointer (>4096)
│ ├─ Sibling Entry (0-62)
│ ├─ Retry Entry (256)
│ ├─ Zero Entry (257)
│ └─ Error Entry (-4094 ~ -2)
│
└─ [x1] Value Entry (值条目)
├─ 纯值条目 (低1位=1, 低2位=01)
└─ Tagged Pointer (低2位=11)
Pointer Entry(低两位为 00)
Null Entry
// NULL指针本身就是NULL entry
void *entry = NULL;
普通指针
指向实际数据或子节点。
// 示例:指向struct folio的指针
void *entry = folio; // 低2位 = 00
Internal Entry(低两位为10)
转化函数
/* Tag a node pointer as an internal entry by setting the low bits to 10. */
static inline void *xa_mk_node(const struct xa_node *node)
{
return (void *)((unsigned long)node | 2);
}
/* Private */
/* Strip the internal-entry tag to recover the xa_node pointer. */
static inline struct xa_node *xa_to_node(const void *entry)
{
return (struct xa_node *)((unsigned long)entry - 2);
}
Node Pointer:
指向下一层 xa_node
- 编码:xa_mk_internal(v),其中 v ≥ 1024(编码后的 entry 值 > 4096)
/* An internal entry whose value exceeds 4096 is a pointer to a child node;
 * smaller internal values are reserved for sibling/retry/zero/error entries. */
static inline bool xa_is_node(const void *entry)
{
return xa_is_internal(entry) && (unsigned long)entry > 4096;
}
Sibling Entry(兄弟条目)
- 大页面(multi-index entry)时,多个索引共享同一 folio
- 编码:xa_mk_internal(offset),offset 范围 0-62
假设一个64KB的大页面(order=4),占用16个4KB页面的索引:
索引100-115都指向同一个folio
存储方式:
i_pages[100] = folio指针 ← 规范槽位(canonical slot)
i_pages[101] = sibling(100) ← 指向槽位100
i_pages[102] = sibling(100)
...
i_pages[115] = sibling(100)
查找索引103时:
xas_descend() → 发现sibling(100)
→ 跳转到槽位100 → 返回folio指针
// Creation: encode a canonical slot offset as a sibling entry.
static inline void *xa_mk_sibling(unsigned int offset)
{
return xa_mk_internal(offset);
}
// Predicate: sibling entries are internal entries whose value is below
// xa_mk_sibling(XA_CHUNK_SIZE - 1), i.e. offsets 0..62 (multi-index only).
static inline bool xa_is_sibling(const void *entry)
{
return IS_ENABLED(CONFIG_XARRAY_MULTI) && xa_is_internal(entry) &&
(entry < xa_mk_sibling(XA_CHUNK_SIZE - 1));
}
// Extraction: recover the canonical slot offset from a sibling entry.
static inline unsigned long xa_to_sibling(const void *entry)
{
return xa_to_internal(entry);
}
Retry Entry (重试条目)
标记节点正在被修改或即将释放,提示 RCU 读者重试
- 编码:XA_RETRY_ENTRY = xa_mk_internal(256)
/* Entry an RCU reader may observe while a node is being dismantled;
 * the reader should restart its walk. */
#define XA_RETRY_ENTRY xa_mk_internal(256)
/**
 * xa_is_retry() - Is the entry a retry entry?
 * @entry: Entry retrieved from the XArray
 *
 * Return: %true if the entry is a retry entry.
 */
static inline bool xa_is_retry(const void *entry)
{
return unlikely(entry == XA_RETRY_ENTRY);
}
Zero Entry (零条目)
- 功能:预留索引,表示“已分配但值为 NULL”
- 编码:XA_ZERO_ENTRY = xa_mk_internal(257)
/* Placeholder meaning "index allocated but value is NULL". */
#define XA_ZERO_ENTRY xa_mk_internal(257)
/**
 * xa_is_zero() - Is the entry a zero entry?
 * @entry: Entry retrieved from the XArray
 *
 * The normal API will return NULL as the contents of a slot containing
 * a zero entry. You can only see zero entries by using the advanced API.
 *
 * Return: %true if the entry is a zero entry.
 */
static inline bool xa_is_zero(const void *entry)
{
return unlikely(entry == XA_ZERO_ENTRY);
}
/* Reserve @index by storing a zero entry there (only if currently NULL);
 * normal readers of the reserved slot still see NULL. */
static inline __must_check
int xa_reserve(struct xarray *xa, unsigned long index, gfp_t gfp)
{
return xa_err(xa_cmpxchg(xa, index, NULL, XA_ZERO_ENTRY, gfp));
}
Error Entry(错误条目)
- 功能:表示操作错误(如 ENOMEM)
- 编码:xa_mk_internal(负数),范围 -4094 到 -2
/* Error entries encode a negative errno as an internal entry;
 * they occupy the internal values from xa_mk_internal(-MAX_ERRNO) upward. */
static inline bool xa_is_err(const void *entry)
{
return unlikely(xa_is_internal(entry) &&
entry >= xa_mk_internal(-MAX_ERRNO));
}
Value Entry(最低位为 1)
值条目,用于存储 swap/shadow 条目或标记指针。
- 场景:内存不足时,Page Cache 中的页面被回收,但需要记录“这里曾经有页面”。
- 编码紧凑,可携带回收时间、节点ID、memcg ID 等信息
Value Entry(纯值条目)
/* Encode an unsigned long as a value entry: (v << 1) | 1.  A negative
 * input would lose its top bit in the shift, hence the WARN_ON. */
static inline void *xa_mk_value(unsigned long v)
{
WARN_ON((long)v < 0);
return (void *)((v << 1) | 1);
}
/**
 * xa_to_value() - Get value stored in an XArray entry.
 * @entry: XArray entry.
 *
 * Context: Any context.
 * Return: The value stored in the XArray entry.
 */
static inline unsigned long xa_to_value(const void *entry)
{
return (unsigned long)entry >> 1;
}
/**
 * xa_is_value() - Determine if an entry is a value.
 * @entry: XArray entry.
 *
 * Context: Any context.
 * Return: True if the entry is a value, false if it is a pointer.
 */
static inline bool xa_is_value(const void *entry)
{
return (unsigned long)entry & 1;
}
// Example
// Create a value entry
void *entry = xa_mk_value(123); // encode the integer 123 as a value entry
// Extract the value
unsigned long value = xa_to_value(entry); // returns 123
// Test whether it is a value entry
if (xa_is_value(entry)) {
// this is a value entry, not a pointer
}
Tagged Pointer(标记指针)
PerBDI
现代Linux内核中,bdflush已被per-BDI(Backing Device Info)的写回线程替代。每个BDI有一个或多个bdi_writeback结构,每个bdi_writeback对应一个写回线程(通过workqueue实现),由这一机制负责 Linux 的脏数据回写。
bdi_writeback (核心数据结构)
/*
 * Per-BDI (and, with CONFIG_CGROUP_WRITEBACK, per-cgroup) writeback state.
 * One bdi_writeback drives one writeback worker via the delayed work
 * item 'dwork'; the b_* lists hold the inodes it is responsible for.
 */
struct bdi_writeback {
struct backing_dev_info *bdi; /* our parent bdi */
unsigned long state; /* Always use atomic bitops on this */
unsigned long last_old_flush; /* last old data flush */
struct list_head b_dirty; /* dirty inodes */
struct list_head b_io; /* parked for writeback */
struct list_head b_more_io; /* parked for more writeback */
struct list_head b_dirty_time; /* time stamps are dirty */
spinlock_t list_lock; /* protects the b_* lists */
atomic_t writeback_inodes; /* number of inodes under writeback */
struct percpu_counter stat[NR_WB_STAT_ITEMS];
unsigned long bw_time_stamp; /* last time write bw is updated */
unsigned long dirtied_stamp;
unsigned long written_stamp; /* pages written at bw_time_stamp */
unsigned long write_bandwidth; /* the estimated write bandwidth */
unsigned long avg_write_bandwidth;/* further smoothed write bw, > 0 */
/*
* The base dirty throttle rate, re-calculated on every 200ms.
* All the bdi tasks' dirty rate will be curbed under it.
* @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit
* in small steps and is much more smooth/stable than the latter.
*/
unsigned long dirty_ratelimit;
unsigned long balanced_dirty_ratelimit;
struct fprop_local_percpu completions;
int dirty_exceeded;
enum wb_reason start_all_reason;
spinlock_t work_lock; /* protects work_list & dwork scheduling */
struct list_head work_list;
struct delayed_work dwork; /* work item used for writeback */
struct delayed_work bw_dwork; /* work item used for bandwidth estimate */
struct list_head bdi_node; /* anchored at bdi->wb_list */
#ifdef CONFIG_CGROUP_WRITEBACK
struct percpu_ref refcnt; /* used only for !root wb's */
struct fprop_local_percpu memcg_completions;
struct cgroup_subsys_state *memcg_css; /* the associated memcg */
struct cgroup_subsys_state *blkcg_css; /* and blkcg */
struct list_head memcg_node; /* anchored at memcg->cgwb_list */
struct list_head blkcg_node; /* anchored at blkcg->cgwb_list */
struct list_head b_attached; /* attached inodes, protected by list_lock */
struct list_head offline_node; /* anchored at offline_cgwbs */
union {
struct work_struct release_work;
struct rcu_head rcu;
};
#endif
};
- b_dirty: 脏inode列表
- b_io: 待写回的inode列表
- b_more_io: 需要更多IO的inode列表
- b_dirty_time: 仅时间戳脏的inode列表
- work_list: 待处理的写回工作列表
- dwork: 延迟工作项,绑定到wb_workfn
- work_lock: 保护work_list和dwork调度的锁
wb_writeback_work 写回工作任务
/* One unit of writeback work, queued on bdi_writeback->work_list. */
struct wb_writeback_work {
long nr_pages; /* number of pages to write back */
struct super_block *sb; /* if set, confine writeback to this superblock */
enum writeback_sync_modes sync_mode; /* WB_SYNC_ALL or WB_SYNC_NONE */
unsigned int tagged_writepages:1;
unsigned int for_kupdate:1; /* periodic (old-data) writeback */
unsigned int range_cyclic:1;
unsigned int for_background:1; /* background (threshold-driven) writeback */
unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */
unsigned int auto_free:1; /* free on completion */
enum wb_reason reason; /* why was writeback initiated? */
struct list_head list; /* pending work list */
struct wb_completion *done; /* set if the caller waits */
};
- nr_pages: 要写回的页面数
- sync_mode: 同步模式(WB_SYNC_ALL或WB_SYNC_NONE)
- for_background: 是否为后台写回
- for_sync: 是否为同步操作
- reason: 写回原因(如WB_REASON_PERIODIC、WB_REASON_VMSCAN等)
- list: 链接到wb->work_list
PageCaches 生命周期
┌─────────────────────────────────────────────────────────────────┐
│ 1. 创建阶段(Creation) │
│ │
│ 文件系统挂载/创建文件/打开文件 │
│ └─> alloc_inode() │
│ └─> kmem_cache_alloc(ext4_inode_cachep) │
│ └─> init_once() [slab构造函数] │
│ └─> inode_init_once() │
│ └─> __address_space_init_once() │
│ └─> xa_init_flags(&i_pages, ...) │
│ └─> i_pages初始化(空XArray) │
│ │
│ └─> inode_init_always() │
│ └─> 初始化address_space其他字段 │
│ └─> mapping->a_ops = &empty_aops │
│ └─> mapping->host = inode │
│ └─> nrpages = 0 │
└─────────────────────────────────────────────────────────────────┘
│
▼
┌─────────────────────────────────────────────────────────────────┐
│ 2. 使用阶段(Usage) │
│ │
│ 文件读取: │
│ └─> filemap_fault() / filemap_read() │
│ └─> filemap_get_folio() │
│ └─> 在i_pages中查找page │
│ ├─> 命中:直接返回 │
│ └─> 未命中: │
│ └─> __filemap_get_folio(FGP_CREAT) │
│ └─> 分配新page │
│ └─> a_ops->read_folio() │
│ └─> 从磁盘读取数据 │
│ └─> __filemap_add_folio() │
│ └─> xas_store(&i_pages, folio) │
│ └─> page加入i_pages │
│ └─> nrpages++ │
│ │
│ 文件写入: │
│ └─> generic_perform_write() │
│ └─> write_begin() │
│ └─> grab_cache_page_write_begin() │
│ └─> 查找或创建page │
│ └─> 写入数据到page │
│ └─> write_end() │
│ └─> mark_page_dirty() │
│ └─> 标记page为DIRTY │
│ │
│ 页缓存管理: │
│ ├─> 查找:xa_load(&i_pages, index) │
│ ├─> 添加:xas_store(&i_pages, folio) │
│ ├─> 删除:xas_store(&i_pages, NULL) │
│ └─> 标记:xa_set_mark(&i_pages, index, PAGECACHE_TAG_DIRTY) │
└─────────────────────────────────────────────────────────────────┘
│
▼
┌─────────────────────────────────────────────────────────────────┐
│ 3. 回收阶段(Reclaim) │
│ │
│ 内存压力触发: │
│ └─> kswapd / direct reclaim │
│ └─> shrink_node() │
│ └─> shrink_inode_list() │
│ └─> inode->i_mapping->a_ops->invalidate_folio() │
│ └─> 从i_pages中移除page │
│ └─> nrpages-- │
│ │
│ 文件删除/截断: │
│ └─> truncate_inode_pages() │
│ └─> 遍历i_pages,删除所有page │
│ └─> nrpages = 0 │
└─────────────────────────────────────────────────────────────────┘
│
▼
┌─────────────────────────────────────────────────────────────────┐
│ 4. 销毁阶段(Destruction) │
│ │
│ inode引用计数降为0: │
│ └─> iput(inode) │
│ └─> iput_final() │
│ └─> evict() │
│ └─> clear_inode() │
│ └─> truncate_inode_pages_final() │
│ └─> 清空i_pages中的所有page │
│ └─> 等待i_pages锁 │
│ └─> destroy_inode() │
│ └─> kmem_cache_free(ext4_inode_cachep) │
│ └─> i_pages随inode一起释放 │
│ │
│ 注意:i_pages本身是inode结构的一部分, │
│ 不需要单独释放 │
└─────────────────────────────────────────────────────────────────┘
address_space的创建初始化
Address_space 的创建(除 Swap Cache)主要分为两个板块(核心函数 alloc_inode)
- ops->alloc_inode(sb) 或 alloc_inode_sb() 分配 inode 结构体,并调用 inode_init_once 初始化 inode 以及 i_data( address_space)
- 后调用 inode_init_always_gfp 初始化 address_space 的其他属性(初始化 address_space 的运行时属性)
alloc_inode(sb)
│
├─> ops->alloc_inode(sb) 或 alloc_inode_sb()
│ └─> kmem_cache_alloc(ext4_inode_cachep, GFP_KERNEL)
│ │
│ └─> [slab分配器内部]
│ └─> 如果是新分配的对象:
│ └─> init_once(foo) [slab构造函数]
│ └─> inode_init_once(inode)
│ ├─> memset(inode, 0, ...) // 清零整个结构体
│ ├─> 初始化各种链表头
│ └─> __address_space_init_once(&inode->i_data)
│ └─> xa_init_flags(&mapping->i_pages, ...) // ✅ i_pages初始化
│
└─> inode_init_always(sb, inode)
└─> inode_init_always_gfp(sb, inode, GFP_NOFS)
├─> 初始化inode基础字段
└─> 初始化address_space运行时属性
├─> mapping->a_ops = &empty_aops
├─> mapping->host = inode
├─> mapping_set_gfp_mask(...)
└─> inode->i_mapping = mapping
调用链
路径1/2/3简述的是下方的调用链
场景1:ext4文件系统挂载,创建根inode
└─> ext4_iget(sb, EXT4_ROOT_INO, ...)
└─> iget_locked(sb, ino)
└─> alloc_inode(sb)
└─> ext4_alloc_inode()
└─> kmem_cache_alloc(ext4_inode_cachep)
└─> init_once()被调用(slab构造函数)
└─> inode_init_once()
└─> __address_space_init_once()
└─> xa_init_flags(&i_pages, ...) // i_pages初始化
└─> inode_init_always(sb, inode)
└─> inode_init_always_gfp()
└─> 初始化address_space其他字段
├─> mapping->a_ops = &empty_aops
├─> mapping->host = inode
├─> mapping_set_gfp_mask(...)
└─> inode->i_mapping = mapping
场景2:创建新文件
└─> ext4_new_inode()
└─> new_inode(sb)
└─> alloc_inode(sb)
└─> 同上,init_once()初始化i_pages
└─> inode_init_always()初始化其他字段
场景3:打开已存在文件
└─> ext4_iget()
└─> iget_locked()
└─> 如果inode不在缓存中:
└─> alloc_inode()
└─> 同上流程
路径1:文件系统挂载时创建根 inode(以 ext4 为例)
核心主要是调用到:inode_init_always函数
注意:inode_init_always_gfp 只初始化了 address_space 的基础字段,i_pages 的初始化在 inode_init_once() 中完成。
内核启动
└─> mount系统调用
└─> do_mount() [fs/namespace.c]
└─> path_mount() [fs/namespace.c]
└─> do_new_mount() [fs/namespace.c]
└─> vfs_get_tree() [fs/super.c]
└─> ext4_get_tree() [fs/ext4/super.c]
└─> get_tree_bdev() [fs/super.c]
└─> ext4_fill_super() [fs/ext4/super.c]
├─> 读取超级块、组描述符等元数据
├─> 初始化日志系统
└─> ext4_iget(sb, EXT4_ROOT_INO, EXT4_IGET_SPECIAL)
└─> __ext4_iget() [fs/ext4/inode.c:4779]
└─> iget_locked(sb, ino) [fs/inode.c:1403]
├─> alloc_inode(sb) [fs/inode.c:261]
│ └─> ext4_alloc_inode() [fs/ext4/super.c]
│ └─> kmem_cache_alloc(ext4_inode_cachep)
│ └─> 从slab分配器分配ext4_inode_info结构
│ └─> 包含内嵌的struct inode
│ └─> 此时inode->i_data还未初始化
│
└─> inode_init_always(sb, inode) [fs/inode.c:158]
└─> inode_init_always_gfp(sb, inode, GFP_NOFS)
├─> mapping = &inode->i_data
├─> mapping->a_ops = &empty_aops
├─> mapping->host = inode
├─> mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE)
├─> init_rwsem(&mapping->invalidate_lock)
└─> inode->i_mapping = mapping
└─> 注意:此时i_pages还未初始化!
└─> 设置I_NEW标志,加入inode哈希表
└─> __ext4_iget()继续执行
├─> __ext4_get_inode_loc_noinmem()
│ └─> 从磁盘读取inode元数据
├─> 填充inode的各个字段
└─> unlock_new_inode(inode)
└─> 清除I_NEW标志,唤醒等待者
路径2:创建新文件时创建 inode
用户空间:open("newfile", O_CREAT)
└─> open系统调用
└─> do_sys_open() [fs/open.c]
└─> do_filp_open() [fs/namei.c]
└─> path_openat() [fs/namei.c]
└─> do_open() [fs/namei.c]
└─> vfs_create() [fs/namei.c]
└─> ext4_create() [fs/ext4/namei.c]
└─> ext4_new_inode_start_handle() [fs/ext4/namei.c]
└─> __ext4_new_inode() [fs/ext4/ialloc.c:924]
├─> new_inode(sb) [fs/inode.c:1121]
│ └─> new_inode_pseudo(sb)
│ └─> alloc_inode(sb)
│ └─> ext4_alloc_inode()
│ └─> kmem_cache_alloc(ext4_inode_cachep)
│ └─> 分配时调用init_once回调
│ └─> inode_init_once() [fs/inode.c:424]
│ └─> __address_space_init_once(&inode->i_data)
│ └─> xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT)
│ └─> i_pages初始化完成!
│
└─> inode_init_always(sb, inode)
└─> 初始化address_space的其他字段
路径3:打开已经存在的文件时创建 inode
用户空间:open("existing_file", O_RDONLY)
└─> open系统调用
└─> do_sys_open()
└─> do_filp_open()
└─> path_openat()
└─> do_open()
└─> vfs_open()
└─> d_inode(path->dentry)
└─> 如果inode不在缓存中:
└─> ext4_lookup() [fs/ext4/namei.c]
└─> ext4_iget(sb, ino, EXT4_IGET_NORMAL)
└─> __ext4_iget()
└─> iget_locked(sb, ino)
└─> alloc_inode(sb)
└─> 同上,通过slab分配器分配
└─> inode_init_once()被调用
└─> i_pages初始化
Swap Cache 的 address_space 创建
内核启动或添加swap分区
└─> swapon系统调用
└─> sys_swapon() [mm/swapfile.c]
└─> setup_swap_extents() [mm/swapfile.c]
└─> init_swap_address_space() [mm/swap_state.c:710]
├─> 计算需要的address_space数量
│ └─> nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES)
├─> kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL)
│ └─> 分配多个address_space结构
└─> 对每个address_space初始化:
├─> xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ)
│ └─> 注意:没有XA_FLAGS_ACCOUNT标志
├─> atomic_set(&space->i_mmap_writable, 0)
├─> space->a_ops = &swap_aops
└─> mapping_set_no_writeback_tags(space)
└─> swap cache不使用writeback相关标记
alloc_inode(核心函数)
分配 inode 结构体,对 inode 进行初始化。这里我们可以很清晰看见,这里通过 ops->alloc_inode(sb)/alloc_inode_sb 进行 inode 结构体的获取,然后通过 inode_init_always 对 inode 进行初始化
/*
 * Allocate and initialise an inode for @sb.  The filesystem's
 * ->alloc_inode() is used when provided, otherwise the generic
 * inode_cachep slab.  If inode_init_always() fails, the partially
 * constructed inode is torn down and NULL is returned.
 */
static struct inode *alloc_inode(struct super_block *sb)
{
const struct super_operations *ops = sb->s_op;
struct inode *inode;
if (ops->alloc_inode)
inode = ops->alloc_inode(sb);
else
inode = alloc_inode_sb(sb, inode_cachep, GFP_KERNEL);
if (!inode)
return NULL;
/* Runtime initialisation; i_pages was already set up by the slab
 * constructor (inode_init_once). */
if (unlikely(inode_init_always(sb, inode))) {
if (ops->destroy_inode) {
ops->destroy_inode(inode);
if (!ops->free_inode)
return NULL;
}
/* Free via RCU so concurrent lookups never see freed memory. */
inode->free_inode = ops->free_inode;
i_callback(&inode->i_rcu);
return NULL;
}
return inode;
}
inode_init_once( i_pages 的初始化 )
i_pages 的初始化只有一个路径(在 alloc_inode 内部触发):通过 inode_init_once() → __address_space_init_once()。
/*
 * Slab constructor for inode objects: runs once per newly created slab
 * object (not on every allocation).  Zeroes the inode, sets up its list
 * heads, and initialises the embedded address_space (i_data) — this is
 * the one place i_pages gets initialised.
 */
void inode_init_once(struct inode *inode)
{
memset(inode, 0, sizeof(*inode));
INIT_HLIST_NODE(&inode->i_hash);
INIT_LIST_HEAD(&inode->i_devices);
INIT_LIST_HEAD(&inode->i_io_list);
INIT_LIST_HEAD(&inode->i_wb_list);
INIT_LIST_HEAD(&inode->i_lru);
INIT_LIST_HEAD(&inode->i_sb_list);
__address_space_init_once(&inode->i_data);
i_size_ordered_init(inode);
}
/* One-time setup of an address_space: initialise the i_pages XArray
 * (IRQ-safe lock, memory accounting) and the mmap-tracking structures. */
static void __address_space_init_once(struct address_space *mapping)
{
xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
init_rwsem(&mapping->i_mmap_rwsem);
INIT_LIST_HEAD(&mapping->i_private_list);
spin_lock_init(&mapping->i_private_lock);
mapping->i_mmap = RB_ROOT_CACHED;
}
xa_init_flags() 的作用:
- 初始化 XArray 的根节点(xa_head = NULL)
- 设置标志:XA_FLAGS_LOCK_IRQ(IRQ 安全锁)、XA_FLAGS_ACCOUNT(内存统计)
- 初始化 XArray 的内部锁
触发时机
inode_init_once() 作为 slab 分配器的构造函数,在从 slab 分配 inode 时自动调用:
ext4模块加载
└─> module_init(init_ext4_fs) [fs/ext4/super.c]
└─> ext4_init_inode_table() [fs/ext4/super.c]
└─> ext4_inode_cachep = kmem_cache_create_usercopy(
"ext4_inode_cache",
sizeof(struct ext4_inode_info),
0,
(SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT),
offsetof(struct ext4_inode_info, i_data),
sizeof_field(struct ext4_inode_info, i_data),
init_once) // 关键:init_once作为构造函数
└─> kmem_cache_create_usercopy() [mm/slab_common.c]
└─> __kmem_cache_create()
└─> 创建slab缓存,注册init_once回调
后续任何分配inode的操作:
└─> alloc_inode(sb) [fs/inode.c:261]
└─> ext4_alloc_inode(sb) [fs/ext4/super.c]
└─> kmem_cache_alloc(ext4_inode_cachep, GFP_KERNEL)
└─> slab_alloc() [mm/slub.c]
└─> 如果对象是新分配的(未初始化):
└─> slab_post_alloc_hook()
└─> 调用构造函数:init_once(foo)
└─> inode_init_once(inode)
└─> memset(inode, 0, sizeof(*inode))
└─> 清零整个inode结构
└─> INIT_HLIST_NODE(&inode->i_hash)
└─> INIT_LIST_HEAD(&inode->i_devices)
└─> INIT_LIST_HEAD(&inode->i_io_list)
└─> INIT_LIST_HEAD(&inode->i_wb_list)
└─> INIT_LIST_HEAD(&inode->i_lru)
└─> INIT_LIST_HEAD(&inode->i_sb_list)
└─> __address_space_init_once(&inode->i_data)
├─> xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT)
│ └─> 初始化XArray结构
│ ├─> mapping->i_pages.xa_head = NULL
│ ├─> mapping->i_pages.xa_flags = XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT
│ └─> 初始化xa_lock(IRQ安全的spinlock)
├─> init_rwsem(&mapping->i_mmap_rwsem)
├─> INIT_LIST_HEAD(&mapping->i_private_list)
├─> spin_lock_init(&mapping->i_private_lock)
└─> mapping->i_mmap = RB_ROOT_CACHED
└─> i_size_ordered_init(inode)
inode_init_always_gfp(初始化基础属性)
初始化 address_space 的一些基础属性
/*
 * Runtime (re)initialisation of an inode freshly taken from the slab.
 * Called on every allocation, unlike the slab constructor which runs
 * only once per object.  Sets the inode's base fields and the runtime
 * fields of the embedded address_space; i_pages itself was already
 * initialised by inode_init_once().  Returns 0 or -ENOMEM if the
 * security blob allocation fails.
 */
int inode_init_always_gfp(struct super_block *sb, struct inode *inode, gfp_t gfp)
{
static const struct inode_operations empty_iops;
static const struct file_operations no_open_fops = {.open = no_open};
struct address_space *const mapping = &inode->i_data;
inode->i_sb = sb;
inode->i_blkbits = sb->s_blocksize_bits;
inode->i_flags = 0;
inode->i_state = 0;
atomic64_set(&inode->i_sequence, 0);
atomic_set(&inode->i_count, 1);
inode->i_op = &empty_iops;
inode->i_fop = &no_open_fops;
inode->i_ino = 0;
/* Fixed from the original excerpt, which read "__i_nlink = 1;" — the
 * "inode->" receiver was lost in transcription (see fs/inode.c). */
inode->__i_nlink = 1;
inode->i_opflags = 0;
if (sb->s_xattr)
inode->i_opflags |= IOP_XATTR;
i_uid_write(inode, 0);
i_gid_write(inode, 0);
atomic_set(&inode->i_writecount, 0);
inode->i_size = 0;
inode->i_write_hint = WRITE_LIFE_NOT_SET;
inode->i_blocks = 0;
inode->i_bytes = 0;
inode->i_generation = 0;
inode->i_pipe = NULL;
inode->i_cdev = NULL;
inode->i_link = NULL;
inode->i_dir_seq = 0;
inode->i_rdev = 0;
inode->dirtied_when = 0;
#ifdef CONFIG_CGROUP_WRITEBACK
inode->i_wb_frn_winner = 0;
inode->i_wb_frn_avg_time = 0;
inode->i_wb_frn_history = 0;
#endif
spin_lock_init(&inode->i_lock);
lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
init_rwsem(&inode->i_rwsem);
lockdep_set_class(&inode->i_rwsem, &sb->s_type->i_mutex_key);
atomic_set(&inode->i_dio_count, 0);
/* Runtime fields of the embedded address_space. */
mapping->a_ops = &empty_aops;
mapping->host = inode;
mapping->flags = 0;
mapping->wb_err = 0;
atomic_set(&mapping->i_mmap_writable, 0);
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
atomic_set(&mapping->nr_thps, 0);
#endif
mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
mapping->i_private_data = NULL;
mapping->writeback_index = 0;
init_rwsem(&mapping->invalidate_lock);
lockdep_set_class_and_name(&mapping->invalidate_lock,
&sb->s_type->invalidate_lock_key,
"mapping.invalidate_lock");
if (sb->s_iflags & SB_I_STABLE_WRITES)
mapping_set_stable_writes(mapping);
inode->i_private = NULL;
inode->i_mapping = mapping;
INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */
#ifdef CONFIG_FS_POSIX_ACL
inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
#endif
#ifdef CONFIG_FSNOTIFY
inode->i_fsnotify_mask = 0;
#endif
inode->i_flctx = NULL;
if (unlikely(security_inode_alloc(inode, gfp)))
return -ENOMEM;
this_cpu_inc(nr_inodes);
return 0;
}
内核如何定位数据并映射到 Pages Cache
其实很好理解,获取文件偏移后比如 index: 0x10010,那么从 root entry 开始用 shift 逐级提取 slots 的下标。
比如: index: 0x10010,先获取根 entry,shift 为 12
slots_index = 0x10010 >> shift & 0x3f // 为 16 < XA_CHUNK_SIZE 合法
entry = node->slots[slots_index]
- 那么根据slots_index获取的entry再判断是否为node节点,如果为pointer entry/value entry/null entry就直接返回,如果是 node 节点就继续下降。很明显我们还没命中目标,就继续下降
继续下降,然后直到遇到了pointer entry/value entry/null entry 就返回。
// xas_load() 继续循环
xa_is_node(entry) = true // Node B是节点指针
node = xa_to_node(entry); // 转换为 Node B 结构
node->shift = 6
// xas_descend() 下降
offset = get_offset(0x10010, Node B)
= (0x10010 >> 6) & 0x3F
= (65552 >> 6) & 63
= 1024 & 63
= 0
// 检查范围
offset = 0 < XA_CHUNK_SIZE (64) ✓ 合法
// 读取slot
entry = Node B->slots[0] // 假设是 Node C (节点指针)
// 判断entry类型
xa_is_node(entry) = true // 是节点指针,继续下降
调用链
filemap_get_pages(iocb, count, fbatch, need_uptodate)
│
├─> 步骤1:计算页索引
│ ├─> index = iocb->ki_pos >> PAGE_SHIFT
│ └─> last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE)
│
├─> 步骤2:第一次批量查找
│ └─> filemap_get_read_batch(mapping, index, last_index - 1, fbatch)
│ │
│ ├─> XA_STATE(xas, &mapping->i_pages, index)
│ │ └─> 初始化XArray状态结构
│ │ ├─> xas.xa = &mapping->i_pages
│ │ ├─> xas.xa_index = index
│ │ └─> xas.xa_node = NULL
│ │
│ ├─> rcu_read_lock()
│ │ └─> 获取RCU读锁(允许并发读取)
│ │
│ └─> for (folio = xas_load(&xas); folio; folio = xas_next(&xas))
│ │
│ ├─> xas_load(&xas) [lib/xarray.c:236]
│ │ │
│ │ ├─> xas_start(&xas) [lib/xarray.c:181]
│ │ │ ├─> entry = xa_head(xas->xa)
│ │ │ │ └─> 获取XArray根条目(xa_head)
│ │ │ │
│ │ │ ├─> 如果xa_is_node(entry):
│ │ │ │ └─> 检查索引范围
│ │ │ │
│ │ │ └─> 返回entry
│ │ │
│ │ └─> while (xa_is_node(entry))
│ │ │
│ │ ├─> node = xa_to_node(entry)
│ │ │ └─> 转换为xa_node结构
│ │ │
│ │ └─> xas_descend(&xas, node) [lib/xarray.c:203]
│ │ │
│ │ ├─> offset = get_offset(xas->xa_index, node)
│ │ │ └─> offset = (index >> node->shift) & XA_CHUNK_MASK
│ │ │ └─> 计算在当前节点中的偏移
│ │ │
│ │ ├─> entry = xa_entry(xas->xa, node, offset)
│ │ │ └─> 从节点的slots数组中获取条目
│ │ │ └─> entry = node->slots[offset]
│ │ │
│ │ ├─> xas->xa_node = node
│ │ │ └─> 更新当前节点
│ │ │
│ │ ├─> while (xa_is_sibling(entry))
│ │ │ └─> 处理sibling条目
│ │ │
│ │ └─> xas->xa_offset = offset
│ │ └─> 更新偏移
│ │
│ ├─> xas_retry(&xas, folio) [include/linux/xarray.h:1526]
│ │ └─> 检查是否需要重试(并发修改检测)
│ │ └─> entry == XA_RETRY_ENTRY
│ │
│ ├─> xa_is_value(folio) [include/linux/xarray.h:83]
│ │ └─> (unsigned long)entry & 1
│ │ └─> 判断是否为值条目(swap/shadow)
│ │
│ ├─> xa_is_sibling(folio) [include/linux/xarray.h]
│ │ └─> 判断是否为sibling条目
│ │
│ ├─> folio_try_get(folio) [include/linux/mm.h]
│ │ └─> atomic_inc_not_zero(&folio->_refcount)
│ │ └─> 增加引用计数(防止被释放)
│ │
│ ├─> xas_reload(&xas) [include/linux/xarray.h:1595]
│ │ └─> 重新加载条目(验证并发修改)
│ │ └─> xas_load(&xas)
│ │
│ ├─> folio_batch_add(fbatch, folio)
│ │ └─> 将folio添加到批次
│ │
│ └─> xas_next(&xas) [lib/xarray.c]
│ ├─> xas_next_offset(&xas)
│ │ └─> xas->xa_offset++
│ │ └─> xas_move_index(&xas, xas->xa_offset)
│ │ └─> 移动到下一个索引
│ │
│ └─> xas_load(&xas)
│ └─> 加载下一个条目
│
│ └─> rcu_read_unlock()
│ └─> 释放RCU读锁
│
├─> 步骤3:如果未找到页面
│ ├─> 检查IOCB_NOIO标志
│ │
│ └─> page_cache_sync_readahead()
│ └─> 执行同步预读
│ └─> 可能触发页面创建
│
│ └─> 再次尝试批量查找
│ └─> filemap_get_read_batch()
│
├─> 步骤4:如果仍未找到
│ └─> filemap_create_folio()
│ ├─> filemap_alloc_folio()
│ │ └─> 分配新folio
│ │
│ ├─> filemap_add_folio()
│ │ └─> __filemap_add_folio()
│ │ ├─> xas_lock_irq(&xas)
│ │ ├─> xas_store(&xas, folio)
│ │ │ └─> 将folio加入i_pages
│ │ └─> xas_unlock_irq(&xas)
│ │
│ └─> filemap_read_folio()
│ └─> 从磁盘读取数据
│
└─> 步骤5:处理找到的页面
├─> folio_test_readahead()
├─> folio_test_uptodate()
└─> filemap_update_page()(如果需要)
流程图
┌─────────────────────────────────────────────────────────────┐
│ filemap_get_folio(mapping, index) │
└──────────────────────┬──────────────────────────────────────┘
│
▼
┌─────────────────────────────────────────────────────────────┐
│ __filemap_get_folio() │
│ - 初始化folio = NULL │
└──────────────────────┬──────────────────────────────────────┘
│
▼
┌─────────────────────────────────────────────────────────────┐
│ filemap_get_entry(mapping, index) │
│ - XA_STATE(xas, &mapping->i_pages, index) │
│ - rcu_read_lock() │
└──────────────────────┬──────────────────────────────────────┘
│
▼
┌─────────────────────────────────────────────────────────────┐
│ xas_reset(&xas) │
│ - xas->xa_node = XAS_RESTART │
└──────────────────────┬──────────────────────────────────────┘
│
▼
┌─────────────────────────────────────────────────────────────┐
│ xas_load(&xas) │
└──────────────────────┬──────────────────────────────────────┘
│
▼
┌─────────────────────────────────────────────────────────────┐
│ xas_start(&xas) │
│ ├─ xas_valid()? → 检查状态 │
│ ├─ xa_head(xas->xa) → 获取根节点 │
│ └─ 检查index范围 │
└──────────────────────┬──────────────────────────────────────┘
│
┌─────────┴─────────┐
│ │
xa_is_node(entry)? 不是节点
│ │
▼ ▼
┌──────────────────┐ ┌──────────────────┐
│ xa_to_node() │ │ 返回entry │
│ 转换节点指针 │ │ (可能是folio/NULL)│
└────────┬─────────┘ └──────────────────┘
│
▼
┌─────────────────────────────────────────────────────────────┐
│ xas_descend(&xas, node) │
│ ├─ get_offset(index, node) │
│ │ └─ (index >> node->shift) & XA_CHUNK_MASK │
│ ├─ xa_entry(node, offset) → 读取slot │
│ └─ while (xa_is_sibling(entry)) │
│ └─ 跳转到规范槽位 │
└──────────────────────┬──────────────────────────────────────┘
│
┌─────────┴─────────┐
│ │
xa_is_node(entry)? 不是节点
│ │
▼ ▼
继续下降 返回entry
│ │
└──────────┬──────────┘
│
▼
┌─────────────────────────────────────────────────────────────┐
│ xas_retry(&xas, entry) │
│ ├─ xa_is_zero()? → 返回true,重试 │
│ ├─ xa_is_retry()? → xas_reset(),返回true,重试 │
│ └─ 返回false,继续 │
└──────────────────────┬──────────────────────────────────────┘
│
┌─────────┴─────────┐
│ │
xa_is_value()? 不是value
│ │
▼ ▼
┌──────────────────┐ ┌──────────────────┐
│ 返回shadow/swap │ │ folio_try_get() │
│ entry,不增加引用│ │ 增加引用计数 │
└──────────────────┘ └────────┬──────────┘
│
▼
┌─────────────────────────────────────────────────────────────┐
│ xas_reload(&xas) │
│ - 重新读取slot,验证entry未改变 │
└──────────────────────┬──────────────────────────────────────┘
│
┌─────────┴─────────┐
│ │
entry改变了? 未改变
│ │
▼ ▼
folio_put() 返回folio
goto repeat rcu_read_unlock()
Xarray 的查询
┌─────────────────────────────────────────────────────────────────┐
│ XArray查询流程(xas_load) │
└─────────────────────────────────────────────────────────────────┘
XA_STATE初始化
│
├─> xas.xa = &mapping->i_pages
├─> xas.xa_index = index (目标索引)
└─> xas.xa_node = NULL
xas_start()
│
├─> entry = xa_head(xas->xa)
│ └─> 获取XArray根条目
│ │
│ ├─> 如果xa_head == NULL
│ │ └─> 返回NULL(空数组)
│ │
│ ├─> 如果xa_is_node(entry)
│ │ └─> 是节点,继续下降
│ │
│ └─> 否则
│ └─> 直接返回条目(单条目数组)
while (xa_is_node(entry))
│
├─> node = xa_to_node(entry)
│ └─> 转换为xa_node结构
│
├─> xas_descend(&xas, node)
│ │
│ ├─> offset = get_offset(xas->xa_index, node)
│ │ └─> offset = (index >> node->shift) & XA_CHUNK_MASK
│ │ └─> 计算在当前节点中的槽位偏移
│ │
│ ├─> entry = xa_entry(xas->xa, node, offset)
│ │ └─> entry = node->slots[offset]
│ │ └─> 从节点的slots数组获取条目
│ │
│ ├─> xas->xa_node = node
│ │ └─> 更新当前节点
│ │
│ ├─> while (xa_is_sibling(entry))
│ │ └─> 处理sibling条目(大页面的多槽位表示)
│ │
│ └─> xas->xa_offset = offset
│ └─> 更新偏移
│
└─> 返回entry
判断结果:
│
├─> entry == NULL
│ └─> ❌ 页面不存在
│
├─> xa_is_value(entry)
│ └─> ❌ 是值条目(swap/shadow),不是页面
│
├─> xa_is_node(entry)
│ └─> 继续下降(递归)
│
└─> entry是普通指针
└─> ✅ 页面存在,返回folio指针
get_offset ( index 到位段映射)
index的二进制表示:
[高位] ... [中间位] [低位]
│ │ │
│ │ └─> 叶子节点槽位(shift=0)
│ └──────────> 中间节点槽位(shift=6)
└────────────────────> 根节点槽位(shift=12)
/* Slot index of @index within @node: shift away the bits consumed by
 * lower levels, then mask down to the node's XA_CHUNK_SIZE slots. */
static unsigned int get_offset(unsigned long index, struct xa_node *node)
{
return (index >> node->shift) & XA_CHUNK_MASK;
}
Level 0 (根节点, shift=12):
offset = (0x1234 >> 12) & 0x3F = 0x1 & 0x3F = 1
→ 访问 slots[1]
Level 1 (中间节点, shift=6):
offset = (0x1234 >> 6) & 0x3F = 0x48 & 0x3F = 0x08 = 8
→ 访问 slots[8]
Level 2 (叶子节点, shift=0):
offset = (0x1234 >> 0) & 0x3F = 0x1234 & 0x3F = 0x34 = 52
→ 访问 slots[52]
步骤1:文件偏移到页索引的转换
- index = iocb->ki_pos >> PAGE_SHIFT:文件偏移转页索引
- last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE):计算结束页索引
- filemap_get_read_batch():在 i_pages 中批量查找页面
filemap_get_pages()
│
├─> 步骤1:计算页索引范围
│ ├─> index = iocb->ki_pos >> PAGE_SHIFT
│ └─> last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE)
│
├─> 步骤2:第一次尝试批量查找
│ └─> filemap_get_read_batch(mapping, index, last_index - 1, fbatch)
│ └─> 在i_pages中查找页面
│
├─> 步骤3:如果未找到页面
│ ├─> 检查IOCB_NOIO标志
│ ├─> 执行同步预读
│ │ └─> page_cache_sync_readahead()
│ └─> 再次尝试批量查找
│ └─> filemap_get_read_batch()
│
├─> 步骤4:如果仍未找到
│ └─> filemap_create_folio()
│ └─> 创建新页面并加入Pages Cache
│
├─> 步骤5:处理找到的页面
│ ├─> 检查readahead标志
│ ├─> 检查uptodate标志
│ └─> filemap_update_page()(如果需要)
│
└─> 返回0(成功)或错误码
/*
 * filemap_get_pages - Fill @fbatch with folios covering the read range.
 *
 * Computes the page-index range for iocb->ki_pos + @count, then:
 *  1. batch-looks the range up in the page cache;
 *  2. on a complete miss, kicks off synchronous readahead and retries;
 *  3. if still empty, creates a fresh folio and reads it in (unless the
 *     caller asked for non-blocking behaviour);
 *  4. finally makes sure the *last* folio of the batch is usable,
 *     triggering async readahead and/or bringing it uptodate.
 *
 * Returns 0 with at least one referenced folio in @fbatch, or a
 * negative errno.
 */
static int filemap_get_pages(struct kiocb *iocb, size_t count,
struct folio_batch *fbatch, bool need_uptodate)
{
struct file *filp = iocb->ki_filp;
struct address_space *mapping = filp->f_mapping;
struct file_ra_state *ra = &filp->f_ra;
pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
pgoff_t last_index;
struct folio *folio;
unsigned int flags;
int err = 0;
/* "last_index" is the index of the page beyond the end of the read */
last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE);
retry:
if (fatal_signal_pending(current))
return -EINTR;
/* First attempt: grab whatever contiguous folios are already cached. */
filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
if (!folio_batch_count(fbatch)) {
if (iocb->ki_flags & IOCB_NOIO)
return -EAGAIN;
/*
 * For NOWAIT, run readahead in noio scope so its allocations
 * cannot block on further I/O.
 */
if (iocb->ki_flags & IOCB_NOWAIT)
flags = memalloc_noio_save();
page_cache_sync_readahead(mapping, ra, filp, index,
last_index - index);
if (iocb->ki_flags & IOCB_NOWAIT)
memalloc_noio_restore(flags);
filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
}
if (!folio_batch_count(fbatch)) {
/* Readahead produced nothing; creating + reading would block. */
if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
return -EAGAIN;
err = filemap_create_folio(filp, mapping, iocb->ki_pos, fbatch);
if (err == AOP_TRUNCATED_PAGE)
goto retry;
return err;
}
/*
 * Only the last folio in the batch can be !uptodate or carry the
 * readahead flag (see filemap_get_read_batch()), so only it needs
 * post-processing here.
 */
folio = fbatch->folios[folio_batch_count(fbatch) - 1];
if (folio_test_readahead(folio)) {
err = filemap_readahead(iocb, filp, mapping, folio, last_index);
if (err)
goto err;
}
if (!folio_test_uptodate(folio)) {
/*
 * With usable folios already in the batch, don't queue an async
 * wait for the last one: degrade to a NOWAIT update attempt.
 */
if ((iocb->ki_flags & IOCB_WAITQ) && folio_batch_count(fbatch) > 1)
iocb->ki_flags |= IOCB_NOWAIT;
err = filemap_update_page(iocb, mapping, count, folio,
need_uptodate);
if (err)
goto err;
}
trace_mm_filemap_get_pages(mapping, index, last_index - 1);
return 0;
err:
/* Drop the reference on the folio we failed to make usable. */
if (err < 0)
folio_put(folio);
/* If other folios remain in the batch, report partial success. */
if (likely(--fbatch->nr))
return 0;
if (err == AOP_TRUNCATED_PAGE)
goto retry;
return err;
}
步骤2:通过页索引在 i_pages 中查找
- XA_STATE(xas, &mapping->i_pages, index):初始化 XArray 状态,定位到指定索引
- xas_load(&xas):从 XArray 加载条目
- xa_is_value(folio):判断是否为值条目(swap/shadow)
- folio_try_get():增加引用计数
/*
 * filemap_get_entry - Look up the page cache entry at @mapping, @index.
 *
 * Lockless RCU lookup in mapping->i_pages. Returns NULL if nothing is
 * stored there, a value entry (shadow/swap) without taking a reference,
 * or a folio with its refcount raised.
 */
void *filemap_get_entry(struct address_space *mapping, pgoff_t index)
{
XA_STATE(xas, &mapping->i_pages, index);
struct folio *folio;
rcu_read_lock();
repeat:
xas_reset(&xas);
folio = xas_load(&xas);
/* Retry entries mark an in-progress tree modification: restart. */
if (xas_retry(&xas, folio))
goto repeat;
/*
 * A shadow entry of a recently evicted page, or a swap entry from
 * shmem/tmpfs. Return it without attempting to raise page count.
 */
if (!folio || xa_is_value(folio))
goto out;
/* Refcount can already be zero if the folio is being freed. */
if (!folio_try_get(folio))
goto repeat;
/*
 * The slot may have been changed between the load and the refcount
 * bump; re-read it and retry if the folio moved.
 */
if (unlikely(folio != xas_reload(&xas))) {
folio_put(folio);
goto repeat;
}
out:
rcu_read_unlock();
return folio;
}
xas_load
/*
 * xas_load - Walk the XArray from the head to the entry for xas->xa_index.
 *
 * Iteratively descends through internal nodes until a non-node entry
 * (folio pointer, NULL, value entry, ...) or a leaf node is reached.
 */
void *xas_load(struct xa_state *xas)
{
void *entry = xas_start(xas);
while (xa_is_node(entry)) {
struct xa_node *node = xa_to_node(entry);
/* Stop if the walk has reached below the level the caller wants. */
if (xas->xa_shift > node->shift)
break;
entry = xas_descend(xas, node);
/* shift == 0 means @node is a leaf; @entry is the final slot. */
if (node->shift == 0)
break;
}
return entry;
}
- 从根开始查找
- 若 entry 是节点指针,则下降一层
- 直到找到叶子 entry(folio 指针、NULL、shadow 等)
Page Caches读取
无论是mmap映射文件还是通过open直接打开文件然后读取,都是拿到index后去查找PageCaches,如果没有命中PageCaches就创建 page ,再进行读取。
调用链
Read 路径
read()
└─> vfs_read()
└─> file->f_op->read_iter()
└─> generic_file_read_iter()
└─> filemap_read()
├─> filemap_get_pages()
│ ├─> filemap_get_read_batch() [从PageCache查找]
│ ├─> page_cache_sync_readahead() [缓存未命中时预读]
│ ├─> filemap_create_folio() [创建新页面]
│ │ └─> filemap_read_folio()
│ │ └─> mapping->a_ops->read_folio()
│ └─> filemap_update_page() [更新非最新页面]
│ └─> filemap_read_folio()
└─> copy_folio_to_iter() [复制到用户空间]
Mmap 缺页中断路径
用户访问mmap内存
└─> 触发缺页中断
└─> handle_mm_fault()
└─> do_fault()
└─> do_read_fault()
├─> do_fault_around() [fault-around预读]
└─> __do_fault()
└─> vma->vm_ops->fault()
└─> filemap_fault()
├─> filemap_get_folio() [查找PageCache]
├─> do_async_mmap_readahead() [异步预读]
├─> do_sync_mmap_readahead() [同步预读]
├─> __filemap_get_folio() [创建页面]
├─> lock_folio_maybe_drop_mmap() [加锁]
└─> filemap_read_folio() [读取数据]
└─> mapping->a_ops->read_folio()
预读路径
同步预读:
page_cache_sync_readahead()
└─> page_cache_ra_order()
└─> read_pages()
└─> mapping->a_ops->readahead()
└─> readahead_folio() [逐个获取页面]
异步预读:
page_cache_async_ra()
└─> page_cache_ra_order()
└─> read_pages()
└─> mapping->a_ops->readahead()
流程图
┌─────────────────────────────────────────────────────────────┐
│ PageCache读取流程 │
└─────────────────────────────────────────────────────────────┘
│
▼
┌─────────────────────────────────────┐
│ 读取请求(read/mmap) │
└─────────────────────────────────────┘
│
┌─────────────┴─────────────┐
│ │
▼ ▼
┌──────────────────┐ ┌──────────────────┐
│ read()系统调用 │ │ mmap缺页中断 │
└──────────────────┘ └──────────────────┘
│ │
▼ ▼
┌──────────────────┐ ┌──────────────────┐
│ filemap_read() │ │ filemap_fault() │
└──────────────────┘ └──────────────────┘
│ │
└─────────────┬─────────────┘
│
▼
┌─────────────────────┐
│ 查找PageCache │
│ filemap_get_folio() │
└─────────────────────┘
│
┌─────────────┴─────────────┐
│ │
┌──────▼──────┐ ┌─────────▼─────────┐
│ 页面在缓存中 │ │ 页面不在缓存中 │
└──────┬──────┘ └─────────┬─────────┘
│ │
┌───────────┴──────────┐ │
│ │ │
▼ ▼ ▼
┌──────────┐ ┌──────────────┐ ┌──────────────┐
│ 最新页面 │ │ 非最新页面 │ │ 触发预读 │
│ (uptodate)│ │ (!uptodate) │ │ readahead() │
└────┬─────┘ └──────┬───────┘ └──────┬───────┘
│ │ │
│ ▼ │
│ ┌─────────────────┐ │
│ │ filemap_read_ │ │
│ │ folio() │ │
│ └────────┬────────┘ │
│ │ │
└───────────────────┴─────────────────┘
│
▼
┌──────────────────────┐
│ 调用文件系统read_folio│
│ mapping->a_ops-> │
│ read_folio() │
└──────────┬───────────┘
│
▼
┌──────────────────────┐
│ 从磁盘读取数据到页面 │
│ 设置PG_uptodate标志 │
└──────────┬───────────┘
│
▼
┌──────────────────────┐
│ 返回数据给用户空间 │
│ (copy_folio_to_iter) │
└──────────────────────┘
read路径
read()
└─> vfs_read()
└─> file->f_op->read_iter()
└─> generic_file_read_iter()
└─> filemap_read()
├─> filemap_get_pages()
│ ├─> filemap_get_read_batch() [从PageCache查找]
│ ├─> page_cache_sync_readahead() [缓存未命中时预读]
│ ├─> filemap_create_folio() [创建新页面]
│ │ └─> filemap_read_folio()
│ │ └─> mapping->a_ops->read_folio()
│ └─> filemap_update_page() [更新非最新页面]
│ └─> filemap_read_folio()
└─> copy_folio_to_iter() [复制到用户空间]
filemap_read(核心)
- 从PageCache读取数据到用户空间
- 循环调用filemap_get_pages()获取页面批次
- 将页面数据复制到iov_iter
- 处理文件大小边界和可写映射的缓存一致性
/**
* filemap_read - Read data from the page cache.
* @iocb: The iocb to read.
* @iter: Destination for the data.
* @already_read: Number of bytes already read by the caller.
*
* Copies data from the page cache. If the data is not currently present,
* uses the readahead and read_folio address_space operations to fetch it.
*
* Return: Total number of bytes copied, including those already read by
* the caller. If an error happens before any bytes are copied, returns
* a negative error number.
*/
ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
ssize_t already_read)
{
struct file *filp = iocb->ki_filp;
struct file_ra_state *ra = &filp->f_ra;
struct address_space *mapping = filp->f_mapping;
struct inode *inode = mapping->host;
struct folio_batch fbatch;
int i, error = 0;
bool writably_mapped;
loff_t isize, end_offset;
/* Carries ra->prev_pos across the loop; written back on exit. */
loff_t last_pos = ra->prev_pos;
/* Reads at or past the filesystem-wide size limit return 0. */
if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
return 0;
if (unlikely(!iov_iter_count(iter)))
return 0;
/* Clamp the request so it cannot cross s_maxbytes. */
iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos);
folio_batch_init(&fbatch);
do {
cond_resched();
/*
 * If we've already successfully copied some data, then we
 * can no longer safely return -EIOCBQUEUED. Hence mark
 * an async read NOWAIT at that point.
 */
if ((iocb->ki_flags & IOCB_WAITQ) && already_read)
iocb->ki_flags |= IOCB_NOWAIT;
/* Stop at end-of-file. */
if (unlikely(iocb->ki_pos >= i_size_read(inode)))
break;
/* Get a batch of uptodate folios covering the request. */
error = filemap_get_pages(iocb, iter->count, &fbatch, false);
if (error < 0)
break;
/*
 * i_size must be checked after we know the pages are Uptodate.
 *
 * Checking i_size after the check allows us to calculate
 * the correct value for "nr", which means the zero-filled
 * part of the page is not copied back to userspace (unless
 * another truncate extends the file - this is desired though).
 */
isize = i_size_read(inode);
if (unlikely(iocb->ki_pos >= isize))
goto put_folios;
end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
/*
 * Once we start copying data, we don't want to be touching any
 * cachelines that might be contended:
 */
writably_mapped = mapping_writably_mapped(mapping);
/*
 * When a read accesses the same folio several times, only
 * mark it as accessed the first time.
 */
if (!pos_same_folio(iocb->ki_pos, last_pos - 1,
fbatch.folios[0]))
folio_mark_accessed(fbatch.folios[0]);
for (i = 0; i < folio_batch_count(&fbatch); i++) {
struct folio *folio = fbatch.folios[i];
size_t fsize = folio_size(folio);
/* Offset of ki_pos inside this (possibly multi-page) folio. */
size_t offset = iocb->ki_pos & (fsize - 1);
size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos,
fsize - offset);
size_t copied;
/* Folio starts beyond the readable range: done with batch. */
if (end_offset < folio_pos(folio))
break;
if (i > 0)
folio_mark_accessed(folio);
/*
 * If users can be writing to this folio using arbitrary
 * virtual addresses, take care of potential aliasing
 * before reading the folio on the kernel side.
 */
if (writably_mapped)
flush_dcache_folio(folio);
copied = copy_folio_to_iter(folio, offset, bytes, iter);
already_read += copied;
iocb->ki_pos += copied;
last_pos = iocb->ki_pos;
/* Short copy means the user buffer faulted. */
if (copied < bytes) {
error = -EFAULT;
break;
}
}
put_folios:
/* Drop the references taken by filemap_get_pages(). */
for (i = 0; i < folio_batch_count(&fbatch); i++)
folio_put(fbatch.folios[i]);
folio_batch_init(&fbatch);
} while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);
file_accessed(filp);
ra->prev_pos = last_pos;
/* Report bytes copied if any; otherwise the first error (or 0). */
return already_read ? already_read : error;
}
filemap_get_pages (获取页面批次)
- 从PageCache批量获取页面
- 缓存未命中时触发同步预读
- 仍无页面则创建新页面并读取
- 检查页面是否最新,必要时更新
/*
 * filemap_get_pages - Fill @fbatch with folios covering the read range.
 *
 * Computes the page-index range for iocb->ki_pos + @count, then:
 *  1. batch-looks the range up in the page cache;
 *  2. on a complete miss, kicks off synchronous readahead and retries;
 *  3. if still empty, creates a fresh folio and reads it in (unless the
 *     caller asked for non-blocking behaviour);
 *  4. finally makes sure the *last* folio of the batch is usable,
 *     triggering async readahead and/or bringing it uptodate.
 *
 * Returns 0 with at least one referenced folio in @fbatch, or a
 * negative errno.
 */
static int filemap_get_pages(struct kiocb *iocb, size_t count,
struct folio_batch *fbatch, bool need_uptodate)
{
struct file *filp = iocb->ki_filp;
struct address_space *mapping = filp->f_mapping;
struct file_ra_state *ra = &filp->f_ra;
pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
pgoff_t last_index;
struct folio *folio;
unsigned int flags;
int err = 0;
/* "last_index" is the index of the page beyond the end of the read */
last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE);
retry:
if (fatal_signal_pending(current))
return -EINTR;
/* First attempt: grab whatever contiguous folios are already cached. */
filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
if (!folio_batch_count(fbatch)) {
if (iocb->ki_flags & IOCB_NOIO)
return -EAGAIN;
/*
 * For NOWAIT, run readahead in noio scope so its allocations
 * cannot block on further I/O.
 */
if (iocb->ki_flags & IOCB_NOWAIT)
flags = memalloc_noio_save();
page_cache_sync_readahead(mapping, ra, filp, index,
last_index - index);
if (iocb->ki_flags & IOCB_NOWAIT)
memalloc_noio_restore(flags);
filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
}
if (!folio_batch_count(fbatch)) {
/* Readahead produced nothing; creating + reading would block. */
if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
return -EAGAIN;
err = filemap_create_folio(filp, mapping, iocb->ki_pos, fbatch);
if (err == AOP_TRUNCATED_PAGE)
goto retry;
return err;
}
/*
 * Only the last folio in the batch can be !uptodate or carry the
 * readahead flag (see filemap_get_read_batch()), so only it needs
 * post-processing here.
 */
folio = fbatch->folios[folio_batch_count(fbatch) - 1];
if (folio_test_readahead(folio)) {
err = filemap_readahead(iocb, filp, mapping, folio, last_index);
if (err)
goto err;
}
if (!folio_test_uptodate(folio)) {
/*
 * With usable folios already in the batch, don't queue an async
 * wait for the last one: degrade to a NOWAIT update attempt.
 */
if ((iocb->ki_flags & IOCB_WAITQ) && folio_batch_count(fbatch) > 1)
iocb->ki_flags |= IOCB_NOWAIT;
err = filemap_update_page(iocb, mapping, count, folio,
need_uptodate);
if (err)
goto err;
}
trace_mm_filemap_get_pages(mapping, index, last_index - 1);
return 0;
err:
/* Drop the reference on the folio we failed to make usable. */
if (err < 0)
folio_put(folio);
/* If other folios remain in the batch, report partial success. */
if (likely(--fbatch->nr))
return 0;
if (err == AOP_TRUNCATED_PAGE)
goto retry;
return err;
}
/*
 * filemap_get_read_batch - Get a batch of folios for read
 *
 * Get a batch of folios which represent a contiguous range of bytes in
 * the file. No exceptional entries will be returned. If @index is in
 * the middle of a folio, the entire folio will be returned. The last
 * folio in the batch may have the readahead flag set or the uptodate flag
 * clear so that the caller can take the appropriate action.
 */
/*
 * Lockless (RCU) batched lookup in mapping->i_pages. The walk stops at
 * the first gap, value entry, non-uptodate folio or readahead-marked
 * folio, so only the final folio of the batch needs special handling
 * by the caller.
 */
static void filemap_get_read_batch(struct address_space *mapping,
pgoff_t index, pgoff_t max, struct folio_batch *fbatch)
{
XA_STATE(xas, &mapping->i_pages, index);
struct folio *folio;
rcu_read_lock();
for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) {
if (xas_retry(&xas, folio))
continue;
/* Stop at the range end or at shadow/swap value entries. */
if (xas.xa_index > max || xa_is_value(folio))
break;
/* Sibling entries are extra slots of a large folio. */
if (xa_is_sibling(folio))
break;
/* Folio may be mid-free; if so restart from this index. */
if (!folio_try_get(folio))
goto retry;
/* Re-check the slot still holds @folio after taking the ref. */
if (unlikely(folio != xas_reload(&xas)))
goto put_folio;
if (!folio_batch_add(fbatch, folio))
break;
/* A !uptodate or readahead folio ends the batch (stays last). */
if (!folio_test_uptodate(folio))
break;
if (folio_test_readahead(folio))
break;
/* Skip over the remaining pages of a multi-page folio. */
xas_advance(&xas, folio_next_index(folio) - 1);
continue;
put_folio:
folio_put(folio);
retry:
xas_reset(&xas);
}
rcu_read_unlock();
}
filemap_create_folio(没有命中时创建)
/*
 * filemap_create_folio - Allocate a folio for @pos, insert it into the
 * page cache and read its contents in. Called when both the cache
 * lookup and readahead found nothing. On success the referenced folio
 * is added to @fbatch.
 */
static int filemap_create_folio(struct file *file,
struct address_space *mapping, loff_t pos,
struct folio_batch *fbatch)
{
struct folio *folio;
int error;
unsigned int min_order = mapping_min_folio_order(mapping);
pgoff_t index;
/* 1. Allocate a new folio at the mapping's minimum allowed order. */
folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order);
if (!folio)
return -ENOMEM;
/*
 * Protect against truncate / hole punch. Grabbing invalidate_lock
 * here assures we cannot instantiate and bring uptodate new
 * pagecache folios after evicting page cache during truncate
 * and before actually freeing blocks. Note that we could
 * release invalidate_lock after inserting the folio into
 * the page cache as the locked folio would then be enough to
 * synchronize with hole punching. But there are code paths
 * such as filemap_update_page() filling in partially uptodate
 * pages or ->readahead() that need to hold invalidate_lock
 * while mapping blocks for IO so let's hold the lock here as
 * well to keep locking rules simple.
 */
filemap_invalidate_lock_shared(mapping);
/* Round the index down to the folio-order boundary. */
index = (pos >> (PAGE_SHIFT + min_order)) << min_order;
/* 2. Insert the folio into the page cache. */
error = filemap_add_folio(mapping, folio, index,
mapping_gfp_constraint(mapping, GFP_KERNEL));
/* Lost the race with another inserter: caller retries the lookup. */
if (error == -EEXIST)
error = AOP_TRUNCATED_PAGE;
if (error)
goto error;
/* 3. Read the folio's contents in via the filesystem. */
error = filemap_read_folio(file, mapping->a_ops->read_folio, folio);
if (error)
goto error;
filemap_invalidate_unlock_shared(mapping);
folio_batch_add(fbatch, folio);
return 0;
error:
filemap_invalidate_unlock_shared(mapping);
folio_put(folio);
return error;
}
mmap 路径
用户访问mmap内存
└─> 触发缺页中断
└─> handle_mm_fault()
└─> do_fault()
└─> do_read_fault()
├─> do_fault_around() [fault-around预读]
└─> __do_fault()
└─> vma->vm_ops->fault()
└─> filemap_fault()
├─> filemap_get_folio() [查找PageCache]
├─> do_async_mmap_readahead() [异步预读]
├─> do_sync_mmap_readahead() [同步预读]
├─> __filemap_get_folio() [创建页面]
├─> lock_folio_maybe_drop_mmap() [加锁]
└─> filemap_read_folio() [读取数据]
└─> mapping->a_ops->read_folio()
filemap_fault (核心 缺页中断处理)
- 处理mmap访问触发的缺页中断
- 先查找PageCache,命中则尝试异步预读
- 未命中则同步预读并创建页面
- 页面非最新时调用read_folio读取
- 处理截断、锁竞争等边界情况
/*
 * filemap_fault - Page-cache fault handler for file-backed mmaps.
 *
 * Looks up the faulting index in the page cache. On a hit, optionally
 * starts async readahead; on a miss, does sync readahead and creates
 * the folio with FGP_CREAT|FGP_FOR_MMAP. The folio is then locked and
 * re-validated against truncation and i_size before being returned
 * locked in vmf->page. Non-uptodate folios are read in synchronously
 * via ->read_folio and the lookup is retried.
 */
vm_fault_t filemap_fault(struct vm_fault *vmf)
{
int error;
struct file *file = vmf->vma->vm_file;
struct file *fpin = NULL;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
pgoff_t max_idx, index = vmf->pgoff;
struct folio *folio;
vm_fault_t ret = 0;
bool mapping_locked = false;
/* Faults entirely beyond EOF raise SIGBUS. */
max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
if (unlikely(index >= max_idx))
return VM_FAULT_SIGBUS;
trace_mm_filemap_fault(mapping, index);
/*
 * Do we have something in the page cache already?
 */
folio = filemap_get_folio(mapping, index);
if (likely(!IS_ERR(folio))) {
/*
 * We found the page, so try async readahead before waiting for
 * the lock.
 */
if (!(vmf->flags & FAULT_FLAG_TRIED))
fpin = do_async_mmap_readahead(vmf, folio);
if (unlikely(!folio_test_uptodate(folio))) {
filemap_invalidate_lock_shared(mapping);
mapping_locked = true;
}
} else {
ret = filemap_fault_recheck_pte_none(vmf);
if (unlikely(ret))
return ret;
/* No page in the page cache at all */
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
ret = VM_FAULT_MAJOR;
fpin = do_sync_mmap_readahead(vmf);
retry_find:
/*
 * See comment in filemap_create_folio() why we need
 * invalidate_lock
 */
if (!mapping_locked) {
filemap_invalidate_lock_shared(mapping);
mapping_locked = true;
}
folio = __filemap_get_folio(mapping, index,
FGP_CREAT|FGP_FOR_MMAP,
vmf->gfp_mask);
if (IS_ERR(folio)) {
if (fpin)
goto out_retry;
filemap_invalidate_unlock_shared(mapping);
return VM_FAULT_OOM;
}
}
if (!lock_folio_maybe_drop_mmap(vmf, folio, &fpin))
goto out_retry;
/* Did it get truncated? */
if (unlikely(folio->mapping != mapping)) {
folio_unlock(folio);
folio_put(folio);
goto retry_find;
}
VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
/*
 * We have a locked folio in the page cache, now we need to check
 * that it's up-to-date. If not, it is going to be due to an error,
 * or because readahead was otherwise unable to retrieve it.
 */
if (unlikely(!folio_test_uptodate(folio))) {
/*
 * If the invalidate lock is not held, the folio was in cache
 * and uptodate and now it is not. Strange but possible since we
 * didn't hold the page lock all the time. Let's drop
 * everything, get the invalidate lock and try again.
 */
if (!mapping_locked) {
folio_unlock(folio);
folio_put(folio);
goto retry_find;
}
/*
 * OK, the folio is really not uptodate. This can be because the
 * VMA has the VM_RAND_READ flag set, or because an error
 * arose. Let's read it in directly.
 */
goto page_not_uptodate;
}
/*
 * We've made it this far and we had to drop our mmap_lock, now is the
 * time to return to the upper layer and have it re-find the vma and
 * redo the fault.
 */
if (fpin) {
folio_unlock(folio);
goto out_retry;
}
if (mapping_locked)
filemap_invalidate_unlock_shared(mapping);
/*
 * Found the page and have a reference on it.
 * We must recheck i_size under page lock.
 */
max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
if (unlikely(index >= max_idx)) {
folio_unlock(folio);
folio_put(folio);
return VM_FAULT_SIGBUS;
}
vmf->page = folio_file_page(folio, index);
return ret | VM_FAULT_LOCKED;
page_not_uptodate:
/*
 * Umm, take care of errors if the page isn't up-to-date.
 * Try to re-read it _once_. We do this synchronously,
 * because there really aren't any performance issues here
 * and we need to check for errors.
 */
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
error = filemap_read_folio(file, mapping->a_ops->read_folio, folio);
if (fpin)
goto out_retry;
folio_put(folio);
if (!error || error == AOP_TRUNCATED_PAGE)
goto retry_find;
filemap_invalidate_unlock_shared(mapping);
return VM_FAULT_SIGBUS;
out_retry:
/*
 * We dropped the mmap_lock, we need to return to the fault handler to
 * re-find the vma and come back and find our hopefully still populated
 * page.
 */
if (!IS_ERR(folio))
folio_put(folio);
if (mapping_locked)
filemap_invalidate_unlock_shared(mapping);
if (fpin)
fput(fpin);
return ret | VM_FAULT_RETRY;
}
EXPORT_SYMBOL(filemap_fault);
filemap_read_folio 读取单个页面
- 调用文件系统的read_folio读取页面
- 等待I/O完成
- 检查页面是否最新
- 记录内存压力统计
/*
 * filemap_read_folio - Synchronously read one folio in via @filler.
 *
 * @filler is typically mapping->a_ops->read_folio. The folio is
 * expected locked; the read unlocks it on completion. Returns 0 when
 * the folio ends up uptodate, an error from @filler or the wait, or
 * -EIO (after shrinking the readahead window) when the read completed
 * without making the folio uptodate.
 */
static int filemap_read_folio(struct file *file, filler_t filler,
struct folio *folio)
{
/* Refaulting a workingset folio counts as a memory stall for PSI. */
bool workingset = folio_test_workingset(folio);
unsigned long pflags;
int error;
/* Start the actual read. The read will unlock the page. */
if (unlikely(workingset))
psi_memstall_enter(&pflags);
error = filler(file, folio);
if (unlikely(workingset))
psi_memstall_leave(&pflags);
if (error)
return error;
/* Wait for the I/O to unlock the folio (killable sleep). */
error = folio_wait_locked_killable(folio);
if (error)
return error;
if (folio_test_uptodate(folio))
return 0;
/* Read finished but the folio is not uptodate: treat as I/O error. */
if (file)
shrink_readahead_size_eio(&file->f_ra);
return -EIO;
}
__filemap_get_folio(查找页面;未命中且指定 FGP_CREAT 时创建)
┌─────────────────────────────────────────┐
│ __filemap_get_folio(mapping, index, │
│ fgp_flags, gfp) │
└─────────────────────────────────────────┘
│
▼
┌─────────────────────┐
│ filemap_get_entry() │ ← 从radix tree查找
└──────────┬──────────┘
│
┌─────────┴─────────┐
│ │
找到 未找到
│ │
▼ ▼
┌──────────┐ ┌──────────────────┐
│检查类型 │ │ 检查FGP_CREAT? │
│xa_is_ │ └────────┬─────────┘
│value()? │ │
└────┬─────┘ │
│ ┌────┴────┐
否 是 否
│ │ │
▼ ▼ ▼
┌──────────┐ ┌──────────┐ ┌──────────┐
│处理已存在 │ │创建新页面 │ │返回ENOENT│
│页面 │ │ │ └──────────┘
└────┬─────┘ └────┬─────┘
│ │
│ ┌────────┴────────┐
│ │ 1. 确定order │
│ │ 2. 对齐index │
│ │ 3. 调整gfp标志 │
│ │ 4. 降级分配循环 │
│ │ - 分配folio │
│ │ - 加入PageCache│
│ │ - 失败则降级 │
│ └────────┬────────┘
│ │
│ ┌────────┴────────┐
│ │ filemap_add_ │
│ │ folio()返回 │
│ └────────┬────────┘
│ │
│ ┌────────┴────────┐
│ │ -EEXIST? │
│ │ 是→goto repeat │
│ │ 否→继续 │
│ └────────┬────────┘
│ │
│ ┌────────┴────────┐
│ │ FGP_FOR_MMAP? │
│ │ 是→解锁页面 │
│ └────────┬────────┘
│ │
└─────────────┴─────────┐
│
▼
┌────────────────┐
│ 返回folio │
│ (引用计数+1) │
└────────────────┘
- mapping: 地址空间(address_space),对应文件的PageCache
- index: 页面索引(页号)
- fgp_flags: 控制行为的标志位(见下文)
- gfp: 内存分配标志(仅在创建页面时使用)
/**
* __filemap_get_folio - Find and get a reference to a folio.
* @mapping: The address_space to search.
* @index: The page index.
* @fgp_flags: %FGP flags modify how the folio is returned.
* @gfp: Memory allocation flags to use if %FGP_CREAT is specified.
*
* Looks up the page cache entry at @mapping & @index.
*
* If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even
* if the %GFP flags specified for %FGP_CREAT are atomic.
*
* If this function returns a folio, it is returned with an increased refcount.
*
* Return: The found folio or an ERR_PTR() otherwise.
*/
struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
fgf_t fgp_flags, gfp_t gfp)
{
struct folio *folio;
repeat:
folio = filemap_get_entry(mapping, index);
/* Value entries (shadow/swap) count as a cache miss here. */
if (xa_is_value(folio))
folio = NULL;
if (!folio)
goto no_page;
if (fgp_flags & FGP_LOCK) {
if (fgp_flags & FGP_NOWAIT) {
/* Caller cannot block: fail with -EAGAIN if lock is held. */
if (!folio_trylock(folio)) {
folio_put(folio);
return ERR_PTR(-EAGAIN);
}
} else {
folio_lock(folio);
}
/* Has the page been truncated? */
if (unlikely(folio->mapping != mapping)) {
folio_unlock(folio);
folio_put(folio);
goto repeat;
}
VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
}
if (fgp_flags & FGP_ACCESSED)
folio_mark_accessed(folio);
else if (fgp_flags & FGP_WRITE) {
/* Clear idle flag for buffer write */
if (folio_test_idle(folio))
folio_clear_idle(folio);
}
/* FGP_STABLE: wait until any writeback of this folio completes. */
if (fgp_flags & FGP_STABLE)
folio_wait_stable(folio);
no_page:
if (!folio && (fgp_flags & FGP_CREAT)) {
unsigned int min_order = mapping_min_folio_order(mapping);
unsigned int order = max(min_order, FGF_GET_ORDER(fgp_flags));
int err;
index = mapping_align_index(mapping, index);
if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
gfp |= __GFP_WRITE;
/* FGP_NOFS: avoid recursing into filesystem reclaim. */
if (fgp_flags & FGP_NOFS)
gfp &= ~__GFP_FS;
if (fgp_flags & FGP_NOWAIT) {
gfp &= ~GFP_KERNEL;
gfp |= GFP_NOWAIT | __GFP_NOWARN;
}
/* Newly created folios must be returned locked (or FOR_MMAP). */
if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
fgp_flags |= FGP_LOCK;
if (order > mapping_max_folio_order(mapping))
order = mapping_max_folio_order(mapping);
/* If we're not aligned, allocate a smaller folio */
if (index & ((1UL << order) - 1))
order = __ffs(index);
/* Fall back to smaller orders until allocation + insert succeed. */
do {
gfp_t alloc_gfp = gfp;
err = -ENOMEM;
/* Large folios are opportunistic: don't retry or warn. */
if (order > min_order)
alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;
folio = filemap_alloc_folio(alloc_gfp, order);
if (!folio)
continue;
/* Init accessed so avoid atomic mark_page_accessed later */
if (fgp_flags & FGP_ACCESSED)
__folio_set_referenced(folio);
err = filemap_add_folio(mapping, folio, index, gfp);
if (!err)
break;
folio_put(folio);
folio = NULL;
} while (order-- > min_order);
/* Lost a race with a concurrent inserter: redo the lookup. */
if (err == -EEXIST)
goto repeat;
if (err) {
/*
 * When NOWAIT I/O fails to allocate folios this could
 * be due to a nonblocking memory allocation and not
 * because the system actually is out of memory.
 * Return -EAGAIN so that there caller retries in a
 * blocking fashion instead of propagating -ENOMEM
 * to the application.
 */
if ((fgp_flags & FGP_NOWAIT) && err == -ENOMEM)
err = -EAGAIN;
return ERR_PTR(err);
}
/*
 * filemap_add_folio locks the page, and for mmap
 * we expect an unlocked page.
 */
if (folio && (fgp_flags & FGP_FOR_MMAP))
folio_unlock(folio);
}
if (!folio)
return ERR_PTR(-ENOENT);
return folio;
}
EXPORT_SYMBOL(__filemap_get_folio);
Page Caches写入
和读取一样,有多种路径
调用链
Write 路径 (通过open打开的fd进行操作)
write()
└─> vfs_write()
└─> file->f_op->write_iter()
└─> generic_file_write_iter()
└─> __generic_file_write_iter()
├─> generic_file_direct_write() [直接IO路径]
└─> generic_perform_write() [缓冲IO路径]
├─> balance_dirty_pages_ratelimited() [脏页限流]
├─> a_ops->write_begin() [准备页面]
│ └─> __filemap_get_folio(FGP_WRITEBEGIN)
│ └─> 如果页面不存在或非最新,先读取
├─> copy_folio_from_iter_atomic() [复制数据]
└─> a_ops->write_end() [完成写入]
├─> folio_mark_uptodate()
├─> folio_mark_dirty() [标记脏页]
└─> 更新文件大小
Mmap 写入路径
用户写入mmap内存
└─> 触发写保护缺页中断
└─> handle_mm_fault()
└─> do_wp_page()
└─> do_page_mkwrite()
└─> vma->vm_ops->page_mkwrite()
└─> filemap_page_mkwrite()
└─> folio_mark_dirty() [标记脏页]
后台写回
脏数据回写会在后文单独给出
脏页达到阈值 / 显式同步
└─> balance_dirty_pages()
└─> 唤醒写回线程
└─> wb_workfn()
└─> wb_writeback()
└─> __writeback_single_inode()
└─> do_writepages()
├─> a_ops->writepages() [批量写回]
│ └─> write_cache_pages()
│ └─> writepage() [逐个写回]
└─> a_ops->writepage() [单个写回]
└─> 将页面数据写入磁盘
└─> folio_end_writeback()
流程图
┌─────────────────────────────────────────────────────────────┐
│ PageCache写入流程 │
└─────────────────────────────────────────────────────────────┘
│
┌─────────────┴─────────────┐
│ │
▼ ▼
┌──────────────────┐ ┌──────────────────┐
│ write()系统调用 │ │ mmap写入 │
└──────────────────┘ └──────────────────┘
│ │
▼ ▼
┌──────────────────┐ ┌──────────────────┐
│ generic_perform │ │ do_page_mkwrite() │
│ _write() │ └────────┬─────────┘
└────────┬─────────┘ │
│ │
▼ ▼
┌──────────────────┐ ┌──────────────────┐
│ 脏页限流检查 │ │ 标记页面为脏 │
│ balance_dirty_ │ │ folio_mark_dirty()│
│ pages_ratelimited│ └──────────────────┘
└────────┬─────────┘
│
┌────────┴────────┐
│ │
超过阈值 未超过
│ │
▼ ▼
┌──────────┐ ┌──────────────────┐
│ 限流等待 │ │ write_begin() │
│ 或唤醒 │ │ - 获取/创建页面 │
│ 写回线程 │ │ - 必要时先读取 │
└──────────┘ └────────┬─────────┘
│
▼
┌──────────────────┐
│ 复制数据到页面 │
│ copy_folio_from_ │
│ iter_atomic() │
└────────┬─────────┘
│
▼
┌──────────────────┐
│ write_end() │
│ - 标记uptodate │
│ - 标记dirty │
│ - 更新文件大小 │
└────────┬─────────┘
│
▼
┌──────────────────┐
│ 脏页积累 │
│ 达到阈值后触发 │
│ 后台写回 │
└────────┬─────────┘
│
▼
┌──────────────────┐
│ do_writepages() │
│ - writepages() │
│ - writepage() │
└────────┬─────────┘
│
▼
┌──────────────────┐
│ 写入磁盘 │
│ - 调用文件系统 │
│ 写入函数 │
│ - 清除dirty标志 │
│ - 标记writeback │
└──────────────────┘
Write 路径
write()
└─> vfs_write()
└─> file->f_op->write_iter()
└─> generic_file_write_iter()
└─> __generic_file_write_iter()
├─> generic_file_direct_write() [直接IO路径]
└─> generic_perform_write() [缓冲IO路径]
├─> balance_dirty_pages_ratelimited() [脏页限流]
├─> a_ops->write_begin() [准备页面]
│ └─> __filemap_get_folio(FGP_WRITEBEGIN)
│ └─> 如果页面不存在或非最新,先读取
├─> copy_folio_from_iter_atomic() [复制数据]
└─> a_ops->write_end() [完成写入]
├─> folio_mark_uptodate()
├─> folio_mark_dirty() [标记脏页]
└─> 更新文件大小
generic_perform_write(核心)
- 循环处理写入数据,每次处理一个chunk
- 调用balance_dirty_pages_ratelimited()进行脏页限流
- 调用write_begin()准备页面
- 从用户空间复制数据到页面
- 调用write_end()完成写入并标记脏页
- 处理部分写入和错误情况
/*
 * generic_perform_write - Buffered-write loop: copy user data into the
 * page cache one chunk at a time.
 *
 * For each chunk: throttle dirty-page production, fault in the source
 * user pages, have the filesystem prepare the target folio
 * (->write_begin), copy the data, then commit it (->write_end).
 * Returns the number of bytes written, or an error if nothing was.
 */
ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
{
struct file *file = iocb->ki_filp;
loff_t pos = iocb->ki_pos;
struct address_space *mapping = file->f_mapping;
const struct address_space_operations *a_ops = mapping->a_ops;
/* Process at most one (possibly large) folio per iteration. */
size_t chunk = mapping_max_folio_size(mapping);
long status = 0;
ssize_t written = 0;
do {
struct folio *folio;
size_t offset; /* Offset into folio */
size_t bytes; /* Bytes to write to folio */
size_t copied; /* Bytes copied from user */
void *fsdata = NULL;
bytes = iov_iter_count(i);
retry:
offset = pos & (chunk - 1);
bytes = min(chunk - offset, bytes);
/* Throttle the writer if it has dirtied too many pages. */
balance_dirty_pages_ratelimited(mapping);
/*
 * Bring in the user page that we will copy from _first_.
 * Otherwise there's a nasty deadlock on copying from the
 * same page as we're writing to, without it being marked
 * up-to-date.
 */
if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
status = -EFAULT;
break;
}
if (fatal_signal_pending(current)) {
status = -EINTR;
break;
}
status = a_ops->write_begin(file, mapping, pos, bytes,
&folio, &fsdata);
if (unlikely(status < 0))
break;
/* Re-derive offset/bytes from the folio actually returned. */
offset = offset_in_folio(folio, pos);
if (bytes > folio_size(folio) - offset)
bytes = folio_size(folio) - offset;
if (mapping_writably_mapped(mapping))
flush_dcache_folio(folio);
copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
flush_dcache_folio(folio);
status = a_ops->write_end(file, mapping, pos, bytes, copied,
folio, fsdata);
/* write_end() accepted fewer bytes than copied: rewind the iter. */
if (unlikely(status != copied)) {
iov_iter_revert(i, copied - max(status, 0L));
if (unlikely(status < 0))
break;
}
cond_resched();
if (unlikely(status == 0)) {
/*
 * A short copy made ->write_end() reject the
 * thing entirely. Might be memory poisoning
 * halfway through, might be a race with munmap,
 * might be severe memory pressure.
 */
if (chunk > PAGE_SIZE)
chunk /= 2;
if (copied) {
bytes = copied;
goto retry;
}
} else {
pos += status;
written += status;
}
} while (iov_iter_count(i));
/* Nothing written at all: propagate the last status/error. */
if (!written)
return status;
iocb->ki_pos += written;
return written;
}
EXPORT_SYMBOL(generic_perform_write);
write_begin (写入开始)
- 使用FGP_WRITEBEGIN标志获取或创建页面(已加锁)
- 如果页面不在PageCache或非最新,先读取
- 如果写入整个页面或超出EOF,可以跳过读取
- 返回页面和文件系统私有数据(fsdata)
/*
 * bch2_write_begin - bcachefs ->write_begin() implementation.
 *
 * Allocates a folio reservation (handed back via @fsdata for
 * ->write_end), grabs the target folio locked with FGP_WRITEBEGIN
 * (creating it if absent), and makes it ready for the copy: read in,
 * or zero-filled when the write covers the whole folio or the folio
 * lies at/past EOF so old contents don't matter.
 */
int bch2_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len,
struct folio **foliop, void **fsdata)
{
struct bch_inode_info *inode = to_bch_ei(mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch2_folio_reservation *res;
struct folio *folio;
unsigned offset;
int ret = -ENOMEM;
res = kmalloc(sizeof(*res), GFP_KERNEL);
if (!res)
return -ENOMEM;
bch2_folio_reservation_init(c, inode, res);
*fsdata = res;
bch2_pagecache_add_get(inode);
folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT,
FGP_WRITEBEGIN | fgf_set_order(len),
mapping_gfp_mask(mapping));
if (IS_ERR_OR_NULL(folio))
goto err_unlock;
/* Clamp @len to what actually fits inside this folio. */
offset = pos - folio_pos(folio);
len = min_t(size_t, len, folio_end_pos(folio) - pos);
if (folio_test_uptodate(folio))
goto out;
/* If we're writing entire folio, don't need to read it in first: */
if (!offset && len == folio_size(folio))
goto out;
/* Write starts at folio start and reaches/passes EOF: zero the tail. */
if (!offset && pos + len >= inode->v.i_size) {
folio_zero_segment(folio, len, folio_size(folio));
flush_dcache_folio(folio);
goto out;
}
/* Folio lies entirely beyond EOF: zero around the written range. */
if (folio_pos(folio) >= inode->v.i_size) {
folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio));
flush_dcache_folio(folio);
goto out;
}
readpage:
ret = bch2_read_single_folio(folio, mapping);
if (ret)
goto err;
out:
ret = bch2_folio_set(c, inode_inum(inode), &folio, 1);
if (ret)
goto err;
ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len);
if (ret) {
if (!folio_test_uptodate(folio)) {
/*
 * If the folio hasn't been read in, we won't know if we
 * actually need a reservation - we don't actually need
 * to read here, we just need to check if the folio is
 * fully backed by uncompressed data:
 */
goto readpage;
}
goto err;
}
*foliop = folio;
return 0;
err:
folio_unlock(folio);
folio_put(folio);
err_unlock:
bch2_pagecache_add_put(inode);
kfree(res);
*fsdata = NULL;
return bch2_err_class(ret);
}
write_end 写入结束
- 如果未复制数据,直接解锁返回
- 将页面标记为最新(uptodate)
- 更新文件大小(如果扩展)
- 标记页面为脏(dirty),触发后续写回
- 解锁页面并返回实际复制的字节数
/*
 * fuse_write_end - FUSE ->write_end(): commit a buffered write.
 *
 * Marks the folio uptodate (zeroing the unwritten tail of the page
 * first if needed), extends i_size when the write went past it, and
 * dirties the folio so writeback will pick it up. Always unlocks and
 * releases the folio; returns the number of bytes accepted.
 */
static int fuse_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct folio *folio, void *fsdata)
{
struct inode *inode = folio->mapping->host;
/* Haven't copied anything? Skip zeroing, size extending, dirtying. */
if (!copied)
goto unlock;
pos += copied;
if (!folio_test_uptodate(folio)) {
/* Zero any unwritten bytes at the end of the page */
size_t endoff = pos & ~PAGE_MASK;
if (endoff)
folio_zero_segment(folio, endoff, PAGE_SIZE);
folio_mark_uptodate(folio);
}
/* Extend the in-core file size if this write grew the file. */
if (pos > inode->i_size)
i_size_write(inode, pos);
folio_mark_dirty(folio);
unlock:
folio_unlock(folio);
folio_put(folio);
return copied;
}
Mmap 路径
用户写入mmap内存
└─> 触发写保护缺页中断
└─> handle_mm_fault()
└─> do_wp_page()
└─> do_page_mkwrite()
└─> vma->vm_ops->page_mkwrite()
└─> filemap_page_mkwrite()
└─> folio_mark_dirty() [标记脏页]
Page Cache 写回(脏数据回写)
调用链
1. 脏页达到阈值触发后台写回
- 脏页超过background_thresh时,唤醒后台写回线程
- 脏页超过dirty_thresh时,限流写入进程并强制写回
- 通过balance_dirty_pages()检测并触发
写入进程产生脏页
└─> folio_mark_dirty()
└─> __mark_inode_dirty()
└─> inode_io_list_move_locked()
└─> wb_wakeup_delayed()
└─> mod_delayed_work()
└─> wb_workfn() [workqueue异步执行]
└─> wb_do_writeback()
└─> wb_writeback()
├─> queue_io() [将脏inode加入b_io队列]
└─> __writeback_inodes_wb() ⭐
└─> writeback_sb_inodes() ⭐
└─> writeback_single_inode() ⭐
└─> __writeback_single_inode() ⭐
└─> do_writepages() ⭐
└─> a_ops->writepages() ⭐
└─> write_cache_pages()
└─> a_ops->writepage() ⭐
2. 显式同步操作
- fsync()/fdatasync():同步单个文件
- sync()/syncfs():同步整个文件系统
- sync_file_range():同步文件指定范围
fsync() / fdatasync()
└─> vfs_fsync()
└─> file->f_op->fsync()
└─> file_write_and_wait_range()
└─> __filemap_fdatawrite_range()
└─> filemap_fdatawrite_wbc()
└─> do_writepages() ⭐ [直接调用,同步执行]
└─> a_ops->writepages() ⭐
└─> write_cache_pages()
└─> a_ops->writepage() ⭐
└─> filemap_fdatawait_range() [等待写回完成]
└─> folio_wait_writeback()
3. 内存压力触发写回
- 内存回收时,先写回脏页再回收
- 通过wakeup_flusher_threads(WB_REASON_VMSCAN)触发
内存回收 (vmscan)
└─> shrink_inactive_list()
└─> wakeup_flusher_threads(WB_REASON_VMSCAN)
└─> __wakeup_flusher_threads_bdi()
└─> wb_start_writeback()
└─> wb_wakeup()
└─> mod_delayed_work()
└─> wb_workfn() [workqueue异步执行]
└─> wb_do_writeback()
└─> wb_writeback()
├─> queue_io()
└─> __writeback_inodes_wb() ⭐
└─> writeback_sb_inodes() ⭐
└─> writeback_single_inode() ⭐
└─> __writeback_single_inode() ⭐
└─> do_writepages() ⭐
└─> a_ops->writepages() ⭐
└─> write_cache_pages()
└─> a_ops->writepage() ⭐
4. 定时写回(Periodic Writeback)
- 定期写回旧数据(kupdate风格)
- 默认每dirty_writeback_interval秒执行一次
定时器触发
└─> wb_check_old_data_flush()
└─> wb_writeback()
├─> queue_io() [选择旧脏页,dirtied_before参数]
└─> __writeback_inodes_wb() ⭐
└─> writeback_sb_inodes() ⭐
└─> writeback_single_inode() ⭐
└─> __writeback_single_inode() ⭐
└─> do_writepages() ⭐
└─> a_ops->writepages() ⭐
└─> write_cache_pages()
└─> a_ops->writepage() ⭐
5. 文件系统卸载
文件系统卸载
└─> sync_filesystem()
└─> sync_inodes_sb()
└─> writeback_inodes_sb()
└─> writeback_sb_inodes() ⭐
└─> writeback_single_inode() ⭐
└─> __writeback_single_inode() ⭐
└─> do_writepages() ⭐
└─> a_ops->writepages() ⭐
└─> write_cache_pages()
└─> a_ops->writepage() ⭐
6. 页面回收(Page Reclaim)
页面回收
└─> shrink_folio_list()
└─> folio_writeback()
└─> writeback_inodes_wb()
└─> __writeback_inodes_wb() ⭐
└─> writeback_sb_inodes() ⭐
└─> writeback_single_inode() ⭐
└─> __writeback_single_inode() ⭐
└─> do_writepages() ⭐
└─> a_ops->writepages() ⭐
└─> write_cache_pages()
└─> a_ops->writepage() ⭐
Per-BDI 写回机制
本质上这些不同路径都是在向 BDI 提交任务,或者直接调用 bdi 的写回函数
工作队列初始化
功能:创建全局写回工作队列bdi_wq,用于执行所有写回工作。
/*
 * Create the global "writeback" workqueue (bdi_wq) on which every
 * per-bdi writeback work item runs.  WQ_UNBOUND because the work is
 * not CPU-local, WQ_MEM_RECLAIM because writeback must be able to make
 * progress under memory pressure, WQ_SYSFS to expose its tunables.
 * Returns 0 on success, -ENOMEM if the workqueue cannot be allocated.
 */
static int __init default_bdi_init(void)
{
	bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND |
				 WQ_SYSFS, 0);
	return bdi_wq ? 0 : -ENOMEM;
}
subsys_initcall(default_bdi_init);
bdi_writeback 初始化
- 初始化inode列表(b_dirty、b_io、b_more_io、b_dirty_time)
- 初始化工作列表(work_list)
- 初始化延迟工作项(dwork绑定到wb_workfn)
- 初始化写回带宽和限流参数
/*
 * Initialize one bdi_writeback instance.
 *
 * Zeroes the whole structure, then sets up:
 *  - the four inode queues: b_dirty, b_io, b_more_io, b_dirty_time
 *  - the list_lock / work_lock spinlocks and the work_list
 *  - the delayed work items: dwork -> wb_workfn (writeback execution),
 *    bw_dwork -> wb_update_bandwidth_workfn (bandwidth estimation)
 *  - bandwidth / dirty-ratelimit estimates, seeded with INIT_BW and
 *    refined at runtime
 *  - the per-cpu completions counter and the WB_* stat counters
 *
 * Returns 0 on success or a negative errno.  If the stat counters fail
 * to allocate, the already-initialized completions counter is torn
 * down again so nothing leaks.
 */
static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
gfp_t gfp)
{
int err;
/* Start from a clean slate; fields not set below stay zero/NULL. */
memset(wb, 0, sizeof(*wb));
wb->bdi = bdi;
/* Baseline timestamp for periodic (kupdate-style) flushing */
wb->last_old_flush = jiffies;
INIT_LIST_HEAD(&wb->b_dirty);
INIT_LIST_HEAD(&wb->b_io);
INIT_LIST_HEAD(&wb->b_more_io);
INIT_LIST_HEAD(&wb->b_dirty_time);
spin_lock_init(&wb->list_lock);
atomic_set(&wb->writeback_inodes, 0);
wb->bw_time_stamp = jiffies;
/* Seed the throttling estimates; tuned dynamically later. */
wb->balanced_dirty_ratelimit = INIT_BW;
wb->dirty_ratelimit = INIT_BW;
wb->write_bandwidth = INIT_BW;
wb->avg_write_bandwidth = INIT_BW;
spin_lock_init(&wb->work_lock);
INIT_LIST_HEAD(&wb->work_list);
INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);
err = fprop_local_init_percpu(&wb->completions, gfp);
if (err)
return err;
err = percpu_counter_init_many(wb->stat, 0, gfp, NR_WB_STAT_ITEMS);
if (err)
/* Unwind the completions counter on failure — no leak. */
fprop_local_destroy_percpu(&wb->completions);
return err;
}
wb_do_writeback 执行写回操作
- 设置WB_writeback_running标志
- 从work_list获取工作项并执行
- 检查全量刷新、定时写回、后台写回
- 清除运行标志并返回已写回页面数
/*
 * Retrieve queued work items and do the writeback they describe, then
 * handle the implicit kinds of work: flush-everything requests,
 * periodic (kupdate-style) writeback, and background writeback.
 *
 * Returns the total number of pages written.  WB_writeback_running is
 * set in wb->state for the whole duration so other code can tell that
 * writeback is in progress.
 */
static long wb_do_writeback(struct bdi_writeback *wb)
{
struct wb_writeback_work *work;
long wrote = 0;
set_bit(WB_writeback_running, &wb->state);
/* Drain explicit work items queued on wb->work_list. */
while ((work = get_next_work_item(wb)) != NULL) {
trace_writeback_exec(wb, work);
wrote += wb_writeback(wb, work);
/* Free the work item or signal its waiter when done. */
finish_writeback_work(work);
}
/*
 * Check for a flush-everything request
 */
wrote += wb_check_start_all(wb);
/*
 * Check for periodic writeback, kupdated() style
 */
wrote += wb_check_old_data_flush(wb);
/* Flush if dirty pages exceed the background threshold. */
wrote += wb_check_background_flush(wb);
clear_bit(WB_writeback_running, &wb->state);
return wrote;
}