writeback机制源码分析

来源：互联网收集：自由互联发布时间：2022-06-20

writeback相关数据结构与writeback相关的数据结构主要有： 1， backing_dev_info，该数据结构描述了backing_dev的所有信息，通常块设备的request queue中会包含backing_dev对象。 2， bdi_writeback，该数据

writeback相关数据结构

与writeback相关的数据结构主要有：

1. backing_dev_info：该数据结构描述了backing_dev的所有信息，通常块设备的request queue中会包含backing_dev对象。
2. bdi_writeback：该数据结构封装了writeback的内核线程以及需要操作的inode队列。
3. wb_writeback_work：该数据结构封装了writeback的工作任务。

各数据结构之间的关系如下图所示：

writeback机制源码分析_Cache

下面对各个数据结构做简要介绍。

bdi information

bdi对象在块设备添加的时候需要注册到系统的bdi队列中。对于ext3而言，在mount的时候需要将底层块设备的bdi对象关联到ext3 root_inode中。bdi对象数据结构定义如下：

struct backing_dev_info {

struct list_head bdi_list;

unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */

unsigned long state; /* Always use atomic bitops on this */

unsigned int capabilities; /* Device capabilities */

congested_fn *congested_fn; /* Function pointer if device is md/dm */

void *congested_data; /* Pointer to aux data for congested func */

char *name;

struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];

unsigned long bw_time_stamp; /* last time write bw is updated */

unsigned long dirtied_stamp;

unsigned long written_stamp; /* pages written at bw_time_stamp */

unsigned long write_bandwidth; /* the estimated write bandwidth */

unsigned long avg_write_bandwidth; /* further smoothed write bw */

* The base dirty throttle rate, re-calculated on every 200ms.

* All the bdi tasks' dirty rate will be curbed under it.

* @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit

* in small steps and is much more smooth/stable than the latter.

unsigned long dirty_ratelimit;

unsigned long balanced_dirty_ratelimit;

struct prop_local_percpu completions;

int dirty_exceeded;

unsigned int min_ratio;

unsigned int max_ratio, max_prop_frac;

struct bdi_writeback wb; /* default writeback info for this bdi，writeback对象 */

spinlock_t wb_lock; /* protects work_list */

/* 任务链表 */

struct list_head work_list;

struct device *dev;

/* 在laptop模式下应用的定时器 */

struct timer_list laptop_mode_wb_timer;

#ifdef CONFIG_DEBUG_FS

struct dentry *debug_dir;

struct dentry *debug_stats;

#endif

};

在bdi数据结构中定义了一个writeback对象，该对象是对writeback内核线程的描述，并且封装了需要处理的inode队列。在bdi数据结构中有一条work_list，该work队列维护了writeback内核线程需要处理的任务。如果该队列上没有work可以处理，那么writeback内核线程将会睡眠等待。

writeback

writeback对象封装了内核线程task以及需要处理的inode队列。当page cache/buffer cache需要刷新radix tree上的inode时，可以将该inode挂载到writeback对象的b_dirty队列上，然后唤醒writeback线程。在处理过程中，inode会被移到b_io队列上进行处理。多条链表的方式可以降低多线程之间的资源共享。writeback数据结构具体定义如下：

/*
 * Writeback context embedded in a bdi: the kernel thread that performs the
 * flushing together with the inode lists it works through.
 */
struct bdi_writeback {
	/* The backing device this writeback context belongs to. */
	struct backing_dev_info *bdi;
	unsigned int nr;

	/* When old data was last flushed. */
	unsigned long last_old_flush;
	/* When the bdi thread was last active. */
	unsigned long last_active;

	/* The writeback thread itself. */
	struct task_struct *task;
	/* Timer for waking the bdi thread up after a delay. */
	struct timer_list wakeup_timer;

	/* Inodes that are dirty. */
	struct list_head b_dirty;
	/* Inodes parked for writeback. */
	struct list_head b_io;
	/* Inodes parked for more writeback. */
	struct list_head b_more_io;
	/* Guards the three b_* lists above. */
	spinlock_t list_lock;
};

writeback work

wb_writeback_work数据结构是对writeback任务的封装，不同的任务可以采用不同的刷新策略。writeback线程的处理对象就是wb_writeback_work。如果wb_writeback_work队列为空，那么内核线程就可以睡眠了。wb_writeback_work的数据结构定义如下：

/*
 * One unit of writeback work handed to a writeback thread. Different work
 * items can select different flushing policies through the fields below.
 */
struct wb_writeback_work {
	/* NOTE(review): presumably the page budget for this item - confirm. */
	long nr_pages;
	/* The superblock object. */
	struct super_block *sb;
	unsigned long *older_than_this;
	enum writeback_sync_modes sync_mode;

	/* One-bit policy flags. */
	unsigned int tagged_writepages:1;
	unsigned int for_kupdate:1;
	unsigned int range_cyclic:1;
	unsigned int for_background:1;

	/* Why this writeback was initiated. */
	enum wb_reason reason;

	/* Link in the pending work list (bdi->work_list). */
	struct list_head list;
	/* Set when the caller waits; used to signal that the work finished. */
	struct completion *done;
};

writeback主要函数分析

writeback机制的主要函数包括如下两个方面：

1. 管理bdi对象，并且fork相应的writeback内核线程处理cache数据的刷新工作。
2. writeback内核线程处理函数，实现dirty page的刷新操作。

writeback线程管理

Linux中有一个内核守护线程，该线程用来管理系统bdi队列，并且负责为block device创建writeback thread。当bdi中有dirty page并且还没有为bdi分配内核线程的时候，bdi_forker_thread程序会为其分配线程资源；当一个writeback线程长时间处于空闲状态时，bdi_forker_thread程序会释放该线程资源。writeback线程管理程序分析如下：

static int bdi_forker_thread(void *ptr)

{

struct bdi_writeback *me = ptr;

current->flags |= PF_SWAPWRITE;

set_freezable();

* Our parent may run at a different priority, just set us to normal

set_user_nice(current, 0);

for (;;) {

struct task_struct *task = NULL;

struct backing_dev_info *bdi;

enum {

NO_ACTION, /* Nothing to do */

FORK_THREAD, /* Fork bdi thread */

KILL_THREAD, /* Kill inactive bdi thread */

} action = NO_ACTION;

* Temporary measure, we want to make sure we don't see

* dirty data on the default backing_dev_info

if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) {

del_timer(&me->wakeup_timer);

wb_do_writeback(me, 0);

}

spin_lock_bh(&bdi_lock);

* In the following loop we are going to check whether we have

* some work to do without any synchronization with tasks

* waking us up to do work for them. Set the task state here

* so that we don't miss wakeups after verifying conditions.

set_current_state(TASK_INTERRUPTIBLE);

/* 遍历所有的bdi对象，检查这些bdi是否存在脏数据，如果有脏数据，那么需要为其fork线程，然后做writeback操作 */

list_for_each_entry(bdi, &bdi_list, bdi_list) {

bool have_dirty_io;

if (!bdi_cap_writeback_dirty(bdi) ||

bdi_cap_flush_forker(bdi))

continue;

WARN(!test_bit(BDI_registered, &bdi->state),

"bdi %p/%s is not registered!\n", bdi, bdi->name);

/* 检查是否存在脏数据 */

have_dirty_io = !list_empty(&bdi->work_list) ||

wb_has_dirty_io(&bdi->wb);

* If the bdi has work to do, but the thread does not

* exist - create it.

if (!bdi->wb.task && have_dirty_io) {

* Set the pending bit - if someone will try to

* unregister this bdi - it'll wait on this bit.

/* 如果有脏数据，并且不存在线程，那么接下来做线程的FORK操作 */

set_bit(BDI_pending, &bdi->state);

action = FORK_THREAD;

break;

}

spin_lock(&bdi->wb_lock);

* If there is no work to do and the bdi thread was

* inactive long enough - kill it. The wb_lock is taken

* to make sure no-one adds more work to this bdi and

* wakes the bdi thread up.

/* 如果一个bdi长时间没有脏数据，那么执行线程的KILL操作，结束掉该bdi对应的writeback线程 */

if (bdi->wb.task && !have_dirty_io &&

time_after(jiffies, bdi->wb.last_active +

bdi_longest_inactive())) {

task = bdi->wb.task;

bdi->wb.task = NULL;

spin_unlock(&bdi->wb_lock);

set_bit(BDI_pending, &bdi->state);

action = KILL_THREAD;

break;

}

spin_unlock(&bdi->wb_lock);

}

spin_unlock_bh(&bdi_lock);

/* Keep working if default bdi still has things to do */

if (!list_empty(&me->bdi->work_list))

__set_current_state(TASK_RUNNING);

/* 执行线程的FORK和KILL操作 */

switch (action) {

case FORK_THREAD:

/* FORK一个bdi_writeback_thread线程，该线程的名字为flush-major:minor */

__set_current_state(TASK_RUNNING);

task = kthread_create(bdi_writeback_thread, &bdi->wb,

"flush-%s", dev_name(bdi->dev));

if (IS_ERR(task)) {

* If thread creation fails, force writeout of

* the bdi from the thread. Hopefully 1024 is

* large enough for efficient IO.

writeback_inodes_wb(&bdi->wb, 1024,

WB_REASON_FORKER_THREAD);

} else {

* The spinlock makes sure we do not lose

* wake-ups when racing with 'bdi_queue_work()'.

* And as soon as the bdi thread is visible, we

* can start it.

spin_lock_bh(&bdi->wb_lock);

bdi->wb.task = task;

spin_unlock_bh(&bdi->wb_lock);

wake_up_process(task);

}

bdi_clear_pending(bdi);

break;

case KILL_THREAD:

/* KILL一个线程 */

__set_current_state(TASK_RUNNING);

kthread_stop(task);

bdi_clear_pending(bdi);

break;

case NO_ACTION:

/* 如果没有可执行的动作，那么调度本线程睡眠一段时间 */

if (!wb_has_dirty_io(me) || !dirty_writeback_interval)

* There are no dirty data. The only thing we

* should now care about is checking for

* inactive bdi threads and killing them. Thus,

* let's sleep for longer time, save energy and

* be friendly for battery-driven devices.

schedule_timeout(bdi_longest_inactive());

else

schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));

try_to_freeze();

break;

}

return 0;

}

writeback线程

writeback线程是bdi_forker_thread 创建的，该线程的任务就是处理等待的数据回刷任务。线程处理函数为bdi_writeback_thread，其会调用wb_do_writeback函数完成具体操作，该函数分析如下：

long wb_do_writeback(struct bdi_writeback *wb, int force_wait)

{

struct backing_dev_info *bdi = wb->bdi;

struct wb_writeback_work *work;

long wrote = 0;

set_bit(BDI_writeback_running, &wb->bdi->state);

/* 处理等待的work，所有等待work pengding在bdi->work_list上 */

while ((work = get_next_work_item(bdi)) != NULL) {

* Override sync mode, in case we must wait for completion

* because this thread is exiting now.

if (force_wait)

work->sync_mode = WB_SYNC_ALL;

trace_writeback_exec(bdi, work);

/* 调用wb_writeback函数处理相应的inode */

wrote += wb_writeback(wb, work);

* Notify the caller of completion if this is a synchronous

* work item, otherwise just free it.

/* 通知上层软件，相应的work已经完成 */

if (work->done)

complete(work->done);

else

kfree(work);

}

* Check for periodic writeback, kupdated() style

/* 处理周期性的dirty page刷新作业，buffer cache就会走这条路径，在下面的函数中会创建work，并且调用wb_writeback函数进行处理 */

wrote += wb_check_old_data_flush(wb);

wrote += wb_check_background_flush(wb);

clear_bit(BDI_writeback_running, &wb->bdi->state);

return wrote;

}

小结

本文在linux-3.2的基础上对writeback代码进行了浏览。整体上来讲，writeback机制是比较简单的，其核心是通过一个常驻内核线程为bdi对象分配writeback线程，实现对cache中dirty page的数据回刷。

上一篇：将定制RPM包加入内部Yum Server
下一篇：没有了

网友评论

相关栏目

writeback机制源码分析

相关文章