This gets rid of pdflush for bdi writeout and kupdated-style cleaning.
pdflush writeout suffers from lack of locality and also requires more
threads to handle the same workload, since it has to work in a
non-blocking fashion against each queue. This also introduces lumpy
behaviour and potential request starvation, since pdflush can be starved
for queue access if others are accessing it.

A sample ffsb workload that does random writes to files is about 8%
faster here on a simple SATA drive during the benchmark phase. File
layout also seems a LOT more smooth in vmstat:

 r  b swpd   free buff  cache si so bi     bo  in  cs us sy id wa
 0  1    0 608848 2652 375372  0  0  0  71024 604  24  1 10 48 42
 0  1    0 549644 2712 433736  0  0  0  60692 505  27  1  8 48 44
 1  0    0 476928 2784 505192  0  0  4  29540 553  24  0  9 53 37
 0  1    0 457972 2808 524008  0  0  0  54876 331  16  0  4 38 58
 0  1    0 366128 2928 614284  0  0  4  92168 710  58  0 13 53 34
 0  1    0 295092 3000 684140  0  0  0  62924 572  23  0  9 53 37
 0  1    0 236592 3064 741704  0  0  4  58256 523  17  0  8 48 44
 0  1    0 165608 3132 811464  0  0  0  57460 560  21  0  8 54 38
 0  1    0 102952 3200 873164  0  0  4  74748 540  29  1 10 48 41
 0  1    0  48604 3252 926472  0  0  0  53248 469  29  0  7 47 45

where vanilla tends to fluctuate a lot in the creation phase:

 r  b swpd   free buff  cache si so bi     bo  in  cs us sy id wa
 1  1    0 678716 5792 303380  0  0  0  74064 565  50  1 11 52 36
 1  0    0 662488 5864 319396  0  0  4    352 302 329  0  2 47 51
 0  1    0 599312 5924 381468  0  0  0  78164 516  55  0  9 51 40
 0  1    0 519952 6008 459516  0  0  4  78156 622  56  1 11 52 37
 1  1    0 436640 6092 541632  0  0  0  82244 622  54  0 11 48 41
 0  1    0 436640 6092 541660  0  0  0      8 152  39  0  0 51 49
 0  1    0 332224 6200 644252  0  0  4 102800 728  46  1 13 49 36
 1  0    0 274492 6260 701056  0  0  4  12328 459  49  0  7 50 43
 0  1    0 211220 6324 763356  0  0  0 106940 515  37  1 10 51 39
 1  0    0 160412 6376 813468  0  0  0   8224 415  43  0  6 49 45
 1  1    0  85980 6452 886556  0  0  4 113516 575  39  1 11 54 34
 0  2    0  85968 6452 886620  0  0  0   1640 158 211  0  0 46 54

A 10 disk test with btrfs performs 26% faster with per-bdi flushing. An
SSD based writeback test on XFS performs over 20% better as well, with
the throughput being very stable around 1GB/sec, where pdflush only
manages 750MB/sec and fluctuates wildly while doing so. Random buffered
writes to many files behave a lot better as well, as do random mmap'ed
writes.

A separate thread is added to sync the super blocks. In the long term,
adding sync_supers_bdi() functionality could get rid of this thread
again.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
178 lines
5.6 KiB
C
178 lines
5.6 KiB
C
/*
|
|
* include/linux/writeback.h
|
|
*/
|
|
#ifndef WRITEBACK_H
|
|
#define WRITEBACK_H
|
|
|
|
#include <linux/sched.h>
|
|
#include <linux/fs.h>
|
|
|
|
struct backing_dev_info;
|
|
|
|
extern spinlock_t inode_lock;
|
|
extern struct list_head inode_in_use;
|
|
extern struct list_head inode_unused;
|
|
|
|
/*
 * Yes, writeback.h requires sched.h.
 * It is included near the top of this header.
 */
|
|
static inline int task_is_pdflush(struct task_struct *task)
|
|
{
|
|
return task->flags & PF_FLUSHER;
|
|
}
|
|
|
|
/* Shorthand for task_is_pdflush() applied to `current'. */
#define current_is_pdflush()	(task_is_pdflush(current))
|
|
|
|
/*
|
|
* fs/fs-writeback.c
|
|
*/
|
|
/*
 * Selects how much waiting a writeback pass performs; stored in
 * writeback_control.sync_mode.
 */
enum writeback_sync_modes {
	WB_SYNC_NONE = 0,	/* Don't wait on anything */
	WB_SYNC_ALL  = 1,	/* Wait on every mapping */
};
|
|
|
|
/*
|
|
* A control structure which tells the writeback code what to do. These are
|
|
* always on the stack, and hence need no locking. They are always initialised
|
|
* in a manner such that unspecified fields are set to zero.
|
|
*/
|
|
struct writeback_control {
|
|
struct backing_dev_info *bdi; /* If !NULL, only write back this
|
|
queue */
|
|
struct super_block *sb; /* if !NULL, only write inodes from
|
|
this super_block */
|
|
enum writeback_sync_modes sync_mode;
|
|
unsigned long *older_than_this; /* If !NULL, only write back inodes
|
|
older than this */
|
|
long nr_to_write; /* Write this many pages, and decrement
|
|
this for each page written */
|
|
long pages_skipped; /* Pages which were not written */
|
|
|
|
/*
|
|
* For a_ops->writepages(): is start or end are non-zero then this is
|
|
* a hint that the filesystem need only write out the pages inside that
|
|
* byterange. The byte at `end' is included in the writeout request.
|
|
*/
|
|
loff_t range_start;
|
|
loff_t range_end;
|
|
|
|
unsigned nonblocking:1; /* Don't get stuck on request queues */
|
|
unsigned encountered_congestion:1; /* An output: a queue is full */
|
|
unsigned for_kupdate:1; /* A kupdate writeback */
|
|
unsigned for_reclaim:1; /* Invoked from the page allocator */
|
|
unsigned for_writepages:1; /* This is a writepages() call */
|
|
unsigned range_cyclic:1; /* range_start is cyclic */
|
|
unsigned more_io:1; /* more io to be dispatched */
|
|
/*
|
|
* write_cache_pages() won't update wbc->nr_to_write and
|
|
* mapping->writeback_index if no_nrwrite_index_update
|
|
* is set. write_cache_pages() may write more than we
|
|
* requested and we want to make sure nr_to_write and
|
|
* writeback_index are updated in a consistent manner
|
|
* so we use a single control to update them
|
|
*/
|
|
unsigned no_nrwrite_index_update:1;
|
|
};
|
|
|
|
/*
|
|
* fs/fs-writeback.c
|
|
*/
|
|
/* Used only through pointers in the prototypes below. */
struct bdi_writeback;

/*
 * inode_wait() is the action callback handed to wait_on_bit() by
 * wait_on_inode() and inode_sync_wait() in this header.
 */
int inode_wait(void *);

/* Per-super_block writeback entry points; both return a long. */
long writeback_inodes_sb(struct super_block *sb);
long sync_inodes_sb(struct super_block *sb);

/* Writeback driven by a caller-supplied writeback_control. */
void writeback_inodes_wbc(struct writeback_control *wbc);

/* Flusher-side entry points. */
long wb_do_writeback(struct bdi_writeback *wb, int force_wait);
void wakeup_flusher_threads(long nr_pages);
|
|
|
|
/* writeback.h requires fs.h; it is included near the top of this header as well. */
|
|
static inline void wait_on_inode(struct inode *inode)
|
|
{
|
|
might_sleep();
|
|
wait_on_bit(&inode->i_state, __I_LOCK, inode_wait,
|
|
TASK_UNINTERRUPTIBLE);
|
|
}
|
|
static inline void inode_sync_wait(struct inode *inode)
|
|
{
|
|
might_sleep();
|
|
wait_on_bit(&inode->i_state, __I_SYNC, inode_wait,
|
|
TASK_UNINTERRUPTIBLE);
|
|
}
|
|
|
|
|
|
/*
|
|
* mm/page-writeback.c
|
|
*/
|
|
void laptop_io_completion(void);
|
|
void laptop_sync_completion(void);
|
|
void throttle_vm_writeout(gfp_t gfp_mask);
|
|
|
|
/* These are exported to sysctl. */
|
|
/* Background writeback threshold: ratio and byte-count forms. */
extern int dirty_background_ratio;
extern unsigned long dirty_background_bytes;

/* Dirty threshold: ratio and byte-count forms. */
extern int vm_dirty_ratio;
extern unsigned long vm_dirty_bytes;

/*
 * NOTE(review): units are presumably centiseconds, judging by the name
 * of dirty_writeback_centisecs_handler() - confirm at the definition.
 */
extern unsigned int dirty_writeback_interval;
extern unsigned int dirty_expire_interval;

extern int vm_highmem_is_dirtyable;
extern int block_dump;
extern int laptop_mode;

extern unsigned long determine_dirtyable_memory(void);
|
|
|
|
/*
 * Forward declarations for the pointer types used by the handler
 * prototypes below.  They have to come before the first prototype: a
 * struct tag that is first seen inside a parameter list only has
 * prototype scope and would not match the real type.
 */
struct ctl_table;
struct file;

/*
 * sysctl handlers.  All of them share one signature:
 * (table, write, filp, buffer, lenp, ppos) and return an int.
 */
extern int dirty_background_ratio_handler(struct ctl_table *table, int write,
		struct file *filp, void __user *buffer, size_t *lenp,
		loff_t *ppos);
extern int dirty_background_bytes_handler(struct ctl_table *table, int write,
		struct file *filp, void __user *buffer, size_t *lenp,
		loff_t *ppos);
extern int dirty_ratio_handler(struct ctl_table *table, int write,
		struct file *filp, void __user *buffer, size_t *lenp,
		loff_t *ppos);
extern int dirty_bytes_handler(struct ctl_table *table, int write,
		struct file *filp, void __user *buffer, size_t *lenp,
		loff_t *ppos);
extern int dirty_writeback_centisecs_handler(struct ctl_table *table,
		int write, struct file *filp, void __user *buffer,
		size_t *lenp, loff_t *ppos);
|
|
|
|
/*
 * NOTE(review): pbackground, pdirty and pbdi_dirty look like output
 * parameters filled in for @bdi - confirm against the definition.
 */
void get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
		      unsigned long *pbdi_dirty, struct backing_dev_info *bdi);

void page_writeback_init(void);

/* Multi-page form; balance_dirty_pages_ratelimited() passes 1 here. */
void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
					unsigned long nr_pages_dirtied);
|
|
|
|
/*
 * Single-page convenience wrapper: forwards @mapping to
 * balance_dirty_pages_ratelimited_nr() with nr_pages_dirtied set to 1.
 */
static inline void
balance_dirty_pages_ratelimited(struct address_space *mapping)
{
	balance_dirty_pages_ratelimited_nr(mapping, 1);
}
|
|
|
|
/*
 * Callback type accepted by write_cache_pages() as its `writepage'
 * argument: takes a page, a writeback_control and an opaque data
 * pointer, and returns an int.
 */
typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
				void *data);
|
|
|
|
int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0);
|
|
int generic_writepages(struct address_space *mapping,
|
|
struct writeback_control *wbc);
|
|
int write_cache_pages(struct address_space *mapping,
|
|
struct writeback_control *wbc, writepage_t writepage,
|
|
void *data);
|
|
int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
|
|
int sync_page_range(struct inode *inode, struct address_space *mapping,
|
|
loff_t pos, loff_t count);
|
|
int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
|
|
loff_t pos, loff_t count);
|
|
void set_page_dirty_balance(struct page *page, int page_mkwrite);
|
|
void writeback_set_ratelimit(void);
|
|
|
|
/* pdflush.c */
|
|
/* Global so it can be exported to sysctl read-only. */
extern int nr_pdflush_threads;
|
|
|
|
|
|
#endif /* WRITEBACK_H */
|