Diffstat (limited to 'release/src/linux/linux/fs/buffer.c')
-rw-r--r-- | release/src/linux/linux/fs/buffer.c | 3012 |
1 files changed, 3012 insertions, 0 deletions
diff --git a/release/src/linux/linux/fs/buffer.c b/release/src/linux/linux/fs/buffer.c
new file mode 100644
index 00000000..9ec1b055
--- /dev/null
+++ b/release/src/linux/linux/fs/buffer.c
@@ -0,0 +1,3012 @@
+/*
+ * linux/fs/buffer.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ */
+
+/*
+ * 'buffer.c' implements the buffer-cache functions. Race-conditions have
+ * been avoided by NEVER letting an interrupt change a buffer (except for the
+ * data, of course), but instead letting the caller do it.
+ */
+
+/* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
+
+/* Removed a lot of unnecessary code and simplified things now that
+ * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
+ */
+
+/* Speed up hash, lru, and free list operations.  Use gfp() for allocating
+ * hash table, use SLAB cache for buffer heads. -DaveM
+ */
+
+/* Added 32k buffer block sizes - these are required for older ARM systems.
+ * - RMK
+ */
+
+/* Thread it... -DaveM */
+
+/* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
+
+#include <linux/config.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/locks.h>
+#include <linux/errno.h>
+#include <linux/swap.h>
+#include <linux/swapctl.h>
+#include <linux/smp_lock.h>
+#include <linux/vmalloc.h>
+#include <linux/blkdev.h>
+#include <linux/sysrq.h>
+#include <linux/file.h>
+#include <linux/init.h>
+#include <linux/quotaops.h>
+#include <linux/iobuf.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/bitops.h>
+#include <asm/mmu_context.h>
+
+#define NR_RESERVED (10*MAX_BUF_PER_PAGE)
+#define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
+					     number of unused buffer heads */
+
+/* Anti-deadlock ordering:
+ *	lru_list_lock > hash_table_lock > unused_list_lock
+ */
+
+#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_inode_buffers)
+
+/*
+ * Hash table gook..
+ */
+static unsigned int bh_hash_mask;
+static unsigned int bh_hash_shift;
+static struct buffer_head **hash_table;
+static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
+
+static struct buffer_head *lru_list[NR_LIST];
+
+static spinlock_cacheline_t lru_list_lock_cacheline = {SPIN_LOCK_UNLOCKED};
+#define lru_list_lock  lru_list_lock_cacheline.lock
+
+static int nr_buffers_type[NR_LIST];
+static unsigned long size_buffers_type[NR_LIST];
+
+static struct buffer_head * unused_list;
+static int nr_unused_buffer_heads;
+static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
+static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
+
+static int grow_buffers(kdev_t dev, unsigned long block, int size);
+static int osync_buffers_list(struct list_head *);
+static void __refile_buffer(struct buffer_head *);
+
+/* This is used by some architectures to estimate available memory. */
+atomic_t buffermem_pages = ATOMIC_INIT(0);
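The anti-deadlock ordering documented above is load-bearing: whenever two of these locks are held at once, lru_list_lock is taken first, then hash_table_lock, then unused_list_lock. A minimal sketch of the convention, mirroring what remove_from_queues() does later in this file (detach_buffer() is an illustrative name, not a function in the patch):

	/* Illustrative only: take the global list locks in the documented
	 * anti-deadlock order before unlinking a buffer from both lists. */
	static void detach_buffer(struct buffer_head *bh)
	{
		spin_lock(&lru_list_lock);		/* outermost lock */
		write_lock(&hash_table_lock);		/* always nested inside lru_list_lock */
		__hash_unlink(bh);
		__remove_from_lru_list(bh);
		write_unlock(&hash_table_lock);
		spin_unlock(&lru_list_lock);
	}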
+/* Here is the parameter block for the bdflush process. If you add or
+ * remove any of the parameters, make sure to update kernel/sysctl.c
+ * and the documentation at linux/Documentation/sysctl/vm.txt.
+ */
+
+#define N_PARAM 9
+
+/* The dummy values in this structure are left in there for compatibility
+ * with old programs that play with the /proc entries.
+ */
+union bdflush_param {
+	struct {
+		int nfract;	/* Percentage of buffer cache dirty to
+				   activate bdflush */
+		int ndirty;	/* Maximum number of dirty blocks to write out per
+				   wake-cycle */
+		int dummy2;	/* old "nrefill" */
+		int dummy3;	/* unused */
+		int interval;	/* jiffies delay between kupdate flushes */
+		int age_buffer;	/* Time for normal buffer to age before we flush it */
+		int nfract_sync;/* Percentage of buffer cache dirty to
+				   activate bdflush synchronously */
+		int nfract_stop_bdflush; /* Percentage of buffer cache dirty to stop bdflush */
+		int dummy5;	/* unused */
+	} b_un;
+	unsigned int data[N_PARAM];
+} bdf_prm = {{30, 500, 0, 0, 5*HZ, 30*HZ, 60, 20, 0}};
+
+/* These are the min and max parameter values that we will allow to be assigned */
+int bdflush_min[N_PARAM] = {  0,  1,    0,   0,  0,   1*HZ,   0, 0, 0};
+int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,10000*HZ, 10000*HZ, 100, 100, 0};
+
+void unlock_buffer(struct buffer_head *bh)
+{
+	clear_bit(BH_Wait_IO, &bh->b_state);
+	clear_bit(BH_Launder, &bh->b_state);
+	/*
+	 * When a locked buffer is visible to the I/O layer BH_Launder
+	 * is set. This means before unlocking we must clear BH_Launder,
+	 * mb() on alpha and then clear BH_Lock, so no reader can see
+	 * BH_Launder set on an unlocked buffer and then risk a deadlock.
+	 */
+	smp_mb__after_clear_bit();
+	clear_bit(BH_Lock, &bh->b_state);
+	smp_mb__after_clear_bit();
+	if (waitqueue_active(&bh->b_wait))
+		wake_up(&bh->b_wait);
+}
+
+/*
+ * Note that the real wait_on_buffer() is an inline function that checks
+ * that the buffer is locked before calling this, so that unnecessary disk
+ * unplugging does not occur.
+ */
+void __wait_on_buffer(struct buffer_head * bh)
+{
+	struct task_struct *tsk = current;
+	DECLARE_WAITQUEUE(wait, tsk);
+
+	get_bh(bh);
+	add_wait_queue(&bh->b_wait, &wait);
+	do {
+		run_task_queue(&tq_disk);
+		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+		if (!buffer_locked(bh))
+			break;
+		schedule();
+	} while (buffer_locked(bh));
+	tsk->state = TASK_RUNNING;
+	remove_wait_queue(&bh->b_wait, &wait);
+	put_bh(bh);
+}
+
+/*
+ * Default synchronous end-of-IO handler..  Just mark it up-to-date and
+ * unlock the buffer. This is what ll_rw_block uses too.
+ */
+void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
+{
+	mark_buffer_uptodate(bh, uptodate);
+	unlock_buffer(bh);
+	put_bh(bh);
+}
+
+/*
+ * The buffers have been marked clean and locked.  Just submit the dang
+ * things..
+ */
+static void write_locked_buffers(struct buffer_head **array, unsigned int count)
+{
+	do {
+		struct buffer_head * bh = *array++;
+		bh->b_end_io = end_buffer_io_sync;
+		submit_bh(WRITE, bh);
+	} while (--count);
+}
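write_locked_buffers() and end_buffer_io_sync() together form the file's basic write protocol: hold a reference, set b_end_io, submit, and let the completion handler unlock the buffer and drop the reference. A hedged sketch of the same protocol applied to one buffer, waited on synchronously (write_buffer_sync() is illustrative, not part of this patch):

	static int write_buffer_sync(struct buffer_head *bh)
	{
		lock_buffer(bh);
		if (!atomic_set_buffer_clean(bh)) {
			/* it wasn't dirty after all - nothing to write */
			unlock_buffer(bh);
			return 0;
		}
		get_bh(bh);			/* dropped by end_buffer_io_sync() */
		bh->b_end_io = end_buffer_io_sync;
		submit_bh(WRITE, bh);
		wait_on_buffer(bh);		/* end_buffer_io_sync() unlocks on completion */
		return buffer_uptodate(bh) ? 0 : -EIO;
	}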
+/*
+ * Write some buffers from the head of the dirty queue.
+ *
+ * This must be called with the LRU lock held, and will
+ * return without it!
+ */
+#define NRSYNC (32)
+static int write_some_buffers(kdev_t dev)
+{
+	struct buffer_head *next;
+	struct buffer_head *array[NRSYNC];
+	unsigned int count;
+	int nr;
+
+	next = lru_list[BUF_DIRTY];
+	nr = nr_buffers_type[BUF_DIRTY];
+	count = 0;
+	while (next && --nr >= 0) {
+		struct buffer_head * bh = next;
+		next = bh->b_next_free;
+
+		if (dev != NODEV && bh->b_dev != dev)
+			continue;
+		if (test_and_set_bit(BH_Lock, &bh->b_state))
+			continue;
+		if (atomic_set_buffer_clean(bh)) {
+			__refile_buffer(bh);
+			get_bh(bh);
+			array[count++] = bh;
+			if (count < NRSYNC)
+				continue;
+
+			spin_unlock(&lru_list_lock);
+			write_locked_buffers(array, count);
+			return -EAGAIN;
+		}
+		unlock_buffer(bh);
+		__refile_buffer(bh);
+	}
+	spin_unlock(&lru_list_lock);
+
+	if (count)
+		write_locked_buffers(array, count);
+	return 0;
+}
+
+/*
+ * Write out all buffers on the dirty list.
+ */
+static void write_unlocked_buffers(kdev_t dev)
+{
+	do {
+		spin_lock(&lru_list_lock);
+	} while (write_some_buffers(dev));
+}
+
+/*
+ * Wait for a buffer on the proper list.
+ *
+ * This must be called with the LRU lock held, and
+ * will return with it released.
+ */
+static int wait_for_buffers(kdev_t dev, int index, int refile)
+{
+	struct buffer_head * next;
+	int nr;
+
+	next = lru_list[index];
+	nr = nr_buffers_type[index];
+	while (next && --nr >= 0) {
+		struct buffer_head *bh = next;
+		next = bh->b_next_free;
+
+		if (!buffer_locked(bh)) {
+			if (refile)
+				__refile_buffer(bh);
+			continue;
+		}
+		if (dev != NODEV && bh->b_dev != dev)
+			continue;
+
+		get_bh(bh);
+		spin_unlock(&lru_list_lock);
+		wait_on_buffer (bh);
+		put_bh(bh);
+		return -EAGAIN;
+	}
+	spin_unlock(&lru_list_lock);
+	return 0;
+}
+
+static int wait_for_locked_buffers(kdev_t dev, int index, int refile)
+{
+	do {
+		spin_lock(&lru_list_lock);
+	} while (wait_for_buffers(dev, index, refile));
+	return 0;
+}
+
+/* Call sync_buffers with wait!=0 to ensure that the call does not
+ * return until all buffer writes have completed.  Sync() may return
+ * before the writes have finished; fsync() may not.
+ */
+
+/* Godamity-damn.  Some buffers (bitmaps for filesystems)
+ * spontaneously dirty themselves without ever brelse being called.
+ * We will ultimately want to put these in a separate list, but for
+ * now we search all of the lists for dirty buffers.
+ */
+int sync_buffers(kdev_t dev, int wait)
+{
+	int err = 0;
+
+	/* One pass for no-wait, three for wait:
+	 * 0) write out all dirty, unlocked buffers;
+	 * 1) wait for all dirty locked buffers;
+	 * 2) write out all dirty, unlocked buffers;
+	 * 3) wait for completion by waiting for all buffers to unlock.
+ */ + write_unlocked_buffers(dev); + if (wait) { + err = wait_for_locked_buffers(dev, BUF_DIRTY, 0); + write_unlocked_buffers(dev); + err |= wait_for_locked_buffers(dev, BUF_LOCKED, 1); + } + return err; +} + +int fsync_super(struct super_block *sb) +{ + kdev_t dev = sb->s_dev; + sync_buffers(dev, 0); + + lock_kernel(); + sync_inodes_sb(sb); + DQUOT_SYNC(dev); + lock_super(sb); + if (sb->s_dirt && sb->s_op && sb->s_op->write_super) + sb->s_op->write_super(sb); + unlock_super(sb); + unlock_kernel(); + + return sync_buffers(dev, 1); +} + +int fsync_no_super(kdev_t dev) +{ + sync_buffers(dev, 0); + return sync_buffers(dev, 1); +} + +int fsync_dev(kdev_t dev) +{ + sync_buffers(dev, 0); + + lock_kernel(); + sync_inodes(dev); + DQUOT_SYNC(dev); + sync_supers(dev); + unlock_kernel(); + + return sync_buffers(dev, 1); +} + +/* + * There's no real reason to pretend we should + * ever do anything differently + */ +void sync_dev(kdev_t dev) +{ + fsync_dev(dev); +} + +asmlinkage long sys_sync(void) +{ + fsync_dev(0); + return 0; +} + +/* + * filp may be NULL if called via the msync of a vma. + */ + +int file_fsync(struct file *filp, struct dentry *dentry, int datasync) +{ + struct inode * inode = dentry->d_inode; + struct super_block * sb; + kdev_t dev; + int ret; + + lock_kernel(); + /* sync the inode to buffers */ + write_inode_now(inode, 0); + + /* sync the superblock to buffers */ + sb = inode->i_sb; + lock_super(sb); + if (sb->s_op && sb->s_op->write_super) + sb->s_op->write_super(sb); + unlock_super(sb); + + /* .. finally sync the buffers to disk */ + dev = inode->i_dev; + ret = sync_buffers(dev, 1); + unlock_kernel(); + return ret; +} + +asmlinkage long sys_fsync(unsigned int fd) +{ + struct file * file; + struct dentry * dentry; + struct inode * inode; + int ret, err; + + ret = -EBADF; + file = fget(fd); + if (!file) + goto out; + + dentry = file->f_dentry; + inode = dentry->d_inode; + + ret = -EINVAL; + if (!file->f_op || !file->f_op->fsync) { + /* Why? We can still call filemap_fdatasync */ + goto out_putf; + } + + /* We need to protect against concurrent writers.. */ + down(&inode->i_sem); + ret = filemap_fdatasync(inode->i_mapping); + err = file->f_op->fsync(file, dentry, 0); + if (err && !ret) + ret = err; + err = filemap_fdatawait(inode->i_mapping); + if (err && !ret) + ret = err; + up(&inode->i_sem); + +out_putf: + fput(file); +out: + return ret; +} + +asmlinkage long sys_fdatasync(unsigned int fd) +{ + struct file * file; + struct dentry * dentry; + struct inode * inode; + int ret, err; + + ret = -EBADF; + file = fget(fd); + if (!file) + goto out; + + dentry = file->f_dentry; + inode = dentry->d_inode; + + ret = -EINVAL; + if (!file->f_op || !file->f_op->fsync) + goto out_putf; + + down(&inode->i_sem); + ret = filemap_fdatasync(inode->i_mapping); + err = file->f_op->fsync(file, dentry, 1); + if (err && !ret) + ret = err; + err = filemap_fdatawait(inode->i_mapping); + if (err && !ret) + ret = err; + up(&inode->i_sem); + +out_putf: + fput(file); +out: + return ret; +} + +/* After several hours of tedious analysis, the following hash + * function won. Do not mess with it... 
-DaveM + */ +#define _hashfn(dev,block) \ + ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \ + (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ \ + ((block) << (bh_hash_shift - 12)))) +#define hash(dev,block) hash_table[(_hashfn(HASHDEV(dev),block) & bh_hash_mask)] + +static inline void __insert_into_hash_list(struct buffer_head *bh) +{ + struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr); + struct buffer_head *next = *head; + + *head = bh; + bh->b_pprev = head; + bh->b_next = next; + if (next != NULL) + next->b_pprev = &bh->b_next; +} + +static __inline__ void __hash_unlink(struct buffer_head *bh) +{ + struct buffer_head **pprev = bh->b_pprev; + if (pprev) { + struct buffer_head *next = bh->b_next; + if (next) + next->b_pprev = pprev; + *pprev = next; + bh->b_pprev = NULL; + } +} + +static void __insert_into_lru_list(struct buffer_head * bh, int blist) +{ + struct buffer_head **bhp = &lru_list[blist]; + + if (bh->b_prev_free || bh->b_next_free) BUG(); + + if(!*bhp) { + *bhp = bh; + bh->b_prev_free = bh; + } + bh->b_next_free = *bhp; + bh->b_prev_free = (*bhp)->b_prev_free; + (*bhp)->b_prev_free->b_next_free = bh; + (*bhp)->b_prev_free = bh; + nr_buffers_type[blist]++; + size_buffers_type[blist] += bh->b_size; +} + +static void __remove_from_lru_list(struct buffer_head * bh) +{ + struct buffer_head *next = bh->b_next_free; + if (next) { + struct buffer_head *prev = bh->b_prev_free; + int blist = bh->b_list; + + prev->b_next_free = next; + next->b_prev_free = prev; + if (lru_list[blist] == bh) { + if (next == bh) + next = NULL; + lru_list[blist] = next; + } + bh->b_next_free = NULL; + bh->b_prev_free = NULL; + nr_buffers_type[blist]--; + size_buffers_type[blist] -= bh->b_size; + } +} + +/* must be called with both the hash_table_lock and the lru_list_lock + held */ +static void __remove_from_queues(struct buffer_head *bh) +{ + __hash_unlink(bh); + __remove_from_lru_list(bh); +} + +static void remove_from_queues(struct buffer_head *bh) +{ + spin_lock(&lru_list_lock); + write_lock(&hash_table_lock); + __remove_from_queues(bh); + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); +} + +struct buffer_head * get_hash_table(kdev_t dev, int block, int size) +{ + struct buffer_head *bh, **p = &hash(dev, block); + + read_lock(&hash_table_lock); + + for (;;) { + bh = *p; + if (!bh) + break; + p = &bh->b_next; + if (bh->b_blocknr != block) + continue; + if (bh->b_size != size) + continue; + if (bh->b_dev != dev) + continue; + get_bh(bh); + break; + } + + read_unlock(&hash_table_lock); + return bh; +} + +void buffer_insert_inode_queue(struct buffer_head *bh, struct inode *inode) +{ + spin_lock(&lru_list_lock); + if (bh->b_inode) + list_del(&bh->b_inode_buffers); + bh->b_inode = inode; + list_add(&bh->b_inode_buffers, &inode->i_dirty_buffers); + spin_unlock(&lru_list_lock); +} + +void buffer_insert_inode_data_queue(struct buffer_head *bh, struct inode *inode) +{ + spin_lock(&lru_list_lock); + if (bh->b_inode) + list_del(&bh->b_inode_buffers); + bh->b_inode = inode; + list_add(&bh->b_inode_buffers, &inode->i_dirty_data_buffers); + spin_unlock(&lru_list_lock); +} + +/* The caller must have the lru_list lock before calling the + remove_inode_queue functions. 
 */
+static void __remove_inode_queue(struct buffer_head *bh)
+{
+	bh->b_inode = NULL;
+	list_del(&bh->b_inode_buffers);
+}
+
+static inline void remove_inode_queue(struct buffer_head *bh)
+{
+	if (bh->b_inode)
+		__remove_inode_queue(bh);
+}
+
+int inode_has_buffers(struct inode *inode)
+{
+	int ret;
+
+	spin_lock(&lru_list_lock);
+	ret = !list_empty(&inode->i_dirty_buffers) || !list_empty(&inode->i_dirty_data_buffers);
+	spin_unlock(&lru_list_lock);
+
+	return ret;
+}
+
+/* If invalidate_buffers() will trash dirty buffers, it means some kind
+   of fs corruption is going on. Trashing dirty data always implies losing
+   information that was supposed to be just stored on the physical layer
+   by the user.
+
+   Thus invalidate_buffers in general usage is not allowed to trash
+   dirty buffers. For example ioctl(BLKFLSBUF) expects dirty data to
+   be preserved.  These buffers are simply skipped.
+
+   We also skip buffers which are still in use.  For example this can
+   happen if a userspace program is reading the block device.
+
+   NOTE: In the case where the user removed a removable-media-disk even if
+   there's still dirty data not synced on disk (due to a bug in the device
+   driver or to an error of the user), by not destroying the dirty buffers
+   we could generate corruption also on the next media inserted, thus a
+   parameter is necessary to handle this case in the most safe way possible
+   (trying to not corrupt also the new disk inserted with the data belonging
+   to the old now corrupted disk). Also for the ramdisk the natural thing
+   to do in order to release the ramdisk memory is to destroy dirty buffers.
+
+   These are two special cases. Normal usage implies that the device driver
+   issues a sync on the device (without waiting for I/O completion) and
+   then an invalidate_buffers call that doesn't trash dirty buffers.
+
+   For handling cache coherency with the blkdev pagecache the 'update' case
+   has been introduced. It is needed to re-read from disk any pinned
+   buffer. NOTE: re-reading from disk is destructive so we can do it only
+   when we assume nobody is changing the buffercache under our I/O and when
+   we think the disk contains more recent information than the buffercache.
+   The update == 1 pass marks the buffers we need to update, the update == 2
+   pass does the actual I/O. */
+void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
+{
+	int i, nlist, slept;
+	struct buffer_head * bh, * bh_next;
+	kdev_t dev = to_kdev_t(bdev->bd_dev);	/* will become bdev */
+
+ retry:
+	slept = 0;
+	spin_lock(&lru_list_lock);
+	for(nlist = 0; nlist < NR_LIST; nlist++) {
+		bh = lru_list[nlist];
+		if (!bh)
+			continue;
+		for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
+			bh_next = bh->b_next_free;
+
+			/* Another device? */
+			if (bh->b_dev != dev)
+				continue;
+			/* Not hashed? */
+			if (!bh->b_pprev)
+				continue;
+			if (buffer_locked(bh)) {
+				get_bh(bh);
+				spin_unlock(&lru_list_lock);
+				wait_on_buffer(bh);
+				slept = 1;
+				spin_lock(&lru_list_lock);
+				put_bh(bh);
+			}
+
+			write_lock(&hash_table_lock);
+			/* All buffers in the lru lists are mapped */
+			if (!buffer_mapped(bh))
+				BUG();
+			if (buffer_dirty(bh))
+				printk("invalidate: dirty buffer\n");
+			if (!atomic_read(&bh->b_count)) {
+				if (destroy_dirty_buffers || !buffer_dirty(bh)) {
+					remove_inode_queue(bh);
+				}
+			} else
+				printk("invalidate: busy buffer\n");
+
+			write_unlock(&hash_table_lock);
+			if (slept)
+				goto out;
+		}
+	}
+out:
+	spin_unlock(&lru_list_lock);
+	if (slept)
+		goto retry;
+
+	/* Get rid of the page cache */
+	invalidate_inode_pages(bdev->bd_inode);
+}
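To make the two "special cases" in the comment above concrete: a sketch of what a removable-media driver would do on a media-change event, under the normal non-destructive rule. media_changed() is hypothetical; fsync_no_super() and __invalidate_buffers() are the real entry points defined in this file:

	static void media_changed(kdev_t dev)
	{
		fsync_no_super(dev);		/* push out whatever can still be written */
		__invalidate_buffers(dev, 0);	/* drop clean, unused buffers; skip dirty ones */
	}

A ramdisk, by contrast, would pass destroy_dirty_buffers == 1, since destroying the dirty buffers is the only way to give the memory back.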
+void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
+{
+	struct block_device *bdev = bdget(dev);
+	if (bdev) {
+		invalidate_bdev(bdev, destroy_dirty_buffers);
+		bdput(bdev);
+	}
+}
+
+static void free_more_memory(void)
+{
+	balance_dirty();
+	wakeup_bdflush();
+	try_to_free_pages(GFP_NOIO);
+	run_task_queue(&tq_disk);
+	yield();
+}
+
+void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
+{
+	bh->b_list = BUF_CLEAN;
+	bh->b_end_io = handler;
+	bh->b_private = private;
+}
+
+static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
+{
+	static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
+	unsigned long flags;
+	struct buffer_head *tmp;
+	struct page *page;
+	int fullup = 1;
+
+	mark_buffer_uptodate(bh, uptodate);
+
+	/* This is a temporary buffer used for page I/O. */
+	page = bh->b_page;
+
+	if (!uptodate)
+		SetPageError(page);
+
+	/*
+	 * Be _very_ careful from here on. Bad things can happen if
+	 * two buffer heads end IO at almost the same time and both
+	 * decide that the page is now completely done.
+	 *
+	 * Async buffer_heads are here only as labels for IO, and get
+	 * thrown away once the IO for this page is complete.  IO is
+	 * deemed complete once all buffers have been visited
+	 * (b_count==0) and are now unlocked. We must make sure that
+	 * only the _last_ buffer that decrements its count is the one
+	 * that unlocks the page..
+	 */
+	spin_lock_irqsave(&page_uptodate_lock, flags);
+	mark_buffer_async(bh, 0);
+	unlock_buffer(bh);
+	tmp = bh->b_this_page;
+	while (tmp != bh) {
+		if (buffer_locked(tmp)) {
+			if (buffer_async(tmp))
+				goto still_busy;
+		} else if (!buffer_uptodate(tmp))
+			fullup = 0;
+		tmp = tmp->b_this_page;
+	}
+
+	/* OK, the async IO on this page is complete. */
+	spin_unlock_irqrestore(&page_uptodate_lock, flags);
+
+	/*
+	 * If none of the buffers had errors and all were uptodate
+	 * then we can set the page uptodate:
+	 */
+	if (fullup && !PageError(page))
+		SetPageUptodate(page);
+
+	UnlockPage(page);
+
+	return;
+
+still_busy:
+	spin_unlock_irqrestore(&page_uptodate_lock, flags);
+	return;
+}
+
+inline void set_buffer_async_io(struct buffer_head *bh)
+{
+	bh->b_end_io = end_buffer_io_async;
+	mark_buffer_async(bh, 1);
+}
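The two-stage pattern that end_buffer_io_async() expects from its callers (and which brw_page() and block_read_full_page() below follow) is: first mark every buffer on the page async and locked, and only then submit any of them, so that an early completion can never mistake a not-yet-submitted sibling for a finished one. A condensed sketch (read_page_buffers() is illustrative):

	static void read_page_buffers(struct page *page)
	{
		struct buffer_head *bh = page->buffers, *head = bh;

		/* Stage 1: label every buffer before any I/O starts */
		do {
			lock_buffer(bh);
			set_buffer_async_io(bh);
			bh = bh->b_this_page;
		} while (bh != head);

		/* Stage 2: start the I/O; the last completion unlocks the page */
		do {
			struct buffer_head *next = bh->b_this_page;
			submit_bh(READ, bh);
			bh = next;
		} while (bh != head);
	}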
+/*
+ * Synchronise all the inode's dirty buffers to the disk.
+ *
+ * We have conflicting pressures: we want to make sure that all
+ * initially dirty buffers get waited on, but that any subsequently
+ * dirtied buffers don't.  After all, we don't want fsync to last
+ * forever if somebody is actively writing to the file.
+ *
+ * Do this in two main stages: first we copy dirty buffers to a
+ * temporary inode list, queueing the writes as we go.  Then we clean
+ * up, waiting for those writes to complete.
+ *
+ * During this second stage, any subsequent updates to the file may end
+ * up refiling the buffer on the original inode's dirty list again, so
+ * there is a chance we will end up with a buffer queued for write but
+ * not yet completed on that list.  So, as a final cleanup we go through
+ * the osync code to catch these locked, dirty buffers without requeuing
+ * any newly dirty buffers for write.
+ */
+int fsync_buffers_list(struct list_head *list)
+{
+	struct buffer_head *bh;
+	struct inode tmp;
+	int err = 0, err2;
+
+	INIT_LIST_HEAD(&tmp.i_dirty_buffers);
+
+	spin_lock(&lru_list_lock);
+
+	while (!list_empty(list)) {
+		bh = BH_ENTRY(list->next);
+		list_del(&bh->b_inode_buffers);
+		if (!buffer_dirty(bh) && !buffer_locked(bh))
+			bh->b_inode = NULL;
+		else {
+			bh->b_inode = &tmp;
+			list_add(&bh->b_inode_buffers, &tmp.i_dirty_buffers);
+			if (buffer_dirty(bh)) {
+				get_bh(bh);
+				spin_unlock(&lru_list_lock);
+				/*
+				 * Wait for I/O completion before submitting
+				 * the buffer, to be sure the write will
+				 * be effective on the latest data in
+				 * the buffer. (otherwise - if there's old
+				 * I/O in flight - write_buffer would become
+				 * a noop)
+				 */
+				wait_on_buffer(bh);
+				ll_rw_block(WRITE, 1, &bh);
+				brelse(bh);
+				spin_lock(&lru_list_lock);
+			}
+		}
+	}
+
+	while (!list_empty(&tmp.i_dirty_buffers)) {
+		bh = BH_ENTRY(tmp.i_dirty_buffers.prev);
+		remove_inode_queue(bh);
+		get_bh(bh);
+		spin_unlock(&lru_list_lock);
+		wait_on_buffer(bh);
+		if (!buffer_uptodate(bh))
+			err = -EIO;
+		brelse(bh);
+		spin_lock(&lru_list_lock);
+	}
+
+	spin_unlock(&lru_list_lock);
+	err2 = osync_buffers_list(list);
+
+	if (err)
+		return err;
+	else
+		return err2;
+}
+
+/*
+ * osync is designed to support O_SYNC io.  It waits synchronously for
+ * all already-submitted IO to complete, but does not queue any new
+ * writes to the disk.
+ *
+ * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
+ * you dirty the buffers, and then use osync_buffers_list to wait for
+ * completion.  Any other dirty buffers which are not yet queued for
+ * write will not be flushed to disk by the osync.
+ */
+static int osync_buffers_list(struct list_head *list)
+{
+	struct buffer_head *bh;
+	struct list_head *p;
+	int err = 0;
+
+	spin_lock(&lru_list_lock);
+
+ repeat:
+	list_for_each_prev(p, list) {
+		bh = BH_ENTRY(p);
+		if (buffer_locked(bh)) {
+			get_bh(bh);
+			spin_unlock(&lru_list_lock);
+			wait_on_buffer(bh);
+			if (!buffer_uptodate(bh))
+				err = -EIO;
+			brelse(bh);
+			spin_lock(&lru_list_lock);
+			goto repeat;
+		}
+	}
+
+	spin_unlock(&lru_list_lock);
+	return err;
+}
+
+/*
+ * Invalidate any and all dirty buffers on a given inode.  We are
+ * probably unmounting the fs, but that doesn't mean we have already
+ * done a sync().  Just drop the buffers from the inode list.
+ */
+void invalidate_inode_buffers(struct inode *inode)
+{
+	struct list_head * entry;
+
+	spin_lock(&lru_list_lock);
+	while ((entry = inode->i_dirty_buffers.next) != &inode->i_dirty_buffers)
+		remove_inode_queue(BH_ENTRY(entry));
+	while ((entry = inode->i_dirty_data_buffers.next) != &inode->i_dirty_data_buffers)
+		remove_inode_queue(BH_ENTRY(entry));
+	spin_unlock(&lru_list_lock);
+}
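The O_SYNC protocol described in the osync comment, spelled out as code (write_buffer_osync() is illustrative; it assumes the buffer is tracked on the inode's dirty list, as __block_commit_write() below arranges):

	static int write_buffer_osync(struct inode *inode, struct buffer_head *bh)
	{
		mark_buffer_dirty(bh);
		buffer_insert_inode_queue(bh, inode);
		ll_rw_block(WRITE, 1, &bh);	/* queue the write immediately... */
		/* ...then wait only for the already-submitted I/O: */
		return osync_buffers_list(&inode->i_dirty_buffers);
	}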
+/*
+ * Ok, this is getblk, and it isn't very clear, again to hinder
+ * race-conditions. Most of the code is seldom used, (ie repeating),
+ * so it should be much more efficient than it looks.
+ *
+ * The algorithm is changed: hopefully better, and an elusive bug removed.
+ *
+ * 14.02.92: changed it to sync dirty buffers a bit: better performance
+ * when the filesystem starts to get full of dirty blocks (I hope).
+ */
+struct buffer_head * getblk(kdev_t dev, int block, int size)
+{
+	for (;;) {
+		struct buffer_head * bh;
+
+		bh = get_hash_table(dev, block, size);
+		if (bh) {
+			touch_buffer(bh);
+			return bh;
+		}
+
+		if (!grow_buffers(dev, block, size))
+			free_more_memory();
+	}
+}
+
+/* -1 -> no need to flush
+    0 -> async flush
+    1 -> sync flush (wait for I/O completion) */
+static int balance_dirty_state(void)
+{
+	unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
+
+	dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
+	tot = nr_free_buffer_pages();
+
+	dirty *= 100;
+	soft_dirty_limit = tot * bdf_prm.b_un.nfract;
+	hard_dirty_limit = tot * bdf_prm.b_un.nfract_sync;
+
+	/* First, check for the "real" dirty limit. */
+	if (dirty > soft_dirty_limit) {
+		if (dirty > hard_dirty_limit && !(current->flags & PF_NOIO))
+			return 1;
+		return 0;
+	}
+
+	return -1;
+}
+
+static int bdflush_stop(void)
+{
+	unsigned long dirty, tot, dirty_limit;
+
+	dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
+	tot = nr_free_buffer_pages();
+
+	dirty *= 100;
+	dirty_limit = tot * bdf_prm.b_un.nfract_stop_bdflush;
+
+	if (dirty > dirty_limit)
+		return 0;
+	return 1;
+}
+
+/*
+ * if a new dirty buffer is created we need to balance bdflush.
+ *
+ * in the future we might want to make bdflush aware of different
+ * pressures on different devices - thus the (currently unused)
+ * 'dev' parameter.
+ */
+void balance_dirty(void)
+{
+	int state = balance_dirty_state();
+
+	if (state < 0)
+		return;
+
+	wakeup_bdflush();
+
+	/*
+	 * And if we're _really_ out of balance, wait for
+	 * some of the dirty/locked buffers ourselves.
+	 * This will throttle heavy writers.
+	 */
+	if (state > 0) {
+		spin_lock(&lru_list_lock);
+		write_some_buffers(NODEV);
+	}
+}
+
+inline void __mark_dirty(struct buffer_head *bh)
+{
+	bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
+	refile_buffer(bh);
+}
+
+/* atomic version, the user must call balance_dirty() by hand
+   as soon as it becomes possible to block */
+void __mark_buffer_dirty(struct buffer_head *bh)
+{
+	if (!atomic_set_buffer_dirty(bh))
+		__mark_dirty(bh);
+}
+
+void mark_buffer_dirty(struct buffer_head *bh)
+{
+	if (!atomic_set_buffer_dirty(bh)) {
+		__mark_dirty(bh);
+		balance_dirty();
+	}
+}
+
+void set_buffer_flushtime(struct buffer_head *bh)
+{
+	bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
+}
+EXPORT_SYMBOL(set_buffer_flushtime);
+
+/*
+ * A buffer may need to be moved from one buffer list to another
+ * (e.g. in case it is not shared any more). Handle this.
+ */
+static void __refile_buffer(struct buffer_head *bh)
+{
+	int dispose = BUF_CLEAN;
+	if (buffer_locked(bh))
+		dispose = BUF_LOCKED;
+	if (buffer_dirty(bh))
+		dispose = BUF_DIRTY;
+	if (dispose != bh->b_list) {
+		__remove_from_lru_list(bh);
+		bh->b_list = dispose;
+		if (dispose == BUF_CLEAN)
+			remove_inode_queue(bh);
+		__insert_into_lru_list(bh, dispose);
+	}
+}
+
+void refile_buffer(struct buffer_head *bh)
+{
+	spin_lock(&lru_list_lock);
+	__refile_buffer(bh);
+	spin_unlock(&lru_list_lock);
+}
+
+/*
+ * Release a buffer head
+ */
+void __brelse(struct buffer_head * buf)
+{
+	if (atomic_read(&buf->b_count)) {
+		put_bh(buf);
+		return;
+	}
+	printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
+}
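Taken together, getblk(), mark_buffer_dirty() and brelse() give the classic block-update cycle; bread() just below packages the read half of it. An illustrative sketch (update_one_byte() is not part of this file):

	static int update_one_byte(kdev_t dev, int block, int size,
				   int offset, char val)
	{
		struct buffer_head *bh = getblk(dev, block, size);

		if (!buffer_uptodate(bh)) {		/* read the block in first */
			ll_rw_block(READ, 1, &bh);
			wait_on_buffer(bh);
			if (!buffer_uptodate(bh)) {
				brelse(bh);
				return -EIO;
			}
		}
		bh->b_data[offset] = val;
		mark_buffer_dirty(bh);	/* refiles to BUF_DIRTY, may throttle via balance_dirty() */
		brelse(bh);
		return 0;
	}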
+/*
+ * bforget() is like brelse(), except it discards any
+ * potentially dirty data.
+ */
+void __bforget(struct buffer_head * buf)
+{
+	mark_buffer_clean(buf);
+	__brelse(buf);
+}
+
+/**
+ *	bread() - reads a specified block and returns the bh
+ *	@dev: device to read from
+ *	@block: number of block
+ *	@size: size (in bytes) to read
+ *
+ *	Reads a specified block, and returns buffer head that
+ *	contains it. It returns NULL if the block was unreadable.
+ */
+struct buffer_head * bread(kdev_t dev, int block, int size)
+{
+	struct buffer_head * bh;
+
+	bh = getblk(dev, block, size);
+	if (buffer_uptodate(bh))
+		return bh;
+	ll_rw_block(READ, 1, &bh);
+	wait_on_buffer(bh);
+	if (buffer_uptodate(bh))
+		return bh;
+	brelse(bh);
+	return NULL;
+}
+
+/*
+ * Note: the caller should wake up the buffer_wait list if needed.
+ */
+static void __put_unused_buffer_head(struct buffer_head * bh)
+{
+	if (bh->b_inode)
+		BUG();
+	if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
+		kmem_cache_free(bh_cachep, bh);
+	} else {
+		bh->b_dev = B_FREE;
+		bh->b_blocknr = -1;
+		bh->b_this_page = NULL;
+
+		nr_unused_buffer_heads++;
+		bh->b_next_free = unused_list;
+		unused_list = bh;
+	}
+}
+
+void put_unused_buffer_head(struct buffer_head *bh)
+{
+	spin_lock(&unused_list_lock);
+	__put_unused_buffer_head(bh);
+	spin_unlock(&unused_list_lock);
+}
+EXPORT_SYMBOL(put_unused_buffer_head);
+
+/*
+ * Reserve NR_RESERVED buffer heads for async IO requests to avoid
+ * no-buffer-head deadlock.  Return NULL on failure; waiting for
+ * buffer heads is now handled in create_buffers().
+ */
+struct buffer_head * get_unused_buffer_head(int async)
+{
+	struct buffer_head * bh;
+
+	spin_lock(&unused_list_lock);
+	if (nr_unused_buffer_heads > NR_RESERVED) {
+		bh = unused_list;
+		unused_list = bh->b_next_free;
+		nr_unused_buffer_heads--;
+		spin_unlock(&unused_list_lock);
+		return bh;
+	}
+	spin_unlock(&unused_list_lock);
+
+	/* This is critical.  We can't call out to the FS
+	 * to get more buffer heads, because the FS may need
+	 * more buffer-heads itself.  Thus SLAB_NOFS.
+	 */
+	if((bh = kmem_cache_alloc(bh_cachep, SLAB_NOFS)) != NULL) {
+		bh->b_blocknr = -1;
+		bh->b_this_page = NULL;
+		return bh;
+	}
+
+	/*
+	 * If we need an async buffer, use the reserved buffer heads.
+	 */
+	if (async) {
+		spin_lock(&unused_list_lock);
+		if (unused_list) {
+			bh = unused_list;
+			unused_list = bh->b_next_free;
+			nr_unused_buffer_heads--;
+			spin_unlock(&unused_list_lock);
+			return bh;
+		}
+		spin_unlock(&unused_list_lock);
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL(get_unused_buffer_head);
+
+void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
+{
+	if (offset >= PAGE_SIZE)
+		BUG();
+
+	/*
+	 * page_address will return NULL anyways for highmem pages
+	 */
+	bh->b_data = page_address(page) + offset;
+	bh->b_page = page;
+}
+EXPORT_SYMBOL(set_bh_page);
+
+/*
+ * Create the appropriate buffers when given a page for data area and
+ * the size of each buffer.. Use the bh->b_this_page linked list to
+ * follow the buffers created.  Return NULL if unable to create more
+ * buffers.
+ * The async flag is used to differentiate async IO (paging, swapping)
+ * from ordinary buffer allocations, and only async requests are allowed
+ * to sleep waiting for buffer heads.
+ */ +static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async) +{ + struct buffer_head *bh, *head; + long offset; + +try_again: + head = NULL; + offset = PAGE_SIZE; + while ((offset -= size) >= 0) { + bh = get_unused_buffer_head(async); + if (!bh) + goto no_grow; + + bh->b_dev = NODEV; + bh->b_this_page = head; + head = bh; + + bh->b_state = 0; + bh->b_next_free = NULL; + bh->b_pprev = NULL; + atomic_set(&bh->b_count, 0); + bh->b_size = size; + + set_bh_page(bh, page, offset); + + bh->b_list = BUF_CLEAN; + bh->b_end_io = NULL; + } + return head; +/* + * In case anything failed, we just free everything we got. + */ +no_grow: + if (head) { + spin_lock(&unused_list_lock); + do { + bh = head; + head = head->b_this_page; + __put_unused_buffer_head(bh); + } while (head); + spin_unlock(&unused_list_lock); + + /* Wake up any waiters ... */ + wake_up(&buffer_wait); + } + + /* + * Return failure for non-async IO requests. Async IO requests + * are not allowed to fail, so we have to wait until buffer heads + * become available. But we don't want tasks sleeping with + * partially complete buffers, so all were released above. + */ + if (!async) + return NULL; + + /* We're _really_ low on memory. Now we just + * wait for old buffer heads to become free due to + * finishing IO. Since this is an async request and + * the reserve list is empty, we're sure there are + * async buffer heads in use. + */ + run_task_queue(&tq_disk); + + free_more_memory(); + goto try_again; +} + +/* + * Called when truncating a buffer on a page completely. + */ +static void discard_buffer(struct buffer_head * bh) +{ + if (buffer_mapped(bh)) { + mark_buffer_clean(bh); + lock_buffer(bh); + clear_bit(BH_Uptodate, &bh->b_state); + clear_bit(BH_Mapped, &bh->b_state); + clear_bit(BH_Req, &bh->b_state); + clear_bit(BH_New, &bh->b_state); + remove_from_queues(bh); + unlock_buffer(bh); + } +} + +/** + * try_to_release_page - release old fs-specific metadata on a page + * + */ + +int try_to_release_page(struct page * page, int gfp_mask) +{ + if (!PageLocked(page)) + BUG(); + + if (!page->mapping) + goto try_to_free; + if (!page->mapping->a_ops->releasepage) + goto try_to_free; + if (page->mapping->a_ops->releasepage(page, gfp_mask)) + goto try_to_free; + /* + * We couldn't release buffer metadata; don't even bother trying + * to release buffers. + */ + return 0; +try_to_free: + return try_to_free_buffers(page, gfp_mask); +} + +/* + * We don't have to release all buffers here, but + * we have to be sure that no dirty buffer is left + * and no IO is going on (no buffer is locked), because + * we have truncated the file and are going to free the + * blocks on-disk.. + */ +int discard_bh_page(struct page *page, unsigned long offset, int drop_pagecache) +{ + struct buffer_head *head, *bh, *next; + unsigned int curr_off = 0; + + if (!PageLocked(page)) + BUG(); + if (!page->buffers) + return 1; + + head = page->buffers; + bh = head; + do { + unsigned int next_off = curr_off + bh->b_size; + next = bh->b_this_page; + + /* + * is this block fully flushed? + */ + if (offset <= curr_off) + discard_buffer(bh); + curr_off = next_off; + bh = next; + } while (bh != head); + + /* + * subtle. We release buffer-heads only if this is + * the 'final' flushpage. We have invalidated the get_block + * cached value unconditionally, so real IO is not + * possible anymore. + * + * If the free doesn't work out, the buffers can be + * left around - they just turn into anonymous buffers + * instead. 
+ */
+	if (!offset) {
+		if (!try_to_release_page(page, 0))
+			return 0;
+	}
+
+	return 1;
+}
+
+void create_empty_buffers(struct page *page, kdev_t dev, unsigned long blocksize)
+{
+	struct buffer_head *bh, *head, *tail;
+
+	head = create_buffers(page, blocksize, 1);
+	if (page->buffers)
+		BUG();
+
+	bh = head;
+	do {
+		bh->b_dev = dev;
+		bh->b_blocknr = 0;
+		bh->b_end_io = NULL;
+		tail = bh;
+		bh = bh->b_this_page;
+	} while (bh);
+	tail->b_this_page = head;
+	page->buffers = head;
+	page_cache_get(page);
+}
+EXPORT_SYMBOL(create_empty_buffers);
+
+/*
+ * We are taking a block for data and we don't want any output from any
+ * buffer-cache aliases starting from return from that function and
+ * until the moment when something will explicitly mark the buffer
+ * dirty (hopefully that will not happen until we free that block ;-)
+ * We don't even need to mark it not-uptodate - nobody can expect
+ * anything from a newly allocated buffer anyway. We used to use
+ * unmap_buffer() for such invalidation, but that was wrong. We definitely
+ * don't want to mark the alias unmapped, for example - it would confuse
+ * anyone who might pick it with bread() afterwards...
+ */
+
+static void unmap_underlying_metadata(struct buffer_head * bh)
+{
+	struct buffer_head *old_bh;
+
+	old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
+	if (old_bh) {
+		mark_buffer_clean(old_bh);
+		wait_on_buffer(old_bh);
+		clear_bit(BH_Req, &old_bh->b_state);
+		__brelse(old_bh);
+	}
+}
+
+/*
+ * NOTE! All mapped/uptodate combinations are valid:
+ *
+ *	Mapped	Uptodate	Meaning
+ *
+ *	No	No		"unknown" - must do get_block()
+ *	No	Yes		"hole" - zero-filled
+ *	Yes	No		"allocated" - allocated on disk, not read in
+ *	Yes	Yes		"valid" - allocated and up-to-date in memory.
+ *
+ * "Dirty" is valid only with the last case (mapped+uptodate).
+ */
+
+/*
+ * block_write_full_page() is SMP threaded - the kernel lock is not held.
+ */
+static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
+{
+	int err, i;
+	unsigned long block;
+	struct buffer_head *bh, *head;
+	int need_unlock;
+
+	if (!PageLocked(page))
+		BUG();
+
+	if (!page->buffers)
+		create_empty_buffers(page, inode->i_dev, 1 << inode->i_blkbits);
+	head = page->buffers;
+
+	block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+	bh = head;
+	i = 0;
+
+	/* Stage 1: make sure we have all the buffers mapped! */
+	do {
+		/*
+		 * If the buffer isn't up-to-date, we can't be sure
+		 * that the buffer has been initialized with the proper
+		 * block number information etc..
+		 *
+		 * Leave it to the low-level FS to make all those
+		 * decisions (block #0 may actually be a valid block)
+		 */
+		if (!buffer_mapped(bh)) {
+			err = get_block(inode, block, bh, 1);
+			if (err)
+				goto out;
+			if (buffer_new(bh))
+				unmap_underlying_metadata(bh);
+		}
+		bh = bh->b_this_page;
+		block++;
+	} while (bh != head);
+
+	/* Stage 2: lock the buffers, mark them clean */
+	do {
+		lock_buffer(bh);
+		set_buffer_async_io(bh);
+		set_bit(BH_Uptodate, &bh->b_state);
+		clear_bit(BH_Dirty, &bh->b_state);
+		bh = bh->b_this_page;
+	} while (bh != head);
+
+	/* Stage 3: submit the IO */
+	do {
+		struct buffer_head *next = bh->b_this_page;
+		submit_bh(WRITE, bh);
+		bh = next;
+	} while (bh != head);
+
+	/* Done - end_buffer_io_async will unlock */
+	SetPageUptodate(page);
+	return 0;
+
+out:
+	/*
+	 * ENOSPC, or some other error.  We may already have added some
+	 * blocks to the file, so we need to write these out to avoid
+	 * exposing stale data.
+	 */
+	ClearPageUptodate(page);
+	bh = head;
+	need_unlock = 1;
+	/* Recovery: lock and submit the mapped buffers */
+	do {
+		if (buffer_mapped(bh)) {
+			lock_buffer(bh);
+			set_buffer_async_io(bh);
+			need_unlock = 0;
+		}
+		bh = bh->b_this_page;
+	} while (bh != head);
+	do {
+		struct buffer_head *next = bh->b_this_page;
+		if (buffer_mapped(bh)) {
+			set_bit(BH_Uptodate, &bh->b_state);
+			clear_bit(BH_Dirty, &bh->b_state);
+			submit_bh(WRITE, bh);
+		}
+		bh = next;
+	} while (bh != head);
+	if (need_unlock)
+		UnlockPage(page);
+	return err;
+}
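All of the generic helpers from here on are parameterised by a filesystem-supplied get_block_t callback, whose contract is the mapped/uptodate table above: map a logical file block to a device block, set BH_Mapped, and set BH_New when the block was just allocated. A toy implementation for a hypothetical filesystem whose files occupy one contiguous on-disk extent (CONTIG_FIRST_BLOCK is invented for the example):

	#define CONTIG_FIRST_BLOCK 1024		/* hypothetical data start */

	static int contig_get_block(struct inode *inode, long iblock,
				    struct buffer_head *bh_result, int create)
	{
		/* 1:1 mapping; nothing is ever newly allocated here,
		 * so BH_New is never set. */
		bh_result->b_dev = inode->i_dev;
		bh_result->b_blocknr = CONTIG_FIRST_BLOCK + iblock;
		bh_result->b_state |= (1UL << BH_Mapped);
		return 0;
	}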
+static int __block_prepare_write(struct inode *inode, struct page *page,
+		unsigned from, unsigned to, get_block_t *get_block)
+{
+	unsigned block_start, block_end;
+	unsigned long block;
+	int err = 0;
+	unsigned blocksize, bbits;
+	struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
+	char *kaddr = kmap(page);
+
+	blocksize = 1 << inode->i_blkbits;
+	if (!page->buffers)
+		create_empty_buffers(page, inode->i_dev, blocksize);
+	head = page->buffers;
+
+	bbits = inode->i_blkbits;
+	block = page->index << (PAGE_CACHE_SHIFT - bbits);
+
+	for(bh = head, block_start = 0; bh != head || !block_start;
+	    block++, block_start=block_end, bh = bh->b_this_page) {
+		if (!bh)
+			BUG();
+		block_end = block_start+blocksize;
+		if (block_end <= from)
+			continue;
+		if (block_start >= to)
+			break;
+		clear_bit(BH_New, &bh->b_state);
+		if (!buffer_mapped(bh)) {
+			err = get_block(inode, block, bh, 1);
+			if (err)
+				goto out;
+			if (buffer_new(bh)) {
+				unmap_underlying_metadata(bh);
+				if (Page_Uptodate(page)) {
+					set_bit(BH_Uptodate, &bh->b_state);
+					continue;
+				}
+				if (block_end > to)
+					memset(kaddr+to, 0, block_end-to);
+				if (block_start < from)
+					memset(kaddr+block_start, 0, from-block_start);
+				if (block_end > to || block_start < from)
+					flush_dcache_page(page);
+				continue;
+			}
+		}
+		if (Page_Uptodate(page)) {
+			set_bit(BH_Uptodate, &bh->b_state);
+			continue;
+		}
+		if (!buffer_uptodate(bh) &&
+		     (block_start < from || block_end > to)) {
+			ll_rw_block(READ, 1, &bh);
+			*wait_bh++=bh;
+		}
+	}
+	/*
+	 * If we issued read requests - let them complete.
+	 */
+	while(wait_bh > wait) {
+		wait_on_buffer(*--wait_bh);
+		if (!buffer_uptodate(*wait_bh))
+			return -EIO;
+	}
+	return 0;
+out:
+	/*
+	 * Zero out any newly allocated blocks to avoid exposing stale
+	 * data.  If BH_New is set, we know that the block was newly
+	 * allocated in the above loop.
+	 *
+	 * In detail, the buffer can be new and uptodate because:
+	 * 1) hole in uptodate page: get_block(create) allocates the block,
+	 *    so the buffer is new and additionally we also mark it uptodate
+	 * 2) the buffer is not mapped and uptodate due to a previous partial read.
+	 *
+	 * We can always ignore uptodate buffers here; if you mark a buffer
+	 * uptodate you must make sure it contains the right data first.
+	 *
+	 * We must stop the "undo/clear" fixup pass not at the caller "to"
+	 * but at the last block that we successfully arrived in the main loop.
+	 */
+	bh = head;
+	to = block_start; /* stop at the last successfully handled block */
+	block_start = 0;
+	do {
+		block_end = block_start+blocksize;
+		if (block_end <= from)
+			goto next_bh;
+		if (block_start >= to)
+			break;
+		if (buffer_new(bh) && !buffer_uptodate(bh)) {
+			memset(kaddr+block_start, 0, bh->b_size);
+			flush_dcache_page(page);
+			set_bit(BH_Uptodate, &bh->b_state);
+			mark_buffer_dirty(bh);
+		}
+next_bh:
+		block_start = block_end;
+		bh = bh->b_this_page;
+	} while (bh != head);
+	return err;
+}
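__block_prepare_write() and __block_commit_write() back the address_space prepare_write/commit_write operations, and callers always use them as a bracket around the actual copy: prepare the byte range, write into the (kmapped) page, commit. block_symlink() further down does exactly this; a stripped-down sketch of the sequence (write_page_range() is illustrative):

	static int write_page_range(struct inode *inode, struct page *page,
				    unsigned from, const char *data, unsigned len)
	{
		struct address_space *mapping = inode->i_mapping;
		int err;

		err = mapping->a_ops->prepare_write(NULL, page, from, from + len);
		if (err)
			return err;
		memcpy(page_address(page) + from, data, len);	/* page was kmapped by prepare_write */
		return mapping->a_ops->commit_write(NULL, page, from, from + len);
	}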
+static int __block_commit_write(struct inode *inode, struct page *page,
+		unsigned from, unsigned to)
+{
+	unsigned block_start, block_end;
+	int partial = 0, need_balance_dirty = 0;
+	unsigned blocksize;
+	struct buffer_head *bh, *head;
+
+	blocksize = 1 << inode->i_blkbits;
+
+	for(bh = head = page->buffers, block_start = 0;
+	    bh != head || !block_start;
+	    block_start=block_end, bh = bh->b_this_page) {
+		block_end = block_start + blocksize;
+		if (block_end <= from || block_start >= to) {
+			if (!buffer_uptodate(bh))
+				partial = 1;
+		} else {
+			set_bit(BH_Uptodate, &bh->b_state);
+			if (!atomic_set_buffer_dirty(bh)) {
+				__mark_dirty(bh);
+				buffer_insert_inode_data_queue(bh, inode);
+				need_balance_dirty = 1;
+			}
+		}
+	}
+
+	if (need_balance_dirty)
+		balance_dirty();
+	/*
+	 * If this is a partial write that happened to make all buffers
+	 * uptodate then we can optimize away a bogus readpage() for
+	 * the next read(). Here we 'discover' whether the page went
+	 * uptodate as a result of this (potentially partial) write.
+	 */
+	if (!partial)
+		SetPageUptodate(page);
+	return 0;
+}
+
+/*
+ * Generic "read page" function for block devices that have the normal
+ * get_block functionality. This is most of the block device filesystems.
+ * Reads the page asynchronously --- the unlock_buffer() and
+ * mark_buffer_uptodate() functions propagate buffer state into the
+ * page struct once IO has completed.
+ */
+int block_read_full_page(struct page *page, get_block_t *get_block)
+{
+	struct inode *inode = page->mapping->host;
+	unsigned long iblock, lblock;
+	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
+	unsigned int blocksize, blocks;
+	int nr, i;
+
+	if (!PageLocked(page))
+		PAGE_BUG(page);
+	blocksize = 1 << inode->i_blkbits;
+	if (!page->buffers)
+		create_empty_buffers(page, inode->i_dev, blocksize);
+	head = page->buffers;
+
+	blocks = PAGE_CACHE_SIZE >> inode->i_blkbits;
+	iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	lblock = (inode->i_size+blocksize-1) >> inode->i_blkbits;
+	bh = head;
+	nr = 0;
+	i = 0;
+
+	do {
+		if (buffer_uptodate(bh))
+			continue;
+
+		if (!buffer_mapped(bh)) {
+			if (iblock < lblock) {
+				if (get_block(inode, iblock, bh, 0))
+					continue;
+			}
+			if (!buffer_mapped(bh)) {
+				memset(kmap(page) + i*blocksize, 0, blocksize);
+				flush_dcache_page(page);
+				kunmap(page);
+				set_bit(BH_Uptodate, &bh->b_state);
+				continue;
+			}
+			/* get_block() might have updated the buffer synchronously */
+			if (buffer_uptodate(bh))
+				continue;
+		}
+
+		arr[nr] = bh;
+		nr++;
+	} while (i++, iblock++, (bh = bh->b_this_page) != head);
+
+	if (!nr) {
+		/*
+		 * all buffers are uptodate - we can set the page
+		 * uptodate as well.
+ */ + SetPageUptodate(page); + UnlockPage(page); + return 0; + } + + /* Stage two: lock the buffers */ + for (i = 0; i < nr; i++) { + struct buffer_head * bh = arr[i]; + lock_buffer(bh); + set_buffer_async_io(bh); + } + + /* Stage 3: start the IO */ + for (i = 0; i < nr; i++) { + struct buffer_head * bh = arr[i]; + if (buffer_uptodate(bh)) + end_buffer_io_async(bh, 1); + else + submit_bh(READ, bh); + } + + return 0; +} + +/* utility function for filesystems that need to do work on expanding + * truncates. Uses prepare/commit_write to allow the filesystem to + * deal with the hole. + */ +int generic_cont_expand(struct inode *inode, loff_t size) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + unsigned long index, offset, limit; + int err; + + err = -EFBIG; + limit = current->rlim[RLIMIT_FSIZE].rlim_cur; + if (limit != RLIM_INFINITY && size > (loff_t)limit) { + send_sig(SIGXFSZ, current, 0); + goto out; + } + if (size > inode->i_sb->s_maxbytes) + goto out; + + offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ + + /* ugh. in prepare/commit_write, if from==to==start of block, we + ** skip the prepare. make sure we never send an offset for the start + ** of a block + */ + if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { + offset++; + } + index = size >> PAGE_CACHE_SHIFT; + err = -ENOMEM; + page = grab_cache_page(mapping, index); + if (!page) + goto out; + err = mapping->a_ops->prepare_write(NULL, page, offset, offset); + if (!err) { + err = mapping->a_ops->commit_write(NULL, page, offset, offset); + } + UnlockPage(page); + page_cache_release(page); + if (err > 0) + err = 0; +out: + return err; +} + +/* + * For moronic filesystems that do not allow holes in file. + * We may have to extend the file. + */ + +int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct page *new_page; + unsigned long pgpos; + long status; + unsigned zerofrom; + unsigned blocksize = 1 << inode->i_blkbits; + char *kaddr; + + while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) { + status = -ENOMEM; + new_page = grab_cache_page(mapping, pgpos); + if (!new_page) + goto out; + /* we might sleep */ + if (*bytes>>PAGE_CACHE_SHIFT != pgpos) { + UnlockPage(new_page); + page_cache_release(new_page); + continue; + } + zerofrom = *bytes & ~PAGE_CACHE_MASK; + if (zerofrom & (blocksize-1)) { + *bytes |= (blocksize-1); + (*bytes)++; + } + status = __block_prepare_write(inode, new_page, zerofrom, + PAGE_CACHE_SIZE, get_block); + if (status) + goto out_unmap; + kaddr = page_address(new_page); + memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom); + flush_dcache_page(new_page); + __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE); + kunmap(new_page); + UnlockPage(new_page); + page_cache_release(new_page); + } + + if (page->index < pgpos) { + /* completely inside the area */ + zerofrom = offset; + } else { + /* page covers the boundary, find the boundary offset */ + zerofrom = *bytes & ~PAGE_CACHE_MASK; + + /* if we will expand the thing last block will be filled */ + if (to > zerofrom && (zerofrom & (blocksize-1))) { + *bytes |= (blocksize-1); + (*bytes)++; + } + + /* starting below the boundary? 
Nothing to zero out */ + if (offset <= zerofrom) + zerofrom = offset; + } + status = __block_prepare_write(inode, page, zerofrom, to, get_block); + if (status) + goto out1; + kaddr = page_address(page); + if (zerofrom < offset) { + memset(kaddr+zerofrom, 0, offset-zerofrom); + flush_dcache_page(page); + __block_commit_write(inode, page, zerofrom, offset); + } + return 0; +out1: + ClearPageUptodate(page); + kunmap(page); + return status; + +out_unmap: + ClearPageUptodate(new_page); + kunmap(new_page); + UnlockPage(new_page); + page_cache_release(new_page); +out: + return status; +} + +int block_prepare_write(struct page *page, unsigned from, unsigned to, + get_block_t *get_block) +{ + struct inode *inode = page->mapping->host; + int err = __block_prepare_write(inode, page, from, to, get_block); + if (err) { + ClearPageUptodate(page); + kunmap(page); + } + return err; +} + +int block_commit_write(struct page *page, unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + __block_commit_write(inode,page,from,to); + kunmap(page); + return 0; +} + +int generic_commit_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; + __block_commit_write(inode,page,from,to); + kunmap(page); + if (pos > inode->i_size) { + inode->i_size = pos; + mark_inode_dirty(inode); + } + return 0; +} + +int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block) +{ + unsigned long index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize, iblock, length, pos; + struct inode *inode = mapping->host; + struct page *page; + struct buffer_head *bh; + int err; + + blocksize = 1 << inode->i_blkbits; + length = offset & (blocksize - 1); + + /* Block boundary? Nothing to do */ + if (!length) + return 0; + + length = blocksize - length; + iblock = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + + page = grab_cache_page(mapping, index); + err = -ENOMEM; + if (!page) + goto out; + + if (!page->buffers) + create_empty_buffers(page, inode->i_dev, blocksize); + + /* Find the buffer that contains "offset" */ + bh = page->buffers; + pos = blocksize; + while (offset >= pos) { + bh = bh->b_this_page; + iblock++; + pos += blocksize; + } + + err = 0; + if (!buffer_mapped(bh)) { + /* Hole? Nothing to do */ + if (buffer_uptodate(bh)) + goto unlock; + get_block(inode, iblock, bh, 0); + /* Still unmapped? Nothing to do */ + if (!buffer_mapped(bh)) + goto unlock; + } + + /* Ok, it's mapped. Make sure it's up-to-date */ + if (Page_Uptodate(page)) + set_bit(BH_Uptodate, &bh->b_state); + + if (!buffer_uptodate(bh)) { + err = -EIO; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + /* Uhhuh. Read error. Complain and punt. 
*/ + if (!buffer_uptodate(bh)) + goto unlock; + } + + memset(kmap(page) + offset, 0, length); + flush_dcache_page(page); + kunmap(page); + + if (!atomic_set_buffer_dirty(bh)) { + __mark_dirty(bh); + buffer_insert_inode_data_queue(bh, inode); + balance_dirty(); + } + + err = 0; + +unlock: + UnlockPage(page); + page_cache_release(page); +out: + return err; +} + +int block_write_full_page(struct page *page, get_block_t *get_block) +{ + struct inode *inode = page->mapping->host; + unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT; + unsigned offset; + int err; + + /* easy case */ + if (page->index < end_index) + return __block_write_full_page(inode, page, get_block); + + /* things got complicated... */ + offset = inode->i_size & (PAGE_CACHE_SIZE-1); + /* OK, are we completely out? */ + if (page->index >= end_index+1 || !offset) { + UnlockPage(page); + return -EIO; + } + + /* Sigh... will have to work, then... */ + err = __block_prepare_write(inode, page, 0, offset, get_block); + if (!err) { + memset(page_address(page) + offset, 0, PAGE_CACHE_SIZE - offset); + flush_dcache_page(page); + __block_commit_write(inode,page,0,offset); +done: + kunmap(page); + UnlockPage(page); + return err; + } + ClearPageUptodate(page); + goto done; +} + +/* + * Commence writeout of all the buffers against a page. The + * page must be locked. Returns zero on success or a negative + * errno. + */ +int writeout_one_page(struct page *page) +{ + struct buffer_head *bh, *head = page->buffers; + + if (!PageLocked(page)) + BUG(); + bh = head; + do { + if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh)) + continue; + + bh->b_flushtime = jiffies; + ll_rw_block(WRITE, 1, &bh); + } while ((bh = bh->b_this_page) != head); + return 0; +} +EXPORT_SYMBOL(writeout_one_page); + +/* + * Wait for completion of I/O of all buffers against a page. The page + * must be locked. Returns zero on success or a negative errno. + */ +int waitfor_one_page(struct page *page) +{ + int error = 0; + struct buffer_head *bh, *head = page->buffers; + + bh = head; + do { + wait_on_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) + error = -EIO; + } while ((bh = bh->b_this_page) != head); + return error; +} +EXPORT_SYMBOL(waitfor_one_page); + +int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block) +{ + struct buffer_head tmp; + struct inode *inode = mapping->host; + tmp.b_state = 0; + tmp.b_blocknr = 0; + get_block(inode, block, &tmp, 0); + return tmp.b_blocknr; +} + +int generic_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize, get_block_t * get_block) +{ + int i, nr_blocks, retval; + unsigned long * blocks = iobuf->blocks; + int length; + + length = iobuf->length; + nr_blocks = length / blocksize; + /* build the blocklist */ + for (i = 0; i < nr_blocks; i++, blocknr++) { + struct buffer_head bh; + + bh.b_state = 0; + bh.b_dev = inode->i_dev; + bh.b_size = blocksize; + + retval = get_block(inode, blocknr, &bh, rw == READ ? 
0 : 1);
+		if (retval) {
+			if (!i)
+				/* report error to userspace */
+				goto out;
+			else
+				/* do short I/O until 'i' */
+				break;
+		}
+
+		if (rw == READ) {
+			if (buffer_new(&bh))
+				BUG();
+			if (!buffer_mapped(&bh)) {
+				/* there was a hole in the filesystem */
+				blocks[i] = -1UL;
+				continue;
+			}
+		} else {
+			if (buffer_new(&bh))
+				unmap_underlying_metadata(&bh);
+			if (!buffer_mapped(&bh))
+				BUG();
+		}
+		blocks[i] = bh.b_blocknr;
+	}
+
+	/* patch length to handle short I/O */
+	iobuf->length = i * blocksize;
+	retval = brw_kiovec(rw, 1, &iobuf, inode->i_dev, iobuf->blocks, blocksize);
+	/* restore orig length */
+	iobuf->length = length;
+ out:
+
+	return retval;
+}
+
+/*
+ * IO completion routine for a buffer_head being used for kiobuf IO: we
+ * can't dispatch the kiobuf callback until io_count reaches 0.
+ */
+
+static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate)
+{
+	struct kiobuf *kiobuf;
+
+	mark_buffer_uptodate(bh, uptodate);
+
+	kiobuf = bh->b_private;
+	unlock_buffer(bh);
+	end_kio_request(kiobuf, uptodate);
+}
+
+/*
+ * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
+ * for them to complete.  Clean up the buffer_heads afterwards.
+ */
+
+static int wait_kio(int rw, int nr, struct buffer_head *bh[], int size)
+{
+	int iosize, err;
+	int i;
+	struct buffer_head *tmp;
+
+	iosize = 0;
+	err = 0;
+
+	for (i = nr; --i >= 0; ) {
+		iosize += size;
+		tmp = bh[i];
+		if (buffer_locked(tmp)) {
+			wait_on_buffer(tmp);
+		}
+
+		if (!buffer_uptodate(tmp)) {
+			/* We are traversing bh'es in reverse order so
+			   clearing iosize on error calculates the
+			   amount of IO before the first error. */
+			iosize = 0;
+			err = -EIO;
+		}
+	}
+
+	if (iosize)
+		return iosize;
+	return err;
+}
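A hedged sketch of a brw_kiovec() caller, modeled on the way the 2.4 raw device driver drives direct I/O: pin the user buffer into a kiobuf, then run sector-sized transfers straight between the pinned pages and the device (raw_read_example() is illustrative; the kiovec helpers are the standard <linux/iobuf.h> ones):

	static int raw_read_example(kdev_t dev, unsigned long uaddr, size_t len,
				    unsigned long *blocks, int blocksize)
	{
		struct kiobuf *iobuf;
		int err, transferred;

		err = alloc_kiovec(1, &iobuf);
		if (err)
			return err;
		err = map_user_kiobuf(READ, iobuf, uaddr, len);	/* pin user pages */
		if (err)
			goto out_free;
		transferred = brw_kiovec(READ, 1, &iobuf, dev, blocks, blocksize);
		err = transferred < 0 ? transferred : 0;
		unmap_kiobuf(iobuf);
	out_free:
		free_kiovec(1, &iobuf);
		return err;
	}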
+
+/*
+ * IO completion routine for a buffer_head being used for kiobuf IO: we
+ * can't dispatch the kiobuf callback until io_count reaches 0.
+ */
+
+static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate)
+{
+ struct kiobuf *kiobuf;
+
+ mark_buffer_uptodate(bh, uptodate);
+
+ kiobuf = bh->b_private;
+ unlock_buffer(bh);
+ end_kio_request(kiobuf, uptodate);
+}
+
+/*
+ * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
+ * for them to complete. Clean up the buffer_heads afterwards.
+ */
+
+static int wait_kio(int rw, int nr, struct buffer_head *bh[], int size)
+{
+ int iosize, err;
+ int i;
+ struct buffer_head *tmp;
+
+ iosize = 0;
+ err = 0;
+
+ for (i = nr; --i >= 0; ) {
+ iosize += size;
+ tmp = bh[i];
+ if (buffer_locked(tmp)) {
+ wait_on_buffer(tmp);
+ }
+
+ if (!buffer_uptodate(tmp)) {
+ /* We are traversing bh'es in reverse order so
+ clearing iosize on error calculates the
+ amount of IO before the first error. */
+ iosize = 0;
+ err = -EIO;
+ }
+ }
+
+ if (iosize)
+ return iosize;
+ return err;
+}
+
+/*
+ * Start I/O on a physical range of kernel memory, defined by a vector
+ * of kiobuf structs (much like a user-space iovec list).
+ *
+ * The kiobuf must already be locked for IO. IO is submitted
+ * asynchronously: you need to check page->locked and page->uptodate.
+ *
+ * It is up to the caller to make sure that there are enough blocks
+ * passed in to completely map the iobufs to disk.
+ */
+
+int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
+ kdev_t dev, unsigned long b[], int size)
+{
+ int err;
+ int length;
+ int transferred;
+ int i;
+ int bufind;
+ int pageind;
+ int bhind;
+ int offset;
+ unsigned long blocknr;
+ struct kiobuf * iobuf = NULL;
+ struct page * map;
+ struct buffer_head *tmp, **bhs = NULL;
+
+ if (!nr)
+ return 0;
+
+ /*
+ * First, do some alignment and validity checks
+ */
+ for (i = 0; i < nr; i++) {
+ iobuf = iovec[i];
+ if ((iobuf->offset & (size-1)) ||
+ (iobuf->length & (size-1)))
+ return -EINVAL;
+ if (!iobuf->nr_pages)
+ panic("brw_kiovec: iobuf not initialised");
+ }
+
+ /*
+ * OK to walk down the iovec doing page IO on each page we find.
+ */
+ bufind = bhind = transferred = err = 0;
+ for (i = 0; i < nr; i++) {
+ iobuf = iovec[i];
+ offset = iobuf->offset;
+ length = iobuf->length;
+ iobuf->errno = 0;
+ if (!bhs)
+ bhs = iobuf->bh;
+
+ for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
+ map = iobuf->maplist[pageind];
+ if (!map) {
+ err = -EFAULT;
+ goto finished;
+ }
+
+ while (length > 0) {
+ blocknr = b[bufind++];
+ if (blocknr == -1UL) {
+ if (rw == READ) {
+ /* there was a hole in the filesystem */
+ memset(kmap(map) + offset, 0, size);
+ flush_dcache_page(map);
+ kunmap(map);
+
+ transferred += size;
+ goto skip_block;
+ } else
+ BUG();
+ }
+ tmp = bhs[bhind++];
+
+ tmp->b_size = size;
+ set_bh_page(tmp, map, offset);
+ tmp->b_this_page = tmp;
+
+ init_buffer(tmp, end_buffer_io_kiobuf, iobuf);
+ tmp->b_dev = dev;
+ tmp->b_blocknr = blocknr;
+ tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req);
+
+ if (rw == WRITE) {
+ set_bit(BH_Uptodate, &tmp->b_state);
+ clear_bit(BH_Dirty, &tmp->b_state);
+ } else
+ set_bit(BH_Uptodate, &tmp->b_state);
+
+ atomic_inc(&iobuf->io_count);
+ submit_bh(rw, tmp);
+ /*
+ * Wait for IO if we have got too much
+ */
+ if (bhind >= KIO_MAX_SECTORS) {
+ kiobuf_wait_for_io(iobuf); /* wake-one */
+ err = wait_kio(rw, bhind, bhs, size);
+ if (err >= 0)
+ transferred += err;
+ else
+ goto finished;
+ bhind = 0;
+ }
+
+ skip_block:
+ length -= size;
+ offset += size;
+
+ if (offset >= PAGE_SIZE) {
+ offset = 0;
+ break;
+ }
+ } /* End of block loop */
+ } /* End of page loop */
+ } /* End of iovec loop */
+
+ /* Is there any IO still left to submit? */
+ if (bhind) {
+ kiobuf_wait_for_io(iobuf); /* wake-one */
+ err = wait_kio(rw, bhind, bhs, size);
+ if (err >= 0)
+ transferred += err;
+ else
+ goto finished;
+ }
+
+ finished:
+ if (transferred)
+ return transferred;
+ return err;
+}
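+
+/*
+ * A sketch of the shape of a brw_kiovec() caller (compare
+ * drivers/char/raw.c): pin the user buffer into a kiobuf, hand the
+ * matching block list to brw_kiovec(), then tear everything down.
+ * 'blocks' and 'blocksize' are assumed to come from the caller;
+ * error handling and the locking rules described above are trimmed.
+ */
+#if 0
+static int example_kiovec_rw(int rw, kdev_t dev, unsigned long uaddr,
+ size_t len, unsigned long *blocks, int blocksize)
+{
+ struct kiobuf *iobuf;
+ int err;
+
+ err = alloc_kiovec(1, &iobuf);
+ if (err)
+ return err;
+ err = map_user_kiobuf(rw, iobuf, uaddr, len); /* pins the pages */
+ if (!err) {
+ err = brw_kiovec(rw, 1, &iobuf, dev, blocks, blocksize);
+ unmap_kiobuf(iobuf);
+ }
+ free_kiovec(1, &iobuf);
+ return err;
+}
+#endif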
+
+int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size)
+{
+ struct buffer_head *head, *bh;
+
+ if (!PageLocked(page))
+ panic("brw_page: page not locked for I/O");
+
+ if (!page->buffers)
+ create_empty_buffers(page, dev, size);
+ head = bh = page->buffers;
+
+ /* Stage 1: lock all the buffers */
+ do {
+ lock_buffer(bh);
+ bh->b_blocknr = *(b++);
+ set_bit(BH_Mapped, &bh->b_state);
+ set_buffer_async_io(bh);
+ bh = bh->b_this_page;
+ } while (bh != head);
+
+ /* Stage 2: start the IO */
+ do {
+ struct buffer_head *next = bh->b_this_page;
+ submit_bh(rw, bh);
+ bh = next;
+ } while (bh != head);
+ return 0;
+}
+
+int block_symlink(struct inode *inode, const char *symname, int len)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct page *page = grab_cache_page(mapping, 0);
+ int err = -ENOMEM;
+ char *kaddr;
+
+ if (!page)
+ goto fail;
+ err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
+ if (err)
+ goto fail_map;
+ kaddr = page_address(page);
+ memcpy(kaddr, symname, len-1);
+ mapping->a_ops->commit_write(NULL, page, 0, len-1);
+ /*
+ * Notice that we are _not_ going to block here - end of page is
+ * unmapped, so this will only try to map the rest of page, see
+ * that it is unmapped (typically even will not look into inode -
+ * ->i_size will be enough for everything) and zero it out.
+ * OTOH it's obviously correct and should make the page up-to-date.
+ */
+ err = mapping->a_ops->readpage(NULL, page);
+ wait_on_page(page);
+ page_cache_release(page);
+ if (err < 0)
+ goto fail;
+ mark_inode_dirty(inode);
+ return 0;
+fail_map:
+ UnlockPage(page);
+ page_cache_release(page);
+fail:
+ return err;
+}
+
+static inline void link_dev_buffers(struct page * page, struct buffer_head *head)
+{
+ struct buffer_head *bh, *tail;
+
+ bh = head;
+ do {
+ tail = bh;
+ bh = bh->b_this_page;
+ } while (bh);
+ tail->b_this_page = head;
+ page->buffers = head;
+ page_cache_get(page);
+}
+
+/*
+ * Create the page-cache page that contains the requested block
+ */
+static struct page * grow_dev_page(struct block_device *bdev, unsigned long index, int size)
+{
+ struct page * page;
+ struct buffer_head *bh;
+
+ page = find_or_create_page(bdev->bd_inode->i_mapping, index, GFP_NOFS);
+ if (!page)
+ return NULL;
+
+ if (!PageLocked(page))
+ BUG();
+
+ bh = page->buffers;
+ if (bh) {
+ if (bh->b_size == size)
+ return page;
+ if (!try_to_free_buffers(page, GFP_NOFS))
+ goto failed;
+ }
+
+ bh = create_buffers(page, size, 0);
+ if (!bh)
+ goto failed;
+ link_dev_buffers(page, bh);
+ return page;
+
+failed:
+ UnlockPage(page);
+ page_cache_release(page);
+ return NULL;
+}
+
+static void hash_page_buffers(struct page *page, kdev_t dev, int block, int size)
+{
+ struct buffer_head *head = page->buffers;
+ struct buffer_head *bh = head;
+ unsigned int uptodate;
+
+ uptodate = 1 << BH_Mapped;
+ if (Page_Uptodate(page))
+ uptodate |= 1 << BH_Uptodate;
+
+ write_lock(&hash_table_lock);
+ do {
+ if (!(bh->b_state & (1 << BH_Mapped))) {
+ init_buffer(bh, NULL, NULL);
+ bh->b_dev = dev;
+ bh->b_blocknr = block;
+ bh->b_state = uptodate;
+ }
+
+ /* Insert the buffer into the hash lists if necessary */
+ if (!bh->b_pprev)
+ __insert_into_hash_list(bh);
+
+ block++;
+ bh = bh->b_this_page;
+ } while (bh != head);
+ write_unlock(&hash_table_lock);
+}
+
+/*
+ * Try to increase the number of buffers available: the size argument
+ * is used to determine what kind of buffers we want.
+ */
+static int grow_buffers(kdev_t dev, unsigned long block, int size)
+{
+ struct page * page;
+ struct block_device *bdev;
+ unsigned long index;
+ int sizebits;
+
+ /* Size must be multiple of hard sectorsize */
+ if (size & (get_hardsect_size(dev)-1))
+ BUG();
+ /* Size must be between 512 bytes and PAGE_SIZE */
+ if (size < 512 || size > PAGE_SIZE)
+ BUG();
+
+ sizebits = -1;
+ do {
+ sizebits++;
+ } while ((size << sizebits) < PAGE_SIZE);
+
+ index = block >> sizebits;
+ block = index << sizebits;
+
+ bdev = bdget(kdev_t_to_nr(dev));
+ if (!bdev) {
+ printk("No block device for %s\n", kdevname(dev));
+ BUG();
+ }
+
+ /* Create a page with the proper size buffers.. */
+ page = grow_dev_page(bdev, index, size);
+
+ /* This is "wrong" - talk to Al Viro */
+ atomic_dec(&bdev->bd_count);
+ if (!page)
+ return 0;
+
+ /* Hash in the buffers on the hash list */
+ hash_page_buffers(page, dev, block, size);
+ UnlockPage(page);
+ page_cache_release(page);
+
+ /* We hashed up this page, so increment buffermem */
+ atomic_inc(&buffermem_pages);
+ return 1;
+}
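+
+/*
+ * Worked example of the index arithmetic above, assuming 4k pages and
+ * 1k buffers: sizebits ends up as 2 (1024 << 2 == PAGE_SIZE), so a
+ * request for block 103 maps to page index 103 >> 2 == 25, and the
+ * page is grown with blocks 100..103 (block = 25 << 2 == 100).
+ */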
+
+/*
+ * The first time the VM inspects a page which has locked buffers, it
+ * will just mark it as needing to be waited on during the next scan of
+ * the page LRU. BH_Wait_IO is used for this.
+ *
+ * The second time the VM visits the page, if it still has locked
+ * buffers, it is time to start writing them out. (BH_Wait_IO was set).
+ *
+ * The third time the VM visits the page, if the I/O hasn't completed
+ * then it's time to wait upon writeout. BH_Lock and BH_Launder are
+ * used for this.
+ *
+ * There is also the case of buffers which were locked by someone else
+ * - write(2) callers, bdflush, etc. There can be a huge number of these
+ * and we don't want to just skip them all and fail the page allocation.
+ * We want to be able to wait on these buffers as well.
+ *
+ * The BH_Launder bit is set in submit_bh() to indicate that I/O is
+ * underway against the buffer, doesn't matter who started it - we know
+ * that the buffer will eventually come unlocked, and so it's safe to
+ * wait on it.
+ *
+ * The caller holds the page lock and the caller will free this page
+ * into current->local_page, so by waiting on the page's buffers the
+ * caller is guaranteed to obtain this page.
+ *
+ * sync_page_buffers() will sort-of return true if all the buffers
+ * against this page are freeable, so try_to_free_buffers() should
+ * try to free the page's buffers a second time. This is a bit
+ * broken for blocksize < PAGE_CACHE_SIZE, but not in a way that
+ * matters much.
+ */
+static int sync_page_buffers(struct buffer_head *head)
+{
+ struct buffer_head * bh = head;
+ int tryagain = 1;
+
+ do {
+ if (!buffer_dirty(bh) && !buffer_locked(bh))
+ continue;
+
+ /* Don't start IO first time around.. */
+ if (!test_and_set_bit(BH_Wait_IO, &bh->b_state)) {
+ tryagain = 0;
+ continue;
+ }
+
+ /* Second time through we start actively writing out.. */
+ if (test_and_set_bit(BH_Lock, &bh->b_state)) {
+ if (unlikely(!buffer_launder(bh))) {
+ tryagain = 0;
+ continue;
+ }
+ wait_on_buffer(bh);
+ tryagain = 1;
+ continue;
+ }
+
+ if (!atomic_set_buffer_clean(bh)) {
+ unlock_buffer(bh);
+ continue;
+ }
+
+ __mark_buffer_clean(bh);
+ get_bh(bh);
+ bh->b_end_io = end_buffer_io_sync;
+ submit_bh(WRITE, bh);
+ tryagain = 0;
+ } while ((bh = bh->b_this_page) != head);
+
+ return tryagain;
+}
+
+/*
+ * Can the buffer be thrown out?
+ */
+#define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock))
+#define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
+
+/*
+ * try_to_free_buffers() checks if all the buffers on this particular page
+ * are unused, and frees the page if so.
+ *
+ * Wake up bdflush() if this fails - if we're running low on memory due
+ * to dirty buffers, we need to flush them out as quickly as possible.
+ *
+ * NOTE: There are quite a number of ways that threads of control can
+ * obtain a reference to a buffer head within a page. So we must
+ * lock out all of these paths to cleanly toss the page.
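+ *
+ * Concretely, the code below holds lru_list_lock and hash_table_lock
+ * across the whole busy check, so no new reference can show up while
+ * we decide, and the buffers are only unhashed and their heads given
+ * back to the unused list once every buffer on the page has been seen
+ * to be idle.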
+ */ +int try_to_free_buffers(struct page * page, unsigned int gfp_mask) +{ + struct buffer_head * tmp, * bh = page->buffers; + +cleaned_buffers_try_again: + spin_lock(&lru_list_lock); + write_lock(&hash_table_lock); + tmp = bh; + do { + if (buffer_busy(tmp)) + goto busy_buffer_page; + tmp = tmp->b_this_page; + } while (tmp != bh); + + spin_lock(&unused_list_lock); + tmp = bh; + + /* if this buffer was hashed, this page counts as buffermem */ + if (bh->b_pprev) + atomic_dec(&buffermem_pages); + do { + struct buffer_head * p = tmp; + tmp = tmp->b_this_page; + + if (p->b_dev == B_FREE) BUG(); + + remove_inode_queue(p); + __remove_from_queues(p); + __put_unused_buffer_head(p); + } while (tmp != bh); + spin_unlock(&unused_list_lock); + + /* Wake up anyone waiting for buffer heads */ + wake_up(&buffer_wait); + + /* And free the page */ + page->buffers = NULL; + page_cache_release(page); + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); + return 1; + +busy_buffer_page: + /* Uhhuh, start writeback so that we don't end up with all dirty pages */ + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); + gfp_mask = pf_gfp_mask(gfp_mask); + if (gfp_mask & __GFP_IO) { + if ((gfp_mask & __GFP_HIGHIO) || !PageHighMem(page)) { + if (sync_page_buffers(bh)) { + /* no IO or waiting next time */ + gfp_mask = 0; + goto cleaned_buffers_try_again; + } + } + } + if (balance_dirty_state() >= 0) + wakeup_bdflush(); + return 0; +} +EXPORT_SYMBOL(try_to_free_buffers); + +/* ================== Debugging =================== */ + +void show_buffers(void) +{ +#ifdef CONFIG_SMP + struct buffer_head * bh; + int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0; + int nlist; + static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", }; +#endif + + printk("Buffer memory: %6dkB\n", + atomic_read(&buffermem_pages) << (PAGE_SHIFT-10)); + + printk("Cache memory: %6dkB\n", + (atomic_read(&page_cache_size)- atomic_read(&buffermem_pages)) << (PAGE_SHIFT-10)); + +#ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */ + if (!spin_trylock(&lru_list_lock)) + return; + for(nlist = 0; nlist < NR_LIST; nlist++) { + found = locked = dirty = used = lastused = 0; + bh = lru_list[nlist]; + if(!bh) continue; + + do { + found++; + if (buffer_locked(bh)) + locked++; + if (buffer_dirty(bh)) + dirty++; + if (atomic_read(&bh->b_count)) + used++, lastused = found; + bh = bh->b_next_free; + } while (bh != lru_list[nlist]); + { + int tmp = nr_buffers_type[nlist]; + if (found != tmp) + printk("%9s: BUG -> found %d, reported %d\n", + buf_types[nlist], found, tmp); + } + printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), " + "%d locked, %d dirty\n", + buf_types[nlist], found, size_buffers_type[nlist]>>10, + used, lastused, locked, dirty); + } + spin_unlock(&lru_list_lock); +#endif +} + +/* ===================== Init ======================= */ + +/* + * allocate the hash table and init the free list + * Use gfp() for the hash table to decrease TLB misses, use + * SLAB cache for buffer heads. + */ +void __init buffer_init(unsigned long mempages) +{ + int order, i; + unsigned int nr_hash; + + /* The buffer cache hash table is less important these days, + * trim it a bit. 
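+ *
+ * As a worked example (assuming 4k pages and 4-byte pointers): with
+ * 128MB of memory, mempages starts at 32768, mempages >>= 14 leaves 2,
+ * and multiplying by sizeof(struct buffer_head *) gives 8, so the
+ * order loop below picks order 3: an 8-page (32kB) table with 8192
+ * hash chains and bh_hash_shift == 13.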
+ */
+ mempages >>= 14;
+
+ mempages *= sizeof(struct buffer_head *);
+
+ for (order = 0; (1 << order) < mempages; order++)
+ ;
+
+ /* try to allocate something until we get it or we're asking
+ for something that is really too small */
+
+ do {
+ unsigned long tmp;
+
+ nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
+ bh_hash_mask = (nr_hash - 1);
+
+ tmp = nr_hash;
+ bh_hash_shift = 0;
+ while((tmp >>= 1UL) != 0UL)
+ bh_hash_shift++;
+
+ hash_table = (struct buffer_head **)
+ __get_free_pages(GFP_ATOMIC, order);
+ } while (hash_table == NULL && --order > 0);
+ printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
+ nr_hash, order, (PAGE_SIZE << order));
+
+ if (!hash_table)
+ panic("Failed to allocate buffer hash table\n");
+
+ /* Setup hash chains. */
+ for(i = 0; i < nr_hash; i++)
+ hash_table[i] = NULL;
+
+ /* Setup lru lists. */
+ for(i = 0; i < NR_LIST; i++)
+ lru_list[i] = NULL;
+
+}
+
+
+/* ====================== bdflush support =================== */
+
+/* This is a simple kernel daemon, whose job it is to provide a dynamic
+ * response to dirty buffers. Once this process is activated, we write back
+ * a limited number of buffers to the disks and then go back to sleep again.
+ */
+
+DECLARE_WAIT_QUEUE_HEAD(bdflush_wait);
+
+void wakeup_bdflush(void)
+{
+ wake_up_interruptible(&bdflush_wait);
+}
+
+/*
+ * Here we attempt to write back old buffers. We also try to flush inodes
+ * and supers as well, since this function is essentially "update", and
+ * otherwise there would be no way of ensuring that these quantities ever
+ * get written back. Ideally, we would have a timestamp on the inodes
+ * and superblocks so that we could write back only the old ones as well.
+ */
+
+static int sync_old_buffers(void)
+{
+ lock_kernel();
+ sync_unlocked_inodes();
+ sync_supers(0);
+ unlock_kernel();
+
+ for (;;) {
+ struct buffer_head *bh;
+
+ spin_lock(&lru_list_lock);
+ bh = lru_list[BUF_DIRTY];
+ if (!bh || time_before(jiffies, bh->b_flushtime))
+ break;
+ if (write_some_buffers(NODEV))
+ continue;
+ return 0;
+ }
+ spin_unlock(&lru_list_lock);
+ return 0;
+}
+
+int block_sync_page(struct page *page)
+{
+ run_task_queue(&tq_disk);
+ return 0;
+}
+
+/* This is the interface to bdflush. As we get more sophisticated, we can
+ * pass tuning parameters to this "process", to adjust how it behaves.
+ * We would want to verify each parameter, however, to make sure that it
+ * is reasonable. */
+
+asmlinkage long sys_bdflush(int func, long data)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (func == 1) {
+ /* do_exit directly and let kupdate do its work alone. */
+ do_exit(0);
+#if 0 /* left here as it's the only example of lazy-mm-stuff used from
+ a syscall that doesn't care about the current mm context. */
+ int error;
+ struct mm_struct *user_mm;
+
+ /*
+ * bdflush will spend all of its time in kernel-space,
+ * without touching user-space, so we can switch it into
+ * 'lazy TLB mode' to reduce the cost of context-switches
+ * to and from bdflush.
+ */
+ user_mm = start_lazy_tlb();
+ error = sync_old_buffers();
+ end_lazy_tlb(user_mm);
+ return error;
+#endif
+ }
+
+ /* Basically func 2 means read param 1, func 3 means write param 1, etc. */
+ if (func >= 2) {
+ int i = (func-2) >> 1;
+ if (i >= 0 && i < N_PARAM) {
+ if ((func & 1) == 0)
+ return put_user(bdf_prm.data[i], (int*)data);
+
+ if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
+ bdf_prm.data[i] = data;
+ return 0;
+ }
+ }
+ return -EINVAL;
+ }
+
+ /* Func 0 used to launch the actual bdflush and then never
+ * return (unless it was explicitly killed). We return zero here to
+ * remain semi-compatible with present update(8) programs.
+ */
+ return 0;
+}
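+
+/*
+ * For illustration: with the encoding above, func 2*i + 2 reads
+ * bdf_prm.data[i] and func 2*i + 3 writes it. A (hypothetical)
+ * tuning program could bump the kupdate interval (parameter 4) to
+ * ten seconds' worth of jiffies and read it back with roughly:
+ *
+ * int val;
+ * syscall(__NR_bdflush, 2*4 + 3, 10 * HZ);
+ * syscall(__NR_bdflush, 2*4 + 2, (long) &val);
+ *
+ * subject to the bdflush_min[]/bdflush_max[] bounds checked above.
+ */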
+
+/*
+ * This is the actual bdflush daemon itself. It used to be started from
+ * the syscall above, but now we launch it ourselves internally with
+ * kernel_thread(...) directly after the first thread in init/main.c
+ */
+int bdflush(void *startup)
+{
+ struct task_struct *tsk = current;
+
+ /*
+ * We have a bare-bones task_struct, and really should fill
+ * in a few more things so "top" and /proc/2/{exe,root,cwd}
+ * display semi-sane things. Not really crucial, though...
+ */
+
+ tsk->session = 1;
+ tsk->pgrp = 1;
+ strcpy(tsk->comm, "bdflush");
+
+ /* avoid getting signals */
+ spin_lock_irq(&tsk->sigmask_lock);
+ flush_signals(tsk);
+ sigfillset(&tsk->blocked);
+ recalc_sigpending(tsk);
+ spin_unlock_irq(&tsk->sigmask_lock);
+
+ complete((struct completion *)startup);
+
+ for (;;) {
+ int ndirty = bdf_prm.b_un.ndirty;
+
+ CHECK_EMERGENCY_SYNC
+
+ while (ndirty > 0) {
+ spin_lock(&lru_list_lock);
+ if (!write_some_buffers(NODEV))
+ break;
+ ndirty -= NRSYNC;
+ }
+ if (ndirty > 0 || bdflush_stop())
+ interruptible_sleep_on(&bdflush_wait);
+ }
+}
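+
+/*
+ * Note on the loop above: write_some_buffers() (defined earlier in
+ * this file) returns nonzero while there may be more dirty buffers to
+ * write, so each wakeup flushes at most ndirty buffers, NRSYNC at a
+ * time, and bdflush then sleeps until the next wakeup_bdflush().
+ */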
+
+/*
+ * This is the kernel update daemon. It used to live in userspace, but
+ * since it needs to run reliably we want it to be unkillable by
+ * mistake. You don't need to change your userspace configuration since
+ * the userspace `update` will do_exit(0) at the first sys_bdflush().
+ */
+int kupdate(void *startup)
+{
+ struct task_struct * tsk = current;
+ int interval;
+
+ tsk->session = 1;
+ tsk->pgrp = 1;
+ strcpy(tsk->comm, "kupdated");
+
+ /* SIGSTOP and SIGCONT will stop and wake up kupdate */
+ spin_lock_irq(&tsk->sigmask_lock);
+ sigfillset(&tsk->blocked);
+ siginitsetinv(&current->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP));
+ recalc_sigpending(tsk);
+ spin_unlock_irq(&tsk->sigmask_lock);
+
+ complete((struct completion *)startup);
+
+ for (;;) {
+ /* update interval */
+ interval = bdf_prm.b_un.interval;
+ if (interval) {
+ tsk->state = TASK_INTERRUPTIBLE;
+ schedule_timeout(interval);
+ } else {
+ stop_kupdate:
+ tsk->state = TASK_STOPPED;
+ schedule(); /* wait for SIGCONT */
+ }
+ /* check for sigstop */
+ if (signal_pending(tsk)) {
+ int stopped = 0;
+ spin_lock_irq(&tsk->sigmask_lock);
+ if (sigismember(&tsk->pending.signal, SIGSTOP)) {
+ sigdelset(&tsk->pending.signal, SIGSTOP);
+ stopped = 1;
+ }
+ recalc_sigpending(tsk);
+ spin_unlock_irq(&tsk->sigmask_lock);
+ if (stopped)
+ goto stop_kupdate;
+ }
+#ifdef DEBUG
+ printk(KERN_DEBUG "kupdate() activated...\n");
+#endif
+ sync_old_buffers();
+ run_task_queue(&tq_disk);
+ }
+}
+
+static int __init bdflush_init(void)
+{
+ static struct completion startup __initdata = COMPLETION_INITIALIZER(startup);
+
+ kernel_thread(bdflush, &startup, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
+ wait_for_completion(&startup);
+ kernel_thread(kupdate, &startup, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
+ wait_for_completion(&startup);
+ return 0;
+}
+
+module_init(bdflush_init)