From: Adnan Misherfi
Date: Fri, 27 Jan 2012 19:29:57 +0000 (-0500)
Subject: dm-nfs-for-uek2
X-Git-Tag: v2.6.39-400.9.0~628
X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=906478165352e97aac4abd469d4cc024974f6d20;p=users%2Fjedix%2Flinux-maple.git

dm-nfs-for-uek2
---

diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 8420129fc5ee..f1bd391d2349 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -227,6 +227,16 @@ config DM_CRYPT
 
 	  If unsure, say N.
 
+config DM_NFS
+	tristate "Target for NFS files (EXPERIMENTAL)"
+	depends on BLK_DEV_DM && NFS_FS && EXPERIMENTAL
+	---help---
+	  This device-mapper target allows you to treat an NFS file as
+	  a block device.  It tries to use NFS efficiently, and preserve
+	  bio write ordering.
+
+	  If unsure, say N.
+
 config DM_SNAPSHOT
 	tristate "Snapshot target"
 	depends on BLK_DEV_DM
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 448838b1f92a..da444b7b0689 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_BLK_DEV_MD)	+= md-mod.o
 obj-$(CONFIG_BLK_DEV_DM)	+= dm-mod.o
 obj-$(CONFIG_DM_CRYPT)		+= dm-crypt.o
 obj-$(CONFIG_DM_DELAY)		+= dm-delay.o
+obj-$(CONFIG_DM_NFS)		+= dm-nfs.o
 obj-$(CONFIG_DM_FLAKEY)	+= dm-flakey.o
 obj-$(CONFIG_DM_MULTIPATH)	+= dm-multipath.o dm-round-robin.o
 obj-$(CONFIG_DM_MULTIPATH_QL)	+= dm-queue-length.o
diff --git a/drivers/md/dm-nfs.c b/drivers/md/dm-nfs.c
new file mode 100644
index 000000000000..2ee7e5ff9179
--- /dev/null
+++ b/drivers/md/dm-nfs.c
@@ -0,0 +1,725 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * Prototype dm-nfs by Chuck Lever,
+ * based on dm-loop written by Bryn Reeves.
+ *
+ * This file is released under the GPL.
+ *
+ * "dm-nfs" provides loopback-style emulation of a block device
+ * using a regular file as backing storage.  The backing file
+ * resides on a remote system and is accessed via the NFS protocol.
+ * The NFS server on which the file resides must already be locally
+ * mounted when an individual dm-nfs target is set up, and cannot
+ * be unmounted until such a target is destroyed.
+ *
+ * This driver is separate from dm-loop for several reasons.
+ *
+ * 1. Provide good data integrity and reasonable performance given
+ *    the write delaying behavior of the NFS client.
+ *
+ * 2. Reduce or eliminate double caching.  Data for the target is
+ *    already cached above the emulated block device; caching the
+ *    backing file's data will pollute the page cache, risk exposing
+ *    the data to others who can view pages in the cache, and risk
+ *    data integrity when the backing file is accessed by multiple
+ *    targets (e.g. offline backup).
+ *
+ * 3. Local file-based targets require extra logic that is not
+ *    needed for NFS file-based targets.  Extent management is
+ *    entirely unnecessary for NFS files, for example.
+ *
+ * 4. There is no need to protect against file truncation.  In
+ *    the dm-loop case, truncation could result in writes into
+ *    unallocated blocks or blocks allocated to other files.  For
+ *    NFS files, this is entirely the NFS server's problem.
+ *
+ * In any case, setting S_SWAPFILE on an NFS file will cause
+ * the NFS client to reject all write requests to that file.
+ *
+ * The best we might do is set up an advisory file lock on the
+ * backing file, but for now that appears to be unnecessary.
+ */
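+
+/*
+ * In short, the data path implemented below works like this:
+ * dm_nfs_map() queues each incoming bio on a per-target list and
+ * wakes a single worker thread.  The worker invalidates the backing
+ * file's cached pages, issues readahead for the queued reads,
+ * converts each biovec to an iovec and performs the I/O through the
+ * generic aio entry points, then flushes all writes back to the
+ * server before signalling their completion.
+ */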
+
+/*
+ * TODO:
+ *
+ * 1. Async I/O - submit all I/O at once and allow asynchronous
+ *    completion of bios.
+ *
+ * 2. Direct I/O - teach the NFS client's direct I/O engine to deal
+ *    with non-user-space buffers intelligently, then use direct I/O
+ *    to avoid the page cache entirely.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/aio.h>
+#include <linux/workqueue.h>
+#include <linux/cred.h>
+#include <linux/uaccess.h>
+
+#include "dm.h"
+
+#define DM_NFS_DAEMON "kdmnfsd"
+#define DM_MSG_PREFIX "nfs"
+
+
+struct nfs_c {
+	unsigned long flags;
+
+	/* These fields describe this target's backing store */
+
+	struct file *filp;
+	char *path;
+	loff_t offset;
+	loff_t size;
+	sector_t mapped_sectors;
+
+	/* These fields describe this target's work queue */
+
+	struct workqueue_struct *wq;
+	struct work_struct ws;
+
+	spinlock_t lock;
+	struct bio_list input;
+	struct bio_list work;
+};
+
+
+/*--------------------------------------------------------------------
+ * dm-nfs helpers
+ *--------------------------------------------------------------------*/
+
+static int _check_file(struct dm_target *ti)
+{
+	struct nfs_c *nc = ti->private;
+	struct file *filp = nc->filp;
+	struct inode *inode = filp->f_mapping->host;
+
+	if (!inode)
+		return -ENXIO;
+
+	ti->error = "backing file must be a regular file";
+	if (!S_ISREG(inode->i_mode))
+		return -EINVAL;
+
+	ti->error = "backing file is mapped into userspace for writing";
+	if (mapping_writably_mapped(filp->f_mapping))
+		return -EBUSY;
+
+	if (mapping_mapped(filp->f_mapping))
+		DMWARN("%s is mapped into userspace", nc->path);
+
+	ti->error = "backing file does not reside on an NFS mount";
+	if (strncmp(inode->i_sb->s_type->name, "nfs", 3) != 0)
+		return -EBADF;
+
+	return 0;
+}
+
+static int _check_size(struct dm_target *ti)
+{
+	struct nfs_c *nc = ti->private;
+	struct inode *inode = nc->filp->f_mapping->host;
+	loff_t end, size;
+	int r = -EINVAL;
+
+	nc->size = i_size_read(inode);
+
+	ti->error = "backing file is empty";
+	if (!nc->size)
+		goto out;
+
+	DMDEBUG("set backing file size to %llu",
+		(unsigned long long) nc->size);
+
+	ti->error = "backing file cannot be less than one block in size";
+	size = (1 << (inode->i_blkbits - SECTOR_SHIFT)) << SECTOR_SHIFT;
+	if (nc->size < size)
+		goto out;
+
+	ti->error = "dm-nfs file offset must be a multiple of fs blocksize";
+	if (nc->offset & ((1 << inode->i_blkbits) - 1))
+		goto out;
+
+	ti->error = "dm-nfs file offset too large";
+	if (nc->offset > (nc->size - (1 << SECTOR_SHIFT)))
+		goto out;
+
+	nc->mapped_sectors = (nc->size - nc->offset) >> SECTOR_SHIFT;
+	DMDEBUG("set mapped sectors to %llu (%lld bytes)",
+		(unsigned long long) nc->mapped_sectors,
+		nc->size - nc->offset);
+
+	end = ti->len << SECTOR_SHIFT;
+	ti->error = "mapped region cannot be smaller than target size";
+	if (nc->size - nc->offset < end)
+		goto out;
+
+	end = nc->offset + (nc->mapped_sectors << SECTOR_SHIFT);
+	if (end < nc->size)
+		DMWARN("not using %lld bytes in incomplete block at EOF",
+		       nc->size - end);
+
+	r = 0;
+
+out:
+	return r;
+}
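+
+/*
+ * A worked example of the checks above, using hypothetical numbers:
+ * with a 4096-byte filesystem block size, the offset must be a
+ * multiple of 4096, and a 1 GiB backing file with offset 4096 maps
+ * (1073741824 - 4096) >> SECTOR_SHIFT = 2097144 sectors; any
+ * incomplete block at EOF would be left unused.
+ */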
+
+static int dm_nfs_io_get_file(struct dm_target *ti, uid_t uid)
+{
+	int flags = ((dm_table_get_mode(ti->table) & FMODE_WRITE) ?
+				O_RDWR : O_RDONLY) | O_LARGEFILE;
+	struct nfs_c *nc = ti->private;
+	struct file *filp;
+	uid_t save;
+	int r = 0;
+	struct cred *new;
+
+	/*
+	 * To prevent the server from squashing our I/O because
+	 * we are root, force all I/O from our kthread to use
+	 * the given user's credentials instead.
+	 */
+	save = current_fsuid();
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+	new->fsuid = uid;
+	commit_creds(new);
+
+	ti->error = "could not open backing file";
+	filp = filp_open(nc->path, flags, 0);
+	if (IS_ERR(filp)) {
+		r = PTR_ERR(filp);
+		goto restore;
+	}
+	nc->filp = filp;
+
+	r = _check_file(ti);
+	if (r)
+		goto err;
+
+	r = _check_size(ti);
+
+err:
+	if (r) {
+		fput(filp);
+		nc->filp = NULL;
+	}
+
+restore:
+	/* Restore the saved fsuid on both the success and error paths */
+	new = prepare_creds();
+	if (!new)
+		return r ? r : -ENOMEM;
+	new->fsuid = save;
+	commit_creds(new);
+
+	return r;
+}
+
+static void dm_nfs_io_put_file(struct file *filp)
+{
+	if (filp)
+		filp_close(filp, NULL);
+}
+
+/*
+ * Force an efficient page cache fill with the equivalent of
+ * fadvise(POSIX_FADV_WILLNEED) on the backing pages to be read
+ * in next.  page_cache_sync_readahead will generate ->readpages
+ * calls against the backing file until all requested pages are
+ * populated.
+ *
+ * Since this is no more than a hint to the VFS, don't bother
+ * looking for returned errors.
+ */
+static void dm_nfs_io_populate_cache(struct file *filp, loff_t offset,
+				     loff_t len)
+{
+	loff_t endbyte;
+	pgoff_t start_index, end_index;
+	unsigned long nrpages;
+	struct file_ra_state *ra;
+
+	ra = kzalloc(sizeof(*ra), GFP_NOFS);
+	if (!ra) {
+		DMERR("populate cache failed to allocate ra memory");
+		return;
+	}
+
+	/* Careful about overflows.  Len == 0 means "as much as possible" */
+	endbyte = offset + len;
+	if (!len || endbyte < len)
+		endbyte = -1;
+	else
+		endbyte--;		/* inclusive */
+
+	start_index = offset >> PAGE_CACHE_SHIFT;
+	end_index = endbyte >> PAGE_CACHE_SHIFT;
+
+	/* Careful about overflow on the "+1" */
+	nrpages = end_index - start_index + 1;
+	if (!nrpages)
+		nrpages = ~0UL;
+
+	file_ra_state_init(ra, filp->f_mapping);
+	page_cache_sync_readahead(filp->f_mapping, ra, filp,
+				  start_index, nrpages);
+
+	kfree(ra);
+}
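+
+/*
+ * For example, with hypothetical values and 4 KiB pages: a 1 MiB
+ * read at offset 0 gives start_index = 0, endbyte = 1048575,
+ * end_index = 255, and nrpages = 256; len == 0 instead requests
+ * readahead of as much of the file as possible.
+ */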
+
+static int dm_nfs_io_fsync(struct file *filp)
+{
+	int r;
+
+	r = filp->f_op->flush(filp, NULL);
+	if (r)
+		DMERR("backing file flush failed: %d", r);
+	return r;
+}
+
+/*
+ * Invalidate all unlocked cache pages for a backing file
+ *
+ * As long as I/O and invalidation for a given backing file
+ * are performed serially in a single thread and there are
+ * no other accessors of the backing file, this should
+ * effectively invalidate all cached backing file pages each
+ * time it is invoked.
+ */
+static void dm_nfs_io_invalidate_pages(struct file *filp)
+{
+	unmap_mapping_range(filp->f_mapping, 0, ~0UL, 0);
+
+	if (filemap_write_and_wait(filp->f_mapping))
+		return;
+
+	invalidate_inode_pages2_range(filp->f_mapping, 0, ~0UL);
+}
+
+static void dm_nfs_io_retry_wait(struct kiocb *iocb)
+{
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	if (!kiocbIsKicked(iocb))
+		schedule();
+	else
+		kiocbClearKicked(iocb);
+	__set_current_state(TASK_RUNNING);
+}
+
+
+/*--------------------------------------------------------------------
+ * dm-nfs functions that run in a separate kernel thread
+ *--------------------------------------------------------------------*/
+
+/*
+ * Normally, the NFS client's aio_{read,write} methods will validate
+ * the page cache for this file before invoking the generic aio
+ * routines.  Since we just invalidated the backing file's page
+ * cache, we don't need to revalidate it here.  And, we know this
+ * is not an O_APPEND or O_SYNC write, so we also don't need any
+ * of the extra processing done in nfs_file_write.  Thus we invoke
+ * the generic aio routines directly.
+ *
+ * If we ever switch to using O_DIRECT, we will need to change this
+ * to call the aio_{read,write} methods instead.  The NFS client
+ * hooks O_DIRECT I/O in those methods because the generic aio
+ * routines serialize direct I/O unnecessarily.
+ */
+static ssize_t dm_nfs_kthread_iov_start(int rw, struct kiocb *kiocb,
+					struct iovec *iov)
+{
+	if (rw == READ)
+		return generic_file_aio_read(kiocb, iov, 1, kiocb->ki_pos);
+	else
+		return generic_file_aio_write(kiocb, iov, 1, kiocb->ki_pos);
+}
+
+/*
+ * Convert a biovec to an iovec, and start I/O on it.
+ * Wait here until it is complete.
+ */
+static int dm_nfs_kthread_biovec_start(int rw, struct file *filp,
+				       loff_t pos, struct bio_vec *bv)
+{
+	mm_segment_t old_fs = get_fs();
+	struct iovec iov = {
+		.iov_base = kmap(bv->bv_page) + bv->bv_offset,
+		.iov_len = bv->bv_len,
+	};
+	struct kiocb kiocb;
+	ssize_t r;
+
+	set_fs(get_ds());
+
+	init_sync_kiocb(&kiocb, filp);
+	kiocb.ki_pos = pos;
+	kiocb.ki_left = bv->bv_len;
+
+	for (;;) {
+		r = dm_nfs_kthread_iov_start(rw, &kiocb, &iov);
+		if (r != -EIOCBRETRY)
+			break;
+		dm_nfs_io_retry_wait(&kiocb);
+	}
+
+	if (-EIOCBQUEUED == r)
+		r = wait_on_sync_kiocb(&kiocb);
+
+	set_fs(old_fs);
+	kunmap(bv->bv_page);
+
+	if (r < 0)
+		return r;
+	if ((unsigned int)r != bv->bv_len)
+		return -EIO;
+	return 0;
+}
+
+static void dm_nfs_kthread_bio_readahead(struct nfs_c *nc, struct bio *bio)
+{
+	struct bio_vec *bv, *bv_end = bio->bi_io_vec + bio->bi_vcnt;
+	loff_t pos;
+	size_t len;
+
+	if (bio_data_dir(bio) != READ)
+		return;
+
+	len = 0;
+	for (bv = bio->bi_io_vec; bv < bv_end; bv++)
+		len += bv->bv_len;
+
+	pos = (bio->bi_sector << 9) + nc->offset;
+	dm_nfs_io_populate_cache(nc->filp, pos, len);
+}
+
+/*
+ * Split a bio into its biovecs, and start I/O on each.
+ * Any error will stop the loop immediately and cause the
+ * whole request to fail.
+ */
+static int dm_nfs_kthread_bio_start(struct nfs_c *nc, struct bio *bio)
+{
+	struct file *filp = nc->filp;
+	loff_t pos = (bio->bi_sector << 9) + nc->offset;
+	struct bio_vec *bv, *bv_end = bio->bi_io_vec + bio->bi_vcnt;
+	int r = 0;
+
+	for (bv = bio->bi_io_vec; bv < bv_end; bv++) {
+		r = dm_nfs_kthread_biovec_start(bio_data_dir(bio),
+						filp, pos, bv);
+		if (r)
+			break;
+		pos += bv->bv_len;
+	}
+
+	return r;
+}
+
+/*
+ * When awoken, this thread moves bios queued on nc->input to a
+ * private list, submits the requests, and invokes the completion
+ * callbacks.
+ */
+static void dm_nfs_kthread_worker(struct work_struct *ws)
+{
+	struct nfs_c *nc = container_of(ws, struct nfs_c, ws);
+	struct bio_list writes;
+	struct bio *bio;
+	int r;
+
+	spin_lock_irq(&nc->lock);
+	bio_list_merge(&nc->work, &nc->input);
+	bio_list_init(&nc->input);
+	spin_unlock_irq(&nc->lock);
+
+	/*
+	 * TODO: Use the proper UID when submitting these requests.
+	 */
+
+	/*
+	 * Invalidate all cached pages for our backing file
+	 * before submitting these requests.  This eliminates
+	 * any locally cached data so each set of requests
+	 * behaves as if it is direct I/O.
+	 */
+	dm_nfs_io_invalidate_pages(nc->filp);
+
+	/*
+	 * Try to kick off all the reads now before we
+	 * fill individual biovecs.
+	 */
+	bio_list_for_each(bio, &nc->work)
+		dm_nfs_kthread_bio_readahead(nc, bio);
+
+	/*
+	 * Submit bios.
+	 *
+	 * Reads and unsuccessful writes complete immediately
+	 * upon return.
+	 *
+	 * Successful writes are held until we know the final
+	 * flush also worked.
+	 */
+	bio_list_init(&writes);
+	while ((bio = bio_list_pop(&nc->work))) {
+		r = dm_nfs_kthread_bio_start(nc, bio);
+		if (bio_data_dir(bio) == READ || r < 0)
+			bio_endio(bio, r);
+		else
+			bio_list_add(&writes, bio);
+	}
+
+	/*
+	 * After submitting all the writes in this set of requests,
+	 * flush them all now.  The NFS client aggressively caches
+	 * writes to open files, so we must explicitly flush them
+	 * out _before_ signalling completion.
+	 */
+	r = dm_nfs_io_fsync(nc->filp);
+	while ((bio = bio_list_pop(&writes)))
+		bio_endio(bio, r);
+}
+
+
+/*--------------------------------------------------------------------
+ * Externally visible dm-nfs target methods
+ *--------------------------------------------------------------------*/
+
+/**
+ * dm_nfs_ctr - Parse arguments and construct a dm-nfs target device
+ * @ti: target context to construct
+ * @argc: count of incoming arguments
+ * @argv: vector of incoming argument strings
+ *
+ * Arguments are "<path> <offset> [<uid>]" where:
+ *
+ *	<path> is the pathname of an NFS backing file to associate
+ *	with this dm target
+ *
+ *	<offset> is the byte offset in the backing file where the
+ *	device data begins (usually 0)
+ *
+ *	<uid> is the numeric user ID to use for all I/O against
+ *	the backing file (defaults to root); specify a non-zero
+ *	value to avoid root squashing on the server
+ */
+static int dm_nfs_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	struct nfs_c *nc = NULL;
+	uid_t uid = 0;
+	int r = -EINVAL;
+
+	ti->error = "invalid argument count";
+	if (argc < 2 || argc > 3)
+		goto err;
+
+	r = -ENOMEM;
+	ti->error = "cannot allocate dm-nfs context";
+	nc = kzalloc(sizeof(*nc), GFP_KERNEL);
+	if (!nc)
+		goto err;
+
+	ti->error = "cannot allocate dm-nfs path";
+	nc->path = kstrdup(argv[0], GFP_KERNEL);
+	if (!nc->path)
+		goto err;
+
+	ti->private = nc;
+
+	r = -EINVAL;
+	ti->error = "invalid file offset";
+	if (sscanf(argv[1], "%lld", &nc->offset) != 1)
+		goto err;
+	if (nc->offset)
+		DMDEBUG("setting file offset to %lld", nc->offset);
+
+	if (argc == 3) {
+		r = -EINVAL;
+		ti->error = "invalid uid";
+		if (sscanf(argv[2], "%u", &uid) != 1)
+			goto err;
+		if (uid)
+			DMDEBUG("setting uid to %u", uid);
+	}
+
+	/* dm_nfs_io_get_file sets ti->error */
+	r = dm_nfs_io_get_file(ti, uid);
+	if (r)
+		goto err;
+
+	r = -ENOMEM;
+	ti->error = "could not create dm-nfs mapping";
+	spin_lock_init(&nc->lock);
+	bio_list_init(&nc->input);
+	bio_list_init(&nc->work);
+	INIT_WORK(&nc->ws, dm_nfs_kthread_worker);
+
+	nc->wq = create_singlethread_workqueue(DM_NFS_DAEMON);
+	if (!nc->wq)
+		goto err_putf;
+
+	/* Let the NFS client choose how to split requests */
+	ti->split_io = 0;
+
+	DMDEBUG("constructed dm-nfs target on %s "
+		"(%lldk, %llu sectors)", nc->path,
+		(nc->size >> 10), (unsigned long long)nc->mapped_sectors);
+	ti->error = NULL;
+
+	return 0;
+
+err_putf:
+	dm_nfs_io_put_file(nc->filp);
+err:
+	if (nc) {
+		kfree(nc->path);
+		kfree(nc);
+	}
+	return r;
+}
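+
+/*
+ * A typical construction, using hypothetical paths and sizes: map a
+ * 1 GiB backing file on an already-mounted NFS export, doing all I/O
+ * as uid 1000 to avoid root squashing on the server:
+ *
+ *	dmsetup create nfsdisk --table \
+ *		"0 2097152 nfs /mnt/filer/disk.img 0 1000"
+ */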
+
+/**
+ * dm_nfs_dtr - dm-nfs target destructor
+ * @ti: dm target to destroy
+ *
+ */
+static void dm_nfs_dtr(struct dm_target *ti)
+{
+	struct nfs_c *nc = ti->private;
+
+	if ((dm_table_get_mode(ti->table) & FMODE_WRITE))
+		flush_workqueue(nc->wq);
+
+	if (nc->wq)
+		destroy_workqueue(nc->wq);
+
+	dm_nfs_io_put_file(nc->filp);
+	DMINFO("released file %s", nc->path);
+
+	kfree(nc->path);
+	kfree(nc);
+}
+
+/**
+ * dm_nfs_map - start I/O on a dm-nfs target
+ * @ti: target of I/O request
+ * @bio: control block describing parameters of I/O request
+ * @context: ignored
+ *
+ */
+static int dm_nfs_map(struct dm_target *ti, struct bio *bio,
+		      union map_info *context)
+{
+	struct nfs_c *nc = ti->private;
+	int need_wakeup;
+
+	/* Remap the device-relative sector to a target-relative one */
+	bio->bi_sector -= ti->begin;
+
+	spin_lock_irq(&nc->lock);
+	need_wakeup = bio_list_empty(&nc->input);
+	bio_list_add(&nc->input, bio);
+	spin_unlock_irq(&nc->lock);
+
+	if (need_wakeup)
+		queue_work(nc->wq, &nc->ws);
+
+	return DM_MAPIO_SUBMITTED;
+}
+
+/**
+ * dm_nfs_flush - wait for outstanding I/O on a dm-nfs target to drain
+ * @ti: target to flush
+ *
+ */
+static void dm_nfs_flush(struct dm_target *ti)
+{
+	struct nfs_c *nc = ti->private;
+
+	flush_workqueue(nc->wq);
+}
+
+/**
+ * dm_nfs_status - report status information about a dm-nfs target
+ * @ti: target to report on
+ * @type: type of info requested
+ * @result: buffer for results
+ * @maxlen: length of buffer
+ *
+ * Note: the DMEMIT macro uses "result", "maxlen", and "sz" from the
+ * enclosing scope, so they are not passed as arguments.
+ */
+static int dm_nfs_status(struct dm_target *ti, status_type_t type,
+			 char *result, unsigned int maxlen)
+{
+	struct nfs_c *nc = ti->private;
+	unsigned int qlen, sz = 0;
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		spin_lock_irq(&nc->lock);
+		qlen = bio_list_size(&nc->work);
+		qlen += bio_list_size(&nc->input);
+		spin_unlock_irq(&nc->lock);
+
+		DMEMIT("nfs %u", qlen);
+		break;
+
+	case STATUSTYPE_TABLE:
+		DMEMIT("%s %lld", nc->path, nc->offset);
+		break;
+	}
+
+	return 0;
+}
+
+static struct target_type nfs_target = {
+	.name = "nfs",
+	.version = {1, 0, 0},
+	.module = THIS_MODULE,
+	.ctr = dm_nfs_ctr,
+	.dtr = dm_nfs_dtr,
+	.map = dm_nfs_map,
+	.presuspend = dm_nfs_flush,
+	.flush = dm_nfs_flush,
+	.status = dm_nfs_status,
+};
+
+
+/*--------------------------------------------------------------------
+ * dm-nfs module bits
+ *--------------------------------------------------------------------*/
+
+static int __init dm_nfs_mod_init(void)
+{
+	int r;
+
+	r = dm_register_target(&nfs_target);
+	if (r < 0) {
+		DMERR("register failed %d", r);
+		return r;
+	}
+
+	DMINFO("version %u.%u.%u loaded",
+	       nfs_target.version[0], nfs_target.version[1],
+	       nfs_target.version[2]);
+
+	return 0;
+}
+
+static void __exit dm_nfs_mod_exit(void)
+{
+	dm_unregister_target(&nfs_target);	/* now returns void */
+	DMINFO("version %u.%u.%u unloaded",
+	       nfs_target.version[0], nfs_target.version[1],
+	       nfs_target.version[2]);
+}
+
+module_init(dm_nfs_mod_init);
+module_exit(dm_nfs_mod_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Chuck Lever");
+MODULE_DESCRIPTION("device-mapper NFS target");
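
For reference, assuming the hypothetical "nfsdisk" device sketched in
the constructor comment above, a status query should report the target
name followed by the number of bios still queued, and a table query
should echo back the path and offset given to the constructor:

	# dmsetup status nfsdisk
	0 2097152 nfs nfs 0
	# dmsetup table nfsdisk
	0 2097152 nfs /mnt/filer/disk.img 0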