session_write_kbytes         This file is read-only and shows the number of
                               kilobytes of data that have been written to this
                               filesystem since it was mounted.
+
+ reserved_clusters            This is a RW file and contains the number of
+                              reserved clusters in the file system, which are
+                              used in specific situations to avoid costly
+                              zeroout, unexpected ENOSPC, or possible data
+                              loss. The default is 2% of the file system or
+                              4096 clusters, whichever is smaller. The value
+                              can be changed, but it can never exceed the
+                              number of clusters in the file system. If there
+                              is not enough space for the reserved space when
+                              mounting the file system, mount will _not_ fail.
 ..............................................................................
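For illustration, the reserved_clusters attribute can be read and updated like
any other ext4 sysfs tunable. A minimal userspace sketch (the device directory
"sda1" is an assumption; use whatever appears under /sys/fs/ext4/ on the
target system):

        #include <stdio.h>

        int main(void)
        {
                const char *path = "/sys/fs/ext4/sda1/reserved_clusters";
                unsigned long long val;
                FILE *f;

                f = fopen(path, "r");
                if (!f || fscanf(f, "%llu", &val) != 1) {
                        perror(path);
                        return 1;
                }
                fclose(f);
                printf("current reserved clusters: %llu\n", val);

                /* Halve the reservation; the kernel rejects values >= the
                 * total number of clusters in the file system. */
                f = fopen(path, "w");
                if (!f || fprintf(f, "%llu\n", val / 2) < 0 || fclose(f)) {
                        perror(path);
                        return 1;
                }
                return 0;
        }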
 
 Ioctls
 
 static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
                                  s64 nclusters, unsigned int flags)
 {
-       s64 free_clusters, dirty_clusters, root_clusters;
+       s64 free_clusters, dirty_clusters, rsv, resv_clusters;
        struct percpu_counter *fcc = &sbi->s_freeclusters_counter;
        struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter;
 
        free_clusters  = percpu_counter_read_positive(fcc);
        dirty_clusters = percpu_counter_read_positive(dcc);
+       resv_clusters = atomic64_read(&sbi->s_resv_clusters);
 
        /*
         * r_blocks_count should always be multiple of the cluster ratio so
        * we are safe to do a plain bit shift only.
         */
-       root_clusters = ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits;
+       rsv = (ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits) +
+             resv_clusters;
 
-       if (free_clusters - (nclusters + root_clusters + dirty_clusters) <
+       if (free_clusters - (nclusters + rsv + dirty_clusters) <
                                        EXT4_FREECLUSTERS_WATERMARK) {
                free_clusters  = percpu_counter_sum_positive(fcc);
                dirty_clusters = percpu_counter_sum_positive(dcc);
        }
        /* Check whether we have space after accounting for current
         * dirty clusters & root reserved clusters.
         */
-       if (free_clusters >= ((root_clusters + nclusters) + dirty_clusters))
+       if (free_clusters >= (rsv + nclusters + dirty_clusters))
                return 1;
 
        /* Hm, nope.  Are (enough) root reserved clusters available? */
        if (uid_eq(sbi->s_resuid, current_fsuid()) ||
            (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) ||
            capable(CAP_SYS_RESOURCE) ||
-               (flags & EXT4_MB_USE_ROOT_BLOCKS)) {
+           (flags & EXT4_MB_USE_ROOT_BLOCKS)) {
 
+               if (free_clusters >= (nclusters + dirty_clusters +
+                                     resv_clusters))
+                       return 1;
+       }
+       /* No free blocks. Let's see if we can dip into reserved pool */
+       if (flags & EXT4_MB_USE_RESERVED) {
                if (free_clusters >= (nclusters + dirty_clusters))
                        return 1;
        }
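The check above is tiered: an ordinary allocation must leave both the classic
root reservation and the new reserved pool untouched; privileged callers
(resuid/resgid, CAP_SYS_RESOURCE, or EXT4_MB_USE_ROOT_BLOCKS) may dip into the
root reservation but not into the pool; and only callers passing
EXT4_MB_USE_RESERVED may consume the pool itself. A simplified userspace
sketch of that ordering, with made-up numbers (the percpu counters, the
watermark re-read, and the uid/gid/capability checks are collapsed into plain
flags):

        #include <stdbool.h>
        #include <stdio.h>

        #define USE_ROOT_BLOCKS 0x1000  /* mirrors EXT4_MB_USE_ROOT_BLOCKS */
        #define USE_RESERVED    0x2000  /* mirrors EXT4_MB_USE_RESERVED */

        static bool has_free_clusters(long long free, long long dirty,
                                      long long root_rsv, long long resv,
                                      long long nclusters, unsigned int flags)
        {
                /* ordinary allocation: keep root reservation and pool intact */
                if (free >= nclusters + dirty + root_rsv + resv)
                        return true;
                /* privileged caller: may use the root reservation, not the pool */
                if ((flags & USE_ROOT_BLOCKS) && free >= nclusters + dirty + resv)
                        return true;
                /* "must not fail" metadata allocation: may drain the pool too */
                if ((flags & USE_RESERVED) && free >= nclusters + dirty)
                        return true;
                return false;
        }

        int main(void)
        {
                /* 100 free, 10 dirty, 50 root-reserved, 20 pool-reserved clusters */
                printf("normal alloc of 30:   %d\n",
                       has_free_clusters(100, 10, 50, 20, 30, 0));               /* 0 */
                printf("root alloc of 30:     %d\n",
                       has_free_clusters(100, 10, 50, 20, 30, USE_ROOT_BLOCKS)); /* 1 */
                printf("metadata alloc of 80: %d\n",
                       has_free_clusters(100, 10, 50, 20, 80, USE_RESERVED));    /* 1 */
                return 0;
        }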
 
 #define EXT4_MB_STREAM_ALLOC           0x0800
 /* Use reserved root blocks if needed */
 #define EXT4_MB_USE_ROOT_BLOCKS                0x1000
+/* Use blocks from reserved pool */
+#define EXT4_MB_USE_RESERVED           0x2000
 
 struct ext4_allocation_request {
        /* target inode for block we're allocating */
 #define EXT4_GET_BLOCKS_UNINIT_EXT             0x0002
 #define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT      (EXT4_GET_BLOCKS_UNINIT_EXT|\
                                                 EXT4_GET_BLOCKS_CREATE)
-       /* Caller is from the delayed allocation writeout path,
-          so set the magic i_delalloc_reserve_flag after taking the
-          inode allocation semaphore for */
+       /* Caller is from the delayed allocation writeout path
+        * finally doing the actual allocation of delayed blocks */
 #define EXT4_GET_BLOCKS_DELALLOC_RESERVE       0x0004
        /* caller is from the direct IO path, requesting the creation of an
        uninitialized extent if not allocated, split the uninitialized
        /* Convert extent to initialized after IO complete */
 #define EXT4_GET_BLOCKS_IO_CONVERT_EXT         (EXT4_GET_BLOCKS_CONVERT|\
                                         EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
-       /* Punch out blocks of an extent */
-#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT          0x0020
+       /* Metadata allocation needed to grow the extent tree must not
+        * fail, so try to use reserved blocks for it. */
+#define EXT4_GET_BLOCKS_METADATA_NOFAIL                0x0020
        /* Don't normalize allocation size (used for fallocate) */
 #define EXT4_GET_BLOCKS_NO_NORMALIZE           0x0040
        /* Request will not result in inode size update (used for fallocate) */
        unsigned int s_mount_flags;
        unsigned int s_def_mount_opt;
        ext4_fsblk_t s_sb_block;
+       atomic64_t s_resv_clusters;
        kuid_t s_resuid;
        kgid_t s_resgid;
        unsigned short s_mount_state;
 
         * There is no free space in the found leaf.
         * We're gonna add a new leaf in the tree.
         */
-       if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT)
-               flags = EXT4_MB_USE_ROOT_BLOCKS;
+       if (flag & EXT4_GET_BLOCKS_METADATA_NOFAIL)
+               flags = EXT4_MB_USE_RESERVED;
        err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext);
        if (err)
                goto cleanup;
 
                        /*
                         * Split the extent in two so that 'end' is the last
-                        * block in the first new extent
+                        * block in the first new extent. Also, removing space
+                        * must not fail because of ENOSPC, so use reserved
+                        * blocks if that happens.
                         */
                        err = ext4_split_extent_at(handle, inode, path,
-                                               end + 1, split_flag,
-                                               EXT4_GET_BLOCKS_PRE_IO |
-                                               EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
+                                       end + 1, split_flag,
+                                       EXT4_GET_BLOCKS_PRE_IO |
+                                       EXT4_GET_BLOCKS_METADATA_NOFAIL);
 
                        if (err < 0)
                                goto out;
 static int ext4_ext_convert_to_initialized(handle_t *handle,
                                           struct inode *inode,
                                           struct ext4_map_blocks *map,
-                                          struct ext4_ext_path *path)
+                                          struct ext4_ext_path *path,
+                                          int flags)
 {
        struct ext4_sb_info *sbi;
        struct ext4_extent_header *eh;
        }
 
        allocated = ext4_split_extent(handle, inode, path,
-                                     &split_map, split_flag, 0);
+                                     &split_map, split_flag, flags);
        if (allocated < 0)
                err = allocated;
 
                  flags, allocated);
        ext4_ext_show_leaf(inode, path);
 
+       /*
+        * When writing into uninitialized space, we should not fail to
+        * allocate metadata blocks for the new extent block if needed.
+        */
+       flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL;
+
        trace_ext4_ext_handle_uninitialized_extents(inode, map, flags,
                                                    allocated, newblock);
 
        }
 
        /* buffered write, writepage time, convert*/
-       ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
+       ret = ext4_ext_convert_to_initialized(handle, inode, map, path, flags);
        if (ret >= 0)
                ext4_update_inode_fsync_trans(handle, inode, 1);
 out:
 
         */
        map.m_lblk = next;
        map.m_len = max_blocks;
-       get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
+       /*
+        * We're in the delalloc path and it is possible that we're going
+        * to need more metadata blocks than previously reserved. However,
+        * we must not fail here: we're in writeback, there is nothing we
+        * could do about a failure, and it might result in data loss.
+        * So use reserved blocks to allocate metadata if possible.
+        */
+       get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
+                          EXT4_GET_BLOCKS_METADATA_NOFAIL;
        if (ext4_should_dioread_nolock(mpd->inode))
                get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
        if (mpd->b_state & (1 << BH_Delay))
                get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
 
        blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
        if (blks < 0) {
                struct super_block *sb = mpd->inode->i_sb;
 
 static void ext4_destroy_lazyinit_thread(void);
 static void ext4_unregister_li_request(struct super_block *sb);
 static void ext4_clear_request_list(void);
+static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t);
 
 #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
 static struct file_system_type ext2_fs_type = {
        int offset;
 };
 
+static int parse_strtoull(const char *buf,
+               unsigned long long max, unsigned long long *value)
+{
+       int ret;
+
+       ret = kstrtoull(skip_spaces(buf), 0, value);
+       if (!ret && *value > max)
+               ret = -EINVAL;
+       return ret;
+}
+
 static int parse_strtoul(const char *buf,
                unsigned long max, unsigned long *value)
 {
        return count;
 }
 
+static ssize_t reserved_clusters_show(struct ext4_attr *a,
+                                 struct ext4_sb_info *sbi, char *buf)
+{
+       return snprintf(buf, PAGE_SIZE, "%llu\n",
+               (unsigned long long) atomic64_read(&sbi->s_resv_clusters));
+}
+
+static ssize_t reserved_clusters_store(struct ext4_attr *a,
+                                  struct ext4_sb_info *sbi,
+                                  const char *buf, size_t count)
+{
+       unsigned long long val;
+       int ret;
+
+       if (parse_strtoull(buf, -1ULL, &val))
+               return -EINVAL;
+       ret = ext4_reserve_clusters(sbi, val);
+
+       return ret ? ret : count;
+}
+
 static ssize_t trigger_test_error(struct ext4_attr *a,
                                  struct ext4_sb_info *sbi,
                                  const char *buf, size_t count)
 EXT4_RO_ATTR(delayed_allocation_blocks);
 EXT4_RO_ATTR(session_write_kbytes);
 EXT4_RO_ATTR(lifetime_write_kbytes);
+EXT4_RW_ATTR(reserved_clusters);
 EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
                 inode_readahead_blks_store, s_inode_readahead_blks);
 EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
        ATTR_LIST(delayed_allocation_blocks),
        ATTR_LIST(session_write_kbytes),
        ATTR_LIST(lifetime_write_kbytes),
+       ATTR_LIST(reserved_clusters),
        ATTR_LIST(inode_readahead_blks),
        ATTR_LIST(inode_goal),
        ATTR_LIST(mb_stats),
        return 0;
 }
 
+
+static ext4_fsblk_t ext4_calculate_resv_clusters(struct ext4_sb_info *sbi)
+{
+       ext4_fsblk_t resv_clusters;
+
+       /*
+        * By default we reserve 2% or 4096 clusters, whichever is smaller.
+        * This should cover the situations where we cannot afford to run
+        * out of space, such as punching a hole or converting
+        * uninitialized extents in the delalloc path. In most cases such
+        * an allocation requires only 1 or 2 blocks; higher numbers are
+        * very rare.
+        */
+       resv_clusters = ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits;
+
+       do_div(resv_clusters, 50);
+       resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096);
+
+       return resv_clusters;
+}
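For a sense of scale, assuming 4 KiB clusters: a 100 MiB file system reserves
the full 2% (512 clusters, 2 MiB), and anything from roughly 800 MiB upward
hits the 4096-cluster (16 MiB) cap. A throwaway userspace check of that
arithmetic:

        #include <stdio.h>

        /* Same policy as ext4_calculate_resv_clusters(): 2%, capped at 4096. */
        static unsigned long long resv(unsigned long long clusters)
        {
                unsigned long long r = clusters / 50;   /* 2% */

                return r < 4096 ? r : 4096;
        }

        int main(void)
        {
                printf("100 MiB fs (25600 clusters):    %llu\n", resv(25600));      /* 512 */
                printf("1 GiB fs (262144 clusters):     %llu\n", resv(262144));     /* 4096 */
                printf("4 TiB fs (1073741824 clusters): %llu\n", resv(1073741824ULL)); /* 4096 */
                return 0;
        }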
+
+static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count)
+{
+       ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >>
+                               sbi->s_cluster_bits;
+
+       if (count >= clusters)
+               return -EINVAL;
+
+       atomic64_set(&sbi->s_resv_clusters, count);
+       return 0;
+}
+
 static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 {
        char *orig_data = kstrdup(data, GFP_KERNEL);
                         "available");
        }
 
+       err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sbi));
+       if (err) {
+               ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for "
+                        "reserved pool", ext4_calculate_resv_clusters(sbi));
+               goto failed_mount4a;
+       }
+
        err = ext4_setup_system_zone(sb);
        if (err) {
                ext4_msg(sb, KERN_ERR, "failed to initialize system "
        struct super_block *sb = dentry->d_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = sbi->s_es;
-       ext4_fsblk_t overhead = 0;
+       ext4_fsblk_t overhead = 0, resv_blocks;
        u64 fsid;
        s64 bfree;
+       resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
 
        if (!test_opt(sb, MINIX_DF))
                overhead = sbi->s_overhead;
                percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
        /* prevent underflow in case little free space is available */
        buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0));
-       buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
-       if (buf->f_bfree < ext4_r_blocks_count(es))
+       buf->f_bavail = buf->f_bfree -
+                       (ext4_r_blocks_count(es) + resv_blocks);
+       if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks))
                buf->f_bavail = 0;
        buf->f_files = le32_to_cpu(es->s_inodes_count);
        buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);