multiple of this tuning parameter if the stripe size is not set in the
         ext4 superblock
 
+  mb_max_inode_prealloc
+        The maximum length of per-inode ext4_prealloc_space list.
+
   mb_max_to_scan
         The maximum number of extents the multiblock allocator will search to
         find the best extent.
 
        struct timespec64 i_crtime;
 
        /* mballoc */
+       atomic_t i_prealloc_active;
        struct list_head i_prealloc_list;
        spinlock_t i_prealloc_lock;
 
        unsigned int s_mb_stats;
        unsigned int s_mb_order2_reqs;
        unsigned int s_mb_group_prealloc;
+       unsigned int s_mb_max_inode_prealloc;
        unsigned int s_max_dir_size_kb;
        /* where last allocation was done - for stream allocation */
        unsigned long s_mb_last_group;
 extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
                                struct ext4_allocation_request *, int *);
 extern int ext4_mb_reserve_blocks(struct super_block *, int);
-extern void ext4_discard_preallocations(struct inode *);
+extern void ext4_discard_preallocations(struct inode *, unsigned int);
 extern int __init ext4_init_mballoc(void);
 extern void ext4_exit_mballoc(void);
 extern ext4_group_t ext4_mb_prefetch(struct super_block *sb,
 
         * i_mutex. So we can safely drop the i_data_sem here.
         */
        BUG_ON(EXT4_JOURNAL(inode) == NULL);
-       ext4_discard_preallocations(inode);
+       ext4_discard_preallocations(inode, 0);
        up_write(&EXT4_I(inode)->i_data_sem);
        *dropped = 1;
        return 0;
                         * not a good idea to call discard here directly,
                         * but otherwise we'd need to call it every free().
                         */
-                       ext4_discard_preallocations(inode);
+                       ext4_discard_preallocations(inode, 0);
                        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
                                fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE;
                        ext4_free_blocks(handle, inode, NULL, newblock,
        }
 
        down_write(&EXT4_I(inode)->i_data_sem);
-       ext4_discard_preallocations(inode);
+       ext4_discard_preallocations(inode, 0);
 
        ret = ext4_es_remove_extent(inode, punch_start,
                                    EXT_MAX_BLOCKS - punch_start);
                up_write(&EXT4_I(inode)->i_data_sem);
                goto out_stop;
        }
-       ext4_discard_preallocations(inode);
+       ext4_discard_preallocations(inode, 0);
 
        ret = ext4_ext_shift_extents(inode, handle, punch_stop,
                                     punch_stop - punch_start, SHIFT_LEFT);
                goto out_stop;
 
        down_write(&EXT4_I(inode)->i_data_sem);
-       ext4_discard_preallocations(inode);
+       ext4_discard_preallocations(inode, 0);
 
        path = ext4_find_extent(inode, offset_lblk, NULL, 0);
        if (IS_ERR(path)) {
 
                        (atomic_read(&inode->i_writecount) == 1) &&
                        !EXT4_I(inode)->i_reserved_data_blocks) {
                down_write(&EXT4_I(inode)->i_data_sem);
-               ext4_discard_preallocations(inode);
+               ext4_discard_preallocations(inode, 0);
                up_write(&EXT4_I(inode)->i_data_sem);
        }
        if (is_dx(inode) && filp->private_data)
 
         * i_mutex. So we can safely drop the i_data_sem here.
         */
        BUG_ON(EXT4_JOURNAL(inode) == NULL);
-       ext4_discard_preallocations(inode);
+       ext4_discard_preallocations(inode, 0);
        up_write(&EXT4_I(inode)->i_data_sem);
        *dropped = 1;
        return 0;
 
         */
        if ((ei->i_reserved_data_blocks == 0) &&
            !inode_is_open_for_write(inode))
-               ext4_discard_preallocations(inode);
+               ext4_discard_preallocations(inode, 0);
 }
 
 static int __check_block_validity(struct inode *inode, const char *func,
        if (stop_block > first_block) {
 
                down_write(&EXT4_I(inode)->i_data_sem);
-               ext4_discard_preallocations(inode);
+               ext4_discard_preallocations(inode, 0);
 
                ret = ext4_es_remove_extent(inode, first_block,
                                            stop_block - first_block);
 
        down_write(&EXT4_I(inode)->i_data_sem);
 
-       ext4_discard_preallocations(inode);
+       ext4_discard_preallocations(inode, 0);
 
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                err = ext4_ext_truncate(handle, inode);
 
        reset_inode_seed(inode);
        reset_inode_seed(inode_bl);
 
-       ext4_discard_preallocations(inode);
+       ext4_discard_preallocations(inode, 0);
 
        err = ext4_mark_inode_dirty(handle, inode);
        if (err < 0) {
 
        sbi->s_mb_stats = MB_DEFAULT_STATS;
        sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
        sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
+       sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC;
        /*
         * The default group preallocation is 512, which for 4k block
         * sizes translates to 2 megabytes.  However for bigalloc file
        mb_debug(sb, "preallocated %d for group %u\n", preallocated, group);
 }
 
+static void ext4_mb_mark_pa_deleted(struct super_block *sb,
+                                   struct ext4_prealloc_space *pa)
+{
+       struct ext4_inode_info *ei;
+
+       if (pa->pa_deleted) {
+               ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n",
+                            pa->pa_type, pa->pa_pstart, pa->pa_lstart,
+                            pa->pa_len);
+               return;
+       }
+
+       pa->pa_deleted = 1;
+
+       if (pa->pa_type == MB_INODE_PA) {
+               ei = EXT4_I(pa->pa_inode);
+               atomic_dec(&ei->i_prealloc_active);
+       }
+}
+
 static void ext4_mb_pa_callback(struct rcu_head *head)
 {
        struct ext4_prealloc_space *pa;
                return;
        }
 
-       pa->pa_deleted = 1;
+       ext4_mb_mark_pa_deleted(sb, pa);
        spin_unlock(&pa->pa_lock);
 
        grp_blk = pa->pa_pstart;
        spin_lock(pa->pa_obj_lock);
        list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
        spin_unlock(pa->pa_obj_lock);
+       atomic_inc(&ei->i_prealloc_active);
 }
 
 /*
                }
 
                /* seems this one can be freed ... */
-               pa->pa_deleted = 1;
+               ext4_mb_mark_pa_deleted(sb, pa);
 
                /* we can trust pa_free ... */
                free += pa->pa_free;
  *
  * FIXME!! Make sure it is valid at all the call sites
  */
-void ext4_discard_preallocations(struct inode *inode)
+void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
 {
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct super_block *sb = inode->i_sb;
 
        mb_debug(sb, "discard preallocation for inode %lu\n",
                 inode->i_ino);
-       trace_ext4_discard_preallocations(inode);
+       trace_ext4_discard_preallocations(inode,
+                       atomic_read(&ei->i_prealloc_active), needed);
 
        INIT_LIST_HEAD(&list);
 
+       if (needed == 0)
+               needed = UINT_MAX;
+
 repeat:
        /* first, collect all pa's in the inode */
        spin_lock(&ei->i_prealloc_lock);
-       while (!list_empty(&ei->i_prealloc_list)) {
-               pa = list_entry(ei->i_prealloc_list.next,
+       while (!list_empty(&ei->i_prealloc_list) && needed) {
+               pa = list_entry(ei->i_prealloc_list.prev,
                                struct ext4_prealloc_space, pa_inode_list);
                BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
                spin_lock(&pa->pa_lock);
 
                }
                if (pa->pa_deleted == 0) {
-                       pa->pa_deleted = 1;
+                       ext4_mb_mark_pa_deleted(sb, pa);
                        spin_unlock(&pa->pa_lock);
                        list_del_rcu(&pa->pa_inode_list);
                        list_add(&pa->u.pa_tmp_list, &list);
+                       needed--;
                        continue;
                }
 
                BUG_ON(pa->pa_type != MB_GROUP_PA);
 
                /* seems this one can be freed ... */
-               pa->pa_deleted = 1;
+               ext4_mb_mark_pa_deleted(sb, pa);
                spin_unlock(&pa->pa_lock);
 
                list_del_rcu(&pa->pa_inode_list);
        return ;
 }
 
+/*
+ * if per-inode prealloc list is too long, trim some PA
+ */
+static void ext4_mb_trim_inode_pa(struct inode *inode)
+{
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+       int count, delta;
+
+       count = atomic_read(&ei->i_prealloc_active);
+       delta = (sbi->s_mb_max_inode_prealloc >> 2) + 1;
+       if (count > sbi->s_mb_max_inode_prealloc + delta) {
+               count -= sbi->s_mb_max_inode_prealloc;
+               ext4_discard_preallocations(inode, count);
+       }
+}
+
 /*
  * release all resource we used in allocation
  */
 static int ext4_mb_release_context(struct ext4_allocation_context *ac)
 {
+       struct inode *inode = ac->ac_inode;
+       struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
        struct ext4_prealloc_space *pa = ac->ac_pa;
        if (pa) {
                                ext4_mb_add_n_trim(ac);
                        }
                }
+
+               if (pa->pa_type == MB_INODE_PA) {
+                       /*
+                        * treat per-inode prealloc list as a lru list, then try
+                        * to trim the least recently used PA.
+                        */
+                       spin_lock(pa->pa_obj_lock);
+                       list_move(&pa->pa_inode_list, &ei->i_prealloc_list);
+                       spin_unlock(pa->pa_obj_lock);
+               }
+
                ext4_mb_put_pa(ac, ac->ac_sb, pa);
        }
        if (ac->ac_bitmap_page)
        if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
                mutex_unlock(&ac->ac_lg->lg_mutex);
        ext4_mb_collect_stats(ac);
+       ext4_mb_trim_inode_pa(inode);
        return 0;
 }
 
 
  */
 #define MB_DEFAULT_GROUP_PREALLOC      512
 
+/*
+ * maximum length of inode prealloc list
+ */
+#define MB_DEFAULT_MAX_INODE_PREALLOC  512
 
 struct ext4_free_data {
        /* this links the free block information from sb_info */
 
 
 out:
        if (*moved_len) {
-               ext4_discard_preallocations(orig_inode);
-               ext4_discard_preallocations(donor_inode);
+               ext4_discard_preallocations(orig_inode, 0);
+               ext4_discard_preallocations(donor_inode, 0);
        }
 
        ext4_ext_drop_refs(path);
 
        inode_set_iversion(&ei->vfs_inode, 1);
        spin_lock_init(&ei->i_raw_lock);
        INIT_LIST_HEAD(&ei->i_prealloc_list);
+       atomic_set(&ei->i_prealloc_active, 0);
        spin_lock_init(&ei->i_prealloc_lock);
        ext4_es_init_tree(&ei->i_es_tree);
        rwlock_init(&ei->i_es_lock);
 {
        invalidate_inode_buffers(inode);
        clear_inode(inode);
-       ext4_discard_preallocations(inode);
+       ext4_discard_preallocations(inode, 0);
        ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
        dquot_drop(inode);
        if (EXT4_I(inode)->jinode) {
 
 EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
 EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
 EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
+EXT4_RW_ATTR_SBI_UI(mb_max_inode_prealloc, s_mb_max_inode_prealloc);
 EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
 EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error);
 EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
        ATTR_LIST(mb_order2_req),
        ATTR_LIST(mb_stream_req),
        ATTR_LIST(mb_group_prealloc),
+       ATTR_LIST(mb_max_inode_prealloc),
        ATTR_LIST(max_writeback_mb_bump),
        ATTR_LIST(extent_max_zeroout_kb),
        ATTR_LIST(trigger_fs_error),
 
 );
 
 TRACE_EVENT(ext4_discard_preallocations,
-       TP_PROTO(struct inode *inode),
+       TP_PROTO(struct inode *inode, unsigned int len, unsigned int needed),
 
-       TP_ARGS(inode),
+       TP_ARGS(inode, len, needed),
 
        TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
-               __field(        ino_t,  ino                     )
+               __field(        dev_t,          dev             )
+               __field(        ino_t,          ino             )
+               __field(        unsigned int,   len             )
+               __field(        unsigned int,   needed          )
 
        ),
 
        TP_fast_assign(
                __entry->dev    = inode->i_sb->s_dev;
                __entry->ino    = inode->i_ino;
+               __entry->len    = len;
+               __entry->needed = needed;
        ),
 
-       TP_printk("dev %d,%d ino %lu",
+       TP_printk("dev %d,%d ino %lu len: %u needed %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
-                 (unsigned long) __entry->ino)
+                 (unsigned long) __entry->ino, __entry->len,
+                 __entry->needed)
 );
 
 TRACE_EVENT(ext4_mb_discard_preallocations,