www.infradead.org Git - users/jedix/linux-maple.git/commitdiff
btrfs: do not infinite loop in data reclaim if we aborted
author Josef Bacik <josef@toxicpanda.com>
Thu, 20 May 2021 15:21:33 +0000 (11:21 -0400)
committer David Sterba <dsterba@suse.com>
Tue, 1 Jun 2021 18:59:52 +0000 (20:59 +0200)
Error injection stressing uncovered a busy loop in our data reclaim
loop.  There are two cases here: one where we loop creating block groups
until space_info->full is set, and one in the main loop where we skip
erroring out any tickets if space_info->full == 0.  Unfortunately, if we
aborted the transaction then we will never allocate chunks or reclaim any
space and thus never get ->full, and you'll see stack traces like this:

watchdog: BUG: soft lockup - CPU#0 stuck for 26s! [kworker/u4:4:139]
CPU: 0 PID: 139 Comm: kworker/u4:4 Tainted: G        W         5.13.0-rc1+ #328
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014
Workqueue: events_unbound btrfs_async_reclaim_data_space
RIP: 0010:btrfs_join_transaction+0x12/0x20
RSP: 0018:ffffb2b780b77de0 EFLAGS: 00000246
RAX: ffffb2b781863d58 RBX: 0000000000000000 RCX: 0000000000000000
RDX: 0000000000000801 RSI: ffff987952b57400 RDI: ffff987940aa3000
RBP: ffff987954d55000 R08: 0000000000000001 R09: ffff98795539e8f0
R10: 000000000000000f R11: 000000000000000f R12: ffffffffffffffff
R13: ffff987952b574c8 R14: ffff987952b57400 R15: 0000000000000008
FS:  0000000000000000(0000) GS:ffff9879bbc00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f0703da4000 CR3: 0000000113398004 CR4: 0000000000370ef0
Call Trace:
 flush_space+0x4a8/0x660
 btrfs_async_reclaim_data_space+0x55/0x130
 process_one_work+0x1e9/0x380
 worker_thread+0x53/0x3e0
 ? process_one_work+0x380/0x380
 kthread+0x118/0x140
 ? __kthread_bind_mask+0x60/0x60
 ret_from_fork+0x1f/0x30

Fix this by checking to see if we have a btrfs fs error in either of the
reclaim loops, and if so fail the tickets and bail.  In addition to
this, fix maybe_fail_all_tickets() to not try to grant tickets if we've
aborted; simply fail everything.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: David Sterba <dsterba@suse.com>
fs/btrfs/space-info.c

index 42d0fa2092d4bd0a8dfe1fc4ccf5acbd81e3dccf..077e54cdc29fb8710348f8c7c2f824e27e4262f0 100644 (file)
@@ -941,6 +941,7 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
        struct reserve_ticket *ticket;
        u64 tickets_id = space_info->tickets_id;
        u64 first_ticket_bytes = 0;
+       const bool aborted = btrfs_has_fs_error(fs_info);
 
        if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
                btrfs_info(fs_info, "cannot satisfy tickets, dumping space info");
@@ -952,7 +953,7 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
                ticket = list_first_entry(&space_info->tickets,
                                          struct reserve_ticket, list);
 
-               if (ticket->steal &&
+               if (!aborted && ticket->steal &&
                    steal_from_global_rsv(fs_info, space_info, ticket))
                        return true;
 
@@ -968,15 +969,18 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
                 */
                if (first_ticket_bytes == 0)
                        first_ticket_bytes = ticket->bytes;
-               else if (first_ticket_bytes > ticket->bytes)
+               else if (!aborted && first_ticket_bytes > ticket->bytes)
                        return true;
 
-               if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
+               if (!aborted && btrfs_test_opt(fs_info, ENOSPC_DEBUG))
                        btrfs_info(fs_info, "failing ticket with %llu bytes",
                                   ticket->bytes);
 
                remove_ticket(space_info, ticket);
-               ticket->error = -ENOSPC;
+               if (aborted)
+                       ticket->error = -EIO;
+               else
+                       ticket->error = -ENOSPC;
                wake_up(&ticket->wait);
 
                /*
@@ -985,7 +989,8 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
                 * here to see if we can make progress with the next ticket in
                 * the list.
                 */
-               btrfs_try_granting_tickets(fs_info, space_info);
+               if (!aborted)
+                       btrfs_try_granting_tickets(fs_info, space_info);
        }
        return (tickets_id != space_info->tickets_id);
 }
@@ -1253,6 +1258,10 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work)
                        spin_unlock(&space_info->lock);
                        return;
                }
+
+               /* Something happened, fail everything and bail. */
+               if (btrfs_has_fs_error(fs_info))
+                       goto aborted_fs;
                last_tickets_id = space_info->tickets_id;
                spin_unlock(&space_info->lock);
        }
@@ -1283,9 +1292,19 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work)
                        } else {
                                flush_state = 0;
                        }
+
+                       /* Something happened, fail everything and bail. */
+                       if (btrfs_has_fs_error(fs_info))
+                               goto aborted_fs;
+
                }
                spin_unlock(&space_info->lock);
        }
+       return;
+aborted_fs:
+       maybe_fail_all_tickets(fs_info, space_info);
+       space_info->flush = 0;
+       spin_unlock(&space_info->lock);
 }
 
 void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)