www.infradead.org Git - users/hch/blktests.git/commitdiff
tests/nvme: Add admin-passthru+reset race test nvme-reset
author: Jonathan Derrick <jonathan.derrick@linux.dev>
Mon, 14 Nov 2022 20:34:12 +0000 (13:34 -0700)
committer: Christoph Hellwig <hch@lst.de>
Wed, 16 Nov 2022 07:28:48 +0000 (08:28 +0100)
Adds a test that runs many format and reset_controller operations in
parallel. The intent is to expose timing holes in the controller state
machine that lead to hung-task timeouts and to the controller becoming
unavailable.

Reported by https://bugzilla.kernel.org/show_bug.cgi?id=216354

Signed-off-by: Jonathan Derrick <jonathan.derrick@linux.dev>
Signed-off-by: Christoph Hellwig <hch@lst.de>
tests/nvme/046 [new file with mode: 0755]
tests/nvme/046.out [new file with mode: 0644]

diff --git a/tests/nvme/046 b/tests/nvme/046
new file mode 100755 (executable)
index 0000000..4b47783
--- /dev/null
@@ -0,0 +1,85 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-3.0+
+# Copyright (C) 2022 Jonathan Derrick <jonathan.derrick@linux.dev>
+#
+# Test nvme reset controller during admin passthru
+#
+# Regression test for the issue reported at
+# https://bugzilla.kernel.org/show_bug.cgi?id=216354
+
+. tests/nvme/rc
+
+#restrict test to nvme-pci only
+nvme_trtype=pci  # set after sourcing tests/nvme/rc, so it overrides any value assigned there
+
+DESCRIPTION="test nvme reset controller during admin passthru"  # one-line summary of what this test exercises
+QUICK=1  # NOTE(review): presumably tells the blktests harness this test finishes quickly -- confirm
+CAN_BE_ZONED=1  # NOTE(review): presumably allows running against zoned test devices -- confirm
+
+requires() {
+       _nvme_requires
+}
+
+device_requires() {
+       _require_test_dev_is_nvme
+}
+
+test_device() {
+       echo "Running ${TEST_NAME}"
+
+       local sysfs
+       local attr
+       local m
+
+       sysfs="$TEST_DEV_SYSFS/device"
+       timeout=$(($(cat /proc/sys/kernel/hung_task_timeout_secs) / 2))
+
+       sleep 5
+
+       if [[ ! -d "$sysfs" ]]; then
+               echo "$sysfs doesn't exist"
+       fi
+
+       # do reset controller/format loops
+       # don't check status now because a timing race is desired
+       i=0
+       start=0
+       timing_out=false
+       while [[ $i -le 1000 ]]; do
+               start=$SECONDS
+               if [[ -f "$sysfs/reset_controller" ]]; then
+                       echo 1 > "$sysfs/reset_controller" 2>/dev/null &
+                       i=$((i+1))
+               fi
+               nvme format -l 0 -f $TEST_DEV 2>/dev/null &
+
+               #Assume the controller is hung and unrecoverable
+               if [[ $(($SECONDS - $start)) -gt $timeout ]]; then
+                       echo "nvme controller timing out"
+                       timing_out=true
+                       break
+               fi
+       done
+
+       { kill $!; wait; } &> /dev/null
+
+       # at this point it may have waited hung_task_timeout / 2 already, so
+       # only wait 25% longer for a total of about 75% of allowed timeout
+       m=0
+       while [[ $m -le $((timeout / 2)) ]]; do
+               if [[ $timing_out == true ]]; then
+                       break
+               fi
+               if grep -q live "$sysfs/state"; then
+                       break
+               fi
+               sleep 1
+               m=$((m+1))
+       done
+       if ! grep -q live "$sysfs/state"; then
+               echo "nvme still not live after $(($SECONDS - $start)) seconds!"
+       fi
+       udevadm settle
+
+       echo "Test complete"
+}
diff --git a/tests/nvme/046.out b/tests/nvme/046.out
new file mode 100644 (file)
index 0000000..2b5fa6a
--- /dev/null
@@ -0,0 +1,2 @@
+Running nvme/046
+Test complete