www.infradead.org Git - users/hch/xfsprogs.git/commitdiff
xfs_scrub_all: implement retry and backoff for dbus calls
author: Darrick J. Wong <djwong@kernel.org>
Wed, 3 Jul 2024 21:21:18 +0000 (14:21 -0700)
committer: Darrick J. Wong <djwong@kernel.org>
Tue, 9 Jul 2024 22:37:00 +0000 (15:37 -0700)
Calls to systemd across dbus are remote procedure calls, which means
that they're subject to transitory connection failures (e.g. systemd
re-executing itself).  We don't want to fail at the *first* sign of what
could be temporary trouble, so implement a limited retry with Fibonacci
backoff before we resort to invoking xfs_scrub as a subprocess.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
scrub/xfs_scrub_all.in

index a09566efdcd8d27acf8e3a64bc2b37163f1ecfe4..71726cdf36d5dee99761e9dbe2a14f73a94cd7de 100644 (file)
@@ -165,6 +165,22 @@ def path_to_serviceunit(path, scrub_media):
        for line in proc.stdout:
                return line.decode(sys.stdout.encoding).strip()
 
+def fibonacci(max_ret):
+       '''Yield fibonacci sequence (1, 1, 2, 3, ...) up to and including max_ret.'''
+       if max_ret < 1:
+               return
+
+       x = 0
+       y = 1
+       yield 1
+
+       z = x + y
+       while z <= max_ret:
+               yield z
+               x = y
+               y = z
+               z = x + y
+
 class scrub_service(scrub_control):
        '''Control object for xfs_scrub systemd service.'''
        def __init__(self, mnt, scrub_media):
@@ -188,6 +204,25 @@ class scrub_service(scrub_control):
                self.unit = dbus.Interface(svc_obj,
                                'org.freedesktop.systemd1.Unit')
 
+       def __dbusrun(self, lambda_fn):
+               '''Call the lambda function to execute something on dbus and
+               return its result.  Each dbus exception triggers a Fibonacci
+               backoff sleep and a rebind; the last one is raised if all fail.'''
+               global debug
+
+               fatal_ex = None
+
+               for i in fibonacci(30):
+                       try:
+                               return lambda_fn()
+                       except dbus.exceptions.DBusException as e:
+                               if debug:
+                                       print(e)
+                               fatal_ex = e
+                               time.sleep(i)
+                               self.bind()
+               raise fatal_ex
+
        def state(self):
                '''Retrieve the active state for a systemd service.  As of
                systemd 249, this is supposed to be one of the following:
@@ -195,8 +230,10 @@ class scrub_service(scrub_control):
                or "deactivating".  These strings are not localized.'''
                global debug
 
+               l = lambda: self.prop.Get('org.freedesktop.systemd1.Unit',
+                               'ActiveState')
                try:
-                       return self.prop.Get('org.freedesktop.systemd1.Unit', 'ActiveState')
+                       return self.__dbusrun(l)
                except Exception as e:
                        if debug:
                                print(e, file = sys.stderr)
@@ -231,7 +268,7 @@ class scrub_service(scrub_control):
                        print('starting %s' % self.unitname)
 
                try:
-                       self.unit.Start('replace')
+                       self.__dbusrun(lambda: self.unit.Start('replace'))
                        return self.wait()
                except Exception as e:
                        print(e, file = sys.stderr)
@@ -245,7 +282,7 @@ class scrub_service(scrub_control):
                        print('stopping %s' % self.unitname)
 
                try:
-                       self.unit.Stop('replace')
+                       self.__dbusrun(lambda: self.unit.Stop('replace'))
                        return self.wait()
                except Exception as e:
                        print(e, file = sys.stderr)