]> www.infradead.org Git - users/mchehab/rasdaemon.git/commitdiff
ras-page-isolation: do_page_offline always considers page offline was successful
author: lvying <lvying6@huawei.com>
Sat, 31 Oct 2020 09:57:14 +0000 (17:57 +0800)
committer: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Wed, 23 Dec 2020 09:44:03 +0000 (10:44 +0100)
do_page_offline always considers the page offline to have succeeded,
even if the kernel failed to soft/hard offline the page.

Calling rasdaemon with:

/etc/sysconfig/rasdaemon PAGE_CE_THRESHOLD="1"

i.e. when a Corrected Error occurs at an address within a page,
rasdaemon should trigger a soft offline of that page.

However, after adding a livepatch to the kernel's
store_soft_offline_page to observe this function's return value,
when injecting a CE at address 0x3f7ec30000, the kernel
log reports:

soft_offline: 0x3f7ec30: unknown non LRU page type ffffe0000000000 ()
[store_soft_offline_page]return from soft_offline_page: -5

While rasdaemon log reports:

rasdaemon[73711]: cpu 00:rasdaemon: Corrected Errors at 0x3f7ec30000 exceed threshold
rasdaemon[73711]: rasdaemon: Result of offlining page at 0x3f7ec30000: offlined

Using strace to record rasdaemon's system calls shows:

strace -p 73711
openat(AT_FDCWD, "/sys/devices/system/memory/soft_offline_page",
       O_WRONLY|O_CREAT|O_TRUNC, 0666) = 28
fstat(28, {st_mode=S_IFREG|0200, st_size=4096, ...}) = 0
write(28, "0x3f7ec30000", 12)           = -1 EIO (Input/output error)
close(28)                               = 0

So the kernel actually failed to soft offline pfn 0x3f7ec30, and
store_soft_offline_page returned -EIO. However, rasdaemon always
considers the page offline to be successful.

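According to the strace output, ferror was unable to detect the
failure of the write syscall: stdio buffers the fprintf output, so
the actual write is only issued when fclose flushes the stream,
which happens after ferror has already been checked.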

This patch replaces the fopen-fprintf-ferror-fclose sequence with
the lower-level open-write-close sequence, which can detect such
a syscall failure.

Signed-off-by: lvying <lvying6@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
ras-page-isolation.c

index 50e440660d9e4c93e47c2256c7f5955f4588ea88..dc07545297c16eebfc55feb4117eee8f5cf5e00b 100644 (file)
@@ -17,6 +17,9 @@
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
 #include "ras-logger.h"
 #include "ras-page-isolation.h"
 
@@ -210,18 +213,22 @@ void ras_page_account_init(void)
 
 static int do_page_offline(unsigned long long addr, enum otype type)
 {
-       FILE *offline_file;
-       int err;
+       int fd, rc;
+       char buf[20];
 
-       offline_file = fopen(kernel_offline[type], "w");
-       if (!offline_file)
+       fd = open(kernel_offline[type], O_WRONLY);
+       if (fd == -1) {
+               log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, kernel_offline[type]);
                return -1;
+       }
 
-       fprintf(offline_file, "%#llx", addr);
-       err = ferror(offline_file) ? -1 : 0;
-       fclose(offline_file);
-
-       return err;
+       sprintf(buf, "%#llx", addr);
+       rc = write(fd, buf, strlen(buf));
+       if (rc < 0) {
+               log(TERM, LOG_ERR, "page offline addr(%s) by %s failed, errno:%d\n", buf, kernel_offline[type], errno);
+       }
+       close(fd);
+       return rc;
 }
 
 static void page_offline(struct page_record *pr)