]> www.infradead.org Git - users/jedix/linux-maple.git/commitdiff
mm/madvise: unrestrict process_madvise() for current process
authorLorenzo Stoakes <lorenzo.stoakes@oracle.com>
Thu, 26 Sep 2024 15:10:19 +0000 (16:10 +0100)
committerAndrew Morton <akpm@linux-foundation.org>
Fri, 1 Nov 2024 04:28:48 +0000 (21:28 -0700)
The process_madvise() call was introduced in commit ecb8ac8b1f14
("mm/madvise: introduce process_madvise() syscall: an external memory
hinting API") as a means of performing madvise() operations on another
process.

However, as it provides the means by which to perform multiple madvise()
operations in a batch via an iovec, it is useful to utilise the same
interface for performing operations on the current process rather than a
remote one.

Commit 22af8caff7d1 ("mm/madvise: process_madvise() drop capability check
if same mm") removed the need for a caller invoking process_madvise() on
its own pidfd to possess the CAP_SYS_NICE capability, however this leaves
the restrictions on operation in place.

Resolve this by only applying the restriction on operations when accessing
a remote process.

Moving forward we plan to implement a simpler means of specifying this
condition other than needing to establish a self pidfd, perhaps in the
form of a sentinel pidfd.

Also take the opportunity to refactor the system call implementation
abstracting the vectorised operation.

Link: https://lkml.kernel.org/r/20240926151019.82902-1-lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Christian Brauner <brauner@kernel.org>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Pedro Falcato <pedro.falcato@gmail.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
mm/madvise.c

index 50d223ab3894fe128c69b00b2bd607908b914dc6..e871a72a6c32951b8580a219ae0b4ce9b078e215 100644 (file)
@@ -1208,7 +1208,8 @@ madvise_behavior_valid(int behavior)
        }
 }
 
-static bool process_madvise_behavior_valid(int behavior)
+/* Can we invoke process_madvise() on a remote mm for the specified behavior? */
+static bool process_madvise_remote_valid(int behavior)
 {
        switch (behavior) {
        case MADV_COLD:
@@ -1477,6 +1478,28 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
        return do_madvise(current->mm, start, len_in, behavior);
 }
 
+/* Perform an madvise operation over a vector of addresses and lengths. */
+static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
+                             int behavior)
+{
+       ssize_t ret = 0;
+       size_t total_len;
+
+       total_len = iov_iter_count(iter);
+
+       while (iov_iter_count(iter)) {
+               ret = do_madvise(mm, (unsigned long)iter_iov_addr(iter),
+                                iter_iov_len(iter), behavior);
+               if (ret < 0)
+                       break;
+               iov_iter_advance(iter, iter_iov_len(iter));
+       }
+
+       ret = (total_len - iov_iter_count(iter)) ? : ret;
+
+       return ret;
+}
+
 SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
                size_t, vlen, int, behavior, unsigned int, flags)
 {
@@ -1486,7 +1509,6 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
        struct iov_iter iter;
        struct task_struct *task;
        struct mm_struct *mm;
-       size_t total_len;
        unsigned int f_flags;
 
        if (flags != 0) {
@@ -1504,11 +1526,6 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
                goto free_iov;
        }
 
-       if (!process_madvise_behavior_valid(behavior)) {
-               ret = -EINVAL;
-               goto release_task;
-       }
-
        /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
        mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
        if (IS_ERR(mm)) {
@@ -1516,26 +1533,26 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
                goto release_task;
        }
 
+       /*
+        * We need only perform this check if we are attempting to manipulate a
+        * remote process's address space.
+        */
+       if (mm != current->mm && !process_madvise_remote_valid(behavior)) {
+               ret = -EINVAL;
+               goto release_mm;
+       }
+
        /*
         * Require CAP_SYS_NICE for influencing process performance. Note that
-        * only non-destructive hints are currently supported.
+        * only non-destructive hints are currently supported for remote
+        * processes.
         */
        if (mm != current->mm && !capable(CAP_SYS_NICE)) {
                ret = -EPERM;
                goto release_mm;
        }
 
-       total_len = iov_iter_count(&iter);
-
-       while (iov_iter_count(&iter)) {
-               ret = do_madvise(mm, (unsigned long)iter_iov_addr(&iter),
-                                       iter_iov_len(&iter), behavior);
-               if (ret < 0)
-                       break;
-               iov_iter_advance(&iter, iter_iov_len(&iter));
-       }
-
-       ret = (total_len - iov_iter_count(&iter)) ? : ret;
+       ret = vector_madvise(mm, &iter, behavior);
 
 release_mm:
        mmput(mm);