mm: memcg: move legacy memcg event code into memcontrol-v1.c
author		Roman Gushchin <roman.gushchin@linux.dev>
Tue, 25 Jun 2024 00:58:58 +0000 (17:58 -0700)
committer	Andrew Morton <akpm@linux-foundation.org>
Fri, 5 Jul 2024 01:05:52 +0000 (18:05 -0700)
Cgroup v1's memory controller contains a pretty complicated event
notification mechanism which is not used on cgroup v2.  Let's move the
corresponding code into memcontrol-v1.c.

Please note that mem_cgroup_event_ratelimit() remains in memcontrol.c;
moving it would require exporting too many details of the memcg stats
outside of memcontrol.c.
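
The interface being relocated is the cgroup v1 "cgroup.event_control"
file: userspace writes "<event_fd> <control_fd> <args>" to it and is then
signalled through the eventfd.  A minimal sketch of a usage-threshold
listener follows; it assumes a v1 memory controller mounted at
/sys/fs/cgroup/memory with a child cgroup named "demo", and the 4 MiB
threshold is purely illustrative.

	#include <stdio.h>
	#include <string.h>
	#include <stdint.h>
	#include <unistd.h>
	#include <fcntl.h>
	#include <sys/eventfd.h>

	int main(void)
	{
		int efd, ufd, cfd;
		char cmd[64];
		uint64_t count;

		efd = eventfd(0, 0);				/* event_fd */
		ufd = open("/sys/fs/cgroup/memory/demo/memory.usage_in_bytes",
			   O_RDONLY);				/* control_fd */
		cfd = open("/sys/fs/cgroup/memory/demo/cgroup.event_control",
			   O_WRONLY);
		if (efd < 0 || ufd < 0 || cfd < 0) {
			perror("open");
			return 1;
		}

		/* "<event_fd> <control_fd> <args>": args is a threshold in bytes */
		snprintf(cmd, sizeof(cmd), "%d %d %d", efd, ufd, 4 * 1024 * 1024);
		if (write(cfd, cmd, strlen(cmd)) < 0) {
			perror("write cgroup.event_control");
			return 1;
		}

		/* Blocks until memory.usage_in_bytes crosses the threshold. */
		if (read(efd, &count, sizeof(count)) == sizeof(count))
			printf("threshold crossed %llu time(s)\n",
			       (unsigned long long)count);
		return 0;
	}

memcg_write_event_control(), moved below, is the kernel-side parser for
exactly this string.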

Link: https://lkml.kernel.org/r/20240625005906.106920-7-roman.gushchin@linux.dev
Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
include/linux/memcontrol.h
mm/memcontrol-v1.c
mm/memcontrol-v1.h
mm/memcontrol.c

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 83c8327455d8feb77d8c042cdf87a7eeef60f40a..588179d29849d49217c94fbcabe338eb121bed08 100644 (file)
@@ -69,18 +69,6 @@ struct mem_cgroup_id {
        refcount_t ref;
 };
 
-/*
- * Per memcg event counter is incremented at every pagein/pageout. With THP,
- * it will be incremented by the number of pages. This counter is used
- * to trigger some periodic events. This is straightforward and better
- * than using jiffies etc. to handle periodic memcg event.
- */
-enum mem_cgroup_events_target {
-       MEM_CGROUP_TARGET_THRESH,
-       MEM_CGROUP_TARGET_SOFTLIMIT,
-       MEM_CGROUP_NTARGETS,
-};
-
 struct memcg_vmstats_percpu;
 struct memcg_vmstats;
 struct lruvec_stats_percpu;
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index c25e038ac874e76ad2c1b3451717ad5e9505b878..4b2290ceace693ffd77225181d5da6810efa8c9d 100644 (file)
@@ -6,6 +6,10 @@
 #include <linux/pagewalk.h>
 #include <linux/backing-dev.h>
 #include <linux/swap_cgroup.h>
+#include <linux/eventfd.h>
+#include <linux/poll.h>
+#include <linux/sort.h>
+#include <linux/file.h>
 
 #include "internal.h"
 #include "swap.h"
@@ -60,6 +64,54 @@ static struct move_charge_struct {
        .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
 };
 
+/* for OOM */
+struct mem_cgroup_eventfd_list {
+       struct list_head list;
+       struct eventfd_ctx *eventfd;
+};
+
+/*
+ * cgroup_event represents events which userspace want to receive.
+ */
+struct mem_cgroup_event {
+       /*
+        * memcg which the event belongs to.
+        */
+       struct mem_cgroup *memcg;
+       /*
+        * eventfd to signal userspace about the event.
+        */
+       struct eventfd_ctx *eventfd;
+       /*
+        * Each of these stored in a list by the cgroup.
+        */
+       struct list_head list;
+       /*
+        * register_event() callback will be used to add new userspace
+        * waiter for changes related to this event.  Use eventfd_signal()
+        * on eventfd to send notification to userspace.
+        */
+       int (*register_event)(struct mem_cgroup *memcg,
+                             struct eventfd_ctx *eventfd, const char *args);
+       /*
+        * unregister_event() callback will be called when userspace closes
+        * the eventfd or on cgroup removing.  This callback must be set,
+        * if you want provide notification functionality.
+        */
+       void (*unregister_event)(struct mem_cgroup *memcg,
+                                struct eventfd_ctx *eventfd);
+       /*
+        * All fields below needed to unregister event when
+        * userspace closes eventfd.
+        */
+       poll_table pt;
+       wait_queue_head_t *wqh;
+       wait_queue_entry_t wait;
+       struct work_struct remove;
+};
+
+extern spinlock_t memcg_oom_lock;
+
 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
                                         struct mem_cgroup_tree_per_node *mctz,
                                         unsigned long new_usage_in_excess)
@@ -1306,6 +1358,607 @@ void memcg1_move_task(void)
 }
 #endif
 
+static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
+{
+       struct mem_cgroup_threshold_ary *t;
+       unsigned long usage;
+       int i;
+
+       rcu_read_lock();
+       if (!swap)
+               t = rcu_dereference(memcg->thresholds.primary);
+       else
+               t = rcu_dereference(memcg->memsw_thresholds.primary);
+
+       if (!t)
+               goto unlock;
+
+       usage = mem_cgroup_usage(memcg, swap);
+
+       /*
+        * current_threshold points to threshold just below or equal to usage.
+        * If it's not true, a threshold was crossed after last
+        * call of __mem_cgroup_threshold().
+        */
+       i = t->current_threshold;
+
+       /*
+        * Iterate backward over array of thresholds starting from
+        * current_threshold and check if a threshold is crossed.
+        * If none of thresholds below usage is crossed, we read
+        * only one element of the array here.
+        */
+       for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
+               eventfd_signal(t->entries[i].eventfd);
+
+       /* i = current_threshold + 1 */
+       i++;
+
+       /*
+        * Iterate forward over array of thresholds starting from
+        * current_threshold+1 and check if a threshold is crossed.
+        * If none of thresholds above usage is crossed, we read
+        * only one element of the array here.
+        */
+       for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
+               eventfd_signal(t->entries[i].eventfd);
+
+       /* Update current_threshold */
+       t->current_threshold = i - 1;
+unlock:
+       rcu_read_unlock();
+}
+
+static void mem_cgroup_threshold(struct mem_cgroup *memcg)
+{
+       while (memcg) {
+               __mem_cgroup_threshold(memcg, false);
+               if (do_memsw_account())
+                       __mem_cgroup_threshold(memcg, true);
+
+               memcg = parent_mem_cgroup(memcg);
+       }
+}
+
+/*
+ * Check events in order.
+ *
+ */
+void memcg_check_events(struct mem_cgroup *memcg, int nid)
+{
+       if (IS_ENABLED(CONFIG_PREEMPT_RT))
+               return;
+
+       /* threshold event is triggered in finer grain than soft limit */
+       if (unlikely(mem_cgroup_event_ratelimit(memcg,
+                                               MEM_CGROUP_TARGET_THRESH))) {
+               bool do_softlimit;
+
+               do_softlimit = mem_cgroup_event_ratelimit(memcg,
+                                               MEM_CGROUP_TARGET_SOFTLIMIT);
+               mem_cgroup_threshold(memcg);
+               if (unlikely(do_softlimit))
+                       memcg1_update_tree(memcg, nid);
+       }
+}
+
+static int compare_thresholds(const void *a, const void *b)
+{
+       const struct mem_cgroup_threshold *_a = a;
+       const struct mem_cgroup_threshold *_b = b;
+
+       if (_a->threshold > _b->threshold)
+               return 1;
+
+       if (_a->threshold < _b->threshold)
+               return -1;
+
+       return 0;
+}
+
+static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
+{
+       struct mem_cgroup_eventfd_list *ev;
+
+       spin_lock(&memcg_oom_lock);
+
+       list_for_each_entry(ev, &memcg->oom_notify, list)
+               eventfd_signal(ev->eventfd);
+
+       spin_unlock(&memcg_oom_lock);
+       return 0;
+}
+
+void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
+{
+       struct mem_cgroup *iter;
+
+       for_each_mem_cgroup_tree(iter, memcg)
+               mem_cgroup_oom_notify_cb(iter);
+}
+
+static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
+       struct eventfd_ctx *eventfd, const char *args, enum res_type type)
+{
+       struct mem_cgroup_thresholds *thresholds;
+       struct mem_cgroup_threshold_ary *new;
+       unsigned long threshold;
+       unsigned long usage;
+       int i, size, ret;
+
+       ret = page_counter_memparse(args, "-1", &threshold);
+       if (ret)
+               return ret;
+
+       mutex_lock(&memcg->thresholds_lock);
+
+       if (type == _MEM) {
+               thresholds = &memcg->thresholds;
+               usage = mem_cgroup_usage(memcg, false);
+       } else if (type == _MEMSWAP) {
+               thresholds = &memcg->memsw_thresholds;
+               usage = mem_cgroup_usage(memcg, true);
+       } else
+               BUG();
+
+       /* Check if a threshold crossed before adding a new one */
+       if (thresholds->primary)
+               __mem_cgroup_threshold(memcg, type == _MEMSWAP);
+
+       size = thresholds->primary ? thresholds->primary->size + 1 : 1;
+
+       /* Allocate memory for new array of thresholds */
+       new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
+       if (!new) {
+               ret = -ENOMEM;
+               goto unlock;
+       }
+       new->size = size;
+
+       /* Copy thresholds (if any) to new array */
+       if (thresholds->primary)
+               memcpy(new->entries, thresholds->primary->entries,
+                      flex_array_size(new, entries, size - 1));
+
+       /* Add new threshold */
+       new->entries[size - 1].eventfd = eventfd;
+       new->entries[size - 1].threshold = threshold;
+
+       /* Sort thresholds. Registering of new threshold isn't time-critical */
+       sort(new->entries, size, sizeof(*new->entries),
+                       compare_thresholds, NULL);
+
+       /* Find current threshold */
+       new->current_threshold = -1;
+       for (i = 0; i < size; i++) {
+               if (new->entries[i].threshold <= usage) {
+                       /*
+                        * new->current_threshold will not be used until
+                        * rcu_assign_pointer(), so it's safe to increment
+                        * it here.
+                        */
+                       ++new->current_threshold;
+               } else
+                       break;
+       }
+
+       /* Free old spare buffer and save old primary buffer as spare */
+       kfree(thresholds->spare);
+       thresholds->spare = thresholds->primary;
+
+       rcu_assign_pointer(thresholds->primary, new);
+
+       /* To be sure that nobody uses thresholds */
+       synchronize_rcu();
+
+unlock:
+       mutex_unlock(&memcg->thresholds_lock);
+
+       return ret;
+}
+
+static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
+       struct eventfd_ctx *eventfd, const char *args)
+{
+       return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
+}
+
+static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
+       struct eventfd_ctx *eventfd, const char *args)
+{
+       return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
+}
+
+static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+       struct eventfd_ctx *eventfd, enum res_type type)
+{
+       struct mem_cgroup_thresholds *thresholds;
+       struct mem_cgroup_threshold_ary *new;
+       unsigned long usage;
+       int i, j, size, entries;
+
+       mutex_lock(&memcg->thresholds_lock);
+
+       if (type == _MEM) {
+               thresholds = &memcg->thresholds;
+               usage = mem_cgroup_usage(memcg, false);
+       } else if (type == _MEMSWAP) {
+               thresholds = &memcg->memsw_thresholds;
+               usage = mem_cgroup_usage(memcg, true);
+       } else
+               BUG();
+
+       if (!thresholds->primary)
+               goto unlock;
+
+       /* Check if a threshold crossed before removing */
+       __mem_cgroup_threshold(memcg, type == _MEMSWAP);
+
+       /* Calculate new number of threshold */
+       size = entries = 0;
+       for (i = 0; i < thresholds->primary->size; i++) {
+               if (thresholds->primary->entries[i].eventfd != eventfd)
+                       size++;
+               else
+                       entries++;
+       }
+
+       new = thresholds->spare;
+
+       /* If no items related to eventfd have been cleared, nothing to do */
+       if (!entries)
+               goto unlock;
+
+       /* Set thresholds array to NULL if we don't have thresholds */
+       if (!size) {
+               kfree(new);
+               new = NULL;
+               goto swap_buffers;
+       }
+
+       new->size = size;
+
+       /* Copy thresholds and find current threshold */
+       new->current_threshold = -1;
+       for (i = 0, j = 0; i < thresholds->primary->size; i++) {
+               if (thresholds->primary->entries[i].eventfd == eventfd)
+                       continue;
+
+               new->entries[j] = thresholds->primary->entries[i];
+               if (new->entries[j].threshold <= usage) {
+                       /*
+                        * new->current_threshold will not be used
+                        * until rcu_assign_pointer(), so it's safe to increment
+                        * it here.
+                        */
+                       ++new->current_threshold;
+               }
+               j++;
+       }
+
+swap_buffers:
+       /* Swap primary and spare array */
+       thresholds->spare = thresholds->primary;
+
+       rcu_assign_pointer(thresholds->primary, new);
+
+       /* To be sure that nobody uses thresholds */
+       synchronize_rcu();
+
+       /* If all events are unregistered, free the spare array */
+       if (!new) {
+               kfree(thresholds->spare);
+               thresholds->spare = NULL;
+       }
+unlock:
+       mutex_unlock(&memcg->thresholds_lock);
+}
+
+static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+       struct eventfd_ctx *eventfd)
+{
+       return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
+}
+
+static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+       struct eventfd_ctx *eventfd)
+{
+       return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
+}
+
+static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
+       struct eventfd_ctx *eventfd, const char *args)
+{
+       struct mem_cgroup_eventfd_list *event;
+
+       event = kmalloc(sizeof(*event), GFP_KERNEL);
+       if (!event)
+               return -ENOMEM;
+
+       spin_lock(&memcg_oom_lock);
+
+       event->eventfd = eventfd;
+       list_add(&event->list, &memcg->oom_notify);
+
+       /* already in OOM ? */
+       if (memcg->under_oom)
+               eventfd_signal(eventfd);
+       spin_unlock(&memcg_oom_lock);
+
+       return 0;
+}
+
+static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
+       struct eventfd_ctx *eventfd)
+{
+       struct mem_cgroup_eventfd_list *ev, *tmp;
+
+       spin_lock(&memcg_oom_lock);
+
+       list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
+               if (ev->eventfd == eventfd) {
+                       list_del(&ev->list);
+                       kfree(ev);
+               }
+       }
+
+       spin_unlock(&memcg_oom_lock);
+}
+
+/*
+ * DO NOT USE IN NEW FILES.
+ *
+ * "cgroup.event_control" implementation.
+ *
+ * This is way over-engineered.  It tries to support fully configurable
+ * events for each user.  Such level of flexibility is completely
+ * unnecessary especially in the light of the planned unified hierarchy.
+ *
+ * Please deprecate this and replace with something simpler if at all
+ * possible.
+ */
+
+/*
+ * Unregister event and free resources.
+ *
+ * Gets called from workqueue.
+ */
+static void memcg_event_remove(struct work_struct *work)
+{
+       struct mem_cgroup_event *event =
+               container_of(work, struct mem_cgroup_event, remove);
+       struct mem_cgroup *memcg = event->memcg;
+
+       remove_wait_queue(event->wqh, &event->wait);
+
+       event->unregister_event(memcg, event->eventfd);
+
+       /* Notify userspace the event is going away. */
+       eventfd_signal(event->eventfd);
+
+       eventfd_ctx_put(event->eventfd);
+       kfree(event);
+       css_put(&memcg->css);
+}
+
+/*
+ * Gets called on EPOLLHUP on eventfd when user closes it.
+ *
+ * Called with wqh->lock held and interrupts disabled.
+ */
+static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
+                           int sync, void *key)
+{
+       struct mem_cgroup_event *event =
+               container_of(wait, struct mem_cgroup_event, wait);
+       struct mem_cgroup *memcg = event->memcg;
+       __poll_t flags = key_to_poll(key);
+
+       if (flags & EPOLLHUP) {
+               /*
+                * If the event has been detached at cgroup removal, we
+                * can simply return knowing the other side will cleanup
+                * for us.
+                *
+                * We can't race against event freeing since the other
+                * side will require wqh->lock via remove_wait_queue(),
+                * which we hold.
+                */
+               spin_lock(&memcg->event_list_lock);
+               if (!list_empty(&event->list)) {
+                       list_del_init(&event->list);
+                       /*
+                        * We are in atomic context, but cgroup_event_remove()
+                        * may sleep, so we have to call it in workqueue.
+                        */
+                       schedule_work(&event->remove);
+               }
+               spin_unlock(&memcg->event_list_lock);
+       }
+
+       return 0;
+}
+
+static void memcg_event_ptable_queue_proc(struct file *file,
+               wait_queue_head_t *wqh, poll_table *pt)
+{
+       struct mem_cgroup_event *event =
+               container_of(pt, struct mem_cgroup_event, pt);
+
+       event->wqh = wqh;
+       add_wait_queue(wqh, &event->wait);
+}
+
+/*
+ * DO NOT USE IN NEW FILES.
+ *
+ * Parse input and register new cgroup event handler.
+ *
+ * Input must be in format '<event_fd> <control_fd> <args>'.
+ * Interpretation of args is defined by control file implementation.
+ */
+ssize_t memcg_write_event_control(struct kernfs_open_file *of,
+                                 char *buf, size_t nbytes, loff_t off)
+{
+       struct cgroup_subsys_state *css = of_css(of);
+       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+       struct mem_cgroup_event *event;
+       struct cgroup_subsys_state *cfile_css;
+       unsigned int efd, cfd;
+       struct fd efile;
+       struct fd cfile;
+       struct dentry *cdentry;
+       const char *name;
+       char *endp;
+       int ret;
+
+       if (IS_ENABLED(CONFIG_PREEMPT_RT))
+               return -EOPNOTSUPP;
+
+       buf = strstrip(buf);
+
+       efd = simple_strtoul(buf, &endp, 10);
+       if (*endp != ' ')
+               return -EINVAL;
+       buf = endp + 1;
+
+       cfd = simple_strtoul(buf, &endp, 10);
+       if ((*endp != ' ') && (*endp != '\0'))
+               return -EINVAL;
+       buf = endp + 1;
+
+       event = kzalloc(sizeof(*event), GFP_KERNEL);
+       if (!event)
+               return -ENOMEM;
+
+       event->memcg = memcg;
+       INIT_LIST_HEAD(&event->list);
+       init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
+       init_waitqueue_func_entry(&event->wait, memcg_event_wake);
+       INIT_WORK(&event->remove, memcg_event_remove);
+
+       efile = fdget(efd);
+       if (!efile.file) {
+               ret = -EBADF;
+               goto out_kfree;
+       }
+
+       event->eventfd = eventfd_ctx_fileget(efile.file);
+       if (IS_ERR(event->eventfd)) {
+               ret = PTR_ERR(event->eventfd);
+               goto out_put_efile;
+       }
+
+       cfile = fdget(cfd);
+       if (!cfile.file) {
+               ret = -EBADF;
+               goto out_put_eventfd;
+       }
+
+       /* the process need read permission on control file */
+       /* AV: shouldn't we check that it's been opened for read instead? */
+       ret = file_permission(cfile.file, MAY_READ);
+       if (ret < 0)
+               goto out_put_cfile;
+
+       /*
+        * The control file must be a regular cgroup1 file. As a regular cgroup
+        * file can't be renamed, it's safe to access its name afterwards.
+        */
+       cdentry = cfile.file->f_path.dentry;
+       if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
+               ret = -EINVAL;
+               goto out_put_cfile;
+       }
+
+       /*
+        * Determine the event callbacks and set them in @event.  This used
+        * to be done via struct cftype but cgroup core no longer knows
+        * about these events.  The following is crude but the whole thing
+        * is for compatibility anyway.
+        *
+        * DO NOT ADD NEW FILES.
+        */
+       name = cdentry->d_name.name;
+
+       if (!strcmp(name, "memory.usage_in_bytes")) {
+               event->register_event = mem_cgroup_usage_register_event;
+               event->unregister_event = mem_cgroup_usage_unregister_event;
+       } else if (!strcmp(name, "memory.oom_control")) {
+               event->register_event = mem_cgroup_oom_register_event;
+               event->unregister_event = mem_cgroup_oom_unregister_event;
+       } else if (!strcmp(name, "memory.pressure_level")) {
+               event->register_event = vmpressure_register_event;
+               event->unregister_event = vmpressure_unregister_event;
+       } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
+               event->register_event = memsw_cgroup_usage_register_event;
+               event->unregister_event = memsw_cgroup_usage_unregister_event;
+       } else {
+               ret = -EINVAL;
+               goto out_put_cfile;
+       }
+
+       /*
+        * Verify @cfile should belong to @css.  Also, remaining events are
+        * automatically removed on cgroup destruction but the removal is
+        * asynchronous, so take an extra ref on @css.
+        */
+       cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
+                                              &memory_cgrp_subsys);
+       ret = -EINVAL;
+       if (IS_ERR(cfile_css))
+               goto out_put_cfile;
+       if (cfile_css != css) {
+               css_put(cfile_css);
+               goto out_put_cfile;
+       }
+
+       ret = event->register_event(memcg, event->eventfd, buf);
+       if (ret)
+               goto out_put_css;
+
+       vfs_poll(efile.file, &event->pt);
+
+       spin_lock_irq(&memcg->event_list_lock);
+       list_add(&event->list, &memcg->event_list);
+       spin_unlock_irq(&memcg->event_list_lock);
+
+       fdput(cfile);
+       fdput(efile);
+
+       return nbytes;
+
+out_put_css:
+       css_put(css);
+out_put_cfile:
+       fdput(cfile);
+out_put_eventfd:
+       eventfd_ctx_put(event->eventfd);
+out_put_efile:
+       fdput(efile);
+out_kfree:
+       kfree(event);
+
+       return ret;
+}
+
+void memcg1_css_offline(struct mem_cgroup *memcg)
+{
+       struct mem_cgroup_event *event, *tmp;
+
+       /*
+        * Unregister events and notify userspace.
+        * Notify userspace about cgroup removing only after rmdir of cgroup
+        * directory to avoid race between userspace and kernelspace.
+        */
+       spin_lock_irq(&memcg->event_list_lock);
+       list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
+               list_del_init(&event->list);
+               schedule_work(&event->remove);
+       }
+       spin_unlock_irq(&memcg->event_list_lock);
+}
+
 static int __init memcg1_init(void)
 {
        int node;
diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
index d377c0be9880b211b9c3a26d6dd502f27d88010a..524a2c76ffc97c738b643bd12772b01bb35835b9 100644 (file)
@@ -41,4 +41,55 @@ u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
 int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
                                 struct cftype *cft, u64 val);
 
+/*
+ * Per memcg event counter is incremented at every pagein/pageout. With THP,
+ * it will be incremented by the number of pages. This counter is used
+ * to trigger some periodic events. This is straightforward and better
+ * than using jiffies etc. to handle periodic memcg event.
+ */
+enum mem_cgroup_events_target {
+       MEM_CGROUP_TARGET_THRESH,
+       MEM_CGROUP_TARGET_SOFTLIMIT,
+       MEM_CGROUP_NTARGETS,
+};
+
+/* Whether legacy memory+swap accounting is active */
+static bool do_memsw_account(void)
+{
+       return !cgroup_subsys_on_dfl(memory_cgrp_subsys);
+}
+
+/*
+ * Iteration constructs for visiting all cgroups (under a tree).  If
+ * loops are exited prematurely (break), mem_cgroup_iter_break() must
+ * be used for reference counting.
+ */
+#define for_each_mem_cgroup_tree(iter, root)           \
+       for (iter = mem_cgroup_iter(root, NULL, NULL);  \
+            iter != NULL;                              \
+            iter = mem_cgroup_iter(root, iter, NULL))
+
+#define for_each_mem_cgroup(iter)                      \
+       for (iter = mem_cgroup_iter(NULL, NULL, NULL);  \
+            iter != NULL;                              \
+            iter = mem_cgroup_iter(NULL, iter, NULL))
+
+void memcg1_css_offline(struct mem_cgroup *memcg);
+
+/* for encoding cft->private value on file */
+enum res_type {
+       _MEM,
+       _MEMSWAP,
+       _KMEM,
+       _TCP,
+};
+
+bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
+                               enum mem_cgroup_events_target target);
+unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap);
+void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
+ssize_t memcg_write_event_control(struct kernfs_open_file *of,
+                                 char *buf, size_t nbytes, loff_t off);
+
+
 #endif /* __MM_MEMCONTROL_V1_H */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1b5d9c82bea2264a7a3adb407ee2bbae1e51a8f0..5a5bd0767fb0fc91f2c624b06267e1b5722e8247 100644 (file)
@@ -46,9 +46,6 @@
 #include <linux/slab.h>
 #include <linux/swapops.h>
 #include <linux/spinlock.h>
-#include <linux/eventfd.h>
-#include <linux/poll.h>
-#include <linux/sort.h>
 #include <linux/fs.h>
 #include <linux/seq_file.h>
 #include <linux/vmpressure.h>
@@ -58,7 +55,6 @@
 #include <linux/cpu.h>
 #include <linux/oom.h>
 #include <linux/lockdep.h>
-#include <linux/file.h>
 #include <linux/resume_user_mode.h>
 #include <linux/psi.h>
 #include <linux/seq_buf.h>
@@ -96,91 +92,13 @@ static bool cgroup_memory_nobpf __ro_after_init;
 static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
 #endif
 
-/* Whether legacy memory+swap accounting is active */
-static bool do_memsw_account(void)
-{
-       return !cgroup_subsys_on_dfl(memory_cgrp_subsys);
-}
-
 #define THRESHOLDS_EVENTS_TARGET 128
 #define SOFTLIMIT_EVENTS_TARGET 1024
 
-/* for OOM */
-struct mem_cgroup_eventfd_list {
-       struct list_head list;
-       struct eventfd_ctx *eventfd;
-};
-
-/*
- * cgroup_event represents events which userspace want to receive.
- */
-struct mem_cgroup_event {
-       /*
-        * memcg which the event belongs to.
-        */
-       struct mem_cgroup *memcg;
-       /*
-        * eventfd to signal userspace about the event.
-        */
-       struct eventfd_ctx *eventfd;
-       /*
-        * Each of these stored in a list by the cgroup.
-        */
-       struct list_head list;
-       /*
-        * register_event() callback will be used to add new userspace
-        * waiter for changes related to this event.  Use eventfd_signal()
-        * on eventfd to send notification to userspace.
-        */
-       int (*register_event)(struct mem_cgroup *memcg,
-                             struct eventfd_ctx *eventfd, const char *args);
-       /*
-        * unregister_event() callback will be called when userspace closes
-        * the eventfd or on cgroup removing.  This callback must be set,
-        * if you want provide notification functionality.
-        */
-       void (*unregister_event)(struct mem_cgroup *memcg,
-                                struct eventfd_ctx *eventfd);
-       /*
-        * All fields below needed to unregister event when
-        * userspace closes eventfd.
-        */
-       poll_table pt;
-       wait_queue_head_t *wqh;
-       wait_queue_entry_t wait;
-       struct work_struct remove;
-};
-
-static void mem_cgroup_threshold(struct mem_cgroup *memcg);
-static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
-
-/* for encoding cft->private value on file */
-enum res_type {
-       _MEM,
-       _MEMSWAP,
-       _KMEM,
-       _TCP,
-};
-
 #define MEMFILE_PRIVATE(x, val)        ((x) << 16 | (val))
 #define MEMFILE_TYPE(val)      ((val) >> 16 & 0xffff)
 #define MEMFILE_ATTR(val)      ((val) & 0xffff)
 
-/*
- * Iteration constructs for visiting all cgroups (under a tree).  If
- * loops are exited prematurely (break), mem_cgroup_iter_break() must
- * be used for reference counting.
- */
-#define for_each_mem_cgroup_tree(iter, root)           \
-       for (iter = mem_cgroup_iter(root, NULL, NULL);  \
-            iter != NULL;                              \
-            iter = mem_cgroup_iter(root, iter, NULL))
-
-#define for_each_mem_cgroup(iter)                      \
-       for (iter = mem_cgroup_iter(NULL, NULL, NULL);  \
-            iter != NULL;                              \
-            iter = mem_cgroup_iter(NULL, iter, NULL))
-
 static inline bool task_is_dying(void)
 {
        return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
@@ -939,8 +857,8 @@ void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, int nr_pages)
        __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
 }
 
-static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
-                                      enum mem_cgroup_events_target target)
+bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
+                               enum mem_cgroup_events_target target)
 {
        unsigned long val, next;
 
@@ -964,28 +882,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
        return false;
 }
 
-/*
- * Check events in order.
- *
- */
-void memcg_check_events(struct mem_cgroup *memcg, int nid)
-{
-       if (IS_ENABLED(CONFIG_PREEMPT_RT))
-               return;
-
-       /* threshold event is triggered in finer grain than soft limit */
-       if (unlikely(mem_cgroup_event_ratelimit(memcg,
-                                               MEM_CGROUP_TARGET_THRESH))) {
-               bool do_softlimit;
-
-               do_softlimit = mem_cgroup_event_ratelimit(memcg,
-                                               MEM_CGROUP_TARGET_SOFTLIMIT);
-               mem_cgroup_threshold(memcg);
-               if (unlikely(do_softlimit))
-                       memcg1_update_tree(memcg, nid);
-       }
-}
-
 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 {
        /*
@@ -1725,7 +1621,7 @@ static struct lockdep_map memcg_oom_lock_dep_map = {
 };
 #endif
 
-static DEFINE_SPINLOCK(memcg_oom_lock);
+DEFINE_SPINLOCK(memcg_oom_lock);
 
 /*
  * Check OOM-Killer is already running under our hierarchy.
@@ -3543,7 +3439,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
        return -EINVAL;
 }
 
-static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
+unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
 {
        unsigned long val;
 
@@ -4044,331 +3940,6 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
        return 0;
 }
 
-static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
-{
-       struct mem_cgroup_threshold_ary *t;
-       unsigned long usage;
-       int i;
-
-       rcu_read_lock();
-       if (!swap)
-               t = rcu_dereference(memcg->thresholds.primary);
-       else
-               t = rcu_dereference(memcg->memsw_thresholds.primary);
-
-       if (!t)
-               goto unlock;
-
-       usage = mem_cgroup_usage(memcg, swap);
-
-       /*
-        * current_threshold points to threshold just below or equal to usage.
-        * If it's not true, a threshold was crossed after last
-        * call of __mem_cgroup_threshold().
-        */
-       i = t->current_threshold;
-
-       /*
-        * Iterate backward over array of thresholds starting from
-        * current_threshold and check if a threshold is crossed.
-        * If none of thresholds below usage is crossed, we read
-        * only one element of the array here.
-        */
-       for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
-               eventfd_signal(t->entries[i].eventfd);
-
-       /* i = current_threshold + 1 */
-       i++;
-
-       /*
-        * Iterate forward over array of thresholds starting from
-        * current_threshold+1 and check if a threshold is crossed.
-        * If none of thresholds above usage is crossed, we read
-        * only one element of the array here.
-        */
-       for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
-               eventfd_signal(t->entries[i].eventfd);
-
-       /* Update current_threshold */
-       t->current_threshold = i - 1;
-unlock:
-       rcu_read_unlock();
-}
-
-static void mem_cgroup_threshold(struct mem_cgroup *memcg)
-{
-       while (memcg) {
-               __mem_cgroup_threshold(memcg, false);
-               if (do_memsw_account())
-                       __mem_cgroup_threshold(memcg, true);
-
-               memcg = parent_mem_cgroup(memcg);
-       }
-}
-
-static int compare_thresholds(const void *a, const void *b)
-{
-       const struct mem_cgroup_threshold *_a = a;
-       const struct mem_cgroup_threshold *_b = b;
-
-       if (_a->threshold > _b->threshold)
-               return 1;
-
-       if (_a->threshold < _b->threshold)
-               return -1;
-
-       return 0;
-}
-
-static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
-{
-       struct mem_cgroup_eventfd_list *ev;
-
-       spin_lock(&memcg_oom_lock);
-
-       list_for_each_entry(ev, &memcg->oom_notify, list)
-               eventfd_signal(ev->eventfd);
-
-       spin_unlock(&memcg_oom_lock);
-       return 0;
-}
-
-static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
-{
-       struct mem_cgroup *iter;
-
-       for_each_mem_cgroup_tree(iter, memcg)
-               mem_cgroup_oom_notify_cb(iter);
-}
-
-static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
-       struct eventfd_ctx *eventfd, const char *args, enum res_type type)
-{
-       struct mem_cgroup_thresholds *thresholds;
-       struct mem_cgroup_threshold_ary *new;
-       unsigned long threshold;
-       unsigned long usage;
-       int i, size, ret;
-
-       ret = page_counter_memparse(args, "-1", &threshold);
-       if (ret)
-               return ret;
-
-       mutex_lock(&memcg->thresholds_lock);
-
-       if (type == _MEM) {
-               thresholds = &memcg->thresholds;
-               usage = mem_cgroup_usage(memcg, false);
-       } else if (type == _MEMSWAP) {
-               thresholds = &memcg->memsw_thresholds;
-               usage = mem_cgroup_usage(memcg, true);
-       } else
-               BUG();
-
-       /* Check if a threshold crossed before adding a new one */
-       if (thresholds->primary)
-               __mem_cgroup_threshold(memcg, type == _MEMSWAP);
-
-       size = thresholds->primary ? thresholds->primary->size + 1 : 1;
-
-       /* Allocate memory for new array of thresholds */
-       new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
-       if (!new) {
-               ret = -ENOMEM;
-               goto unlock;
-       }
-       new->size = size;
-
-       /* Copy thresholds (if any) to new array */
-       if (thresholds->primary)
-               memcpy(new->entries, thresholds->primary->entries,
-                      flex_array_size(new, entries, size - 1));
-
-       /* Add new threshold */
-       new->entries[size - 1].eventfd = eventfd;
-       new->entries[size - 1].threshold = threshold;
-
-       /* Sort thresholds. Registering of new threshold isn't time-critical */
-       sort(new->entries, size, sizeof(*new->entries),
-                       compare_thresholds, NULL);
-
-       /* Find current threshold */
-       new->current_threshold = -1;
-       for (i = 0; i < size; i++) {
-               if (new->entries[i].threshold <= usage) {
-                       /*
-                        * new->current_threshold will not be used until
-                        * rcu_assign_pointer(), so it's safe to increment
-                        * it here.
-                        */
-                       ++new->current_threshold;
-               } else
-                       break;
-       }
-
-       /* Free old spare buffer and save old primary buffer as spare */
-       kfree(thresholds->spare);
-       thresholds->spare = thresholds->primary;
-
-       rcu_assign_pointer(thresholds->primary, new);
-
-       /* To be sure that nobody uses thresholds */
-       synchronize_rcu();
-
-unlock:
-       mutex_unlock(&memcg->thresholds_lock);
-
-       return ret;
-}
-
-static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
-       struct eventfd_ctx *eventfd, const char *args)
-{
-       return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
-}
-
-static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
-       struct eventfd_ctx *eventfd, const char *args)
-{
-       return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
-}
-
-static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
-       struct eventfd_ctx *eventfd, enum res_type type)
-{
-       struct mem_cgroup_thresholds *thresholds;
-       struct mem_cgroup_threshold_ary *new;
-       unsigned long usage;
-       int i, j, size, entries;
-
-       mutex_lock(&memcg->thresholds_lock);
-
-       if (type == _MEM) {
-               thresholds = &memcg->thresholds;
-               usage = mem_cgroup_usage(memcg, false);
-       } else if (type == _MEMSWAP) {
-               thresholds = &memcg->memsw_thresholds;
-               usage = mem_cgroup_usage(memcg, true);
-       } else
-               BUG();
-
-       if (!thresholds->primary)
-               goto unlock;
-
-       /* Check if a threshold crossed before removing */
-       __mem_cgroup_threshold(memcg, type == _MEMSWAP);
-
-       /* Calculate new number of threshold */
-       size = entries = 0;
-       for (i = 0; i < thresholds->primary->size; i++) {
-               if (thresholds->primary->entries[i].eventfd != eventfd)
-                       size++;
-               else
-                       entries++;
-       }
-
-       new = thresholds->spare;
-
-       /* If no items related to eventfd have been cleared, nothing to do */
-       if (!entries)
-               goto unlock;
-
-       /* Set thresholds array to NULL if we don't have thresholds */
-       if (!size) {
-               kfree(new);
-               new = NULL;
-               goto swap_buffers;
-       }
-
-       new->size = size;
-
-       /* Copy thresholds and find current threshold */
-       new->current_threshold = -1;
-       for (i = 0, j = 0; i < thresholds->primary->size; i++) {
-               if (thresholds->primary->entries[i].eventfd == eventfd)
-                       continue;
-
-               new->entries[j] = thresholds->primary->entries[i];
-               if (new->entries[j].threshold <= usage) {
-                       /*
-                        * new->current_threshold will not be used
-                        * until rcu_assign_pointer(), so it's safe to increment
-                        * it here.
-                        */
-                       ++new->current_threshold;
-               }
-               j++;
-       }
-
-swap_buffers:
-       /* Swap primary and spare array */
-       thresholds->spare = thresholds->primary;
-
-       rcu_assign_pointer(thresholds->primary, new);
-
-       /* To be sure that nobody uses thresholds */
-       synchronize_rcu();
-
-       /* If all events are unregistered, free the spare array */
-       if (!new) {
-               kfree(thresholds->spare);
-               thresholds->spare = NULL;
-       }
-unlock:
-       mutex_unlock(&memcg->thresholds_lock);
-}
-
-static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
-       struct eventfd_ctx *eventfd)
-{
-       return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
-}
-
-static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
-       struct eventfd_ctx *eventfd)
-{
-       return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
-}
-
-static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
-       struct eventfd_ctx *eventfd, const char *args)
-{
-       struct mem_cgroup_eventfd_list *event;
-
-       event = kmalloc(sizeof(*event), GFP_KERNEL);
-       if (!event)
-               return -ENOMEM;
-
-       spin_lock(&memcg_oom_lock);
-
-       event->eventfd = eventfd;
-       list_add(&event->list, &memcg->oom_notify);
-
-       /* already in OOM ? */
-       if (memcg->under_oom)
-               eventfd_signal(eventfd);
-       spin_unlock(&memcg_oom_lock);
-
-       return 0;
-}
-
-static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
-       struct eventfd_ctx *eventfd)
-{
-       struct mem_cgroup_eventfd_list *ev, *tmp;
-
-       spin_lock(&memcg_oom_lock);
-
-       list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
-               if (ev->eventfd == eventfd) {
-                       list_del(&ev->list);
-                       kfree(ev);
-               }
-       }
-
-       spin_unlock(&memcg_oom_lock);
-}
-
 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
 {
        struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
@@ -4609,243 +4180,6 @@ static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
 
 #endif /* CONFIG_CGROUP_WRITEBACK */
 
-/*
- * DO NOT USE IN NEW FILES.
- *
- * "cgroup.event_control" implementation.
- *
- * This is way over-engineered.  It tries to support fully configurable
- * events for each user.  Such level of flexibility is completely
- * unnecessary especially in the light of the planned unified hierarchy.
- *
- * Please deprecate this and replace with something simpler if at all
- * possible.
- */
-
-/*
- * Unregister event and free resources.
- *
- * Gets called from workqueue.
- */
-static void memcg_event_remove(struct work_struct *work)
-{
-       struct mem_cgroup_event *event =
-               container_of(work, struct mem_cgroup_event, remove);
-       struct mem_cgroup *memcg = event->memcg;
-
-       remove_wait_queue(event->wqh, &event->wait);
-
-       event->unregister_event(memcg, event->eventfd);
-
-       /* Notify userspace the event is going away. */
-       eventfd_signal(event->eventfd);
-
-       eventfd_ctx_put(event->eventfd);
-       kfree(event);
-       css_put(&memcg->css);
-}
-
-/*
- * Gets called on EPOLLHUP on eventfd when user closes it.
- *
- * Called with wqh->lock held and interrupts disabled.
- */
-static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
-                           int sync, void *key)
-{
-       struct mem_cgroup_event *event =
-               container_of(wait, struct mem_cgroup_event, wait);
-       struct mem_cgroup *memcg = event->memcg;
-       __poll_t flags = key_to_poll(key);
-
-       if (flags & EPOLLHUP) {
-               /*
-                * If the event has been detached at cgroup removal, we
-                * can simply return knowing the other side will cleanup
-                * for us.
-                *
-                * We can't race against event freeing since the other
-                * side will require wqh->lock via remove_wait_queue(),
-                * which we hold.
-                */
-               spin_lock(&memcg->event_list_lock);
-               if (!list_empty(&event->list)) {
-                       list_del_init(&event->list);
-                       /*
-                        * We are in atomic context, but cgroup_event_remove()
-                        * may sleep, so we have to call it in workqueue.
-                        */
-                       schedule_work(&event->remove);
-               }
-               spin_unlock(&memcg->event_list_lock);
-       }
-
-       return 0;
-}
-
-static void memcg_event_ptable_queue_proc(struct file *file,
-               wait_queue_head_t *wqh, poll_table *pt)
-{
-       struct mem_cgroup_event *event =
-               container_of(pt, struct mem_cgroup_event, pt);
-
-       event->wqh = wqh;
-       add_wait_queue(wqh, &event->wait);
-}
-
-/*
- * DO NOT USE IN NEW FILES.
- *
- * Parse input and register new cgroup event handler.
- *
- * Input must be in format '<event_fd> <control_fd> <args>'.
- * Interpretation of args is defined by control file implementation.
- */
-static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
-                                        char *buf, size_t nbytes, loff_t off)
-{
-       struct cgroup_subsys_state *css = of_css(of);
-       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
-       struct mem_cgroup_event *event;
-       struct cgroup_subsys_state *cfile_css;
-       unsigned int efd, cfd;
-       struct fd efile;
-       struct fd cfile;
-       struct dentry *cdentry;
-       const char *name;
-       char *endp;
-       int ret;
-
-       if (IS_ENABLED(CONFIG_PREEMPT_RT))
-               return -EOPNOTSUPP;
-
-       buf = strstrip(buf);
-
-       efd = simple_strtoul(buf, &endp, 10);
-       if (*endp != ' ')
-               return -EINVAL;
-       buf = endp + 1;
-
-       cfd = simple_strtoul(buf, &endp, 10);
-       if ((*endp != ' ') && (*endp != '\0'))
-               return -EINVAL;
-       buf = endp + 1;
-
-       event = kzalloc(sizeof(*event), GFP_KERNEL);
-       if (!event)
-               return -ENOMEM;
-
-       event->memcg = memcg;
-       INIT_LIST_HEAD(&event->list);
-       init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
-       init_waitqueue_func_entry(&event->wait, memcg_event_wake);
-       INIT_WORK(&event->remove, memcg_event_remove);
-
-       efile = fdget(efd);
-       if (!efile.file) {
-               ret = -EBADF;
-               goto out_kfree;
-       }
-
-       event->eventfd = eventfd_ctx_fileget(efile.file);
-       if (IS_ERR(event->eventfd)) {
-               ret = PTR_ERR(event->eventfd);
-               goto out_put_efile;
-       }
-
-       cfile = fdget(cfd);
-       if (!cfile.file) {
-               ret = -EBADF;
-               goto out_put_eventfd;
-       }
-
-       /* the process need read permission on control file */
-       /* AV: shouldn't we check that it's been opened for read instead? */
-       ret = file_permission(cfile.file, MAY_READ);
-       if (ret < 0)
-               goto out_put_cfile;
-
-       /*
-        * The control file must be a regular cgroup1 file. As a regular cgroup
-        * file can't be renamed, it's safe to access its name afterwards.
-        */
-       cdentry = cfile.file->f_path.dentry;
-       if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
-               ret = -EINVAL;
-               goto out_put_cfile;
-       }
-
-       /*
-        * Determine the event callbacks and set them in @event.  This used
-        * to be done via struct cftype but cgroup core no longer knows
-        * about these events.  The following is crude but the whole thing
-        * is for compatibility anyway.
-        *
-        * DO NOT ADD NEW FILES.
-        */
-       name = cdentry->d_name.name;
-
-       if (!strcmp(name, "memory.usage_in_bytes")) {
-               event->register_event = mem_cgroup_usage_register_event;
-               event->unregister_event = mem_cgroup_usage_unregister_event;
-       } else if (!strcmp(name, "memory.oom_control")) {
-               event->register_event = mem_cgroup_oom_register_event;
-               event->unregister_event = mem_cgroup_oom_unregister_event;
-       } else if (!strcmp(name, "memory.pressure_level")) {
-               event->register_event = vmpressure_register_event;
-               event->unregister_event = vmpressure_unregister_event;
-       } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
-               event->register_event = memsw_cgroup_usage_register_event;
-               event->unregister_event = memsw_cgroup_usage_unregister_event;
-       } else {
-               ret = -EINVAL;
-               goto out_put_cfile;
-       }
-
-       /*
-        * Verify @cfile should belong to @css.  Also, remaining events are
-        * automatically removed on cgroup destruction but the removal is
-        * asynchronous, so take an extra ref on @css.
-        */
-       cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
-                                              &memory_cgrp_subsys);
-       ret = -EINVAL;
-       if (IS_ERR(cfile_css))
-               goto out_put_cfile;
-       if (cfile_css != css) {
-               css_put(cfile_css);
-               goto out_put_cfile;
-       }
-
-       ret = event->register_event(memcg, event->eventfd, buf);
-       if (ret)
-               goto out_put_css;
-
-       vfs_poll(efile.file, &event->pt);
-
-       spin_lock_irq(&memcg->event_list_lock);
-       list_add(&event->list, &memcg->event_list);
-       spin_unlock_irq(&memcg->event_list_lock);
-
-       fdput(cfile);
-       fdput(efile);
-
-       return nbytes;
-
-out_put_css:
-       css_put(css);
-out_put_cfile:
-       fdput(cfile);
-out_put_eventfd:
-       eventfd_ctx_put(event->eventfd);
-out_put_efile:
-       fdput(efile);
-out_kfree:
-       kfree(event);
-
-       return ret;
-}
-
 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_SLUB_DEBUG)
 static int mem_cgroup_slab_show(struct seq_file *m, void *p)
 {
@@ -5312,19 +4646,8 @@ remove_id:
 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 {
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
-       struct mem_cgroup_event *event, *tmp;
 
-       /*
-        * Unregister events and notify userspace.
-        * Notify userspace about cgroup removing only after rmdir of cgroup
-        * directory to avoid race between userspace and kernelspace.
-        */
-       spin_lock_irq(&memcg->event_list_lock);
-       list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
-               list_del_init(&event->list);
-               schedule_work(&event->remove);
-       }
-       spin_unlock_irq(&memcg->event_list_lock);
+       memcg1_css_offline(memcg);
 
        page_counter_set_min(&memcg->memory, 0);
        page_counter_set_low(&memcg->memory, 0);