all the existing limitations and potential future extensions.
 
   memory.peak
-       A read-only single value file which exists on non-root
-       cgroups.
+       A read-write single value file which exists on non-root cgroups.
+
+       The max memory usage recorded for the cgroup and its
+       descendants since either the creation of the cgroup or the
+       most recent reset for that FD.
 
-       The max memory usage recorded for the cgroup and its
-       descendants since the creation of the cgroup.
+       A write of any non-empty string to this file resets it to the
+       current memory usage for subsequent reads through the same
+       file descriptor.
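+
+       For example, a monitoring process can reset the peak and then
+       re-read it through the same file descriptor. An illustrative
+       fragment (the cgroup path is hypothetical and error handling
+       is omitted)::
+
+           char buf[64];
+           /* hypothetical cgroup path */
+           int fd = open("/sys/fs/cgroup/test/memory.peak", O_RDWR);
+
+           write(fd, "reset", 5);      /* any non-empty string resets */
+           lseek(fd, 0, SEEK_SET);     /* rewind before re-reading */
+           read(fd, buf, sizeof(buf)); /* peak since the reset above */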
 
   memory.oom.group
        A read-write single value file which exists on non-root
        Healthy workloads are not expected to reach this limit.
 
   memory.swap.peak
-       A read-only single value file which exists on non-root
-       cgroups.
+       A read-write single value file which exists on non-root cgroups.
+
+       The max swap usage recorded for the cgroup and its
+       descendants since either the creation of the cgroup or the
+       most recent reset for that FD.
 
-       The max swap usage recorded for the cgroup and its
-       descendants since the creation of the cgroup.
+       A write of any non-empty string to this file resets it to the
+       current swap usage for subsequent reads through the same
+       file descriptor.
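+
+       As with memory.peak, a file descriptor that is never written
+       to keeps reporting the lifetime peak. An illustrative fragment
+       (the cgroup path is hypothetical)::
+
+           char buf[64];
+           /* hypothetical cgroup path */
+           int fd = open("/sys/fs/cgroup/test/memory.swap.peak", O_RDONLY);
+
+           read(fd, buf, sizeof(buf)); /* peak since cgroup creation */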
 
   memory.swap.max
        A read-write single value file which exists on non-root
 
  * Copyright (C) 2020 Alibaba, Inc, Alex Shi
  */
 
+#include <linux/cgroup-defs.h>
 #include <linux/page_counter.h>
 #include <linux/memcontrol.h>
 #include <linux/cgroup.h>
 #include <linux/rcupdate.h>
 #include <linux/limits.h>
 #include <linux/export.h>
+#include <linux/list.h>
 #include <linux/mutex.h>
 #include <linux/rbtree.h>
 #include <linux/slab.h>
 
        INIT_WORK(&memcg->high_work, high_work_func);
        vmpressure_init(&memcg->vmpressure);
+       INIT_LIST_HEAD(&memcg->memory_peaks);
+       INIT_LIST_HEAD(&memcg->swap_peaks);
+       spin_lock_init(&memcg->peaks_lock);
        memcg->socket_pressure = jiffies;
        memcg1_memcg_init(memcg);
        memcg->kmemcg_id = -1;
        return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
 }
 
-static u64 memory_peak_read(struct cgroup_subsys_state *css,
-                           struct cftype *cft)
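+/* Sentinel for an fd-local peak that has never been written/reset */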
+#define OFP_PEAK_UNSET (-1UL)
+
+static int peak_show(struct seq_file *sf, void *v, struct page_counter *pc)
 {
-       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+       struct cgroup_of_peak *ofp = of_peak(sf->private);
+       u64 fd_peak = READ_ONCE(ofp->value), peak;
+
+       /* User wants global or local peak? */
+       if (fd_peak == OFP_PEAK_UNSET)
+               peak = READ_ONCE(pc->watermark);
+       else
+               peak = max(fd_peak, READ_ONCE(pc->local_watermark));
+
+       seq_printf(sf, "%llu\n", peak * PAGE_SIZE);
+       return 0;
+}
+
+static int memory_peak_show(struct seq_file *sf, void *v)
+{
+       struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
+
+       return peak_show(sf, v, &memcg->memory);
+}
+
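+/*
+ * A fresh open has not written yet: mark its fd-local peak unset so
+ * reads fall back to the global watermark until the first reset.
+ */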
+static int peak_open(struct kernfs_open_file *of)
+{
+       struct cgroup_of_peak *ofp = of_peak(of);
+
+       ofp->value = OFP_PEAK_UNSET;
+       return 0;
+}
+
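+/*
+ * On the last close, drop this fd from the memcg's watchers list if
+ * a write ever registered it there.
+ */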
+static void peak_release(struct kernfs_open_file *of)
+{
+       struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+       struct cgroup_of_peak *ofp = of_peak(of);
+
+       if (ofp->value == OFP_PEAK_UNSET) {
+               /* fast path (no writes on this fd) */
+               return;
+       }
+       spin_lock(&memcg->peaks_lock);
+       list_del(&ofp->list);
+       spin_unlock(&memcg->peaks_lock);
+}
+
+static ssize_t peak_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
+                         loff_t off, struct page_counter *pc,
+                         struct list_head *watchers)
+{
+       unsigned long usage;
+       struct cgroup_of_peak *peer_ctx;
+       struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+       struct cgroup_of_peak *ofp = of_peak(of);
+
+       spin_lock(&memcg->peaks_lock);
+
+       usage = page_counter_read(pc);
+       WRITE_ONCE(pc->local_watermark, usage);
+
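+       /*
+        * Raise every registered watcher's recorded peak to the usage at
+        * this reset, so that lowering local_watermark (above) cannot drop
+        * another fd's reported peak (the max of its fd-local value and
+        * local_watermark) below the usage at the moment of this reset.
+        */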
+       list_for_each_entry(peer_ctx, watchers, list)
+               if (usage > peer_ctx->value)
+                       WRITE_ONCE(peer_ctx->value, usage);
+
+       /* initial write, register watcher */
+       if (ofp->value == OFP_PEAK_UNSET)
+               list_add(&ofp->list, watchers);
+
+       WRITE_ONCE(ofp->value, usage);
+       spin_unlock(&memcg->peaks_lock);
+
+       return nbytes;
+}
+
+static ssize_t memory_peak_write(struct kernfs_open_file *of, char *buf,
+                                size_t nbytes, loff_t off)
+{
+       struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
 
-       return (u64)memcg->memory.watermark * PAGE_SIZE;
+       return peak_write(of, buf, nbytes, off, &memcg->memory,
+                         &memcg->memory_peaks);
 }
 
+#undef OFP_PEAK_UNSET
+
 static int memory_min_show(struct seq_file *m, void *v)
 {
        return seq_puts_memcg_tunable(m,
        {
                .name = "peak",
                .flags = CFTYPE_NOT_ON_ROOT,
-               .read_u64 = memory_peak_read,
+               .open = peak_open,
+               .release = peak_release,
+               .seq_show = memory_peak_show,
+               .write = memory_peak_write,
        },
        {
                .name = "min",
        return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
 }
 
-static u64 swap_peak_read(struct cgroup_subsys_state *css,
-                         struct cftype *cft)
+static int swap_peak_show(struct seq_file *sf, void *v)
 {
-       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+       struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
+
+       return peak_show(sf, v, &memcg->swap);
+}
+
+static ssize_t swap_peak_write(struct kernfs_open_file *of, char *buf,
+                              size_t nbytes, loff_t off)
+{
+       struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
 
-       return (u64)memcg->swap.watermark * PAGE_SIZE;
+       return peak_write(of, buf, nbytes, off, &memcg->swap,
+                         &memcg->swap_peaks);
 }
 
 static int swap_high_show(struct seq_file *m, void *v)
        {
                .name = "swap.peak",
                .flags = CFTYPE_NOT_ON_ROOT,
-               .read_u64 = swap_peak_read,
+               .open = peak_open,
+               .release = peak_release,
+               .seq_show = swap_peak_show,
+               .write = swap_peak_write,
        },
        {
                .name = "swap.events",
 
                /*
                 * This is indeed racy, but we can live with some
                 * inaccuracy in the watermark.
+                *
+                * Notably, we have two watermarks to allow for both a globally
+                * visible peak and one that can be reset at a smaller scope.
+                *
+                * Since we reset both watermarks when the global reset occurs,
+                * we can guarantee that watermark >= local_watermark, so we
+                * don't need to do both comparisons every time.
+                *
+                * On systems with branch predictors, the inner condition should
+                * be almost free.
                 */
-               if (new > READ_ONCE(c->watermark))
-                       WRITE_ONCE(c->watermark, new);
+               if (new > READ_ONCE(c->local_watermark)) {
+                       WRITE_ONCE(c->local_watermark, new);
+                       if (new > READ_ONCE(c->watermark))
+                               WRITE_ONCE(c->watermark, new);
+               }
        }
 }
 
                if (protection)
                        propagate_protected_usage(c, new);
 
-               /*
-                * Just like with failcnt, we can live with some
-                * inaccuracy in the watermark.
-                */
-               if (new > READ_ONCE(c->watermark))
-                       WRITE_ONCE(c->watermark, new);
+               /* see the comment in page_counter_charge() */
+               if (new > READ_ONCE(c->local_watermark)) {
+                       WRITE_ONCE(c->local_watermark, new);
+                       if (new > READ_ONCE(c->watermark))
+                               WRITE_ONCE(c->watermark, new);
+               }
        }
        return true;