When multiple memcgs are available, it is possible to make better choices
based on generations and tiers and therefore improve the overall
performance under global memory pressure. This patch adds a rudimentary
optimization to select memcgs that can drop single-use unmapped clean
pages first. Doing so reduces the chance of going into the aging path or
swapping. These two operations can be costly.
A typical example that benefits from this optimization is a server running
mixed types of workloads, e.g., heavy anon workload in one memcg and heavy
buffered I/O workload in the other.
Though this optimization can be applied to both kswapd and direct reclaim,
it is only added to kswapd to keep the patchset manageable. Later
improvements will cover the direct reclaim path.
Server benchmark results:
Mixed workloads:
fio (buffered I/O): +[1, 3]%
IOPS BW
patch1-8: 2154k 8415MiB/s
patch1-9: 2205k 8613MiB/s
memcached (anon): +[132, 136]%
Ops/sec KB/sec
patch1-8: 819618.49 31838.48
patch1-9:
1916516.06 74447.92
Mixed workloads:
fio (buffered I/O): +[59, 61]%
IOPS BW
5.18-rc1: 1378k 5385MiB/s
patch1-9: 2205k 8613MiB/s
memcached (anon): +[229, 233]%
Ops/sec KB/sec
5.18-rc1: 578946.00 22489.44
patch1-9:
1916516.06 74447.92
Configurations:
(changes since patch 6)
cat mixed.sh
modprobe brd rd_nr=2 rd_size=
56623104
swapoff -a
mkswap /dev/ram0
swapon /dev/ram0
mkfs.ext4 /dev/ram1
mount -t ext4 /dev/ram1 /mnt
memtier_benchmark -S /var/run/memcached/memcached.sock \
-P memcache_binary -n allkeys --key-minimum=1 \
--key-maximum=
50000000 --key-pattern=P:P -c 1 -t 36 \
--ratio 1:0 --pipeline 8 -d 2000
fio -name=mglru --numjobs=36 --directory=/mnt --size=1408m \
--buffered=1 --ioengine=io_uring --iodepth=128 \
--iodepth_batch_submit=32 --iodepth_batch_complete=32 \
--rw=randread --random_distribution=random --norandommap \
--time_based --ramp_time=10m --runtime=90m --group_reporting &
pid=$!
sleep 200
memtier_benchmark -S /var/run/memcached/memcached.sock \
-P memcache_binary -n allkeys --key-minimum=1 \
--key-maximum=
50000000 --key-pattern=R:R -c 1 -t 36 \
--ratio 0:1 --pipeline 8 --randomize --distinct-client-seed
kill -INT $pid
wait
Client benchmark results:
no change (CONFIG_MEMCG=n)
Link: https://lkml.kernel.org/r/20220407031525.2368067-10-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Will Deacon <will@kernel.org>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
/* Always discard instead of demoting to lower tier memory */
unsigned int no_demotion:1;
+#ifdef CONFIG_LRU_GEN
+ /* help make better choices when multiple memcgs are available */
+ unsigned int memcgs_need_aging:1;
+ unsigned int memcgs_need_swapping:1;
+ unsigned int memcgs_avoid_swapping:1;
+#endif
+
/* Allocation order */
s8 order;
VM_BUG_ON(!current_is_kswapd());
+ /*
+ * To reduce the chance of going into the aging path or swapping, which
+ * can be costly, optimistically skip them unless their corresponding
+ * flags were cleared in the eviction path. This improves the overall
+ * performance when multiple memcgs are available.
+ */
+ if (!sc->memcgs_need_aging) {
+ sc->memcgs_need_aging = true;
+ sc->memcgs_avoid_swapping = !sc->memcgs_need_swapping;
+ sc->memcgs_need_swapping = true;
+ return;
+ }
+
+ sc->memcgs_need_swapping = true;
+ sc->memcgs_avoid_swapping = true;
+
current->reclaim_state->mm_walk = &pgdat->mm_walk;
memcg = mem_cgroup_iter(NULL, NULL, NULL);
return scanned;
}
-static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
+static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
+ bool *swapped)
{
int type;
int scanned;
sc->nr_reclaimed += reclaimed;
+ if (type == LRU_GEN_ANON && swapped)
+ *swapped = true;
+
return scanned;
}
if (!nr_to_scan)
return 0;
- if (!need_aging)
+ if (!need_aging) {
+ sc->memcgs_need_aging = false;
return nr_to_scan;
+ }
/* leave the work to lru_gen_age_node() */
if (current_is_kswapd())
{
struct blk_plug plug;
long scanned = 0;
+ bool swapped = false;
+ unsigned long reclaimed = sc->nr_reclaimed;
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
lru_add_drain();
if (!nr_to_scan)
break;
- delta = evict_folios(lruvec, sc, swappiness);
+ delta = evict_folios(lruvec, sc, swappiness, &swapped);
if (!delta)
break;
+ if (sc->memcgs_avoid_swapping && swappiness < 200 && swapped)
+ break;
+
scanned += delta;
- if (scanned >= nr_to_scan)
+ if (scanned >= nr_to_scan) {
+ if (!swapped && sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH)
+ sc->memcgs_need_swapping = false;
break;
+ }
cond_resched();
}