From f7f0d88b7d6da29d2b073740aa130685f0e18609 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Wed, 5 Mar 2025 14:27:29 -0800
Subject: [PATCH 01/16] mm/damon/core: expose damos_filter_for_ops() to DAMON
 kernel API callers

damos_filter_for_ops() can be useful for avoiding installation of the
wrong type of filter in the wrong place.  Expose it to DAMON kernel API
callers.

Link: https://lkml.kernel.org/r/20250305222733.59089-5-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 1 +
 mm/damon/core.c       | 9 ++++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 52559475dbe7..eed008b64a23 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -894,6 +894,7 @@ void damon_update_region_access_rate(struct damon_region *r, bool accessed,
 struct damos_filter *damos_new_filter(enum damos_filter_type type,
 		bool matching, bool allow);
 void damos_add_filter(struct damos *s, struct damos_filter *f);
+bool damos_filter_for_ops(enum damos_filter_type type);
 void damos_destroy_filter(struct damos_filter *f);
 
 struct damos_quota_goal *damos_new_quota_goal(
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 511c464adcc5..ebbb22840435 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -281,7 +281,14 @@ struct damos_filter *damos_new_filter(enum damos_filter_type type,
 	return filter;
 }
 
-static bool damos_filter_for_ops(enum damos_filter_type type)
+/**
+ * damos_filter_for_ops() - Return if the filter is an ops-handled one.
+ * @type:	type of the filter.
+ *
+ * Return: true if the filter of @type needs to be handled by ops layer, false
+ * otherwise.
+ */
+bool damos_filter_for_ops(enum damos_filter_type type)
 {
 	switch (type) {
 	case DAMOS_FILTER_TYPE_ADDR:
-- 
2.51.0
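
As a sketch of what this export enables, a DAMON kernel API caller could
reject misplaced filters before linking them.  The install_core_filter()
wrapper below and its core-only policy are hypothetical; damos_new_filter(),
damos_add_filter() and damos_filter_for_ops() are the DAMON API shown above.

	#include <linux/damon.h>
	#include <linux/errno.h>

	/*
	 * Hypothetical helper: install a filter on @scheme only if its
	 * type is handled by the DAMON core layer.
	 */
	static int install_core_filter(struct damos *scheme,
			enum damos_filter_type type, bool matching, bool allow)
	{
		struct damos_filter *filter;

		/* ops-handled types do not belong in a core-layer filter list */
		if (damos_filter_for_ops(type))
			return -EINVAL;

		filter = damos_new_filter(type, matching, allow);
		if (!filter)
			return -ENOMEM;
		damos_add_filter(scheme, filter);
		return 0;
	}
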
From 9f643a9854df682548bbcdf68cbd89e74cbfd6dc Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Wed, 5 Mar 2025 14:27:30 -0800
Subject: [PATCH 02/16] mm/damon/sysfs-schemes: record filters of which layer
 should be added to the given filters directory

Unlike what their names and assumed purposes suggest, the
{core,ops}_filters DAMOS sysfs directories allow installing any type of
filter.  As a first step toward preventing such wrong installations,
record in the DAMOS sysfs internal data structures which layer should
handle the filters that are installed to a given filters directory.

Link: https://lkml.kernel.org/r/20250305222733.59089-6-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/damon/sysfs-schemes.c | 46 +++++++++++++++++++++++++++++++---------
 1 file changed, 36 insertions(+), 10 deletions(-)

diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index 65c6091cd879..e4e36c34ec0e 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -309,8 +309,18 @@ static const struct kobj_type damon_sysfs_stats_ktype = {
  * filter directory
  */
 
+/*
+ * enum damos_sysfs_filter_handle_layer - Layers handling filters of a dir.
+ */
+enum damos_sysfs_filter_handle_layer {
+	DAMOS_SYSFS_FILTER_HANDLE_LAYER_CORE,
+	DAMOS_SYSFS_FILTER_HANDLE_LAYER_OPS,
+	DAMOS_SYSFS_FILTER_HANDLE_LAYER_BOTH,
+};
+
 struct damon_sysfs_scheme_filter {
 	struct kobject kobj;
+	enum damos_sysfs_filter_handle_layer handle_layer;
 	enum damos_filter_type type;
 	bool matching;
 	bool allow;
@@ -320,9 +330,15 @@ struct damon_sysfs_scheme_filter {
 	int target_idx;
 };
 
-static struct damon_sysfs_scheme_filter *damon_sysfs_scheme_filter_alloc(void)
+static struct damon_sysfs_scheme_filter *damon_sysfs_scheme_filter_alloc(
+		enum damos_sysfs_filter_handle_layer layer)
 {
-	return kzalloc(sizeof(struct damon_sysfs_scheme_filter), GFP_KERNEL);
+	struct damon_sysfs_scheme_filter *filter;
+
+	filter = kzalloc(sizeof(struct damon_sysfs_scheme_filter), GFP_KERNEL);
+	if (filter)
+		filter->handle_layer = layer;
+	return filter;
 }
 
 /* Should match with enum damos_filter_type */
@@ -595,14 +611,20 @@ static const struct kobj_type damon_sysfs_scheme_filter_ktype = {
 
 struct damon_sysfs_scheme_filters {
 	struct kobject kobj;
+	enum damos_sysfs_filter_handle_layer handle_layer;
 	struct damon_sysfs_scheme_filter **filters_arr;
 	int nr;
 };
 
 static struct damon_sysfs_scheme_filters *
-damon_sysfs_scheme_filters_alloc(void)
+damon_sysfs_scheme_filters_alloc(enum damos_sysfs_filter_handle_layer layer)
 {
-	return kzalloc(sizeof(struct damon_sysfs_scheme_filters), GFP_KERNEL);
+	struct damon_sysfs_scheme_filters *filters;
+
+	filters = kzalloc(sizeof(struct damon_sysfs_scheme_filters), GFP_KERNEL);
+	if (filters)
+		filters->handle_layer = layer;
+	return filters;
 }
 
 static void damon_sysfs_scheme_filters_rm_dirs(
@@ -635,7 +657,8 @@ static int damon_sysfs_scheme_filters_add_dirs(
 	filters->filters_arr = filters_arr;
 
 	for (i = 0; i < nr_filters; i++) {
-		filter = damon_sysfs_scheme_filter_alloc();
+		filter = damon_sysfs_scheme_filter_alloc(
+				filters->handle_layer);
 		if (!filter) {
 			damon_sysfs_scheme_filters_rm_dirs(filters);
 			return -ENOMEM;
@@ -1607,11 +1630,11 @@ static int damon_sysfs_scheme_set_watermarks(struct damon_sysfs_scheme *scheme)
 }
 
 static int damon_sysfs_scheme_set_filters(struct damon_sysfs_scheme *scheme,
-		const char *name,
+		enum damos_sysfs_filter_handle_layer layer, const char *name,
 		struct damon_sysfs_scheme_filters **filters_ptr)
 {
 	struct damon_sysfs_scheme_filters *filters =
-		damon_sysfs_scheme_filters_alloc();
+		damon_sysfs_scheme_filters_alloc(layer);
 	int err;
 
 	if (!filters)
@@ -1630,15 +1653,18 @@ static int damos_sysfs_set_filter_dirs(struct damon_sysfs_scheme *scheme)
 {
 	int err;
 
-	err = damon_sysfs_scheme_set_filters(scheme, "filters",
+	err = damon_sysfs_scheme_set_filters(scheme,
+			DAMOS_SYSFS_FILTER_HANDLE_LAYER_BOTH, "filters",
 			&scheme->filters);
 	if (err)
 		return err;
-	err = damon_sysfs_scheme_set_filters(scheme, "core_filters",
+	err = damon_sysfs_scheme_set_filters(scheme,
+			DAMOS_SYSFS_FILTER_HANDLE_LAYER_CORE, "core_filters",
 			&scheme->core_filters);
 	if (err)
 		goto put_filters_out;
-	err = damon_sysfs_scheme_set_filters(scheme, "ops_filters",
+	err = damon_sysfs_scheme_set_filters(scheme,
+			DAMOS_SYSFS_FILTER_HANDLE_LAYER_OPS, "ops_filters",
 			&scheme->ops_filters);
 	if (err)
 		goto put_core_filters_out;
-- 
2.51.0
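
To see the intended effect of recording the handling layer, consider a
hypothetical userspace check against the interface that the next patch
completes.  The sysfs path indexes and the EINVAL outcome are assumptions
based on this series; "anon" is an operations set layer-handled filter type,
so writing it under core_filters is expected to be rejected.

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		/* hypothetical kdamond/context/scheme indexes */
		const char *path = "/sys/kernel/mm/damon/admin/kdamonds/0/"
				   "contexts/0/schemes/0/core_filters/0/type";
		int fd = open(path, O_WRONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		/* "anon" is handled by the operations set layer, not core */
		if (write(fd, "anon", strlen("anon")) < 0)
			perror("write");	/* expected: Invalid argument */
		close(fd);
		return 0;
	}
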
From ae8fd5b6666b0d99f485748b2b2259de1a7458dc Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Wed, 5 Mar 2025 14:27:31 -0800
Subject: [PATCH 03/16] mm/damon/sysfs-schemes: return error for attempts to
 install filters on the wrong sysfs directory

Return an error if the user tries to install a DAMOS filter on a DAMOS
filters sysfs directory that is assumed to be used for filters handled
by a DAMON layer different from the one that handles the filter being
installed.

Link: https://lkml.kernel.org/r/20250305222733.59089-7-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/damon/sysfs-schemes.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index e4e36c34ec0e..1895d2d2c295 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -362,6 +362,23 @@ static ssize_t type_show(struct kobject *kobj,
 			damon_sysfs_scheme_filter_type_strs[filter->type]);
 }
 
+static bool damos_sysfs_scheme_filter_valid_type(
+		enum damos_sysfs_filter_handle_layer layer,
+		enum damos_filter_type type)
+{
+	switch (layer) {
+	case DAMOS_SYSFS_FILTER_HANDLE_LAYER_BOTH:
+		return true;
+	case DAMOS_SYSFS_FILTER_HANDLE_LAYER_CORE:
+		return !damos_filter_for_ops(type);
+	case DAMOS_SYSFS_FILTER_HANDLE_LAYER_OPS:
+		return damos_filter_for_ops(type);
+	default:
+		break;
+	}
+	return false;
+}
+
 static ssize_t type_store(struct kobject *kobj,
 		struct kobj_attribute *attr, const char *buf, size_t count)
 {
@@ -373,6 +390,9 @@ static ssize_t type_store(struct kobject *kobj,
 	for (type = 0; type < NR_DAMOS_FILTER_TYPES; type++) {
 		if (sysfs_streq(buf, damon_sysfs_scheme_filter_type_strs[
 					type])) {
+			if (!damos_sysfs_scheme_filter_valid_type(
+						filter->handle_layer, type))
+				break;
 			filter->type = type;
 			ret = count;
 			break;
-- 
2.51.0

From 899e4c14afa640b8f7374e162f2336b67f07123d Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Wed, 5 Mar 2025 14:27:32 -0800
Subject: [PATCH 04/16] Docs/ABI/damon: document {core,ops}_filters directories

Document the new DAMOS filters sysfs directories on the ABI document.

Link: https://lkml.kernel.org/r/20250305222733.59089-8-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/ABI/testing/sysfs-kernel-mm-damon | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon
index 76da77d7f7b6..293197f180ad 100644
--- a/Documentation/ABI/testing/sysfs-kernel-mm-damon
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon
@@ -409,6 +409,22 @@ Description:	Writing 'Y' or 'N' to this file sets whether to allow or reject
 		applying the scheme's action to the memory that satisfies the
 		'type' and the 'matching' of the directory.
 
+What:		/sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/core_filters
+Date:		Feb 2025
+Contact:	SeongJae Park <sj@kernel.org>
+Description:	Directory for DAMON core layer-handled DAMOS filters.  Files
+		under this directory work the same as those of the
+		/sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters
+		directory.
+
+What:		/sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/ops_filters
+Date:		Feb 2025
+Contact:	SeongJae Park <sj@kernel.org>
+Description:	Directory for DAMON operations set layer-handled DAMOS filters.
+		Files under this directory work the same as those of the
+		/sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters
+		directory.
+
 What:		/sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/stats/nr_tried
 Date:		Mar 2022
 Contact:	SeongJae Park <sj@kernel.org>
-- 
2.51.0

From 114b480877698f7835a5ba95c6fbd97b63b119f6 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Wed, 5 Mar 2025 14:27:33 -0800
Subject: [PATCH 05/16] Docs/admin-guide/mm/damon/usage: update for
 {core,ops}_filters directories

Document the {core,ops}_filters directories in the usage document.

Link: https://lkml.kernel.org/r/20250305222733.59089-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 31 ++++++++++++++------ 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index de549dd18107..ced2013db3df 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -83,7 +83,7 @@ comma (","). │ │ │ │ │ │ │ │ :ref:`goals `/nr_goals │ │ │ │ │ │ │ │ │ 0/target_metric,target_value,current_value │ │ │ │ │ │ │ :ref:`watermarks `/metric,interval_us,high,mid,low - │ │ │ │ │ │ │ :ref:`filters `/nr_filters + │ │ │ │ │ │ │ :ref:`{core_,ops_,}filters `/nr_filters │ │ │ │ │ │ │ │ 0/type,matching,allow,memcg_path,addr_start,addr_end,target_idx,min,max │ │ │ │ │ │ │ :ref:`stats `/nr_tried,sz_tried,nr_applied,sz_applied,sz_ops_filter_passed,qt_exceeds │ │ │ │ │ │ │ :ref:`tried_regions `/total_bytes @@ -307,9 +307,10 @@ to ``N-1``. Each directory represents each DAMON-based operation scheme. schemes// ------------ -In each scheme directory, five directories (``access_pattern``, ``quotas``, -``watermarks``, ``filters``, ``stats``, and ``tried_regions``) and three files -(``action``, ``target_nid`` and ``apply_interval``) exist. +In each scheme directory, seven directories (``access_pattern``, ``quotas``, +``watermarks``, ``core_filters``, ``ops_filters``, ``filters``, ``stats``, and +``tried_regions``) and three files (``action``, ``target_nid`` and +``apply_interval``) exist. The ``action`` file is for setting and getting the scheme's :ref:`action `. The keywords that can be written to and read @@ -420,13 +421,24 @@ The ``interval`` should written in microseconds unit. .. _sysfs_filters: -schemes//filters/ --------------------- +schemes//{core\_,ops\_,}filters/ +----------------------------------- -The directory for the :ref:`filters ` of the given +Directories for :ref:`filters ` of the given DAMON-based operation scheme. -In the beginning, this directory has only one file, ``nr_filters``. Writing a +``core_filters`` and ``ops_filters`` directories are for the filters handled by +the DAMON core layer and operations set layer, respectively. ``filters`` +directory can be used for installing filters regardless of their handled +layers. Filters that requested by ``core_filters`` and ``ops_filters`` will be +installed before those of ``filters``. All three directories have same files. + +Use of ``filters`` directory can make expecting evaluation orders of given +filters with the files under directory bit confusing. Users are hence +recommended to use ``core_filters`` and ``ops_filters`` directories. The +``filters`` directory could be deprecated in future. + +In the beginning, the directory has only one file, ``nr_filters``. Writing a number (``N``) to the file creates the number of child directories named ``0`` to ``N-1``. Each directory represents each filter. The filters are evaluated in the numeric order. @@ -435,7 +447,7 @@ Each filter directory contains nine files, namely ``type``, ``matching``, ``allow``, ``memcg_path``, ``addr_start``, ``addr_end``, ``min``, ``max`` and ``target_idx``. To ``type`` file, you can write the type of the filter. Refer to :ref:`the design doc ` for available type -names and their meanings. +names, their meaning and on what layer those are handled. 
For ``memcg`` type, you can specify the memory cgroup of the interest by writing the path of the memory cgroup from the cgroups mount point to @@ -455,6 +467,7 @@ the ``type`` and ``matching`` should be allowed or not. For example, below restricts a DAMOS action to be applied to only non-anonymous pages of all memory cgroups except ``/having_care_already``.:: + # cd ops_filters/0/ # echo 2 > nr_filters # # disallow anonymous pages echo anon > 0/type -- 2.51.0 From 2273dea6b1e1fcdd06d207048a2cd563ed80111a Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Wed, 5 Mar 2025 11:54:09 +0800 Subject: [PATCH 06/16] mm/hugetlb: update nr_huge_pages and surplus_huge_pages together In alloc_surplus_hugetlb_folio(), we increase nr_huge_pages and surplus_huge_pages separately. In the middle window, if we set nr_hugepages to smaller and satisfy count < persistent_huge_pages(h), the surplus_huge_pages will be increased by adjust_pool_surplus(). After adding delay in the middle window, we can reproduce the problem easily by following step: 1. echo 3 > /proc/sys/vm/nr_overcommit_hugepages 2. mmap two hugepages. When nr_huge_pages=2 and surplus_huge_pages=1, goto step 3. 3. echo 0 > /proc/sys/vm/nr_huge_pages Finally, nr_huge_pages is less than surplus_huge_pages. To fix the problem, call only_alloc_fresh_hugetlb_folio() instead and move down __prep_account_new_huge_page() into the hugetlb_lock. Link: https://lkml.kernel.org/r/20250305035409.2391344-1-liushixin2@huawei.com Fixes: 0c397daea1d4 ("mm, hugetlb: further simplify hugetlb allocation API") Signed-off-by: Liu Shixin Acked-by: Peter Xu Acked-by: Oscar Salvador Cc: David Hildenbrand Cc: Kefeng Wang Cc: Liu Shixin Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 438de55dd38d..af9b8c1fca67 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2259,11 +2259,20 @@ static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h, goto out_unlock; spin_unlock_irq(&hugetlb_lock); - folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask); + folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL); if (!folio) return NULL; + hugetlb_vmemmap_optimize_folio(h, folio); + spin_lock_irq(&hugetlb_lock); + /* + * nr_huge_pages needs to be adjusted within the same lock cycle + * as surplus_pages, otherwise it might confuse + * persistent_huge_pages() momentarily. + */ + __prep_account_new_huge_page(h, nid); + /* * We could have raced with the pool size change. * Double check that and simply deallocate the new page -- 2.51.0 From ff22f9299d7b2c7874b560993c21543708b7e1b6 Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Thu, 6 Mar 2025 12:50:10 -0800 Subject: [PATCH 07/16] page_io: zswap: do not crash the kernel on decompression failure Currently, we crash the kernel when a decompression failure occurs in zswap (either because of memory corruption, or a bug in the compression algorithm). This is overkill. We should only SIGBUS the unfortunate process asking for the zswap entry on zswap load, and skip the corrupted entry in zswap writeback. See [1] for a recent upstream discussion about this. The zswap writeback case is relatively straightforward to fix. For the zswap_load() case, we change the return behavior: * Return 0 on success. * Return -ENOENT (with the folio locked) if zswap does not own the swapped out content. * Return -EIO if zswap owns the swapped out content, but encounters a decompression failure for some reasons. 
The folio will be unlocked, but not be marked up-to-date, which will eventually cause the process requesting the page to SIGBUS (see the handling of not-up-to-date folio in do_swap_page() in mm/memory.c), without crashing the kernel. * Return -EINVAL if we encounter a large folio, as large folio should not be swapped in while zswap is being used. Similar to the -EIO case, we also unlock the folio but do not mark it as up-to-date to SIGBUS the faulting process. As a side effect, we require one extra zswap tree traversal in the load and writeback paths. Quick benchmarking on a kernel build test shows no performance difference: With the new scheme: real: mean: 125.1s, stdev: 0.12s user: mean: 3265.23s, stdev: 9.62s sys: mean: 2156.41s, stdev: 13.98s The old scheme: real: mean: 125.78s, stdev: 0.45s user: mean: 3287.18s, stdev: 5.95s sys: mean: 2177.08s, stdev: 26.52s [nphamcs@gmail.com: fix documentation of zswap_load()] Link: https://lkml.kernel.org/r/20250306222453.1269456-1-nphamcs@gmail.com Link: https://lore.kernel.org/all/ZsiLElTykamcYZ6J@casper.infradead.org/ [1] Link: https://lkml.kernel.org/r/20250306205011.784787-1-nphamcs@gmail.com Signed-off-by: Nhat Pham Suggested-by: Matthew Wilcox Suggested-by: Yosry Ahmed Suggested-by: Johannes Weiner Reviewed-by: Chengming Zhou Acked-by: Johannes Weiner Signed-off-by: Andrew Morton --- include/linux/zswap.h | 6 +-- mm/page_io.c | 6 +-- mm/zswap.c | 119 +++++++++++++++++++++++++++++------------- 3 files changed, 88 insertions(+), 43 deletions(-) diff --git a/include/linux/zswap.h b/include/linux/zswap.h index d961ead91bf1..30c193a1207e 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -26,7 +26,7 @@ struct zswap_lruvec_state { unsigned long zswap_total_pages(void); bool zswap_store(struct folio *folio); -bool zswap_load(struct folio *folio); +int zswap_load(struct folio *folio); void zswap_invalidate(swp_entry_t swp); int zswap_swapon(int type, unsigned long nr_pages); void zswap_swapoff(int type); @@ -44,9 +44,9 @@ static inline bool zswap_store(struct folio *folio) return false; } -static inline bool zswap_load(struct folio *folio) +static inline int zswap_load(struct folio *folio) { - return false; + return -ENOENT; } static inline void zswap_invalidate(swp_entry_t swp) {} diff --git a/mm/page_io.c b/mm/page_io.c index 9b983de351f9..4bce19df557b 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -638,11 +638,11 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug) if (swap_read_folio_zeromap(folio)) { folio_unlock(folio); goto finish; - } else if (zswap_load(folio)) { - folio_unlock(folio); - goto finish; } + if (zswap_load(folio) != -ENOENT) + goto finish; + /* We have to read from slower devices. Increase zswap protection. 
*/ zswap_folio_swapin(folio); diff --git a/mm/zswap.c b/mm/zswap.c index 5f0e62289444..0dcc54eab58b 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -62,6 +62,8 @@ static u64 zswap_reject_reclaim_fail; static u64 zswap_reject_compress_fail; /* Compressed page was too big for the allocator to (optimally) store */ static u64 zswap_reject_compress_poor; +/* Load or writeback failed due to decompression failure */ +static u64 zswap_decompress_fail; /* Store failed because underlying allocator could not get memory */ static u64 zswap_reject_alloc_fail; /* Store failed because the entry metadata could not be allocated (rare) */ @@ -985,11 +987,12 @@ unlock: return comp_ret == 0 && alloc_ret == 0; } -static void zswap_decompress(struct zswap_entry *entry, struct folio *folio) +static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio) { struct zpool *zpool = entry->pool->zpool; struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; + int decomp_ret, dlen; u8 *src, *obj; acomp_ctx = acomp_ctx_get_cpu_lock(entry->pool); @@ -1012,11 +1015,21 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio) sg_init_table(&output, 1); sg_set_folio(&output, folio, PAGE_SIZE, 0); acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE); - BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait)); - BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE); + decomp_ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait); + dlen = acomp_ctx->req->dlen; zpool_obj_read_end(zpool, entry->handle, obj); acomp_ctx_put_unlock(acomp_ctx); + + if (!decomp_ret && dlen == PAGE_SIZE) + return true; + + zswap_decompress_fail++; + pr_alert_ratelimited("Decompression error from zswap (%d:%lu %s %u->%d)\n", + swp_type(entry->swpentry), + swp_offset(entry->swpentry), + entry->pool->tfm_name, entry->length, dlen); + return false; } /********************************* @@ -1046,6 +1059,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, }; + int ret = 0; /* try to allocate swap cache folio */ si = get_swap_device(swpentry); @@ -1067,8 +1081,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry, * and freed when invalidated by the concurrent shrinker anyway. */ if (!folio_was_allocated) { - folio_put(folio); - return -EEXIST; + ret = -EEXIST; + goto out; } /* @@ -1081,14 +1095,17 @@ static int zswap_writeback_entry(struct zswap_entry *entry, * be dereferenced. 
*/ tree = swap_zswap_tree(swpentry); - if (entry != xa_cmpxchg(tree, offset, entry, NULL, GFP_KERNEL)) { - delete_from_swap_cache(folio); - folio_unlock(folio); - folio_put(folio); - return -ENOMEM; + if (entry != xa_load(tree, offset)) { + ret = -ENOMEM; + goto out; } - zswap_decompress(entry, folio); + if (!zswap_decompress(entry, folio)) { + ret = -EIO; + goto out; + } + + xa_erase(tree, offset); count_vm_event(ZSWPWB); if (entry->objcg) @@ -1104,9 +1121,14 @@ static int zswap_writeback_entry(struct zswap_entry *entry, /* start writeback */ __swap_writepage(folio, &wbc); - folio_put(folio); - return 0; +out: + if (ret && ret != -EEXIST) { + delete_from_swap_cache(folio); + folio_unlock(folio); + } + folio_put(folio); + return ret; } /********************************* @@ -1606,7 +1628,27 @@ check_old: return ret; } -bool zswap_load(struct folio *folio) +/** + * zswap_load() - load a folio from zswap + * @folio: folio to load + * + * Return: 0 on success, with the folio unlocked and marked up-to-date, or one + * of the following error codes: + * + * -EIO: if the swapped out content was in zswap, but could not be loaded + * into the page due to a decompression failure. The folio is unlocked, but + * NOT marked up-to-date, so that an IO error is emitted (e.g. do_swap_page() + * will SIGBUS). + * + * -EINVAL: if the swapped out content was in zswap, but the page belongs + * to a large folio, which is not supported by zswap. The folio is unlocked, + * but NOT marked up-to-date, so that an IO error is emitted (e.g. + * do_swap_page() will SIGBUS). + * + * -ENOENT: if the swapped out content was not in zswap. The folio remains + * locked on return. + */ +int zswap_load(struct folio *folio) { swp_entry_t swp = folio->swap; pgoff_t offset = swp_offset(swp); @@ -1617,18 +1659,32 @@ bool zswap_load(struct folio *folio) VM_WARN_ON_ONCE(!folio_test_locked(folio)); if (zswap_never_enabled()) - return false; + return -ENOENT; /* * Large folios should not be swapped in while zswap is being used, as * they are not properly handled. Zswap does not properly load large * folios, and a large folio may only be partially in zswap. - * - * Return true without marking the folio uptodate so that an IO error is - * emitted (e.g. do_swap_page() will sigbus). */ - if (WARN_ON_ONCE(folio_test_large(folio))) - return true; + if (WARN_ON_ONCE(folio_test_large(folio))) { + folio_unlock(folio); + return -EINVAL; + } + + entry = xa_load(tree, offset); + if (!entry) + return -ENOENT; + + if (!zswap_decompress(entry, folio)) { + folio_unlock(folio); + return -EIO; + } + + folio_mark_uptodate(folio); + + count_vm_event(ZSWPIN); + if (entry->objcg) + count_objcg_events(entry->objcg, ZSWPIN, 1); /* * When reading into the swapcache, invalidate our entry. The @@ -1642,27 +1698,14 @@ bool zswap_load(struct folio *folio) * files, which reads into a private page and may free it if * the fault fails. We remain the primary owner of the entry.) 
*/ - if (swapcache) - entry = xa_erase(tree, offset); - else - entry = xa_load(tree, offset); - - if (!entry) - return false; - - zswap_decompress(entry, folio); - - count_vm_event(ZSWPIN); - if (entry->objcg) - count_objcg_events(entry->objcg, ZSWPIN, 1); - if (swapcache) { - zswap_entry_free(entry); folio_mark_dirty(folio); + xa_erase(tree, offset); + zswap_entry_free(entry); } - folio_mark_uptodate(folio); - return true; + folio_unlock(folio); + return 0; } void zswap_invalidate(swp_entry_t swp) @@ -1757,6 +1800,8 @@ static int zswap_debugfs_init(void) zswap_debugfs_root, &zswap_reject_compress_fail); debugfs_create_u64("reject_compress_poor", 0444, zswap_debugfs_root, &zswap_reject_compress_poor); + debugfs_create_u64("decompress_fail", 0444, + zswap_debugfs_root, &zswap_decompress_fail); debugfs_create_u64("written_back_pages", 0444, zswap_debugfs_root, &zswap_written_back_pages); debugfs_create_file("pool_total_size", 0444, -- 2.51.0 From e11079dd25c525aaf238d81287851ef16a521ef3 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 13 Mar 2025 15:49:51 +0200 Subject: [PATCH 08/16] arm: mem_init: use memblock_phys_free() to free DMA memory on SA1111 Patch series "arch, mm: reduce code duplication in mem_init()", v2. Every architecture has implementation of mem_init() function and some even more than one. All these release free memory to the buddy allocator, most of them set high_memory to the end of directly addressable memory and many of them set max_mapnr for FLATMEM case. These patches pull the commonalities into the generic code and refactor some of the mem_init() implementations so that many of them can be just dropped. This patch (of 13): This will help to pull out memblock_free_all() to generic code. Link: https://lkml.kernel.org/r/20250313135003.836600-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20250313135003.836600-2-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Tested-by: Mark Brown Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Catalin Marinas Cc: Dave Hansen Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Madhavan Srinivasan Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: "Mike Rapoport (IBM)" Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russel King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm/mm/init.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 5345d218899a..9aec1cb2386f 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -277,14 +277,14 @@ void __init mem_init(void) set_max_mapnr(pfn_to_page(max_pfn) - mem_map); - /* this will put all unused low memory onto the freelists */ - memblock_free_all(); - #ifdef CONFIG_SA1111 /* now that our DMA memory is actually so designated, we can free it */ - free_reserved_area(__va(PHYS_OFFSET), swapper_pg_dir, -1, NULL); + memblock_phys_free(PHYS_OFFSET, __pa(swapper_pg_dir) - PHYS_OFFSET); #endif + /* this will put all unused low memory onto the freelists */ + memblock_free_all(); + free_highpages(); /* -- 2.51.0 From 2b1d532e106ee63acb61a8e11608fafd75e52c4d Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 13 Mar 2025 15:49:52 +0200 Subject: [PATCH 09/16] csky: move setup_initrd() to setup.c Memory used by initrd should be reserved as soon as possible before there any memblock allocations that might overwrite that memory. This will also help with pulling out memblock_free_all() to the generic code and reducing code duplication in arch::mem_init(). Link: https://lkml.kernel.org/r/20250313135003.836600-3-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Guo Ren (csky) Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Catalin Marinas Cc: Dave Hansen Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Madhavan Srinivasan Cc: Mark Brown Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russel King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/csky/kernel/setup.c | 43 ++++++++++++++++++++++++++++++++++++++++ arch/csky/mm/init.c | 43 ---------------------------------------- 2 files changed, 43 insertions(+), 43 deletions(-) diff --git a/arch/csky/kernel/setup.c b/arch/csky/kernel/setup.c index fe715b707fd0..e0d6ca86ea8c 100644 --- a/arch/csky/kernel/setup.c +++ b/arch/csky/kernel/setup.c @@ -12,6 +12,45 @@ #include #include +#ifdef CONFIG_BLK_DEV_INITRD +static void __init setup_initrd(void) +{ + unsigned long size; + + if (initrd_start >= initrd_end) { + pr_err("initrd not found or empty"); + goto disable; + } + + if (__pa(initrd_end) > PFN_PHYS(max_low_pfn)) { + pr_err("initrd extends beyond end of memory"); + goto disable; + } + + size = initrd_end - initrd_start; + + if (memblock_is_region_reserved(__pa(initrd_start), size)) { + pr_err("INITRD: 0x%08lx+0x%08lx overlaps in-use memory region", + __pa(initrd_start), size); + goto disable; + } + + memblock_reserve(__pa(initrd_start), size); + + pr_info("Initial ramdisk at: 0x%p (%lu bytes)\n", + (void *)(initrd_start), size); + + initrd_below_start_ok = 1; + + return; + +disable: + initrd_start = initrd_end = 0; + + pr_err(" - disabling initrd\n"); +} +#endif + static void __init csky_memblock_init(void) { unsigned long lowmem_size = PFN_DOWN(LOWMEM_LIMIT - PHYS_OFFSET_OFFSET); @@ -40,6 +79,10 @@ static void __init csky_memblock_init(void) max_low_pfn = min_low_pfn + sseg_size; } +#ifdef CONFIG_BLK_DEV_INITRD + setup_initrd(); +#endif + max_zone_pfn[ZONE_NORMAL] = max_low_pfn; mmu_init(min_low_pfn, max_low_pfn); diff --git a/arch/csky/mm/init.c b/arch/csky/mm/init.c index bde7cabd23df..ab51acbc19b2 100644 --- a/arch/csky/mm/init.c +++ b/arch/csky/mm/init.c @@ -42,45 +42,6 @@ unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; EXPORT_SYMBOL(empty_zero_page); -#ifdef CONFIG_BLK_DEV_INITRD -static void __init setup_initrd(void) -{ - unsigned long size; - - if (initrd_start >= initrd_end) { - pr_err("initrd not found or empty"); - goto disable; - } - - if (__pa(initrd_end) > PFN_PHYS(max_low_pfn)) { - pr_err("initrd extends beyond end of memory"); - goto disable; - } - - size = initrd_end - initrd_start; - - if (memblock_is_region_reserved(__pa(initrd_start), size)) { - pr_err("INITRD: 0x%08lx+0x%08lx overlaps in-use memory region", - __pa(initrd_start), size); - goto disable; - } - - memblock_reserve(__pa(initrd_start), size); - - pr_info("Initial ramdisk at: 0x%p (%lu bytes)\n", - (void *)(initrd_start), size); - - initrd_below_start_ok = 1; - - return; - -disable: - initrd_start = initrd_end = 0; - - pr_err(" - disabling initrd\n"); -} -#endif - void __init mem_init(void) { #ifdef CONFIG_HIGHMEM @@ -92,10 +53,6 @@ void __init mem_init(void) #endif high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT); -#ifdef CONFIG_BLK_DEV_INITRD - setup_initrd(); -#endif - memblock_free_all(); #ifdef CONFIG_HIGHMEM -- 2.51.0 From 30686816214b6062246e4918f3eadd1f55382425 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 13 Mar 2025 
15:49:53 +0200 Subject: [PATCH 10/16] hexagon: move initialization of init_mm.context init to paging_init() This will help with pulling out memblock_free_all() to the generic code and reducing code duplication in arch::mem_init(). Link: https://lkml.kernel.org/r/20250313135003.836600-4-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Catalin Marinas Cc: Dave Hansen Cc: David S. Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Guo Ren (csky) Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Madhavan Srinivasan Cc: Mark Brown Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russel King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/hexagon/mm/init.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c index 3458f39ca2ac..508bb6a8dcc9 100644 --- a/arch/hexagon/mm/init.c +++ b/arch/hexagon/mm/init.c @@ -59,14 +59,6 @@ void __init mem_init(void) * To-Do: someone somewhere should wipe out the bootmem map * after we're done? */ - - /* - * This can be moved to some more virtual-memory-specific - * initialization hook at some point. Set the init_mm - * descriptors "context" value to point to the initial - * kernel segment table's physical address. - */ - init_mm.context.ptbase = __pa(init_mm.pgd); } void sync_icache_dcache(pte_t pte) @@ -103,6 +95,12 @@ static void __init paging_init(void) free_area_init(max_zone_pfn); /* sets up the zonelists and mem_map */ + /* + * Set the init_mm descriptors "context" value to point to the + * initial kernel segment table's physical address. + */ + init_mm.context.ptbase = __pa(init_mm.pgd); + /* * Start of high memory area. Will probably need something more * fancy if we... get more fancy. -- 2.51.0 From 67e7a600869ca557e259f1ce20f8b89bb95ca97d Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 13 Mar 2025 15:49:54 +0200 Subject: [PATCH 11/16] MIPS: consolidate mem_init() for NUMA machines Both MIPS systems that support numa (loongsoon3 and sgi-ip27) have identical mem_init() for NUMA case. Move that into arch/mips/mm/init.c and drop duplicate per-machine definitions. Link: https://lkml.kernel.org/r/20250313135003.836600-5-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Catalin Marinas Cc: Dave Hansen Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Guo Ren (csky) Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Madhavan Srinivasan Cc: Mark Brown Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russel King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/mips/loongson64/numa.c | 7 ------- arch/mips/mm/init.c | 7 +++++++ arch/mips/sgi-ip27/ip27-memory.c | 9 --------- 3 files changed, 7 insertions(+), 16 deletions(-) diff --git a/arch/mips/loongson64/numa.c b/arch/mips/loongson64/numa.c index 8388400d052f..95d5f553ce19 100644 --- a/arch/mips/loongson64/numa.c +++ b/arch/mips/loongson64/numa.c @@ -164,13 +164,6 @@ void __init paging_init(void) free_area_init(zones_size); } -void __init mem_init(void) -{ - high_memory = (void *) __va(get_num_physpages() << PAGE_SHIFT); - memblock_free_all(); - setup_zero_pages(); /* This comes from node 0 */ -} - /* All PCI device belongs to logical Node-0 */ int pcibus_to_node(struct pci_bus *bus) { diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 4583d1a2a73e..3db6082c611e 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -482,6 +482,13 @@ void __init mem_init(void) 0x80000000 - 4, KCORE_TEXT); #endif } +#else /* CONFIG_NUMA */ +void __init mem_init(void) +{ + high_memory = (void *) __va(get_num_physpages() << PAGE_SHIFT); + memblock_free_all(); + setup_zero_pages(); /* This comes from node 0 */ +} #endif /* !CONFIG_NUMA */ void free_init_pages(const char *what, unsigned long begin, unsigned long end) diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c index 1963313f55d8..2b3e46e2e607 100644 --- a/arch/mips/sgi-ip27/ip27-memory.c +++ b/arch/mips/sgi-ip27/ip27-memory.c @@ -406,8 +406,6 @@ void __init prom_meminit(void) } } -extern void setup_zero_pages(void); - void __init paging_init(void) { unsigned long zones_size[MAX_NR_ZONES] = {0, }; @@ -416,10 +414,3 @@ void __init paging_init(void) zones_size[ZONE_NORMAL] = max_low_pfn; free_area_init(zones_size); } - -void __init mem_init(void) -{ - high_memory = (void *) __va(get_num_physpages() << PAGE_SHIFT); - memblock_free_all(); - setup_zero_pages(); /* This comes from node 0 */ -} -- 2.51.0 From e74e2b8eb424c26dff35727d242437edb87684aa Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 13 Mar 2025 15:49:55 +0200 Subject: [PATCH 12/16] MIPS: make setup_zero_pages() use memblock Allocating the zero pages from memblock is simpler because the memory is already reserved. This will also help with pulling out memblock_free_all() to the generic code and reducing code duplication in arch::mem_init(). Link: https://lkml.kernel.org/r/20250313135003.836600-6-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Catalin Marinas Cc: Dave Hansen Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Guo Ren (csky) Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Madhavan Srinivasan Cc: Mark Brown Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russel King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/mips/include/asm/mmzone.h | 2 -- arch/mips/mm/init.c | 18 +++++------------- 2 files changed, 5 insertions(+), 15 deletions(-) diff --git a/arch/mips/include/asm/mmzone.h b/arch/mips/include/asm/mmzone.h index 14226ea42036..602a21aee9d4 100644 --- a/arch/mips/include/asm/mmzone.h +++ b/arch/mips/include/asm/mmzone.h @@ -20,6 +20,4 @@ #define nid_to_addrbase(nid) 0 #endif -extern void setup_zero_pages(void); - #endif /* _ASM_MMZONE_H_ */ diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 3db6082c611e..820e35a59d4d 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -59,24 +59,16 @@ EXPORT_SYMBOL(zero_page_mask); /* * Not static inline because used by IP27 special magic initialization code */ -void setup_zero_pages(void) +static void __init setup_zero_pages(void) { - unsigned int order, i; - struct page *page; + unsigned int order; if (cpu_has_vce) order = 3; else order = 0; - empty_zero_page = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order); - if (!empty_zero_page) - panic("Oh boy, that early out of memory?"); - - page = virt_to_page((void *)empty_zero_page); - split_page(page, order); - for (i = 0; i < (1 << order); i++, page++) - mark_page_reserved(page); + empty_zero_page = (unsigned long)memblock_alloc_or_panic(PAGE_SIZE << order, PAGE_SIZE); zero_page_mask = ((PAGE_SIZE << order) - 1) & PAGE_MASK; } @@ -470,9 +462,9 @@ void __init mem_init(void) BUILD_BUG_ON(IS_ENABLED(CONFIG_32BIT) && (PFN_PTE_SHIFT > PAGE_SHIFT)); maar_init(); - memblock_free_all(); setup_zero_pages(); /* Setup zeroed pages. */ mem_init_free_highmem(); + memblock_free_all(); #ifdef CONFIG_64BIT if ((unsigned long) &_text > (unsigned long) CKSEG0) @@ -486,8 +478,8 @@ void __init mem_init(void) void __init mem_init(void) { high_memory = (void *) __va(get_num_physpages() << PAGE_SHIFT); - memblock_free_all(); setup_zero_pages(); /* This comes from node 0 */ + memblock_free_all(); } #endif /* !CONFIG_NUMA */ -- 2.51.0 From be971f957a80e2bbd7747a295886df1472803ff1 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 13 Mar 2025 15:49:56 +0200 Subject: [PATCH 13/16] nios2: move pr_debug() about memory start and end to setup_arch() This will help with pulling out memblock_free_all() to the generic code and reducing code duplication in arch::mem_init(). Link: https://lkml.kernel.org/r/20250313135003.836600-7-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Catalin Marinas Cc: Dave Hansen Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Guo Ren (csky) Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Madhavan Srinivasan Cc: Mark Brown Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russel King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/nios2/kernel/setup.c | 2 ++ arch/nios2/mm/init.c | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/nios2/kernel/setup.c b/arch/nios2/kernel/setup.c index da122a5fa43b..a4cffbfc1399 100644 --- a/arch/nios2/kernel/setup.c +++ b/arch/nios2/kernel/setup.c @@ -149,6 +149,8 @@ void __init setup_arch(char **cmdline_p) memory_start = memblock_start_of_DRAM(); memory_end = memblock_end_of_DRAM(); + pr_debug("%s: start=%lx, end=%lx\n", __func__, memory_start, memory_end); + setup_initial_init_mm(_stext, _etext, _edata, _end); init_task.thread.kregs = &fake_regs; diff --git a/arch/nios2/mm/init.c b/arch/nios2/mm/init.c index a2278485de19..aa692ad30044 100644 --- a/arch/nios2/mm/init.c +++ b/arch/nios2/mm/init.c @@ -65,8 +65,6 @@ void __init mem_init(void) unsigned long end_mem = memory_end; /* this must not include kernel stack at top */ - pr_debug("mem_init: start=%lx, end=%lx\n", memory_start, memory_end); - end_mem &= PAGE_MASK; high_memory = __va(end_mem); -- 2.51.0 From 54ccf66f99d6d2630895eb13156be19a033d4566 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 13 Mar 2025 15:49:57 +0200 Subject: [PATCH 14/16] s390: make setup_zero_pages() use memblock Allocating the zero pages from memblock is simpler because the memory is already reserved. This will also help with pulling out memblock_free_all() to the generic code and reducing code duplication in arch::mem_init(). Link: https://lkml.kernel.org/r/20250313135003.836600-8-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Heiko Carstens Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Catalin Marinas Cc: Dave Hansen Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Guo Ren (csky) Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Madhavan Srinivasan Cc: Mark Brown Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russel King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/s390/mm/init.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index d88cb1c13f7d..2b41dc9b1fa3 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -73,8 +73,6 @@ static void __init setup_zero_pages(void) { unsigned long total_pages = memblock_estimated_nr_free_pages(); unsigned int order; - struct page *page; - int i; /* Latest machines require a mapping granularity of 512KB */ order = 7; @@ -83,16 +81,7 @@ static void __init setup_zero_pages(void) while (order > 2 && (total_pages >> 10) < (1UL << order)) order--; - empty_zero_page = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order); - if (!empty_zero_page) - panic("Out of memory in setup_zero_pages"); - - page = virt_to_page((void *) empty_zero_page); - split_page(page, order); - for (i = 1 << order; i > 0; i--) { - mark_page_reserved(page); - page++; - } + empty_zero_page = (unsigned long)memblock_alloc_or_panic(PAGE_SIZE << order, PAGE_SIZE); zero_page_mask = ((PAGE_SIZE << order) - 1) & PAGE_MASK; } @@ -176,9 +165,10 @@ void __init mem_init(void) pv_init(); kfence_split_mapping(); + setup_zero_pages(); /* Setup zeroed pages. */ + /* this will put all low memory onto the freelists */ memblock_free_all(); - setup_zero_pages(); /* Setup zeroed pages. */ } unsigned long memory_block_size_bytes(void) -- 2.51.0 From d319c8b4918d24aea6fd90bd39cd5bc9fcf40859 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 13 Mar 2025 15:49:58 +0200 Subject: [PATCH 15/16] xtensa: split out printing of virtual memory layout to a function This will help with pulling out memblock_free_all() to the generic code and reducing code duplication in arch::mem_init(). Link: https://lkml.kernel.org/r/20250313135003.836600-9-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: Max Filippov Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Catalin Marinas Cc: Dave Hansen Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Guo Ren (csky) Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Madhavan Srinivasan Cc: Mark Brown Cc: Matt Turner Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russel King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/xtensa/mm/init.c | 97 ++++++++++++++++++++++--------------------- 1 file changed, 50 insertions(+), 47 deletions(-) diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c index b2587a1a7c46..01577d33e602 100644 --- a/arch/xtensa/mm/init.c +++ b/arch/xtensa/mm/init.c @@ -66,6 +66,55 @@ void __init bootmem_init(void) memblock_dump_all(); } +static void __init print_vm_layout(void) +{ + pr_info("virtual kernel memory layout:\n" +#ifdef CONFIG_KASAN + " kasan : 0x%08lx - 0x%08lx (%5lu MB)\n" +#endif +#ifdef CONFIG_MMU + " vmalloc : 0x%08lx - 0x%08lx (%5lu MB)\n" +#endif +#ifdef CONFIG_HIGHMEM + " pkmap : 0x%08lx - 0x%08lx (%5lu kB)\n" + " fixmap : 0x%08lx - 0x%08lx (%5lu kB)\n" +#endif + " lowmem : 0x%08lx - 0x%08lx (%5lu MB)\n" + " .text : 0x%08lx - 0x%08lx (%5lu kB)\n" + " .rodata : 0x%08lx - 0x%08lx (%5lu kB)\n" + " .data : 0x%08lx - 0x%08lx (%5lu kB)\n" + " .init : 0x%08lx - 0x%08lx (%5lu kB)\n" + " .bss : 0x%08lx - 0x%08lx (%5lu kB)\n", +#ifdef CONFIG_KASAN + KASAN_SHADOW_START, KASAN_SHADOW_START + KASAN_SHADOW_SIZE, + KASAN_SHADOW_SIZE >> 20, +#endif +#ifdef CONFIG_MMU + VMALLOC_START, VMALLOC_END, + (VMALLOC_END - VMALLOC_START) >> 20, +#ifdef CONFIG_HIGHMEM + PKMAP_BASE, PKMAP_BASE + LAST_PKMAP * PAGE_SIZE, + (LAST_PKMAP*PAGE_SIZE) >> 10, + FIXADDR_START, FIXADDR_END, + (FIXADDR_END - FIXADDR_START) >> 10, +#endif + PAGE_OFFSET, PAGE_OFFSET + + (max_low_pfn - min_low_pfn) * PAGE_SIZE, +#else + min_low_pfn * PAGE_SIZE, max_low_pfn * PAGE_SIZE, +#endif + ((max_low_pfn - min_low_pfn) * PAGE_SIZE) >> 20, + (unsigned long)_text, (unsigned long)_etext, + (unsigned long)(_etext - _text) >> 10, + (unsigned long)__start_rodata, (unsigned long)__end_rodata, + (unsigned long)(__end_rodata - __start_rodata) >> 10, + (unsigned long)_sdata, (unsigned long)_edata, + (unsigned long)(_edata - _sdata) >> 10, + (unsigned long)__init_begin, (unsigned long)__init_end, + (unsigned long)(__init_end - __init_begin) >> 10, + (unsigned long)__bss_start, (unsigned long)__bss_stop, + (unsigned long)(__bss_stop - __bss_start) >> 10); +} void __init zones_init(void) { @@ -77,6 +126,7 @@ void __init zones_init(void) #endif }; free_area_init(max_zone_pfn); + print_vm_layout(); } static void __init free_highpages(void) @@ -118,53 +168,6 @@ void __init mem_init(void) high_memory = (void *)__va(max_low_pfn << PAGE_SHIFT); memblock_free_all(); - - pr_info("virtual kernel memory layout:\n" -#ifdef CONFIG_KASAN - " kasan : 0x%08lx - 0x%08lx (%5lu MB)\n" -#endif -#ifdef CONFIG_MMU - " vmalloc : 0x%08lx - 0x%08lx (%5lu MB)\n" -#endif -#ifdef CONFIG_HIGHMEM - " pkmap : 0x%08lx - 0x%08lx (%5lu kB)\n" - " fixmap : 0x%08lx - 0x%08lx (%5lu kB)\n" -#endif - " lowmem : 0x%08lx - 0x%08lx (%5lu MB)\n" - " .text : 0x%08lx - 0x%08lx (%5lu kB)\n" - " .rodata : 0x%08lx - 0x%08lx (%5lu kB)\n" - " .data : 0x%08lx - 0x%08lx (%5lu kB)\n" - " .init : 0x%08lx - 0x%08lx (%5lu kB)\n" - " .bss : 0x%08lx - 0x%08lx (%5lu kB)\n", -#ifdef CONFIG_KASAN - KASAN_SHADOW_START, KASAN_SHADOW_START + KASAN_SHADOW_SIZE, - KASAN_SHADOW_SIZE >> 
20, -#endif -#ifdef CONFIG_MMU - VMALLOC_START, VMALLOC_END, - (VMALLOC_END - VMALLOC_START) >> 20, -#ifdef CONFIG_HIGHMEM - PKMAP_BASE, PKMAP_BASE + LAST_PKMAP * PAGE_SIZE, - (LAST_PKMAP*PAGE_SIZE) >> 10, - FIXADDR_START, FIXADDR_END, - (FIXADDR_END - FIXADDR_START) >> 10, -#endif - PAGE_OFFSET, PAGE_OFFSET + - (max_low_pfn - min_low_pfn) * PAGE_SIZE, -#else - min_low_pfn * PAGE_SIZE, max_low_pfn * PAGE_SIZE, -#endif - ((max_low_pfn - min_low_pfn) * PAGE_SIZE) >> 20, - (unsigned long)_text, (unsigned long)_etext, - (unsigned long)(_etext - _text) >> 10, - (unsigned long)__start_rodata, (unsigned long)__end_rodata, - (unsigned long)(__end_rodata - __start_rodata) >> 10, - (unsigned long)_sdata, (unsigned long)_edata, - (unsigned long)(_edata - _sdata) >> 10, - (unsigned long)__init_begin, (unsigned long)__init_end, - (unsigned long)(__init_end - __init_begin) >> 10, - (unsigned long)__bss_start, (unsigned long)__bss_stop, - (unsigned long)(__bss_stop - __bss_start) >> 10); } static void __init parse_memmap_one(char *p) -- 2.51.0 From 8268af309d07d1c6279080b4e6fd16ec75cc977c Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 13 Mar 2025 15:49:59 +0200 Subject: [PATCH 16/16] arch, mm: set max_mapnr when allocating memory map for FLATMEM max_mapnr is essentially the size of the memory map for systems that use FLATMEM. There is no reason to calculate it in each and every architecture when it's anyway calculated in alloc_node_mem_map(). Drop setting of max_mapnr from architecture code and set it once in alloc_node_mem_map(). While on it, move definition of mem_map and max_mapnr to mm/mm_init.c so there won't be two copies for MMU and !MMU variants. Link: https://lkml.kernel.org/r/20250313135003.836600-10-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Dave Hansen [x86] Tested-by: Mark Brown Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Catalin Marinas Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Guo Ren (csky) Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Madhavan Srinivasan Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russel King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/alpha/mm/init.c | 1 - arch/arc/mm/init.c | 5 ----- arch/arm/mm/init.c | 2 -- arch/csky/mm/init.c | 4 ---- arch/loongarch/mm/init.c | 1 - arch/microblaze/mm/init.c | 4 ---- arch/mips/mm/init.c | 8 -------- arch/nios2/kernel/setup.c | 1 - arch/nios2/mm/init.c | 2 +- arch/openrisc/mm/init.c | 1 - arch/parisc/mm/init.c | 1 - arch/powerpc/kernel/setup-common.c | 2 -- arch/riscv/mm/init.c | 1 - arch/s390/mm/init.c | 1 - arch/sh/mm/init.c | 1 - arch/sparc/mm/init_32.c | 1 - arch/um/include/shared/mem_user.h | 1 - arch/um/kernel/physmem.c | 12 ------------ arch/um/kernel/um_arch.c | 1 - arch/x86/mm/init_32.c | 3 --- arch/xtensa/mm/init.c | 1 - include/asm-generic/memory_model.h | 5 +++-- include/linux/mm.h | 11 ----------- mm/memory.c | 8 -------- mm/mm_init.c | 25 +++++++++++++++++-------- mm/nommu.c | 4 ---- 26 files changed, 21 insertions(+), 86 deletions(-) diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index 61c2198b1359..ec0eeae9c653 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -276,7 +276,6 @@ srm_paging_stop (void) void __init mem_init(void) { - set_max_mapnr(max_low_pfn); high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); memblock_free_all(); } diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index 6a71b23f1383..7ef883d58dc1 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -154,11 +154,6 @@ void __init setup_arch_memory(void) arch_pfn_offset = min(min_low_pfn, min_high_pfn); kmap_init(); - -#else /* CONFIG_HIGHMEM */ - /* pfn_valid() uses this when FLATMEM=y and HIGHMEM=n */ - max_mapnr = max_low_pfn - min_low_pfn; - #endif /* CONFIG_HIGHMEM */ free_area_init(max_zone_pfn); diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 9aec1cb2386f..d4bcc745a044 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -275,8 +275,6 @@ void __init mem_init(void) swiotlb_init(max_pfn > arm_dma_pfn_limit, SWIOTLB_VERBOSE); #endif - set_max_mapnr(pfn_to_page(max_pfn) - mem_map); - #ifdef CONFIG_SA1111 /* now that our DMA memory is actually so designated, we can free it */ memblock_phys_free(PHYS_OFFSET, __pa(swapper_pg_dir) - PHYS_OFFSET); diff --git a/arch/csky/mm/init.c b/arch/csky/mm/init.c index ab51acbc19b2..ba6694d6170a 100644 --- a/arch/csky/mm/init.c +++ b/arch/csky/mm/init.c @@ -46,10 +46,6 @@ void __init mem_init(void) { #ifdef CONFIG_HIGHMEM unsigned long tmp; - - set_max_mapnr(highend_pfn - ARCH_PFN_OFFSET); -#else - set_max_mapnr(max_low_pfn - ARCH_PFN_OFFSET); #endif high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT); diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c index ca5aa5f46a9f..00449df50db1 100644 --- a/arch/loongarch/mm/init.c +++ b/arch/loongarch/mm/init.c @@ -78,7 +78,6 @@ void __init paging_init(void) void __init mem_init(void) { - max_mapnr = max_low_pfn; high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT); memblock_free_all(); diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index 4520c5741579..857cd2b44bcf 100644 --- a/arch/microblaze/mm/init.c +++ 
b/arch/microblaze/mm/init.c @@ -104,17 +104,13 @@ void __init setup_memory(void) * * min_low_pfn - the first page (mm/bootmem.c - node_boot_start) * max_low_pfn - * max_mapnr - the first unused page (mm/bootmem.c - node_low_pfn) */ /* memory start is from the kernel end (aligned) to higher addr */ min_low_pfn = memory_start >> PAGE_SHIFT; /* minimum for allocation */ - /* RAM is assumed contiguous */ - max_mapnr = memory_size >> PAGE_SHIFT; max_low_pfn = ((u64)memory_start + (u64)lowmem_size) >> PAGE_SHIFT; max_pfn = ((u64)memory_start + (u64)memory_size) >> PAGE_SHIFT; - pr_info("%s: max_mapnr: %#lx\n", __func__, max_mapnr); pr_info("%s: min_low_pfn: %#lx\n", __func__, min_low_pfn); pr_info("%s: max_low_pfn: %#lx\n", __func__, max_low_pfn); pr_info("%s: max_pfn: %#lx\n", __func__, max_pfn); diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 820e35a59d4d..eb61a73520a0 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -415,15 +415,7 @@ void __init paging_init(void) " %ldk highmem ignored\n", (highend_pfn - max_low_pfn) << (PAGE_SHIFT - 10)); max_zone_pfns[ZONE_HIGHMEM] = max_low_pfn; - - max_mapnr = max_low_pfn; - } else if (highend_pfn) { - max_mapnr = highend_pfn; - } else { - max_mapnr = max_low_pfn; } -#else - max_mapnr = max_low_pfn; #endif high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT); diff --git a/arch/nios2/kernel/setup.c b/arch/nios2/kernel/setup.c index a4cffbfc1399..2a40150142c3 100644 --- a/arch/nios2/kernel/setup.c +++ b/arch/nios2/kernel/setup.c @@ -158,7 +158,6 @@ void __init setup_arch(char **cmdline_p) *cmdline_p = boot_command_line; find_limits(&min_low_pfn, &max_low_pfn, &max_pfn); - max_mapnr = max_low_pfn; memblock_reserve(__pa_symbol(_stext), _end - _stext); #ifdef CONFIG_BLK_DEV_INITRD diff --git a/arch/nios2/mm/init.c b/arch/nios2/mm/init.c index aa692ad30044..3cafa87ead9e 100644 --- a/arch/nios2/mm/init.c +++ b/arch/nios2/mm/init.c @@ -51,7 +51,7 @@ void __init paging_init(void) pagetable_init(); pgd_current = swapper_pg_dir; - max_zone_pfn[ZONE_NORMAL] = max_mapnr; + max_zone_pfn[ZONE_NORMAL] = max_low_pfn; /* pass the memory from the bootmem allocator to the main allocator */ free_area_init(max_zone_pfn); diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index d0cb1a0126f9..9093c336e158 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -193,7 +193,6 @@ void __init mem_init(void) { BUG_ON(!mem_map); - max_mapnr = max_low_pfn; high_memory = (void *)__va(max_low_pfn * PAGE_SIZE); /* clear the zero-page */ diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 61c0a2477072..2cdfc0b1195c 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -563,7 +563,6 @@ void __init mem_init(void) #endif high_memory = __va((max_pfn << PAGE_SHIFT)); - set_max_mapnr(max_low_pfn); memblock_free_all(); #ifdef CONFIG_PA11 diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index a08b0ede4e64..68d47c53876c 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -957,8 +957,6 @@ void __init setup_arch(char **cmdline_p) /* Parse memory topology */ mem_topology_setup(); - /* Set max_mapnr before paging_init() */ - set_max_mapnr(max_pfn); high_memory = (void *)__va(max_low_pfn * PAGE_SIZE); /* diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 15b2eda4c364..157c9ca51541 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -298,7 +298,6 @@ static void __init setup_bootmem(void) high_memory = (void 
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 15b2eda4c364..157c9ca51541 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -298,7 +298,6 @@ static void __init setup_bootmem(void)
 
 	high_memory = (void *)(__va(PFN_PHYS(max_low_pfn)));
 	dma32_phys_limit = min(4UL * SZ_1G, (unsigned long)PFN_PHYS(max_low_pfn));
-	set_max_mapnr(max_low_pfn - ARCH_PFN_OFFSET);
 
 	reserve_initrd_mem();
 
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 2b41dc9b1fa3..ad567e2100b7 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -159,7 +159,6 @@ void __init mem_init(void)
 	cpumask_set_cpu(0, &init_mm.context.cpu_attach_mask);
 	cpumask_set_cpu(0, mm_cpumask(&init_mm));
 
-	set_max_mapnr(max_low_pfn);
 	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
 
 	pv_init();
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index 289a2fecebef..72aea5cd1b85 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -290,7 +290,6 @@ void __init paging_init(void)
 	 */
 	max_low_pfn = max_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
 	min_low_pfn = __MEMORY_START >> PAGE_SHIFT;
-	set_max_mapnr(max_low_pfn - min_low_pfn);
 
 	nodes_clear(node_online_map);
 
diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c
index d96a14ffceeb..6b58da14edc6 100644
--- a/arch/sparc/mm/init_32.c
+++ b/arch/sparc/mm/init_32.c
@@ -275,7 +275,6 @@ void __init mem_init(void)
 
 	taint_real_pages();
 
-	max_mapnr = last_valid_pfn - pfn_base;
 	high_memory = __va(max_low_pfn << PAGE_SHIFT);
 
 	memblock_free_all();
diff --git a/arch/um/include/shared/mem_user.h b/arch/um/include/shared/mem_user.h
index adfa08062f88..d4727efcf23d 100644
--- a/arch/um/include/shared/mem_user.h
+++ b/arch/um/include/shared/mem_user.h
@@ -47,7 +47,6 @@ extern int iomem_size;
 #define ROUND_4M(n) ((((unsigned long) (n)) + (1 << 22)) & ~((1 << 22) - 1))
 
 extern unsigned long find_iomem(char *driver, unsigned long *len_out);
-extern void mem_total_pages(unsigned long physmem, unsigned long iomem);
 extern void setup_physmem(unsigned long start, unsigned long usable,
 			  unsigned long len);
 extern void map_memory(unsigned long virt, unsigned long phys,
diff --git a/arch/um/kernel/physmem.c b/arch/um/kernel/physmem.c
index a74f17b033c4..af02b5f9911d 100644
--- a/arch/um/kernel/physmem.c
+++ b/arch/um/kernel/physmem.c
@@ -22,18 +22,6 @@ static int physmem_fd = -1;
 
 unsigned long high_physmem;
 EXPORT_SYMBOL(high_physmem);
 
-void __init mem_total_pages(unsigned long physmem, unsigned long iomem)
-{
-	unsigned long phys_pages, iomem_pages, total_pages;
-
-	phys_pages = physmem >> PAGE_SHIFT;
-	iomem_pages = iomem >> PAGE_SHIFT;
-
-	total_pages = phys_pages + iomem_pages;
-
-	max_mapnr = total_pages;
-}
-
 void map_memory(unsigned long virt, unsigned long phys, unsigned long len,
 		int r, int w, int x)
diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c
index 79ea97d4797e..6414cbf00572 100644
--- a/arch/um/kernel/um_arch.c
+++ b/arch/um/kernel/um_arch.c
@@ -419,7 +419,6 @@ void __init setup_arch(char **cmdline_p)
 
 	stack_protections((unsigned long) init_task.stack);
 	setup_physmem(uml_physmem, uml_reserved, physmem_size);
-	mem_total_pages(physmem_size, iomem_size);
 	uml_dtb_init();
 	read_initrd();
 
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index ac41b1e0940d..6d2f8cb9451e 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -650,9 +650,6 @@ void __init initmem_init(void)
 
 	memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0);
 
-#ifdef CONFIG_FLATMEM
-	max_mapnr = IS_ENABLED(CONFIG_HIGHMEM) ? highend_pfn : max_low_pfn;
-#endif
 	__vmalloc_start_set = true;
 
 	printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c
index 01577d33e602..9f1b0d5fccc7 100644
--- a/arch/xtensa/mm/init.c
+++ b/arch/xtensa/mm/init.c
@@ -164,7 +164,6 @@ void __init mem_init(void)
 {
 	free_highpages();
 
-	max_mapnr = max_pfn - ARCH_PFN_OFFSET;
 	high_memory = (void *)__va(max_low_pfn << PAGE_SHIFT);
 
 	memblock_free_all();
diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h
index 6d1fb6162ac1..a3b5029aebbd 100644
--- a/include/asm-generic/memory_model.h
+++ b/include/asm-generic/memory_model.h
@@ -19,11 +19,12 @@
 #define __page_to_pfn(page)	((unsigned long)((page) - mem_map) + \
 				 ARCH_PFN_OFFSET)
 
+/* avoid include hell */
+extern unsigned long max_mapnr;
+
 #ifndef pfn_valid
 static inline int pfn_valid(unsigned long pfn)
 {
-	/* avoid include hell */
-	extern unsigned long max_mapnr;
 	unsigned long pfn_offset = ARCH_PFN_OFFSET;
 
 	return pfn >= pfn_offset && (pfn - pfn_offset) < max_mapnr;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 82776b409391..8c4cb8c28507 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -46,17 +46,6 @@ extern int sysctl_page_lock_unfairness;
 void mm_core_init(void);
 void init_mm_internals(void);
 
-#ifndef CONFIG_NUMA		/* Don't use mapnrs, do it properly */
-extern unsigned long max_mapnr;
-
-static inline void set_max_mapnr(unsigned long limit)
-{
-	max_mapnr = limit;
-}
-#else
-static inline void set_max_mapnr(unsigned long limit) { }
-#endif
-
 extern atomic_long_t _totalram_pages;
 static inline unsigned long totalram_pages(void)
 {
diff --git a/mm/memory.c b/mm/memory.c
index 8873b7a4962c..a1d7664855f2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -95,14 +95,6 @@
 #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
 #endif
 
-#ifndef CONFIG_NUMA
-unsigned long max_mapnr;
-EXPORT_SYMBOL(max_mapnr);
-
-struct page *mem_map;
-EXPORT_SYMBOL(mem_map);
-#endif
-
 static vm_fault_t do_fault(struct vm_fault *vmf);
 static vm_fault_t do_anonymous_page(struct vm_fault *vmf);
 static bool vmf_pte_changed(struct vm_fault *vmf);
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 133640a93d1d..7fd48d2d5064 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -37,6 +37,14 @@
 
 #include <asm/setup.h>
 
+#ifndef CONFIG_NUMA
+unsigned long max_mapnr;
+EXPORT_SYMBOL(max_mapnr);
+
+struct page *mem_map;
+EXPORT_SYMBOL(mem_map);
+#endif
+
 #ifdef CONFIG_DEBUG_MEMORY_INIT
 int __meminitdata mminit_loglevel;
 
@@ -1639,7 +1647,7 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat)
 	start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
 	offset = pgdat->node_start_pfn - start;
 	/*
-	 * The zone's endpoints aren't required to be MAX_PAGE_ORDER
+	 * The zone's endpoints aren't required to be MAX_PAGE_ORDER
 	 * aligned but the node_mem_map endpoints must be in order
 	 * for the buddy allocator to function correctly.
 	 */
@@ -1655,14 +1663,15 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat)
 	pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
 		 __func__, pgdat->node_id, (unsigned long)pgdat,
 		 (unsigned long)pgdat->node_mem_map);
-#ifndef CONFIG_NUMA
+
 	/* the global mem_map is just set as node 0's */
-	if (pgdat == NODE_DATA(0)) {
-		mem_map = NODE_DATA(0)->node_mem_map;
-		if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
-			mem_map -= offset;
-	}
-#endif
+	WARN_ON(pgdat != NODE_DATA(0));
+
+	mem_map = pgdat->node_mem_map;
+	if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
+		mem_map -= offset;
+
+	max_mapnr = end - start;
 }
 #else
 static inline void alloc_node_mem_map(struct pglist_data *pgdat) { }
diff --git a/mm/nommu.c b/mm/nommu.c
index 8b31d8396297..43751726f977 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -44,16 +44,12 @@
 
 void *high_memory;
 EXPORT_SYMBOL(high_memory);
-struct page *mem_map;
-unsigned long max_mapnr;
-EXPORT_SYMBOL(max_mapnr);
 unsigned long highest_memmap_pfn;
 int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
 int heap_stack_gap = 0;
 
 atomic_long_t mmap_pages_allocated;
 
-EXPORT_SYMBOL(mem_map);
 
 /* list of mapped, potentially shareable regions */
 static struct kmem_cache *vm_region_jar;
-- 
2.51.0
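[editor's note] The net effect of this patch is that the generic FLATMEM pfn_valid()
in include/asm-generic/memory_model.h is now backed by a max_mapnr that
mm/mm_init.c computes once in alloc_node_mem_map() (max_mapnr = end - start for
node 0), instead of by per-architecture assignments. As a quick illustration of
the resulting range check, here is a minimal user-space sketch; the
ARCH_PFN_OFFSET and max_mapnr values below are hypothetical stand-ins, not
taken from any real architecture:

#include <stdio.h>

/* Hypothetical stand-ins: in the kernel, ARCH_PFN_OFFSET comes from the
 * architecture and max_mapnr is set by alloc_node_mem_map() to the number
 * of pages covered by the node 0 memory map. */
#define ARCH_PFN_OFFSET 0x100UL

static unsigned long max_mapnr = 0x1000;

/* Same check as the generic FLATMEM pfn_valid() touched by this patch:
 * a pfn is valid iff it lies in [ARCH_PFN_OFFSET, ARCH_PFN_OFFSET + max_mapnr). */
static int pfn_valid(unsigned long pfn)
{
	unsigned long pfn_offset = ARCH_PFN_OFFSET;

	return pfn >= pfn_offset && (pfn - pfn_offset) < max_mapnr;
}

int main(void)
{
	printf("%d\n", pfn_valid(0x0ffUL));  /* 0: below ARCH_PFN_OFFSET */
	printf("%d\n", pfn_valid(0x100UL));  /* 1: first mapped pfn */
	printf("%d\n", pfn_valid(0x10ffUL)); /* 1: last mapped pfn */
	printf("%d\n", pfn_valid(0x1100UL)); /* 0: beyond the memory map */
	return 0;
}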