Blame - mm/mm_init.c - kernel/common

blob: 2630cc30147e052af5374464224dcc3887273bfd [file] [log] [blame]

Thomas Gleixner	457c899	2019-05-19 13:08:55 +0100	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0-only
Mel Gorman	6b74ab9	2008-07-23 21:26:49 -0700	[diff] [blame]	2	/*
				3	* mm_init.c - Memory initialisation verification and debugging
				4	*
				5	* Copyright 2008 IBM Corporation, 2008
				6	* Author Mel Gorman <mel@csn.ul.ie>
				7	*
				8	*/
				9	#include <linux/kernel.h>
				10	#include <linux/init.h>
Nishanth Aravamudan	ff7ea79	2008-07-23 21:27:39 -0700	[diff] [blame]	11	#include <linux/kobject.h>
Paul Gortmaker	b95f1b31	2011-10-16 02:01:52 -0400	[diff] [blame]	12	#include <linux/export.h>
Tim Chen	917d929	2013-07-03 15:02:44 -0700	[diff] [blame]	13	#include <linux/memory.h>
				14	#include <linux/notifier.h>
Mel Gorman	7e18adb	2015-06-30 14:57:05 -0700	[diff] [blame]	15	#include <linux/sched.h>
Feng Tang	56f3547	2020-08-06 23:23:15 -0700	[diff] [blame]	16	#include <linux/mman.h>
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	17	#include <linux/memblock.h>
				18	#include <linux/page-isolation.h>
				19	#include <linux/padata.h>
				20	#include <linux/nmi.h>
				21	#include <linux/buffer_head.h>
				22	#include <linux/kmemleak.h>
Mike Rapoport (IBM)	b7ec1bf	2023-03-21 19:05:06 +0200	[diff] [blame]	23	#include <linux/kfence.h>
				24	#include <linux/page_ext.h>
				25	#include <linux/pti.h>
				26	#include <linux/pgtable.h>
Suren Baghdasaryan	dcfe378	2024-03-21 09:36:36 -0700	[diff] [blame]	27	#include <linux/stackdepot.h>
Mike Rapoport (IBM)	eb8589b	2023-03-21 19:05:10 +0200	[diff] [blame]	28	#include <linux/swap.h>
				29	#include <linux/cma.h>
Ma Wupeng	7ea6ec4	2024-01-09 12:15:36 +0800	[diff] [blame]	30	#include <linux/crash_dump.h>
Mike Rapoport (IBM)	f6bec26	2024-05-05 19:06:19 +0300	[diff] [blame]	31	#include <linux/execmem.h>
Sourav Panda	15995a3	2024-06-05 22:27:51 +0000	[diff] [blame]	32	#include <linux/vmstat.h>
Mel Gorman	708614e	2008-07-23 21:26:51 -0700	[diff] [blame]	33	#include "internal.h"
Mike Rapoport (IBM)	d5d2c02	2023-03-21 19:05:11 +0200	[diff] [blame]	34	#include "slab.h"
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	35	#include "shuffle.h"
Mel Gorman	6b74ab9	2008-07-23 21:26:49 -0700	[diff] [blame]	36
Mike Rapoport (IBM)	b7ec1bf	2023-03-21 19:05:06 +0200	[diff] [blame]	37	#include <asm/setup.h>
				38
Nishanth Aravamudan	5e9426ab	2008-07-23 21:27:39 -0700	[diff] [blame]	39	#ifdef CONFIG_DEBUG_MEMORY_INIT
Rasmus Villemoes	194e815	2015-02-12 15:00:12 -0800	[diff] [blame]	40	int __meminitdata mminit_loglevel;
Mel Gorman	6b74ab9	2008-07-23 21:26:49 -0700	[diff] [blame]	41
Mel Gorman	68ad8df	2008-07-23 21:26:52 -0700	[diff] [blame]	42	/* The zonelists are simply reported, validation is manual. */
Rasmus Villemoes	0e2342c	2015-02-12 15:00:09 -0800	[diff] [blame]	43	void __init mminit_verify_zonelist(void)
Mel Gorman	68ad8df	2008-07-23 21:26:52 -0700	[diff] [blame]	44	{
				45	int nid;
				46
				47	if (mminit_loglevel < MMINIT_VERIFY)
				48	return;
				49
				50	for_each_online_node(nid) {
				51	pg_data_t *pgdat = NODE_DATA(nid);
				52	struct zone *zone;
				53	struct zoneref *z;
				54	struct zonelist *zonelist;
				55	int i, listid, zoneid;
				56
Mel Gorman	68ad8df	2008-07-23 21:26:52 -0700	[diff] [blame]	57	for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) {
				58
				59	/* Identify the zone and nodelist */
				60	zoneid = i % MAX_NR_ZONES;
				61	listid = i / MAX_NR_ZONES;
				62	zonelist = &pgdat->node_zonelists[listid];
				63	zone = &pgdat->node_zones[zoneid];
				64	if (!populated_zone(zone))
				65	continue;
				66
				67	/* Print information about the zonelist */
				68	printk(KERN_DEBUG "mminit::zonelist %s %d:%s = ",
				69	listid > 0 ? "thisnode" : "general", nid,
				70	zone->name);
				71
				72	/* Iterate the zonelist */
Pavel Tatashin	c1093b7	2018-08-21 21:53:32 -0700	[diff] [blame]	73	for_each_zone_zonelist(zone, z, zonelist, zoneid)
				74	pr_cont("%d:%s ", zone_to_nid(zone), zone->name);
Joe Perches	1170532	2016-03-17 14:19:50 -0700	[diff] [blame]	75	pr_cont("\n");
Mel Gorman	68ad8df	2008-07-23 21:26:52 -0700	[diff] [blame]	76	}
				77	}
				78	}
				79
Mel Gorman	708614e	2008-07-23 21:26:51 -0700	[diff] [blame]	80	void __init mminit_verify_pageflags_layout(void)
				81	{
				82	int shift, width;
				83	unsigned long or_mask, add_mask;
				84
Miaohe Lin	daee07b	2023-08-07 10:35:28 +0800	[diff] [blame]	85	shift = BITS_PER_LONG;
Suren Baghdasaryan	4835f74	2024-10-23 10:07:59 -0700	[diff] [blame]	86	width = shift - NR_NON_PAGEFLAG_BITS;
Mel Gorman	708614e	2008-07-23 21:26:51 -0700	[diff] [blame]	87	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
Yu Zhao	ec1c86b2	2022-09-18 02:00:02 -0600	[diff] [blame]	88	"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
Mel Gorman	708614e	2008-07-23 21:26:51 -0700	[diff] [blame]	89	SECTIONS_WIDTH,
				90	NODES_WIDTH,
				91	ZONES_WIDTH,
Peter Zijlstra	9057289	2013-10-07 11:29:20 +0100	[diff] [blame]	92	LAST_CPUPID_WIDTH,
Jing Xia	86fea8b	2020-06-01 21:52:49 -0700	[diff] [blame]	93	KASAN_TAG_WIDTH,
Yu Zhao	ec1c86b2	2022-09-18 02:00:02 -0600	[diff] [blame]	94	LRU_GEN_WIDTH,
				95	LRU_REFS_WIDTH,
Mel Gorman	708614e	2008-07-23 21:26:51 -0700	[diff] [blame]	96	NR_PAGEFLAGS);
				97	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
Jing Xia	86fea8b	2020-06-01 21:52:49 -0700	[diff] [blame]	98	"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
Mel Gorman	708614e	2008-07-23 21:26:51 -0700	[diff] [blame]	99	SECTIONS_SHIFT,
Mel Gorman	708614e	2008-07-23 21:26:51 -0700	[diff] [blame]	100	NODES_SHIFT,
Mel Gorman	a4e1b4c	2013-02-22 16:34:47 -0800	[diff] [blame]	101	ZONES_SHIFT,
Jing Xia	86fea8b	2020-06-01 21:52:49 -0700	[diff] [blame]	102	LAST_CPUPID_SHIFT,
				103	KASAN_TAG_WIDTH);
Mel Gorman	a4e1b4c	2013-02-22 16:34:47 -0800	[diff] [blame]	104	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts",
Jing Xia	86fea8b	2020-06-01 21:52:49 -0700	[diff] [blame]	105	"Section %lu Node %lu Zone %lu Lastcpupid %lu Kasantag %lu\n",
Mel Gorman	708614e	2008-07-23 21:26:51 -0700	[diff] [blame]	106	(unsigned long)SECTIONS_PGSHIFT,
				107	(unsigned long)NODES_PGSHIFT,
Mel Gorman	a4e1b4c	2013-02-22 16:34:47 -0800	[diff] [blame]	108	(unsigned long)ZONES_PGSHIFT,
Jing Xia	86fea8b	2020-06-01 21:52:49 -0700	[diff] [blame]	109	(unsigned long)LAST_CPUPID_PGSHIFT,
				110	(unsigned long)KASAN_TAG_PGSHIFT);
Mel Gorman	a4e1b4c	2013-02-22 16:34:47 -0800	[diff] [blame]	111	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid",
				112	"Node/Zone ID: %lu -> %lu\n",
				113	(unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT),
				114	(unsigned long)ZONEID_PGOFF);
Mel Gorman	708614e	2008-07-23 21:26:51 -0700	[diff] [blame]	115	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage",
Mel Gorman	a4e1b4c	2013-02-22 16:34:47 -0800	[diff] [blame]	116	"location: %d -> %d layout %d -> %d unused %d -> %d page-flags\n",
Mel Gorman	708614e	2008-07-23 21:26:51 -0700	[diff] [blame]	117	shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0);
				118	#ifdef NODE_NOT_IN_PAGE_FLAGS
				119	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
				120	"Node not in page flags");
				121	#endif
Peter Zijlstra	9057289	2013-10-07 11:29:20 +0100	[diff] [blame]	122	#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
Mel Gorman	a4e1b4c	2013-02-22 16:34:47 -0800	[diff] [blame]	123	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
Peter Zijlstra	9057289	2013-10-07 11:29:20 +0100	[diff] [blame]	124	"Last cpupid not in page flags");
Mel Gorman	a4e1b4c	2013-02-22 16:34:47 -0800	[diff] [blame]	125	#endif
Mel Gorman	708614e	2008-07-23 21:26:51 -0700	[diff] [blame]	126
				127	if (SECTIONS_WIDTH) {
				128	shift -= SECTIONS_WIDTH;
				129	BUG_ON(shift != SECTIONS_PGSHIFT);
				130	}
				131	if (NODES_WIDTH) {
				132	shift -= NODES_WIDTH;
				133	BUG_ON(shift != NODES_PGSHIFT);
				134	}
				135	if (ZONES_WIDTH) {
				136	shift -= ZONES_WIDTH;
				137	BUG_ON(shift != ZONES_PGSHIFT);
				138	}
				139
				140	/* Check for bitmask overlaps */
				141	or_mask = (ZONES_MASK << ZONES_PGSHIFT) \|
				142	(NODES_MASK << NODES_PGSHIFT) \|
				143	(SECTIONS_MASK << SECTIONS_PGSHIFT);
				144	add_mask = (ZONES_MASK << ZONES_PGSHIFT) +
				145	(NODES_MASK << NODES_PGSHIFT) +
				146	(SECTIONS_MASK << SECTIONS_PGSHIFT);
				147	BUG_ON(or_mask != add_mask);
				148	}
				149
Mel Gorman	6b74ab9	2008-07-23 21:26:49 -0700	[diff] [blame]	150	static __init int set_mminit_loglevel(char *str)
				151	{
				152	get_option(&str, &mminit_loglevel);
				153	return 0;
				154	}
				155	early_param("mminit_loglevel", set_mminit_loglevel);
Nishanth Aravamudan	5e9426ab	2008-07-23 21:27:39 -0700	[diff] [blame]	156	#endif /* CONFIG_DEBUG_MEMORY_INIT */
Nishanth Aravamudan	ff7ea79	2008-07-23 21:27:39 -0700	[diff] [blame]	157
					/* kobject named "mm"; created with kernel_kobj as parent by mm_sysfs_init(). */
				158	struct kobject *mm_kobj;
Nishanth Aravamudan	ff7ea79	2008-07-23 21:27:39 -0700	[diff] [blame]	159
Tim Chen	917d929	2013-07-03 15:02:44 -0700	[diff] [blame]	160	#ifdef CONFIG_SMP
				161	s32 vm_committed_as_batch = 32;
				162
Feng Tang	56f3547	2020-08-06 23:23:15 -0700	[diff] [blame]	163	void mm_compute_batch(int overcommit_policy)
Tim Chen	917d929	2013-07-03 15:02:44 -0700	[diff] [blame]	164	{
				165	u64 memsized_batch;
				166	s32 nr = num_present_cpus();
				167	s32 batch = max_t(s32, nr*2, 32);
Feng Tang	56f3547	2020-08-06 23:23:15 -0700	[diff] [blame]	168	unsigned long ram_pages = totalram_pages();
Tim Chen	917d929	2013-07-03 15:02:44 -0700	[diff] [blame]	169
Feng Tang	56f3547	2020-08-06 23:23:15 -0700	[diff] [blame]	170	/*
				171	* For policy OVERCOMMIT_NEVER, set batch size to 0.4% of
				172	* (total memory/#cpus), and lift it to 25% for other policies
				173	* to easy the possible lock contention for percpu_counter
				174	* vm_committed_as, while the max limit is INT_MAX
				175	*/
				176	if (overcommit_policy == OVERCOMMIT_NEVER)
				177	memsized_batch = min_t(u64, ram_pages/nr/256, INT_MAX);
				178	else
				179	memsized_batch = min_t(u64, ram_pages/nr/4, INT_MAX);
Tim Chen	917d929	2013-07-03 15:02:44 -0700	[diff] [blame]	180
				181	vm_committed_as_batch = max_t(s32, memsized_batch, batch);
				182	}
				183
				184	static int __meminit mm_compute_batch_notifier(struct notifier_block *self,
				185	unsigned long action, void *arg)
				186	{
				187	switch (action) {
				188	case MEM_ONLINE:
				189	case MEM_OFFLINE:
Feng Tang	56f3547	2020-08-06 23:23:15 -0700	[diff] [blame]	190	mm_compute_batch(sysctl_overcommit_memory);
Gustavo A. R. Silva	01359eb	2020-12-14 19:15:00 -0800	[diff] [blame]	191	break;
Tim Chen	917d929	2013-07-03 15:02:44 -0700	[diff] [blame]	192	default:
				193	break;
				194	}
				195	return NOTIFY_OK;
				196	}
				197
Tim Chen	917d929	2013-07-03 15:02:44 -0700	[diff] [blame]	198	static int __init mm_compute_batch_init(void)
				199	{
Feng Tang	56f3547	2020-08-06 23:23:15 -0700	[diff] [blame]	200	mm_compute_batch(sysctl_overcommit_memory);
Liu Shixin	1eeaa4f	2022-09-23 11:33:47 +0800	[diff] [blame]	201	hotplug_memory_notifier(mm_compute_batch_notifier, MM_COMPUTE_BATCH_PRI);
Tim Chen	917d929	2013-07-03 15:02:44 -0700	[diff] [blame]	202	return 0;
				203	}
				204
				205	__initcall(mm_compute_batch_init);
				206
				207	#endif
				208
Nishanth Aravamudan	ff7ea79	2008-07-23 21:27:39 -0700	[diff] [blame]	209	static int __init mm_sysfs_init(void)
				210	{
				211	mm_kobj = kobject_create_and_add("mm", kernel_kobj);
				212	if (!mm_kobj)
				213	return -ENOMEM;
				214
				215	return 0;
				216	}
Hugh Dickins	e82cb95	2014-01-27 17:06:55 -0800	[diff] [blame]	217	postcore_initcall(mm_sysfs_init);
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	218
					/* Per-zone PFN bounds; find_usable_zone_for_movable() compares them to pick movable_zone. */
				219	static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
				220	static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
					/* Per-node PFN at which ZONE_MOVABLE begins; 0 is used as "not set". */
				221	static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
				222
					/* Page counts / percentages filled in by cmdline_parse_core() for kernelcore= and movablecore=. */
				223	static unsigned long required_kernelcore __initdata;
				224	static unsigned long required_kernelcore_percent __initdata;
				225	static unsigned long required_movablecore __initdata;
				226	static unsigned long required_movablecore_percent __initdata;
				227
					/* NOTE(review): not referenced in the visible part of this file; meaning to be confirmed. */
				228	static unsigned long nr_kernel_pages __initdata;
				229	static unsigned long nr_all_pages __initdata;
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	230
					/* NOTE(review): presumably set when struct page init is deferred; not used in the visible part. */
Mike Rapoport (IBM)	de57807	2023-03-21 19:05:09 +0200	[diff] [blame]	231	static bool deferred_struct_pages __meminitdata;
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	232
					/* NOTE(review): per-CPU node statistics, presumably for use during boot; not used in the visible part. */
				233	static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
				234
				235	static int __init cmdline_parse_core(char p, unsigned long core,
				236	unsigned long *percent)
				237	{
				238	unsigned long long coremem;
				239	char *endptr;
				240
				241	if (!p)
				242	return -EINVAL;
				243
				244	/* Value may be a percentage of total memory, otherwise bytes */
				245	coremem = simple_strtoull(p, &endptr, 0);
				246	if (*endptr == '%') {
				247	/* Paranoid check for percent values greater than 100 */
				248	WARN_ON(coremem > 100);
				249
				250	*percent = coremem;
				251	} else {
				252	coremem = memparse(p, &p);
				253	/* Paranoid check that UL is enough for the coremem value */
				254	WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
				255
				256	*core = coremem >> PAGE_SHIFT;
				257	*percent = 0UL;
				258	}
				259	return 0;
				260	}
				261
Kefeng Wang	072ba38	2023-05-16 14:38:09 +0800	[diff] [blame]	262	bool mirrored_kernelcore __initdata_memblock;
				263
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	264	/*
				265	* kernelcore=size sets the amount of memory for use for allocations that
				266	* cannot be reclaimed or migrated.
				267	*/
				268	static int __init cmdline_parse_kernelcore(char *p)
				269	{
				270	/* parse kernelcore=mirror */
				271	if (parse_option_str(p, "mirror")) {
				272	mirrored_kernelcore = true;
				273	return 0;
				274	}
				275
				276	return cmdline_parse_core(p, &required_kernelcore,
				277	&required_kernelcore_percent);
				278	}
				279	early_param("kernelcore", cmdline_parse_kernelcore);
				280
				281	/*
				282	* movablecore=size sets the amount of memory for use for allocations that
				283	* can be reclaimed or migrated.
				284	*/
				285	static int __init cmdline_parse_movablecore(char *p)
				286	{
				287	return cmdline_parse_core(p, &required_movablecore,
				288	&required_movablecore_percent);
				289	}
				290	early_param("movablecore", cmdline_parse_movablecore);
				291
				292	/*
				293	* early_calculate_totalpages()
				294	* Sum pages in active regions for movable zone.
				295	* Populate N_MEMORY for calculating usable_nodes.
				296	*/
				297	static unsigned long __init early_calculate_totalpages(void)
				298	{
				299	unsigned long totalpages = 0;
				300	unsigned long start_pfn, end_pfn;
				301	int i, nid;
				302
				303	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
				304	unsigned long pages = end_pfn - start_pfn;
				305
				306	totalpages += pages;
				307	if (pages)
				308	node_set_state(nid, N_MEMORY);
				309	}
				310	return totalpages;
				311	}
				312
				313	/*
				314	* This finds a zone that can be used for ZONE_MOVABLE pages. The
				315	* assumption is made that zones within a node are ordered in monotonic
				316	* increasing memory addresses so that the "highest" populated zone is used
				317	*/
				318	static void __init find_usable_zone_for_movable(void)
				319	{
				320	int zone_index;
				321	for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
				322	if (zone_index == ZONE_MOVABLE)
				323	continue;
				324
				325	if (arch_zone_highest_possible_pfn[zone_index] >
				326	arch_zone_lowest_possible_pfn[zone_index])
				327	break;
				328	}
				329
				330	VM_BUG_ON(zone_index == -1);
				331	movable_zone = zone_index;
				332	}
				333
				334	/*
				335	* Find the PFN the Movable zone begins in each node. Kernel memory
				336	* is spread evenly between nodes as long as the nodes have enough
				337	* memory. When they don't, some nodes will have more kernelcore than
				338	* others
				339	*/
				340	static void __init find_zone_movable_pfns_for_nodes(void)
				341	{
				342	int i, nid;
				343	unsigned long usable_startpfn;
				344	unsigned long kernelcore_node, kernelcore_remaining;
				345	/* save the state before borrow the nodemask */
				346	nodemask_t saved_node_state = node_states[N_MEMORY];
				347	unsigned long totalpages = early_calculate_totalpages();
				348	int usable_nodes = nodes_weight(node_states[N_MEMORY]);
				349	struct memblock_region *r;
				350
				351	/* Need to find movable_zone earlier when movable_node is specified. */
				352	find_usable_zone_for_movable();
				353
				354	/*
				355	* If movable_node is specified, ignore kernelcore and movablecore
				356	* options.
				357	*/
				358	if (movable_node_is_enabled()) {
				359	for_each_mem_region(r) {
				360	if (!memblock_is_hotpluggable(r))
				361	continue;
				362
				363	nid = memblock_get_region_node(r);
				364
Wei Yang	3be381d	2024-05-25 02:30:38 +0000	[diff] [blame]	365	usable_startpfn = memblock_region_memory_base_pfn(r);
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	366	zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
				367	min(usable_startpfn, zone_movable_pfn[nid]) :
				368	usable_startpfn;
				369	}
				370
				371	goto out2;
				372	}
				373
				374	/*
				375	* If kernelcore=mirror is specified, ignore movablecore option
				376	*/
				377	if (mirrored_kernelcore) {
				378	bool mem_below_4gb_not_mirrored = false;
				379
Ma Wupeng	0db31d6	2023-08-02 15:23:28 +0800	[diff] [blame]	380	if (!memblock_has_mirror()) {
				381	pr_warn("The system has no mirror memory, ignore kernelcore=mirror.\n");
				382	goto out;
				383	}
				384
Ma Wupeng	7ea6ec4	2024-01-09 12:15:36 +0800	[diff] [blame]	385	if (is_kdump_kernel()) {
				386	pr_warn("The system is under kdump, ignore kernelcore=mirror.\n");
				387	goto out;
				388	}
				389
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	390	for_each_mem_region(r) {
				391	if (memblock_is_mirror(r))
				392	continue;
				393
				394	nid = memblock_get_region_node(r);
				395
				396	usable_startpfn = memblock_region_memory_base_pfn(r);
				397
				398	if (usable_startpfn < PHYS_PFN(SZ_4G)) {
				399	mem_below_4gb_not_mirrored = true;
				400	continue;
				401	}
				402
				403	zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
				404	min(usable_startpfn, zone_movable_pfn[nid]) :
				405	usable_startpfn;
				406	}
				407
				408	if (mem_below_4gb_not_mirrored)
				409	pr_warn("This configuration results in unmirrored kernel memory.\n");
				410
				411	goto out2;
				412	}
				413
				414	/*
				415	* If kernelcore=nn% or movablecore=nn% was specified, calculate the
				416	* amount of necessary memory.
				417	*/
				418	if (required_kernelcore_percent)
				419	required_kernelcore = (totalpages * 100 * required_kernelcore_percent) /
				420	10000UL;
				421	if (required_movablecore_percent)
				422	required_movablecore = (totalpages * 100 * required_movablecore_percent) /
				423	10000UL;
				424
				425	/*
				426	* If movablecore= was specified, calculate what size of
				427	* kernelcore that corresponds so that memory usable for
				428	* any allocation type is evenly spread. If both kernelcore
				429	* and movablecore are specified, then the value of kernelcore
				430	* will be used for required_kernelcore if it's greater than
				431	* what movablecore would have allowed.
				432	*/
				433	if (required_movablecore) {
				434	unsigned long corepages;
				435
				436	/*
				437	* Round-up so that ZONE_MOVABLE is at least as large as what
				438	* was requested by the user
				439	*/
				440	required_movablecore =
				441	roundup(required_movablecore, MAX_ORDER_NR_PAGES);
				442	required_movablecore = min(totalpages, required_movablecore);
				443	corepages = totalpages - required_movablecore;
				444
				445	required_kernelcore = max(required_kernelcore, corepages);
				446	}
				447
				448	/*
				449	* If kernelcore was not specified or kernelcore size is larger
				450	* than totalpages, there is no ZONE_MOVABLE.
				451	*/
				452	if (!required_kernelcore \|\| required_kernelcore >= totalpages)
				453	goto out;
				454
				455	/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
				456	usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
				457
				458	restart:
				459	/* Spread kernelcore memory as evenly as possible throughout nodes */
				460	kernelcore_node = required_kernelcore / usable_nodes;
				461	for_each_node_state(nid, N_MEMORY) {
				462	unsigned long start_pfn, end_pfn;
				463
				464	/*
				465	* Recalculate kernelcore_node if the division per node
				466	* now exceeds what is necessary to satisfy the requested
				467	* amount of memory for the kernel
				468	*/
				469	if (required_kernelcore < kernelcore_node)
				470	kernelcore_node = required_kernelcore / usable_nodes;
				471
				472	/*
				473	* As the map is walked, we track how much memory is usable
				474	* by the kernel using kernelcore_remaining. When it is
				475	* 0, the rest of the node is usable by ZONE_MOVABLE
				476	*/
				477	kernelcore_remaining = kernelcore_node;
				478
				479	/* Go through each range of PFNs within this node */
				480	for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
				481	unsigned long size_pages;
				482
				483	start_pfn = max(start_pfn, zone_movable_pfn[nid]);
				484	if (start_pfn >= end_pfn)
				485	continue;
				486
				487	/* Account for what is only usable for kernelcore */
				488	if (start_pfn < usable_startpfn) {
				489	unsigned long kernel_pages;
				490	kernel_pages = min(end_pfn, usable_startpfn)
				491	- start_pfn;
				492
				493	kernelcore_remaining -= min(kernel_pages,
				494	kernelcore_remaining);
				495	required_kernelcore -= min(kernel_pages,
				496	required_kernelcore);
				497
				498	/* Continue if range is now fully accounted */
				499	if (end_pfn <= usable_startpfn) {
				500
				501	/*
				502	* Push zone_movable_pfn to the end so
				503	* that if we have to rebalance
				504	* kernelcore across nodes, we will
				505	* not double account here
				506	*/
				507	zone_movable_pfn[nid] = end_pfn;
				508	continue;
				509	}
				510	start_pfn = usable_startpfn;
				511	}
				512
				513	/*
				514	* The usable PFN range for ZONE_MOVABLE is from
				515	* start_pfn->end_pfn. Calculate size_pages as the
				516	* number of pages used as kernelcore
				517	*/
				518	size_pages = end_pfn - start_pfn;
				519	if (size_pages > kernelcore_remaining)
				520	size_pages = kernelcore_remaining;
				521	zone_movable_pfn[nid] = start_pfn + size_pages;
				522
				523	/*
				524	* Some kernelcore has been met, update counts and
				525	* break if the kernelcore for this node has been
				526	* satisfied
				527	*/
				528	required_kernelcore -= min(required_kernelcore,
				529	size_pages);
				530	kernelcore_remaining -= size_pages;
				531	if (!kernelcore_remaining)
				532	break;
				533	}
				534	}
				535
				536	/*
				537	* If there is still required_kernelcore, we do another pass with one
				538	* less node in the count. This will push zone_movable_pfn[nid] further
				539	* along on the nodes that still have memory until kernelcore is
				540	* satisfied
				541	*/
				542	usable_nodes--;
				543	if (usable_nodes && required_kernelcore > usable_nodes)
				544	goto restart;
				545
				546	out2:
				547	/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
				548	for (nid = 0; nid < MAX_NUMNODES; nid++) {
				549	unsigned long start_pfn, end_pfn;
				550
				551	zone_movable_pfn[nid] =
				552	roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
				553
				554	get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
				555	if (zone_movable_pfn[nid] >= end_pfn)
				556	zone_movable_pfn[nid] = 0;
				557	}
				558
				559	out:
				560	/* restore the node_state */
				561	node_states[N_MEMORY] = saved_node_state;
				562	}
				563
Usama Arif	fde1c4e	2023-09-13 11:54:01 +0100	[diff] [blame]	564	void __meminit __init_single_page(struct page *page, unsigned long pfn,
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	565	unsigned long zone, int nid)
				566	{
				567	mm_zero_struct_page(page);
				568	set_page_links(page, zone, nid, pfn);
				569	init_page_count(page);
David Hildenbrand	11d5401	2024-05-29 13:19:04 +0200	[diff] [blame]	570	atomic_set(&page->_mapcount, -1);
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	571	page_cpupid_reset_last(page);
				572	page_kasan_tag_reset(page);
				573
				574	INIT_LIST_HEAD(&page->lru);
				575	#ifdef WANT_PAGE_VIRTUAL
				576	/* The shift won't overflow because ZONE_NORMAL is below 4G. */
				577	if (!is_highmem_idx(zone))
				578	set_page_address(page, __va(pfn << PAGE_SHIFT));
				579	#endif
				580	}
				581
				582	#ifdef CONFIG_NUMA
				583	/*
				584	* During memory init memblocks map pfns to nids. The search is expensive and
				585	* this caches recent lookups. The implementation of __early_pfn_to_nid
				586	* treats start/end as pfns.
				587	*/
				588	struct mminit_pfnnid_cache {
					/* First pfn of the most recently looked-up range (inclusive). */
				589	unsigned long last_start;
					/* End pfn of that range (exclusive: a hit requires pfn < last_end). */
				590	unsigned long last_end;
					/* Node id returned for that range. */
				591	int last_nid;
				592	};
				593
					/* The cache instance passed to __early_pfn_to_nid() by early_pfn_to_nid(). */
				594	static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
				595
				596	/*
				597	* Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
				598	*/
				599	static int __meminit __early_pfn_to_nid(unsigned long pfn,
				600	struct mminit_pfnnid_cache *state)
				601	{
				602	unsigned long start_pfn, end_pfn;
				603	int nid;
				604
				605	if (state->last_start <= pfn && pfn < state->last_end)
				606	return state->last_nid;
				607
				608	nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
				609	if (nid != NUMA_NO_NODE) {
				610	state->last_start = start_pfn;
				611	state->last_end = end_pfn;
				612	state->last_nid = nid;
				613	}
				614
				615	return nid;
				616	}
				617
				618	int __meminit early_pfn_to_nid(unsigned long pfn)
				619	{
				620	static DEFINE_SPINLOCK(early_pfn_lock);
				621	int nid;
				622
				623	spin_lock(&early_pfn_lock);
				624	nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
				625	if (nid < 0)
				626	nid = first_online_node;
				627	spin_unlock(&early_pfn_lock);
				628
				629	return nid;
				630	}
Mike Rapoport (IBM)	534ef4e	2023-03-21 19:05:03 +0200	[diff] [blame]	631
				632	int hashdist = HASHDIST_DEFAULT;
				633
				634	static int __init set_hashdist(char *str)
				635	{
				636	if (!str)
				637	return 0;
				638	hashdist = simple_strtoul(str, &str, 0);
				639	return 1;
				640	}
				641	__setup("hashdist=", set_hashdist);
				642
				643	static inline void fixup_hashdist(void)
				644	{
				645	if (num_node_state(N_MEMORY) == 1)
				646	hashdist = 0;
				647	}
				648	#else
				649	static inline void fixup_hashdist(void) {}
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	650	#endif /* CONFIG_NUMA */
				651
				652	#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
				653	static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
				654	{
				655	pgdat->first_deferred_pfn = ULONG_MAX;
				656	}
				657
				658	/* Returns true if the struct page for the pfn is initialised */
Yajun Deng	61167ad	2023-06-19 10:34:06 +0800	[diff] [blame]	659	static inline bool __meminit early_page_initialised(unsigned long pfn, int nid)
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	660	{
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	661	if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
				662	return false;
				663
				664	return true;
				665	}
				666
				667	/*
				668	* Returns true when the remaining initialisation should be deferred until
				669	* later in the boot cycle when it can be parallelised.
				670	*/
				671	static bool __meminit
				672	defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
				673	{
				674	static unsigned long prev_end_pfn, nr_initialised;
				675
				676	if (early_page_ext_enabled())
				677	return false;
Wei Yang	922306a	2024-05-25 02:30:40 +0000	[diff] [blame]	678
				679	/* Always populate low zones for address-constrained allocations */
				680	if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
				681	return false;
				682
				683	if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX)
				684	return true;
				685
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	686	/*
				687	* prev_end_pfn static that contains the end of previous zone
				688	* No need to protect because called very early in boot before smp_init.
				689	*/
				690	if (prev_end_pfn != end_pfn) {
				691	prev_end_pfn = end_pfn;
				692	nr_initialised = 0;
				693	}
				694
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	695	/*
				696	* We start only with one section of pages, more pages are added as
				697	* needed until the rest of deferred pages are initialized.
				698	*/
				699	nr_initialised++;
				700	if ((nr_initialised > PAGES_PER_SECTION) &&
				701	(pfn & (PAGES_PER_SECTION - 1)) == 0) {
				702	NODE_DATA(nid)->first_deferred_pfn = pfn;
				703	return true;
				704	}
				705	return false;
				706	}
				707
Yajun Deng	61167ad	2023-06-19 10:34:06 +0800	[diff] [blame]	708	static void __meminit init_reserved_page(unsigned long pfn, int nid)
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	709	{
				710	pg_data_t *pgdat;
Yajun Deng	61167ad	2023-06-19 10:34:06 +0800	[diff] [blame]	711	int zid;
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	712
Yajun Deng	61167ad	2023-06-19 10:34:06 +0800	[diff] [blame]	713	if (early_page_initialised(pfn, nid))
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	714	return;
				715
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	716	pgdat = NODE_DATA(nid);
				717
				718	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
				719	struct zone *zone = &pgdat->node_zones[zid];
				720
				721	if (zone_spans_pfn(zone, pfn))
				722	break;
				723	}
				724	__init_single_page(pfn_to_page(pfn), pfn, zid, nid);
Hua Su	98b7beb	2024-10-21 13:11:51 +0800	[diff] [blame]	725
				726	if (pageblock_aligned(pfn))
				727	set_pageblock_migratetype(pfn_to_page(pfn), MIGRATE_MOVABLE);
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	728	}
				729	#else
				730	static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
				731
Yajun Deng	61167ad	2023-06-19 10:34:06 +0800	[diff] [blame]	732	static inline bool early_page_initialised(unsigned long pfn, int nid)
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	733	{
				734	return true;
				735	}
				736
				737	static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
				738	{
				739	return false;
				740	}
				741
Yajun Deng	61167ad	2023-06-19 10:34:06 +0800	[diff] [blame]	742	static inline void init_reserved_page(unsigned long pfn, int nid)
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	743	{
				744	}
				745	#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
				746
				747	/*
				748	* Initialised pages do not have PageReserved set. This function is
				749	* called for each range allocated by the bootmem allocator and
				750	* marks the pages PageReserved. The remaining valid pages are later
				751	* sent to the buddy page allocator.
				752	*/
Yajun Deng	61167ad	2023-06-19 10:34:06 +0800	[diff] [blame]	753	void __meminit reserve_bootmem_region(phys_addr_t start,
				754	phys_addr_t end, int nid)
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	755	{
				756	unsigned long start_pfn = PFN_DOWN(start);
				757	unsigned long end_pfn = PFN_UP(end);
				758
				759	for (; start_pfn < end_pfn; start_pfn++) {
				760	if (pfn_valid(start_pfn)) {
				761	struct page *page = pfn_to_page(start_pfn);
				762
Yajun Deng	61167ad	2023-06-19 10:34:06 +0800	[diff] [blame]	763	init_reserved_page(start_pfn, nid);
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	764
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	765	/*
				766	* no need for atomic set_bit because the struct
				767	* page is not visible yet so nobody should
				768	* access it yet.
				769	*/
				770	__SetPageReserved(page);
				771	}
				772	}
				773	}
				774
				775	/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
				776	static bool __meminit
				777	overlap_memmap_init(unsigned long zone, unsigned long *pfn)
				778	{
				779	static struct memblock_region *r;
				780
				781	if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
				782	if (!r \|\| *pfn >= memblock_region_memory_end_pfn(r)) {
				783	for_each_mem_region(r) {
				784	if (*pfn < memblock_region_memory_end_pfn(r))
				785	break;
				786	}
				787	}
				788	if (*pfn >= memblock_region_memory_base_pfn(r) &&
				789	memblock_is_mirror(r)) {
				790	*pfn = memblock_region_memory_end_pfn(r);
				791	return true;
				792	}
				793	}
				794	return false;
				795	}
				796
				797	/*
				798	* Only struct pages that correspond to ranges defined by memblock.memory
				799	* are zeroed and initialized by going through __init_single_page() during
				800	* memmap_init_zone_range().
				801	*
				802	* But, there could be struct pages that correspond to holes in
				803	* memblock.memory. This can happen because of the following reasons:
				804	* - physical memory bank size is not necessarily the exact multiple of the
				805	* arbitrary section size
				806	* - early reserved memory may not be listed in memblock.memory
Serge Semin	ecf5dd1	2023-11-22 21:24:03 +0300	[diff] [blame]	807	* - non-memory regions covered by the contiguous flatmem mapping
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	808	* - memory layouts defined with memmap= kernel parameter may not align
				809	* nicely with memmap sections
				810	*
				811	* Explicitly initialize those struct pages so that:
				812	* - PG_Reserved is set
				813	* - zone and node links point to zone and node that span the page if the
				814	* hole is in the middle of a zone
				815	* - zone and node links point to adjacent zone/node if the hole falls on
				816	* the zone boundary; the pages in such holes will be prepended to the
				817	* zone/node above the hole except for the trailing pages in the last
				818	* section that will be appended to the zone/node below.
				819	*/
				820	static void __init init_unavailable_range(unsigned long spfn,
				821	unsigned long epfn,
				822	int zone, int node)
				823	{
				824	unsigned long pfn;
				825	u64 pgcnt = 0;
				826
				827	for (pfn = spfn; pfn < epfn; pfn++) {
				828	if (!pfn_valid(pageblock_start_pfn(pfn))) {
				829	pfn = pageblock_end_pfn(pfn) - 1;
				830	continue;
				831	}
				832	__init_single_page(pfn_to_page(pfn), pfn, zone, node);
				833	__SetPageReserved(pfn_to_page(pfn));
				834	pgcnt++;
				835	}
				836
				837	if (pgcnt)
Serge Semin	01846c6	2023-11-22 21:24:04 +0300	[diff] [blame]	838	pr_info("On node %d, zone %s: %lld pages in unavailable ranges\n",
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	839	node, zone_names[zone], pgcnt);
				840	}
				841
				842	/*
				843	* Initially all pages are reserved - free ones are freed
				844	* up by memblock_free_all() once the early boot process is
				845	* done. Non-atomic initialization, single-pass.
				846	*
				847	* All aligned pageblocks are initialized to the specified migratetype
				848	* (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
				849	* zone stats (e.g., nr_isolate_pageblock) are touched.
				850	*/
				851	void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone,
				852	unsigned long start_pfn, unsigned long zone_end_pfn,
				853	enum meminit_context context,
				854	struct vmem_altmap *altmap, int migratetype)
				855	{
				856	unsigned long pfn, end_pfn = start_pfn + size;
				857	struct page *page;
				858
				859	if (highest_memmap_pfn < end_pfn - 1)
				860	highest_memmap_pfn = end_pfn - 1;
				861
				862	#ifdef CONFIG_ZONE_DEVICE
				863	/*
				864	* Honor reservation requested by the driver for this ZONE_DEVICE
				865	* memory. We limit the total number of pages to initialize to just
				866	* those that might contain the memory mapping. We will defer the
				867	* ZONE_DEVICE page initialization until after we have released
				868	* the hotplug lock.
				869	*/
				870	if (zone == ZONE_DEVICE) {
				871	if (!altmap)
				872	return;
				873
				874	if (start_pfn == altmap->base_pfn)
				875	start_pfn += altmap->reserve;
				876	end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
				877	}
				878	#endif
				879
				880	for (pfn = start_pfn; pfn < end_pfn; ) {
				881	/*
				882	* There can be holes in boot-time mem_map[]s handed to this
				883	* function. They do not exist on hotplugged memory.
				884	*/
				885	if (context == MEMINIT_EARLY) {
				886	if (overlap_memmap_init(zone, &pfn))
				887	continue;
				888	if (defer_init(nid, pfn, zone_end_pfn)) {
				889	deferred_struct_pages = true;
				890	break;
				891	}
				892	}
				893
				894	page = pfn_to_page(pfn);
				895	__init_single_page(page, pfn, zone, nid);
David Hildenbrand	503b158	2024-06-07 11:09:37 +0200	[diff] [blame]	896	if (context == MEMINIT_HOTPLUG) {
				897	#ifdef CONFIG_ZONE_DEVICE
				898	if (zone == ZONE_DEVICE)
				899	__SetPageReserved(page);
				900	else
				901	#endif
				902	__SetPageOffline(page);
				903	}
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	904
				905	/*
				906	* Usually, we want to mark the pageblock MIGRATE_MOVABLE,
				907	* such that unmovable allocations won't be scattered all
				908	* over the place during system boot.
				909	*/
				910	if (pageblock_aligned(pfn)) {
				911	set_pageblock_migratetype(page, migratetype);
				912	cond_resched();
				913	}
				914	pfn++;
				915	}
				916	}
				917
				918	static void __init memmap_init_zone_range(struct zone *zone,
				919	unsigned long start_pfn,
				920	unsigned long end_pfn,
				921	unsigned long *hole_pfn)
				922	{
				923	unsigned long zone_start_pfn = zone->zone_start_pfn;
				924	unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;
				925	int nid = zone_to_nid(zone), zone_id = zone_idx(zone);
				926
				927	start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
				928	end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);
				929
				930	if (start_pfn >= end_pfn)
				931	return;
				932
				933	memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn,
				934	zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
				935
				936	if (*hole_pfn < start_pfn)
				937	init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid);
				938
				939	*hole_pfn = end_pfn;
				940	}
				941
				942	static void __init memmap_init(void)
				943	{
				944	unsigned long start_pfn, end_pfn;
				945	unsigned long hole_pfn = 0;
				946	int i, j, zone_id = 0, nid;
				947
				948	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
				949	struct pglist_data *node = NODE_DATA(nid);
				950
				951	for (j = 0; j < MAX_NR_ZONES; j++) {
				952	struct zone *zone = node->node_zones + j;
				953
				954	if (!populated_zone(zone))
				955	continue;
				956
				957	memmap_init_zone_range(zone, start_pfn, end_pfn,
				958	&hole_pfn);
				959	zone_id = j;
				960	}
				961	}
				962
				963	#ifdef CONFIG_SPARSEMEM
				964	/*
				965	* Initialize the memory map for hole in the range [memory_end,
				966	* section_end].
				967	* Append the pages in this hole to the highest zone in the last
				968	* node.
				969	* The call to init_unavailable_range() is outside the ifdef to
				970	* silence the compiler warining about zone_id set but not used;
				971	* for FLATMEM it is a nop anyway
				972	*/
				973	end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
				974	if (hole_pfn < end_pfn)
				975	#endif
				976	init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
				977	}
				978
				979	#ifdef CONFIG_ZONE_DEVICE
				980	static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
				981	unsigned long zone_idx, int nid,
				982	struct dev_pagemap *pgmap)
				983	{
				984
				985	__init_single_page(page, pfn, zone_idx, nid);
				986
				987	/*
				988	* Mark page reserved as it will need to wait for onlining
				989	* phase for it to be fully associated with a zone.
				990	*
				991	* We can use the non-atomic __set_bit operation for setting
				992	* the flag as we are still initializing the pages.
				993	*/
				994	__SetPageReserved(page);
				995
				996	/*
				997	* ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
				998	* and zone_device_data. It is a bug if a ZONE_DEVICE page is
				999	* ever freed or placed on a driver-private list.
				1000	*/
				1001	page->pgmap = pgmap;
				1002	page->zone_device_data = NULL;
				1003
				1004	/*
				1005	* Mark the block movable so that blocks are reserved for
				1006	* movable at startup. This will force kernel allocations
				1007	* to reserve their blocks rather than leaking throughout
				1008	* the address space during boot when many long-lived
				1009	* kernel allocations are made.
				1010	*
				1011	* Please note that MEMINIT_HOTPLUG path doesn't clear memmap
				1012	* because this is done early in section_activate()
				1013	*/
				1014	if (pageblock_aligned(pfn)) {
				1015	set_pageblock_migratetype(page, MIGRATE_MOVABLE);
				1016	cond_resched();
				1017	}
				1018
				1019	/*
				1020	* ZONE_DEVICE pages are released directly to the driver page allocator
				1021	* which will set the page count to 1 when allocating the page.
				1022	*/
				1023	if (pgmap->type == MEMORY_DEVICE_PRIVATE \|\|
				1024	pgmap->type == MEMORY_DEVICE_COHERENT)
				1025	set_page_count(page, 0);
				1026	}
				1027
				1028	/*
				1029	* With compound page geometry and when struct pages are stored in ram most
				1030	* tail pages are reused. Consequently, the amount of unique struct pages to
				1031	* initialize is a lot smaller than the total amount of struct pages being
				1032	* mapped. This is a paired / mild layering violation with explicit knowledge
				1033	* of how the sparse_vmemmap internals handle compound pages in the lack
				1034	* of an altmap. See vmemmap_populate_compound_pages().
				1035	*/
				1036	static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap,
Aneesh Kumar K.V	87a7ae7	2023-04-11 19:52:13 +0530	[diff] [blame]	1037	struct dev_pagemap *pgmap)
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1038	{
Aneesh Kumar K.V	87a7ae7	2023-04-11 19:52:13 +0530	[diff] [blame]	1039	if (!vmemmap_can_optimize(altmap, pgmap))
				1040	return pgmap_vmemmap_nr(pgmap);
				1041
Aneesh Kumar K.V	c1a6c53	2023-07-25 00:37:49 +0530	[diff] [blame]	1042	return VMEMMAP_RESERVE_NR * (PAGE_SIZE / sizeof(struct page));
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1043	}
				1044
				1045	static void __ref memmap_init_compound(struct page *head,
				1046	unsigned long head_pfn,
				1047	unsigned long zone_idx, int nid,
				1048	struct dev_pagemap *pgmap,
				1049	unsigned long nr_pages)
				1050	{
				1051	unsigned long pfn, end_pfn = head_pfn + nr_pages;
				1052	unsigned int order = pgmap->vmemmap_shift;
				1053
				1054	__SetPageHead(head);
				1055	for (pfn = head_pfn + 1; pfn < end_pfn; pfn++) {
				1056	struct page *page = pfn_to_page(pfn);
				1057
				1058	__init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
				1059	prep_compound_tail(head, pfn - head_pfn);
				1060	set_page_count(page, 0);
				1061
				1062	/*
				1063	* The first tail page stores important compound page info.
				1064	* Call prep_compound_head() after the first tail page has
				1065	* been initialized, to not have the data overwritten.
				1066	*/
				1067	if (pfn == head_pfn + 1)
				1068	prep_compound_head(head, order);
				1069	}
				1070	}
				1071
				1072	void __ref memmap_init_zone_device(struct zone *zone,
				1073	unsigned long start_pfn,
				1074	unsigned long nr_pages,
				1075	struct dev_pagemap *pgmap)
				1076	{
				1077	unsigned long pfn, end_pfn = start_pfn + nr_pages;
				1078	struct pglist_data *pgdat = zone->zone_pgdat;
				1079	struct vmem_altmap *altmap = pgmap_altmap(pgmap);
				1080	unsigned int pfns_per_compound = pgmap_vmemmap_nr(pgmap);
				1081	unsigned long zone_idx = zone_idx(zone);
				1082	unsigned long start = jiffies;
				1083	int nid = pgdat->node_id;
				1084
				1085	if (WARN_ON_ONCE(!pgmap \|\| zone_idx != ZONE_DEVICE))
				1086	return;
				1087
				1088	/*
				1089	* The call to memmap_init should have already taken care
				1090	* of the pages reserved for the memmap, so we can just jump to
				1091	* the end of that region and start processing the device pages.
				1092	*/
				1093	if (altmap) {
				1094	start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
				1095	nr_pages = end_pfn - start_pfn;
				1096	}
				1097
				1098	for (pfn = start_pfn; pfn < end_pfn; pfn += pfns_per_compound) {
				1099	struct page *page = pfn_to_page(pfn);
				1100
				1101	__init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
				1102
				1103	if (pfns_per_compound == 1)
				1104	continue;
				1105
				1106	memmap_init_compound(page, pfn, zone_idx, nid, pgmap,
Aneesh Kumar K.V	87a7ae7	2023-04-11 19:52:13 +0530	[diff] [blame]	1107	compound_nr_pages(altmap, pgmap));
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1108	}
				1109
Tomas Krcka	dd31bad	2023-03-23 17:43:49 +0000	[diff] [blame]	1110	pr_debug("%s initialised %lu pages in %ums\n", __func__,
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1111	nr_pages, jiffies_to_msecs(jiffies - start));
				1112	}
				1113	#endif
				1114
				1115	/*
				1116	* The zone ranges provided by the architecture do not include ZONE_MOVABLE
				1117	* because it is sized independent of architecture. Unlike the other zones,
				1118	* the starting point for ZONE_MOVABLE is not fixed. It may be different
				1119	* in each node depending on the size of each node and how evenly kernelcore
				1120	* is distributed. This helper function adjusts the zone ranges
				1121	* provided by the architecture for a given node by using the end of the
				1122	* highest usable zone for ZONE_MOVABLE. This preserves the assumption that
				1123	* zones within a node are in order of monotonically increasing memory addresses
				1124	*/
				1125	static void __init adjust_zone_range_for_zone_movable(int nid,
				1126	unsigned long zone_type,
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1127	unsigned long node_end_pfn,
				1128	unsigned long *zone_start_pfn,
				1129	unsigned long *zone_end_pfn)
				1130	{
				1131	/* Only adjust if ZONE_MOVABLE is on this node */
				1132	if (zone_movable_pfn[nid]) {
				1133	/* Size ZONE_MOVABLE */
				1134	if (zone_type == ZONE_MOVABLE) {
				1135	*zone_start_pfn = zone_movable_pfn[nid];
				1136	*zone_end_pfn = min(node_end_pfn,
				1137	arch_zone_highest_possible_pfn[movable_zone]);
				1138
				1139	/* Adjust for ZONE_MOVABLE starting within this range */
				1140	} else if (!mirrored_kernelcore &&
				1141	*zone_start_pfn < zone_movable_pfn[nid] &&
				1142	*zone_end_pfn > zone_movable_pfn[nid]) {
				1143	*zone_end_pfn = zone_movable_pfn[nid];
				1144
				1145	/* Check if this whole range is within ZONE_MOVABLE */
				1146	} else if (*zone_start_pfn >= zone_movable_pfn[nid])
				1147	zone_start_pfn = zone_end_pfn;
				1148	}
				1149	}
				1150
				1151	/*
				1152	* Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
				1153	* then all holes in the requested range will be accounted for.
				1154	*/
Baoquan He	b6dd945	2024-03-26 14:11:29 +0800	[diff] [blame]	1155	static unsigned long __init __absent_pages_in_range(int nid,
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1156	unsigned long range_start_pfn,
				1157	unsigned long range_end_pfn)
				1158	{
				1159	unsigned long nr_absent = range_end_pfn - range_start_pfn;
				1160	unsigned long start_pfn, end_pfn;
				1161	int i;
				1162
				1163	for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
				1164	start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
				1165	end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
				1166	nr_absent -= end_pfn - start_pfn;
				1167	}
				1168	return nr_absent;
				1169	}
				1170
				1171	/**
				1172	* absent_pages_in_range - Return number of page frames in holes within a range
				1173	* @start_pfn: The start PFN to start searching for holes
				1174	* @end_pfn: The end PFN to stop searching for holes
				1175	*
				1176	* Return: the number of pages frames in memory holes within a range.
				1177	*/
				1178	unsigned long __init absent_pages_in_range(unsigned long start_pfn,
				1179	unsigned long end_pfn)
				1180	{
				1181	return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
				1182	}
				1183
				1184	/* Return the number of page frames in holes in a zone on a node */
				1185	static unsigned long __init zone_absent_pages_in_node(int nid,
				1186	unsigned long zone_type,
Haifeng Xu	1c2d252	2023-05-26 08:52:51 +0000	[diff] [blame]	1187	unsigned long zone_start_pfn,
				1188	unsigned long zone_end_pfn)
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1189	{
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1190	unsigned long nr_absent;
				1191
Haifeng Xu	1c2d252	2023-05-26 08:52:51 +0000	[diff] [blame]	1192	/* zone is empty, we don't have any absent pages */
				1193	if (zone_start_pfn == zone_end_pfn)
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1194	return 0;
				1195
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1196	nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
				1197
				1198	/*
				1199	* ZONE_MOVABLE handling.
				1200	* Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
				1201	* and vice versa.
				1202	*/
				1203	if (mirrored_kernelcore && zone_movable_pfn[nid]) {
				1204	unsigned long start_pfn, end_pfn;
				1205	struct memblock_region *r;
				1206
				1207	for_each_mem_region(r) {
				1208	start_pfn = clamp(memblock_region_memory_base_pfn(r),
				1209	zone_start_pfn, zone_end_pfn);
				1210	end_pfn = clamp(memblock_region_memory_end_pfn(r),
				1211	zone_start_pfn, zone_end_pfn);
				1212
				1213	if (zone_type == ZONE_MOVABLE &&
				1214	memblock_is_mirror(r))
				1215	nr_absent += end_pfn - start_pfn;
				1216
				1217	if (zone_type == ZONE_NORMAL &&
				1218	!memblock_is_mirror(r))
				1219	nr_absent += end_pfn - start_pfn;
				1220	}
				1221	}
				1222
				1223	return nr_absent;
				1224	}
				1225
				1226	/*
				1227	* Return the number of pages a zone spans in a node, including holes
				1228	* present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
				1229	*/
				1230	static unsigned long __init zone_spanned_pages_in_node(int nid,
				1231	unsigned long zone_type,
				1232	unsigned long node_start_pfn,
				1233	unsigned long node_end_pfn,
				1234	unsigned long *zone_start_pfn,
				1235	unsigned long *zone_end_pfn)
				1236	{
				1237	unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
				1238	unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1239
				1240	/* Get the start and end of the zone */
				1241	*zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
				1242	*zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
Haifeng Xu	0792e47d	2023-07-17 06:58:11 +0000	[diff] [blame]	1243	adjust_zone_range_for_zone_movable(nid, zone_type, node_end_pfn,
				1244	zone_start_pfn, zone_end_pfn);
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1245
				1246	/* Check that this node has pages within the zone's required range */
				1247	if (zone_end_pfn < node_start_pfn \|\| zone_start_pfn > node_end_pfn)
				1248	return 0;
				1249
				1250	/* Move the zone boundaries inside the node if necessary */
				1251	zone_end_pfn = min(zone_end_pfn, node_end_pfn);
				1252	zone_start_pfn = max(zone_start_pfn, node_start_pfn);
				1253
				1254	/* Return the spanned pages */
				1255	return zone_end_pfn - zone_start_pfn;
				1256	}
				1257
Haifeng Xu	ba1b67c	2023-05-26 08:52:50 +0000	[diff] [blame]	1258	static void __init reset_memoryless_node_totalpages(struct pglist_data *pgdat)
				1259	{
				1260	struct zone *z;
				1261
				1262	for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) {
				1263	z->zone_start_pfn = 0;
				1264	z->spanned_pages = 0;
				1265	z->present_pages = 0;
				1266	#if defined(CONFIG_MEMORY_HOTPLUG)
				1267	z->present_early_pages = 0;
				1268	#endif
				1269	}
				1270
				1271	pgdat->node_spanned_pages = 0;
				1272	pgdat->node_present_pages = 0;
				1273	pr_debug("On node %d totalpages: 0\n", pgdat->node_id);
				1274	}
				1275
Baoquan He	8ad4184	2024-03-25 22:56:43 +0800	[diff] [blame]	1276	static void __init calc_nr_kernel_pages(void)
				1277	{
				1278	unsigned long start_pfn, end_pfn;
				1279	phys_addr_t start_addr, end_addr;
				1280	u64 u;
				1281	#ifdef CONFIG_HIGHMEM
				1282	unsigned long high_zone_low = arch_zone_lowest_possible_pfn[ZONE_HIGHMEM];
				1283	#endif
				1284
				1285	for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start_addr, &end_addr, NULL) {
				1286	start_pfn = PFN_UP(start_addr);
				1287	end_pfn = PFN_DOWN(end_addr);
				1288
				1289	if (start_pfn < end_pfn) {
				1290	nr_all_pages += end_pfn - start_pfn;
				1291	#ifdef CONFIG_HIGHMEM
				1292	start_pfn = clamp(start_pfn, 0, high_zone_low);
				1293	end_pfn = clamp(end_pfn, 0, high_zone_low);
				1294	#endif
				1295	nr_kernel_pages += end_pfn - start_pfn;
				1296	}
				1297	}
				1298	}
				1299
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1300	static void __init calculate_node_totalpages(struct pglist_data *pgdat,
				1301	unsigned long node_start_pfn,
				1302	unsigned long node_end_pfn)
				1303	{
				1304	unsigned long realtotalpages = 0, totalpages = 0;
				1305	enum zone_type i;
				1306
				1307	for (i = 0; i < MAX_NR_ZONES; i++) {
				1308	struct zone *zone = pgdat->node_zones + i;
				1309	unsigned long zone_start_pfn, zone_end_pfn;
				1310	unsigned long spanned, absent;
Haifeng Xu	1c2d252	2023-05-26 08:52:51 +0000	[diff] [blame]	1311	unsigned long real_size;
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1312
				1313	spanned = zone_spanned_pages_in_node(pgdat->node_id, i,
				1314	node_start_pfn,
				1315	node_end_pfn,
				1316	&zone_start_pfn,
				1317	&zone_end_pfn);
				1318	absent = zone_absent_pages_in_node(pgdat->node_id, i,
Haifeng Xu	1c2d252	2023-05-26 08:52:51 +0000	[diff] [blame]	1319	zone_start_pfn,
				1320	zone_end_pfn);
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1321
Haifeng Xu	1c2d252	2023-05-26 08:52:51 +0000	[diff] [blame]	1322	real_size = spanned - absent;
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1323
Haifeng Xu	1c2d252	2023-05-26 08:52:51 +0000	[diff] [blame]	1324	if (spanned)
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1325	zone->zone_start_pfn = zone_start_pfn;
				1326	else
				1327	zone->zone_start_pfn = 0;
Haifeng Xu	1c2d252	2023-05-26 08:52:51 +0000	[diff] [blame]	1328	zone->spanned_pages = spanned;
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1329	zone->present_pages = real_size;
				1330	#if defined(CONFIG_MEMORY_HOTPLUG)
				1331	zone->present_early_pages = real_size;
				1332	#endif
				1333
Haifeng Xu	1c2d252	2023-05-26 08:52:51 +0000	[diff] [blame]	1334	totalpages += spanned;
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1335	realtotalpages += real_size;
				1336	}
				1337
				1338	pgdat->node_spanned_pages = totalpages;
				1339	pgdat->node_present_pages = realtotalpages;
				1340	pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
				1341	}
				1342
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* Set up the node's deferred split queue: empty list, zero length, lock. */
static void pgdat_init_split_queue(struct pglist_data *pgdat)
{
	struct deferred_split *ds_queue = &pgdat->deferred_split_queue;

	ds_queue->split_queue_len = 0;
	INIT_LIST_HEAD(&ds_queue->split_queue);
	spin_lock_init(&ds_queue->split_queue_lock);
}
#else
/* Nothing to set up when CONFIG_TRANSPARENT_HUGEPAGE is not set. */
static void pgdat_init_split_queue(struct pglist_data *pgdat)
{
}
#endif
				1355
				1356	#ifdef CONFIG_COMPACTION
				1357	static void pgdat_init_kcompactd(struct pglist_data *pgdat)
				1358	{
				1359	init_waitqueue_head(&pgdat->kcompactd_wait);
				1360	}
				1361	#else
				1362	static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
				1363	#endif
				1364
				1365	static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
				1366	{
				1367	int i;
				1368
				1369	pgdat_resize_init(pgdat);
				1370	pgdat_kswapd_lock_init(pgdat);
				1371
				1372	pgdat_init_split_queue(pgdat);
				1373	pgdat_init_kcompactd(pgdat);
				1374
				1375	init_waitqueue_head(&pgdat->kswapd_wait);
				1376	init_waitqueue_head(&pgdat->pfmemalloc_wait);
				1377
				1378	for (i = 0; i < NR_VMSCAN_THROTTLE; i++)
				1379	init_waitqueue_head(&pgdat->reclaim_wait[i]);
				1380
				1381	pgdat_page_ext_init(pgdat);
				1382	lruvec_init(&pgdat->__lruvec);
				1383	}
				1384
				1385	static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
				1386	unsigned long remaining_pages)
				1387	{
				1388	atomic_long_set(&zone->managed_pages, remaining_pages);
				1389	zone_set_nid(zone, nid);
				1390	zone->name = zone_names[idx];
				1391	zone->zone_pgdat = NODE_DATA(nid);
				1392	spin_lock_init(&zone->lock);
				1393	zone_seqlock_init(zone);
				1394	zone_pcp_init(zone);
				1395	}
				1396
				1397	static void __meminit zone_init_free_lists(struct zone *zone)
				1398	{
				1399	unsigned int order, t;
				1400	for_each_migratetype_order(order, t) {
				1401	INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
				1402	zone->free_area[order].nr_free = 0;
				1403	}
Kirill A. Shutemov	dcdfdd4	2023-06-06 17:26:29 +0300	[diff] [blame]	1404
				1405	#ifdef CONFIG_UNACCEPTED_MEMORY
				1406	INIT_LIST_HEAD(&zone->unaccepted_pages);
				1407	#endif
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1408	}
				1409
				1410	void __meminit init_currently_empty_zone(struct zone *zone,
				1411	unsigned long zone_start_pfn,
				1412	unsigned long size)
				1413	{
				1414	struct pglist_data *pgdat = zone->zone_pgdat;
				1415	int zone_idx = zone_idx(zone) + 1;
				1416
				1417	if (zone_idx > pgdat->nr_zones)
				1418	pgdat->nr_zones = zone_idx;
				1419
				1420	zone->zone_start_pfn = zone_start_pfn;
				1421
				1422	mminit_dprintk(MMINIT_TRACE, "memmap_init",
				1423	"Initialising map node %d zone %lu pfns %lu -> %lu\n",
				1424	pgdat->node_id,
				1425	(unsigned long)zone_idx(zone),
				1426	zone_start_pfn, (zone_start_pfn + size));
				1427
				1428	zone_init_free_lists(zone);
				1429	zone->initialized = 1;
				1430	}
				1431
				1432	#ifndef CONFIG_SPARSEMEM
				1433	/*
				1434	* Calculate the size of the zone->blockflags rounded to an unsigned long
				1435	* Start by making sure zonesize is a multiple of pageblock_order by rounding
				1436	* up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
				1437	* round what is now in bits to nearest long in bits, then return it in
				1438	* bytes.
				1439	*/
				1440	static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
				1441	{
				1442	unsigned long usemapsize;
				1443
				1444	zonesize += zone_start_pfn & (pageblock_nr_pages-1);
				1445	usemapsize = roundup(zonesize, pageblock_nr_pages);
				1446	usemapsize = usemapsize >> pageblock_order;
				1447	usemapsize *= NR_PAGEBLOCK_BITS;
Miaohe Lin	daee07b	2023-08-07 10:35:28 +0800	[diff] [blame]	1448	usemapsize = roundup(usemapsize, BITS_PER_LONG);
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1449
Miaohe Lin	daee07b	2023-08-07 10:35:28 +0800	[diff] [blame]	1450	return usemapsize / BITS_PER_BYTE;
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1451	}
				1452
				1453	static void __ref setup_usemap(struct zone *zone)
				1454	{
				1455	unsigned long usemapsize = usemap_size(zone->zone_start_pfn,
				1456	zone->spanned_pages);
				1457	zone->pageblock_flags = NULL;
				1458	if (usemapsize) {
				1459	zone->pageblock_flags =
				1460	memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
				1461	zone_to_nid(zone));
				1462	if (!zone->pageblock_flags)
				1463	panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
				1464	usemapsize, zone->name, zone_to_nid(zone));
				1465	}
				1466	}
				1467	#else
				1468	static inline void setup_usemap(struct zone *zone) {}
				1469	#endif /* CONFIG_SPARSEMEM */
				1470
				1471	#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
				1472
				1473	/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
				1474	void __init set_pageblock_order(void)
				1475	{
Kirill A. Shutemov	5e0a760	2023-12-28 17:47:04 +0300	[diff] [blame]	1476	unsigned int order = MAX_PAGE_ORDER;
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1477
				1478	/* Check that pageblock_nr_pages has not already been setup */
				1479	if (pageblock_order)
				1480	return;
				1481
				1482	/* Don't let pageblocks exceed the maximum allocation granularity. */
				1483	if (HPAGE_SHIFT > PAGE_SHIFT && HUGETLB_PAGE_ORDER < order)
				1484	order = HUGETLB_PAGE_ORDER;
				1485
				1486	/*
				1487	* Assume the largest contiguous order of interest is a huge page.
Kefeng Wang	e99fb98	2023-12-22 15:02:03 +0800	[diff] [blame]	1488	* This value may be variable depending on boot parameters on powerpc.
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1489	*/
				1490	pageblock_order = order;
				1491	}
				1492	#else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
				1493
				1494	/*
				1495	* When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
				1496	* is unused as pageblock_order is set at compile-time. See
				1497	* include/linux/pageblock-flags.h for the values of pageblock_order based on
				1498	* the kernel config
				1499	*/
				1500	void __init set_pageblock_order(void)
				1501	{
				1502	}
				1503
				1504	#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
				1505
				1506	/*
				1507	* Set up the zone data structures
				1508	* - init pgdat internals
				1509	* - init all zones belonging to this node
				1510	*
				1511	* NOTE: this function is only called during memory hotplug
				1512	*/
				1513	#ifdef CONFIG_MEMORY_HOTPLUG
				1514	void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
				1515	{
				1516	int nid = pgdat->node_id;
				1517	enum zone_type z;
				1518	int cpu;
				1519
				1520	pgdat_init_internals(pgdat);
				1521
				1522	if (pgdat->per_cpu_nodestats == &boot_nodestats)
				1523	pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
				1524
				1525	/*
				1526	* Reset the nr_zones, order and highest_zoneidx before reuse.
				1527	* Note that kswapd will init kswapd_highest_zoneidx properly
				1528	* when it starts in the near future.
				1529	*/
				1530	pgdat->nr_zones = 0;
				1531	pgdat->kswapd_order = 0;
				1532	pgdat->kswapd_highest_zoneidx = 0;
				1533	pgdat->node_start_pfn = 0;
Haifeng Xu	32b6a4a	2023-06-07 02:50:56 +0000	[diff] [blame]	1534	pgdat->node_present_pages = 0;
				1535
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1536	for_each_online_cpu(cpu) {
				1537	struct per_cpu_nodestat *p;
				1538
				1539	p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
				1540	memset(p, 0, sizeof(*p));
				1541	}
				1542
Haifeng Xu	32b6a4a	2023-06-07 02:50:56 +0000	[diff] [blame]	1543	/*
				1544	* When memory is hot-added, all the memory is in offline state. So
				1545	* clear all zones' present_pages and managed_pages because they will
				1546	* be updated in online_pages() and offline_pages().
				1547	*/
				1548	for (z = 0; z < MAX_NR_ZONES; z++) {
				1549	struct zone *zone = pgdat->node_zones + z;
				1550
				1551	zone->present_pages = 0;
				1552	zone_init_internals(zone, z, nid, 0);
				1553	}
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1554	}
				1555	#endif
				1556
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1557	static void __init free_area_init_core(struct pglist_data *pgdat)
				1558	{
				1559	enum zone_type j;
				1560	int nid = pgdat->node_id;
				1561
				1562	pgdat_init_internals(pgdat);
				1563	pgdat->per_cpu_nodestats = &boot_nodestats;
				1564
				1565	for (j = 0; j < MAX_NR_ZONES; j++) {
				1566	struct zone *zone = pgdat->node_zones + j;
Baoquan He	0ac5e78	2024-03-25 22:56:44 +0800	[diff] [blame]	1567	unsigned long size = zone->spanned_pages;
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1568
				1569	/*
Baoquan He	0ac5e78	2024-03-25 22:56:44 +0800	[diff] [blame]	1570	* Initialize zone->managed_pages as 0 , it will be reset
				1571	* when memblock allocator frees pages into buddy system.
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1572	*/
Baoquan He	0ac5e78	2024-03-25 22:56:44 +0800	[diff] [blame]	1573	zone_init_internals(zone, j, nid, zone->present_pages);
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1574
				1575	if (!size)
				1576	continue;
				1577
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1578	setup_usemap(zone);
				1579	init_currently_empty_zone(zone, zone->zone_start_pfn, size);
				1580	}
				1581	}
				1582
				1583	void __init *memmap_alloc(phys_addr_t size, phys_addr_t align,
				1584	phys_addr_t min_addr, int nid, bool exact_nid)
				1585	{
				1586	void *ptr;
				1587
Guo Weikang	b2aad24	2025-01-06 10:11:25 +0800	[diff] [blame]	1588	/*
				1589	* Kmemleak will explicitly scan mem_map by traversing all valid
				1590	* `struct *page`,so memblock does not need to be added to the scan list.
				1591	*/
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1592	if (exact_nid)
				1593	ptr = memblock_alloc_exact_nid_raw(size, align, min_addr,
Guo Weikang	b2aad24	2025-01-06 10:11:25 +0800	[diff] [blame]	1594	MEMBLOCK_ALLOC_NOLEAKTRACE,
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1595	nid);
				1596	else
				1597	ptr = memblock_alloc_try_nid_raw(size, align, min_addr,
Guo Weikang	b2aad24	2025-01-06 10:11:25 +0800	[diff] [blame]	1598	MEMBLOCK_ALLOC_NOLEAKTRACE,
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1599	nid);
				1600
				1601	if (ptr && size > 0)
				1602	page_init_poison(ptr, size);
				1603
				1604	return ptr;
				1605	}
				1606
				1607	#ifdef CONFIG_FLATMEM
				1608	static void __init alloc_node_mem_map(struct pglist_data *pgdat)
				1609	{
Kefeng Wang	e99fb98	2023-12-22 15:02:03 +0800	[diff] [blame]	1610	unsigned long start, offset, size, end;
				1611	struct page *map;
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1612
				1613	/* Skip empty nodes */
				1614	if (!pgdat->node_spanned_pages)
				1615	return;
				1616
				1617	start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
				1618	offset = pgdat->node_start_pfn - start;
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1619	/*
Kirill A. Shutemov	5e0a760	2023-12-28 17:47:04 +0300	[diff] [blame]	1620	* The zone's endpoints aren't required to be MAX_PAGE_ORDER
Kefeng Wang	e99fb98	2023-12-22 15:02:03 +0800	[diff] [blame]	1621	* aligned but the node_mem_map endpoints must be in order
				1622	* for the buddy allocator to function correctly.
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1623	*/
Kefeng Wang	e99fb98	2023-12-22 15:02:03 +0800	[diff] [blame]	1624	end = ALIGN(pgdat_end_pfn(pgdat), MAX_ORDER_NR_PAGES);
				1625	size = (end - start) * sizeof(struct page);
				1626	map = memmap_alloc(size, SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT,
				1627	pgdat->node_id, false);
				1628	if (!map)
				1629	panic("Failed to allocate %ld bytes for node %d memory map\n",
				1630	size, pgdat->node_id);
				1631	pgdat->node_mem_map = map + offset;
Pasha Tatashin	9d85731	2024-08-08 21:34:36 +0000	[diff] [blame]	1632	memmap_boot_pages_add(DIV_ROUND_UP(size, PAGE_SIZE));
Kefeng Wang	e99fb98	2023-12-22 15:02:03 +0800	[diff] [blame]	1633	pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
				1634	__func__, pgdat->node_id, (unsigned long)pgdat,
				1635	(unsigned long)pgdat->node_mem_map);
				1636	#ifndef CONFIG_NUMA
				1637	/* the global mem_map is just set as node 0's */
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1638	if (pgdat == NODE_DATA(0)) {
				1639	mem_map = NODE_DATA(0)->node_mem_map;
				1640	if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
				1641	mem_map -= offset;
				1642	}
				1643	#endif
				1644	}
				1645	#else
				1646	static inline void alloc_node_mem_map(struct pglist_data *pgdat) { }
				1647	#endif /* CONFIG_FLATMEM */
				1648
				1649	/**
				1650	* get_pfn_range_for_nid - Return the start and end page frames for a node
				1651	* @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
				1652	* @start_pfn: Passed by reference. On return, it will have the node start_pfn.
				1653	* @end_pfn: Passed by reference. On return, it will have the node end_pfn.
				1654	*
				1655	* It returns the start and end page frame of a node based on information
				1656	* provided by memblock_set_node(). If called for a node
Miaohe Lin	3a29280	2023-06-25 11:33:40 +0800	[diff] [blame]	1657	* with no available memory, the start and end PFNs will be 0.
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1658	*/
				1659	void __init get_pfn_range_for_nid(unsigned int nid,
				1660	unsigned long start_pfn, unsigned long end_pfn)
				1661	{
				1662	unsigned long this_start_pfn, this_end_pfn;
				1663	int i;
				1664
				1665	*start_pfn = -1UL;
				1666	*end_pfn = 0;
				1667
				1668	for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
				1669	start_pfn = min(start_pfn, this_start_pfn);
				1670	end_pfn = max(end_pfn, this_end_pfn);
				1671	}
				1672
				1673	if (*start_pfn == -1UL)
				1674	*start_pfn = 0;
				1675	}
				1676
				1677	static void __init free_area_init_node(int nid)
				1678	{
				1679	pg_data_t *pgdat = NODE_DATA(nid);
				1680	unsigned long start_pfn = 0;
				1681	unsigned long end_pfn = 0;
				1682
				1683	/* pg_data_t should be reset to zero when it's allocated */
				1684	WARN_ON(pgdat->nr_zones \|\| pgdat->kswapd_highest_zoneidx);
				1685
				1686	get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
				1687
				1688	pgdat->node_id = nid;
				1689	pgdat->node_start_pfn = start_pfn;
				1690	pgdat->per_cpu_nodestats = NULL;
				1691
				1692	if (start_pfn != end_pfn) {
				1693	pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
				1694	(u64)start_pfn << PAGE_SHIFT,
				1695	end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
Haifeng Xu	ba1b67c	2023-05-26 08:52:50 +0000	[diff] [blame]	1696
				1697	calculate_node_totalpages(pgdat, start_pfn, end_pfn);
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1698	} else {
				1699	pr_info("Initmem setup node %d as memoryless\n", nid);
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1700
Haifeng Xu	ba1b67c	2023-05-26 08:52:50 +0000	[diff] [blame]	1701	reset_memoryless_node_totalpages(pgdat);
				1702	}
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1703
				1704	alloc_node_mem_map(pgdat);
				1705	pgdat_set_deferred_range(pgdat);
				1706
				1707	free_area_init_core(pgdat);
				1708	lru_gen_init_pgdat(pgdat);
				1709	}
				1710
				1711	/* Any regular or high memory on that node ? */
Haifeng Xu	b894da0	2023-07-10 09:37:50 +0000	[diff] [blame]	1712	static void __init check_for_memory(pg_data_t *pgdat)
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1713	{
				1714	enum zone_type zone_type;
				1715
				1716	for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
				1717	struct zone *zone = &pgdat->node_zones[zone_type];
				1718	if (populated_zone(zone)) {
				1719	if (IS_ENABLED(CONFIG_HIGHMEM))
Haifeng Xu	91ff4d7	2023-06-07 03:24:02 +0000	[diff] [blame]	1720	node_set_state(pgdat->node_id, N_HIGH_MEMORY);
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1721	if (zone_type <= ZONE_NORMAL)
Haifeng Xu	91ff4d7	2023-06-07 03:24:02 +0000	[diff] [blame]	1722	node_set_state(pgdat->node_id, N_NORMAL_MEMORY);
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1723	break;
				1724	}
				1725	}
				1726	}
				1727
				1728	#if MAX_NUMNODES > 1
				1729	/*
				1730	* Figure out the number of possible node ids.
				1731	*/
				1732	void __init setup_nr_node_ids(void)
				1733	{
				1734	unsigned int highest;
				1735
				1736	highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
				1737	nr_node_ids = highest + 1;
				1738	}
				1739	#endif
				1740
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1741	/*
				1742	* Some architectures, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For
				1743	* such cases we allow max_zone_pfn sorted in the descending order
				1744	*/
Arnd Bergmann	5f300fd	2023-04-14 10:03:53 +0200	[diff] [blame]	1745	static bool arch_has_descending_max_zone_pfns(void)
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1746	{
Arnd Bergmann	5f300fd	2023-04-14 10:03:53 +0200	[diff] [blame]	1747	return IS_ENABLED(CONFIG_ARC) && !IS_ENABLED(CONFIG_ARC_HAS_PAE40);
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1748	}
				1749
				1750	/**
				1751	* free_area_init - Initialise all pg_data_t and zone data
				1752	* @max_zone_pfn: an array of max PFNs for each zone
				1753	*
				1754	* This will call free_area_init_node() for each active node in the system.
				1755	* Using the page ranges provided by memblock_set_node(), the size of each
				1756	* zone in each node and their holes is calculated. If the maximum PFN
				1757	* between two adjacent zones match, it is assumed that the zone is empty.
				1758	* For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
				1759	* that arch_max_dma32_pfn has no pages. It is also assumed that a zone
				1760	* starts where the previous one ended. For example, ZONE_DMA32 starts
				1761	* at arch_max_dma_pfn.
				1762	*/
				1763	void __init free_area_init(unsigned long *max_zone_pfn)
				1764	{
				1765	unsigned long start_pfn, end_pfn;
				1766	int i, nid, zone;
				1767	bool descending;
				1768
				1769	/* Record where the zone boundaries are */
				1770	memset(arch_zone_lowest_possible_pfn, 0,
				1771	sizeof(arch_zone_lowest_possible_pfn));
				1772	memset(arch_zone_highest_possible_pfn, 0,
				1773	sizeof(arch_zone_highest_possible_pfn));
				1774
				1775	start_pfn = PHYS_PFN(memblock_start_of_DRAM());
				1776	descending = arch_has_descending_max_zone_pfns();
				1777
				1778	for (i = 0; i < MAX_NR_ZONES; i++) {
				1779	if (descending)
				1780	zone = MAX_NR_ZONES - i - 1;
				1781	else
				1782	zone = i;
				1783
				1784	if (zone == ZONE_MOVABLE)
				1785	continue;
				1786
				1787	end_pfn = max(max_zone_pfn[zone], start_pfn);
				1788	arch_zone_lowest_possible_pfn[zone] = start_pfn;
				1789	arch_zone_highest_possible_pfn[zone] = end_pfn;
				1790
				1791	start_pfn = end_pfn;
				1792	}
				1793
				1794	/* Find the PFNs that ZONE_MOVABLE begins at in each node */
				1795	memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
				1796	find_zone_movable_pfns_for_nodes();
				1797
				1798	/* Print out the zone ranges */
				1799	pr_info("Zone ranges:\n");
				1800	for (i = 0; i < MAX_NR_ZONES; i++) {
				1801	if (i == ZONE_MOVABLE)
				1802	continue;
				1803	pr_info(" %-8s ", zone_names[i]);
				1804	if (arch_zone_lowest_possible_pfn[i] ==
				1805	arch_zone_highest_possible_pfn[i])
				1806	pr_cont("empty\n");
				1807	else
				1808	pr_cont("[mem %#018Lx-%#018Lx]\n",
				1809	(u64)arch_zone_lowest_possible_pfn[i]
				1810	<< PAGE_SHIFT,
				1811	((u64)arch_zone_highest_possible_pfn[i]
				1812	<< PAGE_SHIFT) - 1);
				1813	}
				1814
				1815	/* Print out the PFNs ZONE_MOVABLE begins at in each node */
				1816	pr_info("Movable zone start for each node\n");
				1817	for (i = 0; i < MAX_NUMNODES; i++) {
				1818	if (zone_movable_pfn[i])
				1819	pr_info(" Node %d: %#018Lx\n", i,
				1820	(u64)zone_movable_pfn[i] << PAGE_SHIFT);
				1821	}
				1822
				1823	/*
				1824	* Print out the early node map, and initialize the
				1825	* subsection-map relative to active online memory ranges to
				1826	* enable future "sub-section" extensions of the memory map.
				1827	*/
				1828	pr_info("Early memory node ranges\n");
				1829	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
				1830	pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
				1831	(u64)start_pfn << PAGE_SHIFT,
				1832	((u64)end_pfn << PAGE_SHIFT) - 1);
				1833	subsection_map_init(start_pfn, end_pfn - start_pfn);
				1834	}
				1835
				1836	/* Initialise every node */
				1837	mminit_verify_pageflags_layout();
				1838	setup_nr_node_ids();
Haifeng Xu	e3d9b45	2023-06-01 06:35:35 +0000	[diff] [blame]	1839	set_pageblock_order();
				1840
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1841	for_each_node(nid) {
				1842	pg_data_t *pgdat;
				1843
Mike Rapoport (Microsoft)	ec164cf	2024-08-07 09:40:52 +0300	[diff] [blame]	1844	if (!node_online(nid))
				1845	alloc_offline_node_data(nid);
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1846
				1847	pgdat = NODE_DATA(nid);
				1848	free_area_init_node(nid);
				1849
Baoquan He	c091dd9	2024-03-26 14:11:28 +0800	[diff] [blame]	1850	/*
				1851	* No sysfs hierarcy will be created via register_one_node()
				1852	*for memory-less node because here it's not marked as N_MEMORY
				1853	*and won't be set online later. The benefit is userspace
				1854	*program won't be confused by sysfs files/directories of
				1855	*memory-less node. The pgdat will get fully initialized by
				1856	*hotadd_init_pgdat() when memory is hotplugged into this node.
				1857	*/
				1858	if (pgdat->node_present_pages) {
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1859	node_set_state(nid, N_MEMORY);
Baoquan He	c091dd9	2024-03-26 14:11:28 +0800	[diff] [blame]	1860	check_for_memory(pgdat);
				1861	}
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1862	}
				1863
Baoquan He	0ac5e78	2024-03-25 22:56:44 +0800	[diff] [blame]	1864	calc_nr_kernel_pages();
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1865	memmap_init();
Mike Rapoport (IBM)	534ef4e	2023-03-21 19:05:03 +0200	[diff] [blame]	1866
				1867	/* disable hash distribution for systems with a single node */
				1868	fixup_hashdist();
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1869	}
				1870
				1871	/**
				1872	* node_map_pfn_alignment - determine the maximum internode alignment
				1873	*
				1874	* This function should be called after node map is populated and sorted.
				1875	* It calculates the maximum power of two alignment which can distinguish
				1876	* all the nodes.
				1877	*
				1878	* For example, if all nodes are 1GiB and aligned to 1GiB, the return value
				1879	* would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
				1880	* nodes are shifted by 256MiB, 256MiB. Note that if only the last node is
				1881	* shifted, 1GiB is enough and this function will indicate so.
				1882	*
				1883	* This is used to test whether pfn -> nid mapping of the chosen memory
				1884	* model has fine enough granularity to avoid incorrect mapping for the
				1885	* populated node map.
				1886	*
				1887	* Return: the determined alignment in pfn's. 0 if there is no alignment
				1888	* requirement (single node).
				1889	*/
				1890	unsigned long __init node_map_pfn_alignment(void)
				1891	{
				1892	unsigned long accl_mask = 0, last_end = 0;
				1893	unsigned long start, end, mask;
				1894	int last_nid = NUMA_NO_NODE;
				1895	int i, nid;
				1896
				1897	for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
				1898	if (!start \|\| last_nid < 0 \|\| last_nid == nid) {
				1899	last_nid = nid;
				1900	last_end = end;
				1901	continue;
				1902	}
				1903
				1904	/*
				1905	* Start with a mask granular enough to pin-point to the
				1906	* start pfn and tick off bits one-by-one until it becomes
				1907	* too coarse to separate the current node from the last.
				1908	*/
				1909	mask = ~((1 << __ffs(start)) - 1);
				1910	while (mask && last_end <= (start & (mask << 1)))
				1911	mask <<= 1;
				1912
				1913	/* accumulate all internode masks */
				1914	accl_mask \|= mask;
				1915	}
				1916
				1917	/* convert mask to number of pages */
				1918	return ~accl_mask + 1;
				1919	}
				1920
				1921	#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
Wei Yang	972b89c1	2024-06-12 02:04:21 +0000	[diff] [blame]	1922	static void __init deferred_free_pages(unsigned long pfn,
				1923	unsigned long nr_pages)
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1924	{
				1925	struct page *page;
				1926	unsigned long i;
				1927
				1928	if (!nr_pages)
				1929	return;
				1930
				1931	page = pfn_to_page(pfn);
				1932
				1933	/* Free a large naturally-aligned chunk if possible */
Kirill A. Shutemov	3f6dac0	2023-03-21 03:24:15 +0300	[diff] [blame]	1934	if (nr_pages == MAX_ORDER_NR_PAGES && IS_MAX_ORDER_ALIGNED(pfn)) {
				1935	for (i = 0; i < nr_pages; i += pageblock_nr_pages)
				1936	set_pageblock_migratetype(page + i, MIGRATE_MOVABLE);
David Hildenbrand	13c5265	2024-06-07 11:09:36 +0200	[diff] [blame]	1937	__free_pages_core(page, MAX_PAGE_ORDER, MEMINIT_EARLY);
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1938	return;
				1939	}
				1940
Kirill A. Shutemov	5e0a760	2023-12-28 17:47:04 +0300	[diff] [blame]	1941	/* Accept chunks smaller than MAX_PAGE_ORDER upfront */
Kirill A. Shutemov	5adfeae	2024-08-09 14:48:51 +0300	[diff] [blame]	1942	accept_memory(PFN_PHYS(pfn), nr_pages * PAGE_SIZE);
Kirill A. Shutemov	dcdfdd4	2023-06-06 17:26:29 +0300	[diff] [blame]	1943
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1944	for (i = 0; i < nr_pages; i++, page++, pfn++) {
				1945	if (pageblock_aligned(pfn))
				1946	set_pageblock_migratetype(page, MIGRATE_MOVABLE);
David Hildenbrand	13c5265	2024-06-07 11:09:36 +0200	[diff] [blame]	1947	__free_pages_core(page, 0, MEMINIT_EARLY);
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1948	}
				1949	}
				1950
				1951	/* Completion tracking for deferred_init_memmap() threads */
				1952	static atomic_t pgdat_init_n_undone __initdata;
				1953	static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
				1954
				1955	static inline void __init pgdat_init_report_one_done(void)
				1956	{
				1957	if (atomic_dec_and_test(&pgdat_init_n_undone))
				1958	complete(&pgdat_init_all_done_comp);
				1959	}
				1960
				1961	/*
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1962	* Initialize struct pages. We minimize pfn page lookups and scheduler checks
Kirill A. Shutemov	3f6dac0	2023-03-21 03:24:15 +0300	[diff] [blame]	1963	* by performing it only once every MAX_ORDER_NR_PAGES.
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1964	* Return number of pages initialized.
				1965	*/
Wei Yang	972b89c1	2024-06-12 02:04:21 +0000	[diff] [blame]	1966	static unsigned long __init deferred_init_pages(struct zone *zone,
				1967	unsigned long pfn, unsigned long end_pfn)
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1968	{
				1969	int nid = zone_to_nid(zone);
Wei Yang	972b89c1	2024-06-12 02:04:21 +0000	[diff] [blame]	1970	unsigned long nr_pages = end_pfn - pfn;
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1971	int zid = zone_idx(zone);
Wei Yang	972b89c1	2024-06-12 02:04:21 +0000	[diff] [blame]	1972	struct page *page = pfn_to_page(pfn);
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1973
Wei Yang	972b89c1	2024-06-12 02:04:21 +0000	[diff] [blame]	1974	for (; pfn < end_pfn; pfn++, page++)
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1975	__init_single_page(page, pfn, zid, nid);
Baoquan He	f55d347	2024-03-26 14:11:31 +0800	[diff] [blame]	1976	return nr_pages;
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1977	}
				1978
				1979	/*
Wei Yang	f1180fd	2024-06-05 07:13:39 +0000	[diff] [blame]	1980	* This function is meant to pre-load the iterator for the zone init from
				1981	* a given point.
				1982	* Specifically it walks through the ranges starting with initial index
				1983	* passed to it until we are caught up to the first_init_pfn value and
				1984	* exits there. If we never encounter the value we return false indicating
				1985	* there are no valid ranges left.
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1986	*/
				1987	static bool __init
				1988	deferred_init_mem_pfn_range_in_zone(u64 i, struct zone zone,
				1989	unsigned long spfn, unsigned long epfn,
				1990	unsigned long first_init_pfn)
				1991	{
Wei Yang	f1180fd	2024-06-05 07:13:39 +0000	[diff] [blame]	1992	u64 j = *i;
				1993
				1994	if (j == 0)
				1995	__next_mem_pfn_range_in_zone(&j, zone, spfn, epfn);
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1996
				1997	/*
				1998	* Start out by walking through the ranges in this zone that have
				1999	* already been initialized. We don't need to do anything with them
				2000	* so we just need to flush them out of the system.
				2001	*/
Wei Yang	f1180fd	2024-06-05 07:13:39 +0000	[diff] [blame]	2002	for_each_free_mem_pfn_range_in_zone_from(j, zone, spfn, epfn) {
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	2003	if (*epfn <= first_init_pfn)
				2004	continue;
				2005	if (*spfn < first_init_pfn)
				2006	*spfn = first_init_pfn;
				2007	*i = j;
				2008	return true;
				2009	}
				2010
				2011	return false;
				2012	}
				2013
				2014	/*
				2015	* Initialize and free pages. We do it in two loops: first we initialize
				2016	* struct page, then free to buddy allocator, because while we are
				2017	* freeing pages we can access pages that are ahead (computing buddy
				2018	* page in __free_one_page()).
				2019	*
				2020	* In order to try and keep some memory in the cache we have the loop
				2021	* broken along max page order boundaries. This way we will not cause
				2022	* any issues with the buddy page computation.
				2023	*/
				2024	static unsigned long __init
				2025	deferred_init_maxorder(u64 i, struct zone zone, unsigned long *start_pfn,
				2026	unsigned long *end_pfn)
				2027	{
				2028	unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
				2029	unsigned long spfn = start_pfn, epfn = end_pfn;
				2030	unsigned long nr_pages = 0;
				2031	u64 j = *i;
				2032
				2033	/* First we loop through and initialize the page values */
				2034	for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
				2035	unsigned long t;
				2036
				2037	if (mo_pfn <= *start_pfn)
				2038	break;
				2039
				2040	t = min(mo_pfn, *end_pfn);
				2041	nr_pages += deferred_init_pages(zone, *start_pfn, t);
				2042
				2043	if (mo_pfn < *end_pfn) {
				2044	*start_pfn = mo_pfn;
				2045	break;
				2046	}
				2047	}
				2048
				2049	/* Reset values and now loop through freeing pages as needed */
				2050	swap(j, *i);
				2051
				2052	for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
				2053	unsigned long t;
				2054
				2055	if (mo_pfn <= spfn)
				2056	break;
				2057
				2058	t = min(mo_pfn, epfn);
Wei Yang	972b89c1	2024-06-12 02:04:21 +0000	[diff] [blame]	2059	deferred_free_pages(spfn, t - spfn);
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	2060
				2061	if (mo_pfn <= epfn)
				2062	break;
				2063	}
				2064
				2065	return nr_pages;
				2066	}
				2067
				2068	static void __init
				2069	deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
				2070	void *arg)
				2071	{
				2072	unsigned long spfn, epfn;
				2073	struct zone *zone = arg;
Wei Yang	f1180fd	2024-06-05 07:13:39 +0000	[diff] [blame]	2074	u64 i = 0;
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	2075
				2076	deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
				2077
				2078	/*
Kirill A. Shutemov	5e0a760	2023-12-28 17:47:04 +0300	[diff] [blame]	2079	* Initialize and free pages in MAX_PAGE_ORDER sized increments so that
				2080	* we can avoid introducing any issues with the buddy allocator.
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	2081	*/
				2082	while (spfn < end_pfn) {
				2083	deferred_init_maxorder(&i, zone, &spfn, &epfn);
				2084	cond_resched();
				2085	}
				2086	}
				2087
Eric Chanudet	188f87f	2024-05-22 16:38:01 -0400	[diff] [blame]	2088	static unsigned int __init
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	2089	deferred_page_init_max_threads(const struct cpumask *node_cpumask)
				2090	{
Eric Chanudet	188f87f	2024-05-22 16:38:01 -0400	[diff] [blame]	2091	return max(cpumask_weight(node_cpumask), 1U);
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	2092	}
				2093
				2094	/* Initialise remaining memory on a node */
				2095	static int __init deferred_init_memmap(void *data)
				2096	{
				2097	pg_data_t *pgdat = data;
				2098	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
				2099	unsigned long spfn = 0, epfn = 0;
				2100	unsigned long first_init_pfn, flags;
				2101	unsigned long start = jiffies;
				2102	struct zone *zone;
Wei Yang	ce8ebb9	2024-06-05 07:13:37 +0000	[diff] [blame]	2103	int max_threads;
Wei Yang	f1180fd	2024-06-05 07:13:39 +0000	[diff] [blame]	2104	u64 i = 0;
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	2105
				2106	/* Bind memory initialisation thread to a local node if possible */
				2107	if (!cpumask_empty(cpumask))
				2108	set_cpus_allowed_ptr(current, cpumask);
				2109
				2110	pgdat_resize_lock(pgdat, &flags);
				2111	first_init_pfn = pgdat->first_deferred_pfn;
				2112	if (first_init_pfn == ULONG_MAX) {
				2113	pgdat_resize_unlock(pgdat, &flags);
				2114	pgdat_init_report_one_done();
				2115	return 0;
				2116	}
				2117
				2118	/* Sanity check boundaries */
				2119	BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
				2120	BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
				2121	pgdat->first_deferred_pfn = ULONG_MAX;
				2122
				2123	/*
				2124	* Once we unlock here, the zone cannot be grown anymore, thus if an
				2125	* interrupt thread must allocate this early in boot, zone must be
				2126	* pre-grown prior to start of deferred page initialization.
				2127	*/
				2128	pgdat_resize_unlock(pgdat, &flags);
				2129
Wei Yang	ce8ebb9	2024-06-05 07:13:37 +0000	[diff] [blame]	2130	/* Only the highest zone is deferred */
				2131	zone = pgdat->node_zones + pgdat->nr_zones - 1;
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	2132
				2133	max_threads = deferred_page_init_max_threads(cpumask);
				2134
Wei Yang	544b8e1	2024-06-05 07:13:38 +0000	[diff] [blame]	2135	while (deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, first_init_pfn)) {
				2136	first_init_pfn = ALIGN(epfn, PAGES_PER_SECTION);
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	2137	struct padata_mt_job job = {
				2138	.thread_fn = deferred_init_memmap_chunk,
				2139	.fn_arg = zone,
				2140	.start = spfn,
Wei Yang	544b8e1	2024-06-05 07:13:38 +0000	[diff] [blame]	2141	.size = first_init_pfn - spfn,
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	2142	.align = PAGES_PER_SECTION,
				2143	.min_chunk = PAGES_PER_SECTION,
				2144	.max_threads = max_threads,
Gang Li Subject: padata: dispatch works on	eb52286	2024-03-06 13:04:17 -0800	[diff] [blame]	2145	.numa_aware = false,
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	2146	};
				2147
				2148	padata_do_multithreaded(&job);
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	2149	}
Wei Yang	544b8e1	2024-06-05 07:13:38 +0000	[diff] [blame]	2150
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	2151	/* Sanity check that the next zone really is unpopulated */
Wei Yang	ce8ebb9	2024-06-05 07:13:37 +0000	[diff] [blame]	2152	WARN_ON(pgdat->nr_zones < MAX_NR_ZONES && populated_zone(++zone));
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	2153
				2154	pr_info("node %d deferred pages initialised in %ums\n",
				2155	pgdat->node_id, jiffies_to_msecs(jiffies - start));
				2156
				2157	pgdat_init_report_one_done();
				2158	return 0;
				2159	}
				2160
				2161	/*
				2162	* If this zone has deferred pages, try to grow it by initializing enough
				2163	* deferred pages to satisfy the allocation specified by order, rounded up to
				2164	* the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments
				2165	* of SECTION_SIZE bytes by initializing struct pages in increments of
				2166	* PAGES_PER_SECTION * sizeof(struct page) bytes.
				2167	*
				2168	* Return true when zone was grown, otherwise return false. We return true even
				2169	* when we grow less than requested, to let the caller decide if there are
				2170	* enough pages to satisfy the allocation.
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	2171	*/
				2172	bool __init deferred_grow_zone(struct zone *zone, unsigned int order)
				2173	{
				2174	unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
				2175	pg_data_t *pgdat = zone->zone_pgdat;
				2176	unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
				2177	unsigned long spfn, epfn, flags;
				2178	unsigned long nr_pages = 0;
Wei Yang	f1180fd	2024-06-05 07:13:39 +0000	[diff] [blame]	2179	u64 i = 0;
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	2180
				2181	/* Only the last zone may have deferred pages */
				2182	if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
				2183	return false;
				2184
				2185	pgdat_resize_lock(pgdat, &flags);
				2186
				2187	/*
				2188	* If someone grew this zone while we were waiting for spinlock, return
				2189	* true, as there might be enough pages already.
				2190	*/
				2191	if (first_deferred_pfn != pgdat->first_deferred_pfn) {
				2192	pgdat_resize_unlock(pgdat, &flags);
				2193	return true;
				2194	}
				2195
				2196	/* If the zone is empty somebody else may have cleared out the zone */
				2197	if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
				2198	first_deferred_pfn)) {
				2199	pgdat->first_deferred_pfn = ULONG_MAX;
				2200	pgdat_resize_unlock(pgdat, &flags);
				2201	/* Retry only once. */
				2202	return first_deferred_pfn != ULONG_MAX;
				2203	}
				2204
				2205	/*
Kirill A. Shutemov	5e0a760	2023-12-28 17:47:04 +0300	[diff] [blame]	2206	* Initialize and free pages in MAX_PAGE_ORDER sized increments so
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	2207	* that we can avoid introducing any issues with the buddy
				2208	* allocator.
				2209	*/
				2210	while (spfn < epfn) {
				2211	/* update our first deferred PFN for this section */
				2212	first_deferred_pfn = spfn;
				2213
				2214	nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
				2215	touch_nmi_watchdog();
				2216
				2217	/* We should only stop along section boundaries */
				2218	if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
				2219	continue;
				2220
				2221	/* If our quota has been met we can stop here */
				2222	if (nr_pages >= nr_pages_needed)
				2223	break;
				2224	}
				2225
				2226	pgdat->first_deferred_pfn = spfn;
				2227	pgdat_resize_unlock(pgdat, &flags);
				2228
				2229	return nr_pages > 0;
				2230	}
				2231
				2232	#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
				2233
				2234	#ifdef CONFIG_CMA
				2235	void __init init_cma_reserved_pageblock(struct page *page)
				2236	{
				2237	unsigned i = pageblock_nr_pages;
				2238	struct page *p = page;
				2239
				2240	do {
				2241	__ClearPageReserved(p);
				2242	set_page_count(p, 0);
				2243	} while (++p, --i);
				2244
				2245	set_pageblock_migratetype(page, MIGRATE_CMA);
				2246	set_page_refcounted(page);
Suren Baghdasaryan	766c163	2024-08-13 08:07:57 -0700	[diff] [blame]	2247	/* pages were reserved and not allocated */
				2248	clear_page_tag_ref(page);
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	2249	__free_pages(page, pageblock_order);
				2250
				2251	adjust_managed_page_count(page, pageblock_nr_pages);
				2252	page_zone(page)->cma_pages += pageblock_nr_pages;
				2253	}
				2254	#endif
				2255
Kefeng Wang	904d585	2023-05-16 14:38:11 +0800	[diff] [blame]	2256	void set_zone_contiguous(struct zone *zone)
				2257	{
				2258	unsigned long block_start_pfn = zone->zone_start_pfn;
				2259	unsigned long block_end_pfn;
				2260
				2261	block_end_pfn = pageblock_end_pfn(block_start_pfn);
				2262	for (; block_start_pfn < zone_end_pfn(zone);
				2263	block_start_pfn = block_end_pfn,
				2264	block_end_pfn += pageblock_nr_pages) {
				2265
				2266	block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
				2267
				2268	if (!__pageblock_pfn_to_page(block_start_pfn,
				2269	block_end_pfn, zone))
				2270	return;
				2271	cond_resched();
				2272	}
				2273
				2274	/* We confirm that there is no hole */
				2275	zone->contiguous = true;
				2276	}
				2277
Wei Yang	4f66da8	2024-06-11 14:52:23 +0000	[diff] [blame]	2278	static void __init mem_init_print_info(void);
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	2279	void __init page_alloc_init_late(void)
				2280	{
				2281	struct zone *zone;
				2282	int nid;
				2283
				2284	#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
				2285
				2286	/* There will be num_node_state(N_MEMORY) threads */
				2287	atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
				2288	for_each_node_state(nid, N_MEMORY) {
				2289	kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
				2290	}
				2291
				2292	/* Block until all are initialised */
				2293	wait_for_completion(&pgdat_init_all_done_comp);
				2294
				2295	/*
				2296	* We initialized the rest of the deferred pages. Permanently disable
				2297	* on-demand struct page initialization.
				2298	*/
				2299	static_branch_disable(&deferred_pages);
				2300
				2301	/* Reinit limits that are based on free pages after the kernel is up */
				2302	files_maxfiles_init();
				2303	#endif
				2304
Wei Yang	4f66da8	2024-06-11 14:52:23 +0000	[diff] [blame]	2305	/* Accounting of total+free memory is stable at this point. */
				2306	mem_init_print_info();
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	2307	buffer_init();
				2308
				2309	/* Discard memblock private memory */
				2310	memblock_discard();
				2311
				2312	for_each_node_state(nid, N_MEMORY)
				2313	shuffle_free_memory(NODE_DATA(nid));
				2314
				2315	for_each_populated_zone(zone)
				2316	set_zone_contiguous(zone);
Mike Rapoport (IBM)	de57807	2023-03-21 19:05:09 +0200	[diff] [blame]	2317
				2318	/* Initialize page ext after all struct pages are initialized. */
				2319	if (deferred_struct_pages)
				2320	page_ext_init();
Kefeng Wang	e95d372	2023-05-16 14:38:20 +0800	[diff] [blame]	2321
				2322	page_alloc_sysctl_init();
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	2323	}
				2324
/*
 * Adaptive scaling keeps hash tables from growing too large on machines
 * with a lot of memory. Beyond ADAPT_SCALE_BASE (64G) the scale goes up by
 * one each time memory quadruples, so the hash table merely doubles in size
 * instead of quadrupling too.
 * 32-bit systems cannot have enough physical memory for this to matter, so
 * the scaling is compiled out there.
 */
#if __BITS_PER_LONG > 32
#define ADAPT_SCALE_BASE	(64UL << 30)
#define ADAPT_SCALE_SHIFT	2
#define ADAPT_SCALE_NPAGES	(ADAPT_SCALE_BASE >> PAGE_SHIFT)
#endif
				2339
				2340	/*
				2341	* allocate a large system hash table from bootmem
				2342	* - it is assumed that the hash table must contain an exact power-of-2
				2343	* quantity of entries
				2344	* - limit is the number of hash buckets, not the total allocation size
				2345	*/
				2346	void __init alloc_large_system_hash(const char tablename,
				2347	unsigned long bucketsize,
				2348	unsigned long numentries,
				2349	int scale,
				2350	int flags,
				2351	unsigned int *_hash_shift,
				2352	unsigned int *_hash_mask,
				2353	unsigned long low_limit,
				2354	unsigned long high_limit)
				2355	{
				2356	unsigned long long max = high_limit;
				2357	unsigned long log2qty, size;
				2358	void *table;
				2359	gfp_t gfp_flags;
				2360	bool virt;
				2361	bool huge;
				2362
				2363	/* allow the kernel cmdline to have a say */
				2364	if (!numentries) {
				2365	/* round applicable memory size up to nearest megabyte */
				2366	numentries = nr_kernel_pages;
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	2367
				2368	/* It isn't necessary when PAGE_SIZE >= 1MB */
				2369	if (PAGE_SIZE < SZ_1M)
				2370	numentries = round_up(numentries, SZ_1M / PAGE_SIZE);
				2371
				2372	#if __BITS_PER_LONG > 32
				2373	if (!high_limit) {
				2374	unsigned long adapt;
				2375
				2376	for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
				2377	adapt <<= ADAPT_SCALE_SHIFT)
				2378	scale++;
				2379	}
				2380	#endif
				2381
				2382	/* limit to 1 bucket per 2^scale bytes of low memory */
				2383	if (scale > PAGE_SHIFT)
				2384	numentries >>= (scale - PAGE_SHIFT);
				2385	else
				2386	numentries <<= (PAGE_SHIFT - scale);
				2387
Miaohe Lin	3fade62	2023-06-25 10:13:23 +0800	[diff] [blame]	2388	if (unlikely((numentries * bucketsize) < PAGE_SIZE))
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	2389	numentries = PAGE_SIZE / bucketsize;
				2390	}
				2391	numentries = roundup_pow_of_two(numentries);
				2392
				2393	/* limit allocation size to 1/16 total memory by default */
				2394	if (max == 0) {
				2395	max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
				2396	do_div(max, bucketsize);
				2397	}
				2398	max = min(max, 0x80000000ULL);
				2399
				2400	if (numentries < low_limit)
				2401	numentries = low_limit;
				2402	if (numentries > max)
				2403	numentries = max;
				2404
				2405	log2qty = ilog2(numentries);
				2406
				2407	gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC \| __GFP_ZERO : GFP_ATOMIC;
				2408	do {
				2409	virt = false;
				2410	size = bucketsize << log2qty;
				2411	if (flags & HASH_EARLY) {
				2412	if (flags & HASH_ZERO)
				2413	table = memblock_alloc(size, SMP_CACHE_BYTES);
				2414	else
				2415	table = memblock_alloc_raw(size,
				2416	SMP_CACHE_BYTES);
Kirill A. Shutemov	5e0a760	2023-12-28 17:47:04 +0300	[diff] [blame]	2417	} else if (get_order(size) > MAX_PAGE_ORDER \|\| hashdist) {
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	2418	table = vmalloc_huge(size, gfp_flags);
				2419	virt = true;
				2420	if (table)
				2421	huge = is_vm_area_hugepages(table);
				2422	} else {
				2423	/*
				2424	* If bucketsize is not a power-of-two, we may free
				2425	* some pages at the end of hash table which
				2426	* alloc_pages_exact() automatically does
				2427	*/
				2428	table = alloc_pages_exact(size, gfp_flags);
				2429	kmemleak_alloc(table, size, 1, gfp_flags);
				2430	}
				2431	} while (!table && size > PAGE_SIZE && --log2qty);
				2432
				2433	if (!table)
				2434	panic("Failed to allocate %s hash table\n", tablename);
				2435
				2436	pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
				2437	tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
				2438	virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear");
				2439
				2440	if (_hash_shift)
				2441	*_hash_shift = log2qty;
				2442	if (_hash_mask)
				2443	*_hash_mask = (1 << log2qty) - 1;
				2444
				2445	return table;
				2446	}
				2447
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	2448	void __init memblock_free_pages(struct page *page, unsigned long pfn,
				2449	unsigned int order)
				2450	{
Yajun Deng	61167ad	2023-06-19 10:34:06 +0800	[diff] [blame]	2451	if (IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT)) {
				2452	int nid = early_pfn_to_nid(pfn);
				2453
				2454	if (!early_page_initialised(pfn, nid))
				2455	return;
				2456	}
				2457
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	2458	if (!kmsan_memblock_free_pages(page, order)) {
				2459	/* KMSAN will take care of these pages. */
				2460	return;
				2461	}
Suren Baghdasaryan	d224eb0	2024-03-21 09:36:56 -0700	[diff] [blame]	2462
				2463	/* pages were reserved and not allocated */
Suren Baghdasaryan	a8fc28d	2024-08-13 08:07:56 -0700	[diff] [blame]	2464	clear_page_tag_ref(page);
David Hildenbrand	13c5265	2024-06-07 11:09:36 +0200	[diff] [blame]	2465	__free_pages_core(page, order, MEMINIT_EARLY);
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	2466	}
Mike Rapoport (IBM)	b7ec1bf	2023-03-21 19:05:06 +0200	[diff] [blame]	2467
/* Static key for init-on-alloc; built-in default follows Kconfig. */
DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
EXPORT_SYMBOL(init_on_alloc);

/* Static key for init-on-free; built-in default follows Kconfig. */
DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
EXPORT_SYMBOL(init_on_free);
				2473
Mike Rapoport (IBM)	f2fc4b4	2023-03-21 19:05:08 +0200	[diff] [blame]	2474	static bool _init_on_alloc_enabled_early __read_mostly
				2475	= IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
				2476	static int __init early_init_on_alloc(char *buf)
				2477	{
				2478
				2479	return kstrtobool(buf, &_init_on_alloc_enabled_early);
				2480	}
				2481	early_param("init_on_alloc", early_init_on_alloc);
				2482
				2483	static bool _init_on_free_enabled_early __read_mostly
				2484	= IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON);
				2485	static int __init early_init_on_free(char *buf)
				2486	{
				2487	return kstrtobool(buf, &_init_on_free_enabled_early);
				2488	}
				2489	early_param("init_on_free", early_init_on_free);
				2490
				2491	DEFINE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);
				2492
				2493	/*
				2494	* Enable static keys related to various memory debugging and hardening options.
				2495	* Some override others, and depend on early params that are evaluated in the
				2496	* order of appearance. So we need to first gather the full picture of what was
				2497	* enabled, and then make decisions.
				2498	*/
				2499	static void __init mem_debugging_and_hardening_init(void)
				2500	{
				2501	bool page_poisoning_requested = false;
				2502	bool want_check_pages = false;
				2503
				2504	#ifdef CONFIG_PAGE_POISONING
				2505	/*
				2506	* Page poisoning is debug page alloc for some arches. If
				2507	* either of those options are enabled, enable poisoning.
				2508	*/
				2509	if (page_poisoning_enabled() \|\|
				2510	(!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
				2511	debug_pagealloc_enabled())) {
				2512	static_branch_enable(&_page_poisoning_enabled);
				2513	page_poisoning_requested = true;
				2514	want_check_pages = true;
				2515	}
				2516	#endif
				2517
David Hildenbrand	384a746	2024-06-05 11:17:10 +0200	[diff] [blame]	2518	if ((_init_on_alloc_enabled_early \|\| _init_on_free_enabled_early) &&
Mike Rapoport (IBM)	f2fc4b4	2023-03-21 19:05:08 +0200	[diff] [blame]	2519	page_poisoning_requested) {
				2520	pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
David Hildenbrand	384a746	2024-06-05 11:17:10 +0200	[diff] [blame]	2521	"will take precedence over init_on_alloc and init_on_free\n");
Mike Rapoport (IBM)	f2fc4b4	2023-03-21 19:05:08 +0200	[diff] [blame]	2522	_init_on_alloc_enabled_early = false;
				2523	_init_on_free_enabled_early = false;
				2524	}
				2525
				2526	if (_init_on_alloc_enabled_early) {
				2527	want_check_pages = true;
				2528	static_branch_enable(&init_on_alloc);
				2529	} else {
				2530	static_branch_disable(&init_on_alloc);
				2531	}
				2532
				2533	if (_init_on_free_enabled_early) {
				2534	want_check_pages = true;
				2535	static_branch_enable(&init_on_free);
				2536	} else {
				2537	static_branch_disable(&init_on_free);
				2538	}
				2539
David Hildenbrand	384a746	2024-06-05 11:17:10 +0200	[diff] [blame]	2540	if (IS_ENABLED(CONFIG_KMSAN) &&
				2541	(_init_on_alloc_enabled_early \|\| _init_on_free_enabled_early))
				2542	pr_info("mem auto-init: please make sure init_on_alloc and init_on_free are disabled when running KMSAN\n");
Mike Rapoport (IBM)	f2fc4b4	2023-03-21 19:05:08 +0200	[diff] [blame]	2543
				2544	#ifdef CONFIG_DEBUG_PAGEALLOC
				2545	if (debug_pagealloc_enabled()) {
				2546	want_check_pages = true;
				2547	static_branch_enable(&_debug_pagealloc_enabled);
				2548
				2549	if (debug_guardpage_minorder())
				2550	static_branch_enable(&_debug_guardpage_enabled);
				2551	}
				2552	#endif
				2553
				2554	/*
				2555	* Any page debugging or hardening option also enables sanity checking
				2556	* of struct pages being allocated or freed. With CONFIG_DEBUG_VM it's
				2557	* enabled already.
				2558	*/
				2559	if (!IS_ENABLED(CONFIG_DEBUG_VM) && want_check_pages)
				2560	static_branch_enable(&check_pages_enabled);
				2561	}
				2562
Mike Rapoport (IBM)	b7ec1bf	2023-03-21 19:05:06 +0200	[diff] [blame]	2563	/* Report memory auto-initialization states for this boot. */
				2564	static void __init report_meminit(void)
				2565	{
				2566	const char *stack;
				2567
				2568	if (IS_ENABLED(CONFIG_INIT_STACK_ALL_PATTERN))
				2569	stack = "all(pattern)";
				2570	else if (IS_ENABLED(CONFIG_INIT_STACK_ALL_ZERO))
				2571	stack = "all(zero)";
				2572	else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL))
				2573	stack = "byref_all(zero)";
				2574	else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF))
				2575	stack = "byref(zero)";
				2576	else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_USER))
				2577	stack = "__user(zero)";
				2578	else
				2579	stack = "off";
				2580
David Hildenbrand	384a746	2024-06-05 11:17:10 +0200	[diff] [blame]	2581	pr_info("mem auto-init: stack:%s, heap alloc:%s, heap free:%s\n",
Thorsten Blum	4bb21db	2024-10-18 12:31:51 +0200	[diff] [blame]	2582	stack, str_on_off(want_init_on_alloc(GFP_KERNEL)),
				2583	str_on_off(want_init_on_free()));
Mike Rapoport (IBM)	b7ec1bf	2023-03-21 19:05:06 +0200	[diff] [blame]	2584	if (want_init_on_free())
				2585	pr_info("mem auto-init: clearing system memory may take some time...\n");
				2586	}
				2587
Mike Rapoport (IBM)	eb8589b	2023-03-21 19:05:10 +0200	[diff] [blame]	2588	static void __init mem_init_print_info(void)
				2589	{
				2590	unsigned long physpages, codesize, datasize, rosize, bss_size;
				2591	unsigned long init_code_size, init_data_size;
				2592
				2593	physpages = get_num_physpages();
				2594	codesize = _etext - _stext;
				2595	datasize = _edata - _sdata;
				2596	rosize = __end_rodata - __start_rodata;
				2597	bss_size = __bss_stop - __bss_start;
				2598	init_data_size = __init_end - __init_begin;
				2599	init_code_size = _einittext - _sinittext;
				2600
				2601	/*
				2602	* Detect special cases and adjust section sizes accordingly:
				2603	* 1) .init.* may be embedded into .data sections
				2604	* 2) .init.text.* may be out of [__init_begin, __init_end],
				2605	* please refer to arch/tile/kernel/vmlinux.lds.S.
				2606	* 3) .rodata.* may be embedded into .text or .data sections.
				2607	*/
				2608	#define adj_init_size(start, end, size, pos, adj) \
				2609	do { \
				2610	if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \
				2611	size -= adj; \
				2612	} while (0)
				2613
				2614	adj_init_size(__init_begin, __init_end, init_data_size,
				2615	_sinittext, init_code_size);
				2616	adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
				2617	adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
				2618	adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
				2619	adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
				2620
				2621	#undef adj_init_size
				2622
				2623	pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
				2624	#ifdef CONFIG_HIGHMEM
				2625	", %luK highmem"
				2626	#endif
				2627	")\n",
				2628	K(nr_free_pages()), K(physpages),
				2629	codesize / SZ_1K, datasize / SZ_1K, rosize / SZ_1K,
				2630	(init_data_size + init_code_size) / SZ_1K, bss_size / SZ_1K,
				2631	K(physpages - totalram_pages() - totalcma_pages),
				2632	K(totalcma_pages)
				2633	#ifdef CONFIG_HIGHMEM
				2634	, K(totalhigh_pages())
				2635	#endif
				2636	);
				2637	}
				2638
Mike Rapoport (IBM)	b7ec1bf	2023-03-21 19:05:06 +0200	[diff] [blame]	2639	/*
				2640	* Set up kernel memory allocators
				2641	*/
				2642	void __init mm_core_init(void)
				2643	{
				2644	/* Initializations relying on SMP setup */
Wei Yang	64e0ba3	2024-06-19 01:06:10 +0000	[diff] [blame]	2645	BUILD_BUG_ON(MAX_ZONELISTS > 2);
Mike Rapoport (IBM)	b7ec1bf	2023-03-21 19:05:06 +0200	[diff] [blame]	2646	build_all_zonelists(NULL);
				2647	page_alloc_init_cpuhp();
Suren Baghdasaryan	4835f74	2024-10-23 10:07:59 -0700	[diff] [blame]	2648	alloc_tag_sec_init();
Mike Rapoport (IBM)	b7ec1bf	2023-03-21 19:05:06 +0200	[diff] [blame]	2649	/*
				2650	* page_ext requires contiguous pages,
Kirill A. Shutemov	5e0a760	2023-12-28 17:47:04 +0300	[diff] [blame]	2651	* bigger than MAX_PAGE_ORDER unless SPARSEMEM.
Mike Rapoport (IBM)	b7ec1bf	2023-03-21 19:05:06 +0200	[diff] [blame]	2652	*/
				2653	page_ext_init_flatmem();
Mike Rapoport (IBM)	f2fc4b4	2023-03-21 19:05:08 +0200	[diff] [blame]	2654	mem_debugging_and_hardening_init();
Peng Zhang	cabdf74	2023-07-18 15:30:19 +0800	[diff] [blame]	2655	kfence_alloc_pool_and_metadata();
Mike Rapoport (IBM)	b7ec1bf	2023-03-21 19:05:06 +0200	[diff] [blame]	2656	report_meminit();
				2657	kmsan_init_shadow();
				2658	stack_depot_early_init();
				2659	mem_init();
Mike Rapoport (IBM)	b7ec1bf	2023-03-21 19:05:06 +0200	[diff] [blame]	2660	kmem_cache_init();
				2661	/*
				2662	* page_owner must be initialized after buddy is ready, and also after
				2663	* slab is ready so that stack_depot_init() works properly
				2664	*/
				2665	page_ext_init_flatmem_late();
				2666	kmemleak_init();
Mike Rapoport (IBM)	4cd1e9e	2023-03-21 19:05:07 +0200	[diff] [blame]	2667	ptlock_cache_init();
				2668	pgtable_cache_init();
Mike Rapoport (IBM)	b7ec1bf	2023-03-21 19:05:06 +0200	[diff] [blame]	2669	debug_objects_mem_init();
				2670	vmalloc_init();
				2671	/* If no deferred init page_ext now, as vmap is fully initialized */
				2672	if (!deferred_struct_pages)
				2673	page_ext_init();
				2674	/* Should be run before the first non-init thread is created */
				2675	init_espfix_bsp();
				2676	/* Should be run after espfix64 is set up. */
				2677	pti_init();
				2678	kmsan_init_runtime();
				2679	mm_cache_init();
Mike Rapoport (IBM)	f6bec26	2024-05-05 19:06:19 +0300	[diff] [blame]	2680	execmem_init();
Mike Rapoport (IBM)	b7ec1bf	2023-03-21 19:05:06 +0200	[diff] [blame]	2681	}