/*
 *  linux/mm/page_alloc.c
 *
 *  Manages the free list; the system allocates free pages here.
 *  Note that kmalloc() lives in slab.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
 *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
 *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
 *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
 *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/kasan.h>
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/oom.h>
#include <linux/notifier.h>
#include <linux/topology.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/memory_hotplug.h>
#include <linux/nodemask.h>
#include <linux/vmalloc.h>
#include <linux/vmstat.h>
#include <linux/mempolicy.h>
#include <linux/memremap.h>
#include <linux/stop_machine.h>
#include <linux/sort.h>
#include <linux/pfn.h>
#include <linux/backing-dev.h>
#include <linux/fault-inject.h>
#include <linux/page-isolation.h>
#include <linux/page_ext.h>
#include <linux/debugobjects.h>
#include <linux/kmemleak.h>
#include <linux/compaction.h>
#include <trace/events/kmem.h>
#include <trace/events/oom.h>
#include <linux/prefetch.h>
#include <linux/mm_inline.h>
#include <linux/migrate.h>
#include <linux/hugetlb.h>
#include <linux/sched/rt.h>
#include <linux/sched/mm.h>
#include <linux/page_owner.h>
#include <linux/kthread.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
#include <linux/lockdep.h>
#include <linux/nmi.h>

#include <asm/sections.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
#include "internal.h"

/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_FRACTION	(8)

#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
EXPORT_PER_CPU_SYMBOL(numa_node);
#endif

DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key);

#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
 * defined in <linux/topology.h>.
 */
DEFINE_PER_CPU(int, _numa_mem_);		/* Kernel "local memory" node */
EXPORT_PER_CPU_SYMBOL(_numa_mem_);
int _node_numa_mem_[MAX_NUMNODES];
#endif

/* work_structs for global per-cpu drains */
DEFINE_MUTEX(pcpu_drain_mutex);
DEFINE_PER_CPU(struct work_struct, pcpu_drain);

#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
volatile unsigned long latent_entropy __latent_entropy;
EXPORT_SYMBOL(latent_entropy);
#endif

/*
 * Array of node states.
 */
nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
	[N_POSSIBLE] = NODE_MASK_ALL,
	[N_ONLINE] = { { [0] = 1UL } },
#ifndef CONFIG_NUMA
	[N_NORMAL_MEMORY] = { { [0] = 1UL } },
#ifdef CONFIG_HIGHMEM
	[N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
	[N_MEMORY] = { { [0] = 1UL } },
	[N_CPU] = { { [0] = 1UL } },
#endif	/* NUMA */
};
EXPORT_SYMBOL(node_states);

/* Protect totalram_pages and zone->managed_pages */
static DEFINE_SPINLOCK(managed_page_count_lock);

unsigned long totalram_pages __read_mostly;
unsigned long totalreserve_pages __read_mostly;
unsigned long totalcma_pages __read_mostly;

int percpu_pagelist_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;

/*
 * A cached value of the page's pageblock's migratetype, used when the page is
 * put on a pcplist. Used to avoid the pageblock migratetype lookup when
 * freeing from pcplists in most cases, at the cost of possibly becoming stale.
 * Also the migratetype set in the page does not necessarily match the pcplist
 * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
 * other index - this ensures that it will be put on the correct CMA freelist.
 */
static inline int get_pcppage_migratetype(struct page *page)
{
	return page->index;
}

static inline void set_pcppage_migratetype(struct page *page, int migratetype)
{
	page->index = migratetype;
}
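/*
 * Editor's sketch (not part of the original file): the cached value is
 * written when a page is about to enter a pcplist and read back when the
 * page is flushed to the buddy lists, roughly:
 *
 *	set_pcppage_migratetype(page, get_pfnblock_migratetype(page, pfn));
 *	...
 *	mt = get_pcppage_migratetype(page);	// may be stale by now
 */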

#ifdef CONFIG_PM_SLEEP
/*
 * The following functions are used by the suspend/hibernate code to temporarily
 * change gfp_allowed_mask in order to avoid using I/O during memory allocations
 * while devices are suspended.  To avoid races with the suspend/hibernate code,
 * they should always be called with pm_mutex held (gfp_allowed_mask also should
 * only be modified with pm_mutex held, unless the suspend/hibernate code is
 * guaranteed not to run in parallel with that modification).
 */
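/*
 * Editor's sketch of the intended call order (illustrative only; the real
 * call sites live in kernel/power/):
 *
 *	lock_system_sleep();		// takes pm_mutex
 *	pm_restrict_gfp_mask();		// drop __GFP_IO | __GFP_FS
 *	... suspend and resume devices ...
 *	pm_restore_gfp_mask();
 *	unlock_system_sleep();
 */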

static gfp_t saved_gfp_mask;

void pm_restore_gfp_mask(void)
{
	WARN_ON(!mutex_is_locked(&pm_mutex));
	if (saved_gfp_mask) {
		gfp_allowed_mask = saved_gfp_mask;
		saved_gfp_mask = 0;
	}
}

void pm_restrict_gfp_mask(void)
{
	WARN_ON(!mutex_is_locked(&pm_mutex));
	WARN_ON(saved_gfp_mask);
	saved_gfp_mask = gfp_allowed_mask;
	gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
}

bool pm_suspended_storage(void)
{
	if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
		return false;
	return true;
}
#endif /* CONFIG_PM_SLEEP */

#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
unsigned int pageblock_order __read_mostly;
#endif

static void __free_pages_ok(struct page *page, unsigned int order);

/*
 * results with 256, 32 in the lowmem_reserve sysctl:
 *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
 *	1G machine -> (16M dma, 784M normal, 224M high)
 *	NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
 *	HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
 *	HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
 *
 * TBD: should special case ZONE_DMA32 machines here - in those we normally
 * don't need any ZONE_NORMAL reservation
 */
int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
	[ZONE_DMA] = 256,
#endif
#ifdef CONFIG_ZONE_DMA32
	[ZONE_DMA32] = 256,
#endif
	[ZONE_NORMAL] = 32,
#ifdef CONFIG_HIGHMEM
	[ZONE_HIGHMEM] = 0,
#endif
	[ZONE_MOVABLE] = 0,
};
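/*
 * Editor's worked example (not from the original source): with the default
 * ratio of 256 and about 784M (200704 pages) of ZONE_NORMAL, an allocation
 * that may fall back into ZONE_DMA leaves roughly 200704/256 = 784 pages
 * (about 3M) of DMA memory in reserve, i.e. lowmem_reserve is on the order
 * of "memory in the higher zones" divided by the ratio.
 */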

EXPORT_SYMBOL(totalram_pages);

static char * const zone_names[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
	 "DMA",
#endif
#ifdef CONFIG_ZONE_DMA32
	 "DMA32",
#endif
	 "Normal",
#ifdef CONFIG_HIGHMEM
	 "HighMem",
#endif
	 "Movable",
#ifdef CONFIG_ZONE_DEVICE
	 "Device",
#endif
};

char * const migratetype_names[MIGRATE_TYPES] = {
	"Unmovable",
	"Movable",
	"Reclaimable",
	"HighAtomic",
#ifdef CONFIG_CMA
	"CMA",
#endif
#ifdef CONFIG_MEMORY_ISOLATION
	"Isolate",
#endif
};

compound_page_dtor * const compound_page_dtors[] = {
	NULL,
	free_compound_page,
#ifdef CONFIG_HUGETLB_PAGE
	free_huge_page,
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	free_transhuge_page,
#endif
};

int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
int watermark_scale_factor = 10;

static unsigned long nr_kernel_pages __meminitdata;
static unsigned long nr_all_pages __meminitdata;
static unsigned long dma_reserve __meminitdata;

#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __meminitdata;
static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __meminitdata;
static unsigned long required_kernelcore __initdata;
static unsigned long required_kernelcore_percent __initdata;
static unsigned long required_movablecore __initdata;
static unsigned long required_movablecore_percent __initdata;
static unsigned long zone_movable_pfn[MAX_NUMNODES] __meminitdata;
static bool mirrored_kernelcore __meminitdata;

/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
EXPORT_SYMBOL(movable_zone);
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */

#if MAX_NUMNODES > 1
int nr_node_ids __read_mostly = MAX_NUMNODES;
int nr_online_nodes __read_mostly = 1;
EXPORT_SYMBOL(nr_node_ids);
EXPORT_SYMBOL(nr_online_nodes);
#endif

int page_group_by_mobility_disabled __read_mostly;

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/* Returns true if the struct page for the pfn is uninitialised */
static inline bool __meminit early_page_uninitialised(unsigned long pfn)
{
	int nid = early_pfn_to_nid(pfn);

	if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
		return true;

	return false;
}

/*
 * Returns false when the remaining initialisation should be deferred until
 * later in the boot cycle when it can be parallelised.
 */
static inline bool update_defer_init(pg_data_t *pgdat,
				unsigned long pfn, unsigned long zone_end,
				unsigned long *nr_initialised)
{
	/* Always populate low zones for address-constrained allocations */
	if (zone_end < pgdat_end_pfn(pgdat))
		return true;
	(*nr_initialised)++;
	if ((*nr_initialised > pgdat->static_init_pgcnt) &&
	    (pfn & (PAGES_PER_SECTION - 1)) == 0) {
		pgdat->first_deferred_pfn = pfn;
		return false;
	}

	return true;
}
#else
static inline bool early_page_uninitialised(unsigned long pfn)
{
	return false;
}

static inline bool update_defer_init(pg_data_t *pgdat,
				unsigned long pfn, unsigned long zone_end,
				unsigned long *nr_initialised)
{
	return true;
}
#endif

/* Return a pointer to the bitmap storing bits affecting a block of pages */
static inline unsigned long *get_pageblock_bitmap(struct page *page,
							unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
	return __pfn_to_section(pfn)->pageblock_flags;
#else
	return page_zone(page)->pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
}

static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
	pfn &= (PAGES_PER_SECTION-1);
	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
#else
	pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
#endif /* CONFIG_SPARSEMEM */
}
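/*
 * Editor's note (illustrative, not in the original source): each pageblock
 * owns NR_PAGEBLOCK_BITS (4) consecutive bits of the bitmap.  Assuming 4K
 * pages and pageblock_order == 9 (2M pageblocks), a pfn of 0x80000 in the
 * !SPARSEMEM case with zone_start_pfn == 0 maps to bit index
 * (0x80000 >> 9) * 4 == 0x1000.
 */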

/**
 * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
 * @page: The page within the block of interest
 * @pfn: The target page frame number
 * @end_bitidx: The last bit of interest to retrieve
 * @mask: mask of bits that the caller is interested in
 *
 * Return: pageblock_bits flags
 */
static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page,
					unsigned long pfn,
					unsigned long end_bitidx,
					unsigned long mask)
{
	unsigned long *bitmap;
	unsigned long bitidx, word_bitidx;
	unsigned long word;

	bitmap = get_pageblock_bitmap(page, pfn);
	bitidx = pfn_to_bitidx(page, pfn);
	word_bitidx = bitidx / BITS_PER_LONG;
	bitidx &= (BITS_PER_LONG-1);

	word = bitmap[word_bitidx];
	bitidx += end_bitidx;
	return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
}

unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
					unsigned long end_bitidx,
					unsigned long mask)
{
	return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask);
}

static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
{
	return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK);
}

/**
 * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
 * @page: The page within the block of interest
 * @flags: The flags to set
 * @pfn: The target page frame number
 * @end_bitidx: The last bit of interest
 * @mask: mask of bits that the caller is interested in
 */
void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
					unsigned long pfn,
					unsigned long end_bitidx,
					unsigned long mask)
{
	unsigned long *bitmap;
	unsigned long bitidx, word_bitidx;
	unsigned long old_word, word;

	BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);

	bitmap = get_pageblock_bitmap(page, pfn);
	bitidx = pfn_to_bitidx(page, pfn);
	word_bitidx = bitidx / BITS_PER_LONG;
	bitidx &= (BITS_PER_LONG-1);

	VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);

	bitidx += end_bitidx;
	mask <<= (BITS_PER_LONG - bitidx - 1);
	flags <<= (BITS_PER_LONG - bitidx - 1);

	word = READ_ONCE(bitmap[word_bitidx]);
	for (;;) {
		old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
		if (word == old_word)
			break;
		word = old_word;
	}
}
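/*
 * Editor's note (not in the original source): the cmpxchg loop above is a
 * lock-free read-modify-write confined to a single word, equivalent to an
 * atomic "bitmap[word_bitidx] = (bitmap[word_bitidx] & ~mask) | flags";
 * a concurrent update to other bits of the same word simply causes a retry.
 */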

void set_pageblock_migratetype(struct page *page, int migratetype)
{
	if (unlikely(page_group_by_mobility_disabled &&
		     migratetype < MIGRATE_PCPTYPES))
		migratetype = MIGRATE_UNMOVABLE;

	set_pageblock_flags_group(page, (unsigned long)migratetype,
					PB_migrate, PB_migrate_end);
}

#ifdef CONFIG_DEBUG_VM
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
{
	int ret = 0;
	unsigned seq;
	unsigned long pfn = page_to_pfn(page);
	unsigned long sp, start_pfn;

	do {
		seq = zone_span_seqbegin(zone);
		start_pfn = zone->zone_start_pfn;
		sp = zone->spanned_pages;
		if (!zone_spans_pfn(zone, pfn))
			ret = 1;
	} while (zone_span_seqretry(zone, seq));

	if (ret)
		pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
			pfn, zone_to_nid(zone), zone->name,
			start_pfn, start_pfn + sp);

	return ret;
}

static int page_is_consistent(struct zone *zone, struct page *page)
{
	if (!pfn_valid_within(page_to_pfn(page)))
		return 0;
	if (zone != page_zone(page))
		return 0;

	return 1;
}
/*
 * Temporary debugging check for pages not lying within a given zone.
 */
static int __maybe_unused bad_range(struct zone *zone, struct page *page)
{
	if (page_outside_zone_boundaries(zone, page))
		return 1;
	if (!page_is_consistent(zone, page))
		return 1;

	return 0;
}
#else
static inline int __maybe_unused bad_range(struct zone *zone, struct page *page)
{
	return 0;
}
#endif

static void bad_page(struct page *page, const char *reason,
		unsigned long bad_flags)
{
	static unsigned long resume;
	static unsigned long nr_shown;
	static unsigned long nr_unshown;

	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */
	if (nr_shown == 60) {
		if (time_before(jiffies, resume)) {
			nr_unshown++;
			goto out;
		}
		if (nr_unshown) {
			pr_alert(
			      "BUG: Bad page state: %lu messages suppressed\n",
				nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = jiffies + 60 * HZ;

	pr_alert("BUG: Bad page state in process %s  pfn:%05lx\n",
		current->comm, page_to_pfn(page));
	__dump_page(page, reason);
	bad_flags &= page->flags;
	if (bad_flags)
		pr_alert("bad because of flags: %#lx(%pGp)\n",
						bad_flags, &bad_flags);
	dump_page_owner(page);

	print_modules();
	dump_stack();
out:
	/* Leave bad fields for debug, except PageBuddy could make trouble */
	page_mapcount_reset(page); /* remove PageBuddy */
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}

/*
 * Higher-order pages are called "compound pages".  They are structured thusly:
 *
 * The first PAGE_SIZE page is called the "head page" and has PG_head set.
 *
 * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
 * in bit 0 of page->compound_head. The rest of the bits point to the head page.
 *
 * The first tail page's ->compound_dtor holds the offset into the array of
 * compound page destructors. See compound_page_dtors.
 *
 * The first tail page's ->compound_order holds the order of allocation.
 * This usage means that zero-order pages may not be compound.
 */
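/*
 * Editor's sketch (not from the original source): after
 * prep_compound_page(page, 2) the four struct pages look roughly like
 *
 *	page[0]:    PG_head set
 *	page[1..3]: PageTail(), ->compound_head == (unsigned long)&page[0] | 1
 *
 * with the order and destructor index stashed in the first tail page, so
 * compound_head(tail) recovers the head and bit 0 encodes "is a tail".
 */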

void free_compound_page(struct page *page)
{
	__free_pages_ok(page, compound_order(page));
}

void prep_compound_page(struct page *page, unsigned int order)
{
	int i;
	int nr_pages = 1 << order;

	set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
	set_compound_order(page, order);
	__SetPageHead(page);
	for (i = 1; i < nr_pages; i++) {
		struct page *p = page + i;
		set_page_count(p, 0);
		p->mapping = TAIL_MAPPING;
		set_compound_head(p, page);
	}
	atomic_set(compound_mapcount_ptr(page), -1);
}

#ifdef CONFIG_DEBUG_PAGEALLOC
unsigned int _debug_guardpage_minorder;
bool _debug_pagealloc_enabled __read_mostly
			= IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
EXPORT_SYMBOL(_debug_pagealloc_enabled);
bool _debug_guardpage_enabled __read_mostly;

static int __init early_debug_pagealloc(char *buf)
{
	if (!buf)
		return -EINVAL;
	return kstrtobool(buf, &_debug_pagealloc_enabled);
}
early_param("debug_pagealloc", early_debug_pagealloc);

static bool need_debug_guardpage(void)
{
	/* If we don't use debug_pagealloc, we don't need guard page */
	if (!debug_pagealloc_enabled())
		return false;

	if (!debug_guardpage_minorder())
		return false;

	return true;
}

static void init_debug_guardpage(void)
{
	if (!debug_pagealloc_enabled())
		return;

	if (!debug_guardpage_minorder())
		return;

	_debug_guardpage_enabled = true;
}

struct page_ext_operations debug_guardpage_ops = {
	.need = need_debug_guardpage,
	.init = init_debug_guardpage,
};

static int __init debug_guardpage_minorder_setup(char *buf)
{
	unsigned long res;

	if (kstrtoul(buf, 10, &res) < 0 ||  res > MAX_ORDER / 2) {
		pr_err("Bad debug_guardpage_minorder value\n");
		return 0;
	}
	_debug_guardpage_minorder = res;
	pr_info("Setting debug_guardpage_minorder to %lu\n", res);
	return 0;
}
early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);

static inline bool set_page_guard(struct zone *zone, struct page *page,
				unsigned int order, int migratetype)
{
	struct page_ext *page_ext;

	if (!debug_guardpage_enabled())
		return false;

	if (order >= debug_guardpage_minorder())
		return false;

	page_ext = lookup_page_ext(page);
	if (unlikely(!page_ext))
		return false;

	__set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);

	INIT_LIST_HEAD(&page->lru);
	set_page_private(page, order);
	/* Guard pages are not available for any usage */
	__mod_zone_freepage_state(zone, -(1 << order), migratetype);

	return true;
}

static inline void clear_page_guard(struct zone *zone, struct page *page,
				unsigned int order, int migratetype)
{
	struct page_ext *page_ext;

	if (!debug_guardpage_enabled())
		return;

	page_ext = lookup_page_ext(page);
	if (unlikely(!page_ext))
		return;

	__clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);

	set_page_private(page, 0);
	if (!is_migrate_isolate(migratetype))
		__mod_zone_freepage_state(zone, (1 << order), migratetype);
}
#else
struct page_ext_operations debug_guardpage_ops;
static inline bool set_page_guard(struct zone *zone, struct page *page,
			unsigned int order, int migratetype) { return false; }
static inline void clear_page_guard(struct zone *zone, struct page *page,
				unsigned int order, int migratetype) {}
#endif

static inline void set_page_order(struct page *page, unsigned int order)
{
	set_page_private(page, order);
	__SetPageBuddy(page);
}

static inline void rmv_page_order(struct page *page)
{
	__ClearPageBuddy(page);
	set_page_private(page, 0);
}

/*
 * This function checks whether a page is free && is the buddy
 * we can coalesce a page and its buddy if
 * (a) the buddy is not in a hole (check before calling!) &&
 * (b) the buddy is in the buddy system &&
 * (c) a page and its buddy have the same order &&
 * (d) a page and its buddy are in the same zone.
 *
 * For recording whether a page is in the buddy system, we set PageBuddy.
 * Setting, clearing, and testing PageBuddy is serialized by zone->lock.
 *
 * For recording page's order, we use page_private(page).
 */
static inline int page_is_buddy(struct page *page, struct page *buddy,
							unsigned int order)
{
	if (page_is_guard(buddy) && page_order(buddy) == order) {
		if (page_zone_id(page) != page_zone_id(buddy))
			return 0;

		VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);

		return 1;
	}

	if (PageBuddy(buddy) && page_order(buddy) == order) {
		/*
		 * zone check is done late to avoid uselessly
		 * calculating zone/node ids for pages that could
		 * never merge.
		 */
		if (page_zone_id(page) != page_zone_id(buddy))
			return 0;

		VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);

		return 1;
	}
	return 0;
}

/*
 * Freeing function for a buddy system allocator.
 *
 * The concept of a buddy system is to maintain a direct-mapped table
 * (containing bit values) for memory blocks of various "orders".
 * The bottom level table contains the map for the smallest allocatable
 * units of memory (here, pages), and each level above it describes
 * pairs of units from the levels below, hence, "buddies".
 * At a high level, all that happens here is marking the table entry
 * at the bottom level available, and propagating the changes upward
 * as necessary, plus some accounting needed to play nicely with other
 * parts of the VM system.
 * At each level, we keep a list of pages, which are heads of contiguous
 * free pages of length (1 << order) and marked with PageBuddy.
 * Page's order is recorded in the page_private(page) field.
 * So when we are allocating or freeing one, we can derive the state of the
 * other.  That is, if we allocate a small block, and both were
 * free, the remainder of the region must be split into blocks.
 * If a block is freed, and its buddy is also free, then this
 * triggers coalescing into a block of larger size.
 *
 * -- nyc
 */
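/*
 * Editor's worked example (illustrative, not in the original source):
 * __find_buddy_pfn(pfn, order) is pfn ^ (1 << order), so at order 3 the
 * buddy of the block starting at pfn 0x1008 is 0x1000, and after a merge
 * the combined block starts at buddy_pfn & pfn == 0x1000 with order 4.
 */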

static inline void __free_one_page(struct page *page,
		unsigned long pfn,
		struct zone *zone, unsigned int order,
		int migratetype)
{
	unsigned long combined_pfn;
	unsigned long uninitialized_var(buddy_pfn);
	struct page *buddy;
	unsigned int max_order;

	max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);

	VM_BUG_ON(!zone_is_initialized(zone));
	VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);

	VM_BUG_ON(migratetype == -1);
	if (likely(!is_migrate_isolate(migratetype)))
		__mod_zone_freepage_state(zone, 1 << order, migratetype);

	VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
	VM_BUG_ON_PAGE(bad_range(zone, page), page);

continue_merging:
	while (order < max_order - 1) {
		buddy_pfn = __find_buddy_pfn(pfn, order);
		buddy = page + (buddy_pfn - pfn);

		if (!pfn_valid_within(buddy_pfn))
			goto done_merging;
		if (!page_is_buddy(page, buddy, order))
			goto done_merging;
		/*
		 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
		 * merge with it and move up one order.
		 */
		if (page_is_guard(buddy)) {
			clear_page_guard(zone, buddy, order, migratetype);
		} else {
			list_del(&buddy->lru);
			zone->free_area[order].nr_free--;
			rmv_page_order(buddy);
		}
		combined_pfn = buddy_pfn & pfn;
		page = page + (combined_pfn - pfn);
		pfn = combined_pfn;
		order++;
	}
	if (max_order < MAX_ORDER) {
		/* If we are here, it means order is >= pageblock_order.
		 * We want to prevent merge between freepages on isolate
		 * pageblock and normal pageblock. Without this, pageblock
		 * isolation could cause incorrect freepage or CMA accounting.
		 *
		 * We don't want to hit this code for the more frequent
		 * low-order merging.
		 */
		if (unlikely(has_isolate_pageblock(zone))) {
			int buddy_mt;

			buddy_pfn = __find_buddy_pfn(pfn, order);
			buddy = page + (buddy_pfn - pfn);
			buddy_mt = get_pageblock_migratetype(buddy);

			if (migratetype != buddy_mt
					&& (is_migrate_isolate(migratetype) ||
						is_migrate_isolate(buddy_mt)))
				goto done_merging;
		}
		max_order++;
		goto continue_merging;
	}

done_merging:
	set_page_order(page, order);

	/*
	 * If this is not the largest possible page, check if the buddy
	 * of the next-highest order is free. If it is, it's possible
	 * that pages are being freed that will coalesce soon. In case
	 * that is happening, add the free page to the tail of the list
	 * so it's less likely to be used soon and more likely to be merged
	 * as a higher order page
	 */
	if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) {
		struct page *higher_page, *higher_buddy;
		combined_pfn = buddy_pfn & pfn;
		higher_page = page + (combined_pfn - pfn);
		buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
		higher_buddy = higher_page + (buddy_pfn - combined_pfn);
		if (pfn_valid_within(buddy_pfn) &&
		    page_is_buddy(higher_page, higher_buddy, order + 1)) {
			list_add_tail(&page->lru,
				&zone->free_area[order].free_list[migratetype]);
			goto out;
		}
	}

	list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
out:
	zone->free_area[order].nr_free++;
}

/*
 * A bad page could be due to a number of fields. Instead of multiple branches,
 * try and check multiple fields with one check. The caller must do a detailed
 * check if necessary.
 */
static inline bool page_expected_state(struct page *page,
					unsigned long check_flags)
{
	if (unlikely(atomic_read(&page->_mapcount) != -1))
		return false;

	if (unlikely((unsigned long)page->mapping |
			page_ref_count(page) |
#ifdef CONFIG_MEMCG
			(unsigned long)page->mem_cgroup |
#endif
			(page->flags & check_flags)))
		return false;

	return true;
}
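/*
 * Editor's note (not in the original source): OR-ing the fields together
 * lets a single branch cover mapping, refcount, mem_cgroup and the flag
 * bits; any nonzero bit makes the whole expression nonzero and fails the
 * fast path, after which the caller reports the precise reason.
 */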

static void free_pages_check_bad(struct page *page)
{
	const char *bad_reason;
	unsigned long bad_flags;

	bad_reason = NULL;
	bad_flags = 0;

	if (unlikely(atomic_read(&page->_mapcount) != -1))
		bad_reason = "nonzero mapcount";
	if (unlikely(page->mapping != NULL))
		bad_reason = "non-NULL mapping";
	if (unlikely(page_ref_count(page) != 0))
		bad_reason = "nonzero _refcount";
	if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
		bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
		bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
	}
#ifdef CONFIG_MEMCG
	if (unlikely(page->mem_cgroup))
		bad_reason = "page still charged to cgroup";
#endif
	bad_page(page, bad_reason, bad_flags);
}

static inline int free_pages_check(struct page *page)
{
	if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
		return 0;

	/* Something has gone sideways, find it */
	free_pages_check_bad(page);
	return 1;
}

static int free_tail_pages_check(struct page *head_page, struct page *page)
{
	int ret = 1;

	/*
	 * We rely page->lru.next never has bit 0 set, unless the page
	 * is PageTail(). Let's make sure that's true even for poisoned ->lru.
	 */
	BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);

	if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
		ret = 0;
		goto out;
	}
	switch (page - head_page) {
	case 1:
		/* the first tail page: ->mapping may be compound_mapcount() */
		if (unlikely(compound_mapcount(page))) {
			bad_page(page, "nonzero compound_mapcount", 0);
			goto out;
		}
		break;
	case 2:
		/*
		 * the second tail page: ->mapping is
		 * deferred_list.next -- ignore value.
		 */
		break;
	default:
		if (page->mapping != TAIL_MAPPING) {
			bad_page(page, "corrupted mapping in tail page", 0);
			goto out;
		}
		break;
	}
	if (unlikely(!PageTail(page))) {
		bad_page(page, "PageTail not set", 0);
		goto out;
	}
	if (unlikely(compound_head(page) != head_page)) {
		bad_page(page, "compound_head not consistent", 0);
		goto out;
	}
	ret = 0;
out:
	page->mapping = NULL;
	clear_compound_head(page);
	return ret;
}

static __always_inline bool free_pages_prepare(struct page *page,
					unsigned int order, bool check_free)
{
	int bad = 0;

	VM_BUG_ON_PAGE(PageTail(page), page);

	trace_mm_page_free(page, order);

	/*
	 * Check tail pages before head page information is cleared to
	 * avoid checking PageCompound for order-0 pages.
	 */
	if (unlikely(order)) {
		bool compound = PageCompound(page);
		int i;

		VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);

		if (compound)
			ClearPageDoubleMap(page);
		for (i = 1; i < (1 << order); i++) {
			if (compound)
				bad += free_tail_pages_check(page, page + i);
			if (unlikely(free_pages_check(page + i))) {
				bad++;
				continue;
			}
			(page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
		}
	}
	if (PageMappingFlags(page))
		page->mapping = NULL;
	if (memcg_kmem_enabled() && PageKmemcg(page))
		memcg_kmem_uncharge(page, order);
	if (check_free)
		bad += free_pages_check(page);
	if (bad)
		return false;

	page_cpupid_reset_last(page);
	page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
	reset_page_owner(page, order);

	if (!PageHighMem(page)) {
		debug_check_no_locks_freed(page_address(page),
					   PAGE_SIZE << order);
		debug_check_no_obj_freed(page_address(page),
					   PAGE_SIZE << order);
	}
	arch_free_page(page, order);
	kernel_poison_pages(page, 1 << order, 0);
	kernel_map_pages(page, 1 << order, 0);
	kasan_free_pages(page, order);

	return true;
}

#ifdef CONFIG_DEBUG_VM
static inline bool free_pcp_prepare(struct page *page)
{
	return free_pages_prepare(page, 0, true);
}

static inline bool bulkfree_pcp_prepare(struct page *page)
{
	return false;
}
#else
static bool free_pcp_prepare(struct page *page)
{
	return free_pages_prepare(page, 0, false);
}

static bool bulkfree_pcp_prepare(struct page *page)
{
	return free_pages_check(page);
}
#endif /* CONFIG_DEBUG_VM */

static inline void prefetch_buddy(struct page *page)
{
	unsigned long pfn = page_to_pfn(page);
	unsigned long buddy_pfn = __find_buddy_pfn(pfn, 0);
	struct page *buddy = page + (buddy_pfn - pfn);

	prefetch(buddy);
}

/*
 * Frees a number of pages from the PCP lists
 * Assumes all pages on list are in same zone, and of same order.
 * count is the number of pages to free.
 *
 * If the zone was previously in an "all pages pinned" state then look to
 * see if this freeing clears that state.
 *
 * And clear the zone's pages_scanned counter, to hold off the "all pages are
 * pinned" detection logic.
 */
static void free_pcppages_bulk(struct zone *zone, int count,
					struct per_cpu_pages *pcp)
{
	int migratetype = 0;
	int batch_free = 0;
	int prefetch_nr = 0;
	bool isolated_pageblocks;
	struct page *page, *tmp;
	LIST_HEAD(head);

	while (count) {
		struct list_head *list;

		/*
		 * Remove pages from lists in a round-robin fashion. A
		 * batch_free count is maintained that is incremented when an
		 * empty list is encountered.  This is so more pages are freed
		 * off fuller lists instead of spinning excessively around empty
		 * lists
		 */
		do {
			batch_free++;
			if (++migratetype == MIGRATE_PCPTYPES)
				migratetype = 0;
			list = &pcp->lists[migratetype];
		} while (list_empty(list));

		/* This is the only non-empty list. Free them all. */
		if (batch_free == MIGRATE_PCPTYPES)
			batch_free = count;

		do {
			page = list_last_entry(list, struct page, lru);
			/* must delete to avoid corrupting pcp list */
			list_del(&page->lru);
			pcp->count--;

			if (bulkfree_pcp_prepare(page))
				continue;

			list_add_tail(&page->lru, &head);

			/*
			 * We are going to put the page back to the global
			 * pool, prefetch its buddy to speed up later access
			 * under zone->lock. It is believed the overhead of
			 * an additional test and calculating buddy_pfn here
			 * can be offset by reduced memory latency later. To
			 * avoid excessive prefetching due to large count, only
			 * prefetch buddy for the first pcp->batch nr of pages.
			 */
			if (prefetch_nr++ < pcp->batch)
				prefetch_buddy(page);
		} while (--count && --batch_free && !list_empty(list));
	}

	spin_lock(&zone->lock);
	isolated_pageblocks = has_isolate_pageblock(zone);

	/*
	 * Use safe version since after __free_one_page(),
	 * page->lru.next will not point to original list.
	 */
	list_for_each_entry_safe(page, tmp, &head, lru) {
		int mt = get_pcppage_migratetype(page);
		/* MIGRATE_ISOLATE page should not go to pcplists */
		VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
		/* Pageblock could have been isolated meanwhile */
		if (unlikely(isolated_pageblocks))
			mt = get_pageblock_migratetype(page);

		__free_one_page(page, page_to_pfn(page), zone, 0, mt);
		trace_mm_page_pcpu_drain(page, 0, mt);
	}
	spin_unlock(&zone->lock);
}

static void free_one_page(struct zone *zone,
				struct page *page, unsigned long pfn,
				unsigned int order,
				int migratetype)
{
	spin_lock(&zone->lock);
	if (unlikely(has_isolate_pageblock(zone) ||
		is_migrate_isolate(migratetype))) {
		migratetype = get_pfnblock_migratetype(page, pfn);
	}
	__free_one_page(page, pfn, zone, order, migratetype);
	spin_unlock(&zone->lock);
}

static void __meminit __init_single_page(struct page *page, unsigned long pfn,
				unsigned long zone, int nid)
{
	mm_zero_struct_page(page);
	set_page_links(page, zone, nid, pfn);
	init_page_count(page);
	page_mapcount_reset(page);
	page_cpupid_reset_last(page);

	INIT_LIST_HEAD(&page->lru);
#ifdef WANT_PAGE_VIRTUAL
	/* The shift won't overflow because ZONE_NORMAL is below 4G. */
	if (!is_highmem_idx(zone))
		set_page_address(page, __va(pfn << PAGE_SHIFT));
#endif
}

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static void __meminit init_reserved_page(unsigned long pfn)
{
	pg_data_t *pgdat;
	int nid, zid;

	if (!early_page_uninitialised(pfn))
		return;

	nid = early_pfn_to_nid(pfn);
	pgdat = NODE_DATA(nid);

	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
		struct zone *zone = &pgdat->node_zones[zid];

		if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
			break;
	}
	__init_single_page(pfn_to_page(pfn), pfn, zid, nid);
}
#else
static inline void init_reserved_page(unsigned long pfn)
{
}
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */

/*
 * Initialised pages do not have PageReserved set. This function is
 * called for each range allocated by the bootmem allocator and
 * marks the pages PageReserved. The remaining valid pages are later
 * sent to the buddy page allocator.
 */
void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
{
	unsigned long start_pfn = PFN_DOWN(start);
	unsigned long end_pfn = PFN_UP(end);

	for (; start_pfn < end_pfn; start_pfn++) {
		if (pfn_valid(start_pfn)) {
			struct page *page = pfn_to_page(start_pfn);

			init_reserved_page(start_pfn);

			/* Avoid false-positive PageTail() */
			INIT_LIST_HEAD(&page->lru);

			SetPageReserved(page);
		}
	}
}

static void __free_pages_ok(struct page *page, unsigned int order)
{
	unsigned long flags;
	int migratetype;
	unsigned long pfn = page_to_pfn(page);

	if (!free_pages_prepare(page, order, true))
		return;

	migratetype = get_pfnblock_migratetype(page, pfn);
	local_irq_save(flags);
	__count_vm_events(PGFREE, 1 << order);
	free_one_page(page_zone(page), page, pfn, order, migratetype);
	local_irq_restore(flags);
}

static void __init __free_pages_boot_core(struct page *page, unsigned int order)
{
	unsigned int nr_pages = 1 << order;
	struct page *p = page;
	unsigned int loop;

	prefetchw(p);
	for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
		prefetchw(p + 1);
		__ClearPageReserved(p);
		set_page_count(p, 0);
	}
	__ClearPageReserved(p);
	set_page_count(p, 0);

	page_zone(page)->managed_pages += nr_pages;
	set_page_refcounted(page);
	__free_pages(page, order);
}

#if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \
	defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)

static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;

int __meminit early_pfn_to_nid(unsigned long pfn)
{
	static DEFINE_SPINLOCK(early_pfn_lock);
	int nid;

	spin_lock(&early_pfn_lock);
	nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
	if (nid < 0)
		nid = first_online_node;
	spin_unlock(&early_pfn_lock);

	return nid;
}
#endif

#ifdef CONFIG_NODES_SPAN_OTHER_NODES
static inline bool __meminit __maybe_unused
meminit_pfn_in_nid(unsigned long pfn, int node,
		   struct mminit_pfnnid_cache *state)
{
	int nid;

	nid = __early_pfn_to_nid(pfn, state);
	if (nid >= 0 && nid != node)
		return false;
	return true;
}

/* Only safe to use early in boot when initialisation is single-threaded */
static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
{
	return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache);
}

#else

static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
{
	return true;
}
static inline bool __meminit  __maybe_unused
meminit_pfn_in_nid(unsigned long pfn, int node,
		   struct mminit_pfnnid_cache *state)
{
	return true;
}
#endif


void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
							unsigned int order)
{
	if (early_page_uninitialised(pfn))
		return;
	return __free_pages_boot_core(page, order);
}

/*
 * Check that the whole (or subset of) a pageblock given by the interval of
 * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
 * with the migration of free c