/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright(c) 2015 Intel Corporation. All rights reserved. */
#include <linux/device.h>
#include <linux/io.h>
#include <linux/kasan.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/pfn_t.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/types.h>
#include <linux/wait_bit.h>
#include <linux/xarray.h>
#include <linux/hmm.h>

static DEFINE_XARRAY(pgmap_array);
#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
#define SECTION_SIZE (1UL << PA_SECTION_SHIFT)

#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
		       unsigned long addr,
		       swp_entry_t entry,
		       unsigned int flags,
		       pmd_t *pmdp)
{
	struct page *page = device_private_entry_to_page(entry);
	struct hmm_devmem *devmem;

	devmem = container_of(page->pgmap, typeof(*devmem), pagemap);

	/*
	 * The page_fault() callback must migrate the page back to system
	 * memory so that the CPU can access it. This might fail for various
	 * reasons (device issue, device was unsafely unplugged, ...). When
	 * such error conditions happen, the callback must return
	 * VM_FAULT_SIGBUS.
	 *
	 * Note that because memory cgroup charges are accounted to the device
	 * memory, this should never fail because of memory restrictions (but
	 * allocation of a regular system page might still fail because we are
	 * out of memory).
	 *
	 * There is a more in-depth description of what that callback can and
	 * cannot do, in include/linux/memremap.h
	 */
	return devmem->page_fault(vma, addr, page, flags, pmdp);
}
EXPORT_SYMBOL(device_private_entry_fault);
#endif /* CONFIG_DEVICE_PRIVATE */

static void pgmap_array_delete(struct resource *res)
{
	xa_store_range(&pgmap_array, PHYS_PFN(res->start), PHYS_PFN(res->end),
			NULL, GFP_KERNEL);
	synchronize_rcu();
}

static unsigned long pfn_first(struct dev_pagemap *pgmap)
{
	const struct resource *res = &pgmap->res;
	struct vmem_altmap *altmap = &pgmap->altmap;
	unsigned long pfn;

	pfn = res->start >> PAGE_SHIFT;
	if (pgmap->altmap_valid)
		pfn += vmem_altmap_offset(altmap);
	return pfn;
}

static unsigned long pfn_end(struct dev_pagemap *pgmap)
{
	const struct resource *res = &pgmap->res;

	return (res->start + resource_size(res)) >> PAGE_SHIFT;
}

static unsigned long pfn_next(unsigned long pfn)
{
	if (pfn % 1024 == 0)
		cond_resched();
	return pfn + 1;
}

#define for_each_device_pfn(pfn, map) \
	for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn))

static void devm_memremap_pages_release(void *data)
{
	struct dev_pagemap *pgmap = data;
	struct device *dev = pgmap->dev;
	struct resource *res = &pgmap->res;
	resource_size_t align_start, align_size;
	unsigned long pfn;
	int nid;

	pgmap->kill(pgmap->ref);
	for_each_device_pfn(pfn, pgmap)
		put_page(pfn_to_page(pfn));

	/* pages are dead and unused, undo the arch mapping */
	align_start = res->start & ~(SECTION_SIZE - 1);
	align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
		- align_start;

	nid = page_to_nid(pfn_to_page(align_start >> PAGE_SHIFT));

	mem_hotplug_begin();
	if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
		pfn = align_start >> PAGE_SHIFT;
		__remove_pages(page_zone(pfn_to_page(pfn)), pfn,
				align_size >> PAGE_SHIFT, NULL);
	} else {
		arch_remove_memory(nid, align_start, align_size,
				pgmap->altmap_valid ? &pgmap->altmap : NULL);
		kasan_remove_zero_shadow(__va(align_start), align_size);
	}
	mem_hotplug_done();

	untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
	pgmap_array_delete(res);
	dev_WARN_ONCE(dev, pgmap->altmap.alloc,
		      "%s: failed to free all reserved pages\n", __func__);
}

/**
 * devm_memremap_pages - remap and provide memmap backing for the given resource
 * @dev: hosting device for @res
 * @pgmap: pointer to a struct dev_pagemap
 *
 * Notes:
 * 1/ At a minimum the res, ref and type members of @pgmap must be initialized
 *    by the caller before passing it to this function
 *
 * 2/ The altmap field may optionally be initialized, in which case altmap_valid
 *    must be set to true
 *
 * 3/ pgmap->ref must be 'live' on entry and will be killed at
 *    devm_memremap_pages_release() time, or if this routine fails.
 *
 * 4/ res is expected to be a host memory range that could feasibly be
 *    treated as a "System RAM" range, i.e. not a device mmio range, but
 *    this is not enforced.
 */
void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
{
	resource_size_t align_start, align_size, align_end;
	struct vmem_altmap *altmap = pgmap->altmap_valid ?
			&pgmap->altmap : NULL;
	struct resource *res = &pgmap->res;
	struct dev_pagemap *conflict_pgmap;
	pgprot_t pgprot = PAGE_KERNEL;
	int error, nid, is_ram;

	if (!pgmap->ref || !pgmap->kill)
		return ERR_PTR(-EINVAL);

	align_start = res->start & ~(SECTION_SIZE - 1);
	align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
		- align_start;
	align_end = align_start + align_size - 1;

	conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_start), NULL);
	if (conflict_pgmap) {
		dev_WARN(dev, "Conflicting mapping in same section\n");
		put_dev_pagemap(conflict_pgmap);
		return ERR_PTR(-ENOMEM);
	}

	conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_end), NULL);
	if (conflict_pgmap) {
		dev_WARN(dev, "Conflicting mapping in same section\n");
		put_dev_pagemap(conflict_pgmap);
		return ERR_PTR(-ENOMEM);
	}

	is_ram = region_intersects(align_start, align_size,
		IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);

	if (is_ram != REGION_DISJOINT) {
		WARN_ONCE(1, "%s attempted on %s region %pr\n", __func__,
				is_ram == REGION_MIXED ? "mixed" : "ram", res);
		error = -ENXIO;
		goto err_array;
	}

	pgmap->dev = dev;

	error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(res->start),
				PHYS_PFN(res->end), pgmap, GFP_KERNEL));
	if (error)
		goto err_array;

	nid = dev_to_node(dev);
	if (nid < 0)
		nid = numa_mem_id();

	error = track_pfn_remap(NULL, &pgprot, PHYS_PFN(align_start), 0,
			align_size);
	if (error)
		goto err_pfn_remap;

	mem_hotplug_begin();

	/*
	 * For device private memory we call add_pages() as we only need to
	 * allocate and initialize struct page for the device memory.
	 * Moreover, the device memory is inaccessible, so we do not want to
	 * create a linear mapping for the memory like arch_add_memory()
	 * would do.
	 *
	 * For all other device memory types, which are accessible by
	 * the CPU, we do want the linear mapping and thus use
	 * arch_add_memory().
	 */
	if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
		error = add_pages(nid, align_start >> PAGE_SHIFT,
				align_size >> PAGE_SHIFT, NULL, false);
	} else {
		error = kasan_add_zero_shadow(__va(align_start), align_size);
		if (error) {
			mem_hotplug_done();
			goto err_kasan;
		}

		error = arch_add_memory(nid, align_start, align_size, altmap,
				false);
	}

	if (!error) {
		struct zone *zone;

		zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
		move_pfn_range_to_zone(zone, align_start >> PAGE_SHIFT,
				align_size >> PAGE_SHIFT, altmap);
	}

	mem_hotplug_done();
	if (error)
		goto err_add_memory;

	/*
	 * Initialization of the pages has been deferred until now in order
	 * to allow us to do the work while not holding the hotplug lock.
	 */
	memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
				align_start >> PAGE_SHIFT,
				align_size >> PAGE_SHIFT, pgmap);
	percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap));

	error = devm_add_action_or_reset(dev, devm_memremap_pages_release,
			pgmap);
	if (error)
		return ERR_PTR(error);

	return __va(res->start);

 err_add_memory:
	kasan_remove_zero_shadow(__va(align_start), align_size);
 err_kasan:
	untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
 err_pfn_remap:
	pgmap_array_delete(res);
 err_array:
	pgmap->kill(pgmap->ref);
	return ERR_PTR(error);
}
EXPORT_SYMBOL_GPL(devm_memremap_pages);
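
/*
 * Illustrative sketch, not part of the original file: roughly how a
 * hypothetical driver might populate a dev_pagemap before calling
 * devm_memremap_pages().  The names my_res, my_ref and my_pgmap_kill are
 * made up; only the res, ref (already live), kill and type members reflect
 * the requirements documented above.
 *
 *	static void my_pgmap_kill(struct percpu_ref *ref)
 *	{
 *		percpu_ref_kill(ref);
 *	}
 *
 *	pgmap->res = *my_res;
 *	pgmap->ref = &my_ref;
 *	pgmap->kill = my_pgmap_kill;
 *	pgmap->type = MEMORY_DEVICE_FS_DAX;
 *	addr = devm_memremap_pages(dev, pgmap);
 *	if (IS_ERR(addr))
 *		return PTR_ERR(addr);
 */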

unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
{
	/* number of pfns from base where pfn_to_page() is valid */
	return altmap->reserve + altmap->free;
}

void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
{
	altmap->alloc -= nr_pfns;
}
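
/*
 * Illustrative note, not part of the original file: with a made-up altmap of
 * reserve = 64 and free = 1024, vmem_altmap_offset() returns 1088, i.e. the
 * first 1088 pfns of the range are reserved or consumed by the memmap itself,
 * which is why pfn_first() above skips past them.
 */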

/**
 * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn
 * @pfn: page frame number to lookup page_map
 * @pgmap: optional known pgmap that already has a reference
 *
 * If @pgmap is non-NULL and covers @pfn it will be returned as-is.  If @pgmap
 * is non-NULL but does not cover @pfn the reference to it will be released.
 */
struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
		struct dev_pagemap *pgmap)
{
	resource_size_t phys = PFN_PHYS(pfn);

	/*
	 * In the cached case we're already holding a live reference.
	 */
	if (pgmap) {
		if (phys >= pgmap->res.start && phys <= pgmap->res.end)
			return pgmap;
		put_dev_pagemap(pgmap);
	}

	/* fall back to slow path lookup */
	rcu_read_lock();
	pgmap = xa_load(&pgmap_array, PHYS_PFN(phys));
	if (pgmap && !percpu_ref_tryget_live(pgmap->ref))
		pgmap = NULL;
	rcu_read_unlock();

	return pgmap;
}
EXPORT_SYMBOL_GPL(get_dev_pagemap);
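
/*
 * Illustrative sketch, not part of the original file: a typical caller
 * pattern, with "pfn" standing in for some page frame number of interest.
 *
 *	struct dev_pagemap *pgmap;
 *
 *	pgmap = get_dev_pagemap(pfn, NULL);
 *	if (pgmap) {
 *		... pfn is device memory covered by pgmap ...
 *		put_dev_pagemap(pgmap);
 *	}
 */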

#ifdef CONFIG_DEV_PAGEMAP_OPS
DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
EXPORT_SYMBOL(devmap_managed_key);
static atomic_t devmap_enable;

/*
 * Toggle the static key for ->page_free() callbacks when dev_pagemap
 * pages go idle.
 */
void dev_pagemap_get_ops(void)
{
	if (atomic_inc_return(&devmap_enable) == 1)
		static_branch_enable(&devmap_managed_key);
}
EXPORT_SYMBOL_GPL(dev_pagemap_get_ops);

void dev_pagemap_put_ops(void)
{
	if (atomic_dec_and_test(&devmap_enable))
		static_branch_disable(&devmap_managed_key);
}
EXPORT_SYMBOL_GPL(dev_pagemap_put_ops);
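
/*
 * Illustrative sketch, not part of the original file: a hypothetical
 * provider of MEMORY_DEVICE_* pages would pair these calls around the
 * lifetime of its pagemap so that put_page() routes idle pages to the
 * ->page_free() callback:
 *
 *	dev_pagemap_get_ops();
 *	addr = devm_memremap_pages(dev, pgmap);
 *	...
 *	dev_pagemap_put_ops();
 *
 * with the put happening once the pages are no longer in use.
 */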

void __put_devmap_managed_page(struct page *page)
{
	int count = page_ref_dec_return(page);

	/*
	 * If refcount is 1 then page is freed and refcount is stable as nobody
	 * holds a reference on the page.
	 */
	if (count == 1) {
		/* Clear Active bit in case of parallel mark_page_accessed */
		__ClearPageActive(page);
		__ClearPageWaiters(page);

		mem_cgroup_uncharge(page);

		page->pgmap->page_free(page, page->pgmap->data);
	} else if (!count)
		__put_page(page);
}
EXPORT_SYMBOL(__put_devmap_managed_page);
#endif /* CONFIG_DEV_PAGEMAP_OPS */