/*
 *  linux/fs/namespace.c
 *
 * (C) Copyright Al Viro 2000, 2001
 *	Released under GPL v2.
 *
 * Based on code from fs/super.c, copyright Linus Torvalds and others.
 * Heavily rewritten.
 */

#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/capability.h>
#include <linux/mnt_namespace.h>
#include <linux/user_namespace.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/idr.h>
#include <linux/init.h>		/* init_rootfs */
#include <linux/fs_struct.h>	/* get_fs_root et al. */
#include <linux/fsnotify.h>	/* fsnotify_vfsmount_delete */
#include <linux/uaccess.h>
#include <linux/proc_ns.h>
#include <linux/magic.h>
#include <linux/bootmem.h>
#include <linux/task_work.h>
#include "pnode.h"
#include "internal.h"

static unsigned int m_hash_mask __read_mostly;
static unsigned int m_hash_shift __read_mostly;
static unsigned int mp_hash_mask __read_mostly;
static unsigned int mp_hash_shift __read_mostly;

static __initdata unsigned long mhash_entries;
static int __init set_mhash_entries(char *str)
{
	if (!str)
		return 0;
	mhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("mhash_entries=", set_mhash_entries);

static __initdata unsigned long mphash_entries;
static int __init set_mphash_entries(char *str)
{
	if (!str)
		return 0;
	mphash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("mphash_entries=", set_mphash_entries);
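
/*
 * Illustrative note (added, not part of the original source): both hash
 * sizes can be overridden at boot, e.g. on the kernel command line:
 *
 *	mhash_entries=8192 mphash_entries=2048
 *
 * The __setup() handlers above only parse and stash the values; the tables
 * themselves are sized and allocated later, during VFS initialization.
 */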

static u64 event;
static DEFINE_IDA(mnt_id_ida);
static DEFINE_IDA(mnt_group_ida);
static DEFINE_SPINLOCK(mnt_id_lock);
static int mnt_id_start = 0;
static int mnt_group_start = 1;

static struct hlist_head *mount_hashtable __read_mostly;
static struct hlist_head *mountpoint_hashtable __read_mostly;
static struct kmem_cache *mnt_cache __read_mostly;
static DECLARE_RWSEM(namespace_sem);

/* /sys/fs */
struct kobject *fs_kobj;
EXPORT_SYMBOL_GPL(fs_kobj);

/*
 * vfsmount lock may be taken for read to prevent changes to the
 * vfsmount hash, i.e. during mountpoint lookups or walking back
 * up the tree.
 *
 * It should be taken for write in all cases where the vfsmount
 * tree or hash is modified or when a vfsmount structure is modified.
 */
__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
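
/*
 * A minimal sketch (added here for illustration, not part of the original
 * file) of the lockless read side that mount_lock supports; lookup_mnt()
 * below uses exactly this shape.  The helper name is hypothetical.
 */
#if 0
static struct mount *example_hash_lookup(struct vfsmount *mnt,
					 struct dentry *dentry)
{
	struct mount *m;
	unsigned seq;

	rcu_read_lock();
	do {
		seq = read_seqbegin(&mount_lock);
		m = __lookup_mnt(mnt, dentry);	/* may race with writers */
	} while (read_seqretry(&mount_lock, seq));
	rcu_read_unlock();
	/* a real caller must still take a reference, see legitimize_mnt() */
	return m;
}
#endif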

static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry)
{
	unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
	tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
	tmp = tmp + (tmp >> m_hash_shift);
	return &mount_hashtable[tmp & m_hash_mask];
}

static inline struct hlist_head *mp_hash(struct dentry *dentry)
{
	unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES);
	tmp = tmp + (tmp >> mp_hash_shift);
	return &mountpoint_hashtable[tmp & mp_hash_mask];
}

/*
 * allocation is serialized by namespace_sem, but we need the spinlock to
 * serialize with freeing.
 */
static int mnt_alloc_id(struct mount *mnt)
{
	int res;

retry:
	ida_pre_get(&mnt_id_ida, GFP_KERNEL);
	spin_lock(&mnt_id_lock);
	res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id);
	if (!res)
		mnt_id_start = mnt->mnt_id + 1;
	spin_unlock(&mnt_id_lock);
	if (res == -EAGAIN)
		goto retry;

	return res;
}

static void mnt_free_id(struct mount *mnt)
{
	int id = mnt->mnt_id;
	spin_lock(&mnt_id_lock);
	ida_remove(&mnt_id_ida, id);
	if (mnt_id_start > id)
		mnt_id_start = id;
	spin_unlock(&mnt_id_lock);
}

/*
 * Allocate a new peer group ID
 *
 * mnt_group_ida is protected by namespace_sem
 */
static int mnt_alloc_group_id(struct mount *mnt)
{
	int res;

	if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL))
		return -ENOMEM;

	res = ida_get_new_above(&mnt_group_ida,
				mnt_group_start,
				&mnt->mnt_group_id);
	if (!res)
		mnt_group_start = mnt->mnt_group_id + 1;

	return res;
}

/*
 * Release a peer group ID
 */
void mnt_release_group_id(struct mount *mnt)
{
	int id = mnt->mnt_group_id;
	ida_remove(&mnt_group_ida, id);
	if (mnt_group_start > id)
		mnt_group_start = id;
	mnt->mnt_group_id = 0;
}

/*
 * vfsmount lock must be held for read
 */
static inline void mnt_add_count(struct mount *mnt, int n)
{
#ifdef CONFIG_SMP
	this_cpu_add(mnt->mnt_pcp->mnt_count, n);
#else
	preempt_disable();
	mnt->mnt_count += n;
	preempt_enable();
#endif
}

/*
 * vfsmount lock must be held for write
 */
unsigned int mnt_get_count(struct mount *mnt)
{
#ifdef CONFIG_SMP
	unsigned int count = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
	}

	return count;
#else
	return mnt->mnt_count;
#endif
}

static void drop_mountpoint(struct fs_pin *p)
{
	struct mount *m = container_of(p, struct mount, mnt_umount);
	dput(m->mnt_ex_mountpoint);
	pin_remove(p);
	mntput(&m->mnt);
}

static struct mount *alloc_vfsmnt(const char *name)
{
	struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
	if (mnt) {
		int err;

		err = mnt_alloc_id(mnt);
		if (err)
			goto out_free_cache;

		if (name) {
			mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL);
			if (!mnt->mnt_devname)
				goto out_free_id;
		}

#ifdef CONFIG_SMP
		mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
		if (!mnt->mnt_pcp)
			goto out_free_devname;

		this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
#else
		mnt->mnt_count = 1;
		mnt->mnt_writers = 0;
#endif

		INIT_HLIST_NODE(&mnt->mnt_hash);
		INIT_LIST_HEAD(&mnt->mnt_child);
		INIT_LIST_HEAD(&mnt->mnt_mounts);
		INIT_LIST_HEAD(&mnt->mnt_list);
		INIT_LIST_HEAD(&mnt->mnt_expire);
		INIT_LIST_HEAD(&mnt->mnt_share);
		INIT_LIST_HEAD(&mnt->mnt_slave_list);
		INIT_LIST_HEAD(&mnt->mnt_slave);
		INIT_HLIST_NODE(&mnt->mnt_mp_list);
#ifdef CONFIG_FSNOTIFY
		INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
#endif
		init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
	}
	return mnt;

#ifdef CONFIG_SMP
out_free_devname:
	kfree_const(mnt->mnt_devname);
#endif
out_free_id:
	mnt_free_id(mnt);
out_free_cache:
	kmem_cache_free(mnt_cache, mnt);
	return NULL;
}

/*
 * Most r/o checks on a fs are for operations that take
 * discrete amounts of time, like a write() or unlink().
 * We must keep track of when those operations start
 * (for permission checks) and when they end, so that
 * we can determine when writes are able to occur to
 * a filesystem.
 */
/*
 * __mnt_is_readonly: check whether a mount is read-only
 * @mnt: the mount to check for its write status
 *
 * This shouldn't be used directly outside of the VFS.
 * It does not guarantee that the filesystem will stay
 * r/w, just that it is right *now*.  This cannot and
 * should not be used in place of IS_RDONLY(inode).
 * mnt_want/drop_write() will _keep_ the filesystem
 * r/w.
 */
int __mnt_is_readonly(struct vfsmount *mnt)
{
	if (mnt->mnt_flags & MNT_READONLY)
		return 1;
	if (mnt->mnt_sb->s_flags & MS_RDONLY)
		return 1;
	return 0;
}
EXPORT_SYMBOL_GPL(__mnt_is_readonly);

static inline void mnt_inc_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
	this_cpu_inc(mnt->mnt_pcp->mnt_writers);
#else
	mnt->mnt_writers++;
#endif
}

static inline void mnt_dec_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
	this_cpu_dec(mnt->mnt_pcp->mnt_writers);
#else
	mnt->mnt_writers--;
#endif
}

static unsigned int mnt_get_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
	unsigned int count = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
	}

	return count;
#else
	return mnt->mnt_writers;
#endif
}

static int mnt_is_readonly(struct vfsmount *mnt)
{
	if (mnt->mnt_sb->s_readonly_remount)
		return 1;
	/* Order wrt setting s_flags/s_readonly_remount in do_remount() */
	smp_rmb();
	return __mnt_is_readonly(mnt);
}

/*
 * Most r/o & frozen checks on a fs are for operations that take discrete
 * amounts of time, like a write() or unlink().  We must keep track of when
 * those operations start (for permission checks) and when they end, so that we
 * can determine when writes are able to occur to a filesystem.
 */
/**
 * __mnt_want_write - get write access to a mount without freeze protection
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mount is read-write) before
 * returning success. This operation does not protect against filesystem being
 * frozen. When the write operation is finished, __mnt_drop_write() must be
 * called. This is effectively a refcount.
 */
int __mnt_want_write(struct vfsmount *m)
{
	struct mount *mnt = real_mount(m);
	int ret = 0;

	preempt_disable();
	mnt_inc_writers(mnt);
	/*
	 * The store to mnt_inc_writers must be visible before we pass
	 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
	 * incremented count after it has set MNT_WRITE_HOLD.
	 */
	smp_mb();
	while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
		cpu_relax();
	/*
	 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
	 * be set to match its requirements. So we must not load that until
	 * MNT_WRITE_HOLD is cleared.
	 */
	smp_rmb();
	if (mnt_is_readonly(m)) {
		mnt_dec_writers(mnt);
		ret = -EROFS;
	}
	preempt_enable();

	return ret;
}

/**
 * mnt_want_write - get write access to a mount
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mount is read-write, filesystem
 * is not frozen) before returning success.  When the write operation is
 * finished, mnt_drop_write() must be called.  This is effectively a refcount.
 */
int mnt_want_write(struct vfsmount *m)
{
	int ret;

	sb_start_write(m->mnt_sb);
	ret = __mnt_want_write(m);
	if (ret)
		sb_end_write(m->mnt_sb);
	return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write);
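
/*
 * Illustrative sketch (added, not from the original file): the pattern a
 * typical caller follows around a modification; the helper name and the
 * elided modification step are hypothetical.
 */
#if 0
static int example_modify_fs(struct path *path)
{
	int err = mnt_want_write(path->mnt);	/* r/o + freeze protection */
	if (err)
		return err;
	/* ... perform the actual write-side operation here ... */
	mnt_drop_write(path->mnt);		/* pairs with mnt_want_write() */
	return 0;
}
#endif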

/**
 * mnt_clone_write - get write access to a mount
 * @mnt: the mount on which to take a write
 *
 * This is effectively like mnt_want_write, except
 * it must only be used to take an extra write reference
 * on a mountpoint that we already know has a write reference
 * on it. This allows some optimisation.
 *
 * After finished, mnt_drop_write must be called as usual to
 * drop the reference.
 */
int mnt_clone_write(struct vfsmount *mnt)
{
	/* superblock may be r/o */
	if (__mnt_is_readonly(mnt))
		return -EROFS;
	preempt_disable();
	mnt_inc_writers(real_mount(mnt));
	preempt_enable();
	return 0;
}
EXPORT_SYMBOL_GPL(mnt_clone_write);

/**
 * __mnt_want_write_file - get write access to a file's mount
 * @file: the file whose mount to take a write on
 *
 * This is like __mnt_want_write, but it takes a file and can
 * do some optimisations if the file is open for write already
 */
int __mnt_want_write_file(struct file *file)
{
	if (!(file->f_mode & FMODE_WRITER))
		return __mnt_want_write(file->f_path.mnt);
	else
		return mnt_clone_write(file->f_path.mnt);
}

/**
 * mnt_want_write_file - get write access to a file's mount
 * @file: the file whose mount to take a write on
 *
 * This is like mnt_want_write, but it takes a file and can
 * do some optimisations if the file is open for write already
 */
int mnt_want_write_file(struct file *file)
{
	int ret;

	sb_start_write(file->f_path.mnt->mnt_sb);
	ret = __mnt_want_write_file(file);
	if (ret)
		sb_end_write(file->f_path.mnt->mnt_sb);
	return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write_file);
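
/*
 * Hypothetical usage sketch (not part of the original source), e.g. from
 * an ioctl handler that modifies the filesystem backing @file:
 */
#if 0
static long example_ioctl_set_flags(struct file *file)
{
	int err = mnt_want_write_file(file);
	if (err)
		return err;
	/* ... update on-disk state through file->f_path ... */
	mnt_drop_write_file(file);	/* pairs with mnt_want_write_file() */
	return 0;
}
#endif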

/**
 * __mnt_drop_write - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done
 * performing writes to it.  Must be matched with
 * __mnt_want_write() call above.
 */
void __mnt_drop_write(struct vfsmount *mnt)
{
	preempt_disable();
	mnt_dec_writers(real_mount(mnt));
	preempt_enable();
}

/**
 * mnt_drop_write - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done performing writes to it and
 * also allows filesystem to be frozen again.  Must be matched with
 * mnt_want_write() call above.
 */
void mnt_drop_write(struct vfsmount *mnt)
{
	__mnt_drop_write(mnt);
	sb_end_write(mnt->mnt_sb);
}
EXPORT_SYMBOL_GPL(mnt_drop_write);

void __mnt_drop_write_file(struct file *file)
{
	__mnt_drop_write(file->f_path.mnt);
}

void mnt_drop_write_file(struct file *file)
{
	mnt_drop_write(file->f_path.mnt);
}
EXPORT_SYMBOL(mnt_drop_write_file);

static int mnt_make_readonly(struct mount *mnt)
{
	int ret = 0;

	lock_mount_hash();
	mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
	/*
	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
	 * should be visible before we do.
	 */
	smp_mb();

	/*
	 * With writers on hold, if this value is zero, then there are
	 * definitely no active writers (although held writers may subsequently
	 * increment the count, they'll have to wait, and decrement it after
	 * seeing MNT_READONLY).
	 *
	 * It is OK to have counter incremented on one CPU and decremented on
	 * another: the sum will add up correctly. The danger would be when we
	 * sum up each counter, if we read a counter before it is incremented,
	 * but then read another CPU's count which it has been subsequently
	 * decremented from -- we would see more decrements than we should.
	 * MNT_WRITE_HOLD protects against this scenario, because
	 * mnt_want_write first increments count, then smp_mb, then spins on
	 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
	 * we're counting up here.
	 */
	if (mnt_get_writers(mnt) > 0)
		ret = -EBUSY;
	else
		mnt->mnt.mnt_flags |= MNT_READONLY;
	/*
	 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
	 * that become unheld will see MNT_READONLY.
	 */
	smp_wmb();
	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
	unlock_mount_hash();
	return ret;
}

static void __mnt_unmake_readonly(struct mount *mnt)
{
	lock_mount_hash();
	mnt->mnt.mnt_flags &= ~MNT_READONLY;
	unlock_mount_hash();
}

int sb_prepare_remount_readonly(struct super_block *sb)
{
	struct mount *mnt;
	int err = 0;

	/* Racy optimization.  Recheck the counter under MNT_WRITE_HOLD */
	if (atomic_long_read(&sb->s_remove_count))
		return -EBUSY;

	lock_mount_hash();
	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
		if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
			mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
			smp_mb();
			if (mnt_get_writers(mnt) > 0) {
				err = -EBUSY;
				break;
			}
		}
	}
	if (!err && atomic_long_read(&sb->s_remove_count))
		err = -EBUSY;

	if (!err) {
		sb->s_readonly_remount = 1;
		smp_wmb();
	}
	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
		if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
			mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
	}
	unlock_mount_hash();

	return err;
}

static void free_vfsmnt(struct mount *mnt)
{
	kfree_const(mnt->mnt_devname);
#ifdef CONFIG_SMP
	free_percpu(mnt->mnt_pcp);
#endif
	kmem_cache_free(mnt_cache, mnt);
}

static void delayed_free_vfsmnt(struct rcu_head *head)
{
	free_vfsmnt(container_of(head, struct mount, mnt_rcu));
}

/* call under rcu_read_lock */
bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
{
	struct mount *mnt;
	if (read_seqretry(&mount_lock, seq))
		return false;
	if (bastard == NULL)
		return true;
	mnt = real_mount(bastard);
	mnt_add_count(mnt, 1);
	if (likely(!read_seqretry(&mount_lock, seq)))
		return true;
	if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
		mnt_add_count(mnt, -1);
		return false;
	}
	rcu_read_unlock();
	mntput(bastard);
	rcu_read_lock();
	return false;
}

/*
 * find the first mount at @dentry on vfsmount @mnt.
 * call under rcu_read_lock()
 */
struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
{
	struct hlist_head *head = m_hash(mnt, dentry);
	struct mount *p;

	hlist_for_each_entry_rcu(p, head, mnt_hash)
		if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
			return p;
	return NULL;
}

/*
 * find the last mount at @dentry on vfsmount @mnt.
 * mount_lock must be held.
 */
struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
{
	struct mount *p, *res;
	res = p = __lookup_mnt(mnt, dentry);
	if (!p)
		goto out;
	hlist_for_each_entry_continue(p, mnt_hash) {
		if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
			break;
		res = p;
	}
out:
	return res;
}

/*
 * lookup_mnt - Return the first child mount mounted at path
 *
 * "First" means first mounted chronologically.  If you create the
 * following mounts:
 *
 * mount /dev/sda1 /mnt
 * mount /dev/sda2 /mnt
 * mount /dev/sda3 /mnt
 *
 * Then lookup_mnt() on the base /mnt dentry in the root mount will
 * return successively the root dentry and vfsmount of /dev/sda1, then
 * /dev/sda2, then /dev/sda3, then NULL.
 *
 * lookup_mnt takes a reference to the found vfsmount.
 */
struct vfsmount *lookup_mnt(struct path *path)
{
	struct mount *child_mnt;
	struct vfsmount *m;
	unsigned seq;

	rcu_read_lock();
	do {
		seq = read_seqbegin(&mount_lock);
		child_mnt = __lookup_mnt(path->mnt, path->dentry);
		m = child_mnt ? &child_mnt->mnt : NULL;
	} while (!legitimize_mnt(m, seq));
	rcu_read_unlock();
	return m;
}
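
/*
 * Sketch of a hypothetical caller (added for illustration): stepping down
 * through whatever is stacked on a mountpoint, the way the path-walk code
 * follows mounts.
 */
#if 0
static void example_traverse_mounts(struct path *path)
{
	struct vfsmount *mounted;

	while ((mounted = lookup_mnt(path)) != NULL) {
		dput(path->dentry);
		mntput(path->mnt);	/* lookup_mnt() took a reference */
		path->mnt = mounted;
		path->dentry = dget(mounted->mnt_root);
	}
}
#endif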

/*
 * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
 *                         current mount namespace.
 *
 * The common case is dentries are not mountpoints at all and that
 * test is handled inline.  For the slow case when we are actually
 * dealing with a mountpoint of some kind, walk through all of the
 * mounts in the current mount namespace and test to see if the dentry
 * is a mountpoint.
 *
 * The mount_hashtable is not usable in this context because we
 * need to identify all mounts that may be in the current mount
 * namespace not just a mount that happens to have some specified
 * parent mount.
 */
bool __is_local_mountpoint(struct dentry *dentry)
{
	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
	struct mount *mnt;
	bool is_covered = false;

	if (!d_mountpoint(dentry))
		goto out;

	down_read(&namespace_sem);
	list_for_each_entry(mnt, &ns->list, mnt_list) {
		is_covered = (mnt->mnt_mountpoint == dentry);
		if (is_covered)
			break;
	}
	up_read(&namespace_sem);
out:
	return is_covered;
}
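
/*
 * Hypothetical caller sketch (not from the original file): operations such
 * as rmdir/rename refuse to act on something that is mounted in the
 * caller's namespace.
 */
#if 0
static int example_may_remove(struct dentry *victim)
{
	if (__is_local_mountpoint(victim))
		return -EBUSY;	/* busy as a mountpoint here */
	return 0;
}
#endif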

static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
{
	struct hlist_head *chain = mp_hash(dentry);
	struct mountpoint *mp;

	hlist_for_each_entry(mp, chain, m_hash) {
		if (mp->m_dentry == dentry) {
			/* might be worth a WARN_ON() */
			if (d_unlinked(dentry))
				return ERR_PTR(-ENOENT);
			mp->m_count++;
			return mp;
		}
	}
	return NULL;
}

static struct mountpoint *new_mountpoint(struct dentry *dentry)
{
	struct hlist_head *chain = mp_hash(dentry);
	struct mountpoint *mp;
	int ret;

	mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
	if (!mp)
		return ERR_PTR(-ENOMEM);

	ret = d_set_mounted(dentry);
	if (ret) {
		kfree(mp);
		return ERR_PTR(ret);
	}

	mp->m_dentry = dentry;
	mp->m_count = 1;
	hlist_add_head(&mp->m_hash, chain);
	INIT_HLIST_HEAD(&mp->m_list);
	return mp;
}

static void put_mountpoint(struct mountpoint *mp)
{
	if (!--mp->m_count) {
		struct dentry *dentry = mp->m_dentry;
		BUG_ON(!hlist_empty(&mp->m_list));
		spin_lock(&dentry->d_lock);
		dentry->d_flags &= ~DCACHE_MOUNTED;
		spin_unlock(&dentry->d_lock);
		hlist_del(&mp->m_hash);
		kfree(mp);
	}
}

static inline int check_mnt(struct mount *mnt)
{
	return mnt->mnt_ns == current->nsproxy->mnt_ns;
}

/*
 * vfsmount lock must be held for write
 */
static void touch_mnt_namespace(struct mnt_namespace *ns)
{
	if (ns) {
		ns->event = ++event;
		wake_up_interruptible(&ns->poll);
	}
}

/*
 * vfsmount lock must be held for write
 */
static void __touch_mnt_namespace(struct mnt_namespace *ns)
{
	if (ns && ns->event != event) {
		ns->event = event;
		wake_up_interruptible(&ns->poll);
	}
}

/*
 * vfsmount lock must be held for write
 */
static void detach_mnt(struct mount *mnt, struct path *old_path)
{
	old_path->dentry = mnt->mnt_mountpoint;
	old_path->mnt = &mnt->mnt_parent->mnt;
	mnt->mnt_parent = mnt;
	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
	list_del_init(&mnt->mnt_child);
	hlist_del_init_rcu(&mnt->mnt_hash);
	hlist_del_init(&mnt->mnt_mp_list);
	put_mountpoint(mnt->mnt_mp);
	mnt->mnt_mp = NULL;
}

/*
 * vfsmount lock must be held for write
 */
void mnt_set_mountpoint(struct mount *mnt,
			struct mountpoint *mp,
			struct mount *child_mnt)
{
	mp->m_count++;
	mnt_add_count(mnt, 1);	/* essentially, that's mntget */
	child_mnt->mnt_mountpoint = dget(mp->m_dentry);
	child_mnt->mnt_parent = mnt;
	child_mnt->mnt_mp = mp;
	hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
}

/*
 * vfsmount lock must be held for write
 */
static void attach_mnt(struct mount *mnt,
			struct mount *parent,
			struct mountpoint *mp)
{
	mnt_set_mountpoint(parent, mp, mnt);
	hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry));
	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
}

static void attach_shadowed(struct mount *mnt,
			struct mount *parent,
			struct mount *shadows)
{
	if (shadows) {
		hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash);
		list_add(&mnt->mnt_child, &shadows->mnt_child);
	} else {
		hlist_add_head_rcu(&mnt->mnt_hash,
				m_hash(&parent->mnt, mnt->mnt_mountpoint));
		list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
	}
}

/*
 * vfsmount lock must be held for write
 */
static void commit_tree(struct mount *mnt, struct mount *shadows)
{
	struct mount *parent = mnt->mnt_parent;
	struct mount *m;
	LIST_HEAD(head);
	struct mnt_namespace *n = parent->mnt_ns;

	BUG_ON(parent == mnt);

	list_add_tail(&head, &mnt->mnt_list);
	list_for_each_entry(m, &head, mnt_list)
		m->mnt_ns = n;

	list_splice(&head, n->list.prev);

	attach_shadowed(mnt, parent, shadows);
	touch_mnt_namespace(n);
}

static struct mount *next_mnt(struct mount *p, struct mount *root)
{
	struct list_head *next = p->mnt_mounts.next;
	if (next == &p->mnt_mounts) {
		while (1) {
			if (p == root)
				return NULL;
			next = p->mnt_child.next;
			if (next != &p->mnt_parent->mnt_mounts)
				break;
			p = p->mnt_parent;
		}
	}
	return list_entry(next, struct mount, mnt_child);
}

static struct mount *skip_mnt_tree(struct mount *p)
{
	struct list_head *prev = p->mnt_mounts.prev;
	while (prev != &p->mnt_mounts) {
		p = list_entry(prev, struct mount, mnt_child);
		prev = p->mnt_mounts.prev;
	}
	return p;
}

struct vfsmount *
vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
{
	struct mount *mnt;
	struct dentry *root;

	if (!type)
		return ERR_PTR(-ENODEV);

	mnt = alloc_vfsmnt(name);
	if (!mnt)
		return ERR_PTR(-ENOMEM);

	if (flags & MS_KERNMOUNT)
		mnt->mnt.mnt_flags = MNT_INTERNAL;

	root = mount_fs(type, flags, name, data);
	if (IS_ERR(root)) {
		mnt_free_id(mnt);
		free_vfsmnt(mnt);
		return ERR_CAST(root);
	}

	mnt->mnt.mnt_root = root;
	mnt->mnt.mnt_sb = root->d_sb;
	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
	mnt->mnt_parent = mnt;
	lock_mount_hash();
	list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
	unlock_mount_hash();
	return &mnt->mnt;
}
EXPORT_SYMBOL_GPL(vfs_kern_mount);
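
/*
 * Hedged example (added, not from the original source): how an in-kernel
 * user typically obtains an internal mount; "example_fs_type" is
 * hypothetical.
 */
#if 0
static struct vfsmount *example_internal_mount(struct file_system_type *example_fs_type)
{
	/* MS_KERNMOUNT makes the result MNT_INTERNAL, see above */
	return vfs_kern_mount(example_fs_type, MS_KERNMOUNT,
			      example_fs_type->name, NULL);
}
#endif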

static struct mount *clone_mnt(struct mount *old, struct dentry *root,
					int flag)
{
	struct super_block *sb = old->mnt.mnt_sb;
	struct mount *mnt;
	int err;

	mnt = alloc_vfsmnt(old->mnt_devname);
	if (!mnt)
		return ERR_PTR(-ENOMEM);

	if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
		mnt->mnt_group_id = 0; /* not a peer of original */
	else
		mnt->mnt_group_id = old->mnt_group_id;

	if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
		err = mnt_alloc_group_id(mnt);
		if (err)
			goto out_free;
Linus Torvalds's avatar
Linus Torvalds committed
954
	}

	mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED);
	/* Don't allow unprivileged users to change mount flags */
	if (flag & CL_UNPRIVILEGED) {
		mnt->mnt.mnt_flags |= MNT_LOCK_ATIME;

		if (mnt->mnt.mnt_flags & MNT_READONLY)
			mnt->mnt.mnt_flags |= MNT_LOCK_READONLY;

		if (mnt->mnt.mnt_flags & MNT_NODEV)
			mnt->mnt.mnt_flags |= MNT_LOCK_NODEV;

		if (mnt->mnt.mnt_flags & MNT_NOSUID)
			mnt->mnt.mnt_flags |= MNT_LOCK_NOSUID;

		if (mnt->mnt.mnt_flags & MNT_NOEXEC)
			mnt->mnt.mnt_flags |= MNT_LOCK_NOEXEC;
	}

	/* Don't allow unprivileged users to reveal what is under a mount */
	if ((flag & CL_UNPRIVILEGED) &&
	    (!(flag & CL_EXPIRE) || list_empty(&old->mnt_expire)))
		mnt->mnt.mnt_flags |= MNT_LOCKED;

	atomic_inc(&sb->s_active);
	mnt->mnt.mnt_sb = sb;
	mnt->mnt.mnt_root = dget(root);
	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
	mnt->mnt_parent = mnt;
	lock_mount_hash();
	list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
	unlock_mount_hash();

	if ((flag & CL_SLAVE) ||
	    ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
		list_add(&mnt->mnt_slave, &old->mnt_slave_list);
		mnt->mnt_master = old;
		CLEAR_MNT_SHARED(mnt);
	} else if (!(flag & CL_PRIVATE)) {
		if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old))
			list_add(&mnt->mnt_share, &old->mnt_share);
		if (IS_MNT_SLAVE(old))
			list_add(&mnt->mnt_slave, &old->mnt_slave);
		mnt->mnt_master = old->mnt_master;
	}
	if (flag & CL_MAKE_SHARED)
		set_mnt_shared(mnt);

	/* stick the duplicate mount on the same expiry list
	 * as the original if that was on one */
	if (flag & CL_EXPIRE) {
		if (!list_empty(&old->mnt_expire))
			list_add(&mnt->mnt_expire, &old->mnt_expire);
	}

	return mnt;

 out_free:
	mnt_free_id(mnt);
	free_vfsmnt(mnt);
	return ERR_PTR(err);
}

static void cleanup_mnt(struct mount *mnt)
{
	/*
	 * This probably indicates that somebody messed
	 * up a mnt_want/drop_write() pair.  If this
	 * happens, the filesystem was probably unable
	 * to make r/w->r/o transitions.
	 */
	/*
	 * The locking used to deal with mnt_count decrement provides barriers,
	 * so mnt_get_writers() below is safe.
	 */
	WARN_ON(mnt_get_writers(mnt));
	if (unlikely(mnt->mnt_pins.first))
		mnt_pin_kill(mnt);
	fsnotify_vfsmount_delete(&mnt->mnt);
	dput(mnt->mnt.mnt_root);
	deactivate_super(mnt->mnt.mnt_sb);
	mnt_free_id(mnt);
	call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
}

static void __cleanup_mnt(struct rcu_head *head)
{
	cleanup_mnt(container_of(head, struct mount, mnt_rcu));
}

static LLIST_HEAD(delayed_mntput_list);
static void delayed_mntput(struct work_struct *unused)
{
	struct llist_node *node = llist_del_all(&delayed_mntput_list);
	struct llist_node *next;

	for (; node; node = next) {
		next = llist_next(node);
		cleanup_mnt(llist_entry(node, struct mount, mnt_llist));
	}
}
static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);

static void mntput_no_expire(struct mount *mnt)
{
	rcu_read_lock();
	mnt_add_count(mnt, -1);
	if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */
		rcu_read_unlock();
		return;
	}
	lock_mount_hash();
	if (mnt_get_count(mnt)) {
		rcu_read_unlock();
		unlock_mount_hash();
		return;
	}
	if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
		rcu_read_unlock();
		unlock_mount_hash();
		return;
	}
	mnt->mnt.mnt_flags |= MNT_DOOMED;
	rcu_read_unlock();

	list_del(&mnt->mnt_instance);
	unlock_mount_hash();

	if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
		struct task_struct *task = current;
		if (likely(!(task->flags & PF_KTHREAD))) {
			init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
			if (!task_work_add(task, &mnt->mnt_rcu, true))
				return;
		}
		if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
			schedule_delayed_work(&delayed_mntput_work, 1);
		return;
	}
	cleanup_mnt(mnt);
}

void mntput(struct vfsmount *mnt)
{
	if (mnt) {