/*
 *  Kernel Probes (KProbes)
 *  kernel/kprobes.c
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2002, 2004
 *
 * 2002-Oct	Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
 *		Probes initial implementation (includes suggestions from
 *		Rusty Russell).
 * 2004-Aug	Updated by Prasanna S Panchamukhi <prasanna@in.ibm.com> with
 *		hlists and exceptions notifier as suggested by Andi Kleen.
 * 2004-July	Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
 *		interface to access function arguments.
 * 2004-Sep	Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes
 *		exceptions notifier to be first on the priority list.
 * 2005-May	Hien Nguyen <hien@us.ibm.com>, Jim Keniston
 *		<jkenisto@us.ibm.com> and Prasanna S Panchamukhi
 *		<prasanna@in.ibm.com> added function-return probes.
 */
#include <linux/kprobes.h>
#include <linux/hash.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/stddef.h>
#include <linux/export.h>
#include <linux/moduleloader.h>
#include <linux/kallsyms.h>
#include <linux/freezer.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/sysctl.h>
#include <linux/kdebug.h>
#include <linux/memory.h>
#include <linux/ftrace.h>
#include <linux/cpu.h>
#include <linux/jump_label.h>

#include <asm/sections.h>
#include <asm/cacheflush.h>
#include <asm/errno.h>
#include <linux/uaccess.h>

#define KPROBE_HASH_BITS 6
#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)


static int kprobes_initialized;
static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
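
/*
 * Note: both hash tables above are indexed with
 * hash_ptr(..., KPROBE_HASH_BITS) -- kprobe_table by the probed address
 * (see get_kprobe()) and kretprobe_inst_table by the task that owns the
 * return-probe instance (see kretprobe_hash_lock()).
 */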

/* NOTE: change this value only with kprobe_mutex held */
static bool kprobes_all_disarmed;

/* This protects kprobe_table and optimizing_list */
static DEFINE_MUTEX(kprobe_mutex);
static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
static struct {
	raw_spinlock_t lock ____cacheline_aligned_in_smp;
} kretprobe_table_locks[KPROBE_TABLE_SIZE];
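
/*
 * Each kretprobe_inst_table bucket gets its own raw spinlock (see
 * kretprobe_table_lock_ptr() below), so return-probe bookkeeping for
 * different tasks can proceed in parallel.
 */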

kprobe_opcode_t * __weak kprobe_lookup_name(const char *name,
					unsigned int __unused)
{
	return ((kprobe_opcode_t *)(kallsyms_lookup_name(name)));
}

static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
{
	return &(kretprobe_table_locks[hash].lock);
}

/* Blacklist -- list of struct kprobe_blacklist_entry */
static LIST_HEAD(kprobe_blacklist);

#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
/*
 * kprobe->ainsn.insn points to the copy of the instruction to be
 * single-stepped. x86_64, POWER4 and above have no-exec support and
 * stepping on the instruction on a vmalloced/kmalloced/data page
 * is a recipe for disaster
 */
struct kprobe_insn_page {
	struct list_head list;
	kprobe_opcode_t *insns;		/* Page of instruction slots */
	struct kprobe_insn_cache *cache;
	int nused;
	int ngarbage;
	char slot_used[];
};

#define KPROBE_INSN_PAGE_SIZE(slots)			\
	(offsetof(struct kprobe_insn_page, slot_used) +	\
	 (sizeof(char) * (slots)))

static int slots_per_page(struct kprobe_insn_cache *c)
{
	return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t));
}

enum kprobe_slot_state {
	SLOT_CLEAN = 0,
	SLOT_DIRTY = 1,
	SLOT_USED = 2,
};
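
/*
 * Slot life cycle, as implemented below: a slot starts out SLOT_CLEAN,
 * becomes SLOT_USED when __get_insn_slot() hands it out, and is either
 * freed right away or marked SLOT_DIRTY by __free_insn_slot(); dirty
 * slots are reclaimed in batches by collect_garbage_slots().
 */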

void __weak *alloc_insn_page(void)
{
	return module_alloc(PAGE_SIZE);
}

void __weak free_insn_page(void *page)
{
	module_memfree(page);
}

struct kprobe_insn_cache kprobe_insn_slots = {
	.mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex),
	.alloc = alloc_insn_page,
	.free = free_insn_page,
	.pages = LIST_HEAD_INIT(kprobe_insn_slots.pages),
	.insn_size = MAX_INSN_SIZE,
	.nr_garbage = 0,
};
static int collect_garbage_slots(struct kprobe_insn_cache *c);

/**
 * __get_insn_slot() - Find a slot on an executable page for an instruction.
 * We allocate an executable page if there's no room on existing ones.
 */
kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
{
	struct kprobe_insn_page *kip;
	kprobe_opcode_t *slot = NULL;

	/* Since the slot array is not protected by rcu, we need a mutex */
	mutex_lock(&c->mutex);
 retry:
	rcu_read_lock();
	list_for_each_entry_rcu(kip, &c->pages, list) {
		if (kip->nused < slots_per_page(c)) {
			int i;
			for (i = 0; i < slots_per_page(c); i++) {
				if (kip->slot_used[i] == SLOT_CLEAN) {
					kip->slot_used[i] = SLOT_USED;
					kip->nused++;
					slot = kip->insns + (i * c->insn_size);
					rcu_read_unlock();
					goto out;
				}
			}
			/* kip->nused is broken. Fix it. */
			kip->nused = slots_per_page(c);
			WARN_ON(1);
		}
	}
	rcu_read_unlock();

	/* If there are any garbage slots, collect it and try again. */
	if (c->nr_garbage && collect_garbage_slots(c) == 0)
		goto retry;

	/* All out of space.  Need to allocate a new page. */
	kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL);
	if (!kip)
		goto out;

	/*
	 * Use module_alloc so this page is within +/- 2GB of where the
	 * kernel image and loaded module images reside. This is required
	 * so x86_64 can correctly handle the %rip-relative fixups.
	 */
	kip->insns = c->alloc();
	if (!kip->insns) {
		kfree(kip);
		goto out;
	}
	INIT_LIST_HEAD(&kip->list);
	memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c));
	kip->slot_used[0] = SLOT_USED;
	kip->nused = 1;
	kip->ngarbage = 0;
	kip->cache = c;
	list_add_rcu(&kip->list, &c->pages);
	slot = kip->insns;
out:
	mutex_unlock(&c->mutex);
	return slot;
}
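
/*
 * Illustrative usage of the slot cache above (a sketch, not a spec):
 * arch code normally goes through the get_insn_slot()/free_insn_slot()
 * wrappers from <linux/kprobes.h> rather than calling __get_insn_slot()
 * directly, e.g.:
 *
 *	kprobe_opcode_t *slot = get_insn_slot();
 *	if (slot) {
 *		... copy the probed instruction into the slot ...
 *		free_insn_slot(slot, dirty);
 *	}
 *
 * where dirty != 0 defers reclaim to the garbage collector and
 * dirty == 0 frees the slot immediately (see __free_insn_slot()).
 */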

/* Return 1 if all garbage slots are collected, otherwise 0. */
static int collect_one_slot(struct kprobe_insn_page *kip, int idx)
{
	kip->slot_used[idx] = SLOT_CLEAN;
	kip->nused--;
	if (kip->nused == 0) {
		/*
		 * Page is no longer in use.  Free it unless
		 * it's the last one.  We keep the last one
		 * so as not to have to set it up again the
		 * next time somebody inserts a probe.
		 */
		if (!list_is_singular(&kip->list)) {
			list_del_rcu(&kip->list);
			synchronize_rcu();
			kip->cache->free(kip->insns);
			kfree(kip);
		}
		return 1;
	}
	return 0;
}

static int collect_garbage_slots(struct kprobe_insn_cache *c)
{
	struct kprobe_insn_page *kip, *next;

	/* Ensure no one is still running on the garbage slots */
	synchronize_rcu();

	list_for_each_entry_safe(kip, next, &c->pages, list) {
		int i;
		if (kip->ngarbage == 0)
			continue;
		kip->ngarbage = 0;	/* we will collect all garbages */
		for (i = 0; i < slots_per_page(c); i++) {
			if (kip->slot_used[i] == SLOT_DIRTY && collect_one_slot(kip, i))
				break;
		}
	}
	c->nr_garbage = 0;
	return 0;
}

void __free_insn_slot(struct kprobe_insn_cache *c,
		      kprobe_opcode_t *slot, int dirty)
{
	struct kprobe_insn_page *kip;
	long idx;

	mutex_lock(&c->mutex);
	rcu_read_lock();
	list_for_each_entry_rcu(kip, &c->pages, list) {
		idx = ((long)slot - (long)kip->insns) /
			(c->insn_size * sizeof(kprobe_opcode_t));
		if (idx >= 0 && idx < slots_per_page(c))
			goto out;
	}
	/* Could not find this slot. */
	WARN_ON(1);
	kip = NULL;
out:
	rcu_read_unlock();
	/* Mark and sweep: this may sleep */
	if (kip) {
		/* Check double free */
		WARN_ON(kip->slot_used[idx] != SLOT_USED);
		if (dirty) {
			kip->slot_used[idx] = SLOT_DIRTY;
			kip->ngarbage++;
			if (++c->nr_garbage > slots_per_page(c))
				collect_garbage_slots(c);
		} else {
			collect_one_slot(kip, idx);
		}
	}
	mutex_unlock(&c->mutex);
}

/*
 * Check given address is on the page of kprobe instruction slots.
 * This will be used for checking whether the address on a stack
 * is on a text area or not.
 */
bool __is_insn_slot_addr(struct kprobe_insn_cache *c, unsigned long addr)
{
	struct kprobe_insn_page *kip;
	bool ret = false;

	rcu_read_lock();
	list_for_each_entry_rcu(kip, &c->pages, list) {
		if (addr >= (unsigned long)kip->insns &&
		    addr < (unsigned long)kip->insns + PAGE_SIZE) {
			ret = true;
			break;
		}
	}
	rcu_read_unlock();

	return ret;
}

#ifdef CONFIG_OPTPROBES
/* For optimized_kprobe buffer */
struct kprobe_insn_cache kprobe_optinsn_slots = {
	.mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex),
	.alloc = alloc_insn_page,
	.free = free_insn_page,
	.pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages),
	/* .insn_size is initialized later */
	.nr_garbage = 0,
};
#endif
#endif

/* We have preemption disabled, so it is safe to use the __ versions */
static inline void set_kprobe_instance(struct kprobe *kp)
{
	__this_cpu_write(kprobe_instance, kp);
}

static inline void reset_kprobe_instance(void)
{
	__this_cpu_write(kprobe_instance, NULL);
}

/*
 * This routine is called either:
 * 	- under the kprobe_mutex - during kprobe_[un]register()
 * 				OR
 * 	- with preemption disabled - from arch/xxx/kernel/kprobes.c
 */
struct kprobe *get_kprobe(void *addr)
{
	struct hlist_head *head;
	struct kprobe *p;

	head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)];
	hlist_for_each_entry_rcu(p, head, hlist) {
		if (p->addr == addr)
			return p;
	}

	return NULL;
}
NOKPROBE_SYMBOL(get_kprobe);

static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs);

/* Return true if the kprobe is an aggregator */
static inline int kprobe_aggrprobe(struct kprobe *p)
{
	return p->pre_handler == aggr_pre_handler;
}

/* Return true(!0) if the kprobe is unused */
static inline int kprobe_unused(struct kprobe *p)
{
	return kprobe_aggrprobe(p) && kprobe_disabled(p) &&
	       list_empty(&p->list);
}

/*
 * Keep all fields in the kprobe consistent
 */
static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p)
{
	memcpy(&p->opcode, &ap->opcode, sizeof(kprobe_opcode_t));
	memcpy(&p->ainsn, &ap->ainsn, sizeof(struct arch_specific_insn));
}

#ifdef CONFIG_OPTPROBES
/* NOTE: change this value only with kprobe_mutex held */
static bool kprobes_allow_optimization;

/*
 * Call all pre_handler on the list, but ignores its return value.
 * This must be called from arch-dep optimized caller.
 */
void opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
	struct kprobe *kp;

	list_for_each_entry_rcu(kp, &p->list, list) {
		if (kp->pre_handler && likely(!kprobe_disabled(kp))) {
			set_kprobe_instance(kp);
			kp->pre_handler(kp, regs);
		}
		reset_kprobe_instance();
	}
}
NOKPROBE_SYMBOL(opt_pre_handler);

/* Free optimized instructions and optimized_kprobe */
static void free_aggr_kprobe(struct kprobe *p)
{
	struct optimized_kprobe *op;

	op = container_of(p, struct optimized_kprobe, kp);
	arch_remove_optimized_kprobe(op);
	arch_remove_kprobe(p);
	kfree(op);
}

/* Return true(!0) if the kprobe is ready for optimization. */
static inline int kprobe_optready(struct kprobe *p)
{
	struct optimized_kprobe *op;

	if (kprobe_aggrprobe(p)) {
		op = container_of(p, struct optimized_kprobe, kp);
		return arch_prepared_optinsn(&op->optinsn);
	}

	return 0;
}

/* Return true(!0) if the kprobe is disarmed. Note: p must be on hash list */
static inline int kprobe_disarmed(struct kprobe *p)
{
	struct optimized_kprobe *op;

	/* If kprobe is not aggr/opt probe, just return kprobe is disabled */
	if (!kprobe_aggrprobe(p))
		return kprobe_disabled(p);

	op = container_of(p, struct optimized_kprobe, kp);

	return kprobe_disabled(p) && list_empty(&op->list);
}

/* Return true(!0) if the probe is queued on (un)optimizing lists */
static int kprobe_queued(struct kprobe *p)
{
	struct optimized_kprobe *op;

	if (kprobe_aggrprobe(p)) {
		op = container_of(p, struct optimized_kprobe, kp);
		if (!list_empty(&op->list))
			return 1;
	}
	return 0;
}

/*
 * Return an optimized kprobe whose optimizing code replaces
 * instructions including addr (exclude breakpoint).
 */
static struct kprobe *get_optimized_kprobe(unsigned long addr)
{
	int i;
	struct kprobe *p = NULL;
	struct optimized_kprobe *op;

	/* Don't check i == 0, since that is a breakpoint case. */
	for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH; i++)
		p = get_kprobe((void *)(addr - i));

	if (p && kprobe_optready(p)) {
		op = container_of(p, struct optimized_kprobe, kp);
		if (arch_within_optimized_kprobe(op, addr))
			return p;
	}

	return NULL;
}

/* Optimization staging list, protected by kprobe_mutex */
static LIST_HEAD(optimizing_list);
static LIST_HEAD(unoptimizing_list);
static LIST_HEAD(freeing_list);

static void kprobe_optimizer(struct work_struct *work);
static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
#define OPTIMIZE_DELAY 5
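
/*
 * Overview of the pipeline driven by kprobe_optimizer() below: probes
 * queued on optimizing_list get their breakpoint replaced with a jump,
 * probes queued on unoptimizing_list go the other way, and probes that
 * end up unused are moved to freeing_list and reclaimed after the
 * quiescence period.
 */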

/*
 * Optimize (replace a breakpoint with a jump) kprobes listed on
 * optimizing_list.
 */
static void do_optimize_kprobes(void)
{
	/*
	 * The optimization/unoptimization code refers to online_cpus via
	 * stop_machine(), while cpu-hotplug modifies online_cpus, and both
	 * paths also take text_mutex. That combination can deadlock:
	 * cpu-hotplug tries to lock text_mutex while stop_machine() cannot
	 * finish because online_cpus has changed. To avoid this deadlock,
	 * the caller must hold the cpu hotplug lock so that cpu-hotplug
	 * cannot run while text_mutex is held here.
	 */
	lockdep_assert_cpus_held();

	/* Optimization is never done when kprobes are disarmed */
	if (kprobes_all_disarmed || !kprobes_allow_optimization ||
	    list_empty(&optimizing_list))
		return;

	mutex_lock(&text_mutex);
	arch_optimize_kprobes(&optimizing_list);
	mutex_unlock(&text_mutex);
}

/*
 * Unoptimize (replace a jump with a breakpoint and remove the breakpoint
 * if need) kprobes listed on unoptimizing_list.
 */
static void do_unoptimize_kprobes(void)
{
	struct optimized_kprobe *op, *tmp;

	/* See comment in do_optimize_kprobes() */
	lockdep_assert_cpus_held();

	/* Unoptimization must be done regardless of the disarmed state */
	if (list_empty(&unoptimizing_list))
		return;

	mutex_lock(&text_mutex);
	arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
	/* Loop free_list for disarming */
	list_for_each_entry_safe(op, tmp, &freeing_list, list) {
		/* Disarm probes if marked disabled */
		if (kprobe_disabled(&op->kp))
			arch_disarm_kprobe(&op->kp);
		if (kprobe_unused(&op->kp)) {
			/*
			 * Remove unused probes from hash list. After waiting
			 * for synchronization, these probes are reclaimed.
			 * (reclaiming is done by do_free_cleaned_kprobes.)
			 */
			hlist_del_rcu(&op->kp.hlist);
		} else
			list_del_init(&op->list);
	}
	mutex_unlock(&text_mutex);
}

/* Reclaim all kprobes on the free_list */
static void do_free_cleaned_kprobes(void)
{
	struct optimized_kprobe *op, *tmp;

	list_for_each_entry_safe(op, tmp, &freeing_list, list) {
		list_del_init(&op->list);
		if (WARN_ON_ONCE(!kprobe_unused(&op->kp))) {
			/*
			 * This must not happen, but if there is a kprobe
			 * still in use, keep it on kprobes hash list.
			 */
			continue;
		}
		free_aggr_kprobe(&op->kp);
	}
}

/* Start optimizer after OPTIMIZE_DELAY passed */
static void kick_kprobe_optimizer(void)
{
	schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
}

/* Kprobe jump optimizer */
static void kprobe_optimizer(struct work_struct *work)
{
	mutex_lock(&kprobe_mutex);
	cpus_read_lock();
	/* Lock modules while optimizing kprobes */
	mutex_lock(&module_mutex);

	/*
	 * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
	 * kprobes before waiting for the quiescence period.
	 */
	do_unoptimize_kprobes();

	/*
	 * Step 2: Wait for a quiescence period to ensure that all potentially
	 * preempted tasks have been scheduled normally again. Because an
	 * optprobe may modify multiple instructions, a task might have been
	 * preempted on the Nth instruction and could resume at the 2nd-Nth
	 * byte of the jump instruction; this wait avoids that. Note that on
	 * a non-preemptive kernel this transparently degenerates to
	 * synchronize_sched(), waiting for all interrupts to complete.
	 */
	synchronize_rcu_tasks();

	/* Step 3: Optimize kprobes after the quiescence period */
	do_optimize_kprobes();

	/* Step 4: Free cleaned kprobes after the quiescence period */
	do_free_cleaned_kprobes();

	mutex_unlock(&module_mutex);
	cpus_read_unlock();
	mutex_unlock(&kprobe_mutex);

	/* Step 5: Kick optimizer again if needed */
	if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
		kick_kprobe_optimizer();
}

/* Wait for completing optimization and unoptimization */
void wait_for_kprobe_optimizer(void)
{
	mutex_lock(&kprobe_mutex);

	while (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) {
		mutex_unlock(&kprobe_mutex);

		/* this will also make optimizing_work execute immediately */
		flush_delayed_work(&optimizing_work);
		/* @optimizing_work might not have been queued yet, relax */
		cpu_relax();

		mutex_lock(&kprobe_mutex);
	}

	mutex_unlock(&kprobe_mutex);
}

/* Optimize kprobe if p is ready to be optimized */
static void optimize_kprobe(struct kprobe *p)
{
	struct optimized_kprobe *op;

	/* Check if the kprobe is disabled or not ready for optimization. */
	if (!kprobe_optready(p) || !kprobes_allow_optimization ||
	    (kprobe_disabled(p) || kprobes_all_disarmed))
		return;

	/* kprobes with a post_handler cannot be optimized */
	if (p->post_handler)
		return;

	op = container_of(p, struct optimized_kprobe, kp);

	/* Check there is no other kprobes at the optimized instructions */
	if (arch_check_optimized_kprobe(op) < 0)
		return;

	/* Check if it is already optimized. */
	if (op->kp.flags & KPROBE_FLAG_OPTIMIZED)
		return;
	op->kp.flags |= KPROBE_FLAG_OPTIMIZED;

	if (!list_empty(&op->list))
		/* This is under unoptimizing. Just dequeue the probe */
		list_del_init(&op->list);
	else {
		list_add(&op->list, &optimizing_list);
		kick_kprobe_optimizer();
	}
}

/* Short cut to direct unoptimizing */
static void force_unoptimize_kprobe(struct optimized_kprobe *op)
{
	lockdep_assert_cpus_held();
	arch_unoptimize_kprobe(op);
	if (kprobe_disabled(&op->kp))
		arch_disarm_kprobe(&op->kp);
}

/* Unoptimize a kprobe if p is optimized */
static void unoptimize_kprobe(struct kprobe *p, bool force)
{
	struct optimized_kprobe *op;

	if (!kprobe_aggrprobe(p) || kprobe_disarmed(p))
		return; /* This is not an optprobe nor optimized */

	op = container_of(p, struct optimized_kprobe, kp);
	if (!kprobe_optimized(p)) {
		/* Unoptimized or unoptimizing case */
		if (force && !list_empty(&op->list)) {
			/*
			 * Only if this is unoptimizing kprobe and forced,
			 * forcibly unoptimize it. (No need to unoptimize
			 * unoptimized kprobe again :)
			 */
			list_del_init(&op->list);
			force_unoptimize_kprobe(op);
		}
		return;
	}

	op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
	if (!list_empty(&op->list)) {
		/* Dequeue from the optimization queue */
		list_del_init(&op->list);
		return;
	}
	/* Optimized kprobe case */
	if (force)
		/* Forcibly update the code: this is a special case */
		force_unoptimize_kprobe(op);
	else {
		list_add(&op->list, &unoptimizing_list);
		kick_kprobe_optimizer();
	}
}

/* Cancel unoptimizing for reusing */
static int reuse_unused_kprobe(struct kprobe *ap)
{
	struct optimized_kprobe *op;
	int ret;

	/*
	 * Unused kprobe MUST be on the way of delayed unoptimizing (means
	 * there is still a relative jump) and disabled.
	 */
	op = container_of(ap, struct optimized_kprobe, kp);
	WARN_ON_ONCE(list_empty(&op->list));
	/* Enable the probe again */
	ap->flags &= ~KPROBE_FLAG_DISABLED;
	/* Optimize it again (remove from op->list) */
	ret = kprobe_optready(ap);
	if (ret)
		return ret;

	optimize_kprobe(ap);
	return 0;
}

/* Remove optimized instructions */
static void kill_optimized_kprobe(struct kprobe *p)
{
	struct optimized_kprobe *op;

	op = container_of(p, struct optimized_kprobe, kp);
	if (!list_empty(&op->list))
		/* Dequeue from the (un)optimization queue */
		list_del_init(&op->list);
	op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;

	if (kprobe_unused(p)) {
		/* Enqueue if it is unused */
		list_add(&op->list, &freeing_list);
		/*
		 * Remove unused probes from the hash list. After waiting
		 * for synchronization, this probe is reclaimed.
		 * (reclaiming is done by do_free_cleaned_kprobes().)
		 */
		hlist_del_rcu(&op->kp.hlist);
	}

	/* Don't touch the code, because it is already freed. */
	arch_remove_optimized_kprobe(op);
}

static inline
void __prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p)
{
	if (!kprobe_ftrace(p))
		arch_prepare_optimized_kprobe(op, p);
}

/* Try to prepare optimized instructions */
static void prepare_optimized_kprobe(struct kprobe *p)
{
	struct optimized_kprobe *op;

	op = container_of(p, struct optimized_kprobe, kp);
	__prepare_optimized_kprobe(op, p);
}

/* Allocate new optimized_kprobe and try to prepare optimized instructions */
static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
{
	struct optimized_kprobe *op;

	op = kzalloc(sizeof(struct optimized_kprobe), GFP_KERNEL);
	if (!op)
		return NULL;

	INIT_LIST_HEAD(&op->list);
	op->kp.addr = p->addr;
	__prepare_optimized_kprobe(op, p);

	return &op->kp;
}

static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p);

/*
 * Prepare an optimized_kprobe and optimize it
 * NOTE: p must be a normal registered kprobe
 */
static void try_to_optimize_kprobe(struct kprobe *p)
{
	struct kprobe *ap;
	struct optimized_kprobe *op;

	/* Impossible to optimize ftrace-based kprobe */
	if (kprobe_ftrace(p))
		return;

	/* For preparing optimization, jump_label_text_reserved() is called */
	cpus_read_lock();
	jump_label_lock();
	mutex_lock(&text_mutex);

	ap = alloc_aggr_kprobe(p);
	if (!ap)
		goto out;

	op = container_of(ap, struct optimized_kprobe, kp);
	if (!arch_prepared_optinsn(&op->optinsn)) {
		/* If failed to setup optimizing, fallback to kprobe */
		arch_remove_optimized_kprobe(op);
		kfree(op);
		goto out;
	}

	init_aggr_kprobe(ap, p);
	optimize_kprobe(ap);	/* This just kicks optimizer thread */

out:
	mutex_unlock(&text_mutex);
	jump_label_unlock();
	cpus_read_unlock();
}

#ifdef CONFIG_SYSCTL
static void optimize_all_kprobes(void)
{
	struct hlist_head *head;
	struct kprobe *p;
	unsigned int i;

	mutex_lock(&kprobe_mutex);
	/* If optimization is already allowed, just return */
	if (kprobes_allow_optimization)
		goto out;

	cpus_read_lock();
	kprobes_allow_optimization = true;
	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
		head = &kprobe_table[i];
		hlist_for_each_entry_rcu(p, head, hlist)
			if (!kprobe_disabled(p))
				optimize_kprobe(p);
	}
	cpus_read_unlock();
	printk(KERN_INFO "Kprobes globally optimized\n");
out:
	mutex_unlock(&kprobe_mutex);
}

static void unoptimize_all_kprobes(void)
{
	struct hlist_head *head;
	struct kprobe *p;
	unsigned int i;

	mutex_lock(&kprobe_mutex);
	/* If optimization is already prohibited, just return */
	if (!kprobes_allow_optimization) {
		mutex_unlock(&kprobe_mutex);
		return;
	}

	cpus_read_lock();
	kprobes_allow_optimization = false;
	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
		head = &kprobe_table[i];
		hlist_for_each_entry_rcu(p, head, hlist) {
			if (!kprobe_disabled(p))
				unoptimize_kprobe(p, false);
		}
	}
	cpus_read_unlock();
	mutex_unlock(&kprobe_mutex);

	/* Wait for unoptimizing completion */
	wait_for_kprobe_optimizer();
	printk(KERN_INFO "Kprobes globally unoptimized\n");
}

static DEFINE_MUTEX(kprobe_sysctl_mutex);
int sysctl_kprobes_optimization;
int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
				      void __user *buffer, size_t *length,
				      loff_t *ppos)
{
	int ret;

	mutex_lock(&kprobe_sysctl_mutex);
	sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0;
	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);

	if (sysctl_kprobes_optimization)
		optimize_all_kprobes();
	else
		unoptimize_all_kprobes();
	mutex_unlock(&kprobe_sysctl_mutex);

	return ret;
}
#endif /* CONFIG_SYSCTL */
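
/*
 * Note: proc_kprobes_optimization_handler() above is wired up (in
 * kernel/sysctl.c) to the debug.kprobes-optimization sysctl, so writing
 * 0 or 1 to /proc/sys/debug/kprobes-optimization toggles optimization
 * globally at run time.
 */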

/* Put a breakpoint for a probe. Must be called with text_mutex locked */
static void __arm_kprobe(struct kprobe *p)
{
	struct kprobe *_p;

	/* Check collision with other optimized kprobes */
	_p = get_optimized_kprobe((unsigned long)p->addr);
	if (unlikely(_p))
		/* Fallback to unoptimized kprobe */
		unoptimize_kprobe(_p, true);

	arch_arm_kprobe(p);
	optimize_kprobe(p);	/* Try to optimize (add kprobe to a list) */
}

/* Remove the breakpoint of a probe. Must be called with text_mutex locked */
static void __disarm_kprobe(struct kprobe *p, bool reopt)
{
	struct kprobe *_p;

	/* Try to unoptimize */
	unoptimize_kprobe(p, kprobes_all_disarmed);

	if (!kprobe_queued(p)) {
		arch_disarm_kprobe(p);
		/* If another kprobe was blocked, optimize it. */
		_p = get_optimized_kprobe((unsigned long)p->addr);
		if (unlikely(_p) && reopt)
			optimize_kprobe(_p);
	}
	/* TODO: reoptimize others after unoptimized this probe */
}

#else /* !CONFIG_OPTPROBES */

#define optimize_kprobe(p)			do {} while (0)
#define unoptimize_kprobe(p, f)			do {} while (0)
#define kill_optimized_kprobe(p)		do {} while (0)
#define prepare_optimized_kprobe(p)		do {} while (0)
#define try_to_optimize_kprobe(p)		do {} while (0)
#define __arm_kprobe(p)				arch_arm_kprobe(p)
#define __disarm_kprobe(p, o)			arch_disarm_kprobe(p)
#define kprobe_disarmed(p)			kprobe_disabled(p)
#define wait_for_kprobe_optimizer()		do {} while (0)

static int reuse_unused_kprobe(struct kprobe *ap)
{
	/*
	 * If optimized kprobes are NOT supported, the aggregator kprobe is
	 * released as soon as its last aggregated kprobe is unregistered,
	 * so there should never be an unused kprobe to reuse here.
	 */
	printk(KERN_ERR "Error: There should be no unused kprobe here.\n");
	return -EINVAL;
}

static void free_aggr_kprobe(struct kprobe *p)
{
	arch_remove_kprobe(p);
	kfree(p);
}

static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
{
	return kzalloc(sizeof(struct kprobe), GFP_KERNEL);
}
#endif /* CONFIG_OPTPROBES */

#ifdef CONFIG_KPROBES_ON_FTRACE
static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
	.func = kprobe_ftrace_handler,
	.flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_IPMODIFY,
};
static int kprobe_ftrace_enabled;

/* Must ensure p->addr is really on ftrace */
static int prepare_kprobe(struct kprobe *p)
{
	if (!kprobe_ftrace(p))
		return arch_prepare_kprobe(p);

	return arch_prepare_kprobe_ftrace(p);
}

/* Caller must lock kprobe_mutex */
static int arm_kprobe_ftrace(struct kprobe *p)
{
	int ret = 0;

	ret = ftrace_set_filter_ip(&kprobe_ftrace_ops,
				   (unsigned long)p->addr, 0, 0);
	if (ret) {
		pr_debug("Failed to arm kprobe-ftrace at %pS (%d)\n",
			 p->addr, ret);
		return ret;
	}

	if (kprobe_ftrace_enabled == 0) {
		ret = register_ftrace_function(&kprobe_ftrace_ops);
		if (ret) {
			pr_debug("Failed to init kprobe-ftrace (%d)\n", ret);
			goto err_ftrace;
		}
	}

	kprobe_ftrace_enabled++;
	return ret;

err_ftrace:
	/*
	 * Note: Since kprobe_ftrace_ops has IPMODIFY set, and ftrace requires a
	 * non-empty filter_hash for IPMODIFY ops, we're safe from an accidental
	 * empty filter_hash which would undesirably trace all functions.
	 */
	ftrace_set_filter_ip(&kprobe_ftrace_ops, (unsigned long)p->addr, 1, 0);
	return ret;
}

/* Caller must lock kprobe_mutex */
static int disarm_kprobe_ftrace(struct kprobe *p)
{
	int ret = 0;

	if (kprobe_ftrace_enabled == 1) {
		ret = unregister_ftrace_function(&kprobe_ftrace_ops);
		if (WARN(ret < 0, "Failed to unregister kprobe-ftrace (%d)\n", ret))
			return ret;
	}

	kprobe_ftrace_enabled--;

	ret = ftrace_set_filter_ip(&kprobe_ftrace_ops,
			   (unsigned long)p->addr, 1, 0);
	WARN_ONCE(ret < 0, "Failed to disarm kprobe-ftrace at %pS (%d)\n",
		  p->addr, ret);
	return ret;
}
#else	/* !CONFIG_KPROBES_ON_FTRACE */
#define prepare_kprobe(p)	arch_prepare_kprobe(p)
#define arm_kprobe_ftrace(p)	(-ENODEV)
#define disarm_kprobe_ftrace(p)	(-ENODEV)
#endif

/* Arm a kprobe with text_mutex */
static int arm_kprobe(struct kprobe *kp)
{
	if (unlikely(kprobe_ftrace(kp)))
		return arm_kprobe_ftrace(kp);

	cpus_read_lock();
	mutex_lock(&text_mutex);
	__arm_kprobe(kp);
	mutex_unlock(&text_mutex);
	cpus_read_unlock();

	return 0;
}

/* Disarm a kprobe with text_mutex */
static int disarm_kprobe(struct kprobe *kp, bool reopt)
{
	if (unlikely(kprobe_ftrace(kp)))
		return disarm_kprobe_ftrace(kp);

	cpus_read_lock();
	mutex_lock(&text_mutex);
	__disarm_kprobe(kp, reopt);
	mutex_unlock(&text_mutex);
	cpus_read_unlock();

	return 0;
}
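
/*
 * Note: arm_kprobe()/disarm_kprobe() above choose at run time between
 * the ftrace-based path and the breakpoint-based (text_mutex) path,
 * depending on whether the probe is an ftrace-based one (kprobe_ftrace()).
 */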

/*
 * Aggregate handlers for multiple kprobes support - these handlers
 * take care of invoking the individual kprobe handlers on p->list
 */
static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
	struct kprobe *kp;

	list_for_each_entry_rcu(kp, &p->list, list) {
		if (kp->pre_handler && likely(!kprobe_disabled(kp))) {
			set_kprobe_instance(kp);
			if (kp->pre_handler(kp, regs))
				return 1;
		}
		reset_kprobe_instance();
	}
	return 0;
}
NOKPROBE_SYMBOL(aggr_pre_handler);

static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
			      unsigned long flags)
{
	struct kprobe *kp;

	list_for_each_entry_rcu(kp, &p->list, list) {
		if (kp->post_handler && likely(!kprobe_disabled(kp))) {
			set_kprobe_instance(kp);
			kp->post_handler(kp, regs, flags);
			reset_kprobe_instance();
		}
	}
}
NOKPROBE_SYMBOL(aggr_post_handler);

static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
			      int trapnr)
{
	struct kprobe *cur = __this_cpu_read(kprobe_instance);

	/*
	 * if we faulted "during" the execution of a user specified
	 * probe handler, invoke just that probe's fault handler
	 */
	if (cur && cur->fault_handler) {
		if (cur->fault_handler(cur, regs, trapnr))
			return 1;
	}
	return 0;
}
NOKPROBE_SYMBOL(aggr_fault_handler);

/* Walks the list and increments nmissed count for multiprobe case */
void kprobes_inc_nmissed_count(struct kprobe *p)
{
	struct kprobe *kp;
	if (!kprobe_aggrprobe(p)) {
		p->nmissed++;
	} else {
		list_for_each_entry_rcu(kp, &p->list, list)
			kp->nmissed++;
	}
	return;
}
NOKPROBE_SYMBOL(kprobes_inc_nmissed_count);

void recycle_rp_inst(struct kretprobe_instance *ri,
		     struct hlist_head *head)
{
	struct kretprobe *rp = ri->rp;

	/* remove rp inst off the kretprobe_inst_table */
	hlist_del(&ri->hlist);
	INIT_HLIST_NODE(&ri->hlist);
	if (likely(rp)) {
		raw_spin_lock(&rp->lock);
		hlist_add_head(&ri->hlist, &rp->free_instances);
		raw_spin_unlock(&rp->lock);
	} else
		/* Unregistering */
		hlist_add_head(&ri->hlist, head);
}
NOKPROBE_SYMBOL(recycle_rp_inst);

void kretprobe_hash_lock(struct task_struct *tsk,
			 struct hlist_head **head, unsigned long *flags)
__acquires(hlist_lock)
{
	unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
	raw_spinlock_t *hlist_lock;

	*head = &kretprobe_inst_table[hash];
	hlist_lock = kretprobe_table_lock_ptr(hash);
	raw_spin_lock_irqsave(hlist_lock, *flags);
}
NOKPROBE_SYMBOL(kretprobe_hash_lock);

static void kretprobe_table_lock(unsigned long hash,
				 unsigned long *flags)
__acquires(hlist_lock)
{
	raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
	raw_spin_lock_irqsave(hlist_lock, *flags);
}
NOKPROBE_SYMBOL(kretprobe_table_lock);

void kretprobe_hash_unlock(struct task_struct *tsk,
			   unsigned long *flags)
__releases(hlist_lock)
{
	unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
	raw_spinlock_t *hlist_lock;

	hlist_lock = kretprobe_table_lock_ptr(hash);
	raw_spin_unlock_irqrestore(hlist_lock, *flags);
}
NOKPROBE_SYMBOL(kretprobe_hash_unlock);

static void kretprobe_table_unlock(unsigned long hash,
				   unsigned long *flags)
__releases(hlist_lock)
{
	raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
	raw_spin_unlock_irqrestore(hlist_lock, *flags);
}
NOKPROBE_SYMBOL(kretprobe_table_unlock);

/*
 * This function is called from finish_task_switch when task tk becomes dead,
 * so that we can recycle any function-return probe instances associated
 * with this task. These left over instances represent probed functions
 * that have been called but will never return.
 */
void kprobe_flush_task(struct task_struct *tk)
{
	struct kretprobe_instance *ri;