// SPDX-License-Identifier: GPL-2.0
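/*
 * builtin-sched.c: 'perf sched' - scheduler tracing and analysis.
 *
 * Implements the 'perf sched' subcommands, e.g.:
 *
 *   perf sched record      # capture sched_* tracepoints
 *   perf sched latency     # per-task wakeup/scheduling latency
 *   perf sched map         # textual per-CPU occupation map
 *   perf sched replay      # re-run the recorded workload
 *   perf sched timehist    # per-event time history
 */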
#include "builtin.h"
#include "perf.h"

#include "util/util.h"
#include "util/evlist.h"
#include "util/cache.h"
#include "util/evsel.h"
#include "util/symbol.h"
#include "util/thread.h"
#include "util/header.h"
#include "util/session.h"
#include "util/tool.h"
#include "util/cloexec.h"
#include "util/thread_map.h"
#include "util/color.h"
#include "util/stat.h"
#include "util/callchain.h"
#include "util/time-utils.h"

#include <subcmd/parse-options.h>
#include "util/trace-event.h"

#include "util/debug.h"

#include <linux/kernel.h>
#include <linux/log2.h>
#include <sys/prctl.h>
#include <sys/resource.h>
#include <inttypes.h>

#include <errno.h>
#include <semaphore.h>
#include <pthread.h>
#include <math.h>
#include <api/fs/fs.h>
#include <linux/time64.h>

#include "sane_ctype.h"

#define PR_SET_NAME		15               /* Set process name */
#define MAX_CPUS		4096
#define COMM_LEN		20
#define SYM_LEN			129
#define MAX_PID			1024000

struct sched_atom;

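/*
 * Per-task state for 'perf sched replay': the scheduling atoms recorded
 * for this task plus the worker thread and semaphores used to re-enact
 * them.
 */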
struct task_desc {
	unsigned long		nr;
	unsigned long		pid;
	char			comm[COMM_LEN];

	unsigned long		nr_events;
	unsigned long		curr_event;
	struct sched_atom	**atoms;

	pthread_t		thread;
	sem_t			sleep_sem;

	sem_t			ready_for_work;
	sem_t			work_done_sem;

	u64			cpu_usage;
};

enum sched_event_type {
	SCHED_EVENT_RUN,
	SCHED_EVENT_SLEEP,
	SCHED_EVENT_WAKEUP,
	SCHED_EVENT_MIGRATION,
};

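/* One replayed scheduling event (run/sleep/wakeup/migration). */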
struct sched_atom {
	enum sched_event_type	type;
	int			specific_wait;
	u64			timestamp;
	u64			duration;
	unsigned long		nr;
	sem_t			*wait_sem;
	struct task_desc	*wakee;
};

#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"

/* task state bitmask, copied from include/linux/sched.h */
#define TASK_RUNNING		0
#define TASK_INTERRUPTIBLE	1
#define TASK_UNINTERRUPTIBLE	2
#define __TASK_STOPPED		4
#define __TASK_TRACED		8
/* in tsk->exit_state */
#define EXIT_DEAD		16
#define EXIT_ZOMBIE		32
#define EXIT_TRACE		(EXIT_ZOMBIE | EXIT_DEAD)
/* in tsk->state again */
#define TASK_DEAD		64
#define TASK_WAKEKILL		128
#define TASK_WAKING		256
#define TASK_PARKED		512

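/*
 * Latency analysis ('perf sched latency'): a work_atom tracks one
 * sched-out -> wakeup -> sched-in cycle of a thread.
 */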
enum thread_state {
	THREAD_SLEEPING = 0,
	THREAD_WAIT_CPU,
	THREAD_SCHED_IN,
	THREAD_IGNORE
};

struct work_atom {
	struct list_head	list;
	enum thread_state	state;
	u64			sched_out_time;
	u64			wake_up_time;
	u64			sched_in_time;
	u64			runtime;
};

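/*
 * Per-thread list of work atoms, kept in an rb-tree ordered by the
 * active sort keys, with aggregated latency and runtime totals.
 */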
struct work_atoms {
	struct list_head	work_list;
	struct thread		*thread;
	struct rb_node		node;
	u64			max_lat;
	u64			max_lat_at;
	u64			total_lat;
	u64			nb_atoms;
	u64			total_runtime;
	int			num_merged;
};

typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);

struct perf_sched;

struct trace_sched_handler {
	int (*switch_event)(struct perf_sched *sched, struct perf_evsel *evsel,
			    struct perf_sample *sample, struct machine *machine);

	int (*runtime_event)(struct perf_sched *sched, struct perf_evsel *evsel,
			     struct perf_sample *sample, struct machine *machine);

	int (*wakeup_event)(struct perf_sched *sched, struct perf_evsel *evsel,
			    struct perf_sample *sample, struct machine *machine);

	/* PERF_RECORD_FORK event, not sched_process_fork tracepoint */
	int (*fork_event)(struct perf_sched *sched, union perf_event *event,
			  struct machine *machine);

	int (*migrate_task_event)(struct perf_sched *sched,
				  struct perf_evsel *evsel,
				  struct perf_sample *sample,
				  struct machine *machine);
};

#define COLOR_PIDS PERF_COLOR_BLUE
#define COLOR_CPUS PERF_COLOR_BG_RED

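/* Display options for 'perf sched map': compact CPUs, pid/cpu coloring. */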
struct perf_sched_map {
	DECLARE_BITMAP(comp_cpus_mask, MAX_CPUS);
	int			*comp_cpus;
	bool			 comp;
	struct thread_map	*color_pids;
	const char		*color_pids_str;
	struct cpu_map		*color_cpus;
	const char		*color_cpus_str;
	struct cpu_map		*cpus;
	const char		*cpus_str;
};

struct perf_sched {
	struct perf_tool tool;
	const char	 *sort_order;
	unsigned long	 nr_tasks;
	struct task_desc **pid_to_task;
	struct task_desc **tasks;
	const struct trace_sched_handler *tp_handler;
	pthread_mutex_t	 start_work_mutex;
	pthread_mutex_t	 work_done_wait_mutex;
	int		 profile_cpu;
/*
 * Track the current task - that way we can know whether there's any
 * weird events, such as a task being switched away that is not current.
 */
	int		 max_cpu;
	u32		 curr_pid[MAX_CPUS];
	struct thread	 *curr_thread[MAX_CPUS];
	char		 next_shortname1;
	char		 next_shortname2;
	unsigned int	 replay_repeat;
	unsigned long	 nr_run_events;
	unsigned long	 nr_sleep_events;
	unsigned long	 nr_wakeup_events;
	unsigned long	 nr_sleep_corrections;
	unsigned long	 nr_run_events_optimized;
	unsigned long	 targetless_wakeups;
	unsigned long	 multitarget_wakeups;
	unsigned long	 nr_runs;
	unsigned long	 nr_timestamps;
	unsigned long	 nr_unordered_timestamps;
	unsigned long	 nr_context_switch_bugs;
	unsigned long	 nr_events;
	unsigned long	 nr_lost_chunks;
	unsigned long	 nr_lost_events;
	u64		 run_measurement_overhead;
	u64		 sleep_measurement_overhead;
	u64		 start_time;
	u64		 cpu_usage;
	u64		 runavg_cpu_usage;
	u64		 parent_cpu_usage;
	u64		 runavg_parent_cpu_usage;
	u64		 sum_runtime;
	u64		 sum_fluct;
	u64		 run_avg;
	u64		 all_runtime;
	u64		 all_count;
	u64		 cpu_last_switched[MAX_CPUS];
	struct rb_root	 atom_root, sorted_atom_root, merged_atom_root;
	struct list_head sort_list, cmp_pid;
	bool force;
	bool skip_merge;
	struct perf_sched_map map;

	/* options for timehist command */
	bool		summary;
	bool		summary_only;
	bool		idle_hist;
	bool		show_callchain;
	unsigned int	max_stack;
	bool		show_cpu_visual;
	bool		show_wakeups;
	bool		show_next;
	bool		show_migrations;
	bool		show_state;
	u64		skipped_samples;
	const char	*time_str;
	struct perf_time_interval ptime;
	struct perf_time_interval hist_time;
};

/* per thread run time data */
struct thread_runtime {
	u64 last_time;      /* time of previous sched in/out event */
	u64 dt_run;         /* run time */
	u64 dt_sleep;       /* time between CPU access by sleep (off cpu) */
	u64 dt_iowait;      /* time between CPU access by iowait (off cpu) */
	u64 dt_preempt;     /* time between CPU access by preempt (off cpu) */
	u64 dt_delay;       /* time between wakeup and sched-in */
	u64 ready_to_run;   /* time of wakeup */

	struct stats run_stats;
	u64 total_run_time;
	u64 total_sleep_time;
	u64 total_iowait_time;
	u64 total_preempt_time;
	u64 total_delay_time;

	int last_state;

	char shortname[3];
	bool comm_changed;

	u64 migrations;
};

/* per event run time data */
struct evsel_runtime {
	u64 *last_time; /* time this event was last seen per cpu */
	u32 ncpu;       /* highest cpu slot allocated */
};

/* per cpu idle time data */
struct idle_thread_runtime {
	struct thread_runtime	tr;
	struct thread		*last_thread;
	struct rb_root		sorted_root;
	struct callchain_root	callchain;
	struct callchain_cursor	cursor;
};

/* track idle times per cpu */
static struct thread **idle_threads;
static int idle_max_cpu;
static char idle_comm[] = "<idle>";

static u64 get_nsecs(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);

	return ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec;
}

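/*
 * Busy-loop for roughly @nsecs, compensating for the measured overhead
 * of the timestamp reads themselves.
 */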
static void burn_nsecs(struct perf_sched *sched, u64 nsecs)
{
	u64 T0 = get_nsecs(), T1;

	do {
		T1 = get_nsecs();
	} while (T1 + sched->run_measurement_overhead < T0 + nsecs);
}

static void sleep_nsecs(u64 nsecs)
{
	struct timespec ts;

	ts.tv_nsec = nsecs % 999999999;
	ts.tv_sec = nsecs / 999999999;

	nanosleep(&ts, NULL);
}

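/*
 * Estimate the fixed cost of a zero-length burn_nsecs() by taking the
 * minimum over ten runs.
 */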
static void calibrate_run_measurement_overhead(struct perf_sched *sched)
{
	u64 T0, T1, delta, min_delta = NSEC_PER_SEC;
	int i;

	for (i = 0; i < 10; i++) {
		T0 = get_nsecs();
		burn_nsecs(sched, 0);
		T1 = get_nsecs();
		delta = T1-T0;
		min_delta = min(min_delta, delta);
	}
	sched->run_measurement_overhead = min_delta;

	printf("run measurement overhead: %" PRIu64 " nsecs\n", min_delta);
}

static void calibrate_sleep_measurement_overhead(struct perf_sched *sched)
{
	u64 T0, T1, delta, min_delta = NSEC_PER_SEC;
	int i;

	for (i = 0; i < 10; i++) {
		T0 = get_nsecs();
		sleep_nsecs(10000);
		T1 = get_nsecs();
		delta = T1-T0;
		min_delta = min(min_delta, delta);
	}
	min_delta -= 10000;
	sched->sleep_measurement_overhead = min_delta;

	printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta);
}

static struct sched_atom *
get_new_event(struct task_desc *task, u64 timestamp)
{
	struct sched_atom *event = zalloc(sizeof(*event));
	unsigned long idx = task->nr_events;
	size_t size;

	event->timestamp = timestamp;
	event->nr = idx;

	task->nr_events++;
	size = sizeof(struct sched_atom *) * task->nr_events;
	task->atoms = realloc(task->atoms, size);
	BUG_ON(!task->atoms);

	task->atoms[idx] = event;

	return event;
}

static struct sched_atom *last_event(struct task_desc *task)
{
	if (!task->nr_events)
		return NULL;

	return task->atoms[task->nr_events - 1];
}

static void add_sched_event_run(struct perf_sched *sched, struct task_desc *task,
				u64 timestamp, u64 duration)
{
	struct sched_atom *event, *curr_event = last_event(task);

	/*
	 * optimize an existing RUN event by merging this one
	 * to it:
	 */
	if (curr_event && curr_event->type == SCHED_EVENT_RUN) {
		sched->nr_run_events_optimized++;
		curr_event->duration += duration;
		return;
	}

	event = get_new_event(task, timestamp);

	event->type = SCHED_EVENT_RUN;
	event->duration = duration;

	sched->nr_run_events++;
}

static void add_sched_event_wakeup(struct perf_sched *sched, struct task_desc *task,
				   u64 timestamp, struct task_desc *wakee)
{
	struct sched_atom *event, *wakee_event;

	event = get_new_event(task, timestamp);
	event->type = SCHED_EVENT_WAKEUP;
	event->wakee = wakee;

	wakee_event = last_event(wakee);
	if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) {
		sched->targetless_wakeups++;
		return;
	}
	if (wakee_event->wait_sem) {
		sched->multitarget_wakeups++;
		return;
	}

	wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem));
	sem_init(wakee_event->wait_sem, 0, 0);
	wakee_event->specific_wait = 1;
	event->wait_sem = wakee_event->wait_sem;

	sched->nr_wakeup_events++;
}

static void add_sched_event_sleep(struct perf_sched *sched, struct task_desc *task,
				  u64 timestamp, u64 task_state __maybe_unused)
{
	struct sched_atom *event = get_new_event(task, timestamp);

	event->type = SCHED_EVENT_SLEEP;

	sched->nr_sleep_events++;
}

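/*
 * Look up or create the task_desc for @pid, growing pid_to_task on
 * demand (initially sized from kernel/pid_max, MAX_PID as a fallback).
 */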
static struct task_desc *register_pid(struct perf_sched *sched,
				      unsigned long pid, const char *comm)
{
	struct task_desc *task;
	static int pid_max;

	if (sched->pid_to_task == NULL) {
		if (sysctl__read_int("kernel/pid_max", &pid_max) < 0)
			pid_max = MAX_PID;
		BUG_ON((sched->pid_to_task = calloc(pid_max, sizeof(struct task_desc *))) == NULL);
	}
	if (pid >= (unsigned long)pid_max) {
		BUG_ON((sched->pid_to_task = realloc(sched->pid_to_task, (pid + 1) *
			sizeof(struct task_desc *))) == NULL);
		while (pid >= (unsigned long)pid_max)
			sched->pid_to_task[pid_max++] = NULL;
	}

	task = sched->pid_to_task[pid];

	if (task)
		return task;

	task = zalloc(sizeof(*task));
	task->pid = pid;
	task->nr = sched->nr_tasks;
	strcpy(task->comm, comm);
	/*
	 * every task starts in sleeping state - this gets ignored
	 * if there's no wakeup pointing to this sleep state:
	 */
	add_sched_event_sleep(sched, task, 0, 0);

	sched->pid_to_task[pid] = task;
	sched->nr_tasks++;
	sched->tasks = realloc(sched->tasks, sched->nr_tasks * sizeof(struct task_desc *));
	BUG_ON(!sched->tasks);
	sched->tasks[task->nr] = task;

	if (verbose > 0)
		printf("registered task #%ld, PID %ld (%s)\n", sched->nr_tasks, pid, comm);

	return task;
}


static void print_task_traces(struct perf_sched *sched)
{
	struct task_desc *task;
	unsigned long i;

	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
		printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
			task->nr, task->comm, task->pid, task->nr_events);
	}
}

static void add_cross_task_wakeups(struct perf_sched *sched)
{
	struct task_desc *task1, *task2;
	unsigned long i, j;

	for (i = 0; i < sched->nr_tasks; i++) {
		task1 = sched->tasks[i];
		j = i + 1;
		if (j == sched->nr_tasks)
			j = 0;
		task2 = sched->tasks[j];
		add_sched_event_wakeup(sched, task1, 0, task2);
	}
}

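/*
 * Re-enact one atom: burn CPU for RUN, wait on the semaphore for SLEEP,
 * post it for WAKEUP; migrations are not simulated.
 */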
static void perf_sched__process_event(struct perf_sched *sched,
				      struct sched_atom *atom)
{
	int ret = 0;

	switch (atom->type) {
		case SCHED_EVENT_RUN:
			burn_nsecs(sched, atom->duration);
			break;
		case SCHED_EVENT_SLEEP:
			if (atom->wait_sem)
				ret = sem_wait(atom->wait_sem);
			BUG_ON(ret);
			break;
		case SCHED_EVENT_WAKEUP:
			if (atom->wait_sem)
				ret = sem_post(atom->wait_sem);
			BUG_ON(ret);
			break;
		case SCHED_EVENT_MIGRATION:
			break;
		default:
			BUG_ON(1);
	}
}

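/* Total user+system CPU time of the parent process, in nanoseconds. */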
static u64 get_cpu_usage_nsec_parent(void)
{
	struct rusage ru;
	u64 sum;
	int err;

	err = getrusage(RUSAGE_SELF, &ru);
	BUG_ON(err);

	sum =  ru.ru_utime.tv_sec * NSEC_PER_SEC + ru.ru_utime.tv_usec * NSEC_PER_USEC;
	sum += ru.ru_stime.tv_sec * NSEC_PER_SEC + ru.ru_stime.tv_usec * NSEC_PER_USEC;

	return sum;
}

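/*
 * Open a task-clock software counter for the calling thread so each
 * replay thread can measure its own CPU usage.  If we run out of file
 * descriptors and -f was given, try to raise RLIMIT_NOFILE and retry.
 */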
static int self_open_counters(struct perf_sched *sched, unsigned long cur_task)
{
	struct perf_event_attr attr;
	char sbuf[STRERR_BUFSIZE], info[STRERR_BUFSIZE] = ""; /* stays empty unless we can offer a hint below */
	int fd;
	struct rlimit limit;
	bool need_privilege = false;

	memset(&attr, 0, sizeof(attr));

	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_TASK_CLOCK;

force_again:
	fd = sys_perf_event_open(&attr, 0, -1, -1,
				 perf_event_open_cloexec_flag());

	if (fd < 0) {
		if (errno == EMFILE) {
			if (sched->force) {
				BUG_ON(getrlimit(RLIMIT_NOFILE, &limit) == -1);
				limit.rlim_cur += sched->nr_tasks - cur_task;
				if (limit.rlim_cur > limit.rlim_max) {
					limit.rlim_max = limit.rlim_cur;
					need_privilege = true;
				}
				if (setrlimit(RLIMIT_NOFILE, &limit) == -1) {
					if (need_privilege && errno == EPERM)
						strcpy(info, "Need privilege\n");
				} else
					goto force_again;
			} else
				strcpy(info, "Try again with the -f option\n");
		}
		pr_err("Error: sys_perf_event_open() syscall returned "
		       "with %d (%s)\n%s", fd,
		       str_error_r(errno, sbuf, sizeof(sbuf)), info);
		exit(EXIT_FAILURE);
	}
	return fd;
}

static u64 get_cpu_usage_nsec_self(int fd)
{
	u64 runtime;
	int ret;

	ret = read(fd, &runtime, sizeof(runtime));
	BUG_ON(ret != sizeof(runtime));

	return runtime;
}

struct sched_thread_parms {
	struct task_desc  *task;
	struct perf_sched *sched;
	int fd;
};

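/*
 * Replay worker thread: signal readiness, wait for the parent to release
 * start_work_mutex, replay this task's atoms while measuring task-clock
 * usage, report completion, then loop for the next repetition.
 */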
static void *thread_func(void *ctx)
{
	struct sched_thread_parms *parms = ctx;
	struct task_desc *this_task = parms->task;
	struct perf_sched *sched = parms->sched;
	u64 cpu_usage_0, cpu_usage_1;
	unsigned long i, ret;
	char comm2[22];
	int fd = parms->fd;

	zfree(&parms);

	sprintf(comm2, ":%s", this_task->comm);
	prctl(PR_SET_NAME, comm2);
	if (fd < 0)
		return NULL;
again:
	ret = sem_post(&this_task->ready_for_work);
	BUG_ON(ret);
	ret = pthread_mutex_lock(&sched->start_work_mutex);
	BUG_ON(ret);
	ret = pthread_mutex_unlock(&sched->start_work_mutex);
	BUG_ON(ret);

	cpu_usage_0 = get_cpu_usage_nsec_self(fd);

	for (i = 0; i < this_task->nr_events; i++) {
		this_task->curr_event = i;
		perf_sched__process_event(sched, this_task->atoms[i]);
	}

	cpu_usage_1 = get_cpu_usage_nsec_self(fd);
	this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
	ret = sem_post(&this_task->work_done_sem);
	BUG_ON(ret);

	ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
	BUG_ON(ret);
	ret = pthread_mutex_unlock(&sched->work_done_wait_mutex);
	BUG_ON(ret);

	goto again;
}

static void create_tasks(struct perf_sched *sched)
{
	struct task_desc *task;
	pthread_attr_t attr;
	unsigned long i;
	int err;

	err = pthread_attr_init(&attr);
	BUG_ON(err);
	err = pthread_attr_setstacksize(&attr,
			(size_t) max(16 * 1024, PTHREAD_STACK_MIN));
	BUG_ON(err);
	err = pthread_mutex_lock(&sched->start_work_mutex);
	BUG_ON(err);
	err = pthread_mutex_lock(&sched->work_done_wait_mutex);
	BUG_ON(err);
	for (i = 0; i < sched->nr_tasks; i++) {
		struct sched_thread_parms *parms = malloc(sizeof(*parms));
		BUG_ON(parms == NULL);
		parms->task = task = sched->tasks[i];
		parms->sched = sched;
		parms->fd = self_open_counters(sched, i);
		sem_init(&task->sleep_sem, 0, 0);
		sem_init(&task->ready_for_work, 0, 0);
		sem_init(&task->work_done_sem, 0, 0);
		task->curr_event = 0;
		err = pthread_create(&task->thread, &attr, thread_func, parms);
		BUG_ON(err);
	}
}

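/*
 * One replay iteration, parent side: wait until every worker is ready,
 * release them, collect per-task and parent CPU usage and update the
 * running averages.
 */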
static void wait_for_tasks(struct perf_sched *sched)
{
	u64 cpu_usage_0, cpu_usage_1;
	struct task_desc *task;
	unsigned long i, ret;

	sched->start_time = get_nsecs();
	sched->cpu_usage = 0;
	pthread_mutex_unlock(&sched->work_done_wait_mutex);

	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
		ret = sem_wait(&task->ready_for_work);
		BUG_ON(ret);
		sem_init(&task->ready_for_work, 0, 0);
	}
	ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
	BUG_ON(ret);

	cpu_usage_0 = get_cpu_usage_nsec_parent();

	pthread_mutex_unlock(&sched->start_work_mutex);

	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
		ret = sem_wait(&task->work_done_sem);
		BUG_ON(ret);
		sem_init(&task->work_done_sem, 0, 0);
		sched->cpu_usage += task->cpu_usage;
		task->cpu_usage = 0;
	}

	cpu_usage_1 = get_cpu_usage_nsec_parent();
	if (!sched->runavg_cpu_usage)
		sched->runavg_cpu_usage = sched->cpu_usage;
	sched->runavg_cpu_usage = (sched->runavg_cpu_usage * (sched->replay_repeat - 1) + sched->cpu_usage) / sched->replay_repeat;

	sched->parent_cpu_usage = cpu_usage_1 - cpu_usage_0;
	if (!sched->runavg_parent_cpu_usage)
		sched->runavg_parent_cpu_usage = sched->parent_cpu_usage;
	sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * (sched->replay_repeat - 1) +
					 sched->parent_cpu_usage)/sched->replay_repeat;

	ret = pthread_mutex_lock(&sched->start_work_mutex);
	BUG_ON(ret);

	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
		sem_init(&task->sleep_sem, 0, 0);
		task->curr_event = 0;
	}
}

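/* Run a single replay iteration and print its runtime and CPU usage. */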
static void run_one_test(struct perf_sched *sched)
{
	u64 T0, T1, delta, avg_delta, fluct;

	T0 = get_nsecs();
	wait_for_tasks(sched);
	T1 = get_nsecs();

	delta = T1 - T0;
	sched->sum_runtime += delta;
	sched->nr_runs++;

	avg_delta = sched->sum_runtime / sched->nr_runs;
	if (delta < avg_delta)
		fluct = avg_delta - delta;
	else
		fluct = delta - avg_delta;
	sched->sum_fluct += fluct;
	if (!sched->run_avg)
		sched->run_avg = delta;
	sched->run_avg = (sched->run_avg * (sched->replay_repeat - 1) + delta) / sched->replay_repeat;

	printf("#%-3ld: %0.3f, ", sched->nr_runs, (double)delta / NSEC_PER_MSEC);

	printf("ravg: %0.2f, ", (double)sched->run_avg / NSEC_PER_MSEC);

	printf("cpu: %0.2f / %0.2f",
		(double)sched->cpu_usage / NSEC_PER_MSEC, (double)sched->runavg_cpu_usage / NSEC_PER_MSEC);

#if 0
	/*
	 * rusage statistics done by the parent, these are less
	 * accurate than the sched->sum_exec_runtime based statistics:
	 */
	printf(" [%0.2f / %0.2f]",
		(double)sched->parent_cpu_usage / NSEC_PER_MSEC,
		(double)sched->runavg_parent_cpu_usage / NSEC_PER_MSEC);
#endif

	printf("\n");

	if (sched->nr_sleep_corrections)
		printf(" (%ld sleep corrections)\n", sched->nr_sleep_corrections);
	sched->nr_sleep_corrections = 0;
}

static void test_calibrations(struct perf_sched *sched)
{
	u64 T0, T1;

	T0 = get_nsecs();
	burn_nsecs(sched, NSEC_PER_MSEC);
	T1 = get_nsecs();

	printf("the run test took %" PRIu64 " nsecs\n", T1 - T0);

	T0 = get_nsecs();
	sleep_nsecs(NSEC_PER_MSEC);
	T1 = get_nsecs();

	printf("the sleep test took %" PRIu64 " nsecs\n", T1 - T0);
}

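/*
 * Replay-mode tracepoint handlers: convert sched_wakeup/sched_switch
 * samples into sched_atoms on the per-task event lists.
 */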
static int
replay_wakeup_event(struct perf_sched *sched,
		    struct perf_evsel *evsel, struct perf_sample *sample,
		    struct machine *machine __maybe_unused)
{
	const char *comm = perf_evsel__strval(evsel, sample, "comm");
	const u32 pid	 = perf_evsel__intval(evsel, sample, "pid");
	struct task_desc *waker, *wakee;

	if (verbose > 0) {
		printf("sched_wakeup event %p\n", evsel);

		printf(" ... pid %d woke up %s/%d\n", sample->tid, comm, pid);
	}

	waker = register_pid(sched, sample->tid, "<unknown>");
	wakee = register_pid(sched, pid, comm);

	add_sched_event_wakeup(sched, waker, sample->time, wakee);
	return 0;
}

static int replay_switch_event(struct perf_sched *sched,
			       struct perf_evsel *evsel,
			       struct perf_sample *sample,
			       struct machine *machine __maybe_unused)
{
	const char *prev_comm  = perf_evsel__strval(evsel, sample, "prev_comm"),
		   *next_comm  = perf_evsel__strval(evsel, sample, "next_comm");
	const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
		  next_pid = perf_evsel__intval(evsel, sample, "next_pid");
	const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
	struct task_desc *prev, __maybe_unused *next;
	u64 timestamp0, timestamp = sample->time;
	int cpu = sample->cpu;
	s64 delta;

	if (verbose > 0)
		printf("sched_switch event %p\n", evsel);

	if (cpu >= MAX_CPUS || cpu < 0)
		return 0;

	timestamp0 = sched->cpu_last_switched[cpu];
	if (timestamp0)
		delta = timestamp - timestamp0;
	else
		delta = 0;

	if (delta < 0) {
		pr_err("hm, delta: %" PRId64 " < 0 ?\n", delta);
		return -1;
	}

	pr_debug(" ... switch from %s/%d to %s/%d [ran %" PRIu64 " nsecs]\n",
		 prev_comm, prev_pid, next_comm, next_pid, delta);

	prev = register_pid(sched, prev_pid, prev_comm);
	next = register_pid(sched, next_pid, next_comm);

	sched->cpu_last_switched[cpu] = timestamp;

	add_sched_event_run(sched, prev, timestamp, delta);
	add_sched_event_sleep(sched, prev, timestamp, prev_state);

	return 0;
}

static int replay_fork_event(struct perf_sched *sched,
			     union perf_event *event,
			     struct machine *machine)
{
	struct thread *child, *parent;

	child = machine__findnew_thread(machine, event->fork.pid,
					event->fork.tid);
	parent = machine__findnew_thread(machine, event->fork.ppid,
					 event->fork.ptid);

	if (child == NULL || parent == NULL) {
		pr_debug("thread does not exist on fork event: child %p, parent %p\n",
				 child, parent);
		goto out_put;
	}

	if (verbose > 0) {
		printf("fork event\n");
		printf("... parent: %s/%d\n", thread__comm_str(parent), parent->tid);
		printf("...  child: %s/%d\n", thread__comm_str(child), child->tid);
	}

	register_pid(sched, parent->tid, thread__comm_str(parent));
	register_pid(sched, child->tid, thread__comm_str(child));
out_put:
	thread__put(child);
	thread__put(parent);
	return 0;
}

struct sort_dimension {
	const char		*name;
	sort_fn_t		cmp;
	struct list_head	list;
};

/*
 * handle runtime stats saved per thread
 */
static struct thread_runtime *thread__init_runtime(struct thread *thread)
{
	struct thread_runtime *r;

	r = zalloc(sizeof(struct thread_runtime));
	if (!r)
		return NULL;

	init_stats(&r->run_stats);
	thread__set_priv(thread, r);

	return r;
}

static struct thread_runtime *thread__get_runtime(struct thread *thread)
{
	struct thread_runtime *tr;

	tr = thread__priv(thread);
	if (tr == NULL) {
		tr = thread__init_runtime(thread);
		if (tr == NULL)
			pr_debug("Failed to malloc memory for runtime data.\n");
	}

	return tr;
}

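/*
 * Compare two work_atoms entries with the chain of active sort
 * dimensions; the first non-zero comparison decides.
 */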
static int
thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms *r)
{
	struct sort_dimension *sort;
	int ret = 0;

	BUG_ON(list_empty(list));

	list_for_each_entry(sort, list, list) {
		ret = sort->cmp(l, r);
		if (ret)
			return ret;
	}

	return ret;
}

static struct work_atoms *
thread_atoms_search(struct rb_root *root, struct thread *thread,
			 struct list_head *sort_list)
{
	struct rb_node *node = root->rb_node;
	struct work_atoms key = { .thread = thread };

	while (node) {
		struct work_atoms *atoms;
		int cmp;

		atoms = container_of(node, struct work_atoms, node);

		cmp = thread_lat_cmp(sort_list, &key, atoms);
		if (cmp > 0)
			node = node->rb_left;
		else if (cmp < 0)
			node = node->rb_right;
		else {
			BUG_ON(thread != atoms->thread);
			return atoms;
		}
	}
	return NULL;
}

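/* rb-tree insert, ordered by the same sort keys used for searching. */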
static void
__thread_latency_insert(struct rb_root *root, struct work_atoms *data,
			 struct list_head *sort_list)
{
	struct rb_node **new = &(root->rb_node), *parent = NULL;

	while (*new) {
		struct work_atoms *this;
		int cmp;

		this = container_of(*new, struct work_atoms, node);
		parent = *new;

		cmp = thread_lat_cmp(sort_list, data, this);

		if (cmp > 0)
			new = &((*new)->rb_left);
		else
			new = &((*new)->rb_right);
	}

	rb_link_node(&data->node, parent, new);
	rb_insert_color(&data->node, root);
}

static int thread_atoms_insert(struct perf_sched *sched, struct thread *thread)
{
	struct work_atoms *atoms = zalloc(sizeof(*atoms));
	if (!atoms) {
		pr_err("No memory at %s\n", __func__);
		return -1;
	}

	atoms->thread = thread__get(thread);
	INIT_LIST_HEAD(&atoms->work_list);
	__thread_latency_insert(&sched->atom_root, atoms, &sched->cmp_pid);
	return 0;
}

static char sched_out_state(u64 prev_state)
{
	const char *str = TASK_STATE_TO_CHAR_STR;

	return str[prev_state];
}

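/*
 * Latency state machine: a sched-out opens a new atom; if the task was
 * still runnable ('R') it is only waiting for a CPU, so its wakeup time
 * is the sched-out time.  add_sched_in_event() later closes the atom and
 * accounts the wakeup-to-sched-in delta as scheduling latency.
 */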
static int
add_sched_out_event(struct work_atoms *atoms,
		    char run_state,
		    u64 timestamp)
{
	struct work_atom *atom = zalloc(sizeof(*atom));
	if (!atom) {
		pr_err("No memory at %s\n", __func__);
		return -1;
	}

	atom->sched_out_time = timestamp;

	if (run_state == 'R') {
		atom->state = THREAD_WAIT_CPU;
		atom->wake_up_time = atom->sched_out_time;
	}

	list_add_tail(&atom->list, &atoms->work_list);
	return 0;
}

static void
add_runtime_event(struct work_atoms *atoms, u64 delta,
		  u64 timestamp __maybe_unused)
{
	struct work_atom *atom;

	BUG_ON(list_empty(&atoms->work_list));

	atom = list_entry(atoms->work_list.prev, struct work_atom, list);

	atom->runtime += delta;
	atoms->total_runtime += delta;
}

static void
add_sched_in_event(struct work_atoms *atoms, u64 timestamp)
{
	struct work_atom *atom;
	u64 delta;

	if (list_empty(&atoms->work_list))
		return;

	atom = list_entry(atoms->work_list.prev, struct work_atom, list);

	if (atom->state != THREAD_WAIT_CPU)
		return;

	if (timestamp < atom->wake_up_time) {
		atom->state = THREAD_IGNORE;
		return;
	}

	atom->state = THREAD_SCHED_IN;
	atom->sched_in_time = timestamp;

	delta = atom->sched_in_time - atom->wake_up_time;
	atoms->total_lat += delta;
	if (delta > atoms->max_lat) {
		atoms->max_lat = delta;
		atoms->max_lat_at = timestamp;
	}
	atoms->nb_atoms++;
}

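/*
 * Latency-mode sched_switch handler: record a sched-out atom for the
 * outgoing task and complete the pending atom of the incoming one.
 */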
static int latency_switch_event(struct perf_sched *sched,
				struct perf_evsel *evsel,
				struct perf_sample *sample,
				struct machine *machine)
{
	const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
		  next_pid = perf_evsel__intval(evsel, sample, "next_pid");
	const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
	struct work_atoms *out_events, *in_events;
	struct thread *sched_out, *sched_in;
	u64 timestamp0, timestamp = sample->time;
	int cpu = sample->cpu, err = -1;
	s64 delta;

	BUG_ON(cpu >= MAX_CPUS || cpu < 0);

	timestamp0 = sched->cpu_last_switched[cpu];
	sched->cpu_last_switched[cpu] = timestamp;
	if (timestamp0)
		delta = timestamp - timestamp0;
	else
		delta = 0;

	if (delta < 0) {
		pr_err("hm, delta: %" PRId64 " < 0 ?\n", delta);
		return -1;
	}

	sched_out = machine__findnew_thread(machine, -1, prev_pid);
	sched_in = machine__findnew_thread(machine, -1, next_pid);
	if (sched_out == NULL || sched_in == NULL)
		goto out_put;

	out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
	if (!out_events) {
		if (thread_atoms_insert(sched, sched_out))
			goto out_put;
		out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
		if (!out_events) {
			pr_err("out-event: Internal tree error");
			goto out_put;
		}
	}
	if (add_sched_out_event(out_events, sched_out_state(prev_state), timestamp))
		return -1;

	in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
	if (!in_events) {
		if (thread_atoms_insert(sched, sched_in))
			goto out_put;
		in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
		if (!in_events) {
			pr_err("in-event: Internal tree error");
			goto out_put;
		}