1 /*
2  * Copyright (C) 2012 Fusion-io
3  *
4  *  This program is free software; you can redistribute it and/or
5  *  modify it under the terms of the GNU General Public
6  *  License v2 as published by the Free Software Foundation.
7  *
8  *  This program is distributed in the hope that it will be useful,
9  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
10  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11  *  GNU General Public License for more details.
12  *
13  *  You should have received a copy of the GNU General Public License
14  *  along with this program; if not, write to the Free Software
15  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16  *
17  *  Parts of this file were imported from Jens Axboe's blktrace sources (also GPL)
18  */
19 #include <sys/types.h>
20 #include <sys/stat.h>
21 #include <fcntl.h>
22 #include <unistd.h>
23 #include <stdlib.h>
24 #include <stdio.h>
25 #include <math.h>
26 #include <inttypes.h>
27 #include <string.h>
28 #include <asm/types.h>
29 #include <errno.h>
30 #include <sys/mman.h>
31 #include <time.h>
32 #include <math.h>
33 #include <dirent.h>
34 
35 #include "plot.h"
36 #include "blkparse.h"
37 #include "list.h"
38 #include "tracers.h"
39 
40 #define IO_HASH_TABLE_BITS  11
41 #define IO_HASH_TABLE_SIZE (1 << IO_HASH_TABLE_BITS)
42 static struct list_head io_hash_table[IO_HASH_TABLE_SIZE];
43 static u64 ios_in_flight = 0;
44 
45 #define PROCESS_HASH_TABLE_BITS 7
46 #define PROCESS_HASH_TABLE_SIZE (1 << PROCESS_HASH_TABLE_BITS)
47 static struct list_head process_hash_table[PROCESS_HASH_TABLE_SIZE];
48 
49 extern int plot_io_action;
50 extern int io_per_process;
51 
52 /*
53  * Trace categories
54  */
55 enum {
56 	BLK_TC_READ	= 1 << 0,	/* reads */
57 	BLK_TC_WRITE	= 1 << 1,	/* writes */
58 	BLK_TC_FLUSH	= 1 << 2,	/* flush */
59 	BLK_TC_SYNC	= 1 << 3,	/* sync */
60 	BLK_TC_QUEUE	= 1 << 4,	/* queueing/merging */
61 	BLK_TC_REQUEUE	= 1 << 5,	/* requeueing */
62 	BLK_TC_ISSUE	= 1 << 6,	/* issue */
63 	BLK_TC_COMPLETE	= 1 << 7,	/* completions */
64 	BLK_TC_FS	= 1 << 8,	/* fs requests */
65 	BLK_TC_PC	= 1 << 9,	/* pc requests */
66 	BLK_TC_NOTIFY	= 1 << 10,	/* special message */
67 	BLK_TC_AHEAD	= 1 << 11,	/* readahead */
68 	BLK_TC_META	= 1 << 12,	/* metadata */
69 	BLK_TC_DISCARD	= 1 << 13,	/* discard requests */
70 	BLK_TC_DRV_DATA	= 1 << 14,	/* binary driver data */
71 	BLK_TC_FUA	= 1 << 15,	/* fua requests */
72 
73 	BLK_TC_END	= 1 << 15,	/* we've run out of bits! */
74 };
75 
76 #define BLK_TC_SHIFT		(16)
77 #define BLK_TC_ACT(act)		((act) << BLK_TC_SHIFT)
78 #define BLK_DATADIR(a) (((a) >> BLK_TC_SHIFT) & (BLK_TC_READ | BLK_TC_WRITE))
79 
80 /*
81  * Basic trace actions
82  */
83 enum {
84 	__BLK_TA_QUEUE = 1,		/* queued */
85 	__BLK_TA_BACKMERGE,		/* back merged to existing rq */
86 	__BLK_TA_FRONTMERGE,		/* front merge to existing rq */
87 	__BLK_TA_GETRQ,			/* allocated new request */
88 	__BLK_TA_SLEEPRQ,		/* sleeping on rq allocation */
89 	__BLK_TA_REQUEUE,		/* request requeued */
90 	__BLK_TA_ISSUE,			/* sent to driver */
91 	__BLK_TA_COMPLETE,		/* completed by driver */
92 	__BLK_TA_PLUG,			/* queue was plugged */
93 	__BLK_TA_UNPLUG_IO,		/* queue was unplugged by io */
94 	__BLK_TA_UNPLUG_TIMER,		/* queue was unplugged by timer */
95 	__BLK_TA_INSERT,		/* insert request */
96 	__BLK_TA_SPLIT,			/* bio was split */
97 	__BLK_TA_BOUNCE,		/* bio was bounced */
98 	__BLK_TA_REMAP,			/* bio was remapped */
99 	__BLK_TA_ABORT,			/* request aborted */
100 	__BLK_TA_DRV_DATA,		/* binary driver data */
101 };
102 
103 #define BLK_TA_MASK ((1 << BLK_TC_SHIFT) - 1)
104 
105 /*
106  * Notify events.
107  */
108 enum blktrace_notify {
109 	__BLK_TN_PROCESS = 0,		/* establish pid/name mapping */
110 	__BLK_TN_TIMESTAMP,		/* include system clock */
111 	__BLK_TN_MESSAGE,               /* Character string message */
112 };
113 
114 /*
115  * Trace actions in full. Additionally, read or write is masked
116  */
117 #define BLK_TA_QUEUE		(__BLK_TA_QUEUE | BLK_TC_ACT(BLK_TC_QUEUE))
118 #define BLK_TA_BACKMERGE	(__BLK_TA_BACKMERGE | BLK_TC_ACT(BLK_TC_QUEUE))
119 #define BLK_TA_FRONTMERGE	(__BLK_TA_FRONTMERGE | BLK_TC_ACT(BLK_TC_QUEUE))
120 #define	BLK_TA_GETRQ		(__BLK_TA_GETRQ | BLK_TC_ACT(BLK_TC_QUEUE))
121 #define	BLK_TA_SLEEPRQ		(__BLK_TA_SLEEPRQ | BLK_TC_ACT(BLK_TC_QUEUE))
122 #define	BLK_TA_REQUEUE		(__BLK_TA_REQUEUE | BLK_TC_ACT(BLK_TC_REQUEUE))
123 #define BLK_TA_ISSUE		(__BLK_TA_ISSUE | BLK_TC_ACT(BLK_TC_ISSUE))
124 #define BLK_TA_COMPLETE		(__BLK_TA_COMPLETE| BLK_TC_ACT(BLK_TC_COMPLETE))
125 #define BLK_TA_PLUG		(__BLK_TA_PLUG | BLK_TC_ACT(BLK_TC_QUEUE))
126 #define BLK_TA_UNPLUG_IO	(__BLK_TA_UNPLUG_IO | BLK_TC_ACT(BLK_TC_QUEUE))
127 #define BLK_TA_UNPLUG_TIMER	(__BLK_TA_UNPLUG_TIMER | BLK_TC_ACT(BLK_TC_QUEUE))
128 #define BLK_TA_INSERT		(__BLK_TA_INSERT | BLK_TC_ACT(BLK_TC_QUEUE))
129 #define BLK_TA_SPLIT		(__BLK_TA_SPLIT)
130 #define BLK_TA_BOUNCE		(__BLK_TA_BOUNCE)
131 #define BLK_TA_REMAP		(__BLK_TA_REMAP | BLK_TC_ACT(BLK_TC_QUEUE))
132 #define BLK_TA_ABORT		(__BLK_TA_ABORT | BLK_TC_ACT(BLK_TC_QUEUE))
133 #define BLK_TA_DRV_DATA		(__BLK_TA_DRV_DATA | BLK_TC_ACT(BLK_TC_DRV_DATA))
134 
135 #define BLK_TN_PROCESS		(__BLK_TN_PROCESS | BLK_TC_ACT(BLK_TC_NOTIFY))
136 #define BLK_TN_TIMESTAMP	(__BLK_TN_TIMESTAMP | BLK_TC_ACT(BLK_TC_NOTIFY))
137 #define BLK_TN_MESSAGE		(__BLK_TN_MESSAGE | BLK_TC_ACT(BLK_TC_NOTIFY))
138 
139 #define BLK_IO_TRACE_MAGIC	0x65617400
140 #define BLK_IO_TRACE_VERSION	0x07
141 /*
142  * The trace itself
143  */
144 struct blk_io_trace {
145 	__u32 magic;		/* MAGIC << 8 | version */
146 	__u32 sequence;		/* event number */
147 	__u64 time;		/* in nanoseconds */
148 	__u64 sector;		/* disk offset */
149 	__u32 bytes;		/* transfer length */
150 	__u32 action;		/* what happened */
151 	__u32 pid;		/* who did it */
152 	__u32 device;		/* device identifier (dev_t) */
153 	__u32 cpu;		/* on what cpu did it happen */
154 	__u16 error;		/* completion error */
155 	__u16 pdu_len;		/* length of data after this trace */
156 };
157 
158 struct pending_io {
159 	/* sector offset of this IO */
160 	u64 sector;
161 
162 	/* dev_t for this IO */
163 	u32 device;
164 
165 	/* time this IO was dispatched */
166 	u64 dispatch_time;
167 	/* time this IO was finished */
168 	u64 completion_time;
169 	struct list_head hash_list;
170 	/* process which queued this IO */
171 	u32 pid;
172 };
173 
174 struct pid_map {
175 	struct list_head hash_list;
176 	u32 pid;
177 	int index;
178 	char name[0];
179 };
180 
get_record_time(struct trace * trace)181 u64 get_record_time(struct trace *trace)
182 {
183 	return trace->io->time;
184 }
185 
init_io_hash_table(void)186 void init_io_hash_table(void)
187 {
188 	int i;
189 	struct list_head *head;
190 
191 	for (i = 0; i < IO_HASH_TABLE_SIZE; i++) {
192 		head = io_hash_table + i;
193 		INIT_LIST_HEAD(head);
194 	}
195 }
196 
197 /* taken from the kernel hash.h */
hash_sector(u64 val)198 static inline u64 hash_sector(u64 val)
199 {
200 	u64 hash = val;
201 
202 	/*  Sigh, gcc can't optimise this alone like it does for 32 bits. */
203 	u64 n = hash;
204 	n <<= 18;
205 	hash -= n;
206 	n <<= 33;
207 	hash -= n;
208 	n <<= 3;
209 	hash += n;
210 	n <<= 3;
211 	hash -= n;
212 	n <<= 4;
213 	hash += n;
214 	n <<= 2;
215 	hash += n;
216 
217 	/* High bits are more random, so use them. */
218 	return hash >> (64 - IO_HASH_TABLE_BITS);
219 }
220 
io_hash_table_insert(struct pending_io * ins_pio)221 static int io_hash_table_insert(struct pending_io *ins_pio)
222 {
223 	u64 sector = ins_pio->sector;
224 	u32 dev = ins_pio->device;
225 	int slot = hash_sector(sector);
226 	struct list_head *head;
227 	struct pending_io *pio;
228 
229 	head = io_hash_table + slot;
230 	list_for_each_entry(pio, head, hash_list) {
231 		if (pio->sector == sector && pio->device == dev)
232 			return -EEXIST;
233 	}
234 	list_add_tail(&ins_pio->hash_list, head);
235 	return 0;
236 }
237 
io_hash_table_search(u64 sector,u32 dev)238 static struct pending_io *io_hash_table_search(u64 sector, u32 dev)
239 {
240 	int slot = hash_sector(sector);
241 	struct list_head *head;
242 	struct pending_io *pio;
243 
244 	head = io_hash_table + slot;
245 	list_for_each_entry(pio, head, hash_list) {
246 		if (pio->sector == sector && pio->device == dev)
247 			return pio;
248 	}
249 	return NULL;
250 }
251 
hash_queued_io(struct blk_io_trace * io)252 static struct pending_io *hash_queued_io(struct blk_io_trace *io)
253 {
254 	struct pending_io *pio;
255 	int ret;
256 
257 	pio = calloc(1, sizeof(*pio));
258 	pio->sector = io->sector;
259 	pio->device = io->device;
260 	pio->pid = io->pid;
261 
262 	ret = io_hash_table_insert(pio);
263 	if (ret < 0) {
264 		/* crud, the IO is there already */
265 		free(pio);
266 		return NULL;
267 	}
268 	return pio;
269 }
270 
hash_dispatched_io(struct blk_io_trace * io)271 static struct pending_io *hash_dispatched_io(struct blk_io_trace *io)
272 {
273 	struct pending_io *pio;
274 
275 	pio = io_hash_table_search(io->sector, io->device);
276 	if (!pio) {
277 		pio = hash_queued_io(io);
278 		if (!pio)
279 			return NULL;
280 	}
281 	pio->dispatch_time = io->time;
282 	return pio;
283 }
284 
hash_completed_io(struct blk_io_trace * io)285 static struct pending_io *hash_completed_io(struct blk_io_trace *io)
286 {
287 	struct pending_io *pio;
288 
289 	pio = io_hash_table_search(io->sector, io->device);
290 
291 	if (!pio)
292 		return NULL;
293 	return pio;
294 }
295 
init_process_hash_table(void)296 void init_process_hash_table(void)
297 {
298 	int i;
299 	struct list_head *head;
300 
301 	for (i = 0; i < PROCESS_HASH_TABLE_SIZE; i++) {
302 		head = process_hash_table + i;
303 		INIT_LIST_HEAD(head);
304 	}
305 }
306 
hash_pid(u32 pid)307 static u32 hash_pid(u32 pid)
308 {
309 	u32 hash = pid;
310 
311 	hash ^= pid >> 3;
312 	hash ^= pid >> 3;
313 	hash ^= pid >> 4;
314 	hash ^= pid >> 6;
315 	return (hash & (PROCESS_HASH_TABLE_SIZE - 1));
316 }
317 
process_hash_search(u32 pid)318 static struct pid_map *process_hash_search(u32 pid)
319 {
320 	int slot = hash_pid(pid);
321 	struct list_head *head;
322 	struct pid_map *pm;
323 
324 	head = process_hash_table + slot;
325 	list_for_each_entry(pm, head, hash_list) {
326 		if (pm->pid == pid)
327 			return pm;
328 	}
329 	return NULL;
330 }
331 
process_hash_insert(u32 pid,char * name)332 static struct pid_map *process_hash_insert(u32 pid, char *name)
333 {
334 	int slot = hash_pid(pid);
335 	struct pid_map *pm;
336 	int old_index = 0;
337 	char buf[16];
338 
339 	pm = process_hash_search(pid);
340 	if (pm) {
341 		/* Entry exists and name shouldn't be changed? */
342 		if (!name || !strcmp(name, pm->name))
343 			return pm;
344 		list_del(&pm->hash_list);
345 		old_index = pm->index;
346 		free(pm);
347 	}
348 	if (!name) {
349 		sprintf(buf, "[%u]", pid);
350 		name = buf;
351 	}
352 	pm = malloc(sizeof(struct pid_map) + strlen(name) + 1);
353 	pm->pid = pid;
354 	pm->index = old_index;
355 	strcpy(pm->name, name);
356 	list_add_tail(&pm->hash_list, process_hash_table + slot);
357 
358 	return pm;
359 }
360 
handle_notify(struct trace * trace)361 static void handle_notify(struct trace *trace)
362 {
363 	struct blk_io_trace *io = trace->io;
364 	void *payload = (char *)io + sizeof(*io);
365 	u32 two32[2];
366 
367 	if (io->action == BLK_TN_PROCESS) {
368 		if (io_per_process)
369 			process_hash_insert(io->pid, payload);
370 		return;
371 	}
372 
373 	if (io->action != BLK_TN_TIMESTAMP)
374 		return;
375 
376 	if (io->pdu_len != sizeof(two32))
377 		return;
378 
379 	memcpy(two32, payload, sizeof(two32));
380 	trace->start_timestamp = io->time;
381 	trace->abs_start_time.tv_sec = two32[0];
382 	trace->abs_start_time.tv_nsec = two32[1];
383 	if (trace->abs_start_time.tv_nsec < 0) {
384 		trace->abs_start_time.tv_sec--;
385 		trace->abs_start_time.tv_nsec += 1000000000;
386 	}
387 }
388 
next_record(struct trace * trace)389 int next_record(struct trace *trace)
390 {
391 	int skip = trace->io->pdu_len;
392 	u64 offset;
393 
394 	trace->cur += sizeof(*trace->io) + skip;
395 	offset = trace->cur - trace->start;
396 	if (offset >= trace->len)
397 		return 1;
398 
399 	trace->io = (struct blk_io_trace *)trace->cur;
400 	return 0;
401 }
402 
first_record(struct trace * trace)403 void first_record(struct trace *trace)
404 {
405 	trace->cur = trace->start;
406 	trace->io = (struct blk_io_trace *)trace->cur;
407 }
408 
is_io_event(struct blk_io_trace * test)409 static int is_io_event(struct blk_io_trace *test)
410 {
411 	char *message;
412 	if (!(test->action & BLK_TC_ACT(BLK_TC_NOTIFY)))
413 		return 1;
414 	if (test->action == BLK_TN_MESSAGE) {
415 		int len = test->pdu_len;
416 		if (len < 3)
417 			return 0;
418 		message = (char *)(test + 1);
419 		if (strncmp(message, "fio ", 4) == 0) {
420 			return 1;
421 		}
422 	}
423 	return 0;
424 }
425 
find_last_time(struct trace * trace)426 u64 find_last_time(struct trace *trace)
427 {
428 	char *p = trace->start + trace->len;
429 	struct blk_io_trace *test;
430 	int search_len = 0;
431 	u64 found = 0;
432 
433 	if (trace->len < sizeof(*trace->io))
434 		return 0;
435 	p -= sizeof(*trace->io);
436 	while (p >= trace->start) {
437 		test = (struct blk_io_trace *)p;
438 		if (CHECK_MAGIC(test) && is_io_event(test)) {
439 			u64 offset = p - trace->start;
440 			if (offset + sizeof(*test) + test->pdu_len == trace->len) {
441 				return test->time;
442 			}
443 		}
444 		p--;
445 		search_len++;
446 		if (search_len > 8192) {
447 			break;
448 		}
449 	}
450 
451 	/* searching backwards didn't work out, we'll have to scan the file */
452 	first_record(trace);
453 	while (1) {
454 		if (is_io_event(trace->io))
455 			found = trace->io->time;
456 		if (next_record(trace))
457 			break;
458 	}
459 	first_record(trace);
460 	return found;
461 }
462 
parse_fio_bank_message(struct trace * trace,u64 * bank_ret,u64 * offset_ret,u64 * num_banks_ret)463 static int parse_fio_bank_message(struct trace *trace, u64 *bank_ret, u64 *offset_ret,
464 			   u64 *num_banks_ret)
465 {
466 	char *s;
467 	char *next;
468 	char *message;
469 	struct blk_io_trace *test = trace->io;
470 	int len = test->pdu_len;
471 	u64 bank;
472 	u64 offset;
473 	u64 num_banks;
474 
475 	if (!(test->action & BLK_TC_ACT(BLK_TC_NOTIFY)))
476 		return -1;
477 	if (test->action != BLK_TN_MESSAGE)
478 		return -1;
479 
480 	/* the message is fio rw bank offset num_banks */
481 	if (len < 3)
482 		return -1;
483 	message = (char *)(test + 1);
484 	if (strncmp(message, "fio r ", 6) != 0)
485 		return -1;
486 
487 	message = strndup(message, len);
488 	s = strchr(message, ' ');
489 	if (!s)
490 		goto out;
491 	s++;
492 	s = strchr(s, ' ');
493 	if (!s)
494 		goto out;
495 
496 	bank = strtoll(s, &next, 10);
497 	if (s == next)
498 		goto out;
499 	s = next;
500 
501 	offset = strtoll(s, &next, 10);
502 	if (s == next)
503 		goto out;
504 	s = next;
505 
506 	num_banks = strtoll(s, &next, 10);
507 	if (s == next)
508 		goto out;
509 
510 	*bank_ret = bank;
511 	*offset_ret = offset;
512 	*num_banks_ret = num_banks;
513 
514 	return 0;
515 out:
516 	free(message);
517 	return -1;
518 }
519 
lookup_dev(struct trace * trace,struct blk_io_trace * io)520 static struct dev_info *lookup_dev(struct trace *trace, struct blk_io_trace *io)
521 {
522 	u32 dev = io->device;
523 	int i;
524 	struct dev_info *di = NULL;
525 
526 	for (i = 0; i < trace->num_devices; i++) {
527 		if (trace->devices[i].device == dev) {
528 			di = trace->devices + i;
529 			goto found;
530 		}
531 	}
532 	i = trace->num_devices++;
533 	if (i >= MAX_DEVICES_PER_TRACE) {
534 		fprintf(stderr, "Trace contains too many devices (%d)\n", i);
535 		exit(1);
536 	}
537 	di = trace->devices + i;
538 	di->device = dev;
539 found:
540 	return di;
541 }
542 
map_devices(struct trace * trace)543 static void map_devices(struct trace *trace)
544 {
545 	struct dev_info *di;
546 	u64 found;
547 	u64 map_start = 0;
548 	int i;
549 
550 	first_record(trace);
551 	while (1) {
552 		if (!(trace->io->action & BLK_TC_ACT(BLK_TC_NOTIFY))) {
553 			di = lookup_dev(trace, trace->io);
554 			found = trace->io->sector << 9;
555 			if (found < di->min)
556 				di->min = found;
557 
558 			found += trace->io->bytes;
559 			if (di->max < found)
560 				di->max = found;
561 		}
562 		if (next_record(trace))
563 			break;
564 	}
565 	first_record(trace);
566 	for (i = 0; i < trace->num_devices; i++) {
567 		di = trace->devices + i;
568 		di->map = map_start;
569 		map_start += di->max - di->min;
570 	}
571 }
572 
map_io(struct trace * trace,struct blk_io_trace * io)573 static u64 map_io(struct trace *trace, struct blk_io_trace *io)
574 {
575 	struct dev_info *di = lookup_dev(trace, io);
576 	u64 val = trace->io->sector << 9;
577 	return di->map + val - di->min;
578 }
579 
find_extreme_offsets(struct trace * trace,u64 * min_ret,u64 * max_ret,u64 * max_bank_ret,u64 * max_offset_ret)580 void find_extreme_offsets(struct trace *trace, u64 *min_ret, u64 *max_ret, u64 *max_bank_ret,
581 			  u64 *max_offset_ret)
582 {
583 	u64 found = 0;
584 	u64 max = 0, min = ~(u64)0;
585 	u64 max_bank = 0;
586 	u64 max_bank_offset = 0;
587 	u64 num_banks = 0;
588 
589 	map_devices(trace);
590 
591 	first_record(trace);
592 	while (1) {
593 		if (!(trace->io->action & BLK_TC_ACT(BLK_TC_NOTIFY))) {
594 			found = map_io(trace, trace->io);
595 			if (found < min)
596 				min = found;
597 
598 			found += trace->io->bytes;
599 			if (max < found)
600 				max = found;
601 		} else {
602 			u64 bank;
603 			u64 offset;
604 			if (!parse_fio_bank_message(trace, &bank,
605 						    &offset, &num_banks)) {
606 				if (bank > max_bank)
607 					max_bank = bank;
608 				if (offset > max_bank_offset)
609 					max_bank_offset = offset;
610 			}
611 		}
612 		if (next_record(trace))
613 			break;
614 	}
615 	first_record(trace);
616 	*min_ret = min;
617 	*max_ret = max;
618 	*max_bank_ret = max_bank;
619 	*max_offset_ret = max_bank_offset;
620 }
621 
check_io_types(struct trace * trace)622 static void check_io_types(struct trace *trace)
623 {
624 	struct blk_io_trace *io = trace->io;
625 	int action = io->action & BLK_TA_MASK;
626 
627 	if (!(io->action & BLK_TC_ACT(BLK_TC_NOTIFY))) {
628 		switch (action) {
629 		case __BLK_TA_COMPLETE:
630 			trace->found_completion = 1;
631 			break;
632 		case __BLK_TA_ISSUE:
633 			trace->found_issue = 1;
634 			break;
635 		case __BLK_TA_QUEUE:
636 			trace->found_queue = 1;
637 			break;
638 		};
639 	}
640 }
641 
642 
filter_outliers(struct trace * trace,u64 min_offset,u64 max_offset,u64 * yzoom_min,u64 * yzoom_max)643 int filter_outliers(struct trace *trace, u64 min_offset, u64 max_offset,
644 		    u64 *yzoom_min, u64 *yzoom_max)
645 {
646 	int hits[11];
647 	u64 max_per_bucket[11];
648 	u64 min_per_bucket[11];
649 	u64 bytes_per_bucket = (max_offset - min_offset + 1) / 10;
650 	int slot;
651 	int fat_count = 0;
652 
653 	memset(hits, 0, sizeof(int) * 11);
654 	memset(max_per_bucket, 0, sizeof(u64) * 11);
655 	memset(min_per_bucket, 0xff, sizeof(u64) * 11);
656 	first_record(trace);
657 	while (1) {
658 		check_io_types(trace);
659 		if (!(trace->io->action & BLK_TC_ACT(BLK_TC_NOTIFY)) &&
660 		    (trace->io->action & BLK_TA_MASK) == __BLK_TA_QUEUE) {
661 			u64 off = map_io(trace, trace->io) - min_offset;
662 
663 			slot = (int)(off / bytes_per_bucket);
664 			hits[slot]++;
665 			if (off < min_per_bucket[slot])
666 				min_per_bucket[slot] = off;
667 
668 			off += trace->io->bytes;
669 			slot = (int)(off / bytes_per_bucket);
670 			hits[slot]++;
671 			if (off > max_per_bucket[slot])
672 				max_per_bucket[slot] = off;
673 		}
674 		if (next_record(trace))
675 			break;
676 	}
677 	first_record(trace);
678 	for (slot = 0; slot < 11; slot++) {
679 		if (hits[slot] > fat_count) {
680 			fat_count = hits[slot];
681 		}
682 	}
683 
684 	*yzoom_max = max_offset;
685 	for (slot = 10; slot >= 0; slot--) {
686 		double d = hits[slot];
687 
688 		if (d >= (double)fat_count * .05) {
689 			*yzoom_max = max_per_bucket[slot] + min_offset;
690 			break;
691 		}
692 	}
693 
694 	*yzoom_min = min_offset;
695 	for (slot = 0; slot < 10; slot++) {
696 		double d = hits[slot];
697 
698 		if (d >= (double)fat_count * .05) {
699 			*yzoom_min = min_per_bucket[slot] + min_offset;
700 			break;
701 		}
702 	}
703 	return 0;
704 }
705 
706 static char footer[] = ".blktrace.0";
707 static int footer_len = sizeof(footer) - 1;
708 
match_trace(char * name,int * len)709 static int match_trace(char *name, int *len)
710 {
711 	int match_len;
712 	int footer_start;
713 
714 	match_len = strlen(name);
715 	if (match_len <= footer_len)
716 		return 0;
717 
718 	footer_start = match_len - footer_len;
719 	if (strcmp(name + footer_start, footer) != 0)
720 		return 0;
721 
722 	if (len)
723 		*len = match_len;
724 	return 1;
725 }
726 
727 struct tracelist {
728 	struct tracelist *next;
729 	char *name;
730 };
731 
traces_list(char * dir_name,int * len)732 static struct tracelist *traces_list(char *dir_name, int *len)
733 {
734 	int count = 0;
735 	struct tracelist *traces = NULL;
736 	int dlen = strlen(dir_name);
737 	DIR *dir = opendir(dir_name);
738 	if (!dir)
739 		return NULL;
740 
741 	while (1) {
742 		int n = 0;
743 		struct tracelist *tl;
744 		struct dirent *d = readdir(dir);
745 		if (!d)
746 			break;
747 
748 		if (!match_trace(d->d_name, &n))
749 			continue;
750 
751 		n += dlen + 1; /* dir + '/' + file */
752 		/* Allocate space for tracelist + filename */
753 		tl = calloc(1, sizeof(struct tracelist) + (sizeof(char) * (n + 1)));
754 		if (!tl) {
755 			closedir(dir);
756 			return NULL;
757 		}
758 		tl->next = traces;
759 		tl->name = (char *)(tl + 1);
760 		snprintf(tl->name, n, "%s/%s", dir_name, d->d_name);
761 		traces = tl;
762 		count++;
763 	}
764 
765 	closedir(dir);
766 
767 	if (len)
768 		*len = count;
769 
770 	return traces;
771 }
772 
traces_free(struct tracelist * traces)773 static void traces_free(struct tracelist *traces)
774 {
775 	while (traces) {
776 		struct tracelist *tl = traces;
777 		traces = traces->next;
778 		free(tl);
779 	}
780 }
781 
dump_traces(struct tracelist * traces,int count,char * dumpfile)782 static int dump_traces(struct tracelist *traces, int count, char *dumpfile)
783 {
784 	struct tracelist *tl;
785 	char **argv = NULL;
786 	int argc = 0;
787 	int i;
788 	int err = 0;
789 
790 	argc = count * 2; /* {"-i", trace } */
791 	argc += 4; /* See below */
792 	argv = calloc(argc + 1, sizeof(char *));
793 	if (!argv)
794 		return -errno;
795 
796 	i = 0;
797 	argv[i++] = "blkparse";
798 	argv[i++] = "-O";
799 	argv[i++] = "-d";
800 	argv[i++] = dumpfile;
801 	for (tl = traces; tl != NULL; tl = tl->next) {
802 		argv[i++] = "-i";
803 		argv[i++] = tl->name;
804 	}
805 
806 	err = run_program(argc, argv, 1, NULL, NULL);
807 	if (err)
808 		fprintf(stderr, "%s exited with %d, expected 0\n", argv[0], err);
809 	free(argv);
810 	return err;
811 }
812 
find_trace_file(char * filename)813 static char *find_trace_file(char *filename)
814 {
815 	int ret;
816 	struct stat st;
817 	char *dot;
818 	int found_dir = 0;
819 	char *dumpfile;
820 	int len = strlen(filename);
821 
822 	/* look for an exact match of whatever they pass in.
823 	 * If it is a file, assume it is the dump file.
824 	 * If a directory, remember that it existed so we
825 	 * can combine traces in that directory later
826 	 */
827 	ret = stat(filename, &st);
828 	if (ret == 0) {
829 		if (S_ISREG(st.st_mode))
830 			return strdup(filename);
831 
832 		if (S_ISDIR(st.st_mode))
833 			found_dir = 1;
834 	}
835 
836 	if (found_dir) {
837 		int i;
838 		/* Eat up trailing '/'s */
839 		for (i = len - 1; filename[i] == '/'; i--)
840 			filename[i] = '\0';
841 	}
842 
843 	/*
844 	 * try tacking .dump onto the end and see if that already
845 	 * has been generated
846 	 */
847 	ret = asprintf(&dumpfile, "%s.dump", filename);
848 	if (ret == -1) {
849 		perror("Error building dump file name");
850 		return NULL;
851 	}
852 	ret = stat(dumpfile, &st);
853 	if (ret == 0)
854 		return dumpfile;
855 
856 	/*
857 	 * try to generate the .dump from all the traces in
858 	 * a single dir.
859 	 */
860 	if (found_dir) {
861 		int count;
862 		struct tracelist *traces = traces_list(filename, &count);
863 		if (traces) {
864 			ret = dump_traces(traces, count, dumpfile);
865 			traces_free(traces);
866 			if (ret == 0)
867 				return dumpfile;
868 		}
869 	}
870 	free(dumpfile);
871 
872 	/*
873 	 * try to generate the .dump from all the blktrace
874 	 * files for a named trace
875 	 */
876 	dot = strrchr(filename, '.');
877 	if (!dot || strcmp(".dump", dot) != 0) {
878 		struct tracelist trace = {0 ,NULL};
879 		if (dot && dot != filename)
880 			len = dot - filename;
881 
882 		ret = asprintf(&trace.name, "%*s.blktrace.0", len, filename);
883 		if (ret == -1)
884 			return NULL;
885 		ret = asprintf(&dumpfile, "%*s.dump", len, filename);
886 		if (ret == -1) {
887 			free(trace.name);
888 			return NULL;
889 		}
890 
891 		ret = dump_traces(&trace, 1, dumpfile);
892 		if (ret == 0) {
893 			free(trace.name);
894 			return dumpfile;
895 		}
896 		free(trace.name);
897 		free(dumpfile);
898 	}
899 	return NULL;
900 }
open_trace(char * filename)901 struct trace *open_trace(char *filename)
902 {
903 	int fd;
904 	char *p;
905 	struct stat st;
906 	int ret;
907 	struct trace *trace;
908 	char *found_filename;
909 
910 	trace = calloc(1, sizeof(*trace));
911 	if (!trace) {
912 		fprintf(stderr, "unable to allocate memory for trace\n");
913 		return NULL;
914 	}
915 
916 	found_filename = find_trace_file(filename);
917 	if (!found_filename) {
918 		fprintf(stderr, "Unable to find trace file %s\n", filename);
919 		goto fail;
920 	}
921 	filename = found_filename;
922 
923 	fd = open(filename, O_RDONLY);
924 	if (fd < 0) {
925 		fprintf(stderr, "Unable to open trace file %s err %s\n", filename, strerror(errno));
926 		goto fail;
927 	}
928 	ret = fstat(fd, &st);
929 	if (ret < 0) {
930 		fprintf(stderr, "stat failed on %s err %s\n", filename, strerror(errno));
931 		goto fail_fd;
932 	}
933 	p = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
934 	if (p == MAP_FAILED) {
935 		fprintf(stderr, "Unable to mmap trace file %s, err %s\n", filename, strerror(errno));
936 		goto fail_fd;
937 	}
938 	trace->fd = fd;
939 	trace->len = st.st_size;
940 	trace->start = p;
941 	trace->cur = p;
942 	trace->io = (struct blk_io_trace *)p;
943 	return trace;
944 
945 fail_fd:
946 	close(fd);
947 fail:
948 	free(trace);
949 	return NULL;
950 }
tput_event(struct trace * trace)951 static inline int tput_event(struct trace *trace)
952 {
953 	if (trace->found_completion)
954 		return __BLK_TA_COMPLETE;
955 	if (trace->found_issue)
956 		return __BLK_TA_ISSUE;
957 	if (trace->found_queue)
958 		return __BLK_TA_QUEUE;
959 
960 	return __BLK_TA_COMPLETE;
961 }
962 
action_char_to_num(char action)963 int action_char_to_num(char action)
964 {
965 	switch (action) {
966 	case 'Q':
967 		return __BLK_TA_QUEUE;
968 	case 'D':
969 		return __BLK_TA_ISSUE;
970 	case 'C':
971 		return __BLK_TA_COMPLETE;
972 	}
973 	return -1;
974 }
975 
io_event(struct trace * trace)976 static inline int io_event(struct trace *trace)
977 {
978 	if (plot_io_action)
979 		return plot_io_action;
980 	if (trace->found_queue)
981 		return __BLK_TA_QUEUE;
982 	if (trace->found_issue)
983 		return __BLK_TA_ISSUE;
984 	if (trace->found_completion)
985 		return __BLK_TA_COMPLETE;
986 
987 	return __BLK_TA_COMPLETE;
988 }
989 
add_tput(struct trace * trace,struct graph_line_data * writes_gld,struct graph_line_data * reads_gld)990 void add_tput(struct trace *trace, struct graph_line_data *writes_gld,
991 	      struct graph_line_data *reads_gld)
992 {
993 	struct blk_io_trace *io = trace->io;
994 	struct graph_line_data *gld;
995 	int action = io->action & BLK_TA_MASK;
996 	int seconds;
997 
998 	if (io->action & BLK_TC_ACT(BLK_TC_NOTIFY))
999 		return;
1000 
1001 	if (action != tput_event(trace))
1002 		return;
1003 
1004 	if (BLK_DATADIR(io->action) & BLK_TC_READ)
1005 		gld = reads_gld;
1006 	else
1007 		gld = writes_gld;
1008 
1009 	seconds = SECONDS(io->time);
1010 	gld->data[seconds].sum += io->bytes;
1011 
1012 	gld->data[seconds].count = 1;
1013 	if (gld->data[seconds].sum > gld->max)
1014 		gld->max = gld->data[seconds].sum;
1015 }
1016 
1017 #define GDD_PTR_ALLOC_STEP 16
1018 
get_pid_map(struct trace_file * tf,u32 pid)1019 static struct pid_map *get_pid_map(struct trace_file *tf, u32 pid)
1020 {
1021 	struct pid_map *pm;
1022 
1023 	if (!io_per_process) {
1024 		if (!tf->io_plots)
1025 			tf->io_plots = 1;
1026 		return NULL;
1027 	}
1028 
1029 	pm = process_hash_insert(pid, NULL);
1030 	/* New entry? */
1031 	if (!pm->index) {
1032 		if (tf->io_plots == tf->io_plots_allocated) {
1033 			tf->io_plots_allocated += GDD_PTR_ALLOC_STEP;
1034 			tf->gdd_reads = realloc(tf->gdd_reads, tf->io_plots_allocated * sizeof(struct graph_dot_data *));
1035 			if (!tf->gdd_reads)
1036 				abort();
1037 			tf->gdd_writes = realloc(tf->gdd_writes, tf->io_plots_allocated * sizeof(struct graph_dot_data *));
1038 			if (!tf->gdd_writes)
1039 				abort();
1040 			memset(tf->gdd_reads + tf->io_plots_allocated - GDD_PTR_ALLOC_STEP,
1041 			       0, GDD_PTR_ALLOC_STEP * sizeof(struct graph_dot_data *));
1042 			memset(tf->gdd_writes + tf->io_plots_allocated - GDD_PTR_ALLOC_STEP,
1043 			       0, GDD_PTR_ALLOC_STEP * sizeof(struct graph_dot_data *));
1044 		}
1045 		pm->index = tf->io_plots++;
1046 
1047 		return pm;
1048 	}
1049 	return pm;
1050 }
1051 
add_io(struct trace * trace,struct trace_file * tf)1052 void add_io(struct trace *trace, struct trace_file *tf)
1053 {
1054 	struct blk_io_trace *io = trace->io;
1055 	int action = io->action & BLK_TA_MASK;
1056 	u64 offset;
1057 	int index;
1058 	char *label;
1059 	struct pid_map *pm;
1060 
1061 	if (io->action & BLK_TC_ACT(BLK_TC_NOTIFY))
1062 		return;
1063 
1064 	if (action != io_event(trace))
1065 		return;
1066 
1067 	offset = map_io(trace, io);
1068 
1069 	pm = get_pid_map(tf, io->pid);
1070 	if (!pm) {
1071 		index = 0;
1072 		label = "";
1073 	} else {
1074 		index = pm->index;
1075 		label = pm->name;
1076 	}
1077 	if (BLK_DATADIR(io->action) & BLK_TC_READ) {
1078 		if (!tf->gdd_reads[index])
1079 			tf->gdd_reads[index] = alloc_dot_data(tf->min_seconds, tf->max_seconds, tf->min_offset, tf->max_offset, tf->stop_seconds, pick_color(), strdup(label));
1080 		set_gdd_bit(tf->gdd_reads[index], offset, io->bytes, io->time);
1081 	} else if (BLK_DATADIR(io->action) & BLK_TC_WRITE) {
1082 		if (!tf->gdd_writes[index])
1083 			tf->gdd_writes[index] = alloc_dot_data(tf->min_seconds, tf->max_seconds, tf->min_offset, tf->max_offset, tf->stop_seconds, pick_color(), strdup(label));
1084 		set_gdd_bit(tf->gdd_writes[index], offset, io->bytes, io->time);
1085 	}
1086 }
1087 
add_pending_io(struct trace * trace,struct graph_line_data * gld)1088 void add_pending_io(struct trace *trace, struct graph_line_data *gld)
1089 {
1090 	unsigned int seconds;
1091 	struct blk_io_trace *io = trace->io;
1092 	int action = io->action & BLK_TA_MASK;
1093 	double avg;
1094 	struct pending_io *pio;
1095 
1096 	if (io->action & BLK_TC_ACT(BLK_TC_NOTIFY))
1097 		return;
1098 
1099 	if (action == __BLK_TA_QUEUE) {
1100 		if (io->sector == 0)
1101 			return;
1102 		if (trace->found_issue || trace->found_completion) {
1103 			pio = hash_queued_io(trace->io);
1104 			/*
1105 			 * When there are no ISSUE events count depth and
1106 			 * latency at least from queue events
1107 			 */
1108 			if (pio && !trace->found_issue) {
1109 				pio->dispatch_time = io->time;
1110 				goto account_io;
1111 			}
1112 		}
1113 		return;
1114 	}
1115 	if (action == __BLK_TA_REQUEUE) {
1116 		if (ios_in_flight > 0)
1117 			ios_in_flight--;
1118 		return;
1119 	}
1120 	if (action != __BLK_TA_ISSUE)
1121 		return;
1122 
1123 	pio = hash_dispatched_io(trace->io);
1124 	if (!pio)
1125 		return;
1126 
1127 	if (!trace->found_completion) {
1128 		list_del(&pio->hash_list);
1129 		free(pio);
1130 	}
1131 
1132 account_io:
1133 	ios_in_flight++;
1134 
1135 	seconds = SECONDS(io->time);
1136 	gld->data[seconds].sum += ios_in_flight;
1137 	gld->data[seconds].count++;
1138 
1139 	avg = (double)gld->data[seconds].sum / gld->data[seconds].count;
1140 	if (gld->max < (u64)avg) {
1141 		gld->max = avg;
1142 	}
1143 }
1144 
add_completed_io(struct trace * trace,struct graph_line_data * latency_gld)1145 void add_completed_io(struct trace *trace,
1146 		      struct graph_line_data *latency_gld)
1147 {
1148 	struct blk_io_trace *io = trace->io;
1149 	int seconds;
1150 	int action = io->action & BLK_TA_MASK;
1151 	struct pending_io *pio;
1152 	double avg;
1153 	u64 latency;
1154 
1155 	if (io->action & BLK_TC_ACT(BLK_TC_NOTIFY))
1156 		return;
1157 
1158 	if (action != __BLK_TA_COMPLETE)
1159 		return;
1160 
1161 	seconds = SECONDS(io->time);
1162 
1163 	pio = hash_completed_io(trace->io);
1164 	if (!pio)
1165 		return;
1166 
1167 	if (ios_in_flight > 0)
1168 		ios_in_flight--;
1169 	if (io->time >= pio->dispatch_time) {
1170 		latency = io->time - pio->dispatch_time;
1171 		latency_gld->data[seconds].sum += latency;
1172 		latency_gld->data[seconds].count++;
1173 	}
1174 
1175 	list_del(&pio->hash_list);
1176 	free(pio);
1177 
1178 	avg = (double)latency_gld->data[seconds].sum /
1179 		latency_gld->data[seconds].count;
1180 	if (latency_gld->max < (u64)avg) {
1181 		latency_gld->max = avg;
1182 	}
1183 }
1184 
add_iop(struct trace * trace,struct graph_line_data * gld)1185 void add_iop(struct trace *trace, struct graph_line_data *gld)
1186 {
1187 	struct blk_io_trace *io = trace->io;
1188 	int action = io->action & BLK_TA_MASK;
1189 	int seconds;
1190 
1191 	if (io->action & BLK_TC_ACT(BLK_TC_NOTIFY))
1192 		return;
1193 
1194 	/* iops and tput use the same events */
1195 	if (action != tput_event(trace))
1196 		return;
1197 
1198 	seconds = SECONDS(io->time);
1199 	gld->data[seconds].sum += 1;
1200 	gld->data[seconds].count = 1;
1201 	if (gld->data[seconds].sum > gld->max)
1202 		gld->max = gld->data[seconds].sum;
1203 }
1204 
check_record(struct trace * trace)1205 void check_record(struct trace *trace)
1206 {
1207 	handle_notify(trace);
1208 }
1209