1 /*
2  * block queue tracing application
3  *
4  * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
5  * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
6  *
7  * Rewrite to have a single thread per CPU (managing all devices on that CPU)
8  *	Alan D. Brunelle <alan.brunelle@hp.com> - January 2009
9  *
10  *  This program is free software; you can redistribute it and/or modify
11  *  it under the terms of the GNU General Public License as published by
12  *  the Free Software Foundation; either version 2 of the License, or
13  *  (at your option) any later version.
14  *
15  *  This program is distributed in the hope that it will be useful,
16  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
17  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  *  GNU General Public License for more details.
19  *
20  *  You should have received a copy of the GNU General Public License
21  *  along with this program; if not, write to the Free Software
22  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
23  *
24  */
25 
26 #include <errno.h>
27 #include <stdarg.h>
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <fcntl.h>
32 #include <getopt.h>
33 #include <sched.h>
34 #include <unistd.h>
35 #include <poll.h>
36 #include <signal.h>
37 #include <pthread.h>
38 #include <locale.h>
39 #include <sys/ioctl.h>
40 #include <sys/types.h>
41 #include <sys/stat.h>
42 #include <sys/vfs.h>
43 #include <sys/mman.h>
44 #include <sys/param.h>
45 #include <sys/time.h>
46 #include <sys/resource.h>
47 #include <sys/socket.h>
48 #include <netinet/in.h>
49 #include <arpa/inet.h>
50 #include <netdb.h>
51 #include <sys/sendfile.h>
52 
53 #include "btt/list.h"
54 #include "blktrace.h"
55 
56 /*
57  * You may want to increase this even more, if you are logging at a high
58  * rate and see skipped/missed events
59  */
60 #define BUF_SIZE		(512 * 1024)
61 #define BUF_NR			(4)
62 
63 #define FILE_VBUF_SIZE		(128 * 1024)
64 
65 #define DEBUGFS_TYPE		(0x64626720)
66 #define TRACE_NET_PORT		(8462)
67 
68 enum {
69 	Net_none = 0,
70 	Net_server,
71 	Net_client,
72 };
73 
74 enum thread_status {
75 	Th_running,
76 	Th_leaving,
77 	Th_error
78 };
79 
80 /*
81  * Generic stats collected: nevents can be _roughly_ estimated by data_read
82  * (discounting pdu...)
83  *
84  * These fields are updated w/ pdc_dr_update & pdc_nev_update below.
85  */
86 struct pdc_stats {
87 	unsigned long long data_read;
88 	unsigned long long nevents;
89 };
90 
91 struct devpath {
92 	struct list_head head;
93 	char *path;			/* path to device special file */
94 	char *buts_name;		/* name returned from bt kernel code */
95 	struct pdc_stats *stats;
96 	int fd, idx, ncpus;
97 	unsigned long long drops;
98 
99 	/*
100 	 * For piped output (and network client "send" mode) only:
101 	 *
102 	 * Each tracer will have a tracer_devpath_head that it will add new
103 	 * data onto. Its list is protected by that head's mutex (see
104 	 * tracer_devpath_head below), and the tracers signal the processing
105 	 * thread using the dp_cond, dp_mutex & dp_entries variables below.
106 	 */
107 	struct tracer_devpath_head *heads;
108 
109 	/*
110 	 * For network server mode only:
111 	 */
112 	struct cl_host *ch;
113 	u32 cl_id;
114 	time_t cl_connect_time;
115 	struct io_info *ios;
116 };
117 
118 /*
119  * For piped output to stdout we will have each tracer thread (one per CPU)
120  * tack buffers read from the relay queues onto per-device, per-CPU lists.
121  *
122  * The main thread will then collect trace buffers from each of these lists
123  * in turn.
124  *
125  * We will use a mutex to guard each trace_buf list. The tracers can then
126  * signal the main thread using <dp_cond,dp_mutex> and dp_entries. (When
127  * dp_entries is 0 and a tracer adds an entry, the tracer signals; while
128  * dp_entries is 0, the main thread waits for that condition to be signalled.)
129  *
130  * adb: It may be better just to have a large buffer per tracer per dev,
131  * and then use it as a ring-buffer. This would certainly cut down a lot
132  * of malloc/free thrashing, at the cost of more memory movements (potentially).
133  */
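/*
 * A minimal sketch (illustrative only, not part of the build) of the
 * producer side described above. For each ready device, a tracer
 * thread does roughly:
 *
 *	tbp->len = read(iop->ifd, tbp->buf, buf_size);
 *	add_trace_buf(iop->dpp, tp->cpu, &tbp);	// queue buf, get a fresh one
 *	incr_entries(1);			// wake the main thread
 *
 * while the main thread sits in process_trace_bufs(), draining each
 * per-device/per-CPU list through the handle_list hook.
 */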
134 struct trace_buf {
135 	struct list_head head;
136 	struct devpath *dpp;
137 	void *buf;
138 	int cpu, len;
139 };
140 
141 struct tracer_devpath_head {
142 	pthread_mutex_t mutex;
143 	struct list_head head;
144 	struct trace_buf *prev;
145 };
146 
147 /*
148  * Used to handle the mmap() interfaces for output file (containing traces)
149  */
150 struct mmap_info {
151 	void *fs_buf;
152 	unsigned long long fs_size, fs_max_size, fs_off, fs_buf_len;
153 	unsigned long buf_size, buf_nr;
154 	int pagesize;
155 };
156 
157 /*
158  * Each thread doing work on a (client) side of blktrace will have one
159  * of these. The ios array contains input/output information, pfds holds
160  * poll() data. The volatile's provide flags to/from the main executing
161  * thread.
162  */
163 struct tracer {
164 	struct list_head head;
165 	struct io_info *ios;
166 	struct pollfd *pfds;
167 	pthread_t thread;
168 	int cpu, nios;
169 	volatile int status, is_done;
170 };
171 
172 /*
173  * Networking stuff follows. We include a magic number so we know whether
174  * endianness conversion is needed or not.
175  *
176  * The len field is overloaded:
177  *	0 - Indicates an "open" - allowing the server to set up for a dev/cpu
178  *	1 - Indicates a "close" - shut down the connection in an orderly fashion
179  *
180  * The cpu field is overloaded on close: it will contain the number of drops.
181  */
182 struct blktrace_net_hdr {
183 	u32 magic;		/* same as trace magic */
184 	char buts_name[32];	/* trace name */
185 	u32 cpu;		/* for which cpu */
186 	u32 max_cpus;
187 	u32 len;		/* length of following trace data */
188 	u32 cl_id;		/* id for set of client per-cpu connections */
189 	u32 buf_size;		/* client buf_size for this trace  */
190 	u32 buf_nr;		/* client buf_nr for this trace  */
191 	u32 page_size;		/* client page_size for this trace  */
192 };
193 
194 /*
195  * Each host encountered has one of these. The head is used to link this
196  * on to the network server's ch_list. Connections associated with this
197  * host are linked on conn_list, and any devices traced on that host
198  * are connected on the devpaths list.
199  */
200 struct cl_host {
201 	struct list_head head;
202 	struct list_head conn_list;
203 	struct list_head devpaths;
204 	struct net_server_s *ns;
205 	char *hostname;
206 	struct in_addr cl_in_addr;
207 	int connects, ndevs, cl_opens;
208 };
209 
210 /*
211  * Each connection (client-to-server socket, 'fd') has one of these. A
212  * back reference to the host ('ch') and list heads (for the host's
213  * conn_list and the network server's conn_list) are also included.
214  */
215 struct cl_conn {
216 	struct list_head ch_head, ns_head;
217 	struct cl_host *ch;
218 	int fd, ncpus;
219 	time_t connect_time;
220 };
221 
222 /*
223  * The network server requires some poll structures to be maintained -
224  * one per connection currently on conn_list. The nchs/ch_list values
225  * are for each host connected to this server. The addr field is used
226  * for scratch as new connections are established.
227  */
228 struct net_server_s {
229 	struct list_head conn_list;
230 	struct list_head ch_list;
231 	struct pollfd *pfds;
232 	int listen_fd, connects, nchs;
233 	struct sockaddr_in addr;
234 };
235 
236 /*
237  * This structure is (generically) used to provide information
238  * for a read-to-write set of values.
239  *
240  * ifn & ifd represent input information
241  *
242  * ofn, ofd, ofp, obuf & mmap_info are used for the (optional) output file.
243  */
244 struct io_info {
245 	struct devpath *dpp;
246 	FILE *ofp;
247 	char *obuf;
248 	struct cl_conn *nc;	/* Server network connection */
249 
250 	/*
251 	 * mmap controlled output files
252 	 */
253 	struct mmap_info mmap_info;
254 
255 	/*
256 	 * Client network fields
257 	 */
258 	unsigned int ready;
259 	unsigned long long data_queued;
260 
261 	/*
262 	 * Input/output file descriptors & names
263 	 */
264 	int ifd, ofd;
265 	char ifn[MAXPATHLEN + 64];
266 	char ofn[MAXPATHLEN + 64];
267 };
268 
269 static char blktrace_version[] = "2.0.0";
270 
271 /*
272  * Linkage to blktrace helper routines (trace conversions)
273  */
274 int data_is_native = -1;
275 
276 static int ndevs;
277 static int ncpus;
278 static int pagesize;
279 static int act_mask = ~0U;
280 static int kill_running_trace;
281 static int stop_watch;
282 static int piped_output;
283 
284 static char *debugfs_path = "/sys/kernel/debug";
285 static char *output_name;
286 static char *output_dir;
287 
288 static unsigned long buf_size = BUF_SIZE;
289 static unsigned long buf_nr = BUF_NR;
290 
291 static FILE *pfp;
292 
293 static LIST_HEAD(devpaths);
294 static LIST_HEAD(tracers);
295 
296 static volatile int done;
297 
298 /*
299  * tracer threads add entries, the main thread takes them off and processes
300  * them. These protect the dp_entries variable.
301  */
302 static pthread_cond_t dp_cond = PTHREAD_COND_INITIALIZER;
303 static pthread_mutex_t dp_mutex = PTHREAD_MUTEX_INITIALIZER;
304 static volatile int dp_entries;
305 
306 /*
307  * These synchronize master / thread interactions.
308  */
309 static pthread_cond_t mt_cond = PTHREAD_COND_INITIALIZER;
310 static pthread_mutex_t mt_mutex = PTHREAD_MUTEX_INITIALIZER;
311 static volatile int nthreads_running;
312 static volatile int nthreads_leaving;
313 static volatile int nthreads_error;
314 static volatile int tracers_run;
315 
316 /*
317  * network cmd line params
318  */
319 static struct sockaddr_in hostname_addr;
320 static char hostname[MAXHOSTNAMELEN];
321 static int net_port = TRACE_NET_PORT;
322 static int net_use_sendfile = 1;
323 static int net_mode;
324 static int *cl_fds;
325 
326 static int (*handle_pfds)(struct tracer *, int, int);
327 static int (*handle_list)(struct tracer_devpath_head *, struct list_head *);
328 
329 #define S_OPTS	"d:a:A:r:o:kw:vVb:n:D:lh:p:sI:"
330 static struct option l_opts[] = {
331 	{
332 		.name = "dev",
333 		.has_arg = required_argument,
334 		.flag = NULL,
335 		.val = 'd'
336 	},
337 	{
338 		.name = "input-devs",
339 		.has_arg = required_argument,
340 		.flag = NULL,
341 		.val = 'I'
342 	},
343 	{
344 		.name = "act-mask",
345 		.has_arg = required_argument,
346 		.flag = NULL,
347 		.val = 'a'
348 	},
349 	{
350 		.name = "set-mask",
351 		.has_arg = required_argument,
352 		.flag = NULL,
353 		.val = 'A'
354 	},
355 	{
356 		.name = "relay",
357 		.has_arg = required_argument,
358 		.flag = NULL,
359 		.val = 'r'
360 	},
361 	{
362 		.name = "output",
363 		.has_arg = required_argument,
364 		.flag = NULL,
365 		.val = 'o'
366 	},
367 	{
368 		.name = "kill",
369 		.has_arg = no_argument,
370 		.flag = NULL,
371 		.val = 'k'
372 	},
373 	{
374 		.name = "stopwatch",
375 		.has_arg = required_argument,
376 		.flag = NULL,
377 		.val = 'w'
378 	},
379 	{
380 		.name = "version",
381 		.has_arg = no_argument,
382 		.flag = NULL,
383 		.val = 'v'
384 	},
385 	{
386 		.name = "version",
387 		.has_arg = no_argument,
388 		.flag = NULL,
389 		.val = 'V'
390 	},
391 	{
392 		.name = "buffer-size",
393 		.has_arg = required_argument,
394 		.flag = NULL,
395 		.val = 'b'
396 	},
397 	{
398 		.name = "num-sub-buffers",
399 		.has_arg = required_argument,
400 		.flag = NULL,
401 		.val = 'n'
402 	},
403 	{
404 		.name = "output-dir",
405 		.has_arg = required_argument,
406 		.flag = NULL,
407 		.val = 'D'
408 	},
409 	{
410 		.name = "listen",
411 		.has_arg = no_argument,
412 		.flag = NULL,
413 		.val = 'l'
414 	},
415 	{
416 		.name = "host",
417 		.has_arg = required_argument,
418 		.flag = NULL,
419 		.val = 'h'
420 	},
421 	{
422 		.name = "port",
423 		.has_arg = required_argument,
424 		.flag = NULL,
425 		.val = 'p'
426 	},
427 	{
428 		.name = "no-sendfile",
429 		.has_arg = no_argument,
430 		.flag = NULL,
431 		.val = 's'
432 	},
433 	{
434 		.name = NULL,
435 	}
436 };
437 
438 static char usage_str[] = \
439 	"-d <dev> [ -r debugfs path ] [ -o <output> ] [-k ] [ -w time ]\n" \
440 	"[ -a action ] [ -A action mask ] [ -I <devs file> ] [ -v ]\n\n" \
441 	"\t-d Use specified device. May also be given last after options\n" \
442 	"\t-r Path to mounted debugfs, defaults to /sys/kernel/debug\n" \
443 	"\t-o File(s) to send output to\n" \
444 	"\t-D Directory to prepend to output file names\n" \
445 	"\t-k Kill a running trace\n" \
446 	"\t-w Stop after defined time, in seconds\n" \
447 	"\t-a Only trace specified actions. See documentation\n" \
448 	"\t-A Give trace mask as a single value. See documentation\n" \
449 	"\t-b Sub buffer size in KiB\n" \
450 	"\t-n Number of sub buffers\n" \
451 	"\t-l Run in network listen mode (blktrace server)\n" \
452 	"\t-h Run in network client mode, connecting to the given host\n" \
453 	"\t-p Network port to use (default 8462)\n" \
454 	"\t-s Make the network client NOT use sendfile() to transfer data\n" \
455 	"\t-I Add devices found in <devs file>\n" \
456 	"\t-V Print program version info\n\n";
457 
458 static void clear_events(struct pollfd *pfd)
459 {
460 	pfd->events = 0;
461 	pfd->revents = 0;
462 }
463 
464 static inline int net_client_use_sendfile(void)
465 {
466 	return net_mode == Net_client && net_use_sendfile;
467 }
468 
469 static inline int net_client_use_send(void)
470 {
471 	return net_mode == Net_client && !net_use_sendfile;
472 }
473 
474 static inline int use_tracer_devpaths(void)
475 {
476 	return piped_output || net_client_use_send();
477 }
478 
479 static inline int in_addr_eq(struct in_addr a, struct in_addr b)
480 {
481 	return a.s_addr == b.s_addr;
482 }
483 
484 static inline void pdc_dr_update(struct devpath *dpp, int cpu, int data_read)
485 {
486 	dpp->stats[cpu].data_read += data_read;
487 }
488 
489 static inline void pdc_nev_update(struct devpath *dpp, int cpu, int nevents)
490 {
491 	dpp->stats[cpu].nevents += nevents;
492 }
493 
494 static void show_usage(char *prog)
495 {
496 	fprintf(stderr, "Usage: %s %s %s", prog, blktrace_version, usage_str);
497 }
498 
499 /*
500  * Create a timespec 'msec' milliseconds into the future
501  */
502 static inline void make_timespec(struct timespec *tsp, long delta_msec)
503 {
504 	struct timeval now;
505 
506 	gettimeofday(&now, NULL);
507 	tsp->tv_sec = now.tv_sec;
508 	tsp->tv_nsec = 1000L * now.tv_usec;
509 
510 	tsp->tv_nsec += (delta_msec * 1000000L);
511 	if (tsp->tv_nsec > 1000000000L) {
512 		long secs = tsp->tv_nsec / 1000000000L;
513 
514 		tsp->tv_sec += secs;
515 		tsp->tv_nsec -= (secs * 1000000000L);
516 	}
517 }
518 
519 /*
520  * Add a timer to ensure wait ends
521  */
522 static void t_pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex)
523 {
524 	struct timespec ts;
525 
526 	make_timespec(&ts, 50);
527 	pthread_cond_timedwait(cond, mutex, &ts);
528 }
529 
530 static void unblock_tracers(void)
531 {
532 	pthread_mutex_lock(&mt_mutex);
533 	tracers_run = 1;
534 	pthread_cond_broadcast(&mt_cond);
535 	pthread_mutex_unlock(&mt_mutex);
536 }
537 
538 static void tracer_wait_unblock(struct tracer *tp)
539 {
540 	pthread_mutex_lock(&mt_mutex);
541 	while (!tp->is_done && !tracers_run)
542 		pthread_cond_wait(&mt_cond, &mt_mutex);
543 	pthread_mutex_unlock(&mt_mutex);
544 }
545 
546 static void tracer_signal_ready(struct tracer *tp,
547 				enum thread_status th_status,
548 				int status)
549 {
550 	pthread_mutex_lock(&mt_mutex);
551 	tp->status = status;
552 
553 	if (th_status == Th_running)
554 		nthreads_running++;
555 	else if (th_status == Th_error)
556 		nthreads_error++;
557 	else
558 		nthreads_leaving++;
559 
560 	pthread_cond_signal(&mt_cond);
561 	pthread_mutex_unlock(&mt_mutex);
562 }
563 
564 static void wait_tracers_ready(int ncpus_started)
565 {
566 	pthread_mutex_lock(&mt_mutex);
567 	while ((nthreads_running + nthreads_error) < ncpus_started)
568 		t_pthread_cond_wait(&mt_cond, &mt_mutex);
569 	pthread_mutex_unlock(&mt_mutex);
570 }
571 
572 static void wait_tracers_leaving(void)
573 {
574 	pthread_mutex_lock(&mt_mutex);
575 	while (nthreads_leaving < nthreads_running)
576 		t_pthread_cond_wait(&mt_cond, &mt_mutex);
577 	pthread_mutex_unlock(&mt_mutex);
578 }
579 
580 static void init_mmap_info(struct mmap_info *mip)
581 {
582 	mip->buf_size = buf_size;
583 	mip->buf_nr = buf_nr;
584 	mip->pagesize = pagesize;
585 }
586 
587 static void net_close_connection(int *fd)
588 {
589 	shutdown(*fd, SHUT_RDWR);
590 	close(*fd);
591 	*fd = -1;
592 }
593 
594 static void dpp_free(struct devpath *dpp)
595 {
596 	if (dpp->stats)
597 		free(dpp->stats);
598 	if (dpp->ios)
599 		free(dpp->ios);
600 	if (dpp->path)
601 		free(dpp->path);
602 	if (dpp->buts_name)
603 		free(dpp->buts_name);
604 	free(dpp);
605 }
606 
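/*
 * Pin the calling thread to the given CPU (via sched_setaffinity(2))
 * so that each tracer only reads the relay files of its own CPU. On
 * Android builds this is a no-op.
 */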
607 static int lock_on_cpu(int cpu)
608 {
609 #ifndef _ANDROID_
610 	cpu_set_t cpu_mask;
611 
612 	CPU_ZERO(&cpu_mask);
613 	CPU_SET(cpu, &cpu_mask);
614 	if (sched_setaffinity(0, sizeof(cpu_mask), &cpu_mask) < 0)
615 		return errno;
616 #endif
617 
618 	return 0;
619 }
620 
621 #ifndef _ANDROID_
622 static int increase_limit(int resource, rlim_t increase)
623 {
624 	struct rlimit rlim;
625 	int save_errno = errno;
626 
627 	if (!getrlimit(resource, &rlim)) {
628 		rlim.rlim_cur += increase;
629 		if (rlim.rlim_cur >= rlim.rlim_max)
630 			rlim.rlim_max = rlim.rlim_cur + increase;
631 
632 		if (!setrlimit(resource, &rlim))
633 			return 1;
634 	}
635 
636 	errno = save_errno;
637 	return 0;
638 }
639 #endif
640 
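/*
 * The my_*() wrappers below retry their call whenever the failure
 * looks like a resource limit we may be able to raise: number of open
 * files for open/fopen/socket/accept, locked memory for mmap/mlock.
 * On non-Android builds the handlers bump the relevant rlimit and
 * return non-zero if a retry is worthwhile.
 */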
641 static int handle_open_failure(void)
642 {
643 	if (errno == ENFILE || errno == EMFILE)
644 #ifndef _ANDROID_
645 		return increase_limit(RLIMIT_NOFILE, 16);
646 #else
647 		return -ENOSYS;
648 #endif
649 	return 0;
650 }
651 
652 static int handle_mem_failure(size_t length)
653 {
654 	if (errno == ENFILE)
655 		return handle_open_failure();
656 	else if (errno == ENOMEM)
657 #ifndef _ANDROID_
658 		return increase_limit(RLIMIT_MEMLOCK, 2 * length);
659 #else
660 		return -ENOSYS;
661 #endif
662 	return 0;
663 }
664 
665 static FILE *my_fopen(const char *path, const char *mode)
666 {
667 	FILE *fp;
668 
669 	do {
670 		fp = fopen(path, mode);
671 	} while (fp == NULL && handle_open_failure());
672 
673 	return fp;
674 }
675 
676 static int my_open(const char *path, int flags)
677 {
678 	int fd;
679 
680 	do {
681 		fd = open(path, flags);
682 	} while (fd < 0 && handle_open_failure());
683 
684 	return fd;
685 }
686 
687 static int my_socket(int domain, int type, int protocol)
688 {
689 	int fd;
690 
691 	do {
692 		fd = socket(domain, type, protocol);
693 	} while (fd < 0 && handle_open_failure());
694 
695 	return fd;
696 }
697 
698 static int my_accept(int sockfd, struct sockaddr *addr, socklen_t *addrlen)
699 {
700 	int fd;
701 
702 	do {
703 		fd = accept(sockfd, addr, addrlen);
704 	} while (fd < 0 && handle_open_failure());
705 
706 	return fd;
707 }
708 
709 static void *my_mmap(void *addr, size_t length, int prot, int flags, int fd,
710 		     off_t offset)
711 {
712 	void *new;
713 
714 	do {
715 		new = mmap(addr, length, prot, flags, fd, offset);
716 	} while (new == MAP_FAILED && handle_mem_failure(length));
717 
718 	return new;
719 }
720 
721 static int my_mlock(const void *addr, size_t len)
722 {
723 	int ret;
724 
725 	do {
726 		ret = mlock(addr, len);
727 	} while (ret < 0 && handle_mem_failure(len));
728 
729 	return ret;
730 }
731 
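/*
 * Make sure the output file has a mapped window big enough for the
 * next read of up to 'maxlen' bytes. When the current window fills
 * up, unmap it, extend the file with ftruncate() and map a fresh
 * window starting at the (page-aligned) end of the data written so
 * far. fs_off is the write offset within the window; fs_size counts
 * the bytes actually written, which close_iop() uses to trim the
 * file back down at the end of the run.
 */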
732 static int setup_mmap(int fd, unsigned int maxlen, struct mmap_info *mip)
733 {
734 	if (mip->fs_off + maxlen > mip->fs_buf_len) {
735 		unsigned long nr = max(16, mip->buf_nr);
736 
737 		if (mip->fs_buf) {
738 			munlock(mip->fs_buf, mip->fs_buf_len);
739 			munmap(mip->fs_buf, mip->fs_buf_len);
740 			mip->fs_buf = NULL;
741 		}
742 
743 		mip->fs_off = mip->fs_size & (mip->pagesize - 1);
744 		mip->fs_buf_len = (nr * mip->buf_size) - mip->fs_off;
745 		mip->fs_max_size += mip->fs_buf_len;
746 
747 		if (ftruncate(fd, mip->fs_max_size) < 0) {
748 			perror("setup_mmap: ftruncate");
749 			return 1;
750 		}
751 
752 		mip->fs_buf = my_mmap(NULL, mip->fs_buf_len, PROT_WRITE,
753 				      MAP_SHARED, fd,
754 				      mip->fs_size - mip->fs_off);
755 		if (mip->fs_buf == MAP_FAILED) {
756 			perror("setup_mmap: mmap");
757 			return 1;
758 		}
759 		my_mlock(mip->fs_buf, mip->fs_buf_len);
760 	}
761 
762 	return 0;
763 }
764 
765 static int __stop_trace(int fd)
766 {
767 	/*
768 	 * Should be stopped, don't complain if it isn't
769 	 */
770 	ioctl(fd, BLKTRACESTOP);
771 	return ioctl(fd, BLKTRACETEARDOWN);
772 }
773 
774 static int write_data(char *buf, int len)
775 {
776 	int ret;
777 
778 rewrite:
779 	ret = fwrite(buf, len, 1, pfp);
780 	if (ferror(pfp) || ret != 1) {
781 		if (errno == EINTR) {
782 			clearerr(pfp);
783 			goto rewrite;
784 		}
785 
786 		if (!piped_output || (errno != EPIPE && errno != EBADF)) {
787 			fprintf(stderr, "write(%d) failed: %d/%s\n",
788 				len, errno, strerror(errno));
789 		}
790 		goto err;
791 	}
792 
793 	fflush(pfp);
794 	return 0;
795 
796 err:
797 	clearerr(pfp);
798 	return 1;
799 }
800 
801 /*
802  * Returns the number of bytes read (successfully)
803  */
804 static int __net_recv_data(int fd, void *buf, unsigned int len)
805 {
806 	unsigned int bytes_left = len;
807 
808 	while (bytes_left && !done) {
809 		int ret = recv(fd, buf, bytes_left, MSG_WAITALL);
810 
811 		if (ret == 0)
812 			break;
813 		else if (ret < 0) {
814 			if (errno == EAGAIN) {
815 				usleep(50);
816 				continue;
817 			}
818 			perror("server: net_recv_data: recv failed");
819 			break;
820 		} else {
821 			buf += ret;
822 			bytes_left -= ret;
823 		}
824 	}
825 
826 	return len - bytes_left;
827 }
828 
829 static int net_recv_data(int fd, void *buf, unsigned int len)
830 {
831 	return __net_recv_data(fd, buf, len);
832 }
833 
834 /*
835  * Returns number of bytes written
836  */
837 static int net_send_data(int fd, void *buf, unsigned int buf_len)
838 {
839 	int ret;
840 	unsigned int bytes_left = buf_len;
841 
842 	while (bytes_left) {
843 		ret = send(fd, buf, bytes_left, 0);
844 		if (ret < 0) {
845 			perror("send");
846 			break;
847 		}
848 
849 		buf += ret;
850 		bytes_left -= ret;
851 	}
852 
853 	return buf_len - bytes_left;
854 }
855 
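/*
 * Each chunk of trace data sent over the wire is preceded by one of
 * these headers. The overloaded 'len' values used for control
 * messages are: 0 = open, 1 = close (with the drop count overloading
 * 'cpu'), and 2 = the server's ack sent from ack_open_close().
 */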
856 static int net_send_header(int fd, int cpu, char *buts_name, int len)
857 {
858 	struct blktrace_net_hdr hdr;
859 
860 	memset(&hdr, 0, sizeof(hdr));
861 
862 	hdr.magic = BLK_IO_TRACE_MAGIC;
863 	strncpy(hdr.buts_name, buts_name, sizeof(hdr.buts_name));
864 	hdr.buts_name[sizeof(hdr.buts_name)-1] = '\0';
865 	hdr.cpu = cpu;
866 	hdr.max_cpus = ncpus;
867 	hdr.len = len;
868 	hdr.cl_id = getpid();
869 	hdr.buf_size = buf_size;
870 	hdr.buf_nr = buf_nr;
871 	hdr.page_size = pagesize;
872 
873 	return net_send_data(fd, &hdr, sizeof(hdr)) != sizeof(hdr);
874 }
875 
876 static void net_send_open_close(int fd, int cpu, char *buts_name, int len)
877 {
878 	struct blktrace_net_hdr ret_hdr;
879 
880 	net_send_header(fd, cpu, buts_name, len);
881 	net_recv_data(fd, &ret_hdr, sizeof(ret_hdr));
882 }
883 
884 static void net_send_open(int fd, int cpu, char *buts_name)
885 {
886 	net_send_open_close(fd, cpu, buts_name, 0);
887 }
888 
889 static void net_send_close(int fd, char *buts_name, int drops)
890 {
891 	/*
892 	 * Overload CPU w/ number of drops
893 	 *
894 	 * XXX: Need to clear/set done around call - done=1 (which
895 	 * is true here) stops reads from happening... :-(
896 	 */
897 	done = 0;
898 	net_send_open_close(fd, drops, buts_name, 1);
899 	done = 1;
900 }
901 
902 static void ack_open_close(int fd, char *buts_name)
903 {
904 	net_send_header(fd, 0, buts_name, 2);
905 }
906 
907 static void net_send_drops(int fd)
908 {
909 	struct list_head *p;
910 
911 	__list_for_each(p, &devpaths) {
912 		struct devpath *dpp = list_entry(p, struct devpath, head);
913 
914 		net_send_close(fd, dpp->buts_name, dpp->drops);
915 	}
916 }
917 
918 /*
919  * Returns:
920  *	 0: "EOF"
921  *	 1: OK
922  *	-1: Error
923  */
924 static int net_get_header(struct cl_conn *nc, struct blktrace_net_hdr *bnh)
925 {
926 	int bytes_read;
927 	int fl = fcntl(nc->fd, F_GETFL);
928 
929 	fcntl(nc->fd, F_SETFL, fl | O_NONBLOCK);
930 	bytes_read = __net_recv_data(nc->fd, bnh, sizeof(*bnh));
931 	fcntl(nc->fd, F_SETFL, fl & ~O_NONBLOCK);
932 
933 	if (bytes_read == sizeof(*bnh))
934 		return 1;
935 	else if (bytes_read == 0)
936 		return 0;
937 	else
938 		return -1;
939 }
940 
941 static int net_setup_addr(void)
942 {
943 	struct sockaddr_in *addr = &hostname_addr;
944 
945 	memset(addr, 0, sizeof(*addr));
946 	addr->sin_family = AF_INET;
947 	addr->sin_port = htons(net_port);
948 
949 	if (inet_aton(hostname, &addr->sin_addr) != 1) {
950 		struct hostent *hent;
951 retry:
952 		hent = gethostbyname(hostname);
953 		if (!hent) {
954 			if (h_errno == TRY_AGAIN) {
955 				usleep(100);
956 				goto retry;
957 			} else if (h_errno == NO_RECOVERY) {
958 				fprintf(stderr, "gethostbyname(%s): "
959 					"non-recoverable error encountered\n",
960 					hostname);
961 			} else {
962 				/*
963 				 * HOST_NOT_FOUND, NO_ADDRESS or NO_DATA
964 				 */
965 				fprintf(stderr, "Host %s not found\n",
966 					hostname);
967 			}
968 			return 1;
969 		}
970 
971 		memcpy(&addr->sin_addr, hent->h_addr, 4);
972 		strncpy(hostname, hent->h_name, sizeof(hostname) - 1);
973 	}
974 
975 	return 0;
976 }
977 
978 static int net_setup_client(void)
979 {
980 	int fd;
981 	struct sockaddr_in *addr = &hostname_addr;
982 
983 	fd = my_socket(AF_INET, SOCK_STREAM, 0);
984 	if (fd < 0) {
985 		perror("client: socket");
986 		return -1;
987 	}
988 
989 	if (connect(fd, (struct sockaddr *)addr, sizeof(*addr)) < 0) {
990 		if (errno == ECONNREFUSED)
991 			fprintf(stderr,
992 				"\nclient: Connection to %s refused, "
993 				"perhaps the server is not started?\n\n",
994 				hostname);
995 		else
996 			perror("client: connect");
997 
998 		close(fd);
999 		return -1;
1000 	}
1001 
1002 	return fd;
1003 }
1004 
1005 static int open_client_connections(void)
1006 {
1007 	int cpu;
1008 
1009 	cl_fds = calloc(ncpus, sizeof(*cl_fds));
1010 	for (cpu = 0; cpu < ncpus; cpu++) {
1011 		cl_fds[cpu] = net_setup_client();
1012 		if (cl_fds[cpu] < 0)
1013 			goto err;
1014 	}
1015 	return 0;
1016 
1017 err:
1018 	while (cpu > 0)
1019 		close(cl_fds[--cpu]);
1020 	free(cl_fds);
1021 	return 1;
1022 }
1023 
1024 static void close_client_connections(void)
1025 {
1026 	if (cl_fds) {
1027 		int cpu, *fdp;
1028 
1029 		for (cpu = 0, fdp = cl_fds; cpu < ncpus; cpu++, fdp++) {
1030 			if (*fdp >= 0) {
1031 				net_send_drops(*fdp);
1032 				net_close_connection(fdp);
1033 			}
1034 		}
1035 		free(cl_fds);
1036 	}
1037 }
1038 
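/*
 * Issue BLKTRACESETUP for every device on the run list. In essence
 * (sketch of the per-device call):
 *
 *	struct blk_user_trace_setup buts = { .buf_size = buf_size,
 *					     .buf_nr = buf_nr,
 *					     .act_mask = act_mask };
 *	ioctl(dpp->fd, BLKTRACESETUP, &buts);
 *
 * On success the kernel fills in buts.name, which names the
 * per-device directory under <debugfs>/block/ that holds the per-CPU
 * relay files read by the tracers.
 */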
1039 static void setup_buts(void)
1040 {
1041 	struct list_head *p;
1042 
1043 	__list_for_each(p, &devpaths) {
1044 		struct blk_user_trace_setup buts;
1045 		struct devpath *dpp = list_entry(p, struct devpath, head);
1046 
1047 		memset(&buts, 0, sizeof(buts));
1048 		buts.buf_size = buf_size;
1049 		buts.buf_nr = buf_nr;
1050 		buts.act_mask = act_mask;
1051 		if (ioctl(dpp->fd, BLKTRACESETUP, &buts) >= 0) {
1052 			dpp->ncpus = ncpus;
1053 			dpp->buts_name = strdup(buts.name);
1054 			if (dpp->stats)
1055 				free(dpp->stats);
1056 			dpp->stats = calloc(dpp->ncpus, sizeof(*dpp->stats));
1057 			memset(dpp->stats, 0, dpp->ncpus * sizeof(*dpp->stats));
1058 		} else
1059 			fprintf(stderr, "BLKTRACESETUP(2) %s failed: %d/%s\n",
1060 				dpp->path, errno, strerror(errno));
1061 	}
1062 }
1063 
1064 static void start_buts(void)
1065 {
1066 	struct list_head *p;
1067 
1068 	__list_for_each(p, &devpaths) {
1069 		struct devpath *dpp = list_entry(p, struct devpath, head);
1070 
1071 		if (ioctl(dpp->fd, BLKTRACESTART) < 0) {
1072 			fprintf(stderr, "BLKTRACESTART %s failed: %d/%s\n",
1073 				dpp->path, errno, strerror(errno));
1074 		}
1075 	}
1076 }
1077 
1078 static int get_drops(struct devpath *dpp)
1079 {
1080 	int fd, drops = 0;
1081 	char fn[MAXPATHLEN + 64], tmp[256];
1082 
1083 	snprintf(fn, sizeof(fn), "%s/block/%s/dropped", debugfs_path,
1084 		 dpp->buts_name);
1085 
1086 	fd = my_open(fn, O_RDONLY);
1087 	if (fd < 0) {
1088 		/*
1089 		 * This may be ok: the kernel may not support
1090 		 * dropped counts.
1091 		 */
1092 		if (errno != ENOENT)
1093 			fprintf(stderr, "Could not open %s: %d/%s\n",
1094 				fn, errno, strerror(errno));
1095 		return 0;
1096 	} else if (read(fd, tmp, sizeof(tmp)) < 0) {
1097 		fprintf(stderr, "Could not read %s: %d/%s\n",
1098 			fn, errno, strerror(errno));
1099 	} else
1100 		drops = atoi(tmp);
1101 	close(fd);
1102 
1103 	return drops;
1104 }
1105 
1106 static void get_all_drops(void)
1107 {
1108 	struct list_head *p;
1109 
1110 	__list_for_each(p, &devpaths) {
1111 		struct devpath *dpp = list_entry(p, struct devpath, head);
1112 
1113 		dpp->drops = get_drops(dpp);
1114 	}
1115 }
1116 
1117 static inline struct trace_buf *alloc_trace_buf(int cpu, int bufsize)
1118 {
1119 	struct trace_buf *tbp;
1120 
1121 	tbp = malloc(sizeof(*tbp) + bufsize);
1122 	INIT_LIST_HEAD(&tbp->head);
1123 	tbp->len = 0;
1124 	tbp->buf = (void *)(tbp + 1);
1125 	tbp->cpu = cpu;
1126 	tbp->dpp = NULL;	/* Will be set when tbp is added */
1127 
1128 	return tbp;
1129 }
1130 
1131 static void free_tracer_heads(struct devpath *dpp)
1132 {
1133 	int cpu;
1134 	struct tracer_devpath_head *hd;
1135 
1136 	for (cpu = 0, hd = dpp->heads; cpu < ncpus; cpu++, hd++) {
1137 		if (hd->prev)
1138 			free(hd->prev);
1139 
1140 		pthread_mutex_destroy(&hd->mutex);
1141 	}
1142 	free(dpp->heads);
1143 }
1144 
1145 static int setup_tracer_devpaths(void)
1146 {
1147 	struct list_head *p;
1148 
1149 	if (net_client_use_send())
1150 		if (open_client_connections())
1151 			return 1;
1152 
1153 	__list_for_each(p, &devpaths) {
1154 		int cpu;
1155 		struct tracer_devpath_head *hd;
1156 		struct devpath *dpp = list_entry(p, struct devpath, head);
1157 
1158 		dpp->heads = calloc(ncpus, sizeof(struct tracer_devpath_head));
1159 		for (cpu = 0, hd = dpp->heads; cpu < ncpus; cpu++, hd++) {
1160 			INIT_LIST_HEAD(&hd->head);
1161 			pthread_mutex_init(&hd->mutex, NULL);
1162 			hd->prev = NULL;
1163 		}
1164 	}
1165 
1166 	return 0;
1167 }
1168 
1169 static inline void add_trace_buf(struct devpath *dpp, int cpu,
1170 						struct trace_buf **tbpp)
1171 {
1172 	struct trace_buf *tbp = *tbpp;
1173 	struct tracer_devpath_head *hd = &dpp->heads[cpu];
1174 
1175 	tbp->dpp = dpp;
1176 
1177 	pthread_mutex_lock(&hd->mutex);
1178 	list_add_tail(&tbp->head, &hd->head);
1179 	pthread_mutex_unlock(&hd->mutex);
1180 
1181 	*tbpp = alloc_trace_buf(cpu, buf_size);
1182 }
1183 
1184 static inline void incr_entries(int entries_handled)
1185 {
1186 	pthread_mutex_lock(&dp_mutex);
1187 	if (dp_entries == 0)
1188 		pthread_cond_signal(&dp_cond);
1189 	dp_entries += entries_handled;
1190 	pthread_mutex_unlock(&dp_mutex);
1191 }
1192 
1193 static void decr_entries(int handled)
1194 {
1195 	pthread_mutex_lock(&dp_mutex);
1196 	dp_entries -= handled;
1197 	pthread_mutex_unlock(&dp_mutex);
1198 }
1199 
1200 static int wait_empty_entries(void)
1201 {
1202 	pthread_mutex_lock(&dp_mutex);
1203 	while (!done && dp_entries == 0)
1204 		t_pthread_cond_wait(&dp_cond, &dp_mutex);
1205 	pthread_mutex_unlock(&dp_mutex);
1206 
1207 	return !done;
1208 }
1209 
1210 static int add_devpath(char *path)
1211 {
1212 	int fd;
1213 	struct devpath *dpp;
1214 
1215 	/*
1216 	 * Verify device is valid before going too far
1217 	 */
1218 	fd = my_open(path, O_RDONLY | O_NONBLOCK);
1219 	if (fd < 0) {
1220 		fprintf(stderr, "Invalid path %s specified: %d/%s\n",
1221 			path, errno, strerror(errno));
1222 		return 1;
1223 	}
1224 
1225 	dpp = malloc(sizeof(*dpp));
1226 	memset(dpp, 0, sizeof(*dpp));
1227 	dpp->path = strdup(path);
1228 	dpp->fd = fd;
1229 	dpp->idx = ndevs++;
1230 	list_add_tail(&dpp->head, &devpaths);
1231 
1232 	return 0;
1233 }
1234 
1235 static void rel_devpaths(void)
1236 {
1237 	struct list_head *p, *q;
1238 
1239 	list_for_each_safe(p, q, &devpaths) {
1240 		struct devpath *dpp = list_entry(p, struct devpath, head);
1241 
1242 		list_del(&dpp->head);
1243 		__stop_trace(dpp->fd);
1244 		close(dpp->fd);
1245 
1246 		if (dpp->heads)
1247 			free_tracer_heads(dpp);
1248 
1249 		dpp_free(dpp);
1250 		ndevs--;
1251 	}
1252 }
1253 
1254 static int flush_subbuf_net(struct trace_buf *tbp)
1255 {
1256 	int fd = cl_fds[tbp->cpu];
1257 	struct devpath *dpp = tbp->dpp;
1258 
1259 	if (net_send_header(fd, tbp->cpu, dpp->buts_name, tbp->len))
1260 		return 1;
1261 	else if (net_send_data(fd, tbp->buf, tbp->len) != tbp->len)
1262 		return 1;
1263 
1264 	return 0;
1265 }
1266 
1267 static int
1268 handle_list_net(__attribute__((__unused__))struct tracer_devpath_head *hd,
1269 		struct list_head *list)
1270 {
1271 	struct trace_buf *tbp;
1272 	struct list_head *p, *q;
1273 	int entries_handled = 0;
1274 
1275 	list_for_each_safe(p, q, list) {
1276 		tbp = list_entry(p, struct trace_buf, head);
1277 
1278 		list_del(&tbp->head);
1279 		entries_handled++;
1280 
1281 		if (cl_fds[tbp->cpu] >= 0) {
1282 			if (flush_subbuf_net(tbp)) {
1283 				close(cl_fds[tbp->cpu]);
1284 				cl_fds[tbp->cpu] = -1;
1285 			}
1286 		}
1287 
1288 		free(tbp);
1289 	}
1290 
1291 	return entries_handled;
1292 }
1293 
1294 /*
1295  * Tack 'tbp's buf onto the tail of 'prev's buf
1296  */
1297 static struct trace_buf *tb_combine(struct trace_buf *prev,
1298 				    struct trace_buf *tbp)
1299 {
1300 	unsigned long tot_len;
1301 
1302 	tot_len = prev->len + tbp->len;
1303 	if (tot_len > buf_size) {
1304 		/*
1305 		 * prev->head isn't on any list (it was taken off
1306 		 * when it became the carried-over 'prev').
1307 		 * Therefore, we can realloc the whole structure,
1308 		 * as the other fields are "static"; buf is then
1309 		 * re-derived as the memory right after the header.
1310 		 */
1311 		prev = realloc(prev, sizeof(*prev) + tot_len);
1312 		prev->buf = (void *)(prev + 1);
1313 	}
1314 
1315 	memcpy(prev->buf + prev->len, tbp->buf, tbp->len);
1316 	prev->len = tot_len;
1317 
1318 	free(tbp);
1319 	return prev;
1320 }
1321 
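/*
 * Write out whole trace events only: each event is a struct
 * blk_io_trace followed by pdu_len bytes of payload. Whatever trails
 * the last complete event in a buffer is carried over in hd->prev
 * and glued (via tb_combine) onto the next buffer from the same CPU.
 */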
1322 static int handle_list_file(struct tracer_devpath_head *hd,
1323 			    struct list_head *list)
1324 {
1325 	int off, t_len, nevents;
1326 	struct blk_io_trace *t;
1327 	struct list_head *p, *q;
1328 	int entries_handled = 0;
1329 	struct trace_buf *tbp, *prev;
1330 
1331 	prev = hd->prev;
1332 	list_for_each_safe(p, q, list) {
1333 		tbp = list_entry(p, struct trace_buf, head);
1334 		list_del(&tbp->head);
1335 		entries_handled++;
1336 
1337 		/*
1338 		 * If there was some leftover before, tack this new
1339 		 * entry onto the tail of the previous one.
1340 		 */
1341 		if (prev)
1342 			tbp = tb_combine(prev, tbp);
1343 
1344 		/*
1345 		 * See how many whole traces there are - send them
1346 		 * all out in one go.
1347 		 */
1348 		off = 0;
1349 		nevents = 0;
1350 		while (off + (int)sizeof(*t) <= tbp->len) {
1351 			t = (struct blk_io_trace *)(tbp->buf + off);
1352 			t_len = sizeof(*t) + t->pdu_len;
1353 			if (off + t_len > tbp->len)
1354 				break;
1355 
1356 			off += t_len;
1357 			nevents++;
1358 		}
1359 		if (nevents)
1360 			pdc_nev_update(tbp->dpp, tbp->cpu, nevents);
1361 
1362 		/*
1363 		 * Write any full set of traces, any remaining data is kept
1364 		 * for the next pass.
1365 		 */
1366 		if (off) {
1367 			if (write_data(tbp->buf, off) || off == tbp->len) {
1368 				free(tbp);
1369 				prev = NULL;
1370 			}
1371 			else {
1372 				/*
1373 				 * Move valid data to beginning of buffer
1374 				 */
1375 				tbp->len -= off;
1376 				memmove(tbp->buf, tbp->buf + off, tbp->len);
1377 				prev = tbp;
1378 			}
1379 		} else
1380 			prev = tbp;
1381 	}
1382 	hd->prev = prev;
1383 
1384 	return entries_handled;
1385 }
1386 
1387 static void __process_trace_bufs(void)
1388 {
1389 	int cpu;
1390 	struct list_head *p;
1391 	struct list_head list;
1392 	int handled = 0;
1393 
1394 	__list_for_each(p, &devpaths) {
1395 		struct devpath *dpp = list_entry(p, struct devpath, head);
1396 		struct tracer_devpath_head *hd = dpp->heads;
1397 
1398 		for (cpu = 0; cpu < ncpus; cpu++, hd++) {
1399 			pthread_mutex_lock(&hd->mutex);
1400 			if (list_empty(&hd->head)) {
1401 				pthread_mutex_unlock(&hd->mutex);
1402 				continue;
1403 			}
1404 
1405 			list_replace_init(&hd->head, &list);
1406 			pthread_mutex_unlock(&hd->mutex);
1407 
1408 			handled += handle_list(hd, &list);
1409 		}
1410 	}
1411 
1412 	if (handled)
1413 		decr_entries(handled);
1414 }
1415 
1416 static void process_trace_bufs(void)
1417 {
1418 	while (wait_empty_entries())
1419 		__process_trace_bufs();
1420 }
1421 
1422 static void clean_trace_bufs(void)
1423 {
1424 	/*
1425 	 * No mutex needed here: we're only reading from the lists,
1426 	 * tracers are done
1427 	 */
1428 	while (dp_entries)
1429 		__process_trace_bufs();
1430 }
1431 
1432 static inline void read_err(int cpu, char *ifn)
1433 {
1434 	if (errno != EAGAIN)
1435 		fprintf(stderr, "Thread %d failed read of %s: %d/%s\n",
1436 			cpu, ifn, errno, strerror(errno));
1437 }
1438 
1439 static int net_sendfile(struct io_info *iop)
1440 {
1441 	int ret;
1442 
1443 	ret = sendfile(iop->ofd, iop->ifd, NULL, iop->ready);
1444 	if (ret < 0) {
1445 		perror("sendfile");
1446 		return 1;
1447 	} else if (ret < (int)iop->ready) {
1448 		fprintf(stderr, "short sendfile send (%d of %d)\n",
1449 			ret, iop->ready);
1450 		return 1;
1451 	}
1452 
1453 	return 0;
1454 }
1455 
1456 static inline int net_sendfile_data(struct tracer *tp, struct io_info *iop)
1457 {
1458 	struct devpath *dpp = iop->dpp;
1459 
1460 	if (net_send_header(iop->ofd, tp->cpu, dpp->buts_name, iop->ready))
1461 		return 1;
1462 	return net_sendfile(iop);
1463 }
1464 
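/*
 * Build the per-CPU output file name. With defaults this becomes
 * "./<buts_name>.blktrace.<cpu>"; -o and -D override the name and
 * directory parts. In server mode the traces land in a
 * "<hostname>-<YYYY-MM-DD-HH:MM:SS>/" subdirectory derived from the
 * client's connect time.
 */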
1465 static int fill_ofname(struct io_info *iop, int cpu)
1466 {
1467 	int len;
1468 	struct stat sb;
1469 	char *dst = iop->ofn;
1470 
1471 	if (output_dir)
1472 		len = snprintf(iop->ofn, sizeof(iop->ofn), "%s/", output_dir);
1473 	else
1474 		len = snprintf(iop->ofn, sizeof(iop->ofn), "./");
1475 
1476 	if (net_mode == Net_server) {
1477 		struct cl_conn *nc = iop->nc;
1478 
1479 		len += sprintf(dst + len, "%s-", nc->ch->hostname);
1480 		len += strftime(dst + len, 64, "%F-%T/",
1481 				gmtime(&iop->dpp->cl_connect_time));
1482 	}
1483 
1484 	if (stat(iop->ofn, &sb) < 0) {
1485 		if (errno != ENOENT) {
1486 			fprintf(stderr,
1487 				"Destination dir %s stat failed: %d/%s\n",
1488 				iop->ofn, errno, strerror(errno));
1489 			return 1;
1490 		}
1491 		/*
1492 		 * There is no synchronization between multiple threads
1493 		 * trying to create the directory at once.  It's harmless
1494 		 * to let them try, so just detect the problem and move on.
1495 		 */
1496 		if (mkdir(iop->ofn, 0755) < 0 && errno != EEXIST) {
1497 			fprintf(stderr,
1498 				"Destination dir %s can't be made: %d/%s\n",
1499 				iop->ofn, errno, strerror(errno));
1500 			return 1;
1501 		}
1502 	}
1503 
1504 	if (output_name)
1505 		snprintf(iop->ofn + len, sizeof(iop->ofn) - len, "%s.blktrace.%d",
1506 			 output_name, cpu);
1507 	else
1508 		snprintf(iop->ofn + len, sizeof(iop->ofn) - len, "%s.blktrace.%d",
1509 			 iop->dpp->buts_name, cpu);
1510 
1511 	return 0;
1512 }
1513 
1514 static int set_vbuf(struct io_info *iop, int mode, size_t size)
1515 {
1516 	iop->obuf = malloc(size);
1517 	if (setvbuf(iop->ofp, iop->obuf, mode, size) < 0) {
1518 		fprintf(stderr, "setvbuf(%s, %d) failed: %d/%s\n",
1519 			iop->dpp->path, (int)size, errno,
1520 			strerror(errno));
1521 		free(iop->obuf);
1522 		return 1;
1523 	}
1524 
1525 	return 0;
1526 }
1527 
1528 static int iop_open(struct io_info *iop, int cpu)
1529 {
1530 	iop->ofd = -1;
1531 	if (fill_ofname(iop, cpu))
1532 		return 1;
1533 
1534 	iop->ofp = my_fopen(iop->ofn, "w+");
1535 	if (iop->ofp == NULL) {
1536 		fprintf(stderr, "Open output file %s failed: %d/%s\n",
1537 			iop->ofn, errno, strerror(errno));
1538 		return 1;
1539 	}
1540 
1541 	if (set_vbuf(iop, _IOLBF, FILE_VBUF_SIZE)) {
1542 		fprintf(stderr, "set_vbuf for file %s failed: %d/%s\n",
1543 			iop->ofn, errno, strerror(errno));
1544 		fclose(iop->ofp);
1545 		return 1;
1546 	}
1547 
1548 	iop->ofd = fileno(iop->ofp);
1549 	return 0;
1550 }
1551 
1552 static void close_iop(struct io_info *iop)
1553 {
1554 	struct mmap_info *mip = &iop->mmap_info;
1555 
1556 	if (mip->fs_buf)
1557 		munmap(mip->fs_buf, mip->fs_buf_len);
1558 
1559 	if (!piped_output) {
1560 		if (ftruncate(fileno(iop->ofp), mip->fs_size) < 0) {
1561 			fprintf(stderr,
1562 				"Ignoring err: ftruncate(%s): %d/%s\n",
1563 				iop->ofn, errno, strerror(errno));
1564 		}
1565 	}
1566 
1567 	if (iop->ofp)
1568 		fclose(iop->ofp);
1569 	if (iop->obuf)
1570 		free(iop->obuf);
1571 }
1572 
1573 static void close_ios(struct tracer *tp)
1574 {
1575 	while (tp->nios > 0) {
1576 		struct io_info *iop = &tp->ios[--tp->nios];
1577 
1578 		iop->dpp->drops = get_drops(iop->dpp);
1579 		if (iop->ifd >= 0)
1580 			close(iop->ifd);
1581 
1582 		if (iop->ofp)
1583 			close_iop(iop);
1584 		else if (iop->ofd >= 0) {
1585 			struct devpath *dpp = iop->dpp;
1586 
1587 			net_send_close(iop->ofd, dpp->buts_name, dpp->drops);
1588 			net_close_connection(&iop->ofd);
1589 		}
1590 	}
1591 
1592 	free(tp->ios);
1593 	free(tp->pfds);
1594 }
1595 
1596 static int open_ios(struct tracer *tp)
1597 {
1598 	struct pollfd *pfd;
1599 	struct io_info *iop;
1600 	struct list_head *p;
1601 
1602 	tp->ios = calloc(ndevs, sizeof(struct io_info));
1603 	memset(tp->ios, 0, ndevs * sizeof(struct io_info));
1604 
1605 	tp->pfds = calloc(ndevs, sizeof(struct pollfd));
1606 	memset(tp->pfds, 0, ndevs * sizeof(struct pollfd));
1607 
1608 	tp->nios = 0;
1609 	iop = tp->ios;
1610 	pfd = tp->pfds;
1611 	__list_for_each(p, &devpaths) {
1612 		struct devpath *dpp = list_entry(p, struct devpath, head);
1613 
1614 		iop->dpp = dpp;
1615 		iop->ofd = -1;
1616 		snprintf(iop->ifn, sizeof(iop->ifn), "%s/block/%s/trace%d",
1617 			debugfs_path, dpp->buts_name, tp->cpu);
1618 
1619 		iop->ifd = my_open(iop->ifn, O_RDONLY | O_NONBLOCK);
1620 		if (iop->ifd < 0) {
1621 			fprintf(stderr, "Thread %d failed open %s: %d/%s\n",
1622 				tp->cpu, iop->ifn, errno, strerror(errno));
1623 			return 1;
1624 		}
1625 
1626 		init_mmap_info(&iop->mmap_info);
1627 
1628 		pfd->fd = iop->ifd;
1629 		pfd->events = POLLIN;
1630 
1631 		if (piped_output)
1632 			;
1633 		else if (net_client_use_sendfile()) {
1634 			iop->ofd = net_setup_client();
1635 			if (iop->ofd < 0)
1636 				goto err;
1637 			net_send_open(iop->ofd, tp->cpu, dpp->buts_name);
1638 		} else if (net_mode == Net_none) {
1639 			if (iop_open(iop, tp->cpu))
1640 				goto err;
1641 		} else {
1642 			/*
1643 			 * This ensures that the server knows about all
1644 			 * connections & devices before _any_ closes
1645 			 */
1646 			net_send_open(cl_fds[tp->cpu], tp->cpu, dpp->buts_name);
1647 		}
1648 
1649 		pfd++;
1650 		iop++;
1651 		tp->nios++;
1652 	}
1653 
1654 	return 0;
1655 
1656 err:
1657 	close(iop->ifd);	/* tp->nios _not_ bumped */
1658 	close_ios(tp);
1659 	return 1;
1660 }
1661 
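/*
 * Local file output: relay data is read straight into the mmap()ed
 * window of the output file, so no intermediate buffering or copying
 * is needed; setup_mmap() grows the window as required.
 */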
1662 static int handle_pfds_file(struct tracer *tp, int nevs, int force_read)
1663 {
1664 	struct mmap_info *mip;
1665 	int i, ret, nentries = 0;
1666 	struct pollfd *pfd = tp->pfds;
1667 	struct io_info *iop = tp->ios;
1668 
1669 	for (i = 0; nevs > 0 && i < ndevs; i++, pfd++, iop++) {
1670 		if (pfd->revents & POLLIN || force_read) {
1671 			mip = &iop->mmap_info;
1672 
1673 			ret = setup_mmap(iop->ofd, buf_size, mip);
1674 			if (ret) {
1675 				pfd->events = 0;
1676 				break;
1677 			}
1678 
1679 			ret = read(iop->ifd, mip->fs_buf + mip->fs_off,
1680 				   buf_size);
1681 			if (ret > 0) {
1682 				pdc_dr_update(iop->dpp, tp->cpu, ret);
1683 				mip->fs_size += ret;
1684 				mip->fs_off += ret;
1685 				nentries++;
1686 			} else if (ret == 0) {
1687 				/*
1688 				 * Short reads after we're done stop us
1689 				 * from trying reads.
1690 				 */
1691 				if (tp->is_done)
1692 					clear_events(pfd);
1693 			} else {
1694 				read_err(tp->cpu, iop->ifn);
1695 				if (errno != EAGAIN || tp->is_done)
1696 					clear_events(pfd);
1697 			}
1698 			nevs--;
1699 		}
1700 	}
1701 
1702 	return nentries;
1703 }
1704 
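/*
 * Network client in sendfile mode: the per-CPU relay files only ever
 * grow, so fstat() tells us how much new data has appeared since the
 * last pass (tracked in data_queued), and sendfile() ships exactly
 * that delta to the server without copying it through user space.
 */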
1705 static int handle_pfds_netclient(struct tracer *tp, int nevs, int force_read)
1706 {
1707 	struct stat sb;
1708 	int i, nentries = 0;
1710 	struct pollfd *pfd = tp->pfds;
1711 	struct io_info *iop = tp->ios;
1712 
1713 	for (i = 0; i < ndevs; i++, pfd++, iop++) {
1714 		if (pfd->revents & POLLIN || force_read) {
1715 			if (fstat(iop->ifd, &sb) < 0) {
1716 				perror(iop->ifn);
1717 				pfd->events = 0;
1718 			} else if (sb.st_size > (off_t)iop->data_queued) {
1719 				iop->ready = sb.st_size - iop->data_queued;
1720 				iop->data_queued = sb.st_size;
1721 
1722 				if (!net_sendfile_data(tp, iop)) {
1723 					pdc_dr_update(iop->dpp, tp->cpu,
1724 						      iop->ready);
1725 					nentries++;
1726 				} else
1727 					clear_events(pfd);
1728 			}
1729 			if (--nevs == 0)
1730 				break;
1731 		}
1732 	}
1733 
1734 	if (nentries)
1735 		incr_entries(nentries);
1736 
1737 	return nentries;
1738 }
1739 
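/*
 * Piped output / network client "send" mode: read each relay file
 * into a freshly allocated trace_buf and queue it for the main
 * thread (see the trace_buf comments near the top of this file).
 */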
1740 static int handle_pfds_entries(struct tracer *tp, int nevs, int force_read)
1741 {
1742 	int i, nentries = 0;
1743 	struct trace_buf *tbp;
1744 	struct pollfd *pfd = tp->pfds;
1745 	struct io_info *iop = tp->ios;
1746 
1747 	tbp = alloc_trace_buf(tp->cpu, buf_size);
1748 	for (i = 0; i < ndevs; i++, pfd++, iop++) {
1749 		if (pfd->revents & POLLIN || force_read) {
1750 			tbp->len = read(iop->ifd, tbp->buf, buf_size);
1751 			if (tbp->len > 0) {
1752 				pdc_dr_update(iop->dpp, tp->cpu, tbp->len);
1753 				add_trace_buf(iop->dpp, tp->cpu, &tbp);
1754 				nentries++;
1755 			} else if (tbp->len == 0) {
1756 				/*
1757 				 * Short reads after we're done stop us
1758 				 * from trying reads.
1759 				 */
1760 				if (tp->is_done)
1761 					clear_events(pfd);
1762 			} else {
1763 				read_err(tp->cpu, iop->ifn);
1764 				if (errno != EAGAIN || tp->is_done)
1765 					clear_events(pfd);
1766 			}
1767 			if (!piped_output && --nevs == 0)
1768 				break;
1769 		}
1770 	}
1771 	free(tbp);
1772 
1773 	if (nentries)
1774 		incr_entries(nentries);
1775 
1776 	return nentries;
1777 }
1778 
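/*
 * Per-CPU tracer thread body: pin to the CPU, open that CPU's relay
 * file for every device, report ready and wait for the go-ahead,
 * then poll()/read until told to stop; finally drain with forced
 * reads until a short read shows the relay buffers are empty.
 */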
1779 static void *thread_main(void *arg)
1780 {
1781 	int ret, ndone, to_val;
1782 	struct tracer *tp = arg;
1783 
1784 	ret = lock_on_cpu(tp->cpu);
1785 	if (ret)
1786 		goto err;
1787 
1788 	ret = open_ios(tp);
1789 	if (ret)
1790 		goto err;
1791 
1792 	if (piped_output)
1793 		to_val = 50;		/* Frequent partial handles */
1794 	else
1795 		to_val = 500;		/* 1/2 second intervals */
1796 
1797 
1798 	tracer_signal_ready(tp, Th_running, 0);
1799 	tracer_wait_unblock(tp);
1800 
1801 	while (!tp->is_done) {
1802 		ndone = poll(tp->pfds, ndevs, to_val);
1803 		if (ndone > 0 || piped_output)
1804 			(void)handle_pfds(tp, ndone, piped_output);
1805 		else if (ndone < 0 && errno != EINTR)
1806 			fprintf(stderr, "Thread %d poll failed: %d/%s\n",
1807 				tp->cpu, errno, strerror(errno));
1808 	}
1809 
1810 	/*
1811 	 * Trace is stopped, pull data until we get a short read
1812 	 */
1813 	while (handle_pfds(tp, ndevs, 1) > 0)
1814 		;
1815 
1816 	close_ios(tp);
1817 	tracer_signal_ready(tp, Th_leaving, 0);
1818 	return NULL;
1819 
1820 err:
1821 	tracer_signal_ready(tp, Th_error, ret);
1822 	return NULL;
1823 }
1824 
1825 static int start_tracer(int cpu)
1826 {
1827 	struct tracer *tp;
1828 
1829 	tp = malloc(sizeof(*tp));
1830 	memset(tp, 0, sizeof(*tp));
1831 
1832 	INIT_LIST_HEAD(&tp->head);
1833 	tp->status = 0;
1834 	tp->cpu = cpu;
1835 
1836 	if (pthread_create(&tp->thread, NULL, thread_main, tp)) {
1837 		fprintf(stderr, "FAILED to start thread on CPU %d: %d/%s\n",
1838 			cpu, errno, strerror(errno));
1839 		free(tp);
1840 		return 1;
1841 	}
1842 
1843 	list_add_tail(&tp->head, &tracers);
1844 	return 0;
1845 }
1846 
1847 static void start_tracers(void)
1848 {
1849 	int cpu;
1850 	struct list_head *p;
1851 
1852 	for (cpu = 0; cpu < ncpus; cpu++)
1853 		if (start_tracer(cpu))
1854 			break;
1855 
1856 	wait_tracers_ready(cpu);
1857 
1858 	__list_for_each(p, &tracers) {
1859 		struct tracer *tp = list_entry(p, struct tracer, head);
1860 		if (tp->status)
1861 			fprintf(stderr,
1862 				"FAILED to start thread on CPU %d: %d/%s\n",
1863 				tp->cpu, tp->status, strerror(tp->status));
1864 	}
1865 }
1866 
1867 static void stop_tracers(void)
1868 {
1869 	struct list_head *p;
1870 
1871 	/*
1872 	 * Stop the tracing - makes the tracer threads clean up quicker.
1873 	 */
1874 	__list_for_each(p, &devpaths) {
1875 		struct devpath *dpp = list_entry(p, struct devpath, head);
1876 		(void)ioctl(dpp->fd, BLKTRACESTOP);
1877 	}
1878 
1879 	/*
1880 	 * Tell each tracer to quit
1881 	 */
1882 	__list_for_each(p, &tracers) {
1883 		struct tracer *tp = list_entry(p, struct tracer, head);
1884 		tp->is_done = 1;
1885 	}
1886 }
1887 
1888 static void del_tracers(void)
1889 {
1890 	struct list_head *p, *q;
1891 
1892 	list_for_each_safe(p, q, &tracers) {
1893 		struct tracer *tp = list_entry(p, struct tracer, head);
1894 
1895 		list_del(&tp->head);
1896 		free(tp);
1897 	}
1898 }
1899 
1900 static void wait_tracers(void)
1901 {
1902 	struct list_head *p;
1903 
1904 	if (use_tracer_devpaths())
1905 		process_trace_bufs();
1906 
1907 	wait_tracers_leaving();
1908 
1909 	__list_for_each(p, &tracers) {
1910 		int ret;
1911 		struct tracer *tp = list_entry(p, struct tracer, head);
1912 
1913 		ret = pthread_join(tp->thread, NULL);
1914 		if (ret)
1915 			fprintf(stderr, "Thread join %d failed %d\n",
1916 				tp->cpu, ret);
1917 	}
1918 
1919 	if (use_tracer_devpaths())
1920 		clean_trace_bufs();
1921 
1922 	get_all_drops();
1923 }
1924 
1925 static void exit_tracing(void)
1926 {
1927 	signal(SIGINT, SIG_IGN);
1928 	signal(SIGHUP, SIG_IGN);
1929 	signal(SIGTERM, SIG_IGN);
1930 	signal(SIGALRM, SIG_IGN);
1931 
1932 	stop_tracers();
1933 	wait_tracers();
1934 	del_tracers();
1935 	rel_devpaths();
1936 }
1937 
1938 static void handle_sigint(__attribute__((__unused__)) int sig)
1939 {
1940 	done = 1;
1941 	stop_tracers();
1942 }
1943 
1944 static void show_stats(struct list_head *devpaths)
1945 {
1946 	FILE *ofp;
1947 	struct list_head *p;
1948 	unsigned long long nevents, data_read;
1949 	unsigned long long total_drops = 0;
1950 	unsigned long long total_events = 0;
1951 
1952 	if (piped_output)
1953 		ofp = my_fopen("/dev/null", "w");
1954 	else
1955 		ofp = stdout;
1956 
1957 	__list_for_each(p, devpaths) {
1958 		int cpu;
1959 		struct pdc_stats *sp;
1960 		struct devpath *dpp = list_entry(p, struct devpath, head);
1961 
1962 		if (net_mode == Net_server)
1963 			printf("server: end of run for %s:%s\n",
1964 				dpp->ch->hostname, dpp->buts_name);
1965 
1966 		data_read = 0;
1967 		nevents = 0;
1968 
1969 		fprintf(ofp, "=== %s ===\n", dpp->buts_name);
1970 		for (cpu = 0, sp = dpp->stats; cpu < dpp->ncpus; cpu++, sp++) {
1971 			/*
1972 			 * Estimate events if not known...
1973 			 */
1974 			if (sp->nevents == 0) {
1975 				sp->nevents = sp->data_read /
1976 						sizeof(struct blk_io_trace);
1977 			}
1978 
1979 			fprintf(ofp,
1980 				"  CPU%3d: %20llu events, %8llu KiB data\n",
1981 				cpu, sp->nevents, (sp->data_read + 1023) >> 10);
1982 
1983 			data_read += sp->data_read;
1984 			nevents += sp->nevents;
1985 		}
1986 
1987 		fprintf(ofp, "  Total:  %20llu events (dropped %llu),"
1988 			     " %8llu KiB data\n", nevents,
1989 			     dpp->drops, (data_read + 1023) >> 10);
1990 
1991 		total_drops += dpp->drops;
1992 		total_events += (nevents + dpp->drops);
1993 	}
1994 
1995 	fflush(ofp);
1996 	if (piped_output)
1997 		fclose(ofp);
1998 
1999 	if (total_drops) {
2000 		double drops_ratio = 1.0;
2001 
2002 		if (total_events)
2003 			drops_ratio = (double)total_drops/(double)total_events;
2004 
2005 		fprintf(stderr, "\nYou have %llu (%5.1lf%%) dropped events\n"
2006 				"Consider using a larger buffer size (-b) "
2007 				"and/or more buffers (-n)\n",
2008 			total_drops, 100.0 * drops_ratio);
2009 	}
2010 }
2011 
2012 static int handle_args(int argc, char *argv[])
2013 {
2014 	int c, i;
2015 	struct statfs st;
2016 	int act_mask_tmp = 0;
2017 
2018 	while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) >= 0) {
2019 		switch (c) {
2020 		case 'a':
2021 			i = find_mask_map(optarg);
2022 			if (i < 0) {
2023 				fprintf(stderr, "Invalid action mask %s\n",
2024 					optarg);
2025 				return 1;
2026 			}
2027 			act_mask_tmp |= i;
2028 			break;
2029 
2030 		case 'A':
2031 			if ((sscanf(optarg, "%x", &i) != 1) ||
2032 							!valid_act_opt(i)) {
2033 				fprintf(stderr,
2034 					"Invalid set action mask %s/0x%x\n",
2035 					optarg, i);
2036 				return 1;
2037 			}
2038 			act_mask_tmp = i;
2039 			break;
2040 
2041 		case 'd':
2042 			if (add_devpath(optarg) != 0)
2043 				return 1;
2044 			break;
2045 
2046 		case 'I': {
2047 			char dev_line[256];
2048 			FILE *ifp = my_fopen(optarg, "r");
2049 
2050 			if (!ifp) {
2051 				fprintf(stderr,
2052 					"Invalid file for devices %s\n",
2053 					optarg);
2054 				return 1;
2055 			}
2056 
2057 			while (fscanf(ifp, "%255s", dev_line) == 1)
2058 				if (add_devpath(dev_line) != 0)
2059 					return 1;
2060 			break;
2061 		}

		case 'r':
			debugfs_path = optarg;
			break;

		case 'o':
			output_name = optarg;
			break;
		case 'k':
			kill_running_trace = 1;
			break;
		case 'w':
			stop_watch = atoi(optarg);
			if (stop_watch <= 0) {
				fprintf(stderr,
					"Invalid stopwatch value (%d secs)\n",
					stop_watch);
				return 1;
			}
			break;
		case 'V':
		case 'v':
			printf("%s version %s\n", argv[0], blktrace_version);
			exit(0);
			/*NOTREACHED*/
		case 'b':
			buf_size = strtoul(optarg, NULL, 10);
			if (buf_size <= 0 || buf_size > 16*1024) {
				fprintf(stderr, "Invalid buffer size (%lu)\n",
					buf_size);
				return 1;
			}
			buf_size <<= 10;
			break;
		case 'n':
			buf_nr = strtoul(optarg, NULL, 10);
			if (buf_nr <= 0) {
				fprintf(stderr,
					"Invalid buffer nr (%lu)\n", buf_nr);
				return 1;
			}
			break;
		case 'D':
			output_dir = optarg;
			break;
		case 'h':
			net_mode = Net_client;
			strncpy(hostname, optarg, sizeof(hostname) - 1);
			hostname[sizeof(hostname) - 1] = '\0';
			break;
		case 'l':
			net_mode = Net_server;
			break;
		case 'p':
			net_port = atoi(optarg);
			break;
		case 's':
			net_use_sendfile = 0;
			break;
		default:
			show_usage(argv[0]);
			exit(1);
			/*NOTREACHED*/
		}
	}

	while (optind < argc)
		if (add_devpath(argv[optind++]) != 0)
			return 1;

	if (net_mode != Net_server && ndevs == 0) {
		show_usage(argv[0]);
		return 1;
	}

	if (statfs(debugfs_path, &st) < 0 || st.f_type != (long)DEBUGFS_TYPE) {
		fprintf(stderr, "Invalid debug path %s: %d/%s\n",
			debugfs_path, errno, strerror(errno));
		return 1;
	}

	if (act_mask_tmp != 0)
		act_mask = act_mask_tmp;

	if (net_mode == Net_client && net_setup_addr())
		return 1;

	/*
	 * Set up for appropriate PFD handler based upon output name.
	 */
	if (net_client_use_sendfile())
		handle_pfds = handle_pfds_netclient;
	else if (net_client_use_send())
		handle_pfds = handle_pfds_entries;
	else if (output_name && (strcmp(output_name, "-") == 0)) {
		piped_output = 1;
		handle_pfds = handle_pfds_entries;
		pfp = stdout;
		setvbuf(pfp, NULL, _IONBF, 0);
	} else
		handle_pfds = handle_pfds_file;
	return 0;
}

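/*
 * Track a newly accepted connection on both the per-host and per-server
 * connection lists, and grow the server's pollfd array to make room for
 * it (slot 0 is always the listening socket, hence connects + 1).
 */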
static void ch_add_connection(struct net_server_s *ns, struct cl_host *ch,
			      int fd)
{
	struct cl_conn *nc;

	nc = malloc(sizeof(*nc));
	memset(nc, 0, sizeof(*nc));

	time(&nc->connect_time);
	nc->ch = ch;
	nc->fd = fd;
	nc->ncpus = -1;

	list_add_tail(&nc->ch_head, &ch->conn_list);
	ch->connects++;

	list_add_tail(&nc->ns_head, &ns->conn_list);
	ns->connects++;
	ns->pfds = realloc(ns->pfds, (ns->connects+1) * sizeof(struct pollfd));
}

static void ch_rem_connection(struct net_server_s *ns, struct cl_host *ch,
			      struct cl_conn *nc)
{
	net_close_connection(&nc->fd);

	list_del(&nc->ch_head);
	ch->connects--;

	list_del(&nc->ns_head);
	ns->connects--;
	ns->pfds = realloc(ns->pfds, (ns->connects+1) * sizeof(struct pollfd));

	free(nc);
}

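/*
 * Find the client host entry matching the given IPv4 address, or NULL
 * if this is the first connection from that address.
 */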
static struct cl_host *net_find_client_host(struct net_server_s *ns,
					    struct in_addr cl_in_addr)
{
	struct list_head *p;

	__list_for_each(p, &ns->ch_list) {
		struct cl_host *ch = list_entry(p, struct cl_host, head);

		if (in_addr_eq(ch->cl_in_addr, cl_in_addr))
			return ch;
	}

	return NULL;
}

static struct cl_host *net_add_client_host(struct net_server_s *ns,
					   struct sockaddr_in *addr)
{
	struct cl_host *ch;

	ch = malloc(sizeof(*ch));
	memset(ch, 0, sizeof(*ch));

	ch->ns = ns;
	ch->cl_in_addr = addr->sin_addr;
	list_add_tail(&ch->head, &ns->ch_list);
	ns->nchs++;

	ch->hostname = strdup(inet_ntoa(addr->sin_addr));
	printf("server: connection from %s\n", ch->hostname);

	INIT_LIST_HEAD(&ch->conn_list);
	INIT_LIST_HEAD(&ch->devpaths);

	return ch;
}

static void device_done(struct devpath *dpp, int ncpus)
{
	int cpu;
	struct io_info *iop;

	for (cpu = 0, iop = dpp->ios; cpu < ncpus; cpu++, iop++)
		close_iop(iop);

	list_del(&dpp->head);
	dpp_free(dpp);
}

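/*
 * Tear down all state for a client host: close out every device being
 * traced for it, drop all of its connections, then free the host itself.
 */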
static void net_ch_remove(struct cl_host *ch, int ncpus)
{
	struct list_head *p, *q;
	struct net_server_s *ns = ch->ns;

	list_for_each_safe(p, q, &ch->devpaths) {
		struct devpath *dpp = list_entry(p, struct devpath, head);
		device_done(dpp, ncpus);
	}

	list_for_each_safe(p, q, &ch->conn_list) {
		struct cl_conn *nc = list_entry(p, struct cl_conn, ch_head);

		ch_rem_connection(ns, ch, nc);
	}

	list_del(&ch->head);
	ns->nchs--;

	if (ch->hostname)
		free(ch->hostname);
	free(ch);
}

static void net_add_connection(struct net_server_s *ns)
{
	int fd;
	struct cl_host *ch;
	socklen_t socklen = sizeof(ns->addr);

	fd = my_accept(ns->listen_fd, (struct sockaddr *)&ns->addr, &socklen);
	if (fd < 0) {
		/*
		 * This is OK: we just won't accept this connection,
		 * nothing fatal.
		 */
		perror("accept");
	} else {
		ch = net_find_client_host(ns, ns->addr.sin_addr);
		if (!ch)
			ch = net_add_client_host(ns, &ns->addr);

		ch_add_connection(ns, ch, fd);
	}
}

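/*
 * Allocate per-device state for a device that a remote client has begun
 * tracing: one pdc_stats slot and one opened io_info per client CPU.
 * Returns NULL if any per-CPU output file cannot be opened.
 */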
static struct devpath *nc_add_dpp(struct cl_conn *nc,
				  struct blktrace_net_hdr *bnh,
				  time_t connect_time)
{
	int cpu;
	struct io_info *iop;
	struct devpath *dpp;

	dpp = malloc(sizeof(*dpp));
	memset(dpp, 0, sizeof(*dpp));

	dpp->buts_name = strdup(bnh->buts_name);
	dpp->path = strdup(bnh->buts_name);
	dpp->fd = -1;
	dpp->ch = nc->ch;
	dpp->cl_id = bnh->cl_id;
	dpp->cl_connect_time = connect_time;
	dpp->ncpus = nc->ncpus;
	dpp->stats = calloc(dpp->ncpus, sizeof(*dpp->stats));

	list_add_tail(&dpp->head, &nc->ch->devpaths);
	nc->ch->ndevs++;

	dpp->ios = calloc(nc->ncpus, sizeof(*iop));

	for (cpu = 0, iop = dpp->ios; cpu < nc->ncpus; cpu++, iop++) {
		iop->dpp = dpp;
		iop->nc = nc;
		init_mmap_info(&iop->mmap_info);

		if (iop_open(iop, cpu))
			goto err;
	}

	return dpp;

err:
	/*
	 * Need to unravel what's been done...
	 */
	while (cpu >= 0)
		close_iop(&dpp->ios[cpu--]);
	dpp_free(dpp);

	return NULL;
}

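/*
 * Look up the devpath for the device named in an incoming header. If it
 * is not known yet, create it, inheriting the connect time already
 * recorded for any other device from the same client run (matched by
 * cl_id).
 */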
static struct devpath *nc_find_dpp(struct cl_conn *nc,
				   struct blktrace_net_hdr *bnh)
{
	struct list_head *p;
	time_t connect_time = nc->connect_time;

	__list_for_each(p, &nc->ch->devpaths) {
		struct devpath *dpp = list_entry(p, struct devpath, head);

		if (!strcmp(dpp->buts_name, bnh->buts_name))
			return dpp;

		if (dpp->cl_id == bnh->cl_id)
			connect_time = dpp->cl_connect_time;
	}

	return nc_add_dpp(nc, bnh, connect_time);
}

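/*
 * Pull one payload of trace data off the wire and append it to the
 * per-CPU output file through its mmap'ed window, extending the mapping
 * first if the payload would not fit. Receive errors are fatal.
 */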
static void net_client_read_data(struct cl_conn *nc, struct devpath *dpp,
				 struct blktrace_net_hdr *bnh)
{
	int ret;
	struct io_info *iop = &dpp->ios[bnh->cpu];
	struct mmap_info *mip = &iop->mmap_info;

	if (setup_mmap(iop->ofd, bnh->len, &iop->mmap_info)) {
		fprintf(stderr, "ncd(%s:%d): mmap failed\n",
			nc->ch->hostname, nc->fd);
		exit(1);
	}

	ret = net_recv_data(nc->fd, mip->fs_buf + mip->fs_off, bnh->len);
	if (ret > 0) {
		pdc_dr_update(dpp, bnh->cpu, ret);
		mip->fs_size += ret;
		mip->fs_off += ret;
	} else if (ret < 0)
		exit(1);
}

/*
 * Returns 1 if we closed a host - invalidates other polling information
 * that may be present.
 */
static int net_client_data(struct cl_conn *nc)
{
	int ret;
	struct devpath *dpp;
	struct blktrace_net_hdr bnh;

	ret = net_get_header(nc, &bnh);
	if (ret == 0)
		return 0;

	if (ret < 0) {
		fprintf(stderr, "ncd(%d): header read failed\n", nc->fd);
		exit(1);
	}

	if (data_is_native == -1 && check_data_endianness(bnh.magic)) {
		fprintf(stderr, "ncd(%d): received data is bad\n", nc->fd);
		exit(1);
	}

	if (!data_is_native) {
		bnh.magic = be32_to_cpu(bnh.magic);
		bnh.cpu = be32_to_cpu(bnh.cpu);
		bnh.max_cpus = be32_to_cpu(bnh.max_cpus);
		bnh.len = be32_to_cpu(bnh.len);
		bnh.cl_id = be32_to_cpu(bnh.cl_id);
		bnh.buf_size = be32_to_cpu(bnh.buf_size);
		bnh.buf_nr = be32_to_cpu(bnh.buf_nr);
		bnh.page_size = be32_to_cpu(bnh.page_size);
	}

	if ((bnh.magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
		fprintf(stderr, "ncd(%s:%d): bad data magic\n",
			nc->ch->hostname, nc->fd);
		exit(1);
	}

	if (nc->ncpus == -1)
		nc->ncpus = bnh.max_cpus;

	/*
	 * len == 0 means the other end is sending us a new connection/dpp
	 * len == 1 means that the other end signalled end-of-run
	 */
	dpp = nc_find_dpp(nc, &bnh);
	if (bnh.len == 0) {
		/*
		 * Just adding in the dpp above is enough
		 */
		ack_open_close(nc->fd, dpp->buts_name);
		nc->ch->cl_opens++;
	} else if (bnh.len == 1) {
		/*
		 * overload cpu count with dropped events
		 */
		dpp->drops = bnh.cpu;

		ack_open_close(nc->fd, dpp->buts_name);
		if (--nc->ch->cl_opens == 0) {
			show_stats(&nc->ch->devpaths);
			net_ch_remove(nc->ch, nc->ncpus);
			return 1;
		}
	} else
		net_client_read_data(nc, dpp, &bnh);

	return 0;
}

static void handle_client_data(struct net_server_s *ns, int events)
{
	struct cl_conn *nc;
	struct pollfd *pfd;
	struct list_head *p, *q;

	pfd = &ns->pfds[1];
	list_for_each_safe(p, q, &ns->conn_list) {
		if (pfd->revents & POLLIN) {
			nc = list_entry(p, struct cl_conn, ns_head);

			if (net_client_data(nc) || --events == 0)
				break;
		}
		pfd++;
	}
}

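/*
 * (Re)build the pollfd array: slot 0 watches the listening socket for
 * new connections, slots 1..connects watch the established client
 * connections, all for POLLIN.
 */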
static void net_setup_pfds(struct net_server_s *ns)
{
	struct pollfd *pfd;
	struct list_head *p;

	ns->pfds[0].fd = ns->listen_fd;
	ns->pfds[0].events = POLLIN;

	pfd = &ns->pfds[1];
	__list_for_each(p, &ns->conn_list) {
		struct cl_conn *nc = list_entry(p, struct cl_conn, ns_head);

		pfd->fd = nc->fd;
		pfd->events = POLLIN;
		pfd++;
	}
}

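/*
 * Main server poll loop. The pollfd array is rebuilt on every pass,
 * since handling client data can add or remove connections underneath
 * us.
 */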
static int net_server_handle_connections(struct net_server_s *ns)
{
	int events;

	printf("server: waiting for connections...\n");

	while (!done) {
		net_setup_pfds(ns);
		events = poll(ns->pfds, ns->connects + 1, -1);
		if (events < 0) {
			if (errno != EINTR) {
				perror("FATAL: poll error");
				return 1;
			}
		} else if (events > 0) {
			if (ns->pfds[0].revents & POLLIN) {
				net_add_connection(ns);
				events--;
			}

			if (events)
				handle_client_data(ns, events);
		}
	}

	return 0;
}

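/*
 * Server mode entry point: create a TCP socket listening on all
 * interfaces at net_port (default TRACE_NET_PORT, overridden with -p),
 * then sit in the connection handling loop until told to stop.
 */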
static int net_server(void)
{
	int fd, opt;
	int ret = 1;
	struct net_server_s net_server;
	struct net_server_s *ns = &net_server;

	memset(ns, 0, sizeof(*ns));
	INIT_LIST_HEAD(&ns->ch_list);
	INIT_LIST_HEAD(&ns->conn_list);
	ns->pfds = malloc(sizeof(struct pollfd));

	fd = my_socket(AF_INET, SOCK_STREAM, 0);
	if (fd < 0) {
		perror("server: socket");
		goto out;
	}

	opt = 1;
	if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)) < 0) {
		perror("setsockopt");
		goto out;
	}

	memset(&ns->addr, 0, sizeof(ns->addr));
	ns->addr.sin_family = AF_INET;
	ns->addr.sin_addr.s_addr = htonl(INADDR_ANY);
	ns->addr.sin_port = htons(net_port);

	if (bind(fd, (struct sockaddr *) &ns->addr, sizeof(ns->addr)) < 0) {
		perror("bind");
		goto out;
	}

	if (listen(fd, 1) < 0) {
		perror("listen");
		goto out;
	}

	/*
	 * The actual server looping is done here:
	 */
	ns->listen_fd = fd;
	ret = net_server_handle_connections(ns);

	/*
	 * Clean up and return...
	 */
out:
	free(ns->pfds);
	return ret;
}

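/*
 * Normal (local or network client) tracing: arm the kernel side via
 * setup_buts(), start one tracer thread per CPU, and only begin the
 * actual run once every thread has come up; otherwise back out. A stop
 * watch (-w) simply schedules a SIGALRM to end the run.
 */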
static int run_tracers(void)
{
	atexit(exit_tracing);
	if (net_mode == Net_client)
		printf("blktrace: connecting to %s\n", hostname);

	setup_buts();

	if (use_tracer_devpaths()) {
		if (setup_tracer_devpaths())
			return 1;

		if (piped_output)
			handle_list = handle_list_file;
		else
			handle_list = handle_list_net;
	}

	start_tracers();
	if (nthreads_running == ncpus) {
		unblock_tracers();
		start_buts();
		if (net_mode == Net_client)
			printf("blktrace: connected!\n");
		if (stop_watch)
			alarm(stop_watch);
	} else
		stop_tracers();

	wait_tracers();
	if (nthreads_running == ncpus)
		show_stats(&devpaths);
	if (net_client_use_send())
		close_client_connections();
	del_tracers();

	return 0;
}

int main(int argc, char *argv[])
{
	int ret = 0;

	setlocale(LC_NUMERIC, "en_US");
	pagesize = getpagesize();
	ncpus = sysconf(_SC_NPROCESSORS_ONLN);
	if (ncpus < 0) {
		fprintf(stderr, "sysconf(_SC_NPROCESSORS_ONLN) failed %d/%s\n",
			errno, strerror(errno));
		ret = 1;
		goto out;
	} else if (handle_args(argc, argv)) {
		ret = 1;
		goto out;
	}

	signal(SIGINT, handle_sigint);
	signal(SIGHUP, handle_sigint);
	signal(SIGTERM, handle_sigint);
	signal(SIGALRM, handle_sigint);
	signal(SIGPIPE, SIG_IGN);

	if (kill_running_trace) {
		struct devpath *dpp;
		struct list_head *p;

		__list_for_each(p, &devpaths) {
			dpp = list_entry(p, struct devpath, head);
			if (__stop_trace(dpp->fd)) {
				fprintf(stderr,
					"BLKTRACETEARDOWN %s failed: %d/%s\n",
					dpp->path, errno, strerror(errno));
			}
		}
	} else if (net_mode == Net_server) {
		if (output_name) {
			fprintf(stderr, "-o ignored in server mode\n");
			output_name = NULL;
		}
		ret = net_server();
	} else
		ret = run_tracers();

out:
	if (pfp)
		fclose(pfp);
	rel_devpaths();
	return ret;
}