#!/usr/bin/python

"""
Poll server-status on cautotest to watch for RPCs taking longer than 10s. Then
we go and ssh around to figure out what the command line of the process that
caused the RPC was so that one can track down what is generating the expensive
RPC load.
"""

try:
    from bs4 import BeautifulSoup
except ImportError:
    print('Run `apt-get install python-bs4`')
    raise

import sys
import time
import subprocess
import multiprocessing

# NOTE(review): `common` is not referenced below; presumably it is imported
# for its side effects -- confirm before removing.
import common
import requests


# Page that is polled for long-running requests.
STATUS_URL = 'http://cautotest/server-status'
# A worker is reported once its seconds column exceeds this value.
SLOW_RPC_SECS = 10
# Upper bound on how long a single fetch of STATUS_URL may stall.
FETCH_TIMEOUT_SECS = 30
# Pause between two polls of STATUS_URL.
POLL_INTERVAL_SECS = 5


def _run_remote(host, remote_command):
    """Run remote_command on host through `become` and return its output.

    The command is handed to `become` as a single argument, exactly as the
    local shell would have passed a double-quoted string, but without going
    through a local shell at all.

    Args:
        host: Machine to run on; the login used is chromeos-test@<host>.
        remote_command: Command line to execute on the remote side.

    Returns:
        The command's stdout as text.

    Raises:
        subprocess.CalledProcessError: if the command exits non-zero.
        OSError: if `become` itself cannot be executed.
    """
    return subprocess.check_output(
        ['become', 'chromeos-test@%s' % host, '--', remote_command],
        universal_newlines=True)


def check_cautotest():
    """Return the workers that have been busy on one request for too long.

    Fetches STATUS_URL and scans the rows of the first <table> on the page.
    A row is reported when column 3 is 'W', column 5 is an integer greater
    than SLOW_RPC_SECS and column 1 is not '-'.
    NOTE(review): presumably these are the PID, mode and seconds-since-start
    columns of an Apache mod_status table -- confirm against the live page.

    Returns:
        A list of (column 1, column 3, column 5) string tuples.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    page = requests.get(STATUS_URL, timeout=FETCH_TIMEOUT_SECS).text
    soup = BeautifulSoup(page)
    pids = []
    for row in soup.table.findAll('tr'):
        cols = [x.text.strip() for x in row.findAll('td')]
        # Skips rows without <td> cells as well as rows too short to index.
        if len(cols) < 6:
            continue
        if (cols[3] == 'W' and int(cols[5]) > SLOW_RPC_SECS
                and cols[1] != '-'):
            pids.append((cols[1], cols[3], cols[5]))
    return pids


def pull_cautotest_info(proc_id):
    """Find the remote end of an established connection matching proc_id.

    Runs `lsof -i` on cautotest, keeps ESTABLISHED lines containing proc_id
    and splits the part after '->' in the 9th field of the output on ':'.
    NOTE(review): `grep -e <pid>` is a substring match, so the first matching
    line may belong to another process or port -- confirm this is acceptable.

    Args:
        proc_id: Process id on cautotest, as a string.

    Returns:
        The list produced by splitting the remote endpoint on ':' (normally
        [host, port]), or None if the lookup or the parsing fails.
    """
    try:
        conn = _run_remote(
            'cautotest',
            'sudo lsof -i | grep -e %s | grep -e ESTABLISHED' % proc_id)
        remote_info = conn.split()[8].split('->')[1].split(':')
    except Exception:
        remote_info = None
    return remote_info


def strace_cautotest(proc_id):
    """Return up to the first 20 lines of strace output for proc_id.

    Args:
        proc_id: Process id on cautotest, as a string.

    Returns:
        The captured output, or "" if the remote command fails or `become`
        cannot be executed.
    """
    try:
        straced = _run_remote(
            'cautotest',
            'sudo strace -s 500 -p %s 2>&1 | head -n 20' % proc_id)
    except (subprocess.CalledProcessError, OSError):
        # OSError is what a missing `become` raises now that no local shell
        # is involved; keep returning "" in that case as before.
        straced = ""
    return straced


def pull_drone_info(host, port):
    """Return the command line of the process on host that is using port.

    Args:
        host: Machine on which to look the connection up.
        port: Port of the connection, as it appears in the lsof output.

    Returns:
        The process's command line with its arguments separated by spaces,
        or '' if any step of the lookup fails.
    """
    try:
        lsof = _run_remote(
            host,
            'sudo lsof -i | grep -e :%s | grep -e ESTABLISHED' % port)
        proc_id = lsof.split()[1]
        cmdline = _run_remote(host, 'cat /proc/%s/cmdline' % proc_id)
        # /proc/<pid>/cmdline separates (and terminates) arguments with NUL
        # bytes; turn them into spaces so the printed line is readable.
        cmdline = cmdline.replace('\0', ' ').strip()
    except Exception:
        cmdline = ''
    return cmdline


def pull_all_data(pid, queue):
    """Gather everything known about one slow worker and put it on queue.

    Args:
        pid: A tuple as produced by check_cautotest(); pid[0] is the process
            id.
        queue: Queue that receives (pid, remote_info, drone_info, straced),
            or None if gathering the data raised an exception.
    """
    try:
        remote_info = pull_cautotest_info(pid[0])
        if remote_info:
            drone_info = pull_drone_info(*remote_info)
        else:
            drone_info = None
        straced = strace_cautotest(pid[0])
        queue.put((pid, remote_info, drone_info, straced))
    except Exception:
        # Always report something so the parent's queue.get() is matched.
        queue.put(None)


def print_data(x):
    """Print one (pid, remote_info, drone_info, straced) result tuple."""
    (pid, remote_info, drone_info, straced) = x
    print("*** %s stuck in %s for %s secs" % pid)
    print(remote_info)
    print(drone_info)
    print(straced)
    # Ring the terminal bell so a new report gets noticed.
    print('\a')


def main():
    """Poll STATUS_URL forever and report every slow worker that shows up."""
    while True:
        try:
            pids = check_cautotest()
        except requests.RequestException as err:
            # A transient fetch failure should not take the watcher down.
            sys.stderr.write('Could not fetch %s: %s\n' % (STATUS_URL, err))
            time.sleep(POLL_INTERVAL_SECS)
            continue
        queue = multiprocessing.Queue()
        processes = []
        for pid in pids:
            proc = multiprocessing.Process(target=pull_all_data,
                                           args=(pid, queue))
            proc.start()
            processes.append(proc)
        # Every worker puts exactly one item, so collect one per worker.
        # NOTE(review): this wait is unbounded; a worker whose remote command
        # never returns would stall the loop here.
        for _ in processes:
            x = queue.get()
            if x:
                print_data(x)
        for proc in processes:
            proc.terminate()
            # Reap the child so finished workers do not linger as zombies.
            proc.join()
        time.sleep(POLL_INTERVAL_SECS)


if __name__ == '__main__':
    main()