#pylint: disable-msg=C0111

"""
Pidfile monitor.
"""

import logging
import time
import traceback

from autotest_lib.client.common_lib import global_config
from autotest_lib.client.common_lib.cros.graphite import autotest_stats
from autotest_lib.scheduler import drone_manager, email_manager
from autotest_lib.scheduler import scheduler_config


def _get_pidfile_timeout_secs():
    """@returns How long to wait for autoserv to write pidfile."""
    pidfile_timeout_mins = global_config.global_config.get_config_value(
            scheduler_config.CONFIG_SECTION, 'pidfile_timeout_mins', type=int)
    return pidfile_timeout_mins * 60


class PidfileRunMonitor(object):
    """
    Client must call either run() to start a new process or
    attach_to_existing_process().
    """

    class _PidfileException(Exception):
        """
        Raised when there's some unexpected behavior with the pid file, but
        only used internally (never allowed to escape this class).
        """


    def __init__(self):
        self._drone_manager = drone_manager.instance()
        self.lost_process = False
        self._start_time = None
        self.pidfile_id = None
        self._killed = False
        self._state = drone_manager.PidfileContents()


    def _add_nice_command(self, command, nice_level):
        if not nice_level:
            return command
        return ['nice', '-n', str(nice_level)] + command


    def _set_start_time(self):
        self._start_time = time.time()


    def run(self, command, working_directory, num_processes, nice_level=None,
            log_file=None, pidfile_name=None, paired_with_pidfile=None,
            username=None, drone_hostnames_allowed=None):
        """Start a new process through the drone manager and remember its
        pidfile id."""
        assert command is not None
        if nice_level is not None:
            command = ['nice', '-n', str(nice_level)] + command
        self._set_start_time()
        self.pidfile_id = self._drone_manager.execute_command(
            command, working_directory, pidfile_name=pidfile_name,
            num_processes=num_processes, log_file=log_file,
            paired_with_pidfile=paired_with_pidfile, username=username,
            drone_hostnames_allowed=drone_hostnames_allowed)


    def attach_to_existing_process(self, execution_path,
                                   pidfile_name=drone_manager.AUTOSERV_PID_FILE,
                                   num_processes=None):
        """Attach to a previously started process by locating its pidfile
        under execution_path."""
        self._set_start_time()
        self.pidfile_id = self._drone_manager.get_pidfile_id_from(
            execution_path, pidfile_name=pidfile_name)
        if num_processes is not None:
            self._drone_manager.declare_process_count(self.pidfile_id,
                                                      num_processes)


    def kill(self):
        """Kill the monitored process if it is still running, and remember
        that we killed it."""
        if self.has_process():
            self._drone_manager.kill_process(self.get_process())
            self._killed = True


    def has_process(self):
        """@returns Whether the pidfile records a process."""
        self._get_pidfile_info()
        return self._state.process is not None


    def get_process(self):
        """@returns The process recorded in the pidfile; it must exist."""
        self._get_pidfile_info()
        assert self._state.process is not None
        return self._state.process


    def _read_pidfile(self, use_second_read=False):
        assert self.pidfile_id is not None, (
            'You must call run() or attach_to_existing_process()')
        contents = self._drone_manager.get_pidfile_contents(
            self.pidfile_id, use_second_read=use_second_read)
        if contents.is_invalid():
            self._state = drone_manager.PidfileContents()
            raise self._PidfileException(contents)
        self._state = contents


    def _handle_pidfile_error(self, error, message=''):
        # error and message are accepted for the callers' convenience but are
        # not included in the reported metadata.
        metadata = {'_type': 'scheduler_error',
                    'error': 'autoserv died without writing exit code',
                    'process': str(self._state.process),
                    'pidfile_id': str(self.pidfile_id)}
        autotest_stats.Counter('autoserv_died_without_writing_exit_code',
                               metadata=metadata).increment()
        self.on_lost_process(self._state.process)


    def _get_pidfile_info_helper(self):
        if self.lost_process:
            return

        self._read_pidfile()

        if self._state.process is None:
            self._handle_no_process()
            return

        if self._state.exit_status is None:
            # double check whether or not autoserv is running
            if self._drone_manager.is_process_running(self._state.process):
                return

            # pid but no running process - maybe process *just* exited
            self._read_pidfile(use_second_read=True)
            if self._state.exit_status is None:
                # autoserv exited without writing an exit code
                # to the pidfile
                self._handle_pidfile_error(
                    'autoserv died without writing exit code')


    def _get_pidfile_info(self):
        """\
        After completion, self._state will contain:
         pid=None, exit_status=None if autoserv has not yet run
         pid!=None, exit_status=None if autoserv is running
         pid!=None, exit_status!=None if autoserv has completed
        """
        try:
            self._get_pidfile_info_helper()
        except self._PidfileException:
            self._handle_pidfile_error('Pidfile error', traceback.format_exc())


    def _handle_no_process(self):
        """\
        Called when no pidfile is found or no pid is in the pidfile.
        """
        message = 'No pid found at %s' % self.pidfile_id
        if time.time() - self._start_time > _get_pidfile_timeout_secs():
            # If we aborted the process, and we find that it has exited
            # without writing a pidfile, then it's because we killed it, and
            # thus this isn't a surprising situation.
            if not self._killed:
                email_manager.manager.enqueue_notify_email(
                    'Process has failed to write pidfile', message)
            else:
                logging.warning("%s didn't exit after SIGTERM",
                                self.pidfile_id)
            self.on_lost_process()


    def on_lost_process(self, process=None):
        """\
        Called when autoserv has exited without writing an exit status,
        or we've timed out waiting for autoserv to write a pid to the
        pidfile. In either case, we just return failure and the caller
        should signal some kind of warning.

        process is unimportant here, as it shouldn't be used by anyone.
        """
        self.lost_process = True
        self._state.process = process
        self._state.exit_status = 1
        self._state.num_tests_failed = 0


    def exit_code(self):
        """@returns The process's exit status, or None if it has not yet
        finished (or has not yet started)."""
        self._get_pidfile_info()
        return self._state.exit_status


    def num_tests_failed(self):
        """@returns The number of tests that failed or -1 if unknown."""
        self._get_pidfile_info()
        if self._state.num_tests_failed is None:
            return -1
        return self._state.num_tests_failed


    def try_copy_results_on_drone(self, **kwargs):
        if self.has_process():
            # copy results logs into the normal place for job results
            self._drone_manager.copy_results_on_drone(self.get_process(),
                                                      **kwargs)


    def try_copy_to_results_repository(self, source, **kwargs):
        if self.has_process():
            self._drone_manager.copy_to_results_repository(self.get_process(),
                                                           source, **kwargs)
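

# ---------------------------------------------------------------------------
# Illustrative usage sketch.  This helper is not part of the scheduler and is
# never called; it only demonstrates the client pattern described in the
# PidfileRunMonitor docstring: attach to (or start) an autoserv process, then
# poll exit_code() until the pidfile reports completion.  It assumes the drone
# manager has already been initialized by the running scheduler; the execution
# path and poll interval below are hypothetical placeholders.
# ---------------------------------------------------------------------------
def _example_poll_monitor():
    monitor = PidfileRunMonitor()
    # Reattach to an autoserv process by the results directory it writes to.
    monitor.attach_to_existing_process('hosts/host1/1-debug_user')
    while monitor.exit_code() is None:
        # exit_code() re-reads the pidfile on every call, so polling is cheap.
        time.sleep(5)
    if monitor.lost_process:
        # The process vanished or timed out before writing an exit status;
        # exit_code() reports failure (1) in that case.
        logging.warning('autoserv was lost; treating the run as failed')
    logging.info('autoserv exited with %s; %d test(s) failed',
                 monitor.exit_code(), monitor.num_tests_failed())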