#pylint: disable=missing-docstring

"""
Pidfile monitor.

Tracks a process (typically autoserv) through the pidfile it writes, so the
scheduler can tell whether the process is still running and how it exited.
"""

import logging
import time
import traceback
from autotest_lib.client.common_lib import global_config
from autotest_lib.client.common_lib.cros.graphite import autotest_stats
from autotest_lib.scheduler import drone_manager, email_manager
from autotest_lib.scheduler import scheduler_config


def _get_pidfile_timeout_secs():
    """@returns How long to wait for autoserv to write pidfile."""
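    # The timeout is configured in minutes in the scheduler section of the
    # global config.  Illustrative global_config.ini entry (the section name
    # comes from scheduler_config.CONFIG_SECTION; the value shown is made up):
    #
    #   [SCHEDULER]
    #   pidfile_timeout_mins: 5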
    pidfile_timeout_mins = global_config.global_config.get_config_value(
            scheduler_config.CONFIG_SECTION, 'pidfile_timeout_mins', type=int)
    return pidfile_timeout_mins * 60


class PidfileRunMonitor(object):
    """
    Client must call either run() to start a new process or
    attach_to_existing_process().
    """
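    # Illustrative usage sketch; the command, results path, and username below
    # are made-up example values, not taken from the scheduler itself:
    #
    #   monitor = PidfileRunMonitor()
    #   monitor.run(['autoserv', '-r', '/results/123-debug'],
    #               '/results/123-debug', num_processes=1,
    #               username='autotest')
    #   ...
    #   if monitor.exit_code() is not None:        # None while still running
    #       failures = monitor.num_tests_failed()  # -1 if unknown
    #
    # Or attach to a process that was started earlier:
    #
    #   monitor = PidfileRunMonitor()
    #   monitor.attach_to_existing_process('/results/123-debug')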

    class _PidfileException(Exception):
        """
        Raised when there's some unexpected behavior with the pid file, but only
        used internally (never allowed to escape this class).
        """


    def __init__(self):
        self._drone_manager = drone_manager.instance()
        self.lost_process = False
        self._start_time = None
        self.pidfile_id = None
        self._killed = False
        self._state = drone_manager.PidfileContents()


    def _add_nice_command(self, command, nice_level):
        if not nice_level:
            return command
        return ['nice', '-n', str(nice_level)] + command


    def _set_start_time(self):
        self._start_time = time.time()


    def run(self, command, working_directory, num_processes, nice_level=None,
            log_file=None, pidfile_name=None, paired_with_pidfile=None,
            username=None, drone_hostnames_allowed=None):
        assert command is not None
        if nice_level is not None:
            command = ['nice', '-n', str(nice_level)] + command
        self._set_start_time()
        self.pidfile_id = self._drone_manager.execute_command(
            command, working_directory, pidfile_name=pidfile_name,
            num_processes=num_processes, log_file=log_file,
            paired_with_pidfile=paired_with_pidfile, username=username,
            drone_hostnames_allowed=drone_hostnames_allowed)


    def attach_to_existing_process(self, execution_path,
                                   pidfile_name=drone_manager.AUTOSERV_PID_FILE,
                                   num_processes=None):
        self._set_start_time()
        self.pidfile_id = self._drone_manager.get_pidfile_id_from(
            execution_path, pidfile_name=pidfile_name)
        if num_processes is not None:
            self._drone_manager.declare_process_count(
                self.pidfile_id, num_processes)


    def kill(self):
        if self.has_process():
            self._drone_manager.kill_process(self.get_process())
            self._killed = True


    def has_process(self):
        self._get_pidfile_info()
        return self._state.process is not None


    def get_process(self):
        self._get_pidfile_info()
        assert self._state.process is not None
        return self._state.process


    def _read_pidfile(self, use_second_read=False):
        assert self.pidfile_id is not None, (
            'You must call run() or attach_to_existing_process()')
        contents = self._drone_manager.get_pidfile_contents(
            self.pidfile_id, use_second_read=use_second_read)
        if contents.is_invalid():
            self._state = drone_manager.PidfileContents()
            raise self._PidfileException(contents)
        self._state = contents


    def _handle_pidfile_error(self, error, message=''):
        metadata = {'_type': 'scheduler_error',
                    'error': 'autoserv died without writing exit code',
                    'process': str(self._state.process),
                    'pidfile_id': str(self.pidfile_id)}
        autotest_stats.Counter('autoserv_died_without_writing_exit_code',
                               metadata=metadata).increment()
        self.on_lost_process(self._state.process)


    def _get_pidfile_info_helper(self):
        if self.lost_process:
            return

        self._read_pidfile()

        if self._state.process is None:
            self._handle_no_process()
            return

        if self._state.exit_status is None:
            # double check whether or not autoserv is running
            if self._drone_manager.is_process_running(self._state.process):
                return

            # pid but no running process - maybe process *just* exited
            self._read_pidfile(use_second_read=True)
            if self._state.exit_status is None:
                # autoserv exited without writing an exit code
                # to the pidfile
                self._handle_pidfile_error(
                    'autoserv died without writing exit code')


    def _get_pidfile_info(self):
        """\
        After completion, self._state will contain:
         pid=None, exit_status=None if autoserv has not yet run
         pid!=None, exit_status=None if autoserv is running
         pid!=None, exit_status!=None if autoserv has completed
        """
        try:
            self._get_pidfile_info_helper()
        except self._PidfileException:
            self._handle_pidfile_error('Pidfile error', traceback.format_exc())


    def _handle_no_process(self):
        """\
        Called when no pidfile is found or no pid is in the pidfile.
        """
        message = 'No pid found at %s' % self.pidfile_id
        if time.time() - self._start_time > _get_pidfile_timeout_secs():
            # If we aborted the process, and we find that it has exited without
            # writing a pidfile, then it's because we killed it, and thus this
            # isn't a surprising situation.
            if not self._killed:
                email_manager.manager.enqueue_notify_email(
                    'Process has failed to write pidfile', message)
            else:
                logging.warning("%s didn't exit after SIGTERM", self.pidfile_id)
            self.on_lost_process()


    def on_lost_process(self, process=None):
        """\
        Called when autoserv has exited without writing an exit status,
        or we've timed out waiting for autoserv to write a pid to the
        pidfile.  In either case, we just return failure and the caller
        should signal some kind of warning.

        process is unimportant here, as it shouldn't be used by anyone.
        """
        self.lost_process = True
        self._state.process = process
        self._state.exit_status = 1
        self._state.num_tests_failed = 0


    def exit_code(self):
        self._get_pidfile_info()
        return self._state.exit_status


    def num_tests_failed(self):
        """@returns The number of tests that failed or -1 if unknown."""
        self._get_pidfile_info()
        if self._state.num_tests_failed is None:
            return -1
        return self._state.num_tests_failed


    def try_copy_results_on_drone(self, **kwargs):
        if self.has_process():
            # copy results logs into the normal place for job results
            self._drone_manager.copy_results_on_drone(
                self.get_process(), **kwargs)


    def try_copy_to_results_repository(self, source, **kwargs):
        if self.has_process():
            self._drone_manager.copy_to_results_repository(
                self.get_process(), source, **kwargs)