1#pylint: disable-msg=C0111
2
3"""
4Pidfile monitor.
5"""
6
7import logging
8import time
9import traceback
10
11import common
12
13from autotest_lib.client.common_lib import utils
14from autotest_lib.client.common_lib import global_config
15from autotest_lib.scheduler import drone_manager
16from autotest_lib.scheduler import scheduler_config
17
18try:
19    from chromite.lib import metrics
20except ImportError:
21    metrics = utils.metrics_mock
22
23
def _get_pidfile_timeout_secs():
    """
    Look up the pidfile timeout and convert it from minutes to seconds.

    The integer option 'pidfile_timeout_mins' is read from the
    scheduler_config.CONFIG_SECTION section of the global config on every
    call.

    @returns How long to wait for autoserv to write pidfile, in seconds.
    """
    config = global_config.global_config
    timeout_mins = config.get_config_value(
            scheduler_config.CONFIG_SECTION, 'pidfile_timeout_mins', type=int)
    return 60 * timeout_mins
29
30
class PidfileRunMonitor(object):
    """
    Monitors a process through the pidfile reported by the drone manager.

    Client must call either run() to start a new process or
    attach_to_existing_process().
    """

    class _PidfileException(Exception):
        """
        Raised when there's some unexpected behavior with the pid file, but only
        used internally (never allowed to escape this class).
        """


    def __init__(self):
        self._drone_manager = drone_manager.instance()
        # Becomes True in on_lost_process(); once set, the pidfile is no
        # longer re-read and the synthesized failure state is kept.
        self.lost_process = False
        # Wall-clock time recorded by run() / attach_to_existing_process(),
        # used to decide when waiting for a pid has timed out.
        self._start_time = None
        # Identifier handed back by the drone manager for the pidfile.
        self.pidfile_id = None
        # Set by kill() after a kill request has been issued.
        self._killed = False
        # Most recent pidfile contents (process, exit_status,
        # num_tests_failed).
        self._state = drone_manager.PidfileContents()


    def _add_nice_command(self, command, nice_level):
        """
        Prefix a command with 'nice -n <nice_level>'.

        @param command: The command as a list of arguments.
        @param nice_level: The niceness to apply.  Any falsy value (None or
                0) leaves the command unchanged.

        @returns The command list, possibly prefixed with the nice invocation.
        """
        if not nice_level:
            return command
        return ['nice', '-n', str(nice_level)] + command


    def _set_start_time(self):
        """Record the current time as the start of monitoring."""
        self._start_time = time.time()


    def run(self, command, working_directory, num_processes, nice_level=None,
            log_file=None, pidfile_name=None, paired_with_pidfile=None,
            username=None, drone_hostnames_allowed=None):
        """
        Start a new process through the drone manager and monitor it.

        @param command: The command as a list of arguments; must not be None.
        @param working_directory: Passed through to the drone manager.
        @param num_processes: Passed through to the drone manager.
        @param nice_level: If not None, the command is run under
                'nice -n <nice_level>'.
        @param log_file: Passed through to the drone manager.
        @param pidfile_name: Passed through to the drone manager.
        @param paired_with_pidfile: Passed through to the drone manager.
        @param username: Passed through to the drone manager.
        @param drone_hostnames_allowed: Passed through to the drone manager.
        """
        assert command is not None
        # Note: unlike _add_nice_command(), an explicit nice_level of 0 still
        # results in a 'nice -n 0' prefix here.
        if nice_level is not None:
            command = ['nice', '-n', str(nice_level)] + command
        self._set_start_time()
        self.pidfile_id = self._drone_manager.execute_command(
            command, working_directory, pidfile_name=pidfile_name,
            num_processes=num_processes, log_file=log_file,
            paired_with_pidfile=paired_with_pidfile, username=username,
            drone_hostnames_allowed=drone_hostnames_allowed)


    def attach_to_existing_process(self, execution_path,
                                   pidfile_name=drone_manager.AUTOSERV_PID_FILE,
                                   num_processes=None):
        """
        Monitor a process that was started elsewhere.

        @param execution_path: Path used by the drone manager to locate the
                pidfile.
        @param pidfile_name: Name of the pidfile to look for.
        @param num_processes: If not None, declared to the drone manager as
                the process count for this pidfile.
        """
        self._set_start_time()
        self.pidfile_id = self._drone_manager.get_pidfile_id_from(
            execution_path, pidfile_name=pidfile_name)
        if num_processes is not None:
            self._drone_manager.declare_process_count(self.pidfile_id,
                                                      num_processes)


    def kill(self):
        """Ask the drone manager to kill the process, if one is known."""
        if self.has_process():
            self._drone_manager.kill_process(self.get_process())
            self._killed = True


    def has_process(self):
        """@returns True if, after refreshing state, a process is known."""
        self._get_pidfile_info()
        return self._state.process is not None


    def get_process(self):
        """
        @returns The process recorded in the pidfile state.  Asserts that one
                exists, so callers should check has_process() first.
        """
        self._get_pidfile_info()
        assert self._state.process is not None
        return self._state.process


    def _read_pidfile(self, use_second_read=False):
        """
        Fetch the pidfile contents from the drone manager into self._state.

        @param use_second_read: Passed through to the drone manager's
                get_pidfile_contents().

        @raises _PidfileException: If the contents report themselves invalid;
                self._state is reset to empty contents first.
        """
        assert self.pidfile_id is not None, (
            'You must call run() or attach_to_existing_process()')
        contents = self._drone_manager.get_pidfile_contents(
            self.pidfile_id, use_second_read=use_second_read)
        if contents.is_invalid():
            self._state = drone_manager.PidfileContents()
            raise self._PidfileException(contents)
        self._state = contents


    def _handle_pidfile_error(self, error, message=''):
        """
        Log a pidfile problem and mark the process as lost.

        @param error: Short description of what went wrong.
        @param message: Optional extra detail, e.g. a formatted traceback.
        """
        # Only reached once per monitor: after on_lost_process() the helper
        # returns early, so this does not flood the log.
        if message:
            logging.warning('%s for pidfile %s:\n%s',
                            error, self.pidfile_id, message)
        else:
            logging.warning('%s for pidfile %s', error, self.pidfile_id)
        self.on_lost_process(self._state.process)


    def _get_pidfile_info_helper(self):
        """
        Refresh self._state from the pidfile.

        Does nothing once the process has been declared lost.

        @raises _PidfileException: If the pidfile contents are invalid.
        """
        if self.lost_process:
            return

        self._read_pidfile()

        if self._state.process is None:
            self._handle_no_process()
            return

        if self._state.exit_status is None:
            # double check whether or not autoserv is running
            if self._drone_manager.is_process_running(self._state.process):
                return

            # pid but no running process - maybe process *just* exited
            self._read_pidfile(use_second_read=True)
            if self._state.exit_status is None:
                # autoserv exited without writing an exit code
                # to the pidfile
                self._handle_pidfile_error(
                    'autoserv died without writing exit code')


    def _get_pidfile_info(self):
        """\
        After completion, self._state will contain:
         pid=None, exit_status=None if autoserv has not yet run
         pid!=None, exit_status=None if autoserv is running
         pid!=None, exit_status!=None if autoserv has completed
        """
        try:
            self._get_pidfile_info_helper()
        except self._PidfileException:
            # 'except ... as' / bare form is valid on both Python 2.6+ and 3;
            # the exception object itself is not needed, only its traceback.
            self._handle_pidfile_error('Pidfile error', traceback.format_exc())


    def _handle_no_process(self):
        """\
        Called when no pidfile is found or no pid is in the pidfile.

        Once the configured timeout has elapsed since the start time, the
        process is declared lost.
        """
        if time.time() - self._start_time > _get_pidfile_timeout_secs():
            # If we aborted the process, and we find that it has exited without
            # writing a pidfile, then it's because we killed it, and thus this
            # isn't a surprising situation.
            if not self._killed:
                metrics.Counter('chromeos/autotest/errors/scheduler/no_pidfile'
                                ).increment()
            else:
                logging.warning("%s didn't exit after SIGTERM", self.pidfile_id)
            self.on_lost_process()


    def on_lost_process(self, process=None):
        """\
        Called when autoserv has exited without writing an exit status,
        or we've timed out waiting for autoserv to write a pid to the
        pidfile.  In either case, we just return failure and the caller
        should signal some kind of warning.

        process is unimportant here, as it shouldn't be used by anyone.
        """
        self.lost_process = True
        self._state.process = process
        self._state.exit_status = 1
        self._state.num_tests_failed = 0


    def exit_code(self):
        """
        @returns The exit status from the refreshed pidfile state; None while
                no exit status has been recorded.
        """
        self._get_pidfile_info()
        return self._state.exit_status


    def num_tests_failed(self):
        """@returns The number of tests that failed or -1 if unknown."""
        self._get_pidfile_info()
        if self._state.num_tests_failed is None:
            return -1
        return self._state.num_tests_failed


    def try_copy_results_on_drone(self, **kwargs):
        """
        Copy results on the drone if a process is known; otherwise do nothing.

        @param kwargs: Passed through to the drone manager's
                copy_results_on_drone().
        """
        if self.has_process():
            # copy results logs into the normal place for job results
            self._drone_manager.copy_results_on_drone(self.get_process(),
                                                      **kwargs)


    def try_copy_to_results_repository(self, source, **kwargs):
        """
        Copy to the results repository if a process is known; otherwise do
        nothing.

        @param source: Passed through to the drone manager.
        @param kwargs: Passed through to the drone manager's
                copy_to_results_repository().
        """
        if self.has_process():
            self._drone_manager.copy_to_results_repository(self.get_process(),
                                                           source, **kwargs)
210
211