1# Lint as: python2, python3
2# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6import collections
7import six.moves.http_client
8import logging
9import os
10import re
11import socket
12import time
13
14import common
15
16from autotest_lib.client.common_lib import error
17from autotest_lib.client.common_lib.cros import retry
18from autotest_lib.server import utils
19
20
21GOOFY_JSONRPC_SERVER_PORT = 0x0FAC
22GOOFY_RUNNING = 'RUNNING'
23
24
25class GoofyProxyException(Exception):
26    """Exception raised when a goofy rpc fails."""
27    pass
28
29
30class GoofyRuntimeException(Exception):
31    """Exception raised when something goes wrong while a test is running."""
32    pass
33
34
35def retry_goofy_rpc(exception_tuple, timeout_min=30):
36    """A decorator to use with goofy rpcs.
37
38    This decorator tries to recreate the goofy client proxy on
39    socket error. It will continue trying to do so until it
40    executes the method without any socket errors or till the
41    retry.retry decorator hits it's timeout.
42
43    Usage:
44        If you just want to recreate the proxy:
45        1. @retry_goofy_rpc(exception_tuple=(<exception>, socket.error),
46                            timeout_min=<timeout>)
47        2. @retry_goofy_rpc(socket.error, timeout_min=<timeout>)
48            Note: you need to specify the socket.error exception because we
49            want to retry the call after recreating the proxy.
50
51    @param exception_tuple: A tuple of exceptions to pass to
52        the retry decorator. Any of these exceptions will result
53        in retries for the duration of timeout_min.
54    @param timeout_min: The timeout, in minutes, for which we should
55        retry the method ignoring any exception in exception_tuple.
56    """
57    def inner_decorator(method):
58        """Inner retry decorator applied to the method.
59
60        @param method: The method that needs to be wrapped in the decorator.
61
62        @return A wrapper function that implements the retry.
63        """
64
65        @retry.retry(exception_tuple, timeout_min=timeout_min)
66        def wrapper(*args, **kwargs):
67            """This wrapper handles socket errors.
68
69            If the method in question:
70            1. Throws an exception in exception_tuple and it is not a
71               socket.error, retry for timeout_min through retry.retry.
72            2. Throws a socket.error, recreate the client proxy, and
73               retry for timeout_min through retry.retry.
74            3. Throws an exception not in exception_tuple, fail.
75            """
76            try:
77                return method(*args, **kwargs)
78            except socket.error as e:
79                goofy_proxy = args[0]
80                if type(goofy_proxy) is GoofyProxy:
81                    logging.warning('Socket error while running factory tests '
82                                    '%s, recreating goofy proxy.', e)
83                    goofy_proxy._create_client_proxy(timeout_min=timeout_min)
84                else:
85                    logging.warning('Connectivity was lost and the retry '
86                                    'decorator was unable to recreate a goofy '
87                                    'client proxy, args: %s.', args)
88                raise
89
90        return wrapper
91
92    return inner_decorator
93
94
95class GoofyProxy(object):
96    """Client capable of making rpc calls to goofy.
97
98    Methods of this class that can cause goofy to change state
99    usually need a retry decorator. Methods that have a retry decorator
100    need to be 'pure', i.e return the same results when called multiple
101    times with the same argument.
102
103    There are 2 known exceptions this class can deal with, a socket.error
104    which happens when we try to execute an rpc when the DUT is, say, suspended
105    and a BadStatusLine, which we get when we try to execute an rpc while the
106    DUT is going through a factory_restart. Ideally we would like to handle
107    socket timeouts different from BadStatusLines as we can get connection
108    errors even when a device reboots and BadStatusLines ususally only when
109    factory restarts. crbug.com/281714.
110    """
111
112    # This timeout was arbitrarily chosen as many tests in the factory test
113    # suite run for days. Ideally we would like to split this into at least 2
114    # timeouts, one which we use for rpcs that run while no other test is,
115    # running and is smaller than the second that is designed for use with rpcs
116    # that might execute simultaneously with a test. The latter needs a longer
117    # timeout since tests could suspend,resume for a long time, and a call like
118    # GetGoofyStatus should be tolerant to these suspend/resumes. In designing
119    # the base timeout one needs to allocate time to component methods of this
120    # class (such as _set_test_list) as a multiple of the number of rpcs it
121    # executes.
122    BASE_RPC_TIMEOUT = 1440
123    POLLING_INTERVAL = 5
124    FACTORY_BUG_RE = r'.*(/tmp/factory_bug.*tar.bz2).*'
125    UNTAR_COMMAND = 'tar jxf %s -C %s'
126
127
128    def __init__(self, host):
129        """
130        @param host: The host object representing the DUT running goofy.
131        """
132        self._host = host
133        self._raw_stop_running_tests()
134        self._create_client_proxy(timeout_min=self.BASE_RPC_TIMEOUT)
135
136
137    def _create_client_proxy(self, timeout_min=30):
138        """Create a goofy client proxy.
139
140        Ping the host till it's up, then proceed to create a goofy proxy. We
141        don't wrap this method with a retry because it's used in the retry
142        decorator itself.
143        """
144
145        # We don't ssh ping here as there is a potential dealy in O(minutes)
146        # with our ssh command against a sleeping DUT, once it wakes up, and
147        # that will lead to significant overhead incurred over many reboots.
148        self._host.ping_wait_up(timeout_min)
149        logging.info('Host is pingable, creating goofy client proxy')
150        self._client = self._host.rpc_server_tracker.jsonrpc_connect(
151                GOOFY_JSONRPC_SERVER_PORT)
152
153
154    @retry.retry((six.moves.http_client.BadStatusLine, socket.error),
155                 timeout_min=BASE_RPC_TIMEOUT)
156    def _raw_stop_running_tests(self):
157        """Stop running tests by shelling out to the DUT.
158
159        Use this method only before we have actually created the client
160        proxy, as shelling out has several pitfalls. We need to stop all
161        tests in a retry loop because tests will start executing as soon
162        as we have reimaged a DUT and trying to create the proxy while
163        the DUT is rebooting will lead to a spurious failure.
164
165        Note that we use the plain retry decorator for this method since
166        we don't need to recreate the client proxy on failure.
167        """
168        logging.info('Stopping all tests and clearing factory state')
169        self._host.run('factory clear')
170
171
172    @retry_goofy_rpc((six.moves.http_client.BadStatusLine, socket.error),
173                     timeout_min=BASE_RPC_TIMEOUT)
174    def _get_goofy_status(self):
175        """Return status of goofy, ignoring socket timeouts and http exceptions.
176        """
177        status = self._client.GetGoofyStatus().get('status')
178        return status
179
180
181    def _wait_for_goofy(self, timeout_min=BASE_RPC_TIMEOUT*2):
182        """Wait till goofy is running or a timeout occurs.
183
184        @param timeout_min: Minutes to wait before timing this call out.
185        """
186        current_time = time.time()
187        timeout_secs = timeout_min * 60
188        logging.info('Waiting on goofy')
189        while self._get_goofy_status() != GOOFY_RUNNING:
190            if time.time() - current_time > timeout_secs:
191                break
192        return
193
194
195    @retry_goofy_rpc(socket.error, timeout_min=BASE_RPC_TIMEOUT*2)
196    def _set_test_list(self, next_list):
197        """Set the given test list for execution and turn on test automation.
198
199        Confirm that the given test list is a test that has been baked into
200        the image, then run it. Some test lists are configured to start
201        execution automatically when we call SetTestList, while others wait
202        for a corresponding RunTest.
203
204        @param next_list: The name of the test list.
205
206        @raise jsonrpclib.ProtocolError: If the test list we're trying to switch
207                                         to isn't on the DUT.
208        """
209
210        # We can get a BadStatus line on 2 occassions:
211        # 1. As part of SwitchTestList goofy performs a factory restart, which
212        # will throw a BadStatusLine because the rpc can't exit cleanly. We
213        # don't want to retry on this exception, since we've already set the
214        # right test list.
215        # 2. If we try to set a test list while goofy is already down
216        # (from a previous factory restart). In this case we wouldn't have
217        # set the new test list, because we coulnd't connect to goofy.
218        # To properly set a new test list it's important to wait till goofy is
219        # up before attempting to set the test list, while being aware that the
220        # preceding httplib error is from the rpc we just executed leading to
221        # a factory restart. Also note that if the test list is not already on
222        # the DUT this method will fail, emitting the possible test lists one
223        # can switch to.
224        self._wait_for_goofy()
225        logging.info('Switching to test list %s', next_list)
226        try:
227            # Enable full factory test automation. Full test automation mode
228            # skips all manual tests and test barriers, which is what we want in
229            # the test lab. There are other automation modes: partial and none.
230            # In partial automation mode manual tests and barrier are enabled
231            # and user intervention is required; none disables automation.
232            self._client.SwitchTestList(next_list, 'full')
233        except six.moves.http_client.BadStatusLine:
234            logging.info('Switched to list %s, goofy restarting', next_list)
235
236
237    @retry_goofy_rpc((six.moves.http_client.BadStatusLine, socket.error),
238                     timeout_min=BASE_RPC_TIMEOUT*2)
239    def _stop_running_tests(self):
240        """Stop all running tests.
241
242        Wrap the StopTest rpc so we can attempt to stop tests even while a DUT
243        is suspended or rebooting.
244        """
245        logging.info('Stopping tests.')
246        self._client.StopTest()
247
248
249    def _get_test_map(self):
250        """Get a mapping of test suites -> tests.
251
252        Ignore entries for tests that don't have a path.
253
254        @return: A dictionary of the form
255                 {'suite_name': ['suite_name.path_to_test', ...]}.
256        """
257        test_all = set([test['path'] for test in self._client.GetTests()
258                        if test.get('path')])
259
260        test_map = collections.defaultdict(list)
261        for names in test_all:
262            test_map[names.split('.')[0]].append(names)
263        return test_map
264
265
266    def _log_test_results(self, test_status, current_suite):
267        """Format test status results and write them to status.log.
268
269        @param test_status: The status dictionary of a single test.
270        @param current_suite: The current suite name.
271        """
272        try:
273            self._host.job.record('INFO', None, None,
274                                  'suite %s, test %s, status: %s' %
275                                  (current_suite, test_status.get('path'),
276                                   test_status.get('status')))
277        except AttributeError as e:
278            logging.error('Could not gather results for current test: %s', e)
279
280
281    @retry_goofy_rpc((six.moves.http_client.BadStatusLine, socket.error),
282                     timeout_min=BASE_RPC_TIMEOUT*2)
283    def _get_test_info(self, test_name):
284        """Get the status of one test.
285
286        @param test_name: The name of the test we need the status of.
287
288        @return: The entry for the test in the status dictionary.
289        """
290        for test in self._client.GetTests():
291            if test['path'] == test_name:
292                return test
293        raise ValueError('Could not find test_name %s in _get_test_info.' %
294                          test_name)
295
296
297    @retry_goofy_rpc((six.moves.http_client.BadStatusLine, socket.error),
298                     timeout_min=BASE_RPC_TIMEOUT*2)
299    def _get_test_run_info(self, run_id):
300        """Get the information about the given test run.
301
302        @param run_id: The ID of the test run.
303
304        @return: A dict of test run status.
305        """
306        return self._client.GetTestRunStatus(run_id)
307
308
309    def _wait_on_run(self, run_id):
310        """Wait until the given test run to end.
311
312        @param run_id: The ID of the test run.
313
314        @raises GoofyRuntimeException: If the test run does not finish
315            gracefully.
316        """
317        finished_tests = set()
318        run_info = self._get_test_run_info(run_id)
319        while run_info['status'] == 'RUNNING':
320            finished = [(t['path'], t['status']) for t in
321                        run_info['scheduled_tests']
322                        if t['status'] in ('PASSED', 'FAILED')]
323            for t in finished:
324                if t not in finished_tests:
325                    logging.info('[%s] %s', t[1], t[0])
326                    finished_tests.add(t)
327            time.sleep(self.POLLING_INTERVAL)
328            run_info = self._get_test_run_info(run_id)
329        if run_info['status'] != 'FINISHED':
330            raise GoofyRuntimeException(
331                    'The requested test run was interrupted.')
332
333
334    def _synchronous_run_suite(self, suite_name):
335        """Run one suite and wait for it to finish.
336
337        Will start a test run for the specified suite_name and wait until it
338        ends.
339
340        @param suite_name: The name of the suite to wait for.
341
342        @raises GoofyProxyException: If the status of the suite
343            doesn't switch to active after we call RunTest.
344
345        @return: The result of the suite.
346        """
347        logging.info('Starting suite: %s', suite_name)
348        run_id = self._client.RunTest(suite_name)
349        self._wait_on_run(run_id)
350        return self._get_test_run_info(run_id)
351
352
353    def monitor_tests(self, test_list):
354        """Run a test list.
355
356        Will run each suite in the given list in sequence, starting each one
357        by name and waiting on its results. This method makes the following
358        assumptions:
359            - A test list is made up of self contained suites.
360            - These suites trigger several things in parallel.
361            - After a suite finishes it leaves goofy in an idle state.
362
363        It is not safe to pull results for individual tests during the suite
364        as the device could be rebooting, or goofy could be under stress.
365        Instead, this method synchronously waits on an entire suite, then
366        asks goofy for the status of each test in the suite. Since certain
367        test lists automatically start and others don't, this method stops
368        test list execution regardless, and sequentially triggers each suite.
369
370        @param test_list: The test list to run.
371        """
372        self._set_test_list(test_list)
373        self._wait_for_goofy()
374        self._stop_running_tests()
375
376        test_map = self._get_test_map()
377        if test_map:
378            logging.info('About to execute tests: %s', test_map)
379        else:
380            raise GoofyRuntimeException('Test map is empty, you might have an '
381                                        'error in your test_list.')
382
383
384        for current_suite in test_map.keys():
385            logging.info('Processing suite %s', current_suite)
386
387            result = self._synchronous_run_suite(current_suite)
388            logging.info(result)
389
390            for test_names in test_map.get(current_suite):
391                self._log_test_results(self._get_test_info(test_names),
392                                       current_suite)
393
394
395    @retry_goofy_rpc((six.moves.http_client.BadStatusLine, socket.error),
396                     timeout_min=BASE_RPC_TIMEOUT*2)
397    def get_results(self, resultsdir):
398        """Copies results from the DUT to a local results directory.
399
400        Copy the tarball over to the results folder, untar, and delete the
401        tarball if everything was successful. This will effectively place
402        all the logs relevant to factory testing in the job's results folder.
403
404        @param resultsdir: The directory in which to untar the contents of the
405                           tarball factory_bug generates.
406        """
407        logging.info('Getting results logs for test_list.')
408
409        try:
410            factory_bug_log = self._host.run('factory_bug').stderr
411        except error.CmdError as e:
412            logging.error('Could not execute factory_bug: %s', e)
413            return
414
415        try:
416            factory_bug_tar = re.match(self.FACTORY_BUG_RE,
417                                       factory_bug_log).groups(1)[0]
418        except (IndexError, AttributeError):
419            logging.error('could not collect logs for factory results, '
420                          'factory bug returned %s', factory_bug_log)
421            return
422
423        factory_bug_tar_file = os.path.basename(factory_bug_tar)
424        local_factory_bug_tar = os.path.join(resultsdir, factory_bug_tar_file)
425
426        try:
427            self._host.get_file(factory_bug_tar, local_factory_bug_tar)
428        except error.AutoservRunError as e:
429            logging.error('Failed to pull back the results tarball: %s', e)
430            return
431
432        try:
433            utils.run(self.UNTAR_COMMAND % (local_factory_bug_tar, resultsdir))
434        except error.CmdError as e:
435            logging.error('Failed to untar the results tarball: %s', e)
436            return
437        finally:
438            if os.path.exists(local_factory_bug_tar):
439                os.remove(local_factory_bug_tar)
440