1# Lint as: python2, python3
2# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5#
6# Expects to be run in an environment with sudo and no interactive password
7# prompt, such as within the Chromium OS development chroot.
8
9
10"""This file provides core logic for servo verify/repair process."""
11
12
13from __future__ import absolute_import
14from __future__ import division
15from __future__ import print_function
16
17import logging
18import os
19import re
20import tarfile
21import threading
22import json
23import time
24import six
25import six.moves.xmlrpc_client
26import calendar
27
28from autotest_lib.client.bin import utils
29from autotest_lib.client.common_lib import error
30from autotest_lib.client.common_lib import hosts
31from autotest_lib.client.common_lib import lsbrelease_utils
32from autotest_lib.client.common_lib.cros import retry
33from autotest_lib.client.common_lib.cros.network import ping_runner
34from autotest_lib.server.cros.servo import servo
35from autotest_lib.server.hosts import servo_repair
36from autotest_lib.server.hosts import base_servohost
37from autotest_lib.server.hosts import servo_constants
38from autotest_lib.server.cros.faft.utils import config
39from autotest_lib.client.common_lib import global_config
40from autotest_lib.site_utils.admin_audit import servo_updater
41from autotest_lib.server.cros.servo.topology import servo_topology
42
43try:
44    from chromite.lib import metrics
45except ImportError:
46    metrics = utils.metrics_mock
47
48_CONFIG = global_config.global_config
49
50
51class ServoHost(base_servohost.BaseServoHost):
52    """Host class for a servo host(e.g. beaglebone, labstation)
53     that with a servo instance for a specific port.
54
55     @type _servo: servo.Servo | None
56     """
57
58    DEFAULT_PORT = int(os.getenv('SERVOD_PORT', '9999'))
59
60    # Timeout for initializing servo signals.
61    INITIALIZE_SERVO_TIMEOUT_SECS = 60
62
63    # Default timeout for run terminal command.
64    DEFAULT_TERMINAL_TIMEOUT = 30
65
66    # Ready test function
67    SERVO_READY_METHOD = 'get_version'
68
69    # Directory prefix on the servo host where the servod logs are stored.
70    SERVOD_LOG_PREFIX = '/var/log/servod'
71
72    # Exit code to use when symlinks for servod logs are not found.
73    NO_SYMLINKS_CODE = 9
74
75    # Directory in the job's results directory to dump the logs into.
76    LOG_DIR = 'servod'
77
78    # Prefix for joint loglevel files in the logs.
79    JOINT_LOG_PREFIX = 'log'
80
81    # Regex group to extract timestamp from logfile name.
82    TS_GROUP = 'ts'
83
84    # This regex is used to extract the timestamp from servod logs.
85    # files always start with log.
86    TS_RE = (r'log.'
87             # The timestamp is of format %Y-%m-%d--%H-%M-%S.MS
88             r'(?P<%s>\d{4}(\-\d{2}){2}\-(-\d{2}){3}.\d{3})'
89             # The loglevel is optional depending on labstation version.
90             r'(.(INFO|DEBUG|WARNING))?' % TS_GROUP)
91    TS_EXTRACTOR = re.compile(TS_RE)
92
93    # Regex group to extract MCU name from logline in servod logs.
94    MCU_GROUP = 'mcu'
95
96    # Regex group to extract logline from MCU logline in servod logs.
97    LINE_GROUP = 'line'
98
99    # This regex is used to extract the mcu and the line content from an
100    # MCU logline in servod logs. e.g. EC or servo_v4 console logs.
101    # Here is an example log-line:
102    #
103    # 2020-01-23 13:15:12,223 - servo_v4 - EC3PO.Console - DEBUG -
104    # console.py:219:LogConsoleOutput - /dev/pts/9 - cc polarity: cc1
105    #
106    # Here is conceptually how they are formatted:
107    #
108    #  <time> - <MCU> - EC3PO.Console - <LVL> - <file:line:func> - <pts> -
109    #  <output>
110    #
111    # The log format starts with a timestamp
112    MCU_RE = (r'[\d\-]+ [\d:,]+ '
113              # The mcu that is logging this is next.
114              r'- (?P<%s>\w+) - '
115              # Next, we have more log outputs before the actual line.
116              # Information about the file line, logging function etc.
117              # Anchor on EC3PO Console, LogConsoleOutput and dev/pts.
118              # NOTE: if the log format changes, this regex needs to be
119              # adjusted.
120              r'EC3PO\.Console[\s\-\w\d:.]+LogConsoleOutput - /dev/pts/\d+ - '
121              # Lastly, we get the MCU's console line.
122              r'(?P<%s>.+$)' % (MCU_GROUP, LINE_GROUP))
123    MCU_EXTRACTOR = re.compile(MCU_RE)
124
125    # Regex to detect timeout messages when USBC pigtail has timeout issue.
126    # e.g.:  [475635.427072 PD TMOUT RX 1/1]
127    USBC_PIGTAIL_TIMEOUT_RE = r'\[[\d \.]{1,20}(PD TMOUT RX 1\/1)\]'
128
129    # Suffix to identify compressed logfiles.
130    COMPRESSION_SUFFIX = '.tbz2'
131
132    # A suffix to mark servod log directories that came from instance that
133    # ran during this servo_host, but are not the last one running e.g. when
134    # an instance (on purpose, or due to a bug) restarted in the middle of the
135    # run.
136    OLD_LOG_SUFFIX = 'old'
137
138    # Mapping servo board with their vid-pid
139    SERVO_VID_PID = {
140            'servo_v4': '18d1:501b',
141            'ccd_cr50': '18d1:5014',
142            'servo_micro': '18d1:501a',
143            'servo_v3': ['18d1:5004', '0403:6014'],
144            'c2d2': '1d81:5041',
145    }
146
147    def _init_attributes(self):
148        self._servo_state = None
149        self.servo_port = None
150        self.servo_board = None
151        self.servo_model = None
152        self.servo_serial = None
153        self.servo_setup = None
154        self.additional_servod_args = None
155        self._dut_health_profile = None
156        # The flag that indicate if a servo is connected to a smart usbhub.
157        # TODO(xianuowang@) remove this flag once all usbhubs in the lab
158        # get replaced.
159        self.smart_usbhub = None
160        self._servo = None
161        self._topology = None
162        self._tunnel_proxy = None
163        self._tunnel_proxy_lock = threading.Lock()
164        self._initial_instance_ts = None
165        # Flag to make sure that multiple calls to close do not result in the
166        # logic executing multiple times.
167        self._closed = False
168        # Per-thread local data
169        self._local = threading.local()
170
171    def _initialize(self,
172                    servo_host='localhost',
173                    servo_port=DEFAULT_PORT,
174                    servo_board=None,
175                    servo_model=None,
176                    servo_serial=None,
177                    servo_setup=None,
178                    additional_servod_args=None,
179                    is_in_lab=None,
180                    *args,
181                    **dargs):
182        """Initialize a ServoHost instance.
183
184        A ServoHost instance represents a host that controls a servo.
185
186        @param servo_host: Name of the host where the servod process
187                           is running.
188        @param servo_port: Port the servod process is listening on. Defaults
189                           to the SERVOD_PORT environment variable if set,
190                           otherwise 9999.
191        @param servo_board: Board that the servo is connected to.
192        @param servo_model: Model that the servo is connected to.
193        @param servo_serial: Serial number of the servo device.
194        @param servo_setup: Type of servo setup, e.g. REGULAR or DUAL_V4.
195        @param additional_servod_args: Additional args that will append to
196                                       servod start command.
197        @param is_in_lab: True if the servo host is in Cros Lab. Default is set
198                          to None, for which utils.host_is_in_lab_zone will be
199                          called to check if the servo host is in Cros lab.
200
201        """
202        super(ServoHost, self)._initialize(hostname=servo_host,
203                                           is_in_lab=is_in_lab, *args, **dargs)
204        self._init_attributes()
205        self.servo_port = int(servo_port)
206        self.servo_board = servo_board
207        self.servo_model = servo_model
208        self.servo_serial = servo_serial
209        self.servo_setup = servo_setup
210        self.additional_servod_args = additional_servod_args
211
212        # The location of the log files on the servo host for this instance.
213        self.remote_log_dir = '%s_%s' % (self.SERVOD_LOG_PREFIX,
214                                         self.servo_port)
215        # Path of the servo host lock file.
216        self._lock_file = (self.TEMP_FILE_DIR + str(self.servo_port) +
217                           self.LOCK_FILE_POSTFIX)
218        # File path to declare a reboot request.
219        self._reboot_file = (self.TEMP_FILE_DIR + str(self.servo_port) +
220                             self.REBOOT_FILE_POSTFIX)
221
222        # Lock the servo host if it's an in-lab labstation to prevent other
223        # task to reboot it until current task completes. We also wait and
224        # make sure the labstation is up here, in the case of the labstation is
225        # in the middle of reboot.
226        self._is_locked = False
227        if (self.wait_up(self.REBOOT_TIMEOUT) and self.is_in_lab()
228            and self.is_labstation()):
229            self._lock()
230            try:
231                self.wait_ready()
232            except Exception as e:
233                logging.info(
234                        'Unexpected error while ensure labstation'
235                        ' readiness; %s', str(e))
236
237        self._repair_strategy = (
238                servo_repair.create_servo_repair_strategy())
239
240    def __str__(self):
241        return "<%s '%s:%s'>" % (
242                type(self).__name__, self.hostname, self.servo_port)
243
244    def connect_servo(self):
245        """ Initialize and setup servo for later use.
246        """
247        self.initilize_servo()
248        self.initialize_dut_for_servo()
249
250    def initilize_servo(self):
251        """Establish a connection to the servod server on this host.
252
253        Initializes `self._servo` and then verifies that all network
254        connections are working.  This will create an ssh tunnel if
255        it's required.
256        """
257        self._servo = servo.Servo(servo_host=self,
258                                  servo_serial=self.servo_serial)
259
260    def initialize_dut_for_servo(self):
261        """This method will do some setup for dut control, e.g. setup
262        main servo_v4 device, and also testing the connection between servo
263        and DUT. As a side effect of testing the connection, all signals on
264        the target servo are reset to default values, and the USB stick is
265        set to the neutral (off) position.
266        """
267        if not self._servo:
268            raise hosts.AutoservVerifyError('Servo object needs to be'
269                                            ' initialized before initialize'
270                                            ' DUT.')
271        timeout, _ = retry.timeout(
272            self._servo.initialize_dut,
273            timeout_sec=self.INITIALIZE_SERVO_TIMEOUT_SECS)
274        if timeout:
275            raise hosts.AutoservVerifyError('Initialize dut for servo timed'
276                                            ' out.')
277
278    def disconnect_servo(self):
279        """Disconnect our servo if it exists.
280
281        If we've previously successfully connected to our servo,
282        disconnect any established ssh tunnel, and set `self._servo`
283        back to `None`.
284        """
285        if self._servo:
286            # N.B. This call is safe even without a tunnel:
287            # rpc_server_tracker.disconnect() silently ignores
288            # unknown ports.
289            self.rpc_server_tracker.disconnect(self.servo_port)
290            self._servo = None
291
292    def _maybe_create_servod_ssh_tunnel_proxy(self):
293        """Create a xmlrpc proxy for use with a ssh tunnel.
294        A lock is used to safely create a singleton proxy.
295        """
296        with self._tunnel_proxy_lock:
297            if self._tunnel_proxy is None:
298                self._tunnel_proxy = self.rpc_server_tracker.xmlrpc_connect(
299                       None,
300                       self.servo_port,
301                       ready_test_name=self.SERVO_READY_METHOD,
302                       timeout_seconds=60,
303                       request_timeout_seconds=3600,
304                       server_desc=str(self))
305
306    def get_servod_server_proxy(self):
307        """Return a proxy if it exists; otherwise, create a new one.
308        A proxy can either be a ssh tunnel based proxy, or a httplib
309        based proxy.
310
311        @returns: An xmlrpclib.ServerProxy that is connected to the servod
312                  server on the host.
313        """
314        if (servo_constants.ENABLE_SSH_TUNNEL_FOR_SERVO
315                and not self.is_localhost()):
316            # Check for existing ssh tunnel proxy.
317            if self._tunnel_proxy is None:
318                self._maybe_create_servod_ssh_tunnel_proxy()
319            return self._tunnel_proxy
320        else:
321            # xmlrpc/httplib is not thread-safe, so each thread must have its
322            # own separate proxy connection.
323            if not hasattr(self._local, "_per_thread_proxy"):
324                remote = 'http://%s:%s' % (self.hostname, self.servo_port)
325                self._local._per_thread_proxy = six.moves.xmlrpc_client.ServerProxy(remote)
326            return self._local._per_thread_proxy
327
328    def verify(self, silent=False):
329        """Update the servo host and verify it's in a good state.
330
331        @param silent   If true, suppress logging in `status.log`.
332        """
333        message = 'Beginning verify for servo host %s port %s serial %s'
334        message %= (self.hostname, self.servo_port, self.servo_serial)
335        self.record('INFO', None, None, message)
336        try:
337            self._repair_strategy.verify(self, silent)
338            self._servo_state = servo_constants.SERVO_STATE_WORKING
339            self.record('INFO', None, None,
340                        'ServoHost verify set servo_state as WORKING')
341            if self.is_servo_topology_supported():
342                self._topology = servo_topology.ServoTopology(self)
343                self._topology.generate()
344        except Exception as e:
345            if not self.is_localhost():
346                self._servo_state = self.determine_servo_state()
347                self.record('INFO', None, None,
348                            'ServoHost verify set servo_state as %s'
349                            % self._servo_state)
350            if self._is_critical_error(e):
351                raise
352
353    def _get_default_usbkey_mount_path(self):
354        return '/media/servo_usb/%s' % self.servo_port
355
356    def get_image_name_from_usbkey(self, usbkey_dev):
357        """Mount usb drive and check ChromeOS image name on it if there is
358        one. This method assumes the image_usbkey_direction is already set
359        to servo side.
360
361        @param usbkey_dev: usbkey dev path(e.g. /dev/sdb).
362
363        @returns: image_name on the usbkey, e.g. nami-release/R82.10138.0.0,
364                  or empty string if no test image detected, or unexpected
365                  error occurred.
366        """
367        logging.info('Checking ChromeOS image name on usbkey.')
368        mount_dst = self._get_default_usbkey_mount_path()
369        # Unmount if there is an existing stale mount.
370        self._unmount_drive(mount_dst)
371        # ChromeOS root fs is in /dev/sdx3
372        mount_src = usbkey_dev + '3'
373        try:
374            if not self._mount_drive(mount_src, mount_dst):
375                logging.debug('Unexpected error occurred on mount usb drive.')
376                return ''
377
378            release_content = self.run(
379                'cat %s/etc/lsb-release' % mount_dst,
380                ignore_status=True).stdout.strip()
381
382            if not re.search(r'RELEASE_TRACK=.*test', release_content):
383                logging.info('The image on usbkey is not a test image')
384                return ''
385
386            return lsbrelease_utils.get_chromeos_release_builder_path(
387                lsb_release_content=release_content)
388        finally:
389            logging.debug('Image check compeleted, unmounting the usb drive.')
390            self._unmount_drive(mount_dst)
391
392    def _extract_firmware_image_from_usbkey(self, fw_dst):
393        """Extract firmware images from the usbkey on servo, this method
394        assumes there is already a ChromeOS test image staged on servo.
395
396        @param fw_dst: the path that we'll copy firmware images to.
397
398        @returns: a json format string of firmware manifest data.
399        """
400        usbkey_dev = self._probe_and_validate_usb_dev()
401        if not usbkey_dev:
402            raise hosts.AutoservRepairError('Unexpected error occurred when'
403                      ' probe usbkey dev path, please check logs for detail.')
404
405        mount_dst = self._get_default_usbkey_mount_path()
406        # Unmount if there is an existing stale mount.
407        self._unmount_drive(mount_dst)
408        # ChromeOS root fs is in /dev/sdx3
409        mount_src = usbkey_dev + '3'
410        try:
411            if not self._mount_drive(mount_src, mount_dst):
412                raise hosts.AutoservRepairError('Failed to extract firmware'
413                          ' image; Unable to mount %s.' % usbkey_dev,
414                          'unable to mount usbkey')
415            updater_bin = os.path.join(mount_dst,
416                                       'usr/sbin/chromeos-firmwareupdate')
417            self.run('%s --unpack %s' % (updater_bin, fw_dst))
418            return self.run('%s --manifest' % updater_bin).stdout
419        finally:
420            self._unmount_drive(mount_dst)
421
422    def prepare_repair_firmware_image(self, fw_dst=None):
423        """Prepare firmware image on the servohost for auto repair process
424        to consume.
425
426        @param fw_dst: the path that we want to store firmware image on
427                       the servohost.
428
429        @returns: A tuple that containes ec firmware image path and bios
430                  firmware image path on the servohost, or None if type of
431                  image is not available based on manifest and dut's model.
432        """
433        model = self.servo_model or self._dut_host_info.model
434        if not model:
435            raise hosts.AutoservRepairError(
436                      'Could not determine DUT\'s model.',
437                      'model infomation unknown')
438
439        if not fw_dst:
440            fw_dst = '/tmp/firmware_image/%s' % self.servo_port
441        # Cleanup and re-create dst path to have a fresh start.
442        self.run('rm -rf %s' % fw_dst)
443        self.run('mkdir -p %s' % fw_dst)
444
445        manifest = json.loads(self._extract_firmware_image_from_usbkey(fw_dst))
446        # For models that have packed $MODEL_signed variant, we want use the
447        # 'signed' variant once we get DVT devices, so try to read manifest
448        # from $MODEL_signed first.
449        build = manifest.get('%s_signed' % model) or manifest.get(model)
450        if not build:
451            raise hosts.AutoservRepairError('Could not find firmware manifest'
452                      ' for model:%s' % model, 'model manifest not found')
453        try:
454            ec_image = os.path.join(fw_dst, build['ec']['image'])
455        except KeyError:
456            ec_image = None
457        try:
458            bios_image = os.path.join(fw_dst, build['host']['image'])
459        except KeyError:
460            bios_image = None
461        if not ec_image and not bios_image:
462            raise hosts.AutoservRepairError('Could not find any firmware image'
463                      ' for model:%s' % model, 'cannot find firmware image')
464        return ec_image, bios_image
465
466    def flash_ap_firmware_via_servo(self, image):
467        """Flash AP firmware by use a provided image.
468
469        This is will be a short term enhanment for infra repair use, it use
470        'futility update' which will automatically determine various parameters
471        needed for flashrom, and will preserve the GBB, VPD, and HWID for
472        AP firmware update.
473        @TODO(xianuowang@) Remove this method once b/148403277 implemented.
474
475        @param image: the firmware image path on servohost.
476        """
477        cmd = 'futility update -i %s --servo_port=%s'
478        self.run(cmd % (image, self.servo_port), timeout=900)
479
480    def _probe_and_validate_usb_dev(self):
481        """This method probe the usb dev path by talking to servo, and then
482        validate the dev path is valid block device to servohost.
483        Possible output:
484        1. Encounter error during probe usb dev, returns empty string.
485        2. probe usb dev completed without error but cannot find usb dev,
486           raise AutoservRepairError.
487        3. probe usb dev find a usb dev path, but failed validation in this
488           method, raise AutoservRepairError.
489
490        @returns: A string of usb dev path(e.g. '/dev/sdb'), or empty string
491        if unexpected error occurred during probe.
492        @raises: AutoservRepairError if servo couldn't probe the usb dev path
493        (servo.probe_host_usb_dev() returns empty string), or the dev path is
494        not valid block device to servohost.
495        """
496        logging.info('Validating image usbkey on servo.')
497        try:
498            usb_dev = self._servo.probe_host_usb_dev()
499        except Exception as e:
500            # We don't want any unexpected or transient servo communicating
501            # failure block usb repair, so capture all errors here.
502            logging.error(e, exc_info=True)
503            logging.error('Unexpected error occurred on get usbkey dev path,'
504                          ' skipping usbkey validation.')
505            return ''
506
507        if usb_dev:
508            # probe_host_usb_dev() sometimes return stale record,
509            # so we need to make sure the path exists in fdisk.
510            validate_cmd = 'fdisk -l %s' % usb_dev
511            try:
512                resp = self.run(validate_cmd, ignore_status=True, timeout=30)
513                if resp.exit_status == 0:
514                    return usb_dev
515                logging.error('%s is reported from "image_usbkey_dev" control'
516                              ' but not detected by fdisk!', usb_dev)
517            except error.AutoservRunError as e:
518                if 'Timeout encountered' in str(e):
519                    logging.warning('Timeout encountered during fdisk run,'
520                                    ' skipping usbkey validation.')
521                    return ''
522                raise
523
524        raise hosts.AutoservRepairError(
525              'No usbkey detected on servo, the usbkey may be either missing'
526              ' or broken. Please replace usbkey on the servo and retry.',
527              'missing usbkey')
528
529    def is_ec_supported(self):
530        """Check if ec is supported on the servo_board"""
531        if self.servo_board:
532            try:
533                frm_config = config.Config(self.servo_board, self.servo_model)
534                return frm_config.chrome_ec
535            except Exception as e:
536                logging.error('Unexpected error when read from firmware'
537                    ' configs; %s', str(e))
538        return False
539
540    def validate_image_usbkey(self):
541        """This method first validate if there is a recover usbkey on servo
542        that accessible to servohost, and second check if a ChromeOS image is
543        already on the usb drive and return the image_name so we can avoid
544        unnecessary download and flash to the recover usbkey on servo.
545
546        Please note that, there is special error handling logic here:
547        1. If unexpected error happens, we return empty string. So repair
548           actions will not get blocked.
549        2. If no working usbkey present on servo, but no errors, we'll raise
550           AutoservRepairError here.
551
552        @returns: image_name on the usbkey, e.g. nami-release/R82.10138.0.0,
553                  or empty string if no test image detected, or unexpected
554                  error occurred.
555        @raises:  AutoservRepairError if the usbkey is not detected on servo.
556        """
557        usb_dev = self._probe_and_validate_usb_dev()
558        if usb_dev:
559            return self.get_image_name_from_usbkey(usb_dev)
560        else:
561            return ''
562
563    def repair(self, silent=False):
564        """Attempt to repair servo host.
565
566        @param silent   If true, suppress logging in `status.log`.
567        """
568        message = 'Beginning repair for servo host %s port %s serial %s'
569        message %= (self.hostname, self.servo_port, self.servo_serial)
570        self.record('INFO', None, None, message)
571        try:
572            self._repair_strategy.repair(self, silent)
573            self._servo_state = servo_constants.SERVO_STATE_WORKING
574            self.record('INFO', None, None,
575                        'ServoHost repair set servo_state as WORKING')
576            # If target is a labstation then try to withdraw any existing
577            # reboot request created by this servo because it passed repair.
578            if self.is_labstation():
579                self.withdraw_reboot_request()
580            if self.is_servo_topology_supported():
581                self._topology = servo_topology.ServoTopology(self)
582                self._topology.generate()
583        except Exception as e:
584            if not self.is_localhost():
585                self._servo_state = self.determine_servo_state()
586                self.record('INFO', None, None,
587                            'ServoHost repair set servo_state as %s'
588                            % self._servo_state)
589            if self._is_critical_error(e):
590                self.disconnect_servo()
591                self.stop_servod()
592                raise
593
594    def _is_critical_error(self, error):
595        if (isinstance(error, hosts.AutoservVerifyDependencyError)
596            and not error.is_critical()):
597            logging.warning('Non-critical verify failure(s) detected during'
598                            ' verify/repair servo, servo connection will'
599                            ' still be up but may not be fully functional.'
600                            ' Some repair actions and servo dependent'
601                            ' tests may not run.')
602            return False
603        logging.info(
604                'Critical verify failure(s) detected during repair/verify '
605                'servo. Disconnecting servo and running `stop servod`, all'
606                ' repair actions and tests that depends on servo will not '
607                'run.')
608        return True
609
610    def get_servo(self):
611        """Get the cached servo.Servo object.
612
613        @return: a servo.Servo object.
614        @rtype: autotest_lib.server.cros.servo.servo.Servo
615        """
616        return self._servo
617
618    def request_reboot(self):
619        """Request servohost to be rebooted when it's safe to by touch a file.
620        """
621        logging.debug('Request to reboot servohost %s has been created by '
622                      'servo with port # %s', self.hostname, self.servo_port)
623        self.run('touch %s' % self._reboot_file, ignore_status=True)
624
625    def withdraw_reboot_request(self):
626        """Withdraw a servohost reboot request if exists by remove the flag
627        file.
628        """
629        logging.debug('Withdrawing request to reboot servohost %s that created'
630                      ' by servo with port # %s if exists.',
631                      self.hostname, self.servo_port)
632        self.run('rm -f %s' % self._reboot_file, ignore_status=True)
633
634    def start_servod(self, quick_startup=False):
635        """Start the servod process on servohost.
636        """
637        # Skip if running on the localhost.(crbug.com/1038168)
638        if self.is_localhost():
639            logging.debug("Servohost is a localhost, skipping start servod.")
640            return
641
642        cmd = 'start servod'
643        if self.servo_board:
644            cmd += ' BOARD=%s' % self.servo_board
645            if self.servo_model:
646                cmd += ' MODEL=%s' % self.servo_model
647        else:
648            logging.warning('Board for DUT is unknown; starting servod'
649                            ' assuming a pre-configured board.')
650
651        cmd += ' PORT=%d' % self.servo_port
652        if self.servo_serial:
653            cmd += ' SERIAL=%s' % self.servo_serial
654
655        # Start servod with dual_v4 based on servo_setup.
656        if self.is_dual_setup():
657            cmd += ' DUAL_V4=1'
658
659        # Start servod with CONFIG=cr50.xml which required for some pools.
660        if self._require_cr50_servod_config():
661            cmd += ' CONFIG=cr50.xml'
662
663        # Adding customized args if any.
664        if self.additional_servod_args:
665            cmd += ' ' + self.additional_servod_args
666
667        # Remove the symbolic links from the logs. This helps ensure that
668        # a failed servod instantiation does not cause us to grab old logs
669        # by mistake.
670        self.remove_latest_log_symlinks()
671        self.run(cmd, timeout=60)
672
673        # There's a lag between when `start servod` completes and when
674        # the _ServodConnectionVerifier trigger can actually succeed.
675        # The call to time.sleep() below gives time to make sure that
676        # the trigger won't fail after we return.
677
678        # Normally servod on servo_v3 and labstation take ~10 seconds to ready,
679        # But in the rare case all servo on a labstation are in heavy use they
680        # may take ~30 seconds. So the timeout value will double these value,
681        # and we'll try quick start up when first time initialize servohost,
682        # and use standard start up timeout in repair.
683        if quick_startup:
684            timeout = servo_constants.SERVOD_QUICK_STARTUP_TIMEOUT
685        else:
686            timeout = servo_constants.SERVOD_STARTUP_TIMEOUT
687        logging.debug('Wait %s seconds for servod process fully up.', timeout)
688        time.sleep(timeout)
689        # Cache the initial instance timestamp to check against servod restarts
690        self._initial_instance_ts = self.get_instance_logs_ts()
691
692    def stop_servod(self):
693        """Stop the servod process on servohost.
694        """
695        # Skip if running on the localhost.(crbug.com/1038168)
696        if self.is_localhost():
697            logging.debug("Servohost is a localhost, skipping stop servod.")
698            return
699
700        logging.debug('Stopping servod on port %s', self.servo_port)
701        self.run('stop servod PORT=%d' % self.servo_port,
702                 timeout=60, ignore_status=True)
703        logging.debug('Wait %s seconds for servod process fully teardown.',
704                      servo_constants.SERVOD_TEARDOWN_TIMEOUT)
705        time.sleep(servo_constants.SERVOD_TEARDOWN_TIMEOUT)
706
707    def restart_servod(self, quick_startup=False):
708        """Restart the servod process on servohost.
709        """
710        self.stop_servod()
711        self.start_servod(quick_startup)
712
713    def _process_servodtool_error(self, response):
714        """Helper function to handle non-zero servodtool response.
715        """
716        if re.search(servo_constants.ERROR_MESSAGE_USB_HUB_NOT_COMPATIBLE,
717                     response.stdout):
718            logging.error('The servo is not plugged on a usb hub that supports'
719                          ' power-cycle!')
720            # change the flag so we can update this label in later process.
721            self.smart_usbhub = False
722            return
723
724        if re.search(servo_constants.ERROR_MESSAGE_DEVICE_NOT_FOUND %
725                     self.servo_serial, response.stdout):
726            logging.error('No servo with serial %s found!', self.servo_serial)
727            return
728
729        logging.error('Unexpected error occurred from usbhub control, please'
730                      ' file a bug and inform chrome-fleet-software@ team!')
731
732    def get_main_servo_usb_path(self):
733        """Helper function to collect current usb-path to main servo.
734
735        The usb-path is path to the folder where usb-device was enumerated.
736        If fail then will return an empty string ('').
737
738        @returns: string, usb-path to the main servo device.
739            e.g.: '/sys/bus/usb/devices/1-6.1.3.1'
740        """
741        # TODO remove try-except when fix crbug.com/1087964
742        try:
743            cmd = 'servodtool device -s %s usb-path' % self.servo_serial
744            resp = self.run(cmd, ignore_status=True, timeout=30)
745        except Exception as e:
746            # Here we catch only timeout errors.
747            # Other errors is filtered by ignore_status=True
748            logging.debug('Attempt to get servo usb-path failed due to '
749                          'timeout; %s', e)
750            return ''
751        if resp.exit_status != 0:
752            self._process_servodtool_error(resp)
753            return ''
754        usb_path = resp.stdout.strip()
755        logging.info('Usb path of servo %s is %s', self.servo_serial, usb_path)
756        return usb_path
757
758    def _get_servo_usb_devnum(self):
759        """Helper function to collect current usb devnum of servo."""
760        usb_path = self.get_main_servo_usb_path()
761        if not usb_path:
762            return ''
763        resp = self.run('cat %s/devnum' % usb_path, ignore_status=True)
764        if resp.exit_status != 0:
765            self._process_servodtool_error(resp)
766            return ''
767        return resp.stdout.strip()
768
769    def reboot_servo_v3_on_need(self):
770        """Check and reboot servo_v3 based on below conditions.
771               1. If there is an update pending on reboot.
772               2. Servo_v3 has been up for more than 96 hours.
773        """
774        if self.get_board() != 'beaglebone_servo':
775            logging.info('Servo reboot is only applicable for servo V3.')
776            return
777
778        update_pending_reboot = (self._check_update_status() ==
779                                 self.UPDATE_STATE.PENDING_REBOOT)
780        uptime_hours = float(self.check_uptime())/3600
781        logging.info('Uptime of servo_v3: %s hour(s)', uptime_hours)
782        long_up_time = uptime_hours > 96
783
784        # Skip reboot if neither condition are met.
785        if not (update_pending_reboot or long_up_time):
786            return
787
788        if update_pending_reboot:
789            message = 'Starting reboot servo_v3 because an update is pending.'
790            reboot_method = self._post_update_reboot
791        elif long_up_time:
792            message = 'Starting reboot servo_v3 because uptime > 96 hours.'
793            reboot_method = self._servo_host_reboot
794        self.record('INFO', None, None, message)
795        logging.info(message)
796        try:
797            reboot_method()
798            message = 'Servo_v3 reboot completed successfully.'
799        except Exception as e:
800            logging.debug("Fail to reboot servo_v3; %s", e)
801            message = ('Servo_v3 reboot failed, please check debug log '
802                       'for details.')
803        logging.info(message)
804        self.record('INFO', None, None, message)
805
806    def _reset_servo(self):
807        logging.info('Resetting servo through smart usbhub.')
808        # TODO remove try-except when fix crbug.com/1087964
809        try:
810            resp = self.run('servodtool device -s %s power-cycle' %
811                            self.servo_serial, ignore_status=True,
812                            timeout=30)
813            if resp.exit_status != 0:
814                self._process_servodtool_error(resp)
815                return False
816        except Exception as e:
817            # Here we catch only timeout errors.
818            # Other errors is filtered by ignore_status=True
819            logging.debug('Attempt to reset servo failed due to timeout;'
820                          ' %s', e)
821            return False
822
823        logging.debug('Wait %s seconds for servo to come back from reset.',
824                      servo_constants.SERVO_RESET_TIMEOUT_SECONDS)
825        time.sleep(servo_constants.SERVO_RESET_TIMEOUT_SECONDS)
826        # change the flag so we can update this label in later process.
827        self.smart_usbhub = True
828        return True
829
830    def reset_servo(self):
831        """Reset(power-cycle) the servo via smart usbhub.
832        """
833        if not self.is_labstation():
834            logging.info('Servo reset is not applicable to servo_v3.')
835            return
836
837        pre_reset_devnum = self._get_servo_usb_devnum()
838        logging.info('Servo usb devnum before reset: %s', pre_reset_devnum)
839        result = self._reset_servo()
840        if not result:
841            message = ('Failed to reset servo with serial: %s. (Please ignore'
842                       ' this error if the DUT is not connected to a smart'
843                       ' usbhub).' % self.servo_serial)
844            logging.warning(message)
845            self.record('INFO', None, None, message)
846            return
847
848        post_reset_devnum = self._get_servo_usb_devnum()
849        logging.info('Servo usb devnum after reset: %s', post_reset_devnum)
850        if not (pre_reset_devnum and post_reset_devnum):
851            message = ('Servo reset completed but unable to verify'
852                       ' devnum change!')
853        elif pre_reset_devnum != post_reset_devnum:
854            message = ('Reset servo with serial %s completed successfully!'
855                       % self.servo_serial)
856        else:
857            message = 'Servo reset completed but devnum is still not changed!'
858        logging.info(message)
859        self.record('INFO', None, None, message)
860
861    def _extract_compressed_logs(self, logdir, relevant_files):
862        """Decompress servod logs in |logdir|.
863
864        @param logdir: directory containing compressed servod logs.
865        @param relevant_files: list of files in |logdir| to consider.
866
867        @returns: tuple, (tarfiles, files) where
868                  tarfiles: list of the compressed filenames that have been
869                            extracted and deleted
870                  files:  list of the uncompressed files that were generated
871        """
872        # For all tar-files, first extract them to the directory, and
873        # then let the common flow handle them.
874        tarfiles = [cf for cf in relevant_files if
875                    cf.endswith(self.COMPRESSION_SUFFIX)]
876        files = []
877        for f in tarfiles:
878            norm_name = os.path.basename(f)[:-len(self.COMPRESSION_SUFFIX)]
879            with tarfile.open(f) as tf:
880                # Each tarfile has only one member, as
881                # that's the compressed log.
882                member = tf.members[0]
883                # Manipulate so that it only extracts the basename, and not
884                # the directories etc.
885                member.name = norm_name
886                files.append(os.path.join(logdir, member.name))
887                tf.extract(member, logdir)
888            # File has been extracted: remove the compressed file.
889            os.remove(f)
890        return tarfiles, files
891
892    def _extract_mcu_logs(self, log_subdir):
893        """Extract MCU (EC, Cr50, etc) console output from servod debug logs.
894
895        Using the MCU_EXTRACTOR regex (above) extract and split out MCU console
896        lines from the logs to generate invidiual console logs e.g. after
897        this method, you can find an ec.txt and servo_v4.txt in |log_dir| if
898        those MCUs had any console input/output.
899
900        @param log_subdir: directory with log.DEBUG.txt main servod debug logs.
901        """
902        # Extract the MCU for each one. The MCU logs are only in the .DEBUG
903        # files
904        mcu_lines_file = os.path.join(log_subdir, 'log.DEBUG.txt')
905        if not os.path.exists(mcu_lines_file):
906            logging.info('No DEBUG logs found to extract MCU logs from.')
907            return
908        mcu_files = {}
909        mcu_file_template = '%s.txt'
910        with open(mcu_lines_file, 'r') as f:
911            for line in f:
912                match = self.MCU_EXTRACTOR.match(line)
913                if match:
914                    mcu = match.group(self.MCU_GROUP).lower()
915                    line = match.group(self.LINE_GROUP)
916                    if mcu not in mcu_files:
917                        mcu_file = os.path.join(log_subdir,
918                                                mcu_file_template % mcu)
919                        mcu_files[mcu] = open(mcu_file, 'a')
920                    fd = mcu_files[mcu]
921                    fd.write(line + '\n')
922        for f in mcu_files:
923            mcu_files[f].close()
924
925    def remove_latest_log_symlinks(self):
926        """Remove the conveninence symlinks 'latest' servod logs."""
927        symlink_wildcard = '%s/latest*' % self.remote_log_dir
928        cmd = 'rm ' + symlink_wildcard
929        self.run(cmd, stderr_tee=None, ignore_status=True)
930
931    def probe_servod_restart(self, instance_ts, outdir):
932        """Grab servod logs from previous instances if part of this session.
933
934        If since the last time this host called start_servod() servod crashed
935        and restarted, this helper finds those logs as well, and stores them
936        with the |OLD_LOG_SUFFIX| to investigate if necessary.
937
938        It also issues a panicinfo command to servo devices after the restart
939        to try and collect reboot information for debugging.
940
941        @param instance_ts: the log timestamp that the current instance uses
942        @param outdir: directory to create a subdirectory into to place the
943                       servod logs into.
944        """
945        if self._initial_instance_ts is None:
946            logging.info('No log timestamp grabbed successfully on servod '
947                         'startup. Cannot check device restarts. Ignoring.')
948            return
949        if instance_ts == self._initial_instance_ts:
950            logging.debug('Servod appears to have run without restarting')
951            return
952        # Servod seems to have restarted (at least once). |_initial_instance_ts|
953        # is the first timestamp, and instance_ts is the current timestamp. Find
954        # all timestamps in between them, and grab the logs for each.
955        tss = self._find_instance_timestamps_between(self._initial_instance_ts,
956                                                     instance_ts)
957        logging.info('Servod has restarted %d times between the start and the '
958                     'end of this servo_host.', len(tss))
959        logging.info('This might be an issue. Will extract all logs from each '
960                     'instance.')
961        logging.info('Logs that are not the currently running (about to turn '
962                     'down) instance are maked with a .%s in their folder.',
963                     self.OLD_LOG_SUFFIX)
964        for ts in tss:
965            self.get_instance_logs(ts, outdir, old=True)
966        # Lastly, servod has restarted due to a potential issue. Try to get
967        # panic information from servo micro and servo v4 for the current logs.
968        # This can only happen if the |_servo| attribute is initialized.
969        if self._servo:
970            for mcu in ['servo_micro', 'servo_v4']:
971                ctrl = '%s_uart_cmd' % mcu
972                if self._servo.has_control(ctrl):
973                    logging.info('Trying to retrieve %r panicinfo into logs',
974                                 mcu)
975                    try:
976                        self._servo.set_nocheck(ctrl, 'panicinfo')
977                    except error.TestFail as e:
978                        logging.error('Failed to generate panicinfo for %r '
979                                      'logs. %s', mcu, str(e))
980
981    def _find_instance_timestamps_between(self, start_ts, end_ts):
982        """Find all log timestamps between [start_ts, end_ts).
983
984        @param start_ts: str, earliest log timestamp of interest
985        @param end_ts: str, latest log timestamp of interest
986
987        @returns: list, all timestamps between start_ts and end_ts, end_ts
988                  exclusive, on the servo_host. An empty list on errors
989        """
990        # Simply get all timestamp, and then sort and remove
991        cmd = 'ls %s' % self.remote_log_dir
992        res = self.run(cmd, stderr_tee=None, ignore_status=True)
993        if res.exit_status != 0:
994            # Here we failed to find anything.
995            logging.info('Failed to find remote servod logs. Ignoring.')
996            return []
997        logfiles = res.stdout.strip().split()
998        timestamps = set()
999        for logfile in logfiles:
1000            ts_match = self.TS_EXTRACTOR.match(logfile)
1001            if not ts_match:
1002                # Simply ignore files that fail the check. It might be the
1003                # 'latest' symlinks or random files.
1004                continue
1005            timestamps.add(ts_match.group(self.TS_GROUP))
1006        # At this point we have all unique timestamps.
1007        timestamps = sorted(timestamps)
1008        for ts in [start_ts, end_ts]:
1009            if ts not in timestamps:
1010                logging.error('Timestamp %r not in servod logs. Cannot query '
1011                              'for timestamps in between %r and %r', ts,
1012                              start_ts, end_ts)
1013                return []
1014        return timestamps[timestamps.index(start_ts):timestamps.index(end_ts)]
1015
1016    def get_instance_logs_ts(self):
1017        """Retrieve the currently running servod instance's log timestamp
1018
1019        @returns: str, timestamp for current instance, or None on failure
1020        """
1021        # First, extract the timestamp. This cmd gives the real filename of
1022        # the latest aka current log file.
1023        cmd = ('if [ -f %(dir)s/latest.DEBUG ];'
1024               'then realpath %(dir)s/latest.DEBUG;'
1025               'elif [ -f %(dir)s/latest ];'
1026               'then realpath %(dir)s/latest;'
1027               'else exit %(code)d;'
1028               'fi' % {'dir': self.remote_log_dir,
1029                       'code': self.NO_SYMLINKS_CODE})
1030        res = self.run(cmd, stderr_tee=None, ignore_status=True)
1031        if res.exit_status != 0:
1032            if res.exit_status == self.NO_SYMLINKS_CODE:
1033                logging.warning('servod log latest symlinks not found. '
1034                                'This is likely due to an error starting up '
1035                                'servod. Ignoring..')
1036            else:
1037                logging.warning('Failed to find servod logs on servo host.')
1038                logging.warning(res.stderr.strip())
1039            return None
1040        fname = os.path.basename(res.stdout.strip())
1041        # From the fname, ought to extract the timestamp using the TS_EXTRACTOR
1042        ts_match = self.TS_EXTRACTOR.match(fname)
1043        if not ts_match:
1044            logging.warning('Failed to extract timestamp from servod log file '
1045                            '%r. Skipping. The servo host is using outdated '
1046                            'servod logging and needs to be updated.', fname)
1047            return None
1048        return ts_match.group(self.TS_GROUP)
1049
1050    def get_instance_logs(self, instance_ts, outdir, old=False):
1051        """Collect all logs with |instance_ts| and dump into a dir in |outdir|
1052
1053        This method first collects all logs on the servo_host side pertaining
1054        to this servod instance (port, instatiation). It glues them together
1055        into combined log.[level].txt files and extracts all available MCU
1056        console I/O from the logs into individual files e.g. servo_v4.txt
1057
1058        All the output can be found in a directory inside |outdir| that
1059        this generates based on |LOG_DIR|, the servod port, and the instance
1060        timestamp on the servo_host side.
1061
1062        @param instance_ts: log timestamp to grab logfiles for
1063        @param outdir: directory to create a subdirectory into to place the
1064                       servod logs into.
1065        @param old: bool, whether to append |OLD_LOG_SUFFIX| to output dir
1066        """
1067        # Create the local results log dir.
1068        log_dir = os.path.join(outdir, '%s_%s.%s' % (self.LOG_DIR,
1069                                                     str(self.servo_port),
1070                                                     instance_ts))
1071        if old:
1072            log_dir = '%s.%s' % (log_dir, self.OLD_LOG_SUFFIX)
1073        logging.info('Saving servod logs to %r.', log_dir)
1074        os.mkdir(log_dir)
1075        # Now, get all files with that timestamp.
1076        cmd = 'find %s -maxdepth 1 -name "log.%s*"' % (self.remote_log_dir,
1077                                                       instance_ts)
1078        res = self.run(cmd, stderr_tee=None, ignore_status=True)
1079        files = res.stdout.strip().split()
1080        try:
1081            self.get_file(files, log_dir, try_rsync=False)
1082            if not os.listdir(log_dir):
1083                logging.info('No servod logs retrieved. Ignoring, and removing '
1084                             '%r again.', log_dir)
1085                os.rmdir(log_dir)
1086                return
1087        except error.AutoservRunError as e:
1088            result = e.result_obj
1089            if result.exit_status != 0:
1090                stderr = result.stderr.strip()
1091                logging.warning("Couldn't retrieve servod logs. Ignoring: %s",
1092                                stderr or '\n%s' % result)
1093            # Remove the log_dir as nothing was added to it.
1094            os.rmdir(log_dir)
1095            return
1096        local_files = [os.path.join(log_dir, f) for f in os.listdir(log_dir)]
1097        # TODO(crrev.com/c/1793030): remove no-level case once CL is pushed
1098        for level_name in ('DEBUG', 'INFO', 'WARNING', ''):
1099            # Create the joint files for each loglevel. i.e log.DEBUG
1100            joint_file = self.JOINT_LOG_PREFIX
1101            if level_name:
1102                joint_file = '%s.%s' % (self.JOINT_LOG_PREFIX, level_name)
1103            # This helps with some online tools to avoid complaints about an
1104            # unknown filetype.
1105            joint_file = joint_file + '.txt'
1106            joint_path = os.path.join(log_dir, joint_file)
1107            files = [f for f in local_files if level_name in f]
1108            if not files:
1109                # TODO(crrev.com/c/1793030): remove no-level case once CL
1110                # is pushed
1111                continue
1112            # Extract compressed logs if any.
1113            compressed, extracted = self._extract_compressed_logs(log_dir,
1114                                                                  files)
1115            files = list(set(files) - set(compressed))
1116            files.extend(extracted)
1117            # Need to sort. As they all share the same timestamp, and
1118            # loglevel, the index itself is sufficient. The highest index
1119            # is the oldest file, therefore we need a descending sort.
1120            def sortkey(f, level=level_name):
1121                """Custom sortkey to sort based on rotation number int."""
1122                if f.endswith(level_name): return 0
1123                return int(f.split('.')[-1])
1124
1125            files.sort(reverse=True, key=sortkey)
1126            # Just rename the first file rather than building from scratch.
1127            os.rename(files[0], joint_path)
1128            with open(joint_path, 'a') as joint_f:
1129                for logfile in files[1:]:
1130                    # Transfer the file to the joint file line by line.
1131                    with open(logfile, 'r') as log_f:
1132                        for line in log_f:
1133                            joint_f.write(line)
1134                    # File has been written over. Delete safely.
1135                    os.remove(logfile)
1136            # Need to remove all files form |local_files| so we don't
1137            # analyze them again.
1138            local_files = list(set(local_files) - set(files) - set(compressed))
1139        # Lastly, extract MCU logs from the joint logs.
1140        self._extract_mcu_logs(log_dir)
1141
1142    def _lock(self):
1143        """lock servohost by touching a file.
1144        """
1145        logging.debug('Locking servohost %s by touching %s file',
1146                      self.hostname, self._lock_file)
1147        self.run('touch %s' % self._lock_file, ignore_status=True)
1148        self._is_locked = True
1149
1150    def _unlock(self):
1151        """Unlock servohost by removing the lock file.
1152        """
1153        logging.debug('Unlocking servohost by removing %s file',
1154                      self._lock_file)
1155        self.run('rm %s' % self._lock_file, ignore_status=True)
1156        self._is_locked = False
1157
1158    def close(self):
1159        """Close the associated servo and the host object."""
1160        # NOTE: throughout this method there are multiple attempts to catch
1161        # all errors. This is WAI as log grabbing should not fail tests.
1162        # However, the goal is to catch and handle/process all errors, thus
1163        # we print the traceback and ask for a bug.
1164        if self._closed:
1165            logging.debug('ServoHost is already closed.')
1166            return
1167
1168        # Only attempt ssh related actions if servohost is sshable. We call
1169        # check_cached_up_status() first because it's lightweighted and return
1170        # much faster in the case servohost is down, however, we still want
1171        # to call is_up() later since check_cached_up_status() is ping based check
1172        # and not guarantee the servohost is sshable.
1173        servo_host_ready = self.check_cached_up_status() and self.is_up()
1174
1175        if servo_host_ready:
1176            instance_ts = self.get_instance_logs_ts()
1177        else:
1178            logging.info('Servohost is down, will skip servod log collecting.')
1179            instance_ts = None
1180        # TODO(crbug.com/1011516): once enabled, remove the check against
1181        # localhost and instead check against log-rotiation enablement.
1182        logs_available = (instance_ts is not None and
1183                          self.job and
1184                          not self.is_localhost())
1185        if logs_available:
1186            # Probe whether there was a servod restart, and grab those old
1187            # logs as well.
1188            try:
1189                self.probe_servod_restart(instance_ts, self.job.resultdir)
1190            except (error.AutoservRunError, error.TestFail) as e:
1191                logging.info('Failed to grab servo logs due to: %s. '
1192                             'This error is forgiven.', str(e))
1193            except Exception as e:
1194                logging.error('Unexpected error probing for old logs. %s. '
1195                              'Forgiven. Please file a bug and fix or catch '
1196                              'in log probing function', str(e),
1197                              exc_info=True)
1198        if self._servo:
1199            outdir = None if not self.job else self.job.resultdir
1200            # In some cases when we run as lab-tools, the job object is None.
1201            self._servo.close(outdir)
1202
1203        if logs_available:
1204            # Grab current (not old like above) logs after the servo instance
1205            # was closed out.
1206            try:
1207                self.get_instance_logs(instance_ts, self.job.resultdir)
1208            except error.AutoservRunError as e:
1209                logging.info('Failed to grab servo logs due to: %s. '
1210                             'This error is forgiven.', str(e))
1211            except Exception as e:
1212                logging.error('Unexpected error grabbing servod logs. %s. '
1213                              'Forgiven. Please file a bug and fix or catch '
1214                              'in log grabbing function', str(e), exc_info=True)
1215
1216        if self._is_locked and servo_host_ready:
1217            # Remove the lock if the servohost has been locked.
1218            try:
1219                self._unlock()
1220            except error.AutoservSSHTimeout:
1221                logging.error('Unlock servohost failed due to ssh timeout.'
1222                              ' It may caused by servohost went down during'
1223                              ' the task.')
1224        # We want always stop servod after task to minimum the impact of bad
1225        # servod process interfere other servods.(see crbug.com/1028665)
1226        if servo_host_ready:
1227            try:
1228                self.stop_servod()
1229            except error.AutoservRunError as e:
1230                logging.info(
1231                        "Failed to stop servod due to:\n%s\n"
1232                        "This error is forgiven.", str(e))
1233
1234        super(ServoHost, self).close()
1235        # Mark closed.
1236        self._closed = True
1237
1238    def get_servo_state(self):
1239        return self._servo_state
1240
1241    def _get_host_metrics_data(self):
1242        return {'port': self.servo_port,
1243                'host': self.get_dut_hostname() or self.hostname,
1244                'board': self.servo_board or ''}
1245
1246    def _is_servo_device_connected(self, servo_type, serial):
1247        """Check if device is connected to the labstation.
1248
1249        Works for all servo devices connected to the labstation.
1250        For servo_v3 please use 'self._is_servo_board_present_on_servo_v3'
1251
1252        @param servo_type:  The type of servo device. Expecting value can be
1253                            servo_v4 or servo_micro.
1254        @param serial:      The serial number of the device to detect it.
1255        """
1256        vid_pid = self.SERVO_VID_PID.get(servo_type)
1257        if not vid_pid or not serial:
1258            # device cannot detected without VID/PID or serial number
1259            return False
1260        logging.debug('Started to detect %s', servo_type)
1261        try:
1262            cmd = 'lsusb -v -d %s |grep iSerial |grep %s' % (vid_pid, serial)
1263            result = self.run(cmd, ignore_status=True, timeout=30)
1264            if result.exit_status == 0 and result.stdout.strip():
1265                logging.debug('The %s is plugged in to the host.', servo_type)
1266                return True
1267            logging.debug('%s device is not detected; %s', servo_type, result)
1268            return False
1269        except Exception as e:
1270            # can be triggered by timeout issue due running the script
1271            metrics.Counter(
1272                'chromeos/autotest/repair/servo_detection/timeout'
1273                ).increment(fields=self._get_host_metrics_data())
1274            logging.error('%s device is not detected; %s', servo_type, str(e))
1275        return None
1276
1277    def _is_servo_board_present_on_servo_v3(self):
1278        """Check if servo board is detected on servo_v3"""
1279        vid_pids = self.SERVO_VID_PID['servo_v3']
1280        if not vid_pids or len(vid_pids) == 0:
1281            # device cannot detected without VID/PID
1282            return False
1283        logging.debug('Started to detect servo board on servo_v3')
1284        not_detected = 'The servo board is not detected on servo_v3'
1285        try:
1286            cmd = 'lsusb | grep "%s"' % "\|".join(vid_pids)
1287            result = self.run(cmd, ignore_status=True, timeout=30)
1288            if result.exit_status == 0 and result.stdout.strip():
1289                logging.debug('The servo board is detected on servo_v3')
1290                return True
1291            logging.debug('%s; %s', not_detected, result)
1292            return False
1293        except Exception as e:
1294            # can be triggered by timeout issue due running the script
1295            metrics.Counter(
1296                'chromeos/autotest/repair/servo_detection/timeout'
1297                ).increment(fields=self._get_host_metrics_data())
1298            logging.error('%s; %s', not_detected, str(e))
1299        return None
1300
1301    def _is_main_device_not_detected_on_servo_v4(self):
1302        """Check if servod cannot find main device on servo.
1303
1304        The check based on reading servod logs for servo_v4.
1305        """
1306        if not self._initial_instance_ts:
1307            # latest log not found
1308            return False
1309        logging.debug('latest log for servod created at %s',
1310                      self._initial_instance_ts)
1311        try:
1312            log_created = calendar.timegm(time.strptime(
1313                self._initial_instance_ts,
1314                "%Y-%m-%d--%H-%M-%S.%f"))
1315        except ValueError as e:
1316            logging.debug('Cannot read time from log file name: %s',
1317                          self._initial_instance_ts)
1318            return False
1319        min_time_created = calendar.timegm(time.gmtime())
1320        if min_time_created > log_created + 3600:
1321            # the log file is old we cannot use it
1322            logging.debug('log file was created more than hour ago, too old')
1323            return False
1324        logging.debug('latest log was created not longer then 1 hour ago')
1325
1326        # check if servod can detect main device by servo_v4
1327        message = 'ERROR - No servo micro or CCD detected for board'
1328        cmd = ('cat /var/log/servod_%s/log.%s.INFO |grep "%s"'
1329               % (self.servo_port, self._initial_instance_ts, message))
1330        result = self.run(cmd, ignore_status=True)
1331        if result.stdout.strip():
1332            logging.info('Servod cannot detect main device on the servo; '
1333                        'Can be caused by bad hardware of servo or '
1334                        'issue on the DUT side.')
1335            return True
1336        logging.debug('The main device is detected')
1337        return False
1338
1339    def _require_cr50_servod_config(self):
1340        """Check whether we need start servod with CONFIG=cr50.xml"""
1341        dut_host_info = self.get_dut_host_info()
1342        if not dut_host_info:
1343            return False
1344        for pool in dut_host_info.pools:
1345            if pool.startswith(servo_constants.CR50_CONFIG_POOL_PREFIX):
1346                return True
1347        return False
1348
1349    def get_verifier_state(self, tag):
1350        """Return the state of servo verifier.
1351
1352        @returns: bool or None
1353        """
1354        return self._repair_strategy.verifier_is_good(tag)
1355
1356    def determine_servo_state(self):
1357        """Determine servo state based on the failed verifier.
1358
1359        @returns: servo state value
1360        The state detecting based on first fail verifier or collecting of
1361        them.
1362        """
1363        ssh = self.get_verifier_state('servo_ssh')
1364        disk_space = self.get_verifier_state('disk_space')
1365        start_servod = self.get_verifier_state('servod_job')
1366        create_servo = self.get_verifier_state('servod_connection')
1367        init_servo = self.get_verifier_state('servod_control')
1368        servo_topology = self.get_verifier_state('servo_topology')
1369        dut_connected = self.get_verifier_state('dut_connected')
1370        hub_connected = self.get_verifier_state('hub_connected')
1371        pwr_button = self.get_verifier_state('pwr_button')
1372        lid_open = self.get_verifier_state('lid_open')
1373        ec_board = self.get_verifier_state('ec_board')
1374        cr50_console = self.get_verifier_state('cr50_console')
1375        ccd_testlab = self.get_verifier_state('ccd_testlab')
1376
1377        if not ssh:
1378            return servo_constants.SERVO_STATE_NO_SSH
1379
1380        if (start_servod == hosts.VERIFY_FAILED
1381                    or create_servo == hosts.VERIFY_FAILED):
1382            # sometimes servo can start with out present servo
1383            if self.is_labstation():
1384                if not self.servo_serial:
1385                    return servo_constants.SERVO_STATE_WRONG_CONFIG
1386                if self._is_servo_device_connected(
1387                    'servo_v4',
1388                    self.servo_serial) == False:
1389                    return servo_constants.SERVO_STATE_NOT_CONNECTED
1390            elif self._is_servo_board_present_on_servo_v3() == False:
1391                return servo_constants.SERVO_STATE_NOT_CONNECTED
1392
1393        if servo_topology == hosts.VERIFY_FAILED:
1394            return servo_constants.SERVO_STATE_TOPOLOGY_ISSUE
1395
1396        if (dut_connected == hosts.VERIFY_FAILED
1397                    or hub_connected == hosts.VERIFY_FAILED):
1398            return servo_constants.SERVO_STATE_DUT_NOT_CONNECTED
1399        # TODO(otabek@): detect special cases detected by pwr_button
1400        if dut_connected == hosts.VERIFY_SUCCESS:
1401            if pwr_button == hosts.VERIFY_FAILED:
1402                metrics.Counter(
1403                        'chromeos/autotest/repair/servo_unexpected/pwr_button2'
1404                ).increment(fields=self._get_host_metrics_data())
1405
1406        if start_servod == hosts.VERIFY_FAILED:
1407            return servo_constants.SERVO_STATE_SERVOD_ISSUE
1408
1409        if create_servo == hosts.VERIFY_FAILED:
1410            if (self.is_labstation()
1411                and self._is_main_device_not_detected_on_servo_v4()):
1412                servo_type = None
1413                if self.get_dut_host_info():
1414                    servo_type = self.get_dut_host_info().get_label_value(
1415                        servo_constants.SERVO_TYPE_LABEL_PREFIX)
1416                if servo_type and 'servo_micro' in servo_type:
1417                    serial = self.get_servo_micro_serial_number()
1418                    logging.debug('servo_micro serial: %s', serial)
1419                    if self._is_servo_device_detected('servo_micro',
1420                                                      serial):
1421                        return servo_constants.SERVO_STATE_BAD_RIBBON_CABLE
1422                # Device can be not detected because of DUT
1423                # TODO (otabek) update after b/159755652 and b/159754985
1424                metrics.Counter(
1425                    'chromeos/autotest/repair/servo_state/needs_replacement'
1426                    ).increment(fields=self._get_host_metrics_data())
1427            elif not self.is_labstation():
1428                # Here need logic to check if flex cable is connected
1429                pass
1430
1431        # one of the reason why servo can not initialized
1432        if cr50_console == hosts.VERIFY_FAILED:
1433            return servo_constants.SERVO_STATE_CR50_CONSOLE_MISSING
1434        if ccd_testlab == hosts.VERIFY_FAILED:
1435            return servo_constants.SERVO_STATE_CCD_TESTLAB_ISSUE
1436
1437        if (create_servo == hosts.VERIFY_FAILED
1438                    or init_servo == hosts.VERIFY_FAILED):
1439            return servo_constants.SERVO_STATE_SERVOD_ISSUE
1440
1441        if ec_board == hosts.VERIFY_FAILED:
1442            return servo_constants.SERVO_STATE_EC_BROKEN
1443        if pwr_button == hosts.VERIFY_FAILED:
1444            return servo_constants.SERVO_STATE_BAD_RIBBON_CABLE
1445        if lid_open == hosts.VERIFY_FAILED:
1446            return servo_constants.SERVO_STATE_LID_OPEN_FAILED
1447
1448        metrics.Counter(
1449            'chromeos/autotest/repair/unknown_servo_state'
1450            ).increment(fields=self._get_host_metrics_data())
1451        logging.info('We do not have special state for this failure yet :)')
1452        return servo_constants.SERVO_STATE_BROKEN
1453
1454    def is_servo_topology_supported(self):
1455        """Check if servo_topology is supported."""
1456        if not self.is_up_fast():
1457            logging.info('Servo-Host is not reachable.')
1458            return False
1459        if not self.is_labstation():
1460            logging.info('Servo-topology supported only for labstation.')
1461            return False
1462        if not self.servo_serial:
1463            logging.info('Servo-topology required a servo serial.')
1464            return False
1465        return True
1466
1467    def get_topology(self):
1468        """Get servo topology."""
1469        return self._topology
1470
1471    def is_dual_setup(self):
1472        """Check is servo will run in dual setup.
1473
1474        Dual setup used only for servo_v4 when used ccd_cr50 and servo_micro
1475        at the same time.
1476        """
1477        return self.servo_setup == servo_constants.SERVO_SETUP_VALUE_DUAL_V4
1478
1479    def set_dut_health_profile(self, dut_health_profile):
1480        """
1481        @param dut_health_profile: A DeviceHealthProfile object.
1482        """
1483        logging.debug('setting dut_health_profile field to (%s)',
1484                      dut_health_profile)
1485        self._dut_health_profile = dut_health_profile
1486
1487    def get_dut_health_profile(self):
1488        """
1489        @return A DeviceHealthProfile object.
1490        """
1491        return self._dut_health_profile
1492
1493
1494def make_servo_hostname(dut_hostname):
1495    """Given a DUT's hostname, return the hostname of its servo.
1496
1497    @param dut_hostname: hostname of a DUT.
1498
1499    @return hostname of the DUT's servo.
1500
1501    """
1502    host_parts = dut_hostname.split('.')
1503    host_parts[0] = host_parts[0] + '-servo'
1504    return '.'.join(host_parts)
1505
1506
1507def _map_afe_board_to_servo_board(afe_board):
1508    """Map a board we get from the AFE to a servo appropriate value.
1509
1510    Many boards are identical to other boards for servo's purposes.
1511    This function makes that mapping.
1512
1513    @param afe_board string board name received from AFE.
1514    @return board we expect servo to have.
1515
1516    """
1517    KNOWN_SUFFIXES = ['-freon', '_freon', '_moblab', '-cheets']
1518    BOARD_MAP = {'gizmo': 'panther'}
1519    mapped_board = afe_board
1520    if afe_board in BOARD_MAP:
1521        mapped_board = BOARD_MAP[afe_board]
1522    else:
1523        for suffix in KNOWN_SUFFIXES:
1524            if afe_board.endswith(suffix):
1525                mapped_board = afe_board[0:-len(suffix)]
1526                break
1527    if mapped_board != afe_board:
1528        logging.info('Mapping AFE board=%s to %s', afe_board, mapped_board)
1529    return mapped_board
1530
1531
1532def get_servo_args_for_host(dut_host):
1533    """Return servo data associated with a given DUT.
1534
1535    @param dut_host   Instance of `Host` on which to find the servo
1536                      attributes.
1537    @return `servo_args` dict with host and an optional port.
1538    """
1539    info = dut_host.host_info_store.get()
1540    servo_args = {k: v for k, v in six.iteritems(info.attributes)
1541                  if k in servo_constants.SERVO_ATTR_KEYS}
1542
1543    if servo_constants.SERVO_PORT_ATTR in servo_args:
1544        try:
1545            servo_args[servo_constants.SERVO_PORT_ATTR] = int(
1546                servo_args[servo_constants.SERVO_PORT_ATTR])
1547        except ValueError:
1548            logging.error('servo port is not an int: %s',
1549                          servo_args[servo_constants.SERVO_PORT_ATTR])
1550            # Reset servo_args because we don't want to use an invalid port.
1551            servo_args.pop(servo_constants.SERVO_HOST_ATTR, None)
1552
1553    if info.board:
1554        servo_board = _map_afe_board_to_servo_board(info.board)
1555        servo_args[servo_constants.SERVO_BOARD_ATTR] = servo_board
1556    if info.model:
1557        servo_args[servo_constants.SERVO_MODEL_ATTR] = info.model
1558    return servo_args if servo_constants.SERVO_HOST_ATTR in servo_args else None
1559
1560
1561def _tweak_args_for_ssp_moblab(servo_args):
1562    if (servo_args[servo_constants.SERVO_HOST_ATTR]
1563            in ['localhost', '127.0.0.1']):
1564        servo_args[servo_constants.SERVO_HOST_ATTR] = _CONFIG.get_config_value(
1565                'SSP', 'host_container_ip', type=str, default=None)
1566
1567
1568def create_servo_host(dut,
1569                      servo_args,
1570                      try_lab_servo=False,
1571                      try_servo_repair=False,
1572                      dut_host_info=None,
1573                      dut_health_profile=None):
1574    """Create a ServoHost object for a given DUT, if appropriate.
1575
1576    This function attempts to create and verify or repair a `ServoHost`
1577    object for a servo connected to the given `dut`, subject to various
1578    constraints imposed by the parameters:
1579      * When the `servo_args` parameter is not `None`, a servo
1580        host must be created, and must be checked with `repair()`.
1581      * Otherwise, if a servo exists in the lab and `try_lab_servo` is
1582        true:
1583          * If `try_servo_repair` is true, then create a servo host and
1584            check it with `repair()`.
1585          * Otherwise, if the servo responds to `ping` then create a
1586            servo host and check it with `verify()`.
1587
1588    In cases where `servo_args` was not `None`, repair failure
1589    exceptions are passed back to the caller; otherwise, exceptions
1590    are logged and then discarded.  Note that this only happens in cases
1591    where we're called from a test (not special task) control file that
1592    has an explicit dependency on servo.  In that case, we require that
1593    repair not write to `status.log`, so as to avoid polluting test
1594    results.
1595
1596    TODO(jrbarnette):  The special handling for servo in test control
1597    files is a thorn in my flesh; I dearly hope to see it cut out before
1598    my retirement.
1599
1600    Parameters for a servo host consist of a host name, port number, and
1601    DUT board, and are determined from one of these sources, in order of
1602    priority:
1603      * Servo attributes from the `dut` parameter take precedence over
1604        all other sources of information.
1605      * If a DNS entry for the servo based on the DUT hostname exists in
1606        the CrOS lab network, that hostname is used with the default
1607        port and the DUT's board.
1608      * If no other options are found, the parameters will be taken
1609        from the `servo_args` dict passed in from the caller.
1610
1611    @param dut            An instance of `Host` from which to take
1612                          servo parameters (if available).
1613    @param servo_args     A dictionary with servo parameters to use if
1614                          they can't be found from `dut`.  If this
1615                          argument is supplied, unrepaired exceptions
1616                          from `verify()` will be passed back to the
1617                          caller.
1618    @param try_lab_servo  If not true, servo host creation will be
1619                          skipped unless otherwise required by the
1620                          caller.
1621    @param try_servo_repair  If true, check a servo host with
1622                          `repair()` instead of `verify()`.
1623    @param dut_host_info: A HostInfo object of the DUT that connected
1624                          to this servo.
1625    @param dut_health_profile: DUT repair info with history.
1626
1627    @returns: A ServoHost object or None. See comments above.
1628
1629    """
1630    servo_dependency = servo_args is not None
1631    if dut is not None and (try_lab_servo or servo_dependency):
1632        servo_args_override = get_servo_args_for_host(dut)
1633        if servo_args_override is not None:
1634            if utils.in_moblab_ssp():
1635                _tweak_args_for_ssp_moblab(servo_args_override)
1636            logging.debug(
1637                    'Overriding provided servo_args (%s) with arguments'
1638                    ' determined from the host (%s)',
1639                    servo_args,
1640                    servo_args_override,
1641            )
1642            servo_args = servo_args_override
1643
1644    if servo_args is None:
1645        logging.debug('No servo_args provided, and failed to find overrides.')
1646        if try_lab_servo or servo_dependency:
1647            return None, servo_constants.SERVO_STATE_MISSING_CONFIG
1648        else:
1649            # For regular test case which not required the servo
1650            return None, None
1651
1652    servo_hostname = servo_args.get(servo_constants.SERVO_HOST_ATTR)
1653    servo_port = servo_args.get(servo_constants.SERVO_PORT_ATTR)
1654    if not _is_servo_host_information_exist(servo_hostname, servo_port):
1655        logging.debug(
1656            'Servo connection info missed hostname: %s , port: %s',
1657            servo_hostname, servo_port)
1658        return None, servo_constants.SERVO_STATE_MISSING_CONFIG
1659    if not is_servo_host_information_valid(servo_hostname, servo_port):
1660        logging.debug(
1661            'Servo connection info is incorrect hostname: %s , port: %s',
1662            servo_hostname, servo_port)
1663        return None, servo_constants.SERVO_STATE_WRONG_CONFIG
1664
1665    newhost = ServoHost(**servo_args)
1666    if not newhost.is_up_fast(count=3):
1667        # ServoHost has internal check to wait if servo-host is in reboot
1668        # process. If servo-host still is not available this check will stop
1669        # further attempts as we do not have any option to recover servo_host.
1670        return None, servo_constants.SERVO_STATE_NO_SSH
1671
1672    # Reset or reboot servo device only during AdminRepair tasks.
1673    if try_servo_repair:
1674        if newhost._is_locked:
1675            # Reset servo if the servo is locked, as we check if the servohost
1676            # is up, if the servohost is labstation and if the servohost is in
1677            # lab inside the locking logic.
1678            newhost.reset_servo()
1679        else:
1680            try:
1681                newhost.reboot_servo_v3_on_need()
1682            except Exception as e:
1683                logging.info('[Non-critical] Unexpected error while trying to'
1684                              ' reboot servo_v3, skipping the reboot; %s', e)
1685
1686    if dut:
1687        newhost.set_dut_hostname(dut.hostname)
1688    if dut_host_info:
1689        newhost.set_dut_host_info(dut_host_info)
1690    if dut_health_profile and (try_lab_servo or try_servo_repair):
1691        try:
1692            if newhost.is_localhost():
1693                logging.info('Servohost is a localhost, skip device'
1694                             ' health profile setup...')
1695            else:
1696                dut_health_profile.init_profile(newhost)
1697                newhost.set_dut_health_profile(dut_health_profile)
1698        except Exception as e:
1699            logging.info(
1700                    '[Non-critical] Unexpected error while trying to'
1701                    ' load device health profile; %s', e)
1702
1703    if try_lab_servo or try_servo_repair:
1704        try:
1705            logging.info("Check and update servo firmware.")
1706            servo_updater.update_servo_firmware(
1707                newhost,
1708                force_update=False)
1709        except Exception as e:
1710            logging.error("Servo device update error: %s", e)
1711
1712    try:
1713        newhost.restart_servod(quick_startup=True)
1714    except error.AutoservSSHTimeout:
1715        logging.warning("Restart servod failed due ssh connection "
1716                        "to servohost timed out. This error is forgiven"
1717                        " here, we will retry in servo repair process.")
1718    except error.AutoservRunError as e:
1719        logging.warning("Restart servod failed due to:\n%s\n"
1720                        "This error is forgiven here, we will retry"
1721                        " in servo repair process.", str(e))
1722
1723    # Note that the logic of repair() includes everything done
1724    # by verify().  It's sufficient to call one or the other;
1725    # we don't need both.
1726    if servo_dependency:
1727        newhost.repair(silent=True)
1728        return newhost, newhost.get_servo_state()
1729
1730    if try_servo_repair:
1731        try:
1732            newhost.repair()
1733        except Exception:
1734            logging.exception('servo repair failed for %s', newhost.hostname)
1735    else:
1736        try:
1737            newhost.verify()
1738        except Exception:
1739            logging.exception('servo verify failed for %s', newhost.hostname)
1740    return newhost, newhost.get_servo_state()
1741
1742
1743def _is_servo_host_information_exist(hostname, port):
1744    if hostname is None or len(hostname.strip()) == 0:
1745        return False
1746    if port is None:
1747        return False
1748    if not type(port) is int:
1749        try:
1750            int(port)
1751        except ValueError:
1752            return False
1753
1754    return True
1755
1756
1757def is_servo_host_information_valid(hostname, port):
1758    """Check if provided servo attributes are valid.
1759
1760    @param hostname Hostname of the servohost.
1761    @param port     servo port number.
1762
1763    @returns: A bool value to indicate if provided servo attribute valid.
1764    """
1765    if not _is_servo_host_information_exist(hostname, port):
1766        return False
1767    # checking range and correct of the port
1768    port_int = int(port)
1769    if port_int < 1 or port_int > 65000:
1770        return False
1771    # we expecting host contain only latters, digits and '-' or '_'
1772    if not re.match('[a-zA-Z0-9-_\.]*$', hostname) or len(hostname) < 5:
1773        return False
1774    return True
1775