1# Lint as: python2, python3
2# Copyright 2009 Google Inc. Released under the GPL v2
3
4"""
5This module defines the base classes for the Host hierarchy.
6
7Implementation details:
8You should import the "hosts" package instead of importing each type of host.
9
10        Host: a machine on which you can run programs
11"""
12
13from __future__ import absolute_import
14from __future__ import division
15from __future__ import print_function
16
17
18__author__ = """
19mbligh@google.com (Martin J. Bligh),
20poirier@google.com (Benjamin Poirier),
21stutsman@google.com (Ryan Stutsman)
22"""
23
24import json, logging, os, re, time
25
26from autotest_lib.client.common_lib import global_config, error, utils
27from autotest_lib.client.common_lib.cros import path_utils
28import six
29
30
31class Host(object):
32    """
33    This class represents a machine on which you can run programs.
34
35    It may be a local machine, the one autoserv is running on, a remote
36    machine or a virtual machine.
37
38    Implementation details:
39    This is an abstract class, leaf subclasses must implement the methods
40    listed here. You must not instantiate this class but should
41    instantiate one of those leaf subclasses.
42
43    When overriding methods that raise NotImplementedError, the leaf class
44    is fully responsible for the implementation and should not chain calls
45    to super. When overriding methods that are a NOP in Host, the subclass
46    should chain calls to super(). The criteria for fitting a new method into
47    one category or the other should be:
48        1. If two separate generic implementations could reasonably be
49           concatenated, then the abstract implementation should pass and
50           subclasses should chain calls to super.
51        2. If only one class could reasonably perform the stated function
52           (e.g. two separate run() implementations cannot both be executed)
53           then the method should raise NotImplementedError in Host, and
54           the implementor should NOT chain calls to super, to ensure that
55           only one implementation ever gets executed.
56    """
57
    # Job this host is attached to. None means "no job": record() and
    # log_kernel() check it and do nothing in that case.
    job = None
    # Reboot timing values, read once at class-definition time from the HOSTS
    # section of the global config. They are the defaults of
    # wait_for_restart()'s timeout, down_timeout and down_warning arguments
    # (documented there as seconds).
    DEFAULT_REBOOT_TIMEOUT = global_config.global_config.get_config_value(
        "HOSTS", "default_reboot_timeout", type=int, default=1800)
    WAIT_DOWN_REBOOT_TIMEOUT = global_config.global_config.get_config_value(
        "HOSTS", "wait_down_reboot_timeout", type=int, default=840)
    WAIT_DOWN_REBOOT_WARNING = global_config.global_config.get_config_value(
        "HOSTS", "wait_down_reboot_warning", type=int, default=540)
    # Not referenced elsewhere in this class; presumably a duration in hours,
    # going by the option name -- confirm against users of this constant.
    HOURS_TO_WAIT_FOR_RECOVERY = global_config.global_config.get_config_value(
        "HOSTS", "hours_to_wait_for_recovery", type=float, default=2.5)
    # the number of hardware repair requests that need to happen before we
    # actually send machines to hardware repair
    HARDWARE_REPAIR_REQUEST_THRESHOLD = 4
    # Names of the power operations; presumably the `op` values handed to
    # log_op() -- confirm against callers.
    OP_REBOOT = 'reboot'
    OP_SUSPEND = 'suspend'
    PWR_OPERATION = [OP_REBOOT, OP_SUSPEND]
73
74
    def __init__(self, *args, **dargs):
        """Construct the host by forwarding every argument to _initialize().

        @param *args: Positional arguments passed through to _initialize.
        @param **dargs: Keyword arguments passed through to _initialize.
        """
        self._initialize(*args, **dargs)
77
78
    def _initialize(self, *args, **dargs):
        """Initialization hook called by __init__; does nothing in Host.

        @param *args: Positional arguments given to the constructor.
        @param **dargs: Keyword arguments given to the constructor.
        """
        pass
81
82
    @property
    def job_repo_url_attribute(self):
        """Get the host attribute name for job_repo_url.

        @return: The string 'job_repo_url'.
        """
        return 'job_repo_url'
88
89
    def close(self):
        """Close the connection to the host.

        Does nothing in Host itself.
        """
        pass
94
95
    def setup(self):
        """Setup the host object.

        Does nothing in Host itself.
        """
        pass
100
101
    def run(self, command, timeout=3600, ignore_status=False,
            stdout_tee=utils.TEE_TO_LOGS, stderr_tee=utils.TEE_TO_LOGS,
            stdin=None, args=()):
        """
        Run a command on this host.

        Host only defines the interface: this base implementation always
        raises NotImplementedError and must be replaced by subclasses.

        @param command: the command line string
        @param timeout: time limit in seconds before attempting to
                kill the running process. The run() function
                will take a few seconds longer than 'timeout'
                to complete if it has to kill the process.
        @param ignore_status: do not raise an exception, no matter
                what the exit code of the command is.
        @param stdout_tee: where to tee the stdout
        @param stderr_tee: where to tee the stderr
        @param stdin: stdin to pass (a string) to the executed command
        @param args: sequence of strings to pass as arguments to command by
                quoting them in " and escaping their contents if necessary

        @return a utils.CmdResult object

        @raises AutotestHostRunError: the exit code of the command execution
                was not 0 and ignore_status was not enabled
        @raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError('Run not implemented!')
127
128
129    def run_output(self, command, *args, **dargs):
130        """Run and retrieve the value of stdout stripped of whitespace.
131
132        @param command: Command to execute.
133        @param *args: Extra arguments to run.
134        @param **dargs: Extra keyword arguments to run.
135
136        @return: String value of stdout.
137        """
138        return self.run(command, *args, **dargs).stdout.rstrip()
139
140
    def reboot(self):
        """Reboot the host.

        @raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError('Reboot not implemented!')
145
146
    def suspend(self):
        """Suspend the host.

        @raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError('Suspend not implemented!')
151
152
    def sysrq_reboot(self):
        """Execute host reboot via SysRq key.

        @raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError('Sysrq reboot not implemented!')
157
158
    def reboot_setup(self, *args, **dargs):
        """Prepare for reboot.

        Does nothing in Host and is not called from within this class.
        This doesn't appear to be implemented by any current hosts.

        @param *args: Extra positional arguments; ignored here.
        @param **dargs: Extra keyword arguments; ignored here.
        """
        pass
168
169
    def reboot_followup(self, *args, **dargs):
        """Post reboot work.

        wait_for_restart() calls this with its extra keyword arguments once
        wait_up() has reported success. Does nothing in Host.
        This doesn't appear to be implemented by any current hosts.

        @param *args: Extra positional arguments; ignored here.
        @param **dargs: Extra keyword arguments; ignored here.
        """
        pass
179
180
    def get_file(self, source, dest, delete_dest=False):
        """Retrieve a file from the host.

        @param source: Remote file path (directory, file or list).
        @param dest: Local file path (directory, file or list).
        @param delete_dest: Delete files in remote path that are not in local
            path.
            NOTE(review): dest is the local side here, so this wording may be
            inverted -- confirm against the implementing subclasses.

        @raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError('Get file not implemented!')
190
191
    def send_file(self, source, dest, delete_dest=False, excludes=None):
        """Send a file to the host.

        @param source: Local file path (directory, file or list).
        @param dest: Remote file path (directory, file or list).
        @param delete_dest: Delete files in remote path that are not in local
                path.
        @param excludes: A list of file patterns matching files that must not
                         be sent. `send_file` will fail if excludes is not
                         supported.

        @raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError('Send file not implemented!')
204
205
    def get_tmp_dir(self):
        """Create a temporary directory on the host.

        @raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError('Get temp dir not implemented!')
210
211
    def is_up(self):
        """Confirm the host is online.

        @raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError('Is up not implemented!')
216
217
    def is_shutting_down(self):
        """Indicate whether the machine is currently shutting down.

        @return: Always False in this base class.
        """
        return False
221
222
223    def get_wait_up_processes(self):
224        """ Gets the list of local processes to wait for in wait_up. """
225        get_config = global_config.global_config.get_config_value
226        proc_list = get_config("HOSTS", "wait_up_processes",
227                               default="").strip()
228        processes = set(p.strip() for p in proc_list.split(","))
229        processes.discard("")
230        return processes
231
232
233    def get_boot_id(self, timeout=60):
234        """ Get a unique ID associated with the current boot.
235
236        Should return a string with the semantics such that two separate
237        calls to Host.get_boot_id() return the same string if the host did
238        not reboot between the two calls, and two different strings if it
239        has rebooted at least once between the two calls.
240
241        @param timeout The number of seconds to wait before timing out.
242
243        @return A string unique to this boot or None if not available."""
244        BOOT_ID_FILE = '/proc/sys/kernel/random/boot_id'
245        NO_ID_MSG = 'no boot_id available'
246        cmd = 'if [ -f %r ]; then cat %r; else echo %r; fi' % (
247                BOOT_ID_FILE, BOOT_ID_FILE, NO_ID_MSG)
248        boot_id = self.run(cmd, timeout=timeout).stdout.strip()
249        if boot_id == NO_ID_MSG:
250            return None
251        return boot_id
252
253
    def wait_up(self, timeout=None):
        """Wait for the host to come up.

        wait_for_restart() treats a true return value as "the host came
        back" and a false one as a failed reboot.

        @param timeout: Max seconds to wait.

        @raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError('Wait up not implemented!')
260
261
    def wait_down(self, timeout=None, warning_timer=None, old_boot_id=None):
        """Wait for the host to go down.

        wait_for_restart() treats a false return value as "the host did not
        shut down".

        @param timeout: Max seconds to wait before returning.
        @param warning_timer: Seconds before warning host is not down.
        @param old_boot_id: Result of self.get_boot_id() before shutdown.

        @raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError('Wait down not implemented!')
270
271
272    def _construct_host_metadata(self, type_str):
273        """Returns dict of metadata with type_str, hostname, time_recorded.
274
275        @param type_str: String representing _type field in es db.
276            For example: type_str='reboot_total'.
277        """
278        metadata = {
279            'hostname': self.hostname,
280            'time_recorded': time.time(),
281            '_type': type_str,
282        }
283        return metadata
284
285
286    def wait_for_restart(self, timeout=DEFAULT_REBOOT_TIMEOUT,
287                         down_timeout=WAIT_DOWN_REBOOT_TIMEOUT,
288                         down_warning=WAIT_DOWN_REBOOT_WARNING,
289                         log_failure=True, old_boot_id=None, **dargs):
290        """Wait for the host to come back from a reboot.
291
292        This is a generic implementation based entirely on wait_up and
293        wait_down.
294
295        @param timeout: Max seconds to wait for reboot to start.
296        @param down_timeout: Max seconds to wait for host to go down.
297        @param down_warning: Seconds to wait before warning host hasn't gone
298            down.
299        @param log_failure: bool(Log when host does not go down.)
300        @param old_boot_id: Result of self.get_boot_id() before restart.
301        @param **dargs: Extra arguments to reboot_followup.
302
303        @raises AutoservRebootError if host does not come back up.
304        """
305        if not self.wait_down(timeout=down_timeout,
306                              warning_timer=down_warning,
307                              old_boot_id=old_boot_id):
308            if log_failure:
309                self.record("ABORT", None, "reboot.verify", "shut down failed")
310            raise error.AutoservShutdownError("Host did not shut down")
311        if self.wait_up(timeout):
312            self.record("GOOD", None, "reboot.verify")
313            self.reboot_followup(**dargs)
314        else:
315            self.record("ABORT", None, "reboot.verify",
316                        "Host did not return from reboot")
317            raise error.AutoservRebootError("Host did not return from reboot")
318
319
320    def verify(self):
321        """Check if host is in good state.
322        """
323        self.verify_hardware()
324        self.verify_connectivity()
325        self.verify_software()
326
327
    def verify_hardware(self):
        """Check host hardware.

        First check run by verify(). Does nothing in Host.
        """
        pass
332
333
    def verify_connectivity(self):
        """Check host network connectivity.

        Second check run by verify(). Does nothing in Host.
        """
        pass
338
339
    def verify_software(self):
        """Check host software.

        Last check run by verify(). Does nothing in Host.
        """
        pass
344
345
346    def check_diskspace(self, path, gb):
347        """Raises an error if path does not have at least gb GB free.
348
349        @param path The path to check for free disk space.
350        @param gb A floating point number to compare with a granularity
351            of 1 MB.
352
353        1000 based SI units are used.
354
355        @raises AutoservDiskFullHostError if path has less than gb GB free.
356        @raises AutoservDirectoryNotFoundError if path is not a valid directory.
357        @raises AutoservDiskSizeUnknownError the return from du is not parsed
358            correctly.
359        """
360        one_mb = 10 ** 6  # Bytes (SI unit).
361        mb_per_gb = 1000.0
362        logging.info('Checking for >= %s GB of space under %s on machine %s',
363                     gb, path, self.hostname)
364
365        if not self.path_exists(path):
366            msg = 'Path does not exist on host: %s' % path
367            logging.warning(msg)
368            raise error.AutoservDirectoryNotFoundError(msg)
369
370        cmd = 'df -PB %d %s | tail -1' % (one_mb, path)
371        df = self.run(cmd).stdout.split()
372        try:
373            free_space_gb = int(df[3]) / mb_per_gb
374        except (IndexError, ValueError):
375            msg = ('Could not determine the size of %s. '
376                   'Output from df: %s') % (path, df)
377            logging.error(msg)
378            raise error.AutoservDiskSizeUnknownError(msg)
379
380        if free_space_gb < gb:
381            raise error.AutoservDiskFullHostError(path, gb, free_space_gb)
382        else:
383            logging.info('Found %s GB >= %s GB of space under %s on machine %s',
384                free_space_gb, gb, path, self.hostname)
385
386
387    def check_inodes(self, path, min_kilo_inodes):
388        """Raises an error if a file system is short on i-nodes.
389
390        @param path The path to check for free i-nodes.
391        @param min_kilo_inodes Minimum number of i-nodes required,
392                               in units of 1000 i-nodes.
393
394        @raises AutoservNoFreeInodesError If the minimum required
395                                  i-node count isn't available.
396        """
397        min_inodes = 1000 * min_kilo_inodes
398        logging.info('Checking for >= %d i-nodes under %s '
399                     'on machine %s', min_inodes, path, self.hostname)
400        df = self.run('df -Pi %s | tail -1' % path).stdout.split()
401        free_inodes = int(df[3])
402        if free_inodes < min_inodes:
403            raise error.AutoservNoFreeInodesError(path, min_inodes,
404                                                  free_inodes)
405        else:
406            logging.info('Found %d >= %d i-nodes under %s on '
407                         'machine %s', free_inodes, min_inodes,
408                         path, self.hostname)
409
410
411    def erase_dir_contents(self, path, ignore_status=True, timeout=3600):
412        """Empty a given directory path contents.
413
414        @param path: Path to empty.
415        @param ignore_status: Ignore the exit status from run.
416        @param timeout: Max seconds to allow command to complete.
417        """
418        rm_cmd = 'find "%s" -mindepth 1 -maxdepth 1 -print0 | xargs -0 rm -rf'
419        self.run(rm_cmd % path, ignore_status=ignore_status, timeout=timeout)
420
421
    def repair(self):
        """Try and get the host to pass `self.verify()`.

        The base implementation performs no repair action; it only runs
        verify().
        """
        self.verify()
425
426
427    def disable_ipfilters(self):
428        """Allow all network packets in and out of the host."""
429        self.run('iptables-save > /tmp/iptable-rules')
430        self.run('iptables -P INPUT ACCEPT')
431        self.run('iptables -P FORWARD ACCEPT')
432        self.run('iptables -P OUTPUT ACCEPT')
433
434
435    def enable_ipfilters(self):
436        """Re-enable the IP filters disabled from disable_ipfilters()"""
437        if self.path_exists('/tmp/iptable-rules'):
438            self.run('iptables-restore < /tmp/iptable-rules')
439
440
    def cleanup(self):
        """Restore host to clean state.

        Does nothing in Host itself.
        """
        pass
445
446
    def install(self, installableObject):
        """Call install on a thing.

        This only forwards to installableObject.install(self); its return
        value is discarded.

        @param installableObject: Thing with install method that will accept our
            self.
        """
        installableObject.install(self)
454
455
    def get_autodir(self):
        """Get the host's autodir.

        NOTE(review): presumably the autotest installation directory on the
        host -- confirm against the implementing subclasses.

        @raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError('Get autodir not implemented!')
458
459
    def set_autodir(self):
        """Set the host's autodir.

        NOTE(review): this base signature takes no value to set -- confirm
        the signature used by the implementing subclasses.

        @raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError('Set autodir not implemented!')
462
463
    def start_loggers(self):
        """Called to start continuous host logging.

        Does nothing in Host itself.
        """
        pass
467
468
    def stop_loggers(self):
        """Called to stop continuous host logging.

        Does nothing in Host itself.
        """
        pass
472
473
    # Some extra methods simplify the retrieval of information about the
    # Host machine, with generic implementations based on run(). Subclasses
    # should feel free to override these if they can provide better
    # implementations for their specific Host types.
478
479    def get_num_cpu(self):
480        """ Get the number of CPUs in the host according to /proc/cpuinfo. """
481        proc_cpuinfo = self.run('cat /proc/cpuinfo',
482                                stdout_tee=open(os.devnull, 'w')).stdout
483        cpus = 0
484        for line in proc_cpuinfo.splitlines():
485            if line.startswith('processor'):
486                cpus += 1
487        return cpus
488
489
490    def get_arch(self):
491        """ Get the hardware architecture of the remote machine. """
492        cmd_uname = path_utils.must_be_installed('/bin/uname', host=self)
493        arch = self.run('%s -m' % cmd_uname).stdout.rstrip()
494        if re.match(r'i\d86$', arch):
495            arch = 'i386'
496        return arch
497
498
499    def get_kernel_ver(self):
500        """ Get the kernel version of the remote machine. """
501        cmd_uname = path_utils.must_be_installed('/bin/uname', host=self)
502        return self.run('%s -r' % cmd_uname).stdout.rstrip()
503
504
505    def get_cmdline(self):
506        """ Get the kernel command line of the remote machine. """
507        return self.run('cat /proc/cmdline').stdout.rstrip()
508
509
510    def get_meminfo(self):
511        """ Get the kernel memory info (/proc/meminfo) of the remote machine
512        and return a dictionary mapping the various statistics. """
513        meminfo_dict = {}
514        meminfo = self.run('cat /proc/meminfo').stdout.splitlines()
515        for key, val in (line.split(':', 1) for line in meminfo):
516            meminfo_dict[key.strip()] = val.strip()
517        return meminfo_dict
518
519
520    def path_exists(self, path):
521        """Determine if path exists on the remote machine.
522
523        @param path: path to check
524
525        @return: bool(path exists)"""
526        result = self.run('test -e "%s"' % utils.sh_escape(path),
527                          ignore_status=True)
528        return result.exit_status == 0
529
530
531    # some extra helpers for doing job-related operations
532
533    def record(self, *args, **dargs):
534        """ Helper method for recording status logs against Host.job that
535        silently becomes a NOP if Host.job is not available. The args and
536        dargs are passed on to Host.job.record unchanged. """
537        if self.job:
538            self.job.record(*args, **dargs)
539
540
541    def log_kernel(self):
542        """ Helper method for logging kernel information into the status logs.
543        Intended for cases where the "current" kernel is not really defined
544        and we want to explicitly log it. Does nothing if this host isn't
545        actually associated with a job. """
546        if self.job:
547            kernel = self.get_kernel_ver()
548            self.job.record("INFO", None, None,
549                            optional_fields={"kernel": kernel})
550
551
552    def log_op(self, op, op_func):
553        """ Decorator for wrapping a management operaiton in a group for status
554        logging purposes.
555
556        @param op: name of the operation.
557        @param op_func: a function that carries out the operation
558                        (reboot, suspend)
559        """
560        if self.job and not hasattr(self, "RUNNING_LOG_OP"):
561            self.RUNNING_LOG_OP = True
562            try:
563                self.job.run_op(op, op_func, self.get_kernel_ver)
564            finally:
565                del self.RUNNING_LOG_OP
566        else:
567            op_func()
568
569
570    def list_files_glob(self, glob):
571        """Get a list of files on a remote host given a glob pattern path.
572
573        @param glob: pattern
574
575        @return: list of files
576        """
577        SCRIPT = ("python -c 'import json, glob, sys;"
578                  "json.dump(glob.glob(sys.argv[1]), sys.stdout)'")
579        output = self.run(SCRIPT, args=(glob,), stdout_tee=None,
580                          timeout=60).stdout
581        return json.loads(output)
582
583
584    def symlink_closure(self, paths):
585        """
586        Given a sequence of path strings, return the set of all paths that
587        can be reached from the initial set by following symlinks.
588
589        @param paths: sequence of path strings.
590        @return: a sequence of path strings that are all the unique paths that
591                can be reached from the given ones after following symlinks.
592        """
593        SCRIPT = ("python -c 'import json, os, sys\n"
594                  "paths = json.load(sys.stdin)\n"
595                  "closure = {}\n"
596                  "while paths:\n"
597                  "    path = next(iter(paths))\n"
598                  "    del paths[path]\n"
599                  "    if not os.path.exists(path):\n"
600                  "        continue\n"
601                  "    closure[path] = None\n"
602                  "    if os.path.islink(path):\n"
603                  "        link_to = os.path.join(os.path.dirname(path),\n"
604                  "                               os.readlink(path))\n"
605                  "        if link_to not in closure:\n"
606                  "            paths[link_to] = None\n"
607                  "json.dump(closure.keys(), sys.stdout, 0)'")
608        input_data = json.dumps(dict((path, None) for path in paths), 0)
609        output = self.run(SCRIPT, stdout_tee=None, stdin=input_data,
610                          timeout=60).stdout
611        return json.loads(output)
612
613
614    def cleanup_kernels(self, boot_dir='/boot'):
615        """
616        Remove any kernel image and associated files (vmlinux, system.map,
617        modules) for any image found in the boot directory that is not
618        referenced by entries in the bootloader configuration.
619
620        @param boot_dir: boot directory path string, default '/boot'
621        """
622        # find all the vmlinuz images referenced by the bootloader
623        vmlinuz_prefix = os.path.join(boot_dir, 'vmlinuz-')
624        boot_info = self.bootloader.get_entries()
625        used_kernver = [boot['kernel'][len(vmlinuz_prefix):]
626                        for boot in six.itervalues(boot_info)]
627
628        # find all the unused vmlinuz images in /boot
629        all_vmlinuz = self.list_files_glob(vmlinuz_prefix + '*')
630        used_vmlinuz = self.symlink_closure(vmlinuz_prefix + kernver
631                                            for kernver in used_kernver)
632        unused_vmlinuz = set(all_vmlinuz) - set(used_vmlinuz)
633
634        # find all the unused vmlinux images in /boot
635        vmlinux_prefix = os.path.join(boot_dir, 'vmlinux-')
636        all_vmlinux = self.list_files_glob(vmlinux_prefix + '*')
637        used_vmlinux = self.symlink_closure(vmlinux_prefix + kernver
638                                            for kernver in used_kernver)
639        unused_vmlinux = set(all_vmlinux) - set(used_vmlinux)
640
641        # find all the unused System.map files in /boot
642        systemmap_prefix = os.path.join(boot_dir, 'System.map-')
643        all_system_map = self.list_files_glob(systemmap_prefix + '*')
644        used_system_map = self.symlink_closure(
645            systemmap_prefix + kernver for kernver in used_kernver)
646        unused_system_map = set(all_system_map) - set(used_system_map)
647
648        # find all the module directories associated with unused kernels
649        modules_prefix = '/lib/modules/'
650        all_moddirs = [dir for dir in self.list_files_glob(modules_prefix + '*')
651                       if re.match(modules_prefix + r'\d+\.\d+\.\d+.*', dir)]
652        used_moddirs = self.symlink_closure(modules_prefix + kernver
653                                            for kernver in used_kernver)
654        unused_moddirs = set(all_moddirs) - set(used_moddirs)
655
656        # remove all the vmlinuz files we don't use
657        # TODO: if needed this should become package manager agnostic
658        for vmlinuz in unused_vmlinuz:
659            # try and get an rpm package name
660            rpm = self.run('rpm -qf', args=(vmlinuz,),
661                           ignore_status=True, timeout=120)
662            if rpm.exit_status == 0:
663                packages = set(line.strip() for line in
664                               rpm.stdout.splitlines())
665                # if we found some package names, try to remove them
666                for package in packages:
667                    self.run('rpm -e', args=(package,),
668                             ignore_status=True, timeout=120)
669            # remove the image files anyway, even if rpm didn't
670            self.run('rm -f', args=(vmlinuz,),
671                     ignore_status=True, timeout=120)
672
673        # remove all the vmlinux and System.map files left over
674        for f in (unused_vmlinux | unused_system_map):
675            self.run('rm -f', args=(f,),
676                     ignore_status=True, timeout=120)
677
678        # remove all unused module directories
679        # the regex match should keep us safe from removing the wrong files
680        for moddir in unused_moddirs:
681            self.run('rm -fr', args=(moddir,), ignore_status=True)
682
683
    def get_attributes_to_clear_before_provision(self):
        """Get a list of attributes to be cleared before machine_install starts.

        If provision runs in a lab environment, it is necessary to clear certain
        host attributes for the host in afe_host_attributes table. For example,
        `job_repo_url` is a devserver url pointed to autotest packages for
        CrosHost, it needs to be removed before provision starts for tests to
        run reliably.

        @return: A new list holding the single name 'job_repo_url'.
        """
        return ['job_repo_url']
694
695
    def get_platform(self):
        """Determine the correct platform label for this host.

        @return: A string representing this host's platform.

        @raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError("Get platform not implemented!")
702
703
    def get_labels(self):
        """Return a list of the labels gathered from the devices connected.

        @return: A list of strings that denote the labels from all the devices
        connected.

        @raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError("Get labels not implemented!")
711
712
    def check_cached_up_status(self, expiration_seconds):
        """Check if the DUT responded to ping in the past `expiration_seconds`.

        @param expiration_seconds: The number of seconds to keep the cached
                status of whether the DUT responded to ping.
        @return: True if the DUT has responded to ping during the past
                 `expiration_seconds`.

        @raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError("check_cached_up_status not implemented!")
722