1#!/usr/bin/python -u
2import os, socket, sys, signal, time, subprocess, logging
3from optparse import OptionParser
4import common
5from autotest_lib.scheduler import babysitter_logging_config
6from autotest_lib.client.common_lib import error, global_config, utils
7from autotest_lib.client.common_lib import logging_manager
8from autotest_lib.scheduler import scheduler_logging_config
9from autotest_lib.scheduler import status_server
10from autotest_lib.scheduler import monitor_db
11
# Seconds to sleep between two consecutive health checks of monitor_db.
PAUSE_LENGTH = 60
# Seconds the scheduler log may keep the same size before monitor_db is
# declared stalled (two hours).
STALL_TIMEOUT = 2 * 60 * 60

parser = OptionParser()
parser.add_option('-r', dest='recover', action='store_true',
                  help='run recovery mode (implicit after any crash)')
parser.add_option('--background', dest='background', action='store_true',
                  default=False,
                  help='runs the scheduler monitor on background')
options, args = parser.parse_args()

# The babysitter takes no positional arguments.
if args:
    parser.print_help()
    sys.exit(1)

# All paths are derived from the parent of the directory holding this script.
autodir = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
results_dir = os.path.join(autodir, 'results')
monitor_db_path = os.path.join(autodir, 'scheduler/monitor_db.py')
# -r has no default, so options.recover is None unless the flag was given.
recover = bool(options.recover)
31
32
def run_banner_output(cmd):
    """Run a command and return its output underneath a banner line.

    The returned string consists of the command centered in a 60-column row
    of dashes, a newline, the command's stdout followed by its stderr, and
    two trailing newlines.

    The command is run through utils.run with ignore_status=True and a
    30 second timeout. If utils.run raises error.CmdError, the output
    portion of the result is the text 'Timed out'.

    cmd: the command line to run, as a string.
    """
    try:
        result = utils.run(cmd, ignore_status=True, timeout=30)
        command_output = result.stdout + result.stderr
    except error.CmdError:
        command_output = 'Timed out'

    # Build the result with a single formatting operation. Formatting the
    # banner first and then applying '%' to the result a second time would
    # treat any '%' that appears in cmd as a conversion specifier and either
    # raise or garble the output.
    return '%s\n%s\n\n' % (cmd.center(60, '-'), command_output)
44
45
def kill_monitor():
    """Stop the monitor_db program identified by monitor_db.PID_FILE_PREFIX.

    SIGINT is sent first to request a shutdown. If the program is still
    reported alive immediately afterwards, it is given 30 seconds and is
    then sent SIGTERM.
    """
    logging.info("Killing monitor_db")
    # try shutdown first
    utils.signal_program(monitor_db.PID_FILE_PREFIX, sig=signal.SIGINT)
    if not utils.program_is_alive(monitor_db.PID_FILE_PREFIX):
        # It went away right after the SIGINT; nothing more to do.
        return

    # give it some time to shutdown
    time.sleep(30)
    # Kill it through the same pid-file based helper used for the SIGINT
    # above; the previous call here went to a differently named helper
    # (signal_process) with the same pid-file prefix argument.
    # NOTE(review): SIGTERM is passed explicitly so the call does not depend
    # on the helper's default signal - confirm SIGTERM (rather than SIGKILL)
    # is the intended final signal.
    utils.signal_program(monitor_db.PID_FILE_PREFIX, sig=signal.SIGTERM)
55
56
def handle_sigterm(signum, frame):
    """SIGTERM handler: stop monitor_db, remove our PID file, exit with 1.

    signum and frame are supplied by the signal machinery and are not used.
    """
    logging.info('Caught SIGTERM')
    # Take the supervised scheduler down with us.
    kill_monitor()
    utils.delete_pid_file_if_exists(monitor_db.BABYSITTER_PID_FILE_PREFIX)
    raise SystemExit(1)


# From here on a SIGTERM sent to the babysitter runs the handler above.
signal.signal(signal.SIGTERM, handle_sigterm)
64
65
# Base class for MonitorProc, looked up in the site_monitor_db_babysitter
# module. `object` is passed as the last argument - presumably the fallback
# used when no site-specific class is available (confirm against
# utils.import_site_class).
SiteMonitorProc = utils.import_site_class(
    __file__, 'autotest_lib.scheduler.site_monitor_db_babysitter',
    'SiteMonitorProc', object)


class MonitorProc(SiteMonitorProc):
    """Launches monitor_db and reports whether it is still making progress.

    Progress is judged by two signals: the child process must still be
    running, and the scheduler log file must have changed size within the
    last STALL_TIMEOUT seconds.
    """

    def __init__(self, do_recovery=False):
        """Prepare (but do not start) a monitor_db process.

        Any running monitor_db is killed first via kill_monitor(). The
        scheduler log name is exported through the
        AUTOTEST_SCHEDULER_LOG_NAME environment variable and the log path is
        remembered so that is_running() can watch the file.

        do_recovery: when true, '--recover-hosts' is added to the monitor_db
                command line.
        """
        args = [monitor_db_path]
        if do_recovery:
            args.append("--recover-hosts")
        args.append(results_dir)

        kill_monitor()
        scheduler_config = scheduler_logging_config.SchedulerLoggingConfig
        log_name = scheduler_config.get_log_name()
        os.environ['AUTOTEST_SCHEDULER_LOG_NAME'] = log_name
        scheduler_log_dir = scheduler_config.get_server_log_dir()
        self.log_path = os.path.join(scheduler_log_dir, log_name)

        # Stall detection state: last observed log size and the time at
        # which the size last changed.
        self.log_size = 0
        self.last_log_change = time.time()

        logging.info("STARTING monitor_db with log file %s", self.log_path)
        self.args = args

        # Allow site specific code to run, set environment variables and
        # modify self.args if desired.
        super(MonitorProc, self).__init__()


    def start(self):
        """Spawn monitor_db with self.args, discarding its stdout."""
        devnull = open(os.devnull, 'w')
        try:
            self.proc = subprocess.Popen(self.args, stdout=devnull)
        finally:
            # The child has its own copy of the descriptor once Popen
            # returns; closing ours avoids leaking one handle per restart.
            devnull.close()


    def is_running(self):
        """Return True while monitor_db is alive and its log keeps changing.

        Returns False when the process has exited, or when the log file size
        has not changed for more than STALL_TIMEOUT seconds. In the stalled
        case diagnostic information is collected before returning.
        """
        if self.proc.poll() is not None:
            logging.info("monitor_db DIED")
            return False

        # NOTE(review): os.path.getsize raises OSError when the log file
        # does not exist; that error is not handled here.
        new_size = os.path.getsize(self.log_path)
        if new_size != self.log_size:
            logging.info("Log was touched")
            self.log_size = new_size
            self.last_log_change = time.time()
            return True

        if self.last_log_change + STALL_TIMEOUT < time.time():
            logging.info("monitor_db STALLED")
            self.collect_stalled_info()
            return False

        return True


    def collect_stalled_info(self):
        """Write system diagnostics to '<log_path>.stall_info'.

        Runs uptime, ps and iostat, plus a mysqladmin processlist when the
        BACKUP user/password settings are available, and writes each
        command's banner and output to the file, overwriting any previous
        contents.
        """
        commands = ['uptime',
                    'ps auxwww',
                    'iostat -k -x 2 4',
                   ]
        config = global_config.global_config
        try:
            user = config.get_config_value("BACKUP", "user")
            password = config.get_config_value("BACKUP", "password")
        except global_config.ConfigError:
            # Best effort: without BACKUP credentials the database process
            # list is simply left out.
            pass
        else:
            # NOTE(review): the password is placed on the command line, and
            # run_banner_output echoes the command in its banner, so the
            # password also ends up in the .stall_info file.
            commands.append(
                '/usr/bin/mysqladmin --verbose processlist -u%s -p%s'
                % (user, password))

        log = open(self.log_path + '.stall_info', "w")
        try:
            for cmd in commands:
                log.write(run_banner_output(cmd))
        finally:
            # Close the file even if a command or a write raises.
            log.close()
141
142
# Refuse to run as root. Logging has not been configured yet at this point.
if os.getuid() == 0:
    logging.critical("Running as root, aborting!")
    sys.exit(1)

# Refuse to start when a babysitter is already reported alive.
if utils.program_is_alive(monitor_db.BABYSITTER_PID_FILE_PREFIX):
    logging.critical("Monitor_db_babysitter already running, aborting!")
    sys.exit(1)

# NOTE(review): the PID file is written before the forks below. Presumably
# write_pid records the current process id; if so, with --background the
# recorded pid belongs to the original process, which exits at the first
# fork - confirm against utils.write_pid.
utils.write_pid(monitor_db.BABYSITTER_PID_FILE_PREFIX)

if not options.background:
    logging_manager.configure_logging(
        babysitter_logging_config.BabysitterLoggingConfig())
else:
    logging_manager.configure_logging(
        babysitter_logging_config.BabysitterLoggingConfig(use_console=False))

    # Double fork - see http://code.activestate.com/recipes/66012/
    try:
        first_child_pid = os.fork()
        if first_child_pid > 0:
            sys.exit(0) # exit from first parent
    except OSError:
        fork_error = sys.exc_info()[1]
        sys.stderr.write("fork #1 failed: (%d) %s\n"
                         % (fork_error.errno, fork_error.strerror))
        sys.exit(1)

    # Decouple from parent environment
    os.chdir("/")
    os.umask(0)
    os.setsid()

    # Second fork
    try:
        second_child_pid = os.fork()
        if second_child_pid > 0:
            sys.exit(0) # exit from second parent
    except OSError:
        fork_error = sys.exc_info()[1]
        sys.stderr.write("fork #2 failed: (%d) %s\n"
                         % (fork_error.errno, fork_error.strerror))
        sys.exit(1)
182
183
# Supervision loop: (re)launch monitor_db, poll it every PAUSE_LENGTH seconds
# and start over as soon as it is reported dead or stalled.
while True:
    # Try to bind to the same port as the status_server.
    port_probe = socket.socket()
    try:
        port_probe.bind(('localhost', status_server._PORT))
    except socket.error:
        bind_error = sys.exc_info()[1]
        # If binding failed, open the port.
        logging.error('Failed to open socket with error:%s. Closing socket.',
                      bind_error)
        # fuser -k kills whatever is using the TCP port; wait for it to
        # finish before carrying on.
        subprocess.call(['fuser', '-k', '-n', 'tcp',
                         '%d' % status_server._PORT])
    port_probe.close()

    proc = MonitorProc(do_recovery=recover)
    proc.start()
    while True:
        time.sleep(PAUSE_LENGTH)
        if not proc.is_running():
            break
        logging.info("Tick")

    # Only the first launch honours the -r option; every later restart
    # passes do_recovery=False.
    recover = False
205