1#!/usr/bin/env python2
2
3# Copyright 2017 The Chromium OS Authors. All rights reserved.
4# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6
7"""Script to upload metrics from apache access logs to Monarch."""
8
9from __future__ import print_function
10
11import argparse
12import re
13import sys
14import urlparse
15
16import common
17
18from chromite.lib import ts_mon_config
19from chromite.lib import metrics
20
21from autotest_lib.site_utils.stats import log_daemon_common
22# Not used, but needed for importing rpc_interface.
23from autotest_lib.frontend import setup_django_environment
24from autotest_lib.frontend.afe import rpc_interface
25from autotest_lib.frontend.afe import moblab_rpc_interface
26
27
28"""
29The log format is set to:
30  %v:%p %h %l %u %t \"%r\" %>s %O \"%{Referer}i\" \"%{User-Agent}i\" %T
31
32These are documented as follows:
33  (from https://httpd.apache.org/docs/current/mod/mod_log_config.html)
34
35%h: Remote host
36%l: Remote logname (from identd, if supplied)
37%O: Bytes sent, including headers. May be zero in rare cases such as when a
38    request is aborted before a response is sent. You need to enable mod_logio
39    to use this.
40%p: The canonical Port of the server serving the request
41%r: First line of request
42%s: Status.  For requests that got internally redirected, this is
43    the status of the *original* request --- %...>s for the last.
44%t: Time, in common log format time format (standard english format)
45%T: The time taken to serve the request, in seconds.
46%u: Remote user (from auth; may be bogus if return status (%s) is 401)
47%v: The canonical ServerName of the server serving the request.
48"""
49
50# Lemma: a regex to match sections delimited be double-quotes ("), which
51# possible contained escaped quotes (\").
52# This works by matching non-quotes or the string r'\"' repeatedly; then it ends
53# when finding a quote (") preceeded by a character which is not a backslash.
54MATCH_UNTIL_QUOTE = r'([^"]|\\")*[^\\]'
55
56ACCESS_MATCHER = re.compile(
57    r'^'
58    r'\S+ \S+ \S+ \S+ '               # Ignore %v:%p %h %l %u
59    r'\[[^]]+\] '                     # Ignore %t
60    r'"('                             # Begin %r
61    r'(?P<request_method>\S+) '       # e.g. POST
62    r'(?P<endpoint>\S+)'              # e.g. /afe/server/noauth/rpc/
63    + MATCH_UNTIL_QUOTE +             # Ignore protocol (e.g. HTTP/1.1)
64    r'|-'                             # The request data might just be "-"
65    r')" '                            # End %r
66    r'(?P<response_code>\d+) '        # %>s (e.g. 200)
67    r'(?P<bytes_sent>\d+)'            # %O
68    r' "' + MATCH_UNTIL_QUOTE + '"'   # Ignore Referer
69    r' "' + MATCH_UNTIL_QUOTE + '"'   # Ignore User-Agent
70    r' ?(?P<response_seconds>\d+?)'   # The server time in seconds
71    r'.*'                             # Allow adding extra stuff afterward.
72)
73
74ACCESS_TIME_METRIC = '/chromeos/autotest/http/server/response_seconds'
75ACCESS_BYTES_METRIC = '/chromeos/autotest/http/server/response_bytes'
76
77RPC_ACCESS_TIME_METRIC = '/chromeos/autotest/http/server/rpc/response_seconds'
78RPC_ACCESS_BYTES_METRIC = '/chromeos/autotest/http/server/rpc/response_bytes'
79RPC_METHOD_ARGUMENT = 'method'
80
81
82# TODO(phobbs) use a memory-efficient structure to detect non-unique paths.
83# We can't just include the endpoint because it will cause a cardinality
84# explosion.
85ALLOWLISTED_ENDPOINTS = frozenset((
86        '/',
87        '/afe/',
88        '/new_tko/server/rpc/',
89        '/afe/server/rpc/',
90        '/___rPc_sWiTcH___',
91        '*',
92        '/afe/server/noauth/rpc/',
93))
94
95
96# A bad actor could DOS Monarch by requesting millions of different RPC methods,
97# each of which would create a different stream. Only allow an allowlist of
98# methods to be recorded in Monarch.
99ALLOWLISTED_METHODS = (frozenset(dir(rpc_interface))
100                       | frozenset(dir(moblab_rpc_interface)))
101
102
103def EmitRequestMetrics(m):
104    """Emits metrics for each line in the access log.
105
106    @param m: A regex match object
107    """
108    fields = {
109        'request_method': m.groupdict().get('request_method', ''),
110        'endpoint': SanitizeEndpoint(m.group('endpoint')),
111        'response_code': int(m.group('response_code')),
112    }
113
114    send_rpc_metrics = (
115        '?' in m.group('endpoint') and '/rpc' in m.group('endpoint'))
116    if send_rpc_metrics:
117        EmitRPCMetrics(m)
118
119    # Request seconds and bytes sent are both extremely high cardinality, so
120    # they must be the VAL of a metric, not a metric field.
121    if m.group('response_seconds'):
122        response_seconds = int(m.group('response_seconds'))
123        metrics.SecondsDistribution(ACCESS_TIME_METRIC).add(response_seconds,
124                                                            fields=fields)
125
126    bytes_sent = int(m.group('bytes_sent'))
127    metrics.CumulativeDistribution(ACCESS_BYTES_METRIC).add(
128        bytes_sent, fields=fields)
129
130
131def EmitRPCMetrics(m):
132    """Emit a special metric including the method when the request was an RPC."""
133    fields = {
134            'request_method': m.groupdict().get('request_method', ''),
135            'rpc_method': ParseRPCMethod(m.group('endpoint')),
136            'response_code': int(m.group('response_code')),
137    }
138
139    if m.group('response_seconds'):
140        response_seconds = int(m.group('response_seconds'))
141        metrics.SecondsDistribution(RPC_ACCESS_TIME_METRIC).add(
142                response_seconds, fields=fields)
143
144    bytes_sent = int(m.group('bytes_sent'))
145    metrics.CumulativeDistribution(RPC_ACCESS_BYTES_METRIC).add(bytes_sent,
146                                                                fields=fields)
147
148
149def ParseRPCMethod(url):
150    """Parses the RPC method from an RPC query string.
151
152  Args:
153    url: The URL requested.
154  """
155    query = urlparse.urlparse(url).query
156    return urlparse.parse_qs(query)[RPC_METHOD_ARGUMENT][-1]
157
158
159def SanitizeEndpoint(endpoint):
160    """Returns empty string if endpoint is not allowlisted.
161
162    @param endpoint: The endpoint to sanitize.
163    """
164    if endpoint in ALLOWLISTED_ENDPOINTS:
165        return endpoint
166    else:
167        return ''
168
169
170MATCHERS = [
171    (ACCESS_MATCHER, EmitRequestMetrics),
172]
173
174
175def ParseArgs():
176    """Parses the command line arguments."""
177    p = argparse.ArgumentParser(
178        description='Parses apache logs and emits metrics to Monarch')
179    p.add_argument('--output-logfile')
180    p.add_argument('--debug-metrics-file',
181                   help='Output metrics to the given file instead of sending '
182                   'them to production.')
183    return p.parse_args()
184
185
186def Main():
187    """Sets up logging and runs matchers against stdin."""
188    args = ParseArgs()
189    log_daemon_common.SetupLogging(args)
190
191    # Set up metrics sending and go.
192    ts_mon_args = {}
193    if args.debug_metrics_file:
194        ts_mon_args['debug_file'] = args.debug_metrics_file
195
196    with ts_mon_config.SetupTsMonGlobalState('apache_access_log_metrics',
197                                             **ts_mon_args):
198        log_daemon_common.RunMatchers(sys.stdin, MATCHERS)
199
200
201if __name__ == '__main__':
202    Main()
203