#!/usr/bin/env python
# Copyright 2010 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

16"""View and edit HTTP Archives.
17
18To list all URLs in an archive:
19  $ ./httparchive.py ls archive.wpr
20
21To view the content of all URLs from example.com:
22  $ ./httparchive.py cat --host example.com archive.wpr
23
24To view the content of a particular URL:
25  $ ./httparchive.py cat --host www.example.com --full_path /foo archive.wpr
26
27To view the content of all URLs:
28  $ ./httparchive.py cat archive.wpr
29
30To edit a particular URL:
31  $ ./httparchive.py edit --host www.example.com --full_path /foo archive.wpr
32
33To print statistics of an archive:
34  $ ./httparchive.py stats archive.wpr
35
36To print statistics of a set of URLs:
37  $ ./httparchive.py stats --host www.example.com archive.wpr
38
To merge multiple archives:
  $ ./httparchive.py merge --merged_file new.wpr archive1.wpr archive2.wpr ...
"""

import calendar
import certutils
import datetime
import cPickle
import difflib
import email.utils
import httplib
import httpzlib
import json
import logging
import optparse
import os
import StringIO
import subprocess
import sys
import tempfile
import time
import urlparse
from collections import defaultdict


def LogRunTime(fn):
  """Decorator that logs the run time of the wrapped method."""
  def wrapped(self, *args, **kwargs):
    start_time = time.time()
    try:
      return fn(self, *args, **kwargs)
    finally:
      run_time = (time.time() - start_time) * 1000.0
      logging.debug('%s: %dms', fn.__name__, run_time)
  return wrapped

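# Illustrative usage sketch (the class and method below are hypothetical, not
# part of this module): LogRunTime wraps a method and logs its duration in
# milliseconds at DEBUG level.
#
#   class ArchiveLoader(object):
#     @LogRunTime
#     def load_many(self, filenames):
#       return [HttpArchive.Load(f) for f in filenames]
#
# Each call then produces a log line like "load_many: 12ms".
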
class HttpArchiveException(Exception):
  """Base class for all exceptions in httparchive."""
  pass


class HttpArchive(dict):
  """Dict with ArchivedHttpRequest keys and ArchivedHttpResponse values.

  Attributes:
    responses_by_host: dict of {hostname: {request: response}}. This must
        remain in sync with the underlying dict of self. It is used as an
        optimization so that get_requests() doesn't have to linearly search
        all requests in the archive to find potential matches.
  """

  def __init__(self):  # pylint: disable=super-init-not-called
    self.responses_by_host = defaultdict(dict)

  def __setstate__(self, state):
    """Influence how to unpickle.

    Args:
      state: a dictionary for __dict__
    """
    self.__dict__.update(state)
    self.responses_by_host = defaultdict(dict)
    for request in self:
      self.responses_by_host[request.host][request] = self[request]

  def __getstate__(self):
    """Influence how to pickle.

    Returns:
      a dict to use for pickling
    """
    state = self.__dict__.copy()
    del state['responses_by_host']
    return state

  def __setitem__(self, key, value):
    super(HttpArchive, self).__setitem__(key, value)
    if hasattr(self, 'responses_by_host'):
      self.responses_by_host[key.host][key] = value

  def __delitem__(self, key):
    super(HttpArchive, self).__delitem__(key)
    del self.responses_by_host[key.host][key]

  def get(self, request, default=None):
    """Return the archived response for a given request.

    Does extra checking for handling some HTTP request headers.

    Args:
      request: instance of ArchivedHttpRequest
      default: default value to return if request is not found

    Returns:
      Instance of ArchivedHttpResponse or default if no matching
      response is found
    """
    if request in self:
      return self[request]
    return self.get_conditional_response(request, default)

  def get_conditional_response(self, request, default):
    """Get the response based on the conditional HTTP request headers.

    Args:
      request: an ArchivedHttpRequest representing the original request.
      default: the default ArchivedHttpResponse to return if no archived
          response matches the request with its conditional headers removed.

    Returns:
      an ArchivedHttpResponse with a status of 200, 304 (not modified), or
          412 (precondition failed)
    """
    response = default
    if request.is_conditional():
      stripped_request = request.create_request_without_conditions()
      if stripped_request in self:
        response = self[stripped_request]
        if response.status == 200:
          status = self.get_conditional_status(request, response)
          if status != 200:
            response = create_response(status)
    return response

  def get_conditional_status(self, request, response):
    """Return the HTTP status to use for a conditional request.

    Evaluates If-Match, If-None-Match, If-Modified-Since and
    If-Unmodified-Since against the archived response and returns
    200, 304, or 412.
    """
    status = 200
    last_modified = email.utils.parsedate(
        response.update_date(response.get_header('last-modified')))
    response_etag = response.get_header('etag')
    is_get_or_head = request.command.upper() in ('GET', 'HEAD')

    match_value = request.headers.get('if-match', None)
    if match_value:
      if self.is_etag_match(match_value, response_etag):
        status = 200
      else:
        status = 412  # precondition failed
    none_match_value = request.headers.get('if-none-match', None)
    if none_match_value:
      if self.is_etag_match(none_match_value, response_etag):
        status = 304
      elif is_get_or_head:
        status = 200
      else:
        status = 412
    if is_get_or_head and last_modified:
      for header in ('if-modified-since', 'if-unmodified-since'):
        date = email.utils.parsedate(request.headers.get(header, None))
        if date:
          if ((header == 'if-modified-since' and last_modified > date) or
              (header == 'if-unmodified-since' and last_modified < date)):
            if status != 412:
              status = 200
          else:
            status = 304  # not modified
    return status

  @staticmethod
  def is_etag_match(request_etag, response_etag):
    """Determines whether the entity tags of the request/response match.

    Args:
      request_etag: the value string of the "if-(none)-match:"
                    portion of the request header
      response_etag: the etag value of the response

    Returns:
      True on match, False otherwise
    """
    response_etag = response_etag.strip('" ')
    for etag in request_etag.split(','):
      etag = etag.strip('" ')
      if etag in ('*', response_etag):
        return True
    return False

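  # Illustrative example (values are made up): a wildcard or any listed tag
  # matches the response's ETag, ignoring surrounding quotes and spaces.
  #
  #   HttpArchive.is_etag_match('"xyzzy", "r2d2"', '"xyzzy"')  # -> True
  #   HttpArchive.is_etag_match('*', '"anything"')             # -> True
  #   HttpArchive.is_etag_match('"c3po"', '"xyzzy"')           # -> False
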
  def get_requests(self, command=None, host=None, full_path=None, is_ssl=None,
                   use_query=True):
    """Return a list of requests that match the given args."""
    if host:
      return [r for r in self.responses_by_host[host]
              if r.matches(command, None, full_path, is_ssl,
                           use_query=use_query)]
    else:
      return [r for r in self
              if r.matches(command, host, full_path, is_ssl,
                           use_query=use_query)]

  def ls(self, command=None, host=None, full_path=None):
    """List all URLs that match given params."""
    return ''.join(sorted(
        '%s\n' % r for r in self.get_requests(command, host, full_path)))

  def cat(self, command=None, host=None, full_path=None):
    """Print the contents of all URLs that match given params."""
    out = StringIO.StringIO()
    for request in self.get_requests(command, host, full_path):
      print >>out, str(request)
      print >>out, 'Untrimmed request headers:'
      for k in request.headers:
        print >>out, '    %s: %s' % (k, request.headers[k])
      if request.request_body:
        print >>out, request.request_body
      print >>out, '---- Response Info', '-' * 51
      response = self[request]
      chunk_lengths = [len(x) for x in response.response_data]
      print >>out, ('Status: %s\n'
                    'Reason: %s\n'
                    'Headers delay: %s\n'
                    'Untrimmed response headers:') % (
          response.status, response.reason, response.delays['headers'])
      for k, v in response.original_headers:
        print >>out, '    %s: %s' % (k, v)
      print >>out, ('Chunk count: %s\n'
                    'Chunk lengths: %s\n'
                    'Chunk delays: %s') % (
          len(chunk_lengths), chunk_lengths, response.delays['data'])
      body = response.get_data_as_text()
      print >>out, '---- Response Data', '-' * 51
      if body:
        print >>out, body
      else:
        print >>out, '[binary data]'
      print >>out, '=' * 70
    return out.getvalue()

  def stats(self, command=None, host=None, full_path=None):
    """Print stats about the archive for all URLs that match given params."""
    matching_requests = self.get_requests(command, host, full_path)
    if not matching_requests:
      print 'Failed to find any requests matching given command, host, path.'
      return

    out = StringIO.StringIO()
    stats = {
        'Total': len(matching_requests),
        'Domains': defaultdict(int),
        'HTTP_response_code': defaultdict(int),
        'content_type': defaultdict(int),
        'Documents': defaultdict(int),
        }

    for request in matching_requests:
      stats['Domains'][request.host] += 1
      stats['HTTP_response_code'][self[request].status] += 1

      content_type = self[request].get_header('content-type')
      # Remove content type options for readability and higher level groupings.
      str_content_type = str(content_type.split(';')[0]
                             if content_type else None)
      stats['content_type'][str_content_type] += 1

      # Documents are the main URL requested and not a referenced resource.
      if str_content_type == 'text/html' and 'referer' not in request.headers:
        stats['Documents'][request.host] += 1

    print >>out, json.dumps(stats, indent=4)
    return out.getvalue()

  def merge(self, merged_archive=None, other_archives=None):
    """Merge multiple archives into merged_archive by 'chaining' resources.

    Only resources that are not already part of the accumulated archive
    are added.
    """
    if not other_archives:
      print 'No archives passed to merge'
      return

    # Note we already loaded 'replay_file'.
    print 'Loaded %d responses' % len(self)

    for archive in other_archives:
      if not os.path.exists(archive):
        print 'Error: Replay file "%s" does not exist' % archive
        return

      http_archive_other = HttpArchive.Load(archive)
      print 'Loaded %d responses from %s' % (len(http_archive_other), archive)
      for r in http_archive_other:
        # Only resources that are not already part of the current archive
        # get added.
        if r not in self:
          print '\t %s ' % r
          self[r] = http_archive_other[r]
    self.Persist('%s' % merged_archive)

  def edit(self, command=None, host=None, full_path=None):
    """Edits the single request which matches given params."""
    editor = os.getenv('EDITOR')
    if not editor:
      print 'You must set the EDITOR environment variable.'
      return

    matching_requests = self.get_requests(command, host, full_path)
    if not matching_requests:
      print ('Failed to find any requests matching given command, host, '
             'full_path.')
      return

    if len(matching_requests) > 1:
      print 'Found multiple matching requests. Please refine.'
      print self.ls(command, host, full_path)

    response = self[matching_requests[0]]
    tmp_file = tempfile.NamedTemporaryFile(delete=False)
    tmp_file.write(response.get_response_as_text())
    tmp_file.close()
    subprocess.check_call([editor, tmp_file.name])
    with open(tmp_file.name) as f:
      response.set_response_from_text(f.read())
    os.remove(tmp_file.name)

  def find_closest_request(self, request, use_path=False):
    """Find the closest matching request in the archive to the given request.

    Args:
      request: an ArchivedHttpRequest
      use_path: If True, the closest matching request's path component must
        match. (Note: this refers to the 'path' component within the URL, not
        the 'full path' which includes the query string component.)

        If use_path=True, the candidate will NOT match in the example below:
        e.g. request   = GET www.test.com/a?p=1
             candidate = GET www.test.com/b?p=1

        Even if use_path=False, URLs with the same path are always favored.
        For example, candidate1 is considered a better match than candidate2.
          request    = GET www.test.com/a?p=1&q=2&r=3
          candidate1 = GET www.test.com/a?s=4
          candidate2 = GET www.test.com/b?p=1&q=2&r=3

    Returns:
      If a close match is found, return the instance of ArchivedHttpRequest.
      Otherwise, return None.
    """
    # Start with strictest constraints. This trims search space considerably.
    requests = self.get_requests(request.command, request.host,
                                 request.full_path, is_ssl=request.is_ssl,
                                 use_query=True)
    # Relax constraint: use_query if there is no match.
    if not requests:
      requests = self.get_requests(request.command, request.host,
                                   request.full_path, is_ssl=request.is_ssl,
                                   use_query=False)
    # Relax constraint: full_path if there is no match and use_path=False.
    if not requests and not use_path:
      requests = self.get_requests(request.command, request.host,
                                   None, is_ssl=request.is_ssl,
                                   use_query=False)

    if not requests:
      return None

    if len(requests) == 1:
      return requests[0]

    matcher = difflib.SequenceMatcher(b=request.cmp_seq)

    # quick_ratio() is cheap to compute, but ratio() is expensive. So we call
    # quick_ratio() on all requests, sort them descending, and then loop
    # through until we find a candidate whose ratio() is >= the next
    # quick_ratio(). This works because quick_ratio() is guaranteed to be an
    # upper bound on ratio().
    candidates = []
    for candidate in requests:
      matcher.set_seq1(candidate.cmp_seq)
      candidates.append((matcher.quick_ratio(), candidate))

    candidates.sort(reverse=True, key=lambda c: c[0])

    best_match = (0, None)
    for i in xrange(len(candidates)):
      matcher.set_seq1(candidates[i][1].cmp_seq)
      best_match = max(best_match, (matcher.ratio(), candidates[i][1]))
      if i + 1 < len(candidates) and best_match[0] >= candidates[i + 1][0]:
        break
    return best_match[1]

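  # Why the early exit above is safe (illustrative, using difflib's documented
  # guarantee that ratio() <= quick_ratio() for the same pair of sequences):
  #
  #   >>> import difflib
  #   >>> m = difflib.SequenceMatcher(a='GET /a?p=1', b='GET /b?p=1')
  #   >>> m.ratio() <= m.quick_ratio()
  #   True
  #
  # Once the best ratio() seen so far is >= the next candidate's quick_ratio(),
  # no later candidate can have a higher ratio(), so the loop can stop.
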
  def diff(self, request):
    """Diff the given request to the closest matching request in the archive.

    Args:
      request: an ArchivedHttpRequest
    Returns:
      If a close match is found, return a textual diff between the requests.
      Otherwise, return None.
    """
    request_lines = request.formatted_request.split('\n')
    closest_request = self.find_closest_request(request)
    if closest_request:
      closest_request_lines = closest_request.formatted_request.split('\n')
      return '\n'.join(difflib.ndiff(closest_request_lines, request_lines))
    return None

  def get_server_cert(self, host):
    """Gets the certificate from the server and stores it in the archive."""
    request = ArchivedHttpRequest('SERVER_CERT', host, '', None, {})
    if request not in self:
      self[request] = create_response(200, body=certutils.get_host_cert(host))
    return self[request].response_data[0]

  def get_certificate(self, host):
    request = ArchivedHttpRequest('DUMMY_CERT', host, '', None, {})
    if request not in self:
      self[request] = create_response(200, body=self._generate_cert(host))
    return self[request].response_data[0]

  @classmethod
  def AssertWritable(cls, filename):
    """Raises an IOError if filename is not writable."""
    persist_dir = os.path.dirname(os.path.abspath(filename))
    if not os.path.exists(persist_dir):
      raise IOError('Directory does not exist: %s' % persist_dir)
    if os.path.exists(filename):
      if not os.access(filename, os.W_OK):
        raise IOError('Need write permission on file: %s' % filename)
    elif not os.access(persist_dir, os.W_OK):
      raise IOError('Need write permission on directory: %s' % persist_dir)

  @classmethod
  def Load(cls, filename):
    """Load an instance from filename."""
    return cPickle.load(open(filename, 'rb'))

  def Persist(self, filename):
    """Persist all state to filename."""
    try:
      original_checkinterval = sys.getcheckinterval()
      sys.setcheckinterval(2**31-1)  # Lock out other threads so nothing can
                                     # modify |self| during pickling.
      pickled_self = cPickle.dumps(self, cPickle.HIGHEST_PROTOCOL)
    finally:
      sys.setcheckinterval(original_checkinterval)
    with open(filename, 'wb') as f:
      f.write(pickled_self)

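# Illustrative round trip (the file name is hypothetical): archives are plain
# pickles of the HttpArchive dict, so editing and saving looks like this:
#
#   archive = HttpArchive.Load('archive.wpr')
#   for request in archive.get_requests(host='www.example.com'):
#     print request
#   HttpArchive.AssertWritable('archive.wpr')
#   archive.Persist('archive.wpr')
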
class ArchivedHttpRequest(object):
  """Record all the state that goes into a request.

  ArchivedHttpRequest instances are considered immutable so they can
  serve as keys for HttpArchive instances.
  (The immutability is not enforced.)

  Upon creation, the headers are "trimmed" (i.e. edited or dropped)
  and saved to self.trimmed_headers to allow requests to match in a wider
  variety of playback situations (e.g. using different user agents).

  For unpickling, 'trimmed_headers' is recreated from 'headers'. That
  allows for changes to the trim function and can help with debugging.
  """
  CONDITIONAL_HEADERS = [
      'if-none-match', 'if-match',
      'if-modified-since', 'if-unmodified-since']

  def __init__(self, command, host, full_path, request_body, headers,
               is_ssl=False):
    """Initialize an ArchivedHttpRequest.

    Args:
      command: a string (e.g. 'GET' or 'POST').
      host: a host name (e.g. 'www.google.com').
      full_path: a request path.  Includes everything after the host & port in
          the URL (e.g. '/search?q=dogs').
      request_body: a request body string for a POST or None.
      headers: {key: value, ...} where key and value are strings.
      is_ssl: a boolean which is True iff the request is made via SSL.
    """
    self.command = command
    self.host = host
    self.full_path = full_path
    parsed_url = urlparse.urlparse(full_path) if full_path else None
    self.path = parsed_url.path if parsed_url else None
    self.request_body = request_body
    self.headers = headers
    self.is_ssl = is_ssl
    self.trimmed_headers = self._TrimHeaders(headers)
    self.formatted_request = self._GetFormattedRequest()
    self.cmp_seq = self._GetCmpSeq(parsed_url.query if parsed_url else None)

  def __str__(self):
    scheme = 'https' if self.is_ssl else 'http'
    return '%s %s://%s%s %s' % (
        self.command, scheme, self.host, self.full_path, self.trimmed_headers)

  def __repr__(self):
    return repr((self.command, self.host, self.full_path, self.request_body,
                 self.trimmed_headers, self.is_ssl))

  def __hash__(self):
    """Return an integer hash to use for hashed collections including dict."""
    return hash(repr(self))

  def __eq__(self, other):
    """Define the __eq__ method to match the hash behavior."""
    return repr(self) == repr(other)

  def __setstate__(self, state):
    """Influence how to unpickle.

    "headers" are the original request headers.
    "trimmed_headers" are the trimmed headers used for matching requests
    during replay.

    Args:
      state: a dictionary for __dict__
    """
    if 'full_headers' in state:
      # Fix older version of archive.
      state['headers'] = state['full_headers']
      del state['full_headers']
    if 'headers' not in state:
      raise HttpArchiveException(
          'Archived HTTP request is missing "headers". The HTTP archive is'
          ' likely from a previous version and must be re-recorded.')
    if 'path' in state:
      # Before, 'path' and 'path_without_query' were used and 'path' was
      # pickled.  Now, 'path' has been renamed to 'full_path' and
      # 'path_without_query' has been renamed to 'path'.  'full_path' is
      # pickled, but 'path' is not.  If we see 'path' here it means we are
      # dealing with an older archive.
      state['full_path'] = state['path']
      del state['path']
    state['trimmed_headers'] = self._TrimHeaders(dict(state['headers']))
    if 'is_ssl' not in state:
      state['is_ssl'] = False
    self.__dict__.update(state)
    parsed_url = urlparse.urlparse(self.full_path)
    self.path = parsed_url.path
    self.formatted_request = self._GetFormattedRequest()
    self.cmp_seq = self._GetCmpSeq(parsed_url.query)

  def __getstate__(self):
    """Influence how to pickle.

    Returns:
      a dict to use for pickling
    """
    state = self.__dict__.copy()
    del state['trimmed_headers']
    del state['path']
    del state['formatted_request']
    del state['cmp_seq']
    return state

  def _GetFormattedRequest(self):
    """Format request to make diffs easier to read.

    Returns:
      A string consisting of the request. Example:
      'GET www.example.com/path\nHeader-Key: header value\n'
    """
    parts = ['%s %s%s\n' % (self.command, self.host, self.full_path)]
    if self.request_body:
      parts.append('%s\n' % self.request_body)
    for k, v in self.trimmed_headers:
      k = '-'.join(x.capitalize() for x in k.split('-'))
      parts.append('%s: %s\n' % (k, v))
    return ''.join(parts)

  def _GetCmpSeq(self, query=None):
    """Compute a sequence out of query and headers for difflib to compare.

    For example:
      [('q1', 'a1'), ('q2', 'a2'), ('k1', 'v1'), ('k2', 'v2')]
    will be returned for a request with URL:
      http://example.com/index.html?q1=a1&q2=a2
    and headers:
      k1: v1
      k2: v2

    Args:
      query: the query string in the URL.

    Returns:
      A sequence for difflib to compare.
    """
    if not query:
      return self.trimmed_headers
    return sorted(urlparse.parse_qsl(query)) + self.trimmed_headers

  def matches(self, command=None, host=None, full_path=None, is_ssl=None,
              use_query=True):
    """Returns true iff the request matches all parameters.

    Args:
      command: a string (e.g. 'GET' or 'POST').
      host: a host name (e.g. 'www.google.com').
      full_path: a request path with query string (e.g. '/search?q=dogs')
      is_ssl: whether the request is secure.
      use_query:
        If use_query is True, request matching uses both the hierarchical path
        and query string component.
        If use_query is False, request matching only uses the hierarchical path

        e.g. req1 = GET www.test.com/index?aaaa
             req2 = GET www.test.com/index?bbbb

        If use_query is True, req1.matches(req2) evaluates to False
        If use_query is False, req1.matches(req2) evaluates to True

    Returns:
      True iff the request matches all parameters
    """
    if command is not None and command != self.command:
      return False
    if is_ssl is not None and is_ssl != self.is_ssl:
      return False
    if host is not None and host != self.host:
      return False
    if full_path is None:
      return True
    if use_query:
      return full_path == self.full_path
    else:
      return self.path == urlparse.urlparse(full_path).path

  @classmethod
  def _TrimHeaders(cls, headers):
    """Removes headers that are known to cause problems during replay.

    These headers are removed for the following reasons:
    - accept: Causes problems with www.bing.com. During record, CSS is fetched
              with *. During replay, it's text/css.
    - accept-charset, accept-language, referer: vary between clients.
    - cache-control: sometimes sent from Chrome with 'max-age=0' as value.
    - connection, method, scheme, url, version: Cause problems with spdy.
    - cookie: Extremely sensitive to request/response order.
    - keep-alive: Doesn't affect the content of the request, only some
      transient state of the transport layer.
    - user-agent: Changes with every Chrome version.
    - proxy-connection: Sent for proxy requests.
    - x-chrome-variations, x-client-data: Unique to each Chrome binary. Used by
      Google to collect statistics about Chrome's enabled features.

    Another variant to consider is dropping only the value from the header.
    However, this is particularly bad for the cookie header, because the
    presence of the cookie depends on the responses we've seen when the request
    is made.

    Args:
      headers: {header_key: header_value, ...}

    Returns:
      [(header_key, header_value), ...]  # (with undesirable headers removed)
    """
    # TODO(tonyg): Strip sdch from the request headers because we can't
    # guarantee that the dictionary will be recorded, so replay may not work.
    if 'accept-encoding' in headers:
      accept_encoding = headers['accept-encoding']
      accept_encoding = accept_encoding.replace('sdch', '')
      # Strip lzma so Opera's requests match archives recorded using Chrome.
      accept_encoding = accept_encoding.replace('lzma', '')
      stripped_encodings = [e.strip() for e in accept_encoding.split(',')]
      accept_encoding = ','.join(filter(bool, stripped_encodings))
      headers['accept-encoding'] = accept_encoding
    undesirable_keys = [
        'accept', 'accept-charset', 'accept-language', 'cache-control',
        'connection', 'cookie', 'keep-alive', 'method',
        'referer', 'scheme', 'url', 'version', 'user-agent', 'proxy-connection',
        'x-chrome-variations', 'x-client-data']
    return sorted([(k, v) for k, v in headers.items()
                   if k.lower() not in undesirable_keys])

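  # Illustrative example (header values are made up): trimming keeps headers
  # that are stable across recordings and drops the volatile ones, returning
  # a sorted list of (key, value) tuples.
  #
  #   ArchivedHttpRequest._TrimHeaders({
  #       'Host': 'www.example.com',
  #       'Cookie': 'session=abc123',
  #       'user-agent': 'Mozilla/5.0',
  #   })
  #   # -> [('Host', 'www.example.com')]
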
  def is_conditional(self):
    """Return True if the request has any conditional headers."""
    for header in self.CONDITIONAL_HEADERS:
      if header in self.headers:
        return True
    return False

  def create_request_without_conditions(self):
    """Return a copy of this request with the conditional headers removed."""
    stripped_headers = dict((k, v) for k, v in self.headers.iteritems()
                            if k.lower() not in self.CONDITIONAL_HEADERS)
    return ArchivedHttpRequest(
        self.command, self.host, self.full_path, self.request_body,
        stripped_headers, self.is_ssl)


class ArchivedHttpResponse(object):
  """All the data needed to recreate an HTTP response.

  Upon creation, the headers are "trimmed" (i.e. edited or dropped).
  The original headers are saved to self.original_headers, while the
  trimmed ones are used to allow responses to match in a wider variety
  of playback situations.

  For pickling, 'original_headers' are stored in the archive.  For unpickling
  the headers are trimmed again. That allows for changes to the trim
  function and can help with debugging.
  """

  # CHUNK_EDIT_SEPARATOR is used to edit and view text content.
  # It is not sent in responses. It is added by get_data_as_text()
  # and removed by set_data().
  CHUNK_EDIT_SEPARATOR = '[WEB_PAGE_REPLAY_CHUNK_BOUNDARY]'

  # DELAY_EDIT_SEPARATOR is used to edit and view server delays.
  DELAY_EDIT_SEPARATOR = ('\n[WEB_PAGE_REPLAY_EDIT_ARCHIVE --- '
                          'Delays are above. Response content is below.]\n')

  # This date was used in deterministic.js prior to switching to recorded
  # request time.  See https://github.com/chromium/web-page-replay/issues/71
  # for details.
  DEFAULT_REQUEST_TIME = datetime.datetime(2008, 2, 29, 2, 26, 8, 254000)

  def __init__(self, version, status, reason, headers, response_data,
               delays=None, request_time=None):
    """Initialize an ArchivedHttpResponse.

    Args:
      version: HTTP protocol version used by server.
          10 for HTTP/1.0, 11 for HTTP/1.1 (same as httplib).
      status: Status code returned by server (e.g. 200).
      reason: Reason phrase returned by server (e.g. "OK").
      headers: list of (header, value) tuples.
      response_data: list of content chunks.
          Concatenating the chunks gives the complete contents
          (i.e. the chunks do not have any lengths or delimiters).
          Do not include the final, zero-length chunk that marks the end.
      delays: dict of (ms) delays for 'connect', 'headers' and 'data'.
          e.g. {'connect': 50, 'headers': 150, 'data': [0, 10, 10]}
          connect - The time to connect to the server.
            Each resource has a value because Replay's record mode captures it.
            This includes the time for the SYN and SYN/ACK (1 rtt).
          headers - The time elapsed between the TCP connect and the headers.
            This typically includes all the server-time to generate a response.
          data - If the response is chunked, these are the times for each
            chunk.
      request_time: datetime of the original request, or None to use
          DEFAULT_REQUEST_TIME.
    """
    self.version = version
    self.status = status
    self.reason = reason
    self.original_headers = headers
    self.headers = self._TrimHeaders(headers)
    self.response_data = response_data
    self.delays = delays
    self.fix_delays()
    self.request_time = (
        request_time or ArchivedHttpResponse.DEFAULT_REQUEST_TIME)

  def fix_delays(self):
    """Initialize delays, or check the number of data delays."""
    expected_num_delays = len(self.response_data)
    if not self.delays:
      self.delays = {
          'connect': 0,
          'headers': 0,
          'data': [0] * expected_num_delays
          }
    else:
      num_delays = len(self.delays['data'])
      if num_delays != expected_num_delays:
        raise HttpArchiveException(
            'Server delay length mismatch: %d (expected %d): %s' % (
                num_delays, expected_num_delays, self.delays['data']))

  @classmethod
  def _TrimHeaders(cls, headers):
    """Removes headers that are known to cause problems during replay.

    These headers are removed for the following reasons:
    - content-security-policy: Causes problems with script injection.
    """
    undesirable_keys = ['content-security-policy']
    return [(k, v) for k, v in headers if k.lower() not in undesirable_keys]

  def __repr__(self):
    return repr((self.version, self.status, self.reason, sorted(self.headers),
                 self.response_data, self.request_time))

  def __hash__(self):
    """Return an integer hash to use for hashed collections including dict."""
    return hash(repr(self))

  def __eq__(self, other):
    """Define the __eq__ method to match the hash behavior."""
    return repr(self) == repr(other)

  def __setstate__(self, state):
    """Influence how to unpickle.

    "original_headers" are the original response headers.
    "headers" are the trimmed headers used for replaying responses.

    Args:
      state: a dictionary for __dict__
    """
    if 'server_delays' in state:
      state['delays'] = {
          'connect': 0,
          'headers': 0,
          'data': state['server_delays']
          }
      del state['server_delays']
    elif 'delays' not in state:
      state['delays'] = None
    # Set to the date that was hardcoded in deterministic.js originally.
    state.setdefault('request_time', ArchivedHttpResponse.DEFAULT_REQUEST_TIME)
    state['original_headers'] = state['headers']
    state['headers'] = self._TrimHeaders(state['original_headers'])
    self.__dict__.update(state)
    self.fix_delays()

  def __getstate__(self):
    """Influence how to pickle.

    Returns:
      a dict to use for pickling
    """
    state = self.__dict__.copy()
    state['headers'] = state['original_headers']
    del state['original_headers']
    return state

  def get_header(self, key, default=None):
    for k, v in self.headers:
      if key.lower() == k.lower():
        return v
    return default

  def set_header(self, key, value):
    for i, (k, _) in enumerate(self.headers):
      if key == k:
        self.headers[i] = (key, value)
        return
    self.headers.append((key, value))

  def remove_header(self, key):
    for i, (k, _) in enumerate(self.headers):
      if key.lower() == k.lower():
        self.headers.pop(i)
        return

  @staticmethod
  def _get_epoch_seconds(date_str):
    """Return the epoch seconds of a date header.

    Args:
      date_str: a date string (e.g. "Thu, 01 Dec 1994 16:00:00 GMT")
    Returns:
      epoch seconds as a number, or None if date_str cannot be parsed
    """
    date_tuple = email.utils.parsedate(date_str)
    if date_tuple:
      return calendar.timegm(date_tuple)
    return None

  def update_date(self, date_str, now=None):
    """Return an updated date based on its delta from the "Date" header.

    For example, if |date_str| is one week later than the "Date" header,
    then the returned date string is one week later than the current date.

    Args:
      date_str: a date string (e.g. "Thu, 01 Dec 1994 16:00:00 GMT")
      now: optional; seconds since the epoch to use as the current time
          (defaults to time.time()).
    Returns:
      a date string
    """
    date_seconds = self._get_epoch_seconds(self.get_header('date'))
    header_seconds = self._get_epoch_seconds(date_str)
    if date_seconds and header_seconds:
      updated_seconds = header_seconds + (now or time.time()) - date_seconds
      return email.utils.formatdate(updated_seconds, usegmt=True)
    return date_str

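  # Illustrative example (values are made up): if |now| equals the archived
  # "Date" header's time, the delta is preserved exactly and the same instant
  # comes back out, re-formatted as an RFC 1123 date.
  #
  #   response.set_header('date', 'Thu, 01 Dec 1994 16:00:00 GMT')
  #   response.update_date('Thu, 08 Dec 1994 16:00:00 GMT',
  #                        now=calendar.timegm(
  #                            (1994, 12, 1, 16, 0, 0, 0, 0, 0)))
  #   # -> 'Thu, 08 Dec 1994 16:00:00 GMT'
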
  def is_gzip(self):
    return self.get_header('content-encoding') == 'gzip'

  def is_compressed(self):
    return self.get_header('content-encoding') in ('gzip', 'deflate')

  def is_chunked(self):
    return self.get_header('transfer-encoding') == 'chunked'

  def get_data_as_chunks(self):
    """Return content as a list of strings, each corresponding to a chunk.

    Uncompresses the chunks, if needed. Returns None for non-text content.
    """
    content_type = self.get_header('content-type')
    if (not content_type or
        not (content_type.startswith('text/') or
             content_type == 'application/x-javascript' or
             content_type.startswith('application/json'))):
      return None
    if self.is_compressed():
      return httpzlib.uncompress_chunks(self.response_data, self.is_gzip())
    else:
      return self.response_data

  def get_data_as_text(self):
    """Return content as a single string.

    Uncompresses and concatenates chunks with CHUNK_EDIT_SEPARATOR.
    Returns None for non-text content.
    """
    chunks = self.get_data_as_chunks()
    if chunks is None:
      return None
    return self.CHUNK_EDIT_SEPARATOR.join(chunks)

  def get_delays_as_text(self):
    """Return delays as editable text."""
    return json.dumps(self.delays, indent=2)

  def get_response_as_text(self):
    """Returns response content as a single string.

    Server delays are separated on a per-chunk basis. Delays are in
    milliseconds. Response content begins after DELAY_EDIT_SEPARATOR.
    """
    data = self.get_data_as_text()
    if data is None:
      logging.warning('Data can not be represented as text.')
      data = ''
    delays = self.get_delays_as_text()
    return self.DELAY_EDIT_SEPARATOR.join((delays, data))

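  # Illustrative sketch of the editable text produced above (values are made
  # up and formatting is approximate): the JSON delays block comes first, then
  # DELAY_EDIT_SEPARATOR, then the body with chunks joined by
  # CHUNK_EDIT_SEPARATOR.
  #
  #   {
  #     "connect": 80,
  #     "headers": 80,
  #     "data": [
  #       6,
  #       55
  #     ]
  #   }
  #   [WEB_PAGE_REPLAY_EDIT_ARCHIVE --- Delays are above. Response content is below.]
  #   <html>...</html>[WEB_PAGE_REPLAY_CHUNK_BOUNDARY]<!-- second chunk -->
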
  def set_data_from_chunks(self, text_chunks):
    """Inverse of get_data_as_chunks().

    Compress, if needed, and update content-length for non-chunked responses.
    """
    if self.is_compressed():
      self.response_data = httpzlib.compress_chunks(text_chunks, self.is_gzip())
    else:
      self.response_data = text_chunks
    if not self.is_chunked():
      content_length = sum(len(c) for c in self.response_data)
      self.set_header('content-length', str(content_length))

  def set_data(self, text):
    """Inverse of get_data_as_text().

    Split on CHUNK_EDIT_SEPARATOR and compress if needed.
    """
    self.set_data_from_chunks(text.split(self.CHUNK_EDIT_SEPARATOR))

  def set_delays(self, delays_text):
    """Inverse of get_delays_as_text().

    Args:
      delays_text: JSON encoded text such as the following:
          {
            "connect": 80,
            "headers": 80,
            "data": [6, 55, 0]
          }
        Times are in milliseconds.
        Each data delay corresponds with one response_data value.
    """
    try:
      self.delays = json.loads(delays_text)
    except (ValueError, KeyError) as e:
      logging.critical('Unable to parse delays %s: %s', delays_text, e)
    self.fix_delays()

  def set_response_from_text(self, text):
    """Inverse of get_response_as_text().

    Modifies the state of the archive according to the textual representation.
    """
    try:
      delays, data = text.split(self.DELAY_EDIT_SEPARATOR)
    except ValueError:
      logging.critical(
          'Error parsing text representation. Skipping edits.')
      return
    self.set_delays(delays)
    self.set_data(data)


def create_response(status, reason=None, headers=None, body=None):
  """Convenience method for creating simple ArchivedHttpResponse objects."""
  if reason is None:
    reason = httplib.responses.get(status, 'Unknown')
  if headers is None:
    headers = [('content-type', 'text/plain')]
  if body is None:
    body = '%s %s' % (status, reason)
  return ArchivedHttpResponse(11, status, reason, headers, [body])

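# Illustrative usage (derived from the defaults above): create_response fills
# in the reason phrase, a text/plain content-type, and a simple body when only
# a status code is given.
#
#   not_found = create_response(404)
#   # not_found.status == 404, not_found.reason == 'Not Found'
#   # not_found.response_data == ['404 Not Found']
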
def main():
  class PlainHelpFormatter(optparse.IndentedHelpFormatter):
    def format_description(self, description):
      if description:
        return description + '\n'
      else:
        return ''

  option_parser = optparse.OptionParser(
      usage='%prog [ls|cat|edit|stats|merge] [options] replay_file(s)',
      formatter=PlainHelpFormatter(),
      description=__doc__,
      epilog='http://code.google.com/p/web-page-replay/')

  option_parser.add_option('-c', '--command', default=None,
      action='store',
      type='string',
      help='Only show URLs matching this command.')
  option_parser.add_option('-o', '--host', default=None,
      action='store',
      type='string',
      help='Only show URLs matching this host.')
  option_parser.add_option('-p', '--full_path', default=None,
      action='store',
      type='string',
      help='Only show URLs matching this full path.')
  option_parser.add_option('-f', '--merged_file', default=None,
      action='store',
      type='string',
      help='The output file to use when using the merge command.')

  options, args = option_parser.parse_args()

  # The merge command accepts an unlimited number of archives.
  if len(args) < 2:
    print 'args: %s' % args
    option_parser.error('Must specify a command and replay_file')

  command = args[0]
  replay_file = args[1]

  if not os.path.exists(replay_file):
    option_parser.error('Replay file "%s" does not exist' % replay_file)

  http_archive = HttpArchive.Load(replay_file)
  if command == 'ls':
    print http_archive.ls(options.command, options.host, options.full_path)
  elif command == 'cat':
    print http_archive.cat(options.command, options.host, options.full_path)
  elif command == 'stats':
    print http_archive.stats(options.command, options.host, options.full_path)
  elif command == 'merge':
    if not options.merged_file:
      print 'Error: Must specify a merged file name (use --merged_file)'
      return
    http_archive.merge(options.merged_file, args[2:])
  elif command == 'edit':
    http_archive.edit(options.command, options.host, options.full_path)
    http_archive.Persist(replay_file)
  else:
    option_parser.error('Unknown command "%s"' % command)
  return 0


if __name__ == '__main__':
  sys.exit(main())