#!/usr/bin/env python
# Copyright 2010 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""View and edit HTTP Archives.

To list all URLs in an archive:
  $ ./httparchive.py ls archive.wpr

To view the content of all URLs from example.com:
  $ ./httparchive.py cat --host example.com archive.wpr

To view the content of a particular URL:
  $ ./httparchive.py cat --host www.example.com --full_path /foo archive.wpr

To view the content of all URLs:
  $ ./httparchive.py cat archive.wpr

To edit a particular URL:
  $ ./httparchive.py edit --host www.example.com --full_path /foo archive.wpr

To print statistics of an archive:
  $ ./httparchive.py stats archive.wpr

To print statistics of a set of URLs:
  $ ./httparchive.py stats --host www.example.com archive.wpr

To merge multiple archives:
  $ ./httparchive.py merge --merged_file new.wpr archive1.wpr archive2.wpr ...
"""

import calendar
import certutils
import cPickle
import difflib
import email.utils
import httplib
import httpzlib
import json
import logging
import optparse
import os
import StringIO
import subprocess
import sys
import tempfile
import time
import urlparse
from collections import defaultdict


def LogRunTime(fn):
  """Decorator that logs the run time of the function."""
  def wrapped(self, *args, **kwargs):
    start_time = time.time()
    try:
      return fn(self, *args, **kwargs)
    finally:
      run_time = (time.time() - start_time) * 1000.0
      logging.debug('%s: %dms', fn.__name__, run_time)
  return wrapped
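
# Illustrative usage of LogRunTime (a sketch; nothing in this file applies the
# decorator itself). Because wrapped() expects 'self', it is meant for
# instance methods; the class and method names below are examples only:
#
#   class Archiver(object):
#     @LogRunTime
#     def save(self, filename):
#       ...  # the call's duration is logged at DEBUG level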


class HttpArchiveException(Exception):
  """Base class for all exceptions in httparchive."""
  pass


class HttpArchive(dict):
  """Dict with ArchivedHttpRequest keys and ArchivedHttpResponse values.

  Attributes:
    responses_by_host: dict of {hostname: {request: response}}. This must
        remain in sync with the underlying dict of self. It is used as an
        optimization so that get_requests() doesn't have to linearly search
        all requests in the archive to find potential matches.
  """

  def __init__(self):  # pylint: disable=super-init-not-called
    self.responses_by_host = defaultdict(dict)

  def __setstate__(self, state):
    """Influence how to unpickle.

    Args:
      state: a dictionary for __dict__
    """
    self.__dict__.update(state)
    self.responses_by_host = defaultdict(dict)
    for request in self:
      self.responses_by_host[request.host][request] = self[request]

  def __getstate__(self):
    """Influence how to pickle.

    Returns:
      a dict to use for pickling
    """
    state = self.__dict__.copy()
    del state['responses_by_host']
    return state

  def __setitem__(self, key, value):
    super(HttpArchive, self).__setitem__(key, value)
    if hasattr(self, 'responses_by_host'):
      self.responses_by_host[key.host][key] = value

  def __delitem__(self, key):
    super(HttpArchive, self).__delitem__(key)
    del self.responses_by_host[key.host][key]
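
  # Example (illustrative): because __setitem__ and __delitem__ are
  # overridden, item assignment and deletion keep responses_by_host in sync.
  # Assuming 'req' is an ArchivedHttpRequest and 'resp' an
  # ArchivedHttpResponse:
  #   archive = HttpArchive()
  #   archive[req] = resp  # also indexed under responses_by_host[req.host]
  #   del archive[req]     # also removed from responses_by_host[req.host]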

  def get(self, request, default=None):
    """Return the archived response for a given request.

    Does extra checking for handling some HTTP request headers.

    Args:
      request: instance of ArchivedHttpRequest
      default: default value to return if request is not found

    Returns:
      Instance of ArchivedHttpResponse or default if no matching
      response is found
    """
    if request in self:
      return self[request]
    return self.get_conditional_response(request, default)

  def get_conditional_response(self, request, default):
    """Get the response based on the conditional HTTP request headers.

    Args:
      request: an ArchivedHttpRequest representing the original request.
      default: the default ArchivedHttpResponse to return if no conditional
          match is found.

    Returns:
      an ArchivedHttpResponse with a status of 200, 304 (not modified), or
          412 (precondition failed)
    """
    response = default
    if request.is_conditional():
      stripped_request = request.create_request_without_conditions()
      if stripped_request in self:
        response = self[stripped_request]
        if response.status == 200:
          status = self.get_conditional_status(request, response)
          if status != 200:
            response = create_response(status)
    return response
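
  # Example of the conditional logic above (illustrative values): if the
  # archive holds a 200 response with ETag "abc" for a GET, a replayed
  # request that adds 'if-none-match: "abc"' receives a generated 304, while
  # a request with a non-matching ETag still receives the archived 200.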

  def get_conditional_status(self, request, response):
    status = 200
    last_modified = email.utils.parsedate(
        response.update_date(response.get_header('last-modified')))
    response_etag = response.get_header('etag')
    is_get_or_head = request.command.upper() in ('GET', 'HEAD')

    match_value = request.headers.get('if-match', None)
    if match_value:
      if self.is_etag_match(match_value, response_etag):
        status = 200
      else:
        status = 412  # precondition failed
    none_match_value = request.headers.get('if-none-match', None)
    if none_match_value:
      if self.is_etag_match(none_match_value, response_etag):
        status = 304
      elif is_get_or_head:
        status = 200
      else:
        status = 412
    if is_get_or_head and last_modified:
      for header in ('if-modified-since', 'if-unmodified-since'):
        date = email.utils.parsedate(request.headers.get(header, None))
        if date:
          if ((header == 'if-modified-since' and last_modified > date) or
              (header == 'if-unmodified-since' and last_modified < date)):
            if status != 412:
              status = 200
          else:
            status = 304  # not modified
    return status

  @staticmethod
  def is_etag_match(request_etag, response_etag):
    """Determines whether the entity tags of the request/response match.

    Args:
      request_etag: the value string of the "if-(none)-match:"
                    portion of the request header
      response_etag: the etag value of the response

    Returns:
      True on match, False otherwise
    """
    response_etag = response_etag.strip('" ')
    for etag in request_etag.split(','):
      etag = etag.strip('" ')
      if etag in ('*', response_etag):
        return True
    return False
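
  # Examples for is_etag_match (illustrative values):
  #   is_etag_match('"abc", "def"', '"def"')  -> True   (second tag matches)
  #   is_etag_match('*', '"anything"')        -> True   ('*' matches any etag)
  #   is_etag_match('"abc"', '"def"')         -> False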

  def get_requests(self, command=None, host=None, full_path=None, is_ssl=None,
                   use_query=True):
    """Return a list of requests that match the given args."""
    if host:
      return [r for r in self.responses_by_host[host]
              if r.matches(command, None, full_path, is_ssl,
                           use_query=use_query)]
    else:
      return [r for r in self
              if r.matches(command, host, full_path, is_ssl,
                           use_query=use_query)]

  def ls(self, command=None, host=None, full_path=None):
    """List all URLs that match given params."""
    return ''.join(sorted(
        '%s\n' % r for r in self.get_requests(command, host, full_path)))

  def cat(self, command=None, host=None, full_path=None):
    """Print the contents of all URLs that match given params."""
    out = StringIO.StringIO()
    for request in self.get_requests(command, host, full_path):
      print >>out, str(request)
      print >>out, 'Untrimmed request headers:'
      for k in request.headers:
        print >>out, '    %s: %s' % (k, request.headers[k])
      if request.request_body:
        print >>out, request.request_body
      print >>out, '---- Response Info', '-' * 51
      response = self[request]
      chunk_lengths = [len(x) for x in response.response_data]
      print >>out, ('Status: %s\n'
                    'Reason: %s\n'
                    'Headers delay: %s\n'
                    'Response headers:') % (
          response.status, response.reason, response.delays['headers'])
      for k, v in response.headers:
        print >>out, '    %s: %s' % (k, v)
      print >>out, ('Chunk count: %s\n'
                    'Chunk lengths: %s\n'
                    'Chunk delays: %s') % (
          len(chunk_lengths), chunk_lengths, response.delays['data'])
      body = response.get_data_as_text()
      print >>out, '---- Response Data', '-' * 51
      if body:
        print >>out, body
      else:
        print >>out, '[binary data]'
      print >>out, '=' * 70
    return out.getvalue()

  def stats(self, command=None, host=None, full_path=None):
    """Print stats about the archive for all URLs that match given params."""
    matching_requests = self.get_requests(command, host, full_path)
    if not matching_requests:
      print 'Failed to find any requests matching given command, host, path.'
      return

    out = StringIO.StringIO()
    stats = {
        'Total': len(matching_requests),
        'Domains': defaultdict(int),
        'HTTP_response_code': defaultdict(int),
        'content_type': defaultdict(int),
        'Documents': defaultdict(int),
        }

    for request in matching_requests:
      stats['Domains'][request.host] += 1
      stats['HTTP_response_code'][self[request].status] += 1

      content_type = self[request].get_header('content-type')
      # Remove content type options for readability and higher level groupings.
      str_content_type = str(content_type.split(';')[0]
                            if content_type else None)
      stats['content_type'][str_content_type] += 1

      # Documents are the main URL requested and not a referenced resource.
      if str_content_type == 'text/html' and 'referer' not in request.headers:
        stats['Documents'][request.host] += 1

    print >>out, json.dumps(stats, indent=4)
    return out.getvalue()

  def merge(self, merged_archive=None, other_archives=None):
    """Merge multiple archives into merged_archive.

    Resources are 'chained': only resources that are not already part of the
    accumulated archive are added.
    """
    if not other_archives:
      print 'No archives passed to merge'
      return

    # Note we already loaded 'replay_file'.
    print 'Loaded %d responses' % len(self)

    for archive in other_archives:
      if not os.path.exists(archive):
        print 'Error: Replay file "%s" does not exist' % archive
        return

      http_archive_other = HttpArchive.Load(archive)
      print 'Loaded %d responses from %s' % (len(http_archive_other), archive)
      for r in http_archive_other:
        # Only resources that are not already part of the current archive
        # get added.
        if r not in self:
          print '\t %s ' % r
          self[r] = http_archive_other[r]
    self.Persist('%s' % merged_archive)

  def edit(self, command=None, host=None, full_path=None):
    """Edits the single request which matches given params."""
    editor = os.getenv('EDITOR')
    if not editor:
      print 'You must set the EDITOR environment variable.'
      return

    matching_requests = self.get_requests(command, host, full_path)
    if not matching_requests:
      print ('Failed to find any requests matching given command, host, '
             'full_path.')
      return

    if len(matching_requests) > 1:
      print 'Found multiple matching requests. Please refine.'
      print self.ls(command, host, full_path)

    response = self[matching_requests[0]]
    tmp_file = tempfile.NamedTemporaryFile(delete=False)
    tmp_file.write(response.get_response_as_text())
    tmp_file.close()
    subprocess.check_call([editor, tmp_file.name])
    response.set_response_from_text(''.join(open(tmp_file.name).readlines()))
    os.remove(tmp_file.name)

  def find_closest_request(self, request, use_path=False):
    """Find the closest matching request in the archive to the given request.

    Args:
      request: an ArchivedHttpRequest
      use_path: If True, closest matching request's path component must match.
        (Note: this refers to the 'path' component within the URL, not the
         'full path' which includes the query string component.)

        If use_path=True, the candidate will NOT match in the example below:
        e.g. request   = GET www.test.com/a?p=1
             candidate = GET www.test.com/b?p=1

        Even if use_path=False, URLs with the same path are always favored.
        For example, candidate1 is considered a better match than candidate2.
          request    = GET www.test.com/a?p=1&q=2&r=3
          candidate1 = GET www.test.com/a?s=4
          candidate2 = GET www.test.com/b?p=1&q=2&r=3

    Returns:
      If a close match is found, return the instance of ArchivedHttpRequest.
      Otherwise, return None.
    """
    # Start with strictest constraints. This trims search space considerably.
    requests = self.get_requests(request.command, request.host,
                                 request.full_path, is_ssl=request.is_ssl,
                                 use_query=True)
    # Relax constraint: use_query if there is no match.
    if not requests:
      requests = self.get_requests(request.command, request.host,
                                   request.full_path, is_ssl=request.is_ssl,
                                   use_query=False)
    # Relax constraint: full_path if there is no match and use_path=False.
    if not requests and not use_path:
      requests = self.get_requests(request.command, request.host,
                                   None, is_ssl=request.is_ssl,
                                   use_query=False)

    if not requests:
      return None

    if len(requests) == 1:
      return requests[0]

    matcher = difflib.SequenceMatcher(b=request.cmp_seq)

    # quick_ratio() is cheap to compute, but ratio() is expensive. So we call
    # quick_ratio() on all requests, sort them descending, and then loop through
    # until we find a candidate whose ratio() is >= the next quick_ratio().
    # This works because quick_ratio() is guaranteed to be an upper bound on
    # ratio().
    candidates = []
    for candidate in requests:
      matcher.set_seq1(candidate.cmp_seq)
      candidates.append((matcher.quick_ratio(), candidate))

    candidates.sort(reverse=True, key=lambda c: c[0])

    best_match = (0, None)
    for i in xrange(len(candidates)):
      matcher.set_seq1(candidates[i][1].cmp_seq)
      best_match = max(best_match, (matcher.ratio(), candidates[i][1]))
      if i + 1 < len(candidates) and best_match[0] >= candidates[i+1][0]:
        break
    return best_match[1]
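
  # Illustrative usage (variable names are examples, not from this module):
  #   closest = archive.find_closest_request(incoming_request, use_path=True)
  #   if closest is not None:
  #     archived_response = archive[closest]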

  def diff(self, request):
    """Diff the given request to the closest matching request in the archive.

    Args:
      request: an ArchivedHttpRequest
    Returns:
      If a close match is found, return a textual diff between the requests.
      Otherwise, return None.
    """
    request_lines = request.formatted_request.split('\n')
    closest_request = self.find_closest_request(request)
    if closest_request:
      closest_request_lines = closest_request.formatted_request.split('\n')
      return '\n'.join(difflib.ndiff(closest_request_lines, request_lines))
    return None

  def get_server_cert(self, host):
    """Gets the certificate from the server and stores it in the archive."""
    request = ArchivedHttpRequest('SERVER_CERT', host, '', None, {})
    if request not in self:
      self[request] = create_response(200, body=certutils.get_host_cert(host))
    return self[request].response_data[0]

  def get_certificate(self, host):
    request = ArchivedHttpRequest('DUMMY_CERT', host, '', None, {})
    if request not in self:
      self[request] = create_response(200, body=self._generate_cert(host))
    return self[request].response_data[0]

  @classmethod
  def AssertWritable(cls, filename):
    """Raises an IOError if filename is not writable."""
    persist_dir = os.path.dirname(os.path.abspath(filename))
    if not os.path.exists(persist_dir):
      raise IOError('Directory does not exist: %s' % persist_dir)
    if os.path.exists(filename):
      if not os.access(filename, os.W_OK):
        raise IOError('Need write permission on file: %s' % filename)
    elif not os.access(persist_dir, os.W_OK):
      raise IOError('Need write permission on directory: %s' % persist_dir)

  @classmethod
  def Load(cls, filename):
    """Load an instance from filename."""
    with open(filename, 'rb') as f:
      return cPickle.load(f)

  def Persist(self, filename):
    """Persist all state to filename."""
    try:
      original_checkinterval = sys.getcheckinterval()
      sys.setcheckinterval(2**31-1)  # Lock out other threads so nothing can
                                     # modify |self| during pickling.
      pickled_self = cPickle.dumps(self, cPickle.HIGHEST_PROTOCOL)
    finally:
      sys.setcheckinterval(original_checkinterval)
    with open(filename, 'wb') as f:
      f.write(pickled_self)
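
  # Illustrative round trip (the file name is an example):
  #   HttpArchive.AssertWritable('archive.wpr')
  #   archive = HttpArchive.Load('archive.wpr')
  #   ...                      # modify the archive in place
  #   archive.Persist('archive.wpr')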


class ArchivedHttpRequest(object):
  """Record all the state that goes into a request.

  ArchivedHttpRequest instances are considered immutable so they can
  serve as keys for HttpArchive instances.
  (The immutability is not enforced.)

  Upon creation, the headers are "trimmed" (i.e. edited or dropped)
  and saved to self.trimmed_headers to allow requests to match in a wider
  variety of playback situations (e.g. using different user agents).

  For unpickling, 'trimmed_headers' is recreated from 'headers'. That
  allows for changes to the trim function and can help with debugging.
  """
  CONDITIONAL_HEADERS = [
      'if-none-match', 'if-match',
      'if-modified-since', 'if-unmodified-since']

  def __init__(self, command, host, full_path, request_body, headers,
               is_ssl=False):
    """Initialize an ArchivedHttpRequest.

    Args:
      command: a string (e.g. 'GET' or 'POST').
      host: a host name (e.g. 'www.google.com').
      full_path: a request path.  Includes everything after the host & port in
          the URL (e.g. '/search?q=dogs').
      request_body: a request body string for a POST or None.
      headers: {key: value, ...} where key and value are strings.
      is_ssl: a boolean which is True iff the request is made via SSL.
    """
    self.command = command
    self.host = host
    self.full_path = full_path
    parsed_url = urlparse.urlparse(full_path) if full_path else None
    self.path = parsed_url.path if parsed_url else None
    self.request_body = request_body
    self.headers = headers
    self.is_ssl = is_ssl
    self.trimmed_headers = self._TrimHeaders(headers)
    self.formatted_request = self._GetFormattedRequest()
    self.cmp_seq = self._GetCmpSeq(parsed_url.query if parsed_url else None)

  def __str__(self):
    scheme = 'https' if self.is_ssl else 'http'
    return '%s %s://%s%s %s' % (
        self.command, scheme, self.host, self.full_path, self.trimmed_headers)

  def __repr__(self):
    return repr((self.command, self.host, self.full_path, self.request_body,
                 self.trimmed_headers, self.is_ssl))

  def __hash__(self):
    """Return an integer hash to use for hashed collections including dict."""
    return hash(repr(self))

  def __eq__(self, other):
    """Define the __eq__ method to match the hash behavior."""
    return repr(self) == repr(other)

  def __setstate__(self, state):
    """Influence how to unpickle.

    "headers" are the original request headers.
    "trimmed_headers" are the trimmed headers used for matching requests
    during replay.

    Args:
      state: a dictionary for __dict__
    """
    if 'full_headers' in state:
      # Fix older version of archive.
      state['headers'] = state['full_headers']
      del state['full_headers']
    if 'headers' not in state:
      raise HttpArchiveException(
          'Archived HTTP request is missing "headers". The HTTP archive is'
          ' likely from a previous version and must be re-recorded.')
    if 'path' in state:
      # Before, 'path' and 'path_without_query' were used and 'path' was
      # pickled.  Now, 'path' has been renamed to 'full_path' and
      # 'path_without_query' has been renamed to 'path'.  'full_path' is
      # pickled, but 'path' is not.  If we see 'path' here it means we are
      # dealing with an older archive.
      state['full_path'] = state['path']
      del state['path']
    state['trimmed_headers'] = self._TrimHeaders(dict(state['headers']))
    if 'is_ssl' not in state:
      state['is_ssl'] = False
    self.__dict__.update(state)
    parsed_url = urlparse.urlparse(self.full_path)
    self.path = parsed_url.path
    self.formatted_request = self._GetFormattedRequest()
    self.cmp_seq = self._GetCmpSeq(parsed_url.query)

  def __getstate__(self):
    """Influence how to pickle.

    Returns:
      a dict to use for pickling
    """
    state = self.__dict__.copy()
    del state['trimmed_headers']
    del state['path']
    del state['formatted_request']
    del state['cmp_seq']
    return state

  def _GetFormattedRequest(self):
    """Format request to make diffs easier to read.

    Returns:
      A string consisting of the request. Example:
      'GET www.example.com/path\nHeader-Key: header value\n'
    """
    parts = ['%s %s%s\n' % (self.command, self.host, self.full_path)]
    if self.request_body:
      parts.append('%s\n' % self.request_body)
    for k, v in self.trimmed_headers:
      k = '-'.join(x.capitalize() for x in k.split('-'))
      parts.append('%s: %s\n' % (k, v))
    return ''.join(parts)

  def _GetCmpSeq(self, query=None):
    """Compute a sequence out of the query and headers for difflib to compare.

    For example:
      [('q1', 'a1'), ('q2', 'a2'), ('k1', 'v1'), ('k2', 'v2')]
    will be returned for a request with URL:
      http://example.com/index.html?q1=a1&q2=a2
    and headers:
      k1: v1
      k2: v2

    Args:
      query: the query string in the URL.

    Returns:
      A sequence for difflib to compare.
    """
    if not query:
      return self.trimmed_headers
    return sorted(urlparse.parse_qsl(query)) + self.trimmed_headers

  def matches(self, command=None, host=None, full_path=None, is_ssl=None,
              use_query=True):
    """Returns True iff the request matches all parameters.

    Args:
      command: a string (e.g. 'GET' or 'POST').
      host: a host name (e.g. 'www.google.com').
      full_path: a request path with query string (e.g. '/search?q=dogs')
      is_ssl: whether the request is secure.
      use_query:
        If use_query is True, request matching uses both the hierarchical path
        and query string component.
        If use_query is False, request matching only uses the hierarchical
        path.

        e.g. req1 = GET www.test.com/index?aaaa
             req2 = GET www.test.com/index?bbbb

        If use_query is True, req1.matches(req2) evaluates to False
        If use_query is False, req1.matches(req2) evaluates to True

    Returns:
      True iff the request matches all parameters
    """
    if command is not None and command != self.command:
      return False
    if is_ssl is not None and is_ssl != self.is_ssl:
      return False
    if host is not None and host != self.host:
      return False
    if full_path is None:
      return True
    if use_query:
      return full_path == self.full_path
    else:
      return self.path == urlparse.urlparse(full_path).path

  @classmethod
  def _TrimHeaders(cls, headers):
    """Removes headers that are known to cause problems during replay.

    These headers are removed for the following reasons:
    - accept: Causes problems with www.bing.com. During record, CSS is fetched
              with *. During replay, it's text/css.
    - accept-charset, accept-language, referer: vary between clients.
    - cache-control: sometimes sent from Chrome with 'max-age=0' as value.
    - connection, method, scheme, url, version: Cause problems with spdy.
    - cookie: Extremely sensitive to request/response order.
    - keep-alive: Doesn't affect the content of the request, only some
      transient state of the transport layer.
    - user-agent: Changes with every Chrome version.
    - proxy-connection: Sent for proxy requests.
    - x-chrome-variations, x-client-data: Unique to each Chrome binary. Used by
      Google to collect statistics about Chrome's enabled features.

    Another variant to consider is dropping only the value from the header.
    However, this is particularly bad for the cookie header, because the
    presence of the cookie depends on the responses we've seen when the request
    is made.

    Args:
      headers: {header_key: header_value, ...}

    Returns:
      [(header_key, header_value), ...]  # (with undesirable headers removed)
    """
    # TODO(tonyg): Strip sdch from the request headers because we can't
    # guarantee that the dictionary will be recorded, so replay may not work.
    if 'accept-encoding' in headers:
      accept_encoding = headers['accept-encoding']
      accept_encoding = accept_encoding.replace('sdch', '')
      # Strip lzma so Opera's requests match archives recorded using Chrome.
      accept_encoding = accept_encoding.replace('lzma', '')
      stripped_encodings = [e.strip() for e in accept_encoding.split(',')]
      accept_encoding = ','.join(filter(bool, stripped_encodings))
      headers['accept-encoding'] = accept_encoding
    undesirable_keys = [
        'accept', 'accept-charset', 'accept-language', 'cache-control',
        'connection', 'cookie', 'keep-alive', 'method',
        'referer', 'scheme', 'url', 'version', 'user-agent', 'proxy-connection',
        'x-chrome-variations', 'x-client-data']
    return sorted([(k, v) for k, v in headers.items()
                   if k.lower() not in undesirable_keys])
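
  # Illustrative example of the trimming (assumed lowercase header keys, as
  # used elsewhere in this module):
  #   _TrimHeaders({'accept-encoding': 'gzip, sdch',
  #                 'cookie': 'id=1',
  #                 'host': 'example.com'})
  #   -> [('accept-encoding', 'gzip'), ('host', 'example.com')]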

  def is_conditional(self):
    """Return True if the request contains any conditional headers."""
    for header in self.CONDITIONAL_HEADERS:
      if header in self.headers:
        return True
    return False

  def create_request_without_conditions(self):
    stripped_headers = dict((k, v) for k, v in self.headers.iteritems()
                            if k.lower() not in self.CONDITIONAL_HEADERS)
    return ArchivedHttpRequest(
        self.command, self.host, self.full_path, self.request_body,
        stripped_headers, self.is_ssl)


class ArchivedHttpResponse(object):
  """All the data needed to recreate an HTTP response."""

  # CHUNK_EDIT_SEPARATOR is used to edit and view text content.
  # It is not sent in responses. It is added by get_data_as_text()
  # and removed by set_data().
  CHUNK_EDIT_SEPARATOR = '[WEB_PAGE_REPLAY_CHUNK_BOUNDARY]'

  # DELAY_EDIT_SEPARATOR is used to edit and view server delays.
  DELAY_EDIT_SEPARATOR = ('\n[WEB_PAGE_REPLAY_EDIT_ARCHIVE --- '
                          'Delays are above. Response content is below.]\n')

  def __init__(self, version, status, reason, headers, response_data,
               delays=None):
    """Initialize an ArchivedHttpResponse.

    Args:
      version: HTTP protocol version used by server.
          10 for HTTP/1.0, 11 for HTTP/1.1 (same as httplib).
      status: Status code returned by server (e.g. 200).
      reason: Reason phrase returned by server (e.g. "OK").
      headers: list of (header, value) tuples.
      response_data: list of content chunks.
          Concatenating the chunks gives the complete contents
          (i.e. the chunks do not have any lengths or delimiters).
          Do not include the final, zero-length chunk that marks the end.
      delays: dict of (ms) delays for 'connect', 'headers' and 'data'.
          e.g. {'connect': 50, 'headers': 150, 'data': [0, 10, 10]}
          connect - The time to connect to the server.
            Each resource has a value because Replay's record mode captures it.
            This includes the time for the SYN and SYN/ACK (1 rtt).
          headers - The time elapsed between the TCP connect and the headers.
            This typically includes all the server-time to generate a response.
          data - If the response is chunked, these are the times for each chunk.
    """
    self.version = version
    self.status = status
    self.reason = reason
    self.headers = headers
    self.response_data = response_data
    self.delays = delays
    self.fix_delays()

  def fix_delays(self):
    """Initialize delays, or check the number of data delays."""
    expected_num_delays = len(self.response_data)
    if not self.delays:
      self.delays = {
          'connect': 0,
          'headers': 0,
          'data': [0] * expected_num_delays
          }
    else:
      num_delays = len(self.delays['data'])
      if num_delays != expected_num_delays:
        raise HttpArchiveException(
            'Server delay length mismatch: %d (expected %d): %s' % (
                num_delays, expected_num_delays, self.delays['data']))

  def __repr__(self):
    return repr((self.version, self.status, self.reason, sorted(self.headers),
                 self.response_data))

  def __hash__(self):
    """Return an integer hash to use for hashed collections including dict."""
    return hash(repr(self))

  def __eq__(self, other):
    """Define the __eq__ method to match the hash behavior."""
    return repr(self) == repr(other)

  def __setstate__(self, state):
    """Influence how to unpickle.

    Args:
      state: a dictionary for __dict__
    """
    if 'server_delays' in state:
      state['delays'] = {
          'connect': 0,
          'headers': 0,
          'data': state['server_delays']
          }
      del state['server_delays']
    elif 'delays' not in state:
      state['delays'] = None
    self.__dict__.update(state)
    self.fix_delays()

  def get_header(self, key, default=None):
    for k, v in self.headers:
      if key.lower() == k.lower():
        return v
    return default

  def set_header(self, key, value):
    for i, (k, v) in enumerate(self.headers):
      if key == k:
        self.headers[i] = (key, value)
        return
    self.headers.append((key, value))

  def remove_header(self, key):
    for i, (k, v) in enumerate(self.headers):
      if key.lower() == k.lower():
        self.headers.pop(i)
        return

  @staticmethod
  def _get_epoch_seconds(date_str):
    """Return the epoch seconds of a date header.

    Args:
      date_str: a date string (e.g. "Thu, 01 Dec 1994 16:00:00 GMT")
    Returns:
      epoch seconds as a float
    """
    date_tuple = email.utils.parsedate(date_str)
    if date_tuple:
      return calendar.timegm(date_tuple)
    return None

  def update_date(self, date_str, now=None):
    """Return an updated date based on its delta from the "Date" header.

    For example, if |date_str| is one week later than the "Date" header,
    then the returned date string is one week later than the current date.

    Args:
      date_str: a date string (e.g. "Thu, 01 Dec 1994 16:00:00 GMT")
      now: optional time in epoch seconds to use as the current time
          (defaults to time.time()).
    Returns:
      a date string
    """
    date_seconds = self._get_epoch_seconds(self.get_header('date'))
    header_seconds = self._get_epoch_seconds(date_str)
    if date_seconds and header_seconds:
      updated_seconds = header_seconds + (now or time.time()) - date_seconds
      return email.utils.formatdate(updated_seconds, usegmt=True)
    return date_str

  def is_gzip(self):
    return self.get_header('content-encoding') == 'gzip'

  def is_compressed(self):
    return self.get_header('content-encoding') in ('gzip', 'deflate')

  def is_chunked(self):
    return self.get_header('transfer-encoding') == 'chunked'

  def get_data_as_text(self):
    """Return content as a single string.

    Uncompresses and concatenates chunks with CHUNK_EDIT_SEPARATOR.
    """
    content_type = self.get_header('content-type')
    if (not content_type or
        not (content_type.startswith('text/') or
             content_type == 'application/x-javascript' or
             content_type.startswith('application/json'))):
      return None
    if self.is_compressed():
      uncompressed_chunks = httpzlib.uncompress_chunks(
          self.response_data, self.is_gzip())
    else:
      uncompressed_chunks = self.response_data
    return self.CHUNK_EDIT_SEPARATOR.join(uncompressed_chunks)

  def get_delays_as_text(self):
    """Return delays as editable text."""
    return json.dumps(self.delays, indent=2)

  def get_response_as_text(self):
    """Returns response content as a single string.

    Server delays are separated on a per-chunk basis. Delays are in
    milliseconds. Response content begins after DELAY_EDIT_SEPARATOR.
    """
    data = self.get_data_as_text()
    if data is None:
      logging.warning('Data cannot be represented as text.')
      data = ''
    delays = self.get_delays_as_text()
    return self.DELAY_EDIT_SEPARATOR.join((delays, data))
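
  # Illustrative layout of the editable text produced above (example values):
  #   {
  #     "connect": 50,
  #     "headers": 150,
  #     "data": [0]
  #   }
  #   [WEB_PAGE_REPLAY_EDIT_ARCHIVE --- Delays are above. Response content is below.]
  #   <response body text>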

  def set_data(self, text):
    """Inverse of get_data_as_text().

    Split on CHUNK_EDIT_SEPARATOR and compress if needed.
    """
    text_chunks = text.split(self.CHUNK_EDIT_SEPARATOR)
    if self.is_compressed():
      self.response_data = httpzlib.compress_chunks(text_chunks, self.is_gzip())
    else:
      self.response_data = text_chunks
    if not self.is_chunked():
      content_length = sum(len(c) for c in self.response_data)
      self.set_header('content-length', str(content_length))

  def set_delays(self, delays_text):
    """Inverse of get_delays_as_text().

    Args:
      delays_text: JSON encoded text such as the following:
          {
            "connect": 80,
            "headers": 80,
            "data": [6, 55, 0]
          }
        Times are in milliseconds.
        Each data delay corresponds with one response_data value.
    """
    try:
      self.delays = json.loads(delays_text)
    except (ValueError, KeyError) as e:
      logging.critical('Unable to parse delays %s: %s', delays_text, e)
    self.fix_delays()

  def set_response_from_text(self, text):
    """Inverse of get_response_as_text().

    Modifies the state of the archive according to the textual representation.
    """
    try:
      delays, data = text.split(self.DELAY_EDIT_SEPARATOR)
    except ValueError:
      logging.critical(
          'Error parsing text representation. Skipping edits.')
      return
    self.set_delays(delays)
    self.set_data(data)


def create_response(status, reason=None, headers=None, body=None):
  """Convenience method for creating simple ArchivedHttpResponse objects."""
  if reason is None:
    reason = httplib.responses.get(status, 'Unknown')
  if headers is None:
    headers = [('content-type', 'text/plain')]
  if body is None:
    body = '%s %s' % (status, reason)
  return ArchivedHttpResponse(11, status, reason, headers, [body])
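
# Illustrative example: create_response(404) yields an ArchivedHttpResponse
# with reason 'Not Found', a single text/plain content-type header, and the
# body '404 Not Found'.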


def main():
  class PlainHelpFormatter(optparse.IndentedHelpFormatter):
    def format_description(self, description):
      if description:
        return description + '\n'
      else:
        return ''

  option_parser = optparse.OptionParser(
      usage='%prog [ls|cat|edit|stats|merge] [options] replay_file(s)',
      formatter=PlainHelpFormatter(),
      description=__doc__,
      epilog='http://code.google.com/p/web-page-replay/')

  option_parser.add_option('-c', '--command', default=None,
      action='store',
      type='string',
      help='Only show URLs matching this command.')
  option_parser.add_option('-o', '--host', default=None,
      action='store',
      type='string',
      help='Only show URLs matching this host.')
  option_parser.add_option('-p', '--full_path', default=None,
      action='store',
      type='string',
      help='Only show URLs matching this full path.')
  option_parser.add_option('-f', '--merged_file', default=None,
      action='store',
      type='string',
      help='The output file to use when using the merge command.')

  options, args = option_parser.parse_args()

  # The merge command accepts an unlimited number of archives.
  if len(args) < 2:
    print 'args: %s' % args
    option_parser.error('Must specify a command and replay_file')

  command = args[0]
  replay_file = args[1]

  if not os.path.exists(replay_file):
    option_parser.error('Replay file "%s" does not exist' % replay_file)

  http_archive = HttpArchive.Load(replay_file)
  if command == 'ls':
    print http_archive.ls(options.command, options.host, options.full_path)
  elif command == 'cat':
    print http_archive.cat(options.command, options.host, options.full_path)
  elif command == 'stats':
    print http_archive.stats(options.command, options.host, options.full_path)
  elif command == 'merge':
    if not options.merged_file:
      print 'Error: Must specify a merged file name (use --merged_file)'
      return
    http_archive.merge(options.merged_file, args[2:])
  elif command == 'edit':
    http_archive.edit(options.command, options.host, options.full_path)
    http_archive.Persist(replay_file)
  else:
    option_parser.error('Unknown command "%s"' % command)
  return 0


if __name__ == '__main__':
  sys.exit(main())