#!/usr/bin/env python
# Copyright 2010 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""View and edit HTTP Archives.

To list all URLs in an archive:
  $ ./httparchive.py ls archive.wpr

To view the content of all URLs from example.com:
  $ ./httparchive.py cat --host example.com archive.wpr

To view the content of a particular URL:
  $ ./httparchive.py cat --host www.example.com --full_path /foo archive.wpr

To view the content of all URLs:
  $ ./httparchive.py cat archive.wpr

To edit a particular URL:
  $ ./httparchive.py edit --host www.example.com --full_path /foo archive.wpr

To print statistics of an archive:
  $ ./httparchive.py stats archive.wpr

To print statistics of a set of URLs:
  $ ./httparchive.py stats --host www.example.com archive.wpr

To merge multiple archives:
  $ ./httparchive.py merge --merged_file new.wpr archive1.wpr archive2.wpr ...
"""

import calendar
import certutils
import cPickle
import difflib
import email.utils
import httplib
import httpzlib
import json
import logging
import optparse
import os
import StringIO
import subprocess
import sys
import tempfile
import time
import urlparse
from collections import defaultdict


def LogRunTime(fn):
  """Annotation which logs the run time of the function."""
  def wrapped(self, *args, **kwargs):
    start_time = time.time()
    try:
      return fn(self, *args, **kwargs)
    finally:
      run_time = (time.time() - start_time) * 1000.0
      logging.debug('%s: %dms', fn.__name__, run_time)
  return wrapped


class HttpArchiveException(Exception):
  """Base class for all exceptions in httparchive."""
  pass


class HttpArchive(dict):
  """Dict with ArchivedHttpRequest keys and ArchivedHttpResponse values.

  Attributes:
    responses_by_host: dict of {hostname: {request: response}}. This must
        remain in sync with the underlying dict of self. It is used as an
        optimization so that get_requests() doesn't have to linearly search
        all requests in the archive to find potential matches.
  """

  def __init__(self):  # pylint: disable=super-init-not-called
    self.responses_by_host = defaultdict(dict)

  def __setstate__(self, state):
    """Influence how to unpickle.

    Args:
      state: a dictionary for __dict__
    """
    self.__dict__.update(state)
    self.responses_by_host = defaultdict(dict)
    for request in self:
      self.responses_by_host[request.host][request] = self[request]

  def __getstate__(self):
    """Influence how to pickle.

    Returns:
      a dict to use for pickling
    """
    state = self.__dict__.copy()
    del state['responses_by_host']
    return state

  def __setitem__(self, key, value):
    super(HttpArchive, self).__setitem__(key, value)
    if hasattr(self, 'responses_by_host'):
      self.responses_by_host[key.host][key] = value

  def __delitem__(self, key):
    super(HttpArchive, self).__delitem__(key)
    del self.responses_by_host[key.host][key]

  def get(self, request, default=None):
    """Return the archived response for a given request.

    Does extra checking for handling some HTTP request headers.

    Args:
      request: instance of ArchivedHttpRequest
      default: default value to return if request is not found

    Returns:
      Instance of ArchivedHttpResponse or default if no matching
      response is found
    """
    if request in self:
      return self[request]
    return self.get_conditional_response(request, default)

  def get_conditional_response(self, request, default):
    """Get the response based on the conditional HTTP request headers.

    Args:
      request: an ArchivedHttpRequest representing the original request.
      default: the default ArchivedHttpResponse to return if no response
          matches the request with its conditional headers removed.

    Returns:
      an ArchivedHttpResponse with a status of 200, 304 (not modified), or
      412 (precondition failed)
    """
    response = default
    if request.is_conditional():
      stripped_request = request.create_request_without_conditions()
      if stripped_request in self:
        response = self[stripped_request]
        if response.status == 200:
          status = self.get_conditional_status(request, response)
          if status != 200:
            response = create_response(status)
    return response

  def get_conditional_status(self, request, response):
    status = 200
    last_modified = email.utils.parsedate(
        response.update_date(response.get_header('last-modified')))
    response_etag = response.get_header('etag')
    is_get_or_head = request.command.upper() in ('GET', 'HEAD')

    match_value = request.headers.get('if-match', None)
    if match_value:
      if self.is_etag_match(match_value, response_etag):
        status = 200
      else:
        status = 412  # precondition failed
    none_match_value = request.headers.get('if-none-match', None)
    if none_match_value:
      if self.is_etag_match(none_match_value, response_etag):
        status = 304
      elif is_get_or_head:
        status = 200
      else:
        status = 412
    if is_get_or_head and last_modified:
      for header in ('if-modified-since', 'if-unmodified-since'):
        date = email.utils.parsedate(request.headers.get(header, None))
        if date:
          if ((header == 'if-modified-since' and last_modified > date) or
              (header == 'if-unmodified-since' and last_modified < date)):
            if status != 412:
              status = 200
          else:
            status = 304  # not modified
    return status
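
  # Illustrative sketch (not part of the archive logic; header values are
  # hypothetical): assuming this archive holds a 200 response for
  # GET www.example.com/logo.png whose ETag is "abc123", a conditional replay
  # request is answered without re-serving the body:
  #
  #   request = ArchivedHttpRequest(
  #       'GET', 'www.example.com', '/logo.png', None,
  #       {'if-none-match': '"abc123"'})
  #   response = archive.get(request)  # archive is an HttpArchive
  #   # response.status == 304 because the stored ETag matches.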

  @staticmethod
  def is_etag_match(request_etag, response_etag):
    """Determines whether the entity tags of the request/response match.

    Args:
      request_etag: the value string of the "if-(none)-match:"
                    portion of the request header
      response_etag: the etag value of the response

    Returns:
      True on match, False otherwise
    """
    response_etag = response_etag.strip('" ')
    for etag in request_etag.split(','):
      etag = etag.strip('" ')
      if etag in ('*', response_etag):
        return True
    return False

  def get_requests(self, command=None, host=None, full_path=None, is_ssl=None,
                   use_query=True):
    """Return a list of requests that match the given args."""
    if host:
      return [r for r in self.responses_by_host[host]
              if r.matches(command, None, full_path, is_ssl,
                           use_query=use_query)]
    else:
      return [r for r in self
              if r.matches(command, host, full_path, is_ssl,
                           use_query=use_query)]

  def ls(self, command=None, host=None, full_path=None):
    """List all URLs that match given params."""
    return ''.join(sorted(
        '%s\n' % r for r in self.get_requests(command, host, full_path)))

  def cat(self, command=None, host=None, full_path=None):
    """Print the contents of all URLs that match given params."""
    out = StringIO.StringIO()
    for request in self.get_requests(command, host, full_path):
      print >>out, str(request)
      print >>out, 'Untrimmed request headers:'
      for k in request.headers:
        print >>out, '    %s: %s' % (k, request.headers[k])
      if request.request_body:
        print >>out, request.request_body
      print >>out, '---- Response Info', '-' * 51
      response = self[request]
      chunk_lengths = [len(x) for x in response.response_data]
      print >>out, ('Status: %s\n'
                    'Reason: %s\n'
                    'Headers delay: %s\n'
                    'Response headers:') % (
          response.status, response.reason, response.delays['headers'])
      for k, v in response.headers:
        print >>out, '    %s: %s' % (k, v)
      print >>out, ('Chunk count: %s\n'
                    'Chunk lengths: %s\n'
                    'Chunk delays: %s') % (
          len(chunk_lengths), chunk_lengths, response.delays['data'])
      body = response.get_data_as_text()
      print >>out, '---- Response Data', '-' * 51
      if body:
        print >>out, body
      else:
        print >>out, '[binary data]'
      print >>out, '=' * 70
    return out.getvalue()

  def stats(self, command=None, host=None, full_path=None):
    """Print stats about the archive for all URLs that match given params."""
    matching_requests = self.get_requests(command, host, full_path)
    if not matching_requests:
      print 'Failed to find any requests matching given command, host, path.'
      return

    out = StringIO.StringIO()
    stats = {
        'Total': len(matching_requests),
        'Domains': defaultdict(int),
        'HTTP_response_code': defaultdict(int),
        'content_type': defaultdict(int),
        'Documents': defaultdict(int),
    }

    for request in matching_requests:
      stats['Domains'][request.host] += 1
      stats['HTTP_response_code'][self[request].status] += 1

      content_type = self[request].get_header('content-type')
      # Remove content type options for readability and higher level groupings.
      str_content_type = str(content_type.split(';')[0]
                             if content_type else None)
      stats['content_type'][str_content_type] += 1

      # Documents are the main URL requested and not a referenced resource.
      if str_content_type == 'text/html' and 'referer' not in request.headers:
        stats['Documents'][request.host] += 1

    print >>out, json.dumps(stats, indent=4)
    return out.getvalue()
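
  # Shape of the stats() output (values below are illustrative only):
  #   {
  #       "Total": 2,
  #       "Domains": {"www.example.com": 2},
  #       "HTTP_response_code": {"200": 2},
  #       "content_type": {"text/html": 1, "image/png": 1},
  #       "Documents": {"www.example.com": 1}
  #   }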

  def merge(self, merged_archive=None, other_archives=None):
    """Merge multiple archives into merged_archive by 'chaining' resources.

    Only resources that are not already part of the accumulated archive are
    added.
    """
    if not other_archives:
      print 'No archives passed to merge'
      return

    # Note we already loaded 'replay_file'.
    print 'Loaded %d responses' % len(self)

    for archive in other_archives:
      if not os.path.exists(archive):
        print 'Error: Replay file "%s" does not exist' % archive
        return

      http_archive_other = HttpArchive.Load(archive)
      print 'Loaded %d responses from %s' % (len(http_archive_other), archive)
      for r in http_archive_other:
        # Only resources that are not already part of the current archive
        # get added.
        if r not in self:
          print '\t %s ' % r
          self[r] = http_archive_other[r]
    self.Persist('%s' % merged_archive)

  def edit(self, command=None, host=None, full_path=None):
    """Edits the single request which matches given params."""
    editor = os.getenv('EDITOR')
    if not editor:
      print 'You must set the EDITOR environmental variable.'
      return

    matching_requests = self.get_requests(command, host, full_path)
    if not matching_requests:
      print ('Failed to find any requests matching given command, host, '
             'full_path.')
      return

    if len(matching_requests) > 1:
      print 'Found multiple matching requests. Please refine.'
      print self.ls(command, host, full_path)

    response = self[matching_requests[0]]
    tmp_file = tempfile.NamedTemporaryFile(delete=False)
    tmp_file.write(response.get_response_as_text())
    tmp_file.close()
    subprocess.check_call([editor, tmp_file.name])
    response.set_response_from_text(''.join(open(tmp_file.name).readlines()))
    os.remove(tmp_file.name)

  def find_closest_request(self, request, use_path=False):
    """Find the closest matching request in the archive to the given request.

    Args:
      request: an ArchivedHttpRequest
      use_path: If True, the closest matching request's path component must
          match. (Note: this refers to the 'path' component within the URL,
          not the 'full path' which includes the query string component.)

          If use_path=True, the candidate will NOT match in the example below:
            request   = GET www.test.com/a?p=1
            candidate = GET www.test.com/b?p=1

          Even if use_path=False, URLs with the same paths are always favored.
          For example, candidate1 is considered a better match than candidate2:
            request    = GET www.test.com/a?p=1&q=2&r=3
            candidate1 = GET www.test.com/a?s=4
            candidate2 = GET www.test.com/b?p=1&q=2&r=3

    Returns:
      If a close match is found, return the instance of ArchivedHttpRequest.
      Otherwise, return None.
    """
    # Start with strictest constraints. This trims search space considerably.
    requests = self.get_requests(request.command, request.host,
                                 request.full_path, is_ssl=request.is_ssl,
                                 use_query=True)
    # Relax constraint: use_query if there is no match.
    if not requests:
      requests = self.get_requests(request.command, request.host,
                                   request.full_path, is_ssl=request.is_ssl,
                                   use_query=False)
    # Relax constraint: full_path if there is no match and use_path=False.
    if not requests and not use_path:
      requests = self.get_requests(request.command, request.host,
                                   None, is_ssl=request.is_ssl,
                                   use_query=False)

    if not requests:
      return None

    if len(requests) == 1:
      return requests[0]

    matcher = difflib.SequenceMatcher(b=request.cmp_seq)

    # quick_ratio() is cheap to compute, but ratio() is expensive. So we call
    # quick_ratio() on all requests, sort them descending, and then loop
    # through until we find a candidate whose ratio() is >= the next
    # quick_ratio(). This works because quick_ratio() is guaranteed to be an
    # upper bound on ratio().
    candidates = []
    for candidate in requests:
      matcher.set_seq1(candidate.cmp_seq)
      candidates.append((matcher.quick_ratio(), candidate))

    candidates.sort(reverse=True, key=lambda c: c[0])

    best_match = (0, None)
    for i in xrange(len(candidates)):
      matcher.set_seq1(candidates[i][1].cmp_seq)
      best_match = max(best_match, (matcher.ratio(), candidates[i][1]))
      if i + 1 < len(candidates) and best_match[0] >= candidates[i+1][0]:
        break
    return best_match[1]
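
  # Replay-time sketch (hypothetical request): find_closest_request() is the
  # fallback when an exact archive lookup misses, e.g. because a volatile
  # query parameter changed between record and replay:
  #
  #   live = ArchivedHttpRequest(
  #       'GET', 'www.test.com', '/a?p=1&ts=12345', None, {})
  #   recorded = archive.find_closest_request(live, use_path=True)
  #   # 'recorded' is the archived request on www.test.com whose query
  #   # parameters and trimmed headers are most similar per difflib's
  #   # ratio(), or None if no candidate satisfies the constraints.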
382 if not requests and not use_path: 383 requests = self.get_requests(request.command, request.host, 384 None, is_ssl=request.is_ssl, 385 use_query=False) 386 387 if not requests: 388 return None 389 390 if len(requests) == 1: 391 return requests[0] 392 393 matcher = difflib.SequenceMatcher(b=request.cmp_seq) 394 395 # quick_ratio() is cheap to compute, but ratio() is expensive. So we call 396 # quick_ratio() on all requests, sort them descending, and then loop through 397 # until we find a candidate whose ratio() is >= the next quick_ratio(). 398 # This works because quick_ratio() is guaranteed to be an upper bound on 399 # ratio(). 400 candidates = [] 401 for candidate in requests: 402 matcher.set_seq1(candidate.cmp_seq) 403 candidates.append((matcher.quick_ratio(), candidate)) 404 405 candidates.sort(reverse=True, key=lambda c: c[0]) 406 407 best_match = (0, None) 408 for i in xrange(len(candidates)): 409 matcher.set_seq1(candidates[i][1].cmp_seq) 410 best_match = max(best_match, (matcher.ratio(), candidates[i][1])) 411 if i + 1 < len(candidates) and best_match[0] >= candidates[i+1][0]: 412 break 413 return best_match[1] 414 415 def diff(self, request): 416 """Diff the given request to the closest matching request in the archive. 417 418 Args: 419 request: an ArchivedHttpRequest 420 Returns: 421 If a close match is found, return a textual diff between the requests. 422 Otherwise, return None. 423 """ 424 request_lines = request.formatted_request.split('\n') 425 closest_request = self.find_closest_request(request) 426 if closest_request: 427 closest_request_lines = closest_request.formatted_request.split('\n') 428 return '\n'.join(difflib.ndiff(closest_request_lines, request_lines)) 429 return None 430 431 def get_server_cert(self, host): 432 """Gets certificate from the server and stores it in archive""" 433 request = ArchivedHttpRequest('SERVER_CERT', host, '', None, {}) 434 if request not in self: 435 self[request] = create_response(200, body=certutils.get_host_cert(host)) 436 return self[request].response_data[0] 437 438 def get_certificate(self, host): 439 request = ArchivedHttpRequest('DUMMY_CERT', host, '', None, {}) 440 if request not in self: 441 self[request] = create_response(200, body=self._generate_cert(host)) 442 return self[request].response_data[0] 443 444 @classmethod 445 def AssertWritable(cls, filename): 446 """Raises an IOError if filename is not writable.""" 447 persist_dir = os.path.dirname(os.path.abspath(filename)) 448 if not os.path.exists(persist_dir): 449 raise IOError('Directory does not exist: %s' % persist_dir) 450 if os.path.exists(filename): 451 if not os.access(filename, os.W_OK): 452 raise IOError('Need write permission on file: %s' % filename) 453 elif not os.access(persist_dir, os.W_OK): 454 raise IOError('Need write permission on directory: %s' % persist_dir) 455 456 @classmethod 457 def Load(cls, filename): 458 """Load an instance from filename.""" 459 return cPickle.load(open(filename, 'rb')) 460 461 def Persist(self, filename): 462 """Persist all state to filename.""" 463 try: 464 original_checkinterval = sys.getcheckinterval() 465 sys.setcheckinterval(2**31-1) # Lock out other threads so nothing can 466 # modify |self| during pickling. 467 pickled_self = cPickle.dumps(self, cPickle.HIGHEST_PROTOCOL) 468 finally: 469 sys.setcheckinterval(original_checkinterval) 470 with open(filename, 'wb') as f: 471 f.write(pickled_self) 472 473 474class ArchivedHttpRequest(object): 475 """Record all the state that goes into a request. 


class ArchivedHttpRequest(object):
  """Record all the state that goes into a request.

  ArchivedHttpRequest instances are considered immutable so they can
  serve as keys for HttpArchive instances.
  (The immutability is not enforced.)

  Upon creation, the headers are "trimmed" (i.e. edited or dropped)
  and saved to self.trimmed_headers to allow requests to match in a wider
  variety of playback situations (e.g. using different user agents).

  For unpickling, 'trimmed_headers' is recreated from 'headers'. That
  allows for changes to the trim function and can help with debugging.
  """
  CONDITIONAL_HEADERS = [
      'if-none-match', 'if-match',
      'if-modified-since', 'if-unmodified-since']

  def __init__(self, command, host, full_path, request_body, headers,
               is_ssl=False):
    """Initialize an ArchivedHttpRequest.

    Args:
      command: a string (e.g. 'GET' or 'POST').
      host: a host name (e.g. 'www.google.com').
      full_path: a request path. Includes everything after the host & port in
          the URL (e.g. '/search?q=dogs').
      request_body: a request body string for a POST or None.
      headers: {key: value, ...} where key and value are strings.
      is_ssl: a boolean which is True iff the request is made via SSL.
    """
    self.command = command
    self.host = host
    self.full_path = full_path
    parsed_url = urlparse.urlparse(full_path) if full_path else None
    self.path = parsed_url.path if parsed_url else None
    self.request_body = request_body
    self.headers = headers
    self.is_ssl = is_ssl
    self.trimmed_headers = self._TrimHeaders(headers)
    self.formatted_request = self._GetFormattedRequest()
    self.cmp_seq = self._GetCmpSeq(parsed_url.query if parsed_url else None)

  def __str__(self):
    scheme = 'https' if self.is_ssl else 'http'
    return '%s %s://%s%s %s' % (
        self.command, scheme, self.host, self.full_path, self.trimmed_headers)

  def __repr__(self):
    return repr((self.command, self.host, self.full_path, self.request_body,
                 self.trimmed_headers, self.is_ssl))

  def __hash__(self):
    """Return an integer hash to use for hashed collections including dict."""
    return hash(repr(self))

  def __eq__(self, other):
    """Define the __eq__ method to match the hash behavior."""
    return repr(self) == repr(other)

  def __setstate__(self, state):
    """Influence how to unpickle.

    "headers" are the original request headers.
    "trimmed_headers" are the trimmed headers used for matching requests
    during replay.

    Args:
      state: a dictionary for __dict__
    """
    if 'full_headers' in state:
      # Fix older version of archive.
      state['headers'] = state['full_headers']
      del state['full_headers']
    if 'headers' not in state:
      raise HttpArchiveException(
          'Archived HTTP request is missing "headers". The HTTP archive is'
          ' likely from a previous version and must be re-recorded.')
    if 'path' in state:
      # Before, 'path' and 'path_without_query' were used and 'path' was
      # pickled. Now, 'path' has been renamed to 'full_path' and
      # 'path_without_query' has been renamed to 'path'. 'full_path' is
      # pickled, but 'path' is not. If we see 'path' here it means we are
      # dealing with an older archive.
      state['full_path'] = state['path']
      del state['path']
    state['trimmed_headers'] = self._TrimHeaders(dict(state['headers']))
    if 'is_ssl' not in state:
      state['is_ssl'] = False
    self.__dict__.update(state)
    parsed_url = urlparse.urlparse(self.full_path)
    self.path = parsed_url.path
    self.formatted_request = self._GetFormattedRequest()
    self.cmp_seq = self._GetCmpSeq(parsed_url.query)

  def __getstate__(self):
    """Influence how to pickle.

    Returns:
      a dict to use for pickling
    """
    state = self.__dict__.copy()
    del state['trimmed_headers']
    del state['path']
    del state['formatted_request']
    del state['cmp_seq']
    return state

  def _GetFormattedRequest(self):
    """Format request to make diffs easier to read.

    Returns:
      A string consisting of the request. Example:
      'GET www.example.com/path\nHeader-Key: header value\n'
    """
    parts = ['%s %s%s\n' % (self.command, self.host, self.full_path)]
    if self.request_body:
      parts.append('%s\n' % self.request_body)
    for k, v in self.trimmed_headers:
      k = '-'.join(x.capitalize() for x in k.split('-'))
      parts.append('%s: %s\n' % (k, v))
    return ''.join(parts)

  def _GetCmpSeq(self, query=None):
    """Compute a sequence out of query and header for difflib to compare.

    For example:
      [('q1', 'a1'), ('q2', 'a2'), ('k1', 'v1'), ('k2', 'v2')]
    will be returned for a request with URL:
      http://example.com/index.html?q1=a1&q2=a2
    and headers:
      k1: v1
      k2: v2

    Args:
      query: the query string in the URL.

    Returns:
      A sequence for difflib to compare.
    """
    if not query:
      return self.trimmed_headers
    return sorted(urlparse.parse_qsl(query)) + self.trimmed_headers

  def matches(self, command=None, host=None, full_path=None, is_ssl=None,
              use_query=True):
    """Returns true iff the request matches all parameters.

    Args:
      command: a string (e.g. 'GET' or 'POST').
      host: a host name (e.g. 'www.google.com').
      full_path: a request path with query string (e.g. '/search?q=dogs')
      is_ssl: whether the request is secure.
      use_query:
        If use_query is True, request matching uses both the hierarchical path
        and query string component.
        If use_query is False, request matching only uses the hierarchical
        path.

        e.g. req1 = GET www.test.com/index?aaaa
             req2 = GET www.test.com/index?bbbb

        If use_query is True, req1.matches(req2) evaluates to False.
        If use_query is False, req1.matches(req2) evaluates to True.

    Returns:
      True iff the request matches all parameters
    """
    if command is not None and command != self.command:
      return False
    if is_ssl is not None and is_ssl != self.is_ssl:
      return False
    if host is not None and host != self.host:
      return False
    if full_path is None:
      return True
    if use_query:
      return full_path == self.full_path
    else:
      return self.path == urlparse.urlparse(full_path).path

  @classmethod
  def _TrimHeaders(cls, headers):
    """Removes headers that are known to cause problems during replay.

    These headers are removed for the following reasons:
    - accept: Causes problems with www.bing.com. During record, CSS is fetched
        with *. During replay, it's text/css.
    - accept-charset, accept-language, referer: vary between clients.
    - cache-control: sometimes sent from Chrome with 'max-age=0' as value.
    - connection, method, scheme, url, version: Cause problems with spdy.
    - cookie: Extremely sensitive to request/response order.
    - keep-alive: Doesn't affect the content of the request, only some
        transient state of the transport layer.
    - user-agent: Changes with every Chrome version.
    - proxy-connection: Sent for proxy requests.
    - x-chrome-variations, x-client-data: Unique to each Chrome binary. Used by
        Google to collect statistics about Chrome's enabled features.

    Another variant to consider is dropping only the value from the header.
    However, this is particularly bad for the cookie header, because the
    presence of the cookie depends on the responses we've seen when the request
    is made.

    Args:
      headers: {header_key: header_value, ...}

    Returns:
      [(header_key, header_value), ...]  # (with undesirable headers removed)
    """
    # TODO(tonyg): Strip sdch from the request headers because we can't
    # guarantee that the dictionary will be recorded, so replay may not work.
    if 'accept-encoding' in headers:
      accept_encoding = headers['accept-encoding']
      accept_encoding = accept_encoding.replace('sdch', '')
      # Strip lzma so Opera's requests match archives recorded using Chrome.
      accept_encoding = accept_encoding.replace('lzma', '')
      stripped_encodings = [e.strip() for e in accept_encoding.split(',')]
      accept_encoding = ','.join(filter(bool, stripped_encodings))
      headers['accept-encoding'] = accept_encoding
    undesirable_keys = [
        'accept', 'accept-charset', 'accept-language', 'cache-control',
        'connection', 'cookie', 'keep-alive', 'method',
        'referer', 'scheme', 'url', 'version', 'user-agent', 'proxy-connection',
        'x-chrome-variations', 'x-client-data']
    return sorted([(k, v) for k, v in headers.items()
                   if k.lower() not in undesirable_keys])

  def is_conditional(self):
    """Return True if the request carries any conditional headers."""
    for header in self.CONDITIONAL_HEADERS:
      if header in self.headers:
        return True
    return False

  def create_request_without_conditions(self):
    stripped_headers = dict((k, v) for k, v in self.headers.iteritems()
                            if k.lower() not in self.CONDITIONAL_HEADERS)
    return ArchivedHttpRequest(
        self.command, self.host, self.full_path, self.request_body,
        stripped_headers, self.is_ssl)
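

# Matching note (illustrative; header values are hypothetical): equality and
# hashing are based on the *trimmed* headers, so two requests that differ only
# in headers dropped by _TrimHeaders() (user-agent, cookie, etc.) map to the
# same archive entry:
#
#   a = ArchivedHttpRequest('GET', 'www.example.com', '/', None,
#                           {'user-agent': 'Chrome/1.0'})
#   b = ArchivedHttpRequest('GET', 'www.example.com', '/', None,
#                           {'user-agent': 'Chrome/2.0'})
#   assert a == b and hash(a) == hash(b)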


class ArchivedHttpResponse(object):
  """All the data needed to recreate an HTTP response."""

  # CHUNK_EDIT_SEPARATOR is used to edit and view text content.
  # It is not sent in responses. It is added by get_data_as_text()
  # and removed by set_data().
  CHUNK_EDIT_SEPARATOR = '[WEB_PAGE_REPLAY_CHUNK_BOUNDARY]'

  # DELAY_EDIT_SEPARATOR is used to edit and view server delays.
  DELAY_EDIT_SEPARATOR = ('\n[WEB_PAGE_REPLAY_EDIT_ARCHIVE --- '
                          'Delays are above. Response content is below.]\n')

  def __init__(self, version, status, reason, headers, response_data,
               delays=None):
    """Initialize an ArchivedHttpResponse.

    Args:
      version: HTTP protocol version used by server.
          10 for HTTP/1.0, 11 for HTTP/1.1 (same as httplib).
      status: Status code returned by server (e.g. 200).
      reason: Reason phrase returned by server (e.g. "OK").
      headers: list of (header, value) tuples.
      response_data: list of content chunks.
          Concatenating the chunks gives the complete contents
          (i.e. the chunks do not have any lengths or delimiters).
          Do not include the final, zero-length chunk that marks the end.
      delays: dict of (ms) delays for 'connect', 'headers' and 'data'.
          e.g. {'connect': 50, 'headers': 150, 'data': [0, 10, 10]}
          connect - The time to connect to the server.
              Each resource has a value because Replay's record mode captures
              it. This includes the time for the SYN and SYN/ACK (1 rtt).
          headers - The time elapsed between the TCP connect and the headers.
              This typically includes all the server-time to generate a
              response.
          data - If the response is chunked, these are the times for each
              chunk.
    """
    self.version = version
    self.status = status
    self.reason = reason
    self.headers = headers
    self.response_data = response_data
    self.delays = delays
    self.fix_delays()

  def fix_delays(self):
    """Initialize delays, or check the number of data delays."""
    expected_num_delays = len(self.response_data)
    if not self.delays:
      self.delays = {
          'connect': 0,
          'headers': 0,
          'data': [0] * expected_num_delays
      }
    else:
      num_delays = len(self.delays['data'])
      if num_delays != expected_num_delays:
        raise HttpArchiveException(
            'Server delay length mismatch: %d (expected %d): %s' % (
                num_delays, expected_num_delays, self.delays['data']))

  def __repr__(self):
    return repr((self.version, self.status, self.reason, sorted(self.headers),
                 self.response_data))

  def __hash__(self):
    """Return an integer hash to use for hashed collections including dict."""
    return hash(repr(self))

  def __eq__(self, other):
    """Define the __eq__ method to match the hash behavior."""
    return repr(self) == repr(other)

  def __setstate__(self, state):
    """Influence how to unpickle.

    Args:
      state: a dictionary for __dict__
    """
    if 'server_delays' in state:
      state['delays'] = {
          'connect': 0,
          'headers': 0,
          'data': state['server_delays']
      }
      del state['server_delays']
    elif 'delays' not in state:
      state['delays'] = None
    self.__dict__.update(state)
    self.fix_delays()
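
  # Construction sketch (hypothetical values): a response whose body arrived
  # in three chunks, with per-chunk delays in milliseconds:
  #
  #   response = ArchivedHttpResponse(
  #       11, 200, 'OK',
  #       [('content-type', 'text/html'), ('transfer-encoding', 'chunked')],
  #       ['<html>', '<body>hi</body>', '</html>'],
  #       delays={'connect': 50, 'headers': 150, 'data': [0, 10, 10]})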

  def get_header(self, key, default=None):
    for k, v in self.headers:
      if key.lower() == k.lower():
        return v
    return default

  def set_header(self, key, value):
    for i, (k, v) in enumerate(self.headers):
      if key == k:
        self.headers[i] = (key, value)
        return
    self.headers.append((key, value))

  def remove_header(self, key):
    for i, (k, v) in enumerate(self.headers):
      if key.lower() == k.lower():
        self.headers.pop(i)
        return

  @staticmethod
  def _get_epoch_seconds(date_str):
    """Return the epoch seconds of a date header.

    Args:
      date_str: a date string (e.g. "Thu, 01 Dec 1994 16:00:00 GMT")

    Returns:
      epoch seconds as a float
    """
    date_tuple = email.utils.parsedate(date_str)
    if date_tuple:
      return calendar.timegm(date_tuple)
    return None

  def update_date(self, date_str, now=None):
    """Return an updated date based on its delta from the "Date" header.

    For example, if |date_str| is one week later than the "Date" header,
    then the returned date string is one week later than the current date.

    Args:
      date_str: a date string (e.g. "Thu, 01 Dec 1994 16:00:00 GMT")

    Returns:
      a date string
    """
    date_seconds = self._get_epoch_seconds(self.get_header('date'))
    header_seconds = self._get_epoch_seconds(date_str)
    if date_seconds and header_seconds:
      updated_seconds = header_seconds + (now or time.time()) - date_seconds
      return email.utils.formatdate(updated_seconds, usegmt=True)
    return date_str
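
  # Worked example for update_date() (hypothetical header values): if the
  # archived response has
  #     Date: Thu, 01 Dec 1994 16:00:00 GMT
  # then update_date('Thu, 01 Dec 1994 17:00:00 GMT') returns a date string
  # one hour after the current time, preserving the recorded delta.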
"Thu, 01 Dec 1994 16:00:00 GMT") 844 Returns: 845 a date string 846 """ 847 date_seconds = self._get_epoch_seconds(self.get_header('date')) 848 header_seconds = self._get_epoch_seconds(date_str) 849 if date_seconds and header_seconds: 850 updated_seconds = header_seconds + (now or time.time()) - date_seconds 851 return email.utils.formatdate(updated_seconds, usegmt=True) 852 return date_str 853 854 def is_gzip(self): 855 return self.get_header('content-encoding') == 'gzip' 856 857 def is_compressed(self): 858 return self.get_header('content-encoding') in ('gzip', 'deflate') 859 860 def is_chunked(self): 861 return self.get_header('transfer-encoding') == 'chunked' 862 863 def get_data_as_text(self): 864 """Return content as a single string. 865 866 Uncompresses and concatenates chunks with CHUNK_EDIT_SEPARATOR. 867 """ 868 content_type = self.get_header('content-type') 869 if (not content_type or 870 not (content_type.startswith('text/') or 871 content_type == 'application/x-javascript' or 872 content_type.startswith('application/json'))): 873 return None 874 if self.is_compressed(): 875 uncompressed_chunks = httpzlib.uncompress_chunks( 876 self.response_data, self.is_gzip()) 877 else: 878 uncompressed_chunks = self.response_data 879 return self.CHUNK_EDIT_SEPARATOR.join(uncompressed_chunks) 880 881 def get_delays_as_text(self): 882 """Return delays as editable text.""" 883 return json.dumps(self.delays, indent=2) 884 885 def get_response_as_text(self): 886 """Returns response content as a single string. 887 888 Server delays are separated on a per-chunk basis. Delays are in seconds. 889 Response content begins after DELAY_EDIT_SEPARATOR 890 """ 891 data = self.get_data_as_text() 892 if data is None: 893 logging.warning('Data can not be represented as text.') 894 data = '' 895 delays = self.get_delays_as_text() 896 return self.DELAY_EDIT_SEPARATOR.join((delays, data)) 897 898 def set_data(self, text): 899 """Inverse of get_data_as_text(). 900 901 Split on CHUNK_EDIT_SEPARATOR and compress if needed. 902 """ 903 text_chunks = text.split(self.CHUNK_EDIT_SEPARATOR) 904 if self.is_compressed(): 905 self.response_data = httpzlib.compress_chunks(text_chunks, self.is_gzip()) 906 else: 907 self.response_data = text_chunks 908 if not self.is_chunked(): 909 content_length = sum(len(c) for c in self.response_data) 910 self.set_header('content-length', str(content_length)) 911 912 def set_delays(self, delays_text): 913 """Inverse of get_delays_as_text(). 914 915 Args: 916 delays_text: JSON encoded text such as the following: 917 { 918 connect: 80, 919 headers: 80, 920 data: [6, 55, 0] 921 } 922 Times are in milliseconds. 923 Each data delay corresponds with one response_data value. 924 """ 925 try: 926 self.delays = json.loads(delays_text) 927 except (ValueError, KeyError) as e: 928 logging.critical('Unable to parse delays %s: %s', delays_text, e) 929 self.fix_delays() 930 931 def set_response_from_text(self, text): 932 """Inverse of get_response_as_text(). 933 934 Modifies the state of the archive according to the textual representation. 935 """ 936 try: 937 delays, data = text.split(self.DELAY_EDIT_SEPARATOR) 938 except ValueError: 939 logging.critical( 940 'Error parsing text representation. 


def main():
  class PlainHelpFormatter(optparse.IndentedHelpFormatter):
    def format_description(self, description):
      if description:
        return description + '\n'
      else:
        return ''

  option_parser = optparse.OptionParser(
      usage='%prog [ls|cat|edit|stats|merge] [options] replay_file(s)',
      formatter=PlainHelpFormatter(),
      description=__doc__,
      epilog='http://code.google.com/p/web-page-replay/')

  option_parser.add_option('-c', '--command', default=None,
                           action='store',
                           type='string',
                           help='Only show URLs matching this command.')
  option_parser.add_option('-o', '--host', default=None,
                           action='store',
                           type='string',
                           help='Only show URLs matching this host.')
  option_parser.add_option('-p', '--full_path', default=None,
                           action='store',
                           type='string',
                           help='Only show URLs matching this full path.')
  option_parser.add_option('-f', '--merged_file', default=None,
                           action='store',
                           type='string',
                           help='The output file to use when using the merge '
                                'command.')

  options, args = option_parser.parse_args()

  # Merge command expects an unlimited number of archives.
  if len(args) < 2:
    print 'args: %s' % args
    option_parser.error('Must specify a command and replay_file')

  command = args[0]
  replay_file = args[1]

  if not os.path.exists(replay_file):
    option_parser.error('Replay file "%s" does not exist' % replay_file)

  http_archive = HttpArchive.Load(replay_file)
  if command == 'ls':
    print http_archive.ls(options.command, options.host, options.full_path)
  elif command == 'cat':
    print http_archive.cat(options.command, options.host, options.full_path)
  elif command == 'stats':
    print http_archive.stats(options.command, options.host, options.full_path)
  elif command == 'merge':
    if not options.merged_file:
      print 'Error: Must specify a merged file name (use --merged_file)'
      return
    http_archive.merge(options.merged_file, args[2:])
  elif command == 'edit':
    http_archive.edit(options.command, options.host, options.full_path)
    http_archive.Persist(replay_file)
  else:
    option_parser.error('Unknown command "%s"' % command)
  return 0


if __name__ == '__main__':
  sys.exit(main())