#!/usr/bin/env python
# Copyright 2010 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""View and edit HTTP Archives.

To list all URLs in an archive:
  $ ./httparchive.py ls archive.wpr

To view the content of all URLs from example.com:
  $ ./httparchive.py cat --host example.com archive.wpr

To view the content of a particular URL:
  $ ./httparchive.py cat --host www.example.com --full_path /foo archive.wpr

To view the content of all URLs:
  $ ./httparchive.py cat archive.wpr

To edit a particular URL:
  $ ./httparchive.py edit --host www.example.com --full_path /foo archive.wpr

To print statistics of an archive:
  $ ./httparchive.py stats archive.wpr

To print statistics of a set of URLs:
  $ ./httparchive.py stats --host www.example.com archive.wpr

To merge multiple archives:
  $ ./httparchive.py merge --merged_file new.wpr archive1.wpr archive2.wpr ...
"""

import calendar
import certutils
import datetime
import cPickle
import difflib
import email.utils
import httplib
import httpzlib
import json
import logging
import optparse
import os
import StringIO
import subprocess
import sys
import tempfile
import time
import urlparse
from collections import defaultdict


def LogRunTime(fn):
  """Annotation which logs the run time of the function."""
  def wrapped(self, *args, **kwargs):
    start_time = time.time()
    try:
      return fn(self, *args, **kwargs)
    finally:
      run_time = (time.time() - start_time) * 1000.0
      logging.debug('%s: %dms', fn.__name__, run_time)
  return wrapped


class HttpArchiveException(Exception):
  """Base class for all exceptions in httparchive."""
  pass


class HttpArchive(dict):
  """Dict with ArchivedHttpRequest keys and ArchivedHttpResponse values.

  Attributes:
    responses_by_host: dict of {hostname, {request: response}}. This must
      remain in sync with the underlying dict of self. It is used as an
      optimization so that get_requests() doesn't have to linearly search
      all requests in the archive to find potential matches.
  """

  def __init__(self):  # pylint: disable=super-init-not-called
    self.responses_by_host = defaultdict(dict)

  def __setstate__(self, state):
    """Influence how to unpickle.

    Args:
      state: a dictionary for __dict__
    """
    self.__dict__.update(state)
    self.responses_by_host = defaultdict(dict)
    for request in self:
      self.responses_by_host[request.host][request] = self[request]

  def __getstate__(self):
    """Influence how to pickle.
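
    responses_by_host is not pickled; it is rebuilt from the dict contents
    in __setstate__.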

    Returns:
      a dict to use for pickling
    """
    state = self.__dict__.copy()
    del state['responses_by_host']
    return state

  def __setitem__(self, key, value):
    super(HttpArchive, self).__setitem__(key, value)
    if hasattr(self, 'responses_by_host'):
      self.responses_by_host[key.host][key] = value

  def __delitem__(self, key):
    super(HttpArchive, self).__delitem__(key)
    del self.responses_by_host[key.host][key]

  def get(self, request, default=None):
    """Return the archived response for a given request.

    Does extra checking for handling some HTTP request headers.

    Args:
      request: instance of ArchivedHttpRequest
      default: default value to return if request is not found

    Returns:
      Instance of ArchivedHttpResponse or default if no matching
      response is found
    """
    if request in self:
      return self[request]
    return self.get_conditional_response(request, default)

  def get_conditional_response(self, request, default):
    """Get the response based on the conditional HTTP request headers.

    Args:
      request: an ArchivedHttpRequest representing the original request.
      default: the default ArchivedHttpResponse to return if no match is
        found for the original request with conditional headers removed.

    Returns:
      an ArchivedHttpResponse with a status of 200, 304 (not modified), or
      412 (precondition failed)
    """
    response = default
    if request.is_conditional():
      stripped_request = request.create_request_without_conditions()
      if stripped_request in self:
        response = self[stripped_request]
        if response.status == 200:
          status = self.get_conditional_status(request, response)
          if status != 200:
            response = create_response(status)
    return response

  def get_conditional_status(self, request, response):
    status = 200
    last_modified = email.utils.parsedate(
        response.update_date(response.get_header('last-modified')))
    response_etag = response.get_header('etag')
    is_get_or_head = request.command.upper() in ('GET', 'HEAD')

    match_value = request.headers.get('if-match', None)
    if match_value:
      if self.is_etag_match(match_value, response_etag):
        status = 200
      else:
        status = 412  # precondition failed
    none_match_value = request.headers.get('if-none-match', None)
    if none_match_value:
      if self.is_etag_match(none_match_value, response_etag):
        status = 304
      elif is_get_or_head:
        status = 200
      else:
        status = 412
    if is_get_or_head and last_modified:
      for header in ('if-modified-since', 'if-unmodified-since'):
        date = email.utils.parsedate(request.headers.get(header, None))
        if date:
          if ((header == 'if-modified-since' and last_modified > date) or
              (header == 'if-unmodified-since' and last_modified < date)):
            if status != 412:
              status = 200
          else:
            status = 304  # not modified
    return status

  @staticmethod
  def is_etag_match(request_etag, response_etag):
    """Determines whether the entity tags of the request/response match.
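
    For example, a request value of '"x", "y"' matches a response etag of
    '"y"', and a request value of '*' matches any response etag.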

    Args:
      request_etag: the value string of the "if-(none)-match:"
        portion of the request header
      response_etag: the etag value of the response

    Returns:
      True on match, False otherwise
    """
    response_etag = response_etag.strip('" ')
    for etag in request_etag.split(','):
      etag = etag.strip('" ')
      if etag in ('*', response_etag):
        return True
    return False

  def get_requests(self, command=None, host=None, full_path=None, is_ssl=None,
                   use_query=True):
    """Return a list of requests that match the given args."""
    if host:
      return [r for r in self.responses_by_host[host]
              if r.matches(command, None, full_path, is_ssl,
                           use_query=use_query)]
    else:
      return [r for r in self
              if r.matches(command, host, full_path, is_ssl,
                           use_query=use_query)]

  def ls(self, command=None, host=None, full_path=None):
    """List all URLs that match given params."""
    return ''.join(sorted(
        '%s\n' % r for r in self.get_requests(command, host, full_path)))

  def cat(self, command=None, host=None, full_path=None):
    """Print the contents of all URLs that match given params."""
    out = StringIO.StringIO()
    for request in self.get_requests(command, host, full_path):
      print >>out, str(request)
      print >>out, 'Untrimmed request headers:'
      for k in request.headers:
        print >>out, '    %s: %s' % (k, request.headers[k])
      if request.request_body:
        print >>out, request.request_body
      print >>out, '---- Response Info', '-' * 51
      response = self[request]
      chunk_lengths = [len(x) for x in response.response_data]
      print >>out, ('Status: %s\n'
                    'Reason: %s\n'
                    'Headers delay: %s\n'
                    'Untrimmed response headers:') % (
          response.status, response.reason, response.delays['headers'])
      for k, v in response.original_headers:
        print >>out, '    %s: %s' % (k, v)
      print >>out, ('Chunk count: %s\n'
                    'Chunk lengths: %s\n'
                    'Chunk delays: %s') % (
          len(chunk_lengths), chunk_lengths, response.delays['data'])
      body = response.get_data_as_text()
      print >>out, '---- Response Data', '-' * 51
      if body:
        print >>out, body
      else:
        print >>out, '[binary data]'
      print >>out, '=' * 70
    return out.getvalue()

  def stats(self, command=None, host=None, full_path=None):
    """Print stats about the archive for all URLs that match given params."""
    matching_requests = self.get_requests(command, host, full_path)
    if not matching_requests:
      print 'Failed to find any requests matching given command, host, path.'
      return

    out = StringIO.StringIO()
    stats = {
        'Total': len(matching_requests),
        'Domains': defaultdict(int),
        'HTTP_response_code': defaultdict(int),
        'content_type': defaultdict(int),
        'Documents': defaultdict(int),
        }

    for request in matching_requests:
      stats['Domains'][request.host] += 1
      stats['HTTP_response_code'][self[request].status] += 1

      content_type = self[request].get_header('content-type')
      # Remove content type options for readability and higher level groupings.
      str_content_type = str(content_type.split(';')[0]
                             if content_type else None)
      stats['content_type'][str_content_type] += 1

      # Documents are the main URL requested and not a referenced resource.
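      # e.g. a top-level navigation to www.example.com counts as a Document;
      # the images and scripts it references carry a Referer header and do not.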
      if str_content_type == 'text/html' and 'referer' not in request.headers:
        stats['Documents'][request.host] += 1

    print >>out, json.dumps(stats, indent=4)
    return out.getvalue()

  def merge(self, merged_archive=None, other_archives=None):
    """Merge multiple archives into merged_archive by 'chaining' resources.

    Only resources that are not already part of the accumulated archive are
    added.
    """
    if not other_archives:
      print 'No archives passed to merge'
      return

    # Note we already loaded 'replay_file'.
    print 'Loaded %d responses' % len(self)

    for archive in other_archives:
      if not os.path.exists(archive):
        print 'Error: Replay file "%s" does not exist' % archive
        return

      http_archive_other = HttpArchive.Load(archive)
      print 'Loaded %d responses from %s' % (len(http_archive_other), archive)
      for r in http_archive_other:
        # Only resources that are not already part of the current archive
        # get added.
        if r not in self:
          print '\t %s ' % r
          self[r] = http_archive_other[r]
    self.Persist('%s' % merged_archive)

  def edit(self, command=None, host=None, full_path=None):
    """Edits the single request which matches given params."""
    editor = os.getenv('EDITOR')
    if not editor:
      print 'You must set the EDITOR environment variable.'
      return

    matching_requests = self.get_requests(command, host, full_path)
    if not matching_requests:
      print ('Failed to find any requests matching given command, host, '
             'full_path.')
      return

    if len(matching_requests) > 1:
      print 'Found multiple matching requests. Please refine.'
      print self.ls(command, host, full_path)

    response = self[matching_requests[0]]
    tmp_file = tempfile.NamedTemporaryFile(delete=False)
    tmp_file.write(response.get_response_as_text())
    tmp_file.close()
    subprocess.check_call([editor, tmp_file.name])
    response.set_response_from_text(''.join(open(tmp_file.name).readlines()))
    os.remove(tmp_file.name)

  def find_closest_request(self, request, use_path=False):
    """Find the closest matching request in the archive to the given request.

    Args:
      request: an ArchivedHttpRequest
      use_path: If True, the closest matching request's path component must
        match. (Note: this refers to the 'path' component within the URL, not
        the 'full path' which includes the query string component.)

        If use_path=True, the candidate will NOT match in the example below,
        e.g. request   = GET www.test.com/a?p=1
             candidate = GET www.test.com/b?p=1

        Even if use_path=False, urls with same paths are always favored.
        For example, candidate1 is considered a better match than candidate2.
          request    = GET www.test.com/a?p=1&q=2&r=3
          candidate1 = GET www.test.com/a?s=4
          candidate2 = GET www.test.com/b?p=1&q=2&r=3

    Returns:
      If a close match is found, return the instance of ArchivedHttpRequest.
      Otherwise, return None.
    """
    # Start with strictest constraints. This trims search space considerably.
    requests = self.get_requests(request.command, request.host,
                                 request.full_path, is_ssl=request.is_ssl,
                                 use_query=True)
    # Relax constraint: use_query if there is no match.
    if not requests:
      requests = self.get_requests(request.command, request.host,
                                   request.full_path, is_ssl=request.is_ssl,
                                   use_query=False)
    # Relax constraint: full_path if there is no match and use_path=False.
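    # For example (illustrative): for GET www.test.com/a?p=1, we first look
    # for archived requests with the same path and query, then the same path
    # with any query, and finally any path on www.test.com (command and
    # is_ssl must match at every step).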
    if not requests and not use_path:
      requests = self.get_requests(request.command, request.host,
                                   None, is_ssl=request.is_ssl,
                                   use_query=False)

    if not requests:
      return None

    if len(requests) == 1:
      return requests[0]

    matcher = difflib.SequenceMatcher(b=request.cmp_seq)

    # quick_ratio() is cheap to compute, but ratio() is expensive. So we call
    # quick_ratio() on all requests, sort them descending, and then loop
    # through until we find a candidate whose ratio() is >= the next
    # quick_ratio(). This works because quick_ratio() is guaranteed to be an
    # upper bound on ratio().
    candidates = []
    for candidate in requests:
      matcher.set_seq1(candidate.cmp_seq)
      candidates.append((matcher.quick_ratio(), candidate))

    candidates.sort(reverse=True, key=lambda c: c[0])

    best_match = (0, None)
    for i in xrange(len(candidates)):
      matcher.set_seq1(candidates[i][1].cmp_seq)
      best_match = max(best_match, (matcher.ratio(), candidates[i][1]))
      if i + 1 < len(candidates) and best_match[0] >= candidates[i + 1][0]:
        break
    return best_match[1]

  def diff(self, request):
    """Diff the given request to the closest matching request in the archive.

    Args:
      request: an ArchivedHttpRequest
    Returns:
      If a close match is found, return a textual diff between the requests.
      Otherwise, return None.
    """
    request_lines = request.formatted_request.split('\n')
    closest_request = self.find_closest_request(request)
    if closest_request:
      closest_request_lines = closest_request.formatted_request.split('\n')
      return '\n'.join(difflib.ndiff(closest_request_lines, request_lines))
    return None

  def get_server_cert(self, host):
    """Gets certificate from the server and stores it in archive"""
    request = ArchivedHttpRequest('SERVER_CERT', host, '', None, {})
    if request not in self:
      self[request] = create_response(200, body=certutils.get_host_cert(host))
    return self[request].response_data[0]

  def get_certificate(self, host):
    request = ArchivedHttpRequest('DUMMY_CERT', host, '', None, {})
    if request not in self:
      self[request] = create_response(200, body=self._generate_cert(host))
    return self[request].response_data[0]

  @classmethod
  def AssertWritable(cls, filename):
    """Raises an IOError if filename is not writable."""
    persist_dir = os.path.dirname(os.path.abspath(filename))
    if not os.path.exists(persist_dir):
      raise IOError('Directory does not exist: %s' % persist_dir)
    if os.path.exists(filename):
      if not os.access(filename, os.W_OK):
        raise IOError('Need write permission on file: %s' % filename)
    elif not os.access(persist_dir, os.W_OK):
      raise IOError('Need write permission on directory: %s' % persist_dir)

  @classmethod
  def Load(cls, filename):
    """Load an instance from filename."""
    return cPickle.load(open(filename, 'rb'))

  def Persist(self, filename):
    """Persist all state to filename."""
    try:
      original_checkinterval = sys.getcheckinterval()
      sys.setcheckinterval(2**31-1)  # Lock out other threads so nothing can
                                     # modify |self| during pickling.
      pickled_self = cPickle.dumps(self, cPickle.HIGHEST_PROTOCOL)
    finally:
      sys.setcheckinterval(original_checkinterval)
    with open(filename, 'wb') as f:
      f.write(pickled_self)


class ArchivedHttpRequest(object):
  """Record all the state that goes into a request.

  ArchivedHttpRequest instances are considered immutable so they can
  serve as keys for HttpArchive instances.
  (The immutability is not enforced.)

  Upon creation, the headers are "trimmed" (i.e. edited or dropped)
  and saved to self.trimmed_headers to allow requests to match in a wider
  variety of playback situations (e.g. using different user agents).

  For unpickling, 'trimmed_headers' is recreated from 'headers'. That
  allows for changes to the trim function and can help with debugging.
  """
  CONDITIONAL_HEADERS = [
      'if-none-match', 'if-match',
      'if-modified-since', 'if-unmodified-since']

  def __init__(self, command, host, full_path, request_body, headers,
               is_ssl=False):
    """Initialize an ArchivedHttpRequest.

    Args:
      command: a string (e.g. 'GET' or 'POST').
      host: a host name (e.g. 'www.google.com').
      full_path: a request path. Includes everything after the host & port in
          the URL (e.g. '/search?q=dogs').
      request_body: a request body string for a POST or None.
      headers: {key: value, ...} where key and value are strings.
      is_ssl: a boolean which is True iff the request is made via SSL.
    """
    self.command = command
    self.host = host
    self.full_path = full_path
    parsed_url = urlparse.urlparse(full_path) if full_path else None
    self.path = parsed_url.path if parsed_url else None
    self.request_body = request_body
    self.headers = headers
    self.is_ssl = is_ssl
    self.trimmed_headers = self._TrimHeaders(headers)
    self.formatted_request = self._GetFormattedRequest()
    self.cmp_seq = self._GetCmpSeq(parsed_url.query if parsed_url else None)

  def __str__(self):
    scheme = 'https' if self.is_ssl else 'http'
    return '%s %s://%s%s %s' % (
        self.command, scheme, self.host, self.full_path, self.trimmed_headers)

  def __repr__(self):
    return repr((self.command, self.host, self.full_path, self.request_body,
                 self.trimmed_headers, self.is_ssl))

  def __hash__(self):
    """Return an integer hash to use for hashed collections including dict."""
    return hash(repr(self))

  def __eq__(self, other):
    """Define the __eq__ method to match the hash behavior."""
    return repr(self) == repr(other)

  def __setstate__(self, state):
    """Influence how to unpickle.

    "headers" are the original request headers.
    "trimmed_headers" are the trimmed headers used for matching requests
    during replay.

    Args:
      state: a dictionary for __dict__
    """
    if 'full_headers' in state:
      # Fix older version of archive.
      state['headers'] = state['full_headers']
      del state['full_headers']
    if 'headers' not in state:
      raise HttpArchiveException(
          'Archived HTTP request is missing "headers". The HTTP archive is'
          ' likely from a previous version and must be re-recorded.')
    if 'path' in state:
      # Before, 'path' and 'path_without_query' were used and 'path' was
      # pickled. Now, 'path' has been renamed to 'full_path' and
      # 'path_without_query' has been renamed to 'path'. 'full_path' is
      # pickled, but 'path' is not. If we see 'path' here it means we are
      # dealing with an older archive.
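      # e.g. an old archive pickled path='/search?q=dogs'; here that becomes
      # full_path='/search?q=dogs', and path is recomputed as '/search' below.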
      state['full_path'] = state['path']
      del state['path']
    state['trimmed_headers'] = self._TrimHeaders(dict(state['headers']))
    if 'is_ssl' not in state:
      state['is_ssl'] = False
    self.__dict__.update(state)
    parsed_url = urlparse.urlparse(self.full_path)
    self.path = parsed_url.path
    self.formatted_request = self._GetFormattedRequest()
    self.cmp_seq = self._GetCmpSeq(parsed_url.query)

  def __getstate__(self):
    """Influence how to pickle.

    Returns:
      a dict to use for pickling
    """
    state = self.__dict__.copy()
    del state['trimmed_headers']
    del state['path']
    del state['formatted_request']
    del state['cmp_seq']
    return state

  def _GetFormattedRequest(self):
    """Format request to make diffs easier to read.

    Returns:
      A string consisting of the request. Example:
      'GET www.example.com/path\nHeader-Key: header value\n'
    """
    parts = ['%s %s%s\n' % (self.command, self.host, self.full_path)]
    if self.request_body:
      parts.append('%s\n' % self.request_body)
    for k, v in self.trimmed_headers:
      k = '-'.join(x.capitalize() for x in k.split('-'))
      parts.append('%s: %s\n' % (k, v))
    return ''.join(parts)

  def _GetCmpSeq(self, query=None):
    """Compute a sequence out of query and header for difflib to compare.

    For example:
      [('q1', 'a1'), ('q2', 'a2'), ('k1', 'v1'), ('k2', 'v2')]
    will be returned for a request with URL:
      http://example.com/index.html?q1=a1&q2=a2
    and headers:
      k1: v1
      k2: v2

    Args:
      query: the query string in the URL.

    Returns:
      A sequence for difflib to compare.
    """
    if not query:
      return self.trimmed_headers
    return sorted(urlparse.parse_qsl(query)) + self.trimmed_headers

  def matches(self, command=None, host=None, full_path=None, is_ssl=None,
              use_query=True):
    """Returns true iff the request matches all parameters.

    Args:
      command: a string (e.g. 'GET' or 'POST').
      host: a host name (e.g. 'www.google.com').
      full_path: a request path with query string (e.g. '/search?q=dogs')
      is_ssl: whether the request is secure.
      use_query:
        If use_query is True, request matching uses both the hierarchical path
        and query string component.
        If use_query is False, request matching only uses the hierarchical
        path.

        e.g. req1 = GET www.test.com/index?aaaa
             req2 = GET www.test.com/index?bbbb

        If use_query is True, req1.matches(req2) evaluates to False.
        If use_query is False, req1.matches(req2) evaluates to True.

    Returns:
      True iff the request matches all parameters
    """
    if command is not None and command != self.command:
      return False
    if is_ssl is not None and is_ssl != self.is_ssl:
      return False
    if host is not None and host != self.host:
      return False
    if full_path is None:
      return True
    if use_query:
      return full_path == self.full_path
    else:
      return self.path == urlparse.urlparse(full_path).path

  @classmethod
  def _TrimHeaders(cls, headers):
    """Removes headers that are known to cause problems during replay.

    These headers are removed for the following reasons:
    - accept: Causes problems with www.bing.com. During record, CSS is fetched
      with *. During replay, it's text/css.
    - accept-charset, accept-language, referer: vary between clients.
    - cache-control: sometimes sent from Chrome with 'max-age=0' as value.
    - connection, method, scheme, url, version: Cause problems with spdy.
    - cookie: Extremely sensitive to request/response order.
    - keep-alive: Doesn't affect the content of the request, only some
      transient state of the transport layer.
    - user-agent: Changes with every Chrome version.
    - proxy-connection: Sent for proxy requests.
    - x-chrome-variations, x-client-data: Unique to each Chrome binary. Used by
      Google to collect statistics about Chrome's enabled features.

    Another variant to consider is dropping only the value from the header.
    However, this is particularly bad for the cookie header, because the
    presence of the cookie depends on the responses we've seen when the request
    is made.

    Args:
      headers: {header_key: header_value, ...}

    Returns:
      [(header_key, header_value), ...]  # (with undesirable headers removed)
    """
    # TODO(tonyg): Strip sdch from the request headers because we can't
    # guarantee that the dictionary will be recorded, so replay may not work.
    if 'accept-encoding' in headers:
      accept_encoding = headers['accept-encoding']
      accept_encoding = accept_encoding.replace('sdch', '')
      # Strip lzma so Opera's requests match archives recorded using Chrome.
      accept_encoding = accept_encoding.replace('lzma', '')
      stripped_encodings = [e.strip() for e in accept_encoding.split(',')]
      accept_encoding = ','.join(filter(bool, stripped_encodings))
      headers['accept-encoding'] = accept_encoding
    undesirable_keys = [
        'accept', 'accept-charset', 'accept-language', 'cache-control',
        'connection', 'cookie', 'keep-alive', 'method',
        'referer', 'scheme', 'url', 'version', 'user-agent', 'proxy-connection',
        'x-chrome-variations', 'x-client-data']
    return sorted([(k, v) for k, v in headers.items()
                   if k.lower() not in undesirable_keys])

  def is_conditional(self):
    """Return True if the request has any conditional headers."""
    for header in self.CONDITIONAL_HEADERS:
      if header in self.headers:
        return True
    return False

  def create_request_without_conditions(self):
    stripped_headers = dict((k, v) for k, v in self.headers.iteritems()
                            if k.lower() not in self.CONDITIONAL_HEADERS)
    return ArchivedHttpRequest(
        self.command, self.host, self.full_path, self.request_body,
        stripped_headers, self.is_ssl)


class ArchivedHttpResponse(object):
  """All the data needed to recreate an HTTP response.

  Upon creation, the headers are "trimmed" (i.e. edited or dropped).
  The original headers are saved to self.original_headers, while the
  trimmed ones are used to allow responses to match in a wider variety
  of playback situations.

  For pickling, 'original_headers' are stored in the archive. For unpickling,
  the headers are trimmed again. That allows for changes to the trim
  function and can help with debugging.
  """

  # CHUNK_EDIT_SEPARATOR is used to edit and view text content.
  # It is not sent in responses. It is added by get_data_as_text()
  # and removed by set_data().
  CHUNK_EDIT_SEPARATOR = '[WEB_PAGE_REPLAY_CHUNK_BOUNDARY]'

  # DELAY_EDIT_SEPARATOR is used to edit and view server delays.
  DELAY_EDIT_SEPARATOR = ('\n[WEB_PAGE_REPLAY_EDIT_ARCHIVE --- '
                          'Delays are above. Response content is below.]\n')

  # This date was used in deterministic.js prior to switching to recorded
  # request time. See https://github.com/chromium/web-page-replay/issues/71
  # for details.
  DEFAULT_REQUEST_TIME = datetime.datetime(2008, 2, 29, 2, 26, 8, 254000)

  def __init__(self, version, status, reason, headers, response_data,
               delays=None, request_time=None):
    """Initialize an ArchivedHttpResponse.

    Args:
      version: HTTP protocol version used by server.
          10 for HTTP/1.0, 11 for HTTP/1.1 (same as httplib).
      status: Status code returned by server (e.g. 200).
      reason: Reason phrase returned by server (e.g. "OK").
      headers: list of (header, value) tuples.
      response_data: list of content chunks.
          Concatenating the chunks gives the complete contents
          (i.e. the chunks do not have any lengths or delimiters).
          Do not include the final, zero-length chunk that marks the end.
      delays: dict of (ms) delays for 'connect', 'headers' and 'data'.
          e.g. {'connect': 50, 'headers': 150, 'data': [0, 10, 10]}
          connect - The time to connect to the server.
            Each resource has a value because Replay's record mode captures it.
            This includes the time for the SYN and SYN/ACK (1 rtt).
          headers - The time elapsed between the TCP connect and the headers.
            This typically includes all the server-time to generate a response.
          data - If the response is chunked, these are the times for each
            chunk.
    """
    self.version = version
    self.status = status
    self.reason = reason
    self.original_headers = headers
    self.headers = self._TrimHeaders(headers)
    self.response_data = response_data
    self.delays = delays
    self.fix_delays()
    self.request_time = (
        request_time or ArchivedHttpResponse.DEFAULT_REQUEST_TIME)

  def fix_delays(self):
    """Initialize delays, or check the number of data delays."""
    expected_num_delays = len(self.response_data)
    if not self.delays:
      self.delays = {
          'connect': 0,
          'headers': 0,
          'data': [0] * expected_num_delays
          }
    else:
      num_delays = len(self.delays['data'])
      if num_delays != expected_num_delays:
        raise HttpArchiveException(
            'Server delay length mismatch: %d (expected %d): %s' % (
                num_delays, expected_num_delays, self.delays['data']))

  @classmethod
  def _TrimHeaders(cls, headers):
    """Removes headers that are known to cause problems during replay.

    These headers are removed for the following reasons:
    - content-security-policy: Causes problems with script injection.
    """
    undesirable_keys = ['content-security-policy']
    return [(k, v) for k, v in headers if k.lower() not in undesirable_keys]

  def __repr__(self):
    return repr((self.version, self.status, self.reason, sorted(self.headers),
                 self.response_data, self.request_time))

  def __hash__(self):
    """Return an integer hash to use for hashed collections including dict."""
    return hash(repr(self))

  def __eq__(self, other):
    """Define the __eq__ method to match the hash behavior."""
    return repr(self) == repr(other)

  def __setstate__(self, state):
    """Influence how to unpickle.

    "original_headers" are the original response headers.
    "headers" are the trimmed headers used for replaying responses.

    Args:
      state: a dictionary for __dict__
    """
    if 'server_delays' in state:
      state['delays'] = {
          'connect': 0,
          'headers': 0,
          'data': state['server_delays']
          }
      del state['server_delays']
    elif 'delays' not in state:
      state['delays'] = None
    # Set to the date that was hardcoded in deterministic.js originally.
    state.setdefault('request_time', ArchivedHttpResponse.DEFAULT_REQUEST_TIME)
    state['original_headers'] = state['headers']
    state['headers'] = self._TrimHeaders(state['original_headers'])
    self.__dict__.update(state)
    self.fix_delays()

  def __getstate__(self):
    """Influence how to pickle.

    Returns:
      a dict to use for pickling
    """
    state = self.__dict__.copy()
    state['headers'] = state['original_headers']
    del state['original_headers']
    return state

  def get_header(self, key, default=None):
    for k, v in self.headers:
      if key.lower() == k.lower():
        return v
    return default

  def set_header(self, key, value):
    for i, (k, v) in enumerate(self.headers):
      if key == k:
        self.headers[i] = (key, value)
        return
    self.headers.append((key, value))

  def remove_header(self, key):
    for i, (k, v) in enumerate(self.headers):
      if key.lower() == k.lower():
        self.headers.pop(i)
        return

  @staticmethod
  def _get_epoch_seconds(date_str):
    """Return the epoch seconds of a date header.

    Args:
      date_str: a date string (e.g. "Thu, 01 Dec 1994 16:00:00 GMT")
    Returns:
      epoch seconds as a float
    """
    date_tuple = email.utils.parsedate(date_str)
    if date_tuple:
      return calendar.timegm(date_tuple)
    return None

  def update_date(self, date_str, now=None):
    """Return an updated date based on its delta from the "Date" header.

    For example, if |date_str| is one week later than the "Date" header,
    then the returned date string is one week later than the current date.

    Args:
      date_str: a date string (e.g. "Thu, 01 Dec 1994 16:00:00 GMT")
      now: optional epoch seconds to use as the current time instead of
          time.time().
    Returns:
      a date string
    """
    date_seconds = self._get_epoch_seconds(self.get_header('date'))
    header_seconds = self._get_epoch_seconds(date_str)
    if date_seconds and header_seconds:
      updated_seconds = header_seconds + (now or time.time()) - date_seconds
      return email.utils.formatdate(updated_seconds, usegmt=True)
    return date_str

  def is_gzip(self):
    return self.get_header('content-encoding') == 'gzip'

  def is_compressed(self):
    return self.get_header('content-encoding') in ('gzip', 'deflate')

  def is_chunked(self):
    return self.get_header('transfer-encoding') == 'chunked'

  def get_data_as_chunks(self):
    """Return content as a list of strings, each corresponding to a chunk.

    Uncompresses the chunks, if needed. Returns None if the content is not
    text.
    """
    content_type = self.get_header('content-type')
    if (not content_type or
        not (content_type.startswith('text/') or
             content_type == 'application/x-javascript' or
             content_type.startswith('application/json'))):
      return None
    if self.is_compressed():
      return httpzlib.uncompress_chunks(self.response_data, self.is_gzip())
    else:
      return self.response_data

  def get_data_as_text(self):
    """Return content as a single string.

    Uncompresses and concatenates chunks with CHUNK_EDIT_SEPARATOR.
    Returns None if the content is not text.
    """
    chunks = self.get_data_as_chunks()
    if chunks is None:
      return None
    return self.CHUNK_EDIT_SEPARATOR.join(chunks)

  def get_delays_as_text(self):
    """Return delays as editable text."""
    return json.dumps(self.delays, indent=2)

  def get_response_as_text(self):
    """Returns response content as a single string.

    Server delays are separated on a per-chunk basis. Delays are in
    milliseconds.
    Response content begins after DELAY_EDIT_SEPARATOR.
    """
    data = self.get_data_as_text()
    if data is None:
      logging.warning('Data can not be represented as text.')
      data = ''
    delays = self.get_delays_as_text()
    return self.DELAY_EDIT_SEPARATOR.join((delays, data))

  def set_data_from_chunks(self, text_chunks):
    """Inverse of get_data_as_chunks().

    Compress, if needed.
    """
    if self.is_compressed():
      self.response_data = httpzlib.compress_chunks(text_chunks,
                                                     self.is_gzip())
    else:
      self.response_data = text_chunks
    if not self.is_chunked():
      content_length = sum(len(c) for c in self.response_data)
      self.set_header('content-length', str(content_length))

  def set_data(self, text):
    """Inverse of get_data_as_text().

    Split on CHUNK_EDIT_SEPARATOR and compress if needed.
    """
    self.set_data_from_chunks(text.split(self.CHUNK_EDIT_SEPARATOR))

  def set_delays(self, delays_text):
    """Inverse of get_delays_as_text().

    Args:
      delays_text: JSON encoded text such as the following:
          {
            "connect": 80,
            "headers": 80,
            "data": [6, 55, 0]
          }
          Times are in milliseconds.
          Each data delay corresponds with one response_data value.
    """
    try:
      self.delays = json.loads(delays_text)
    except (ValueError, KeyError) as e:
      logging.critical('Unable to parse delays %s: %s', delays_text, e)
    self.fix_delays()

  def set_response_from_text(self, text):
    """Inverse of get_response_as_text().

    Modifies the state of the archive according to the textual representation.
    """
    try:
      delays, data = text.split(self.DELAY_EDIT_SEPARATOR)
    except ValueError:
      logging.critical(
          'Error parsing text representation. Skipping edits.')
      return
    self.set_delays(delays)
    self.set_data(data)


def create_response(status, reason=None, headers=None, body=None):
  """Convenience method for creating simple ArchivedHttpResponse objects."""
  if reason is None:
    reason = httplib.responses.get(status, 'Unknown')
  if headers is None:
    headers = [('content-type', 'text/plain')]
  if body is None:
    body = "%s %s" % (status, reason)
  return ArchivedHttpResponse(11, status, reason, headers, [body])


def main():
  class PlainHelpFormatter(optparse.IndentedHelpFormatter):
    def format_description(self, description):
      if description:
        return description + '\n'
      else:
        return ''

  option_parser = optparse.OptionParser(
      usage='%prog [ls|cat|edit|stats|merge] [options] replay_file(s)',
      formatter=PlainHelpFormatter(),
      description=__doc__,
      epilog='http://code.google.com/p/web-page-replay/')

  option_parser.add_option('-c', '--command', default=None,
                           action='store',
                           type='string',
                           help='Only show URLs matching this command.')
  option_parser.add_option('-o', '--host', default=None,
                           action='store',
                           type='string',
                           help='Only show URLs matching this host.')
  option_parser.add_option('-p', '--full_path', default=None,
                           action='store',
                           type='string',
                           help='Only show URLs matching this full path.')
  option_parser.add_option('-f', '--merged_file', default=None,
                           action='store',
                           type='string',
                           help='The output file to use for the merge command.')

  options, args = option_parser.parse_args()

  # The merge command expects an unlimited number of archives.
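  # e.g. args = ['merge', 'archive1.wpr', 'archive2.wpr', ...]; args[0] is the
  # command and args[1] is the primary replay_file.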
  if len(args) < 2:
    print 'args: %s' % args
    option_parser.error('Must specify a command and replay_file')

  command = args[0]
  replay_file = args[1]

  if not os.path.exists(replay_file):
    option_parser.error('Replay file "%s" does not exist' % replay_file)

  http_archive = HttpArchive.Load(replay_file)
  if command == 'ls':
    print http_archive.ls(options.command, options.host, options.full_path)
  elif command == 'cat':
    print http_archive.cat(options.command, options.host, options.full_path)
  elif command == 'stats':
    print http_archive.stats(options.command, options.host, options.full_path)
  elif command == 'merge':
    if not options.merged_file:
      print 'Error: Must specify a merged file name (use --merged_file)'
      return 1
    http_archive.merge(options.merged_file, args[2:])
  elif command == 'edit':
    http_archive.edit(options.command, options.host, options.full_path)
    http_archive.Persist(replay_file)
  else:
    option_parser.error('Unknown command "%s"' % command)
  return 0


if __name__ == '__main__':
  sys.exit(main())