# Copyright (c) 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import os
import sys

from py_vulcanize import module
from py_vulcanize import strip_js_comments
from py_vulcanize import html_generation_controller


def _AddToPathIfNeeded(path):
  """Prepends |path| to sys.path unless it is already listed there."""
  if path not in sys.path:
    sys.path.insert(0, path)


def _InitBeautifulSoup():
  """Puts the bundled beautifulsoup4, html5lib and six copies on sys.path.

  The third_party directory is looked up three directory levels above the
  directory that contains this file.
  """
  catapult_path = os.path.abspath(
      os.path.join(os.path.dirname(__file__),
                   os.path.pardir, os.path.pardir, os.path.pardir))
  bs_path = os.path.join(catapult_path, 'third_party', 'beautifulsoup4')
  _AddToPathIfNeeded(bs_path)

  html5lib_path = os.path.join(catapult_path, 'third_party', 'html5lib-python')
  _AddToPathIfNeeded(html5lib_path)

  six_path = os.path.join(catapult_path, 'third_party', 'six')
  _AddToPathIfNeeded(six_path)


# bs4 is imported only after its directory has been added to sys.path.
_InitBeautifulSoup()
import bs4


class InlineScript(object):
  """The text of one inline <script> together with the tags enclosing it."""

  def __init__(self, soup):
    """Wraps the node that holds an inline script's text.

    Args:
      soup: The node holding the script text. HTMLModuleParserResults passes
          the `.string` of a <script> tag that has no src attribute.

    Raises:
      module.DepsException: If |soup| is falsy (for example None).
    """
    if not soup:
      raise module.DepsException('InlineScript created without soup')
    self._soup = soup
    # Lazily computed caches; None means "not computed yet".
    self._stripped_contents = None
    self._open_tags = None

  @property
  def contents(self):
    """The script text as a unicode string."""
    return unicode(self._soup.string)

  @property
  def stripped_contents(self):
    """The script text after strip_js_comments.StripJSComments (cached)."""
    # Compare against None so that an empty result is cached as well instead
    # of being recomputed on every access.
    if self._stripped_contents is None:
      self._stripped_contents = strip_js_comments.StripJSComments(
          self.contents)
    return self._stripped_contents

  @property
  def open_tags(self):
    """List of _Tag for the ancestors of the <script>, outermost first.

    The <script> tag itself is not part of the list. The result is cached.
    """
    # Compare against None so that an empty list is cached as well.
    if self._open_tags is not None:
      return self._open_tags

    open_tags = []
    cur = self._soup.parent
    while cur:
      # Stop once the document root object is reached.
      if isinstance(cur, bs4.BeautifulSoup):
        break

      open_tags.append(_Tag(cur.name, cur.attrs))
      cur = cur.parent

    # Tags were collected innermost first; flip to outermost first.
    open_tags.reverse()
    # The innermost entry is the <script> tag itself; drop it.
    assert open_tags[-1].tag == 'script'
    del open_tags[-1]

    self._open_tags = open_tags
    return self._open_tags


def _CreateSoupWithoutHeadOrBody(html):
  """Parses |html| and returns a soup holding the head and body children.

  The children of <head> (if any) followed by the children of <body> (if any)
  are moved directly under a fresh, empty BeautifulSoup object.

  Args:
    html: The markup to parse with the 'html5lib' parser.

  Returns:
    A bs4.BeautifulSoup object containing the moved nodes.
  """
  soupCopy = bs4.BeautifulSoup(html, 'html5lib')
  soup = bs4.BeautifulSoup()
  soup.reset()
  for section in (soupCopy.head, soupCopy.body):
    if not section:
      continue
    # extract() removes the node from section.contents, so iterate over a
    # snapshot; iterating the live list would skip every other child.
    for n in list(section.contents):
      n.extract()
      soup.append(n)
  return soup


class HTMLModuleParserResults(object):
  """Queries and HTML generation over one parsed HTML module."""

  def __init__(self, html):
    """Parses |html| with the 'html5lib' parser."""
    self._soup = bs4.BeautifulSoup(html, 'html5lib')
    # Lazily built list of InlineScript; None means "not computed yet".
    self._inline_scripts = None

  @property
  def scripts_external(self):
    """The src values of all <script> tags that have a src attribute."""
    tags = self._soup.findAll('script', src=True)
    return [t['src'] for t in tags]

  @property
  def inline_scripts(self):
    """InlineScript objects for all <script> tags without src (cached)."""
    # Compare against None so that an empty list is cached as well.
    if self._inline_scripts is None:
      tags = self._soup.findAll('script', src=None)
      self._inline_scripts = [InlineScript(t.string) for t in tags]
    return self._inline_scripts

  @property
  def imports(self):
    """The href values of all <link rel="import"> tags."""
    tags = self._soup.findAll('link', rel='import')
    return [t['href'] for t in tags]

  @property
  def stylesheets(self):
    """The href values of all <link rel="stylesheet"> tags."""
    tags = self._soup.findAll('link', rel='stylesheet')
    return [t['href'] for t in tags]

  @property
  def inline_stylesheets(self):
    """The text of every <style> tag, as unicode strings."""
    tags = self._soup.findAll('style')
    return [unicode(t.string) for t in tags]

  def YieldHTMLInPieces(self, controller, minify=False):
    """Yields the result of GenerateHTML(controller, minify) as one piece."""
    yield self.GenerateHTML(controller, minify)

  def GenerateHTML(self, controller, minify=False, prettify=False):
    """Returns the module's HTML without imports and scripts.

    Doctype and declaration nodes, <link rel="import"> tags and all <script>
    tags are removed. Inline styles and stylesheet links are replaced by
    whatever |controller| returns for them, or removed when it returns a
    falsy value.

    Args:
      controller: Object providing GetHTMLForInlineStylesheet(text) and
          GetHTMLForStylesheetHRef(href).
      minify: If true, HTML comments are removed as well.
      prettify: If true, the result of soup.prettify('utf-8') is returned
          instead of the plain unicode serialization.

    Returns:
      The generated markup with surrounding whitespace stripped.
    """
    soup = _CreateSoupWithoutHeadOrBody(unicode(self._soup))

    # Remove doctype and declaration nodes. Iterate over a snapshot because
    # extract() mutates soup.contents.
    for x in list(soup.contents):
      if isinstance(x, (bs4.Doctype, bs4.Declaration)):
        x.extract()

    # Remove all imports.
    imports = soup.findAll('link', rel='import')
    for imp in imports:
      imp.extract()

    # Remove all script links.
    scripts_external = soup.findAll('script', src=True)
    for script in scripts_external:
      script.extract()

    # Remove all in-line scripts.
    scripts_inline = soup.findAll('script', src=None)
    for script in scripts_inline:
      script.extract()

    # Process all in-line styles.
    inline_styles = soup.findAll('style')
    for style in inline_styles:
      html = controller.GetHTMLForInlineStylesheet(unicode(style.string))
      if html:
        ns = soup.new_tag('style')
        ns.append(bs4.NavigableString(html))
        style.replaceWith(ns)
      else:
        style.extract()

    # Rewrite all external stylesheet hrefs or remove, as needed.
    stylesheet_links = soup.findAll('link', rel='stylesheet')
    for stylesheet_link in stylesheet_links:
      html = controller.GetHTMLForStylesheetHRef(stylesheet_link['href'])
      if html:
        # The controller is expected to return exactly one <style> element.
        tmp = bs4.BeautifulSoup(html, 'html5lib').findAll('style')
        assert len(tmp) == 1
        stylesheet_link.replaceWith(tmp[0])
      else:
        stylesheet_link.extract()

    # Remove comments if minifying.
    if minify:
      comments = soup.findAll(
          text=lambda text: isinstance(text, bs4.Comment))
      for comment in comments:
        comment.extract()

    if prettify:
      return soup.prettify('utf-8').strip()

    # We are done.
    return unicode(soup).strip()

  @property
  def html_contents_without_links_and_script(self):
    """GenerateHTML() run with a default HTMLGenerationController."""
    return self.GenerateHTML(
        html_generation_controller.HTMLGenerationController())


class _Tag(object):
  """A tag name together with its attributes."""

  def __init__(self, tag, attrs):
    """Stores the tag name and its attributes.

    Args:
      tag: The tag name.
      attrs: The tag's attributes, either a dict (as bs4 tags provide) or a
          sequence of (name, value) pairs.
    """
    self.tag = tag
    self.attrs = attrs

  def __repr__(self):
    attrs = self.attrs
    # Iterating a dict directly yields only its keys, so go through items()
    # when the attributes are a mapping.
    pairs = attrs.items() if hasattr(attrs, 'items') else attrs
    parts = []
    for name, value in pairs:
      # Multi-valued attributes (e.g. class) are held as lists by bs4.
      if isinstance(value, (list, tuple)):
        value = ' '.join(value)
      parts.append('%s="%s"' % (name, value))
    return '<%s %s>' % (self.tag, ' '.join(parts))


class HTMLModuleParser(object):
  """Entry point that turns HTML text into HTMLModuleParserResults."""

  def Parse(self, html):
    """Parses |html| and returns the results object.

    Args:
      html: The markup to parse. None is treated as the empty string.

    Returns:
      An HTMLModuleParserResults for the markup.

    Raises:
      Exception: If |html| contains the text '< /script>'.
    """
    if html is None:
      html = ''
    else:
      if html.find('< /script>') != -1:
        raise Exception('Escape script tags with <\\/script>')

    return HTMLModuleParserResults(html)