1# Copyright (c) 2013 The Chromium Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5import os
6import sys
7
8from py_vulcanize import module
9from py_vulcanize import strip_js_comments
10from py_vulcanize import html_generation_controller
11
12
def _AddToPathIfNeeded(path):
  """Puts |path| at the front of sys.path unless it is already listed."""
  if path in sys.path:
    return
  sys.path.insert(0, path)
16
17
def _InitBeautifulSoup():
  """Adds the bundled beautifulsoup4, html5lib-python and six dirs to sys.path.

  The checkout root is computed as three directories above the directory that
  contains this file; the libraries are looked up under its third_party/
  directory.
  """
  this_dir = os.path.dirname(__file__)
  catapult_path = os.path.abspath(
      os.path.join(this_dir, os.path.pardir, os.path.pardir, os.path.pardir))
  third_party_path = os.path.join(catapult_path, 'third_party')

  # Every entry is inserted at the front of sys.path, so the directory listed
  # last here ends up first on the path.
  for lib_dir in ('beautifulsoup4', 'html5lib-python', 'six'):
    _AddToPathIfNeeded(os.path.join(third_party_path, lib_dir))
30
31
32_InitBeautifulSoup()
33import bs4
34
35
class InlineScript(object):
  """The text of one inline <script> plus the tags that enclose it.

  Derived values (stripped_contents, open_tags) are computed on first access
  and then cached on the instance.
  """

  def __init__(self, soup):
    """Wraps the bs4 node holding the script text.

    Args:
      soup: The node whose .string is the script source and whose .parent
          chain leads up through the <script> element.

    Raises:
      module.DepsException: if |soup| is falsy (for example None or empty).
    """
    if not soup:
      raise module.DepsException('InlineScript created without soup')
    self._soup = soup
    # None means "not computed yet"; an empty result is a valid cached value.
    self._stripped_contents = None
    self._open_tags = None

  @property
  def contents(self):
    """The script source as a unicode string."""
    return unicode(self._soup.string)

  @property
  def stripped_contents(self):
    """The script source after strip_js_comments.StripJSComments (cached)."""
    # Compare against None rather than testing truthiness, so that a script
    # that strips down to '' is not re-processed on every access.
    if self._stripped_contents is None:
      self._stripped_contents = strip_js_comments.StripJSComments(
          self.contents)
    return self._stripped_contents

  @property
  def open_tags(self):
    """List of _Tag records for the script's ancestors, outermost first.

    The enclosing <script> element itself is not included. The list is
    computed once and cached.
    """
    # An empty list is a legitimate cached result (script directly under the
    # document root), so only None triggers the computation.
    if self._open_tags is not None:
      return self._open_tags

    open_tags = []
    cur = self._soup.parent
    # Walk upwards until the document root (or a detached node) is reached.
    while cur and not isinstance(cur, bs4.BeautifulSoup):
      open_tags.append(_Tag(cur.name, cur.attrs))
      cur = cur.parent

    # Collected innermost-first; callers want outermost-first.
    open_tags.reverse()
    # The immediate parent of the text node must be the <script> element;
    # it is dropped so that only the enclosing tags remain.
    assert open_tags and open_tags[-1].tag == 'script', (
        'InlineScript text is not directly inside a <script> element')
    open_tags.pop()

    self._open_tags = open_tags
    return self._open_tags
75
76
def _CreateSoupWithoutHeadOrBody(html):
  """Parses |html| and returns a soup without the <head>/<body> wrappers.

  The input is parsed with the html5lib parser. The children of the resulting
  <head> and then <body> (when present) are moved, in order, into a fresh,
  empty soup, so the wrapper elements themselves are left out.

  Args:
    html: The HTML text to parse.

  Returns:
    A bs4.BeautifulSoup whose top-level nodes are the former head and body
    children.
  """
  parsed = bs4.BeautifulSoup(html, 'html5lib')
  soup = bs4.BeautifulSoup()
  soup.reset()
  for container in (parsed.head, parsed.body):
    if not container:
      continue
    # Iterate over a snapshot: extract() deletes the node from
    # container.contents, and looping over a list while removing from it
    # skips the element that follows each removed one, which silently
    # dropped every second child.
    for node in list(container.contents):
      node.extract()
      soup.append(node)
  return soup
90
91
class HTMLModuleParserResults(object):
  """Parsed form of an HTML module.

  Exposes the module's dependencies (scripts, imports, stylesheets) and can
  regenerate the module's HTML with those dependencies removed or rewritten.
  """

  def __init__(self, html):
    """Parses |html| with the html5lib parser."""
    self._soup = bs4.BeautifulSoup(html, 'html5lib')
    # None means "not computed yet"; see inline_scripts.
    self._inline_scripts = None

  @property
  def scripts_external(self):
    """The src values of every <script> tag that has a src attribute."""
    tags = self._soup.findAll('script', src=True)
    return [t['src'] for t in tags]

  @property
  def inline_scripts(self):
    """InlineScript objects for every <script> without a src (cached).

    Raises:
      module.DepsException: raised by InlineScript when a script tag's
          .string is falsy.
    """
    # Test against None, not truthiness: a document without inline scripts
    # yields [], which must not trigger a fresh scan on every access.
    if self._inline_scripts is None:
      tags = self._soup.findAll('script', src=None)
      self._inline_scripts = [InlineScript(t.string) for t in tags]
    return self._inline_scripts

  @property
  def imports(self):
    """The href values of every <link rel="import"> tag."""
    tags = self._soup.findAll('link', rel='import')
    return [t['href'] for t in tags]

  @property
  def stylesheets(self):
    """The href values of every <link rel="stylesheet"> tag."""
    tags = self._soup.findAll('link', rel='stylesheet')
    return [t['href'] for t in tags]

  @property
  def inline_stylesheets(self):
    """The text of every <style> tag, each as a unicode string."""
    tags = self._soup.findAll('style')
    return [unicode(t.string) for t in tags]

  def YieldHTMLInPieces(self, controller, minify=False):
    """Yields the generated HTML; currently as one single piece."""
    yield self.GenerateHTML(controller, minify)

  def GenerateHTML(self, controller, minify=False, prettify=False):
    """Returns the module's HTML with dependencies stripped or rewritten.

    Imports and all scripts are removed; inline and linked stylesheets are
    replaced by whatever the controller supplies, or removed when it supplies
    nothing.

    Args:
      controller: Object providing GetHTMLForInlineStylesheet(contents) and
          GetHTMLForStylesheetHRef(href). A falsy return value removes the
          corresponding element.
      minify: When true, HTML comments are removed from the output.
      prettify: When true, the result of soup.prettify('utf-8') is returned
          (stripped) instead of the unicode serialization.

    Returns:
      The generated HTML with surrounding whitespace stripped.
    """
    soup = _CreateSoupWithoutHeadOrBody(unicode(self._soup))

    # Remove doctype and declaration nodes. Iterate over a snapshot because
    # extract() mutates soup.contents.
    for node in list(soup.contents):
      if isinstance(node, (bs4.Doctype, bs4.Declaration)):
        node.extract()

    # Remove all imports.
    for imp in soup.findAll('link', rel='import'):
      imp.extract()

    # Remove all script links.
    for script in soup.findAll('script', src=True):
      script.extract()

    # Remove all in-line scripts.
    for script in soup.findAll('script', src=None):
      script.extract()

    # Process all in-line styles.
    for style in soup.findAll('style'):
      html = controller.GetHTMLForInlineStylesheet(unicode(style.string))
      if html:
        ns = soup.new_tag('style')
        ns.append(bs4.NavigableString(html))
        style.replaceWith(ns)
      else:
        style.extract()

    # Rewrite all external stylesheet hrefs or remove, as needed.
    for stylesheet_link in soup.findAll('link', rel='stylesheet'):
      html = controller.GetHTMLForStylesheetHRef(stylesheet_link['href'])
      if html:
        # The controller is expected to hand back exactly one <style> element.
        tmp = bs4.BeautifulSoup(html, 'html5lib').findAll('style')
        assert len(tmp) == 1
        stylesheet_link.replaceWith(tmp[0])
      else:
        stylesheet_link.extract()

    # Remove comments if minifying.
    if minify:
      comments = soup.findAll(
          text=lambda text: isinstance(text, bs4.Comment))
      for comment in comments:
        comment.extract()

    if prettify:
      return soup.prettify('utf-8').strip()

    # We are done.
    return unicode(soup).strip()

  @property
  def html_contents_without_links_and_script(self):
    """GenerateHTML() output using a default HTMLGenerationController."""
    return self.GenerateHTML(
        html_generation_controller.HTMLGenerationController())
194
195
class _Tag(object):
  """Lightweight record of an element's tag name and attributes."""

  def __init__(self, tag, attrs):
    """Stores the tag name and its attributes.

    Args:
      tag: The element name, e.g. 'div'.
      attrs: Either a mapping of attribute name to value (as bs4 provides)
          or a sequence of (name, value) pairs.
    """
    self.tag = tag
    self.attrs = attrs

  def __repr__(self):
    """Returns '<tag name="value" ...>' for debugging output."""
    attrs = self.attrs or ()
    # A mapping iterates over its keys only, which previously made the
    # formatting below pick the first two characters of each attribute name.
    # Go through items() for mappings; pair sequences are used unchanged.
    pairs = attrs.items() if hasattr(attrs, 'items') else attrs
    attr_string = ' '.join(
        '%s="%s"' % (pair[0], self._FormatValue(pair[1])) for pair in pairs)
    return '<%s %s>' % (self.tag, attr_string)

  @staticmethod
  def _FormatValue(value):
    """Renders multi-valued attributes (lists) as space separated text."""
    if isinstance(value, (list, tuple)):
      return ' '.join('%s' % v for v in value)
    return value
205
206
class HTMLModuleParser(object):
  """Entry point that turns HTML text into an HTMLModuleParserResults."""

  def Parse(self, html):
    """Parses |html| and returns the results object.

    Args:
      html: The HTML text to parse; None is treated as the empty string.

    Returns:
      An HTMLModuleParserResults for the given text.

    Raises:
      Exception: if |html| contains the literal text '< /script>'.
    """
    if html is None:
      html = ''
    elif '< /script>' in html:
      # Raw string: '\/' is not a valid escape sequence, so the non-raw
      # literal triggers a warning on newer Python versions. The message text
      # is unchanged.
      raise Exception(r'Escape script tags with <\/script>')

    return HTMLModuleParserResults(html)
217