1# Copyright (c) 2013 The Chromium Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5from __future__ import absolute_import
6from __future__ import division
7from __future__ import print_function
8
9import os
10import sys
11
12from py_vulcanize import html_generation_controller
13from py_vulcanize import js_utils
14from py_vulcanize import module
15from py_vulcanize import strip_js_comments
16import six
17
18
def _AddToPathIfNeeded(path):
  """Puts |path| at the front of sys.path unless it is already listed."""
  if path in sys.path:
    return
  sys.path.insert(0, path)
22
23
def _InitBeautifulSoup():
  """Makes the third_party parsing packages importable.

  The catapult root is taken to be three directory levels above this file.
  Its third_party/beautifulsoup4, third_party/html5lib-python and
  third_party/six directories are each put on sys.path if missing.
  """
  this_dir = os.path.dirname(__file__)
  catapult_path = os.path.abspath(
      os.path.join(this_dir, os.path.pardir, os.path.pardir, os.path.pardir))
  third_party_path = os.path.join(catapult_path, 'third_party')

  # Each new entry is inserted at the front of sys.path, so the order of this
  # tuple determines the relative order of the resulting entries.
  for package_dir in ('beautifulsoup4', 'html5lib-python', 'six'):
    _AddToPathIfNeeded(os.path.join(third_party_path, package_dir))
36
37
# Extend sys.path with the third_party package directories first; the bs4
# import is deliberately placed after this call rather than at the top of the
# file (presumably so that it can resolve to the bundled copy).
_InitBeautifulSoup()
import bs4
40
class Script(object):
  """Base class for a script found in a parsed HTML document.

  Holds the parsed node in |_soup|. Subclasses decide how the script's
  JavaScript gets written out.
  """

  def __init__(self, soup):
    """Stores |soup|.

    Raises:
      module.DepsException: if |soup| is falsy.
    """
    if soup:
      self._soup = soup
    else:
      raise module.DepsException('Script object created without soup')

  def AppendJSContentsToFile(self, f, *args, **kwargs):
    """Writes this script's JavaScript to |f|; subclasses must override."""
    raise NotImplementedError()
50
class InlineScript(Script):
  """A script whose JavaScript is embedded directly in the HTML.

  Exposes the script text, a comment-stripped variant of it, and the chain of
  tags the script is nested in.
  """

  def __init__(self, soup):
    super(InlineScript, self).__init__(soup)
    self.is_external = False
    # Caches filled in on first use by the properties of the same name.
    self._open_tags = None
    self._stripped_contents = None

  @property
  def contents(self):
    """The wrapped node's `.string`, converted to a text string."""
    script_text = self._soup.string
    return six.text_type(script_text)

  @property
  def stripped_contents(self):
    """|contents| run through strip_js_comments.StripJSComments.

    The result is cached, but a falsy result is recomputed on each access.
    """
    cached = self._stripped_contents
    if cached:
      return cached
    self._stripped_contents = strip_js_comments.StripJSComments(self.contents)
    return self._stripped_contents

  @property
  def open_tags(self):
    """The tags enclosing the script, outermost first, as _Tag objects.

    Ancestors are collected starting from the wrapped node's parent and
    stopping at the bs4.BeautifulSoup root (or when the parent chain ends).
    The innermost collected ancestor must be the <script> tag itself; it is
    asserted on and then left out of the result.
    """
    if self._open_tags:
      return self._open_tags

    ancestors = []
    node = self._soup.parent
    while node and not isinstance(node, bs4.BeautifulSoup):
      ancestors.append(_Tag(node.name, node.attrs))
      node = node.parent

    # Collected innermost-first; flip so the outermost tag comes first.
    ancestors.reverse()
    assert ancestors[-1].tag == 'script'
    ancestors.pop()

    self._open_tags = ancestors
    return ancestors

  def AppendJSContentsToFile(self, f, *args, **kwargs):
    """Writes the escaped script text to |f|, followed by a newline.

    Extra positional and keyword arguments are accepted and ignored.
    """
    f.write(js_utils.EscapeJSIfNeeded(self.contents))
    f.write('\n')
95
class ExternalScript(Script):
  """A <script> tag that points at its JavaScript through a src attribute."""

  def __init__(self, soup):
    """Wraps |soup|.

    Raises:
      Exception: if |soup| has no 'src' attribute.
    """
    super(ExternalScript, self).__init__(soup)
    if 'src' not in soup.attrs:
      raise Exception("{0} is not an external script.".format(soup))
    self._loaded_raw_script = None
    self.is_external = True

  @property
  def loaded_raw_script(self):
    """The raw script assigned through the setter, or None.

    Any falsy stored value is reported as None.
    """
    return self._loaded_raw_script or None

  @loaded_raw_script.setter
  def loaded_raw_script(self, value):
    self._loaded_raw_script = value

  @property
  def src(self):
    """The value of the tag's src attribute."""
    return self._soup.attrs['src']

  def AppendJSContentsToFile(self,
                             f,
                             use_include_tags_for_scripts,
                             dir_for_include_tag_root):
    """Writes the loaded script, or an <include> tag for it, to |f|.

    Nothing is written when no raw script has been loaded.

    Args:
      f: object with a write() method.
      use_include_tags_for_scripts: if true, write an <include> tag whose src
          is the raw script's filename relative to |dir_for_include_tag_root|;
          otherwise write the escaped script contents.
      dir_for_include_tag_root: base directory for the relative include path.
    """
    raw_script = self.loaded_raw_script
    if not raw_script:
      return

    if not use_include_tags_for_scripts:
      f.write(js_utils.EscapeJSIfNeeded(raw_script.contents))
      f.write('\n')
      return

    include_path = os.path.relpath(raw_script.filename,
                                   dir_for_include_tag_root)
    f.write("""<include src="%s">\n""" % include_path)
135
def _CreateSoupWithoutHeadOrBody(html):
  """Parses |html| and returns a soup holding only the head and body children.

  |html| is parsed with the html5lib tree builder. The children of the parsed
  <head> (if any), followed by the children of the parsed <body> (if any), are
  moved into a fresh, empty soup, so the returned tree has no <html>, <head>
  or <body> wrapper elements from that parse.

  Args:
    html: the HTML markup to parse.

  Returns:
    A bs4.BeautifulSoup whose top-level nodes are the former head and body
    children, in document order.
  """
  parsed = bs4.BeautifulSoup(html, 'html5lib')
  soup = bs4.BeautifulSoup()
  soup.reset()
  for section in (parsed.head, parsed.body):
    if not section:
      continue
    # Iterate over a snapshot: extract() deletes the node from
    # section.contents, and deleting from a list while iterating over it
    # skips the element that follows each removed one.
    for node in list(section.contents):
      node.extract()
      soup.append(node)
  return soup
149
150
class HTMLModuleParserResults(object):
  """Parsed view of one HTML module's dependencies and markup.

  The HTML is parsed once with the html5lib tree builder. The properties query
  that tree for scripts, imports and stylesheets; GenerateHTML() renders the
  markup that is left once those have been removed or rewritten.
  """

  def __init__(self, html):
    self._soup = bs4.BeautifulSoup(html, 'html5lib')
    # Lazily built caches for the `inline_scripts` and `scripts` properties.
    self._inline_scripts = None
    self._scripts = None

  @property
  def scripts_external(self):
    """The src values of all <script> tags that have a src attribute."""
    return [tag['src'] for tag in self._soup.find_all('script', src=True)]

  @property
  def inline_scripts(self):
    """InlineScript objects for the <script> tags without a src attribute.

    Each InlineScript wraps the tag's `.string` rather than the tag itself, so
    a tag whose `.string` is falsy makes the InlineScript constructor raise
    module.DepsException. The list is built once and cached.
    """
    if self._inline_scripts is None:
      tags = self._soup.find_all('script', src=None)
      self._inline_scripts = [InlineScript(tag.string) for tag in tags]
    return self._inline_scripts

  @property
  def scripts(self):
    """All <script> tags in document order, wrapped as Script objects.

    Tags with a src attribute become ExternalScript, all others InlineScript;
    both wrap the tag element itself. The list is built once and cached.
    """
    if self._scripts is None:
      # Build into a local first so a failure cannot leave a partial cache.
      scripts = []
      for element in self._soup.find_all('script'):
        if 'src' in element.attrs:
          scripts.append(ExternalScript(element))
        else:
          scripts.append(InlineScript(element))
      self._scripts = scripts
    return self._scripts

  @property
  def imports(self):
    """The href values of all <link rel="import"> tags."""
    return [tag['href'] for tag in self._soup.find_all('link', rel='import')]

  @property
  def stylesheets(self):
    """The href values of all <link rel="stylesheet"> tags."""
    return [tag['href']
            for tag in self._soup.find_all('link', rel='stylesheet')]

  @property
  def inline_stylesheets(self):
    """The `.string` of every <style> tag, converted to text."""
    return [six.text_type(tag.string) for tag in self._soup.find_all('style')]

  def YieldHTMLInPieces(self, controller, minify=False):
    """Yields the output of GenerateHTML(controller, minify) as one piece."""
    yield self.GenerateHTML(controller, minify)

  def GenerateHTML(self, controller, minify=False, prettify=False):
    """Renders the module's markup without its imports and scripts.

    Works on a re-parsed copy of the document, so the tree held by this object
    is not modified. Doctype and declaration nodes, <link rel="import"> tags
    and all <script> tags are removed. <style> tags and
    <link rel="stylesheet"> tags are replaced by what |controller| returns
    for them, or removed when it returns something falsy.

    Args:
      controller: object providing GetHTMLForInlineStylesheet(text) and
          GetHTMLForStylesheetHRef(href).
      minify: if true, HTML comments are removed as well.
      prettify: if true, return soup.prettify('utf-8') instead of the plain
          text rendering. NOTE(review): with an encoding argument prettify
          presumably returns encoded bytes rather than text -- confirm.

    Returns:
      The rendered markup with surrounding whitespace stripped.
    """
    soup = _CreateSoupWithoutHeadOrBody(six.text_type(self._soup))

    # Remove doctype and declaration nodes. Iterate over a snapshot because
    # extract() deletes the node from soup.contents, and deleting from a list
    # while iterating over it skips the following element.
    for node in list(soup.contents):
      if isinstance(node, (bs4.Doctype, bs4.Declaration)):
        node.extract()

    # Remove all imports.
    for import_link in soup.find_all('link', rel='import'):
      import_link.extract()

    # Remove all script links.
    for external_script in soup.find_all('script', src=True):
      external_script.extract()

    # Remove all in-line scripts.
    for inline_script in soup.find_all('script', src=None):
      inline_script.extract()

    # Process all in-line styles.
    for style in soup.find_all('style'):
      html = controller.GetHTMLForInlineStylesheet(six.text_type(style.string))
      if html:
        new_style = soup.new_tag('style')
        new_style.append(bs4.NavigableString(html))
        style.replace_with(new_style)
      else:
        style.extract()

    # Rewrite all external stylesheet hrefs or remove, as needed.
    for stylesheet_link in soup.find_all('link', rel='stylesheet'):
      html = controller.GetHTMLForStylesheetHRef(stylesheet_link['href'])
      if html:
        # The controller's HTML must contain exactly one <style> element,
        # which takes the place of the link.
        tmp = bs4.BeautifulSoup(html, 'html5lib').find_all('style')
        assert len(tmp) == 1
        stylesheet_link.replace_with(tmp[0])
      else:
        stylesheet_link.extract()

    # Remove comments if minifying.
    if minify:
      comments = soup.find_all(
          text=lambda text: isinstance(text, bs4.Comment))
      for comment in comments:
        comment.extract()

    if prettify:
      return soup.prettify('utf-8').strip()

    # We are done.
    return six.text_type(soup).strip()

  @property
  def html_contents_without_links_and_script(self):
    """GenerateHTML() output using a default HTMLGenerationController."""
    return self.GenerateHTML(
        html_generation_controller.HTMLGenerationController())
266
267
class _Tag(object):
  """Lightweight record of a tag: its name and its attributes."""

  def __init__(self, tag, attrs):
    """Stores the tag name and its attributes.

    Args:
      tag: the tag name.
      attrs: the tag's attributes, either a mapping of name to value or a
          sequence of (name, value) pairs.
    """
    self.tag = tag
    self.attrs = attrs

  def __repr__(self):
    """Renders the tag as '<name attr="value" ...>'."""
    attrs = self.attrs
    # Iterating a mapping directly yields only its keys, which would format
    # the first two characters of each attribute name instead of the
    # name/value pair; go through items() whenever it is available.
    pairs = attrs.items() if hasattr(attrs, 'items') else attrs
    attr_string = ' '.join('%s="%s"' % (pair[0], pair[1]) for pair in pairs)
    return '<%s %s>' % (self.tag, attr_string)
277
278
class HTMLModuleParser(object):
  """Entry point that turns HTML text into an HTMLModuleParserResults."""

  def Parse(self, html):
    """Parses |html|.

    Args:
      html: the HTML text to parse, or None, which is treated as empty.

    Returns:
      An HTMLModuleParserResults for the document.

    Raises:
      Exception: if |html| contains the literal text '< /script>'.
    """
    if html is None:
      html = ''
    elif '< /script>' in html:
      # The backslash is doubled so the message still reads <\/script>
      # without relying on the invalid escape sequence '\/'.
      raise Exception('Escape script tags with <\\/script>')

    return HTMLModuleParserResults(html)
289