1# Copyright (c) 2013 The Chromium Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5import os
6import sys
7
8from py_vulcanize import js_utils
9from py_vulcanize import module
10from py_vulcanize import strip_js_comments
11from py_vulcanize import html_generation_controller
12
13
def _AddToPathIfNeeded(path):
  """Puts path at the front of sys.path unless it is already listed there."""
  if path in sys.path:
    return
  sys.path.insert(0, path)
17
18
def _InitBeautifulSoup():
  """Adds the bundled beautifulsoup4, html5lib-python and six to sys.path.

  The directories are looked up under 'third_party' in the directory three
  levels above the one containing this file.
  """
  this_dir = os.path.dirname(__file__)
  catapult_path = os.path.abspath(
      os.path.join(this_dir, os.path.pardir, os.path.pardir, os.path.pardir))

  # Same order as before: each new entry is pushed to the front of sys.path.
  for package_dir in ('beautifulsoup4', 'html5lib-python', 'six'):
    package_path = os.path.join(catapult_path, 'third_party', package_dir)
    _AddToPathIfNeeded(package_path)
31
32
33_InitBeautifulSoup()
34import bs4
35
class Script(object):
  """Base class for a script found in a parsed HTML module.

  Subclasses decide how the script's JavaScript is written to a file.
  """

  def __init__(self, soup):
    """Stores the parsed node backing this script.

    Raises:
      module.DepsException: if soup is falsy (for example None).
    """
    if soup:
      self._soup = soup
    else:
      raise module.DepsException('Script object created without soup')

  def AppendJSContentsToFile(self, f, *args, **kwargs):
    """Writes the script's JavaScript to f; must be overridden."""
    raise NotImplementedError
45
class InlineScript(Script):
  """A script whose JavaScript is embedded directly in the HTML document."""

  def __init__(self, soup):
    super(InlineScript, self).__init__(soup)
    # Lazily computed caches; None means "not computed yet".
    self._stripped_contents = None
    self._open_tags = None
    self.is_external = False

  @property
  def contents(self):
    """The text of the wrapped node's `.string`, as a unicode string."""
    # type(u'') is `unicode` on Python 2 and `str` on Python 3, whereas the
    # bare `unicode` builtin raises NameError on Python 3.
    return type(u'')(self._soup.string)

  @property
  def stripped_contents(self):
    """The script contents passed through StripJSComments (cached)."""
    # Compare against None so that an empty result is cached as well instead
    # of being recomputed on every access.
    if self._stripped_contents is None:
      self._stripped_contents = strip_js_comments.StripJSComments(
          self.contents)
    return self._stripped_contents

  @property
  def open_tags(self):
    """The _Tag chain enclosing the <script> element, outermost first.

    The walk starts at the parent of the wrapped node and stops at the
    BeautifulSoup document object. The innermost entry must be the <script>
    element itself and is dropped from the result, so this expects the
    wrapped node to be the script's string child.

    Raises:
      AssertionError: if the wrapped node's parent is not a <script> tag.
      IndexError: if the wrapped node has no enclosing tags at all.
    """
    # Compare against None so that an empty chain is cached too.
    if self._open_tags is not None:
      return self._open_tags

    ancestors = []
    cur = self._soup.parent
    while cur and not isinstance(cur, bs4.BeautifulSoup):
      ancestors.append(_Tag(cur.name, cur.attrs))
      cur = cur.parent

    # Collected innermost-first; callers get outermost-first.
    ancestors.reverse()
    assert ancestors[-1].tag == 'script'
    del ancestors[-1]

    self._open_tags = ancestors
    return self._open_tags

  def AppendJSContentsToFile(self, f, *args, **kwargs):
    """Writes the script contents (escaped if needed) and a newline to f.

    Extra positional and keyword arguments are accepted and ignored.
    """
    f.write(js_utils.EscapeJSIfNeeded(self.contents))
    f.write('\n')
90
class ExternalScript(Script):
  """A <script> element that points at its code through a src attribute."""

  def __init__(self, soup):
    """Wraps soup, which must carry a 'src' attribute.

    Raises:
      Exception: if soup has no 'src' attribute.
    """
    super(ExternalScript, self).__init__(soup)
    has_src = 'src' in soup.attrs
    if not has_src:
      raise Exception("{0} is not an external script.".format(soup))
    self._loaded_raw_script = None
    self.is_external = True

  @property
  def loaded_raw_script(self):
    """The object assigned through the setter, or None if unset or falsy."""
    return self._loaded_raw_script or None

  @loaded_raw_script.setter
  def loaded_raw_script(self, value):
    self._loaded_raw_script = value

  @property
  def src(self):
    """The value of the element's src attribute."""
    return self._soup.attrs['src']

  def AppendJSContentsToFile(self,
                             f,
                             use_include_tags_for_scripts,
                             dir_for_include_tag_root):
    """Writes the loaded script, or an <include> tag referencing it, to f.

    Nothing is written when no raw script has been loaded. With
    use_include_tags_for_scripts set, an <include> tag is emitted whose src
    is the script's filename relative to dir_for_include_tag_root; otherwise
    the script contents (escaped if needed) and a newline are written.
    """
    loaded = self.loaded_raw_script
    if not loaded:
      return

    if not use_include_tags_for_scripts:
      f.write(js_utils.EscapeJSIfNeeded(loaded.contents))
      f.write('\n')
      return

    include_path = os.path.relpath(loaded.filename, dir_for_include_tag_root)
    f.write('<include src="%s">\n' % include_path)
130
def _CreateSoupWithoutHeadOrBody(html):
  """Parses html and returns a soup holding only the head and body children.

  The markup is parsed with html5lib, then every child of <head> followed by
  every child of <body> is moved, in document order, into a fresh and
  otherwise empty BeautifulSoup object.

  Args:
    html: The markup to parse.

  Returns:
    A bs4.BeautifulSoup object whose top-level contents are the moved nodes.
  """
  parsed = bs4.BeautifulSoup(html, 'html5lib')
  soup = bs4.BeautifulSoup()
  soup.reset()
  for container in (parsed.head, parsed.body):
    if not container:
      continue
    # Iterate over a snapshot: extract() deletes the node from
    # container.contents, and looping over the live list while it shrinks
    # skips every second child (silently dropping adjacent siblings).
    for node in list(container.contents):
      node.extract()
      soup.append(node)
  return soup
144
145
class HTMLModuleParserResults(object):
  """A parsed HTML module exposing its scripts, imports, styles and markup.

  The document is parsed once with html5lib. The properties query the parsed
  tree, and GenerateHTML produces the markup that is left once scripts and
  imports are removed and stylesheets are rewritten by a controller.
  """

  def __init__(self, html):
    self._soup = bs4.BeautifulSoup(html, 'html5lib')
    # Lazily built caches; None means "not computed yet".
    self._inline_scripts = None
    self._scripts = None

  @property
  def scripts_external(self):
    """The src value of every <script> element that has a src attribute."""
    tags = self._soup.findAll('script', src=True)
    return [t['src'] for t in tags]

  @property
  def inline_scripts(self):
    """InlineScript objects for every <script> without a src attribute.

    Each object wraps the element's string child, not the element itself.
    """
    # Compare against None so that a document without inline scripts is not
    # re-queried on every access.
    if self._inline_scripts is None:
      tags = self._soup.findAll('script', src=None)
      self._inline_scripts = [InlineScript(t.string) for t in tags]
    return self._inline_scripts

  @property
  def scripts(self):
    """All <script> elements, in document order, as Script objects.

    Elements with a src attribute become ExternalScript, all others
    InlineScript; both wrap the element itself.
    """
    if self._scripts is None:
      # Built in one expression so a failure in a constructor cannot leave a
      # partially filled list behind as the cached value.
      self._scripts = [
          ExternalScript(element) if 'src' in element.attrs
          else InlineScript(element)
          for element in self._soup.findAll('script')]
    return self._scripts

  @property
  def imports(self):
    """The href of every <link rel="import"> element."""
    tags = self._soup.findAll('link', rel='import')
    return [t['href'] for t in tags]

  @property
  def stylesheets(self):
    """The href of every <link rel="stylesheet"> element."""
    tags = self._soup.findAll('link', rel='stylesheet')
    return [t['href'] for t in tags]

  @property
  def inline_stylesheets(self):
    """The text of every <style> element's `.string`, as unicode strings."""
    # type(u'') is `unicode` on Python 2 and `str` on Python 3, whereas the
    # bare `unicode` builtin raises NameError on Python 3.
    text_type = type(u'')
    tags = self._soup.findAll('style')
    return [text_type(t.string) for t in tags]

  def YieldHTMLInPieces(self, controller, minify=False):
    """Yields the output of GenerateHTML as a single piece."""
    yield self.GenerateHTML(controller, minify)

  def GenerateHTML(self, controller, minify=False, prettify=False):
    """Returns the module's markup with dependencies stripped or rewritten.

    Doctype and declaration nodes, HTML imports and all <script> elements
    are removed. Inline <style> elements and stylesheet links are replaced
    by whatever the controller returns for them, or removed when it returns
    nothing.

    Args:
      controller: Object providing GetHTMLForInlineStylesheet(contents) and
          GetHTMLForStylesheetHRef(href).
      minify: If true, HTML comments are removed as well.
      prettify: If true, the result of soup.prettify('utf-8') is returned
          (a UTF-8 encoded byte string per the Beautiful Soup docs).

    Returns:
      The stripped markup: a unicode string, or the prettified form when
      prettify is set.

    Raises:
      AssertionError: if the HTML returned for a stylesheet href does not
          contain exactly one <style> element.
    """
    text_type = type(u'')  # `unicode` on Python 2, `str` on Python 3.
    soup = _CreateSoupWithoutHeadOrBody(text_type(self._soup))

    # Remove doctype and declaration nodes. Iterate over a snapshot because
    # extract() deletes from soup.contents, and looping over the live list
    # would skip the node following each removed one.
    for node in list(soup.contents):
      if isinstance(node, (bs4.Doctype, bs4.Declaration)):
        node.extract()

    # Remove all imports.
    for imp in soup.findAll('link', rel='import'):
      imp.extract()

    # Remove all scripts, both external links and in-line ones.
    for script in soup.findAll('script'):
      script.extract()

    # Process all in-line styles.
    for style in soup.findAll('style'):
      html = controller.GetHTMLForInlineStylesheet(text_type(style.string))
      if html:
        ns = soup.new_tag('style')
        ns.append(bs4.NavigableString(html))
        style.replaceWith(ns)
      else:
        style.extract()

    # Rewrite all external stylesheet hrefs or remove, as needed.
    for stylesheet_link in soup.findAll('link', rel='stylesheet'):
      html = controller.GetHTMLForStylesheetHRef(stylesheet_link['href'])
      if html:
        tmp = bs4.BeautifulSoup(html, 'html5lib').findAll('style')
        assert len(tmp) == 1
        stylesheet_link.replaceWith(tmp[0])
      else:
        stylesheet_link.extract()

    # Remove comments if minifying.
    if minify:
      comments = soup.findAll(
          text=lambda text: isinstance(text, bs4.Comment))
      for comment in comments:
        comment.extract()

    if prettify:
      return soup.prettify('utf-8').strip()

    # We are done.
    return text_type(soup).strip()

  @property
  def html_contents_without_links_and_script(self):
    """GenerateHTML output using a default HTMLGenerationController."""
    return self.GenerateHTML(
        html_generation_controller.HTMLGenerationController())
261
262
class _Tag(object):
  """Lightweight record of an element's tag name and attributes."""

  def __init__(self, tag, attrs):
    self.tag = tag
    self.attrs = attrs

  def __repr__(self):
    """Renders the tag as '<name key="value" ...>'.

    attrs may be a mapping (what bs4 exposes as Tag.attrs) or a sequence of
    (name, value) pairs. Iterating a mapping directly would yield its keys,
    so the pairs are taken from items() in that case. List or tuple values
    are joined with single spaces.
    """
    attrs = self.attrs or ()
    if hasattr(attrs, 'items'):
      pairs = attrs.items()
    else:
      pairs = attrs

    rendered = []
    for pair in pairs:
      value = pair[1]
      if isinstance(value, (list, tuple)):
        value = ' '.join('%s' % (v,) for v in value)
      rendered.append('%s="%s"' % (pair[0], value))
    return '<%s %s>' % (self.tag, ' '.join(rendered))
272
273
class HTMLModuleParser(object):
  """Entry point that turns HTML text into an HTMLModuleParserResults."""

  def Parse(self, html):
    """Parses html and returns the results object.

    Args:
      html: The markup to parse; None is treated as an empty document.

    Returns:
      An HTMLModuleParserResults for the markup.

    Raises:
      Exception: if html contains the literal text '< /script>'.
    """
    if html is None:
      html = ''
    elif '< /script>' in html:
      # Raw string: '\/' is not a valid escape sequence, so the non-raw
      # spelling triggers an invalid-escape warning on current Python 3
      # versions. The message text itself is unchanged.
      raise Exception(r'Escape script tags with <\/script>')

    return HTMLModuleParserResults(html)
284