1#!/usr/bin/python2.4
2
3# Copyright 2012 the V8 project authors. All rights reserved.
4# Redistribution and use in source and binary forms, with or without
5# modification, are permitted provided that the following conditions are
6# met:
7#
8#     * Redistributions of source code must retain the above copyright
9#       notice, this list of conditions and the following disclaimer.
10#     * Redistributions in binary form must reproduce the above
11#       copyright notice, this list of conditions and the following
12#       disclaimer in the documentation and/or other materials provided
13#       with the distribution.
14#     * Neither the name of Google Inc. nor the names of its
15#       contributors may be used to endorse or promote products derived
16#       from this software without specific prior written permission.
17#
18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30"""A JavaScript minifier.
31
32It is far from being a complete JS parser, so there are many valid
33JavaScript programs that will be ruined by it.  Another strangeness is that
34it accepts $ and % as parts of identifiers.  It doesn't merge lines or strip
35out blank lines in order to ease debugging.  Variables at the top scope are
36properties of the global object so we can't rename them.  It is assumed that
37you introduce variables with var as if JavaScript followed C++ scope rules
38around curly braces, so the declaration must be above the first use.
39
40Use as:
41import jsmin
minifier = jsmin.JavaScriptMinifier()
43program1 = minifier.JSMinify(program1)
44program2 = minifier.JSMinify(program2)
45"""
46
47import re
48
49
class JavaScriptMinifier(object):
  """An object that you can feed code snippets to to get them minified.

  The minifier is stateful: the set of identifiers seen at the top level, the
  brace nesting depth and the "inside a /* */ comment" flag all persist
  between calls to JSMinify, so successive snippets are treated as one
  continuous program.
  """

  # Two-character language keywords.  Generated names are one or two
  # characters long, so these are the keywords a generated name could
  # collide with.
  _RESERVED_SHORT_KEYWORDS = ("do", "if", "in")

  # A regexp that matches a literal string surrounded by "double quotes".
  # This regexp can handle embedded backslash-escaped characters including
  # embedded backslash-escaped double quotes.
  _DOUBLE_QUOTED_STRING = r'"(?:[^"\\]|\\.)*"'
  # A regexp that matches a literal string surrounded by 'single quotes'.
  _SINGLE_QUOTED_STRING = r"'(?:[^'\\]|\\.)*'"
  # A regexp that matches a template string.
  _TEMPLATE_STRING = r"`(?:[^`\\]|\\.)*`"
  # A regexp that matches a regexp literal surrounded by /slashes/.
  # Don't allow a regexp to have a ) before the first ( since that's a
  # syntax error and it's probably just two unrelated slashes.
  # Also don't allow it to come after anything that can only be the
  # end of a primary expression.
  _SLASH_QUOTED_REGEXP = (
      r"(?<![\w$'\")\]])/(?:(?=\()|(?:[^()/\\]|\\.)+)(?:\([^/\\]|\\.)*/")

  # All literal forms.  Every rewriting pass lists these alternatives first so
  # that literals are consumed as a whole and their contents are not rewritten
  # by the alternatives that follow.
  _LITERALS = "|".join([_DOUBLE_QUOTED_STRING,
                        _SINGLE_QUOTED_STRING,
                        _TEMPLATE_STRING,
                        _SLASH_QUOTED_REGEXP])

  # Pass 1: runs of spaces.  Capture 1 ends up holding a single space.
  _MULTIPLE_SPACES = _LITERALS + "|" + "( )+"
  # Pass 2: single spaces that do not have an identifier character on both
  # sides.  % and $ are counted as identifier characters.  Capture 1 is either
  # empty or did not participate in the match.
  _REMOVABLE_SPACE = (
      _LITERALS + "|" + r"(?<![a-zA-Z_0-9$%]) | (?![a-zA-Z_0-9$%])()")

  _FUNCTION_DECLARATION = (
      r"\bfunction"              # Function definition keyword...
      r"( [\w$%]+)?"             # ...optional function name...
      r"\([\w$%,]+\)\{")         # ...argument declarations.

  # Pass 3, minus the final "variable use" alternative, which depends on the
  # contents of the line being processed.
  _DECLARATION_PREFIX = "|".join([_LITERALS,
                                  r"\{",                  # Curly braces.
                                  r"\}",
                                  r"\bvar [\w$%,]+",      # var declarations.
                                  _FUNCTION_DECLARATION])

  def __init__(self):
    # We prepopulate the list of identifiers that shouldn't be used.  These
    # short language keywords could otherwise be used by the script as variable
    # names.
    self.seen_identifiers = {}
    for keyword in self._RESERVED_SHORT_KEYWORDS:
      self.seen_identifiers[keyword] = True
    # Index of the next candidate short name in the current top-level scope.
    self.identifier_counter = 0
    # True while we are between a "/*" and the matching "*/" on a later line.
    self.in_comment = False
    # Maps original variable names to short names for the current scope.
    self.map = {}
    # Current curly brace nesting depth; 0 means top level.
    self.nesting = 0

  def LookAtIdentifier(self, m):
    """Records identifiers or keywords that we see in use.

    (So we can avoid renaming variables to these strings.)
    Args:
      m: The match object returned by re.search.

    Returns:
      Nothing.
    """
    identifier = m.group(1)
    self.seen_identifiers[identifier] = True

  def Push(self):
    """Called when we encounter a '{'."""
    self.nesting += 1

  def Pop(self):
    """Called when we encounter a '}'."""
    self.nesting -= 1
    # We treat each top-level opening brace as a single scope that can span
    # several sets of nested braces.
    if self.nesting == 0:
      self.map = {}
      self.identifier_counter = 0

  def Declaration(self, m):
    """Rewrites bits of the program selected by a regexp.

    These can be curly braces, literal strings, function declarations, var
    declarations and variable uses.  (Function and var declarations must be on
    one line including the opening curly brace of the function for their
    variables to be renamed).

    Args:
      m: The match object returned by re.search.

    Returns:
      The string that should replace the match in the rewritten program.
    """
    matched_text = m.group(0)

    if matched_text.startswith("`") and matched_text.endswith("`"):
      # A ${name} substitution is a *use* of a variable, never a declaration,
      # so only substitute names that were already renamed in this scope.
      # Allocating a new name here would rename references to globals.
      return re.sub(
          r"\$\{([\w$%]+)\}",
          lambda use: "${" + self.map.get(use.group(1), use.group(1)) + "}",
          matched_text)

    if matched_text == "{":
      self.Push()
      return matched_text
    if matched_text == "}":
      self.Pop()
      return matched_text
    # Strings and regexp literals are passed through untouched.
    if re.match("[\"'/]", matched_text):
      return matched_text

    if matched_text.startswith("var "):
      var_names = matched_text[len("var "):].split(",")
      return "var " + ",".join([self.FindNewName(name) for name in var_names])

    function_match = re.match(r"(function\b[^(]*)\((.*)\)\{$", matched_text)
    if function_match:
      up_to_args = function_match.group(1)
      args = function_match.group(2).split(",")
      # The declaration regexp includes the opening brace of the body.
      self.Push()
      renamed_args = [self.FindNewName(arg) for arg in args]
      return up_to_args + "(" + ",".join(renamed_args) + "){"

    # Anything else is a variable use: substitute it if it has been renamed.
    return self.map.get(matched_text, matched_text)

  def CharFromNumber(self, number):
    """A single-digit base-52 encoding using a-zA-Z.

    Args:
      number: An integer; 0-25 map to a-z and 26-51 map to A-Z.

    Returns:
      The corresponding one-character string.
    """
    if number < 26:
      return chr(number + 97)  # 97 == ord('a')
    return chr(number - 26 + 65)  # 65 == ord('A')

  def FindNewName(self, var_name):
    """Finds a new 1-character or 2-character name for a variable.

    Enters it into the mapping table for this scope.  Names are left alone at
    the top level (nesting 0), and so are the 'arguments' object and empty
    names.

    Args:
      var_name: The name of the variable before renaming.

    Returns:
      The new name of the variable.
    """
    # An empty name comes from a trailing or doubled comma in a declaration
    # list (e.g. "var a," continued on the next line); it is not a variable.
    if not var_name:
      return var_name
    if var_name in self.map:
      return self.map[var_name]
    # Variables at the top scope are left alone.
    if self.nesting == 0:
      return var_name
    # Do not rename arguments object.
    if var_name == 'arguments':
      return 'arguments'
    # Step through a, b, ..., Z, aa, ba, ... until we find a name that has not
    # been seen in use.  NOTE: the two-character space is exhausted once the
    # counter reaches 52 + 52 * 52; that case is not handled.
    while True:
      identifier_first_char = self.identifier_counter % 52
      identifier_second_char = self.identifier_counter // 52
      new_identifier = self.CharFromNumber(identifier_first_char)
      if identifier_second_char != 0:
        new_identifier = (
            self.CharFromNumber(identifier_second_char - 1) + new_identifier)
      self.identifier_counter += 1
      if new_identifier not in self.seen_identifiers:
        break

    self.map[var_name] = new_identifier
    return new_identifier

  def RemoveSpaces(self, m):
    """Returns literals unchanged, replaces other inputs with capture 1.

    Literals are quoted strings, template strings and regexp literals.  For
    anything else (a run of spaces) the replacement is the contents of
    capture 1, which is either a single space or an empty string.

    Args:
      m: The match object returned by re.search.

    Returns:
      The string that should be inserted instead of the matched text.
    """
    entire_match = m.group(0)
    for literal in (r"'.*'$", r'".*"$', r"`.*`$", r"/.+/$"):
      if re.match(literal, entire_match):
        return entire_match
    # Capture 1 is None when the alternative that matched does not contain
    # it; that means the space is to be dropped.
    return m.group(1) or ""

  def JSMinify(self, text):
    """The main entry point.  Takes a text and returns a compressed version.

    The compressed version hopefully does the same thing.  Line breaks are
    preserved.  Comments are removed before literals are recognized, so a
    comment marker inside a string literal is treated as a comment.

    Args:
      text: The text of the code snippet as a multiline string.

    Returns:
      The compressed text of the code snippet as a multiline string.
    """
    new_lines = []
    for line in text.split("\n"):
      line = line.replace("\t", " ")
      if self.in_comment:
        comment_end = line.find("*/")
        if comment_end < 0:
          # The whole line is inside a comment; keep the line break only.
          new_lines.append("")
          continue
        line = line[comment_end + len("*/"):]
        self.in_comment = False

      # Remove comments that are closed on this line, then // comments, then
      # the start of a comment that continues on a later line.
      line = re.sub(r"/\*.*?\*/", " ", line)
      line = re.sub(r"//.*", "", line)
      comment_start = line.find("/*")
      if comment_start >= 0:
        line = line[:comment_start]
        self.in_comment = True

      # Strip leading and trailing spaces.
      line = line.strip(" ")
      # Replace multiple spaces with a single space.
      line = re.sub(self._MULTIPLE_SPACES, self.RemoveSpaces, line)
      # Strip single spaces unless they have an identifier character both before
      # and after the space.  % and $ are counted as identifier characters.
      line = re.sub(self._REMOVABLE_SPACE, self.RemoveSpaces, line)
      # Collect keywords and identifiers that are already in use.
      if self.nesting == 0:
        for identifier in re.finditer(r"([a-zA-Z0-9_$%]+)", line):
          self.LookAtIdentifier(identifier)
      # Unfortunately the keyword-value syntax { key:value } makes the key look
      # like a variable where in fact it is a literal string.  We use the
      # presence or absence of a question mark to try to distinguish between
      # this case and the ternary operator: "condition ? iftrue : iffalse".
      if "?" in line:
        block_trailing_colon = r""
      else:
        block_trailing_colon = r"(?![:\w$%])"
      # Variable use.  Cannot follow a period or precede a colon.
      variable_use_regexp = r"(?<![.\w$%])[\w$%]+" + block_trailing_colon
      line = re.sub(self._DECLARATION_PREFIX + "|" + variable_use_regexp,
                    self.Declaration,
                    line)
      new_lines.append(line)

    return "\n".join(new_lines) + "\n"
299