tvcm/tvcm/strip_js_comments.py

# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Utility function for stripping comments out of JavaScript source code."""

import re


def _TokenizeJS(text):
  """Splits source code text into segments in preparation for comment stripping.

  Note that this doesn't tokenize for parsing. There is no notion of statements,
  variables, strings, etc. The only tokens of interest are the comment-related
  delimiters "//", "/*", "*/" and the newline character; everything between two
  delimiters is emitted as a single opaque segment.

  Concatenating all yielded segments reproduces the input exactly.

  Args:
    text: The contents of a JavaScript file.

  Yields:
    A succession of non-empty strings in file order: each delimiter as its own
    item, and each run of text between delimiters as one item.
  """
  delimiters = ("//", "/*", "*/", "\n")
  delimiter_re = re.compile("|".join(re.escape(d) for d in delimiters))

  # Scan the original string in place rather than repeatedly slicing off the
  # consumed prefix; re-slicing the remainder copies it on every token and
  # makes tokenizing quadratic in the size of the file.
  pos = 0
  for match in delimiter_re.finditer(text):
    if match.start() > pos:
      # Plain text sitting between the previous delimiter and this one.
      yield text[pos:match.start()]
    yield match.group()
    pos = match.end()

  if pos < len(text):
    # Trailing text after the last delimiter (or the whole input if it
    # contains no delimiter at all).
    yield text[pos:]


def StripJSComments(text):
  """Strips comments out of JavaScript source code.

  The stripping is purely textual and works on the segments produced by
  _TokenizeJS:

    * A "//" comment is removed up to and including the newline that ends it
      (or up to the end of the input if there is no newline).
    * A "/*" comment is removed up to and including its matching "*/". Block
      comments are counted as nesting, so an inner "/*" requires its own "*/"
      before the outer comment is considered closed.
    * An unterminated comment of either kind is removed through the end of the
      input.

  There is no awareness of string or regular expression literals, so comment
  delimiters that appear inside them are treated as comments too.

  Args:
    text: JavaScript source text.

  Returns:
    JavaScript source text with comments stripped out.
  """
  result_tokens = []
  # A single shared iterator: the inner loops below consume the tokens that
  # belong to a comment, and the outer loop resumes right after them. Plain
  # iteration is used instead of the Python 2-only generator .next() method so
  # this works on both Python 2 and Python 3.
  token_stream = iter(_TokenizeJS(text))
  for token in token_stream:
    if token == "//":
      # Discard the rest of the line, including the terminating newline.
      for skipped in token_stream:
        if skipped == "\n":
          break
    elif token == "/*":
      # Discard until the nesting depth returns to zero.
      nesting = 1
      for skipped in token_stream:
        if skipped == "/*":
          nesting += 1
        elif skipped == "*/":
          nesting -= 1
          if nesting == 0:
            break
    else:
      result_tokens.append(token)
  return "".join(result_tokens)