1# Protocol Buffers - Google's data interchange format
2# Copyright 2008 Google Inc.  All rights reserved.
3# https://developers.google.com/protocol-buffers/
4#
5# Redistribution and use in source and binary forms, with or without
6# modification, are permitted provided that the following conditions are
7# met:
8#
9#     * Redistributions of source code must retain the above copyright
10# notice, this list of conditions and the following disclaimer.
11#     * Redistributions in binary form must reproduce the above
12# copyright notice, this list of conditions and the following disclaimer
13# in the documentation and/or other materials provided with the
14# distribution.
15#     * Neither the name of Google Inc. nor the names of its
16# contributors may be used to endorse or promote products derived from
17# this software without specific prior written permission.
18#
19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31"""Contains routines for printing protocol messages in text format.
32
33Simple usage example:
34
35  # Create a proto object and serialize it to a text proto string.
36  message = my_proto_pb2.MyMessage(foo='bar')
37  text_proto = text_format.MessageToString(message)
38
39  # Parse a text proto string.
40  message = text_format.Parse(text_proto, my_proto_pb2.MyMessage())
41"""
42
43__author__ = 'kenton@google.com (Kenton Varda)'
44
45import io
46import re
47
48import six
49
# Python 3 has no separate `long` type, so alias the name to `int` there.
# NOTE(review): no use of `long` is visible in this part of the file;
# presumably code further down relies on it -- confirm before removing.
if six.PY3:
  long = int  # pylint: disable=redefined-builtin,invalid-name
52
53# pylint: disable=g-import-not-at-top
54from google.protobuf.internal import decoder
55from google.protobuf.internal import type_checkers
56from google.protobuf import descriptor
57from google.protobuf import text_encoding
58
# Names exported by `from <this module> import *`.
__all__ = ['MessageToString', 'Parse', 'PrintMessage', 'PrintField',
           'PrintFieldValue', 'Merge', 'MessageToBytes']

# One checker instance per integer flavour, in the order:
# uint32, int32, uint64, int64.
# NOTE(review): not referenced in the visible part of the file; presumably
# used by the integer parsing helpers -- confirm.
_INTEGER_CHECKERS = (type_checkers.Uint32ValueChecker(),
                     type_checkers.Int32ValueChecker(),
                     type_checkers.Uint64ValueChecker(),
                     type_checkers.Int64ValueChecker())
# Case-insensitive pattern: optional leading '-', then 'inf' or 'infinity',
# then an optional trailing 'f' (e.g. 'inf', '-Infinity', 'inff').
_FLOAT_INFINITY = re.compile('-?inf(?:inity)?f?$', re.IGNORECASE)
# Case-insensitive pattern: 'nan' with an optional trailing 'f'.
_FLOAT_NAN = re.compile('nanf?$', re.IGNORECASE)
# The single-quote and double-quote characters.
_QUOTES = frozenset(("'", '"'))
# Full name of the Any message type; compared against
# message.DESCRIPTOR.full_name to detect Any messages.
_ANY_FULL_TYPE_NAME = 'google.protobuf.Any'
70
71
class Error(Exception):
  """Top-level error type for the text_format module."""


class ParseError(Error):
  """Raised when text cannot be tokenized or parsed.

  When both a message and a line are supplied, the exception text is prefixed
  with the location: 'LINE : MESSAGE', or 'LINE:COLUMN : MESSAGE' if a column
  is supplied as well.
  """

  def __init__(self, message=None, line=None, column=None):
    """Initializes the error.

    Args:
      message: Optional description of the problem. When None, the exception
        is created without arguments.
      line: Optional line of the error; also returned by GetLine().
      column: Optional column of the error; also returned by GetColumn().
    """
    if message is None:
      super(ParseError, self).__init__()
    else:
      if line is not None:
        # Prefix the message with "line" or "line:column".
        position = str(line)
        if column is not None:
          position = '{0}:{1}'.format(position, column)
        message = '{0} : {1}'.format(position, message)
      super(ParseError, self).__init__(message)
    self._line = line
    self._column = column

  def GetLine(self):
    """Returns the line given at construction (None if not supplied)."""
    return self._line

  def GetColumn(self):
    """Returns the column given at construction (None if not supplied)."""
    return self._column
97
98
class TextWriter(object):
  """In-memory output sink wrapping an `io` buffer.

  The buffer is an io.BytesIO on Python 2 and an io.StringIO otherwise.
  """

  def __init__(self, as_utf8):
    """Creates the backing buffer.

    Args:
      as_utf8: Accepted for interface compatibility; not used here.
    """
    buffer_type = io.BytesIO if six.PY2 else io.StringIO
    self._writer = buffer_type()

  def write(self, val):
    """Appends val to the buffer and returns the buffer's write() result.

    On Python 2, unicode text is encoded as UTF-8 before being written.
    """
    if six.PY2 and isinstance(val, six.text_type):
      val = val.encode('utf-8')
    return self._writer.write(val)

  def close(self):
    """Closes the backing buffer."""
    return self._writer.close()

  def getvalue(self):
    """Returns everything written to the buffer so far."""
    return self._writer.getvalue()
118
119
def MessageToString(message,
                    as_utf8=False,
                    as_one_line=False,
                    use_short_repeated_primitives=False,
                    pointy_brackets=False,
                    use_index_order=False,
                    float_format=None,
                    double_format=None,
                    use_field_number=False,
                    descriptor_pool=None,
                    indent=0,
                    message_formatter=None,
                    print_unknown_fields=False):
  # type: (...) -> str
  """Renders a protobuf message as a text-format string.

  For doubles, double_format='.15g' gives a compact rendering with 15 digits
  of precision (the most an IEEE 754 "double" can guarantee), while
  double_format='.17g' should be used when the text has to parse back to an
  identical value.

  Args:
    message: The protocol buffers message to print.
    as_utf8: Return unescaped Unicode for non-ASCII characters.
        On Python 3 actual Unicode characters may then appear in strings.
        On Python 2 the result is valid UTF-8 rather than ASCII only.
    as_one_line: Do not put newlines between fields.
    use_short_repeated_primitives: Use the short repeated format for
      primitives.
    pointy_brackets: If True, nest with angle brackets instead of curly
      braces.
    use_index_order: If True, print fields in the order they are defined in
      source code rather than by field number; extensions are printed at the
      end of the message, ordered by extension number. The default is field
      number order.
    float_format: Format specification (per the "Format Specification
      Mini-Language") for float fields; when unset, 8 valid digits are used
      ('.8g'). Also applies to double fields when double_format is unset.
    double_format: Format specification (per the "Format Specification
      Mini-Language") for double fields; when unset, float_format is used if
      that is set, otherwise str().
    use_field_number: If True, print field numbers instead of names.
    descriptor_pool: A DescriptorPool used to resolve Any types.
    indent: The initial indent level, in spaces, for pretty printing.
    message_formatter: A function(message, indent, as_one_line): unicode|None
      used to custom format selected sub-messages (usually based on message
      type). Useful to pretty print parts of the protobuf for easier diffing.
    print_unknown_fields: If True, unknown fields are printed too.

  Returns:
    The text-format representation of the message. When as_one_line is set,
    trailing whitespace is stripped from the result.
  """
  writer = TextWriter(as_utf8)
  _Printer(
      out=writer,
      indent=indent,
      as_utf8=as_utf8,
      as_one_line=as_one_line,
      use_short_repeated_primitives=use_short_repeated_primitives,
      pointy_brackets=pointy_brackets,
      use_index_order=use_index_order,
      float_format=float_format,
      double_format=double_format,
      use_field_number=use_field_number,
      descriptor_pool=descriptor_pool,
      message_formatter=message_formatter,
      print_unknown_fields=print_unknown_fields).PrintMessage(message)
  text = writer.getvalue()
  writer.close()
  # One-line output ends with a separator space after the last field.
  return text.rstrip() if as_one_line else text
186
187
def MessageToBytes(message, **kwargs):
  # type: (...) -> bytes
  """Renders a protobuf message as encoded text format.

  Accepts the same keyword arguments as MessageToString. A text result is
  encoded as UTF-8 when as_utf8 is truthy and as ASCII otherwise; a result
  that is already bytes is returned unchanged.
  """
  rendered = MessageToString(message, **kwargs)
  if not isinstance(rendered, bytes):
    encoding = 'utf-8' if kwargs.get('as_utf8') else 'ascii'
    rendered = rendered.encode(encoding)
  return rendered
196
197
def _IsMapEntry(field):
  """Tells whether `field` is a message field whose type is a map entry.

  Args:
    field: A field descriptor.

  Returns:
    A truthy value if the field is of TYPE_MESSAGE and its message type has
    options with map_entry set; a falsy value otherwise.
  """
  if field.type != descriptor.FieldDescriptor.TYPE_MESSAGE:
    return False
  entry_type = field.message_type
  return entry_type.has_options and entry_type.GetOptions().map_entry
202
203
def PrintMessage(message,
                 out,
                 indent=0,
                 as_utf8=False,
                 as_one_line=False,
                 use_short_repeated_primitives=False,
                 pointy_brackets=False,
                 use_index_order=False,
                 float_format=None,
                 double_format=None,
                 use_field_number=False,
                 descriptor_pool=None,
                 message_formatter=None,
                 print_unknown_fields=False):
  """Prints a whole message in text format to `out`.

  Every option is handed unchanged to a _Printer, whose PrintMessage() does
  the actual work.

  Args:
    message: The protocol buffers message to print.
    out: Object with a write() method that receives the text.
    indent: The initial indent level, in spaces.
    as_utf8: Return unescaped Unicode for non-ASCII characters.
    as_one_line: Do not put newlines between fields.
    use_short_repeated_primitives: Use the short repeated format for
      primitives.
    pointy_brackets: If True, nest with angle brackets instead of curly
      braces.
    use_index_order: If True, print fields in source definition order rather
      than field number order.
    float_format: Format specification for float fields.
    double_format: Format specification for double fields.
    use_field_number: If True, print field numbers instead of names.
    descriptor_pool: A DescriptorPool used to resolve Any types.
    message_formatter: Optional function(message, indent, as_one_line) used to
      custom format selected sub-messages.
    print_unknown_fields: If True, unknown fields are printed too.
  """
  _Printer(out, indent, as_utf8, as_one_line,
           use_short_repeated_primitives, pointy_brackets, use_index_order,
           float_format, double_format, use_field_number, descriptor_pool,
           message_formatter, print_unknown_fields).PrintMessage(message)
231
232
def PrintField(field,
               value,
               out,
               indent=0,
               as_utf8=False,
               as_one_line=False,
               use_short_repeated_primitives=False,
               pointy_brackets=False,
               use_index_order=False,
               float_format=None,
               double_format=None,
               message_formatter=None,
               print_unknown_fields=False):
  """Prints one field as a name/value pair to `out`.

  The options are forwarded to a _Printer; see MessageToString for what each
  of them means.

  Args:
    field: The descriptor of the field to print.
    value: The value of the field.
    out: Object with a write() method that receives the text.
  """
  _Printer(
      out=out,
      indent=indent,
      as_utf8=as_utf8,
      as_one_line=as_one_line,
      use_short_repeated_primitives=use_short_repeated_primitives,
      pointy_brackets=pointy_brackets,
      use_index_order=use_index_order,
      float_format=float_format,
      double_format=double_format,
      message_formatter=message_formatter,
      print_unknown_fields=print_unknown_fields).PrintField(field, value)
253
254
def PrintFieldValue(field,
                    value,
                    out,
                    indent=0,
                    as_utf8=False,
                    as_one_line=False,
                    use_short_repeated_primitives=False,
                    pointy_brackets=False,
                    use_index_order=False,
                    float_format=None,
                    double_format=None,
                    message_formatter=None,
                    print_unknown_fields=False):
  """Prints only the value of one field (no field name) to `out`.

  The options are forwarded to a _Printer; see MessageToString for what each
  of them means.

  Args:
    field: The descriptor of the field to print.
    value: The value of the field.
    out: Object with a write() method that receives the text.
  """
  _Printer(
      out=out,
      indent=indent,
      as_utf8=as_utf8,
      as_one_line=as_one_line,
      use_short_repeated_primitives=use_short_repeated_primitives,
      pointy_brackets=pointy_brackets,
      use_index_order=use_index_order,
      float_format=float_format,
      double_format=double_format,
      message_formatter=message_formatter,
      print_unknown_fields=print_unknown_fields).PrintFieldValue(field, value)
275
276
def _BuildMessageFromTypeName(type_name, descriptor_pool):
  """Creates an empty message instance for a fully-qualified type name.

  Args:
    type_name: Fully-qualified protobuf message type name string.
    descriptor_pool: DescriptorPool instance to look the type up in; when
      None, the default descriptor pool is used.

  Returns:
    A new Message instance of the type matching type_name, or None if the
    pool has no Descriptor for type_name.
  """
  # Imported lazily, inside the function.
  # pylint: disable=g-import-not-at-top
  if descriptor_pool is None:
    from google.protobuf import descriptor_pool as pool_mod
    descriptor_pool = pool_mod.Default()
  from google.protobuf import symbol_database
  prototypes = symbol_database.Default()
  try:
    found_descriptor = descriptor_pool.FindMessageTypeByName(type_name)
  except KeyError:
    # The lookup signals an unknown type name with KeyError.
    return None
  return prototypes.GetPrototype(found_descriptor)()
300
301
# These values must match WireType enum in google/protobuf/wire_format.h.
# They are compared against the wire_type of unknown fields when printing
# them: start-group and length-delimited fields get special handling, and
# every other wire type is printed as a plain value.
WIRETYPE_LENGTH_DELIMITED = 2
WIRETYPE_START_GROUP = 3
305
306
class _Printer(object):
  """Text format printer for protocol message.

  The printer writes to `out` and keeps the current indentation (in spaces)
  in `self.indent`; the indent is raised by 2 while the contents of a nested
  message or unknown group are printed and restored afterwards.
  """

  def __init__(self,
               out,
               indent=0,
               as_utf8=False,
               as_one_line=False,
               use_short_repeated_primitives=False,
               pointy_brackets=False,
               use_index_order=False,
               float_format=None,
               double_format=None,
               use_field_number=False,
               descriptor_pool=None,
               message_formatter=None,
               print_unknown_fields=False):
    """Initialize the Printer.

    Double values can be formatted compactly with 15 digits of precision
    (which is the most that IEEE 754 "double" can guarantee) using
    double_format='.15g'. To ensure that converting to text and back to a proto
    will result in an identical value, double_format='.17g' should be used.

    Args:
      out: To record the text format result.
      indent: The initial indent level for pretty print.
      as_utf8: Return unescaped Unicode for non-ASCII characters.
          In Python 3 actual Unicode characters may appear as is in strings.
          In Python 2 the return value will be valid UTF-8 rather than ASCII.
      as_one_line: Don't introduce newlines between fields.
      use_short_repeated_primitives: Use short repeated format for primitives.
      pointy_brackets: If True, use angle brackets instead of curly braces for
        nesting.
      use_index_order: If True, print fields of a proto message using the order
        defined in source code instead of the field number. By default, use the
        field number order.
      float_format: If set, use this to specify float field formatting
        (per the "Format Specification Mini-Language"); otherwise, 8 valid
        digits is used (default '.8g'). Also affect double field if
        double_format is not set but float_format is set.
      double_format: If set, use this to specify double field formatting
        (per the "Format Specification Mini-Language"); if it is not set but
        float_format is set, use float_format. Otherwise, str() is used.
      use_field_number: If True, print field numbers instead of names.
      descriptor_pool: A DescriptorPool used to resolve Any types.
      message_formatter: A function(message, indent, as_one_line): unicode|None
        to custom format selected sub-messages (usually based on message type).
        Use to pretty print parts of the protobuf for easier diffing.
      print_unknown_fields: If True, unknown fields will be printed.
    """
    self.out = out
    self.indent = indent
    self.as_utf8 = as_utf8
    self.as_one_line = as_one_line
    self.use_short_repeated_primitives = use_short_repeated_primitives
    self.pointy_brackets = pointy_brackets
    self.use_index_order = use_index_order
    self.float_format = float_format
    # Doubles fall back to the float format when no double format is given.
    if double_format is not None:
      self.double_format = double_format
    else:
      self.double_format = float_format
    self.use_field_number = use_field_number
    self.descriptor_pool = descriptor_pool
    self.message_formatter = message_formatter
    self.print_unknown_fields = print_unknown_fields

  def _TryPrintAsAnyMessage(self, message):
    """Tries to print a google.protobuf.Any message in expanded form.

    Args:
      message: The Any message to print.

    Returns:
      True if the expanded '[type_url] { ... }' form was written. False, with
      nothing written, if type_url contains no '/' or no message could be
      built for the packed type.
    """
    if '/' not in message.type_url:
      return False
    packed_message = _BuildMessageFromTypeName(message.TypeName(),
                                               self.descriptor_pool)
    if not packed_message:
      return False
    packed_message.MergeFromString(message.value)
    self.out.write('%s[%s] ' % (self.indent * ' ', message.type_url))
    self._PrintMessageFieldValue(packed_message)
    self.out.write(' ' if self.as_one_line else '\n')
    return True

  def _TryCustomFormatMessage(self, message):
    """Tries to print `message` with the user supplied message_formatter.

    Args:
      message: The message to print.

    Returns:
      True if the formatter returned text, which is then written after the
      current indent and followed by a separator. False, with nothing
      written, if the formatter returned None.
    """
    formatted = self.message_formatter(message, self.indent, self.as_one_line)
    if formatted is None:
      return False

    out = self.out
    out.write(' ' * self.indent)
    out.write(formatted)
    out.write(' ' if self.as_one_line else '\n')
    return True

  def PrintMessage(self, message):
    """Convert protobuf message to text format.

    A custom message_formatter, if configured, gets the first chance to print
    the message; Any messages are then tried in expanded form. Otherwise all
    set fields are printed, followed by the unknown fields when
    print_unknown_fields is enabled.

    Args:
      message: The protocol buffers message.
    """
    if self.message_formatter and self._TryCustomFormatMessage(message):
      return
    if (message.DESCRIPTOR.full_name == _ANY_FULL_TYPE_NAME and
        self._TryPrintAsAnyMessage(message)):
      return
    fields = message.ListFields()
    if self.use_index_order:
      # Extensions are keyed by number, regular fields by declaration index.
      fields.sort(
          key=lambda x: x[0].number if x[0].is_extension else x[0].index)
    for field, value in fields:
      if _IsMapEntry(field):
        for key in sorted(value):
          # This is slow for maps with submessage entries because it copies the
          # entire tree.  Unfortunately this would take significant refactoring
          # of this file to work around.
          #
          # TODO(haberman): refactor and optimize if this becomes an issue.
          entry_submsg = value.GetEntryClass()(key=key, value=value[key])
          self.PrintField(field, entry_submsg)
      elif field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
        if (self.use_short_repeated_primitives
            and field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE
            and field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_STRING):
          self._PrintShortRepeatedPrimitivesValue(field, value)
        else:
          for element in value:
            self.PrintField(field, element)
      else:
        self.PrintField(field, value)

    if self.print_unknown_fields:
      self._PrintUnknownFields(message.UnknownFields())

  def _PrintNestedUnknownFields(self, unknown_fields):
    """Prints unknown fields wrapped in braces, one indent level deeper.

    Args:
      unknown_fields: The nested unknown fields to print between the braces.
    """
    out = self.out
    if self.as_one_line:
      out.write(' { ')
      self._PrintUnknownFields(unknown_fields)
      out.write('} ')
    else:
      out.write(' {\n')
      self.indent += 2
      self._PrintUnknownFields(unknown_fields)
      # Restore the indent before closing so the brace lines up with the
      # field number that opened the block.
      self.indent -= 2
      out.write(' ' * self.indent + '}\n')

  def _PrintUnknownFields(self, unknown_fields):
    """Print unknown fields.

    Each field is printed by number. Groups are printed as nested blocks.
    Length-delimited data is printed as a nested block when all of it decodes
    as a set of unknown fields, and as an escaped string otherwise. Every
    other wire type is printed with str().

    Args:
      unknown_fields: Iterable of unknown fields, each providing
        field_number, wire_type and data.
    """
    out = self.out
    for field in unknown_fields:
      out.write(' ' * self.indent)
      out.write(str(field.field_number))
      if field.wire_type == WIRETYPE_START_GROUP:
        self._PrintNestedUnknownFields(field.data)
      elif field.wire_type == WIRETYPE_LENGTH_DELIMITED:
        # Reset per field so a failed decode can never reuse the result of an
        # earlier iteration.
        embedded_unknown_message = None
        try:
          # If this field is parseable as a Message, it is probably
          # an embedded message.
          # pylint: disable=protected-access
          (embedded_unknown_message, pos) = decoder._DecodeUnknownFieldSet(
              memoryview(field.data), 0, len(field.data))
        except Exception:    # pylint: disable=broad-except
          pos = 0

        if (embedded_unknown_message is not None and
            pos == len(field.data)):
          self._PrintNestedUnknownFields(embedded_unknown_message)
        else:
          # A string or bytes field. self.as_utf8 may not work.
          out.write(': "')
          out.write(text_encoding.CEscape(field.data, False))
          out.write('" ' if self.as_one_line else '"\n')
      else:
        # varint, fixed32, fixed64
        out.write(': ')
        out.write(str(field.data))
        out.write(' ' if self.as_one_line else '\n')

  def _PrintFieldName(self, field):
    """Print field name.

    Writes the current indent followed by the field number (when
    use_field_number is set) or the field name. Extensions are printed in
    square brackets and groups by their message type name. A ':' is appended
    for every field that is not a message.

    Args:
      field: The descriptor of the field.
    """
    out = self.out
    out.write(' ' * self.indent)
    if self.use_field_number:
      out.write(str(field.number))
    else:
      if field.is_extension:
        out.write('[')
        if (field.containing_type.GetOptions().message_set_wire_format and
            field.type == descriptor.FieldDescriptor.TYPE_MESSAGE and
            field.label == descriptor.FieldDescriptor.LABEL_OPTIONAL):
          out.write(field.message_type.full_name)
        else:
          out.write(field.full_name)
        out.write(']')
      elif field.type == descriptor.FieldDescriptor.TYPE_GROUP:
        # For groups, use the capitalized name.
        out.write(field.message_type.name)
      else:
        out.write(field.name)

    if field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
      # The colon is optional in this case, but our cross-language golden files
      # don't include it.
      out.write(':')

  def PrintField(self, field, value):
    """Print a single field name/value pair.

    Args:
      field: The descriptor of the field to be printed.
      value: The value of the field (a single element for repeated fields).
    """
    self._PrintFieldName(field)
    self.out.write(' ')
    self.PrintFieldValue(field, value)
    self.out.write(' ' if self.as_one_line else '\n')

  def _PrintShortRepeatedPrimitivesValue(self, field, value):
    """Prints a repeated primitive field in the short 'name: [a, b]' form.

    Args:
      field: The descriptor of the repeated field.
      value: The elements of the field. Callers only pass non-empty values;
        an empty one is printed as '[]'.
    """
    self._PrintFieldName(field)
    self.out.write(' [')
    for position, element in enumerate(value):
      if position:
        self.out.write(', ')
      self.PrintFieldValue(field, element)
    self.out.write(']')
    self.out.write(' ' if self.as_one_line else '\n')

  def _PrintMessageFieldValue(self, value):
    """Prints a nested message enclosed in brackets.

    Curly braces are used unless pointy_brackets is set, in which case angle
    brackets are used. In multi-line mode the contents are indented by 2
    extra spaces and the closing bracket is written at the current indent.

    Args:
      value: The nested message to print.
    """
    if self.pointy_brackets:
      openb = '<'
      closeb = '>'
    else:
      openb = '{'
      closeb = '}'

    if self.as_one_line:
      self.out.write('%s ' % openb)
      self.PrintMessage(value)
      self.out.write(closeb)
    else:
      self.out.write('%s\n' % openb)
      self.indent += 2
      self.PrintMessage(value)
      self.indent -= 2
      self.out.write(' ' * self.indent + closeb)

  def PrintFieldValue(self, field, value):
    """Print a single field value (not including name).

    For repeated fields, the value should be a single element.

    Args:
      field: The descriptor of the field to be printed.
      value: The value of the field.
    """
    out = self.out
    if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
      self._PrintMessageFieldValue(value)
    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_ENUM:
      # Numbers without a matching enum value are printed as plain numbers.
      enum_value = field.enum_type.values_by_number.get(value, None)
      if enum_value is not None:
        out.write(enum_value.name)
      else:
        out.write(str(value))
    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_STRING:
      out.write('"')
      if isinstance(value, six.text_type) and (six.PY2 or not self.as_utf8):
        out_value = value.encode('utf-8')
      else:
        out_value = value
      if field.type == descriptor.FieldDescriptor.TYPE_BYTES:
        # We always need to escape all binary data in TYPE_BYTES fields.
        out_as_utf8 = False
      else:
        out_as_utf8 = self.as_utf8
      out.write(text_encoding.CEscape(out_value, out_as_utf8))
      out.write('"')
    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_BOOL:
      if value:
        out.write('true')
      else:
        out.write('false')
    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_FLOAT:
      if self.float_format is not None:
        out.write('{1:{0}}'.format(self.float_format, value))
      else:
        # Default: round to 8 significant digits, then print with str().
        out.write(str(float(format(value, '.8g'))))
    elif (field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_DOUBLE and
          self.double_format is not None):
      out.write('{1:{0}}'.format(self.double_format, value))
    else:
      out.write(str(value))
606
def Parse(text,
          message,
          allow_unknown_extension=False,
          allow_field_number=False,
          descriptor_pool=None,
          allow_unknown_field=False):
  """Parses text-format input and stores the result in `message`.

  NOTE: for historical reasons the input message is not cleared first, which
  differs from what the binary msg.ParseFrom(...) does.

  Example
    a = MyProto()
    a.repeated_field.append('test')
    b = MyProto()

    text_format.Parse(repr(a), b)
    text_format.Parse(repr(a), b) # repeated_field contains ["test", "test"]

    # Binary version:
    b.ParseFromString(a.SerializeToString()) # repeated_field is now "test"

  It is up to the caller to clear the message when that is needed.

  Args:
    text: Message text representation, as text or bytes.
    message: A protocol buffer message to merge into.
    allow_unknown_extension: if True, missing extensions are skipped and
      parsing continues.
    allow_field_number: if True, both field number and field name are allowed.
    descriptor_pool: A DescriptorPool used to resolve Any types.
    allow_unknown_field: if True, unknown fields are skipped and parsing
      continues. Avoid this option if possible: it can hide errors such as a
      misspelled field name.

  Returns:
    The same message passed as argument.

  Raises:
    ParseError: On text parsing problems.
  """
  # Split on a separator of the same type as the input.
  newline = b'\n' if isinstance(text, bytes) else u'\n'
  return ParseLines(
      text.split(newline),
      message,
      allow_unknown_extension=allow_unknown_extension,
      allow_field_number=allow_field_number,
      descriptor_pool=descriptor_pool,
      allow_unknown_field=allow_unknown_field)
654
655
def Merge(text,
          message,
          allow_unknown_extension=False,
          allow_field_number=False,
          descriptor_pool=None,
          allow_unknown_field=False):
  """Merges text-format input into `message`.

  Works like Parse(), except that a non-repeated field may be given more than
  once; the last value is the one that is kept.

  Args:
    text: Message text representation, as text or bytes.
    message: A protocol buffer message to merge into.
    allow_unknown_extension: if True, missing extensions are skipped and
      parsing continues.
    allow_field_number: if True, both field number and field name are allowed.
    descriptor_pool: A DescriptorPool used to resolve Any types.
    allow_unknown_field: if True, unknown fields are skipped and parsing
      continues. Avoid this option if possible: it can hide errors such as a
      misspelled field name.

  Returns:
    The same message passed as argument.

  Raises:
    ParseError: On text parsing problems.
  """
  # Split on a separator of the same type as the input.
  newline = b'\n' if isinstance(text, bytes) else u'\n'
  return MergeLines(
      text.split(newline),
      message,
      allow_unknown_extension=allow_unknown_extension,
      allow_field_number=allow_field_number,
      descriptor_pool=descriptor_pool,
      allow_unknown_field=allow_unknown_field)
691
692
def ParseLines(lines,
               message,
               allow_unknown_extension=False,
               allow_field_number=False,
               descriptor_pool=None,
               allow_unknown_field=False):
  """Parses text-format input, given line by line, into `message`.

  Args:
    lines: An iterable of lines of a message's text representation.
    message: A protocol buffer message to merge into.
    allow_unknown_extension: if True, missing extensions are skipped and
      parsing continues.
    allow_field_number: if True, both field number and field name are allowed.
    descriptor_pool: A DescriptorPool used to resolve Any types.
    allow_unknown_field: if True, unknown fields are skipped and parsing
      continues. Avoid this option if possible: it can hide errors such as a
      misspelled field name.

  Returns:
    The same message passed as argument.

  Raises:
    ParseError: On text parsing problems.
  """
  return _Parser(
      allow_unknown_extension=allow_unknown_extension,
      allow_field_number=allow_field_number,
      descriptor_pool=descriptor_pool,
      allow_unknown_field=allow_unknown_field).ParseLines(lines, message)
723
724
def MergeLines(lines,
               message,
               allow_unknown_extension=False,
               allow_field_number=False,
               descriptor_pool=None,
               allow_unknown_field=False):
  """Merges text-format input, given line by line, into `message`.

  Works like ParseLines(), except that a non-repeated field may be given more
  than once; the last value is the one that is kept.

  Args:
    lines: An iterable of lines of a message's text representation.
    message: A protocol buffer message to merge into.
    allow_unknown_extension: if True, missing extensions are skipped and
      parsing continues.
    allow_field_number: if True, both field number and field name are allowed.
    descriptor_pool: A DescriptorPool used to resolve Any types.
    allow_unknown_field: if True, unknown fields are skipped and parsing
      continues. Avoid this option if possible: it can hide errors such as a
      misspelled field name.

  Returns:
    The same message passed as argument.

  Raises:
    ParseError: On text parsing problems.
  """
  return _Parser(
      allow_unknown_extension=allow_unknown_extension,
      allow_field_number=allow_field_number,
      descriptor_pool=descriptor_pool,
      allow_unknown_field=allow_unknown_field).MergeLines(lines, message)
758
759
760class _Parser(object):
761  """Text format parser for protocol message."""
762
763  def __init__(self,
764               allow_unknown_extension=False,
765               allow_field_number=False,
766               descriptor_pool=None,
767               allow_unknown_field=False):
768    self.allow_unknown_extension = allow_unknown_extension
769    self.allow_field_number = allow_field_number
770    self.descriptor_pool = descriptor_pool
771    self.allow_unknown_field = allow_unknown_field
772
773  def ParseLines(self, lines, message):
774    """Parses a text representation of a protocol message into a message."""
775    self._allow_multiple_scalars = False
776    self._ParseOrMerge(lines, message)
777    return message
778
779  def MergeLines(self, lines, message):
780    """Merges a text representation of a protocol message into a message."""
781    self._allow_multiple_scalars = True
782    self._ParseOrMerge(lines, message)
783    return message
784
785  def _ParseOrMerge(self, lines, message):
786    """Converts a text representation of a protocol message into a message.
787
788    Args:
789      lines: Lines of a message's text representation.
790      message: A protocol buffer message to merge into.
791
792    Raises:
793      ParseError: On text parsing problems.
794    """
795    # Tokenize expects native str lines.
796    if six.PY2:
797      str_lines = (line if isinstance(line, str) else line.encode('utf-8')
798                   for line in lines)
799    else:
800      str_lines = (line if isinstance(line, str) else line.decode('utf-8')
801                   for line in lines)
802    tokenizer = Tokenizer(str_lines)
803    while not tokenizer.AtEnd():
804      self._MergeField(tokenizer, message)
805
  def _MergeField(self, tokenizer, message):
    """Merges a single protocol message field into a message.

    Three input shapes are recognized, checked in this order:
      * "[prefix/full.type.Name] { ... }" when `message` is a
        google.protobuf.Any: the body is parsed into the message returned by
        _BuildMessageFromTypeName, which is then packed into `message`. This
        branch returns early.
      * "[extension.name]": an extension of `message`. An unregistered
        extension is skipped only when allow_unknown_extension is set.
      * "name", or a field number when allow_field_number is set: a regular
        field. A group has to be written with its message type name. An
        unknown field is skipped only when allow_unknown_field is set.

    Except after an expanded Any, one optional trailing ',' or ';' is
    consumed once the field has been handled.

    Args:
      tokenizer: A tokenizer to parse the field name and values.
      message: A protocol message to record the data.

    Raises:
      ParseError: In case of text parsing problems.
    """
    message_descriptor = message.DESCRIPTOR
    # Expanded Any form: only tried when the message itself is an Any and the
    # next token opens a bracketed type URL.
    if (message_descriptor.full_name == _ANY_FULL_TYPE_NAME and
        tokenizer.TryConsume('[')):
      type_url_prefix, packed_type_name = self._ConsumeAnyTypeUrl(tokenizer)
      tokenizer.Consume(']')
      # The ':' before the body is optional.
      tokenizer.TryConsume(':')
      # The body may be delimited by "<...>" or "{...}".
      if tokenizer.TryConsume('<'):
        expanded_any_end_token = '>'
      else:
        tokenizer.Consume('{')
        expanded_any_end_token = '}'
      expanded_any_sub_message = _BuildMessageFromTypeName(packed_type_name,
                                                           self.descriptor_pool)
      if not expanded_any_sub_message:
        raise ParseError('Type %s not found in descriptor pool' %
                         packed_type_name)
      # Parse the body into the packed type until the closing delimiter.
      while not tokenizer.TryConsume(expanded_any_end_token):
        if tokenizer.AtEnd():
          raise tokenizer.ParseErrorPreviousToken('Expected "%s".' %
                                                  (expanded_any_end_token,))
        self._MergeField(tokenizer, expanded_any_sub_message)
      message.Pack(expanded_any_sub_message,
                   type_url_prefix=type_url_prefix)
      return

    if tokenizer.TryConsume('['):
      # Extension: a dotted name inside square brackets.
      name = [tokenizer.ConsumeIdentifier()]
      while tokenizer.TryConsume('.'):
        name.append(tokenizer.ConsumeIdentifier())
      name = '.'.join(name)

      if not message_descriptor.is_extendable:
        raise tokenizer.ParseErrorPreviousToken(
            'Message type "%s" does not have extensions.' %
            message_descriptor.full_name)
      # pylint: disable=protected-access
      field = message.Extensions._FindExtensionByName(name)
      # pylint: enable=protected-access
      if not field:
        if self.allow_unknown_extension:
          # Leave field unset so the contents are skipped further down.
          field = None
        else:
          raise tokenizer.ParseErrorPreviousToken(
              'Extension "%s" not registered. '
              'Did you import the _pb2 module which defines it? '
              'If you are trying to place the extension in the MessageSet '
              'field of another message that is in an Any or MessageSet field, '
              'that message\'s _pb2 module must be imported as well' % name)
      elif message_descriptor != field.containing_type:
        raise tokenizer.ParseErrorPreviousToken(
            'Extension "%s" does not extend message type "%s".' %
            (name, message_descriptor.full_name))

      tokenizer.Consume(']')

    else:
      name = tokenizer.ConsumeIdentifierOrNumber()
      if self.allow_field_number and name.isdigit():
        # Field given by number: look among regular fields first, then among
        # extensions if the message type is extendable.
        number = ParseInteger(name, True, True)
        field = message_descriptor.fields_by_number.get(number, None)
        if not field and message_descriptor.is_extendable:
          field = message.Extensions._FindExtensionByNumber(number)
      else:
        field = message_descriptor.fields_by_name.get(name, None)

        # Group names are expected to be capitalized as they appear in the
        # .proto file, which actually matches their type names, not their field
        # names.
        if not field:
          field = message_descriptor.fields_by_name.get(name.lower(), None)
          if field and field.type != descriptor.FieldDescriptor.TYPE_GROUP:
            field = None

        # A group matched by its (lowercase) field name is rejected: only the
        # message type name is accepted for groups.
        if (field and field.type == descriptor.FieldDescriptor.TYPE_GROUP and
            field.message_type.name != name):
          field = None

      if not field and not self.allow_unknown_field:
        raise tokenizer.ParseErrorPreviousToken(
            'Message type "%s" has no field named "%s".' %
            (message_descriptor.full_name, name))

    if field:
      if not self._allow_multiple_scalars and field.containing_oneof:
        # Check if there's a different field set in this oneof.
        # Note that we ignore the case if the same field was set before, and we
        # apply _allow_multiple_scalars to non-scalar fields as well.
        which_oneof = message.WhichOneof(field.containing_oneof.name)
        if which_oneof is not None and which_oneof != field.name:
          raise tokenizer.ParseErrorPreviousToken(
              'Field "%s" is specified along with field "%s", another member '
              'of oneof "%s" for message type "%s".' %
              (field.name, which_oneof, field.containing_oneof.name,
               message_descriptor.full_name))

      # The ':' separator is optional before a message value and mandatory
      # before a scalar value.
      if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
        tokenizer.TryConsume(':')
        merger = self._MergeMessageField
      else:
        tokenizer.Consume(':')
        merger = self._MergeScalarField

      if (field.label == descriptor.FieldDescriptor.LABEL_REPEATED and
          tokenizer.TryConsume('[')):
        # Short repeated format, e.g. "foo: [1, 2, 3]"
        # An immediately following ']' means an empty list.
        if not tokenizer.TryConsume(']'):
          while True:
            merger(tokenizer, message, field)
            if tokenizer.TryConsume(']'):
              break
            tokenizer.Consume(',')

      else:
        merger(tokenizer, message, field)

    else:  # Proto field is unknown.
      # Only reachable when one of the allow_unknown_* options let the lookup
      # above fall through without raising.
      assert (self.allow_unknown_extension or self.allow_unknown_field)
      _SkipFieldContents(tokenizer)

    # For historical reasons, fields may optionally be separated by commas or
    # semicolons.
    if not tokenizer.TryConsume(','):
      tokenizer.TryConsume(';')
939
940  def _ConsumeAnyTypeUrl(self, tokenizer):
941    """Consumes a google.protobuf.Any type URL and returns the type name."""
942    # Consume "type.googleapis.com/".
943    prefix = [tokenizer.ConsumeIdentifier()]
944    tokenizer.Consume('.')
945    prefix.append(tokenizer.ConsumeIdentifier())
946    tokenizer.Consume('.')
947    prefix.append(tokenizer.ConsumeIdentifier())
948    tokenizer.Consume('/')
949    # Consume the fully-qualified type name.
950    name = [tokenizer.ConsumeIdentifier()]
951    while tokenizer.TryConsume('.'):
952      name.append(tokenizer.ConsumeIdentifier())
953    return '.'.join(prefix), '.'.join(name)
954
  def _MergeMessageField(self, tokenizer, message, field):
    """Merges a single message-typed field into a message.

    Consumes the opening delimiter ("<" or "{"), picks or creates the
    sub-message that receives the data, merges the enclosed fields into it
    up to the matching closing delimiter and, for map fields, copies the
    parsed entry into the map.

    Args:
      tokenizer: A tokenizer to parse the field value.
      message: The message of which field is a member.
      field: The descriptor of the field to be merged.

    Raises:
      ParseError: In case of text parsing problems.
    """
    is_map_entry = _IsMapEntry(field)

    # The body may be delimited by "<...>" or "{...}".
    if tokenizer.TryConsume('<'):
      end_token = '>'
    else:
      tokenizer.Consume('{')
      end_token = '}'

    if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
      if field.is_extension:
        sub_message = message.Extensions[field].add()
      elif is_map_entry:
        # Parse into a standalone entry message; its key/value are copied
        # into the map once the body has been read (see the end of the method).
        sub_message = getattr(message, field.name).GetEntryClass()()
      else:
        sub_message = getattr(message, field.name).add()
    else:
      if field.is_extension:
        if (not self._allow_multiple_scalars and
            message.HasExtension(field)):
          raise tokenizer.ParseErrorPreviousToken(
              'Message type "%s" should not have multiple "%s" extensions.' %
              (message.DESCRIPTOR.full_name, field.full_name))
        sub_message = message.Extensions[field]
      else:
        # Also apply _allow_multiple_scalars to message field.
        # TODO(jieluo): Change to _allow_singular_overwrites.
        if (not self._allow_multiple_scalars and
            message.HasField(field.name)):
          raise tokenizer.ParseErrorPreviousToken(
              'Message type "%s" should not have multiple "%s" fields.' %
              (message.DESCRIPTOR.full_name, field.name))
        sub_message = getattr(message, field.name)
      # Called for singular fields before the body is parsed, so it also runs
      # when the body is empty. NOTE(review): presumably marks the field as
      # present on `message` - confirm.
      sub_message.SetInParent()

    # Merge the enclosed fields until the closing delimiter is consumed.
    while not tokenizer.TryConsume(end_token):
      if tokenizer.AtEnd():
        raise tokenizer.ParseErrorPreviousToken('Expected "%s".' % (end_token,))
      self._MergeField(tokenizer, sub_message)

    if is_map_entry:
      value_cpptype = field.message_type.fields_by_name['value'].cpp_type
      if value_cpptype == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
        # Message values are merged into the entry obtained by indexing the
        # map with the key.
        value = getattr(message, field.name)[sub_message.key]
        value.MergeFrom(sub_message.value)
      else:
        # Scalar values are assigned directly.
        getattr(message, field.name)[sub_message.key] = sub_message.value
1012
1013  @staticmethod
1014  def _IsProto3Syntax(message):
1015    message_descriptor = message.DESCRIPTOR
1016    return (hasattr(message_descriptor, 'syntax') and
1017            message_descriptor.syntax == 'proto3')
1018
1019  def _MergeScalarField(self, tokenizer, message, field):
1020    """Merges a single scalar field into a message.
1021
1022    Args:
1023      tokenizer: A tokenizer to parse the field value.
1024      message: A protocol message to record the data.
1025      field: The descriptor of the field to be merged.
1026
1027    Raises:
1028      ParseError: In case of text parsing problems.
1029      RuntimeError: On runtime errors.
1030    """
1031    _ = self.allow_unknown_extension
1032    value = None
1033
1034    if field.type in (descriptor.FieldDescriptor.TYPE_INT32,
1035                      descriptor.FieldDescriptor.TYPE_SINT32,
1036                      descriptor.FieldDescriptor.TYPE_SFIXED32):
1037      value = _ConsumeInt32(tokenizer)
1038    elif field.type in (descriptor.FieldDescriptor.TYPE_INT64,
1039                        descriptor.FieldDescriptor.TYPE_SINT64,
1040                        descriptor.FieldDescriptor.TYPE_SFIXED64):
1041      value = _ConsumeInt64(tokenizer)
1042    elif field.type in (descriptor.FieldDescriptor.TYPE_UINT32,
1043                        descriptor.FieldDescriptor.TYPE_FIXED32):
1044      value = _ConsumeUint32(tokenizer)
1045    elif field.type in (descriptor.FieldDescriptor.TYPE_UINT64,
1046                        descriptor.FieldDescriptor.TYPE_FIXED64):
1047      value = _ConsumeUint64(tokenizer)
1048    elif field.type in (descriptor.FieldDescriptor.TYPE_FLOAT,
1049                        descriptor.FieldDescriptor.TYPE_DOUBLE):
1050      value = tokenizer.ConsumeFloat()
1051    elif field.type == descriptor.FieldDescriptor.TYPE_BOOL:
1052      value = tokenizer.ConsumeBool()
1053    elif field.type == descriptor.FieldDescriptor.TYPE_STRING:
1054      value = tokenizer.ConsumeString()
1055    elif field.type == descriptor.FieldDescriptor.TYPE_BYTES:
1056      value = tokenizer.ConsumeByteString()
1057    elif field.type == descriptor.FieldDescriptor.TYPE_ENUM:
1058      value = tokenizer.ConsumeEnum(field)
1059    else:
1060      raise RuntimeError('Unknown field type %d' % field.type)
1061
1062    if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
1063      if field.is_extension:
1064        message.Extensions[field].append(value)
1065      else:
1066        getattr(message, field.name).append(value)
1067    else:
1068      if field.is_extension:
1069        if (not self._allow_multiple_scalars and
1070            not self._IsProto3Syntax(message) and
1071            message.HasExtension(field)):
1072          raise tokenizer.ParseErrorPreviousToken(
1073              'Message type "%s" should not have multiple "%s" extensions.' %
1074              (message.DESCRIPTOR.full_name, field.full_name))
1075        else:
1076          message.Extensions[field] = value
1077      else:
1078        duplicate_error = False
1079        if not self._allow_multiple_scalars:
1080          if self._IsProto3Syntax(message):
1081            # Proto3 doesn't represent presence so we try best effort to check
1082            # multiple scalars by compare to default values.
1083            duplicate_error = bool(getattr(message, field.name))
1084          else:
1085            duplicate_error = message.HasField(field.name)
1086
1087        if duplicate_error:
1088          raise tokenizer.ParseErrorPreviousToken(
1089              'Message type "%s" should not have multiple "%s" fields.' %
1090              (message.DESCRIPTOR.full_name, field.name))
1091        else:
1092          setattr(message, field.name, value)
1093
1094
def _SkipFieldContents(tokenizer):
  """Skips the value or message body that follows a field name.

  The kind of content is guessed from the next tokens: a ":" that is not
  followed by "{" or "<" introduces a plain value; anything else (no ":",
  or a ":" followed by "{" or "<") is treated as a message body.

  Args:
    tokenizer: A tokenizer positioned right after the field name.
  """
  if not tokenizer.TryConsume(':'):
    # No ":" at all: this has to be a message, or the input is ill-formed.
    _SkipFieldMessage(tokenizer)
  elif tokenizer.LookingAt('{') or tokenizer.LookingAt('<'):
    # ":" followed by an opening delimiter: still a message.
    _SkipFieldMessage(tokenizer)
  else:
    _SkipFieldValue(tokenizer)
1112
1113
def _SkipField(tokenizer):
  """Skips one whole field: its name, then its value or message body.

  Args:
    tokenizer: A tokenizer positioned at the start of the field name.
  """
  if not tokenizer.TryConsume('['):
    # Regular field, named by identifier or number.
    tokenizer.ConsumeIdentifierOrNumber()
  else:
    # Extension: dotted name enclosed in square brackets.
    tokenizer.ConsumeIdentifier()
    while tokenizer.TryConsume('.'):
      tokenizer.ConsumeIdentifier()
    tokenizer.Consume(']')

  _SkipFieldContents(tokenizer)

  # For historical reasons a field may be followed by one optional "," or,
  # failing that, one optional ";".
  if not tokenizer.TryConsume(','):
    tokenizer.TryConsume(';')
1135
1136
def _SkipFieldMessage(tokenizer):
  """Skips a delimited message body, including both delimiters.

  Args:
    tokenizer: A tokenizer positioned at the opening "<" or "{".
  """
  opened_with_angle = tokenizer.TryConsume('<')
  if not opened_with_angle:
    tokenizer.Consume('{')

  # Skip fields until either closing delimiter shows up; the Consume below
  # then insists on the one that matches the opening delimiter.
  while not (tokenizer.LookingAt('>') or tokenizer.LookingAt('}')):
    _SkipField(tokenizer)

  tokenizer.Consume('>' if opened_with_angle else '}')
1154
1155
1156def _SkipFieldValue(tokenizer):
1157  """Skips over a field value.
1158
1159  Args:
1160    tokenizer: A tokenizer to parse the field name and values.
1161
1162  Raises:
1163    ParseError: In case an invalid field value is found.
1164  """
1165  # String/bytes tokens can come in multiple adjacent string literals.
1166  # If we can consume one, consume as many as we can.
1167  if tokenizer.TryConsumeByteString():
1168    while tokenizer.TryConsumeByteString():
1169      pass
1170    return
1171
1172  if (not tokenizer.TryConsumeIdentifier() and
1173      not _TryConsumeInt64(tokenizer) and not _TryConsumeUint64(tokenizer) and
1174      not tokenizer.TryConsumeFloat()):
1175    raise ParseError('Invalid field value: ' + tokenizer.token)
1176
1177
1178class Tokenizer(object):
1179  """Protocol buffer text representation tokenizer.
1180
1181  This class handles the lower level string parsing by splitting it into
1182  meaningful tokens.
1183
1184  It was directly ported from the Java protocol buffer API.
1185  """
1186
1187  _WHITESPACE = re.compile(r'\s+')
1188  _COMMENT = re.compile(r'(\s*#.*$)', re.MULTILINE)
1189  _WHITESPACE_OR_COMMENT = re.compile(r'(\s|(#.*$))+', re.MULTILINE)
1190  _TOKEN = re.compile('|'.join([
1191      r'[a-zA-Z_][0-9a-zA-Z_+-]*',  # an identifier
1192      r'([0-9+-]|(\.[0-9]))[0-9a-zA-Z_.+-]*',  # a number
1193  ] + [  # quoted str for each quote mark
1194      # Avoid backtracking! https://stackoverflow.com/a/844267
1195      r'{qt}[^{qt}\n\\]*((\\.)+[^{qt}\n\\]*)*({qt}|\\?$)'.format(qt=mark)
1196      for mark in _QUOTES
1197  ]))
1198
1199  _IDENTIFIER = re.compile(r'[^\d\W]\w*')
1200  _IDENTIFIER_OR_NUMBER = re.compile(r'\w+')
1201
1202  def __init__(self, lines, skip_comments=True):
1203    self._position = 0
1204    self._line = -1
1205    self._column = 0
1206    self._token_start = None
1207    self.token = ''
1208    self._lines = iter(lines)
1209    self._current_line = ''
1210    self._previous_line = 0
1211    self._previous_column = 0
1212    self._more_lines = True
1213    self._skip_comments = skip_comments
1214    self._whitespace_pattern = (skip_comments and self._WHITESPACE_OR_COMMENT
1215                                or self._WHITESPACE)
1216    self._SkipWhitespace()
1217    self.NextToken()
1218
  def LookingAt(self, token):
    """Checks whether the current token equals `token` without consuming it.

    Args:
      token: Text to compare against the current token.

    Returns:
      True iff the current token equals `token`.
    """
    return self.token == token
1221
  def AtEnd(self):
    """Checks whether the end of the text was reached.

    NextToken sets the current token to '' once there are no more lines, so
    an empty token is what signals the end.

    Returns:
      True iff the end was reached.
    """
    return not self.token
1229
1230  def _PopLine(self):
1231    while len(self._current_line) <= self._column:
1232      try:
1233        self._current_line = next(self._lines)
1234      except StopIteration:
1235        self._current_line = ''
1236        self._more_lines = False
1237        return
1238      else:
1239        self._line += 1
1240        self._column = 0
1241
1242  def _SkipWhitespace(self):
1243    while True:
1244      self._PopLine()
1245      match = self._whitespace_pattern.match(self._current_line, self._column)
1246      if not match:
1247        break
1248      length = len(match.group(0))
1249      self._column += length
1250
1251  def TryConsume(self, token):
1252    """Tries to consume a given piece of text.
1253
1254    Args:
1255      token: Text to consume.
1256
1257    Returns:
1258      True iff the text was consumed.
1259    """
1260    if self.token == token:
1261      self.NextToken()
1262      return True
1263    return False
1264
1265  def Consume(self, token):
1266    """Consumes a piece of text.
1267
1268    Args:
1269      token: Text to consume.
1270
1271    Raises:
1272      ParseError: If the text couldn't be consumed.
1273    """
1274    if not self.TryConsume(token):
1275      raise self.ParseError('Expected "%s".' % token)
1276
1277  def ConsumeComment(self):
1278    result = self.token
1279    if not self._COMMENT.match(result):
1280      raise self.ParseError('Expected comment.')
1281    self.NextToken()
1282    return result
1283
1284  def ConsumeCommentOrTrailingComment(self):
1285    """Consumes a comment, returns a 2-tuple (trailing bool, comment str)."""
1286
1287    # Tokenizer initializes _previous_line and _previous_column to 0. As the
1288    # tokenizer starts, it looks like there is a previous token on the line.
1289    just_started = self._line == 0 and self._column == 0
1290
1291    before_parsing = self._previous_line
1292    comment = self.ConsumeComment()
1293
1294    # A trailing comment is a comment on the same line than the previous token.
1295    trailing = (self._previous_line == before_parsing
1296                and not just_started)
1297
1298    return trailing, comment
1299
1300  def TryConsumeIdentifier(self):
1301    try:
1302      self.ConsumeIdentifier()
1303      return True
1304    except ParseError:
1305      return False
1306
1307  def ConsumeIdentifier(self):
1308    """Consumes protocol message field identifier.
1309
1310    Returns:
1311      Identifier string.
1312
1313    Raises:
1314      ParseError: If an identifier couldn't be consumed.
1315    """
1316    result = self.token
1317    if not self._IDENTIFIER.match(result):
1318      raise self.ParseError('Expected identifier.')
1319    self.NextToken()
1320    return result
1321
1322  def TryConsumeIdentifierOrNumber(self):
1323    try:
1324      self.ConsumeIdentifierOrNumber()
1325      return True
1326    except ParseError:
1327      return False
1328
1329  def ConsumeIdentifierOrNumber(self):
1330    """Consumes protocol message field identifier.
1331
1332    Returns:
1333      Identifier string.
1334
1335    Raises:
1336      ParseError: If an identifier couldn't be consumed.
1337    """
1338    result = self.token
1339    if not self._IDENTIFIER_OR_NUMBER.match(result):
1340      raise self.ParseError('Expected identifier or number, got %s.' % result)
1341    self.NextToken()
1342    return result
1343
1344  def TryConsumeInteger(self):
1345    try:
1346      # Note: is_long only affects value type, not whether an error is raised.
1347      self.ConsumeInteger()
1348      return True
1349    except ParseError:
1350      return False
1351
1352  def ConsumeInteger(self, is_long=False):
1353    """Consumes an integer number.
1354
1355    Args:
1356      is_long: True if the value should be returned as a long integer.
1357    Returns:
1358      The integer parsed.
1359
1360    Raises:
1361      ParseError: If an integer couldn't be consumed.
1362    """
1363    try:
1364      result = _ParseAbstractInteger(self.token, is_long=is_long)
1365    except ValueError as e:
1366      raise self.ParseError(str(e))
1367    self.NextToken()
1368    return result
1369
1370  def TryConsumeFloat(self):
1371    try:
1372      self.ConsumeFloat()
1373      return True
1374    except ParseError:
1375      return False
1376
1377  def ConsumeFloat(self):
1378    """Consumes an floating point number.
1379
1380    Returns:
1381      The number parsed.
1382
1383    Raises:
1384      ParseError: If a floating point number couldn't be consumed.
1385    """
1386    try:
1387      result = ParseFloat(self.token)
1388    except ValueError as e:
1389      raise self.ParseError(str(e))
1390    self.NextToken()
1391    return result
1392
1393  def ConsumeBool(self):
1394    """Consumes a boolean value.
1395
1396    Returns:
1397      The bool parsed.
1398
1399    Raises:
1400      ParseError: If a boolean value couldn't be consumed.
1401    """
1402    try:
1403      result = ParseBool(self.token)
1404    except ValueError as e:
1405      raise self.ParseError(str(e))
1406    self.NextToken()
1407    return result
1408
1409  def TryConsumeByteString(self):
1410    try:
1411      self.ConsumeByteString()
1412      return True
1413    except ParseError:
1414      return False
1415
1416  def ConsumeString(self):
1417    """Consumes a string value.
1418
1419    Returns:
1420      The string parsed.
1421
1422    Raises:
1423      ParseError: If a string value couldn't be consumed.
1424    """
1425    the_bytes = self.ConsumeByteString()
1426    try:
1427      return six.text_type(the_bytes, 'utf-8')
1428    except UnicodeDecodeError as e:
1429      raise self._StringParseError(e)
1430
1431  def ConsumeByteString(self):
1432    """Consumes a byte array value.
1433
1434    Returns:
1435      The array parsed (as a string).
1436
1437    Raises:
1438      ParseError: If a byte array value couldn't be consumed.
1439    """
1440    the_list = [self._ConsumeSingleByteString()]
1441    while self.token and self.token[0] in _QUOTES:
1442      the_list.append(self._ConsumeSingleByteString())
1443    return b''.join(the_list)
1444
1445  def _ConsumeSingleByteString(self):
1446    """Consume one token of a string literal.
1447
1448    String literals (whether bytes or text) can come in multiple adjacent
1449    tokens which are automatically concatenated, like in C or Python.  This
1450    method only consumes one token.
1451
1452    Returns:
1453      The token parsed.
1454    Raises:
1455      ParseError: When the wrong format data is found.
1456    """
1457    text = self.token
1458    if len(text) < 1 or text[0] not in _QUOTES:
1459      raise self.ParseError('Expected string but found: %r' % (text,))
1460
1461    if len(text) < 2 or text[-1] != text[0]:
1462      raise self.ParseError('String missing ending quote: %r' % (text,))
1463
1464    try:
1465      result = text_encoding.CUnescape(text[1:-1])
1466    except ValueError as e:
1467      raise self.ParseError(str(e))
1468    self.NextToken()
1469    return result
1470
1471  def ConsumeEnum(self, field):
1472    try:
1473      result = ParseEnum(field, self.token)
1474    except ValueError as e:
1475      raise self.ParseError(str(e))
1476    self.NextToken()
1477    return result
1478
1479  def ParseErrorPreviousToken(self, message):
1480    """Creates and *returns* a ParseError for the previously read token.
1481
1482    Args:
1483      message: A message to set for the exception.
1484
1485    Returns:
1486      A ParseError instance.
1487    """
1488    return ParseError(message, self._previous_line + 1,
1489                      self._previous_column + 1)
1490
1491  def ParseError(self, message):
1492    """Creates and *returns* a ParseError for the current token."""
1493    return ParseError('\'' + self._current_line + '\': ' + message,
1494                      self._line + 1, self._column + 1)
1495
  def _StringParseError(self, e):
    """Builds and *returns* a ParseError for a string that failed to parse.

    Args:
      e: The exception describing the failure; its str() goes in the message.

    Returns:
      A ParseError instance located at the current token.
    """
    return self.ParseError('Couldn\'t parse string: ' + str(e))
1498
1499  def NextToken(self):
1500    """Reads the next meaningful token."""
1501    self._previous_line = self._line
1502    self._previous_column = self._column
1503
1504    self._column += len(self.token)
1505    self._SkipWhitespace()
1506
1507    if not self._more_lines:
1508      self.token = ''
1509      return
1510
1511    match = self._TOKEN.match(self._current_line, self._column)
1512    if not match and not self._skip_comments:
1513      match = self._COMMENT.match(self._current_line, self._column)
1514    if match:
1515      token = match.group(0)
1516      self.token = token
1517    else:
1518      self.token = self._current_line[self._column]
1519
# Alias that keeps the underscore-prefixed name available, so it can still be
# accessed by current visibility violators.
# TODO(dbarnett): Migrate violators to textformat_tokenizer.
_Tokenizer = Tokenizer  # pylint: disable=invalid-name
1523
1524
def _ConsumeInt32(tokenizer):
  """Reads one signed 32-bit integer token and advances the tokenizer.

  Args:
    tokenizer: A tokenizer used to parse the number.

  Returns:
    The integer parsed.

  Raises:
    ParseError: If a signed 32bit integer couldn't be consumed.
  """
  return _ConsumeInteger(tokenizer, is_long=False, is_signed=True)
1538
1539
def _ConsumeUint32(tokenizer):
  """Reads one unsigned 32-bit integer token and advances the tokenizer.

  Args:
    tokenizer: A tokenizer used to parse the number.

  Returns:
    The integer parsed.

  Raises:
    ParseError: If an unsigned 32bit integer couldn't be consumed.
  """
  return _ConsumeInteger(tokenizer, is_long=False, is_signed=False)
1553
1554
def _TryConsumeInt64(tokenizer):
  """Consumes a signed 64-bit integer if the current token is one.

  Args:
    tokenizer: A tokenizer used to parse the number.

  Returns:
    True iff an integer was consumed.
  """
  try:
    _ConsumeInt64(tokenizer)
  except ParseError:
    return False
  else:
    return True
1561
1562
def _ConsumeInt64(tokenizer):
  """Consumes a signed 64bit integer number from tokenizer.

  Args:
    tokenizer: A tokenizer used to parse the number.

  Returns:
    The integer parsed.

  Raises:
    ParseError: If a signed 64bit integer couldn't be consumed.
  """
  return _ConsumeInteger(tokenizer, is_signed=True, is_long=True)
1576
1577
def _TryConsumeUint64(tokenizer):
  """Consumes an unsigned 64-bit integer if the current token is one.

  Args:
    tokenizer: A tokenizer used to parse the number.

  Returns:
    True iff an integer was consumed.
  """
  try:
    _ConsumeUint64(tokenizer)
  except ParseError:
    return False
  else:
    return True
1584
1585
def _ConsumeUint64(tokenizer):
  """Reads one unsigned 64-bit integer token and advances the tokenizer.

  Args:
    tokenizer: A tokenizer used to parse the number.

  Returns:
    The integer parsed.

  Raises:
    ParseError: If an unsigned 64bit integer couldn't be consumed.
  """
  return _ConsumeInteger(tokenizer, is_long=True, is_signed=False)
1599
1600
def _TryConsumeInteger(tokenizer, is_signed=False, is_long=False):
  """Consumes an integer of the given kind if the current token is one.

  Args:
    tokenizer: A tokenizer used to parse the number.
    is_signed: True if a signed integer must be parsed.
    is_long: True if a long integer must be parsed.

  Returns:
    True iff an integer was consumed.
  """
  try:
    _ConsumeInteger(tokenizer, is_signed=is_signed, is_long=is_long)
  except ParseError:
    return False
  else:
    return True
1607
1608
def _ConsumeInteger(tokenizer, is_signed=False, is_long=False):
  """Parses the current token as an integer and advances the tokenizer.

  Args:
    tokenizer: A tokenizer used to parse the number.
    is_signed: True if a signed integer must be parsed.
    is_long: True if a long integer must be parsed.

  Returns:
    The integer parsed.

  Raises:
    ParseError: If an integer with given characteristics couldn't be consumed.
  """
  try:
    value = ParseInteger(tokenizer.token, is_signed=is_signed, is_long=is_long)
  except ValueError as error:
    # Re-report through the tokenizer so the error carries a position.
    raise tokenizer.ParseError(str(error))
  else:
    tokenizer.NextToken()
    return value
1629
1630
def ParseInteger(text, is_signed=False, is_long=False):
  """Parses an integer and checks it against the requested width and sign.

  Args:
    text: The text to parse.
    is_signed: True if a signed integer must be parsed.
    is_long: True if a long integer must be parsed.

  Returns:
    The integer value.

  Raises:
    ValueError: Thrown Iff the text is not a valid integer.
  """
  # Parsing errors are left for the caller to handle.
  value = _ParseAbstractInteger(text, is_long=is_long)

  # The checker table is indexed by 2 * is_long + is_signed. Whatever the
  # checker raises for an unacceptable value is left for the caller as well.
  checker_index = 2 * int(is_long) + int(is_signed)
  _INTEGER_CHECKERS[checker_index].CheckValue(value)
  return value
1652
1653
1654def _ParseAbstractInteger(text, is_long=False):
1655  """Parses an integer without checking size/signedness.
1656
1657  Args:
1658    text: The text to parse.
1659    is_long: True if the value should be returned as a long integer.
1660
1661  Returns:
1662    The integer value.
1663
1664  Raises:
1665    ValueError: Thrown Iff the text is not a valid integer.
1666  """
1667  # Do the actual parsing. Exception handling is propagated to caller.
1668  orig_text = text
1669  c_octal_match = re.match(r'(-?)0(\d+)$', text)
1670  if c_octal_match:
1671    # Python 3 no longer supports 0755 octal syntax without the 'o', so
1672    # we always use the '0o' prefix for multi-digit numbers starting with 0.
1673    text = c_octal_match.group(1) + '0o' + c_octal_match.group(2)
1674  try:
1675    # We force 32-bit values to int and 64-bit values to long to make
1676    # alternate implementations where the distinction is more significant
1677    # (e.g. the C++ implementation) simpler.
1678    if is_long:
1679      return long(text, 0)
1680    else:
1681      return int(text, 0)
1682  except ValueError:
1683    raise ValueError('Couldn\'t parse integer: %s' % orig_text)
1684
1685
def ParseFloat(text):
  """Parse a floating point number.

  Python float syntax is tried first, then the alternative infinity and NaN
  spellings, and finally the C-style form with trailing 'f' (e.g. '1.0f').

  Args:
    text: Text to parse.

  Returns:
    The number parsed.

  Raises:
    ValueError: If a floating point number couldn't be parsed.
  """
  try:
    # Assume Python compatible syntax.
    return float(text)
  except ValueError:
    # Check alternative spellings.
    if _FLOAT_INFINITY.match(text):
      return float('-inf' if text[0] == '-' else 'inf')
    if _FLOAT_NAN.match(text):
      return float('nan')
    # assume '1.0f' format
    try:
      return float(text.rstrip('f'))
    except ValueError:
      raise ValueError('Couldn\'t parse float: %s' % text)
1716
1717
def ParseBool(text):
  """Parse a boolean value.

  Accepted spellings are 'true', 't', '1', 'True' and 'false', 'f', '0',
  'False'.

  Args:
    text: Text to parse.

  Returns:
    Boolean values parsed

  Raises:
    ValueError: If text is not a valid boolean.
  """
  if text in ('true', 't', '1', 'True'):
    return True
  if text in ('false', 'f', '0', 'False'):
    return False
  raise ValueError('Expected "true" or "false".')
1736
1737
def ParseEnum(field, value):
  """Parse an enum value.

  The value can be specified by a number (the enum value), or by
  a string literal (the enum name).

  Args:
    field: Enum field descriptor.
    value: String value.

  Returns:
    Enum value number.

  Raises:
    ValueError: If the enum value could not be parsed.
  """
  enum_type = field.enum_type
  try:
    number = int(value, 0)
  except ValueError:
    # Not a number, so it has to be the name of a declared value.
    by_name = enum_type.values_by_name.get(value)
    if by_name is None:
      raise ValueError('Enum type "%s" has no value named %s.' %
                       (enum_type.full_name, value))
    return by_name.number

  # Numeric value. The attribute is checked for compatibility.
  if hasattr(field.file, 'syntax') and field.file.syntax == 'proto3':
    # Proto3 accept numeric unknown enums.
    return number

  by_number = enum_type.values_by_number.get(number)
  if by_number is None:
    raise ValueError('Enum type "%s" has no value with number %d.' %
                     (enum_type.full_name, number))
  return by_number.number
1775