1import re
2import sys
3import unittest
4
5sys.path.insert(0, '..')
6from pycparser.c_lexer import CLexer
7
8
def token_list(clex):
    """ Drain the lexer and return all of its tokens as a list.

        clex.token is called repeatedly; collection stops at the first
        call that returns None, which is not included in the result.
    """
    fetch = clex.token
    tokens = []
    for tok in iter(fetch, None):
        tokens.append(tok)
    return tokens
11
12
def token_types(clex):
    """ Drain the lexer and return the `type` attribute of every
        token it produced, in order.
    """
    kinds = []
    for tok in token_list(clex):
        kinds.append(tok.type)
    return kinds
15
16
class TestCLexerNoErrors(unittest.TestCase):
    """ Test lexing of strings that are not supposed to cause
        errors. Therefore, the error_func passed to the lexer
        fails the running test as soon as it is called.
    """
    def error_func(self, msg, line, column):
        """ Error callback handed to the lexer: any reported error
            fails the current test with the lexer's message.
        """
        self.fail(msg)

    def on_lbrace_func(self):
        """ '{' callback handed to the lexer; does nothing. """
        pass

    def on_rbrace_func(self):
        """ '}' callback handed to the lexer; does nothing. """
        pass

    def type_lookup_func(self, typ):
        """ Type-lookup callback handed to the lexer: a name counts as
            a type exactly when it starts with 'mytype'.
        """
        return typ.startswith('mytype')

    def setUp(self):
        """ Create and build a fresh lexer (optimize=False) for every
            test, wired to the callbacks defined on this class.
        """
        self.clex = CLexer(self.error_func, self.on_lbrace_func,
                           self.on_rbrace_func, self.type_lookup_func)
        self.clex.build(optimize=False)

    def assertTokensTypes(self, str, types):
        """ Feed `str` to the lexer and assert that the types of the
            tokens it yields equal the list `types`.
        """
        self.clex.input(str)
        self.assertEqual(token_types(self.clex), types)

    def test_trivial_tokens(self):
        self.assertTokensTypes('1', ['INT_CONST_DEC'])
        self.assertTokensTypes('-', ['MINUS'])
        self.assertTokensTypes('volatile', ['VOLATILE'])
        self.assertTokensTypes('...', ['ELLIPSIS'])
        self.assertTokensTypes('++', ['PLUSPLUS'])
        self.assertTokensTypes('case int', ['CASE', 'INT'])
        self.assertTokensTypes('caseint', ['ID'])
        self.assertTokensTypes('$dollar cent$', ['ID', 'ID'])
        self.assertTokensTypes(
            'i ^= 1;',
            ['ID', 'XOREQUAL', 'INT_CONST_DEC', 'SEMI'])

    def test_id_typeid(self):
        # type_lookup_func only accepts names starting with 'mytype'
        self.assertTokensTypes('myt', ['ID'])
        self.assertTokensTypes('mytype', ['TYPEID'])
        self.assertTokensTypes('mytype6 var', ['TYPEID', 'ID'])

    def test_integer_constants(self):
        self.assertTokensTypes('12', ['INT_CONST_DEC'])
        self.assertTokensTypes('12u', ['INT_CONST_DEC'])
        self.assertTokensTypes('12l', ['INT_CONST_DEC'])
        self.assertTokensTypes('199872Ul', ['INT_CONST_DEC'])
        self.assertTokensTypes('199872lU', ['INT_CONST_DEC'])
        self.assertTokensTypes('199872LL', ['INT_CONST_DEC'])
        self.assertTokensTypes('199872ull', ['INT_CONST_DEC'])
        self.assertTokensTypes('199872llu', ['INT_CONST_DEC'])
        self.assertTokensTypes('1009843200000uLL', ['INT_CONST_DEC'])
        self.assertTokensTypes('1009843200000LLu', ['INT_CONST_DEC'])

        self.assertTokensTypes('077', ['INT_CONST_OCT'])
        self.assertTokensTypes('0123456L', ['INT_CONST_OCT'])

        self.assertTokensTypes('0xf7', ['INT_CONST_HEX'])
        self.assertTokensTypes('0b110', ['INT_CONST_BIN'])
        self.assertTokensTypes('0x01202AAbbf7Ul', ['INT_CONST_HEX'])
        self.assertTokensTypes("'12'", ['INT_CONST_CHAR'])
        self.assertTokensTypes("'123'", ['INT_CONST_CHAR'])
        self.assertTokensTypes("'1AB4'", ['INT_CONST_CHAR'])
        self.assertTokensTypes(r"'1A\n4'", ['INT_CONST_CHAR'])

        # no 0 before x, so ID catches it
        self.assertTokensTypes('xf7', ['ID'])

        # - is MINUS, the rest a constant
        self.assertTokensTypes('-1', ['MINUS', 'INT_CONST_DEC'])

    def test_special_names(self):
        self.assertTokensTypes('sizeof offsetof', ['SIZEOF', 'OFFSETOF'])

    def test_floating_constants(self):
        self.assertTokensTypes('1.5f', ['FLOAT_CONST'])
        self.assertTokensTypes('01.5', ['FLOAT_CONST'])
        self.assertTokensTypes('.15L', ['FLOAT_CONST'])
        self.assertTokensTypes('0.', ['FLOAT_CONST'])

        # but just a period is a period
        self.assertTokensTypes('.', ['PERIOD'])

        self.assertTokensTypes('3.3e-3', ['FLOAT_CONST'])
        self.assertTokensTypes('.7e25L', ['FLOAT_CONST'])
        self.assertTokensTypes('6.e+125f', ['FLOAT_CONST'])
        self.assertTokensTypes('666e666', ['FLOAT_CONST'])
        self.assertTokensTypes('00666e+3', ['FLOAT_CONST'])

        # but this is a hex integer + 3
        self.assertTokensTypes(
            '0x0666e+3',
            ['INT_CONST_HEX', 'PLUS', 'INT_CONST_DEC'])

    def test_hexadecimal_floating_constants(self):
        self.assertTokensTypes('0xDE.488641p0', ['HEX_FLOAT_CONST'])
        self.assertTokensTypes('0x.488641p0', ['HEX_FLOAT_CONST'])
        self.assertTokensTypes('0X12.P0', ['HEX_FLOAT_CONST'])

    def test_char_constants(self):
        self.assertTokensTypes(r"""'x'""", ['CHAR_CONST'])
        self.assertTokensTypes(r"""L'x'""", ['WCHAR_CONST'])
        self.assertTokensTypes(r"""'\t'""", ['CHAR_CONST'])
        self.assertTokensTypes(r"""'\''""", ['CHAR_CONST'])
        self.assertTokensTypes(r"""'\?'""", ['CHAR_CONST'])
        self.assertTokensTypes(r"""'\0'""", ['CHAR_CONST'])
        self.assertTokensTypes(r"""'\012'""", ['CHAR_CONST'])
        self.assertTokensTypes(r"""'\x2f'""", ['CHAR_CONST'])
        self.assertTokensTypes(r"""'\x2f12'""", ['CHAR_CONST'])
        self.assertTokensTypes(r"""L'\xaf'""", ['WCHAR_CONST'])

    def test_on_rbrace_lbrace(self):
        # Uses a dedicated lexer whose brace callbacks record every call,
        # so the order of '{' / '}' notifications can be checked.
        braces = []

        def on_lbrace():
            braces.append('{')

        def on_rbrace():
            braces.append('}')

        clex = CLexer(self.error_func, on_lbrace, on_rbrace,
                      self.type_lookup_func)
        clex.build(optimize=False)
        clex.input('hello { there } } and again }}{')
        # Only the callback side effects matter; the tokens are discarded.
        token_list(clex)
        self.assertEqual(braces, ['{', '}', '}', '}', '}', '{'])

    def test_string_literal(self):
        self.assertTokensTypes('"a string"', ['STRING_LITERAL'])
        self.assertTokensTypes('L"ing"', ['WSTRING_LITERAL'])
        self.assertTokensTypes(
            '"i am a string too \t"',
            ['STRING_LITERAL'])
        self.assertTokensTypes(
            r'''"esc\ape \"\'\? \0234 chars \rule"''',
            ['STRING_LITERAL'])
        self.assertTokensTypes(
            r'''"hello 'joe' wanna give it a \"go\"?"''',
            ['STRING_LITERAL'])
        # NOTE(review): this literal is not a raw string, so Python itself
        # turns every \123 into 'S' before the lexer sees it; confirm
        # whether a raw string (octal escapes reaching the lexer) was meant.
        self.assertTokensTypes(
            '"\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123"',
            ['STRING_LITERAL'])
        # Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
        # directives with Windows paths as filenames (..\..\dir\file)
        self.assertTokensTypes(
            r'"\x"',
            ['STRING_LITERAL'])
        self.assertTokensTypes(
            r'"\a\b\c\d\e\f\g\h\i\j\k\l\m\n\o\p\q\r\s\t\u\v\w\x\y\z\A\B\C\D\E\F\G\H\I\J\K\L\M\N\O\P\Q\R\S\T\U\V\W\X\Y\Z"',
            ['STRING_LITERAL'])
        self.assertTokensTypes(
            r'"C:\x\fa\x1e\xited"',
            ['STRING_LITERAL'])
        # The lexer is permissive and allows decimal escapes (not just octal).
        # Raw strings are required here: '\9' is not a valid Python escape,
        # and a plain literal containing it triggers an invalid-escape
        # warning. The resulting string value is the same either way.
        self.assertTokensTypes(
            r'"jx\9"',
            ['STRING_LITERAL'])
        self.assertTokensTypes(
            r'"fo\9999999"',
            ['STRING_LITERAL'])

    def test_mess(self):
        # Runs of punctuation with no separating whitespace
        self.assertTokensTypes(
            r'[{}]()',
            ['LBRACKET',
             'LBRACE', 'RBRACE',
             'RBRACKET',
             'LPAREN', 'RPAREN'])

        self.assertTokensTypes(
            r'()||!C&~Z?J',
            ['LPAREN', 'RPAREN',
             'LOR',
             'LNOT', 'ID',
             'AND',
             'NOT', 'ID',
             'CONDOP', 'ID'])

        self.assertTokensTypes(
            r'+-*/%|||&&&^><>=<===!=',
            ['PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'MOD',
             'LOR', 'OR',
             'LAND', 'AND',
             'XOR',
             'GT', 'LT', 'GE', 'LE', 'EQ', 'NE'])

        self.assertTokensTypes(
            r'++--->?.,;:',
            ['PLUSPLUS', 'MINUSMINUS',
             'ARROW', 'CONDOP',
             'PERIOD', 'COMMA', 'SEMI', 'COLON'])

    def test_exprs(self):
        self.assertTokensTypes(
            'bb-cc',
            ['ID', 'MINUS', 'ID'])

        self.assertTokensTypes(
            'foo & 0xFF',
            ['ID', 'AND', 'INT_CONST_HEX'])

        self.assertTokensTypes(
            '(2+k) * 62',
            ['LPAREN', 'INT_CONST_DEC', 'PLUS', 'ID',
             'RPAREN', 'TIMES', 'INT_CONST_DEC'])

        self.assertTokensTypes(
            'x | y >> z',
            ['ID', 'OR', 'ID', 'RSHIFT', 'ID'])

        self.assertTokensTypes(
            'x <<= z << 5',
            ['ID', 'LSHIFTEQUAL', 'ID', 'LSHIFT', 'INT_CONST_DEC'])

        self.assertTokensTypes(
            'x = y > 0 ? y : -6',
            ['ID', 'EQUALS',
             'ID', 'GT', 'INT_CONST_OCT',
             'CONDOP',
             'ID',
             'COLON',
             'MINUS', 'INT_CONST_DEC'])

        self.assertTokensTypes(
            'a+++b',
            ['ID', 'PLUSPLUS', 'PLUS', 'ID'])

    def test_statements(self):
        self.assertTokensTypes(
            'for (int i = 0; i < n; ++i)',
            ['FOR', 'LPAREN',
             'INT', 'ID', 'EQUALS', 'INT_CONST_OCT', 'SEMI',
             'ID', 'LT', 'ID', 'SEMI',
             'PLUSPLUS', 'ID',
             'RPAREN'])

        self.assertTokensTypes(
            'self: goto self;',
            ['ID', 'COLON', 'GOTO', 'ID', 'SEMI'])

        self.assertTokensTypes(
            """ switch (typ)
                {
                    case TYPE_ID:
                        m = 5;
                        break;
                    default:
                        m = 8;
                }""",
            ['SWITCH', 'LPAREN', 'ID', 'RPAREN',
             'LBRACE',
             'CASE', 'ID', 'COLON',
             'ID', 'EQUALS', 'INT_CONST_DEC', 'SEMI',
             'BREAK', 'SEMI',
             'DEFAULT', 'COLON',
             'ID', 'EQUALS', 'INT_CONST_DEC', 'SEMI',
             'RBRACE'])

    def test_preprocessor_line(self):
        self.assertTokensTypes('#abracadabra', ['PPHASH', 'ID'])

        text = r"""
        546
        #line 66 "kwas\df.h"
        id 4
        dsf
        # 9
        armo
        #line 10 "..\~..\test.h"
        tok1
        #line 99999 "include/me.h"
        tok2
        """

        self.clex.input(text)
        self.clex.reset_lineno()

        t1 = self.clex.token()
        self.assertEqual(t1.type, 'INT_CONST_DEC')
        self.assertEqual(t1.lineno, 2)

        t2 = self.clex.token()
        self.assertEqual(t2.type, 'ID')
        self.assertEqual(t2.value, 'id')
        self.assertEqual(t2.lineno, 66)
        self.assertEqual(self.clex.filename, r'kwas\df.h')

        # Advance three tokens; only the third one ('armo') is inspected.
        for _ in range(3):
            t = self.clex.token()

        self.assertEqual(t.type, 'ID')
        self.assertEqual(t.value, 'armo')
        self.assertEqual(t.lineno, 9)
        self.assertEqual(self.clex.filename, r'kwas\df.h')

        t4 = self.clex.token()
        self.assertEqual(t4.type, 'ID')
        self.assertEqual(t4.value, 'tok1')
        self.assertEqual(t4.lineno, 10)
        self.assertEqual(self.clex.filename, r'..\~..\test.h')

        t5 = self.clex.token()
        self.assertEqual(t5.type, 'ID')
        self.assertEqual(t5.value, 'tok2')
        self.assertEqual(t5.lineno, 99999)
        self.assertEqual(self.clex.filename, r'include/me.h')

    def test_preprocessor_line_funny(self):
        text = r'''
        #line 10 "..\6\joe.h"
        10
        '''
        self.clex.input(text)
        self.clex.reset_lineno()

        t1 = self.clex.token()
        self.assertEqual(t1.type, 'INT_CONST_DEC')
        self.assertEqual(t1.lineno, 10)
        self.assertEqual(self.clex.filename, r'..\6\joe.h')

    def test_preprocessor_pragma(self):
        text = '''
        42
        #pragma
        #pragma helo me
        #pragma once
        # pragma omp parallel private(th_id)
        #\tpragma {pack: 2, smack: 3}
        #pragma <includeme.h> "nowit.h"
        #pragma "string"
        #pragma somestring="some_other_string"
        #pragma id 124124 and numbers 0235495
        59
        '''
        # Check that pragmas are tokenized, including trailing string
        self.clex.input(text)
        self.clex.reset_lineno()

        t1 = self.clex.token()
        self.assertEqual(t1.type, 'INT_CONST_DEC')

        t2 = self.clex.token()
        self.assertEqual(t2.type, 'PPPRAGMA')

        t3 = self.clex.token()
        self.assertEqual(t3.type, 'PPPRAGMA')

        t4 = self.clex.token()
        self.assertEqual(t4.type, 'PPPRAGMASTR')
        self.assertEqual(t4.value, 'helo me')

        # Skip three tokens without inspecting them; the next one is
        # expected to be the string of the 'omp' pragma.
        for _ in range(3):
            self.clex.token()

        t5 = self.clex.token()
        self.assertEqual(t5.type, 'PPPRAGMASTR')
        self.assertEqual(t5.value, 'omp parallel private(th_id)')

        # The five remaining pragmas each yield a PPPRAGMA/PPPRAGMASTR pair.
        for _ in range(5):
            ta = self.clex.token()
            self.assertEqual(ta.type, 'PPPRAGMA')
            tb = self.clex.token()
            self.assertEqual(tb.type, 'PPPRAGMASTR')

        t6 = self.clex.token()
        self.assertEqual(t6.type, 'INT_CONST_DEC')
        self.assertEqual(t6.lineno, 12)
384
385
386
# Fragments of the error messages the lexer is expected to report,
# gathered in one place so that only these definitions need editing
# if the wording of the errors changes. Each value is searched for
# (as a regular expression) in the message the lexer reported.
#
ERR_ILLEGAL_CHAR = 'Illegal character'
ERR_OCTAL = 'Invalid octal constant'
ERR_UNMATCHED_QUOTE = 'Unmatched \''
ERR_INVALID_CCONST = 'Invalid char constant'
ERR_STRING_ESCAPE = 'String contains invalid escape'

# Errors expected for malformed #line directives
ERR_FILENAME_BEFORE_LINE = 'filename before line'
ERR_LINENUM_MISSING = 'line number missing'
ERR_INVALID_LINE_DIRECTIVE = 'invalid #line directive'
399
400
class TestCLexerErrors(unittest.TestCase):
    """ Test lexing of erroneous strings.
        The lexer is given an error callback that stores the reported
        message in self.error, where assertLexerError later inspects it.
    """
    def error_func(self, msg, line, column):
        """ Error callback handed to the lexer: remember the message. """
        self.error = msg

    def on_lbrace_func(self):
        """ '{' callback handed to the lexer; does nothing. """

    def on_rbrace_func(self):
        """ '}' callback handed to the lexer; does nothing. """

    def type_lookup_func(self, typ):
        """ Type-lookup callback handed to the lexer: no name is a type. """
        return False

    def setUp(self):
        lexer = CLexer(self.error_func, self.on_lbrace_func,
                       self.on_rbrace_func, self.type_lookup_func)
        lexer.build(optimize=False)
        self.clex = lexer
        # No error has been recorded yet
        self.error = ""

    def assertLexerError(self, str, error_like):
        """ Lex `str` to exhaustion and assert that the error recorded
            by error_func matches the regular expression `error_like`.
        """
        self.clex.input(str)

        # Draining the token stream is what triggers error_func; the
        # token types themselves are of no interest here.
        token_types(self.clex)

        matched = re.search(error_like, self.error)
        explanation = ("\nExpected error matching: %s\nGot: %s" %
                       (error_like, self.error))
        self.assertTrue(matched, explanation)

        # Forget the error so the next call starts from a clean slate
        self.error = ""

    def test_trivial_tokens(self):
        for bad_char in ('@', '`', '\\'):
            self.assertLexerError(bad_char, ERR_ILLEGAL_CHAR)

    def test_integer_constants(self):
        for bad_octal in ('029', '012345678'):
            self.assertLexerError(bad_octal, ERR_OCTAL)

    def test_char_constants(self):
        unterminated = ("'", "'b\n", "'\\xaa\n'")
        for text in unterminated:
            self.assertLexerError(text, ERR_UNMATCHED_QUOTE)

        malformed = (
            r"'123\12a'",
            r"'123\xabg'",
            "''",
            "'abcjx'",
            r"'\*'",
        )
        for text in malformed:
            self.assertLexerError(text, ERR_INVALID_CCONST)

    def test_string_literals(self):
        bad_escapes = (
            r'"jx\`"',
            r'"hekllo\* on ix"',
            r'L"hekllo\* on ix"',
            # Should not suffer from slow backtracking
            r'"\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\`\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123"',
            r'"\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\x23\`\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23"',
        )
        for text in bad_escapes:
            self.assertLexerError(text, ERR_STRING_ESCAPE)

        # Should not suffer from slow backtracking when there's no end quote
        unterminated = (
            r'"\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\`\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\12\123456',
            r'"\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\`\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x2\x23456',
        )
        for text in unterminated:
            self.assertLexerError(text, ERR_ILLEGAL_CHAR)

    def test_preprocessor(self):
        cases = (
            ('#line "ka"', ERR_FILENAME_BEFORE_LINE),
            ('#line df', ERR_INVALID_LINE_DIRECTIVE),
            ('#line \n', ERR_LINENUM_MISSING),
        )
        for text, expected in cases:
            self.assertLexerError(text, expected)
477
478
if __name__ == '__main__':
    # Executed directly as a script: hand control to unittest's
    # command-line runner for the tests defined in this module.
    unittest.main()
481