/* This is the Assembler Pre-Processor
   Copyright (C) 1987-2014 Free Software Foundation, Inc.

   This file is part of GAS, the GNU Assembler.

   GAS is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GAS is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
   License for more details.

   You should have received a copy of the GNU General Public License
   along with GAS; see the file COPYING.  If not, write to the Free
   Software Foundation, 51 Franklin Street - Fifth Floor, Boston, MA
   02110-1301, USA.  */

/* Modified by Allen Wirfs-Brock, Instantiations Inc 2/90.  */
/* App, the assembler pre-processor.  This pre-processor strips out
   excess spaces, turns single-quoted characters into a decimal
   constant, and turns the # in # <number> <filename> <garbage> into a
   .linefile.  This needs better error-handling.  */
26 
27 #include "as.h"
28 
/* Compilers that do not claim ISO C conformance may not understand
   `const'; make the keyword vanish for them.  */
#if (__STDC__ != 1)
#ifndef const
#define const  /* empty */
#endif
#endif

#ifdef H_TICK_HEX
/* When nonzero, do_scrub_begin classifies 'h' and 'H' as LEX_IS_H.  */
int enable_h_tick_hex = 0;
#endif

#ifdef TC_M68K
/* Whether we are scrubbing in m68k MRI mode.  This is different from
   flag_m68k_mri, because the two flags will be affected by the .mri
   pseudo-op at different times.  */
static int scrub_m68k_mri;

/* The pseudo-op which switches in and out of MRI mode.  See the
   comment in do_scrub_chars.  */
static const char mri_pseudo[] = ".mri 0";
#else
/* Without m68k support the scrubber is never in MRI mode, so every
   test of scrub_m68k_mri folds to a constant.  */
#define scrub_m68k_mri 0
#endif

#if defined TC_ARM && defined OBJ_ELF
/* The pseudo-op for which we need to special-case `@' characters.
   See the comment in do_scrub_chars.  */
static const char   symver_pseudo[] = ".symver";
/* Position reached inside symver_pseudo while matching the input, or
   NULL when no match is in progress.  */
static const char * symver_state;
#endif
58 
/* Classification table for the scrubber: one entry per character
   value, holding one of the LEX_IS_* codes below, or 0 for a character
   with no special meaning.  It is filled in by do_scrub_begin.  */
static char lex[256];

/* Characters that are always treated as symbol components.  */
static const char symbol_chars[] =
"$._ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";

/* Character classes stored in lex[].  */
#define LEX_IS_SYMBOL_COMPONENT		1
#define LEX_IS_WHITESPACE		2
#define LEX_IS_LINE_SEPARATOR		3
#define LEX_IS_COMMENT_START		4
#define LEX_IS_LINE_COMMENT_START	5
#define	LEX_IS_TWOCHAR_COMMENT_1ST	6
#define	LEX_IS_STRINGQUOTE		8
#define	LEX_IS_COLON			9
#define	LEX_IS_NEWLINE			10
#define	LEX_IS_ONECHAR_QUOTE		11
#ifdef TC_V850
#define LEX_IS_DOUBLEDASH_1ST		12
#endif
#ifdef TC_M32R
#define DOUBLEBAR_PARALLEL
#endif
#ifdef DOUBLEBAR_PARALLEL
#define LEX_IS_DOUBLEBAR_1ST		13
#endif
#define LEX_IS_PARALLEL_SEPARATOR	14
#ifdef H_TICK_HEX
#define LEX_IS_H			15
#endif

/* Predicates over lex[].  The argument is used directly as an array
   index, so it must be a valid index (0..255); in particular callers
   must not pass EOF.  */
#define IS_SYMBOL_COMPONENT(c)		(lex[(c)] == LEX_IS_SYMBOL_COMPONENT)
#define IS_WHITESPACE(c)		(lex[(c)] == LEX_IS_WHITESPACE)
#define IS_LINE_SEPARATOR(c)		(lex[(c)] == LEX_IS_LINE_SEPARATOR)
#define IS_PARALLEL_SEPARATOR(c)	(lex[(c)] == LEX_IS_PARALLEL_SEPARATOR)
#define IS_COMMENT(c)			(lex[(c)] == LEX_IS_COMMENT_START)
#define IS_LINE_COMMENT(c)		(lex[(c)] == LEX_IS_LINE_COMMENT_START)
#define	IS_NEWLINE(c)			(lex[(c)] == LEX_IS_NEWLINE)

static int process_escape (int);
95 
/* FIXME-soon: The entire lexer/parser thingy should be
   built statically at compile time rather than dynamically
   each and every time the assembler is run.  xoxorich.  */
99 
100 void
do_scrub_begin(int m68k_mri ATTRIBUTE_UNUSED)101 do_scrub_begin (int m68k_mri ATTRIBUTE_UNUSED)
102 {
103   const char *p;
104   int c;
105 
106   lex[' '] = LEX_IS_WHITESPACE;
107   lex['\t'] = LEX_IS_WHITESPACE;
108   lex['\r'] = LEX_IS_WHITESPACE;
109   lex['\n'] = LEX_IS_NEWLINE;
110   lex[':'] = LEX_IS_COLON;
111 
112 #ifdef TC_M68K
113   scrub_m68k_mri = m68k_mri;
114 
115   if (! m68k_mri)
116 #endif
117     {
118       lex['"'] = LEX_IS_STRINGQUOTE;
119 
120 #if ! defined (TC_HPPA) && ! defined (TC_I370)
121       /* I370 uses single-quotes to delimit integer, float constants.  */
122       lex['\''] = LEX_IS_ONECHAR_QUOTE;
123 #endif
124 
125 #ifdef SINGLE_QUOTE_STRINGS
126       lex['\''] = LEX_IS_STRINGQUOTE;
127 #endif
128     }
129 
130   /* Note: if any other character can be LEX_IS_STRINGQUOTE, the loop
131      in state 5 of do_scrub_chars must be changed.  */
132 
133   /* Note that these override the previous defaults, e.g. if ';' is a
134      comment char, then it isn't a line separator.  */
135   for (p = symbol_chars; *p; ++p)
136     lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
137 
138   for (c = 128; c < 256; ++c)
139     lex[c] = LEX_IS_SYMBOL_COMPONENT;
140 
141 #ifdef tc_symbol_chars
142   /* This macro permits the processor to specify all characters which
143      may appears in an operand.  This will prevent the scrubber from
144      discarding meaningful whitespace in certain cases.  The i386
145      backend uses this to support prefixes, which can confuse the
146      scrubber as to whether it is parsing operands or opcodes.  */
147   for (p = tc_symbol_chars; *p; ++p)
148     lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
149 #endif
150 
151   /* The m68k backend wants to be able to change comment_chars.  */
152 #ifndef tc_comment_chars
153 #define tc_comment_chars comment_chars
154 #endif
155   for (p = tc_comment_chars; *p; p++)
156     lex[(unsigned char) *p] = LEX_IS_COMMENT_START;
157 
158   for (p = line_comment_chars; *p; p++)
159     lex[(unsigned char) *p] = LEX_IS_LINE_COMMENT_START;
160 
161 #ifndef tc_line_separator_chars
162 #define tc_line_separator_chars line_separator_chars
163 #endif
164   for (p = tc_line_separator_chars; *p; p++)
165     lex[(unsigned char) *p] = LEX_IS_LINE_SEPARATOR;
166 
167 #ifdef tc_parallel_separator_chars
168   /* This macro permits the processor to specify all characters which
169      separate parallel insns on the same line.  */
170   for (p = tc_parallel_separator_chars; *p; p++)
171     lex[(unsigned char) *p] = LEX_IS_PARALLEL_SEPARATOR;
172 #endif
173 
174   /* Only allow slash-star comments if slash is not in use.
175      FIXME: This isn't right.  We should always permit them.  */
176   if (lex['/'] == 0)
177     lex['/'] = LEX_IS_TWOCHAR_COMMENT_1ST;
178 
179 #ifdef TC_M68K
180   if (m68k_mri)
181     {
182       lex['\''] = LEX_IS_STRINGQUOTE;
183       lex[';'] = LEX_IS_COMMENT_START;
184       lex['*'] = LEX_IS_LINE_COMMENT_START;
185       /* The MRI documentation says '!' is LEX_IS_COMMENT_START, but
186 	 then it can't be used in an expression.  */
187       lex['!'] = LEX_IS_LINE_COMMENT_START;
188     }
189 #endif
190 
191 #ifdef TC_V850
192   lex['-'] = LEX_IS_DOUBLEDASH_1ST;
193 #endif
194 #ifdef DOUBLEBAR_PARALLEL
195   lex['|'] = LEX_IS_DOUBLEBAR_1ST;
196 #endif
197 #ifdef TC_D30V
198   /* Must do this is we want VLIW instruction with "->" or "<-".  */
199   lex['-'] = LEX_IS_SYMBOL_COMPONENT;
200 #endif
201 
202 #ifdef H_TICK_HEX
203   if (enable_h_tick_hex)
204     {
205       lex['h'] = LEX_IS_H;
206       lex['H'] = LEX_IS_H;
207     }
208 #endif
209 }
210 
/* Saved state of the scrubber.  do_scrub_chars may return at any
   point, so everything it needs to resume lives in these variables.  */

/* Current state of the state machine (see the table at the top of
   do_scrub_chars), and the state to go back to once a temporary
   state such as a string or an out_string flush has finished.  */
static int state;
static int old_state;
/* In state -1, the next character of pending output text.  */
static char *out_string;
/* Holds the decimal text generated for a single-quoted character.  */
static char out_buf[20];
/* Number of newlines swallowed inside comments or continued strings
   which still have to be emitted.  */
static int add_newlines;
/* Input which had been read but not yet scrubbed when do_scrub_chars
   last returned, or NULL if there is none, and its length.  */
static char *saved_input;
static size_t saved_input_len;
/* Buffer handed to the GET callback of do_scrub_chars.  */
static char input_buffer[32 * 1024];
/* Position reached inside mri_pseudo while matching the input, or
   NULL when no match is in progress, and the last character matched.  */
static const char *mri_state;
static char mri_last_ch;

/* Data structure for saving the state of app across #include's.  Note that
   app is called asynchronously to the parsing of the .include's, so our
   state at the time .include is interpreted is completely unrelated.
   That's why we have to save it all.  Each member mirrors the file
   scope variable of the same name.  */

struct app_save
{
  int          state;
  int          old_state;
  char *       out_string;
  char         out_buf[sizeof (out_buf)];
  int          add_newlines;
  char *       saved_input;	/* Heap copy made by app_push, or NULL.  */
  size_t       saved_input_len;
#ifdef TC_M68K
  int          scrub_m68k_mri;
#endif
  const char * mri_state;
  char         mri_last_ch;
#if defined TC_ARM && defined OBJ_ELF
  const char * symver_state;
#endif
};
246 
247 char *
app_push(void)248 app_push (void)
249 {
250   struct app_save *saved;
251 
252   saved = (struct app_save *) xmalloc (sizeof (*saved));
253   saved->state = state;
254   saved->old_state = old_state;
255   saved->out_string = out_string;
256   memcpy (saved->out_buf, out_buf, sizeof (out_buf));
257   saved->add_newlines = add_newlines;
258   if (saved_input == NULL)
259     saved->saved_input = NULL;
260   else
261     {
262       saved->saved_input = (char *) xmalloc (saved_input_len);
263       memcpy (saved->saved_input, saved_input, saved_input_len);
264       saved->saved_input_len = saved_input_len;
265     }
266 #ifdef TC_M68K
267   saved->scrub_m68k_mri = scrub_m68k_mri;
268 #endif
269   saved->mri_state = mri_state;
270   saved->mri_last_ch = mri_last_ch;
271 #if defined TC_ARM && defined OBJ_ELF
272   saved->symver_state = symver_state;
273 #endif
274 
275   /* do_scrub_begin() is not useful, just wastes time.  */
276 
277   state = 0;
278   saved_input = NULL;
279   add_newlines = 0;
280 
281   return (char *) saved;
282 }
283 
284 void
app_pop(char * arg)285 app_pop (char *arg)
286 {
287   struct app_save *saved = (struct app_save *) arg;
288 
289   /* There is no do_scrub_end ().  */
290   state = saved->state;
291   old_state = saved->old_state;
292   out_string = saved->out_string;
293   memcpy (out_buf, saved->out_buf, sizeof (out_buf));
294   add_newlines = saved->add_newlines;
295   if (saved->saved_input == NULL)
296     saved_input = NULL;
297   else
298     {
299       gas_assert (saved->saved_input_len <= sizeof (input_buffer));
300       memcpy (input_buffer, saved->saved_input, saved->saved_input_len);
301       saved_input = input_buffer;
302       saved_input_len = saved->saved_input_len;
303       free (saved->saved_input);
304     }
305 #ifdef TC_M68K
306   scrub_m68k_mri = saved->scrub_m68k_mri;
307 #endif
308   mri_state = saved->mri_state;
309   mri_last_ch = saved->mri_last_ch;
310 #if defined TC_ARM && defined OBJ_ELF
311   symver_state = saved->symver_state;
312 #endif
313 
314   free (arg);
315 }
316 
/* @@ This assumes that \n &c are the same on host and target.  This is not
   necessarily true.  */

/* Translate CH, the character which followed a backslash inside a
   single-quoted character constant, into the character the escape
   stands for.  The letters b, f, n, r and t yield the corresponding
   control character; any other character, including the quote
   characters, is returned unchanged.  */

static int
process_escape (int ch)
{
  switch (ch)
    {
    case 'b':
      return '\b';
    case 'f':
      return '\f';
    case 'n':
      return '\n';
    case 'r':
      return '\r';
    case 't':
      return '\t';
    case '\'':
      return '\'';
    case '"':
      return '\"';
    default:
      return ch;
    }
}
343 
344 /* This function is called to process input characters.  The GET
345    parameter is used to retrieve more input characters.  GET should
346    set its parameter to point to a buffer, and return the length of
347    the buffer; it should return 0 at end of file.  The scrubbed output
348    characters are put into the buffer starting at TOSTART; the TOSTART
349    buffer is TOLEN bytes in length.  The function returns the number
350    of scrubbed characters put into TOSTART.  This will be TOLEN unless
351    end of file was seen.  This function is arranged as a state
352    machine, and saves its state so that it may return at any point.
353    This is the way the old code used to work.  */
354 
355 size_t
do_scrub_chars(size_t (* get)(char *,size_t),char * tostart,size_t tolen)356 do_scrub_chars (size_t (*get) (char *, size_t), char *tostart, size_t tolen)
357 {
358   char *to = tostart;
359   char *toend = tostart + tolen;
360   char *from;
361   char *fromend;
362   size_t fromlen;
363   int ch, ch2 = 0;
364   /* Character that started the string we're working on.  */
365   static char quotechar;
366 
367   /*State 0: beginning of normal line
368 	  1: After first whitespace on line (flush more white)
369 	  2: After first non-white (opcode) on line (keep 1white)
370 	  3: after second white on line (into operands) (flush white)
371 	  4: after putting out a .linefile, put out digits
372 	  5: parsing a string, then go to old-state
373 	  6: putting out \ escape in a "d string.
374 	  7: no longer used
375 	  8: no longer used
376 	  9: After seeing symbol char in state 3 (keep 1white after symchar)
377 	 10: After seeing whitespace in state 9 (keep white before symchar)
378 	 11: After seeing a symbol character in state 0 (eg a label definition)
379 	 -1: output string in out_string and go to the state in old_state
380 	 -2: flush text until a '*' '/' is seen, then go to state old_state
381 #ifdef TC_V850
382 	 12: After seeing a dash, looking for a second dash as a start
383 	     of comment.
384 #endif
385 #ifdef DOUBLEBAR_PARALLEL
386 	 13: After seeing a vertical bar, looking for a second
387 	     vertical bar as a parallel expression separator.
388 #endif
389 #ifdef TC_PREDICATE_START_CHAR
390 	 14: After seeing a predicate start character at state 0, looking
391 	     for a predicate end character as predicate.
392 	 15: After seeing a predicate start character at state 1, looking
393 	     for a predicate end character as predicate.
394 #endif
395 #ifdef TC_Z80
396 	 16: After seeing an 'a' or an 'A' at the start of a symbol
397 	 17: After seeing an 'f' or an 'F' in state 16
398 #endif
399 	  */
400 
401   /* I added states 9 and 10 because the MIPS ECOFF assembler uses
402      constructs like ``.loc 1 20''.  This was turning into ``.loc
403      120''.  States 9 and 10 ensure that a space is never dropped in
404      between characters which could appear in an identifier.  Ian
405      Taylor, ian@cygnus.com.
406 
407      I added state 11 so that something like "Lfoo add %r25,%r26,%r27" works
408      correctly on the PA (and any other target where colons are optional).
409      Jeff Law, law@cs.utah.edu.
410 
411      I added state 13 so that something like "cmp r1, r2 || trap #1" does not
412      get squashed into "cmp r1,r2||trap#1", with the all important space
413      between the 'trap' and the '#1' being eliminated.  nickc@cygnus.com  */
414 
415   /* This macro gets the next input character.  */
416 
417 #define GET()							\
418   (from < fromend						\
419    ? * (unsigned char *) (from++)				\
420    : (saved_input = NULL,					\
421       fromlen = (*get) (input_buffer, sizeof input_buffer),	\
422       from = input_buffer,					\
423       fromend = from + fromlen,					\
424       (fromlen == 0						\
425        ? EOF							\
426        : * (unsigned char *) (from++))))
427 
428   /* This macro pushes a character back on the input stream.  */
429 
430 #define UNGET(uch) (*--from = (uch))
431 
432   /* This macro puts a character into the output buffer.  If this
433      character fills the output buffer, this macro jumps to the label
434      TOFULL.  We use this rather ugly approach because we need to
435      handle two different termination conditions: EOF on the input
436      stream, and a full output buffer.  It would be simpler if we
437      always read in the entire input stream before processing it, but
438      I don't want to make such a significant change to the assembler's
439      memory usage.  */
440 
441 #define PUT(pch)				\
442   do						\
443     {						\
444       *to++ = (pch);				\
445       if (to >= toend)				\
446 	goto tofull;				\
447     }						\
448   while (0)
449 
450   if (saved_input != NULL)
451     {
452       from = saved_input;
453       fromend = from + saved_input_len;
454     }
455   else
456     {
457       fromlen = (*get) (input_buffer, sizeof input_buffer);
458       if (fromlen == 0)
459 	return 0;
460       from = input_buffer;
461       fromend = from + fromlen;
462     }
463 
464   while (1)
465     {
466       /* The cases in this switch end with continue, in order to
467 	 branch back to the top of this while loop and generate the
468 	 next output character in the appropriate state.  */
469       switch (state)
470 	{
471 	case -1:
472 	  ch = *out_string++;
473 	  if (*out_string == '\0')
474 	    {
475 	      state = old_state;
476 	      old_state = 3;
477 	    }
478 	  PUT (ch);
479 	  continue;
480 
481 	case -2:
482 	  for (;;)
483 	    {
484 	      do
485 		{
486 		  ch = GET ();
487 
488 		  if (ch == EOF)
489 		    {
490 		      as_warn (_("end of file in comment"));
491 		      goto fromeof;
492 		    }
493 
494 		  if (ch == '\n')
495 		    PUT ('\n');
496 		}
497 	      while (ch != '*');
498 
499 	      while ((ch = GET ()) == '*')
500 		;
501 
502 	      if (ch == EOF)
503 		{
504 		  as_warn (_("end of file in comment"));
505 		  goto fromeof;
506 		}
507 
508 	      if (ch == '/')
509 		break;
510 
511 	      UNGET (ch);
512 	    }
513 
514 	  state = old_state;
515 	  UNGET (' ');
516 	  continue;
517 
518 	case 4:
519 	  ch = GET ();
520 	  if (ch == EOF)
521 	    goto fromeof;
522 	  else if (ch >= '0' && ch <= '9')
523 	    PUT (ch);
524 	  else
525 	    {
526 	      while (ch != EOF && IS_WHITESPACE (ch))
527 		ch = GET ();
528 	      if (ch == '"')
529 		{
530 		  quotechar = ch;
531 		  state = 5;
532 		  old_state = 3;
533 		  PUT (ch);
534 		}
535 	      else
536 		{
537 		  while (ch != EOF && ch != '\n')
538 		    ch = GET ();
539 		  state = 0;
540 		  PUT (ch);
541 		}
542 	    }
543 	  continue;
544 
545 	case 5:
546 	  /* We are going to copy everything up to a quote character,
547 	     with special handling for a backslash.  We try to
548 	     optimize the copying in the simple case without using the
549 	     GET and PUT macros.  */
550 	  {
551 	    char *s;
552 	    ptrdiff_t len;
553 
554 	    for (s = from; s < fromend; s++)
555 	      {
556 		ch = *s;
557 		if (ch == '\\'
558 		    || ch == quotechar
559 		    || ch == '\n')
560 		  break;
561 	      }
562 	    len = s - from;
563 	    if (len > toend - to)
564 	      len = toend - to;
565 	    if (len > 0)
566 	      {
567 		memcpy (to, from, len);
568 		to += len;
569 		from += len;
570 		if (to >= toend)
571 		  goto tofull;
572 	      }
573 	  }
574 
575 	  ch = GET ();
576 	  if (ch == EOF)
577 	    {
578 	      /* This buffer is here specifically so
579 		 that the UNGET below will work.  */
580 	      static char one_char_buf[1];
581 
582 	      as_warn (_("end of file in string; '%c' inserted"), quotechar);
583 	      state = old_state;
584 	      from = fromend = one_char_buf + 1;
585 	      fromlen = 1;
586 	      UNGET ('\n');
587 	      PUT (quotechar);
588 	    }
589 	  else if (ch == quotechar)
590 	    {
591 	      state = old_state;
592 	      PUT (ch);
593 	    }
594 #ifndef NO_STRING_ESCAPES
595 	  else if (ch == '\\')
596 	    {
597 	      state = 6;
598 	      PUT (ch);
599 	    }
600 #endif
601 	  else if (scrub_m68k_mri && ch == '\n')
602 	    {
603 	      /* Just quietly terminate the string.  This permits lines like
604 		   bne	label	loop if we haven't reach end yet.  */
605 	      state = old_state;
606 	      UNGET (ch);
607 	      PUT ('\'');
608 	    }
609 	  else
610 	    {
611 	      PUT (ch);
612 	    }
613 	  continue;
614 
615 	case 6:
616 	  state = 5;
617 	  ch = GET ();
618 	  switch (ch)
619 	    {
620 	      /* Handle strings broken across lines, by turning '\n' into
621 		 '\\' and 'n'.  */
622 	    case '\n':
623 	      UNGET ('n');
624 	      add_newlines++;
625 	      PUT ('\\');
626 	      continue;
627 
628 	    case EOF:
629 	      as_warn (_("end of file in string; '%c' inserted"), quotechar);
630 	      PUT (quotechar);
631 	      continue;
632 
633 	    case '"':
634 	    case '\\':
635 	    case 'b':
636 	    case 'f':
637 	    case 'n':
638 	    case 'r':
639 	    case 't':
640 	    case 'v':
641 	    case 'x':
642 	    case 'X':
643 	    case '0':
644 	    case '1':
645 	    case '2':
646 	    case '3':
647 	    case '4':
648 	    case '5':
649 	    case '6':
650 	    case '7':
651 	      break;
652 
653 	    default:
654 #ifdef ONLY_STANDARD_ESCAPES
655 	      as_warn (_("unknown escape '\\%c' in string; ignored"), ch);
656 #endif
657 	      break;
658 	    }
659 	  PUT (ch);
660 	  continue;
661 
662 #ifdef DOUBLEBAR_PARALLEL
663 	case 13:
664 	  ch = GET ();
665 	  if (ch != '|')
666 	    abort ();
667 
668 	  /* Reset back to state 1 and pretend that we are parsing a
669 	     line from just after the first white space.  */
670 	  state = 1;
671 	  PUT ('|');
672 #ifdef TC_TIC6X
673 	  /* "||^" is used for SPMASKed instructions.  */
674 	  ch = GET ();
675 	  if (ch == EOF)
676 	    goto fromeof;
677 	  else if (ch == '^')
678 	    PUT ('^');
679 	  else
680 	    UNGET (ch);
681 #endif
682 	  continue;
683 #endif
684 #ifdef TC_Z80
685 	case 16:
686 	  /* We have seen an 'a' at the start of a symbol, look for an 'f'.  */
687 	  ch = GET ();
688 	  if (ch == 'f' || ch == 'F')
689 	    {
690 	      state = 17;
691 	      PUT (ch);
692 	    }
693 	  else
694 	    {
695 	      state = 9;
696 	      break;
697 	    }
698 	case 17:
699 	  /* We have seen "af" at the start of a symbol,
700 	     a ' here is a part of that symbol.  */
701 	  ch = GET ();
702 	  state = 9;
703 	  if (ch == '\'')
704 	    /* Change to avoid warning about unclosed string.  */
705 	    PUT ('`');
706 	  else if (ch != EOF)
707 	    UNGET (ch);
708 	  break;
709 #endif
710 	}
711 
712       /* OK, we are somewhere in states 0 through 4 or 9 through 11.  */
713 
714       /* flushchar: */
715       ch = GET ();
716 
717 #ifdef TC_PREDICATE_START_CHAR
718       if (ch == TC_PREDICATE_START_CHAR && (state == 0 || state == 1))
719 	{
720 	  state += 14;
721 	  PUT (ch);
722 	  continue;
723 	}
724       else if (state == 14 || state == 15)
725 	{
726 	  if (ch == TC_PREDICATE_END_CHAR)
727 	    {
728 	      state -= 14;
729 	      PUT (ch);
730 	      ch = GET ();
731 	    }
732 	  else
733 	    {
734 	      PUT (ch);
735 	      continue;
736 	    }
737 	}
738 #endif
739 
740     recycle:
741 
742 #if defined TC_ARM && defined OBJ_ELF
743       /* We need to watch out for .symver directives.  See the comment later
744 	 in this function.  */
745       if (symver_state == NULL)
746 	{
747 	  if ((state == 0 || state == 1) && ch == symver_pseudo[0])
748 	    symver_state = symver_pseudo + 1;
749 	}
750       else
751 	{
752 	  /* We advance to the next state if we find the right
753 	     character.  */
754 	  if (ch != '\0' && (*symver_state == ch))
755 	    ++symver_state;
756 	  else if (*symver_state != '\0')
757 	    /* We did not get the expected character, or we didn't
758 	       get a valid terminating character after seeing the
759 	       entire pseudo-op, so we must go back to the beginning.  */
760 	    symver_state = NULL;
761 	  else
762 	    {
763 	      /* We've read the entire pseudo-op.  If this is the end
764 		 of the line, go back to the beginning.  */
765 	      if (IS_NEWLINE (ch))
766 		symver_state = NULL;
767 	    }
768 	}
769 #endif /* TC_ARM && OBJ_ELF */
770 
771 #ifdef TC_M68K
772       /* We want to have pseudo-ops which control whether we are in
773 	 MRI mode or not.  Unfortunately, since m68k MRI mode affects
774 	 the scrubber, that means that we need a special purpose
775 	 recognizer here.  */
776       if (mri_state == NULL)
777 	{
778 	  if ((state == 0 || state == 1)
779 	      && ch == mri_pseudo[0])
780 	    mri_state = mri_pseudo + 1;
781 	}
782       else
783 	{
784 	  /* We advance to the next state if we find the right
785 	     character, or if we need a space character and we get any
786 	     whitespace character, or if we need a '0' and we get a
787 	     '1' (this is so that we only need one state to handle
788 	     ``.mri 0'' and ``.mri 1'').  */
789 	  if (ch != '\0'
790 	      && (*mri_state == ch
791 		  || (*mri_state == ' '
792 		      && lex[ch] == LEX_IS_WHITESPACE)
793 		  || (*mri_state == '0'
794 		      && ch == '1')))
795 	    {
796 	      mri_last_ch = ch;
797 	      ++mri_state;
798 	    }
799 	  else if (*mri_state != '\0'
800 		   || (lex[ch] != LEX_IS_WHITESPACE
801 		       && lex[ch] != LEX_IS_NEWLINE))
802 	    {
803 	      /* We did not get the expected character, or we didn't
804 		 get a valid terminating character after seeing the
805 		 entire pseudo-op, so we must go back to the
806 		 beginning.  */
807 	      mri_state = NULL;
808 	    }
809 	  else
810 	    {
811 	      /* We've read the entire pseudo-op.  mips_last_ch is
812 		 either '0' or '1' indicating whether to enter or
813 		 leave MRI mode.  */
814 	      do_scrub_begin (mri_last_ch == '1');
815 	      mri_state = NULL;
816 
817 	      /* We continue handling the character as usual.  The
818 		 main gas reader must also handle the .mri pseudo-op
819 		 to control expression parsing and the like.  */
820 	    }
821 	}
822 #endif
823 
824       if (ch == EOF)
825 	{
826 	  if (state != 0)
827 	    {
828 	      as_warn (_("end of file not at end of a line; newline inserted"));
829 	      state = 0;
830 	      PUT ('\n');
831 	    }
832 	  goto fromeof;
833 	}
834 
835       switch (lex[ch])
836 	{
837 	case LEX_IS_WHITESPACE:
838 	  do
839 	    {
840 	      ch = GET ();
841 	    }
842 	  while (ch != EOF && IS_WHITESPACE (ch));
843 	  if (ch == EOF)
844 	    goto fromeof;
845 
846 	  if (state == 0)
847 	    {
848 	      /* Preserve a single whitespace character at the
849 		 beginning of a line.  */
850 	      state = 1;
851 	      UNGET (ch);
852 	      PUT (' ');
853 	      break;
854 	    }
855 
856 #ifdef KEEP_WHITE_AROUND_COLON
857 	  if (lex[ch] == LEX_IS_COLON)
858 	    {
859 	      /* Only keep this white if there's no white *after* the
860 		 colon.  */
861 	      ch2 = GET ();
862 	      if (ch2 != EOF)
863 		UNGET (ch2);
864 	      if (!IS_WHITESPACE (ch2))
865 		{
866 		  state = 9;
867 		  UNGET (ch);
868 		  PUT (' ');
869 		  break;
870 		}
871 	    }
872 #endif
873 	  if (IS_COMMENT (ch)
874 	      || ch == '/'
875 	      || IS_LINE_SEPARATOR (ch)
876 	      || IS_PARALLEL_SEPARATOR (ch))
877 	    {
878 	      if (scrub_m68k_mri)
879 		{
880 		  /* In MRI mode, we keep these spaces.  */
881 		  UNGET (ch);
882 		  PUT (' ');
883 		  break;
884 		}
885 	      goto recycle;
886 	    }
887 
888 	  /* If we're in state 2 or 11, we've seen a non-white
889 	     character followed by whitespace.  If the next character
890 	     is ':', this is whitespace after a label name which we
891 	     normally must ignore.  In MRI mode, though, spaces are
892 	     not permitted between the label and the colon.  */
893 	  if ((state == 2 || state == 11)
894 	      && lex[ch] == LEX_IS_COLON
895 	      && ! scrub_m68k_mri)
896 	    {
897 	      state = 1;
898 	      PUT (ch);
899 	      break;
900 	    }
901 
902 	  switch (state)
903 	    {
904 	    case 1:
905 	      /* We can arrive here if we leave a leading whitespace
906 		 character at the beginning of a line.  */
907 	      goto recycle;
908 	    case 2:
909 	      state = 3;
910 	      if (to + 1 < toend)
911 		{
912 		  /* Optimize common case by skipping UNGET/GET.  */
913 		  PUT (' ');	/* Sp after opco */
914 		  goto recycle;
915 		}
916 	      UNGET (ch);
917 	      PUT (' ');
918 	      break;
919 	    case 3:
920 #ifndef TC_KEEP_OPERAND_SPACES
921 	      /* For TI C6X, we keep these spaces as they may separate
922 		 functional unit specifiers from operands.  */
923 	      if (scrub_m68k_mri)
924 #endif
925 		{
926 		  /* In MRI mode, we keep these spaces.  */
927 		  UNGET (ch);
928 		  PUT (' ');
929 		  break;
930 		}
931 	      goto recycle;	/* Sp in operands */
932 	    case 9:
933 	    case 10:
934 #ifndef TC_KEEP_OPERAND_SPACES
935 	      if (scrub_m68k_mri)
936 #endif
937 		{
938 		  /* In MRI mode, we keep these spaces.  */
939 		  state = 3;
940 		  UNGET (ch);
941 		  PUT (' ');
942 		  break;
943 		}
944 	      state = 10;	/* Sp after symbol char */
945 	      goto recycle;
946 	    case 11:
947 	      if (LABELS_WITHOUT_COLONS || flag_m68k_mri)
948 		state = 1;
949 	      else
950 		{
951 		  /* We know that ch is not ':', since we tested that
952 		     case above.  Therefore this is not a label, so it
953 		     must be the opcode, and we've just seen the
954 		     whitespace after it.  */
955 		  state = 3;
956 		}
957 	      UNGET (ch);
958 	      PUT (' ');	/* Sp after label definition.  */
959 	      break;
960 	    default:
961 	      BAD_CASE (state);
962 	    }
963 	  break;
964 
965 	case LEX_IS_TWOCHAR_COMMENT_1ST:
966 	  ch2 = GET ();
967 	  if (ch2 == '*')
968 	    {
969 	      for (;;)
970 		{
971 		  do
972 		    {
973 		      ch2 = GET ();
974 		      if (ch2 != EOF && IS_NEWLINE (ch2))
975 			add_newlines++;
976 		    }
977 		  while (ch2 != EOF && ch2 != '*');
978 
979 		  while (ch2 == '*')
980 		    ch2 = GET ();
981 
982 		  if (ch2 == EOF || ch2 == '/')
983 		    break;
984 
985 		  /* This UNGET will ensure that we count newlines
986 		     correctly.  */
987 		  UNGET (ch2);
988 		}
989 
990 	      if (ch2 == EOF)
991 		as_warn (_("end of file in multiline comment"));
992 
993 	      ch = ' ';
994 	      goto recycle;
995 	    }
996 #ifdef DOUBLESLASH_LINE_COMMENTS
997 	  else if (ch2 == '/')
998 	    {
999 	      do
1000 		{
1001 		  ch = GET ();
1002 		}
1003 	      while (ch != EOF && !IS_NEWLINE (ch));
1004 	      if (ch == EOF)
1005 		as_warn ("end of file in comment; newline inserted");
1006 	      state = 0;
1007 	      PUT ('\n');
1008 	      break;
1009 	    }
1010 #endif
1011 	  else
1012 	    {
1013 	      if (ch2 != EOF)
1014 		UNGET (ch2);
1015 	      if (state == 9 || state == 10)
1016 		state = 3;
1017 	      PUT (ch);
1018 	    }
1019 	  break;
1020 
1021 	case LEX_IS_STRINGQUOTE:
1022 	  quotechar = ch;
1023 	  if (state == 10)
1024 	    {
1025 	      /* Preserve the whitespace in foo "bar".  */
1026 	      UNGET (ch);
1027 	      state = 3;
1028 	      PUT (' ');
1029 
1030 	      /* PUT didn't jump out.  We could just break, but we
1031 		 know what will happen, so optimize a bit.  */
1032 	      ch = GET ();
1033 	      old_state = 3;
1034 	    }
1035 	  else if (state == 9)
1036 	    old_state = 3;
1037 	  else
1038 	    old_state = state;
1039 	  state = 5;
1040 	  PUT (ch);
1041 	  break;
1042 
1043 #ifndef IEEE_STYLE
1044 	case LEX_IS_ONECHAR_QUOTE:
1045 #ifdef H_TICK_HEX
1046 	  if (state == 9 && enable_h_tick_hex)
1047 	    {
1048 	      char c;
1049 
1050 	      c = GET ();
1051 	      as_warn ("'%c found after symbol", c);
1052 	      UNGET (c);
1053 	    }
1054 #endif
1055 	  if (state == 10)
1056 	    {
1057 	      /* Preserve the whitespace in foo 'b'.  */
1058 	      UNGET (ch);
1059 	      state = 3;
1060 	      PUT (' ');
1061 	      break;
1062 	    }
1063 	  ch = GET ();
1064 	  if (ch == EOF)
1065 	    {
1066 	      as_warn (_("end of file after a one-character quote; \\0 inserted"));
1067 	      ch = 0;
1068 	    }
1069 	  if (ch == '\\')
1070 	    {
1071 	      ch = GET ();
1072 	      if (ch == EOF)
1073 		{
1074 		  as_warn (_("end of file in escape character"));
1075 		  ch = '\\';
1076 		}
1077 	      else
1078 		ch = process_escape (ch);
1079 	    }
1080 	  sprintf (out_buf, "%d", (int) (unsigned char) ch);
1081 
1082 	  /* None of these 'x constants for us.  We want 'x'.  */
1083 	  if ((ch = GET ()) != '\'')
1084 	    {
1085 #ifdef REQUIRE_CHAR_CLOSE_QUOTE
1086 	      as_warn (_("missing close quote; (assumed)"));
1087 #else
1088 	      if (ch != EOF)
1089 		UNGET (ch);
1090 #endif
1091 	    }
1092 	  if (strlen (out_buf) == 1)
1093 	    {
1094 	      PUT (out_buf[0]);
1095 	      break;
1096 	    }
1097 	  if (state == 9)
1098 	    old_state = 3;
1099 	  else
1100 	    old_state = state;
1101 	  state = -1;
1102 	  out_string = out_buf;
1103 	  PUT (*out_string++);
1104 	  break;
1105 #endif
1106 
1107 	case LEX_IS_COLON:
1108 #ifdef KEEP_WHITE_AROUND_COLON
1109 	  state = 9;
1110 #else
1111 	  if (state == 9 || state == 10)
1112 	    state = 3;
1113 	  else if (state != 3)
1114 	    state = 1;
1115 #endif
1116 	  PUT (ch);
1117 	  break;
1118 
1119 	case LEX_IS_NEWLINE:
1120 	  /* Roll out a bunch of newlines from inside comments, etc.  */
1121 	  if (add_newlines)
1122 	    {
1123 	      --add_newlines;
1124 	      UNGET (ch);
1125 	    }
1126 	  /* Fall through.  */
1127 
1128 	case LEX_IS_LINE_SEPARATOR:
1129 	  state = 0;
1130 	  PUT (ch);
1131 	  break;
1132 
1133 	case LEX_IS_PARALLEL_SEPARATOR:
1134 	  state = 1;
1135 	  PUT (ch);
1136 	  break;
1137 
1138 #ifdef TC_V850
1139 	case LEX_IS_DOUBLEDASH_1ST:
1140 	  ch2 = GET ();
1141 	  if (ch2 != '-')
1142 	    {
1143 	      if (ch2 != EOF)
1144 		UNGET (ch2);
1145 	      goto de_fault;
1146 	    }
1147 	  /* Read and skip to end of line.  */
1148 	  do
1149 	    {
1150 	      ch = GET ();
1151 	    }
1152 	  while (ch != EOF && ch != '\n');
1153 
1154 	  if (ch == EOF)
1155 	    as_warn (_("end of file in comment; newline inserted"));
1156 
1157 	  state = 0;
1158 	  PUT ('\n');
1159 	  break;
1160 #endif
1161 #ifdef DOUBLEBAR_PARALLEL
1162 	case LEX_IS_DOUBLEBAR_1ST:
1163 	  ch2 = GET ();
1164 	  if (ch2 != EOF)
1165 	    UNGET (ch2);
1166 	  if (ch2 != '|')
1167 	    goto de_fault;
1168 
1169 	  /* Handle '||' in two states as invoking PUT twice might
1170 	     result in the first one jumping out of this loop.  We'd
1171 	     then lose track of the state and one '|' char.  */
1172 	  state = 13;
1173 	  PUT ('|');
1174 	  break;
1175 #endif
1176 	case LEX_IS_LINE_COMMENT_START:
1177 	  /* FIXME-someday: The two character comment stuff was badly
1178 	     thought out.  On i386, we want '/' as line comment start
1179 	     AND we want C style comments.  hence this hack.  The
1180 	     whole lexical process should be reworked.  xoxorich.  */
1181 	  if (ch == '/')
1182 	    {
1183 	      ch2 = GET ();
1184 	      if (ch2 == '*')
1185 		{
1186 		  old_state = 3;
1187 		  state = -2;
1188 		  break;
1189 		}
1190 	      else
1191 		{
1192 		  UNGET (ch2);
1193 		}
1194 	    }
1195 
1196 	  if (state == 0 || state == 1)	/* Only comment at start of line.  */
1197 	    {
1198 	      int startch;
1199 
1200 	      startch = ch;
1201 
1202 	      do
1203 		{
1204 		  ch = GET ();
1205 		}
1206 	      while (ch != EOF && IS_WHITESPACE (ch));
1207 
1208 	      if (ch == EOF)
1209 		{
1210 		  as_warn (_("end of file in comment; newline inserted"));
1211 		  PUT ('\n');
1212 		  break;
1213 		}
1214 
1215 	      if (ch < '0' || ch > '9' || state != 0 || startch != '#')
1216 		{
1217 		  /* Not a cpp line.  */
1218 		  while (ch != EOF && !IS_NEWLINE (ch))
1219 		    ch = GET ();
1220 		  if (ch == EOF)
1221 		    {
1222 		      as_warn (_("end of file in comment; newline inserted"));
1223 		      PUT ('\n');
1224 		    }
1225 		  else /* IS_NEWLINE (ch) */
1226 		    {
1227 		      /* To process non-zero add_newlines.  */
1228 		      UNGET (ch);
1229 		    }
1230 		  state = 0;
1231 		  break;
1232 		}
1233 	      /* Looks like `# 123 "filename"' from cpp.  */
1234 	      UNGET (ch);
1235 	      old_state = 4;
1236 	      state = -1;
1237 	      if (scrub_m68k_mri)
1238 		out_string = "\tlinefile ";
1239 	      else
1240 		out_string = "\t.linefile ";
1241 	      PUT (*out_string++);
1242 	      break;
1243 	    }
1244 
1245 #ifdef TC_D10V
1246 	  /* All insns end in a char for which LEX_IS_SYMBOL_COMPONENT is true.
1247 	     Trap is the only short insn that has a first operand that is
1248 	     neither register nor label.
1249 	     We must prevent exef0f ||trap #1 from degenerating to exef0f ||trap#1.
1250 	     We can't make '#' LEX_IS_SYMBOL_COMPONENT because it is
1251 	     already LEX_IS_LINE_COMMENT_START.  However, it is the
1252 	     only character in line_comment_chars for d10v, hence we
1253 	     can recognize it as such.  */
1254 	  /* An alternative approach would be to reset the state to 1 when
1255 	     we see '||', '<-' or '->', but that seems to be overkill.  */
1256 	  if (state == 10)
1257 	    PUT (' ');
1258 #endif
1259 	  /* We have a line comment character which is not at the
1260 	     start of a line.  If this is also a normal comment
1261 	     character, fall through.  Otherwise treat it as a default
1262 	     character.  */
1263 	  if (strchr (tc_comment_chars, ch) == NULL
1264 	      && (! scrub_m68k_mri
1265 		  || (ch != '!' && ch != '*')))
1266 	    goto de_fault;
1267 	  if (scrub_m68k_mri
1268 	      && (ch == '!' || ch == '*' || ch == '#')
1269 	      && state != 1
1270 	      && state != 10)
1271 	    goto de_fault;
1272 	  /* Fall through.  */
1273 	case LEX_IS_COMMENT_START:
1274 #if defined TC_ARM && defined OBJ_ELF
1275 	  /* On the ARM, `@' is the comment character.
1276 	     Unfortunately this is also a special character in ELF .symver
1277 	     directives (and .type, though we deal with those another way).
1278 	     So we check if this line is such a directive, and treat
1279 	     the character as default if so.  This is a hack.  */
1280 	  if ((symver_state != NULL) && (*symver_state == 0))
1281 	    goto de_fault;
1282 #endif
1283 
1284 #ifdef TC_ARM
1285 	  /* For the ARM, care is needed not to damage occurrences of \@
1286 	     by stripping the @ onwards.  Yuck.  */
1287 	  if (to > tostart && *(to - 1) == '\\')
1288 	    /* Do not treat the @ as a start-of-comment.  */
1289 	    goto de_fault;
1290 #endif
1291 
1292 #ifdef WARN_COMMENTS
1293 	  if (!found_comment)
1294 	    as_where (&found_comment_file, &found_comment);
1295 #endif
1296 	  do
1297 	    {
1298 	      ch = GET ();
1299 	    }
1300 	  while (ch != EOF && !IS_NEWLINE (ch));
1301 	  if (ch == EOF)
1302 	    as_warn (_("end of file in comment; newline inserted"));
1303 	  state = 0;
1304 	  PUT ('\n');
1305 	  break;
1306 
1307 #ifdef H_TICK_HEX
1308 	case LEX_IS_H:
1309 	  /* Look for strings like H'[0-9A-Fa-f] and if found, replace
1310 	     the H' with 0x to make them gas-style hex characters.  */
1311 	  if (enable_h_tick_hex)
1312 	    {
1313 	      char quot;
1314 
1315 	      quot = GET ();
1316 	      if (quot == '\'')
1317 		{
1318 		  UNGET ('x');
1319 		  ch = '0';
1320 		}
1321 	      else
1322 		UNGET (quot);
1323 	    }
1324 	  /* FALL THROUGH */
1325 #endif
1326 
1327 	case LEX_IS_SYMBOL_COMPONENT:
1328 	  if (state == 10)
1329 	    {
1330 	      /* This is a symbol character following another symbol
1331 		 character, with whitespace in between.  We skipped
1332 		 the whitespace earlier, so output it now.  */
1333 	      UNGET (ch);
1334 	      state = 3;
1335 	      PUT (' ');
1336 	      break;
1337 	    }
1338 
1339 #ifdef TC_Z80
1340 	  /* "af'" is a symbol containing '\''.  */
1341 	  if (state == 3 && (ch == 'a' || ch == 'A'))
1342 	    {
1343 	      state = 16;
1344 	      PUT (ch);
1345 	      ch = GET ();
1346 	      if (ch == 'f' || ch == 'F')
1347 		{
1348 		  state = 17;
1349 		  PUT (ch);
1350 		  break;
1351 		}
1352 	      else
1353 		{
1354 		  state = 9;
1355 		  if (ch == EOF || !IS_SYMBOL_COMPONENT (ch))
1356 		    {
1357 		      if (ch != EOF)
1358 			UNGET (ch);
1359 		      break;
1360 		    }
1361 		}
1362 	    }
1363 #endif
1364 	  if (state == 3)
1365 	    state = 9;
1366 
1367 	  /* This is a common case.  Quickly copy CH and all the
1368 	     following symbol component or normal characters.  */
1369 	  if (to + 1 < toend
1370 	      && mri_state == NULL
1371 #if defined TC_ARM && defined OBJ_ELF
1372 	      && symver_state == NULL
1373 #endif
1374 	      )
1375 	    {
1376 	      char *s;
1377 	      ptrdiff_t len;
1378 
1379 	      for (s = from; s < fromend; s++)
1380 		{
1381 		  int type;
1382 
1383 		  ch2 = *(unsigned char *) s;
1384 		  type = lex[ch2];
1385 		  if (type != 0
1386 		      && type != LEX_IS_SYMBOL_COMPONENT)
1387 		    break;
1388 		}
1389 
1390 	      if (s > from)
1391 		/* Handle the last character normally, for
1392 		   simplicity.  */
1393 		--s;
1394 
1395 	      len = s - from;
1396 
1397 	      if (len > (toend - to) - 1)
1398 		len = (toend - to) - 1;
1399 
1400 	      if (len > 0)
1401 		{
1402 		  PUT (ch);
1403 		  memcpy (to, from, len);
1404 		  to += len;
1405 		  from += len;
1406 		  if (to >= toend)
1407 		    goto tofull;
1408 		  ch = GET ();
1409 		}
1410 	    }
1411 
1412 	  /* Fall through.  */
1413 	default:
1414 	de_fault:
1415 	  /* Some relatively `normal' character.  */
1416 	  if (state == 0)
1417 	    {
1418 	      state = 11;	/* Now seeing label definition.  */
1419 	    }
1420 	  else if (state == 1)
1421 	    {
1422 	      state = 2;	/* Ditto.  */
1423 	    }
1424 	  else if (state == 9)
1425 	    {
1426 	      if (!IS_SYMBOL_COMPONENT (ch))
1427 		state = 3;
1428 	    }
1429 	  else if (state == 10)
1430 	    {
1431 	      if (ch == '\\')
1432 		{
1433 		  /* Special handling for backslash: a backslash may
1434 		     be the beginning of a formal parameter (of a
1435 		     macro) following another symbol character, with
1436 		     whitespace in between.  If that is the case, we
1437 		     output a space before the parameter.  Strictly
1438 		     speaking, correct handling depends upon what the
1439 		     macro parameter expands into; if the parameter
1440 		     expands into something which does not start with
1441 		     an operand character, then we don't want to keep
1442 		     the space.  We don't have enough information to
1443 		     make the right choice, so here we are making the
1444 		     choice which is more likely to be correct.  */
1445 		  if (to + 1 >= toend)
1446 		    {
1447 		      /* If we're near the end of the buffer, save the
1448 		         character for the next time round.  Otherwise
1449 		         we'll lose our state.  */
1450 		      UNGET (ch);
1451 		      goto tofull;
1452 		    }
1453 		  *to++ = ' ';
1454 		}
1455 
1456 	      state = 3;
1457 	    }
1458 	  PUT (ch);
1459 	  break;
1460 	}
1461     }
1462 
1463   /*NOTREACHED*/
1464 
1465  fromeof:
1466   /* We have reached the end of the input.  */
1467   return to - tostart;
1468 
1469  tofull:
1470   /* The output buffer is full.  Save any input we have not yet
1471      processed.  */
1472   if (fromend > from)
1473     {
1474       saved_input = from;
1475       saved_input_len = fromend - from;
1476     }
1477   else
1478     saved_input = NULL;
1479 
1480   return to - tostart;
1481 }
1482