1 /*  GNU SED, a batch stream editor.
2     Copyright (C) 1989,90,91,92,93,94,95,98,99,2002,2003
3     Free Software Foundation, Inc.
4 
5     This program is free software; you can redistribute it and/or modify
6     it under the terms of the GNU General Public License as published by
7     the Free Software Foundation; either version 3, or (at your option)
8     any later version.
9 
10     This program is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13     GNU General Public License for more details.
14 
15     You should have received a copy of the GNU General Public License
16     along with this program; if not, write to the Free Software
17     Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
18 
19 #ifdef HAVE_CONFIG_H
20 #include "config.h"
21 #endif
22 
23 #include "basicdefs.h"
24 #include "regex.h"
25 
26 #ifndef BOOTSTRAP
27 #include <stdio.h>
28 #include "unlocked-io.h"
29 #endif
30 
31 #include "utils.h"
32 
33 /* Struct vector is used to describe a compiled sed program.
   `v' points at a dynamically allocated array of struct sed_cmd;
   v_allocated is the number of slots that array has room for and
   v_length is how many of those slots are currently in use. */
34 struct vector {
35   struct sed_cmd *v;	/* a dynamically allocated array */
36   size_t v_allocated;	/* ... number of slots allocated */
37   size_t v_length;	/* ... number of slots in use */
38 };
39 
40 /* This structure tracks files used by sed so that they may all be
41    closed cleanly at normal program termination.  A flag is kept that tells
42    if a missing newline was encountered, so that it is added on the
43    next line and the two lines are not concatenated.  */
/* In this header it is referenced by the `w' option of the s command
   (struct subst, member outf) and by the w command (struct sed_cmd,
   member x.outf).  */
44 struct output {
45   char *name;		/* NOTE(review): presumably the file's name -- confirm */
46   bool missing_newline;	/* the "missing newline was encountered" flag described above */
47   FILE *fp;		/* stdio stream associated with this entry */
48   struct output *link;	/* NOTE(review): presumably the next tracked file in a list -- confirm */
49 };
50 
/* A block of text together with its length.  struct sed_cmd stores one
   of these (member x.cmd_txt) for the a, i, and c commands.  */
51 struct text_buf {
52   char *text;		/* the text itself */
53   size_t text_length;	/* length of `text' (NOTE(review): presumably a byte count -- confirm) */
54 };
55 
/* A regular expression as used by addresses (struct addr, member
   addr_regex) and by the s command (struct subst, member regx).
   Objects of this type are produced by compile_regex and consumed by
   match_regex, both declared further down in this header.  */
56 struct regex {
57   regex_t pattern;	/* regex_t object from "regex.h" */
58   int flags;		/* NOTE(review): presumably the flags given to compile_regex -- confirm */
59   size_t sz;		/* NOTE(review): presumably the number of bytes stored in `re' -- confirm */
  /* One-element trailing array.  NOTE(review): this looks like the
     pre-C99 "struct hack", i.e. the object is presumably over-allocated
     so that `re' holds the whole pattern text.  Do not turn it into a
     flexible array member without checking every place that computes the
     allocation size from sizeof (struct regex).  */
60   char re[1];
61 };
62 
/* Case-conversion selector stored in struct replacement's repl_type
   member.  The values are bit flags: REPL_UPPERCASE and REPL_LOWERCASE
   occupy bits 0 and 1, the two *_FIRST values occupy bits 2 and 3, and
   REPL_MODIFIERS is the mask covering just the *_FIRST bits.
   NOTE(review): judging by the names, the *_FIRST values presumably
   affect only the first character of the replaced text -- confirm against
   the code that interprets repl_type.  */
63 enum replacement_types {
64   REPL_ASIS = 0,
65   REPL_UPPERCASE = 1,
66   REPL_LOWERCASE = 2,
67   REPL_UPPERCASE_FIRST = 4,
68   REPL_LOWERCASE_FIRST = 8,
69   REPL_MODIFIERS = REPL_UPPERCASE_FIRST | REPL_LOWERCASE_FIRST,
70 
71   /* These are given to aid in debugging */
  /* Each name below is <FIRST-flag>_<whole-text-flag>; the values are
     the corresponding ORs (5, 6, 9 and 10).  */
72   REPL_UPPERCASE_UPPERCASE = REPL_UPPERCASE_FIRST | REPL_UPPERCASE,
73   REPL_UPPERCASE_LOWERCASE = REPL_UPPERCASE_FIRST | REPL_LOWERCASE,
74   REPL_LOWERCASE_UPPERCASE = REPL_LOWERCASE_FIRST | REPL_UPPERCASE,
75   REPL_LOWERCASE_LOWERCASE = REPL_LOWERCASE_FIRST | REPL_LOWERCASE
76 };
77 
/* Kind of text handed to normalize_text (its `buftype' parameter,
   declared further down in this header).  NOTE(review): the meaning of
   each value is inferred from its name only -- confirm in normalize_text.  */
78 enum text_types {
79   TEXT_BUFFER,
80   TEXT_REPLACEMENT,
81   TEXT_REGEX
82 };
83 
/* How strictly POSIX behaviour is to be followed.  This is the type of
   the global `posixicity' declared further down in this header.  */
84 enum posixicity_types {
85   POSIXLY_EXTENDED,	/* with GNU extensions */
86   POSIXLY_CORRECT,	/* with POSIX-compatible GNU extensions */
87   POSIXLY_BASIC		/* pedantically POSIX */
88 };
89 
/* State of an address range; stored per command in struct sed_cmd's
   range_state member.  */
90 enum addr_state {
91   RANGE_INACTIVE,	/* never been active */
92   RANGE_ACTIVE,		/* between first and second address */
93   RANGE_CLOSED		/* like RANGE_INACTIVE, but range has ended once */
94 };
95 
/* Discriminator for struct addr (its addr_type member): it tells which
   of that structure's other members carry meaningful data.  In the
   comments below, "a." stands for a struct addr object.  */
96 enum addr_types {
97   ADDR_IS_NULL,		/* null address */
98   ADDR_IS_REGEX,	/* a.addr_regex is valid */
99   ADDR_IS_NUM,		/* a.addr_number is valid */
100   ADDR_IS_NUM_MOD,	/* a.addr_number is valid, addr_step is modulo */
101   ADDR_IS_STEP,		/* address is +N (only valid for addr2) */
102   ADDR_IS_STEP_MOD,	/* address is ~N (only valid for addr2) */
103   ADDR_IS_LAST		/* address is $ */
104 };
105 
/* One address of a command (struct sed_cmd points at up to two of
   these, a1 and a2).  addr_type selects which of the other members are
   meaningful; see the per-value comments on enum addr_types.  */
106 struct addr {
107   enum addr_types addr_type;	/* discriminator for the members below */
108   countT addr_number;		/* valid for ADDR_IS_NUM and ADDR_IS_NUM_MOD */
109   countT addr_step;		/* the modulo for ADDR_IS_NUM_MOD; NOTE(review): which member holds N for ADDR_IS_STEP / ADDR_IS_STEP_MOD is not visible here -- confirm */
110   struct regex *addr_regex;	/* valid for ADDR_IS_REGEX */
111 };
112 
113 
/* One node of the replacement side of an s command; struct subst
   points at these through its `replacement' member and nodes are chained
   through `next'.  NOTE(review): each node presumably describes a run of
   literal text (`prefix') followed by one backreference (`subst_id') --
   confirm against the code that builds and walks the chain.  */
114 struct replacement {
115   char *prefix;			/* text belonging to this node */
116   size_t prefix_length;		/* length of `prefix' */
117   int subst_id;			/* NOTE(review): presumably a backreference number; check how "none" is encoded */
118   enum replacement_types repl_type;	/* case-conversion flags for this node */
119   struct replacement *next;	/* following node in the chain */
120 };
121 
/* Data for one s command; struct sed_cmd refers to it through
   x.cmd_subst.  The last four members are bit-fields, so their value
   ranges are limited by the declared widths.  */
122 struct subst {
123   struct regex *regx;	/* NOTE(review): presumably the pattern to search for -- confirm */
124   struct replacement *replacement;	/* NOTE(review): presumably the first node of the replacement chain -- confirm */
125   countT numb;		/* if >0, only substitute for match number "numb" */
126   struct output *outf;	/* 'w' option given */
127   unsigned global : 1;	/* 'g' option given */
128   unsigned print : 2;	/* 'p' option given (before/after eval) */
129   unsigned eval : 1;	/* 'e' option given */
130   unsigned max_id : 4;  /* maximum backreference on the RHS */
			/* (4 bits wide, so at most 15 can be stored) */
131 };
132 
/* struct re_registers is only defined here when REG_PERL is defined.
   match_regex (declared further down) takes a struct re_registers *
   unconditionally, so in the other configuration the type must come from
   elsewhere.  NOTE(review): presumably from "regex.h" -- confirm.  */
133 #ifdef REG_PERL
134 /* This is the structure we store register match data in.  See
135    regex.texinfo for a full description of what registers match.  */
136 struct re_registers
137 {
138   unsigned num_regs;	/* NOTE(review): presumably the number of entries in `start' and `end' -- confirm */
139   regoff_t *start;	/* array of match start offsets */
140   regoff_t *end;	/* array of match end offsets */
141 };
142 #endif
143 
144 
145 
/* One command of a compiled sed program (struct vector holds an array
   of these).  It carries the command's optional addresses, the state of
   its address range, the command character, and a union `x' of
   per-command auxiliary data; the comment on each union member says
   which command characters use it.  */
146 struct sed_cmd {
147   struct addr *a1;	/* save space: usually is NULL */
148   struct addr *a2;
149 
150   /* See the description of enum addr_state, above.  */
151   enum addr_state range_state;
152 
153   /* Non-zero if command is to be applied to non-matches. */
154   char addr_bang;
155 
156   /* The actual command character. */
157   char cmd;
158 
159   /* auxiliary data for various commands */
160   union {
161     /* This structure is used for a, i, and c commands. */
162     struct text_buf cmd_txt;
163 
164     /* This is used for the l, q and Q commands. */
165     int int_arg;
166 
167     /* This is used for the {}, b, and t commands. */
168     countT jump_index;
169 
170     /* This is used for the r command. */
171     char *fname;
172 
173     /* This is used for the hairy s command. */
174     struct subst *cmd_subst;
175 
176     /* This is used for the w command. */
177     struct output *outf;
178 
179     /* This is used for the R command. */
180     FILE *fp;
181 
182     /* This is used for the y command. */
    /* NOTE(review): two members are listed for y; presumably
       `translate' is a single-byte table and `translatemb' is the
       multibyte variant -- confirm which one is set in each case.  */
183     unsigned char *translate;
184     char **translatemb;
185   } x;
186 };
187 
188 
189 
/* Interface for building and finishing a compiled program
   (struct vector).  P_ is not defined in this header.  NOTE(review): it
   is presumably the usual prototype-wrapping macro supplied by
   "basicdefs.h" for pre-ANSI compilers -- confirm.

   Only the signatures are visible here:
     - bad_prog takes a message string `why'.
     - normalize_text takes a writable buffer, its length and an
       enum text_types value, and returns a size_t.  NOTE(review):
       presumably the length after normalization -- confirm.
     - compile_string and compile_file each take a struct vector * and a
       source (a string with explicit length, or a file name) and return
       a struct vector *.
     - check_final_program and finish_program each take the program
       vector; rewind_read_files takes no arguments.  */
190 void bad_prog P_((const char *why));
191 size_t normalize_text P_((char *text, size_t len, enum text_types buftype));
192 struct vector *compile_string P_((struct vector *, char *str, size_t len));
193 struct vector *compile_file P_((struct vector *, const char *cmdfile));
194 void check_final_program P_((struct vector *));
195 void rewind_read_files P_((void));
196 void finish_program P_((struct vector *));
197 
/* Regular-expression interface operating on struct regex.

   `struct buffer' is not declared anywhere in this header, and
   `struct re_registers' is declared here only under REG_PERL.  Both must
   therefore already be declared where these prototypes are seen,
   otherwise the struct tags would have prototype scope only.
   NOTE(review): presumably "utils.h" and "regex.h" provide them --
   confirm.

   release_regex is only declared when DEBUG_LEAKS is defined.  */
198 struct regex *compile_regex P_((struct buffer *b, int flags, int needed_sub));
199 int match_regex P_((struct regex *regex,
200 		    char *buf, size_t buflen, size_t buf_start_offset,
201 		    struct re_registers *regarray, int regsize));
202 #ifdef DEBUG_LEAKS
203 void release_regex P_((struct regex *));
204 #endif
205 
/* process_files takes the compiled program and an argv-style array of
   strings and returns an int.  NOTE(review): presumably the input file
   names and the exit status respectively -- confirm.  */
206 int process_files P_((struct vector *, char **argv));
207 
208 int main P_((int, char **));
209 
/* fmt takes the range [line, line_end), a maximum length and the stdio
   stream to write to.  NOTE(review): presumably a line-wrapping
   formatter -- confirm in its definition.  */
210 extern void fmt P_ ((const char *line, const char *line_end, int max_length, FILE *output_file));
211 
/* Global settings shared between translation units.  These are
   declarations only; each variable is defined elsewhere.  */

/* NOTE(review): presumably extra flags applied when regexes are
   compiled in extended mode -- confirm where it is set and used.  */
212 extern int extended_regexp_flags;
213 
214 /* If set, fflush(stdout) on every line output. */
215 extern bool unbuffered_output;
216 
217 /* If set, don't write out the line unless explicitly told to. */
218 extern bool no_default_output;
219 
220 /* If set, reset line counts on every new file. */
221 extern bool separate_files;
222 
223 /* If set, follow symlinks when invoked with -i option */
224 extern bool follow_symlinks;
225 
226 /* Do we need to be pedantically POSIX compliant? */
227 extern enum posixicity_types posixicity;
228 
229 /* How long should the `l' command's output line be? */
230 extern countT lcmd_out_line_len;
231 
232 /* How do we edit files in-place? (we don't if NULL) */
233 extern char *in_place_extension;
234 
235 /* The mode to use to read files, either "rt" or "rb".  */
236 extern char *read_mode;
237 
238 /* Should we use EREs? */
239 extern bool use_extended_syntax_p;
240 
241 /* Declarations for multibyte character sets.  */
/* mb_cur_max is compared against 1 by the MBRTOWC, WCRTOMB, MBSINIT,
   MBRLEN and BRLEN macros below to choose their single-byte branch.
   NOTE(review): is_utf8 presumably records whether the locale's encoding
   is UTF-8 -- confirm where it is set.  */
242 extern int mb_cur_max;
243 extern bool is_utf8;
244 
/* Wrappers around the multibyte conversion functions.

   With HAVE_MBRTOWC defined:
     - MBRTOWC and WCRTOMB call mbrtowc and wcrtomb.  If HAVE_BTOWC is
       also defined and mb_cur_max == 1, they instead convert a single
       byte with btowc / wctob and evaluate to 1.
     - MBSINIT, MBRLEN and BRLEN evaluate to 1 when mb_cur_max == 1, and
       otherwise call mbsinit, mbrtowc (with a NULL destination) and
       brlen respectively.

   Without HAVE_MBRTOWC:
     - MBSINIT, MBRLEN and BRLEN are the constant 1 and do not evaluate
       their arguments.
     - MBRTOWC and WCRTOMB are not defined at all, so code using them
       must itself be conditional on HAVE_MBRTOWC.

   The functions called here (mbrtowc, wcrtomb, btowc, wctob, mbsinit)
   are not declared by this header; their declarations must already be
   in scope where the macros are expanded.  */
245 #ifdef HAVE_MBRTOWC
246 #ifdef HAVE_BTOWC
/* Single-byte branch: store btowc of the byte at S into *PWC, yield 1.  */
247 #define MBRTOWC(pwc, s, n, ps) \
248   (mb_cur_max == 1 ? \
249    (*(pwc) = btowc (*(unsigned char *) (s)), 1) : \
250    mbrtowc ((pwc), (s), (n), (ps)))
251 
/* Single-byte branch: store wctob of WC into *S, yield 1.  */
252 #define WCRTOMB(s, wc, ps) \
253   (mb_cur_max == 1 ? \
254    (*(s) = wctob ((wint_t) (wc)), 1) : \
255    wcrtomb ((s), (wc), (ps)))
256 #else
/* No btowc/wctob available: always go through mbrtowc / wcrtomb.  */
257 #define MBRTOWC(pwc, s, n, ps) \
258   mbrtowc ((pwc), (s), (n), (ps))
259 
260 #define WCRTOMB(s, wc, ps) \
261   wcrtomb ((s), (wc), (ps))
262 #endif
263 
264 #define MBSINIT(s) \
265   (mb_cur_max == 1 ? 1 : mbsinit ((s)))
266 
/* Calls mbrtowc with a NULL destination.  */
267 #define MBRLEN(s, n, ps) \
268   (mb_cur_max == 1 ? 1 : mbrtowc (NULL, s, n, ps))
269 
270 #define BRLEN(ch, ps) \
271   (mb_cur_max == 1 ? 1 : brlen (ch, ps))
272 
273 #else
/* No multibyte support: every character is treated as one byte long.  */
274 #define MBSINIT(s) 1
275 #define MBRLEN(s, n, ps) 1
276 #define BRLEN(ch, ps) 1
277 #endif
278 
/* Multibyte helpers.  brlen is the function the BRLEN macro calls when
   mb_cur_max != 1; it takes a character value and a conversion state.
   NOTE(review): it presumably reports progress through a multibyte
   sequence one byte at a time -- confirm in its definition.
   Both prototypes are declared unconditionally, so mbstate_t must
   already be declared by an earlier include even when HAVE_MBRTOWC is
   not defined.  */
279 extern int brlen P_ ((int ch, mbstate_t *ps));
280 extern void initialize_mbcs P_ ((void));
281 
282