1 /* `L' command implementation for GNU sed, based on GNU fmt 1.22.
2    Copyright (C) 1994, 1995, 1996, 2002, 2003 Free Software Foundation, Inc.
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation; either version 3, or (at your option)
7    any later version.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program; if not, write to the Free Software Foundation,
16    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
17 
18 /* GNU fmt was written by Ross Paterson <rap@doc.ic.ac.uk>.  */
19 
20 #include "sed.h"
21 
22 #include <stdio.h>
23 #include <string.h>
24 #include <ctype.h>
25 #include <sys/types.h>
26 
27 #if HAVE_LIMITS_H
28 # include <limits.h>
29 #endif
30 
31 #ifndef UINT_MAX
32 # define UINT_MAX ((unsigned int) ~(unsigned int) 0)
33 #endif
34 
35 #ifndef INT_MAX
36 # define INT_MAX ((int) (UINT_MAX >> 1))
37 #endif
38 
39 /* The following parameters represent the program's idea of what is
40    "best".  Adjust to taste, subject to the caveats given.  */
41 
42 /* Prefer lines to be LEEWAY % shorter than the maximum width, giving
43    room for optimization.  */
44 #define	LEEWAY	7
45 
46 /* Costs and bonuses are expressed as the equivalent departure from the
47    optimal line length, multiplied by 10.  e.g. assigning something a
48    cost of 50 means that it is as bad as a line 5 characters too short
49    or too long.  The definition of SHORT_COST(n) should not be changed.
50    However, EQUIV(n) may need tuning.  */
51 
52 typedef long COST;
53 
54 #define	MAXCOST	(~(((unsigned long) 1) << (8 * sizeof (COST) -1)))
55 
56 #define	SQR(n)		((n) * (n))
57 #define	EQUIV(n)	SQR ((COST) (n))
58 
59 /* Cost of a filled line n chars longer or shorter than best_width.  */
60 #define	SHORT_COST(n)	EQUIV ((n) * 10)
61 
62 /* Cost of the difference between adjacent filled lines.  */
63 #define	RAGGED_COST(n)	(SHORT_COST (n) / 2)
64 
65 /* Basic cost per line.  */
66 #define	LINE_COST	EQUIV (70)
67 
68 /* Cost of breaking a line after the first word of a sentence, where
69    the length of the word is N.  */
70 #define	WIDOW_COST(n)	(EQUIV (200) / ((n) + 2))
71 
72 /* Cost of breaking a line before the last word of a sentence, where
73    the length of the word is N.  */
74 #define	ORPHAN_COST(n)	(EQUIV (150) / ((n) + 2))
75 
76 /* Bonus for breaking a line at the end of a sentence.  */
77 #define	SENTENCE_BONUS	EQUIV (50)
78 
79 /* Cost of breaking a line after a period not marking end of a sentence.
80    With the definition of sentence we are using (borrowed from emacs, see
81    get_line()) such a break would then look like a sentence break.  Hence
82    we assign a very high cost -- it should be avoided unless things are
83    really bad.  */
84 #define	NOBREAK_COST	EQUIV (600)
85 
86 /* Bonus for breaking a line before open parenthesis.  */
87 #define	PAREN_BONUS	EQUIV (40)
88 
89 /* Bonus for breaking a line after other punctuation.  */
90 #define	PUNCT_BONUS	EQUIV(40)
91 
92 /* Credit for breaking a long paragraph one line later.  */
93 #define	LINE_CREDIT	EQUIV(3)
94 
95 /* Size of paragraph buffer in words.  Longer paragraphs are handled
96    neatly (cf. flush_paragraph()), so there's little to gain by making
97    these larger.  */
98 #define	MAXWORDS	1000
99 
100 #define GETC()          (parabuf == end_of_parabuf ? EOF : *parabuf++)
101 
102 /* Extra ctype(3)-style macros.  */
103 
104 #define	isopen(c)	(strchr ("([`'\"", (c)) != NULL)
105 #define	isclose(c)	(strchr (")]'\"", (c)) != NULL)
106 #define	isperiod(c)	(strchr (".?!", (c)) != NULL)
107 
108 /* Size of a tab stop, for expansion on input and re-introduction on
109    output.  */
110 #define	TABWIDTH	8
111 
112 /* Word descriptor structure.  */
113 
114 typedef struct Word WORD;
115 
116 struct Word
117   {
118 
119     /* Static attributes determined during input.  */
120 
121     const char *text;		/* the text of the word */
122     short length;		/* length of this word */
123     short space;		/* the size of the following space */
124     unsigned paren:1;		/* starts with open paren */
125     unsigned period:1;		/* ends in [.?!])* */
126     unsigned punct:1;		/* ends in punctuation */
127     unsigned final:1;		/* end of sentence */
128 
129     /* The remaining fields are computed during the optimization.  */
130 
131     short line_length;		/* length of the best line starting here */
132     COST best_cost;		/* cost of best paragraph starting here */
133     WORD *next_break;		/* break which achieves best_cost */
134   };
135 
136 /* Forward declarations.  */
137 
138 static bool get_paragraph P_ ((void));
139 static int get_line P_ ((int c));
140 static int get_space P_ ((int c));
141 static int copy_rest P_ ((int c));
142 static bool same_para P_ ((int c));
143 static void flush_paragraph P_ ((void));
144 static void fmt_paragraph P_ ((void));
145 static void check_punctuation P_ ((WORD *w));
146 static COST base_cost P_ ((WORD *this));
147 static COST line_cost P_ ((WORD *next, int len));
148 static void put_paragraph P_ ((WORD *finish));
149 static void put_line P_ ((WORD *w, int indent));
150 static void put_word P_ ((WORD *w));
151 static void put_space P_ ((int space));
152 
153 /* Option values.  */
154 
155 /* User-supplied maximum line width (default WIDTH).  The only output
156    lines
157    longer than this will each comprise a single word.  */
158 static int max_width;
159 
160 /* Space for the paragraph text.  */
161 static const char *parabuf;
162 
163 /* End of space for the paragraph text.  */
164 static const char *end_of_parabuf;
165 
166 /* The file on which we output */
167 static FILE *outfile;
168 
169 /* Values derived from the option values.  */
170 
171 /* The preferred width of text lines, set to LEEWAY % less than max_width.  */
172 static int best_width;
173 
174 /* Dynamic variables.  */
175 
176 /* Start column of the character most recently read from the input file.  */
177 static int in_column;
178 
179 /* Start column of the next character to be written to stdout.  */
180 static int out_column;
181 
182 /* The words of a paragraph -- longer paragraphs are handled neatly
183    (cf. flush_paragraph()).  */
184 static WORD words[MAXWORDS];
185 
186 /* A pointer into the above word array, indicating the first position
187    after the last complete word.  Sometimes it will point at an incomplete
188    word.  */
189 static WORD *word_limit;
190 
191 /* Indentation of the first line of the current paragraph.  */
192 static int first_indent;
193 
194 /* Indentation of other lines of the current paragraph */
195 static int other_indent;
196 
197 /* The last character read from the input file.  */
198 static int next_char;
199 
200 /* If nonzero, the length of the last line output in the current
201    paragraph, used to charge for raggedness at the split point for long
202    paragraphs chosen by fmt_paragraph().  */
203 static int last_line_length;
204 
205 /* read file F and send formatted output to stdout.  */
206 
207 void
fmt(const char * line,const char * line_end,int max_length,FILE * output_file)208 fmt (const char *line, const char *line_end, int max_length, FILE *output_file)
209 {
210   parabuf = line;
211   end_of_parabuf = line_end;
212   outfile = output_file;
213 
214   max_width = max_length;
215   best_width = max_width * (201 - 2 * LEEWAY) / 200;
216 
217   in_column = 0;
218   other_indent = 0;
219   next_char = GETC();
220   while (get_paragraph ())
221     {
222       fmt_paragraph ();
223       put_paragraph (word_limit);
224     }
225 }
226 
227 /* Read a paragraph from input file F.  A paragraph consists of a
228    maximal number of non-blank (excluding any prefix) lines
229    with the same indent.
230 
231    Return false if end-of-file was encountered before the start of a
232    paragraph, else true.  */
233 
234 static bool
get_paragraph()235 get_paragraph ()
236 {
237   register int c;
238 
239   last_line_length = 0;
240   c = next_char;
241 
242   /* Scan (and copy) blank lines, and lines not introduced by the prefix.  */
243 
244   while (c == '\n' || c == EOF)
245     {
246       c = copy_rest (c);
247       if (c == EOF)
248 	{
249 	  next_char = EOF;
250 	  return false;
251 	}
252       putc ('\n', outfile);
253       c = GETC();
254     }
255 
256   /* Got a suitable first line for a paragraph.  */
257 
258   first_indent = in_column;
259   word_limit = words;
260   c = get_line (c);
261 
262   /* Read rest of paragraph.  */
263 
264   other_indent = in_column;
265   while (same_para (c) && in_column == other_indent)
266     c = get_line (c);
267 
268   (word_limit - 1)->period = (word_limit - 1)->final = true;
269   next_char = c;
270   return true;
271 }
272 
273 /* Copy to the output a blank line.  In the latter, C is \n or EOF.
274    Return the character (\n or EOF) ending the line.  */
275 
276 static int
copy_rest(register int c)277 copy_rest (register int c)
278 {
279   out_column = 0;
280   while (c != '\n' && c != EOF)
281     {
282       putc (c, outfile);
283       c = GETC();
284     }
285   return c;
286 }
287 
288 /* Return true if a line whose first non-blank character after the
289    prefix (if any) is C could belong to the current paragraph,
290    otherwise false.  */
291 
292 static bool
same_para(register int c)293 same_para (register int c)
294 {
295   return (c != '\n' && c != EOF);
296 }
297 
298 /* Read a line from the input data given first non-blank character C
299    after the prefix, and the following indent, and break it into words.
300    A word is a maximal non-empty string of non-white characters.  A word
301    ending in [.?!]["')\]]* and followed by end-of-line or at least two
302    spaces ends a sentence, as in emacs.
303 
304    Return the first non-blank character of the next line.  */
305 
306 static int
get_line(register int c)307 get_line (register int c)
308 {
309   int start;
310   register WORD *end_of_word;
311 
312   end_of_word = &words[MAXWORDS - 2];
313 
314   do
315     {				/* for each word in a line */
316 
317       /* Scan word.  */
318 
319       word_limit->text = parabuf - 1;
320       do
321 	c = GETC();
322       while (c != EOF && !ISSPACE (c));
323       word_limit->length = parabuf - word_limit->text - (c != EOF);
324       in_column += word_limit->length;
325 
326       check_punctuation (word_limit);
327 
328       /* Scan inter-word space.  */
329 
330       start = in_column;
331       c = get_space (c);
332       word_limit->space = in_column - start;
333       word_limit->final = (c == EOF
334 			   || (word_limit->period
335 			       && (c == '\n' || word_limit->space > 1)));
336       if (c == '\n' || c == EOF)
337 	word_limit->space = word_limit->final ? 2 : 1;
338       if (word_limit == end_of_word)
339 	flush_paragraph ();
340       word_limit++;
341       if (c == EOF)
342 	{
343 	  in_column = first_indent;
344 	  return EOF;
345 	}
346     }
347   while (c != '\n');
348 
349   in_column = 0;
350   c = GETC();
351   return get_space (c);
352 }
353 
354 /* Read blank characters from the input data, starting with C, and keeping
355    in_column up-to-date.  Return first non-blank character.  */
356 
357 static int
get_space(register int c)358 get_space (register int c)
359 {
360   for (;;)
361     {
362       if (c == ' ')
363 	in_column++;
364       else if (c == '\t')
365 	in_column = (in_column / TABWIDTH + 1) * TABWIDTH;
366       else
367 	return c;
368       c = GETC();
369     }
370 }
371 
372 /* Set extra fields in word W describing any attached punctuation.  */
373 
374 static void
check_punctuation(register WORD * w)375 check_punctuation (register WORD *w)
376 {
377   register const char *start, *finish;
378 
379   start = w->text;
380   finish = start + (w->length - 1);
381   w->paren = isopen (*start);
382   w->punct = ISPUNCT (*finish);
383   while (isclose (*finish) && finish > start)
384     finish--;
385   w->period = isperiod (*finish);
386 }
387 
388 /* Flush part of the paragraph to make room.  This function is called on
389    hitting the limit on the number of words or characters.  */
390 
391 static void
flush_paragraph(void)392 flush_paragraph (void)
393 {
394   WORD *split_point;
395   register WORD *w;
396   COST best_break;
397 
398   /* - format what you have so far as a paragraph,
399      - find a low-cost line break near the end,
400      - output to there,
401      - make that the start of the paragraph.  */
402 
403   fmt_paragraph ();
404 
405   /* Choose a good split point.  */
406 
407   split_point = word_limit;
408   best_break = MAXCOST;
409   for (w = words->next_break; w != word_limit; w = w->next_break)
410     {
411       if (w->best_cost - w->next_break->best_cost < best_break)
412 	{
413 	  split_point = w;
414 	  best_break = w->best_cost - w->next_break->best_cost;
415 	}
416       if (best_break <= MAXCOST - LINE_CREDIT)
417 	best_break += LINE_CREDIT;
418     }
419   put_paragraph (split_point);
420 
421   /* Copy words from split_point down to word -- we use memmove because
422      the source and target may overlap.  */
423 
424   memmove ((char *) words, (char *) split_point,
425 	 (word_limit - split_point + 1) * sizeof (WORD));
426   word_limit -= split_point - words;
427 }
428 
429 /* Compute the optimal formatting for the whole paragraph by computing
430    and remembering the optimal formatting for each suffix from the empty
431    one to the whole paragraph.  */
432 
433 static void
fmt_paragraph(void)434 fmt_paragraph (void)
435 {
436   register WORD *start, *w;
437   register int len;
438   register COST wcost, best;
439   int saved_length;
440 
441   word_limit->best_cost = 0;
442   saved_length = word_limit->length;
443   word_limit->length = max_width;	/* sentinel */
444 
445   for (start = word_limit - 1; start >= words; start--)
446     {
447       best = MAXCOST;
448       len = start == words ? first_indent : other_indent;
449 
450       /* At least one word, however long, in the line.  */
451 
452       w = start;
453       len += w->length;
454       do
455 	{
456 	  w++;
457 
458 	  /* Consider breaking before w.  */
459 
460 	  wcost = line_cost (w, len) + w->best_cost;
461 	  if (start == words && last_line_length > 0)
462 	    wcost += RAGGED_COST (len - last_line_length);
463 	  if (wcost < best)
464 	    {
465 	      best = wcost;
466 	      start->next_break = w;
467 	      start->line_length = len;
468 	    }
469 	  len += (w - 1)->space + w->length;	/* w > start >= words */
470 	}
471       while (len < max_width);
472       start->best_cost = best + base_cost (start);
473     }
474 
475   word_limit->length = saved_length;
476 }
477 
478 /* Return the constant component of the cost of breaking before the
479    word THIS.  */
480 
481 static COST
base_cost(register WORD * this)482 base_cost (register WORD *this)
483 {
484   register COST cost;
485 
486   cost = LINE_COST;
487 
488   if (this > words)
489     {
490       if ((this - 1)->period)
491 	{
492 	  if ((this - 1)->final)
493 	    cost -= SENTENCE_BONUS;
494 	  else
495 	    cost += NOBREAK_COST;
496 	}
497       else if ((this - 1)->punct)
498 	cost -= PUNCT_BONUS;
499       else if (this > words + 1 && (this - 2)->final)
500 	cost += WIDOW_COST ((this - 1)->length);
501     }
502 
503   if (this->paren)
504     cost -= PAREN_BONUS;
505   else if (this->final)
506     cost += ORPHAN_COST (this->length);
507 
508   return cost;
509 }
510 
511 /* Return the component of the cost of breaking before word NEXT that
512    depends on LEN, the length of the line beginning there.  */
513 
514 static COST
line_cost(register WORD * next,register int len)515 line_cost (register WORD *next, register int len)
516 {
517   register int n;
518   register COST cost;
519 
520   if (next == word_limit)
521     return 0;
522   n = best_width - len;
523   cost = SHORT_COST (n);
524   if (next->next_break != word_limit)
525     {
526       n = len - next->line_length;
527       cost += RAGGED_COST (n);
528     }
529   return cost;
530 }
531 
532 /* Output to stdout a paragraph from word up to (but not including)
533    FINISH, which must be in the next_break chain from word.  */
534 
535 static void
put_paragraph(register WORD * finish)536 put_paragraph (register WORD *finish)
537 {
538   register WORD *w;
539 
540   put_line (words, first_indent);
541   for (w = words->next_break; w != finish; w = w->next_break)
542     put_line (w, other_indent);
543 }
544 
545 /* Output to stdout the line beginning with word W, beginning in column
546    INDENT, including the prefix (if any).  */
547 
548 static void
put_line(register WORD * w,int indent)549 put_line (register WORD *w, int indent)
550 {
551   register WORD *endline;
552   out_column = 0;
553   put_space (indent);
554 
555   endline = w->next_break - 1;
556   for (; w != endline; w++)
557     {
558       put_word (w);
559       put_space (w->space);
560     }
561   put_word (w);
562   last_line_length = out_column;
563   putc ('\n', outfile);
564 }
565 
566 /* Output to stdout the word W.  */
567 
568 static void
put_word(register WORD * w)569 put_word (register WORD *w)
570 {
571   register const char *s;
572   register int n;
573 
574   s = w->text;
575   for (n = w->length; n != 0; n--)
576     putc (*s++, outfile);
577   out_column += w->length;
578 }
579 
580 /* Output to stdout SPACE spaces, or equivalent tabs.  */
581 
582 static void
put_space(int space)583 put_space (int space)
584 {
585   out_column += space;
586   while (space--)
587     putc (' ', outfile);
588 }
589