1#! /usr/bin/perl -w
2
3# Script to turn PCRE man pages into HTML
4
5
6# Subroutine to handle font changes and other escapes
7
# do_line(TEXT)
#
# Convert one line of man-page text to HTML and return the result. The
# caller's argument is copied and is not modified. In order:
#
#   - "&", "<" and ">" are replaced by character references, so that text
#     such as "&lt;" or "&regex" is displayed literally instead of being
#     read as an HTML character reference;
#   - \fI...\fR (or \fP) becomes <i>...</i> and \fB...\fR (or \fP) becomes
#     <b>...</b>, when the font change is closed within the same text;
#   - the troff escape \e becomes a single backslash;
#   - "(c)" immediately following "Copyright " becomes &copy;.

sub do_line {
my($s) = $_[0];

# The ampersand must be dealt with first; otherwise the references that are
# generated for < and > below would themselves be escaped.

$s =~ s/&/&amp;/g;
$s =~ s/</&#60;/g;                   # Deal with < and >
$s =~ s/>/&#62;/g;
$s =~ s"\\fI(.*?)\\f[RP]"<i>$1</i>"g;
$s =~ s"\\fB(.*?)\\f[RP]"<b>$1</b>"g;
$s =~ s"\\e"\\"g;
$s =~ s/(?<=Copyright )\(c\)/&copy;/g;
$s;
}
19
20# Subroutine to ensure not in a paragraph
21
# end_para()
#
# Close the current paragraph, if there is one, by writing the closing tags
# to TEMP: </PRE> first when a literal section is open ($inpre), then </P>.
# Whether or not a paragraph was open, the $inpara, $inpre and $wrotetext
# flags are all reset to zero.

sub end_para {
if ($inpara)
  {
  my $closing = "</P>\n";
  $closing = "</PRE>\n" . $closing if ($inpre);
  print TEMP $closing;
  }
($inpara, $inpre) = (0, 0);
return ($wrotetext = 0);
}
31
32# Subroutine to start a new paragraph
33
# new_para()
#
# Begin a new paragraph: any paragraph that is currently open is closed by
# end_para() (which also resets the state flags), then <P> is written to
# TEMP and $inpara is set.

sub new_para {
end_para();
print TEMP "<P>\n";
return ($inpara = 1);
}
39
40
41# Main program
42
# Conversion state. These are package variables (not lexicals) because the
# subroutines above read and write them.
#
#   $innf       set between .nf and .fi
#   $inpara     set while a <P> is open
#   $inpre      set while a <pre> literal section is open
#   $wrotetext  set once text has been output since the last paragraph end
#   $toc        set by the -toc option: write a table of contents
#   $ref        number used for the next TOC/SEC anchor pair

$innf = $inpara = $inpre = $wrotetext = 0;
$toc = 0;
$ref = 1;

# Consume leading arguments that start with a hyphen. Only -toc has any
# effect; any other such argument is discarded without comment.

while (@ARGV && $ARGV[0] =~ /^-/)
  {
  my $option = shift(@ARGV);
  $toc = 1 if ($option eq "-toc");
  }
55
56# Initial output to STDOUT
57
# Write the fixed start of the page straight to STDOUT. The first remaining
# command-line argument is used as the page name in the title and heading.

print <<"HEADER";
<html>
<head>
<title>$ARGV[0] specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>$ARGV[0] man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
HEADER

# With -toc, the contents list is opened here; its entries are written as
# the section headings are encountered.

if ($toc)
  {
  print "<ul>\n";
  }
76
# The body of the page is collected in a temporary file, because the table
# of contents entries are written to STDOUT while the input is being read;
# the body is copied to STDOUT afterwards. The three-argument form of open
# keeps the mode separate from the file name, and the system error is
# reported if the open fails.
#
# NOTE(review): the name is predictable (the process id, in /tmp), and an
# existing file of that name is overwritten.

open(TEMP, ">", "/tmp/$$") || die "Can't open /tmp/$$ for output: $!\n";
78
# Read the man page from STDIN, one line at a time. Section titles are also
# written to STDOUT as table-of-contents entries when -toc was given; all
# other output goes to TEMP. Several of the handlers below read further
# lines from STDIN themselves. If the input ends where such a line was
# expected, the missing line is treated as empty, so that no undefined value
# is used.

while (<STDIN>)
  {
  # Handle lines beginning with a dot

  if (/^\./)
    {
    # Some of the PCRE man pages used to contain instances of .br. However,
    # they should have all been removed because they cause trouble in some
    # (other) automated systems that translate man pages to HTML. Complain if
    # we find .br or .in (another macro that is deprecated).

    if (/^\.br/ || /^\.in/)
      {
      print STDERR "\n*** Deprecated macro encountered - rewrite needed\n";
      print STDERR "*** $_\n";
      die "*** Processing abandoned\n";
      }

    # Instead of .br, relevant "literal" sections are enclosed in .nf/.fi.

    elsif (/^\.nf/)
      {
      $innf = 1;
      }

    elsif (/^\.fi/)
      {
      $innf = 0;
      }

    # Handling .sp is subtle. If it is inside a literal section, do nothing if
    # the next line is a non literal text line; similarly, if not inside a
    # literal section, do nothing if a literal follows, unless we are inside
    # a .nf/.fi section. The point being that the <pre> and </pre> that delimit
    # literal sections will do the spacing. Always skip if no previous output.

    elsif (/^\.sp/)
      {
      if ($wrotetext)
        {
        $_ = <STDIN>;
        $_ = "" unless defined($_);      # .sp was the last line of input
        if ($inpre)
          {
          print TEMP "\n" if (/^[\s.]/);
          }
        else
          {
          print TEMP "<br>\n<br>\n" if ($innf || !/^[\s.]/);
          }
        redo;    # Now process the lookahead line we just read
        }
      }
    elsif (/^\.TP/ || /^\.PP/ || /^\.P/)
      {
      &new_para();
      }
    elsif (/^\.SH\s*("?)(.*)\1/)
      {
      # Ignore the NAME section: discard the heading and the line after it
      if ($2 =~ /^NAME\b/)
        {
        <STDIN>;
        next;
        }

      &end_para();
      my($title) = &do_line($2);
      if ($toc)
        {
        # The title is passed as an argument for %s. It must not be made part
        # of the format string, because a % in a title would then be taken as
        # a conversion specification.

        printf("<li><a name=\"TOC%d\" href=\"#SEC%d\">%s</a>\n",
          $ref, $ref, $title);
        printf TEMP ("<br><a name=\"SEC%d\" href=\"#TOC1\">%s</a><br>\n",
          $ref, $title);
        $ref++;
        }
      else
        {
        print TEMP "<br><b>\n$title\n</b><br>\n";
        }
      }
    elsif (/^\.SS\s*("?)(.*)\1/)
      {
      &end_para();
      my($title) = &do_line($2);
      print TEMP "<br><b>\n$title\n</b><br>\n";
      }

    # .B and .I output the rest of the line in bold or italic, with any
    # double quotes that enclose parts of the text removed.

    elsif (/^\.B\s*(.*)/)
      {
      &new_para() if (!$inpara);
      $_ = &do_line($1);
      s/"(.*?)"/$1/g;
      print TEMP "<b>$_</b>\n";
      $wrotetext = 1;
      }
    elsif (/^\.I\s*(.*)/)
      {
      &new_para() if (!$inpara);
      $_ = &do_line($1);
      s/"(.*?)"/$1/g;
      print TEMP "<i>$_</i>\n";
      $wrotetext = 1;
      }

    # A comment that starts "HREF" takes the next line as a name that
    # is turned into a hyperlink, using the text given, which might be
    # in a special font. If it ends in () or (digits) or punctuation, they
    # aren't part of the link. If no name can be extracted from the line, the
    # text is output without a link and the problem is reported, rather than
    # writing a link to ".html".

    elsif (/^\.\\"\s*HREF/)
      {
      $_=<STDIN>;
      $_ = "" unless defined($_);        # HREF was the last line of input
      chomp;
      $_ = &do_line($_);
      $_ =~ s/\s+$//;
      if (/^(?:<.>)?([^<(]+)(?:\(\))?(?:<\/.>)?(?:\(\d+\))?[.,;:]?$/)
        {
        print TEMP "<a href=\"$1.html\">$_</a>\n";
        }
      else
        {
        print STDERR "*** Cannot derive a link target from \"$_\" - no link made\n";
        print TEMP "$_\n";
        }
      }

    # A comment that starts "HTML" inserts literal HTML

    elsif (/^\.\\"\s*HTML\s*(.*)/)
      {
      print TEMP $1;
      }

    # A comment that starts < inserts that HTML at the end of the
    # *next* input line - so as not to get a newline between them.

    elsif (/^\.\\"\s*(<.*>)/)
      {
      my($markup) = $1;
      $_=<STDIN>;
      $_ = "" unless defined($_);        # No line follows the markup
      chomp;
      $_ = &do_line($_);
      $_ =~ s/\s+$//;
      print TEMP "$_$markup\n";
      }

    # A comment that starts JOIN joins the next two lines together, with one
    # space between them. Then that line is processed. This is used in some
    # displays where two lines are needed for the "man" version. JOINSH works
    # the same, except that it assumes this is a shell command, so removes
    # continuation backslashes.

    elsif (/^\.\\"\s*JOIN(SH)?/)
      {
      my($shell) = defined($1);          # Remember before $1 can change
      my($one,$two);
      $one = <STDIN>;
      $two = <STDIN>;
      $one = "" unless defined($one);    # Input ended before both lines
      $two = "" unless defined($two);
      $one =~ s/\s*\\e\s*$// if ($shell);
      chomp($one);
      $two =~ s/^\s+//;
      $_ = "$one $two";
      redo;            # Process the joined lines
      }

    # .EX/.EE are used in the pcredemo page to bracket the entire program,
    # which is unmodified except for turning backslash into "\e". The lines
    # are copied with \e turned back into backslash and with &, < and >
    # escaped. The <PRE> is closed when .EE (or the end of the input) is
    # reached, so that whatever follows is not part of the literal text.

    elsif (/^\.EX\s*$/)
      {
      print TEMP "<PRE>\n";
      while (<STDIN>)
        {
        last if /^\.EE\s*$/;
        s/\\e/\\/g;
        s/&/&amp;/g;
        s/</&lt;/g;
        s/>/&gt;/g;
        print TEMP;
        }
      print TEMP "</PRE>\n";
      }

    # Ignore anything not recognized

    next;
    }

  # Line does not begin with a dot. Replace blank lines with new paragraphs

  if (/^\s*$/)
    {
    &end_para() if ($wrotetext);
    next;
    }

  # Convert fonts changes and output an ordinary line. Ensure that indented
  # lines are marked as literal.

  $_ = &do_line($_);
  &new_para() if (!$inpara);

  if (/^\s/)
    {
    if (!$inpre)
      {
      print TEMP "<pre>\n";
      $inpre = 1;
      }
    }
  elsif ($inpre)
    {
    print TEMP "</pre>\n";
    $inpre = 0;
    }

  # Add <br> to the end of a non-literal line if we are within .nf/.fi

  $_ .= "<br>\n" if (!$inpre && $innf);

  print TEMP;
  $wrotetext = 1;
  }
292
293# The TOC, if present, will have been written - terminate it
294
print "</ul>\n" if ($toc);

# Finish writing the body text. The result of close is checked because a
# failed write (for example, when the disk is full) may be reported only
# when the buffered output is flushed; carrying on regardless would produce
# a truncated page without any indication.

close(TEMP) || die "Can't finish writing /tmp/$$: $!\n";

# Copy the remainder to the standard output

open(TEMP, "<", "/tmp/$$") || die "Can't open /tmp/$$ for input: $!\n";

print while (<TEMP>);

print <<End ;
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
End

# The page is complete at this point, so a failure to remove the temporary
# file is reported but is not treated as fatal.

close(TEMP);
unlink("/tmp/$$") || print STDERR "Can't remove /tmp/$$: $!\n";
312
313# End
314