/*
 * Copyright 2011 - 2015
 * Andr\xe9 Malo or his licensors, as applicable
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

18 #include "cext.h"
19 EXT_INIT_FUNC;
20 
/*
 * Character class flags.  Every ASCII character owns one entry in
 * rjsmin_charmask[] which is the OR of the flags below; rjsmin() queries
 * them through the RJSMIN_IS_* macros.
 */
#define RJSMIN_DULL_BIT           (1 << 0)  /* plain code char, copied as is */
#define RJSMIN_PRE_REGEX_BIT      (1 << 1)  /* a '/' after it opens a regex */
#define RJSMIN_REGEX_DULL_BIT     (1 << 2)  /* nothing special inside /.../ */
#define RJSMIN_REGEX_CC_DULL_BIT  (1 << 3)  /* nothing special inside [...] */
#define RJSMIN_ID_LIT_BIT         (1 << 4)  /* needs a space next to its kind */
#define RJSMIN_ID_LIT_O_BIT       (1 << 5)  /* right side of a kept newline */
#define RJSMIN_ID_LIT_C_BIT       (1 << 6)  /* left side of a kept newline */
#define RJSMIN_STRING_DULL_BIT    (1 << 7)  /* nothing special in a string */
#define RJSMIN_SPACE_BIT          (1 << 8)  /* whitespace other than CR / LF */
#define RJSMIN_POST_REGEX_OFF_BIT (1 << 9)  /* keeps a newline after a regex */

/*
 * Character type of the buffers handled by rjsmin(): Py_UNICODE when EXT3 is
 * defined, raw bytes otherwise.
 */
#ifdef EXT3
typedef Py_UNICODE rchar;
#else
typedef unsigned char rchar;
#endif

/* Convert a character constant / value to the buffer character type */
#define U(c) ((rchar)(c))

/*
 * Class tests which are TRUE for every character above 127; ASCII characters
 * are looked up in rjsmin_charmask[].  Each macro evaluates `c` twice, so the
 * argument must be free of side effects.
 */
#define RJSMIN_IS_DULL(c) ((U(c) > 127) || \
    (rjsmin_charmask[U(c) & 0x7F] & RJSMIN_DULL_BIT))

#define RJSMIN_IS_REGEX_DULL(c) ((U(c) > 127) || \
    (rjsmin_charmask[U(c) & 0x7F] & RJSMIN_REGEX_DULL_BIT))

#define RJSMIN_IS_REGEX_CC_DULL(c) ((U(c) > 127) || \
    (rjsmin_charmask[U(c) & 0x7F] & RJSMIN_REGEX_CC_DULL_BIT))

#define RJSMIN_IS_STRING_DULL(c) ((U(c) > 127) || \
    (rjsmin_charmask[U(c) & 0x7F] & RJSMIN_STRING_DULL_BIT))

#define RJSMIN_IS_ID_LITERAL(c) ((U(c) > 127) || \
    (rjsmin_charmask[U(c) & 0x7F] & RJSMIN_ID_LIT_BIT))

#define RJSMIN_IS_ID_LITERAL_OPEN(c) ((U(c) > 127) || \
    (rjsmin_charmask[U(c) & 0x7F] & RJSMIN_ID_LIT_O_BIT))

#define RJSMIN_IS_ID_LITERAL_CLOSE(c) ((U(c) > 127) || \
    (rjsmin_charmask[U(c) & 0x7F] & RJSMIN_ID_LIT_C_BIT))

#define RJSMIN_IS_POST_REGEX_OFF(c) ((U(c) > 127) || \
    (rjsmin_charmask[U(c) & 0x7F] & RJSMIN_POST_REGEX_OFF_BIT))

/*
 * Class tests which are FALSE for every character above 127 (same double
 * evaluation caveat as above).
 */
#define RJSMIN_IS_SPACE(c) ((U(c) <= 127) && \
    (rjsmin_charmask[U(c) & 0x7F] & RJSMIN_SPACE_BIT))

#define RJSMIN_IS_PRE_REGEX_1(c) ((U(c) <= 127) && \
    (rjsmin_charmask[U(c) & 0x7F] & RJSMIN_PRE_REGEX_BIT))


/*
 * Flag table, indexed by ASCII code (0 .. 127).  Each value is a combination
 * of the RJSMIN_*_BIT flags defined above.
 */
static const unsigned short rjsmin_charmask[128] = {
    396, 396, 396, 396, 396, 396, 396, 396,  /* 0x00 - 0x07 */
    396, 396,   2, 396, 396,   2, 396, 396,  /* 0x08 - 0x0F (LF, CR = 2) */
    396, 396, 396, 396, 396, 396, 396, 396,  /* 0x10 - 0x17 */
    396, 396, 396, 396, 396, 396, 396, 396,  /* 0x18 - 0x1F */
    396, 687, 588, 653, 765, 653, 143, 588,  /* 0x20 - 0x27 */
    687, 205, 653, 237, 143, 237, 141, 648,  /* 0x28 - 0x2F */
    765, 765, 765, 765, 765, 765, 765, 765,  /* 0x30 - 0x37 */
    765, 765, 143, 143, 653, 143, 653, 143,  /* 0x38 - 0x3F */
    653, 765, 765, 765, 765, 765, 765, 765,  /* 0x40 - 0x47 */
    765, 765, 765, 765, 765, 765, 765, 765,  /* 0x48 - 0x4F */
    765, 765, 765, 765, 765, 765, 765, 765,  /* 0x50 - 0x57 */
    765, 765, 765, 683, 513, 197, 653, 765,  /* 0x58 - 0x5F */
    653, 765, 765, 765, 765, 765, 765, 765,  /* 0x60 - 0x67 */
    765, 765, 765, 765, 765, 765, 765, 765,  /* 0x68 - 0x6F */
    765, 765, 765, 765, 765, 765, 765, 765,  /* 0x70 - 0x77 */
    765, 765, 765, 687, 143, 207, 653, 765   /* 0x78 - 0x7F */
};

89 static Py_ssize_t
rjsmin(const rchar * source,rchar * target,Py_ssize_t length,int keep_bang_comments)90 rjsmin(const rchar *source, rchar *target, Py_ssize_t length,
91        int keep_bang_comments)
92 {
93     const rchar *reset, *pcreset = NULL, *pctoken = NULL, *xtarget,
94                 *sentinel = source + length;
95     rchar *tstart = target;
96     int post_regex = 0;
97     rchar c, quote, spaced = U(' ');
98 
99     while (source < sentinel) {
100         c = *source++;
101         if (RJSMIN_IS_DULL(c)) {
102             if (post_regex) post_regex = 0;
103             if (pctoken) pctoken = NULL;
104             if (spaced == U('\n')) spaced = U(' ');
105 
106             *target++ = c;
107             continue;
108         }
109         switch (c) {
110 
111         /* String */
112         case U('\''): case U('"'):
113             if (post_regex) post_regex = 0;
114             if (pctoken) pctoken = NULL;
115             if (spaced == U('\n')) spaced = U(' ');
116 
117             reset = source;
118             *target++ = quote = c;
119             while (source < sentinel) {
120                 c = *source++;
121                 *target++ = c;
122                 if (RJSMIN_IS_STRING_DULL(c))
123                     continue;
124                 switch (c) {
125                 case U('\''): case U('"'):
126                     if (c == quote)
127                         goto cont;
128                     continue;
129                 case U('\\'):
130                     if (source < sentinel) {
131                         c = *source++;
132                         *target++ = c;
133                         if (c == U('\r') && source < sentinel
134                             && *source == U('\n'))
135                             *target++ = *source++;
136                     }
137                     continue;
138                 }
139                 break;
140             }
141             target -= source - reset;
142             source = reset;
143             continue;
144 
145         /* Comment or Regex or something else entirely */
146         case U('/'):
147             if (!(source < sentinel)) {
148                 if (post_regex) post_regex = 0;
149                 if (pctoken) pctoken = NULL;
150                 if (spaced == U('\n')) spaced = U(' ');
151 
152                 *target++ = c;
153             }
154             else {
155                 switch (*source) {
156             /* Comment */
157                 case U('*'): case U('/'):
158                     goto skip_or_copy_ws;
159 
160                 default:
161                     xtarget = NULL;
162                     if (   target == tstart
163                         || RJSMIN_IS_PRE_REGEX_1(*((pctoken ? pctoken : target)
164                                                    - 1))
165                         || (
166                             (xtarget = pctoken ? pctoken : target)
167                             && (xtarget - tstart >= 6)
168                             && *(xtarget - 1) == U('n')
169                             && *(xtarget - 2) == U('r')
170                             && *(xtarget - 3) == U('u')
171                             && *(xtarget - 4) == U('t')
172                             && *(xtarget - 5) == U('e')
173                             && *(xtarget - 6) == U('r')
174                             && (
175                                    xtarget - tstart == 6
176                                 || !RJSMIN_IS_ID_LITERAL(*(xtarget - 7))
177                             )
178                         )) {
179 
180             /* Regex */
181                         if (post_regex) post_regex = 0;
182                         if (pctoken) pctoken = NULL;
183 
184                         reset = source;
185                         if (spaced == U('\n')) {
186                             spaced = U(' ');
187                             if (xtarget)
188                                 *target++ = U('\n');
189                         }
190 
191                         *target++ = U('/');
192                         while (source < sentinel) {
193                             c = *source++;
194                             *target++ = c;
195                             if (RJSMIN_IS_REGEX_DULL(c))
196                                 continue;
197                             switch (c) {
198                             case U('/'):
199                                 post_regex = 1;
200                                 goto cont;
201                             case U('\\'):
202                                 if (source < sentinel) {
203                                     c = *source++;
204                                     *target++ = c;
205                                     if (c == U('\r') || c == U('\n'))
206                                         break;
207                                 }
208                                 continue;
209                             case U('['):
210                                 while (source < sentinel) {
211                                     c = *source++;
212                                     *target++ = c;
213                                     if (RJSMIN_IS_REGEX_CC_DULL(c))
214                                         continue;
215                                     switch (c) {
216                                     case U('\\'):
217                                         if (source < sentinel) {
218                                             c = *source++;
219                                             *target++ = c;
220                                             if (c == U('\r') || c == U('\n'))
221                                                 break;
222                                         }
223                                         continue;
224                                     case U(']'):
225                                         goto cont_regex;
226                                     }
227                                 }
228                                 break;
229                             }
230                             break;
231                         cont_regex:
232                             continue;
233                         }
234                         target -= source - reset;
235                         source = reset;
236                     }
237                     else {
238             /* Just a slash */
239                         if (post_regex) post_regex = 0;
240                         if (pctoken) pctoken = NULL;
241                         if (spaced == U('\n')) spaced = U(' ');
242 
243                         *target++ = c;
244                     }
245                     continue;
246                 }
247             }
248             continue;
249 
250         /* Whitespace */
251         default:
252         skip_or_copy_ws:
253             quote = U(' ');
254             --source;
255             while (source < sentinel) {
256                 c = *source++;
257                 if (RJSMIN_IS_SPACE(c))
258                     continue;
259                 switch (c) {
260                 case U('\r'): case U('\n'):
261                     quote = U('\n');
262                     continue;
263                 case U('/'):
264                     if (source < sentinel) {
265                         switch (*source) {
266                         case U('*'):
267                             reset = source++;
268                             /* copy bang comment, if requested */
269                             if (   keep_bang_comments && source < sentinel
270                                 && *source == U('!')) {
271                                 if (!pctoken) {
272                                     pctoken = target;
273                                     pcreset = reset;
274                                 }
275 
276                                 *target++ = U('/');
277                                 *target++ = U('*');
278                                 *target++ = *source++;
279                                 while (source < sentinel) {
280                                     c = *source++;
281                                     *target++ = c;
282                                     if (c == U('*') && source < sentinel
283                                         && *source == U('/')) {
284                                         *target++ = *source++;
285                                         reset = NULL;
286                                         break;
287                                     }
288                                 }
289                                 if (!reset)
290                                     continue;
291 
292                                 target -= source - reset;
293                                 source = reset;
294                                 if (pcreset == reset) {
295                                     pctoken = NULL;
296                                     pcreset = NULL;
297                                 }
298 
299                             }
300                             /* strip regular comment */
301                             else {
302                                 while (source < sentinel) {
303                                     c = *source++;
304                                     if (c == U('*') && source < sentinel
305                                         && *source == U('/')) {
306                                         ++source;
307                                         reset = NULL;
308                                         break;
309                                     }
310                                 }
311                                 if (!reset)
312                                     continue;
313                                 source = reset;
314                                 *target++ = U('/');
315                             }
316                             goto cont;
317                         case U('/'):
318                             ++source;
319                             while (source < sentinel) {
320                                 c = *source++;
321                                 switch (c) {
322                                 case U('\n'):
323                                     break;
324                                 case U('\r'):
325                                     if (source < sentinel
326                                         && *source == U('\n'))
327                                         ++source;
328                                     break;
329                                 default:
330                                     continue;
331                                 }
332                                 break;
333                             }
334                             quote = U('\n');
335                             continue;
336                         }
337                     }
338                 }
339                 --source;
340                 break;
341             }
342 
343             if ((tstart < (pctoken ? pctoken : target) && source < sentinel)
344                 && ((quote == U('\n')
345                      && ((RJSMIN_IS_ID_LITERAL_CLOSE(*((pctoken ?
346                                                         pctoken : target) - 1))
347                           && RJSMIN_IS_ID_LITERAL_OPEN(*source))
348                          || (post_regex
349                              && RJSMIN_IS_POST_REGEX_OFF(*source)
350                              && !(post_regex = 0))))
351                     ||
352                     (quote == U(' ') && !pctoken
353                      && ((RJSMIN_IS_ID_LITERAL(*(target - 1))
354                           && RJSMIN_IS_ID_LITERAL(*source))
355                          || (source < sentinel
356                              && ((*(target - 1) == U('+')
357                                   && *source == U('+'))
358                                  || (*(target - 1) == U('-')
359                                      && *source == U('-')))))))) {
360                 *target++ = quote;
361             }
362 
363             pcreset = NULL;
364             spaced = quote;
365         }
366     cont:
367         continue;
368     }
369     return (Py_ssize_t)(target - tstart);
370 }
371 
372 
373 PyDoc_STRVAR(rjsmin_jsmin__doc__,
374 "jsmin(script, keep_bang_comments=False)\n\
375 \n\
376 Minify javascript based on `jsmin.c by Douglas Crockford`_\\.\n\
377 \n\
378 Instead of parsing the stream char by char, it uses a regular\n\
379 expression approach which minifies the whole script with one big\n\
380 substitution regex.\n\
381 \n\
382 .. _jsmin.c by Douglas Crockford:\n\
383    http://www.crockford.com/javascript/jsmin.c\n\
384 \n\
385 :Note: This is a hand crafted C implementation built on the regex\n\
386        semantics.\n\
387 \n\
388 :Parameters:\n\
389   `script` : ``str``\n\
390     Script to minify\n\
391 \n\
392   `keep_bang_comments` : ``bool``\n\
393     Keep comments starting with an exclamation mark? (``/*!...*/``)\n\
394 \n\
395 :Return: Minified script\n\
396 :Rtype: ``str``");
397 
398 static PyObject *
rjsmin_jsmin(PyObject * self,PyObject * args,PyObject * kwds)399 rjsmin_jsmin(PyObject *self, PyObject *args, PyObject *kwds)
400 {
401     PyObject *script, *keep_bang_comments_ = NULL, *result;
402     static char *kwlist[] = {"script", "keep_bang_comments", NULL};
403     Py_ssize_t slength, length;
404     int keep_bang_comments;
405 #ifdef EXT2
406     int uni;
407 #define UOBJ "O"
408 #endif
409 #ifdef EXT3
410 #define UOBJ "U"
411 #endif
412 
413     if (!PyArg_ParseTupleAndKeywords(args, kwds, UOBJ "|O", kwlist,
414                                      &script, &keep_bang_comments_))
415         return NULL;
416 
417     if (!keep_bang_comments_)
418         keep_bang_comments = 0;
419     else {
420         keep_bang_comments = PyObject_IsTrue(keep_bang_comments_);
421         if (keep_bang_comments == -1)
422             return NULL;
423     }
424 
425 #ifdef EXT2
426     if (PyUnicode_Check(script)) {
427         if (!(script = PyUnicode_AsUTF8String(script)))
428             return NULL;
429         uni = 1;
430     }
431     else {
432         if (!(script = PyObject_Str(script)))
433             return NULL;
434         uni = 0;
435     }
436 #endif
437 
438 #ifdef EXT3
439     Py_INCREF(script);
440 #define PyString_GET_SIZE PyUnicode_GET_SIZE
441 #define PyString_AS_STRING PyUnicode_AS_UNICODE
442 #define _PyString_Resize PyUnicode_Resize
443 #define PyString_FromStringAndSize PyUnicode_FromUnicode
444 #endif
445 
446     slength = PyString_GET_SIZE(script);
447     if (!(result = PyString_FromStringAndSize(NULL, slength))) {
448         Py_DECREF(script);
449         return NULL;
450     }
451     Py_BEGIN_ALLOW_THREADS
452     length = rjsmin((rchar *)PyString_AS_STRING(script),
453                     (rchar *)PyString_AS_STRING(result),
454                     slength, keep_bang_comments);
455     Py_END_ALLOW_THREADS
456 
457     Py_DECREF(script);
458     if (length < 0) {
459         Py_DECREF(result);
460         return NULL;
461     }
462     if (length != slength && _PyString_Resize(&result, length) == -1)
463         return NULL;
464 
465 #ifdef EXT2
466     if (uni) {
467         script = PyUnicode_DecodeUTF8(PyString_AS_STRING(result),
468                                       PyString_GET_SIZE(result), "strict");
469         Py_DECREF(result);
470         if (!script)
471             return NULL;
472         result = script;
473     }
474 #endif
475     return result;
476 }
477 
478 /* ------------------------ BEGIN MODULE DEFINITION ------------------------ */
479 
480 EXT_METHODS = {
481     {"jsmin",
482         (PyCFunction)rjsmin_jsmin, METH_VARARGS | METH_KEYWORDS,
483         rjsmin_jsmin__doc__},
484 
485     {NULL}  /* Sentinel */
486 };
487 
488 PyDoc_STRVAR(EXT_DOCS_VAR,
489 "C implementation of rjsmin\n\
490 ==========================\n\
491 \n\
492 C implementation of rjsmin.");
493 
494 
495 EXT_DEFINE(EXT_MODULE_NAME, EXT_METHODS_VAR, EXT_DOCS_VAR);
496 
497 EXT_INIT_FUNC {
498     PyObject *m;
499 
500     /* Create the module and populate stuff */
501     if (!(m = EXT_CREATE(&EXT_DEFINE_VAR)))
502         EXT_INIT_ERROR(NULL);
503 
504     EXT_ADD_UNICODE(m, "__author__", "Andr\xe9 Malo", "latin-1");
505     EXT_ADD_STRING(m, "__docformat__", "restructuredtext en");
506 
507     EXT_INIT_RETURN(m);
508 }
509 
510 /* ------------------------- END MODULE DEFINITION ------------------------- */
511