1 /*
2  * Copyright 2011 - 2014
3  * Andr\xe9 Malo or his licensors, as applicable
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  *     http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 #include "cext.h"
19 EXT_INIT_FUNC;
20 
/*
 * Character class flags. Each entry of rjsmin_charmask is an OR-combination
 * of these bits; the RJSMIN_IS_* macros below test one bit each.
 */
#define RJSMIN_DULL_BIT          (1 << 0)
#define RJSMIN_PRE_REGEX_BIT     (1 << 1)
#define RJSMIN_REGEX_DULL_BIT    (1 << 2)
#define RJSMIN_REGEX_CC_DULL_BIT (1 << 3)
#define RJSMIN_ID_LIT_BIT        (1 << 4)
#define RJSMIN_ID_LIT_O_BIT      (1 << 5)
#define RJSMIN_ID_LIT_C_BIT      (1 << 6)
#define RJSMIN_STRING_DULL_BIT   (1 << 7)
#define RJSMIN_SPACE_BIT         (1 << 8)

/*
 * Element type of the buffers being minified: Py_UNICODE when EXT3 is
 * defined, a plain byte otherwise.
 */
#ifdef EXT3
typedef Py_UNICODE rchar;
#else
typedef unsigned char rchar;
#endif

/* Convert a character constant (or any value) to the buffer element type */
#define U(c) ((rchar)(c))

/*
 * Look up one flag for c in the 128 entry table. The index is masked to
 * seven bits, so the lookup itself is always in range; whether a value
 * above 127 counts as a member of the class is decided by each macro below.
 *
 * Note: c is expanded more than once by the macros in this section, so it
 * must not have side effects.
 */
#define RJSMIN_MASK_TEST(c, bit) \
    (rjsmin_charmask[U(c) & 0x7F] & (bit))

/*
 * Classes that every character above 127 belongs to
 */
#define RJSMIN_IS_DULL(c) ((U(c) > 127) || \
    RJSMIN_MASK_TEST(c, RJSMIN_DULL_BIT))

#define RJSMIN_IS_REGEX_DULL(c) ((U(c) > 127) || \
    RJSMIN_MASK_TEST(c, RJSMIN_REGEX_DULL_BIT))

#define RJSMIN_IS_REGEX_CC_DULL(c) ((U(c) > 127) || \
    RJSMIN_MASK_TEST(c, RJSMIN_REGEX_CC_DULL_BIT))

#define RJSMIN_IS_STRING_DULL(c) ((U(c) > 127) || \
    RJSMIN_MASK_TEST(c, RJSMIN_STRING_DULL_BIT))

#define RJSMIN_IS_ID_LITERAL(c) ((U(c) > 127) || \
    RJSMIN_MASK_TEST(c, RJSMIN_ID_LIT_BIT))

#define RJSMIN_IS_ID_LITERAL_OPEN(c) ((U(c) > 127) || \
    RJSMIN_MASK_TEST(c, RJSMIN_ID_LIT_O_BIT))

#define RJSMIN_IS_ID_LITERAL_CLOSE(c) ((U(c) > 127) || \
    RJSMIN_MASK_TEST(c, RJSMIN_ID_LIT_C_BIT))

/*
 * Classes that no character above 127 belongs to
 */
#define RJSMIN_IS_SPACE(c) ((U(c) <= 127) && \
    RJSMIN_MASK_TEST(c, RJSMIN_SPACE_BIT))

#define RJSMIN_IS_PRE_REGEX_1(c) ((U(c) <= 127) && \
    RJSMIN_MASK_TEST(c, RJSMIN_PRE_REGEX_BIT))
64 
65 
/*
 * Character class table, indexed by character code 0..127.
 *
 * Every entry is an OR-combination of the RJSMIN_*_BIT flags and is only
 * read through the RJSMIN_IS_* macros. Some decoded values, for orientation:
 *
 *   396 = SPACE | STRING_DULL | REGEX_CC_DULL | REGEX_DULL
 *         (control characters and the blank)
 *     2 = PRE_REGEX only (line feed and carriage return)
 *    76 = ID_LIT_C | REGEX_CC_DULL | REGEX_DULL (both quote characters)
 *   136 = STRING_DULL | REGEX_CC_DULL (the slash)
 *     1 = DULL only (the backslash)
 *   253 = every flag except PRE_REGEX and SPACE (digits, letters, $, _)
 *
 * The element type has to be wider than eight bits because
 * RJSMIN_SPACE_BIT is bit 8.
 */
static const unsigned short rjsmin_charmask[128] = {
    /* 0x00 */ 396, 396, 396, 396, 396, 396, 396, 396,
    /* 0x08 */ 396, 396,   2, 396, 396,   2, 396, 396,
    /* 0x10 */ 396, 396, 396, 396, 396, 396, 396, 396,
    /* 0x18 */ 396, 396, 396, 396, 396, 396, 396, 396,
    /* 0x20 */ 396, 175,  76, 141, 253, 141, 143,  76,
    /* 0x28 */ 175, 205, 141, 237, 143, 237, 141, 136,
    /* 0x30 */ 253, 253, 253, 253, 253, 253, 253, 253,
    /* 0x38 */ 253, 253, 143, 143, 141, 143, 141, 143,
    /* 0x40 */ 141, 253, 253, 253, 253, 253, 253, 253,
    /* 0x48 */ 253, 253, 253, 253, 253, 253, 253, 253,
    /* 0x50 */ 253, 253, 253, 253, 253, 253, 253, 253,
    /* 0x58 */ 253, 253, 253, 171,   1, 197, 141, 253,
    /* 0x60 */ 141, 253, 253, 253, 253, 253, 253, 253,
    /* 0x68 */ 253, 253, 253, 253, 253, 253, 253, 253,
    /* 0x70 */ 253, 253, 253, 253, 253, 253, 253, 253,
    /* 0x78 */ 253, 253, 253, 175, 143, 207, 141, 253
};
84 
85 static Py_ssize_t
rjsmin(const rchar * source,rchar * target,Py_ssize_t length,int keep_bang_comments)86 rjsmin(const rchar *source, rchar *target, Py_ssize_t length,
87        int keep_bang_comments)
88 {
89     const rchar *reset, *sentinel = source + length;
90     rchar *tstart = target;
91     rchar c, quote;
92 
93     while (source < sentinel) {
94         c = *source++;
95         if (RJSMIN_IS_DULL(c)) {
96             *target++ = c;
97             continue;
98         }
99         switch (c) {
100 
101         /* String */
102         case U('\''): case U('"'):
103             reset = source;
104             *target++ = quote = c;
105             while (source < sentinel) {
106                 c = *source++;
107                 *target++ = c;
108                 if (RJSMIN_IS_STRING_DULL(c))
109                     continue;
110                 switch (c) {
111                 case U('\''): case U('"'):
112                     if (c == quote)
113                         goto cont;
114                     continue;
115                 case U('\\'):
116                     if (source < sentinel) {
117                         c = *source++;
118                         *target++ = c;
119                         if (c == U('\r') && source < sentinel
120                             && *source == U('\n'))
121                             *target++ = *source++;
122                     }
123                     continue;
124                 }
125                 break;
126             }
127             target -= source - reset;
128             source = reset;
129             continue;
130 
131         /* Comment or Regex or something else entirely */
132         case U('/'):
133             if (!(source < sentinel)) {
134                 *target++ = c;
135             }
136             else {
137                 switch (*source) {
138             /* Comment */
139                 case U('*'): case U('/'):
140                     goto skip_or_copy_ws;
141 
142                 default:
143                     if (   target == tstart
144                         || RJSMIN_IS_PRE_REGEX_1(*(target - 1))
145                         || (
146                             (target - tstart >= 6)
147                             && *(target - 1) == U('n')
148                             && *(target - 2) == U('r')
149                             && *(target - 3) == U('u')
150                             && *(target - 4) == U('t')
151                             && *(target - 5) == U('e')
152                             && *(target - 6) == U('r')
153                             && (
154                                    target - tstart == 6
155                                 || !RJSMIN_IS_ID_LITERAL(*(target - 7))
156                             )
157                         )) {
158 
159             /* Regex */
160                         reset = source;
161                         *target++ = U('/');
162                         while (source < sentinel) {
163                             c = *source++;
164                             *target++ = c;
165                             if (RJSMIN_IS_REGEX_DULL(c))
166                                 continue;
167                             switch (c) {
168                             case U('/'):
169                                 goto cont;
170                             case U('\\'):
171                                 if (source < sentinel) {
172                                     c = *source++;
173                                     *target++ = c;
174                                     if (c == U('\r') || c == U('\n'))
175                                         break;
176                                 }
177                                 continue;
178                             case U('['):
179                                 while (source < sentinel) {
180                                     c = *source++;
181                                     *target++ = c;
182                                     if (RJSMIN_IS_REGEX_CC_DULL(c))
183                                         continue;
184                                     switch (c) {
185                                     case U('\\'):
186                                         if (source < sentinel) {
187                                             c = *source++;
188                                             *target++ = c;
189                                             if (c == U('\r') || c == U('\n'))
190                                                 break;
191                                         }
192                                         continue;
193                                     case U(']'):
194                                         goto cont_regex;
195                                     }
196                                 }
197                                 break;
198                             }
199                             break;
200                         cont_regex:
201                             continue;
202                         }
203                         target -= source - reset;
204                         source = reset;
205                     }
206                     else {
207             /* Just a slash */
208                         *target++ = c;
209                     }
210                     continue;
211                 }
212             }
213             continue;
214 
215         /* Whitespace */
216         default:
217         skip_or_copy_ws:
218             quote = U(' ');
219             --source;
220             while (source < sentinel) {
221                 c = *source++;
222                 if (RJSMIN_IS_SPACE(c))
223                     continue;
224                 switch (c) {
225                 case U('\r'): case U('\n'):
226                     quote = U('\n');
227                     continue;
228                 case U('/'):
229                     if (source < sentinel) {
230                         switch (*source) {
231                         case U('*'):
232                             reset = source++;
233                             /* copy bang comment, if requested */
234                             if (   keep_bang_comments && source < sentinel
235                                 && *source == U('!')) {
236                                 *target++ = U('/');
237                                 *target++ = U('*');
238                                 *target++ = *source++;
239                                 while (source < sentinel) {
240                                     c = *source++;
241                                     *target++ = c;
242                                     if (c == U('*') && source < sentinel
243                                         && *source == U('/')) {
244                                         *target++ = *source++;
245                                         reset = NULL;
246                                         break;
247                                     }
248                                 }
249                                 if (!reset)
250                                     continue;
251                                 target -= source - reset;
252                                 source = reset;
253                             }
254                             /* strip regular comment */
255                             else {
256                                 while (source < sentinel) {
257                                     c = *source++;
258                                     if (c == U('*') && source < sentinel
259                                         && *source == U('/')) {
260                                         ++source;
261                                         reset = NULL;
262                                         break;
263                                     }
264                                 }
265                                 if (!reset)
266                                     continue;
267                                 source = reset;
268                                 *target++ = U('/');
269                             }
270                             goto cont;
271                         case U('/'):
272                             ++source;
273                             while (source < sentinel) {
274                                 c = *source++;
275                                 switch (c) {
276                                 case U('\n'):
277                                     break;
278                                 case U('\r'):
279                                     if (source < sentinel
280                                         && *source == U('\n'))
281                                         ++source;
282                                     break;
283                                 default:
284                                     continue;
285                                 }
286                                 break;
287                             }
288                             quote = U('\n');
289                             continue;
290                         }
291                     }
292                 }
293                 --source;
294                 break;
295             }
296 
297             if ((tstart < target && source < sentinel)
298                 && ((quote == U('\n')
299                     && RJSMIN_IS_ID_LITERAL_CLOSE(*(target - 1))
300                     && RJSMIN_IS_ID_LITERAL_OPEN(*source))
301                     ||
302                     (quote == U(' ')
303                     && ((RJSMIN_IS_ID_LITERAL(*(target - 1))
304                          && RJSMIN_IS_ID_LITERAL(*source))
305                         || (source < sentinel
306                             && ((*(target - 1) == U('+')
307                                  && *source == U('+'))
308                                 || (*(target - 1) == U('-')
309                                     && *source == U('-'))))))))
310                 *target++ = quote;
311         }
312     cont:
313         continue;
314     }
315     return (Py_ssize_t)(target - tstart);
316 }
317 
318 
319 PyDoc_STRVAR(rjsmin_jsmin__doc__,
320 "jsmin(script, keep_bang_comments=False)\n\
321 \n\
322 Minify javascript based on `jsmin.c by Douglas Crockford`_\\.\n\
323 \n\
324 Instead of parsing the stream char by char, it uses a regular\n\
325 expression approach which minifies the whole script with one big\n\
326 substitution regex.\n\
327 \n\
328 .. _jsmin.c by Douglas Crockford:\n\
329    http://www.crockford.com/javascript/jsmin.c\n\
330 \n\
331 :Note: This is a hand crafted C implementation built on the regex\n\
332        semantics.\n\
333 \n\
334 :Parameters:\n\
335   `script` : ``str``\n\
336     Script to minify\n\
337 \n\
338   `keep_bang_comments` : ``bool``\n\
339     Keep comments starting with an exclamation mark? (``/*!...*/``)\n\
340 \n\
341 :Return: Minified script\n\
342 :Rtype: ``str``");
343 
344 static PyObject *
rjsmin_jsmin(PyObject * self,PyObject * args,PyObject * kwds)345 rjsmin_jsmin(PyObject *self, PyObject *args, PyObject *kwds)
346 {
347     PyObject *script, *keep_bang_comments_ = NULL, *result;
348     static char *kwlist[] = {"script", "keep_bang_comments", NULL};
349     Py_ssize_t slength, length;
350     int keep_bang_comments;
351 #ifdef EXT2
352     int uni;
353 #define UOBJ "O"
354 #endif
355 #ifdef EXT3
356 #define UOBJ "U"
357 #endif
358 
359     if (!PyArg_ParseTupleAndKeywords(args, kwds, UOBJ "|O", kwlist,
360                                      &script, &keep_bang_comments_))
361         return NULL;
362 
363     if (!keep_bang_comments_)
364         keep_bang_comments = 0;
365     else {
366         keep_bang_comments = PyObject_IsTrue(keep_bang_comments_);
367         if (keep_bang_comments == -1)
368             return NULL;
369     }
370 
371 #ifdef EXT2
372     if (PyUnicode_Check(script)) {
373         if (!(script = PyUnicode_AsUTF8String(script)))
374             return NULL;
375         uni = 1;
376     }
377     else {
378         if (!(script = PyObject_Str(script)))
379             return NULL;
380         uni = 0;
381     }
382 #endif
383 
384 #ifdef EXT3
385     Py_INCREF(script);
386 #define PyString_GET_SIZE PyUnicode_GET_SIZE
387 #define PyString_AS_STRING PyUnicode_AS_UNICODE
388 #define _PyString_Resize PyUnicode_Resize
389 #define PyString_FromStringAndSize PyUnicode_FromUnicode
390 #endif
391 
392     slength = PyString_GET_SIZE(script);
393     if (!(result = PyString_FromStringAndSize(NULL, slength))) {
394         Py_DECREF(script);
395         return NULL;
396     }
397     Py_BEGIN_ALLOW_THREADS
398     length = rjsmin((rchar *)PyString_AS_STRING(script),
399                     (rchar *)PyString_AS_STRING(result),
400                     slength, keep_bang_comments);
401     Py_END_ALLOW_THREADS
402 
403     Py_DECREF(script);
404     if (length < 0) {
405         Py_DECREF(result);
406         return NULL;
407     }
408     if (length != slength && _PyString_Resize(&result, length) == -1)
409         return NULL;
410 
411 #ifdef EXT2
412     if (uni) {
413         script = PyUnicode_DecodeUTF8(PyString_AS_STRING(result),
414                                       PyString_GET_SIZE(result), "strict");
415         Py_DECREF(result);
416         if (!script)
417             return NULL;
418         result = script;
419     }
420 #endif
421     return result;
422 }
423 
424 /* ------------------------ BEGIN MODULE DEFINITION ------------------------ */
425 
426 EXT_METHODS = {
427     {"jsmin",
428         (PyCFunction)rjsmin_jsmin, METH_VARARGS | METH_KEYWORDS,
429         rjsmin_jsmin__doc__},
430 
431     {NULL}  /* Sentinel */
432 };
433 
434 PyDoc_STRVAR(EXT_DOCS_VAR,
435 "C implementation of rjsmin\n\
436 ==========================\n\
437 \n\
438 C implementation of rjsmin.");
439 
440 
441 EXT_DEFINE(EXT_MODULE_NAME, EXT_METHODS_VAR, EXT_DOCS_VAR);
442 
443 EXT_INIT_FUNC {
444     PyObject *m;
445 
446     /* Create the module and populate stuff */
447     if (!(m = EXT_CREATE(&EXT_DEFINE_VAR)))
448         EXT_INIT_ERROR(NULL);
449 
450     EXT_ADD_UNICODE(m, "__author__", "Andr\xe9 Malo", "latin-1");
451     EXT_ADD_STRING(m, "__docformat__", "restructuredtext en");
452 
453     EXT_INIT_RETURN(m);
454 }
455 
456 /* ------------------------- END MODULE DEFINITION ------------------------- */
457