1/*
2 * Copyright 2011 - 2014
 * André Malo or his licensors, as applicable
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 *     http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18#include "cext.h"
19EXT_INIT_FUNC;
20
/*
 * Flag bits stored per ASCII character in rjsmin_charmask. Each bit marks
 * membership in one character class used by the minifier's scanner.
 */
enum {
    RJSMIN_DULL_BIT          = 1 << 0,
    RJSMIN_PRE_REGEX_BIT     = 1 << 1,
    RJSMIN_REGEX_DULL_BIT    = 1 << 2,
    RJSMIN_REGEX_CC_DULL_BIT = 1 << 3,
    RJSMIN_ID_LIT_BIT        = 1 << 4,
    RJSMIN_ID_LIT_O_BIT      = 1 << 5,
    RJSMIN_ID_LIT_C_BIT      = 1 << 6,
    RJSMIN_STRING_DULL_BIT   = 1 << 7,
    RJSMIN_SPACE_BIT         = 1 << 8
};

/* Character type the scanner works on; U() converts a constant to it */
#ifdef EXT3
typedef Py_UNICODE rchar;
#else
typedef unsigned char rchar;
#endif
#define U(c) ((rchar)(c))

/* Flag word of a character (only the low 7 bits select the table slot) */
#define RJSMIN_FLAGS(c) (rjsmin_charmask[U(c) & 0x7F])

/* True for every character above 127, otherwise true if `bit` is set */
#define RJSMIN_HIGH_OR_FLAG(c, bit) ((U(c) > 127) || \
    (RJSMIN_FLAGS(c) & (bit)))

/* True only for characters up to 127 which have `bit` set */
#define RJSMIN_LOW_AND_FLAG(c, bit) ((U(c) <= 127) && \
    (RJSMIN_FLAGS(c) & (bit)))

#define RJSMIN_IS_DULL(c) \
    RJSMIN_HIGH_OR_FLAG(c, RJSMIN_DULL_BIT)

#define RJSMIN_IS_REGEX_DULL(c) \
    RJSMIN_HIGH_OR_FLAG(c, RJSMIN_REGEX_DULL_BIT)

#define RJSMIN_IS_REGEX_CC_DULL(c) \
    RJSMIN_HIGH_OR_FLAG(c, RJSMIN_REGEX_CC_DULL_BIT)

#define RJSMIN_IS_STRING_DULL(c) \
    RJSMIN_HIGH_OR_FLAG(c, RJSMIN_STRING_DULL_BIT)

#define RJSMIN_IS_ID_LITERAL(c) \
    RJSMIN_HIGH_OR_FLAG(c, RJSMIN_ID_LIT_BIT)

#define RJSMIN_IS_ID_LITERAL_OPEN(c) \
    RJSMIN_HIGH_OR_FLAG(c, RJSMIN_ID_LIT_O_BIT)

#define RJSMIN_IS_ID_LITERAL_CLOSE(c) \
    RJSMIN_HIGH_OR_FLAG(c, RJSMIN_ID_LIT_C_BIT)

#define RJSMIN_IS_SPACE(c) \
    RJSMIN_LOW_AND_FLAG(c, RJSMIN_SPACE_BIT)

#define RJSMIN_IS_PRE_REGEX_1(c) \
    RJSMIN_LOW_AND_FLAG(c, RJSMIN_PRE_REGEX_BIT)


/* Class flags for the 128 ASCII code points, eight entries per row */
static const unsigned short rjsmin_charmask[128] = {
    396, 396, 396, 396, 396, 396, 396, 396,     /* 0x00 - 0x07 */
    396, 396,   2, 396, 396,   2, 396, 396,     /* 0x08 - 0x0F (LF, CR) */
    396, 396, 396, 396, 396, 396, 396, 396,     /* 0x10 - 0x17 */
    396, 396, 396, 396, 396, 396, 396, 396,     /* 0x18 - 0x1F */
    396, 175,  76, 141, 253, 141, 143,  76,     /* 0x20 - 0x27 */
    175, 205, 141, 237, 143, 237, 141, 136,     /* 0x28 - 0x2F */
    253, 253, 253, 253, 253, 253, 253, 253,     /* 0x30 - 0x37 */
    253, 253, 143, 143, 141, 143, 141, 143,     /* 0x38 - 0x3F */
    141, 253, 253, 253, 253, 253, 253, 253,     /* 0x40 - 0x47 */
    253, 253, 253, 253, 253, 253, 253, 253,     /* 0x48 - 0x4F */
    253, 253, 253, 253, 253, 253, 253, 253,     /* 0x50 - 0x57 */
    253, 253, 253, 171,   1, 197, 141, 253,     /* 0x58 - 0x5F */
    141, 253, 253, 253, 253, 253, 253, 253,     /* 0x60 - 0x67 */
    253, 253, 253, 253, 253, 253, 253, 253,     /* 0x68 - 0x6F */
    253, 253, 253, 253, 253, 253, 253, 253,     /* 0x70 - 0x77 */
    253, 253, 253, 175, 143, 207, 141, 253      /* 0x78 - 0x7F */
};
84
85static Py_ssize_t
86rjsmin(const rchar *source, rchar *target, Py_ssize_t length,
87       int keep_bang_comments)
88{
89    const rchar *reset, *sentinel = source + length;
90    rchar *tstart = target;
91    rchar c, quote;
92
93    while (source < sentinel) {
94        c = *source++;
95        if (RJSMIN_IS_DULL(c)) {
96            *target++ = c;
97            continue;
98        }
99        switch (c) {
100
101        /* String */
102        case U('\''): case U('"'):
103            reset = source;
104            *target++ = quote = c;
105            while (source < sentinel) {
106                c = *source++;
107                *target++ = c;
108                if (RJSMIN_IS_STRING_DULL(c))
109                    continue;
110                switch (c) {
111                case U('\''): case U('"'):
112                    if (c == quote)
113                        goto cont;
114                    continue;
115                case U('\\'):
116                    if (source < sentinel) {
117                        c = *source++;
118                        *target++ = c;
119                        if (c == U('\r') && source < sentinel
120                            && *source == U('\n'))
121                            *target++ = *source++;
122                    }
123                    continue;
124                }
125                break;
126            }
127            target -= source - reset;
128            source = reset;
129            continue;
130
131        /* Comment or Regex or something else entirely */
132        case U('/'):
133            if (!(source < sentinel)) {
134                *target++ = c;
135            }
136            else {
137                switch (*source) {
138            /* Comment */
139                case U('*'): case U('/'):
140                    goto skip_or_copy_ws;
141
142                default:
143                    if (   target == tstart
144                        || RJSMIN_IS_PRE_REGEX_1(*(target - 1))
145                        || (
146                            (target - tstart >= 6)
147                            && *(target - 1) == U('n')
148                            && *(target - 2) == U('r')
149                            && *(target - 3) == U('u')
150                            && *(target - 4) == U('t')
151                            && *(target - 5) == U('e')
152                            && *(target - 6) == U('r')
153                            && (
154                                   target - tstart == 6
155                                || !RJSMIN_IS_ID_LITERAL(*(target - 7))
156                            )
157                        )) {
158
159            /* Regex */
160                        reset = source;
161                        *target++ = U('/');
162                        while (source < sentinel) {
163                            c = *source++;
164                            *target++ = c;
165                            if (RJSMIN_IS_REGEX_DULL(c))
166                                continue;
167                            switch (c) {
168                            case U('/'):
169                                goto cont;
170                            case U('\\'):
171                                if (source < sentinel) {
172                                    c = *source++;
173                                    *target++ = c;
174                                    if (c == U('\r') || c == U('\n'))
175                                        break;
176                                }
177                                continue;
178                            case U('['):
179                                while (source < sentinel) {
180                                    c = *source++;
181                                    *target++ = c;
182                                    if (RJSMIN_IS_REGEX_CC_DULL(c))
183                                        continue;
184                                    switch (c) {
185                                    case U('\\'):
186                                        if (source < sentinel) {
187                                            c = *source++;
188                                            *target++ = c;
189                                            if (c == U('\r') || c == U('\n'))
190                                                break;
191                                        }
192                                        continue;
193                                    case U(']'):
194                                        goto cont_regex;
195                                    }
196                                }
197                                break;
198                            }
199                            break;
200                        cont_regex:
201                            continue;
202                        }
203                        target -= source - reset;
204                        source = reset;
205                    }
206                    else {
207            /* Just a slash */
208                        *target++ = c;
209                    }
210                    continue;
211                }
212            }
213            continue;
214
215        /* Whitespace */
216        default:
217        skip_or_copy_ws:
218            quote = U(' ');
219            --source;
220            while (source < sentinel) {
221                c = *source++;
222                if (RJSMIN_IS_SPACE(c))
223                    continue;
224                switch (c) {
225                case U('\r'): case U('\n'):
226                    quote = U('\n');
227                    continue;
228                case U('/'):
229                    if (source < sentinel) {
230                        switch (*source) {
231                        case U('*'):
232                            reset = source++;
233                            /* copy bang comment, if requested */
234                            if (   keep_bang_comments && source < sentinel
235                                && *source == U('!')) {
236                                *target++ = U('/');
237                                *target++ = U('*');
238                                *target++ = *source++;
239                                while (source < sentinel) {
240                                    c = *source++;
241                                    *target++ = c;
242                                    if (c == U('*') && source < sentinel
243                                        && *source == U('/')) {
244                                        *target++ = *source++;
245                                        reset = NULL;
246                                        break;
247                                    }
248                                }
249                                if (!reset)
250                                    continue;
251                                target -= source - reset;
252                                source = reset;
253                            }
254                            /* strip regular comment */
255                            else {
256                                while (source < sentinel) {
257                                    c = *source++;
258                                    if (c == U('*') && source < sentinel
259                                        && *source == U('/')) {
260                                        ++source;
261                                        reset = NULL;
262                                        break;
263                                    }
264                                }
265                                if (!reset)
266                                    continue;
267                                source = reset;
268                                *target++ = U('/');
269                            }
270                            goto cont;
271                        case U('/'):
272                            ++source;
273                            while (source < sentinel) {
274                                c = *source++;
275                                switch (c) {
276                                case U('\n'):
277                                    break;
278                                case U('\r'):
279                                    if (source < sentinel
280                                        && *source == U('\n'))
281                                        ++source;
282                                    break;
283                                default:
284                                    continue;
285                                }
286                                break;
287                            }
288                            quote = U('\n');
289                            continue;
290                        }
291                    }
292                }
293                --source;
294                break;
295            }
296
297            if ((tstart < target && source < sentinel)
298                && ((quote == U('\n')
299                    && RJSMIN_IS_ID_LITERAL_CLOSE(*(target - 1))
300                    && RJSMIN_IS_ID_LITERAL_OPEN(*source))
301                    ||
302                    (quote == U(' ')
303                    && ((RJSMIN_IS_ID_LITERAL(*(target - 1))
304                         && RJSMIN_IS_ID_LITERAL(*source))
305                        || (source < sentinel
306                            && ((*(target - 1) == U('+')
307                                 && *source == U('+'))
308                                || (*(target - 1) == U('-')
309                                    && *source == U('-'))))))))
310                *target++ = quote;
311        }
312    cont:
313        continue;
314    }
315    return (Py_ssize_t)(target - tstart);
316}
317
318
319PyDoc_STRVAR(rjsmin_jsmin__doc__,
320"jsmin(script, keep_bang_comments=False)\n\
321\n\
322Minify javascript based on `jsmin.c by Douglas Crockford`_\\.\n\
323\n\
324Instead of parsing the stream char by char, it uses a regular\n\
325expression approach which minifies the whole script with one big\n\
326substitution regex.\n\
327\n\
328.. _jsmin.c by Douglas Crockford:\n\
329   http://www.crockford.com/javascript/jsmin.c\n\
330\n\
331:Note: This is a hand crafted C implementation built on the regex\n\
332       semantics.\n\
333\n\
334:Parameters:\n\
335  `script` : ``str``\n\
336    Script to minify\n\
337\n\
338  `keep_bang_comments` : ``bool``\n\
339    Keep comments starting with an exclamation mark? (``/*!...*/``)\n\
340\n\
341:Return: Minified script\n\
342:Rtype: ``str``");
343
344static PyObject *
345rjsmin_jsmin(PyObject *self, PyObject *args, PyObject *kwds)
346{
347    PyObject *script, *keep_bang_comments_ = NULL, *result;
348    static char *kwlist[] = {"script", "keep_bang_comments", NULL};
349    Py_ssize_t slength, length;
350    int keep_bang_comments;
351#ifdef EXT2
352    int uni;
353#define UOBJ "O"
354#endif
355#ifdef EXT3
356#define UOBJ "U"
357#endif
358
359    if (!PyArg_ParseTupleAndKeywords(args, kwds, UOBJ "|O", kwlist,
360                                     &script, &keep_bang_comments_))
361        return NULL;
362
363    if (!keep_bang_comments_)
364        keep_bang_comments = 0;
365    else {
366        keep_bang_comments = PyObject_IsTrue(keep_bang_comments_);
367        if (keep_bang_comments == -1)
368            return NULL;
369    }
370
371#ifdef EXT2
372    if (PyUnicode_Check(script)) {
373        if (!(script = PyUnicode_AsUTF8String(script)))
374            return NULL;
375        uni = 1;
376    }
377    else {
378        if (!(script = PyObject_Str(script)))
379            return NULL;
380        uni = 0;
381    }
382#endif
383
384#ifdef EXT3
385    Py_INCREF(script);
386#define PyString_GET_SIZE PyUnicode_GET_SIZE
387#define PyString_AS_STRING PyUnicode_AS_UNICODE
388#define _PyString_Resize PyUnicode_Resize
389#define PyString_FromStringAndSize PyUnicode_FromUnicode
390#endif
391
392    slength = PyString_GET_SIZE(script);
393    if (!(result = PyString_FromStringAndSize(NULL, slength))) {
394        Py_DECREF(script);
395        return NULL;
396    }
397    Py_BEGIN_ALLOW_THREADS
398    length = rjsmin((rchar *)PyString_AS_STRING(script),
399                    (rchar *)PyString_AS_STRING(result),
400                    slength, keep_bang_comments);
401    Py_END_ALLOW_THREADS
402
403    Py_DECREF(script);
404    if (length < 0) {
405        Py_DECREF(result);
406        return NULL;
407    }
408    if (length != slength && _PyString_Resize(&result, length) == -1)
409        return NULL;
410
411#ifdef EXT2
412    if (uni) {
413        script = PyUnicode_DecodeUTF8(PyString_AS_STRING(result),
414                                      PyString_GET_SIZE(result), "strict");
415        Py_DECREF(result);
416        if (!script)
417            return NULL;
418        result = script;
419    }
420#endif
421    return result;
422}
423
424/* ------------------------ BEGIN MODULE DEFINITION ------------------------ */
425
426EXT_METHODS = {
427    {"jsmin",
428        (PyCFunction)rjsmin_jsmin, METH_VARARGS | METH_KEYWORDS,
429        rjsmin_jsmin__doc__},
430
431    {NULL}  /* Sentinel */
432};
433
434PyDoc_STRVAR(EXT_DOCS_VAR,
435"C implementation of rjsmin\n\
436==========================\n\
437\n\
438C implementation of rjsmin.");
439
440
441EXT_DEFINE(EXT_MODULE_NAME, EXT_METHODS_VAR, EXT_DOCS_VAR);
442
443EXT_INIT_FUNC {
444    PyObject *m;
445
446    /* Create the module and populate stuff */
447    if (!(m = EXT_CREATE(&EXT_DEFINE_VAR)))
448        EXT_INIT_ERROR(NULL);
449
450    EXT_ADD_UNICODE(m, "__author__", "Andr\xe9 Malo", "latin-1");
451    EXT_ADD_STRING(m, "__docformat__", "restructuredtext en");
452
453    EXT_INIT_RETURN(m);
454}
455
456/* ------------------------- END MODULE DEFINITION ------------------------- */
457