1/* ----------------------------------------------------------------------- *
2 *
3 *   Copyright 2011 Intel Corporation; author: H. Peter Anvin
4 *
5 *   This program is free software; you can redistribute it and/or modify
6 *   it under the terms of the GNU General Public License as published by
7 *   the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
8 *   Boston MA 02110-1301, USA; either version 2 of the License, or
9 *   (at your option) any later version; incorporated herein by reference.
10 *
11 * ----------------------------------------------------------------------- */
12
13#include <inttypes.h>
14#include <string.h>
15#include <stdlib.h>
16#include <ctype.h>
17#include <dprintf.h>
18#include "pxe.h"
19
/*
 * States of the scanner that looks for <a href=...> in an HTML page.
 * The numeric values double as row indices into statemachine[].
 */
enum http_readdir_state {
    st_start     = 0,		/* outside of any tag */
    st_open      = 1,		/* seen "<" */
    st_a         = 2,		/* seen "<a" */
    st_attribute = 3,		/* seen "<a ", waiting for an attribute name */
    st_h         = 4,		/* attribute name so far: "h" */
    st_hr        = 5,		/* attribute name so far: "hr" */
    st_hre       = 6,		/* attribute name so far: "hre" */
    st_href      = 7,		/* attribute name so far: "href" */
    st_hrefeq    = 8,		/* after "href=", unquoted value */
    st_hrefqu    = 9,		/* after "href=\"", quoted value */
    st_badtag    = 10,		/* inside a tag that is not <a> */
    st_badtagqu  = 11,		/* quoted string inside such a tag */
    st_badattr   = 12,		/* inside an <a> attribute other than href */
    st_badattrqu = 13,		/* quoted string inside such an attribute */
};
36
/*
 * One row of the scanner's transition table: the successor state for each
 * class of input character.  The row-specific character is tested first
 * (against the lowercased input), then '<', '>', whitespace, and finally
 * everything else.
 */
struct machine {
    char xchar;			/* character singled out by this row */
    uint8_t st_xchar;		/* next state on xchar */
    uint8_t st_left;		/* next state on '<' */
    uint8_t st_right;		/* next state on '>' */
    uint8_t st_space;		/* next state on whitespace */
    uint8_t st_other;		/* next state on any other character */
};
45
46static const struct machine statemachine[] = {
47    /* xchar	st_xchar	st_left		st_right	st_space	st_other */
48    { 0,	0,		st_open,	st_start,	st_start,	st_start },
49    { 'a',	st_a,		st_badtag,	st_start,	st_open,	st_badtag },
50    { 0,	0,		st_open,	st_open,	st_attribute,	st_badtag },
51    { 'h',	st_h,		st_open,	st_start,	st_attribute,	st_badattr },
52    { 'r',	st_hr,		st_open,	st_start,	st_attribute,	st_badattr },
53    { 'e',	st_hre,		st_open,	st_start,	st_attribute,	st_badattr },
54    { 'f',	st_href,	st_open,	st_start,	st_attribute,	st_badattr },
55    { '=',	st_hrefeq,	st_open,	st_start,	st_attribute,	st_badattr },
56    { '\"',	st_hrefqu,	st_open,	st_start,	st_attribute,	st_hrefeq },
57    { '\"',	st_attribute,	st_hrefqu,	st_hrefqu,	st_hrefqu,	st_hrefqu },
58    { '\"',	st_badtagqu,	st_open,	st_start,	st_badtag,	st_badtag },
59    { '\"',	st_badtag,	st_badtagqu,	st_badtagqu,	st_badtagqu,	st_badtagqu },
60    { '\"',	st_badattrqu,	st_open,	st_start,	st_attribute,	st_badattr },
61    { '\"',	st_attribute,	st_badattrqu,	st_badattrqu,	st_badattrqu,	st_badattrqu },
62};
63
/* A named character reference and the code point it stands for. */
struct html_entity {
    uint16_t ucs;		/* code point; 0 marks the end of a table */
    const char entity[9];	/* name without '&' and ';', NUL-terminated */
};
68
69static const struct html_entity entities[] = {
70    {   34, "quot" },
71    {   38, "amp" },
72    {   60, "lt" },
73    {   62, "gt" },
74#ifdef HTTP_ALL_ENTITIES
75    {  160, "nbsp" },
76    {  161, "iexcl" },
77    {  162, "cent" },
78    {  163, "pound" },
79    {  164, "curren" },
80    {  165, "yen" },
81    {  166, "brvbar" },
82    {  167, "sect" },
83    {  168, "uml" },
84    {  169, "copy" },
85    {  170, "ordf" },
86    {  171, "laquo" },
87    {  172, "not" },
88    {  173, "shy" },
89    {  174, "reg" },
90    {  175, "macr" },
91    {  176, "deg" },
92    {  177, "plusmn" },
93    {  178, "sup2" },
94    {  179, "sup3" },
95    {  180, "acute" },
96    {  181, "micro" },
97    {  182, "para" },
98    {  183, "middot" },
99    {  184, "cedil" },
100    {  185, "sup1" },
101    {  186, "ordm" },
102    {  187, "raquo" },
103    {  188, "frac14" },
104    {  189, "frac12" },
105    {  190, "frac34" },
106    {  191, "iquest" },
107    {  192, "Agrave" },
108    {  193, "Aacute" },
109    {  194, "Acirc" },
110    {  195, "Atilde" },
111    {  196, "Auml" },
112    {  197, "Aring" },
113    {  198, "AElig" },
114    {  199, "Ccedil" },
115    {  200, "Egrave" },
116    {  201, "Eacute" },
117    {  202, "Ecirc" },
118    {  203, "Euml" },
119    {  204, "Igrave" },
120    {  205, "Iacute" },
121    {  206, "Icirc" },
122    {  207, "Iuml" },
123    {  208, "ETH" },
124    {  209, "Ntilde" },
125    {  210, "Ograve" },
126    {  211, "Oacute" },
127    {  212, "Ocirc" },
128    {  213, "Otilde" },
129    {  214, "Ouml" },
130    {  215, "times" },
131    {  216, "Oslash" },
132    {  217, "Ugrave" },
133    {  218, "Uacute" },
134    {  219, "Ucirc" },
135    {  220, "Uuml" },
136    {  221, "Yacute" },
137    {  222, "THORN" },
138    {  223, "szlig" },
139    {  224, "agrave" },
140    {  225, "aacute" },
141    {  226, "acirc" },
142    {  227, "atilde" },
143    {  228, "auml" },
144    {  229, "aring" },
145    {  230, "aelig" },
146    {  231, "ccedil" },
147    {  232, "egrave" },
148    {  233, "eacute" },
149    {  234, "ecirc" },
150    {  235, "euml" },
151    {  236, "igrave" },
152    {  237, "iacute" },
153    {  238, "icirc" },
154    {  239, "iuml" },
155    {  240, "eth" },
156    {  241, "ntilde" },
157    {  242, "ograve" },
158    {  243, "oacute" },
159    {  244, "ocirc" },
160    {  245, "otilde" },
161    {  246, "ouml" },
162    {  247, "divide" },
163    {  248, "oslash" },
164    {  249, "ugrave" },
165    {  250, "uacute" },
166    {  251, "ucirc" },
167    {  252, "uuml" },
168    {  253, "yacute" },
169    {  254, "thorn" },
170    {  255, "yuml" },
171    {  338, "OElig" },
172    {  339, "oelig" },
173    {  352, "Scaron" },
174    {  353, "scaron" },
175    {  376, "Yuml" },
176    {  402, "fnof" },
177    {  710, "circ" },
178    {  732, "tilde" },
179    {  913, "Alpha" },
180    {  914, "Beta" },
181    {  915, "Gamma" },
182    {  916, "Delta" },
183    {  917, "Epsilon" },
184    {  918, "Zeta" },
185    {  919, "Eta" },
186    {  920, "Theta" },
187    {  921, "Iota" },
188    {  922, "Kappa" },
189    {  923, "Lambda" },
190    {  924, "Mu" },
191    {  925, "Nu" },
192    {  926, "Xi" },
193    {  927, "Omicron" },
194    {  928, "Pi" },
195    {  929, "Rho" },
196    {  931, "Sigma" },
197    {  932, "Tau" },
198    {  933, "Upsilon" },
199    {  934, "Phi" },
200    {  935, "Chi" },
201    {  936, "Psi" },
202    {  937, "Omega" },
203    {  945, "alpha" },
204    {  946, "beta" },
205    {  947, "gamma" },
206    {  948, "delta" },
207    {  949, "epsilon" },
208    {  950, "zeta" },
209    {  951, "eta" },
210    {  952, "theta" },
211    {  953, "iota" },
212    {  954, "kappa" },
213    {  955, "lambda" },
214    {  956, "mu" },
215    {  957, "nu" },
216    {  958, "xi" },
217    {  959, "omicron" },
218    {  960, "pi" },
219    {  961, "rho" },
220    {  962, "sigmaf" },
221    {  963, "sigma" },
222    {  964, "tau" },
223    {  965, "upsilon" },
224    {  966, "phi" },
225    {  967, "chi" },
226    {  968, "psi" },
227    {  969, "omega" },
228    {  977, "thetasym" },
229    {  978, "upsih" },
230    {  982, "piv" },
231    { 8194, "ensp" },
232    { 8195, "emsp" },
233    { 8201, "thinsp" },
234    { 8204, "zwnj" },
235    { 8205, "zwj" },
236    { 8206, "lrm" },
237    { 8207, "rlm" },
238    { 8211, "ndash" },
239    { 8212, "mdash" },
240    { 8216, "lsquo" },
241    { 8217, "rsquo" },
242    { 8218, "sbquo" },
243    { 8220, "ldquo" },
244    { 8221, "rdquo" },
245    { 8222, "bdquo" },
246    { 8224, "dagger" },
247    { 8225, "Dagger" },
248    { 8226, "bull" },
249    { 8230, "hellip" },
250    { 8240, "permil" },
251    { 8242, "prime" },
252    { 8243, "Prime" },
253    { 8249, "lsaquo" },
254    { 8250, "rsaquo" },
255    { 8254, "oline" },
256    { 8260, "frasl" },
257    { 8364, "euro" },
258    { 8465, "image" },
259    { 8472, "weierp" },
260    { 8476, "real" },
261    { 8482, "trade" },
262    { 8501, "alefsym" },
263    { 8592, "larr" },
264    { 8593, "uarr" },
265    { 8594, "rarr" },
266    { 8595, "darr" },
267    { 8596, "harr" },
268    { 8629, "crarr" },
269    { 8656, "lArr" },
270    { 8657, "uArr" },
271    { 8658, "rArr" },
272    { 8659, "dArr" },
273    { 8660, "hArr" },
274    { 8704, "forall" },
275    { 8706, "part" },
276    { 8707, "exist" },
277    { 8709, "empty" },
278    { 8711, "nabla" },
279    { 8712, "isin" },
280    { 8713, "notin" },
281    { 8715, "ni" },
282    { 8719, "prod" },
283    { 8721, "sum" },
284    { 8722, "minus" },
285    { 8727, "lowast" },
286    { 8730, "radic" },
287    { 8733, "prop" },
288    { 8734, "infin" },
289    { 8736, "ang" },
290    { 8743, "and" },
291    { 8744, "or" },
292    { 8745, "cap" },
293    { 8746, "cup" },
294    { 8747, "int" },
295    { 8756, "there4" },
296    { 8764, "sim" },
297    { 8773, "cong" },
298    { 8776, "asymp" },
299    { 8800, "ne" },
300    { 8801, "equiv" },
301    { 8804, "le" },
302    { 8805, "ge" },
303    { 8834, "sub" },
304    { 8835, "sup" },
305    { 8836, "nsub" },
306    { 8838, "sube" },
307    { 8839, "supe" },
308    { 8853, "oplus" },
309    { 8855, "otimes" },
310    { 8869, "perp" },
311    { 8901, "sdot" },
312    { 8968, "lceil" },
313    { 8969, "rceil" },
314    { 8970, "lfloor" },
315    { 8971, "rfloor" },
316    { 9001, "lang" },
317    { 9002, "rang" },
318    { 9674, "loz" },
319    { 9824, "spades" },
320    { 9827, "clubs" },
321    { 9829, "hearts" },
322    { 9830, "diams" },
323#endif /* HTTP_ALL_ENTITIES */
324    { 0, "" }
325};
326
/* Progress of the entity decoder across successive emit() calls. */
struct entity_state {
    char entity_buf[16];	/* text collected after '&' */
    char *ep;			/* next free slot in entity_buf;
				   NULL while not inside an entity */
};
331
332static char *emit(char *p, int c, struct entity_state *st)
333{
334    const struct html_entity *ent;
335    unsigned int ucs;
336
337    if (!st->ep) {
338	if (c == '&') {
339	    /* Entity open */
340	    st->ep = st->entity_buf;
341	} else {
342	    *p++ = c;
343	}
344    } else {
345	if (c == ';') {
346	    st->ep = NULL;
347	    *p = '\0';
348	    if (st->entity_buf[0] == '#') {
349		if ((st->entity_buf[1] | 0x20)== 'x') {
350		    ucs = strtoul(st->entity_buf + 2, NULL, 16);
351		} else {
352		    ucs = strtoul(st->entity_buf + 1, NULL, 10);
353		}
354	    } else {
355		for (ent = entities; ent->ucs; ent++) {
356		    if (!strcmp(st->entity_buf, ent->entity))
357			break;
358		}
359		ucs = ent->ucs;
360	    }
361	    if (ucs < 32 || ucs >= 0x10ffff)
362		return p;	/* Bogus */
363	    if (ucs >= 0x10000) {
364		*p++ = 0xf0 + (ucs >> 18);
365		*p++ = 0x80 + ((ucs >> 12) & 0x3f);
366		*p++ = 0x80 + ((ucs >> 6) & 0x3f);
367		*p++ = 0x80 + (ucs & 0x3f);
368	    } else if (ucs >= 0x800) {
369		*p++ = 0xe0 + (ucs >> 12);
370		*p++ = 0x80 + ((ucs >> 6) & 0x3f);
371		*p++ = 0x80 + (ucs & 0x3f);
372	    } else if (ucs >= 0x80) {
373		*p++ = 0xc0 + (ucs >> 6);
374		*p++ = 0x80 + (ucs & 0x3f);
375	    } else {
376		*p++ = ucs;
377	    }
378	} else if (st->ep < st->entity_buf + sizeof st->entity_buf - 1) {
379	    *st->ep++ = c;
380	}
381    }
382    return p;
383}
384
385static const char *http_get_filename(struct inode *inode, char *buf)
386{
387    int c, lc;
388    char *p;
389    const struct machine *sm;
390    struct entity_state es;
391    enum http_readdir_state state = st_start;
392    enum http_readdir_state pstate = st_start;
393
394    memset(&es, 0, sizeof es);
395
396    p = buf;
397    for (;;) {
398	c = pxe_getc(inode);
399	if (c == -1)
400	    return NULL;
401
402	lc = tolower(c);
403
404	sm = &statemachine[state];
405
406	if (lc == sm->xchar)
407	    state = sm->st_xchar;
408	else if (c == '<')
409	    state = sm->st_left;
410	else if (c == '>')
411	    state = sm->st_right;
412	else if (isspace(c))
413	    state = sm->st_space;
414	else
415	    state = sm->st_other;
416
417	if (state == st_hrefeq || state == st_hrefqu) {
418	    if (state != pstate)
419		p = buf;
420	    else if (p < buf + FILENAME_MAX)
421		p = emit(p, c, &es);
422	    pstate = state;
423	} else {
424	    if (pstate != st_start)
425		pstate = st_start;
426	    if (p != buf && state == st_start) {
427		*p = '\0';
428		return buf;
429	    }
430	}
431    }
432}
433
434int http_readdir(struct inode *inode, struct dirent *dirent)
435{
436    char buf[FILENAME_MAX + 6];
437    const char *fn, *sp;
438
439    for (;;) {
440	fn = http_get_filename(inode, buf);
441
442	if (!fn)
443	    return -1;		/* End of directory */
444
445	/* Ignore entries with http special characters */
446	if (strchr(fn, '#'))
447	    continue;
448	if (strchr(fn, '?'))
449	    continue;
450
451	/* A slash if present has to be the last character, and not the first */
452	sp = strchr(fn, '/');
453	if (sp) {
454	    if (sp == fn || sp[1])
455		continue;
456	} else {
457	    sp = strchr(fn, '\0');
458	}
459
460	if (sp > fn + NAME_MAX)
461	    continue;
462
463	dirent->d_ino = 0;	/* Not applicable */
464	dirent->d_off = 0;	/* Not applicable */
465	dirent->d_reclen = offsetof(struct dirent, d_name) + (sp-fn) + 1;
466	dirent->d_type = *sp == '/' ? DT_DIR : DT_REG;
467	memcpy(dirent->d_name, fn, sp-fn);
468	dirent->d_name[sp-fn] = '\0';
469	return 0;
470    }
471}
472