1/* 2 * HTMLtree.c : implementation of access function for an HTML tree. 3 * 4 * See Copyright for the status of this software. 5 * 6 * daniel@veillard.com 7 */ 8 9 10#define IN_LIBXML 11#include "libxml.h" 12#ifdef LIBXML_HTML_ENABLED 13 14#include <string.h> /* for memset() only ! */ 15 16#ifdef HAVE_CTYPE_H 17#include <ctype.h> 18#endif 19#ifdef HAVE_STDLIB_H 20#include <stdlib.h> 21#endif 22 23#include <libxml/xmlmemory.h> 24#include <libxml/HTMLparser.h> 25#include <libxml/HTMLtree.h> 26#include <libxml/entities.h> 27#include <libxml/valid.h> 28#include <libxml/xmlerror.h> 29#include <libxml/parserInternals.h> 30#include <libxml/globals.h> 31#include <libxml/uri.h> 32 33/************************************************************************ 34 * * 35 * Getting/Setting encoding meta tags * 36 * * 37 ************************************************************************/ 38 39/** 40 * htmlGetMetaEncoding: 41 * @doc: the document 42 * 43 * Encoding definition lookup in the Meta tags 44 * 45 * Returns the current encoding as flagged in the HTML source 46 */ 47const xmlChar * 48htmlGetMetaEncoding(htmlDocPtr doc) { 49 htmlNodePtr cur; 50 const xmlChar *content; 51 const xmlChar *encoding; 52 53 if (doc == NULL) 54 return(NULL); 55 cur = doc->children; 56 57 /* 58 * Search the html 59 */ 60 while (cur != NULL) { 61 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 62 if (xmlStrEqual(cur->name, BAD_CAST"html")) 63 break; 64 if (xmlStrEqual(cur->name, BAD_CAST"head")) 65 goto found_head; 66 if (xmlStrEqual(cur->name, BAD_CAST"meta")) 67 goto found_meta; 68 } 69 cur = cur->next; 70 } 71 if (cur == NULL) 72 return(NULL); 73 cur = cur->children; 74 75 /* 76 * Search the head 77 */ 78 while (cur != NULL) { 79 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 80 if (xmlStrEqual(cur->name, BAD_CAST"head")) 81 break; 82 if (xmlStrEqual(cur->name, BAD_CAST"meta")) 83 goto found_meta; 84 } 85 cur = cur->next; 86 } 87 if (cur == NULL) 88 return(NULL); 89found_head: 90 cur = cur->children; 91 92 /* 93 * Search the meta elements 94 */ 95found_meta: 96 while (cur != NULL) { 97 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 98 if (xmlStrEqual(cur->name, BAD_CAST"meta")) { 99 xmlAttrPtr attr = cur->properties; 100 int http; 101 const xmlChar *value; 102 103 content = NULL; 104 http = 0; 105 while (attr != NULL) { 106 if ((attr->children != NULL) && 107 (attr->children->type == XML_TEXT_NODE) && 108 (attr->children->next == NULL)) { 109 value = attr->children->content; 110 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv")) 111 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) 112 http = 1; 113 else if ((value != NULL) 114 && (!xmlStrcasecmp(attr->name, BAD_CAST"content"))) 115 content = value; 116 if ((http != 0) && (content != NULL)) 117 goto found_content; 118 } 119 attr = attr->next; 120 } 121 } 122 } 123 cur = cur->next; 124 } 125 return(NULL); 126 127found_content: 128 encoding = xmlStrstr(content, BAD_CAST"charset="); 129 if (encoding == NULL) 130 encoding = xmlStrstr(content, BAD_CAST"Charset="); 131 if (encoding == NULL) 132 encoding = xmlStrstr(content, BAD_CAST"CHARSET="); 133 if (encoding != NULL) { 134 encoding += 8; 135 } else { 136 encoding = xmlStrstr(content, BAD_CAST"charset ="); 137 if (encoding == NULL) 138 encoding = xmlStrstr(content, BAD_CAST"Charset ="); 139 if (encoding == NULL) 140 encoding = xmlStrstr(content, BAD_CAST"CHARSET ="); 141 if (encoding != NULL) 142 encoding += 9; 143 } 144 if (encoding != NULL) { 145 while ((*encoding == ' ') || (*encoding == '\t')) encoding++; 146 } 147 return(encoding); 148} 149 150/** 151 * htmlSetMetaEncoding: 152 * @doc: the document 153 * @encoding: the encoding string 154 * 155 * Sets the current encoding in the Meta tags 156 * NOTE: this will not change the document content encoding, just 157 * the META flag associated. 158 * 159 * Returns 0 in case of success and -1 in case of error 160 */ 161int 162htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) { 163 htmlNodePtr cur, meta = NULL, head = NULL; 164 const xmlChar *content = NULL; 165 char newcontent[100]; 166 167 168 if (doc == NULL) 169 return(-1); 170 171 /* html isn't a real encoding it's just libxml2 way to get entities */ 172 if (!xmlStrcasecmp(encoding, BAD_CAST "html")) 173 return(-1); 174 175 if (encoding != NULL) { 176 snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s", 177 (char *)encoding); 178 newcontent[sizeof(newcontent) - 1] = 0; 179 } 180 181 cur = doc->children; 182 183 /* 184 * Search the html 185 */ 186 while (cur != NULL) { 187 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 188 if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0) 189 break; 190 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0) 191 goto found_head; 192 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) 193 goto found_meta; 194 } 195 cur = cur->next; 196 } 197 if (cur == NULL) 198 return(-1); 199 cur = cur->children; 200 201 /* 202 * Search the head 203 */ 204 while (cur != NULL) { 205 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 206 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0) 207 break; 208 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) { 209 head = cur->parent; 210 goto found_meta; 211 } 212 } 213 cur = cur->next; 214 } 215 if (cur == NULL) 216 return(-1); 217found_head: 218 head = cur; 219 if (cur->children == NULL) 220 goto create; 221 cur = cur->children; 222 223found_meta: 224 /* 225 * Search and update all the remaining the meta elements carrying 226 * encoding informations 227 */ 228 while (cur != NULL) { 229 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 230 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) { 231 xmlAttrPtr attr = cur->properties; 232 int http; 233 const xmlChar *value; 234 235 content = NULL; 236 http = 0; 237 while (attr != NULL) { 238 if ((attr->children != NULL) && 239 (attr->children->type == XML_TEXT_NODE) && 240 (attr->children->next == NULL)) { 241 value = attr->children->content; 242 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv")) 243 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) 244 http = 1; 245 else 246 { 247 if ((value != NULL) && 248 (!xmlStrcasecmp(attr->name, BAD_CAST"content"))) 249 content = value; 250 } 251 if ((http != 0) && (content != NULL)) 252 break; 253 } 254 attr = attr->next; 255 } 256 if ((http != 0) && (content != NULL)) { 257 meta = cur; 258 break; 259 } 260 261 } 262 } 263 cur = cur->next; 264 } 265create: 266 if (meta == NULL) { 267 if ((encoding != NULL) && (head != NULL)) { 268 /* 269 * Create a new Meta element with the right attributes 270 */ 271 272 meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL); 273 if (head->children == NULL) 274 xmlAddChild(head, meta); 275 else 276 xmlAddPrevSibling(head->children, meta); 277 xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type"); 278 xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent); 279 } 280 } else { 281 /* change the document only if there is a real encoding change */ 282 if (xmlStrcasestr(content, encoding) == NULL) { 283 xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent); 284 } 285 } 286 287 288 return(0); 289} 290 291/** 292 * booleanHTMLAttrs: 293 * 294 * These are the HTML attributes which will be output 295 * in minimized form, i.e. <option selected="selected"> will be 296 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method" 297 * 298 */ 299static const char* htmlBooleanAttrs[] = { 300 "checked", "compact", "declare", "defer", "disabled", "ismap", 301 "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly", 302 "selected", NULL 303}; 304 305 306/** 307 * htmlIsBooleanAttr: 308 * @name: the name of the attribute to check 309 * 310 * Determine if a given attribute is a boolean attribute. 311 * 312 * returns: false if the attribute is not boolean, true otherwise. 313 */ 314int 315htmlIsBooleanAttr(const xmlChar *name) 316{ 317 int i = 0; 318 319 while (htmlBooleanAttrs[i] != NULL) { 320 if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0) 321 return 1; 322 i++; 323 } 324 return 0; 325} 326 327#ifdef LIBXML_OUTPUT_ENABLED 328/* 329 * private routine exported from xmlIO.c 330 */ 331xmlOutputBufferPtr 332xmlAllocOutputBufferInternal(xmlCharEncodingHandlerPtr encoder); 333/************************************************************************ 334 * * 335 * Output error handlers * 336 * * 337 ************************************************************************/ 338/** 339 * htmlSaveErrMemory: 340 * @extra: extra informations 341 * 342 * Handle an out of memory condition 343 */ 344static void 345htmlSaveErrMemory(const char *extra) 346{ 347 __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra); 348} 349 350/** 351 * htmlSaveErr: 352 * @code: the error number 353 * @node: the location of the error. 354 * @extra: extra informations 355 * 356 * Handle an out of memory condition 357 */ 358static void 359htmlSaveErr(int code, xmlNodePtr node, const char *extra) 360{ 361 const char *msg = NULL; 362 363 switch(code) { 364 case XML_SAVE_NOT_UTF8: 365 msg = "string is not in UTF-8\n"; 366 break; 367 case XML_SAVE_CHAR_INVALID: 368 msg = "invalid character value\n"; 369 break; 370 case XML_SAVE_UNKNOWN_ENCODING: 371 msg = "unknown encoding %s\n"; 372 break; 373 case XML_SAVE_NO_DOCTYPE: 374 msg = "HTML has no DOCTYPE\n"; 375 break; 376 default: 377 msg = "unexpected error number\n"; 378 } 379 __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra); 380} 381 382/************************************************************************ 383 * * 384 * Dumping HTML tree content to a simple buffer * 385 * * 386 ************************************************************************/ 387 388static int 389htmlNodeDumpFormat(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, 390 int format); 391 392/** 393 * htmlNodeDumpFormat: 394 * @buf: the HTML buffer output 395 * @doc: the document 396 * @cur: the current node 397 * @format: should formatting spaces been added 398 * 399 * Dump an HTML node, recursive behaviour,children are printed too. 400 * 401 * Returns the number of byte written or -1 in case of error 402 */ 403static int 404htmlNodeDumpFormat(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, 405 int format) { 406 unsigned int use; 407 int ret; 408 xmlOutputBufferPtr outbuf; 409 410 if (cur == NULL) { 411 return (-1); 412 } 413 if (buf == NULL) { 414 return (-1); 415 } 416 outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer)); 417 if (outbuf == NULL) { 418 htmlSaveErrMemory("allocating HTML output buffer"); 419 return (-1); 420 } 421 memset(outbuf, 0, (size_t) sizeof(xmlOutputBuffer)); 422 outbuf->buffer = buf; 423 outbuf->encoder = NULL; 424 outbuf->writecallback = NULL; 425 outbuf->closecallback = NULL; 426 outbuf->context = NULL; 427 outbuf->written = 0; 428 429 use = buf->use; 430 htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format); 431 xmlFree(outbuf); 432 ret = buf->use - use; 433 return (ret); 434} 435 436/** 437 * htmlNodeDump: 438 * @buf: the HTML buffer output 439 * @doc: the document 440 * @cur: the current node 441 * 442 * Dump an HTML node, recursive behaviour,children are printed too, 443 * and formatting returns are added. 444 * 445 * Returns the number of byte written or -1 in case of error 446 */ 447int 448htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) { 449 xmlInitParser(); 450 451 return(htmlNodeDumpFormat(buf, doc, cur, 1)); 452} 453 454/** 455 * htmlNodeDumpFileFormat: 456 * @out: the FILE pointer 457 * @doc: the document 458 * @cur: the current node 459 * @encoding: the document encoding 460 * @format: should formatting spaces been added 461 * 462 * Dump an HTML node, recursive behaviour,children are printed too. 463 * 464 * TODO: if encoding == NULL try to save in the doc encoding 465 * 466 * returns: the number of byte written or -1 in case of failure. 467 */ 468int 469htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc, 470 xmlNodePtr cur, const char *encoding, int format) { 471 xmlOutputBufferPtr buf; 472 xmlCharEncodingHandlerPtr handler = NULL; 473 int ret; 474 475 xmlInitParser(); 476 477 if (encoding != NULL) { 478 xmlCharEncoding enc; 479 480 enc = xmlParseCharEncoding(encoding); 481 if (enc != XML_CHAR_ENCODING_UTF8) { 482 handler = xmlFindCharEncodingHandler(encoding); 483 if (handler == NULL) 484 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); 485 } 486 } 487 488 /* 489 * Fallback to HTML or ASCII when the encoding is unspecified 490 */ 491 if (handler == NULL) 492 handler = xmlFindCharEncodingHandler("HTML"); 493 if (handler == NULL) 494 handler = xmlFindCharEncodingHandler("ascii"); 495 496 /* 497 * save the content to a temp buffer. 498 */ 499 buf = xmlOutputBufferCreateFile(out, handler); 500 if (buf == NULL) return(0); 501 502 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format); 503 504 ret = xmlOutputBufferClose(buf); 505 return(ret); 506} 507 508/** 509 * htmlNodeDumpFile: 510 * @out: the FILE pointer 511 * @doc: the document 512 * @cur: the current node 513 * 514 * Dump an HTML node, recursive behaviour,children are printed too, 515 * and formatting returns are added. 516 */ 517void 518htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) { 519 htmlNodeDumpFileFormat(out, doc, cur, NULL, 1); 520} 521 522/** 523 * htmlDocDumpMemoryFormat: 524 * @cur: the document 525 * @mem: OUT: the memory pointer 526 * @size: OUT: the memory length 527 * @format: should formatting spaces been added 528 * 529 * Dump an HTML document in memory and return the xmlChar * and it's size. 530 * It's up to the caller to free the memory. 531 */ 532void 533htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) { 534 xmlOutputBufferPtr buf; 535 xmlCharEncodingHandlerPtr handler = NULL; 536 const char *encoding; 537 538 xmlInitParser(); 539 540 if ((mem == NULL) || (size == NULL)) 541 return; 542 if (cur == NULL) { 543 *mem = NULL; 544 *size = 0; 545 return; 546 } 547 548 encoding = (const char *) htmlGetMetaEncoding(cur); 549 550 if (encoding != NULL) { 551 xmlCharEncoding enc; 552 553 enc = xmlParseCharEncoding(encoding); 554 if (enc != cur->charset) { 555 if (cur->charset != XML_CHAR_ENCODING_UTF8) { 556 /* 557 * Not supported yet 558 */ 559 *mem = NULL; 560 *size = 0; 561 return; 562 } 563 564 handler = xmlFindCharEncodingHandler(encoding); 565 if (handler == NULL) 566 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); 567 568 } else { 569 handler = xmlFindCharEncodingHandler(encoding); 570 } 571 } 572 573 /* 574 * Fallback to HTML or ASCII when the encoding is unspecified 575 */ 576 if (handler == NULL) 577 handler = xmlFindCharEncodingHandler("HTML"); 578 if (handler == NULL) 579 handler = xmlFindCharEncodingHandler("ascii"); 580 581 buf = xmlAllocOutputBufferInternal(handler); 582 if (buf == NULL) { 583 *mem = NULL; 584 *size = 0; 585 return; 586 } 587 588 htmlDocContentDumpFormatOutput(buf, cur, NULL, format); 589 590 xmlOutputBufferFlush(buf); 591 if (buf->conv != NULL) { 592 *size = buf->conv->use; 593 *mem = xmlStrndup(buf->conv->content, *size); 594 } else { 595 *size = buf->buffer->use; 596 *mem = xmlStrndup(buf->buffer->content, *size); 597 } 598 (void)xmlOutputBufferClose(buf); 599} 600 601/** 602 * htmlDocDumpMemory: 603 * @cur: the document 604 * @mem: OUT: the memory pointer 605 * @size: OUT: the memory length 606 * 607 * Dump an HTML document in memory and return the xmlChar * and it's size. 608 * It's up to the caller to free the memory. 609 */ 610void 611htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) { 612 htmlDocDumpMemoryFormat(cur, mem, size, 1); 613} 614 615 616/************************************************************************ 617 * * 618 * Dumping HTML tree content to an I/O output buffer * 619 * * 620 ************************************************************************/ 621 622void xmlNsListDumpOutput(xmlOutputBufferPtr buf, xmlNsPtr cur); 623 624/** 625 * htmlDtdDumpOutput: 626 * @buf: the HTML buffer output 627 * @doc: the document 628 * @encoding: the encoding string 629 * 630 * TODO: check whether encoding is needed 631 * 632 * Dump the HTML document DTD, if any. 633 */ 634static void 635htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, 636 const char *encoding ATTRIBUTE_UNUSED) { 637 xmlDtdPtr cur = doc->intSubset; 638 639 if (cur == NULL) { 640 htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL); 641 return; 642 } 643 xmlOutputBufferWriteString(buf, "<!DOCTYPE "); 644 xmlOutputBufferWriteString(buf, (const char *)cur->name); 645 if (cur->ExternalID != NULL) { 646 xmlOutputBufferWriteString(buf, " PUBLIC "); 647 xmlBufferWriteQuotedString(buf->buffer, cur->ExternalID); 648 if (cur->SystemID != NULL) { 649 xmlOutputBufferWriteString(buf, " "); 650 xmlBufferWriteQuotedString(buf->buffer, cur->SystemID); 651 } 652 } else if (cur->SystemID != NULL) { 653 xmlOutputBufferWriteString(buf, " SYSTEM "); 654 xmlBufferWriteQuotedString(buf->buffer, cur->SystemID); 655 } 656 xmlOutputBufferWriteString(buf, ">\n"); 657} 658 659/** 660 * htmlAttrDumpOutput: 661 * @buf: the HTML buffer output 662 * @doc: the document 663 * @cur: the attribute pointer 664 * @encoding: the encoding string 665 * 666 * Dump an HTML attribute 667 */ 668static void 669htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, 670 const char *encoding ATTRIBUTE_UNUSED) { 671 xmlChar *value; 672 673 /* 674 * TODO: The html output method should not escape a & character 675 * occurring in an attribute value immediately followed by 676 * a { character (see Section B.7.1 of the HTML 4.0 Recommendation). 677 */ 678 679 if (cur == NULL) { 680 return; 681 } 682 xmlOutputBufferWriteString(buf, " "); 683 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { 684 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); 685 xmlOutputBufferWriteString(buf, ":"); 686 } 687 xmlOutputBufferWriteString(buf, (const char *)cur->name); 688 if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) { 689 value = xmlNodeListGetString(doc, cur->children, 0); 690 if (value) { 691 xmlOutputBufferWriteString(buf, "="); 692 if ((cur->ns == NULL) && (cur->parent != NULL) && 693 (cur->parent->ns == NULL) && 694 ((!xmlStrcasecmp(cur->name, BAD_CAST "href")) || 695 (!xmlStrcasecmp(cur->name, BAD_CAST "action")) || 696 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) || 697 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) && 698 (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) { 699 xmlChar *escaped; 700 xmlChar *tmp = value; 701 702 while (IS_BLANK_CH(*tmp)) tmp++; 703 704 escaped = xmlURIEscapeStr(tmp, BAD_CAST"@/:=?;#%&,+"); 705 if (escaped != NULL) { 706 xmlBufferWriteQuotedString(buf->buffer, escaped); 707 xmlFree(escaped); 708 } else { 709 xmlBufferWriteQuotedString(buf->buffer, value); 710 } 711 } else { 712 xmlBufferWriteQuotedString(buf->buffer, value); 713 } 714 xmlFree(value); 715 } else { 716 xmlOutputBufferWriteString(buf, "=\"\""); 717 } 718 } 719} 720 721/** 722 * htmlAttrListDumpOutput: 723 * @buf: the HTML buffer output 724 * @doc: the document 725 * @cur: the first attribute pointer 726 * @encoding: the encoding string 727 * 728 * Dump a list of HTML attributes 729 */ 730static void 731htmlAttrListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) { 732 if (cur == NULL) { 733 return; 734 } 735 while (cur != NULL) { 736 htmlAttrDumpOutput(buf, doc, cur, encoding); 737 cur = cur->next; 738 } 739} 740 741 742 743/** 744 * htmlNodeListDumpOutput: 745 * @buf: the HTML buffer output 746 * @doc: the document 747 * @cur: the first node 748 * @encoding: the encoding string 749 * @format: should formatting spaces been added 750 * 751 * Dump an HTML node list, recursive behaviour,children are printed too. 752 */ 753static void 754htmlNodeListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, 755 xmlNodePtr cur, const char *encoding, int format) { 756 if (cur == NULL) { 757 return; 758 } 759 while (cur != NULL) { 760 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format); 761 cur = cur->next; 762 } 763} 764 765/** 766 * htmlNodeDumpFormatOutput: 767 * @buf: the HTML buffer output 768 * @doc: the document 769 * @cur: the current node 770 * @encoding: the encoding string 771 * @format: should formatting spaces been added 772 * 773 * Dump an HTML node, recursive behaviour,children are printed too. 774 */ 775void 776htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, 777 xmlNodePtr cur, const char *encoding, int format) { 778 const htmlElemDesc * info; 779 780 xmlInitParser(); 781 782 if ((cur == NULL) || (buf == NULL)) { 783 return; 784 } 785 /* 786 * Special cases. 787 */ 788 if (cur->type == XML_DTD_NODE) 789 return; 790 if ((cur->type == XML_HTML_DOCUMENT_NODE) || 791 (cur->type == XML_DOCUMENT_NODE)){ 792 htmlDocContentDumpOutput(buf, (xmlDocPtr) cur, encoding); 793 return; 794 } 795 if (cur->type == XML_ATTRIBUTE_NODE) { 796 htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur, encoding); 797 return; 798 } 799 if (cur->type == HTML_TEXT_NODE) { 800 if (cur->content != NULL) { 801 if (((cur->name == (const xmlChar *)xmlStringText) || 802 (cur->name != (const xmlChar *)xmlStringTextNoenc)) && 803 ((cur->parent == NULL) || 804 ((xmlStrcasecmp(cur->parent->name, BAD_CAST "script")) && 805 (xmlStrcasecmp(cur->parent->name, BAD_CAST "style"))))) { 806 xmlChar *buffer; 807 808 buffer = xmlEncodeEntitiesReentrant(doc, cur->content); 809 if (buffer != NULL) { 810 xmlOutputBufferWriteString(buf, (const char *)buffer); 811 xmlFree(buffer); 812 } 813 } else { 814 xmlOutputBufferWriteString(buf, (const char *)cur->content); 815 } 816 } 817 return; 818 } 819 if (cur->type == HTML_COMMENT_NODE) { 820 if (cur->content != NULL) { 821 xmlOutputBufferWriteString(buf, "<!--"); 822 xmlOutputBufferWriteString(buf, (const char *)cur->content); 823 xmlOutputBufferWriteString(buf, "-->"); 824 } 825 return; 826 } 827 if (cur->type == HTML_PI_NODE) { 828 if (cur->name == NULL) 829 return; 830 xmlOutputBufferWriteString(buf, "<?"); 831 xmlOutputBufferWriteString(buf, (const char *)cur->name); 832 if (cur->content != NULL) { 833 xmlOutputBufferWriteString(buf, " "); 834 xmlOutputBufferWriteString(buf, (const char *)cur->content); 835 } 836 xmlOutputBufferWriteString(buf, ">"); 837 return; 838 } 839 if (cur->type == HTML_ENTITY_REF_NODE) { 840 xmlOutputBufferWriteString(buf, "&"); 841 xmlOutputBufferWriteString(buf, (const char *)cur->name); 842 xmlOutputBufferWriteString(buf, ";"); 843 return; 844 } 845 if (cur->type == HTML_PRESERVE_NODE) { 846 if (cur->content != NULL) { 847 xmlOutputBufferWriteString(buf, (const char *)cur->content); 848 } 849 return; 850 } 851 852 /* 853 * Get specific HTML info for that node. 854 */ 855 if (cur->ns == NULL) 856 info = htmlTagLookup(cur->name); 857 else 858 info = NULL; 859 860 xmlOutputBufferWriteString(buf, "<"); 861 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { 862 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); 863 xmlOutputBufferWriteString(buf, ":"); 864 } 865 xmlOutputBufferWriteString(buf, (const char *)cur->name); 866 if (cur->nsDef) 867 xmlNsListDumpOutput(buf, cur->nsDef); 868 if (cur->properties != NULL) 869 htmlAttrListDumpOutput(buf, doc, cur->properties, encoding); 870 871 if ((info != NULL) && (info->empty)) { 872 xmlOutputBufferWriteString(buf, ">"); 873 if ((format) && (!info->isinline) && (cur->next != NULL)) { 874 if ((cur->next->type != HTML_TEXT_NODE) && 875 (cur->next->type != HTML_ENTITY_REF_NODE) && 876 (cur->parent != NULL) && 877 (cur->parent->name != NULL) && 878 (cur->parent->name[0] != 'p')) /* p, pre, param */ 879 xmlOutputBufferWriteString(buf, "\n"); 880 } 881 return; 882 } 883 if (((cur->type == XML_ELEMENT_NODE) || (cur->content == NULL)) && 884 (cur->children == NULL)) { 885 if ((info != NULL) && (info->saveEndTag != 0) && 886 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) && 887 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) { 888 xmlOutputBufferWriteString(buf, ">"); 889 } else { 890 xmlOutputBufferWriteString(buf, "></"); 891 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { 892 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); 893 xmlOutputBufferWriteString(buf, ":"); 894 } 895 xmlOutputBufferWriteString(buf, (const char *)cur->name); 896 xmlOutputBufferWriteString(buf, ">"); 897 } 898 if ((format) && (cur->next != NULL) && 899 (info != NULL) && (!info->isinline)) { 900 if ((cur->next->type != HTML_TEXT_NODE) && 901 (cur->next->type != HTML_ENTITY_REF_NODE) && 902 (cur->parent != NULL) && 903 (cur->parent->name != NULL) && 904 (cur->parent->name[0] != 'p')) /* p, pre, param */ 905 xmlOutputBufferWriteString(buf, "\n"); 906 } 907 return; 908 } 909 xmlOutputBufferWriteString(buf, ">"); 910 if ((cur->type != XML_ELEMENT_NODE) && 911 (cur->content != NULL)) { 912 /* 913 * Uses the OutputBuffer property to automatically convert 914 * invalids to charrefs 915 */ 916 917 xmlOutputBufferWriteString(buf, (const char *) cur->content); 918 } 919 if (cur->children != NULL) { 920 if ((format) && (info != NULL) && (!info->isinline) && 921 (cur->children->type != HTML_TEXT_NODE) && 922 (cur->children->type != HTML_ENTITY_REF_NODE) && 923 (cur->children != cur->last) && 924 (cur->name != NULL) && 925 (cur->name[0] != 'p')) /* p, pre, param */ 926 xmlOutputBufferWriteString(buf, "\n"); 927 htmlNodeListDumpOutput(buf, doc, cur->children, encoding, format); 928 if ((format) && (info != NULL) && (!info->isinline) && 929 (cur->last->type != HTML_TEXT_NODE) && 930 (cur->last->type != HTML_ENTITY_REF_NODE) && 931 (cur->children != cur->last) && 932 (cur->name != NULL) && 933 (cur->name[0] != 'p')) /* p, pre, param */ 934 xmlOutputBufferWriteString(buf, "\n"); 935 } 936 xmlOutputBufferWriteString(buf, "</"); 937 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { 938 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); 939 xmlOutputBufferWriteString(buf, ":"); 940 } 941 xmlOutputBufferWriteString(buf, (const char *)cur->name); 942 xmlOutputBufferWriteString(buf, ">"); 943 if ((format) && (info != NULL) && (!info->isinline) && 944 (cur->next != NULL)) { 945 if ((cur->next->type != HTML_TEXT_NODE) && 946 (cur->next->type != HTML_ENTITY_REF_NODE) && 947 (cur->parent != NULL) && 948 (cur->parent->name != NULL) && 949 (cur->parent->name[0] != 'p')) /* p, pre, param */ 950 xmlOutputBufferWriteString(buf, "\n"); 951 } 952} 953 954/** 955 * htmlNodeDumpOutput: 956 * @buf: the HTML buffer output 957 * @doc: the document 958 * @cur: the current node 959 * @encoding: the encoding string 960 * 961 * Dump an HTML node, recursive behaviour,children are printed too, 962 * and formatting returns/spaces are added. 963 */ 964void 965htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, 966 xmlNodePtr cur, const char *encoding) { 967 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, 1); 968} 969 970/** 971 * htmlDocContentDumpFormatOutput: 972 * @buf: the HTML buffer output 973 * @cur: the document 974 * @encoding: the encoding string 975 * @format: should formatting spaces been added 976 * 977 * Dump an HTML document. 978 */ 979void 980htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, 981 const char *encoding, int format) { 982 int type; 983 984 xmlInitParser(); 985 986 if ((buf == NULL) || (cur == NULL)) 987 return; 988 989 /* 990 * force to output the stuff as HTML, especially for entities 991 */ 992 type = cur->type; 993 cur->type = XML_HTML_DOCUMENT_NODE; 994 if (cur->intSubset != NULL) { 995 htmlDtdDumpOutput(buf, cur, NULL); 996 } 997 if (cur->children != NULL) { 998 htmlNodeListDumpOutput(buf, cur, cur->children, encoding, format); 999 } 1000 xmlOutputBufferWriteString(buf, "\n"); 1001 cur->type = (xmlElementType) type; 1002} 1003 1004/** 1005 * htmlDocContentDumpOutput: 1006 * @buf: the HTML buffer output 1007 * @cur: the document 1008 * @encoding: the encoding string 1009 * 1010 * Dump an HTML document. Formating return/spaces are added. 1011 */ 1012void 1013htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, 1014 const char *encoding) { 1015 htmlDocContentDumpFormatOutput(buf, cur, encoding, 1); 1016} 1017 1018/************************************************************************ 1019 * * 1020 * Saving functions front-ends * 1021 * * 1022 ************************************************************************/ 1023 1024/** 1025 * htmlDocDump: 1026 * @f: the FILE* 1027 * @cur: the document 1028 * 1029 * Dump an HTML document to an open FILE. 1030 * 1031 * returns: the number of byte written or -1 in case of failure. 1032 */ 1033int 1034htmlDocDump(FILE *f, xmlDocPtr cur) { 1035 xmlOutputBufferPtr buf; 1036 xmlCharEncodingHandlerPtr handler = NULL; 1037 const char *encoding; 1038 int ret; 1039 1040 xmlInitParser(); 1041 1042 if ((cur == NULL) || (f == NULL)) { 1043 return(-1); 1044 } 1045 1046 encoding = (const char *) htmlGetMetaEncoding(cur); 1047 1048 if (encoding != NULL) { 1049 xmlCharEncoding enc; 1050 1051 enc = xmlParseCharEncoding(encoding); 1052 if (enc != cur->charset) { 1053 if (cur->charset != XML_CHAR_ENCODING_UTF8) { 1054 /* 1055 * Not supported yet 1056 */ 1057 return(-1); 1058 } 1059 1060 handler = xmlFindCharEncodingHandler(encoding); 1061 if (handler == NULL) 1062 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); 1063 } else { 1064 handler = xmlFindCharEncodingHandler(encoding); 1065 } 1066 } 1067 1068 /* 1069 * Fallback to HTML or ASCII when the encoding is unspecified 1070 */ 1071 if (handler == NULL) 1072 handler = xmlFindCharEncodingHandler("HTML"); 1073 if (handler == NULL) 1074 handler = xmlFindCharEncodingHandler("ascii"); 1075 1076 buf = xmlOutputBufferCreateFile(f, handler); 1077 if (buf == NULL) return(-1); 1078 htmlDocContentDumpOutput(buf, cur, NULL); 1079 1080 ret = xmlOutputBufferClose(buf); 1081 return(ret); 1082} 1083 1084/** 1085 * htmlSaveFile: 1086 * @filename: the filename (or URL) 1087 * @cur: the document 1088 * 1089 * Dump an HTML document to a file. If @filename is "-" the stdout file is 1090 * used. 1091 * returns: the number of byte written or -1 in case of failure. 1092 */ 1093int 1094htmlSaveFile(const char *filename, xmlDocPtr cur) { 1095 xmlOutputBufferPtr buf; 1096 xmlCharEncodingHandlerPtr handler = NULL; 1097 const char *encoding; 1098 int ret; 1099 1100 if ((cur == NULL) || (filename == NULL)) 1101 return(-1); 1102 1103 xmlInitParser(); 1104 1105 encoding = (const char *) htmlGetMetaEncoding(cur); 1106 1107 if (encoding != NULL) { 1108 xmlCharEncoding enc; 1109 1110 enc = xmlParseCharEncoding(encoding); 1111 if (enc != cur->charset) { 1112 if (cur->charset != XML_CHAR_ENCODING_UTF8) { 1113 /* 1114 * Not supported yet 1115 */ 1116 return(-1); 1117 } 1118 1119 handler = xmlFindCharEncodingHandler(encoding); 1120 if (handler == NULL) 1121 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); 1122 } 1123 } 1124 1125 /* 1126 * Fallback to HTML or ASCII when the encoding is unspecified 1127 */ 1128 if (handler == NULL) 1129 handler = xmlFindCharEncodingHandler("HTML"); 1130 if (handler == NULL) 1131 handler = xmlFindCharEncodingHandler("ascii"); 1132 1133 /* 1134 * save the content to a temp buffer. 1135 */ 1136 buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression); 1137 if (buf == NULL) return(0); 1138 1139 htmlDocContentDumpOutput(buf, cur, NULL); 1140 1141 ret = xmlOutputBufferClose(buf); 1142 return(ret); 1143} 1144 1145/** 1146 * htmlSaveFileFormat: 1147 * @filename: the filename 1148 * @cur: the document 1149 * @format: should formatting spaces been added 1150 * @encoding: the document encoding 1151 * 1152 * Dump an HTML document to a file using a given encoding. 1153 * 1154 * returns: the number of byte written or -1 in case of failure. 1155 */ 1156int 1157htmlSaveFileFormat(const char *filename, xmlDocPtr cur, 1158 const char *encoding, int format) { 1159 xmlOutputBufferPtr buf; 1160 xmlCharEncodingHandlerPtr handler = NULL; 1161 int ret; 1162 1163 if ((cur == NULL) || (filename == NULL)) 1164 return(-1); 1165 1166 xmlInitParser(); 1167 1168 if (encoding != NULL) { 1169 xmlCharEncoding enc; 1170 1171 enc = xmlParseCharEncoding(encoding); 1172 if (enc != cur->charset) { 1173 if (cur->charset != XML_CHAR_ENCODING_UTF8) { 1174 /* 1175 * Not supported yet 1176 */ 1177 return(-1); 1178 } 1179 1180 handler = xmlFindCharEncodingHandler(encoding); 1181 if (handler == NULL) 1182 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); 1183 } 1184 htmlSetMetaEncoding(cur, (const xmlChar *) encoding); 1185 } else { 1186 htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8"); 1187 } 1188 1189 /* 1190 * Fallback to HTML or ASCII when the encoding is unspecified 1191 */ 1192 if (handler == NULL) 1193 handler = xmlFindCharEncodingHandler("HTML"); 1194 if (handler == NULL) 1195 handler = xmlFindCharEncodingHandler("ascii"); 1196 1197 /* 1198 * save the content to a temp buffer. 1199 */ 1200 buf = xmlOutputBufferCreateFilename(filename, handler, 0); 1201 if (buf == NULL) return(0); 1202 1203 htmlDocContentDumpFormatOutput(buf, cur, encoding, format); 1204 1205 ret = xmlOutputBufferClose(buf); 1206 return(ret); 1207} 1208 1209/** 1210 * htmlSaveFileEnc: 1211 * @filename: the filename 1212 * @cur: the document 1213 * @encoding: the document encoding 1214 * 1215 * Dump an HTML document to a file using a given encoding 1216 * and formatting returns/spaces are added. 1217 * 1218 * returns: the number of byte written or -1 in case of failure. 1219 */ 1220int 1221htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) { 1222 return(htmlSaveFileFormat(filename, cur, encoding, 1)); 1223} 1224 1225#endif /* LIBXML_OUTPUT_ENABLED */ 1226 1227#define bottom_HTMLtree 1228#include "elfgcchack.h" 1229#endif /* LIBXML_HTML_ENABLED */ 1230