1/* 2 * HTMLtree.c : implementation of access function for an HTML tree. 3 * 4 * See Copyright for the status of this software. 5 * 6 * daniel@veillard.com 7 */ 8 9 10#define IN_LIBXML 11#include "libxml.h" 12#ifdef LIBXML_HTML_ENABLED 13 14#include <string.h> /* for memset() only ! */ 15 16#ifdef HAVE_CTYPE_H 17#include <ctype.h> 18#endif 19#ifdef HAVE_STDLIB_H 20#include <stdlib.h> 21#endif 22 23#include <libxml/xmlmemory.h> 24#include <libxml/HTMLparser.h> 25#include <libxml/HTMLtree.h> 26#include <libxml/entities.h> 27#include <libxml/valid.h> 28#include <libxml/xmlerror.h> 29#include <libxml/parserInternals.h> 30#include <libxml/globals.h> 31#include <libxml/uri.h> 32 33/************************************************************************ 34 * * 35 * Getting/Setting encoding meta tags * 36 * * 37 ************************************************************************/ 38 39/** 40 * htmlGetMetaEncoding: 41 * @doc: the document 42 * 43 * Encoding definition lookup in the Meta tags 44 * 45 * Returns the current encoding as flagged in the HTML source 46 */ 47const xmlChar * 48htmlGetMetaEncoding(htmlDocPtr doc) { 49 htmlNodePtr cur; 50 const xmlChar *content; 51 const xmlChar *encoding; 52 53 if (doc == NULL) 54 return(NULL); 55 cur = doc->children; 56 57 /* 58 * Search the html 59 */ 60 while (cur != NULL) { 61 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 62 if (xmlStrEqual(cur->name, BAD_CAST"html")) 63 break; 64 if (xmlStrEqual(cur->name, BAD_CAST"head")) 65 goto found_head; 66 if (xmlStrEqual(cur->name, BAD_CAST"meta")) 67 goto found_meta; 68 } 69 cur = cur->next; 70 } 71 if (cur == NULL) 72 return(NULL); 73 cur = cur->children; 74 75 /* 76 * Search the head 77 */ 78 while (cur != NULL) { 79 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 80 if (xmlStrEqual(cur->name, BAD_CAST"head")) 81 break; 82 if (xmlStrEqual(cur->name, BAD_CAST"meta")) 83 goto found_meta; 84 } 85 cur = cur->next; 86 } 87 if (cur == NULL) 88 return(NULL); 89found_head: 90 cur = cur->children; 91 92 /* 93 * Search the meta elements 94 */ 95found_meta: 96 while (cur != NULL) { 97 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 98 if (xmlStrEqual(cur->name, BAD_CAST"meta")) { 99 xmlAttrPtr attr = cur->properties; 100 int http; 101 const xmlChar *value; 102 103 content = NULL; 104 http = 0; 105 while (attr != NULL) { 106 if ((attr->children != NULL) && 107 (attr->children->type == XML_TEXT_NODE) && 108 (attr->children->next == NULL)) { 109 value = attr->children->content; 110 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv")) 111 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) 112 http = 1; 113 else if ((value != NULL) 114 && (!xmlStrcasecmp(attr->name, BAD_CAST"content"))) 115 content = value; 116 if ((http != 0) && (content != NULL)) 117 goto found_content; 118 } 119 attr = attr->next; 120 } 121 } 122 } 123 cur = cur->next; 124 } 125 return(NULL); 126 127found_content: 128 encoding = xmlStrstr(content, BAD_CAST"charset="); 129 if (encoding == NULL) 130 encoding = xmlStrstr(content, BAD_CAST"Charset="); 131 if (encoding == NULL) 132 encoding = xmlStrstr(content, BAD_CAST"CHARSET="); 133 if (encoding != NULL) { 134 encoding += 8; 135 } else { 136 encoding = xmlStrstr(content, BAD_CAST"charset ="); 137 if (encoding == NULL) 138 encoding = xmlStrstr(content, BAD_CAST"Charset ="); 139 if (encoding == NULL) 140 encoding = xmlStrstr(content, BAD_CAST"CHARSET ="); 141 if (encoding != NULL) 142 encoding += 9; 143 } 144 if (encoding != NULL) { 145 while ((*encoding == ' ') || (*encoding == '\t')) encoding++; 146 } 147 return(encoding); 148} 149 150/** 151 * htmlSetMetaEncoding: 152 * @doc: the document 153 * @encoding: the encoding string 154 * 155 * Sets the current encoding in the Meta tags 156 * NOTE: this will not change the document content encoding, just 157 * the META flag associated. 158 * 159 * Returns 0 in case of success and -1 in case of error 160 */ 161int 162htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) { 163 htmlNodePtr cur, meta = NULL, head = NULL; 164 const xmlChar *content = NULL; 165 char newcontent[100]; 166 167 168 if (doc == NULL) 169 return(-1); 170 171 /* html isn't a real encoding it's just libxml2 way to get entities */ 172 if (!xmlStrcasecmp(encoding, BAD_CAST "html")) 173 return(-1); 174 175 if (encoding != NULL) { 176 snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s", 177 (char *)encoding); 178 newcontent[sizeof(newcontent) - 1] = 0; 179 } 180 181 cur = doc->children; 182 183 /* 184 * Search the html 185 */ 186 while (cur != NULL) { 187 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 188 if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0) 189 break; 190 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0) 191 goto found_head; 192 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) 193 goto found_meta; 194 } 195 cur = cur->next; 196 } 197 if (cur == NULL) 198 return(-1); 199 cur = cur->children; 200 201 /* 202 * Search the head 203 */ 204 while (cur != NULL) { 205 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 206 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0) 207 break; 208 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) { 209 head = cur->parent; 210 goto found_meta; 211 } 212 } 213 cur = cur->next; 214 } 215 if (cur == NULL) 216 return(-1); 217found_head: 218 head = cur; 219 if (cur->children == NULL) 220 goto create; 221 cur = cur->children; 222 223found_meta: 224 /* 225 * Search and update all the remaining the meta elements carrying 226 * encoding informations 227 */ 228 while (cur != NULL) { 229 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 230 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) { 231 xmlAttrPtr attr = cur->properties; 232 int http; 233 const xmlChar *value; 234 235 content = NULL; 236 http = 0; 237 while (attr != NULL) { 238 if ((attr->children != NULL) && 239 (attr->children->type == XML_TEXT_NODE) && 240 (attr->children->next == NULL)) { 241 value = attr->children->content; 242 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv")) 243 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) 244 http = 1; 245 else 246 { 247 if ((value != NULL) && 248 (!xmlStrcasecmp(attr->name, BAD_CAST"content"))) 249 content = value; 250 } 251 if ((http != 0) && (content != NULL)) 252 break; 253 } 254 attr = attr->next; 255 } 256 if ((http != 0) && (content != NULL)) { 257 meta = cur; 258 break; 259 } 260 261 } 262 } 263 cur = cur->next; 264 } 265create: 266 if (meta == NULL) { 267 if ((encoding != NULL) && (head != NULL)) { 268 /* 269 * Create a new Meta element with the right attributes 270 */ 271 272 meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL); 273 if (head->children == NULL) 274 xmlAddChild(head, meta); 275 else 276 xmlAddPrevSibling(head->children, meta); 277 xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type"); 278 xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent); 279 } 280 } else { 281 /* change the document only if there is a real encoding change */ 282 if (xmlStrcasestr(content, encoding) == NULL) { 283 xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent); 284 } 285 } 286 287 288 return(0); 289} 290 291/** 292 * booleanHTMLAttrs: 293 * 294 * These are the HTML attributes which will be output 295 * in minimized form, i.e. <option selected="selected"> will be 296 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method" 297 * 298 */ 299static const char* htmlBooleanAttrs[] = { 300 "checked", "compact", "declare", "defer", "disabled", "ismap", 301 "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly", 302 "selected", NULL 303}; 304 305 306/** 307 * htmlIsBooleanAttr: 308 * @name: the name of the attribute to check 309 * 310 * Determine if a given attribute is a boolean attribute. 311 * 312 * returns: false if the attribute is not boolean, true otherwise. 313 */ 314int 315htmlIsBooleanAttr(const xmlChar *name) 316{ 317 int i = 0; 318 319 while (htmlBooleanAttrs[i] != NULL) { 320 if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0) 321 return 1; 322 i++; 323 } 324 return 0; 325} 326 327#ifdef LIBXML_OUTPUT_ENABLED 328/* 329 * private routine exported from xmlIO.c 330 */ 331xmlOutputBufferPtr 332xmlAllocOutputBufferInternal(xmlCharEncodingHandlerPtr encoder); 333/************************************************************************ 334 * * 335 * Output error handlers * 336 * * 337 ************************************************************************/ 338/** 339 * htmlSaveErrMemory: 340 * @extra: extra informations 341 * 342 * Handle an out of memory condition 343 */ 344static void 345htmlSaveErrMemory(const char *extra) 346{ 347 __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra); 348} 349 350/** 351 * htmlSaveErr: 352 * @code: the error number 353 * @node: the location of the error. 354 * @extra: extra informations 355 * 356 * Handle an out of memory condition 357 */ 358static void 359htmlSaveErr(int code, xmlNodePtr node, const char *extra) 360{ 361 const char *msg = NULL; 362 363 switch(code) { 364 case XML_SAVE_NOT_UTF8: 365 msg = "string is not in UTF-8\n"; 366 break; 367 case XML_SAVE_CHAR_INVALID: 368 msg = "invalid character value\n"; 369 break; 370 case XML_SAVE_UNKNOWN_ENCODING: 371 msg = "unknown encoding %s\n"; 372 break; 373 case XML_SAVE_NO_DOCTYPE: 374 msg = "HTML has no DOCTYPE\n"; 375 break; 376 default: 377 msg = "unexpected error number\n"; 378 } 379 __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra); 380} 381 382/************************************************************************ 383 * * 384 * Dumping HTML tree content to a simple buffer * 385 * * 386 ************************************************************************/ 387 388static int 389htmlNodeDumpFormat(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, 390 int format); 391 392/** 393 * htmlNodeDumpFormat: 394 * @buf: the HTML buffer output 395 * @doc: the document 396 * @cur: the current node 397 * @format: should formatting spaces been added 398 * 399 * Dump an HTML node, recursive behaviour,children are printed too. 400 * 401 * Returns the number of byte written or -1 in case of error 402 */ 403static int 404htmlNodeDumpFormat(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, 405 int format) { 406 unsigned int use; 407 int ret; 408 xmlOutputBufferPtr outbuf; 409 410 if (cur == NULL) { 411 return (-1); 412 } 413 if (buf == NULL) { 414 return (-1); 415 } 416 outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer)); 417 if (outbuf == NULL) { 418 htmlSaveErrMemory("allocating HTML output buffer"); 419 return (-1); 420 } 421 memset(outbuf, 0, (size_t) sizeof(xmlOutputBuffer)); 422 outbuf->buffer = buf; 423 outbuf->encoder = NULL; 424 outbuf->writecallback = NULL; 425 outbuf->closecallback = NULL; 426 outbuf->context = NULL; 427 outbuf->written = 0; 428 429 use = buf->use; 430 htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format); 431 xmlFree(outbuf); 432 ret = buf->use - use; 433 return (ret); 434} 435 436/** 437 * htmlNodeDump: 438 * @buf: the HTML buffer output 439 * @doc: the document 440 * @cur: the current node 441 * 442 * Dump an HTML node, recursive behaviour,children are printed too, 443 * and formatting returns are added. 444 * 445 * Returns the number of byte written or -1 in case of error 446 */ 447int 448htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) { 449 xmlInitParser(); 450 451 return(htmlNodeDumpFormat(buf, doc, cur, 1)); 452} 453 454/** 455 * htmlNodeDumpFileFormat: 456 * @out: the FILE pointer 457 * @doc: the document 458 * @cur: the current node 459 * @encoding: the document encoding 460 * @format: should formatting spaces been added 461 * 462 * Dump an HTML node, recursive behaviour,children are printed too. 463 * 464 * TODO: if encoding == NULL try to save in the doc encoding 465 * 466 * returns: the number of byte written or -1 in case of failure. 467 */ 468int 469htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc, 470 xmlNodePtr cur, const char *encoding, int format) { 471 xmlOutputBufferPtr buf; 472 xmlCharEncodingHandlerPtr handler = NULL; 473 int ret; 474 475 xmlInitParser(); 476 477 if (encoding != NULL) { 478 xmlCharEncoding enc; 479 480 enc = xmlParseCharEncoding(encoding); 481 if (enc != XML_CHAR_ENCODING_UTF8) { 482 handler = xmlFindCharEncodingHandler(encoding); 483 if (handler == NULL) 484 return(-1); 485 } 486 } 487 488 /* 489 * Fallback to HTML or ASCII when the encoding is unspecified 490 */ 491 if (handler == NULL) 492 handler = xmlFindCharEncodingHandler("HTML"); 493 if (handler == NULL) 494 handler = xmlFindCharEncodingHandler("ascii"); 495 496 /* 497 * save the content to a temp buffer. 498 */ 499 buf = xmlOutputBufferCreateFile(out, handler); 500 if (buf == NULL) return(0); 501 502 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format); 503 504 ret = xmlOutputBufferClose(buf); 505 return(ret); 506} 507 508/** 509 * htmlNodeDumpFile: 510 * @out: the FILE pointer 511 * @doc: the document 512 * @cur: the current node 513 * 514 * Dump an HTML node, recursive behaviour,children are printed too, 515 * and formatting returns are added. 516 */ 517void 518htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) { 519 htmlNodeDumpFileFormat(out, doc, cur, NULL, 1); 520} 521 522/** 523 * htmlDocDumpMemoryFormat: 524 * @cur: the document 525 * @mem: OUT: the memory pointer 526 * @size: OUT: the memory length 527 * @format: should formatting spaces been added 528 * 529 * Dump an HTML document in memory and return the xmlChar * and it's size. 530 * It's up to the caller to free the memory. 531 */ 532void 533htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) { 534 xmlOutputBufferPtr buf; 535 xmlCharEncodingHandlerPtr handler = NULL; 536 const char *encoding; 537 538 xmlInitParser(); 539 540 if ((mem == NULL) || (size == NULL)) 541 return; 542 if (cur == NULL) { 543 *mem = NULL; 544 *size = 0; 545 return; 546 } 547 548 encoding = (const char *) htmlGetMetaEncoding(cur); 549 550 if (encoding != NULL) { 551 xmlCharEncoding enc; 552 553 enc = xmlParseCharEncoding(encoding); 554 if (enc != cur->charset) { 555 if (cur->charset != XML_CHAR_ENCODING_UTF8) { 556 /* 557 * Not supported yet 558 */ 559 *mem = NULL; 560 *size = 0; 561 return; 562 } 563 564 handler = xmlFindCharEncodingHandler(encoding); 565 if (handler == NULL) { 566 *mem = NULL; 567 *size = 0; 568 return; 569 } 570 } else { 571 handler = xmlFindCharEncodingHandler(encoding); 572 } 573 } 574 575 /* 576 * Fallback to HTML or ASCII when the encoding is unspecified 577 */ 578 if (handler == NULL) 579 handler = xmlFindCharEncodingHandler("HTML"); 580 if (handler == NULL) 581 handler = xmlFindCharEncodingHandler("ascii"); 582 583 buf = xmlAllocOutputBufferInternal(handler); 584 if (buf == NULL) { 585 *mem = NULL; 586 *size = 0; 587 return; 588 } 589 590 htmlDocContentDumpFormatOutput(buf, cur, NULL, format); 591 592 xmlOutputBufferFlush(buf); 593 if (buf->conv != NULL) { 594 *size = buf->conv->use; 595 *mem = xmlStrndup(buf->conv->content, *size); 596 } else { 597 *size = buf->buffer->use; 598 *mem = xmlStrndup(buf->buffer->content, *size); 599 } 600 (void)xmlOutputBufferClose(buf); 601} 602 603/** 604 * htmlDocDumpMemory: 605 * @cur: the document 606 * @mem: OUT: the memory pointer 607 * @size: OUT: the memory length 608 * 609 * Dump an HTML document in memory and return the xmlChar * and it's size. 610 * It's up to the caller to free the memory. 611 */ 612void 613htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) { 614 htmlDocDumpMemoryFormat(cur, mem, size, 1); 615} 616 617 618/************************************************************************ 619 * * 620 * Dumping HTML tree content to an I/O output buffer * 621 * * 622 ************************************************************************/ 623 624void xmlNsListDumpOutput(xmlOutputBufferPtr buf, xmlNsPtr cur); 625 626/** 627 * htmlDtdDumpOutput: 628 * @buf: the HTML buffer output 629 * @doc: the document 630 * @encoding: the encoding string 631 * 632 * TODO: check whether encoding is needed 633 * 634 * Dump the HTML document DTD, if any. 635 */ 636static void 637htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, 638 const char *encoding ATTRIBUTE_UNUSED) { 639 xmlDtdPtr cur = doc->intSubset; 640 641 if (cur == NULL) { 642 htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL); 643 return; 644 } 645 xmlOutputBufferWriteString(buf, "<!DOCTYPE "); 646 xmlOutputBufferWriteString(buf, (const char *)cur->name); 647 if (cur->ExternalID != NULL) { 648 xmlOutputBufferWriteString(buf, " PUBLIC "); 649 xmlBufferWriteQuotedString(buf->buffer, cur->ExternalID); 650 if (cur->SystemID != NULL) { 651 xmlOutputBufferWriteString(buf, " "); 652 xmlBufferWriteQuotedString(buf->buffer, cur->SystemID); 653 } 654 } else if (cur->SystemID != NULL) { 655 xmlOutputBufferWriteString(buf, " SYSTEM "); 656 xmlBufferWriteQuotedString(buf->buffer, cur->SystemID); 657 } 658 xmlOutputBufferWriteString(buf, ">\n"); 659} 660 661/** 662 * htmlAttrDumpOutput: 663 * @buf: the HTML buffer output 664 * @doc: the document 665 * @cur: the attribute pointer 666 * @encoding: the encoding string 667 * 668 * Dump an HTML attribute 669 */ 670static void 671htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, 672 const char *encoding ATTRIBUTE_UNUSED) { 673 xmlChar *value; 674 675 /* 676 * TODO: The html output method should not escape a & character 677 * occurring in an attribute value immediately followed by 678 * a { character (see Section B.7.1 of the HTML 4.0 Recommendation). 679 */ 680 681 if (cur == NULL) { 682 return; 683 } 684 xmlOutputBufferWriteString(buf, " "); 685 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { 686 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); 687 xmlOutputBufferWriteString(buf, ":"); 688 } 689 xmlOutputBufferWriteString(buf, (const char *)cur->name); 690 if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) { 691 value = xmlNodeListGetString(doc, cur->children, 0); 692 if (value) { 693 xmlOutputBufferWriteString(buf, "="); 694 if ((cur->ns == NULL) && (cur->parent != NULL) && 695 (cur->parent->ns == NULL) && 696 ((!xmlStrcasecmp(cur->name, BAD_CAST "href")) || 697 (!xmlStrcasecmp(cur->name, BAD_CAST "action")) || 698 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) || 699 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) && 700 (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) { 701 xmlChar *escaped; 702 xmlChar *tmp = value; 703 704 while (IS_BLANK_CH(*tmp)) tmp++; 705 706 escaped = xmlURIEscapeStr(tmp, BAD_CAST"@/:=?;#%&,+"); 707 if (escaped != NULL) { 708 xmlBufferWriteQuotedString(buf->buffer, escaped); 709 xmlFree(escaped); 710 } else { 711 xmlBufferWriteQuotedString(buf->buffer, value); 712 } 713 } else { 714 xmlBufferWriteQuotedString(buf->buffer, value); 715 } 716 xmlFree(value); 717 } else { 718 xmlOutputBufferWriteString(buf, "=\"\""); 719 } 720 } 721} 722 723/** 724 * htmlAttrListDumpOutput: 725 * @buf: the HTML buffer output 726 * @doc: the document 727 * @cur: the first attribute pointer 728 * @encoding: the encoding string 729 * 730 * Dump a list of HTML attributes 731 */ 732static void 733htmlAttrListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) { 734 if (cur == NULL) { 735 return; 736 } 737 while (cur != NULL) { 738 htmlAttrDumpOutput(buf, doc, cur, encoding); 739 cur = cur->next; 740 } 741} 742 743 744 745/** 746 * htmlNodeListDumpOutput: 747 * @buf: the HTML buffer output 748 * @doc: the document 749 * @cur: the first node 750 * @encoding: the encoding string 751 * @format: should formatting spaces been added 752 * 753 * Dump an HTML node list, recursive behaviour,children are printed too. 754 */ 755static void 756htmlNodeListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, 757 xmlNodePtr cur, const char *encoding, int format) { 758 if (cur == NULL) { 759 return; 760 } 761 while (cur != NULL) { 762 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format); 763 cur = cur->next; 764 } 765} 766 767/** 768 * htmlNodeDumpFormatOutput: 769 * @buf: the HTML buffer output 770 * @doc: the document 771 * @cur: the current node 772 * @encoding: the encoding string 773 * @format: should formatting spaces been added 774 * 775 * Dump an HTML node, recursive behaviour,children are printed too. 776 */ 777void 778htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, 779 xmlNodePtr cur, const char *encoding, int format) { 780 const htmlElemDesc * info; 781 782 xmlInitParser(); 783 784 if ((cur == NULL) || (buf == NULL)) { 785 return; 786 } 787 /* 788 * Special cases. 789 */ 790 if (cur->type == XML_DTD_NODE) 791 return; 792 if ((cur->type == XML_HTML_DOCUMENT_NODE) || 793 (cur->type == XML_DOCUMENT_NODE)){ 794 htmlDocContentDumpOutput(buf, (xmlDocPtr) cur, encoding); 795 return; 796 } 797 if (cur->type == XML_ATTRIBUTE_NODE) { 798 htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur, encoding); 799 return; 800 } 801 if (cur->type == HTML_TEXT_NODE) { 802 if (cur->content != NULL) { 803 if (((cur->name == (const xmlChar *)xmlStringText) || 804 (cur->name != (const xmlChar *)xmlStringTextNoenc)) && 805 ((cur->parent == NULL) || 806 ((xmlStrcasecmp(cur->parent->name, BAD_CAST "script")) && 807 (xmlStrcasecmp(cur->parent->name, BAD_CAST "style"))))) { 808 xmlChar *buffer; 809 810 buffer = xmlEncodeEntitiesReentrant(doc, cur->content); 811 if (buffer != NULL) { 812 xmlOutputBufferWriteString(buf, (const char *)buffer); 813 xmlFree(buffer); 814 } 815 } else { 816 xmlOutputBufferWriteString(buf, (const char *)cur->content); 817 } 818 } 819 return; 820 } 821 if (cur->type == HTML_COMMENT_NODE) { 822 if (cur->content != NULL) { 823 xmlOutputBufferWriteString(buf, "<!--"); 824 xmlOutputBufferWriteString(buf, (const char *)cur->content); 825 xmlOutputBufferWriteString(buf, "-->"); 826 } 827 return; 828 } 829 if (cur->type == HTML_PI_NODE) { 830 if (cur->name == NULL) 831 return; 832 xmlOutputBufferWriteString(buf, "<?"); 833 xmlOutputBufferWriteString(buf, (const char *)cur->name); 834 if (cur->content != NULL) { 835 xmlOutputBufferWriteString(buf, " "); 836 xmlOutputBufferWriteString(buf, (const char *)cur->content); 837 } 838 xmlOutputBufferWriteString(buf, ">"); 839 return; 840 } 841 if (cur->type == HTML_ENTITY_REF_NODE) { 842 xmlOutputBufferWriteString(buf, "&"); 843 xmlOutputBufferWriteString(buf, (const char *)cur->name); 844 xmlOutputBufferWriteString(buf, ";"); 845 return; 846 } 847 if (cur->type == HTML_PRESERVE_NODE) { 848 if (cur->content != NULL) { 849 xmlOutputBufferWriteString(buf, (const char *)cur->content); 850 } 851 return; 852 } 853 854 /* 855 * Get specific HTML info for that node. 856 */ 857 if (cur->ns == NULL) 858 info = htmlTagLookup(cur->name); 859 else 860 info = NULL; 861 862 xmlOutputBufferWriteString(buf, "<"); 863 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { 864 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); 865 xmlOutputBufferWriteString(buf, ":"); 866 } 867 xmlOutputBufferWriteString(buf, (const char *)cur->name); 868 if (cur->nsDef) 869 xmlNsListDumpOutput(buf, cur->nsDef); 870 if (cur->properties != NULL) 871 htmlAttrListDumpOutput(buf, doc, cur->properties, encoding); 872 873 if ((info != NULL) && (info->empty)) { 874 xmlOutputBufferWriteString(buf, ">"); 875 if ((format) && (!info->isinline) && (cur->next != NULL)) { 876 if ((cur->next->type != HTML_TEXT_NODE) && 877 (cur->next->type != HTML_ENTITY_REF_NODE) && 878 (cur->parent != NULL) && 879 (cur->parent->name != NULL) && 880 (cur->parent->name[0] != 'p')) /* p, pre, param */ 881 xmlOutputBufferWriteString(buf, "\n"); 882 } 883 return; 884 } 885 if (((cur->type == XML_ELEMENT_NODE) || (cur->content == NULL)) && 886 (cur->children == NULL)) { 887 if ((info != NULL) && (info->saveEndTag != 0) && 888 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) && 889 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) { 890 xmlOutputBufferWriteString(buf, ">"); 891 } else { 892 xmlOutputBufferWriteString(buf, "></"); 893 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { 894 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); 895 xmlOutputBufferWriteString(buf, ":"); 896 } 897 xmlOutputBufferWriteString(buf, (const char *)cur->name); 898 xmlOutputBufferWriteString(buf, ">"); 899 } 900 if ((format) && (cur->next != NULL) && 901 (info != NULL) && (!info->isinline)) { 902 if ((cur->next->type != HTML_TEXT_NODE) && 903 (cur->next->type != HTML_ENTITY_REF_NODE) && 904 (cur->parent != NULL) && 905 (cur->parent->name != NULL) && 906 (cur->parent->name[0] != 'p')) /* p, pre, param */ 907 xmlOutputBufferWriteString(buf, "\n"); 908 } 909 return; 910 } 911 xmlOutputBufferWriteString(buf, ">"); 912 if ((cur->type != XML_ELEMENT_NODE) && 913 (cur->content != NULL)) { 914 /* 915 * Uses the OutputBuffer property to automatically convert 916 * invalids to charrefs 917 */ 918 919 xmlOutputBufferWriteString(buf, (const char *) cur->content); 920 } 921 if (cur->children != NULL) { 922 if ((format) && (info != NULL) && (!info->isinline) && 923 (cur->children->type != HTML_TEXT_NODE) && 924 (cur->children->type != HTML_ENTITY_REF_NODE) && 925 (cur->children != cur->last) && 926 (cur->name != NULL) && 927 (cur->name[0] != 'p')) /* p, pre, param */ 928 xmlOutputBufferWriteString(buf, "\n"); 929 htmlNodeListDumpOutput(buf, doc, cur->children, encoding, format); 930 if ((format) && (info != NULL) && (!info->isinline) && 931 (cur->last->type != HTML_TEXT_NODE) && 932 (cur->last->type != HTML_ENTITY_REF_NODE) && 933 (cur->children != cur->last) && 934 (cur->name != NULL) && 935 (cur->name[0] != 'p')) /* p, pre, param */ 936 xmlOutputBufferWriteString(buf, "\n"); 937 } 938 xmlOutputBufferWriteString(buf, "</"); 939 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { 940 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); 941 xmlOutputBufferWriteString(buf, ":"); 942 } 943 xmlOutputBufferWriteString(buf, (const char *)cur->name); 944 xmlOutputBufferWriteString(buf, ">"); 945 if ((format) && (info != NULL) && (!info->isinline) && 946 (cur->next != NULL)) { 947 if ((cur->next->type != HTML_TEXT_NODE) && 948 (cur->next->type != HTML_ENTITY_REF_NODE) && 949 (cur->parent != NULL) && 950 (cur->parent->name != NULL) && 951 (cur->parent->name[0] != 'p')) /* p, pre, param */ 952 xmlOutputBufferWriteString(buf, "\n"); 953 } 954} 955 956/** 957 * htmlNodeDumpOutput: 958 * @buf: the HTML buffer output 959 * @doc: the document 960 * @cur: the current node 961 * @encoding: the encoding string 962 * 963 * Dump an HTML node, recursive behaviour,children are printed too, 964 * and formatting returns/spaces are added. 965 */ 966void 967htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, 968 xmlNodePtr cur, const char *encoding) { 969 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, 1); 970} 971 972/** 973 * htmlDocContentDumpFormatOutput: 974 * @buf: the HTML buffer output 975 * @cur: the document 976 * @encoding: the encoding string 977 * @format: should formatting spaces been added 978 * 979 * Dump an HTML document. 980 */ 981void 982htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, 983 const char *encoding, int format) { 984 int type; 985 986 xmlInitParser(); 987 988 if ((buf == NULL) || (cur == NULL)) 989 return; 990 991 /* 992 * force to output the stuff as HTML, especially for entities 993 */ 994 type = cur->type; 995 cur->type = XML_HTML_DOCUMENT_NODE; 996 if (cur->intSubset != NULL) { 997 htmlDtdDumpOutput(buf, cur, NULL); 998 } 999 if (cur->children != NULL) { 1000 htmlNodeListDumpOutput(buf, cur, cur->children, encoding, format); 1001 } 1002 xmlOutputBufferWriteString(buf, "\n"); 1003 cur->type = (xmlElementType) type; 1004} 1005 1006/** 1007 * htmlDocContentDumpOutput: 1008 * @buf: the HTML buffer output 1009 * @cur: the document 1010 * @encoding: the encoding string 1011 * 1012 * Dump an HTML document. Formating return/spaces are added. 1013 */ 1014void 1015htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, 1016 const char *encoding) { 1017 htmlDocContentDumpFormatOutput(buf, cur, encoding, 1); 1018} 1019 1020/************************************************************************ 1021 * * 1022 * Saving functions front-ends * 1023 * * 1024 ************************************************************************/ 1025 1026/** 1027 * htmlDocDump: 1028 * @f: the FILE* 1029 * @cur: the document 1030 * 1031 * Dump an HTML document to an open FILE. 1032 * 1033 * returns: the number of byte written or -1 in case of failure. 1034 */ 1035int 1036htmlDocDump(FILE *f, xmlDocPtr cur) { 1037 xmlOutputBufferPtr buf; 1038 xmlCharEncodingHandlerPtr handler = NULL; 1039 const char *encoding; 1040 int ret; 1041 1042 xmlInitParser(); 1043 1044 if ((cur == NULL) || (f == NULL)) { 1045 return(-1); 1046 } 1047 1048 encoding = (const char *) htmlGetMetaEncoding(cur); 1049 1050 if (encoding != NULL) { 1051 xmlCharEncoding enc; 1052 1053 enc = xmlParseCharEncoding(encoding); 1054 if (enc != cur->charset) { 1055 if (cur->charset != XML_CHAR_ENCODING_UTF8) { 1056 /* 1057 * Not supported yet 1058 */ 1059 return(-1); 1060 } 1061 1062 handler = xmlFindCharEncodingHandler(encoding); 1063 if (handler == NULL) 1064 return(-1); 1065 } else { 1066 handler = xmlFindCharEncodingHandler(encoding); 1067 } 1068 } 1069 1070 /* 1071 * Fallback to HTML or ASCII when the encoding is unspecified 1072 */ 1073 if (handler == NULL) 1074 handler = xmlFindCharEncodingHandler("HTML"); 1075 if (handler == NULL) 1076 handler = xmlFindCharEncodingHandler("ascii"); 1077 1078 buf = xmlOutputBufferCreateFile(f, handler); 1079 if (buf == NULL) return(-1); 1080 htmlDocContentDumpOutput(buf, cur, NULL); 1081 1082 ret = xmlOutputBufferClose(buf); 1083 return(ret); 1084} 1085 1086/** 1087 * htmlSaveFile: 1088 * @filename: the filename (or URL) 1089 * @cur: the document 1090 * 1091 * Dump an HTML document to a file. If @filename is "-" the stdout file is 1092 * used. 1093 * returns: the number of byte written or -1 in case of failure. 1094 */ 1095int 1096htmlSaveFile(const char *filename, xmlDocPtr cur) { 1097 xmlOutputBufferPtr buf; 1098 xmlCharEncodingHandlerPtr handler = NULL; 1099 const char *encoding; 1100 int ret; 1101 1102 if ((cur == NULL) || (filename == NULL)) 1103 return(-1); 1104 1105 xmlInitParser(); 1106 1107 encoding = (const char *) htmlGetMetaEncoding(cur); 1108 1109 if (encoding != NULL) { 1110 xmlCharEncoding enc; 1111 1112 enc = xmlParseCharEncoding(encoding); 1113 if (enc != cur->charset) { 1114 if (cur->charset != XML_CHAR_ENCODING_UTF8) { 1115 /* 1116 * Not supported yet 1117 */ 1118 return(-1); 1119 } 1120 1121 handler = xmlFindCharEncodingHandler(encoding); 1122 if (handler == NULL) 1123 return(-1); 1124 } 1125 } 1126 1127 /* 1128 * Fallback to HTML or ASCII when the encoding is unspecified 1129 */ 1130 if (handler == NULL) 1131 handler = xmlFindCharEncodingHandler("HTML"); 1132 if (handler == NULL) 1133 handler = xmlFindCharEncodingHandler("ascii"); 1134 1135 /* 1136 * save the content to a temp buffer. 1137 */ 1138 buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression); 1139 if (buf == NULL) return(0); 1140 1141 htmlDocContentDumpOutput(buf, cur, NULL); 1142 1143 ret = xmlOutputBufferClose(buf); 1144 return(ret); 1145} 1146 1147/** 1148 * htmlSaveFileFormat: 1149 * @filename: the filename 1150 * @cur: the document 1151 * @format: should formatting spaces been added 1152 * @encoding: the document encoding 1153 * 1154 * Dump an HTML document to a file using a given encoding. 1155 * 1156 * returns: the number of byte written or -1 in case of failure. 1157 */ 1158int 1159htmlSaveFileFormat(const char *filename, xmlDocPtr cur, 1160 const char *encoding, int format) { 1161 xmlOutputBufferPtr buf; 1162 xmlCharEncodingHandlerPtr handler = NULL; 1163 int ret; 1164 1165 if ((cur == NULL) || (filename == NULL)) 1166 return(-1); 1167 1168 xmlInitParser(); 1169 1170 if (encoding != NULL) { 1171 xmlCharEncoding enc; 1172 1173 enc = xmlParseCharEncoding(encoding); 1174 if (enc != cur->charset) { 1175 if (cur->charset != XML_CHAR_ENCODING_UTF8) { 1176 /* 1177 * Not supported yet 1178 */ 1179 return(-1); 1180 } 1181 1182 handler = xmlFindCharEncodingHandler(encoding); 1183 if (handler == NULL) 1184 return(-1); 1185 } 1186 htmlSetMetaEncoding(cur, (const xmlChar *) encoding); 1187 } else { 1188 htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8"); 1189 } 1190 1191 /* 1192 * Fallback to HTML or ASCII when the encoding is unspecified 1193 */ 1194 if (handler == NULL) 1195 handler = xmlFindCharEncodingHandler("HTML"); 1196 if (handler == NULL) 1197 handler = xmlFindCharEncodingHandler("ascii"); 1198 1199 /* 1200 * save the content to a temp buffer. 1201 */ 1202 buf = xmlOutputBufferCreateFilename(filename, handler, 0); 1203 if (buf == NULL) return(0); 1204 1205 htmlDocContentDumpFormatOutput(buf, cur, encoding, format); 1206 1207 ret = xmlOutputBufferClose(buf); 1208 return(ret); 1209} 1210 1211/** 1212 * htmlSaveFileEnc: 1213 * @filename: the filename 1214 * @cur: the document 1215 * @encoding: the document encoding 1216 * 1217 * Dump an HTML document to a file using a given encoding 1218 * and formatting returns/spaces are added. 1219 * 1220 * returns: the number of byte written or -1 in case of failure. 1221 */ 1222int 1223htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) { 1224 return(htmlSaveFileFormat(filename, cur, encoding, 1)); 1225} 1226 1227#endif /* LIBXML_OUTPUT_ENABLED */ 1228 1229#define bottom_HTMLtree 1230#include "elfgcchack.h" 1231#endif /* LIBXML_HTML_ENABLED */ 1232