1// Copyright (c) 2010 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "base/bind.h"
6#include "base/command_line.h"
7#include "base/compiler_specific.h"
8#include "base/containers/hash_tables.h"
9#include "base/files/file_path.h"
10#include "base/files/file_util.h"
11#include "base/strings/string_util.h"
12#include "base/strings/utf_string_conversions.h"
13#include "content/public/common/content_switches.h"
14#include "content/public/renderer/render_view.h"
15#include "content/public/renderer/render_view_observer.h"
16#include "content/public/test/content_browser_test.h"
17#include "content/public/test/content_browser_test_utils.h"
18#include "content/public/test/test_utils.h"
19#include "content/renderer/savable_resources.h"
20#include "content/shell/browser/shell.h"
21#include "net/base/filename_util.h"
22#include "net/url_request/url_request_context.h"
23#include "third_party/WebKit/public/platform/WebCString.h"
24#include "third_party/WebKit/public/platform/WebData.h"
25#include "third_party/WebKit/public/platform/WebString.h"
26#include "third_party/WebKit/public/platform/WebURL.h"
27#include "third_party/WebKit/public/platform/WebVector.h"
28#include "third_party/WebKit/public/web/WebDocument.h"
29#include "third_party/WebKit/public/web/WebElement.h"
30#include "third_party/WebKit/public/web/WebElementCollection.h"
31#include "third_party/WebKit/public/web/WebLocalFrame.h"
32#include "third_party/WebKit/public/web/WebNode.h"
33#include "third_party/WebKit/public/web/WebNodeList.h"
34#include "third_party/WebKit/public/web/WebPageSerializer.h"
35#include "third_party/WebKit/public/web/WebPageSerializerClient.h"
36#include "third_party/WebKit/public/web/WebView.h"
37
38using blink::WebCString;
39using blink::WebData;
40using blink::WebDocument;
41using blink::WebElement;
42using blink::WebElementCollection;
43using blink::WebFrame;
44using blink::WebLocalFrame;
45using blink::WebNode;
46using blink::WebNodeList;
47using blink::WebPageSerializer;
48using blink::WebPageSerializerClient;
49using blink::WebString;
50using blink::WebURL;
51using blink::WebView;
52using blink::WebVector;
53
namespace {

// Routing ID of the first RenderView. The first RenderFrame takes routing
// ID 1, which leaves 2 for the first RenderView.
const int kRenderViewRoutingId = 2;

}  // namespace
60
61namespace content {
62
63// Iterate recursively over sub-frames to find one with with a given url.
64WebFrame* FindSubFrameByURL(WebView* web_view, const GURL& url) {
65  if (!web_view->mainFrame())
66    return NULL;
67
68  std::vector<WebFrame*> stack;
69  stack.push_back(web_view->mainFrame());
70
71  while (!stack.empty()) {
72    WebFrame* current_frame = stack.back();
73    stack.pop_back();
74    if (GURL(current_frame->document().url()) == url)
75      return current_frame;
76    WebElementCollection all = current_frame->document().all();
77    for (WebElement element = all.firstItem();
78         !element.isNull(); element = all.nextItem()) {
79      // Check frame tag and iframe tag
80      if (!element.hasHTMLTagName("frame") && !element.hasHTMLTagName("iframe"))
81        continue;
82      WebFrame* sub_frame = WebLocalFrame::fromFrameOwnerElement(element);
83      if (sub_frame)
84        stack.push_back(sub_frame);
85    }
86  }
87  return NULL;
88}
89
90// Helper function that test whether the first node in the doc is a doc type
91// node.
92bool HasDocType(const WebDocument& doc) {
93  WebNode node = doc.firstChild();
94  if (node.isNull())
95    return false;
96  return node.nodeType() == WebNode::DocumentTypeNode;
97}
98
// Helper function for checking whether the input node is a META element.
// Returns true if it is a META element, otherwise returns false. The parameter
// |charset_info| receives the actual charset info if the META tag has a
// charset declaration.
102bool IsMetaElement(const WebNode& node, std::string& charset_info) {
103  if (!node.isElementNode())
104    return false;
105  const WebElement meta = node.toConst<WebElement>();
106  if (!meta.hasHTMLTagName("meta"))
107    return false;
108  charset_info.erase(0, charset_info.length());
109  // Check the META charset declaration.
110  WebString httpEquiv = meta.getAttribute("http-equiv");
111  if (LowerCaseEqualsASCII(httpEquiv, "content-type")) {
112    std::string content = meta.getAttribute("content").utf8();
113    int pos = content.find("charset", 0);
114    if (pos > -1) {
115      // Add a dummy charset declaration to charset_info, which indicates this
116      // META tag has charset declaration although we do not get correct value
117      // yet.
118      charset_info.append("has-charset-declaration");
119      int remaining_length = content.length() - pos - 7;
120      if (!remaining_length)
121        return true;
122      int start_pos = pos + 7;
123      // Find "=" symbol.
124      while (remaining_length--)
125        if (content[start_pos++] == L'=')
126          break;
127      // Skip beginning space.
128      while (remaining_length) {
129        if (content[start_pos] > 0x0020)
130          break;
131        ++start_pos;
132        --remaining_length;
133      }
134      if (!remaining_length)
135        return true;
136      int end_pos = start_pos;
137      // Now we find out the start point of charset info. Search the end point.
138      while (remaining_length--) {
139        if (content[end_pos] <= 0x0020 || content[end_pos] == L';')
140          break;
141        ++end_pos;
142      }
143      // Get actual charset info.
144      charset_info = content.substr(start_pos, end_pos - start_pos);
145      return true;
146    }
147  }
148  return true;
149}
150
151class LoadObserver : public RenderViewObserver {
152 public:
153  LoadObserver(RenderView* render_view, const base::Closure& quit_closure)
154      : RenderViewObserver(render_view),
155        quit_closure_(quit_closure) {}
156
157  virtual void DidFinishLoad(blink::WebLocalFrame* frame) OVERRIDE {
158    if (frame == render_view()->GetWebView()->mainFrame())
159      quit_closure_.Run();
160  }
161
162 private:
163  base::Closure quit_closure_;
164};
165
166class DomSerializerTests : public ContentBrowserTest,
167                           public WebPageSerializerClient {
168 public:
169  DomSerializerTests()
170    : serialized_(false),
171      local_directory_name_(FILE_PATH_LITERAL("./dummy_files/")) {}
172
173  virtual void SetUpCommandLine(CommandLine* command_line) OVERRIDE {
174    command_line->AppendSwitch(switches::kSingleProcess);
175#if defined(OS_WIN)
176    // Don't want to try to create a GPU process.
177    command_line->AppendSwitch(switches::kDisableGpu);
178#endif
179  }
180
181  // DomSerializerDelegate.
182  virtual void didSerializeDataForFrame(const WebURL& frame_web_url,
183                                        const WebCString& data,
184                                        PageSerializationStatus status) {
185
186    GURL frame_url(frame_web_url);
187    // If the all frames are finished saving, check all finish status
188    if (status == WebPageSerializerClient::AllFramesAreFinished) {
189      SerializationFinishStatusMap::iterator it =
190          serialization_finish_status_.begin();
191      for (; it != serialization_finish_status_.end(); ++it)
192        ASSERT_TRUE(it->second);
193      serialized_ = true;
194      return;
195    }
196
197    // Check finish status of current frame.
198    SerializationFinishStatusMap::iterator it =
199        serialization_finish_status_.find(frame_url.spec());
200    // New frame, set initial status as false.
201    if (it == serialization_finish_status_.end())
202      serialization_finish_status_[frame_url.spec()] = false;
203
204    it = serialization_finish_status_.find(frame_url.spec());
205    ASSERT_TRUE(it != serialization_finish_status_.end());
206    // In process frame, finish status should be false.
207    ASSERT_FALSE(it->second);
208
209    // Add data to corresponding frame's content.
210    serialized_frame_map_[frame_url.spec()] += data.data();
211
212    // Current frame is completed saving, change the finish status.
213    if (status == WebPageSerializerClient::CurrentFrameIsFinished)
214      it->second = true;
215  }
216
217  bool HasSerializedFrame(const GURL& frame_url) {
218    return serialized_frame_map_.find(frame_url.spec()) !=
219           serialized_frame_map_.end();
220  }
221
222  const std::string& GetSerializedContentForFrame(
223      const GURL& frame_url) {
224    return serialized_frame_map_[frame_url.spec()];
225  }
226
227  RenderView* GetRenderView() {
228    // We could have the test on the UI thread get the WebContent's routing ID,
229    // but we know this will be the first RV so skip that and just hardcode it.
230    return RenderView::FromRoutingID(kRenderViewRoutingId);
231  }
232
233  WebView* GetWebView() {
234    return GetRenderView()->GetWebView();
235  }
236
237  WebFrame* GetMainFrame() {
238    return GetWebView()->mainFrame();
239  }
240
241  // Load web page according to input content and relative URLs within
242  // the document.
243  void LoadContents(const std::string& contents,
244                    const GURL& base_url,
245                    const WebString encoding_info) {
246    scoped_refptr<MessageLoopRunner> runner = new MessageLoopRunner;
247    LoadObserver observer(GetRenderView(), runner->QuitClosure());
248
249    // If input encoding is empty, use UTF-8 as default encoding.
250    if (encoding_info.isEmpty()) {
251      GetMainFrame()->loadHTMLString(contents, base_url);
252    } else {
253      WebData data(contents.data(), contents.length());
254
255      // Do not use WebFrame.LoadHTMLString because it assumes that input
256      // html contents use UTF-8 encoding.
257      // TODO(darin): This should use WebFrame::loadData.
258      WebFrame* web_frame = GetMainFrame();
259
260      ASSERT_TRUE(web_frame != NULL);
261
262      web_frame->loadData(data, "text/html", encoding_info, base_url);
263    }
264
265    runner->Run();
266  }
267
268  // Serialize page DOM according to specific page URL. The parameter
269  // recursive_serialization indicates whether we will serialize all
270  // sub-frames.
271  void SerializeDomForURL(const GURL& page_url,
272                          bool recursive_serialization) {
273    // Find corresponding WebFrame according to page_url.
274    WebFrame* web_frame = FindSubFrameByURL(GetWebView(), page_url);
275    ASSERT_TRUE(web_frame != NULL);
276    WebVector<WebURL> links;
277    links.assign(&page_url, 1);
278    WebString file_path =
279        base::FilePath(FILE_PATH_LITERAL("c:\\dummy.htm")).AsUTF16Unsafe();
280    WebVector<WebString> local_paths;
281    local_paths.assign(&file_path, 1);
282    // Start serializing DOM.
283    bool result = WebPageSerializer::serialize(web_frame->toWebLocalFrame(),
284       recursive_serialization,
285       static_cast<WebPageSerializerClient*>(this),
286       links,
287       local_paths,
288       local_directory_name_.AsUTF16Unsafe());
289    ASSERT_TRUE(result);
290    ASSERT_TRUE(serialized_);
291  }
292
293  void SerializeHTMLDOMWithDocTypeOnRenderer(const GURL& file_url) {
294    // Make sure original contents have document type.
295    WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
296    ASSERT_TRUE(web_frame != NULL);
297    WebDocument doc = web_frame->document();
298    ASSERT_TRUE(HasDocType(doc));
299    // Do serialization.
300    SerializeDomForURL(file_url, false);
301    // Load the serialized contents.
302    ASSERT_TRUE(HasSerializedFrame(file_url));
303    const std::string& serialized_contents =
304        GetSerializedContentForFrame(file_url);
305    LoadContents(serialized_contents, file_url,
306                 web_frame->document().encoding());
307    // Make sure serialized contents still have document type.
308    web_frame = GetMainFrame();
309    doc = web_frame->document();
310    ASSERT_TRUE(HasDocType(doc));
311  }
312
313  void SerializeHTMLDOMWithoutDocTypeOnRenderer(const GURL& file_url) {
314    // Make sure original contents do not have document type.
315    WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
316    ASSERT_TRUE(web_frame != NULL);
317    WebDocument doc = web_frame->document();
318    ASSERT_TRUE(!HasDocType(doc));
319    // Do serialization.
320    SerializeDomForURL(file_url, false);
321    // Load the serialized contents.
322    ASSERT_TRUE(HasSerializedFrame(file_url));
323    const std::string& serialized_contents =
324        GetSerializedContentForFrame(file_url);
325    LoadContents(serialized_contents, file_url,
326                 web_frame->document().encoding());
327    // Make sure serialized contents do not have document type.
328    web_frame = GetMainFrame();
329    doc = web_frame->document();
330    ASSERT_TRUE(!HasDocType(doc));
331  }
332
333  void SerializeXMLDocWithBuiltInEntitiesOnRenderer(
334      const GURL& xml_file_url, const std::string& original_contents) {
335    // Do serialization.
336    SerializeDomForURL(xml_file_url, false);
337    // Compare the serialized contents with original contents.
338    ASSERT_TRUE(HasSerializedFrame(xml_file_url));
339    const std::string& serialized_contents =
340        GetSerializedContentForFrame(xml_file_url);
341    ASSERT_EQ(original_contents, serialized_contents);
342  }
343
344  void SerializeHTMLDOMWithAddingMOTWOnRenderer(
345      const GURL& file_url, const std::string& original_contents) {
346    // Make sure original contents does not have MOTW;
347    std::string motw_declaration =
348       WebPageSerializer::generateMarkOfTheWebDeclaration(file_url).utf8();
349    ASSERT_FALSE(motw_declaration.empty());
350    // The encoding of original contents is ISO-8859-1, so we convert the MOTW
351    // declaration to ASCII and search whether original contents has it or not.
352    ASSERT_TRUE(std::string::npos == original_contents.find(motw_declaration));
353
354    // Do serialization.
355    SerializeDomForURL(file_url, false);
356    // Make sure the serialized contents have MOTW ;
357    ASSERT_TRUE(HasSerializedFrame(file_url));
358    const std::string& serialized_contents =
359        GetSerializedContentForFrame(file_url);
360    ASSERT_FALSE(std::string::npos ==
361        serialized_contents.find(motw_declaration));
362  }
363
364  void SerializeHTMLDOMWithNoMetaCharsetInOriginalDocOnRenderer(
365      const GURL& file_url) {
366    // Make sure there is no META charset declaration in original document.
367    WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
368    ASSERT_TRUE(web_frame != NULL);
369    WebDocument doc = web_frame->document();
370    ASSERT_TRUE(doc.isHTMLDocument());
371    WebElement head_element = doc.head();
372    ASSERT_TRUE(!head_element.isNull());
373    // Go through all children of HEAD element.
374    for (WebNode child = head_element.firstChild(); !child.isNull();
375         child = child.nextSibling()) {
376      std::string charset_info;
377      if (IsMetaElement(child, charset_info))
378        ASSERT_TRUE(charset_info.empty());
379    }
380    // Do serialization.
381    SerializeDomForURL(file_url, false);
382
383    // Load the serialized contents.
384    ASSERT_TRUE(HasSerializedFrame(file_url));
385    const std::string& serialized_contents =
386        GetSerializedContentForFrame(file_url);
387    LoadContents(serialized_contents, file_url,
388                 web_frame->document().encoding());
389    // Make sure the first child of HEAD element is META which has charset
390    // declaration in serialized contents.
391    web_frame = GetMainFrame();
392    ASSERT_TRUE(web_frame != NULL);
393    doc = web_frame->document();
394    ASSERT_TRUE(doc.isHTMLDocument());
395    head_element = doc.head();
396    ASSERT_TRUE(!head_element.isNull());
397    WebNode meta_node = head_element.firstChild();
398    ASSERT_TRUE(!meta_node.isNull());
399    // Get meta charset info.
400    std::string charset_info2;
401    ASSERT_TRUE(IsMetaElement(meta_node, charset_info2));
402    ASSERT_TRUE(!charset_info2.empty());
403    ASSERT_EQ(charset_info2,
404              std::string(web_frame->document().encoding().utf8()));
405
406    // Make sure no more additional META tags which have charset declaration.
407    for (WebNode child = meta_node.nextSibling(); !child.isNull();
408         child = child.nextSibling()) {
409      std::string charset_info;
410      if (IsMetaElement(child, charset_info))
411        ASSERT_TRUE(charset_info.empty());
412    }
413  }
414
415  void SerializeHTMLDOMWithMultipleMetaCharsetInOriginalDocOnRenderer(
416      const GURL& file_url) {
417    // Make sure there are multiple META charset declarations in original
418    // document.
419    WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
420    ASSERT_TRUE(web_frame != NULL);
421    WebDocument doc = web_frame->document();
422    ASSERT_TRUE(doc.isHTMLDocument());
423    WebElement head_ele = doc.head();
424    ASSERT_TRUE(!head_ele.isNull());
425    // Go through all children of HEAD element.
426    int charset_declaration_count = 0;
427    for (WebNode child = head_ele.firstChild(); !child.isNull();
428         child = child.nextSibling()) {
429      std::string charset_info;
430      if (IsMetaElement(child, charset_info) && !charset_info.empty())
431        charset_declaration_count++;
432    }
433    // The original doc has more than META tags which have charset declaration.
434    ASSERT_TRUE(charset_declaration_count > 1);
435
436    // Do serialization.
437    SerializeDomForURL(file_url, false);
438
439    // Load the serialized contents.
440    ASSERT_TRUE(HasSerializedFrame(file_url));
441    const std::string& serialized_contents =
442        GetSerializedContentForFrame(file_url);
443    LoadContents(serialized_contents, file_url,
444                 web_frame->document().encoding());
445    // Make sure only first child of HEAD element is META which has charset
446    // declaration in serialized contents.
447    web_frame = GetMainFrame();
448    ASSERT_TRUE(web_frame != NULL);
449    doc = web_frame->document();
450    ASSERT_TRUE(doc.isHTMLDocument());
451    head_ele = doc.head();
452    ASSERT_TRUE(!head_ele.isNull());
453    WebNode meta_node = head_ele.firstChild();
454    ASSERT_TRUE(!meta_node.isNull());
455    // Get meta charset info.
456    std::string charset_info2;
457    ASSERT_TRUE(IsMetaElement(meta_node, charset_info2));
458    ASSERT_TRUE(!charset_info2.empty());
459    ASSERT_EQ(charset_info2,
460              std::string(web_frame->document().encoding().utf8()));
461
462    // Make sure no more additional META tags which have charset declaration.
463    for (WebNode child = meta_node.nextSibling(); !child.isNull();
464         child = child.nextSibling()) {
465      std::string charset_info;
466      if (IsMetaElement(child, charset_info))
467        ASSERT_TRUE(charset_info.empty());
468    }
469  }
470
471  void SerializeHTMLDOMWithEntitiesInTextOnRenderer() {
472    base::FilePath page_file_path = GetTestFilePath(
473        "dom_serializer", "dom_serializer/htmlentities_in_text.htm");
474    // Get file URL. The URL is dummy URL to identify the following loading
475    // actions. The test content is in constant:original_contents.
476    GURL file_url = net::FilePathToFileURL(page_file_path);
477    ASSERT_TRUE(file_url.SchemeIsFile());
478    // Test contents.
479    static const char* const original_contents =
480        "<html><body>&amp;&lt;&gt;\"\'</body></html>";
481    // Load the test contents.
482    LoadContents(original_contents, file_url, WebString());
483
484    // Get BODY's text content in DOM.
485    WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
486    ASSERT_TRUE(web_frame != NULL);
487    WebDocument doc = web_frame->document();
488    ASSERT_TRUE(doc.isHTMLDocument());
489    WebElement body_ele = doc.body();
490    ASSERT_TRUE(!body_ele.isNull());
491    WebNode text_node = body_ele.firstChild();
492    ASSERT_TRUE(text_node.isTextNode());
493    ASSERT_TRUE(std::string(text_node.createMarkup().utf8()) ==
494                "&amp;&lt;&gt;\"\'");
495    // Do serialization.
496    SerializeDomForURL(file_url, false);
497    // Compare the serialized contents with original contents.
498    ASSERT_TRUE(HasSerializedFrame(file_url));
499    const std::string& serialized_contents =
500        GetSerializedContentForFrame(file_url);
501    // Compare the serialized contents with original contents to make sure
502    // they are same.
503    // Because we add MOTW when serializing DOM, so before comparison, we also
504    // need to add MOTW to original_contents.
505    std::string original_str =
506      WebPageSerializer::generateMarkOfTheWebDeclaration(file_url).utf8();
507    original_str += original_contents;
508    // Since WebCore now inserts a new HEAD element if there is no HEAD element
509    // when creating BODY element. (Please see
510    // HTMLParser::bodyCreateErrorCheck.) We need to append the HEAD content and
511    // corresponding META content if we find WebCore-generated HEAD element.
512    if (!doc.head().isNull()) {
513      WebString encoding = web_frame->document().encoding();
514      std::string htmlTag("<html>");
515      std::string::size_type pos = original_str.find(htmlTag);
516      ASSERT_NE(std::string::npos, pos);
517      pos += htmlTag.length();
518      std::string head_part("<head>");
519      head_part +=
520          WebPageSerializer::generateMetaCharsetDeclaration(encoding).utf8();
521      head_part += "</head>";
522      original_str.insert(pos, head_part);
523    }
524    ASSERT_EQ(original_str, serialized_contents);
525  }
526
527  void SerializeHTMLDOMWithEntitiesInAttributeValueOnRenderer() {
528    base::FilePath page_file_path = GetTestFilePath(
529        "dom_serializer", "dom_serializer/htmlentities_in_attribute_value.htm");
530    // Get file URL. The URL is dummy URL to identify the following loading
531    // actions. The test content is in constant:original_contents.
532    GURL file_url = net::FilePathToFileURL(page_file_path);
533    ASSERT_TRUE(file_url.SchemeIsFile());
534    // Test contents.
535    static const char* const original_contents =
536        "<html><body title=\"&amp;&lt;&gt;&quot;&#39;\"></body></html>";
537    // Load the test contents.
538    LoadContents(original_contents, file_url, WebString());
539    // Get value of BODY's title attribute in DOM.
540    WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
541    ASSERT_TRUE(web_frame != NULL);
542    WebDocument doc = web_frame->document();
543    ASSERT_TRUE(doc.isHTMLDocument());
544    WebElement body_ele = doc.body();
545    ASSERT_TRUE(!body_ele.isNull());
546    WebString value = body_ele.getAttribute("title");
547    ASSERT_TRUE(std::string(value.utf8()) == "&<>\"\'");
548    // Do serialization.
549    SerializeDomForURL(file_url, false);
550    // Compare the serialized contents with original contents.
551    ASSERT_TRUE(HasSerializedFrame(file_url));
552    const std::string& serialized_contents =
553        GetSerializedContentForFrame(file_url);
554    // Compare the serialized contents with original contents to make sure
555    // they are same.
556    std::string original_str =
557        WebPageSerializer::generateMarkOfTheWebDeclaration(file_url).utf8();
558    original_str += original_contents;
559    if (!doc.isNull()) {
560      WebString encoding = web_frame->document().encoding();
561      std::string htmlTag("<html>");
562      std::string::size_type pos = original_str.find(htmlTag);
563      ASSERT_NE(std::string::npos, pos);
564      pos += htmlTag.length();
565      std::string head_part("<head>");
566      head_part +=
567          WebPageSerializer::generateMetaCharsetDeclaration(encoding).utf8();
568      head_part += "</head>";
569      original_str.insert(pos, head_part);
570    }
571    ASSERT_EQ(original_str, serialized_contents);
572  }
573
574  void SerializeHTMLDOMWithNonStandardEntitiesOnRenderer(const GURL& file_url) {
575    // Get value of BODY's title attribute in DOM.
576    WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
577    WebDocument doc = web_frame->document();
578    ASSERT_TRUE(doc.isHTMLDocument());
579    WebElement body_element = doc.body();
580    // Unescaped string for "&percnt;&nsup;&sup1;&apos;".
581    static const wchar_t parsed_value[] = {
582      '%', 0x2285, 0x00b9, '\'', 0
583    };
584    WebString value = body_element.getAttribute("title");
585    ASSERT_TRUE(base::UTF16ToWide(value) == parsed_value);
586    ASSERT_TRUE(base::UTF16ToWide(body_element.innerText()) == parsed_value);
587
588    // Do serialization.
589    SerializeDomForURL(file_url, false);
590    // Check the serialized string.
591    ASSERT_TRUE(HasSerializedFrame(file_url));
592    const std::string& serialized_contents =
593        GetSerializedContentForFrame(file_url);
594    // Confirm that the serialized string has no non-standard HTML entities.
595    ASSERT_EQ(std::string::npos, serialized_contents.find("&percnt;"));
596    ASSERT_EQ(std::string::npos, serialized_contents.find("&nsup;"));
597    ASSERT_EQ(std::string::npos, serialized_contents.find("&sup1;"));
598    ASSERT_EQ(std::string::npos, serialized_contents.find("&apos;"));
599  }
600
601  void SerializeHTMLDOMWithBaseTagOnRenderer(const GURL& file_url,
602                                             const GURL& path_dir_url) {
603    // There are total 2 available base tags in this test file.
604    const int kTotalBaseTagCountInTestFile = 2;
605
606    // Since for this test, we assume there is no savable sub-resource links for
607    // this test file, also all links are relative URLs in this test file, so we
608    // need to check those relative URLs and make sure document has BASE tag.
609    WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
610    ASSERT_TRUE(web_frame != NULL);
611    WebDocument doc = web_frame->document();
612    ASSERT_TRUE(doc.isHTMLDocument());
613    // Go through all descent nodes.
614    WebElementCollection all = doc.all();
615    int original_base_tag_count = 0;
616    for (WebElement element = all.firstItem(); !element.isNull();
617         element = all.nextItem()) {
618      if (element.hasHTMLTagName("base")) {
619        original_base_tag_count++;
620      } else {
621        // Get link.
622        WebString value = GetSubResourceLinkFromElement(element);
623        if (value.isNull() && element.hasHTMLTagName("a")) {
624          value = element.getAttribute("href");
625          if (value.isEmpty())
626            value = WebString();
627        }
628        // Each link is relative link.
629        if (!value.isNull()) {
630          GURL link(value.utf8());
631          ASSERT_TRUE(link.scheme().empty());
632        }
633      }
634    }
635    ASSERT_EQ(original_base_tag_count, kTotalBaseTagCountInTestFile);
636    // Make sure in original document, the base URL is not equal with the
637    // |path_dir_url|.
638    GURL original_base_url(doc.baseURL());
639    ASSERT_NE(original_base_url, path_dir_url);
640
641    // Do serialization.
642    SerializeDomForURL(file_url, false);
643
644    // Load the serialized contents.
645    ASSERT_TRUE(HasSerializedFrame(file_url));
646    const std::string& serialized_contents =
647        GetSerializedContentForFrame(file_url);
648    LoadContents(serialized_contents, file_url,
649                 web_frame->document().encoding());
650
651    // Make sure all links are absolute URLs and doc there are some number of
652    // BASE tags in serialized HTML data. Each of those BASE tags have same base
653    // URL which is as same as URL of current test file.
654    web_frame = GetMainFrame();
655    ASSERT_TRUE(web_frame != NULL);
656    doc = web_frame->document();
657    ASSERT_TRUE(doc.isHTMLDocument());
658    // Go through all descent nodes.
659    all = doc.all();
660    int new_base_tag_count = 0;
661    for (WebNode node = all.firstItem(); !node.isNull();
662         node = all.nextItem()) {
663      if (!node.isElementNode())
664        continue;
665      WebElement element = node.to<WebElement>();
666      if (element.hasHTMLTagName("base")) {
667        new_base_tag_count++;
668      } else {
669        // Get link.
670        WebString value = GetSubResourceLinkFromElement(element);
671        if (value.isNull() && element.hasHTMLTagName("a")) {
672          value = element.getAttribute("href");
673          if (value.isEmpty())
674            value = WebString();
675        }
676        // Each link is absolute link.
677        if (!value.isNull()) {
678          GURL link(std::string(value.utf8()));
679          ASSERT_FALSE(link.scheme().empty());
680        }
681      }
682    }
683    // We have one more added BASE tag which is generated by JavaScript.
684    ASSERT_EQ(new_base_tag_count, original_base_tag_count + 1);
685    // Make sure in new document, the base URL is equal with the |path_dir_url|.
686    GURL new_base_url(doc.baseURL());
687    ASSERT_EQ(new_base_url, path_dir_url);
688  }
689
690  void SerializeHTMLDOMWithEmptyHeadOnRenderer() {
691    base::FilePath page_file_path = GetTestFilePath(
692        "dom_serializer", "empty_head.htm");
693    GURL file_url = net::FilePathToFileURL(page_file_path);
694    ASSERT_TRUE(file_url.SchemeIsFile());
695
696    // Load the test html content.
697    static const char* const empty_head_contents =
698      "<html><head></head><body>hello world</body></html>";
699    LoadContents(empty_head_contents, file_url, WebString());
700
701    // Make sure the head tag is empty.
702    WebFrame* web_frame = GetMainFrame();
703    ASSERT_TRUE(web_frame != NULL);
704    WebDocument doc = web_frame->document();
705    ASSERT_TRUE(doc.isHTMLDocument());
706    WebElement head_element = doc.head();
707    ASSERT_TRUE(!head_element.isNull());
708    ASSERT_TRUE(!head_element.hasChildNodes());
709    ASSERT_TRUE(head_element.childNodes().length() == 0);
710
711    // Do serialization.
712    SerializeDomForURL(file_url, false);
713    // Make sure the serialized contents have META ;
714    ASSERT_TRUE(HasSerializedFrame(file_url));
715    const std::string& serialized_contents =
716        GetSerializedContentForFrame(file_url);
717
718    // Reload serialized contents and make sure there is only one META tag.
719    LoadContents(serialized_contents, file_url,
720                 web_frame->document().encoding());
721    web_frame = GetMainFrame();
722    ASSERT_TRUE(web_frame != NULL);
723    doc = web_frame->document();
724    ASSERT_TRUE(doc.isHTMLDocument());
725    head_element = doc.head();
726    ASSERT_TRUE(!head_element.isNull());
727    ASSERT_TRUE(head_element.hasChildNodes());
728    ASSERT_TRUE(head_element.childNodes().length() == 1);
729    WebNode meta_node = head_element.firstChild();
730    ASSERT_TRUE(!meta_node.isNull());
731    // Get meta charset info.
732    std::string charset_info;
733    ASSERT_TRUE(IsMetaElement(meta_node, charset_info));
734    ASSERT_TRUE(!charset_info.empty());
735    ASSERT_EQ(charset_info,
736              std::string(web_frame->document().encoding().utf8()));
737
738    // Check the body's first node is text node and its contents are
739    // "hello world"
740    WebElement body_element = doc.body();
741    ASSERT_TRUE(!body_element.isNull());
742    WebNode text_node = body_element.firstChild();
743    ASSERT_TRUE(text_node.isTextNode());
744    WebString text_node_contents = text_node.nodeValue();
745    ASSERT_TRUE(std::string(text_node_contents.utf8()) == "hello world");
746  }
747
748  void SerializeDocumentWithDownloadedIFrameOnRenderer(const GURL& file_url) {
749    // Do a recursive serialization. We pass if we don't crash.
750    SerializeDomForURL(file_url, true);
751  }
752
753  void SubResourceForElementsInNonHTMLNamespaceOnRenderer(
754      const GURL& file_url) {
755    WebFrame* web_frame = FindSubFrameByURL(GetWebView(), file_url);
756    ASSERT_TRUE(web_frame != NULL);
757    WebDocument doc = web_frame->document();
758    WebNode lastNodeInBody = doc.body().lastChild();
759    ASSERT_EQ(WebNode::ElementNode, lastNodeInBody.nodeType());
760    WebString uri = GetSubResourceLinkFromElement(
761        lastNodeInBody.to<WebElement>());
762    EXPECT_TRUE(uri.isNull());
763  }
764
 private:
  // Maps a frame URL (stored as a string) to the serialized content recorded
  // for that frame.
  typedef base::hash_map<std::string, std::string> SerializedFrameContentMap;
  SerializedFrameContentMap serialized_frame_map_;
  // Maps a frame URL (stored as a string) to a flag telling whether
  // serialization of that frame has finished.
  typedef base::hash_map<std::string, bool> SerializationFinishStatusMap;
  SerializationFinishStatusMap serialization_finish_status_;
  // Indicates whether the overall process of serializing the DOM has
  // finished.
  bool serialized_;
  // Dummy relative path of the directory that contains all saved auxiliary
  // files, including all sub frames and resources.
  const base::FilePath local_directory_name_;
777};
778
779// If original contents have document type, the serialized contents also have
780// document type.
781IN_PROC_BROWSER_TEST_F(DomSerializerTests, SerializeHTMLDOMWithDocType) {
782  base::FilePath page_file_path =
783      GetTestFilePath("dom_serializer", "youtube_1.htm");
784  GURL file_url = net::FilePathToFileURL(page_file_path);
785  ASSERT_TRUE(file_url.SchemeIsFile());
786  // Load the test file.
787  NavigateToURL(shell(), file_url);
788
789  PostTaskToInProcessRendererAndWait(
790        base::Bind(&DomSerializerTests::SerializeHTMLDOMWithDocTypeOnRenderer,
791                   base::Unretained(this), file_url));
792}
793
794// If original contents do not have document type, the serialized contents
795// also do not have document type.
796IN_PROC_BROWSER_TEST_F(DomSerializerTests, SerializeHTMLDOMWithoutDocType) {
797  base::FilePath page_file_path =
798      GetTestFilePath("dom_serializer", "youtube_2.htm");
799  GURL file_url = net::FilePathToFileURL(page_file_path);
800  ASSERT_TRUE(file_url.SchemeIsFile());
801  // Load the test file.
802  NavigateToURL(shell(), file_url);
803
804  PostTaskToInProcessRendererAndWait(
805        base::Bind(
806            &DomSerializerTests::SerializeHTMLDOMWithoutDocTypeOnRenderer,
807            base::Unretained(this), file_url));
808}
809
810// Serialize XML document which has all 5 built-in entities. After
811// finishing serialization, the serialized contents should be same
812// with original XML document.
813//
814// TODO(tiger@opera.com): Disabled in preparation of page serializer merge --
815// XML headers are handled differently in the merged serializer.
816// Bug: http://crbug.com/328354
817IN_PROC_BROWSER_TEST_F(DomSerializerTests,
818                       DISABLED_SerializeXMLDocWithBuiltInEntities) {
819  base::FilePath page_file_path =
820      GetTestFilePath("dom_serializer", "note.html");
821  base::FilePath xml_file_path = GetTestFilePath("dom_serializer", "note.xml");
822  // Read original contents for later comparison.
823  std::string original_contents;
824  ASSERT_TRUE(base::ReadFileToString(xml_file_path, &original_contents));
825  // Get file URL.
826  GURL file_url = net::FilePathToFileURL(page_file_path);
827  GURL xml_file_url = net::FilePathToFileURL(xml_file_path);
828  ASSERT_TRUE(file_url.SchemeIsFile());
829  // Load the test file.
830  NavigateToURL(shell(), file_url);
831
832  PostTaskToInProcessRendererAndWait(
833        base::Bind(
834            &DomSerializerTests::SerializeXMLDocWithBuiltInEntitiesOnRenderer,
835            base::Unretained(this), xml_file_url, original_contents));
836}
837
838// When serializing DOM, we add MOTW declaration before html tag.
839IN_PROC_BROWSER_TEST_F(DomSerializerTests, SerializeHTMLDOMWithAddingMOTW) {
840  base::FilePath page_file_path =
841      GetTestFilePath("dom_serializer", "youtube_2.htm");
842  // Read original contents for later comparison .
843  std::string original_contents;
844  ASSERT_TRUE(base::ReadFileToString(page_file_path, &original_contents));
845  // Get file URL.
846  GURL file_url = net::FilePathToFileURL(page_file_path);
847  ASSERT_TRUE(file_url.SchemeIsFile());
848
849  // Load the test file.
850  NavigateToURL(shell(), file_url);
851
852  PostTaskToInProcessRendererAndWait(
853        base::Bind(
854            &DomSerializerTests::SerializeHTMLDOMWithAddingMOTWOnRenderer,
855            base::Unretained(this), file_url, original_contents));
856}
857
858// When serializing DOM, we will add the META which have correct charset
859// declaration as first child of HEAD element for resolving WebKit bug:
860// http://bugs.webkit.org/show_bug.cgi?id=16621 even the original document
861// does not have META charset declaration.
862IN_PROC_BROWSER_TEST_F(DomSerializerTests,
863                       SerializeHTMLDOMWithNoMetaCharsetInOriginalDoc) {
864  base::FilePath page_file_path =
865      GetTestFilePath("dom_serializer", "youtube_1.htm");
866  // Get file URL.
867  GURL file_url = net::FilePathToFileURL(page_file_path);
868  ASSERT_TRUE(file_url.SchemeIsFile());
869  // Load the test file.
870  NavigateToURL(shell(), file_url);
871
872  PostTaskToInProcessRendererAndWait(
873        base::Bind(
874            &DomSerializerTests::
875                SerializeHTMLDOMWithNoMetaCharsetInOriginalDocOnRenderer,
876            base::Unretained(this), file_url));
877}
878
879// When serializing DOM, if the original document has multiple META charset
880// declaration, we will add the META which have correct charset declaration
881// as first child of HEAD element and remove all original META charset
882// declarations.
883IN_PROC_BROWSER_TEST_F(DomSerializerTests,
884                       SerializeHTMLDOMWithMultipleMetaCharsetInOriginalDoc) {
885  base::FilePath page_file_path =
886      GetTestFilePath("dom_serializer", "youtube_2.htm");
887  // Get file URL.
888  GURL file_url = net::FilePathToFileURL(page_file_path);
889  ASSERT_TRUE(file_url.SchemeIsFile());
890  // Load the test file.
891  NavigateToURL(shell(), file_url);
892
893  PostTaskToInProcessRendererAndWait(
894        base::Bind(
895            &DomSerializerTests::
896                SerializeHTMLDOMWithMultipleMetaCharsetInOriginalDocOnRenderer,
897            base::Unretained(this), file_url));
898}
899
900// Test situation of html entities in text when serializing HTML DOM.
901IN_PROC_BROWSER_TEST_F(DomSerializerTests, SerializeHTMLDOMWithEntitiesInText) {
902  // Need to spin up the renderer and also navigate to a file url so that the
903  // renderer code doesn't attempt a fork when it sees a load to file scheme
904  // from non-file scheme.
905  NavigateToURL(shell(), GetTestUrl(".", "simple_page.html"));
906
907  PostTaskToInProcessRendererAndWait(
908        base::Bind(
909            &DomSerializerTests::SerializeHTMLDOMWithEntitiesInTextOnRenderer,
910            base::Unretained(this)));
911}
912
913// Test situation of html entities in attribute value when serializing
914// HTML DOM.
915// This test started to fail at WebKit r65388. See http://crbug.com/52279.
916//
917// TODO(tiger@opera.com): Disabled in preparation of page serializer merge --
918// Some attributes are handled differently in the merged serializer.
919// Bug: http://crbug.com/328354
920IN_PROC_BROWSER_TEST_F(DomSerializerTests,
921                       DISABLED_SerializeHTMLDOMWithEntitiesInAttributeValue) {
922  // Need to spin up the renderer and also navigate to a file url so that the
923  // renderer code doesn't attempt a fork when it sees a load to file scheme
924  // from non-file scheme.
925  NavigateToURL(shell(), GetTestUrl(".", "simple_page.html"));
926
927  PostTaskToInProcessRendererAndWait(
928        base::Bind(
929            &DomSerializerTests::
930                SerializeHTMLDOMWithEntitiesInAttributeValueOnRenderer,
931            base::Unretained(this)));
932}
933
934// Test situation of non-standard HTML entities when serializing HTML DOM.
935// This test started to fail at WebKit r65351. See http://crbug.com/52279.
936IN_PROC_BROWSER_TEST_F(DomSerializerTests,
937                       SerializeHTMLDOMWithNonStandardEntities) {
938  // Make a test file URL and load it.
939  base::FilePath page_file_path = GetTestFilePath(
940      "dom_serializer", "nonstandard_htmlentities.htm");
941  GURL file_url = net::FilePathToFileURL(page_file_path);
942  NavigateToURL(shell(), file_url);
943
944  PostTaskToInProcessRendererAndWait(
945        base::Bind(
946            &DomSerializerTests::
947                SerializeHTMLDOMWithNonStandardEntitiesOnRenderer,
948            base::Unretained(this), file_url));
949}
950
951// Test situation of BASE tag in original document when serializing HTML DOM.
952// When serializing, we should comment the BASE tag, append a new BASE tag.
953// rewrite all the savable URLs to relative local path, and change other URLs
954// to absolute URLs.
955//
956// TODO(tiger@opera.com): Disabled in preparation of page serializer merge --
957// Base tags are handled a bit different in merged version.
958// Bug: http://crbug.com/328354
959IN_PROC_BROWSER_TEST_F(DomSerializerTests,
960                       DISABLED_SerializeHTMLDOMWithBaseTag) {
961  base::FilePath page_file_path = GetTestFilePath(
962      "dom_serializer", "html_doc_has_base_tag.htm");
963
964  // Get page dir URL which is base URL of this file.
965  base::FilePath dir_name = page_file_path.DirName();
966  dir_name = dir_name.Append(
967      base::FilePath::StringType(base::FilePath::kSeparators[0], 1));
968  GURL path_dir_url = net::FilePathToFileURL(dir_name);
969
970  // Get file URL.
971  GURL file_url = net::FilePathToFileURL(page_file_path);
972  ASSERT_TRUE(file_url.SchemeIsFile());
973  // Load the test file.
974  NavigateToURL(shell(), file_url);
975
976  PostTaskToInProcessRendererAndWait(
977        base::Bind(
978            &DomSerializerTests::SerializeHTMLDOMWithBaseTagOnRenderer,
979            base::Unretained(this), file_url, path_dir_url));
980}
981
982// Serializing page which has an empty HEAD tag.
983IN_PROC_BROWSER_TEST_F(DomSerializerTests, SerializeHTMLDOMWithEmptyHead) {
984  // Need to spin up the renderer and also navigate to a file url so that the
985  // renderer code doesn't attempt a fork when it sees a load to file scheme
986  // from non-file scheme.
987  NavigateToURL(shell(), GetTestUrl(".", "simple_page.html"));
988
989  PostTaskToInProcessRendererAndWait(
990        base::Bind(&DomSerializerTests::SerializeHTMLDOMWithEmptyHeadOnRenderer,
991                   base::Unretained(this)));
992}
993
994// Test that we don't crash when the page contains an iframe that
995// was handled as a download (http://crbug.com/42212).
996IN_PROC_BROWSER_TEST_F(DomSerializerTests,
997                       SerializeDocumentWithDownloadedIFrame) {
998  base::FilePath page_file_path = GetTestFilePath(
999      "dom_serializer", "iframe-src-is-exe.htm");
1000  GURL file_url = net::FilePathToFileURL(page_file_path);
1001  ASSERT_TRUE(file_url.SchemeIsFile());
1002  // Load the test file.
1003  NavigateToURL(shell(), file_url);
1004
1005  PostTaskToInProcessRendererAndWait(
1006        base::Bind(
1007            &DomSerializerTests::
1008                SerializeDocumentWithDownloadedIFrameOnRenderer,
1009            base::Unretained(this), file_url));
1010}
1011
1012IN_PROC_BROWSER_TEST_F(DomSerializerTests,
1013                       SubResourceForElementsInNonHTMLNamespace) {
1014  base::FilePath page_file_path = GetTestFilePath(
1015      "dom_serializer", "non_html_namespace.htm");
1016  GURL file_url = net::FilePathToFileURL(page_file_path);
1017  NavigateToURL(shell(), file_url);
1018
1019  PostTaskToInProcessRendererAndWait(
1020        base::Bind(
1021            &DomSerializerTests::
1022                SubResourceForElementsInNonHTMLNamespaceOnRenderer,
1023            base::Unretained(this), file_url));
1024}
1025
1026}  // namespace content
1027