1c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Copyright 2013 The Chromium Authors. All rights reserved.
2c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be
3c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// found in the LICENSE file.
4c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
5c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)#ifndef URL_URL_CANON_H_
6c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)#define URL_URL_CANON_H_
7c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
8c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)#include <stdlib.h>
9c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)#include <string.h>
10c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
117d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)#include "base/strings/string16.h"
12868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)#include "url/url_export.h"
13c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)#include "url/url_parse.h"
14c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
15c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)namespace url_canon {
16c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
17c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Canonicalizer output -------------------------------------------------------
18c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
19c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Base class for the canonicalizer output, this maintains a buffer and
20c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// supports simple resizing and append operations on it.
21c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)//
22c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// It is VERY IMPORTANT that no virtual function calls be made on the common
23c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// code path. We only have two virtual function calls, the destructor and a
24c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// resize function that is called when the existing buffer is not big enough.
25c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// The derived class is then in charge of setting up our buffer which we will
26c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// manage.
// |T| is the character type stored in the buffer. This file instantiates the
// template with char (CanonOutput) and base::char16 (CanonOutputW).
template<typename T>
class CanonOutputT {
 public:
  CanonOutputT() : buffer_(NULL), buffer_len_(0), cur_len_(0) {
  }
  virtual ~CanonOutputT() {
  }

  // Implemented to resize the buffer. This function should update the buffer
  // pointer to point to the new buffer, and any old data up to |cur_len_| in
  // the buffer must be copied over.
  //
  // The new size |sz| must be larger than buffer_len_.
  virtual void Resize(int sz) = 0;

  // Accessor for returning a character at a given position. The input offset
  // must be in the valid range.
  //
  // Returns |T| rather than char so that wide outputs hand back the full
  // code unit instead of a value truncated to a narrow character.
  inline T at(int offset) const {
    return buffer_[offset];
  }

  // Sets the character at the given position. The given position MUST be less
  // than the length(). |ch| is converted to the buffer's character type.
  inline void set(int offset, int ch) {
    buffer_[offset] = static_cast<T>(ch);
  }

  // Returns the number of characters currently in the buffer.
  inline int length() const {
    return cur_len_;
  }

  // Returns the current capacity of the buffer. The length() is the number of
  // characters that have been declared to be written, but the capacity() is
  // the number that can be written without reallocation. If the caller must
  // write many characters at once, it can make sure there is enough capacity,
  // write the data, then use set_length() to declare the new length().
  int capacity() const {
    return buffer_len_;
  }

  // Called by the user of this class to get the output. The output will NOT
  // be NULL-terminated. Call length() to get the length.
  const T* data() const {
    return buffer_;
  }
  T* data() {
    return buffer_;
  }

  // Shortens the URL to the new length. Used for "backing up" when processing
  // relative paths. This can also be used if an external function writes a lot
  // of data to the buffer (when using the "Raw" version below) beyond the end,
  // to declare the new length.
  //
  // This MUST NOT be used to expand the size of the buffer beyond capacity().
  void set_length(int new_len) {
    cur_len_ = new_len;
  }

  // This is the most performance critical function, since it is called for
  // every character.
  void push_back(T ch) {
    // In VC2005, putting this common case first speeds up execution
    // dramatically because this branch is predicted as taken.
    if (cur_len_ < buffer_len_) {
      buffer_[cur_len_] = ch;
      cur_len_++;
      return;
    }

    // Grow the buffer to hold at least one more item. Hopefully we won't have
    // to do this very often.
    if (!Grow(1))
      return;

    // Actually do the insertion.
    buffer_[cur_len_] = ch;
    cur_len_++;
  }

  // Appends the |str_len| characters starting at |str| to the output. If the
  // buffer cannot be grown far enough, nothing is appended.
  void Append(const T* str, int str_len) {
    // Compare against the remaining room instead of computing
    // cur_len_ + str_len, which can overflow int for very large inputs.
    const int available = buffer_len_ - cur_len_;
    if (str_len > available) {
      if (!Grow(str_len - available))
        return;
    }
    for (int i = 0; i < str_len; i++)
      buffer_[cur_len_ + i] = str[i];
    cur_len_ += str_len;
  }

 protected:
  // Grows the given buffer so that it can fit at least |min_additional|
  // characters. Returns true if the buffer could be resized, false if the
  // required size would reach the 1 << 30 limit (in which case Resize() is
  // not called and the buffer is left untouched).
  bool Grow(int min_additional) {
    static const int kMinBufferLen = 16;
    int new_len = (buffer_len_ == 0) ? kMinBufferLen : buffer_len_;
    do {
      if (new_len >= (1 << 30))  // Prevent overflow below.
        return false;
      new_len *= 2;
      // |new_len| never drops below |buffer_len_|, so the subtraction cannot
      // overflow, unlike buffer_len_ + min_additional.
    } while (new_len - buffer_len_ < min_additional);
    Resize(new_len);
    return true;
  }

  // Storage set up by the derived class, and its capacity in characters.
  T* buffer_;
  int buffer_len_;

  // Used characters in the buffer.
  int cur_len_;
};
141c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
142c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Simple implementation of the CanonOutput using new[]. This class
143c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// also supports a static buffer so if it is allocated on the stack, most
144c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// URLs can be canonicalized with no heap allocations.
145c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)template<typename T, int fixed_capacity = 1024>
146c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)class RawCanonOutputT : public CanonOutputT<T> {
147c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) public:
148c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  RawCanonOutputT() : CanonOutputT<T>() {
149c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    this->buffer_ = fixed_buffer_;
150c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    this->buffer_len_ = fixed_capacity;
151c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  }
152c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  virtual ~RawCanonOutputT() {
153c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    if (this->buffer_ != fixed_buffer_)
154c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)      delete[] this->buffer_;
155c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  }
156c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
157c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  virtual void Resize(int sz) {
158c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    T* new_buf = new T[sz];
159c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    memcpy(new_buf, this->buffer_,
160c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)           sizeof(T) * (this->cur_len_ < sz ? this->cur_len_ : sz));
161c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    if (this->buffer_ != fixed_buffer_)
162c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)      delete[] this->buffer_;
163c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    this->buffer_ = new_buf;
164c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    this->buffer_len_ = sz;
165c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  }
166c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
167c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) protected:
168c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  T fixed_buffer_[fixed_capacity];
169c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)};
170c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
171c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Normally, all canonicalization output is in narrow characters. We support
172c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// the templates so it can also be used internally if a wide buffer is
173c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// required.
174c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)typedef CanonOutputT<char> CanonOutput;
1757d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)typedef CanonOutputT<base::char16> CanonOutputW;
176c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
177c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)template<int fixed_capacity>
178c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)class RawCanonOutput : public RawCanonOutputT<char, fixed_capacity> {};
179c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)template<int fixed_capacity>
1807d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)class RawCanonOutputW : public RawCanonOutputT<base::char16, fixed_capacity> {};
181c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
182c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Character set converter ----------------------------------------------------
183c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)//
184c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Converts query strings into a custom encoding. The embedder can supply an
185c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// implementation of this class to interface with their own character set
186c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// conversion libraries.
187c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)//
188c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Embedders will want to see the unit test for the ICU version.
189c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
190868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)class URL_EXPORT CharsetConverter {
191c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) public:
192c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  CharsetConverter() {}
193c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  virtual ~CharsetConverter() {}
194c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
195c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // Converts the given input string from UTF-16 to whatever output format the
196c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // converter supports. This is used only for the query encoding conversion,
197c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // which does not fail. Instead, the converter should insert "invalid
198c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // character" characters in the output for invalid sequences, and do the
199c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // best it can.
200c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  //
201c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // If the input contains a character not representable in the output
202c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // character set, the converter should append the HTML entity sequence in
203c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // decimal, (such as "&#20320;") with escaping of the ampersand, number
204c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // sign, and semicolon (in the previous example it would be
205c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // "%26%2320320%3B"). This rule is based on what IE does in this situation.
2067d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)  virtual void ConvertFromUTF16(const base::char16* input,
207c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)                                int input_len,
208c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)                                CanonOutput* output) = 0;
209c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)};
210c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
211c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Whitespace -----------------------------------------------------------------
212c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
213c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Searches for whitespace that should be removed from the middle of URLs, and
214c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// removes it. Removed whitespace are tabs and newlines, but NOT spaces. Spaces
215c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// are preserved, which is what most browsers do. A pointer to the output will
216c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// be returned, and the length of that output will be in |output_len|.
217c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)//
218c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// This should be called before parsing if whitespace removal is desired (which
219c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// it normally is when you are canonicalizing).
220c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)//
221c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// If no whitespace is removed, this function will not use the buffer and will
222c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// return a pointer to the input, to avoid the extra copy. If modification is
223c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// required, the given |buffer| will be used and the returned pointer will
224c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// point to the beginning of the buffer.
225c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)//
2267d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)// Therefore, callers should not use the buffer, since it may actually be empty,
227c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// use the computed pointer and |*output_len| instead.
228868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)URL_EXPORT const char* RemoveURLWhitespace(const char* input, int input_len,
229868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                           CanonOutputT<char>* buffer,
230868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                           int* output_len);
2317d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)URL_EXPORT const base::char16* RemoveURLWhitespace(
2327d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)    const base::char16* input,
2337d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)    int input_len,
2347d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)    CanonOutputT<base::char16>* buffer,
2357d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)    int* output_len);
236c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
237c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// IDN ------------------------------------------------------------------------
238c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
239c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Converts the Unicode input representing a hostname to ASCII using IDN rules.
240c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// The output must fall in the ASCII range, but will be encoded in UTF-16.
241c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)//
242c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// On success, the output will be filled with the ASCII host name and it will
243c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// return true. Unlike most other canonicalization functions, this assumes that
244c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// the output is empty. The beginning of the host will be at offset 0, and
245c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// the length of the output will be set to the length of the new host name.
246c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)//
247c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// On error, returns false. The output in this case is undefined.
2487d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)URL_EXPORT bool IDNToASCII(const base::char16* src,
249868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                           int src_len,
250868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                           CanonOutputW* output);
251c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
252c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Piece-by-piece canonicalizers ----------------------------------------------
253c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)//
// These individual canonicalizers append the canonicalized versions of the
// corresponding URL component to the given output. The spec and the
// previously-identified range of that component are the input. The range of
// the canonicalized component will be written to the output component.
258c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)//
259c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// These functions all append to the output so they can be chained. Make sure
260c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// the output is empty when you start.
261c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)//
// These functions return boolean values indicating success. On failure, they
// will attempt to write something reasonable to the output so that, if
// displayed to the user, they will recognise it as something that's messed up.
// Nothing more should ever be done with these invalid URLs, however.
266c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
267c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Scheme: Appends the scheme and colon to the URL. The output component will
268c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// indicate the range of characters up to but not including the colon.
269c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)//
270c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Canonical URLs always have a scheme. If the scheme is not present in the
271c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// input, this will just write the colon to indicate an empty scheme. Does not
272c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// append slashes which will be needed before any authority components for most
273c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// URLs.
274c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)//
275c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// The 8-bit version requires UTF-8 encoding.
276868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)URL_EXPORT bool CanonicalizeScheme(const char* spec,
277868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                   const url_parse::Component& scheme,
278868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                   CanonOutput* output,
279868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                   url_parse::Component* out_scheme);
2807d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)URL_EXPORT bool CanonicalizeScheme(const base::char16* spec,
281868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                   const url_parse::Component& scheme,
282868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                   CanonOutput* output,
283868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                   url_parse::Component* out_scheme);
284c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
285c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// User info: username/password. If present, this will add the delimiters so
286c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// the output will be "<username>:<password>@" or "<username>@". Empty
287c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// username/password pairs, or empty passwords, will get converted to
288c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// nonexistant in the canonical version.
289c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)//
290c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// The components for the username and password refer to ranges in the
291c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// respective source strings. Usually, these will be the same string, which
292c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// is legal as long as the two components don't overlap.
293c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)//
294c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// The 8-bit version requires UTF-8 encoding.
295868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)URL_EXPORT bool CanonicalizeUserInfo(const char* username_source,
296868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                     const url_parse::Component& username,
297868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                     const char* password_source,
298868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                     const url_parse::Component& password,
299868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                     CanonOutput* output,
300868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                     url_parse::Component* out_username,
301868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                     url_parse::Component* out_password);
3027d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)URL_EXPORT bool CanonicalizeUserInfo(const base::char16* username_source,
303868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                     const url_parse::Component& username,
3047d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)                                     const base::char16* password_source,
305868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                     const url_parse::Component& password,
306868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                     CanonOutput* output,
307868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                     url_parse::Component* out_username,
308868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                     url_parse::Component* out_password);
309c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
310c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
311c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// This structure holds detailed state exported from the IP/Host canonicalizers.
312c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Additional fields may be added as callers require them.
313c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)struct CanonHostInfo {
314c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  CanonHostInfo() : family(NEUTRAL), num_ipv4_components(0), out_host() {}
315c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
316c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // Convenience function to test if family is an IP address.
317c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  bool IsIPAddress() const { return family == IPV4 || family == IPV6; }
318c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
319c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // This field summarizes how the input was classified by the canonicalizer.
320c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  enum Family {
321c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    NEUTRAL,   // - Doesn't resemble an IP address.  As far as the IP
322c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)               //   canonicalizer is concerned, it should be treated as a
323c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)               //   hostname.
324c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    BROKEN,    // - Almost an IP, but was not canonicalized.  This could be an
325c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)               //   IPv4 address where truncation occurred, or something
326c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)               //   containing the special characters :[] which did not parse
327c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)               //   as an IPv6 address.  Never attempt to connect to this
328c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)               //   address, because it might actually succeed!
329c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    IPV4,      // - Successfully canonicalized as an IPv4 address.
330c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    IPV6,      // - Successfully canonicalized as an IPv6 address.
331c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  };
332c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  Family family;
333c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
334c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // If |family| is IPV4, then this is the number of nonempty dot-separated
335c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // components in the input text, from 1 to 4.  If |family| is not IPV4,
336c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // this value is undefined.
337c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  int num_ipv4_components;
338c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
339c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // Location of host within the canonicalized output.
340c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // CanonicalizeIPAddress() only sets this field if |family| is IPV4 or IPV6.
341c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // CanonicalizeHostVerbose() always sets it.
342c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  url_parse::Component out_host;
343c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
344c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // |address| contains the parsed IP Address (if any) in its first
345c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // AddressLength() bytes, in network order. If IsIPAddress() is false
346c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // AddressLength() will return zero and the content of |address| is undefined.
347c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  unsigned char address[16];
348c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
349c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // Convenience function to calculate the length of an IP address corresponding
350c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // to the current IP version in |family|, if any. For use with |address|.
351c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  int AddressLength() const {
352c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    return family == IPV4 ? 4 : (family == IPV6 ? 16 : 0);
353c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  }
354c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)};
355c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
356c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
357c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Host.
358c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)//
359c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// The 8-bit version requires UTF-8 encoding.  Use this version when you only
360c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// need to know whether canonicalization succeeded.
361868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)URL_EXPORT bool CanonicalizeHost(const char* spec,
362868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                 const url_parse::Component& host,
363868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                 CanonOutput* output,
364868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                 url_parse::Component* out_host);
3657d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)URL_EXPORT bool CanonicalizeHost(const base::char16* spec,
366868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                 const url_parse::Component& host,
367868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                 CanonOutput* output,
368868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                 url_parse::Component* out_host);
369c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
370c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Extended version of CanonicalizeHost, which returns additional information.
371c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Use this when you need to know whether the hostname was an IP address.
372c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// A successful return is indicated by host_info->family != BROKEN.  See the
373c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// definition of CanonHostInfo above for details.
374868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)URL_EXPORT void CanonicalizeHostVerbose(const char* spec,
375868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                        const url_parse::Component& host,
376868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                        CanonOutput* output,
377868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                        CanonHostInfo* host_info);
3787d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)URL_EXPORT void CanonicalizeHostVerbose(const base::char16* spec,
379868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                        const url_parse::Component& host,
380868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                        CanonOutput* output,
381868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                        CanonHostInfo* host_info);
382c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
383c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
384c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// IP addresses.
385c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)//
386c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Tries to interpret the given host name as an IPv4 or IPv6 address. If it is
387c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// an IP address, it will canonicalize it as such, appending it to |output|.
388c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Additional status information is returned via the |*host_info| parameter.
389c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// See the definition of CanonHostInfo above for details.
390c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)//
391c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// This is called AUTOMATICALLY from the host canonicalizer, which ensures that
392c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// the input is unescaped and name-prepped, etc. It should not normally be
393c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// necessary or wise to call this directly.
394868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)URL_EXPORT void CanonicalizeIPAddress(const char* spec,
395868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                      const url_parse::Component& host,
396868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                      CanonOutput* output,
397868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                      CanonHostInfo* host_info);
3987d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)URL_EXPORT void CanonicalizeIPAddress(const base::char16* spec,
399868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                      const url_parse::Component& host,
400868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                      CanonOutput* output,
401868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                      CanonHostInfo* host_info);
402c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
403c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Port: this function will add the colon for the port if a port is present.
404c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// The caller can pass url_parse::PORT_UNSPECIFIED as the
405c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// default_port_for_scheme argument if there is no default port.
406c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)//
407c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// The 8-bit version requires UTF-8 encoding.
408868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)URL_EXPORT bool CanonicalizePort(const char* spec,
409868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                 const url_parse::Component& port,
410868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                 int default_port_for_scheme,
411868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                 CanonOutput* output,
412868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                 url_parse::Component* out_port);
4137d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)URL_EXPORT bool CanonicalizePort(const base::char16* spec,
414868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                 const url_parse::Component& port,
415868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                 int default_port_for_scheme,
416868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                 CanonOutput* output,
417868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                 url_parse::Component* out_port);
418c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
419c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED
420c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// if the scheme is unknown.
421868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)URL_EXPORT int DefaultPortForScheme(const char* scheme, int scheme_len);
422c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
423c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Path. If the input does not begin in a slash (including if the input is
424c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// empty), we'll prepend a slash to the path to make it canonical.
425c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)//
426c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// The 8-bit version assumes UTF-8 encoding, but does not verify the validity
427c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// of the UTF-8 (i.e., you can have invalid UTF-8 sequences, invalid
428c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// characters, etc.). Normally, URLs will come in as UTF-16, so this isn't
429c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// an issue. Somebody giving us an 8-bit path is responsible for generating
430c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// the path that the server expects (we'll escape high-bit characters), so
431c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// if something is invalid, it's their problem.
432868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)URL_EXPORT bool CanonicalizePath(const char* spec,
433868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                 const url_parse::Component& path,
434868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                 CanonOutput* output,
435868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                 url_parse::Component* out_path);
4367d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)URL_EXPORT bool CanonicalizePath(const base::char16* spec,
437868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                 const url_parse::Component& path,
438868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                 CanonOutput* output,
439868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                 url_parse::Component* out_path);
440c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
441c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Canonicalizes the input as a file path. This is like CanonicalizePath except
442c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// that it also handles Windows drive specs. For example, the path can begin
443c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// with "c|\" and it will get properly canonicalized to "C:/".
444c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// The string will be appended to |*output| and |*out_path| will be updated.
445c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)//
446c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// The 8-bit version requires UTF-8 encoding.
447868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)URL_EXPORT bool FileCanonicalizePath(const char* spec,
448868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                     const url_parse::Component& path,
449868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                     CanonOutput* output,
450868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                     url_parse::Component* out_path);
4517d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)URL_EXPORT bool FileCanonicalizePath(const base::char16* spec,
452868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                     const url_parse::Component& path,
453868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                     CanonOutput* output,
454868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                     url_parse::Component* out_path);
455c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
456c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Query: Prepends the ? if needed.
457c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)//
458c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// The 8-bit version requires the input to be UTF-8 encoding. Incorrectly
459c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// encoded characters (in UTF-8 or UTF-16) will be replaced with the Unicode
460c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// "invalid character." This function can not fail, we always just try to do
461c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// our best for crazy input here since web pages can set it themselves.
462c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)//
463c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// This will convert the given input into the output encoding that the given
464c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// character set converter object provides. The converter will only be called
465c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// if necessary, for ASCII input, no conversions are necessary.
466c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)//
467c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// The converter can be NULL. In this case, the output encoding will be UTF-8.
468868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)URL_EXPORT void CanonicalizeQuery(const char* spec,
469868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                  const url_parse::Component& query,
470868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                  CharsetConverter* converter,
471868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                  CanonOutput* output,
472868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                  url_parse::Component* out_query);
4737d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)URL_EXPORT void CanonicalizeQuery(const base::char16* spec,
474868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                  const url_parse::Component& query,
475868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                  CharsetConverter* converter,
476868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                  CanonOutput* output,
477868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                  url_parse::Component* out_query);
478c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
479c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Ref: Prepends the # if needed. The output will be UTF-8 (this is the only
480c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// canonicalizer that does not produce ASCII output). The output is
481c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// guaranteed to be valid UTF-8.
482c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)//
483c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// This function will not fail. If the input is invalid UTF-8/UTF-16, we'll use
484c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// the "Unicode replacement character" for the confusing bits and copy the rest.
485868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)URL_EXPORT void CanonicalizeRef(const char* spec,
486868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                const url_parse::Component& path,
487868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                CanonOutput* output,
488868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                url_parse::Component* out_path);
4897d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)URL_EXPORT void CanonicalizeRef(const base::char16* spec,
490868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                const url_parse::Component& path,
491868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                CanonOutput* output,
492868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                url_parse::Component* out_path);
493c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
494c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Full canonicalizer ---------------------------------------------------------
495c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)//
496c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// These functions replace any string contents, rather than append as above.
497c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// See the above piece-by-piece functions for information specific to
498c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// canonicalizing individual components.
499c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)//
500c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// The output will be ASCII except the reference fragment, which may be UTF-8.
501c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)//
502c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// The 8-bit versions require UTF-8 encoding.
503c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
504c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Use for standard URLs with authorities and paths.
505868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)URL_EXPORT bool CanonicalizeStandardURL(const char* spec,
506868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                        int spec_len,
507868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                        const url_parse::Parsed& parsed,
508868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                        CharsetConverter* query_converter,
509868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                        CanonOutput* output,
510868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                        url_parse::Parsed* new_parsed);
5117d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)URL_EXPORT bool CanonicalizeStandardURL(const base::char16* spec,
512868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                        int spec_len,
513868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                        const url_parse::Parsed& parsed,
514868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                        CharsetConverter* query_converter,
515868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                        CanonOutput* output,
516868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                        url_parse::Parsed* new_parsed);
517c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
518c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Use for file URLs.
519868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)URL_EXPORT bool CanonicalizeFileURL(const char* spec,
520868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                    int spec_len,
521868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                    const url_parse::Parsed& parsed,
522868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                    CharsetConverter* query_converter,
523868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                    CanonOutput* output,
524868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                    url_parse::Parsed* new_parsed);
5257d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)URL_EXPORT bool CanonicalizeFileURL(const base::char16* spec,
526868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                    int spec_len,
527868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                    const url_parse::Parsed& parsed,
528868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                    CharsetConverter* query_converter,
529868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                    CanonOutput* output,
530868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                    url_parse::Parsed* new_parsed);
531c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
532c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Use for filesystem URLs.
533868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)URL_EXPORT bool CanonicalizeFileSystemURL(const char* spec,
534868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                          int spec_len,
535868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                          const url_parse::Parsed& parsed,
536868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                          CharsetConverter* query_converter,
537868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                          CanonOutput* output,
538868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                          url_parse::Parsed* new_parsed);
5397d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)URL_EXPORT bool CanonicalizeFileSystemURL(const base::char16* spec,
540868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                          int spec_len,
541868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                          const url_parse::Parsed& parsed,
542868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                          CharsetConverter* query_converter,
543868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                          CanonOutput* output,
544868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                          url_parse::Parsed* new_parsed);
545c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
546c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Use for path URLs such as javascript. This does not modify the path in any
547c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// way, for example, by escaping it.
548868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)URL_EXPORT bool CanonicalizePathURL(const char* spec,
549868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                    int spec_len,
550868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                    const url_parse::Parsed& parsed,
551868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                    CanonOutput* output,
552868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                    url_parse::Parsed* new_parsed);
5537d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)URL_EXPORT bool CanonicalizePathURL(const base::char16* spec,
554868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                    int spec_len,
555868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                    const url_parse::Parsed& parsed,
556868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                    CanonOutput* output,
557868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                    url_parse::Parsed* new_parsed);
558c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
559c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Use for mailto URLs. This "canonicalizes" the url into a path and query
560c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// component. It does not attempt to merge "to" fields. It uses UTF-8 for
561c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// the query encoding if there is a query. This is because a mailto URL is
562c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// really intended for an external mail program, and the encoding of a page,
563c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// etc. which would influence a query encoding normally are irrelevant.
564868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)URL_EXPORT bool CanonicalizeMailtoURL(const char* spec,
565868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                      int spec_len,
566868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                      const url_parse::Parsed& parsed,
567868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                      CanonOutput* output,
568868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                      url_parse::Parsed* new_parsed);
5697d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)URL_EXPORT bool CanonicalizeMailtoURL(const base::char16* spec,
570868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                      int spec_len,
571868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                      const url_parse::Parsed& parsed,
572868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                      CanonOutput* output,
573868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                      url_parse::Parsed* new_parsed);
574c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
575c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Part replacer --------------------------------------------------------------
576c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
// Internal structure used for storing separate strings for each component.
// The basic canonicalization functions use this structure internally so that
// component replacement (different strings for different components) can be
// treated on the same code path as regular canonicalization (the same string
// for each component).
//
// A url_parse::Parsed structure usually goes along with this. Its components
// identify offsets within these strings, so that they can all be in the same
// string, or spread arbitrarily across different ones.
//
// This structure does not own any data. It is the caller's responsibility to
// ensure that the data the pointers point to stays in scope and is not
// modified.
template<typename CHAR>
struct URLComponentSource {
  // Constructor normally used by callers wishing to replace components. Every
  // pointer starts out NULL, which means "no replacement". The caller then
  // overrides the components it wants to replace.
  URLComponentSource()
      : scheme(NULL),
        username(NULL),
        password(NULL),
        host(NULL),
        port(NULL),
        path(NULL),
        query(NULL),
        ref(NULL) {
  }

  // Constructor normally used internally to initialize all the components to
  // point to the same spec. |default_value| is stored as-is (not copied) in
  // every member.
  explicit URLComponentSource(const CHAR* default_value)
      : scheme(default_value),
        username(default_value),
        password(default_value),
        host(default_value),
        port(default_value),
        path(default_value),
        query(default_value),
        ref(default_value) {
  }

  // One non-owning source-string pointer per URL component.
  const CHAR* scheme;
  const CHAR* username;
  const CHAR* password;
  const CHAR* host;
  const CHAR* port;
  const CHAR* path;
  const CHAR* query;
  const CHAR* ref;
};
628c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
629c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// This structure encapsulates information on modifying a URL. Each component
630c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// may either be left unchanged, replaced, or deleted.
631c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)//
632c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// By default, each component is unchanged. For those components that should be
633c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// modified, call either Set* or Clear* to modify it.
634c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)//
635c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// The string passed to Set* functions DOES NOT GET COPIED AND MUST BE KEPT
636c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// IN SCOPE BY THE CALLER for as long as this object exists!
637c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)//
638c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Prefer the 8-bit replacement version if possible since it is more efficient.
639c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)template<typename CHAR>
640c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)class Replacements {
641c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) public:
642c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  Replacements() {
643c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  }
644c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
645c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // Scheme
646c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  void SetScheme(const CHAR* s, const url_parse::Component& comp) {
647c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    sources_.scheme = s;
648c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    components_.scheme = comp;
649c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  }
650c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // Note: we don't have a ClearScheme since this doesn't make any sense.
651c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  bool IsSchemeOverridden() const { return sources_.scheme != NULL; }
652c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
653c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // Username
654c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  void SetUsername(const CHAR* s, const url_parse::Component& comp) {
655c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    sources_.username = s;
656c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    components_.username = comp;
657c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  }
658c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  void ClearUsername() {
659c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    sources_.username = Placeholder();
660c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    components_.username = url_parse::Component();
661c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  }
662c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  bool IsUsernameOverridden() const { return sources_.username != NULL; }
663c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
664c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // Password
665c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  void SetPassword(const CHAR* s, const url_parse::Component& comp) {
666c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    sources_.password = s;
667c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    components_.password = comp;
668c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  }
669c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  void ClearPassword() {
670c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    sources_.password = Placeholder();
671c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    components_.password = url_parse::Component();
672c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  }
673c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  bool IsPasswordOverridden() const { return sources_.password != NULL; }
674c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
675c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // Host
676c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  void SetHost(const CHAR* s, const url_parse::Component& comp) {
677c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    sources_.host = s;
678c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    components_.host = comp;
679c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  }
680c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  void ClearHost() {
681c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    sources_.host = Placeholder();
682c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    components_.host = url_parse::Component();
683c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  }
684c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  bool IsHostOverridden() const { return sources_.host != NULL; }
685c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
686c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // Port
687c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  void SetPort(const CHAR* s, const url_parse::Component& comp) {
688c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    sources_.port = s;
689c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    components_.port = comp;
690c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  }
691c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  void ClearPort() {
692c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    sources_.port = Placeholder();
693c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    components_.port = url_parse::Component();
694c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  }
695c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  bool IsPortOverridden() const { return sources_.port != NULL; }
696c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
697c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // Path
698c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  void SetPath(const CHAR* s, const url_parse::Component& comp) {
699c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    sources_.path = s;
700c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    components_.path = comp;
701c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  }
702c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  void ClearPath() {
703c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    sources_.path = Placeholder();
704c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    components_.path = url_parse::Component();
705c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  }
706c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  bool IsPathOverridden() const { return sources_.path != NULL; }
707c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
708c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // Query
709c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  void SetQuery(const CHAR* s, const url_parse::Component& comp) {
710c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    sources_.query = s;
711c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    components_.query = comp;
712c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  }
713c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  void ClearQuery() {
714c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    sources_.query = Placeholder();
715c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    components_.query = url_parse::Component();
716c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  }
717c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  bool IsQueryOverridden() const { return sources_.query != NULL; }
718c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
719c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // Ref
720c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  void SetRef(const CHAR* s, const url_parse::Component& comp) {
721c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    sources_.ref = s;
722c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    components_.ref = comp;
723c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  }
724c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  void ClearRef() {
725c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    sources_.ref = Placeholder();
726c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    components_.ref = url_parse::Component();
727c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  }
728c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  bool IsRefOverridden() const { return sources_.ref != NULL; }
729c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
730c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // Getters for the itnernal data. See the variables below for how the
731c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // information is encoded.
732c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  const URLComponentSource<CHAR>& sources() const { return sources_; }
733c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  const url_parse::Parsed& components() const { return components_; }
734c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
735c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) private:
736c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // Returns a pointer to a static empty string that is used as a placeholder
737c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // to indicate a component should be deleted (see below).
738c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  const CHAR* Placeholder() {
739c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    static const CHAR empty_string = 0;
740c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    return &empty_string;
741c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  }
742c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
743c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // We support three states:
744c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  //
745c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // Action                 | Source                Component
746c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // -----------------------+--------------------------------------------------
747c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // Don't change component | NULL                  (unused)
748c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // Replace component      | (replacement string)  (replacement component)
749c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // Delete component       | (non-NULL)            (invalid component: (0,-1))
750c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  //
751c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // We use a pointer to the empty string for the source when the component
752c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // should be deleted.
753c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  URLComponentSource<CHAR> sources_;
754c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  url_parse::Parsed components_;
755c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)};
756c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
757c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// The base must be an 8-bit canonical URL.
758868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)URL_EXPORT bool ReplaceStandardURL(const char* base,
759868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                   const url_parse::Parsed& base_parsed,
760868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                   const Replacements<char>& replacements,
761868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                   CharsetConverter* query_converter,
762868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                   CanonOutput* output,
763868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                   url_parse::Parsed* new_parsed);
7647d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)URL_EXPORT bool ReplaceStandardURL(
7657d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)    const char* base,
7667d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)    const url_parse::Parsed& base_parsed,
7677d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)    const Replacements<base::char16>& replacements,
7687d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)    CharsetConverter* query_converter,
7697d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)    CanonOutput* output,
7707d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)    url_parse::Parsed* new_parsed);
771c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
772c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Filesystem URLs can only have the path, query, or ref replaced.
773c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// All other components will be ignored.
774868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)URL_EXPORT bool ReplaceFileSystemURL(const char* base,
775868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                     const url_parse::Parsed& base_parsed,
776868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                     const Replacements<char>& replacements,
777868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                     CharsetConverter* query_converter,
778868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                     CanonOutput* output,
779868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                     url_parse::Parsed* new_parsed);
7807d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)URL_EXPORT bool ReplaceFileSystemURL(
7817d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)    const char* base,
7827d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)    const url_parse::Parsed& base_parsed,
7837d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)    const Replacements<base::char16>& replacements,
7847d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)    CharsetConverter* query_converter,
7857d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)    CanonOutput* output,
7867d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)    url_parse::Parsed* new_parsed);
787c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
788c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Replacing some parts of a file URL is not permitted. Everything except
789c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// the host, path, query, and ref will be ignored.
790868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)URL_EXPORT bool ReplaceFileURL(const char* base,
791868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                               const url_parse::Parsed& base_parsed,
792868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                               const Replacements<char>& replacements,
793868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                               CharsetConverter* query_converter,
794868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                               CanonOutput* output,
795868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                               url_parse::Parsed* new_parsed);
796868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)URL_EXPORT bool ReplaceFileURL(const char* base,
797868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                               const url_parse::Parsed& base_parsed,
7987d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)                               const Replacements<base::char16>& replacements,
799868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                               CharsetConverter* query_converter,
800868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                               CanonOutput* output,
801868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                               url_parse::Parsed* new_parsed);
802c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
803c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Path URLs can only have the scheme and path replaced. All other components
804c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// will be ignored.
805868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)URL_EXPORT bool ReplacePathURL(const char* base,
806868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                               const url_parse::Parsed& base_parsed,
807868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                               const Replacements<char>& replacements,
808868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                               CanonOutput* output,
809868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                               url_parse::Parsed* new_parsed);
810868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)URL_EXPORT bool ReplacePathURL(const char* base,
811868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                               const url_parse::Parsed& base_parsed,
8127d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)                               const Replacements<base::char16>& replacements,
813868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                               CanonOutput* output,
814868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                               url_parse::Parsed* new_parsed);
815c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
816c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Mailto URLs can only have the scheme, path, and query replaced.
817c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// All other components will be ignored.
818868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)URL_EXPORT bool ReplaceMailtoURL(const char* base,
819868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                 const url_parse::Parsed& base_parsed,
820868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                 const Replacements<char>& replacements,
821868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                 CanonOutput* output,
822868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                 url_parse::Parsed* new_parsed);
823868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)URL_EXPORT bool ReplaceMailtoURL(const char* base,
824868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                 const url_parse::Parsed& base_parsed,
8257d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)                                 const Replacements<base::char16>& replacements,
826868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                 CanonOutput* output,
827868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                                 url_parse::Parsed* new_parsed);
828c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
829c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Relative URL ---------------------------------------------------------------
830c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
831c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Given an input URL or URL fragment |fragment|, determines if it is a
832c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// relative or absolute URL and places the result into |*is_relative|. If it is
833c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// relative, the relevant portion of the URL will be placed into
834c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// |*relative_component| (there may have been trimmed whitespace, for example).
835c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// This value is passed to ResolveRelativeURL. If the input is not relative,
836c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// this value is UNDEFINED (it may be changed by the function).
837c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)//
838c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Returns true on success (we successfully determined the URL is relative or
839c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// not). Failure means that the combination of URLs doesn't make any sense.
840c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)//
841c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// The base URL should always be canonical, therefore is ASCII.
842868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)URL_EXPORT bool IsRelativeURL(const char* base,
843868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                              const url_parse::Parsed& base_parsed,
844868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                              const char* fragment,
845868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                              int fragment_len,
846868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                              bool is_base_hierarchical,
847868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                              bool* is_relative,
848868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                              url_parse::Component* relative_component);
849868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)URL_EXPORT bool IsRelativeURL(const char* base,
850868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                              const url_parse::Parsed& base_parsed,
8517d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)                              const base::char16* fragment,
852868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                              int fragment_len,
853868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                              bool is_base_hierarchical,
854868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                              bool* is_relative,
855868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)                              url_parse::Component* relative_component);
856c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
857c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Given a canonical parsed source URL, a URL fragment known to be relative,
858c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// and the identified relevant portion of the relative URL (computed by
859c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// IsRelativeURL), this produces a new parsed canonical URL in |output| and
860c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// |out_parsed|.
861c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)//
862c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// It also requires a flag indicating whether the base URL is a file: URL
863c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// which triggers additional logic.
864c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)//
865c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// The base URL should be canonical and have a host (may be empty for file
866c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// URLs) and a path. If it doesn't have these, we can't resolve relative
867c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// URLs off of it and will return the base as the output with an error flag.
868c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Becausee it is canonical is should also be ASCII.
869c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)//
870c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// The query charset converter follows the same rules as CanonicalizeQuery.
871c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)//
872c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// Returns true on success. On failure, the output will be "something
873c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// reasonable" that will be consistent and valid, just probably not what
874c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)// was intended by the web page author or caller.
875868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)URL_EXPORT bool ResolveRelativeURL(
876868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)    const char* base_url,
877868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)    const url_parse::Parsed& base_parsed,
878868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)    bool base_is_file,
879868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)    const char* relative_url,
880868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)    const url_parse::Component& relative_component,
881868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)    CharsetConverter* query_converter,
882868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)    CanonOutput* output,
883868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)    url_parse::Parsed* out_parsed);
884868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)URL_EXPORT bool ResolveRelativeURL(
885868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)    const char* base_url,
886868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)    const url_parse::Parsed& base_parsed,
887868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)    bool base_is_file,
8887d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)    const base::char16* relative_url,
889868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)    const url_parse::Component& relative_component,
890868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)    CharsetConverter* query_converter,
891868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)    CanonOutput* output,
892868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)    url_parse::Parsed* out_parsed);
893c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
894c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)}  // namespace url_canon
895c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
896c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)#endif  // URL_URL_CANON_H_
897