001    // Copyright (c) 2011, Mike Samuel
002    // All rights reserved.
003    //
004    // Redistribution and use in source and binary forms, with or without
005    // modification, are permitted provided that the following conditions
006    // are met:
007    //
008    // Redistributions of source code must retain the above copyright
009    // notice, this list of conditions and the following disclaimer.
010    // Redistributions in binary form must reproduce the above copyright
011    // notice, this list of conditions and the following disclaimer in the
012    // documentation and/or other materials provided with the distribution.
013    // Neither the name of the OWASP nor the names of its contributors may
014    // be used to endorse or promote products derived from this software
015    // without specific prior written permission.
016    // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
017    // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
018    // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
019    // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
020    // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
021    // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
022    // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
023    // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
024    // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
025    // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
026    // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
027    // POSSIBILITY OF SUCH DAMAGE.
028    
029    package org.owasp.html;
030    
031    import javax.annotation.Nullable;
032    
033    import com.google.common.collect.ImmutableSet;
034    
035    /**
036     * An attribute policy for attributes whose values are URLs that requires that
037     * the value have no protocol or have an allowed protocol.
038     *
039     * <p>
040     * URLs with protocols must match the protocol set passed to the constructor.
041     * URLs without protocols but which specify an origin different from the
042     * containing page (e.g. {@code //example.org}) are only allowed if the
043     * {@link FilterUrlByProtocolAttributePolicy#allowProtocolRelativeUrls policy}
044     * allows both {@code http} and {@code https} which are normally used to serve
045     * HTML.
046     * Same-origin URLs, URLs without any protocol or authority part are always
047     * allowed.
048     * </p>
049     *
050     * <p>
051     * This class assumes that URLs are either hierarchical, or are opaque, but
052     * do not look like they contain an authority portion.
053     * </p>
054     *
055     * @author Mike Samuel <mikesamuel@gmail.com>
056     */
057    @TCB
058    public class FilterUrlByProtocolAttributePolicy implements AttributePolicy {
059      private final ImmutableSet<String> protocols;
060    
061      public FilterUrlByProtocolAttributePolicy(
062          Iterable<? extends String> protocols) {
063        this.protocols = ImmutableSet.copyOf(protocols);
064      }
065    
066      public @Nullable String apply(
067          String elementName, String attributeName, String s) {
068        protocol_loop:
069        for (int i = 0, n = s.length(); i < n; ++i) {
070          switch (s.charAt(i)) {
071            case '/': case '#': case '?':  // No protocol.
072              // Check for domain relative URLs like //www.evil.org/
073              if (s.startsWith("//")
074                  // or the protocols by which HTML is normally served are OK.
075                  && !allowProtocolRelativeUrls()) {
076                return null;
077              }
078              break protocol_loop;
079            case ':':
080              String protocol = Strings.toLowerCase(s.substring(0, i));
081              if (!protocols.contains(protocol)) { return null; }
082              break protocol_loop;
083          }
084        }
085        return normalizeUri(s);
086      }
087    
088      protected boolean allowProtocolRelativeUrls() {
089        return protocols.contains("http") && protocols.contains("https");
090      }
091    
092      /** Percent encodes anything that looks like a colon, or a parenthesis. */
093      static String normalizeUri(String s) {
094        int n = s.length();
095        boolean colonsIrrelevant = false;
096        for (int i = 0; i < n; ++i) {
097          char ch = s.charAt(i);
098          switch (ch) {
099            case '/': case '#': case '?': case ':':
100              colonsIrrelevant = true;
101              break;
102            case '(': case ')': case '\uff1a':
103              StringBuilder sb = new StringBuilder(n + 16);
104              int pos = 0;
105              for (; i < n; ++i) {
106                ch = s.charAt(i);
107                switch (ch) {
108                  case '(':
109                    sb.append(s, pos, i).append("%28");
110                    pos = i + 1;
111                    break;
112                  case ')':
113                    sb.append(s, pos, i).append("%29");
114                    pos = i + 1;
115                    break;
116                  default:
117                    if (ch > 0x100 && !colonsIrrelevant) {
118                      // Other colon like characters.
119                      // TODO: do we need to encode non-colon characters if we're
120                      // not dealing with URLs that haven't been copy/pasted into
121                      // the URL bar?
122                      // Is it safe to assume UTF-8 here?
123                      switch (ch) {
124                        case '\u0589':
125                          sb.append(s, pos, i).append("%d6%89");
126                          pos = i + 1;
127                          break;
128                        case '\u05c3':
129                          sb.append(s, pos, i).append("%d7%83");
130                          pos = i + 1;
131                          break;
132                        case '\u2236':
133                          sb.append(s, pos, i).append("%e2%88%b6");
134                          pos = i + 1;
135                          break;
136                        case '\uff1a':
137                          sb.append(s, pos, i).append("%ef%bc%9a");
138                          pos = i + 1;
139                          break;
140                      }
141                    }
142                    break;
143                }
144              }
145              return sb.append(s, pos, n).toString();
146          }
147        }
148        return s;
149      }
150    
151      @Override
152      public boolean equals(Object o) {
153        return o != null && this.getClass() == o.getClass()
154            && protocols.equals(((FilterUrlByProtocolAttributePolicy) o).protocols);
155      }
156    
157      @Override
158      public int hashCode() {
159        return protocols.hashCode();
160      }
161    
162    }