/* * Copyright (C) 2007 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package android.net; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Set; import java.util.StringTokenizer; /** * * Sanitizes the Query portion of a URL. Simple example: *


 * <pre>
 * UrlQuerySanitizer sanitizer = new UrlQuerySanitizer();
 * sanitizer.setAllowUnregisteredParamaters(true);
 * sanitizer.parseUrl("http://example.com/?name=Joe+User");
 * String name = sanitizer.getValue("name");
 * // name now contains "Joe_User"
 * </pre>
 *

* * Register ValueSanitizers to customize the way individual * parameters are sanitized: *


 * <pre>
 * UrlQuerySanitizer sanitizer = new UrlQuerySanitizer();
 * sanitizer.registerParamater("name", UrlQuerySanitizer.getSpaceLegal());
 * sanitizer.parseUrl("http://example.com/?name=Joe+User");
 * String name = sanitizer.getValue("name");
 * // name now contains "Joe User". (The string is first decoded, which
 * // converts the '+' to a ' '. Then the string is sanitized; the ' ' is
 * // kept because the sanitizer registered for "name" allows spaces. In
 * // the first example the ' ' was converted to an '_' because the default
 * // unregistered parameter sanitizer does not allow any special characters,
 * // and ' ' is a special character.)
 * </pre>
 *

* * There are several ways to create ValueSanitizers. In order of increasing * sophistication: *

 * <ol>
 * <li>Call one of the UrlQuerySanitizer.getXXX() methods, for example
 * getSpaceLegal().</li>
 * <li>Construct your own instance of
 * UrlQuerySanitizer.IllegalCharacterValueSanitizer.</li>
 * <li>Subclass UrlQuerySanitizer.ValueSanitizer to define your own value
 * sanitizer.</li>
 * </ol>

* */ public class UrlQuerySanitizer { /** * A simple tuple that holds parameter-value pairs. * */ public class ParameterValuePair { /** * Construct a parameter-value tuple. * @param parameter an unencoded parameter * @param value an unencoded value */ public ParameterValuePair(String parameter, String value) { mParameter = parameter; mValue = value; } /** * The unencoded parameter */ public String mParameter; /** * The unencoded value */ public String mValue; } final private HashMap mSanitizers = new HashMap(); final private HashMap mEntries = new HashMap(); final private ArrayList mEntriesList = new ArrayList(); private boolean mAllowUnregisteredParamaters; private boolean mPreferFirstRepeatedParameter; private ValueSanitizer mUnregisteredParameterValueSanitizer = getAllIllegal(); /** * A functor used to sanitize a single query value. * */ public static interface ValueSanitizer { /** * Sanitize an unencoded value. * @param value * @return the sanitized unencoded value */ public String sanitize(String value); } /** * Sanitize values based on which characters they contain. Illegal * characters are replaced with either space or '_', depending upon * whether space is a legal character or not. */ public static class IllegalCharacterValueSanitizer implements ValueSanitizer { private int mFlags; /** * Allow space (' ') characters. */ public final static int SPACE_OK = 1 << 0; /** * Allow whitespace characters other than space. The * other whitespace characters are * '\t' '\f' '\n' '\r' and '\0x000b' (vertical tab) */ public final static int OTHER_WHITESPACE_OK = 1 << 1; /** * Allow characters with character codes 128 to 255. */ public final static int NON_7_BIT_ASCII_OK = 1 << 2; /** * Allow double quote characters. ('"') */ public final static int DQUOTE_OK = 1 << 3; /** * Allow single quote characters. ('\'') */ public final static int SQUOTE_OK = 1 << 4; /** * Allow less-than characters. 
('<') */ public final static int LT_OK = 1 << 5; /** * Allow greater-than characters. ('>') */ public final static int GT_OK = 1 << 6; /** * Allow ampersand characters ('&') */ public final static int AMP_OK = 1 << 7; /** * Allow percent-sign characters ('%') */ public final static int PCT_OK = 1 << 8; /** * Allow nul characters ('\0') */ public final static int NUL_OK = 1 << 9; /** * Allow text to start with a script URL * such as "javascript:" or "vbscript:" */ public final static int SCRIPT_URL_OK = 1 << 10; /** * Mask with all fields set to OK */ public final static int ALL_OK = 0x7ff; /** * Mask with both regular space and other whitespace OK */ public final static int ALL_WHITESPACE_OK = SPACE_OK | OTHER_WHITESPACE_OK; // Common flag combinations: /** *

Deny all special characters. *
Deny script URLs. *

*/ public final static int ALL_ILLEGAL = 0; /** *

Allow all special characters except Nul. ('\0'). *
Allow script URLs. *

*/ public final static int ALL_BUT_NUL_LEGAL = ALL_OK & ~NUL_OK; /** *

Allow all special characters except for: *
- whitespace characters *
- Nul ('\0') *
*
Allow script URLs. *

*/ public final static int ALL_BUT_WHITESPACE_LEGAL = ALL_OK & ~(ALL_WHITESPACE_OK | NUL_OK); /** *

Allow characters used by encoded URLs. *
Deny script URLs. *

*/ public final static int URL_LEGAL = NON_7_BIT_ASCII_OK | SQUOTE_OK | AMP_OK | PCT_OK; /** *

Allow characters used by encoded URLs. *
Allow spaces. *
Deny script URLs. *

*/ public final static int URL_AND_SPACE_LEGAL = URL_LEGAL | SPACE_OK; /** *

Allow ampersand. *
Deny script URLs. *

*/ public final static int AMP_LEGAL = AMP_OK; /** *

Allow ampersand. *
Allow space. *
Deny script URLs. *

*/ public final static int AMP_AND_SPACE_LEGAL = AMP_OK | SPACE_OK; /** *

Allow space. *
Deny script URLs. *

*/ public final static int SPACE_LEGAL = SPACE_OK; /** *

Allow all but. *
- Nul ('\0') *
- Angle brackets ('<', '>') *
*
Deny script URLs. *

*/ public final static int ALL_BUT_NUL_AND_ANGLE_BRACKETS_LEGAL = ALL_OK & ~(NUL_OK | LT_OK | GT_OK); /** * Script URL definitions */ private final static String JAVASCRIPT_PREFIX = "javascript:"; private final static String VBSCRIPT_PREFIX = "vbscript:"; private final static int MIN_SCRIPT_PREFIX_LENGTH = Math.min( JAVASCRIPT_PREFIX.length(), VBSCRIPT_PREFIX.length()); /** * Construct a sanitizer. The parameters set the behavior of the * sanitizer. * @param flags some combination of the XXX_OK flags. */ public IllegalCharacterValueSanitizer( int flags) { mFlags = flags; } /** * Sanitize a value. *

If script URLs are not OK, the will be removed. *
If neither spaces nor other white space is OK, then * white space will be trimmed from the beginning and end of * the URL. (Just the actual white space characters are trimmed, not * other control codes.) *
Illegal characters will be replaced with * either ' ' or '_', depending on whether a space is itself a * legal character. *

* @param value * @return the sanitized value */ public String sanitize(String value) { if (value == null) { return null; } int length = value.length(); if ((mFlags & SCRIPT_URL_OK) != 0) { if (length >= MIN_SCRIPT_PREFIX_LENGTH) { String asLower = value.toLowerCase(); if (asLower.startsWith(JAVASCRIPT_PREFIX) || asLower.startsWith(VBSCRIPT_PREFIX)) { return ""; } } } // If whitespace isn't OK, get rid of whitespace at beginning // and end of value. if ( (mFlags & ALL_WHITESPACE_OK) == 0) { value = trimWhitespace(value); // The length could have changed, so we need to correct // the length variable. length = value.length(); } StringBuilder stringBuilder = new StringBuilder(length); for(int i = 0; i < length; i++) { char c = value.charAt(i); if (!characterIsLegal(c)) { if ((mFlags & SPACE_OK) != 0) { c = ' '; } else { c = '_'; } } stringBuilder.append(c); } return stringBuilder.toString(); } /** * Trim whitespace from the beginning and end of a string. *

* Note: can't use {@link String#trim} because {@link String#trim} has a * different definition of whitespace than we want. * @param value the string to trim * @return the trimmed string */ private String trimWhitespace(String value) { int start = 0; int last = value.length() - 1; int end = last; while (start <= end && isWhitespace(value.charAt(start))) { start++; } while (end >= start && isWhitespace(value.charAt(end))) { end--; } if (start == 0 && end == last) { return value; } return value.substring(start, end + 1); } /** * Check if c is whitespace. * @param c character to test * @return true if c is a whitespace character */ private boolean isWhitespace(char c) { switch(c) { case ' ': case '\t': case '\f': case '\n': case '\r': case 11: /* VT */ return true; default: return false; } } /** * Check whether an individual character is legal. Uses the * flag bit-set passed into the constructor. * @param c * @return true if c is a legal character */ private boolean characterIsLegal(char c) { switch(c) { case ' ' : return (mFlags & SPACE_OK) != 0; case '\t': case '\f': case '\n': case '\r': case 11: /* VT */ return (mFlags & OTHER_WHITESPACE_OK) != 0; case '\"': return (mFlags & DQUOTE_OK) != 0; case '\'': return (mFlags & SQUOTE_OK) != 0; case '<' : return (mFlags & LT_OK) != 0; case '>' : return (mFlags & GT_OK) != 0; case '&' : return (mFlags & AMP_OK) != 0; case '%' : return (mFlags & PCT_OK) != 0; case '\0': return (mFlags & NUL_OK) != 0; default : return (c >= 32 && c < 127) || ((c >= 128) && ((mFlags & NON_7_BIT_ASCII_OK) != 0)); } } } /** * Get the current value sanitizer used when processing * unregistered parameter values. *

* Note: The default unregistered parameter value sanitizer is * one that doesn't allow any special characters, similar to what * is returned by calling createAllIllegal. * * @return the current ValueSanitizer used to sanitize unregistered * parameter values. */ public ValueSanitizer getUnregisteredParameterValueSanitizer() { return mUnregisteredParameterValueSanitizer; } /** * Set the value sanitizer used when processing unregistered * parameter values. * @param sanitizer set the ValueSanitizer used to sanitize unregistered * parameter values. */ public void setUnregisteredParameterValueSanitizer( ValueSanitizer sanitizer) { mUnregisteredParameterValueSanitizer = sanitizer; } // Private fields for singleton sanitizers: private static final ValueSanitizer sAllIllegal = new IllegalCharacterValueSanitizer( IllegalCharacterValueSanitizer.ALL_ILLEGAL); private static final ValueSanitizer sAllButNulLegal = new IllegalCharacterValueSanitizer( IllegalCharacterValueSanitizer.ALL_BUT_NUL_LEGAL); private static final ValueSanitizer sAllButWhitespaceLegal = new IllegalCharacterValueSanitizer( IllegalCharacterValueSanitizer.ALL_BUT_WHITESPACE_LEGAL); private static final ValueSanitizer sURLLegal = new IllegalCharacterValueSanitizer( IllegalCharacterValueSanitizer.URL_LEGAL); private static final ValueSanitizer sUrlAndSpaceLegal = new IllegalCharacterValueSanitizer( IllegalCharacterValueSanitizer.URL_AND_SPACE_LEGAL); private static final ValueSanitizer sAmpLegal = new IllegalCharacterValueSanitizer( IllegalCharacterValueSanitizer.AMP_LEGAL); private static final ValueSanitizer sAmpAndSpaceLegal = new IllegalCharacterValueSanitizer( IllegalCharacterValueSanitizer.AMP_AND_SPACE_LEGAL); private static final ValueSanitizer sSpaceLegal = new IllegalCharacterValueSanitizer( IllegalCharacterValueSanitizer.SPACE_LEGAL); private static final ValueSanitizer sAllButNulAndAngleBracketsLegal = new IllegalCharacterValueSanitizer( 
IllegalCharacterValueSanitizer.ALL_BUT_NUL_AND_ANGLE_BRACKETS_LEGAL); /** * Return a value sanitizer that does not allow any special characters, * and also does not allow script URLs. * @return a value sanitizer */ public static final ValueSanitizer getAllIllegal() { return sAllIllegal; } /** * Return a value sanitizer that allows everything except Nul ('\0') * characters. Script URLs are allowed. * @return a value sanitizer */ public static final ValueSanitizer getAllButNulLegal() { return sAllButNulLegal; } /** * Return a value sanitizer that allows everything except Nul ('\0') * characters, space (' '), and other whitespace characters. * Script URLs are allowed. * @return a value sanitizer */ public static final ValueSanitizer getAllButWhitespaceLegal() { return sAllButWhitespaceLegal; } /** * Return a value sanitizer that allows all the characters used by * encoded URLs. Does not allow script URLs. * @return a value sanitizer */ public static final ValueSanitizer getUrlLegal() { return sURLLegal; } /** * Return a value sanitizer that allows all the characters used by * encoded URLs and allows spaces, which are not technically legal * in encoded URLs, but commonly appear anyway. * Does not allow script URLs. * @return a value sanitizer */ public static final ValueSanitizer getUrlAndSpaceLegal() { return sUrlAndSpaceLegal; } /** * Return a value sanitizer that does not allow any special characters * except ampersand ('&'). Does not allow script URLs. * @return a value sanitizer */ public static final ValueSanitizer getAmpLegal() { return sAmpLegal; } /** * Return a value sanitizer that does not allow any special characters * except ampersand ('&') and space (' '). Does not allow script URLs. * @return a value sanitizer */ public static final ValueSanitizer getAmpAndSpaceLegal() { return sAmpAndSpaceLegal; } /** * Return a value sanitizer that does not allow any special characters * except space (' '). Does not allow script URLs. 
* @return a value sanitizer */ public static final ValueSanitizer getSpaceLegal() { return sSpaceLegal; } /** * Return a value sanitizer that allows any special characters * except angle brackets ('<' and '>') and Nul ('\0'). * Allows script URLs. * @return a value sanitizer */ public static final ValueSanitizer getAllButNulAndAngleBracketsLegal() { return sAllButNulAndAngleBracketsLegal; } /** * Constructs a UrlQuerySanitizer. *

* Defaults: *

unregistered parameters are not allowed. *
the last instance of a repeated parameter is preferred. *
The default value sanitizer is an AllIllegal value sanitizer. *