1/*
2 * Copyright (C) 2009 The Guava Authors
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.google.common.net;
18
19import static com.google.common.base.Preconditions.checkArgument;
20import static com.google.common.base.Preconditions.checkNotNull;
21import static com.google.common.base.Preconditions.checkState;
22
23import com.google.common.annotations.Beta;
24import com.google.common.annotations.GwtCompatible;
25import com.google.common.base.Ascii;
26import com.google.common.base.CharMatcher;
27import com.google.common.base.Joiner;
28import com.google.common.base.Objects;
29import com.google.common.base.Splitter;
30import com.google.common.collect.ImmutableList;
31
32import java.util.List;
33
34import javax.annotation.Nullable;
35
36/**
37 * An immutable well-formed internet domain name, such as {@code com} or {@code
38 * foo.co.uk}. Only syntactic analysis is performed; no DNS lookups or other
39 * network interactions take place. Thus there is no guarantee that the domain
40 * actually exists on the internet.
41 *
42 * <p>One common use of this class is to determine whether a given string is
43 * likely to represent an addressable domain on the web -- that is, for a
44 * candidate string {@code "xxx"}, might browsing to {@code "http://xxx/"}
45 * result in a webpage being displayed? In the past, this test was frequently
46 * done by determining whether the domain ended with a {@linkplain
47 * #isPublicSuffix() public suffix} but was not itself a public suffix. However,
48 * this test is no longer accurate. There are many domains which are both public
49 * suffixes and addressable as hosts; {@code "uk.com"} is one example. As a
50 * result, the only useful test to determine if a domain is a plausible web host
51 * is {@link #hasPublicSuffix()}. This will return {@code true} for many domains
52 * which (currently) are not hosts, such as {@code "com"}), but given that any
53 * public suffix may become a host without warning, it is better to err on the
54 * side of permissiveness and thus avoid spurious rejection of valid sites.
55 *
56 * <p>During construction, names are normalized in two ways:
57 * <ol>
58 * <li>ASCII uppercase characters are converted to lowercase.
59 * <li>Unicode dot separators other than the ASCII period ({@code '.'}) are
60 * converted to the ASCII period.
61 * </ol>
62 * The normalized values will be returned from {@link #name()} and
63 * {@link #parts()}, and will be reflected in the result of
64 * {@link #equals(Object)}.
65 *
66 * <p><a href="http://en.wikipedia.org/wiki/Internationalized_domain_name">
67 * internationalized domain names</a> such as {@code 网络.cn} are supported, as
68 * are the equivalent <a
69 * href="http://en.wikipedia.org/wiki/Internationalized_domain_name">IDNA
70 * Punycode-encoded</a> versions.
71 *
72 * @author Craig Berry
73 * @since 5.0
74 */
75@Beta
76@GwtCompatible(emulated = true)
77public final class InternetDomainName {
78
79  private static final CharMatcher DOTS_MATCHER =
80      CharMatcher.anyOf(".\u3002\uFF0E\uFF61");
81  private static final Splitter DOT_SPLITTER = Splitter.on('.');
82  private static final Joiner DOT_JOINER = Joiner.on('.');
83
84  /**
85   * Value of {@link #publicSuffixIndex} which indicates that no public suffix
86   * was found.
87   */
88  private static final int NO_PUBLIC_SUFFIX_FOUND = -1;
89
90  private static final String DOT_REGEX = "\\.";
91
92  /**
93   * Maximum parts (labels) in a domain name. This value arises from
94   * the 255-octet limit described in
95   * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11 with
96   * the fact that the encoding of each part occupies at least two bytes
97   * (dot plus label externally, length byte plus label internally). Thus, if
98   * all labels have the minimum size of one byte, 127 of them will fit.
99   */
100  private static final int MAX_PARTS = 127;
101
102  /**
103   * Maximum length of a full domain name, including separators, and
104   * leaving room for the root label. See
105   * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11.
106   */
107  private static final int MAX_LENGTH = 253;
108
109  /**
110   * Maximum size of a single part of a domain name. See
111   * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11.
112   */
113  private static final int MAX_DOMAIN_PART_LENGTH = 63;
114
115  /**
116   * The full domain name, converted to lower case.
117   */
118  private final String name;
119
120  /**
121   * The parts of the domain name, converted to lower case.
122   */
123  private final ImmutableList<String> parts;
124
125  /**
126   * The index in the {@link #parts()} list at which the public suffix begins.
127   * For example, for the domain name {@code www.google.co.uk}, the value would
128   * be 2 (the index of the {@code co} part). The value is negative
129   * (specifically, {@link #NO_PUBLIC_SUFFIX_FOUND}) if no public suffix was
130   * found.
131   */
132  private final int publicSuffixIndex;
133
134  /**
135   * Constructor used to implement {@link #from(String)}, and from subclasses.
136   */
137  InternetDomainName(String name) {
138    // Normalize:
139    // * ASCII characters to lowercase
140    // * All dot-like characters to '.'
141    // * Strip trailing '.'
142
143    name = Ascii.toLowerCase(DOTS_MATCHER.replaceFrom(name, '.'));
144
145    if (name.endsWith(".")) {
146      name = name.substring(0, name.length() - 1);
147    }
148
149    checkArgument(name.length() <= MAX_LENGTH, "Domain name too long: '%s':", name);
150    this.name = name;
151
152    this.parts = ImmutableList.copyOf(DOT_SPLITTER.split(name));
153    checkArgument(parts.size() <= MAX_PARTS, "Domain has too many parts: '%s'", name);
154    checkArgument(validateSyntax(parts), "Not a valid domain name: '%s'", name);
155
156    this.publicSuffixIndex = findPublicSuffix();
157  }
158
159  /**
160   * Returns the index of the leftmost part of the public suffix, or -1 if not
161   * found. Note that the value defined as the "public suffix" may not be a
162   * public suffix according to {@link #isPublicSuffix()} if the domain ends
163   * with an excluded domain pattern such as {@code "nhs.uk"}.
164   */
165  private int findPublicSuffix() {
166    final int partsSize = parts.size();
167
168    for (int i = 0; i < partsSize; i++) {
169      String ancestorName = DOT_JOINER.join(parts.subList(i, partsSize));
170
171      if (TldPatterns.EXACT.contains(ancestorName)) {
172        return i;
173      }
174
175      // Excluded domains (e.g. !nhs.uk) use the next highest
176      // domain as the effective public suffix (e.g. uk).
177
178      if (TldPatterns.EXCLUDED.contains(ancestorName)) {
179        return i + 1;
180      }
181
182      if (matchesWildcardPublicSuffix(ancestorName)) {
183        return i;
184      }
185    }
186
187    return NO_PUBLIC_SUFFIX_FOUND;
188  }
189
190  /**
191   * A deprecated synonym for {@link #from(String)}.
192   *
193   * @param domain A domain name (not IP address)
194   * @throws IllegalArgumentException if {@code name} is not syntactically valid
195   *     according to {@link #isValidLenient}
196   * @since 8.0 (previously named {@code from})
197   * @deprecated Use {@link #from(String)}
198   */
199  @Deprecated
200  public static InternetDomainName fromLenient(String domain) {
201    return from(domain);
202  }
203
204  /**
205   * Returns an instance of {@link InternetDomainName} after lenient
206   * validation.  Specifically, validation against <a
207   * href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>
208   * ("Internationalizing Domain Names in Applications") is skipped, while
209   * validation against <a
210   * href="http://www.ietf.org/rfc/rfc1035.txt">RFC 1035</a> is relaxed in
211   * the following ways:
212   * <ul>
213   * <li>Any part containing non-ASCII characters is considered valid.
214   * <li>Underscores ('_') are permitted wherever dashes ('-') are permitted.
215   * <li>Parts other than the final part may start with a digit.
216   * </ul>
217   *
218   *
219   * @param domain A domain name (not IP address)
220   * @throws IllegalArgumentException if {@code name} is not syntactically valid
221   *     according to {@link #isValid}
222   * @since 10.0 (previously named {@code fromLenient})
223   */
224  public static InternetDomainName from(String domain) {
225    return new InternetDomainName(checkNotNull(domain));
226  }
227
228  /**
229   * Validation method used by {@from} to ensure that the domain name is
230   * syntactically valid according to RFC 1035.
231   *
232   * @return Is the domain name syntactically valid?
233   */
234  private static boolean validateSyntax(List<String> parts) {
235    final int lastIndex = parts.size() - 1;
236
237    // Validate the last part specially, as it has different syntax rules.
238
239    if (!validatePart(parts.get(lastIndex), true)) {
240      return false;
241    }
242
243    for (int i = 0; i < lastIndex; i++) {
244      String part = parts.get(i);
245      if (!validatePart(part, false)) {
246        return false;
247      }
248    }
249
250    return true;
251  }
252
253  private static final CharMatcher DASH_MATCHER = CharMatcher.anyOf("-_");
254
255  private static final CharMatcher PART_CHAR_MATCHER =
256      CharMatcher.JAVA_LETTER_OR_DIGIT.or(DASH_MATCHER);
257
258  /**
259   * Helper method for {@link #validateSyntax(List)}. Validates that one part of
260   * a domain name is valid.
261   *
262   * @param part The domain name part to be validated
263   * @param isFinalPart Is this the final (rightmost) domain part?
264   * @return Whether the part is valid
265   */
266  private static boolean validatePart(String part, boolean isFinalPart) {
267
268    // These tests could be collapsed into one big boolean expression, but
269    // they have been left as independent tests for clarity.
270
271    if (part.length() < 1 || part.length() > MAX_DOMAIN_PART_LENGTH) {
272      return false;
273    }
274
275    /*
276     * GWT claims to support java.lang.Character's char-classification methods,
277     * but it actually only works for ASCII. So for now, assume any non-ASCII
278     * characters are valid. The only place this seems to be documented is here:
279     * http://osdir.com/ml/GoogleWebToolkitContributors/2010-03/msg00178.html
280     *
281     * <p>ASCII characters in the part are expected to be valid per RFC 1035,
282     * with underscore also being allowed due to widespread practice.
283     */
284
285    String asciiChars = CharMatcher.ASCII.retainFrom(part);
286
287    if (!PART_CHAR_MATCHER.matchesAllOf(asciiChars)) {
288      return false;
289    }
290
291    // No initial or final dashes or underscores.
292
293    if (DASH_MATCHER.matches(part.charAt(0))
294        || DASH_MATCHER.matches(part.charAt(part.length() - 1))) {
295      return false;
296    }
297
298    /*
299     * Note that we allow (in contravention of a strict interpretation of the
300     * relevant RFCs) domain parts other than the last may begin with a digit
301     * (for example, "3com.com"). It's important to disallow an initial digit in
302     * the last part; it's the only thing that stops an IPv4 numeric address
303     * like 127.0.0.1 from looking like a valid domain name.
304     */
305
306    if (isFinalPart && CharMatcher.DIGIT.matches(part.charAt(0))) {
307      return false;
308    }
309
310    return true;
311  }
312
313  /**
314   * Returns the domain name, normalized to all lower case.
315   */
316  public String name() {
317    return name;
318  }
319
320  /**
321   * Returns the individual components of this domain name, normalized to all
322   * lower case. For example, for the domain name {@code mail.google.com}, this
323   * method returns the list {@code ["mail", "google", "com"]}.
324   */
325  public ImmutableList<String> parts() {
326    return parts;
327  }
328
329  /**
330   * Indicates whether this domain name represents a <i>public suffix</i>, as
331   * defined by the Mozilla Foundation's
332   * <a href="http://publicsuffix.org/">Public Suffix List</a> (PSL). A public
333   * suffix is one under which Internet users can directly register names, such
334   * as {@code com}, {@code co.uk} or {@code pvt.k12.wy.us}. Examples of domain
335   * names that are <i>not</i> public suffixes include {@code google}, {@code
336   * google.com} and {@code foo.co.uk}.
337   *
338   * @return {@code true} if this domain name appears exactly on the public
339   *     suffix list
340   * @since 6.0
341   */
342  public boolean isPublicSuffix() {
343    return publicSuffixIndex == 0;
344  }
345
346  /**
347   * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix()
348   * public suffix}, including if it is a public suffix itself. For example,
349   * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and
350   * {@code com}, but not for {@code google} or {@code google.foo}. This is
351   * the recommended method for determining whether a domain is potentially an
352   * addressable host.
353   *
354   * @since 6.0
355   */
356  public boolean hasPublicSuffix() {
357    return publicSuffixIndex != NO_PUBLIC_SUFFIX_FOUND;
358  }
359
360  /**
361   * Returns the {@linkplain #isPublicSuffix() public suffix} portion of the
362   * domain name, or {@code null} if no public suffix is present.
363   *
364   * @since 6.0
365   */
366  public InternetDomainName publicSuffix() {
367    return hasPublicSuffix() ? ancestor(publicSuffixIndex) : null;
368  }
369
370  /**
371   * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix()
372   * public suffix}, while not being a public suffix itself. For example,
373   * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and
374   * {@code bar.ca.us}, but not for {@code google}, {@code com}, or {@code
375   * google.foo}.
376   *
377   * <p><b>Warning:</b> a {@code false} result from this method does not imply
378   * that the domain does not represent an addressable host, as many public
379   * suffixes are also addressable hosts. Use {@link #hasPublicSuffix()} for
380   * that test.
381   *
382   * <p>This method can be used to determine whether it will probably be
383   * possible to set cookies on the domain, though even that depends on
384   * individual browsers' implementations of cookie controls. See
385   * <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details.
386   *
387   * @since 6.0
388   */
389  public boolean isUnderPublicSuffix() {
390    return publicSuffixIndex > 0;
391  }
392
393  /**
394   * Indicates whether this domain name is composed of exactly one subdomain
395   * component followed by a {@linkplain #isPublicSuffix() public suffix}. For
396   * example, returns {@code true} for {@code google.com} and {@code foo.co.uk},
397   * but not for {@code www.google.com} or {@code co.uk}.
398   *
399   * <p><b>Warning:</b> A {@code true} result from this method does not imply
400   * that the domain is at the highest level which is addressable as a host, as
401   * many public suffixes are also addressable hosts. For example, the domain
402   * {@code bar.uk.com} has a public suffix of {@code uk.com}, so it would
403   * return {@code true} from this method. But {@code uk.com} is itself an
404   * addressable host.
405   *
406   * <p>This method can be used to determine whether a domain is probably the
407   * highest level for which cookies may be set, though even that depends on
408   * individual browsers' implementations of cookie controls. See
409   * <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details.
410   *
411   * @since 6.0
412   */
413  public boolean isTopPrivateDomain() {
414    return publicSuffixIndex == 1;
415  }
416
417  /**
418   * Returns the portion of this domain name that is one level beneath the
419   * public suffix. For example, for {@code x.adwords.google.co.uk} it returns
420   * {@code google.co.uk}, since {@code co.uk} is a public suffix.
421   *
422   * <p>If {@link #isTopPrivateDomain()} is true, the current domain name
423   * instance is returned.
424   *
425   * <p>This method should not be used to determine the topmost parent domain
426   * which is addressable as a host, as many public suffixes are also
427   * addressable hosts. For example, the domain {@code foo.bar.uk.com} has
428   * a public suffix of {@code uk.com}, so it would return {@code bar.uk.com}
429   * from this method. But {@code uk.com} is itself an addressable host.
430   *
431   * <p>This method can be used to determine the probable highest level parent
432   * domain for which cookies may be set, though even that depends on individual
433   * browsers' implementations of cookie controls.
434   *
435   * @throws IllegalStateException if this domain does not end with a
436   *     public suffix
437   * @since 6.0
438   */
439  public InternetDomainName topPrivateDomain() {
440    if (isTopPrivateDomain()) {
441      return this;
442    }
443    checkState(isUnderPublicSuffix(), "Not under a public suffix: %s", name);
444    return ancestor(publicSuffixIndex - 1);
445  }
446
447  /**
448   * Indicates whether this domain is composed of two or more parts.
449   */
450  public boolean hasParent() {
451    return parts.size() > 1;
452  }
453
454  /**
455   * Returns an {@code InternetDomainName} that is the immediate ancestor of
456   * this one; that is, the current domain with the leftmost part removed. For
457   * example, the parent of {@code www.google.com} is {@code google.com}.
458   *
459   * @throws IllegalStateException if the domain has no parent, as determined
460   *     by {@link #hasParent}
461   */
462  public InternetDomainName parent() {
463    checkState(hasParent(), "Domain '%s' has no parent", name);
464    return ancestor(1);
465  }
466
467  /**
468   * Returns the ancestor of the current domain at the given number of levels
469   * "higher" (rightward) in the subdomain list. The number of levels must be
470   * non-negative, and less than {@code N-1}, where {@code N} is the number of
471   * parts in the domain.
472   *
473   * <p>TODO: Reasonable candidate for addition to public API.
474   */
475  private InternetDomainName ancestor(int levels) {
476    return from(DOT_JOINER.join(parts.subList(levels, parts.size())));
477  }
478
479  /**
480   * Creates and returns a new {@code InternetDomainName} by prepending the
481   * argument and a dot to the current name. For example, {@code
482   * InternetDomainName.from("foo.com").child("www.bar")} returns a new
483   * {@code InternetDomainName} with the value {@code www.bar.foo.com}. Only
484   * lenient validation is performed, as described {@link #from(String) here}.
485   *
486   * @throws NullPointerException if leftParts is null
487   * @throws IllegalArgumentException if the resulting name is not valid
488   */
489  public InternetDomainName child(String leftParts) {
490    return from(checkNotNull(leftParts) + "." + name);
491  }
492
493  /**
494   * A deprecated synonym for {@link #isValid(String)}.
495   *
496   * @since 8.0 (previously named {@code isValid})
497   * @deprecated Use {@link #isValid(String)} instead
498   */
499  @Deprecated
500  public static boolean isValidLenient(String name) {
501    return isValid(name);
502  }
503
504  /**
505   * Indicates whether the argument is a syntactically valid domain name using
506   * lenient validation. Specifically, validation against <a
507   * href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>
508   * ("Internationalizing Domain Names in Applications") is skipped.
509   *
510   * <p>The following two code snippets are equivalent:
511   *
512   * <pre>   {@code
513   *
514   *   domainName = InternetDomainName.isValid(name)
515   *       ? InternetDomainName.from(name)
516   *       : DEFAULT_DOMAIN;
517   *   }</pre>
518   *
519   * <pre>   {@code
520   *
521   *   try {
522   *     domainName = InternetDomainName.from(name);
523   *   } catch (IllegalArgumentException e) {
524   *     domainName = DEFAULT_DOMAIN;
525   *   }}</pre>
526   *
527   * @since 8.0 (previously named {@code isValidLenient})
528   */
529  public static boolean isValid(String name) {
530    try {
531      from(name);
532      return true;
533    } catch (IllegalArgumentException e) {
534      return false;
535    }
536  }
537
538  /**
539   * Does the domain name match one of the "wildcard" patterns (e.g.
540   * {@code "*.ar"})?
541   */
542  private static boolean matchesWildcardPublicSuffix(String domain) {
543    final String[] pieces = domain.split(DOT_REGEX, 2);
544    return pieces.length == 2 && TldPatterns.UNDER.contains(pieces[1]);
545  }
546
547  // TODO: specify this to return the same as name(); remove name()
548  @Override
549  public String toString() {
550    return Objects.toStringHelper(this).add("name", name).toString();
551  }
552
553  /**
554   * Equality testing is based on the text supplied by the caller,
555   * after normalization as described in the class documentation. For
556   * example, a non-ASCII Unicode domain name and the Punycode version
557   * of the same domain name would not be considered equal.
558   *
559   */
560  @Override
561  public boolean equals(@Nullable Object object) {
562    if (object == this) {
563      return true;
564    }
565
566    if (object instanceof InternetDomainName) {
567      InternetDomainName that = (InternetDomainName) object;
568      return this.name.equals(that.name);
569    }
570
571    return false;
572  }
573
574  @Override
575  public int hashCode() {
576    return name.hashCode();
577  }
578}
579