Alan Viverette | 3da604b | 2020-06-10 18:34:39 +0000 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (C) 2014 The Android Open Source Project |
| 3 | * Copyright (c) 2000, 2013, Oracle and/or its affiliates. All rights reserved. |
| 4 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
| 5 | * |
| 6 | * This code is free software; you can redistribute it and/or modify it |
| 7 | * under the terms of the GNU General Public License version 2 only, as |
| 8 | * published by the Free Software Foundation. Oracle designates this |
| 9 | * particular file as subject to the "Classpath" exception as provided |
| 10 | * by Oracle in the LICENSE file that accompanied this code. |
| 11 | * |
| 12 | * This code is distributed in the hope that it will be useful, but WITHOUT |
| 13 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| 14 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| 15 | * version 2 for more details (a copy is included in the LICENSE file that |
| 16 | * accompanied this code). |
| 17 | * |
| 18 | * You should have received a copy of the GNU General Public License version |
| 19 | * 2 along with this work; if not, write to the Free Software Foundation, |
| 20 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
| 21 | * |
| 22 | * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
| 23 | * or visit www.oracle.com if you need additional information or have any |
| 24 | * questions. |
| 25 | */ |
| 26 | |
| 27 | package java.net; |
| 28 | |
| 29 | import java.io.IOException; |
| 30 | import java.io.InvalidObjectException; |
| 31 | import java.io.ObjectInputStream; |
| 32 | import java.io.ObjectOutputStream; |
| 33 | import java.io.Serializable; |
| 34 | import java.nio.ByteBuffer; |
| 35 | import java.nio.CharBuffer; |
| 36 | import java.nio.charset.CharsetDecoder; |
| 37 | import java.nio.charset.CoderResult; |
| 38 | import java.nio.charset.CodingErrorAction; |
| 39 | import java.nio.charset.CharacterCodingException; |
| 40 | import java.text.Normalizer; |
| 41 | import sun.nio.cs.ThreadLocalCoders; |
| 42 | |
| 43 | import java.lang.Character; // for javadoc |
| 44 | import java.lang.NullPointerException; // for javadoc |
| 45 | |
| 46 | |
| 47 | // Android-changed: Reformat @see links. |
| 48 | /** |
| 49 | * Represents a Uniform Resource Identifier (URI) reference. |
| 50 | * |
| 51 | * <p> Aside from some minor deviations noted below, an instance of this |
| 52 | * class represents a URI reference as defined by |
| 53 | * <a href="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC 2396: Uniform |
| 54 | * Resource Identifiers (URI): Generic Syntax</i></a>, amended by <a |
| 55 | * href="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC 2732: Format for |
| 56 | * Literal IPv6 Addresses in URLs</i></a>. The Literal IPv6 address format |
| 57 | * also supports scope_ids. The syntax and usage of scope_ids is described |
| 58 | * <a href="Inet6Address.html#scoped">here</a>. |
| 59 | * This class provides constructors for creating URI instances from |
| 60 | * their components or by parsing their string forms, methods for accessing the |
| 61 | * various components of an instance, and methods for normalizing, resolving, |
| 62 | * and relativizing URI instances. Instances of this class are immutable. |
| 63 | * |
| 64 | * |
| 65 | * <h3> URI syntax and components </h3> |
| 66 | * |
| 67 | * At the highest level a URI reference (hereinafter simply "URI") in string |
| 68 | * form has the syntax |
| 69 | * |
| 70 | * <blockquote> |
| 71 | * [<i>scheme</i><b>{@code :}</b>]<i>scheme-specific-part</i>[<b>{@code #}</b><i>fragment</i>] |
| 72 | * </blockquote> |
| 73 | * |
| 74 | * where square brackets [...] delineate optional components and the characters |
| 75 | * <b>{@code :}</b> and <b>{@code #}</b> stand for themselves. |
| 76 | * |
| 77 | * <p> An <i>absolute</i> URI specifies a scheme; a URI that is not absolute is |
| 78 | * said to be <i>relative</i>. URIs are also classified according to whether |
| 79 | * they are <i>opaque</i> or <i>hierarchical</i>. |
| 80 | * |
| 81 | * <p> An <i>opaque</i> URI is an absolute URI whose scheme-specific part does |
| 82 | * not begin with a slash character ({@code '/'}). Opaque URIs are not |
| 83 | * subject to further parsing. Some examples of opaque URIs are: |
| 84 | * |
| 85 | * <blockquote><table cellpadding=0 cellspacing=0 summary="layout"> |
| 86 | * <tr><td>{@code mailto:java-net@java.sun.com}</td></tr> |
| 87 | * <tr><td>{@code news:comp.lang.java}</td></tr> |
| 88 | * <tr><td>{@code urn:isbn:096139210x}</td></tr> |
| 89 | * </table></blockquote> |
| 90 | * |
| 91 | * <p> A <i>hierarchical</i> URI is either an absolute URI whose |
| 92 | * scheme-specific part begins with a slash character, or a relative URI, that |
| 93 | * is, a URI that does not specify a scheme. Some examples of hierarchical |
| 94 | * URIs are: |
| 95 | * |
| 96 | * <blockquote> |
| 97 | * {@code http://java.sun.com/j2se/1.3/}<br> |
| 98 | * {@code docs/guide/collections/designfaq.html#28}<br> |
| 99 | * {@code ../../../demo/jfc/SwingSet2/src/SwingSet2.java}<br> |
| 100 | * {@code file:///~/calendar} |
| 101 | * </blockquote> |
| 102 | * |
| 103 | * <p> A hierarchical URI is subject to further parsing according to the syntax |
| 104 | * |
| 105 | * <blockquote> |
| 106 | * [<i>scheme</i><b>{@code :}</b>][<b>{@code //}</b><i>authority</i>][<i>path</i>][<b>{@code ?}</b><i>query</i>][<b>{@code #}</b><i>fragment</i>] |
| 107 | * </blockquote> |
| 108 | * |
| 109 | * where the characters <b>{@code :}</b>, <b>{@code /}</b>, |
| 110 | * <b>{@code ?}</b>, and <b>{@code #}</b> stand for themselves. The |
| 111 | * scheme-specific part of a hierarchical URI consists of the characters |
| 112 | * between the scheme and fragment components. |
| 113 | * |
| 114 | * <p> The authority component of a hierarchical URI is, if specified, either |
| 115 | * <i>server-based</i> or <i>registry-based</i>. A server-based authority |
| 116 | * parses according to the familiar syntax |
| 117 | * |
| 118 | * <blockquote> |
| 119 | * [<i>user-info</i><b>{@code @}</b>]<i>host</i>[<b>{@code :}</b><i>port</i>] |
| 120 | * </blockquote> |
| 121 | * |
| 122 | * where the characters <b>{@code @}</b> and <b>{@code :}</b> stand for |
| 123 | * themselves. Nearly all URI schemes currently in use are server-based. An |
| 124 | * authority component that does not parse in this way is considered to be |
| 125 | * registry-based. |
| 126 | * |
| 127 | * <p> The path component of a hierarchical URI is itself said to be absolute |
| 128 | * if it begins with a slash character ({@code '/'}); otherwise it is |
| 129 | * relative. The path of a hierarchical URI that is either absolute or |
| 130 | * specifies an authority is always absolute. |
| 131 | * |
| 132 | * <p> All told, then, a URI instance has the following nine components: |
| 133 | * |
| 134 | * <blockquote><table summary="Describes the components of a URI:scheme,scheme-specific-part,authority,user-info,host,port,path,query,fragment"> |
| 135 | * <tr><th><i>Component</i></th><th><i>Type</i></th></tr> |
| 136 | * <tr><td>scheme</td><td>{@code String}</td></tr> |
| 137 | * <tr><td>scheme-specific-part </td><td>{@code String}</td></tr> |
| 138 | * <tr><td>authority</td><td>{@code String}</td></tr> |
| 139 | * <tr><td>user-info</td><td>{@code String}</td></tr> |
| 140 | * <tr><td>host</td><td>{@code String}</td></tr> |
| 141 | * <tr><td>port</td><td>{@code int}</td></tr> |
| 142 | * <tr><td>path</td><td>{@code String}</td></tr> |
| 143 | * <tr><td>query</td><td>{@code String}</td></tr> |
| 144 | * <tr><td>fragment</td><td>{@code String}</td></tr> |
| 145 | * </table></blockquote> |
| 146 | * |
| 147 | * In a given instance any particular component is either <i>undefined</i> or |
| 148 | * <i>defined</i> with a distinct value. Undefined string components are |
| 149 | * represented by {@code null}, while undefined integer components are |
| 150 | * represented by {@code -1}. A string component may be defined to have the |
| 151 | * empty string as its value; this is not equivalent to that component being |
| 152 | * undefined. |
| 153 | * |
| 154 | * <p> Whether a particular component is or is not defined in an instance |
| 155 | * depends upon the type of the URI being represented. An absolute URI has a |
| 156 | * scheme component. An opaque URI has a scheme, a scheme-specific part, and |
| 157 | * possibly a fragment, but has no other components. A hierarchical URI always |
| 158 | * has a path (though it may be empty) and a scheme-specific-part (which at |
| 159 | * least contains the path), and may have any of the other components. If the |
| 160 | * authority component is present and is server-based then the host component |
| 161 | * will be defined and the user-information and port components may be defined. |
| 162 | * |
| 163 | * |
| 164 | * <h4> Operations on URI instances </h4> |
| 165 | * |
| 166 | * The key operations supported by this class are those of |
| 167 | * <i>normalization</i>, <i>resolution</i>, and <i>relativization</i>. |
| 168 | * |
| 169 | * <p> <i>Normalization</i> is the process of removing unnecessary {@code "."} |
| 170 | * and {@code ".."} segments from the path component of a hierarchical URI. |
| 171 | * Each {@code "."} segment is simply removed. A {@code ".."} segment is |
| 172 | * removed only if it is preceded by a non-{@code ".."} segment. |
| 173 | * Normalization has no effect upon opaque URIs. |
| 174 | * |
| 175 | * <p> <i>Resolution</i> is the process of resolving one URI against another, |
| 176 | * <i>base</i> URI. The resulting URI is constructed from components of both |
| 177 | * URIs in the manner specified by RFC 2396, taking components from the |
| 178 | * base URI for those not specified in the original. For hierarchical URIs, |
| 179 | * the path of the original is resolved against the path of the base and then |
| 180 | * normalized. The result, for example, of resolving |
| 181 | * |
| 182 | * <blockquote> |
| 183 | * {@code docs/guide/collections/designfaq.html#28} |
| 184 | * |
| 185 | * (1) |
| 186 | * </blockquote> |
| 187 | * |
| 188 | * against the base URI {@code http://java.sun.com/j2se/1.3/} is the result |
| 189 | * URI |
| 190 | * |
| 191 | * <blockquote> |
| 192 | * {@code http://java.sun.com/j2se/1.3/docs/guide/collections/designfaq.html#28} |
| 193 | * </blockquote> |
| 194 | * |
| 195 | * Resolving the relative URI |
| 196 | * |
| 197 | * <blockquote> |
| 198 | * {@code ../../../demo/jfc/SwingSet2/src/SwingSet2.java} (2) |
| 199 | * </blockquote> |
| 200 | * |
| 201 | * against this result yields, in turn, |
| 202 | * |
| 203 | * <blockquote> |
| 204 | * {@code http://java.sun.com/j2se/1.3/demo/jfc/SwingSet2/src/SwingSet2.java} |
| 205 | * </blockquote> |
| 206 | * |
| 207 | * Resolution of both absolute and relative URIs, and of both absolute and |
| 208 | * relative paths in the case of hierarchical URIs, is supported. Resolving |
| 209 | * the URI {@code file:///~/calendar} against any other URI simply yields the |
| 210 | * original URI, since it is absolute. Resolving the relative URI (2) above |
| 211 | * against the relative base URI (1) yields the normalized, but still relative, |
| 212 | * URI |
| 213 | * |
| 214 | * <blockquote> |
| 215 | * {@code demo/jfc/SwingSet2/src/SwingSet2.java} |
| 216 | * </blockquote> |
| 217 | * |
| 218 | * <p> <i>Relativization</i>, finally, is the inverse of resolution: For any |
| 219 | * two normalized URIs <i>u</i> and <i>v</i>, |
| 220 | * |
| 221 | * <blockquote> |
| 222 | * <i>u</i>{@code .relativize(}<i>u</i>{@code .resolve(}<i>v</i>{@code )).equals(}<i>v</i>{@code )} and<br> |
| 223 | * <i>u</i>{@code .resolve(}<i>u</i>{@code .relativize(}<i>v</i>{@code )).equals(}<i>v</i>{@code )} .<br> |
| 224 | * </blockquote> |
| 225 | * |
| 226 | * This operation is often useful when constructing a document containing URIs |
| 227 | * that must be made relative to the base URI of the document wherever |
| 228 | * possible. For example, relativizing the URI |
| 229 | * |
| 230 | * <blockquote> |
| 231 | * {@code http://java.sun.com/j2se/1.3/docs/guide/index.html} |
| 232 | * </blockquote> |
| 233 | * |
| 234 | * against the base URI |
| 235 | * |
| 236 | * <blockquote> |
| 237 | * {@code http://java.sun.com/j2se/1.3} |
| 238 | * </blockquote> |
| 239 | * |
| 240 | * yields the relative URI {@code docs/guide/index.html}. |
| 241 | * |
| 242 | * |
| 243 | * <h4> Character categories </h4> |
| 244 | * |
| 245 | * RFC 2396 specifies precisely which characters are permitted in the |
| 246 | * various components of a URI reference. The following categories, most of |
| 247 | * which are taken from that specification, are used below to describe these |
| 248 | * constraints: |
| 249 | * |
| 250 | * <blockquote><table cellspacing=2 summary="Describes categories alpha,digit,alphanum,unreserved,punct,reserved,escaped,and other"> |
| 251 | * <tr><th valign=top><i>alpha</i></th> |
| 252 | * <td>The US-ASCII alphabetic characters, |
| 253 | * {@code 'A'} through {@code 'Z'} |
| 254 | * and {@code 'a'} through {@code 'z'}</td></tr> |
| 255 | * <tr><th valign=top><i>digit</i></th> |
| 256 | * <td>The US-ASCII decimal digit characters, |
| 257 | * {@code '0'} through {@code '9'}</td></tr> |
| 258 | * <tr><th valign=top><i>alphanum</i></th> |
| 259 | * <td>All <i>alpha</i> and <i>digit</i> characters</td></tr> |
| 260 | * <tr><th valign=top><i>unreserved</i> </th> |
| 261 | * <td>All <i>alphanum</i> characters together with those in the string |
| 262 | * {@code "_-!.~'()*"}</td></tr> |
| 263 | * <tr><th valign=top><i>punct</i></th> |
| 264 | * <td>The characters in the string {@code ",;:$&+="}</td></tr> |
| 265 | * <tr><th valign=top><i>reserved</i></th> |
| 266 | * <td>All <i>punct</i> characters together with those in the string |
| 267 | * {@code "?/[]@"}</td></tr> |
| 268 | * <tr><th valign=top><i>escaped</i></th> |
| 269 | * <td>Escaped octets, that is, triplets consisting of the percent |
| 270 | * character ({@code '%'}) followed by two hexadecimal digits |
| 271 | * ({@code '0'}-{@code '9'}, {@code 'A'}-{@code 'F'}, and |
| 272 | * {@code 'a'}-{@code 'f'})</td></tr> |
| 273 | * <tr><th valign=top><i>other</i></th> |
| 274 | * <td>The Unicode characters that are not in the US-ASCII character set, |
| 275 | * are not control characters (according to the {@link |
| 276 | * java.lang.Character#isISOControl(char) Character.isISOControl} |
| 277 | * method), and are not space characters (according to the {@link |
| 278 | * java.lang.Character#isSpaceChar(char) Character.isSpaceChar} |
| 279 | * method) <i>(<b>Deviation from RFC 2396</b>, which is |
| 280 | * limited to US-ASCII)</i></td></tr> |
| 281 | * </table></blockquote> |
| 282 | * |
| 283 | * <p><a name="legal-chars"></a> The set of all legal URI characters consists of |
| 284 | * the <i>unreserved</i>, <i>reserved</i>, <i>escaped</i>, and <i>other</i> |
| 285 | * characters. |
| 286 | * |
| 287 | * |
| 288 | * <h4> Escaped octets, quotation, encoding, and decoding </h4> |
| 289 | * |
| 290 | * RFC 2396 allows escaped octets to appear in the user-info, path, query, and |
| 291 | * fragment components. Escaping serves two purposes in URIs: |
| 292 | * |
| 293 | * <ul> |
| 294 | * |
| 295 | * <li><p> To <i>encode</i> non-US-ASCII characters when a URI is required to |
| 296 | * conform strictly to RFC 2396 by not containing any <i>other</i> |
| 297 | * characters. </p></li> |
| 298 | * |
| 299 | * <li><p> To <i>quote</i> characters that are otherwise illegal in a |
| 300 | * component. The user-info, path, query, and fragment components differ |
| 301 | * slightly in terms of which characters are considered legal and illegal. |
| 302 | * </p></li> |
| 303 | * |
| 304 | * </ul> |
| 305 | * |
| 306 | * These purposes are served in this class by three related operations: |
| 307 | * |
| 308 | * <ul> |
| 309 | * |
| 310 | * <li><p><a name="encode"></a> A character is <i>encoded</i> by replacing it |
| 311 | * with the sequence of escaped octets that represent that character in the |
| 312 | * UTF-8 character set. The Euro currency symbol ({@code '\u005Cu20AC'}), |
| 313 | * for example, is encoded as {@code "%E2%82%AC"}. <i>(<b>Deviation from |
| 314 | * RFC 2396</b>, which does not specify any particular character |
| 315 | * set.)</i> </p></li> |
| 316 | * |
| 317 | * <li><p><a name="quote"></a> An illegal character is <i>quoted</i> simply by |
| 318 | * encoding it. The space character, for example, is quoted by replacing it |
| 319 | * with {@code "%20"}. UTF-8 contains US-ASCII, hence for US-ASCII |
| 320 | * characters this transformation has exactly the effect required by |
| 321 | * RFC 2396. </p></li> |
| 322 | * |
| 323 | * <li><p><a name="decode"></a> |
| 324 | * A sequence of escaped octets is <i>decoded</i> by |
| 325 | * replacing it with the sequence of characters that it represents in the |
| 326 | * UTF-8 character set. UTF-8 contains US-ASCII, hence decoding has the |
| 327 | * effect of de-quoting any quoted US-ASCII characters as well as that of |
| 328 | * decoding any encoded non-US-ASCII characters. If a <a |
| 329 | * href="../nio/charset/CharsetDecoder.html#ce">decoding error</a> occurs |
| 330 | * when decoding the escaped octets then the erroneous octets are replaced by |
| 331 | * {@code '\u005CuFFFD'}, the Unicode replacement character. </p></li> |
| 332 | * |
| 333 | * </ul> |
| 334 | * |
| 335 | * These operations are exposed in the constructors and methods of this class |
| 336 | * as follows: |
| 337 | * |
| 338 | * <ul> |
| 339 | * |
| 340 | * <li><p> The {@linkplain #URI(java.lang.String) single-argument |
| 341 | * constructor} requires any illegal characters in its argument to be |
| 342 | * quoted and preserves any escaped octets and <i>other</i> characters that |
| 343 | * are present. </p></li> |
| 344 | * |
| 345 | * <li><p> The {@linkplain |
| 346 | * #URI(java.lang.String,java.lang.String,java.lang.String,int,java.lang.String,java.lang.String,java.lang.String) |
| 347 | * multi-argument constructors} quote illegal characters as |
| 348 | * required by the components in which they appear. The percent character |
| 349 | * ({@code '%'}) is always quoted by these constructors. Any <i>other</i> |
| 350 | * characters are preserved. </p></li> |
| 351 | * |
| 352 | * <li><p> The {@link #getRawUserInfo() getRawUserInfo}, {@link #getRawPath() |
| 353 | * getRawPath}, {@link #getRawQuery() getRawQuery}, {@link #getRawFragment() |
| 354 | * getRawFragment}, {@link #getRawAuthority() getRawAuthority}, and {@link |
| 355 | * #getRawSchemeSpecificPart() getRawSchemeSpecificPart} methods return the |
| 356 | * values of their corresponding components in raw form, without interpreting |
| 357 | * any escaped octets. The strings returned by these methods may contain |
| 358 | * both escaped octets and <i>other</i> characters, and will not contain any |
| 359 | * illegal characters. </p></li> |
| 360 | * |
| 361 | * <li><p> The {@link #getUserInfo() getUserInfo}, {@link #getPath() |
| 362 | * getPath}, {@link #getQuery() getQuery}, {@link #getFragment() |
| 363 | * getFragment}, {@link #getAuthority() getAuthority}, and {@link |
| 364 | * #getSchemeSpecificPart() getSchemeSpecificPart} methods decode any escaped |
| 365 | * octets in their corresponding components. The strings returned by these |
| 366 | * methods may contain both <i>other</i> characters and illegal characters, |
| 367 | * and will not contain any escaped octets. </p></li> |
| 368 | * |
| 369 | * <li><p> The {@link #toString() toString} method returns a URI string with |
| 370 | * all necessary quotation but which may contain <i>other</i> characters. |
| 371 | * </p></li> |
| 372 | * |
| 373 | * <li><p> The {@link #toASCIIString() toASCIIString} method returns a fully |
| 374 | * quoted and encoded URI string that does not contain any <i>other</i> |
| 375 | * characters. </p></li> |
| 376 | * |
| 377 | * </ul> |
| 378 | * |
| 379 | * |
| 380 | * <h4> Identities </h4> |
| 381 | * |
| 382 | * For any URI <i>u</i>, it is always the case that |
| 383 | * |
| 384 | * <blockquote> |
| 385 | * {@code new URI(}<i>u</i>{@code .toString()).equals(}<i>u</i>{@code )} . |
| 386 | * </blockquote> |
| 387 | * |
| 388 | * For any URI <i>u</i> that does not contain redundant syntax such as two |
| 389 | * slashes before an empty authority (as in {@code file:///tmp/} ) or a |
| 390 | * colon following a host name but no port (as in |
| 391 | * {@code http://java.sun.com:} ), and that does not encode characters |
| 392 | * except those that must be quoted, the following identities also hold: |
| 393 | * <pre> |
| 394 | * new URI(<i>u</i>.getScheme(), |
| 395 | * <i>u</i>.getSchemeSpecificPart(), |
| 396 | * <i>u</i>.getFragment()) |
| 397 | * .equals(<i>u</i>)</pre> |
| 398 | * in all cases, |
| 399 | * <pre> |
| 400 | * new URI(<i>u</i>.getScheme(), |
| 401 | * <i>u</i>.getUserInfo(), <i>u</i>.getAuthority(), |
| 402 | * <i>u</i>.getPath(), <i>u</i>.getQuery(), |
| 403 | * <i>u</i>.getFragment()) |
| 404 | * .equals(<i>u</i>)</pre> |
| 405 | * if <i>u</i> is hierarchical, and |
| 406 | * <pre> |
| 407 | * new URI(<i>u</i>.getScheme(), |
| 408 | * <i>u</i>.getUserInfo(), <i>u</i>.getHost(), <i>u</i>.getPort(), |
| 409 | * <i>u</i>.getPath(), <i>u</i>.getQuery(), |
| 410 | * <i>u</i>.getFragment()) |
| 411 | * .equals(<i>u</i>)</pre> |
| 412 | * if <i>u</i> is hierarchical and has either no authority or a server-based |
| 413 | * authority. |
| 414 | * |
| 415 | * |
| 416 | * <h4> URIs, URLs, and URNs </h4> |
| 417 | * |
| 418 | * A URI is a uniform resource <i>identifier</i> while a URL is a uniform |
| 419 | * resource <i>locator</i>. Hence every URL is a URI, abstractly speaking, but |
| 420 | * not every URI is a URL. This is because there is another subcategory of |
| 421 | * URIs, uniform resource <i>names</i> (URNs), which name resources but do not |
| 422 | * specify how to locate them. The {@code mailto}, {@code news}, and |
| 423 | * {@code isbn} URIs shown above are examples of URNs. |
| 424 | * |
| 425 | * <p> The conceptual distinction between URIs and URLs is reflected in the |
| 426 | * differences between this class and the {@link URL} class. |
| 427 | * |
| 428 | * <p> An instance of this class represents a URI reference in the syntactic |
| 429 | * sense defined by RFC 2396. A URI may be either absolute or relative. |
| 430 | * A URI string is parsed according to the generic syntax without regard to the |
| 431 | * scheme, if any, that it specifies. No lookup of the host, if any, is |
| 432 | * performed, and no scheme-dependent stream handler is constructed. Equality, |
| 433 | * hashing, and comparison are defined strictly in terms of the character |
| 434 | * content of the instance. In other words, a URI instance is little more than |
| 435 | * a structured string that supports the syntactic, scheme-independent |
| 436 | * operations of comparison, normalization, resolution, and relativization. |
| 437 | * |
| 438 | * <p> An instance of the {@link URL} class, by contrast, represents the |
| 439 | * syntactic components of a URL together with some of the information required |
| 440 | * to access the resource that it describes. A URL must be absolute, that is, |
| 441 | * it must always specify a scheme. A URL string is parsed according to its |
| 442 | * scheme. A stream handler is always established for a URL, and in fact it is |
| 443 | * impossible to create a URL instance for a scheme for which no handler is |
| 444 | * available. Equality and hashing depend upon both the scheme and the |
| 445 | * Internet address of the host, if any; comparison is not defined. In other |
| 446 | * words, a URL is a structured string that supports the syntactic operation of |
| 447 | * resolution as well as the network I/O operations of looking up the host and |
| 448 | * opening a connection to the specified resource. |
| 449 | * |
| 450 | * |
| 451 | * @author Mark Reinhold |
| 452 | * @since 1.4 |
| 453 | * |
| 454 | * @see <a href="http://www.ietf.org/rfc/rfc2279.txt">RFC 2279: UTF-8, a transformation format of ISO 10646</a> |
| 455 | * @see <a href="http://www.ietf.org/rfc/rfc2373.txt">RFC 2373: IPv6 Addressing Architecture</a> |
| 456 | * @see <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396: Uniform Resource Identifiers (URI): Generic Syntax</a> |
| 457 | * @see <a href="http://www.ietf.org/rfc/rfc2732.txt">RFC 2732: Format for Literal IPv6 Addresses in URLs</a> |
| 458 | */ |
| 459 | |
| 460 | public final class URI |
| 461 | implements Comparable<URI>, Serializable |
| 462 | { |
| 463 | |
| 464 | // Note: Comments containing the word "ASSERT" indicate places where a |
| 465 | // throw of an InternalError should be replaced by an appropriate assertion |
| 466 | // statement once asserts are enabled in the build. |
| 467 | |
| 468 | static final long serialVersionUID = -6052424284110960213L; |
| 469 | |
| 470 | |
| 471 | // -- Properties and components of this instance -- |
| 472 | |
| 473 | // Components of all URIs: [<scheme>:]<scheme-specific-part>[#<fragment>] |
| 474 | private transient String scheme; // null ==> relative URI |
| 475 | private transient String fragment; |
| 476 | |
| 477 | // Hierarchical URI components: [//<authority>]<path>[?<query>] |
| 478 | private transient String authority; // Registry or server |
| 479 | |
| 480 | // Server-based authority: [<userInfo>@]<host>[:<port>] |
| 481 | private transient String userInfo; |
| 482 | private transient String host; // null ==> registry-based |
| 483 | private transient int port = -1; // -1 ==> undefined |
| 484 | |
| 485 | // Remaining components of hierarchical URIs |
| 486 | private transient String path; // null ==> opaque |
| 487 | private transient String query; |
| 488 | |
| 489 | // The remaining fields may be computed on demand |
| 490 | |
| 491 | private volatile transient String schemeSpecificPart; |
| 492 | private volatile transient int hash; // Zero ==> undefined |
| 493 | |
| 494 | private volatile transient String decodedUserInfo = null; |
| 495 | private volatile transient String decodedAuthority = null; |
| 496 | private volatile transient String decodedPath = null; |
| 497 | private volatile transient String decodedQuery = null; |
| 498 | private volatile transient String decodedFragment = null; |
| 499 | private volatile transient String decodedSchemeSpecificPart = null; |
| 500 | |
| 501 | /** |
| 502 | * The string form of this URI. |
| 503 | * |
| 504 | * @serial |
| 505 | */ |
| 506 | private volatile String string; // The only serializable field |
| 507 | |
| 508 | |
| 509 | |
| 510 | // -- Constructors and factories -- |
| 511 | |
| 512 | private URI() { } // Used internally; does nothing itself, so every component keeps its field default (null strings, port -1) |
| 513 | |
| 514 | /** |
| 515 | * Constructs a URI by parsing the given string. |
| 516 | * |
| 517 | * <p> This constructor parses the given string exactly as specified by the |
| 518 | * grammar in <a |
| 519 | * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, |
| 520 | * Appendix A, <b><i>except for the following deviations:</i></b> </p> |
| 521 | * |
| 522 | * <ul> |
| 523 | * |
| 524 | * <li><p> An empty authority component is permitted as long as it is |
| 525 | * followed by a non-empty path, a query component, or a fragment |
| 526 | * component. This allows the parsing of URIs such as |
| 527 | * {@code "file:///foo/bar"}, which seems to be the intent of |
| 528 | * RFC 2396 although the grammar does not permit it. If the |
| 529 | * authority component is empty then the user-information, host, and port |
| 530 | * components are undefined. </p></li> |
| 531 | * |
| 532 | * <li><p> Empty relative paths are permitted; this seems to be the |
| 533 | * intent of RFC 2396 although the grammar does not permit it. The |
| 534 | * primary consequence of this deviation is that a standalone fragment |
| 535 | * such as {@code "#foo"} parses as a relative URI with an empty path |
| 536 | * and the given fragment, and can be usefully <a |
| 537 | * href="#resolve-frag">resolved</a> against a base URI. </p></li> |
| 538 | * |
| 539 | * <li><p> IPv4 addresses in host components are parsed rigorously, as |
| 540 | * specified by <a |
| 541 | * href="http://www.ietf.org/rfc/rfc2732.txt">RFC 2732</a>: Each |
| 542 | * element of a dotted-quad address must contain no more than three |
| 543 | * decimal digits. Each element is further constrained to have a value |
| 544 | * no greater than 255. </p></li> |
| 545 | * |
| 546 | * <li> <p> Hostnames in host components that comprise only a single |
| 547 | * domain label are permitted to start with an <i>alphanum</i> |
| 548 | * character. This seems to be the intent of <a |
| 549 | * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a> |
| 550 | * section 3.2.2 although the grammar does not permit it. The |
| 551 | * consequence of this deviation is that the authority component of a |
| 552 | * hierarchical URI such as {@code s://123}, will parse as a server-based |
| 553 | * authority. </p></li> |
| 554 | * |
| 555 | * <li><p> IPv6 addresses are permitted for the host component. An IPv6 |
| 556 | * address must be enclosed in square brackets ({@code '['} and |
| 557 | * {@code ']'}) as specified by <a |
| 558 | * href="http://www.ietf.org/rfc/rfc2732.txt">RFC 2732</a>. The |
| 559 | * IPv6 address itself must parse according to <a |
| 560 | * href="http://www.ietf.org/rfc/rfc2373.txt">RFC 2373</a>. IPv6 |
| 561 | * addresses are further constrained to describe no more than sixteen |
| 562 | * bytes of address information, a constraint implicit in RFC 2373 |
| 563 | * but not expressible in the grammar. </p></li> |
| 564 | * |
| 565 | * <li><p> Characters in the <i>other</i> category are permitted wherever |
| 566 | * RFC 2396 permits <i>escaped</i> octets, that is, in the |
| 567 | * user-information, path, query, and fragment components, as well as in |
| 568 | * the authority component if the authority is registry-based. This |
| 569 | * allows URIs to contain Unicode characters beyond those in the US-ASCII |
| 570 | * character set. </p></li> |
| 571 | * |
| 572 | * </ul> |
| 573 | * |
| 574 | * @param str The string to be parsed into a URI |
| 575 | * |
| 576 | * @throws NullPointerException |
| 577 | * If {@code str} is {@code null} |
| 578 | * |
| 579 | * @throws URISyntaxException |
| 580 | * If the given string violates RFC 2396, as augmented |
| 581 | * by the above deviations |
| 582 | */ |
| 583 | public URI(String str) throws URISyntaxException { |
| 584 | (new Parser(str)).parse(false); // All parsing is delegated to Parser (declared outside this excerpt). NOTE(review): 'false' presumably means a server-based authority is not required -- confirm against Parser.parse |
| 585 | } |
| 586 | |
| 587 | /** |
| 588 | * Constructs a hierarchical URI from the given components. |
| 589 | * |
| 590 | * <p> If a scheme is given then the path, if also given, must either be |
| 591 | * empty or begin with a slash character ({@code '/'}). Otherwise a |
| 592 | * component of the new URI may be left undefined by passing {@code null} |
| 593 | * for the corresponding parameter or, in the case of the {@code port} |
| 594 | * parameter, by passing {@code -1}. |
| 595 | * |
| 596 | * <p> This constructor first builds a URI string from the given components |
| 597 | * according to the rules specified in <a |
| 598 | * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, |
| 599 | * section 5.2, step 7: </p> |
| 600 | * |
| 601 | * <ol> |
| 602 | * |
| 603 | * <li><p> Initially, the result string is empty. </p></li> |
| 604 | * |
| 605 | * <li><p> If a scheme is given then it is appended to the result, |
| 606 | * followed by a colon character ({@code ':'}). </p></li> |
| 607 | * |
| 608 | * <li><p> If user information, a host, or a port are given then the |
| 609 | * string {@code "//"} is appended. </p></li> |
| 610 | * |
| 611 | * <li><p> If user information is given then it is appended, followed by |
| 612 | * a commercial-at character ({@code '@'}). Any character not in the |
| 613 | * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i> |
| 614 | * categories is <a href="#quote">quoted</a>. </p></li> |
| 615 | * |
| 616 | * <li><p> If a host is given then it is appended. If the host is a |
| 617 | * literal IPv6 address but is not enclosed in square brackets |
| 618 | * ({@code '['} and {@code ']'}) then the square brackets are added. |
| 619 | * </p></li> |
| 620 | * |
| 621 | * <li><p> If a port number is given then a colon character |
| 622 | * ({@code ':'}) is appended, followed by the port number in decimal. |
| 623 | * </p></li> |
| 624 | * |
| 625 | * <li><p> If a path is given then it is appended. Any character not in |
| 626 | * the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i> |
| 627 | * categories, and not equal to the slash character ({@code '/'}) or the |
| 628 | * commercial-at character ({@code '@'}), is quoted. </p></li> |
| 629 | * |
| 630 | * <li><p> If a query is given then a question-mark character |
| 631 | * ({@code '?'}) is appended, followed by the query. Any character that |
| 632 | * is not a <a href="#legal-chars">legal URI character</a> is quoted. |
| 633 | * </p></li> |
| 634 | * |
| 635 | * <li><p> Finally, if a fragment is given then a hash character |
| 636 | * ({@code '#'}) is appended, followed by the fragment. Any character |
| 637 | * that is not a legal URI character is quoted. </p></li> |
| 638 | * |
| 639 | * </ol> |
| 640 | * |
| 641 | * <p> The resulting URI string is then parsed as if by invoking the {@link |
| 642 | * #URI(String)} constructor and then invoking the {@link |
| 643 | * #parseServerAuthority()} method upon the result; this may cause a {@link |
| 644 | * URISyntaxException} to be thrown. </p> |
| 645 | * |
| 646 | * @param scheme Scheme name |
| 647 | * @param userInfo User name and authorization information |
| 648 | * @param host Host name |
| 649 | * @param port Port number |
| 650 | * @param path Path |
| 651 | * @param query Query |
| 652 | * @param fragment Fragment |
| 653 | * |
| 654 | * @throws URISyntaxException |
| 655 | * If both a scheme and a path are given but the path is relative, |
| 656 | * if the URI string constructed from the given components violates |
| 657 | * RFC 2396, or if the authority component of the string is |
| 658 | * present but cannot be parsed as a server-based authority |
| 659 | */ |
| 660 | public URI(String scheme, |
| 661 | String userInfo, String host, int port, |
| 662 | String path, String query, String fragment) |
| 663 | throws URISyntaxException |
| 664 | { |
| 665 | String s = toString(scheme, null, |
| 666 | null, userInfo, host, port, |
| 667 | path, query, fragment); |
| 668 | checkPath(s, scheme, path); |
| 669 | new Parser(s).parse(true); |
| 670 | } |
| 671 | |
| 672 | /** |
| 673 | * Constructs a hierarchical URI from the given components. |
| 674 | * |
| 675 | * <p> If a scheme is given then the path, if also given, must either be |
| 676 | * empty or begin with a slash character ({@code '/'}). Otherwise a |
| 677 | * component of the new URI may be left undefined by passing {@code null} |
| 678 | * for the corresponding parameter. |
| 679 | * |
| 680 | * <p> This constructor first builds a URI string from the given components |
| 681 | * according to the rules specified in <a |
| 682 | * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, |
| 683 | * section 5.2, step 7: </p> |
| 684 | * |
| 685 | * <ol> |
| 686 | * |
| 687 | * <li><p> Initially, the result string is empty. </p></li> |
| 688 | * |
| 689 | * <li><p> If a scheme is given then it is appended to the result, |
| 690 | * followed by a colon character ({@code ':'}). </p></li> |
| 691 | * |
| 692 | * <li><p> If an authority is given then the string {@code "//"} is |
| 693 | * appended, followed by the authority. If the authority contains a |
| 694 | * literal IPv6 address then the address must be enclosed in square |
| 695 | * brackets ({@code '['} and {@code ']'}). Any character not in the |
| 696 | * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i> |
| 697 | * categories, and not equal to the commercial-at character |
| 698 | * ({@code '@'}), is <a href="#quote">quoted</a>. </p></li> |
| 699 | * |
| 700 | * <li><p> If a path is given then it is appended. Any character not in |
| 701 | * the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i> |
| 702 | * categories, and not equal to the slash character ({@code '/'}) or the |
| 703 | * commercial-at character ({@code '@'}), is quoted. </p></li> |
| 704 | * |
| 705 | * <li><p> If a query is given then a question-mark character |
| 706 | * ({@code '?'}) is appended, followed by the query. Any character that |
| 707 | * is not a <a href="#legal-chars">legal URI character</a> is quoted. |
| 708 | * </p></li> |
| 709 | * |
| 710 | * <li><p> Finally, if a fragment is given then a hash character |
| 711 | * ({@code '#'}) is appended, followed by the fragment. Any character |
| 712 | * that is not a legal URI character is quoted. </p></li> |
| 713 | * |
| 714 | * </ol> |
| 715 | * |
| 716 | * <p> The resulting URI string is then parsed as if by invoking the {@link |
| 717 | * #URI(String)} constructor and then invoking the {@link |
| 718 | * #parseServerAuthority()} method upon the result; this may cause a {@link |
| 719 | * URISyntaxException} to be thrown. </p> |
| 720 | * |
| 721 | * @param scheme Scheme name |
| 722 | * @param authority Authority |
| 723 | * @param path Path |
| 724 | * @param query Query |
| 725 | * @param fragment Fragment |
| 726 | * |
| 727 | * @throws URISyntaxException |
| 728 | * If both a scheme and a path are given but the path is relative, |
| 729 | * if the URI string constructed from the given components violates |
| 730 | * RFC 2396, or if the authority component of the string is |
| 731 | * present but cannot be parsed as a server-based authority |
| 732 | */ |
| 733 | public URI(String scheme, |
| 734 | String authority, |
| 735 | String path, String query, String fragment) |
| 736 | throws URISyntaxException |
| 737 | { |
| 738 | String s = toString(scheme, null, |
| 739 | authority, null, null, -1, |
| 740 | path, query, fragment); |
| 741 | checkPath(s, scheme, path); |
| 742 | new Parser(s).parse(false); |
| 743 | } |
| 744 | |
| 745 | /** |
| 746 | * Constructs a hierarchical URI from the given components. |
| 747 | * |
| 748 | * <p> A component may be left undefined by passing {@code null}. |
| 749 | * |
| 750 | * <p> This convenience constructor works as if by invoking the |
| 751 | * seven-argument constructor as follows: |
| 752 | * |
| 753 | * <blockquote> |
| 754 | * {@code new} {@link #URI(String, String, String, int, String, String, String) |
| 755 | * URI}{@code (scheme, null, host, -1, path, null, fragment);} |
| 756 | * </blockquote> |
| 757 | * |
| 758 | * @param scheme Scheme name |
| 759 | * @param host Host name |
| 760 | * @param path Path |
| 761 | * @param fragment Fragment |
| 762 | * |
| 763 | * @throws URISyntaxException |
| 764 | * If the URI string constructed from the given components |
| 765 | * violates RFC 2396 |
| 766 | */ |
public URI(String scheme, String host, String path, String fragment)
    throws URISyntaxException
{
    // Delegate to the seven-argument constructor with no user
    // information, no port (-1) and no query.
    this(scheme,
         null, host, -1,
         path, null, fragment);
}
| 772 | |
| 773 | /** |
| 774 | * Constructs a URI from the given components. |
| 775 | * |
| 776 | * <p> A component may be left undefined by passing {@code null}. |
| 777 | * |
| 778 | * <p> This constructor first builds a URI in string form using the given |
| 779 | * components as follows: </p> |
| 780 | * |
| 781 | * <ol> |
| 782 | * |
| 783 | * <li><p> Initially, the result string is empty. </p></li> |
| 784 | * |
| 785 | * <li><p> If a scheme is given then it is appended to the result, |
| 786 | * followed by a colon character ({@code ':'}). </p></li> |
| 787 | * |
| 788 | * <li><p> If a scheme-specific part is given then it is appended. Any |
| 789 | * character that is not a <a href="#legal-chars">legal URI character</a> |
| 790 | * is <a href="#quote">quoted</a>. </p></li> |
| 791 | * |
| 792 | * <li><p> Finally, if a fragment is given then a hash character |
| 793 | * ({@code '#'}) is appended to the string, followed by the fragment. |
| 794 | * Any character that is not a legal URI character is quoted. </p></li> |
| 795 | * |
| 796 | * </ol> |
| 797 | * |
| 798 | * <p> The resulting URI string is then parsed in order to create the new |
| 799 | * URI instance as if by invoking the {@link #URI(String)} constructor; |
| 800 | * this may cause a {@link URISyntaxException} to be thrown. </p> |
| 801 | * |
| 802 | * @param scheme Scheme name |
| 803 | * @param ssp Scheme-specific part |
| 804 | * @param fragment Fragment |
| 805 | * |
| 806 | * @throws URISyntaxException |
| 807 | * If the URI string constructed from the given components |
| 808 | * violates RFC 2396 |
| 809 | */ |
| 810 | public URI(String scheme, String ssp, String fragment) |
| 811 | throws URISyntaxException |
| 812 | { |
| 813 | new Parser(toString(scheme, ssp, |
| 814 | null, null, null, -1, |
| 815 | null, null, fragment)) |
| 816 | .parse(false); |
| 817 | } |
| 818 | |
| 819 | /** |
| 820 | * Creates a URI by parsing the given string. |
| 821 | * |
| 822 | * <p> This convenience factory method works as if by invoking the {@link |
| 823 | * #URI(String)} constructor; any {@link URISyntaxException} thrown by the |
| 824 | * constructor is caught and wrapped in a new {@link |
| 825 | * IllegalArgumentException} object, which is then thrown. |
| 826 | * |
| 827 | * <p> This method is provided for use in situations where it is known that |
| 828 | * the given string is a legal URI, for example for URI constants declared |
| 829 | * within a program, and so it would be considered a programming error |
| 830 | * for the string not to parse as such. The constructors, which throw |
| 831 | * {@link URISyntaxException} directly, should be used in situations where a |
| 832 | * URI is being constructed from user input or from some other source that |
| 833 | * may be prone to errors. </p> |
| 834 | * |
| 835 | * @param str The string to be parsed into a URI |
| 836 | * @return The new URI |
| 837 | * |
| 838 | * @throws NullPointerException |
| 839 | * If {@code str} is {@code null} |
| 840 | * |
| 841 | * @throws IllegalArgumentException |
| 842 | * If the given string violates RFC 2396 |
| 843 | */ |
| 844 | public static URI create(String str) { |
| 845 | try { |
| 846 | return new URI(str); |
| 847 | } catch (URISyntaxException x) { |
| 848 | throw new IllegalArgumentException(x.getMessage(), x); |
| 849 | } |
| 850 | } |
| 851 | |
| 852 | |
| 853 | // -- Operations -- |
| 854 | |
| 855 | /** |
| 856 | * Attempts to parse this URI's authority component, if defined, into |
| 857 | * user-information, host, and port components. |
| 858 | * |
| 859 | * <p> If this URI's authority component has already been recognized as |
| 860 | * being server-based then it will already have been parsed into |
| 861 | * user-information, host, and port components. In this case, or if this |
| 862 | * URI has no authority component, this method simply returns this URI. |
| 863 | * |
| 864 | * <p> Otherwise this method attempts once more to parse the authority |
| 865 | * component into user-information, host, and port components, and throws |
| 866 | * an exception describing why the authority component could not be parsed |
| 867 | * in that way. |
| 868 | * |
| 869 | * <p> This method is provided because the generic URI syntax specified in |
| 870 | * <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a> |
| 871 | * cannot always distinguish a malformed server-based authority from a |
| 872 | * legitimate registry-based authority. It must therefore treat some |
| 873 | * instances of the former as instances of the latter. The authority |
| 874 | * component in the URI string {@code "//foo:bar"}, for example, is not a |
| 875 | * legal server-based authority but it is legal as a registry-based |
| 876 | * authority. |
| 877 | * |
| 878 | * <p> In many common situations, for example when working with URIs that are |
| 879 | * known to be either URNs or URLs, the hierarchical URIs being used will |
| 880 | * always be server-based. They therefore must either be parsed as such or |
| 881 | * treated as an error. In these cases a statement such as |
| 882 | * |
| 883 | * <blockquote> |
| 884 | * {@code URI }<i>u</i>{@code = new URI(str).parseServerAuthority();} |
| 885 | * </blockquote> |
| 886 | * |
| 887 | * <p> can be used to ensure that <i>u</i> always refers to a URI that, if |
| 888 | * it has an authority component, has a server-based authority with proper |
| 889 | * user-information, host, and port components. Invoking this method also |
| 890 | * ensures that if the authority could not be parsed in that way then an |
| 891 | * appropriate diagnostic message can be issued based upon the exception |
| 892 | * that is thrown. </p> |
| 893 | * |
| 894 | * @return A URI whose authority field has been parsed |
| 895 | * as a server-based authority |
| 896 | * |
| 897 | * @throws URISyntaxException |
| 898 | * If the authority component of this URI is defined |
| 899 | * but cannot be parsed as a server-based authority |
| 900 | * according to RFC 2396 |
| 901 | */ |
| 902 | public URI parseServerAuthority() |
| 903 | throws URISyntaxException |
| 904 | { |
| 905 | // We could be clever and cache the error message and index from the |
| 906 | // exception thrown during the original parse, but that would require |
| 907 | // either more fields or a more-obscure representation. |
| 908 | if ((host != null) || (authority == null)) |
| 909 | return this; |
| 910 | defineString(); |
| 911 | new Parser(string).parse(true); |
| 912 | return this; |
| 913 | } |
| 914 | |
| 915 | /** |
| 916 | * Normalizes this URI's path. |
| 917 | * |
| 918 | * <p> If this URI is opaque, or if its path is already in normal form, |
| 919 | * then this URI is returned. Otherwise a new URI is constructed that is |
| 920 | * identical to this URI except that its path is computed by normalizing |
| 921 | * this URI's path in a manner consistent with <a |
| 922 | * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, |
| 923 | * section 5.2, step 6, sub-steps c through f; that is: |
| 924 | * </p> |
| 925 | * |
| 926 | * <ol> |
| 927 | * |
| 928 | * <li><p> All {@code "."} segments are removed. </p></li> |
| 929 | * |
| 930 | * <li><p> If a {@code ".."} segment is preceded by a non-{@code ".."} |
| 931 | * segment then both of these segments are removed. This step is |
| 932 | * repeated until it is no longer applicable. </p></li> |
| 933 | * |
| 934 | * <li><p> If the path is relative, and if its first segment contains a |
| 935 | * colon character ({@code ':'}), then a {@code "."} segment is |
| 936 | * prepended. This prevents a relative URI with a path such as |
| 937 | * {@code "a:b/c/d"} from later being re-parsed as an opaque URI with a |
| 938 | * scheme of {@code "a"} and a scheme-specific part of {@code "b/c/d"}. |
| 939 | * <b><i>(Deviation from RFC 2396)</i></b> </p></li> |
| 940 | * |
| 941 | * </ol> |
| 942 | * |
| 943 | * <p> A normalized path will begin with one or more {@code ".."} segments |
| 944 | * if there were insufficient non-{@code ".."} segments preceding them to |
| 945 | * allow their removal. A normalized path will begin with a {@code "."} |
| 946 | * segment if one was inserted by step 3 above. Otherwise, a normalized |
| 947 | * path will not contain any {@code "."} or {@code ".."} segments. </p> |
| 948 | * |
| 949 | * @return A URI equivalent to this URI, |
| 950 | * but whose path is in normal form |
| 951 | */ |
| 952 | public URI normalize() { |
| 953 | return normalize(this); |
| 954 | } |
| 955 | |
| 956 | /** |
| 957 | * Resolves the given URI against this URI. |
| 958 | * |
| 959 | * <p> If the given URI is already absolute, or if this URI is opaque, then |
| 960 | * the given URI is returned. |
| 961 | * |
| 962 | * <p><a name="resolve-frag"></a> If the given URI's fragment component is |
| 963 | * defined, its path component is empty, and its scheme, authority, and |
| 964 | * query components are undefined, then a URI with the given fragment but |
| 965 | * with all other components equal to those of this URI is returned. This |
| 966 | * allows a URI representing a standalone fragment reference, such as |
| 967 | * {@code "#foo"}, to be usefully resolved against a base URI. |
| 968 | * |
| 969 | * <p> Otherwise this method constructs a new hierarchical URI in a manner |
| 970 | * consistent with <a |
| 971 | * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, |
| 972 | * section 5.2; that is: </p> |
| 973 | * |
| 974 | * <ol> |
| 975 | * |
| 976 | * <li><p> A new URI is constructed with this URI's scheme and the given |
| 977 | * URI's query and fragment components. </p></li> |
| 978 | * |
| 979 | * <li><p> If the given URI has an authority component then the new URI's |
| 980 | * authority and path are taken from the given URI. </p></li> |
| 981 | * |
| 982 | * <li><p> Otherwise the new URI's authority component is copied from |
| 983 | * this URI, and its path is computed as follows: </p> |
| 984 | * |
| 985 | * <ol> |
| 986 | * |
| 987 | * <li><p> If the given URI's path is absolute then the new URI's path |
| 988 | * is taken from the given URI. </p></li> |
| 989 | * |
| 990 | * <li><p> Otherwise the given URI's path is relative, and so the new |
| 991 | * URI's path is computed by resolving the path of the given URI |
| 992 | * against the path of this URI. This is done by concatenating all but |
| 993 | * the last segment of this URI's path, if any, with the given URI's |
| 994 | * path and then normalizing the result as if by invoking the {@link |
| 995 | * #normalize() normalize} method. </p></li> |
| 996 | * |
| 997 | * </ol></li> |
| 998 | * |
| 999 | * </ol> |
| 1000 | * |
| 1001 | * <p> The result of this method is absolute if, and only if, either this |
| 1002 | * URI is absolute or the given URI is absolute. </p> |
| 1003 | * |
| 1004 | * @param uri The URI to be resolved against this URI |
| 1005 | * @return The resulting URI |
| 1006 | * |
| 1007 | * @throws NullPointerException |
| 1008 | * If {@code uri} is {@code null} |
| 1009 | */ |
| 1010 | public URI resolve(URI uri) { |
| 1011 | return resolve(this, uri); |
| 1012 | } |
| 1013 | |
| 1014 | /** |
| 1015 | * Constructs a new URI by parsing the given string and then resolving it |
| 1016 | * against this URI. |
| 1017 | * |
| 1018 | * <p> This convenience method works as if invoking it were equivalent to |
| 1019 | * evaluating the expression {@link #resolve(java.net.URI) |
| 1020 | * resolve}{@code (URI.}{@link #create(String) create}{@code (str))}. </p> |
| 1021 | * |
| 1022 | * @param str The string to be parsed into a URI |
| 1023 | * @return The resulting URI |
| 1024 | * |
| 1025 | * @throws NullPointerException |
| 1026 | * If {@code str} is {@code null} |
| 1027 | * |
| 1028 | * @throws IllegalArgumentException |
| 1029 | * If the given string violates RFC 2396 |
| 1030 | */ |
| 1031 | public URI resolve(String str) { |
| 1032 | return resolve(URI.create(str)); |
| 1033 | } |
| 1034 | |
| 1035 | /** |
| 1036 | * Relativizes the given URI against this URI. |
| 1037 | * |
| 1038 | * <p> The relativization of the given URI against this URI is computed as |
| 1039 | * follows: </p> |
| 1040 | * |
| 1041 | * <ol> |
| 1042 | * |
| 1043 | * <li><p> If either this URI or the given URI is opaque, or if the |
| 1044 | * scheme and authority components of the two URIs are not identical, or |
| 1045 | * if the path of this URI is not a prefix of the path of the given URI, |
| 1046 | * then the given URI is returned. </p></li> |
| 1047 | * |
| 1048 | * <li><p> Otherwise a new relative hierarchical URI is constructed with |
| 1049 | * query and fragment components taken from the given URI and with a path |
| 1050 | * component computed by removing this URI's path from the beginning of |
| 1051 | * the given URI's path. </p></li> |
| 1052 | * |
| 1053 | * </ol> |
| 1054 | * |
| 1055 | * @param uri The URI to be relativized against this URI |
| 1056 | * @return The resulting URI |
| 1057 | * |
| 1058 | * @throws NullPointerException |
| 1059 | * If {@code uri} is {@code null} |
| 1060 | */ |
| 1061 | public URI relativize(URI uri) { |
| 1062 | return relativize(this, uri); |
| 1063 | } |
| 1064 | |
| 1065 | /** |
| 1066 | * Constructs a URL from this URI. |
| 1067 | * |
| 1068 | * <p> This convenience method works as if invoking it were equivalent to |
| 1069 | * evaluating the expression {@code new URL(this.toString())} after |
| 1070 | * first checking that this URI is absolute. </p> |
| 1071 | * |
| 1072 | * @return A URL constructed from this URI |
| 1073 | * |
| 1074 | * @throws IllegalArgumentException |
| 1075 | * If this URI is not absolute |
| 1076 | * |
| 1077 | * @throws MalformedURLException |
| 1078 | * If a protocol handler for the URL could not be found, |
| 1079 | * or if some other error occurred while constructing the URL |
| 1080 | */ |
| 1081 | public URL toURL() |
| 1082 | throws MalformedURLException { |
| 1083 | if (!isAbsolute()) |
| 1084 | throw new IllegalArgumentException("URI is not absolute"); |
| 1085 | return new URL(toString()); |
| 1086 | } |
| 1087 | |
| 1088 | // -- Component access methods -- |
| 1089 | |
| 1090 | /** |
| 1091 | * Returns the scheme component of this URI. |
| 1092 | * |
| 1093 | * <p> The scheme component of a URI, if defined, only contains characters |
| 1094 | * in the <i>alphanum</i> category and in the string {@code "-.+"}. A |
| 1095 | * scheme always starts with an <i>alpha</i> character. <p> |
| 1096 | * |
| 1097 | * The scheme component of a URI cannot contain escaped octets, hence this |
| 1098 | * method does not perform any decoding. |
| 1099 | * |
| 1100 | * @return The scheme component of this URI, |
| 1101 | * or {@code null} if the scheme is undefined |
| 1102 | */ |
| 1103 | public String getScheme() { |
| 1104 | return scheme; |
| 1105 | } |
| 1106 | |
| 1107 | /** |
| 1108 | * Tells whether or not this URI is absolute. |
| 1109 | * |
| 1110 | * <p> A URI is absolute if, and only if, it has a scheme component. </p> |
| 1111 | * |
| 1112 | * @return {@code true} if, and only if, this URI is absolute |
| 1113 | */ |
| 1114 | public boolean isAbsolute() { |
| 1115 | return scheme != null; |
| 1116 | } |
| 1117 | |
| 1118 | /** |
| 1119 | * Tells whether or not this URI is opaque. |
| 1120 | * |
| 1121 | * <p> A URI is opaque if, and only if, it is absolute and its |
| 1122 | * scheme-specific part does not begin with a slash character ('/'). |
| 1123 | * An opaque URI has a scheme, a scheme-specific part, and possibly |
| 1124 | * a fragment; all other components are undefined. </p> |
| 1125 | * |
| 1126 | * @return {@code true} if, and only if, this URI is opaque |
| 1127 | */ |
| 1128 | public boolean isOpaque() { |
| 1129 | return path == null; |
| 1130 | } |
| 1131 | |
| 1132 | /** |
| 1133 | * Returns the raw scheme-specific part of this URI. The scheme-specific |
| 1134 | * part is never undefined, though it may be empty. |
| 1135 | * |
| 1136 | * <p> The scheme-specific part of a URI only contains legal URI |
| 1137 | * characters. </p> |
| 1138 | * |
| 1139 | * @return The raw scheme-specific part of this URI |
| 1140 | * (never {@code null}) |
| 1141 | */ |
| 1142 | public String getRawSchemeSpecificPart() { |
| 1143 | defineSchemeSpecificPart(); |
| 1144 | return schemeSpecificPart; |
| 1145 | } |
| 1146 | |
| 1147 | /** |
| 1148 | * Returns the decoded scheme-specific part of this URI. |
| 1149 | * |
| 1150 | * <p> The string returned by this method is equal to that returned by the |
| 1151 | * {@link #getRawSchemeSpecificPart() getRawSchemeSpecificPart} method |
| 1152 | * except that all sequences of escaped octets are <a |
| 1153 | * href="#decode">decoded</a>. </p> |
| 1154 | * |
| 1155 | * @return The decoded scheme-specific part of this URI |
| 1156 | * (never {@code null}) |
| 1157 | */ |
| 1158 | public String getSchemeSpecificPart() { |
| 1159 | if (decodedSchemeSpecificPart == null) |
| 1160 | decodedSchemeSpecificPart = decode(getRawSchemeSpecificPart()); |
| 1161 | return decodedSchemeSpecificPart; |
| 1162 | } |
| 1163 | |
| 1164 | /** |
| 1165 | * Returns the raw authority component of this URI. |
| 1166 | * |
| 1167 | * <p> The authority component of a URI, if defined, only contains the |
| 1168 | * commercial-at character ({@code '@'}) and characters in the |
| 1169 | * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and <i>other</i> |
| 1170 | * categories. If the authority is server-based then it is further |
| 1171 | * constrained to have valid user-information, host, and port |
| 1172 | * components. </p> |
| 1173 | * |
| 1174 | * @return The raw authority component of this URI, |
| 1175 | * or {@code null} if the authority is undefined |
| 1176 | */ |
| 1177 | public String getRawAuthority() { |
| 1178 | return authority; |
| 1179 | } |
| 1180 | |
| 1181 | /** |
| 1182 | * Returns the decoded authority component of this URI. |
| 1183 | * |
| 1184 | * <p> The string returned by this method is equal to that returned by the |
| 1185 | * {@link #getRawAuthority() getRawAuthority} method except that all |
| 1186 | * sequences of escaped octets are <a href="#decode">decoded</a>. </p> |
| 1187 | * |
| 1188 | * @return The decoded authority component of this URI, |
| 1189 | * or {@code null} if the authority is undefined |
| 1190 | */ |
| 1191 | public String getAuthority() { |
| 1192 | if (decodedAuthority == null) |
| 1193 | decodedAuthority = decode(authority); |
| 1194 | return decodedAuthority; |
| 1195 | } |
| 1196 | |
| 1197 | /** |
| 1198 | * Returns the raw user-information component of this URI. |
| 1199 | * |
| 1200 | * <p> The user-information component of a URI, if defined, only contains |
| 1201 | * characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and |
| 1202 | * <i>other</i> categories. </p> |
| 1203 | * |
| 1204 | * @return The raw user-information component of this URI, |
| 1205 | * or {@code null} if the user information is undefined |
| 1206 | */ |
| 1207 | public String getRawUserInfo() { |
| 1208 | return userInfo; |
| 1209 | } |
| 1210 | |
| 1211 | /** |
| 1212 | * Returns the decoded user-information component of this URI. |
| 1213 | * |
| 1214 | * <p> The string returned by this method is equal to that returned by the |
| 1215 | * {@link #getRawUserInfo() getRawUserInfo} method except that all |
| 1216 | * sequences of escaped octets are <a href="#decode">decoded</a>. </p> |
| 1217 | * |
| 1218 | * @return The decoded user-information component of this URI, |
| 1219 | * or {@code null} if the user information is undefined |
| 1220 | */ |
| 1221 | public String getUserInfo() { |
| 1222 | if ((decodedUserInfo == null) && (userInfo != null)) |
| 1223 | decodedUserInfo = decode(userInfo); |
| 1224 | return decodedUserInfo; |
| 1225 | } |
| 1226 | |
| 1227 | /** |
| 1228 | * Returns the host component of this URI. |
| 1229 | * |
| 1230 | * <p> The host component of a URI, if defined, will have one of the |
| 1231 | * following forms: </p> |
| 1232 | * |
| 1233 | * <ul> |
| 1234 | * |
| 1235 | * <li><p> A domain name consisting of one or more <i>labels</i> |
| 1236 | * separated by period characters ({@code '.'}), optionally followed by |
| 1237 | * a period character. Each label consists of <i>alphanum</i> characters |
| 1238 | * as well as hyphen characters ({@code '-'}), though hyphens never |
| 1239 | * occur as the first or last characters in a label. The rightmost |
| 1240 | * label of a domain name consisting of two or more labels, begins |
| 1241 | * with an <i>alpha</i> character. </li> |
| 1242 | * |
| 1243 | * <li><p> A dotted-quad IPv4 address of the form |
| 1244 | * <i>digit</i>{@code +.}<i>digit</i>{@code +.}<i>digit</i>{@code +.}<i>digit</i>{@code +}, |
| 1245 | * where no <i>digit</i> sequence is longer than three characters and no |
| 1246 | * sequence has a value larger than 255. </p></li> |
| 1247 | * |
| 1248 | * <li><p> An IPv6 address enclosed in square brackets ({@code '['} and |
| 1249 | * {@code ']'}) and consisting of hexadecimal digits, colon characters |
| 1250 | * ({@code ':'}), and possibly an embedded IPv4 address. The full |
| 1251 | * syntax of IPv6 addresses is specified in <a |
| 1252 | * href="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC 2373: IPv6 |
| 1253 | * Addressing Architecture</i></a>. </p></li> |
| 1254 | * |
| 1255 | * </ul> |
| 1256 | * |
| 1257 | * The host component of a URI cannot contain escaped octets, hence this |
| 1258 | * method does not perform any decoding. |
| 1259 | * |
| 1260 | * @return The host component of this URI, |
| 1261 | * or {@code null} if the host is undefined |
| 1262 | */ |
| 1263 | public String getHost() { |
| 1264 | return host; |
| 1265 | } |
| 1266 | |
| 1267 | /** |
| 1268 | * Returns the port number of this URI. |
| 1269 | * |
| 1270 | * <p> The port component of a URI, if defined, is a non-negative |
| 1271 | * integer. </p> |
| 1272 | * |
| 1273 | * @return The port component of this URI, |
| 1274 | * or {@code -1} if the port is undefined |
| 1275 | */ |
| 1276 | public int getPort() { |
| 1277 | return port; |
| 1278 | } |
| 1279 | |
| 1280 | /** |
| 1281 | * Returns the raw path component of this URI. |
| 1282 | * |
| 1283 | * <p> The path component of a URI, if defined, only contains the slash |
| 1284 | * character ({@code '/'}), the commercial-at character ({@code '@'}), |
| 1285 | * and characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, |
| 1286 | * and <i>other</i> categories. </p> |
| 1287 | * |
| 1288 | * @return The path component of this URI, |
| 1289 | * or {@code null} if the path is undefined |
| 1290 | */ |
| 1291 | public String getRawPath() { |
| 1292 | return path; |
| 1293 | } |
| 1294 | |
| 1295 | /** |
| 1296 | * Returns the decoded path component of this URI. |
| 1297 | * |
| 1298 | * <p> The string returned by this method is equal to that returned by the |
| 1299 | * {@link #getRawPath() getRawPath} method except that all sequences of |
| 1300 | * escaped octets are <a href="#decode">decoded</a>. </p> |
| 1301 | * |
| 1302 | * @return The decoded path component of this URI, |
| 1303 | * or {@code null} if the path is undefined |
| 1304 | */ |
| 1305 | public String getPath() { |
| 1306 | if ((decodedPath == null) && (path != null)) |
| 1307 | decodedPath = decode(path); |
| 1308 | return decodedPath; |
| 1309 | } |
| 1310 | |
| 1311 | /** |
| 1312 | * Returns the raw query component of this URI. |
| 1313 | * |
| 1314 | * <p> The query component of a URI, if defined, only contains legal URI |
| 1315 | * characters. </p> |
| 1316 | * |
| 1317 | * @return The raw query component of this URI, |
| 1318 | * or {@code null} if the query is undefined |
| 1319 | */ |
| 1320 | public String getRawQuery() { |
| 1321 | return query; |
| 1322 | } |
| 1323 | |
| 1324 | /** |
| 1325 | * Returns the decoded query component of this URI. |
| 1326 | * |
| 1327 | * <p> The string returned by this method is equal to that returned by the |
| 1328 | * {@link #getRawQuery() getRawQuery} method except that all sequences of |
| 1329 | * escaped octets are <a href="#decode">decoded</a>. </p> |
| 1330 | * |
| 1331 | * @return The decoded query component of this URI, |
| 1332 | * or {@code null} if the query is undefined |
| 1333 | */ |
| 1334 | public String getQuery() { |
| 1335 | if ((decodedQuery == null) && (query != null)) |
| 1336 | decodedQuery = decode(query); |
| 1337 | return decodedQuery; |
| 1338 | } |
| 1339 | |
| 1340 | /** |
| 1341 | * Returns the raw fragment component of this URI. |
| 1342 | * |
| 1343 | * <p> The fragment component of a URI, if defined, only contains legal URI |
| 1344 | * characters. </p> |
| 1345 | * |
| 1346 | * @return The raw fragment component of this URI, |
| 1347 | * or {@code null} if the fragment is undefined |
| 1348 | */ |
| 1349 | public String getRawFragment() { |
| 1350 | return fragment; |
| 1351 | } |
| 1352 | |
| 1353 | /** |
| 1354 | * Returns the decoded fragment component of this URI. |
| 1355 | * |
| 1356 | * <p> The string returned by this method is equal to that returned by the |
| 1357 | * {@link #getRawFragment() getRawFragment} method except that all |
| 1358 | * sequences of escaped octets are <a href="#decode">decoded</a>. </p> |
| 1359 | * |
| 1360 | * @return The decoded fragment component of this URI, |
| 1361 | * or {@code null} if the fragment is undefined |
| 1362 | */ |
| 1363 | public String getFragment() { |
| 1364 | if ((decodedFragment == null) && (fragment != null)) |
| 1365 | decodedFragment = decode(fragment); |
| 1366 | return decodedFragment; |
| 1367 | } |
| 1368 | |
| 1369 | |
| 1370 | // -- Equality, comparison, hash code, toString, and serialization -- |
| 1371 | |
| 1372 | /** |
| 1373 | * Tests this URI for equality with another object. |
| 1374 | * |
| 1375 | * <p> If the given object is not a URI then this method immediately |
| 1376 | * returns {@code false}. |
| 1377 | * |
| 1378 | * <p> For two URIs to be considered equal requires that either both are |
| 1379 | * opaque or both are hierarchical. Their schemes must either both be |
| 1380 | * undefined or else be equal without regard to case. Their fragments |
| 1381 | * must either both be undefined or else be equal. |
| 1382 | * |
| 1383 | * <p> For two opaque URIs to be considered equal, their scheme-specific |
| 1384 | * parts must be equal. |
| 1385 | * |
| 1386 | * <p> For two hierarchical URIs to be considered equal, their paths must |
| 1387 | * be equal and their queries must either both be undefined or else be |
| 1388 | * equal. Their authorities must either both be undefined, or both be |
| 1389 | * registry-based, or both be server-based. If their authorities are |
| 1390 | * defined and are registry-based, then they must be equal. If their |
| 1391 | * authorities are defined and are server-based, then their hosts must be |
| 1392 | * equal without regard to case, their port numbers must be equal, and |
| 1393 | * their user-information components must be equal. |
| 1394 | * |
| 1395 | * <p> When testing the user-information, path, query, fragment, authority, |
| 1396 | * or scheme-specific parts of two URIs for equality, the raw forms rather |
| 1397 | * than the encoded forms of these components are compared and the |
| 1398 | * hexadecimal digits of escaped octets are compared without regard to |
| 1399 | * case. |
| 1400 | * |
| 1401 | * <p> This method satisfies the general contract of the {@link |
| 1402 | * java.lang.Object#equals(Object) Object.equals} method. </p> |
| 1403 | * |
| 1404 | * @param ob The object to which this object is to be compared |
| 1405 | * |
| 1406 | * @return {@code true} if, and only if, the given object is a URI that |
| 1407 | * is identical to this URI |
| 1408 | */ |
| 1409 | public boolean equals(Object ob) { |
| 1410 | if (ob == this) |
| 1411 | return true; |
| 1412 | if (!(ob instanceof URI)) |
| 1413 | return false; |
| 1414 | URI that = (URI)ob; |
| 1415 | if (this.isOpaque() != that.isOpaque()) return false; |
| 1416 | if (!equalIgnoringCase(this.scheme, that.scheme)) return false; |
| 1417 | if (!equal(this.fragment, that.fragment)) return false; |
| 1418 | |
| 1419 | // Opaque |
| 1420 | if (this.isOpaque()) |
| 1421 | return equal(this.schemeSpecificPart, that.schemeSpecificPart); |
| 1422 | |
| 1423 | // Hierarchical |
| 1424 | if (!equal(this.path, that.path)) return false; |
| 1425 | if (!equal(this.query, that.query)) return false; |
| 1426 | |
| 1427 | // Authorities |
| 1428 | if (this.authority == that.authority) return true; |
| 1429 | if (this.host != null) { |
| 1430 | // Server-based |
| 1431 | if (!equal(this.userInfo, that.userInfo)) return false; |
| 1432 | if (!equalIgnoringCase(this.host, that.host)) return false; |
| 1433 | if (this.port != that.port) return false; |
| 1434 | } else if (this.authority != null) { |
| 1435 | // Registry-based |
| 1436 | if (!equal(this.authority, that.authority)) return false; |
| 1437 | } else if (this.authority != that.authority) { |
| 1438 | return false; |
| 1439 | } |
| 1440 | |
| 1441 | return true; |
| 1442 | } |
| 1443 | |
| 1444 | /** |
| 1445 | * Returns a hash-code value for this URI. The hash code is based upon all |
| 1446 | * of the URI's components, and satisfies the general contract of the |
| 1447 | * {@link java.lang.Object#hashCode() Object.hashCode} method. |
| 1448 | * |
| 1449 | * @return A hash-code value for this URI |
| 1450 | */ |
| 1451 | public int hashCode() { |
| 1452 | if (hash != 0) |
| 1453 | return hash; |
| 1454 | int h = hashIgnoringCase(0, scheme); |
| 1455 | h = hash(h, fragment); |
| 1456 | if (isOpaque()) { |
| 1457 | h = hash(h, schemeSpecificPart); |
| 1458 | } else { |
| 1459 | h = hash(h, path); |
| 1460 | h = hash(h, query); |
| 1461 | if (host != null) { |
| 1462 | h = hash(h, userInfo); |
| 1463 | h = hashIgnoringCase(h, host); |
| 1464 | h += 1949 * port; |
| 1465 | } else { |
| 1466 | h = hash(h, authority); |
| 1467 | } |
| 1468 | } |
| 1469 | hash = h; |
| 1470 | return h; |
| 1471 | } |
| 1472 | |
| 1473 | /** |
| 1474 | * Compares this URI to another object, which must be a URI. |
| 1475 | * |
| 1476 | * <p> When comparing corresponding components of two URIs, if one |
| 1477 | * component is undefined but the other is defined then the first is |
| 1478 | * considered to be less than the second. Unless otherwise noted, string |
| 1479 | * components are ordered according to their natural, case-sensitive |
| 1480 | * ordering as defined by the {@link java.lang.String#compareTo(Object) |
| 1481 | * String.compareTo} method. String components that are subject to |
| 1482 | * encoding are compared by comparing their raw forms rather than their |
| 1483 | * encoded forms. |
| 1484 | * |
| 1485 | * <p> The ordering of URIs is defined as follows: </p> |
| 1486 | * |
| 1487 | * <ul> |
| 1488 | * |
| 1489 | * <li><p> Two URIs with different schemes are ordered according the |
| 1490 | * ordering of their schemes, without regard to case. </p></li> |
| 1491 | * |
| 1492 | * <li><p> A hierarchical URI is considered to be less than an opaque URI |
| 1493 | * with an identical scheme. </p></li> |
| 1494 | * |
| 1495 | * <li><p> Two opaque URIs with identical schemes are ordered according |
| 1496 | * to the ordering of their scheme-specific parts. </p></li> |
| 1497 | * |
| 1498 | * <li><p> Two opaque URIs with identical schemes and scheme-specific |
| 1499 | * parts are ordered according to the ordering of their |
| 1500 | * fragments. </p></li> |
| 1501 | * |
| 1502 | * <li><p> Two hierarchical URIs with identical schemes are ordered |
| 1503 | * according to the ordering of their authority components: </p> |
| 1504 | * |
| 1505 | * <ul> |
| 1506 | * |
| 1507 | * <li><p> If both authority components are server-based then the URIs |
| 1508 | * are ordered according to their user-information components; if these |
| 1509 | * components are identical then the URIs are ordered according to the |
| 1510 | * ordering of their hosts, without regard to case; if the hosts are |
| 1511 | * identical then the URIs are ordered according to the ordering of |
| 1512 | * their ports. </p></li> |
| 1513 | * |
| 1514 | * <li><p> If one or both authority components are registry-based then |
| 1515 | * the URIs are ordered according to the ordering of their authority |
| 1516 | * components. </p></li> |
| 1517 | * |
| 1518 | * </ul></li> |
| 1519 | * |
| 1520 | * <li><p> Finally, two hierarchical URIs with identical schemes and |
| 1521 | * authority components are ordered according to the ordering of their |
| 1522 | * paths; if their paths are identical then they are ordered according to |
| 1523 | * the ordering of their queries; if the queries are identical then they |
| 1524 | * are ordered according to the order of their fragments. </p></li> |
| 1525 | * |
| 1526 | * </ul> |
| 1527 | * |
| 1528 | * <p> This method satisfies the general contract of the {@link |
| 1529 | * java.lang.Comparable#compareTo(Object) Comparable.compareTo} |
| 1530 | * method. </p> |
| 1531 | * |
| 1532 | * @param that |
| 1533 | * The object to which this URI is to be compared |
| 1534 | * |
| 1535 | * @return A negative integer, zero, or a positive integer as this URI is |
| 1536 | * less than, equal to, or greater than the given URI |
| 1537 | * |
| 1538 | * @throws ClassCastException |
| 1539 | * If the given object is not a URI |
| 1540 | */ |
| 1541 | public int compareTo(URI that) { |
| 1542 | int c; |
| 1543 | |
| 1544 | if ((c = compareIgnoringCase(this.scheme, that.scheme)) != 0) |
| 1545 | return c; |
| 1546 | |
| 1547 | if (this.isOpaque()) { |
| 1548 | if (that.isOpaque()) { |
| 1549 | // Both opaque |
| 1550 | if ((c = compare(this.schemeSpecificPart, |
| 1551 | that.schemeSpecificPart)) != 0) |
| 1552 | return c; |
| 1553 | return compare(this.fragment, that.fragment); |
| 1554 | } |
| 1555 | return +1; // Opaque > hierarchical |
| 1556 | } else if (that.isOpaque()) { |
| 1557 | return -1; // Hierarchical < opaque |
| 1558 | } |
| 1559 | |
| 1560 | // Hierarchical |
| 1561 | if ((this.host != null) && (that.host != null)) { |
| 1562 | // Both server-based |
| 1563 | if ((c = compare(this.userInfo, that.userInfo)) != 0) |
| 1564 | return c; |
| 1565 | if ((c = compareIgnoringCase(this.host, that.host)) != 0) |
| 1566 | return c; |
| 1567 | if ((c = this.port - that.port) != 0) |
| 1568 | return c; |
| 1569 | } else { |
| 1570 | // If one or both authorities are registry-based then we simply |
| 1571 | // compare them in the usual, case-sensitive way. If one is |
| 1572 | // registry-based and one is server-based then the strings are |
| 1573 | // guaranteed to be unequal, hence the comparison will never return |
| 1574 | // zero and the compareTo and equals methods will remain |
| 1575 | // consistent. |
| 1576 | if ((c = compare(this.authority, that.authority)) != 0) return c; |
| 1577 | } |
| 1578 | |
| 1579 | if ((c = compare(this.path, that.path)) != 0) return c; |
| 1580 | if ((c = compare(this.query, that.query)) != 0) return c; |
| 1581 | return compare(this.fragment, that.fragment); |
| 1582 | } |
| 1583 | |
| 1584 | /** |
| 1585 | * Returns the content of this URI as a string. |
| 1586 | * |
| 1587 | * <p> If this URI was created by invoking one of the constructors in this |
| 1588 | * class then a string equivalent to the original input string, or to the |
| 1589 | * string computed from the originally-given components, as appropriate, is |
| 1590 | * returned. Otherwise this URI was created by normalization, resolution, |
| 1591 | * or relativization, and so a string is constructed from this URI's |
| 1592 | * components according to the rules specified in <a |
| 1593 | * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, |
| 1594 | * section 5.2, step 7. </p> |
| 1595 | * |
| 1596 | * @return The string form of this URI |
| 1597 | */ |
| 1598 | public String toString() { |
| 1599 | defineString(); |
| 1600 | return string; |
| 1601 | } |
| 1602 | |
| 1603 | /** |
| 1604 | * Returns the content of this URI as a US-ASCII string. |
| 1605 | * |
| 1606 | * <p> If this URI does not contain any characters in the <i>other</i> |
| 1607 | * category then an invocation of this method will return the same value as |
| 1608 | * an invocation of the {@link #toString() toString} method. Otherwise |
| 1609 | * this method works as if by invoking that method and then <a |
| 1610 | * href="#encode">encoding</a> the result. </p> |
| 1611 | * |
| 1612 | * @return The string form of this URI, encoded as needed |
| 1613 | * so that it only contains characters in the US-ASCII |
| 1614 | * charset |
| 1615 | */ |
| 1616 | public String toASCIIString() { |
| 1617 | defineString(); |
| 1618 | return encode(string); |
| 1619 | } |
| 1620 | |
| 1621 | |
| 1622 | // -- Serialization support -- |
| 1623 | |
| 1624 | /** |
| 1625 | * Saves the content of this URI to the given serial stream. |
| 1626 | * |
| 1627 | * <p> The only serializable field of a URI instance is its {@code string} |
| 1628 | * field. That field is given a value, if it does not have one already, |
| 1629 | * and then the {@link java.io.ObjectOutputStream#defaultWriteObject()} |
| 1630 | * method of the given object-output stream is invoked. </p> |
| 1631 | * |
| 1632 | * @param os The object-output stream to which this object |
| 1633 | * is to be written |
| 1634 | */ |
| 1635 | private void writeObject(ObjectOutputStream os) |
| 1636 | throws IOException |
| 1637 | { |
| 1638 | defineString(); |
| 1639 | os.defaultWriteObject(); // Writes the string field only |
| 1640 | } |
| 1641 | |
| 1642 | /** |
| 1643 | * Reconstitutes a URI from the given serial stream. |
| 1644 | * |
| 1645 | * <p> The {@link java.io.ObjectInputStream#defaultReadObject()} method is |
| 1646 | * invoked to read the value of the {@code string} field. The result is |
| 1647 | * then parsed in the usual way. |
| 1648 | * |
| 1649 | * @param is The object-input stream from which this object |
| 1650 | * is being read |
| 1651 | */ |
| 1652 | private void readObject(ObjectInputStream is) |
| 1653 | throws ClassNotFoundException, IOException |
| 1654 | { |
| 1655 | port = -1; // Argh |
| 1656 | is.defaultReadObject(); |
| 1657 | try { |
| 1658 | new Parser(string).parse(false); |
| 1659 | } catch (URISyntaxException x) { |
| 1660 | IOException y = new InvalidObjectException("Invalid URI"); |
| 1661 | y.initCause(x); |
| 1662 | throw y; |
| 1663 | } |
| 1664 | } |
| 1665 | |
| 1666 | |
| 1667 | // -- End of public methods -- |
| 1668 | |
| 1669 | |
| 1670 | // -- Utility methods for string-field comparison and hashing -- |
| 1671 | |
| 1672 | // These methods return appropriate values for null string arguments, |
| 1673 | // thereby simplifying the equals, hashCode, and compareTo methods. |
| 1674 | // |
| 1675 | // The case-ignoring methods should only be applied to strings whose |
| 1676 | // characters are all known to be US-ASCII. Because of this restriction, |
| 1677 | // these methods are faster than the similar methods in the String class. |
| 1678 | |
| 1679 | // US-ASCII only |
| 1680 | private static int toLower(char c) { |
| 1681 | if ((c >= 'A') && (c <= 'Z')) |
| 1682 | return c + ('a' - 'A'); |
| 1683 | return c; |
| 1684 | } |
| 1685 | |
| 1686 | // US-ASCII only |
| 1687 | private static int toUpper(char c) { |
| 1688 | if ((c >= 'a') && (c <= 'z')) |
| 1689 | return c - ('a' - 'A'); |
| 1690 | return c; |
| 1691 | } |
| 1692 | |
| 1693 | private static boolean equal(String s, String t) { |
| 1694 | if (s == t) return true; |
| 1695 | if ((s != null) && (t != null)) { |
| 1696 | if (s.length() != t.length()) |
| 1697 | return false; |
| 1698 | if (s.indexOf('%') < 0) |
| 1699 | return s.equals(t); |
| 1700 | int n = s.length(); |
| 1701 | for (int i = 0; i < n;) { |
| 1702 | char c = s.charAt(i); |
| 1703 | char d = t.charAt(i); |
| 1704 | if (c != '%') { |
| 1705 | if (c != d) |
| 1706 | return false; |
| 1707 | i++; |
| 1708 | continue; |
| 1709 | } |
| 1710 | if (d != '%') |
| 1711 | return false; |
| 1712 | i++; |
| 1713 | if (toLower(s.charAt(i)) != toLower(t.charAt(i))) |
| 1714 | return false; |
| 1715 | i++; |
| 1716 | if (toLower(s.charAt(i)) != toLower(t.charAt(i))) |
| 1717 | return false; |
| 1718 | i++; |
| 1719 | } |
| 1720 | return true; |
| 1721 | } |
| 1722 | return false; |
| 1723 | } |
| 1724 | |
| 1725 | // US-ASCII only |
| 1726 | private static boolean equalIgnoringCase(String s, String t) { |
| 1727 | if (s == t) return true; |
| 1728 | if ((s != null) && (t != null)) { |
| 1729 | int n = s.length(); |
| 1730 | if (t.length() != n) |
| 1731 | return false; |
| 1732 | for (int i = 0; i < n; i++) { |
| 1733 | if (toLower(s.charAt(i)) != toLower(t.charAt(i))) |
| 1734 | return false; |
| 1735 | } |
| 1736 | return true; |
| 1737 | } |
| 1738 | return false; |
| 1739 | } |
| 1740 | |
| 1741 | private static int hash(int hash, String s) { |
| 1742 | if (s == null) return hash; |
| 1743 | return s.indexOf('%') < 0 ? hash * 127 + s.hashCode() |
| 1744 | : normalizedHash(hash, s); |
| 1745 | } |
| 1746 | |
| 1747 | |
| 1748 | private static int normalizedHash(int hash, String s) { |
| 1749 | int h = 0; |
| 1750 | for (int index = 0; index < s.length(); index++) { |
| 1751 | char ch = s.charAt(index); |
| 1752 | h = 31 * h + ch; |
| 1753 | if (ch == '%') { |
| 1754 | /* |
| 1755 | * Process the next two encoded characters |
| 1756 | */ |
| 1757 | for (int i = index + 1; i < index + 3; i++) |
| 1758 | h = 31 * h + toUpper(s.charAt(i)); |
| 1759 | index += 2; |
| 1760 | } |
| 1761 | } |
| 1762 | return hash * 127 + h; |
| 1763 | } |
| 1764 | |
| 1765 | // US-ASCII only |
| 1766 | private static int hashIgnoringCase(int hash, String s) { |
| 1767 | if (s == null) return hash; |
| 1768 | int h = hash; |
| 1769 | int n = s.length(); |
| 1770 | for (int i = 0; i < n; i++) |
| 1771 | h = 31 * h + toLower(s.charAt(i)); |
| 1772 | return h; |
| 1773 | } |
| 1774 | |
| 1775 | private static int compare(String s, String t) { |
| 1776 | if (s == t) return 0; |
| 1777 | if (s != null) { |
| 1778 | if (t != null) |
| 1779 | return s.compareTo(t); |
| 1780 | else |
| 1781 | return +1; |
| 1782 | } else { |
| 1783 | return -1; |
| 1784 | } |
| 1785 | } |
| 1786 | |
| 1787 | // US-ASCII only |
| 1788 | private static int compareIgnoringCase(String s, String t) { |
| 1789 | if (s == t) return 0; |
| 1790 | if (s != null) { |
| 1791 | if (t != null) { |
| 1792 | int sn = s.length(); |
| 1793 | int tn = t.length(); |
| 1794 | int n = sn < tn ? sn : tn; |
| 1795 | for (int i = 0; i < n; i++) { |
| 1796 | int c = toLower(s.charAt(i)) - toLower(t.charAt(i)); |
| 1797 | if (c != 0) |
| 1798 | return c; |
| 1799 | } |
| 1800 | return sn - tn; |
| 1801 | } |
| 1802 | return +1; |
| 1803 | } else { |
| 1804 | return -1; |
| 1805 | } |
| 1806 | } |
| 1807 | |
| 1808 | |
| 1809 | // -- String construction -- |
| 1810 | |
| 1811 | // If a scheme is given then the path, if given, must be absolute |
| 1812 | // |
| 1813 | private static void checkPath(String s, String scheme, String path) |
| 1814 | throws URISyntaxException |
| 1815 | { |
| 1816 | if (scheme != null) { |
| 1817 | if ((path != null) |
| 1818 | && ((path.length() > 0) && (path.charAt(0) != '/'))) |
| 1819 | throw new URISyntaxException(s, |
| 1820 | "Relative path in absolute URI"); |
| 1821 | } |
| 1822 | } |
| 1823 | |
| 1824 | private void appendAuthority(StringBuffer sb, |
| 1825 | String authority, |
| 1826 | String userInfo, |
| 1827 | String host, |
| 1828 | int port) |
| 1829 | { |
| 1830 | if (host != null) { |
| 1831 | sb.append("//"); |
| 1832 | if (userInfo != null) { |
| 1833 | sb.append(quote(userInfo, L_USERINFO, H_USERINFO)); |
| 1834 | sb.append('@'); |
| 1835 | } |
| 1836 | boolean needBrackets = ((host.indexOf(':') >= 0) |
| 1837 | && !host.startsWith("[") |
| 1838 | && !host.endsWith("]")); |
| 1839 | if (needBrackets) sb.append('['); |
| 1840 | sb.append(host); |
| 1841 | if (needBrackets) sb.append(']'); |
| 1842 | if (port != -1) { |
| 1843 | sb.append(':'); |
| 1844 | sb.append(port); |
| 1845 | } |
| 1846 | } else if (authority != null) { |
| 1847 | sb.append("//"); |
| 1848 | if (authority.startsWith("[")) { |
| 1849 | // authority should (but may not) contain an embedded IPv6 address |
| 1850 | int end = authority.indexOf("]"); |
| 1851 | String doquote = authority, dontquote = ""; |
| 1852 | if (end != -1 && authority.indexOf(":") != -1) { |
| 1853 | // the authority contains an IPv6 address |
| 1854 | if (end == authority.length()) { |
| 1855 | dontquote = authority; |
| 1856 | doquote = ""; |
| 1857 | } else { |
| 1858 | dontquote = authority.substring(0 , end + 1); |
| 1859 | doquote = authority.substring(end + 1); |
| 1860 | } |
| 1861 | } |
| 1862 | sb.append(dontquote); |
| 1863 | sb.append(quote(doquote, |
| 1864 | L_REG_NAME | L_SERVER, |
| 1865 | H_REG_NAME | H_SERVER)); |
| 1866 | } else { |
| 1867 | sb.append(quote(authority, |
| 1868 | L_REG_NAME | L_SERVER, |
| 1869 | H_REG_NAME | H_SERVER)); |
| 1870 | } |
| 1871 | } |
| 1872 | } |
| 1873 | |
| 1874 | private void appendSchemeSpecificPart(StringBuffer sb, |
| 1875 | String opaquePart, |
| 1876 | String authority, |
| 1877 | String userInfo, |
| 1878 | String host, |
| 1879 | int port, |
| 1880 | String path, |
| 1881 | String query) |
| 1882 | { |
| 1883 | if (opaquePart != null) { |
| 1884 | /* check if SSP begins with an IPv6 address |
| 1885 | * because we must not quote a literal IPv6 address |
| 1886 | */ |
| 1887 | if (opaquePart.startsWith("//[")) { |
| 1888 | int end = opaquePart.indexOf("]"); |
| 1889 | if (end != -1 && opaquePart.indexOf(":")!=-1) { |
| 1890 | String doquote, dontquote; |
| 1891 | if (end == opaquePart.length()) { |
| 1892 | dontquote = opaquePart; |
| 1893 | doquote = ""; |
| 1894 | } else { |
| 1895 | dontquote = opaquePart.substring(0,end+1); |
| 1896 | doquote = opaquePart.substring(end+1); |
| 1897 | } |
| 1898 | sb.append (dontquote); |
| 1899 | sb.append(quote(doquote, L_URIC, H_URIC)); |
| 1900 | } |
| 1901 | } else { |
| 1902 | sb.append(quote(opaquePart, L_URIC, H_URIC)); |
| 1903 | } |
| 1904 | } else { |
| 1905 | appendAuthority(sb, authority, userInfo, host, port); |
| 1906 | if (path != null) |
| 1907 | sb.append(quote(path, L_PATH, H_PATH)); |
| 1908 | if (query != null) { |
| 1909 | sb.append('?'); |
| 1910 | sb.append(quote(query, L_URIC, H_URIC)); |
| 1911 | } |
| 1912 | } |
| 1913 | } |
| 1914 | |
| 1915 | private void appendFragment(StringBuffer sb, String fragment) { |
| 1916 | if (fragment != null) { |
| 1917 | sb.append('#'); |
| 1918 | sb.append(quote(fragment, L_URIC, H_URIC)); |
| 1919 | } |
| 1920 | } |
| 1921 | |
| 1922 | private String toString(String scheme, |
| 1923 | String opaquePart, |
| 1924 | String authority, |
| 1925 | String userInfo, |
| 1926 | String host, |
| 1927 | int port, |
| 1928 | String path, |
| 1929 | String query, |
| 1930 | String fragment) |
| 1931 | { |
| 1932 | StringBuffer sb = new StringBuffer(); |
| 1933 | if (scheme != null) { |
| 1934 | sb.append(scheme); |
| 1935 | sb.append(':'); |
| 1936 | } |
| 1937 | appendSchemeSpecificPart(sb, opaquePart, |
| 1938 | authority, userInfo, host, port, |
| 1939 | path, query); |
| 1940 | appendFragment(sb, fragment); |
| 1941 | return sb.toString(); |
| 1942 | } |
| 1943 | |
| 1944 | private void defineSchemeSpecificPart() { |
| 1945 | if (schemeSpecificPart != null) return; |
| 1946 | StringBuffer sb = new StringBuffer(); |
| 1947 | appendSchemeSpecificPart(sb, null, getAuthority(), getUserInfo(), |
| 1948 | host, port, getPath(), getQuery()); |
| 1949 | if (sb.length() == 0) return; |
| 1950 | schemeSpecificPart = sb.toString(); |
| 1951 | } |
| 1952 | |
| 1953 | private void defineString() { |
| 1954 | if (string != null) return; |
| 1955 | |
| 1956 | StringBuffer sb = new StringBuffer(); |
| 1957 | if (scheme != null) { |
| 1958 | sb.append(scheme); |
| 1959 | sb.append(':'); |
| 1960 | } |
| 1961 | if (isOpaque()) { |
| 1962 | sb.append(schemeSpecificPart); |
| 1963 | } else { |
| 1964 | if (host != null) { |
| 1965 | sb.append("//"); |
| 1966 | if (userInfo != null) { |
| 1967 | sb.append(userInfo); |
| 1968 | sb.append('@'); |
| 1969 | } |
| 1970 | boolean needBrackets = ((host.indexOf(':') >= 0) |
| 1971 | && !host.startsWith("[") |
| 1972 | && !host.endsWith("]")); |
| 1973 | if (needBrackets) sb.append('['); |
| 1974 | sb.append(host); |
| 1975 | if (needBrackets) sb.append(']'); |
| 1976 | if (port != -1) { |
| 1977 | sb.append(':'); |
| 1978 | sb.append(port); |
| 1979 | } |
| 1980 | } else if (authority != null) { |
| 1981 | sb.append("//"); |
| 1982 | sb.append(authority); |
| 1983 | } |
| 1984 | if (path != null) |
| 1985 | sb.append(path); |
| 1986 | if (query != null) { |
| 1987 | sb.append('?'); |
| 1988 | sb.append(query); |
| 1989 | } |
| 1990 | } |
| 1991 | if (fragment != null) { |
| 1992 | sb.append('#'); |
| 1993 | sb.append(fragment); |
| 1994 | } |
| 1995 | string = sb.toString(); |
| 1996 | } |
| 1997 | |
| 1998 | |
| 1999 | // -- Normalization, resolution, and relativization -- |
| 2000 | |
| 2001 | // RFC2396 5.2 (6) |
| 2002 | private static String resolvePath(String base, String child, |
| 2003 | boolean absolute) |
| 2004 | { |
| 2005 | int i = base.lastIndexOf('/'); |
| 2006 | int cn = child.length(); |
| 2007 | String path = ""; |
| 2008 | |
| 2009 | if (cn == 0) { |
| 2010 | // 5.2 (6a) |
| 2011 | if (i >= 0) |
| 2012 | path = base.substring(0, i + 1); |
| 2013 | } else { |
| 2014 | StringBuffer sb = new StringBuffer(base.length() + cn); |
| 2015 | // 5.2 (6a) |
| 2016 | if (i >= 0) |
| 2017 | sb.append(base.substring(0, i + 1)); |
| 2018 | // 5.2 (6b) |
| 2019 | sb.append(child); |
| 2020 | path = sb.toString(); |
| 2021 | } |
| 2022 | |
| 2023 | // 5.2 (6c-f) |
| 2024 | // Android-changed: App compat. Remove leading dots when resolving path. http://b/25897693 |
| 2025 | // String np = normalize(path); |
| 2026 | String np = normalize(path, true); |
| 2027 | |
| 2028 | // 5.2 (6g): If the result is absolute but the path begins with "../", |
| 2029 | // then we simply leave the path as-is |
| 2030 | |
| 2031 | return np; |
| 2032 | } |
| 2033 | |
| 2034 | // RFC2396 5.2 |
| 2035 | private static URI resolve(URI base, URI child) { |
| 2036 | // check if child if opaque first so that NPE is thrown |
| 2037 | // if child is null. |
| 2038 | if (child.isOpaque() || base.isOpaque()) |
| 2039 | return child; |
| 2040 | |
| 2041 | // 5.2 (2): Reference to current document (lone fragment) |
| 2042 | if ((child.scheme == null) && (child.authority == null) |
| 2043 | && child.path.equals("") && (child.fragment != null) |
| 2044 | && (child.query == null)) { |
| 2045 | if ((base.fragment != null) |
| 2046 | && child.fragment.equals(base.fragment)) { |
| 2047 | return base; |
| 2048 | } |
| 2049 | URI ru = new URI(); |
| 2050 | ru.scheme = base.scheme; |
| 2051 | ru.authority = base.authority; |
| 2052 | ru.userInfo = base.userInfo; |
| 2053 | ru.host = base.host; |
| 2054 | ru.port = base.port; |
| 2055 | ru.path = base.path; |
| 2056 | ru.fragment = child.fragment; |
| 2057 | ru.query = base.query; |
| 2058 | return ru; |
| 2059 | } |
| 2060 | |
| 2061 | // 5.2 (3): Child is absolute |
| 2062 | if (child.scheme != null) |
| 2063 | return child; |
| 2064 | |
| 2065 | URI ru = new URI(); // Resolved URI |
| 2066 | ru.scheme = base.scheme; |
| 2067 | ru.query = child.query; |
| 2068 | ru.fragment = child.fragment; |
| 2069 | |
| 2070 | // 5.2 (4): Authority |
| 2071 | if (child.authority == null) { |
| 2072 | ru.authority = base.authority; |
| 2073 | ru.host = base.host; |
| 2074 | ru.userInfo = base.userInfo; |
| 2075 | ru.port = base.port; |
| 2076 | |
| 2077 | // BEGIN Android-changed: App Compat. Handle null and empty path using RFC 3986 logic |
| 2078 | // http://b/25897693 |
| 2079 | if (child.path == null || child.path.isEmpty()) { |
| 2080 | // This is an additional path from RFC 3986 RI, which fixes following RFC 2396 |
| 2081 | // "normal" examples: |
| 2082 | // Base: http://a/b/c/d;p?q |
| 2083 | // "?y" = "http://a/b/c/d;p?y" |
| 2084 | // "" = "http://a/b/c/d;p?q" |
| 2085 | // http://b/25897693 |
| 2086 | ru.path = base.path; |
| 2087 | ru.query = child.query != null ? child.query : base.query; |
| 2088 | // END Android-changed: App Compat. Handle null and empty path using RFC 3986 logic |
| 2089 | } else if ((child.path.length() > 0) && (child.path.charAt(0) == '/')) { |
| 2090 | // 5.2 (5): Child path is absolute |
| 2091 | // |
| 2092 | // Android-changed: App Compat. Remove leading dots in path. |
| 2093 | // There is an additional step from RFC 3986 RI, requiring to remove dots for |
| 2094 | // absolute path as well. |
| 2095 | // http://b/25897693 |
| 2096 | // ru.path = child.path; |
| 2097 | ru.path = normalize(child.path, true); |
| 2098 | } else { |
| 2099 | // 5.2 (6): Resolve relative path |
| 2100 | ru.path = resolvePath(base.path, child.path, base.isAbsolute()); |
| 2101 | } |
| 2102 | } else { |
| 2103 | ru.authority = child.authority; |
| 2104 | ru.host = child.host; |
| 2105 | ru.userInfo = child.userInfo; |
| 2106 | ru.host = child.host; |
| 2107 | ru.port = child.port; |
| 2108 | ru.path = child.path; |
| 2109 | } |
| 2110 | |
| 2111 | // 5.2 (7): Recombine (nothing to do here) |
| 2112 | return ru; |
| 2113 | } |
| 2114 | |
| 2115 | // If the given URI's path is normal then return the URI; |
| 2116 | // o.w., return a new URI containing the normalized path. |
| 2117 | // |
| 2118 | private static URI normalize(URI u) { |
| 2119 | if (u.isOpaque() || (u.path == null) || (u.path.length() == 0)) |
| 2120 | return u; |
| 2121 | |
| 2122 | String np = normalize(u.path); |
| 2123 | if (np == u.path) |
| 2124 | return u; |
| 2125 | |
| 2126 | URI v = new URI(); |
| 2127 | v.scheme = u.scheme; |
| 2128 | v.fragment = u.fragment; |
| 2129 | v.authority = u.authority; |
| 2130 | v.userInfo = u.userInfo; |
| 2131 | v.host = u.host; |
| 2132 | v.port = u.port; |
| 2133 | v.path = np; |
| 2134 | v.query = u.query; |
| 2135 | return v; |
| 2136 | } |
| 2137 | |
| 2138 | // If both URIs are hierarchical, their scheme and authority components are |
| 2139 | // identical, and the base path is a prefix of the child's path, then |
| 2140 | // return a relative URI that, when resolved against the base, yields the |
| 2141 | // child; otherwise, return the child. |
| 2142 | // |
| 2143 | private static URI relativize(URI base, URI child) { |
| 2144 | // check if child if opaque first so that NPE is thrown |
| 2145 | // if child is null. |
| 2146 | if (child.isOpaque() || base.isOpaque()) |
| 2147 | return child; |
| 2148 | if (!equalIgnoringCase(base.scheme, child.scheme) |
| 2149 | || !equal(base.authority, child.authority)) |
| 2150 | return child; |
| 2151 | |
| 2152 | String bp = normalize(base.path); |
| 2153 | String cp = normalize(child.path); |
| 2154 | if (!bp.equals(cp)) { |
| 2155 | // Android-changed: App Compat. Interpret ambiguous base path as a file, not a directory |
| 2156 | // Upstream would append '/' to bp if not present, interpreting it as a directory; thus, |
| 2157 | // /a/b/c relative to /a/b would become /c, whereas Android would relativize to /b/c. |
| 2158 | // The spec is pretty vague about this but the Android behavior is kept because several |
| 2159 | // tests enforce it. |
| 2160 | // if (!bp.endsWith("/")) |
| 2161 | // bp = bp + "/"; |
| 2162 | if (bp.indexOf('/') != -1) { |
| 2163 | bp = bp.substring(0, bp.lastIndexOf('/') + 1); |
| 2164 | } |
| 2165 | |
| 2166 | if (!cp.startsWith(bp)) |
| 2167 | return child; |
| 2168 | } |
| 2169 | |
| 2170 | URI v = new URI(); |
| 2171 | v.path = cp.substring(bp.length()); |
| 2172 | v.query = child.query; |
| 2173 | v.fragment = child.fragment; |
| 2174 | return v; |
| 2175 | } |
| 2176 | |
| 2177 | |
| 2178 | |
| 2179 | // -- Path normalization -- |
| 2180 | |
| 2181 | // The following algorithm for path normalization avoids the creation of a |
| 2182 | // string object for each segment, as well as the use of a string buffer to |
| 2183 | // compute the final result, by using a single char array and editing it in |
| 2184 | // place. The array is first split into segments, replacing each slash |
| 2185 | // with '\0' and creating a segment-index array, each element of which is |
| 2186 | // the index of the first char in the corresponding segment. We then walk |
| 2187 | // through both arrays, removing ".", "..", and other segments as necessary |
| 2188 | // by setting their entries in the index array to -1. Finally, the two |
| 2189 | // arrays are used to rejoin the segments and compute the final result. |
| 2190 | // |
| 2191 | // This code is based upon src/solaris/native/java/io/canonicalize_md.c |
| 2192 | |
| 2193 | |
| 2194 | // Check the given path to see if it might need normalization. A path |
| 2195 | // might need normalization if it contains duplicate slashes, a "." |
| 2196 | // segment, or a ".." segment. Return -1 if no further normalization is |
| 2197 | // possible, otherwise return the number of segments found. |
| 2198 | // |
| 2199 | // This method takes a string argument rather than a char array so that |
| 2200 | // this test can be performed without invoking path.toCharArray(). |
| 2201 | // |
| 2202 | static private int needsNormalization(String path) { |
| 2203 | boolean normal = true; |
| 2204 | int ns = 0; // Number of segments |
| 2205 | int end = path.length() - 1; // Index of last char in path |
| 2206 | int p = 0; // Index of next char in path |
| 2207 | |
| 2208 | // Skip initial slashes |
| 2209 | while (p <= end) { |
| 2210 | if (path.charAt(p) != '/') break; |
| 2211 | p++; |
| 2212 | } |
| 2213 | if (p > 1) normal = false; |
| 2214 | |
| 2215 | // Scan segments |
| 2216 | while (p <= end) { |
| 2217 | |
| 2218 | // Looking at "." or ".." ? |
| 2219 | if ((path.charAt(p) == '.') |
| 2220 | && ((p == end) |
| 2221 | || ((path.charAt(p + 1) == '/') |
| 2222 | || ((path.charAt(p + 1) == '.') |
| 2223 | && ((p + 1 == end) |
| 2224 | || (path.charAt(p + 2) == '/')))))) { |
| 2225 | normal = false; |
| 2226 | } |
| 2227 | ns++; |
| 2228 | |
| 2229 | // Find beginning of next segment |
| 2230 | while (p <= end) { |
| 2231 | if (path.charAt(p++) != '/') |
| 2232 | continue; |
| 2233 | |
| 2234 | // Skip redundant slashes |
| 2235 | while (p <= end) { |
| 2236 | if (path.charAt(p) != '/') break; |
| 2237 | normal = false; |
| 2238 | p++; |
| 2239 | } |
| 2240 | |
| 2241 | break; |
| 2242 | } |
| 2243 | } |
| 2244 | |
| 2245 | return normal ? -1 : ns; |
| 2246 | } |
| 2247 | |
| 2248 | |
| 2249 | // Split the given path into segments, replacing slashes with nulls and |
| 2250 | // filling in the given segment-index array. |
| 2251 | // |
| 2252 | // Preconditions: |
| 2253 | // segs.length == Number of segments in path |
| 2254 | // |
| 2255 | // Postconditions: |
| 2256 | // All slashes in path replaced by '\0' |
| 2257 | // segs[i] == Index of first char in segment i (0 <= i < segs.length) |
| 2258 | // |
| 2259 | static private void split(char[] path, int[] segs) { |
| 2260 | int end = path.length - 1; // Index of last char in path |
| 2261 | int p = 0; // Index of next char in path |
| 2262 | int i = 0; // Index of current segment |
| 2263 | |
| 2264 | // Skip initial slashes |
| 2265 | while (p <= end) { |
| 2266 | if (path[p] != '/') break; |
| 2267 | path[p] = '\0'; |
| 2268 | p++; |
| 2269 | } |
| 2270 | |
| 2271 | while (p <= end) { |
| 2272 | |
| 2273 | // Note start of segment |
| 2274 | segs[i++] = p++; |
| 2275 | |
| 2276 | // Find beginning of next segment |
| 2277 | while (p <= end) { |
| 2278 | if (path[p++] != '/') |
| 2279 | continue; |
| 2280 | path[p - 1] = '\0'; |
| 2281 | |
| 2282 | // Skip redundant slashes |
| 2283 | while (p <= end) { |
| 2284 | if (path[p] != '/') break; |
| 2285 | path[p++] = '\0'; |
| 2286 | } |
| 2287 | break; |
| 2288 | } |
| 2289 | } |
| 2290 | |
| 2291 | if (i != segs.length) |
| 2292 | throw new InternalError(); // ASSERT |
| 2293 | } |
| 2294 | |
| 2295 | |
| 2296 | // Join the segments in the given path according to the given segment-index |
| 2297 | // array, ignoring those segments whose index entries have been set to -1, |
| 2298 | // and inserting slashes as needed. Return the length of the resulting |
| 2299 | // path. |
| 2300 | // |
| 2301 | // Preconditions: |
| 2302 | // segs[i] == -1 implies segment i is to be ignored |
| 2303 | // path computed by split, as above, with '\0' having replaced '/' |
| 2304 | // |
| 2305 | // Postconditions: |
| 2306 | // path[0] .. path[return value - 1] == Resulting path |
| 2307 | // |
| 2308 | static private int join(char[] path, int[] segs) { |
| 2309 | int ns = segs.length; // Number of segments |
| 2310 | int end = path.length - 1; // Index of last char in path |
| 2311 | int p = 0; // Index of next path char to write |
| 2312 | |
| 2313 | if (path[p] == '\0') { |
| 2314 | // Restore initial slash for absolute paths |
| 2315 | path[p++] = '/'; |
| 2316 | } |
| 2317 | |
| 2318 | for (int i = 0; i < ns; i++) { |
| 2319 | int q = segs[i]; // Current segment |
| 2320 | if (q == -1) |
| 2321 | // Ignore this segment |
| 2322 | continue; |
| 2323 | |
| 2324 | if (p == q) { |
| 2325 | // We're already at this segment, so just skip to its end |
| 2326 | while ((p <= end) && (path[p] != '\0')) |
| 2327 | p++; |
| 2328 | if (p <= end) { |
| 2329 | // Preserve trailing slash |
| 2330 | path[p++] = '/'; |
| 2331 | } |
| 2332 | } else if (p < q) { |
| 2333 | // Copy q down to p |
| 2334 | while ((q <= end) && (path[q] != '\0')) |
| 2335 | path[p++] = path[q++]; |
| 2336 | if (q <= end) { |
| 2337 | // Preserve trailing slash |
| 2338 | path[p++] = '/'; |
| 2339 | } |
| 2340 | } else |
| 2341 | throw new InternalError(); // ASSERT false |
| 2342 | } |
| 2343 | |
| 2344 | return p; |
| 2345 | } |
| 2346 | |
| 2347 | |
| 2348 | // Remove "." segments from the given path, and remove segment pairs |
| 2349 | // consisting of a non-".." segment followed by a ".." segment. |
| 2350 | // |
| 2351 | // Android-changed: App compat. Remove leading dots when resolving path. http://b/25897693 |
| 2352 | // private static void removeDots(char[] path, int[] segs) { |
| 2353 | private static void removeDots(char[] path, int[] segs, boolean removeLeading) { |
| 2354 | int ns = segs.length; |
| 2355 | int end = path.length - 1; |
| 2356 | |
| 2357 | for (int i = 0; i < ns; i++) { |
| 2358 | int dots = 0; // Number of dots found (0, 1, or 2) |
| 2359 | |
| 2360 | // Find next occurrence of "." or ".." |
| 2361 | do { |
| 2362 | int p = segs[i]; |
| 2363 | if (path[p] == '.') { |
| 2364 | if (p == end) { |
| 2365 | dots = 1; |
| 2366 | break; |
| 2367 | } else if (path[p + 1] == '\0') { |
| 2368 | dots = 1; |
| 2369 | break; |
| 2370 | } else if ((path[p + 1] == '.') |
| 2371 | && ((p + 1 == end) |
| 2372 | || (path[p + 2] == '\0'))) { |
| 2373 | dots = 2; |
| 2374 | break; |
| 2375 | } |
| 2376 | } |
| 2377 | i++; |
| 2378 | } while (i < ns); |
| 2379 | if ((i > ns) || (dots == 0)) |
| 2380 | break; |
| 2381 | |
| 2382 | if (dots == 1) { |
| 2383 | // Remove this occurrence of "." |
| 2384 | segs[i] = -1; |
| 2385 | } else { |
| 2386 | // If there is a preceding non-".." segment, remove both that |
| 2387 | // segment and this occurrence of ".." |
| 2388 | int j; |
| 2389 | for (j = i - 1; j >= 0; j--) { |
| 2390 | if (segs[j] != -1) break; |
| 2391 | } |
| 2392 | if (j >= 0) { |
| 2393 | int q = segs[j]; |
| 2394 | if (!((path[q] == '.') |
| 2395 | && (path[q + 1] == '.') |
| 2396 | && (path[q + 2] == '\0'))) { |
| 2397 | segs[i] = -1; |
| 2398 | segs[j] = -1; |
| 2399 | } |
| 2400 | // Android-added: App compat. Remove leading dots when resolving path. |
| 2401 | // This is a leading ".." segment. Per RFC 3986 RI, this should be removed as |
| 2402 | // well. This fixes RFC 2396 "abnormal" examples. |
| 2403 | // http://b/25897693 |
| 2404 | } else if (removeLeading) { |
| 2405 | segs[i] = -1; |
| 2406 | } |
| 2407 | } |
| 2408 | } |
| 2409 | } |
| 2410 | |
| 2411 | |
| 2412 | // DEVIATION: If the normalized path is relative, and if the first |
| 2413 | // segment could be parsed as a scheme name, then prepend a "." segment |
| 2414 | // |
| 2415 | private static void maybeAddLeadingDot(char[] path, int[] segs) { |
| 2416 | |
| 2417 | if (path[0] == '\0') |
| 2418 | // The path is absolute |
| 2419 | return; |
| 2420 | |
| 2421 | int ns = segs.length; |
| 2422 | int f = 0; // Index of first segment |
| 2423 | while (f < ns) { |
| 2424 | if (segs[f] >= 0) |
| 2425 | break; |
| 2426 | f++; |
| 2427 | } |
| 2428 | if ((f >= ns) || (f == 0)) |
| 2429 | // The path is empty, or else the original first segment survived, |
| 2430 | // in which case we already know that no leading "." is needed |
| 2431 | return; |
| 2432 | |
| 2433 | int p = segs[f]; |
| 2434 | while ((p < path.length) && (path[p] != ':') && (path[p] != '\0')) p++; |
| 2435 | if (p >= path.length || path[p] == '\0') |
| 2436 | // No colon in first segment, so no "." needed |
| 2437 | return; |
| 2438 | |
| 2439 | // At this point we know that the first segment is unused, |
| 2440 | // hence we can insert a "." segment at that position |
| 2441 | path[0] = '.'; |
| 2442 | path[1] = '\0'; |
| 2443 | segs[0] = 0; |
| 2444 | } |
| 2445 | |
| 2446 | |
| 2447 | // Normalize the given path string. A normal path string has no empty |
| 2448 | // segments (i.e., occurrences of "//"), no segments equal to ".", and no |
| 2449 | // segments equal to ".." that are preceded by a segment not equal to "..". |
| 2450 | // In contrast to Unix-style pathname normalization, for URI paths we |
| 2451 | // always retain trailing slashes. |
| 2452 | // |
| 2453 | private static String normalize(String ps) { |
| 2454 | // BEGIN Android-changed: App compat. Remove leading dots when resolving path. |
| 2455 | // Controlled by the "boolean removeLeading" argument added to normalize(). |
| 2456 | return normalize(ps, false); |
| 2457 | } |
| 2458 | |
| 2459 | private static String normalize(String ps, boolean removeLeading) { |
| 2460 | // END Android-changed: App compat. Remove leading dots when resolving path. |
| 2461 | // Does this path need normalization? |
| 2462 | int ns = needsNormalization(ps); // Number of segments |
| 2463 | if (ns < 0) |
| 2464 | // Nope -- just return it |
| 2465 | return ps; |
| 2466 | |
| 2467 | char[] path = ps.toCharArray(); // Path in char-array form |
| 2468 | |
| 2469 | // Split path into segments |
| 2470 | int[] segs = new int[ns]; // Segment-index array |
| 2471 | split(path, segs); |
| 2472 | |
| 2473 | // Remove dots |
| 2474 | // Android-changed: App compat. Remove leading dots when resolving path. |
| 2475 | // removeDots(path, segs); |
| 2476 | removeDots(path, segs, removeLeading); |
| 2477 | |
| 2478 | // Prevent scheme-name confusion |
| 2479 | maybeAddLeadingDot(path, segs); |
| 2480 | |
| 2481 | // Join the remaining segments and return the result |
| 2482 | String s = new String(path, 0, join(path, segs)); |
| 2483 | if (s.equals(ps)) { |
| 2484 | // string was already normalized |
| 2485 | return ps; |
| 2486 | } |
| 2487 | return s; |
| 2488 | } |
| 2489 | |
| 2490 | |
| 2491 | |
| 2492 | // -- Character classes for parsing -- |
| 2493 | |
| 2494 | // RFC2396 precisely specifies which characters in the US-ASCII charset are |
| 2495 | // permissible in the various components of a URI reference. We here |
| 2496 | // define a set of mask pairs to aid in enforcing these restrictions. Each |
| 2497 | // mask pair consists of two longs, a low mask and a high mask. Taken |
| 2498 | // together they represent a 128-bit mask, where bit i is set iff the |
| 2499 | // character with value i is permitted. |
| 2500 | // |
| 2501 | // This approach is more efficient than sequentially searching arrays of |
| 2502 | // permitted characters. It could be made still more efficient by |
| 2503 | // precompiling the mask information so that a character's presence in a |
| 2504 | // given mask could be determined by a single table lookup. |
| 2505 | |
| 2506 | // Compute the low-order mask for the characters in the given string |
| 2507 | private static long lowMask(String chars) { |
| 2508 | int n = chars.length(); |
| 2509 | long m = 0; |
| 2510 | for (int i = 0; i < n; i++) { |
| 2511 | char c = chars.charAt(i); |
| 2512 | if (c < 64) |
| 2513 | m |= (1L << c); |
| 2514 | } |
| 2515 | return m; |
| 2516 | } |
| 2517 | |
| 2518 | // Compute the high-order mask for the characters in the given string |
| 2519 | private static long highMask(String chars) { |
| 2520 | int n = chars.length(); |
| 2521 | long m = 0; |
| 2522 | for (int i = 0; i < n; i++) { |
| 2523 | char c = chars.charAt(i); |
| 2524 | if ((c >= 64) && (c < 128)) |
| 2525 | m |= (1L << (c - 64)); |
| 2526 | } |
| 2527 | return m; |
| 2528 | } |
| 2529 | |
| 2530 | // Compute a low-order mask for the characters |
| 2531 | // between first and last, inclusive |
| 2532 | private static long lowMask(char first, char last) { |
| 2533 | long m = 0; |
| 2534 | int f = Math.max(Math.min(first, 63), 0); |
| 2535 | int l = Math.max(Math.min(last, 63), 0); |
| 2536 | for (int i = f; i <= l; i++) |
| 2537 | m |= 1L << i; |
| 2538 | return m; |
| 2539 | } |
| 2540 | |
| 2541 | // Compute a high-order mask for the characters |
| 2542 | // between first and last, inclusive |
| 2543 | private static long highMask(char first, char last) { |
| 2544 | long m = 0; |
| 2545 | int f = Math.max(Math.min(first, 127), 64) - 64; |
| 2546 | int l = Math.max(Math.min(last, 127), 64) - 64; |
| 2547 | for (int i = f; i <= l; i++) |
| 2548 | m |= 1L << i; |
| 2549 | return m; |
| 2550 | } |
| 2551 | |
| 2552 | // Tell whether the given character is permitted by the given mask pair |
| 2553 | private static boolean match(char c, long lowMask, long highMask) { |
| 2554 | if (c == 0) // 0 doesn't have a slot in the mask. So, it never matches. |
| 2555 | return false; |
| 2556 | if (c < 64) |
| 2557 | return ((1L << c) & lowMask) != 0; |
| 2558 | if (c < 128) |
| 2559 | return ((1L << (c - 64)) & highMask) != 0; |
| 2560 | return false; |
| 2561 | } |
| 2562 | |
| 2563 | // Character-class masks, in reverse order from RFC2396 because |
| 2564 | // initializers for static fields cannot make forward references. |
| 2565 | |
| 2566 | // digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | |
| 2567 | // "8" | "9" |
| 2568 | private static final long L_DIGIT = lowMask('0', '9'); |
| 2569 | private static final long H_DIGIT = 0L; |
| 2570 | |
| 2571 | // upalpha = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" | |
| 2572 | // "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" | |
| 2573 | // "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z" |
| 2574 | private static final long L_UPALPHA = 0L; |
| 2575 | private static final long H_UPALPHA = highMask('A', 'Z'); |
| 2576 | |
| 2577 | // lowalpha = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | |
| 2578 | // "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" | |
| 2579 | // "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z" |
| 2580 | private static final long L_LOWALPHA = 0L; |
| 2581 | private static final long H_LOWALPHA = highMask('a', 'z'); |
| 2582 | |
| 2583 | // alpha = lowalpha | upalpha |
| 2584 | private static final long L_ALPHA = L_LOWALPHA | L_UPALPHA; |
| 2585 | private static final long H_ALPHA = H_LOWALPHA | H_UPALPHA; |
| 2586 | |
| 2587 | // alphanum = alpha | digit |
| 2588 | private static final long L_ALPHANUM = L_DIGIT | L_ALPHA; |
| 2589 | private static final long H_ALPHANUM = H_DIGIT | H_ALPHA; |
| 2590 | |
| 2591 | // hex = digit | "A" | "B" | "C" | "D" | "E" | "F" | |
| 2592 | // "a" | "b" | "c" | "d" | "e" | "f" |
| 2593 | private static final long L_HEX = L_DIGIT; |
| 2594 | private static final long H_HEX = highMask('A', 'F') | highMask('a', 'f'); |
| 2595 | |
| 2596 | // mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | |
| 2597 | // "(" | ")" |
| 2598 | private static final long L_MARK = lowMask("-_.!~*'()"); |
| 2599 | private static final long H_MARK = highMask("-_.!~*'()"); |
| 2600 | |
| 2601 | // unreserved = alphanum | mark |
| 2602 | private static final long L_UNRESERVED = L_ALPHANUM | L_MARK; |
| 2603 | private static final long H_UNRESERVED = H_ALPHANUM | H_MARK; |
| 2604 | |
| 2605 | // reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | |
| 2606 | // "$" | "," | "[" | "]" |
| 2607 | // Added per RFC2732: "[", "]" |
| 2608 | private static final long L_RESERVED = lowMask(";/?:@&=+$,[]"); |
| 2609 | private static final long H_RESERVED = highMask(";/?:@&=+$,[]"); |
| 2610 | |
| 2611 | // The zero'th bit is used to indicate that escape pairs and non-US-ASCII |
| 2612 | // characters are allowed; this is handled by the scanEscape method below. |
| 2613 | private static final long L_ESCAPED = 1L; |
| 2614 | private static final long H_ESCAPED = 0L; |
| 2615 | |
| 2616 | // uric = reserved | unreserved | escaped |
| 2617 | private static final long L_URIC = L_RESERVED | L_UNRESERVED | L_ESCAPED; |
| 2618 | private static final long H_URIC = H_RESERVED | H_UNRESERVED | H_ESCAPED; |
| 2619 | |
| 2620 | // pchar = unreserved | escaped | |
| 2621 | // ":" | "@" | "&" | "=" | "+" | "$" | "," |
| 2622 | private static final long L_PCHAR |
| 2623 | = L_UNRESERVED | L_ESCAPED | lowMask(":@&=+$,"); |
| 2624 | private static final long H_PCHAR |
| 2625 | = H_UNRESERVED | H_ESCAPED | highMask(":@&=+$,"); |
| 2626 | |
| 2627 | // All valid path characters |
| 2628 | private static final long L_PATH = L_PCHAR | lowMask(";/"); |
| 2629 | private static final long H_PATH = H_PCHAR | highMask(";/"); |
| 2630 | |
| 2631 | // Dash, for use in domainlabel and toplabel |
| 2632 | private static final long L_DASH = lowMask("-"); |
| 2633 | private static final long H_DASH = highMask("-"); |
| 2634 | |
| 2635 | // BEGIN Android-added: Allow underscore in hostname. |
| 2636 | // UNDERSCORE, for use in domainlabel and toplabel |
| 2637 | private static final long L_UNDERSCORE = lowMask("_"); |
| 2638 | private static final long H_UNDERSCORE = highMask("_"); |
| 2639 | // END Android-added: Allow underscore in hostname. |
| 2640 | |
| 2641 | // Dot, for use in hostnames |
| 2642 | private static final long L_DOT = lowMask("."); |
| 2643 | private static final long H_DOT = highMask("."); |
| 2644 | |
| 2645 | // userinfo = *( unreserved | escaped | |
| 2646 | // ";" | ":" | "&" | "=" | "+" | "$" | "," ) |
| 2647 | private static final long L_USERINFO |
| 2648 | = L_UNRESERVED | L_ESCAPED | lowMask(";:&=+$,"); |
| 2649 | private static final long H_USERINFO |
| 2650 | = H_UNRESERVED | H_ESCAPED | highMask(";:&=+$,"); |
| 2651 | |
| 2652 | // reg_name = 1*( unreserved | escaped | "$" | "," | |
| 2653 | // ";" | ":" | "@" | "&" | "=" | "+" ) |
| 2654 | private static final long L_REG_NAME |
| 2655 | = L_UNRESERVED | L_ESCAPED | lowMask("$,;:@&=+"); |
| 2656 | private static final long H_REG_NAME |
| 2657 | = H_UNRESERVED | H_ESCAPED | highMask("$,;:@&=+"); |
| 2658 | |
| 2659 | // All valid characters for server-based authorities |
| 2660 | private static final long L_SERVER |
| 2661 | = L_USERINFO | L_ALPHANUM | L_DASH | lowMask(".:@[]"); |
| 2662 | private static final long H_SERVER |
| 2663 | = H_USERINFO | H_ALPHANUM | H_DASH | highMask(".:@[]"); |
| 2664 | |
| 2665 | // Special case of server authority that represents an IPv6 address |
| 2666 | // In this case, a % does not signify an escape sequence |
| 2667 | private static final long L_SERVER_PERCENT |
| 2668 | = L_SERVER | lowMask("%"); |
| 2669 | private static final long H_SERVER_PERCENT |
| 2670 | = H_SERVER | highMask("%"); |
| 2671 | private static final long L_LEFT_BRACKET = lowMask("["); |
| 2672 | private static final long H_LEFT_BRACKET = highMask("["); |
| 2673 | |
| 2674 | // scheme = alpha *( alpha | digit | "+" | "-" | "." ) |
| 2675 | private static final long L_SCHEME = L_ALPHA | L_DIGIT | lowMask("+-."); |
| 2676 | private static final long H_SCHEME = H_ALPHA | H_DIGIT | highMask("+-."); |
| 2677 | |
| 2678 | // uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" | |
| 2679 | // "&" | "=" | "+" | "$" | "," |
| 2680 | private static final long L_URIC_NO_SLASH |
| 2681 | = L_UNRESERVED | L_ESCAPED | lowMask(";?:@&=+$,"); |
| 2682 | private static final long H_URIC_NO_SLASH |
| 2683 | = H_UNRESERVED | H_ESCAPED | highMask(";?:@&=+$,"); |
| 2684 | |
| 2685 | |
| 2686 | // -- Escaping and encoding -- |
| 2687 | |
| 2688 | private final static char[] hexDigits = { |
| 2689 | '0', '1', '2', '3', '4', '5', '6', '7', |
| 2690 | '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' |
| 2691 | }; |
| 2692 | |
| 2693 | private static void appendEscape(StringBuffer sb, byte b) { |
| 2694 | sb.append('%'); |
| 2695 | sb.append(hexDigits[(b >> 4) & 0x0f]); |
| 2696 | sb.append(hexDigits[(b >> 0) & 0x0f]); |
| 2697 | } |
| 2698 | |
| 2699 | private static void appendEncoded(StringBuffer sb, char c) { |
| 2700 | ByteBuffer bb = null; |
| 2701 | try { |
| 2702 | bb = ThreadLocalCoders.encoderFor("UTF-8") |
| 2703 | .encode(CharBuffer.wrap("" + c)); |
| 2704 | } catch (CharacterCodingException x) { |
| 2705 | assert false; |
| 2706 | } |
| 2707 | while (bb.hasRemaining()) { |
| 2708 | int b = bb.get() & 0xff; |
| 2709 | if (b >= 0x80) |
| 2710 | appendEscape(sb, (byte)b); |
| 2711 | else |
| 2712 | sb.append((char)b); |
| 2713 | } |
| 2714 | } |
| 2715 | |
| 2716 | // Quote any characters in s that are not permitted |
| 2717 | // by the given mask pair |
| 2718 | // |
| 2719 | private static String quote(String s, long lowMask, long highMask) { |
| 2720 | int n = s.length(); |
| 2721 | StringBuffer sb = null; |
| 2722 | boolean allowNonASCII = ((lowMask & L_ESCAPED) != 0); |
| 2723 | for (int i = 0; i < s.length(); i++) { |
| 2724 | char c = s.charAt(i); |
| 2725 | if (c < '\u0080') { |
| 2726 | if (!match(c, lowMask, highMask)) { |
| 2727 | if (sb == null) { |
| 2728 | sb = new StringBuffer(); |
| 2729 | sb.append(s.substring(0, i)); |
| 2730 | } |
| 2731 | appendEscape(sb, (byte)c); |
| 2732 | } else { |
| 2733 | if (sb != null) |
| 2734 | sb.append(c); |
| 2735 | } |
| 2736 | } else if (allowNonASCII |
| 2737 | && (Character.isSpaceChar(c) |
| 2738 | || Character.isISOControl(c))) { |
| 2739 | if (sb == null) { |
| 2740 | sb = new StringBuffer(); |
| 2741 | sb.append(s.substring(0, i)); |
| 2742 | } |
| 2743 | appendEncoded(sb, c); |
| 2744 | } else { |
| 2745 | if (sb != null) |
| 2746 | sb.append(c); |
| 2747 | } |
| 2748 | } |
| 2749 | return (sb == null) ? s : sb.toString(); |
| 2750 | } |
| 2751 | |
| 2752 | // Encodes all characters >= \u0080 into escaped, normalized UTF-8 octets, |
| 2753 | // assuming that s is otherwise legal |
| 2754 | // |
| 2755 | private static String encode(String s) { |
| 2756 | int n = s.length(); |
| 2757 | if (n == 0) |
| 2758 | return s; |
| 2759 | |
| 2760 | // First check whether we actually need to encode |
| 2761 | for (int i = 0;;) { |
| 2762 | if (s.charAt(i) >= '\u0080') |
| 2763 | break; |
| 2764 | if (++i >= n) |
| 2765 | return s; |
| 2766 | } |
| 2767 | |
| 2768 | String ns = Normalizer.normalize(s, Normalizer.Form.NFC); |
| 2769 | ByteBuffer bb = null; |
| 2770 | try { |
| 2771 | bb = ThreadLocalCoders.encoderFor("UTF-8") |
| 2772 | .encode(CharBuffer.wrap(ns)); |
| 2773 | } catch (CharacterCodingException x) { |
| 2774 | assert false; |
| 2775 | } |
| 2776 | |
| 2777 | StringBuffer sb = new StringBuffer(); |
| 2778 | while (bb.hasRemaining()) { |
| 2779 | int b = bb.get() & 0xff; |
| 2780 | if (b >= 0x80) |
| 2781 | appendEscape(sb, (byte)b); |
| 2782 | else |
| 2783 | sb.append((char)b); |
| 2784 | } |
| 2785 | return sb.toString(); |
| 2786 | } |
| 2787 | |
| 2788 | private static int decode(char c) { |
| 2789 | if ((c >= '0') && (c <= '9')) |
| 2790 | return c - '0'; |
| 2791 | if ((c >= 'a') && (c <= 'f')) |
| 2792 | return c - 'a' + 10; |
| 2793 | if ((c >= 'A') && (c <= 'F')) |
| 2794 | return c - 'A' + 10; |
| 2795 | assert false; |
| 2796 | return -1; |
| 2797 | } |
| 2798 | |
| 2799 | private static byte decode(char c1, char c2) { |
| 2800 | return (byte)( ((decode(c1) & 0xf) << 4) |
| 2801 | | ((decode(c2) & 0xf) << 0)); |
| 2802 | } |
| 2803 | |
| 2804 | // Evaluates all escapes in s, applying UTF-8 decoding if needed. Assumes |
| 2805 | // that escapes are well-formed syntactically, i.e., of the form %XX. If a |
| 2806 | // sequence of escaped octets is not valid UTF-8 then the erroneous octets |
| 2807 | // are replaced with '\uFFFD'. |
| 2808 | // Exception: any "%" found between "[]" is left alone. It is an IPv6 literal |
| 2809 | // with a scope_id |
| 2810 | // |
| 2811 | private static String decode(String s) { |
| 2812 | if (s == null) |
| 2813 | return s; |
| 2814 | int n = s.length(); |
| 2815 | if (n == 0) |
| 2816 | return s; |
| 2817 | if (s.indexOf('%') < 0) |
| 2818 | return s; |
| 2819 | |
| 2820 | StringBuffer sb = new StringBuffer(n); |
| 2821 | ByteBuffer bb = ByteBuffer.allocate(n); |
| 2822 | CharBuffer cb = CharBuffer.allocate(n); |
| 2823 | CharsetDecoder dec = ThreadLocalCoders.decoderFor("UTF-8") |
| 2824 | .onMalformedInput(CodingErrorAction.REPLACE) |
| 2825 | .onUnmappableCharacter(CodingErrorAction.REPLACE); |
| 2826 | |
| 2827 | // This is not horribly efficient, but it will do for now |
| 2828 | char c = s.charAt(0); |
| 2829 | boolean betweenBrackets = false; |
| 2830 | |
| 2831 | for (int i = 0; i < n;) { |
| 2832 | assert c == s.charAt(i); // Loop invariant |
| 2833 | if (c == '[') { |
| 2834 | betweenBrackets = true; |
| 2835 | } else if (betweenBrackets && c == ']') { |
| 2836 | betweenBrackets = false; |
| 2837 | } |
| 2838 | if (c != '%' || betweenBrackets) { |
| 2839 | sb.append(c); |
| 2840 | if (++i >= n) |
| 2841 | break; |
| 2842 | c = s.charAt(i); |
| 2843 | continue; |
| 2844 | } |
| 2845 | bb.clear(); |
| 2846 | int ui = i; |
| 2847 | for (;;) { |
| 2848 | assert (n - i >= 2); |
| 2849 | bb.put(decode(s.charAt(++i), s.charAt(++i))); |
| 2850 | if (++i >= n) |
| 2851 | break; |
| 2852 | c = s.charAt(i); |
| 2853 | if (c != '%') |
| 2854 | break; |
| 2855 | } |
| 2856 | bb.flip(); |
| 2857 | cb.clear(); |
| 2858 | dec.reset(); |
| 2859 | CoderResult cr = dec.decode(bb, cb, true); |
| 2860 | assert cr.isUnderflow(); |
| 2861 | cr = dec.flush(cb); |
| 2862 | assert cr.isUnderflow(); |
| 2863 | sb.append(cb.flip().toString()); |
| 2864 | } |
| 2865 | |
| 2866 | return sb.toString(); |
| 2867 | } |
| 2868 | |
| 2869 | |
| 2870 | // -- Parsing -- |
| 2871 | |
| 2872 | // For convenience we wrap the input URI string in a new instance of the |
| 2873 | // following internal class. This saves always having to pass the input |
| 2874 | // string as an argument to each internal scan/parse method. |
| 2875 | |
| 2876 | private class Parser { |
| 2877 | |
| 2878 | private String input; // URI input string |
| 2879 | private boolean requireServerAuthority = false; |
| 2880 | |
| 2881 | Parser(String s) { |
| 2882 | input = s; |
| 2883 | string = s; |
| 2884 | } |
| 2885 | |
| 2886 | // -- Methods for throwing URISyntaxException in various ways -- |
| 2887 | |
| 2888 | private void fail(String reason) throws URISyntaxException { |
| 2889 | throw new URISyntaxException(input, reason); |
| 2890 | } |
| 2891 | |
| 2892 | private void fail(String reason, int p) throws URISyntaxException { |
| 2893 | throw new URISyntaxException(input, reason, p); |
| 2894 | } |
| 2895 | |
| 2896 | private void failExpecting(String expected, int p) |
| 2897 | throws URISyntaxException |
| 2898 | { |
| 2899 | fail("Expected " + expected, p); |
| 2900 | } |
| 2901 | |
| 2902 | private void failExpecting(String expected, String prior, int p) |
| 2903 | throws URISyntaxException |
| 2904 | { |
| 2905 | fail("Expected " + expected + " following " + prior, p); |
| 2906 | } |
| 2907 | |
| 2908 | |
| 2909 | // -- Simple access to the input string -- |
| 2910 | |
| 2911 | // Return a substring of the input string |
| 2912 | // |
| 2913 | private String substring(int start, int end) { |
| 2914 | return input.substring(start, end); |
| 2915 | } |
| 2916 | |
| 2917 | // Return the char at position p, |
| 2918 | // assuming that p < input.length() |
| 2919 | // |
| 2920 | private char charAt(int p) { |
| 2921 | return input.charAt(p); |
| 2922 | } |
| 2923 | |
| 2924 | // Tells whether start < end and, if so, whether charAt(start) == c |
| 2925 | // |
| 2926 | private boolean at(int start, int end, char c) { |
| 2927 | return (start < end) && (charAt(start) == c); |
| 2928 | } |
| 2929 | |
| 2930 | // Tells whether start + s.length() <= end and, if so, |
| 2931 | // whether the chars at the start position match s exactly |
| 2932 | // |
| 2933 | private boolean at(int start, int end, String s) { |
| 2934 | int p = start; |
| 2935 | int sn = s.length(); |
| 2936 | if (sn > end - p) |
| 2937 | return false; |
| 2938 | int i = 0; |
| 2939 | while (i < sn) { |
| 2940 | if (charAt(p++) != s.charAt(i)) { |
| 2941 | break; |
| 2942 | } |
| 2943 | i++; |
| 2944 | } |
| 2945 | return (i == sn); |
| 2946 | } |
| 2947 | |
| 2948 | |
| 2949 | // -- Scanning -- |
| 2950 | |
| 2951 | // The various scan and parse methods that follow use a uniform |
| 2952 | // convention of taking the current start position and end index as |
| 2953 | // their first two arguments. The start is inclusive while the end is |
| 2954 | // exclusive, just as in the String class, i.e., a start/end pair |
| 2955 | // denotes the half-open interval [start, end) of the input string. |
| 2956 | // |
| 2957 | // These methods never proceed past the end position. They may return |
| 2958 | // -1 to indicate outright failure, but more often they simply return |
| 2959 | // the position of the first char after the last char scanned. Thus |
| 2960 | // a typical idiom is |
| 2961 | // |
| 2962 | // int p = start; |
| 2963 | // int q = scan(p, end, ...); |
| 2964 | // if (q > p) |
| 2965 | // // We scanned something |
| 2966 | // ...; |
| 2967 | // else if (q == p) |
| 2968 | // // We scanned nothing |
| 2969 | // ...; |
| 2970 | // else if (q == -1) |
| 2971 | // // Something went wrong |
| 2972 | // ...; |
| 2973 | |
| 2974 | |
| 2975 | // Scan a specific char: If the char at the given start position is |
| 2976 | // equal to c, return the index of the next char; otherwise, return the |
| 2977 | // start position. |
| 2978 | // |
| 2979 | private int scan(int start, int end, char c) { |
| 2980 | if ((start < end) && (charAt(start) == c)) |
| 2981 | return start + 1; |
| 2982 | return start; |
| 2983 | } |
| 2984 | |
| 2985 | // Scan forward from the given start position. Stop at the first char |
| 2986 | // in the err string (in which case -1 is returned), or the first char |
| 2987 | // in the stop string (in which case the index of that char is |
| 2988 | // returned), or the end position (in which case end is returned). |
| 2989 | // May return the start position if the first char is a stop char |
| 2990 | // or if start >= end. |
| 2991 | // |
| 2992 | private int scan(int start, int end, String err, String stop) { |
| 2993 | int p = start; |
| 2994 | while (p < end) { |
| 2995 | char c = charAt(p); |
| 2996 | if (err.indexOf(c) >= 0) |
| 2997 | return -1; |
| 2998 | if (stop.indexOf(c) >= 0) |
| 2999 | break; |
| 3000 | p++; |
| 3001 | } |
| 3002 | return p; |
| 3003 | } |
| 3004 | |
| 3005 | // Scan a potential escape sequence, starting at the given position, |
| 3006 | // with the given first char (i.e., charAt(start) == c). |
| 3007 | // |
| 3008 | // This method assumes that if escapes are allowed then visible |
| 3009 | // non-US-ASCII chars are also allowed. |
| 3010 | // |
| 3011 | private int scanEscape(int start, int n, char first) |
| 3012 | throws URISyntaxException |
| 3013 | { |
| 3014 | int p = start; |
| 3015 | char c = first; |
| 3016 | if (c == '%') { |
| 3017 | // Process escape pair |
| 3018 | if ((p + 3 <= n) |
| 3019 | && match(charAt(p + 1), L_HEX, H_HEX) |
| 3020 | && match(charAt(p + 2), L_HEX, H_HEX)) { |
| 3021 | return p + 3; |
| 3022 | } |
| 3023 | fail("Malformed escape pair", p); |
| 3024 | } else if ((c > 128) |
| 3025 | && !Character.isSpaceChar(c) |
| 3026 | && !Character.isISOControl(c)) { |
| 3027 | // Allow unescaped but visible non-US-ASCII chars |
| 3028 | return p + 1; |
| 3029 | } |
| 3030 | return p; |
| 3031 | } |
| 3032 | |
| 3033 | // Scan chars that match the given mask pair |
| 3034 | // |
| 3035 | private int scan(int start, int n, long lowMask, long highMask) |
| 3036 | throws URISyntaxException |
| 3037 | { |
| 3038 | int p = start; |
| 3039 | while (p < n) { |
| 3040 | char c = charAt(p); |
| 3041 | if (match(c, lowMask, highMask)) { |
| 3042 | p++; |
| 3043 | continue; |
| 3044 | } |
| 3045 | if ((lowMask & L_ESCAPED) != 0) { |
| 3046 | int q = scanEscape(p, n, c); |
| 3047 | if (q > p) { |
| 3048 | p = q; |
| 3049 | continue; |
| 3050 | } |
| 3051 | } |
| 3052 | break; |
| 3053 | } |
| 3054 | return p; |
| 3055 | } |
| 3056 | |
| 3057 | // Check that each of the chars in [start, end) matches the given mask |
| 3058 | // |
| 3059 | private void checkChars(int start, int end, |
| 3060 | long lowMask, long highMask, |
| 3061 | String what) |
| 3062 | throws URISyntaxException |
| 3063 | { |
| 3064 | int p = scan(start, end, lowMask, highMask); |
| 3065 | if (p < end) |
| 3066 | fail("Illegal character in " + what, p); |
| 3067 | } |
| 3068 | |
| 3069 | // Check that the char at position p matches the given mask |
| 3070 | // |
| 3071 | private void checkChar(int p, |
| 3072 | long lowMask, long highMask, |
| 3073 | String what) |
| 3074 | throws URISyntaxException |
| 3075 | { |
| 3076 | checkChars(p, p + 1, lowMask, highMask, what); |
| 3077 | } |
| 3078 | |
| 3079 | |
| 3080 | // -- Parsing -- |
| 3081 | |
| 3082 | // [<scheme>:]<scheme-specific-part>[#<fragment>] |
| 3083 | // |
| 3084 | void parse(boolean rsa) throws URISyntaxException { |
| 3085 | requireServerAuthority = rsa; |
| 3086 | int ssp; // Start of scheme-specific part |
| 3087 | int n = input.length(); |
| 3088 | int p = scan(0, n, "/?#", ":"); |
| 3089 | if ((p >= 0) && at(p, n, ':')) { |
| 3090 | if (p == 0) |
| 3091 | failExpecting("scheme name", 0); |
| 3092 | checkChar(0, L_ALPHA, H_ALPHA, "scheme name"); |
| 3093 | checkChars(1, p, L_SCHEME, H_SCHEME, "scheme name"); |
| 3094 | scheme = substring(0, p); |
| 3095 | p++; // Skip ':' |
| 3096 | ssp = p; |
| 3097 | if (at(p, n, '/')) { |
| 3098 | p = parseHierarchical(p, n); |
| 3099 | } else { |
| 3100 | int q = scan(p, n, "", "#"); |
| 3101 | if (q <= p) |
| 3102 | failExpecting("scheme-specific part", p); |
| 3103 | checkChars(p, q, L_URIC, H_URIC, "opaque part"); |
| 3104 | p = q; |
| 3105 | } |
| 3106 | } else { |
| 3107 | ssp = 0; |
| 3108 | p = parseHierarchical(0, n); |
| 3109 | } |
| 3110 | schemeSpecificPart = substring(ssp, p); |
| 3111 | if (at(p, n, '#')) { |
| 3112 | checkChars(p + 1, n, L_URIC, H_URIC, "fragment"); |
| 3113 | fragment = substring(p + 1, n); |
| 3114 | p = n; |
| 3115 | } |
| 3116 | if (p < n) |
| 3117 | fail("end of URI", p); |
| 3118 | } |
| 3119 | |
| 3120 | // [//authority]<path>[?<query>] |
| 3121 | // |
| 3122 | // DEVIATION from RFC2396: We allow an empty authority component as |
| 3123 | // long as it's followed by a non-empty path, query component, or |
| 3124 | // fragment component. This is so that URIs such as "file:///foo/bar" |
| 3125 | // will parse. This seems to be the intent of RFC2396, though the |
| 3126 | // grammar does not permit it. If the authority is empty then the |
| 3127 | // userInfo, host, and port components are undefined. |
| 3128 | // |
| 3129 | // DEVIATION from RFC2396: We allow empty relative paths. This seems |
| 3130 | // to be the intent of RFC2396, but the grammar does not permit it. |
| 3131 | // The primary consequence of this deviation is that "#f" parses as a |
| 3132 | // relative URI with an empty path. |
| 3133 | // |
| 3134 | private int parseHierarchical(int start, int n) |
| 3135 | throws URISyntaxException |
| 3136 | { |
| 3137 | int p = start; |
| 3138 | if (at(p, n, '/') && at(p + 1, n, '/')) { |
| 3139 | p += 2; |
| 3140 | int q = scan(p, n, "", "/?#"); |
| 3141 | if (q > p) { |
| 3142 | p = parseAuthority(p, q); |
| 3143 | } else if (q < n) { |
| 3144 | // DEVIATION: Allow empty authority prior to non-empty |
| 3145 | // path, query component or fragment identifier |
| 3146 | } else |
| 3147 | failExpecting("authority", p); |
| 3148 | } |
| 3149 | int q = scan(p, n, "", "?#"); // DEVIATION: May be empty |
| 3150 | checkChars(p, q, L_PATH, H_PATH, "path"); |
| 3151 | path = substring(p, q); |
| 3152 | p = q; |
| 3153 | if (at(p, n, '?')) { |
| 3154 | p++; |
| 3155 | q = scan(p, n, "", "#"); |
| 3156 | checkChars(p, q, L_URIC, H_URIC, "query"); |
| 3157 | query = substring(p, q); |
| 3158 | p = q; |
| 3159 | } |
| 3160 | return p; |
| 3161 | } |
| 3162 | |
| 3163 | // authority = server | reg_name |
| 3164 | // |
| 3165 | // Ambiguity: An authority that is a registry name rather than a server |
| 3166 | // might have a prefix that parses as a server. We use the fact that |
| 3167 | // the authority component is always followed by '/' or the end of the |
| 3168 | // input string to resolve this: If the complete authority did not |
| 3169 | // parse as a server then we try to parse it as a registry name. |
| 3170 | // |
| 3171 | private int parseAuthority(int start, int n) |
| 3172 | throws URISyntaxException |
| 3173 | { |
| 3174 | int p = start; |
| 3175 | int q = p; |
| 3176 | URISyntaxException ex = null; |
| 3177 | |
| 3178 | boolean serverChars; |
| 3179 | boolean regChars; |
| 3180 | |
| 3181 | if (scan(p, n, "", "]") > p) { |
| 3182 | // contains a literal IPv6 address, therefore % is allowed |
| 3183 | serverChars = (scan(p, n, L_SERVER_PERCENT, H_SERVER_PERCENT) == n); |
| 3184 | } else { |
| 3185 | serverChars = (scan(p, n, L_SERVER, H_SERVER) == n); |
| 3186 | } |
| 3187 | regChars = (scan(p, n, L_REG_NAME, H_REG_NAME) == n); |
| 3188 | |
| 3189 | if (regChars && !serverChars) { |
| 3190 | // Must be a registry-based authority |
| 3191 | authority = substring(p, n); |
| 3192 | return n; |
| 3193 | } |
| 3194 | |
| 3195 | if (serverChars) { |
| 3196 | // Might be (probably is) a server-based authority, so attempt |
| 3197 | // to parse it as such. If the attempt fails, try to treat it |
| 3198 | // as a registry-based authority. |
| 3199 | try { |
| 3200 | q = parseServer(p, n); |
| 3201 | if (q < n) |
| 3202 | failExpecting("end of authority", q); |
| 3203 | authority = substring(p, n); |
| 3204 | } catch (URISyntaxException x) { |
| 3205 | // Undo results of failed parse |
| 3206 | userInfo = null; |
| 3207 | host = null; |
| 3208 | port = -1; |
| 3209 | if (requireServerAuthority) { |
| 3210 | // If we're insisting upon a server-based authority, |
| 3211 | // then just re-throw the exception |
| 3212 | throw x; |
| 3213 | } else { |
| 3214 | // Save the exception in case it doesn't parse as a |
| 3215 | // registry either |
| 3216 | ex = x; |
| 3217 | q = p; |
| 3218 | } |
| 3219 | } |
| 3220 | } |
| 3221 | |
| 3222 | if (q < n) { |
| 3223 | if (regChars) { |
| 3224 | // Registry-based authority |
| 3225 | authority = substring(p, n); |
| 3226 | } else if (ex != null) { |
| 3227 | // Re-throw exception; it was probably due to |
| 3228 | // a malformed IPv6 address |
| 3229 | throw ex; |
| 3230 | } else { |
| 3231 | fail("Illegal character in authority", q); |
| 3232 | } |
| 3233 | } |
| 3234 | |
| 3235 | return n; |
| 3236 | } |
| 3237 | |
| 3238 | |
| 3239 | // [<userinfo>@]<host>[:<port>] |
| 3240 | // |
| 3241 | private int parseServer(int start, int n) |
| 3242 | throws URISyntaxException |
| 3243 | { |
| 3244 | int p = start; |
| 3245 | int q; |
| 3246 | |
| 3247 | // userinfo |
| 3248 | q = scan(p, n, "/?#", "@"); |
| 3249 | if ((q >= p) && at(q, n, '@')) { |
| 3250 | checkChars(p, q, L_USERINFO, H_USERINFO, "user info"); |
| 3251 | userInfo = substring(p, q); |
| 3252 | p = q + 1; // Skip '@' |
| 3253 | } |
| 3254 | |
| 3255 | // hostname, IPv4 address, or IPv6 address |
| 3256 | if (at(p, n, '[')) { |
| 3257 | // DEVIATION from RFC2396: Support IPv6 addresses, per RFC2732 |
| 3258 | p++; |
| 3259 | q = scan(p, n, "/?#", "]"); |
| 3260 | if ((q > p) && at(q, n, ']')) { |
| 3261 | // look for a "%" scope id |
| 3262 | int r = scan (p, q, "", "%"); |
| 3263 | if (r > p) { |
| 3264 | parseIPv6Reference(p, r); |
| 3265 | if (r+1 == q) { |
| 3266 | fail ("scope id expected"); |
| 3267 | } |
| 3268 | checkChars (r+1, q, L_ALPHANUM, H_ALPHANUM, |
| 3269 | "scope id"); |
| 3270 | } else { |
| 3271 | parseIPv6Reference(p, q); |
| 3272 | } |
| 3273 | host = substring(p-1, q+1); |
| 3274 | p = q + 1; |
| 3275 | } else { |
| 3276 | failExpecting("closing bracket for IPv6 address", q); |
| 3277 | } |
| 3278 | } else { |
| 3279 | q = parseIPv4Address(p, n); |
| 3280 | if (q <= p) |
| 3281 | q = parseHostname(p, n); |
| 3282 | p = q; |
| 3283 | } |
| 3284 | |
| 3285 | // port |
| 3286 | if (at(p, n, ':')) { |
| 3287 | p++; |
| 3288 | q = scan(p, n, "", "/"); |
| 3289 | if (q > p) { |
| 3290 | checkChars(p, q, L_DIGIT, H_DIGIT, "port number"); |
| 3291 | try { |
| 3292 | port = Integer.parseInt(substring(p, q)); |
| 3293 | } catch (NumberFormatException x) { |
| 3294 | fail("Malformed port number", p); |
| 3295 | } |
| 3296 | p = q; |
| 3297 | } |
| 3298 | } |
| 3299 | if (p < n) |
| 3300 | failExpecting("port number", p); |
| 3301 | |
| 3302 | return p; |
| 3303 | } |
| 3304 | |
| 3305 | // Scan a string of decimal digits whose value fits in a byte |
| 3306 | // |
| 3307 | private int scanByte(int start, int n) |
| 3308 | throws URISyntaxException |
| 3309 | { |
| 3310 | int p = start; |
| 3311 | int q = scan(p, n, L_DIGIT, H_DIGIT); |
| 3312 | if (q <= p) return q; |
| 3313 | if (Integer.parseInt(substring(p, q)) > 255) return p; |
| 3314 | return q; |
| 3315 | } |
| 3316 | |
| 3317 | // Scan an IPv4 address. |
| 3318 | // |
| 3319 | // If the strict argument is true then we require that the given |
| 3320 | // interval contain nothing besides an IPv4 address; if it is false |
| 3321 | // then we only require that it start with an IPv4 address. |
| 3322 | // |
| 3323 | // If the interval does not contain or start with (depending upon the |
| 3324 | // strict argument) a legal IPv4 address characters then we return -1 |
| 3325 | // immediately; otherwise we insist that these characters parse as a |
| 3326 | // legal IPv4 address and throw an exception on failure. |
| 3327 | // |
| 3328 | // We assume that any string of decimal digits and dots must be an IPv4 |
| 3329 | // address. It won't parse as a hostname anyway, so making that |
| 3330 | // assumption here allows more meaningful exceptions to be thrown. |
| 3331 | // |
| 3332 | private int scanIPv4Address(int start, int n, boolean strict) |
| 3333 | throws URISyntaxException |
| 3334 | { |
| 3335 | int p = start; |
| 3336 | int q; |
| 3337 | int m = scan(p, n, L_DIGIT | L_DOT, H_DIGIT | H_DOT); |
| 3338 | if ((m <= p) || (strict && (m != n))) |
| 3339 | return -1; |
| 3340 | for (;;) { |
| 3341 | // Per RFC2732: At most three digits per byte |
| 3342 | // Further constraint: Each element fits in a byte |
| 3343 | if ((q = scanByte(p, m)) <= p) break; p = q; |
| 3344 | if ((q = scan(p, m, '.')) <= p) break; p = q; |
| 3345 | if ((q = scanByte(p, m)) <= p) break; p = q; |
| 3346 | if ((q = scan(p, m, '.')) <= p) break; p = q; |
| 3347 | if ((q = scanByte(p, m)) <= p) break; p = q; |
| 3348 | if ((q = scan(p, m, '.')) <= p) break; p = q; |
| 3349 | if ((q = scanByte(p, m)) <= p) break; p = q; |
| 3350 | if (q < m) break; |
| 3351 | return q; |
| 3352 | } |
| 3353 | fail("Malformed IPv4 address", q); |
| 3354 | return -1; |
| 3355 | } |
| 3356 | |
| 3357 | // Take an IPv4 address: Throw an exception if the given interval |
| 3358 | // contains anything except an IPv4 address |
| 3359 | // |
| 3360 | private int takeIPv4Address(int start, int n, String expected) |
| 3361 | throws URISyntaxException |
| 3362 | { |
| 3363 | int p = scanIPv4Address(start, n, true); |
| 3364 | if (p <= start) |
| 3365 | failExpecting(expected, start); |
| 3366 | return p; |
| 3367 | } |
| 3368 | |
| 3369 | // Attempt to parse an IPv4 address, returning -1 on failure but |
| 3370 | // allowing the given interval to contain [:<characters>] after |
| 3371 | // the IPv4 address. |
| 3372 | // |
| 3373 | private int parseIPv4Address(int start, int n) { |
| 3374 | int p; |
| 3375 | |
| 3376 | try { |
| 3377 | p = scanIPv4Address(start, n, false); |
| 3378 | } catch (URISyntaxException x) { |
| 3379 | return -1; |
| 3380 | } catch (NumberFormatException nfe) { |
| 3381 | return -1; |
| 3382 | } |
| 3383 | |
| 3384 | if (p > start && p < n) { |
| 3385 | // IPv4 address is followed by something - check that |
| 3386 | // it's a ":" as this is the only valid character to |
| 3387 | // follow an address. |
| 3388 | if (charAt(p) != ':') { |
| 3389 | p = -1; |
| 3390 | } |
| 3391 | } |
| 3392 | |
| 3393 | if (p > start) |
| 3394 | host = substring(start, p); |
| 3395 | |
| 3396 | return p; |
| 3397 | } |
| 3398 | |
| 3399 | // Android-changed: Allow underscore in hostname. |
| 3400 | // Added "_" to the grammars for domainLabel and topLabel. |
| 3401 | // hostname = domainlabel [ "." ] | 1*( domainlabel "." ) toplabel [ "." ] |
| 3402 | // domainlabel = alphanum | alphanum *( alphanum | "-" | "_" ) alphanum |
| 3403 | // toplabel = alpha | alpha *( alphanum | "-" | "_" ) alphanum |
| 3404 | // |
| 3405 | private int parseHostname(int start, int n) |
| 3406 | throws URISyntaxException |
| 3407 | { |
| 3408 | int p = start; |
| 3409 | int q; |
| 3410 | int l = -1; // Start of last parsed label |
| 3411 | |
| 3412 | do { |
| 3413 | // Android-changed: Allow underscore in hostname. |
| 3414 | // RFC 2396 only allows alphanumeric characters and hyphens, but real, |
| 3415 | // large Internet hosts in the wild use underscore, so we have to allow it. |
| 3416 | // http://code.google.com/p/android/issues/detail?id=37577 |
| 3417 | // http://b/17579865 |
| 3418 | // http://b/18016625 |
| 3419 | // http://b/18023709 |
| 3420 | |
| 3421 | // domainlabel = alphanum [ *( alphanum | "-" | "_" ) alphanum ] |
| 3422 | q = scan(p, n, L_ALPHANUM, H_ALPHANUM); |
| 3423 | if (q <= p) |
| 3424 | break; |
| 3425 | l = p; |
| 3426 | if (q > p) { |
| 3427 | p = q; |
| 3428 | // Android-changed: Allow underscore in hostname. |
| 3429 | // q = scan(p, n, L_ALPHANUM | L_DASH, H_ALPHANUM | H_DASH); |
| 3430 | q = scan(p, n, L_ALPHANUM | L_DASH | L_UNDERSCORE, H_ALPHANUM | H_DASH | H_UNDERSCORE); |
| 3431 | if (q > p) { |
| 3432 | if (charAt(q - 1) == '-') |
| 3433 | fail("Illegal character in hostname", q - 1); |
| 3434 | p = q; |
| 3435 | } |
| 3436 | } |
| 3437 | q = scan(p, n, '.'); |
| 3438 | if (q <= p) |
| 3439 | break; |
| 3440 | p = q; |
| 3441 | } while (p < n); |
| 3442 | |
| 3443 | if ((p < n) && !at(p, n, ':')) |
| 3444 | fail("Illegal character in hostname", p); |
| 3445 | |
| 3446 | if (l < 0) |
| 3447 | failExpecting("hostname", start); |
| 3448 | |
| 3449 | // for a fully qualified hostname check that the rightmost |
| 3450 | // label starts with an alpha character. |
| 3451 | if (l > start && !match(charAt(l), L_ALPHA, H_ALPHA)) { |
| 3452 | fail("Illegal character in hostname", l); |
| 3453 | } |
| 3454 | |
| 3455 | host = substring(start, p); |
| 3456 | return p; |
| 3457 | } |
| 3458 | |
| 3459 | |
| 3460 | // IPv6 address parsing, from RFC2373: IPv6 Addressing Architecture |
| 3461 | // |
| 3462 | // Bug: The grammar in RFC2373 Appendix B does not allow addresses of |
| 3463 | // the form ::12.34.56.78, which are clearly shown in the examples |
| 3464 | // earlier in the document. Here is the original grammar: |
| 3465 | // |
| 3466 | // IPv6address = hexpart [ ":" IPv4address ] |
| 3467 | // hexpart = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ] |
| 3468 | // hexseq = hex4 *( ":" hex4) |
| 3469 | // hex4 = 1*4HEXDIG |
| 3470 | // |
| 3471 | // We therefore use the following revised grammar: |
| 3472 | // |
| 3473 | // IPv6address = hexseq [ ":" IPv4address ] |
| 3474 | // | hexseq [ "::" [ hexpost ] ] |
| 3475 | // | "::" [ hexpost ] |
| 3476 | // hexpost = hexseq | hexseq ":" IPv4address | IPv4address |
| 3477 | // hexseq = hex4 *( ":" hex4) |
| 3478 | // hex4 = 1*4HEXDIG |
| 3479 | // |
| 3480 | // This covers all and only the following cases: |
| 3481 | // |
| 3482 | // hexseq |
| 3483 | // hexseq : IPv4address |
| 3484 | // hexseq :: |
| 3485 | // hexseq :: hexseq |
| 3486 | // hexseq :: hexseq : IPv4address |
| 3487 | // hexseq :: IPv4address |
| 3488 | // :: hexseq |
| 3489 | // :: hexseq : IPv4address |
| 3490 | // :: IPv4address |
| 3491 | // :: |
| 3492 | // |
| 3493 | // Additionally we constrain the IPv6 address as follows :- |
| 3494 | // |
| 3495 | // i. IPv6 addresses without compressed zeros should contain |
| 3496 | // exactly 16 bytes. |
| 3497 | // |
| 3498 | // ii. IPv6 addresses with compressed zeros should contain |
| 3499 | // less than 16 bytes. |
| 3500 | |
| 3501 | private int ipv6byteCount = 0; |
| 3502 | |
| 3503 | private int parseIPv6Reference(int start, int n) |
| 3504 | throws URISyntaxException |
| 3505 | { |
| 3506 | int p = start; |
| 3507 | int q; |
| 3508 | boolean compressedZeros = false; |
| 3509 | |
| 3510 | q = scanHexSeq(p, n); |
| 3511 | |
| 3512 | if (q > p) { |
| 3513 | p = q; |
| 3514 | if (at(p, n, "::")) { |
| 3515 | compressedZeros = true; |
| 3516 | p = scanHexPost(p + 2, n); |
| 3517 | } else if (at(p, n, ':')) { |
| 3518 | p = takeIPv4Address(p + 1, n, "IPv4 address"); |
| 3519 | ipv6byteCount += 4; |
| 3520 | } |
| 3521 | } else if (at(p, n, "::")) { |
| 3522 | compressedZeros = true; |
| 3523 | p = scanHexPost(p + 2, n); |
| 3524 | } |
| 3525 | if (p < n) |
| 3526 | fail("Malformed IPv6 address", start); |
| 3527 | if (ipv6byteCount > 16) |
| 3528 | fail("IPv6 address too long", start); |
| 3529 | if (!compressedZeros && ipv6byteCount < 16) |
| 3530 | fail("IPv6 address too short", start); |
| 3531 | if (compressedZeros && ipv6byteCount == 16) |
| 3532 | fail("Malformed IPv6 address", start); |
| 3533 | |
| 3534 | return p; |
| 3535 | } |
| 3536 | |
| 3537 | private int scanHexPost(int start, int n) |
| 3538 | throws URISyntaxException |
| 3539 | { |
| 3540 | int p = start; |
| 3541 | int q; |
| 3542 | |
| 3543 | if (p == n) |
| 3544 | return p; |
| 3545 | |
| 3546 | q = scanHexSeq(p, n); |
| 3547 | if (q > p) { |
| 3548 | p = q; |
| 3549 | if (at(p, n, ':')) { |
| 3550 | p++; |
| 3551 | p = takeIPv4Address(p, n, "hex digits or IPv4 address"); |
| 3552 | ipv6byteCount += 4; |
| 3553 | } |
| 3554 | } else { |
| 3555 | p = takeIPv4Address(p, n, "hex digits or IPv4 address"); |
| 3556 | ipv6byteCount += 4; |
| 3557 | } |
| 3558 | return p; |
| 3559 | } |
| 3560 | |
| 3561 | // Scan a hex sequence; return -1 if one could not be scanned |
| 3562 | // |
| 3563 | private int scanHexSeq(int start, int n) |
| 3564 | throws URISyntaxException |
| 3565 | { |
| 3566 | int p = start; |
| 3567 | int q; |
| 3568 | |
| 3569 | q = scan(p, n, L_HEX, H_HEX); |
| 3570 | if (q <= p) |
| 3571 | return -1; |
| 3572 | if (at(q, n, '.')) // Beginning of IPv4 address |
| 3573 | return -1; |
| 3574 | if (q > p + 4) |
| 3575 | fail("IPv6 hexadecimal digit sequence too long", p); |
| 3576 | ipv6byteCount += 2; |
| 3577 | p = q; |
| 3578 | while (p < n) { |
| 3579 | if (!at(p, n, ':')) |
| 3580 | break; |
| 3581 | if (at(p + 1, n, ':')) |
| 3582 | break; // "::" |
| 3583 | p++; |
| 3584 | q = scan(p, n, L_HEX, H_HEX); |
| 3585 | if (q <= p) |
| 3586 | failExpecting("digits for an IPv6 address", p); |
| 3587 | if (at(q, n, '.')) { // Beginning of IPv4 address |
| 3588 | p--; |
| 3589 | break; |
| 3590 | } |
| 3591 | if (q > p + 4) |
| 3592 | fail("IPv6 hexadecimal digit sequence too long", p); |
| 3593 | ipv6byteCount += 2; |
| 3594 | p = q; |
| 3595 | } |
| 3596 | |
| 3597 | return p; |
| 3598 | } |
| 3599 | |
| 3600 | } |
| 3601 | |
| 3602 | } |