1 /* 2 * Copyright (c) 2000, 2023, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 
24 */ 25 26 package java.net; 27 28 import java.io.File; 29 import java.io.IOException; 30 import java.io.InvalidObjectException; 31 import java.io.ObjectInputStream; 32 import java.io.ObjectOutputStream; 33 import java.io.Serializable; 34 import java.nio.ByteBuffer; 35 import java.nio.CharBuffer; 36 import java.nio.charset.CharsetDecoder; 37 import java.nio.charset.CharsetEncoder; 38 import java.nio.charset.CoderResult; 39 import java.nio.charset.CodingErrorAction; 40 import java.nio.charset.CharacterCodingException; 41 import java.nio.file.Path; 42 import java.text.Normalizer; 43 import jdk.internal.access.JavaNetUriAccess; 44 import jdk.internal.access.SharedSecrets; 45 import sun.nio.cs.UTF_8; 46 47 /** 48 * Represents a Uniform Resource Identifier (URI) reference. 49 * 50 * <p> Aside from some minor deviations noted below, an instance of this 51 * class represents a URI reference as defined by 52 * <a href="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC 2396: Uniform 53 * Resource Identifiers (URI): Generic Syntax</i></a>, amended by <a 54 * href="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC 2732: Format for 55 * Literal IPv6 Addresses in URLs</i></a>. The Literal IPv6 address format 56 * also supports scope_ids. The syntax and usage of scope_ids is described 57 * <a href="Inet6Address.html#scoped">here</a>. 58 * This class provides constructors for creating URI instances from 59 * their components or by parsing their string forms, methods for accessing the 60 * various components of an instance, and methods for normalizing, resolving, 61 * and relativizing URI instances. Instances of this class are immutable. 62 * 63 * 64 * <h2> URI syntax and components </h2> 65 * 66 * At the highest level a URI reference (hereinafter simply "URI") in string 67 * form has the syntax 68 * 69 * <blockquote> 70 * [<i>scheme</i><b>{@code :}</b>]<i>scheme-specific-part</i>[<b>{@code #}</b><i>fragment</i>] 71 * </blockquote> 72 * 73 * where square brackets [...] 
delineate optional components and the characters 74 * <b>{@code :}</b> and <b>{@code #}</b> stand for themselves. 75 * 76 * <p> An <i>absolute</i> URI specifies a scheme; a URI that is not absolute is 77 * said to be <i>relative</i>. URIs are also classified according to whether 78 * they are <i>opaque</i> or <i>hierarchical</i>. 79 * 80 * <p> An <i>opaque</i> URI is an absolute URI whose scheme-specific part does 81 * not begin with a slash character ({@code '/'}). Opaque URIs are not 82 * subject to further parsing. Some examples of opaque URIs are: 83 * 84 * <blockquote><ul style="list-style-type:none"> 85 * <li>{@code mailto:java-net@www.example.com}</li> 86 * <li>{@code news:comp.lang.java}</li> 87 * <li>{@code urn:isbn:096139210x}</li> 88 * </ul></blockquote> 89 * 90 * <p> A <i>hierarchical</i> URI is either an absolute URI whose 91 * scheme-specific part begins with a slash character, or a relative URI, that 92 * is, a URI that does not specify a scheme. Some examples of hierarchical 93 * URIs are: 94 * 95 * <blockquote> 96 * {@code http://example.com/languages/java/}<br> 97 * {@code sample/a/index.html#28}<br> 98 * {@code ../../demo/b/index.html}<br> 99 * {@code file:///~/calendar} 100 * </blockquote> 101 * 102 * <p> A hierarchical URI is subject to further parsing according to the syntax 103 * 104 * <blockquote> 105 * [<i>scheme</i><b>{@code :}</b>][<b>{@code //}</b><i>authority</i>][<i>path</i>][<b>{@code ?}</b><i>query</i>][<b>{@code #}</b><i>fragment</i>] 106 * </blockquote> 107 * 108 * where the characters <b>{@code :}</b>, <b>{@code /}</b>, 109 * <b>{@code ?}</b>, and <b>{@code #}</b> stand for themselves. The 110 * scheme-specific part of a hierarchical URI consists of the characters 111 * between the scheme and fragment components. 112 * 113 * <p> The authority component of a hierarchical URI is, if specified, either 114 * <i>server-based</i> or <i>registry-based</i>. 
A server-based authority 115 * parses according to the familiar syntax 116 * 117 * <blockquote> 118 * [<i>user-info</i><b>{@code @}</b>]<i>host</i>[<b>{@code :}</b><i>port</i>] 119 * </blockquote> 120 * 121 * where the characters <b>{@code @}</b> and <b>{@code :}</b> stand for 122 * themselves. Nearly all URI schemes currently in use are server-based. An 123 * authority component that does not parse in this way is considered to be 124 * registry-based. 125 * 126 * <p> The path component of a hierarchical URI is itself said to be absolute 127 * if it begins with a slash character ({@code '/'}); otherwise it is 128 * relative. The path of a hierarchical URI that is either absolute or 129 * specifies an authority is always absolute. 130 * 131 * <p> All told, then, a URI instance has the following nine components: 132 * 133 * <table class="striped" style="margin-left:2em"> 134 * <caption style="display:none">Describes the components of a URI:scheme,scheme-specific-part,authority,user-info,host,port,path,query,fragment</caption> 135 * <thead> 136 * <tr><th scope="col">Component</th><th scope="col">Type</th></tr> 137 * </thead> 138 * <tbody style="text-align:left"> 139 * <tr><th scope="row">scheme</th><td>{@code String}</td></tr> 140 * <tr><th scope="row">scheme-specific-part</th><td>{@code String}</td></tr> 141 * <tr><th scope="row">authority</th><td>{@code String}</td></tr> 142 * <tr><th scope="row">user-info</th><td>{@code String}</td></tr> 143 * <tr><th scope="row">host</th><td>{@code String}</td></tr> 144 * <tr><th scope="row">port</th><td>{@code int}</td></tr> 145 * <tr><th scope="row">path</th><td>{@code String}</td></tr> 146 * <tr><th scope="row">query</th><td>{@code String}</td></tr> 147 * <tr><th scope="row">fragment</th><td>{@code String}</td></tr> 148 * </tbody> 149 * </table> 150 * 151 * In a given instance any particular component is either <i>undefined</i> or 152 * <i>defined</i> with a distinct value. 
Undefined string components are 153 * represented by {@code null}, while undefined integer components are 154 * represented by {@code -1}. A string component may be defined to have the 155 * empty string as its value; this is not equivalent to that component being 156 * undefined. 157 * 158 * <p> Whether a particular component is or is not defined in an instance 159 * depends upon the type of the URI being represented. An absolute URI has a 160 * scheme component. An opaque URI has a scheme, a scheme-specific part, and 161 * possibly a fragment, but has no other components. A hierarchical URI always 162 * has a path (though it may be empty) and a scheme-specific-part (which at 163 * least contains the path), and may have any of the other components. If the 164 * authority component is present and is server-based then the host component 165 * will be defined and the user-information and port components may be defined. 166 * 167 * 168 * <h3> Operations on URI instances </h3> 169 * 170 * The key operations supported by this class are those of 171 * <i>normalization</i>, <i>resolution</i>, and <i>relativization</i>. 172 * 173 * <p> <i>Normalization</i> is the process of removing unnecessary {@code "."} 174 * and {@code ".."} segments from the path component of a hierarchical URI. 175 * Each {@code "."} segment is simply removed. A {@code ".."} segment is 176 * removed only if it is preceded by a non-{@code ".."} segment. 177 * Normalization has no effect upon opaque URIs. 178 * 179 * <p> <i>Resolution</i> is the process of resolving one URI against another, 180 * <i>base</i> URI. The resulting URI is constructed from components of both 181 * URIs in the manner specified by RFC 2396, taking components from the 182 * base URI for those not specified in the original. For hierarchical URIs, 183 * the path of the original is resolved against the path of the base and then 184 * normalized. 
The result, for example, of resolving 185 * 186 * <blockquote> 187 * {@code sample/a/index.html#28} 188 * 189 * (1) 190 * </blockquote> 191 * 192 * against the base URI {@code http://example.com/languages/java/} is the result 193 * URI 194 * 195 * <blockquote> 196 * {@code http://example.com/languages/java/sample/a/index.html#28} 197 * </blockquote> 198 * 199 * Resolving the relative URI 200 * 201 * <blockquote> 202 * {@code ../../demo/b/index.html} (2) 203 * </blockquote> 204 * 205 * against this result yields, in turn, 206 * 207 * <blockquote> 208 * {@code http://example.com/languages/java/demo/b/index.html} 209 * </blockquote> 210 * 211 * Resolution of both absolute and relative URIs, and of both absolute and 212 * relative paths in the case of hierarchical URIs, is supported. Resolving 213 * the URI {@code file:///~calendar} against any other URI simply yields the 214 * original URI, since it is absolute. Resolving the relative URI (2) above 215 * against the relative base URI (1) yields the normalized, but still relative, 216 * URI 217 * 218 * <blockquote> 219 * {@code demo/b/index.html} 220 * </blockquote> 221 * 222 * <p> <i>Relativization</i>, finally, can be regarded as the inverse of resolution. 223 * Let <i>u</i> be any normalized absolute URI ending with a slash character ({@code '/'}) 224 * and <i>v</i> be any normalized relative URI not beginning with a period character ({@code '.'}) 225 * or slash character ({@code '/'}). Then, the following statement is true: 226 * 227 * <blockquote> 228 * <i>u</i>{@code .relativize(}<i>u</i>{@code .resolve(}<i>v</i>{@code )).equals(}<i>v</i>{@code )} 229 * </blockquote> 230 * 231 * Let <i>u</i> be any normalized absolute URI ending with a slash character ({@code '/'}) 232 * and <i>v</i> be any normalized absolute URI. 
Then, the following statement is true: 233 * 234 * <blockquote> 235 * <i>u</i>{@code .resolve(}<i>u</i>{@code .relativize(}<i>v</i>{@code )).equals(}<i>v</i>{@code )} 236 * </blockquote> 237 * 238 * This operation is often useful when constructing a document containing URIs 239 * that must be made relative to the base URI of the document wherever 240 * possible. For example, relativizing the URI 241 * 242 * <blockquote> 243 * {@code http://example.com/languages/java/sample/a/index.html#28} 244 * </blockquote> 245 * 246 * against the base URI 247 * 248 * <blockquote> 249 * {@code http://example.com/languages/java/} 250 * </blockquote> 251 * 252 * yields the relative URI {@code sample/a/index.html#28}. 253 * 254 * 255 * <h3> Character categories </h3> 256 * 257 * RFC 2396 specifies precisely which characters are permitted in the 258 * various components of a URI reference. The following categories, most of 259 * which are taken from that specification, are used below to describe these 260 * constraints: 261 * 262 * <table class="striped" style="margin-left:2em"> 263 * <caption style="display:none">Describes categories alpha,digit,alphanum,unreserved,punct,reserved,escaped,and other</caption> 264 * <thead> 265 * <tr><th scope="col">Category</th><th scope="col">Description</th></tr> 266 * </thead> 267 * <tbody style="text-align:left"> 268 * <tr><th scope="row" style="vertical-align:top">alpha</th> 269 * <td>The US-ASCII alphabetic characters, 270 * {@code 'A'} through {@code 'Z'} 271 * and {@code 'a'} through {@code 'z'}</td></tr> 272 * <tr><th scope="row" style="vertical-align:top">digit</th> 273 * <td>The US-ASCII decimal digit characters, 274 * {@code '0'} through {@code '9'}</td></tr> 275 * <tr><th scope="row" style="vertical-align:top">alphanum</th> 276 * <td>All <i>alpha</i> and <i>digit</i> characters</td></tr> 277 * <tr><th scope="row" style="vertical-align:top">unreserved</th> 278 * <td>All <i>alphanum</i> characters together with those in the string 279 * 
{@code "_-!.~'()*"}</td></tr> 280 * <tr><th scope="row" style="vertical-align:top">punct</th> 281 * <td>The characters in the string {@code ",;:$&+="}</td></tr> 282 * <tr><th scope="row" style="vertical-align:top">reserved</th> 283 * <td>All <i>punct</i> characters together with those in the string 284 * {@code "?/[]@"}</td></tr> 285 * <tr><th scope="row" style="vertical-align:top">escaped</th> 286 * <td>Escaped octets, that is, triplets consisting of the percent 287 * character ({@code '%'}) followed by two hexadecimal digits 288 * ({@code '0'}-{@code '9'}, {@code 'A'}-{@code 'F'}, and 289 * {@code 'a'}-{@code 'f'})</td></tr> 290 * <tr><th scope="row" style="vertical-align:top">other</th> 291 * <td>The Unicode characters that are not in the US-ASCII character set, 292 * are not control characters (according to the {@link 293 * java.lang.Character#isISOControl(char) Character.isISOControl} 294 * method), and are not space characters (according to the {@link 295 * java.lang.Character#isSpaceChar(char) Character.isSpaceChar} 296 * method) <i>(<b>Deviation from RFC 2396</b>, which is 297 * limited to US-ASCII)</i></td></tr> 298 * </tbody> 299 * </table> 300 * 301 * <p><a id="legal-chars"></a> The set of all legal URI characters consists of 302 * the <i>unreserved</i>, <i>reserved</i>, <i>escaped</i>, and <i>other</i> 303 * characters. 304 * 305 * 306 * <h3> Escaped octets, quotation, encoding, and decoding </h3> 307 * 308 * RFC 2396 allows escaped octets to appear in the user-info, path, query, and 309 * fragment components. Escaping serves two purposes in URIs: 310 * 311 * <ul> 312 * 313 * <li><p> To <i>encode</i> non-US-ASCII characters when a URI is required to 314 * conform strictly to RFC 2396 by not containing any <i>other</i> 315 * characters. </p></li> 316 * 317 * <li><p> To <i>quote</i> characters that are otherwise illegal in a 318 * component. 
The user-info, path, query, and fragment components differ 319 * slightly in terms of which characters are considered legal and illegal. 320 * </p></li> 321 * 322 * </ul> 323 * 324 * These purposes are served in this class by three related operations: 325 * 326 * <ul> 327 * 328 * <li><p><a id="encode"></a> A character is <i>encoded</i> by replacing it 329 * with the sequence of escaped octets that represent that character in the 330 * UTF-8 character set. The Euro currency symbol ({@code '\u005Cu20AC'}), 331 * for example, is encoded as {@code "%E2%82%AC"}. <i>(<b>Deviation from 332 * RFC 2396</b>, which does not specify any particular character 333 * set.)</i> </p></li> 334 * 335 * <li><p><a id="quote"></a> An illegal character is <i>quoted</i> simply by 336 * encoding it. The space character, for example, is quoted by replacing it 337 * with {@code "%20"}. UTF-8 contains US-ASCII, hence for US-ASCII 338 * characters this transformation has exactly the effect required by 339 * RFC 2396. </p></li> 340 * 341 * <li><p><a id="decode"></a> 342 * A sequence of escaped octets is <i>decoded</i> by 343 * replacing it with the sequence of characters that it represents in the 344 * UTF-8 character set. UTF-8 contains US-ASCII, hence decoding has the 345 * effect of de-quoting any quoted US-ASCII characters as well as that of 346 * decoding any encoded non-US-ASCII characters. If a <a 347 * href="../nio/charset/CharsetDecoder.html#ce">decoding error</a> occurs 348 * when decoding the escaped octets then the erroneous octets are replaced by 349 * {@code '\u005CuFFFD'}, the Unicode replacement character. 
</p></li> 350 * 351 * </ul> 352 * 353 * These operations are exposed in the constructors and methods of this class 354 * as follows: 355 * 356 * <ul> 357 * 358 * <li><p> The {@linkplain #URI(java.lang.String) single-argument 359 * constructor} requires any illegal characters in its argument to be 360 * quoted and preserves any escaped octets and <i>other</i> characters that 361 * are present. </p></li> 362 * 363 * <li><p> The {@linkplain 364 * #URI(java.lang.String,java.lang.String,java.lang.String,int,java.lang.String,java.lang.String,java.lang.String) 365 * multi-argument constructors} quote illegal characters as 366 * required by the components in which they appear. The percent character 367 * ({@code '%'}) is always quoted by these constructors. Any <i>other</i> 368 * characters are preserved. </p></li> 369 * 370 * <li><p> The {@link #getRawUserInfo() getRawUserInfo}, {@link #getRawPath() 371 * getRawPath}, {@link #getRawQuery() getRawQuery}, {@link #getRawFragment() 372 * getRawFragment}, {@link #getRawAuthority() getRawAuthority}, and {@link 373 * #getRawSchemeSpecificPart() getRawSchemeSpecificPart} methods return the 374 * values of their corresponding components in raw form, without interpreting 375 * any escaped octets. The strings returned by these methods may contain 376 * both escaped octets and <i>other</i> characters, and will not contain any 377 * illegal characters. </p></li> 378 * 379 * <li><p> The {@link #getUserInfo() getUserInfo}, {@link #getPath() 380 * getPath}, {@link #getQuery() getQuery}, {@link #getFragment() 381 * getFragment}, {@link #getAuthority() getAuthority}, and {@link 382 * #getSchemeSpecificPart() getSchemeSpecificPart} methods decode any escaped 383 * octets in their corresponding components. The strings returned by these 384 * methods may contain both <i>other</i> characters and illegal characters, 385 * and will not contain any escaped octets. 
</p></li> 386 * 387 * <li><p> The {@link #toString() toString} method returns a URI string with 388 * all necessary quotation but which may contain <i>other</i> characters. 389 * </p></li> 390 * 391 * <li><p> The {@link #toASCIIString() toASCIIString} method returns a fully 392 * quoted and encoded URI string that does not contain any <i>other</i> 393 * characters. </p></li> 394 * 395 * </ul> 396 * 397 * 398 * <h3> Identities </h3> 399 * 400 * For any URI <i>u</i>, it is always the case that 401 * 402 * <blockquote> 403 * {@code new URI(}<i>u</i>{@code .toString()).equals(}<i>u</i>{@code )} . 404 * </blockquote> 405 * 406 * For any URI <i>u</i> that does not contain redundant syntax such as two 407 * slashes before an empty authority (as in {@code file:///tmp/} ) or a 408 * colon following a host name but no port (as in 409 * {@code http://www.example.com:} ), and that does not encode characters 410 * except those that must be quoted, the following identities also hold: 411 * <pre> 412 * new URI(<i>u</i>.getScheme(), 413 * <i>u</i>.getSchemeSpecificPart(), 414 * <i>u</i>.getFragment()) 415 * .equals(<i>u</i>)</pre> 416 * in all cases, 417 * <pre> 418 * new URI(<i>u</i>.getScheme(), 419 * <i>u</i>.getAuthority(), 420 * <i>u</i>.getPath(), <i>u</i>.getQuery(), 421 * <i>u</i>.getFragment()) 422 * .equals(<i>u</i>)</pre> 423 * if <i>u</i> is hierarchical, and 424 * <pre> 425 * new URI(<i>u</i>.getScheme(), 426 * <i>u</i>.getUserInfo(), <i>u</i>.getHost(), <i>u</i>.getPort(), 427 * <i>u</i>.getPath(), <i>u</i>.getQuery(), 428 * <i>u</i>.getFragment()) 429 * .equals(<i>u</i>)</pre> 430 * if <i>u</i> is hierarchical and has either no authority or a server-based 431 * authority. 432 * 433 * 434 * <h3> URIs, URLs, and URNs </h3> 435 * 436 * A URI is a uniform resource <i>identifier</i> while a URL is a uniform 437 * resource <i>locator</i>. Hence every URL is a URI, abstractly speaking, but 438 * not every URI is a URL. 
This is because there is another subcategory of 439 * URIs, uniform resource <i>names</i> (URNs), which name resources but do not 440 * specify how to locate them. The {@code mailto}, {@code news}, and 441 * {@code isbn} URIs shown above are examples of URNs. 442 * 443 * <p> The conceptual distinction between URIs and URLs is reflected in the 444 * differences between this class and the {@link URL} class. 445 * 446 * <p> An instance of this class represents a URI reference in the syntactic 447 * sense defined by RFC 2396. A URI may be either absolute or relative. 448 * A URI string is parsed according to the generic syntax without regard to the 449 * scheme, if any, that it specifies. No lookup of the host, if any, is 450 * performed, and no scheme-dependent stream handler is constructed. Equality, 451 * hashing, and comparison are defined strictly in terms of the character 452 * content of the instance. In other words, a URI instance is little more than 453 * a structured string that supports the syntactic, scheme-independent 454 * operations of comparison, normalization, resolution, and relativization. 455 * 456 * <p> An instance of the {@link URL} class, by contrast, represents the 457 * syntactic components of a URL together with some of the information required 458 * to access the resource that it describes. A URL must be absolute, that is, 459 * it must always specify a scheme. A URL string is parsed according to its 460 * scheme. A stream handler is always established for a URL, and in fact it is 461 * impossible to create a URL instance for a scheme for which no handler is 462 * available. Equality and hashing depend upon both the scheme and the 463 * Internet address of the host, if any; comparison is not defined. In other 464 * words, a URL is a structured string that supports the syntactic operation of 465 * resolution as well as the network I/O operations of looking up the host and 466 * opening a connection to the specified resource. 
467 * 468 * @apiNote 469 * 470 * Applications working with file paths and file URIs should take great 471 * care to use the appropriate methods to convert between the two. 472 * The {@link Path#of(URI)} factory method and the {@link File#File(URI)} 473 * constructor can be used to create {@link Path} or {@link File} 474 * objects from a file URI. {@link Path#toUri()} and {@link File#toURI()} 475 * can be used to create a {@link URI} from a file path. 476 * Applications should never try to {@linkplain 477 * #URI(String, String, String, int, String, String, String) 478 * construct}, {@linkplain #URI(String) parse}, or 479 * {@linkplain #resolve(String) resolve} a {@code URI} 480 * from the direct string representation of a {@code File} or {@code Path} 481 * instance. 482 * <p> 483 * Some components of a URL or URI, such as <i>userinfo</i>, may 484 * be abused to construct misleading URLs or URIs. Applications 485 * that deal with URLs or URIs should take into account 486 * the recommendations advised in <a 487 * href="https://tools.ietf.org/html/rfc3986#section-7">RFC3986, 488 * Section 7, Security Considerations</a>. 
489 * 490 * @author Mark Reinhold 491 * @since 1.4 492 * 493 * @spec https://www.rfc-editor.org/info/rfc2279 494 * RFC 2279: UTF-8, a transformation format of ISO 10646 495 * @spec https://www.rfc-editor.org/info/rfc2373 496 * RFC 2373: IP Version 6 Addressing Architecture 497 * @spec https://www.rfc-editor.org/info/rfc2396 498 * RFC 2396: Uniform Resource Identifiers (URI): Generic Syntax 499 * @spec https://www.rfc-editor.org/info/rfc2732 500 * RFC 2732: Format for Literal IPv6 Addresses in URL's 501 * @spec https://www.rfc-editor.org/info/rfc3986 502 * RFC 3986: Uniform Resource Identifier (URI): Generic Syntax 503 * 504 * @see <a href="http://www.ietf.org/rfc/rfc2279.txt"><i>RFC 2279: UTF-8, a 505 * transformation format of ISO 10646</i></a> 506 * @see <a href="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC 2373: IPv6 Addressing 507 * Architecture</i></a> 508 * @see <a href="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC 2396: Uniform 509 * Resource Identifiers (URI): Generic Syntax</i></a> 510 * @see <a href="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC 2732: Format for 511 * Literal IPv6 Addresses in URLs</i></a> 512 * @see <a href="URISyntaxException.html">URISyntaxException</a> 513 */ 514 515 public final class URI 516 implements Comparable<URI>, Serializable 517 { 518 519 // Note: Comments containing the word "ASSERT" indicate places where a 520 // throw of an InternalError should be replaced by an appropriate assertion 521 // statement once asserts are enabled in the build. 
// Explicit serialization version identifier for this class.
@java.io.Serial
static final long serialVersionUID = -6052424284110960213L;


// -- Properties and components of this instance --
//
// Every component field below is declared transient; the volatile
// "string" field further down is the only field that is not.

// Components of all URIs: [<scheme>:]<scheme-specific-part>[#<fragment>]
private transient String scheme;            // null ==> relative URI
private transient String fragment;

// Hierarchical URI components: [//<authority>]<path>[?<query>]
private transient String authority;         // Registry or server

// Server-based authority: [<userInfo>@]<host>[:<port>]
private transient String userInfo;
private transient String host;              // null ==> registry-based
private transient int port = -1;            // -1 ==> undefined

// Remaining components of hierarchical URIs
private transient String path;              // null ==> opaque
private transient String query;

// The remaining fields may be computed on demand, which is safe even in
// the face of multiple threads racing to initialize them
private transient String schemeSpecificPart;
private transient int hash;                 // Zero ==> undefined

// NOTE(review): judging by their names these hold the decoded forms of the
// raw components above, presumably filled in on demand by the matching
// accessors -- confirm against the accessor code, which is not shown here.
private transient String decodedUserInfo;
private transient String decodedAuthority;
private transient String decodedPath;
private transient String decodedQuery;
private transient String decodedFragment;
private transient String decodedSchemeSpecificPart;

/**
 * The string form of this URI.
 *
 * @serial
 */
private volatile String string;             // The only serializable field



// -- Constructors and factories --

// Creates an instance with every component left at its field default
// (null references, port == -1, hash == 0).
private URI() { }                           // Used internally

/**
 * Constructs a URI by parsing the given string.
*
 * <p> This constructor parses the given string exactly as specified by the
 * grammar in <a
 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>,
 * Appendix A, <b><i>except for the following deviations:</i></b> </p>
 *
 * <ul>
 *
 *   <li><p> An empty authority component is permitted as long as it is
 *   followed by a non-empty path, a query component, or a fragment
 *   component. This allows the parsing of URIs such as
 *   {@code "file:///foo/bar"}, which seems to be the intent of
 *   RFC 2396 although the grammar does not permit it. If the
 *   authority component is empty then the user-information, host, and port
 *   components are undefined. </p></li>
 *
 *   <li><p> Empty relative paths are permitted; this seems to be the
 *   intent of RFC 2396 although the grammar does not permit it. The
 *   primary consequence of this deviation is that a standalone fragment
 *   such as {@code "#foo"} parses as a relative URI with an empty path
 *   and the given fragment, and can be usefully <a
 *   href="#resolve-frag">resolved</a> against a base URI. </p></li>
 *
 *   <li><p> IPv4 addresses in host components are parsed rigorously, as
 *   specified by <a
 *   href="http://www.ietf.org/rfc/rfc2732.txt">RFC 2732</a>: Each
 *   element of a dotted-quad address must contain no more than three
 *   decimal digits. Each element is further constrained to have a value
 *   no greater than 255. </p></li>
 *
 *   <li> <p> Hostnames in host components that comprise only a single
 *   domain label are permitted to start with an <i>alphanum</i>
 *   character. This seems to be the intent of <a
 *   href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>
 *   section 3.2.2 although the grammar does not permit it. The
 *   consequence of this deviation is that the authority component of a
 *   hierarchical URI such as {@code s://123} will parse as a server-based
 *   authority.
</p></li> 609 * 610 * <li><p> IPv6 addresses are permitted for the host component. An IPv6 611 * address must be enclosed in square brackets ({@code '['} and 612 * {@code ']'}) as specified by <a 613 * href="http://www.ietf.org/rfc/rfc2732.txt">RFC 2732</a>. The 614 * IPv6 address itself must parse according to <a 615 * href="http://www.ietf.org/rfc/rfc2373.txt">RFC 2373</a>. IPv6 616 * addresses are further constrained to describe no more than sixteen 617 * bytes of address information, a constraint implicit in RFC 2373 618 * but not expressible in the grammar. </p></li> 619 * 620 * <li><p> Characters in the <i>other</i> category are permitted wherever 621 * RFC 2396 permits <i>escaped</i> octets, that is, in the 622 * user-information, path, query, and fragment components, as well as in 623 * the authority component if the authority is registry-based. This 624 * allows URIs to contain Unicode characters beyond those in the US-ASCII 625 * character set. </p></li> 626 * 627 * </ul> 628 * 629 * @param str The string to be parsed into a URI 630 * 631 * @throws NullPointerException 632 * If {@code str} is {@code null} 633 * 634 * @throws URISyntaxException 635 * If the given string violates RFC 2396, as augmented 636 * by the above deviations 637 * @spec https://www.rfc-editor.org/info/rfc2373 638 * RFC 2373: IP Version 6 Addressing Architecture 639 * @spec https://www.rfc-editor.org/info/rfc2396 640 * RFC 2396: Uniform Resource Identifiers (URI): Generic Syntax 641 * @spec https://www.rfc-editor.org/info/rfc2732 642 * RFC 2732: Format for Literal IPv6 Addresses in URL's 643 */ 644 public URI(String str) throws URISyntaxException { 645 new Parser(str).parse(false); 646 } 647 648 /** 649 * Constructs a hierarchical URI from the given components. 650 * 651 * <p> If a scheme is given then the path, if also given, must either be 652 * empty or begin with a slash character ({@code '/'}). 
Otherwise a 653 * component of the new URI may be left undefined by passing {@code null} 654 * for the corresponding parameter or, in the case of the {@code port} 655 * parameter, by passing {@code -1}. 656 * 657 * <p> This constructor first builds a URI string from the given components 658 * according to the rules specified in <a 659 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, 660 * section 5.2, step 7: </p> 661 * 662 * <ol> 663 * 664 * <li><p> Initially, the result string is empty. </p></li> 665 * 666 * <li><p> If a scheme is given then it is appended to the result, 667 * followed by a colon character ({@code ':'}). </p></li> 668 * 669 * <li><p> If user information, a host, or a port are given then the 670 * string {@code "//"} is appended. </p></li> 671 * 672 * <li><p> If user information is given then it is appended, followed by 673 * a commercial-at character ({@code '@'}). Any character not in the 674 * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i> 675 * categories is <a href="#quote">quoted</a>. </p></li> 676 * 677 * <li><p> If a host is given then it is appended. If the host is a 678 * literal IPv6 address but is not enclosed in square brackets 679 * ({@code '['} and {@code ']'}) then the square brackets are added. 680 * </p></li> 681 * 682 * <li><p> If a port number is given then a colon character 683 * ({@code ':'}) is appended, followed by the port number in decimal. 684 * </p></li> 685 * 686 * <li><p> If a path is given then it is appended. Any character not in 687 * the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i> 688 * categories, and not equal to the slash character ({@code '/'}) or the 689 * commercial-at character ({@code '@'}), is quoted. </p></li> 690 * 691 * <li><p> If a query is given then a question-mark character 692 * ({@code '?'}) is appended, followed by the query. Any character that 693 * is not a <a href="#legal-chars">legal URI character</a> is quoted. 
694 * </p></li> 695 * 696 * <li><p> Finally, if a fragment is given then a hash character 697 * ({@code '#'}) is appended, followed by the fragment. Any character 698 * that is not a legal URI character is quoted. </p></li> 699 * 700 * </ol> 701 * 702 * <p> The resulting URI string is then parsed as if by invoking the {@link 703 * #URI(String)} constructor and then invoking the {@link 704 * #parseServerAuthority()} method upon the result; this may cause a {@link 705 * URISyntaxException} to be thrown. </p> 706 * 707 * @param scheme Scheme name 708 * @param userInfo User name and authorization information 709 * @param host Host name 710 * @param port Port number 711 * @param path Path 712 * @param query Query 713 * @param fragment Fragment 714 * 715 * @throws URISyntaxException 716 * If both a scheme and a path are given but the path is relative, 717 * if the URI string constructed from the given components violates 718 * RFC 2396, or if the authority component of the string is 719 * present but cannot be parsed as a server-based authority 720 * @spec https://www.rfc-editor.org/info/rfc2396 721 * RFC 2396: Uniform Resource Identifiers (URI): Generic Syntax 722 */ 723 public URI(String scheme, 724 String userInfo, String host, int port, 725 String path, String query, String fragment) 726 throws URISyntaxException 727 { 728 String s = toString(scheme, null, 729 null, userInfo, host, port, 730 path, query, fragment); 731 checkPath(s, scheme, path); 732 new Parser(s).parse(true); 733 } 734 735 /** 736 * Constructs a hierarchical URI from the given components. 737 * 738 * <p> If a scheme is given then the path, if also given, must either be 739 * empty or begin with a slash character ({@code '/'}). Otherwise a 740 * component of the new URI may be left undefined by passing {@code null} 741 * for the corresponding parameter. 
742 * 743 * <p> This constructor first builds a URI string from the given components 744 * according to the rules specified in <a 745 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, 746 * section 5.2, step 7: </p> 747 * 748 * <ol> 749 * 750 * <li><p> Initially, the result string is empty. </p></li> 751 * 752 * <li><p> If a scheme is given then it is appended to the result, 753 * followed by a colon character ({@code ':'}). </p></li> 754 * 755 * <li><p> If an authority is given then the string {@code "//"} is 756 * appended, followed by the authority. If the authority contains a 757 * literal IPv6 address then the address must be enclosed in square 758 * brackets ({@code '['} and {@code ']'}). Any character not in the 759 * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i> 760 * categories, and not equal to the commercial-at character 761 * ({@code '@'}), is <a href="#quote">quoted</a>. </p></li> 762 * 763 * <li><p> If a path is given then it is appended. Any character not in 764 * the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i> 765 * categories, and not equal to the slash character ({@code '/'}) or the 766 * commercial-at character ({@code '@'}), is quoted. </p></li> 767 * 768 * <li><p> If a query is given then a question-mark character 769 * ({@code '?'}) is appended, followed by the query. Any character that 770 * is not a <a href="#legal-chars">legal URI character</a> is quoted. 771 * </p></li> 772 * 773 * <li><p> Finally, if a fragment is given then a hash character 774 * ({@code '#'}) is appended, followed by the fragment. Any character 775 * that is not a legal URI character is quoted. </p></li> 776 * 777 * </ol> 778 * 779 * <p> The resulting URI string is then parsed as if by invoking the {@link 780 * #URI(String)} constructor and then invoking the {@link 781 * #parseServerAuthority()} method upon the result; this may cause a {@link 782 * URISyntaxException} to be thrown. 
</p> 783 * 784 * @param scheme Scheme name 785 * @param authority Authority 786 * @param path Path 787 * @param query Query 788 * @param fragment Fragment 789 * 790 * @throws URISyntaxException 791 * If both a scheme and a path are given but the path is relative, 792 * if the URI string constructed from the given components violates 793 * RFC 2396, or if the authority component of the string is 794 * present but cannot be parsed as a server-based authority 795 * @spec https://www.rfc-editor.org/info/rfc2396 796 * RFC 2396: Uniform Resource Identifiers (URI): Generic Syntax 797 */ 798 public URI(String scheme, 799 String authority, 800 String path, String query, String fragment) 801 throws URISyntaxException 802 { 803 String s = toString(scheme, null, 804 authority, null, null, -1, 805 path, query, fragment); 806 checkPath(s, scheme, path); 807 new Parser(s).parse(false); 808 } 809 810 /** 811 * Constructs a hierarchical URI from the given components. 812 * 813 * <p> A component may be left undefined by passing {@code null}. 814 * 815 * <p> This convenience constructor works as if by invoking the 816 * seven-argument constructor as follows: 817 * 818 * <blockquote> 819 * {@code new} {@link #URI(String, String, String, int, String, String, String) 820 * URI}{@code (scheme, null, host, -1, path, null, fragment);} 821 * </blockquote> 822 * 823 * @param scheme Scheme name 824 * @param host Host name 825 * @param path Path 826 * @param fragment Fragment 827 * 828 * @throws URISyntaxException 829 * If the URI string constructed from the given components 830 * violates RFC 2396 831 */ 832 public URI(String scheme, String host, String path, String fragment) 833 throws URISyntaxException 834 { 835 this(scheme, null, host, -1, path, null, fragment); 836 } 837 838 /** 839 * Constructs a URI from the given components. 840 * 841 * <p> A component may be left undefined by passing {@code null}. 
842 * 843 * <p> This constructor first builds a URI in string form using the given 844 * components as follows: </p> 845 * 846 * <ol> 847 * 848 * <li><p> Initially, the result string is empty. </p></li> 849 * 850 * <li><p> If a scheme is given then it is appended to the result, 851 * followed by a colon character ({@code ':'}). </p></li> 852 * 853 * <li><p> If a scheme-specific part is given then it is appended. Any 854 * character that is not a <a href="#legal-chars">legal URI character</a> 855 * is <a href="#quote">quoted</a>. </p></li> 856 * 857 * <li><p> Finally, if a fragment is given then a hash character 858 * ({@code '#'}) is appended to the string, followed by the fragment. 859 * Any character that is not a legal URI character is quoted. </p></li> 860 * 861 * </ol> 862 * 863 * <p> The resulting URI string is then parsed in order to create the new 864 * URI instance as if by invoking the {@link #URI(String)} constructor; 865 * this may cause a {@link URISyntaxException} to be thrown. </p> 866 * 867 * @param scheme Scheme name 868 * @param ssp Scheme-specific part 869 * @param fragment Fragment 870 * 871 * @throws URISyntaxException 872 * If the URI string constructed from the given components 873 * violates RFC 2396 874 */ 875 public URI(String scheme, String ssp, String fragment) 876 throws URISyntaxException 877 { 878 new Parser(toString(scheme, ssp, 879 null, null, null, -1, 880 null, null, fragment)) 881 .parse(false); 882 } 883 884 /** 885 * Constructs a simple URI consisting of only a scheme and a pre-validated 886 * path. Provides a fast-path for some internal cases. 
887 */ 888 URI(String scheme, String path) { 889 assert validSchemeAndPath(scheme, path); 890 this.scheme = scheme; 891 this.path = path; 892 } 893 894 private static boolean validSchemeAndPath(String scheme, String path) { 895 try { 896 URI u = new URI(scheme + ':' + path); 897 return scheme.equals(u.scheme) && path.equals(u.path); 898 } catch (URISyntaxException e) { 899 return false; 900 } 901 } 902 903 /** 904 * Creates a URI by parsing the given string. 905 * 906 * <p> This convenience factory method works as if by invoking the {@link 907 * #URI(String)} constructor; any {@link URISyntaxException} thrown by the 908 * constructor is caught and wrapped in a new {@link 909 * IllegalArgumentException} object, which is then thrown. 910 * 911 * <p> This method is provided for use in situations where it is known that 912 * the given string is a legal URI, for example for URI constants declared 913 * within a program, and so it would be considered a programming error 914 * for the string not to parse as such. The constructors, which throw 915 * {@link URISyntaxException} directly, should be used in situations where a 916 * URI is being constructed from user input or from some other source that 917 * may be prone to errors. </p> 918 * 919 * @param str The string to be parsed into a URI 920 * @return The new URI 921 * 922 * @throws NullPointerException 923 * If {@code str} is {@code null} 924 * 925 * @throws IllegalArgumentException 926 * If the given string violates RFC 2396 927 */ 928 public static URI create(String str) { 929 try { 930 return new URI(str); 931 } catch (URISyntaxException x) { 932 throw new IllegalArgumentException(x.getMessage(), x); 933 } 934 } 935 936 937 // -- Operations -- 938 939 /** 940 * Attempts to parse this URI's authority component, if defined, into 941 * user-information, host, and port components. 
942 * 943 * <p> If this URI's authority component has already been recognized as 944 * being server-based then it will already have been parsed into 945 * user-information, host, and port components. In this case, or if this 946 * URI has no authority component, this method simply returns this URI. 947 * 948 * <p> Otherwise this method attempts once more to parse the authority 949 * component into user-information, host, and port components, and throws 950 * an exception describing why the authority component could not be parsed 951 * in that way. 952 * 953 * <p> This method is provided because the generic URI syntax specified in 954 * <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a> 955 * cannot always distinguish a malformed server-based authority from a 956 * legitimate registry-based authority. It must therefore treat some 957 * instances of the former as instances of the latter. The authority 958 * component in the URI string {@code "//foo:bar"}, for example, is not a 959 * legal server-based authority but it is legal as a registry-based 960 * authority. 961 * 962 * <p> In many common situations, for example when working URIs that are 963 * known to be either URNs or URLs, the hierarchical URIs being used will 964 * always be server-based. They therefore must either be parsed as such or 965 * treated as an error. In these cases a statement such as 966 * 967 * <blockquote> 968 * {@code URI }<i>u</i>{@code = new URI(str).parseServerAuthority();} 969 * </blockquote> 970 * 971 * <p> can be used to ensure that <i>u</i> always refers to a URI that, if 972 * it has an authority component, has a server-based authority with proper 973 * user-information, host, and port components. Invoking this method also 974 * ensures that if the authority could not be parsed in that way then an 975 * appropriate diagnostic message can be issued based upon the exception 976 * that is thrown. 
</p> 977 * 978 * @return A URI whose authority field has been parsed 979 * as a server-based authority 980 * 981 * @throws URISyntaxException 982 * If the authority component of this URI is defined 983 * but cannot be parsed as a server-based authority 984 * according to RFC 2396 985 * 986 * @spec https://www.rfc-editor.org/info/rfc2396 987 * RFC 2396: Uniform Resource Identifiers (URI): Generic Syntax 988 */ 989 public URI parseServerAuthority() 990 throws URISyntaxException 991 { 992 // We could be clever and cache the error message and index from the 993 // exception thrown during the original parse, but that would require 994 // either more fields or a more-obscure representation. 995 if ((host != null) || (authority == null)) 996 return this; 997 new Parser(toString()).parse(true); 998 return this; 999 } 1000 1001 /** 1002 * Normalizes this URI's path. 1003 * 1004 * <p> If this URI is opaque, or if its path is already in normal form, 1005 * then this URI is returned. Otherwise a new URI is constructed that is 1006 * identical to this URI except that its path is computed by normalizing 1007 * this URI's path in a manner consistent with <a 1008 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, 1009 * section 5.2, step 6, sub-steps c through f; that is: 1010 * </p> 1011 * 1012 * <ol> 1013 * 1014 * <li><p> All {@code "."} segments are removed. </p></li> 1015 * 1016 * <li><p> If a {@code ".."} segment is preceded by a non-{@code ".."} 1017 * segment then both of these segments are removed. This step is 1018 * repeated until it is no longer applicable. </p></li> 1019 * 1020 * <li><p> If the path is relative, and if its first segment contains a 1021 * colon character ({@code ':'}), then a {@code "."} segment is 1022 * prepended. This prevents a relative URI with a path such as 1023 * {@code "a:b/c/d"} from later being re-parsed as an opaque URI with a 1024 * scheme of {@code "a"} and a scheme-specific part of {@code "b/c/d"}. 
* <b><i>(Deviation from RFC 2396)</i></b> </p></li>
     *
     * </ol>
     *
     * <p> A normalized path will begin with one or more {@code ".."} segments
     * if there were insufficient non-{@code ".."} segments preceding them to
     * allow their removal. A normalized path will begin with a {@code "."}
     * segment if one was inserted by step 3 above. Otherwise, a normalized
     * path will not contain any {@code "."} or {@code ".."} segments. </p>
     *
     * @return  A URI equivalent to this URI,
     *          but whose path is in normal form
     * @spec https://www.rfc-editor.org/info/rfc2396
     *      RFC 2396: Uniform Resource Identifiers (URI): Generic Syntax
     */
    public URI normalize() {
        // Delegates to the one-argument normalize(URI) overload (defined
        // outside this section), passing this instance.
        return normalize(this);
    }

    /**
     * Resolves the given URI against this URI.
     *
     * <p> If the given URI is already absolute, or if this URI is opaque, then
     * the given URI is returned.
     *
     * <p><a id="resolve-frag"></a> If the given URI's fragment component is
     * defined, its path component is empty, and its scheme, authority, and
     * query components are undefined, then a URI with the given fragment but
     * with all other components equal to those of this URI is returned. This
     * allows a URI representing a standalone fragment reference, such as
     * {@code "#foo"}, to be usefully resolved against a base URI.
     *
     * <p> Otherwise this method constructs a new hierarchical URI in a manner
     * consistent with <a
     * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>,
     * section 5.2; that is: </p>
     *
     * <ol>
     *
     * <li><p> A new URI is constructed with this URI's scheme and the given
     * URI's query and fragment components. </p></li>
     *
     * <li><p> If the given URI has an authority component then the new URI's
     * authority and path are taken from the given URI.
</p></li>
     *
     * <li><p> Otherwise the new URI's authority component is copied from
     * this URI, and its path is computed as follows: </p>
     *
     * <ol>
     *
     * <li><p> If the given URI's path is absolute then the new URI's path
     * is taken from the given URI. </p></li>
     *
     * <li><p> Otherwise the given URI's path is relative, and so the new
     * URI's path is computed by resolving the path of the given URI
     * against the path of this URI. This is done by concatenating all but
     * the last segment of this URI's path, if any, with the given URI's
     * path and then normalizing the result as if by invoking the {@link
     * #normalize() normalize} method. </p></li>
     *
     * </ol></li>
     *
     * </ol>
     *
     * <p> The result of this method is absolute if, and only if, either this
     * URI is absolute or the given URI is absolute. </p>
     *
     * @param  uri  The URI to be resolved against this URI
     * @return The resulting URI
     *
     * @throws  NullPointerException
     *          If {@code uri} is {@code null}
     * @spec https://www.rfc-editor.org/info/rfc2396
     *      RFC 2396: Uniform Resource Identifiers (URI): Generic Syntax
     */
    public URI resolve(URI uri) {
        // Delegates to the two-argument resolve overload (defined outside
        // this section) with this URI as the first argument.
        return resolve(this, uri);
    }

    /**
     * Constructs a new URI by parsing the given string and then resolving it
     * against this URI.
     *
     * <p> This convenience method works as if invoking it were equivalent to
     * evaluating the expression {@link #resolve(java.net.URI)
     * resolve}{@code (URI.}{@link #create(String) create}{@code (str))}.
</p> 1111 * 1112 * @param str The string to be parsed into a URI 1113 * @return The resulting URI 1114 * 1115 * @throws NullPointerException 1116 * If {@code str} is {@code null} 1117 * 1118 * @throws IllegalArgumentException 1119 * If the given string violates RFC 2396 1120 */ 1121 public URI resolve(String str) { 1122 return resolve(URI.create(str)); 1123 } 1124 1125 /** 1126 * Relativizes the given URI against this URI. 1127 * 1128 * <p> The relativization of the given URI against this URI is computed as 1129 * follows: </p> 1130 * 1131 * <ol> 1132 * 1133 * <li><p> If either this URI or the given URI are opaque, or if the 1134 * scheme and authority components of the two URIs are not identical, or 1135 * if the path of this URI is not a prefix of the path of the given URI, 1136 * then the given URI is returned. </p></li> 1137 * 1138 * <li><p> Otherwise a new relative hierarchical URI is constructed with 1139 * query and fragment components taken from the given URI and with a path 1140 * component computed by removing this URI's path from the beginning of 1141 * the given URI's path. </p></li> 1142 * 1143 * </ol> 1144 * 1145 * @param uri The URI to be relativized against this URI 1146 * @return The resulting URI 1147 * 1148 * @throws NullPointerException 1149 * If {@code uri} is {@code null} 1150 */ 1151 public URI relativize(URI uri) { 1152 return relativize(this, uri); 1153 } 1154 1155 /** 1156 * Constructs a URL from this URI. 1157 * 1158 * <p> This convenience method works as if invoking it were equivalent to 1159 * evaluating the expression {@code new URL(this.toString())} after 1160 * first checking that this URI is absolute. 
</p>
     *
     * @return  A URL constructed from this URI
     *
     * @throws  IllegalArgumentException
     *          If this URI is not absolute
     *
     * @throws  MalformedURLException
     *          If a protocol handler for the URL could not be found,
     *          or if some other error occurred while constructing the URL
     */
    public URL toURL() throws MalformedURLException {
        // Conversion is delegated to URL.of; null is passed as its second argument.
        return URL.of(this, null);
    }

    // -- Component access methods --

    /**
     * Returns the scheme component of this URI.
     *
     * <p> The scheme component of a URI, if defined, only contains characters
     * in the <i>alphanum</i> category and in the string {@code "-.+"}. A
     * scheme always starts with an <i>alpha</i> character. </p>
     *
     * <p> The scheme component of a URI cannot contain escaped octets, hence
     * this method does not perform any decoding. </p>
     *
     * @return  The scheme component of this URI,
     *          or {@code null} if the scheme is undefined
     */
    public String getScheme() {
        return scheme;
    }

    /**
     * Tells whether or not this URI is absolute.
     *
     * <p> A URI is absolute if, and only if, it has a scheme component. </p>
     *
     * @return  {@code true} if, and only if, this URI is absolute
     */
    public boolean isAbsolute() {
        return scheme != null;
    }

    /**
     * Tells whether or not this URI is opaque.
     *
     * <p> A URI is opaque if, and only if, it is absolute and its
     * scheme-specific part does not begin with a slash character ('/').
     * An opaque URI has a scheme, a scheme-specific part, and possibly
     * a fragment; all other components are undefined. </p>
     *
     * @return  {@code true} if, and only if, this URI is opaque
     */
    public boolean isOpaque() {
        // Implemented as "the path component is undefined".
        return path == null;
    }

    /**
     * Returns the raw scheme-specific part of this URI. The scheme-specific
     * part is never undefined, though it may be empty.
1222 * 1223 * <p> The scheme-specific part of a URI only contains legal URI 1224 * characters. </p> 1225 * 1226 * @return The raw scheme-specific part of this URI 1227 * (never {@code null}) 1228 */ 1229 public String getRawSchemeSpecificPart() { 1230 String part = schemeSpecificPart; 1231 if (part != null) { 1232 return part; 1233 } 1234 1235 String s = string; 1236 if (s != null) { 1237 // if string is defined, components will have been parsed 1238 int start = 0; 1239 int end = s.length(); 1240 if (scheme != null) { 1241 start = scheme.length() + 1; 1242 } 1243 if (fragment != null) { 1244 end -= fragment.length() + 1; 1245 } 1246 if (path != null && path.length() == end - start) { 1247 part = path; 1248 } else { 1249 part = s.substring(start, end); 1250 } 1251 } else { 1252 StringBuilder sb = new StringBuilder(); 1253 appendSchemeSpecificPart(sb, null, getAuthority(), getUserInfo(), 1254 host, port, getPath(), getQuery()); 1255 part = sb.toString(); 1256 } 1257 return schemeSpecificPart = part; 1258 } 1259 1260 /** 1261 * Returns the decoded scheme-specific part of this URI. 1262 * 1263 * <p> The string returned by this method is equal to that returned by the 1264 * {@link #getRawSchemeSpecificPart() getRawSchemeSpecificPart} method 1265 * except that all sequences of escaped octets are <a 1266 * href="#decode">decoded</a>. </p> 1267 * 1268 * @return The decoded scheme-specific part of this URI 1269 * (never {@code null}) 1270 */ 1271 public String getSchemeSpecificPart() { 1272 String part = decodedSchemeSpecificPart; 1273 if (part == null) { 1274 decodedSchemeSpecificPart = part = decode(getRawSchemeSpecificPart()); 1275 } 1276 return part; 1277 } 1278 1279 /** 1280 * Returns the raw authority component of this URI. 1281 * 1282 * <p> The authority component of a URI, if defined, only contains the 1283 * commercial-at character ({@code '@'}) and characters in the 1284 * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and <i>other</i> 1285 * categories. 
If the authority is server-based then it is further 1286 * constrained to have valid user-information, host, and port 1287 * components. </p> 1288 * 1289 * @return The raw authority component of this URI, 1290 * or {@code null} if the authority is undefined 1291 */ 1292 public String getRawAuthority() { 1293 return authority; 1294 } 1295 1296 /** 1297 * Returns the decoded authority component of this URI. 1298 * 1299 * <p> The string returned by this method is equal to that returned by the 1300 * {@link #getRawAuthority() getRawAuthority} method except that all 1301 * sequences of escaped octets are <a href="#decode">decoded</a>. </p> 1302 * 1303 * @return The decoded authority component of this URI, 1304 * or {@code null} if the authority is undefined 1305 */ 1306 public String getAuthority() { 1307 String auth = decodedAuthority; 1308 if ((auth == null) && (authority != null)) { 1309 decodedAuthority = auth = decode(authority); 1310 } 1311 return auth; 1312 } 1313 1314 /** 1315 * Returns the raw user-information component of this URI. 1316 * 1317 * <p> The user-information component of a URI, if defined, only contains 1318 * characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and 1319 * <i>other</i> categories. </p> 1320 * 1321 * @return The raw user-information component of this URI, 1322 * or {@code null} if the user information is undefined 1323 */ 1324 public String getRawUserInfo() { 1325 return userInfo; 1326 } 1327 1328 /** 1329 * Returns the decoded user-information component of this URI. 1330 * 1331 * <p> The string returned by this method is equal to that returned by the 1332 * {@link #getRawUserInfo() getRawUserInfo} method except that all 1333 * sequences of escaped octets are <a href="#decode">decoded</a>. 
</p> 1334 * 1335 * @return The decoded user-information component of this URI, 1336 * or {@code null} if the user information is undefined 1337 */ 1338 public String getUserInfo() { 1339 String user = decodedUserInfo; 1340 if ((user == null) && (userInfo != null)) { 1341 decodedUserInfo = user = decode(userInfo); 1342 } 1343 return user; 1344 } 1345 1346 /** 1347 * Returns the host component of this URI. 1348 * 1349 * <p> The host component of a URI, if defined, will have one of the 1350 * following forms: </p> 1351 * 1352 * <ul> 1353 * 1354 * <li><p> A domain name consisting of one or more <i>labels</i> 1355 * separated by period characters ({@code '.'}), optionally followed by 1356 * a period character. Each label consists of <i>alphanum</i> characters 1357 * as well as hyphen characters ({@code '-'}), though hyphens never 1358 * occur as the first or last characters in a label. The rightmost 1359 * label of a domain name consisting of two or more labels, begins 1360 * with an <i>alpha</i> character. </li> 1361 * 1362 * <li><p> A dotted-quad IPv4 address of the form 1363 * <i>digit</i>{@code +.}<i>digit</i>{@code +.}<i>digit</i>{@code +.}<i>digit</i>{@code +}, 1364 * where no <i>digit</i> sequence is longer than three characters and no 1365 * sequence has a value larger than 255. </p></li> 1366 * 1367 * <li><p> An IPv6 address enclosed in square brackets ({@code '['} and 1368 * {@code ']'}) and consisting of hexadecimal digits, colon characters 1369 * ({@code ':'}), and possibly an embedded IPv4 address. The full 1370 * syntax of IPv6 addresses is specified in <a 1371 * href="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC 2373: IPv6 1372 * Addressing Architecture</i></a>. </p></li> 1373 * 1374 * </ul> 1375 * 1376 * The host component of a URI cannot contain escaped octets, hence this 1377 * method does not perform any decoding. 
1378 * 1379 * @return The host component of this URI, 1380 * or {@code null} if the host is undefined 1381 * @spec https://www.rfc-editor.org/info/rfc2373 1382 * RFC 2373: IP Version 6 Addressing Architecture 1383 */ 1384 public String getHost() { 1385 return host; 1386 } 1387 1388 /** 1389 * Returns the port number of this URI. 1390 * 1391 * <p> The port component of a URI, if defined, is a non-negative 1392 * integer. </p> 1393 * 1394 * @return The port component of this URI, 1395 * or {@code -1} if the port is undefined 1396 */ 1397 public int getPort() { 1398 return port; 1399 } 1400 1401 /** 1402 * Returns the raw path component of this URI. 1403 * 1404 * <p> The path component of a URI, if defined, only contains the slash 1405 * character ({@code '/'}), the commercial-at character ({@code '@'}), 1406 * and characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, 1407 * and <i>other</i> categories. </p> 1408 * 1409 * @return The path component of this URI, 1410 * or {@code null} if the path is undefined 1411 */ 1412 public String getRawPath() { 1413 return path; 1414 } 1415 1416 /** 1417 * Returns the decoded path component of this URI. 1418 * 1419 * <p> The string returned by this method is equal to that returned by the 1420 * {@link #getRawPath() getRawPath} method except that all sequences of 1421 * escaped octets are <a href="#decode">decoded</a>. </p> 1422 * 1423 * @return The decoded path component of this URI, 1424 * or {@code null} if the path is undefined 1425 */ 1426 public String getPath() { 1427 String decoded = decodedPath; 1428 if ((decoded == null) && (path != null)) { 1429 decodedPath = decoded = decode(path); 1430 } 1431 return decoded; 1432 } 1433 1434 /** 1435 * Returns the raw query component of this URI. 1436 * 1437 * <p> The query component of a URI, if defined, only contains legal URI 1438 * characters. 
</p> 1439 * 1440 * @return The raw query component of this URI, 1441 * or {@code null} if the query is undefined 1442 */ 1443 public String getRawQuery() { 1444 return query; 1445 } 1446 1447 /** 1448 * Returns the decoded query component of this URI. 1449 * 1450 * <p> The string returned by this method is equal to that returned by the 1451 * {@link #getRawQuery() getRawQuery} method except that all sequences of 1452 * escaped octets are <a href="#decode">decoded</a>. </p> 1453 * 1454 * @return The decoded query component of this URI, 1455 * or {@code null} if the query is undefined 1456 */ 1457 public String getQuery() { 1458 String decoded = decodedQuery; 1459 if ((decoded == null) && (query != null)) { 1460 decodedQuery = decoded = decode(query, false); 1461 } 1462 return decoded; 1463 } 1464 1465 /** 1466 * Returns the raw fragment component of this URI. 1467 * 1468 * <p> The fragment component of a URI, if defined, only contains legal URI 1469 * characters. </p> 1470 * 1471 * @return The raw fragment component of this URI, 1472 * or {@code null} if the fragment is undefined 1473 */ 1474 public String getRawFragment() { 1475 return fragment; 1476 } 1477 1478 /** 1479 * Returns the decoded fragment component of this URI. 1480 * 1481 * <p> The string returned by this method is equal to that returned by the 1482 * {@link #getRawFragment() getRawFragment} method except that all 1483 * sequences of escaped octets are <a href="#decode">decoded</a>. </p> 1484 * 1485 * @return The decoded fragment component of this URI, 1486 * or {@code null} if the fragment is undefined 1487 */ 1488 public String getFragment() { 1489 String decoded = decodedFragment; 1490 if ((decoded == null) && (fragment != null)) { 1491 decodedFragment = decoded = decode(fragment, false); 1492 } 1493 return decoded; 1494 } 1495 1496 1497 // -- Equality, comparison, hash code, toString, and serialization -- 1498 1499 /** 1500 * Tests this URI for equality with another object. 
1501 * 1502 * <p> If the given object is not a URI then this method immediately 1503 * returns {@code false}. 1504 * 1505 * <p> For two URIs to be considered equal requires that either both are 1506 * opaque or both are hierarchical. Their schemes must either both be 1507 * undefined or else be equal without regard to case. Their fragments 1508 * must either both be undefined or else be equal. 1509 * 1510 * <p> For two opaque URIs to be considered equal, their scheme-specific 1511 * parts must be equal. 1512 * 1513 * <p> For two hierarchical URIs to be considered equal, their paths must 1514 * be equal and their queries must either both be undefined or else be 1515 * equal. Their authorities must either both be undefined, or both be 1516 * registry-based, or both be server-based. If their authorities are 1517 * defined and are registry-based, then they must be equal. If their 1518 * authorities are defined and are server-based, then their hosts must be 1519 * equal without regard to case, their port numbers must be equal, and 1520 * their user-information components must be equal. 1521 * 1522 * <p> When testing the user-information, path, query, fragment, authority, 1523 * or scheme-specific parts of two URIs for equality, the raw forms rather 1524 * than the encoded forms of these components are compared and the 1525 * hexadecimal digits of escaped octets are compared without regard to 1526 * case. 1527 * 1528 * <p> This method satisfies the general contract of the {@link 1529 * java.lang.Object#equals(Object) Object.equals} method. 
</p> 1530 * 1531 * @param ob The object to which this object is to be compared 1532 * 1533 * @return {@code true} if, and only if, the given object is a URI that 1534 * is identical to this URI 1535 */ 1536 public boolean equals(Object ob) { 1537 if (ob == this) 1538 return true; 1539 if (!(ob instanceof URI that)) 1540 return false; 1541 if (this.isOpaque() != that.isOpaque()) return false; 1542 if (!equalIgnoringCase(this.scheme, that.scheme)) return false; 1543 if (!equal(this.fragment, that.fragment)) return false; 1544 1545 // Opaque 1546 if (this.isOpaque()) 1547 return equal(this.schemeSpecificPart, that.schemeSpecificPart); 1548 1549 // Hierarchical 1550 if (!equal(this.path, that.path)) return false; 1551 if (!equal(this.query, that.query)) return false; 1552 1553 // Authorities 1554 if (this.authority == that.authority) return true; 1555 if (this.host != null) { 1556 // Server-based 1557 if (!equal(this.userInfo, that.userInfo)) return false; 1558 if (!equalIgnoringCase(this.host, that.host)) return false; 1559 if (this.port != that.port) return false; 1560 } else if (this.authority != null) { 1561 // Registry-based 1562 if (!equal(this.authority, that.authority)) return false; 1563 } else if (this.authority != that.authority) { 1564 return false; 1565 } 1566 1567 return true; 1568 } 1569 1570 /** 1571 * Returns a hash-code value for this URI. The hash code is based upon all 1572 * of the URI's components, and satisfies the general contract of the 1573 * {@link java.lang.Object#hashCode() Object.hashCode} method. 
1574 * 1575 * @return A hash-code value for this URI 1576 */ 1577 public int hashCode() { 1578 int h = hash; 1579 if (h == 0) { 1580 h = hashIgnoringCase(0, scheme); 1581 h = hash(h, fragment); 1582 if (isOpaque()) { 1583 h = hash(h, schemeSpecificPart); 1584 } else { 1585 h = hash(h, path); 1586 h = hash(h, query); 1587 if (host != null) { 1588 h = hash(h, userInfo); 1589 h = hashIgnoringCase(h, host); 1590 h += 1949 * port; 1591 } else { 1592 h = hash(h, authority); 1593 } 1594 } 1595 if (h != 0) { 1596 hash = h; 1597 } 1598 } 1599 return h; 1600 } 1601 1602 /** 1603 * Compares this URI to another object, which must be a URI. 1604 * 1605 * <p> When comparing corresponding components of two URIs, if one 1606 * component is undefined but the other is defined then the first is 1607 * considered to be less than the second. Unless otherwise noted, string 1608 * components are ordered according to their natural, case-sensitive 1609 * ordering as defined by the {@link java.lang.String#compareTo(String) 1610 * String.compareTo} method. String components that are subject to 1611 * encoding are compared by comparing their raw forms rather than their 1612 * encoded forms and the hexadecimal digits of escaped octets are compared 1613 * without regard to case. 1614 * 1615 * <p> The ordering of URIs is defined as follows: </p> 1616 * 1617 * <ul> 1618 * 1619 * <li><p> Two URIs with different schemes are ordered according the 1620 * ordering of their schemes, without regard to case. </p></li> 1621 * 1622 * <li><p> A hierarchical URI is considered to be less than an opaque URI 1623 * with an identical scheme. </p></li> 1624 * 1625 * <li><p> Two opaque URIs with identical schemes are ordered according 1626 * to the ordering of their scheme-specific parts. </p></li> 1627 * 1628 * <li><p> Two opaque URIs with identical schemes and scheme-specific 1629 * parts are ordered according to the ordering of their 1630 * fragments. 
</p></li> 1631 * 1632 * <li><p> Two hierarchical URIs with identical schemes are ordered 1633 * according to the ordering of their authority components: </p> 1634 * 1635 * <ul> 1636 * 1637 * <li><p> If both authority components are server-based then the URIs 1638 * are ordered according to their user-information components; if these 1639 * components are identical then the URIs are ordered according to the 1640 * ordering of their hosts, without regard to case; if the hosts are 1641 * identical then the URIs are ordered according to the ordering of 1642 * their ports. </p></li> 1643 * 1644 * <li><p> If one or both authority components are registry-based then 1645 * the URIs are ordered according to the ordering of their authority 1646 * components. </p></li> 1647 * 1648 * </ul></li> 1649 * 1650 * <li><p> Finally, two hierarchical URIs with identical schemes and 1651 * authority components are ordered according to the ordering of their 1652 * paths; if their paths are identical then they are ordered according to 1653 * the ordering of their queries; if the queries are identical then they 1654 * are ordered according to the order of their fragments. </p></li> 1655 * 1656 * </ul> 1657 * 1658 * <p> This method satisfies the general contract of the {@link 1659 * java.lang.Comparable#compareTo(Object) Comparable.compareTo} 1660 * method. 
</p> 1661 * 1662 * @param that 1663 * The object to which this URI is to be compared 1664 * 1665 * @return A negative integer, zero, or a positive integer as this URI is 1666 * less than, equal to, or greater than the given URI 1667 * 1668 * @throws ClassCastException 1669 * If the given object is not a URI 1670 */ 1671 public int compareTo(URI that) { 1672 int c; 1673 1674 if ((c = compareIgnoringCase(this.scheme, that.scheme)) != 0) 1675 return c; 1676 1677 if (this.isOpaque()) { 1678 if (that.isOpaque()) { 1679 // Both opaque 1680 if ((c = compare(this.schemeSpecificPart, 1681 that.schemeSpecificPart)) != 0) 1682 return c; 1683 return compare(this.fragment, that.fragment); 1684 } 1685 return +1; // Opaque > hierarchical 1686 } else if (that.isOpaque()) { 1687 return -1; // Hierarchical < opaque 1688 } 1689 1690 // Hierarchical 1691 if ((this.host != null) && (that.host != null)) { 1692 // Both server-based 1693 if ((c = compare(this.userInfo, that.userInfo)) != 0) 1694 return c; 1695 if ((c = compareIgnoringCase(this.host, that.host)) != 0) 1696 return c; 1697 if ((c = this.port - that.port) != 0) 1698 return c; 1699 } else { 1700 // If one or both authorities are registry-based then we simply 1701 // compare them in the usual, case-sensitive way. If one is 1702 // registry-based and one is server-based then the strings are 1703 // guaranteed to be unequal, hence the comparison will never return 1704 // zero and the compareTo and equals methods will remain 1705 // consistent. 1706 if ((c = compare(this.authority, that.authority)) != 0) return c; 1707 } 1708 1709 if ((c = compare(this.path, that.path)) != 0) return c; 1710 if ((c = compare(this.query, that.query)) != 0) return c; 1711 return compare(this.fragment, that.fragment); 1712 } 1713 1714 /** 1715 * Returns the content of this URI as a string. 
1716 * 1717 * <p> If this URI was created by invoking one of the constructors in this 1718 * class then a string equivalent to the original input string, or to the 1719 * string computed from the originally-given components, as appropriate, is 1720 * returned. Otherwise this URI was created by normalization, resolution, 1721 * or relativization, and so a string is constructed from this URI's 1722 * components according to the rules specified in <a 1723 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, 1724 * section 5.2, step 7. </p> 1725 * 1726 * @return The string form of this URI 1727 * @spec https://www.rfc-editor.org/info/rfc2396 1728 * RFC 2396: Uniform Resource Identifiers (URI): Generic Syntax 1729 */ 1730 public String toString() { 1731 String s = string; 1732 if (s == null) { 1733 s = defineString(); 1734 } 1735 return s; 1736 } 1737 1738 private String defineString() { 1739 String s = string; 1740 if (s != null) { 1741 return s; 1742 } 1743 1744 StringBuilder sb = new StringBuilder(); 1745 if (scheme != null) { 1746 sb.append(scheme); 1747 sb.append(':'); 1748 } 1749 if (isOpaque()) { 1750 sb.append(schemeSpecificPart); 1751 } else { 1752 if (host != null) { 1753 sb.append("//"); 1754 if (userInfo != null) { 1755 sb.append(userInfo); 1756 sb.append('@'); 1757 } 1758 boolean needBrackets = ((host.indexOf(':') >= 0) 1759 && !host.startsWith("[") 1760 && !host.endsWith("]")); 1761 if (needBrackets) sb.append('['); 1762 sb.append(host); 1763 if (needBrackets) sb.append(']'); 1764 if (port != -1) { 1765 sb.append(':'); 1766 sb.append(port); 1767 } 1768 } else if (authority != null) { 1769 sb.append("//"); 1770 sb.append(authority); 1771 } 1772 if (path != null) 1773 sb.append(path); 1774 if (query != null) { 1775 sb.append('?'); 1776 sb.append(query); 1777 } 1778 } 1779 if (fragment != null) { 1780 sb.append('#'); 1781 sb.append(fragment); 1782 } 1783 return string = sb.toString(); 1784 } 1785 1786 /** 1787 * Returns the content of this URI as a 
US-ASCII string. 1788 * 1789 * <p> If this URI does not contain any characters in the <i>other</i> 1790 * category then an invocation of this method will return the same value as 1791 * an invocation of the {@link #toString() toString} method. Otherwise 1792 * this method works as if by invoking that method and then <a 1793 * href="#encode">encoding</a> the result. </p> 1794 * 1795 * @return The string form of this URI, encoded as needed 1796 * so that it only contains characters in the US-ASCII 1797 * charset 1798 */ 1799 public String toASCIIString() { 1800 return encode(toString()); 1801 } 1802 1803 1804 // -- Serialization support -- 1805 1806 /** 1807 * Saves the content of this URI to the given serial stream. 1808 * 1809 * <p> The only serializable field of a URI instance is its {@code string} 1810 * field. That field is given a value, if it does not have one already, 1811 * and then the {@link java.io.ObjectOutputStream#defaultWriteObject()} 1812 * method of the given object-output stream is invoked. </p> 1813 * 1814 * @param os The object-output stream to which this object 1815 * is to be written 1816 * 1817 * @throws IOException 1818 * If an I/O error occurs 1819 */ 1820 @java.io.Serial 1821 private void writeObject(ObjectOutputStream os) 1822 throws IOException 1823 { 1824 defineString(); 1825 os.defaultWriteObject(); // Writes the string field only 1826 } 1827 1828 /** 1829 * Reconstitutes a URI from the given serial stream. 1830 * 1831 * <p> The {@link java.io.ObjectInputStream#defaultReadObject()} method is 1832 * invoked to read the value of the {@code string} field. The result is 1833 * then parsed in the usual way. 
1834 * 1835 * @param is The object-input stream from which this object 1836 * is being read 1837 * 1838 * @throws IOException 1839 * If an I/O error occurs 1840 * 1841 * @throws ClassNotFoundException 1842 * If a serialized class cannot be loaded 1843 */ 1844 @java.io.Serial 1845 private void readObject(ObjectInputStream is) 1846 throws ClassNotFoundException, IOException 1847 { 1848 port = -1; // Argh 1849 is.defaultReadObject(); 1850 try { 1851 new Parser(string).parse(false); 1852 } catch (URISyntaxException x) { 1853 IOException y = new InvalidObjectException("Invalid URI"); 1854 y.initCause(x); 1855 throw y; 1856 } 1857 } 1858 1859 1860 // -- End of public methods -- 1861 1862 1863 // -- Utility methods for string-field comparison and hashing -- 1864 1865 // These methods return appropriate values for null string arguments, 1866 // thereby simplifying the equals, hashCode, and compareTo methods. 1867 // 1868 // The case-ignoring methods should only be applied to strings whose 1869 // characters are all known to be US-ASCII. Because of this restriction, 1870 // these methods are faster than the similar methods in the String class. 
1871 1872 // US-ASCII only 1873 private static int toLower(char c) { 1874 if ((c >= 'A') && (c <= 'Z')) 1875 return c + ('a' - 'A'); 1876 return c; 1877 } 1878 1879 // US-ASCII only 1880 private static int toUpper(char c) { 1881 if ((c >= 'a') && (c <= 'z')) 1882 return c - ('a' - 'A'); 1883 return c; 1884 } 1885 1886 private static boolean equal(String s, String t) { 1887 boolean testForEquality = true; 1888 int result = percentNormalizedComparison(s, t, testForEquality); 1889 return result == 0; 1890 } 1891 1892 // US-ASCII only 1893 private static boolean equalIgnoringCase(String s, String t) { 1894 if (s == t) return true; 1895 if ((s != null) && (t != null)) { 1896 int n = s.length(); 1897 if (t.length() != n) 1898 return false; 1899 for (int i = 0; i < n; i++) { 1900 if (toLower(s.charAt(i)) != toLower(t.charAt(i))) 1901 return false; 1902 } 1903 return true; 1904 } 1905 return false; 1906 } 1907 1908 private static int hash(int hash, String s) { 1909 if (s == null) return hash; 1910 return s.indexOf('%') < 0 ? 
hash * 127 + s.hashCode() 1911 : normalizedHash(hash, s); 1912 } 1913 1914 1915 private static int normalizedHash(int hash, String s) { 1916 int h = 0; 1917 for (int index = 0; index < s.length(); index++) { 1918 char ch = s.charAt(index); 1919 h = 31 * h + ch; 1920 if (ch == '%') { 1921 /* 1922 * Process the next two encoded characters 1923 */ 1924 for (int i = index + 1; i < index + 3; i++) 1925 h = 31 * h + toUpper(s.charAt(i)); 1926 index += 2; 1927 } 1928 } 1929 return hash * 127 + h; 1930 } 1931 1932 // US-ASCII only 1933 private static int hashIgnoringCase(int hash, String s) { 1934 if (s == null) return hash; 1935 int h = hash; 1936 int n = s.length(); 1937 for (int i = 0; i < n; i++) 1938 h = 31 * h + toLower(s.charAt(i)); 1939 return h; 1940 } 1941 1942 private static int compare(String s, String t) { 1943 boolean testForEquality = false; 1944 int result = percentNormalizedComparison(s, t, testForEquality); 1945 return result; 1946 } 1947 1948 // The percentNormalizedComparison method does not verify two 1949 // characters that follow the % sign are hexadecimal digits. 1950 // Reason being: 1951 // 1) percentNormalizedComparison method is not called with 1952 // 'decoded' strings 1953 // 2) The only place where a percent can be followed by anything 1954 // other than hexadecimal digits is in the authority component 1955 // (for a IPv6 scope) and the whole authority component is case 1956 // insensitive. 
1957 private static int percentNormalizedComparison(String s, String t, 1958 boolean testForEquality) { 1959 1960 if (s == t) return 0; 1961 if (s != null) { 1962 if (t != null) { 1963 if (s.indexOf('%') < 0) { 1964 return s.compareTo(t); 1965 } 1966 int sn = s.length(); 1967 int tn = t.length(); 1968 if ((sn != tn) && testForEquality) 1969 return sn - tn; 1970 int val = 0; 1971 int n = Math.min(sn, tn); 1972 for (int i = 0; i < n; ) { 1973 char c = s.charAt(i); 1974 char d = t.charAt(i); 1975 val = c - d; 1976 if (c != '%') { 1977 if (val != 0) 1978 return val; 1979 i++; 1980 continue; 1981 } 1982 if (d != '%') { 1983 if (val != 0) 1984 return val; 1985 } 1986 i++; 1987 val = toLower(s.charAt(i)) - toLower(t.charAt(i)); 1988 if (val != 0) 1989 return val; 1990 i++; 1991 val = toLower(s.charAt(i)) - toLower(t.charAt(i)); 1992 if (val != 0) 1993 return val; 1994 i++; 1995 } 1996 return sn - tn; 1997 } else 1998 return +1; 1999 } else { 2000 return -1; 2001 } 2002 } 2003 2004 // US-ASCII only 2005 private static int compareIgnoringCase(String s, String t) { 2006 if (s == t) return 0; 2007 if (s != null) { 2008 if (t != null) { 2009 int sn = s.length(); 2010 int tn = t.length(); 2011 int n = sn < tn ? 
sn : tn; 2012 for (int i = 0; i < n; i++) { 2013 int c = toLower(s.charAt(i)) - toLower(t.charAt(i)); 2014 if (c != 0) 2015 return c; 2016 } 2017 return sn - tn; 2018 } 2019 return +1; 2020 } else { 2021 return -1; 2022 } 2023 } 2024 2025 2026 // -- String construction -- 2027 2028 // If a scheme is given then the path, if given, must be absolute 2029 // 2030 private static void checkPath(String s, String scheme, String path) 2031 throws URISyntaxException 2032 { 2033 if (scheme != null) { 2034 if (path != null && !path.isEmpty() && path.charAt(0) != '/') 2035 throw new URISyntaxException(s, "Relative path in absolute URI"); 2036 } 2037 } 2038 2039 private void appendAuthority(StringBuilder sb, 2040 String authority, 2041 String userInfo, 2042 String host, 2043 int port) 2044 { 2045 if (host != null) { 2046 sb.append("//"); 2047 if (userInfo != null) { 2048 sb.append(quote(userInfo, L_USERINFO, H_USERINFO)); 2049 sb.append('@'); 2050 } 2051 boolean needBrackets = ((host.indexOf(':') >= 0) 2052 && !host.startsWith("[") 2053 && !host.endsWith("]")); 2054 if (needBrackets) sb.append('['); 2055 sb.append(host); 2056 if (needBrackets) sb.append(']'); 2057 if (port != -1) { 2058 sb.append(':'); 2059 sb.append(port); 2060 } 2061 } else if (authority != null) { 2062 sb.append("//"); 2063 if (authority.startsWith("[")) { 2064 // authority should (but may not) contain an embedded IPv6 address 2065 int end = authority.indexOf(']'); 2066 String doquote = authority; 2067 if (end != -1 && authority.indexOf(':') != -1) { 2068 // the authority contains an IPv6 address 2069 sb.append(authority, 0, end + 1); 2070 doquote = authority.substring(end + 1); 2071 } 2072 sb.append(quote(doquote, 2073 L_REG_NAME | L_SERVER, 2074 H_REG_NAME | H_SERVER)); 2075 } else { 2076 sb.append(quote(authority, 2077 L_REG_NAME | L_SERVER, 2078 H_REG_NAME | H_SERVER)); 2079 } 2080 } 2081 } 2082 2083 private void appendSchemeSpecificPart(StringBuilder sb, 2084 String opaquePart, 2085 String authority, 
2086 String userInfo, 2087 String host, 2088 int port, 2089 String path, 2090 String query) 2091 { 2092 if (opaquePart != null) { 2093 /* check if SSP begins with an IPv6 address 2094 * because we must not quote a literal IPv6 address 2095 */ 2096 if (opaquePart.startsWith("//[")) { 2097 int end = opaquePart.indexOf(']'); 2098 if (end != -1 && opaquePart.indexOf(':')!=-1) { 2099 String doquote = opaquePart.substring(end + 1); 2100 sb.append(opaquePart, 0, end + 1); 2101 sb.append(quote(doquote, L_URIC, H_URIC)); 2102 } 2103 } else { 2104 sb.append(quote(opaquePart, L_URIC, H_URIC)); 2105 } 2106 } else { 2107 appendAuthority(sb, authority, userInfo, host, port); 2108 if (path != null) 2109 sb.append(quote(path, L_PATH, H_PATH)); 2110 if (query != null) { 2111 sb.append('?'); 2112 sb.append(quote(query, L_URIC, H_URIC)); 2113 } 2114 } 2115 } 2116 2117 private void appendFragment(StringBuilder sb, String fragment) { 2118 if (fragment != null) { 2119 sb.append('#'); 2120 sb.append(quote(fragment, L_URIC, H_URIC)); 2121 } 2122 } 2123 2124 private String toString(String scheme, 2125 String opaquePart, 2126 String authority, 2127 String userInfo, 2128 String host, 2129 int port, 2130 String path, 2131 String query, 2132 String fragment) 2133 { 2134 StringBuilder sb = new StringBuilder(); 2135 if (scheme != null) { 2136 sb.append(scheme); 2137 sb.append(':'); 2138 } 2139 appendSchemeSpecificPart(sb, opaquePart, 2140 authority, userInfo, host, port, 2141 path, query); 2142 appendFragment(sb, fragment); 2143 return sb.toString(); 2144 } 2145 2146 // -- Normalization, resolution, and relativization -- 2147 2148 // RFC2396 5.2 (6) 2149 private static String resolvePath(String base, String child, boolean absolute) 2150 { 2151 int i = base.lastIndexOf('/'); 2152 int cn = child.length(); 2153 String path = ""; 2154 2155 if (cn == 0) { 2156 // 5.2 (6a) 2157 if (i >= 0) 2158 path = base.substring(0, i + 1); 2159 } else { 2160 // 5.2 (6a-b) 2161 if (i >= 0 || !absolute) { 2162 path 
= base.substring(0, i + 1).concat(child); 2163 } else { 2164 path = "/".concat(child); 2165 } 2166 2167 } 2168 2169 // 5.2 (6c-f) 2170 String np = normalize(path); 2171 2172 // 5.2 (6g): If the result is absolute but the path begins with "../", 2173 // then we simply leave the path as-is 2174 2175 return np; 2176 } 2177 2178 // RFC2396 5.2 2179 private static URI resolve(URI base, URI child) { 2180 // check if child if opaque first so that NPE is thrown 2181 // if child is null. 2182 if (child.isOpaque() || base.isOpaque()) 2183 return child; 2184 2185 // 5.2 (2): Reference to current document (lone fragment) 2186 if ((child.scheme == null) && (child.authority == null) 2187 && child.path.isEmpty() && (child.fragment != null) 2188 && (child.query == null)) { 2189 if ((base.fragment != null) 2190 && child.fragment.equals(base.fragment)) { 2191 return base; 2192 } 2193 URI ru = new URI(); 2194 ru.scheme = base.scheme; 2195 ru.authority = base.authority; 2196 ru.userInfo = base.userInfo; 2197 ru.host = base.host; 2198 ru.port = base.port; 2199 ru.path = base.path; 2200 ru.fragment = child.fragment; 2201 ru.query = base.query; 2202 return ru; 2203 } 2204 2205 // 5.2 (3): Child is absolute 2206 if (child.scheme != null) 2207 return child; 2208 2209 URI ru = new URI(); // Resolved URI 2210 ru.scheme = base.scheme; 2211 ru.query = child.query; 2212 ru.fragment = child.fragment; 2213 2214 // 5.2 (4): Authority 2215 if (child.authority == null) { 2216 ru.authority = base.authority; 2217 ru.host = base.host; 2218 ru.userInfo = base.userInfo; 2219 ru.port = base.port; 2220 2221 String cp = child.path; 2222 if (!cp.isEmpty() && cp.charAt(0) == '/') { 2223 // 5.2 (5): Child path is absolute 2224 ru.path = child.path; 2225 } else { 2226 // 5.2 (6): Resolve relative path 2227 ru.path = resolvePath(base.path, cp, base.isAbsolute()); 2228 } 2229 } else { 2230 ru.authority = child.authority; 2231 ru.host = child.host; 2232 ru.userInfo = child.userInfo; 2233 ru.port = child.port; 2234 
ru.path = child.path; 2235 } 2236 2237 // 5.2 (7): Recombine (nothing to do here) 2238 return ru; 2239 } 2240 2241 // If the given URI's path is normal then return the URI; 2242 // o.w., return a new URI containing the normalized path. 2243 // 2244 private static URI normalize(URI u) { 2245 if (u.isOpaque() || u.path == null || u.path.isEmpty()) 2246 return u; 2247 2248 String np = normalize(u.path); 2249 if (np == u.path) 2250 return u; 2251 2252 URI v = new URI(); 2253 v.scheme = u.scheme; 2254 v.fragment = u.fragment; 2255 v.authority = u.authority; 2256 v.userInfo = u.userInfo; 2257 v.host = u.host; 2258 v.port = u.port; 2259 v.path = np; 2260 v.query = u.query; 2261 return v; 2262 } 2263 2264 // If both URIs are hierarchical, their scheme and authority components are 2265 // identical, and the base path is a prefix of the child's path, then 2266 // return a relative URI that, when resolved against the base, yields the 2267 // child; otherwise, return the child. 2268 // 2269 private static URI relativize(URI base, URI child) { 2270 // check if child if opaque first so that NPE is thrown 2271 // if child is null. 2272 if (child.isOpaque() || base.isOpaque()) 2273 return child; 2274 if (!equalIgnoringCase(base.scheme, child.scheme) 2275 || !equal(base.authority, child.authority)) 2276 return child; 2277 2278 String bp = normalize(base.path); 2279 String cp = normalize(child.path); 2280 if (!bp.equals(cp)) { 2281 if (!bp.endsWith("/")) 2282 bp = bp + "/"; 2283 if (!cp.startsWith(bp)) 2284 return child; 2285 } 2286 2287 URI v = new URI(); 2288 v.path = cp.substring(bp.length()); 2289 v.query = child.query; 2290 v.fragment = child.fragment; 2291 return v; 2292 } 2293 2294 2295 2296 // -- Path normalization -- 2297 2298 // The following algorithm for path normalization avoids the creation of a 2299 // string object for each segment, as well as the use of a string buffer to 2300 // compute the final result, by using a single char array and editing it in 2301 // place. 
The array is first split into segments, replacing each slash 2302 // with '\0' and creating a segment-index array, each element of which is 2303 // the index of the first char in the corresponding segment. We then walk 2304 // through both arrays, removing ".", "..", and other segments as necessary 2305 // by setting their entries in the index array to -1. Finally, the two 2306 // arrays are used to rejoin the segments and compute the final result. 2307 // 2308 // This code is based upon src/solaris/native/java/io/canonicalize_md.c 2309 2310 2311 // Check the given path to see if it might need normalization. A path 2312 // might need normalization if it contains duplicate slashes, a "." 2313 // segment, or a ".." segment. Return -1 if no further normalization is 2314 // possible, otherwise return the number of segments found. 2315 // 2316 // This method takes a string argument rather than a char array so that 2317 // this test can be performed without invoking path.toCharArray(). 2318 // 2319 private static int needsNormalization(String path) { 2320 boolean normal = true; 2321 int ns = 0; // Number of segments 2322 int end = path.length() - 1; // Index of last char in path 2323 int p = 0; // Index of next char in path 2324 2325 // Skip initial slashes 2326 while (p <= end) { 2327 if (path.charAt(p) != '/') break; 2328 p++; 2329 } 2330 if (p > 1) normal = false; 2331 2332 // Scan segments 2333 while (p <= end) { 2334 2335 // Looking at "." or ".." ? 
2336 if ((path.charAt(p) == '.') 2337 && ((p == end) 2338 || ((path.charAt(p + 1) == '/') 2339 || ((path.charAt(p + 1) == '.') 2340 && ((p + 1 == end) 2341 || (path.charAt(p + 2) == '/')))))) { 2342 normal = false; 2343 } 2344 ns++; 2345 2346 // Find beginning of next segment 2347 while (p <= end) { 2348 if (path.charAt(p++) != '/') 2349 continue; 2350 2351 // Skip redundant slashes 2352 while (p <= end) { 2353 if (path.charAt(p) != '/') break; 2354 normal = false; 2355 p++; 2356 } 2357 2358 break; 2359 } 2360 } 2361 2362 return normal ? -1 : ns; 2363 } 2364 2365 2366 // Split the given path into segments, replacing slashes with nulls and 2367 // filling in the given segment-index array. 2368 // 2369 // Preconditions: 2370 // segs.length == Number of segments in path 2371 // 2372 // Postconditions: 2373 // All slashes in path replaced by '\0' 2374 // segs[i] == Index of first char in segment i (0 <= i < segs.length) 2375 // 2376 private static void split(char[] path, int[] segs) { 2377 int end = path.length - 1; // Index of last char in path 2378 int p = 0; // Index of next char in path 2379 int i = 0; // Index of current segment 2380 2381 // Skip initial slashes 2382 while (p <= end) { 2383 if (path[p] != '/') break; 2384 path[p] = '\0'; 2385 p++; 2386 } 2387 2388 while (p <= end) { 2389 2390 // Note start of segment 2391 segs[i++] = p++; 2392 2393 // Find beginning of next segment 2394 while (p <= end) { 2395 if (path[p++] != '/') 2396 continue; 2397 path[p - 1] = '\0'; 2398 2399 // Skip redundant slashes 2400 while (p <= end) { 2401 if (path[p] != '/') break; 2402 path[p++] = '\0'; 2403 } 2404 break; 2405 } 2406 } 2407 2408 if (i != segs.length) 2409 throw new InternalError(); // ASSERT 2410 } 2411 2412 2413 // Join the segments in the given path according to the given segment-index 2414 // array, ignoring those segments whose index entries have been set to -1, 2415 // and inserting slashes as needed. Return the length of the resulting 2416 // path. 
2417 // 2418 // Preconditions: 2419 // segs[i] == -1 implies segment i is to be ignored 2420 // path computed by split, as above, with '\0' having replaced '/' 2421 // 2422 // Postconditions: 2423 // path[0] .. path[return value] == Resulting path 2424 // 2425 private static int join(char[] path, int[] segs) { 2426 int ns = segs.length; // Number of segments 2427 int end = path.length - 1; // Index of last char in path 2428 int p = 0; // Index of next path char to write 2429 2430 if (path[p] == '\0') { 2431 // Restore initial slash for absolute paths 2432 path[p++] = '/'; 2433 } 2434 2435 for (int i = 0; i < ns; i++) { 2436 int q = segs[i]; // Current segment 2437 if (q == -1) 2438 // Ignore this segment 2439 continue; 2440 2441 if (p == q) { 2442 // We're already at this segment, so just skip to its end 2443 while ((p <= end) && (path[p] != '\0')) 2444 p++; 2445 if (p <= end) { 2446 // Preserve trailing slash 2447 path[p++] = '/'; 2448 } 2449 } else if (p < q) { 2450 // Copy q down to p 2451 while ((q <= end) && (path[q] != '\0')) 2452 path[p++] = path[q++]; 2453 if (q <= end) { 2454 // Preserve trailing slash 2455 path[p++] = '/'; 2456 } 2457 } else 2458 throw new InternalError(); // ASSERT false 2459 } 2460 2461 return p; 2462 } 2463 2464 2465 // Remove "." segments from the given path, and remove segment pairs 2466 // consisting of a non-".." segment followed by a ".." segment. 2467 // 2468 private static void removeDots(char[] path, int[] segs) { 2469 int ns = segs.length; 2470 int end = path.length - 1; 2471 2472 for (int i = 0; i < ns; i++) { 2473 int dots = 0; // Number of dots found (0, 1, or 2) 2474 2475 // Find next occurrence of "." or ".." 
2476 do { 2477 int p = segs[i]; 2478 if (path[p] == '.') { 2479 if (p == end) { 2480 dots = 1; 2481 break; 2482 } else if (path[p + 1] == '\0') { 2483 dots = 1; 2484 break; 2485 } else if ((path[p + 1] == '.') 2486 && ((p + 1 == end) 2487 || (path[p + 2] == '\0'))) { 2488 dots = 2; 2489 break; 2490 } 2491 } 2492 i++; 2493 } while (i < ns); 2494 if ((i > ns) || (dots == 0)) 2495 break; 2496 2497 if (dots == 1) { 2498 // Remove this occurrence of "." 2499 segs[i] = -1; 2500 } else { 2501 // If there is a preceding non-".." segment, remove both that 2502 // segment and this occurrence of ".."; otherwise, leave this 2503 // ".." segment as-is. 2504 int j; 2505 for (j = i - 1; j >= 0; j--) { 2506 if (segs[j] != -1) break; 2507 } 2508 if (j >= 0) { 2509 int q = segs[j]; 2510 if (!((path[q] == '.') 2511 && (path[q + 1] == '.') 2512 && (path[q + 2] == '\0'))) { 2513 segs[i] = -1; 2514 segs[j] = -1; 2515 } 2516 } 2517 } 2518 } 2519 } 2520 2521 2522 // DEVIATION: If the normalized path is relative, and if the first 2523 // segment could be parsed as a scheme name, then prepend a "." segment 2524 // 2525 private static void maybeAddLeadingDot(char[] path, int[] segs) { 2526 2527 if (path[0] == '\0') 2528 // The path is absolute 2529 return; 2530 2531 int ns = segs.length; 2532 int f = 0; // Index of first segment 2533 while (f < ns) { 2534 if (segs[f] >= 0) 2535 break; 2536 f++; 2537 } 2538 if ((f >= ns) || (f == 0)) 2539 // The path is empty, or else the original first segment survived, 2540 // in which case we already know that no leading "." is needed 2541 return; 2542 2543 int p = segs[f]; 2544 while ((p < path.length) && (path[p] != ':') && (path[p] != '\0')) p++; 2545 if (p >= path.length || path[p] == '\0') 2546 // No colon in first segment, so no "." needed 2547 return; 2548 2549 // At this point we know that the first segment is unused, 2550 // hence we can insert a "." 
segment at that position 2551 path[0] = '.'; 2552 path[1] = '\0'; 2553 segs[0] = 0; 2554 } 2555 2556 2557 // Normalize the given path string. A normal path string has no empty 2558 // segments (i.e., occurrences of "//"), no segments equal to ".", and no 2559 // segments equal to ".." that are preceded by a segment not equal to "..". 2560 // In contrast to Unix-style pathname normalization, for URI paths we 2561 // always retain trailing slashes. 2562 // 2563 private static String normalize(String ps) { 2564 2565 // Does this path need normalization? 2566 int ns = needsNormalization(ps); // Number of segments 2567 if (ns < 0) 2568 // Nope -- just return it 2569 return ps; 2570 2571 char[] path = ps.toCharArray(); // Path in char-array form 2572 2573 // Split path into segments 2574 int[] segs = new int[ns]; // Segment-index array 2575 split(path, segs); 2576 2577 // Remove dots 2578 removeDots(path, segs); 2579 2580 // Prevent scheme-name confusion 2581 maybeAddLeadingDot(path, segs); 2582 2583 // Join the remaining segments and return the result 2584 String s = new String(path, 0, join(path, segs)); 2585 if (s.equals(ps)) { 2586 // string was already normalized 2587 return ps; 2588 } 2589 return s; 2590 } 2591 2592 2593 2594 // -- Character classes for parsing -- 2595 2596 // RFC2396 precisely specifies which characters in the US-ASCII charset are 2597 // permissible in the various components of a URI reference. We here 2598 // define a set of mask pairs to aid in enforcing these restrictions. Each 2599 // mask pair consists of two longs, a low mask and a high mask. Taken 2600 // together they represent a 128-bit mask, where bit i is set iff the 2601 // character with value i is permitted. 2602 // 2603 // This approach is more efficient than sequentially searching arrays of 2604 // permitted characters. 
    // It could be made still more efficient by
    // precompiling the mask information so that a character's presence in a
    // given mask could be determined by a single table lookup.

    // To save startup time, we manually calculate the low-/highMask constants.
    // For reference, the following methods were used to calculate the values:

    // Compute the low-order mask for the characters in the given string
    //     private static long lowMask(String chars) {
    //        int n = chars.length();
    //        long m = 0;
    //        for (int i = 0; i < n; i++) {
    //            char c = chars.charAt(i);
    //            if (c < 64)
    //                m |= (1L << c);
    //        }
    //        return m;
    //    }

    // Compute the high-order mask for the characters in the given string
    //    private static long highMask(String chars) {
    //        int n = chars.length();
    //        long m = 0;
    //        for (int i = 0; i < n; i++) {
    //            char c = chars.charAt(i);
    //            if ((c >= 64) && (c < 128))
    //                m |= (1L << (c - 64));
    //        }
    //        return m;
    //    }

    // Compute a low-order mask for the characters
    // between first and last, inclusive
    //    private static long lowMask(char first, char last) {
    //        long m = 0;
    //        int f = Math.max(Math.min(first, 63), 0);
    //        int l = Math.max(Math.min(last, 63), 0);
    //        for (int i = f; i <= l; i++)
    //            m |= 1L << i;
    //        return m;
    //    }

    // Compute a high-order mask for the characters
    // between first and last, inclusive
    //    private static long highMask(char first, char last) {
    //        long m = 0;
    //        int f = Math.max(Math.min(first, 127), 64) - 64;
    //        int l = Math.max(Math.min(last, 127), 64) - 64;
    //        for (int i = f; i <= l; i++)
    //            m |= 1L << i;
    //        return m;
    //    }

    // Tell whether the given character is permitted by the given mask pair.
    //
    // Characters 1..63 are looked up in lowMask (bit c) and characters
    // 64..127 in highMask (bit c - 64).  The NUL character and every
    // character >= 128 never match, whatever the masks contain.
    //
    private static boolean match(char c, long lowMask, long highMask) {
        if (c == 0) // 0 doesn't have a slot in the mask. So, it never matches.
            return false;
        if (c < 64)
            return ((1L << c) & lowMask) != 0;
        if (c < 128)
            return ((1L << (c - 64)) & highMask) != 0;
        return false;
    }

    // Character-class masks, in reverse order from RFC2396 because
    // initializers for static fields cannot make forward references.
    //
    // Each class is a pair of constants: L_xxx covers characters 0..63 and
    // H_xxx covers characters 64..127.  The trailing comment on a literal
    // shows the lowMask/highMask call (see the reference methods above) that
    // the literal was computed from.

    // digit    = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |
    //            "8" | "9"
    private static final long L_DIGIT = 0x3FF000000000000L; // lowMask('0', '9');
    private static final long H_DIGIT = 0L;

    // upalpha  = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" |
    //            "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" |
    //            "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z"
    private static final long L_UPALPHA = 0L;
    private static final long H_UPALPHA = 0x7FFFFFEL; // highMask('A', 'Z');

    // lowalpha = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" |
    //            "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" |
    //            "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"
    private static final long L_LOWALPHA = 0L;
    private static final long H_LOWALPHA = 0x7FFFFFE00000000L; // highMask('a', 'z');

    // alpha         = lowalpha | upalpha
    private static final long L_ALPHA = L_LOWALPHA | L_UPALPHA;
    private static final long H_ALPHA = H_LOWALPHA | H_UPALPHA;

    // alphanum      = alpha | digit
    private static final long L_ALPHANUM = L_DIGIT | L_ALPHA;
    private static final long H_ALPHANUM = H_DIGIT | H_ALPHA;

    // hex           = digit | "A" | "B" | "C" | "D" | "E" | "F" |
    //                         "a" | "b" | "c" | "d" | "e" | "f"
    private static final long L_HEX = L_DIGIT;
    private static final long H_HEX = 0x7E0000007EL; // highMask('A', 'F') | highMask('a', 'f');

    // mark          = "-" | "_" | "." | "!" | "~" | "*" | "'" |
    //                 "(" | ")"
    private static final long L_MARK = 0x678200000000L; // lowMask("-_.!~*'()");
    private static final long H_MARK = 0x4000000080000000L; // highMask("-_.!~*'()");

    // unreserved    = alphanum | mark
    private static final long L_UNRESERVED = L_ALPHANUM | L_MARK;
    private static final long H_UNRESERVED = H_ALPHANUM | H_MARK;

    // reserved      = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
    //                 "$" | "," | "[" | "]"
    // Added per RFC2732: "[", "]"
    private static final long L_RESERVED = 0xAC00985000000000L; // lowMask(";/?:@&=+$,[]");
    private static final long H_RESERVED = 0x28000001L; // highMask(";/?:@&=+$,[]");

    // The zero'th bit is used to indicate that escape pairs and non-US-ASCII
    // characters are allowed; this is handled by the scanEscape method below.
    // (Bit zero is free for this purpose because match() never consults it:
    // the NUL character is rejected before any mask lookup.)
    private static final long L_ESCAPED = 1L;
    private static final long H_ESCAPED = 0L;

    // uric          = reserved | unreserved | escaped
    private static final long L_URIC = L_RESERVED | L_UNRESERVED | L_ESCAPED;
    private static final long H_URIC = H_RESERVED | H_UNRESERVED | H_ESCAPED;

    // pchar         = unreserved | escaped |
    //                 ":" | "@" | "&" | "=" | "+" | "$" | ","
    private static final long L_PCHAR
        = L_UNRESERVED | L_ESCAPED | 0x2400185000000000L; // lowMask(":@&=+$,");
    private static final long H_PCHAR
        = H_UNRESERVED | H_ESCAPED | 0x1L; // highMask(":@&=+$,");

    // All valid path characters
    private static final long L_PATH = L_PCHAR | 0x800800000000000L; // lowMask(";/");
    private static final long H_PATH = H_PCHAR; // highMask(";/") == 0x0L;

    // Dash, for use in domainlabel and toplabel
    private static final long L_DASH = 0x200000000000L; // lowMask("-");
    private static final long H_DASH = 0x0L; // highMask("-");

    // Dot, for use in hostnames
    private static final long L_DOT = 0x400000000000L; // lowMask(".");
    private static final long H_DOT = 0x0L; // highMask(".");

    // userinfo      = *( unreserved | escaped |
    //                    ";" | ":" | "&" | "=" | "+" | "$" | "," )
    private static final long L_USERINFO
        = L_UNRESERVED | L_ESCAPED | 0x2C00185000000000L; // lowMask(";:&=+$,");
    private static final long H_USERINFO
        = H_UNRESERVED | H_ESCAPED; // | highMask(";:&=+$,") == 0L;

    // reg_name      = 1*( unreserved | escaped | "$" | "," |
    //                     ";" | ":" | "@" | "&" | "=" | "+" )
    private static final long L_REG_NAME
        = L_UNRESERVED | L_ESCAPED | 0x2C00185000000000L; // lowMask("$,;:@&=+");
    private static final long H_REG_NAME
        = H_UNRESERVED | H_ESCAPED | 0x1L; // highMask("$,;:@&=+");

    // All valid characters for server-based authorities
    private static final long L_SERVER
        = L_USERINFO | L_ALPHANUM | L_DASH | 0x400400000000000L; // lowMask(".:@[]");
    private static final long H_SERVER
        = H_USERINFO | H_ALPHANUM | H_DASH | 0x28000001L; // highMask(".:@[]");

    // Special case of server authority that represents an IPv6 address
    // In this case, a % does not signify an escape sequence
    private static final long L_SERVER_PERCENT
        = L_SERVER | 0x2000000000L; // lowMask("%");
    private static final long H_SERVER_PERCENT
        = H_SERVER; // | highMask("%") == 0L;

    // scheme        = alpha *( alpha | digit | "+" | "-" | "." )
    private static final long L_SCHEME = L_ALPHA | L_DIGIT | 0x680000000000L; // lowMask("+-.");
    private static final long H_SCHEME = H_ALPHA | H_DIGIT; // | highMask("+-.") == 0L

    // scope_id = alpha | digit | "_" | "."
2776 private static final long L_SCOPE_ID 2777 = L_ALPHANUM | 0x400000000000L; // lowMask("_."); 2778 private static final long H_SCOPE_ID 2779 = H_ALPHANUM | 0x80000000L; // highMask("_."); 2780 2781 // -- Escaping and encoding -- 2782 2783 private static final char[] hexDigits = { 2784 '0', '1', '2', '3', '4', '5', '6', '7', 2785 '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' 2786 }; 2787 2788 private static void appendEscape(StringBuilder sb, byte b) { 2789 sb.append('%'); 2790 sb.append(hexDigits[(b >> 4) & 0x0f]); 2791 sb.append(hexDigits[(b >> 0) & 0x0f]); 2792 } 2793 2794 private static void appendEncoded(CharsetEncoder encoder, StringBuilder sb, char c) { 2795 ByteBuffer bb = null; 2796 try { 2797 bb = encoder.encode(CharBuffer.wrap(new char[]{c})); 2798 } catch (CharacterCodingException x) { 2799 assert false; 2800 } 2801 while (bb.hasRemaining()) { 2802 int b = bb.get() & 0xff; 2803 if (b >= 0x80) 2804 appendEscape(sb, (byte)b); 2805 else 2806 sb.append((char)b); 2807 } 2808 } 2809 2810 // Quote any characters in s that are not permitted 2811 // by the given mask pair 2812 // 2813 private static String quote(String s, long lowMask, long highMask) { 2814 StringBuilder sb = null; 2815 CharsetEncoder encoder = null; 2816 boolean allowNonASCII = ((lowMask & L_ESCAPED) != 0); 2817 for (int i = 0; i < s.length(); i++) { 2818 char c = s.charAt(i); 2819 if (c < '\u0080') { 2820 if (!match(c, lowMask, highMask)) { 2821 if (sb == null) { 2822 sb = new StringBuilder(); 2823 sb.append(s, 0, i); 2824 } 2825 appendEscape(sb, (byte)c); 2826 } else { 2827 if (sb != null) 2828 sb.append(c); 2829 } 2830 } else if (allowNonASCII 2831 && (Character.isSpaceChar(c) 2832 || Character.isISOControl(c))) { 2833 if (encoder == null) 2834 encoder = UTF_8.INSTANCE.newEncoder(); 2835 if (sb == null) { 2836 sb = new StringBuilder(); 2837 sb.append(s, 0, i); 2838 } 2839 appendEncoded(encoder, sb, c); 2840 } else { 2841 if (sb != null) 2842 sb.append(c); 2843 } 2844 } 2845 return (sb == null) ? 
s : sb.toString(); 2846 } 2847 2848 // Encodes all characters >= \u0080 into escaped, normalized UTF-8 octets, 2849 // assuming that s is otherwise legal 2850 // 2851 private static String encode(String s) { 2852 int n = s.length(); 2853 if (n == 0) 2854 return s; 2855 2856 // First check whether we actually need to encode 2857 for (int i = 0;;) { 2858 if (s.charAt(i) >= '\u0080') 2859 break; 2860 if (++i >= n) 2861 return s; 2862 } 2863 2864 String ns = Normalizer.normalize(s, Normalizer.Form.NFC); 2865 ByteBuffer bb = null; 2866 try { 2867 bb = UTF_8.INSTANCE.newEncoder() 2868 .encode(CharBuffer.wrap(ns)); 2869 2870 } catch (CharacterCodingException x) { 2871 assert false; 2872 } 2873 2874 StringBuilder sb = new StringBuilder(); 2875 while (bb.hasRemaining()) { 2876 int b = bb.get() & 0xff; 2877 if (b >= 0x80) 2878 appendEscape(sb, (byte)b); 2879 else 2880 sb.append((char)b); 2881 } 2882 return sb.toString(); 2883 } 2884 2885 private static int decode(char c) { 2886 if ((c >= '0') && (c <= '9')) 2887 return c - '0'; 2888 if ((c >= 'a') && (c <= 'f')) 2889 return c - 'a' + 10; 2890 if ((c >= 'A') && (c <= 'F')) 2891 return c - 'A' + 10; 2892 assert false; 2893 return -1; 2894 } 2895 2896 private static byte decode(char c1, char c2) { 2897 return (byte)( ((decode(c1) & 0xf) << 4) 2898 | ((decode(c2) & 0xf) << 0)); 2899 } 2900 2901 // Evaluates all escapes in s, applying UTF-8 decoding if needed. Assumes 2902 // that escapes are well-formed syntactically, i.e., of the form %XX. If a 2903 // sequence of escaped octets is not valid UTF-8 then the erroneous octets 2904 // are replaced with '\uFFFD'. 2905 // Exception: any "%" found between "[]" is left alone. 
It is an IPv6 literal 2906 // with a scope_id 2907 // 2908 private static String decode(String s) { 2909 return decode(s, true); 2910 } 2911 2912 // This method was introduced as a generalization of URI.decode method 2913 // to provide a fix for JDK-8037396 2914 private static String decode(String s, boolean ignorePercentInBrackets) { 2915 if (s == null) 2916 return s; 2917 int n = s.length(); 2918 if (n == 0) 2919 return s; 2920 if (s.indexOf('%') < 0) 2921 return s; 2922 2923 StringBuilder sb = new StringBuilder(n); 2924 ByteBuffer bb = ByteBuffer.allocate(n); 2925 CharBuffer cb = CharBuffer.allocate(n); 2926 CharsetDecoder dec = UTF_8.INSTANCE.newDecoder() 2927 .onMalformedInput(CodingErrorAction.REPLACE) 2928 .onUnmappableCharacter(CodingErrorAction.REPLACE); 2929 2930 // This is not horribly efficient, but it will do for now 2931 char c = s.charAt(0); 2932 boolean betweenBrackets = false; 2933 2934 for (int i = 0; i < n;) { 2935 assert c == s.charAt(i); // Loop invariant 2936 if (c == '[') { 2937 betweenBrackets = true; 2938 } else if (betweenBrackets && c == ']') { 2939 betweenBrackets = false; 2940 } 2941 if (c != '%' || (betweenBrackets && ignorePercentInBrackets)) { 2942 sb.append(c); 2943 if (++i >= n) 2944 break; 2945 c = s.charAt(i); 2946 continue; 2947 } 2948 bb.clear(); 2949 for (;;) { 2950 assert (n - i >= 2); 2951 bb.put(decode(s.charAt(++i), s.charAt(++i))); 2952 if (++i >= n) 2953 break; 2954 c = s.charAt(i); 2955 if (c != '%') 2956 break; 2957 } 2958 bb.flip(); 2959 cb.clear(); 2960 dec.reset(); 2961 CoderResult cr = dec.decode(bb, cb, true); 2962 assert cr.isUnderflow(); 2963 cr = dec.flush(cb); 2964 assert cr.isUnderflow(); 2965 sb.append(cb.flip().toString()); 2966 } 2967 2968 return sb.toString(); 2969 } 2970 2971 2972 // -- Parsing -- 2973 2974 // For convenience we wrap the input URI string in a new instance of the 2975 // following internal class. 
    // This saves always having to pass the input
    // string as an argument to each internal scan/parse method.
    //
    // Parser is an inner (non-static) class.  Its parse methods store their
    // results by assigning to names that are not declared in Parser itself
    // (string, scheme, schemeSpecificPart, fragment, authority, userInfo,
    // host, port, path, query).
    // NOTE(review): these are presumably fields of the enclosing URI
    // instance; their declarations lie outside this part of the file.

    private class Parser {

        private final String input;           // URI input string
        private boolean requireServerAuthority = false; // Set by parse(rsa)

        // Remembers the string to parse; it is also assigned to "string"
        Parser(String s) {
            input = s;
            string = s;
        }

        // -- Methods for throwing URISyntaxException in various ways --

        // Throws a URISyntaxException for the input with the given reason
        // and no error position
        private void fail(String reason) throws URISyntaxException {
            throw new URISyntaxException(input, reason);
        }

        // Throws a URISyntaxException for the input with the given reason,
        // reporting p as the error position
        private void fail(String reason, int p) throws URISyntaxException {
            throw new URISyntaxException(input, reason, p);
        }

        // Like fail(reason, p), with the reason "Expected <expected>"
        private void failExpecting(String expected, int p)
            throws URISyntaxException
        {
            fail("Expected " + expected, p);
        }


        // -- Simple access to the input string --

        // Tells whether start < end and, if so, whether charAt(start) == c
        //
        private boolean at(int start, int end, char c) {
            return (start < end) && (input.charAt(start) == c);
        }

        // Tells whether s fits into [start, end), i.e., whether
        // start + s.length() <= end and, if so,
        // whether the chars at the start position match s exactly
        //
        private boolean at(int start, int end, String s) {
            int p = start;
            int sn = s.length();
            if (sn > end - p)
                return false;
            int i = 0;
            while (i < sn) {
                if (input.charAt(p++) != s.charAt(i)) {
                    break;
                }
                i++;
            }
            return (i == sn);
        }


        // -- Scanning --

        // The various scan and parse methods that follow use a uniform
        // convention of taking the current start position and end index as
        // their first two arguments.  The start is inclusive while the end is
        // exclusive, just as in the String class, i.e., a start/end pair
        // denotes the half-open interval [start, end) of the input string.
        //
        // These methods never proceed past the end position.  They may return
        // -1 to indicate outright failure, but more often they simply return
        // the position of the first char after the last char scanned.  Thus
        // a typical idiom is
        //
        //     int p = start;
        //     int q = scan(p, end, ...);
        //     if (q > p)
        //         // We scanned something
        //         ...;
        //     else if (q == p)
        //         // We scanned nothing
        //         ...;
        //     else if (q == -1)
        //         // Something went wrong
        //         ...;


        // Scan a specific char: If the char at the given start position is
        // equal to c, return the index of the next char; otherwise, return the
        // start position.
        //
        private int scan(int start, int end, char c) {
            if ((start < end) && (input.charAt(start) == c))
                return start + 1;
            return start;
        }

        // Scan forward from the given start position.  Stop at the first char
        // in the err string (in which case -1 is returned), or the first char
        // in the stop string (in which case the index of that char is
        // returned), or the end position (in which case end is returned).
        // May return the start position if the char at start is a stop char
        // or if start >= end.
        //
        private int scan(int start, int end, String err, String stop) {
            int p = start;
            while (p < end) {
                char c = input.charAt(p);
                if (err.indexOf(c) >= 0)
                    return -1;
                if (stop.indexOf(c) >= 0)
                    break;
                p++;
            }
            return p;
        }

        // Scan forward from the given start position.  Stop at the first char
        // in the stop string (in which case the index of that char is
        // returned), or the end position (in which case end is returned).
        // May return the start position if the char at start is a stop char
        // or if start >= end.
        //
        private int scan(int start, int end, String stop) {
            int p = start;
            while (p < end) {
                char c = input.charAt(p);
                if (stop.indexOf(c) >= 0)
                    break;
                p++;
            }
            return p;
        }

        // Scan a potential escape sequence, starting at the given position,
        // with the given first char (i.e., charAt(start) == c).
        //
        // Returns start + 3 for a well-formed escape triplet, start + 1 for
        // an acceptable non-US-ASCII char, and start if nothing was scanned.
        // A '%' that is not followed by two hex digits within [start, n)
        // causes a URISyntaxException.
        //
        // This method assumes that if escapes are allowed then visible
        // non-US-ASCII chars are also allowed.
        //
        private int scanEscape(int start, int n, char first)
            throws URISyntaxException
        {
            int p = start;
            char c = first;
            if (c == '%') {
                // Process escape pair
                if ((p + 3 <= n)
                    && match(input.charAt(p + 1), L_HEX, H_HEX)
                    && match(input.charAt(p + 2), L_HEX, H_HEX)) {
                    return p + 3;
                }
                fail("Malformed escape pair", p);
            } else if ((c > 128)
                       && !Character.isSpaceChar(c)
                       && !Character.isISOControl(c)) {
                // Allow unescaped but visible non-US-ASCII chars
                return p + 1;
            }
            return p;
        }

        // Scan chars that match the given mask pair
        //
        // If the mask pair permits escapes (bit L_ESCAPED of lowMask is set)
        // then escape triplets and the other chars accepted by scanEscape are
        // scanned as well.  Returns the position of the first char that was
        // not scanned, or n if everything up to n was scanned.
        //
        private int scan(int start, int n, long lowMask, long highMask)
            throws URISyntaxException
        {
            int p = start;
            while (p < n) {
                char c = input.charAt(p);
                if (match(c, lowMask, highMask)) {
                    p++;
                    continue;
                }
                if ((lowMask & L_ESCAPED) != 0) {
                    int q = scanEscape(p, n, c);
                    if (q > p) {
                        p = q;
                        continue;
                    }
                }
                break;
            }
            return p;
        }

        // Check that each of the chars in [start, end) matches the given mask
        //
        // Fails with "Illegal character in <what>", positioned at the first
        // offending char, if the scan stops before end.
        //
        private void checkChars(int start, int end,
                                long lowMask, long highMask,
                                String what)
            throws URISyntaxException
        {
            int p = scan(start, end, lowMask, highMask);
            if (p < end)
                fail("Illegal character in " + what, p);
        }

        // Check that the char at position p matches the given mask
        //
        private void checkChar(int p,
                               long lowMask, long highMask,
                               String what)
            throws URISyntaxException
        {
            checkChars(p, p + 1, lowMask, highMask, what);
        }


        // -- Parsing --

        // [<scheme>:]<scheme-specific-part>[#<fragment>]
        //
        // Entry point: parses the whole input string.  If rsa is true then
        // an authority component, if present, must parse as a server-based
        // authority (see parseAuthority).
        //
        void parse(boolean rsa) throws URISyntaxException {
            requireServerAuthority = rsa;
            int n = input.length();
            // Look for a ':' that occurs before any of "/?#"
            int p = scan(0, n, "/?#", ":");
            if ((p >= 0) && at(p, n, ':')) {
                if (p == 0)
                    failExpecting("scheme name", 0);
                checkChar(0, L_ALPHA, H_ALPHA, "scheme name");
                checkChars(1, p, L_SCHEME, H_SCHEME, "scheme name");
                scheme = input.substring(0, p);
                p++;                    // Skip ':'
                if (at(p, n, '/')) {
                    p = parseHierarchical(p, n);
                } else {
                    // opaque; need to create the schemeSpecificPart
                    int q = scan(p, n, "#");
                    if (q <= p)
                        failExpecting("scheme-specific part", p);
                    checkChars(p, q, L_URIC, H_URIC, "opaque part");
                    schemeSpecificPart = input.substring(p, q);
                    p = q;
                }
            } else {
                // No scheme: the whole input is a relative reference
                p = parseHierarchical(0, n);
            }
            if (at(p, n, '#')) {
                checkChars(p + 1, n, L_URIC, H_URIC, "fragment");
                fragment = input.substring(p + 1, n);
                p = n;
            }
            if (p < n)
                fail("end of URI", p);
        }

        // [//authority]<path>[?<query>]
        //
        // DEVIATION from RFC2396: We allow an empty authority component as
        // long as it's followed by a non-empty path, query component, or
        // fragment component.  This is so that URIs such as "file:///foo/bar"
        // will parse.  This seems to be the intent of RFC2396, though the
        // grammar does not permit it.  If the authority is empty then the
        // userInfo, host, and port components are undefined.
        //
        // DEVIATION from RFC2396: We allow empty relative paths.  This seems
        // to be the intent of RFC2396, but the grammar does not permit it.
        // The primary consequence of this deviation is that "#f" parses as a
        // relative URI with an empty path.
        //
        // Returns the position after the last char consumed, i.e. the
        // position of a '#' or n.
        //
        private int parseHierarchical(int start, int n)
            throws URISyntaxException
        {
            int p = start;
            if (at(p, n, '/') && at(p + 1, n, '/')) {
                p += 2;
                int q = scan(p, n, "/?#");
                if (q > p) {
                    p = parseAuthority(p, q);
                } else if (q < n) {
                    // DEVIATION: Allow empty authority prior to non-empty
                    // path, query component or fragment identifier
                } else
                    failExpecting("authority", p);
            }
            int q = scan(p, n, "?#"); // DEVIATION: May be empty
            checkChars(p, q, L_PATH, H_PATH, "path");
            path = input.substring(p, q);
            p = q;
            if (at(p, n, '?')) {
                p++;
                q = scan(p, n, "#");
                checkChars(p, q, L_URIC, H_URIC, "query");
                query = input.substring(p, q);
                p = q;
            }
            return p;
        }

        // authority     = server | reg_name
        //
        // Ambiguity: An authority that is a registry name rather than a server
        // might have a prefix that parses as a server.  We use the fact that
        // the authority component is always followed by '/' or the end of the
        // input string to resolve this: If the complete authority did not
        // parse as a server then we try to parse it as a registry name.
        //
        // On success the whole of [start, n) has been consumed and n is
        // returned.
        //
        private int parseAuthority(int start, int n)
            throws URISyntaxException
        {
            int p = start;
            int q = p;
            int qreg = p;
            URISyntaxException ex = null;

            boolean serverChars;    // All chars legal for a server authority
            boolean regChars;       // All chars legal for a registry name
            boolean skipParseException;

            // NOTE(review): scan(p, n, "]") also returns n (> p) when the
            // interval is non-empty and holds no "]" at all, so this branch
            // is taken in that case too -- confirm that this is intended.
            if (scan(p, n, "]") > p) {
                // contains a literal IPv6 address, therefore % is allowed
                serverChars = (scan(p, n, L_SERVER_PERCENT, H_SERVER_PERCENT) == n);
            } else {
                serverChars = (scan(p, n, L_SERVER, H_SERVER) == n);
            }
            regChars = ((qreg = scan(p, n, L_REG_NAME, H_REG_NAME)) == n);

            if (regChars && !serverChars) {
                // Must be a registry-based authority
                authority = input.substring(p, n);
                return n;
            }

            // When parsing a URI, skip creating exception objects if the server-based
            // authority is not required and the registry parse is successful.
            //
            skipParseException = (!requireServerAuthority && regChars);
            if (serverChars) {
                // Might be (probably is) a server-based authority, so attempt
                // to parse it as such.  If the attempt fails, try to treat it
                // as a registry-based authority.
                try {
                    q = parseServer(p, n, skipParseException);
                    if (q < n) {
                        if (skipParseException) {
                            // Quiet failure: undo the partial results and
                            // fall through to the registry-based handling
                            userInfo = null;
                            host = null;
                            port = -1;
                            q = p;
                        } else {
                            failExpecting("end of authority", q);
                        }
                    } else {
                        authority = input.substring(p, n);
                    }
                } catch (URISyntaxException x) {
                    // Undo results of failed parse
                    userInfo = null;
                    host = null;
                    port = -1;
                    if (requireServerAuthority) {
                        // If we're insisting upon a server-based authority,
                        // then just re-throw the exception
                        throw x;
                    } else {
                        // Save the exception in case it doesn't parse as a
                        // registry either
                        ex = x;
                        q = p;
                    }
                }
            }

            if (q < n) {
                if (regChars) {
                    // Registry-based authority
                    authority = input.substring(p, n);
                } else if (ex != null) {
                    // Re-throw exception; it was probably due to
                    // a malformed IPv6 address
                    throw ex;
                } else {
                    fail("Illegal character in authority", serverChars ? q : qreg);
                }
            }

            return n;
        }


        // [<userinfo>@]<host>[:<port>]
        //
        // Sets userInfo, host and port as they are recognized.  If
        // skipParseException is true then some failures are reported quietly,
        // by returning a position < n, instead of by throwing.
        //
        private int parseServer(int start, int n, boolean skipParseException)
            throws URISyntaxException
        {
            int p = start;
            int q;

            // userinfo
            q = scan(p, n, "/?#", "@");
            if ((q >= p) && at(q, n, '@')) {
                checkChars(p, q, L_USERINFO, H_USERINFO, "user info");
                userInfo = input.substring(p, q);
                p = q + 1;              // Skip '@'
            }

            // hostname, IPv4 address, or IPv6 address
            if (at(p, n, '[')) {
                // DEVIATION from RFC2396: Support IPv6 addresses, per RFC2732
                p++;
                q = scan(p, n, "/?#", "]");
                if ((q > p) && at(q, n, ']')) {
                    // look for a "%" scope id
                    int r = scan (p, q, "%");
                    if (r > p) {
                        parseIPv6Reference(p, r);
                        if (r+1 == q) {
                            fail ("scope id expected");
                        }
                        checkChars (r+1, q, L_SCOPE_ID, H_SCOPE_ID,
                                                "scope id");
                    } else {
                        parseIPv6Reference(p, q);
                    }
                    // The host includes the enclosing brackets
                    host = input.substring(p-1, q+1);
                    p = q + 1;
                } else {
                    failExpecting("closing bracket for IPv6 address", q);
                }
            } else {
                q = parseIPv4Address(p, n);
                if (q <= p)
                    q = parseHostname(p, n, skipParseException);
                p = q;
            }

            // port
            if (at(p, n, ':')) {
                p++;
                q = scan(p, n, "/");
                if (q > p) {
                    checkChars(p, q, L_DIGIT, H_DIGIT, "port number");
                    try {
                        port = Integer.parseInt(input, p, q, 10);
                    } catch (NumberFormatException x) {
                        // All digits, but the value does not fit in an int
                        fail("Malformed port number", p);
                    }
                    p = q;
                }
            } else if (p < n && skipParseException) {
                // Quiet failure: let the caller deal with the leftover chars
                return p;
            }

            if (p < n)
                failExpecting("port number", p);

            return p;
        }

        // Scan a string of decimal digits whose value fits in a byte
        //
        // Returns the position after the digits, or the start position if
        // there are no digits or if their value exceeds 255.
        //
        private int scanByte(int start, int n)
            throws URISyntaxException
        {
            int p = start;
            int q = scan(p, n, L_DIGIT, H_DIGIT);
            if (q <= p) return q;
            if (Integer.parseInt(input, p, q, 10) > 255) return p;
            return q;
        }

        // Scan an IPv4 address.
        //
        // If the strict argument is true then we require that the given
        // interval contain nothing besides an IPv4 address; if it is false
        // then we only require that it start with an IPv4 address.
        //
        // If the interval does not contain or start with (depending upon the
        // strict argument) legal IPv4 address characters then we return -1
        // immediately; otherwise we insist that these characters parse as a
        // legal IPv4 address and throw an exception on failure.
        //
        // We assume that any string of decimal digits and dots must be an IPv4
        // address.  It won't parse as a hostname anyway, so making that
        // assumption here allows more meaningful exceptions to be thrown.
        //
        private int scanIPv4Address(int start, int n, boolean strict)
            throws URISyntaxException
        {
            int p = start;
            int q;
            // m is the end of the leading run of digits and dots
            int m = scan(p, n, L_DIGIT | L_DOT, H_DIGIT | H_DOT);
            if ((m <= p) || (strict && (m != n)))
                return -1;
            // This "loop" runs at most once: it either returns the end
            // position or breaks out to the failure below
            for (;;) {
                // Per RFC2732: At most three digits per byte
                // Further constraint: Each element fits in a byte
                if ((q = scanByte(p, m)) <= p) break;   p = q;
                if ((q = scan(p, m, '.')) <= p) break;  p = q;
                if ((q = scanByte(p, m)) <= p) break;   p = q;
                if ((q = scan(p, m, '.')) <= p) break;  p = q;
                if ((q = scanByte(p, m)) <= p) break;   p = q;
                if ((q = scan(p, m, '.')) <= p) break;  p = q;
                if ((q = scanByte(p, m)) <= p) break;   p = q;
                if (q < m) break;
                return q;
            }
            fail("Malformed IPv4 address", q);
            return -1;
        }

        // Take an IPv4 address: Throw an exception if the given interval
        // contains anything except an IPv4 address
        //
        private int takeIPv4Address(int start, int n, String expected)
            throws URISyntaxException
        {
            int p = scanIPv4Address(start, n, true);
            if (p <= start)
                failExpecting(expected, start);
            return p;
        }

        // Attempt to parse an IPv4 address, returning -1 on failure but
        // allowing the given interval to contain [:<characters>] after
        // the IPv4 address.
        //
        // On success the host is set to the address and the position after
        // it is returned.
        //
        private int parseIPv4Address(int start, int n) {
            int p;

            try {
                p = scanIPv4Address(start, n, false);
            } catch (URISyntaxException | NumberFormatException x) {
                // Not an IPv4 address; the caller tries a hostname next
                return -1;
            }

            if (p > start && p < n) {
                // IPv4 address is followed by something - check that
                // it's a ":" as this is the only valid character to
                // follow an address.
                if (input.charAt(p) != ':') {
                    p = -1;
                }
            }

            if (p > start)
                host = input.substring(start, p);

            return p;
        }

        // hostname      = domainlabel [ "." ] | 1*( domainlabel "." ) toplabel [ "." ]
        // domainlabel   = alphanum | alphanum *( alphanum | "-" ) alphanum
        // toplabel      = alpha | alpha *( alphanum | "-" ) alphanum
        //
        // On success the host is set and the position after the hostname is
        // returned.  If skipParseException is true then an illegal character
        // after the labels is reported quietly, by returning its position
        // without setting the host, instead of by throwing.
        //
        private int parseHostname(int start, int n, boolean skipParseException)
            throws URISyntaxException
        {
            int p = start;
            int q;
            int l = -1;                 // Start of last parsed label

            do {
                // domainlabel = alphanum [ *( alphanum | "-" ) alphanum ]
                q = scan(p, n, L_ALPHANUM, H_ALPHANUM);
                if (q <= p)
                    break;
                l = p;
                p = q;
                q = scan(p, n, L_ALPHANUM | L_DASH, H_ALPHANUM | H_DASH);
                if (q > p) {
                    // A label must not end with a dash
                    if (input.charAt(q - 1) == '-')
                        fail("Illegal character in hostname", q - 1);
                    p = q;
                }
                q = scan(p, n, '.');
                if (q <= p)
                    break;
                p = q;
            } while (p < n);

            if ((p < n) && !at(p, n, ':')) {
                if (skipParseException) {
                    return p;
                }
                fail("Illegal character in hostname", p);
            }
            if (l < 0)
                failExpecting("hostname", start);

            // for a fully qualified hostname check that the rightmost
            // label starts with an alpha character.
            if (l > start && !match(input.charAt(l), L_ALPHA, H_ALPHA)) {
                fail("Illegal character in hostname", l);
            }

            host = input.substring(start, p);
            return p;
        }


        // IPv6 address parsing, from RFC2373: IPv6 Addressing Architecture
        //
        // Bug: The grammar in RFC2373 Appendix B does not allow addresses of
        // the form ::12.34.56.78, which are clearly shown in the examples
        // earlier in the document.  Here is the original grammar:
        //
        //   IPv6address = hexpart [ ":" IPv4address ]
        //   hexpart     = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ]
        //   hexseq      = hex4 *( ":" hex4)
        //   hex4        = 1*4HEXDIG
        //
        // We therefore use the following revised grammar:
        //
        //   IPv6address = hexseq [ ":" IPv4address ]
        //                 | hexseq [ "::" [ hexpost ] ]
        //                 | "::" [ hexpost ]
        //   hexpost     = hexseq | hexseq ":" IPv4address | IPv4address
        //   hexseq      = hex4 *( ":" hex4)
        //   hex4        = 1*4HEXDIG
        //
        // This covers all and only the following cases:
        //
        //   hexseq
        //   hexseq : IPv4address
        //   hexseq ::
        //   hexseq :: hexseq
        //   hexseq :: hexseq : IPv4address
        //   hexseq :: IPv4address
        //   :: hexseq
        //   :: hexseq : IPv4address
        //   :: IPv4address
        //   ::
        //
        // Additionally we constrain the IPv6 address as follows :-
        //
        //  i.  IPv6 addresses without compressed zeros should contain
        //      exactly 16 bytes.
        //
        //  ii. IPv6 addresses with compressed zeros should contain
        //      less than 16 bytes.

        // Number of address bytes seen so far: scanHexSeq adds 2 per hex
        // group and each embedded IPv4 address adds 4.  The count is only
        // ever increased, never reset, by the methods below.
        private int ipv6byteCount = 0;

        // Parses the IPv6 address in [start, n) (the enclosing brackets and
        // any scope id are excluded by the caller) and enforces the byte
        // count constraints listed above
        //
        private int parseIPv6Reference(int start, int n)
            throws URISyntaxException
        {
            int p = start;
            int q;
            boolean compressedZeros = false;

            q = scanHexSeq(p, n);

            if (q > p) {
                p = q;
                if (at(p, n, "::")) {
                    compressedZeros = true;
                    p = scanHexPost(p + 2, n);
                } else if (at(p, n, ':')) {
                    p = takeIPv4Address(p + 1, n, "IPv4 address");
                    ipv6byteCount += 4;
                }
            } else if (at(p, n, "::")) {
                compressedZeros = true;
                p = scanHexPost(p + 2, n);
            }
            if (p < n)
                fail("Malformed IPv6 address", start);
            if (ipv6byteCount > 16)
                fail("IPv6 address too long", start);
            if (!compressedZeros && ipv6byteCount < 16)
                fail("IPv6 address too short", start);
            if (compressedZeros && ipv6byteCount == 16)
                fail("Malformed IPv6 address", start);

            return p;
        }

        // Scans the part of an IPv6 address that follows "::", i.e. the
        // optional hexpost of the revised grammar above
        //
        private int scanHexPost(int start, int n)
            throws URISyntaxException
        {
            int p = start;
            int q;

            // Nothing follows the "::"
            if (p == n)
                return p;

            q = scanHexSeq(p, n);
            if (q > p) {
                p = q;
                if (at(p, n, ':')) {
                    p++;
                    p = takeIPv4Address(p, n, "hex digits or IPv4 address");
                    ipv6byteCount += 4;
                }
            } else {
                p = takeIPv4Address(p, n, "hex digits or IPv4 address");
                ipv6byteCount += 4;
            }
            return p;
        }

        // Scan a hex sequence; return -1 if one could not be scanned
        //
        // Each group may have at most four hex digits.  Scanning stops in
        // front of a "::", and in front of the ":" that precedes an embedded
        // IPv4 address (recognized by the "." that follows its first number).
        //
        private int scanHexSeq(int start, int n)
            throws URISyntaxException
        {
            int p = start;
            int q;

            q = scan(p, n, L_HEX, H_HEX);
            if (q <= p)
                return -1;
            if (at(q, n, '.'))          // Beginning of IPv4 address
                return -1;
            if (q > p + 4)
                fail("IPv6 hexadecimal digit sequence too long", p);
            ipv6byteCount += 2;
            p = q;
            while (p < n) {
                if (!at(p, n, ':'))
                    break;
                if (at(p + 1, n, ':'))
                    break;              // "::"
                p++;
                q = scan(p, n, L_HEX, H_HEX);
                if (q <= p)
                    failExpecting("digits for an IPv6 address", p);
                if (at(q, n, '.')) {    // Beginning of IPv4 address
                    p--;                // Back up to the ':' before it
                    break;
                }
                if (q > p + 4)
                    fail("IPv6 hexadecimal digit sequence too long", p);
                ipv6byteCount += 2;
                p = q;
            }

            return p;
        }

    }

    // Registers, via SharedSecrets, a JavaNetUriAccess whose create method
    // invokes the URI(scheme, path) constructor
    static {
        SharedSecrets.setJavaNetUriAccess(
            new JavaNetUriAccess() {
                public URI create(String scheme, String path) {
                    return new URI(scheme, path);
                }
            }
        );
    }
}