Rahul Ravikumar | 0533600 | 2019-10-14 15:04:32 -0700 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (C) 2014 The Android Open Source Project |
| 3 | * Copyright (c) 2005, 2013, Oracle and/or its affiliates. All rights reserved. |
| 4 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
| 5 | * |
| 6 | * This code is free software; you can redistribute it and/or modify it |
| 7 | * under the terms of the GNU General Public License version 2 only, as |
| 8 | * published by the Free Software Foundation. Oracle designates this |
| 9 | * particular file as subject to the "Classpath" exception as provided |
| 10 | * by Oracle in the LICENSE file that accompanied this code. |
| 11 | * |
| 12 | * This code is distributed in the hope that it will be useful, but WITHOUT |
| 13 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| 14 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| 15 | * version 2 for more details (a copy is included in the LICENSE file that |
| 16 | * accompanied this code). |
| 17 | * |
| 18 | * You should have received a copy of the GNU General Public License version |
| 19 | * 2 along with this work; if not, write to the Free Software Foundation, |
| 20 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
| 21 | * |
| 22 | * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
| 23 | * or visit www.oracle.com if you need additional information or have any |
| 24 | * questions. |
| 25 | */ |
| 26 | package java.net; |
| 27 | |
| 28 | import android.icu.text.IDNA; |
| 29 | |
| 30 | /** |
| 31 | * Provides methods to convert internationalized domain names (IDNs) between |
| 32 | * a normal Unicode representation and an ASCII Compatible Encoding (ACE) representation. |
| 33 | * Internationalized domain names can use characters from the entire range of |
| 34 | * Unicode, while traditional domain names are restricted to ASCII characters. |
| 35 | * ACE is an encoding of Unicode strings that uses only ASCII characters and |
| 36 | * can be used with software (such as the Domain Name System) that only |
| 37 | * understands traditional domain names. |
| 38 | * |
| 39 | * <p>Internationalized domain names are defined in <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>. |
| 40 | * RFC 3490 defines two operations: ToASCII and ToUnicode. These 2 operations employ |
| 41 | * <a href="http://www.ietf.org/rfc/rfc3491.txt">Nameprep</a> algorithm, which is a |
| 42 | * profile of <a href="http://www.ietf.org/rfc/rfc3454.txt">Stringprep</a>, and |
| 43 | * <a href="http://www.ietf.org/rfc/rfc3492.txt">Punycode</a> algorithm to convert |
| 44 | * domain name string back and forth. |
| 45 | * |
| 46 | * <p>The behavior of the aforementioned conversion process can be adjusted by various flags: |
| 47 | * <ul> |
| 48 | * <li>If the ALLOW_UNASSIGNED flag is used, the domain name string to be converted |
| 49 | * can contain code points that are unassigned in Unicode 3.2, which is the |
| 50 | * Unicode version on which IDN conversion is based. If the flag is not used, |
| 51 | * the presence of such unassigned code points is treated as an error. |
| 52 | * <li>If the USE_STD3_ASCII_RULES flag is used, ASCII strings are checked against <a href="http://www.ietf.org/rfc/rfc1122.txt">RFC 1122</a> and <a href="http://www.ietf.org/rfc/rfc1123.txt">RFC 1123</a>. |
| 53 | * It is an error if they don't meet the requirements. |
| 54 | * </ul> |
| 55 | * These flags can be logically OR'ed together. |
| 56 | * |
| 57 | * <p>Security considerations are important with respect to internationalized |
| 58 | * domain name support. For example, English domain names may be <i>homographed</i> |
| 59 | * - maliciously misspelled by substitution of non-Latin letters. |
| 60 | * <a href="http://www.unicode.org/reports/tr36/">Unicode Technical Report #36</a> |
| 61 | * discusses security issues of IDN support as well as possible solutions. |
| 62 | * Applications are responsible for taking adequate security measures when using |
| 63 | * international domain names. |
| 64 | * |
| 65 | * @author Edward Wang |
| 66 | * @since 1.6 |
| 67 | * |
| 68 | */ |
| 69 | public final class IDN { |
| 70 | /** |
| 71 | * Flag to allow processing of unassigned code points |
| 72 | */ |
| 73 | public static final int ALLOW_UNASSIGNED = 0x01; |
| 74 | |
| 75 | /** |
| 76 | * Flag to turn on the check against STD-3 ASCII rules |
| 77 | */ |
| 78 | public static final int USE_STD3_ASCII_RULES = 0x02; |
| 79 | |
| 80 | |
| 81 | /** |
| 82 | * Translates a string from Unicode to ASCII Compatible Encoding (ACE), |
| 83 | * as defined by the ToASCII operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>. |
| 84 | * |
| 85 | * <p>ToASCII operation can fail. ToASCII fails if any step of it fails. |
| 86 | * If ToASCII operation fails, an IllegalArgumentException will be thrown. |
| 87 | * In this case, the input string should not be used in an internationalized domain name. |
| 88 | * |
| 89 | * <p> A label is an individual part of a domain name. The original ToASCII operation, |
| 90 | * as defined in RFC 3490, only operates on a single label. This method can handle |
| 91 | * both label and entire domain name, by assuming that labels in a domain name are |
| 92 | * always separated by dots. The following characters are recognized as dots: |
| 93 | * \u002E (full stop), \u3002 (ideographic full stop), \uFF0E (fullwidth full stop), |
| 94 | * and \uFF61 (halfwidth ideographic full stop). if dots are |
| 95 | * used as label separators, this method also changes all of them to \u002E (full stop) |
| 96 | * in output translated string. |
| 97 | * |
| 98 | * @param input the string to be processed |
| 99 | * @param flag process flag; can be 0 or any logical OR of possible flags |
| 100 | * |
| 101 | * @return the translated {@code String} |
| 102 | * |
| 103 | * @throws IllegalArgumentException if the input string doesn't conform to RFC 3490 specification |
| 104 | */ |
| 105 | public static String toASCII(String input, int flag) { |
| 106 | // BEGIN Android-changed: Use ICU4J implementation |
| 107 | try { |
| 108 | return IDNA.convertIDNToASCII(input, flag).toString(); |
| 109 | } catch (android.icu.text.StringPrepParseException e) { |
| 110 | // b/113787610: "." is a valid IDN but is rejected by ICU. |
| 111 | // Usage is relatively uncommon, so only check for it if ICU throws. |
| 112 | if (".".equals(input)) { |
| 113 | return input; |
| 114 | } |
| 115 | throw new IllegalArgumentException("Invalid input to toASCII: " + input, e); |
| 116 | } |
| 117 | // END Android-changed: Use ICU4J implementation |
| 118 | } |
| 119 | |
| 120 | |
| 121 | /** |
| 122 | * Translates a string from Unicode to ASCII Compatible Encoding (ACE), |
| 123 | * as defined by the ToASCII operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>. |
| 124 | * |
| 125 | * <p> This convenience method works as if by invoking the |
| 126 | * two-argument counterpart as follows: |
| 127 | * <blockquote> |
| 128 | * {@link #toASCII(String, int) toASCII}(input, 0); |
| 129 | * </blockquote> |
| 130 | * |
| 131 | * @param input the string to be processed |
| 132 | * |
| 133 | * @return the translated {@code String} |
| 134 | * |
| 135 | * @throws IllegalArgumentException if the input string doesn't conform to RFC 3490 specification |
| 136 | */ |
| 137 | public static String toASCII(String input) { |
| 138 | return toASCII(input, 0); |
| 139 | } |
| 140 | |
| 141 | |
| 142 | /** |
| 143 | * Translates a string from ASCII Compatible Encoding (ACE) to Unicode, |
| 144 | * as defined by the ToUnicode operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>. |
| 145 | * |
| 146 | * <p>ToUnicode never fails. In case of any error, the input string is returned unmodified. |
| 147 | * |
| 148 | * <p> A label is an individual part of a domain name. The original ToUnicode operation, |
| 149 | * as defined in RFC 3490, only operates on a single label. This method can handle |
| 150 | * both label and entire domain name, by assuming that labels in a domain name are |
| 151 | * always separated by dots. The following characters are recognized as dots: |
| 152 | * \u002E (full stop), \u3002 (ideographic full stop), \uFF0E (fullwidth full stop), |
| 153 | * and \uFF61 (halfwidth ideographic full stop). |
| 154 | * |
| 155 | * @param input the string to be processed |
| 156 | * @param flag process flag; can be 0 or any logical OR of possible flags |
| 157 | * |
| 158 | * @return the translated {@code String} |
| 159 | */ |
| 160 | public static String toUnicode(String input, int flag) { |
| 161 | // BEGIN Android-changed: Use ICU4J implementation |
| 162 | try { |
| 163 | // ICU only translates separators to ASCII for toASCII. |
| 164 | // Java expects the translation for toUnicode too. |
| 165 | return convertFullStop(IDNA.convertIDNToUnicode(input, flag)).toString(); |
| 166 | } catch (android.icu.text.StringPrepParseException e) { |
| 167 | // The RI documentation explicitly states that if the conversion was unsuccessful |
| 168 | // the original string is returned. |
| 169 | return input; |
| 170 | } |
| 171 | // END Android-changed: Use ICU4J implementation |
| 172 | } |
| 173 | |
| 174 | // BEGIN Android-added: Use ICU4J implementation |
| 175 | private static boolean isLabelSeperator(char c) { |
| 176 | return (c == '\u3002' || c == '\uff0e' || c == '\uff61'); |
| 177 | } |
| 178 | |
| 179 | private static StringBuffer convertFullStop(StringBuffer input) { |
| 180 | for (int i = 0; i < input.length(); i++) { |
| 181 | if (isLabelSeperator(input.charAt(i))) { |
| 182 | input.setCharAt(i, '.'); |
| 183 | } |
| 184 | } |
| 185 | return input; |
| 186 | } |
| 187 | // END Android-added: Use ICU4J implementation |
| 188 | |
| 189 | /** |
| 190 | * Translates a string from ASCII Compatible Encoding (ACE) to Unicode, |
| 191 | * as defined by the ToUnicode operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>. |
| 192 | * |
| 193 | * <p> This convenience method works as if by invoking the |
| 194 | * two-argument counterpart as follows: |
| 195 | * <blockquote> |
| 196 | * {@link #toUnicode(String, int) toUnicode}(input, 0); |
| 197 | * </blockquote> |
| 198 | * |
| 199 | * @param input the string to be processed |
| 200 | * |
| 201 | * @return the translated {@code String} |
| 202 | */ |
| 203 | public static String toUnicode(String input) { |
| 204 | return toUnicode(input, 0); |
| 205 | } |
| 206 | |
| 207 | |
| 208 | /* ---------------- Private members -------------- */ |
| 209 | |
| 210 | // Android-removed: Private helper methods, unused because we use ICU. |
| 211 | /* |
| 212 | // ACE Prefix is "xn--" |
| 213 | private static final String ACE_PREFIX = "xn--"; |
| 214 | private static final int ACE_PREFIX_LENGTH = ACE_PREFIX.length(); |
| 215 | |
| 216 | private static final int MAX_LABEL_LENGTH = 63; |
| 217 | |
| 218 | // single instance of nameprep |
| 219 | private static StringPrep namePrep = null; |
| 220 | |
| 221 | static { |
| 222 | InputStream stream = null; |
| 223 | |
| 224 | try { |
| 225 | final String IDN_PROFILE = "uidna.spp"; |
| 226 | if (System.getSecurityManager() != null) { |
| 227 | stream = AccessController.doPrivileged(new PrivilegedAction<InputStream>() { |
| 228 | public InputStream run() { |
| 229 | return StringPrep.class.getResourceAsStream(IDN_PROFILE); |
| 230 | } |
| 231 | }); |
| 232 | } else { |
| 233 | stream = StringPrep.class.getResourceAsStream(IDN_PROFILE); |
| 234 | } |
| 235 | |
| 236 | namePrep = new StringPrep(stream); |
| 237 | stream.close(); |
| 238 | } catch (IOException e) { |
| 239 | // should never reach here |
| 240 | assert false; |
| 241 | } |
| 242 | } |
| 243 | */ |
| 244 | |
| 245 | /* ---------------- Private operations -------------- */ |
| 246 | |
| 247 | |
//
// Private no-argument constructor; it replaces the default zero-argument
// constructor that the compiler would otherwise generate.
//
private IDN() {
    // Intentionally empty.
}
| 252 | |
| 253 | // Android-removed: Private helper methods, unused because we use ICU. |
| 254 | /* |
| 255 | // |
| 256 | // toASCII operation; should only apply to a single label |
| 257 | // |
| 258 | private static String toASCIIInternal(String label, int flag) |
| 259 | { |
| 260 | // step 1 |
| 261 | // Check if the string contains code points outside the ASCII range 0..0x7f. |
| 262 | boolean isASCII = isAllASCII(label); |
| 263 | StringBuffer dest; |
| 264 | |
| 265 | // step 2 |
| 266 | // perform the nameprep operation; flag ALLOW_UNASSIGNED is used here |
| 267 | if (!isASCII) { |
| 268 | UCharacterIterator iter = UCharacterIterator.getInstance(label); |
| 269 | try { |
| 270 | dest = namePrep.prepare(iter, flag); |
| 271 | } catch (java.text.ParseException e) { |
| 272 | throw new IllegalArgumentException(e); |
| 273 | } |
| 274 | } else { |
| 275 | dest = new StringBuffer(label); |
| 276 | } |
| 277 | |
| 278 | // step 8, move forward to check the smallest number of the code points |
| 279 | // the length must be inside 1..63 |
| 280 | if (dest.length() == 0) { |
| 281 | throw new IllegalArgumentException( |
| 282 | "Empty label is not a legal name"); |
| 283 | } |
| 284 | |
| 285 | // step 3 |
| 286 | // Verify the absence of non-LDH ASCII code points |
| 287 | // 0..0x2c, 0x2e..0x2f, 0x3a..0x40, 0x5b..0x60, 0x7b..0x7f |
| 288 | // Verify the absence of leading and trailing hyphen |
| 289 | boolean useSTD3ASCIIRules = ((flag & USE_STD3_ASCII_RULES) != 0); |
| 290 | if (useSTD3ASCIIRules) { |
| 291 | for (int i = 0; i < dest.length(); i++) { |
| 292 | int c = dest.charAt(i); |
| 293 | if (isNonLDHAsciiCodePoint(c)) { |
| 294 | throw new IllegalArgumentException( |
| 295 | "Contains non-LDH ASCII characters"); |
| 296 | } |
| 297 | } |
| 298 | |
| 299 | if (dest.charAt(0) == '-' || |
| 300 | dest.charAt(dest.length() - 1) == '-') { |
| 301 | |
| 302 | throw new IllegalArgumentException( |
| 303 | "Has leading or trailing hyphen"); |
| 304 | } |
| 305 | } |
| 306 | |
| 307 | if (!isASCII) { |
| 308 | // step 4 |
| 309 | // If all code points are inside 0..0x7f, skip to step 8 |
| 310 | if (!isAllASCII(dest.toString())) { |
| 311 | // step 5 |
| 312 | // verify the sequence does not begin with ACE prefix |
| 313 | if(!startsWithACEPrefix(dest)){ |
| 314 | |
| 315 | // step 6 |
| 316 | // encode the sequence with punycode |
| 317 | try { |
| 318 | dest = Punycode.encode(dest, null); |
| 319 | } catch (java.text.ParseException e) { |
| 320 | throw new IllegalArgumentException(e); |
| 321 | } |
| 322 | |
| 323 | dest = toASCIILower(dest); |
| 324 | |
| 325 | // step 7 |
| 326 | // prepend the ACE prefix |
| 327 | dest.insert(0, ACE_PREFIX); |
| 328 | } else { |
| 329 | throw new IllegalArgumentException("The input starts with the ACE Prefix"); |
| 330 | } |
| 331 | |
| 332 | } |
| 333 | } |
| 334 | |
| 335 | // step 8 |
| 336 | // the length must be inside 1..63 |
| 337 | if (dest.length() > MAX_LABEL_LENGTH) { |
| 338 | throw new IllegalArgumentException("The label in the input is too long"); |
| 339 | } |
| 340 | |
| 341 | return dest.toString(); |
| 342 | } |
| 343 | |
| 344 | // |
| 345 | // toUnicode operation; should only apply to a single label |
| 346 | // |
| 347 | private static String toUnicodeInternal(String label, int flag) { |
| 348 | boolean[] caseFlags = null; |
| 349 | StringBuffer dest; |
| 350 | |
| 351 | // step 1 |
| 352 | // find out if all the codepoints in input are ASCII |
| 353 | boolean isASCII = isAllASCII(label); |
| 354 | |
| 355 | if(!isASCII){ |
| 356 | // step 2 |
| 357 | // perform the nameprep operation; flag ALLOW_UNASSIGNED is used here |
| 358 | try { |
| 359 | UCharacterIterator iter = UCharacterIterator.getInstance(label); |
| 360 | dest = namePrep.prepare(iter, flag); |
| 361 | } catch (Exception e) { |
| 362 | // toUnicode never fails; if any step fails, return the input string |
| 363 | return label; |
| 364 | } |
| 365 | } else { |
| 366 | dest = new StringBuffer(label); |
| 367 | } |
| 368 | |
| 369 | // step 3 |
| 370 | // verify ACE Prefix |
| 371 | if(startsWithACEPrefix(dest)) { |
| 372 | |
| 373 | // step 4 |
| 374 | // Remove the ACE Prefix |
| 375 | String temp = dest.substring(ACE_PREFIX_LENGTH, dest.length()); |
| 376 | |
| 377 | try { |
| 378 | // step 5 |
| 379 | // Decode using punycode |
| 380 | StringBuffer decodeOut = Punycode.decode(new StringBuffer(temp), null); |
| 381 | |
| 382 | // step 6 |
| 383 | // Apply toASCII |
| 384 | String toASCIIOut = toASCII(decodeOut.toString(), flag); |
| 385 | |
| 386 | // step 7 |
| 387 | // verify |
| 388 | if (toASCIIOut.equalsIgnoreCase(dest.toString())) { |
| 389 | // step 8 |
| 390 | // return output of step 5 |
| 391 | return decodeOut.toString(); |
| 392 | } |
| 393 | } catch (Exception ignored) { |
| 394 | // no-op |
| 395 | } |
| 396 | } |
| 397 | |
| 398 | // just return the input |
| 399 | return label; |
| 400 | } |
| 401 | |
| 402 | |
| 403 | // |
| 404 | // LDH stands for "letter/digit/hyphen", with characters restricted to the |
| 405 | // 26-letter Latin alphabet <A-Z a-z>, the digits <0-9>, and the hyphen |
| 406 | // <->. |
| 407 | // Non LDH refers to characters in the ASCII range, but which are not |
| 408 | // letters, digits or the hyphen. |
| 409 | // |
| 410 | // non-LDH = 0..0x2C, 0x2E..0x2F, 0x3A..0x40, 0x5B..0x60, 0x7B..0x7F |
| 411 | // |
| 412 | private static boolean isNonLDHAsciiCodePoint(int ch){ |
| 413 | return (0x0000 <= ch && ch <= 0x002C) || |
| 414 | (0x002E <= ch && ch <= 0x002F) || |
| 415 | (0x003A <= ch && ch <= 0x0040) || |
| 416 | (0x005B <= ch && ch <= 0x0060) || |
| 417 | (0x007B <= ch && ch <= 0x007F); |
| 418 | } |
| 419 | |
| 420 | // |
| 421 | // search dots in a string and return the index of that character; |
| 422 | // or if there are no dots, return the length of the input string |
| 423 | // dots might be: \u002E (full stop), \u3002 (ideographic full stop), \uFF0E (fullwidth full stop), |
| 424 | // and \uFF61 (halfwidth ideographic full stop). |
| 425 | // |
| 426 | private static int searchDots(String s, int start) { |
| 427 | int i; |
| 428 | for (i = start; i < s.length(); i++) { |
| 429 | if (isLabelSeparator(s.charAt(i))) { |
| 430 | break; |
| 431 | } |
| 432 | } |
| 433 | |
| 434 | return i; |
| 435 | } |
| 436 | |
| 437 | // |
| 438 | // to check if a string is a root label, ".". |
| 439 | // |
| 440 | private static boolean isRootLabel(String s) { |
| 441 | return (s.length() == 1 && isLabelSeparator(s.charAt(0))); |
| 442 | } |
| 443 | |
| 444 | // |
| 445 | // to check if a character is a label separator, i.e. a dot character. |
| 446 | // |
| 447 | private static boolean isLabelSeparator(char c) { |
| 448 | return (c == '.' || c == '\u3002' || c == '\uFF0E' || c == '\uFF61'); |
| 449 | } |
| 450 | |
| 451 | // |
| 452 | // to check if a string only contains US-ASCII code point |
| 453 | // |
| 454 | private static boolean isAllASCII(String input) { |
| 455 | boolean isASCII = true; |
| 456 | for (int i = 0; i < input.length(); i++) { |
| 457 | int c = input.charAt(i); |
| 458 | if (c > 0x7F) { |
| 459 | isASCII = false; |
| 460 | break; |
| 461 | } |
| 462 | } |
| 463 | return isASCII; |
| 464 | } |
| 465 | |
| 466 | // |
| 467 | // to check if a string starts with ACE-prefix |
| 468 | // |
| 469 | private static boolean startsWithACEPrefix(StringBuffer input){ |
| 470 | boolean startsWithPrefix = true; |
| 471 | |
| 472 | if(input.length() < ACE_PREFIX_LENGTH){ |
| 473 | return false; |
| 474 | } |
| 475 | for(int i = 0; i < ACE_PREFIX_LENGTH; i++){ |
| 476 | if(toASCIILower(input.charAt(i)) != ACE_PREFIX.charAt(i)){ |
| 477 | startsWithPrefix = false; |
| 478 | } |
| 479 | } |
| 480 | return startsWithPrefix; |
| 481 | } |
| 482 | |
| 483 | private static char toASCIILower(char ch){ |
| 484 | if('A' <= ch && ch <= 'Z'){ |
| 485 | return (char)(ch + 'a' - 'A'); |
| 486 | } |
| 487 | return ch; |
| 488 | } |
| 489 | |
| 490 | private static StringBuffer toASCIILower(StringBuffer input){ |
| 491 | StringBuffer dest = new StringBuffer(); |
| 492 | for(int i = 0; i < input.length();i++){ |
| 493 | dest.append(toASCIILower(input.charAt(i))); |
| 494 | } |
| 495 | return dest; |
| 496 | } |
| 497 | */ |
| 498 | } |