blob: a18c3a8daceff374e56df78e2c40cd1945fe617f [file] [log] [blame]
Rahul Ravikumar05336002019-10-14 15:04:32 -07001/*
2 * Copyright (C) 2014 The Android Open Source Project
3 * Copyright (c) 2005, 2013, Oracle and/or its affiliates. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation. Oracle designates this
9 * particular file as subject to the "Classpath" exception as provided
10 * by Oracle in the LICENSE file that accompanied this code.
11 *
12 * This code is distributed in the hope that it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 * version 2 for more details (a copy is included in the LICENSE file that
16 * accompanied this code).
17 *
18 * You should have received a copy of the GNU General Public License version
19 * 2 along with this work; if not, write to the Free Software Foundation,
20 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
21 *
22 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
23 * or visit www.oracle.com if you need additional information or have any
24 * questions.
25 */
26package java.net;
27
28import android.icu.text.IDNA;
29
30/**
31 * Provides methods to convert internationalized domain names (IDNs) between
32 * a normal Unicode representation and an ASCII Compatible Encoding (ACE) representation.
33 * Internationalized domain names can use characters from the entire range of
34 * Unicode, while traditional domain names are restricted to ASCII characters.
35 * ACE is an encoding of Unicode strings that uses only ASCII characters and
36 * can be used with software (such as the Domain Name System) that only
37 * understands traditional domain names.
38 *
39 * <p>Internationalized domain names are defined in <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
 * RFC 3490 defines two operations: ToASCII and ToUnicode. These two operations employ the
 * <a href="http://www.ietf.org/rfc/rfc3491.txt">Nameprep</a> algorithm, which is a
 * profile of <a href="http://www.ietf.org/rfc/rfc3454.txt">Stringprep</a>, and the
 * <a href="http://www.ietf.org/rfc/rfc3492.txt">Punycode</a> algorithm to convert
 * domain name strings back and forth.
45 *
46 * <p>The behavior of aforementioned conversion process can be adjusted by various flags:
47 * <ul>
48 * <li>If the ALLOW_UNASSIGNED flag is used, the domain name string to be converted
49 * can contain code points that are unassigned in Unicode 3.2, which is the
50 * Unicode version on which IDN conversion is based. If the flag is not used,
51 * the presence of such unassigned code points is treated as an error.
52 * <li>If the USE_STD3_ASCII_RULES flag is used, ASCII strings are checked against <a href="http://www.ietf.org/rfc/rfc1122.txt">RFC 1122</a> and <a href="http://www.ietf.org/rfc/rfc1123.txt">RFC 1123</a>.
53 * It is an error if they don't meet the requirements.
54 * </ul>
55 * These flags can be logically OR'ed together.
56 *
 * <p>Security considerations are important with respect to internationalized
 * domain name support. For example, English domain names may be <i>homographed</i>
 * - maliciously misspelled by substitution of non-Latin letters.
60 * <a href="http://www.unicode.org/reports/tr36/">Unicode Technical Report #36</a>
61 * discusses security issues of IDN support as well as possible solutions.
62 * Applications are responsible for taking adequate security measures when using
63 * international domain names.
64 *
65 * @author Edward Wang
66 * @since 1.6
67 *
68 */
69public final class IDN {
70 /**
71 * Flag to allow processing of unassigned code points
72 */
73 public static final int ALLOW_UNASSIGNED = 0x01;
74
75 /**
76 * Flag to turn on the check against STD-3 ASCII rules
77 */
78 public static final int USE_STD3_ASCII_RULES = 0x02;
79
80
81 /**
82 * Translates a string from Unicode to ASCII Compatible Encoding (ACE),
83 * as defined by the ToASCII operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
84 *
85 * <p>ToASCII operation can fail. ToASCII fails if any step of it fails.
86 * If ToASCII operation fails, an IllegalArgumentException will be thrown.
87 * In this case, the input string should not be used in an internationalized domain name.
88 *
89 * <p> A label is an individual part of a domain name. The original ToASCII operation,
90 * as defined in RFC 3490, only operates on a single label. This method can handle
91 * both label and entire domain name, by assuming that labels in a domain name are
92 * always separated by dots. The following characters are recognized as dots:
93 * &#0092;u002E (full stop), &#0092;u3002 (ideographic full stop), &#0092;uFF0E (fullwidth full stop),
     * and &#0092;uFF61 (halfwidth ideographic full stop). If dots are
95 * used as label separators, this method also changes all of them to &#0092;u002E (full stop)
96 * in output translated string.
97 *
98 * @param input the string to be processed
99 * @param flag process flag; can be 0 or any logical OR of possible flags
100 *
101 * @return the translated {@code String}
102 *
103 * @throws IllegalArgumentException if the input string doesn't conform to RFC 3490 specification
104 */
105 public static String toASCII(String input, int flag) {
106 // BEGIN Android-changed: Use ICU4J implementation
107 try {
108 return IDNA.convertIDNToASCII(input, flag).toString();
109 } catch (android.icu.text.StringPrepParseException e) {
110 // b/113787610: "." is a valid IDN but is rejected by ICU.
111 // Usage is relatively uncommon, so only check for it if ICU throws.
112 if (".".equals(input)) {
113 return input;
114 }
115 throw new IllegalArgumentException("Invalid input to toASCII: " + input, e);
116 }
117 // END Android-changed: Use ICU4J implementation
118 }
119
120
121 /**
122 * Translates a string from Unicode to ASCII Compatible Encoding (ACE),
123 * as defined by the ToASCII operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
124 *
125 * <p> This convenience method works as if by invoking the
126 * two-argument counterpart as follows:
127 * <blockquote>
128 * {@link #toASCII(String, int) toASCII}(input,&nbsp;0);
129 * </blockquote>
130 *
131 * @param input the string to be processed
132 *
133 * @return the translated {@code String}
134 *
135 * @throws IllegalArgumentException if the input string doesn't conform to RFC 3490 specification
136 */
137 public static String toASCII(String input) {
138 return toASCII(input, 0);
139 }
140
141
142 /**
143 * Translates a string from ASCII Compatible Encoding (ACE) to Unicode,
144 * as defined by the ToUnicode operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
145 *
146 * <p>ToUnicode never fails. In case of any error, the input string is returned unmodified.
147 *
148 * <p> A label is an individual part of a domain name. The original ToUnicode operation,
149 * as defined in RFC 3490, only operates on a single label. This method can handle
150 * both label and entire domain name, by assuming that labels in a domain name are
151 * always separated by dots. The following characters are recognized as dots:
152 * &#0092;u002E (full stop), &#0092;u3002 (ideographic full stop), &#0092;uFF0E (fullwidth full stop),
153 * and &#0092;uFF61 (halfwidth ideographic full stop).
154 *
155 * @param input the string to be processed
156 * @param flag process flag; can be 0 or any logical OR of possible flags
157 *
158 * @return the translated {@code String}
159 */
160 public static String toUnicode(String input, int flag) {
161 // BEGIN Android-changed: Use ICU4J implementation
162 try {
163 // ICU only translates separators to ASCII for toASCII.
164 // Java expects the translation for toUnicode too.
165 return convertFullStop(IDNA.convertIDNToUnicode(input, flag)).toString();
166 } catch (android.icu.text.StringPrepParseException e) {
167 // The RI documentation explicitly states that if the conversion was unsuccessful
168 // the original string is returned.
169 return input;
170 }
171 // END Android-changed: Use ICU4J implementation
172 }
173
174 // BEGIN Android-added: Use ICU4J implementation
175 private static boolean isLabelSeperator(char c) {
176 return (c == '\u3002' || c == '\uff0e' || c == '\uff61');
177 }
178
179 private static StringBuffer convertFullStop(StringBuffer input) {
180 for (int i = 0; i < input.length(); i++) {
181 if (isLabelSeperator(input.charAt(i))) {
182 input.setCharAt(i, '.');
183 }
184 }
185 return input;
186 }
187 // END Android-added: Use ICU4J implementation
188
189 /**
190 * Translates a string from ASCII Compatible Encoding (ACE) to Unicode,
191 * as defined by the ToUnicode operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
192 *
193 * <p> This convenience method works as if by invoking the
194 * two-argument counterpart as follows:
195 * <blockquote>
196 * {@link #toUnicode(String, int) toUnicode}(input,&nbsp;0);
197 * </blockquote>
198 *
199 * @param input the string to be processed
200 *
201 * @return the translated {@code String}
202 */
203 public static String toUnicode(String input) {
204 return toUnicode(input, 0);
205 }
206
207
208 /* ---------------- Private members -------------- */
209
210 // Android-removed: Private helper methods, unused because we use ICU.
211 /*
212 // ACE Prefix is "xn--"
213 private static final String ACE_PREFIX = "xn--";
214 private static final int ACE_PREFIX_LENGTH = ACE_PREFIX.length();
215
216 private static final int MAX_LABEL_LENGTH = 63;
217
218 // single instance of nameprep
219 private static StringPrep namePrep = null;
220
221 static {
222 InputStream stream = null;
223
224 try {
225 final String IDN_PROFILE = "uidna.spp";
226 if (System.getSecurityManager() != null) {
227 stream = AccessController.doPrivileged(new PrivilegedAction<InputStream>() {
228 public InputStream run() {
229 return StringPrep.class.getResourceAsStream(IDN_PROFILE);
230 }
231 });
232 } else {
233 stream = StringPrep.class.getResourceAsStream(IDN_PROFILE);
234 }
235
236 namePrep = new StringPrep(stream);
237 stream.close();
238 } catch (IOException e) {
239 // should never reach here
240 assert false;
241 }
242 }
243 */
244
245 /* ---------------- Private operations -------------- */
246
247
248 //
249 // to suppress the default zero-argument constructor
250 //
251 private IDN() {}
252
253 // Android-removed: Private helper methods, unused because we use ICU.
254 /*
255 //
256 // toASCII operation; should only apply to a single label
257 //
258 private static String toASCIIInternal(String label, int flag)
259 {
260 // step 1
        // Check if the string contains code points outside the ASCII range 0..0x7F.
262 boolean isASCII = isAllASCII(label);
263 StringBuffer dest;
264
265 // step 2
266 // perform the nameprep operation; flag ALLOW_UNASSIGNED is used here
267 if (!isASCII) {
268 UCharacterIterator iter = UCharacterIterator.getInstance(label);
269 try {
270 dest = namePrep.prepare(iter, flag);
271 } catch (java.text.ParseException e) {
272 throw new IllegalArgumentException(e);
273 }
274 } else {
275 dest = new StringBuffer(label);
276 }
277
278 // step 8, move forward to check the smallest number of the code points
279 // the length must be inside 1..63
280 if (dest.length() == 0) {
281 throw new IllegalArgumentException(
282 "Empty label is not a legal name");
283 }
284
285 // step 3
286 // Verify the absence of non-LDH ASCII code points
287 // 0..0x2c, 0x2e..0x2f, 0x3a..0x40, 0x5b..0x60, 0x7b..0x7f
288 // Verify the absence of leading and trailing hyphen
289 boolean useSTD3ASCIIRules = ((flag & USE_STD3_ASCII_RULES) != 0);
290 if (useSTD3ASCIIRules) {
291 for (int i = 0; i < dest.length(); i++) {
292 int c = dest.charAt(i);
293 if (isNonLDHAsciiCodePoint(c)) {
294 throw new IllegalArgumentException(
295 "Contains non-LDH ASCII characters");
296 }
297 }
298
299 if (dest.charAt(0) == '-' ||
300 dest.charAt(dest.length() - 1) == '-') {
301
302 throw new IllegalArgumentException(
303 "Has leading or trailing hyphen");
304 }
305 }
306
307 if (!isASCII) {
308 // step 4
309 // If all code points are inside 0..0x7f, skip to step 8
310 if (!isAllASCII(dest.toString())) {
311 // step 5
312 // verify the sequence does not begin with ACE prefix
313 if(!startsWithACEPrefix(dest)){
314
315 // step 6
316 // encode the sequence with punycode
317 try {
318 dest = Punycode.encode(dest, null);
319 } catch (java.text.ParseException e) {
320 throw new IllegalArgumentException(e);
321 }
322
323 dest = toASCIILower(dest);
324
325 // step 7
326 // prepend the ACE prefix
327 dest.insert(0, ACE_PREFIX);
328 } else {
329 throw new IllegalArgumentException("The input starts with the ACE Prefix");
330 }
331
332 }
333 }
334
335 // step 8
336 // the length must be inside 1..63
337 if (dest.length() > MAX_LABEL_LENGTH) {
338 throw new IllegalArgumentException("The label in the input is too long");
339 }
340
341 return dest.toString();
342 }
343
344 //
345 // toUnicode operation; should only apply to a single label
346 //
347 private static String toUnicodeInternal(String label, int flag) {
348 boolean[] caseFlags = null;
349 StringBuffer dest;
350
351 // step 1
352 // find out if all the codepoints in input are ASCII
353 boolean isASCII = isAllASCII(label);
354
355 if(!isASCII){
356 // step 2
357 // perform the nameprep operation; flag ALLOW_UNASSIGNED is used here
358 try {
359 UCharacterIterator iter = UCharacterIterator.getInstance(label);
360 dest = namePrep.prepare(iter, flag);
361 } catch (Exception e) {
362 // toUnicode never fails; if any step fails, return the input string
363 return label;
364 }
365 } else {
366 dest = new StringBuffer(label);
367 }
368
369 // step 3
370 // verify ACE Prefix
371 if(startsWithACEPrefix(dest)) {
372
373 // step 4
374 // Remove the ACE Prefix
375 String temp = dest.substring(ACE_PREFIX_LENGTH, dest.length());
376
377 try {
378 // step 5
379 // Decode using punycode
380 StringBuffer decodeOut = Punycode.decode(new StringBuffer(temp), null);
381
382 // step 6
383 // Apply toASCII
384 String toASCIIOut = toASCII(decodeOut.toString(), flag);
385
386 // step 7
387 // verify
388 if (toASCIIOut.equalsIgnoreCase(dest.toString())) {
389 // step 8
390 // return output of step 5
391 return decodeOut.toString();
392 }
393 } catch (Exception ignored) {
394 // no-op
395 }
396 }
397
398 // just return the input
399 return label;
400 }
401
402
403 //
404 // LDH stands for "letter/digit/hyphen", with characters restricted to the
405 // 26-letter Latin alphabet <A-Z a-z>, the digits <0-9>, and the hyphen
406 // <->.
407 // Non LDH refers to characters in the ASCII range, but which are not
    // letters, digits or the hyphen.
409 //
410 // non-LDH = 0..0x2C, 0x2E..0x2F, 0x3A..0x40, 0x5B..0x60, 0x7B..0x7F
411 //
412 private static boolean isNonLDHAsciiCodePoint(int ch){
413 return (0x0000 <= ch && ch <= 0x002C) ||
414 (0x002E <= ch && ch <= 0x002F) ||
415 (0x003A <= ch && ch <= 0x0040) ||
416 (0x005B <= ch && ch <= 0x0060) ||
417 (0x007B <= ch && ch <= 0x007F);
418 }
419
420 //
421 // search dots in a string and return the index of that character;
422 // or if there is no dots, return the length of input string
423 // dots might be: \u002E (full stop), \u3002 (ideographic full stop), \uFF0E (fullwidth full stop),
424 // and \uFF61 (halfwidth ideographic full stop).
425 //
426 private static int searchDots(String s, int start) {
427 int i;
428 for (i = start; i < s.length(); i++) {
429 if (isLabelSeparator(s.charAt(i))) {
430 break;
431 }
432 }
433
434 return i;
435 }
436
437 //
438 // to check if a string is a root label, ".".
439 //
440 private static boolean isRootLabel(String s) {
441 return (s.length() == 1 && isLabelSeparator(s.charAt(0)));
442 }
443
444 //
445 // to check if a character is a label separator, i.e. a dot character.
446 //
447 private static boolean isLabelSeparator(char c) {
448 return (c == '.' || c == '\u3002' || c == '\uFF0E' || c == '\uFF61');
449 }
450
451 //
452 // to check if a string only contains US-ASCII code point
453 //
454 private static boolean isAllASCII(String input) {
455 boolean isASCII = true;
456 for (int i = 0; i < input.length(); i++) {
457 int c = input.charAt(i);
458 if (c > 0x7F) {
459 isASCII = false;
460 break;
461 }
462 }
463 return isASCII;
464 }
465
466 //
467 // to check if a string starts with ACE-prefix
468 //
469 private static boolean startsWithACEPrefix(StringBuffer input){
470 boolean startsWithPrefix = true;
471
472 if(input.length() < ACE_PREFIX_LENGTH){
473 return false;
474 }
475 for(int i = 0; i < ACE_PREFIX_LENGTH; i++){
476 if(toASCIILower(input.charAt(i)) != ACE_PREFIX.charAt(i)){
477 startsWithPrefix = false;
478 }
479 }
480 return startsWithPrefix;
481 }
482
483 private static char toASCIILower(char ch){
484 if('A' <= ch && ch <= 'Z'){
485 return (char)(ch + 'a' - 'A');
486 }
487 return ch;
488 }
489
490 private static StringBuffer toASCIILower(StringBuffer input){
491 StringBuffer dest = new StringBuffer();
492 for(int i = 0; i < input.length();i++){
493 dest.append(toASCIILower(input.charAt(i)));
494 }
495 return dest;
496 }
497 */
498}