blob: 9cb4f6565bc094c11fd8af379865e038f0b43766 [file] [log] [blame] [edit]
#![deny(
missing_copy_implementations,
missing_debug_implementations,
missing_docs,
trivial_casts,
trivial_numeric_casts,
unsafe_code,
unused_import_braces,
unused_qualifications,
)]
#![cfg_attr(feature = "dev", feature(plugin))]
#![cfg_attr(feature = "dev", plugin(clippy))]
#![cfg_attr(feature = "dev", deny(clippy))]
//! Fonctions to decode and encode [RFC-3492 Punycode](https://tools.ietf.org/html/rfc3492).
// See [RFC-3492, section 4](https://tools.ietf.org/html/rfc3492#section-4).
const BASE : u32 = 36;
const TMIN : u32 = 1;
const TMAX : u32 = 26;
const SKEW : u32 = 38;
const DAMP : u32 = 700;
const INITIAL_BIAS : u32 = 72;
const INITIAL_N : u32 = 128;
const DELIMITER : char = '-';
/// Decode the string as Punycode. The string should not contain the initial `xn--` and must
/// contain only ASCII characters.
/// # Example
/// ```
/// assert_eq!(
/// punycode::decode("acadmie-franaise-npb1a").unwrap(),
/// "académie-française"
/// );
/// ```
pub fn decode(input: &str) -> Result<String, ()> {
if !input.is_ascii() {
return Err(());
}
let mut n = INITIAL_N;
let mut i = 0;
let mut bias = INITIAL_BIAS;
let (mut output, input) = if let Some(i) = input.rfind(DELIMITER) {
(input[0..i].chars().collect(), &input[i+1..])
}
else {
(vec![], &input[..])
};
let mut it = input.chars().peekable();
while it.peek() != None {
let oldi = i;
let mut w = 1;
for k in 1.. {
let c = if let Some(c) = it.next() {
c
}
else {
return Err(());
};
let k = k*BASE;
let digit = decode_digit(c);
if digit == BASE {
return Err(());
}
// overflow check
if digit > (std::u32::MAX - i) / w {
return Err(());
}
i += digit * w;
let t = clamped_sub(TMIN, k, bias, TMAX);
if digit < t {
break;
}
// overflow check
if BASE > (std::u32::MAX - t) / w {
return Err(());
}
w *= BASE - t;
}
let len = (output.len() + 1) as u32;
bias = adapt(i - oldi, len, oldi == 0);
let il = i / len;
// overflow check
if n > std::u32::MAX - il {
return Err(());
}
n += il;
i %= len;
if let Some(c) = std::char::from_u32(n) {
output.insert(i as usize, c);
}
else {
return Err(());
}
i += 1;
}
Ok(output.iter().cloned().collect())
}
/// Encode a string as punycode. The result string will contain only ASCII characters. The result
/// string does not start with `xn--`.
/// # Example
/// ```
/// assert_eq!(
/// punycode::encode("académie-française").unwrap(),
/// "acadmie-franaise-npb1a"
/// );
/// ```
pub fn encode(input: &str) -> Result<String, ()> {
encode_slice(&input.chars().collect::<Vec<char>>())
}
fn encode_slice(input: &[char]) -> Result<String, ()> {
let mut n = INITIAL_N;
let mut delta = 0;
let mut bias = INITIAL_BIAS;
let mut output : String = input.iter().filter(|&&c| c.is_ascii()).cloned().collect();
let mut h = output.len() as u32;
let b = h;
if b > 0 {
output.push(DELIMITER)
}
while h < input.len() as u32 {
let m = *input.iter().filter(|&&c| (c as u32) >= n).min().unwrap() as u32;
if m - n > (std::u32::MAX - delta) / (h + 1) {
return Err(());
}
delta += (m - n) * (h + 1);
n = m;
for c in input {
let c = *c as u32;
if c < n {
delta += 1;
}
else if c == n {
let mut q = delta;
for k in 1.. {
let k = k*BASE;
let t = clamped_sub(TMIN, k, bias, TMAX);
if q < t {
break;
}
output.push(encode_digit(t + (q - t) % (BASE - t)));
q = (q - t) / (BASE - t);
}
output.push(encode_digit(q));
bias = adapt(delta, h+1, h == b);
delta = 0;
h += 1;
}
}
delta += 1;
n += 1;
}
Ok(output)
}
fn adapt(delta: u32, numpoint: u32, firsttime: bool) -> u32 {
let mut delta = if firsttime {
delta / DAMP
}
else {
delta / 2
};
delta += delta / numpoint;
let mut k = 0;
while delta > (BASE - TMIN) * TMAX / 2 {
delta /= BASE - TMIN;
k += BASE
}
k + (BASE - TMIN + 1) * delta / (delta + SKEW)
}
/// Compute `lhs-rhs`. Result will be clamped in [min, max].
fn clamped_sub<T>(min: T, lhs: T, rhs: T, max: T) -> T
where T : Ord
+ std::ops::Add<Output=T>
+ std::ops::Sub<Output=T>
+ Copy
{
if min + rhs >= lhs { min }
else if max + rhs <= lhs { max }
else { lhs - rhs }
}
fn decode_digit(c: char) -> u32 {
let cp = c as u32;
match c {
'0' ... '9' => cp - ('0' as u32) + 26,
'A' ... 'Z' => cp - ('A' as u32),
'a' ... 'z' => cp - ('a' as u32),
_ => BASE,
}
}
fn encode_digit(d: u32) -> char {
let r = (d + 22 + (if d < 26 { 75 } else { 0 })) as u8 as char;
assert!(('0' <= r && r <= '9') || ('a' <= r && r <= 'z'), "r = {}", r);
r
}
#[cfg(test)]
static TESTS: &'static [(&'static str, &'static str)] = &[
// examples taken from [RCF-3492, section 7.1](https://tools.ietf.org/html/rfc3492#section-7.1)
(&"\u{0644}\u{064A}\u{0647}\u{0645}\u{0627}\u{0628}\u{062A}\u{0643}\u{0644}\
\u{0645}\u{0648}\u{0634}\u{0639}\u{0631}\u{0628}\u{064A}\u{061F}",
&"egbpdaj6bu4bxfgehfvwxn"),
(&"\u{4ED6}\u{4EEC}\u{4E3A}\u{4EC0}\u{4E48}\u{4E0D}\u{8BF4}\u{4E2D}\u{6587}",
&"ihqwcrb4cv8a8dqg056pqjye"),
(&"\u{4ED6}\u{5011}\u{7232}\u{4EC0}\u{9EBD}\u{4E0D}\u{8AAA}\u{4E2D}\u{6587}",
&"ihqwctvzc91f659drss3x8bo0yb"),
(&"\u{0050}\u{0072}\u{006F}\u{010D}\u{0070}\u{0072}\u{006F}\u{0073}\u{0074}\
\u{011B}\u{006E}\u{0065}\u{006D}\u{006C}\u{0075}\u{0076}\u{00ED}\u{010D}\
\u{0065}\u{0073}\u{006B}\u{0079}",
&"Proprostnemluvesky-uyb24dma41a"),
(&"\u{05DC}\u{05DE}\u{05D4}\u{05D4}\u{05DD}\u{05E4}\u{05E9}\u{05D5}\u{05D8}\
\u{05DC}\u{05D0}\u{05DE}\u{05D3}\u{05D1}\u{05E8}\u{05D9}\u{05DD}\u{05E2}\
\u{05D1}\u{05E8}\u{05D9}\u{05EA}",
&"4dbcagdahymbxekheh6e0a7fei0b"),
(&"\u{092F}\u{0939}\u{0932}\u{094B}\u{0917}\u{0939}\u{093F}\u{0928}\u{094D}\
\u{0926}\u{0940}\u{0915}\u{094D}\u{092F}\u{094B}\u{0902}\u{0928}\u{0939}\
\u{0940}\u{0902}\u{092C}\u{094B}\u{0932}\u{0938}\u{0915}\u{0924}\u{0947}\
\u{0939}\u{0948}\u{0902}",
&"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),
(&"\u{306A}\u{305C}\u{307F}\u{3093}\u{306A}\u{65E5}\u{672C}\u{8A9E}\u{3092}\
\u{8A71}\u{3057}\u{3066}\u{304F}\u{308C}\u{306A}\u{3044}\u{306E}\u{304B}",
&"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),
(&"\u{C138}\u{ACC4}\u{C758}\u{BAA8}\u{B4E0}\u{C0AC}\u{B78C}\u{B4E4}\u{C774}\
\u{D55C}\u{AD6D}\u{C5B4}\u{B97C}\u{C774}\u{D574}\u{D55C}\u{B2E4}\u{BA74}\
\u{C5BC}\u{B9C8}\u{B098}\u{C88B}\u{C744}\u{AE4C}",
&"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5jpsd879ccm6fea98c"),
(&"\u{043F}\u{043E}\u{0447}\u{0435}\u{043C}\u{0443}\u{0436}\u{0435}\u{043E}\
\u{043D}\u{0438}\u{043D}\u{0435}\u{0433}\u{043E}\u{0432}\u{043E}\u{0440}\
\u{044F}\u{0442}\u{043F}\u{043E}\u{0440}\u{0443}\u{0441}\u{0441}\u{043A}\
\u{0438}",
&"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),
(&"\u{0050}\u{006F}\u{0072}\u{0071}\u{0075}\u{00E9}\u{006E}\u{006F}\u{0070}\
\u{0075}\u{0065}\u{0064}\u{0065}\u{006E}\u{0073}\u{0069}\u{006D}\u{0070}\
\u{006C}\u{0065}\u{006D}\u{0065}\u{006E}\u{0074}\u{0065}\u{0068}\u{0061}\
\u{0062}\u{006C}\u{0061}\u{0072}\u{0065}\u{006E}\u{0045}\u{0073}\u{0070}\
\u{0061}\u{00F1}\u{006F}\u{006C}",
&"PorqunopuedensimplementehablarenEspaol-fmd56a"),
(&"\u{0054}\u{1EA1}\u{0069}\u{0073}\u{0061}\u{006F}\u{0068}\u{1ECD}\u{006B}\
\u{0068}\u{00F4}\u{006E}\u{0067}\u{0074}\u{0068}\u{1EC3}\u{0063}\u{0068}\
\u{1EC9}\u{006E}\u{00F3}\u{0069}\u{0074}\u{0069}\u{1EBF}\u{006E}\u{0067}\
\u{0056}\u{0069}\u{1EC7}\u{0074}",
&"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),
(&"\u{0033}\u{5E74}\u{0042}\u{7D44}\u{91D1}\u{516B}\u{5148}\u{751F}",
&"3B-ww4c5e180e575a65lsy2b"),
(&"\u{5B89}\u{5BA4}\u{5948}\u{7F8E}\u{6075}\u{002D}\u{0077}\u{0069}\u{0074}\
\u{0068}\u{002D}\u{0053}\u{0055}\u{0050}\u{0045}\u{0052}\u{002D}\u{004D}\
\u{004F}\u{004E}\u{004B}\u{0045}\u{0059}\u{0053}",
&"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),
(&"\u{0048}\u{0065}\u{006C}\u{006C}\u{006F}\u{002D}\u{0041}\u{006E}\u{006F}\
\u{0074}\u{0068}\u{0065}\u{0072}\u{002D}\u{0057}\u{0061}\u{0079}\u{002D}\
\u{305D}\u{308C}\u{305E}\u{308C}\u{306E}\u{5834}\u{6240}",
&"Hello-Another-Way--fc4qua05auwb3674vfr0b"),
(&"\u{3072}\u{3068}\u{3064}\u{5C4B}\u{6839}\u{306E}\u{4E0B}\u{0032}",
&"2-u9tlzr9756bt3uc0v"),
(&"\u{004D}\u{0061}\u{006A}\u{0069}\u{3067}\u{004B}\u{006F}\u{0069}\u{3059}\
\u{308B}\u{0035}\u{79D2}\u{524D}",
&"MajiKoi5-783gue6qz075azm5e"),
(&"\u{30D1}\u{30D5}\u{30A3}\u{30FC}\u{0064}\u{0065}\u{30EB}\u{30F3}\u{30D0}",
&"de-jg4avhby1noc0d"),
(&"\u{305D}\u{306E}\u{30B9}\u{30D4}\u{30FC}\u{30C9}\u{3067}",
&"d9juau41awczczp"),
(&"\u{002D}\u{003E}\u{0020}\u{0024}\u{0031}\u{002E}\u{0030}\u{0030}\u{0020}\
\u{003C}\u{002D}",
&"-> $1.00 <--"),
// some real-life examples
(&"académie-française", &"acadmie-franaise-npb1a"),
(&"bücher", &"bcher-kva"),
(&"république-numérique", &"rpublique-numrique-bwbm"),
// some real-life TLD
(&"бг", &"90ae"),
(&"рф", &"p1ai"),
(&"укр", &"j1amh"),
(&"السعودية", &"mgberp4a5d4ar"),
(&"امارات", &"mgbaam7a8h"),
(&"مصر", &"wgbh1c"),
(&"中国", &"fiqs8s"),
(&"中國", &"fiqz9s"),
(&"台湾", &"kprw13d"),
(&"台灣", &"kpry57d"),
(&"香港", &"j6w193g"),
// other
(&"", &""),
(&"a", &"a-"),
(&"0", &"0-"),
(&"A", &"A-"),
(&"é", &"9ca"),
(&"\n", &"\n-"),
];
#[test]
fn test_decode() {
for t in TESTS {
assert_eq!(decode(&t.1), Ok(t.0.into()));
}
}
#[test]
fn test_encode() {
for t in TESTS {
assert_eq!(encode(t.0).unwrap().to_lowercase(), t.1.to_lowercase());
}
}
#[test]
fn test_fail_decode() {
assert_eq!(decode(&"bcher-kva.ch"), Err(()));
assert_eq!(decode(&"+"), Err(()));
assert_eq!(decode(&"\\"), Err(()));
assert_eq!(decode(&"é"), Err(()));
assert_eq!(decode(&"99999999"), Err(()));
}