| <?xml version="1.0" encoding="UTF-8" ?> |
| <!DOCTYPE supplementalData SYSTEM "../../common/dtd/ldmlSupplemental.dtd"> |
| <!-- |
| Copyright © 1991-2013 Unicode, Inc. |
| CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/) |
| For terms of use, see http://www.unicode.org/copyright.html |
| --> |
| <supplementalData> |
| <version number="$Revision$"/> |
| <transforms> |
| <transform source="Han" target="Spacedhan" direction="both" visibility="internal"> |
| <tRule> |
| # Only intended for internal use |
| # |
| # Forward direction (Han to Spacedhan): |
| #   1. NFKC-normalize Han text and the compatibility characters that contain it. |
| #   2. Apply Fullwidth-Halfwidth. |
| #   3. Map CJK punctuation to Latin-style punctuation. |
| #   4. Insert a space where Han / closing punctuation abuts letters, and where |
| #      letters abut Han / opening punctuation. |
| # Reverse direction (Spacedhan to Han): only the two space-removal rules at the end apply. |
| # |
| # Make sure Han are normalized, including characters that contain them. |
| # The first set in the filter is computed with http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:tonfkd:/XXX/:]-[:ideographic:]-[:sc=han:] |
| # Where XXX is the resolved [:ideographic:][:sc=han:]. It needs updating with each Unicode release! |
| :: [[、。々《-』〜・㆒-㆟㈠-㉇㊀-㊰㋀-㋋ ㍘-㍰㍻-㍿㏠-㏾🈐-🈒🈔-🈺🉀-🉈🉐🉑][:ideographic:][:sc=han:]] nfkc; |
| :: fullwidth-halfwidth; |
| # Punctuation mappings (forward only). |
| # Each CJK mark that also has a halfwidth form is listed twice: once as the |
| # ideographic character and once as its halfwidth counterpart (U+FF61..U+FF65), |
| # because the fullwidth-halfwidth step above can leave the halfwidth form in the text. |
| # The halfwidth forms are written as \uXXXX escapes so that compatibility |
| # normalization of this file cannot fold them back into the ideographic forms, |
| # which would turn the second rule of each pair into an unreachable duplicate. |
| 。 → '.'; |
| \uFF61 → '.'; # HALFWIDTH IDEOGRAPHIC FULL STOP |
| 、→ ','; |
| \uFF64 → ','; # HALFWIDTH IDEOGRAPHIC COMMA |
| 《→ '«'; |
| 》→ '»'; |
| 〈 → '‹'; |
| 〉→ '›'; |
| 「→ '‘'; |
| 」→ '’'; |
| \uFF62 → '‘'; # HALFWIDTH LEFT CORNER BRACKET |
| \uFF63 → '’'; # HALFWIDTH RIGHT CORNER BRACKET |
| 『→ '“'; |
| 』→ '”'; |
| |
| ・→ '‧'; |
| \uFF65 → '‧'; # HALFWIDTH KATAKANA MIDDLE DOT |
| 々→ '⓶'; |
| 〜→ '~'; |
| |
| # Punctuation that may end a run of text (includes closing and final-quote punctuation). |
| $terminalPunct = [\.\,\:\;\?\!.,:?!。、;[:Pe:][:Pf:]]; |
| # Punctuation that may start a run of text (opening and initial-quote punctuation). |
| # Defined as one bracketed set, like $terminalPunct, so the variable always denotes |
| # a single-character class rather than a two-element sequence. |
| $initialPunct = [[:Ps:][:Pi:]]; |
| # add space between any Han or terminal punctuation and letters, and |
| # between letters and Han or initial punct |
| [[:Ideographic:] $terminalPunct] {} [:Letter:] → ' ' ; |
| [:Letter:] [:Mark:]* {} [[:Ideographic:] $initialPunct] → ' ' ; |
| # remove spacing between ideographs and other letters (reverse direction only) |
| ← [:Ideographic:] { ' ' } [:Letter:] ; |
| ← [:Letter:] [:Mark:]* { ' ' } [:Ideographic:] ; |
| </tRule> |
| </transform> |
| </transforms> |
| </supplementalData> |