Copy upstream release-44-1
Bug: 327164201
Test: n/a
Change-Id: Ie7a53c59a8d5619027dd14a0047a51b8da87051b
diff --git a/docs/.gitignore b/docs/.gitignore
index fcb2476..1f09b03 100644
--- a/docs/.gitignore
+++ b/docs/.gitignore
@@ -2,3 +2,4 @@
.sass-cache
.jekyll-metadata
.jekyll-cache
+/ldml/dtd2md
diff --git a/docs/README.md b/docs/README.md
index e7415e5..e5b8265 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -14,6 +14,6 @@
##### Copyright
-Copyright © 1991-2019 Unicode, Inc.
+Copyright © 1991-2023 Unicode, Inc.
All rights reserved.
[Terms of use](https://www.unicode.org/copyright.html)
diff --git a/docs/charts/keyboard/.gitignore b/docs/charts/keyboard/.gitignore
new file mode 100644
index 0000000..fd7d0e5
--- /dev/null
+++ b/docs/charts/keyboard/.gitignore
@@ -0,0 +1,2 @@
+/node_modules
+/static/data
diff --git a/docs/charts/keyboard/build.mjs b/docs/charts/keyboard/build.mjs
new file mode 100644
index 0000000..73c4f29
--- /dev/null
+++ b/docs/charts/keyboard/build.mjs
@@ -0,0 +1,102 @@
+// do the XML parsing and fs access in a build step
+
+import { promises as fs } from "node:fs";
+import * as path from "node:path";
+import { XMLParser } from "fast-xml-parser";
+
+const KEYBOARD_PATH = "../../../keyboards/3.0";
+const IMPORT_PATH = "../../../keyboards/import";
+const DATA_PATH = "static/data";
+
+async function xmlList(basepath) { // resolves to the names (not full paths) of the *.xml files found in basepath
+ const dir = await fs.opendir(basepath);
+ const xmls = [];
+ for await (const ent of dir) {
+ if (!ent.isFile() || !/\.xml$/.test(ent.name)) {
+ continue; // skip anything that is not a regular file whose name ends in ".xml"
+ }
+ xmls.push(ent.name);
+ }
+ return xmls;
+}
+
+/**
+ * Dot-separated element paths for which isArray (below) answers true
+ */
+const alwaysArray = [
+ "keyboard3.transforms",
+ "keyboard3.transforms.transformGroup",
+ "keyboard3.transforms.transformGroup.transform",
+];
+
+/**
+ * Callback handed to XMLParser as its isArray option (see parseXml)
+ * @param name unused
+ * @param jpath path that is looked up in alwaysArray
+ * @param isLeafNode unused
+ * @param isAttribute unused
+ * @returns true if jpath is listed in alwaysArray, otherwise false
+ */
+// eslint-disable-next-line @typescript-eslint/no-unused-vars
+const isArray = (name, jpath, isLeafNode, isAttribute) => {
+ if (alwaysArray.indexOf(jpath) !== -1) return true;
+ return false;
+};
+
+/**
+ * Parse raw XML source into a plain object
+ * @param xml XML source text (packXmls passes the entire contents of one file)
+ * @returns whatever XMLParser.parse produces for xml; the parser is configured
+ * with ignoreAttributes: false and with isArray as its array callback
+ */
+export function parseXml(xml) {
+ const parser = new XMLParser({
+ ignoreAttributes: false,
+ isArray,
+ });
+ const j = parser.parse(xml);
+ return j;
+}
+
+async function readFile(path) { // reads the file as "utf-8" text; NOTE: this parameter shadows the imported `path` module inside the function
+ return fs.readFile(path, "utf-8");
+}
+
+async function main() { // packs every keyboard and import XML file into one object and writes it out as .json and as .js
+ const xmls = await xmlList(KEYBOARD_PATH);
+ const keyboards = await packXmls(KEYBOARD_PATH, xmls);
+ const importFiles = await xmlList(IMPORT_PATH);
+ const imports = await packXmls(IMPORT_PATH, importFiles);
+
+ const allData = {
+ keyboards,
+ imports,
+ };
+
+ const outPath = path.join(DATA_PATH, "keyboard-data.json");
+ const outJsPath = path.join(DATA_PATH, "keyboard-data.js");
+ await fs.mkdir(DATA_PATH, { recursive: true }); // create the output directory (and parents) if needed
+ const json = JSON.stringify(allData, null, " "); // indent, in case we need to read it
+ await fs.writeFile(outPath, json, "utf-8");
+ await fs.writeFile(outJsPath, `const _KeyboardData = \n` + json); // same JSON, assigned to the _KeyboardData global that keyboard-chart.js reads
+ return { xmls, importFiles, outPath, outJsPath };
+}
+
+main().then( // run the build: print the summary on success, log the error on failure
+ (done) => console.dir({ done }),
+ (err) => {
+ console.error(err);
+ process.exitCode = 1; // signal the failure through the process exit code
+ }
+);
+
+async function packXmls(basepath, xmls) { // resolves to an object mapping each file name in xmls to its parsed XML
+ const allData = {};
+ for (const fn of xmls) {
+ const fp = path.join(basepath, fn);
+ const data = await readFile(fp); // files are read and parsed one at a time, in list order
+ const parsed = parseXml(data);
+ allData[fn] = parsed;
+ }
+ return allData;
+}
diff --git a/docs/charts/keyboard/index.html b/docs/charts/keyboard/index.html
new file mode 100644
index 0000000..bcb7c7f
--- /dev/null
+++ b/docs/charts/keyboard/index.html
@@ -0,0 +1,2 @@
+<h1>You're almost there</h1>
+<a href="./static/index.html">Click here for the keyboard charts</a>
diff --git a/docs/charts/keyboard/package-lock.json b/docs/charts/keyboard/package-lock.json
new file mode 100644
index 0000000..b889a24
--- /dev/null
+++ b/docs/charts/keyboard/package-lock.json
@@ -0,0 +1,42 @@
+{
+ "name": "@unicode-org/keyboard-charts",
+ "version": "1.0.0",
+ "lockfileVersion": 3,
+ "requires": true,
+ "packages": {
+ "": {
+ "name": "@unicode-org/keyboard-charts",
+ "version": "1.0.0",
+ "license": "Unicode-DFS-2016",
+ "dependencies": {
+ "fast-xml-parser": "^4.2.5"
+ }
+ },
+ "node_modules/fast-xml-parser": {
+ "version": "4.2.5",
+ "resolved": "https://registry.npmjs.org/fast-xml-parser/-/fast-xml-parser-4.2.5.tgz",
+ "integrity": "sha512-B9/wizE4WngqQftFPmdaMYlXoJlJOYxGQOanC77fq9k8+Z0v5dDSVh+3glErdIROP//s/jgb7ZuxKfB8nVyo0g==",
+ "funding": [
+ {
+ "type": "paypal",
+ "url": "https://paypal.me/naturalintelligence"
+ },
+ {
+ "type": "github",
+ "url": "https://github.com/sponsors/NaturalIntelligence"
+ }
+ ],
+ "dependencies": {
+ "strnum": "^1.0.5"
+ },
+ "bin": {
+ "fxparser": "src/cli/cli.js"
+ }
+ },
+ "node_modules/strnum": {
+ "version": "1.0.5",
+ "resolved": "https://registry.npmjs.org/strnum/-/strnum-1.0.5.tgz",
+ "integrity": "sha512-J8bbNyKKXl5qYcR36TIO8W3mVGVHrmmxsd5PAItGkmyzwJvybiw2IVq5nqd0i4LSNSkB/sx9VHllbfFdr9k1JA=="
+ }
+ }
+}
diff --git a/docs/charts/keyboard/package.json b/docs/charts/keyboard/package.json
new file mode 100644
index 0000000..5dccb3b
--- /dev/null
+++ b/docs/charts/keyboard/package.json
@@ -0,0 +1,22 @@
+{
+ "name": "@unicode-org/keyboard-charts",
+ "version": "1.0.0",
+ "description": "Keyboard Charts app",
+ "main": "index.js",
+ "scripts": {
+ "test": "echo \"Error: no test specified\" && exit 1",
+ "serve": "npx serve static",
+ "build": "node build.mjs"
+ },
+ "keywords": [],
+ "author": "Steven R. Loomis <[email protected]>",
+ "license": "Unicode-DFS-2016",
+ "bugs": {
+ "url": "https://github.com/unicode-org/cldr/issues"
+ },
+ "homepage": "https://github.com/unicode-org/cldr#readme",
+ "private": true,
+ "dependencies": {
+ "fast-xml-parser": "^4.2.5"
+ }
+}
diff --git a/docs/charts/keyboard/static/index.html b/docs/charts/keyboard/static/index.html
new file mode 100644
index 0000000..afb92e5
--- /dev/null
+++ b/docs/charts/keyboard/static/index.html
@@ -0,0 +1,138 @@
+<html>
+ <head>
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+ <title>CLDR | Proposed Keyboard 3.0 Chart</title>
+ <script src="https://unpkg.com/vue@3/dist/vue.global.js"></script>
+ <script src="./keyboard-chart.js"></script>
+ <script src="./data/keyboard-data.js"></script>
+ <link href="./keyboard-chart.css" rel="stylesheet" />
+ <link rel="stylesheet" type="text/css" href="https://www.unicode.org/webscripts/standard_styles.css">
+ </head>
+ <body>
+ <div id="app">
+ <!-- standard unicode header-->
+ <table width="100%" cellpadding="0" cellspacing="0" border="0">
+ <!-- BEGIN HEADER BAR -->
+ <tr>
+ <td colspan="2">
+ <table width="100%" border="0" cellpadding="0" cellspacing="0">
+ <tr>
+
+ <td class="icon" style="width:38px; height:35px">
+ <a href="https://www.unicode.org/">
+ <img border="0" src="https://www.unicode.org/webscripts/logo60s2.gif" align="middle"
+ alt="[Unicode]" width="34" height="33"></a>
+ </td>
+
+ <td class="icon" style="vertical-align:middle">
+ <a class="bar"> </a>
+ <a class="bar" href="https://cldr.unicode.org/index/keyboard-workgroup"><font size="3">CLDR | Keyboard-SC | Charts</font></a>
+ </td>
+
+ <td class="bar">
+ <a href="https://www.unicode.org/main.html" class="bar">Tech Site</a>
+ | <a href="https://www.unicode.org/sitemap/" class="bar">Site Map</a> |
+ <a href="https://www.unicode.org/search" class="bar">Search </a>
+ </td>
+
+ </tr>
+ </table>
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2" class="gray"> </td>
+ </tr>
+ <!-- END HEADER BAR -->
+ <!-- BEGIN CONTENTS -->
+ <tr>
+ <td class="contents" valign="top">
+
+
+ <i
+ >Note: This is a very preliminary chart. For feedback on this chart or
+ contents, please comment on:
+ <a href="https://unicode-org.atlassian.net/browse/CLDR-17205"
+ >https://unicode-org.atlassian.net/browse/CLDR-17205</a
+ ></i
+ >
+ <!-- {{ message }} -->
+ <hr />
+ <div>
+ <span v-for="file of files" :key="file">
+ <a :href="'#'+file">{{file}}</a> |
+ </span>
+ </div>
+ <hr />
+ <ol>
+ <li v-for="file of files" :key="file">
+ <h2 :id="file"><code>{{file}}</code></h2>
+ <ul>
+ <li v-for="layers of getLayers(file)">
+ <h3 v-if="layers.formId">Form: {{ layers.formId }}</h3>
+ <h3 v-if="layers.id">ID: {{ layers.id }}</h3>
+ <h4 v-if="layers.minDeviceWidth">
+ minDeviceWidth: {{ layers.minDeviceWidth }}mm
+ </h4>
+ <ul>
+ <li v-for="layer of layers.layer">
+ <h4 v-if="layer.modifiers">Modifier: {{ layer.modifiers }}</h4>
+ <h4 v-if="layer.id">{{ layer.id }}</h4>
+ <div class="rows">
+ <div class="row" v-for="row of layer.row">
+ <span
+ :title="key.id"
+ :class="getKeyClass(key)"
+ v-for="key of row.keys"
+ >
+ {{key.output}}
+ <b title="Switch" v-if="key.layerId">☞ {{key.layerId}}</b>
+ </span>
+ </div>
+ </div>
+ </li>
+ </ul>
+ </li>
+ </ul>
+ <hr />
+ </li>
+ </ol>
+ </td>
+ </tr>
+ </table>
+ </div>
+ <script>
+ const { createApp } = Vue;
+
+ createApp({
+ data() {
+ return {};
+ },
+ computed: {
+ files() {
+ return getIds();
+ },
+ },
+ methods: {
+ getLayers(id) {
+ return getKeyboardLayers(id);
+ },
+ getKeys(id) {
+ return getKeyboardKeys(id);
+ },
+ getKeyClass(key) {
+ if (key.gap) {
+ return "gap-key key";
+ } else if (key.to) {
+ return "to-key key";
+ } else if (key.switch) {
+ return "switch-key key";
+ } else {
+ return "key";
+ }
+ },
+ },
+ }).mount("#app");
+ </script>
+
+ </body>
+</html>
diff --git a/docs/charts/keyboard/static/keyboard-chart.css b/docs/charts/keyboard/static/keyboard-chart.css
new file mode 100644
index 0000000..bcdc503
--- /dev/null
+++ b/docs/charts/keyboard/static/keyboard-chart.css
@@ -0,0 +1,34 @@
+
+.rows {
+ display: table;
+ margin-top: 2em;
+}
+
+.row {
+ display: table-row;
+}
+
+.key {
+ border: 1px solid gray;
+ padding: 0.25em;
+ margin-right: 0.5em;
+ height: 2em;
+ display: table-cell;
+ font-size: small;
+}
+
+.gap-key {
+ background-color: gray;
+}
+
+.to-key {
+ background-color: beige;
+}
+
+.switch-key {
+ background-color: lime;
+}
+
+.contents {
+ padding: 1em;
+}
diff --git a/docs/charts/keyboard/static/keyboard-chart.js b/docs/charts/keyboard/static/keyboard-chart.js
new file mode 100644
index 0000000..1f8be16
--- /dev/null
+++ b/docs/charts/keyboard/static/keyboard-chart.js
@@ -0,0 +1,114 @@
+// helper functions for keyboard
+
+/**
+ * Unescape an escaped string
+ * @param str input string such as '\u{17c}'; only the braced hex form \u{…} is recognized
+ * @returns str with every \u{…} escape replaced by the character for that code point
+ */
+function unescapeStr(str) {
+ str = str.replace(/\\u{([0-9a-fA-F]+)}/g, (a, b) =>
+ String.fromCodePoint(Number.parseInt(b, 16))
+ );
+ return str;
+}
+
+function getKeyboardLayers(id) { // returns the "layers" entries of keyboard `id` as an array; NOTE: _KeyboardData is rewritten in place, so a second call for the same id fails (row keys are no longer strings)
+ let q = _KeyboardData.keyboards[id].keyboard3.layers;
+ if (!Array.isArray(q)) {
+ q = [q]; // wrap a single value so that callers always get an array
+ }
+ mogrifyAttrs(q); // renames "@_name" properties to "name"
+ const keybag = getKeyboardKeys(id);
+ mogrifyLayerList(q, keybag); // replaces each row's "keys" string with key objects taken from keybag
+ return q;
+}
+
+function mogrifyLayerList(layerList, keybag) {
+ layerList.forEach(({ layer }) => {
+ layer.forEach(({ row }) => {
+ row.forEach((r) => {
+ r.keys = r.keys.split(" ").map((id) =>
+ Object.assign(
+ {
+ id,
+ },
+ keybag[id]
+ )
+ );
+ });
+ });
+ });
+}
+
+function getImportFile(id) { // id is an import entry with an "@_path" property; the 2nd "/"-separated segment of that path is the key into _KeyboardData.imports
+ return _KeyboardData.imports[id["@_path"].split("/")[1]];
+}
+
+function getImportKeys(id) { // returns keys.key of the imported file named by import entry `id`
+ const imp = getImportFile(id);
+ if (!imp) {
+ throw Error(`Could not load import ${JSON.stringify(id)}`); // nothing in _KeyboardData.imports matches this path
+ }
+ return imp.keys.key;
+}
+
+function mogrifyKeys(keys) {
+ // builds an id -> key object map; mogrifyAttrs drops the "@_" attribute prefix and output is unescaped
+ if (!keys) {
+ return []; // NOTE(review): an empty array here, although the other return is a plain object
+ }
+ return keys.reduce((p, v) => {
+ // TODO: any other swapping
+ mogrifyAttrs(v);
+ const { id, output } = v;
+ if (output) {
+ v.output = unescapeStr(output);
+ }
+ p[id] = v; // a later key with the same id replaces an earlier one
+ return p;
+ }, {});
+}
+
+function mogrifyAttrs(o) { // renames every "@_name" property to "name", recursing into arrays and objects; modifies o in place and returns it
+ for (const k of Object.keys(o)) {
+ const ok = o[k];
+ if (/^@_/.test(k)) {
+ const attr = k.substring(2); // drop the two-character "@_" prefix
+ o[attr] = ok;
+ delete o[k];
+ } else if (Array.isArray(ok)) {
+ ok.forEach((e) => mogrifyAttrs(e));
+ } else if (typeof ok === "object") {
+ mogrifyAttrs(ok); // NOTE(review): typeof null is "object" too, so a null value would make Object.keys throw
+ }
+ }
+ return o;
+}
+
+function getKeyboardKeys(id) {
+ const keys = _KeyboardData.keyboards[id].keyboard3.keys.key || [];
+ if (!keys) {
+ throw Error(`No keys for ${id}`);
+ }
+ let imports = [
+ {
+ // add implied import
+ "@_base": "cldr",
+ "@_path": "techpreview/keys-Latn-implied.xml",
+ },
+ ...(_KeyboardData.keyboards[id].keyboard3.keys.import || []),
+ ];
+
+ const importedKeys = [];
+ for (const fn of imports) {
+ for (const k of getImportKeys(fn)) {
+ importedKeys.push(k);
+ }
+ }
+
+ return mogrifyKeys([...importedKeys, ...keys]);
+}
+
+function getIds() { // the property names of _KeyboardData.keyboards (build.mjs keys that object by XML file name)
+ return Object.keys(_KeyboardData.keyboards);
+}
diff --git a/docs/dev/GenerateTestData.md b/docs/dev/GenerateTestData.md
new file mode 100644
index 0000000..0fd293f
--- /dev/null
+++ b/docs/dev/GenerateTestData.md
@@ -0,0 +1,31 @@
+# TL;DR
+
+Run GenerateTestData.java.
+
+# Structure
+
+There are currently 5 directories in common/testData.
+Each also has a _readme.txt with copyright information for all the files in that directory.
+The format of the files in each directory is described either in the individual data files or in that directory's _readme.txt.
+
+* localeIdentifiers — generated data (GenerateLocaleIDTestData, GenerateLikelySubtagTests)
+ * localeCanonicalization.txt
+ * localeDisplayName.txt
+ * likelySubtags.txt
+* personNameTest — generated data (GeneratePersonNameTestData)
+ * af.txt
+ * am.txt
+ * …
+* segmentation — curated data (not generated)
+ * graphemeCluster
+ * TestSegmenter-Bengali.txt
+ * TestSegmenter-Devanagari.txt
+ * …
+* transforms — curated data (not generated)
+ * am-fonipa-t-am.txt
+ * am-Latn-t-am-m0-bgn.txt
+ * am-t-am-fonipa.txt
+ * …
+* units — generated data (TestUnits)
+ * unitPreferencesTest.txt
+ * unitsTest.txt
diff --git a/docs/dev/generate-emoji-paths.md b/docs/dev/generate-emoji-paths.md
new file mode 100644
index 0000000..148d9f7
--- /dev/null
+++ b/docs/dev/generate-emoji-paths.md
@@ -0,0 +1,64 @@
+Update emoji translations & ordering
+====================================
+
+SBRS (at the start of the release):
+-----
+
+Where the current version is VV:
+
+1. Run unicodetools GenerateEmoji with a specific version number, like 14.0
+
+1. If you get an error like
+
+ * Exception in thread "main" java.lang.IllegalArgumentException: no name for 🫱🫲 1FAF1 200D 1FAF2
+ * at org.unicode.tools.emoji.EmojiData.\_getName(EmojiData.java:1230)
+ * at org.unicode.tools.emoji.EmojiData.getName(EmojiData.java:1194)
+ * at org.unicode.tools.emoji.EmojiDataSourceCombined.getName(EmojiDataSourceCombined.java:156)
+ * at org.unicode.tools.emoji.GenerateEmoji.showCandidateStyle(GenerateEmoji.java:3600)
+ * at org.unicode.tools.emoji.GenerateEmoji.main(GenerateEmoji.java:641)
+
+2. Then change the name composition algorithm if necessary (for new emoji zwj sequences)
+
+ 1. It may have also been modified during the emoji development. Typically the code that needs changing will be in Annotations.synthesize, to capture yet another special skintone instance
+ 2. Ensure that the documentation of composition of names (for new components like hair styles) in LDML is updated to match what is in org.unicode.cldr.util.Annotations.
+ 3. Make sure that org.unicode.tools.emoji.unittest.TestAll runs successfully, with -Demoji-beta.
+
+2. Copy
+
+ * /emoji/docs/Public/emoji/14.0/emoji-test.txt<br>
+ to
+ * /cldr-code/src/main/resources/org/unicode/cldr/util/data/emoji/emoji-test.txt
+
+3. Run unicode tools: org.unicode.tools.emoji.GenerateCldrData
+
+ 1. Copy each list of data from the console into the following files (respectively), as per the instructions:
+
+ 1. annotations/root.xml
+ 2. annotations/en.xml
+
+ 2. Copy emoji-test.txt into org.unicode.cldr.util.data.emoji
+
+4. Run org.unicode.tools.emoji.CopyImagesToCldr.java to add images to ... /cldr/tools/cldr-apps/src/main/webapp/images/emoji
+
+ * These are the ones that show up in the info panel of the survey tool.
+ * Update the collation/root.xml using unicode/draft/emoji/charts-VV/emoji-ordering-rules.txt
+
+5. Run tests
+
+ 1. You may get an error in testAnnotationPaths.
+
+ 1. May need to change org.unicode.cldr.util.Emoji.SPECIALS to have TestAnnotations pass. These are zwj sequences whose names cannot be composed.
+ 2. eg "\[{🏳🌈}{👁🗨}{🏴☠}\]"
+
+ 2. You may also get an error in TestNames. Check the names to see what is happening, and whether to change the test or the data.
+
+TODO: test that derived names are complete
+
+BRS (if the UCD files are adjusted after the start of the release):
+----
+
+As above, except that you only need to
+
+1. Run unicodetools GenerateEmoji with the beta options
+2. Copy emoji-test.txt into org.unicode.cldr.util.data.emoji
+3. Update collation/root.xml using unicode/draft/emoji/charts-XX/emoji-ordering-rules.txt
\ No newline at end of file
diff --git a/docs/ldml/.markdownlint.json b/docs/ldml/.markdownlint.json
new file mode 100644
index 0000000..b00a31e
--- /dev/null
+++ b/docs/ldml/.markdownlint.json
@@ -0,0 +1,12 @@
+{
+ "default": true,
+ "MD033": false,
+ "MD030": false,
+ "MD036": false,
+ "MD013": false,
+ "MD001": false,
+ "MD041": false,
+ "MD005": false,
+ "MD007": false,
+ "no-hard-tabs": false
+}
diff --git a/docs/ldml/tr35-collation.anchors.json b/docs/ldml/tr35-collation.anchors.json
new file mode 100644
index 0000000..7a7e8f0
--- /dev/null
+++ b/docs/ldml/tr35-collation.anchors.json
@@ -0,0 +1,116 @@
+[
+ "Abbreviating_Ordering_Specifications",
+ "additional-contractions-for-tibetan",
+ "Algorithm_Case",
+ "Algorithm_FFFE",
+ "Algorithm_Reordering_Groups",
+ "allkeys_cldrtxt",
+ "Case_Parameters",
+ "Case_Tailored",
+ "Case_Untailored",
+ "Case_Weights",
+ "case-handling",
+ "case-parameters",
+ "CJK_Index_Markers",
+ "cjk-index-markers",
+ "CLDR_Collation",
+ "CLDR_Collation_Algorithm",
+ "cldr-collation",
+ "cldr-collation-algorithm",
+ "Collation_Element",
+ "Collation_Indexes",
+ "Collation_Settings",
+ "Collation_Tailorings",
+ "Collation_Type_Fallback",
+ "Collation_Types",
+ "Collation_Version",
+ "collation-element",
+ "collation-indexes",
+ "collation-reordering",
+ "collation-rule-syntax",
+ "collation-tailorings",
+ "collation-type-fallback",
+ "collation-types",
+ "Combining_Rules",
+ "combining-rules",
+ "Common_Settings",
+ "common-settings-combinations",
+ "compute-modified-collation-elements",
+ "Contents",
+ "contents-of-part-5-collation",
+ "Context_Before",
+ "Context_Sensitive_Mappings",
+ "context-before",
+ "context-sensitive-mappings",
+ "contractions",
+ "Contractions",
+ "expansions",
+ "Expansions",
+ "File_Format_allkeys_CLDR_txt",
+ "File_Format_FractionalUCA_txt",
+ "File_Format_UCA_Rules_txt",
+ "fractionalucatxt",
+ "grouping_classes_of_characters",
+ "grouping-classes-of-characters",
+ "Index_Characters",
+ "index-characters",
+ "Interpretation_reordering",
+ "interpretation-of-a-reordering-list",
+ "Logical_Reset_Positions",
+ "logical-reset-positions",
+ "non_variable_symbols",
+ "non-variable-symbols",
+ "Normalization_Setting",
+ "notes-on-the-normalization-setting",
+ "notes-on-variable-top-settings",
+ "orderings",
+ "Orderings",
+ "parts",
+ "Parts",
+ "Placing_Characters_Before_Others",
+ "placing-characters-before-others",
+ "Reordering_Groups_allkeys",
+ "reordering-groups",
+ "reordering-groups-for-allkeystxt",
+ "Root_Collation",
+ "Root_Data_File_Formats",
+ "Root_Data_Files",
+ "root-collation",
+ "root-collation-data-file-formats",
+ "root-collation-data-files",
+ "Rules",
+ "Sample_requested_and_actual_collation_locales_and_types",
+ "Script_Reordering",
+ "Setting_Options",
+ "setting-options",
+ "Special_Purpose_Commands",
+ "Special_Purpose_Elements",
+ "special-purpose-commands",
+ "Specifying_Collation_Ordering",
+ "Specifying_Contractions",
+ "Specifying_Logical_Positions",
+ "Specifying_Previous_Context",
+ "status",
+ "summary",
+ "table-abbreviating-ordering-specifications",
+ "table-collation-settings",
+ "table-sample-requested-and-actual-collation-locales-and-types",
+ "table-special-purpose-elements",
+ "table-specifying-collation-ordering",
+ "table-specifying-contractions",
+ "table-specifying-logical-positions",
+ "table-specifying-previous-context",
+ "tailored_noncharacter_weights",
+ "tailored-noncharacter-weights",
+ "tailored-strings",
+ "tibetan_contractions",
+ "uca_rulestxt",
+ "ufffe",
+ "unicode-locale-data-markup-language-ldmlpart-5-collation",
+ "unicode-technical-standard-35",
+ "untailored-characters",
+ "Variable_Top_Settings",
+ "version",
+ "visibility",
+ "Visibility"
+]
\ No newline at end of file
diff --git a/docs/ldml/tr35-collation.md b/docs/ldml/tr35-collation.md
index 1010150..15345a0 100644
--- a/docs/ldml/tr35-collation.md
+++ b/docs/ldml/tr35-collation.md
@@ -2,7 +2,7 @@
# Unicode Locale Data Markup Language (LDML)<br/>Part 5: Collation
-|Version|42 |
+|Version|44.1 |
|-------|----------------|
|Editors|Markus Scherer (<a href="mailto:[email protected]">[email protected]</a>) and <a href="tr35.md#Acknowledgments">other CLDR committee members</a>|
@@ -21,7 +21,12 @@
### _Status_
-_This document has been reviewed by Unicode members and other interested parties, and has been approved for publication by the Unicode Consortium. This is a stable document and may be used as reference material or cited as a normative reference by other specifications._
+<!-- _This is a draft document which may be updated, replaced, or superseded by other documents at any time.
+Publication does not imply endorsement by the Unicode Consortium.
+This is not a stable document; it is inappropriate to cite this document as other than a work in progress._ -->
+
+_This document has been reviewed by Unicode members and other interested parties, and has been approved for publication by the Unicode Consortium.
+This is a stable document and may be used as reference material or cited as a normative reference by other specifications._
> _**A Unicode Technical Standard (UTS)** is an independent specification. Conformance to the Unicode Standard does not imply conformance to any UTS._
@@ -42,61 +47,61 @@
## <a name="Contents" href="#Contents">Contents of Part 5, Collation</a>
-* 1 [CLDR Collation](#CLDR_Collation)
- * 1.1 [CLDR Collation Algorithm](#CLDR_Collation_Algorithm)
- * 1.1.1 [U+FFFE](#Algorithm_FFFE)
- * 1.1.2 [Context-Sensitive Mappings](#Context_Sensitive_Mappings)
- * 1.1.3 [Case Handling](#Algorithm_Case)
- * 1.1.4 [Reordering Groups](#Algorithm_Reordering_Groups)
- * 1.1.5 [Combining Rules](#Combining_Rules)
-* 2 [Root Collation](#Root_Collation)
- * 2.1 [Grouping classes of characters](#grouping_classes_of_characters)
- * 2.2 [Non-variable symbols](#non_variable_symbols)
- * 2.3 [Additional contractions for Tibetan](#tibetan_contractions)
- * 2.4 [Tailored noncharacter weights](#tailored_noncharacter_weights)
- * 2.5 [Root Collation Data Files](#Root_Data_Files)
- * 2.6 [Root Collation Data File Formats](#Root_Data_File_Formats)
- * 2.6.1 [allkeys_CLDR.txt](#File_Format_allkeys_CLDR_txt)
- * 2.6.2 [FractionalUCA.txt](#File_Format_FractionalUCA_txt)
- * 2.6.3 [UCA_Rules.txt](#File_Format_UCA_Rules_txt)
-* 3 [Collation Tailorings](#Collation_Tailorings)
- * 3.1 [Collation Types](#Collation_Types)
- * 3.1.1 [Collation Type Fallback](#Collation_Type_Fallback)
+* [CLDR Collation](#CLDR_Collation)
+ * [CLDR Collation Algorithm](#CLDR_Collation_Algorithm)
+ * [U+FFFE](#Algorithm_FFFE)
+ * [Context-Sensitive Mappings](#Context_Sensitive_Mappings)
+ * [Case Handling](#Algorithm_Case)
+ * [Reordering Groups](#Algorithm_Reordering_Groups)
+ * [Combining Rules](#Combining_Rules)
+* [Root Collation](#Root_Collation)
+ * [Grouping classes of characters](#grouping_classes_of_characters)
+ * [Non-variable symbols](#non_variable_symbols)
+ * [Additional contractions for Tibetan](#tibetan_contractions)
+ * [Tailored noncharacter weights](#tailored_noncharacter_weights)
+ * [Root Collation Data Files](#Root_Data_Files)
+ * [Root Collation Data File Formats](#Root_Data_File_Formats)
+ * [allkeys_CLDR.txt](#File_Format_allkeys_CLDR_txt)
+ * [FractionalUCA.txt](#File_Format_FractionalUCA_txt)
+ * [UCA_Rules.txt](#File_Format_UCA_Rules_txt)
+* [Collation Tailorings](#Collation_Tailorings)
+ * [Collation Types](#Collation_Types)
+ * [Collation Type Fallback](#Collation_Type_Fallback)
* Table: [Sample requested and actual collation locales and types](#Sample_requested_and_actual_collation_locales_and_types)
- * 3.2 [Version](#Collation_Version)
- * 3.3 [Collation Element](#Collation_Element)
- * 3.4 [Setting Options](#Setting_Options)
+ * [Version](#Collation_Version)
+ * [Collation Element](#Collation_Element)
+ * [Setting Options](#Setting_Options)
* Table: [Collation Settings](#Collation_Settings)
- * 3.4.1 [Common settings combinations](#Common_Settings)
- * 3.4.2 [Notes on the normalization setting](#Normalization_Setting)
- * 3.4.3 [Notes on variable top settings](#Variable_Top_Settings)
- * 3.5 [Collation Rule Syntax](#Rules)
- * 3.6 [Orderings](#Orderings)
+ * [Common settings combinations](#Common_Settings)
+ * [Notes on the normalization setting](#Normalization_Setting)
+ * [Notes on variable top settings](#Variable_Top_Settings)
+ * [Collation Rule Syntax](#Rules)
+ * [Orderings](#Orderings)
* Table: [Specifying Collation Ordering](#Specifying_Collation_Ordering)
* Table: [Abbreviating Ordering Specifications](#Abbreviating_Ordering_Specifications)
- * 3.7 [Contractions](#Contractions)
+ * [Contractions](#Contractions)
* Table: [Specifying Contractions](#Specifying_Contractions)
- * 3.8 [Expansions](#Expansions)
- * 3.9 [Context Before](#Context_Before)
+ * [Expansions](#Expansions)
+ * [Context Before](#Context_Before)
* Table: [Specifying Previous Context](#Specifying_Previous_Context)
- * 3.10 [Placing Characters Before Others](#Placing_Characters_Before_Others)
- * 3.11 [Logical Reset Positions](#Logical_Reset_Positions)
+ * [Placing Characters Before Others](#Placing_Characters_Before_Others)
+ * [Logical Reset Positions](#Logical_Reset_Positions)
* Table: [Specifying Logical Positions](#Specifying_Logical_Positions)
- * 3.12 [Special-Purpose Commands](#Special_Purpose_Commands)
+ * [Special-Purpose Commands](#Special_Purpose_Commands)
* Table: [Special-Purpose Elements](#Special_Purpose_Elements)
- * 3.13 [Collation Reordering](#Script_Reordering)
- * 3.13.1 [Interpretation of a reordering list](#Interpretation_reordering)
- * 3.13.2 [Reordering Groups for allkeys.txt](#Reordering_Groups_allkeys)
- * 3.14 [Case Parameters](#Case_Parameters)
- * 3.14.1 [Untailored Characters](#Case_Untailored)
- * 3.14.2 [Compute Modified Collation Elements](#Case_Weights)
- * 3.14.3 [Tailored Strings](#Case_Tailored)
- * 3.15 [Visibility](#Visibility)
- * 3.16 [Collation Indexes](#Collation_Indexes)
- * 3.16.1 [Index Characters](#Index_Characters)
- * 3.16.2 [CJK Index Markers](#CJK_Index_Markers)
+ * [Collation Reordering](#Script_Reordering)
+ * [Interpretation of a reordering list](#Interpretation_reordering)
+ * [Reordering Groups for allkeys.txt](#Reordering_Groups_allkeys)
+ * [Case Parameters](#Case_Parameters)
+ * [Untailored Characters](#Case_Untailored)
+ * [Compute Modified Collation Elements](#Case_Weights)
+ * [Tailored Strings](#Case_Tailored)
+ * [Visibility](#Visibility)
+ * [Collation Indexes](#Collation_Indexes)
+ * [Index Characters](#Index_Characters)
+ * [CJK Index Markers](#CJK_Index_Markers)
-## 1 <a name="CLDR_Collation" href="#CLDR_Collation">CLDR Collation</a>
+## <a name="CLDR_Collation" href="#CLDR_Collation">CLDR Collation</a>
Collation is the general term for the process and function of determining the sorting order of strings of characters, for example for lists of strings presented to users, or in databases for sorting and selecting records.
@@ -104,11 +109,11 @@
CLDR provides collation data for many languages and styles. The data supports not only sorting but also language-sensitive searching and grouping under index headers. All CLDR collations are based on the [[UCA](https://www.unicode.org/reports/tr41/#UTS10)] default order, with common modifications applied in the CLDR root collation, and further tailored for language and style as needed.
-### 1.1 <a name="CLDR_Collation_Algorithm" href="#CLDR_Collation_Algorithm">CLDR Collation Algorithm</a>
+### <a name="CLDR_Collation_Algorithm" href="#CLDR_Collation_Algorithm">CLDR Collation Algorithm</a>
The CLDR collation algorithm is an extension of the [Unicode Collation Algorithm](https://www.unicode.org/reports/tr10/#Main_Algorithm).
-#### 1.1.1 <a name="Algorithm_FFFE" href="#Algorithm_FFFE">U+FFFE</a>
+#### <a name="Algorithm_FFFE" href="#Algorithm_FFFE">U+FFFE</a>
U+FFFE maps to a CE with a minimal, unique primary weight. Its primary weight is not "variable": U+FFFE must not become ignorable in alternate handling. On the identical level, a minimal, unique “weight” must be emitted for U+FFFE as well. This allows for [Merging Sort Keys](https://www.unicode.org/reports/tr10/#Merging_Sort_Keys) within code point space.
@@ -118,7 +123,7 @@
> 👉 **Note**: With unique, low weights on _all_ levels it is possible to achieve `sortkey(str1 + "\uFFFE" + str2) == mergeSortkeys(sortkey(str1), sortkey(str2))` . When that is not necessary, then code can be a little simpler (no special handling for U+FFFE except for backwards-secondary), sort keys can be a little shorter (when using compressible common non-primary weights for U+FFFE), and another low weight can be used in tailorings.
-#### 1.1.2 <a name="Context_Sensitive_Mappings" href="#Context_Sensitive_Mappings">Context-Sensitive Mappings</a>
+#### <a name="Context_Sensitive_Mappings" href="#Context_Sensitive_Mappings">Context-Sensitive Mappings</a>
Contraction matching, as in the UCA, starts from the first character of the contraction string. It slows down processing of that first character even when none of its contractions matches. In some cases, it is preferrable to change such contractions to mappings with a prefix (context before a character), so that complex processing is done only when the less-frequently occurring trailing character is encountered.
@@ -162,15 +167,15 @@
However, if the mapping p|c → CE(u) is missing, then text "pch" maps to CE(p)CE(d), "opch" maps to CE(o)CE(p)CE(d), and "pĉ̣" maps to CE(p)CE(c)CE(U+0323)CE(U+0302) (because discontiguous contraction matching extends _an existing match_ by one non-starter at a time).
-#### 1.1.3 <a name="Algorithm_Case" href="#Algorithm_Case">Case Handling</a>
+#### <a name="Algorithm_Case" href="#Algorithm_Case">Case Handling</a>
-CLDR specifies how to sort lowercase or uppercase first, as a stronger distinction than other tertiary variants (**caseFirst**) or while completely ignoring all other tertiary distinctions (**caseLevel**). See _Section 3.3 [Setting Options](#Setting_Options)_ and _Section 3.13 [Case Parameters](#Case_Parameters)_.
+CLDR specifies how to sort lowercase or uppercase first, as a stronger distinction than other tertiary variants (**caseFirst**) or while completely ignoring all other tertiary distinctions (**caseLevel**). See _[Setting Options](#Setting_Options)_ and _[Case Parameters](#Case_Parameters)_.
-#### 1.1.4 <a name="Algorithm_Reordering_Groups" href="#Algorithm_Reordering_Groups">Reordering Groups</a>
+#### <a name="Algorithm_Reordering_Groups" href="#Algorithm_Reordering_Groups">Reordering Groups</a>
CLDR specifies how to do parametric reordering of groups of scripts (e.g., “native script first”) as well as special groups (e.g., “digits after letters”), and provides data for the effective implementation of such reordering.
-#### 1.1.5 <a name="Combining_Rules" href="#Combining_Rules">Combining Rules</a>
+#### <a name="Combining_Rules" href="#Combining_Rules">Combining Rules</a>
Rules from different sources can be combined, with the later rules overriding the earlier ones. The following is an example of how this can be useful.
@@ -215,13 +220,13 @@
<tr><td>combined rules</td><td>,</td><td>😀</td><td>☹️</td><td>✈️️</td><td>a</td><td>y</td><td><strong><u>ü</u></strong></td><td>Z</td><td>글</td></tr>
</tbody></table>
-## 2 <a name="Root_Collation" href="#Root_Collation">Root Collation</a>
+## <a name="Root_Collation" href="#Root_Collation">Root Collation</a>
The CLDR root collation order is based on the [Default Unicode Collation Element Table (DUCET)](https://www.unicode.org/reports/tr10/#Default_Unicode_Collation_Element_Table) defined in _UTS #10: Unicode Collation Algorithm_ [[UCA](https://www.unicode.org/reports/tr41/#UTS10)]. It is used by all other locales by default, or as the base for their tailorings. (For a chart view of the UCA, see Collation Chart [[UCAChart](tr35.md#UCAChart)].)
Starting with CLDR 1.9, CLDR uses modified tables for the root collation order. The root locale ordering is tailored in the following ways:
-### 2.1 <a name="grouping_classes_of_characters" href="#grouping_classes_of_characters">Grouping classes of characters</a>
+### <a name="grouping_classes_of_characters" href="#grouping_classes_of_characters">Grouping classes of characters</a>
As of Version 6.1.0, the DUCET puts characters into the following ordering:
@@ -248,20 +253,20 @@
1. U+10A7F ( 𐩿 ) [Po] OLD SOUTH ARABIAN NUMERIC INDICATOR is put with punctuation, not symbols
2. U+20A8 ( ₨ ) [Sc] RUPEE SIGN and U+FDFC ( ﷼ ) [Sc] RIAL SIGN are put with currency signs, not with R and REH.
-### 2.2 <a name="non_variable_symbols" href="#non_variable_symbols">Non-variable symbols</a>
+### <a name="non_variable_symbols" href="#non_variable_symbols">Non-variable symbols</a>
There are multiple [Variable-Weighting](https://www.unicode.org/reports/tr10/#Variable_Weighting) options in the UCA for symbols and punctuation, including _non-ignorable_ and _shifted_. With the _shifted_ option, almost all symbols and punctuation are ignored—except at a fourth level. The CLDR root locale ordering is modified so that symbols are not affected by the _shifted_ option. That is, by default, symbols are not “variable” in CLDR. So _shifted_ only causes whitespace and punctuation to be ignored, but not symbols (like ♥). The DUCET behavior can be specified with a locale ID using the "kv" keyword, to set the Variable section to include all of the symbols below it, or be set parametrically where implementations allow access.
See also:
-* _Section 3.3, [Setting Options](#Setting_Options)_
+* _[Setting Options](#Setting_Options)_
* [https://www.unicode.org/charts/collation/](https://www.unicode.org/charts/collation/)
-### 2.3 <a name="tibetan_contractions" href="#tibetan_contractions">Additional contractions for Tibetan</a>
+### <a name="tibetan_contractions" href="#tibetan_contractions">Additional contractions for Tibetan</a>
Ten contractions are added for Tibetan: Two to fulfill [well-formedness condition 5](https://www.unicode.org/reports/tr10/#WF5), and eight more to preserve the default order for Tibetan. For details see _UTS #10, Section 3.8.2, [Well-Formedness of the DUCET](https://www.unicode.org/reports/tr10/#Well_Formed_DUCET)_.
-### 2.4 <a name="tailored_noncharacter_weights" href="#tailored_noncharacter_weights">Tailored noncharacter weights</a>
+### <a name="tailored_noncharacter_weights" href="#tailored_noncharacter_weights">Tailored noncharacter weights</a>
U+FFFE and U+FFFF have special tailorings:
@@ -283,7 +288,7 @@
> 👉 **Note**: Java uses an early version of this collation syntax, but has not been updated recently. It does not support any of the syntax marked with [...], and its default table is not the DUCET nor the CLDR root collation.
-### 2.5 <a name="Root_Data_Files" href="#Root_Data_Files">Root Collation Data Files</a>
+### <a name="Root_Data_Files" href="#Root_Data_Files">Root Collation Data Files</a>
The CLDR root collation data files are in the CLDR repository and release, under the path [common/uca/](https://github.com/unicode-org/cldr/blob/main/common/uca/).
@@ -305,17 +310,17 @@
* CollationTest_CLDR_NON_IGNORABLE.txt
* CollationTest_CLDR_SHIFTED.txt
-### 2.6 <a name="Root_Data_File_Formats" href="#Root_Data_File_Formats">Root Collation Data File Formats</a>
+### <a name="Root_Data_File_Formats" href="#Root_Data_File_Formats">Root Collation Data File Formats</a>
The file formats may change between versions of CLDR. The formats for CLDR 23 and beyond are as follows. As usual, text after a # is a comment.
-#### 2.6.1 <a name="File_Format_allkeys_CLDR_txt" href="#File_Format_allkeys_CLDR_txt">allkeys_CLDR.txt</a>
+#### <a name="File_Format_allkeys_CLDR_txt" href="#File_Format_allkeys_CLDR_txt">allkeys_CLDR.txt</a>
-This file defines CLDR’s tailoring of the DUCET, as described in _Section 2, [Root Collation](#Root_Collation)_ .
+This file defines CLDR’s tailoring of the DUCET, as described in _[Root Collation](#Root_Collation)_ .
The format is similar to that of [allkeys.txt](https://www.unicode.org/reports/tr10/#File_Format), although there may be some differences in whitespace.
-#### 2.6.2 <a name="File_Format_FractionalUCA_txt" href="#File_Format_FractionalUCA_txt">FractionalUCA.txt</a>
+#### <a name="File_Format_FractionalUCA_txt" href="#File_Format_FractionalUCA_txt">FractionalUCA.txt</a>
The format is illustrated by the following sample lines, with commentary afterwards.
@@ -515,11 +520,11 @@
> 👉 **Note**: The particular primary lead bytes for Hani vs. IMPLICIT vs. TRAILING are only an example. An implementation is free to move them if it also moves the explicit TRAILING weights. This affects only a small number of explicit mappings in FractionalUCA.txt, such as for U+FFFD, U+FFFF, and the “unassigned first primary”. It is possible to use no SPECIAL bytes at all, and to use only the one primary lead byte FF for TRAILING weights.
-#### 2.6.3 <a name="File_Format_UCA_Rules_txt" href="#File_Format_UCA_Rules_txt">UCA_Rules.txt</a>
+#### <a name="File_Format_UCA_Rules_txt" href="#File_Format_UCA_Rules_txt">UCA_Rules.txt</a>
-The format for this file uses the CLDR collation syntax, see _Section 3, [Collation Tailorings](#Collation_Tailorings)_.
+The format for this file uses the CLDR collation syntax, see _[Collation Tailorings](#Collation_Tailorings)_.
-## 3 <a name="Collation_Tailorings" href="#Collation_Tailorings">Collation Tailorings</a>
+## <a name="Collation_Tailorings" href="#Collation_Tailorings">Collation Tailorings</a>
```xml
<!ELEMENT collations (alias | (defaultCollation?, collation*, special*)) >
@@ -531,7 +536,7 @@
> 👉 **Note**: CLDR collation tailoring data should follow the [CLDR Collation Guidelines](https://cldr.unicode.org/index/cldr-spec/collation-guidelines).
-### 3.1 <a name="Collation_Types" href="#Collation_Types">Collation Types</a>
+### <a name="Collation_Types" href="#Collation_Types">Collation Types</a>
Each locale may have multiple sort orders (types). The `defaultCollation` element defines the default tailoring for a locale and its sublocales. For example:
@@ -547,7 +552,7 @@
> 👉 **Note**: In CLDR 23 and before, LDML collation files used an XML format. Starting with CLDR 24, the XML collation syntax is deprecated and no longer used. See the _[CLDR 23 version of this document](https://www.unicode.org/reports/tr35/tr35-31/tr35-collation.html#Collation_Tailorings)_ for details about the XML collation syntax.
-#### 3.1.1 <a name="Collation_Type_Fallback" href="#Collation_Type_Fallback">Collation Type Fallback</a>
+#### <a name="Collation_Type_Fallback" href="#Collation_Type_Fallback">Collation Type Fallback</a>
When loading a requested tailoring from its data file and the parent file chain, use the following type fallback to find the tailoring.
@@ -591,13 +596,13 @@
| el/searchjl | root/search | "search.+" falls back to "search", found in root |
| ko/searchjl | ko/searchjl | requested data is actually available |
-### 3.2 <a name="Collation_Version" href="#Collation_Version">Version</a>
+### <a name="Collation_Version" href="#Collation_Version">Version</a>
The `version` attribute is used in case a specific version of the UCA is to be specified. It is optional, and is specified if the results are to be identical on different systems. If it is not supplied, then the version is assumed to be the same as the Unicode version for the system as a whole.
> 👉 **Note**: For version 3.1.1 of the UCA, the version of Unicode must also be specified with any versioning information; an example would be "3.1.1/3.2" for version 3.1.1 of the UCA, for version 3.2 of Unicode. This was changed by decision of the UTC, so that dual versions were no longer necessary. So for UCA 4.0 and beyond, the version just has a single number.
-### 3.3 <a name="Collation_Element" href="#Collation_Element">Collation Element</a>
+### <a name="Collation_Element" href="#Collation_Element">Collation Element</a>
```xml
<!ELEMENT collation (alias | (cr*, special*)) >
@@ -614,7 +619,7 @@
</collation>
```
-### 3.4 <a name="Setting_Options" href="#Setting_Options">Setting Options</a>
+### <a name="Setting_Options" href="#Setting_Options">Setting Options</a>
Parametric settings can be specified in language tags or in rule syntax (in the form `[keyword value]` ). For example, `-ks-level2` or `[strength 2]` will only compare strings based on their primary and secondary weights.
@@ -646,16 +651,16 @@
<tr><td>false</td><td><i><b><code>[normalization off]</code></b></i></td></tr>
<tr><td rowspan="2">kc</td><td>true</td><td><code>[caseLevel on]</code></td>
- <td rowspan="2">If set to <b>on</b><i>,</i> a level consisting only of case characteristics will be inserted in front of tertiary level, as a "Level 2.5". To ignore accents but take case into account, set strength to <b>primary</b> and case level to <b>on</b>. For details, see <i>Section 3.14, <a href="#Case_Parameters">Case Parameters</a></i> .</td></tr>
+ <td rowspan="2">If set to <b>on</b><i>,</i> a level consisting only of case characteristics will be inserted in front of tertiary level, as a "Level 2.5". To ignore accents but take case into account, set strength to <b>primary</b> and case level to <b>on</b>. For details, see <i><a href="#Case_Parameters">Case Parameters</a></i> .</td></tr>
<tr><td>false</td><td><i><b><code>[caseLevel off]</code></b></i></td></tr>
<tr><td rowspan="3">kf</td><td>upper</td><td><code>[caseFirst upper]</code></td>
- <td rowspan="3">If set to <b>upper</b>, causes upper case to sort before lower case. If set to <b>lower</b>, causes lower case to sort before upper case. Useful for locales that have already supported ordering but require different order of cases. Affects case and tertiary levels. For details, see <i>Section 3.14, <a href="#Case_Parameters">Case Parameters</a></i> .</td></tr>
+ <td rowspan="3">If set to <b>upper</b>, causes upper case to sort before lower case. If set to <b>lower</b>, causes lower case to sort before upper case. Useful for locales that have already supported ordering but require different order of cases. Affects case and tertiary levels. For details, see <i><a href="#Case_Parameters">Case Parameters</a></i> .</td></tr>
<tr><td>lower</td><td><code>[caseFirst lower]</code></td></tr>
<tr><td>false</td><td><i><b><code>[caseFirst off]</code></b></i></td></tr>
<tr><td rowspan="2">kh</td><td>true<br/><i><b>Deprecated:</b></i> Use rules with quater­nary relations instead.</td><td><code>[hiraganaQ on]</code></td>
- <td rowspan="2">Controls special treatment of Hiragana code points on quaternary level. If turned <b>on</b>, Hiragana codepoints will get lower values than all the other non-variable code points in <b>shifted</b>. That is, the normal Level 4 value for a regular collation element is FFFF, as described in [<a href="https://www.unicode.org/reports/tr41/#UTS10">UCA</a>], <i>Section 3.6, <a href="https://www.unicode.org/reports/tr10/#Variable_Weighting">Variable Weighting</a></i> . This is changed to FFFE for [:script=Hiragana:] characters. The strength must be greater or equal than quaternary if this attribute is to have any effect.</td></tr>
+ <td rowspan="2">Controls special treatment of Hiragana code points on quaternary level. If turned <b>on</b>, Hiragana code points will get lower values than all the other non-variable code points in <b>shifted</b>. That is, the normal Level 4 value for a regular collation element is FFFF, as described in [<a href="https://www.unicode.org/reports/tr41/#UTS10">UCA</a>], <i><a href="https://www.unicode.org/reports/tr10/#Variable_Weighting">Variable Weighting</a></i> . This is changed to FFFE for [:script=Hiragana:] characters. The strength must be greater than or equal to quaternary if this attribute is to have any effect.</td></tr>
<tr><td>false</td><td><i><b><code>[hiraganaQ off]</code></b></i></td></tr>
<tr><td rowspan="2">kn</td><td>true</td><td><code>[numericOrdering on]</code></td>
@@ -663,23 +668,23 @@
<tr><td>false</td><td><i><b><code>[numericOrdering off]</code></b></i></td></tr>
<tr><td>kr</td><td>a sequence of one or more reorder codes: <b>space, punct, symbol, currency, digit</b>, or any BCP47 script ID</td><td><code>[reorder Grek digit]</code></td>
- <td>Specifies a reordering of scripts or other significant blocks of characters such as symbols, punctuation, and digits. For the precise meaning and usage of the reorder codes, see <i>Section 3.13, <a href="#Script_Reordering">Collation Reordering</a>.</i></td></tr>
+ <td>Specifies a reordering of scripts or other significant blocks of characters such as symbols, punctuation, and digits. For the precise meaning and usage of the reorder codes, see <i><a href="#Script_Reordering">Collation Reordering</a>.</i></td></tr>
<tr><td rowspan="4">kv</td><td>space</td><td><code>[maxVariable space]</code></td>
<td rowspan="4">Sets the variable top to the top of the specified reordering group. All code points with primary weights less than or equal to the variable top will be considered variable, and thus affected by the alternate handling. Variables are ignorable by default in [<a href="https://www.unicode.org/reports/tr41/#UTS10">UCA</a>], but not in CLDR.</td></tr>
<tr><td>punct</td><td><i><b><code>[maxVariable punct]</code></b></i></td></tr>
<tr><td>symbol</td><td><b><code>[maxVariable symbol]</code><br/>(UCA default)</b></td></tr>
<tr><td>currency</td><td><code>[maxVariable currency]</code></td></tr>
-<tr><td>vt</td><td>See <i>Part 1 Section 3.6.4, <a href="tr35.md#Unicode_Locale_Extension_Data_Files">U Extension Data Files</a></i>.<br/><i><b>Deprecated:</b></i> Use maxVariable instead.</td><td><code>&\u00XX\uYYYY < [variable top]</code><br/><br/>(the default is set to the highest punctuation, thus including spaces and punctuation, but not symbols)</td>
+<tr><td>vt</td><td>See <i>Part 1 <a href="tr35.md#Unicode_Locale_Extension_Data_Files">U Extension Data Files</a></i>.<br/><i><b>Deprecated:</b></i> Use maxVariable instead.</td><td><code>&\u00XX\uYYYY < [variable top]</code><br/><br/>(the default is set to the highest punctuation, thus including spaces and punctuation, but not symbols)</td>
<td>The BCP47 value is described in <i>Appendix Q: <a href="tr35.md#Locale_Extension_Key_and_Type_Data">Locale Extension Keys and Types</a>.</i><br/><br/>Sets the string value for the variable top. All the code points with primary weights less than or equal to the variable top will be considered variable, and thus affected by the alternate handling.<br/>An implementation that supports the variableTop setting should also support the maxVariable setting, and it should "pin" ("round up") the variableTop to the top of the containing reordering group.<br/>Variables are ignorable by default in [<a href="https://www.unicode.org/reports/tr41/#UTS10">UCA</a>], but not in CLDR. See below for more information.</td></tr>
<tr><td><i>n/a</i></td><td><i>n/a</i></td><td><i>n/a</i></td>
- <td>match-boundaries: <i><b>none</b></i> | whole-character | whole-word<br/>Defined by <i>Section 8, <a href="https://www.unicode.org/reports/tr10/#Searching">Searching and Matching</a></i> of [<a href="https://www.unicode.org/reports/tr41/#UTS10">UCA</a>].</td></tr>
+ <td>match-boundaries: <i><b>none</b></i> | whole-character | whole-word<br/>Defined by <i><a href="https://www.unicode.org/reports/tr10/#Searching">Searching and Matching</a></i> of [<a href="https://www.unicode.org/reports/tr41/#UTS10">UCA</a>].</td></tr>
<tr><td><i>n/a</i></td><td><i>n/a</i></td><td><i>n/a</i></td>
- <td>match-style: <i><b>minimal</b></i> | medial | maximal<br/>Defined by <i>Section 8, <a href="https://www.unicode.org/reports/tr10/#Searching">Searching and Matching</a></i> of [<a href="https://www.unicode.org/reports/tr41/#UTS10">UCA</a>].</td></tr>
+ <td>match-style: <i><b>minimal</b></i> | medial | maximal<br/>Defined by <i><a href="https://www.unicode.org/reports/tr10/#Searching">Searching and Matching</a></i> of [<a href="https://www.unicode.org/reports/tr41/#UTS10">UCA</a>].</td></tr>
</tbody></table>
-#### 3.4.1 <a name="Common_Settings" href="#Common_Settings">Common settings combinations</a>
+#### <a name="Common_Settings" href="#Common_Settings">Common settings combinations</a>
Some commonly used parametric collation settings are available via combinations of LDML settings attributes:
@@ -689,7 +694,7 @@
* “Ignore punctuation” (completely): **strength=tertiary alternate=shifted**
* “Ignore punctuation” but distinguish among punctuation marks: **strength=quaternary alternate=shifted**
-#### 3.4.2 <a name="Normalization_Setting" href="#Normalization_Setting">Notes on the normalization setting</a>
+#### <a name="Normalization_Setting" href="#Normalization_Setting">Notes on the normalization setting</a>
The UCA always normalizes input strings into NFD form before the rest of the algorithm. However, this results in poor performance.
@@ -699,13 +704,13 @@
In order to handle strings with these characters (e.g., “aä” and “ӛ́” [which are in FCD]) exactly as with prior NFD normalization, an implementation needs to either add overlap contractions to its data (e.g., “a+ä” and “ә+◌̈́”), or it needs to decompose the relevant composites (e.g., ‘ä’ and ‘◌̈́’) as soon as they are encountered.
-#### 3.4.3 <a name="Variable_Top_Settings" href="#Variable_Top_Settings">Notes on variable top settings</a>
+#### <a name="Variable_Top_Settings" href="#Variable_Top_Settings">Notes on variable top settings</a>
Users may want to include more or fewer characters as Variable. For example, someone could want to restrict the Variable characters to just include space marks. In that case, maxVariable would be set to "space". (In CLDR 24 and earlier, the now-deprecated variableTop would be set to U+1680, see the “Whitespace” [UCA collation chart](https://www.unicode.org/charts/collation/)). Alternatively, someone could want to treat more of the Common characters as Variable, including characters up to (but not including) '0', by setting maxVariable to "currency". (In CLDR 24 and earlier, the now-deprecated variableTop would be set to U+20BA, see the “Currency-Symbol” collation chart).
The effect of these settings is to customize which sets of characters are ignored when comparing strings. For example, the locale identifier "de-u-ka-shifted-kv-currency" requests settings appropriate for German, including German sorting conventions, and specifies that currency symbols and characters sorting below them are ignored in sorting.
-### 3.5 <a name="Rules" href="#Rules">Collation Rule Syntax</a>
+### <a name="Rules" href="#Rules">Collation Rule Syntax</a>
```xml
<!ELEMENT cr #PCDATA >
@@ -729,7 +734,7 @@
The collation syntax is case-sensitive.
-### 3.6 <a name="Orderings" href="#Orderings">Orderings</a>
+### <a name="Orderings" href="#Orderings">Orderings</a>
The root collation mappings form the initial state. Mappings are added and removed via a sequence of rule chains. Each tailoring rule builds on the current state after all of the preceding rules (and is not affected by any following rules). Rule chains may alternate with comments, settings, and special commands.
@@ -774,7 +779,7 @@
* Weights must be allocated in accordance with the [UCA well-formedness conditions](https://www.unicode.org/reports/tr10/#Well-Formed).
* When incrementing any weight, lower-level weights should be reset to the “common” values, to help with sort key compression.
-In all cases, even for `=` , the case bits are recomputed according to _Section 3.13, [Case Parameters](#Case_Parameters)_. (This can be skipped if an implementation does not support the caseLevel or caseFirst settings.)
+In all cases, even for `=` , the case bits are recomputed according to _[Case Parameters](#Case_Parameters)_. (This can be skipped if an implementation does not support the caseLevel or caseFirst settings.)
For example, `&ae<x` maps ‘x’ to two collation elements. The first one is the same as for ‘a’, and the second one has a primary weight between those for ‘e’ and ‘f’. As a result, ‘x’ sorts between “ae” and “af”. (If the primary of the first collation element was incremented instead, then ‘x’ would sort after “az”. While also sorting primary-after “ae” this would be surprising and sub-optimal.)
@@ -792,7 +797,7 @@
| `<<<<*` | `& k`<br/>`<<<<* qQ` | `& k`<br/>`<<<< q <<<< Q` |
| `=*` | `& v`<br/>`=* VwW` | `& v`<br/>`= V = w = W` |
-### 3.7 <a name="Contractions" href="#Contractions">Contractions</a>
+### <a name="Contractions" href="#Contractions">Contractions</a>
A multi-character relation string defines a contraction.
@@ -802,7 +807,7 @@
| ---------------- | ----------- |
| `& k`<br/>`< ch` | Make the sequence 'ch' sort after 'k', as a primary (base-character) difference |
-### 3.8 <a name="Expansions" href="#Expansions">Expansions</a>
+### <a name="Expansions" href="#Expansions">Expansions</a>
A mapping to multiple collation elements defines an expansion. This is normally the result of a reset position (and/or preceding relation) that yields multiple collation elements, for example `&ae<x` or `&æ<y` .
@@ -816,7 +821,7 @@
In summary, there are two ways of specifying expansions which produce subtly different mappings. The use of extension strings is unusual but sometimes necessary.
-### 3.9 <a name="Context_Before" href="#Context_Before">Context Before</a>
+### <a name="Context_Before" href="#Context_Before">Context Before</a>
A relation string can have a prefix (context before) which makes the mapping from the relation string to its tailored position conditional on the string occurring after that prefix. For details see the specification of _[Context-Sensitive Mappings](#Context_Sensitive_Mappings)_.
@@ -834,7 +839,7 @@
* `< def / ghi`
* `< abc | def`
-### 3.10 <a name="Placing_Characters_Before_Others" href="#Placing_Characters_Before_Others">Placing Characters Before Others</a>
+### <a name="Placing_Characters_Before_Others" href="#Placing_Characters_Before_Others">Placing Characters Before Others</a>
There are certain circumstances where characters need to be placed before a given character, rather than after. This is the case with Pinyin, for example, where certain accented letters are positioned before the base letter. That is accomplished with the following syntax.
@@ -847,7 +852,7 @@
* `&[before 2] a < à # error`
* `&[before 2] a <<< à # error`
-### 3.11 <a name="Logical_Reset_Positions" href="#Logical_Reset_Positions">Logical Reset Positions</a>
+### <a name="Logical_Reset_Positions" href="#Logical_Reset_Positions">Logical Reset Positions</a>
The CLDR table (based on UCA) has the following overall structure for weights, going from low to high.
@@ -888,7 +893,7 @@
The value can be changed by using the maxVariable setting. This takes effect, however, after the rules have been built, and does not affect any characters that are reset relative to the `[last variable]` value when the rules are being built. The maxVariable setting might also be changed via a runtime parameter. That also does not affect the rules.
(In CLDR 24 and earlier, the variable top could also be set by using a tailoring rule with `[variable top]` in the place of a relation string.)
-### 3.12 <a name="Special_Purpose_Commands" href="#Special_Purpose_Commands">Special-Purpose Commands</a>
+### <a name="Special_Purpose_Commands" href="#Special_Purpose_Commands">Special-Purpose Commands</a>
The import command imports rules from another collation. This allows for better maintenance and smaller rule sizes. The source is a BCP 47 language tag with an optional collation type but without other extensions. The collation type is the BCP 47 form of the collation type in the source; it defaults to "standard".
@@ -939,7 +944,7 @@
</collation>
```
-### 3.13 <a name="Script_Reordering" href="#Script_Reordering">Collation Reordering</a>
+### <a name="Script_Reordering" href="#Script_Reordering">Collation Reordering</a>
Collation reordering allows scripts and certain other defined blocks of characters to be moved relative to each other parametrically, without changing the detailed rules for all the characters involved. This reordering is done on top of any specific ordering rules within the script or block currently in effect. Reordering can specify groups to be placed at the start and/or the end of the collation order. For example, to reorder Greek characters before Latin characters, and digits afterwards (but before other scripts), the following can be used:
@@ -962,7 +967,7 @@
Interaction with **alternate=shifted**: Whether a primary weight is “variable” is determined according to the “variable top”, before applying script reordering. Once that is determined, script reordering is applied to the primary weight regardless of whether it is “regular” (used in the primary level) or “shifted” (used in the quaternary level).
-#### 3.13.1 <a name="Interpretation_reordering" href="#Interpretation_reordering">Interpretation of a reordering list</a>
+#### <a name="Interpretation_reordering" href="#Interpretation_reordering">Interpretation of a reordering list</a>
The reordering list is interpreted as if it were processed in the following way.
@@ -982,7 +987,7 @@
| `en-u-kr-arab-cyrl-others-symbol` | Reorder Arabic characters first, then Cyrillic, and put symbols at the end—after all other characters. |
| `en-u-kr-others` | Remove any locale-specific reordering, and use DUCET order for reordering blocks. |
-The default reordering groups are defined by the FractionalUCA.txt file, based on the primary weights of associated collation elements. The file contains special mappings for the start of each group, script, and reorder-reserved range, see _Section 2.6.2, [FractionalUCA.txt](#File_Format_FractionalUCA_txt)_.
+The default reordering groups are defined by the FractionalUCA.txt file, based on the primary weights of associated collation elements. The file contains special mappings for the start of each group, script, and reorder-reserved range, see _[FractionalUCA.txt](#File_Format_FractionalUCA_txt)_.
There are some special cases:
@@ -1001,13 +1006,13 @@
* Primary weights from different original lead bytes can be reordered to a shared lead byte, as long as they do not overlap. Primary compression ends when the target lead byte differs or when the original lead byte of the next primary is not compressible.
* Non-compressible groups and scripts begin or end on whole-primary-lead-byte boundaries (or both), so that reordering cannot surround a non-compressible script by two compressible ones within the same target lead byte. This is so that primary compression can be terminated reliably (choosing the low or high terminator byte) simply by comparing the previous and current primary weights. Otherwise it would have to also check for another condition (e.g., equal scripts).
-#### 3.13.2 <a name="Reordering_Groups_allkeys" href="#Reordering_Groups_allkeys">Reordering Groups for allkeys.txt</a>
+#### <a name="Reordering_Groups_allkeys" href="#Reordering_Groups_allkeys">Reordering Groups for allkeys.txt</a>
For allkeys_CLDR.txt, the start of each reordering group can be determined from FractionalUCA.txt, by finding the first real mapping (after “xyz first primary”) of that group (e.g., `0060; [0D 07, 05, 05] # Zyyy Sk [0312.0020.0002] * GRAVE ACCENT` ), and looking for that mapping's character sequence ( `0060` ) in allkeys_CLDR.txt. The comment in FractionalUCA.txt ( `[0312.0020.0002]` ) also shows the allkeys_CLDR.txt collation elements.
The DUCET ordering of some characters is slightly different from the CLDR root collation order. The reordering groups for the DUCET are not specified. The following describes how reordering groups for the DUCET can be derived.
-For allkeys_DUCET.txt, the start of each reordering group is normally the primary weight corresponding to the same character sequence as for allkeys_CLDR.txt. In a few cases this requires adjustment, especially for the special reordering groups, due to CLDR’s ordering the common characters more strictly by category than the DUCET (as described in _Section 2, [Root Collation](#Root_Collation)_). The necessary adjustment would set the start of each allkeys_DUCET.txt reordering group to the primary weight of the first mapping for the relevant General_Category for a special reordering group (for characters that sort before ‘a’), or the primary weight of the first mapping for the first script (e.g., sc=Grek) of an “alphabetic” group (for characters that sort at or after ‘a’).
+For allkeys_DUCET.txt, the start of each reordering group is normally the primary weight corresponding to the same character sequence as for allkeys_CLDR.txt. In a few cases this requires adjustment, especially for the special reordering groups, due to CLDR’s ordering the common characters more strictly by category than the DUCET (as described in _[Root Collation](#Root_Collation)_). The necessary adjustment would set the start of each allkeys_DUCET.txt reordering group to the primary weight of the first mapping for the relevant General_Category for a special reordering group (for characters that sort before ‘a’), or the primary weight of the first mapping for the first script (e.g., sc=Grek) of an “alphabetic” group (for characters that sort at or after ‘a’).
Note that the following only applies to primary weights greater than the one for U+FFFE and less than "trailing" weights.
@@ -1023,7 +1028,7 @@
Each collation element of an expansion may be in a different reordering group, for example for parenthesized characters.
-### 3.14 <a name="Case_Parameters" href="#Case_Parameters">Case Parameters</a>
+### <a name="Case_Parameters" href="#Case_Parameters">Case Parameters</a>
The **case level** is an _optional_ intermediate level ("2.5") between Level 2 and Level 3 (or after Level 1, if there is no Level 2 due to strength settings). The case level is used to support two parametric features: ignoring non-case variants (Level 3 differences) except for case, and giving case differences a higher-level priority than other tertiary differences. Distinctions between small and large Kana characters are also included as case differences, to support Japanese collation.
@@ -1033,7 +1038,7 @@
When either the **case level** or **case first** parameters are set, the following describes the derivation of the modified collation elements. It assumes the original levels for the code point are [p.s.t] (primary, secondary, tertiary). This derivation may change in future versions of LDML, to track the case characteristics more closely.
-#### 3.14.1 <a name="Case_Untailored" href="#Case_Untailored">Untailored Characters</a>
+#### <a name="Case_Untailored" href="#Case_Untailored">Untailored Characters</a>
For untailored characters and strings, that is, for mappings in the root collation, the case value for each collation element is computed from the tertiary weight listed in allkeys_CLDR.txt. This is used to modify the collation element.
@@ -1043,7 +1048,7 @@
2. UNCASED otherwise
3. FractionalUCA.txt encodes the case information in bits 6 and 7 of the first byte in each tertiary weight. The case bits are set to 00 for UNCASED and LOWERCASE, and 10 for UPPER. There is no MIXED case value (01) in the root collation.
-#### 3.14.2 <a name="Case_Weights" href="#Case_Weights">Compute Modified Collation Elements</a>
+#### <a name="Case_Weights" href="#Case_Weights">Compute Modified Collation Elements</a>
From a computed case value, set a weight **c** according to the following.
@@ -1074,7 +1079,7 @@
The case weight of a tertiary-ignorable CE must be 0 so that [[UCA](https://www.unicode.org/reports/tr41/#UTS10)] [well-formedness condition 1](https://www.unicode.org/reports/tr10/#WF1) is fulfilled.
-#### 3.14.3 <a name="Case_Tailored" href="#Case_Tailored">Tailored Strings</a>
+#### <a name="Case_Tailored" href="#Case_Tailored">Tailored Strings</a>
Characters and strings that are tailored have case values computed from their root collation case bits.
@@ -1092,15 +1097,15 @@
> 👉 **Note**: Almost all Cased characters have primary (non-ignorable) root collation CEs, except for U+0345 Combining Ypogegrammeni which is Lowercase. All Uppercase characters have primary root collation CEs.
-### 3.15 <a name="Visibility" href="#Visibility">Visibility</a>
+### <a name="Visibility" href="#Visibility">Visibility</a>
-Collations have external visibility by default, meaning that they can be displayed in a list of collation options for users to choose from. A collation whose type name starts with "private-" is internal and should not be shown in such a list. Collations are typically internal when they are partial sequences included in other collations. See _Section 3.1, [Collation Types](#Collation_Types)_ .
+Collations have external visibility by default, meaning that they can be displayed in a list of collation options for users to choose from. A collation whose type name starts with "private-" is internal and should not be shown in such a list. Collations are typically internal when they are partial sequences included in other collations. See _[Collation Types](#Collation_Types)_.
-### 3.16 <a name="Collation_Indexes" href="#Collation_Indexes">Collation Indexes</a>
+### <a name="Collation_Indexes" href="#Collation_Indexes">Collation Indexes</a>
-#### 3.16.1 <a name="Index_Characters" href="#Index_Characters">Index Characters</a>
+#### <a name="Index_Characters" href="#Index_Characters">Index Characters</a>
-The main data includes `<exemplarCharacters>` for collation indexes. See _Part 2 General, Section 3, [Character Elements](tr35-general.md#Character_Elements)_, for general information about exemplar characters.
+The main data includes `<exemplarCharacters>` for collation indexes. See _Part 2 General, [Character Elements](tr35-general.md#Character_Elements)_, for general information about exemplar characters.
The index characters are a set of characters for use as a UI "index", that is, a list of clickable characters (or character sequences) that allow the user to see a segment of a larger "target" list. Each character corresponds to a bucket in the target list. One may have different kinds of index lists; one that produces an index list that is relatively static, and the other is a list that produces roughly equally-sized buckets. While CLDR is mostly focused on the first, there is provision for supporting the second as well.
@@ -1159,9 +1164,9 @@
<exemplarCharacters type="index">[A B C D E F G H I J K L M N O P Q R S T U V W X Y Z]</exemplarCharacters>
```
-The display of the index characters can be modified with the Index labels elements, discussed in the _Part 2 General, Section 3.3, [Index Labels](tr35-general.md#IndexLabels)_.
+The display of the index characters can be modified with the Index labels elements, discussed in the _Part 2 General, [Index Labels](tr35-general.md#IndexLabels)_.
-#### 3.16.2 <a name="CJK_Index_Markers" href="#CJK_Index_Markers">CJK Index Markers</a>
+#### <a name="CJK_Index_Markers" href="#CJK_Index_Markers">CJK Index Markers</a>
Special index markers have been added to the CJK collations for stroke, pinyin, zhuyin, and unihan. These markers allow for effective and robust use of indexes for these collations.
@@ -1185,6 +1190,6 @@
* * *
-Copyright © 2001–2022 Unicode, Inc. All Rights Reserved. The Unicode Consortium makes no expressed or implied warranty of any kind, and assumes no liability for errors or omissions. No liability is assumed for incidental and consequential damages in connection with or arising out of the use of the information or programs contained or accompanying this technical report. The Unicode [Terms of Use](https://www.unicode.org/copyright.html) apply.
+Copyright © 2001–2023 Unicode, Inc. All Rights Reserved. The Unicode Consortium makes no expressed or implied warranty of any kind, and assumes no liability for errors or omissions. No liability is assumed for incidental and consequential damages in connection with or arising out of the use of the information or programs contained or accompanying this technical report. The Unicode [Terms of Use](https://www.unicode.org/copyright.html) apply.
Unicode and the Unicode logo are trademarks of Unicode, Inc., and are registered in some jurisdictions.
diff --git a/docs/ldml/tr35-dates.anchors.json b/docs/ldml/tr35-dates.anchors.json
new file mode 100644
index 0000000..06d4b5b
--- /dev/null
+++ b/docs/ldml/tr35-dates.anchors.json
@@ -0,0 +1,120 @@
+[
+ "am--pm",
+ "availableFormats_appendItems",
+ "Calendar_Data",
+ "Calendar_Elements",
+ "Calendar_Fields",
+ "Calendar_Preference_Data",
+ "calendar-data",
+ "calendar-elements",
+ "calendar-fields",
+ "calendar-preference-data",
+ "Contents",
+ "contents-of-part-4-dates",
+ "Date_Field_Symbol_Table",
+ "Date_Format_Pattern_Examples",
+ "Date_Format_Patterns",
+ "Date_Patterns_AM_PM",
+ "Date_Patterns_Eras",
+ "Date_Patterns_Week_Elements",
+ "Date_Patterns_Week_Of_Year",
+ "Date_Time_Combination_Examples",
+ "date-format-patterns",
+ "dateFormats",
+ "dateTimeFormat",
+ "dateTimeFormats",
+ "Day_Period_Rule_Sets",
+ "Day_Period_Rules",
+ "day-period-rule-sets",
+ "day-period-rules",
+ "dayPeriods",
+ "dfst-day",
+ "dfst-era",
+ "dfst-hour",
+ "dfst-minute",
+ "dfst-month",
+ "dfst-period",
+ "dfst-quarter",
+ "dfst-second",
+ "dfst-sep",
+ "dfst-week",
+ "dfst-weekday",
+ "dfst-year",
+ "dfst-zone",
+ "element-dateformats",
+ "element-datetimeformat",
+ "element-datetimeformats",
+ "element-dayperiods",
+ "element-intervalformats",
+ "element-timeformats",
+ "elements-availableformats-appenditems",
+ "elements-monthpatterns-cyclicnamesets",
+ "elements-months-days-quarters-eras",
+ "eras",
+ "fallbackFormat",
+ "first-day-overrides",
+ "Fixed_periods",
+ "fixed-periods",
+ "goals",
+ "intervalFormats",
+ "Localized_Pattern_Characters",
+ "localized-pattern-characters-deprecated",
+ "Mapping_Requested_Time_Skeletons_To_Patterns",
+ "Matching_Skeletons",
+ "matching-skeletons",
+ "Metazone_Names",
+ "metazone-names",
+ "metazones",
+ "Metazones",
+ "Missing_Skeleton_Fields",
+ "missing-skeleton-fields",
+ "monthPatterns_cyclicNameSets",
+ "months_days_quarters_eras",
+ "Overview_Dates_Element_Supplemental",
+ "overview-dates-element-supplemental-date-and-calendar-information",
+ "parsing",
+ "Parsing_Dates_Times",
+ "Parsing_Day_Periods",
+ "parsing-dates-and-times",
+ "parsing-day-periods",
+ "parts",
+ "Parts",
+ "Primary_Zones",
+ "primary-zones",
+ "status",
+ "summary",
+ "Supplemental_Calendar_Data",
+ "Supplemental_Time_Zone_Data",
+ "supplemental-calendar-data",
+ "supplemental-time-zone-data",
+ "table-date-field-symbol-table",
+ "table-date-format-pattern-examples",
+ "table-date-time-combination-examples",
+ "table-mapping-requested-time-skeletons-to-patterns",
+ "table-timezonenames-elements-used-for-fallback",
+ "table-week-designation-types",
+ "Time_Data",
+ "Time_Zone_Format_Terminology",
+ "Time_Zone_Goals",
+ "Time_Zone_Names",
+ "Time_Zone_Parsing",
+ "time-data",
+ "time-zone-format-terminology",
+ "time-zone-names",
+ "timeFormats",
+ "timeZoneNames_Elements_Used_for_Fallback",
+ "unicode-locale-data-markup-language-ldmlpart-4-dates",
+ "unicode-technical-standard-35",
+ "Using_Time_Zone_Names",
+ "using-time-zone-names",
+ "Variable_periods",
+ "variable-periods",
+ "Week_Data",
+ "Week_Designation_Types",
+ "week-data",
+ "week-elements",
+ "week-of-year",
+ "Windows_Zones",
+ "windows-zones",
+ "Year_Length_Examples"
+]
\ No newline at end of file
diff --git a/docs/ldml/tr35-dates.md b/docs/ldml/tr35-dates.md
index 3c59f6b..701ac5b 100644
--- a/docs/ldml/tr35-dates.md
+++ b/docs/ldml/tr35-dates.md
@@ -2,7 +2,7 @@
# Unicode Locale Data Markup Language (LDML)<br/>Part 4: Dates
-|Version|42 |
+|Version|44.1 |
|-------|------------------|
|Editors|Peter Edberg and <a href="tr35.md#Acknowledgments">other CLDR committee members</a>|
@@ -16,7 +16,12 @@
### _Status_
-_This document has been reviewed by Unicode members and other interested parties, and has been approved for publication by the Unicode Consortium. This is a stable document and may be used as reference material or cited as a normative reference by other specifications._
+<!-- _This is a draft document which may be updated, replaced, or superseded by other documents at any time.
+Publication does not imply endorsement by the Unicode Consortium.
+This is not a stable document; it is inappropriate to cite this document as other than a work in progress._ -->
+
+_This document has been reviewed by Unicode members and other interested parties, and has been approved for publication by the Unicode Consortium.
+This is a stable document and may be used as reference material or cited as a normative reference by other specifications._
> _**A Unicode Technical Standard (UTS)** is an independent specification. Conformance to the Unicode Standard does not imply conformance to any UTS._
@@ -37,55 +42,56 @@
## <a name="Contents" href="#Contents">Contents of Part 4, Dates</a>
-* 1 [Overview: Dates Element, Supplemental Date and Calendar Information](#Overview_Dates_Element_Supplemental)
-* 2 [Calendar Elements](#Calendar_Elements)
- * 2.1 [Elements months, days, quarters, eras](#months_days_quarters_eras)
- * 2.2 [Elements monthPatterns, cyclicNameSets](#monthPatterns_cyclicNameSets)
- * 2.3 [Element dayPeriods](#dayPeriods)
- * 2.4 [Element dateFormats](#dateFormats)
- * 2.5 [Element timeFormats](#timeFormats)
- * 2.6 [Element dateTimeFormats](#dateTimeFormats)
- * 2.6.1 [Element dateTimeFormat](#dateTimeFormat)
+* [Overview: Dates Element, Supplemental Date and Calendar Information](#Overview_Dates_Element_Supplemental)
+* [Calendar Elements](#Calendar_Elements)
+ * [Elements months, days, quarters, eras](#months_days_quarters_eras)
+ * [Elements monthPatterns, cyclicNameSets](#monthPatterns_cyclicNameSets)
+ * [Element dayPeriods](#dayPeriods)
+ * [Element dateFormats](#dateFormats)
+ * [Element timeFormats](#timeFormats)
+ * [Element dateTimeFormats](#dateTimeFormats)
+ * [Element dateTimeFormat](#dateTimeFormat)
* Table: [Date-Time Combination Examples](#Date_Time_Combination_Examples)
- * 2.6.2 [Elements availableFormats, appendItems](#availableFormats_appendItems)
+ * [Elements availableFormats, appendItems](#availableFormats_appendItems)
* Table: [Mapping Requested Time Skeletons To Patterns](#Mapping_Requested_Time_Skeletons_To_Patterns)
- * 2.6.2.1 [Matching Skeletons](#Matching_Skeletons)
- * 2.6.2.2 [Missing Skeleton Fields](#Missing_Skeleton_Fields)
- * 2.6.3 [Element intervalFormats](#intervalFormats)
-* 3 [Calendar Fields](#Calendar_Fields)
-* 4 [Supplemental Calendar Data](#Supplemental_Calendar_Data)
- * 4.1 [Calendar Data](#Calendar_Data)
- * 4.2 [Calendar Preference Data](#Calendar_Preference_Data)
- * 4.3 [Week Data](#Week_Data)
+ * [Matching Skeletons](#Matching_Skeletons)
+ * [Missing Skeleton Fields](#Missing_Skeleton_Fields)
+ * [Element intervalFormats](#intervalFormats)
+* [Calendar Fields](#Calendar_Fields)
+* [Supplemental Calendar Data](#Supplemental_Calendar_Data)
+ * [Calendar Data](#Calendar_Data)
+ * [Calendar Preference Data](#Calendar_Preference_Data)
+ * [Week Data](#Week_Data)
* Table: [Week Designation Types](#Week_Designation_Types)
- * 4.4 [Time Data](#Time_Data)
- * 4.5 [Day Period Rule Sets](#Day_Period_Rule_Sets)
- * 4.5.1 [Day Period Rules](#Day_Period_Rules)
- * 4.5.1.1 [Fixed periods](#Fixed_periods)
- * 4.5.1.2 [Variable periods](#Variable_periods)
- * 4.5.1.3 [Parsing Day Periods](#Parsing_Day_Periods)
-* 5 [Time Zone Names](#Time_Zone_Names)
+ * [First Day Overrides](#first-day-overrides)
+ * [Time Data](#Time_Data)
+ * [Day Period Rule Sets](#Day_Period_Rule_Sets)
+ * [Day Period Rules](#Day_Period_Rules)
+ * [Fixed periods](#Fixed_periods)
+ * [Variable periods](#Variable_periods)
+ * [Parsing Day Periods](#Parsing_Day_Periods)
+* [Time Zone Names](#Time_Zone_Names)
* Table: [timeZoneNames Elements Used for Fallback](#timeZoneNames_Elements_Used_for_Fallback)
- * 5.1 [Metazone Names](#Metazone_Names)
-* 6 [Supplemental Time Zone Data](#Supplemental_Time_Zone_Data)
- * 6.1 [Metazones](#Metazones)
- * 6.2 [Windows Zones](#Windows_Zones)
- * 6.3 [Primary Zones](#Primary_Zones)
-* 7 [Using Time Zone Names](#Using_Time_Zone_Names)
- * 7.1 [Time Zone Format Terminology](#Time_Zone_Format_Terminology)
- * 7.2 [Goals](#Time_Zone_Goals)
- * 7.3 [Parsing](#Time_Zone_Parsing)
-* 8 [Date Format Patterns](#Date_Format_Patterns)
+ * [Metazone Names](#Metazone_Names)
+* [Supplemental Time Zone Data](#Supplemental_Time_Zone_Data)
+ * [Metazones](#Metazones)
+ * [Windows Zones](#Windows_Zones)
+ * [Primary Zones](#Primary_Zones)
+* [Using Time Zone Names](#Using_Time_Zone_Names)
+ * [Time Zone Format Terminology](#Time_Zone_Format_Terminology)
+ * [Goals](#Time_Zone_Goals)
+ * [Parsing](#Time_Zone_Parsing)
+* [Date Format Patterns](#Date_Format_Patterns)
* Table: [Date Format Pattern Examples](#Date_Format_Pattern_Examples)
* Table: [Date Field Symbol Table](#Date_Field_Symbol_Table)
- * 8.1 [Localized Pattern Characters (deprecated)](#Localized_Pattern_Characters)
- * 8.2 [AM / PM](#Date_Patterns_AM_PM)
- * 8.3 [Eras](#Date_Patterns_Eras)
- * 8.4 [Week of Year](#Date_Patterns_Week_Of_Year)
- * 8.5 [Week Elements](#Date_Patterns_Week_Elements)
-* 9 [Parsing Dates and Times](#Parsing_Dates_Times)
+ * [Localized Pattern Characters (deprecated)](#Localized_Pattern_Characters)
+ * [AM / PM](#Date_Patterns_AM_PM)
+ * [Eras](#Date_Patterns_Eras)
+ * [Week of Year](#Date_Patterns_Week_Of_Year)
+ * [Week Elements](#Date_Patterns_Week_Elements)
+* [Parsing Dates and Times](#Parsing_Dates_Times)
-## 1 <a name="Overview_Dates_Element_Supplemental" href="#Overview_Dates_Element_Supplemental">Overview: Dates Element, Supplemental Date and Calendar Information</a>
+## <a name="Overview_Dates_Element_Supplemental" href="#Overview_Dates_Element_Supplemental">Overview: Dates Element, Supplemental Date and Calendar Information</a>
```xml
<!ELEMENT dates (alias | (calendars?, fields?, timeZoneNames?, special*)) >
@@ -93,10 +99,10 @@
The LDML top-level `<dates>` element contains information regarding the format and parsing of dates and times, the formatting of date/time intervals, and the naming of various calendar elements.
-* The `<calendars>` element is described in section 2 [Calendar Elements](#Calendar_Elements).
-* The `<fields>` element is described in section 3 [Calendar Fields](#Calendar_Fields).
-* The `<timeZoneNames>` element is described in section 5 [Time Zone Names](#Time_Zone_Names).
-* The formats use pattern characters described in section 8 [Date Format Patterns](#Date_Format_Patterns).
+* The `<calendars>` element is described in [Calendar Elements](#Calendar_Elements).
+* The `<fields>` element is described in [Calendar Fields](#Calendar_Fields).
+* The `<timeZoneNames>` element is described in [Time Zone Names](#Time_Zone_Names).
+* The formats use pattern characters described in [Date Format Patterns](#Date_Format_Patterns).
```xml
<!ELEMENT supplementalData ( …, calendarData?, calendarPreferenceData?, weekData?, timeData?, …, timezoneData?, …, metazoneInfo?, …, dayPeriodRuleSet*, metaZones?, primaryZones?, windowsZones?, …) >
@@ -104,11 +110,11 @@
The relevant top-level supplemental elements are listed above.
-* The `<calendarData>`, `<calendarPreferenceData>`, `<weekData>`, `<timeData>`, and `<dayPeriodRuleSet>` elements are described in section 4 [Supplemental Calendar Data](#Supplemental_Calendar_Data).
+* The `<calendarData>`, `<calendarPreferenceData>`, `<weekData>`, `<timeData>`, and `<dayPeriodRuleSet>` elements are described in [Supplemental Calendar Data](#Supplemental_Calendar_Data).
* The `<timezoneData>` element is deprecated and no longer used; the `<metazoneInfo>` element is deprecated at this level, and is now only used as a sub-element of `<metaZones>`.
-* The `<metaZones>`, `<primaryZones>`, and `<windowsZones>` elements are described in section 6 [Supplemental Time Zone Data](#Supplemental_Time_Zone_Data).
+* The `<metaZones>`, `<primaryZones>`, and `<windowsZones>` elements are described in [Supplemental Time Zone Data](#Supplemental_Time_Zone_Data).
-## 2 <a name="Calendar_Elements" href="#Calendar_Elements">Calendar Elements</a>
+## <a name="Calendar_Elements" href="#Calendar_Elements">Calendar Elements</a>
```xml
<!ELEMENT calendars (alias | (calendar*, special*)) >
@@ -127,7 +133,7 @@
The primary difference between Gregorian and "generic" format data is that date formats in "generic" usually include era with year, in order to provide an indication of which calendar is being used (Gregorian calendar formats may also commonly include era with year when Gregorian is not the default calendar for the locale). Otherwise, the "generic" date formats should normally be consistent with those in the Gregorian calendar. The "generic" calendar formats are intended to provide a consistent set of default formats for non-Gregorian calendars in the locale, so that in most cases the only data items that need be provided for non-Gregorian calendars are the era names and month names (and the latter only for calendars other than Buddhist, Japanese, and Minguo, since those inherit month names from Gregorian).
-### 2.1 <a name="months_days_quarters_eras" href="#months_days_quarters_eras">Elements months, days, quarters, eras</a>
+### <a name="months_days_quarters_eras" href="#months_days_quarters_eras">Elements months, days, quarters, eras</a>
```xml
<!ELEMENT months ( alias | (monthContext*, special*)) >
@@ -326,7 +332,7 @@
</eras>
```
-### 2.2 <a name="monthPatterns_cyclicNameSets" href="#monthPatterns_cyclicNameSets">Elements monthPatterns, cyclicNameSets</a>
+### <a name="monthPatterns_cyclicNameSets" href="#monthPatterns_cyclicNameSets">Elements monthPatterns, cyclicNameSets</a>
```xml
<!ELEMENT monthPatterns ( alias | (monthPatternContext*, special*)) >
@@ -413,7 +419,7 @@
</cyclicNameSets>
```
-### 2.3 <a name="dayPeriods" href="#dayPeriods">Element dayPeriods</a>
+### <a name="dayPeriods" href="#dayPeriods">Element dayPeriods</a>
The former `am`/`pm` elements have been deprecated, and replaced by the more flexible `dayPeriods`.
@@ -448,7 +454,7 @@
</dayPeriods>
```
-### 2.4 <a name="dateFormats" href="#dateFormats">Element dateFormats</a>
+### <a name="dateFormats" href="#dateFormats">Element dateFormats</a>
```xml
<!ELEMENT dateFormats (alias | (default*, dateFormatLength*, special*)) >
@@ -515,7 +521,7 @@
The `datetimeSkeleton` element contains a _skeleton_ (see [availableFormats](#availableFormats_appendItems)) derived from the pattern. In the future the intent is to be able to generate the standard patterns from these `datetimeSkeleton` elements. However, in CLDR 40, the mechanisms associated with the `availableFormats` elements are not quite powerful enough to generate patterns that exactly match all of the ones provided in the `pattern` elements.
-### 2.5 <a name="timeFormats" href="#timeFormats">Element timeFormats</a>
+### <a name="timeFormats" href="#timeFormats">Element timeFormats</a>
```xml
<!ELEMENT timeFormats (alias | (default*, timeFormatLength*, special*)) >
@@ -551,7 +557,7 @@
Time formats use the specific non-location format (z or zzzz) for the time zone name. This is the format that should be used when formatting a specific time for presentation. When formatting a time referring to a recurring time (such as a meeting in a calendar), applications should substitute the generic non-location format (v or vvvv) for the time zone in the time format pattern. See _[Using Time Zone Names](#Using_Time_Zone_Names)_ for a complete description of available time zone formats and their uses.
-### 2.6 <a name="dateTimeFormats" href="#dateTimeFormats">Element dateTimeFormats</a>
+### <a name="dateTimeFormats" href="#dateTimeFormats">Element dateTimeFormats</a>
```xml
<!ELEMENT dateTimeFormats (alias | (default*, dateTimeFormatLength*, availableFormats*, appendItems*, intervalFormats*, special*)) >
@@ -632,7 +638,7 @@
These formats allow for date and time formats to be composed in various ways.
-#### 2.6.1 <a name="dateTimeFormat" href="#dateTimeFormat">Element dateTimeFormat</a>
+#### <a name="dateTimeFormat" href="#dateTimeFormat">Element dateTimeFormat</a>
```xml
<!ELEMENT dateTimeFormatLength (alias | (default*, dateTimeFormat*, special*))>
@@ -662,7 +668,7 @@
* If a single date or relative date is being combined with a single time, by default use the atTime pattern (if available) to produce an event time: “March 15 at 3:00 PM” or “tomorrow at 3:00 PM”. However, at least in the case of combining a single date and time, APIs should also offer a “current time” option of using the standard combining pattern to produce a format more suitable for indicating the current time: “March 15, 3:00 PM”.
* For all other uses of these patterns, use the standard pattern.
-#### 2.6.2 <a name="availableFormats_appendItems" href="#availableFormats_appendItems">Elements availableFormats, appendItems</a>
+#### <a name="availableFormats_appendItems" href="#availableFormats_appendItems">Elements availableFormats, appendItems</a>
```xml
<!ELEMENT availableFormats (alias | (dateFormatItem*, special*))>
@@ -719,7 +725,7 @@
The dateFormatItems inherit from their parent locale, so the inherited items need to be considered when processing.
-##### 2.6.2.1 <a name="Matching_Skeletons" href="#Matching_Skeletons">Matching Skeletons</a>
+##### <a name="Matching_Skeletons" href="#Matching_Skeletons">Matching Skeletons</a>
It is not necessary to supply `dateFormatItem`s with skeletons for every field length; fields in the skeleton and pattern are expected to be adjusted in parallel to handle a request.
@@ -773,7 +779,7 @@
<dateFormatItem id="yMMM">y年M月</dateFormatItem>
```
-If this is the best match for a requested skeleton yMMMM, automatic expansion should not produce a corresponding pattern “y年MMMM月”; rather, since “y年M月” specifies a numeric month M, automatic expansion should not modify the pattern, and should produce “y年M月” as the match for requested skeleton yMMMM.
+If this is the best match for a requested skeleton yMMMM, automatic expansion should not produce a corresponding pattern “y年MMMM月”; rather, since “y年M月” specifies a numeric month M, automatic expansion should not modify the pattern, and should produce “y年M月” as the match for requested skeleton yMMMM.
---
@@ -789,7 +795,7 @@
Finally: If the requested skeleton included both seconds and fractional seconds and the dateFormatItem skeleton included seconds but not fractional seconds, then the seconds field of the corresponding pattern should be adjusted by appending the locale’s decimal separator, followed by the sequence of ‘S’ characters from the requested skeleton.
-##### 2.6.2.2 <a name="Missing_Skeleton_Fields" href="#Missing_Skeleton_Fields">Missing Skeleton Fields</a>
+##### <a name="Missing_Skeleton_Fields" href="#Missing_Skeleton_Fields">Missing Skeleton Fields</a>
If a client-requested set of fields includes both date and time fields, and if the `availableFormats` data does not include a `dateFormatItem` whose skeleton matches the same set of fields, then the request should be handled as follows:
@@ -809,7 +815,7 @@
In case the best match does not include all the requested calendar fields, the `appendItems` element describes how to append needed fields to one of the existing formats. Each `appendItem` element covers a single calendar field. In the pattern, {0} represents the format string, {1} the data content of the field, and {2} the display name of the field (see [Calendar Fields](#Calendar_Fields)).
-#### 2.6.3 <a name="intervalFormats" href="#intervalFormats">Element intervalFormats</a>
+#### <a name="intervalFormats" href="#intervalFormats">Element intervalFormats</a>
```xml
<!ELEMENT intervalFormats (alias | (intervalFormatFallback*, intervalFormatItem*, special*)) >
@@ -861,7 +867,7 @@
6. If there is a match, use the pieces of the corresponding pattern to format the start and end datetime, as above.
7. Otherwise, format the start and end datetime using the fallback pattern.
-## 3 <a name="Calendar_Fields" href="#Calendar_Fields">Calendar Fields</a>
+## <a name="Calendar_Fields" href="#Calendar_Fields">Calendar Fields</a>
```xml
<!ELEMENT fields ( alias | (field*, special*)) >
@@ -998,25 +1004,30 @@
As in other cases, **narrow** may be ambiguous out of context.
-## 4 <a name="Supplemental_Calendar_Data" href="#Supplemental_Calendar_Data">Supplemental Calendar Data</a>
+## <a name="Supplemental_Calendar_Data" href="#Supplemental_Calendar_Data">Supplemental Calendar Data</a>
-### 4.1 <a name="Calendar_Data" href="#Calendar_Data">Calendar Data</a>
+### <a name="Calendar_Data" href="#Calendar_Data">Calendar Data</a>
```xml
<!ELEMENT calendarData ( calendar* )>
-<!ELEMENT calendar ( calendarSystem?, eras? )>
+<!ELEMENT calendar ( calendarSystem?, inheritEras?, eras? )>
<!ATTLIST calendar type NMTOKENS #REQUIRED>
<!ATTLIST calendar territories NMTOKENS #IMPLIED > <!-- deprecated, replaced by calendarPreferenceData -->
<!ELEMENT calendarSystem EMPTY>
<!ATTLIST calendarSystem type (solar | lunar | lunisolar | other) #REQUIRED>
+<!ELEMENT inheritEras EMPTY >
+<!ATTLIST inheritEras calendar NMTOKEN #REQUIRED >
+
<!ELEMENT eras ( era* )>
<!ELEMENT era EMPTY>
<!ATTLIST era type NMTOKENS #REQUIRED>
<!ATTLIST era start CDATA #IMPLIED>
<!ATTLIST era end CDATA #IMPLIED>
+<!ATTLIST era code NMTOKEN #IMPLIED >
+<!ATTLIST era aliases NMTOKENS #IMPLIED >
```
The `<calendarData>` element now provides only locale-independent data about calendar behaviors via its `<calendar>` subelements, which for each calendar can specify the astronomical basis of the calendar (solar, lunar, etc.) and the date ranges for its eras.
@@ -1024,8 +1035,8 @@
Era start or end dates are specified in terms of the equivalent proleptic Gregorian date (in "y-M-d" format). Eras may be open-ended, with unspecified start or end dates. For example, here are the eras for the Gregorian calendar:
```xml
-<era type="0" end="0" />
-<era type="1" start="1" />
+<era type="0" end="0-12-31" code="gregory-inverse" aliases="bc bce"/>
+<era type="1" start="1-01-01" code="gregory" aliases="ad ce"/>
```
For a sequence of eras with specified start dates, the end of each era need not be explicitly specified (it is assumed to match the start of the subsequent era). For example, here are the first few eras for the Japanese calendar:
@@ -1037,9 +1048,30 @@
…
```
+Some eras have additional `code` and `aliases` attributes that define invariant strings for identifying the eras. The `code` is a single globally unique identifier, and `aliases` are space-separated identifiers unique within the calendar. The code and aliases obey the following rules:
+
+1. Every calendar has either an era with a `code` that is the same as the BCP-47 name of that calendar or an `inheritEras` element pointing to another calendar with such an era. This era should be used for anchoring the "extended year" in the calendar (`u` in the date format pattern).
+2. Eras that count backwards (larger numbers for older years) are suffixed with `-inverse`.
+3. If the same era code is used in multiple calendars, then the calculations for year, month, and day in that era must be the same in all calendars in which it is used. For example, the `ethioaa` era is used in two calendar systems.
+
+If a `<calendar>` contains an `<inheritEras/>` element, all eras from the specified calendar should be inserted in order into the sequence of eras for the current calendar and follow the same start and end date rules. For example:
+
+```xml
+<calendar type="japanese">
+ <inheritEras calendar="gregorian" />
+ <eras>
+ <era type="0" start="645-6-19"/>
+ <era type="1" start="650-2-15"/>
+ <!-- ... -->
+ </eras>
+</calendar>
+```
+
+This means that the two eras from calendar "gregorian" should be inserted into the era list for "japanese" for calculations and formatting.
+
**Note:** The `territories` attribute in the `calendar` element is deprecated. It was formerly used to indicate calendar preference by territory, but this is now given by the _[Calendar Preference Data](#Calendar_Preference_Data)_ below.
-### 4.2 <a name="Calendar_Preference_Data" href="#Calendar_Preference_Data">Calendar Preference Data</a>
+### <a name="Calendar_Preference_Data" href="#Calendar_Preference_Data">Calendar Preference Data</a>
```xml
<!ELEMENT calendarPreferenceData ( calendarPreference* ) >
@@ -1064,7 +1096,7 @@
The calendars in common use for a locale should typically be shown in UIs that provide a choice of calendars. (An 'Other...' button could give access to the other available calendars.)
-### 4.3 <a name="Week_Data" href="#Week_Data">Week Data</a>
+### <a name="Week_Data" href="#Week_Data">Week Data</a>
```xml
<!ELEMENT weekData ( minDays*, firstDay*, weekendStart*, weekendEnd*, weekOfPreference* )>
@@ -1112,9 +1144,11 @@
…
```
-In order for a week to count as the first week of a new year for week-of-year calculations, it must include at least the number of days in the new year specified by the minDays value; otherwise the week will count as the last week of the previous year (and for week-of-month calculations, `minDays` also specifies the minimum number of days in the new month for a week to count as part of that month).
+In order for a week to count as the first week of a new year for week-of-year calculations, the week beginning with `firstDay` must include at least the number of days in the new year specified by the `minDays` value; otherwise the week will count as the last week of the previous year (and for week-of-month calculations, `minDays` also specifies the minimum number of days in the new month for a week to count as part of that month).
-The day indicated by `firstDay` is the one that should be shown as the first day of the week in a calendar view. This is not necessarily the same as the first day after the weekend (or the first work day of the week), which should be determined from the weekend information. Currently, day-of-week numbering is based on `firstDay` (that is, day 1 is the day specified by `firstDay`), but in the future we may add a way to specify this separately.
+> **Note:** For week-of-year calculations, Gregorian years may have 52 or 53 weeks. Changes in the value of `minDays` or `firstDay` can affect the year to which a date is assigned as well as the number of weeks in a given year; implementations that parse dates using week-of-year formats should be prepared to handle such cases. For example, when parsing a date in week 53 of a year for which current values of `minDays` and `firstDay` no longer result in a 53-week year, that date should be treated as being in the first week of the following year.
+
+The day indicated by `firstDay` is the one that should be shown as the first day of the week in a calendar view. This is not necessarily the same as the first day after the weekend (or the first work day of the week), which should be determined from the weekend information. Currently, day-of-week numbering is based on `firstDay` (that is, day 1 is the day specified by `firstDay`), but in the future we may add a way to specify this separately. The `firstDay` value determined from the region can be overridden by the locale keyword "fw"; see [Unicode First Day Identifier](tr35.md#UnicodeFirstDayIdentifier).
What is meant by the weekend varies from country to country. It is typically when most non-retail businesses are closed. The time should not be specified unless it is a well-recognized part of the day. The `weekendStart` day defaults to "sat", and `weekendEnd` day defaults to "sun". For more information, see _[Dates and Date Ranges](tr35.md#Date_Ranges)_.
@@ -1129,7 +1163,31 @@
| weekOfDate | the week of April 11, 2016 | \<field type="week"\>\<relativePeriod>the week of {0}\<… | The date pattern that replaces {0} is determined separately and may use the first day or workday of the week, the range of the full week or work week, etc. |
| weekOfInterval | the week of April 11–15 | \<field type="week"\>\<relativePeriod>the week of {0}\<… | (same comment as above) |
-### 4.4 <a name="Time_Data" href="#Time_Data">Time Data</a>
+#### First Day Overrides
+
+The calculation of the first day of the week depends on various fields in a locale_identifier, according to the following algorithm. The data in the `firstDay` elements is treated as a map from region to day, with any missing value using the value for 001.
+
+1. If there is a valid `-u-fw-` day value, return that day.
+2. Else if there is a valid `-u-rg-` region value, return that region's firstDay map value.
+3. Else if there is a valid `-u-ca-` calendar value, where that calendar specifies the first day, then return that first day. (Most calendars do not specify the first day.)
+4. Else if there is an explicit region subtag, then return that region's firstDay map value.
+5. Else if there is a valid `-u-sd-` subdivision value, return the firstDay map value for that subdivision's region.
+6. Else if the [Add Likely Subtags](tr35.md#Likely_Subtags) algorithm produces a region, return that region's firstDay map value.
+7. Else return the firstDay map value for 001.
+
+*Example:*
+
+| Locale Identifier | "Winning" subtags | Region |
+|----|----|----|
+|en-AU-u-ca-iso8601-fw-tue-rg-afzzzz-sd-cabc | -fw-tue | n/a, uses Tuesday |
+|en-AU-u-ca-iso8601-rg-afzzzz-sd-cabc | -rg-afzzzz | AF |
+|en-AU-u-ca-iso8601-sd-cabc | -ca-iso8601 | n/a, uses Monday |
+|en-AU-u-sd-cabc | -AU | AU |
+|en-u-sd-cabc | -sd-cabc | CA |
+|en | | US (from likely subtags) |
+|zxx | 001 | (fallback) |
+
+### <a name="Time_Data" href="#Time_Data">Time Data</a>
```xml
<!ELEMENT timeData ( hours* ) >
@@ -1168,7 +1226,9 @@
Some systems may not want to use B and b, even if preferred for the locale, so for compatibility the `preferred` value is limited to {H, h, K, k}, and is the option selected by the ‘j’ date symbol. Thus the `preferred` value may not be the same as the first `allowed` value.
-### 4.5 <a name="Day_Period_Rule_Sets" href="#Day_Period_Rule_Sets">Day Period Rule Sets</a>
+The preferred value for the locale can be overridden by the locale keyword "hc", see [Unicode Hour Cycle Identifier](tr35.md#UnicodeHourCycleIdentifier).
+
+### <a name="Day_Period_Rule_Sets" href="#Day_Period_Rule_Sets">Day Period Rule Sets</a>
```xml
<!ELEMENT dayPeriodRuleSet ( dayPeriodRules* ) >
@@ -1211,11 +1271,11 @@
As with plurals, the exact set of periods used for any language may be different. It is the responsibility of any translation software to pick the relevant day periods for the locale for display to the translator (and end user).
-#### 4.5.1 <a name="Day_Period_Rules" href="#Day_Period_Rules">Day Period Rules</a>
+#### <a name="Day_Period_Rules" href="#Day_Period_Rules">Day Period Rules</a>
Here are the requirements for a rule set.
-##### 4.5.1.1 <a name="Fixed_periods" href="#Fixed_periods">Fixed periods</a>
+##### <a name="Fixed_periods" href="#Fixed_periods">Fixed periods</a>
There are 4 dayPeriods that are fixed; am/pm are always defined, and always have the same meaning and definition for every locale. Midnight and noon are optional, however if they are defined, they have the same meaning and definition as in all other locales where they are defined.
@@ -1234,7 +1294,7 @@
It is strongly recommended that implementations provide for the ability to specify whether **midnight** is supported or not (and for either 00:00 or 24:00 or both), since only the caller knows enough of the context to determine what to use. In the absence of such information, 24:00 may be the best choice.
-##### 4.5.1.2 <a name="Variable_periods" href="#Variable_periods">Variable periods</a>
+##### <a name="Variable_periods" href="#Variable_periods">Variable periods</a>
1. If a locale has a set of dayPeriodRules for variable periods, it needs to completely cover the 24 hours in a day (from 0:00 before 24:00), with **no** overlaps between any dayPeriodRules. They may overlap with the **Fixed Periods**.
If it does not have a rule set for variable periods, behavior should fall back to using the fixed periods (am, pm).
@@ -1253,7 +1313,7 @@
* `<dayPeriod type = "night1" from="21:00" to="24:00"/>`
9. 24:00 is _only_ allowed in _before_="24:00".
-##### 4.5.1.3 <a name="Parsing_Day_Periods" href="#Parsing_Day_Periods">Parsing Day Periods</a>
+##### <a name="Parsing_Day_Periods" href="#Parsing_Day_Periods">Parsing Day Periods</a>
When parsing, if the hour is present with a strict parse the dayperiod is checked for consistency with the hour. If there is no hour, the center of the first matching dayPeriodRule can be chosen (starting from 0:00). However, if there is other information available when parsing, a different point within the interval may be chosen.
@@ -1265,7 +1325,7 @@
For examples, see [Day Periods Chart](https://unicode-org.github.io/cldr-staging/charts/38/supplemental/day_periods.html).
-## 5 <a name="Time_Zone_Names" href="#Time_Zone_Names">Time Zone Names</a>
+## <a name="Time_Zone_Names" href="#Time_Zone_Names">Time Zone Names</a>
```xml
<!ELEMENT timeZoneNames (alias | (hourFormat*, gmtFormat*, gmtZeroFormat*, regionFormat*, fallbackFormat*, zone*, metazone*, special*)) >
@@ -1366,7 +1426,7 @@
The conversion from local time into UTC depends on the particular time zone rules, which will vary by location. The standard data used for converting local time (sometimes called _wall time_) to UTC and back is the _TZ Data_ [[Olson](tr35.md#Olson)], used by Linux, UNIX, Java, ICU, and others. The data includes rules for matching the laws for time changes in different countries. For example, for the US it is:
-> "During the period commencing at 2 o'clock antemeridian on the second Sunday of March of each year and ending at 2 o'clock antemeridian on the first Sunday of November of each year, the standard time of each zone established by sections 261 to 264 of this title, as modified by section 265 of this title, shall be advanced one hour..." (United States Law - 15 U.S.C. §6(IX)(260-7), as amended by Energy Policy Act of 2005).
+> "During the period commencing at 2 o'clock antemeridian on the second Sunday of March of each year and ending at 2 o'clock antemeridian on the first Sunday of November of each year, the standard time of each zone established by sections 261 to 264 of this title, as modified by section 265 of this title, shall be advanced one hour..." (United States Law - 15 U.S.C. §6(IX)(260-7), as amended by Energy Policy Act of 2005).
Each region that has a different time zone or daylight savings time rules, either now or at any time back to 1970, is given a unique internal ID, such as `Europe/Paris` . (Some IDs are also distinguished on the basis of differences before 1970.) As with currency codes, these are internal codes. A localized string associated with these is provided for users (such as in the Windows _Control Panels>Date/Time>Time Zone_).
@@ -1395,7 +1455,7 @@
> **Note:** User interfaces for time zone selection can use the "generic location format" for time zone names to obtain the most useful ordering of names in a menu or list; see _[Using Time Zone Names](#Using_Time_Zone_Names)_ and the zone section of the _[Date Field Symbol Table](#Date_Field_Symbol_Table)._
-### 5.1 <a name="Metazone_Names" href="#Metazone_Names">Metazone Names</a>
+### <a name="Metazone_Names" href="#Metazone_Names">Metazone Names</a>
A metazone is a grouping of one or more internal TZIDs that share a common display name in current customary usage, or that have shared a common display name during some particular time period. For example, the zones _Europe/Paris, Europe/Andorra, Europe/Tirane, Europe/Vienna, Europe/Sarajevo, Europe/Brussels, Europe/Zurich, Europe/Prague, Europe/Berlin_, and so on are often simply designated _Central European Time_ (or translated equivalent).
@@ -1464,9 +1524,9 @@
The `commonlyUsed` element is now deprecated. The CLDR committee has found it nearly impossible to obtain accurate and reliable data regarding which time zone abbreviations may be understood in a given territory, and therefore has changed to a simpler approach. Thus, if the short metazone form is available in a given locale, it is to be used for formatting regardless of the value of commonlyUsed. If a given short metazone form is known NOT to be understood in a given locale and the parent locale has this value such that it would normally be inherited, the inheritance of this value can be explicitly disabled by use of the 'no inheritance marker' as the value, which is 3 simultaneous empty set characters (U+2205).
-## 6 <a name="Supplemental_Time_Zone_Data" href="#Supplemental_Time_Zone_Data">Supplemental Time Zone Data</a>
+## <a name="Supplemental_Time_Zone_Data" href="#Supplemental_Time_Zone_Data">Supplemental Time Zone Data</a>
-### 6.1 <a name="Metazones" href="#Metazones">Metazones</a>
+### <a name="Metazones" href="#Metazones">Metazones</a>
```xml
<!ELEMENT metaZones (metazoneInfo?, mapTimezones?) >
@@ -1521,7 +1581,7 @@
....
```
-### 6.2 <a name="Windows_Zones" href="#Windows_Zones">Windows Zones</a>
+### <a name="Windows_Zones" href="#Windows_Zones">Windows Zones</a>
```xml
<!ELEMENT windowsZones (mapTimezones?) >
@@ -1551,7 +1611,7 @@
**Note:** Not all Unicode time zones have equivalent Windows TZID mappings. Also, not all Windows TZIDs have equivalent Unicode time zones. For example, there is no equivalent Windows zone for Unicode time zone "Australia/Lord_Howe", and there is no equivalent Unicode time zone for Windows zone "E. Europe Standard Time" (as of CLDR 25 release).
-### 6.3 <a name="Primary_Zones" href="#Primary_Zones">Primary Zones</a>
+### <a name="Primary_Zones" href="#Primary_Zones">Primary Zones</a>
```xml
<!ELEMENT primaryZones ( primaryZone* ) >
@@ -1571,11 +1631,11 @@
This information was previously specified by the LDML `<singleCountries>` element under each locale’s `<timeZoneNames>` element. However, that approach had inheritance issues, and the data is not really locale-specific anyway.
-## 7 <a name="Using_Time_Zone_Names" href="#Using_Time_Zone_Names">Using Time Zone Names</a>
+## <a name="Using_Time_Zone_Names" href="#Using_Time_Zone_Names">Using Time Zone Names</a>
There are three main types of formats for zone identifiers: GMT, generic (wall time), and standard/daylight. Standard and daylight are equivalent to a particular offset from GMT, and can be represented by a GMT offset as a fallback. In general, this is not true for the generic format, which is used for picking timezones or for conveying a timezone for specifying a recurring time (such as a meeting in a calendar). For either purpose, a GMT offset would lose information.
-### 7.1 <a name="Time_Zone_Format_Terminology" href="#Time_Zone_Format_Terminology">Time Zone Format Terminology</a>
+### <a name="Time_Zone_Format_Terminology" href="#Time_Zone_Format_Terminology">Time Zone Format Terminology</a>
The following terminology defines more precisely the formats that are used.
@@ -1695,7 +1755,7 @@
**regionFormat:** a formatting string such as "{0} Time", where {0} is the country or city.
-### 7.2 <a name="Time_Zone_Goals" href="#Time_Zone_Goals">Goals</a>
+### <a name="Time_Zone_Goals" href="#Time_Zone_Goals">Goals</a>
The timezones are designed so that:
@@ -1832,7 +1892,7 @@
**Note:** As with the _regionFormat_, exceptional cases need to be explicitly translated.
-### 7.3 <a name="Time_Zone_Parsing" href="#Time_Zone_Parsing">Parsing</a>
+### <a name="Time_Zone_Parsing" href="#Time_Zone_Parsing">Parsing</a>
In parsing, an implementation will be able to either determine the zone id, or a simple offset from GMT for anything formatting according to the above process.
@@ -1885,7 +1945,7 @@
* PST8PDT → America/Los_Angeles → “PST” → America/Los_Angeles
* America/Vancouver → “Pacific Time (Canada)” → America/Vancouver
-## 8 <a name="Date_Format_Patterns" href="#Date_Format_Patterns">Date Format Patterns</a>
+## <a name="Date_Format_Patterns" href="#Date_Format_Patterns">Date Format Patterns</a>
A date pattern is a character string consisting of two types of elements:
@@ -2074,7 +2134,7 @@
<td rowspan="3"><strong>AM, PM<br/></strong>May be upper or lowercase depending on the locale and other options.
The wide form may be the same as the short form if the “real” long form (eg <em>ante meridiem</em>) is not customarily used.
The narrow form must be unique, unlike some other fields.
- See also Section 9 <a href="#Parsing_Dates_Times">Parsing Dates and Times</a>.</td></tr>
+ See also <a href="#Parsing_Dates_Times">Parsing Dates and Times</a>.</td></tr>
<tr><td>aaaa</td><td>am. [e.g. 12 am.]</td><td>Wide</td></tr>
<tr><td>aaaaa</td><td>a [e.g. 12a]</td><td>Narrow</td></tr>
<!-- b -->
@@ -2170,7 +2230,7 @@
In CLDR 26 the time separator pattern character was specified to be COLON.
This was withdrawn in CLDR 28 due to backward compatibility issues, and no time separator pattern character is currently defined.</span><br/><br/>
Like the use of "," in number formats, this character in a date pattern is replaced with the corresponding number symbol which may depend on the numbering system.
- For more information, see <em><strong>Part 3: <a href="tr35-numbers.md#Contents">Numbers</a></strong>, Section 2.3 <a href="tr35-numbers.md#Number_Symbols">Number Symbols</a></em>.</td></tr>
+ For more information, see <em><strong>Part 3: <a href="tr35-numbers.md#Contents">Numbers</a></strong>, <a href="tr35-numbers.md#Number_Symbols">Number Symbols</a></em>.</td></tr>
<!-- == == == ZONE == == == -->
<tr><th rowspan="23"><a name="dfst-zone" id="dfst-zone" href="#dfst-zone">zone</a></th><td rowspan="2">z</td><td>z..zzz</td><td>PDT</td>
@@ -2246,27 +2306,27 @@
<i><b>Note</b>: The seconds field is not supported by the ISO8601 specification.</i></td></tr>
</tbody></table>
-### 8.1 <a name="Localized_Pattern_Characters" href="#Localized_Pattern_Characters">Localized Pattern Characters (deprecated)</a>
+### <a name="Localized_Pattern_Characters" href="#Localized_Pattern_Characters">Localized Pattern Characters (deprecated)</a>
These are characters that can be used when displaying a date pattern to an end user. This can occur, for example, when a spreadsheet allows users to specify date patterns. Whatever is in the string is substituted one-for-one with the characters "GyMdkHmsSEDFwWahKzYeugAZvcLQqVUOXxr", with the above meanings. Thus, for example, if 'J' is to be used instead of 'Y' to mean Year (for Week of Year), then the string would be: "GyMdkHmsSEDFwWahKzJeugAZvcLQqVUOXxr".
This element is deprecated. It is recommended instead that a more sophisticated UI be used for localization, such as using icons to represent the different formats (and lengths) in the [Date Field Symbol Table](#Date_Field_Symbol_Table).
-### 8.2 <a name="Date_Patterns_AM_PM" href="#Date_Patterns_AM_PM">AM / PM</a>
+### <a name="Date_Patterns_AM_PM" href="#Date_Patterns_AM_PM">AM / PM</a>
Even for countries where the customary date format only has a 24 hour format, both the am and pm localized strings must be present and must be distinct from one another. Note that as long as the 24 hour format is used, these strings will normally never be used, but for testing and unusual circumstances they must be present.
-### 8.3 <a name="Date_Patterns_Eras" href="#Date_Patterns_Eras">Eras</a>
+### <a name="Date_Patterns_Eras" href="#Date_Patterns_Eras">Eras</a>
There are only two values for era in the Gregorian calendar, with two common naming conventions (here in abbreviated form for English): "BC" and "AD", or "BCE" and "CE". These values can be translated into other languages, like "a.C." and "d.C." for Spanish, but there are no other eras in the Gregorian calendar. Other calendars have different numbers of eras. Care should be taken when translating the era names for a specific calendar.
-### 8.4 <a name="Date_Patterns_Week_Of_Year" href="#Date_Patterns_Week_Of_Year">Week of Year</a>
+### <a name="Date_Patterns_Week_Of_Year" href="#Date_Patterns_Week_Of_Year">Week of Year</a>
Values calculated for the Week of Year field range from 1 to 53 for the Gregorian calendar (they may have different ranges for other calendars). Week 1 for a year is the first week that contains at least the specified minimum number of days from that year. Weeks between week 1 of one year and week 1 of the following year are numbered sequentially from 2 to 52 or 53 (if needed). For example, January 1, 1998 was a Thursday. If the first day of the week is MONDAY and the minimum days in a week is 4 (these are the values reflecting ISO 8601 and many national standards), then week 1 of 1998 starts on December 29, 1997, and ends on January 4, 1998. However, if the first day of the week is SUNDAY, then week 1 of 1998 starts on January 4, 1998, and ends on January 10, 1998. The first three days of 1998 are then part of week 53 of 1997.
Values are similarly calculated for the Week of Month.
-### 8.5 <a name="Date_Patterns_Week_Elements" href="#Date_Patterns_Week_Elements">Week Elements</a>
+### <a name="Date_Patterns_Week_Elements" href="#Date_Patterns_Week_Elements">Week Elements</a>
**firstDay**
@@ -2280,7 +2340,7 @@
Indicates the day and time that the weekend starts or ends. As with firstDay, keywords are used instead of numbers.
-## 9 <a name="Parsing_Dates_Times" href="#Parsing_Dates_Times">Parsing Dates and Times</a>
+## <a name="Parsing_Dates_Times" href="#Parsing_Dates_Times">Parsing Dates and Times</a>
For general information on lenient parsing, see [Lenient Parsing](tr35.md#Lenient_Parsing) in the core specification. This section provides additional information specific to parsing of dates and times.
@@ -2303,6 +2363,6 @@
* * *
-Copyright © 2001–2022 Unicode, Inc. All Rights Reserved. The Unicode Consortium makes no expressed or implied warranty of any kind, and assumes no liability for errors or omissions. No liability is assumed for incidental and consequential damages in connection with or arising out of the use of the information or programs contained or accompanying this technical report. The Unicode [Terms of Use](https://www.unicode.org/copyright.html) apply.
+Copyright © 2001–2023 Unicode, Inc. All Rights Reserved. The Unicode Consortium makes no expressed or implied warranty of any kind, and assumes no liability for errors or omissions. No liability is assumed for incidental and consequential damages in connection with or arising out of the use of the information or programs contained or accompanying this technical report. The Unicode [Terms of Use](https://www.unicode.org/copyright.html) apply.
Unicode and the Unicode logo are trademarks of Unicode, Inc., and are registered in some jurisdictions.
diff --git a/docs/ldml/tr35-general.anchors.json b/docs/ldml/tr35-general.anchors.json
new file mode 100644
index 0000000..2ee16df
--- /dev/null
+++ b/docs/ldml/tr35-general.anchors.json
@@ -0,0 +1,156 @@
+[
+ "Annotations",
+ "annotations-and-labels",
+ "annotations-character-labels",
+ "case",
+ "Case",
+ "case_compound_units",
+ "Character_Elements",
+ "Character_Labels",
+ "Character_Mapping",
+ "Character_More_Info",
+ "Character_Parse_Lenient",
+ "character-elements",
+ "Choice_Patterns",
+ "choice-patterns",
+ "compound-units",
+ "compoundUnitPattern",
+ "Contents",
+ "contents-of-part-2-general",
+ "context",
+ "Context",
+ "Context_Transform_Elements",
+ "contexttransform-elements",
+ "contextTransformUsage_type_attribute_values",
+ "Conversion_Rules",
+ "conversion-rules",
+ "coordinateunit",
+ "coordinateUnit",
+ "definiteness",
+ "Delimiter_Elements",
+ "delimiter-elements",
+ "deriving-the-case-of-unit-components",
+ "deriving-the-gender-of-compound-units",
+ "deriving-the-plural-category-of-unit-components",
+ "Display_Name_Elements",
+ "display-name-elements",
+ "Dual_Rules",
+ "dual-rules",
+ "durationunit",
+ "durationUnit",
+ "ellipsis",
+ "Ellipsis",
+ "example",
+ "Example",
+ "Example_Units",
+ "example-1",
+ "example-2",
+ "example-units",
+ "exemplar-syntax",
+ "exemplars",
+ "Exemplars",
+ "ExemplarSyntax",
+ "features",
+ "Filter_Rules",
+ "filter-rules",
+ "gender",
+ "Gender",
+ "gender_compound_units",
+ "gender-of-lists",
+ "Grammatical_Derivations",
+ "Grammatical_Features",
+ "grammatical-derivations",
+ "grammatical-features",
+ "index-labels",
+ "IndexLabels",
+ "inheritance",
+ "Inheritance",
+ "Intermixing_Transform_Rules_and_Conversion_Rules",
+ "intermixing-transform-rules-and-conversion-rules",
+ "Inverse_Summary",
+ "inverse-summary",
+ "Layout_Elements",
+ "layout-elements",
+ "List_Gender",
+ "list-patterns",
+ "ListPatterns",
+ "locale_display_name_algorithm",
+ "locale_display_name_fields",
+ "locale-display-name-algorithm",
+ "locale-display-name-fields",
+ "mapping",
+ "Measurement_Elements",
+ "Measurement_System_Data",
+ "measurement-elements-deprecated",
+ "measurement-system-data",
+ "more-information",
+ "nomenclature",
+ "parse-lenient",
+ "parts",
+ "Parts",
+ "perUnitPatterns",
+ "pivots",
+ "Pivots",
+ "plural_compound_units",
+ "POSIX_Elements",
+ "posix-elements",
+ "precomposed-compound-units",
+ "Private_Use_Units",
+ "private-use-units",
+ "Reference_Elements",
+ "reference-element",
+ "restrictions",
+ "Restrictions",
+ "revisiting",
+ "Revisiting",
+ "Rule_Syntax",
+ "rule-syntax",
+ "Segmentation_Exceptions",
+ "Segmentation_Inheritance",
+ "segmentation-inheritance",
+ "segmentation-suppressions",
+ "segmentations",
+ "Segmentations",
+ "status",
+ "summary",
+ "syntax",
+ "synthesizing-sequence-names",
+ "SynthesizingNames",
+ "table-case",
+ "table-characterlabel",
+ "table-characterlabelpattern",
+ "table-element-contexttransformusage-type-attribute-values",
+ "table-synthesized-emoji-sequence-names",
+ "table-values",
+ "table-values-1",
+ "table-values-2",
+ "Tailor_Linebreak_With_Delimiters",
+ "tailoring-linebreak-using-delimiters",
+ "Territory_Based_Unit_Preferences",
+ "territory-based-unit-preferences",
+ "Transform_Rules",
+ "Transform_Rules_Syntax",
+ "transform-rules",
+ "transform-rules-syntax",
+ "transform-syntax-characters",
+ "transforms",
+ "Transforms",
+ "Typographic_Names",
+ "typographic-names",
+ "unicode-locale-data-markup-language-ldmlpart-2-general",
+ "unicode-technical-standard-35",
+ "Unit_Elements",
+ "Unit_Identifier_Uniqueness",
+ "Unit_Identifiers",
+ "Unit_Preference_and_Conversion",
+ "Unit_Sequences",
+ "unit-elements",
+ "unit-identifier-uniqueness",
+ "unit-identifiers",
+ "unit-preference-and-conversion-data",
+ "unit-sequences-mixed-units",
+ "Variable_Definition_Rules",
+ "variable-definition-rules",
+ "variants",
+ "Variants"
+]
\ No newline at end of file
diff --git a/docs/ldml/tr35-general.md b/docs/ldml/tr35-general.md
index c718bac..3c1c839 100644
--- a/docs/ldml/tr35-general.md
+++ b/docs/ldml/tr35-general.md
@@ -2,7 +2,7 @@
# Unicode Locale Data Markup Language (LDML)<br/>Part 2: General
-|Version|42 |
+|Version|44.1 |
|-------|---------------------|
|Editors|Yoshito Umaoka (<a href="mailto:[email protected]">[email protected]</a>) and <a href="tr35.md#Acknowledgments">other CLDR committee members|
@@ -21,7 +21,12 @@
### _Status_
-_This document has been reviewed by Unicode members and other interested parties, and has been approved for publication by the Unicode Consortium. This is a stable document and may be used as reference material or cited as a normative reference by other specifications._
+<!-- _This is a draft document which may be updated, replaced, or superseded by other documents at any time.
+Publication does not imply endorsement by the Unicode Consortium.
+This is not a stable document; it is inappropriate to cite this document as other than a work in progress._ -->
+
+_This document has been reviewed by Unicode members and other interested parties, and has been approved for publication by the Unicode Consortium.
+This is a stable document and may be used as reference material or cited as a normative reference by other specifications._
> _**A Unicode Technical Standard (UTS)** is an independent specification. Conformance to the Unicode Standard does not imply conformance to any UTS._
@@ -42,92 +47,94 @@
## <a name="Contents" href="#Contents">Contents of Part 2, General</a>
-* 1 [Display Name Elements](#Display_Name_Elements)
- * 1.1 [Locale Display Name Algorithm](#locale_display_name_algorithm)
- * 1.2 [Locale Display Name Fields](#locale_display_name_fields)
-* 2 [Layout Elements](#Layout_Elements)
-* 3 [Character Elements](#Character_Elements)
- * 3.1 [Exemplars](#Exemplars)
- * 3.1.1 [Exemplar Syntax](#ExemplarSyntax)
- * 3.1.2 [Restrictions](#Restrictions)
- * 3.2 ~~[Mapping](#Character_Mapping)~~
- * 3.3 ~~[Index Labels](#IndexLabels)~~
- * 3.4 [Ellipsis](#Ellipsis)
- * 3.5 [More Information](#Character_More_Info)
- * 3.6 [Parse Lenient](#Character_Parse_Lenient)
-* 4 [Delimiter Elements](#Delimiter_Elements)
- * 4.1 [Tailoring Linebreak Using Delimiters](#Tailor_Linebreak_With_Delimiters)
-* 5 [Measurement System Data](#Measurement_System_Data)
- * 5.1 [Measurement Elements (deprecated)](#Measurement_Elements)
-* 6 [Unit Elements](#Unit_Elements)
- * 6.1 [Unit Preference and Conversion Data](#Unit_Preference_and_Conversion)
- * 6.2 [Unit Identifiers](#Unit_Identifiers)
+* [Display Name Elements](#Display_Name_Elements)
+ * [Locale Display Name Algorithm](#locale_display_name_algorithm)
+ * [Locale Display Name Fields](#locale_display_name_fields)
+* [Layout Elements](#Layout_Elements)
+* [Character Elements](#Character_Elements)
+ * [Exemplars](#Exemplars)
+ * [Exemplar Syntax](#ExemplarSyntax)
+ * [Restrictions](#Restrictions)
+ * ~~[Mapping](#Character_Mapping)~~
+ * ~~[Index Labels](#IndexLabels)~~
+ * [Ellipsis](#Ellipsis)
+ * [More Information](#Character_More_Info)
+ * [Parse Lenient](#Character_Parse_Lenient)
+* [Delimiter Elements](#Delimiter_Elements)
+ * [Tailoring Linebreak Using Delimiters](#Tailor_Linebreak_With_Delimiters)
+* [Measurement System Data](#Measurement_System_Data)
+ * [Measurement Elements (deprecated)](#Measurement_Elements)
+* [Unit Elements](#Unit_Elements)
+ * [Unit Preference and Conversion Data](#Unit_Preference_and_Conversion)
+ * [Unit Identifiers](#Unit_Identifiers)
* [Nomenclature](#nomenclature)
* [Syntax](#syntax)
- * 6.3 [Example Units](#Example_Units)
- * 6.4 [Compound Units](#compound-units)
+ * [Unit Identifier Uniqueness](#Unit_Identifier_Uniqueness)
+ * [Example Units](#Example_Units)
+ * [Compound Units](#compound-units)
* [Precomposed Compound Units](#precomposed-compound-units)
- * 6.5 [Unit Sequences (Mixed Units)](#Unit_Sequences)
- * 6.6 [durationUnit](#durationUnit)
- * 6.7 [coordinateUnit](#coordinateUnit)
- * 6.8 [Territory-Based Unit Preferences](#Territory_Based_Unit_Preferences)
- * 6.9 [Private-Use Units](#Private_Use_Units)
-* 7 [POSIX Elements](#POSIX_Elements)
-* 8 [Reference Element](#Reference_Elements)
-* 9 [Segmentations](#Segmentations)
- * 9.1 [Segmentation Inheritance](#Segmentation_Inheritance)
- * 9.2 [Segmentation Suppressions](#Segmentation_Exceptions)
-* 10 [Transforms](#Transforms)
- * 10.1 [Inheritance](#Inheritance)
- * 10.1.1 [Pivots](#Pivots)
- * 10.2 [Variants](#Variants)
- * 10.3 [Transform Rules Syntax](#Transform_Rules_Syntax)
- * 10.3.1 [Dual Rules](#Dual_Rules)
- * 10.3.2 [Context](#Context)
- * 10.3.3 [Revisiting](#Revisiting)
- * 10.3.4 [Example](#Example)
- * 10.3.5 [Rule Syntax](#Rule_Syntax)
- * 10.3.6 [Transform Rules](#Transform_Rules)
- * 10.3.7 [Variable Definition Rules](#Variable_Definition_Rules)
- * 10.3.8 [Filter Rules](#Filter_Rules)
- * 10.3.9 [Conversion Rules](#Conversion_Rules)
- * 10.3.10 [Intermixing Transform Rules and Conversion Rules](#Intermixing_Transform_Rules_and_Conversion_Rules)
- * 10.3.11 [Inverse Summary](#Inverse_Summary)
-* 11 [List Patterns](#ListPatterns)
- * 11.1 [Gender of Lists](#List_Gender)
-* 12 [ContextTransform Elements](#Context_Transform_Elements)
+ * [Unit Sequences (Mixed Units)](#Unit_Sequences)
+ * [durationUnit](#durationUnit)
+ * [coordinateUnit](#coordinateUnit)
+ * [Territory-Based Unit Preferences](#Territory_Based_Unit_Preferences)
+ * [Private-Use Units](#Private_Use_Units)
+* [POSIX Elements](#POSIX_Elements)
+* [Reference Element](#Reference_Elements)
+* [Segmentations](#Segmentations)
+ * [Segmentation Inheritance](#Segmentation_Inheritance)
+ * [Segmentation Suppressions](#Segmentation_Exceptions)
+* [Transforms](#Transforms)
+ * [Inheritance](#Inheritance)
+ * [Pivots](#Pivots)
+ * [Variants](#Variants)
+ * [Transform Rules Syntax](#Transform_Rules_Syntax)
+ * [Dual Rules](#Dual_Rules)
+ * [Context](#Context)
+ * [Revisiting](#Revisiting)
+ * [Example](#Example)
+ * [Rule Syntax](#Rule_Syntax)
+ * [Transform Rules](#Transform_Rules)
+ * [Variable Definition Rules](#Variable_Definition_Rules)
+ * [Filter Rules](#Filter_Rules)
+ * [Conversion Rules](#Conversion_Rules)
+ * [Intermixing Transform Rules and Conversion Rules](#Intermixing_Transform_Rules_and_Conversion_Rules)
+ * [Inverse Summary](#Inverse_Summary)
+ * [Transform Syntax Characters](#transform-syntax-characters)
+* [List Patterns](#ListPatterns)
+ * [Gender of Lists](#List_Gender)
+* [ContextTransform Elements](#Context_Transform_Elements)
* Table: [Element contextTransformUsage type attribute values](#contextTransformUsage_type_attribute_values)
-* 13 [Choice Patterns](#Choice_Patterns)
-* 14 [Annotations and Labels](#Annotations)
- * 14.1 [Synthesizing Sequence Names](#SynthesizingNames)
+* [Choice Patterns](#Choice_Patterns)
+* [Annotations and Labels](#Annotations)
+ * [Synthesizing Sequence Names](#SynthesizingNames)
* [Table: Synthesized Emoji Sequence Names](#table-synthesized-emoji-sequence-names)
- * 14.2 [Annotations Character Labels](#Character_Labels)
+ * [Annotations Character Labels](#Character_Labels)
* [Table: characterLabelPattern](#table-characterlabelpattern)
* [Table: characterLabel](#table-characterlabel)
- * 14.3 [Typographic Names](#Typographic_Names)
-* 15 [Grammatical Features](#Grammatical_Features)
+ * [Typographic Names](#Typographic_Names)
+* [Grammatical Features](#Grammatical_Features)
* [Features](#features)
- * 15.1 [Gender](#Gender)
+ * [Gender](#Gender)
* [Example](#example)
* [Table: Values](#table-values)
- * 15.2 [Case](#Case)
+ * [Case](#Case)
* [Table: Case](#table-case)
* [Example](#example)
* [Table: Values](#table-values)
* [Definiteness](#definiteness)
* [Table: Values](#table-values)
-* 16 [Grammatical Derivations](#Grammatical_Derivations)
- * 16.1 [Deriving the Gender of Compound Units](#gender_compound_units)
- * 16.2 [Deriving the Plural Category of Unit Components](#plural_compound_units)
- * 16.3 [Deriving the Case of Unit Components](#case_compound_units)
+* [Grammatical Derivations](#Grammatical_Derivations)
+ * [Deriving the Gender of Compound Units](#gender_compound_units)
+ * [Deriving the Plural Category of Unit Components](#plural_compound_units)
+ * [Deriving the Case of Unit Components](#case_compound_units)
-## 1 <a name="Display_Name_Elements" href="#Display_Name_Elements">Display Name Elements</a>
+## <a name="Display_Name_Elements" href="#Display_Name_Elements">Display Name Elements</a>
```xml
<!ELEMENT localeDisplayNames ( alias | ( localeDisplayPattern?, languages?, scripts?, territories?, subdivisions?, variants?, keys?, types?, transformNames?, measurementSystemNames?, codePatterns?, special* ) )>
```
-Display names for scripts, languages, countries, currencies, and variants in this locale are supplied by this element. They supply localized names for these items for use in user-interfaces for various purposes such as displaying menu lists, displaying a language name in a dialog, and so on. Capitalization should follow the conventions used in the middle of running text; the `<contextTransforms>` element may be used to specify the appropriate capitalization for other contexts (see _Section 12 [ContextTransform Elements](#Context_Transform_Elements)_). Examples are given below.
+Display names for scripts, languages, countries, currencies, and variants in this locale are supplied by this element. They supply localized names for these items for use in user-interfaces for various purposes such as displaying menu lists, displaying a language name in a dialog, and so on. Capitalization should follow the conventions used in the middle of running text; the `<contextTransforms>` element may be used to specify the appropriate capitalization for other contexts (see _[ContextTransform Elements](#Context_Transform_Elements)_). Examples are given below.
> **Note:** The "en" locale may contain translated names for deprecated codes for debugging purposes. Translation of deprecated codes into other languages is discouraged.
@@ -156,9 +163,9 @@
<type type="pinyin" key="collation">Pinyin Sort Order</type>
```
-### 1.1 <a name="locale_display_name_algorithm" href="#locale_display_name_algorithm">Locale Display Name Algorithm</a>
+### <a name="locale_display_name_algorithm" href="#locale_display_name_algorithm">Locale Display Name Algorithm</a>
-A locale display name LDN is generated for a locale identifier L in the following way. First, convert the locale identifier to *canonical syntax* per **[Part 1, Section 3.2.1 Canonical Unicode Locale Identifiers](tr35.md#Canonical_Unicode_Locale_Identifiers)**. That will put the subtags in a defined order, and replace aliases by their canonical counterparts. (That defined order is followed in the processing below.)
+A locale display name LDN is generated for a locale identifier L in the following way. First, convert the locale identifier to *canonical syntax* per **[Part 1, Canonical Unicode Locale Identifiers](tr35.md#Canonical_Unicode_Locale_Identifiers)**. That will put the subtags in a defined order, and replace aliases by their canonical counterparts. (That defined order is followed in the processing below.)
Then follow each of the following steps for the subtags in L, building a base name LDN and a list of qualifying strings LQS.
@@ -184,7 +191,7 @@
* `<language type="az">Azerbaijani</language>`
* **`<language type="az" alt="short">Azeri</language>`**
-In addition, the input locale display name could be minimized (see [Part 1: Section 4.3 Likely Subtags](tr35.md#Likely_Subtags)) before generating the LDN. Selective minimization is often the best choice. For example, in a menu list it is often clearer to show the region if there are any regional variants. Thus the user would just see \["Spanish"\] for es if the latter is the only supported Spanish, but where es-MX is also listed, then see \["Spanish (Spain)", "Spanish (Mexico)"\].
+In addition, the input locale display name could be minimized (see [Part 1: Likely Subtags](tr35.md#Likely_Subtags)) before generating the LDN. Selective minimization is often the best choice. For example, in a menu list it is often clearer to show the region if there are any regional variants. Thus the user would just see \["Spanish"\] for es if the latter is the only supported Spanish, but where es-MX is also listed, then see \["Spanish (Spain)", "Spanish (Mexico)"\].
* * *
@@ -192,7 +199,7 @@
When the display name contains "(" or ")" characters (or full-width equivalents), replace them by "\[", "\]" (or full-width equivalents) before adding.
-1. **Language.** Match the L subtags against the type values in the `<language>` elements. Pick the element with the most subtags matching. If there is more than one such element, pick the one that has subtypes matching earlier. If there are two such elements, pick the one that is alphabetically less. If there is no match, then further convert L to *canonical form* per **[Part 1, Section 3.2.1 Canonical Unicode Locale Identifiers](tr35.md#Canonical_Unicode_Locale_Identifiers)** and try the preceding steps again. Set LBN to the selected value. Disregard any of the matching subtags in the following processing.
+1. **Language.** Match the L subtags against the type values in the `<language>` elements. Pick the element with the most subtags matching. If there is more than one such element, pick the one that has subtypes matching earlier. If there are two such elements, pick the one that is alphabetically less. If there is no match, then further convert L to *canonical form* per **[Part 1, Canonical Unicode Locale Identifiers](tr35.md#Canonical_Unicode_Locale_Identifiers)** and try the preceding steps again. Set LBN to the selected value. Disregard any of the matching subtags in the following processing.
* If CombineLanguage is false, only choose matches with the language subtag matching.
2. **Script, Region, Variants.** Where any of these subtags are in L, append the matching element value to LQS.
3. **T extensions.** Get the value of the `key="h0" type="hybrid"` element, if there is one; otherwise the value of the `<key type="t">` element. Next get the locale display name of the tlang. Join the pair using `<localePattern>` and append to the LQS. Then format and add display names to LQS for any of the remaining tkey-tvalue pairs as described below.
@@ -228,13 +235,13 @@
-### 1.2 <a name="locale_display_name_fields" href="#locale_display_name_fields">Locale Display Name Fields</a>
+### <a name="locale_display_name_fields" href="#locale_display_name_fields">Locale Display Name Fields</a>
```xml
<languages>
```
-This contains a list of elements that provide the user-translated names for language codes, as described in _[Section 3, Unicode Language and Locale Identifiers](tr35.md#Unicode_Language_and_Locale_Identifiers)_.
+This contains a list of elements that provide the user-translated names for language codes, as described in _[Unicode Language and Locale Identifiers](tr35.md#Unicode_Language_and_Locale_Identifiers)_.
```xml
<language type="ab">Abkhazian</language>
@@ -272,7 +279,7 @@
<scripts>
```
-This element can contain a number of `script` elements. Each `script` element provides the localized name for a script code, as described in _[Section 3, Unicode Language and Locale Identifiers](tr35.md#Unicode_Language_and_Locale_Identifiers)_ (see also _UAX #24: Script Names_ [[UAX24](https://www.unicode.org/reports/tr41/#UAX24)]). For example, in the language of this locale, the name for the Latin script might be "Romana", and for the Cyrillic script is "Kyrillica". That would be expressed with the following.
+This element can contain a number of `script` elements. Each `script` element provides the localized name for a script code, as described in _[Unicode Language and Locale Identifiers](tr35.md#Unicode_Language_and_Locale_Identifiers)_ (see also _UAX #24: Script Names_ [[UAX24](https://www.unicode.org/reports/tr41/#UAX24)]). For example, in the language of this locale, the name for the Latin script might be "Romana", and for the Cyrillic script is "Kyrillica". That would be expressed with the following.
```xml
<script type="Latn">Romana</script>
@@ -299,7 +306,7 @@
<territories>
```
-This contains a list of elements that provide the user-translated names for territory codes, as described in _[Section 3, Unicode Language and Locale Identifiers](tr35.md#Unicode_Language_and_Locale_Identifiers)_.
+This contains a list of elements that provide the user-translated names for territory codes, as described in _[Unicode Language and Locale Identifiers](tr35.md#Unicode_Language_and_Locale_Identifiers)_.
```xml
<territory type="AD">Andorra</territory>
@@ -317,7 +324,7 @@
* Territory names may not match the official name of the territory, and the English or French names may not match those in ISO 3166. Reasons for this include:
* CLDR favors customary names in common parlance, not necessarily the official names.
* CLDR endeavors to provide names that are not too long, in order to avoid problems with truncation or overflow in user interfaces.
-* In general the territory names should also match those used in currency names, see **Part 3** _Section 4 [Currencies](tr35-numbers.md#Currencies)_.
+* In general the territory names should also match those used in currency names, see **Part 3** _[Currencies](tr35-numbers.md#Currencies)_.
* * *
@@ -325,7 +332,7 @@
<variants>
```
-This contains a list of elements that provide the user-translated names for the _variant_code_ values described in _[Section 3, Unicode Language and Locale Identifiers](tr35.md#Unicode_Language_and_Locale_Identifiers)_.
+This contains a list of elements that provide the user-translated names for the _variant_code_ values described in _[Unicode Language and Locale Identifiers](tr35.md#Unicode_Language_and_Locale_Identifiers)_.
```xml
<variant type="nynorsk">Nynorsk</variant>
@@ -337,7 +344,7 @@
<keys>
```
-This contains a list of elements that provide the user-translated names for the _key_ values described in _[Section 3, Unicode Language and Locale Identifiers](tr35.md#Unicode_Language_and_Locale_Identifiers)_.
+This contains a list of elements that provide the user-translated names for the _key_ values described in _[Unicode Language and Locale Identifiers](tr35.md#Unicode_Language_and_Locale_Identifiers)_.
```xml
<key type="collation">Sortierung</key>
@@ -355,7 +362,7 @@
<types>
```
-This contains a list of elements that provide the user-translated names for the _type_ values described in _[Section 3, Unicode Language and Locale Identifiers](tr35.md#Unicode_Language_and_Locale_Identifiers)_. Since the translation of an option name may depend on the _key_ it is used with, the latter is optionally supplied.
+This contains a list of elements that provide the user-translated names for the _type_ values described in _[Unicode Language and Locale Identifiers](tr35.md#Unicode_Language_and_Locale_Identifiers)_. Since the translation of an option name may depend on the _key_ it is used with, the latter is optionally supplied.
```xml
<type type="phonebook" key="collation">Telefonbuch</type>
@@ -419,9 +426,9 @@
<subdivision type="AL-MK">Mallakastër</subdivision> <!-- in AL-04 : Fier County -->
```
-See also **Part 6** _Section 2.1.1 [Subdivision Containment](tr35-info.md#Subdivision_Containment)_.
+See also **Part 6** _[Subdivision Containment](tr35-info.md#Subdivision_Containment)_.
-## 2 <a name="Layout_Elements" href="#Layout_Elements">Layout Elements</a>
+## <a name="Layout_Elements" href="#Layout_Elements">Layout Elements</a>
```xml
<!ELEMENT layout ( alias | (orientation*, inList*, inText*, special*) ) >
@@ -456,7 +463,7 @@
<inList> (deprecated)
```
-The `<inList>` element is deprecated and has been superseded by the `<contextTransforms>` element; see _Section 12 [ContextTransform Elements](#Context_Transform_Elements)_.
+The `<inList>` element is deprecated and has been superseded by the `<contextTransforms>` element; see _[ContextTransform Elements](#Context_Transform_Elements)_.
This element controls whether display names (language, territory, etc) are title cased in GUI menu lists and the like. It is only used in languages where the normal display is lower case, but title case is used in lists. There are two options:
@@ -474,7 +481,7 @@
<inText> (deprecated)
```
-The `<inList>` element is deprecated and has been superseded by the `<contextTransforms>` element; see _Section 12 [ContextTransform Elements](#Context_Transform_Elements)_.
+The `<inText>` element is deprecated and has been superseded by the `<contextTransforms>` element; see _[ContextTransform Elements](#Context_Transform_Elements)_.
This element indicates the casing of the data in the category identified by the `inText` `type` attribute, when that data is written in text or how it would appear in a dictionary. For example:
@@ -489,7 +496,7 @@
* lowercase-words : all words in the phrase should be lower case
* mixed : a mixture of upper and lower case is permitted, generally used when the correct value is unknown
-## 3 <a name="Character_Elements" href="#Character_Elements">Character Elements</a>
+## <a name="Character_Elements" href="#Character_Elements">Character Elements</a>
```xml
<!ELEMENT characters ( alias | ( exemplarCharacters*, ellipsis*, moreInformation*, stopwords*, indexLabels*, mapping*, parseLenients*, special* ) ) >
@@ -497,7 +504,7 @@
The `<characters>` element provides optional information about characters that are in common use in the locale, and information that can be helpful in picking resources or data appropriate for the locale, such as when choosing among character encodings that are typically used to transmit data in the language of the locale. It may also be used to help reduce confusability issues: see [[UTR39](https://www.unicode.org/reports/tr41/#UTR36)]. It typically only occurs in a language locale, not in a language/territory locale. The stopwords are an experimental feature, and should not be used.
-### 3.1 <a name="Exemplars" href="#Exemplars">Exemplars</a>
+### <a name="Exemplars" href="#Exemplars">Exemplars</a>
Exemplars are characters used by a language, separated into different categories. The following table provides a summary, with more details below.
@@ -534,7 +541,7 @@
> © ® ™ @ & ° ‧ ·/ # % ¶ § * † ‡
> + − ± × ÷ < ≤ = ≅ ≥ > √
-The numbers exemplars do not currently include lesser-used characters: exponential notation (3.1 × 10²³, ∞, NaN). Nor does it contain the units or currency symbols such as $, ¥, ₹, … It does contain %, because that occurs in the percent format. It may contain some special formatting characters like the RLM. A full list of the currency symbols used with that locale are in the `<currencies>` element, while the units can be gotten from the `<units>` element (both using inheritance, of course).The digits used in each numbering system are accessed in numberingSystems.xml. For more information, see _**Part 3: [Numbers](tr35-numbers.md#Contents)**, Section 2 [Number Elements](tr35-numbers.md#Number_Elements)_.
+The numbers exemplars do not currently include lesser-used characters: exponential notation (3.1 × 10²³, ∞, NaN). Nor does it contain the units or currency symbols such as $, ¥, ₹, … It does contain %, because that occurs in the percent format. It may contain some special formatting characters like the RLM. A full list of the currency symbols used with that locale is in the `<currencies>` element, while the units can be gotten from the `<units>` element (both using inheritance, of course). The digits used in each numbering system are accessed in numberingSystems.xml. For more information, see _**Part 3: [Numbers](tr35-numbers.md#Contents)**, [Number Elements](tr35-numbers.md#Number_Elements)_.
_Examples for zh.xml:_
@@ -557,7 +564,7 @@
The display of the index characters can be modified with the `indexLabel`s elements, discussed in Section 3.3.
-#### 3.1.1 <a name="ExemplarSyntax" href="#ExemplarSyntax">Exemplar Syntax</a>
+#### <a name="ExemplarSyntax" href="#ExemplarSyntax">Exemplar Syntax</a>
In all of the exemplar characters, the list of characters is in the [Unicode Set](tr35.md#Unicode_Sets) format, which normally allows boolean combinations of sets of letters and Unicode properties.
@@ -573,18 +580,18 @@
The ordering of the characters in the set is irrelevant, but for readability in the XML file the characters should be in sorted order according to the locale's conventions. The main and auxiliary sets should only contain lower case characters (except for the special case of Turkish and similar languages, where the dotted capital I should be included); the upper case letters are to be mechanically added when the set is used. For more information on casing, see the discussion of Special Casing in the Unicode Character Database.
-#### 3.1.2 <a name="Restrictions" href="#Restrictions">Restrictions</a>
+#### <a name="Restrictions" href="#Restrictions">Restrictions</a>
1. The main, auxiliary and index sets are normally restricted to those letters with a specific [Script](https://www.unicode.org/Public/UNIDATA/Scripts.txt) character property (that is, not the values Common or Inherited) or required [Default_Ignorable_Code_Point](https://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt) characters (such as a non-joiner), or combining marks, or the [Word_Break](https://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt) properties [Katakana](https://www.unicode.org/reports/tr29/#Katakana), [ALetter](https://www.unicode.org/reports/tr29/#ALetter), or [MidLetter](https://www.unicode.org/reports/tr29/#MidLetter).
2. The auxiliary set should not overlap with the main set. There is one exception to this: Hangul Syllables and CJK Ideographs can overlap between the sets.
3. Any [Default_Ignorable_Code_Point](https://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt)s should be in the auxiliary set, or, if they are only needed for currency formatting, in the currency set. These can include characters such as U+200E LEFT-TO-RIGHT MARK and U+200F RIGHT-TO-LEFT MARK which may be needed in bidirectional text in order for date, currency or other formats to display correctly.
4. For exemplar characters the [Unicode Set](tr35.md#Unicode_Sets) format is restricted so as to not use properties or boolean combinations.
-### 3.2 ~~<a name="Character_Mapping" href="#Character_Mapping">Mapping</a>~~
+### ~~<a name="Character_Mapping" href="#Character_Mapping">Mapping</a>~~
**This element has been deprecated.** For information on its structure and how it was intended to specify locale-specific preferred encodings for various purposes (e-mail, web), see the [Mapping](https://www.unicode.org/reports/tr35/tr35-39/tr35-general.html#Character_Mapping) section from the CLDR 27 version of the LDML Specification.
-### 3.3 ~~<a name="IndexLabels" href="#IndexLabels">Index Labels</a>~~
+### ~~<a name="IndexLabels" href="#IndexLabels">Index Labels</a>~~
**This element and its subelements have been deprecated.** For information on its structure and how it was intended to provide data for a compressed display of index exemplar characters where space is limited, see the [Index Labels](https://www.unicode.org/reports/tr35/tr35-39/tr35-general.html#IndexLabels) section from the CLDR 27 version of the LDML Specification.
@@ -592,7 +599,7 @@
<!ELEMENT indexLabels (indexSeparator*, compressedIndexSeparator*, indexRangePattern*, indexLabelBefore*, indexLabelAfter*, indexLabel*) >
```
-### 3.4 <a name="Ellipsis" href="#Ellipsis">Ellipsis</a>
+### <a name="Ellipsis" href="#Ellipsis">Ellipsis</a>
```xml
<!ELEMENT ellipsis ( #PCDATA ) >
@@ -613,7 +620,7 @@
<ellipsis type="word-initial">… {0}</ellipsis>
```
-### 3.5 <a name="Character_More_Info" href="#Character_More_Info">More Information</a>
+### <a name="Character_More_Info" href="#Character_More_Info">More Information</a>
The moreInformation string is one that can be displayed in an interface to indicate that more information is available. For example:
@@ -621,7 +628,7 @@
<moreInformation>?</moreInformation>
```
-### 3.6 <a name="Character_Parse_Lenient" href="#Character_Parse_Lenient">Parse Lenient</a>
+### <a name="Character_Parse_Lenient" href="#Character_Parse_Lenient">Parse Lenient</a>
```xml
<!ELEMENT parseLenients ( alias | ( parseLenient*, special* ) ) >
@@ -647,7 +654,7 @@
The `sample` attribute value is a paradigm element of that UnicodeSet, but the only reason for pulling it out separately is so that different classes of characters are separated, and to enable inheritance overriding. The first version of this data is populated with the data used for lenient parsing from ICU.
-## 4 <a name="Delimiter_Elements" href="#Delimiter_Elements">Delimiter Elements</a>
+## <a name="Delimiter_Elements" href="#Delimiter_Elements">Delimiter Elements</a>
```xml
<!ELEMENT delimiters (alias | (quotationStart*, quotationEnd*, alternateQuotationStart*, alternateQuotationEnd*, special*)) >
@@ -668,7 +675,7 @@
<alternateQuotationEnd>’</alternateQuotationEnd>
```
-### 4.1 <a name="Tailor_Linebreak_With_Delimiters" href="#Tailor_Linebreak_With_Delimiters">Tailoring Linebreak Using Delimiters</a>
+### <a name="Tailor_Linebreak_With_Delimiters" href="#Tailor_Linebreak_With_Delimiters">Tailoring Linebreak Using Delimiters</a>
The delimiter data can be used for language-specific tailoring of linebreak behavior, as suggested
in the [description of linebreak class QU: Quotation](https://www.unicode.org/reports/tr14/#QU)
@@ -685,7 +692,7 @@
* U+2019 RIGHT SINGLE QUOTATION MARK, often used as apostrophe, should not be changed from QU; otherwise it will introduce breaks after apostrophe.
* Several locales (mostly for central and eastern Europe) have U+201C LEFT DOUBLE QUOTATION MARK as `<quotationEnd>` or `<alternateQuotationEnd>`. However users in these locales may also encounter English text in which U+201C is used as `<quotationStart>`. In order to prevent improper breaks for English text, in these locales U+201C should not be changed from QU.
-## 5 <a name="Measurement_System_Data" href="#Measurement_System_Data">Measurement System Data</a>
+## <a name="Measurement_System_Data" href="#Measurement_System_Data">Measurement System Data</a>
```xml
<!ELEMENT measurementData ( measurementSystem*, paperSize* ) >
@@ -718,7 +725,7 @@
* The "metric" value indicates the use of SI [[ISO1000](tr35.md#ISO1000)] base or derived units, or non-SI units accepted for use with SI: for example, meters, kilograms, liters, and degrees Celsius.
* The "US" value indicates the customary system of measurement as used in the United States: feet, inches, pints, quarts, degrees Fahrenheit, and so on.
-* The "UK" value indicates the mix of metric units and Imperial units (feet, inches, pints, quarts, and so on) used in the United Kingdom, in which Imperial volume units such as pint, quart, and gallon are different sizes than in the "US" customary system. For more detail about specific units for various usages, see **Part 6: Supplemental:** _Section 2.4.1 [Preferred Units for Specific Usages](tr35-info.md#Preferred_Units_For_Usage)_.
+* The "UK" value indicates the mix of metric units and Imperial units (feet, inches, pints, quarts, and so on) used in the United Kingdom, in which Imperial volume units such as pint, quart, and gallon are different sizes than in the "US" customary system. For more detail about specific units for various usages, see **Part 6: Supplemental:** _[Preferred Units for Specific Usages](tr35-info.md#Preferred_Units_For_Usage)_.
In some cases, it may be common to use different measurement systems for different categories of measurements. For example, the following indicates that for the category of temperature, in the regions LR and MM, it is more common to use metric units than US units.
@@ -732,9 +739,9 @@
The measurement information was formerly in the main LDML file, and had a somewhat different format.
-Again, for finer-grained detail about specific units for various usages, see **Part 6: Supplemental:** _Section 2.4.1 [Preferred Units for Specific Usages](tr35-info.md#Preferred_Units_For_Usage)_.
+Again, for finer-grained detail about specific units for various usages, see **Part 6: Supplemental:** _[Preferred Units for Specific Usages](tr35-info.md#Preferred_Units_For_Usage)_.
-### 5.1 <a name="Measurement_Elements" href="#Measurement_Elements">Measurement Elements (deprecated)</a>
+### <a name="Measurement_Elements" href="#Measurement_Elements">Measurement Elements (deprecated)</a>
```xml
<!ELEMENT measurement (alias | (measurementSystem?, paperSize?, special*)) >
@@ -742,7 +749,7 @@
The `measurement` element is deprecated in the main LDML files, because the data is more appropriately organized as connected to territories, not to linguistic data. Instead, the `measurementData` element in the supplemental data file should be used.
-## 6 <a name="Unit_Elements" href="#Unit_Elements">Unit Elements</a>
+## <a name="Unit_Elements" href="#Unit_Elements">Unit Elements</a>
```xml
<!ELEMENT units (alias | (unit*, unitLength*, durationUnit*, special*) ) >
@@ -791,7 +798,7 @@
</unit>
```
-The German rules are more complicated, because German has both gender and case. They thus have additional information, as illustrated below. Note that if there is no `@case` attribute, for backwards compatibility the implied case is nominative. The possible values for @case are listed in the `grammaticalFeatures` element. These follow the inheritance specified in Part 1, Section [4.1.2 Lateral Inheritance](tr35.md#Lateral_Inheritance). Note that the additional grammar elements are only present in the `<unitLength type='long'>` form.
+The German rules are more complicated, because German has both gender and case. They thus have additional information, as illustrated below. Note that if there is no `@case` attribute, for backwards compatibility the implied case is nominative. The possible values for @case are listed in the `grammaticalFeatures` element. These follow the inheritance specified in Part 1, [Lateral Inheritance](tr35.md#Lateral_Inheritance). Note that the additional grammar elements are only present in the `<unitLength type='long'>` form.
```xml
<unit type="duration-day">
@@ -809,13 +816,13 @@
</unit>
```
-These follow the inheritance specified in Part 1, Section [4.1.2 Lateral Inheritance](tr35.md#Lateral_Inheritance). In addition to supporting language-specific plural cases such as “one” and “other”, unitPatterns support the language-independent explicit cases “0” and “1” for special handling of numeric values that are exactly 0 or 1; see [Explicit 0 and 1 rules](tr35-numbers.md#Explicit_0_1_rules).
+These follow the inheritance specified in Part 1, [Lateral Inheritance](tr35.md#Lateral_Inheritance). In addition to supporting language-specific plural cases such as “one” and “other”, unitPatterns support the language-independent explicit cases “0” and “1” for special handling of numeric values that are exactly 0 or 1; see [Explicit 0 and 1 rules](tr35-numbers.md#Explicit_0_1_rules).
The `<unitPattern>` elements may be used to format quantities with decimal values; in such cases the choice of plural form will depend not only on the numeric value, but also on its formatting (see [Language Plural Rules](tr35-numbers.md#Language_Plural_Rules)). In addition to formatting units for stand-alone use, `<unitPattern>` elements are increasingly being used to format units for use in running text; for such usages, the developing [Grammatical Features](#Grammatical_Features) information will be very useful.
Note that for certain plural cases, the unit pattern may not provide for inclusion of a numeric value—that is, it may not include “{0}”. This is especially true for the explicit cases “0” and “1” (which may have patterns like “zero seconds”). In certain languages such as Arabic and Hebrew, this may also be true with certain units for the plural cases “zero”, “one”, or “two” (in these languages, such plural cases are only used for the corresponding exact numeric values, so there is no concern about loss of precision without the numeric value).
-Units, like other values with a `count` attribute, use a special inheritance. See **Part 1: Core:** _Section 4.1 [Multiple Inheritance](tr35.md#Multiple_Inheritance)_.
+Units, like other values with a `count` attribute, use a special inheritance. See **Part 1: Core:** _[Multiple Inheritance](tr35.md#Multiple_Inheritance)_.
The displayName is used for labels, such as in a UI. It is typically lowercased and as neutral a plural form as possible, and then uses the casing context for the proper display. For example, for English in a UI it would appear as titlecase:
@@ -827,7 +834,7 @@
</tbody></table>
-### 6.1 <a name="Unit_Preference_and_Conversion" href="#Unit_Preference_and_Conversion">Unit Preference and Conversion Data</a>
+### <a name="Unit_Preference_and_Conversion" href="#Unit_Preference_and_Conversion">Unit Preference and Conversion Data</a>
Different locales have different preferences for which unit or combination of units is used for a particular usage, such as measuring a person’s height. This is more fine-grained than merely a preference for metric versus US or UK measurement systems. For example, one locale may use meters alone, while another may use centimeters alone or a combination of meters and centimeters; a third may use inches alone, or (informally) a combination of feet and inches.
@@ -835,9 +842,9 @@
The size of the measurement can also be taken into account, so that an infant can have a height as _18 inches_, and an adult the height as _6 foot 2 inches._
-This data is supplied in **Part 6: [Supplemental](tr35-info.md#Contents)**: [Section 13 Unit Conversion](tr35-info.md#Unit_Conversion) and [Section 13 Unit Preferences](tr35-info.md#Unit_Preferences).
+This data is supplied in **Part 6: [Supplemental](tr35-info.md#Contents)**: [Unit Conversion](tr35-info.md#Unit_Conversion) and [Unit Preferences](tr35-info.md#Unit_Preferences).
-### 6.2 <a name="Unit_Identifiers" href="#Unit_Identifiers">Unit Identifiers</a>
+### <a name="Unit_Identifiers" href="#Unit_Identifiers">Unit Identifiers</a>
Units of measurement, such as _meter_, have defined programmatic identifiers as described in this section.
The main identifier is a _core unit identifier_, which encompasses a number of simpler types of identifiers as follows.
@@ -863,10 +870,10 @@
| day | duration-day |
-The list of valid CLDR simple unit identifiers is found in _Section [3.11 Validity Data](tr35.md#Validity_Data)_.
+The list of valid CLDR simple unit identifiers is found in _[Validity Data](tr35.md#Validity_Data)_.
These names should not be presented to end users, however: the translated names for different languages (or variants of English) are available in the CLDR localized data.
All syntactically valid CLDR unit identifiers values that are not listed in the validity data are reserved by CLDR for additional future units.
-There is one exception: implementations that need to define their own unit identifiers can do so via _Section 6.6 [Private-Use Units](#Private_Use_Units)_.
+There is one exception: implementations that need to define their own unit identifiers can do so via _[Private-Use Units](#Private_Use_Units)_.
A core unit identifier that is not a simple unit is called a _complex unit_ (aka _compound unit_).
A complex unit identifier can be constructed from simple unit identifiers using multiplication (kilogram-meter) and division (kilogram-per-meter), powers (square-second), and prefixes (kilo-, 100-, kiBi).
@@ -876,7 +883,7 @@
Thus they support converting generated units such as inch-pound-per-square-week into comparable units, such as newtons.
Where a core unit ID or mixed unit ID does not have an explicit translation in CLDR, a mechanism is supplied for producing a generated translation from the translations for the simple unit identifiers.
-See _Section 6.4 [Compound Units](#compound-units)_.
+See _[Compound Units](#compound-units)_.
That can be used for less common units, such as _petasecond_.
However, the generated translations may have the wrong spelling in languages where orthographic changes are needed when combining words.
For example, “kilometer” can be formed in English from “kilo” and “meter”; the same process in Greek would combine “χιλιο” and “μέτρα” to get “χιλιομέτρα” — when the correct result is “χιλιόμετρα” (note the different location of the accent).
@@ -895,7 +902,7 @@
#### Syntax
-The formal syntax for identifiers is provided below.
+The formal syntax for identifiers is provided below.
Some of the constraints reference data from the unitIdComponents in [Unit_Conversion](tr35-info.md#Unit_Conversion).
<!-- HTML: no header -->
@@ -907,16 +914,21 @@
| long_unit_identifier</td></tr>
<tr><td>core_unit_identifier</td><td>:=</td>
- <td>product_unit ("-per-" product_unit)*<br/>
- | "per-" product_unit ("-per-" product_unit)*
+ <td>product_unit ("-" per "-" product_unit)*<br/>
+ | per "-" product_unit ("-" per "-" product_unit)*
<ul><li><em>Examples:</em>
<ul><li>foot-per-second-per-second</li>
<li>per-second</li>
</ul></li>
<li><em>Note:</em> The normalized form will have only one "per"</li>
- <li><em>Note:</em>The token 'per' is the single value in <unitIdComponent type=”per”></li>
</ul></td></tr>
+<tr><td>per</td><td>:=</td>
+ <td>"per"
+ <ul>
+        <li><em>Constraint:</em> The token 'per' is the single value in &lt;unitIdComponent type="per"&gt;</li>
+ </ul></td></tr>
+
<tr><td>product_unit</td><td>:=</td>
<td>single_unit ("-" single_unit)* ("-" pu_single_unit)*<br/>
| pu_single_unit ("-" pu_single_unit)*
@@ -929,10 +941,10 @@
<ul><li><em>Examples: </em>square-meter, or 100-square-meter</li></ul></td></tr>
<tr><td>pu_single_unit</td><td>:=</td>
- <td>“xxx-” single_unit | “x-” single_unit
+ <td>"xxx-" single_unit | "x-" single_unit
<ul><li><em>Example:</em> xxx-square-knuts (a Harry Potter unit)</li>
- <li><em>Note:</em> “x-” is only for backwards compatibility</li>
- <li>See Section 6.6 <a href="#Private_Use_Units">Private-Use Units</a></li>
+ <li><em>Note:</em> "x-" is only for backwards compatibility</li>
+ <li>See <a href="#Private_Use_Units">Private-Use Units</a></li>
</ul></td></tr>
<tr><td>number_prefix</td><td>:=</td>
@@ -948,20 +960,21 @@
<tr><td>dimensionality_prefix</td><td>:=</td>
<td>"square-"<p>| "cubic-"<p>| "pow" ([2-9]|1[0-5]) "-"
<ul>
+ <li><em>Constraint:</em> must be value in: <unitIdComponent type="power">.</li>
<li><em>Note:</em> "pow2-" and "pow3-" canonicalize to "square-" and "cubic-"</li>
- <li><em>Note:</em>These are values in <unitIdComponent type=”power”></li>
+ <li><em>Note:</em> These are values in <unitIdComponent type="power"></li>
</ul></td></tr>
<tr><td>simple_unit</td><td>:=</td>
<td>(prefix_component "-")* (prefixed_unit | base_component) ("-" suffix_component)*<br/>
| currency_unit<br/>
- | “em” | “g” | “us” | “hg” | "of"
+ | "em" | "g" | "us" | "hg" | "of"
<ul>
<li><em>Examples:</em> kilometer, meter, cup-metric, fluid-ounce, curr-chf, em</li>
- <li><em>Note:</em> Three simple units are currently allowed as legacy usage, for tokens that wouldn’t otherwise be a base_component due to length (eg, “<strong>g</strong>-force”).
- We will likely deprecate those and add conformant aliases in the future: the “hg” and “of” are already only in deprecated simple_units.</li>
+ <li><em>Note:</em> Three simple units are currently allowed as legacy usage, for tokens that wouldn’t otherwise be a base_component due to length (eg, "<strong>g</strong>-force").
+ We will likely deprecate those and add conformant aliases in the future: the "hg" and "of" are already only in deprecated simple_units.</li>
</ul></td></tr>
-
+
<tr><td>prefixed_unit</td><td></td>
<td>prefix base_component<ul><li><em>Example: </em>kilometer</li></ul></td></tr>
@@ -970,36 +983,47 @@
<tr><td>si_prefix</td><td>:=</td>
<td>"deka" | "hecto" | "kilo", …
- <ul><li><em>Note: </em>See full list at <a href="https://www.nist.gov/pml/special-publication-811">NIST special publication 811</a></li></ul></td></tr>
+ <ul><li><em>Note:</em> See full list at <a href="https://www.nist.gov/pml/special-publication-811">NIST special publication 811</a></li></ul></td></tr>
<tr><td>binary_prefix</td><td>:=</td>
<td>"kibi", "mebi", …
- <ul><li><em>Note: </em>See full list at <a href="https://physics.nist.gov/cuu/Units/binary.html">Prefixes for binary multiples</a></li></ul></td></tr>
+ <ul><li><em>Note:</em> See full list at <a href="https://physics.nist.gov/cuu/Units/binary.html">Prefixes for binary multiples</a></li></ul></td></tr>
<tr><td>prefix_component</td><td>:=</td>
<td>[a-z]{3,∞}
- <ul><li><em>Constraint:</em> must be value in: <unitIdComponent type=”prefix_component”>.</li></ul></td></tr>
+ <ul><li><em>Constraint:</em> must be value in: <unitIdComponent type="prefix">.</li></ul></td></tr>
<tr><td>base_component</td><td>:=</td>
<td>[a-z]{3,∞}
<ul><li><em>Constraint:</em> must not be a value in any of the following:<br>
- <unitIdComponent type=”prefix_component”><br>
- or <unitIdComponent type=”suffix_component”> <br>
- or <unitIdComponent type=”power”><br>
- or <unitIdComponent type=”and”><br>
- or <unitIdComponent type=”per”>.
- </li></ul>
- <ul><li><em>Constraint:</em> must not have a prefix as an initial segment.</li></ul>
+ <unitIdComponent type="prefix"><br>
+ or <unitIdComponent type="suffix"> <br>
+ or <unitIdComponent type="power"><br>
+ or <unitIdComponent type="and"><br>
+ or <unitIdComponent type="per">.
+ </li>
+ <li><em>Constraint:</em> must not have a prefix as an initial segment.</li>
+ <li><em>Constraint:</em> no two different base_components will share the first 8 letters.
+ (<b>For more information, see <a href="#Unit_Identifier_Uniqueness">Unit Identifier Uniqueness</a>.</b>)
+ </li>
+ </ul>
</td></tr>
<tr><td>suffix_component</td><td>:=</td>
<td>[a-z]{3,∞}
- <ul><li><em>Constraint:</em> must be value in: <unitIdComponent type=”suffix_component”></li></ul></td></tr>
+ <ul>
+ <li><em>Constraint:</em> must be value in: <unitIdComponent type="suffix"></li>
+ </ul></td></tr>
<tr><td>mixed_unit_identifier</td><td>:=</td>
- <td>(single_unit | pu_single_unit) ("-and-" (single_unit | pu_single_unit ))*
+ <td>(single_unit | pu_single_unit) ("-" and "-" (single_unit | pu_single_unit ))*
<ul><li><em>Example: foot-and-inch</em></li>
- <li><em>Note:</em>The token 'and' is the single value in <unitIdComponent type=”and”></li>
+ </ul></td></tr>
+
+<tr><td>and</td><td>:=</td>
+ <td>"and"
+ <ul>
+ <li><em>Constraint:</em> The token 'and' is the single value in <unitIdComponent type="and"></li>
</ul></td></tr>
<tr><td>long_unit_identifier</td><td>:=</td>
@@ -1010,32 +1034,56 @@
<tr><td>currency_unit</td><td>:=</td>
<td>"curr-" [a-z]{3}
- <ul><li><em>Constraints:</em>
- <ul><li>The first part of the currency_unit is a standard prefix; the second part of the currency unit must be a valid <a href="tr35.md#UnicodeCurrencyIdentifier">Unicode currency identifier</a>. Note: CLDR does not provide conversions for currencies; this is only intended for formatting.</li>
- </ul></li>
- <li><em>Examples:</em> curr-eur-per-square-meter, or pound-per-curr-usd</li>
- </ul></td></tr>
+ <ul>
+ <li><em>Constraint:</em> The first part of the currency_unit is a standard prefix; the second part of the currency unit must be a valid <a href="tr35.md#UnicodeCurrencyIdentifier">Unicode currency identifier</a>.</li>
+ </ul>
+ <ul>
+ <li><em>Examples:</em> <b>curr-eur</b>-per-square-meter, or pound-per-<b>curr-usd</b></li>
+ <li><em>Note:</em> CLDR does not provide conversions for currencies; this is only intended for formatting.
+ The locale data for currencies is supplied in the <code>currencies</code> element, not in the <code>units</code> element.</li>
+ </ul>
+ </td></tr>
</tbody></table>
-Note that while the syntax allows for number_prefixes in multiple places, the typical use case is only one instances, and after a "-per-".
+Note that while the syntax allows for number_prefixes in multiple places, the typical use case is only one instance, after a "-per-".
-The simple_unit structure does not allow for any two simple_units to overlap.
-That is, there are no cases where simple_unit1 consists of X-Y and simple_unit2 consists of Y-Z.
-This was not true in previous versions of LDML: cup-metric overlapped with metric-ton.
-That meant that the unit identifiers for the product_unit of cup and metric-ton and the product_unit of cup-metric and ton were ambiguous.
+The simple_unit structure does not allow for any two simple_units to overlap.
+That is, there are no cases where simple_unit1 consists of X-Y and simple_unit2 consists of Y-Z.
+This was not true in previous versions of LDML: cup-metric overlapped with metric-ton.
+That meant that the unit identifiers for the product_unit of cup and metric-ton and the product_unit of cup-metric and ton were ambiguous.
-The constraint that the identifiers can't overlap also means that parsing of multiple-subtag simple units is simpler.
+The constraint that the identifiers can't overlap also means that parsing of multiple-subtag simple units is simpler.
For example:
* When a prefix_component is encountered, one can collect any other prefix-components, then one base_component, then any suffix components, and stop.
* Similarly, when a base_component is encountered, one can collect any suffix components, and stop.
* Encountering a suffix_component in any other circumstance is an error.
-### 6.3 <a name="Example_Units" href="#Example_Units">Example Units</a>
+### <a name="Unit_Identifier_Uniqueness" href="#Unit_Identifier_Uniqueness">Unit Identifier Uniqueness</a>
+CLDR Unit Identifiers can be used as values in locale identifiers. When that is done, the syntax is modified whenever a `prefixed_unit` would be longer than 8 characters. In such a case:
+
+* If there is no `prefix` the `prefixed_unit` is truncated to 8 characters.
+* If there is a `prefix`, a hyphen is added between the `prefix` and the `base_component`. If that `base_component` is longer than 8 characters, it is truncated to 8 characters.
+
+_Example_
+| Unit identifier | BCP47 syntax example | Comment |
+| ---- | ---- | ---- |
+| kilogram | en-u-ux-kilogram | kilogram fits in 8 characters |
+| centilux | en-u-ux-centilux | centilux fits in 8 characters |
+| steradian | en-u-ux-steradia | steradian exceeds 8 characters |
+| centigram | en-u-ux-centi-gram | centigram exceeds 8 characters |
+| kilometer | en-u-ux-kilo-meter | kilometer exceeds 8 characters |
+| quectolux | en-u-ux-quecto-lux | quectolux exceeds 8 characters |
+
+This requires that each of the elements in base_components are unique to eight letters, that is: **no two different base_components will share the first 8 letters**.
+
+The reason that the `prefixed_unit` as a whole is not simply truncated to 8 characters is that doing so would impose too strict a constraint. There are 5 letter prefixes such as 'centi' and more recently 6 letter prefixes such as 'quecto'. That would cause prefixed `base_component`s as short as 'gram' and 'gray' to be ambiguous when truncated to 8 letters: 'centigra'; and 'lumen' and 'lux' would fail with the 6 letter prefixes.
+
+### <a name="Example_Units" href="#Example_Units">Example Units</a>
The following table contains examples of groupings and units currently defined by CLDR.
The units in CLDR are not comprehensive; it is anticipated that more will be added over time.
-The complete list of supported units is in the validity data: see _Section [3.11 Validity Data](tr35.md#Validity_Data)_.
+The complete list of supported units is in the validity data: see _[Validity Data](tr35.md#Validity_Data)_.
| Type | Core Unit Identifier | Compound? | Sample Format |
| -------------- | ------------------------ | --------- | -------------- |
@@ -1159,18 +1207,19 @@
Where the unit of measurement is one of the [International System of Units (SI)](https://physics.nist.gov/cuu/Units/units.html), the short and narrow forms will typically use the international symbols, such as “mm” for millimeter. They may, however, be different if that is customary for the language or locale. For example, in Russian it may be more typical to see the Cyrillic characters “мм”.
-Units are included for translation even where they are not typically used in a particular locale, such as kilometers in the US, or inches in Germany. This is to account for use by travelers and specialized domains, such as the German “Fernseher von 32 bis 55 Zoll (80 bis 140 cm)” for TV screen size in inches and centimeters.
+Units are sometimes included for translation even where they are not typically used in a particular locale, such as kilometers in the US, or inches in Germany. This is to account for use by travelers and specialized domains, such as the German “Fernseher von 32 bis 55 Zoll (80 bis 140 cm)” for TV screen size in inches and centimeters.
For temperature, there is a special unit `<unit type="temperature-generic">`, which is used when it is clear from context whether Celsius or Fahrenheit is implied.
For duration, there are special units such as `<unit type="duration-year-person">` and `<unit type="duration-year-week">` for indicating the age of a person, which requires special forms in some languages. For example, in "zh", references to a person being 3 days old or 30 years old would use the forms “他3天大” and “他30岁” respectively.
<a name="compoundUnitPattern"></a><a name="perUnitPatterns"></a>
-### 6.4 <a name="compound-units" href="#compound-units">Compound Units</a>
+
+### Compound Units
A common combination of units is X per Y, such as _miles per hour_ or _liters per second_ or _kilowatt-hours_.
-There are different types of structure used to build the localized name of compound units. All of these follow the inheritance specified in [Part 1, Section 4.1.2 Lateral Inheritance](tr35.md#Lateral_Inheritance).
+There are different types of structure used to build the localized name of compound units. All of these follow the inheritance specified in [Part 1, Lateral Inheritance](tr35.md#Lateral_Inheritance).
**Prefixes** are for powers of 10 and powers of 1024 (the latter only used with digital units of measure). These are invariant for case, gender, or plural (though those could be added in the future if needed by a language).
@@ -1200,7 +1249,7 @@
There can be at most one "per" pattern used in producing a compound unit, while the "times" pattern can be used multiple times.
-`compoundUnitPattern1`s are used for expressing powers, such as square meter or cubic foot. These are the most complicated, since they can vary by plural category (count), by case, and by gender. However, these extra attributes are only used if they are present in the `grammaticalFeatures` element for the language in question. See [Section 15, Grammatical Features](#Grammatical_Features). Note that the additional grammar elements are only present in the `<unitLength type='long'>` form.
+`compoundUnitPattern1`s are used for expressing powers, such as square meter or cubic foot. These are the most complicated, since they can vary by plural category (count), by case, and by gender. However, these extra attributes are only used if they are present in the `grammaticalFeatures` element for the language in question. See [Grammatical Features](#Grammatical_Features). Note that the additional grammar elements are only present in the `<unitLength type='long'>` form.
```xml
<compoundUnit type="power2">
@@ -1317,7 +1366,7 @@
**per0(...), times0(...), etc.**
-1. These represent the **deriveComponent** data values from **Section 16 [Grammatical Derivations](#Grammatical_Derivations)**, where value0 of the per-structure is given as per0(...), and so on.
+1. These represent the **deriveComponent** data values from **[Grammatical Derivations](#Grammatical_Derivations)**, where value0 of the per-structure is given as per0(...), and so on.
2. "power" corresponds to dimensionality_prefix, while "prefix" corresponds to si_prefix.
If the locale does not provide full modern coverage, the process could fall back to root locale for some localized patterns. That may give a "ransom-note" effect for the user. To avoid that, it may be preferable to abort the process at that point, and then localize the unitId for the root locale.
@@ -1330,7 +1379,7 @@
There is also a precomposed **perUnitPattern** which is used as the denominator with another unit name. For example, a form such as "{0} per second" can be used to form "2 feet **per second**". The difference between these is that in some inflected languages, the compoundUnit cannot be used to form grammatical phrases. This is typically because the "per" + "second" combine in a non-trivial way. The `perUnitPattern` should be applied if the denominator has only one element, and matches the `perUnitPattern` type.
-### 6.5 <a name="Unit_Sequences" href="#Unit_Sequences">Unit Sequences (Mixed Units)</a>
+### <a name="Unit_Sequences" href="#Unit_Sequences">Unit Sequences (Mixed Units)</a>
Units may be used in composed sequences (aka _mixed units_), such as **5° 30′** for 5 degrees 30 minutes, or **3 ft 2 in.** For that purpose, the appropriate width of the unit `listPattern` can be used to compose the units in a sequence.
@@ -1342,7 +1391,7 @@
In such a sequence, decimal fractions are typically only displayed for the last element of the sequence, if at all.
-### 6.6 <a name="durationUnit" href="#durationUnit">durationUnit</a>
+### <a name="durationUnit" href="#durationUnit">durationUnit</a>
The durationUnit is a special type of unit used for composed time unit durations.
@@ -1354,7 +1403,7 @@
The type contains a skeleton, where 'h' stands for hours, 'm' for minutes, and 's' for seconds. These are the same symbols used in availableFormats, except that there is no need to distinguish different forms of the hour.
-### 6.7 <a name="coordinateUnit" href="#coordinateUnit">coordinateUnit</a>
+### <a name="coordinateUnit" href="#coordinateUnit">coordinateUnit</a>
The **coordinateUnitPattern** is a special type of pattern used for composing degrees of latitude and longitude, with an indicator of the quadrant. There are exactly 4 type values, plus a displayName for the items in this category. An angle is composed using the appropriate combination of the **angle-degrees**, **angle-arc-minute** and **angle-arc-second** values. It is then substituted for the placeholder field {0} in the appropriate **coordinateUnit** pattern.
@@ -1366,13 +1415,13 @@
<coordinateUnitPattern type="west">{0}W</coordinateUnitPattern>
```
-### 6.8 <a name="Territory_Based_Unit_Preferences" href="#Territory_Based_Unit_Preferences">Territory-Based Unit Preferences</a>
+### <a name="Territory_Based_Unit_Preferences" href="#Territory_Based_Unit_Preferences">Territory-Based Unit Preferences</a>
Different locales have different preferences for which unit or combination of units is used for a particular usage, such as measuring a person’s height. This is more fine-grained than merely a preference for metric versus US or UK measurement systems. For example, one locale may use meters alone, while another may use centimeters alone or a combination of meters and centimeters; a third may use inches alone, or (informally) a combination of feet and inches.
The `<unitPreferenceData>` element, described in [Preferred Units for Specific Usages](tr35-info.md#Preferred_Units_For_Usage), provides information on which unit or combination of units is used for various purposes in different locales, with options for the level of formality and the scale of the measurement (e.g. measuring the height of an adult versus that of an infant).
-### 6.9 <a name="Private_Use_Units" href="#Private_Use_Units">Private-Use Units</a>
+### <a name="Private_Use_Units" href="#Private_Use_Units">Private-Use Units</a>
CLDR has reserved the "xxx-" prefix in the simple_unit part of the unit identifier BNF for private-use units. CLDR will never define a type, simple unit, or compound unit such that the unit identifier starts with "xxx-", ends with "-xxx", or contains "-xxx-".
@@ -1384,7 +1433,7 @@
The older syntax used “x-”, which was expanded to “xxx-” to simplify use with BCP47 syntax. That should be converted to “xxx-”.
-## 7 <a name="POSIX_Elements" href="#POSIX_Elements">POSIX Elements</a>
+## <a name="POSIX_Elements" href="#POSIX_Elements">POSIX Elements</a>
```xml
<!ELEMENT posix (alias | (messages*, special*)) >
@@ -1428,7 +1477,7 @@
This would match n,N,no,nO,No,NO.
-## 8 <a name="Reference_Elements" href="#Reference_Elements">Reference Element</a>
+## <a name="Reference_Elements" href="#Reference_Elements">Reference Element</a>
(Use only in supplemental data; deprecated for ldml.dtd and locale data)
@@ -1447,7 +1496,7 @@
<reference type="R3" uri="URN:ISBN:91-47-04974-X">Svenska skrivregler</reference>
```
-## 9 <a name="Segmentations" href="#Segmentations">Segmentations</a>
+## <a name="Segmentations" href="#Segmentations">Segmentations</a>
```xml
<!ELEMENT segmentations ( alias | segmentation*) >
@@ -1476,7 +1525,14 @@
The `segmentations` element provides for segmentation of text into words, lines, or other segments. The structure is based on [[UAX29](https://www.unicode.org/reports/tr41/#UAX29)] notation, but adapted to be machine-readable. It uses a list of variables (representing character classes) and a list of rules. Each must have an `id` attribute.
-The rules in _root_ implement the segmentations found in [[UAX29](https://www.unicode.org/reports/tr41/#UAX29)] and [[UAX14](https://www.unicode.org/reports/tr41/#UAX14)], for grapheme clusters, words, sentences, and lines. They can be overridden by rules in child locales.
+The rules in _root_ implement the segmentations found in [[UAX29](https://www.unicode.org/reports/tr41/#UAX29)] and
+[[UAX14](https://www.unicode.org/reports/tr41/#UAX14)], for grapheme clusters, words, sentences, and lines. They can be
+overridden by rules in child locales. In addition, there are several locale keywords that affect segmentation:
+
+* "dx", [Unicode Dictionary Break Exclusion Identifier](tr35.md#UnicodeDictionaryBreakExclusionIdentifier)
+* "lb", [Unicode Line Break Style Identifier](tr35.md#UnicodeLineBreakStyleIdentifier)
+* "lw", [Unicode Line Break Word Identifier](tr35.md#UnicodeLineBreakWordIdentifier)
+* "ss", [Unicode Sentence Break Suppressions Identifier](tr35.md#UnicodeSentenceBreakSuppressionsIdentifier)
Here is an example:
@@ -1507,7 +1563,7 @@
...
```
-**Variables:** All variable ids must start with a $, and otherwise be valid identifiers according to the Unicode definitions in [[UAX31](https://www.unicode.org/reports/tr41/#UAX31)]. The contents of a variable is a regular expression using variables and [UnicodeSet](tr35.md#Unicode_Sets)s. The ordering of variables is important; they are evaluated in order from first to last (see _[Section 9.1 Segmentation Inheritance](#Segmentation_Inheritance)_). It is an error to use a variable before it is defined.
+**Variables:** All variable ids must start with a $, and otherwise be valid identifiers according to the Unicode definitions in [[UAX31](https://www.unicode.org/reports/tr41/#UAX31)]. The contents of a variable is a regular expression using variables and [UnicodeSet](tr35.md#Unicode_Sets)s. The ordering of variables is important; they are evaluated in order from first to last (see _[Segmentation Inheritance](#Segmentation_Inheritance)_). It is an error to use a variable before it is defined.
**Rules:** The contents of a rule use the syntax of [[UAX29](https://www.unicode.org/reports/tr41/#UAX29)]. The rules are evaluated in numeric id order (which may not be the order in which they appear in the file). The first rule that matches determines the status of a boundary position, that is, whether it breaks or not. Thus ÷ means a break is allowed; × means a break is forbidden. It is an error if the rule does not contain exactly one of these characters (except where a rule has no contents at all), or if the rule uses a variable that has not been defined.
@@ -1529,7 +1585,7 @@
> ...
> ```
-### 9.1 <a name="Segmentation_Inheritance" href="#Segmentation_Inheritance">Segmentation Inheritance</a>
+### <a name="Segmentation_Inheritance" href="#Segmentation_Inheritance">Segmentation Inheritance</a>
Variables and rules both inherit from the parent.
@@ -1551,7 +1607,7 @@
<rule id="3" /> // deletes rule 3
````
-### 9.2 <a name="Segmentation_Exceptions" href="#Segmentation_Exceptions">Segmentation Suppressions</a>
+### <a name="Segmentation_Exceptions" href="#Segmentation_Exceptions">Segmentation Suppressions</a>
**Note:** As of CLDR 26, the `<suppressions>` data is to be considered a technology preview. Data currently in CLDR was extracted from the Unicode Localization Interoperability project, or ULI. The ULI committee has been disbanded, but historical information can be found at <https://www.unicode.org/uli/>.
@@ -1576,7 +1632,7 @@
**Note:** These elements were called `<exceptions>` and `<exception>` prior to CLDR 26, but those names are now deprecated.
-## 10 <a name="Transforms" href="#Transforms">Transforms</a>
+## <a name="Transforms" href="#Transforms">Transforms</a>
Transforms provide a set of rules for transforming text via a specialized set of context-sensitive matching rules. They are commonly used for transliterations or transcriptions, but also other transformations such as full-width to half-width (for _katakana_ characters). The rules can be simple one-to-one relationships between characters, or involve more complicated mappings. Here is an example:
@@ -1669,7 +1725,7 @@
Note that the script and region codes are cased iff they are in the main subtag, but are lowercase in extensions.
-### 10.1 <a name="Inheritance" href="#Inheritance">Inheritance</a>
+### <a name="Inheritance" href="#Inheritance">Inheritance</a>
The CLDR transforms are built using the following locale inheritance. While this inheritance is not required of LDML implementations, the transforms supplied with CLDR may not otherwise behave as expected without some changes.
@@ -1719,7 +1775,7 @@
Japanese and Korean are special, since they can be represented by combined script codes, such as ja_Jpan, ja_Hrkt, ja_Hira, or ja_Kana. These need to be considered in the above fallback chain as well.
-#### 10.1.1 <a name="Pivots" href="#Pivots">Pivots</a>
+#### <a name="Pivots" href="#Pivots">Pivots</a>
Transforms can also use _pivots_. These are used when there is no direct transform between a source and target, but there are transforms X-Y and Y-Z. In such a case, the transforms can be internally chained to get X-Z = X-Y;Y-Z. This is done explicitly with the Indic script transforms: to get Devanagari-Latin, internally it is done by transforming first from Devanagari to Interindic (an internal superset encoding for Indic scripts), then from Interindic to Latin. This allows there to be only N sets of transform rules for the Indic scripts: each one to and from Interindic. These pivots are explicitly represented in the CLDR transforms.
@@ -1731,7 +1787,7 @@
The interaction of implicit pivots and inheritance may result in a longer inheritance chain lookup than desired, so implementers may consider having some sort of caching mechanism to increase performance.
-### 10.2 <a name="Variants" href="#Variants">Variants</a>
+### <a name="Variants" href="#Variants">Variants</a>
Variants used in CLDR include UNGEGN and BGN, both indicating sources for transliterations. There is an additional attribute `private="true"` which is used to indicate that the transform is meant for internal use, and should not be displayed as a separate choice in a UI.
@@ -1755,9 +1811,9 @@
* UKPCGN - Permanent Committee on Geographical Names for British Official Use
* RUGOST - Russian Main Administration of Geodesy and Cartography
-The rules for transforms are described in Section 10.3 [Transform Rules Syntax](#Transform_Rules_Syntax). For more information on Transliteration, see [Transliteration Guidelines](https://cldr.unicode.org/index/cldr-spec/transliteration-guidelines).
+The rules for transforms are described in [Transform Rules Syntax](#Transform_Rules_Syntax). For more information on Transliteration, see [Transliteration Guidelines](https://cldr.unicode.org/index/cldr-spec/transliteration-guidelines).
-### 10.3 <a name="Transform_Rules_Syntax" href="#Transform_Rules_Syntax">Transform Rules Syntax</a>
+### <a name="Transform_Rules_Syntax" href="#Transform_Rules_Syntax">Transform Rules Syntax</a>
```xml
<!ELEMENT transforms ( transform*) >
@@ -1793,7 +1849,8 @@
The `visibility` attribute indicates whether the IDs should be externally visible, or whether they are only used internally.
-In previous versions, the rules were expressed as fine-grained XML. That was discarded in CLDR version 29, in favor of a simpler format where the separate rules are simply terminated with ";".
+Note: In CLDR v28 and before, the rules were expressed as fine-grained XML.
+That was discarded in CLDR version 29, in favor of a simpler format where the separate rules are simply terminated with ";".
The transform rules are similar to regular-expression substitutions, but adapted to the specific domain of text transformations. The rules and comments in this discussion will be intermixed, with # marking the comments. The simplest rule is a conversion rule, which replaces one string of characters with another. The conversion rule takes the following form:
@@ -1818,6 +1875,8 @@
'←' → arrow' 'sign ;
```
+Note: The characters `→`, `←`, `↔` are preferred, but can be represented by the ASCII characters `>`, `<`, and `<>`, respectively.
+
Spaces may be inserted anywhere without any effect on the rules. Use extra space to separate items out for clarity without worrying about the effects. This feature is particularly useful with combining marks; it is handy to put some spaces around it to separate it from the surrounding text. The following is an example:
```
@@ -1846,7 +1905,7 @@
$pi → p ;
```
-#### 10.3.1 <a name="Dual_Rules" href="#Dual_Rules">Dual Rules</a>
+#### <a name="Dual_Rules" href="#Dual_Rules">Dual Rules</a>
Rules can also specify what happens when an inverse transform is formed. To do this, we reverse the direction of the "←" sign. Thus the above example becomes:
@@ -1860,7 +1919,7 @@
$pi ↔ p ;
```
-#### 10.3.2 <a name="Context" href="#Context">Context</a>
+#### <a name="Context" href="#Context">Context</a>
Context can be used to have the results of a transformation be different depending on the characters before or after. The following rule removes hyphens, but only when they follow lowercase characters:
@@ -1897,22 +1956,55 @@
It will thus convert “-B A-B a-b” to “B AB a-b”.
-#### 10.3.3 <a name="Revisiting" href="#Revisiting">Revisiting</a>
+#### <a name="Revisiting" href="#Revisiting">Revisiting</a>
-If the resulting text contains a vertical bar "|", then that means that processing will proceed from that point and that the transform will revisit part of the resulting text. Thus the | marks a "cursor" position. For example, if we have the following, then the string "xa" will convert to "w".
+If the resulting text contains a vertical bar "|", then that means that processing will proceed from that point and that the transform will revisit part of the resulting text.
+Thus the | marks a "cursor" position.
+For example, if we have the following, then the string "xa" will convert to "yw".
```
x → y | z ;
z a → w ;
```
-First, "xa" is converted to "yza". Then the processing will continue from after the character "y", pick up the "za", and convert it. Had we not had the "|", the result would have been simply "yza". The '@' character can be used as filler character to place the revisiting point off the start or end of the string. Thus the following causes x to be replaced, and the cursor to be backed up by two characters.
+First, "xa" is converted to "yza". Then the processing will continue from after the character "y", pick up the "za", and convert it. Had we not had the "|", the result would have been simply "yza".
+
+The '@' character can be used as filler character to place the revisiting point off the start or end of the string — but only within the context. Consider the following rules, with the table afterwards showing how they work.
```
-x → |@@y;
+1. [a-z]{x > |@ab ;
+2. ab > J;
+3. ca > M;
+```
+The ⸠ indicates the virtual cursor:
+
+| Current text | Matching rule |
+| - | - |
+| ⸠cx | no match, cursor advances one code point |
+| c⸠x | matches rule 1, so the text is replaced and cursor backs up. |
+| ⸠cab | matches rule 3, so the text is replaced, with cursor at the end. |
+| Mb⸠ | cursor is at the end, so we are done. |
+
+Notice that rule 2 did not have a chance to trigger.
+
+There is a current restriction that @ cannot back up before the before_context or after the after_context.
+Consider the rules if rule 1 is adjusted to have no before_context.
+
+```
+1'. x > |@ab ;
+2. ab > J ;
+3. ca > M;
```
-#### 10.3.4 <a name="Example" href="#Example">Example</a>
+In that case, the results are different.
+| Current text | Matching rule |
+| - | - |
+| ⸠cx | no match, cursor advances one code point |
+| c⸠x | matches rule 1, so the text is replaced and cursor backs up; but only to the start of the replaced text, because there is no before_context to back up into. |
+| c⸠ab | matches **rule 2**, so the text is replaced, with cursor at the end. |
+| cJ⸠ | cursor is at the end, so we are done. |
+
+#### <a name="Example" href="#Example">Example</a>
The following shows how these features are combined together in the Transliterator "Any-Publishing". This transform converts the ASCII typewriter conventions into text more suitable for desktop publishing (in English). It turns straight quotation marks or UNIX style quotation marks into curly quotation marks, fixes multiple spaces, and converts double-hyphens into a dash.
@@ -1953,7 +2045,7 @@
<https://util.unicode.org/UnicodeJsps/transform.jsp>
-#### 10.3.5 <a name="Rule_Syntax" href="#Rule_Syntax">Rule Syntax</a>
+#### <a name="Rule_Syntax" href="#Rule_Syntax">Rule Syntax</a>
The following describes the full format of the list of rules used to create a transform. Each rule in the list is terminated by a semicolon. The list consists of the following:
@@ -1967,7 +2059,7 @@
The rule list can also generate the inverse of the transform. In that case, the inverse of each of the rules is used, as described below.
-#### 10.3.6 <a name="Transform_Rules" href="#Transform_Rules">Transform Rules</a>
+#### <a name="Transform_Rules" href="#Transform_Rules">Transform Rules</a>
Each transform rule consists of two colons followed by a transform name, which is of the form source-target. For example:
@@ -1998,7 +2090,7 @@
:: lower ; # executed for both the normal and the inverse
```
-#### 10.3.7 <a name="Variable_Definition_Rules" href="#Variable_Definition_Rules">Variable Definition Rules</a>
+#### <a name="Variable_Definition_Rules" href="#Variable_Definition_Rules">Variable Definition Rules</a>
Each variable definition is of the following form:
@@ -2016,7 +2108,7 @@
Variables are only replaced within other variable definition rules and within conversion rules. They have no effect on transliteration rules.
-#### 10.3.8 <a name="Filter_Rules" href="#Filter_Rules">Filter Rules</a>
+#### <a name="Filter_Rules" href="#Filter_Rules">Filter Rules</a>
A filter rule consists of two colons followed by a UnicodeSet. This filter is global in that only the characters matching the filter will be affected by any transform rules or conversion rules. The inverse filter rule consists of two colons followed by a UnicodeSet in parentheses. This filter is also global for the inverse transform.
@@ -2032,7 +2124,7 @@
The filters keep the transform from mistakenly converting any of the "pivot" characters. Note that this is a case where a rule list contains no conversion rules at all, just transform rules and filters.
-#### 10.3.9 <a name="Conversion_Rules" href="#Conversion_Rules">Conversion Rules</a>
+#### <a name="Conversion_Rules" href="#Conversion_Rules">Conversion Rules</a>
Conversion rules can be forward, backward, or double. The complete conversion rule syntax is described below:
@@ -2067,7 +2159,13 @@
> b | c ← e { f g } h ;
> ```
-#### 10.3.10 <a name="Intermixing_Transform_Rules_and_Conversion_Rules" href="#Intermixing_Transform_Rules_and_Conversion_Rules">Intermixing Transform Rules and Conversion Rules</a>
+The `completed_result` | `result_to_revisit` is also known as the `resulting_text`. Either or both of the values can be empty. For example, the following removes any a, b, or c.
+
+```
+[a-c] → ;
+```
+
+#### <a name="Intermixing_Transform_Rules_and_Conversion_Rules" href="#Intermixing_Transform_Rules_and_Conversion_Rules">Intermixing Transform Rules and Conversion Rules</a>
Transform rules and conversion rules may be freely intermixed. Inserting a transform rule into the middle of a set of conversion rules has an important side effect.
@@ -2144,7 +2242,7 @@
ss → z ;
```
-#### 10.3.11 <a name="Inverse_Summary" href="#Inverse_Summary">Inverse Summary</a>
+#### <a name="Inverse_Summary" href="#Inverse_Summary">Inverse Summary</a>
The following table shows how the same rule list generates two different transforms, where the inverse is restated in terms of forward rules (this is a contrived example, simply to show the reordering):
@@ -2189,7 +2287,57 @@
Note how the irrelevant rules (the inverse filter rule and the rules containing ←) are omitted (ignored, actually) in the forward direction, and notice how things are reversed: the transform rules are inverted and happen in the opposite order, and the groups of conversion rules are also executed in the opposite relative order (although the rules within each group are executed in the same order).
-## 11 <a name="ListPatterns" href="#ListPatterns">List Patterns</a>
+Because the order of rules matters, the following will not work as expected:
+```
+c → s;
+ch → kh;
+```
+The second rule can never execute, because it is "masked" by the first.
+To help prevent errors, implementations should try to alert readers when this occurs, e.g.:
+```
+Rule {c > s;} masks {ch > kh;}
+```
+
+### Transform Syntax Characters
+
+The following summarizes the syntax characters used in transforms.
+
+| Character(s) | Description | Example |
+| - | - | - |
+| ; | End of a conversion rule, variable definition, or transform rule invocation | a → b ; |
+| \:\: | Invoke a transform | :: Null ; |
+| (, ) | In a transform rule invocation, marks the backwards transform | :: Null (NFD); |
+| $ | Mark the start of a variable, when followed by an ASCII letter | $abc |
+| = | Used to define variables | $a = abc ; |
+| →, \> | Transform from left to right (only for forward conversion rules) | a → b ; |
+| ←, \< | Transform from right to left (only for backward conversion rules) | a ← b ; |
+| ↔, \<\> | Transform from left to right (for forward) and right to left (for backward) | a ↔ b ; |
+| { | Mark the boundary between before_context and the text_to_replace | a {b} c → B ; |
+| } | Mark the boundary between the text_to_replace and after_context | a {b} c → B ; |
+| ' | Escape one or more characters, until the next ' | '\<\>' → x ; |
+| " | Escape one or more characters, until the next " | "\<\>" → x ; |
+| \\ | Escape the next character | \\\<\\\> → x ; |
+| # | Comment (until the end of a line) | a → ; # remove a |
+| \| | In the resulting_text, moves the cursor | a → A \| b; |
+| @ | In the resulting_text, filler character used to move the cursor before the start or after the end of the result | a → Ab@\|; |
+| (, ) | In text_to_replace, a capturing group | ([a-b]) > &hex($1); |
+| $ | In replacement_text, when followed by 1..9, is replaced by the contents of a capture group | ([a-b]) > &hex($1); |
+| ^ | In a before_context, by itself, equivalent to [$] **(deprecated)** | ... |
+| ? | In a before_context, after_context, or text_to_replace, a possessive quantifier for zero or one | a?b → c ; |
+| + | In a before_context, after_context, or text_to_replace, a possessive quantifier for one or more | a+b → c ; |
+| * | In a before_context, after_context, or text_to_replace, a possessive quantifier for zero or more | a*b → c ; |
+| & | Invoke a function in the replacement_text | ([a-b]) > &hex($1); |
+| !, %, _, ~, -, ., / | Reserved for future syntax | ... |
+| SPACE | Ignored except when quoted | a b # same as ab |
+| \uXXXX | Hex notation: 4 Xs | \u0061 |
+| \x{XX...} | Hex notation: 1-6 Xs | \x{61} |
+| [, ] | Marks a UnicodeSet | [a-z] |
+| \p{...} | Marks a UnicodeSet formed from a property | \p{di} |
+| \P{...} | Marks a negative UnicodeSet formed from a property | \P{DI} |
+| $ | Within a UnicodeSet (not before ASCII letter), matches the start or end of the source text (but is not replaced) | [$] b → c |
+| Other | Many of these characters have special meanings inside a UnicodeSet | ... |
+
+## <a name="ListPatterns" href="#ListPatterns">List Patterns</a>
```xml
<!ELEMENT listPatterns (alias | (listPattern*, special*)) >
@@ -2308,7 +2456,9 @@
In many languages there may not be a difference among many of these lists. In others, the spacing, the length or presence or a conjunction, and the separators may change.
-### 11.1 <a name="List_Gender" href="#List_Gender">Gender of Lists</a>
+Currently there are no locale keywords that affect list patterns; they are selected using the base locale ID, ignoring any -u- extension keywords.
+
+### <a name="List_Gender" href="#List_Gender">Gender of Lists</a>
```xml
<!-- Gender List support -->
@@ -2341,7 +2491,7 @@
2. **mixedNeutral:** If the elements of the list are all male, "male" form is used for the list. If all the elements of the lists are female, "female" form is used. If the list has a mix of male, female and neutral names, the "other" form is used.
3. **maleTaints:** If all the elements of the lists are female, "female" form is used, otherwise the "male" form is used.
-## 12 <a name="Context_Transform_Elements" href="#Context_Transform_Elements">ContextTransform Elements</a>
+## <a name="Context_Transform_Elements" href="#Context_Transform_Elements">ContextTransform Elements</a>
```xml
<!ELEMENT contextTransforms ( alias | (contextTransformUsage*, special*)) >
@@ -2422,7 +2572,7 @@
| `unit-pattern` | `units/unitLength[type=*]/unit[type=*]/unitPattern[count=*]` unit names |
| `number-spellout` | `rbnf/rulesetGrouping[type=*]/ruleset[type=*]/rbnfrule` number spellout rules |
-## 13 <a name="Choice_Patterns" href="#Choice_Patterns">Choice Patterns</a>
+## <a name="Choice_Patterns" href="#Choice_Patterns">Choice Patterns</a>
A choice pattern is a string that chooses among a number of strings, based on numeric value. It has the following form:
@@ -2447,7 +2597,7 @@
Quoting is done using ' characters, as in date or number formats.
-## 14 <a name="Annotations" href="#Annotations">Annotations and Labels</a>
+## <a name="Annotations" href="#Annotations">Annotations and Labels</a>
Annotations provide information about characters, typically used in input. For example, on a mobile keyboard they can be used to do completion. They are typically used for symbols, especially emoji characters.
@@ -2486,29 +2636,31 @@
The cp value may contain sequences, but does not contain any Emoji or Text Variant (VS15 & VS16) characters. All such characters should be removed before looking up any short names and keywords.
-### 14.1 <a name="SynthesizingNames" href="#SynthesizingNames">Synthesizing Sequence Names</a>
+### <a name="SynthesizingNames" href="#SynthesizingNames">Synthesizing Sequence Names</a>
Many emoji are represented by sequences of characters. When there are no `annotation` elements for that string, the short name can be synthesized as follows. **Note:** The process details may change after the release of this specification, and may further change in the future if other sequences are added. Please see the [Known Issues](https://cldr.unicode.org/index/downloads/cldr-41#h.qa3jolg7zi2s) section of the CLDR download page for any updates.
1. If **sequence** is an **emoji flag sequence**, look up the territory name in CLDR for the corresponding ASCII characters and return as the short name. For example, the regional indicator symbols P+F would map to “Französisch-Polynesien” in German.
2. If **sequence** is an **emoji tag sequence**, look up the subdivision name in CLDR for the corresponding ASCII characters and return as the short name. For example, the TAG characters gbsct would map to “Schottland” in German.
3. If **sequence** is a keycap sequence or 🔟, use the characterLabel for "keycap" as the **prefixName** and set the **suffix** to be the sequence (or "10" in the case of 🔟), then go to step 8.
-4. Let **suffix** and **prefixName** be "".
-5. If **sequence** contains any emoji modifiers, move them (in order) into **suffix**, removing them from **sequence**.
-6. If **sequence** is a "KISS", "HEART", "FAMILY", or "HOLDING HANDS" emoji ZWJ sequence, move the characters in **sequence** to the front of **suffix**, and set the **sequence** to be "💏", "💑", or "👪" respectively, and go to step 7.
+4. If the **sequence** ends with the string ZWJ + ➡️, look up the name of that sequence with that string removed. Embed that name into the "facing-right" characterLabelPattern and return it.
+5. Let **suffix** and **prefixName** be "".
+6. If **sequence** contains any emoji modifiers, move them (in order) into **suffix**, removing them from **sequence**.
+7. If **sequence** is a "KISS", "HEART", "FAMILY", or "HOLDING HANDS" emoji ZWJ sequence, move the characters in **sequence** to the front of **suffix**, and set the **sequence** to be "💏", "💑", or "👪" respectively, and go to step 8.
1. A KISS sequence contains ZWJ, "💋", and "❤", which are skipped in moving to **suffix**.
2. A HEART sequence contains ZWJ and "❤", which are skipped in moving to **suffix**.
3. A HOLDING HANDS sequence contains ZWJ+🤝+ZWJ, which are skipped in moving to **suffix**.
4. A FAMILY sequence contains only characters from the set {👦, 👧, 👨, 👩, 👴, 👵, 👶}. Nothing is skipped in moving to **suffix**, except ZWJ.
-7. If **sequence** ends with ♂ or ♀, and does not have a name, remove the ♂ or ♀ and move the name for "👨" or "👩" respectively to the start of **prefixName**.
-8. Transform **sequence** and append to **prefixName**, by successively getting names for the longest subsequences, skipping any singleton ZWJ characters. If there is more than one name, use the listPattern for unit-short, type=2 to link them.
-9. Transform **suffix** into **suffixName** in the same manner.
-10. If both the **prefixName** and **suffixName** are non-empty, form the name by joining them with the "category-list" characterLabelPattern and return it. Otherwise return whichever of them is non-empty.
+8. If **sequence** ends with ♂ or ♀, and does not have a name, remove the ♂ or ♀ and move the name for "👨" or "👩" respectively to the start of **prefixName**.
+9. Transform **sequence** and append to **prefixName**, by successively getting names for the longest subsequences, skipping any singleton ZWJ characters. If there is more than one name, use the listPattern for unit-short, type=2 to link them.
+10. Transform **suffix** into **suffixName** in the same manner.
+11. If both the **prefixName** and **suffixName** are non-empty, form the name by joining them with the "category-list" characterLabelPattern and return it. Otherwise return whichever of them is non-empty.
The synthesized keywords can follow a similar process.
1. For an **emoji flag sequence** or **emoji tag sequence** representing a subdivision, use "flag".
2. For keycap sequences, use "keycap".
+3. For sequences with ZWJ + ➡️, use the keywords for the sequence without the ZWJ + ➡️.
3. For other sequences, add the keywords for the subsequences used to get the short names for **prefixName**, and the short names used for **suffixName**.
Some examples for English data (v30) are given in the following table.
@@ -2547,7 +2699,7 @@
For more information, see [Unicode Emoji](https://www.unicode.org/reports/tr51/).
-### 14.2 <a name="Character_Labels" href="#Character_Labels">Annotations Character Labels</a>
+### <a name="Character_Labels" href="#Character_Labels">Annotations Character Labels</a>
```xml
<!ELEMENT characterLabels ( alias | ( characterLabelPattern*, characterLabel*, special* ) ) >
@@ -2620,9 +2772,11 @@
| limited_use | limited-use | Not in common modern use. |
| male | male | Indicates that a character is male or masculine in appearance. |
| modifier | modifier | A Unicode modifier letter or symbol. |
-| nonspacing | nonspacing | Uses for characters that occupy no width by themselves, such as the ¨ over the a in ä. |
+| nonspacing | nonspacing | Used for characters that occupy no width by themselves, such as the ¨ over the a in ä. |
+| facing-left | facing-left | Characters that face to the left. Also used to construct names for emoji variants. |
+| facing-right | facing-right | Characters that face to the right. Also used to construct names for emoji variants. |
-### 14.3 <a name="Typographic_Names" href="#Typographic_Names">Typographic Names</a>
+### <a name="Typographic_Names" href="#Typographic_Names">Typographic Names</a>
```xml
<!ELEMENT typographicNames ( alias | ( axisName*, styleName*, featureName*, special* ) ) >
@@ -2664,7 +2818,7 @@
2. Look up (type, subtype) in a data table derived from CLDR’s style names. If CLDR supplies multiple alternate names for this (type, subtype), use the one whose “alt” key is matching; otherwise, use the default alternate (which has no “alt” attribute in CLDR).
7. Concatenate the strings, with a separator between them.
-## 15 <a name="Grammatical_Features" href="#Grammatical_Features">Grammatical Features</a>
+## <a name="Grammatical_Features" href="#Grammatical_Features">Grammatical Features</a>
LDML supplies grammatical information that can be used to distinguish localized forms on a per-locale basis. The current data is part of an initial phase; the longer term plan is to add structure to permit localized forms based on these features, starting with measurement units such as the dative form in Serbian of “kilometer”. That will allow unit values to be inserted as placeholders into messages and adopt the right forms for grammatical agreement.
@@ -2707,14 +2861,14 @@
* a scope attribute is only used when there is a corresponding “general” element, one for the same language and target without a scope attribute.
* the scope attribute values must be narrower (a proper subset, possibly empty) of those in the corresponding general element.
-### 15.1 <a name="Gender" href="#Gender">Gender</a>
+### <a name="Gender" href="#Gender">Gender</a>
-Feature that classifies nouns in classes.
-This is grammatical gender, which may be assigned on the basis of sex in some languages, but may be completely separate in others.
-Also used to tag elements in CLDR that should agree with a particular gender of an associated noun.
+Feature that classifies nouns in classes.
+This is grammatical gender, which may be assigned on the basis of sex in some languages, but may be completely separate in others.
+Also used to tag elements in CLDR that should agree with a particular gender of an associated noun.
(adapted from: [linguistics-ontology.org/gold/2010/GenderProperty](http://linguistics-ontology.org/gold/2010/GenderProperty))
-The term "gender" is somewhat of a misnomer, because CLDR treats "gender" as a broad term, equivalent to "noun class".
+The term "gender" is somewhat of a misnomer, because CLDR treats "gender" as a broad term, equivalent to "noun class".
Thus it bundles noun class categories such as gender and animacy into a single identifier, such as "feminine-animate".
#### Example
@@ -2736,7 +2890,7 @@
| masculine | In a masculine/feminine or in a masculine/feminine/neuter gender system, gender that denotes specifically male persons (or animals) or that is assigned arbitrarily to object. | adapted from: [wikipedia.org/wiki/Grammatical_gender](https://en.wikipedia.org/wiki/Grammatical_gender), [linguistics-ontology.org/gold/2010/MasculineGender](http://linguistics-ontology.org/gold/2010/MasculineGender) |
| neuter | In a masculine/feminine/neuter or common/neuter gender system, gender that generally denotes an object. | adapted from: [wikipedia.org/wiki/Grammatical_gender](https://en.wikipedia.org/wiki/Grammatical_gender), [linguistics-ontology.org/gold/2010/NeuterGender](http://linguistics-ontology.org/gold/2010/NeuterGender) |
-There are further simplifications in the identifiers.
+There are further simplifications in the identifiers.
For example, consider a language that has 3 genders, and two levels of animacy, but only for masculine.
The set of combinations would be:
@@ -2752,11 +2906,11 @@
* feminine
* neuter
-That is:
+That is:
* unspecified and animate are dropped.
* if there is only a single gender with inanimate, then the gender is dropped.
-### 15.2 <a name="Case" href="#Case">Case</a>
+### <a name="Case" href="#Case">Case</a>
#### Table: Case
@@ -2816,7 +2970,7 @@
| unspecified | Noun without any definiteness marking in some specific construction (specific to Danish). | |
-## 16 <a name="Grammatical_Derivations" href="#Grammatical_Derivations">Grammatical Derivations</a>
+## <a name="Grammatical_Derivations" href="#Grammatical_Derivations">Grammatical Derivations</a>
```xml
<!ELEMENT grammaticalData ( grammaticalFeatures*, grammaticalDerivations*) >
@@ -2860,7 +3014,7 @@
Note that the prefix and power nodes are unary (exactly 1 child), the per pattern is unary or binary (1 or 2 children), and the times pattern is n-ary (where n > 1).
-Each section below is only applicable if the language has more than one value _for units_: for example, for plural categories the language has to have more than just "other". When that information is available for a language, it is found in **Section 15 [Grammatical Features](#Grammatical_Features)**.
+Each section below is only applicable if the language has more than one value _for units_: for example, for plural categories the language has to have more than just "other". When that information is available for a language, it is found in **[Grammatical Features](#Grammatical_Features)**.
The gender derivation would be appropriate for an API call like `String genderValue = getGrammaticalGender(locale, "kilogram-meter-per-square-second")`. This can be used where the choice of word forms in the rest of a phrase can depend on the gender of the unit.
@@ -2870,9 +3024,9 @@
**times**(_kilogram, **times**(square-kilometer, **times**(ampere, candela)))_
-For a description of how to use these fields to construct a localized name, see **Section 6.4 [Compound Units](#compound-units)**.
+For a description of how to use these fields to construct a localized name, see **[Compound Units](#compound-units)**.
-### 16.1 <a name="gender_compound_units" href="#gender_compound_units">Deriving the Gender of Compound Units</a>
+### <a name="gender_compound_units" href="#gender_compound_units">Deriving the Gender of Compound Units</a>
The **deriveCompound\[@feature="gender"\]** data provides information for how to derive the gender of the whole compound from the gender of its atomic units and structure. The `attributeValues` of value are: **`0` (=gender of the first element), `1` (=gender of second element), or one of the valid gender values for the language.** In the unusual case that the 'per' compound has no first element and 0 is supplied, then the value is 1.
@@ -2890,7 +3044,7 @@
* The gender of the compound is the gender of the first component of the 'per', that is, of the "gram". So if gram is feminine in that language, the gender of the compound is feminine.
-### 16.2 <a name="plural_compound_units" href="#plural_compound_units">Deriving the Plural Category of Unit Components</a>
+### <a name="plural_compound_units" href="#plural_compound_units">Deriving the Plural Category of Unit Components</a>
The `deriveComponent[@feature="plural"]` data provides information for how to derive the plural category for each of the atomic units, from the plural category of the whole compound and the structure of the compound. The `attributeValues` of `value0` and `value1` are: `compound` (=the `pluralCategory` of the compound), or one of the valid plural category values for the language.
@@ -2908,7 +3062,7 @@
* When the plural form of gram-per-meter is needed (rather than singular), then the gram part of the translation has to have a plural form like “grams”, while the meter part of the translation has to have a singular form like “metre”. This would be composed with the pattern for "per" (say "{0} pro {1}") to get "grams pro metre".
-### 16.3 <a name="case_compound_units" href="#case_compound_units">Deriving the Case of Unit Components</a>
+### <a name="case_compound_units" href="#case_compound_units">Deriving the Case of Unit Components</a>
The `deriveComponent[@feature="case"]` data provides information for how to derive the grammatical case for each of the atomic units, from the grammatical case of the whole compound and the structure of the compound. The `attributeValues` of value0 and value1 are: `compound` (=the grammatical case of the compound), or one of the valid grammatical case values for the language.
@@ -2927,6 +3081,6 @@
* * *
-Copyright © 2001–2022 Unicode, Inc. All Rights Reserved. The Unicode Consortium makes no expressed or implied warranty of any kind, and assumes no liability for errors or omissions. No liability is assumed for incidental and consequential damages in connection with or arising out of the use of the information or programs contained or accompanying this technical report. The Unicode [Terms of Use](https://www.unicode.org/copyright.html) apply.
+Copyright © 2001–2023 Unicode, Inc. All Rights Reserved. The Unicode Consortium makes no expressed or implied warranty of any kind, and assumes no liability for errors or omissions. No liability is assumed for incidental and consequential damages in connection with or arising out of the use of the information or programs contained or accompanying this technical report. The Unicode [Terms of Use](https://www.unicode.org/copyright.html) apply.
Unicode and the Unicode logo are trademarks of Unicode, Inc., and are registered in some jurisdictions.
diff --git a/docs/ldml/tr35-info.anchors.json b/docs/ldml/tr35-info.anchors.json
new file mode 100644
index 0000000..a215768
--- /dev/null
+++ b/docs/ldml/tr35-info.anchors.json
@@ -0,0 +1,94 @@
+[
+ "Alias_Attribute_Values",
+ "aliases",
+ "Appendix_Supplemental_Metadata",
+ "caveats",
+ "constants",
+ "constraints",
+ "Contents",
+ "contents-of-part-6-supplemental",
+ "conversion-data",
+ "conversion-mechanisms",
+ "Coverage_Level_Data_Requirements",
+ "Coverage_Level_Default_Values",
+ "Coverage_Level_Definitions",
+ "Coverage_Levels",
+ "coverage-levels",
+ "data-requirements",
+ "Default_Content",
+ "default-content",
+ "default-values",
+ "definitions",
+ "derived-unit-system",
+ "discarding-offsets",
+ "duplicate-units",
+ "exceptional-cases",
+ "identities",
+ "introduction-supplemental-data",
+ "language_population_percent",
+ "literacy_percent",
+ "literacy_percent_for_langPop",
+ "locale-metadata-elements",
+ "Metadata_Elements",
+ "mixed-units",
+ "official_language",
+ "official_regional_language",
+ "Parent_Locales",
+ "parent-locales",
+ "parts",
+ "Parts",
+ "Postal_Code_Validation",
+ "postal-code-validation-deprecated",
+ "Preferred_Units_For_Usage",
+ "preferred-units-for-specific-usages",
+ "quantities-and-base-units",
+ "rgScope",
+ "rgscope-scope-of-the-rg-locale-key",
+ "status",
+ "Subdivision_Containment",
+ "subdivision-containment",
+ "summary",
+ "Supplemental_Alias_Information",
+ "Supplemental_Character_Fallback_Data",
+ "Supplemental_Code_Mapping",
+ "Supplemental_Data",
+ "Supplemental_Deprecated_Information",
+ "Supplemental_Language_Data",
+ "Supplemental_Language_Grouping",
+ "Supplemental_Territory_Containment",
+ "Supplemental_Territory_Information",
+ "supplemental-alias-information",
+ "supplemental-character-fallback-data",
+ "supplemental-code-mapping",
+ "supplemental-deprecated-information-deprecated",
+ "supplemental-language-data",
+ "supplemental-language-grouping",
+ "supplemental-metadata",
+ "supplemental-territory-containment",
+ "supplemental-territory-information",
+ "table-alias-attribute-values",
+ "Telephone_Code_Data",
+ "telephone-code-data-deprecated",
+ "Territory_Based_Preferences",
+ "Territory_Data",
+ "territory-based-preferences",
+ "territory-data",
+ "testing",
+ "unicode-locale-data-markup-language-ldmlpart-6-supplemental",
+ "Unit_Conversion",
+ "Unit_Identifier_Normalization",
+ "Unit_Preferences",
+ "Unit_Preferences_Data",
+ "Unit_Preferences_Overrides",
+ "unit-conversion",
+ "unit-identifier-normalization",
+ "unit-parsing-data",
+ "unit-preferences",
+ "unit-preferences-data",
+ "unit-preferences-overrides",
+ "unittype-vs-quantity",
+ "unresolved-units",
+ "Version_Information",
+ "version-information",
+ "writing_percent"
+]
\ No newline at end of file
diff --git a/docs/ldml/tr35-info.md b/docs/ldml/tr35-info.md
index 4de6c44..fb6e0fa 100644
--- a/docs/ldml/tr35-info.md
+++ b/docs/ldml/tr35-info.md
@@ -1,8 +1,8 @@
-## Unicode Technical Standard #35
+## Unicode Technical Standard #35
# Unicode Locale Data Markup Language (LDML)<br/>Part 6: Supplemental
-|Version|42 |
+|Version|44.1 |
|-------|-----------|
|Editors|Steven Loomis (<a href="mailto:[email protected]">[email protected]</a>) and <a href="tr35.md#Acknowledgments">other CLDR committee members|
@@ -21,7 +21,12 @@
### _Status_
-_This document has been reviewed by Unicode members and other interested parties, and has been approved for publication by the Unicode Consortium. This is a stable document and may be used as reference material or cited as a normative reference by other specifications._
+<!-- _This is a draft document which may be updated, replaced, or superseded by other documents at any time.
+Publication does not imply endorsement by the Unicode Consortium.
+This is not a stable document; it is inappropriate to cite this document as other than a work in progress._ -->
+
+_This document has been reviewed by Unicode members and other interested parties, and has been approved for publication by the Unicode Consortium.
+This is a stable document and may be used as reference material or cited as a normative reference by other specifications._
> _**A Unicode Technical Standard (UTS)** is an independent specification. Conformance to the Unicode Standard does not imply conformance to any UTS._
@@ -42,36 +47,38 @@
## <a name="Contents" href="#Contents">Contents of Part 6, Supplemental</a>
-* 1 Introduction [Supplemental Data](#Supplemental_Data)
-* 2 [Territory Data](#Territory_Data)
- * 2.1 [Supplemental Territory Containment](#Supplemental_Territory_Containment)
- * 2.2 [Subdivision Containment](#Subdivision_Containment)
- * 2.3 [Supplemental Territory Information](#Supplemental_Territory_Information)
- * 2.4 [Territory-Based Preferences](#Territory_Based_Preferences)
- * 2.4.1 [Preferred Units for Specific Usages](#Preferred_Units_For_Usage)
- * 2.5 [`<rgScope>`: Scope of the “rg” Locale Key](#rgScope)
-* 3 [Supplemental Language Data](#Supplemental_Language_Data)
-* 3.1 [Supplemental Language Grouping](#Supplemental_Language_Grouping)
-* 4 [Supplemental Code Mapping](#Supplemental_Code_Mapping)
-* 5 ~~[Telephone Code Data](#Telephone_Code_Data)~~ (Deprecated)
-* 6 ~~[Postal Code Validation (Deprecated)](#Postal_Code_Validation)~~
-* 7 [Supplemental Character Fallback Data](#Supplemental_Character_Fallback_Data)
-* 8 [Coverage Levels](#Coverage_Levels)
- * 8.1 [Definitions](#Coverage_Level_Definitions)
- * 8.2 [Data Requirements](#Coverage_Level_Data_Requirements)
- * 8.3 [Default Values](#Coverage_Level_Default_Values)
-* 9 [Supplemental Metadata](#Appendix_Supplemental_Metadata)
- * 9.1 [Supplemental Alias Information](#Supplemental_Alias_Information)
+* Introduction [Supplemental Data](#Supplemental_Data)
+* [Territory Data](#Territory_Data)
+ * [Supplemental Territory Containment](#Supplemental_Territory_Containment)
+ * [Subdivision Containment](#Subdivision_Containment)
+ * [Supplemental Territory Information](#Supplemental_Territory_Information)
+ * [Territory-Based Preferences](#Territory_Based_Preferences)
+ * [Preferred Units for Specific Usages](#Preferred_Units_For_Usage)
+ * [`<rgScope>`: Scope of the “rg” Locale Key](#rgScope)
+* [Supplemental Language Data](#Supplemental_Language_Data)
+ * [Supplemental Language Grouping](#Supplemental_Language_Grouping)
+* [Supplemental Code Mapping](#Supplemental_Code_Mapping)
+* ~~[Telephone Code Data](#Telephone_Code_Data)~~ (Deprecated)
+* ~~[Postal Code Validation (Deprecated)](#Postal_Code_Validation)~~
+* [Supplemental Character Fallback Data](#Supplemental_Character_Fallback_Data)
+* [Coverage Levels](#Coverage_Levels)
+ * [Definitions](#Coverage_Level_Definitions)
+ * [Data Requirements](#Coverage_Level_Data_Requirements)
+ * [Default Values](#Coverage_Level_Default_Values)
+* [Supplemental Metadata](#Appendix_Supplemental_Metadata)
+ * [Supplemental Alias Information](#Supplemental_Alias_Information)
* Table: [Alias Attribute Values](#Alias_Attribute_Values)
- * 9.2 ~~[Supplemental Deprecated Information (Deprecated)](#Supplemental_Deprecated_Information)~~
- * 9.3 [Default Content](#Default_Content)
-* 10 [Locale Metadata Elements](#Metadata_Elements)
-* 11 [Version Information](#Version_Information)
-* 12 [Parent Locales](#Parent_Locales)
-* 13 [Unit Conversion](#Unit_Conversion)
+ * ~~[Supplemental Deprecated Information (Deprecated)](#Supplemental_Deprecated_Information)~~
+ * [Default Content](#Default_Content)
+* [Locale Metadata Elements](#Metadata_Elements)
+* [Version Information](#Version_Information)
+* [Parent Locales](#Parent_Locales)
+* [Unit Conversion](#Unit_Conversion)
* [Unit Parsing Data](#unit-parsing-data)
* [Constants](#constants)
* [Conversion Data](#conversion-data)
+ * [Derived Unit System](#derived-unit-system)
+ * [Conversion Mechanisms](#conversion-mechanisms)
* [Exceptional Cases](#exceptional-cases)
* [Identities](#identities)
* [Aliases](#aliases)
@@ -83,13 +90,13 @@
* [Unit Identifier Normalization](#Unit_Identifier_Normalization)
* [Mixed Units](#mixed-units)
* [Testing](#testing)
-* 14 [Unit Preferences](#Unit_Preferences)
- * 14.2 [Unit Preferences Overrides](#Unit_Preferences_Data)
- * 14.2 [Unit Preferences Data](#Unit_Preferences_Data)
+* [Unit Preferences](#Unit_Preferences)
+ * [Unit Preferences Overrides](#Unit_Preferences_Overrides)
+ * [Unit Preferences Data](#Unit_Preferences_Data)
* [Constraints](#constraints)
* [Caveats](#caveats)
-## 1 Introduction <a name="Supplemental_Data" href="#Supplemental_Data">Supplemental Data</a>
+## Introduction <a name="Supplemental_Data" href="#Supplemental_Data">Supplemental Data</a>
The following represents the format for additional supplemental information. This is information that is important for internationalization and proper use of CLDR, but is not contained in the locale hierarchy. It is not localizable, nor is it overridden by locale data. The current CLDR data can be viewed in the [Supplemental Charts](https://unicode-org.github.io/cldr-staging/charts/38/supplemental/index.html).
@@ -97,13 +104,13 @@
<!ELEMENT supplementalData (version, generation?, cldrVersion?, currencyData?, territoryContainment?, subdivisionContainment?, languageData?, territoryInfo?, postalCodeData?, calendarData?, calendarPreferenceData?, weekData?, timeData?, measurementData?, unitPreferenceData?, timezoneData?, characters?, transforms?, metadata?, codeMappings?, parentLocales?, likelySubtags?, metazoneInfo?, plurals?, telephoneCodeData?, numberingSystems?, bcp47KeywordMappings?, gender?, references?, languageMatching?, dayPeriodRuleSet*, metaZones?, primaryZones?, windowsZones?, coverageLevels?, idValidity?, rgScope?) >
```
-The data in CLDR is presently split into multiple files: supplementalData.xml, supplementalMetadata.xml, characters.xml, likelySubtags.xml, ordinals.xml, plurals.xml, telephoneCodeData.xml, genderList.xml, plus transforms (see _Part 2 Section 10 [Transforms](tr35-general.md#Transforms)_ and _Part 2 Section 10.3 [Transform Rule Syntax](tr35-general.md#Transform_Rules_Syntax)_). The split is just for convenience: logically, they are treated as though they were a single file. Future versions of CLDR may split the data in a different fashion. Do not depend on any specific XML filename or path for supplemental data.
+The data in CLDR is presently split into multiple files: supplementalData.xml, supplementalMetadata.xml, characters.xml, likelySubtags.xml, ordinals.xml, plurals.xml, telephoneCodeData.xml, genderList.xml, plus transforms (see _Part 2 [Transforms](tr35-general.md#Transforms)_ and _Part 2 [Transform Rule Syntax](tr35-general.md#Transform_Rules_Syntax)_). The split is just for convenience: logically, they are treated as though they were a single file. Future versions of CLDR may split the data in a different fashion. Do not depend on any specific XML filename or path for supplemental data.
Note that [Locale Metadata Elements](#Metadata_Elements) presents information about metadata that is maintained on a per-locale basis. It is included in this section because it is not intended to be used as part of the locale itself.
-## 2 <a name="Territory_Data" href="#Territory_Data">Territory Data</a>
+## <a name="Territory_Data" href="#Territory_Data">Territory Data</a>
-### 2.1 <a name="Supplemental_Territory_Containment" href="#Supplemental_Territory_Containment">Supplemental Territory Containment</a>
+### <a name="Supplemental_Territory_Containment" href="#Supplemental_Territory_Containment">Supplemental Territory Containment</a>
```xml
<!ELEMENT territoryContainment ( group* ) >
@@ -163,7 +170,7 @@
That is, the type value isn’t a grouping, but if you filter out groupings you can drop this containment. In the example above, EU is a grouping, and contained in 150.
-### 2.2 <a name="Subdivision_Containment" href="#Subdivision_Containment">Subdivision Containment</a>
+### <a name="Subdivision_Containment" href="#Subdivision_Containment">Subdivision Containment</a>
```xml
<!ELEMENT subdivisionContainment ( subgroup* ) >
@@ -190,7 +197,7 @@
\* The type attribute contained only a `unicode_region_subtag` `unicode_subdivision_suffix` values were used in the `contains` attribute; these are not unique across multiple territories, so for lower levels a now-deprecated
-### 2.3 <a name="Supplemental_Territory_Information" href="#Supplemental_Territory_Information">Supplemental Territory Information</a>
+### <a name="Supplemental_Territory_Information" href="#Supplemental_Territory_Information">Supplemental Territory Information</a>
```xml
<!ELEMENT territory ( languagePopulation* ) >
@@ -242,7 +249,7 @@
<a name="official_regional_language" href="#official_regional_language">official regional language</a> — a language that is official (_de jure_ or _de facto_) in a major region within a country, but does not qualify as an official language of the country as a whole. For example, it can be used in an official petition to a provincial government, but not the central government. The term “major” is meant to distinguish from smaller-scale usage, such as for a town or village.
-### 2.4 <a name="Territory_Based_Preferences" href="#Territory_Based_Preferences">Territory-Based Preferences</a>
+### <a name="Territory_Based_Preferences" href="#Territory_Based_Preferences">Territory-Based Preferences</a>
The default preference for several locale items is based solely on a [unicode_region_subtag](tr35.md#unicode_region_subtag), which may either be specified as part of a [unicode_language_id](tr35.md#unicode_language_id), inferred from other locale ID elements using the [Likely Subtags](tr35.md#Likely_Subtags) mechanism, or provided explicitly using an “rg” [Region Override](tr35.md#RegionOverride) locale key. For more information on this process see [Locale Inheritance and Matching](tr35.md#Locale_Inheritance). The specific items that are handled in this way are:
@@ -253,16 +260,16 @@
* Default measurement system and paper size (see [Measurement System Data](tr35-general.md#Measurement_System_Data))
* Default units for specific usage (see [Preferred Units for Specific Usages](#Preferred_Units_For_Usage), below)
-The mu, ms, and rg keys also interact with the base locale and the unit preferences. For more information, see _Section 14 [Unit Preferences](#Unit_Preferences)._
+The mu, ms, and rg keys also interact with the base locale and the unit preferences. For more information, see _[Unit Preferences](#Unit_Preferences)._
-#### 2.4.1 <a name="Preferred_Units_For_Usage" href="#Preferred_Units_For_Usage">Preferred Units for Specific Usages</a>
+#### <a name="Preferred_Units_For_Usage" href="#Preferred_Units_For_Usage">Preferred Units for Specific Usages</a>
The determination of preferred units depends on the locale identifier: the keys mu, ms, rg, the base locale (language, script, region) and the user preferences.
-_For information about preferred units and unit conversion, see Section 13 [Unit Conversion](#Unit_Conversion) and Section 14 [Unit Preferences](#Unit_Preferences)._
+_For information about preferred units and unit conversion, see [Unit Conversion](#Unit_Conversion) and [Unit Preferences](#Unit_Preferences)._
-### 2.5 <a name="rgScope" href="#rgScope">`<rgScope>`: Scope of the “rg” Locale Key</a>
+### <a name="rgScope" href="#rgScope">`<rgScope>`: Scope of the “rg” Locale Key</a>
-The supplemental `<rgScope>` element specifies the data paths for which the region used for data lookup is determined by the value of any “rg” key present in the locale identifier (see [Region Override](tr35.md#RegionOverride)). If no “rg” key is present, the region used for lookup is determined as usual: from the unicode_region_subtag if present, else inferred from the unicode_language_subtag. The DTD structure is as follows:
+The supplemental `<rgScope>` element specifies the data paths for which the region used for data lookup is determined by the value of any “rg” key present in the locale identifier (see [Region Override](tr35.md#RegionOverride) and [Region Priority Inheritance](tr35.md#Region_Priority_Inheritance)). If no “rg” key is present, the region used for lookup is determined as usual: from the unicode_region_subtag if present, else inferred from the unicode_language_subtag. The DTD structure is as follows:
```xml
<!ELEMENT rgScope ( rgPath* ) >
@@ -292,7 +299,7 @@
* An attribute value of `'*'` indicates that the path applies regardless of the value of the attribute.
* Each path must have exactly one attribute whose value is marked here as `'#'`; in actual data items with this path, the corresponding value is a list of region codes. It is the region codes in this list that are compared with the region specified by the “rg” key to determine which data item to use for this path.
-## 3 <a name="Supplemental_Language_Data" href="#Supplemental_Language_Data">Supplemental Language Data</a>
+## <a name="Supplemental_Language_Data" href="#Supplemental_Language_Data">Supplemental Language Data</a>
```xml
<!ELEMENT languageData ( language* ) >
@@ -304,7 +311,7 @@
<!ATTLIST language alt NMTOKENS #IMPLIED >
```
-The language data is used for consistency checking and testing. It provides a list of which languages are used with which scripts and in which countries. To a large extent, however, the territory list has been superseded by the data in _Section 2.2 [Supplemental Territory Information](#Supplemental_Territory_Information)_ .
+The language data is used for consistency checking and testing. It provides a list of which languages are used with which scripts and in which countries. To a large extent, however, the territory list has been superseded by the data in _[Supplemental Territory Information](#Supplemental_Territory_Information)_ .
```xml
<languageData>
@@ -321,7 +328,7 @@
...
```
-## 3.1 <a name="Supplemental_Language_Grouping" href="#Supplemental_Language_Grouping">Supplemental Language Grouping</a>
+## <a name="Supplemental_Language_Grouping" href="#Supplemental_Language_Grouping">Supplemental Language Grouping</a>
```xml
<!ELEMENT languageGroups ( languageGroup* ) >
@@ -346,7 +353,7 @@
| Finno-Permic languages | [Q161240](https://www.wikidata.org/wiki/Q161240) |
| Finno-Ugric languages | [Q79890](https://www.wikidata.org/wiki/Q79890) | fiu |
-## 4 <a name="Supplemental_Code_Mapping" href="#Supplemental_Code_Mapping">Supplemental Code Mapping</a>
+## <a name="Supplemental_Code_Mapping" href="#Supplemental_Code_Mapping">Supplemental Code Mapping</a>
```xml
<!ELEMENT codeMappings (languageCodes*, territoryCodes*, currencyCodes*) >
@@ -398,9 +405,10 @@
<currencyCodes type="ZMW" numeric="967" />
```
-## 5 ~~<a name="Telephone_Code_Data" href="#Telephone_Code_Data">Telephone Code Data</a>~~ (Deprecated)
+## ~~<a name="Telephone_Code_Data" href="#Telephone_Code_Data">Telephone Code Data</a>~~ (Deprecated)
Deprecated in CLDR v34, and data removed.
+The data and structure for phone numbers change quite often, so the recommended alternative is the open-source library [libphonenumber](https://github.com/google/libphonenumber#what-is-it).
```xml
<!ELEMENT telephoneCodeData ( codesByTerritory* ) >
@@ -437,13 +445,9 @@
</codesByTerritory>
```
-## 6 ~~<a name="Postal_Code_Validation" href="#Postal_Code_Validation">Postal Code Validation (Deprecated)</a>~~
+## ~~<a name="Postal_Code_Validation" href="#Postal_Code_Validation">Postal Code Validation (Deprecated)</a>~~
-Deprecated in v27. Please see other services that are kept up to date, such as:
-
-* [https://i18napis.appspot.com/address/data/US](https://i18napis.appspot.com/address/data/US)
-* [https://i18napis.appspot.com/address/data/CH](https://i18napis.appspot.com/address/data/CH)
-* ...
+Deprecated in v27. Please see other services that are kept up to date, such as <https://github.com/google/libaddressinput>
```xml
<!ELEMENT postalCodeData (postCodeRegex*) >
@@ -465,7 +469,7 @@
The most complicated currently is the UK.
-## 7 <a name="Supplemental_Character_Fallback_Data" href="#Supplemental_Character_Fallback_Data">Supplemental Character Fallback Data</a>
+## <a name="Supplemental_Character_Fallback_Data" href="#Supplemental_Character_Fallback_Data">Supplemental Character Fallback Data</a>
```xml
<!ELEMENT characters ( character-fallback*) >
@@ -508,24 +512,23 @@
* the explicit _substitutes_ value (in order)
* `toNFKC`(_value_)
-## 8 <a name="Coverage_Levels" href="#Coverage_Levels">Coverage Levels</a>
+## <a name="Coverage_Levels" href="#Coverage_Levels">Coverage Levels</a>
The following describes the structure used to set coverage levels used for CLDR.
-That structure is primarily intended for internal use in CLDR tooling — it is not anticipated that users of CLDR data would need it.
+That structure is used in CLDR tooling, and can also be used by consumers of CLDR data, such as described in [Data Size Reduction](tr35.md#Data_Size).
-Each level adds to what is in the lower level. This list will change between releases of CLDR, and more detailed information for each level is on [Coverage Levels](https://cldr.unicode.org/index/cldr-spec/coverage-levels).
+The following lists the coverage levels. The qualifications for each level may change between releases of CLDR, and more detailed information for each level is on [Coverage Levels](https://cldr.unicode.org/index/cldr-spec/coverage-levels). Each level adds to what is in the lower level, so Basic includes all of Core, Moderate all of Basic, and so on.
-
-| Level | Description | |
-| ----: | ------------- | --- |
+| Code | Level | Description |
+| ----: | ------------- | -------------- |
| 0 | undetermined | Does not meet any of the following levels. |
| 10 | core | Core Locale — Has minimal data about the language and writing system that is required before other information can be added using the CLDR survey tool. |
| 40 | basic | Selectable Locale — Minimal locale data necessary for a "selectable" locale in a platform UI. Very basic number and datetime formatting, etc. |
| 60 | moderate | Document Content Locale — Minimal locale data for applications such as spreadsheets and word processors to support general document content internationalization: formatting number, datetime, currencies, sorting, plural handling, and so on. |
| 80 | modern | UI Locale — Contains all fields in normal modern use, including all CLDR locale names, country names, timezone names, currencies in use, and so on. |
-| 100 | comprehensive | Above modern level; typically far more data than is needed in practice. |
+| 100 | comprehensive | Above modern level; typically more data than is needed in most implementations. |
-Levels 40 through 80 are based on the definitions and specifications listed below.
+The Basic through Modern levels are based on the definitions and specifications listed below.
```xml
<!ELEMENT coverageLevels ( approvalRequirements, coverageVariable*, coverageLevel* ) >
@@ -598,7 +601,7 @@
For more information on the CLDR Voting process, see [https://cldr.unicode.org/index/process](https://cldr.unicode.org/index/process)
-### 8.1 <a name="Coverage_Level_Definitions" href="#Coverage_Level_Definitions">Definitions</a>
+### <a name="Coverage_Level_Definitions" href="#Coverage_Level_Definitions">Definitions</a>
This is a snapshot of the contents of certain variables. The actual definitions in the coverageLevels.xml file may vary from these descriptions.
* _Target-Language_ is the language under consideration.
@@ -616,7 +619,7 @@
* _Calendar-List_ is the set of calendars in customary use in any of _Target-Territories_, plus Gregorian.
* _Number-System-List_ is the set of number systems in customary use in the language.
-### 8.2 <a name="Coverage_Level_Data_Requirements" href="#Coverage_Level_Data_Requirements">Data Requirements</a>
+### <a name="Coverage_Level_Data_Requirements" href="#Coverage_Level_Data_Requirements">Data Requirements</a>
The required data to qualify for each level based on these definitions is then the following.
@@ -641,7 +644,7 @@
4. currencies: displayNames and symbol for all currencies in _Currency-List_, for all plural forms
5. transforms: (moderate and above) transliteration between Latin and each other script in _Target-Scripts._
-### 8.3 <a name="Coverage_Level_Default_Values" href="#Coverage_Level_Default_Values">Default Values</a>
+### <a name="Coverage_Level_Default_Values" href="#Coverage_Level_Default_Values">Default Values</a>
Items should _only_ be included if they are not the same as the default, which is:
@@ -656,7 +659,7 @@
* scripts: Latn, Thai, ...
* variants: PHONEBOOK, ...
-## 9 <a name="Appendix_Supplemental_Metadata" href="#Appendix_Supplemental_Metadata">Supplemental Metadata</a>
+## <a name="Appendix_Supplemental_Metadata" href="#Appendix_Supplemental_Metadata">Supplemental Metadata</a>
Note that this section discusses the `<metadata>` element within the `<supplementalData>` element. For the per-locale metadata used in tests and the Survey Tool, see [Locale Metadata Elements](#Metadata_Elements).
@@ -667,7 +670,7 @@
* Appendix L: [Canonical Form](tr35.md#Canonical_Form)
* Appendix M: [Coverage Levels](#Coverage_Levels)
-### 9.1 <a name="Supplemental_Alias_Information" href="#Supplemental_Alias_Information">Supplemental Alias Information</a>
+### <a name="Supplemental_Alias_Information" href="#Supplemental_Alias_Information">Supplemental Alias Information</a>
```xml
<!ELEMENT alias (languageAlias*,scriptAlias*,territoryAlias*,subdivisionAlias*,variantAlias*,zoneAlias*) >
@@ -688,7 +691,7 @@
<!ATTLIST languageAlias reason ( deprecated | overlong | macrolanguage | legacy | bibliographic ) #IMPLIED >
```
-This element provides information as to parts of locale IDs that should be substituted when accessing CLDR data. This logical substitution should be done to both the locale id, and to any lookup for display names of languages, territories, and so on. The replacement for the language and territory types is more complicated: see _Part 1: [Core](tr35.md#Contents), Section 3.3.1 [BCP 47 Language Tag Conversion](tr35.md#BCP_47_Language_Tag_Conversion)_ for details.
+This element provides information as to parts of locale IDs that should be substituted when accessing CLDR data. This logical substitution should be done to both the locale id, and to any lookup for display names of languages, territories, and so on. The replacement for the language and territory types is more complicated: see _Part 1: [Core](tr35.md#Contents), [BCP 47 Language Tag Conversion](tr35.md#BCP_47_Language_Tag_Conversion)_ for details.
```xml
<alias>
@@ -715,7 +718,7 @@
| | legacy | The code in type is a legacy code that is replaced by another code for compatibility with established legacy usage, such as 'sh' by 'sr_Latn' |
| | bibliographic | The code in type is a [bibliographic code](https://www.loc.gov/standards/iso639-2/langhome.html), which is replaced by a terminology code, such as 'alb' by 'sq'. |
-### 9.2 ~~<a name="Supplemental_Deprecated_Information" href="#Supplemental_Deprecated_Information">Supplemental Deprecated Information (Deprecated)</a>~~
+### ~~<a name="Supplemental_Deprecated_Information" href="#Supplemental_Deprecated_Information">Supplemental Deprecated Information (Deprecated)</a>~~
```xml
<!ELEMENT deprecated ( deprecatedItems* ) >
@@ -732,7 +735,7 @@
Where particular values are deprecated (such as territory codes like SU for Soviet Union), the names for such codes may be removed from the common/main translated data after some period of time. However, typically supplemental information for deprecated codes is retained, such as containment, likely subtags, older currency codes usage, etc. The English name may also be retained, for debugging purposes.
-### 9.3 <a name="Default_Content" href="#Default_Content">Default Content</a>
+### <a name="Default_Content" href="#Default_Content">Default Content</a>
```xml
<!ELEMENT defaultContent EMPTY >
@@ -745,9 +748,9 @@
If an implementation is to use a different default locale, then the data needs to be _pivoted_; all of the data from the CLDR for the current default locale pushed out to the locales that inherit from it, then the new default content locale's data moved into the base. There are tools in CLDR to perform this operation.
-For the relationship between Inheritance, DefaultContent, LikelySubtags, and LocaleMatching, see **_Section 4.2.6 [Inheritance vs Related Information](tr35.md#Inheritance_vs_Related)_**.
+For the relationship between Inheritance, DefaultContent, LikelySubtags, and LocaleMatching, see **_[Inheritance vs Related Information](tr35.md#Inheritance_vs_Related)_**.
-## 10 <a name="Metadata_Elements" href="#Metadata_Elements">Locale Metadata Elements</a>
+## <a name="Metadata_Elements" href="#Metadata_Elements">Locale Metadata Elements</a>
Note: This section refers to the per-locale `<metadata>` element, containing metadata about a particular locale. This is in contrast to the [_Supplemental_ Metadata](#Appendix_Supplemental_Metadata), which is in the supplemental tree and is not specific to a locale.
@@ -769,7 +772,7 @@
The `<casingItem>` data is generated by a tool based on the data available in CLDR. In cases where the generated casing information is incorrect and needs to be manually edited, the `override` attribute is set to `true` so that the tool will not override the manual edits. When the casing information is known to be both correct and something that should apply to all elements of the specified type in a given locale, the `forceErr` attribute may be set to `true` to force an error instead of a warning for items that do not match the casing information.
-## 11 <a name="Version_Information" href="#Version_Information">Version Information</a>
+## <a name="Version_Information" href="#Version_Information">Version Information</a>
```xml
<!ELEMENT version EMPTY >
@@ -781,11 +784,11 @@
The `unicodeVersion` attribute defines the version of the Unicode standard that is used to interpret data. Specifically, some data elements such as exemplar characters are expressed in terms of UnicodeSets. Since UnicodeSets can be expressed in terms of Unicode properties, their meaning depends on the Unicode version from which property values are derived.
-## 12 <a name="Parent_Locales" href="#Parent_Locales">Parent Locales</a>
+## <a name="Parent_Locales" href="#Parent_Locales">Parent Locales</a>
The parentLocales data is supplemental data, but is described in detail in the [core specification section 4.1.3.](tr35.md#Parent_Locales)
-## 13 <a name="Unit_Conversion" href="#Unit_Conversion">Unit Conversion</a>
+## <a name="Unit_Conversion" href="#Unit_Conversion">Unit Conversion</a>
The unit conversion data ([units.xml](https://github.com/unicode-org/cldr/blob/main/common/supplemental/units.xml)) provides the data for converting all of the cldr unit identifiers to base units, and back. That allows conversion between any two convertible units, such as two units of length. For any two convertible units (such as acre and dunum) the first can be converted to the base unit (square-meter), then that base unit can be converted to the second unit.
@@ -797,8 +800,8 @@
<!ATTLIST unitIdComponent type NMTOKEN #REQUIRED >
<!ATTLIST unitIdComponent values NMTOKENS #REQUIRED >
-These elements provide support for parsing unit identifiers, as described in [Unit Elements](tr35-general.md#Unit_Elements).
-Each of the values has tokens with specific functions, identified by the type.
+These elements provide support for parsing unit identifiers, as described in [Unit Elements](tr35-general.md#Unit_Elements).
+Each of the values has tokens with specific functions, identified by the type.
For example the following values can be suffixes in a simple_unit identifier such as `quart-imperial`.
```
@@ -894,14 +897,53 @@
Where a factor is not present, the value is 1; where an offset is not present, the value is 0.
-The `systems` attribute indicates the measurement system(s). Multiple values may be given; for example, _minute_ is marked as systems="metric ussystem uksystem"
+The `systems` attribute indicates the measurement system(s) or other characteristics of a set of units. Multiple values may be given; for example, a unit could be marked as systems="`si_acceptable` `metric_adjacent` `prefixable`".
-Attribute Value | Description
------------- | -------------
-_si_ | the _International System of Units (SI)_
-_metric_ | a superset of the _si_ units, with some non-SI units accepted for use with the SI or simple multiples of metric units, such as pound-metric (= ½ kilogram)
-_ussystem_ | the inch-pound system as used in the US, also called _US Customary Units_
-_uksystem_ | the inch-pound system as used in the UK, also called _British Imperial Units_, differing mostly in units of volume
+The allowed attribute values are the following:
+
+Attribute Value | Description
+------------ | -------------
+`si` | The _International System of Units (SI)_. See [NIST Guide to the SI, Chapter 4: The Two Classes of SI Units and the SI Prefixes](https://www.nist.gov/pml/special-publication-811/nist-guide-si-chapter-4-two-classes-si-units-and-si-prefixes). Examples: meter, ampere.
+`si_acceptable` | Units acceptable for use with the SI. See [NIST Guide to the SI, Chapter 5: Units Outside the SI](https://www.nist.gov/pml/special-publication-811/nist-guide-si-chapter-5-units-outside-si). Examples: hour, liter, knot, hectare.
+`metric` | A superset of the _si_ units
+`metric_adjacent` | Units commonly accepted in some countries that follow the metric system. Examples: month, arc-second, pound-metric (= ½ kilogram), mile-scandinavian.
+`ussystem` | The inch-pound system as used in the US, also called _US Customary Units_.
+`uksystem` | The inch-pound system as used in the UK, also called _British Imperial Units_, differing mostly in units of volume
+`jpsystem` | Traditional units used in Japan. For examples, see [Japanese units of measurement](https://en.wikipedia.org/wiki/Japanese_units_of_measurement).
+`astronomical` | Additional units used in astronomy. Examples: parsec, light-year, earth-mass
+`person_age` | Special units used for people’s ages in some languages. Except for translation, they have the same system as the associated regular units.
+`currency` | Currency units. These are constructed algorithmically from the Unicode currency identifiers, and do not occur in the child elements of `convertUnits`. Examples: curr-usd (US dollar), curr-eur (Euro).
+`prefixable` | Those units that typically use SI prefixes or the [IEC binary prefixes](https://www.nist.gov/pml/special-publication-811/nist-guide-si-appendix-d-bibliography#05). This can include measures like `parsec` that are not SI units. It allows implementations to group those units together, and to do sanity checks on the prefix+unit combinations, if they choose. However, implementations may choose to allow prefixes on other units, especially since there is a significant variance in usage: even a term like `megafoot` might be acceptable in some contexts.
+
+Over time, additional systems may be added, and the systems for a particular unit may be refined.
+
+#### Derived Unit System
+
+The systems attributes also apply to compound units, and are computed in the following way.
+
+1. The `prefixable` system is only applicable to base_components, and is thus removed
+2. The `number_prefixes`, `dimensionality_prefix`, `si_prefix`, and `binary_prefix` are ignored
+ * Example: systems(square-kilometer) = systems(meter)
+3. Currency units have the `currency` system
+ * Example: systems(curr-usd) = {currency}
+4. Units linked by `-and-`, `-per-`, and *adjacency* are resolved using a modified intersection, where:
+ 1. The intersection of {… si …} and {… si_acceptable … } is {… si_acceptable …}
+ 2. The intersection of {… metric …} and {… metric_adjacent … } is {… metric_adjacent …}
+
+Examples:
+```
+systems(liter-per-hectare)
+ = {si_acceptable metric} ∩ {si_acceptable metric}
+ = {si_acceptable metric}
+systems(meter-per-hectare)
+ = {si metric} ∩ {si_acceptable metric}
+ = {si_acceptable metric}
+systems(mile-scandinavian-per-hour)
+ = {metric_adjacent} ∩ {si_acceptable metric_adjacent}
+ = {metric_adjacent}
+```
+
+#### Conversion Mechanisms
CLDR follows conversion values where possible from:
* [NIST Special Publication 1038](https://www.govinfo.gov/content/pkg/GOVPUB-C13-f10c2ff9e7af2091314396a2d53213e4/pdf/GOVPUB-C13-f10c2ff9e7af2091314396a2d53213e4.pdf)
@@ -1022,7 +1064,7 @@
The order of the elements in the file is significant, since it is used in [Unit_Identifier_Normalization](#Unit_Identifier_Normalization).
-The quantity values themselves are informative. Therer mayreflecting that _force per area_ can be referenced as either _pressure_ or _stress_, for example). The quantity for a complex unit that has a reciprocal is formed by prepending “inverse-” to the quantity, such as _inverse-consumption._
+The quantity values themselves are informative. For example, _force per area_ can be referenced as either _pressure_ or _stress_. The quantity for a complex unit that has a reciprocal is formed by prepending “inverse-” to the quantity, such as _inverse-consumption._
The base units for the quantities and the quantities themselves are based on [NIST Special Publication 811](https://www.nist.gov/pml/special-publication-811) and the earlier [NIST Special Publication 1038](https://www.govinfo.gov/content/pkg/GOVPUB-C13-f10c2ff9e7af2091314396a2d53213e4/pdf/GOVPUB-C13-f10c2ff9e7af2091314396a2d53213e4.pdf). In some cases, a different unit is chosen for the base. For example, a _revolution_ (360°) is chosen for the base unit for angles instead of the SI _radian_, and _item_ instead of the SI _mole_. Additional base units are added where necessary, such as _bit_ and _pixel_.
@@ -1073,11 +1115,11 @@
The [unitsTest.txt](https://github.com/unicode-org/cldr/blob/main/common/testData/units/unitsTest.txt) file supplies a list of all the CLDR units with conversions, for testing implementations. Instructions for use are supplied in the header of the file.
-## 14 <a name="Unit_Preferences" href="#Unit_Preferences">Unit Preferences</a>
+## <a name="Unit_Preferences" href="#Unit_Preferences">Unit Preferences</a>
Different locales have different preferences for which unit or combination of units is used for a particular usage, such as measuring a person’s height. This is more fine-grained than merely a preference for metric versus US or UK measurement systems. For example, one locale may use meters alone, while another may use centimeters alone or a combination of meters and centimeters; a third may use inches alone, or (informally) a combination of feet and inches.
-### 14.2 <a name="Unit_Preferences_Data" href="#Unit_Preferences_Data">Unit Preferences Overrides</a>
+### <a name="Unit_Preferences_Overrides" href="#Unit_Preferences_Overrides">Unit Preferences Overrides</a>
The determination of preferred units depends on the locale identifier: the keys mu, ms, rg, their values, the base locale (language, script, region) and the user preferences data.
@@ -1090,24 +1132,48 @@
| 3 | en-u-rg-dezzzz. | Celsius | despite the likely region of US |
| 4 | en | Fahrenheit | because the likely region for en with no region is US |
-The ms value maps to a region according to the following table. That is then the input for the Unit Preferences Data below.
+The **ms** value is used in the following way.
-| Key-Value | Region for Unit Preferences |
-|-------------|-----------------------------|
-| ms-metric | 001 |
-| ms-ussystem | US |
-| ms-uksystem | UK |
+1. Find the corresponding Key-Value row in the table below.
+2. Get the unit preferences for the **locale**, **category**, and **usage**.
+3. If any of the units in that set have a measurement system that doesn’t match the -u-ms- value, get unit preferences again, but using the fallback region instead of the locale's region.
-Thus _for the purposes of unit preferences_ the following behave identically:
+| Key-Value | Unit Systems Match | Fallback Region for Unit Preferences |
+|-------------|-----------------------------|--------------------------------------|
+| ms-metric | metric OR metric_adjacent | 001 |
+| ms-ussystem | ussystem | US |
+| ms-uksystem | uksystem | UK |
-| Locale | Equivalents |
-|-------------------|------------|
-| en-GB-ms-ussystem | en-US, en |
-| en-US-ms-uksystem | en-GB |
-| en-ms-uksystem | en-GB |
+**Example A: xx-SE-u-ms-metric, length, road**
+1. Fetch the data from `<unitPreferences category="length" usage="road">` for xx-SE
+```
+<unitPreference regions="SE">mile-scandinavian</unitPreference>
+<unitPreference regions="SE">kilometer</unitPreference>
+<unitPreference regions="SE" geq="300.0" skeleton="precision-increment/50">meter</unitPreference>
+<unitPreference regions="SE" geq="10" skeleton="precision-increment/10">meter</unitPreference>
+<unitPreference regions="SE" skeleton="precision-increment/1">meter</unitPreference>
+```
+2. Meter is **metric** and mile-scandinavian is **metric_adjacent**; both match the key-value ms-**metric**, so no change is made.
+
+**Example B: xx-GB-u-ms-ussystem, volume, fluid**
+1. Fetch the data from `<unitPreferences category="volume" usage="fluid">` for xx-GB
+```
+<unitPreference regions="GB">gallon-imperial</unitPreference>
+<unitPreference regions="GB">fluid-ounce-imperial</unitPreference>
+```
+2. At least one of {gallon-imperial, fluid-ounce-imperial} does not match ms-**ussystem**, so the locale is shifted to xx-**US** and the following data is used instead:
+```
+<unitPreference regions="US">gallon</unitPreference>
+<unitPreference regions="US">quart</unitPreference>
+<unitPreference regions="US">pint</unitPreference>
+<unitPreference regions="US">cup</unitPreference>
+<unitPreference regions="US">fluid-ounce</unitPreference>
+<unitPreference regions="US">tablespoon</unitPreference>
+<unitPreference regions="US">teaspoon</unitPreference>
+```
APIs should clearly allow for both the use of unit preferences with the above process, and for the _invariant use_ of a unit measure.
-That is, while an application will usually want to obey the preferences for the locale or in the locale ID, there will definitely be instances where it will want to not use them.
+That is, while an application will usually want to obey the preferences for the locale or in the locale ID, there will definitely be instances where it will want to not use them.
For example, in showing the weather, an application may want to show:
High today: 68°F (20°C)
@@ -1115,7 +1181,7 @@
To do that, the application needs to show the first value with the locale information, and then (a) query what the alternative is, and show the temperature in that.
As an example, ICU only uses the unit preferences (with rg, ms, and/or mu and the likely region) in formatting units when a usage parameter is set.
-### 14.2 <a name="Unit_Preferences_Data" href="#Unit_Preferences_Data">Unit Preferences Data</a>
+### <a name="Unit_Preferences_Data" href="#Unit_Preferences_Data">Unit Preferences Data</a>
The CLDR data is intended to map from a particular usage — e.g. measuring the height of a person or the fuel consumption of an automobile — to the unit or combination of units typically used for that usage in a given region. Considerations for such a mapping include:
@@ -1140,7 +1206,7 @@
```
<table><tbody>
-<tr><td>category</td><td>A unit quantity, such as “area” or “length”. See Section 13 Unit Conversion</td></tr>
+<tr><td>category</td><td>A unit quantity, such as “area” or “length”. See Unit Conversion</td></tr>
<tr><td>usage</td><td>A type of usage, such as person-height.</td></tr>
<tr><td>regions</td><td>One or more region identifiers (macroregions or regions), subdivision identifiers, or language identifiers, such as 001, US, usca, and de-CH.</td></tr>
<tr><td>geq</td><td>A threshold value, in a unit determined by the unitPreference element value. The unitPreference element is only used for values higher than this value (and lower than any higher value).<br/>The value must be non-negative. For picking negative units (-3 meters), use the absolute value to pick the unit.</td></tr>
@@ -1252,6 +1318,6 @@
* * *
-Copyright © 2001–2022 Unicode, Inc. All Rights Reserved. The Unicode Consortium makes no expressed or implied warranty of any kind, and assumes no liability for errors or omissions. No liability is assumed for incidental and consequential damages in connection with or arising out of the use of the information or programs contained or accompanying this technical report. The Unicode [Terms of Use](https://www.unicode.org/copyright.html) apply.
+Copyright © 2001–2023 Unicode, Inc. All Rights Reserved. The Unicode Consortium makes no expressed or implied warranty of any kind, and assumes no liability for errors or omissions. No liability is assumed for incidental and consequential damages in connection with or arising out of the use of the information or programs contained or accompanying this technical report. The Unicode [Terms of Use](https://www.unicode.org/copyright.html) apply.
Unicode and the Unicode logo are trademarks of Unicode, Inc., and are registered in some jurisdictions.
diff --git a/docs/ldml/tr35-keyboards.anchors.json b/docs/ldml/tr35-keyboards.anchors.json
new file mode 100644
index 0000000..1ecfd11
--- /dev/null
+++ b/docs/ldml/tr35-keyboards.anchors.json
@@ -0,0 +1,83 @@
+[
+ "accessibility",
+ "additional-features",
+ "backspace-transforms",
+ "compatibility-notice",
+ "Contents",
+ "contents-of-part-7-keyboards",
+ "definitions",
+ "disallowed-regex-features",
+ "element-display",
+ "element-displayoptions",
+ "element-displays",
+ "element-flick",
+ "element-flicks",
+ "element-flicksegment",
+ "element-form",
+ "element-forms",
+ "element-hierarchy",
+ "element-import",
+ "element-info",
+ "element-key",
+ "element-keyboard3",
+ "element-keys",
+ "element-layer",
+ "element-layers",
+ "element-locale",
+ "element-locales",
+ "element-reorder",
+ "element-row",
+ "element-scancodes",
+ "element-set",
+ "element-settings",
+ "element-string",
+ "element-transform",
+ "element-transformgroup",
+ "element-transforms",
+ "element-unicodeset",
+ "element-variables",
+ "element-version",
+ "escaping",
+ "example-post-reorder-transforms",
+ "example-transformgroup-with-reorder-elements",
+ "example-transformgroup-with-transform-elements",
+ "extensibility",
+ "file-and-directory-structure",
+ "goals-and-non-goals",
+ "implied-form-values",
+ "implied-keys",
+ "important-note",
+ "invariants",
+ "keyboard-ids",
+ "keyboard-test-data",
+ "keyboards",
+ "layer-modifier-components",
+ "layer-modifier-matching",
+ "markers",
+ "modifier-left--and-right--keys",
+ "parts",
+ "Parts",
+ "platform-behaviors-in-edge-cases",
+ "principles-for-keyboard-ids",
+ "regex-like-syntax",
+ "replacement-syntax",
+ "status",
+ "summary",
+ "test-doctype",
+ "test-element-backspace",
+ "test-element-check",
+ "test-element-emit",
+ "test-element-info",
+ "test-element-keyboardtest",
+ "test-element-keystroke",
+ "test-element-repertoire",
+ "test-element-startcontext",
+ "test-element-test",
+ "test-element-tests",
+ "test-examples",
+ "unicode-locale-data-markup-language-ldmlpart-7-keyboards",
+ "unicode-technical-standard-35-tech-preview",
+ "unicodeset-escaping",
+ "using-import-with-reorder-elements",
+ "uts18-escaping"
+]
\ No newline at end of file
diff --git a/docs/ldml/tr35-keyboards.md b/docs/ldml/tr35-keyboards.md
index c7ac7d0..e2de6f0 100644
--- a/docs/ldml/tr35-keyboards.md
+++ b/docs/ldml/tr35-keyboards.md
@@ -1,8 +1,8 @@
-## Unicode Technical Standard #35
+## Unicode Technical Standard #35 Tech Preview
# Unicode Locale Data Markup Language (LDML)<br/>Part 7: Keyboards
-|Version|42 |
+|Version|44.1 |
|-------|-------------|
|Editors|Steven Loomis (<a href="mailto:[email protected]">[email protected]</a>) and <a href="tr35.md#Acknowledgments">other CLDR committee members</a>|
@@ -10,17 +10,15 @@
#### _Important Note_
-> The CLDR [Keyboard Workgroup](https://cldr.unicode.org/index/keyboard-workgroup) is currently
-> developing major changes to the CLDR keyboard specification. These changes are targeted for
-> CLDR version 43. Please see [CLDR-15034](https://unicode-org.atlassian.net/browse/CLDR-15034) for
-> the latest information.
-
+> This is a technical preview of a future version of the LDML Part 7. See [_Status_](#status), below.
+>
+> There are breaking changes; see [Compatibility Notice](#compatibility-notice).
### _Summary_
This document describes parts of an XML format (_vocabulary_) for the exchange of structured locale data. This format is used in the [Unicode Common Locale Data Repository](https://www.unicode.org/cldr/).
-This is a partial document, describing keyboard mappings. For the other parts of the LDML see the [main LDML document](tr35.md) and the links above.
+This is a partial document, describing keyboards. For the other parts of the LDML see the [main LDML document](tr35.md) and the links above.
_Note:_
Some links may lead to in-development or older
@@ -29,11 +27,12 @@
### _Status_
-_This document has been reviewed by Unicode members and other interested parties, and has been approved for publication by the Unicode Consortium. This is a stable document and may be used as reference material or cited as a normative reference by other specifications._
+This document is a _technical preview_ of the Keyboard standard.
-> _**A Unicode Technical Standard (UTS)** is an independent specification. Conformance to the Unicode Standard does not imply conformance to any UTS._
+To process earlier XML files, use the data and specification from v43.1, found at <https://www.unicode.org/reports/tr35/tr35-69/tr35.html>
-_Please submit corrigenda and other comments with the CLDR bug reporting form [[Bugs](tr35.md#Bugs)]. Related information that is useful in understanding this document is found in the [References](tr35.md#References). For the latest version of the Unicode Standard see [[Unicode](tr35.md#Unicode)]. For a list of current Unicode Technical Reports see [[Reports](tr35.md#Reports)]. For more information about versions of the Unicode Standard, see [[Versions](tr35.md#Versions)]._
+The CLDR [Keyboard Workgroup](https://cldr.unicode.org/index/keyboard-workgroup) is currently
+developing this technical preview of the CLDR keyboard specification.
## <a name="Parts" href="#Parts">Parts</a>
@@ -55,155 +54,169 @@
* [_Status_](#status)
* [Parts](#Parts)
* [Contents of Part 7, Keyboards](#Contents)
-* 1 [Keyboards](#Introduction)
-* 2 [Goals and Non-goals](#Goals_and_Nongoals)
-* 3 [Definitions](#Definitions)
- * 3.1 [Escaping](#Escaping)
-* 4 [File and Directory Structure](#File_and_Dir_Structure)
-* 5 [Element Hierarchy - Layout File](#Element_Heirarchy_Layout_File)
- * 5.1 [Element: keyboard](#Element_Keyboard)
- * 5.2 [Element: version](#Element_version)
- * 5.3 ~~[Element: generation](#Element_generation)~~
- * 5.4 [Element: info](#Element_info)
- * 5.5 [Element: names](#Element_names)
- * 5.6 [Element: name](#Element_name)
- * 5.7 [Element: settings](#Element_settings)
- * 5.8 [Element: keyMap](#Element_keyMap)
- * Table: [Possible Modifier Keys](#Possible_Modifier_Keys)
- * 5.9 [Element: map](#Element_map)
- * 5.9.1 [Elements: flicks, flick](#Element_flicks)
- * 5.10 [Element: import](#Element_import)
- * 5.11 [Element: displayMap](#Element_displayMap)
- * 5.12 [Element: display](#Element_display)
- * 5.13 [Element: layer](#Element_layer)
- * 5.14 [Element: row](#Element_row)
- * 5.15 [Element: switch](#Element_switch)
- * 5.16 [Element: vkeys](#Element_vkeys)
- * 5.17 [Element: vkey](#Element_vkey)
- * 5.18 [Element: transforms](#Element_transforms)
- * 5.19 [Element: transform](#Element_transform)
- * 5.20 [Element: reorders, reorder](#Element_reorder)
- * 5.21 [Element: transform final](#Element_final)
- * 5.22 [Element: backspaces](#Element_backspaces)
- * 5.23 [Element: backspace](#Element_backspace)
-* 6 [Element Hierarchy - Platform File](#Element_Heirarchy_Platform_File)
- * 6.1 [Element: platform](#Element_platform)
- * 6.2 [Element: hardwareMap](#Element_hardwareMap)
- * 6.3 [Element: map](#Element_hardwareMap_map)
-* 7 [Invariants](#Invariants)
-* 8 [Data Sources](#Data_Sources)
- * Table: [Key Map Data Sources](#Key_Map_Data_Sources)
-* 9 [Keyboard IDs](#Keyboard_IDs)
- * 9.1 [Principles for Keyboard Ids](#Principles_for_Keyboard_Ids)
-* 10 [Platform Behaviors in Edge Cases](#Platform_Behaviors_in_Edge_Cases)
+* [Keyboards](#keyboards)
+* [Goals and Non-goals](#goals-and-non-goals)
+ * [Compatibility Notice](#compatibility-notice)
+ * [Accessibility](#accessibility)
+* [Definitions](#definitions)
+ * [Escaping](#escaping)
+ * [UnicodeSet Escaping](#unicodeset-escaping)
+ * [UTS18 Escaping](#uts18-escaping)
+* [File and Directory Structure](#file-and-directory-structure)
+ * [Extensibility](#extensibility)
+* [Element Hierarchy](#element-hierarchy)
+ * [Element: keyboard3](#element-keyboard3)
+ * [Element: locales](#element-locales)
+ * [Element: locale](#element-locale)
+ * [Element: version](#element-version)
+ * [Element: info](#element-info)
+ * [Element: settings](#element-settings)
+ * [Element: keys](#element-keys)
+ * [Element: key](#element-key)
+ * [Implied Keys](#implied-keys)
+ * [Element: flicks](#element-flicks)
+ * [Element: flick](#element-flick)
+ * [Element: flickSegment](#element-flicksegment)
+ * [Element: import](#element-import)
+ * [Element: displays](#element-displays)
+ * [Element: display](#element-display)
+ * [Element: displayOptions](#element-displayoptions)
+ * [Element: forms](#element-forms)
+ * [Element: form](#element-form)
+ * [Implied Form Values](#implied-form-values)
+ * [Element: scanCodes](#element-scancodes)
+ * [Element: layers](#element-layers)
+ * [Element: layer](#element-layer)
+ * [Layer Modifier Components](#layer-modifier-components)
+ * [Modifier Left- and Right- keys](#modifier-left--and-right--keys)
+ * [Layer Modifier Matching](#layer-modifier-matching)
+ * [Element: row](#element-row)
+ * [Element: variables](#element-variables)
+ * [Element: string](#element-string)
+ * [Element: set](#element-set)
+ * [Element: unicodeSet](#element-unicodeset)
+ * [Element: transforms](#element-transforms)
+ * [Markers](#markers)
+ * [Element: transformGroup](#element-transformgroup)
+ * [Example: `transformGroup` with `transform` elements](#example-transformgroup-with-transform-elements)
+ * [Example: `transformGroup` with `reorder` elements](#example-transformgroup-with-reorder-elements)
+ * [Element: transform](#element-transform)
+ * [Regex-like Syntax](#regex-like-syntax)
+ * [Additional Features](#additional-features)
+ * [Disallowed Regex Features](#disallowed-regex-features)
+ * [Replacement syntax](#replacement-syntax)
+ * [Element: reorder](#element-reorder)
+ * [Using `<import>` with `<reorder>` elements](#using-import-with-reorder-elements)
+ * [Example Post-reorder transforms](#example-post-reorder-transforms)
+ * [Backspace Transforms](#backspace-transforms)
+* [Invariants](#invariants)
+* [Keyboard IDs](#keyboard-ids)
+ * [Principles for Keyboard IDs](#principles-for-keyboard-ids)
+* [Platform Behaviors in Edge Cases](#platform-behaviors-in-edge-cases)
+* [Keyboard Test Data](#keyboard-test-data)
+ * [Test Doctype](#test-doctype)
+ * [Test Element: keyboardTest](#test-element-keyboardtest)
+ * [Test Element: info](#test-element-info)
+ * [Test Element: repertoire](#test-element-repertoire)
+ * [Test Element: tests](#test-element-tests)
+ * [Test Element: test](#test-element-test)
+ * [Test Element: startContext](#test-element-startcontext)
+ * [Test Element: keystroke](#test-element-keystroke)
+ * [Test Element: emit](#test-element-emit)
+ * [Test Element: backspace](#test-element-backspace)
+ * [Test Element: check](#test-element-check)
+ * [Test Examples](#test-examples)
-## 1 <a name="Introduction" href="#Introduction">Keyboards</a>
+## Keyboards
-The CLDR keyboard format provides for the communication of keyboard mapping data between different modules, and the comparison of data across different vendors and platforms. The standardized identifier for keyboards can be used to communicate, internally or externally, a request for a particular keyboard mapping that is to be used to transform either text or keystrokes. The corresponding data can then be used to perform the requested actions.
+The Unicode Standard and related technologies such as CLDR have dramatically improved the path to language support. However, keyboard support remains platform and vendor specific, causing inconsistencies in implementation as well as timeline.
-For example, a web-based virtual keyboard may transform text in the following way. Suppose the user types a key that produces a "W" on a qwerty keyboard. A web-based tool using an azerty virtual keyboard can map that text ("W") to the text that would have resulted from typing a key on an azerty keyboard, by transforming "W" to "Z". Such transforms are in fact performed in existing web applications.
+> “More and more language communities are determining that digitization is vital to their approach to language preservation and that engagement with Unicode is essential to becoming fully digitized. For many of these communities, however, getting new characters or a new script added to The Unicode Standard is not the end of their journey. The next, often more challenging stage is to get device makers, operating systems, apps and services to implement the script requirements that Unicode has just added to support their language. …
+>
+> “However, commensurate improvements to streamline new language support on the input side have been lacking. CLDR’s new Keyboard Subcommittee has been established to address this very gap.”
+> _(Cornelius et al., “Standardizing Keyboards with CLDR,” presented at the 45th Internationalization and Unicode Conference, Santa Clara, California, USA, October 2021)_
+
+The CLDR keyboard format seeks to address these challenges, by providing an interchange format for the communication of keyboard mapping data independent of vendors and platforms. Keyboard authors can then create a single mapping file for their language, which implementations can use to provide that language’s keyboard mapping on their own platform.
+
+Additionally, the standardized identifier for keyboards can be used to communicate, internally or externally, a request for a particular keyboard mapping that is to be used to transform either text or keystrokes. The corresponding data can then be used to perform the requested actions. For example, a remote screen-access application (such as used for customer service or server management) would be able to communicate and choose the same keyboard layout on the remote device as is used in front of the user, even if the two systems used different platforms.
The data can also be used in analysis of the capabilities of different keyboards. It also allows better interoperability by making it easier for keyboard designers to see which characters are generally supported on keyboards for given languages.
-To illustrate this specification, here is an abridged layout representing the English US 101 keyboard on the Mac OSX operating system (with an inserted long-press example). For more complete examples, and information collected about keyboards, see keyboard data in XML.
+<!-- To illustrate this specification, here is an abridged layout representing the English US 101 keyboard on the macOS operating system (with an inserted long-press example). -->
-```xml
-<keyboard locale="en-t-k0-osx">
- <version platform="10.4" number="$Revision: 8294 $" />
- <names>
- <name value="U.S." />
- </names>
- <keyMap>
- <map iso="E00" to="`" />
- <map iso="E01" to="1" />
- <map iso="D01" to="q" />
- <map iso="D02" to="w" />
- <map iso="D03" to="e" longPress="é è ê ë" />
- …
- </keyMap>
- <keyMap modifiers="caps">
- <map iso="E00" to="`" />
- <map iso="E01" to="1" />
- <map iso="D01" to="Q" />
- <map iso="D02" to="W" />
- …
- </keyMap>
- <keyMap modifiers="opt">
- <map iso="E00" to="`" />
- <map iso="E01" to="¡" /> <!-- key=1 -->
- <map iso="D01" to="œ" /> <!-- key=Q -->
- <map iso="D02" to="∑" /> <!-- key=W -->
- …
- </keyMap>
- <transforms type="simple">
- <transform from="` " to="`" />
- <transform from="`a" to="à" />
- <transform from="`A" to="À" />
- <transform from="´ " to="´" />
- <transform from="´a" to="á" />
- <transform from="´A" to="Á" />
- <transform from="˜ " to="˜" />
- <transform from="˜a" to="ã" />
- <transform from="˜A" to="Ã" />
- …
- </transforms>
-</keyboard>
-```
+For complete examples, see the XML files in the CLDR source repository.
-And its associated platform file (which includes the hardware mapping):
-
-```xml
-<platform id="osx">
- <hardwareMap>
- <map keycode="0" iso="C01" />
- <map keycode="1" iso="C02" />
- <map keycode="6" iso="B01" />
- <map keycode="7" iso="B02" />
- <map keycode="12" iso="D01" />
- <map keycode="13" iso="D02" />
- <map keycode="18" iso="E01" />
- <map keycode="50" iso="E00" />
- </hardwareMap>
-</platform>
-```
+Attribute values should be evaluated considering the DTD and [DTD Annotations](tr35.md#dtd-annotations).
* * *
-## 2 <a name="Goals_and_Nongoals" href="#Goals_and_Nongoals">Goals and Non-goals</a>
+## Goals and Non-goals
Some goals of this format are:
+1. Physical and virtual keyboard layouts defined in a single file.
+2. Provide definitive platform-independent definitions for new keyboard layouts.
+ * For example, a new French standard keyboard layout would have a single definition which would be usable across all implementations.
+3. Allow platforms to be able to use CLDR keyboard data for the character-emitting keys (non-frame) aspects of keyboard layouts.
+ * For example, platform-specific keys such as Fn, Numpad, IME swap keys, and cursor keys are out of scope.
+ * This also means that modifier (frame) keys cannot generate output, such as capslock -> backslash.
+4. Deprecate & archive existing LDML platform-specific layouts so they are not part of future releases.
+
+<!--
1. Make the XML as readable as possible.
2. Represent faithfully keyboard data from major platforms: it should be possible to create a functionally-equivalent data file (such that given any input, it can produce the same output).
-3. Make as much commonality in the data across platforms as possible to make comparison easy.
+3. Make as much commonality in the data across platforms as possible to make comparison easy. -->
Some non-goals (outside the scope of the format) currently are:
-1. Display names or symbols for keycaps (eg, the German name for "Return"). If that were added to LDML, it would be in a different structure, outside the scope of this section.
-2. Advanced IME features, handwriting recognition, etc.
-3. Roundtrip mappings—the ability to recover precisely the same format as an original platform's representation. In particular, the internal structure may have no relation to the internal structure of external keyboard source data, the only goal is functional equivalence.
+1. Adaptation for screen scaling resolution. Instead, keyboards should define layouts based on physical size. Platforms may interpret physical size definitions and adapt for different physical screen sizes with different resolutions.
+2. Unification of platform-specific virtual key and scan code mapping tables.
+3. Unification of pre-existing platform layouts themselves (e.g. existing fr-azerty on platform a, b, c).
+4. Support for prior (pre 3.0) CLDR keyboard files. See [Compatibility Notice](#compatibility-notice).
+5. Run-time efficiency. [LDML is explicitly an interchange format](tr35.md#Introduction), and so it is expected that data will be transformed to a more compact format for use by a keystroke processing engine.
-Note: During development of this section, it was considered whether the modifier RAlt (=AltGr) should be merged with Option. In the end, they were kept separate, but for comparison across platforms implementers may choose to unify them.
+<!-- 1. Display names or symbols for keycaps (eg, the German name for "Return"). If that were added to LDML, it would be in a different structure, outside the scope of this section.
+2. Advanced IME features, handwriting recognition, etc.
+3. Roundtrip mappings—the ability to recover precisely the same format as an original platform's representation. In particular, the internal structure may have no relation to the internal structure of external keyboard source data, the only goal is functional equivalence. -->
+
+<!-- Note: During development of this section, it was considered whether the modifier RAlt (= AltGr) should be merged with Option. In the end, they were kept separate, but for comparison across platforms implementers may choose to unify them. -->
Note that in parts of this document, the format `@x` is used to indicate the _attribute_ **x**.
-* * *
+### Compatibility Notice
-## 3 <a name="Definitions" href="#Definitions">Definitions</a>
+> 👉 Note: CLDR-TC has agreed that the changes required were too extensive to maintain compatibility. For this reason, the `ldmlKeyboard3.dtd` DTD used here is _not_ compatible with DTDs from prior versions of CLDR (v43 and earlier).
+>
+> To process earlier XML files, use the data and specification from v43.1, found at <https://www.unicode.org/reports/tr35/tr35-69/tr35.html>
-**Arrangement** is the term used to describe the relative position of the rectangles that represent keys, either physically or virtually. A physical keyboard has a static arrangement while a virtual keyboard may have a dynamic arrangement that changes per language and/or layer. While the arrangement of keys on a keyboard may be fixed, the mapping of those keys may vary.
+### Accessibility
-**Base character:** The character emitted by a particular key when no modifiers are active. In ISO terms, this is group 1, level 1.
+Keyboard use can be challenging for individuals with various types of disabilities. For this revision, the committee is not evaluating features or architectural designs for the purpose of improving accessibility. Such consideration could be fruitful for future revisions. However, some points on this topic should be made:
-**Base map:** A mapping from the ISO positions to the base characters. There is only one base map per layout. The characters on this map can be output by not using any modifier keys.
+1. Having an industry-wide standard format for keyboards will enable accessibility software to make use of keyboard data with a reduced dependence on platform-specific knowledge.
+2. Features which require certain levels of mobility or speed of entry should be considered for their impact on accessibility. This impact could be mitigated by means of additional, accessible methods of generating the same output.
+3. Public feedback is welcome on any aspects of this document which might hinder accessibility.
-**Core keyboard layout:** also known as “alpha” block. The primary set of key values on a keyboard that are used for typing the target language of the keyboard. For example, the three rows of letters on a standard US QWERTY keyboard (QWERTYUIOP, ASDFGHJKL, ZXCVBNM) together with the most significant punctuation keys. Usually this equates to the minimal keyset for a language as seen on mobile phone keyboards.
+## Definitions
-**Hardware map:** A mapping between key codes and ISO layout positions.
+**Arrangement:** The relative position of the rectangles that represent keys, either physically or virtually. A hardware keyboard has a static arrangement while a touch keyboard may have a dynamic arrangement that changes per language and/or layer. While the arrangement of keys on a keyboard may be fixed, the mapping of those keys may vary.
+
+**Base character:** The character emitted by a particular key when no modifiers are active. In ISO 9995-1:2009 terms, this is Group 1, Level 1.
+
+**Core keys:** Also known as the “alphanumeric” section. The primary set of key values on a keyboard that are used for typing the target language of the keyboard. For example, the three rows of letters on a standard US QWERTY keyboard (QWERTYUIOP, ASDFGHJKL, ZXCVBNM) together with the most significant punctuation keys. Usually this equates to the minimal set of keys for a language as seen on mobile phone keyboards.
+Distinguished from the **frame keys**.
+
+**Dead keys:** These are keys which do not emit normal characters by themselves. They are so named because to the user, they may appear to be “dead,” i.e., non-functional. However, they do produce a change to the input context. For example, in many Latin keyboards hitting the `^` dead-key followed by the `e` key produces `ê`. The `^` by itself may be invisible or presented in a special way by the platform.
+
+**Frame keys:** These are keys which are outside of the area of the **core keys** and typically do not emit characters. These keys include **modifier** keys, such as Shift or Ctrl, but also include platform-specific keys: Fn, IME and layout-switching keys, cursor keys, insert-emoji keys, etc.
+
+**Hardware keyboard:** an input device which has individual keys that are pressed. Each key has a unique identifier and the arrangement doesn't change, even if the mapping of those keys does. Also known as a physical keyboard.
+
+<!-- **Hardware map:** A mapping between key codes and layout positions. -->
**Input Method Editor (IME):** a component or program that supports input of large character sets. Typically, IMEs employ contextual logic and candidate UI to identify the Unicode characters intended by the user.
-**ISO position:** The corresponding position of a key using the ISO layout convention where rows are identified by letters and columns are identified by numbers. For example, "D01" corresponds to the "Q" key on a US keyboard. For the purposes of this document, an ISO layout position is depicted by a one-letter row identifier followed by a two digit column number (like "B03", "E12" or "C00"). The following diagram depicts a typical US keyboard layout superimposed with the ISO layout indicators (it is important to note that the number of keys and their physical placement relative to each-other in this diagram is irrelevant, rather what is important is their logical placement using the ISO convention):
+<!-- **ISO position:** The corresponding position of a key using the ISO layout convention where rows are identified by letters and columns are identified by numbers. For example, "D01" corresponds to the "Q" key on a US keyboard. For the purposes of this document, an ISO layout position is depicted by a one-letter row identifier followed by a two digit column number (like "B03", "E12" or "C00"). The following diagram depicts a typical US keyboard layout superimposed with the ISO layout indicators (it is important to note that the number of keys and their physical placement relative to each-other in this diagram is irrelevant, rather what is important is their logical placement using the ISO convention):

@@ -211,127 +224,239 @@

-If it becomes necessary in the future, the format could extend the ISO layout to support keys that are located to the left of the "00" column by using negative column numbers "-01", "-02" and so on, or 100's complement "99", "98",...
+If it becomes necessary in the future, the format could extend the ISO layout to support keys that are located to the left of the "00" column by using negative column numbers "-01", "-02" and so on, or 100's complement "99", "98",... -->
-**Key:** A key on a physical keyboard.
+**Key:** A physical key on a hardware keyboard, or a virtual key on a touch keyboard.
**Key code:** The integer code sent to the application on pressing a key.
-**Key map:** The basic mapping between ISO positions and the output characters for each set of modifier combinations associated with a particular layout. There may be multiple key maps for each layout.
+**Key map:** The basic mapping between hardware or on-screen positions and the output characters for each set of modifier combinations associated with a particular layout. There may be multiple key maps for each layout.
-**Keyboard:** The physical keyboard.
+**Keyboard:** A particular arrangement of keys for the inputting of text, such as a hardware keyboard or a touch keyboard.
+
+**Keyboard author:** The person or group of people designing and producing a particular keyboard layout designed to support one or more languages. In the context of this specification, that author may be editing the LDML XML file directly or by means of software tools.
**Keyboard layout:** A layout is the overall keyboard configuration for a particular locale. Within a keyboard layout, there is a single base map, one or more key maps and zero or more transforms.
-**Layer** is an arrangement of keys on a virtual keyboard. Since it is often not intended to use two hands on a visual keyboard to allow the pressing of modifier keys. Modifier keys are made sticky in that one presses one, the visual representation, and even arrangement, of the keys change, and you press the key. This visual representation is a layer. Thus a virtual keyboard is made up of a set of layers.
+**Layer** is an arrangement of keys on a touch keyboard. A touch keyboard is made up of a set of layers. Each layer may have a different key layout, unlike with a hardware keyboard, and may not correspond directly to a hardware keyboard's modifier keys. A layer is accessed via a switch key. See also touch keyboard, modifier, switch.
-**Long-press key:** also known as a “child key”. A secondary key that is invoked from a top level key on a software keyboard. Secondary keys typically provide access to variants of the top level key, such as accented variants (a => á, à, ä, ã)
+**Long-press key:** also known as a “child key”. A secondary key that is invoked from a top level key on a touch keyboard. Secondary keys typically provide access to variants of the top level key, such as accented variants (a => á, à, ä, ã)
-**Modifier:** A key that is held to change the behavior of a keyboard. For example, the "Shift" key allows access to upper-case characters on a US keyboard. Other modifier keys include but is not limited to: Ctrl, Alt, Option, Command and Caps Lock.
+**Modifier:** A key that is held to change the behavior of a hardware keyboard. For example, the "Shift" key allows access to upper-case characters on a US keyboard. Other modifier keys include but are not limited to: Ctrl, Alt, Option, Command and Caps Lock. On a touch keyboard, keys that appear to be modifier keys should be considered to be layer-switching keys.
-**Physical keyboard** is a keyboard that has individual keys that are pressed. Each key has a unique identifier and the arrangement doesn't change, even if the mapping of those keys does.
+**Physical keyboard:** see **Hardware keyboard**
-**Transform:** A transform is an element that specifies a set of conversions from sequences of code points into one (or more) other code points. For example, in most latin keyboards hitting the "^" dead-key followed by the "e" key produces "ê".
+**Touch keyboard:** A keyboard that is rendered on a, typically, touch surface. It has a dynamic arrangement and contrasts with a hardware keyboard. This term has many synonyms: software keyboard, SIP (Software Input Panel), virtual keyboard. This contrasts with other uses of the term virtual keyboard as an on-screen keyboard for reference or accessibility data entry.
-**Virtual keyboard** is a keyboard that is rendered on a, typically, touch surface. It has a dynamic arrangement and contrasts with a physical keyboard. This term has many synonyms: touch keyboard, software keyboard, SIP (Software Input Panel). This contrasts with other uses of the term virtual keyboard as an on-screen keyboard for reference or accessibility data entry.
+**Transform:** A transform is an element that specifies a set of conversions from sequences of code points into one (or more) other code points. Transforms may reorder or replace text. They may be used to implement “dead key” behaviors, simple orthographic corrections, visual (typewriter) type input etc.
-### 3.1 <a name="Escaping" href="#Escaping">Escaping</a>
+**Virtual keyboard:** see **Touch keyboard**
-When explicitly specified, attributes can contain escaped characters. This specification uses two methods of escaping, the _UnicodeSet_ notation and the `\u{...}` notation.
+### Escaping
-The _UnicodeSet_ notation is described in [UTS#35 section 5.3.3](tr35.md#Unicode_Sets) and allows for comprehensive character matching, including by character range, properties, names, or codepoints. Currently, the following attributes allow _UnicodeSet_ notation:
+When explicitly specified, attribute values can contain escaped characters. This specification uses two methods of escaping, the _UnicodeSet_ notation and the `\u{...}` notation.
-* `from`, `before`, `after` on the `<transform>` element
-* `from`, `before`, `after` on the `<reorder>` element
-* `from`, `before`, `after` on the `<backspace>` element
+### UnicodeSet Escaping
-The `\u{...}` notation, a subset of hex notation, is described in [UTS#18 section 1.1](https://www.unicode.org/reports/tr18/#Hex_notation). It can refer to one or multiple individual codepoints. Currently, the following attributes allow the `\u{...}` notation:
+The _UnicodeSet_ notation is described in [UTS #35 section 5.3.3](tr35.md#Unicode_Sets) and allows for comprehensive character matching, including by character range, properties, names, or codepoints.
-* `to`, `longPress`, `multitap`, `hint` on the `<map>` element
-* `to` on the `<transform>` element
-* `to` on the `<backspace>` element
+Note that the `\u1234` and `\x{C1}` format escaping is not supported, only the `\u{…}` format (using `bracketedHex`).
-Characters of general category of Combining Mark (M), Control characters (Cc), Format characters (Cf), and whitespace other than space should be encoded using one of the notation above as appropriate.
+Currently, the following attribute values allow _UnicodeSet_ notation:
+
+* `from` or `before` on the `<transform>` element
+* `from` or `before` on the `<reorder>` element
+* `chars` on the [`<repertoire>`](#test-element-repertoire) test element.
+
+### UTS18 Escaping
+
+The `\u{...}` notation, a subset of hex notation, is described in [UTS #18 section 1.1](https://www.unicode.org/reports/tr18/#Hex_notation). It can refer to one or multiple individual codepoints. Currently, the following attribute values allow the `\u{...}` notation:
+
+* `output` on the `<key>` element
+* `from` or `to` on the `<transform>` element
+* `value` on the `<variable>` element
+* `output` and `display` on the `<display>` element
+* `baseCharacter` on the `<displayOptions>` element
+* Some attributes on [Keyboard Test Data](#keyboard-test-data) subelements
+
+Characters of general category of Mark (M), Control characters (Cc), Format characters (Cf), and whitespace other than space should be encoded using one of the notations above as appropriate.
+
+Attribute values escaped in this manner are annotated with the `<!--@ALLOWS_UESC-->` DTD annotation, see [DTD Annotations](tr35.md#dtd-annotations)
* * *
-## 4 <a name="File_and_Dir_Structure" href="#File_and_Dir_Structure">File and Directory Structure</a>
+## File and Directory Structure
-Each platform has its own directory, where a "platform" is a designation for a set of keyboards available from a particular source, such as Windows or ChromeOS. This directory name is the platform name (see Table 2 located further in the document). Within this directory there are two types of files:
+* New collection of layouts that are prescriptive, and define the common core for a keyboard that can be consumed as data for implementation on different platforms will be included in the CLDR repository. This collection will be in a different location than the existing CLDR keyboard files under main/keyboards. We should remove the existing data files, but keep the old DTD in the same place for compatibility, and also so that conversion tools can use it to read older files.
+* New layouts will have version metadata to indicate their specification compliance version number. For this tech preview, the value used must be `techpreview`.
-1. A single platform file (see XML structure for Platform file), this file includes a mapping of hardware key codes to the ISO layout positions. This file is also open to expansion for any configuration elements that are valid across the whole platform and that are not layout specific. This file is simply called `_platform.xml`.
-2. Multiple layout files named by their locale identifiers. (eg. `lt-t-k0-chromeos.xml` or `ne-t-k0-windows.xml`).
+```xml
+<keyboard3 conformsTo="techpreview"/>
+```
-Keyboard data that is not supported on a given platform, but intended for use with that platform, may be added to the directory `/und/`. For example, there could be a file `/und/lt-t-k0-chromeos.xml`, where the data is intended for use with ChromeOS, but does not reflect data that is distributed as part of a standard ChromeOS release.
+> _Note_: Unlike other LDML files, layouts are designed to be used outside of the CLDR source tree. A new mechanism for referencing the DTD path should ideally be used, such as a URN or FPI. See <https://unicode-org.atlassian.net/browse/CLDR-15505> for discussion. For this tech preview, a relative path to the dtd will continue to be used as below. Future versions may give other recommendations.
+
+```xml
+<!DOCTYPE keyboard3 SYSTEM "../dtd/ldmlKeyboard3.dtd">
+```
+
+* The filename of a keyboard .xml file does not have to match the BCP47 primary locale ID, but it is recommended to do so. The CLDR repository may enforce filename consistency.
+
+### Extensibility
+
+For extensibility, the `<special>` element will be allowed at nearly every level.
+
+See [Element special](tr35.md#special) in Part 1.
* * *
-## 5 <a name="Element_Heirarchy_Layout_File" href="#Element_Heirarchy_Layout_File">Element Hierarchy - Layout File</a>
+## Element Hierarchy
-### 5.1 <a name="Element_Keyboard" href="#Element_Keyboard">Element: keyboard</a>
+This section describes the XML elements in a keyboard layout file, beginning with the top level element `<keyboard3>`.
+
+### Element: keyboard3
This is the top level element. All other elements defined below are under this element.
**Syntax**
```xml
-<keyboard locale="{locale ID}">
+<keyboard3 locale="{locale ID}">
{definition of the layout as described by the elements defined below}
-</keyboard>
+</keyboard3>
```
> <small>
>
> Parents: _none_
-> Children: [version](#Element_version), [~~generation~~](#Element_generation), [info](#Element_info), [names](#Element_names), [settings](#Element_settings), [import](#Element_import), [keyMap](#Element_KeyMap), [displayMap](#Element_DisplayMap), [layer](#Element_layer), [vkeys](#Element_vkeys), [transforms](#Element_transforms), [reorders](#Element_reorder), [backspaces](#Element_backspaces)
-> Occurence: required, single
+>
+> Children: [displays](#element-displays), [import](#element-import), [info](#element-info), [keys](#element-keys), [flicks](#element-flicks), [layers](#element-layers), [locales](#element-locales), [settings](#element-settings), [_special_](tr35.md#special), [transforms](#element-transforms), [variables](#element-variables), [version](#element-version)
+>
+> Occurrence: required, single
>
> </small>
+_Attribute:_ `conformsTo` (required)
+
+This attribute distinguishes the keyboard from prior versions,
+and it also specifies the minimum CLDR version required.
+
+For purposes of this current draft specification, the value should always be `techpreview`.
+
+```xml
+<keyboard3 … conformsTo="techpreview"/>
+```
+
_Attribute:_ `locale` (required)
-This mandatory attribute represents the locale of the keyboard using Unicode locale identifiers (see [LDML](tr35.md)) - for example `"el"` for Greek. Sometimes, the locale may not specify the base language. For example, a Devanagari keyboard for many languages could be specified by BCP-47 code: `"und-Deva"`. For details, see [Keyboard IDs](#Keyboard_IDs) .
+This attribute represents the primary locale of the keyboard using BCP 47 [Unicode locale identifiers](tr35.md#Canonical_Unicode_Locale_Identifiers) - for example `"el"` for Greek. Sometimes, the locale may not specify the base language. For example, a Devanagari keyboard for many languages could be specified by BCP-47 code: `"und-Deva"`. However, it is better to list out the languages explicitly using the [`locales`](#element-locales) element.
+
+For further details about the choice of locale ID, see [Keyboard IDs](#keyboard-ids).
**Example** (for illustrative purposes only, not indicative of the real data)
```xml
-<keyboard locale="ka-t-k0-qwerty-windows">
+<keyboard3 locale="ka">
…
-</keyboard>
+</keyboard3>
```
+
```xml
-<keyboard locale="fr-CH-t-k0-android">
+<keyboard3 locale="fr-CH-t-k0-azerty">
…
-</keyboard>
+</keyboard3>
```
* * *
-### 5.2 <a name="Element_version" href="#Element_version">Element: version</a>
+### Element: locales
+
+The optional `<locales>` element allows specifying additional or alternate locales. Denotes intentional support for an extra language, not just that a keyboard incidentally supports a language’s orthography.
+
+**Syntax**
+
+```xml
+<locales>
+ <locale id="…"/>
+ <locale id="…"/>
+</locales>
+```
+
+> <small>
+>
+> Parents: [keyboard3](#element-keyboard3)
+>
+> Children: [locale](#element-locale)
+>
+> Occurrence: optional, single
+>
+> </small>
+
+### Element: locale
+
+The optional `<locale>` element specifies a single additional or alternate locale. Denotes intentional support for an extra language, not just that a keyboard incidentally supports a language’s orthography.
+
+**Syntax**
+
+```xml
+<locale id="{id}"/>
+```
+
+> <small>
+>
+> Parents: [locales](#element-locales)
+>
+> Children: _none_
+>
+> Occurrence: optional, multiple
+>
+> </small>
+
+_Attribute:_ `id` (required)
+
+> The [BCP 47](tr35.md#Canonical_Unicode_Locale_Identifiers) locale ID of an additional language supported by this keyboard.
+> Must _not_ include the `-k0-` subtag for this additional language.
+
+**Example**
+
+See [Principles for Keyboard IDs](#principles-for-keyboard-ids) for discussion and further examples.
+
+```xml
+<!-- Pan Nigerian Keyboard-->
+<keyboard3 locale="mul-Latn-NG-t-k0-panng">
+ <locales>
+ <locale id="ha"/>
+ <locale id="ig"/>
+ <!-- others … -->
+ </locales>
+</keyboard3>
+```
+
+* * *
+
+### Element: version
Element used to keep track of the source data version.
**Syntax**
```xml
-<version platform=".." number="..">
+<version number="..">
```
> <small>
>
-> Parents: [keyboard](#Element_keyboard)
+> Parents: [keyboard3](#element-keyboard3)
+>
> Children: _none_
-> Occurence: required, single
+>
+> Occurrence: optional, single
>
> </small>
-_Attribute:_ `platform` (required)
-
-> The platform source version. Specifies what version of the platform the data is from. For example, data from Mac OSX 10.4 would be specified as `platform="10.4"`. For platforms that have unstable version numbers which change frequently (like Linux), this field is set to an integer representing the iteration of the data starting with `"1"`. This number would only increase if there were any significant changes in the keyboard data.
-
_Attribute:_ `number` (required)
-> The data revision version. The attribute value must start with `$Revision` and end with `$`.
+> Must be a [[SEMVER](https://semver.org)] compatible version number, such as `1.0.0` or `38.0.0-beta.11`
_Attribute:_ `cldrVersion` (fixed by DTD)
@@ -340,427 +465,395 @@
**Example**
```xml
-<keyboard locale="..-osx">
+<keyboard3 locale="tok">
…
- <version platform="10.4" number="1"/>
+ <version number="1.0.0"/>
…
-</keyboard>
+</keyboard3>
```
* * *
-### 5.3 ~~<a name="Element_generation" href="#Element_generation">Element: generation</a>~~
-
-The `generation` element is now deprecated. It was used to keep track of the generation date of the data.
-
-* * *
-
-### 5.4 <a name="Element_info" href="#Element_info">Element: info</a>
+### Element: info
Element containing informative properties about the layout, for displaying in user interfaces etc.
**Syntax**
```xml
-<info [author="{author}"]
- [normalization="{form}"]
- [layout="{hint of the layout}"]
- [indicator="{short identifier}"] />
+<info
+ name="{keyboard name}"
+ author="{author}"
+ layout="{hint of the layout}"
+ indicator="{short identifier}" />
```
> <small>
>
-> Parents: [keyboard](#Element_keyboard)
+> Parents: [keyboard3](#element-keyboard3)
+>
> Children: _none_
-> Occurence: optional, single
+>
+> Occurrence: required, single
>
> </small>
-_Attribute:_ `author` (optional)
+_Attribute:_ `name` (required)
+
+> Note that this is the only required attribute for the `<info>` element.
+>
+> This attribute is an informative name for the keyboard.
+
+```xml
+<keyboard3 locale="bg-t-k0-phonetic-trad">
+ …
+ <info name="Bulgarian (Phonetic Traditional)" />
+ …
+</keyboard3>
+```
+
+* * *
+
+
+_Attribute:_ `author`
> The `author` attribute contains the name of the author of the layout file.
-_Attribute:_ `normalization` (optional)
+_Attribute:_ `layout`
-> The `normalization` attribute describes the intended normalization form of the keyboard layout output. The valid values are `NFC`, `NFD` or `other`.
-> An example use case is aiding user to choose among the two same layouts with one outputting characters in the normalization form C and one in the normalization form D.
+> The `layout` attribute describes the layout pattern, such as QWERTY, DVORAK, INSCRIPT, etc. typically used to distinguish various layouts for the same language.
+>
+> This attribute is not localized, but is an informative identifier for implementation use.
-_Attribute:_ `layout` (optional)
+_Attribute:_ `indicator`
-> The `layout` attribtue describes the layout pattern, such as QWERTY, DVORAK, INSCRIPT, etc. typically used to distinguish various layouts for the same language.
-
-_Attribute:_ `indicator` (optional)
-
-> The `indicator` attribute describes a short string to be used in currently selected layout indicator, such as US, SI9 etc.
+> The `indicator` attribute describes a short string to be used in currently selected layout indicator, such as `US`, `SI9` etc.
> Typically, this is shown on a UI element that allows switching keyboard layouts and/or input languages.
+>
+> This attribute is not localized.
* * *
-### 5.5 <a name="Element_names" href="#Element_names">Element: names</a>
+### Element: settings
-Element used to store any names given to the layout by the platform.
+An element used to keep track of layout-specific settings by implementations. This element may or may not show up on a layout. These settings reflect the normal practice by the implementation. However, an implementation using the data may customize the behavior.
**Syntax**
```xml
-<names>
- {set of name elements}
-</names>
+<settings normalization="disabled" />
```
> <small>
>
-> Parents: [keyboard](#Element_keyboard)
-> Children: [name](#Element_name)
-> Occurence: required, single
+> Parents: [keyboard3](#element-keyboard3)
>
-> </small>
-
-### 5.6 <a name="Element_name" href="#Element_name">Element: name</a>
-
-A single name given to the layout by the platform.
-
-**Syntax**
-
-```xml
-<name value="..">
-```
-
-> <small>
->
-> Parents: [names](#Element_names)
> Children: _none_
-> Occurence: required, multiple
+>
+> Occurrence: optional, single
+>
> </small>
-_Attribute:_ `value` (required)
+_Attribute:_ `normalization="disabled"`
-> The name of the layout.
+> Normalization will not typically be the responsibility of the keyboard author, rather this will be managed by the implementation.
+> The implementation will apply normalization as appropriate when matching transform rules and `<display>` value matching.
+> Output from the keyboard, following application of all transform rules, will be normalized to implementation or application-requested form.
+>
+> However, it is recognized that there may be some keyboards which, for compatibility or legacy reasons, need to manage their own normalization. The implementation in that case will do no normalization at all. The keyboard author must make use of transforms in the keyboard to perform any required normalization. In this case, the attribute value `normalization="disabled"` is used to indicate that no automatic normalization happens.
+>
+> **Note**: while this attribute is allowed by the specification, its use is discouraged, and keyboards with `normalization="disabled"` would not be accepted into the CLDR repository.
+
**Example**
```xml
-<keyboard locale="bg-t-k0-windows-phonetic-trad">
+<keyboard3 locale="bg">
…
- <names>
- <name value="Bulgarian (Phonetic Traditional)" />
- </names>
+ <settings normalization="disabled" />
…
-</keyboard>
+</keyboard3>
```
* * *
-### 5.7 <a name="Element_settings" href="#Element_settings">Element: settings</a>
+### Element: keys
-An element used to keep track of layout specific settings. This element may or may not show up on a layout. These settings reflect the normal practice on the platform. However, an implementation using the data may customize the behavior. For example, for `transformFailure` the implementation could ignore the setting, or modify the text buffer in some other way (such as by emitting backspaces).
+This element defines the properties of all possible keys via [`<key>` elements](#element-key) used in all layouts.
+It is a “bag of keys” without specifying any ordering or relation between the keys.
+There is only a single `<keys>` element in each layout.
**Syntax**
```xml
-<settings [fallback="omit"] [transformFailure="omit"] [transformPartial="hide"] />
+<keys>
+ <key … />
+ <key … />
+ <key … />
+</keys>
```
> <small>
>
-> Parents: [keyboard](#Element_keyboard)
+> Parents: [keyboard3](#element-keyboard3)
+> Children: [key](#element-key)
+> Occurrence: optional, single
+>
+> </small>
+
+
+
+* * *
+
+### Element: key
+
+This element defines a mapping between an abstract key and its output. This element must have the `keys` element as its parent. The `key` element is referenced by the `keys=` attribute of the [`row` element](#element-row).
+
+**Syntax**
+
+```xml
+<key
+ id="{key id}"
+ flickId="{flick identifier}"
+ gap="true"
+ longPressKeyIds="{long press list id}"
+ longPressDefaultKeyId="{default longpress key}"
+ multiTapKeyIds="{multi tap list id}"
+ stretch="true"
+ layerId="{switch layer id}"
+ output="{the output}"
+ width="{key width}"
+ />
+```
+
+> <small>
+>
+> Parents: [keys](#element-keys)
+>
> Children: _none_
-> Occurence: optional, single
>
+> Occurrence: optional, multiple
> </small>
-_Attribute:_ `fallback="omit"` (optional)
+**Note**: The `id` attribute is required.
-> The presence of this attribute means that when a modifier key combination goes unmatched, no output is produced. The default behavior (when this attribute is not present) is to fallback to the base map when the modifier key combination goes unmatched.
+**Note**: _at least one of_ `layerId`, `gap`, or `output` is required.
-If this attribute is present, it must have a value of omit.
+_Attribute:_ `id`
-_Attribute:_ `transformFailure="omit"` (optional)
-
-> This attribute describes the behavior of a transform when it is escaped (see the `transform` element in the Layout file for more information). A transform is escaped when it can no longer continue due to the entry of an invalid key. For example, suppose the following set of transforms are valid:
+> The `id` attribute uniquely identifies the key. NMTOKEN. It can be (but needn't be) the key name (a, b, c, A, B, C, …), or any other valid token (e-acute, alef, alif, alpha, …).
>
-> ^e → ê
->
-> ^a → â
+> In the future, this attribute’s definition is expected to be updated to align with [UAX#31](https://www.unicode.org/reports/tr31/). Please see [CLDR-17043](https://unicode-org.atlassian.net/browse/CLDR-17043) for more details.
-Suppose a user now enters the "\^" key then "\^" is now stored in a buffer and may or may not be shown to the user (see the `partial` attribute).
+_Attribute:_ `flickId="{flick id}"` (optional)
-If a user now enters d, then the transform has failed and there are two options for output.
+> The `flickId` attribute indicates that this key makes use of a [`flick`](#element-flick) set with the specified id.
-1. default behavior - "^d"
+_Attribute:_ `gap="true"` (optional)
-2. omit - "" (nothing and the buffer is cleared)
-
-The default behavior (when this attribute is not present) is to emit the contents of the buffer upon failure of a transform.
-
-If this attribute is present, it must have a value of omit.
-
-_Attribute:_ `transformPartial="hide"` (optional)
-
-> This attribute describes the behavior the system while in a transform. When this attribute is present then don't show the values of the buffer as the user is typing a transform (this behavior can be seen on Windows or Linux platforms).
-
-By default (when this attribute is not present), show the values of the buffer as the user is typing a transform (this behavior can be seen on the Mac OSX platform).
-
-If this attribute is present, it must have a value of hide.
-
-**Example**
+> The `gap` attribute indicates that this key does not have any appearance, but represents a "gap" of the specified number of key widths. Can be used with `width` to set a width.
```xml
-<keyboard locale="bg-t-k0-windows-phonetic-trad">
- …
- <settings fallback="omit" transformPartial="hide" />
- …
-</keyboard>
+<key id="mediumgap" gap="true" width="1.5"/>
```
-Indicates that:
+_Attribute:_ `longPressKeyIds="{list of key ids}"` (optional)
-1. When a modifier combination goes unmatched, do not output anything when a key is pressed.
-2. If a transform is escaped, output the contents of the buffer.
-3. During a transform, hide the contents of the buffer as the user is typing.
-
-* * *
-
-### 5.8 <a name="Element_keyMap" href="#Element_keyMap">Element: keyMap</a>
-
-This element defines the group of mappings for all the keys that use the same set of modifier keys. It contains one or more map elements.
-
-**Syntax**
-
-```xml
-<keyMap [modifiers="{Set of Modifier Combinations}"]>
- {a set of map elements}
-</keyMap>
-```
-
-> <small>
+> A space-separated ordered list of `key` element ids, specifying the keys which can be emitted by "long-pressing" this key. This feature is prominent in mobile devices.
>
-> Parents: [keyboard](#Element_keyboard)
-> Children: [map](#Element_map), [flicks](#Element_flicks)
-> Occurence: required, multiple
+> In a list of keys specified by `longPressKeyIds`, the key matching `longPressDefaultKeyId` attribute (if present) specifies the default long-press target, which could be different than the first element. It is an error if the `longPressDefaultKeyId` key is not in the `longPressKeyIds` list.
>
-> </small>
-
-_Attribute:_ `modifiers` (optional)
-
-> A set of modifier combinations that cause this key map to be "active". Each combination is separated by a space. The interpretation is that there is a match if any of the combinations match, that is, they are ORed. Therefore, the order of the combinations within this attribute does not matter.
-
-> A combination is simply a concatenation of words to represent the simultaneous activation of one or more modifier keys. The order of the modifier keys within a combination does not matter, although don't care cases are generally added to the end of the string for readability (see next paragraph). For example: `"cmd+caps"` represents the Caps Lock and Command modifier key combination. Some keys have right or left variant keys, specified by a 'R' or 'L' suffix. For example: `"ctrlR+caps"` would represent the Right-Control and Caps Lock combination. For simplicity, the presence of a modifier without a 'R' or 'L' suffix means that either its left or right variants are valid. So `"ctrl+caps"` represents the same as `"ctrlL+ctrlR?+caps ctrlL?+ctrlR+caps"`.
-
-A modifier key may be further specified to be in a "don't care" state using the '?' suffix. The "don't care" state simply means that the preceding modifier key may be either ON or OFF. For example `"ctrl+shift?"` could be expanded into `"ctrl ctrl+shift"`.
-
-Within a combination, the presence of a modifier WITHOUT the '?' suffix indicates this key MUST be on. The converse is also true, the absence of a modifier key means it MUST be off for the combination to be active.
-
-Here is an exhaustive list of all possible modifier keys:
-
-###### Table: <a name="Possible_Modifier_Keys" href="#Possible_Modifier_Keys">Possible Modifier Keys</a>
-
-| Modifier Keys | | Comments |
-|---------------|----------|---------------------------------|
-| `altL` | `altR` | xAlty → xAltR+AltL? xAltR?AltLy |
-| `ctrlL` | `ctrlR` | ditto for Ctrl |
-| `shiftL` | `shiftR` | ditto for Shift |
-| `optL` | `optR` | ditto for Opt |
-| `caps` | | Caps Lock |
-| `cmd` | | Command on the Mac |
-
-All sets of modifier combinations within a layout are disjoint with no-overlap existing between the key maps. That is, for every possible modifier combination, there is at most a single match within the layout file. There are thus never multiple matches. If no exact match is available, the match falls back to the base map unless the `fallback="omit"` attribute in the `settings` element is set, in which case there would be no output at all.
-
-**Example**
-
-To illustrate, the following example produces an invalid layout because pressing the "Ctrl" modifier key produces an indeterminate result:
-
-```xml
-<keyMap modifiers="ctrl+shift?">
- …
-</keyMap>
-```
-
-```xml
-<keyMap modifiers="ctrl">
- …
-</keyMap>
-```
-
-Modifier Examples:
-
-```xml
-<keyMap modifiers="cmd?+opt+caps?+shift" />
-```
-
-Caps-Lock may be ON or OFF, Option must be ON, Shift must be ON and Command may be ON or OFF.
-
-```xml
-<keyMap modifiers="shift caps" />
-```
-
-Caps-Lock must be ON OR Shift must be ON.
-
-If the `modifiers` attribute is not present on a `keyMap` then that particular key map is the base map.
-
-* * *
-
-### 5.9 <a name="Element_map" href="#Element_map">Element: map</a>
-
-This element defines a mapping between the base character and the output for a particular set of active modifier keys. This element must have the `keyMap` element as its parent.
-
-If a `map` element for a particular ISO layout position has not been defined then if this key is pressed, no output is produced.
-
-**Syntax**
-
-```xml
-<map
- iso="{the iso position}"
- to="{the output}"
- [longPress="{long press keys}"]
- [transform="no"]
- [multitap="{the output on subsequent taps}"]
- [longPress-status="optional"]
- [optional="{optional mappings}"]
- [hint="{hint to long press content}"]
- /><!-- {Comment to improve readability (if needed)} -->
-```
-
-> <small>
+> Implementations shall ignore any gestures (such as flick, multiTap, longPress) defined on keys in the `longPressKeyIds` list.
>
-> Parents: [keyMap](#Element_keyMap)
-> Children: _none_
-> Occurence: optional, multiple
->
-> </small>
-
-_Attribute:_ `iso` (exactly one of base and iso is required)
-
-> The `iso` attribute represents the ISO layout position of the key (see the definition at the beginning of the document for more information).
-
-_Attribute:_ `to` (required)
-
-> The `to` attribute contains the output sequence of characters that is emitted when pressing this particular key. Control characters, whitespace (other than the regular space character) and combining marks in this attribute are escaped using the `\u{...}` notation.
-
-_Attribute:_ `longPress="optional"` (optional)
-
-> The `longPress` attribute contains any characters that can be emitted by "long-pressing" a key, this feature is prominent in mobile devices. The possible sequences of characters that can be emitted are whitespace delimited. Control characters, combining marks and whitespace (which is intended to be a long-press option) in this attribute are escaped using the `\u{...}` notation.
-
-_Attribute:_ `transform="no"` (optional)
-
-> The `transform` attribute is used to define a key that never participates in a transform but its output shows up as part of a transform. This attribute is necessary because two different keys could output the same characters (with different keys or modifier combinations) but only one of them is intended to be a dead-key and participate in a transform. This attribute value must be no if it is present.
-
-_Attribute:_ `multitap` (optional)
-
-> A space-delimited list of strings, where each successive element of the list is produced by the corresponding number of quick taps. For example, three taps on the key C01 will produce a “c” in the following example (first tap produces “a”, two taps produce “bb” etc.).
->
-> _Example:_
->
-> ```xml
-> <map iso="C01" to="a" multitap="bb c d">
-> ```
-> Control characters, combining marks and whitespace (which is intended to be a multitap option) in this attribute are escaped using the `\u{...}` notation.
-
-_Attribute:_ `longPress-status` (optional)
-
-> Indicates optional `longPress` values. Must only occur with a `longPress` value. May be suppressed or shown, depending on user settings. There can be two `map` elements that differ only by `longPress-status`, allowing two different sets of `longPress` values.
->
-> _Example:_
->
-> ```xml
-> <map iso="D01" to="a" longPress="à â % æ á ä ã å ā ª" />
-> <map iso="D01" to="a" longPress="à â á ä ã å ā" longPress-status="optional" />
-> ```
-
-_Attribute:_ `optional` (optional)
-
-> Indicates optional mappings. May be suppressed or shown, depending on user settings.
-
-_Attribute:_ `hint` (optional)
-
-> Indicates a hint as to long-press contents, such as the first character of the `longPress` value, that can be displayed on the key. May be suppressed or shown, depending on user Settings. Characters in this attribute can be escaped using the `\u{...}` notation.
->
-> _Example:_ where the hint is "{":
+> For example, if the default key is a key whose [display](#element-displays) value is `{`, an implementation might render the key as follows:
>
> 
+>
+> _Example:_
+> - pressing the `o` key will produce `o`
+> - holding down the key will produce a list `ó`, `{` (where `{` is the default and produces a marker)
+>
+> ```xml
+> <displays>
+> <display output="\m{marker}" display="{" />
+> </displays>
+>
+> <keys>
+> <key id="o" output="o" longPressKeyIds="o-acute marker" longPressDefaultKeyId="marker"/>
+> <key id="o-acute" output="ó"/>
+> <key id="marker" output="\m{marker}"/>
+> </keys>
+>
+> ```
-For example, suppose there are the following keys, their output and one transform:
+_Attribute:_ `longPressDefaultKeyId="{key-id}"` (optional)
-```
-E00 outputs `
-Option+E00 outputs ` (the dead-version which participates in transforms).
-`e → è
-```
+> Specifies the default key, by id, in a list of long-press keys. See the discussion of `longPressKeyIds`, above.
-Then the first key must be tagged with `transform="no"` to indicate that it should never participate in a transform.
+_Attribute:_ `multiTapKeyIds` (optional)
-Comment: US key equivalent, base key, escaped output and escaped longpress
+> A space-separated ordered list of `key` element ids, where each successive key in the list is produced by the corresponding number of quick taps.
+> It is an error for a key to reference itself in the `multiTapKeyIds` list.
+>
+> Implementations shall ignore any gestures (such as flick, multiTap, longPress) defined on keys in the `multiTapKeyIds` list.
+>
+> _Example:_
+> - first tap on the key will produce “a”
+> - two taps will produce “bb”
+> - three taps on the key will produce “c”
+> - four taps on the key will produce “d”
+>
+> ```xml
+> <keys>
+> <key id="a" output="a" multiTapKeyIds="bb c d" />
+> <key id="bb" output="bb" />
+> <key id="c" output="c" />
+> <key id="d" output="d" />
+> </keys>
+> ```
-In the generated files, a comment is included to help the readability of the document. This comment simply shows the English key equivalent (with prefix `key=`), the base character (`base=`), the escaped output (`to=`) and escaped long-press keys (`long=`). These comments have been inserted strategically in places to improve readability. Not all comments include all components since some of them may be obvious.
+**Note**: Behavior past the end of the multiTap list is implementation specific.
-**Example**
+_Attribute:_ `stretch="true"` (optional)
+
+> The `stretch` attribute indicates that a touch layout may stretch this key to fill available horizontal space on the row.
+> This is used, for example, on the spacebar. Note that `stretch=` is ignored for hardware layouts.
+
+_Attribute:_ `layerId="shift"` (optional)
+
+> The `layerId` attribute indicates that this key switches to another `layer` with the specified id (such as `<layer id="shift"/>` in this example).
+> Note that a key may have both a `layerId=` and an `output=` attribute, indicating that the key outputs _prior_ to switching layers.
+> Also note that `layerId=` is ignored for hardware layouts: their shifting is controlled via
+> the modifier keys.
+>
+> This attribute is an NMTOKEN.
+>
+> In the future, this attribute’s definition is expected to be updated to align with [UAX#31](https://www.unicode.org/reports/tr31/). Please see [CLDR-17043](https://unicode-org.atlassian.net/browse/CLDR-17043) for more details.
+
+
+_Attribute:_ `output`
+
+> The `output` attribute contains the sequence of characters that is emitted when pressing this particular key. Control characters, whitespace (other than the regular space character) and combining marks in this attribute are escaped using the `\u{...}` notation. More than one key may output the same output.
+>
+> The `output` attribute may also contain the `\m{…}` syntax to insert a marker. See the definition of [markers](#markers).
+
+_Attribute:_ `width="1.2"` (optional, default "1.0")
+
+> The `width` attribute indicates that this key has a different width than other keys, by the specified number of key widths.
```xml
-<keyboard locale="fr-BE-t-k0-windows">
- …
- <keyMap modifiers="shift">
- <map iso="D01" to="A" /> <!-- key=Q -->
- <map iso="D02" to="Z" /> <!-- key=W -->
- <map iso="D03" to="E" />
- <map iso="D04" to="R" />
- <map iso="D05" to="T" />
- <map iso="D06" to="Y" />
- …
- </keyMap>
- …
-</keyboard>
+<key id="wide-a" output="a" width="1.2"/>
+<key id="wide-gap" gap="true" width="2.5"/>
```
+##### Implied Keys
+
+Not all keys need to be listed explicitly. The following two can be assumed to already exist:
+
```xml
-<keyboard locale="ps-t-k0-windows">
- …
- <keyMap modifiers='altR+caps? ctrl+alt+caps?'>
- <map iso="D04" to="\u{200e}" /> <!-- key=R base=ق -->
- <map iso="D05" to="\u{200f}" /> <!-- key=T base=ف -->
- <map iso="D08" to="\u{670}" /> <!-- key=I base=ه to= ٰ -->
- …
- </keyMap>
- …
-</keyboard>
+<key id="gap" gap="true" width="1"/>
+<key id="space" output=" " stretch="true" width="1"/>
```
+In addition, these 62 keys, comprising 10 digit keys, 26 Latin lower-case keys, and 26 Latin upper-case keys, where the `id` is the same as the `output`, are assumed to exist:
+
+```xml
+<key id="0" output="0"/>
+<key id="1" output="1"/>
+<key id="2" output="2"/>
+…
+<key id="A" output="A"/>
+<key id="B" output="B"/>
+<key id="C" output="C"/>
+…
+<key id="a" output="a"/>
+<key id="b" output="b"/>
+<key id="c" output="c"/>
+…
+```
+
+These implied keys are available in a data file named `keyboards/import/keys-Latn-implied.xml` in the CLDR distribution for the convenience of implementations.
+
+Thus, the implied keys behave as if the following import were present.
+
+```xml
+<keyboard3>
+ <keys>
+ <import base="cldr" path="techpreview/keys-Latn-implied.xml" />
+ </keys>
+</keyboard3>
+```
+
+**Note:** All implied keys may be overridden, as with all other imported data items. See the [`import`](#element-import) element for more details.
+
* * *
-#### 5.9.1 <a name="Element_flicks" href="#Element_flicks">Elements: flicks, flick</a>
+#### Element: flicks
-The `flicks` element is used to generate results from a "flick" of the finger on a mobile device.
+The `flicks` element is a collection of `flick` elements.
+
+> <small>
+>
+> Parents: [keyboard3](#element-keyboard3)
+>
+> Children: [flick](#element-flick), [import](#element-import), [_special_](tr35.md#special)
+>
+> Occurrence: optional, single
+> </small>
+
+* * *
+
+#### Element: flick
+
+The `flick` element is used to generate results from a "flick" of the finger on a mobile device.
**Syntax**
```xml
-<flicks iso="{the iso position}">
- {a set of flick elements}
-</flicks>
+<keyboard3>
+ <keys>
+ <key id="a" flickId="a-flicks" output="a" />
+ </keys>
+ <flicks>
+ <flick id="a-flicks">
+ <flickSegment … />
+ <flickSegment … />
+ <flickSegment … />
+ </flick>
+ </flicks>
+</keyboard3>
```
> <small>
>
-> Parents: [keyMap](#Element_keyMap)
-> Children: [flick](#Element_flicks)
-> Occurence: optional, multiple
+> Parents: [flicks](#element-flicks)
+>
+> Children: [flickSegment](#element-flicksegment), [_special_](tr35.md#special)
+>
+> Occurrence: optional, multiple
>
> </small>
-_Attribute:_ `iso` (required)
+_Attribute:_ `id` (required)
-> The `iso` attribute represents the ISO layout position of the key (see the definition at the beginning of the document for more information).
+> The `id` attribute identifies the flicks. It can be any NMTOKEN.
+>
+> The `flick` elements do not share a namespace with the `key`s, so it would also be allowed
+> to have `<key id="a" flickId="a"/>`.
+>
+> In the future, this attribute’s definition is expected to be updated to align with [UAX#31](https://www.unicode.org/reports/tr31/). Please see [CLDR-17043](https://unicode-org.atlassian.net/browse/CLDR-17043) for more details.
-**Syntax**
+* * *
-```xml
-<flick directions="{list of directions}" to="{the output}" />
-```
+#### Element: flickSegment
> <small>
>
-> Parents: [flicks](#Element_flicks)
+> Parents: [flick](#element-flick)
+>
> Children: _none_
-> Occurence: required, multiple
+>
+> Occurrence: required, multiple
>
> </small>
@@ -768,465 +861,793 @@
> The `directions` attribute value is a space-delimited list of keywords, that describe a path, currently restricted to the cardinal and intercardinal directions `{n e s w ne nw se sw}`.
-_Attribute:_ `to` (required)
+_Attribute:_ `keyId` (required)
-> The to attribute value is the result of (one or more) flicks.
+> The `keyId` attribute value is the result of (one or more) flicks.
+>
+> Implementations shall ignore any gestures (such as flick, multiTap, longPress) defined on the key specified by `keyId`.
+
**Example**
-where a flick to the Northeast then South produces two code points.
+where a flick to the Northeast then South produces `Å`.
```xml
-<flicks iso="C01">
- <flick directions="ne s" to="\uABCD\uDCBA" />
+<keys>
+ <key id="something" flickId="a" output="Something" />
+ <key id="A-ring" output="Å" />
+</keys>
+
+<flicks>
+ <flick id="a">
+ <flickSegment directions="ne s" keyId="A-ring" />
+ </flick>
</flicks>
```
* * *
-### 5.10 <a name="Element_import" href="#Element_import">Element: import</a>
+### Element: import
-The `import` element references another file of the same type and includes all the subelements of the top level element as though the `import` element were being replaced by those elements, in the appropriate section of the XML file. For example:
+The `import` element is used to reference another XML file so that elements are imported from
+another file. The use case is to be able to import a standard set of `transform`s and similar
+from the CLDR repository, especially to be able to share common information relevant to a particular script.
+The intent is for each single XML file to contain all that is needed for a keyboard layout, other than required standard import data from the CLDR repository.
+
+`<import>` can be used as a child of a number of elements (see the _Parents_ section immediately below). Multiple `<import>` elements may be used, however, `<import>` elements must come before any other sibling elements.
+If two identical elements are defined, the later element will take precedence, that is, override.
+
+**Note:** imported files do not have any indication of their normalization mode. For this reason, the keyboard author must verify that the imported file is of a compatible normalization mode. See the [`settings` element](#element-settings) for further details.
**Syntax**
-
```xml
-<import path="standard_transforms.xml">
+<import base="cldr" path="techpreview/keys-Zyyy-punctuation.xml"/>
```
-
> <small>
>
-> Parents: [keyboard](#Element_keyboard)
+> Parents: [displays](#element-displays), [keyboard3](#element-keyboard3), [keys](#element-keys), [flicks](#element-flicks), [layers](#element-layers), [transformGroup](#element-transformgroup), [transforms](#element-transforms), [variables](#element-variables)
> Children: _none_
-> Occurence: optional, multiple
+>
+> Occurrence: optional, multiple
>
> </small>
+_Attribute:_ `base`
+
+> The base may be omitted (indicating a local import) or have the value `"cldr"`.
+
+**Note:** `base="cldr"` is required for all `<import>` statements within keyboard files in the CLDR repository.
+
_Attribute:_ `path` (required)
-> The value is contains a relative path to the included ldml file. There is a standard set of directories to be searched that an application may provide. This set is always prepended with the directory in which the current file being read, is stored.
+> If `base` is `cldr`, then the `path` must start with a CLDR version (such as `techpreview`) representing the CLDR version to pull imports from. The imports are located in the `keyboards/import` subdirectory of the CLDR source repository.
+> Implementations are not required to have all CLDR versions available to them.
+>
+> If `base` is omitted, then `path` is an absolute or relative file path.
-If two identical elements, as described below, are defined, the later element will take precedence. Thus if a `hardwareMap/map` for the same keycode on the same page is defined twice (for example once in an included file), the later one will be the resulting mapping.
-Elements are considered to have three attributes that make them unique: the tag of the element, the parent and the identifying attribute. The parent in its turn is a unique element and so on up the chain. If the distinguishing attribute is optional, its non-existence is represented with an empty value. Here is a list of elements and their defining attributes. If an element is not listed then if it is a leaf element, only one occurs and it is merely replaced. If it has children, then the subelements are considered, in effect merging the element in question.
+**Further Examples**
-| Element | Parent | Distinguishing attribute |
-|--------------|--------------|------------------------------|
-| `import` | `keyboard` | `@path` |
-| `keyMap` | `keyboard` | `@modifiers` |
-| `map` | `keyMap` | `@iso` |
-| `flicks` | `keyMap` | `@iso` |
-| `flick` | `flicks` | `@directions` |
-| `display` | `displayMap` | `@to` |
-| `layer` | `keyboard` | `@modifier` |
-| `row` | `layer` | `@keys` |
-| `switch` | `layer` | `@iso` |
-| `vkeys` | `layer` | `@iso` |
-| `transforms` | `keyboard` | `@type` |
-| `transform` | `keyboard` | `@before`, `@from`, `@after` |
-| `reorder` | `reorders` | `@before`, `@from`, `@after` |
-| `backspace` | `backspaces` | `@before`, `@from`, `@after` |
+```xml
+<!-- in a keyboard xml file-->
+…
+<transforms type="simple">
+ <import base="cldr" path="techpreview/transforms-example.xml"/>
+ <transform from="` " to="`" />
+ <transform from="^ " to="^" />
+</transforms>
+…
-In order to help identify mistakes, it is an error if a file contains two elements that override each other. All element overrides must come as a result of an `<include>` element either for the element overridden or the element overriding.
-The following elements are not imported from the source file:
+<!-- contents of transforms-example.xml -->
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE transforms SYSTEM "../dtd/ldmlKeyboard3.dtd">
+<transforms>
+ <!-- begin imported part-->
+ <transform from="`a" to="à" />
+ <transform from="`e" to="è" />
+ <transform from="`i" to="ì" />
+ <transform from="`o" to="ò" />
+ <transform from="`u" to="ù" />
+ <!-- end imported part -->
+</transforms>
+```
-* `version`
-* `generation`
-* `names`
-* `settings`
+**Note:** The DOCTYPE and root element, here `transforms`, are the same as
+the _parent_ of the `<import/>` element. It is an error to import an XML file
+whose root element is different from the parent element of the `<import/>` element.
+
+After loading, the above example will be the equivalent of the following.
+
+```xml
+<transforms type="simple">
+ <!-- begin imported part-->
+ <transform from="`a" to="à" />
+ <transform from="`e" to="è" />
+ <transform from="`i" to="ì" />
+ <transform from="`o" to="ò" />
+ <transform from="`u" to="ù" />
+ <!-- end imported part -->
+
+ <!-- these lines are after the import -->
+ <transform from="` " to="`" />
+ <transform from="^ " to="^" />
+</transforms>
+```
* * *
-### 5.11 <a name="Element_displayMap" href="#Element_displayMap">Element: displayMap</a>
+### Element: displays
-The displayMap can be used to describe what is to be displayed on the keytops for various keys. For the most part, such explicit information is unnecessary since the `@to` element from the `keyMap/map` element can be used. But there are some characters, such as diacritics, that do not display well on their own and so explicit overrides for such characters can help. The `displayMap` consists of a list of display subelements.
+The displays can be used to describe what is to be displayed on the keytops for various keys. For the most part, such explicit information is unnecessary since the `@output` attribute from the `keys/key` element can be used. But there are some characters, such as diacritics, that do not display well on their own and so explicit overrides for such characters can help.
+Another useful scenario is where there are doubled diacritics, or multiple characters with spacing issues.
-DisplayMaps are designed to be shared across many different keyboard layout descriptions, and included in where needed.
+The `displays` consists of a list of display subelements.
+
+`displays` elements are designed to be shared across many different keyboard layout descriptions, and imported with `<import>` where needed.
+
+For combining characters, U+25CC `◌` is used as a base. It is an error to use a combining character without a base in the `display` attribute.
+
+For example, a key which outputs a combining tilde (U+0303) can be represented as follows:
+
+```xml
+ <display output="\u{0303}" display="◌̃" /> <!-- \u{25CC} \u{0303}-->
+```
+
+This way, a key which outputs a combining tilde (U+0303) will be represented as `◌̃` (a tilde on a dotted circle).
+
+Some scripts/languages may prefer a different base than U+25CC.
+See [`<displayOptions baseCharacter=…/>`](#element-displayoptions).
**Syntax**
```xml
-<displayMap>
+<displays>
{a set of display elements}
-</displayMap>
+</displays>
```
+**Note**: There is currently no way to indicate a custom display for a key without output (i.e. without an `output=` attribute), nor is there a way to indicate that such a key has a standardized identity (e.g. that a key should be identified as a “Shift”). These may be addressed in future versions of this standard.
+
> <small>
>
-> Parents: [keyboard](#Element_keyboard)
-> Children: [display](#Element_display)
-> Occurence: optional, single
+> Parents: [keyboard3](#element-keyboard3)
+>
+> Children: [display](#element-display), [displayOptions](#element-displayoptions), [_special_](tr35.md#special)
+>
+> Occurrence: optional, single
>
> </small>
* * *
-### 5.12 <a name="Element_display" href="#Element_display">Element: display</a>
+### Element: display
-The `display` element describes how a character, that has come from a `keyMap/map` element, should be displayed on a keyboard layout where such display is possible.
+The `display` element describes how a character, that has come from a `keys/key` element, should be displayed on a keyboard layout where such display is possible.
**Syntax**
```xml
-<display to="{the output}" display="{show as}" />
+<display output="{the output}" display="{show as}" />
```
> <small>
>
-> Parents: [displayMap](#Element_displayMap)
+> Parents: [displays](#element-displays)
+>
> Children: _none_
-> Occurence: required, multiple
+>
+> Occurrence: required, multiple
>
> </small>
-_Attribute:_ `to` (required)
+One of the `output` or `id` attributes is required.
-> Specifies the character or character sequence from the `keyMap/map` element that is to have a special display.
+_Attribute:_ `output` (optional)
+
+> Specifies the character or character sequence from the `keys/key` element that is to have a special display.
+> This attribute may be escaped with `\u` notation, see [Escaping](#escaping).
+> The `output` attribute may also contain the `\m{…}` syntax to reference a marker. See [Markers](#markers). Implementations may highlight a displayed marker, such as with a lighter text color, or a yellow highlight.
+> String variables may be substituted. See [String variables](#element-string)
+
+_Attribute:_ `id` (optional)
+
+> Specifies the `key` id. This is useful for keys which do not produce any output (no `output=` value), such as a shift key.
+>
+> This attribute must match `[A-Za-z0-9][A-Za-z0-9-]*`
_Attribute:_ `display` (required)
-> Required and specifies the character sequence that should be displayed on the keytop for any key that generates the `@to` sequence. (It is an error if the value of the `display` attribute is the same as the value of the `to` attribute.)
+> Required and specifies the character sequence that should be displayed on the keytop for any key that generates the `@output` sequence or has the `@id`. (It is an error if the value of the `display` attribute is the same as the value of the `output` attribute; this would be an extraneous entry.)
+
+> String variables may be substituted. See [String variables](#element-string)
+
+This attribute may be escaped with `\u` notation, see [Escaping](#escaping).
**Example**
```xml
-<keyboard>
- <keyMap>
- <map iso="C01" to="a" longpress="\u0301 \u0300" />
- </keyMap>
- <displayMap>
- <display to="\u0300" display="\u02CB" />
- <display to="\u0301" display="\u02CA" />
- </displayMap>
-</keyboard>
+<keyboard3>
+ <keys>
+ <key id="grave" output="\u{0300}" /> <!-- combining grave -->
+ <key id="marker" output="\m{acute}" /> <!-- generates a marker-->
+ <key id="numeric" layerId="numeric" /> <!-- changes layers-->
+ </keys>
+ <displays>
+ <display output="\u{0300}" display="ˋ" /> <!-- \u{02CB} -->
+ <display id="numeric" display="#" /> <!-- display the layer shift key as # -->
+ <display output="\m{acute}" display="´" /> <!-- Display \m{acute} as ´ -->
+ </displays>
+</keyboard3>
```
-To allow `displayMap`s to be shared across descriptions, there is no requirement that `@to` in a `display` element matches any `@to` in any `keyMap/map` element in the keyboard description.
+To allow `displays` elements to be shared across keyboards, there is no requirement that `@output` in a `display` element matches any `@output`/`@id` in any `keys/key` element in the keyboard description.
* * *
-### 5.13 <a name="Element_layer" href="#Element_layer">Element: layer</a>
+### Element: displayOptions
-A `layer` element describes the configuration of keys on a particular layer of a keyboard. It contains one or more `row` elements to describe which keys exist in each `row` and optionally one or more `switch` elements that describe how keys in the layer switch the layer to another. In addition, for platforms that require a mapping from a key to a virtual key (for example Windows or Mac) there is also an optional `vkeys` element to describe the mapping.
+The `displayOptions` is an optional singleton element providing additional settings on this `displays`. It is structured so as to provide for future flexibility in such options.
**Syntax**
```xml
-<layer modifier="{Set of Modifier Combinations}">
+<displays>
+ <display …/>
+ <displayOptions baseCharacter="x"/>
+</displays>
+```
+
+> <small>
+>
+> Parents: [displays](#element-displays)
+>
+> Children: _none_
+>
+> Occurrence: optional, single
+>
+> </small>
+
+_Attribute:_ `baseCharacter` (optional)
+
+**Note:** At present, this is the only option settable in the `displayOptions`.
+
+> Some scripts/languages may prefer a different base than U+25CC.
+> For Lao for example, `x` is often used as a base instead of `◌`.
+> Setting `baseCharacter="x"` (for example) is a _hint_ to the implementation which
+> requests U+25CC to be substituted with `x` on display.
+> As a hint, the implementation may ignore this option.
+>
+> **Note** that not all base characters will be suitable as bases for combining marks.
+
+This attribute may be escaped with `\u` notation, see [Escaping](#escaping).
+
+* * *
+
+### Element: forms
+
+This element represents a set of `form` elements which define the layout of a particular hardware form.
+
+
+> <small>
+>
+> Parents: [keyboard3](#element-keyboard3)
+>
+> Children: [import](#element-import), [form](#element-form), [_special_](tr35.md#special)
+>
+> Occurrence: optional, single
+>
+> </small>
+
+***Syntax***
+
+```xml
+<forms>
+ <form id="iso">
+ <!-- ... -->
+ </form>
+ <form id="us">
+ <!-- ... -->
+ </form>
+</forms>
+```
+
+* * *
+
+### Element: form
+
+This element represents a specific `form` element which defines the layout of a particular hardware form.
+
+> *Note:* Most keyboards will not need to use this element directly, and the CLDR repository will not accept keyboards which define a custom `form` element. This element is provided for two reasons:
+
+1. To formally specify the standard hardware arrangements used with CLDR for implementations. Implementations can verify the arrangement, and validate keyboards against the number of rows and the number of keys per row.
+
+2. To allow a way to customize the scancode layout for keyboards not intended to be included in the common CLDR repository.
+
+See [Implied Form Values](#implied-form-values), below.
+
+> <small>
+>
+> Parents: [forms](#element-forms)
+>
+> Children: [scanCodes](#element-scancodes), [_special_](tr35.md#special)
+>
+> Occurrence: optional, multiple
+>
+> </small>
+
+_Attribute:_ `id` (required)
+
+> This attribute specifies the form id. The value may not be `touch`.
+
+***Syntax***
+
+```xml
+<form id="us">
+ <scanCodes codes="00 01 02"/>
+ <scanCodes codes="03 04 05"/>
+</form>
+```
+
+##### Implied Form Values
+
+There is an implied set of `<form>` elements corresponding to the default forms, thus implementations must behave as if there was the following import statement:
+
+```xml
+<keyboard3>
+ <forms>
+ <import base="cldr" path="techpreview/scanCodes-implied.xml" /> <!-- the version will match the current conformsTo of the file -->
+ </forms>
+</keyboard3>
+```
+
+Here is a summary of the implied form elements. Keyboards included in the CLDR Repository must only use these `form=` values and may not override the scanCodes.
+
+> - `touch` - Touch (non-hardware) layout.
+> - `abnt2` - Brazilian 103 key ABNT2 layout (iso + extra key near right shift)
+> - `iso` - European 102 key layout (extra key near left shift)
+> - `jis` - Japanese 109 key layout
+> - `us` - ANSI 101 key layout
+> - `ks` - Korean KS layout
+
+* * *
+
+### Element: scanCodes
+
+This element represents a keyboard row, and defines the scan codes for the non-frame keys in that row.
+
+> <small>
+>
+> Parents: [form](#element-form)
+>
+> Children: _none_
+>
+> Occurrence: required, multiple
+>
+> </small>
+
+_Attribute:_ `codes` (required)
+
+> The `codes` attribute is a space-separated list of 2-digit hex bytes, each representing a scan code.
+
+**Syntax**
+
+```xml
+<scanCodes codes="29 02 03 04 05 06 07 08 09 0A 0B 0C 0D" />
+```
+
+* * *
+
+### Element: layers
+
+This element represents a set of `layer` elements with a specific physical form factor, whether
+hardware or touch layout.
+
+> <small>
+>
+> Parents: [keyboard3](#element-keyboard3)
+>
+> Children: [import](#element-import), [layer](#element-layer), [_special_](tr35.md#special)
+>
+> Occurrence: required, multiple
+>
+> </small>
+
+- At least one `layers` element is required.
+
+_Attribute:_ `form` (required)
+
+> This attribute specifies the physical layout of a hardware keyboard,
+> or that the form is a `touch` layout.
+>
+> When using an on-screen touch keyboard, if the keyboard does not specify a `<layers form="touch">`
+> element, a `<layers form="{hardware}">` element can be used as a fallback alternative.
+> If there is no `hardware` form, the implementation may need
+> to choose a different keyboard file, or use some other fallback behavior when using a
+> hardware keyboard.
+>
+> Because a hardware keyboard facilitates non-trivial amounts of text input,
+> and many touch devices can also be connected to a hardware keyboard, it
+> is recommended to always have at least one hardware (non-touch) form.
+>
+> Multiple `<layers form="touch">` elements are allowed with distinct `minDeviceWidth` values.
+> At most one hardware (non-`touch`) `<layers>` element is allowed. If a different key arrangement is desired between, for example, `us` and `iso` formats, these should be separated into two different keyboards.
+>
+> The typical keyboard author will be designing a keyboard based on their circumstances and the hardware that they are using. So, for example, if they are in South East Asia, they will almost certainly be using a 101-key hardware keyboard with US key caps. So we want them to be able to reference that (`<layers form="us">`) in their design, rather than having to work with an unfamiliar form.
+>
+> A mismatch between the hardware layout in the keyboard file, and the actual hardware used by the user could result in some keys being inaccessible to the user if their hardware cannot generate the scancodes corresponding to the layout specified by the `form=` attribute. Such keys could be accessed only via an on-screen keyboard utility. Conversely, a user with hardware keys that are not present in the specified `form=` will result in some hardware keys which have no function when pressed.
+>
+>
+> The value of the `form=` attribute may be `touch`, or correspond to a `form` element. See [`form`](#element-form).
+>
+
+_Attribute:_ `minDeviceWidth`
+
+> This attribute specifies the minimum required width, in millimeters (mm), of the touch surface. The `layers` entry with the greatest matching width will be selected. This attribute is intended for `form="touch"`, but is supported for hardware forms.
+>
+> This must be a whole number between 1 and 999, inclusive.
+
+### Element: layer
+
+A `layer` element describes the configuration of keys on a particular layer of a keyboard. It contains one or more `row` elements to describe which keys exist in each row.
+
+**Syntax**
+
+```xml
+<layer id="layerId" modifiers="{Set of Modifier Combinations}">
...
</layer>
```
> <small>
>
-> Parents: [keyboard](#Element_keyboard)
-> Children: [row](#Element_row), [switch](#Element_switch), [vkeys](#Element_vkeys)
-> Occurence: optional, multiple
+> Parents: [keyboard3](#element-keyboard3)
+>
+> Children: [row](#element-row), [_special_](tr35.md#special)
+>
+> Occurrence: optional, multiple
>
> </small>
-_Attribute:_ `modifier` (required)
+_Attribute:_ `id` (required for `touch`)
-> This has two roles. It acts as an identifier for the `layer` element and also provides the linkage into a keyMap. A modifier is a single modifier combination such that it is matched by one of the modifier combinations in one of the `keyMap/@modifiers` attribute. To indicate that no modifiers apply the reserved name of "none" is used. For the purposes of fallback vkey mapping, the following modifier components are reserved: "shift", "ctrl", "alt", "caps", "cmd", "opt" along with the "L" and "R" optional single suffixes for the first 3 in that list. There must be a `keyMap` whose `@modifiers` attribute matches the `@modifier` attribute of the `layer` element. It is an error if there is no such `keyMap`.
+> The `id` attribute identifies the layer for touch layouts. This identifier specifies the layout as the target for layer switching, as specified by the `switch=` attribute on the [`<key>`](#element-key) element.
+> Touch layouts must have one `layer` with `id="base"` to serve as the base layer.
+>
+> Must match `[A-Za-z0-9][A-Za-z0-9-]*`
-The `keymap/@modifier` often includes multiple combinations that match. It is not necessary (or prefered) to include all of these. Instead a minimal matching element should be used, such that exactly one keymap is matched.
+_Attribute:_ `modifiers` (required for `hardware`)
-The following are examples of situations where the `@modifiers` and `@modifier` do not match, with a different keymap definition than above.
-
-| `keyMap/@modifiers` | `layer/@modifier` |
-|---------------------|---------------------|
-| `shiftL` | `shift` (ambiguous) |
-| `altR` | `alt` |
-| `shiftL?+shiftR` | `shift` |
-
-And these do match:
-
-| `keyMap/@modifiers` | `layer/@modifier` |
-|---------------------|-------------------|
-| `shiftL shiftR` | `shift` |
-
-The use of `@modifier` as an identifier for a layer, is sufficient since it is always unique among the set of `layer` elements in a keyboard.
-
-* * *
-
-### 5.14 <a name="Element_row" href="#Element_row">Element: row</a>
-
-A `row` element describes the keys that are present in the row of a keyboard. `row` elements are ordered within a `layout` element with the top visual row being stored first.
-
-The row element introduces the `keyId` which may be an `ISOKey` or a `specialKey`. More formally:
-
-```
-keyId = ISOKey | specialKey
-ISOKey = [A-Z][0-9][0-9]
-specialKey = [a-z][a-zA-Z0-9]{2,7}
-```
-
-ISOKey denotes a key having an [ISO Position](#Definitions). SpecialKey is used to identify functional keys occurring on a virtual keyboard layout.
+> This has two roles. It acts as an identifier for the `layer` element for hardware keyboards (in the absence of the `id=` attribute) and also provides the linkage from the hardware modifiers into the correct `layer`.
+>
+> For hardware layouts, the use of `@modifiers` as an identifier for a layer is sufficient since it is always unique among the set of `layer` elements in each `form`.
+>
+> The set of modifiers must match `(none|([A-Za-z0-9]+)( [A-Za-z0-9]+)*)`
+>
+> To indicate that no modifiers apply, the reserved name of `none` is used.
**Syntax**
```xml
-<row keys="{keyId}" />
+<layer id="base" modifiers="none">
+ <row keys="a" />
+</layer>
+
+<layer id="upper" modifiers="shift">
+ <row keys="A" />
+</layer>
+
+<layer id="altgr" modifiers="altR">
+ <row keys="a-umlaut" />
+</layer>
+
+<layer id="upper-altgr" modifiers="altR shift">
+ <row keys="A-umlaut" />
+</layer>
+```
+
+#### Layer Modifier Components
+
+ The following modifier components can be used, separated by spaces.
+
+ - `none` (no modifier)
+ - `alt`
+ - `altL`
+ - `altR`
+ - `caps`
+ - `ctrl`
+ - `ctrlL`
+ - `ctrlR`
+ - `shift`
+ - `other` (matches if no other layers match)
+
+1. `alt` in this specification is referred to on some platforms as "opt" or "option".
+
+2. `none` and `other` may not be combined with any other components.
+
+#### Modifier Left- and Right- keys
+
+1. `L` or `R` indicates a left- or right- side modifier only (such as `altL`)
+ whereas `alt` indicates _either_ left or right alt key (that is, `altL` or `altR`). `ctrl` indicates either left or right ctrl key (that is, `ctrlL` or `ctrlR`).
+
+2. If there are any layers (in the same `form=`) with a modifier `alt`, there may not also be another layer with `altL` or `altR`. Similarly, if there is a layer with a modifier `ctrl`, there may not be a layer with `ctrlL` or `ctrlR`.
+
+3. Left- and right- side modifiers may not be mixed together in a single `modifiers` attribute value, so neither `altL ctrlR` nor `altL altR` is allowed.
+
+4. `shift` indicates either shift key. The left and right shift keys are not distinguishable in this specification.
+
+#### Layer Modifier Matching
+
+Layers are matched exactly based on the modifier keys which are down. For example:
+
+- `none` as a modifier will only match if *all* of the keys `caps`, `alt`, `ctrl` and `shift` are up.
+
+- `alt` as a modifier will only match if either `alt` is down, *and* `caps`, `ctrl`, and `shift` are up.
+
+- `altL ctrl` as a modifier will only match if the left `alt` is down, either `ctrl` is down, *and* `shift` and `caps` are up.
+
+- `other` as a modifier will match if no other layers match.
+
+Multiple modifier sets may be separated by commas. For example, `none, shift caps` will match either no modifiers *or* shift and caps. `ctrlL altL, altR` will match either left-control and left-alt, *or* right-alt.
+
+Keystrokes where there isn’t an explicitly matching layer, and where there is no layer with `other` specified, are ignored.
+
+* * *
+
+### Element: row
+
+A `row` element describes the keys that are present in the row of a keyboard.
+
+**Syntax**
+
+```xml
+<row keys="{keyId} {keyId} …" />
```
> <small>
>
-> Parents: [layer](#Element_layer)
+> Parents: [layer](#element-layer)
+>
> Children: _none_
-> Occurence: required, multiple
+>
+> Occurrence: required, multiple
>
> </small>
_Attribute:_ `keys` (required)
-> This is a string that lists the `keyId` for each of the keys in a row. Key ranges may be contracted to firstkey-lastkey but only for `ISOKey` type `keyId`s. The interpolation between the first and last keys names is entirely numeric. Thus `D00-D03` is equivalent to `D00 D01 D02 D03`. It is an error if the first and last keys do not have the same alphabetic prefix or the last key numeric component is less than or equal to the first key numeric component.
-
-`specialKey` type `keyId`s may take any value within their syntactic constraint. But the following `specialKey`s are reserved to allow applications to identify them and give them special handling:
-
-* `"bksp"`, `"enter"`, `"space"`, `"tab"`, "`esc"`, `"sym"`, `"num"`
-* all the reserved modifier names
-* specialKeys starting with the letter "x" for future reserved names.
+> This is a string that lists the id of [`key` elements](#element-key) for each of the keys in a row, whether those are explicitly listed in the file or are implied. See the `key` documentation for more detail.
+>
+> For non-`touch` forms, the number of keys in each row may not exceed the number of scan codes defined for that row, and the number of rows may not exceed the defined number of rows for that form. See [`scanCodes`](#element-scancodes).
**Example**
Here is an example of a `row` element:
```xml
-<layer modifier="none">
- <row keys="D01-D10" />
- <row keys="C01-C09" />
- <row keys="shift B01-B07 bksp" />
- <row keys="sym A01 smilies A02-A03 enter" />
-</layer>
+<row keys="a z e r t y u i o p caret dollar" />
```
* * *
-### 5.15 <a name="Element_switch" href="#Element_switch">Element: switch</a>
-
-The `switch` element describes a function key that has been included in the layout. It specifies which layer pressing the key switches you to and also what the key looks like.
-
-**Syntax**
-
-```xml
-<switch iso="{specialKey}"
- layer="{Set of Modifier Combinations}"
- display="{show as}" />
-```
+### Element: variables
> <small>
>
-> Parents: [layer](#Element_layer)
-> Children: _none_
-> Occurence: optional, multiple
+> Parents: [keyboard3](#element-keyboard3)
>
+> Children: [import](#element-import), [_special_](tr35.md#special), [string](#element-string), [set](#element-set), [unicodeSet](#element-unicodeset)
+>
+> Occurrence: optional, single
> </small>
-_Attribute:_ `iso` (required)
+This is a container for variables to be used with [transform](#element-transform), [display](#element-display) and [key](#element-key) elements.
-> The `keyId` as specified in one of the `row` elements. This must be a `specialKey` and not an `ISOKey`.
-
-_Attribute:_ `layer` (required)
-
-> The modifier attribute of the resulting `layer` element that describes the layer the user gets switched to.
-
-_Attribute:_ `display` (required)
-
-> A string to be displayed on the key.
+Note that the `id=` attribute must be unique across all children of the `variables` element.
**Example**
-Here is an example of a `switch` element for a shift key:
-
```xml
-<layer modifier="none">
- <row keys="D01-D10" />
- <row keys="C01-C09" />
- <row keys="shift B01-B07 bksp" />
- <row keys="sym A01 smilies A02-A03 enter" />
- <switch iso="shift" layer="shift" display="⇪" />
-</layer>
-<layer modifier="shift">
- <row keys="D01-D10" />
- <row keys="C01-C09" />
- <row keys="shift B01-B07 bksp" />
- <row keys="sym A01 smilies A02-A03 enter" />
- <switch iso="shift" layer="none" display="⇪" />
-</layer>
+<variables>
+ <string id="y" value="yes" /> <!-- a simple string-->
+ <set id="upper" value="A B C D E FF" /> <!-- a set with 6 items -->
+ <unicodeSet id="consonants" value="[कसतनमह]" /> <!-- a UnicodeSet -->
+</variables>
```
* * *
-### 5.16 <a name="Element_vkeys" href="#Element_vkeys">Element: vkeys</a>
-
-On some architectures, applications may directly interact with keys before they are converted to characters. The keys are identified using a virtual key identifier or vkey. The mapping between a physical keyboard key and a vkey is keyboard-layout dependent. For example, a French keyboard would identify the D01 key as being an 'a' with a vkey of 'a' as opposed to 'q' on a US English keyboard. While vkeys are layout dependent, they are not modifier dependent. A shifted key always has the same vkey as its unshifted counterpart. In effect, a key is identified by its vkey and the modifiers active at the time the key was pressed.
-
-**Syntax**
-
-```xml
-<vkeys>
- {a set of vkey elements}
-</vkeys>
-```
+### Element: string
> <small>
>
-> Parents: [layer](#Element_layer), [keyboard](#Element_keyboard)
-> Children: [vkey](#Element_vkey)
-> Occurence: optional, multiple
+> Parents: [variables](#element-variables)
>
-> </small>
-
-_Attribute:_ `type`
-
-> Current values: android, chromeos, osx, und, windows.
-
-For a physical keyboard there is a layout specific default mapping of keys to vkeys. These are listed in a `vkeys` element which takes a list of `vkey` element mappings and is identified by a type. There are different vkey mappings required for different platforms. While `type="windows"` vkeys are very similar to `type="osx"` vkeys, they are not identical and require their own mapping.
-
-The most common model for specifying vkeys is to import a standard mapping, say to the US layout, and then to add a `vkeys` element to change the mapping appropriately for the specific layout.
-
-In addition to describing physical keyboards, vkeys also get used in virtual keyboards. Here the vkey mapping is local to a layer and therefore a `vkeys` element may occur within a `layout` element. In the case where a `layout` element has no `vkeys` element then the resulting mapping may either be empty (none of the keys represent keys that have vkey identifiers) or may fallback to the layout wide vkeys mapping. Fallback only occurs if the layout's `modifier` attribute consists only of standard modifiers as listed as being reserved in the description of the `layout/@modifier` attribute, and if the modifiers are standard for the platform involved. So for Windows, `"cmd"` is a reserved modifier but it is not standard for Windows. Therefore on Windows the vkey mapping for a layout with `@modifier="cmd" `would be empty.
-
-A `vkeys` element consists of a list of `vkey` elements.
-
-* * *
-
-### 5.17 <a name="Element_vkey" href="#Element_vkey">Element: vkey</a>
-
-A `vkey` element describes a mapping between a key and a vkey for a particular platform.
-
-**Syntax**
-
-```xml
-<vkey iso="{iso position}" vkey="{identifier}"
- [modifier="{Set of Modifier Combinations}"] />
-```
-
-> <small>
->
-> Parents: [vkeys](#Element_vkeys)
> Children: _none_
-> Occurence: required, multiple
>
+> Occurrence: optional, multiple
> </small>
-_Attribute:_ `iso` (required)
+> This element represents a single string which is used by the [transform](#element-transform) elements for string matching and substitution, as well as by the [key](#element-key) and [display](#element-display) elements.
-> The ISOkey being mapped.
+_Attribute:_ `id` (required)
-_Attribute:_ `vkey` (required)
+> Specifies the identifier (name) of this string.
+> All ids must be unique across all types of variables.
+>
+> `id` must match `[0-9A-Za-z_]{1,32}`
-> The resultant vkey identifier (the value is platform specific).
+_Attribute:_ `value` (required)
-_Attribute:_ `modifier`
-
-> This attribute may only be used if the parent `vkeys` element is a child of a `layout` element. If present it allows an unmodified key from a layer to represent a modified virtual key.
+> Strings may contain whitespace. However, for clarity, it is recommended to escape spacing marks, even in strings.
+> This attribute may be escaped with `\u` notation, see [Escaping](#escaping).
+> Variables may refer to other string variables if they have been previously defined, using `${string}` syntax.
+> [Markers](#markers) may be included with the `\m{…}` notation.
**Example**
-This example shows some of the mappings for a French keyboard layout:
-
- _shared/win-vkey.xml_
```xml
-<keyboard>
- <vkeys type="windows">
- <vkey iso="D01" vkey="VK_Q" />
- <vkey iso="D02" vkey="VK_W" />
- <vkey iso="C01" vkey="VK_A" />
- <vkey iso="B01" vkey="VK_Z" />
- </vkeys>
-</keyboard>
+<variables>
+ <string id="cluster_hi" value="हि" /> <!-- a string -->
+ <string id="zwnj" value="\u{200C}"/> <!-- single codepoint -->
+ <string id="acute" value="\m{acute}"/> <!-- refer to a marker -->
+ <string id="backquote" value="`"/>
+ <string id="zwnj_acute" value="${zwnj}${acute}" /> <!-- Combine two variables -->
+ <string id="zwnj_sp_acute" value="${zwnj}\u{0020}${acute}" /> <!-- Combine two variables -->
+</variables>
```
-_shared/win-fr.xml_
+These may be then used in multiple contexts:
```xml
-<keyboard>
- <import path="shared/win-vkey.xml">
- <keyMap>
- <map iso="D01" to="a" />
- <map iso="D02" to="z" />
- <map iso="C01" to="q" />
- <map iso="B01" to="w" />
- </keyMap>
- <keyMap modifiers="shift">
- <map iso="D01" to="A" />
- <map iso="D02" to="Z" />
- <map iso="C01" to="Q" />
- <map iso="B01" to="W" />
- </keyMap>
- <vkeys type="windows">
- <vkey iso="D01" vkey="VK_A" />
- <vkey iso="D02" vkey="VK_Z" />
- <vkey iso="C01" vkey="VK_Q" />
- <vkey iso="B01" vkey="VK_W" />
- </vkeys>
-</keyboard>
-```
-
-In the context of a virtual keyboard there might be a symbol layer with the following layout:
-
-```xml
-<keyboard>
- <keyMap>
- <map iso="D01" to="1" />
- <map iso="D02" to="2" />
- ...
- <map iso="D09" to="9" />
- <map iso="D10" to="0" />
- <map iso="C01" to="!" />
- <map iso="C02" to="@" />
- ...
- <map iso="C09" to="(" />
- <map iso="C10" to=")" />
- </keyMap>
- <layer modifier="sym">
- <row keys="D01-D10" />
- <row keys="C01-C09" />
- <row keys="shift B01-B07 bksp" />
- <row keys="sym A00-A03 enter" />
- <switch iso="sym" layer="none" display="ABC" />
- <switch iso="shift" layer="sym+shift" display="&=/<" />
- <vkeys type="windows">
- <vkey iso="D01" vkey="VK_1" />
- ...
- <vkey iso="D10" vkey="VK_0" />
- <vkey iso="C01" vkey="VK_1" modifier="shift" />
- ...
- <vkey iso="C10" vkey="VK_0" modifier="shift" />
- </vkeys>
- </layer>
-</keyboard>
+<!-- as part of a regex -->
+<transform from="${cluster_hi}X" to="X" />
+<transform from="Y" to="${cluster_hi}" />
+…
+<!-- as part of a key bag -->
+<key id="hi_key" output="${cluster_hi}" />
+<key id="acute_key" output="${acute}" />
+…
+<!-- Display ` instead of the non-displayable marker -->
+<display output="${acute}" display="${backquote}" />
```
* * *
-### 5.18 <a name="Element_transforms" href="#Element_transforms">Element: transforms</a>
+### Element: set
-This element defines a group of one or more `transform` elements associated with this keyboard layout. This is used to support features such as dead-keys, character reordering, etc. using a straightforward structure that works for all the keyboards tested, and that results in readable source data.
+> <small>
+>
+> Parents: [variables](#element-variables)
+>
+> Children: _none_
+>
+> Occurrence: optional, multiple
+> </small>
-There can be multiple `<transforms>` elements
+> This element represents a set of strings used by the [transform](#element-transform) elements for string matching and substitution.
-Syntax
+_Attribute:_ `id` (required)
+
+> Specifies the identifier (name) of this set.
+> All ids must be unique across all types of variables.
+>
+> `id` must match `[0-9A-Za-z_]{1,32}`
+
+_Attribute:_ `value` (required)
+
+> The `value` attribute is always a set of strings separated by whitespace, even if there is only a single item in the set, such as `"A"`.
+> Leading and trailing whitespace is ignored.
+> This attribute may be escaped with `\u` notation, see [Escaping](#escaping).
+> Sets may refer to other string variables if they have been previously defined, using `${string}` syntax, or to other previously-defined sets using `$[set]` syntax.
+> Set references must be separated by whitespace: `$[set1]$[set2]` is an error; instead use `$[set1] $[set2]`.
+> [Markers](#markers) may be included with the `\m{…}` notation.
+
+**Examples**
+
+```xml
+<variables>
+ <set id="upper" value="A B CC D E FF " /> <!-- 6 items -->
+ <set id="lower" value="a b c d e f " /> <!-- 6 items -->
+ <set id="upper_or_lower" value="$[upper] $[lower]" /> <!-- Concatenate two sets -->
+ <set id="lower_or_upper" value="$[lower] $[upper]" /> <!-- Concatenate two sets -->
+ <set id="a" value="A"/> <!-- Just one element, an 'A'-->
+ <set id="cluster_or_zwnj" value="${cluster_hi} ${zwnj}"/> <!-- 2 items: "हि \u{200C}"-->
+</variables>
+```
+
+Match "X" followed by any uppercase letter:
+
+```xml
+<transform from="X$[upper]" to="…" />
+```
+
+Map from upper to lower:
+
+```xml
+<transform from="($[upper])" to="$[1:lower]" />
+```
+
+See [transform](#element-transform) for further details and syntax.
+
+* * *
+
+### Element: unicodeSet
+
+> <small>
+>
+> Parents: [variables](#element-variables)
+>
+> Children: _none_
+>
+> Occurrence: optional, multiple
+> </small>
+
+> This element represents a set, using a subset of the [UnicodeSet](tr35.md#Unicode_Sets) format, used by the [`transform`](#element-transform) elements for string matching and substitution.
+> Note important restrictions on the syntax below.
+
+_Attribute:_ `id` (required)
+
+> Specifies the identifier (name) of this unicodeSet.
+> All ids must be unique across all types of variables.
+>
+> `id` must match `[0-9A-Za-z_]{1,32}`
+
+_Attribute:_ `value` (required)
+
+> String value in [UnicodeSet](tr35.md#Unicode_Sets) format.
+> Leading and trailing whitespace is ignored.
+> Variables may refer to other string variables if they have been previously defined, using `${string}` syntax, or to other previously-defined UnicodeSets (not sets) using `$[unicodeSet]` syntax.
+
+**Syntax Note**
+
+- Warning: UnicodeSets look superficially similar to regex character classes as used in [`transform`](#element-transform) elements, but they are different. UnicodeSets must be defined with a `unicodeSet` element, and referenced with the `$[unicodeSet]` notation in transforms. UnicodeSets cannot be specified inline in a transform, and can only be used indirectly by reference to the corresponding `unicodeSet` element.
+- Multi-character strings (`{}`) are not supported, such as `[żġħ{ie}{għ}]`.
+- UnicodeSet property notation (`\p{…}` or `[:…:]`) may **NOT** be used, because that would make implementations dependent on a particular version of Unicode. However, implementations and tools may wish to pre-calculate the value of a particular UnicodeSet, and "freeze" it as explicit code points. The example below of `$[KhmrMn]` matches all nonspacing marks in the `Khmr` script.
+- UnicodeSets may represent a very large number of codepoints. A limit may be set on how many unique range entries may be matched.
+
+**Examples**
+
+```xml
+<variables>
+ <unicodeSet id="consonants" value="[कसतनमह]" /> <!-- unicode set range -->
+ <unicodeSet id="range" value="[a-z D E F G \u{200A}]" /> <!-- a through z, plus a few others -->
+ <unicodeSet id="newrange" value="[$[range]-[G]]" /> <!-- The above range, but not including G -->
+ <unicodeSet id="KhmrMn" value="[\u{17B4}\u{17B5}\u{17B7}-\u{17BD}\u{17C6}\u{17C9}-\u{17D3}\u{17DD}]" /> <!-- [[:Khmr:]&[:Mn:]] as of Unicode 15.0-->
+</variables>
+```
+
+The `unicodeSet` element may not be used as the source or target for mapping operations (`$[1:variable]` syntax).
+The `unicodeSet` element may not be referenced by [`key`](#element-key) and [`display`](#element-display) elements.
+
+* * *
+
+### Element: transforms
+
+This element defines a group of one or more `transform` elements associated with this keyboard layout. This is used to support features such as dead-keys, character reordering, backspace behavior, etc. using a straightforward structure that works for all the keyboards tested, and that results in readable source data.
+
+There can be multiple `<transforms>` elements, but only one for each `type`.
+
+**Syntax**
```xml
<transforms type="...">
- {a set of transform elements}
+ {a set of transform groups}
</transforms>
```
> <small>
>
-> Parents: [keyboard](#Element_keyboard)
-> Children: [transform](#Element_transform)
-> Occurence: optional, multiple
+> Parents: [keyboard3](#element-keyboard3)
+>
+> Children: [import](#element-import), [_special_](tr35.md#special), [transformGroup](#element-transformgroup)
+>
+> Occurrence: optional, multiple
>
> </small>
_Attribute:_ `type` (required)
-> Current values: `simple`, `final`.
-
+> Values: `simple`, `backspace`
There are other keying behaviors that are needed particularly in handing complex orthographies from various parts of the world. The behaviors intended to be covered by the transforms are:
@@ -1234,210 +1655,497 @@
* Error indication. Sometimes a keyboard layout will want to specify to the application that a particular keying sequence in a context is in error and that the application should indicate that that particular keypress is erroneous.
* Backspace handling. There are various approaches to handling the backspace key. An application may treat it as an undo of the last key input, or it may simply delete the last character in the currently output text, or it may use transform rules to tell it how much to delete.
-We consider each transform type in turn and consider attributes to the `<transforms>` element pertinent to that type.
+#### Markers
+
+Markers are placeholders which record some state, but without producing normal visible text output. They were designed particularly to support dead-keys.
+
+The marker ID is any valid `NMTOKEN` (but see [CLDR-17043](https://unicode-org.atlassian.net/browse/CLDR-17043) for future discussion).
+
+Consider the following abbreviated example:
+
+```xml
+ <display output="\m{circ_marker}" display="^" />
+…
+ <key id="circ_key" output="\m{circ_marker}" />
+ <key id="e" output="e" />
+…
+ <transform from="\m{circ_marker}e" to="ê" />
+```
+
+1. The user presses the `circ_key` key. The key can be shown with the keycap `^` due to the `<display>` element.
+
+2. The special marker, `circ_marker`, is added to the end of input context.
+
+ The input context does not match any transforms.
+
+ The input context has:
+
+ - …
+ - marker `circ_marker`
+
+3. Also due to the `<display>` element, implementations can opt to display a visible `^` (perhaps visually distinct from a plain `^` caret). Implementations may opt to display nothing and only store the marker in the input context.
+
+4. The user now presses the `e` key, which is also added to the input context. The input context now has:
+
+ - …
+ - marker `circ_marker`
+ - character `e`
+
+5. Now, the input context matches the transform. The `e` and the marker are replaced with `ê`.
+
+ The input context now has:
+
+ - …
+ - character `ê`
+
+**Using markers to inhibit other transforms**
+
+Sometimes it is desirable to prevent transforms from having an effect.
+Perhaps two different keys output the same characters, with different key or modifier combinations, but only one of them is intended to participate in a transform.
+
+Consider the following case, where pressing the keys `X`, `e` results in `^e`, which is transformed into `ê`.
+
+```xml
+<keys>
+ <key id="X" output="^"/>
+ <key id="e" output="e" />
+</keys>
+<transforms>
+ <transform from="^e" to="ê"/>
+</transforms>
+```
+
+However, what if the user wanted to produce `^e` without the transform taking effect?
+One strategy would be to use a marker, which won’t be visible in the output, but will inhibit the transform.
+
+```xml
+<keys>
+ <key id="caret" output="^\m{no_transform}"/>
+ <key id="X" output="^" />
+ <key id="e" output="e" />
+</keys>
+…
+<transforms>
+ <!-- this wouldn't match the key caret output because of the marker -->
+ <transform from="^e" to="ê"/>
+</transforms>
+```
+
+Pressing `caret` `e` will result in `^e` (with an invisible _no_transform_ marker — note that any name could be used). The `^e` won’t have the transform applied, at least while the marker’s context remains valid.
+
+Another strategy might be to use a marker to indicate where transforms are desired, instead of where they aren't desired.
+
+```xml
+<keys>
+ <key id="caret" output="^"/>
+ <key id="X" output="^\m{transform}"/>
+ <key id="e" output="e" />
+</keys>
+…
+<transforms …>
+ <!-- Won't match ^e without marker. -->
+ <transform from="^\m{transform}e" to="ê"/>
+</transforms>
+```
+
+In this way, only the `X`, `e` keys will produce `^e` with a _transform_ marker (again, any name could be used) which will cause the transform to be applied. One benefit is that navigating to an existing `^` in a document and adding an `e` will result in `^e`, and this output will not be affected by the transform, because there will be no marker present there (remember that markers are not stored with the document but only recorded in memory temporarily during text input).
+
+**Effect of markers on final text**
+
+All markers must be removed before text is returned to the application from the input context.
+If the input context changes, such as if the cursor or mouse moves the insertion point somewhere else, all markers in the input context are removed.
+
+**Implementation Notes**
+
+Ideally, markers are implemented entirely out-of-band from the normal text stream. However, implementations _may_ choose to map each marker to a [Unicode private-use character](https://www.unicode.org/glossary/#private_use_character) for use only within the implementation’s processing and temporary storage in the input context.
+
+For example, the first marker encountered could be represented as U+E000, the second by U+E001 and so on. If a regex processing engine were used, then those PUA characters could be processed through the existing regex processing engine. `[^\u{E000}-\u{E009}]` could be used as an expression to match a character that is not a marker, and `[Ee]\u{E000}` could match `E` or `e` followed by the first marker.
+
+Such implementations must take care to remove all such markers (see prior section) from the resultant text. As well, implementations must take care to avoid conflicts if applications themselves are using PUA characters, such as is often done with not-yet-encoded scripts or characters.
* * *
-### 5.19 <a name="Element_transform" href="#Element_transform">Element: transform</a>
+### Element: transformGroup
-This element must have the `transforms` element as its parent. This element represents a single transform that may be performed using the keyboard layout. A transform is an element that specifies a set of conversions from sequences of code points into one (or more) other code points.. For example, in most French keyboards hitting the "^" dead-key followed by the "e" key produces "ê".
+> <small>
+>
+> Parents: [transforms](#element-transforms)
+>
+> Children: [import](#element-import), [reorder](#element-reorder), [_special_](tr35.md#special), [transform](#element-transform)
+>
+> Occurrence: optional, multiple
+> </small>
+
+A `transformGroup` represents a set of transform elements or reorder elements.
+
+Each `transformGroup` is processed entirely before proceeding to the next one.
+
+
+Each `transformGroup` element, after imports are processed, must have either [reorder](#element-reorder) elements or [transform](#element-transform) elements, but not both. The `<transformGroup>` element may not be empty.
+
+**Examples**
+
+
+#### Example: `transformGroup` with `transform` elements
+
+This is a `transformGroup` that consists of one or more [`transform`](#element-transform) elements, optionally prefaced by one or more `import` elements. See the discussion of those elements for details. `import` elements in this group may not import `reorder` elements.
+
+
+```xml
+<transformGroup>
+ <import path="..."/> <!-- optional import elements-->
+ <transform />
+ <!-- other <transform/> elements -->
+</transformGroup>
+```
+
+
+#### Example: `transformGroup` with `reorder` elements
+
+This is a `transformGroup` that consists of one or more [`reorder`](#element-reorder) elements, optionally prefaced by one or more `import` elements that import `reorder` elements. See the discussion of those elements for details.
+
+`import` elements in this group may not import `transform` elements.
+
+```xml
+<transformGroup>
+ <import path="..."/> <!-- optional import elements-->
+ <reorder ... />
+ <!-- other <reorder> elements -->
+</transformGroup>
+```
+
+* * *
+
+### Element: transform
+
+This element represents a single transform that may be performed using the keyboard layout. A transform is an element that specifies a set of conversions from sequences of code points into (one or more) other code points. For example, in most French keyboards hitting the `^` dead-key followed by the `e` key produces `ê`.
+
+Matches are processed against the "input context", a temporary buffer containing all relevant text up to the insertion point. If the user moves the insertion point, the input context is discarded and recreated from the application’s text buffer. Implementations may discard the input context at any time.
+
+The input context may contain, besides regular text, any [Markers](#markers) as a result of keys or transforms, since the insertion point was moved.
+
+Using regular expression terminology, matches are done as if there was an implicit `$` (match end of buffer) at the end of each pattern. In other words, `<transform from="ke" …>` will not match an input context ending with `…keyboard`, but it will match the last two codepoints of an input context ending with `…awake`.
+
+All of the `transform` elements in a `transformGroup` are tested for a match, in order, until a match is found. Then, the matching element is processed, and then processing proceeds to the **next** `transformGroup`. If none of the `transform` elements match, processing proceeds without modification to the buffer to the **next** `transformGroup`.
**Syntax**
```xml
-<transform from="{combination of characters}" to="{output}"
- [before="{look-behind required match}"]
- [after="{look-ahead required match}"]
- [error="fail"] />
+<transform from="{input rule}" to="{output pattern}"/>
```
> <small>
>
-> Parents: [transforms](#Element_transforms)
+> Parents: [transformGroup](#element-transformgroup)
> Children: _none_
-> Occurence: required, multiple
+> Occurrence: required, multiple
>
> </small>
+
_Attribute:_ `from` (required)
-> The `from` attribute consists of a sequence of elements. Each element matches one character and may consist of a codepoint or a UnicodeSet (both as defined in [UTS#35 section 5.3.3](https://www.unicode.org/reports/tr35/#Unicode_Sets)).
+> The `from` attribute consists of an input rule for matching the input context.
+>
+> The `transform` rule and output pattern use a modified, mostly subsetted, regular expression syntax, with ECMAScript syntax (with the `u` Unicode flag) as its baseline reference (see [MDN-REGEX](https://developer.mozilla.org/docs/Web/JavaScript/Guide/Regular_Expressions)). Differences from regex implementations will be noted.
-For example, suppose there are the following transforms:
+#### Regex-like Syntax
-```
-^e → ê
-^a → â
-^o → ô
-```
+- **Simple matches**
-If the user types a key that produces "\^", the keyboard enters a dead state. When the user then types a key that produces an "e", the transform is invoked, and "ê" is output. Suppose a user presses keys producing "\^" then "u". In this case, there is no match for the "\^u", and the "\^" is output if the `transformFailure` attribute in the `settings` element is set to emit. If there is no transform starting with "u", then it is also output (again only if `transformFailure` is set to emit) and the mechanism leaves the "dead" state.
+ `abc` `𐒵`
-The UI may show an initial sequence of matching characters with a special format, as is done with dead-keys on the Mac, and modify them as the transform completes. This behavior is specified in the `partial` attribute in the `transform` element.
+- **Unicode codepoint escapes**
-Most transforms in practice have only a couple of characters. But for completeness, the behavior is defined on all strings. The following applies when no exact match exists:
+ `\u{1234} \u{012A}`
+ `\u{22} \u{012a} \u{1234A}`
-1. If there could be a longer match if the user were to type additional keys, go into a 'dead' state.
-2. If there could not be a longer match, find the longest actual match, emit the transformed text (if `transformFailure` is set to emit), and start processing again with the remainder.
-3. If there is no possible match, output the first character, and start processing again with the remainder.
+ The hex escaping is case insensitive. The value may not match a surrogate or illegal character, nor a marker character.
+ The form `\u{…}` is preferred as it is the same regardless of codepoint length.
-Suppose that there are the following transforms:
+- **Fixed character classes and escapes**
-```
-ab → x
-abc → y
-abef → z
-bc → m
-beq → n
-```
+ `\s \S \t \r \n \f \v \\ \$ \d \w \D \W \0`
-Here's what happens when the user types various sequence characters:
+ The values of these classes do not change with Unicode versions.
-| Input characters | Result | Comments |
-|------------------|--------|----------|
-| ab | | No output, since there is a longer transform with this as prefix. |
-| abc | y | Complete transform match. |
-| abd | xd | The longest match is "ab", so that is converted and output. The 'd' follows, since it is not the start of any transform. |
-| abeq | xeq | "ab" wins over "beq", since it comes first. That is, there is no longer possible match starting with 'a'. |
-| bc | m | |
+ `\s` for example is exactly `[\f\n\r\t\v\u{00a0}\u{1680}\u{2000}-\u{200a}\u{2028}\u{2029}\u{202f}\u{205f}\u{3000}\u{feff}]`
-Control characters, combining marks and whitespace in this attribute are escaped using the `\u{...}` notation.
+ `\\` and `\$` evaluate to `\` and `$`, respectively.
-_Attribute:_ `to` (required)
+- **Character classes**
-> This attribute represents the characters that are output from the transform. The output can contain more than one character, so you could have `<transform from="´A" to="Fred"/>`
+ `[abc]` `[^def]` `[a-z]` `[ॲऄ-आइ-ऋ]` `[\u{093F}-\u{0944}\u{0962}\u{0963}]`
-Control characters, whitespace (other than the regular space character) and combining marks in this attribute are escaped using the `\u{...}` notation.
+ - supported
+ - no Unicode properties such as `\p{…}`
+ - Warning: Character classes look superficially similar to UnicodeSets as defined in [`unicodeSet`](#element-unicodeset) elements, but they are different. UnicodeSets must be defined with a `unicodeSet` element, and referenced with the `$[unicodeSet]` notation in transforms. UnicodeSets cannot be used directly in a transform.
-Examples
+- **Bounded quantifier**
-```xml
-<keyboard locale="fr-CA-t-k0-CSA-osx">
- <transforms type="simple">
- <transform from="´a" to="á" />
- <transform from="´A" to="Á" />
- <transform from="´e" to="é" />
- <transform from="´E" to="É" />
- <transform from="´i" to="í" />
- <transform from="´I" to="Í" />
- <transform from="´o" to="ó" />
- <transform from="´O" to="Ó" />
- <transform from="´u" to="ú" />
- <transform from="´U" to="Ú" />
- </transforms>
- ...
-</keyboard>
-```
+ `{x,y}`
-```xml
-<keyboard locale="nl-BE-t-k0-chromeos">
- <transforms type="simple">
- <transform from="\u{30c}a" to="ǎ" /> <!-- ̌a → ǎ -->
- <transform from="\u{30c}A" to="Ǎ" /> <!-- ̌A → Ǎ -->
- <transform from="\u{30a}a" to="å" /> <!-- ̊a → å -->
- <transform from="\u{30a}A" to="Å" /> <!-- ̊A → Å -->
- </transforms>
- ...
-</keyboard>
-```
+ `x` and `y` are required single digits representing the minimum and maximum number of occurrences.
+ `x` must be ≥ 0, `y` must be ≥ x and ≥ 1
-_Attribute:_ `before` (optional)
+- **Optional Specifier**
-> This attribute consists of a sequence of elements (codepoint or UnicodeSet) to match the text up to the current position in the text (this is similar to a regex "look behind" assertion: `(?<=a)b` matches a "b" that is preceded by an "a"). The attribute must match for the transform to apply. If missing, no before constraint is applied. The attribute value must not be empty.
+ `?` - equivalent of `{0,1}`
-_Attribute:_ `after` (optional)
+- **Numbered Capture Groups**
-> This attribute consists of a sequence of elements (codepoint or UnicodeSet) and matches as a zero-width assertion after the `@from` sequence. The attribute must match for the transform to apply. If missing, no after constraint is applied. The attribute value must not be empty. When the transform is applied, the string matched by the `@from` attribute is replaced by the string in the `@to` attribute, with the text matched by the `@after` attribute left unchanged. After the change, the current position is reset to just after the text output from the `@to` attribute and just before the text matched by the `@after` attribute. Warning: some legacy implementations may not be able to make such an adjustment and will place the current position after the `@after` matched string.
+ `([abc])([def])` (up to 9 groups)
-_Attribute:_ `error="fail"` (optional)
+ These refer to groups captured as a set, and can be referenced with the `$1` through `$9` operators in the `to=` pattern. May not be nested.
-> If set this attribute indicates that the keyboarding application may indicate an error to the user in some way. Processing may stop and rewind to any state before the key was pressed. If processing does stop, no further transforms on the same input are applied. The `@error` attribute takes the value `"fail"`, or must be absent. If processing continues, the `@to` is used for output as normal. It thus should contain a reasonable value.
+- **Non-capturing groups**
-For example:
+ `(?:thismatches)`
-```xml
-<transform from="\u037A\u037A" to="\u037A" error="fail" />
-```
+- **Nested capturing groups**
-This indicates that it is an error to type two iota subscripts immediately after each other.
+ `(?:[abc]([def]))|(?:[ghi])`
-In terms of how these different attributes work in processing a sequences of transforms, consider the transform:
+ Capture groups may be nested; however, only the innermost group is allowed to be a capture group. The outer group must be a non-capturing group.
-```xml
-<transform before="X" from="Y" after="Z" to="B" />
-```
+- **Disjunctions**
-This would transform the string:
+ `abc|def`
-```
-XYZ → XBZ
-```
+ Match either `abc` or `def`.
-If we mark where the current match position is before and after the transform we see:
+- **Match a single Unicode codepoint**
-```
-X | Y Z → X B | Z
-```
+ `.`
-And a subsequent transform could transform the Z string, looking back (using @before) to match the B.
+ Matches a codepoint, not individual code units. (See the `u` flag in ECMAScript (ECMA-262) regular expressions.)
+ For example, Osage `𐒵` is one match (`.`) not two.
+ Does not match [markers](#markers). (See `\m{.}` and `\m{marker}`, below.)
+
+- **Match the start of the text context**
+
+ `^`
+
+ The start of the context could be the start of a line, a grid cell, or some other formatting boundary.
+ See the description of the input context at the top of [`transform`](#element-transform).
+
+#### Additional Features
+
+The following are additions to standard Regex syntax.
+
+- **Match a Marker**
+
+ `\m{Some_Marker}`
+
+ Matches the named marker.
+ Also see [Markers](#markers).
+
+- **Match a single marker**
+
+ `\m{.}`
+
+ Matches any single marker.
+ Also see [Markers](#markers).
+
+- **String Variables**
+
+ `${zwnj}`
+
+ In this usage, the variable with `id="zwnj"` will be substituted in at this point in the expression. The variable can contain a range, a character, or any other portion of a pattern. If `zwnj` is a simple string, the pattern will match that string at this point.
+
+- **Set and UnicodeSet variables**
+
+ `$[upper]`
+
+ Given a space-separated variable, this syntax will match _any_ of the substrings. This expression may be thought of (and implemented) as if it were a _non-capturing group_. It may, however, be enclosed within a capturing group. For example, the following definition of `$[upper]` will match as if it were written `(?:A|B|CC|D|E|FF)`.
+
+ ```xml
+ <variables>
+ <set id="upper" value=" A B CC D E FF " />
+ </variables>
+ ```
+
+ This expression in a `from=` may be used to **insert a mapped variable**; see below under [Replacement syntax](#replacement-syntax).
+
+#### Disallowed Regex Features
+
+- **Unicode properties**
+
+ `\p{property}` `\P{property}`
+
+ **Rationale:** The behavior of this feature varies by Unicode version, and so would not have predictable results.
+
+ Tooling may choose to suggest an expansion of properties, such as `\p{Mn}` to all nonspacing marks for a certain Unicode version. As well, a set of variables could be constructed in an `import`-able file matching particularly useful Unicode properties.
+
+ ```xml
+ <unicodeSet id="Mn" value="[\u{034F}\u{0591}-\u{05AF}\u{05BD}\u{05C4}\u{05C5}\…]" /> <!-- 1,985 code points -->
+ ```
+
+- **Backreferences**
+
+ `([abc])-\1` `\k<something>`
+
+ **Rationale:** Implementation and cognitive complexity.
+
+- **Unbounded Quantifiers**
+
+ `* + *? +? {1,} {0,}`
+
+ **Rationale:** Implementation and computational complexity.
+
+- **Nested capture groups**
+
+ `((a|b|c)|(d|e|f))`
+
+ **Rationale:** Computational and cognitive complexity.
+
+- **Named capture groups**
+
+ `(?<something>)`
+
+ **Rationale:** Implementation complexity.
+
+- **Assertions** other than `^`
+
+ `\b` `\B` `(?<!…)` …
+
+ **Rationale:** Implementation complexity.
+
+- **End marker**
+
+ `$`
+
+ The end marker can be thought of as being implicitly at the end of every `from=` pattern, matching the insertion point. Transforms do not match past the insertion point.
+
+_Attribute:_ `to`
+
+> This attribute represents the characters that are output from the transform.
+>
+> If this attribute is absent, it indicates that no characters are output, such as with a backspace transform.
+>
+> A final rule such as `<transform from=".*"/>` will remove all context which doesn’t match one of the prior rules.
+
+#### Replacement syntax
+
+Used in the `to=` attribute.
+
+- **Literals**
+
+ `$$ \$ \\` = `$ $ \`
+
+- **Entire matched substring**
+
+ `$0`
+
+- **Insert the specified capture group**
+
+ `$1 $2 $3 … $9`
+
+- **Insert an entire variable**
+
+ `${variable}`
+
+ The entire contents of the named variable will be inserted at this point.
+
+- **Insert a mapped set**
+
+ `$[1:variable]` (Where "1" is any numbered capture group from 1 to 9)
+
+ Maps capture group 1 to variable `variable`. The `from=` side must also contain a grouped variable. This expression may appear anywhere or multiple times in the `to=` pattern.
+
+ **Example**
+
+ ```xml
+ <set id="upper" value="A B CC D E FF G" />
+ <set id="lower" value="a b c d e \u{0192} g" />
+ <!-- note that values may be spaced for ease of reading -->
+ …
+ <transform from="($[upper])" to="$[1:lower]" />
+ ```
+
+ - The capture group on the `from=` side **must** contain exactly one set variable. `from="Q($[upper])X"` can be used (other context before or after the capture group), but `from="(Q$[upper])"` may not be used with a mapped variable and is flagged as an error.
+
+ - The `from=` and `to=` sides of the pattern must both be using `set` variables. There is no way to insert a set literal on either side and avoid using a variable.
+ A UnicodeSet may not be used directly, but must be defined as a `unicodeSet` variable.
+
+ - The two variables (here `upper` and `lower`) must have exactly the same number of whitespace-separated items. Leading and trailing space (such as at the end of `lower`) is ignored. A variable without any spaces is considered to be a set variable of exactly one item.
+
+ - As described in [Additional Features](#additional-features), the `upper` set variable as used here matches as if it is `((?:A|B|CC|D|E|FF|G))`, showing the enclosing capturing group. When text from the input context matches this expression, and all above conditions are met, the mapping proceeds as follows:
+
+ 1. The portion of the input context, such as `CC`, is matched against the above calculated pattern.
+
+ 2. The position within the `from=` variable (`upper`) is calculated. The regex match may not have this information, but the matched substring `CC` can be compared against the tokenized input variable: `A`, `B`, `CC`, `D`, … to find that the 3rd item matches exactly.
+
+ 3. The same position within the `to=` variable (`lower`) is calculated. The 3rd item is `c`.
+
+ 4. `CC` in the input context is replaced with `c`, and processing proceeds to the next `transformGroup`.
+
+- **Emit a marker**
+
+ `\m{Some_marker}`
+
+ Emits the named marker. Also see [Markers](#markers).
* * *
-### 5.20 <a name="Element_reorder" href="#Element_reorder">Element: reorders, reorder</a>
+### Element: reorder
-The reorder transform is applied after all transform except for those with `type="final"`.
+The reorder transform consists of a [`<transformGroup>`](#element-transformgroup) element containing `<reorder>` elements. Multiple such `<transformGroup>` elements may be contained in an enclosing `<transforms>` element.
+
+One or more [`<import>`](#element-import) elements are allowed to precede the `<reorder>` elements.
This transform has the job of reordering sequences of characters that have been typed, from their typed order to the desired output order. The primary concern in this transform is to sort combining marks into their correct relative order after a base, as described in this section. The reorder transforms can be quite complex, keyboard layouts will almost always import them.
The reordering algorithm consists of four parts:
-1. Create a sort key for each character in the input string. A sort key has 4 parts: (primary, index, tertiary).
+1. Create a sort key for each character in the input string. A sort key has 4 parts (primary, index, tertiary, quaternary):
* The **primary weight** is the primary order value.
* The **secondary weight** is the index, a position in the input string, usually of the character itself, but it may be of a character earlier in the string.
* The **tertiary weight** is a tertiary order value (defaulting to 0).
* The **quaternary weight** is the index of the character in the string. This is solely to ensure a stable sort for sequences of characters with the same tertiary weight.
2. Mark each character as to whether it is a prebase character, one that is typed before the base and logically stored after. Thus it will have a primary order > 0.
3. Use the sort key and the prebase mark to identify runs. A run starts with a prefix that contains any prebase characters and a single base character whose primary and tertiary key is 0. The run extends until, but not including, the start of the prefix of the next run or end of the string.
- * `run := prebase* (primary=0 && tertiary=0) ((primary≠0 || tertiary≠0) && !prebase)*`
+ * `run := preBase* (primary=0 && tertiary=0) ((primary≠0 || tertiary≠0) && !preBase)*`
4. Sort the character order of each character in the run based on its sort key.
-The primary order of a character with the Unicode property Combining_Character_Class (ccc) of 0 may well not be 0. In addition, a character may receive a different primary order dependent on context. For example, in the Devanagari sequence ka halant ka, the first ka would have a primary order 0 while the halant ka sequence would give both halant and the second ka a primary order > 0, for example 2. Note that “base” character in this discussion is not a Unicode base character. It is instead a character with primary=0.
+The primary order of a character with the Unicode property `Canonical_Combining_Class` (ccc) of 0 may well not be 0. In addition, a character may receive a different primary order dependent on context. For example, in the Devanagari sequence ka halant ka, the first ka would have a primary order 0 while the halant ka sequence would give both halant and the second ka a primary order > 0, for example 2. Note that “base” character in this discussion is not a Unicode base character. It is instead a character with primary=0.
-In order to get the characters into the correct relative order, it is necessary not only to order combining marks relative to the base character, but also to order some combining marks in a subsequence following another combining mark. For example in Devanagari, a nukta may follow consonant character, but it may also follow a conjunct consisting of consonant, halant, consonant. Notice that the second consonant is not, in this model, the start of a new run because some characters may need to be reordered to before the first base, for example repha. The repha would get primary < 0, and be sorted before the character with order = 0, which is, in the case of Devanagari, the initial consonant of the orthographic syllable.
+In order to get the characters into the correct relative order, it is necessary not only to order combining marks relative to the base character, but also to order some combining marks in a subsequence following another combining mark. For example in Devanagari, a nukta may follow a consonant character, but it may also follow a conjunct consisting of consonant, halant, consonant. Notice that the second consonant is not, in this model, the start of a new run because some characters may need to be reordered to before the first base, for example repha. The repha would get primary < 0, and be sorted before the character with order = 0, which is, in the case of Devanagari, the initial consonant of the orthographic syllable.
-The reorder transform consists of a single element type: `<reorder>` encapsulated in a `<reorders>` element. Each is a rule that matches against a string of characters with the action of setting the various ordering attributes (`primary`, `tertiary`, `tertiary_base`, `prebase`) for the matched characters in the string.
+The reorder transform consists of `<reorder>` elements encapsulated in a `<transformGroup>` element. Each element is a rule that matches against a string of characters with the action of setting the various ordering attributes (`primary`, `tertiary`, `tertiaryBase`, `preBase`) for the matched characters in the string.
+
+The relative ordering of `<reorder>` elements is not significant.
**Syntax**
```xml
-<reorder from="{combination of characters}"
- [before="{look-behind required match}"]
- [after="{look-ahead required match}"]
- [order="{list of weights}"]
- [tertiary="{list of weights}"]
- [tertiary_base="{list of true/false}"]
- [prebase="{list of true/false}"] />
+<transformGroup>
+ <!-- one or more <import/> elements are allowed at this point -->
+ <reorder from="{combination of characters}"
+ before="{look-behind required match}"
+ order="{list of weights}"
+ tertiary="{list of weights}"
+ tertiaryBase="{list of true/false}"
+ preBase="{list of true/false}" />
+ <!-- other <reorder/> elements... -->
+</transformGroup>
```
> <small>
>
-> Parents: [reorders](#Element_reorder)
+> Parents: [transformGroup](#element-transformgroup)
> Children: _none_
-> Occurence: required, multiple
+> Occurrence: optional, multiple
>
> </small>
_Attribute:_ `from` (required)
-> This attribute follows the `transform/@from` attribute and contains a string of elements. Each element matches one character and may consist of a codepoint or a UnicodeSet (both as defined in UTS#35 section 5.3.3).
+> This attribute contains a string of elements. Each element matches one character and may consist of a codepoint or a UnicodeSet (both as defined in [UTS #35 Part One](tr35.md#Unicode_Sets)).
_Attribute:_ `before`
-> This attribute follows the `transform/@before` attribute and contains the element string that must match the string immediately preceding the start of the string that the @from matches.
-
-_Attribute:_ `after`
-
-> This attribute follows the `transform/@after` attribute and contains the element string that must match the string immediately following the end of the string that the `@from` matches.
+> This attribute contains the element string that must match the string immediately preceding the start of the string that the `@from` matches.
_Attribute:_ `order`
@@ -1464,26 +2172,24 @@
>
> A tertiary character receives its primary order and index from a previous character, which it is intended to sort closely after. The sort key for a tertiary character consists of:
>
-> * Primary weight is the primary weight of the primary character
+> * Primary weight is the primary weight of the primary character.
> * Secondary weight is the index of the primary character, not the tertiary character
> * Tertiary weight is the tertiary value for the character.
> * Quaternary weight is the index of the tertiary character.
-_Attribute:_ `tertiary_base`
+_Attribute:_ `tertiaryBase`
-> This attribute is a space separated list of `"true"` or `"false"` values corresponding to each character matched. It is illegal for a tertiary character to have a true `tertiary_base` value. For a primary character it marks that this character may have tertiary characters moved after it. When calculating the secondary weight for a tertiary character, the most recently encountered primary character with a true `tertiary_base` attribute is used. Primary characters with an `@order` value of 0 automatically are treated as having `tertiary_base` true regardless of what is specified for them.
+> This attribute is a space separated list of `"true"` or `"false"` values corresponding to each character matched. It is illegal for a tertiary character to have a true `tertiaryBase` value. For a primary character it marks that this character may have tertiary characters moved after it. When calculating the secondary weight for a tertiary character, the most recently encountered primary character with a true `tertiaryBase` attribute is used. Primary characters with an `@order` value of 0 automatically are treated as having `tertiaryBase` true regardless of what is specified for them.
-_Attribute:_ `prebase`
+_Attribute:_ `preBase`
> This attribute gives the prebase attribute for each character matched. The value may be `"true"` or `"false"` or a space separated list of such values. If missing the value for all the characters matched is false. It is illegal for a tertiary character to have a true prebase value.
>
> If a primary character has a true prebase value then the character is marked as being typed before the base character of a run, even though it is intended to be stored after it. The primary order gives the intended position in the order after the base character, that the prebase character will end up. Thus `@order` shall not be 0. These characters are part of the run prefix. If such characters are typed then, in order to give the run a base character after which characters can be sorted, an appropriate base character, such as a dotted circle, is inserted into the output run, until a real base character has been typed. A value of `"false"` indicates that the character is not a prebase.
-There is no `@error` attribute.
+For `@from` attributes with a match string length greater than 1, the sort key information (`@order`, `@tertiary`, `@tertiaryBase`, `@preBase`) may consist of a space-separated list of values, one for each element matched. The last value is repeated to fill out any missing values. Such a list may not contain more values than there are elements in the `@from` attribute:
-For `@from` attributes with a match string length greater than 1, the sort key information (`@order`, `@tertiary`, `@tertiary_base`, `@prebase`) may consist of a space separated list of values, one for each element matched. The last value is repeated to fill out any missing values. Such a list may not contain more values than there are elements in the `@from` attribute:
-
-```
+```java
if len(@from) < len(@list) then error
else
while len(@from) > len(@list)
@@ -1494,7 +2200,7 @@
**Example**
-For example, consider the word Northern Thai (nod-Lana) word: ᨡ᩠ᩅᩫ᩶ 'roasted'. This is ideally encoded as the following:
+For example, consider the Northern Thai (`nod-Lana`, Tai Tham script) word: ᨡ᩠ᩅᩫ᩶ 'roasted'. This is ideally encoded as the following:
| name | _kha_ | _sakot_ | _wa_ | _o_ | _t2_ |
|------|-------|---------|------|------|------|
@@ -1529,110 +2235,127 @@
We want all of these sequences to end up ordered as the first. To do this, we use the following rules:
```xml
-<reorder from="\u1A60" order="127" /> <!-- max possible order -->
-<reorder from="\u1A6B" order="42" />
-<reorder from="[\u1A75-\u1A79]" order="55" />
-<reorder before="\u1A6B" from="\u1A60\u1A45" order="10" />
-<reorder before="\u1A6B[\u1A75-\u1A79]" from="\u1A60\u1A45" order="10" />
-<reorder before="\u1A6B" from="\u1A60[\u1A75-\u1A79]\u1A45" order="10 55 10" />
+<reorder from="\u{1A60}" order="127" /> <!-- max possible order -->
+<reorder from="\u{1A6B}" order="42" />
+<reorder from="[\u{1A75}-\u{1A79}]" order="55" />
+<reorder before="\u{1A6B}" from="\u{1A60}\u{1A45}" order="10" />
+<reorder before="\u{1A6B}[\u{1A75}-\u{1A79}]" from="\u{1A60}\u{1A45}" order="10" />
+<reorder before="\u{1A6B}" from="\u{1A60}[\u{1A75}-\u{1A79}]\u{1A45}" order="10 55 10" />
```
The first reorder is the default ordering for the _sakot_ which allows for it to be placed anywhere in a sequence, but moves any non-consonants that may immediately follow it, back before it in the sequence. The next two rules give the orders for the top vowel component and tone marks respectively. The next three rules give the _sakot_ and _wa_ characters a primary order that places them before the _o_. Notice particularly the final reorder rule where the _sakot_+_wa_ is split by the tone mark. This rule is necessary in case someone types into the middle of previously normalized text.
-`<reorder>` elements are priority ordered based first on the length of string their `@from` attribute matches and then the sum of the lengths of the strings their `@before` and `@after` attributes match.
+`<reorder>` elements are priority ordered based first on the length of the string their `@from` attribute matches and then on the length of the string their `@before` attribute matches.
-If a layout has two `<reorders>` elements, e.g. from importing one and specifying the second, then `<reorder>` elements are merged. The @from string in a `<reorder>` element describes a set of strings that it matches. This also holds for the `@before` and `@after` attributes. The intersection of two `<reorder>` elements consists of the intersections of their `@from`, `@before` and `@after` string sets. It is illegal for the intersection between any two `<reorder>` elements in the same `<reorders>` element to be non empty, although implementors are encouraged to have pity on layout authors when reporting such errors, since they can be hard to track down.
+#### Using `<import>` with `<reorder>` elements
-If two `<reorder>` elements in two different `<reorders>` elements have a non empty intersection, then they are split and merged. They are split such that where there were two `<reorder>` elements, there are, in effect (but not actuality), three elements consisting of:
+This section describes the impact of using [`import`](#element-import) elements with `<reorder>` elements.
-* `@from`, `@before`, `@after` that match the intersection of the two rules. The other attributes are merged, as described below.
-* `@from`, `@before`, `@after` that match the set of strings in the first rule not in the intersection with the other attributes from the first rule.
-* `@from`, `@before`, `@after` that match the set of strings in the second rule not in the intersection, with the other attributes from the second rule.
+The `@from` string in a `<reorder>` element describes a set of strings that it matches. This also holds for the `@before` attribute. The **intersection** of any two `<reorder>` elements consists of the intersections of their `@from` and `@before` string sets. Tooling should warn users if the intersection between any two `<reorder>` elements in the same `<transformGroup>` element is non-empty prior to processing imports.
-When merging the other attributes, the second rule is taken to have priority (occurring later in the layout description file). Where the second rule does not define the value for a character but the first does, it is taken from the first rule, otherwise it is taken from the second rule.
+If two `<reorder>` elements have a non empty intersection, then they are split and merged. They are split such that where there were two `<reorder>` elements, there are, in effect (but not actuality), three elements consisting of:
-Notice that it is possible for two rules to match the same string, but for them not to merge because the distribution of the string across `@before`, `@from`, and `@after` is different. For example:
+* `@from`, `@before` that match the intersection of the two rules. The other attributes are merged, as described below.
+* `@from`, `@before` that match the set of strings in the first rule not in the intersection with the other attributes from the first rule.
+* `@from`, `@before` that match the set of strings in the second rule not in the intersection, with the other attributes from the second rule.
+
+When merging the other attributes, the second rule is taken to have priority (being an override of the earlier element). Where the second rule does not define the value for a character but the first does, the value is taken from the first rule, otherwise it is taken from the second rule.
+
+Notice that it is possible for two rules to match the same string, but for them not to merge because the distribution of the string across `@before` and `@from` is different. For example, the following would not merge:
```xml
-<reorder before="ab" from="cd" after="e" />
+<reorder before="ab" from="cd" />
+<reorder before="a" from="bcd" />
```
-would not merge with:
-
-```xml
-<reorder before="a" from="bcd" after="e" />
-```
-
-When two `<reorders>` elements merge as the result of an import, the resulting `reorder` elements are sorted into priority order for matching.
+After `<reorder>` elements merge, the resulting `reorder` elements are sorted into priority order for matching.
Consider this fragment from a shared reordering for the Myanmar script:
```xml
-<!-- medial-r -->
-<reorder from="\u103C" order="20" />
+<!-- File: "myanmar-reordering.xml" -->
+<transformGroup>
+ <!-- medial-r -->
+ <reorder from="\u{103C}" order="20" />
-<!-- [medial-wa or shan-medial-wa] -->
-<reorder from="[\u103D\u1082]" order="25" />
+ <!-- [medial-wa or shan-medial-wa] -->
+ <reorder from="[\u{103D}\u{1082}]" order="25" />
-<!-- [medial-ha or shan-medial-wa]+asat = Mon asat -->
-<reorder from="[\u103E\u1082]\u103A" order="27" />
+ <!-- [medial-ha or shan-medial-wa]+asat = Mon asat -->
+ <reorder from="[\u{103E}\u{1082}]\u{103A}" order="27" />
-<!-- [medial-ha or mon-medial-wa] -->
-<reorder from="[\u103E\u1060]" order="27" />
+ <!-- [medial-ha or mon-medial-wa] -->
+ <reorder from="[\u{103E}\u{1060}]" order="27" />
-<!-- [e-vowel or shan-e-vowel] -->
-<reorder from="[\u1031\u1084]" order="30" />
+ <!-- [e-vowel (U+1031) or shan-e-vowel (U+1084)] -->
+ <reorder from="[\u{1031}\u{1084}]" order="30" />
-<reorder from="[\u102D\u102E\u1033-\u1035\u1071-\u1074\u1085\u109D\uA9E5]" order="35" />
+ <reorder from="[\u{102D}\u{102E}\u{1033}-\u{1035}\u{1071}-\u{1074}\u{1085}\u{109D}\u{A9E5}]" order="35" />
+</transformGroup>
```
-A particular Myanmar keyboard layout can have this `reorders` element:
+A particular Myanmar keyboard layout can have these `reorder` elements:
```xml
-<reorders>
+<transformGroup>
+ <import path="myanmar-reordering.xml"/> <!-- import the above transformGroup -->
<!-- Kinzi -->
- <reorder from="\u1004\u103A\u1039" order="-1" />
+ <reorder from="\u{1004}\u{103A}\u{1039}" order="-1" />
<!-- e-vowel -->
- <reorder from="\u1031" prebase="1" />
+ <reorder from="\u{1031}" preBase="1" />
<!-- medial-r -->
- <reorder from="\u103C" prebase="1" />
-</reorders>
+ <reorder from="\u{103C}" preBase="1" />
+</transformGroup>
```
-The effect of this that the _e-vowel_ will be identified as a prebase and will have an order of 30. Likewise a _medial-r_ will be identified as a prebase and will have an order of 20. Notice that a _shan-e-vowel_ will not be identified as a prebase (even if it should be!). The _kinzi_ is described in the layout since it moves something across a run boundary. By separating such movements (prebase or moving to in front of a base) from the shared ordering rules, the shared ordering rules become a self-contained combining order description that can be used in other keyboards or even in other contexts than keyboarding.
+The effect of this is that the _e-vowel_ will be identified as a prebase and will have an order of 30. Likewise a _medial-r_ will be identified as a prebase and will have an order of 20. Notice that a _shan-e-vowel_ (`\u{1084}`) will not be identified as a prebase (even if it should be!). The _kinzi_ is described in the layout since it moves something across a run boundary. By separating such movements (prebase or moving to in front of a base) from the shared ordering rules, the shared ordering rules become a self-contained combining order description that can be used in other keyboards or even in other contexts than keyboarding.
-* * *
+#### Example Post-reorder transforms
-### 5.21 <a name="Element_final" href="#Element_final">Element: transform final</a>
+It may be desired to perform additional processing following reorder operations. This may be accomplished by adding an additional `<transformGroup>` element after the reorders.
-The final transform is applied after the reorder transform. It executes in a similar way to the simple transform with the settings ignored, as if there were no settings in the `<settings>` element.
-
-**Example**
-
-This is an example from Khmer where split vowels are combined after reordering.
+First, a partial example from Khmer where split vowels are combined after reordering.
```xml
-<transforms type="final">
- <transform from="\u17C1\u17B8" to="\u17BE" />
- <transform from="\u17C1\u17B6" to="\u17C4" />
-</transforms>
+…
+<transformGroup>
+ <reorder … />
+ <reorder … />
+ <reorder … />
+ …
+</transformGroup>
+<transformGroup>
+ <transform from="\u{17C1}\u{17B8}" to="\u{17BE}" />
+ <transform from="\u{17C1}\u{17B6}" to="\u{17C4}" />
+</transformGroup>
```
-Another example allows a keyboard implementation to alert or stop people typing two lower vowels in a Burmese cluster:
+Another partial example allows a keyboard implementation to prevent people typing two lower vowels in a Burmese cluster:
```xml
-<transform from="[\u102F\u1030\u1048\u1059][\u102F\u1030\u1048\u1059]" error="fail" />
+…
+<transformGroup>
+ <reorder … />
+ <reorder … />
+ <reorder … />
+ …
+</transformGroup>
+<transformGroup>
+ <transform from="[\u{102F}\u{1030}\u{1048}\u{1059}][\u{102F}\u{1030}\u{1048}\u{1059}]" />
+</transformGroup>
```
* * *
-### 5.22 <a name="Element_backspaces" href="#Element_backspaces">Element: backspaces</a>
+### Backspace Transforms
-The backspace transform is an optional transform that is not applied on input of normal characters, but is only used to perform extra backspace modifications to previously committed text.
+The `<transforms type="backspace">` element describes an optional transform that is not applied on input of normal characters, but is only used to perform extra backspace modifications to previously committed text.
-Keyboarding applications typically, but are not required, to work in one of two modes:
+When the backspace key is pressed, the `<transforms type="backspace">` element (if present) is processed, and then the `<transforms type="simple">` element (if present) as with any other key.
+
+Keyboarding applications typically work, but are not required to, in one of two modes:
**_text entry_**
@@ -1642,222 +2365,107 @@
> text editing happens when a user moves the cursor into some previously entered text which may have been entered by someone else. As such, there is no way to know in which order things were typed, but a user will still want appropriate behaviour when they press backspace. This may involve deleting more than one character or replacing a sequence of characters with a different sequence.
-In the text entry mode, there is no need for any special description of backspace behaviour. A keyboarding application will typically keep a history of previous output states and just revert to the previous state when backspace is hit.
+In text editing mode, different keyboard layouts may behave differently in the same textual context. The backspace transform allows the keyboard layout to specify the effect of pressing backspace in a particular textual context. This is done by specifying a set of backspace rules that match a string before the cursor and replace it with another string. The rules are expressed within a `<transforms type="backspace">` element.
-In text editing mode, different keyboard layouts may behave differently in the same textual context. The backspace transform allows the keyboard layout to specify the effect of pressing backspace in a particular textual context. This is done by specifying a set of backspace rules that match a string before the cursor and replace it with another string. The rules are expressed as `backspace` elements encapsulated in a `backspaces` element.
-
-**Syntax**
```xml
-<backspaces>
- {a set of backspace elements}
-</backspace>
+<transforms type="backspace">
+ <transformGroup>
+ <transform from="{combination of characters}" to="{output}" />
+ </transformGroup>
+</transforms>
```
-> <small>
->
-> Parents: [keyboard](#Element_keyboard)
-> Children: [backspace](#Element_backspace)
-> Occurence: optional, single
->
-> </small>
-
-* * *
-
-### 5.23 <a name="Element_backspace" href="#Element_backspace">Element: backspace</a>
-
-**Syntax**
-
-```xml
-<backspace from="{combination of characters}" [to="{output}"]
- [before="{look-behind required match}"]
- [after="{look-ahead required match}"]
- [error="fail"] />
-```
-
-> <small>
->
-> Parents: [backspaces](#Element_backspaces)
-> Children: _none_
-> Occurence: required, multiple
->
-> </small>
-
-The `backspace` element has the same `@before`, `@from`, `@after`, `@to`, `@error` of the `transform` element. The `@to` is optional with `backspace`.
-
**Example**
-For example, consider deleting a Devanagari ksha:
+For example, consider deleting a Devanagari ksha क्श:
+
+While this character is made up of three codepoints, the following rule causes all three to be deleted by a single press of the backspace.
+
```xml
-<backspaces>
- <backspace from="\u0915\u094D\u0936"/>
-</backspaces>
+<transforms type="backspace">
+ <transformGroup>
+ <transform from="\u{0915}\u{094D}\u{0936}"/>
+ </transformGroup>
+</transforms>
```
-Here there is no `@to` attribute since the whole string is being deleted. This is not uncommon in the backspace transforms.
+Note that the optional attribute `@to` is omitted, since the whole string is being deleted. This is not uncommon in backspace transforms.
A more complex example comes from a Burmese visually ordered keyboard:
```xml
-<backspaces>
- <!-- Kinzi -->
- <backspace from="[\u1004\u101B\u105A]\u103A\u1039" />
+<transforms type="backspace">
+ <transformGroup>
+ <!-- Kinzi -->
+ <transform from="[\u{1004}\u{101B}\u{105A}]\u{103A}\u{1039}" />
- <!-- subjoined consonant -->
- <backspace from="\u1039[\u1000-\u101C\u101E\u1020\u1021\u1050\u1051\u105A-\u105D]" />
+ <!-- subjoined consonant -->
+ <transform from="\u{1039}[\u{1000}-\u{101C}\u{101E}\u{1020}\u{1021}\u{1050}\u{1051}\u{105A}-\u{105D}]" />
- <!-- tone mark -->
- <backspace from="\u102B\u103A" />
+ <!-- tone mark -->
+ <transform from="\u{102B}\u{103A}" />
- <!-- Handle prebases -->
- <!-- diacritics stored before e-vowel -->
- <backspace from="[\u103A-\u103F\u105E-\u1060\u1082]\u1031" to="\u1031" />
+ <!-- Handle prebases -->
+ <!-- diacritics stored before e-vowel -->
+ <transform from="[\u{103A}-\u{103F}\u{105E}-\u{1060}\u{1082}]\u{1031}" to="\u{1031}" />
- <!-- diacritics stored before medial r -->
- <backspace from="[\u103A-\u103B\u105E-\u105F]\u103C" to="\u103C" />
+ <!-- diacritics stored before medial r -->
+ <transform from="[\u{103A}-\u{103B}\u{105E}-\u{105F}]\u{103C}" to="\u{103C}" />
- <!-- subjoined consonant before e-vowel -->
- <backspace from="\u1039[\u1000-\u101C\u101E\u1020\u1021]\u1031" to="\u1031" />
+ <!-- subjoined consonant before e-vowel -->
+ <transform from="\u{1039}[\u{1000}-\u{101C}\u{101E}\u{1020}\u{1021}]\u{1031}" to="\u{1031}" />
- <!-- base consonant before e-vowel -->
- <backspace from="[\u1000-\u102A\u103F-\u1049\u104E]\u1031" to="\uFDDF\u1031" />
+ <!-- base consonant before e-vowel -->
+ <transform from="[\u{1000}-\u{102A}\u{103F}-\u{1049}\u{104E}]\u{1031}" to="\m{prebase}\u{1031}" />
- <!-- subjoined consonant before medial r -->
- <backspace from="\u1039[\u1000-\u101C\u101E\u1020\u1021]\u103C" to="\u103C" />
+ <!-- subjoined consonant before medial r -->
+ <transform from="\u{1039}[\u{1000}-\u{101C}\u{101E}\u{1020}\u{1021}]\u{103C}" to="\u{103C}" />
- <!-- base consonant before medial r -->
- <backspace from="[\u1000-\u102A\u103F-\u1049\u104E]\u103C" to="\uFDDF\u103C" />
+ <!-- base consonant before medial r -->
+ <transform from="[\u{1000}-\u{102A}\u{103F}-\u{1049}\u{104E}]\u{103C}" to="\m{prebase}\u{103C}" />
- <!-- delete lone medial r or e-vowel -->
- <backspace from="\uFDDF[\u1031\u103C]" />
-</backspaces>
+ <!-- delete lone medial r or e-vowel -->
+ <transform from="\m{prebase}[\u{1031}\u{103C}]" />
+ </transformGroup>
+</transforms>
```
The above example is simplified, and doesn't fully handle the interaction between medial-r and e-vowel.
-The character \\uFDDF does not represent a literal character, but is instead a special placeholder, a "filler string". When a keyboard implementation handles a user pressing a key that inserts a prebase character, it also has to insert a special filler string before the prebase to ensure that the prebase character does not combine with the previous cluster. See the reorder transform for details. The precise filler string is implementation dependent. Rather than requiring keyboard layout designers to know what the filler string is, we reserve a special character that the keyboard layout designer may use to reference this filler string. It is up to the keyboard implementation to, in effect, replace that character with the filler string.
+
+> The character `\m{prebase}` does not represent a literal character, but is instead a special marker, used as a "filler string". When a keyboard implementation handles a user pressing a key that inserts a prebase character, it also has to insert a special filler string before the prebase to ensure that the prebase character does not combine with the previous cluster. See the reorder transform for details. See [markers](#markers) for the `\m` syntax.
The first three transforms above delete various ligatures with a single keypress. The other transforms handle prebase characters. There are two in this Burmese keyboard. The transforms delete the characters preceding the prebase character up to base which gets replaced with the prebase filler string, which represents a null base. Finally the prebase filler string + prebase is deleted as a unit.
-The backspace transform is much like other transforms except in its processing model. If we consider the same transform as in the simple transform example, but as a backspace:
+If no specified transform among all `transformGroup`s under the `<transforms type="backspace">` element matches, a default will be used instead — an implied final transform that simply deletes the codepoint at the end of the input context. This implied transform is effectively similar to the following code sample, even though the `*` operator is not actually allowed in `from=`. See the documentation for *Match a single Unicode codepoint* under [transform syntax](#regex-like-syntax) and [markers](#markers), above.
+
+It is important that implementations do not by default delete more than one non-marker codepoint at a time, except in the case of emoji clusters. Note that implementations will vary in the emoji handling due to the iterative nature of successive Unicode releases. See [UTS#51 §2.4.2: Emoji Modifiers in Text](https://www.unicode.org/reports/tr51/#Emoji_Modifiers_in_Text)
```xml
-<backspace before="X" from="Y" after="Z" to="B"/>
-```
+<transforms type="backspace">
+ <!-- Other explicit transforms -->
-This would transform the string:
-
-```
-XYZ → XBZ
-```
-
-If we mark where the current match position is before and after the transform we see:
-
-```
-X Y | Z → X B | Z
-```
-
-Whereas a simple or final transform would then run other transforms in the transform list, advancing the processing position until it gets to the end of the string, the backspace transform only matches a single backspace rule and then finishes.
-
-* * *
-
-## 6 <a name="Element_Heirarchy_Platform_File" href="#Element_Heirarchy_Platform_File">Element Hierarchy - Platform File</a>
-
-There is a separate XML structure for platform-specific configuration elements. The most notable component is a mapping between the hardware key codes to the ISO layout positions for that platform.
-
-### 6.1 <a name="Element_platform" href="#Element_platform">Element: platform</a>
-
-This is the top level element. This element contains a set of elements defined below. A document shall only contain a single instance of this element.
-
-**Syntax**
-
-```xml
-<platform>
- {platform-specific elements}
-</platform>
-```
-
-> <small>
->
-> Parents: _none_
-> Children: [hardwareMap](#Element_hardwareMap)
-> Occurence: required, single
->
-> </small>
-
-
-### 6.2 <a name="Element_hardwareMap" href="#Element_hardwareMap">Element: hardwareMap</a>
-
-This element must have a `platform` element as its parent. This element contains a set of map elements defined below. A document shall only contain a single instance of this element.
-
-**Syntax**
-
-```xml
-<platform>
- <hardwareMap>
- {a set of map elements}
- </hardwareMap>
-</platform>
-```
-
-> <small>
->
-> Parents: [platform](#Element_platform)
-> Children: [map](#Element_hardwareMap_map)
-> Occurence: optional, single
->
-> </small>
-
-### 6.3 <a name="Element_hardwareMap_map" href="#Element_hardwareMap_map">Element: map</a>
-
-This element must have a `hardwareMap` element as its parent. This element maps between a hardware keycode and the corresponding ISO layout position of the key.
-
-**Syntax**
-
-```xml
-<map keycode="{hardware keycode}" iso="{ISO layout position}" />
-```
-
-> <small>
->
-> Parents: [hardwareMap](#Element_hardwareMap)
-> Children: _none_
-> Occurence: required, multiple
-> </small>
-
-_Attribute:_ `keycode` (required)
-
-> The hardware key code value of the key. This value is an integer which is provided by the keyboard driver.
-
-_Attribute:_ `iso` (required)
-
-> The corresponding position of a key using the ISO layout convention where rows are identified by letters and columns are identified by numbers. For example, "D01" corresponds to the "Q" key on a US keyboard. (See the definition at the beginning of the document for a diagram).
-
-**Example**
-
-```xml
-<platform>
- <hardwareMap>
- <map keycode="2" iso="E01" />
- <map keycode="3" iso="E02" />
- <map keycode="4" iso="E03" />
- <map keycode="5" iso="E04" />
- <map keycode="6" iso="E05" />
- <map keycode="7" iso="E06" />
- <map keycode="41" iso="E00" />
- </hardwareMap>
-</platform>
+ <!-- Final implicit backspace transform: Delete the final codepoint. -->
+ <transformGroup>
+ <!-- (?:\m{.})* - matches any number of contiguous markers -->
+ <transform from="(?:\m{.})*.(?:\m{.})*" /> <!-- deletes any number of markers directly on either side of the final pre-caret codepoint -->
+ </transformGroup>
+</transforms>
```
* * *
-## 7 <a name="Invariants" href="#Invariants">Invariants</a>
+## Invariants
Beyond what the DTD imposes, certain other restrictions on the data are imposed on the data.
+Please note the constraints given under each element section above.
+DTD validation alone is not sufficient to verify a keyboard file.
-1. For a given platform, every `map[@iso]` value must be in the hardwareMap if there is one (`_keycodes.xml`)
-2. Every `map[@base]` value must also be in `base[@base]` value
+<!--
+TODO: Rewrite this? Probably push out to each element's section?
+
3. No `keyMap[@modifiers]` value can overlap with another `keyMap[@modifiers]` value.
* eg you can't have `"RAlt Ctrl"` in one `keyMap`, and `"Alt Shift"` in another (because Alt = RAltLAlt).
4. Every sequence of characters in a `transform[@from]` value must be a concatenation of two or more `map[@to]` values.
@@ -1871,10 +2479,9 @@
| Notation | Notes |
|------------------------------------------|-------|
-| Lower case character (eg. _x_ ) | Interpreted as any combination of modifiers. <br/> (eg. _x_ = CtrlShiftOption) |
-| Upper-case character (eg. _Y_ ) | Interpreted as a single modifier key (which may or may not have a L and R variant) <br/> (eg. _Y_ = Ctrl, _RY_ = RCtrl, etc..) |
-| Y? ⇔ Y ∨ ∅ <br/> Y ⇔ LY ∨ RY ∨ LYRY | Eg. Opt? ⇔ ROpt ∨ LOpt ∨ ROptLOpt ∨ ∅ <br/> Eg. Opt ⇔ ROpt ∨ LOpt ∨ ROptLOpt |
-
+| Lower case character (e.g. _x_ ) | Interpreted as any combination of modifiers. <br/> (e.g. _x_ = CtrlShiftOption) |
+| Upper-case character (e.g. _Y_ ) | Interpreted as a single modifier key (which may or may not have an L and R variant) <br/> (e.g. _Y_ = Ctrl, _RY_ = RCtrl, etc.) |
+| Y? ⇔ Y ∨ ∅ <br/> Y ⇔ LY ∨ RY ∨ LYRY | E.g. Opt? ⇔ ROpt ∨ LOpt ∨ ROptLOpt ∨ ∅ <br/> E.g. Opt ⇔ ROpt ∨ LOpt ∨ ROptLOpt |
| Axiom | Example |
|---------------------------------------------|----------------------------------------------|
@@ -1889,57 +2496,421 @@
| xY? ⋁ x ⇒ xY? | |
| xLY? ⋁ x ⇒ xLY? | |
| xLY ⋁ x ⇒ xLY? | |
+-->
* * *
-## 8 <a name="Data_Sources" href="#Data_Sources">Data Sources</a>
-
-Here is a list of the data sources used to generate the initial key map layouts:
-
-###### Table: <a name="Key_Map_Data_Sources" href="#Key_Map_Data_Sources">Key Map Data Sources</a>
-
-| Platform | Source | Notes |
-|----------|--------|-------|
-| Android | Android 4.0 - Ice Cream Sandwich ([https://source.android.com/docs/setup/download/downloading](https://source.android.com/docs/setup/download/downloading)) | Parsed layout files located in packages/inputmethods/LatinIME/java/res |
-| ChromeOS | XKB ([https://www.x.org/wiki/XKB/](https://www.x.org/wiki/XKB/)) | The ChromeOS represents a very small subset of the keyboards available from XKB.
-| Mac OSX | Ukelele bundled System Keyboards ([https://software.sil.org/ukelele/](https://software.sil.org/ukelele/)) | These layouts date from Mac OSX 10.4 and are therefore a bit outdated |
-| Windows | Generated .klc files from the [Microsoft Keyboard Layout Creator](https://www.microsoft.com/en-us/download/details.aspx?id=102134) |
-
-* * *
-
-## 9 <a name="Keyboard_IDs" href="#Keyboard_IDs">Keyboard IDs</a>
+## Keyboard IDs
There is a set of subtags that help identify the keyboards. Each of these are used after the `"t-k0"` subtags to help identify the keyboards. The first tag appended is a mandatory platform tag followed by zero or more tags that help differentiate the keyboard from others with the same locale code.
-### 9.1 <a name="Principles_for_Keyboard_Ids" href="#Principles_for_Keyboard_Ids">Principles for Keyboard Ids</a>
+### Principles for Keyboard IDs
-The following are the design principles for the ids.
+The following are the design principles for the IDs.
1. BCP47 compliant.
- 1. Eg, `en-t-k0-extended`.
-2. Use the minimal language id based on `likelySubtag`s.
- 1. Eg, instead of `en-US-t-k0-xxx`, use `en-t-k0-xxx`. Because there is `<likelySubtag from="en" to="en_Latn_US"/>`, en-US → en.
+ 1. Eg, `en`, `sr-Cyrl`, or `en-t-k0-extended`.
+2. Use the minimal language id based on `likelySubtags` (see [Part 1: Likely Subtags](tr35.md#Likely_Subtags))
+ 1. Eg, instead of `fa-Arab`, use `fa`.
2. The data is in <https://github.com/unicode-org/cldr/blob/main/common/supplemental/likelySubtags.xml>
-3. The platform goes first, if it exists. If a keyboard on the platform changes over time, both are dated, eg `bg-t-k0-chromeos-2011`. When selecting, if there is no date, it means the latest one.
-4. Keyboards are only tagged that differ from the "standard for each platform". That is, for each language on a platform, there will be a keyboard with no subtags other than the platform. Subtags with a common semantics across platforms are used, such as `-extended`, `-phonetic`, `-qwerty`, `-qwertz`, `-azerty`, …
+3. Keyboard files should be platform-independent; however, if a platform id is included, it is the first subtag after `-t-k0-`. If a keyboard on the platform changes over time, both are dated, eg `bg-t-k0-chromeos-2011`. When selecting, if there is no date, it means the latest one.
+4. Keyboards are only tagged that differ from the "standard for each language". That is, for each language on a platform, there will be a keyboard with no subtags. Subtags with common semantics across languages and platforms are used, such as `-extended`, `-phonetic`, `-qwerty`, `-qwertz`, `-azerty`, …
5. In order to get to 8 letters, abbreviations are reused that are already in [bcp47](https://github.com/unicode-org/cldr/blob/main/common/bcp47/) -u/-t extensions and in [language-subtag-registry](https://www.iana.org/assignments/language-subtag-registry) variants, eg for Traditional use `-trad` or `-traditio` (both exist in [bcp47](https://github.com/unicode-org/cldr/blob/main/common/bcp47/)).
-6. Multiple languages cannot be indicated, so the predominant target is used.
+6. Multiple languages cannot be indicated in the locale id, so the predominant target is used.
1. For Finnish + Sami, use `fi-t-k0-smi` or `extended-smi`
+ 2. The [`<locales>`](#element-locales) element may be used to identify additional languages.
7. In some cases, there are multiple subtags, like `en-US-t-k0-chromeos-intl-altgr.xml`
8. Otherwise, platform names are used as a guide.
+**Examples**
+
+```xml
+<!-- Serbian Latin -->
+<keyboard3 locale="sr-Latn"/>
+```
+
+```xml
+<!-- Serbian Cyrillic -->
+<keyboard3 locale="sr-Cyrl"/>
+```
+
+```xml
+<!-- Pan Nigerian Keyboard-->
+<keyboard3 locale="mul-Latn-NG-t-k0-panng">
+ <locales>
+ <locale id="ha"/>
+ <locale id="ig"/>
+ <!-- others … -->
+ </locales>
+</keyboard3>
+```
+
+```xml
+<!-- Finnish Keyboard including Skolt Sami -->
+<keyboard3 locale="fi-t-k0-smi">
+ <locales>
+ <locale id="sms"/>
+ </locales>
+</keyboard3>
+```
+
* * *
-## 10 <a name="Platform_Behaviors_in_Edge_Cases" href="#Platform_Behaviors_in_Edge_Cases">Platform Behaviors in Edge Cases</a>
+## Platform Behaviors in Edge Cases
-| Platform | No modifier combination match is available | No map match is available for key position | Transform fails (ie. if \^d is pressed when that transform does not exist) |
+| Platform | No modifier combination match is available | No map match is available for key position | Transform fails (i.e. if \^d is pressed when that transform does not exist) |
|----------|--------------------------------------------|--------------------------------------------|---------------------------------------------------------------------------|
-| ChromeOS | Fall back to base | Fall back to character in a keyMap with same "level" of modifier combination. If this character does not exist, fall back to (n-1) level. (This is handled data-generation side). <br/> In the spec: No output | No output at all |
-| Mac OSX | Fall back to base (unless combination is some sort of keyboard shortcut, eg. cmd-c) | No output | Both keys are output separately |
+| Chrome OS | Fall back to base | Fall back to character in a keyMap with same "level" of modifier combination. If this character does not exist, fall back to (n-1) level. (This is handled data-generation-side.) <br/> In the specification: No output | No output at all |
+| Mac OS X | Fall back to base (unless combination is some sort of keyboard shortcut, e.g. cmd-c) | No output | Both keys are output separately |
| Windows | No output | No output | Both keys are output separately |
* * *
-Copyright © 2001–2022 Unicode, Inc. All Rights Reserved. The Unicode Consortium makes no expressed or implied warranty of any kind, and assumes no liability for errors or omissions. No liability is assumed for incidental and consequential damages in connection with or arising out of the use of the information or programs contained or accompanying this technical report. The Unicode [Terms of Use](https://www.unicode.org/copyright.html) apply.
+## Keyboard Test Data
+
+Keyboard Test Data allows the keyboard author to provide regression test data to validate the repertoire and behavior of a keyboard. Tooling can run these regression tests against an implementation, and can also be used as part of the development cycle to validate that keyboard changes do not deviate from expected behavior. In the interest of complete coverage, tooling could also indicate whether all keys and gestures in a layout are exercised by the test data.
+
+Test data files have a separate DTD, named `ldmlKeyboardTest3.dtd`. Note that multiple test data files can refer to the same keyboard. Test files should be named similarly to the keyboards which they test, such as `fr_test.xml` to test `fr.xml`.
+
+Sample test data files are located in the `keyboards/test` subdirectory.
+
+The following describes the structure of a keyboard test file.
+
+### Test Doctype
+
+```xml
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE keyboardTest3 SYSTEM "../dtd/ldmlKeyboardTest3.dtd">
+```
+
+The top level element is named `keyboardTest3`.
+
+### Test Element: keyboardTest
+
+> <small>
+>
+> Children: [info](#test-element-info), [repertoire](#test-element-repertoire), [_special_](tr35.md#special), [tests](#test-element-tests)
+> </small>
+
+This is the top level element.
+
+_Attribute:_ `conformsTo` (required)
+
+The `conformsTo` attribute here is the same as on the [`<keyboard3>`](#element-keyboard3) element.
+
+```xml
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE keyboardTest3 SYSTEM "../dtd/ldmlKeyboardTest3.dtd">
+<keyboardTest3 conformsTo="techpreview">
+ …
+</keyboardTest3>
+```
+
+### Test Element: info
+
+> <small>
+>
+> Parents: [keyboardTest](#test-element-keyboardtest)
+>
+> Occurrence: Required, Single
+> </small>
+
+_Attribute:_ `author`
+
+This freeform attribute allows for description of the author or authors of this test file.
+
+_Attribute:_ `keyboard` (required)
+
+This attribute specifies the keyboard’s file name, such as `fr-t-k0-azerty.xml`.
+
+_Attribute:_ `name` (required)
+
+This attribute specifies a name for this overall test file. These names could be output to the user during test execution, used to summarize success and failure, or used to select or deselect test components.
+
+**Example**
+
+```xml
+<info keyboard="fr-t-k0-azerty.xml" author="Team Keyboard" name="fr-test" />
+```
+
+### Test Element: repertoire
+
+> <small>
+>
+> Parents: [keyboardTest](#test-element-keyboardtest)
+>
+> Children: _none_
+>
+> Occurrence: Optional, Multiple
+> </small>
+
+This element represents a repertoire test, to validate the available characters and their reachability. This test ensures that each of the specified characters is somehow typeable on the keyboard, after transforms have been applied. The characters in the repertoire will be matched against the complete set of possible generated outputs, post-transform, of all keys on the keyboard.
+
+_Attribute:_ `name` (required)
+
+This attribute specifies a unique name for this repertoire test. These names could be output to the user during test execution, used to summarize success and failure, or used to select or deselect test components.
+
+_Attribute:_ `type`
+
+This attribute is one of the following:
+
+| type | Meaning |
+|-----------|----------------------------------------------------------------------------------------------------------|
+| default | This is the default, indicates that _any_ gesture or keystroke may be used to generate each character |
+| simple | Each of the characters must be typeable by simple single keystrokes without needing any gestures. |
+| gesture | The characters are typeable by use of any gestures such as flicks, long presses, or multiple taps. |
+| flick | The characters are typeable by use of any `flick` element. |
+| longPress | The characters are typeable by use of any `longPress` value. |
+| multiTap | The characters are typeable by use of any `multiTap` value. |
+| hardware | The characters are typeable by use of any simple keystrokes on any hardware layout. |
+
+_Attribute:_ `chars` (required)
+
+This attribute specifies a list of characters in UnicodeSet format, which is specified in [UTS #35 Part One](tr35.md#Unicode_Sets).
+
+**Example**
+
+```xml
+<repertoire chars="[a b c d e \u{22}]" type="default" />
+
+<!-- taken from CLDR's common/main/fr.xml main exemplars - indicates that all of these characters should be reachable without requiring a gesture.
+Note that the 'name' is arbitrary. -->
+<repertoire name="cldr-fr-main" chars="[a à â æ b c ç d e é è ê ë f g h i î ï j k l m n o ô œ p q r s t u ù û ü v w x y ÿ z]" type="simple" />
+
+<!-- taken from CLDR's common/main/fr.xml auxiliary exemplars - indicates that all of these characters should be reachable even if a gesture is required.-->
+<repertoire name="cldr-fr-auxiliary" chars="[á å ä ã ā ć ē í ì ī ij ñ ó ò ö õ ø ř š ſ ß ú ǔ]" type="gesture" />
+
+```
+
+Note: CLDR’s extensive [exemplar set](tr35-general.md#Character_Elements) data may be useful in validating a language’s repertoire against a keyboard. Tooling may wish to make use of this data in order to suggest recommended repertoire values for a language.
+
+### Test Element: tests
+
+> <small>
+>
+> Parents: [keyboardTest](#test-element-keyboardtest)
+>
+> Children: [_special_](tr35.md#special), [test](#test-element-test)
+>
+> Occurrence: Optional, Multiple
+> </small>
+
+This element specifies a particular suite of `<test>` elements.
+
+_Attribute:_ `name` (required)
+
+This attribute specifies a unique name for this suite of tests. These names could be output to the user during test execution, used to summarize success and failure, or used to select or deselect test components.
+
+**Example**
+
+```xml
+<tests name="key-tests">
+ <test name="key-test">
+ …
+ </test>
+ <test name="gestures-test">
+ …
+ </test>
+</tests>
+<tests name="transform tests">
+ <test name="transform test">
+ …
+ </test>
+</tests>
+```
+
+### Test Element: test
+
+> <small>
+>
+> Parents: [tests](#test-element-tests)
+>
+> Children: [startContext](#test-element-startcontext), [emit](#test-element-emit), [keystroke](#test-element-keystroke), [backspace](#test-element-backspace), [check](#test-element-check), [_special_](tr35.md#special)
+>
+> Occurrence: Required, Multiple
+> </small>
+
+This element specifies a specific isolated regression test. Multiple test elements do not interact with each other.
+
+The order of child elements is significant, with cumulative effects: they must be processed from first to last.
+
+_Attribute:_ `name` (required)
+
+This attribute specifies a unique name for this particular test. These names could be output to the user during test execution, used to summarize success and failure, or used to select or deselect test components.
+
+**Example**
+
+```xml
+<test name="key-test">…</test>
+```
+
+### Test Element: startContext
+
+This element specifies pre-existing text in a document, as if prior to the user’s insertion point. This is useful for testing transforms and reordering. If not specified, the startContext can be considered to be the empty string ("").
+
+> <small>
+>
+> Parents: [test](#test-element-test)
+>
+> Children: _none_
+>
+> Occurrence: Optional, Single
+> </small>
+
+_Attribute:_ `to` (required)
+
+Specifies the starting context. This text may be escaped with `\u` notation, see [Escaping](#escaping).
+
+**Example**
+
+```xml
+<startContext to="abc\u{0022}"/>
+```
+
+
+### Test Element: keystroke
+
+> <small>
+>
+> Parents: [test](#test-element-test)
+>
+> Children: _none_
+>
+> Occurrence: Optional, Multiple
+> </small>
+
+This element represents a single keystroke or other gesture event, identified by a particular key element.
+
+Optionally, one of the gesture attributes, either `flick`, `longPress`, or `tapCount` may be specified. If none of the gesture attributes are specified, then a regular keypress is effected on the key. It is an error to specify more than one gesture attribute.
+
+If a key is not found, or a particular gesture has no definition, the output should behave as if the user attempted to perform such an action. For example, an unspecified `flick` would result in no output.
+
+When a key is found, processing continues with the transform and other elements before updating the test output buffer.
+
+_Attribute:_ `key` (required)
+
+This attribute specifies a key by means of the key’s `id` attribute.
+
+_Attribute:_ `flick`
+
+This attribute specifies a flick gesture to be performed on the specified key instead of a keypress, such as `e` or `nw se`. This value corresponds to the `directions` attribute of the [`<flickSegment>`](#element-flicksegment) element.
+
+_Attribute:_ `longPress`
+
+This attribute specifies that a long press gesture should be performed on the specified key instead of a keypress. For example, `longPress="2"` indicates that the second character in a longpress series should be chosen. `longPress="0"` indicates that the `longPressDefault` value, if any, should be chosen. This corresponds to `longPress` and `longPressDefault` on [`<key>`](#element-key).
+
+_Attribute:_ `tapCount`
+
+This attribute specifies that a multi-tap gesture should be performed on the specified key instead of a keypress. For example, `tapCount="3"` indicates that the key should be tapped three times in rapid succession. This corresponds to `multiTap` on [`<key>`](#element-key). The minimum tapCount is 2.
+
+**Example**
+
+```xml
+<keystroke key="q"/>
+<keystroke key="doublequote"/>
+<keystroke key="s" flick="nw se"/>
+<keystroke key="e" longPress="1"/>
+<keystroke key="E" tapCount="2"/>
+```
+
+### Test Element: emit
+
+> <small>
+>
+> Parents: [test](#test-element-test)
+>
+> Children: _none_
+>
+> Occurrence: Optional, Multiple
+> </small>
+
+This element also represents an input event, except that the input is specified in terms of textual value rather than key or gesture identity. This element is particularly useful for testing transforms.
+
+Processing of the specified text continues with the transform and other elements before updating the test output buffer.
+
+_Attribute:_ `to` (required)
+
+This attribute specifies a string of output text representing a single keystroke or gesture. This string is intended to match the output of a `key`, `flick`, `longPress` or `multiTap` element or attribute.
+Tooling should give a hint if this attribute does not match at least one keystroke or gesture. Note that the specified text is not injected directly into the output buffer.
+
+This attribute may be escaped with `\u` notation, see [Escaping](#escaping).
+
+**Example**
+
+```xml
+<emit to="s"/>
+```
+
+
+### Test Element: backspace
+
+> <small>
+>
+> Parents: [test](#test-element-test)
+>
+> Children: _none_
+>
+> Occurrence: Optional, Multiple
+> </small>
+
+This element represents a backspace action, as if the user typed the backspace key.
+
+**Example**
+
+```xml
+<backspace/>
+```
+
+### Test Element: check
+
+> <small>
+>
+> Parents: [test](#test-element-test)
+>
+> Children: _none_
+>
+> Occurrence: Optional, Multiple
+> </small>
+
+This element represents a check on the current output buffer.
+
+_Attribute:_ `result` (required)
+
+This attribute specifies the expected resultant text in a document after processing this event and all prior events, and including any `startContext` text. This text may be escaped with `\u` notation, see [Escaping](#escaping).
+
+**Example**
+
+```xml
+<check result="abc\u{0022}s\u{0022}•éÈ"/>
+```
+
+
+### Test Examples
+
+```xml
+
+<test name="spec-sample">
+ <startContext to="abc\u{0022}"/>
+ <!-- simple, key specified by to -->
+ <emit to="s"/>
+ <check result="abc\u{0022}s"/>
+ <!-- simple, key specified by id -->
+ <keystroke key="doublequote"/>
+ <check result="abc\u{0022}s\u{0022}"/>
+ <!-- flick -->
+ <keystroke key="s" flick="nw se"/>
+ <check result="abc\u{0022}s\u{0022}•"/>
+ <!-- longPress -->
+ <keystroke key="e" longPress="1"/>
+ <check result="abc\u{0022}s\u{0022}•é"/>
+ <!-- multiTap -->
+ <keystroke key="E" tapCount="2"/>
+ <check result="abc\u{0022}s\u{0022}•éÈ"/>
+</test>
+```
+
+* * *
+
+Copyright © 2001–2023 Unicode, Inc. All Rights Reserved. The Unicode Consortium makes no expressed or implied warranty of any kind, and assumes no liability for errors or omissions. No liability is assumed for incidental and consequential damages in connection with or arising out of the use of the information or programs contained or accompanying this technical report. The Unicode [Terms of Use](https://www.unicode.org/copyright.html) apply.
Unicode and the Unicode logo are trademarks of Unicode, Inc., and are registered in some jurisdictions.
diff --git a/docs/ldml/tr35-numbers.anchors.json b/docs/ldml/tr35-numbers.anchors.json
new file mode 100644
index 0000000..312ba09
--- /dev/null
+++ b/docs/ldml/tr35-numbers.anchors.json
@@ -0,0 +1,102 @@
+[
+ "Approximate_Number_Formatting",
+ "approximate-number-formatting",
+ "Collapsing_Number_Ranges",
+ "collapsing-number-ranges",
+ "Compact_Number_Formats",
+ "compact-number-formats",
+ "Contents",
+ "contents-of-part-3-numbers",
+ "currencies",
+ "Currencies",
+ "Currency_Formats",
+ "currency-formats",
+ "default-numbering-system",
+ "defaultNumberingSystem",
+ "Examples_of_minimumGroupingDigits",
+ "examples-of-minimumgroupingdigits",
+ "Explicit_0_1_rules",
+ "Explicit_Plus",
+ "explicit-0-and-1-rules",
+ "explicit-plus-signs",
+ "formatting",
+ "Formatting",
+ "Language_Plural_Rules",
+ "language-plural-rules",
+ "Minimal_Pairs",
+ "minimal-pairs",
+ "Miscellaneous_Patterns",
+ "miscellaneous-patterns",
+ "Number_Elements",
+ "Number_Format_Patterns",
+ "Number_Formats",
+ "Number_Pattern_Character_Definitions",
+ "Number_Pattern_Examples",
+ "Number_Patterns",
+ "Number_Range_Formatting",
+ "Number_Symbols",
+ "number-elements",
+ "number-format-patterns",
+ "number-formats",
+ "number-patterns",
+ "number-range-formatting",
+ "number-symbols",
+ "Numbering_Systems",
+ "numbering-systems",
+ "operands",
+ "Operands",
+ "other-numbering-systems",
+ "otherNumberingSystems",
+ "padding",
+ "Padding",
+ "Parsing_Numbers",
+ "parsing-numbers",
+ "parts",
+ "Parts",
+ "Plural_Operand_Examples",
+ "Plural_Operand_Meanings",
+ "Plural_Ranges",
+ "Plural_Rules_Examples",
+ "Plural_rules_syntax",
+ "Plural_Samples_Examples",
+ "plural-ranges",
+ "plural-rules-syntax",
+ "Quoting_Rules",
+ "quoting-rules",
+ "Range_Pattern_Processing",
+ "range-pattern-processing",
+ "relations",
+ "Relations",
+ "Relations_Examples",
+ "rounding",
+ "Rounding",
+ "Rule-Based_Number_Formatting",
+ "rule-based-number-formatting",
+ "Sample_Patterns_and_Results",
+ "samples",
+ "Samples",
+ "sci",
+ "scientific-notation",
+ "sigdig",
+ "Significant_Digits_Examples",
+ "significant-digits",
+ "Special_Pattern_Characters",
+ "special-pattern-characters",
+ "status",
+ "summary",
+ "Supplemental_Currency_Data",
+ "supplemental-currency-data",
+ "table-number-pattern-character-definitions",
+ "table-number-pattern-examples",
+ "table-plural-operand-examples",
+ "table-plural-operand-meanings",
+ "table-plural-rules-examples",
+ "table-plural-samples-examples",
+ "table-relations-examples",
+ "table-sample-patterns-and-results",
+ "table-significant-digits-examples",
+ "unicode-locale-data-markup-language-ldmlpart-3-numbers",
+ "unicode-technical-standard-35",
+ "Using_cardinals",
+ "using-cardinals"
+]
\ No newline at end of file
diff --git a/docs/ldml/tr35-numbers.md b/docs/ldml/tr35-numbers.md
index 9c36002..675a0c1 100644
--- a/docs/ldml/tr35-numbers.md
+++ b/docs/ldml/tr35-numbers.md
@@ -2,8 +2,8 @@
# Unicode Locale Data Markup Language (LDML)<br/>Part 3: Numbers
-|Version|42 |
-|-------|------------------|
+|Version|44.1 |
+|-------|----------|
|Editors|Shane F. Carr (<a href="mailto:[email protected]">[email protected]</a>) and <a href="tr35.md#Acknowledgments">other CLDR committee members|
For the full header, summary, and status, see [Part 1: Core](tr35.md).
@@ -16,7 +16,13 @@
### _Status_
-_This document has been reviewed by Unicode members and other interested parties, and has been approved for publication by the Unicode Consortium. This is a stable document and may be used as reference material or cited as a normative reference by other specifications._
+_This is a draft document which may be updated, replaced, or superseded by other documents at any time.
+<!-- _This is a draft document which may be updated, replaced, or superseded by other documents at any time.
+Publication does not imply endorsement by the Unicode Consortium.
+This is not a stable document; it is inappropriate to cite this document as other than a work in progress._ -->
+
+_This document has been reviewed by Unicode members and other interested parties, and has been approved for publication by the Unicode Consortium.
+This is a stable document and may be used as reference material or cited as a normative reference by other specifications._
> _**A Unicode Technical Standard (UTS)** is an independent specification. Conformance to the Unicode Standard does not imply conformance to any UTS._
@@ -37,53 +43,53 @@
## <a name="Contents" href="#Contents">Contents of Part 3, Numbers</a>
-* 1 [Numbering Systems](#Numbering_Systems)
-* 2 [Number Elements](#Number_Elements)
- * 2.1 [Default Numbering System](#defaultNumberingSystem)
- * 2.2 [Other Numbering Systems](#otherNumberingSystems)
- * 2.3 [Number Symbols](#Number_Symbols)
- * 2.4 [Number Formats](#Number_Formats)
- * 2.4.1 [Compact Number Formats](#Compact_Number_Formats)
- * 2.4.2 [Currency Formats](#Currency_Formats)
- * 2.5 [Miscellaneous Patterns](#Miscellaneous_Patterns)
- * 2.6 [Minimal Pairs](#Minimal_Pairs)
-* 3 [Number Format Patterns](#Number_Format_Patterns)
- * 3.1 [Number Patterns](#Number_Patterns)
+* [Numbering Systems](#Numbering_Systems)
+* [Number Elements](#Number_Elements)
+ * [Default Numbering System](#defaultNumberingSystem)
+ * [Other Numbering Systems](#otherNumberingSystems)
+ * [Number Symbols](#Number_Symbols)
+ * [Number Formats](#Number_Formats)
+ * [Compact Number Formats](#Compact_Number_Formats)
+ * [Currency Formats](#Currency_Formats)
+ * [Miscellaneous Patterns](#Miscellaneous_Patterns)
+ * [Minimal Pairs](#Minimal_Pairs)
+* [Number Format Patterns](#Number_Format_Patterns)
+ * [Number Patterns](#Number_Patterns)
* Table: [Number Pattern Examples](#Number_Pattern_Examples)
- * 3.2 [Special Pattern Characters](#Special_Pattern_Characters)
+ * [Special Pattern Characters](#Special_Pattern_Characters)
* Table: [Number Pattern Character Definitions](#Number_Pattern_Character_Definitions)
* Table: [Sample Patterns and Results](#Sample_Patterns_and_Results)
- * 3.2.1 [Explicit Plus Signs](#Explicit_Plus)
- * 3.3 [Formatting](#Formatting)
- * 3.4 [Scientific Notation](#sci)
- * 3.5 [Significant Digits](#sigdig)
+ * [Explicit Plus Signs](#Explicit_Plus)
+ * [Formatting](#Formatting)
+ * [Scientific Notation](#sci)
+ * [Significant Digits](#sigdig)
* Table: [Significant Digits Examples](#Significant_Digits_Examples)
- * 3.6 [Padding](#Padding)
- * 3.7 [Rounding](#Rounding)
- * 3.8 [Quoting Rules](#Quoting_Rules)
-* 4 [Currencies](#Currencies)
- * 4.1 [Supplemental Currency Data](#Supplemental_Currency_Data)
-* 5 [Language Plural Rules](#Language_Plural_Rules)
+ * [Padding](#Padding)
+ * [Rounding](#Rounding)
+ * [Quoting Rules](#Quoting_Rules)
+* [Currencies](#Currencies)
+ * [Supplemental Currency Data](#Supplemental_Currency_Data)
+* [Language Plural Rules](#Language_Plural_Rules)
* [Explicit 0 and 1 rules](#Explicit_0_1_rules)
- * 5.1 [Plural rules syntax](#Plural_rules_syntax)
- * 5.1.1 [Operands](#Operands)
+ * [Plural rules syntax](#Plural_rules_syntax)
+ * [Operands](#Operands)
* Table: [Plural Operand Meanings](#Plural_Operand_Meanings)
* Table: [Plural Operand Examples](#Plural_Operand_Examples)
- * 5.1.2 [Relations](#Relations)
+ * [Relations](#Relations)
* Table: [Relations Examples](#Relations_Examples)
* Table: [Plural Rules Examples](#Plural_Rules_Examples)
- * 5.1.3 [Samples](#Samples)
+ * [Samples](#Samples)
* Table: [Plural Samples Examples](#Plural_Samples_Examples)
- * 5.1.4 [Using Cardinals](#Using_cardinals)
- * 5.2 [Plural Ranges](#Plural_Ranges)
-* 6 [Rule-Based Number Formatting](#Rule-Based_Number_Formatting)
-* 7 [Parsing Numbers](#Parsing_Numbers)
-* 8 [Number Range Formatting](#Number_Range_Formatting)
- * 8.1 [Approximate Number Formatting](#Approximate_Number_Formatting)
- * 8.2 [Collapsing Number Ranges](#Collapsing_Number_Ranges)
- * 8.3 [Range Pattern Processing](#Range_Pattern_Processing)
+ * [Using Cardinals](#Using_cardinals)
+ * [Plural Ranges](#Plural_Ranges)
+* [Rule-Based Number Formatting](#Rule-Based_Number_Formatting)
+* [Parsing Numbers](#Parsing_Numbers)
+* [Number Range Formatting](#Number_Range_Formatting)
+ * [Approximate Number Formatting](#Approximate_Number_Formatting)
+ * [Collapsing Number Ranges](#Collapsing_Number_Ranges)
+ * [Range Pattern Processing](#Range_Pattern_Processing)
-## 1 <a name="Numbering_Systems" href="#Numbering_Systems">Numbering Systems</a>
+## <a name="Numbering_Systems" href="#Numbering_Systems">Numbering Systems</a>
```xml
<!ELEMENT numberingSystems ( numberingSystem* ) >
@@ -122,7 +128,7 @@
For general information about the numbering system data, including the BCP47 identifiers, see the main document _Section Q.1.1 [Numbering System Data](tr35.md#Numbering%20System%20Data)._
-## 2 <a name="Number_Elements" href="#Number_Elements">Number Elements</a>
+## <a name="Number_Elements" href="#Number_Elements">Number Elements</a>
```xml
<!ELEMENT numbers ( alias | ( defaultNumberingSystem*, otherNumberingSystems*, minimumGroupingDigits*, symbols*, decimalFormats*, scientificFormats*, percentFormats*, currencyFormats*, currencies?, miscPatterns*, minimalPairs*, special* ) ) >
@@ -130,7 +136,7 @@
The numbers element supplies information for formatting and parsing numbers and currencies. It has the following sub-elements: `<defaultNumberingSystem>`, `<otherNumberingSystems>`, `<symbols>`, `<decimalFormats>`, `<scientificFormats>`, `<percentFormats>`, `<currencyFormats>`, and `<currencies>`. The currency IDs are from [[ISO4217](tr35.md#ISO4217)] (plus some additional common-use codes). For more information, including the pattern structure, see _[Section 3: Number Format Patterns](#Number_Format_Patterns)_.
-### 2.1 <a name="defaultNumberingSystem" href="#defaultNumberingSystem">Default Numbering System</a>
+### <a name="defaultNumberingSystem" href="#defaultNumberingSystem">Default Numbering System</a>
```xml
<!ELEMENT defaultNumberingSystem ( #PCDATA )>
@@ -138,7 +144,7 @@
This element indicates which numbering system should be used for presentation of numeric quantities in the given locale.
-### 2.2 <a name="otherNumberingSystems" href="#otherNumberingSystems">Other Numbering Systems</a>
+### <a name="otherNumberingSystems" href="#otherNumberingSystems">Other Numbering Systems</a>
```xml
<!ELEMENT otherNumberingSystems ( alias | ( native*, traditional*, finance*)) >
@@ -167,7 +173,7 @@
For more information on numbering systems and their definitions, see _[Section 1: Numbering Systems](#Numbering_Systems)_.
-### 2.3 <a name="Number_Symbols" href="#Number_Symbols">Number Symbols</a>
+### <a name="Number_Symbols" href="#Number_Symbols">Number Symbols</a>
```xml
<!ELEMENT symbols (alias | (decimal*, group*, list*, percentSign*, nativeZeroDigit*, patternDigit*, plusSign*, minusSign*, approximatelySign*, exponential*, superscriptingExponent*, perMille*, infinity*, nan*, currencyDecimal*, currencyGroup*, timeSeparator*, special*)) >
@@ -187,7 +193,7 @@
**list**
-> symbol used to separate numbers in a list intended to represent structured data such as an array; must be different from the **decimal** value. This list separator is for “non-linguistic” usage as opposed to the listPatterns for “linguistic” lists (e.g. “Bob, Carol, and Ted”) described in Part 2, _Section 11 [List Patterns](tr35-general.md#ListPatterns)_.
+> symbol used to separate numbers in a list intended to represent structured data such as an array; must be different from the **decimal** value. This list separator is for “non-linguistic” usage as opposed to the listPatterns for “linguistic” lists (e.g. “Bob, Carol, and Ted”) described in Part 2, _[List Patterns](tr35-general.md#ListPatterns)_.
**percentSign**
@@ -274,7 +280,7 @@
The `numberSystem` attribute is used to specify that the given number symbols are to be used when the given numbering system is active. Number symbols can only be defined for numbering systems of the "numeric" type, since any special symbols required for an algorithmic numbering system should be specified by the RBNF formatting rules used for that numbering system. By default, number symbols without a specific `numberSystem` attribute are assumed to be used for the "latn" numbering system, which is western (ASCII) digits. Locales that specify a numbering system other than "latn" as the default should also specify number formatting symbols that are appropriate for use within the context of the given numbering system. For example, a locale that uses the Arabic-Indic digits as its default would likely use an Arabic comma for the grouping separator rather than the ASCII comma.
For more information on numbering systems and their definitions, see _[Section 1: Numbering Systems](#Numbering_Systems)_.
-### 2.4 <a name="Number_Formats" href="#Number_Formats">Number Formats</a>
+### <a name="Number_Formats" href="#Number_Formats">Number Formats</a>
```xml
<!ELEMENT decimalFormats (alias | (default*, decimalFormatLength*, special*)) >
@@ -342,7 +348,7 @@
The `numberSystem` attribute is used to specify that the given number formatting pattern(s) are to be used when the given numbering system is active. By default, number formatting patterns without a specific `numberSystem` attribute are assumed to be used for the "latn" numbering system, which is western (ASCII) digits. Locales that specify a numbering system other than "latn" as the default should also specify number formatting patterns that are appropriate for use within the context of the given numbering system.
For more information on numbering systems and their definitions, see _[Section 1: Numbering Systems](#Numbering_Systems)_.
-#### 2.4.1 <a name="Compact_Number_Formats" href="#Compact_Number_Formats">Compact Number Formats</a>
+#### <a name="Compact_Number_Formats" href="#Compact_Number_Formats">Compact Number Formats</a>
A pattern `type` attribute is used for _compact number formats_, such as the following:
@@ -401,7 +407,7 @@
The short format is designed for UI environments where space is at a premium, and should ideally result in a formatted string no more than about 6 em wide (with no fractional digits).
-#### 2.4.2 <a name="Currency_Formats" href="#Currency_Formats">Currency Formats</a>
+#### <a name="Currency_Formats" href="#Currency_Formats">Currency Formats</a>
Patterns for use with currency formatting:
@@ -415,7 +421,7 @@
<!ELEMENT currencyPatternAppendISO ( #PCDATA ) >
```
-The following additional elements were intended to allow proper placement of the currency symbol relative to the numeric quantity. These are specified in the root locale and typically not overridden in any other locale. However, as of CLDR 42, the preferred approach to controlling placement of the currency symbol is use of the `alt="alphaNextToNumber"` variant for `currencyFormat` `pattern`s. See below and _[Section 4 - Currencies](#Currencies)_ for additional information on the use of these options.
+The following additional elements were intended to allow proper placement of the currency symbol relative to the numeric quantity. These are specified in the root locale and typically not overridden in any other locale. However, as of CLDR 42, the preferred approach to controlling placement of the currency symbol is use of the `alt="alphaNextToNumber"` variant for `currencyFormat` `pattern`s. See below and _[Currencies](#Currencies)_ for additional information on the use of these options.
```xml
<!ELEMENT currencySpacing (alias | (beforeCurrency*, afterCurrency*, special*)) >
@@ -426,7 +432,7 @@
<!ELEMENT insertBetween ( #PCDATA ) >
```
-In addition to a standard currency format, in which negative currency amounts might typically be displayed as something like “-$3.27”, locales may provide an "accounting" form, in which for "en_US" the same example would appear as “($3.27)”.
+In addition to a standard currency format, in which negative currency amounts might typically be displayed as something like “-$3.27”, locales may provide an "accounting" form, in which for "en_US" the same example would appear as “($3.27)”. The locale keyword "cf" can be used to select the standard or accounting form, see [Unicode Currency Format Identifier](tr35.md#UnicodeCurrencyFormatIdentifier).
```xml
<currencyFormats>
@@ -467,9 +473,9 @@
The `currencyPatternAppendISO` element provides a pattern that can be used to combine currency format that uses a currency symbol (¤ or ¤¤¤¤¤) with the ISO 4217 3-letter code for the same currency (¤¤), to produce a result such as “$1,432.00 USD”. Using such a format is only recommended to resolve ambiguity when:
* The currency symbol being used is the narrow symbol (¤¤¤¤¤) or has the same value as the narrow symbol, and
* The currency symbol does not have the same value as the ISO 4217 3-letter code.
-Most locales will not need to override the pattern provided in root, shown in the xml sample above.
+Most locales will not need to override the pattern provided in root, shown in the XML sample above.
-### 2.5 <a name="Miscellaneous_Patterns" href="#Miscellaneous_Patterns">Miscellaneous Patterns</a>
+### <a name="Miscellaneous_Patterns" href="#Miscellaneous_Patterns">Miscellaneous Patterns</a>
```xml
<!ELEMENT miscPatterns (alias | (default*, pattern*, special*)) >
@@ -505,7 +511,7 @@
</miscPatterns>
```
-### 2.6 <a name="Minimal_Pairs" href="#Minimal_Pairs">Minimal Pairs</a>
+### <a name="Minimal_Pairs" href="#Minimal_Pairs">Minimal Pairs</a>
```xml
<!ELEMENT minimalPairs ( alias | ( pluralMinimalPairs*, ordinalMinimalPairs*, caseMinimalPairs*, genderMinimalPairs*, special* ) ) >
@@ -554,9 +560,9 @@
For more information, see [Plural Rules](https://cldr.unicode.org/index/cldr-spec/plural-rules) and [Grammatical Inflection](https://cldr.unicode.org/translation/grammatical-inflection).
-## 3 <a name="Number_Format_Patterns" href="#Number_Format_Patterns">Number Format Patterns</a>
+## <a name="Number_Format_Patterns" href="#Number_Format_Patterns">Number Format Patterns</a>
-### 3.1 <a name="Number_Patterns" href="#Number_Patterns">Number Patterns</a>
+### <a name="Number_Patterns" href="#Number_Patterns">Number Patterns</a>
Number patterns affect how numbers are interpreted in a localized context. Here are some examples, based on the French locale. The "." shows where the decimal point should go. The "," shows where the thousands separator should go. A "0" indicates zero-padding: if the number is too short, a zero (in the locale's numeric set) will go there. A "#" indicates no padding: if the number is too short, nothing goes there. A "¤" shows where the currency sign will go. The following illustrates the effects of different patterns for the French locale, with the number "1234.567". Notice how the pattern characters ',' and '.' are replaced by the characters appropriate for the locale.
@@ -578,11 +584,11 @@
_When parsing using a pattern, a lenient parse should be used; see [Lenient Parsing](tr35.md#Lenient_Parsing)._ As noted there, lenient parsing should ignore bidi format characters.
-### 3.2 <a name="Special_Pattern_Characters" href="#Special_Pattern_Characters">Special Pattern Characters</a>
+### <a name="Special_Pattern_Characters" href="#Special_Pattern_Characters">Special Pattern Characters</a>
Many characters in a pattern are taken literally; they are matched during parsing and output unchanged during formatting. Special characters, on the other hand, stand for other characters, strings, or classes of characters. For example, the '#' character is replaced by a localized digit for the chosen numberSystem. Often the replacement character is the same as the pattern character; in the U.S. locale, the ',' grouping character is replaced by ','. However, the replacement is still happening, and if the symbols are modified, the grouping character changes. Some special characters affect the behavior of the formatter by their presence; for example, if the percent character is seen, then the value is multiplied by 100 before being displayed.
-To insert a special character in a pattern as a literal, that is, without any special meaning, the character must be quoted. There are some exceptions to this which are noted below. The Localized Replacement column shows the replacement from _Section 2.3 [Number Symbols](#Number_Symbols)_ or the numberSystem's digits: _italic_ indicates a special function.
+To insert a special character in a pattern as a literal, that is, without any special meaning, the character must be quoted. There are some exceptions to this which are noted below. The Localized Replacement column shows the replacement from _[Number Symbols](#Number_Symbols)_ or the numberSystem's digits: _italic_ indicates a special function.
Invalid sequences of special characters (such as “¤¤¤¤¤¤” in current CLDR) should be handled for formatting and parsing as described in [Handling Invalid Patterns](tr35.md#Invalid_Patterns).
@@ -595,10 +601,10 @@
| @ | Number | digit | Significant digit |
| # | Number | digit, _nothing_ | Digit, omitting leading/trailing zeros |
| . | Number | decimal, currencyDecimal | Decimal separator or monetary decimal separator |
-| - | Number | minusSign, plusSign, approximatelySign | Minus sign. **Warning:** the pattern '-'0.0 is not the same as the pattern -0.0. In the former case, the minus sign is a literal. In the latter case, it is a special symbol, which is replaced by the minusSymbol, and can also be replaced by the plusSymbol for a format like +12% as in Section 3.2.1 [Explicit Plus Signs](#Explicit_Plus). |
+| - | Number | minusSign, plusSign, approximatelySign | Minus sign. **Warning:** the pattern '-'0.0 is not the same as the pattern -0.0. In the former case, the minus sign is a literal. In the latter case, it is a special symbol, which is replaced by the minusSymbol, and can also be replaced by the plusSymbol for a format like +12% as in [Explicit Plus Signs](#Explicit_Plus). |
| , | Number | group, currencyGroup | Grouping separator. May occur in both the integer part and the fractional part. The position determines the grouping. |
| E | Number | exponential, superscriptingExponent | Separates mantissa and exponent in scientific notation. _Need not be quoted in prefix or suffix._ |
-| + | Exponent or Number (for explicit plus) | plusSign | Prefix positive exponents with localized plus sign. Used for explicit plus for numbers as well, as described in Section 3.2.1 [Explicit Plus Signs](#Explicit_Plus). _Need not be quoted in prefix or suffix._ |
+| + | Exponent or Number (for explicit plus) | plusSign | Prefix positive exponents with localized plus sign. Used for explicit plus for numbers as well, as described in [Explicit Plus Signs](#Explicit_Plus). _Need not be quoted in prefix or suffix._ |
| % | Prefix or suffix | percentSign | Multiply by 100 and show as percentage |
| ‰ (U+2030) | Prefix or suffix | perMille | Multiply by 1000 and show as per mille (aka “basis points”) |
| ; | Subpattern boundary | _syntax_ | Separates positive and negative subpatterns. When there is no explicit negative subpattern, an implicit negative subpattern is formed from the positive pattern with a prefixed - (ASCII U+002D HYPHEN-MINUS). |
@@ -664,9 +670,9 @@
| 1 | 4 | 10000 | 1,0000 |
| 2 | 4 | 10000 | 10000 |
-#### 3.2.1 <a name="Explicit_Plus" href="#Explicit_Plus">Explicit Plus Signs</a>
+#### <a name="Explicit_Plus" href="#Explicit_Plus">Explicit Plus Signs</a>
-An explicit "plus" format can be formed, so as to show a visible + sign when formatting a non-negative number. The displayed plus sign can be an ASCII plus or another character, such as + U+FF0B FULLWIDTH PLUS SIGN or ➕ U+2795 HEAVY PLUS SIGN; it is taken from whatever is set for plusSign in _Section 2.3 [Number Symbols](#Number_Symbols)_.
+An explicit "plus" format can be formed, so as to show a visible + sign when formatting a non-negative number. The displayed plus sign can be an ASCII plus or another character, such as ＋ U+FF0B FULLWIDTH PLUS SIGN or ➕ U+2795 HEAVY PLUS SIGN; it is taken from whatever is set for plusSign in _[Number Symbols](#Number_Symbols)_.
1. Get the negative subpattern (explicit or implicit).
2. Replace any unquoted ASCII minus sign by an ASCII plus sign.
@@ -674,7 +680,7 @@
For an example, see [Sample Patterns and Results](#Sample_Patterns_and_Results).
-### 3.3 <a name="Formatting" href="#Formatting">Formatting</a>
+### <a name="Formatting" href="#Formatting">Formatting</a>
Formatting is guided by several parameters, all of which can be specified either using a pattern or using an external API designed for number formatting. The following description applies to formats that do not use [scientific notation](#sci) or [significant digits](#sigdig).
@@ -690,7 +696,7 @@
Infinity is represented as a single character, typically ∞ `(U+221E)` , with the positive or negative prefixes and suffixes applied. The infinity character is determined by the localized number symbols.
-### 3.4 <a name="sci" href="#sci">Scientific Notation</a>
+### <a name="sci" href="#sci">Scientific Notation</a>
Numbers in scientific notation are expressed as the product of a mantissa and a power of ten, for example, 1234 can be expressed as 1.234 x 10<sup>3</sup>. The mantissa is typically in the half-open interval [1.0, 10.0) or sometimes [0.0, 1.0), but it need not be. In a pattern, the exponent character immediately followed by one or more digit characters indicates scientific notation. Example: "0.###E0" formats the number 1234 as "1.234E3".
@@ -720,7 +726,7 @@
* ###E0 means engineering notation with infinite precision.
* Exponential patterns may not contain grouping separators.
-### 3.5 <a name="sigdig" href="#sigdig">Significant Digits</a>
+### <a name="sigdig" href="#sigdig">Significant Digits</a>
There are two ways of controlling how many digits are shown: (a) significant digits counts, or (b) integer and fraction digit counts. Integer and fraction digit counts are described above. When a formatter is using significant digits counts, it uses however many integer and fraction digits are required to display the specified number of significant digits. It may ignore min/max integer/fraction digits, or it may use them to the extent possible.
@@ -740,7 +746,7 @@
* The number of significant digits has no effect on parsing.
* Significant digits may be used together with exponential notation. Such patterns are equivalent to a normal exponential pattern with a minimum and maximum integer digit count of one, a minimum fraction digit count of `Minimum Significant Digits - 1`, and a maximum fraction digit count of `Maximum Significant Digits - 1`. For example, the pattern `"@@###E0"` is equivalent to `"0.0###E0"`.
-### 3.6 <a name="Padding" href="#Padding">Padding</a>
+### <a name="Padding" href="#Padding">Padding</a>
Patterns support padding the result to a specific width. In a pattern the pad escape character, followed by a single pad character, causes padding to be parsed and formatted. The pad escape character is '*'. For example, `"$*x#,##0.00"` formats 123 to `"$xx123.00"` , and 1234 to `"$1,234.00"` .
@@ -749,7 +755,7 @@
* Padding may be inserted at one of four locations: before the prefix, after the prefix, before the suffix, or after the suffix. No padding can be specified in any other location. If there is no prefix, before the prefix and after the prefix are equivalent, likewise for the suffix.
* When specified in a pattern, the code point immediately following the pad escape is the pad character. This may be any character, including a special pattern character. That is, the pad escape _escapes_ the following character. If there is no character after the pad escape, then the pattern is illegal.
-### 3.7 <a name="Rounding" href="#Rounding">Rounding</a>
+### <a name="Rounding" href="#Rounding">Rounding</a>
Patterns support rounding to a specific increment. For example, 1230 rounded to the nearest 50 is 1250. Mathematically, rounding to specific increments is performed by dividing by the increment, rounding to an integer, then multiplying by the increment. To take a more bizarre example, 1.234 rounded to the nearest 0.65 is 1.3, as follows:
@@ -767,11 +773,11 @@
* Some locales use rounding in their currency formats to reflect the smallest currency denomination.
* In a pattern, digits '1' through '9' specify rounding, but otherwise behave identically to digit '0'.
-### 3.8 <a name="Quoting_Rules" href="#Quoting_Rules">Quoting Rules</a>
+### <a name="Quoting_Rules" href="#Quoting_Rules">Quoting Rules</a>
Single quotes (**'**) enclose bits of the pattern that should be treated literally. Inside a quoted string, two single quotes ('') are replaced with a single one ('). For example: `'X '`#`' Q '` -> **X 1939 Q** (Literal strings `shaded`.)
-## 4 <a name="Currencies" href="#Currencies">Currencies</a>
+## <a name="Currencies" href="#Currencies">Currencies</a>
```xml
<!ELEMENT currencies (alias | (default?, currency*, special*)) >
@@ -820,13 +826,13 @@
```
Note on displayNames:
-* In general the region portion of the displayName should match the territory name, see **Part 2** _Section 1.2 [Locale Display Name Fields](tr35-general.md#locale_display_name_fields)_.
+* In general the region portion of the displayName should match the territory name, see **Part 2** _[Locale Display Name Fields](tr35-general.md#locale_display_name_fields)_.
* As a result, the English currency displayName in CLDR may not match the name in ISO 4217.
To format a particular currency value "ZWD" for a particular numeric value _n_ using the (long) display name:
1. If the numeric value is exactly 0 or 1, first see if there is a count with a matching explicit number (0 or 1). If so, use that string (see [Explicit 0 and 1 rules](#Explicit_0_1_rules)).
-2. Otherwise, determine the `count` value that corresponds to _n_ using the rules in _[Section 5 - Language Plural Rules](#Language_Plural_Rules)_
+2. Otherwise, determine the `count` value that corresponds to _n_ using the rules in _[Language Plural Rules](#Language_Plural_Rules)_
3. Next, get the currency unitPattern.
1. Look for a `unitPattern` element that matches the `count` value, starting in the current locale and then following the locale fallback chain up to, but not including root.
2. If no matching `unitPattern` element was found in the previous step, then look for a `unitPattern` element that matches `count="other"`, starting in the current locale and then following the locale fallback chain up to root (which has a `unitPattern` element with `count="other"` for every unit type).
@@ -871,7 +877,7 @@
---
-Currencies can also contain optional grouping, decimal data, and pattern elements. This data is inherited from the `<symbols>` in the same locale data (if not present in the chain up to root), so only the _differing_ data will be present. See the main document _Section 4.1 [Multiple Inheritance](tr35.md#Multiple_Inheritance)_.
+Currencies can also contain optional grouping, decimal data, and pattern elements. This data is inherited from the `<symbols>` in the same locale data (if not present in the chain up to root), so only the _differing_ data will be present. See the main document _[Multiple Inheritance](tr35.md#Multiple_Inheritance)_.
> **Note:** _Currency values should **never** be interchanged without a known currency code. You never want the number 3.5 interpreted as $3.50 by one user and €3.50 by another._ Locale data contains localization information for currencies, not a currency value for a country. A currency amount logically consists of a numeric value, plus an accompanying currency code (or equivalent). The currency code may be implicit in a protocol, such as where USD is implicit. But if the raw numeric value is transmitted without any context, then it has no definitive interpretation.
@@ -883,7 +889,7 @@
For background information on currency names, see [[CurrencyInfo](tr35.md#CurrencyInfo)].
-### 4.1 <a name="Supplemental_Currency_Data" href="#Supplemental_Currency_Data">Supplemental Currency Data</a>
+### <a name="Supplemental_Currency_Data" href="#Supplemental_Currency_Data">Supplemental Currency Data</a>
```xml
<!ELEMENT currencyData ( fractions*, region+ ) >
@@ -965,8 +971,8 @@
```
* **iso4217:** the ISO 4217 code for the currency in question. Note that some additional codes that were in widespread usage are included, others such as GHP are not included because they were never used.
-* **from:** the currency was valid from to the datetime indicated by the value. See the main document _Section 5.2.1 [Dates and Date Ranges](tr35.md#Date_Ranges)_.
-* **to:** the currency was valid up to the datetime indicated by the value of _before_. See the main document _Section 5.2.1 [Dates and Date Ranges](tr35.md#Date_Ranges)_.
+* **from:** the currency was valid from the datetime indicated by the value. See the main document _[Dates and Date Ranges](tr35.md#Date_Ranges)_.
+* **to:** the currency was valid up to the datetime indicated by the value of _before_. See the main document _[Dates and Date Ranges](tr35.md#Date_Ranges)_.
* **tender:** indicates whether or not the ISO currency code represents a currency that was or is legal tender in some country. The default is "true". Certain ISO codes represent things like financial instruments or precious metals, and do not represent normally interchanged currencies.
@@ -1004,7 +1010,7 @@
> * RS & ME copy the former CS, except that the line for EUR is dropped from RS
> * CS now terminates on Jun 3, 2006 (following the UN info)
-## 5 <a name="Language_Plural_Rules" href="#Language_Plural_Rules">Language Plural Rules</a>
+## <a name="Language_Plural_Rules" href="#Language_Plural_Rules">Language Plural Rules</a>
```xml
<!ELEMENT plurals (pluralRules*, pluralRanges*) >
@@ -1088,6 +1094,8 @@
There are also variants of the above: for example, short fractions may have the Digits behavior, but longer fractions may just look at the final digit of the fraction.
+Currently there are no locale keywords that affect plural rule selection; plural rules are selected using the base locale ID, ignoring any -u- extension keywords.
+
#### <a name="Explicit_0_1_rules" href="#Explicit_0_1_rules">Explicit 0 and 1 rules</a>
Some types of CLDR data (such as [unitPatterns](tr35-general.md#Unit_Elements) and [currency displayNames](#Currencies)) allow specification of plural rules for explicit cases “0” and “1”, in addition to the language-specific plural cases specified above: “zero”, “one”, “two” ... “other”. For the language-specific plural rules:
@@ -1108,7 +1116,7 @@
* count=“one”: {0} book, e.g. “1 book”
* count=“other”: {0} books, e.g. “3 books”
-### 5.1 <a name="Plural_rules_syntax" href="#Plural_rules_syntax">Plural rules syntax</a>
+### <a name="Plural_rules_syntax" href="#Plural_rules_syntax">Plural rules syntax</a>
The xml value for each pluralRule is a _condition_ with a boolean result.
That value specifies whether that rule (i.e. that plural form) applies to a given _source number N_ in sampleValue syntax, where _N_ can be expressed as a decimal fraction or with compact decimal formatting.
@@ -1156,7 +1164,7 @@
* The samples should be included, since they are used by client software for samples and determining whether the keyword has finite values or not.
* The 'other' keyword must have no condition, and all other keywords must have a condition.
-#### 5.1.1 <a name="Operands" href="#Operands">Operands</a>
+#### <a name="Operands" href="#Operands">Operands</a>
The operands are numeric values corresponding to features of the *source number N*, and have the following meanings given in the table below.
Note that, contrary to source numbers, operands are treated numerically.
@@ -1198,7 +1206,7 @@
| 1.20050c3 | 1200.5 | 1200 | 2 | 1 | 50 | 5 | 3 |
-#### 5.1.2 <a name="Relations" href="#Relations">Relations</a>
+#### <a name="Relations" href="#Relations">Relations</a>
The positive relations are of the format **x = y** and **x = y mod z**. The **y** value can be a comma-separated list, such as **n = 3, 5, 7..15**, and is treated as if each relation were expanded into an OR statement. The range value **a..b** is equivalent to listing all the ***integers*** between **a** and **b**, inclusive. When **!=** is used, it means the entire relation is negated.
@@ -1239,7 +1247,7 @@
| zero: n = 0 or n != 1 and n mod 100 = 1..19 <br/> one: n = 1 | Each rule must not overlap with other rules. Also note that a modulus is applied to n in the last rule, thus its condition holds for 119, 219, 319, … |
| one: n = 1 <br/> few: n mod 10 = 2..4 and n mod 100 != 12..14 | This illustrates conjunction and negation. The condition for 'few' has two parts, both of which must be met: "n mod 10 = 2..4" and "n mod 100 != 12..14". The first part applies a modulus to n before the test as in the previous example. The second part applies a different modulus and also uses negation, thus it matches all numbers _not_ in 12, 13, 14, 112, 113, 114, 212, 213, 214, … |
-#### 5.1.3 <a name="Samples" href="#Samples">Samples</a>
+#### <a name="Samples" href="#Samples">Samples</a>
Samples are provided if sample indicator (@integer or @decimal) is present on any rule. (CLDR always provides samples.)
@@ -1259,15 +1267,15 @@
In determining whether a set of samples is infinite, leading zero integer digits and trailing zero decimals are not significant. Thus "i = 1000 and f = 0" is satisfied by 01000, 1000, 1000.0, 1000.00, 1000.000, 01c3 etc. but is still considered finite.
-#### 5.1.4 <a name="Using_cardinals" href="#Using_cardinals">Using Cardinals</a>
+#### <a name="Using_cardinals" href="#Using_cardinals">Using Cardinals</a>
Elements such as `<currencyFormats>`, `<currency>` and `<unit>` provide selection among subelements designating various localized cardinal plural forms by tagging each of the relevant subelements with a different count value, or with no count value in some cases. Note that the plural forms for a specific currencyFormat, unit type, or currency type may not use all of the different plural-form tags defined for the language. To format a currency or unit type for a particular numeric value, determine the count value according to the plural rules for the language, then select the appropriate display form for the currency format, currency type or unit type using the rules in those sections:
* 2.3 [Number Symbols](#Number_Symbols) (for `currencyFormat`s elements)
-* Section 4 [Currencies](#Currencies) (for `currency` elements)
-* The main document section 5.11 [Unit Elements](tr35.md#Unit_Elements)
+* [Currencies](#Currencies) (for `currency` elements)
+* The main document [Unit Elements](tr35.md#Unit_Elements)
-### 5.2 <a name="Plural_Ranges" href="#Plural_Ranges">Plural Ranges</a>
+### <a name="Plural_Ranges" href="#Plural_Ranges">Plural Ranges</a>
```xml
<!ELEMENT pluralRanges (pluralRange*) >
@@ -1285,9 +1293,9 @@
The data has been gathered presuming that in any usage, the start value is strictly less than the end value, and that no values are negative. Results for any cases that do not meet these criteria are undefined.
-For the formatting of number ranges, see <a name="Number_Range_Formatting" href="#Number_Range_Formatting">Number Range Formatting</a>.
+For the formatting of number ranges, see <a href="#Number_Range_Formatting">Number Range Formatting</a>.
-## 6 <a name="Rule-Based_Number_Formatting" href="#Rule-Based_Number_Formatting">Rule-Based Number Formatting</a>
+## <a name="Rule-Based_Number_Formatting" href="#Rule-Based_Number_Formatting">Rule-Based Number Formatting</a>
```xml
<!ELEMENT rbnf ( alias | rulesetGrouping*) >
@@ -1342,7 +1350,7 @@
Contains the actual formatting rule for a particular number or sequence of numbers. The `value` attribute is used to indicate the starting number to which the rule applies. The actual text of the rule is identical to the ICU syntax, with the exception that Unicode left and right arrow characters are used to replace < and > in the rule text, since < and > are reserved characters in XML. The `radix` attribute is used to indicate an alternate radix to be used in calculating the prefix and postfix values for number formatting. Alternate radix values are typically used for formatting year numbers in formal documents, such as "nineteen hundred seventy-six" instead of "one thousand nine hundred seventy-six".
-## 7 <a name="Parsing_Numbers" href="#Parsing_Numbers">Parsing Numbers</a>
+## <a name="Parsing_Numbers" href="#Parsing_Numbers">Parsing Numbers</a>
The following elements are relevant to determining the value of a parsed number:
@@ -1369,7 +1377,7 @@
* A currency symbol in the input should be interpreted as the longest match found in the set of possible currency symbols.
* Especially in cases of ambiguity, the user's input should be echoed back, properly formatted according to the locale, before it is actually used for anything.
-## 8 <a name="Number_Range_Formatting" href="#Number_Range_Formatting">Number Range Formatting</a>
+## <a name="Number_Range_Formatting" href="#Number_Range_Formatting">Number Range Formatting</a>
Often ranges of numbers are presented to users, such as in “Length: 3.2–4.5 centimeters”. This means any length from 3.2 cm to 4.5 cm, inclusive.
@@ -1386,7 +1394,7 @@
For plural rule selection of number ranges, see [Plural Ranges](#Plural_Ranges).
-### 8.1 <a name="Approximate_Number_Formatting" href="#Approximate_Number_Formatting">Approximate Number Formatting</a>
+### <a name="Approximate_Number_Formatting" href="#Approximate_Number_Formatting">Approximate Number Formatting</a>
*Approximate number formatting* refers to a specific format of numbers in which the value is understood to not be exact; for example, "\~5 minutes".
@@ -1394,7 +1402,7 @@
If the number is negative, or if the formatting options request the sign to be displayed, *prepend* the `approximatelySign` to the plus or minus sign before substituting it into the pattern. For example, "\~-5" means "approximately negative five". This procedure may change in the future.
-### 8.2 <a name="Collapsing_Number_Ranges" href="#Collapsing_Number_Ranges">Collapsing Number Ranges</a>
+### <a name="Collapsing_Number_Ranges" href="#Collapsing_Number_Ranges">Collapsing Number Ranges</a>
*Collapsing* a number range refers to the process of removing duplicated information in the *lower* and *upper* values. For example, if the lower string is "3.2 centimeters" and the upper string is "4.5 centimeters", it is desirable to remove the extra "centimeters" token.
@@ -1419,8 +1427,8 @@
The above description describes the expected output. Internally, the implementation may determine the equivalent units of measurement by passing the codes back from the number formatters, allowing for a precise determination of "semantically equivalent".
-Two semantically equivalent tokens can be *collapsed* if they appear at the start of both values or the end of both values.
-However, the implementation may choose different levels of aggressiveness with regard to collapsing tokens.
+Two semantically equivalent tokens can be *collapsed* if they appear at the start of both values or the end of both values.
+However, the implementation may choose different levels of aggressiveness with regard to collapsing tokens.
An API for displaying ranges should permit control over whether the tokens are collapsed or not, and the levels of aggressiveness.
The currently recommended heuristic is:
@@ -1434,13 +1442,13 @@
* 2M ft – 5M ft ⇒ 2M – 5M ft
4. When the tokens can have distinct plural forms, modify the remaining token so that it has the correct plural form. That is, use [Plural Ranges](#Plural_Ranges) to calculate the correct plural category for the range, and pick the variant of that the remaining token corresponding to that plural form.
-In bidi contexts, the data is built so that rule #3 works **visually**.
-For example, if a range from 2 km to 5 km would be presented visually as "_mk 5 – mk 2_", the collapsed form would be "_mk 5 – 2_".
-(The _mk_ is a stand-in for the native representation.)
+In bidi contexts, the data is built so that rule #3 works **visually**.
+For example, if a range from 2 km to 5 km would be presented visually as "_mk 5 – mk 2_", the collapsed form would be "_mk 5 – 2_".
+(The _mk_ is a stand-in for the native representation.)
This requires consistent visual reordering among the elements: the range, the prefixes and the suffixes.
Thus a prefix value will be reordered to be visually a suffix value, and the order of the range will be visually reversed.
-### 8.3 <a name="Range_Pattern_Processing" href="#Range_Pattern_Processing">Range Pattern Processing</a>
+### <a name="Range_Pattern_Processing" href="#Range_Pattern_Processing">Range Pattern Processing</a>
To obtain a number range pattern, the following steps are taken:
@@ -1458,6 +1466,6 @@
* * *
-Copyright © 2001–2022 Unicode, Inc. All Rights Reserved. The Unicode Consortium makes no expressed or implied warranty of any kind, and assumes no liability for errors or omissions. No liability is assumed for incidental and consequential damages in connection with or arising out of the use of the information or programs contained or accompanying this technical report. The Unicode [Terms of Use](https://www.unicode.org/copyright.html) apply.
+Copyright © 2001–2023 Unicode, Inc. All Rights Reserved. The Unicode Consortium makes no expressed or implied warranty of any kind, and assumes no liability for errors or omissions. No liability is assumed for incidental and consequential damages in connection with or arising out of the use of the information or programs contained or accompanying this technical report. The Unicode [Terms of Use](https://www.unicode.org/copyright.html) apply.
Unicode and the Unicode logo are trademarks of Unicode, Inc., and are registered in some jurisdictions.
diff --git a/docs/ldml/tr35-personNames.anchors.json b/docs/ldml/tr35-personNames.anchors.json
new file mode 100644
index 0000000..e251448
--- /dev/null
+++ b/docs/ldml/tr35-personNames.anchors.json
@@ -0,0 +1,57 @@
+[
+ "access-personname-object",
+ "api-implementation",
+ "choose-a-namepattern",
+ "choose-a-personname-element",
+ "cldr-person-names",
+ "Contents",
+ "contents-of-part-8-person-names",
+ "derive-initials",
+ "derive-the-formatting-locale",
+ "derive-the-name-locale",
+ "derive-the-name-order",
+ "example-1",
+ "example-2",
+ "example-usage",
+ "examples-of-space-replacement",
+ "expected-values",
+ "fields",
+ "foreignspacereplacement-element",
+ "formality",
+ "formatting-examples",
+ "formatting-process",
+ "future-modifiers",
+ "grammatical-modifiers-for-names",
+ "handle-core-and-prefix",
+ "handle-missing-surname",
+ "handling-foreign-names",
+ "initialpattern-element",
+ "introduction",
+ "length",
+ "modifiers",
+ "nameorderlocales-element",
+ "namepattern-syntax",
+ "nativespacereplacement-element",
+ "not-in-scope",
+ "order",
+ "parameterdefault-element",
+ "parts",
+ "person-name-attributes",
+ "person-name-formatting-overview",
+ "person-name-object",
+ "personname-data-interface-examples",
+ "personname-element",
+ "personnames-element",
+ "process-a-namepattern",
+ "sample-name",
+ "setting-the-spacereplacement",
+ "status",
+ "summary",
+ "switch-the-formatting-locale-if-necessary",
+ "syntax",
+ "syntax-1",
+ "unicode-locale-data-markup-language-ldmlpart-8-person-names",
+ "unicode-technical-standard-35",
+ "usage",
+ "xml-structure"
+]
\ No newline at end of file
diff --git a/docs/ldml/tr35-personNames.md b/docs/ldml/tr35-personNames.md
index 7a02f2d..2d64759 100644
--- a/docs/ldml/tr35-personNames.md
+++ b/docs/ldml/tr35-personNames.md
@@ -2,9 +2,9 @@
# Unicode Locale Data Markup Language (LDML)<br/>Part 8: Person Names
-|Version|42 |
+|Version|44.1 |
|-------|------------------------|
-|Editors|Mark Davis, Peter Edberg, Rich Gillam, Alex Kolisnychenko, Mike McKenna and <a href="tr35.md#Acknowledgments">other CLDR committee members</a>|
+|Editors|Mark Davis, Peter Edberg, Rich Gillam, Alex Kolisnychenko, Mike McKenna and [other CLDR committee members](tr35.md#Acknowledgments)|
For the full header, summary, and status, see [Part 1: Core](tr35.md).
@@ -16,15 +16,18 @@
### _Status_
-_This document has been reviewed by Unicode members and other interested parties, and has been approved for publication by the Unicode Consortium. This is a stable document and may be used as reference material or cited as a normative reference by other specifications._
+<!-- _This is a draft document which may be updated, replaced, or superseded by other documents at any time.
+Publication does not imply endorsement by the Unicode Consortium.
+This is not a stable document; it is inappropriate to cite this document as other than a work in progress._ -->
-[_This document has been reviewed by Unicode members and other interested parties, and has been approved for publication by the Unicode Consortium. This is a stable document and may be used as reference material or cited as a normative reference by other specifications._]: #
+_This document has been reviewed by Unicode members and other interested parties, and has been approved for publication by the Unicode Consortium.
+This is a stable document and may be used as reference material or cited as a normative reference by other specifications._
> _**A Unicode Technical Standard (UTS)** is an independent specification. Conformance to the Unicode Standard does not imply conformance to any UTS._
_Please submit corrigenda and other comments with the CLDR bug reporting form [[Bugs](tr35.md#Bugs)]. Related information that is useful in understanding this document is found in the [References](tr35.md#References). For the latest version of the Unicode Standard see [[Unicode](tr35.md#Unicode)]. For a list of current Unicode Technical Reports see [[Reports](tr35.md#Reports)]. For more information about versions of the Unicode Standard, see [[Versions](tr35.md#Versions)]._
-## <a name="Parts" href="#Parts">Parts</a>
+## Parts
The LDML specification is divided into the following parts:
@@ -37,53 +40,64 @@
* Part 7: [Keyboards](tr35-keyboards.md#Contents) (keyboard mappings)
* Part 8: [Person Names](tr35-personNames.md#Contents) (person names)
-## <a name="Contents" href="#Contents">Contents of Part 8, Person Names</a>
+## <a name="Contents">Contents of Part 8, Person Names</a>
-* 1 [CLDR Person Names](#CLDRPersonNames)
- * 1.1 [Introduction](#Introduction)
- * 1.1.1 [Not in scope](#not-in-scope)
- * 1.2 [API Implementation](#APIImplementation)
- * 1.3 [Person Name Formatting Overview](#PersonNameFormattingOverview)
- * 1.4 [Example Usage](#ExampleUsage)
-* 2 [XML Structure](#2-xml-structure)
- * 2.1 [personNames Element](#2-1-personnames-element)
- * 2.2 [personName Element](#2-2-personname-element)
- * 2.3 [nameOrderLocales Element](#2-3-nameorderlocales-element)
- * 2.4 [foreignSpaceReplacement Element](#2-4-foreignspacereplacement-element)
- * 2.5 [initialPattern Element](#2-5-initialpattern-element)
- * 2.5.1 [Syntax](#syntax)
-* 3 [Person Name Object](#3-person-name-object)
-* 4 [Person Name Attributes](#4-person-name-attributes)
- * 4.1 [order](#4-1-order)
- * 4.2 [length](#4-2-length)
- * 4.3 [usage](#4-3-usage)
- * 4.4 [formality](#4-4-formality)
-* 5 [namePattern Syntax](#5-namepattern-syntax)
- * 5.1 [Fields](#5-1-fields)
- * 5.2 [Modifiers](#5-2-modifiers)
-* 6 [Formatting Process](#6-formatting-process)
- * 6.1 [Derive the name locale](#6-1-derive-the-name-locale)
- * 6.2 [Derive the formatting locale](#6-2-derive-the-formatting-locale)
- * 6.3 [Derive the name order](#6-3-derive-the-name-order)
- * 6.4 [Choose a personName](#6-4-choose-a-personname)
- * 6.5 [Choose a namePattern](#6-5-choose-a-namepattern)
- * 6.6 [Examples of choosing a namePattern](#6-6-examples-of-choosing-a-namepattern)
- * 6.6.1 [Examples for rules 1 and 2](#examples-for-rules-1-and-2)
- * 6.6.2 [Examples for rule 3 and the interaction between the rules](#examples-for-rule-3-and-the-interaction-between-the-rules)
- * 6.7 [Deriving initials](#6-7-deriving-initials)
- * 6.8 [Handling foreign names](#6-8-handling-foreign-names)
-* 7 [Sample Name](#7-sample-name)
- * 7.1 [Syntax](#7-1-syntax)
- * 7.2 [Expected values](#7-2-expected-values)
-* 8 [PersonName Data Interface Examples](#8-personname-data-interface-examples)
+* [CLDR Person Names](#cldr-person-names)
+ * [Introduction](#introduction)
+ * [Not in scope](#not-in-scope)
+ * [API Implementation](#api-implementation)
+ * [Person Name Formatting Overview](#person-name-formatting-overview)
+ * [Example Usage](#example-usage)
+* [XML Structure](#xml-structure)
+ * [personNames Element](#personnames-element)
+ * [personName Element](#personname-element)
+ * [nameOrderLocales Element](#nameorderlocales-element)
+ * [parameterDefault Element](#parameterdefault-element)
+ * [foreignSpaceReplacement Element](#foreignspacereplacement-element)
+ * [nativeSpaceReplacement Element](#nativespacereplacement-element)
+ * [initialPattern Element](#initialpattern-element)
+ * [Syntax](#syntax)
+* [Person Name Object](#person-name-object)
+* [Person Name Attributes](#person-name-attributes)
+ * [order](#order)
+ * [length](#length)
+ * [usage](#usage)
+ * [formality](#formality)
+* [namePattern Syntax](#namepattern-syntax)
+ * [Fields](#fields)
+ * [Modifiers](#modifiers)
+ * [Grammatical Modifiers for Names](#grammatical-modifiers-for-names)
+ * [Future Modifiers](#future-modifiers)
+* [Formatting Process](#formatting-process)
+ * [Derive the name locale](#derive-the-name-locale)
+ * [Derive the formatting locale](#derive-the-formatting-locale)
+ * [Switch the formatting locale if necessary](#switch-the-formatting-locale-if-necessary)
+ * [Derive the name order](#derive-the-name-order)
+ * [Choose a personName element](#choose-a-personname-element)
+ * [Choose a namePattern](#choose-a-namepattern)
+ * [Access PersonName object](#access-personname-object)
+ * [Handle missing surname](#handle-missing-surname)
+ * [Handle core and prefix](#handle-core-and-prefix)
+ * [Derive initials](#derive-initials)
+ * [Process a namePattern](#process-a-namepattern)
+ * [Handling foreign names](#handling-foreign-names)
+ * [Setting the spaceReplacement](#setting-the-spacereplacement)
+ * [Examples of space replacement](#examples-of-space-replacement)
+ * [Formatting examples](#formatting-examples)
+* [Sample Name](#sample-name)
+ * [Syntax](#syntax)
+ * [Expected values](#expected-values)
+* [PersonName Data Interface Examples](#personname-data-interface-examples)
+ * [Example 1](#example-1)
+ * [Example 2](#example-2)
-## 1 <a name="CLDRPersonNames" href="#CLDRPersonNames">CLDR Person Names</a>
+## CLDR Person Names
-### 1.1 <a name="Introduction" href="#Introduction">Introduction</a>
+### Introduction
-CLDR provides formatting for person names, such as John Smith or 宮崎駿. These use patterns to show how a name object (for example, from a database) should be formatted for a particular locale. Name data has fields for the parts of people’s names, such as a **given** field with a value of “Maria”, and a **surname** field value of “Schmidt”.
+CLDR provides formatting for person names, such as John Smith or 宮崎駿. These use patterns to show how a name object (for example, from a database) should be formatted for a particular locale. Name data has fields for the parts of people’s names, such as a **given** field with a value of “Maria”, and a **surname** field value of “Schmidt”.
-There is a wide variety in the way that people’s names appear in different languages.
+There is a wide variety in the way that people’s names appear in different languages.
* People may have a different number of names, depending on their culture—they might have only one name (“Zendaya”), two (“Albert Einstein”), or three or more.
* People may have multiple words in a particular name field, e.g. “Mary Beth” as a given name, or “van Berg” as a surname.
@@ -91,24 +105,21 @@
* The ordering of name fields can be different across languages, as well as the spacing (or lack thereof) and punctuation.
* Name formatting needs to be adapted to different circumstances, such as a need to be presented shorter or longer; formal or informal context; or when talking about someone, or talking to someone, or as a monogram (JFK).
-This document provides the [LDML](http://www.unicode.org/reports/tr35/) specification for formatting of personal names, using data, structure, and examples.
-
-> **This is a technology preview; thus not intended for production software (except where itself marked as a technology preview). We have gathered a first round of data, and intend to refine the way in which we gather data. We are looking for additional feedback on the tech preview so that we can make improvements. For example, there are a few areas where we intend enhancements: handling native vs foreign names (in the native script); handling prefix and suffix fields; and so on.**
+This document provides the [LDML](tr35.md) specification for formatting of personal names, using data, structure, and examples.
The CLDR functionality is targeted at formatting names for typical usage on computers (e.g. contact names, automated greetings, etc.), rather than being designed for special circumstances or protocol, such as addressing royalty. However, the structure may be enhanced in the future when it becomes clear that additional features are needed for some languages.
-This addition to CLDR is based on review of current standards and practices that exist in LDAP, hcard, HTML and various other international standards and commercial implementations.
+This addition to CLDR is based on review of current standards and practices that exist in LDAP, OECD, S42, hCard, HTML and various other international standards and commercial implementations.
-Additions to those structures were made to accomodate known issues in large population groups, such as mononyms in Indonesia, patronymic and matronymic naming structure in Iceland and India, the need for a second surname in Spanish-speaking regions and the common case of chains of patronymic names in Arabic-speaking locales. The formatting patterns allow for specifying different “input parameters” to account for different contexts.
+Additions to those structures were made to accommodate known issues in large population groups, such as mononyms in Indonesia, patronymic and matronymic naming structure in Iceland and India, the need for a second surname in Spanish-speaking regions and the common case of chains of patronymic names in Arabic-speaking locales. The formatting patterns allow for specifying different “input parameters” to account for different contexts.
-#### 1.1.1 <a name="not-in-scope" href="#not-in-scope">Not in scope</a>
+#### Not in scope
The following features are currently out of scope for Person Names formatting:
* Grammatical inflection of formatted names.
-* Context-specific cultural aspects, such as when to use “-san” vs “-sama” when addressing a Japanese person.
-* Providing lists of prefixes and suffixes (Mr, Ms., Mx., Dr., etc.).
-* Distinctions among prefixes and suffixes, such as title (Dr., Prof., Sir), gender-honorifics, generation (Jr., IV), accreditations (MBA, Esq.).
+* Context-specific cultural aspects, such as when to use “-san” vs “-sama” when addressing a Japanese person.
+* Providing locale-specific lists of titles, generation terms, and credentials for use in pull-down menus or validation (Mr, Ms., Mx., Dr., Jr., M.D., etc.).
* Validation of input, such as which fields are required, and what characters are allowed.
* Combining alternative names, such as multicultural names in Hong Kong “[Jackie Chan Kong-Sang](https://en.wikipedia.org/wiki/Jackie_Chan)”, or ‘Dwayne “The Rock” Johnson’.
* More than two levels of formality for names.
@@ -122,13 +133,24 @@
| Mary | | Beth Estrella | |
| Mary | | Beth | Estrella |
- * Parsing out the other components of a name in a string, such as surname prefixes ([Tussenvoegsel](https://en.wikipedia.org/wiki/Tussenvoegsel) in Dutch).
+ * Parsing out the other components of a name in a string, such as surname prefixes ([Tussenvoegsel](https://en.wikipedia.org/wiki/Tussenvoegsel) in Dutch).
-### 1.2 <a name="APIImplementation" href="#APIImplementation">API Implementation</a>
+### API Implementation
-A Tech Preview API for formatting personal names is included in ICU. The implementation can be found at [PersonNameFormatter.java](https://github.com/unicode-org/icu/blob/main/icu4j/main/classes/core/src/com/ibm/icu/text/PersonNameFormatter.java).
+A draft API for formatting personal names was first included in ICU4J 73 and has been updated for ICU4J 74 to reflect updates in this specification and associated data. (“Draft” means that the full functionality is present, but the API might be refined before it is stabilized.) The implementation can be found at the following:
-### 1.3 <a name="PersonNameFormattingOverview" href="#PersonNameFormattingOverview">Person Name Formatting Overview</a>
+* [PersonName.java](https://github.com/unicode-org/icu/blob/main/icu4j/main/core/src/main/java/com/ibm/icu/text/PersonName.java)
+* [PersonNameFormatter.java](https://github.com/unicode-org/icu/blob/main/icu4j/main/core/src/main/java/com/ibm/icu/text/PersonNameFormatter.java)
+* [SimplePersonName.java](https://github.com/unicode-org/icu/blob/main/icu4j/main/core/src/main/java/com/ibm/icu/text/SimplePersonName.java)
+
+In addition to the settings in this document, it is recommended that implementations provide some additional features in their APIs to allow more control for clients, notably:
+
+1. forceGivenFirst — no matter what the values are in nameOrderLocales or in the NameObject, display the name as givenFirst.
+2. forceSurnameFirst — no matter what the values are in nameOrderLocales or in the NameObject, display the name as surnameFirst.
+3. forceNativeOrdering — no matter what the values are in nameOrderLocales or in the NameObject, display the name with the same ordering as the native locale.
+4. surnameFirstAllCaps — display the surname and surname2 fields in all caps **if** not using native order. Thus where the foreign name ordering is surnameFirst, the name {given=Shinzo, surname=Abe} would display as “ABE Shinzo”.
+
+### Person Name Formatting Overview
Logically, the model used for applying the CLDR data is the following:
@@ -136,22 +158,23 @@
Conceptually, CLDR person name formatting depends on data supplied by a PersonName Data Interface. That could be a very thin interface that simply accesses a database record, or it could be a more sophisticated interface that can modify the raw data before presenting it to be formatted. For example, based on the formatting locale a PersonName data interface could transliterate names that are in another script, or supply equivalent titles in different languages.
-The specification below will talk about a “PersonName object” as an entity that is logically accessed via such an interface. If multiple formatted names are needed, such as in different scripts or with alternate names, or pronunciations (eg kana), the presumption is that those are logically separate PersonName objects. See [[Person Name Object](#3-person-name-object)].
+The specification below will talk about a “PersonName object” as an entity that is logically accessed via such an interface. If multiple formatted names are needed, such as in different scripts or with alternate names, or pronunciations (e.g. kana), the presumption is that those are logically separate PersonName objects. See [[Person Name Object](#person-name-object)].
The following summarizes the name data supplied via the PersonName Data Interface:
* Name data is composed of one or more name parts, which are categorized in this standard as
- * _prefix_ - a string that may precede a name and may indicate an honorific, title, etc.
+ * _title_ - a string that represents one or more honorifics or titles, such as “Mr.”, or “Herr Doctor”.
* _given_ - usually a name given to someone that is not passed to a person by way of parentage
* _given2_ - name or names that may appear between the first given name string and the surname. In the West, this may be a middle name, in Slavic regions it may be a patronymic name, and in parts of the Middle East, it may be the _nasab (نسب)_ or series of patronymics.
* _surname_ - usually the family name passed to a person that indicates their family, tribe, or community. In most Western languages, this is known as the last name.
* _surname2_ - in some cultures, both the parent’s surnames are used and need to be handled separately for formatting in different contexts.
- * _suffix_ - a string that may succeed a person’s name to indicate status, generation, or title.
- * _See the section on [[Fields](#5-1-fields)] for more details._
+ * _generation_ - a string that represents a generation marker, such as “Jr.” or “III”.
+ * _credentials_ - a string that represents one or more credentials or accreditations, such as “M.D.”, or “MBA”.
+ * _See the section on [[Fields](#fields)] for more details._
* Name data may have additional attributes that this specification accommodates.
* _-informal_ - A name may have a formal and an informal presentation form, for example “Bob” vs “Robert” or “Са́ша” vs “Алекса́ндра”. This is accomplished by using the simple construct _given-informal_.
- * _-prefix_ and _-core_ - In some languages the surname may have a prefix that needs to be treated differently, for example “van de Berg”. The data can refer to “van de” as _surname-prefix_ and “Berg” with _surname-core_ and the PersonNames formatters will format them correctly in Dutch and many other languages.
- * _See the section on [[Modifiers](#5-2-modifiers)] for more details._
+ * _-prefix_ and _-core_ - In some languages the surname may have a prefix that needs to be treated differently, for example “van den Berg”. The data can refer to “van den” as _surname-prefix_ and “Berg” with _surname-core_ and the PersonNames formatters will format them correctly in Dutch and many other languages.
+ * _See the section on [[Modifiers](#modifiers)] for more details._
To format a name correctly, the correct context needs to be known. The context is composed of:
@@ -162,79 +185,75 @@
* **_length_** - used to select patterns for common short, medium, and long formatted names.
* **_usage_** - this is used to select the correct pattern to format a name when a program is _addressing_ or talking to a person or it is _referring_ to or talking about another person.
* **_formality_** - This is used to select the formal or informal formatting of a name.
- * _See [[Person Name Attributes](#4-person-name-attributes)] for more details._
+ * _See [[Person Name Attributes](#person-name-attributes)] for more details._
-### 1.4 <a name="ExampleUsage" href="#ExampleUsage">Example Usage</a>
+### Example Usage
As an example, consider a person’s name that may contain:
-| `prefix` | `given` | `given2` | `surname` | `suffix` |
-| -------- | -------- | -------- | --------- | -------- |
-| | Robin | Finley | Wang | Ph.D. |
+| `title` | `given` | `given2` | `surname` | `credentials` |
+| -------- | -------- | -------- | --------- | -------- |
+| | Robin | Finley | Wang | Ph.D. |
If the selected personName data has the following formatting pattern:
-> `{prefix} {given} {given2-initial} {surname}, {suffix}`
+> `{title} {given} {given2-initial} {surname}, {credentials}`
Then the output is:
> Robin F. Wang, Ph.D.
-The _prefix_ field is empty, so both it and the space that follows it in the formatting pattern are omitted from the output, the _given2_ field is formatted as an initial, and a preceding comma is placed before the _suffix_.
+The _title_ field is empty, so both it and the space that follows it in the formatting pattern are omitted from the output, the _given2_ field is formatted as an initial, and a preceding comma is placed before the _credentials_.
-Sections below specify the precise manner in which a pattern is selected, and how the pattern is modified for missing fields.
-
-## 2 <a name="2-xml-structure" href="#2-xml-structure">XML Structure</a>
+Sections below specify the precise manner in which a pattern is selected, and how the pattern is modified for missing fields.
-Person name formatting data is stored as LDML with schema defined as follows.
+## XML Structure
-### 2.1 <a name="2-1-personnames-element" href="#2-1-personnames-element">personNames Element</a>
+Person name formatting data is stored as LDML with schema defined as follows. Each element has a brief description of the usage, but the exact algorithms for using these elements are provided in [Formatting Process](#formatting-process).
+
+
+### personNames Element
```xml
-<!ELEMENT personNames ( nameOrderLocales*, foreignSpaceReplacement?, initialPattern*, personName+, sampleName* ) >
+<!ELEMENT personNames ( nameOrderLocales*, parameterDefault*, nativeSpaceReplacement*, foreignSpaceReplacement*, initialPattern*, personName*, sampleName* ) >
```
The LDML top-level `<personNames>` element contains information regarding the formatting of person names, and the formatting of person names in specific contexts for a specific locale.
-### 2.2 <a name="2-2-personname-element" href="#2-2-personname-element">personName Element</a>
+### personName Element
-The `<personName>` element contains the format patterns, or `<namePattern>` elements, for a specific context and is described in [[namePattern Syntax](#5-namepattern-syntax)]
+The `<personName>` element contains the format patterns, or `<namePattern>` elements, for a specific context and is described in [[namePattern Syntax](#namepattern-syntax)].
-The `<namePattern>` syntax is described in [[Person Name Format Patterns](#6-formatting-process)].
+The `<namePattern>` syntax is described in [[Person Name Format Patterns](#formatting-process)].
```xml
<!ELEMENT personName ( namePattern+ ) >
-<!ATTLIST personName order NMTOKENS #IMPLIED >
+<!ATTLIST personName order NMTOKEN #IMPLIED >
```
-* `NMTOKENS` is a space delimited list of `( surnameFirst | givenFirst | sorting )`
+* `NMTOKEN` is one of `( surnameFirst | givenFirst | sorting )`
```xml
-<!ATTLIST personName length NMTOKENS #IMPLIED >
+<!ATTLIST personName length NMTOKEN #IMPLIED >
```
-* `NMTOKENS` is a space delimited list of `( long | medium | short )`
+* `NMTOKEN` is one of `( long | medium | short )`
```xml
-<!ATTLIST personName usage NMTOKENS #IMPLIED >
+<!ATTLIST personName usage NMTOKEN #IMPLIED >
```
-* `NMTOKENS` is a space delimited list of `( addressing | referring | monogram )`
+* `NMTOKEN` is one of `( addressing | referring | monogram )`
```xml
-<!ATTLIST personName formality NMTOKENS #IMPLIED >
+<!ATTLIST personName formality NMTOKEN #IMPLIED >
```
-* `NMTOKENS` is a space delimited list of `( formal | informal )`
+* `NMTOKEN` is one of `( formal | informal )`
The `<personName>` element has attributes of `order`, `length`, `usage`, and `formality`, and contains one or more `<namePattern>` elements.
-* For each attribute, there must be at least one attribute value, no value can occur twice, and order is not important (but the canonical order of elements is `order, length, usage, formality`). Thus
- * `formality="informal informal"` is invalid,
- * as is `formality=""`.
- * `formality="formal informal"` is valid and canonical
- * `formality="informal formal"` is valid, but not canonical
-* A missing attribute is equivalent to a list of all valid values for that attribute. For example, if `formality=...` is missing, it is equivalent to `formality="formal informal"`.
+A missing attribute matches all valid values for that attribute. For example, if `formality=...` is missing, it is equivalent to multiple `<personName>` elements, one for each possible value of the `formality` attribute.
```xml
<!ELEMENT namePattern ( #PCDATA ) >
@@ -244,18 +263,18 @@
> `<namePattern>{surname}, {given} {given2}</namePattern>`
-which produces output like _“Smith, Robert James”_. See [[namePattern Syntax](#5-namepattern-syntax)] for more details.
+which produces output like _“Smith, Robert James”_. See [[namePattern Syntax](#namepattern-syntax)] for more details.
-### 2.3 <a name="2-3-nameorderlocales-element" href="#2-3-nameorderlocales-element">nameOrderLocales Element</a>
+### nameOrderLocales Element
-The `<nameOrderLocales>` element is optional, and contains information about selecting patterns based on the locale of a passed in PersonName object to determine the order of elements in a formatted name. For more information see [[NameOrder](#6-3-derive-the-name-order)]. It has a structure as follows:
+The `<nameOrderLocales>` element is optional, and contains information about selecting patterns based on the locale of a passed in PersonName object to determine the order of elements in a formatted name. For more information see [[NameOrder](#derive-the-name-order)]. It has a structure as follows:
```xml
<!ELEMENT nameOrderLocales ( #PCDATA ) >
<!ATTLIST nameOrderLocales order ( givenFirst | surnameFirst ) #REQUIRED >
```
-* `#PCDATA `is a space delimited list of one or more [unicode_locale_id](https://unicode.org/reports/tr35/#unicode_locale_id)s. Normally they are limited to language, script, and region. The _und_ locale ID may only occur once, either in _surnameFirst_ or _givenFirst_, but not both, and matches all base locales not explicitly listed.
+* `#PCDATA` is a space delimited list of one or more [unicode_locale_id](tr35.md#unicode_locale_id)s. Normally each locale is limited to language, script, and region. The _und_ locale ID may only occur once, either in _surnameFirst_ or _givenFirst_, but not both, and matches all base locales not explicitly listed.
An example from English may look like the following
@@ -264,9 +283,18 @@
This would tell the formatting code, when handling person name data from an English locale, to use patterns with the `givenFirst` order attribute for all data except name data from Korean, Vietnamese, Cantonese, and Chinese locales, where the `surnameFirst` patterns should be used.
-### 2.4 <a name="2-4-foreignspacereplacement-element" href="#2-4-foreignspacereplacement-element">foreignSpaceReplacement Element</a>
+### parameterDefault Element
+```xml
+<!ELEMENT parameterDefault ( #PCDATA ) >
+<!ATTLIST parameterDefault parameter (length | formality) #REQUIRED >
+```
+Many clients of the person-names functionality don’t really care about formal versus informal; they just want whatever the “normal” formality level is for the user’s language. The same goes for the default length.
-The `<foreignSpaceReplacement>` element is used to specify how delimiters should appear between name parts when the name data (name locale) is different from the requested locale (formatting locale)., but they both use the same script.
+This parameter provides that information, so that APIs can allow users to use default values for the formality and length. The exact form that this takes depends on the API conventions, of course.
+
+### foreignSpaceReplacement Element
+
+The `<foreignSpaceReplacement>` element is used to specify how spaces should be handled when the name language is **different from** the formatting language. It is used in languages that don't normally require spaces between words. For example, Japanese and Chinese have the value of a middle dot (‘·’ U+00B7 MIDDLE DOT or ‘・’ U+30FB KATAKANA MIDDLE DOT), so that it is used between words in a foreign name; most other languages have the value of SPACE.
```xml
<!ELEMENT foreignSpaceReplacement ( #PCDATA ) >
@@ -274,15 +302,27 @@
```
* `xml:space` must be set to `'preserve'` so that actual spaces in the pattern are preserved. See [W3C XML White Space Handling](https://www.w3.org/TR/xml/#sec-white-space).
-* The `#PCDATA `is the character sequence used to replace spaces between fields for name data from a name locale that is different from the formatting locale, but are in the same script.
+* The `#PCDATA` is the character sequence used to replace spaces when postprocessing a pattern.
-### 2.5 <a name="2-5-initialpattern-element" href="#2-5-initialpattern-element">initialPattern Element</a>
+### nativeSpaceReplacement Element
+
+The `<nativeSpaceReplacement>` element is used to specify how spaces should be handled when the name language is **the same as** the formatting language. It is used in languages that don't normally require spaces between words, but may use spaces within names. For example, Japanese and Chinese have the value of an empty string between words in a native name; most other languages have the value of SPACE.
+
+```xml
+<!ELEMENT nativeSpaceReplacement ( #PCDATA ) >
+<!ATTLIST nativeSpaceReplacement xml:space preserve #REQUIRED >
+```
+
+* `xml:space` must be set to `'preserve'` so that actual spaces in the pattern are preserved. See [W3C XML White Space Handling](https://www.w3.org/TR/xml/#sec-white-space).
+* The `#PCDATA` is the character sequence used to replace spaces when postprocessing a pattern.
+
+### initialPattern Element
The `<initialPattern>` element is used to specify how to format initials of name parts.
**_initial_** is a pattern used to display a single initial in the locale, while **_initialSequence_** is a pattern used to “glue” together multiple initials for multiword fields, for example with the given name “Mary Beth” in English.
-#### 2.5.1 <a name="syntax" href="#syntax">Syntax</a>
+#### Syntax
```xml
<!ELEMENT initialPattern ( #PCDATA ) >
@@ -297,41 +337,41 @@
> `<initialPattern type="initialSequence">{0} {1}</initialPattern>`
-## 3 <a name="3-person-name-object" href="#3-person-name-object">Person Name Object</a>
+## Person Name Object
The information that is to be formatted logically consists of a data object containing a number of fields. This data object is a construct for the purpose of formatting, and doesn’t represent the source of the name data. That is, the original source may contain more information. The PersonName object is merely a logical ‘transport’ of information to formatting; it may in actuality consist of, for example, an API that fetches fields from a database.
-Note that an application might have more than one set of name data for a given person, such as data for both a legal name and a nickname or preferred name. Or the source data may contain two whole sets of name data for a person from an Eastern Slavic region, one in Cyrillic characters and one in Latin characters. Or it might contain phonetic data for a name (commonly used in Japan). The additional application-specific information in person’s names is out of scope for the CLDR Person Name formatting data. Thus a calling application may produce more than one PersonName object to format depending on the purpose.
+Note that an application might have more than one set of name data for a given person, such as data for both a legal name and a nickname or preferred name. Or the source data may contain two whole sets of name data for a person from an Eastern Slavic region, one in Cyrillic characters and one in Latin characters. Or it might contain phonetic data for a name (commonly used in Japan). The additional application-specific information in person’s names is out of scope for the CLDR Person Name formatting data. Thus a calling application may produce more than one PersonName object to format depending on the purpose.
For illustration, the following is a sample PersonName object.
| Field | Value | Comment |
| ---------------- | ------------ | ------------------------------- |
-| `prefix` | “Dr.” | |
+| `title` | “Dr.” | |
| `given` | “William” | |
| `given-informal` | “Bill” | example inclusion of "nickname" |
| `given2` | “Torval” | |
| `surname` | “Brown” | |
| `nameLocale` | “und-US” | this is just for illustration |
-| `preferredOrder` | “givenFirst” | this too |
+| `preferredOrder` | “givenFirst” | values are givenFirst and surnameFirst |
-A PersonName object is logically composed of the fields above plus other possible variations. See [[Fields](#5-1-fields)]. There must be at least one field present: either a `given` or `surname` field. Other fields are optional, and some of them can be constructed from other fields if necessary.
+A PersonName object is logically composed of the fields above plus other possible variations. See [[Fields](#fields)]. There must be at least one field present: either a `given` or `surname` field. Other fields are optional, and some of them can be constructed from other fields if necessary.
-A modifier is supplied, _-informal_, which can be used to indicate which data element to choose when formatting informal cases which might include nicknames or preferred names. For more details, see section on [_[Modifiers](#5-2-modifiers)_] in [namePattern Syntax](#5-namepattern-syntax) below.
+A modifier is supplied, _-informal_, which can be used to indicate which data element to choose when formatting informal cases which might include nicknames or preferred names. For more details, see section on [_[Modifiers](#modifiers)_] in [namePattern Syntax](#namepattern-syntax) below.
-## 4 <a name="4-person-name-attributes" href="#4-person-name-attributes">Person Name Attributes</a>
+## Person Name Attributes
-A person name pattern may have any of four attributes: order, length, usage, and formality. LDML specifies that all the values for these attributes are unique. For example, because length=long is valid, usage=long cannot also be valid. That allows the pattern labels to be simple, because the attribute names can be skipped. That is,
+A person name pattern may have any of four attributes: order, length, usage, and formality. LDML specifies that all the values for these attributes are unique. For example, because length=long is valid, usage=long cannot also be valid. That allows the pattern labels to be simple, because the attribute names can be skipped. That is,
> `{order=givenFirst, length=long, usage=referring, formality=formal}`
-can be abbreviated without loss of information as:
+can be abbreviated without loss of information as:
> _givenFirst-long-referring-formal._
Each of these attributes is described below using sample PersonName objects as examples.
-### 4.1 <a name="4-1-order" href="#4-1-order">order</a>
+### order
The order attribute is used for patterns with different orders of fields. The order=sorting patterns are chosen based on input parameters, while the choice between givenFirst and surnameFirst is based on features of the PersonName object to be formatted and the `nameOrderLocales` element values.
@@ -339,13 +379,13 @@
| -------------- | -------------------------------------------- |
| `givenFirst` | The given name precedes the surname. |
| `surnameFirst` | The surname precedes the given name. |
-| `sorting` | Used to format names for a for a sorted list.<br/>example: “Brown, William” [medium, informal] |
+| `sorting` | Used to format names for a sorted list.<br/>example: “Brown, William” [medium, informal] |
For example, when the display language is Japanese, it is customary to use _surnameFirst_ for names of people from Japan and Hungary, but use _givenFirst_ for names of people from the United States and France. Although the English pattern for sorting is distinct from the other patterns (except for unusual names), that is not necessarily the case in other languages.
-### 4.2 <a name="4-2-length" href="#4-2-length">length</a>
+### length
-The `length` attribute specifies the relative length of a formatted name depending on context. For example, a `long` formal name in English would include prefix, given, given2, surname plus suffix; whereas a `short` informal name may only be the given name.
+The `length` attribute specifies the relative length of a formatted name depending on context. For example, a `long` formal name in English might include title, given, given2, surname plus generation and credentials; whereas a `short` informal name may only be the given name.
Note that the formats may be the same for different lengths depending on the formality, usage, and cultural conventions for the locale. For example, medium and short may be the same for a particular context.
@@ -355,7 +395,7 @@
| `medium` | A `medium` length is between long and short.<br/>Example: `usage="referring", formality="formal"`<br/>_“Robert Smith”_ |
| `short` | A `short` length uses a minimum set of names.<br/>Example: `usage="referring", formality="formal"`<br/>_“Mr. Smith”_ |
-### 4.3 <a name="4-3-usage" href="#4-3-usage">usage</a>
+### usage
The usage indicates if the formatted name is being used to address someone, refer to someone, or present their name in an abbreviated form.
@@ -363,15 +403,15 @@
| Parameter | Description |
| ------------ | ----------- |
-| `addressing` | Used when speaking “to” a person, or “vocative” case. This may also have an effect on the formality.<br/>example: “Robert” [medium, informal] |
-| `referring` | Used when speaking “about” a person, or “nominative” case.<br/>example: “Robert Smith” [medium, informal] |
-| `monogram` | The `monogram` usage is for a specific abbreviated form.<br/>Example: monogram for Robert James Smith may be “RS” or “RJS”.<br/>`usage=”referring”, formality=”formal”`<br/>_“RJS”_ |
+| `addressing` | Used when speaking “to” a person, or “vocative” case. This may also have an effect on the formality.<br/>example: “Welcome, **Robert**” |
+| `referring` | Used when speaking “about” a person, or “nominative” case.<br/>example: “**Robert Smith** joined your group” |
+| `monogram` | The `monogram` usage is for a specific abbreviated form for computer UI.<br/>Example: a monogram for Robert James Smith may be **RS** or **RJS**.|
Slavic languages provide a good example of `addressing` vs `referring`. An example _uk-Cyrl_ PersonName object:
| Field | Value | Comment |
| ---------------- | ------------ | ------------------------------- |
-| `prefix` | “г-н” | “Mr.” |
+| `title` | “г-н” | “Mr.” |
| `given` | “Иван” | “Ivan” |
| `given2` | “Петрович” | “Petrovich” |
| `surname` | “Васильев” | “Vasiliev” |
@@ -384,11 +424,11 @@
* Васильев Иван Петрович `// "Vasiliev Ivan Petrovich"`
-The `monogram` usage is for very short abbreviated names, such as might be found in online messaging text avatars or other annotations. Ideally, a `monogram` format should result in something that could fit in an em square. Some emoji provide examples of this: 🅰️ 🆎 🆘
+The `monogram` usage is for very short abbreviated names, such as might be found in online messaging text avatars or other annotations. Ideally, a `monogram` format should result in something that could fit in an em square. Some emoji provide examples of this: 🅰️ 🆎 🆘
When used with `length`, for many alphabetic locales a `monogram` would resolve to one, two, or three characters for short, medium, and long respectively. But that may vary depending on the usage in a locale.
-### 4.4 <a name="4-4-formality" href="#4-4-formality">formality</a>
+### formality
The `formality` indicates the formality of usage. A name on a badge for an informal gathering may be much different from an award announcement at the Nobel Prize Ceremonies.
@@ -396,10 +436,10 @@
| Parameter | Description |
| ---------- | ----------- |
-| `formal` | A more formal name for the individual. The composition depends upon the language. For example, a particular locale might include the prefix and suffix and a full middle name (given2) in the long form.<br/><br/>`length="medium", formality="formal"`<br/>“Robert J. Smith” |
-| `informal` | A less formal name for the individual. The composition depends upon the language. For example, a language might exclude the prefix, suffix and given2 (middle) name. Depending on the length, it may also exclude the surname. The formatting algorithm should choose any passed in name data that has an _informal_ attribute, if available.<br/><br/>`length="medium", formality="informal"`<br/>“Bob Smith” |
+| `formal` | A more formal name for the individual. The composition depends upon the language. For example, a particular locale might include the title, generation, credentials and a full middle name (given2) in the long form.<br/><br/>`length="medium", formality="formal"`<br/>“Robert J. Smith” |
+| `informal` | A less formal name for the individual. The composition depends upon the language. For example, a language might exclude the title, credentials and given2 (middle) name. Depending on the length, it may also exclude the surname. The formatting algorithm should choose any passed in name data that has an _informal_ attribute, if available.<br/><br/>`length="medium", formality="informal"`<br/>“Bob Smith” |
-## 5 <a name="5-namepattern-syntax" href="#5-namepattern-syntax">namePattern Syntax</a>
+## namePattern Syntax
A _namePattern_ is composed of a sequence of field IDs, each enclosed in curly braces, and separated by zero or more literal characters (eg, space or comma + space). An Extended Backus Normal Form (EBNF) is used to describe the namePattern format for a specific set of attributes. It has the following structure. This is the `( #PCDATA )` reference in the element specification above.
@@ -407,39 +447,40 @@
| ------------ | ----------------------------- | -------- |
| namePattern | = literal?<br/><span style="white-space:nowrap">( modField literal? )+;</span> | Two literals cannot be adjacent |
| modField | <span style="white-space:nowrap">= '{' field modifierList? '}';</span> | A name field, optionally modified |
-| field | = 'prefix'<br/>\| 'given'<br/>\| 'given2'<br/>\| 'surname'<br/>\| 'surname2'<br/>\| 'suffix' ; | See [Fields](#5-1-fields) |
-| modifierList | = '-informal'?<br/><span style="white-space:nowrap">( '-allCaps' \| ‘-initialCap' )?;</span><br/><span style="white-space:nowrap">( '-initial' \| '-monogram' )?</span><br/><span style="white-space:nowrap">( '-prefix' \| '-core' )?</span> | Optional modifiers that can be applied to name parts, see [Modifiers](#5-2-modifiers). Note that some modifiers are exclusive: only `prefix` or `core`, only `initial` or `monogram`, only `allCaps` or `initialCap`. |
+| field | = 'title'<br/>\| 'given'<br/>\| 'given2'<br/>\| 'surname'<br/>\| 'surname2'<br/>\| 'generation'<br/>\| 'credentials' ; | See [Fields](#fields) |
+| modifierList | = '-informal'?<br/><span style="white-space:nowrap">( '-allCaps' \| '-initialCap' )?</span><br/><span style="white-space:nowrap">( '-initial' \| '-monogram' )?</span><br/><span style="white-space:nowrap">( '-prefix' \| '-core' )?;</span> | Optional modifiers that can be applied to name parts, see [Modifiers](#modifiers). Note that some modifiers are exclusive: only `prefix` or `core`, only `initial` or `monogram`, only `allCaps` or `initialCap`. |
| literal | = codepoint+ ; | One or more Unicode codepoints. |
-### 5.1 <a name="5-1-fields" href="#5-1-fields">Fields</a>
+### Fields
The Person Name formatting data assumes that the name data to be formatted consists of the fields in the table below. All of the fields may contain multiple words. Field IDs are lowercase ASCII alphanumeric, and start with an alphabetic character.
-When determining how a full name is to be placed into name fields, the data to be formatted should be organized functionally. That is, if a name part is on the dividing line between `given2` and `given`, the key feature is whether it would always occur with the rest of the given name. For example, in _“Mary Jean Smith”_, if _“Mary”_ never occurs without the _“Jean”_, then the given name should be _“Mary Jean”_. If _“Smith”_ never occurs without the _“Jean”_, the `surname` should be _“Jean Smith”_. Otherwise, _“Jean”_ would be the `given2` field.
+When determining how a full name is to be placed into name fields, the data to be formatted should be organized functionally. That is, if a name part is on the dividing line between `given2` and `given`, the key feature is whether it would always occur with the rest of the given name. For example, in _“Mary Jean Smith”_, if _“Mary”_ never occurs without the _“Jean”_, then the given name should be _“Mary Jean”_. If _“Smith”_ never occurs without the _“Jean”_, the `surname` should be _“Jean Smith”_. Otherwise, _“Jean”_ would be the `given2` field.
-For example, a patronymic would be treated as a `given2` name in most slavic languages.
+For example, a patronymic would be treated as a `given2` name in most Slavic languages.
In some cultures, two surnames are used to indicate the paternal and maternal family names or generational names indicating father, grandfather. The `surname2` field is used to indicate this. The CLDR PersonName formatting data assumes that if a PersonName object to be formatted does not have two surnames, then the `surname2` field is not populated. (That is, no pattern should have a `surname2` field without a surname field.) Order of fields in a pattern can vary arbitrarily by locale.
-In most cultures, there is a concept of nickname or preferred name, which is used in informal settings or sometimes to represent a “public” or “stage name”. The nickname or preferred name may be submitted as a separate PersonName object to be formatted, or included with a modifier such as `given-informal`.
+In most cultures, there is a concept of nickname or preferred name, which is used in informal settings or sometimes to represent a “public” or “stage name”. The nickname or preferred name may be submitted as a separate PersonName object to be formatted, or included with a modifier such as `given-informal`.
| Field | Description<br/>Note: The values for each are as supplied by the PersonName object, via the PersonName data interface. |
| ---------- | ----------- |
-| `prefix` | Typically a title, honorific, or generational qualifier.<br/>Example: ‘Ms.’, ‘Mr.’, ’Dr’, ‘President’<br/><br/>Note that CLDR PersonName formats data does not define regional or locale-specific lists of prefixes, honorifics, or titles such as “Mr”, “Ms”, “Mx”, “Prof”, “Jr”, etc. |
+| `title` | A title or honorific qualifier.<br/>Example: ‘Ms.’, ‘Mr.’, ’Dr’, ‘President’<br/><br/>Note that CLDR PersonName formats data does not define regional or locale-specific lists of titles or honorifics such as “Mr”, “Ms”, “Mx”, “Prof”, etc. |
| `given` | The “given” name. Can be multiple words such as “Mary Ann”.<br/>Examples: “Janus”, “Mary Jean”, or “Jean-Louis”|
| `given2` | Additional given name or names or middle name, usually name(s) written between the given and surname. Can be multiple words. In some references, also known as a “second” or “additional” given name or patronymic. This field is separate from the “given” field because it is often optional in various presentation forms.<br/>Examples: “Horatio Wallace” as in<br/>`{ given: "Janus", `**`given2: "Horatio Wallace"`**`, surname: "Young" }`<br/><br/>“S.” as in “Harry S. Truman”. Yes, his full middle name was legally just “S.”.|
| `surname` | The “family name”. Can be more than one word.<br/><br/>Example: “van Gogh” as in<br/>`{ given: "Vincent", given2: "Willem", `**`surname: "van Gogh"`**` }`<br/><br/>Other examples: “Heathcote-Drummond-Willoughby” as in “William Emanuel Heathcote-Drummond-Willoughby III”|
| `surname2` | Secondary surname (used in some cultures), such as second or maternal surname in Mexico and Spain. This field is separate from the “surname” field because it is often optional in various presentation forms, and is considered a separate distinct name in some cultures.<br/><br/>Example: “Barrientos” in “Diego Rivera Barrientos”;<br/>`{ given: "Diego", surname: "Rivera", `**`surname2: "Barrientos"`**` }`<br/><br/>Example: if "Mary Jane Smith" moves to Spain the new name may be<br/>`{ given: "Mary", given2: "Jane", surname: "Smith", `**`surname2: "Jones"`**` }`|
-| `suffix` | Typically a title, honorific, or generational qualifier.<br/>Example: “PhD”, “Jr.”<br/><br/>Example: “Sonny Jarvis Jr.”<br/>`{ given: "Salvatore", given2: "Blinken", surname: "Jarvis", `**`suffix: "Jr."`**` }`<br/><br/>An alternate PersonName object may be presented for formatting using the “stage” name from the application’s data:<br/>`{ given: "Salvatore", given-informal: "Sonny", given2: "", surname: "Jarvis", `**`suffix: "Jr."`**` }` |
+| `credentials` | A credential or accreditation qualifier.<br/>Example: “PhD”, “MBA”<br/><br/>Example: “Salvatore Jarvis MBA”<br/>`{ given: "Salvatore", given2: "Blinken", surname: "Jarvis", `**`credentials: "MBA"`**` }`<br/><br/>An alternate PersonName object may be presented for formatting using the “stage” name from the application’s data:<br/>`{ given: "Salvatore", given-informal: "Salvatore", given2: "", surname: "Jarvis", `**`credentials: "MBA"`**` }` |
+| `generation` | A generation qualifier.<br/>Example: “III”, “Jr.”<br/><br/>Example: “Sonny Jarvis Jr.”<br/>`{ given: "Salvatore", given2: "Blinken", surname: "Jarvis", `**`generation: "Jr."`**` }` |
Some other examples:
* British name: _John Ronald Reuel Tolkien_: `given` name is "John", `given2` name would be "Ronald Reuel", and the `surname` is "Tolkien".
* Dutch name: _Anneliese Louise van der Pol_: `given` name: "Anneliese", `given2` name: "Louise", `surname`: "van der Pol"
- * Also surname-prefix: “van der”, surname-core: “Pol” — see below.
+ * Also surname-prefix: “van der”, surname-core: “Pol” — see below.
* French name: “Jean-Louis Trintignant” would _not_ be Jean (`given`) Louis (`given2`) Trintignant (`surname`), since “Louis” wouldn’t be discarded when formatting. Instead it would be Jean-Louis (`given`) Trintignant (`surname`)
-Note: If the legal name, stage name, etc. are substantially different, then that information can be logically in a separate PersonName object. That is, it is up to the implementation to maintain any distinctions that are important to it: CLDR PersonName formats is focusing on formatting a PersonName object that is given to it.
+Note: If the legal name, stage name, etc. are substantially different, then that information can be logically in a separate PersonName object. That is, it is up to the implementation to maintain any distinctions that are important to it: CLDR PersonName formats is focusing on formatting a PersonName object that is given to it.
`surname2` would only be asked for in certain locales, and where it is considered a separate, divisible name, such as in Mexico or Spain. For instance, in Mexico, the first and second surname are used for the legal name and in formal settings, and sometimes only the first surname is used in familiar or informal contexts.
@@ -448,7 +489,7 @@
How names get placed into fields to be formatted is beyond the scope of CLDR PersonName formats; this document just lays out the assumptions the formatting code makes when formatting the names.
-### 5.2 <a name="5-2-modifiers" href="#5-2-modifiers">Modifiers</a>
+### Modifiers
Each field in a pattern can have one or more modifiers. The modifiers can be appended to any field name, such as `{given-initial}` for the first grapheme of the given name. If more than one modifier is applied, they must be structured as in the EBNF.
@@ -460,75 +501,126 @@
| prefix | Return the “prefix” name, or the “tussenvoegsel'' if present. For example, “van der Poel” becomes “van der”, “bint Fadi” becomes “bint”, “di Santis” becomes “di”. Note that what constitutes the prefix is language- and locale-sensitive. It may be passed in as part of the PersonName object, similar to the _“-informal”_ modifier, e.g. as _“surname-prefix”_.<br/><br/>The implementation of this modifier depends on the PersonName object. CLDR does not currently provide support for automatic identification of tussenvoegsels, but may in the future.<br/><br/>If the resulting _“-prefix”_ value is empty, it defaults to an empty string.<br/><br/>An example sorting pattern for “Johannes van den Berg” may be<br/>{surname-core}, {given} {given2} {surname-prefix}<br/><br/>Only the _“-prefix”_ or the _“-core”_ modifier may be used, but not both. They are mutually exclusive. |
| core | Return the “core” name, removing any tussenvoegsel. For example, “van der Poel” becomes “Poel”, “bint Fadi” becomes “Fadi”, “di Santis” becomes “Santis”. Note that what constitutes the core is language- and locale-sensitive.<br/><br/>The implementation of this modifier depends on the PersonName object. CLDR does not currently provide support for identification of tussenvoegsel, but may in the future.<br/><br/>If the resulting _“-core”_ value is empty, it defaults to the field it modifies. E.g., if _“surname-core”_ is empty in the PersonName object to be formatted, it will default to the _“surname”_ field.<br/><br/>Vice-versa, if the _surname_ field is empty, the formatter will attempt to use _surname-prefix_ and _surname-core_, if present, to format the name.<br/><br/>Only the _“-prefix”_ or the _“-core”_ modifier may be used, but not both. They are mutually exclusive. |
| allCaps | Requests the element in all caps, which is desired in some contexts. For example, a new guideline in Japan is that for the Latin representation of Japanese names, the family name comes first and is presented in all capitals. This would be represented as<br/>“{surname-allCaps} {given}”<br/><br/>Hayao Miyazaki (宮崎 駿) would be represented in Latin characters in Japan (ja-Latn-JP) as _“MIYAZAKI Hayao”_<br/><br/>_The default implementation uses the default Unicode uppercase algorithm; if the PersonName object being formatted has a locale, and CLDR supports a locale-specific algorithm for that locale, then that algorithm is used. The PersonName object can override this, as detailed below._<br/><br/>Only the _“-allCaps”_ or the _“-initialCap”_ modifier may be used, but not both. They are mutually exclusive. |
-| initialCap | Request the element with the first grapheme capitalized, and remaining characters unchanged. This is used in cases where an element is usually in lower case but may need to be modified. For example in Dutch, the name<br/>{ prefix: “dhr.”, given: ”Johannes”, surname: “van den Berg” },<br/>when addressed formally, would need to be “dhr. Van den Berg”. This would be represented as<br/>“{prefix} {surname-initialCap}”<br/><br/>Only the _“-allCaps”_ or the _“-initalCap”_ modifier may be used, but not both. They are mutually exclusive. |
+| initialCap | Request the element with the first grapheme capitalized, and remaining characters unchanged. This is used in cases where an element is usually in lower case but may need to be modified. For example in Dutch, the name<br/>{ title: “dhr.”, given: “Johannes”, surname: “van den Berg” },<br/>when addressed formally, would need to be “dhr. Van den Berg”. This would be represented as<br/>“{title} {surname-initialCap}”<br/><br/>Only the _“-allCaps”_ or the _“-initialCap”_ modifier may be used, but not both. They are mutually exclusive. |
| initial | Requests the initial grapheme cluster of each word in a field. The `initialPattern` patterns for the locale are used to create the format and layout for lists of initials. For example, if the initialPattern types are<br/>`<initialPattern type="initial">{0}.</initialPattern>`<br/>`<initialPattern type="initialSequence">{0} {1}</initialPattern>`<br/>then a name such as<br/>{ given: “John”, given2: “Ronald Reuel”, surname: “Tolkien” }<br/>could be represented as<br/>“{given-initial-allCaps} {given2-initial-allCaps} {surname}”<br/>and will format to “**J. R. R. Tolkien**”<br/><br/>_The default implementation uses the first grapheme cluster of each word for the value for the field; if the PersonName object has a locale, and CLDR supports a locale-specific grapheme cluster algorithm for that locale, then that algorithm is used. The PersonName object can override this, as detailed below._<br/><br/>Only the _“-initial”_ or the _“-monogram”_ modifier may be used, but not both. They are mutually exclusive. |
| monogram | Requests initial grapheme. Example: A name such as<br/>{ given: “Landon”, given2: “Bainard Crawford”, surname: “Johnson” }<br/>could be represented as<br/>“{given-monogram-allCaps}{given2-monogram-allCaps}{surname-monogram-allCaps}”<br/>or “**LBJ**”<br/><br/>_The default implementation uses the first grapheme cluster of the value for the field; if the PersonName object has a locale, and CLDR supports a locale-specific grapheme cluster algorithm for that locale, then that algorithm is used. The PersonName object can override this, as detailed below. The difference between monogram and initial is that monogram only returns one element, not one element per word._<br/><br/>Only the _“-initial”_ or the _“-monogram”_ modifier may be used, but not both. They are mutually exclusive. |
+| retain | This is needed in languages that preserve punctuation when forming initials. For example, normally the name {given=Anne-Marie} is converted into initials with {given-initialCaps} as “A. M.”. However, where a language preserves the hyphen, the pattern should use {given-initialCaps**-retain**} instead. In that case, the result is “A.-M.”. (The periods are added by the pattern-initialSequence.) |
+| genitive, vocative | Patterns can use these modifiers so that better results can be obtained for inflected languages. However, see the details below. |
-There may be more modifiers in the future.
+#### Grammatical Modifiers for Names
-Examples:
+The CLDR person name formatting does not itself support grammatical inflection.
+However, name sources (NameObject) can support inflections, either by having additional fields or by using an inflection engine that can handle personal name parts.
+
+In the current release, the focus is on supporting `referring` and `addressing` forms.
+Typically the `referring` forms will be in the most neutral (*nominative*) case, and the `addressing` forms will be in the *vocative* case.
+Some modifiers have been added to facilitate this, so that there can be patterns like: {given-vocative} {surname-vocative}.
+
+Notice that some **parts** of the formatted name may be in different grammatical cases, so the cases may not be consistent across the whole name.
+For example:
+
+| English Pattern | Examples | Latvian Pattern | Examples |
+| ---- | ---- | ---- | ---- |
+| {given} {surname} | John Smith | {given} {surname} | Kārlis Ozoliņš |
+| {title} {surname} | Mr Smith | {surname} {title} | Ozoliņa kungs |
+
+Notice that the `surname` in Latvian needs to change to the genitive case with that pattern:
+
+Ozoliņš ➡︎ **Ozoliņa**
+
+That is accomplished by changing the pattern to be {surname<b>-genitive</b>} {title}. In this case the {surname} should only be genitive if followed by the {title}.
+
+#### Future Modifiers
+
+Additional modifiers may be added in future versions of CLDR.
+
+Examples:
1. For the initial of the surname **_“de Souza”_**, in a language that treats the “de” as a tussenvoegsel, the PersonName object can automatically recast `{surname-initial}` to:<br/>`{surname-prefix-initial}{surname-core-initial-allCaps} `to get “dS” instead of “d”.
2. If the locale expects a surname prefix to be sorted after a surname, then both `{surname-core} `then `{surname-prefix}` would be used as in<br/>`{surname-core}, {given} {given2} {surname-prefix}`
+3. Only the grammatical modifiers requested by translators for `referring` or `addressing` have been added as yet, but additional grammatical modifiers may be added in the future.
-## 6 <a name="6-formatting-process" href="#6-formatting-process">Formatting Process</a>
+## Formatting Process
-The patterns are in personName elements, which are themselves in a personNames container element. The following describes how these patterns are chosen. If the name locale is different than the formatting locale, then additional processing needs to take place: see [Handling foreign names](#6-8-handling-foreign-names).
+The patterns are in **personName** elements, which are themselves in a **personNames** container element. The following describes how the formatter's locale interacts with the personName's locale, how the name patterns are chosen, and how they are processed.
-The details of the XML structure behind the data referenced here are in [XML Structure](#2-xml-structure).
+The details of the XML structure behind the data referenced here are in [XML Structure](#xml-structure).
-### 6.1 <a name="6-1-derive-the-name-locale" href="#6-1-derive-the-name-locale">Derive the name locale</a>
+The formatting process may be refined in the future. In particular, additional data may be added to allow further customization.
-Create a **full name locale** as follows.
+The term **maximal likely locale** used below is the result of using the [Likely Subtags](tr35.md#Likely_Subtags) data to map from a locale to a full representation that includes the base language, script, and region.
-1. First, let the **full formatting locale** be the fully-fleshed-out formatting locale using likely subtags.
-2. If there is a name locale available via the PersonName data interface, obtain the full name locale from the name locale using likely subtags. Thus de ⇒ de_Latn_de.
-3. Otherwise the full name locale is created based on the characters in the name and the full formatting locale, as follows:
- 1. Find the predominant script for the name in the following way.
- 1. For each character in the given and surname, find the script(s) of the character using the Script_Extensions property.
- 2. For each of those scripts, increment a counter for that script, and record the position of the first character encountered in that script.
- 2. The predominant script is the script with the highest counter value.
- 1. In the rare case that there are multiple counters with the highest counter value, take the one with the lowest first position.
- 2. In the even rarer case that there is still more than one, use the script whose script code is alphabetically lowest. (These two steps are simply to guarantee a determinant result.)
- 3. If the predominant script is the same as the script of the full formatting locale, then let the full name locale be the full formatting locale.
- 4. Otherwise, find the likely locale for the predominant script, as specified by the likely subtags. (This will add a language and region.) Let the full name locale be that likely locale.
+### Derive the name locale
-In all steps below, the "name locale" is the full name locale.
+Construct the **name script** in the following way.
+1. Iterate through the characters of the surname, then through the given name.
+ 1. Find the script of that character using the Script property.
+ 2. If the script is not Common, Inherited, nor Unknown, return that script as the **name script**
+2. If nothing is found during the iteration, return Zzzz (Unknown Script)
-### 6.2 <a name="6-2-derive-the-formatting-locale" href="#6-2-derive-the-formatting-locale">Derive the formatting locale</a>
+Construct the **name base language** in the following way.
+1. If the PersonName object can provide a name locale, return its language.
+2. Otherwise, find the maximal likely locale for the name script and return its base language (first subtag).
-If the full name locale is different from the full formatting locale, and the predominant script of the name is different from the script of the formatting locale, then let the full formatting locale be the full name locale.
+Construct the **name locale** in the following way:
+1. If the PersonName object can provide a name locale, return a locale formed from it by replacing its script by the name script.
+2. Otherwise, return the locale formed from the name base language plus name script.
-In all steps below, the "formatting locale" is the full formatting locale.
+Construct the **name ordering locale** in the following way:
+1. If the PersonName object can provide a name locale, return it.
+2. Otherwise, return the maximal likely locale for “und-” + name script.
-### 6.3 <a name="6-3-derive-the-name-order" href="#6-3-derive-the-name-order">Derive the name order</a>
+### Derive the formatting locale
+
+Let the **full formatting locale** be the maximal likely locale for the formatter's locale. The **formatting base language** is the base language (first subtag) of the full formatting locale, and the **formatting script** is the script code of the full formatting locale.
+
+#### Switch the formatting locale if necessary
+
+A few script values represent a set of scripts, such as Jpan = {Hani, Kana, Hira}. Two script codes are said to _match_ when they are either identical, or one represents a set which contains the other, or they both represent sets which intersect. For example, Hani and Jpan match, because {Hani, Kana, Hira} contains Hani.
+
+If the **name script** doesn't match the **formatting script**:
+1. If the name locale has name formatting data, then set the formatting locale to the name locale.
+2. Otherwise, set the formatting locale to the maximal likely locale for the locale formed from und, plus the name script plus the region of the nameLocale.
+
+For example, when a Hindi (Devanagari) formatter is called upon to format a name object that has the locale Ukrainian (Cyrillic):
+* If the name is written with Cyrillic letters, under the covers a Ukrainian (Cyrillic) formatter should be instantiated and used to format that name.
+* If the name is written in Greek letters, then under the covers a Greek (Greek-script) formatter should be instantiated and used to format.
+
+To determine whether there is name formatting data for a locale, get the values for each of the following paths.
+If at least one of them doesn’t inherit their value from root, then the locale has name formatting data.
+* //ldml/personNames/nameOrderLocales[@order="givenFirst"]
+* //ldml/personNames/nameOrderLocales[@order="surnameFirst"]
+
+### Derive the name order
A PersonName object’s fields are used to derive an order, as follows:
-1. If the PersonName object to be formatted has a `preferredOrder` field, then return that field’s value
-2. Otherwise use the nameOrderLocales elements to find the most best match for the full name locale, as follows.
- 1. For each locale L1 in the parent locale lookup chain* for the full name locale, do the following
+1. If the calling API requests sorting order, that is used.
+2. Otherwise, if the PersonName object to be formatted has a `preferredOrder` field, then return that field’s value
+3. Otherwise, use the nameOrderLocales elements to find the best match for the name locale, as follows.
+ 1. For each locale L1 in the parent locale lookup chain* for the **name ordering locale**, do the following
1. Create a locale L2 by replacing the language subtag by 'und'. (Eg, 'de_DE' ⇒ 'und_DE')
2. For each locale L in {L1, L2}, do the following
1. If there is a precise match among the givenFirst nameOrderLocales for L, then let the nameOrder be givenFirst, and stop.
2. Otherwise if there is a precise match among the surnameFirst nameOrderLocales for L, then let the nameOrder be surnameFirst, and stop.
+ 2. Otherwise, let the nameOrder be givenFirst, and stop.
-For example, here is a parent locale lookup chain:
+\* For example, here is a parent locale lookup chain:
- de_Latn_de ⇒ de_Latn ⇒ de_de ⇒ de ⇒ und
+ de_Latn_DE ⇒ de_Latn ⇒ de_DE ⇒ de ⇒ und
-In other words, you'll check the givenFirst and surnameFirst resources for the following locales, in this order:
+In other words, with the name locale of `de_Latn_DE` you'll check the givenFirst and surnameFirst resources for the following locales, in this order:
 de_Latn_DE, und_Latn_DE, de_Latn, und_Latn, de_DE, und_DE, de, und
-This process will always terminate, because there is always a und value in one of the two nameOrderLocales elements.
+This process will always terminate, because there is always a und value in one of the two nameOrderLocales elements. Remember that the lookup chain requires use of the parentLocales elements: it is not just truncation.
For example, the data for a particular locale might look like the following:
```xml
<nameOrderLocales order="surnameFirst">zh ja und-CN und-TW und-SG und-HK und-MO und-HU und-JP</nameOrderLocales>
```
-The nameOrderLocales will match any locale with a zh or ja [unicode_language_subtag](https://unicode.org/reports/tr35/#unicode_language_subtag) and any locale with a CN, TW, SG, HK MO, HU, or JP [unicode_region_subtag](https://unicode.org/reports/tr35/#unicode_region_subtag).
+These nameOrderLocales will match any locale with a zh or ja [unicode_language_subtag](tr35.md#unicode_language_subtag) and any locale with a CN, TW, SG, HK, MO, HU, or JP [unicode_region_subtag](tr35.md#unicode_region_subtag).
Here are some more examples. Note that if there is no order field or locale field in the PersonName object to be formatted, and the script of the PersonName data is different from that of the formatting locale, then the default result is givenFirst.
@@ -540,7 +632,7 @@
| | fr | givenFirst |
| | | givenFirst |
-### 6.4 <a name="6-4-choose-a-personname" href="#6-4-choose-a-personname">Choose a personName</a>
+### Choose a personName element
The personName data in CLDR provides representations for how names are to be formatted across the different axes of _order_, _length_, _usage_, and _formality_. More than one `namePattern` can be associated with a single `personName` entry. An algorithm is then used to choose the best `namePattern` to use.
@@ -549,7 +641,7 @@
```xml
<personNames>
<personName order="givenFirst" length="long" usage="referring" formality="formal">
- <namePattern>{prefix} {given} {given2} {surname}, {suffix}</namePattern>
+ <namePattern>{title} {given} {given2} {surname}, {credentials}</namePattern>
</personName>
<personName order="givenFirst" length="long" usage="referring" formality="informal">
<namePattern>{given} «{given2}» {surname}</namePattern>
@@ -573,7 +665,7 @@
* The usage attribute values contain U or there is no usage attribute, and
* The formality attribute values contain F or there is no formality attribute
-Example for input parameters
+Example for input parameters
> `order = `**`givenFirst`**`, length = `**`long`**`, usage = `**`referring`**`, formality = `**`formal`**
@@ -587,9 +679,9 @@
To find the matching personName element, traverse all the personNames in order until the first one is found. This will always terminate since the data is well-formed in CLDR.
-### 6.5 <a name="6-5-choose-a-namepattern" href="#6-5-choose-a-namepattern">Choose a namePattern</a>
+### Choose a namePattern
-To format a name, the fields in a namePattern are replaced with fields fetched from the PersonName Data Interface. The personName element can contain multiple namePattern elements. Choose one based on the fields in the input PersonName object that are populated:
+To format a name, the fields in a namePattern are replaced with fields fetched from the PersonName Data Interface. The personName element can contain multiple namePattern elements. Choose one based on the fields in the input PersonName object that are populated:
1. Find the set of patterns with the most populated fields.
2. If there is just one element in that set, use it.
2. Otherwise, among that set, find the set of patterns with the fewest unpopulated fields.
@@ -603,136 +695,37 @@
3. Pattern C is discarded, because it has the least number of populated name fields.
4. Out of the remaining patterns A and B, pattern B wins, because it has only 3 unpopulated fields compared to pattern A.
-If the “winning” namePattern still has fields that are unpopulated in the PersonName object, we alter the pattern algorithmically as follows:
+### Access PersonName object
-1. If one or more fields at the start of the pattern are empty, all fields, whitespace, and literal text before the **first** populated field are deleted.
-2. If one or more fields at the end of the pattern are empty, all fields, whitespace, and literal text after the **last** populated field are deleted.
-3. For each empty field in the middle of the pattern (going from left to right), that field and all literal text between it and the nearest whitespace or field on both sides is deleted. If this results in two whitespace characters next to each other, they are coalesced into one.
+#### Handle missing surname
-### 6.6 <a name="6-6-examples-of-choosing-a-namepattern" href="#6-6-examples-of-choosing-a-namepattern">Examples of choosing a namePattern</a>
+All PersonName objects will have a given name (for mononyms the given name is used). However, there may not be a surname. In that case, the following process is followed so that formatted patterns produce reasonable results.
-#### 6.6.1 <a name="examples-for-rules-1-and-2" href="#examples-for-rules-1-and-2">Examples for rules 1 and 2</a>
+1. If there is no surname from a PersonName P1 _and_ the pattern either doesn't include the given name or only shows an initial for the given name, then:
+ 1. Construct and use a derived PersonName P2, whereby P2 behaves exactly as P1 except that:
+ 1. Any request for a surname field (with any modifiers) returns P1's given name (with the same modifiers)
+ 2. Any request for a given name field (with any modifiers) returns "" (empty string)
-The personName element contains:
+As always, this is a logical description and may be optimized in implementations. For example, an implementation may use an interface for P2 that just delegates calls to P1, with some redirection for accesses to surname and given name.
+#### Handle core and prefix
-> `<namePattern>{prefix} {given} {given2} {surname}, {suffix}</namePattern>`
+A given field may have a core value, a prefix value, and/or a ‘plain’ value (neither core nor prefix). If one or more of them are missing, then the returned values should be adjusted according to the table below. In the three cells on the left, a ✓ indicates that a value is available, and an ✖️ indicates that there is none. In the three cells on the right, the value of = means the returned value is unchanged, ✖️ means the returned value is “empty”, and anything else is a description of what to change it to.
+| prefix | core | plain | | prefix | core | plain |
+| ------ | ---- | ----- |-| ------ | ---- | ----- |
+| ✓ | ✓ | ✓ | | = | = | = |
+| ✓ | ✖️ | ✓ | | ✖️ | plain | = |
+| ✖️ | ✓ | ✓ | | = | plain | = |
+| ✖️ | ✖️ | ✓ | | = | plain | = |
+| ✓ | ✓ | ✖️ | | = | = | prefix + " " + core |
+| ✖️ | ✓ | ✖️ | | = | = | core |
+| ✓ | ✖️ | ✖️ | | ✖️ | = | = |
+| ✖️ | ✖️ | ✖️ | | = | = | = |
-The input PersonName object contains:
+For example, if the surname-prefix is "von und zu" and the surname-core is "Stettbach" and there is no surname (plain), then the derived value for the (plain) surname is "von und zu Stettbach". (The cases where existing prefix values are changed should not be necessary with well-formed PersonName data.)
-| `prefix` | `given` | `given2` | `surname` | `suffix` |
-| -------- | ------- | -------- | --------- | -------- |
-| | Raymond | J. | Johnson | Jr. |
-
-The output is:
-
-> Raymond J. Johnson, Jr.
-
-The “prefix” field is empty, and so both it and the space that follows it are omitted from the output, according to rule 1 above.
-
-If, instead, the input PersonName object contains:
-
-| `prefix` | `given` | `given2` | `surname` | `suffix` |
-| -------- | ------- | -------- | --------- | -------- |
-| | Raymond | J. | Johnson | |
-
-The output is:
-
-> Raymond J. Johnson
-
-The “prefix” field is empty, and so both it and the space that follows it are omitted from the output, according to rule 1 above.
-
-The “suffix” field is also empty, so it and both the comma and the space that precede it are omitted from the output, according to rule 2 above.
-
-#### 6.6.2 <a name="examples-for-rule-3-and-the-interaction-between-the-rules" href="#examples-for-rule-3-and-the-interaction-between-the-rules">Examples for rule 3 and the interaction between the rules</a>
-
-To see how rule 3 interacts with the other rules, consider an imaginary language in which people generally have given and given2 (or middle) names, and the given2 name is always written with parentheses around it, and the given name is usually written as an initial with a following period.
-
-The personName element contains:
-
-> `<namePattern>{given-initial}. ({given2}) {surname}</namePattern>`
-
-
-The input PersonName object contains:
-
-| `given` | `given2` | `surname` |
-| ------- | -------- | --------- |
-| Foo | Bar | Baz |
-
-The output is:
-
-> F. (Bar) Baz
-
-If, instead, the input PersonName object contains:
-
-| `given` | `given2` | `surname` |
-| ------- | -------- | --------- |
-| Foo | | Baz |
-
-The output is:
-
-> F. Baz
-
-The “given2” field is empty, so it and the surrounding parentheses are omitted from the output, as is one of the surrounding spaces, according to rule 3. The period after “{given-initial}” remains, because it is separated from the “{given2}” element by space-- punctuation around a missing field is only deleted up until the closest space in each direction.
-
-If there were no space between the period and the parentheses, as might happen if our hypothetical language didn’t use spaces:
-
-> `<namePattern>{given-initial}.({given2}) {surname}</namePattern>`
-
-The input PersonName object still contains:
-
-| `given` | `given2` | `surname` |
-| ------- | -------- | --------- |
-| Foo | | Baz |
-
-The output is:
-
-> F Baz
-
-Both the period after “{given-initial}” _and_ the parentheses around “{given2}” are omitted from the output, because there was no space between them — instead, we delete punctuation all the way up to the neighboring field. To solve this (making sure the “{given-initial}” field always has a period after it), you would add another namePattern:
-
-> `<namePattern>{given-initial}.({given2}) {surname}</namePattern>`<br/>
-> `<namePattern alt=”2”>{given-initial}. {surname}</namePattern>`
-
-The first pattern would be used when the “given2” field is populated, and the second pattern would be used when the “given2” field is empty.
-
-Rules 1 and 3 can conflict in similar ways. If the personName element contains (there’s a space between the period and the opening parenthesis again):
-
-> `<namePattern>{given-initial}. ({given2}) {surname}</namePattern>`
-
-And the input PersonName object contains:
-
-| `given` | `given2` | `surname` |
-| ------- | -------- | --------- |
-| | Bar | Baz |
-
-The output is:
-
-> Bar) Baz
-
-Because the “given” field is empty, rule 1 not only has us delete it, but also all punctuation up to “{given2}”. This includes _both_ the period _and_ the opening parenthesis. Again, to solve this, you’d supply two namePatterns:
-
-> `<namePattern>{given-initial}. ({given2}) {surname}</namePattern>`<br/>
-> `<namePattern alt=”2”> ({given2}) {surname}</namePattern>`
-
-The output would then be:
-
-> (Bar) Baz
-
-The first namePattern would be used if the “given” field was populated, and the second would be used if it was empty.
-
-If, instead, the input PersonName object contains:
-
-| `given` | `given2` | `surname` |
-| ------- | -------- | --------- |
-| Foo | | Baz |
-
-The output is:
-
-> F. Baz
-
-### 6.7 <a name="6-7-deriving-initials" href="#6-7-deriving-initials">Deriving initials</a>
+#### Derive initials
The following process is used to produce initials when they are not supplied by the PersonName object. Assuming the input example is “Mary Beth”:
@@ -743,41 +736,66 @@
| 3. The ***initial*** pattern is applied to each<br/>` <initialPattern type="initial">{0}.</initialPattern>` | “M.” and “B.” |
| 4. Finally recombined with ***initialSequence***<br/>` <initialPattern type="initialSequence">{0} {1}</initialPattern>` | “M. B.” |
-See the “initial” modifier in the [Modifiers](#5-2-modifiers) section for more details.
+See the “initial” modifier in the [Modifiers](#modifiers) section for more details.
-### 6.8 <a name="6-8-handling-foreign-names" href="#6-8-handling-foreign-names">Handling foreign names</a>
+### Process a namePattern
-There are two main challenges in dealing with foreign name formatting that needs to be considered. One is the ordering, which is dealt with under the section [[2.3 nameOrderLocales Element](#2-3-nameorderlocales-element)]. The other is spacing.
+The “winning” namePattern may still have fields that are unpopulated (empty) in the PersonName object. That namePattern is populated with field values with the following steps:
-Some writing systems require spaces (or some other non-letters) to separate words. For example, [Hayao Miyazaki](https://en.wikipedia.org/wiki/Hayao_Miyazaki) is written in English with given name first and with a space between the two name fields, while in Japanese there is no space with surname first:
+1. If one or more fields at the start of the pattern are empty, all fields and literal text before the **first** populated field are omitted.
+2. If one or more fields at the end of the pattern are empty, all fields and literal text after the **last** populated field are omitted.
+3. Processing from the start of the remaining pattern:
+ 1. If there are two or more empty fields separated only by literals, the fields and the literals between them are removed.
+ 2. If there is a single empty field, it is removed.
+4. If the processing from step 3 results in two adjacent literals (call them A and B), they are coalesced into one literal as follows:
+ 1. If either is empty the result is the other one.
+ 2. If B matches the end of A, then the result is A. So xyz + yz ⇒ xyz, and xyz + xyz ⇒ xyz.
+ 3. Otherwise the result is A + B, further modified by replacing any sequence of two or more white space characters by the first whitespace character.
+5. All of the fields are replaced by the corresponding values from the PersonName object.
-> [宮崎駿](https://ja.wikipedia.org/wiki/%E5%AE%AE%E5%B4%8E%E9%A7%BF)
+The result is the **formatted value**. However, there is one further step that might further modify that value.
-1. If a locale requires spaces between words, the normal patterns for the formatting locale are used. On Wikipedia, for example, note the space within the Japanese name on pages from English and Korean (an ideographic space is used here for emphasis).
+#### Handling foreign names
-* “[Hayao Miyazaki (宮崎<span style="background-color:aqua"> </span>駿, Miyazaki Hayao](https://en.wikipedia.org/wiki/Hayao_Miyazaki)…” or
-* “[미야자키<span style="background-color:aqua"> </span>하야오(일본어: 宮﨑<span style="background-color:aqua"> </span>駿 Miyazaki Hayao](https://ko.wikipedia.org/wiki/%EB%AF%B8%EC%95%BC%EC%9E%90%ED%82%A4_%ED%95%98%EC%95%BC%EC%98%A4)…”.
+There are two main challenges in dealing with foreign name formatting that need to be considered. One is the ordering, which is dealt with under the section [nameOrderLocales Element](#nameorderlocales-element). The other is spacing.
-2. If a locale **doesn’t** require spaces between words, there are two cases, based on whether the foreign name is written in the locale’s script, or the foreign name is left in its native script. In both cases, patterns from the **locale of the name** are used. For example, the formatting locale might be Japanese, and the locale of the PersonName object might be de_CH, German (Switzerland), such as Albert Einstein.
+Some writing systems require spaces (or some other non-letters) to separate words. For example, [Hayao Miyazaki](https://en.wikipedia.org/wiki/Hayao_Miyazaki) is written in English with given name first and with a space between the two name fields, while in Japanese there is no space with surname first: [宮崎駿](https://ja.wikipedia.org/wiki/%E5%AE%AE%E5%B4%8E%E9%A7%BF)
- 1. **The foreign name is written in the formatting locale’s script.** In that case, the **foreignSpaceReplacement** is substituted for each space in the patterns from the _locale of the name_. Here are examples for Albert Einstein in Japanese and Chinese:
- * [アルベルト<span style="background-color:aqua">・</span>アインシュタイン](https://ja.wikipedia.org/wiki/%E3%82%A2%E3%83%AB%E3%83%99%E3%83%AB%E3%83%88%E3%83%BB%E3%82%A2%E3%82%A4%E3%83%B3%E3%82%B7%E3%83%A5%E3%82%BF%E3%82%A4%E3%83%B3)
- * [阿尔伯特<span style="background-color:aqua">·</span>爱因斯坦](https://zh.wikipedia.org/wiki/%E9%98%BF%E5%B0%94%E4%BC%AF%E7%89%B9%C2%B7%E7%88%B1%E5%9B%A0%E6%96%AF%E5%9D%A6)
- 2. **The foreign name is written in a different script.** In that case, the patterns from the **locale of the name** are used as is.
- * [Albert Einstein](https://de.wikipedia.org/wiki/Albert_Einstein)
+If a locale requires spaces between words, the normal patterns for the formatting locale are used. On Wikipedia, for example, note the space within the Japanese name on pages from English and Korean (an ideographic space is used here for emphasis).
-In both cases, the ordering may be changed according to the **Name Order for Locales** settings that each locale provides. If the PersonName object does not supply a locale for a name, then a default locale will be derived based on other information (such as the script of the characters in the name fields).
+* “[Hayao Miyazaki (宮崎<span style="background-color:aqua"> </span>駿, Miyazaki Hayao](https://en.wikipedia.org/wiki/Hayao_Miyazaki)…” or
+* “[미야자키<span style="background-color:aqua"> </span>하야오(일본어: 宮﨑<span style="background-color:aqua"> </span>駿 Miyazaki Hayao](https://ko.wikipedia.org/wiki/%EB%AF%B8%EC%95%BC%EC%9E%90%ED%82%A4_%ED%95%98%EC%95%BC%EC%98%A4)…”.
-> **Note** In the tech preview, the structure isn't yet powerful enough to handle cases with `foreignSpaceReplacement` where the formatting locale doesn’t need spaces between words, but the name locale has the same ordering as the formatting locale.
-> For example, consider where the formatting locale is Thai, and the name is in English, but transliterated into Thai.
+If a locale **doesn’t** require spaces between words, there are two cases, based on whether the name is foreign or not (based on the PersonName object's explicit or calculated locale's language subtag). For example, the formatting locale might be Japanese, and the locale of the PersonName object might be de_CH, German (Switzerland), such as Albert Einstein. When the locale is foreign, the **foreignSpaceReplacement** is substituted for each space in the formatted name. When the name locale is native, a **nativeSpaceReplacement** is substituted for each space in the formatted name. The precise algorithm is given below.
-To illustrate how foreign space replacement works, consider the following name data. For illustration, the name locale is given in the maximized form: in practice, `ja` would be used instead of `ja_Jpan_JP`, and so on.: For more information, see Likely Subtags [TBD add link].
+Here are examples for Albert Einstein in Japanese and Chinese:
+* [アルベルト<span style="background-color:aqua">・</span>アインシュタイン](https://ja.wikipedia.org/wiki/%E3%82%A2%E3%83%AB%E3%83%99%E3%83%AB%E3%83%88%E3%83%BB%E3%82%A2%E3%82%A4%E3%83%B3%E3%82%B7%E3%83%A5%E3%82%BF%E3%82%A4%E3%83%B3)
+* [阿尔伯特<span style="background-color:aqua">·</span>爱因斯坦](https://zh.wikipedia.org/wiki/%E9%98%BF%E5%B0%94%E4%BC%AF%E7%89%B9%C2%B7%E7%88%B1%E5%9B%A0%E6%96%AF%E5%9D%A6)
+
+#### Setting the spaceReplacement
+
+1. The foreignSpaceReplacement is provided by the value for the `foreignSpaceReplacement` element; the default value is a SPACE (" ").
+2. The nativeSpaceReplacement is provided by the value for the `nativeSpaceReplacement` element; the default value is SPACE (" ").
+3. If the formatter base language matches the name base language, then let spaceReplacement = nativeSpaceReplacement, otherwise let spaceReplacement = foreignSpaceReplacement.
+4. Replace each sequence of spaces in the formatted value string by the spaceReplacement.
+
+For the purposes of this algorithm, two base languages are said to __match__ when they are identical, or if both are in {ja, zh, yue}.
+
+**Note:** in the future the plan is to make the specific languages and scripts used in this algorithm be data-driven.
+
+Remember that **a name in a different script** will use a different locale for formatting, as per [Switch the formatting locale if necessary](#switch-the-formatting-locale-if-necessary).
+For example, when formatting a name for Japanese, if the name is in the Latin script, a Latin based locale will be used to format it, such as when “Albert Einstein” appears in Latin characters as in the Wikipedia page [Albert Einstein](https://ja.wikipedia.org/wiki/Albert_Einstein).
+
+#### Examples of space replacement
+
+To illustrate how foreign space replacement works, consider the following name data. For illustration, the name locale is given in the maximized form: in practice, `ja` would be used instead of `ja_Jpan_JP`, and so on. For more information, see [Likely Subtags](tr35.md#Likely_Subtags).
| name locale | given | surname |
| ------------- | -------- | ------------- |
| `de_Latn_CH` | Albert | Einstein |
-| `de_Jpan_CH` | アルベルト | アインシュタイン |
-| `ja_Jpan_JP` | Hayao | Miyazaki |
+| `de_Kata_CH` | アルベルト | アインシュタイン |
+| `ja_Kata_CH` | アルベルト | アインシュタイン |
+| `ja_Latn_JP` | Hayao | Miyazaki |
| `ja_Jpan_JP` | 駿 | 宮崎 |
Suppose the PersonNames formatting patterns for `ja_JP` and `de_CH` contained the following:
@@ -788,14 +806,15 @@
<personNames>
<nameOrderLocales order="givenFirst">und</nameOrderLocales>
<<strong>nameOrderLocales</strong> order="<strong>surnameFirst</strong>">hu <strong>ja</strong> ko vi yue zh <strong>und_JP</strong></nameOrderLocales>
+ <<strong>nativeSpaceReplacement</strong> xml:space="preserve"><span style="background-color:aqua"></span></nativeSpaceReplacement>
<<strong>foreignSpaceReplacement</strong> xml:space="preserve"><span style="background-color:aqua">・</span></foreignSpaceReplacement>
. . .
<personName order="<strong>givenFirst</strong>" length="medium" usage="referring" formality="formal">
- <namePattern>{given}<span style="background-color:aqua"> </span>{given2}<span style="background-color:aqua"> </span>{surname}{suffix}</namePattern>
+ <namePattern>{given}<span style="background-color:aqua"> </span>{given2}<span style="background-color:aqua"> </span>{surname}{generation}</namePattern>
</personName>
. . .
<personName order="<strong>surnameFirst</strong>" length="medium" usage="referring" formality="formal">
- <namePattern>{surname}{given2}{given}{suffix}</namePattern>
+ <namePattern>{surname}{given2}{given}{generation}</namePattern>
</personName>
. . .
</personNames>
@@ -810,19 +829,20 @@
<nameOrderLocales order="<strong>givenFirst</strong>">und <strong>de</strong></nameOrderLocales>
<nameOrderLocales order="surnameFirst">ko vi yue zh</nameOrderLocales>
<foreignSpaceReplacement xml:space="preserve"><span style="background-color:aqua"> </span></foreignSpaceReplacement>
- . . .
+ . . .
<personName order="givenFirst" length="medium" usage="referring" formality="formal">
- <namePattern>{given}<span style="background-color:aqua"> </span>{given2-initial}<span style="background-color:aqua"> </span>{surname}, {suffix}</namePattern>
+ <namePattern>{given}<span style="background-color:aqua"> </span>{given2-initial}<span style="background-color:aqua"> </span>{surname}, {generation}</namePattern>
</personName>
- . . .
+ . . .
<personName order="surnameFirst" length="medium" usage="referring" formality="formal">
- <namePattern>{surname}<span style="background-color:aqua">, </span>{given}<span style="background-color:aqua"> </span>{given2-initial}<span style="background-color:aqua">,</span> {suffix}</namePattern>
+ <namePattern>{surname}<span style="background-color:aqua">, </span>{given}<span style="background-color:aqua"> </span>{given2-initial}<span style="background-color:aqua">,</span> {generation}</namePattern>
</personName>
- . . .
+ . . .
</personNames>`
</pre>
The name data would resolve as follows:
+<!-- TODO Replace the following with a markdown table -->
<table>
<tr>
@@ -928,33 +948,157 @@
</table>
<br/>
-## 7 <a name="7-sample-name" href="#7-sample-name">Sample Name</a>
+### Formatting examples
+
+The personName element contains:
+
+
+> `<namePattern>{title} {given} {given2} {surname}, {generation}</namePattern>`
+
+
+The input PersonName object contains:
+
+| `title` | `given` | `given2` | `surname` | `generation` |
+| -------- | ------- | -------- | --------- | -------- |
+| | Raymond | J. | Johnson | Jr. |
+
+The output is:
+
+> Raymond J. Johnson, Jr.
+
+The “title” field is empty, and so both it and the space that follows it are omitted from the output, according to rule 1 above.
+
+If, instead, the input PersonName object contains:
+
+| `title` | `given` | `given2` | `surname` | `generation` |
+| -------- | ------- | -------- | --------- | -------- |
+| | Raymond | J. | Johnson | |
+
+The output is:
+
+> Raymond J. Johnson
+
+The “title” field is empty, and so both it and the space that follows it are omitted from the output, according to rule 1 above.
+
+The “generation” field is also empty, so it and both the comma and the space that precede it are omitted from the output, according to rule 2 above.
+
+To see how rule 3 interacts with the other rules, consider an imaginary language in which people generally have given and given2 (or middle) names, and the given2 name is always written with parentheses around it, and the given name is usually written as an initial with a following period.
+
+The personName element contains:
+
+> `<namePattern>{given-initial}. ({given2}) {surname}</namePattern>`
+
+
+The input PersonName object contains:
+
+| `given` | `given2` | `surname` |
+| ------- | -------- | --------- |
+| Foo | Bar | Baz |
+
+The output is:
+
+> F. (Bar) Baz
+
+If, instead, the input PersonName object contains:
+
+| `given` | `given2` | `surname` |
+| ------- | -------- | --------- |
+| Foo | | Baz |
+
+The output is:
+
+> F. Baz
+
+The “given2” field is empty, so it and the surrounding parentheses are omitted from the output, as is one of the surrounding spaces, according to rule 3. The period after “{given-initial}” remains, because it is separated from the “{given2}” element by space-- punctuation around a missing field is only deleted up until the closest space in each direction.
+
+If there were no space between the period and the parentheses, as might happen if our hypothetical language didn’t use spaces:
+
+> `<namePattern>{given-initial}.({given2}) {surname}</namePattern>`
+
+The input PersonName object still contains:
+
+| `given` | `given2` | `surname` |
+| ------- | -------- | --------- |
+| Foo | | Baz |
+
+The output is:
+
+> F Baz
+
+Both the period after “{given-initial}” _and_ the parentheses around “{given2}” are omitted from the output, because there was no space between them — instead, we delete punctuation all the way up to the neighboring field. To solve this (making sure the “{given-initial}” field always has a period after it), you would add another namePattern:
+
+> `<namePattern>{given-initial}.({given2}) {surname}</namePattern>`<br/>
+> `<namePattern alt="2">{given-initial}. {surname}</namePattern>`
+
+The first pattern would be used when the “given2” field is populated, and the second pattern would be used when the “given2” field is empty.
+
+Rules 1 and 3 can conflict in similar ways. If the personName element contains (there’s a space between the period and the opening parenthesis again):
+
+> `<namePattern>{given-initial}. ({given2}) {surname}</namePattern>`
+
+And the input PersonName object contains:
+
+| `given` | `given2` | `surname` |
+| ------- | -------- | --------- |
+| | Bar | Baz |
+
+The output is:
+
+> Bar) Baz
+
+Because the “given” field is empty, rule 1 not only has us delete it, but also all punctuation up to “{given2}”. This includes _both_ the period _and_ the opening parenthesis. Again, to solve this, you’d supply two namePatterns:
+
+> `<namePattern>{given-initial}. ({given2}) {surname}</namePattern>`<br/>
+> `<namePattern alt="2"> ({given2}) {surname}</namePattern>`
+
+The output would then be:
+
+> (Bar) Baz
+
+The first namePattern would be used if the “given” field was populated, and the second would be used if it was empty.
+
+If, instead, the input PersonName object contains:
+
+| `given` | `given2` | `surname` |
+| ------- | -------- | --------- |
+| Foo | | Baz |
+
+The output is:
+
+> F. Baz
+
+## Sample Name
The sampleName element is used for test names in the personNames LDML data for each locale to aid in testing and display in the CLDR Survey Tool. They are not intended to be used in production software as prompts or placeholders in a user interface and should not be displayed in a user interface.
-### 7.1 <a name="7-1-syntax" href="#7-1-syntax">Syntax</a>
+### Syntax
```xml
<!ELEMENT sampleName ( nameField+ ) >
-<!ATTLIST sampleName item NMTOKENS #REQUIRED >
+<!ATTLIST sampleName item NMTOKEN #REQUIRED >
```
-* `NMTOKENS` must be one of `( givenOnly | givenSurnameOnly | given12Surname | full )`. However, these may change arbitrarily in the future.
+* `NMTOKEN` must be one of `( nativeG, nativeGS, nativeGGS, nativeFull, foreignG, foreignGS, foreignGGS, foreignFull )`. However, these may change arbitrarily in the future.
-### 7.2 <a name="7-2-expected-values" href="#7-2-expected-values">Expected values</a>
+### Expected values
+The item values starting with "native" are expected to be native names, in native script.
+The item values starting with "foreign" are expected to be foreign names, in native script.
+There are no foreign names or native names in a foreign script, because those should be handled by a different locale's data.
+
+The rest of the item value indicates how many fields are present.
For the expected sample name items, assume a name such as Mr. Richard “Rich” Edward Smith Iglesias Ph.D.
-* `givenOnly` is for an example name with only the given is presented: “Richard” or “Rich” (informal)
-* `givenSurnameOnly` is for an example name with only the given name and surname: “Richard Smith” or “Rich Smith” (informal)
-* `given12Surname` is for an example using all given names and a surname: “Richard Edward Smith” and “Rich E. Smith” (informal)
-* `full` is used to present a name using all fields: “Mr. Richard Edward Smith Iglesias, Ph.D.”
+* `G` is for an example name in which only the given name is presented: “Richard” or “Rich” (informal)
+* `GS` is for an example name with only the given name and surname: “Richard Smith” or “Rich Smith” (informal)
+* `GGS` is for an example using both given and given2 names and a surname: “Richard Edward Smith” and “Rich E. Smith” (informal)
+* `Full` is used to present a name using all possible fields: “Mr. Richard Edward Smith Iglesias, Ph.D.”
-The `nameField` values and their modifiers are described in the [Person Name Object](#3-person-name-object) and [namePattern Syntax](#5-namepattern-syntax) sections.
+The `nameField` values and their modifiers are described in the [Person Name Object](#person-name-object) and [namePattern Syntax](#namepattern-syntax) sections.
-## 8 <a name="8-personname-data-interface-examples" href="#8-personname-data-interface-examples">PersonName Data Interface Examples</a>
+## PersonName Data Interface Examples
-### 8.1 <a name="8-1-example-1" href="#8-1-example-1">Example 1</a>
+### Example 1
Greek initials can be produced via the following process in the PersonName object, and returned to the formatter.
@@ -966,19 +1110,16 @@
* Χριστίνα Λόπεζ (Christina Lopez) ⟶ Χ. Λόπεζ (C. Lopez)
* Ντέιβιντ Λόπεζ (David Lopez) ⟶ Ντ. Λόπεζ (D. Lopez)<br/>Note that Ντ is a digraph representing the sound D.
-### 8.2 <a name="8-2-example-2" href="#8-2-example-2">Example 2</a>
+### Example 2
To make an initial when there are multiple words, an implementation might produce the following:
-* Janus H. W. Young ⇒ {given2-initial} producing “H.W.”.
-* Erik Martin van der Poel: {given2-initial} producing “V” by default, but might produce “vdP” or P in other languages.
-* A field containing multiple words might not actually initialize all of them, such as in “Mohammed bin Ali bin Osman” (“MAO”).
-* John Ronald Reuel Tolkien as “J.R.R. Tolkien” from { given: “John”, given2: “Ronald Reuel”, surname: “Tolkien” }
-* The short version of "Son Heung-min" is "H. Son" and not "H. M. Son" or the like. Korean given-names have hyphens and the part after the hyphen is lower-case.
+* A field containing multiple words might skip some of them, such as in “Mohammed bin Ali bin Osman” (“MAO”).
+* The short version of "Son Heung-min" is "H. Son" and not "H. M. Son" or the like. Korean given-names have hyphens and the part after the hyphen is lower-case.
* * *
-Copyright © 2001–2022 Unicode, Inc. All Rights Reserved. The Unicode Consortium makes no expressed or implied warranty of any kind, and assumes no liability for errors or omissions. No liability is assumed for incidental and consequential damages in connection with or arising out of the use of the information or programs contained or accompanying this technical report. The Unicode [Terms of Use](https://www.unicode.org/copyright.html) apply.
+Copyright © 2001–2023 Unicode, Inc. All Rights Reserved. The Unicode Consortium makes no expressed or implied warranty of any kind, and assumes no liability for errors or omissions. No liability is assumed for incidental and consequential damages in connection with or arising out of the use of the information or programs contained or accompanying this technical report. The Unicode [Terms of Use](https://www.unicode.org/copyright.html) apply.
Unicode and the Unicode logo are trademarks of Unicode, Inc., and are registered in some jurisdictions.
diff --git a/docs/ldml/tr35.anchors.json b/docs/ldml/tr35.anchors.json
new file mode 100644
index 0000000..5478a70
--- /dev/null
+++ b/docs/ldml/tr35.anchors.json
@@ -0,0 +1,475 @@
+[
+ "1-multimap-interpretation",
+ "1.-multimap-interpretation",
+ "2-alias-elements",
+ "2.-alias-elements",
+ "3.-matches",
+ "4-replacement",
+ "4.-replacement",
+ "5-canonicalizing-syntax",
+ "5.-canonicalizing-syntax",
+ "a1-element-fallback",
+ "a10-deprecated-subelements-of-timezonenames",
+ "a11-deprecated-subelements-of-zone-and-metazone",
+ "a12-renamed-attribute-values-for-contexttransformusage-element",
+ "a13-deprecated-subelements-of-segmentations",
+ "a14-element-cp",
+ "a15-attribute-validsublocales",
+ "a16-elements-postalcodedata-postcoderegex",
+ "a17-element-telephonecodedata",
+ "a2-bcp-47-keyword-mapping",
+ "a3-choice-patterns",
+ "a4-element-default",
+ "a5-deprecated-common-attributes",
+ "a51-attribute-standard",
+ "a52-attribute-draft-in-non-leaf-elements",
+ "a6-element-base",
+ "a7-element-rules",
+ "a8-deprecated-subelements-of-dates",
+ "a9-deprecated-subelements-of-calendars",
+ "acknowledgments",
+ "Acknowledgments",
+ "Alias_Elements",
+ "alt_attribute",
+ "annex-a-deprecated-structure",
+ "annex-b-links-to-other-parts",
+ "annex-c-localeid-canonicalization",
+ "Attribute_draft",
+ "Attribute_draft_nonLeaf",
+ "Attribute_standard",
+ "Attribute_type",
+ "attribute-alt",
+ "attribute-draft",
+ "attribute-references",
+ "attribute-type",
+ "attribute-value-constraints",
+ "Backslash_Escapes",
+ "backslash-escapes",
+ "BCP_47_Conformance",
+ "BCP_47_Language_Tag_Conversion",
+ "bcp-47-conformance",
+ "bcp-47-language-tag-conversion",
+ "BCP47",
+ "BCP47_Keyword_Mapping",
+ "BCP47_T_Extension",
+ "Boolean_Operations",
+ "boolean-operations",
+ "Bugs",
+ "Bundle_vs_Item_Lookup",
+ "bundle-vs-item-lookup",
+ "ByType",
+ "Calendar_Elements",
+ "Calendar_Fields",
+ "Calendar_Preference_Data",
+ "Calendars",
+ "Canonical_Form",
+ "Canonical_Unicode_Locale_Identifiers",
+ "canonical-form",
+ "canonical-unicode-locale-identifiers",
+ "Case_Parameters",
+ "Character_Elements",
+ "Charts",
+ "Checking_for_Draft_Status",
+ "checking-for-draft-status",
+ "Choice_Patterns",
+ "codepoints",
+ "CODEPOINTS",
+ "Collation_Element",
+ "Collation_Elements",
+ "Collation_Settings",
+ "Collation_Version",
+ "comments",
+ "Comments",
+ "Common_Attributes",
+ "Common_Elements",
+ "Common_Structures",
+ "common-attributes",
+ "common-elements",
+ "common-structures",
+ "Comparisons",
+ "Compatibility_with_Older_Identifiers",
+ "compatibility-with-older-identifiers",
+ "conformance",
+ "Conformance",
+ "content",
+ "Content",
+ "Contents",
+ "contents-of-part-1-core",
+ "Context_Before",
+ "Context_Transform_Elements",
+ "Contractions",
+ "Count_Fallback_currency",
+ "Count_Fallback_normal",
+ "Coverage_Levels",
+ "Currencies",
+ "CurrencyInfo",
+ "Data_Size",
+ "data-size-reduction",
+ "DataFormats",
+ "Date_Elements",
+ "Date_Field_Symbol_Table",
+ "Date_Format_Patterns",
+ "Date_Ranges",
+ "date-and-date-ranges",
+ "dateFormats",
+ "dateTimeFormats",
+ "DayPeriodRules",
+ "dayPeriods",
+ "definitions",
+ "Definitions",
+ "Delimiter_Elements",
+ "Deprecated_Common_Attributes",
+ "Deprecated_Structure",
+ "Deprecated_subelements_of_calendars",
+ "Deprecated_subelements_of_dates",
+ "Deprecated_subelements_of_segmentations",
+ "Deprecated_subelements_of_timeZoneNames",
+ "Deprecated_subelements_of_zone_metazone",
+ "Display_Name_Elements",
+ "DTD_Annotations",
+ "dtd-annotations",
+ "DUCET",
+ "ebnf",
+ "Element_base",
+ "Element_cp",
+ "Element_default",
+ "Element_displayName",
+ "Element_rules",
+ "element-alias",
+ "element-displayname",
+ "element-special",
+ "enhanced-language-matching",
+ "EnhancedLanguageMatching",
+ "Escaping_Characters",
+ "escaping-characters",
+ "ExemplarSyntax",
+ "Expansions",
+ "Extended_Pictographic",
+ "extended-pictographic",
+ "extensions",
+ "Fallback_Elements",
+ "fallbackFormat",
+ "fallbackRegionFormat",
+ "FAQ",
+ "FCD",
+ "Field_Definitions",
+ "Format_Parse_Issues",
+ "Glossary",
+ "handling-invalid-patterns",
+ "Horizontal_Slicing",
+ "horizontal-slicing",
+ "Hybrid_Locale",
+ "hybrid-locale-identifiers",
+ "ICUCollation",
+ "ICUTransforms",
+ "ICUUnicodeSet",
+ "Identifiers",
+ "Identity_Elements",
+ "identity-elements",
+ "IndexLabels",
+ "Inheritance_and_Validity",
+ "Inheritance_vs_Related",
+ "Inheritance_with_source_locale_",
+ "inheritance-and-validity",
+ "inheritance-vs-related-information",
+ "introduction",
+ "Introduction",
+ "Invalid_Patterns",
+ "ISO1000",
+ "ISO15924",
+ "ISO3166",
+ "ISO4217",
+ "ISO639",
+ "ISO8601",
+ "issues-in-formatting-and-parsing",
+ "ITUE164",
+ "JavaChoice",
+ "Key_And_Type_Definitions_",
+ "Key_Type_Definitions",
+ "key-and-type-definitions",
+ "Keyword_and_Default_Resolution",
+ "keyword-and-default-resolution",
+ "Labels.txt",
+ "labelstxt",
+ "Language_and_Locale_IDs",
+ "Language_Plural_Rules",
+ "Language_Tag_to_Locale_Identifier",
+ "language-identifier-field-definitions",
+ "language-matching",
+ "LanguageMatching",
+ "Lateral_Inheritance",
+ "lateral-inheritance",
+ "Layout_Elements",
+ "LDML",
+ "Legacy_Variant_Mappings",
+ "Legacy_Variants",
+ "legacy-variants",
+ "Lenient_Parsing",
+ "lenient-parsing",
+ "Likely_Subtags",
+ "likely-subtags",
+ "Links_to_Other_Parts",
+ "List_Gender",
+ "ListPatterns",
+ "Lists_of_Code_Points",
+ "lists-of-code-points",
+ "Locale",
+ "Locale_Extension_Key_and_Type_Data",
+ "Locale_Extension_Mappings",
+ "Locale_Inheritance",
+ "locale-inheritance-and-matching",
+ "LocaleExplorer",
+ "LocaleId_Canonicalization",
+ "LocaleId_Definitions",
+ "localeid-definitions",
+ "localeProject",
+ "Localized_Pattern_Characters",
+ "LOCODE",
+ "Logical_Reset_Positions",
+ "lookup",
+ "Lookup",
+ "Lookup-Differences",
+ "Loose_Matching",
+ "loose-matching",
+ "LowerExceptions",
+ "match_expressions",
+ "matches",
+ "Measurement_Elements",
+ "Measurement_System_Data",
+ "MeasurementUnitPreferenceOverride",
+ "Message_Formatting_and_Exceptions",
+ "message-formatting-and-exceptions",
+ "modifications",
+ "Modifications",
+ "monthPatterns_cyclicNameSets",
+ "months_days_quarters_eras",
+ "motivation",
+ "Motivation",
+ "Multiple_Inheritance",
+ "NamingGuideline",
+ "Number_Elements",
+ "Number_Format_Patterns",
+ "Number_Symbols",
+ "Numbering_Systems",
+ "numbering-system-data",
+ "Numbering%20System%20Data",
+ "Numeric_Codes",
+ "numeric-codes",
+ "Old_Locale_Extension_Syntax",
+ "old-locale-extension-syntax",
+ "Olson",
+ "OpenOffice",
+ "optimizations",
+ "ordering",
+ "Ordering",
+ "Orderings",
+ "other_extensions",
+ "Parent_Locales",
+ "parent-locales",
+ "Part_2_Links",
+ "Part_3_Links",
+ "Part_4_Links",
+ "Part_5_Links",
+ "Part_6_Links",
+ "Part_7_Links",
+ "parts",
+ "Parts",
+ "Placing_Characters_Before_Others",
+ "POSIX_Elements",
+ "Postal_Code_Validation",
+ "postCodeElements",
+ "preprocessing",
+ "private_use",
+ "PRIVATE_USE",
+ "Private_Use_CLDR",
+ "Private_Use_Codes",
+ "private-use-codes",
+ "processing-languageids",
+ "processing-localeids",
+ "Property_Data",
+ "property-data",
+ "proposed-update-unicode-technical-standard-35",
+ "pu_extensions",
+ "RBBI",
+ "RBNF",
+ "Reference_Elements",
+ "references",
+ "References",
+ "references_attribute",
+ "Region_Priority_Inheritance",
+ "region-priority-inheritance",
+ "RegionOverride",
+ "Relation_to_OpenI18n",
+ "relation-to-openi18n",
+ "Renamed_attribute_values_for_contextTransformUsage",
+ "reorder_code",
+ "REORDER_CODE",
+ "Reports",
+ "Resolved_Data_File",
+ "resolved-data-file",
+ "RFC6067",
+ "RFC6497",
+ "rg_key_value",
+ "RG_KEY_VALUE",
+ "Rule-Based_Number_Formatting",
+ "Rules",
+ "Sample_Special_Elements",
+ "sample-special-elements",
+ "script_code",
+ "SCRIPT_CODE",
+ "Script_Metadata",
+ "Script_Reordering",
+ "script-metadata",
+ "Segmentation_Inheritance",
+ "Segmentation_Tests",
+ "segmentation-tests",
+ "Segmentations",
+ "Setting_Options",
+ "special",
+ "Special_Codes",
+ "Special_Purpose_Commands",
+ "special-codes",
+ "status",
+ "String_Range",
+ "string-range",
+ "subdivision_code",
+ "SUBDIVISION_CODE",
+ "subdivision-codes",
+ "summary",
+ "Supplemental_Calendar_Data",
+ "Supplemental_Character_Fallback_Data",
+ "Supplemental_Code_Mapping",
+ "Supplemental_Currency_Data",
+ "Supplemental_Data",
+ "Supplemental_Language_Data",
+ "Supplemental_Territory_Containment",
+ "Supplemental_Territory_Information",
+ "Supplemental_Timezone_Data",
+ "syntax-special-case-examples",
+ "t_Extension",
+ "t-extension-data-files",
+ "table-bcp-47-language-tag-to-unicode-bcp-47-locale-identifier-examples",
+ "table-count-fallback-currency",
+ "table-count-fallback-normal",
+ "table-inheritance-with-sourcelocale",
+ "table-keytype-definitions",
+ "table-legacy-variant-mappings",
+ "table-locale-extension-mappings",
+ "table-lookup-differences",
+ "table-part-2-links-general-display-names--transforms-etc",
+ "table-part-3-links-numbers-number--currency-formatting",
+ "table-part-4-links-dates-date-time-time-zone-formatting",
+ "table-part-5-links-collation-sorting-searching-grouping",
+ "table-part-6-links-supplemental-supplemental-data",
+ "table-part-7-links-keyboards-keyboard-mappings",
+ "table-private-use-codes-in-cldr",
+ "Telephone_Code_Data",
+ "telephoneCodeData",
+ "territory-exception",
+ "Text_Directionality",
+ "text-directionality",
+ "Time_Zone_Fallback",
+ "Time_Zone_Identifiers",
+ "time-zone-identifiers",
+ "timeFormats",
+ "Timezone_Names",
+ "Transform_Rules",
+ "Transformed_Content_Data_File",
+ "transformed_extensions",
+ "Transforms",
+ "Transmitting_Locale_Information",
+ "transmitting-locale-information",
+ "truncation",
+ "u_Extension",
+ "u-extension-data-files",
+ "UCAChart",
+ "UncasedExceptions",
+ "Unicode",
+ "Unicode_Language_and_Locale_Identifiers",
+ "unicode_language_id",
+ "Unicode_language_identifier",
+ "unicode_language_subtag",
+ "unicode_language_subtag_validity",
+ "unicode_language_subtag-also-known-as-a-unicode-base-language-code",
+ "Unicode_Locale_Extension_Data_Files",
+ "unicode_locale_extensions",
+ "unicode_locale_id",
+ "Unicode_locale_identifier",
+ "Unicode_Locale_Identifier_BCP_47_to_CLDR",
+ "Unicode_Locale_Identifier_CLDR_to_BCP_47",
+ "unicode_measure_unit",
+ "Unicode_Properties",
+ "unicode_region_subtag",
+ "unicode_region_subtag_validity",
+ "unicode_region_subtag-also-known-as-a-unicode-region-code-or-a-unicode-territory-code",
+ "unicode_script_subtag",
+ "unicode_script_subtag_validity",
+ "unicode_script_subtag-also-known-as-a-unicode-script-code",
+ "Unicode_Sets",
+ "Unicode_Subdivision_Codes",
+ "unicode_subdivision_id",
+ "unicode_subdivision_subtag_validity",
+ "unicode_variant_subtag",
+ "unicode_variant_subtag_validity",
+ "unicode_variant_subtag-also-known-as-a-unicode-language-variant-code",
+ "unicode-bcp-47-t-extension",
+ "unicode-bcp-47-u-extension",
+ "unicode-language-and-locale-identifiers",
+ "unicode-language-and-locale-ids",
+ "unicode-language-identifier",
+ "unicode-locale-data-markup-language-ldml",
+ "unicode-locale-identifier",
+ "unicode-locale-identifier-bcp-47-to-cldr",
+ "unicode-locale-identifier-cldr-to-bcp-47",
+ "unicode-properties",
+ "unicode-sets",
+ "UnicodeCalendarIdentifier",
+ "UnicodeCollationIdentifier",
+ "UnicodeCurrencyFormatIdentifier",
+ "UnicodeCurrencyIdentifier",
+ "UnicodeDictionaryBreakExclusionIdentifier",
+ "UnicodeEmojiPresentationStyleIdentifier",
+ "UnicodeFirstDayIdentifier",
+ "UnicodeHourCycleIdentifier",
+ "UnicodeLineBreakStyleIdentifier",
+ "UnicodeLineBreakWordIdentifier",
+ "UnicodeMeasurementSystemIdentifier",
+ "UnicodeNumberSystemIdentifier",
+ "UnicodeSentenceBreakSuppressionsIdentifier",
+ "UnicodeSet_Examples",
+ "unicodeset-examples",
+ "unicodeset-syntax",
+ "UnicodeSubdivisionIdentifier",
+ "UnicodeTimezoneIdentifier",
+ "UnicodeVariantIdentifier",
+ "Unit_Elements",
+ "Unknown_or_Invalid_Identifiers",
+ "unknown-or-invalid-identifiers",
+ "UNM49",
+ "UpperExceptions",
+ "UTCInfo",
+ "Valid_Attribute_Values",
+ "Valid_Data",
+ "valid-attribute-values",
+ "valid-data",
+ "validity",
+ "Validity",
+ "Validity_Data",
+ "validity-data",
+ "validSubLocales",
+ "Variables_in_UnicodeSets",
+ "variables-in-unicodesets",
+ "Versions",
+ "Vertical_Slicing",
+ "vertical-slicing",
+ "Visibility",
+ "week",
+ "what-is-a-locale",
+ "WindowsCulture",
+ "Written_Language",
+ "written-language",
+ "XML_Format",
+ "xml-format",
+ "XMLSchema",
+ "XPath"
+]
\ No newline at end of file
diff --git a/docs/ldml/tr35.md b/docs/ldml/tr35.md
index e6389dd..daacc0e 100644
--- a/docs/ldml/tr35.md
+++ b/docs/ldml/tr35.md
@@ -2,18 +2,18 @@
# Unicode Locale Data Markup Language (LDML)
-|Version|42 |
+|Version|44.1 |
|-------|----------|
|Editors|Mark Davis (<a href="mailto:[email protected]">[email protected]</a>) and <a href="tr35.md#Acknowledgments">other CLDR committee members</a>|
-|Date|2022-10-17|
-|This Version|<a href="https://www.unicode.org/reports/tr35/tr35-67/tr35.html">https://www.unicode.org/reports/tr35/tr35-67/tr35.html</a>|
-|Previous Version|<a href="https://www.unicode.org/reports/tr35/tr35-66/tr35.html">https://www.unicode.org/reports/tr35/tr35-66/tr35.html</a>|
+|Date|2023-12-06|
+|This Version|<a href="https://www.unicode.org/reports/tr35/tr35-71/tr35.html">https://www.unicode.org/reports/tr35/tr35-71/tr35.html</a>|
+|Previous Version|<a href="https://www.unicode.org/reports/tr35/tr35-70/tr35.html">https://www.unicode.org/reports/tr35/tr35-70/tr35.html</a>|
|Latest Version|<a href="https://www.unicode.org/reports/tr35/">https://www.unicode.org/reports/tr35/</a>|
|Corrigenda|<a href="https://cldr.unicode.org/index/corrigenda">https://cldr.unicode.org/index/corrigenda</a>|
|Latest Proposed Update|<a href="https://www.unicode.org/reports/tr35/proposed.html">https://www.unicode.org/reports/tr35/proposed.html</a></td></tr>
|Namespace|<a href="https://www.unicode.org/cldr/">https://www.unicode.org/cldr/</a>|
-|DTDs|<a href="https://www.unicode.org/cldr/dtd/42/">https://www.unicode.org/cldr/dtd/42/</a>|
-|Revision|<a href="#Modifications">67</a>|
+|DTDs|<a href="https://www.unicode.org/cldr/dtd/44/">https://www.unicode.org/cldr/dtd/44/</a>|
+|Revision|<a href="#Modifications">71</a>|
### _Summary_
@@ -26,7 +26,12 @@
### _Status_
-_This document has been reviewed by Unicode members and other interested parties, and has been approved for publication by the Unicode Consortium. This is a stable document and may be used as reference material or cited as a normative reference by other specifications._
+<!-- _This is a draft document which may be updated, replaced, or superseded by other documents at any time.
+Publication does not imply endorsement by the Unicode Consortium.
+This is not a stable document; it is inappropriate to cite this document as other than a work in progress._ -->
+
+_This document has been reviewed by Unicode members and other interested parties, and has been approved for publication by the Unicode Consortium.
+This is a stable document and may be used as reference material or cited as a normative reference by other specifications._
> _**A Unicode Technical Standard (UTS)** is an independent specification. Conformance to the Unicode Standard does not imply conformance to any UTS._
@@ -49,109 +54,118 @@
## <a name="Contents" href="#Contents">Contents of Part 1, Core</a>
-* 1 [Introduction](#Introduction)
- * 1.1 [Conformance](#Conformance)
-* 2 [What is a Locale?](#Locale)
-* 3 [Unicode Language and Locale Identifiers](#Unicode_Language_and_Locale_Identifiers)
- * _[3.1 Unicode Language Identifier](#Unicode_language_identifier)_
- * _[3.2 Unicode Locale Identifier](#Unicode_locale_identifier)_
- * 3.2.1 [Canonical Unicode Locale Identifiers](#Canonical_Unicode_Locale_Identifiers)
- * 3.3 [BCP 47 Conformance](#BCP_47_Conformance)
- * 3.3.1 [BCP 47 Language Tag Conversion](#BCP_47_Language_Tag_Conversion)
+* [Introduction](#Introduction)
+ * [Conformance](#Conformance)
+ * [EBNF](#ebnf)
+* [What is a Locale?](#Locale)
+* [Unicode Language and Locale Identifiers](#Unicode_Language_and_Locale_Identifiers)
+ * _[Unicode Language Identifier](#Unicode_language_identifier)_
+ * _[Unicode Locale Identifier](#Unicode_locale_identifier)_
+ * [Canonical Unicode Locale Identifiers](#Canonical_Unicode_Locale_Identifiers)
+ * [BCP 47 Conformance](#BCP_47_Conformance)
+ * [BCP 47 Language Tag Conversion](#BCP_47_Language_Tag_Conversion)
* Table: [BCP 47 Language Tag to Unicode BCP 47 Locale Identifier](#Language_Tag_to_Locale_Identifier) Examples
* [Unicode Locale Identifier: CLDR to BCP 47](#Unicode_Locale_Identifier_CLDR_to_BCP_47)
* [Unicode Locale Identifier: BCP 47 to CLDR](#Unicode_Locale_Identifier_BCP_47_to_CLDR)
* [Truncation](#truncation)
- * 3.4 [Language Identifier Field Definitions](#Field_Definitions)
+ * [Language Identifier Field Definitions](#Field_Definitions)
* [`unicode_language_subtag`](#unicode_language_subtag_validity) (also known as a _Unicode base language code_)
* [`unicode_script_subtag`](#unicode_script_subtag_validity) (also known as a _Unicode script code_)
* [`unicode_region_subtag`](#unicode_region_subtag_validity) (also known as a _Unicode region code,_ or a _Unicode territory code_)
* [`unicode_variant_subtag`](#unicode_variant_subtag_validity) (also known as a _Unicode language variant code_)
- * 3.5 [Special Codes](#Special_Codes)
- * 3.5.1 [Unknown or Invalid Identifiers](#Unknown_or_Invalid_Identifiers)
- * 3.5.2 [Numeric Codes](#Numeric_Codes)
- * 3.5.3 [Private Use Codes](#Private_Use_Codes)
+ * [Special Codes](#Special_Codes)
+ * [Unknown or Invalid Identifiers](#Unknown_or_Invalid_Identifiers)
+ * [Numeric Codes](#Numeric_Codes)
+ * [Private Use Codes](#Private_Use_Codes)
* Table: [Private Use Codes in CLDR](#Private_Use_CLDR)
- * 3.6 [Unicode BCP 47 U Extension](#u_Extension)
- * 3.6.1 [Key And Type Definitions](#Key_And_Type_Definitions_)
+ * [Unicode BCP 47 U Extension](#u_Extension)
+ * [Key And Type Definitions](#Key_And_Type_Definitions_)
* Table: [Key/Type Definitions](#Key_Type_Definitions)
- * 3.6.2 [Numbering System Data](#Numbering%20System%20Data)
- * 3.6.3 [Time Zone Identifiers](#Time_Zone_Identifiers)
- * 3.6.4 [U Extension Data Files](#Unicode_Locale_Extension_Data_Files)
- * 3.6.5 [Subdivision Codes](#Unicode_Subdivision_Codes)
- * 3.6.5.1 [Validity](#Validity)
- * 3.7 [Unicode BCP 47 T Extension](#BCP47_T_Extension)
- * 3.7.1 [T Extension Data Files](#Transformed_Content_Data_File)
- * 3.8 [Compatibility with Older Identifiers](#Compatibility_with_Older_Identifiers)
- * 3.8.1 [Old Locale Extension Syntax](#Old_Locale_Extension_Syntax)
+ * [Numbering System Data](#Numbering%20System%20Data)
+ * [Time Zone Identifiers](#Time_Zone_Identifiers)
+ * [U Extension Data Files](#Unicode_Locale_Extension_Data_Files)
+ * [Subdivision Codes](#Unicode_Subdivision_Codes)
+ * [Validity](#Validity)
+ * [Unicode BCP 47 T Extension](#BCP47_T_Extension)
+ * [T Extension Data Files](#Transformed_Content_Data_File)
+ * [Compatibility with Older Identifiers](#Compatibility_with_Older_Identifiers)
+ * [Old Locale Extension Syntax](#Old_Locale_Extension_Syntax)
* Table: [Locale Extension Mappings](#Locale_Extension_Mappings)
- * 3.8.2 [Legacy Variants](#Legacy_Variants)
+ * [Legacy Variants](#Legacy_Variants)
* Table: [Legacy Variant Mappings](#Legacy_Variant_Mappings)
- * 3.8.3 [Relation to OpenI18n](#Relation_to_OpenI18n)
- * 3.9 [Transmitting Locale Information](#Transmitting_Locale_Information)
- * 3.9.1 [Message Formatting and Exceptions](#Message_Formatting_and_Exceptions)
- * 3.10 [Unicode Language and Locale IDs](#Language_and_Locale_IDs)
- * 3.10.1 [Written Language](#Written_Language)
- * 3.10.2 [Hybrid Locale Identifiers](#Hybrid_Locale)
- * 3.11 [Validity Data](#Validity_Data)
-* 4 [Locale Inheritance and Matching](#Locale_Inheritance)
- * 4.1 [Lookup](#Lookup)
- * 4.1.1 [Bundle vs Item Lookup](#Bundle_vs_Item_Lookup)
+ * [Relation to OpenI18n](#Relation_to_OpenI18n)
+ * [Transmitting Locale Information](#Transmitting_Locale_Information)
+ * [Message Formatting and Exceptions](#Message_Formatting_and_Exceptions)
+ * [Unicode Language and Locale IDs](#Language_and_Locale_IDs)
+ * [Written Language](#Written_Language)
+ * [Hybrid Locale Identifiers](#Hybrid_Locale)
+ * [Validity Data](#Validity_Data)
+* [Locale Inheritance and Matching](#Locale_Inheritance)
+ * [Lookup](#Lookup)
+ * [Bundle vs Item Lookup](#Bundle_vs_Item_Lookup)
* Table: [Lookup Differences](#Lookup-Differences)
- * 4.1.2 [Lateral Inheritance](#Lateral_Inheritance)
+ * [Lateral Inheritance](#Lateral_Inheritance)
* Table: [Count Fallback: normal](#Count_Fallback_normal)
* Table: [Count Fallback: currency](#Count_Fallback_currency)
- * 4.1.3 [Parent Locales](#Parent_Locales)
- * 4.2 [Inheritance and Validity](#Inheritance_and_Validity)
- * 4.2.1 [Definitions](#Definitions)
- * 4.2.2 [Resolved Data File](#Resolved_Data_File)
- * 4.2.3 [Valid Data](#Valid_Data)
- * 4.2.4 [Checking for Draft Status](#Checking_for_Draft_Status)
- * 4.2.5 [Keyword and Default Resolution](#Keyword_and_Default_Resolution)
- * 4.2.6 [Inheritance vs Related Information](#Inheritance_vs_Related)
- * 4.3 [Likely Subtags](#Likely_Subtags)
- * 4.4 [Language Matching](#LanguageMatching)
- * 4.4.1 [Enhanced Language Matching](#EnhancedLanguageMatching)
-* 5 [XML Format](#XML_Format)
- * 5.1 [Common Elements](#Common_Elements)
- * 5.1.1 [Element special](#special)
- * 5.1.1.1 [Sample Special Elements](#Sample_Special_Elements)
- * 5.1.2 [Element alias](#Alias_Elements)
+ * [Parent Locales](#Parent_Locales)
+ * [Region-Priority Inheritance](#Region_Priority_Inheritance)
+ * [Inheritance and Validity](#Inheritance_and_Validity)
+ * [Definitions](#Definitions)
+ * [Resolved Data File](#Resolved_Data_File)
+ * [Valid Data](#Valid_Data)
+ * [Checking for Draft Status](#Checking_for_Draft_Status)
+ * [Keyword and Default Resolution](#Keyword_and_Default_Resolution)
+ * [Inheritance vs Related Information](#Inheritance_vs_Related)
+ * [Likely Subtags](#Likely_Subtags)
+ * [Language Matching](#LanguageMatching)
+ * [Enhanced Language Matching](#EnhancedLanguageMatching)
+* [XML Format](#XML_Format)
+ * [Common Elements](#Common_Elements)
+ * [Element special](#special)
+ * [Sample Special Elements](#Sample_Special_Elements)
+ * [Element alias](#Alias_Elements)
* Table: [Inheritance with `source="locale"`](#Inheritance_with_source_locale_)
- * 5.1.3 [Element displayName](#Element_displayName)
- * 5.1.4 [Escaping Characters](#Escaping_Characters)
- * 5.2 [Common Attributes](#Common_Attributes)
- * 5.2.1 [Attribute type](#Attribute_type)
- * 5.2.2 [Attribute draft](#Attribute_draft)
- * 5.2.3 [Attribute alt](#alt_attribute)
- * 5.2.4 [Attribute references](#references_attribute)
- * 5.3 [Common Structures](#Common_Structures)
- * 5.3.1 [Date and Date Ranges](#Date_Ranges)
- * 5.3.2 [Text Directionality](#Text_Directionality)
- * 5.3.3 [Unicode Sets](#Unicode_Sets)
- * 5.3.3.1 [Lists of Code Points](#Lists_of_Code_Points)
- * 5.3.3.2 [Unicode Properties](#Unicode_Properties)
- * 5.3.3.3 [Boolean Operations](#Boolean_Operations)
- * 5.3.3.4 [UnicodeSet Examples](#UnicodeSet_Examples)
- * 5.3.4 [String Range](#String_Range)
- * 5.4 [Identity Elements](#Identity_Elements)
- * 5.5 [Valid Attribute Values](#Valid_Attribute_Values)
- * 5.6 [Canonical Form](#Canonical_Form)
- * 5.6.1 [Content](#Content)
- * 5.6.2 [Ordering](#Ordering)
- * 5.6.3 [Comments](#Comments)
- * 5.7 [DTD Annotations](#DTD_Annotations)
- * 5.7.1 [Attribute Value Constraints](#match_expressions)
-* 6 [Property Data](#Property_Data)
- * 6.1 [Script Metadata](#Script_Metadata)
- * 6.2 [Extended Pictographic](#Extended_Pictographic)
- * 6.3 [Labels.txt](#Labels.txt)
- * 6.4 [Segmentation Tests](#Segmentation_Tests)
-* 7 [Issues in Formatting and Parsing](#Format_Parse_Issues)
- * 7.1 [Lenient Parsing](#Lenient_Parsing)
- * 7.1.1 [Motivation](#Motivation)
- * 7.1.2 [Loose Matching](#Loose_Matching)
- * 7.2 [Handling Invalid Patterns](#Invalid_Patterns)
+ * [Element displayName](#Element_displayName)
+ * [Escaping Characters](#Escaping_Characters)
+ * [Common Attributes](#Common_Attributes)
+ * [Attribute type](#Attribute_type)
+ * [Attribute draft](#Attribute_draft)
+ * [Attribute alt](#alt_attribute)
+ * [Attribute references](#references_attribute)
+ * [Common Structures](#Common_Structures)
+ * [Date and Date Ranges](#Date_Ranges)
+ * [Text Directionality](#Text_Directionality)
+ * [Unicode Sets](#Unicode_Sets)
+ * [UnicodeSet syntax](#unicodeset-syntax)
+ * [Syntax Special Case Examples](#syntax-special-case-examples)
+ * [Lists of Code Points](#Lists_of_Code_Points)
+ * [Backslash Escapes](#Backslash_Escapes)
+ * [Unicode Properties](#Unicode_Properties)
+ * [Boolean Operations](#Boolean_Operations)
+ * [Variables in UnicodeSets](#Variables_in_UnicodeSets)
+ * [UnicodeSet Examples](#UnicodeSet_Examples)
+ * [String Range](#String_Range)
+ * [Identity Elements](#Identity_Elements)
+ * [Valid Attribute Values](#Valid_Attribute_Values)
+ * [Canonical Form](#Canonical_Form)
+ * [Content](#Content)
+ * [Ordering](#Ordering)
+ * [Comments](#Comments)
+ * [DTD Annotations](#DTD_Annotations)
+ * [Attribute Value Constraints](#match_expressions)
+* [Property Data](#Property_Data)
+ * [Script Metadata](#Script_Metadata)
+ * [Extended Pictographic](#Extended_Pictographic)
+ * [Labels.txt](#Labels.txt)
+ * [Segmentation Tests](#Segmentation_Tests)
+* [Issues in Formatting and Parsing](#Format_Parse_Issues)
+ * [Lenient Parsing](#Lenient_Parsing)
+ * [Motivation](#Motivation)
+ * [Loose Matching](#Loose_Matching)
+ * [Handling Invalid Patterns](#Invalid_Patterns)
+* [Data Size Reduction](#Data_Size)
+ * [Vertical Slicing](#Vertical_Slicing)
+ * [Horizontal Slicing](#Horizontal_Slicing)
* [Annex A Deprecated Structure](#Deprecated_Structure)
* [A.1 Element fallback](#Fallback_Elements)
* [A.2 BCP 47 Keyword Mapping](#BCP47_Keyword_Mapping)
@@ -183,7 +197,7 @@
* [LocaleId Definitions](#LocaleId_Definitions)
* [1. Multimap interpretation](#1.-multimap-interpretation)
* [2. Alias elements](#2.-alias-elements)
- * [3. Matches](#3.-matches)
+ * [Matches](#3.-matches)
* [4. Replacement](#4.-replacement)
* [Territory Exception](#territory-exception)
* [5. Canonicalizing Syntax](#5.-canonicalizing-syntax)
@@ -195,7 +209,7 @@
* [Acknowledgments](#Acknowledgments)
* [Modifications](#Modifications)
-## 1 <a name="Introduction" href="#Introduction">Introduction</a>
+## <a name="Introduction" href="#Introduction">Introduction</a>
Not long ago, computer systems were like separate worlds, isolated from one another. The internet and related events have changed all that. A single system can be built of many different components, hardware and software, all needing to work together. Many different technologies have been important in bridging the gaps; in the internationalization arena, Unicode has provided a lingua franca for communicating textual data. However, there remain differences in the locale data used by different systems.
@@ -211,7 +225,7 @@
As LDML is an interchange format, it was designed for ease of maintenance and simplicity of transformation into other formats, above efficiency of run-time lookup and use. Implementations should consider converting LDML data into a more compact format prior to use.
-### 1.1 <a name="Conformance" href="#Conformance">Conformance</a>
+### <a name="Conformance" href="#Conformance">Conformance</a>
There are many ways to use the Unicode LDML format and the data in CLDR, and the Unicode Consortium does not restrict the ways in which the format or data are used. However, an implementation may also claim conformance to LDML or to CLDR, as follows:
@@ -233,9 +247,15 @@
> _Field X can contain any Unicode region subtag values as given in Unicode Technical Standard #35: Unicode Locale Data Markup Language (LDML), excluding grouping codes._
+### EBNF
+The BNF syntax used in LDML is a variant of the Extended Backus-Naur Form (EBNF) notation used in [W3C XML Notation](https://www.w3.org/TR/REC-xml/#sec-notation). The main differences are:
+1. Bounded repetition following Perl regex syntax is allowed, such as `alphanum{3,8}`.
+2. Constraints (well-formedness or validity) use separate notes.
-## 2 <a name="Locale" href="#Locale">What is a Locale?</a>
+In the text, this is sometimes referred to as "EBNF (Perl-based)".
+
+## <a name="Locale" href="#Locale">What is a Locale?</a>
Before diving into the XML structure, it is helpful to describe the model behind the structure. People do not have to subscribe to this model to use data in LDML, but they do need to understand it so that the data can be correctly translated into whatever model their implementation uses.
@@ -245,19 +265,19 @@
Locale data in a system may also change over time: country boundaries change; governments (and currencies) come and go: committees impose new standards; bugs are found and fixed in the source data; and so on. Thus the data needs to be versioned for stability over time.
-In general terms, the locale id is a parameter that is supplied to a particular service (date formatting, sorting, spell-checking, and so on). The format in this document does not attempt to represent all the data that could conceivably be used by all possible services. Instead, it collects together data that is in common use in systems and internationalization libraries for basic services. The main difference among locales is in terms of language; there may also be some differences according to different countries or regions. However, the line between _locales_ and _languages_, as commonly used in the industry, are rather fuzzy. Note also that the vast majority of the locale data in CLDR is in fact language data; all non-linguistic data is separated out into a separate tree. For more information, see _[Section 3.10 Language and Locale IDs](#Language_and_Locale_IDs)_.
+In general terms, the locale id is a parameter that is supplied to a particular service (date formatting, sorting, spell-checking, and so on). The format in this document does not attempt to represent all the data that could conceivably be used by all possible services. Instead, it collects together data that is in common use in systems and internationalization libraries for basic services. The main difference among locales is in terms of language; there may also be some differences according to different countries or regions. However, the line between _locales_ and _languages_, as commonly used in the industry, is rather fuzzy. Note also that the vast majority of the locale data in CLDR is in fact language data; all non-linguistic data is separated out into a separate tree. For more information, see _[Language and Locale IDs](#Language_and_Locale_IDs)_.
We will speak of data as being "in locale X". That does not imply that a locale _is_ a collection of data; it is simply shorthand for "the set of data associated with the locale id X". Each individual piece of data is called a _resource_ or _field_, and a tag indicating the key of the resource is called a _resource tag._
<a name="Identifiers"></a>
-## 3 <a name="Unicode_Language_and_Locale_Identifiers" href="#Unicode_Language_and_Locale_Identifiers">Unicode Language and Locale Identifiers</a>
+## <a name="Unicode_Language_and_Locale_Identifiers" href="#Unicode_Language_and_Locale_Identifiers">Unicode Language and Locale Identifiers</a>
Unicode LDML uses stable identifiers based on [[BCP47](#BCP47)] for distinguishing among languages, locales, regions, currencies, time zones, transforms, and so on. There are many systems for identifiers for these entities. The Unicode LDML identifiers may not match the identifiers used on a particular target system. If so, some process of identifier translation may be required when using LDML data.
-The BCP 47 extensions (-u- and -t-) are described in _Section 3.6 [Unicode BCP 47 U Extension](#u_Extension)_ and _Section 3.7 [Unicode BCP 47 T Extension](#BCP47_T_Extension)_.
+The BCP 47 extensions (-u- and -t-) are described in _[Unicode BCP 47 U Extension](#u_Extension)_ and _[Unicode BCP 47 T Extension](#BCP47_T_Extension)_.
-### _<a name="Unicode_language_identifier" href="#Unicode_language_identifier">3.1 Unicode Language Identifier</a>_
+### _<a name="Unicode_language_identifier" href="#Unicode_language_identifier">Unicode Language Identifier</a>_
A _Unicode language identifier_ has the following structure (provided in EBNF (Perl-based)). The following table defines syntactically well-formed identifiers: they are not necessarily valid identifiers. For additional validity criteria, see the links on the right.
@@ -304,13 +324,16 @@
<tr><td><code>alphanum</code></td><td><pre>= [0-9 A-Z a-z] ;</pre></td></tr>
</tbody></table>
-The semantics of the various subtags is explained in _Section 3.4 [Language Identifier Field Definitions](#Field_Definitions)_ ; there are also direct links from [`unicode_language_subtag`](#unicode_language_subtag) , etc. While theoretically the [`unicode_language_subtag`](#unicode_language_subtag) may have more than 3 letters through the IANA registration process, in practice that has not occurred. The [`unicode_language_subtag`](#unicode_language_subtag) "und" may be omitted when there is a [`unicode_script_subtag`](#unicode_script_subtag) ; for that reason [`unicode_language_subtag`](#unicode_language_subtag) values with 4 letters are not permitted. However, such [`unicode_language_id`](#unicode_language_id) values are not intended for general interchange, because they are not valid BCP 47 tags. Instead, they are intended for certain protocols such as the identification of transliterators or font ScriptLangTag values. For more information on language subtags with 4 letters, see [BCP 47 Language Tag to Unicode BCP 47 Locale Identifier](#Language_Tag_to_Locale_Identifier).
+> As is often the case, the complete syntactic constraints are not easily captured by ABNF, so there is a further condition:
+> The sequence of variant subtags must not have any duplicates (e.g., de-1996-fonipa-1996 is not syntactically well-formed).
+
+The semantics of the various subtags is explained in _[Language Identifier Field Definitions](#Field_Definitions)_ ; there are also direct links from [`unicode_language_subtag`](#unicode_language_subtag) , etc. While theoretically the [`unicode_language_subtag`](#unicode_language_subtag) may have more than 3 letters through the IANA registration process, in practice that has not occurred. The [`unicode_language_subtag`](#unicode_language_subtag) "und" may be omitted when there is a [`unicode_script_subtag`](#unicode_script_subtag) ; for that reason [`unicode_language_subtag`](#unicode_language_subtag) values with 4 letters are not permitted. However, such [`unicode_language_id`](#unicode_language_id) values are not intended for general interchange, because they are not valid BCP 47 tags. Instead, they are intended for certain protocols such as the identification of transliterators or font ScriptLangTag values. For more information on language subtags with 4 letters, see [BCP 47 Language Tag to Unicode BCP 47 Locale Identifier](#Language_Tag_to_Locale_Identifier).
For example, "en-US" (American English), "en_GB" (British English), "es-419" (Latin American Spanish), and "uz-Cyrl" (Uzbek in Cyrillic) are all valid Unicode language identifiers.
-### _<a name="Unicode_locale_identifier" href="#Unicode_locale_identifier">3.2 Unicode Locale Identifier</a>_
+### _<a name="Unicode_locale_identifier" href="#Unicode_locale_identifier">Unicode Locale Identifier</a>_
-A _Unicode locale identifier_ is composed of a Unicode language identifier plus (optional) locale extensions. It has the following structure. The semantics of the U and T extensions are explained in _Section 3.6 [Unicode BCP 47 U Extension](#u_Extension)_ and _Section 3.7 [Unicode BCP 47 T Extension](#BCP47_T_Extension)_. Other extensions and private use extensions are supported for pass-through. The following table defines syntactically _well-formed_ identifiers: they are not necessarily _valid_ identifiers. For additional validity criteria, see the links on the right.
+A _Unicode locale identifier_ is composed of a Unicode language identifier plus (optional) locale extensions. It has the following structure. The semantics of the U and T extensions are explained in _[Unicode BCP 47 U Extension](#u_Extension)_ and _[Unicode BCP 47 T Extension](#BCP47_T_Extension)_. Other extensions and private use extensions are supported for pass-through. The following table defines syntactically _well-formed_ identifiers: they are not necessarily _valid_ identifiers. For additional validity criteria, see the links on the right.
As is often the case, the complete syntactic constraints are not easily captured by ABNF, so there is a further condition: There cannot be more than one extension with the same singleton (-a-, …, -t-, -u-, …). Note that the private use extension (-x-) must come after all other extensions.
@@ -334,7 +357,10 @@
| `tkey` | `= alpha digit ;` |
| `tvalue` | `= (sep alphanum{3,8})+ ;` |
-For historical reasons, this is called a Unicode locale identifier. However, it really functions (with few exceptions) as a language identifier, and accesses language-based data. Except where it would be unclear, this document uses the term "locale" data loosely to encompass both types of data: for more information, see _[Section 3.10 Language and Locale IDs](#Language_and_Locale_IDs)_.
+> As is often the case, the complete syntactic constraints are not easily captured by ABNF, so there is a further condition:
+> The sequence of variant subtags in a tlang must not have any duplicates.
+
+For historical reasons, this is called a Unicode locale identifier. However, it really functions (with few exceptions) as a language identifier, and accesses language-based data. Except where it would be unclear, this document uses the term "locale" data loosely to encompass both types of data: for more information, see _[Language and Locale IDs](#Language_and_Locale_IDs)_.
As of the release of this specification, there were no other_extensions defined. The other_extensions are present in the syntax to allow implementations to preserve that information.
@@ -344,7 +370,7 @@
All identifier field values are case-insensitive. Although case distinctions do not carry any special meaning, an implementation of LDML should use the casing recommendations in [[BCP47](#BCP47)], especially when a Unicode locale identifier is used for locale data exchange in software protocols.
-#### 3.2.1 <a name="Canonical_Unicode_Locale_Identifiers" href="#Canonical_Unicode_Locale_Identifiers">Canonical Unicode Locale Identifiers</a>
+#### <a name="Canonical_Unicode_Locale_Identifiers" href="#Canonical_Unicode_Locale_Identifiers">Canonical Unicode Locale Identifiers</a>
A [`unicode_locale_id`](#unicode_locale_id) has _canonical syntax_ when:
@@ -363,13 +389,13 @@
For example, the canonical form of "en-u-foo-bar-nu-thai-ca-buddhist-kk-true" is "en-u-bar-foo-ca-buddhist-kk-nu-thai". The attributes `"foo"` and `"bar"` in this example are provided only for illustration; no attribute subtags are defined by the current CLDR specification.
NOTE: Some people may wonder why CLDR uses alphabetical order for variants, rather than the ordering in [Section 4.1](https://www.rfc-editor.org/rfc/rfc5646.html#section-4.1) of BCP 47. Here are the considerations that lead to that decision:
- * The ordering in Section 4.1 is recommended, but not required for conformance. In particular, use of and ordering by Prefix is recommended but not required.
+ * The ordering in Section 4.1 of BCP 47 is recommended, but not required for conformance. In particular, use of and ordering by Prefix is recommended but not required.
* Moreover, [Section 4.5](https://www.rfc-editor.org/rfc/rfc5646.html#section-4.5) states that “If more than one variant appears within a tag, processors MAY reorder the variants to obtain better matching behavior or more consistent presentation.”
* The best practices for internationalization have moved well beyond some of the guidelines and recommendations in BCP 47, especially for language matching and language fallback.
* Robust implementations will accept the variants in any order, just as they accept extensions in any order.
* A canonical order allows for determination of identity of identifiers via string comparison.
- * The ordering in Section 4.1 does not result in a determinant order for canonicalization, since the mechanism for determining “importance” is not specified: ca-valencia-fonipa and ca-fonipa-valencia could both be ‘canonical’ variants of one another.
- * Pure alphabetical order is determinant and simple to implement while the ordering in Section 4.1 is indeterminant, more complex, and provides no significant benefit in modern applications.
+ * The ordering in Section 4.1 of BCP 47 does not result in a deterministic order for canonicalization, since the mechanism for determining “importance” is not specified: ca-valencia-fonipa and ca-fonipa-valencia could both be ‘canonical’ variants of one another.
+ * Pure alphabetical order is deterministic and simple to implement, while the ordering in Section 4.1 of BCP 47 is indeterminate, more complex, and provides no significant benefit in modern applications.
**Note:** The current version of CLDR data uses some non-preferred _syntax_ for backward compatibility. This might be changed in future CLDR releases.
@@ -379,7 +405,7 @@
A [`unicode_locale_id`](#unicode_locale_id) is in _canonical form_ when it has canonical syntax and contains no aliased subtags. A [`unicode_locale_id`](#unicode_locale_id) can be transformed into canonical form according to [Annex C. LocaleId Canonicalization](#LocaleId_Canonicalization).
-A [`unicode_locale_id`](#unicode_locale_id) is _maximal_ when the [`unicode_language_id`](#unicode_language_id) and tlang (if any) have been transformed by the Add Likely Subtags operation in _Section 4.3 [Likely Subtags](#Likely_Subtags)_, excluding "und".
+A [`unicode_locale_id`](#unicode_locale_id) is _maximal_ when the [`unicode_language_id`](#unicode_language_id) and tlang (if any) have been transformed by the Add Likely Subtags operation in _[Likely Subtags](#Likely_Subtags)_, excluding "und".
> _Example:_ the maximal form of ja-Kana-t-it is ja-Kana-JP-t-it-latn-it
@@ -392,7 +418,7 @@
The equivalence relationship may change over time, such as when subtags are deprecated or likely subtag mappings change. For example, if two countries were to merge, then various subtags would become deprecated. These kinds of changes are generally very infrequent.
-### 3.3 <a name="BCP_47_Conformance" href="#BCP_47_Conformance">BCP 47 Conformance</a>
+### <a name="BCP_47_Conformance" href="#BCP_47_Conformance">BCP 47 Conformance</a>
Unicode language and locale identifiers inherit the design and the repertoire of subtags from [[BCP47](#BCP47)] Language Tags. There are some extensions and restrictions made for the use of the Unicode locale identifier in CLDR:
@@ -413,7 +439,7 @@
* the term _Unicode CLDR locale identifier_ applies where the backwards compatibility syntax is used.
* the term _Unicode BCP 47 locale identifier_ applies otherwise. A _Unicode BCP 47 locale identifier_ is also a valid BCP 47 language tag.
-#### 3.3.1 <a name="BCP_47_Language_Tag_Conversion" href="#BCP_47_Language_Tag_Conversion">BCP 47 Language Tag Conversion</a>
+#### <a name="BCP_47_Language_Tag_Conversion" href="#BCP_47_Language_Tag_Conversion">BCP 47 Language Tag Conversion</a>
The different identifiers can be converted to one another as described in this section.
@@ -477,13 +503,13 @@
Theoretically, a language tag could be far longer, due to the possibility of a large number of variants and extensions.
In practice, the typical size of a locale or language identifier will be much smaller, so implementations can optimize for smaller sizes, as long as there is an escape mechanism allowing for up to 255.
-### 3.4 <a name="Field_Definitions" href="#Field_Definitions">Language Identifier Field Definitions</a>
+### <a name="Field_Definitions" href="#Field_Definitions">Language Identifier Field Definitions</a>
-Unicode language and locale identifier field values are provided in the following table. Note that some private-use BCP 47 field values are given specific meanings in CLDR. While field values are based on [[BCP47](#BCP47)] subtag values, their validity status in CLDR is specified by means of machine-readable files in the [common/validity/](https://github.com/unicode-org/cldr-staging/tree/main/production/common/validity) subdirectory, such as language.xml. For the format of those files and more information, see _[Section 3.11 Validity Data](#Validity_Data)_.
+Unicode language and locale identifier field values are provided in the following table. Note that some private-use BCP 47 field values are given specific meanings in CLDR. While field values are based on [[BCP47](#BCP47)] subtag values, their validity status in CLDR is specified by means of machine-readable files in the [common/validity/](https://github.com/unicode-org/cldr-staging/tree/main/production/common/validity) subdirectory, such as language.xml. For the format of those files and more information, see _[Validity Data](#Validity_Data)_.
#### <a name="unicode_language_subtag_validity" href="#unicode_language_subtag_validity">`unicode_language_subtag`</a> (also known as a _Unicode base language code_)
-Subtags in the language.xml file (see _Section 3.11 [Validity Data](#Validity_Data)_ ). These are based on [[BCP47](#BCP47)] subtag values marked as **Type: language**
+Subtags in the language.xml file (see _[Validity Data](#Validity_Data)_ ). These are based on [[BCP47](#BCP47)] subtag values marked as **Type: language**
ISO 639-3 introduces the notion of "macrolanguages", where certain ISO 639-1 or ISO 639-2 codes are given broad semantics, and additional codes are given for the narrower semantics. For backwards compatibility, Unicode language identifiers retain use of the narrower semantics for these codes. For example:
@@ -499,7 +525,7 @@
If a language subtag matches the `type` attribute of a `languageAlias` element, then the replacement value is used instead. For example, because "swh" occurs in `<languageAlias type="swh" replacement="sw" />` , "sw" must be used instead of "swh". Thus Unicode language identifiers use "ar-EG" for Standard Arabic (Egypt), not "arb-EG"; they use "zh-TW" for Mandarin Chinese (Taiwan), not "cmn-TW".
-The private use codes listed as **excluded** in _Section 3.5.3 [Private Use Codes](#Private_Use_Codes)_ will never be given specific semantics in Unicode identifiers, and are thus safe for use for other purposes by other applications.
+The private use codes listed as **excluded** in _[Private Use Codes](#Private_Use_Codes)_ will never be given specific semantics in Unicode identifiers, and are thus safe for use for other purposes by other applications.
The CLDR provides data for normalizing language/locale codes, including mapping overlong codes like "eng-840" or "eng-USA" to the correct code "en-US"; see the **[Aliases](https://unicode-org.github.io/cldr-staging/charts/38/supplemental/aliases.html)** Chart.
@@ -513,7 +539,7 @@
#### <a name="unicode_script_subtag_validity" href="#unicode_script_subtag_validity">`unicode_script_subtag`</a> (also known as a _Unicode script code_)
-Subtags in the script.xml file (see _Section 3.11 [Validity Data](#Validity_Data)_). These are based on [[BCP47](#BCP47)] subtag values marked as **Type: script**
+Subtags in the script.xml file (see _[Validity Data](#Validity_Data)_). These are based on [[BCP47](#BCP47)] subtag values marked as **Type: script**
In most cases the script is not necessary, since the language is only customarily written in a single script. Examples of cases where it is used are:
@@ -563,14 +589,14 @@
<td colspan="2"> </td></tr>
</tbody></table>
-The private use subtags listed as **excluded** in _Section 3.5.3 [Private Use Codes](#Private_Use_Codes)_ will never be given specific semantics in Unicode identifiers, and are thus safe for use for other purposes by other applications.
+The private use subtags listed as **excluded** in _[Private Use Codes](#Private_Use_Codes)_ will never be given specific semantics in Unicode identifiers, and are thus safe for use for other purposes by other applications.
#### <a name="unicode_region_subtag_validity" href="#unicode_region_subtag_validity">`unicode_region_subtag`</a> (also known as a _Unicode region code,_ or a _Unicode territory code_)
-Subtags in the region.xml file (see _Section 3.11 [Validity Data](#Validity_Data)_). These are based on [[BCP47](#BCP47)] subtag values marked as **Type: region**
+Subtags in the region.xml file (see _[Validity Data](#Validity_Data)_). These are based on [[BCP47](#BCP47)] subtag values marked as **Type: region**
Unicode identifiers give specific semantics to the following subtags.
-(The alpha2 codes are used as Unicode region subtags. The alpha3 and numeric codes are derived according to _Section 3.5.2 [Numeric Codes](#Numeric_Codes)_ and listed here for additional documentation.)
+(The alpha2 codes are used as Unicode region subtags. The alpha3 and numeric codes are derived according to _[Numeric Codes](#Numeric_Codes)_ and listed here for additional documentation.)
| alpha2 | alpha3 | num | Name | Comment | ISO 3166-1 status |
| ------ | ------ | --- | ---------------------------- | ------- | ----------------- |
@@ -583,7 +609,7 @@
| `ZZ` | `ZZZ` | 999 | Unknown or Invalid Territory | used in APIs or as replacement for invalid code | private use |
-The private use subtags listed as **excluded** in _Section 3.5.3 [Private Use Codes](#Private_Use_Codes)_ will normally never be given specific semantics in Unicode identifiers, and are thus safe for use for other purposes by other applications. However, LDML may follow widespread industry practice in the use of some of these codes, such as for XK.
+The private use subtags listed as **excluded** in _[Private Use Codes](#Private_Use_Codes)_ will normally never be given specific semantics in Unicode identifiers, and are thus safe for use for other purposes by other applications. However, LDML may follow widespread industry practice in the use of some of these codes, such as for XK.
The CLDR provides data for normalizing territory/region codes, including mapping overlong codes like "eng-840" or "eng-USA" to the correct code "en-US".
@@ -594,9 +620,9 @@
#### <a name="unicode_variant_subtag_validity" href="#unicode_variant_subtag_validity">`unicode_variant_subtag`</a> (also known as a _Unicode language variant code_)
-Subtags in the variant.xml file (see _Section 3.11 [Validity Data](#Validity_Data)_). These are based on [[BCP47](#BCP47)] subtag values marked as **Type: variant**. The sequence of variant tags must not have any duplicates: thus de-1996-fonipa-1996 is invalid, while de-1996-fonipa and de-fonipa-1996 are both valid.
+Subtags in the variant.xml file (see _[Validity Data](#Validity_Data)_). These are based on [[BCP47](#BCP47)] subtag values marked as **Type: variant**. The sequence of variant tags must not have any duplicates: thus de-1996-fonipa-1996 is invalid, while de-1996-fonipa and de-fonipa-1996 are both valid.
-CLDR provides data for normalizing variant codes. About handling of the "POSIX" variant see _Section 3.8.2, [Legacy Variants](#Legacy_Variants)_.
+CLDR provides data for normalizing variant codes. About handling of the "POSIX" variant see _[Legacy Variants](#Legacy_Variants)_.
_Examples:_
@@ -610,9 +636,9 @@
A locale that only has a language subtag (and optionally a script subtag) is called a _language locale_; one with both language and territory subtag is called a _territory locale_ (or _country locale_).
-### 3.5 <a name="Special_Codes" href="#Special_Codes">Special Codes</a>
+### <a name="Special_Codes" href="#Special_Codes">Special Codes</a>
-#### 3.5.1 <a name="Unknown_or_Invalid_Identifiers" href="#Unknown_or_Invalid_Identifiers">Unknown or Invalid Identifiers</a>
+#### <a name="Unknown_or_Invalid_Identifiers" href="#Unknown_or_Invalid_Identifiers">Unknown or Invalid Identifiers</a>
The following identifiers are used to indicate an unknown or invalid code in Unicode language and locale identifiers. For Unicode identifiers, the region code uses a private use ISO 3166 code, and Time Zone code uses an additional code; the others are defined by the relevant standards. When these codes are used in APIs connected with Unicode identifiers, the meaning is that either there was no identifier available, or that at some point an input identifier value was determined to be invalid or ill-formed.
@@ -627,7 +653,7 @@
When only the script or region are known, then a locale ID will use "und" as the language subtag portion. Thus the locale tag "und_Grek" represents the Greek script; "und_US" represents the US territory.
-#### 3.5.2 <a name="Numeric_Codes" href="#Numeric_Codes">Numeric Codes</a>
+#### <a name="Numeric_Codes" href="#Numeric_Codes">Numeric Codes</a>
For region codes, ISO and the UN establish a mapping to three-letter codes and numeric codes. However, this does not extend to the private use codes, which are the codes 900-999 (total: 100), and AAA, QMA-QZZ, XAA-XZZ, and ZZZ (total: 1092). Unicode identifiers supply a standard mapping to these: for the numeric codes, it uses the top of the numeric private use range; for the 3-letter codes it doubles the final letter. These are the resulting mappings for all of the private use region codes:
@@ -644,7 +670,7 @@
| ------------ | ---------- |
| `Qaaa..Qabx` | `900..949` |
-#### 3.5.3 <a name="Private_Use_Codes" href="#Private_Use_Codes">Private Use Codes</a>
+#### <a name="Private_Use_Codes" href="#Private_Use_Codes">Private Use Codes</a>
Private use codes fall into three groups.
@@ -669,10 +695,10 @@
| | reserved | bcp47: all non-5 letter codes not starting with x |
| | excluded | bcp47: all non-5 letter codes starting with x |
-See also _Section 3.5.1 [Unknown or Invalid Identifiers](#Unknown_or_Invalid_Identifiers)_.
+See also _[Unknown or Invalid Identifiers](#Unknown_or_Invalid_Identifiers)_.
<a name="Locale_Extension_Key_and_Type_Data"></a>
-### 3.6 <a name="u_Extension" href="#u_Extension">Unicode BCP 47 U Extension</a>
+### <a name="u_Extension" href="#u_Extension">Unicode BCP 47 U Extension</a>
[[BCP47](#BCP47)] Language Tags provides a mechanism for extending language tags for use in various applications by extension subtags. Each extension subtag is identified by a single alphanumeric character subtag assigned by IANA.
@@ -680,17 +706,17 @@
These subtags are all in lowercase (that is the canonical casing for these subtags), however, subtags are case-insensitive and casing does not carry any specific meaning. All subtags within the Unicode extensions are alphanumeric characters in length of two to eight that meet the rule `extension` in the [[BCP47](#BCP47)].
-**The -u- Extension.** The syntax of 'u' extension subtags is defined by the rule `unicode_locale_extensions` in [Section 3.2 Unicode locale identifier](#Unicode_locale_identifier), except the separator of subtags `sep` must be always hyphen '-' when the extension is used as a part of BCP 47 language tag.
+**The -u- Extension.** The syntax of 'u' extension subtags is defined by the rule `unicode_locale_extensions` in [Unicode locale identifier](#Unicode_locale_identifier), except the separator of subtags `sep` must be always hyphen '-' when the extension is used as a part of BCP 47 language tag.
-A 'u' extension may contain multiple `attribute` s or `keyword` s as defined in [Section 3.2 Unicode locale identifier](#Unicode_locale_identifier). The canonical syntax is defined as in [Canonical Unicode Locale Identifiers](#Canonical_Unicode_Locale_Identifiers).
+A 'u' extension may contain multiple `attribute` s or `keyword` s as defined in [Unicode locale identifier](#Unicode_locale_identifier). The canonical syntax is defined as in [Canonical Unicode Locale Identifiers](#Canonical_Unicode_Locale_Identifiers).
_See also [Unicode Extensions for BCP 47](https://cldr.unicode.org/index/bcp47-extension) on the CLDR site._
-#### 3.6.1 <a name="Key_And_Type_Definitions_" href="#Key_And_Type_Definitions_">Key And Type Definitions</a>
+#### <a name="Key_And_Type_Definitions_" href="#Key_And_Type_Definitions_">Key And Type Definitions</a>
The following chart contains a set of U extension key values that are currently available, with a description or sampling of the U extension type values. Each category is associated with an XML file in the bcp47 directory.
-For the complete list of valid keys and types defined for Unicode locale extensions, see [Section 3.6.4 U Extension Data Files](#Unicode_Locale_Extension_Data_Files). For information on the process for adding new _key_/_type_, see [[LocaleProject](#localeProject)].
+For the complete list of valid keys and types defined for Unicode locale extensions, see [U Extension Data Files](#Unicode_Locale_Extension_Data_Files). For information on the process for adding new _key_/_type_, see [[LocaleProject](#localeProject)].
Most type values are represented by a single subtag in the current version of CLDR. There are exceptions, such as types used for key "ca" (calendar) and "kr" (collation reordering). If the type is not included, then the type value "true" is assumed. Note that the default for key with a possible "true" value is often "false", but may not always be. Note also that "true"/"True" is not a valid script code, since [the ISO 15924 Registration Authority has exceptionally reserved it](https://www.unicode.org/iso15924/codelists.html), which means that it will not be assigned for any purpose.
@@ -707,7 +733,13 @@
<table><tbody>
<tr><th>key<br>(old key name)</th><th>key description</th><th>example type<br>(old type name)</th><th>type description</th></tr>
-<tr><td colspan="4"><b>A <a name="UnicodeCalendarIdentifier" id="UnicodeCalendarIdentifier" href="#UnicodeCalendarIdentifier">Unicode Calendar Identifier</a> defines a type of calendar. The valid values are those <i>name</i> attribute values in the <i>type</i> elements of key name="ca" in bcp47/<a href="https://github.com/unicode-org/cldr/blob/main/common/bcp47/calendar.xml" target="_blank">calendar.xml</a></b>.</td></tr>
+<tr><td colspan="4"><b>A <a name="UnicodeCalendarIdentifier" id="UnicodeCalendarIdentifier" href="#UnicodeCalendarIdentifier">Unicode Calendar Identifier</a>
+ defines a type of calendar. The valid values are those <i>name</i> attribute values in the <i>type</i> elements of key name="ca"
+ in bcp47/<a href="https://github.com/unicode-org/cldr/blob/main/common/bcp47/calendar.xml" target="_blank">calendar.xml</a></b>.<br>
+ This selects calendar-specific data within a locale used for formatting and parsing, such as date/time symbols and patterns; it also selects supplemental
+ calendarData used for calendrical calculations.
+ The value can affect the computation of the first day of the week: see <a href='tr35-dates.md#first-day-overrides'>First Day Overrides</a>.
+ </td></tr>
<tr><td rowspan="10">"ca"<br>(calendar)</td>
<td rowspan="10">Calendar algorithm<br><br><i>(For information on the calendar algorithms associated with the data used with these, see [<a href="#Calendars">Calendars</a>].)</i></td>
<td>"buddhist"</td>
@@ -727,7 +759,11 @@
<tr><td colspan="2">…</td></tr>
<tr><td colspan="2"><b>Note:</b> <i>Some calendar types are represented by two subtags. In such cases, the first subtag specifies a generic calendar type and the second subtag specifies a calendar algorithm variant. The CLDR uses generic calendar types (single subtag types) for tagging data when calendar algorithm variations within a generic calendar type are irrelevant. For example, type "islamic" is used for specifying Islamic calendar formatting data for all Islamic calendar types, including "islamic-civil" and "islamic-umalqura".</i></td></tr>
-<tr><td colspan="4"><b>A <a name="UnicodeCurrencyFormatIdentifier" id="UnicodeCurrencyFormatIdentifier" href="#UnicodeCurrencyFormatIdentifier">Unicode Currency Format Identifier</a> defines a style for currency formatting. The valid values are those <i>name</i> attribute values in the <i>type</i> elements of key name="cf" in bcp47/<a href="https://github.com/unicode-org/cldr/blob/main/common/bcp47/currency.xml" target="_blank">currency.xml</a></b>.</td></tr>
+<tr><td colspan="4"><b>A <a name="UnicodeCurrencyFormatIdentifier" id="UnicodeCurrencyFormatIdentifier" href="#UnicodeCurrencyFormatIdentifier">Unicode Currency Format Identifier</a>
+ defines a style for currency formatting. The valid values are those <i>name</i> attribute values in the <i>type</i> elements of key name="cf" in
+ bcp47/<a href="https://github.com/unicode-org/cldr/blob/main/common/bcp47/currency.xml" target="_blank">currency.xml</a></b>.<br>
+ This selects the specific type of currency formatting pattern within a locale.
+ </td></tr>
<tr><td rowspan="2">"cf"</td>
<td rowspan="2">Currency Format style</td>
<td>"standard"</td><td>Negative numbers use the minusSign symbol (the default).</td></tr>
@@ -735,8 +771,8 @@
<tr><td colspan="4"><b>A <a name="UnicodeCollationIdentifier" id="UnicodeCollationIdentifier" href="#UnicodeCollationIdentifier">Unicode Collation Identifier</a> defines a type of collation (sort order). The valid values are those <i>name</i> attribute values in the <i>type</i> elements of bcp47/<a href="https://github.com/unicode-org/cldr/blob/main/common/bcp47/collation.xml" target="_blank">collation.xml</a></b>.</td></tr>
<tr><td colspan="4"><i>For information on each collation setting parameter, from <b>ka</b> to <b>vt</b>, see <a href="tr35-collation.md#Setting_Options">Setting Options</a></i></td></tr>
-<tr><td rowspan="9">"co"<br>(collation)</td>
- <td rowspan="9">Collation type</td>
+<tr><td rowspan="8">"co"<br>(collation)</td>
+ <td rowspan="8">Collation type</td>
<td>"standard"</td>
<td>The default ordering for each language. For root it is based on the [<a href="#DUCET">DUCET</a>] (Default Unicode Collation Element Table): see <i><a href="tr35-collation.md#Root_Collation">Root Collation</a></i>. Each other locale is based on that, except for appropriate modifications to certain characters for that language.</td></tr>
<tr><td>"search"</td>
@@ -747,7 +783,6 @@
<td>Requests a phonetic variant if available, where text is sorted based on pronunciation. It may interleave different scripts, if multiple scripts are in common use.</td></tr>
<tr><td>"pinyin"</td>
<td>Pinyin ordering for Latin and for CJK characters; that is, an ordering for CJK characters based on a character-by-character transliteration into a pinyin. (used in Chinese)</td></tr>
- <tr><td>"reformed"</td><td>Reformed collation (such as in Swedish)</td></tr>
<tr><td>"searchjl"</td>
<td>Special collation type for a modified string search in which a pattern consisting of a sequence of Hangul initial consonants (jamo lead consonants) will match a sequence of Hangul syllable characters whose initial consonants match the pattern. The jamo lead consonants can be represented using conjoining or compatibility jamo. This search collator is best used at SECONDARY strength with an "asymmetric" search as described in the [<a href="https://www.unicode.org/reports/tr41/#UTS10">UCA</a>] section Asymmetric Search and obtained, for example, using ICU4C's usearch facility with attribute USEARCH_ELEMENT_COMPARISON set to value USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD; this ensures that a full Hangul syllable in the search pattern will only match the same syllable in the searched text (instead of matching any syllable with the same initial consonant), while a Hangul initial consonant in the search pattern will match any Hangul syllable in the searched text with the same initial consonant.</td></tr>
<tr><td colspan="2">…</td></tr>
@@ -758,12 +793,21 @@
<td><i>ISO 4217 code,</i><p><i>plus others in common use</i></p></td>
<td><p>Codes consisting of 3 ASCII letters that are or have been valid in ISO 4217, plus certain additional codes that are or have been in common use. The list of countries and time periods associated with each currency value is available in <a href="tr35-numbers.md#Supplemental_Currency_Data">Supplemental Currency Data</a>, plus the default number of decimals.</p><p>The XXX code is given a broader interpretation as <i>Unknown or Invalid Currency</i>.</p></td></tr>
-<tr><td colspan="4"><b>A <a name="UnicodeDictionaryBreakExclusionIdentifier" id="UnicodeDictionaryBreakExclusionIdentifier" href="#UnicodeDictionaryBreakExclusionIdentifier">Unicode Dictionary Break Exclusion Identifier</a> specifies scripts to be excluded from dictionary-based text break (for words and lines). The valid values are of one or more items of type SCRIPT_CODE as specified in the <i>name</i> attribute value in the <i>type</i> element of key name="dx" in bcp47/<a href="https://github.com/unicode-org/cldr/blob/main/common/bcp47/segmentation.xml" target="_blank">segmentation.xml</a>.</b></td></tr>
+<tr><td colspan="4"><b>A <a name="UnicodeDictionaryBreakExclusionIdentifier" id="UnicodeDictionaryBreakExclusionIdentifier"
+ href="#UnicodeDictionaryBreakExclusionIdentifier">Unicode Dictionary Break Exclusion Identifier</a> specifies scripts to be excluded from dictionary-based text break
+ (for words and lines). The valid values are of one or more items of type SCRIPT_CODE as specified in the <i>name</i> attribute value in the <i>type</i> element of
+ key name="dx" in bcp47/<a href="https://github.com/unicode-org/cldr/blob/main/common/bcp47/segmentation.xml" target="_blank">segmentation.xml</a></b>.<br>
+ This affects break iteration regardless of locale.
+ </td></tr>
<tr><td>"dx"</td>
<td>Dictionary break script exclusions</td>
<td><i><code><a href="#unicode_script_subtag">unicode_script_subtag</a></code> values</i></td>
- <td><p>One or more items of type SCRIPT_CODE, which are valid <code><a href="#unicode_script_subtag">unicode_script_subtag</a></code> values.</p>
- <p>The code Zyyy (Common) can be specified to exclude all scripts, in which case it should be the only SCRIPT_CODE value specified.</p></td></tr>
+ <td><ul><li>One or more items of type SCRIPT_CODE (as usual, separated by hyphens), which are valid <code><a href="#unicode_script_subtag">unicode_script_subtag</a></code> values.</li>
+ <li>Each of the values for the DX key must be a short script property value in the UCD, or one of the compound script values like jpan. The compound script values are expanded when interpreted, e.g., -dx-jpan = -dx-hani-hira-kata</li>
+ <li>The values may be in any order, e.g., -dx-thai-hani = -dx-hani-thai. However, the canonical order for the bcp47 subtag is alphabetical, e.g., -dx-hani-thai</li>
+ <li>Dictionary-based break iterators will ignore each character whose Script_Extension value set intersects with the DX value set.</li>
+ <li>The code Zyyy (Common) can be specified to exclude all scripts, if and only if it is the only SCRIPT_CODE value specified. If it is not the only script code, Zyyy has the normal meaning: excluding Script_Extension=Common.</li></ul>
+ </td></tr>
<tr><td colspan="4"><b>A <a name="UnicodeEmojiPresentationStyleIdentifier" id="UnicodeEmojiPresentationStyleIdentifier" href="#UnicodeEmojiPresentationStyleIdentifier">Unicode Emoji Presentation Style Identifier</a> specifies a request for the preferred emoji presentation style. This can be used as part of the value for an HTML lang attribute, for example <code>&lt;html lang="sr-Latn-u-em-emoji"&gt;</code>. The valid values are those <i>name</i> attribute values in the <i>type</i> elements of key name="em" in bcp47/<a href="https://github.com/unicode-org/cldr/blob/main/common/bcp47/variant.xml" target="_blank">variant.xml</a></b>.</td></tr>
<tr><td rowspan="3">"em"</td>
@@ -772,9 +816,15 @@
<td>Use an emoji presentation for emoji characters if possible.</td></tr>
<tr><td>"text"</td>
<td>Use a text presentation for emoji characters if possible.</td></tr>
- <tr><td>"default"</td><td>Use the default presentation for emoji characters as specified in UTR #51 Section 4, <a href="https://www.unicode.org/reports/tr51/#Presentation_Style">Presentation Style</a>.</td></tr>
+ <tr><td>"default"</td><td>Use the default presentation for emoji characters as specified in UTR #51 <a href="https://www.unicode.org/reports/tr51/#Presentation_Style">Presentation Style</a>.</td></tr>
-<tr><td colspan="4"><b>A <a name="UnicodeFirstDayIdentifier" id="UnicodeFirstDayIdentifier" href="#UnicodeFirstDayIdentifier">Unicode First Day Identifier</a> defines the preferred first day of the week for calendar display. Specifying "fw" in a locale identifier overrides the default value specified by supplemental week data (see Part 4 Dates, section 4.3 <a href="tr35-dates.md#Week_Data">Week Data</a>). The valid values are those <i>name</i> attribute values in the <i>type</i> elements of key name="fw" in bcp47/<a href="https://github.com/unicode-org/cldr/blob/main/common/bcp47/calendar.xml" target="_blank">calendar.xml</a></b>.</td></tr>
+<tr><td colspan="4"><b>A <a name="UnicodeFirstDayIdentifier" id="UnicodeFirstDayIdentifier" href="#UnicodeFirstDayIdentifier">Unicode First Day Identifier</a>
+ defines the preferred first day of the week for calendar display. Specifying "fw" in a locale identifier overrides the default value specified by supplemental
+ week data for the region (see Part 4 Dates, <a href="tr35-dates.md#Week_Data">Week Data</a>).
+ The valid values are those <i>name</i> attribute values in the <i>type</i> elements
+ of key name="fw" in bcp47/<a href="https://github.com/unicode-org/cldr/blob/main/common/bcp47/calendar.xml" target="_blank">calendar.xml</a>.
+ The value can affect the computation of the first day of the week: see <a href='tr35-dates.md#first-day-overrides'>First Day Overrides</a>.
+ </td></tr>
<tr><td rowspan="4">"fw"</td>
<td rowspan="4">First day of week</td>
<td>"sun"</td>
@@ -785,7 +835,11 @@
<tr><td>"sat"</td>
<td>Saturday</td></tr>
-<tr><td colspan="4"><b>A <a name="UnicodeHourCycleIdentifier" id="UnicodeHourCycleIdentifier" href="#UnicodeHourCycleIdentifier">Unicode Hour Cycle Identifier</a> defines the preferred time cycle. Specifying "hc" in a locale identifier overrides the default value specified by supplemental time data (see Part 4 Dates, section 4.4 <a href="tr35-dates.md#Time_Data">Time Data</a>). The valid values are those <i>name</i> attribute values in the <i>type</i> elements of key name="hc" in bcp47/<a href="https://github.com/unicode-org/cldr/blob/main/common/bcp47/calendar.xml" target="_blank">calendar.xml</a></b>.</td></tr>
+<tr><td colspan="4"><b>A <a name="UnicodeHourCycleIdentifier" id="UnicodeHourCycleIdentifier" href="#UnicodeHourCycleIdentifier">Unicode Hour Cycle Identifier</a>
+ defines the preferred time cycle. Specifying "hc" in a locale identifier overrides the default value specified by supplemental time data for the region
+ (see Part 4 Dates, <a href="tr35-dates.md#Time_Data">Time Data</a>). The valid values are those <i>name</i> attribute values in the <i>type</i> elements of
+ key name="hc" in bcp47/<a href="https://github.com/unicode-org/cldr/blob/main/common/bcp47/calendar.xml" target="_blank">calendar.xml</a></b>.
+ </td></tr>
<tr><td rowspan="4">"hc"</td>
<td rowspan="4">Hour cycle</td>
<td>"h12"</td>
@@ -797,7 +851,11 @@
<tr><td>"h24"</td>
<td>Hour system using 1–24; corresponds to 'k' in pattern</td></tr>
-<tr><td colspan="4"><b>A <a name="UnicodeLineBreakStyleIdentifier" id="UnicodeLineBreakStyleIdentifier" href="#UnicodeLineBreakStyleIdentifier">Unicode Line Break Style Identifier</a> defines a preferred line break style corresponding to the CSS level 3 <a href="https://drafts.csswg.org/css-text/#line-break-property">line-break option</a>. Specifying "lb" in a locale identifier overrides the locale’s default style (which may correspond to "normal" or "strict"). The valid values are those <i>name</i> attribute values in the <i>type</i> elements of key name="lb" in bcp47/<a href="https://github.com/unicode-org/cldr/blob/main/common/bcp47/segmentation.xml" target="_blank">segmentation.xml</a></b>.</td></tr>
+<tr><td colspan="4"><b>A <a name="UnicodeLineBreakStyleIdentifier" id="UnicodeLineBreakStyleIdentifier" href="#UnicodeLineBreakStyleIdentifier">Unicode Line Break Style Identifier</a>
+ defines a preferred line break style corresponding to the CSS level 3 <a href="https://drafts.csswg.org/css-text/#line-break-property">line-break option</a>.
+ Specifying "lb" in a locale identifier overrides the locale’s default style (which may correspond to "normal" or "strict"). The valid values are those <i>name</i>
+ attribute values in the <i>type</i> elements of key name="lb" in bcp47/<a href="https://github.com/unicode-org/cldr/blob/main/common/bcp47/segmentation.xml" target="_blank">segmentation.xml</a></b>.
+ </td></tr>
<tr><td rowspan="3">"lb"</td>
<td rowspan="3">Line break style</td>
<td>"strict"</td>
@@ -807,7 +865,11 @@
<tr><td>"loose"</td>
<td>CSS lev 3 line-break=loose</td></tr>
-<tr><td colspan="4"><b>A <a name="UnicodeLineBreakWordIdentifier" id="UnicodeLineBreakWordIdentifier" href="#UnicodeLineBreakWordIdentifier">Unicode Line Break Word Identifier</a> defines preferred line break word handling behavior corresponding to the CSS level 3 <a href="https://drafts.csswg.org/css-text/#word-break-property">word-break option</a>. The valid values are those <i>name</i> attribute values in the <i>type</i> elements of key name="lw" in bcp47/<a href="https://github.com/unicode-org/cldr/blob/main/common/bcp47/segmentation.xml" target="_blank">segmentation.xml</a></b>.</td></tr>
+<tr><td colspan="4"><b>A <a name="UnicodeLineBreakWordIdentifier" id="UnicodeLineBreakWordIdentifier" href="#UnicodeLineBreakWordIdentifier">Unicode Line Break Word Identifier</a>
+ defines preferred line break word handling behavior corresponding to the CSS level 3 <a href="https://drafts.csswg.org/css-text/#word-break-property">word-break option</a>.
+ Specifying "lw" in a locale identifier overrides the locale’s default style (which may correspond to "normal" or "keepall"). The valid values are those <i>name</i>
+ attribute values in the <i>type</i> elements of key name="lw" in bcp47/<a href="https://github.com/unicode-org/cldr/blob/main/common/bcp47/segmentation.xml" target="_blank">segmentation.xml</a></b>.
+ </td></tr>
<tr><td rowspan="4">"lw"</td>
<td rowspan="4">Line break word handling</td>
<td>"normal"</td>
@@ -819,10 +881,13 @@
<tr><td>"phrase"</td>
<td>Prioritize keeping natural phrases (of multiple words) together when breaking, used in short text like title and headline</td></tr>
-<tr><td colspan="4"><b>A <a name="UnicodeMeasurementSystemIdentifier" id="UnicodeMeasurementSystemIdentifier" href="#UnicodeMeasurementSystemIdentifier">Unicode Measurement System Identifier</a> defines a preferred measurement system. Specifying "ms" in a locale identifier overrides the default value specified by supplemental measurement system data (see Part 2 General, section 5 <a href="tr35-general.md#Measurement_System_Data">Measurement System Data</a>). The valid values are those <i>name</i> attribute values in the <i>type</i> elements of key name="ms" in bcp47/<a href="https://github.com/unicode-org/cldr/blob/main/common/bcp47/measure.xml" target="_blank">measure.xml</a></b>.
-The determination of preferred units depends on the locale identifer: the keys ms, mu, rg, the base locale (language, script, region) and the user preferences.
-<i>For information about preferred units and unit conversion, see <a href="tr35-info.md#Unit_Conversion">Unit Conversion</a> and <a href="tr35-info.md#Unit_Preferences">Unit Preferences</a>.</i>
-</td></tr>
+<tr><td colspan="4"><b>A <a name="UnicodeMeasurementSystemIdentifier" id="UnicodeMeasurementSystemIdentifier" href="#UnicodeMeasurementSystemIdentifier">Unicode Measurement System Identifier</a>
+ defines a preferred measurement system. Specifying "ms" in a locale identifier overrides the default value specified by supplemental measurement system data for the region
+ (see Part 2 General, <a href="tr35-general.md#Measurement_System_Data">Measurement System Data</a>). The valid values are those <i>name</i> attribute values in the
+ <i>type</i> elements of key name="ms" in bcp47/<a href="https://github.com/unicode-org/cldr/blob/main/common/bcp47/measure.xml" target="_blank">measure.xml</a></b>.
+ The determination of preferred units depends on the locale identifier: the keys ms, mu, rg, the base locale (language, script, region) and the user preferences.
+ <i>For information about preferred units and unit conversion, see <a href="tr35-info.md#Unit_Conversion">Unit Conversion</a> and <a href="tr35-info.md#Unit_Preferences">Unit Preferences</a>.</i>
+ </td></tr>
<tr><td rowspan="3">"ms"</td>
<td rowspan="3">Measurement system</td>
<td>"metric"</td>
@@ -832,8 +897,11 @@
<tr><td>"uksystem"</td>
<td>UK System of measurement: feet, pints, etc.; pints are 20oz</td></tr>
-<tr><td colspan="4"><b>A <a name="MeasurementUnitPreferenceOverride" id="MeasurementUnitPreferenceOverride" href="#MeasurementUnitPreferenceOverride">Measurement Unit Preference Override</a> defines an override for measurement unit preference. The valid values are those <i>name</i> attribute values in the <i>type</i> elements of key name="mu" in bcp47/<a href="https://github.com/unicode-org/cldr/blob/main/common/bcp47/measure.xml" target="_blank">measure.xml</a></b>.
-<i>For information about preferred units and unit conversion, see <a href="tr35-info.md#Unit_Conversion">Unit Conversion</a> and <a href="tr35-info.md#Unit_Preferences">Unit Preferences</a>.</i>
+<tr><td colspan="4"><b>A <a name="MeasurementUnitPreferenceOverride" id="MeasurementUnitPreferenceOverride" href="#MeasurementUnitPreferenceOverride">Measurement Unit Preference Override</a>
+ defines an override for measurement unit preference. The valid values are those <i>name</i> attribute values in the <i>type</i> elements of key name="mu" in
+ bcp47/<a href="https://github.com/unicode-org/cldr/blob/main/common/bcp47/measure.xml" target="_blank">measure.xml</a></b>.
+ <i>For information about preferred units and unit conversion, see <a href="tr35-info.md#Unit_Conversion">Unit Conversion</a> and <a href="tr35-info.md#Unit_Preferences">Unit Preferences</a>.</i>
+ </td></tr>
<tr><td rowspan="3">"mu"</td>
<td rowspan="3">Measurement unit override</td>
<td>"celsius"</td>
@@ -861,10 +929,15 @@
<tr><td>"tamldec"</td>
<td>Modern Tamil decimal digits</td></tr>
-<tr><td colspan="4"><b>A <a name="RegionOverride" id="RegionOverride" href="#RegionOverride">Region Override</a> specifies an alternate region to use for obtaining certain region-specific default values (those specified by the <a href="tr35-info.md#rgScope"><rgScope></a> element), instead of using the region specified by the <a href="#unicode_region_subtag">unicode_region_subtag</a> in the Unicode Language Identifier (or inferred from the <a href="#unicode_language_subtag">unicode_language_subtag</a>).</b></td></tr>
+<tr><td colspan="4"><b>A <a name="RegionOverride" id="RegionOverride" href="#RegionOverride">Region Override</a> specifies an alternate region to use for obtaining
+ certain region-specific default values (those specified by the <a href="tr35-info.md#rgScope">&lt;rgScope&gt;</a> element), instead of using the region
+ specified by the <a href="#unicode_region_subtag">unicode_region_subtag</a> in the Unicode Language Identifier (or inferred from the
+ <a href="#unicode_language_subtag">unicode_language_subtag</a>)</b>.
+ </td></tr>
<tr><td rowspan="2">"rg"</td>
<td rowspan="2">Region Override</td><td>"uszzzz"<br><br></td><td rowspan="2">The value is a <a href="#unicode_subdivision_id">unicode_subdivision_id</a> of type “unknown” or “regular”; this consists of a <a href="#unicode_region_subtag">unicode_region_subtag</a> for a regular region (not a macroregion), suffixed either by “zzzz” (case is not significant) to designate the region as a whole, or by a unicode_subdivision_suffix to provide more specificity. For example, “en-GB-u-rg-uszzzz” represents a locale for British English but with region-specific defaults set to US for items such as default currency, default calendar and week data, default time cycle, and default measurement system and unit preferences.
The determination of preferred units depends on the locale identifier: the keys ms, mu, rg, the base locale (language, script, region) and the user preferences.
+ The value can affect the computation of the first day of the week: see <a href='tr35-dates.md#first-day-overrides'>First Day Overrides</a>.
<i>For information about preferred units and unit conversion, see <a href="tr35-info.md#Unit_Conversion">Unit Conversion</a> and <a href="tr35-info.md#Unit_Preferences">Unit Preferences</a>.</i>
</td></tr>
<tr><td>…</td></tr>
@@ -873,7 +946,9 @@
<tr><td rowspan="2">"sd"</td>
<td rowspan="2">Regional Subdivision</td>
<td>"gbsct"</td>
- <td rowspan="2">A <a href="#unicode_subdivision_id">unicode_subdivision_id</a>, which is a <a href="#unicode_region_subtag">unicode_region_subtag</a> concatenated with a unicode_subdivision_suffix.<br>For example, <i>gbsct</i> is “gb”+“sct” (where sct represents the subdivision code for Scotland). Thus “en-GB-u-sd-gbsct” represents the language variant “English as used in Scotland”. And both “en-u-sd-usca” and “en-US-u-sd-usca” represent “English as used in California”. See <b><i><a href="#Unicode_Subdivision_Codes">3.6.5 Subdivision Codes</a></i></b>.</td></tr>
+ <td rowspan="2">A <a href="#unicode_subdivision_id">unicode_subdivision_id</a>, which is a <a href="#unicode_region_subtag">unicode_region_subtag</a> concatenated with a unicode_subdivision_suffix.<br>For example, <i>gbsct</i> is “gb”+“sct” (where sct represents the subdivision code for Scotland). Thus “en-GB-u-sd-gbsct” represents the language variant “English as used in Scotland”. And both “en-u-sd-usca” and “en-US-u-sd-usca” represent “English as used in California”. See <b><i><a href="#Unicode_Subdivision_Codes">3.6.5 Subdivision Codes</a></i></b>.
+ The value can affect the computation of the first day of the week: see <a href='tr35-dates.md#first-day-overrides'>First Day Overrides</a>.
+ </td></tr>
<tr><td>…</td></tr>
<tr><td colspan="4"><b>A <a name="UnicodeSentenceBreakSuppressionsIdentifier" id="UnicodeSentenceBreakSuppressionsIdentifier" href="#UnicodeSentenceBreakSuppressionsIdentifier">Unicode Sentence Break Suppressions Identifier</a> defines a set of data to be used for suppressing certain sentence breaks that would otherwise be found by UAX #29 rules. The valid values are those <i>name</i> attribute values in the <i>type</i> elements of key name="ss" in bcp47/<a href="https://github.com/unicode-org/cldr/blob/main/common/bcp47/segmentation.xml" target="_blank">segmentation.xml</a></b>.</td></tr>
@@ -889,22 +964,22 @@
<td>Time zone</td>
<td><i>Unicode short time zone IDs</i></td>
<td><p>Short identifiers defined in terms of a TZ time zone database [<a href="#Olson">Olson</a>] identifier in the common/bcp47/timezone.xml file, plus a few extra values.</p>
- <p>For more information, see <a href="#Time_Zone_Identifiers">Section 3.6.3 Time Zone Identifiers</a>.</p>
+ <p>For more information, see <a href="#Time_Zone_Identifiers">Time Zone Identifiers</a>.</p>
<p>CLDR provides data for normalizing timezone codes.</p></td></tr>
<tr><td colspan="4"><b>A <a name="UnicodeVariantIdentifier" id="UnicodeVariantIdentifier" href="#UnicodeVariantIdentifier">Unicode Variant Identifier</a> defines a special variant used for locales. The valid values are those name attribute values in the <i>type</i> elements of bcp47/<a href="https://github.com/unicode-org/cldr/blob/main/common/bcp47/variant.xml" target="_blank">variant.xml</a>.</b></td></tr>
<tr><td>"va"</td>
<td>Common variant type</td>
<td>"posix"</td>
- <td>POSIX style locale variant. About handling of the "POSIX" variant see <i>Section 3.8.2, <a href="#Legacy_Variants">Legacy Variants</a></i>.</td></tr>
+ <td>POSIX style locale variant. About handling of the "POSIX" variant see <i><a href="#Legacy_Variants">Legacy Variants</a></i>.</td></tr>
</tbody></table>
-For more information on the allowed keys and types, see the specific elements below, and [Section 3.6.4 U Extension Data Files](#Unicode_Locale_Extension_Data_Files).
+For more information on the allowed keys and types, see the specific elements below, and [U Extension Data Files](#Unicode_Locale_Extension_Data_Files).
Additional keys or types might be added in future versions. Implementations of LDML should be robust to handle any syntactically valid key or type values.
-#### 3.6.2 <a name="Numbering%20System%20Data" href="#Numbering%20System%20Data">Numbering System Data</a>
+#### <a name="Numbering%20System%20Data" href="#Numbering%20System%20Data">Numbering System Data</a>
LDML supports multiple numbering systems. The identifiers for those numbering systems are defined in the file **bcp47/number.xml**. For example, for the latest version of the data see [bcp47/number.xml](https://github.com/unicode-org/cldr/blob/main/common/bcp47/number.xml).
@@ -917,7 +992,7 @@
1. It is a decimal, positional numbering system with an attribute `digits=X`, where `X` is a string with the 10 digits in order used by the numbering system.
2. The values of the type and digits will never change.
-#### 3.6.3 <a name="Time_Zone_Identifiers" href="#Time_Zone_Identifiers">Time Zone Identifiers</a>
+#### <a name="Time_Zone_Identifiers" href="#Time_Zone_Identifiers">Time Zone Identifiers</a>
LDML inherits time zone IDs from the tz database [[Olson](#Olson)]. Because these IDs from the tz database do not satisfy the BCP 47 language subtag syntax requirements, CLDR defines short identifiers for the use in the Unicode locale extension. The short identifiers are defined in the file **common/bcp47/timezone.xml**.
@@ -931,15 +1006,15 @@
Although the short time zone identifiers are guaranteed to be stable, the preferred IDs in the tz database (as those found in **zone.tab** file) might be changed from time to time. For example, "Asia/Calcutta" was replaced with "Asia/Kolkata" and moved to **backward** file in the tz database. Because CLDR contains locale data using a time zone ID from the tz database as the key, stability of the IDs is critical.
-To maintain the stability of "long" IDs (for those inherited from the tz database), a special rule applied to the `alias` attribute in the `<type>` element for "tz" - the first "long" ID is the CLDR canonical "long" time zone ID.
+To maintain the stability of "long" IDs (for those inherited from the tz database), a special rule is applied to the `alias` attribute in the `<type>` element for "tz" - the first "long" ID is the CLDR canonical "long" time zone ID. In addition, the `iana` attribute specifies the preferred ID in the tz database if it is different from the CLDR canonical "long" ID.
For example:
```xml
-<type name="inccu" alias="Asia/Calcutta Asia/Kolkata" description="Kolkata, India"/>
+<type name="inccu" description="Kolkata, India" alias="Asia/Calcutta Asia/Kolkata" iana="Asia/Kolkata"/>
```
-Above `<type>` element defines the short time zone ID "inccu" (for the use in the Unicode locale extension), corresponding _CLDR canonical "long" ID_ "Asia/Culcutta", and an alias "Asia/Kolkata".
+The above `<type>` element defines the short time zone ID "inccu" (for the use in the Unicode locale extension), the corresponding _CLDR canonical "long" ID_ "Asia/Calcutta", and an alias "Asia/Kolkata". In the tz database, the preferred ID for this time zone is "Asia/Kolkata".
**Links in the tz database**
@@ -954,7 +1029,7 @@
Even if, for example, Serbia and Croatia share the same rules, CLDR maintains the difference so that the user can either pick "Serbia time" or "Croatia time".
The Croat is not forced to pick "Serbia time" (Europe/Belgrade) nor the Serb forced to pick “Croatia time” (Europe/Zagreb).
-#### 3.6.4 <a name="Unicode_Locale_Extension_Data_Files" href="#Unicode_Locale_Extension_Data_Files">U Extension Data Files</a>
+#### <a name="Unicode_Locale_Extension_Data_Files" href="#Unicode_Locale_Extension_Data_Files">U Extension Data Files</a>
The 'u' extension data is stored in multiple XML files located under common/bcp47 directory in CLDR. Each file contains the locale extension key/type values and their backward compatibility mappings appropriate for a particular domain. [common/bcp47/collation.xml](https://github.com/unicode-org/cldr/blob/maint/maint-41/common/bcp47/collation.xml) contains key/type values for collation, including optional collation parameters and valid type values for each key.
@@ -980,6 +1055,7 @@
<!ATTLIST type preferred NMTOKEN #IMPLIED>
<!ATTLIST type alias CDATA #IMPLIED>
<!ATTLIST type since CDATA #IMPLIED>
+<!ATTLIST type iana CDATA #IMPLIED >
<!ELEMENT attribute EMPTY>
<!ATTLIST attribute name NMTOKEN #REQUIRED>
@@ -1016,11 +1092,11 @@
>
> > **en-u-vt-00A4** : this indicates English, with any characters sorting at or below " ¤" (at a primary level) considered Variable.
>
-> By default in UCA, variable characters are ignored in sorting at a primary, secondary, and tertiary level. But in CLDR, they are not ignorable by default. For more information, see [Collation: Section 3.3 _Setting Options_](tr35-collation.md#Setting_Options) .
+> By default in UCA, variable characters are ignored in sorting at a primary, secondary, and tertiary level. But in CLDR, they are not ignorable by default. For more information, see [Collation: _Setting Options_](tr35-collation.md#Setting_Options) .
>
> ##### <a name="REORDER_CODE" href="#REORDER_CODE">REORDER_CODE</a>
>
-> The type name **"REORDER_CODE"** is reserved for reordering block names (e.g. "latn", "digit" and "others") defined in the _[Root Collation](tr35-collation.md#Root_Collation)_. The type "REORDER_CODE" is used for locale extension key "kr" (colReorder). The value of type for "kr" is represented by one or more reordering block names such as "latn-digit". For more information, see [Collation: Section 3.12 _Collation Reordering_](tr35-collation.md#Script_Reordering) .
+> The type name **"REORDER_CODE"** is reserved for reordering block names (e.g. "latn", "digit" and "others") defined in the _[Root Collation](tr35-collation.md#Root_Collation)_. The type "REORDER_CODE" is used for locale extension key "kr" (colReorder). The value of type for "kr" is represented by one or more reordering block names such as "latn-digit". For more information, see [Collation: _Collation Reordering_](tr35-collation.md#Script_Reordering) .
>
> ##### <a name="RG_KEY_VALUE" href="#RG_KEY_VALUE">RG_KEY_VALUE</a>
>
@@ -1051,7 +1127,7 @@
**description**
-> The description of the `key`, `type` or `attribute` element. There is also some informative text about certain keys and types in the Section 3.5 [Key And Type Definitions](#Key_And_Type_Definitions_).
+> The description of the `key`, `type` or `attribute` element. There is also some informative text about certain keys and types in the [Key And Type Definitions](#Key_And_Type_Definitions_).
**deprecated**
@@ -1112,7 +1188,14 @@
It is strongly recommended that all API methods accept all possible aliases for keywords and types, but generate the canonical form. For example, "ar-u-ca-islamicc" would be equivalent to "ar-u-ca-islamic-civil" on input, but the latter should be output. The one exception is where an alias would only be well-formed with the old syntax, such as "gregorian" (for "gregory").
-#### 3.6.5 <a name="Unicode_Subdivision_Codes" href="#Unicode_Subdivision_Codes">Subdivision Codes</a>
+
+In the Unicode locale extension 'u' data files, the `<type>` element has the following optional attribute:
+
+**iana**
+
+This attribute is used by `tz` types to specify the preferred zone ID in the IANA time zone database.
+
+#### <a name="Unicode_Subdivision_Codes" href="#Unicode_Subdivision_Codes">Subdivision Codes</a>
The subdivision codes designate a subdivision of a country or region. They are called various names, such as a _state_ in the United States, or a _province_ in Canada. The codes in CLDR are based on ISO 3166-2 subdivision codes. The ISO codes have a region code followed by a hyphen, then a suffix consisting of 1..3 ASCII letters or digits.
@@ -1125,9 +1208,9 @@
Like BCP 47, CLDR requires stable codes, which are not guaranteed for ISO 3166-2 (nor have the ISO 3166-2 codes been stable in the past). If an ISO 3166-2 code is removed, it remains valid (though marked as deprecated) in CLDR. If an ISO 3166-2 code is reused (for the same region), then CLDR will define a new equivalent code using these as 4-character suffixes.
-##### 3.6.5.1 <a name="Validity" href="#Validity">Validity</a>
+##### <a name="Validity" href="#Validity">Validity</a>
-A [unicode_subdivision_id](#unicode_subdivision_id) is only valid when it is present in the subdivision.xml file as described in _Section 3.11 [Validity Data](#Validity_Data)_. The data is in a compressed form, and thus needs to be expanded before such a test is made.
+A [unicode_subdivision_id](#unicode_subdivision_id) is only valid when it is present in the subdivision.xml file as described in _[Validity Data](#Validity_Data)_. The data is in a compressed form, and thus needs to be expanded before such a test is made.
_Examples:_
@@ -1148,11 +1231,11 @@
In version 28.0, the subdivisions in the validity files used the ISO format, uppercase with a hyphen separating two components, instead of the BCP 47 format.
<a name="t_Extension"></a>
-### 3.7 <a name="BCP47_T_Extension" href="#BCP47_T_Extension">Unicode BCP 47 T Extension</a>
+### <a name="BCP47_T_Extension" href="#BCP47_T_Extension">Unicode BCP 47 T Extension</a>
The Unicode Consortium has registered and is the maintaining authority for two BCP 47 language tag extensions: the extension 'u' for Unicode locale extension [[RFC6067](#RFC6067)] and extension 't' for transformed content [[RFC6497](#RFC6497)]. The Unicode BCP 47 extension data defines the complete list of valid subtags. While the title of the RFC is “Transformed Content”, the abstract makes it clear that the scope is broader than the term "transformed" might indicate to a casual reader: “including content that has been transliterated, transcribed, or translated, or _in some other way influenced by the source. It also provides for additional information used for identification._”
-**The -t- Extension.** The syntax of 't' extension subtags is defined by the rule `unicode_locale_extensions` in [_Section 3.2 Unicode locale identifier_](#Unicode_locale_identifier), except the separator of subtags `sep` must be always hyphen '-' when the extension is used as a part of BCP 47 language tag. For information about the registration process, meaning, and usage of the 't' extension, see [[RFC6497](#RFC6497)].
+**The -t- Extension.** The syntax of 't' extension subtags is defined by the rule `transformed_extensions` in [_Unicode locale identifier_](#Unicode_locale_identifier), except the separator of subtags `sep` must be always hyphen '-' when the extension is used as a part of BCP 47 language tag. For information about the registration process, meaning, and usage of the 't' extension, see [[RFC6497](#RFC6497)].
These subtags are all in lowercase (that is the canonical casing for these subtags), however, subtags are case-insensitive and casing does not carry any specific meaning. All subtags within the Unicode extensions are alphanumeric characters in length of two to eight that meet the rule `extension` in the [[BCP47](#BCP47)].
@@ -1165,10 +1248,10 @@
| i0 | **Input Method Engine transform:** Used to indicate an input method transformation, such as one used by a client-side input method. The first subfield in a sequence would typically be a 'platform' or vendor designation. | [transform_ime.xml](https://github.com/unicode-org/cldr/blob/maint/maint-41/common/bcp47/transform_ime.xml) |
| k0 | **Keyboard transform:** Used to indicate a keyboard transformation, such as one used by a client-side virtual keyboard. The first subfield in a sequence would typically be a 'platform' designation, representing the platform that the keyboard is intended for. The keyboard might or might not correspond to a keyboard mapping shipped by the vendor for the platform. One or more subsequent fields may occur, but are only added where needed to distinguish from others. | [transform_keyboard.xml](https://github.com/unicode-org/cldr/blob/maint/maint-41/common/bcp47/transform_keyboard.xml) |
| t0 | **Machine Translation:** Used to indicate content that has been machine translated, or a request for a particular type of machine translation of content. The first subfield in a sequence would typically be a 'platform' or vendor designation. | [transform_mt.xml](https://github.com/unicode-org/cldr/blob/maint/maint-41/common/bcp47/transform_mt.xml) |
-| h0 | **Hybrid Locale Identifiers:** h0 with the value 'hybrid' indicates that the -t- value is a language that is mixed into the main language tag to form a hybrid. For more information, and examples, see _Section 3.10.2 [Hybrid Locale Identifiers](#Hybrid_Locale)._ | [transform_hybrid.xml](https://github.com/unicode-org/cldr/blob/maint/maint-41/common/bcp47/transform_hybrid.xml) |
+| h0 | **Hybrid Locale Identifiers:** h0 with the value 'hybrid' indicates that the -t- value is a language that is mixed into the main language tag to form a hybrid. For more information, and examples, see _[Hybrid Locale Identifiers](#Hybrid_Locale)._ | [transform_hybrid.xml](https://github.com/unicode-org/cldr/blob/maint/maint-41/common/bcp47/transform_hybrid.xml) |
| x0 | **Private use transform** | [transform_private_use.xml](https://github.com/unicode-org/cldr/blob/maint/maint-41/common/bcp47/transform_private_use.xml) |
-#### 3.7.1 <a name="Transformed_Content_Data_File" href="#Transformed_Content_Data_File">T Extension Data Files</a>
+#### <a name="Transformed_Content_Data_File" href="#Transformed_Content_Data_File">T Extension Data Files</a>
The overall structure of the data files is similar to the U Extension, with the following exceptions.
@@ -1206,11 +1289,11 @@
For information about the registration process, meaning, and usage of the 't' extension, see [[RFC6497](#RFC6497)].
-### 3.8 <a name="Compatibility_with_Older_Identifiers" href="#Compatibility_with_Older_Identifiers">Compatibility with Older Identifiers</a>
+### <a name="Compatibility_with_Older_Identifiers" href="#Compatibility_with_Older_Identifiers">Compatibility with Older Identifiers</a>
LDML versions before 1.7.2 used a slightly different syntax for variant subtags and locale extensions. Implementations of LDML may provide backward-compatible identifier support as described in the following sections.
-#### 3.8.1 <a name="Old_Locale_Extension_Syntax" href="#Old_Locale_Extension_Syntax">Old Locale Extension Syntax</a>
+#### <a name="Old_Locale_Extension_Syntax" href="#Old_Locale_Extension_Syntax">Old Locale Extension Syntax</a>
LDML 1.7 or older specification used different syntax for representing Unicode locale extensions. The previous definition of Unicode locale extensions had the following structure:
@@ -1244,7 +1327,7 @@
2. **valid** - well-formed and only uses registered language subtags, extensions, keywords, types...
3. **canonical** - valid and no deprecated codes or structure.
-#### 3.8.2 <a name="Legacy_Variants" href="#Legacy_Variants">Legacy Variants</a>
+#### <a name="Legacy_Variants" href="#Legacy_Variants">Legacy Variants</a>
Old LDML specification allowed codes other than registered [[BCP47](#BCP47)] variant subtags used in Unicode language and locale identifiers for representing variations of locale data. Unicode locale identifiers including such variant codes can be converted to the new [[BCP47](#BCP47)] compatible identifiers by following the descriptions below:
@@ -1270,7 +1353,7 @@
> 👉 Note that the mapping between `en_US_POSIX` and `en-US-u-va-posix` is a conversion process, not a canonicalization process.
-#### 3.8.3 <a name="Relation_to_OpenI18n" href="#Relation_to_OpenI18n">Relation to OpenI18n</a>
+#### <a name="Relation_to_OpenI18n" href="#Relation_to_OpenI18n">Relation to OpenI18n</a>
The locale id format generally follows the description in the _OpenI18N Locale Naming Guideline_ [[NamingGuideline](#NamingGuideline)], with some enhancements. The main differences from those guidelines are that the locale id:
@@ -1279,7 +1362,7 @@
3. adds the ability to discriminate the written language by script (or script variant).
4. is a superset of [[BCP47](#BCP47)] codes.
-### 3.9 <a name="Transmitting_Locale_Information" href="#Transmitting_Locale_Information">Transmitting Locale Information</a>
+### <a name="Transmitting_Locale_Information" href="#Transmitting_Locale_Information">Transmitting Locale Information</a>
In a world of on-demand software components, with arbitrary connections between those components, it is important to get a sense of where localization should be done, and how to transmit enough information so that it can be done at that appropriate place. End-users need to get messages localized to their languages, messages that not only contain a translation of text, but also contain variables such as date, time, number formats, and currencies formatted according to the users' conventions. The strategy for doing the so-called _JIT localization_ is made up of two parts:
@@ -1297,7 +1380,7 @@
Even though localization should be done as close to the end-user as possible, there will be cases where different components need to be aware of whatever settings are appropriate for doing the localization. Thus information such as a locale code or time zone needs to be communicated between different components.
-#### 3.9.1 <a name="Message_Formatting_and_Exceptions" href="#Message_Formatting_and_Exceptions">Message Formatting and Exceptions</a>
+#### <a name="Message_Formatting_and_Exceptions" href="#Message_Formatting_and_Exceptions">Message Formatting and Exceptions</a>
Windows ([FormatMessage](https://learn.microsoft.com/en-us/windows/win32/api/winbase/nf-winbase-formatmessage), [String.Format](https://learn.microsoft.com/en-us/dotnet/api/system.string.format?view=net-6.0)), Java ([MessageFormat](https://docs.oracle.com/javase/7/docs/api/java/text/MessageFormat.html)) and ICU ([MessageFormat](https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classMessageFormat.html), [umsg](https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/umsg_8h.html)) all provide methods of formatting variables (dates, times, etc) and inserting them at arbitrary positions in a string. This avoids the manual string concatenation that causes severe problems for localization. The question is, where to do this? It is especially important since the original code site that originates a particular message may be far down in the bowels of a component, and passed up to the top of the component with an exception. So we will take that case as representative of this class of issues.
@@ -1307,7 +1390,7 @@
In addition, exceptions are often caught at a higher level; they do not end up being displayed to any end-user at all. By avoiding the localization at the throw site, one also avoids the cost of doing formatting when that formatting is not really necessary. In fact, in many running programs most of the exceptions that are thrown at a low level never end up being presented to an end-user, so this can have considerable performance benefits.
-### 3.10 <a name="Language_and_Locale_IDs" href="#Language_and_Locale_IDs">Unicode Language and Locale IDs</a>
+### <a name="Language_and_Locale_IDs" href="#Language_and_Locale_IDs">Unicode Language and Locale IDs</a>
People have very slippery notions of what distinguishes a language code versus a locale code. The problem is that both are somewhat nebulous concepts.
@@ -1321,7 +1404,7 @@
Notice also that _currency codes_ are different than _currency localizations_. The currency localizations should largely be in the language-based resource bundles, not in the territory-based resource bundles. Thus, the resource bundle _en_ contains the localized mappings in English for a range of different currency codes: USD → US$, RUR → Rub, AUD → $A and so on. Of course, some currency symbols are used for more than one currency, and in such cases specializations appear in the territory-based bundles. Continuing the example, _en_US_ would have USD → $, while _en_AU_ would have AUD → $. (In protocols, the currency codes should always accompany any currency amounts; otherwise the data is ambiguous, and software is forced to use the user's territory to guess at the currency. For some informal discussion of this, see [JIT Localization](https://unicode-org.github.io/icu-docs/design/jit_localization.html).)
-#### 3.10.1 <a name="Written_Language" href="#Written_Language">Written Language</a>
+#### <a name="Written_Language" href="#Written_Language">Written Language</a>
Criteria for what makes a written language should be purely pragmatic; _what would copy-editors say?_ If one gave them text like the following, they would respond that is far from acceptable English for publication, and ask for it to be redone:
@@ -1336,7 +1419,7 @@
Note that the language of locale data may differ from the language of localized software or web sites, when those latter are not localized into the user's preferred language. In such cases, the kind of incongruous juxtapositions described above may well appear, but this situation is usually preferable to forcing unfamiliar date or number formats on the user as well.
-#### 3.10.2 <a name="Hybrid_Locale" href="#Hybrid_Locale">Hybrid Locale Identifiers</a>
+#### <a name="Hybrid_Locale" href="#Hybrid_Locale">Hybrid Locale Identifiers</a>
Hybrid locales have intermixed content from 2 (or more) languages, often with one language's grammatical structure applied to words in another. These are commonly referred to with portmanteau words such as _Franglais, [Spanglish](https://en.wikipedia.org/wiki/Spanglish)_ or _Denglish_. Hybrid locales do _not_ reference text simply containing two languages: a book of parallel text containing English and French, such as the following, is not Franglais:
@@ -1348,7 +1431,7 @@
While text in a document can be tagged as partly in one language and partly in another, that is not the same as having a hybrid locale. There is a difference between having a Spanglish document, and a Spanish document that has some passages quoted in English. Fine-grained tagging doesn't handle grammatical combinations like Tanglish “Enna matteru?” (_What’s the matter?_), which is neither standard Tamil nor standard English. More importantly, it doesn’t work for the very common use case for a [unicode_locale_id](#unicode_locale_id): _locale selection_.
-To communicate requests for localized content and internationalization services, locales are used. When people pick a language from a menu, internally they are picking a locale (en-GB, es-419, etc.). To allow an application to support Spanglish or Hinglish locale selection, [unicode_locale_id](#unicode_locale_id)s can represent hybrid locales using the T Extension key-value 'h0-hybrid'. (For more information on the T extension, see _Section 3.7 [Unicode BCP 47 T Extension](#t_Extension)._)
+To communicate requests for localized content and internationalization services, locales are used. When people pick a language from a menu, internally they are picking a locale (en-GB, es-419, etc.). To allow an application to support Spanglish or Hinglish locale selection, [unicode_locale_id](#unicode_locale_id)s can represent hybrid locales using the T Extension key-value 'h0-hybrid'. (For more information on the T extension, see _[Unicode BCP 47 T Extension](#t_Extension)._)
_However, if users typically expect their language in a non-default script to contain a significant amount of text due to lexical borrowing, then the -t- and hybrid subtags may be omitted. An example of this is when Hindi is written in Latin script: since Romanized Hindi typically contains a significant amount of English text, ‘hi-Latn’ can be used instead of ‘hi-Latn-t-en-h0-hybrid’._
This tends to work better in implementations that don't yet handle the -t- extension.
@@ -1384,7 +1467,7 @@
Should there ever be strong need for hybrids of more than two languages or for other purposes such as hybrid languages as the source of translated content, additional structure could be added.
-### 3.11 <a name="Validity_Data" href="#Validity_Data">Validity Data</a>
+### <a name="Validity_Data" href="#Validity_Data">Validity Data</a>
```xml
<!ELEMENT idValidity (id*) >
@@ -1397,15 +1480,15 @@
* **regular** — the standard codes used for the specific type of subtag
* **special** — certain exceptional language codes like 'mul' _(languages only)_
-* **unknown** — the code used to indicate the "unknown", "undetermined" or "invalid" values. For more information, see _Section 3.5.1 [Unknown or Invalid Identifiers](#Unknown_or_Invalid_Identifiers)_.
+* **unknown** — the code used to indicate the "unknown", "undetermined" or "invalid" values. For more information, see _[Unknown or Invalid Identifiers](#Unknown_or_Invalid_Identifiers)_.
* **macroregion** — the standard codes that are macroregions _(for regions only)._
* Note that some two-letter region codes are macroregions, and (in the future) some three-digit codes may be regular codes.
* For details as to which regions are contained within which macroregions, see the `<containment>` element of the supplemental data.
* **deprecated** — codes that should not be used. The `<alias>` element in the supplementalMeta file contains more information about these codes, and which codes should be used instead.
-* **private_use** — codes that, for CLDR, are considered private use. Note that some private-use codes in a source standard such as BCP 47 have defined CLDR semantics, and are considered regular codes. For more information, see _Section 3.5.3 [Private Use Codes](#Private_Use_Codes)._
+* **private_use** — codes that, for CLDR, are considered private use. Note that some private-use codes in a source standard such as BCP 47 have defined CLDR semantics, and are considered regular codes. For more information, see _[Private Use Codes](#Private_Use_Codes)._
* **reserved** — codes that are private use in a source standard, but are reserved for future use as regular codes by CLDR.
-The list of subtags for each idStatus use a compact format as a space-delimited list of StringRanges, as defined in _Section [5.3.4 String Range](#String_Range)._ The separator for each StringRange is a "~".
+The list of subtags for each idStatus uses a compact format as a space-delimited list of StringRanges, as defined in _[String Range](#String_Range)._ The separator for each StringRange is a "~".
Each measure unit is a sequence of subtags, such as “angle-arc-minute”. The first subtag provides a general “category” of the unit.
@@ -1413,7 +1496,7 @@
-## 4 <a name="Locale_Inheritance" href="#Locale_Inheritance">Locale Inheritance and Matching</a>
+## <a name="Locale_Inheritance" href="#Locale_Inheritance">Locale Inheritance and Matching</a>
The XML format relies on an inheritance model, whereby the resources are collected into _bundles_, and the bundles organized into a tree. Data for the many Spanish locales does not need to be duplicated across all of the countries having Spanish as a national language. Instead, common data is collected in the Spanish language locale, and territory locales only need to supply differences. The parent of all of the language locales is a generic locale known as _root_. Wherever possible, the resources in the root are language & territory neutral. For example, the collation (sorting) order in the root is based on the [[DUCET](#DUCET)] (see _[Root Collation](tr35-collation.md#Root_Collation)_). Since English language collation has the same ordering as the root locale, the 'en' locale data does not need to supply any collation data, nor do 'en_US', 'en_GB', or any of the various other locales that use English.
@@ -1449,7 +1532,7 @@
If a type and key are supplied in the locale id, then logically the chain from that id to the root is searched for a resource tag with a given type, all the way up to root. If no resource is found with that tag and type, then the chain is searched again without the type.
-Thus the data for any given locale will only contain resources that are different from the parent locale. For example, most territory locales will inherit the bulk of their data from the language locale: "en" will contain the bulk of the data: "en_IE" will only contain a few items like currency. All data that is inherited from a parent is presumed to be valid, just as valid as if it were physically present in the file. This provides for much smaller resource bundles, and much simpler (and less error-prone) maintenance. At the script or region level, the "primary" child locale will be empty, since its parent will contain all of the appropriate resources for it. For more information see _CLDR Information: Section 9.3 [Default Content](tr35-info.md#Default_Content)._
+Thus the data for any given locale will only contain resources that are different from the parent locale. For example, most territory locales will inherit the bulk of their data from the language locale: "en" will contain the bulk of the data: "en_IE" will only contain a few items like currency. All data that is inherited from a parent is presumed to be valid, just as valid as if it were physically present in the file. This provides for much smaller resource bundles, and much simpler (and less error-prone) maintenance. At the script or region level, the "primary" child locale will be empty, since its parent will contain all of the appropriate resources for it. For more information see _CLDR Information: [Default Content](tr35-info.md#Default_Content)._
Certain data items depend only on the region specified in a locale id (by a [unicode_region_subtag](#unicode_region_subtag_validity) or an “rg” [Region Override](#RegionOverride) key), and are obtained from supplemental data rather than through locale resources. For example:
@@ -1459,9 +1542,9 @@
(For more information on the specific items handled this way, see [Territory-Based Preferences](tr35-info.md#Territory_Based_Preferences).) These items will be correct for the specified region regardless of whether a locale bundle actually exists with the same combination of language and region as in the locale id. For example, suppose data is requested for the locale id "fr_US" and there is no bundle for that combination. Data obtained via locale inheritance, such as currency patterns and currency symbols, will be obtained from the parent locale "fr". However, currency amounts would be formatted by default using US dollars, just displayed in the manner governed by the locale "fr". When a locale id does not specify a region, the region-specific items such as those above are obtained from the likely region for the locale (obtained via [Likely Subtags](#Likely_Subtags)).
-For the relationship between Inheritance, DefaultContent, LikelySubtags, and LocaleMatching, see Section 4.2.6 [Inheritance vs Related Information](tr35.md#Inheritance_vs_Related).
+For the relationship between Inheritance, DefaultContent, LikelySubtags, and LocaleMatching, see [Inheritance vs Related Information](tr35.md#Inheritance_vs_Related).
-### 4.1 <a name="Lookup" href="#Lookup">Lookup</a>
+### <a name="Lookup" href="#Lookup">Lookup</a>
If a language has more than one script in customary modern use, then the CLDR file structure in common/main follows the following model:
@@ -1469,19 +1552,21 @@
lang
lang_script
lang_script_region
-lang_region (aliases to lang_script_region)
+lang_region (aliases to lang_script_region based on likely subtags)
```
-#### 4.1.1 <a name="Bundle_vs_Item_Lookup" href="#Bundle_vs_Item_Lookup">Bundle vs Item Lookup</a>
+#### <a name="Bundle_vs_Item_Lookup" href="#Bundle_vs_Item_Lookup">Bundle vs Item Lookup</a>
There are actually two different kinds of inheritance fallback: _resource bundle lookup_ and _resource item lookup_. For the former, a process is looking to find the first, best resource bundle it can; for the latter, it is fallback within bundles on individual items, like the translated name for the region "CN" in Breton.
These are closely related, but distinct, processes. They are illustrated in the table [Lookup Differences](#Lookup-Differences), where "key" stands for zero or more key/type pairs. Logically speaking, when looking up an item for a given locale, you first do a resource bundle lookup to find the best bundle for the locale, then you do an inherited item lookup starting with that resource bundle.
-The table [Lookup Differences](#Lookup-Differences) uses the naïve resource bundle lookup for illustration. More sophisticated systems will get far better results for resource bundle lookup if they use the algorithm described in _Section 4.4 [Language Matching](#LanguageMatching)_. That algorithm takes into account both the user’s desired locale(s) and the application’s supported locales, in order to get the best match.
+The table [Lookup Differences](#Lookup-Differences) uses the naïve resource bundle lookup for illustration. More sophisticated systems will get far better results for resource bundle lookup if they use the algorithm described in _[Language Matching](#LanguageMatching)_. That algorithm takes into account both the user’s desired locale(s) and the application’s supported locales, in order to get the best match.
If the naïve resource bundle lookup is used, the desired locale needs to be canonicalized using [Likely Subtags](#Likely_Subtags) and the supplemental alias information, so that locales that CLDR considers identical are treated as such. Thus eng-Latn-GB should be mapped to en-GB, and cmn-TW mapped to zh-Hant-TW.
+The initial bundle accessed during resource bundle lookup should not contain a script subtag unless, according to likely subtags, the script is required to disambiguate the locale. For example, `zh-Hant-TW` should start lookup at `zh-TW` (since `zh-TW` implies `Hant`), and `de-Latn-LI` should start at `de-LI` (since `de` implies `Latn` and `de-LI` does not have its own entry in likely subtags).
+
For the purposes of CLDR, everything with the `<ldml>` dtd is treated logically as if it is one resource bundle, even if the implementation separates data into separate physical resource bundles. For example, suppose that there is a main XML file for Nama (naq), but there are no `<unit>` elements for it because the units are all inherited from root. If the `<unit>` elements are separated into a separate data tree for modularity in the implementation, the Nama `<unit>` resource bundle would be empty. However, for purposes of resource-bundle lookup the resource bundle lookup still stops at naq.xml.
###### Table: <a name="Lookup-Differences" href="#Lookup-Differences">Lookup Differences</a>
@@ -1545,7 +1630,7 @@
* break iteration
* case mapping
* transliteration
- * The lookup for transliteration is yet more complicated because of the interplay of source and target locales: see _Part 2 General, Section 10.1 [Inheritance.](tr35-general.md#Inheritance)_
+ * The lookup for transliteration is yet more complicated because of the interplay of source and target locales: see _Part 2 General, [Inheritance.](tr35-general.md#Inheritance)_
Thus if there is no Akan locale, for example, asking for a collation for Akan should produce the root collation, _not the Swedish collation._
@@ -1555,7 +1640,7 @@
Where the LDML inheritance relationship does not match a target system, such as POSIX, the data logically should be fully resolved in converting to a format for use by that system, by adding _all_ inherited data to each locale data set.
-For a more complete description of how inheritance applies to data, and the use of keywords, see _[Section 4.2 Inheritance](#Inheritance_and_Validity)_ .
+For a more complete description of how inheritance applies to data, and the use of keywords, see _[Inheritance](#Inheritance_and_Validity)_ .
The locale data does not contain general character properties that are derived from the _Unicode Character Database_ [[UAX44](https://www.unicode.org/reports/tr41/#UAX44)]. That data being common across locales, it is not duplicated in the bundles. Constructing a POSIX locale from the CLDR data requires use of UCD data. In addition, POSIX locales may also specify the character encoding, which requires the data to be transformed into that target encoding.
@@ -1569,7 +1654,7 @@
```
<a name="Multiple_Inheritance"></a>
-#### 4.1.2 <a name="Lateral_Inheritance" href="#Lateral_Inheritance">Lateral Inheritance</a>
+#### <a name="Lateral_Inheritance" href="#Lateral_Inheritance">Lateral Inheritance</a>
__Lateral Inheritance__ is where resources are inherited from within the same locale, _before inheriting from the parent_. This is used for the following element@attribute instances:
@@ -1593,7 +1678,7 @@
| count | plural_rules(locale, x) → "other" → ∅ | minDays, pluralMinimalPairs |
| ordinal | plural_rules(locale, x) → "other" → ∅ | ordinalMinimalPairs |
-The gender fallback is to neuter if the locale has a neuter gender, otherwise masculine. This may be extended in the future if necessary. See also [Part 2, Section 15, Grammatical Features](tr35-general.md#Grammatical_Features).
+The gender fallback is to neuter if the locale has a neuter gender, otherwise masculine. This may be extended in the future if necessary. See also [Part 2, Grammatical Features](tr35-general.md#Grammatical_Features).
For example, if there is no value for a path, and that path has a [@count="x"] attribute and value, then:
@@ -1673,10 +1758,11 @@
| root | `//ldml/numbers/currencies/currency[@type="CAD"]/displayName` |
-#### 4.1.3 <a name="Parent_Locales" href="#Parent_Locales">Parent Locales</a>
+#### <a name="Parent_Locales" href="#Parent_Locales">Parent Locales</a>
```xml
<!ELEMENT parentLocales ( parentLocale* ) >
+<!ATTLIST parentLocales component NMTOKEN #IMPLIED >
<!ELEMENT parentLocale EMPTY >
<!ATTLIST parentLocale parent NMTOKEN #REQUIRED >
<!ATTLIST parentLocale locales NMTOKENS #REQUIRED >
@@ -1701,7 +1787,71 @@
<parentLocale parent="es_419" locales="es_AR es_BO … es_UY es_VE"/>
```
-Collation data, however, is an exception. Since collation rules do not truly inherit data from the parent, the `parentLocale` element is not necessary and not used for collation. Thus, for a locale like zh_Hant in the example above, the `parentLocale` element would dictate the parent as "root" when referring to main locale data, but for collation data, the parent locale would still be "zh", even though the `parentLocale` element is present for that locale.
+There are certain components that require addenda to the common parent fallback rules. For a locale like `zh_Hant` in the example above, the `parentLocale` element would dictate the parent as `root` when referring to main locale data, but for collation data, the parent locale should still be `zh`, even though the `parentLocale` element is present for that locale. To address this, components can have their own fallback rules that inherit from the common rules and add additional parents that supplement or override the common rules:
+
+```xml
+<parentLocales component="segmentations">
+ <parentLocale parent="zh" locales="zh_Hant"/>
+</parentLocales>
+```
+
+Logically, component-specific parent locales should be merged with the common parent locales as if merging maps with children as the keys and parents as the values, and retaining the component-specific parents whenever there are duplicate keys. For example, consider the following XML:
+
+```xml
+<parentLocales>
+ <parentLocale parent="root" locales="az_Arab az_Cyrl yue_Hans zh_Hant"/>
+ <parentLocale parent="en_001" locales="en_150 en_AG en_SI"/>
+</parentLocales>
+<parentLocales component="collations">
+ <parentLocale parent="zh" locales="zh_Hant"/>
+ <parentLocale parent="zh_Hant" locales="yue yue_Hant"/>
+ <parentLocale parent="zh_Hans" locales="yue_CN yue_Hans yue_Hans_CN"/>
+</parentLocales>
+```
+
+These data correspond to the following key-value maps:
+
+```javascript
+// Common parents
+{
+ "az_Arab": "root",
+ "az_Cyrl": "root",
+ "en_150": "en_001",
+ "en_AG": "en_001",
+ "en_SI": "en_001",
+ "yue_Hans": "root",
+ "zh_Hant": "root",
+}
+
+// Collation overrides
+{
+ "yue_CN": "zh_Hans",
+ "yue_Hans_CN": "zh_Hans",
+ "yue_Hans": "zh_Hans",
+ "yue_Hant": "zh_Hant",
+ "yue": "zh_Hant",
+ "zh_Hant": "zh",
+}
+```
+
+The resulting set of parents used for collations should then be:
+
+```javascript
+// Resolved collation parents
+{
+ "az_Arab": "root",
+ "az_Cyrl": "root",
+ "en_150": "en_001",
+ "en_AG": "en_001",
+ "en_SI": "en_001",
+ "yue_CN": "zh_Hans",
+ "yue_Hans_CN": "zh_Hans",
+ "yue_Hans": "zh_Hans",
+ "yue_Hant": "zh_Hant",
+ "yue": "zh_Hant",
+ "zh_Hant": "zh",
+}
+```
Since parentLocale information is not localizable on a per locale basis, the parentLocale information is contained in CLDR’s [supplemental data.](tr35-info.md)
@@ -1716,17 +1866,46 @@
There are certain invariants that must always be true:
-3. The parent must either be the root locale or have the same script as the child.
+3. The parent must either be the root locale or have the same script as the child. This rule does not apply to component-specific parents.
4. There must never be cycles, such as: X parent of Y ... parent of X.
5. Following the inheritance path, using parentLocale where available and otherwise truncating the locale, must always lead eventually to the root locale.
-### 4.2 <a name="Inheritance_and_Validity" href="#Inheritance_and_Validity">Inheritance and Validity</a>
+#### <a name="Region_Priority_Inheritance" href="#Region_Priority_Inheritance">Region-Priority Inheritance</a>
+
+Certain data may be more appropriate to store with the region as the primary key instead of language. This is often needed for regional user preferences, such as week info, calendar system, and measurement system. All resources matched by an entry in <a href="tr35-info.md#rgScope">&lt;rgScope&gt;</a> should use this type of inheritance.
+
+The default search chain for region-priority inheritance removes the language subtag before the region subtag, as follows:
+
+```
+en_US_someVariant
+en_US
+US
+001
+```
+
+Equivalently as BCP-47:
+
+```
+en-US-variant
+en-US
+und-US
+und
+```
+
+Before running region-priority inheritance, the locale should be normalized as follows:
+
+1. If the locale contains the `-u-rg` Unicode BCP-47 locale extension, the region subtag should be set to the `-u-rg` region. For example, `en-US-u-rg-gbzzzz` should normalize to `en-GB` when running region-priority inheritance.
+2. If, after performing step 1, the locale is missing the region subtag (`language` or `language_script`), the region subtag should be filled in from likely subtags data. For example, `en` should become `en-US` before running region-priority inheritance.
+
+Note that region-priority inheritance does not currently make use of parent locales or territory containment, but it may in the future.
+
+### <a name="Inheritance_and_Validity" href="#Inheritance_and_Validity">Inheritance and Validity</a>
The following describes in more detail how to determine the exact inheritance of elements, and the validity of a given element in LDML.
-#### 4.2.1 <a name="Definitions" href="#Definitions">Definitions</a>
+#### <a name="Definitions" href="#Definitions">Definitions</a>
-_Blocking_ elements are those whose subelements do not inherit from parent locales. For example, a `<collation>` element is a blocking element: everything in a `<collation>` element is treated as a single lump of data, as far as inheritance is concerned. For more information, see [Section 5.5 Valid Attribute Values](#Valid_Attribute_Values).
+_Blocking_ elements are those whose subelements do not inherit from parent locales. For example, a `<collation>` element is a blocking element: everything in a `<collation>` element is treated as a single lump of data, as far as inheritance is concerned. For more information, see [Valid Attribute Values](#Valid_Attribute_Values).
Attributes that serve to distinguish multiple elements at the same level are called _distinguishing_ attributes. For example, the `type` attribute distinguishes different elements in lists of translations, such as:
@@ -1735,7 +1914,7 @@
<language type="ab">Abkhazian</language>
```
-Distinguishing attributes affect inheritance; two elements with different distinguishing attributes are treated as different for purposes of inheritance. For more information, see [Section 5.5 Valid Attribute Values](#Valid_Attribute_Values). Other attributes are called value attributes. Value attributes do not affect inheritance, and elements with value attributes may not have child elements (see [XML Format](#XML_Format)).
+Distinguishing attributes affect inheritance; two elements with different distinguishing attributes are treated as different for purposes of inheritance. For more information, see [Valid Attribute Values](#Valid_Attribute_Values). Other attributes are called value attributes. Value attributes do not affect inheritance, and elements with value attributes may not have child elements (see [XML Format](#XML_Format)).
Non-distinguishing attributes are identified by [DTD Annotations](#DTD_Annotations) such as `@VALUE`.
@@ -1789,7 +1968,7 @@
> <root, de, de_DE, de_DE_xxx>
-#### 4.2.2 <a name="Resolved_Data_File" href="#Resolved_Data_File">Resolved Data File</a>
+#### <a name="Resolved_Data_File" href="#Resolved_Data_File">Resolved Data File</a>
To produce fully resolved locale data file from CLDR for a locale ID L, you start with L, and successively add unique items from the parent locales until you get up to root. More formally, this can be expressed as the following procedure.
@@ -1798,7 +1977,7 @@
1. Let Temp be a copy of the pairs in the LDML file for Li
2. Replace each alias in Temp by the resolved list of pairs it points to.
1. The resolved list of pairs is obtained by recursively applying this procedure.
- 2. That alias now blocks any inheritance from the parent. (See _[Section 5.1 Common Elements](#Common_Elements)_ for an example.)
+ 2. That alias now blocks any inheritance from the parent. (See _[Common Elements](#Common_Elements)_ for an example.)
3. For each element pair P in Temp:
1. If P does not contain a blocking element, and Result does not have an element pair Q with an equivalent element chain, add P to Result.
@@ -1808,7 +1987,7 @@
* The identity element and its children are unaffected by resolution.
* The LDML data must be constructed so as to avoid circularity in step 2.2.
-#### 4.2.3 <a name="Valid_Data" href="#Valid_Data">Valid Data</a>
+#### <a name="Valid_Data" href="#Valid_Data">Valid Data</a>
The attribute `draft="x"` in LDML means that the data has not been approved by the subcommittee. (For more information, see [Process](https://cldr.unicode.org/index/process)). However, some data that is not explicitly marked as `draft` may be implicitly `draft`, either because it inherits it from a parent, or from an enclosing element.
@@ -1841,11 +2020,11 @@
</ldml>
```
-However, normally the draft attributes should be canonicalized, which means they are pushed down to leaf nodes as described in _[Section 5.6 Canonical Form](#Canonical_Form)_. If an LDML file does have draft attributes that are not on leaf nodes, the file should be interpreted as if it were the canonicalized version of that file.
+However, normally the draft attributes should be canonicalized, which means they are pushed down to leaf nodes as described in _[Canonical Form](#Canonical_Form)_. If an LDML file does have draft attributes that are not on leaf nodes, the file should be interpreted as if it were the canonicalized version of that file.
More formally, here is how to determine whether data for an element chain E is implicitly or explicitly draft, given a locale L. Sections 1, 2, and 4 are simply formalizations of what is in LDML already. Item 3 adds the new element.
-#### 4.2.4 <a name="Checking_for_Draft_Status" href="#Checking_for_Draft_Status">Checking for Draft Status</a>
+#### <a name="Checking_for_Draft_Status" href="#Checking_for_Draft_Status">Checking for Draft Status</a>
1. **Parent Locale Inheritance**
1. Walk through the locale chain until you find a locale ID L' with a data file D. (L' may equal L).
@@ -1867,7 +2046,7 @@
The `validSubLocales` in the most specific (farthest from root file) locale file "wins" through the full resolution step (data from more specific files replacing data from less specific ones).
-#### 4.2.5 <a name="Keyword_and_Default_Resolution" href="#Keyword_and_Default_Resolution">Keyword and Default Resolution</a>
+#### <a name="Keyword_and_Default_Resolution" href="#Keyword_and_Default_Resolution">Keyword and Default Resolution</a>
When accessing data based on keywords, the following process is used. Consider the following example:
@@ -1944,7 +2123,7 @@
For identifiers, such as language codes, script codes, region codes, variant codes, types, keywords, currency symbols or currency display names, the default value is the identifier itself whenever no value is found in the root. Thus if there is no display name for the region code 'QA' in root, then the display name is simply 'QA'.
-#### 4.2.6 <a name="Inheritance_vs_Related" href="#Inheritance_vs_Related">Inheritance vs Related Information</a>
+#### <a name="Inheritance_vs_Related" href="#Inheritance_vs_Related">Inheritance vs Related Information</a>
There are related types of data and processing that are easy to confuse:
@@ -1990,7 +2169,7 @@
</tbody></table>
-### 4.3 <a name="Likely_Subtags" href="#Likely_Subtags">Likely Subtags</a>
+### <a name="Likely_Subtags" href="#Likely_Subtags">Likely Subtags</a>
```xml
<!ELEMENT likelySubtag EMPTY >
@@ -2004,7 +2183,7 @@
The _likelySubtag_ supplemental data provides default information for computing these values. This data is based on the default content data, the population data, and the suppress-script data in [[BCP47](#BCP47)]. It is heuristically derived, and may change over time.
-For the relationship between Inheritance, DefaultContent, LikelySubtags, and LocaleMatching, see **_Section 4.2.6 [Inheritance vs Related Information](tr35.md#Inheritance_vs_Related)_**.
+For the relationship between Inheritance, DefaultContent, LikelySubtags, and LocaleMatching, see **_[Inheritance vs Related Information](tr35.md#Inheritance_vs_Related)_**.
To look up data in the table, see if a locale matches one of the `from` attribute values. If so, fetch the corresponding `to` attribute value. For example, the Chinese data looks like the following:
@@ -2020,8 +2199,7 @@
So looking up "zh_TW" returns "zh_Hant_TW", while looking up "zh" returns "zh_Hans_CN".
In more detail, the data is designed to be used in the following operations.
-
-Note that as of CLDR v24, any field present in the 'from' field is also present in the 'to' field, so an input field will not change in "Add Likely Subtags" operation. The data and operations can also be used with language tags using [[BCP47](#BCP47)] syntax, with the appropriate changes. In addition, certain common 'denormalized' language subtags such as 'iw' (for 'he') may occur in both the 'from' and 'to' fields. This allows for implementations that use those denormalized subtags to use the data with only minor changes to the operations.
+Like other CLDR operations, these operations can also be used with language tags having [[BCP47](#BCP47)] syntax, with the appropriate changes to the data.
An implementation may choose to exclude language tags with the language subtag "und" from the following operation. In such a case, only the canonicalization is done. An implementation can declare that it is doing the exclusion, or can take a parameter that controls whether or not to do it.
@@ -2036,20 +2214,25 @@
2. Replace any deprecated subtags with their canonical values using the `<alias>` data in supplemental metadata. Use the first value in the replacement list, if it exists. Language tag replacements may have multiple parts, such as "sh" ➞ "sr_Latn" or "mo" ➞ "ro_MD". In such a case, the original script and/or region are retained if there is one. Thus "sh_Arab_AQ" ➞ "sr_Arab_AQ", not "sr_Latn_AQ".
3. If the tag is a legacy language tag (marked as “Type: grandfathered” in BCP 47; see `<variable id="$grandfathered" type="choice">` in the supplemental data), then return it.
4. Remove the script code 'Zzzz' and the region code 'ZZ' if they occur.
- 5. Get the components of the cleaned-up source tag _(languages, scripts,_ and _regions_), plus any variants and extensions.
+ 5. Get the components of the cleaned-up source tag _(language<sub>s</sub>, script<sub>s</sub>,_ and _region<sub>s</sub>_), plus any variants and extensions.
+ 6. If the language is not 'und' and the other two components are not empty, return the language tag composed of _language<sub>s</sub>\_script<sub>s</sub>\_region<sub>s</sub>_ + variants + extensions.
2. **Lookup.** Look up each of the following in order, and stop on the first match:
- 1. _languages_scripts_regions_
- 2. _languages_regions_
- 3. _languages_scripts_
- 4. __languages__
- 5. und\__scripts_
+ 1. _language<sub>s</sub>\_script<sub>s</sub>\_region<sub>s</sub>_
+ 2. _language<sub>s</sub>\_script<sub>s</sub>_
+ 3. _language<sub>s</sub>\_region<sub>s</sub>_
+ 4. _language<sub>s</sub>_
3. **Return**
- 1. If there is no match, either return
- 1. an error value, or
- 2. the match for "und" (in APIs where a valid language tag is required).
- 2. Otherwise there is a match = _languagem_scriptm_regionm_
- 3. Let xr = xs if xs is not empty, and xm otherwise.
- 4. Return the language tag composed of _languager _ scriptr _ regionr_ + variants + extensions .
+ 1. If there is no match, signal an error and stop.
+ 2. Otherwise there is a match = _language<sub>m</sub>\_script<sub>m</sub>\_region<sub>m</sub>_
+ 3. Let x<sub>r</sub> = x<sub>s</sub> if x<sub>s</sub> is neither empty nor a macroregion, and x<sub>m</sub> otherwise.
+ 4. Return the language tag composed of _language<sub>r</sub>\_script<sub>r</sub>\_region<sub>r</sub>_ + variants + extensions.
+
+Signalling an error can be done in various ways, depending on the most consistent approach for APIs in the module. For example:
+ 1. raise an exception
+ 2. return an error value (such as null)
+ 3. return the input (with missing fields)
+ 4. return the input, but with "Zzzz" and/or "ZZ" substituted for empty fields
+ 5. return "und"
The lookup can be optimized. For example, if any of the tags in Step 2 are the same as previous ones in that list, they do not need to be tested.
@@ -2062,34 +2245,52 @@
To find the most likely language for a country, or language for a script, use "und" as the language subtag. For example, looking up "und_TW" returns zh_Hant_TW.
-A goal of the algorithm is that if X ⇒ Y, and X' results from replacing an empty subtag in X by the corresponding subtag in Y, then X' ⇒ Y. For example, if und_AF ⇒ fa_Arab_AF, then:
+A general goal of the algorithm is that a non-empty field present in the 'from' field is also present in the 'to' field, so a non-empty input field will not change in the "Add Likely Subtags" operation.
+That is, when X ⇒ Y, and X' results from replacing an empty subtag in X by the corresponding subtag in Y, then X' ⇒ Y.
+For example, if und_AF ⇒ fa_Arab_AF, then:
* fa_Arab_AF ⇒ fa_Arab_AF
* und_Arab_AF ⇒ fa_Arab_AF
* fa_AF ⇒ fa_Arab_AF
-There are a small number of exceptions to this goal in the current data, where X ∈ {und_Bopo, und_Brai, und_Cakm, und_Limb, und_Shaw}.
+There are a few exceptions to this goal:
+* A 'denormalized' subtag changes to the normalized form, except for certain denormalized language subtags such as 'iw' (for 'he' = Hebrew) which may occur in both the 'from' and 'to' fields of the data.
+This allows for implementations that use those denormalized subtags to use the data with only minor changes to the operations.
+* A macroregion (such as West Africa = 011) _may_ change to a specific country (Nigeria = NG).
**_Remove_** _**Likely Subtags:** Given a locale, remove any fields that Add Likely Subtags would add._
-The reverse operation removes fields that would be added by the first operation.
+The reverse operation removes fields that could be added by the first operation.
-1. First get max = AddLikelySubtags(inputLocale). If an error is signaled, return it.
-2. Remove the variants from max.
-3. Get the components of the max (_languagemax_, _scriptmax_, _regionmax_).
-4. Then for _trial_ in {_languagemax_, _languagemax_regionmax_, _languagemax_scriptmax_}
- * If AddLikelySubtags(_trial_) = max, then return _trial_ + variants.
-5. If you do not get a match, return max + variants.
+1. First get max = AddLikelySubtags(inputLocale).
+2. If an error is signaled in AddLikelySubtags, signal that same error and stop.
+3. Remove the variants and extensions from max.
+4. Get the components of the max (_language<sub>max</sub>_, _script<sub>max</sub>_, _region<sub>max</sub>_).
+5. Then for _trial_ in {_language<sub>max</sub>_, _language<sub>max</sub>\_region<sub>max</sub>_, _language<sub>max</sub>\_script<sub>max</sub>_}
+ * If AddLikelySubtags(_trial_) = max, then return _trial_ + variants + extensions.
+6. If there is no match, return max + variants + extensions.
Example:
-* Input is zh_Hant. Maximize to get zh_Hant_TW.
+* Input is zh_Hant or zh_TW.
+* Maximize to get zh_Hant_TW.
* zh => zh_Hans_CN. No match, so continue.
-* zh_TW => zh_Hant_TW. Matches, so return zh_TW.
+* zh_TW => zh_Hant_TW. Matches, so return **zh_TW**.
-A variant of this favors the script over the region, thus using {language, language_script, language_region} in the above. If that variant is used, then the result in this example would be zh_Hant instead of zh_TW.
+**_Remove_** _**Likely Subtags, favoring script:** Given a locale, remove any fields that Add Likely Subtags would add, but favor script over region._
-### 4.4 <a name="LanguageMatching" href="#LanguageMatching">Language Matching</a>
+A variant of this favors the script over the region, thus using {language, language_script, language_region} in step 5 above.
+This variant is much less commonly used, and only when the script relationship is more significant to users.
+Here is the difference:
+
+Example:
+
+* Input is zh_Hant or zh_TW.
+* Maximize to get zh_Hant_TW.
+* zh => zh_Hans_CN. No match, so continue.
+* zh_Hant => zh_Hant_TW. Matches, so return **zh_Hant**.
+
+### <a name="LanguageMatching" href="#LanguageMatching">Language Matching</a>
```xml
<!ELEMENT languageMatching ( languageMatches* ) >
@@ -2128,7 +2329,7 @@
Language Matching can also be used to get fallback data elements. In many cases, there may not be full data for a particular locale. For example, for a Breton speaker, the best fallback if data is unavailable might be French. That is, suppose we have found a Breton bundle, but it does not contain translation for the key "CN" (for the country China). It is best to return "chine", rather than falling back to the value default language such as Russian and getting "Китай". The language matching data can be used to get the closest fallback locales (of those supported) to a given language.
-For the relationship between Inheritance, DefaultContent, LikelySubtags, and LocaleMatching, see **_Section 4.2.6 [Inheritance vs Related Information](tr35.md#Inheritance_vs_Related)_**.
+For the relationship between Inheritance, DefaultContent, LikelySubtags, and LocaleMatching, see **_[Inheritance vs Related Information](tr35.md#Inheritance_vs_Related)_**.
When such fallback is used for inherited item lookup, the normal order of inheritance is used for inherited item lookup, except that before using any data from **root**, the data for the fallback locales would be used if available. Language matching does not interact with the fallback of resources _within the locale-parent chain_. For example, suppose that we are looking for the value for a particular path **P** in **nb-NO**. In the absence of aliases, normally the following lookup is used.
@@ -2175,7 +2376,7 @@
To find the matching distance MD between any two languages, perform the following steps.
-1. Maximize each language using Section 4.3 [Likely Subtags](#Likely_Subtags).
+1. Maximize each language using [Likely Subtags](#Likely_Subtags).
* und is a special case: see below.
2. Set the match-distance MD to 0
3. For each subtag in {language, script, region}
@@ -2183,7 +2384,7 @@
2. Traverse the languageMatching data until a match is found.
* \* matches any field.
* If the oneway flag is false, then the match is symmetric; otherwise only match one direction.
- * For region matching, use the mechanisms in **Section 4.4.1 [Enhanced Language Matching](#EnhancedLanguageMatching)**.
+ * For region matching, use the mechanisms in **[Enhanced Language Matching](#EnhancedLanguageMatching)**.
3. Add the `distance` attribute value to MD.
* This used to be a `percent` attribute value, which was 100 - the `distance` attribute value.
4. Remove the subtag from each (logically)
@@ -2231,7 +2432,7 @@
When the language+region is not matched, and there is otherwise no reason to pick among the supported regions for that language, then some measure of geographic "closeness" can be used. The results may be more understandable by users. Looking for en-SK, for example, should fall back to something within Europe (eg en-GB) in preference to something far away and unrelated (eg en-SG). Such a closeness metric does not need to be exact; a small amount of data can be used to give an approximate distance between any two regions. However, any such data must be used carefully; although Hong Kong is closer to India than to the UK, it is unlikely that en-IN would be a better match to en-HK than en-GB would.
-#### 4.4.1 <a name="EnhancedLanguageMatching" href="#EnhancedLanguageMatching">Enhanced Language Matching</a>
+#### <a name="EnhancedLanguageMatching" href="#EnhancedLanguageMatching">Enhanced Language Matching</a>
The enhanced format for language matching adds structure to enable better matching of languages. It is distinguished by having a suffix "\_new" on the type, as in the example below. The extended structure allows matching to take into account broad similarities that would give better results. For example, for English the regions that are or inherit from US (AS|GU|MH|MP|PR|UM|VI|US) form a “cluster”. Each region in that cluster should be closer to each other than to any other region. And a region outside the cluster should be closer to another region outside that cluster than to one inside. We get this issue with the “world languages” like English, Spanish, Portuguese, Arabic, etc.
@@ -2267,7 +2468,7 @@
-## 5 <a name="XML_Format" href="#XML_Format">XML Format</a>
+## <a name="XML_Format" href="#XML_Format">XML Format</a>
There are two kinds of data that can be expressed in LDML: language-dependent data and supplementary data. In either case, data can be split across multiple files, which can be in multiple directory trees.
@@ -2355,7 +2556,7 @@
</unit>
```
-Rule elements do not have these restrictions, but also do not inherit, except as an entire block. Items which are ordered have the DTD Annotation `@ORDERED`. See [_DTD Annotations_](#DTD_Annotations) and _[Section 4.2 Inheritance and Validity](#Inheritance_and_Validity)_. For more technical details, see [Updating-DTDs](https://cldr.unicode.org/development/updating-dtds).
+Rule elements do not have these restrictions, but also do not inherit, except as an entire block. Items which are ordered have the DTD Annotation `@ORDERED`. See [_DTD Annotations_](#DTD_Annotations) and _[Inheritance and Validity](#Inheritance_and_Validity)_. For more technical details, see [Updating-DTDs](https://cldr.unicode.org/development/updating-dtds).
Note that the data in examples given below is purely illustrative, and does not match any particular language. For a more detailed example of this format, see [[Example](#LDML)]. There is also a DTD for this format, but _remember that the DTD alone is not sufficient to understand the semantics, the constraints, nor the interrelationships between the different elements and attributes_. You may wish to have copies of each of these to hand as you proceed through the rest of this document.
@@ -2377,11 +2578,11 @@
* preferenceOrdering
* references
-### 5.1 <a name="Common_Elements" href="#Common_Elements">Common Elements</a>
+### <a name="Common_Elements" href="#Common_Elements">Common Elements</a>
At any level in any element, two special elements are allowed.
-#### 5.1.1 <a name="special" href="#special">Element special</a>
+#### <a name="special" href="#special">Element special</a>
This element is designed to allow for arbitrary additional annotation and data that is product-specific. It has one required attribute `xmlns`, which specifies the XML [namespace](https://www.w3.org/TR/REC-xml-names/) of the special data. For example, the following used the version 1.0 POSIX special element.
@@ -2404,7 +2605,7 @@
</ldml>
```
-##### 5.1.1.1 <a name="Sample_Special_Elements" href="#Sample_Special_Elements">Sample Special Elements</a>
+##### <a name="Sample_Special_Elements" href="#Sample_Special_Elements">Sample Special Elements</a>
The elements in this section are _**not**_ part of the Locale Data Markup Language 1.0 specification. Instead, they are special elements used for application-specific data to be stored in the Common Locale Repository. They may change or be removed in future versions of this document, and are present here more as examples of how to extend the format. (Some of these items may move into a future version of the Locale Data Markup Language specification.)
@@ -2449,7 +2650,7 @@
</special>
```
-#### 5.1.2 <a name="Alias_Elements" href="#Alias_Elements">Element alias</a>
+#### <a name="Alias_Elements" href="#Alias_Elements">Element alias</a>
```xml
<!ELEMENT alias (special*) >
@@ -2482,7 +2683,7 @@
<alias source="locale" path="../monthWidth[@type='wide']"/>
```
-The default value if the path is not present is the same position in the tree. All of the attributes in the [[XPath](#XPath)] must be _distinguishing_ elements. For more details, see [Section 4.2 Inheritance and Validity](#Inheritance_and_Validity).
+The default value if the path is not present is the same position in the tree. All of the attributes in the [[XPath](#XPath)] must be _distinguishing_ elements. For more details, see [Inheritance and Validity](#Inheritance_and_Validity).
There is a special value for the source attribute, the constant `source="locale"`. This special value is equivalent to the locale being resolved. For example, consider the following example, where locale data for 'de' is being resolved:
@@ -2563,7 +2764,7 @@
The first row shows the inheritance within the `<x>` element, whereby `<c>` is inherited from root. The second shows the inheritance within the `<y>` element, whereby `<a>`, `<c>`, and `<d>` are inherited also from root, but from an alias there. The alias in root is logically replaced not by the elements in root itself, but by elements in the 'target' locale.
-For more details on data resolution, see [Section 4.2 Inheritance and Validity](#Inheritance_and_Validity).
+For more details on data resolution, see [Inheritance and Validity](#Inheritance_and_Validity).
Aliases must be resolved recursively. An alias may point to another path that results in another alias being found, and so on. For example, looking up Thai buddhist abbreviated months for the locale **xx-YY** may result in the following chain of aliases being followed:
@@ -2582,7 +2783,7 @@
It is an error to have a circular chain of aliases. That is, a collection of LDML XML documents must not have situations where a sequence of alias lookups (including inheritance and lateral inheritance) can be followed indefinitely without terminating.
-#### 5.1.3 <a name="Element_displayName" href="#Element_displayName">Element displayName</a>
+#### <a name="Element_displayName" href="#Element_displayName">Element displayName</a>
Many elements can have a display name. This is a translated name that can be presented to users when discussing the particular service. For example, a number format, used to format numbers using the conventions of that locale, can have translated name for presentation in GUIs.
@@ -2595,15 +2796,15 @@
Where present, the display names must be unique; that is, two distinct codes would not get the same display name. (There is one exception to this: in time zones, where parsing results would give the same GMT offset, the standard and daylight display names can be the same across different time zone IDs.) Any translations should follow customary practice for the locale in question. For more information, see [[Data Formats](#DataFormats)].
-#### 5.1.4 <a name="Escaping_Characters" href="#Escaping_Characters">Escaping Characters</a>
+#### <a name="Escaping_Characters" href="#Escaping_Characters">Escaping Characters</a>
Unfortunately, XML does not have the capability to contain all Unicode code points. Due to this, in certain instances extra syntax is required to represent those code points that cannot be otherwise represented in element content. The escaping syntax is only defined on a few types of elements, such as in collation or exemplar sets, and uses the appropriate syntax for that type.
The element `<cp>`, which was formerly used for this purpose, has been deprecated.
-### 5.2 <a name="Common_Attributes" href="#Common_Attributes">Common Attributes</a>
+### <a name="Common_Attributes" href="#Common_Attributes">Common Attributes</a>
-#### 5.2.1 <a name="Attribute_type" href="#Attribute_type">Attribute type</a>
+#### <a name="Attribute_type" href="#Attribute_type">Attribute type</a>
The attribute `type` is also used to indicate an alternate resource that can be selected with a matching `type=option` in the locale id modifiers, or be referenced by a default element. For example:
@@ -2617,7 +2818,7 @@
</ldml>
```
-#### 5.2.2 <a name="Attribute_draft" href="#Attribute_draft">Attribute draft</a>
+#### <a name="Attribute_draft" href="#Attribute_draft">Attribute draft</a>
If this attribute is present, it indicates the status of all the data in this element and any subelements (unless they have a contrary `draft` value), as per the following:
@@ -2628,9 +2829,9 @@
For more information on precisely how these values are computed for any given release, see [Data Submission and Vetting Process](https://cldr.unicode.org/index/process#h.krygv7y7jkk9) on the CLDR website.
-The `draft` attribute should only occur on "leaf" elements, and is deprecated elsewhere. For a more formal description of how elements are inherited, and what their draft status is, see _[Section 4.2 Inheritance and Validity](#Inheritance_and_Validity)_.
+The `draft` attribute should only occur on "leaf" elements, and is deprecated elsewhere. For a more formal description of how elements are inherited, and what their draft status is, see _[Inheritance and Validity](#Inheritance_and_Validity)_.
-#### 5.2.3 <a name="alt_attribute" href="#alt_attribute">Attribute alt</a>
+#### <a name="alt_attribute" href="#alt_attribute">Attribute alt</a>
This attribute labels an alternative value for an element. The value is a _descriptor_ that indicates what kind of alternative it is, and takes one of the following
@@ -2657,9 +2858,9 @@
The values for _variantname_ at this time include "variant", "list", "email", "www", "short", and "secondary".
-For a more complete description of how draft applies to data, see _[Section 4.2 Inheritance and Validity](#Inheritance_and_Validity)_.
+For a more complete description of how draft applies to data, see _[Inheritance and Validity](#Inheritance_and_Validity)_.
-#### 5.2.4 <a name="references_attribute" href="#references_attribute">Attribute references</a>
+#### <a name="references_attribute" href="#references_attribute">Attribute references</a>
The value of this attribute is a token representing a reference for the information in the element, including standards that it may conform to. `<references>`. (In older versions of CLDR, the value of the attribute was freeform text. That format is deprecated.)
@@ -2677,9 +2878,9 @@
This attribute was originally intended for use in marking display names whose capitalization differed from what was indicated by the now-deprecated `<inText>` element (perhaps, for example, because the names included a proper noun). It was never supported in the dtd and is not needed for use with the new `<contextTransforms>` element.
-### 5.3 <a name="Common_Structures" href="#Common_Structures">Common Structures</a>
+### <a name="Common_Structures" href="#Common_Structures">Common Structures</a>
-#### 5.3.1 <a name="Date_Ranges" href="#Date_Ranges">Date and Date Ranges</a>
+#### <a name="Date_Ranges" href="#Date_Ranges">Date and Date Ranges</a>
When attribute specify date ranges, it is usually done with attributes `from` and `to`. The `from` attribute specifies the starting point, and the `to` attribute specifies the end point. The deprecated `time` attribute was formerly used to specify time with the deprecated `weekEndStart` and `weekEndEnd` elements, which were themselves inherently `from` or `to`.
@@ -2702,7 +2903,7 @@
The dates and times are specified in local time, unless otherwise noted. (In particular, the metazone values are in UTC (also known as GMT).
-#### 5.3.2 <a name="Text_Directionality" href="#Text_Directionality">Text Directionality</a>
+#### <a name="Text_Directionality" href="#Text_Directionality">Text Directionality</a>
The content of certain elements, such as date or number formats, may consist of several sub-elements with an inherent order (for example, the year, month, and day for dates). In some cases, the order of these sub-elements may be changed depending on the bidirectional context in which the element is embedded.
@@ -2710,40 +2911,56 @@
Element content whose display may be affected in this way should include an explicit direction mark, such as U+200E LEFT-TO-RIGHT MARK or U+200F RIGHT-TO-LEFT MARK, at the beginning or end of the element content, or both.
-#### 5.3.3 <a name="Unicode_Sets" href="#Unicode_Sets">Unicode Sets</a>
+#### <a name="Unicode_Sets" href="#Unicode_Sets">Unicode Sets</a>
-Some attribute values or element contents use _UnicodeSet_ notation. A UnicodeSet represents a finite set of Unicode code points and strings, and is defined by lists of code points and strings, Unicode property sets, and set operators, all bounded by square brackets. In this context, a code point means a string consisting of exactly one code point.
+Some attribute values or element contents use _UnicodeSet_ notation.
+A UnicodeSet represents a finite set of Unicode code points and strings, and is defined by lists of code points and strings, Unicode property sets, and set operators, with square brackets for groupings.
+In this context, a code point means a string consisting of exactly one code point.
-A UnicodeSet implements the semantics in _UTS #18: Unicode Regular Expressions_ [[UTS18](https://www.unicode.org/reports/tr41/#UTS18)] Levels 1 & 2 that are relevant to determining sets of characters. Note however that it may deviate from the syntax provided in [[UTS18](https://www.unicode.org/reports/tr41/#UTS18)], which is illustrative rather than a requirement. There is one exception to the supported semantics, Section [RL2.6](https://www.unicode.org/reports/tr18/#RL2.6) _Wildcards in Property Values_. That feature can be supported in clients such as ICU by implementing a “hook” as is done in the [online UnicodeSet utilities](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7Bname%3D%2FAPPLE%2F%7D).
+A UnicodeSet implements the semantics in _UTS #18: Unicode Regular Expressions_ [[UTS18](https://www.unicode.org/reports/tr41/#UTS18)] Levels 1 & 2 that are relevant to determining sets of characters.
+Note however that it may deviate from the syntax provided in [[UTS18](https://www.unicode.org/reports/tr41/#UTS18)].
+In particular, Section [RL2.6](https://www.unicode.org/reports/tr18/#RL2.6) _Wildcards in Property Values_ is not supported.
+However, that feature can be supported in clients such as ICU by implementing a “hook” as is done in the [online UnicodeSet utilities](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7Bname%3D%2FAPPLE%2F%7D).
-A UnicodeSet may be cited in specifications outside of the domain of LDML. In such a case, the specification may specify a subset of the syntax provided here.
+A UnicodeSet may be cited in specifications outside of the domain of LDML.
+In such a case, that specification may specify a subset or superset of the syntax provided here.
-The following provides EBNF syntax for a UnicodeSet:
+##### UnicodeSet syntax
| Symbol | Expression | Examples |
| -------------- | -------------------------------------------------------------- | --------------------------------------- |
-| `root` | <pre>= prop<br/>\| '[-]'<br/>\| '[' [\\-\\^]? s seq+ ']'</pre> | \\p{x=y},<br/>[abc] |
-| `seq` | <pre>= root (s [\\&\\-] s root)* s<br/>\| range s</pre> | [abc]-[cde], a |
-| `range` | <pre>= char ('-' char)?<br/>\| '{' (s char)+ s '}'</pre> | a, a-c, \{abc} |
-| `prop` | <pre>= '\\' [pP] '{' propName ([≠=] s value1+)? '}'<br/>\| '[:' '^'? propName ([≠=] s value2+)? ':]'</pre> | \\p\{x=y}, [:x=y:]<br/> |
-| `propName` | <pre>= s [A-Za-z0-9] [A-Za-z0-9_\\x20]* s</pre> | General_Category,<br/>General Category |
-| `value1` | <pre>= [^\\}]<br/>\| '\\' quoted</pre> | Lm,<br/>\\n,<br/>\\} |
-| `value2` | <pre>= [^:]<br/>\| '\\' quoted</pre> | Lm,<br/>\\n,<br/>\\: |
-| `char` | <pre>= [^\\& \\- \\[ \\[ \\] \\\\ \\} \\{ [:Pat_WS:]]<br/>\| '\\' quoted</pre> | a, b, c, \\n |
-| `quoted` | <pre>= 'u' (hex{4} \| bracketedHex)<br/>\| 'x' (hex{2} \| bracketedHex)<br/>\| 'U00' ('0' hex{5} \| '10' hex{4})<br/>\| 'N{' propName '}'<br/>\| [[\u0000-\U00010FFFF]-[uxUN]]</pre> | _**error** if lengths not exact_ |
-| `charName` | <pre>= s [A-Za-z0-9] [-A-Za-z0-9_\x20]* s</pre> | TIBETAN LETTER -A |
-| `bracketedHex` | <pre>= '{' s hexCodePoint (s hexCodePoint)* s '}'</pre> | \{61 2019 62} |
-| `hexCodePoint` | <pre>= hex{1,5} \| '10' hex{4}</pre> | |
-| `hex` | <pre>= [0-9A-Fa-f]</pre> | |
-| `s` | <pre>= [:Pattern_White_Space:]*</pre> | optional whitespace |
+| `unicodeSet` | <pre>= prop<br/>\| '\[' '^'? s '-'? s seq\* \[\\$ \\-\]? s '\]' <br/>\| var</pre> | \\p\{x=y\},<br/>[abc],<br/>$myset |
+| `seq` | <pre>= unicodeSet \(s \[\\&\\\-\] s unicodeSet\)\* s<br/>\| range s</pre> | \[abc\]\-\[cde\], a |
+| `range` | <pre>= element \('\-' element\)?</pre> | a, a\-c, \{abc\}, a\-\{z\} <br/> _note: in ranges, elements must resolve to exactly one code point._ |
+| `element` | <pre>= char \| string \| var </pre> | %, b, \{hello\}, \{\}, \\x\{61 62\} |
+| `prop` | <pre>= '\\' \[pP\] '\{' propName \(\[≠=\] s pValuePerl\+\)? '\}'<br/>\| '\[:' '^'? propName \(\[≠=\] s pValuePosix\+\)? ':\]'</pre> | \\p\{x=y\}, \[:x=y:\]<br/> |
+| `propName` | <pre>= s \[A\-Za\-z0\-9\] \[A\-Za\-z0\-9\_\\x20\]\* s</pre> | General\_Category,<br/>General Category |
+| `pValuePerl` | <pre>= \[^\\\}\]<br/>\| '\\' quoted</pre> | Lm,<br/>\\n,<br/>\\\} |
+| `pValuePosix` | <pre>= \[^:\]<br/>\| '\\' quoted</pre> | Lm,<br/>\\n,<br/>\\: |
+| `string` | <pre>= '\{' \(s charInString\)\* s '\}' </pre> | \{hello\} |
+| `char` | <pre>= \[^ \\^ \\& \\\- \\\[ \\\] \\\\ \\\{ \\$ \[:Pat_WS:\]\]<br/>\| '\\' quoted</pre> | a, b, c, \\n, \\\{, \\$ |
+| `charInString` | <pre>= \[^ \\\\ \\\} \[:Pat_WS:\]\]<br/>\| '\\' quoted</pre> | a, b, c, \\n, \{, $ |
+| `quoted` | <pre>= 'u' \(hex\{4\} \| bracketedHex\)<br/>\| 'x' \(hex\{2\} \| bracketedHex\)<br/>\| 'U00' \('0' hex\{5\} \| '10' hex\{4\}\)<br/>\| 'N\{' charName '\}'<br/>\| \[\[\\u0000\-\\U0010FFFF\]\-\[uxUN\]\]</pre> | n, U0000FFFE, \{, $, \] <br/> _note: lengths are exact_ |
+| `charName` | <pre>= s \[A\-Za\-z0\-9\] \[\-A\-Za\-z0\-9\_\\x20\]\* s</pre> | TIBETAN LETTER \-A |
+| `bracketedHex` | <pre>= '\{' s hexCodePoint \(sRequired hexCodePoint\)\* s '\}'</pre> | \{61 2019 62\}, \{61\} |
+| `hexCodePoint` | <pre>= hex\{1,5\} \| '10' hex\{4\}</pre> | |
+| `hex` | <pre>= \[0\-9A\-Fa\-f\]</pre> | |
+| `var` | <pre>= '$' \[:XID_Start:\] \[:XID_Continue:\]\*</pre> | $a, $elt5 (optional support) |
+| `s` | <pre>= \[:Pattern_White_Space:\]\*</pre> | optional whitespace |
+| `sRequired` | <pre>= \[:Pattern_White_Space:\]\+</pre> | required whitespace |
-Some constraints on UnicodeSet syntax are not captured by this EBNF. Notably, property names and values are restricted to those supported by the implementation, and have additional constraints imposed by [[UAX44](https://www.unicode.org/reports/tr41/#UAX44)]. In addition, quoted values that resolve to more than one code point are disallowed in ranges of the form `char '-' char`.
+Some constraints on UnicodeSet syntax are not captured by this EBNF.
+Notably:
+1. Property names and values are restricted to those supported by the implementation, and have additional constraints imposed by [[UAX44](https://www.unicode.org/reports/tr41/#UAX44)].
+2. Escapes that use multiple code points are equivalent to their flattened representation, i.e., `\x{61 62}` is equivalent to `\x{61}\x{62}`. These can also occur in strings, so **\[\{\\x\{ 061 62 0063\}\}\]** is equivalent to **\[\{abc\}\]**.
+3. Ranges (**X**-**Y**) are only supported in the case that elements **X** and **Y** resolve to single code points. That is, **\[a-b\]** and **\[\{a\}-\{b\}\]** are supported (because a string consisting of a single code point is equivalent to that code point), while **\[a-\{bz\}\]** and **\[\{ax\}-\{bz\}\]** are not.
+4. If **\[…\]** starts with \[:, then it begins a prop, and must also terminate with :\]. Thus **\[:di:\]** is a valid property expression, **\[di:\]** is a 3 code-point set, and **\[:di\]** raises an error. Whitespace is significant when initiating/terminating a POSIX property expression, so **\[ :\]** is syntactically valid and equivalent to **\[\\:\]**.
The syntax characters are listed in the table below:
| Char | Hex | Name | Usage |
| ---- | ------ | -------------------- | ------------------------------------------ |
-| $ | U+0024 | DOLLAR SIGN | Equivalent of \\uFFFF (This is for implementations that return \\uFFFF when accessing before the first or after the last character) |
+| $ | U+0024 | DOLLAR SIGN | Equivalent to \\uFFFF when followed by '\]', initiator for variable identifiers otherwise |
| & | U+0026 | AMPERSAND | Intersecting UnicodeSets |
| - | U+002D | HYPHEN-MINUS | Ranges of characters; also set difference. |
| : | U+003A | COLON | POSIX-style property syntax |
@@ -2755,17 +2972,55 @@
| } | U+007D | RIGHT CURLY BRACKET | Strings in set; Perl property syntax |
| | U+0020 U+0009..U+000D U+0085<br/>U+200E U+200F<br/>U+2028 U+2029 | ASCII whitespace,<br/>LRM, RLM,<br/>LINE/PARAGRAPH SEPARATOR | Ignored except when escaped |
+Note that some syntax characters only have a special meaning in a certain context. In particular:
+* Out of all above syntax characters, only \\, \}, and whitespace have a special meaning inside strings (**\[\{\[a-z\]\}\]** is the set of the string '\[a-z\]', **\[\{\$blah\}\]** is the set of the string '\$blah').
+* \$ is equivalent to \uFFFF when appearing at the very end of a set with or without trailing whitespace (**[a-z\$]**, **[a-z\$ ]**), and used as starting indicator for a variable reference elsewhere, in which case the variable name will be the longest match on the `var` nonterminal (such as **[\$my_set]**).
+* \- is equivalent to the literal character \\- when occurring at the very beginning of a set, after a \^ at the beginning of a set, or at the very end of a set, in all cases with or without whitespace (**[-abc]**, **[ ^ -abc]**, **[abc-]**), and used as the set difference or range operator elsewhere (**[[abc]-[bc]]**, **[a-z]**)
+* \: initiates a POSIX property set when directly after a \[ without whitespace in between (**[:L:]**), ends a POSIX property set when directly before a \] without whitespace in between (**[:L:]**), and is equivalent to the literal character \\\: in any other place (**[ \:]**, **[L\:]**)
+* \} ends a string when occurring inside a string (**[{hello}]**), and is equivalent to the literal character \\\} in any other place (**[}a]**)
-##### 5.3.3.1 <a name="Lists_of_Code_Points" href="#Lists_of_Code_Points">Lists of Code Points</a>
+###### Syntax Special Case Examples
+The following table gives examples, including common sources of confusion concerning the UnicodeSet syntax:
+| Expression | Contained Elements | Syntax Errors |
+| - | - | - |
+| **\[^a\]** | All Unicode code points except 'a' | **\[ ^a\]**, **\[a^\]** |
+| **\[\\^a\]** | 'a' and '^' | |
+| **\[:L:\]** | All code points with Unicode property 'General_Category' equal to 'Letter' | **\[:L\]**, **\[:\]** |
+| **\[ :\]** | ':' | |
+| **\[L:\]** | 'L' and ':' | |
+| **\[-\]** | '-' | |
+| **\[ - \]** | '-' | |
+| **\[a-\]**, **\[-a\]** | 'a' and '-' | |
+| **\[a -b\]** | All code points between 'a' and 'b' (inclusive) | |
+| **\[\[a-b\] -\[b\]\]**, **\[\[a\]-\[b\]-\[c\]\]** | 'a' | **\[a-b-c\]** |
+| **\[^ - \]** | All Unicode code points except '-' | **\[ ^ - \]** |
+| **\[\$\]**, **\[ \$ \]** | U+FFFF | |
+| **\[\$a\]** | The value of the variable '\$a' | **\[\$ a\]**, **\[\$und\]** |
+| **\[\$a\$\]** | U+FFFF and the value of the variable '\$a' | |
+| **\[a\$\]** | 'a' and U+FFFF | |
+| **\[\}\]** | '\}' | **\[\{\]** |
+| **\[\{\}\]** | the empty string, '' | |
+| **\[\{\}\}\]** | '\}' and the empty string, '' | |
+| **\[\{\{\}\]** | '\{' | |
+| **\[\{\$var\}\]** | the string '\$var' | |
+| **\[\{\[a-z\}\]**, **\[\{ \[ a - z\}\]** | the string '\[a-z' | |
+| **\[\\x\{10FFFF 1\}\]** | U+10FFFF and U+1 | **\[\\x\{10FFFF1\}\]** |
+| **\[\\x\{61\}-d\]** | 'a', 'b', 'c', and 'd' | **\[\\x\{61 63\}-d\]**, **\[\\x\{61 63\}-\\x\{62 64\}\]** |
+
+*Note: the above assumes that variables are supported, \$a is defined as a full UnicodeSet, a string, or a char, and \$und is not defined at all.*
+
+
+
+
+
+##### <a name="Lists_of_Code_Points" href="#Lists_of_Code_Points">Lists of Code Points</a>
Lists are a sequence of strings that may include ranges, which are indicated by a '-' between two code points, as in "a-z". The sequence _start-end_ specifies the range of all code points from the start to end, inclusive, in Unicode order. For example, **[a c d-f m]** is equivalent to **[a c d e f m]**. Whitespace can be freely used for clarity, as **[a c d-f m]** means the same as **[acd-fm]**.
-A string with multiple code points is represented in a list by being surrounded by curly braces, such as in **[a-z \{ch}]**. It can be used with the range notation, as described in _Section [5.3.4 String Range](#String_Range)_ . There is an additional restriction on string ranges in a UnicodeSet: the number of codepoints in the first string of the range must be identical to the number in the second. Thus [\{ab}-\{c}] and [\{ab}-c] are invalid.
+A string with multiple code points is represented in a list by being surrounded by curly braces, such as in **[a-z \{ch}]**. A string can be used with the range notation only if it contains exactly one code point. Thus **\[\{ab\}-\{c\}\]**, **\[\{ax\}-\{bz\}\]**, and **\[\{ab\}-c\]** are invalid. A string consisting of a single code point is equivalent to that code point, that is, **[\{a}-c]** is valid and equivalent to **[a b c]**.
-In UnicodeSets, there are two ways to quote syntax code points:
-
-<a name="Backslash_Escapes"></a>
-Outside of single quotes, certain backslashed code point sequences can be used to quote code points:
+##### <a name="Backslash_Escapes" href="#Backslash_Escapes">Backslash Escapes</a>
+Certain backslashed code point sequences can be used to quote code points:
| Sequence | Code point |
| --------------- | ------------------------------------ |
@@ -2790,7 +3045,7 @@
Unicode property sets are defined as described in _UTS #18: Unicode Regular Expressions_ [[UTS18](https://www.unicode.org/reports/tr41/#UTS18)], Level 1 and RL2.5, including the syntax where given. For an example of a concrete implementation of this, see [[ICUUnicodeSet](#ICUUnicodeSet)].
-##### 5.3.3.2 <a name="Unicode_Properties" href="#Unicode_Properties">Unicode Properties</a>
+##### <a name="Unicode_Properties" href="#Unicode_Properties">Unicode Properties</a>
Briefly, Unicode property sets are specified by any Unicode property and a value of that property, such as **[:General_Category=Letter:]** for Unicode letters or **\\p\{uppercase}** for the set of upper case letters in Unicode. The property names are defined by the PropertyAliases.txt file and the property values by the PropertyValueAliases.txt file. For more information, see [[UAX44](https://www.unicode.org/reports/tr41/#UAX44)]. The syntax for specifying the property sets is an extension of either POSIX or Perl syntax, by the addition of `"=<value>"`. For example, you can match letters by using the POSIX-style syntax:
@@ -2809,7 +3064,7 @@
| POSIX-style Syntax | [:type=value:] | [:^type=value:] |
| Perl-style Syntax | \\p\{type=value} | \\P\{type=value} |
-##### 5.3.3.3 <a name="Boolean_Operations" href="#Boolean_Operations">Boolean Operations</a>
+##### <a name="Boolean_Operations" href="#Boolean_Operations">Boolean Operations</a>
The low-level lists or properties then can be freely combined with the normal set operations (union, inverse, difference, and intersection):
@@ -2823,7 +3078,44 @@
**One caution:** the '&' and '-' operators operate between sets. That is, they must be immediately preceded and immediately followed by a set. For example, the pattern **[[:Lu:]-A]** is illegal, since it is interpreted as the set **[:Lu:]** followed by the incomplete range **-A**. To specify the set of upper case letters except for 'A', enclose the 'A' in brackets: **[[:Lu:]-[A]]**.
-##### 5.3.3.4 <a name="UnicodeSet_Examples" href="#UnicodeSet_Examples">UnicodeSet Examples</a>
+##### <a name="Variables_in_UnicodeSets" href="#Variables_in_UnicodeSets">Variables in UnicodeSets</a>
+
+Support for variable identifiers (var) is optional.
+They are used in certain contexts such as in [Transforms](tr35-general.md#Transforms).
+When they are used, they are defined as follows:
+
+UnicodeSets may contain variables (`$my_char`, `$the_set`, ...) in place of full UnicodeSets and strings/characters. If variable support is enabled, every variable that is used must be defined; how variables are defined is out of scope for UnicodeSets. In particular, referring to undefined variables is an error.
+
+Not all variable maps are valid for a given expression in UnicodeSet syntax.
+For instance, consider **[$a-$b]**; this may be a range of characters if both **$a** and **$b** are characters,
+or a difference of sets if they are both sets; but given the map `{ a => '0', b => [:L:] }`, it is invalid.
+
+**Note:** In particular, the variable map is needed not just to compute the actual set of characters and strings represented by the UnicodeSet,
+but also to parse the UnicodeSet syntax: if **$a** and **$b** were unknown, the parsing of **[$a-$b]** would be ambiguous.
+
+Variables are replaced by value, that is, **[a \$minus z]** with a variable map `{ minus => '-' }` is equivalent to **[-az]**, not **[a-z]** (i.e., cardinality of 3 instead of 26).
+The full `var` nonterminal is replaced, i.e., the variable name together with the prefixed \$.
+
+The variable syntax implements UAX31-R1-2 with XID_Start and XID_Continue. For more information, see [[UAX31](https://www.unicode.org/reports/tr41/#UAX31)].
+Variable names are treated as equivalent normalized identifiers under Normalization Form C, implementing UAX31-R4. Furthermore, variable names are case-sensitive.
+
+
+Notes:
+1. The 'type' of a variable value is not specified syntactically.
+Thus \[\$a\-\$b\] can be resolved whether \$a and \$b are chars/strings (eg, \$a=δ, \$b=θ) or full UnicodeSets (eg, \$a=\\p\{script=greek\}, \$b=\\p\{general_category=letter\}).
+The only restriction is that the result be syntactically valid; thus (\$a=w, \$b=xy) would raise an error.
+2. Variable substitution is currently disallowed inside of property expressions.
+Thus \\p{gc=\$blah} raises an error.
+3. '\$' when followed by '\]' is interpreted as \\uFFFF, and is used to match before the start of a string or after the end.
+Thus \[ab\$\] matches the string "xaby" in the locations (marked with '()'): "()xaby", "x(a)by", "xa(b)y", "xaby()".
+4. If an unescaped '\$' is neither followed by a character of type \[:XID_Start:\] nor a '\]', it is a syntax error.
+
+**Backwards compatibility**: In prior versions of this document, the character \$ was a valid element of the `char` nonterminal with the special meaning of `\uFFFF`.
+In current versions, the \$ character may only appear by itself at the end of a UnicodeSet, e.g., **[a-z\$]**, where it keeps that interpretation.
+In any other location, \$ is only allowed as the prefix for variables.
+The previous behavior of allowing \$ in the `char` nonterminal is considered obsolete and must be avoided by new implementations.
+
+##### <a name="UnicodeSet_Examples" href="#UnicodeSet_Examples">UnicodeSet Examples</a>
The following table summarizes the syntax that can be used.
@@ -2837,11 +3129,10 @@
| [[pat1]-[pat2]] | The asymmetric difference of sets specified by pat1 and pat2 |
| [a \{ab} \{ac}] | The code point 'a' and the multi-code point strings "ab" and "ac" |
| [x\\u\{61 2019 62}y] | Equivalent to [x\\u0061\\u2019\\u0062y] (= [xa’by]) |
-| [\{ax}-\{bz}] | The set containing [\{ax} \{ay} \{az} \{bx} \{by} \{bz}], using the range syntax to get all the strings from \{ax} to \{bz} as described in _Section [5.3.4 String Range](#String_Range)_. |
| [:Lu:] | The set of code points with a given property value, as defined by PropertyValueAliases.txt. In this case, these are the Unicode upper case letters. The long form for this is **[:General_Category=Uppercase_Letter:]**. |
| [:L:] | The set of code points belonging to all Unicode categories starting with 'L', that is, **[[:Lu:][:Ll:][:Lt:][:Lm:][:Lo:]]**. The long form for this is **[:General_Category=Letter:]**. |
-#### 5.3.4 <a name="String_Range" href="#String_Range">String Range</a>
+#### <a name="String_Range" href="#String_Range">String Range</a>
A String Range is a compact format for specifying a list of strings.
@@ -2878,7 +3169,7 @@
<tr><td>👦🏻-🏿</td><td>→</td><td>👦🏻 👦🏼 👦🏽 👦🏾 👦🏿</td></tr>
</tbody></table>
-### 5.4 <a name="Identity_Elements" href="#Identity_Elements">Identity Elements</a>
+### <a name="Identity_Elements" href="#Identity_Elements">Identity Elements</a>
```xml
<!ELEMENT identity (alias | (version, generation?, language, script?, territory?, variant?, special*) ) >
@@ -2928,13 +3219,13 @@
The variant code is the tertiary part of the specification of the locale id, with values as described above.
-When combined according to the rules described in _[Section 3, Unicode Language and Locale Identifiers](#Unicode_Language_and_Locale_Identifiers)_, the `language` element, along with any of the optional `script`, `territory`, and `variant` elements, must identify a known, stable locale identifier. Otherwise, it is an error.
+When combined according to the rules described in _[Unicode Language and Locale Identifiers](#Unicode_Language_and_Locale_Identifiers)_, the `language` element, along with any of the optional `script`, `territory`, and `variant` elements, must identify a known, stable locale identifier. Otherwise, it is an error.
-### 5.5 <a name="Valid_Attribute_Values" href="#Valid_Attribute_Values">Valid Attribute Values</a>
+### <a name="Valid_Attribute_Values" href="#Valid_Attribute_Values">Valid Attribute Values</a>
-The [DTD Annotations](#DTD_Annotations) in Section 5.7 are used to determine whether elements, attributes, or attribute values are valid (or deprecated).
+The [DTD Annotations](#DTD_Annotations) are used to determine whether elements, attributes, or attribute values are valid (or deprecated).
-### 5.6 <a name="Canonical_Form" href="#Canonical_Form">Canonical Form</a>
+### <a name="Canonical_Form" href="#Canonical_Form">Canonical Form</a>
The following are restrictions on the format of LDML files to allow for easier parsing and comparison of files.
@@ -2964,7 +3255,7 @@
[XML](https://www.w3.org/TR/REC-xml/) files can have a wide variation in textual form, while representing precisely the same data. By putting the LDML files in the repository into a canonical form, this allows us to use the simple diff tools used widely (and in CVS) to detect differences when vetting changes, without those tools being confused. This is not a requirement on other uses of LDML; just simply a way to manage repository data more easily.
-#### 5.6.1 <a name="Content" href="#Content">Content</a>
+#### <a name="Content" href="#Content">Content</a>
1. All start elements are on their own line, indented by _depth_ tabs.
2. All end elements (except for leaf nodes) are on their own line, indented by _depth_ tabs.
@@ -3008,15 +3299,15 @@
</ldml>
```
-#### 5.6.2 <a name="Ordering" href="#Ordering">Ordering</a>
+#### <a name="Ordering" href="#Ordering">Ordering</a>
An element is ordered first by the element name, and then if the element names are identical, by the sorted set of attribute-value pairs. For the latter, compare the first pair in each (in sorted order by attribute pair). If not identical, go to the second pair, and so on.
Elements and attributes are ordered according to their order in the respective DTDs. Attribute value comparison is a bit more complicated, and may depend on the attribute and type. This is currently done with specific ordering tables.
-Any future additions to the DTD must be structured so as to allow compatibility with this ordering. See also [Section 5.5 Valid Attribute Values.](#Valid_Attribute_Values)
+Any future additions to the DTD must be structured so as to allow compatibility with this ordering. See also [Valid Attribute Values.](#Valid_Attribute_Values)
-#### 5.6.3 <a name="Comments" href="#Comments">Comments</a>
+#### <a name="Comments" href="#Comments">Comments</a>
1. Comments are of the form `<!-- stuff -->`.
2. They are logically attached to a node. There are 4 kinds:
@@ -3041,7 +3332,7 @@
<zone type="Asia/Jerusalem">
```
-### 5.7 <a name="DTD_Annotations" href="#DTD_Annotations">DTD Annotations</a>
+### <a name="DTD_Annotations" href="#DTD_Annotations">DTD Annotations</a>
The information in a standard DTD is insufficient for use in CLDR. To make up for that, DTD annotations are added. These are of the form
@@ -3055,6 +3346,7 @@
| ---------------------| ----------- |
| `<!--@VALUE-->` | The attribute is not distinguishing, and is treated like an element value |
| `<!--@METADATA-->` | The attribute is a “comment” on the data, like the draft status. It is not typically used in implementations. |
+| `<!--@ALLOWS_UESC-->` | The attribute value can be escaped using the `\u` notation. Does not require this notation to be used. |
| `<!--@ORDERED-->` | The element's children are ordered, and do not inherit. |
| `<!--@DEPRECATED-->` | The element or attribute is deprecated, and should not be used. |
| `<!--@DEPRECATED: attribute-value1, attribute-value2-->` | The attribute values are deprecated, and should not be used. Spaces between tokens are not significant. |
@@ -3069,7 +3361,7 @@
The element values may be literals, regular expressions, or variables (some of which are set programmatically according to other CLDR data, such as the above). However, the information at this point does not cover all attribute values, is used only for testing, and should not be used in implementations since the structure may change without notice.
-#### 5.7.1 <a name="match_expressions" href="#match_expressions">Attribute Value Constraints</a>
+#### <a name="match_expressions" href="#match_expressions">Attribute Value Constraints</a>
The following are constraints on the attribute values. Note: in future versions, the format may change, and/or the constraints may be tightened.
@@ -3093,25 +3385,25 @@
-## 6 <a name="Property_Data" href="#Property_Data">Property Data</a>
+## <a name="Property_Data" href="#Property_Data">Property Data</a>
Some data in CLDR does not use an XML format, but rather a semicolon-delimited format derived from that of the Unicode Character Database. That is because the data is more likely to be parsed by implementations that already parse UCD data. Those files are present in the common/properties directory.
Each file has a header that explains the format and usage of the data.
-### 6.1 <a name="Script_Metadata" href="#Script_Metadata">Script Metadata</a>
+### <a name="Script_Metadata" href="#Script_Metadata">Script Metadata</a>
`scriptMetadata.txt`
This file provides general information about scripts that may be useful to implementations processing text. The information is the best currently available, and may change between versions of CLDR. The format is similar to Unicode Character Database property file, and is documented in the header of the data file.
-### 6.2 <a name="Extended_Pictographic" href="#Extended_Pictographic">Extended Pictographic</a>
+### <a name="Extended_Pictographic" href="#Extended_Pictographic">Extended Pictographic</a>
`ExtendedPictographic.txt`
This file was used to define the ExtendedPictographic data used for “future-proofing” emoji behavior, especially in segmentation. As of Emoji version 11.0, the set of Extended_Pictographic is incorporated into the emoji data files found at [unicode.org/Public/emoji/](https://www.unicode.org/Public/emoji/).
-### 6.3 <a name="Labels.txt" href="#Labels.txt">Labels.txt</a>
+### <a name="Labels.txt" href="#Labels.txt">Labels.txt</a>
`labels.txt`
@@ -3119,23 +3411,23 @@
Initially, the contents are focused on emoji, but may be expanded in the future to other types of characters. Note that a character may have multiple labels.
-### 6.4 <a name="Segmentation_Tests" href="#Segmentation_Tests">Segmentation Tests</a>
+### <a name="Segmentation_Tests" href="#Segmentation_Tests">Segmentation Tests</a>
CLDR provides a tailoring to the [Grapheme Cluster Break (gcb)](https://www.unicode.org/reports/tr29/) algorithm to avoid splitting Indic aksaras. The corresponding test files for that are located in common/properties/segments/, along with a readme.txt that provides more details. There are also specific test files for the supported Indic scripts in the unittest directory.
-## 7 <a name="Format_Parse_Issues" href="#Format_Parse_Issues">Issues in Formatting and Parsing</a>
+## <a name="Format_Parse_Issues" href="#Format_Parse_Issues">Issues in Formatting and Parsing</a>
-### 7.1 <a name="Lenient_Parsing" href="#Lenient_Parsing">Lenient Parsing</a>
+### <a name="Lenient_Parsing" href="#Lenient_Parsing">Lenient Parsing</a>
-#### 7.1.1 <a name="Motivation" href="#Motivation">Motivation</a>
+#### <a name="Motivation" href="#Motivation">Motivation</a>
User input is frequently messy. Attempting to parse it by matching it exactly against a pattern is likely to be unsuccessful, even when the meaning of the input is clear to a human being. For example, for a date pattern of "MM/dd/yy", the input "June 1, 2006" will fail.
The goal of lenient parsing is to accept user input whenever it is possible to decipher what the user intended. Doing so requires using patterns as data to guide the parsing process, rather than an exact template that must be matched. This informative section suggests some heuristics that may be useful for lenient parsing of dates, times, and numbers.
-#### 7.1.2 <a name="Loose_Matching" href="#Loose_Matching">Loose Matching</a>
+#### <a name="Loose_Matching" href="#Loose_Matching">Loose Matching</a>
Loose matching ignores attributes of the strings being compared that are not important to matching. It involves the following steps:
@@ -3155,7 +3447,7 @@
Loose matching involves (logically) applying the above transform to both the input text and to each of the field elements used in matching, before applying the specific heuristics below. For example, if the input number text is " - NA f. 1,000.00", then it is mapped to "-naf1,000.00" before processing. The currency signs are also transformed, so "NA f." is converted to "naf" for purposes of matching. As with other Unicode algorithms, this is a logical statement of the process; actual implementations can optimize, such as by applying the transform incrementally during matching.
-### 7.2 <a name="Invalid_Patterns" href="#Invalid_Patterns">Handling Invalid Patterns</a>
+### <a name="Invalid_Patterns" href="#Invalid_Patterns">Handling Invalid Patterns</a>
Processes sometimes encounter invalid number or date patterns, such as a number pattern with “¤¤¤¤¤” (valid pattern character but invalid length in current CLDR), a date pattern with “nn” (invalid pattern character in current CLDR), or a date pattern with “MMMMMM” (invalid length in current CLDR). The recommended behavior for handling such an invalid pattern field is:
@@ -3165,11 +3457,68 @@
* For a pattern that contains a currently-invalid pattern character (applies only to date patterns, for which A-Za-z are reserved as pattern characters but not all defined as valid):
* Produce an error (set an error code or throw an exception) when an attempt is made to create a formatter with such a pattern or to apply such a pattern to an existing formatter.
+## <a name="Data_Size" href="#Data_Size">Data Size Reduction</a>
+Software implementations may have constrained memory requirements.
+The following outlines some techniques for filtering out CLDR data for a particular implementation.
+The exact filtering would depend on the particular requirements of the implementation in question, of course.
+
+Locale data can be _sliced_ to exclude data not needed by a particular implementation.
+This can be _vertical slicing_: excluding a locale and all the locales inheriting from it, or _horizontal slicing_: excluding particular types of data from all locales.
+For example:
+ * A vertical slice could retain only those locales used in a particular set of markets, such as EU locales.
+ * A horizontal slice could remove all data in the emoji/ directory, which are annotations for emoji and symbols.
+
+Of course, both of these techniques can be applied.
+
+### <a name="Vertical_Slicing" href="#Vertical_Slicing">Vertical Slicing</a>
+
+The choice of locales to include depends very much upon particular implementations.
+Some information that might be useful for determining the choice is found in the
+ [Supplemental Territory Information](tr35-info.md#Supplemental_Territory_Information),
+which provides information on the use of languages in different countries/regions.
+(For a human-readable chart, see [Territory-Language Information](https://unicode-org.github.io/cldr-staging/charts/latest/supplemental/territory_language_information.html).)
+
+It is important to note that if a particular locale is in a vertical slice, then all of its parents should be as well, because of inheritance.
+This is not a factor if the data is fully resolved, as in the JSON format data.
+
+Slicing can also remove related supplemental data.
+For example, the likely subtags data includes a large number of languages that may not be of interest for all implementations.
+Where an implementation only includes (say) the CLDR locales at Basic coverage in [Unicode CLDR - Coverage Levels](https://cldr.unicode.org/index/cldr-spec/coverage-levels)
+(and locales inheriting from them), the likely subtag data that doesn’t match can be filtered out.
+
+### <a name="Horizontal_Slicing" href="#Horizontal_Slicing">Horizontal Slicing</a>
+
+The main reason to perform horizontal slicing is when a particular feature is not used, so the implementation wants to remove the data required for powering that feature.
+For example, if an application isn't performing date formatting, it can remove all date formatting data (transitively).
+It must take care to retain data used by other features: in the previous example, the number formatting data where currencies are being formatted.
+
+Locales may also have data on a field-by-field basis that is reasonable to filter out.
+For example, locales that meet the Modern level of coverage typically also include some data at a Comprehensive level.
+That data is not typically needed for most implementations, and can typically be filtered out.
+For example, in CLDR version 43, 58% of the script names (`//ldml/localeDisplayNames/scripts/script[@type="*"]`) are at the Comprehensive level;
+in fact, ~20% of all of the values for the Modern level locales are at the Comprehensive level.
+
+The easiest way to do that is to use the CLDR Java tooling (the `cldr-code` package) to filter the data before generating the implementation's data format.
+That approach gives the implementation direct access to the CoverageLevel code that can determine the coverage level for a given locale and path.
+Once the data is transformed, such as to the JSON format, the CoverageLevel code is no longer accessible.
+For example, here is a code snippet:
+
+```
+private static final SupplementalDataInfo SUPPLEMENTAL_DATA_INFO = CLDRConfig.getInstance().getSupplementalDataInfo();
+...
+ Level pathLevel = SUPPLEMENTAL_DATA_INFO.getCoverageLevel(path, locale);
+ if (minimumPathCoverage.compareTo(pathLevel) >= 0) {
+ include(path);
+ }
+```
+
+Similarly, the subdivision translations represent a large body of data that may not be needed for many implementations.
+
* * *
## <a name="Deprecated_Structure" href="#Deprecated_Structure">Annex A Deprecated Structure</a>
-The [DTD Annotations](#DTD_Annotations) in Section 5.7 are used to determine whether DTD items such as elements, attributes, or attribute values are deprecated.
+The [DTD Annotations](#DTD_Annotations) are used to determine whether DTD items such as elements, attributes, or attribute values are deprecated.
Though such deprecated items are still valid LDML, they are strongly discouraged, and are no longer used in CLDR.
@@ -3179,11 +3528,11 @@
### <a name="Fallback_Elements" href="#Fallback_Elements">A.1 Element fallback</a>
-Implementations should use instead the information in [Section 4.4 Language Matching](#LanguageMatching) for doing language fallback.
+Implementations should use instead the information in [Language Matching](#LanguageMatching) for doing language fallback.
### <a name="BCP47_Keyword_Mapping" href="#BCP47_Keyword_Mapping">A.2 BCP 47 Keyword Mapping</a>
-Instead use the mechanisms descibed in [Section 3.6.4 U Extension Data Files](#Unicode_Locale_Extension_Data_Files).
+Instead use the mechanisms described in [U Extension Data Files](#Unicode_Locale_Extension_Data_Files).
### <a name="Choice_Patterns" href="#Choice_Patterns">A.3 Choice Patterns</a>
@@ -3264,11 +3613,7 @@
### <a name="postCodeElements" href="#postCodeElements">A.16 Elements postalCodeData, postCodeRegex</a>
-Instead please see other services that are kept up to date, such as:
-
-* [https://i18napis.appspot.com/address/data/US](https://i18napis.appspot.com/address/data/US)
-* [https://i18napis.appspot.com/address/data/CH](https://i18napis.appspot.com/address/data/CH)
-* ...
+Instead please see other services that are kept up to date, such as <https://github.com/google/libaddressinput>
### <a name="telephoneCodeData" href="#telephoneCodeData">A.17 Element telephoneCodeData</a>
@@ -3392,30 +3737,7 @@
###### Table: <a name="Part_7_Links" href="#Part_7_Links">Part 7 Links</a>: [Keyboards](tr35-keyboards.md) (keyboard mappings)
-| Old section | Section in new part |
-| -------------------------------------------------------------------------------------------------------------------------- | ------------------- |
-| S <a name="Keyboards" href="#Keyboards">Keyboards</a> | 1 [Introduction](tr35-keyboards.md#Introduction) |
-| S <a name="Goals_and_Nongoals" href="#Goals_and_Nongoals">Goals and Nongoals</a> | [Goals and Nongoals](tr35-keyboards.md#Goals_and_Nongoals) |
-| S <a name="File_and_Dir_Structure" href="#File_and_Dir_Structure">File and Directory Structure</a> | [File and Directory Structure](tr35-keyboards.md#File_and_Dir_Structure) |
-| S <a name="Element_Heirarchy_Layout_File" href="#Element_Heirarchy_Layout_File">Element Hierarchy - Layout File</a> | [Element Hierarchy - Layout File](tr35-keyboards.md#Element_Heirarchy_Layout_File) |
-| S <a name="Element_Heirarchy_Platform_File" href="#Element_Heirarchy_Platform_File">Element Hierarchy - Platform File</a> | [Element Hierarchy - Platform File](tr35-keyboards.md#Element_Heirarchy_Platform_File) |
-| S <a name="Invariants" href="#Invariants">Invariants</a> | [Invariants](tr35-keyboards.md#Invariants) |
-| S <a name="Data_Sources" href="#Data_Sources">Data Sources</a> | [Data Sources](tr35-keyboards.md#Data_Sources) |
-| S <a name="Keyboard_IDs" href="#Keyboard_IDs">Keyboard IDs</a> | [Keyboard IDs](tr35-keyboards.md#Keyboard_IDs) |
-| S <a name="Platform_Behaviors_in_Edge_Cases" href="#Platform_Behaviors_in_Edge_Cases">Platform Behaviors in Edge Cases</a> | [Platform Behaviors in Edge Cases](tr35-keyboards.md#Platform_Behaviors_in_Edge_Cases) |
-| S <a name="Element_Keyboard" href="#Element_Keyboard">Element: keyboard</a> | [Element: keyboard](tr35-keyboards.md#Element_Keyboard) |
-| S <a name="Element_version" href="#Element_version">Element: version</a> | [Element: version](tr35-keyboards.md#Element_version) |
-| S <a name="Element_generation" href="#Element_generation">Element: generation</a> | [Element: generation](tr35-keyboards.md#Element_generation) |
-| S <a name="Element_names" href="#Element_names">Element: names</a> | [Element: names](tr35-keyboards.md#Element_names) |
-| S <a name="Element_name" href="#Element_name">Element: name</a> | [Element: name](tr35-keyboards.md#Element_name) |
-| S <a name="Element_settings" href="#Element_settings">Element: settings</a> | [Element: settings](tr35-keyboards.md#Element_settings) |
-| S <a name="Element_keyMap" href="#Element_keyMap">Element: keyMap</a> | [Element: keyMap](tr35-keyboards.md#Element_keyMap) |
-| S <a name="Element_map" href="#Element_map">Element: map</a> | [Element: map](tr35-keyboards.md#Element_map) |
-| S <a name="Element_transforms" href="#Element_transforms">Element: transforms</a> | [Element: transforms](tr35-keyboards.md#Element_transforms) |
-| S <a name="Element_transform" href="#Element_transform">Element: transform</a> | [Element: transform](tr35-keyboards.md#Element_transform) |
-| S <a name="Element_platform" href="#Element_platform">Element: platform</a> | [Element: platform](tr35-keyboards.md#Element_platform) |
-| S <a name="Element_hardwareMap" href="#Element_hardwareMap">Element: hardwareMap</a> | [Element: hardwareMap](tr35-keyboards.md#Element_hardwareMap) |
-| S <a name="Principles_for_Keyboard_Ids" href="#Principles_for_Keyboard_Ids">Principles for Keyboard Ids</a> | [Principles for Keyboard Ids](tr35-keyboards.md#Principles_for_Keyboard_Ids) |
+[Part 7](tr35-keyboards.md) has been extensively rewritten. The prior link anchors within this file are no longer valid.
* * *
@@ -3467,7 +3789,7 @@
Note that for the case of territoryAlias, there may be multiple replacement values separated by spaces in the text (such as replacement="und-CW und-SX und-BQ"); other rules only ever have a single replacement value.
-#### <a name="3.-matches" href="#3.-matches">3. Matches</a>
+#### <a name="3.-matches" href="#3.-matches">Matches</a>
A rule matches a source if and only for all fields, each _source_ field ⊇ _type_ field.
@@ -3510,7 +3832,7 @@
>
> result="ja-Latn-alalc97-fonipa" // note that CLDR canonical order of variants is alphabetical
-##### <a name="territory-exception" href="#territory-exception">Territory Exception</a>
+##### Territory Exception
If the field = territory, and the replacement.field has more than one value, then look up the most likely territory for the base language code (and script, if there is one). If that likely territory is in the list of replacements, use it. Otherwise, use the first territory in the list.
@@ -3592,7 +3914,7 @@
The canonicalization of localeIds is done by first canonicalizing the languageId portion, then handling extensions in the following way:
1. Replace any _tlang_ languageId value by its canonicalization.
-2. Use the bcp47 data to replace keys, types, tfields, and tvalues by their canonical forms. See **Section 3.6.4 U Extension Data Files** and **Section 3.7.1 T Extension Data Files**. The matches are in the `alias` attribute value, while the canonical replacement is in the `name` attribute value. For example:
+2. Use the bcp47 data to replace keys, types, tfields, and tvalues by their canonical forms. See **U Extension Data Files** and **T Extension Data Files**. The matches are in the `alias` attribute value, while the canonical replacement is in the `name` attribute value. For example:
1. Because of the following bcp47 data:
`<key name="ms"…>…<type name="uksystem" … alias="imperial" … />…</key>`
2. We get the following transformation:
@@ -3684,7 +4006,7 @@
* Jennifer Chye for her contributions to the conversion tools.
* Markus Scherer for a major rewrite of Part 5, Collation.
* [Shane Carr](https://www.sffc.xyz/) for his work on numbers and measurement units.
-* Robin Leroy for his work on compact plurals: Part 3, Section 5, [Language Plural Rules](tr35-numbers.md#Language_Plural_Rules)
+* Robin Leroy for his work on compact plurals: Part 3, [Language Plural Rules](tr35-numbers.md#Language_Plural_Rules)
* Rich Gillam for work on Person Names.
* Alex Kolisnychenko for work on Person Names.
* Mike McKenna for work on Person Names.
@@ -3694,37 +4016,51 @@
## <a name="Modifications" href="#Modifications">Modifications</a>
-**Revision 67**
+**Differences from LDML Version 43**
-* [Parent Locales](#Parent_Locales)
- * Updated the description of guidelines and invariants for `parentLocale` data.
-* [Hybrid Locale Identifiers](#Hybrid_Locale)
- * Expanded the discussion of combinations such as Hinglish.
-* [Currency Formats](tr35-numbers.md#Currency_Formats) and [Currencies](tr35-numbers.md#Currencies)
- * Described the new `alt="alphaNextToNumber"` and `alt="noCurrency"` variants for `pattern`s used with `currencyFormat` elements
- * Described the new `currencyPatternAppendISO` element under `currencyFormats`
- * Discouraged the use of the old `currencySpacing` element (and its subelements) in favor of the `alt="alphaNextToNumber"` variant
-* [Element dateTimeFormat](tr35-dates.md#dateTimeFormat)
- * Described the new `dateTimeFormat type="atTime"` pattern and when to use it versus the standard `dateTimeFormat` pattern.
-* [Matching Skeletons](tr35-dates.md#Matching_Skeletons)
- * Provided more detailed recommendations on matching pattern field length to field length in the requested skeleton.
-* [Unit Preferences](tr35-info.md#Unit_Preferences)
- * Added a new subsection to specify the interaction of the unit Preferences data with the locale keys mu, ms, and rg, and the base locale.
-* Plurals
- * In [Plural rules syntax](tr35-numbers.md#Plural_rules_syntax), allow sample values to have positive and negative signs.
-* Units of measurement
- * [Unit Preferences](tr35-info.md#Unit_Preferences)
- * Added a new subsection to specify the interaction of the unit Preferences data with the locale keys mu, ms, and rg, and the base locale.
- * [Unit Elements](tr35-general.md#Unit_Elements), [Unit_Conversion](tr35-info.md#Unit_Conversion)
- * For simpler and cleaner parsing, add a new element (unitIdComponent) and restructured the EBNF for parsing unit identifiers.
- * As part of this work, the identifier metric-ton was deprecated in favor of tonne. As usual, the older identifier remains for compatibility, and is aliased to the new one.
+* [Core](#Contents)
+ * In [Time Zone Identifiers](#Time_Zone_Identifiers), added information on the new `iana` attribute for stability; also see information on `iana` in the section [U Extension Data Files](#Unicode_Locale_Extension_Data_Files).
+ * [Likely Subtags](#Likely_Subtags): There is a fix to how macroregions are handled by adding likely subtags, such as with `und_419`
+ * [Unicode Sets](#Unicode_Sets): New sections on the following, with additional clarifications:
+ * [UnicodeSet syntax](#unicodeset-syntax)
+ * [Backslash Escapes](#Backslash_Escapes)
+ * [Variables in UnicodeSets](#Variables_in_UnicodeSets)
+ * [Unicode Language Identifier](#unicode-language-identifier): clarified constraint on duplicate subtags.
+ * [Key/Type Definitions](#key-and-type-definitions): clarified definition of `-dx`
+ * [EBNF](#ebnf): Clarified use of EBNF in LDML
+ * (44.1) [Key/Type Definitions](#key-and-type-definitions): further clarified the definition of `-dx`
+
+* [General](tr35-general.md#Contents)
+ * Added new section [Unit Identifier Uniqueness](tr35-general.md#Unit_Identifier_Uniqueness), and added a relevant constraint on base_component in the [Syntax](tr35-general.md#syntax) section.
+ * Several clarifications were added in [Transform Rules Syntax](tr35-general.md#Transform_Rules_Syntax), and a new section [Transform Syntax Characters](tr35-general.md#transform-syntax-characters) was added with a table of the characters.
+ * (44.1) [Synthesizing Sequence Names](tr35-general.md#SynthesizingNames) Added handling of derived emoji names and keywords for emoji facing-right sequences.
+
+* [Dates](tr35-dates.md#Contents)
+ * New section [First Day Overrides](tr35-dates.md#first-day-overrides): Described the various locale ID elements that affect determination of the first day of the week (for week of year calculations), and the order in which they should be considered. Also noted in [Key/Type Definitions](#Key_Type_Definitions) which keys can affect determination of first day.
+
+* [Supplemental](tr35-info.md#Contents)
+ * In [Conversion Data](tr35-info.md#conversion-data), expanded the list of values for the convertUnit systems attribute.
+ * Added new section [Derived Unit System](tr35-info.md#derived-unit-system)
+ * Rewrote and clarified the material in [Unit Preferences Overrides](tr35-info.md#Unit_Preferences_Data)
+
+* [Keyboards](tr35-keyboards.md#Contents)
+ * Complete rewrite of the specification by the Keyboard Subcommittee. Available as a technical preview in CLDR version 44. See [Part 7: Status](tr35-keyboards.md#status).
+
* [Person Names](tr35-personNames.md#Contents)
- * Added a new Part 8, Person Names.
+ * Added material in [API Implementation](tr35-personNames.md#api-implementation) on recommended implementation API options.
+ * Described the new [parameterDefault Element](tr35-personNames.md#parameterdefault-element) that specifies default formality and length.
+ * Described the new [nativeSpaceReplacement Element](tr35-personNames.md#nativespacereplacement-element) that specifies how spaces should be handled when the name language is the same as the formatting language.
+ * In [Modifiers](tr35-personNames.md#modifiers) added the modifiers retain, genitive and vocative.
+ * Added sections on [Grammatical Modifiers for Names](tr35-personNames.md#grammatical-modifiers-for-names) and [Future Modifiers](tr35-personNames.md#future-modifiers).
+ * Fixed a problem in [Switch the formatting locale if necessary](tr35-personNames.md#switch-the-formatting-locale-if-necessary), where the full formatting locale wasn't being set correctly when the name object has a locale whose script is incompatible with the name script.
+ * Rewrote the section on [Setting the spaceReplacement](tr35-personNames.md#setting-the-spacereplacement).
-Note that small changes such as typos and link fixes are not listed above. Modifications in previous versions are listed in those respective versions. Click on **Previous Version** in the header until you get to the desired version.
+Note that small changes such as typos and link fixes are not listed above.
+Modifications in previous versions are listed in those respective versions.
+Click on **Previous Version** in the header until you get to the desired version.
* * *
-Copyright © 2001–2022 Unicode, Inc. All Rights Reserved. The Unicode Consortium makes no expressed or implied warranty of any kind, and assumes no liability for errors or omissions. No liability is assumed for incidental and consequential damages in connection with or arising out of the use of the information or programs contained or accompanying this technical report. The Unicode [Terms of Use](https://www.unicode.org/copyright.html) apply.
+Copyright © 2001–2023 Unicode, Inc. All Rights Reserved. The Unicode Consortium makes no expressed or implied warranty of any kind, and assumes no liability for errors or omissions. No liability is assumed for incidental and consequential damages in connection with or arising out of the use of the information or programs contained or accompanying this technical report. The Unicode [Terms of Use](https://www.unicode.org/copyright.html) apply.
Unicode and the Unicode logo are trademarks of Unicode, Inc., and are registered in some jurisdictions.