| /* GENERATED SOURCE. DO NOT MODIFY. */ |
| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| /* |
| ******************************************************************************* |
| * Copyright (C) 2013-2015, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ******************************************************************************* |
| * CollationBuilder.java, ported from collationbuilder.h/.cpp |
| * |
| * C++ version created on: 2013may06 |
| * created by: Markus W. Scherer |
| */ |
| |
| package android.icu.impl.coll; |
| |
| import java.text.ParseException; |
| |
| import android.icu.impl.Norm2AllModes; |
| import android.icu.impl.Normalizer2Impl; |
| import android.icu.impl.Normalizer2Impl.Hangul; |
| import android.icu.lang.UScript; |
| import android.icu.text.CanonicalIterator; |
| import android.icu.text.Collator; |
| import android.icu.text.Normalizer2; |
| import android.icu.text.UnicodeSet; |
| import android.icu.text.UnicodeSetIterator; |
| import android.icu.util.ICUInputTooLongException; |
| import android.icu.util.ULocale; |
| |
| /** |
| * @hide Only a subset of ICU is exposed in Android |
| */ |
| public final class CollationBuilder extends CollationRuleParser.Sink { |
| private static final boolean DEBUG = false; |
| private static final class BundleImporter implements CollationRuleParser.Importer { |
| BundleImporter() {} |
| @Override |
| public String getRules(String localeID, String collationType) { |
| return CollationLoader.loadRules(new ULocale(localeID), collationType); |
| } |
| } |
| |
| public CollationBuilder(CollationTailoring b) { |
| nfd = Normalizer2.getNFDInstance(); |
| fcd = Norm2AllModes.getFCDNormalizer2(); |
| nfcImpl = Norm2AllModes.getNFCInstance().impl; |
| base = b; |
| baseData = b.data; |
| rootElements = new CollationRootElements(b.data.rootElements); |
| variableTop = 0; |
| dataBuilder = new CollationDataBuilder(); |
| fastLatinEnabled = true; |
| cesLength = 0; |
| rootPrimaryIndexes = new UVector32(); |
| nodes = new UVector64(); |
| nfcImpl.ensureCanonIterData(); |
| dataBuilder.initForTailoring(baseData); |
| } |
| |
| public CollationTailoring parseAndBuild(String ruleString) throws ParseException { |
| if(baseData.rootElements == null) { |
| // C++ U_MISSING_RESOURCE_ERROR |
| throw new UnsupportedOperationException( |
| "missing root elements data, tailoring not supported"); |
| } |
| CollationTailoring tailoring = new CollationTailoring(base.settings); |
| CollationRuleParser parser = new CollationRuleParser(baseData); |
| // Note: This always bases &[last variable] and &[first regular] |
| // on the root collator's maxVariable/variableTop. |
| // If we wanted this to change after [maxVariable x], then we would keep |
| // the tailoring.settings pointer here and read its variableTop when we need it. |
| // See http://unicode.org/cldr/trac/ticket/6070 |
| variableTop = base.settings.readOnly().variableTop; |
| parser.setSink(this); |
| // In Java, there is only one Importer implementation. |
| // In C++, the importer is a parameter for this method. |
| parser.setImporter(new BundleImporter()); |
| CollationSettings ownedSettings = tailoring.settings.copyOnWrite(); |
| parser.parse(ruleString, ownedSettings); |
| if(dataBuilder.hasMappings()) { |
| makeTailoredCEs(); |
| closeOverComposites(); |
| finalizeCEs(); |
| // Copy all of ASCII, and Latin-1 letters, into each tailoring. |
| optimizeSet.add(0, 0x7f); |
| optimizeSet.add(0xc0, 0xff); |
| // Hangul is decomposed on the fly during collation, |
| // and the tailoring data is always built with HANGUL_TAG specials. |
| optimizeSet.remove(Hangul.HANGUL_BASE, Hangul.HANGUL_END); |
| dataBuilder.optimize(optimizeSet); |
| tailoring.ensureOwnedData(); |
| if(fastLatinEnabled) { dataBuilder.enableFastLatin(); } |
| dataBuilder.build(tailoring.ownedData); |
| // C++ tailoring.builder = dataBuilder; |
| dataBuilder = null; |
| } else { |
| tailoring.data = baseData; |
| } |
| ownedSettings.fastLatinOptions = CollationFastLatin.getOptions( |
| tailoring.data, ownedSettings, |
| ownedSettings.fastLatinPrimaries); |
| tailoring.setRules(ruleString); |
| // In Java, we do not have a rules version. |
| // In C++, the genrb build tool reads and supplies one, |
| // and the rulesVersion is a parameter for this method. |
| tailoring.setVersion(base.version, 0 /* rulesVersion */); |
| return tailoring; |
| } |
| |
| /** Implements CollationRuleParser.Sink. */ |
| @Override |
| void addReset(int strength, CharSequence str) { |
| assert(str.length() != 0); |
| if(str.charAt(0) == CollationRuleParser.POS_LEAD) { |
| ces[0] = getSpecialResetPosition(str); |
| cesLength = 1; |
| assert((ces[0] & Collation.CASE_AND_QUATERNARY_MASK) == 0); |
| } else { |
| // normal reset to a character or string |
| String nfdString = nfd.normalize(str); |
| cesLength = dataBuilder.getCEs(nfdString, ces, 0); |
| if(cesLength > Collation.MAX_EXPANSION_LENGTH) { |
| throw new IllegalArgumentException( |
| "reset position maps to too many collation elements (more than 31)"); |
| } |
| } |
| if(strength == Collator.IDENTICAL) { return; } // simple reset-at-position |
| |
| // &[before strength]position |
| assert(Collator.PRIMARY <= strength && strength <= Collator.TERTIARY); |
| int index = findOrInsertNodeForCEs(strength); |
| |
| long node = nodes.elementAti(index); |
| // If the index is for a "weaker" node, |
| // then skip backwards over this and further "weaker" nodes. |
| while(strengthFromNode(node) > strength) { |
| index = previousIndexFromNode(node); |
| node = nodes.elementAti(index); |
| } |
| |
| // Find or insert a node whose index we will put into a temporary CE. |
| if(strengthFromNode(node) == strength && isTailoredNode(node)) { |
| // Reset to just before this same-strength tailored node. |
| index = previousIndexFromNode(node); |
| } else if(strength == Collator.PRIMARY) { |
| // root primary node (has no previous index) |
| long p = weight32FromNode(node); |
| if(p == 0) { |
| throw new UnsupportedOperationException( |
| "reset primary-before ignorable not possible"); |
| } |
| if(p <= rootElements.getFirstPrimary()) { |
| // There is no primary gap between ignorables and the space-first-primary. |
| throw new UnsupportedOperationException( |
| "reset primary-before first non-ignorable not supported"); |
| } |
| if(p == Collation.FIRST_TRAILING_PRIMARY) { |
| // We do not support tailoring to an unassigned-implicit CE. |
| throw new UnsupportedOperationException( |
| "reset primary-before [first trailing] not supported"); |
| } |
| p = rootElements.getPrimaryBefore(p, baseData.isCompressiblePrimary(p)); |
| index = findOrInsertNodeForPrimary(p); |
| // Go to the last node in this list: |
| // Tailor after the last node between adjacent root nodes. |
| for(;;) { |
| node = nodes.elementAti(index); |
| int nextIndex = nextIndexFromNode(node); |
| if(nextIndex == 0) { break; } |
| index = nextIndex; |
| } |
| } else { |
| // &[before 2] or &[before 3] |
| index = findCommonNode(index, Collator.SECONDARY); |
| if(strength >= Collator.TERTIARY) { |
| index = findCommonNode(index, Collator.TERTIARY); |
| } |
| // findCommonNode() stayed on the stronger node or moved to |
| // an explicit common-weight node of the reset-before strength. |
| node = nodes.elementAti(index); |
| if(strengthFromNode(node) == strength) { |
| // Found a same-strength node with an explicit weight. |
| int weight16 = weight16FromNode(node); |
| if(weight16 == 0) { |
| throw new UnsupportedOperationException( |
| (strength == Collator.SECONDARY) ? |
| "reset secondary-before secondary ignorable not possible" : |
| "reset tertiary-before completely ignorable not possible"); |
| } |
| assert(weight16 > Collation.BEFORE_WEIGHT16); |
| // Reset to just before this node. |
| // Insert the preceding same-level explicit weight if it is not there already. |
| // Which explicit weight immediately precedes this one? |
| weight16 = getWeight16Before(index, node, strength); |
| // Does this preceding weight have a node? |
| int previousWeight16; |
| int previousIndex = previousIndexFromNode(node); |
| for(int i = previousIndex;; i = previousIndexFromNode(node)) { |
| node = nodes.elementAti(i); |
| int previousStrength = strengthFromNode(node); |
| if(previousStrength < strength) { |
| assert(weight16 >= Collation.COMMON_WEIGHT16 || i == previousIndex); |
| // Either the reset element has an above-common weight and |
| // the parent node provides the implied common weight, |
| // or the reset element has a weight<=common in the node |
| // right after the parent, and we need to insert the preceding weight. |
| previousWeight16 = Collation.COMMON_WEIGHT16; |
| break; |
| } else if(previousStrength == strength && !isTailoredNode(node)) { |
| previousWeight16 = weight16FromNode(node); |
| break; |
| } |
| // Skip weaker nodes and same-level tailored nodes. |
| } |
| if(previousWeight16 == weight16) { |
| // The preceding weight has a node, |
| // maybe with following weaker or tailored nodes. |
| // Reset to the last of them. |
| index = previousIndex; |
| } else { |
| // Insert a node with the preceding weight, reset to that. |
| node = nodeFromWeight16(weight16) | nodeFromStrength(strength); |
| index = insertNodeBetween(previousIndex, index, node); |
| } |
| } else { |
| // Found a stronger node with implied strength-common weight. |
| int weight16 = getWeight16Before(index, node, strength); |
| index = findOrInsertWeakNode(index, weight16, strength); |
| } |
| // Strength of the temporary CE = strength of its reset position. |
| // Code above raises an error if the before-strength is stronger. |
| strength = ceStrength(ces[cesLength - 1]); |
| } |
| ces[cesLength - 1] = tempCEFromIndexAndStrength(index, strength); |
| } |
| |
| /** |
| * Returns the secondary or tertiary weight preceding the current node's weight. |
| * node=nodes[index]. |
| */ |
| private int getWeight16Before(int index, long node, int level) { |
| assert(strengthFromNode(node) < level || !isTailoredNode(node)); |
| // Collect the root CE weights if this node is for a root CE. |
| // If it is not, then return the low non-primary boundary for a tailored CE. |
| int t; |
| if(strengthFromNode(node) == Collator.TERTIARY) { |
| t = weight16FromNode(node); |
| } else { |
| t = Collation.COMMON_WEIGHT16; // Stronger node with implied common weight. |
| } |
| while(strengthFromNode(node) > Collator.SECONDARY) { |
| index = previousIndexFromNode(node); |
| node = nodes.elementAti(index); |
| } |
| if(isTailoredNode(node)) { |
| return Collation.BEFORE_WEIGHT16; |
| } |
| int s; |
| if(strengthFromNode(node) == Collator.SECONDARY) { |
| s = weight16FromNode(node); |
| } else { |
| s = Collation.COMMON_WEIGHT16; // Stronger node with implied common weight. |
| } |
| while(strengthFromNode(node) > Collator.PRIMARY) { |
| index = previousIndexFromNode(node); |
| node = nodes.elementAti(index); |
| } |
| if(isTailoredNode(node)) { |
| return Collation.BEFORE_WEIGHT16; |
| } |
| // [p, s, t] is a root CE. Return the preceding weight for the requested level. |
| long p = weight32FromNode(node); |
| int weight16; |
| if(level == Collator.SECONDARY) { |
| weight16 = rootElements.getSecondaryBefore(p, s); |
| } else { |
| weight16 = rootElements.getTertiaryBefore(p, s, t); |
| assert((weight16 & ~Collation.ONLY_TERTIARY_MASK) == 0); |
| } |
| return weight16; |
| } |
| |
| private long getSpecialResetPosition(CharSequence str) { |
| assert(str.length() == 2); |
| long ce; |
| int strength = Collator.PRIMARY; |
| boolean isBoundary = false; |
| CollationRuleParser.Position pos = |
| CollationRuleParser.POSITION_VALUES[str.charAt(1) - CollationRuleParser.POS_BASE]; |
| switch(pos) { |
| case FIRST_TERTIARY_IGNORABLE: |
| // Quaternary CEs are not supported. |
| // Non-zero quaternary weights are possible only on tertiary or stronger CEs. |
| return 0; |
| case LAST_TERTIARY_IGNORABLE: |
| return 0; |
| case FIRST_SECONDARY_IGNORABLE: { |
| // Look for a tailored tertiary node after [0, 0, 0]. |
| int index = findOrInsertNodeForRootCE(0, Collator.TERTIARY); |
| long node = nodes.elementAti(index); |
| if((index = nextIndexFromNode(node)) != 0) { |
| node = nodes.elementAti(index); |
| assert(strengthFromNode(node) <= Collator.TERTIARY); |
| if(isTailoredNode(node) && strengthFromNode(node) == Collator.TERTIARY) { |
| return tempCEFromIndexAndStrength(index, Collator.TERTIARY); |
| } |
| } |
| return rootElements.getFirstTertiaryCE(); |
| // No need to look for nodeHasAnyBefore() on a tertiary node. |
| } |
| case LAST_SECONDARY_IGNORABLE: |
| ce = rootElements.getLastTertiaryCE(); |
| strength = Collator.TERTIARY; |
| break; |
| case FIRST_PRIMARY_IGNORABLE: { |
| // Look for a tailored secondary node after [0, 0, *]. |
| int index = findOrInsertNodeForRootCE(0, Collator.SECONDARY); |
| long node = nodes.elementAti(index); |
| while((index = nextIndexFromNode(node)) != 0) { |
| node = nodes.elementAti(index); |
| strength = strengthFromNode(node); |
| if(strength < Collator.SECONDARY) { break; } |
| if(strength == Collator.SECONDARY) { |
| if(isTailoredNode(node)) { |
| if(nodeHasBefore3(node)) { |
| index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node))); |
| assert(isTailoredNode(nodes.elementAti(index))); |
| } |
| return tempCEFromIndexAndStrength(index, Collator.SECONDARY); |
| } else { |
| break; |
| } |
| } |
| } |
| ce = rootElements.getFirstSecondaryCE(); |
| strength = Collator.SECONDARY; |
| break; |
| } |
| case LAST_PRIMARY_IGNORABLE: |
| ce = rootElements.getLastSecondaryCE(); |
| strength = Collator.SECONDARY; |
| break; |
| case FIRST_VARIABLE: |
| ce = rootElements.getFirstPrimaryCE(); |
| isBoundary = true; // FractionalUCA.txt: FDD1 00A0, SPACE first primary |
| break; |
| case LAST_VARIABLE: |
| ce = rootElements.lastCEWithPrimaryBefore(variableTop + 1); |
| break; |
| case FIRST_REGULAR: |
| ce = rootElements.firstCEWithPrimaryAtLeast(variableTop + 1); |
| isBoundary = true; // FractionalUCA.txt: FDD1 263A, SYMBOL first primary |
| break; |
| case LAST_REGULAR: |
| // Use the Hani-first-primary rather than the actual last "regular" CE before it, |
| // for backward compatibility with behavior before the introduction of |
| // script-first-primary CEs in the root collator. |
| ce = rootElements.firstCEWithPrimaryAtLeast( |
| baseData.getFirstPrimaryForGroup(UScript.HAN)); |
| break; |
| case FIRST_IMPLICIT: |
| ce = baseData.getSingleCE(0x4e00); |
| break; |
| case LAST_IMPLICIT: |
| // We do not support tailoring to an unassigned-implicit CE. |
| throw new UnsupportedOperationException( |
| "reset to [last implicit] not supported"); |
| case FIRST_TRAILING: |
| ce = Collation.makeCE(Collation.FIRST_TRAILING_PRIMARY); |
| isBoundary = true; // trailing first primary (there is no mapping for it) |
| break; |
| case LAST_TRAILING: |
| throw new IllegalArgumentException("LDML forbids tailoring to U+FFFF"); |
| default: |
| assert(false); |
| return 0; |
| } |
| |
| int index = findOrInsertNodeForRootCE(ce, strength); |
| long node = nodes.elementAti(index); |
| if((pos.ordinal() & 1) == 0) { |
| // even pos = [first xyz] |
| if(!nodeHasAnyBefore(node) && isBoundary) { |
| // A <group> first primary boundary is artificially added to FractionalUCA.txt. |
| // It is reachable via its special contraction, but is not normally used. |
| // Find the first character tailored after the boundary CE, |
| // or the first real root CE after it. |
| if((index = nextIndexFromNode(node)) != 0) { |
| // If there is a following node, then it must be tailored |
| // because there are no root CEs with a boundary primary |
| // and non-common secondary/tertiary weights. |
| node = nodes.elementAti(index); |
| assert(isTailoredNode(node)); |
| ce = tempCEFromIndexAndStrength(index, strength); |
| } else { |
| assert(strength == Collator.PRIMARY); |
| long p = ce >>> 32; |
| int pIndex = rootElements.findPrimary(p); |
| boolean isCompressible = baseData.isCompressiblePrimary(p); |
| p = rootElements.getPrimaryAfter(p, pIndex, isCompressible); |
| ce = Collation.makeCE(p); |
| index = findOrInsertNodeForRootCE(ce, Collator.PRIMARY); |
| node = nodes.elementAti(index); |
| } |
| } |
| if(nodeHasAnyBefore(node)) { |
| // Get the first node that was tailored before this one at a weaker strength. |
| if(nodeHasBefore2(node)) { |
| index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node))); |
| node = nodes.elementAti(index); |
| } |
| if(nodeHasBefore3(node)) { |
| index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node))); |
| } |
| assert(isTailoredNode(nodes.elementAti(index))); |
| ce = tempCEFromIndexAndStrength(index, strength); |
| } |
| } else { |
| // odd pos = [last xyz] |
| // Find the last node that was tailored after the [last xyz] |
| // at a strength no greater than the position's strength. |
| for(;;) { |
| int nextIndex = nextIndexFromNode(node); |
| if(nextIndex == 0) { break; } |
| long nextNode = nodes.elementAti(nextIndex); |
| if(strengthFromNode(nextNode) < strength) { break; } |
| index = nextIndex; |
| node = nextNode; |
| } |
| // Do not make a temporary CE for a root node. |
| // This last node might be the node for the root CE itself, |
| // or a node with a common secondary or tertiary weight. |
| if(isTailoredNode(node)) { |
| ce = tempCEFromIndexAndStrength(index, strength); |
| } |
| } |
| return ce; |
| } |
| |
| /** Implements CollationRuleParser.Sink. */ |
| @Override |
| void addRelation(int strength, CharSequence prefix, CharSequence str, CharSequence extension) { |
| String nfdPrefix; |
| if(prefix.length() == 0) { |
| nfdPrefix = ""; |
| } else { |
| nfdPrefix = nfd.normalize(prefix); |
| } |
| String nfdString = nfd.normalize(str); |
| |
| // The runtime code decomposes Hangul syllables on the fly, |
| // with recursive processing but without making the Jamo pieces visible for matching. |
| // It does not work with certain types of contextual mappings. |
| int nfdLength = nfdString.length(); |
| if(nfdLength >= 2) { |
| char c = nfdString.charAt(0); |
| if(Hangul.isJamoL(c) || Hangul.isJamoV(c)) { |
| // While handling a Hangul syllable, contractions starting with Jamo L or V |
| // would not see the following Jamo of that syllable. |
| throw new UnsupportedOperationException( |
| "contractions starting with conjoining Jamo L or V not supported"); |
| } |
| c = nfdString.charAt(nfdLength - 1); |
| if(Hangul.isJamoL(c) || |
| (Hangul.isJamoV(c) && Hangul.isJamoL(nfdString.charAt(nfdLength - 2)))) { |
| // A contraction ending with Jamo L or L+V would require |
| // generating Hangul syllables in addTailComposites() (588 for a Jamo L), |
| // or decomposing a following Hangul syllable on the fly, during contraction matching. |
| throw new UnsupportedOperationException( |
| "contractions ending with conjoining Jamo L or L+V not supported"); |
| } |
| // A Hangul syllable completely inside a contraction is ok. |
| } |
| // Note: If there is a prefix, then the parser checked that |
| // both the prefix and the string begin with NFC boundaries (not Jamo V or T). |
| // Therefore: prefix.isEmpty() || !isJamoVOrT(nfdString.charAt(0)) |
| // (While handling a Hangul syllable, prefixes on Jamo V or T |
| // would not see the previous Jamo of that syllable.) |
| |
| if(strength != Collator.IDENTICAL) { |
| // Find the node index after which we insert the new tailored node. |
| int index = findOrInsertNodeForCEs(strength); |
| assert(cesLength > 0); |
| long ce = ces[cesLength - 1]; |
| if(strength == Collator.PRIMARY && !isTempCE(ce) && (ce >>> 32) == 0) { |
| // There is no primary gap between ignorables and the space-first-primary. |
| throw new UnsupportedOperationException( |
| "tailoring primary after ignorables not supported"); |
| } |
| if(strength == Collator.QUATERNARY && ce == 0) { |
| // The CE data structure does not support non-zero quaternary weights |
| // on tertiary ignorables. |
| throw new UnsupportedOperationException( |
| "tailoring quaternary after tertiary ignorables not supported"); |
| } |
| // Insert the new tailored node. |
| index = insertTailoredNodeAfter(index, strength); |
| // Strength of the temporary CE: |
| // The new relation may yield a stronger CE but not a weaker one. |
| int tempStrength = ceStrength(ce); |
| if(strength < tempStrength) { tempStrength = strength; } |
| ces[cesLength - 1] = tempCEFromIndexAndStrength(index, tempStrength); |
| } |
| |
| setCaseBits(nfdString); |
| |
| int cesLengthBeforeExtension = cesLength; |
| if(extension.length() != 0) { |
| String nfdExtension = nfd.normalize(extension); |
| cesLength = dataBuilder.getCEs(nfdExtension, ces, cesLength); |
| if(cesLength > Collation.MAX_EXPANSION_LENGTH) { |
| throw new IllegalArgumentException( |
| "extension string adds too many collation elements (more than 31 total)"); |
| } |
| } |
| int ce32 = Collation.UNASSIGNED_CE32; |
| if((!nfdPrefix.contentEquals(prefix) || !nfdString.contentEquals(str)) && |
| !ignorePrefix(prefix) && !ignoreString(str)) { |
| // Map from the original input to the CEs. |
| // We do this in case the canonical closure is incomplete, |
| // so that it is possible to explicitly provide the missing mappings. |
| ce32 = addIfDifferent(prefix, str, ces, cesLength, ce32); |
| } |
| addWithClosure(nfdPrefix, nfdString, ces, cesLength, ce32); |
| cesLength = cesLengthBeforeExtension; |
| } |
| |
| /** |
| * Picks one of the current CEs and finds or inserts a node in the graph |
| * for the CE + strength. |
| */ |
| private int findOrInsertNodeForCEs(int strength) { |
| assert(Collator.PRIMARY <= strength && strength <= Collator.QUATERNARY); |
| |
| // Find the last CE that is at least as "strong" as the requested difference. |
| // Note: Stronger is smaller (Collator.PRIMARY=0). |
| long ce; |
| for(;; --cesLength) { |
| if(cesLength == 0) { |
| ce = ces[0] = 0; |
| cesLength = 1; |
| break; |
| } else { |
| ce = ces[cesLength - 1]; |
| } |
| if(ceStrength(ce) <= strength) { break; } |
| } |
| |
| if(isTempCE(ce)) { |
| // No need to findCommonNode() here for lower levels |
| // because insertTailoredNodeAfter() will do that anyway. |
| return indexFromTempCE(ce); |
| } |
| |
| // root CE |
| if((int)(ce >>> 56) == Collation.UNASSIGNED_IMPLICIT_BYTE) { |
| throw new UnsupportedOperationException( |
| "tailoring relative to an unassigned code point not supported"); |
| } |
| return findOrInsertNodeForRootCE(ce, strength); |
| } |
| |
| private int findOrInsertNodeForRootCE(long ce, int strength) { |
| assert((int)(ce >>> 56) != Collation.UNASSIGNED_IMPLICIT_BYTE); |
| |
| // Find or insert the node for each of the root CE's weights, |
| // down to the requested level/strength. |
| // Root CEs must have common=zero quaternary weights (for which we never insert any nodes). |
| assert((ce & 0xc0) == 0); |
| int index = findOrInsertNodeForPrimary(ce >>> 32); |
| if(strength >= Collator.SECONDARY) { |
| int lower32 = (int)ce; |
| index = findOrInsertWeakNode(index, lower32 >>> 16, Collator.SECONDARY); |
| if(strength >= Collator.TERTIARY) { |
| index = findOrInsertWeakNode(index, lower32 & Collation.ONLY_TERTIARY_MASK, |
| Collator.TERTIARY); |
| } |
| } |
| return index; |
| } |
| |
| /** |
| * Like Java Collections.binarySearch(List, key, Comparator). |
| * |
| * @return the index>=0 where the item was found, |
| * or the index<0 for inserting the string at ~index in sorted order |
| * (index into rootPrimaryIndexes) |
| */ |
| private static final int binarySearchForRootPrimaryNode( |
| int[] rootPrimaryIndexes, int length, long[] nodes, long p) { |
| if(length == 0) { return ~0; } |
| int start = 0; |
| int limit = length; |
| for (;;) { |
| int i = (int)(((long)start + (long)limit) / 2); |
| long node = nodes[rootPrimaryIndexes[i]]; |
| long nodePrimary = node >>> 32; // weight32FromNode(node) |
| if (p == nodePrimary) { |
| return i; |
| } else if (p < nodePrimary) { |
| if (i == start) { |
| return ~start; // insert s before i |
| } |
| limit = i; |
| } else { |
| if (i == start) { |
| return ~(start + 1); // insert s after i |
| } |
| start = i; |
| } |
| } |
| } |
| |
| /** Finds or inserts the node for a root CE's primary weight. */ |
| private int findOrInsertNodeForPrimary(long p) { |
| int rootIndex = binarySearchForRootPrimaryNode( |
| rootPrimaryIndexes.getBuffer(), rootPrimaryIndexes.size(), nodes.getBuffer(), p); |
| if(rootIndex >= 0) { |
| return rootPrimaryIndexes.elementAti(rootIndex); |
| } else { |
| // Start a new list of nodes with this primary. |
| int index = nodes.size(); |
| nodes.addElement(nodeFromWeight32(p)); |
| rootPrimaryIndexes.insertElementAt(index, ~rootIndex); |
| return index; |
| } |
| } |
| |
| /** Finds or inserts the node for a secondary or tertiary weight. */ |
| private int findOrInsertWeakNode(int index, int weight16, int level) { |
| assert(0 <= index && index < nodes.size()); |
| assert(Collator.SECONDARY <= level && level <= Collator.TERTIARY); |
| |
| if(weight16 == Collation.COMMON_WEIGHT16) { |
| return findCommonNode(index, level); |
| } |
| |
| // If this will be the first below-common weight for the parent node, |
| // then we will also need to insert a common weight after it. |
| long node = nodes.elementAti(index); |
| assert(strengthFromNode(node) < level); // parent node is stronger |
| if(weight16 != 0 && weight16 < Collation.COMMON_WEIGHT16) { |
| int hasThisLevelBefore = level == Collator.SECONDARY ? HAS_BEFORE2 : HAS_BEFORE3; |
| if((node & hasThisLevelBefore) == 0) { |
| // The parent node has an implied level-common weight. |
| long commonNode = |
| nodeFromWeight16(Collation.COMMON_WEIGHT16) | nodeFromStrength(level); |
| if(level == Collator.SECONDARY) { |
| // Move the HAS_BEFORE3 flag from the parent node |
| // to the new secondary common node. |
| commonNode |= node & HAS_BEFORE3; |
| node &= ~(long)HAS_BEFORE3; |
| } |
| nodes.setElementAt(node | hasThisLevelBefore, index); |
| // Insert below-common-weight node. |
| int nextIndex = nextIndexFromNode(node); |
| node = nodeFromWeight16(weight16) | nodeFromStrength(level); |
| index = insertNodeBetween(index, nextIndex, node); |
| // Insert common-weight node. |
| insertNodeBetween(index, nextIndex, commonNode); |
| // Return index of below-common-weight node. |
| return index; |
| } |
| } |
| |
| // Find the root CE's weight for this level. |
| // Postpone insertion if not found: |
| // Insert the new root node before the next stronger node, |
| // or before the next root node with the same strength and a larger weight. |
| int nextIndex; |
| while((nextIndex = nextIndexFromNode(node)) != 0) { |
| node = nodes.elementAti(nextIndex); |
| int nextStrength = strengthFromNode(node); |
| if(nextStrength <= level) { |
| // Insert before a stronger node. |
| if(nextStrength < level) { break; } |
| // nextStrength == level |
| if(!isTailoredNode(node)) { |
| int nextWeight16 = weight16FromNode(node); |
| if(nextWeight16 == weight16) { |
| // Found the node for the root CE up to this level. |
| return nextIndex; |
| } |
| // Insert before a node with a larger same-strength weight. |
| if(nextWeight16 > weight16) { break; } |
| } |
| } |
| // Skip the next node. |
| index = nextIndex; |
| } |
| node = nodeFromWeight16(weight16) | nodeFromStrength(level); |
| return insertNodeBetween(index, nextIndex, node); |
| } |
| |
| /** |
| * Makes and inserts a new tailored node into the list, after the one at index. |
| * Skips over nodes of weaker strength to maintain collation order |
| * ("postpone insertion"). |
| * @return the new node's index |
| */ |
| private int insertTailoredNodeAfter(int index, int strength) { |
| assert(0 <= index && index < nodes.size()); |
| if(strength >= Collator.SECONDARY) { |
| index = findCommonNode(index, Collator.SECONDARY); |
| if(strength >= Collator.TERTIARY) { |
| index = findCommonNode(index, Collator.TERTIARY); |
| } |
| } |
| // Postpone insertion: |
| // Insert the new node before the next one with a strength at least as strong. |
| long node = nodes.elementAti(index); |
| int nextIndex; |
| while((nextIndex = nextIndexFromNode(node)) != 0) { |
| node = nodes.elementAti(nextIndex); |
| if(strengthFromNode(node) <= strength) { break; } |
| // Skip the next node which has a weaker (larger) strength than the new one. |
| index = nextIndex; |
| } |
| node = IS_TAILORED | nodeFromStrength(strength); |
| return insertNodeBetween(index, nextIndex, node); |
| } |
| |
| /** |
| * Inserts a new node into the list, between list-adjacent items. |
| * The node's previous and next indexes must not be set yet. |
| * @return the new node's index |
| */ |
| private int insertNodeBetween(int index, int nextIndex, long node) { |
| assert(previousIndexFromNode(node) == 0); |
| assert(nextIndexFromNode(node) == 0); |
| assert(nextIndexFromNode(nodes.elementAti(index)) == nextIndex); |
| // Append the new node and link it to the existing nodes. |
| int newIndex = nodes.size(); |
| node |= nodeFromPreviousIndex(index) | nodeFromNextIndex(nextIndex); |
| nodes.addElement(node); |
| // nodes[index].nextIndex = newIndex |
| node = nodes.elementAti(index); |
| nodes.setElementAt(changeNodeNextIndex(node, newIndex), index); |
| // nodes[nextIndex].previousIndex = newIndex |
| if(nextIndex != 0) { |
| node = nodes.elementAti(nextIndex); |
| nodes.setElementAt(changeNodePreviousIndex(node, newIndex), nextIndex); |
| } |
| return newIndex; |
| } |
| |
| /** |
| * Finds the node which implies or contains a common=05 weight of the given strength |
| * (secondary or tertiary), if the current node is stronger. |
| * Skips weaker nodes and tailored nodes if the current node is stronger |
| * and is followed by an explicit-common-weight node. |
| * Always returns the input index if that node is no stronger than the given strength. |
| */ |
| private int findCommonNode(int index, int strength) { |
| assert(Collator.SECONDARY <= strength && strength <= Collator.TERTIARY); |
| long node = nodes.elementAti(index); |
| if(strengthFromNode(node) >= strength) { |
| // The current node is no stronger. |
| return index; |
| } |
| if(strength == Collator.SECONDARY ? !nodeHasBefore2(node) : !nodeHasBefore3(node)) { |
| // The current node implies the strength-common weight. |
| return index; |
| } |
| index = nextIndexFromNode(node); |
| node = nodes.elementAti(index); |
| assert(!isTailoredNode(node) && strengthFromNode(node) == strength && |
| weight16FromNode(node) < Collation.COMMON_WEIGHT16); |
| // Skip to the explicit common node. |
| do { |
| index = nextIndexFromNode(node); |
| node = nodes.elementAti(index); |
| assert(strengthFromNode(node) >= strength); |
| } while(isTailoredNode(node) || strengthFromNode(node) > strength || |
| weight16FromNode(node) < Collation.COMMON_WEIGHT16); |
| assert(weight16FromNode(node) == Collation.COMMON_WEIGHT16); |
| return index; |
| } |
| |
| private void setCaseBits(CharSequence nfdString) { |
| int numTailoredPrimaries = 0; |
| for(int i = 0; i < cesLength; ++i) { |
| if(ceStrength(ces[i]) == Collator.PRIMARY) { ++numTailoredPrimaries; } |
| } |
| // We should not be able to get too many case bits because |
| // cesLength<=31==MAX_EXPANSION_LENGTH. |
| // 31 pairs of case bits fit into an long without setting its sign bit. |
| assert(numTailoredPrimaries <= 31); |
| |
| long cases = 0; |
| if(numTailoredPrimaries > 0) { |
| CharSequence s = nfdString; |
| UTF16CollationIterator baseCEs = new UTF16CollationIterator(baseData, false, s, 0); |
| int baseCEsLength = baseCEs.fetchCEs() - 1; |
| assert(baseCEsLength >= 0 && baseCEs.getCE(baseCEsLength) == Collation.NO_CE); |
| |
| int lastCase = 0; |
| int numBasePrimaries = 0; |
| for(int i = 0; i < baseCEsLength; ++i) { |
| long ce = baseCEs.getCE(i); |
| if((ce >>> 32) != 0) { |
| ++numBasePrimaries; |
| int c = ((int)ce >> 14) & 3; |
| assert(c == 0 || c == 2); // lowercase or uppercase, no mixed case in any base CE |
| if(numBasePrimaries < numTailoredPrimaries) { |
| cases |= (long)c << ((numBasePrimaries - 1) * 2); |
| } else if(numBasePrimaries == numTailoredPrimaries) { |
| lastCase = c; |
| } else if(c != lastCase) { |
| // There are more base primary CEs than tailored primaries. |
| // Set mixed case if the case bits of the remainder differ. |
| lastCase = 1; |
| // Nothing more can change. |
| break; |
| } |
| } |
| } |
| if(numBasePrimaries >= numTailoredPrimaries) { |
| cases |= (long)lastCase << ((numTailoredPrimaries - 1) * 2); |
| } |
| } |
| |
| for(int i = 0; i < cesLength; ++i) { |
| long ce = ces[i] & 0xffffffffffff3fffL; // clear old case bits |
| int strength = ceStrength(ce); |
| if(strength == Collator.PRIMARY) { |
| ce |= (cases & 3) << 14; |
| cases >>>= 2; |
| } else if(strength == Collator.TERTIARY) { |
| // Tertiary CEs must have uppercase bits. |
| // See the LDML spec, and comments in class CollationCompare. |
| ce |= 0x8000; |
| } |
| // Tertiary ignorable CEs must have 0 case bits. |
| // We set 0 case bits for secondary CEs too |
| // since currently only U+0345 is cased and maps to a secondary CE, |
| // and it is lowercase. Other secondaries are uncased. |
| // See [[:Cased:]&[:uca1=:]] where uca1 queries the root primary weight. |
| ces[i] = ce; |
| } |
| } |
| |
| /** Implements CollationRuleParser.Sink. */ |
| @Override |
| void suppressContractions(UnicodeSet set) { |
| dataBuilder.suppressContractions(set); |
| } |
| |
| /** Implements CollationRuleParser.Sink. */ |
| @Override |
| void optimize(UnicodeSet set) { |
| optimizeSet.addAll(set); |
| } |
| |
| /** |
| * Adds the mapping and its canonical closure. |
| * Takes ce32=dataBuilder.encodeCEs(...) so that the data builder |
| * need not re-encode the CEs multiple times. |
| */ |
| private int addWithClosure(CharSequence nfdPrefix, CharSequence nfdString, |
| long[] newCEs, int newCEsLength, int ce32) { |
| // Map from the NFD input to the CEs. |
| ce32 = addIfDifferent(nfdPrefix, nfdString, newCEs, newCEsLength, ce32); |
| ce32 = addOnlyClosure(nfdPrefix, nfdString, newCEs, newCEsLength, ce32); |
| addTailComposites(nfdPrefix, nfdString); |
| return ce32; |
| } |
| |
| // ICU-22517 |
| // This constant defines a limit for the addOnlyClosure to return |
| // error, to avoid taking a long time for canonical closure expansion. |
| // Please let us know if you have a reasonable use case that needed |
| // for a practical Collation rule that needs to increase this limit. |
| // This value is needed for compiling a rule with eight Hangul syllables such as |
| // "&a=b쫊쫊쫊쫊쫊쫊쫊쫊" without error, which should be more than realistic |
| // usage. |
| static private int kClosureLoopLimit = 6560; |
| |
| private int addOnlyClosure(CharSequence nfdPrefix, CharSequence nfdString, |
| long[] newCEs, int newCEsLength, int ce32) { |
| // Map from canonically equivalent input to the CEs. (But not from the all-NFD input.) |
| // TODO: make CanonicalIterator work with CharSequence, or maybe change arguments here to String |
| int loop = 0; |
| if(nfdPrefix.length() == 0) { |
| CanonicalIterator stringIter = new CanonicalIterator(nfdString.toString()); |
| String prefix = ""; |
| for(;;) { |
| String str = stringIter.next(); |
| if(str == null) { break; } |
| if(ignoreString(str) || str.contentEquals(nfdString)) { continue; } |
| if (loop++ > kClosureLoopLimit) { |
| throw new ICUInputTooLongException("Too many closure"); |
| } |
| ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32); |
| } |
| } else { |
| CanonicalIterator prefixIter = new CanonicalIterator(nfdPrefix.toString()); |
| CanonicalIterator stringIter = new CanonicalIterator(nfdString.toString()); |
| for(;;) { |
| String prefix = prefixIter.next(); |
| if(prefix == null) { break; } |
| if(ignorePrefix(prefix)) { continue; } |
| boolean samePrefix = prefix.contentEquals(nfdPrefix); |
| for(;;) { |
| String str = stringIter.next(); |
| if(str == null) { break; } |
| if(ignoreString(str) || (samePrefix && str.contentEquals(nfdString))) { continue; } |
| if (loop++ > kClosureLoopLimit) { |
| throw new ICUInputTooLongException("Too many closure"); |
| } |
| ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32); |
| } |
| stringIter.reset(); |
| } |
| } |
| return ce32; |
| } |
| |
| private void addTailComposites(CharSequence nfdPrefix, CharSequence nfdString) { |
| // Look for the last starter in the NFD string. |
| int lastStarter; |
| int indexAfterLastStarter = nfdString.length(); |
| for(;;) { |
| if(indexAfterLastStarter == 0) { return; } // no starter at all |
| lastStarter = Character.codePointBefore(nfdString, indexAfterLastStarter); |
| if(nfd.getCombiningClass(lastStarter) == 0) { break; } |
| indexAfterLastStarter -= Character.charCount(lastStarter); |
| } |
| // No closure to Hangul syllables since we decompose them on the fly. |
| if(Hangul.isJamoL(lastStarter)) { return; } |
| |
| // Are there any composites whose decomposition starts with the lastStarter? |
| // Note: Normalizer2Impl does not currently return start sets for NFC_QC=Maybe characters. |
| // We might find some more equivalent mappings here if it did. |
| UnicodeSet composites = new UnicodeSet(); |
| if(!nfcImpl.getCanonStartSet(lastStarter, composites)) { return; } |
| |
| StringBuilder newNFDString = new StringBuilder(), newString = new StringBuilder(); |
| long[] newCEs = new long[Collation.MAX_EXPANSION_LENGTH]; |
| UnicodeSetIterator iter = new UnicodeSetIterator(composites); |
| while(iter.next()) { |
| assert(iter.codepoint != UnicodeSetIterator.IS_STRING); |
| int composite = iter.codepoint; |
| String decomp = nfd.getDecomposition(composite); |
| if(!mergeCompositeIntoString(nfdString, indexAfterLastStarter, composite, decomp, |
| newNFDString, newString)) { |
| continue; |
| } |
| int newCEsLength = dataBuilder.getCEs(nfdPrefix, newNFDString, newCEs, 0); |
| if(newCEsLength > Collation.MAX_EXPANSION_LENGTH) { |
| // Ignore mappings that we cannot store. |
| continue; |
| } |
| // Note: It is possible that the newCEs do not make use of the mapping |
| // for which we are adding the tail composites, in which case we might be adding |
| // unnecessary mappings. |
| // For example, when we add tail composites for ae^ (^=combining circumflex), |
| // UCA discontiguous-contraction matching does not find any matches |
| // for ae_^ (_=any combining diacritic below) *unless* there is also |
| // a contraction mapping for ae. |
| // Thus, if there is no ae contraction, then the ae^ mapping is ignored |
| // while fetching the newCEs for ae_^. |
| // TODO: Try to detect this effectively. |
| // (Alternatively, print a warning when prefix contractions are missing.) |
| |
| // We do not need an explicit mapping for the NFD strings. |
| // It is fine if the NFD input collates like this via a sequence of mappings. |
| // It also saves a little bit of space, and may reduce the set of characters with contractions. |
| int ce32 = addIfDifferent(nfdPrefix, newString, |
| newCEs, newCEsLength, Collation.UNASSIGNED_CE32); |
| if(ce32 != Collation.UNASSIGNED_CE32) { |
| // was different, was added |
| addOnlyClosure(nfdPrefix, newNFDString, newCEs, newCEsLength, ce32); |
| } |
| } |
| } |
| |
| private boolean mergeCompositeIntoString(CharSequence nfdString, int indexAfterLastStarter, |
| int composite, CharSequence decomp, |
| StringBuilder newNFDString, StringBuilder newString) { |
| assert(Character.codePointBefore(nfdString, indexAfterLastStarter) == |
| Character.codePointAt(decomp, 0)); |
| int lastStarterLength = Character.offsetByCodePoints(decomp, 0, 1); |
| if(lastStarterLength == decomp.length()) { |
| // Singleton decompositions should be found by addWithClosure() |
| // and the CanonicalIterator, so we can ignore them here. |
| return false; |
| } |
| if(equalSubSequences(nfdString, indexAfterLastStarter, decomp, lastStarterLength)) { |
| // same strings, nothing new to be found here |
| return false; |
| } |
| |
| // Make new FCD strings that combine a composite, or its decomposition, |
| // into the nfdString's last starter and the combining marks following it. |
| // Make an NFD version, and a version with the composite. |
| newNFDString.setLength(0); |
| newNFDString.append(nfdString, 0, indexAfterLastStarter); |
| newString.setLength(0); |
| newString.append(nfdString, 0, indexAfterLastStarter - lastStarterLength) |
| .appendCodePoint(composite); |
| |
| // The following is related to discontiguous contraction matching, |
| // but builds only FCD strings (or else returns false). |
| int sourceIndex = indexAfterLastStarter; |
| int decompIndex = lastStarterLength; |
| // Small optimization: We keep the source character across loop iterations |
| // because we do not always consume it, |
| // and then need not fetch it again nor look up its combining class again. |
| int sourceChar = Collation.SENTINEL_CP; |
| // The cc variables need to be declared before the loop so that at the end |
| // they are set to the last combining classes seen. |
| int sourceCC = 0; |
| int decompCC = 0; |
| for(;;) { |
| if(sourceChar < 0) { |
| if(sourceIndex >= nfdString.length()) { break; } |
| sourceChar = Character.codePointAt(nfdString, sourceIndex); |
| sourceCC = nfd.getCombiningClass(sourceChar); |
| assert(sourceCC != 0); |
| } |
| // We consume a decomposition character in each iteration. |
| if(decompIndex >= decomp.length()) { break; } |
| int decompChar = Character.codePointAt(decomp, decompIndex); |
| decompCC = nfd.getCombiningClass(decompChar); |
| // Compare the two characters and their combining classes. |
| if(decompCC == 0) { |
| // Unable to merge because the source contains a non-zero combining mark |
| // but the composite's decomposition contains another starter. |
| // The strings would not be equivalent. |
| return false; |
| } else if(sourceCC < decompCC) { |
| // Composite + sourceChar would not be FCD. |
| return false; |
| } else if(decompCC < sourceCC) { |
| newNFDString.appendCodePoint(decompChar); |
| decompIndex += Character.charCount(decompChar); |
| } else if(decompChar != sourceChar) { |
| // Blocked because same combining class. |
| return false; |
| } else { // match: decompChar == sourceChar |
| newNFDString.appendCodePoint(decompChar); |
| decompIndex += Character.charCount(decompChar); |
| sourceIndex += Character.charCount(decompChar); |
| sourceChar = Collation.SENTINEL_CP; |
| } |
| } |
| // We are at the end of at least one of the two inputs. |
| if(sourceChar >= 0) { // more characters from nfdString but not from decomp |
| if(sourceCC < decompCC) { |
| // Appending the next source character to the composite would not be FCD. |
| return false; |
| } |
| newNFDString.append(nfdString, sourceIndex, nfdString.length()); |
| newString.append(nfdString, sourceIndex, nfdString.length()); |
| } else if(decompIndex < decomp.length()) { // more characters from decomp, not from nfdString |
| newNFDString.append(decomp, decompIndex, decomp.length()); |
| } |
| assert(nfd.isNormalized(newNFDString)); |
| assert(fcd.isNormalized(newString)); |
| assert(nfd.normalize(newString).equals(newNFDString.toString())); // canonically equivalent |
| return true; |
| } |
| |
| private boolean equalSubSequences(CharSequence left, int leftStart, CharSequence right, int rightStart) { |
| // C++ UnicodeString::compare(leftStart, 0x7fffffff, right, rightStart, 0x7fffffff) == 0 |
| int leftLength = left.length(); |
| if((leftLength - leftStart) != (right.length() - rightStart)) { return false; } |
| while(leftStart < leftLength) { |
| if(left.charAt(leftStart++) != right.charAt(rightStart++)) { |
| return false; |
| } |
| } |
| return true; |
| } |
| private boolean ignorePrefix(CharSequence s) { |
| // Do not map non-FCD prefixes. |
| return !isFCD(s); |
| } |
| private boolean ignoreString(CharSequence s) { |
| // Do not map non-FCD strings. |
| // Do not map strings that start with Hangul syllables: We decompose those on the fly. |
| return !isFCD(s) || Hangul.isHangul(s.charAt(0)); |
| } |
| private boolean isFCD(CharSequence s) { |
| return fcd.isNormalized(s); |
| } |
| |
| private static final UnicodeSet COMPOSITES = new UnicodeSet("[:NFD_QC=N:]"); |
| static { |
| // Hangul is decomposed on the fly during collation. |
| COMPOSITES.remove(Hangul.HANGUL_BASE, Hangul.HANGUL_END); |
| } |
| |
| private void closeOverComposites() { |
| String prefix = ""; // empty |
| UnicodeSetIterator iter = new UnicodeSetIterator(COMPOSITES); |
| while(iter.next()) { |
| assert(iter.codepoint != UnicodeSetIterator.IS_STRING); |
| String nfdString = nfd.getDecomposition(iter.codepoint); |
| cesLength = dataBuilder.getCEs(nfdString, ces, 0); |
| if(cesLength > Collation.MAX_EXPANSION_LENGTH) { |
| // Too many CEs from the decomposition (unusual), ignore this composite. |
| // We could add a capacity parameter to getCEs() and reallocate if necessary. |
| // However, this can only really happen in contrived cases. |
| continue; |
| } |
| String composite = iter.getString(); |
| addIfDifferent(prefix, composite, ces, cesLength, Collation.UNASSIGNED_CE32); |
| } |
| } |
| |
| private int addIfDifferent(CharSequence prefix, CharSequence str, |
| long[] newCEs, int newCEsLength, int ce32) { |
| long[] oldCEs = new long[Collation.MAX_EXPANSION_LENGTH]; |
| int oldCEsLength = dataBuilder.getCEs(prefix, str, oldCEs, 0); |
| if(!sameCEs(newCEs, newCEsLength, oldCEs, oldCEsLength)) { |
| if(ce32 == Collation.UNASSIGNED_CE32) { |
| ce32 = dataBuilder.encodeCEs(newCEs, newCEsLength); |
| } |
| dataBuilder.addCE32(prefix, str, ce32); |
| } |
| return ce32; |
| } |
| |
| private static boolean sameCEs(long[] ces1, int ces1Length, |
| long[] ces2, int ces2Length) { |
| if(ces1Length != ces2Length) { |
| return false; |
| } |
| assert(ces1Length <= Collation.MAX_EXPANSION_LENGTH); |
| for(int i = 0; i < ces1Length; ++i) { |
| if(ces1[i] != ces2[i]) { return false; } |
| } |
| return true; |
| } |
| |
| private static final int alignWeightRight(int w) { |
| if(w != 0) { |
| while((w & 0xff) == 0) { w >>>= 8; } |
| } |
| return w; |
| } |
| |
| /** |
| * Walks the tailoring graph and overwrites tailored nodes with new CEs. |
| * After this, the graph is destroyed. |
| * The nodes array can then be used only as a source of tailored CEs. |
| */ |
| private void makeTailoredCEs() { |
| CollationWeights primaries = new CollationWeights(); |
| CollationWeights secondaries = new CollationWeights(); |
| CollationWeights tertiaries = new CollationWeights(); |
| long[] nodesArray = nodes.getBuffer(); |
| if(DEBUG) { |
| System.out.println("\nCollationBuilder.makeTailoredCEs()"); |
| } |
| |
| for(int rpi = 0; rpi < rootPrimaryIndexes.size(); ++rpi) { |
| int i = rootPrimaryIndexes.elementAti(rpi); |
| long node = nodesArray[i]; |
| long p = weight32FromNode(node); |
| int s = p == 0 ? 0 : Collation.COMMON_WEIGHT16; |
| int t = s; |
| int q = 0; |
| boolean pIsTailored = false; |
| boolean sIsTailored = false; |
| boolean tIsTailored = false; |
| if(DEBUG) { |
| System.out.printf("\nprimary %x\n", alignWeightRight((int)p)); |
| } |
| int pIndex = p == 0 ? 0 : rootElements.findPrimary(p); |
| int nextIndex = nextIndexFromNode(node); |
| while(nextIndex != 0) { |
| i = nextIndex; |
| node = nodesArray[i]; |
| nextIndex = nextIndexFromNode(node); |
| int strength = strengthFromNode(node); |
| if(strength == Collator.QUATERNARY) { |
| assert(isTailoredNode(node)); |
| if(DEBUG) { |
| System.out.print(" quat+ "); |
| } |
| if(q == 3) { |
| // C++ U_BUFFER_OVERFLOW_ERROR |
| throw new UnsupportedOperationException("quaternary tailoring gap too small"); |
| } |
| ++q; |
| } else { |
| if(strength == Collator.TERTIARY) { |
| if(isTailoredNode(node)) { |
| if(DEBUG) { |
| System.out.print(" ter+ "); |
| } |
| if(!tIsTailored) { |
| // First tailored tertiary node for [p, s]. |
| int tCount = countTailoredNodes(nodesArray, nextIndex, |
| Collator.TERTIARY) + 1; |
| int tLimit; |
| if(t == 0) { |
| // Gap at the beginning of the tertiary CE range. |
| t = rootElements.getTertiaryBoundary() - 0x100; |
| tLimit = (int)rootElements.getFirstTertiaryCE() & Collation.ONLY_TERTIARY_MASK; |
| } else if(!pIsTailored && !sIsTailored) { |
| // p and s are root weights. |
| tLimit = rootElements.getTertiaryAfter(pIndex, s, t); |
| } else if(t == Collation.BEFORE_WEIGHT16) { |
| tLimit = Collation.COMMON_WEIGHT16; |
| } else { |
| // [p, s] is tailored. |
| assert(t == Collation.COMMON_WEIGHT16); |
| tLimit = rootElements.getTertiaryBoundary(); |
| } |
| assert(tLimit == 0x4000 || (tLimit & ~Collation.ONLY_TERTIARY_MASK) == 0); |
| tertiaries.initForTertiary(); |
| if(!tertiaries.allocWeights(t, tLimit, tCount)) { |
| // C++ U_BUFFER_OVERFLOW_ERROR |
| throw new UnsupportedOperationException("tertiary tailoring gap too small"); |
| } |
| tIsTailored = true; |
| } |
| t = (int)tertiaries.nextWeight(); |
| assert(t != 0xffffffff); |
| } else { |
| t = weight16FromNode(node); |
| tIsTailored = false; |
| if(DEBUG) { |
| System.out.printf(" ter %x\n", alignWeightRight(t)); |
| } |
| } |
| } else { |
| if(strength == Collator.SECONDARY) { |
| if(isTailoredNode(node)) { |
| if(DEBUG) { |
| System.out.print(" sec+ "); |
| } |
| if(!sIsTailored) { |
| // First tailored secondary node for p. |
| int sCount = countTailoredNodes(nodesArray, nextIndex, |
| Collator.SECONDARY) + 1; |
| int sLimit; |
| if(s == 0) { |
| // Gap at the beginning of the secondary CE range. |
| s = rootElements.getSecondaryBoundary() - 0x100; |
| sLimit = (int)(rootElements.getFirstSecondaryCE() >> 16); |
| } else if(!pIsTailored) { |
| // p is a root primary. |
| sLimit = rootElements.getSecondaryAfter(pIndex, s); |
| } else if(s == Collation.BEFORE_WEIGHT16) { |
| sLimit = Collation.COMMON_WEIGHT16; |
| } else { |
| // p is a tailored primary. |
| assert(s == Collation.COMMON_WEIGHT16); |
| sLimit = rootElements.getSecondaryBoundary(); |
| } |
| if(s == Collation.COMMON_WEIGHT16) { |
| // Do not tailor into the getSortKey() range of |
| // compressed common secondaries. |
| s = rootElements.getLastCommonSecondary(); |
| } |
| secondaries.initForSecondary(); |
| if(!secondaries.allocWeights(s, sLimit, sCount)) { |
| // C++ U_BUFFER_OVERFLOW_ERROR |
| if(DEBUG) { |
| System.out.printf("!secondaries.allocWeights(%x, %x, sCount=%d)\n", |
| alignWeightRight(s), alignWeightRight(sLimit), |
| alignWeightRight(sCount)); |
| } |
| throw new UnsupportedOperationException("secondary tailoring gap too small"); |
| } |
| sIsTailored = true; |
| } |
| s = (int)secondaries.nextWeight(); |
| assert(s != 0xffffffff); |
| } else { |
| s = weight16FromNode(node); |
| sIsTailored = false; |
| if(DEBUG) { |
| System.out.printf(" sec %x\n", alignWeightRight(s)); |
| } |
| } |
| } else /* Collator.PRIMARY */ { |
| assert(isTailoredNode(node)); |
| if(DEBUG) { |
| System.out.print("pri+ "); |
| } |
| if(!pIsTailored) { |
| // First tailored primary node in this list. |
| int pCount = countTailoredNodes(nodesArray, nextIndex, |
| Collator.PRIMARY) + 1; |
| boolean isCompressible = baseData.isCompressiblePrimary(p); |
| long pLimit = |
| rootElements.getPrimaryAfter(p, pIndex, isCompressible); |
| primaries.initForPrimary(isCompressible); |
| if(!primaries.allocWeights(p, pLimit, pCount)) { |
| // C++ U_BUFFER_OVERFLOW_ERROR // TODO: introduce a more specific UErrorCode? |
| throw new UnsupportedOperationException("primary tailoring gap too small"); |
| } |
| pIsTailored = true; |
| } |
| p = primaries.nextWeight(); |
| assert(p != 0xffffffffL); |
| s = Collation.COMMON_WEIGHT16; |
| sIsTailored = false; |
| } |
| t = s == 0 ? 0 : Collation.COMMON_WEIGHT16; |
| tIsTailored = false; |
| } |
| q = 0; |
| } |
| if(isTailoredNode(node)) { |
| nodesArray[i] = Collation.makeCE(p, s, t, q); |
| if(DEBUG) { |
| System.out.printf("%016x\n", nodesArray[i]); |
| } |
| } |
| } |
| } |
| } |
| |
| /** |
| * Counts the tailored nodes of the given strength up to the next node |
| * which is either stronger or has an explicit weight of this strength. |
| */ |
| private static int countTailoredNodes(long[] nodesArray, int i, int strength) { |
| int count = 0; |
| for(;;) { |
| if(i == 0) { break; } |
| long node = nodesArray[i]; |
| if(strengthFromNode(node) < strength) { break; } |
| if(strengthFromNode(node) == strength) { |
| if(isTailoredNode(node)) { |
| ++count; |
| } else { |
| break; |
| } |
| } |
| i = nextIndexFromNode(node); |
| } |
| return count; |
| } |
| |
| private static final class CEFinalizer implements CollationDataBuilder.CEModifier { |
| CEFinalizer(long[] ces) { |
| finalCEs = ces; |
| } |
| @Override |
| public long modifyCE32(int ce32) { |
| assert(!Collation.isSpecialCE32(ce32)); |
| if(CollationBuilder.isTempCE32(ce32)) { |
| // retain case bits |
| return finalCEs[CollationBuilder.indexFromTempCE32(ce32)] | ((ce32 & 0xc0) << 8); |
| } else { |
| return Collation.NO_CE; |
| } |
| } |
| @Override |
| public long modifyCE(long ce) { |
| if(CollationBuilder.isTempCE(ce)) { |
| // retain case bits |
| return finalCEs[CollationBuilder.indexFromTempCE(ce)] | (ce & 0xc000); |
| } else { |
| return Collation.NO_CE; |
| } |
| } |
| |
| private long[] finalCEs; |
| }; |
| |
| /** Replaces temporary CEs with the final CEs they point to. */ |
| private void finalizeCEs() { |
| CollationDataBuilder newBuilder = new CollationDataBuilder(); |
| newBuilder.initForTailoring(baseData); |
| CEFinalizer finalizer = new CEFinalizer(nodes.getBuffer()); |
| newBuilder.copyFrom(dataBuilder, finalizer); |
| dataBuilder = newBuilder; |
| } |
| |
| /** |
| * Encodes "temporary CE" data into a CE that fits into the CE32 data structure, |
| * with 2-byte primary, 1-byte secondary and 6-bit tertiary, |
| * with valid CE byte values. |
| * |
| * The index must not exceed 20 bits (0xfffff). |
| * The strength must fit into 2 bits (Collator.PRIMARY..Collator.QUATERNARY). |
| * |
| * Temporary CEs are distinguished from real CEs by their use of |
| * secondary weights 06..45 which are otherwise reserved for compressed sort keys. |
| * |
| * The case bits are unused and available. |
| */ |
| private static long tempCEFromIndexAndStrength(int index, int strength) { |
| return |
| // CE byte offsets, to ensure valid CE bytes, and case bits 11 |
| 0x4040000006002000L + |
| // index bits 19..13 -> primary byte 1 = CE bits 63..56 (byte values 40..BF) |
| ((long)(index & 0xfe000) << 43) + |
| // index bits 12..6 -> primary byte 2 = CE bits 55..48 (byte values 40..BF) |
| ((long)(index & 0x1fc0) << 42) + |
| // index bits 5..0 -> secondary byte 1 = CE bits 31..24 (byte values 06..45) |
| ((index & 0x3f) << 24) + |
| // strength bits 1..0 -> tertiary byte 1 = CE bits 13..8 (byte values 20..23) |
| (strength << 8); |
| } |
| private static int indexFromTempCE(long tempCE) { |
| tempCE -= 0x4040000006002000L; |
| return |
| ((int)(tempCE >> 43) & 0xfe000) | |
| ((int)(tempCE >> 42) & 0x1fc0) | |
| ((int)(tempCE >> 24) & 0x3f); |
| } |
| private static int strengthFromTempCE(long tempCE) { |
| return ((int)tempCE >> 8) & 3; |
| } |
| private static boolean isTempCE(long ce) { |
| int sec = (int)ce >>> 24; |
| return 6 <= sec && sec <= 0x45; |
| } |
| |
| private static int indexFromTempCE32(int tempCE32) { |
| tempCE32 -= 0x40400620; |
| return |
| ((tempCE32 >> 11) & 0xfe000) | |
| ((tempCE32 >> 10) & 0x1fc0) | |
| ((tempCE32 >> 8) & 0x3f); |
| } |
| private static boolean isTempCE32(int ce32) { |
| return |
| (ce32 & 0xff) >= 2 && // not a long-primary/long-secondary CE32 |
| 6 <= ((ce32 >> 8) & 0xff) && ((ce32 >> 8) & 0xff) <= 0x45; |
| } |
| |
| private static int ceStrength(long ce) { |
| return |
| isTempCE(ce) ? strengthFromTempCE(ce) : |
| (ce & 0xff00000000000000L) != 0 ? Collator.PRIMARY : |
| ((int)ce & 0xff000000) != 0 ? Collator.SECONDARY : |
| ce != 0 ? Collator.TERTIARY : |
| Collator.IDENTICAL; |
| } |
| |
| /** At most 1M nodes, limited by the 20 bits in node bit fields. */ |
| private static final int MAX_INDEX = 0xfffff; |
| /** |
| * Node bit 6 is set on a primary node if there are nodes |
| * with secondary values below the common secondary weight (05). |
| */ |
| private static final int HAS_BEFORE2 = 0x40; |
| /** |
| * Node bit 5 is set on a primary or secondary node if there are nodes |
| * with tertiary values below the common tertiary weight (05). |
| */ |
| private static final int HAS_BEFORE3 = 0x20; |
| /** |
| * Node bit 3 distinguishes a tailored node, which has no weight value, |
| * from a node with an explicit (root or default) weight. |
| */ |
| private static final int IS_TAILORED = 8; |
| |
| private static long nodeFromWeight32(long weight32) { |
| return weight32 << 32; |
| } |
| private static long nodeFromWeight16(int weight16) { |
| return (long)weight16 << 48; |
| } |
| private static long nodeFromPreviousIndex(int previous) { |
| return (long)previous << 28; |
| } |
| private static long nodeFromNextIndex(int next) { |
| return next << 8; |
| } |
| private static long nodeFromStrength(int strength) { |
| return strength; |
| } |
| |
| private static long weight32FromNode(long node) { |
| return node >>> 32; |
| } |
| private static int weight16FromNode(long node) { |
| return (int)(node >> 48) & 0xffff; |
| } |
| private static int previousIndexFromNode(long node) { |
| return (int)(node >> 28) & MAX_INDEX; |
| } |
| private static int nextIndexFromNode(long node) { |
| return ((int)node >> 8) & MAX_INDEX; |
| } |
| private static int strengthFromNode(long node) { |
| return (int)node & 3; |
| } |
| |
| private static boolean nodeHasBefore2(long node) { |
| return (node & HAS_BEFORE2) != 0; |
| } |
| private static boolean nodeHasBefore3(long node) { |
| return (node & HAS_BEFORE3) != 0; |
| } |
| private static boolean nodeHasAnyBefore(long node) { |
| return (node & (HAS_BEFORE2 | HAS_BEFORE3)) != 0; |
| } |
| private static boolean isTailoredNode(long node) { |
| return (node & IS_TAILORED) != 0; |
| } |
| |
| private static long changeNodePreviousIndex(long node, int previous) { |
| return (node & 0xffff00000fffffffL) | nodeFromPreviousIndex(previous); |
| } |
| private static long changeNodeNextIndex(long node, int next) { |
| return (node & 0xfffffffff00000ffL) | nodeFromNextIndex(next); |
| } |
| |
| private Normalizer2 nfd, fcd; |
| private Normalizer2Impl nfcImpl; |
| |
| private CollationTailoring base; |
| private CollationData baseData; |
| private CollationRootElements rootElements; |
| private long variableTop; |
| |
| private CollationDataBuilder dataBuilder; |
| private boolean fastLatinEnabled; |
| private UnicodeSet optimizeSet = new UnicodeSet(); |
| |
| private long[] ces = new long[Collation.MAX_EXPANSION_LENGTH]; |
| private int cesLength; |
| |
| /** |
| * Indexes of nodes with root primary weights, sorted by primary. |
| * Compact form of a TreeMap from root primary to node index. |
| * |
| * This is a performance optimization for finding reset positions. |
| * Without this, we would have to search through the entire nodes list. |
| * It also allows storing root primary weights in list head nodes, |
| * without previous index, leaving room in root primary nodes for 32-bit primary weights. |
| */ |
| private UVector32 rootPrimaryIndexes; |
| /** |
| * Data structure for assigning tailored weights and CEs. |
| * Doubly-linked lists of nodes in mostly collation order. |
| * Each list starts with a root primary node and ends with a nextIndex of 0. |
| * |
| * When there are any nodes in the list, then there is always a root primary node at index 0. |
| * This allows some code not to have to check explicitly for nextIndex==0. |
| * |
| * Root primary nodes have 32-bit weights but do not have previous indexes. |
| * All other nodes have at most 16-bit weights and do have previous indexes. |
| * |
| * Nodes with explicit weights store root collator weights, |
| * or default weak weights (e.g., secondary 05) for stronger nodes. |
| * "Tailored" nodes, with the IS_TAILORED bit set, |
| * do not store explicit weights but rather |
| * create a difference of a certain strength from the preceding node. |
| * |
| * A root node is followed by either |
| * - a root/default node of the same strength, or |
| * - a root/default node of the next-weaker strength, or |
| * - a tailored node of the same strength. |
| * |
| * A node of a given strength normally implies "common" weights on weaker levels. |
| * |
| * A node with HAS_BEFORE2 must be immediately followed by |
| * a secondary node with an explicit below-common weight, then a secondary tailored node, |
| * and later an explicit common-secondary node. |
| * The below-common weight can be a root weight, |
| * or it can be BEFORE_WEIGHT16 for tailoring before an implied common weight |
| * or before the lowest root weight. |
| * (&[before 2] resets to an explicit secondary node so that |
| * the following addRelation(secondary) tailors right after that. |
| * If we did not have this node and instead were to reset on the primary node, |
| * then addRelation(secondary) would skip forward to the the COMMON_WEIGHT16 node.) |
| * |
| * If the flag is not set, then there are no explicit secondary nodes |
| * with the common or lower weights. |
| * |
| * Same for HAS_BEFORE3 for tertiary nodes and weights. |
| * A node must not have both flags set. |
| * |
| * Tailored CEs are initially represented in a CollationDataBuilder as temporary CEs |
| * which point to stable indexes in this list, |
| * and temporary CEs stored in a CollationDataBuilder only point to tailored nodes. |
| * |
| * A temporary CE in the ces[] array may point to a non-tailored reset-before-position node, |
| * until the next relation is added. |
| * |
| * At the end, the tailored weights are allocated as necessary, |
| * then the tailored nodes are replaced with final CEs, |
| * and the CollationData is rewritten by replacing temporary CEs with final ones. |
| * |
| * We cannot simply insert new nodes in the middle of the array |
| * because that would invalidate the indexes stored in existing temporary CEs. |
| * We need to use a linked graph with stable indexes to existing nodes. |
| * A doubly-linked list seems easiest to maintain. |
| * |
| * Each node is stored as an long, with its fields stored as bit fields. |
| * |
| * Root primary node: |
| * - primary weight: 32 bits 63..32 |
| * - reserved/unused/zero: 4 bits 31..28 |
| * |
| * Weaker root nodes & tailored nodes: |
| * - a weight: 16 bits 63..48 |
| * + a root or default weight for a non-tailored node |
| * + unused/zero for a tailored node |
| * - index to the previous node: 20 bits 47..28 |
| * |
| * All types of nodes: |
| * - index to the next node: 20 bits 27..8 |
| * + nextIndex=0 in last node per root-primary list |
| * - reserved/unused/zero bits: bits 7, 4, 2 |
| * - HAS_BEFORE2: bit 6 |
| * - HAS_BEFORE3: bit 5 |
| * - IS_TAILORED: bit 3 |
| * - the difference strength (primary/secondary/tertiary/quaternary): 2 bits 1..0 |
| * |
| * We could allocate structs with pointers, but we would have to store them |
| * in a pointer list so that they can be indexed from temporary CEs, |
| * and they would require more memory allocations. |
| */ |
| private UVector64 nodes; |
| } |