/// <summary>
/// Builds an IntTrieBuilder from <paramref name="setRanges"/>, then verifies the
/// builder, the serialized IntTrie, the range enumeration and the Latin-1 linear
/// accessors against <paramref name="checkRanges"/>. Every mismatch is reported
/// through Errln; the method returns early only when filling the builder fails.
/// </summary>
/// <param name="setRanges">Ranges (start, limit, value, overwrite) written into the builder.</param>
/// <param name="countSetRanges">Number of entries of <paramref name="setRanges"/> to apply.</param>
/// <param name="checkRanges">Expected values; entry i covers the code points from the
/// previous entry's limit up to (not including) its own limit. Entry 0 supplies the
/// builder's initial and lead-unit values.</param>
/// <param name="countCheckRanges">Number of entries of <paramref name="checkRanges"/> to verify.</param>
/// <param name="latin1Linear">Passed to the builder; the serialized trie must report the same flag.</param>
private void _testTrieRanges(TrieTest.SetRange[] setRanges, int countSetRanges,
        TrieTest.CheckRange[] checkRanges, int countCheckRanges, bool latin1Linear) {
    IntTrieBuilder newTrie = new IntTrieBuilder(null, 2000,
            checkRanges[0].value_ren, checkRanges[0].value_ren, latin1Linear);

    // Fill the builder from setRanges[]; a one-code-point overwriting range
    // goes through SetValue, everything else through SetRange.
    bool ok = true;
    for (int s = 0; s < countSetRanges; ++s) {
        int rangeStart = setRanges[s].start;
        int rangeLimit = setRanges[s].limit;
        int rangeValue = setRanges[s].value_ren;
        bool rangeOverwrite = setRanges[s].overwrite;

        bool stored;
        if (rangeOverwrite && (rangeLimit - rangeStart) == 1) {
            stored = newTrie.SetValue(rangeStart, rangeValue);
        } else {
            stored = newTrie.SetRange(rangeStart, rangeLimit, rangeValue, rangeOverwrite);
        }
        ok &= stored;
    }
    if (!ok) {
        Errln("setting values into a trie failed");
        return;
    }

    // Check every code point of every check range against the builder.
    int cp = 0;
    for (int r = 0; r < countCheckRanges; ++r) {
        int checkLimit = checkRanges[r].limit;
        int expected = checkRanges[r].value_ren;
        for (; cp < checkLimit; ++cp) {
            if (expected != newTrie.GetValue(cp)) {
                Errln("newTrie [U+"
                        + ILOG.J2CsMapping.Util.IlNumber.ToString(cp, 16)
                        + "]==0x"
                        + ILOG.J2CsMapping.Util.IlNumber.ToString(newTrie.GetValue(cp), 16)
                        + " instead of 0x"
                        + ILOG.J2CsMapping.Util.IlNumber.ToString(expected, 16));
            }
        }
    }

    IntTrie trie = newTrie.Serialize(new TrieTest._testFoldedValue(newTrie),
            new TrieTest._testFoldingOffset());

    // Check the Latin-1 linear accessor for U+0000..U+00FF when it was requested.
    if (latin1Linear) {
        cp = 0;
        for (int r = 0; r < countCheckRanges && cp <= 0xff; ++r) {
            int checkLimit = checkRanges[r].limit;
            int expected = checkRanges[r].value_ren;
            for (; cp < checkLimit && cp <= 0xff; ++cp) {
                if (expected != trie.GetLatin1LinearValue((char)cp)) {
                    Errln("IntTrie.getLatin1LinearValue[U+"
                            + ILOG.J2CsMapping.Util.IlNumber.ToString(cp, 16)
                            + "]==0x"
                            + ILOG.J2CsMapping.Util.IlNumber.ToString(
                                    trie.GetLatin1LinearValue((char)cp), 16)
                            + " instead of 0x"
                            + ILOG.J2CsMapping.Util.IlNumber.ToString(expected, 16));
                }
            }
        }
    }

    if (trie.IsLatin1Linear() != latin1Linear) {
        Errln("trie serialization did not preserve " + "Latin-1-linearity");
    }

    // Check every code point of every check range against the serialized trie.
    cp = 0;
    for (int r = 0; r < countCheckRanges; ++r) {
        int checkLimit = checkRanges[r].limit;
        int expected = checkRanges[r].value_ren;

        if (cp == 0xd800) {
            // skip surrogates: a check range starting at U+D800 is not verified
            cp = checkLimit;
            continue;
        }

        for (; cp < checkLimit; ++cp) {
            if (cp <= 0xffff) {
                int bmpValue = trie.GetBMPValue((char)cp);
                if (expected != bmpValue) {
                    Errln("serialized trie.getBMPValue(U+"
                            + ILOG.J2CsMapping.Util.IlNumber.ToString(cp, 16)
                            + " == 0x"
                            + ILOG.J2CsMapping.Util.IlNumber.ToString(bmpValue, 16)
                            + " instead of 0x"
                            + ILOG.J2CsMapping.Util.IlNumber.ToString(expected, 16));
                }
                if (!IBM.ICU.Text.UTF16.IsLeadSurrogate((char)cp)) {
                    int leadValue = trie.GetLeadValue((char)cp);
                    if (expected != leadValue) {
                        Errln("serialized trie.getLeadValue(U+"
                                + ILOG.J2CsMapping.Util.IlNumber.ToString(cp, 16)
                                + " == 0x"
                                + ILOG.J2CsMapping.Util.IlNumber.ToString(leadValue, 16)
                                + " instead of 0x"
                                + ILOG.J2CsMapping.Util.IlNumber.ToString(expected, 16));
                    }
                }
            }

            int codePointValue = trie.GetCodePointValue(cp);
            if (expected != codePointValue) {
                Errln("serialized trie.getCodePointValue(U+"
                        + ILOG.J2CsMapping.Util.IlNumber.ToString(cp, 16)
                        + ")==0x"
                        + ILOG.J2CsMapping.Util.IlNumber.ToString(codePointValue, 16)
                        + " instead of 0x"
                        + ILOG.J2CsMapping.Util.IlNumber.ToString(expected, 16));
            }
        }
    }

    // Enumerate the ranges of the serialized trie. The n-th enumerated range is
    // compared with checkRanges[n] (counting from 1); the enumerated value is
    // XOR-ed with 0x5555 before the comparison.
    TrieIterator iter = new TrieTest._testEnumValue(trie);
    RangeValueIterator_Constants.Element result = new RangeValueIterator_Constants.Element();
    for (int enumRanges = 1; iter.Next(result); enumRanges++) {
        int expectedStart = checkRanges[enumRanges - 1].limit;
        int expectedLimit = checkRanges[enumRanges].limit;
        int expectedValue = checkRanges[enumRanges].value_ren;

        bool matches = result.start == expectedStart
                && result.limit == expectedLimit
                && (result.value_ren ^ 0x5555) == expectedValue;
        if (!matches) {
            Errln("utrie_enum() delivers wrong range [U+"
                    + ILOG.J2CsMapping.Util.IlNumber.ToString(result.start, 16)
                    + "..U+"
                    + ILOG.J2CsMapping.Util.IlNumber.ToString(result.limit, 16)
                    + "].0x"
                    + ILOG.J2CsMapping.Util.IlNumber.ToString(result.value_ren ^ 0x5555, 16)
                    + " instead of [U+"
                    + ILOG.J2CsMapping.Util.IlNumber.ToString(expectedStart, 16)
                    + "..U+"
                    + ILOG.J2CsMapping.Util.IlNumber.ToString(expectedLimit, 16)
                    + "].0x"
                    + ILOG.J2CsMapping.Util.IlNumber.ToString(expectedValue, 16));
        }
    }

    // When the serialized trie is Latin-1 linear, its linear accessor must
    // agree with GetLeadValue for U+0000..U+00FF.
    if (trie.IsLatin1Linear()) {
        for (cp = 0; cp < 0x100; ++cp) {
            if (trie.GetLatin1LinearValue((char)cp) != trie.GetLeadValue((char)cp)) {
                Errln("trie.getLatin1LinearValue[U+"
                        + ILOG.J2CsMapping.Util.IlNumber.ToString(cp, 16)
                        + "]=0x"
                        + ILOG.J2CsMapping.Util.IlNumber.ToString(
                                trie.GetLatin1LinearValue((char)cp), 16)
                        + " instead of 0x"
                        + ILOG.J2CsMapping.Util.IlNumber.ToString(
                                trie.GetLeadValue((char)cp), 16));
            }
        }
    }

    _testTrieIteration(trie, checkRanges, countCheckRanges);
}
// ------------------------------------------------------------------------
//
// Build: construct the list of non-overlapping character ranges from the
//        Unicode sets, number the resulting range groups, and store the
//        group number of every range in the trie builder.
//
// ------------------------------------------------------------------------
internal void Build() {
    RBBINode usetNode;
    RBBISetBuilder.RangeDescriptor rlRange;

    if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("usets") >= 0) {
        PrintSets();
    }

    // Initialize the process by creating a single range that encompasses
    // all characters and is in no sets.
    fRangeList = new RBBISetBuilder.RangeDescriptor();
    fRangeList.fStartChar = 0;
    fRangeList.fEndChar = 0x10ffff;

    //
    // Find the set of non-overlapping ranges of characters.
    //
    IIterator ni = new ILOG.J2CsMapping.Collections.IteratorAdapter(fRB.fUSetNodes.GetEnumerator());
    while (ni.HasNext()) {
        usetNode = (RBBINode)ni.Next();

        UnicodeSet inputSet = usetNode.fInputSet;
        int inputSetRangeCount = inputSet.GetRangeCount();
        int inputSetRangeIndex = 0;
        rlRange = fRangeList;

        while (inputSetRangeIndex < inputSetRangeCount) {
            int inputSetRangeBegin = inputSet.GetRangeStart(inputSetRangeIndex);
            int inputSetRangeEnd = inputSet.GetRangeEnd(inputSetRangeIndex);

            // Skip over ranges from the range list that lie completely
            // below the current range from the input Unicode set.
            while (rlRange.fEndChar < inputSetRangeBegin) {
                rlRange = rlRange.fNext;
            }

            // If the range-list range starts before the Unicode set range,
            // split it in two: one part wholly outside the Unicode set and
            // one part containing the rest. Then restart the loop; the
            // part in front of the split is skipped by the loop above.
            if (rlRange.fStartChar < inputSetRangeBegin) {
                rlRange.Split(inputSetRangeBegin);
                continue;
            }

            // Same thing at the end of the ranges: if the range-list range
            // extends past the end of the Unicode set range, split it so
            // that the first part is wholly inside the Unicode set.
            if (rlRange.fEndChar > inputSetRangeEnd) {
                rlRange.Split(inputSetRangeEnd + 1);
            }

            // The current rlRange is now entirely within the Unicode set
            // range. Record this Unicode set on the range, once.
            if (rlRange.fIncludesSets.IndexOf(usetNode) == -1) {
                ILOG.J2CsMapping.Collections.Generics.Collections.Add(rlRange.fIncludesSets, usetNode);
            }

            // Advance to the next input range once this one is finished.
            if (inputSetRangeEnd == rlRange.fEndChar) {
                inputSetRangeIndex++;
            }
            rlRange = rlRange.fNext;
        }
    }

    if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("range") >= 0) {
        PrintRanges();
    }

    //
    // Group the above ranges, with each group consisting of one or more
    // ranges that are in exactly the same set of original UnicodeSets.
    // The groups are numbered, and these group numbers are the set of
    // input symbols recognized by the run-time state machine.
    //
    // Numbering: # 0 (state table column 0) is unused.
    //            # 1 is reserved - table column 1 is for end-of-input
    //            # 2 is reserved - table column 2 is for beginning-of-input
    //            # 3 is the first range list.
    //
    RBBISetBuilder.RangeDescriptor rlSearchRange;
    for (rlRange = fRangeList; rlRange != null; rlRange = rlRange.fNext) {
        for (rlSearchRange = fRangeList; rlSearchRange != rlRange; rlSearchRange = rlSearchRange.fNext) {
            // Compare the set lists by content. A plain Equals() call on a
            // standard .NET list compares references only, so an earlier
            // range with the same sets would never be found.
            if (IncludesSameSets(rlRange.fIncludesSets, rlSearchRange.fIncludesSets)) {
                rlRange.fNum = rlSearchRange.fNum;
                break;
            }
        }
        if (rlRange.fNum == 0) {
            // No earlier range has the same sets: open a new group.
            fGroupCount++;
            rlRange.fNum = fGroupCount + 2;
            rlRange.SetDictionaryFlag();
            AddValToSets(rlRange.fIncludesSets, fGroupCount + 2);
        }
    }

    // Handle input sets that contain the special strings {eof} and {bof}.
    // Column 1 of the state table is reserved for EOF on input.
    // Column 2 is reserved for before-the-start-of-input.
    // (This column can be optimized away later if there are no rule
    // references to {bof}.)
    // Add the column value (1 or 2) to the equivalent expression subtree
    // of each UnicodeSet that contains the string.
    // Because {bof} and {eof} are not characters in the normal sense,
    // they do not affect the computation of ranges or the trie.
    String eofString = "eof";
    String bofString = "bof";
    ni = new ILOG.J2CsMapping.Collections.IteratorAdapter(fRB.fUSetNodes.GetEnumerator());
    while (ni.HasNext()) {
        usetNode = (RBBINode)ni.Next();
        UnicodeSet inputSet_0 = usetNode.fInputSet;
        if (inputSet_0.Contains(eofString)) {
            AddValToSet(usetNode, 1);
        }
        if (inputSet_0.Contains(bofString)) {
            AddValToSet(usetNode, 2);
            fSawBOF = true;
        }
    }

    if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("rgroup") >= 0) {
        PrintRangeGroups();
    }
    if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("esets") >= 0) {
        PrintSets();
    }

    // IntTrieBuilder(int aliasdata[], int maxdatalength,
    //                int initialvalue, int leadunitvalue,
    //                boolean latin1linear)
    fTrie = new IntTrieBuilder(null, // Data array (the builder allocates one)
            100000, // Max data length
            0, // Initial value for all code points
            0, // Lead surrogate unit value
            true); // latin1linear

    // Store each range's group number. fEndChar is inclusive, so the limit
    // handed to SetRange is fEndChar + 1.
    for (rlRange = fRangeList; rlRange != null; rlRange = rlRange.fNext) {
        fTrie.SetRange(rlRange.fStartChar, rlRange.fEndChar + 1, rlRange.fNum, true);
    }
}

/// <summary>
/// Returns true when both sequences hold equal elements in the same order,
/// i.e. the content comparison that java.util.List.equals performs.
/// </summary>
/// <param name="left">First list of set nodes; may be null.</param>
/// <param name="right">Second list of set nodes; may be null.</param>
/// <returns>true if both are the same instance or have pairwise-equal elements
/// and the same length; false otherwise.</returns>
private static bool IncludesSameSets(System.Collections.IEnumerable left,
        System.Collections.IEnumerable right) {
    if (object.ReferenceEquals(left, right)) {
        return true;
    }
    if (left == null || right == null) {
        return false;
    }

    System.Collections.IEnumerator leftItems = left.GetEnumerator();
    System.Collections.IEnumerator rightItems = right.GetEnumerator();
    try {
        for (;;) {
            bool leftHasItem = leftItems.MoveNext();
            bool rightHasItem = rightItems.MoveNext();
            if (leftHasItem != rightHasItem) {
                return false; // different lengths
            }
            if (!leftHasItem) {
                return true; // both exhausted without a mismatch
            }
            if (!object.Equals(leftItems.Current, rightItems.Current)) {
                return false;
            }
        }
    } finally {
        // Generic enumerators are disposable; release them when they are.
        System.IDisposable leftDisposable = leftItems as System.IDisposable;
        if (leftDisposable != null) {
            leftDisposable.Dispose();
        }
        System.IDisposable rightDisposable = rightItems as System.IDisposable;
        if (rightDisposable != null) {
            rightDisposable.Dispose();
        }
    }
}