/// <summary>
/// For every non-empty string in <paramref name="data"/>, checks that starting a
/// traversal with <c>First()</c> / <c>FirstForCodePoint()</c> behaves the same as
/// calling <c>Reset()</c> followed by <c>Next()</c> / <c>NextForCodePoint()</c>.
/// Any disagreement is reported through <c>Errln</c>. The trie is reset before returning.
/// </summary>
/// <param name="trie">The trie under test.</param>
/// <param name="data">The strings (and values) to probe the trie with.</param>
/// <param name="dataLength">Number of leading entries of <paramref name="data"/> to check.</param>
private void checkFirst(CharsTrie trie, StringAndValue[] data, int dataLength)
{
    for (int index = 0; index < dataLength; ++index)
    {
        string expected = data[index].s;
        if (expected.Length == 0)
        {
            continue; // skip empty string
        }

        // Pass 1: UTF-16 code units via First()/Next().
        int unit = expected[0];
        int followingUnit = expected.Length > 1 ? expected[1] : 0;
        Result unitFirst = trie.First(unit);
        int unitFirstValue = unitFirst.HasValue() ? trie.GetValue() : -1;
        Result unitNext = trie.Next(followingUnit);

        // Replay the same two steps starting from Reset(); every observation must agree.
        // The && chain stops at the first disagreement, so later trie calls are skipped then.
        bool unitsAgree =
            unitFirst == trie.Reset().Next(unit) &&
            unitFirst == trie.Current &&
            unitFirstValue == (unitFirst.HasValue() ? trie.GetValue() : -1) &&
            unitNext == trie.Next(followingUnit);
        if (!unitsAgree)
        {
            Errln(string.Format("trie.first(U+{0:X4})!=trie.Reset().Next(same) for {1}", unit, expected));
        }

        // Pass 2: whole code points via FirstForCodePoint()/NextForCodePoint().
        int codePoint = expected.CodePointAt(0);
        int codePointLength = Character.CharCount(codePoint);
        int followingCodePoint = expected.Length > codePointLength
            ? expected.CodePointAt(codePointLength)
            : 0;
        Result cpFirst = trie.FirstForCodePoint(codePoint);
        int cpFirstValue = cpFirst.HasValue() ? trie.GetValue() : -1;
        Result cpNext = trie.NextForCodePoint(followingCodePoint);

        bool codePointsAgree =
            cpFirst == trie.Reset().NextForCodePoint(codePoint) &&
            cpFirst == trie.Current &&
            cpFirstValue == (cpFirst.HasValue() ? trie.GetValue() : -1) &&
            cpNext == trie.NextForCodePoint(followingCodePoint);
        if (!codePointsAgree)
        {
            Errln(string.Format("trie.firstForCodePoint(U+{0:X4})!=trie.Reset().NextForCodePoint(same) for {1}", codePoint, expected));
        }
    }

    trie.Reset();
}
/// <summary>
/// Walks <paramref name="text_"/> code point by code point through a
/// <c>CharsTrie</c> built over <c>characters</c>, recording every position at
/// which the trie reports a value.
/// </summary>
/// <param name="text_">The text to read code points from, starting at its current position.</param>
/// <param name="maxLength">The walk stops once this many code points have been consumed.</param>
/// <param name="lengths">Receives, for each recorded match, the number of code points consumed so far.</param>
/// <param name="count_">Element 0 receives the number of recorded matches (not written when the text is already exhausted).</param>
/// <param name="limit">Maximum number of matches to record.</param>
/// <param name="values">If non-null, receives the trie value of each recorded match.</param>
/// <returns>The number of code points consumed, or 0 if the text was exhausted on the first read.</returns>
public override int Matches(CharacterIterator text_, int maxLength, int[] lengths, int[] count_, int limit, int[] values)
{
    UCharacterIterator text = UCharacterIterator.GetInstance(text_);
    CharsTrie uct = new CharsTrie(characters, 0);

    int codePoint = text.NextCodePoint();
    if (codePoint == UCharacterIterator.Done)
    {
        return 0;
    }

    Result state = uct.FirstForCodePoint(codePoint);
    // TODO: should numChars count Character.charCount?
    int numChars = 1;
    int found = 0;

    while (true)
    {
        bool atValue = state.HasValue();
        if (atValue && found < limit)
        {
            if (values != null)
            {
                values[found] = uct.GetValue();
            }
            lengths[found] = numChars;
            found++;
        }

        // A value node ends the walk only when it is final; a non-value node
        // ends it only when the trie reports no match.
        if (atValue ? state == Result.FinalValue : state == Result.NoMatch)
        {
            break;
        }

        if (numChars >= maxLength)
        {
            break;
        }

        codePoint = text.NextCodePoint();
        if (codePoint == UCharacterIterator.Done)
        {
            break;
        }

        ++numChars;
        state = uct.NextForCodePoint(codePoint);
    }

    count_[0] = found;
    return numChars;
}
/// <summary>
/// Is there an exception at this point?
/// </summary>
/// <remarks>
/// Reads the text backwards from <paramref name="n"/> through <c>backwardsTrie</c>,
/// remembering the most recent position at which the trie reported a value. If
/// that value is a partial match and <c>forwardsPartialTrie</c> is available, the
/// text is then read forwards from the remembered position through that trie.
/// </remarks>
/// <param name="n">The location of the possible break.</param>
/// <returns>
/// <c>true</c> if the backward scan yields a <c>Match</c> value, or yields a
/// <c>Partial</c> value and the forward scan ends on a match; otherwise <c>false</c>.
/// </returns>
private bool BreakExceptionAt(int n)
{
    // Note: the C++ version of this function is SimpleFilteredSentenceBreakIterator::breakExceptionAt()

    int bestPosn = -1;
    int bestValue = -1;

    // loops while 'n' points to an exception
    text.Index = n;
    backwardsTrie.Reset();
    int uch;

    // Assume a space is following the '.' (so we handle the case: "Mr. /Brown")
    // TODO: skip a class of chars here??
    // TODO only do this the 1st time?
    if (text.PreviousCodePoint() != ' ')
    {
        // Not a space: step forward again so the backward scan starts where it was.
        text.NextCodePoint();
    }

    Result r = Result.IntermediateValue;

    while ((uch = text.PreviousCodePoint()) != UCharacterIterator.Done && // more to consume backwards and..
        ((r = backwardsTrie.NextForCodePoint(uch)).HasNext())) // more in the trie
    {
        if (r.HasValue())
        {
            // remember the best match so far
            bestPosn = text.Index;
            bestValue = backwardsTrie.GetValue();
        }
    }

    if (r.Matches())
    {
        // exact match?
        bestValue = backwardsTrie.GetValue();
        bestPosn = text.Index;
    }

    if (bestPosn >= 0)
    {
        if (bestValue == SimpleFilteredSentenceBreakIteratorBuilder.Match)
        {
            // exact match!
            return true; // Exception here.
        }

        if (bestValue == SimpleFilteredSentenceBreakIteratorBuilder.Partial
            && forwardsPartialTrie != null) // make sure there's a forward trie
        {
            // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie
            // to see if it matches something going forward.
            forwardsPartialTrie.Reset();

            Result rfwd = Result.IntermediateValue;
            text.Index = bestPosn; // hope that's close ..

            // text is read with NextCodePoint(), so test against the same
            // UCharacterIterator.Done sentinel the backward scan uses.
            while ((uch = text.NextCodePoint()) != UCharacterIterator.Done
                && ((rfwd = forwardsPartialTrie.NextForCodePoint(uch)).HasNext()))
            {
            }

            if (rfwd.Matches())
            {
                // Exception here
                return true;
            } // else fall through
        } // else fall through
    } // else fall through

    return false; // No exception here.
}
/// <summary>
/// Drives <c>CharsTrie.NextForCodePoint()</c> (and <c>FirstForCodePoint()</c>)
/// with supplementary code points and an unpaired surrogate, checking the
/// result of every step against <c>trie.Current</c> and the final values
/// against the test data.
/// </summary>
public void Test32NextForCodePoint()
{
    StringAndValue[] data =
    {
        // "\u4dff\\U00010000\u9999\\U00020000\udfff\\U0010ffff"
        new StringAndValue("\u4dff\ud800\udc00\u9999\ud840\udc00\udfff\udbff\udfff", 2000000000),
        // "\u4dff\\U00010000\u9999\\U00020002"
        new StringAndValue("\u4dff\ud800\udc00\u9999\ud840\udc02", 44444),
        // "\u4dff\\U000103ff"
        new StringAndValue("\u4dff\ud800\udfff", 99999)
    };
    CharsTrie trie = buildTrie(data, data.Length, TrieBuilderOption.Fast);
    Result step;

    // Each flag is a short-circuiting conjunction: evaluation stops at the
    // first step whose result is unexpected, and no further trie calls are made.

    // Full walk of data[0] on the freshly built trie.
    bool walkedFirst =
        (step = trie.NextForCodePoint(0x4dff)) == Result.NoValue && step == trie.Current &&
        (step = trie.NextForCodePoint(0x10000)) == Result.NoValue && step == trie.Current &&
        (step = trie.NextForCodePoint(0x9999)) == Result.NoValue && step == trie.Current &&
        (step = trie.NextForCodePoint(0x20000)) == Result.NoValue && step == trie.Current &&
        (step = trie.NextForCodePoint(0xdfff)) == Result.NoValue && step == trie.Current &&
        (step = trie.NextForCodePoint(0x10ffff)) == Result.FinalValue && step == trie.Current &&
        trie.GetValue() == 2000000000;
    if (!walkedFirst)
    {
        Errln("CharsTrie.NextForCodePoint() fails for " + data[0].s);
    }

    // Full walk of data[1], restarted with FirstForCodePoint().
    bool walkedSecond =
        (step = trie.FirstForCodePoint(0x4dff)) == Result.NoValue && step == trie.Current &&
        (step = trie.NextForCodePoint(0x10000)) == Result.NoValue && step == trie.Current &&
        (step = trie.NextForCodePoint(0x9999)) == Result.NoValue && step == trie.Current &&
        (step = trie.NextForCodePoint(0x20002)) == Result.FinalValue && step == trie.Current &&
        trie.GetValue() == 44444;
    if (!walkedSecond)
    {
        Errln("CharsTrie.NextForCodePoint() fails for " + data[1].s);
    }

    // Same prefix, then a code point that is not in the trie.
    bool rejectedUnknown =
        (step = trie.Reset().NextForCodePoint(0x4dff)) == Result.NoValue && step == trie.Current &&
        (step = trie.NextForCodePoint(0x10000)) == Result.NoValue && step == trie.Current &&
        (step = trie.NextForCodePoint(0x9999)) == Result.NoValue && step == trie.Current &&
        (step = trie.NextForCodePoint(0x20222)) == Result.NoMatch && step == trie.Current; // no match for trail surrogate
    if (!rejectedUnknown)
    {
        Errln("CharsTrie.NextForCodePoint() fails for \u4dff\\U00010000\u9999\\U00020222");
    }

    // Full walk of data[2].
    bool walkedThird =
        (step = trie.Reset().NextForCodePoint(0x4dff)) == Result.NoValue && step == trie.Current &&
        (step = trie.NextForCodePoint(0x103ff)) == Result.FinalValue && step == trie.Current &&
        trie.GetValue() == 99999;
    if (!walkedThird)
    {
        Errln("CharsTrie.NextForCodePoint() fails for " + data[2].s);
    }
}