/// <summary>
/// Checks that <paramref name="bi"/> sees its whole text as a single sentence equal
/// to <paramref name="text"/>, exercising forward, backward and offset-based navigation.
/// </summary>
/// <param name="bi">Iterator whose text has already been set by the caller.</param>
/// <param name="text">The one sentence the iterator is expected to span.</param>
private void Test1Sentence(BreakIterator bi, String text)
{
    int start = bi.Text.BeginIndex;
    assertEquals(start, bi.First());
    int current = bi.Current;
    assertEquals(bi.Text.EndIndex, bi.Next());

    // 'end' stays an absolute iterator offset. The previous code made it relative to
    // 'start' and then subtracted 'start' a second time, which only produced the right
    // length when the text began at index 0. Substring takes (startIndex, length).
    int end = bi.Current;
    assertEquals(text, text.Substring(current - start, end - current));

    assertEquals(text.Length, bi.Last() - start);
    end = bi.Current;
    bi.Previous();
    assertEquals(BreakIterator.Done, bi.Previous());
    int previous = bi.Current;
    // Same (startIndex, length) convention as above.
    assertEquals(text, text.Substring(previous - start, end - previous));
    assertEquals(start, bi.Current);

    // With a single sentence there is no boundary strictly after or before the midpoint.
    assertEquals(BreakIterator.Done, bi.Following(bi.Last() / 2 + start));
    assertEquals(BreakIterator.Done, bi.Preceding(bi.Last() / 2 + start));
    assertEquals(start, bi.First());
    assertEquals(BreakIterator.Done, bi.Next(13));
    assertEquals(BreakIterator.Done, bi.Next(-8));
}
/// <summary>
/// Steps the English word iterator over a symbol, a digit, a Latin letter, a kana
/// character and an ideograph, and checks that the reported rule status falls in
/// the matching range each time.
/// </summary>
public void TestRuleStatus()
{
    BreakIterator wordIterator = BreakIterator.GetWordInstance(ULocale.ENGLISH);

    // Symbol.
    wordIterator.SetText("# ");
    assertEquals(null, wordIterator.Next(), 1);
    var status = wordIterator.RuleStatus;
    assertTrue(null, status >= RuleStatus.WordNone);
    assertTrue(null, status < RuleStatus.WordNoneLimit);

    // Digit.
    wordIterator.SetText("3 ");
    assertEquals(null, wordIterator.Next(), 1);
    status = wordIterator.RuleStatus;
    assertTrue(null, status >= RuleStatus.WordNumber);
    assertTrue(null, status < RuleStatus.WordNumberLimit);

    // Latin letter.
    wordIterator.SetText("a ");
    assertEquals(null, wordIterator.Next(), 1);
    status = wordIterator.RuleStatus;
    assertTrue(null, status >= RuleStatus.WordLetter);
    assertTrue(null, status < RuleStatus.WordLetterLimit);

    // Kana: only the lower bound is checked.
    wordIterator.SetText("イ ");
    assertEquals(null, wordIterator.Next(), 1);
    assertTrue(null, wordIterator.RuleStatus >= RuleStatus.WordKana);
    // TODO: ticket #10261, Kana is not returning the correct status.
    // assertTrue(null, bi.getRuleStatus() < RuleStatus.WordKanaLimit);

    // Ideograph.
    wordIterator.SetText("退 ");
    assertEquals(null, wordIterator.Next(), 1);
    status = wordIterator.RuleStatus;
    assertTrue(null, status >= RuleStatus.WordIdeo);
    assertTrue(null, status < RuleStatus.WordIdeoLimit);
}
/// <summary>
/// Steps the "en" word iterator over a symbol, a digit, a Latin letter, a kana
/// character and an ideograph, and checks that the reported rule status falls in
/// the matching range each time.
/// </summary>
public void TestRuleStatus()
{
    BreakIterator wordIterator = BreakIterator.GetWordInstance(new UCultureInfo("en"));

    // Symbol.
    wordIterator.SetText("# ");
    assertEquals(null, wordIterator.Next(), 1);
    var status = wordIterator.RuleStatus;
    assertTrue(null, status >= BreakIterator.WordNone);
    assertTrue(null, status < BreakIterator.WordNoneLimit);

    // Digit.
    wordIterator.SetText("3 ");
    assertEquals(null, wordIterator.Next(), 1);
    status = wordIterator.RuleStatus;
    assertTrue(null, status >= BreakIterator.WordNumber);
    assertTrue(null, status < BreakIterator.WordNumberLimit);

    // Latin letter.
    wordIterator.SetText("a ");
    assertEquals(null, wordIterator.Next(), 1);
    status = wordIterator.RuleStatus;
    assertTrue(null, status >= BreakIterator.WordLetter);
    assertTrue(null, status < BreakIterator.WordLetterLimit);

    // Kana: only the lower bound is checked.
    wordIterator.SetText("イ ");
    assertEquals(null, wordIterator.Next(), 1);
    assertTrue(null, wordIterator.RuleStatus >= BreakIterator.WordKana);
    // TODO: ticket #10261, Kana is not returning the correct status.
    // assertTrue(null, bi.RuleStatus < RuleStatus.WordKanaLimit);

    // Ideograph.
    wordIterator.SetText("退 ");
    assertEquals(null, wordIterator.Next(), 1);
    status = wordIterator.RuleStatus;
    assertTrue(null, status >= BreakIterator.WordIdeo);
    assertTrue(null, status < BreakIterator.WordIdeoLimit);
}
/// <summary>
/// Starting from the break position the delegate has already produced, skips every
/// suppressed break and returns the first one that is not suppressed.
/// </summary>
/// <param name="n">Break position most recently returned by the delegate.</param>
/// <returns>The next unsuppressed break position, or <see cref="BreakIterator.Done"/>.</returns>
private int InternalNext(int n)
{
    // Nothing to filter: iteration already ended, or no backwards table is loaded
    // (which means there are no exceptions to apply).
    bool atEnd = n == BreakIterator.Done;
    if (atEnd || backwardsTrie == null)
    {
        return n;
    }

    ResetState();
    int endOfText = text.Length;

    // Each pass handles one break from the delegate; keep advancing for as long as
    // the candidate position is a break exception. The loop stops on the delegate's
    // Done, on the break at the end of the text, or on the first non-exception.
    while (n != BreakIterator.Done && n != endOfText && BreakExceptionAt(n))
    {
        n = @delegate.Next();
    }

    return n;
}
/// <summary>
/// Steps the English word iterator over a symbol, a digit, a Latin letter, a kana
/// character and an ideograph, and checks that the reported rule status falls in
/// the matching <c>WORD_*</c> range each time.
/// </summary>
public void TestRuleStatus()
{
    BreakIterator wordIterator = BreakIterator.GetWordInstance(ULocale.ENGLISH);

    // Symbol.
    wordIterator.SetText("# ");
    assertEquals(null, wordIterator.Next(), 1);
    var status = wordIterator.RuleStatus;
    assertTrue(null, status >= RuleBasedBreakIterator.WORD_NONE);
    assertTrue(null, status < RuleBasedBreakIterator.WORD_NONE_LIMIT);

    // Digit.
    wordIterator.SetText("3 ");
    assertEquals(null, wordIterator.Next(), 1);
    status = wordIterator.RuleStatus;
    assertTrue(null, status >= RuleBasedBreakIterator.WORD_NUMBER);
    assertTrue(null, status < RuleBasedBreakIterator.WORD_NUMBER_LIMIT);

    // Latin letter.
    wordIterator.SetText("a ");
    assertEquals(null, wordIterator.Next(), 1);
    status = wordIterator.RuleStatus;
    assertTrue(null, status >= RuleBasedBreakIterator.WORD_LETTER);
    assertTrue(null, status < RuleBasedBreakIterator.WORD_LETTER_LIMIT);

    // Kana: only the lower bound is checked.
    wordIterator.SetText("イ ");
    assertEquals(null, wordIterator.Next(), 1);
    assertTrue(null, wordIterator.RuleStatus >= RuleBasedBreakIterator.WORD_KANA);
    // TODO: ticket #10261, Kana is not returning the correct status.
    // assertTrue(null, bi.getRuleStatus() < RuleBasedBreakIterator.WORD_KANA_LIMIT);

    // Ideograph.
    wordIterator.SetText("退 ");
    assertEquals(null, wordIterator.Next(), 1);
    status = wordIterator.RuleStatus;
    assertTrue(null, status >= RuleBasedBreakIterator.WORD_IDEO);
    assertTrue(null, status < RuleBasedBreakIterator.WORD_IDEO_LIMIT);
}
/// <summary>
/// Sets <paramref name="text"/> on the iterator, checks that the first two breaks
/// are at 20 and 84, then moves the iterator back to its first boundary.
/// </summary>
/// <param name="filteredBI">Iterator under test.</param>
/// <param name="text">Text to iterate over.</param>
private void assertFrenchBreakBehavior(BreakIterator filteredBI, String text)
{
    Logln("Testing French behavior:");
    filteredBI.SetText(text);

    int[] expectedBreaks = { 20, 84 };
    foreach (int expectedBreak in expectedBreaks)
    {
        assertEquals("6th next", expectedBreak, filteredBI.Next());
    }

    filteredBI.First();
}
/// <summary>
/// Sets <paramref name="text"/> on the iterator, checks that the first two breaks
/// are at 84 and 278, then moves the iterator back to its first boundary.
/// </summary>
/// <param name="filteredBI">Iterator under test.</param>
/// <param name="text">Text to iterate over.</param>
private void assertEnglishBreakBehavior(BreakIterator filteredBI, String text)
{
    Logln("Testing English filtered behavior:");
    filteredBI.SetText(text);

    int[] expectedBreaks = { 84, 278 };
    foreach (int expectedBreak in expectedBreaks)
    {
        assertEquals("5th next", expectedBreak, filteredBI.Next());
    }

    filteredBI.First();
}
/// <summary>
/// Sets <paramref name="text"/> on the iterator, checks that the first five breaks
/// are at 20, 84, 90, 181 and 278, then moves the iterator back to its first boundary.
/// </summary>
/// <param name="filteredBI">Iterator under test.</param>
/// <param name="text">Text to iterate over.</param>
private void assertDefaultBreakBehavior(BreakIterator filteredBI, String text)
{
    Logln("Testing Default Behavior:");
    filteredBI.SetText(text);

    int[] expectedBreaks = { 20, 84, 90, 181, 278 };
    foreach (int expectedBreak in expectedBreaks)
    {
        assertEquals("1st next", expectedBreak, filteredBI.Next());
    }

    filteredBI.First();
}
/// <summary>
/// Wraps the Japanese sentence iterator with the Japanese break filter and checks
/// that a short sentence yields exactly one break, at its end.
/// </summary>
public void TestFilteredJapanese()
{
    ULocale japanese = ULocale.JAPANESE;
    var filterBuilder = FilteredBreakIteratorBuilder.GetInstance(japanese);
    BreakIterator filtered = filterBuilder.WrapIteratorWithFilter(BreakIterator.GetSentenceInstance(japanese));

    filtered.SetText("OKです。");

    assertEquals("Starting point", 0, filtered.Current);
    assertEquals("Next point", 5, filtered.Next());
    assertEquals("Last point", BreakIterator.Done, filtered.Next());
}
/// <summary>
/// Lazily walks the iterator from its first boundary and yields one
/// <c>Token</c> for every span between consecutive boundaries, carrying the span's
/// offsets, its text and the rule status reported at the span's end.
/// </summary>
/// <param name="bi">Iterator whose text is read through <c>GetCLRText()</c>.</param>
/// <returns>A deferred sequence of tokens in text order.</returns>
public static IEnumerable<Token> EnumerateTokens(this BreakIterator bi)
{
    string source = bi.GetCLRText();
    int spanStart = bi.First();

    for (int spanEnd = bi.Next(); spanEnd != BreakIterator.DONE; spanEnd = bi.Next())
    {
        string span = source.Substring(spanStart, spanEnd - spanStart);
        yield return new Token(spanStart, spanEnd, span, bi.GetRuleStatus());

        // The end of this span is the beginning of the next one.
        spanStart = spanEnd;
    }
}
/// <summary>
/// Checks an iterator that has no sentences: every boundary is 0 and every attempt
/// to move away from it reports <c>BreakIterator.Done</c>.
/// </summary>
/// <param name="bi">Iterator under test.</param>
private void Test0Sentences(BreakIterator bi)
{
    int done = BreakIterator.Done;

    assertEquals(0, bi.Current);
    assertEquals(0, bi.First());
    assertEquals(done, bi.Next());
    assertEquals(0, bi.Last());
    assertEquals(done, bi.Previous());

    // Offset-based lookups find nothing either.
    assertEquals(done, bi.Following(0));
    assertEquals(done, bi.Preceding(0));

    // Multi-step moves in either direction fall off the text.
    assertEquals(0, bi.First());
    assertEquals(done, bi.Next(13));
    assertEquals(done, bi.Next(-8));
}
/// <summary>
/// Lazily walks the iterator from its first boundary and yields the text of every
/// span between consecutive boundaries.
/// </summary>
/// <param name="bi">Iterator whose text is read through <c>GetCLRText()</c>.</param>
/// <returns>A deferred sequence of substrings in text order.</returns>
public static IEnumerable<string> Enumerate(this BreakIterator bi)
{
    // The StringBuilder the previous version allocated here was never read, so it is gone.
    string text = bi.GetCLRText();
    int start = bi.First();

    for (int end = bi.Next(); end != BreakIterator.DONE; end = bi.Next())
    {
        yield return text.Substring(start, end - start);

        // The end of this span is the beginning of the next one.
        start = end;
    }
}
/// <summary>
/// Sets <paramref name="ci"/> as the iterator's text and advances through every
/// boundary until the iterator reports <c>BreakIterator.Done</c>.
/// </summary>
/// <param name="bi">Iterator to drive.</param>
/// <param name="ci">Text to iterate over.</param>
private void Consume(BreakIterator bi, CharacterIterator ci)
{
    bi.SetText(ci);

    int boundary;
    do
    {
        boundary = bi.Next();
    }
    while (boundary != BreakIterator.Done);
}
//=========================================================================
// general test subroutines
//=========================================================================

/// <summary>
/// Walks <paramref name="bi"/> forward with First()/Next() and collects the pieces of
/// <paramref name="text"/> between consecutive boundaries. Reports an error when the
/// first boundary is not 0, when Next() fails to advance, or when Done is returned
/// before the end of the text.
/// </summary>
/// <param name="bi">Iterator to walk; the caller is expected to have set its text.</param>
/// <param name="text">The text the boundaries index into.</param>
/// <returns>The substrings between consecutive boundaries, in order.</returns>
private List<String> _testFirstAndNext(BreakIterator bi, String text)
{
    List<String> pieces = new List<String>();

    int position = bi.First();
    if (position != 0)
    {
        Errln("first() returned " + position + " instead of 0");
    }

    int previous = position;
    while (position != BreakIterator.Done)
    {
        position = bi.Next();

        if (position == BreakIterator.Done)
        {
            // Iteration ended: the last real boundary must be the end of the text.
            if (previous != text.Length)
            {
                Errln("next() returned DONE prematurely: offset was " + previous + " instead of " + text.Length);
            }
        }
        else
        {
            if (position <= previous)
            {
                Errln("next() failed to move forward: next() on position " + previous + " yielded " + position);
            }

            // Substring takes (startIndex, length), hence the subtraction.
            pieces.Add(text.Substring(previous, position - previous));
        }

        previous = position;
    }

    return pieces;
}
/// <summary>
/// Builds the summary used for a document in which no hits were found. This
/// implementation returns passages for the first <paramref name="maxPassages"/>
/// spans produced by <paramref name="bi"/>; subclasses can override it to customize.
/// </summary>
/// <param name="fieldName">Name of the field; not used by this implementation.</param>
/// <param name="bi">Iterator that has not been advanced yet.</param>
/// <param name="maxPassages">Upper bound on the number of passages returned.</param>
/// <returns>At most <paramref name="maxPassages"/> passages, each scored NaN.</returns>
protected virtual Passage[] GetEmptyHighlight(string fieldName, BreakIterator bi, int maxPassages)
{
    JCG.List<Passage> summary = new JCG.List<Passage>();

    // The iterator is expected to still sit at the beginning of the text.
    int spanStart = bi.Current;
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(spanStart == 0);
    }

    while (summary.Count < maxPassages)
    {
        int spanEnd = bi.Next();
        if (spanEnd == BreakIterator.Done)
        {
            break;
        }

        summary.Add(new Passage
        {
            score = float.NaN,
            startOffset = spanStart,
            endOffset = spanEnd
        });

        spanStart = spanEnd;
    }

    return summary.ToArray();
}
/// <summary>
/// Hands the part of <paramref name="buffer"/> that begins at <paramref name="start"/>
/// to the break iterator and returns the result of its first <c>Next()</c> call,
/// shifted by <paramref name="start"/> so that it is expressed in buffer coordinates.
/// </summary>
/// <param name="buffer">Text to scan.</param>
/// <param name="start">Offset in <paramref name="buffer"/> to scan from.</param>
/// <returns>
/// <paramref name="start"/> unchanged when it lies outside the buffer; otherwise the
/// iterator's answer plus <paramref name="start"/>.
/// </returns>
public virtual int FindEndOffset(StringBuilder buffer, int start)
{
    // An illegal start offset is handed back as-is.
    bool startIsLegal = start >= 0 && start <= buffer.Length;
    if (!startIsLegal)
    {
        return start;
    }

    string remainder = buffer.ToString(start, buffer.Length - start);
    bi.SetText(remainder);

    int relativeEnd = bi.Next();
    return relativeEnd + start;
}
/// <summary>
/// Moves through the sentences in the buffer until one of them yields a word.
/// </summary>
/// <returns>
/// <c>true</c> if a token was produced from the buffer; <c>false</c> if the buffer
/// must be refilled or the break iterator is exhausted.
/// </returns>
private bool IncrementSentence()
{
    if (length == 0)
    {
        return false; // the buffer must be refilled first
    }

    do
    {
        int sentenceStart = iterator.Current;
        if (sentenceStart == BreakIterator.Done)
        {
            return false; // break iterator exhausted
        }

        // Locate the next pair of boundaries.
        int sentenceEnd = iterator.Next();
        if (sentenceEnd == BreakIterator.Done)
        {
            return false; // break iterator exhausted
        }

        SetNextSentence(sentenceStart, sentenceEnd);
    }
    while (!IncrementWord()); // try the following sentence when this one yields no word

    return true;
}
/// <summary>
/// Sets the string form of <paramref name="ci"/> as the iterator's text and advances
/// through every boundary until the iterator reports <c>BreakIterator.DONE</c>.
/// </summary>
/// <param name="bi">Iterator to drive.</param>
/// <param name="ci">Source of the text to iterate over.</param>
private void Consume(BreakIterator bi, CharacterIterator ci)
{
    bi.SetText(ci.toString());

    int boundary;
    do
    {
        boundary = bi.Next();
    }
    while (boundary != BreakIterator.DONE);
}
/// <summary>
/// Advances the wrapped iterator by one boundary and recomputes <c>status</c> from
/// the position before the move and the position after it.
/// </summary>
/// <returns>The value returned by the wrapped iterator's <c>Next()</c>.</returns>
public override int Next()
{
    int before = bi.Current;
    int after = bi.Next();

    status = CalcStatus(before, after);
    return after;
}
/// <summary>
/// Advances the underlying word breaker by one boundary and scans the span it just
/// covered for places where Thai letters meet non-Thai letters. Each such place is
/// queued in <c>transitions</c>, followed by the span's end.
/// </summary>
/// <returns>
/// The first queued transition when the span contains at least one; otherwise the
/// boundary returned by the underlying word breaker (possibly <c>BreakIterator.Done</c>).
/// </returns>
private int GetNext()
{
    bool prevWasThai = false, prevWasNonThai = false;

    int prev = wordBreaker.Current;
    int current = wordBreaker.Next();

    if (current != BreakIterator.Done && current - prev > 0)
    {
        // Find all of the transitions between Thai and non-Thai characters and digits
        for (int i = prev; i < current; i++)
        {
            // Remember where this character begins: 'i' is bumped below when the
            // character is a surrogate pair, and a transition must be recorded at the
            // start of the pair, never between its two halves.
            int charStart = i;
            char high = text[i];
            string toMatch;

            // Account for surrogate pairs
            if (char.IsHighSurrogate(high) && i + 1 < current && char.IsLowSurrogate(text[i + 1]))
            {
                toMatch = string.Empty + high + text[++i];
            }
            else
            {
                toMatch = string.Empty + high;
            }

            bool isThai, isNonThai;
            if (char.IsLetter(toMatch, 0)) // Always break letters apart from digits to match the JDK
            {
                isThai = thaiPattern.IsMatch(toMatch);
                isNonThai = !isThai;
            }
            else
            {
                // Non-letters belong to neither class.
                isThai = false;
                isNonThai = false;
            }

            if ((prevWasThai && isNonThai) || (prevWasNonThai && isThai))
            {
                transitions.Enqueue(charStart);
            }

            // record the values for comparison with the next loop
            prevWasThai = isThai;
            prevWasNonThai = isNonThai;
        }

        if (transitions.Count > 0)
        {
            // Close the span so the caller eventually reaches the real boundary.
            transitions.Enqueue(current);
            return transitions.Peek();
        }
    }

    return current;
}
/// <summary>
/// Checks that the default word iterator breaks "boo." at 0, before the period (3),
/// and at the end of the string (4).
/// </summary>
public void TestEndBehavior()
{
    String sample = "boo.";
    BreakIterator words = BreakIterator.GetWordInstance();
    words.SetText(sample);

    int firstBreak = words.First();
    if (firstBreak != 0)
    {
        Errln("Didn't get break at beginning of string.");
    }

    int secondBreak = words.Next();
    if (secondBreak != 3)
    {
        Errln("Didn't get break before period in \"boo.\"");
    }

    // Only advance again when the iterator is not already at the end.
    if (words.Current != 4)
    {
        if (words.Next() != 4)
        {
            Errln("Didn't get break at end of string.");
        }
    }
}
/// <summary>
/// Collects the first four word boundaries of "aaa bbb ccc" and checks that
/// Following(), Preceding() and IsBoundary() agree with them.
/// </summary>
public void TestPreceding()
{
    String sample = "aaa bbb ccc";
    BreakIterator words = BreakIterator.GetWordInstance(CultureInfo.CurrentCulture);
    words.SetText(sample);
    words.First();

    int b1 = words.Next();
    int b2 = words.Next();
    int b3 = words.Next();
    int b4 = words.Next();

    // Probe from one past the second boundary.
    int probe = b2 + 1;
    int following = words.Following(probe);
    int preceding = words.Preceding(probe);

    if (following != b3)
    {
        Errln("IntlTestTextBoundary::TestPreceding: f!=p3");
    }
    if (preceding != b2)
    {
        Errln("IntlTestTextBoundary::TestPreceding: p!=p2");
    }
    if (b1 + 1 != b2)
    {
        Errln("IntlTestTextBoundary::TestPreceding: p1+1!=p2");
    }
    if (b3 + 1 != b4)
    {
        Errln("IntlTestTextBoundary::TestPreceding: p3+1!=p4");
    }

    bool boundariesAgree = words.IsBoundary(b2) && !words.IsBoundary(b2 + 1) && words.IsBoundary(b3);
    if (!boundariesAgree)
    {
        Errln("IntlTestTextBoundary::TestPreceding: isBoundary err");
    }
}
/// <summary>
/// Asserts that two break iterators break the text the same way, comparing every
/// navigation method and the resulting current position after each call.
/// </summary>
/// <param name="one">Text given to <paramref name="expected"/>.</param>
/// <param name="two">Text given to <paramref name="actual"/>.</param>
/// <param name="expected">Reference iterator.</param>
/// <param name="actual">Iterator under test.</param>
public void assertSameBreaks(CharacterIterator one, CharacterIterator two, BreakIterator expected, BreakIterator actual)
{
    expected.SetText(one);
    actual.SetText(two);
    assertEquals(expected.Current, actual.Current);

    // Next(): walk forward until the reference iterator is exhausted.
    int position = expected.Current;
    while (position != BreakIterator.DONE)
    {
        position = expected.Next();
        assertEquals(position, actual.Next());
        assertEquals(expected.Current, actual.Current);
    }

    // First()
    assertEquals(expected.First(), actual.First());
    assertEquals(expected.Current, actual.Current);

    // Last()
    assertEquals(expected.Last(), actual.Last());
    assertEquals(expected.Current, actual.Current);

    // Previous(): walk backward until the reference iterator is exhausted.
    position = expected.Current;
    while (position != BreakIterator.DONE)
    {
        position = expected.Previous();
        assertEquals(position, actual.Previous());
        assertEquals(expected.Current, actual.Current);
    }

    // Following(): probe every offset, restarting both iterators from the front.
    int offset = one.BeginIndex;
    while (offset <= one.EndIndex)
    {
        expected.First();
        actual.First();
        assertEquals(expected.Following(offset), actual.Following(offset));
        assertEquals(expected.Current, actual.Current);
        offset++;
    }

    // Preceding(): probe every offset, restarting both iterators from the back.
    offset = one.BeginIndex;
    while (offset <= one.EndIndex)
    {
        expected.Last();
        actual.Last();
        assertEquals(expected.Preceding(offset), actual.Preceding(offset));
        assertEquals(expected.Current, actual.Current);
        offset++;
    }
}
/// <summary>
/// Checks that SetText() stores the text, resets the position to the beginning (also
/// after the iterator has been moved), and that the ICharSequence overload works
/// for a simple line-break case.
/// </summary>
public void TestGetSetText()
{
    Logln("Testing getText setText ");

    String firstText = "first string.";
    String secondText = "Second string.";

    RuleBasedBreakIterator wordIterator = (RuleBasedBreakIterator)BreakIterator.GetWordInstance(CultureInfo.CurrentCulture);
    CharacterIterator firstTextIterator = new StringCharacterIterator(firstText);

    wordIterator.SetText(firstText);
    bool textMatches = wordIterator.Text.Equals(firstTextIterator);
    if (!textMatches)
    {
        Errln("ERROR:1 error in setText or getText ");
    }
    if (wordIterator.Current != 0)
    {
        Errln("ERROR:1 setText did not set the iteration position to the beginning of the text, it is" + wordIterator.Current + "\n");
    }

    // Move away from the start, then make sure a new text resets the position.
    wordIterator.Next(2);
    wordIterator.SetText(secondText);
    if (wordIterator.Current != 0)
    {
        Errln("ERROR:2 setText did not reset the iteration position to the beginning of the text, it is" + wordIterator.Current + "\n");
    }

    // Test the character-sequence overload of SetText() for a simple case.
    BreakIterator lineIterator = BreakIterator.GetLineInstance(new CultureInfo("en"));
    ICharSequence sequenceText = "Hello, World. ".ToCharSequence();

    // Expected line breaks in "Hello, World. ": offsets 0, 7 and 14.
    List<int> expectedBreaks = new List<int> { 0, 7, 14 };

    lineIterator.SetText(sequenceText);
    int breakPosition = lineIterator.First();
    while (breakPosition != BreakIterator.Done)
    {
        assertTrue("", expectedBreaks.Contains(breakPosition));
        breakPosition = lineIterator.Next();
    }

    assertEquals("", sequenceText.Length, lineIterator.Current);
}
/// <summary>
/// Splits <paramref name="text"/> at the default word iterator's boundaries and
/// returns the spans whose first character is a letter.
/// </summary>
/// <param name="text">Text to split.</param>
/// <returns>The letter-initial spans, in text order.</returns>
List<String> ParseText(String text)
{
    BreakIterator wordBoundaries = BreakIterator.GetWordInstance();
    wordBoundaries.Text = text;

    List<String> words = new List<String>();
    int spanStart = wordBoundaries.First();
    int spanEnd = wordBoundaries.Next();

    while (spanEnd != BreakIterator.DONE)
    {
        // Spans that begin with anything other than a letter are skipped.
        if (Char.IsLetter(text[spanStart]))
        {
            words.Add(text.Substring(spanStart, spanEnd - spanStart));
        }

        spanStart = spanEnd;
        spanEnd = wordBoundaries.Next();
    }

    return words;
}
/// <summary>
/// Regression test for bug 12918: iterating over a string whose first character
/// decomposes under normalization must make strictly forward progress.
/// </summary>
public void TestBug12918()
{
    // This test triggered an assertion failure in ICU4C, in dictbe.cpp
    // The equivalent code in ICU4J is structured slightly differently,
    // and does not appear vulnerable to the same issue.
    //
    // \u3325 decomposes with normalization, then the CJK dictionary
    // finds a break within the decomposition.
    String crasherString = "\u3325\u4a16";
    BreakIterator iter = BreakIterator.GetWordInstance(ULocale.ENGLISH);
    iter.SetText(crasherString);
    iter.First();

    int pos = 0;
    int lastPos = -1;
    while ((pos = iter.Next()) != BreakIterator.Done)
    {
        assertTrue("", pos > lastPos);

        // Track the boundary just seen. Without this the assertion above compared
        // every position against -1 and could never detect a stalled iterator.
        lastPos = pos;
    }
}
/// <summary>
/// Advances the underlying word breaker by one boundary and scans the span it just
/// covered, one <c>char</c> at a time, for places where Thai letters meet non-Thai
/// letters. Each such index is added to <c>transitions</c>, followed by the span's end.
/// </summary>
/// <returns>
/// The first element of <c>transitions</c> when it is non-empty after the scan;
/// otherwise the boundary returned by the underlying word breaker.
/// </returns>
private int GetNext()
{
    int spanStart = wordBreaker.Current;
    int spanEnd = wordBreaker.Next();

    // Nothing to scan when iteration ended or the span is empty.
    if (spanEnd == BreakIterator.Done || spanEnd - spanStart <= 0)
    {
        return spanEnd;
    }

    bool previousThai = false;
    bool previousNonThai = false;

    // Find all of the transitions between Thai and non-Thai characters and digits
    for (int index = spanStart; index < spanEnd; index++)
    {
        char c = text[index];

        // Non-letters belong to neither class.
        bool letter = char.IsLetter(c);
        bool thai = letter && thaiPattern.IsMatch(c.ToString());
        bool nonThai = letter && !thai;

        bool switchedScript = (previousThai && nonThai) || (previousNonThai && thai);
        if (switchedScript)
        {
            transitions.Add(index);
        }

        // Remember this character's class for the next comparison.
        previousThai = thai;
        previousNonThai = nonThai;
    }

    if (!transitions.Any())
    {
        return spanEnd;
    }

    // Close the span so the real boundary is reached after the transitions.
    transitions.Add(spanEnd);
    return transitions.First();
}
/// <summary>
/// Drives <paramref name="bi"/> over the three sentences of <c>TEXT</c> forwards and
/// backwards, comparing each span with <c>SENTENCES</c>, and then checks
/// Following(), Preceding() and multi-step Next().
/// </summary>
/// <remarks>
/// LUCENENET NOTE: Refactored a bit because Substring in .NET takes
/// (startIndex, length) rather than Java's (beginIndex, endIndex).
/// </remarks>
/// <param name="bi">Iterator under test; expected to be positioned at 0.</param>
private void Do3SentenceTest(BreakIterator bi)
{
    assertEquals(0, bi.Current);
    assertEquals(0, bi.First());

    // Forward: the first two sentences are read span by span.
    int spanStart;
    for (int i = 0; i < 2; i++)
    {
        spanStart = bi.Current;
        assertEquals(SENTENCES[i], TEXT.Substring(spanStart, bi.Next() - spanStart));
    }

    // Forward: the third sentence must end exactly at the end of the text.
    spanStart = bi.Current;
    assertEquals(bi.Text.EndIndex, bi.Next());
    int spanEnd = bi.Current;
    assertEquals(SENTENCES[2], TEXT.Substring(spanStart, spanEnd - spanStart));
    assertEquals(BreakIterator.Done, bi.Next());

    // Backward: from the last boundary, sentences come back in reverse order.
    assertEquals(TEXT.Length, bi.Last());
    for (int i = 2; i >= 0; i--)
    {
        int upper = bi.Current;
        int lower = bi.Previous();
        assertEquals(SENTENCES[i], TEXT.Substring(lower, upper - lower));
    }
    assertEquals(BreakIterator.Done, bi.Previous());
    assertEquals(0, bi.Current);

    // Offset-based lookups.
    assertEquals(59, bi.Following(39));
    assertEquals(59, bi.Following(31));
    assertEquals(31, bi.Following(30));

    assertEquals(0, bi.Preceding(57));
    assertEquals(0, bi.Preceding(58));
    assertEquals(31, bi.Preceding(59));

    // Multi-step moves.
    assertEquals(0, bi.First());
    assertEquals(59, bi.Next(2));
    assertEquals(0, bi.Next(-2));
}
/// <summary>
/// Walks the invariant-culture line iterator over <c>LINE_TEXT</c> and checks the
/// boundaries it reports, the value of Current after each move, and the behaviour
/// at both ends of the text.
/// </summary>
public void TestLineIteration()
{
    BreakIterator lines = GetLineInstance(System.Globalization.CultureInfo.InvariantCulture);
    int done = BreakIterator.Done;

    // Before any text is set: position 0, nothing to advance to.
    Assert.AreEqual(0, lines.Current);
    Assert.AreEqual(done, lines.Next());
    Assert.AreEqual(0, lines.Current);

    lines.SetText(LINE_TEXT);

    // Position starts at 0 once text is set.
    Assert.AreEqual(0, lines.Current);

    // Each of these boundaries must be returned by Next() and then echoed by Current.
    int[] leadingBoundaries =
    {
        7,  // Apache\t^Lucene - a break is expected on \t
        13, // Lucene^(TM)
        18, // Lucene(TM) ^is a
    };
    foreach (int boundary in leadingBoundaries)
    {
        Assert.AreEqual(boundary, lines.Next());
        Assert.AreEqual(boundary, lines.Current);
    }

    // Skip ahead to the start of "high-performance".
    for (int skipped = 0; skipped < 2; skipped++)
    {
        lines.Next();
    }

    // high-\n^performance
    Assert.AreEqual(29, lines.Next());

    // Last boundary (in Java.^); moving past it reports Done and leaves the position alone.
    Assert.AreEqual(108, lines.Last());
    Assert.AreEqual(done, lines.Next());
    Assert.AreEqual(108, lines.Current);

    // Previous() and Current agree.
    Assert.AreEqual(103, lines.Previous());
    Assert.AreEqual(103, lines.Current);

    // First() and Current agree.
    Assert.AreEqual(0, lines.First());
    Assert.AreEqual(0, lines.Current);

    // Moving before the first boundary reports Done and leaves the position alone.
    Assert.AreEqual(done, lines.Previous());
    Assert.AreEqual(0, lines.Current);

    // Last() again, this time from the beginning.
    Assert.AreEqual(108, lines.Last());
}
/// <summary>
/// Walks the invariant-culture word iterator over an empty string and then over
/// <c>TEXT</c>, checking the boundaries it reports, the value of Current after a
/// move, and the behaviour at both ends of the text.
/// </summary>
public void TestWordIteration()
{
    BreakIterator words = GetWordInstance(System.Globalization.CultureInfo.InvariantCulture);
    int done = BreakIterator.Done;

    // Empty text: position 0, nothing to advance to.
    words.SetText("");
    Assert.AreEqual(0, words.Current);
    Assert.AreEqual(done, words.Next());
    Assert.AreEqual(0, words.Current);

    words.SetText(TEXT);

    // Position starts at 0 once text is set.
    Assert.AreEqual(0, words.Current);

    // Each of these boundaries must be returned by Next() and then echoed by Current.
    int[] echoedBoundaries =
    {
        6,  // Apache^
        7,  // ^Lucene
        13, // Lucene^
    };
    foreach (int boundary in echoedBoundaries)
    {
        Assert.AreEqual(boundary, words.Next());
        Assert.AreEqual(boundary, words.Current);
    }

    // These boundaries are only checked through Next().
    int[] followingBoundaries =
    {
        14, // ^TM
        16, // TM^
        17, // TM)^
        18, // ^is
    };
    foreach (int boundary in followingBoundaries)
    {
        Assert.AreEqual(boundary, words.Next());
    }

    // Skip ahead towards "high-performance".
    for (int skipped = 0; skipped < 3; skipped++)
    {
        words.Next();
    }

    // ^high-performance
    Assert.AreEqual(23, words.Next());
    // No break on the hyphen: high-performance^
    Assert.AreEqual(39, words.Next());

    // Last boundary; moving past it reports Done and leaves the position alone.
    Assert.AreEqual(107, words.Last());
    Assert.AreEqual(done, words.Next());
    Assert.AreEqual(107, words.Current);

    // First boundary; moving before it reports Done and leaves the position alone.
    Assert.AreEqual(0, words.First());
    Assert.AreEqual(done, words.Previous());
    Assert.AreEqual(0, words.Current);
}