/// <summary>
/// Checks that <paramref name="bi"/> reports exactly one span covering its whole text:
/// a single forward step reaches the end index, a single backward step returns to the
/// begin index, and every other movement yields <c>BreakIterator.Done</c>.
/// </summary>
/// <param name="bi">Iterator under test; its text must already be set.</param>
/// <param name="text">The text the single span is expected to equal.</param>
private void Test1Sentence(BreakIterator bi, String text)
{
    int origin = bi.Text.BeginIndex;

    // Forward: First() sits on the begin index, one Next() lands on the end index.
    assertEquals(origin, bi.First());
    int from = bi.Current;
    assertEquals(bi.Text.EndIndex, bi.Next());
    int to = bi.Current - origin;
    // NOTE(review): 'to' is already relative to 'origin' and 'origin' is subtracted again
    // below; this only yields the full text when the begin index is 0 - confirm intent.
    assertEquals(text, text.Substring(from - origin, to - origin));

    // Backward: Last() is text.Length past the origin, one Previous() returns to it.
    assertEquals(text.Length, bi.Last() - origin);
    to = bi.Current;
    bi.Previous();
    assertEquals(BreakIterator.Done, bi.Previous());
    int back = bi.Current;
    assertEquals(text, text.Substring(back - origin, to - origin));
    assertEquals(origin, bi.Current);

    // Random access from the middle of the text finds no further boundary.
    assertEquals(BreakIterator.Done, bi.Following(bi.Last() / 2 + origin));
    assertEquals(BreakIterator.Done, bi.Preceding(bi.Last() / 2 + origin));

    // Multi-step moves that overshoot in either direction report Done.
    assertEquals(origin, bi.First());
    assertEquals(BreakIterator.Done, bi.Next(13));
    assertEquals(BreakIterator.Done, bi.Next(-8));
}
/// <summary>
/// Checks the degenerate case in which <paramref name="bi"/> has nothing to iterate over:
/// the position is pinned at 0 and every attempt to move returns <c>BreakIterator.Done</c>.
/// </summary>
/// <param name="bi">Iterator under test; its text must already be set.</param>
private void Test0Sentences(BreakIterator bi)
{
    int done = BreakIterator.Done;

    // Both ends of the text coincide at offset 0.
    assertEquals(0, bi.Current);
    assertEquals(0, bi.First());
    assertEquals(done, bi.Next());
    assertEquals(0, bi.Last());
    assertEquals(done, bi.Previous());

    // Random access has no boundary to report on either side of 0.
    assertEquals(done, bi.Following(0));
    assertEquals(done, bi.Preceding(0));

    // Multi-step moves fail the same way in both directions.
    assertEquals(0, bi.First());
    assertEquals(done, bi.Next(13));
    assertEquals(done, bi.Next(-8));
}
/**
 * Asserts that two break iterators break the text the same way.
 *
 * Each iterator is given its own character iterator, then both are driven in lock step
 * through Next(), First(), Last(), Previous(), Following() and Preceding(). After every
 * call the returned boundary and the Current position of both iterators must agree.
 *
 * @param one      text handed to the reference iterator; its bounds drive the offset loops
 * @param two      text handed to the iterator under test
 * @param expected reference iterator
 * @param actual   iterator under test
 */
public void assertSameBreaks(CharacterIterator one, CharacterIterator two, BreakIterator expected, BreakIterator actual)
{
    expected.SetText(one);
    actual.SetText(two);
    assertEquals(expected.Current, actual.Current);

    // Next(): walk forward until the reference iterator runs out of boundaries.
    int boundary = expected.Current;
    while (boundary != BreakIterator.DONE)
    {
        boundary = expected.Next();
        assertEquals(boundary, actual.Next());
        assertEquals(expected.Current, actual.Current);
    }

    // First()
    assertEquals(expected.First(), actual.First());
    assertEquals(expected.Current, actual.Current);

    // Last()
    assertEquals(expected.Last(), actual.Last());
    assertEquals(expected.Current, actual.Current);

    // Previous(): walk backward until the reference iterator runs out of boundaries.
    boundary = expected.Current;
    while (boundary != BreakIterator.DONE)
    {
        boundary = expected.Previous();
        assertEquals(boundary, actual.Previous());
        assertEquals(expected.Current, actual.Current);
    }

    // Following(): probe every offset, restarting both iterators from the front each time.
    for (int offset = one.BeginIndex; offset <= one.EndIndex; offset++)
    {
        expected.First();
        actual.First();
        assertEquals(expected.Following(offset), actual.Following(offset));
        assertEquals(expected.Current, actual.Current);
    }

    // Preceding(): probe every offset, restarting both iterators from the back each time.
    for (int offset = one.BeginIndex; offset <= one.EndIndex; offset++)
    {
        expected.Last();
        actual.Last();
        assertEquals(expected.Preceding(offset), actual.Preceding(offset));
        assertEquals(expected.Current, actual.Current);
    }
}
//=========================================================================
// general test subroutines
//=========================================================================

/// <summary>
/// Walks <paramref name="bi"/> from First() with repeated Next() calls and returns the
/// pieces of <paramref name="text"/> lying between consecutive boundaries. Reports an
/// error through Errln when First() is not 0, when Next() fails to advance, or when Done
/// is returned before the end of the text.
/// </summary>
/// <param name="bi">Iterator to walk; its text must already be set.</param>
/// <param name="text">The string the iterator was set up with.</param>
/// <returns>The substrings between successive boundaries, in order.</returns>
private List<String> _testFirstAndNext(BreakIterator bi, String text)
{
    List<String> pieces = new List<String>();
    int pos = bi.First();
    int prev = pos;

    if (pos != 0)
    {
        Errln("first() returned " + pos + " instead of 0");
    }

    while (pos != BreakIterator.Done)
    {
        pos = bi.Next();
        if (pos == BreakIterator.Done)
        {
            // Iteration finished: the last real boundary must be the end of the text.
            if (prev != text.Length)
            {
                Errln("next() returned DONE prematurely: offset was " + prev + " instead of " + text.Length);
            }
        }
        else
        {
            if (pos <= prev)
            {
                Errln("next() failed to move forward: next() on position " + prev + " yielded " + pos);
            }
            // .NET Substring takes (start, length), hence the subtraction.
            pieces.Add(text.Substring(prev, pos - prev));
        }
        prev = pos;
    }

    return pieces;
}
/**
 * Sets the text on the iterator and asserts that its first two Next() calls
 * return 20 and then 84, after which the iterator is rewound with First().
 *
 * @param filteredBI iterator under test
 * @param text       text to break
 */
private void assertFrenchBreakBehavior(BreakIterator filteredBI, String text)
{
    Logln("Testing French behavior:");
    filteredBI.SetText(text);

    int[] expectedBreaks = { 20, 84 };
    foreach (int expectedBreak in expectedBreaks)
    {
        assertEquals("6th next", expectedBreak, filteredBI.Next());
    }

    filteredBI.First();
}
/**
 * Sets the text on the iterator and asserts that its first two Next() calls
 * return 84 and then 278, after which the iterator is rewound with First().
 *
 * @param filteredBI iterator under test
 * @param text       text to break
 */
private void assertEnglishBreakBehavior(BreakIterator filteredBI, String text)
{
    Logln("Testing English filtered behavior:");
    filteredBI.SetText(text);

    int[] expectedBreaks = { 84, 278 };
    foreach (int expectedBreak in expectedBreaks)
    {
        assertEquals("5th next", expectedBreak, filteredBI.Next());
    }

    filteredBI.First();
}
/// <summary>
/// Drives <paramref name="bi"/> over TEXT and checks that it splits it into the three
/// entries of SENTENCES, walking forward, then backward, then by random access and
/// multi-step moves.
/// </summary>
/// <remarks>
/// LUCENENET NOTE: Refactored a bit because Substring in .NET takes (start, length)
/// rather than Java's (start, end), so every slice length is computed by subtraction.
/// </remarks>
/// <param name="bi">Iterator under test; its text must already be set.</param>
private void Do3SentenceTest(BreakIterator bi)
{
    assertEquals(0, bi.Current);
    assertEquals(0, bi.First());

    // Forward: the first two sentences are delimited by successive Next() calls.
    for (int i = 0; i < 2; i++)
    {
        int from = bi.Current;
        assertEquals(SENTENCES[i], TEXT.Substring(from, bi.Next() - from));
    }

    // The third Next() must land exactly on the end of the text.
    int lastStart = bi.Current;
    assertEquals(bi.Text.EndIndex, bi.Next());
    int lastEnd = bi.Current;
    assertEquals(SENTENCES[2], TEXT.Substring(lastStart, lastEnd - lastStart));
    assertEquals(BreakIterator.Done, bi.Next());

    // Backward: from Last(), each Previous() uncovers one sentence, last to first.
    assertEquals(TEXT.Length, bi.Last());
    for (int i = 2; i >= 0; i--)
    {
        int end = bi.Current;
        int prev = bi.Previous();
        assertEquals(SENTENCES[i], TEXT.Substring(prev, end - prev));
    }
    assertEquals(BreakIterator.Done, bi.Previous());
    assertEquals(0, bi.Current);

    // Random access around the inner boundaries at 31 and 59.
    assertEquals(59, bi.Following(39));
    assertEquals(59, bi.Following(31));
    assertEquals(31, bi.Following(30));

    assertEquals(0, bi.Preceding(57));
    assertEquals(0, bi.Preceding(58));
    assertEquals(31, bi.Preceding(59));

    // Multi-step moves.
    assertEquals(0, bi.First());
    assertEquals(59, bi.Next(2));
    assertEquals(0, bi.Next(-2));
}
/**
 * Sets the text on the iterator and asserts that its first five Next() calls
 * return 20, 84, 90, 181 and 278, after which the iterator is rewound with First().
 *
 * @param filteredBI iterator under test
 * @param text       text to break
 */
private void assertDefaultBreakBehavior(BreakIterator filteredBI, String text)
{
    Logln("Testing Default Behavior:");
    filteredBI.SetText(text);

    int[] expectedBreaks = { 20, 84, 90, 181, 278 };
    foreach (int expectedBreak in expectedBreaks)
    {
        assertEquals("1st next", expectedBreak, filteredBI.Next());
    }

    filteredBI.First();
}
/// <summary>
/// Lazily yields one <c>Token</c> per span between consecutive boundaries of
/// <paramref name="bi"/>, starting from First(). Each token carries the span's start
/// and end offsets, the matching slice of the iterator's text, and the value of
/// GetRuleStatus() read right after the span's end boundary was reached.
/// </summary>
/// <remarks>
/// Enumeration moves the iterator itself; nothing runs until the sequence is enumerated.
/// </remarks>
/// <param name="bi">Iterator to walk.</param>
/// <returns>The tokens in text order.</returns>
public static IEnumerable<Token> EnumerateTokens(this BreakIterator bi)
{
    string text = bi.GetCLRText();
    int start = bi.First();

    for (int end = bi.Next(); end != BreakIterator.DONE; end = bi.Next())
    {
        string slice = text.Substring(start, end - start);
        yield return new Token(start, end, slice, bi.GetRuleStatus());
        start = end;
    }
}
/// <summary>
/// Lazily yields the slice of the iterator's text lying between each pair of
/// consecutive boundaries of <paramref name="bi"/>, starting from First().
/// </summary>
/// <remarks>
/// Enumeration moves the iterator itself; nothing runs until the sequence is enumerated.
/// </remarks>
/// <param name="bi">Iterator to walk.</param>
/// <returns>The text segments in order.</returns>
public static IEnumerable<string> Enumerate(this BreakIterator bi)
{
    // The original allocated a StringBuilder here that was never read; it has been removed.
    string text = bi.GetCLRText();
    int start = bi.First(), end = bi.Next();

    while (end != BreakIterator.DONE)
    {
        // .NET Substring takes (start, length).
        yield return text.Substring(start, end - start);
        start = end;
        end = bi.Next();
    }
}
/// <summary>
/// Checks that SetText stores the text it is given, that it puts the iteration position
/// back to 0 (also after the iterator has been advanced), and that the ICharSequence
/// overload works for a simple line-break case.
/// </summary>
public void TestGetSetText()
{
    Logln("Testing getText setText ");
    String str1 = "first string.";
    String str2 = "Second string.";
    //RuleBasedBreakIterator charIter1 = (RuleBasedBreakIterator) BreakIterator.getCharacterInstance(Locale.getDefault());
    RuleBasedBreakIterator wordIter1 = (RuleBasedBreakIterator)BreakIterator.GetWordInstance(CultureInfo.CurrentCulture);
    CharacterIterator text1 = new StringCharacterIterator(str1);
    //CharacterIterator text1Clone = (CharacterIterator) text1.Clone();
    //CharacterIterator text2 = new StringCharacterIterator(str2);

    // The text read back must equal a character iterator over the same string.
    wordIter1.SetText(str1);
    bool sameText = wordIter1.Text.Equals(text1);
    if (!sameText)
    {
        Errln("ERROR:1 error in setText or getText ");
    }
    if (wordIter1.Current != 0)
    {
        Errln("ERROR:1 setText did not set the iteration position to the beginning of the text, it is" + wordIter1.Current + "\n");
    }

    // Move away from the start, then make sure a new text rewinds the position.
    wordIter1.Next(2);
    wordIter1.SetText(str2);
    if (wordIter1.Current != 0)
    {
        Errln("ERROR:2 setText did not reset the iteration position to the beginning of the text, it is" + wordIter1.Current + "\n");
    }

    // Test the CharSequence overload of setText() for a simple case.
    BreakIterator lineIter = BreakIterator.GetLineInstance(new CultureInfo("en"));
    ICharSequence csText = "Hello, World. ".ToCharSequence();
    // Expected line breaks are at offsets 0, 7 and 14:
    //   Hello, World. 
    //   ^      ^      ^
    List<int> expected = new List<int> { 0, 7, 14 };

    lineIter.SetText(csText);
    int pos = lineIter.First();
    while (pos != BreakIterator.Done)
    {
        assertTrue("", expected.Contains(pos));
        pos = lineIter.Next();
    }
    assertEquals("", csText.Length, lineIter.Current);
}
/// <summary>
/// Splits <paramref name="text"/> at word boundaries and returns only those segments
/// whose first character is a letter.
/// </summary>
/// <param name="text">Text to split.</param>
/// <returns>The letter-initial segments in text order.</returns>
List<String> ParseText(String text)
{
    List<String> words = new List<String>();
    BreakIterator boundary = BreakIterator.GetWordInstance();
    boundary.Text = text;

    int start = boundary.First();
    int end = boundary.Next();
    while (end != BreakIterator.DONE)
    {
        // Segments that open with anything other than a letter are skipped.
        if (Char.IsLetter(text[start]))
        {
            words.Add(text.Substring(start, end - start));
        }
        start = end;
        end = boundary.Next();
    }

    return words;
}
/// <summary>
/// Checks the word breaks of "boo.": one at offset 0, one at offset 3 before the period,
/// and one at offset 4 at the end of the string.
/// </summary>
public void TestEndBehavior()
{
    String testString = "boo.";
    BreakIterator wb = BreakIterator.GetWordInstance();
    wb.SetText(testString);

    int first = wb.First();
    if (first != 0)
    {
        Errln("Didn't get break at beginning of string.");
    }

    int second = wb.Next();
    if (second != 3)
    {
        Errln("Didn't get break before period in \"boo.\"");
    }

    // Passes when the iterator already sits at 4; Next() is only consulted otherwise.
    bool reachedEnd = wb.Current == 4 || wb.Next() == 4;
    if (!reachedEnd)
    {
        Errln("Didn't get break at end of string.");
    }
}
/// <summary>
/// Regression test for ICU bug 12918: iterates word breaks over a string that made
/// ICU4C's dictionary break engine fail, and checks that every boundary returned by
/// Next() lies strictly after the previous one.
/// </summary>
public void TestBug12918()
{
    // This test triggered an assertion failure in ICU4C, in dictbe.cpp
    // The equivalent code in ICU4J is structured slightly differently,
    // and does not appear vulnerable to the same issue.
    //
    // \u3325 decomposes with normalization, then the CJK dictionary
    // finds a break within the decomposition.
    String crasherString = "\u3325\u4a16";
    BreakIterator iter = BreakIterator.GetWordInstance(ULocale.ENGLISH);
    iter.SetText(crasherString);
    iter.First();

    int pos;
    int lastPos = -1;
    while ((pos = iter.Next()) != BreakIterator.Done)
    {
        assertTrue("", pos > lastPos);
        // Remember this boundary so the next one is compared against it. The original
        // never updated lastPos, so the assertion always compared against -1.
        lastPos = pos;
    }
}
/// <summary>
/// Collects the first four word boundaries of "aaa bbb ccc" with Next(), then checks
/// that Following, Preceding and IsBoundary agree with them around the second boundary.
/// </summary>
public void TestPreceding()
{
    String words3 = "aaa bbb ccc";
    BreakIterator e = BreakIterator.GetWordInstance(CultureInfo.CurrentCulture);
    e.SetText(words3);
    e.First();

    // Reference boundaries, gathered sequentially.
    int p1 = e.Next();
    int p2 = e.Next();
    int p3 = e.Next();
    int p4 = e.Next();

    // Random access from one past the second boundary.
    int probe = p2 + 1;
    int f = e.Following(probe);
    int p = e.Preceding(probe);

    if (f != p3)
    {
        Errln("IntlTestTextBoundary::TestPreceding: f!=p3");
    }
    if (p != p2)
    {
        Errln("IntlTestTextBoundary::TestPreceding: p!=p2");
    }

    // The first and second boundaries, and the third and fourth, are one apart.
    if (p2 != p1 + 1)
    {
        Errln("IntlTestTextBoundary::TestPreceding: p1+1!=p2");
    }
    if (p4 != p3 + 1)
    {
        Errln("IntlTestTextBoundary::TestPreceding: p3+1!=p4");
    }

    // p2 and p3 are boundaries; the offset right after p2 is not.
    bool boundariesAgree = e.IsBoundary(p2) && !e.IsBoundary(p2 + 1) && e.IsBoundary(p3);
    if (!boundariesAgree)
    {
        Errln("IntlTestTextBoundary::TestPreceding: isBoundary err");
    }
}
/// <summary>
/// Checks two invariants of <paramref name="tb"/> using every pair of characters drawn
/// from <paramref name="testChars"/>: no boundary falls between CR and LF, and no
/// boundary falls directly before a non-spacing or enclosing mark unless the preceding
/// character is one of the excluded terminators. Gives up after 75 reported errors.
/// </summary>
/// <param name="tb">Iterator under test.</param>
/// <param name="testChars">Characters to combine into test strings.</param>
private void doOtherInvariantTest(BreakIterator tb, String testChars)
{
    StringBuffer work = new StringBuffer("a\r\na");
    int errorCount = 0;

    // a break should never occur between CR and LF
    for (int i = 0; i < testChars.Length; i++)
    {
        work[0] = testChars[i];
        for (int j = 0; j < testChars.Length; j++)
        {
            work[3] = testChars[j];
            tb.SetText(work.ToString());

            int k = tb.First();
            while (k != BreakIterator.Done)
            {
                // Offset 2 is the position between '\r' and '\n'.
                if (k == 2)
                {
                    Errln("Break between CR and LF in string U+" + work[0].ToHexString()
                        + ", U+d U+a U+" + work[3].ToHexString());
                    if (++errorCount >= 75)
                    {
                        return;
                    }
                }
                k = tb.Next();
            }
        }
    }

    // a break should never occur before a non-spacing mark, unless it's preceded
    // by a line terminator
    work.Length = 0;
    work.Append("aaaa");
    for (int i = 0; i < testChars.Length; i++)
    {
        char lead = testChars[i];
        bool excludedLead = lead == '\n' || lead == '\r' || lead == '\u2029'
            || lead == '\u2028' || lead == '\u0003';
        if (excludedLead)
        {
            continue;
        }
        work[1] = lead;

        for (int j = 0; j < testChars.Length; j++)
        {
            char mark = testChars[j];
            var category = Character.GetType(mark);
            bool isMark = category == UnicodeCategory.NonSpacingMark
                || category == UnicodeCategory.EnclosingMark;
            if (!isMark)
            {
                continue;
            }
            work[2] = mark;
            tb.SetText(work.ToString());

            int k = tb.First();
            while (k != BreakIterator.Done)
            {
                // Offset 2 is the position between the lead character and the mark.
                if (k == 2)
                {
                    Errln("Break between U+" + work[1].ToHexString()
                        + " and U+" + work[2].ToHexString());
                    if (++errorCount >= 75)
                    {
                        return;
                    }
                }
                k = tb.Next();
            }
        }
    }
}
/// <summary>
/// Exercises a line-break iterator for the invariant culture: first before any text is
/// set, then over LINE_TEXT, checking the first few boundaries, both ends, and the
/// behaviour when moving past either end.
/// </summary>
public void TestLineIteration()
{
    BreakIterator bi = GetLineInstance(System.Globalization.CultureInfo.InvariantCulture);
    int done = BreakIterator.Done;

    // Before SetText is called: position 0 and nowhere to go.
    Assert.AreEqual(0, bi.Current);
    Assert.AreEqual(done, bi.Next());
    Assert.AreEqual(0, bi.Current);

    bi.SetText(LINE_TEXT);

    // A freshly set text starts at offset 0.
    Assert.AreEqual(0, bi.Current);

    // First three boundaries, each mirrored by Current right after the move:
    //    7 - Apache\t^Lucene  (a break is expected after the tab)
    //   13 - Lucene^(TM)
    //   18 - Lucene(TM) ^is a
    int[] leadingBreaks = { 7, 13, 18 };
    foreach (int expectedBreak in leadingBreaks)
    {
        Assert.AreEqual(expectedBreak, bi.Next());
        Assert.AreEqual(expectedBreak, bi.Current);
    }

    // Skip ahead to the start of "high-performance".
    bi.Next();
    bi.Next();

    // high-\n^performance
    Assert.AreEqual(29, bi.Next());

    // Last boundary (in Java.^)
    Assert.AreEqual(108, bi.Last());

    // Moving past the last boundary reports Done and leaves the position alone.
    Assert.AreEqual(done, bi.Next());
    Assert.AreEqual(108, bi.Current);

    // One step back; Current mirrors the move.
    Assert.AreEqual(103, bi.Previous());
    Assert.AreEqual(103, bi.Current);

    // Back to the first boundary; Current mirrors the move.
    Assert.AreEqual(0, bi.First());
    Assert.AreEqual(0, bi.Current);

    // Moving before the first boundary reports Done and leaves the position alone.
    Assert.AreEqual(done, bi.Previous());
    Assert.AreEqual(0, bi.Current);

    // Jump to the last boundary once more.
    Assert.AreEqual(108, bi.Last());
}
/// <summary>
/// Exercises a word-break iterator for the invariant culture: first over empty text,
/// then over TEXT, checking the leading boundaries, that "high-performance" is not
/// split at the hyphen, and the behaviour when moving past either end.
/// </summary>
public void TestWordIteration()
{
    BreakIterator bi = GetWordInstance(System.Globalization.CultureInfo.InvariantCulture);
    int done = BreakIterator.Done;

    // Empty text: position 0 and nowhere to go.
    bi.SetText("");
    Assert.AreEqual(0, bi.Current);
    Assert.AreEqual(done, bi.Next());
    Assert.AreEqual(0, bi.Current);

    bi.SetText(TEXT);

    // A freshly set text starts at offset 0.
    Assert.AreEqual(0, bi.Current);

    // First three boundaries, each mirrored by Current right after the move:
    //    6 - Apache^
    //    7 - ^Lucene
    //   13 - Lucene^
    int[] trackedBreaks = { 6, 7, 13 };
    foreach (int expectedBreak in trackedBreaks)
    {
        Assert.AreEqual(expectedBreak, bi.Next());
        Assert.AreEqual(expectedBreak, bi.Current);
    }

    // Next four boundaries: ^TM (14), TM^ (16), TM)^ (17), ^is (18).
    int[] followingBreaks = { 14, 16, 17, 18 };
    foreach (int expectedBreak in followingBreaks)
    {
        Assert.AreEqual(expectedBreak, bi.Next());
    }

    // Skip ahead towards "high-performance".
    bi.Next();
    bi.Next();
    bi.Next();

    // ^high-performance
    Assert.AreEqual(23, bi.Next());

    // high-performance^ : the hyphen must not produce a boundary.
    Assert.AreEqual(39, bi.Next());

    // Last boundary.
    Assert.AreEqual(107, bi.Last());

    // Moving past the last boundary reports Done and leaves the position alone.
    Assert.AreEqual(done, bi.Next());
    Assert.AreEqual(107, bi.Current);

    // First boundary.
    Assert.AreEqual(0, bi.First());

    // Moving before the first boundary reports Done and leaves the position alone.
    Assert.AreEqual(done, bi.Previous());
    Assert.AreEqual(0, bi.Current);
}
/// <summary>
/// Exercises a sentence-break iterator for the invariant culture: first over empty text,
/// then over SENTENCE_TEXT, checking every sentence boundary in order, both ends, and
/// the behaviour when moving past either end.
/// </summary>
public void TestSentenceIteration()
{
    BreakIterator bi = GetSentenceInstance(System.Globalization.CultureInfo.InvariantCulture);
    int done = BreakIterator.Done;

    // Empty text: position 0 and nowhere to go.
    bi.SetText("");
    Assert.AreEqual(0, bi.Current);
    Assert.AreEqual(done, bi.Next());
    Assert.AreEqual(0, bi.Current);

    bi.SetText(SENTENCE_TEXT);

    // A freshly set text starts at offset 0.
    Assert.AreEqual(0, bi.Current);

    // First boundary (in Java.^): the newline inside the sentence must not break it.
    Assert.AreEqual(108, bi.Next());
    Assert.AreEqual(108, bi.Current);

    // Remaining sentence ends, in order:
    //   221 - especially cross-platform.^
    //   290 - free download.^
    //   324 - things easy.^
    //   344 - is powerful.^
    //   364 - is exciting.^
    //   380 - is cool.^
    //   400 - Lucene now?^   (last boundary)
    int[] remainingBreaks = { 221, 290, 324, 344, 364, 380, 400 };
    foreach (int expectedBreak in remainingBreaks)
    {
        Assert.AreEqual(expectedBreak, bi.Next());
    }

    // Moving past the last boundary reports Done and leaves the position alone.
    Assert.AreEqual(done, bi.Next());
    Assert.AreEqual(400, bi.Current);

    // One step back; Current mirrors the move.
    Assert.AreEqual(380, bi.Previous());
    Assert.AreEqual(380, bi.Current);

    // Back to the first boundary; Current mirrors the move.
    Assert.AreEqual(0, bi.First());
    Assert.AreEqual(0, bi.Current);

    // Moving before the first boundary reports Done and leaves the position alone.
    Assert.AreEqual(done, bi.Previous());
    Assert.AreEqual(0, bi.Current);

    // Jump to the last boundary once more.
    Assert.AreEqual(400, bi.Last());
}
/// <summary>
/// Exercises a word-break iterator for the "th" culture: first over empty text, then
/// over mixed Thai/Latin text, checking each expected word boundary and the behaviour
/// at both ends, and finally that a run of mixed Thai and Western digits is kept as a
/// single segment.
/// </summary>
public void TestWordIterationThai()
{
    BreakIterator bi = GetWordInstance(new System.Globalization.CultureInfo("th"));
    int done = BreakIterator.Done;

    // Empty text: position 0 and nowhere to go.
    bi.SetText("");
    Assert.AreEqual(0, bi.Current);
    Assert.AreEqual(done, bi.Next());
    Assert.AreEqual(0, bi.Current);

    bi.SetText("บริษัทMicrosoftบริการดีที่สุด");

    // A freshly set text starts at offset 0.
    Assert.AreEqual(0, bi.Current);

    // First three boundaries, each mirrored by Current right after the move:
    //    6 - บริษัท^Microsoft
    //   15 - Microsoft^บริการ
    //   21 - บริการ^ดี
    int[] trackedBreaks = { 6, 15, 21 };
    foreach (int expectedBreak in trackedBreaks)
    {
        Assert.AreEqual(expectedBreak, bi.Next());
        Assert.AreEqual(expectedBreak, bi.Current);
    }

    // Fourth boundary (ดี^ที่สุด)
    Assert.AreEqual(23, bi.Next());

    // Fifth boundary (ดีที่สุด^)
    Assert.AreEqual(29, bi.Next());

    // Moving past the last boundary reports Done and leaves the position alone.
    Assert.AreEqual(done, bi.Next());
    Assert.AreEqual(29, bi.Current);

    // One step back (ดี^ที่สุด)
    Assert.AreEqual(23, bi.Previous());

    // First boundary.
    Assert.AreEqual(0, bi.First());

    // Moving before the first boundary reports Done and leaves the position alone.
    Assert.AreEqual(done, bi.Previous());
    Assert.AreEqual(0, bi.Current);

    // Numerals
    bi.SetText("๑23๔๕๖7");

    // A freshly set text starts at offset 0.
    Assert.AreEqual(0, bi.Current);

    // Hindu and Thai numerals must stay in one group: the first boundary is the end.
    Assert.AreEqual(7, bi.Next());
}
// ICU4N specific: Removed clone, as the cast to SimpleFilteredSentenceBreakIterator when we return object is completely pointless

/// <summary>
/// Moves to the first boundary by forwarding straight to the wrapped iterator.
/// </summary>
/// <returns>Whatever the wrapped iterator's First() returns.</returns>
public override int First()
{
    // A break opportunity at the beginning of the text is never suppressed,
    // so the delegate's answer is passed through unchanged.
    return @delegate.First();
}
/// <summary>
/// Obtains a title-case break iterator for the "en-CA" culture and checks that First()
/// yields a non-null value before any text is set and 0 once text has been set.
/// </summary>
public void TestGetTitleInstance()
{
    const string message = "Title instance break iterator not correctly instantiated";

    CultureInfo canadianEnglish = new CultureInfo("en-CA");
    BreakIterator bi = BreakIterator.GetTitleInstance(canadianEnglish);
    TestFmwk.assertNotEquals(message, bi.First(), null);

    bi.SetText("Here is some Text");
    TestFmwk.assertEquals(message, bi.First(), 0);
}
/// <summary>
/// Scans the range from <c>pos.Start</c> to <c>pos.Limit</c> of <paramref name="text"/>
/// with the break iterator, records every boundary whose neighbouring code points on
/// both sides pass the LETTER_OR_MARK_MASK test, inserts the <c>insertion</c> string at
/// each recorded boundary, and then shifts the position fields by the total inserted length.
/// </summary>
/// <param name="text">Text to modify in place.</param>
/// <param name="pos">Range to process; ContextLimit, Limit and Start are updated on return.</param>
/// <param name="incremental">
/// When true, Start is left just after the last inserted boundary; otherwise it is set to Limit.
/// </param>
protected override void HandleTransliterate(IReplaceable text, TransliterationPosition pos, bool incremental)
{
    lock (this)
    {
        boundaryCount = 0;
        GetBreakIterator(); // Lazy-create it if necessary
        bi.SetText(new ReplaceableCharacterIterator(text, pos.Start, pos.Limit, pos.Start));
        // TODO: fix clumsy workaround used below.
        /*
         * char[] tempBuffer = new char[text.length()];
         * text.getChars(0, text.length(), tempBuffer, 0);
         * bi.setText(new StringCharacterIterator(new String(tempBuffer), pos.start, pos.limit, pos.start));
         */
        // end debugging

        // To make things much easier, the boundaries are stacked first and the insertions
        // are done at the end. Generally there won't be many, since input is filtered.
        for (int candidate = bi.First();
             candidate != BreakIterator.Done && candidate < pos.Limit;
             candidate = bi.Next())
        {
            if (candidate == 0)
            {
                continue;
            }

            // HACK: Check to see that the preceding item was a letter
            int before = UTF16.CharAt(text, candidate - 1);
            int beforeType = UChar.GetUnicodeCategory(before).ToInt32();
            if (((1 << beforeType) & LETTER_OR_MARK_MASK) == 0)
            {
                continue;
            }

            // ... and that the following item passes the same test.
            int after = UTF16.CharAt(text, candidate);
            int afterType = UChar.GetUnicodeCategory(after).ToInt32();
            if (((1 << afterType) & LETTER_OR_MARK_MASK) == 0)
            {
                continue;
            }

            // Double the stack's capacity when it is full.
            if (boundaryCount >= boundaries.Length)
            {
                int[] grown = new int[boundaries.Length * 2];
                System.Array.Copy(boundaries, 0, grown, 0, boundaries.Length);
                boundaries = grown;
            }

            boundaries[boundaryCount] = candidate;
            boundaryCount++;
        }

        int delta = 0;
        int lastBoundary = 0;

        if (boundaryCount != 0)
        {
            // Something was found: work out the total growth before popping the stack.
            delta = boundaryCount * insertion.Length;
            lastBoundary = boundaries[boundaryCount - 1];

            // Insert from the end backwards, so earlier offsets never need updating.
            while (boundaryCount > 0)
            {
                boundaryCount--;
                int at = boundaries[boundaryCount];
                text.Replace(at, at, insertion);
            }
        }

        // Now fix up the return values
        pos.ContextLimit += delta;
        pos.Limit += delta;
        if (incremental)
        {
            pos.Start = lastBoundary + delta;
        }
        else
        {
            pos.Start = pos.Limit;
        }
    }
}