/// <summary>
/// Builds a SentenceAndWordTokenizer, handing the base class a sentence
/// break iterator created for the US locale.
/// </summary>
/// <param name="reader">Reader forwarded unchanged to the base constructor.</param>
public SentenceAndWordTokenizer(TextReader reader)
    : base(reader, BreakIterator.CreateSentenceInstance(Locale.GetUS()))
{
    // Registration order is kept as-is: term, offset, then position increment.
    termAtt = AddAttribute<ICharTermAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
    posIncAtt = AddAttribute<IPositionIncrementAttribute>();
}
/// <summary>
/// Verifies that calling SetText a second time on a sentence iterator swaps in
/// the new text, rewinds the iterator to the first boundary and exposes the
/// boundaries of the new text.
/// </summary>
public void CanSetNewText()
{
    var locale = new Locale("zh");

    // Two spaces follow "!" and "?": the expected offsets below (22, 52 and the
    // final 70, which is the text length) only line up with that spacing.
    var text = "Good-day, kind sir !  Can I have a glass of water?  I am very parched.";
    var expected = new[] { 0, 22, 52, 70 };

    var secondText = "供重呼車遊踏持図質腰大野明会掲歌? 方図強候準素能物第毎止田作昼野集。霊一起続時筑腺算掲断詳山住死示流投。";
    var secondExpected = new[] { 0, 18, 35, 53 };

    using (var bi = BreakIterator.CreateSentenceInstance(locale))
    {
        bi.SetText(text);

        Assert.AreEqual(text, bi.Text);
        CollectionAssert.AreEqual(expected, bi.Boundaries);

        // Move the iterator to the next boundary.
        Assert.AreEqual(expected[1], bi.MoveNext());
        Assert.AreEqual(expected[1], bi.Current);
        Assert.AreEqual((int)BreakIterator.UWordBreak.NONE, bi.GetRuleStatus());

        // Swap in the second text.
        bi.SetText(secondText);
        Assert.AreEqual(secondText, bi.Text);

        // Assert that the iterator was reset back to the first element
        // when we set new text.
        Assert.AreEqual(secondExpected[0], bi.Current);
        Assert.AreEqual((int)BreakIterator.UWordBreak.NONE, bi.GetRuleStatus());
        CollectionAssert.AreEqual(secondExpected, bi.Boundaries);
    }
}
/// <summary>
/// Creates a new ThaiTokenizer, supplying the AttributeFactory.
/// </summary>
/// <param name="factory">Attribute factory forwarded to the base constructor.</param>
/// <param name="reader">Reader forwarded to the base constructor.</param>
/// <exception cref="System.NotSupportedException">
/// Thrown when <c>DBBI_AVAILABLE</c> is false.
/// </exception>
public ThaiTokenizer(AttributeFactory factory, TextReader reader)
    : base(factory, reader, BreakIterator.CreateSentenceInstance(Locale.GetUS()))
{
    if (!DBBI_AVAILABLE)
    {
        // The message names the runtime generically: this code runs on .NET, not on a JRE.
        throw new System.NotSupportedException("This runtime does not have support for Thai segmentation");
    }

    // Sentences come from the base class iterator; words inside them come from this breaker.
    wordBreaker = new ThaiWordBreaker(BreakIterator.CreateWordInstance(Locale.GetUS()));
    termAtt = AddAttribute<ICharTermAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
}
/// <summary>
/// Loads 10000 random Unicode strings, one at a time, into a sentence-instance
/// CharArrayIterator and passes it, together with a US-locale sentence
/// BreakIterator, to <c>Consume</c>.
/// </summary>
public virtual void TestConsumeSentenceInstance()
{
    // The break iterator is pinned to the US locale, not the ambient default locale.
    var bi = BreakIterator.CreateSentenceInstance(Locale.GetUS());
    var ci = CharArrayIterator.NewSentenceInstance();

    for (var i = 0; i < 10000; i++)
    {
        // Use the framework's string.ToCharArray() rather than a Java-cased shim.
        char[] text = TestUtil.RandomUnicodeString(Random()).ToCharArray();
        ci.SetText(text, 0, text.Length);
        Consume(bi, ci);
    }
}
/// <summary>
/// Calls MoveFollowing on a de-DE sentence iterator whose text is the empty
/// string, then checks both the returned offset and the resulting Current.
/// </summary>
/// <param name="offset">Offset passed to MoveFollowing.</param>
/// <param name="expectedOffset">Value MoveFollowing is expected to return.</param>
/// <param name="expectedCurrent">Value Current is expected to hold afterwards.</param>
public void MoveFollowingTest_Empty(int offset, int expectedOffset, int expectedCurrent)
{
    var germanLocale = new Locale("de-DE");

    using (var iterator = BreakIterator.CreateSentenceInstance(germanLocale))
    {
        iterator.SetText(string.Empty);

        int landedOn = iterator.MoveFollowing(offset);

        Assert.AreEqual(expectedOffset, landedOn);
        Assert.AreEqual(expectedCurrent, iterator.Current);
    }
}
/// <summary>
/// Calls MoveFollowing on a de-DE sentence iterator loaded with a fixed
/// three-sentence sample, then checks both the returned offset and Current.
/// </summary>
/// <param name="offset">Offset passed to MoveFollowing.</param>
/// <param name="expectedOffset">Value MoveFollowing is expected to return.</param>
/// <param name="expectedCurrent">Value Current is expected to hold afterwards.</param>
public void MoveFollowingTest(int offset, int expectedOffset, int expectedCurrent)
{
    var sample = "Good-day, kind sir ! Can I have a glass of water? I am very parched.";
    var germanLocale = new Locale("de-DE");

    using (var iterator = BreakIterator.CreateSentenceInstance(germanLocale))
    {
        iterator.SetText(sample);

        int landedOn = iterator.MoveFollowing(offset);

        Assert.AreEqual(expectedOffset, landedOn);
        Assert.AreEqual(expectedCurrent, iterator.Current);
    }
}
/// <summary>
/// Creates a sentence iterator for de-DE, loads a two-sentence sample and
/// checks the reported locale, text and boundary offsets.
/// </summary>
public void CreateSentenceInstanceTest()
{
    var sample = "Good-bye, dear! That was a delicious dinner.";
    var wantedBoundaries = new[] { 0, 16, 44 };
    var germanLocale = new Locale("de-DE");

    using (var iterator = BreakIterator.CreateSentenceInstance(germanLocale))
    {
        iterator.SetText(sample);

        Assert.AreEqual(germanLocale, iterator.Locale);
        Assert.AreEqual(sample, iterator.Text);
        CollectionAssert.AreEqual(wantedBoundaries, iterator.Boundaries);
    }
}
/// <summary>
/// Creates a sentence iterator for de-DE, loads a two-sentence sample and
/// checks the reported locale, text and boundary offsets.
/// </summary>
public void CreateSentenceInstanceTest()
{
    var locale = new Locale("de-DE");
    var text = "Good-bye, dear! That was a delicious dinner.";
    var expected = new[] { 0, 16, 44 };

    using (var bi = BreakIterator.CreateSentenceInstance(locale))
    {
        bi.SetText(text);

        Assert.That(bi.Locale, Is.EqualTo(locale));
        Assert.That(bi.Text, Is.EqualTo(text));

        // Boundaries are an ordered sequence, so compare order-sensitively;
        // an order-insensitive check would accept the offsets in any order.
        Assert.That(bi.Boundaries, Is.EqualTo(expected));
    }
}
/// <summary>
/// Creates a sentence iterator for the "zh" locale, loads a three-sentence
/// sample and checks the reported locale, text and boundary offsets.
/// </summary>
public void CreateSentenceInstanceTest()
{
    var sample = "供重呼車遊踏持図質腰大野明会掲歌? 方図強候準素能物第毎止田作昼野集。霊一起続時筑腺算掲断詳山住死示流投。";
    var chineseLocale = new Locale("zh");
    var wantedBoundaries = new[] { 0, 18, 35, 53 };

    using (var iterator = BreakIterator.CreateSentenceInstance(chineseLocale))
    {
        iterator.SetText(sample);

        Assert.AreEqual(chineseLocale, iterator.Locale);
        Assert.AreEqual(sample, iterator.Text);
        CollectionAssert.AreEqual(wantedBoundaries, iterator.Boundaries);
    }
}
/// <summary>
/// For each offset in <paramref name="offsetsToTest"/>, asks a "zh" break
/// iterator of the requested kind whether the offset is a boundary and checks
/// both the answer and the iterator position afterwards.
/// </summary>
/// <param name="type">Iterator kind; only SENTENCE and WORD are handled here.</param>
/// <param name="text">Text loaded into the iterator.</param>
/// <param name="offsetsToTest">Offsets passed to IsBoundary, in order.</param>
/// <param name="expectedIsBoundary">Expected IsBoundary result for each offset.</param>
/// <param name="expectedOffsets">Expected BreakIterator.Current after each IsBoundary call.</param>
public void IsBoundary(BreakIterator.UBreakIteratorType type, string text, int[] offsetsToTest, bool[] expectedIsBoundary, int[] expectedOffsets)
{
    var chineseLocale = new Locale("zh");
    BreakIterator iterator = null;

    try
    {
        if (type == BreakIterator.UBreakIteratorType.SENTENCE)
        {
            iterator = BreakIterator.CreateSentenceInstance(chineseLocale);
        }
        else if (type == BreakIterator.UBreakIteratorType.WORD)
        {
            iterator = BreakIterator.CreateWordInstance(chineseLocale);
        }
        else
        {
            throw new NotSupportedException("This iterator type is not supported in this test yet. [" + type + "]");
        }

        iterator.SetText(text);

        for (int idx = 0; idx < offsetsToTest.Length; idx++)
        {
            int probe = offsetsToTest[idx];
            bool reported = iterator.IsBoundary(probe);

            Assert.AreEqual(expectedIsBoundary[idx], reported, "Expected IsBoundary was not equal at i: {0}, offset: {1}", idx, probe);
            Assert.AreEqual(expectedOffsets[idx], iterator.Current);
        }
    }
    finally
    {
        // The iterator is still null when the type was rejected above.
        if (iterator != null)
        {
            iterator.Dispose();
        }
    }
}
/// <summary>
/// Verifies that replacing the text of a sentence iterator with the empty
/// string resets it: Current is 0, MoveNext and MovePrevious report DONE,
/// and no boundaries remain.
/// </summary>
public void CanSetNewText_Empty()
{
    var locale = new Locale("en-US");

    // Two spaces follow "!" and "?": the expected offsets below (22, 52 and the
    // final 70, which is the text length) only line up with that spacing.
    var text = "Good-day, kind sir !  Can I have a glass of water?  I am very parched.";
    string secondText = string.Empty;
    var expected = new[] { 0, 22, 52, 70 };

    // RuleStatus only applies to BreakIterator.UBreakIteratorType.WORD.
    var expectedRuleStatusVector = new[] { 0 };

    using (var bi = BreakIterator.CreateSentenceInstance(locale))
    {
        bi.SetText(text);

        Assert.AreEqual(text, bi.Text);
        CollectionAssert.AreEqual(expected, bi.Boundaries);

        // Move the iterator to the next boundary.
        Assert.AreEqual(expected[1], bi.MoveNext());
        Assert.AreEqual(expected[1], bi.Current);
        Assert.AreEqual(0, bi.GetRuleStatus());
        CollectionAssert.AreEqual(expectedRuleStatusVector, bi.GetRuleStatusVector());

        // Replace the text with the empty string.
        bi.SetText(secondText);
        Assert.AreEqual(secondText, bi.Text);

        // Assert that the iterator was reset back to the start and that
        // there is nothing left to move to in either direction.
        Assert.AreEqual(0, bi.Current);
        Assert.AreEqual(BreakIterator.DONE, bi.MoveNext());
        Assert.AreEqual(0, bi.MoveFirst());
        Assert.AreEqual(0, bi.MoveLast());
        Assert.AreEqual(BreakIterator.DONE, bi.MovePrevious());

        Assert.AreEqual(0, bi.GetRuleStatus());
        // Same collection assertion as used for the status vector above.
        CollectionAssert.AreEqual(expectedRuleStatusVector, bi.GetRuleStatusVector());

        CollectionAssert.IsEmpty(bi.Boundaries);
    }
}
/// <summary>
/// Verifies that calling SetText a second time on an en-US sentence iterator
/// swaps in the new text, rewinds the iterator to the first boundary and
/// exposes the boundaries of the new text.
/// </summary>
public void CanSetNewText()
{
    var locale = new Locale("en-US");

    // Two spaces follow each mid-text sentence terminator in both samples: the
    // expected offsets (ending at 70 and 54, the text lengths) require it.
    var text = "Good-day, kind sir !  Can I have a glass of water?  I am very parched.";
    var expected = new[] { 0, 22, 52, 70 };

    // RuleStatus only applies to BreakIterator.UBreakIteratorType.WORD.
    var expectedRuleStatusVector = new[] { 0 };

    var secondText = "It is my birthday!  I hope something exciting happens.";
    var secondExpected = new[] { 0, 20, 54 };

    using (var bi = BreakIterator.CreateSentenceInstance(locale))
    {
        bi.SetText(text);

        Assert.AreEqual(text, bi.Text);
        CollectionAssert.AreEqual(expected, bi.Boundaries);

        // Move the iterator to the next boundary.
        Assert.AreEqual(expected[1], bi.MoveNext());
        Assert.AreEqual(expected[1], bi.Current);
        Assert.AreEqual(0, bi.GetRuleStatus());
        CollectionAssert.AreEqual(expectedRuleStatusVector, bi.GetRuleStatusVector());

        // Swap in the second text.
        bi.SetText(secondText);
        Assert.AreEqual(secondText, bi.Text);

        // Assert that the iterator was reset back to the first element
        // when we set new text.
        Assert.AreEqual(secondExpected[0], bi.Current);
        Assert.AreEqual(0, bi.GetRuleStatus());
        CollectionAssert.AreEqual(expectedRuleStatusVector, bi.GetRuleStatusVector());

        // Assert that the new set of boundaries were found.
        CollectionAssert.AreEqual(secondExpected, bi.Boundaries);
    }
}
/// <summary>
/// Builds a WholeSentenceTokenizer, handing the base class a sentence break
/// iterator created for the US locale.
/// </summary>
/// <param name="reader">Reader forwarded unchanged to the base constructor.</param>
public WholeSentenceTokenizer(TextReader reader)
    : base(reader, BreakIterator.CreateSentenceInstance(Locale.GetUS()))
{
    // Registration order is kept as-is: term first, then offset.
    termAtt = AddAttribute<ICharTermAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
}
/// <summary>
/// Walks a "zh" break iterator of the requested kind forwards over
/// <paramref name="text"/>, checking the offset and rule status at every
/// boundary, that MoveNext reports DONE past the end, and that MoveFirst
/// returns to the first boundary.
/// </summary>
/// <param name="type">Iterator kind; only SENTENCE and WORD are handled here.</param>
/// <param name="text">Text loaded into the iterator.</param>
/// <param name="expected">Expected boundary offsets, in order.</param>
/// <param name="ruleStatus">Expected rule status at each boundary, parallel to <paramref name="expected"/>.</param>
public void CanIterateForwards(BreakIterator.UBreakIteratorType type, string text, int[] expected, BreakIterator.UWordBreak[] ruleStatus)
{
    var chineseLocale = new Locale("zh");
    BreakIterator iterator = null;

    try
    {
        if (type == BreakIterator.UBreakIteratorType.SENTENCE)
        {
            iterator = BreakIterator.CreateSentenceInstance(chineseLocale);
        }
        else if (type == BreakIterator.UBreakIteratorType.WORD)
        {
            iterator = BreakIterator.CreateWordInstance(chineseLocale);
        }
        else
        {
            throw new NotSupportedException("This iterator type is not supported in this test yet. [" + type + "]");
        }

        iterator.SetText(text);
        CollectionAssert.AreEqual(expected, iterator.Boundaries);

        // Verify each boundary in turn.
        for (int idx = 0; idx < expected.Length; idx++)
        {
            int position = iterator.Current;
            int actualStatus = iterator.GetRuleStatus();
            int wantedStatus = (int)ruleStatus[idx];

            Assert.AreEqual(expected[idx], position);
            Assert.AreEqual(wantedStatus, actualStatus);
            CollectionAssert.AreEqual(new[] { wantedStatus }, iterator.GetRuleStatusVector());

            int advanced = iterator.MoveNext();
            int following = idx + 1;

            if (following >= expected.Length)
            {
                // Verify that the BreakIterator is exhausted because we've
                // moved past every item.
                Assert.AreEqual(BreakIterator.DONE, advanced);
            }
            else
            {
                Assert.AreEqual(expected[following], advanced);
            }
        }

        int finalIndex = expected.Length - 1;
        Assert.AreEqual(expected[finalIndex], iterator.Current);

        // We've moved past the last item; Current should stay on the last offset.
        Assert.AreEqual(BreakIterator.DONE, iterator.MoveNext());
        Assert.AreEqual(expected[finalIndex], iterator.Current);

        // Verify that the first element is correct now that we've moved to the end.
        Assert.AreEqual(expected[0], iterator.MoveFirst());
        Assert.AreEqual(expected[0], iterator.Current);
    }
    finally
    {
        // The iterator is still null when the type was rejected above.
        if (iterator != null)
        {
            iterator.Dispose();
        }
    }
}
/// <summary>
/// Creates a sentence BreakIterator for the Locale built from the name of the
/// given culture.
/// </summary>
/// <param name="locale">Culture whose <c>Name</c> is used to construct the Locale.</param>
/// <returns>The iterator returned by BreakIterator.CreateSentenceInstance.</returns>
private BreakIterator GetSentenceInstance(System.Globalization.CultureInfo locale)
{
    var icuLocale = new Locale(locale.Name);
    return BreakIterator.CreateSentenceInstance(icuLocale);
}