Exemplo n.º 1
0
 /// <summary>
 /// Creates a SentenceAndWordTokenizer that reads from <paramref name="reader"/>.
 /// The base class receives a sentence break iterator created for <c>Locale.GetUS()</c>.
 /// </summary>
 /// <param name="reader">Character source handed to the base constructor.</param>
 public SentenceAndWordTokenizer(TextReader reader)
     : base(reader, BreakIterator.CreateSentenceInstance(Locale.GetUS()))
 {
     // Attributes are registered in a fixed order: term, offset, position increment.
     this.termAtt = this.AddAttribute<ICharTermAttribute>();
     this.offsetAtt = this.AddAttribute<IOffsetAttribute>();
     this.posIncAtt = this.AddAttribute<IPositionIncrementAttribute>();
 }
        /// <summary>
        /// Verifies that a sentence break iterator created for the "zh" locale accepts a second
        /// text: the text is replaced, the position returns to the first boundary and the
        /// boundaries of the new text are reported.
        /// </summary>
        public void CanSetNewText()
        {
            var chinese = new Locale("zh");

            const string firstText = "Good-day, kind sir !  Can I have a glass of water?  I am very parched.";
            int[] firstBoundaries = { 0, 22, 52, 70 };

            const string replacementText = "供重呼車遊踏持図質腰大野明会掲歌? 方図強候準素能物第毎止田作昼野集。霊一起続時筑腺算掲断詳山住死示流投。";
            int[] replacementBoundaries = { 0, 18, 35, 53 };

            using (var bi = BreakIterator.CreateSentenceInstance(chinese))
            {
                // First pass: the original text and its boundaries.
                bi.SetText(firstText);
                Assert.AreEqual(firstText, bi.Text);
                CollectionAssert.AreEqual(firstBoundaries, bi.Boundaries);

                // Step forward once so the iterator no longer sits on the first boundary.
                int secondBoundary = firstBoundaries[1];
                Assert.AreEqual(secondBoundary, bi.MoveNext());
                Assert.AreEqual(secondBoundary, bi.Current);
                Assert.AreEqual((int)BreakIterator.UWordBreak.NONE, bi.GetRuleStatus());

                // Second pass: swap in the replacement text.
                bi.SetText(replacementText);
                Assert.AreEqual(replacementText, bi.Text);

                // Setting new text must put the iterator back on the first boundary.
                Assert.AreEqual(replacementBoundaries[0], bi.Current);
                Assert.AreEqual((int)BreakIterator.UWordBreak.NONE, bi.GetRuleStatus());

                CollectionAssert.AreEqual(replacementBoundaries, bi.Boundaries);
            }
        }
Exemplo n.º 3
0
 /// <summary>
 /// Creates a new ThaiTokenizer, supplying the AttributeFactory.
 /// </summary>
 /// <param name="factory">Attribute factory handed to the base constructor.</param>
 /// <param name="reader">Character source handed to the base constructor.</param>
 /// <exception cref="System.NotSupportedException">
 /// Thrown when <c>DBBI_AVAILABLE</c> is false.
 /// </exception>
 public ThaiTokenizer(AttributeFactory factory, TextReader reader)
     : base(factory, reader, BreakIterator.CreateSentenceInstance(Locale.GetUS()))
 {
     // Refuse to construct when the availability flag is not set.
     // NOTE(review): the message still says "JRE", presumably a leftover from a Java port;
     // it is kept verbatim because callers or tests may match on it.
     if (!DBBI_AVAILABLE)
     {
         throw new System.NotSupportedException("This JRE does not have support for Thai segmentation");
     }

     // The word-level iterator is wrapped by ThaiWordBreaker.
     var wordIterator = BreakIterator.CreateWordInstance(Locale.GetUS());
     this.wordBreaker = new ThaiWordBreaker(wordIterator);

     this.termAtt = this.AddAttribute<ICharTermAttribute>();
     this.offsetAtt = this.AddAttribute<IOffsetAttribute>();
 }
Exemplo n.º 4
0
        /// <summary>
        /// Generates 10000 random Unicode strings, loads each into a sentence
        /// CharArrayIterator and passes it, together with a sentence BreakIterator, to Consume.
        /// </summary>
        public virtual void TestConsumeSentenceInstance()
        {
            const int rounds = 10000;

            // The break iterator is created for Locale.GetUS().
            var sentenceBreaker = BreakIterator.CreateSentenceInstance(Locale.GetUS());
            var charIterator = CharArrayIterator.NewSentenceInstance();

            int round = 0;
            while (round < rounds)
            {
                var chars = TestUtil.RandomUnicodeString(Random()).toCharArray();

                // Point the character iterator at the whole of the freshly generated buffer.
                charIterator.SetText(chars, 0, chars.Length);
                Consume(sentenceBreaker, charIterator);

                round++;
            }
        }
Exemplo n.º 5
0
        /// <summary>
        /// Calls MoveFollowing on a "de-DE" sentence iterator whose text is empty and checks
        /// both the returned offset and the iterator's resulting position.
        /// </summary>
        /// <param name="offset">Offset passed to MoveFollowing.</param>
        /// <param name="expectedOffset">Value MoveFollowing is expected to return.</param>
        /// <param name="expectedCurrent">Expected value of Current after the call.</param>
        public void MoveFollowingTest_Empty(int offset, int expectedOffset, int expectedCurrent)
        {
            using (var bi = BreakIterator.CreateSentenceInstance(new Locale("de-DE")))
            {
                bi.SetText(string.Empty);

                int returned = bi.MoveFollowing(offset);
                Assert.AreEqual(expectedOffset, returned);

                int positionAfterMove = bi.Current;
                Assert.AreEqual(expectedCurrent, positionAfterMove);
            }
        }
Exemplo n.º 6
0
        /// <summary>
        /// Calls MoveFollowing on a "de-DE" sentence iterator over a fixed three-sentence text
        /// and checks both the returned offset and the iterator's resulting position.
        /// </summary>
        /// <param name="offset">Offset passed to MoveFollowing.</param>
        /// <param name="expectedOffset">Value MoveFollowing is expected to return.</param>
        /// <param name="expectedCurrent">Expected value of Current after the call.</param>
        public void MoveFollowingTest(int offset, int expectedOffset, int expectedCurrent)
        {
            const string sample = "Good-day, kind sir !  Can I have a glass of water?  I am very parched.";

            using (var bi = BreakIterator.CreateSentenceInstance(new Locale("de-DE")))
            {
                bi.SetText(sample);

                int returned = bi.MoveFollowing(offset);
                Assert.AreEqual(expectedOffset, returned);

                int positionAfterMove = bi.Current;
                Assert.AreEqual(expectedCurrent, positionAfterMove);
            }
        }
Exemplo n.º 7
0
        /// <summary>
        /// Creates a sentence break iterator for "de-DE" and checks that it reports the locale it
        /// was created with, the text it was given and the expected sentence boundaries.
        /// </summary>
        public void CreateSentenceInstanceTest()
        {
            var requestedLocale = new Locale("de-DE");
            const string sample = "Good-bye, dear! That was a delicious dinner.";
            int[] boundaries = { 0, 16, 44 };

            using (var bi = BreakIterator.CreateSentenceInstance(requestedLocale))
            {
                bi.SetText(sample);

                Assert.AreEqual(requestedLocale, bi.Locale);
                Assert.AreEqual(sample, bi.Text);

                // Ordered, element-by-element comparison of the boundary offsets.
                CollectionAssert.AreEqual(boundaries, bi.Boundaries);
            }
        }
Exemplo n.º 8
0
        /// <summary>
        /// Creates a sentence break iterator for "de-DE" and checks that it reports the locale it
        /// was created with, the text it was given and the expected sentence boundaries.
        /// </summary>
        public void CreateSentenceInstanceTest()
        {
            var locale   = new Locale("de-DE");
            var text     = "Good-bye, dear! That was a delicious dinner.";
            var expected = new[] { 0, 16, 44 };

            using (var bi = BreakIterator.CreateSentenceInstance(locale))
            {
                bi.SetText(text);

                Assert.That(bi.Locale, Is.EqualTo(locale));
                Assert.That(bi.Text, Is.EqualTo(text));

                // Boundaries are positional, so compare them in order. An equivalence check
                // would also accept the same offsets in the wrong order.
                Assert.That(bi.Boundaries, Is.EqualTo(expected));
            }
        }
        /// <summary>
        /// Creates a sentence break iterator for "zh" and checks that it reports the locale it
        /// was created with, the text it was given and the expected sentence boundaries.
        /// </summary>
        public void CreateSentenceInstanceTest()
        {
            const string sample = "供重呼車遊踏持図質腰大野明会掲歌? 方図強候準素能物第毎止田作昼野集。霊一起続時筑腺算掲断詳山住死示流投。";
            var requestedLocale = new Locale("zh");
            int[] boundaries = { 0, 18, 35, 53 };

            using (var bi = BreakIterator.CreateSentenceInstance(requestedLocale))
            {
                bi.SetText(sample);

                Assert.AreEqual(requestedLocale, bi.Locale);
                Assert.AreEqual(sample, bi.Text);

                // Ordered, element-by-element comparison of the boundary offsets.
                CollectionAssert.AreEqual(boundaries, bi.Boundaries);
            }
        }
        /// <summary>
        /// For each offset in <paramref name="offsetsToTest"/>, asks a "zh" break iterator of the
        /// requested kind whether that offset is a boundary, then checks the answer and the
        /// iterator's position afterwards.
        /// </summary>
        /// <param name="type">Iterator kind; only SENTENCE and WORD are handled.</param>
        /// <param name="text">Text the iterator is run over.</param>
        /// <param name="offsetsToTest">Offsets passed to IsBoundary, in order.</param>
        /// <param name="expectedIsBoundary">Expected IsBoundary result for each offset.</param>
        /// <param name="expectedOffsets">Expected BreakIterator.Current after each IsBoundary call.</param>
        /// <exception cref="NotSupportedException">Thrown for any other iterator kind.</exception>
        public void IsBoundary(BreakIterator.UBreakIteratorType type,
                               string text,
                               int[] offsetsToTest,
                               bool[] expectedIsBoundary,
                               int[] expectedOffsets)
        {
            var locale = new Locale("zh");

            // Pick the factory that matches the requested kind; nothing exists to dispose yet
            // if the kind is rejected.
            BreakIterator iterator;
            if (type == BreakIterator.UBreakIteratorType.SENTENCE)
            {
                iterator = BreakIterator.CreateSentenceInstance(locale);
            }
            else if (type == BreakIterator.UBreakIteratorType.WORD)
            {
                iterator = BreakIterator.CreateWordInstance(locale);
            }
            else
            {
                throw new NotSupportedException("This iterator type is not supported in this test yet. [" + type + "]");
            }

            using (iterator)
            {
                iterator.SetText(text);

                int index = 0;
                foreach (int offset in offsetsToTest)
                {
                    var actual = iterator.IsBoundary(offset);

                    Assert.AreEqual(expectedIsBoundary[index], actual, "Expected IsBoundary was not equal at i: {0}, offset: {1}", index, offset);
                    Assert.AreEqual(expectedOffsets[index], iterator.Current);

                    index++;
                }
            }
        }
Exemplo n.º 11
0
        /// <summary>
        /// Verifies that replacing the text of an "en-US" sentence break iterator with an empty
        /// string resets it: the position is 0, moving forward or backward yields DONE, the rule
        /// status is 0 and there are no boundaries.
        /// </summary>
        public void CanSetNewText_Empty()
        {
            var    locale     = new Locale("en-US");
            var    text       = "Good-day, kind sir !  Can I have a glass of water?  I am very parched.";
            string secondText = string.Empty;
            var    expected   = new[] { 0, 22, 52, 70 };
            // RuleStatus only applies to BreakIterator.UBreakIteratorType.WORD.
            var expectedRuleStatusVector = new[] { 0 };

            using (var bi = BreakIterator.CreateSentenceInstance(locale))
            {
                bi.SetText(text);

                Assert.AreEqual(text, bi.Text);
                CollectionAssert.AreEqual(expected, bi.Boundaries);

                // Move the iterator to the next boundary
                Assert.AreEqual(expected[1], bi.MoveNext());
                Assert.AreEqual(expected[1], bi.Current);
                Assert.AreEqual(0, bi.GetRuleStatus());
                CollectionAssert.AreEqual(expectedRuleStatusVector, bi.GetRuleStatusVector());

                // Replace the text with an empty string.
                bi.SetText(secondText);
                Assert.AreEqual(secondText, bi.Text);

                // Assert that the iterator was reset to offset 0 and that there is
                // nothing to move to in either direction.
                Assert.AreEqual(0, bi.Current);
                Assert.AreEqual(BreakIterator.DONE, bi.MoveNext());
                Assert.AreEqual(0, bi.MoveFirst());
                Assert.AreEqual(0, bi.MoveLast());
                Assert.AreEqual(BreakIterator.DONE, bi.MovePrevious());
                Assert.AreEqual(0, bi.GetRuleStatus());
                // Same collection assertion and shared expected vector as the check above.
                CollectionAssert.AreEqual(expectedRuleStatusVector, bi.GetRuleStatusVector());

                CollectionAssert.IsEmpty(bi.Boundaries);
            }
        }
Exemplo n.º 12
0
        /// <summary>
        /// Verifies that an "en-US" sentence break iterator accepts a second text: the text is
        /// replaced, the position returns to the first boundary, the rule status stays 0 and the
        /// boundaries of the new text are reported.
        /// </summary>
        public void CanSetNewText()
        {
            var english = new Locale("en-US");

            const string firstText = "Good-day, kind sir !  Can I have a glass of water?  I am very parched.";
            int[] firstBoundaries = { 0, 22, 52, 70 };

            // RuleStatus only applies to BreakIterator.UBreakIteratorType.WORD.
            int[] statusVector = { 0 };

            const string replacementText = "It is my birthday!  I hope something exciting happens.";
            int[] replacementBoundaries = { 0, 20, 54 };

            using (var bi = BreakIterator.CreateSentenceInstance(english))
            {
                // First pass: the original text and its boundaries.
                bi.SetText(firstText);
                Assert.AreEqual(firstText, bi.Text);
                CollectionAssert.AreEqual(firstBoundaries, bi.Boundaries);

                // Step forward once so the iterator no longer sits on the first boundary.
                int secondBoundary = firstBoundaries[1];
                Assert.AreEqual(secondBoundary, bi.MoveNext());
                Assert.AreEqual(secondBoundary, bi.Current);
                Assert.AreEqual(0, bi.GetRuleStatus());
                CollectionAssert.AreEqual(statusVector, bi.GetRuleStatusVector());

                // Second pass: swap in the replacement text.
                bi.SetText(replacementText);
                Assert.AreEqual(replacementText, bi.Text);

                // Setting new text must put the iterator back on the first boundary.
                Assert.AreEqual(replacementBoundaries[0], bi.Current);
                Assert.AreEqual(0, bi.GetRuleStatus());
                CollectionAssert.AreEqual(statusVector, bi.GetRuleStatusVector());

                // The boundaries now describe the replacement text.
                CollectionAssert.AreEqual(replacementBoundaries, bi.Boundaries);
            }
        }
Exemplo n.º 13
0
 /// <summary>
 /// Creates a WholeSentenceTokenizer that reads from <paramref name="reader"/>.
 /// The base class receives a sentence break iterator created for <c>Locale.GetUS()</c>.
 /// </summary>
 /// <param name="reader">Character source handed to the base constructor.</param>
 public WholeSentenceTokenizer(TextReader reader)
     : base(reader, BreakIterator.CreateSentenceInstance(Locale.GetUS()))
 {
     // Attributes are registered in a fixed order: term first, then offset.
     this.termAtt = this.AddAttribute<ICharTermAttribute>();
     this.offsetAtt = this.AddAttribute<IOffsetAttribute>();
 }
        /// <summary>
        /// Walks a "zh" break iterator of the requested kind forward over <paramref name="text"/>,
        /// checking position, rule status and the MoveNext result at every boundary. It then
        /// checks the behaviour once the iterator is exhausted and after moving back to the start.
        /// </summary>
        /// <param name="type">Iterator kind; only SENTENCE and WORD are handled.</param>
        /// <param name="text">Text the iterator is run over.</param>
        /// <param name="expected">Expected boundary offsets, in order.</param>
        /// <param name="ruleStatus">Expected rule status at each boundary in <paramref name="expected"/>.</param>
        /// <exception cref="NotSupportedException">Thrown for any other iterator kind.</exception>
        public void CanIterateForwards(BreakIterator.UBreakIteratorType type, string text, int[] expected, BreakIterator.UWordBreak[] ruleStatus)
        {
            var locale = new Locale("zh");

            // Pick the factory that matches the requested kind; nothing exists to dispose yet
            // if the kind is rejected.
            BreakIterator iterator;
            if (type == BreakIterator.UBreakIteratorType.SENTENCE)
            {
                iterator = BreakIterator.CreateSentenceInstance(locale);
            }
            else if (type == BreakIterator.UBreakIteratorType.WORD)
            {
                iterator = BreakIterator.CreateWordInstance(locale);
            }
            else
            {
                throw new NotSupportedException("This iterator type is not supported in this test yet. [" + type + "]");
            }

            using (iterator)
            {
                iterator.SetText(text);

                CollectionAssert.AreEqual(expected, iterator.Boundaries);

                // Check every boundary in turn, advancing after each one.
                int count = expected.Length;
                int index = 0;
                while (index < count)
                {
                    int position     = iterator.Current;
                    int actualStatus = iterator.GetRuleStatus();
                    int wantedStatus = (int)ruleStatus[index];

                    Assert.AreEqual(expected[index], position);
                    Assert.AreEqual(wantedStatus, actualStatus);
                    CollectionAssert.AreEqual(new[] { wantedStatus }, iterator.GetRuleStatusVector());

                    int advanced = iterator.MoveNext();
                    int following = index + 1;

                    // Past the final boundary the iterator is exhausted and must report DONE.
                    int wantedNext = following < count ? expected[following] : BreakIterator.DONE;
                    Assert.AreEqual(wantedNext, advanced);

                    index = following;
                }

                int finalBoundary = expected[count - 1];
                Assert.AreEqual(finalBoundary, iterator.Current);

                // A further MoveNext still reports DONE and leaves the position on the last offset.
                Assert.AreEqual(BreakIterator.DONE, iterator.MoveNext());
                Assert.AreEqual(finalBoundary, iterator.Current);

                // Moving back to the start from the end lands on the first boundary.
                int firstBoundary = expected[0];
                Assert.AreEqual(firstBoundary, iterator.MoveFirst());
                Assert.AreEqual(firstBoundary, iterator.Current);
            }
        }
Exemplo n.º 15
0
 /// <summary>
 /// Creates a sentence break iterator for a Locale constructed from the name of
 /// <paramref name="locale"/>.
 /// </summary>
 /// <param name="locale">Culture whose <c>Name</c> is used to construct the Locale.</param>
 /// <returns>The iterator returned by <c>BreakIterator.CreateSentenceInstance</c>.</returns>
 private BreakIterator GetSentenceInstance(System.Globalization.CultureInfo locale)
 {
     var breakLocale = new Locale(locale.Name);
     return BreakIterator.CreateSentenceInstance(breakLocale);
 }