// Verifies text, locale, boundaries and per-boundary rule status of a "zh" word
// iterator over a Chinese sample, stepping forward until MoveNext reports DONE.
public void CreateWordInstanceTest()
        {
            // Ordinal comparison of the version strings: the test is skipped when the
            // reported ICU version sorts before "52.1".
            // NOTE(review): ordinal ordering would misjudge a three-digit major version — confirm acceptable.
            bool icuTooOld = string.CompareOrdinal(Wrapper.IcuVersion, "52.1") < 0;
            if (icuTooOld)
            {
                Assert.Ignore("This test requires ICU 52 or higher");
            }

            var sample   = "你是中国人么? 我喜欢你们的国家。";
            var zhLocale = new Locale("zh");

            // Expected boundary offsets into the sample text.
            var offsets = new[] {
                0, 2, 5, 6,
                7, 8, 9, 11,
                13, 14, 16, 17
            };

            int noneStatus = (int)BreakIterator.UWordBreak.NONE;
            int ideoStatus = (int)BreakIterator.UWordBreak.IDEO; // ideographic character

            // Expected rule status at each of the offsets above.
            var statuses = new[] {
                noneStatus, ideoStatus, ideoStatus, ideoStatus,
                noneStatus, noneStatus, ideoStatus, ideoStatus,
                ideoStatus, ideoStatus, ideoStatus, noneStatus
            };

            using (var iterator = BreakIterator.CreateWordInstance(zhLocale))
            {
                iterator.SetText(sample);

                Assert.AreEqual(sample, iterator.Text);
                Assert.AreEqual(zhLocale, iterator.Locale);
                CollectionAssert.AreEqual(offsets, iterator.Boundaries);

                int lastIndex = offsets.Length - 1;

                for (int index = 0; index <= lastIndex; index++)
                {
                    int position   = iterator.Current;
                    int ruleStatus = iterator.GetRuleStatus();

                    Assert.AreEqual(offsets[index], position);
                    Assert.AreEqual(statuses[index], ruleStatus);

                    // Stepping from the final boundary must report DONE; any other
                    // step must land on the following expected offset.
                    int advanced        = iterator.MoveNext();
                    int expectedAdvance = index < lastIndex
                        ? offsets[index + 1]
                        : BreakIterator.DONE;

                    Assert.AreEqual(expectedAdvance, advanced);
                }

                // After running off the end, Current is expected to stay on the last offset.
                Assert.AreEqual(offsets[lastIndex], iterator.Current);
            }
        }
Ejemplo n.º 2
0
        // Walks a "de-DE" word iterator forward over an English sample, checking every
        // boundary, its rule status and rule status vector, then the behaviour once
        // the iterator has run past the end and is moved back to the first boundary.
        public void CanIterateForwards()
        {
            var deLocale = new Locale("de-DE");
            var sample   = "Good-day, kind sir !";

            int[] offsets = { 0, 4, 5, 8, 9, 10, 14, 15, 18, 19, 20 };

            var noBreak = BreakIterator.UWordBreak.NONE;
            var word    = BreakIterator.UWordBreak.LETTER;

            // Expected rule status at each offset above.
            var statuses = new[] { noBreak, word, noBreak, word, noBreak, noBreak, word, noBreak, word, noBreak, noBreak };

            using (var iterator = BreakIterator.CreateWordInstance(deLocale))
            {
                iterator.SetText(sample);

                CollectionAssert.AreEqual(offsets, iterator.Boundaries);

                int lastIndex = offsets.Length - 1;

                for (int index = 0; index <= lastIndex; index++)
                {
                    int   position     = iterator.Current;
                    int   status       = iterator.GetRuleStatus();
                    int[] statusVector = iterator.GetRuleStatusVector();

                    int expectedStatus = (int)statuses[index];

                    Assert.AreEqual(offsets[index], position);
                    Assert.AreEqual(expectedStatus, status);
                    // Exactly one rule is expected to have produced each boundary.
                    Assert.AreEqual(1, statusVector.Length);
                    Assert.AreEqual(expectedStatus, statusVector[0]);

                    // Stepping from the final boundary must report DONE; any other
                    // step must land on the following expected offset.
                    int advanced        = iterator.MoveNext();
                    int expectedAdvance = index < lastIndex
                        ? offsets[index + 1]
                        : BreakIterator.DONE;

                    Assert.AreEqual(expectedAdvance, advanced);
                }

                // Exhausted iterator: Current is expected to remain on the last offset.
                Assert.AreEqual(offsets[lastIndex], iterator.Current);

                // A further MoveNext still reports DONE and leaves Current unchanged.
                Assert.AreEqual(BreakIterator.DONE, iterator.MoveNext());
                Assert.AreEqual(offsets[lastIndex], iterator.Current);

                // MoveFirst from the end returns to the first boundary.
                Assert.AreEqual(offsets[0], iterator.MoveFirst());
                Assert.AreEqual(offsets[0], iterator.Current);
            }
        }
Ejemplo n.º 3
0
 /// <summary>
 /// Creates a new ThaiTokenizer, supplying the AttributeFactory.
 /// The base constructor is given a sentence BreakIterator for the US locale;
 /// a separate word BreakIterator for the US locale is wrapped in a ThaiWordBreaker.
 /// </summary>
 /// <param name="factory">Attribute factory forwarded to the base constructor.</param>
 /// <param name="reader">Input reader forwarded to the base constructor.</param>
 /// <exception cref="System.NotSupportedException">Thrown when DBBI_AVAILABLE is false.</exception>
 public ThaiTokenizer(AttributeFactory factory, TextReader reader)
     : base(factory, reader, BreakIterator.CreateSentenceInstance(Locale.GetUS()))
 {
     // This check runs after the base constructor, so the sentence iterator above
     // has already been created by the time it throws.
     // NOTE(review): the message mentions "JRE" although this is .NET code — confirm whether the wording should change.
     if (!DBBI_AVAILABLE)
     {
         throw new System.NotSupportedException("This JRE does not have support for Thai segmentation");
     }
     wordBreaker = new ThaiWordBreaker(BreakIterator.CreateWordInstance(Locale.GetUS()));
     // Term attribute is registered before the offset attribute.
     termAtt     = AddAttribute <ICharTermAttribute>();
     offsetAtt   = AddAttribute <IOffsetAttribute>();
 }
Ejemplo n.º 4
0
        /// <summary>
        /// Event handler: segments the text of uxText with a US-locale word
        /// BreakIterator and shows the segments joined by "-" in a message box.
        /// </summary>
        private void UxBreakClick(object sender, EventArgs e)
        {
            using (BreakIterator wordIterator = BreakIterator.CreateWordInstance(Locale.GetUS()))
            {
                wordIterator.SetText(this.uxText.Text);

                // Materialise the segments before the iterator is disposed.
                var segments = wordIterator.Enumerate().ToArray();
                var joined   = string.Join("-", segments);

                MessageBox.Show(joined);
            }
        }
Ejemplo n.º 5
0
        // Feeds 10000 random Unicode strings through a word CharArrayIterator and a
        // US-locale word BreakIterator, handing each pair to Consume.
        public virtual void TestConsumeWordInstance()
        {
            // The locale is pinned to US here; it is not the randomized default locale.
            // The break iterator is disposed when the test finishes so that it is
            // not left for the finalizer.
            using (var bi = BreakIterator.CreateWordInstance(Locale.GetUS()))
            {
                var ci = CharArrayIterator.NewWordInstance();

                for (var i = 0; i < 10000; i++)
                {
                    var text = TestUtil.RandomUnicodeString(Random()).toCharArray();
                    ci.SetText(text, 0, text.Length);
                    Consume(bi, ci);
                }
            }
        }
Ejemplo n.º 6
0
        // Checks MovePreceding on a "de-DE" word iterator whose text is empty:
        // both the returned offset and the resulting Current are compared with the
        // expectations supplied by the caller.
        public void MovePrecedingTest_Empty(int offset, int expectedOffset, int expectedCurrent)
        {
            var deLocale = new Locale("de-DE");

            using (var iterator = BreakIterator.CreateWordInstance(deLocale))
            {
                iterator.SetText(string.Empty);

                int moved = iterator.MovePreceding(offset);
                Assert.AreEqual(expectedOffset, moved);

                int positionAfterMove = iterator.Current;
                Assert.AreEqual(expectedCurrent, positionAfterMove);
            }
        }
Ejemplo n.º 7
0
        // Checks that a "de-DE" word iterator reports the locale and text it was given
        // and yields the expected word boundaries for the sample, in order.
        public void CreateWordInstanceTest()
        {
            var locale   = new Locale("de-DE");
            var text     = "Good-day, kind sir !";
            var expected = new int[] { 0, 4, 5, 8, 9, 10, 14, 15, 18, 19, 20 };

            using (var bi = BreakIterator.CreateWordInstance(locale))
            {
                bi.SetText(text);

                Assert.That(bi.Locale, Is.EqualTo(locale));
                Assert.That(bi.Text, Is.EqualTo(text));
                // Compare element by element: Is.EquivalentTo ignores ordering, so it
                // would also accept the same offsets returned in the wrong order.
                Assert.That(bi.Boundaries, Is.EqualTo(expected));
            }
        }
Ejemplo n.º 8
0
        // Checks IsBoundary for one offset of the "Good-day, kind sir !" sample on a
        // "de-DE" word iterator, plus the iterator position left behind by the call.
        public void IsBoundaryTest(int offset, bool expectedIsBoundary, int expectedOffset)
        {
            var sample   = "Good-day, kind sir !";
            var deLocale = new Locale("de-DE");

            using (var iterator = BreakIterator.CreateWordInstance(deLocale))
            {
                iterator.SetText(sample);

                var atBoundary = iterator.IsBoundary(offset);
                Assert.AreEqual(expectedIsBoundary, atBoundary);

                int positionAfterCall = iterator.Current;
                Assert.AreEqual(expectedOffset, positionAfterCall);
            }
        }
Ejemplo n.º 9
0
        // Checks MovePreceding on a "de-DE" word iterator over the
        // "Good-day, kind sir !" sample: the returned offset and the resulting
        // Current are compared with the expectations supplied by the caller.
        public void MovePrecedingTest(int offset, int expectedOffset, int expectedCurrent)
        {
            var sample   = "Good-day, kind sir !";
            var deLocale = new Locale("de-DE");

            using (var iterator = BreakIterator.CreateWordInstance(deLocale))
            {
                iterator.SetText(sample);

                int moved = iterator.MovePreceding(offset);
                Assert.AreEqual(expectedOffset, moved);

                int positionAfterMove = iterator.Current;
                Assert.AreEqual(expectedCurrent, positionAfterMove);
            }
        }
Ejemplo n.º 10
0
        // Checks that a "de-DE" word iterator reports the locale and text it was given
        // and yields the expected word boundaries for the sample, in order.
        public void CreateWordInstanceTest()
        {
            var deLocale = new Locale("de-DE");
            var sample   = "Good-day, kind sir !";

            int[] offsets = { 0, 4, 5, 8, 9, 10, 14, 15, 18, 19, 20 };

            using (var iterator = BreakIterator.CreateWordInstance(deLocale))
            {
                iterator.SetText(sample);

                var actualLocale = iterator.Locale;
                Assert.AreEqual(deLocale, actualLocale);

                var actualText = iterator.Text;
                Assert.AreEqual(sample, actualText);

                var actualOffsets = iterator.Boundaries;
                CollectionAssert.AreEqual(offsets, actualOffsets);
            }
        }
Ejemplo n.º 11
0
        /// <summary>
        /// Event handler: segments the text of uxText with a US-locale word
        /// BreakIterator, counts how often each segment occurs and shows the ten
        /// most frequent ones, one "segment : count" line each, in a message box.
        /// </summary>
        private void UxOccurrenceCountClick(object sender, EventArgs e)
        {
            using (BreakIterator wordIterator = BreakIterator.CreateWordInstance(Locale.GetUS()))
            {
                wordIterator.SetText(this.uxText.Text);

                // Ascending sort by count followed by Reverse(): kept in this form so
                // segments with equal counts come out in the same relative order.
                var mostFrequentFirst = wordIterator.Enumerate()
                                        .GroupBy(segment => segment)
                                        .OrderBy(bucket => bucket.Count())
                                        .Reverse();

                var topTen = mostFrequentFirst
                             .Select(bucket => bucket.Key + " : " + bucket.Count())
                             .Take(10)
                             .ToArray();

                var report = string.Join(Environment.NewLine, topTen);
                MessageBox.Show(report);
            }
        }
Ejemplo n.º 12
0
        // With empty text, IsBoundary is expected to be false for offsets 0, -1 and 100
        // on a "de-DE" word iterator, and Current is expected to stay at 0 each time.
        public void IsBoundaryTest_Empty()
        {
            var deLocale = new Locale("de-DE");
            int[] probes = { 0, -1, 100 };

            using (var iterator = BreakIterator.CreateWordInstance(deLocale))
            {
                iterator.SetText(string.Empty);

                foreach (int probe in probes)
                {
                    var atBoundary = iterator.IsBoundary(probe);

                    Assert.IsFalse(atBoundary);
                    Assert.AreEqual(0, iterator.Current);
                }
            }
        }
        // Checks IsBoundary for each offset in offsetsToTest against a "zh" sentence or
        // word iterator over the given text, together with the iterator position
        // after each call. Any other iterator type raises NotSupportedException.
        public void IsBoundary(BreakIterator.UBreakIteratorType type,
                               string text,
                               int[] offsetsToTest,
                               bool[] expectedIsBoundary,
                               int[] expectedOffsets) // expected BreakIterator.Current after calling IsBoundary.
        {
            var zhLocale = new Locale("zh");

            BreakIterator iterator;

            if (type == BreakIterator.UBreakIteratorType.SENTENCE)
            {
                iterator = BreakIterator.CreateSentenceInstance(zhLocale);
            }
            else if (type == BreakIterator.UBreakIteratorType.WORD)
            {
                iterator = BreakIterator.CreateWordInstance(zhLocale);
            }
            else
            {
                throw new NotSupportedException("This iterator type is not supported in this test yet. [" + type + "]");
            }

            // The iterator is disposed on every exit path, including assertion failures.
            using (iterator)
            {
                iterator.SetText(text);

                for (int index = 0; index < offsetsToTest.Length; index++)
                {
                    int offset     = offsetsToTest[index];
                    var atBoundary = iterator.IsBoundary(offset);

                    Assert.AreEqual(expectedIsBoundary[index], atBoundary, "Expected IsBoundary was not equal at i: {0}, offset: {1}", index, offset);
                    Assert.AreEqual(expectedOffsets[index], iterator.Current);
                }
            }
        }
Ejemplo n.º 14
0
        // Clones a positioned "de-DE" word iterator and checks that the clone matches
        // the source in text, position, boundaries and locale, and that it keeps its
        // own text and boundaries after SetText is called on the source.
        public void Clone()
        {
            const string initialText = "Hello World!";

            var deLocale = new Locale("de-DE");

            using (var source = BreakIterator.CreateWordInstance(deLocale))
            {
                source.SetText(initialText);
                source.MoveFollowing(5);

                using (var copy = source.Clone())
                {
                    // Immediately after cloning, the copy mirrors the source.
                    Assert.That(copy.Text, Is.EqualTo(source.Text));
                    Assert.That(copy.Current, Is.EqualTo(source.Current));
                    Assert.That(copy.Boundaries, Is.EquivalentTo(source.Boundaries));
                    Assert.That(copy.Locale, Is.EqualTo(source.Locale));

                    // Changing the source afterwards must not affect the copy.
                    source.SetText("Good afternoon");

                    Assert.That(copy.Text, Is.EqualTo(initialText));
                    Assert.That(copy.Boundaries, Is.Not.EquivalentTo(source.Boundaries));
                }
            }
        }
Ejemplo n.º 15
0
 /// <summary>
 /// Creates a word BreakIterator for a Locale built from the Name of the given culture.
 /// </summary>
 /// <param name="locale">Culture whose Name is passed to the Locale constructor.</param>
 /// <returns>The iterator returned by BreakIterator.CreateWordInstance.</returns>
 private BreakIterator GetWordInstance(System.Globalization.CultureInfo locale)
 {
     var icuLocale = new Locale(locale.Name);
     return BreakIterator.CreateWordInstance(icuLocale);
 }
 /// <summary>
 /// Assigns _breakIterator a word BreakIterator created for the US locale.
 /// </summary>
 private void Load()
 {
     var usLocale = Locale.GetUS();
     _breakIterator = BreakIterator.CreateWordInstance(usLocale);
 }
        // Moves a "zh" word iterator two boundaries forward and back again, checking
        // offset, rule status and rule status vector at every stop, then checks the
        // result of stepping before the first boundary and of jumping to the last one.
        public void CanIterateBackwards()
        {
            // Ordinal comparison of the version strings: the test is skipped when the
            // reported ICU version sorts before "52.1".
            // NOTE(review): ordinal ordering would misjudge a three-digit major version — confirm acceptable.
            bool icuTooOld = string.CompareOrdinal(Wrapper.IcuVersion, "52.1") < 0;
            if (icuTooOld)
            {
                Assert.Ignore("This test requires ICU 52 or higher");
            }

            var sample   = "你是中国人么? 我喜欢你们的国家。";
            var zhLocale = new Locale("zh");

            // Expected boundary offsets into the sample text.
            var offsets = new[] {
                0, 2, 5, 6,
                7, 8, 9, 11,
                13, 14, 16, 17
            };

            int noneStatus = (int)BreakIterator.UWordBreak.NONE;
            int ideoStatus = (int)BreakIterator.UWordBreak.IDEO; // ideographic character

            // Expected rule status at each of the offsets above.
            var statuses = new[] {
                noneStatus, ideoStatus, ideoStatus, ideoStatus,
                noneStatus, noneStatus, ideoStatus, ideoStatus,
                ideoStatus, ideoStatus, ideoStatus, noneStatus
            };

            using (var iterator = BreakIterator.CreateWordInstance(zhLocale))
            {
                iterator.SetText(sample);

                CollectionAssert.AreEqual(offsets, iterator.Boundaries);

                // Starting position: first boundary, no move performed yet.
                // Only one rule is expected per boundary, hence single-element vectors.
                Assert.AreEqual(offsets[0], iterator.Current);
                Assert.AreEqual(statuses[0], iterator.GetRuleStatus());
                CollectionAssert.AreEqual(new[] { statuses[0] }, iterator.GetRuleStatusVector());

                // Two steps forward: boundaries 1 and 2.
                for (int index = 1; index <= 2; index++)
                {
                    Assert.AreEqual(offsets[index], iterator.MoveNext());
                    Assert.AreEqual(offsets[index], iterator.Current);
                    Assert.AreEqual(statuses[index], iterator.GetRuleStatus());
                    CollectionAssert.AreEqual(new[] { statuses[index] }, iterator.GetRuleStatusVector());
                }

                // Two steps back: boundaries 1 and 0.
                for (int index = 1; index >= 0; index--)
                {
                    Assert.AreEqual(offsets[index], iterator.MovePrevious());
                    Assert.AreEqual(offsets[index], iterator.Current);
                    Assert.AreEqual(statuses[index], iterator.GetRuleStatus());
                    CollectionAssert.AreEqual(new[] { statuses[index] }, iterator.GetRuleStatusVector());
                }

                // One more step back from the first boundary: DONE is returned while
                // Current, the rule status and the status vector are all expected to be 0.
                Assert.AreEqual(BreakIterator.DONE, iterator.MovePrevious());
                Assert.AreEqual(0, iterator.Current);
                Assert.AreEqual(0, iterator.GetRuleStatus());
                CollectionAssert.AreEqual(new[] { 0 }, iterator.GetRuleStatusVector());

                // Jump to the end and verify the final boundary.
                int finalOffset = offsets[offsets.Length - 1];
                int finalStatus = statuses[statuses.Length - 1];

                Assert.AreEqual(finalOffset, iterator.MoveLast());
                Assert.AreEqual(finalOffset, iterator.Current);
                Assert.AreEqual(finalStatus, iterator.GetRuleStatus());
                CollectionAssert.AreEqual(new[] { finalStatus }, iterator.GetRuleStatusVector());
            }
        }
        // Walks a "zh" sentence or word iterator forward over the given text, checking
        // every boundary, its rule status and single-element rule status vector, then
        // the behaviour once the iterator has run past the end and is moved back to
        // the first boundary. Any other iterator type raises NotSupportedException.
        public void CanIterateForwards(BreakIterator.UBreakIteratorType type, string text, int[] expected, BreakIterator.UWordBreak[] ruleStatus)
        {
            var zhLocale = new Locale("zh");

            BreakIterator iterator;

            if (type == BreakIterator.UBreakIteratorType.SENTENCE)
            {
                iterator = BreakIterator.CreateSentenceInstance(zhLocale);
            }
            else if (type == BreakIterator.UBreakIteratorType.WORD)
            {
                iterator = BreakIterator.CreateWordInstance(zhLocale);
            }
            else
            {
                throw new NotSupportedException("This iterator type is not supported in this test yet. [" + type + "]");
            }

            // The iterator is disposed on every exit path, including assertion failures.
            using (iterator)
            {
                iterator.SetText(text);

                CollectionAssert.AreEqual(expected, iterator.Boundaries);

                int lastIndex = expected.Length - 1;

                for (int index = 0; index <= lastIndex; index++)
                {
                    int position = iterator.Current;
                    int status   = iterator.GetRuleStatus();

                    int expectedStatus = (int)ruleStatus[index];

                    Assert.AreEqual(expected[index], position);
                    Assert.AreEqual(expectedStatus, status);
                    CollectionAssert.AreEqual(new[] { expectedStatus }, iterator.GetRuleStatusVector());

                    // Stepping from the final boundary must report DONE; any other
                    // step must land on the following expected offset.
                    int advanced        = iterator.MoveNext();
                    int expectedAdvance = index < lastIndex
                        ? expected[index + 1]
                        : BreakIterator.DONE;

                    Assert.AreEqual(expectedAdvance, advanced);
                }

                // Exhausted iterator: Current is expected to remain on the last offset.
                Assert.AreEqual(expected[lastIndex], iterator.Current);

                // A further MoveNext still reports DONE and leaves Current unchanged.
                Assert.AreEqual(BreakIterator.DONE, iterator.MoveNext());
                Assert.AreEqual(expected[lastIndex], iterator.Current);

                // MoveFirst from the end returns to the first boundary.
                Assert.AreEqual(expected[0], iterator.MoveFirst());
                Assert.AreEqual(expected[0], iterator.Current);
            }
        }
Ejemplo n.º 19
0
        /// <summary>
        /// Pre-processes the text of inputTextBox, splits it into segments with a
        /// US-locale word BreakIterator and appends one converted entry per segment
        /// to prepareOutput (via getBrailleInTable and separateVowel).
        /// </summary>
        private void wordBreak()
        {
            int quote = 0;

            using (BreakIterator bi = BreakIterator.CreateWordInstance(Locale.GetUS()))
            {
                var input = inputTextBox.Text;

                // "(digits,digits)" pairs: swap the comma for "⠠". The replacement has
                // the same length as the match, so later match indices stay valid.
                Regex pairReplace = new Regex(@"\(\d+,\d+\)");
                foreach (Match match in pairReplace.Matches(input))
                {
                    string treat = match.Value.Replace(",", "⠠");
                    input = input.Remove(match.Index, match.Length).Insert(match.Index, treat);
                }

                // Periods followed by whitespace: rewrite each match using its OWN text.
                // A plain replacement string built from one match would overwrite every
                // other match with that match's characters (and would treat any '$' in
                // it as a substitution), so a MatchEvaluator is used instead.
                input = Regex.Replace(input, @"[^\.][^\d+]\.\s", m => m.Value.Replace(".", "⠸⠲"));

                // Runs of three or more periods collapse to a fixed three-cell sequence.
                input = Regex.Replace(input, @"\.{3,}", "⠄⠄⠄");

                // Built once instead of once per segment / per match.
                Regex closingPunctuation = new Regex(@"\)|\?|[^\d]\,|\!|\:|;");
                Regex openingParenthesis = new Regex(@"\(");
                // Used to pick the "T" or "E" prefix for a neighbouring segment.
                // NOTE(review): the range covers far more than Thai — confirm intent.
                Regex checkTHOrEN = new Regex(@"[\u0080-\u9fff]+");

                bi.SetText(input);

                // Materialise once so Count/indexing do not walk the sequence again on
                // every access.
                List<String> spWord = bi.Enumerate().ToList();

                for (int i = 0; i < spWord.Count; i++)
                {
                    string spwordCheckCapital = spWord[i];

                    // Two or more consecutive capitals get the double marker; otherwise
                    // a leading capital gets the single marker.
                    if (Regex.IsMatch(spwordCheckCapital, @"[A-Z]+[A-Z]"))
                    {
                        spwordCheckCapital = "⠠⠠" + spwordCheckCapital;
                    }
                    else if (spwordCheckCapital.Length > 0 && Char.IsUpper(spwordCheckCapital[0]))
                    {
                        spwordCheckCapital = "⠠" + spwordCheckCapital;
                    }

                    string vowelLiftFilter = spwordCheckCapital;

                    // Closing punctuation: the prefix is chosen from the nearest
                    // non-space segment BEFORE this one. The lookup is repeated once
                    // per match, as before; nothing happens when no such segment exists.
                    foreach (Match match in closingPunctuation.Matches(vowelLiftFilter))
                    {
                        for (int j = i - 1; j >= 0; j--)
                        {
                            if (spWord[j].Equals(" "))
                            {
                                continue;
                            }
                            string prefix = checkTHOrEN.IsMatch(spWord[j]) ? "T" : "E";
                            vowelLiftFilter = getBrailleInTable(prefix + vowelLiftFilter);
                            break;
                        }
                    }

                    // Opening parenthesis: the prefix is chosen from the nearest
                    // non-space segment AFTER this one. The bound is strictly less than
                    // Count; the previous "<=" read one element past the end and threw
                    // when no non-space segment followed.
                    foreach (Match match in openingParenthesis.Matches(vowelLiftFilter))
                    {
                        for (int j = i + 1; j < spWord.Count; j++)
                        {
                            if (spWord[j].Equals(" "))
                            {
                                continue;
                            }
                            string prefix = checkTHOrEN.IsMatch(spWord[j]) ? "T" : "E";
                            vowelLiftFilter = getBrailleInTable(prefix + vowelLiftFilter);
                            break;
                        }
                    }

                    string bt = separateVowel(vowelLiftFilter);

                    // Double quotes alternate between two symbols: the first for
                    // even-numbered occurrences, the second for odd-numbered ones.
                    if (vowelLiftFilter.Equals("\""))
                    {
                        bt = quote % 2 == 0 ? "⠦" : "⠴";
                        quote++;
                    }
                    prepareOutput.Add(bt);
                }
            }
        }