/// <summary>
/// Checks a word <c>BreakIterator</c> created for the "zh" locale: its text, locale and
/// boundary list, then walks forward comparing each offset and rule status.
/// </summary>
public void CreateWordInstanceTest()
{
    // Skip when the reported ICU version string sorts (ordinally) before "52.1".
    bool versionTooLow = string.CompareOrdinal(Wrapper.IcuVersion, "52.1") < 0;
    if (versionTooLow)
    {
        Assert.Ignore("This test requires ICU 52 or higher");
    }

    var text = "你是中国人么? 我喜欢你们的国家。";
    var locale = new Locale("zh");

    var expected = new[]
    {
        0, 2, 5, 6,     // first clause, up to the '?'
        7, 8, 9, 11,    // after the '?', after the space, and into the second clause
        13, 14, 16, 17  // rest of the second clause through the final full stop
    };

    var none = (int)BreakIterator.UWordBreak.NONE;
    var ideographic = (int)BreakIterator.UWordBreak.IDEO; // ideographic character

    // One expected rule status per entry in `expected`.
    var ruleStatus = new[]
    {
        none, ideographic, ideographic, ideographic,
        none, none, ideographic, ideographic,
        ideographic, ideographic, ideographic, none
    };

    using (var iterator = BreakIterator.CreateWordInstance(locale))
    {
        iterator.SetText(text);

        Assert.AreEqual(text, iterator.Text);
        Assert.AreEqual(locale, iterator.Locale);
        CollectionAssert.AreEqual(expected, iterator.Boundaries);

        // Walk forward, checking the position and status before each move.
        for (int index = 0; index < expected.Length; index++)
        {
            int position = iterator.Current;
            int statusHere = iterator.GetRuleStatus();

            Assert.AreEqual(expected[index], position);
            Assert.AreEqual(ruleStatus[index], statusHere);

            int moved = iterator.MoveNext();
            int following = index + 1;

            if (following >= expected.Length)
            {
                // No boundary is left, so the move must report DONE.
                Assert.AreEqual(BreakIterator.DONE, moved);
            }
            else
            {
                Assert.AreEqual(expected[following], moved);
            }
        }

        // Once exhausted, Current should still be the last offset.
        int finalIndex = expected.Length - 1;
        Assert.AreEqual(expected[finalIndex], iterator.Current);
    }
}
/// <summary>
/// Walks a "de-DE" word <c>BreakIterator</c> forward over a fixed English sentence,
/// checking each boundary, rule status and rule-status vector, then checks the
/// behaviour after the last boundary and after <c>MoveFirst</c>.
/// </summary>
public void CanIterateForwards()
{
    var locale = new Locale("de-DE");
    var text = "Good-day, kind sir !";
    var expected = new int[] { 0, 4, 5, 8, 9, 10, 14, 15, 18, 19, 20 };

    var none = BreakIterator.UWordBreak.NONE;
    var letter = BreakIterator.UWordBreak.LETTER;

    // One expected rule status per entry in `expected`.
    var ruleStatus = new[]
    {
        none, letter, none, letter, none, none,
        letter, none, letter, none, none
    };

    using (var iterator = BreakIterator.CreateWordInstance(locale))
    {
        iterator.SetText(text);

        CollectionAssert.AreEqual(expected, iterator.Boundaries);

        // Check every boundary together with its rule status.
        for (int index = 0; index < expected.Length; index++)
        {
            int position = iterator.Current;
            int statusHere = iterator.GetRuleStatus();
            int[] statusVector = iterator.GetRuleStatusVector();

            Assert.AreEqual(expected[index], position);
            Assert.AreEqual((int)ruleStatus[index], statusHere);

            // Exactly one rule is expected to have produced each boundary.
            Assert.AreEqual(1, statusVector.Length);
            Assert.AreEqual((int)ruleStatus[index], statusVector[0]);

            int moved = iterator.MoveNext();
            int following = index + 1;

            if (following >= expected.Length)
            {
                // No boundary is left, so the move must report DONE.
                Assert.AreEqual(BreakIterator.DONE, moved);
            }
            else
            {
                Assert.AreEqual(expected[following], moved);
            }
        }

        // After exhaustion Current should remain on the last offset found.
        int finalIndex = expected.Length - 1;
        Assert.AreEqual(expected[finalIndex], iterator.Current);

        // A further MoveNext reports DONE and leaves Current unchanged.
        Assert.AreEqual(BreakIterator.DONE, iterator.MoveNext());
        Assert.AreEqual(expected[finalIndex], iterator.Current);

        // MoveFirst returns to the first boundary from the end.
        Assert.AreEqual(expected[0], iterator.MoveFirst());
        Assert.AreEqual(expected[0], iterator.Current);
    }
}
/// <summary>
/// Creates a new ThaiTokenizer, supplying the AttributeFactory.
/// </summary>
/// <param name="factory">Attribute factory forwarded to the base constructor.</param>
/// <param name="reader">Input reader forwarded to the base constructor.</param>
/// <exception cref="System.NotSupportedException">
/// Thrown when <c>DBBI_AVAILABLE</c> is false.
/// </exception>
public ThaiTokenizer(AttributeFactory factory, TextReader reader)
    : base(factory, reader, BreakIterator.CreateSentenceInstance(Locale.GetUS()))
{
    if (!DBBI_AVAILABLE)
    {
        throw new System.NotSupportedException("This JRE does not have support for Thai segmentation");
    }

    // The base class receives a sentence iterator; words are split by a separate word iterator.
    var wordIterator = BreakIterator.CreateWordInstance(Locale.GetUS());
    wordBreaker = new ThaiWordBreaker(wordIterator);

    termAtt = AddAttribute<ICharTermAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
}
/// <summary>
/// Click handler: splits the contents of <c>uxText</c> with a US-locale word
/// <c>BreakIterator</c> and shows the pieces joined by "-" in a message box.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void UxBreakClick(object sender, EventArgs e)
{
    using (BreakIterator wordIterator = BreakIterator.CreateWordInstance(Locale.GetUS()))
    {
        wordIterator.SetText(this.uxText.Text);

        var pieces = wordIterator.Enumerate().ToArray();
        var joined = string.Join("-", pieces);

        MessageBox.Show(joined);
    }
}
/// <summary>
/// Feeds 10000 random Unicode strings through a word <c>BreakIterator</c> and a
/// word-instance <c>CharArrayIterator</c>, passing both to <c>Consume</c> each time.
/// </summary>
public virtual void TestConsumeWordInstance()
{
    // NOTE(review): an earlier comment said the default (randomized) locale was used,
    // but the code pins the US locale; confirm which one is intended.
    // The iterator is disposed when the test finishes, matching the other call sites
    // of BreakIterator.CreateWordInstance, which wrap it in a using block.
    using (var bi = BreakIterator.CreateWordInstance(Locale.GetUS()))
    {
        var ci = CharArrayIterator.NewWordInstance();
        for (var i = 0; i < 10000; i++)
        {
            var text = TestUtil.RandomUnicodeString(Random()).toCharArray();
            ci.SetText(text, 0, text.Length);
            Consume(bi, ci);
        }
    }
}
/// <summary>
/// Calls <c>MovePreceding</c> on a "de-DE" word <c>BreakIterator</c> whose text is empty
/// and checks both the returned offset and the resulting <c>Current</c>.
/// </summary>
/// <param name="offset">Offset passed to <c>MovePreceding</c>.</param>
/// <param name="expectedOffset">Value <c>MovePreceding</c> is expected to return.</param>
/// <param name="expectedCurrent">Expected <c>Current</c> after the call.</param>
public void MovePrecedingTest_Empty(int offset, int expectedOffset, int expectedCurrent)
{
    using (var iterator = BreakIterator.CreateWordInstance(new Locale("de-DE")))
    {
        iterator.SetText(string.Empty);

        int returned = iterator.MovePreceding(offset);

        Assert.AreEqual(expectedOffset, returned);
        Assert.AreEqual(expectedCurrent, iterator.Current);
    }
}
/// <summary>
/// Creates a "de-DE" word <c>BreakIterator</c>, sets a fixed sentence, and checks the
/// reported locale, text and boundaries (boundaries compared with <c>Is.EquivalentTo</c>).
/// </summary>
public void CreateWordInstanceTest()
{
    var text = "Good-day, kind sir !";
    var expected = new int[] { 0, 4, 5, 8, 9, 10, 14, 15, 18, 19, 20 };
    var locale = new Locale("de-DE");

    using (var iterator = BreakIterator.CreateWordInstance(locale))
    {
        iterator.SetText(text);

        Assert.That(iterator.Locale, Is.EqualTo(locale));
        Assert.That(iterator.Text, Is.EqualTo(text));
        Assert.That(iterator.Boundaries, Is.EquivalentTo(expected));
    }
}
/// <summary>
/// Calls <c>IsBoundary</c> on a "de-DE" word <c>BreakIterator</c> over a fixed sentence
/// and checks the result and the resulting <c>Current</c>.
/// </summary>
/// <param name="offset">Offset passed to <c>IsBoundary</c>.</param>
/// <param name="expectedIsBoundary">Expected return value of <c>IsBoundary</c>.</param>
/// <param name="expectedOffset">Expected <c>Current</c> after the call.</param>
public void IsBoundaryTest(int offset, bool expectedIsBoundary, int expectedOffset)
{
    var text = "Good-day, kind sir !";

    using (var iterator = BreakIterator.CreateWordInstance(new Locale("de-DE")))
    {
        iterator.SetText(text);

        bool reported = iterator.IsBoundary(offset);

        Assert.AreEqual(expectedIsBoundary, reported);
        Assert.AreEqual(expectedOffset, iterator.Current);
    }
}
/// <summary>
/// Calls <c>MovePreceding</c> on a "de-DE" word <c>BreakIterator</c> over a fixed sentence
/// and checks the returned offset and the resulting <c>Current</c>.
/// </summary>
/// <param name="offset">Offset passed to <c>MovePreceding</c>.</param>
/// <param name="expectedOffset">Value <c>MovePreceding</c> is expected to return.</param>
/// <param name="expectedCurrent">Expected <c>Current</c> after the call.</param>
public void MovePrecedingTest(int offset, int expectedOffset, int expectedCurrent)
{
    var text = "Good-day, kind sir !";

    using (var iterator = BreakIterator.CreateWordInstance(new Locale("de-DE")))
    {
        iterator.SetText(text);

        int returned = iterator.MovePreceding(offset);

        Assert.AreEqual(expectedOffset, returned);
        Assert.AreEqual(expectedCurrent, iterator.Current);
    }
}
/// <summary>
/// Creates a "de-DE" word <c>BreakIterator</c>, sets a fixed sentence, and checks the
/// reported locale, text and the ordered list of boundaries.
/// </summary>
public void CreateWordInstanceTest()
{
    var text = "Good-day, kind sir !";
    var expected = new int[] { 0, 4, 5, 8, 9, 10, 14, 15, 18, 19, 20 };
    var locale = new Locale("de-DE");

    using (var iterator = BreakIterator.CreateWordInstance(locale))
    {
        iterator.SetText(text);

        Assert.AreEqual(locale, iterator.Locale);
        Assert.AreEqual(text, iterator.Text);
        CollectionAssert.AreEqual(expected, iterator.Boundaries);
    }
}
/// <summary>
/// Click handler: splits the contents of <c>uxText</c> with a US-locale word
/// <c>BreakIterator</c>, counts how often each piece occurs, and shows up to ten
/// "piece : count" lines, highest counts first, in a message box.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void UxOccurrenceCountClick(object sender, EventArgs e)
{
    using (BreakIterator wordIterator = BreakIterator.CreateWordInstance(Locale.GetUS()))
    {
        wordIterator.SetText(this.uxText.Text);

        // Ascending sort followed by Reverse is kept as-is so ties keep their ordering.
        var ascendingByCount = wordIterator.Enumerate()
            .GroupBy(w => w)
            .OrderBy(group => group.Count());

        var lines = ascendingByCount
            .Reverse()
            .Select(group => group.Key + " : " + group.Count())
            .Take(10)
            .ToArray();

        var report = string.Join(Environment.NewLine, lines);
        MessageBox.Show(report);
    }
}
/// <summary>
/// With empty text, checks that <c>IsBoundary</c> returns false for offsets 0, -1 and 100
/// and that <c>Current</c> is 0 after each call.
/// </summary>
public void IsBoundaryTest_Empty()
{
    var offsetsToTest = new[] { 0, -1, 100 };
    string text = string.Empty;

    using (var iterator = BreakIterator.CreateWordInstance(new Locale("de-DE")))
    {
        iterator.SetText(text);

        foreach (int offset in offsetsToTest)
        {
            bool reported = iterator.IsBoundary(offset);

            Assert.IsFalse(reported);
            Assert.AreEqual(0, iterator.Current);
        }
    }
}
/// <summary>
/// Creates a "zh" sentence or word <c>BreakIterator</c>, sets <paramref name="text"/>,
/// and for each offset checks the result of <c>IsBoundary</c> and the resulting
/// <c>Current</c>.
/// </summary>
/// <param name="type">Iterator kind; only SENTENCE and WORD are handled.</param>
/// <param name="text">Text to analyse.</param>
/// <param name="offsetsToTest">Offsets passed to <c>IsBoundary</c>, in order.</param>
/// <param name="expectedIsBoundary">Expected <c>IsBoundary</c> result per offset.</param>
/// <param name="expectedOffsets">Expected <c>BreakIterator.Current</c> after each <c>IsBoundary</c> call.</param>
/// <exception cref="NotSupportedException">Thrown for any other <paramref name="type"/>.</exception>
public void IsBoundary(BreakIterator.UBreakIteratorType type, string text, int[] offsetsToTest, bool[] expectedIsBoundary, int[] expectedOffsets)
{
    var locale = new Locale("zh");
    BreakIterator iterator = null;

    try
    {
        if (type == BreakIterator.UBreakIteratorType.SENTENCE)
        {
            iterator = BreakIterator.CreateSentenceInstance(locale);
        }
        else if (type == BreakIterator.UBreakIteratorType.WORD)
        {
            iterator = BreakIterator.CreateWordInstance(locale);
        }
        else
        {
            throw new NotSupportedException("This iterator type is not supported in this test yet. [" + type + "]");
        }

        iterator.SetText(text);

        for (int index = 0; index < offsetsToTest.Length; index++)
        {
            int offset = offsetsToTest[index];
            bool reported = iterator.IsBoundary(offset);

            Assert.AreEqual(expectedIsBoundary[index], reported, "Expected IsBoundary was not equal at i: {0}, offset: {1}", index, offset);
            Assert.AreEqual(expectedOffsets[index], iterator.Current);
        }
    }
    finally
    {
        // Only dispose when an iterator was actually created.
        if (iterator != null)
        {
            iterator.Dispose();
        }
    }
}
/// <summary>
/// Checks that a cloned <c>BreakIterator</c> starts with the same text, position,
/// boundaries and locale as its source, and keeps its own text and boundaries after
/// the source is given new text.
/// </summary>
public void Clone()
{
    const string text = "Hello World!";

    using (var source = BreakIterator.CreateWordInstance(new Locale("de-DE")))
    {
        source.SetText(text);
        source.MoveFollowing(5);

        using (var copy = source.Clone())
        {
            // Immediately after cloning, both iterators agree.
            Assert.That(copy.Text, Is.EqualTo(source.Text));
            Assert.That(copy.Current, Is.EqualTo(source.Current));
            Assert.That(copy.Boundaries, Is.EquivalentTo(source.Boundaries));
            Assert.That(copy.Locale, Is.EqualTo(source.Locale));

            // Changing the source must leave the clone untouched.
            source.SetText("Good afternoon");

            Assert.That(copy.Text, Is.EqualTo(text));
            Assert.That(copy.Boundaries, Is.Not.EquivalentTo(source.Boundaries));
        }
    }
}
/// <summary>
/// Creates a word <c>BreakIterator</c> for a <c>Locale</c> built from the culture's name.
/// </summary>
/// <param name="locale">Culture whose <c>Name</c> is used to construct the <c>Locale</c>.</param>
/// <returns>The iterator returned by <c>BreakIterator.CreateWordInstance</c>.</returns>
private BreakIterator GetWordInstance(System.Globalization.CultureInfo locale)
{
    var icuLocale = new Locale(locale.Name);
    return BreakIterator.CreateWordInstance(icuLocale);
}
/// <summary>
/// Assigns <c>_breakIterator</c> a word <c>BreakIterator</c> created for the US locale.
/// </summary>
private void Load()
{
    var usLocale = Locale.GetUS();
    _breakIterator = BreakIterator.CreateWordInstance(usLocale);
}
/// <summary>
/// Moves a "zh" word <c>BreakIterator</c> two boundaries forward and two back, checking
/// offset, rule status and rule-status vector at every step, then checks the behaviour
/// of <c>MovePrevious</c> at the start and of <c>MoveLast</c>.
/// </summary>
public void CanIterateBackwards()
{
    // Skip when the reported ICU version string sorts (ordinally) before "52.1".
    bool versionTooLow = string.CompareOrdinal(Wrapper.IcuVersion, "52.1") < 0;
    if (versionTooLow)
    {
        Assert.Ignore("This test requires ICU 52 or higher");
    }

    var text = "你是中国人么? 我喜欢你们的国家。";
    var locale = new Locale("zh");

    var expected = new[]
    {
        0, 2, 5, 6,     // first clause, up to the '?'
        7, 8, 9, 11,    // after the '?', after the space, and into the second clause
        13, 14, 16, 17  // rest of the second clause through the final full stop
    };

    var none = (int)BreakIterator.UWordBreak.NONE;
    var ideographic = (int)BreakIterator.UWordBreak.IDEO; // ideographic character

    // One expected rule status per entry in `expected`.
    var ruleStatus = new[]
    {
        none, ideographic, ideographic, ideographic,
        none, none, ideographic, ideographic,
        ideographic, ideographic, ideographic, none
    };

    using (var iterator = BreakIterator.CreateWordInstance(locale))
    {
        iterator.SetText(text);

        CollectionAssert.AreEqual(expected, iterator.Boundaries);

        // Starting position: first boundary, no move performed yet.
        int index = 0;
        int boundary = expected[index];
        int status = ruleStatus[index];

        Assert.AreEqual(boundary, iterator.Current);
        Assert.AreEqual(status, iterator.GetRuleStatus());
        // Only one rule is expected to have been applied to find each boundary.
        CollectionAssert.AreEqual(new[] { status }, iterator.GetRuleStatusVector());

        // Two steps forward.
        for (int step = 0; step < 2; step++)
        {
            index++;
            boundary = expected[index];
            status = ruleStatus[index];

            Assert.AreEqual(boundary, iterator.MoveNext());
            Assert.AreEqual(boundary, iterator.Current);
            Assert.AreEqual(status, iterator.GetRuleStatus());
            CollectionAssert.AreEqual(new[] { status }, iterator.GetRuleStatusVector());
        }

        // Two steps back, returning to the first boundary.
        for (int step = 0; step < 2; step++)
        {
            index--;
            boundary = expected[index];
            status = ruleStatus[index];

            Assert.AreEqual(boundary, iterator.MovePrevious());
            Assert.AreEqual(boundary, iterator.Current);
            Assert.AreEqual(status, iterator.GetRuleStatus());
            CollectionAssert.AreEqual(new[] { status }, iterator.GetRuleStatusVector());
        }

        // Moving before the first boundary reports DONE; position and status read as 0.
        Assert.AreEqual(BreakIterator.DONE, iterator.MovePrevious());
        Assert.AreEqual(0, iterator.Current);
        Assert.AreEqual(0, iterator.GetRuleStatus());
        CollectionAssert.AreEqual(new[] { 0 }, iterator.GetRuleStatusVector());

        // MoveLast jumps to the final boundary with its matching status.
        var finalBoundary = expected.Last();
        var finalStatus = ruleStatus.Last();

        Assert.AreEqual(finalBoundary, iterator.MoveLast());
        Assert.AreEqual(finalBoundary, iterator.Current);
        Assert.AreEqual(finalStatus, iterator.GetRuleStatus());
        CollectionAssert.AreEqual(new[] { finalStatus }, iterator.GetRuleStatusVector());
    }
}
/// <summary>
/// Creates a "zh" sentence or word <c>BreakIterator</c>, sets <paramref name="text"/>,
/// and walks it forward checking every boundary, rule status and rule-status vector,
/// then checks the behaviour after the last boundary and after <c>MoveFirst</c>.
/// </summary>
/// <param name="type">Iterator kind; only SENTENCE and WORD are handled.</param>
/// <param name="text">Text to analyse.</param>
/// <param name="expected">Expected boundary offsets, in order.</param>
/// <param name="ruleStatus">Expected rule status for each entry of <paramref name="expected"/>.</param>
/// <exception cref="NotSupportedException">Thrown for any other <paramref name="type"/>.</exception>
public void CanIterateForwards(BreakIterator.UBreakIteratorType type, string text, int[] expected, BreakIterator.UWordBreak[] ruleStatus)
{
    var locale = new Locale("zh");
    BreakIterator iterator = null;

    try
    {
        if (type == BreakIterator.UBreakIteratorType.SENTENCE)
        {
            iterator = BreakIterator.CreateSentenceInstance(locale);
        }
        else if (type == BreakIterator.UBreakIteratorType.WORD)
        {
            iterator = BreakIterator.CreateWordInstance(locale);
        }
        else
        {
            throw new NotSupportedException("This iterator type is not supported in this test yet. [" + type + "]");
        }

        iterator.SetText(text);

        CollectionAssert.AreEqual(expected, iterator.Boundaries);

        // Check every boundary together with its rule status.
        for (int index = 0; index < expected.Length; index++)
        {
            int position = iterator.Current;
            int statusHere = iterator.GetRuleStatus();
            int wantedStatus = (int)ruleStatus[index];

            Assert.AreEqual(expected[index], position);
            Assert.AreEqual(wantedStatus, statusHere);
            CollectionAssert.AreEqual(new[] { wantedStatus }, iterator.GetRuleStatusVector());

            int moved = iterator.MoveNext();
            int following = index + 1;

            if (following >= expected.Length)
            {
                // No boundary is left, so the move must report DONE.
                Assert.AreEqual(BreakIterator.DONE, moved);
            }
            else
            {
                Assert.AreEqual(expected[following], moved);
            }
        }

        int finalIndex = expected.Length - 1;
        Assert.AreEqual(expected[finalIndex], iterator.Current);

        // A further MoveNext reports DONE and leaves Current on the last offset.
        Assert.AreEqual(BreakIterator.DONE, iterator.MoveNext());
        Assert.AreEqual(expected[finalIndex], iterator.Current);

        // MoveFirst returns to the first boundary from the end.
        Assert.AreEqual(expected[0], iterator.MoveFirst());
        Assert.AreEqual(expected[0], iterator.Current);
    }
    finally
    {
        // Only dispose when an iterator was actually created.
        if (iterator != null)
        {
            iterator.Dispose();
        }
    }
}
/// <summary>
/// Pre-processes the text of <c>inputTextBox</c>, splits it with a US-locale word
/// <c>BreakIterator</c>, converts each token and appends the result to
/// <c>prepareOutput</c>.
/// </summary>
/// <remarks>
/// Pre-processing, in order: the comma inside "(digits,digits)" pairs becomes "⠠";
/// the period(s) inside every match of the sentence-ending pattern become "⠸⠲";
/// runs of three or more periods become "⠄⠄⠄".
/// Per token: an upper-case marker is prefixed, punctuation is looked up through
/// <c>getBrailleInTable</c> with a "T" or "E" prefix chosen from a neighbouring
/// non-space token, the token is passed to <c>separateVowel</c>, and a lone double
/// quote alternates between "⠦" and "⠴".
/// </remarks>
private void wordBreak()
{
    int quote = 0;

    using (BreakIterator bi = BreakIterator.CreateWordInstance(Locale.GetUS()))
    {
        var input = inputTextBox.Text;

        // "(12,34)" -> "(12⠠34)". The replacement has the same length as the match,
        // so the match indices computed on the original text remain valid.
        Regex pairReplace = new Regex(@"\(\d+,\d+\)");
        foreach (Match match in pairReplace.Matches(inputTextBox.Text))
        {
            string treat = match.Value.Replace(",", "⠠");
            input = input.Remove(match.Index, match.Length).Insert(match.Index, treat);
        }

        // Rewrite each sentence-ending match from its own text. The previous loop called
        // Regex.Replace over the whole input with the first match's text, which
        // overwrote every later match with the first match's characters.
        input = Regex.Replace(input, @"[^\.][^\d+]\.\s", m => m.Value.Replace(".", "⠸⠲"));

        input = Regex.Replace(input, @"\.{3,}", "⠄⠄⠄");

        bi.SetText(input);

        // Materialize once: the loop below needs random access, and repeated
        // Count()/ElementAt() calls would re-enumerate the sequence every time.
        List<string> tokens = bi.Enumerate().ToList();

        // Loop-invariant patterns.
        Regex closingPunctuation = new Regex(@"\)|\?|[^\d]\,|\!|\:|;");
        Regex openingParenthesis = new Regex(@"\(");
        // Tokens containing a character in U+0080..U+9FFF get the "T" table prefix
        // (the "Thai or English" check); everything else gets "E".
        Regex checkTHOrEN = new Regex(@"[\u0080-\u9fff]+");

        for (int i = 0; i < tokens.Count; i++)
        {
            string token = tokens[i];

            // Upper-case marker: "⠠⠠" when the token contains two or more consecutive
            // capitals, "⠠" when only its first character is upper case.
            if (Regex.IsMatch(token, @"[A-Z]+[A-Z]"))
            {
                token = "⠠⠠" + token;
            }
            else if (Char.IsUpper(token[0]))
            {
                token = "⠠" + token;
            }

            string vowelLiftFilter = token;

            // Closing punctuation: decide "T"/"E" from the nearest preceding non-space token.
            // When there is no such token the text is left unchanged.
            foreach (Match match in closingPunctuation.Matches(vowelLiftFilter))
            {
                for (int j = i - 1; j >= 0; j--)
                {
                    if (tokens[j].Equals(" "))
                    {
                        continue;
                    }

                    string prefix = checkTHOrEN.Match(tokens[j]).Success ? "T" : "E";
                    vowelLiftFilter = getBrailleInTable(prefix + vowelLiftFilter);
                    break;
                }
            }

            // Opening parenthesis: decide "T"/"E" from the nearest following non-space token.
            // The bound is exclusive; the former inclusive bound read one element past
            // the end when "(" was the last non-space token.
            foreach (Match match in openingParenthesis.Matches(vowelLiftFilter))
            {
                for (int j = i + 1; j < tokens.Count; j++)
                {
                    if (tokens[j].Equals(" "))
                    {
                        continue;
                    }

                    string prefix = checkTHOrEN.Match(tokens[j]).Success ? "T" : "E";
                    vowelLiftFilter = getBrailleInTable(prefix + vowelLiftFilter);
                    break;
                }
            }

            string bt = separateVowel(vowelLiftFilter);

            // A lone double quote alternates between the two quote cells.
            if (vowelLiftFilter.Equals("\""))
            {
                bt = quote % 2 == 0 ? "⠦" : "⠴";
                quote++;
            }

            prepareOutput.Add(bt);
        }
    }
}