/// <summary>
/// Checks the start and end offsets produced by a
/// <see cref="BreakIteratorBoundaryScanner"/> that wraps a sentence-level
/// break iterator created for the current culture.
/// </summary>
public void TestSentenceBoundary()
{
    // LUCENENET specific - the iterator is a mock of the JDK BreakIterator class
    // (an ICU BreakIterator with custom rules applied). East Asian languages are
    // skipped because the DictionaryBasedBreakIterator is not overridden by the rules.
    string language = CultureInfo.CurrentCulture.TwoLetterISOLanguageName;
    if (language == "th"      // Thai
        || language == "lo"   // Lao
        || language == "my"   // Burmese
        || language == "km"   // Khmer
        || language == "ja"   // Japanese
        || language == "ko"   // Korean
        || language == "zh")  // Chinese
    {
        Assume.That(false, "This test does not apply to East Asian languages.");
    }

    StringBuilder buffer = new StringBuilder(TEXT);

    // The default (current) culture is used on purpose: LuceneTestCase randomizes it.
    BreakIterator sentenceIterator = JdkBreakIterator.GetSentenceInstance(CultureInfo.CurrentCulture);
    IBoundaryScanner boundaryScanner = new BreakIteratorBoundaryScanner(sentenceIterator);

    // Offset inside a sentence from which both scans begin.
    int origin = TEXT.IndexOf("any application", StringComparison.Ordinal);

    int expectedStart = TEXT.IndexOf("It is a", StringComparison.Ordinal);
    TestFindStartOffset(buffer, origin, expectedStart, boundaryScanner);

    int expectedEnd = TEXT.IndexOf("Apache Lucene is an open source", StringComparison.Ordinal);
    TestFindEndOffset(buffer, origin, expectedEnd, boundaryScanner);
}
/// <summary>
/// Checks the start and end offsets produced by a
/// <see cref="BreakIteratorBoundaryScanner"/> that wraps the word instance
/// obtained from <c>JdkBreakIterator</c> for the invariant culture.
/// </summary>
/// <remarks>
/// Starting inside "high-performance" (at "formance"), the expected start offset
/// is the beginning of "high-performance" and the expected end offset is the
/// position of ", full".
/// </remarks>
public void TestWordBoundary()
{
    StringBuilder text = new StringBuilder(TEXT);
    BreakIterator bi = JdkBreakIterator.GetWordInstance(CultureInfo.InvariantCulture);
    IBoundaryScanner scanner = new BreakIteratorBoundaryScanner(bi);

    // Ordinal search: these are raw character offsets into TEXT, so they must not
    // depend on the linguistic comparison rules of the thread's current culture.
    int start = TEXT.IndexOf("formance", System.StringComparison.Ordinal);
    int expected = TEXT.IndexOf("high-performance", System.StringComparison.Ordinal);
    TestFindStartOffset(text, start, expected, scanner);

    expected = TEXT.IndexOf(", full", System.StringComparison.Ordinal);
    TestFindEndOffset(text, start, expected, scanner);
}
/// <summary>
/// Checks the start and end offsets produced by a
/// <see cref="BreakIteratorBoundaryScanner"/> that wraps the sentence instance
/// obtained from <c>JdkBreakIterator</c> for the current culture.
/// </summary>
/// <remarks>
/// Starting at "any application", the expected start offset is the position of
/// "It is a" and the expected end offset is the position of
/// "Apache Lucene is an open source".
/// </remarks>
public void TestSentenceBoundary()
{
    StringBuilder text = new StringBuilder(TEXT);

    // We test this with the default locale; it is randomized by LuceneTestCase.
    // NOTE(review): sentence rules may behave differently for cultures that rely on
    // dictionary-based breaking - confirm whether such cultures should be skipped here.
    BreakIterator bi = JdkBreakIterator.GetSentenceInstance(CultureInfo.CurrentCulture);
    IBoundaryScanner scanner = new BreakIteratorBoundaryScanner(bi);

    // Ordinal search: the current culture is randomized, and a culture-sensitive
    // IndexOf could yield offsets that vary with it. Only the break iterator is
    // meant to be culture-dependent, not the computation of the expected offsets.
    int start = TEXT.IndexOf("any application", System.StringComparison.Ordinal);
    int expected = TEXT.IndexOf("It is a", System.StringComparison.Ordinal);
    TestFindStartOffset(text, start, expected, scanner);

    expected = TEXT.IndexOf("Apache Lucene is an open source", System.StringComparison.Ordinal);
    TestFindEndOffset(text, start, expected, scanner);
}
/// <summary>
/// Checks the start and end offsets produced by a
/// <see cref="BreakIteratorBoundaryScanner"/> that wraps a word-level break
/// iterator created for the invariant culture.
/// </summary>
public void TestWordBoundary()
{
    StringBuilder buffer = new StringBuilder(TEXT);

    // LUCENENET specific - the iterator is a mock of the JDK BreakIterator class
    // (an ICU BreakIterator with custom rules applied).
    BreakIterator wordIterator = JdkBreakIterator.GetWordInstance(CultureInfo.InvariantCulture);
    IBoundaryScanner boundaryScanner = new BreakIteratorBoundaryScanner(wordIterator);

    // Offset in the middle of "high-performance" from which both scans begin.
    int origin = TEXT.IndexOf("formance", StringComparison.Ordinal);

    int expectedStart = TEXT.IndexOf("high-performance", StringComparison.Ordinal);
    TestFindStartOffset(buffer, origin, expectedStart, boundaryScanner);

    int expectedEnd = TEXT.IndexOf(", full", StringComparison.Ordinal);
    TestFindEndOffset(buffer, origin, expectedEnd, boundaryScanner);
}
/// <summary>
/// Obtains a <see cref="BreakIterator"/> by forwarding <paramref name="locale"/>
/// to <c>JdkBreakIterator.GetWordInstance</c>.
/// </summary>
/// <param name="locale">The culture passed through to the factory method.</param>
/// <returns>The iterator returned by <c>JdkBreakIterator.GetWordInstance</c>.</returns>
private BreakIterator GetWordInstance(System.Globalization.CultureInfo locale)
{
    BreakIterator wordIterator = JdkBreakIterator.GetWordInstance(locale);
    return wordIterator;
}
/// <summary>
/// Supplies the <see cref="BreakIterator"/> used for dividing text into passages.
/// The default implementation asks <see cref="JdkBreakIterator"/> for a sentence
/// instance bound to <see cref="CultureInfo.InvariantCulture"/>; subclasses can
/// override this method to customize the iterator.
/// </summary>
/// <param name="field">
/// The field name. The default implementation does not consult it.
/// </param>
/// <returns>The sentence-level break iterator created by the default implementation.</returns>
protected virtual BreakIterator GetBreakIterator(string field)
{
    BreakIterator sentenceIterator = JdkBreakIterator.GetSentenceInstance(CultureInfo.InvariantCulture);
    return sentenceIterator;
}