/// <summary>
/// Returns the shared <see cref="BreakIterator"/> held in <c>instance</c>,
/// creating it on the first call.
/// </summary>
/// <returns>The cached <see cref="BreakIterator"/>.</returns>
public static BreakIterator GetWordInstance()
{
    // Reuse the iterator created by an earlier call, if any.
    if (instance != null)
    {
        return instance;
    }

    // NOTE(review): creation is not synchronized here - confirm callers are single-threaded.
    instance = new BreakIterator();
    return instance;
}
/// <summary>
/// Creates a new ThaiTokenizer, supplying the AttributeFactory.
/// The base class receives a clone of <c>sentenceProto</c>; the word breaker is a clone of <c>proto</c>.
/// </summary>
/// <param name="factory">Attribute factory forwarded to the base constructor.</param>
/// <param name="reader">Input forwarded to the base constructor.</param>
/// <exception cref="System.NotSupportedException">Thrown when <c>DBBI_AVAILABLE</c> is false.</exception>
public ThaiTokenizer(AttributeFactory factory, Reader reader)
    : base(factory, reader, (BreakIterator)sentenceProto.clone())
{
    if (!DBBI_AVAILABLE)
    {
        throw new System.NotSupportedException("This JRE does not have support for Thai segmentation");
    }

    // Each tokenizer works on its own copy of the prototype iterator.
    var clonedWordIterator = (BreakIterator)proto.clone();
    wordBreaker = clonedWordIterator;
}
/// <summary>
/// Obtains one <see cref="BreakIterator"/> of each kind (character, word, line,
/// sentence and title) from the parameterless factory methods and stores them in
/// the corresponding members.
/// </summary>
public void Init()
{
    characterBreak = BreakIterator.GetCharacterInstance();
    wordBreak = BreakIterator.GetWordInstance();
    lineBreak = BreakIterator.GetLineInstance();
    // Logging around the sentence iterator is currently disabled:
    //Logln("Creating sentence iterator...");
    sentenceBreak = BreakIterator.GetSentenceInstance();
    //Logln("Finished creating sentence iterator...");
    titleBreak = BreakIterator.GetTitleInstance();
}
/// <summary>
/// Splits Khmer text into words with the ICU word break iterator ("km-KH")
/// and returns the words one per line.
/// </summary>
/// <param name="txt">The text to segment.</param>
/// <returns>The words joined with <see cref="Environment.NewLine"/>.</returns>
public static String wordList(string txt)
{
    // todo
    Icu.Wrapper.Init();
    try
    {
        var words = BreakIterator.Split(BreakIterator.UBreakIteratorType.WORD, "km-KH", txt);

        // Return the joined list. The earlier version built it and then
        // returned the unmodified input instead.
        return String.Join(Environment.NewLine, words);
    }
    finally
    {
        // Always release ICU, even when segmentation throws.
        Icu.Wrapper.Cleanup();
    }
}
/// <summary>
/// Reads a text file and computes Khmer ("km-KH") word, sentence and character
/// statistics for its contents.
/// </summary>
/// <param name="inputFile">Path of the text file to analyse.</param>
/// <returns>
/// A populated <c>Statistics</c> object, or <c>null</c> when any step throws
/// (the exception is passed to <c>XLogger.Error</c>).
/// </returns>
private static Statistics WorkWithDocument(string inputFile)
{
    try
    {
        var contents = File.ReadAllText(inputFile);

        // Definitions.json is expected next to the executing assembly.
        var defsFile = Path.Combine(Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location), "Definitions.json");

        var words = BreakIterator.Split(BreakIterator.UBreakIteratorType.WORD, "km-KH", contents).ToList();
        var chars = BreakIterator.Split(BreakIterator.UBreakIteratorType.CHARACTER, "km-KH", contents).ToList();

        // Sentences are delimited by the literal separator below rather than by the ICU sentence iterator.
        var sentences = contents.Split(new string[] { "។" }, StringSplitOptions.None).ToList();
        var longestSentence = sentences.OrderByDescending(s => s.Length).FirstOrDefault();
        var longestSentenceWordsAPI = BreakIterator.Split(BreakIterator.UBreakIteratorType.WORD, "km-KH", longestSentence).ToList();

        // NOTE(review): longestWord is null when no words were found; the Split call
        // below then receives null and any resulting exception ends up in the catch block.
        var longestWord = words.OrderByDescending(s => s.Length).FirstOrDefault();
        var longestWordChars = BreakIterator.Split(BreakIterator.UBreakIteratorType.CHARACTER, "km-KH", longestWord).ToList();

        var defs = JsonConvert.DeserializeObject<Definitions>(File.ReadAllText(defsFile));

        return new Statistics()
        {
            Sentences = sentences.Count.ToString(),
            Words = words.Count.ToString(),
            // NOTE(review): Intersect yields distinct elements, so these are counts of
            // distinct consonants/vowels present, not of occurrences - confirm intent.
            Consonants = chars.Intersect(defs.Consonants.ToList()).Count().ToString(),
            Vowels = chars.Intersect(defs.Vowels.ToList()).Count().ToString(),
            LongestSentence = longestSentence,
            LongestSentenceWords = longestSentenceWordsAPI.Count.ToString(),
            LongestWord = longestWord,
            LongestWordChars = longestWordChars.Count.ToString(),
            WordList = String.Join(Environment.NewLine, words),
            AddingZWSP = String.Join("\u200B", words),
        };
    }
    catch (Exception x)
    {
        // Best effort: log the failure and signal it to the caller with null.
        XLogger.Error(x);
        return null;
    }
}
/// <summary>
/// Creates a new ThaiTokenizer, supplying the AttributeFactory.
/// Sentence segmentation (base class) and word segmentation both use iterators
/// created for <c>Locale.GetUS()</c>.
/// </summary>
/// <param name="factory">Attribute factory forwarded to the base constructor.</param>
/// <param name="reader">Input forwarded to the base constructor.</param>
/// <exception cref="System.NotSupportedException">Thrown when <c>DBBI_AVAILABLE</c> is false.</exception>
public ThaiTokenizer(AttributeFactory factory, TextReader reader)
    : base(factory, reader, BreakIterator.CreateSentenceInstance(Locale.GetUS()))
{
    if (!DBBI_AVAILABLE)
    {
        throw new System.NotSupportedException("This JRE does not have support for Thai segmentation");
    }

    // Wrap a fresh word iterator so this tokenizer does not share iterator state.
    var usWordIterator = BreakIterator.CreateWordInstance(Locale.GetUS());
    wordBreaker = new ThaiWordBreaker(usWordIterator);

    termAtt = AddAttribute<ICharTermAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
}
/// <summary>
/// Wraps a <see cref="BreakIterator"/>. If it is a <see cref="RuleBasedBreakIterator"/>,
/// the rule status can be used for token type, so an <c>RBBIWrapper</c> is returned.
/// For any other <see cref="BreakIterator"/> the rule status is not available, so it is
/// treated as a generic iterator and a <c>BIWrapper</c> is returned.
/// </summary>
/// <param name="breakIterator">The iterator to wrap.</param>
/// <returns>The wrapper matching the runtime type of <paramref name="breakIterator"/>.</returns>
public static BreakIteratorWrapper Wrap(BreakIterator breakIterator)
{
    if (breakIterator is RuleBasedBreakIterator ruleBased)
    {
        return new RBBIWrapper(ruleBased);
    }

    return new BIWrapper(breakIterator);
}
/// <summary>
/// Sets <paramref name="text"/> on the iterator and asserts that the first five
/// calls to <c>Next()</c> return 20, 84, 90, 181 and 278, then rewinds the iterator
/// with <c>First()</c>.
/// </summary>
/// <param name="filteredBI">The break iterator under test.</param>
/// <param name="text">The text to iterate over.</param>
private void assertDefaultBreakBehavior(BreakIterator filteredBI, String text)
{
    Logln("Testing Default Behavior:");
    filteredBI.SetText(text);

    // Each assertion carries its own label so a failure identifies the step
    // (previously all five were labelled "1st next").
    assertEquals("1st next", 20, filteredBI.Next());
    assertEquals("2nd next", 84, filteredBI.Next());
    assertEquals("3rd next", 90, filteredBI.Next());
    assertEquals("4th next", 181, filteredBI.Next());
    assertEquals("5th next", 278, filteredBI.Next());

    filteredBI.First();
}
/// <summary>
/// Counts the words that the ICU word break iterator ("km-KH") finds in the text.
/// </summary>
/// <param name="txt">The text to segment.</param>
/// <returns>The number of segments returned by the word break iterator.</returns>
public static int wordCount(string txt)
{
    Icu.Wrapper.Init();
    try
    {
        return BreakIterator.Split(BreakIterator.UBreakIteratorType.WORD, "km-KH", txt).Count();
    }
    finally
    {
        // Always release ICU, even when segmentation throws.
        Icu.Wrapper.Cleanup();
    }
}
/// <summary>
/// Counts the Khmer vowels present in the text, using the vowel list in <c>defs.Vowels</c>.
/// </summary>
/// <param name="text">The text to inspect.</param>
/// <returns>
/// The size of the intersection between the ICU character segments ("km-KH") of
/// <paramref name="text"/> and <c>defs.Vowels</c>.
/// </returns>
public static int vowelCount(string text)
{
    Icu.Wrapper.Init();
    try
    {
        var chars = BreakIterator.Split(BreakIterator.UBreakIteratorType.CHARACTER, "km-KH", text);

        // NOTE(review): Intersect yields distinct elements, so this counts distinct
        // vowels present rather than vowel occurrences - confirm intent.
        return chars.Intersect(defs.Vowels.ToList()).Count();
    }
    finally
    {
        // Always release ICU, even when segmentation throws.
        Icu.Wrapper.Cleanup();
    }
}
/// <summary>
/// Inserts a zero width space ("\u200B") between the Khmer words of the text.
/// </summary>
/// <remarks>
/// Example: <c>wordswordswords</c> becomes <c>words\u200Bwords\u200Bwords</c>.
/// </remarks>
/// <param name="txt">The text to segment.</param>
/// <returns>The ICU word segments ("km-KH") joined with "\u200B".</returns>
public static String addZWSP(string txt)
{
    Icu.Wrapper.Init();
    try
    {
        var words = BreakIterator.Split(BreakIterator.UBreakIteratorType.WORD, "km-KH", txt);
        return String.Join("\u200B", words);
    }
    finally
    {
        // Always release ICU, even when segmentation throws.
        Icu.Wrapper.Cleanup();
    }
}
/// <summary>
/// Click handler: splits the contents of <c>uxText</c> into words with a word break
/// iterator for <c>Locale.GetUS()</c> and shows them joined with "-" in a message box.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void UxBreakClick(object sender, EventArgs e)
{
    using (BreakIterator wordIterator = BreakIterator.CreateWordInstance(Locale.GetUS()))
    {
        wordIterator.SetText(this.uxText.Text);

        string[] pieces = wordIterator.Enumerate().ToArray();
        string joined = string.Join("-", pieces);

        // Shown while the iterator is still alive, as before.
        MessageBox.Show(joined);
    }
}
/// <summary>
/// Verifies that sentence splitting with the "zh-HK" locale yields the three expected segments.
/// </summary>
public void Split_Sentence()
{
    const string input = "供重呼車遊踏持図質腰大野明会掲歌? 方図強候準素能物第毎止田作昼野集。霊一起続時筑腺算掲断詳山住死示流投。";
    var expected = new[]
    {
        "供重呼車遊踏持図質腰大野明会掲歌? ",
        "方図強候準素能物第毎止田作昼野集。",
        "霊一起続時筑腺算掲断詳山住死示流投。"
    };

    var actual = BreakIterator.Split(BreakIterator.UBreakIteratorType.SENTENCE, "zh-HK", input).ToArray();

    Assert.That(actual.Length, Is.EqualTo(expected.Length));
    Assert.That(actual, Is.EquivalentTo(expected));
}
/// <summary>
/// Obtains a sentence <see cref="BreakIterator"/> for the invariant culture while
/// holding <c>syncLock</c>.
/// </summary>
/// <returns>The sentence iterator returned by <c>BreakIterator.GetSentenceInstance</c>.</returns>
private static BreakIterator LoadSentenceProto()
{
    BreakIterator sentenceIterator;

    UninterruptableMonitor.Enter(syncLock);
    try
    {
        sentenceIterator = BreakIterator.GetSentenceInstance(CultureInfo.InvariantCulture);
    }
    finally
    {
        UninterruptableMonitor.Exit(syncLock);
    }

    return sentenceIterator;
}
/// <summary>
/// Creates a break iterator of the given kind for the given locale, going through
/// the registered service unless the service is still in its default state.
/// </summary>
/// <param name="locale">The requested locale.</param>
/// <param name="kind">The kind of break iterator to create.</param>
/// <returns>The created <see cref="BreakIterator"/>.</returns>
public override BreakIterator CreateBreakIterator(UCultureInfo locale, int kind)
{
    // TODO: convert to UCultureInfo when service switches over
    if (!service.IsDefault)
    {
        UCultureInfo resolved;
        var fromService = (BreakIterator)service.Get(locale, kind, out resolved);

        // services make no distinction between actual & valid
        fromService.SetCulture(resolved, resolved);
        return fromService;
    }

    return CreateBreakInstance(locale, kind);
}
/// <summary>
/// Obtains a word <see cref="BreakIterator"/> for the "th" culture while holding
/// <c>syncLock</c>.
/// </summary>
/// <returns>The word iterator returned by <c>BreakIterator.GetWordInstance</c>.</returns>
private static BreakIterator LoadProto()
{
    BreakIterator thaiWordIterator;

    UninterruptableMonitor.Enter(syncLock);
    try
    {
        thaiWordIterator = BreakIterator.GetWordInstance(new CultureInfo("th"));
    }
    finally
    {
        UninterruptableMonitor.Exit(syncLock);
    }

    return thaiWordIterator;
}
/// <summary>
/// Wraps a Japanese sentence iterator with the filter built for the same locale and
/// checks that "OKです。" yields a single boundary at 5 after the starting point 0.
/// </summary>
public void TestFilteredJapanese()
{
    ULocale japanese = ULocale.JAPANESE;

    var builder = FilteredBreakIteratorBuilder.GetInstance(japanese);
    var sentenceIterator = BreakIterator.GetSentenceInstance(japanese);
    BreakIterator filtered = builder.WrapIteratorWithFilter(sentenceIterator);

    filtered.SetText("OKです。");

    assertEquals("Starting point", 0, filtered.Current);
    assertEquals("Next point", 5, filtered.Next());
    assertEquals("Last point", BreakIterator.Done, filtered.Next());
}
/// <summary>
/// Verifies that character boundaries for "abc? 1" in "en-US" are the six
/// consecutive one-unit ranges from 0 to 6.
/// </summary>
public void GetBoundaries_Character()
{
    const string text = "abc? 1";
    var expected = new[]
    {
        new Boundary(0, 1),
        new Boundary(1, 2),
        new Boundary(2, 3),
        new Boundary(3, 4),
        new Boundary(4, 5),
        new Boundary(5, 6)
    };

    var locale = new Locale("en-US");
    var actual = BreakIterator.GetBoundaries(BreakIterator.UBreakIteratorType.CHARACTER, locale, text).ToArray();

    Assert.That(actual.Length, Is.EqualTo(expected.Length));
    Assert.That(actual, Is.EquivalentTo(expected));
}
/// <summary>
/// Verifies that setting a null text on a character break iterator ("de-DE")
/// throws <see cref="ArgumentNullException"/>.
/// </summary>
public void BreakIteratorThatIsNull()
{
    using (var characterIterator = BreakIterator.CreateCharacterInstance(new Locale("de-DE")))
    {
        Assert.Throws<ArgumentNullException>(() => characterIterator.SetText(null));
    }
}
/// <summary>
/// Verifies the sentence boundaries reported for a short mixed-punctuation text in "en-US".
/// </summary>
public void GetBoundaries_Sentence()
{
    const string text = "Aa bb. Ccdef 3.5 x? Y?x! Z";
    var expected = new[]
    {
        new Boundary(0, 7),
        new Boundary(7, 20),
        new Boundary(20, 22),
        new Boundary(22, 25),
        new Boundary(25, 26)
    };

    var locale = new Locale("en-US");
    var actual = BreakIterator.GetBoundaries(BreakIterator.UBreakIteratorType.SENTENCE, locale, text).ToArray();

    Assert.That(actual.Length, Is.EqualTo(expected.Length));
    Assert.That(actual, Is.EquivalentTo(expected));
}
/// <summary>
/// Initializes a new instance of the <see cref="StringSearcher{T}"/> class.
/// </summary>
/// <param name="type">The type.</param>
/// <param name="wsManager">The writing system store; used to look up collators and ICU locales.</param>
/// <exception cref="ArgumentNullException"><paramref name="wsManager"/> is null.</exception>
public StringSearcher(SearchType type, WritingSystemManager wsManager)
{
    if (wsManager == null)
    {
        throw new ArgumentNullException(nameof(wsManager));
    }

    m_type = type;

    // Sort keys come from the default collation of the writing system.
    m_sortKeySelector = (ws, text) =>
    {
        var writingSystem = wsManager.Get(ws);
        return writingSystem.DefaultCollation.Collator.GetSortKey(text).KeyData;
    };

    // Tokens are the ICU word segments for the writing system's ICU locale.
    m_tokenizer = (ws, text) =>
    {
        var icuLocale = wsManager.Get(ws).IcuLocale;
        return BreakIterator.Split(BreakIterator.UBreakIteratorType.WORD, icuLocale, text);
    };
}
/// <summary>
/// Creates the break iterator selected by <c>type</c> for <c>locale</c>, runs it over
/// <c>text</c> and compares the offsets found with the expected ones. The forward pass
/// is checked first; only if it matches is the iterator walked backwards and each
/// offset compared with the one recorded on the way forward.
/// </summary>
public void doTest()
{
    BreakIterator brkIter;
    switch (type)
    {
        case BreakIterator.KIND_CHARACTER:
            brkIter = BreakIterator.GetCharacterInstance(locale);
            break;
        case BreakIterator.KIND_WORD:
            brkIter = BreakIterator.GetWordInstance(locale);
            break;
        case BreakIterator.KIND_LINE:
            brkIter = BreakIterator.GetLineInstance(locale);
            break;
        case BreakIterator.KIND_SENTENCE:
            brkIter = BreakIterator.GetSentenceInstance(locale);
            break;
        default:
            Errln("Unsupported break iterator type " + type);
            return;
    }
    brkIter.SetText(text);

    // Forward pass: collect up to maxOffsetCount boundaries.
    int[] forward = new int[maxOffsetCount];
    int forwardCount = 0;
    while (forwardCount < maxOffsetCount)
    {
        int boundary = brkIter.Next();
        if (boundary == BreakIterator.Done)
        {
            break;
        }
        forward[forwardCount++] = boundary;
    }

    if (!offsetsMatchExpected(forward, forwardCount))
    {
        // log error for forwards test
        String shown = (text.Length > 16) ? text.Substring(0, 16) : text;
        Errln("For type " + type + " " + locale + ", text \"" + shown + "...\"" +
                "; expect " + expectOffsets.Length + " offsets:" + formatOffsets(expectOffsets, expectOffsets.Length) +
                "; found " + forwardCount + " offsets fwd:" + formatOffsets(forward, forwardCount));
        return;
    }

    // Backward pass: the iterator sits on the last recorded offset, so the first
    // Previous() must land on the second-to-last one.
    for (int idx = forwardCount - 2; idx >= 0; idx--)
    {
        int previous = brkIter.Previous();
        if (previous == forward[idx])
        {
            continue;
        }

        // log error for backwards test
        String shown = (text.Length > 16) ? text.Substring(0, 16) : text;
        Errln("For type " + type + " " + locale + ", text \"" + shown + "...\"" +
                "; expect " + expectOffsets.Length + " offsets:" + formatOffsets(expectOffsets, expectOffsets.Length) +
                "; found rev offset " + previous + " where expect " + forward[idx]);
        break;
    }
}
/// <summary>
/// Verifies that word splitting with the "zh-HK" locale yields the five expected words.
/// The test is ignored on ICU versions older than 52.1.
/// </summary>
public void Split_Word()
{
    // Compare versions numerically: an ordinal string compare misorders versions
    // with a different number of digits (e.g. "100.1" sorts before "52.1").
    // If the version string cannot be parsed, fall back to the ordinal compare.
    System.Version icuVersion;
    bool olderThanRequired = System.Version.TryParse(Wrapper.IcuVersion, out icuVersion)
        ? icuVersion < new System.Version(52, 1)
        : string.CompareOrdinal(Wrapper.IcuVersion, "52.1") < 0;

    if (olderThanRequired)
    {
        Assert.Ignore("This test requires ICU 52 or higher");
    }

    var parts = BreakIterator.Split(BreakIterator.UBreakIteratorType.WORD, "zh-HK", "今晚、我會睡著。一隻狗");
    var expected = new[] { "今晚", "我會", "睡著", "一隻", "狗" };

    Assert.That(parts.Count(), Is.EqualTo(expected.Length));
    Assert.That(parts.ToArray(), Is.EquivalentTo(expected));
}
/// <summary>
/// Asserts that the iterator reports position 0 for <c>Current</c>, <c>First()</c> and
/// <c>Last()</c>, and that every attempt to move or search returns <c>BreakIterator.Done</c>.
/// </summary>
/// <param name="bi">The iterator to check; the calls are made in a fixed order.</param>
private void Test0Sentences(BreakIterator bi)
{
    int done = BreakIterator.Done;

    // Position queries.
    assertEquals(0, bi.Current);
    assertEquals(0, bi.First());

    // Single-step movement in both directions.
    assertEquals(done, bi.Next());
    assertEquals(0, bi.Last());
    assertEquals(done, bi.Previous());

    // Searches relative to offset 0.
    assertEquals(done, bi.Following(0));
    assertEquals(done, bi.Preceding(0));

    // Multi-step movement after rewinding.
    assertEquals(0, bi.First());
    assertEquals(done, bi.Next(13));
    assertEquals(done, bi.Next(-8));
}
/// <summary>
/// Feeds 10000 random Unicode strings through a word break iterator created for
/// <c>Locale.GetUS()</c>, using a word-instance <c>CharArrayIterator</c> as the text source.
/// </summary>
public virtual void TestConsumeWordInstance()
{
    const int iterations = 10000;

    var wordIterator = BreakIterator.CreateWordInstance(Locale.GetUS());
    var charIterator = CharArrayIterator.NewWordInstance();

    for (var round = 0; round < iterations; round++)
    {
        var chars = TestUtil.RandomUnicodeString(Random()).toCharArray();
        charIterator.SetText(chars, 0, chars.Length);
        Consume(wordIterator, charIterator);
    }
}
/// <summary>
/// Feeds 10000 random Unicode strings through a sentence break iterator created for
/// <c>Locale.Default</c>, using a sentence-instance <c>CharArrayIterator</c> as the text source.
/// </summary>
public virtual void testConsumeSentenceInstance()
{
    const int iterations = 10000;

    // we use the default locale, as its randomized by LuceneTestCase
    BreakIterator sentenceIterator = BreakIterator.getSentenceInstance(Locale.Default);
    CharArrayIterator charIterator = CharArrayIterator.newSentenceInstance();

    for (int round = 0; round < iterations; round++)
    {
        char[] chars = TestUtil.randomUnicodeString(random()).toCharArray();
        charIterator.setText(chars, 0, chars.Length);
        consume(sentenceIterator, charIterator);
    }
}
/// <summary>
/// Creates a break iterator of the given kind for the given locale, going through
/// the registered service unless the service is still in its default state.
/// </summary>
/// <param name="locale">The requested locale.</param>
/// <param name="kind">The kind of break iterator to create.</param>
/// <returns>The created <see cref="BreakIterator"/>.</returns>
public override BreakIterator CreateBreakIterator(ULocale locale, int kind)
{
    // TODO: convert to ULocale when service switches over
    if (!service.IsDefault)
    {
        // One-element array used by the service to hand back the locale it resolved.
        var resolved = new ULocale[1];
        var fromService = (BreakIterator)service.Get(locale, kind, resolved);

        // services make no distinction between actual & valid
        fromService.SetLocale(resolved[0], resolved[0]);
        return fromService;
    }

    return CreateBreakInstance(locale, kind);
}
/// <summary>
/// Lazily enumerates the text segments between consecutive boundaries of the iterator,
/// starting from its first boundary.
/// </summary>
/// <param name="bi">The break iterator; its text is read via <c>GetCLRText()</c>.</param>
/// <returns>
/// One substring per boundary pair, in order. Enumeration advances
/// <paramref name="bi"/> until it returns <c>BreakIterator.DONE</c>.
/// </returns>
public static IEnumerable<string> Enumerate(this BreakIterator bi)
{
    string text = bi.GetCLRText();
    int start = bi.First(), end = bi.Next();

    while (end != BreakIterator.DONE)
    {
        yield return text.Substring(start, end - start);

        // The end of this segment is the start of the next one.
        start = end;
        end = bi.Next();
    }
}
/// <summary>
/// Set the <see cref="Text.BreakIterator"/> that will be used to restrict the points
/// at which matches are detected.
/// </summary>
/// <param name="breakIterator">
/// A <see cref="Text.BreakIterator"/> that will be used to restrict the
/// points at which matches are detected. If a match is
/// found, but the match's start or end index is not a
/// boundary as determined by the <see cref="Text.BreakIterator"/>,
/// the match will be rejected and another will be searched
/// for. If this parameter is <c>null</c>, no break
/// detection is attempted.
/// </param>
/// <seealso cref="Text.BreakIterator"/>
/// <stable>ICU 2.0</stable>
public virtual void SetBreakIterator(BreakIterator breakIterator)
{
    search_.BreakIterator = breakIterator;

    if (search_.BreakIterator != null && search_.Text != null)
    {
        // Hand the iterator a clone of the CharacterIterator, so it won't
        // affect the position currently held by search_.Text
        var textCopy = (CharacterIterator)search_.Text.Clone();
        search_.BreakIterator.SetText(textCopy);
    }
}
/// <summary>
/// Feeds 10000 random Unicode strings through a sentence break iterator created for
/// the current culture, using a sentence-instance <c>CharArrayIterator</c> as the text source.
/// </summary>
public virtual void TestConsumeSentenceInstance()
{
    const int iterations = 10000;

    // we use the default locale, as its randomized by LuceneTestCase
    BreakIterator sentenceIterator = BreakIterator.GetSentenceInstance(CultureInfo.CurrentCulture);
    var charIterator = CharArrayIterator.NewSentenceInstance();

    for (var round = 0; round < iterations; round++)
    {
        var chars = TestUtil.RandomUnicodeString(Random()).toCharArray();
        charIterator.SetText(chars, 0, chars.Length);
        Consume(sentenceIterator, charIterator);
    }
}
/// <summary>
/// Lazily enumerates the segments between consecutive boundaries of the iterator as
/// <c>Token</c> values carrying the start, end, text and the rule status reported
/// by the iterator for that boundary.
/// </summary>
/// <param name="bi">The break iterator; its text is read via <c>GetCLRText()</c>.</param>
/// <returns>
/// One token per boundary pair, in order. Enumeration advances
/// <paramref name="bi"/> until it returns <c>BreakIterator.DONE</c>.
/// </returns>
public static IEnumerable<Token> EnumerateTokens(this BreakIterator bi)
{
    string text = bi.GetCLRText();

    for (int start = bi.First(), end = bi.Next();
         end != BreakIterator.DONE;
         start = end, end = bi.Next())
    {
        string segment = text.Substring(start, end - start);

        // Read the status before the iterator is advanced again.
        yield return new Token(start, end, segment, bi.GetRuleStatus());
    }
}
/// <summary>
/// Verifies that replacing an iterator's text with null throws
/// <see cref="ArgumentNullException"/>, even after a valid text was set.
/// </summary>
public void CanSetNewText_Null()
{
    const string firstText = "Good-day, kind sir ! Can I have a glass of water? I am very parched.";
    string secondText = null;

    using (var characterIterator = BreakIterator.CreateCharacterInstance(new Locale("en-US")))
    {
        characterIterator.SetText(firstText);

        Assert.Throws<ArgumentNullException>(() => characterIterator.SetText(secondText));
    }
}
/// <summary>
/// Creates a ThaiWordBreaker around the given word break iterator.
/// </summary>
/// <param name="wordBreaker">The iterator to store; must not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="wordBreaker"/> is null.</exception>
public ThaiWordBreaker(BreakIterator wordBreaker)
{
    this.wordBreaker = wordBreaker ?? throw new ArgumentNullException(nameof(wordBreaker));
}
/// <summary>
/// Construct a new SegmenterBase, also supplying the AttributeFactory.
/// Registers the offset attribute and stores the supplied iterator.
/// </summary>
/// <param name="factory">Attribute factory forwarded to the base constructor.</param>
/// <param name="reader">Input forwarded to the base constructor.</param>
/// <param name="iterator">The <see cref="BreakIterator"/> stored in <c>this.iterator</c>.</param>
protected SegmentingTokenizerBase(AttributeFactory factory, TextReader reader, BreakIterator iterator) : base(factory, reader)
{
    offsetAtt = AddAttribute<IOffsetAttribute>();
    this.iterator = iterator;
}
/// <summary>
/// Sets the string form of <paramref name="ci"/> as the iterator's text and walks
/// the iterator forward until it reports <c>BreakIterator.DONE</c>.
/// </summary>
/// <param name="bi">The break iterator to exhaust.</param>
/// <param name="ci">The source of the text.</param>
private void Consume(BreakIterator bi, CharacterIterator ci)
{
    bi.SetText(ci.toString());

    // The boundaries themselves are irrelevant; only a complete traversal matters.
    int boundary;
    do
    {
        boundary = bi.Next();
    }
    while (boundary != BreakIterator.DONE);
}
/// <summary>
/// Construct a new SegmenterBase, using
/// the provided BreakIterator for sentence segmentation.
/// Delegates to the three-argument constructor with
/// <c>AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY</c>.
/// <para>
/// Note that you should never share BreakIterators across different
/// TokenStreams, instead a newly created or cloned one should always
/// be provided to this constructor.
/// </para>
/// </summary>
/// <param name="reader">Input forwarded to the delegated constructor.</param>
/// <param name="iterator">The sentence <see cref="BreakIterator"/> to use.</param>
public SegmentingTokenizerBase(Reader reader, BreakIterator iterator) : this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, reader, iterator)
{
}
/// <summary>
/// Construct a new SegmenterBase, also supplying the AttributeFactory.
/// </summary>
/// <param name="factory">Attribute factory forwarded to the base constructor.</param>
/// <param name="reader">Input forwarded to the base constructor.</param>
/// <param name="iterator">The <see cref="BreakIterator"/> stored in <c>this.iterator</c>.</param>
public SegmentingTokenizerBase(AttributeFactory factory, Reader reader, BreakIterator iterator) : base(factory, reader)
{
    this.iterator = iterator;
}
/// <summary>
/// Assigns <paramref name="ci"/> as the iterator's text and walks the iterator
/// forward until it reports <c>BreakIterator.DONE</c>.
/// </summary>
/// <param name="bi">The break iterator to exhaust.</param>
/// <param name="ci">The character iterator supplying the text.</param>
private void consume(BreakIterator bi, CharacterIterator ci)
{
    bi.Text = ci;

    // The boundaries themselves are irrelevant; only a complete traversal matters.
    int boundary;
    do
    {
        boundary = bi.next();
    }
    while (boundary != BreakIterator.DONE);
}