/// <summary>
/// Checks sentence segmentation of an English sample for the "en-US" locale.
/// </summary>
public void Split_Sentence()
{
    // The expectation keeps "Cc 3.5 x? " as one sentence (the decimal point is not a
    // terminator) while "Y?" directly followed by a letter is split off on its own.
    const string text = "Aa bb. Cc 3.5 x? Y?x! Z";
    var expected = new[] { "Aa bb. ", "Cc 3.5 x? ", "Y?", "x! ", "Z" };

    var sentences = BreakIterator.Split(BreakIterator.UBreakIteratorType.SENTENCE, "en-US", text);

    Assert.That(sentences.ToArray(), Is.EquivalentTo(expected));
    Assert.That(sentences.Count(), Is.EqualTo(5));
}
/// <summary>
/// Checks that character segmentation of "abc" for "en-US" yields one segment per letter.
/// </summary>
public void Split_Character()
{
    const string text = "abc";
    var expected = new[] { "a", "b", "c" };

    var characters = BreakIterator.Split(BreakIterator.UBreakIteratorType.CHARACTER, "en-US", text);

    Assert.That(characters.Count(), Is.EqualTo(3));
    Assert.That(characters.ToArray(), Is.EquivalentTo(expected));
}
/// <summary>
/// Checks line-break segmentation of an English sample for the "en-US" locale.
/// </summary>
public void Split_Line()
{
    // Each expected segment keeps its trailing space.
    const string text = "Aa Bb. Cc";
    var expected = new[] { "Aa ", "Bb. ", "Cc" };

    var segments = BreakIterator.Split(BreakIterator.UBreakIteratorType.LINE, "en-US", text);

    Assert.That(segments.Count(), Is.EqualTo(3));
    Assert.That(segments.ToArray(), Is.EquivalentTo(expected));
}
/// <summary>
/// Checks line-break segmentation of a Chinese sample for the "zh-HK" locale.
/// </summary>
public void Split_Line()
{
    // In the expectation, punctuation stays attached to the preceding character ("晚、", "著。").
    const string text = "今晚、我會睡著。";
    var expected = new[] { "今", "晚、", "我", "會", "睡", "著。" };

    var segments = BreakIterator.Split(BreakIterator.UBreakIteratorType.LINE, "zh-HK", text);

    Assert.That(segments.Count(), Is.EqualTo(expected.Length));
    Assert.That(segments.ToArray(), Is.EquivalentTo(expected));
}
/// <summary>
/// Checks character segmentation of a Chinese sample for the "zh-HK" locale.
/// </summary>
public void Split_Character()
{
    // Every character, punctuation included, is expected as its own segment.
    const string text = "今晚、我會睡著。狗";
    var expected = new[] { "今", "晚", "、", "我", "會", "睡", "著", "。", "狗" };

    var characters = BreakIterator.Split(BreakIterator.UBreakIteratorType.CHARACTER, "zh-HK", text);

    Assert.That(characters.Count(), Is.EqualTo(expected.Length));
    Assert.That(characters.ToArray(), Is.EquivalentTo(expected));
}
/// <summary>
/// Returns the longest word in the text, using ICU word segmentation for the "km-KH" locale.
/// </summary>
/// <param name="txt">The text to segment.</param>
/// <returns>
/// The segment with the greatest <see cref="string.Length"/>; on a tie, the one that comes
/// first in the segmentation order. <c>null</c> when segmentation yields no words.
/// </returns>
public static String longestWord(string txt)
{
    Icu.Wrapper.Init();
    try
    {
        return BreakIterator.Split(BreakIterator.UBreakIteratorType.WORD, "km-KH", txt)
            .OrderByDescending(word => word.Length)
            .FirstOrDefault();
    }
    finally
    {
        // Release ICU even when segmentation throws, so Init/Cleanup always stay paired.
        Icu.Wrapper.Cleanup();
    }
}
/// <summary>
/// Splits the string produced by <c>ci.ToString()</c> and enumerates every resulting
/// segment, discarding each one.
/// </summary>
/// <param name="iteratorType">The kind of break iterator to split with.</param>
/// <param name="locale">The locale passed to the splitter.</param>
/// <param name="ci">The character iterator whose string form is split.</param>
private void Consume(BreakIterator.UBreakIteratorType iteratorType, Locale locale, CharacterIterator ci)
{
    var text = ci.ToString();

    foreach (var segment in BreakIterator.Split(iteratorType, locale, text))
    {
        // Intentionally empty: the loop exists only to drive the enumeration to completion.
    }
}
/// <summary>
/// Lists the words of the text, one per line, using ICU word segmentation for the
/// "km-KH" locale.
/// </summary>
/// <param name="txt">The text to segment.</param>
/// <returns>The words joined with <see cref="Environment.NewLine"/>.</returns>
public static String wordList(string txt)
{
    Icu.Wrapper.Init();
    try
    {
        var words = BreakIterator.Split(BreakIterator.UBreakIteratorType.WORD, "km-KH", txt);

        // Return the joined list; the previous code built it and then returned the
        // unmodified input instead.
        return string.Join(Environment.NewLine, words);
    }
    finally
    {
        // Release ICU even when segmentation throws, so Init/Cleanup always stay paired.
        Icu.Wrapper.Cleanup();
    }
}
/// <summary>
/// Reads a text file and computes Khmer text statistics for it: sentence, word,
/// consonant and vowel counts, the longest sentence and word, and two joined word lists.
/// </summary>
/// <param name="inputFile">Path of the text file to analyse.</param>
/// <returns>
/// The populated <see cref="Statistics"/>, or <c>null</c> when any step throws
/// (the exception is logged through <c>XLogger.Error</c>).
/// </returns>
private static Statistics WorkWithDocument(string inputFile)
{
    const string locale = "km-KH";
    const string sentenceTerminator = "។";

    try
    {
        var contents = File.ReadAllText(inputFile);

        // Definitions.json is looked up next to the executing assembly.
        var defsFile = Path.Combine(Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location), "Definitions.json");

        var words = BreakIterator.Split(BreakIterator.UBreakIteratorType.WORD, locale, contents).ToList();
        var chars = BreakIterator.Split(BreakIterator.UBreakIteratorType.CHARACTER, locale, contents).ToList();

        // Sentences are found by splitting on the terminator rather than with the ICU
        // sentence iterator.
        // NOTE(review): with StringSplitOptions.None a text ending in the terminator yields
        // a trailing empty entry that is counted as a sentence — confirm this is intended.
        var sentences = contents.Split(new[] { sentenceTerminator }, StringSplitOptions.None).ToList();
        var longestSentence = sentences.OrderByDescending(s => s.Length).FirstOrDefault();
        var longestSentenceWords = BreakIterator.Split(BreakIterator.UBreakIteratorType.WORD, locale, longestSentence).ToList();

        var longestWord = words.OrderByDescending(s => s.Length).FirstOrDefault();
        var longestWordChars = BreakIterator.Split(BreakIterator.UBreakIteratorType.CHARACTER, locale, longestWord).ToList();

        var defs = JsonConvert.DeserializeObject<Definitions>(File.ReadAllText(defsFile));

        return new Statistics
        {
            Sentences = sentences.Count.ToString(),
            Words = words.Count.ToString(),
            // NOTE(review): Intersect yields distinct elements, so these two values count
            // the distinct consonants/vowels present, not their occurrences — confirm.
            Consonants = chars.Intersect(defs.Consonants.ToList()).Count().ToString(),
            Vowels = chars.Intersect(defs.Vowels.ToList()).Count().ToString(),
            LongestSentence = longestSentence,
            LongestSentenceWords = longestSentenceWords.Count.ToString(),
            LongestWord = longestWord,
            LongestWordChars = longestWordChars.Count.ToString(),
            WordList = String.Join(Environment.NewLine, words),
            AddingZWSP = String.Join("\u200B", words),
        };
    }
    catch (Exception x)
    {
        // Best effort: any failure (I/O, JSON, ICU) is logged and reported as "no statistics".
        XLogger.Error(x);
        return null;
    }
}
/// <summary>
/// Counts the Khmer vowels found in the text, using ICU character segmentation for the
/// "km-KH" locale.
/// </summary>
/// <param name="text">The text to inspect.</param>
/// <returns>
/// The number of distinct character segments of <paramref name="text"/> that also appear
/// in <c>defs.Vowels</c> (<c>Intersect</c> removes duplicates, so repeats count once).
/// </returns>
public static int vowelCount(string text)
{
    Icu.Wrapper.Init();
    try
    {
        var chars = BreakIterator.Split(BreakIterator.UBreakIteratorType.CHARACTER, "km-KH", text);
        return chars.Intersect(defs.Vowels).Count();
    }
    finally
    {
        // Release ICU even when segmentation throws, so Init/Cleanup always stay paired.
        Icu.Wrapper.Cleanup();
    }
}
/// <summary>
/// Counts the words in the text, using ICU word segmentation for the "km-KH" locale.
/// </summary>
/// <param name="txt">The text to segment.</param>
/// <returns>The number of segments returned by the word break iterator.</returns>
public static int wordCount(string txt)
{
    Icu.Wrapper.Init();
    try
    {
        return BreakIterator.Split(BreakIterator.UBreakIteratorType.WORD, "km-KH", txt).Count();
    }
    finally
    {
        // Release ICU even when segmentation throws, so Init/Cleanup always stay paired.
        Icu.Wrapper.Cleanup();
    }
}
/// <summary>
/// Inserts a zero-width space ("\u200B") between the words of the text, using ICU word
/// segmentation for the "km-KH" locale.
/// </summary>
/// <remarks>
/// Example: <c>wordswordswords</c> becomes <c>words"\u200B"words"\u200B"words</c>.
/// </remarks>
/// <param name="txt">The text to segment.</param>
/// <returns>The words joined with "\u200B".</returns>
public static String addZWSP(string txt)
{
    Icu.Wrapper.Init();
    try
    {
        var words = BreakIterator.Split(BreakIterator.UBreakIteratorType.WORD, "km-KH", txt);
        return string.Join("\u200B", words);
    }
    finally
    {
        // Release ICU even when segmentation throws, so Init/Cleanup always stay paired.
        Icu.Wrapper.Cleanup();
    }
}
/// <summary>
/// Checks sentence segmentation of a CJK sample for the "zh-HK" locale.
/// </summary>
public void Split_Sentence()
{
    // The expectation keeps the space after "?" with the first sentence and ends the
    // other two sentences at the ideographic full stop.
    const string text = "供重呼車遊踏持図質腰大野明会掲歌? 方図強候準素能物第毎止田作昼野集。霊一起続時筑腺算掲断詳山住死示流投。";
    var expected = new[]
    {
        "供重呼車遊踏持図質腰大野明会掲歌? ",
        "方図強候準素能物第毎止田作昼野集。",
        "霊一起続時筑腺算掲断詳山住死示流投。"
    };

    var sentences = BreakIterator.Split(BreakIterator.UBreakIteratorType.SENTENCE, "zh-HK", text);

    Assert.That(sentences.Count(), Is.EqualTo(expected.Length));
    Assert.That(sentences.ToArray(), Is.EquivalentTo(expected));
}
/// <summary>
/// Initializes a new instance of the <see cref="StringSearcher{T}"/> class.
/// </summary>
/// <param name="type">The search type.</param>
/// <param name="wsManager">
/// The writing system manager used to resolve a writing system's collator (for sort keys)
/// and ICU locale (for word tokenizing).
/// </param>
/// <exception cref="ArgumentNullException"><paramref name="wsManager"/> is <c>null</c>.</exception>
public StringSearcher(SearchType type, WritingSystemManager wsManager)
{
    if (wsManager == null)
    {
        throw new ArgumentNullException(nameof(wsManager));
    }

    m_type = type;

    // Both delegates capture wsManager and look the writing system up on every call.
    m_sortKeySelector = (ws, text) => wsManager.Get(ws).DefaultCollation.Collator.GetSortKey(text).KeyData;
    m_tokenizer = (ws, text) => BreakIterator.Split(BreakIterator.UBreakIteratorType.WORD, wsManager.Get(ws).IcuLocale, text);
}
/// <summary>
/// Checks word segmentation of a Chinese sample for the "zh-HK" locale.
/// The test is ignored when the loaded ICU is older than 52.1.
/// </summary>
public void Split_Word()
{
    // Compare versions numerically: an ordinal string comparison would rank e.g. "100.1"
    // before "52.1". Fall back to the ordinal check when the string is not parseable.
    var icuVersion = Wrapper.IcuVersion;
    System.Version parsedVersion;
    var icuTooOld = System.Version.TryParse(icuVersion, out parsedVersion)
        ? parsedVersion < new System.Version(52, 1)
        : string.CompareOrdinal(icuVersion, "52.1") < 0;

    if (icuTooOld)
    {
        Assert.Ignore("This test requires ICU 52 or higher");
    }

    var parts = BreakIterator.Split(BreakIterator.UBreakIteratorType.WORD, "zh-HK", "今晚、我會睡著。一隻狗");

    // Punctuation is not expected among the returned words.
    var expected = new[] { "今晚", "我會", "睡著", "一隻", "狗" };

    Assert.That(parts.Count(), Is.EqualTo(expected.Length));
    Assert.That(parts.ToArray(), Is.EquivalentTo(expected));
}
/// <summary>
/// Splits the text with the given break iterator type, always using the "en-US" locale.
/// </summary>
/// <param name="type">The kind of break iterator to split with.</param>
/// <param name="text">The text to split.</param>
/// <returns>The segments produced by <c>BreakIterator.Split</c>.</returns>
public IEnumerable<string> Split(BreakIterator.UBreakIteratorType type, string text)
{
    const string locale = "en-US";

    var segments = BreakIterator.Split(type, locale, text);
    return segments;
}