/// <summary>
/// Sentence-level splitting of English text: each sentence keeps its trailing
/// whitespace, and "3.5" does not end a sentence.
/// </summary>
public void Split_Sentence()
        {
            var expected = new[] { "Aa bb. ", "Cc 3.5 x? ", "Y?", "x! ", "Z" };

            // Materialize once so the sequence is not enumerated separately per assertion.
            var parts = BreakIterator.Split(BreakIterator.UBreakIteratorType.SENTENCE, "en-US", "Aa bb. Cc 3.5 x? Y?x! Z").ToArray();

            Assert.That(parts.Length, Is.EqualTo(expected.Length));
            // Order-sensitive comparison: segments must come back in text order.
            Assert.That(parts, Is.EqualTo(expected));
        }
        /// <summary>
        /// Character-level splitting of plain ASCII text yields one segment per letter.
        /// </summary>
        public void Split_Character()
        {
            var expected = new[] { "a", "b", "c" };

            // Materialize once so the sequence is not enumerated separately per assertion.
            var parts = BreakIterator.Split(BreakIterator.UBreakIteratorType.CHARACTER, "en-US", "abc").ToArray();

            Assert.That(parts.Length, Is.EqualTo(expected.Length));
            // Order-sensitive comparison: segments must come back in text order.
            Assert.That(parts, Is.EqualTo(expected));
        }
        /// <summary>
        /// Line-break splitting of English text: each segment ends after the
        /// whitespace that follows a word.
        /// </summary>
        public void Split_Line()
        {
            var expected = new[] { "Aa ", "Bb. ", "Cc" };

            // Materialize once so the sequence is not enumerated separately per assertion.
            var parts = BreakIterator.Split(BreakIterator.UBreakIteratorType.LINE, "en-US", "Aa Bb. Cc").ToArray();

            Assert.That(parts.Length, Is.EqualTo(expected.Length));
            // Order-sensitive comparison: segments must come back in text order.
            Assert.That(parts, Is.EqualTo(expected));
        }
        /// <summary>
        /// Line-break splitting of Chinese text with the "zh-HK" locale: the expected
        /// segments keep "、" and "。" attached to the preceding character.
        /// </summary>
        public void Split_Line()
        {
            var expected = new[] { "今", "晚、", "我", "會", "睡", "著。" };

            // Materialize once so the sequence is not enumerated separately per assertion.
            var parts = BreakIterator.Split(BreakIterator.UBreakIteratorType.LINE, "zh-HK", "今晚、我會睡著。").ToArray();

            Assert.That(parts.Length, Is.EqualTo(expected.Length));
            // Order-sensitive comparison: segments must come back in text order.
            Assert.That(parts, Is.EqualTo(expected));
        }
        /// <summary>
        /// Character-level splitting of Chinese text with the "zh-HK" locale: every
        /// ideograph and punctuation mark is expected as its own segment.
        /// </summary>
        public void Split_Character()
        {
            var expected = new[] { "今", "晚", "、", "我", "會", "睡", "著", "。", "狗" };

            // Materialize once so the sequence is not enumerated separately per assertion.
            var parts = BreakIterator.Split(BreakIterator.UBreakIteratorType.CHARACTER, "zh-HK", "今晚、我會睡著。狗").ToArray();

            Assert.That(parts.Length, Is.EqualTo(expected.Length));
            // Order-sensitive comparison: segments must come back in text order.
            Assert.That(parts, Is.EqualTo(expected));
        }
        //
        /// <summary>
        /// Returns the longest segment produced by the ICU word break iterator
        /// ("km-KH" locale) for the given text.
        /// </summary>
        /// <param name="txt">Text to segment.</param>
        /// <returns>
        /// The segment with the greatest <c>Length</c> (the earliest one on ties, since
        /// the ordering is stable), or <c>null</c> when no segments are produced.
        /// </returns>
        public static String longestWord(string txt)
        {
            Icu.Wrapper.Init();
            try
            {
                // The whole query is evaluated here, before ICU is cleaned up.
                return BreakIterator.Split(BreakIterator.UBreakIteratorType.WORD, "km-KH", txt)
                                    .OrderByDescending(segment => segment.Length)
                                    .FirstOrDefault();
            }
            finally
            {
                // Run the cleanup even when segmentation throws.
                Icu.Wrapper.Cleanup();
            }
        }
Example #7
0
        /// <summary>
        /// Splits the string form of <paramref name="ci"/> with the requested iterator
        /// type and locale, then walks every resulting segment without using it.
        /// </summary>
        /// <param name="iteratorType">Kind of boundary to split on.</param>
        /// <param name="locale">Locale passed to the break iterator.</param>
        /// <param name="ci">Source whose <c>ToString()</c> value is split.</param>
        private void Consume(BreakIterator.UBreakIteratorType iteratorType, Locale locale, CharacterIterator ci)
        {
            var text     = ci.ToString();
            var segments = BreakIterator.Split(iteratorType, locale, text);

            foreach (var segment in segments)
            {
                // Intentionally empty: the loop only drives the enumeration.
            }
        }
        //
        /// <summary>
        /// Segments the text with the ICU word break iterator ("km-KH" locale) and
        /// returns the segments as a list, one per line.
        /// </summary>
        /// <param name="txt">Text to segment.</param>
        /// <returns>The segments joined with <see cref="Environment.NewLine"/>.</returns>
        public static String wordList(string txt)
        {
            Icu.Wrapper.Init();
            try
            {
                var words = BreakIterator.Split(BreakIterator.UBreakIteratorType.WORD, "km-KH", txt);

                // Return the joined list; previously it was built and then discarded
                // in favour of the unmodified input.
                return String.Join(Environment.NewLine, words);
            }
            finally
            {
                // Run the cleanup even when segmentation throws.
                Icu.Wrapper.Cleanup();
            }
        }
Example #9
0
        /// <summary>
        /// Reads a text file, segments it with the ICU break iterators ("km-KH"
        /// locale) and gathers counts and longest-item statistics about it.
        /// Character definitions are loaded from "Definitions.json" located next to
        /// the executing assembly.
        /// </summary>
        /// <param name="inputFile">Path of the text file to analyse.</param>
        /// <returns>
        /// The populated <c>Statistics</c>, or <c>null</c> when any step throws (the
        /// exception is logged through <c>XLogger.Error</c>).
        /// </returns>
        private static Statistics WorkWithDocument(string inputFile)
        {
            try
            {
                var contents = File.ReadAllText(inputFile);
                var defsFile = Path.Combine(Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location), "Definitions.json");

                var words = BreakIterator.Split(BreakIterator.UBreakIteratorType.WORD, "km-KH", contents).ToList();
                var chars = BreakIterator.Split(BreakIterator.UBreakIteratorType.CHARACTER, "km-KH", contents).ToList();

                // Sentences are taken by splitting on the "។" sign, not from the ICU
                // sentence iterator. Empty entries are kept (StringSplitOptions.None).
                var sentences       = contents.Split(new string[] { "។" }, StringSplitOptions.None).ToList();
                var longestSentence = sentences.OrderByDescending(s => s.Length).FirstOrDefault();

                var longestSentenceWordsAPI = BreakIterator.Split(BreakIterator.UBreakIteratorType.WORD, "km-KH", longestSentence).ToList();
                var longestWord             = words.OrderByDescending(s => s.Length).FirstOrDefault();
                var longestWordChars        = BreakIterator.Split(BreakIterator.UBreakIteratorType.CHARACTER, "km-KH", longestWord).ToList();

                var defs = JsonConvert.DeserializeObject<Definitions>(File.ReadAllText(defsFile));

                return new Statistics()
                {
                    Sentences = sentences.Count().ToString(),
                    Words = words.Count().ToString(),

                    // Intersect is a set operation, so each of these counts DISTINCT
                    // matching values rather than total occurrences.
                    // NOTE(review): confirm that a distinct count is what is wanted here.
                    Consonants = chars.Intersect(defs.Consonants.ToList()).Count().ToString(),
                    Vowels = chars.Intersect(defs.Vowels.ToList()).Count().ToString(),

                    LongestSentence = longestSentence,
                    LongestSentenceWords = longestSentenceWordsAPI.Count().ToString(),

                    LongestWord = longestWord,
                    LongestWordChars = longestWordChars.Count().ToString(),

                    WordList = String.Join(Environment.NewLine, words),
                    AddingZWSP = String.Join("\u200B", words),
                };
            }
            catch (Exception x)
            {
                // Best-effort: report the failure and signal it to the caller with null.
                XLogger.Error(x);
                return null;
            }
        }
        //
        /// <summary>
        /// Counts the vowels found in the text. The text is split with the ICU
        /// character break iterator ("km-KH" locale) and intersected with
        /// <c>defs.Vowels</c>.
        /// </summary>
        /// <param name="text">Text to analyse.</param>
        /// <returns>
        /// The number of DISTINCT character segments that also appear in
        /// <c>defs.Vowels</c>; a vowel that occurs several times is counted once
        /// because <c>Intersect</c> is a set operation.
        /// </returns>
        public static int vowelCount(string text)
        {
            Icu.Wrapper.Init();
            try
            {
                var chars = BreakIterator.Split(BreakIterator.UBreakIteratorType.CHARACTER, "km-KH", text);

                // NOTE(review): confirm a distinct count (not total occurrences) is intended.
                return chars.Intersect(defs.Vowels.ToList()).Count();
            }
            finally
            {
                // Run the cleanup even when segmentation or the lookup throws.
                Icu.Wrapper.Cleanup();
            }
        }
        //
        /// <summary>
        /// Counts the segments the ICU word break iterator ("km-KH" locale) produces
        /// for the given text.
        /// </summary>
        /// <param name="txt">Text to segment.</param>
        /// <returns>The number of segments returned by the word break iterator.</returns>
        public static int wordCount(string txt)
        {
            Icu.Wrapper.Init();
            try
            {
                return BreakIterator.Split(BreakIterator.UBreakIteratorType.WORD, "km-KH", txt).Count();
            }
            finally
            {
                // Run the cleanup even when segmentation throws.
                Icu.Wrapper.Cleanup();
            }
        }
        /// <summary>
        /// Inserts a zero width space ("\u200B") between the segments that the ICU
        /// word break iterator ("km-KH" locale) finds in the text.
        /// Example: word1word2word3 becomes word1"\u200B"word2"\u200B"word3.
        /// </summary>
        /// <param name="txt">Text to segment.</param>
        /// <returns>The segments joined with "\u200B".</returns>
        public static String addZWSP(string txt)
        {
            Icu.Wrapper.Init();
            try
            {
                var words = BreakIterator.Split(BreakIterator.UBreakIteratorType.WORD, "km-KH", txt);

                return String.Join("\u200B", words);
            }
            finally
            {
                // Run the cleanup even when segmentation throws.
                Icu.Wrapper.Cleanup();
            }
        }
        /// <summary>
        /// Sentence-level splitting of CJK text with the "zh-HK" locale: sentences
        /// are expected to end after "? " and after each "。".
        /// </summary>
        public void Split_Sentence()
        {
            var expected = new[] {
                "供重呼車遊踏持図質腰大野明会掲歌? ",
                "方図強候準素能物第毎止田作昼野集。",
                "霊一起続時筑腺算掲断詳山住死示流投。"
            };

            // Materialize once so the sequence is not enumerated separately per assertion.
            var parts = BreakIterator.Split(BreakIterator.UBreakIteratorType.SENTENCE, "zh-HK", "供重呼車遊踏持図質腰大野明会掲歌? 方図強候準素能物第毎止田作昼野集。霊一起続時筑腺算掲断詳山住死示流投。").ToArray();

            Assert.That(parts.Length, Is.EqualTo(expected.Length));
            // Order-sensitive comparison: segments must come back in text order.
            Assert.That(parts, Is.EqualTo(expected));
        }
Example #14
0
        /// <summary>
        /// Creates a <see cref="StringSearcher{T}"/> of the given search type whose
        /// sort keys and word tokens are obtained through the supplied writing
        /// system manager.
        /// </summary>
        /// <param name="type">The search type.</param>
        /// <param name="wsManager">The writing system store; must not be null.</param>
        /// <exception cref="ArgumentNullException"><paramref name="wsManager"/> is null.</exception>
        public StringSearcher(SearchType type, WritingSystemManager wsManager)
        {
            if (wsManager == null)
            {
                throw new ArgumentNullException("wsManager");
            }

            m_type = type;

            // Sort key bytes come from the default collation of the writing system.
            m_sortKeySelector = (ws, text) =>
            {
                var collator = wsManager.Get(ws).DefaultCollation.Collator;
                return collator.GetSortKey(text).KeyData;
            };

            // Tokens are the word-break segments for the writing system's ICU locale.
            m_tokenizer = (ws, text) =>
            {
                var icuLocale = wsManager.Get(ws).IcuLocale;
                return BreakIterator.Split(BreakIterator.UBreakIteratorType.WORD, icuLocale, text);
            };
        }
        /// <summary>
        /// Word-level splitting of Chinese text with the "zh-HK" locale. The expected
        /// result contains only the words, without the "、" and "。" punctuation.
        /// Ignored when the loaded ICU is older than 52.1.
        /// </summary>
        public void Split_Word()
        {
            // Compare versions numerically: an ordinal string comparison would sort
            // "100.1" before "52.1". Fall back to the ordinal comparison only when the
            // version string cannot be parsed.
            System.Version icuVersion;
            bool icuTooOld = System.Version.TryParse(Wrapper.IcuVersion, out icuVersion)
                ? icuVersion < new System.Version(52, 1)
                : string.CompareOrdinal(Wrapper.IcuVersion, "52.1") < 0;

            if (icuTooOld)
            {
                Assert.Ignore("This test requires ICU 52 or higher");
            }

            var expected = new[] { "今晚", "我會", "睡著", "一隻", "狗" };

            // Materialize once so the sequence is not enumerated separately per assertion.
            var parts = BreakIterator.Split(BreakIterator.UBreakIteratorType.WORD, "zh-HK", "今晚、我會睡著。一隻狗").ToArray();

            Assert.That(parts.Length, Is.EqualTo(expected.Length));
            // Order-sensitive comparison: segments must come back in text order.
            Assert.That(parts, Is.EqualTo(expected));
        }
Example #16
0
 /// <summary>
 /// Splits <paramref name="text"/> with the requested break iterator type, always
 /// using the "en-US" locale.
 /// </summary>
 /// <param name="type">Kind of boundary to split on.</param>
 /// <param name="text">Text to split.</param>
 /// <returns>The segments returned by <c>BreakIterator.Split</c>.</returns>
 public IEnumerable <string> Split(BreakIterator.UBreakIteratorType type, string text)
 {
     const string locale = "en-US";

     var segments = BreakIterator.Split(type, locale, text);
     return segments;
 }