Exemple #1
0
    /// <summary>
    /// Returns the <see cref="BreakIterator"/> cached in the static <c>instance</c> field,
    /// constructing it the first time it is requested.
    /// </summary>
    /// <returns>The cached <see cref="BreakIterator"/>.</returns>
    public static BreakIterator GetWordInstance()
    {
        // Null-coalescing form of the lazy initialisation: assign only when nothing is cached yet.
        return instance ?? (instance = new BreakIterator());
    }
Exemple #2
0
 /// <summary>
 /// Creates a new ThaiTokenizer that reads from <paramref name="reader"/> and takes its
 /// attributes from <paramref name="factory"/>. The base class receives a clone of
 /// <c>sentenceProto</c>.
 /// </summary>
 /// <exception cref="System.NotSupportedException">Thrown when <c>DBBI_AVAILABLE</c> is false.</exception>
 public ThaiTokenizer(AttributeFactory factory, Reader reader)
     : base(factory, reader, (BreakIterator)sentenceProto.clone())
 {
     // Refuse to construct a tokenizer when the availability flag is not set.
     if (DBBI_AVAILABLE == false)
     {
         throw new System.NotSupportedException("This JRE does not have support for Thai segmentation");
     }

     // Work on a private clone of the word-break prototype rather than the prototype itself.
     BreakIterator wordClone = (BreakIterator)proto.clone();
     wordBreaker = wordClone;
 }
Exemple #3
0
 /// <summary>
 /// Obtains one break iterator of each kind (character, word, line, sentence, title)
 /// from the <see cref="BreakIterator"/> factory methods and stores each in its matching member.
 /// </summary>
 public void Init()
 {
     characterBreak = BreakIterator.GetCharacterInstance();
     wordBreak = BreakIterator.GetWordInstance();
     lineBreak = BreakIterator.GetLineInstance();

     sentenceBreak = BreakIterator.GetSentenceInstance();

     titleBreak = BreakIterator.GetTitleInstance();
 }
        //
        /// <summary>
        /// Splits <paramref name="txt"/> into words using the ICU word break iterator for the
        /// "km-KH" locale and returns the words one per line.
        /// </summary>
        /// <param name="txt">Text to segment.</param>
        /// <returns>The words joined with <see cref="Environment.NewLine"/>.</returns>
        public static String wordList(string txt)
        {
            Icu.Wrapper.Init();
            try
            {
                // Materialise the segments while ICU is still initialised.
                var words = BreakIterator.Split(BreakIterator.UBreakIteratorType.WORD, "km-KH", txt).ToList();

                // Return the joined word list, not the unmodified input.
                return String.Join(Environment.NewLine, words);
            }
            finally
            {
                // Always pair Init with Cleanup, even when segmentation throws.
                Icu.Wrapper.Cleanup();
            }
        }
Exemple #5
0
        /// <summary>
        /// Reads <paramref name="inputFile"/> and computes text statistics for its contents using
        /// ICU break iteration with the "km-KH" locale, plus the character definitions loaded from
        /// "Definitions.json" located next to the executing assembly.
        /// </summary>
        /// <param name="inputFile">Path of the text file to analyse.</param>
        /// <returns>
        /// A populated <see cref="Statistics"/>, or <c>null</c> when any exception is thrown
        /// (the exception is handed to <c>XLogger.Error</c>).
        /// </returns>
        private static Statistics WorkWithDocument(string inputFile)
        {
            try
            {
                var contents = File.ReadAllText(inputFile);
                var defsFile = Path.Combine(Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location), "Definitions.json");

                var words = BreakIterator.Split(BreakIterator.UBreakIteratorType.WORD, "km-KH", contents).ToList();
                var chars = BreakIterator.Split(BreakIterator.UBreakIteratorType.CHARACTER, "km-KH", contents).ToList();

                // Sentences are delimited by the literal "។" rather than by the ICU sentence iterator.
                var sentences = contents.Split(new string[] { "។" }, StringSplitOptions.None).ToList();
                var longestSentence = sentences.OrderByDescending(s => s.Length).FirstOrDefault();
                var longestSentenceWordsAPI = BreakIterator.Split(BreakIterator.UBreakIteratorType.WORD, "km-KH", longestSentence).ToList();

                var longestWord = words.OrderByDescending(s => s.Length).FirstOrDefault();
                var longestWordChars = BreakIterator.Split(BreakIterator.UBreakIteratorType.CHARACTER, "km-KH", longestWord).ToList();

                var defs = JsonConvert.DeserializeObject <Definitions>(File.ReadAllText(defsFile));

                return(new Statistics()
                {
                    Sentences = sentences.Count.ToString(),
                    Words = words.Count.ToString(),

                    // Intersect yields distinct elements, so these are counts of distinct
                    // consonants/vowels present in the text, not of total occurrences.
                    Consonants = chars.Intersect(defs.Consonants.ToList()).Count().ToString(),
                    Vowels = chars.Intersect(defs.Vowels.ToList()).Count().ToString(),

                    LongestSentence = longestSentence,
                    LongestSentenceWords = longestSentenceWordsAPI.Count.ToString(),

                    LongestWord = longestWord,
                    LongestWordChars = longestWordChars.Count.ToString(),

                    WordList = String.Join(Environment.NewLine, words),
                    AddingZWSP = String.Join("\u200B", words),
                });
            }
            catch (Exception x)
            {
                // Best effort: report the failure and signal it to the caller with null.
                XLogger.Error(x);
                return(null);
            }
        }
Exemple #6
0
 /// <summary>
 /// Creates a new ThaiTokenizer that reads from <paramref name="reader"/> and takes its
 /// attributes from <paramref name="factory"/>. Sentence and word iterators are both created
 /// for the locale returned by <c>Locale.GetUS()</c>.
 /// </summary>
 /// <exception cref="System.NotSupportedException">Thrown when <c>DBBI_AVAILABLE</c> is false.</exception>
 public ThaiTokenizer(AttributeFactory factory, TextReader reader)
     : base(factory, reader, BreakIterator.CreateSentenceInstance(Locale.GetUS()))
 {
     // Refuse to construct a tokenizer when the availability flag is not set.
     if (DBBI_AVAILABLE == false)
     {
         throw new System.NotSupportedException("This JRE does not have support for Thai segmentation");
     }

     BreakIterator wordIterator = BreakIterator.CreateWordInstance(Locale.GetUS());
     wordBreaker = new ThaiWordBreaker(wordIterator);

     termAtt = AddAttribute<ICharTermAttribute>();
     offsetAtt = AddAttribute<IOffsetAttribute>();
 }
 /// <summary>
 /// Chooses the wrapper matching the runtime type of <paramref name="breakIterator"/>:
 /// a <see cref="RuleBasedBreakIterator"/> gets an <c>RBBIWrapper</c> (so its rule status can
 /// serve as the token type); anything else gets the generic <c>BIWrapper</c>.
 /// </summary>
 /// <param name="breakIterator">The iterator to wrap.</param>
 /// <returns>The wrapper created for <paramref name="breakIterator"/>.</returns>
 public static BreakIteratorWrapper Wrap(BreakIterator breakIterator)
 {
     var ruleBased = breakIterator as RuleBasedBreakIterator;
     if (ruleBased == null)
     {
         // Not rule based: the rule-status method is unavailable, so use the generic wrapper.
         return new BIWrapper(breakIterator);
     }

     return new RBBIWrapper(ruleBased);
 }
 /// <summary>
 /// Sets <paramref name="text"/> on <paramref name="filteredBI"/> and asserts that the first
 /// five calls to <c>Next()</c> return 20, 84, 90, 181 and 278, then rewinds the iterator
 /// with <c>First()</c>.
 /// </summary>
 /// <param name="filteredBI">The break iterator under test.</param>
 /// <param name="text">The text to iterate over.</param>
 private void assertDefaultBreakBehavior(BreakIterator filteredBI, String text)
 {
     Logln("Testing Default Behavior:");
     filteredBI.SetText(text);

     // Each step carries its own label so a failure identifies which boundary was wrong.
     assertEquals("1st next", 20, filteredBI.Next());
     assertEquals("2nd next", 84, filteredBI.Next());
     assertEquals("3rd next", 90, filteredBI.Next());
     assertEquals("4th next", 181, filteredBI.Next());
     assertEquals("5th next", 278, filteredBI.Next());

     // Leave the iterator positioned at the start.
     filteredBI.First();
 }
        //
        /// <summary>
        /// Counts the word segments ICU produces for <paramref name="txt"/> with the "km-KH" locale.
        /// </summary>
        /// <param name="txt">Text to segment.</param>
        /// <returns>The number of segments returned by the word break iterator.</returns>
        public static int wordCount(string txt)
        {
            Icu.Wrapper.Init();
            try
            {
                return BreakIterator.Split(BreakIterator.UBreakIteratorType.WORD, "km-KH", txt).Count();
            }
            finally
            {
                // Always pair Init with Cleanup, even when segmentation throws.
                Icu.Wrapper.Cleanup();
            }
        }
        //
        /// <summary>
        /// Counts the distinct character segments of <paramref name="text"/> (ICU CHARACTER
        /// break type, "km-KH" locale) that also appear in <c>defs.Vowels</c>.
        /// </summary>
        /// <param name="text">Text to inspect.</param>
        /// <returns>
        /// The number of distinct matching segments; repeated occurrences of the same vowel are
        /// counted once because <c>Intersect</c> produces a set.
        /// </returns>
        public static int vowelCount(string text)
        {
            Icu.Wrapper.Init();
            try
            {
                var chars = BreakIterator.Split(BreakIterator.UBreakIteratorType.CHARACTER, "km-KH", text).ToList();

                return chars.Intersect(defs.Vowels.ToList()).Count();
            }
            finally
            {
                // Always pair Init with Cleanup, even when segmentation throws.
                Icu.Wrapper.Cleanup();
            }
        }
        /// <summary>
        /// Inserts a zero width space ("\u200B") between the words ICU finds in
        /// <paramref name="txt"/> using the "km-KH" locale.
        /// Example: <c>wordswordswords</c> becomes <c>words\u200Bwords\u200Bwords</c>.
        /// </summary>
        /// <param name="txt">Text to segment.</param>
        /// <returns>The word segments joined with "\u200B".</returns>
        public static String addZWSP(string txt)
        {
            Icu.Wrapper.Init();
            try
            {
                // Materialise the segments while ICU is still initialised.
                var words = BreakIterator.Split(BreakIterator.UBreakIteratorType.WORD, "km-KH", txt).ToList();

                return String.Join("\u200B", words);
            }
            finally
            {
                // Always pair Init with Cleanup, even when segmentation throws.
                Icu.Wrapper.Cleanup();
            }
        }
Exemple #12
0
        /// <summary>
        /// Click handler: splits the text of <c>uxText</c> into words with a US-locale word
        /// iterator and shows the pieces joined by "-" in a message box.
        /// </summary>
        private void UxBreakClick(object sender, EventArgs e)
        {
            using (BreakIterator wordIterator = BreakIterator.CreateWordInstance(Locale.GetUS()))
            {
                wordIterator.SetText(this.uxText.Text);

                string[] pieces = wordIterator.Enumerate().ToArray();
                string joined = string.Join("-", pieces);

                // The dialog is shown while the iterator is still alive, as before.
                MessageBox.Show(joined);
            }
        }
        /// <summary>
        /// Checks that sentence splitting with the "zh-HK" locale yields the three expected sentences.
        /// </summary>
        public void Split_Sentence()
        {
            string[] expected =
            {
                "供重呼車遊踏持図質腰大野明会掲歌? ",
                "方図強候準素能物第毎止田作昼野集。",
                "霊一起続時筑腺算掲断詳山住死示流投。"
            };

            var parts = BreakIterator.Split(
                BreakIterator.UBreakIteratorType.SENTENCE,
                "zh-HK",
                "供重呼車遊踏持図質腰大野明会掲歌? 方図強候準素能物第毎止田作昼野集。霊一起続時筑腺算掲断詳山住死示流投。");

            Assert.That(parts.Count(), Is.EqualTo(expected.Length));
            Assert.That(parts.ToArray(), Is.EquivalentTo(expected));
        }
Exemple #14
0
 /// <summary>
 /// Creates the invariant-culture sentence <see cref="BreakIterator"/> while holding
 /// <c>syncLock</c>.
 /// </summary>
 /// <returns>The newly obtained sentence iterator.</returns>
 private static BreakIterator LoadSentenceProto()
 {
     UninterruptableMonitor.Enter(syncLock);
     try
     {
         BreakIterator sentenceIterator = BreakIterator.GetSentenceInstance(CultureInfo.InvariantCulture);
         return sentenceIterator;
     }
     finally
     {
         // Release the monitor on every exit path.
         UninterruptableMonitor.Exit(syncLock);
     }
 }
Exemple #15
0
        /// <summary>
        /// Creates a break iterator of the given <paramref name="kind"/> for
        /// <paramref name="locale"/>. When the service is in its default state the iterator is
        /// built directly; otherwise it is looked up through the service and tagged with the
        /// culture the service resolved.
        /// </summary>
        /// <param name="locale">Requested culture.</param>
        /// <param name="kind">Break iterator kind.</param>
        /// <returns>The break iterator.</returns>
        public override BreakIterator CreateBreakIterator(UCultureInfo locale, int kind)
        {
            // TODO: convert to UCultureInfo when service switches over
            if (!service.IsDefault)
            {
                UCultureInfo resolved;
                var created = (BreakIterator)service.Get(locale, kind, out resolved);

                // services make no distinction between actual & valid
                created.SetCulture(resolved, resolved);
                return created;
            }

            return CreateBreakInstance(locale, kind);
        }
Exemple #16
0
 /// <summary>
 /// Creates the word <see cref="BreakIterator"/> for the "th" culture while holding
 /// <c>syncLock</c>.
 /// </summary>
 /// <returns>The newly obtained word iterator.</returns>
 private static BreakIterator LoadProto()
 {
     UninterruptableMonitor.Enter(syncLock);
     try
     {
         var thai = new CultureInfo("th");
         return BreakIterator.GetWordInstance(thai);
     }
     finally
     {
         // Release the monitor on every exit path.
         UninterruptableMonitor.Exit(syncLock);
     }
 }
        /// <summary>
        /// Wraps a Japanese sentence iterator with the filter builder for the same locale and
        /// checks the boundaries reported for a short mixed Latin/Japanese sentence.
        /// </summary>
        public void TestFilteredJapanese()
        {
            ULocale japanese = ULocale.JAPANESE;
            var builder = FilteredBreakIteratorBuilder.GetInstance(japanese);
            BreakIterator filtered = builder.WrapIteratorWithFilter(BreakIterator.GetSentenceInstance(japanese));

            filtered.SetText("OKです。");

            assertEquals("Starting point", 0, filtered.Current);
            assertEquals("Next point", 5, filtered.Next());
            assertEquals("Last point", BreakIterator.Done, filtered.Next());
        }
        /// <summary>
        /// Checks that character boundaries for "abc? 1" (en-US) are one <c>Boundary</c> per
        /// UTF-16 position.
        /// </summary>
        public void GetBoundaries_Character()
        {
            const string text = "abc? 1";
            Boundary[] expected =
            {
                new Boundary(0, 1),
                new Boundary(1, 2),
                new Boundary(2, 3),
                new Boundary(3, 4),
                new Boundary(4, 5),
                new Boundary(5, 6)
            };

            var locale = new Locale("en-US");
            var parts = BreakIterator.GetBoundaries(BreakIterator.UBreakIteratorType.CHARACTER, locale, text);

            Assert.That(parts.Count(), Is.EqualTo(expected.Length));
            Assert.That(parts.ToArray(), Is.EquivalentTo(expected));
        }
        /// <summary>
        /// Checks that setting a <c>null</c> text on a character iterator (de-DE) throws
        /// <see cref="ArgumentNullException"/>.
        /// </summary>
        public void BreakIteratorThatIsNull()
        {
            var german = new Locale("de-DE");

            using (var characterIterator = BreakIterator.CreateCharacterInstance(german))
            {
                Assert.Throws<ArgumentNullException>(() => characterIterator.SetText(null));
            }
        }
        /// <summary>
        /// Checks the sentence boundaries reported for a short en-US sample containing
        /// a decimal number and several terminators.
        /// </summary>
        public void GetBoundaries_Sentence()
        {
            const string text = "Aa bb. Ccdef 3.5 x? Y?x! Z";
            Boundary[] expected =
            {
                new Boundary(0, 7),
                new Boundary(7, 20),
                new Boundary(20, 22),
                new Boundary(22, 25),
                new Boundary(25, 26)
            };

            var locale = new Locale("en-US");
            var parts = BreakIterator.GetBoundaries(BreakIterator.UBreakIteratorType.SENTENCE, locale, text);

            Assert.That(parts.Count(), Is.EqualTo(expected.Length));
            Assert.That(parts.ToArray(), Is.EquivalentTo(expected));
        }
Exemple #21
0
        /// <summary>
        /// Initializes a new instance of the <see cref="StringSearcher{T}"/> class, wiring the
        /// sort-key selector and the tokenizer to the given writing system manager.
        /// </summary>
        /// <param name="type">The search type.</param>
        /// <param name="wsManager">The writing system store; must not be <c>null</c>.</param>
        /// <exception cref="ArgumentNullException"><paramref name="wsManager"/> is <c>null</c>.</exception>
        public StringSearcher(SearchType type, WritingSystemManager wsManager)
        {
            if (wsManager == null)
            {
                throw new ArgumentNullException("wsManager");
            }

            m_type = type;

            // Sort keys come from the default collation of the writing system.
            m_sortKeySelector = (ws, text) =>
            {
                var collator = wsManager.Get(ws).DefaultCollation.Collator;
                return collator.GetSortKey(text).KeyData;
            };

            // Tokens are the ICU word segments for the writing system's ICU locale.
            m_tokenizer = (ws, text) =>
            {
                var icuLocale = wsManager.Get(ws).IcuLocale;
                return BreakIterator.Split(BreakIterator.UBreakIteratorType.WORD, icuLocale, text);
            };
        }
Exemple #22
0
            /// <summary>
            /// Creates the break iterator selected by <c>type</c> for <c>locale</c>, walks
            /// <c>text</c> forwards collecting up to <c>maxOffsetCount</c> boundaries, and compares
            /// them via <c>offsetsMatchExpected</c>. When the forward pass matches, the same
            /// boundaries are verified in reverse with <c>Previous()</c>. Mismatches are reported
            /// through <c>Errln</c>.
            /// </summary>
            public void doTest()
            {
                BreakIterator brkIter;
                if (type == BreakIterator.KIND_CHARACTER)
                {
                    brkIter = BreakIterator.GetCharacterInstance(locale);
                }
                else if (type == BreakIterator.KIND_WORD)
                {
                    brkIter = BreakIterator.GetWordInstance(locale);
                }
                else if (type == BreakIterator.KIND_LINE)
                {
                    brkIter = BreakIterator.GetLineInstance(locale);
                }
                else if (type == BreakIterator.KIND_SENTENCE)
                {
                    brkIter = BreakIterator.GetSentenceInstance(locale);
                }
                else
                {
                    Errln("Unsupported break iterator type " + type);
                    return;
                }

                // Shared first half of both error messages; built only when an error is reported.
                string DescribeExpectation()
                {
                    string textToDisplay = (text.Length <= 16) ? text : text.Substring(0, 16);
                    return "For type " + type + " " + locale + ", text \"" + textToDisplay + "...\"" +
                           "; expect " + expectOffsets.Length + " offsets:" + formatOffsets(expectOffsets, expectOffsets.Length);
                }

                brkIter.SetText(text);
                int[] foundOffsets = new int[maxOffsetCount];
                int foundCount = 0;

                // Forward pass: record boundaries until the buffer is full or the iterator is done.
                while (foundCount < maxOffsetCount)
                {
                    int boundary = brkIter.Next();
                    if (boundary == BreakIterator.Done)
                    {
                        break;
                    }
                    foundOffsets[foundCount] = boundary;
                    foundCount++;
                }

                if (!offsetsMatchExpected(foundOffsets, foundCount))
                {
                    Errln(DescribeExpectation() +
                          "; found " + foundCount + " offsets fwd:" + formatOffsets(foundOffsets, foundCount));
                    return;
                }

                // Backward pass: start one slot before the last recorded boundary and walk
                // down to index 0, expecting Previous() to retrace the forward results.
                for (int index = foundCount - 2; index >= 0; index--)
                {
                    int boundary = brkIter.Previous();
                    if (boundary != foundOffsets[index])
                    {
                        Errln(DescribeExpectation() +
                              "; found rev offset " + boundary + " where expect " + foundOffsets[index]);
                        break;
                    }
                }
            }
        /// <summary>
        /// Checks word splitting of a Chinese sample with the "zh-HK" locale. The test is
        /// ignored when the ordinal comparison of <c>Wrapper.IcuVersion</c> against "52.1"
        /// is negative.
        /// </summary>
        public void Split_Word()
        {
            bool icuTooOld = string.CompareOrdinal(Wrapper.IcuVersion, "52.1") < 0;
            if (icuTooOld)
            {
                Assert.Ignore("This test requires ICU 52 or higher");
            }

            string[] expected = { "今晚", "我會", "睡著", "一隻", "狗" };
            var parts = BreakIterator.Split(BreakIterator.UBreakIteratorType.WORD, "zh-HK", "今晚、我會睡著。一隻狗");

            Assert.That(parts.Count(), Is.EqualTo(expected.Length));
            Assert.That(parts.ToArray(), Is.EquivalentTo(expected));
        }
 /// <summary>
 /// Asserts the behaviour expected from <paramref name="bi"/> when its only boundary is
 /// offset 0: <c>First()</c> and <c>Last()</c> both return 0 and every attempt to move
 /// returns <c>BreakIterator.Done</c>.
 /// </summary>
 /// <param name="bi">The iterator to check; the calls below move it, so their order matters.</param>
 private void Test0Sentences(BreakIterator bi)
 {
     // Starting position and both ends are offset 0.
     assertEquals(0, bi.Current);
     assertEquals(0, bi.First());
     assertEquals(BreakIterator.Done, bi.Next());
     assertEquals(0, bi.Last());
     assertEquals(BreakIterator.Done, bi.Previous());
     // Lookups relative to offset 0 find nothing in either direction.
     assertEquals(BreakIterator.Done, bi.Following(0));
     assertEquals(BreakIterator.Done, bi.Preceding(0));
     // Rewind, then confirm multi-step moves (forward and backward) also report Done.
     assertEquals(0, bi.First());
     assertEquals(BreakIterator.Done, bi.Next(13));
     assertEquals(BreakIterator.Done, bi.Next(-8));
 }
        /// <summary>
        /// Feeds 10000 random Unicode strings through a word <see cref="BreakIterator"/> and a
        /// word <c>CharArrayIterator</c> via <c>Consume</c>.
        /// </summary>
        public virtual void TestConsumeWordInstance()
        {
            // The word iterator is created for the locale returned by Locale.GetUS().
            var wordIterator = BreakIterator.CreateWordInstance(Locale.GetUS());
            var charIterator = CharArrayIterator.NewWordInstance();

            int remaining = 10000;
            while (remaining-- > 0)
            {
                var text = TestUtil.RandomUnicodeString(Random()).toCharArray();
                charIterator.SetText(text, 0, text.Length);
                Consume(wordIterator, charIterator);
            }
        }
        /// <summary>
        /// Feeds 10000 random Unicode strings through a default-locale sentence
        /// <see cref="BreakIterator"/> and a sentence <c>CharArrayIterator</c> via <c>consume</c>.
        /// </summary>
        public virtual void testConsumeSentenceInstance()
        {
            // we use the default locale, as its randomized by LuceneTestCase
            BreakIterator sentenceIterator = BreakIterator.getSentenceInstance(Locale.Default);
            CharArrayIterator charIterator = CharArrayIterator.newSentenceInstance();

            int remaining = 10000;
            while (remaining-- > 0)
            {
                char[] text = TestUtil.randomUnicodeString(random()).toCharArray();
                charIterator.setText(text, 0, text.Length);
                consume(sentenceIterator, charIterator);
            }
        }
Exemple #27
0
        /// <summary>
        /// Creates a break iterator of the given <paramref name="kind"/> for
        /// <paramref name="locale"/>. When the service is in its default state the iterator is
        /// built directly; otherwise it is looked up through the service and tagged with the
        /// locale the service resolved.
        /// </summary>
        /// <param name="locale">Requested locale.</param>
        /// <param name="kind">Break iterator kind.</param>
        /// <returns>The break iterator.</returns>
        public override BreakIterator CreateBreakIterator(ULocale locale, int kind)
        {
            // TODO: convert to ULocale when service switches over
            if (!service.IsDefault)
            {
                // Single-element array receives the locale the service actually used.
                var resolved = new ULocale[1];
                var created = (BreakIterator)service.Get(locale, kind, resolved);

                // services make no distinction between actual & valid
                created.SetLocale(resolved[0], resolved[0]);
                return created;
            }

            return CreateBreakInstance(locale, kind);
        }
Exemple #28
0
        /// <summary>
        /// Lazily yields the substrings of the iterator's text that lie between consecutive
        /// boundaries, starting from <c>First()</c> and stopping when <c>Next()</c> returns
        /// <c>BreakIterator.DONE</c>.
        /// </summary>
        /// <param name="bi">The break iterator to walk; it is advanced as the sequence is enumerated.</param>
        /// <returns>One string per segment, in text order.</returns>
        public static IEnumerable <string> Enumerate(this BreakIterator bi)
        {
            string text = bi.GetCLRText();
            int    start = bi.First(), end = bi.Next();

            while (end != BreakIterator.DONE)
            {
                yield return(text.Substring(start, end - start));

                // The end of this segment is the start of the next one.
                start = end;
                end   = bi.Next();
            }
        }
Exemple #29
0
 /// <summary>
 /// Set the <see cref="Text.BreakIterator"/> that will be used to restrict the points
 /// at which matches are detected.
 /// </summary>
 /// <param name="breakIterator">
 /// A <see cref="Text.BreakIterator"/> used to restrict the points at which matches are
 /// detected. A match whose start or end index is not a boundary according to this
 /// iterator is rejected and the search continues. Pass <c>null</c> to disable break
 /// detection.
 /// </param>
 /// <seealso cref="Text.BreakIterator"/>
 /// <stable>ICU 2.0</stable>
 public virtual void SetBreakIterator(BreakIterator breakIterator)
 {
     search_.BreakIterator = breakIterator;

     bool hasIterator = search_.BreakIterator != null;
     if (hasIterator && search_.Text != null)
     {
         // Give the iterator a clone of the CharacterIterator so that it does not
         // disturb the position currently held by the search text.
         search_.BreakIterator.SetText((CharacterIterator)search_.Text.Clone());
     }
 }
Exemple #30
0
        /// <summary>
        /// Feeds 10000 random Unicode strings through a current-culture sentence
        /// <see cref="BreakIterator"/> and a sentence <c>CharArrayIterator</c> via <c>Consume</c>.
        /// </summary>
        public virtual void TestConsumeSentenceInstance()
        {
            // we use the default locale, as its randomized by LuceneTestCase
            BreakIterator sentenceIterator = BreakIterator.GetSentenceInstance(CultureInfo.CurrentCulture);
            var charIterator = CharArrayIterator.NewSentenceInstance();

            int remaining = 10000;
            while (remaining-- > 0)
            {
                var text = TestUtil.RandomUnicodeString(Random()).toCharArray();
                charIterator.SetText(text, 0, text.Length);
                Consume(sentenceIterator, charIterator);
            }
        }
Exemple #31
0
        /// <summary>
        /// Lazily yields one <c>Token</c> per segment of the iterator's text, carrying the
        /// segment's start, end, substring and the rule status reported after reaching its end.
        /// </summary>
        /// <param name="bi">The break iterator to walk; it is advanced as the sequence is enumerated.</param>
        /// <returns>The tokens in text order.</returns>
        public static IEnumerable <Token> EnumerateTokens(this BreakIterator bi)
        {
            string source = bi.GetCLRText();
            int segmentStart = bi.First();

            for (int segmentEnd = bi.Next(); segmentEnd != BreakIterator.DONE; segmentEnd = bi.Next())
            {
                yield return new Token(segmentStart, segmentEnd, source.Substring(segmentStart, segmentEnd - segmentStart), bi.GetRuleStatus());

                // The end of this segment is the start of the next one.
                segmentStart = segmentEnd;
            }
        }
        /// <summary>
        /// Checks that replacing an iterator's text with <c>null</c> after a valid text has
        /// been set throws <see cref="ArgumentNullException"/>.
        /// </summary>
        public void CanSetNewText_Null()
        {
            const string firstText = "Good-day, kind sir !  Can I have a glass of water?  I am very parched.";
            string replacement = null;
            var english = new Locale("en-US");

            using (var characterIterator = BreakIterator.CreateCharacterInstance(english))
            {
                characterIterator.SetText(firstText);

                Assert.Throws<ArgumentNullException>(() =>
                {
                    characterIterator.SetText(replacement);
                });
            }
        }
 /// <summary>
 /// Creates a ThaiWordBreaker around the given <see cref="BreakIterator"/>.
 /// </summary>
 /// <param name="wordBreaker">The iterator to store; must not be <c>null</c>.</param>
 /// <exception cref="ArgumentNullException"><paramref name="wordBreaker"/> is <c>null</c>.</exception>
 public ThaiWordBreaker(BreakIterator wordBreaker)
 {
     // Validate and store in one step.
     this.wordBreaker = wordBreaker ?? throw new ArgumentNullException("wordBreaker");
 }
 /// <summary>
 /// Construct a new SegmenterBase with an explicit <see cref="AttributeFactory"/>.
 /// Registers the offset attribute and stores <paramref name="iterator"/>.
 /// </summary>
 /// <param name="factory">Attribute factory forwarded to the base constructor.</param>
 /// <param name="reader">Input forwarded to the base constructor.</param>
 /// <param name="iterator">The <see cref="BreakIterator"/> kept by this tokenizer.</param>
 protected SegmentingTokenizerBase(
     AttributeFactory factory,
     TextReader reader,
     BreakIterator iterator)
     : base(factory, reader)
 {
     this.offsetAtt = AddAttribute<IOffsetAttribute>();
     this.iterator = iterator;
 }
 /// <summary>
 /// Sets the string form of <paramref name="ci"/> on <paramref name="bi"/> and advances the
 /// iterator until it reports <c>BreakIterator.DONE</c>.
 /// </summary>
 private void Consume(BreakIterator bi, CharacterIterator ci)
 {
     bi.SetText(ci.toString());

     // Walk every boundary; only running to completion without error matters here.
     int boundary;
     do
     {
         boundary = bi.Next();
     }
     while (boundary != BreakIterator.DONE);
 }
	  /// <summary>
	  /// Construct a new SegmenterBase that segments sentences with the supplied
	  /// BreakIterator and uses <c>AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY</c>.
	  /// <para>
	  /// Never share a BreakIterator between different TokenStreams; always pass a
	  /// newly created or cloned instance to this constructor.
	  /// </para>
	  /// </summary>
	  public SegmentingTokenizerBase(Reader reader, BreakIterator iterator)
		  : this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, reader, iterator)
	  {
		// All work is done by the three-argument constructor.
	  }
	  /// <summary>
	  /// Construct a new SegmenterBase with an explicit AttributeFactory; the supplied
	  /// BreakIterator is stored in <c>iterator</c>.
	  /// </summary>
	  public SegmentingTokenizerBase(AttributeFactory factory, Reader reader, BreakIterator iterator)
		  : base(factory, reader)
	  {
		// Keep the caller-supplied iterator for later use by this tokenizer.
		this.iterator = iterator;
	  }
	  /// <summary>
	  /// Assigns <paramref name="ci"/> as the text of <paramref name="bi"/> and advances the
	  /// iterator until it reports <c>BreakIterator.DONE</c>.
	  /// </summary>
	  private void consume(BreakIterator bi, CharacterIterator ci)
	  {
		bi.Text = ci;
		for (var boundary = bi.next(); boundary != BreakIterator.DONE; boundary = bi.next())
		{
		  // Nothing to do per boundary; only running to completion matters here.
		}
	  }