/// <summary>
        /// Splits <paramref name="text"/> along the boundaries of the requested iterator type,
        /// using the native break iterator directly. For <c>UBreakIteratorType.WORD</c> the
        /// segments rejected by <c>AddToken</c> (spaces and punctuation) are not returned.
        /// </summary>
        /// <param name="type">The kind of boundary to split on.</param>
        /// <param name="locale">The locale name handed to the native iterator.</param>
        /// <param name="text">The text to split; <c>null</c> or empty yields no tokens.</param>
        /// <returns>The tokens, in the order they occur in <paramref name="text"/>.</returns>
        /// <exception cref="Exception">The native iterator reported a code other than
        /// <c>ErrorCode.NoErrors</c> when it was opened.</exception>
        public static IEnumerable <string> Split(UBreakIteratorType type, string locale, string text)
        {
            if (string.IsNullOrEmpty(text))
            {
                return(new string[] { });
            }

            ErrorCode err;
            IntPtr    bi = NativeMethods.ubrk_open(type, locale, text, text.Length, out err);

            // Everything after a successful P/Invoke open runs under try/finally so the
            // native iterator is released on every path, including the throw below.
            try
            {
                // NOTE(review): any code other than NoErrors is treated as fatal here, which
                // presumably includes ICU warning codes -- confirm that this is intended.
                if (err != ErrorCode.NoErrors)
                {
                    throw new Exception("BreakIterator.Split() failed with code " + err);
                }

                var tokens = new List <string>();
                int cur    = NativeMethods.ubrk_first(bi);

                while (cur != DONE)
                {
                    int next   = NativeMethods.ubrk_next(bi);
                    // The rule status describes the segment that ends at 'next'.
                    int status = NativeMethods.ubrk_getRuleStatus(bi);
                    if (next != DONE && AddToken(type, status))
                    {
                        tokens.Add(text.Substring(cur, next - cur));
                    }
                    cur = next;
                }

                return(tokens);
            }
            finally
            {
                // A failed open may hand back a null handle; only close a real one.
                if (bi != IntPtr.Zero)
                {
                    NativeMethods.ubrk_close(bi);
                }
            }
        }
        /// <summary>
        /// Walks a <see cref="RuleBasedBreakIterator"/> over <paramref name="text"/> and records
        /// each pair of consecutive break positions as a <see cref="Boundary"/>.
        /// </summary>
        /// <param name="type">The kind of boundary to look for.</param>
        /// <param name="locale">The locale used to create the iterator.</param>
        /// <param name="text">The text to analyse.</param>
        /// <param name="includeSpacesAndPunctuation">When <c>true</c> every segment is kept;
        /// otherwise only the segments accepted by <c>AddToken</c> are kept.</param>
        /// <returns>The collected boundaries, in text order.</returns>
        private static IEnumerable <Boundary> GetBoundaries(UBreakIteratorType type, Locale locale, string text, bool includeSpacesAndPunctuation)
        {
            var found = new List<Boundary>();

            using (var iterator = new RuleBasedBreakIterator(type, locale))
            {
                iterator.SetText(text);

                for (int segmentStart = iterator.Current; segmentStart != DONE;)
                {
                    int segmentEnd = iterator.MoveNext();
                    // The status is read right after MoveNext(), before the end-of-text check.
                    int ruleStatus = iterator.GetRuleStatus();

                    if (segmentEnd == DONE)
                    {
                        break;
                    }

                    bool keepSegment = includeSpacesAndPunctuation || AddToken(type, ruleStatus);
                    if (keepSegment)
                    {
                        found.Add(new Boundary(segmentStart, segmentEnd));
                    }

                    segmentStart = segmentEnd;
                }
            }

            return found;
        }
        /// <summary>
        /// Lazily yields the pieces of <paramref name="text"/> delimited by the boundaries of
        /// the requested type. Spaces and punctuation are not returned.
        /// </summary>
        /// <param name="type">The kind of boundary to split on.</param>
        /// <param name="locale">The locale whose <c>Id</c> is passed on to the boundary search.</param>
        /// <param name="text">The text to split; <c>null</c> or empty yields nothing.</param>
        /// <returns>The tokens, produced one at a time as the sequence is enumerated.</returns>
        public static IEnumerable <string> Split(UBreakIteratorType type, Locale locale, string text)
        {
            if (!string.IsNullOrEmpty(text))
            {
                var boundaries = GetBoundaries(type, locale.Id, text, includeSpacesAndPunctuation: false);

                foreach (var boundary in boundaries)
                {
                    int tokenLength = boundary.End - boundary.Start;
                    yield return text.Substring(boundary.Start, tokenLength);
                }
            }
        }
        /// <summary>
        /// Decides whether a segment reported by the break iterator should be kept as a token.
        /// </summary>
        /// <param name="type">The iterator type that produced the segment.</param>
        /// <param name="status">The rule status reported for the segment.</param>
        /// <returns>
        /// <c>true</c> for every CHARACTER, LINE and SENTENCE segment; for WORD segments only
        /// when <paramref name="status"/> lies outside the range starting at
        /// <c>UWordBreak.NONE</c> and ending before <c>UWordBreak.NONE_LIMIT</c>;
        /// <c>false</c> for any other iterator type.
        /// </returns>
        private static bool AddToken(UBreakIteratorType type, int status)
        {
            if (type == UBreakIteratorType.WORD)
            {
                // Statuses in [NONE, NONE_LIMIT) are the segments that must be dropped.
                bool inNoneRange = status >= (int)UWordBreak.NONE && status < (int)UWordBreak.NONE_LIMIT;
                return !inNoneRange;
            }

            return type == UBreakIteratorType.CHARACTER
                   || type == UBreakIteratorType.LINE
                   || type == UBreakIteratorType.SENTENCE;
        }
        //-------------------------------------------------------------------------------------------------
        //WinterDev
        /// <summary>
        /// Finds the tokens inside a slice of <paramref name="charBuffer"/> and returns them as
        /// offset/length pairs instead of copying substrings.
        /// </summary>
        /// <param name="type">The kind of boundary to split on.</param>
        /// <param name="locale">The locale name handed to the native iterator.</param>
        /// <param name="charBuffer">The buffer holding the text; <c>null</c> or empty yields no results.</param>
        /// <param name="start">Index in <paramref name="charBuffer"/> of the first char to analyse.</param>
        /// <param name="len">Number of chars handed to the native iterator.</param>
        /// <returns>
        /// One <c>SplitBound</c> per kept segment. The offsets come straight from the native
        /// iterator, which is opened at <c>charBuffer[start]</c>, so they are relative to
        /// <paramref name="start"/> rather than to the beginning of the buffer.
        /// </returns>
        /// <exception cref="Exception">The native iterator reported a code other than
        /// <c>ErrorCode.NoErrors</c> when it was opened.</exception>
        public static IEnumerable <SplitBound> GetSplitBoundIter(UBreakIteratorType type,
                                                                 string locale,
                                                                 char[] charBuffer,
                                                                 int start,
                                                                 int len)
        {
            if (charBuffer == null || charBuffer.Length == 0)
            {
                return(new SplitBound[] { });
            }

            // NOTE(review): start/len are passed to native code unchecked; callers must keep
            // the slice inside charBuffer.
            ErrorCode err;
            var       tokens = new List <SplitBound>();

            unsafe
            {
                // The buffer stays pinned for the whole lifetime of the native iterator.
                fixed(char *head = &charBuffer[0])
                {
                    IntPtr bi = NativeMethods.ubrk_open_unsafe(type, locale, head + start, len, out err);

                    // try/finally releases the native iterator on every path, including
                    // the throw below.
                    try
                    {
                        if (err != ErrorCode.NoErrors)
                        {
                            throw new Exception("BreakIterator.Split() failed with code " + err);
                        }

                        int cur = NativeMethods.ubrk_first(bi);

                        while (cur != DONE)
                        {
                            int next   = NativeMethods.ubrk_next(bi);
                            int status = NativeMethods.ubrk_getRuleStatus(bi);
                            if (next != DONE && AddToken(type, status))
                            {
                                tokens.Add(new SplitBound(cur, next - cur));
                            }
                            cur = next;
                        }
                    }
                    finally
                    {
                        // A failed open may hand back a null handle; only close a real one.
                        if (bi != IntPtr.Zero)
                        {
                            NativeMethods.ubrk_close(bi);
                        }
                    }
                }
            }
            return(tokens);
        }
        /// <summary>
        /// Breaks a slice of <paramref name="input"/> with the native text breaker and reports
        /// every kept segment through <paramref name="onbreak"/>.
        /// </summary>
        /// <param name="input">The buffer holding the text.</param>
        /// <param name="start">Index in <paramref name="input"/> of the first char to analyse.</param>
        /// <param name="len">Number of chars handed to the native iterator.</param>
        /// <param name="onbreak">
        /// Callback invoked once per kept segment. The reported offsets are relative to
        /// <paramref name="start"/>, because the iterator is opened at <c>input[start]</c>.
        /// </param>
        public override void DoBreak(char[] input, int start, int len, OnBreak onbreak)
        {
            // Sentence maps to a sentence iterator; every other break kind uses a word iterator.
            UBreakIteratorType type = BreakKind == TextBreakKind.Sentence
                                      ? UBreakIteratorType.SENTENCE
                                      : UBreakIteratorType.WORD;

            int errCode;

            unsafe
            {
                // The buffer stays pinned for the whole lifetime of the native iterator.
                fixed(char *h = &input[start])
                {
                    IntPtr nativeIter = NativeTextBreakerLib.MtFt_UbrkOpen(type, localebuff, h, len, out errCode);

                    if (nativeIter == IntPtr.Zero)
                    {
                        // No iterator was created, so there is nothing to walk or close.
                        // NOTE(review): errCode presumably holds the reason -- consider surfacing it.
                        return;
                    }

                    // The callback is caller code and may throw; the finally block makes sure
                    // the native iterator is still released in that case.
                    try
                    {
                        int cur = NativeTextBreakerLib.MtFt_UbrkFirst(nativeIter);

                        while (cur != DONE)
                        {
                            int next   = NativeTextBreakerLib.MtFt_UbrkNext(nativeIter);
                            int status = NativeTextBreakerLib.MtFt_UbrkGetRuleStatus(nativeIter);
                            if (next != DONE && AddToken(type, status))
                            {
                                onbreak(new SplitBound(cur, next - cur));
                            }
                            cur = next;
                        }
                    }
                    finally
                    {
                        NativeTextBreakerLib.MtFt_UbrkClose(nativeIter);
                    }
                }
            }
        }
        /// <summary>
        /// Copy constructor: builds a new iterator carrying the same type, rules, locale, text,
        /// current index and text boundaries as <paramref name="bi"/>.
        /// </summary>
        /// <param name="bi">The break iterator to copy.</param>
        /// <exception cref="Exception">Thrown when cloning the native break iterator fails.</exception>
        private RuleBasedBreakIterator(RuleBasedBreakIterator bi)
        {
            _iteratorType = bi._iteratorType;
            Rules         = bi.Rules;
            _locale       = bi._locale;
            _text         = bi._text;
            _currentIndex = bi._currentIndex;

            // Give the copy its own boundary array instead of sharing the source's.
            var sourceBoundaries = bi._textBoundaries;
            _textBoundaries = new TextBoundary[sourceBoundaries.Length];
            sourceBoundaries.CopyTo(_textBoundaries, 0);

            // A native iterator is cloned only when the source actually holds one.
            if (bi._breakIterator != IntPtr.Zero)
            {
                ErrorCode cloneStatus;
                _breakIterator = NativeMethods.ubrk_safeClone(bi._breakIterator, IntPtr.Zero, IntPtr.Zero, out cloneStatus);

                if (cloneStatus.IsFailure())
                {
                    throw new Exception($"BreakIterator.ubrk_safeClone() failed with code {cloneStatus}");
                }
            }
        }
 /// <summary>
 /// Returns the sentence, line, word or character boundaries found in the text. For
 /// UBreakIteratorType.WORD, spaces and punctuation are left out.
 /// </summary>
 /// <param name="type">The kind of boundary to look for.</param>
 /// <param name="locale">The locale.</param>
 /// <param name="text">The text to analyse.</param>
 /// <returns>The boundaries produced by the four-argument overload.</returns>
 public static IEnumerable <Boundary> GetBoundaries(UBreakIteratorType type, Locale locale, string text)
 {
     // This overload always asks for spaces and punctuation to be excluded.
     const bool includeSpacesAndPunctuation = false;

     return GetBoundaries(type, locale, text, includeSpacesAndPunctuation);
 }
 /// <summary>
 /// Convenience overload taking the locale as a string: wraps it in a
 /// <see cref="Locale"/> and forwards to the Locale-based Split overload.
 /// Spaces and punctuation are not returned.
 /// </summary>
 /// <param name="type">The kind of boundary to split on.</param>
 /// <param name="locale">The locale name.</param>
 /// <param name="text">The text to split.</param>
 /// <returns>The tokens.</returns>
 public static IEnumerable <string> Split(UBreakIteratorType type, string locale, string text)
 {
     var parsedLocale = new Locale(locale);

     return Split(type, parsedLocale, text);
 }
 /// <summary>
 /// Initializes a break iterator for the given iterator type and locale.
 /// </summary>
 /// <param name="iteratorType">The break type.</param>
 /// <param name="locale">The locale.</param>
 /// <remarks>
 /// When the iterator type is UBreakIteratorType.WORD, spaces and punctuation are
 /// reported as word boundaries too. If that is not desired, use
 /// <see cref="BreakIterator.GetBoundaries(BreakIterator.UBreakIteratorType, Icu.Locale, string, bool)"/>.
 /// </remarks>
 public RuleBasedBreakIterator(UBreakIteratorType iteratorType, Locale locale)
 {
     // Only the configuration is stored here; no text is attached yet.
     _iteratorType = iteratorType;
     _locale       = locale;
 }
 /// <summary>
 /// Decides whether a segment reported by the break iterator should be kept as a token.
 /// </summary>
 /// <param name="type">The iterator type that produced the segment.</param>
 /// <param name="status">The rule status reported for the segment.</param>
 /// <returns>
 /// <c>true</c> for every CHARACTER, LINE and SENTENCE segment; for WORD segments only
 /// when <paramref name="status"/> lies outside the range starting at
 /// <c>UWordBreak.NONE</c> and ending before <c>UWordBreak.NONE_LIMIT</c>;
 /// <c>false</c> for any other iterator type.
 /// </returns>
 static bool AddToken(UBreakIteratorType type, int status)
 {
     if (type == UBreakIteratorType.WORD)
     {
         // Statuses in [NONE, NONE_LIMIT) are the segments that must be dropped.
         bool inNoneRange = status >= (int)UWordBreak.NONE && status < (int)UWordBreak.NONE_LIMIT;
         return !inNoneRange;
     }

     return type == UBreakIteratorType.CHARACTER
            || type == UBreakIteratorType.LINE
            || type == UBreakIteratorType.SENTENCE;
 }
 // Externally implemented entry point (declared 'extern', no managed body).
 // Judging by its name, it opens a native break iterator and returns a handle to it.
 //   iterType  - the kind of break iterator requested.
 //   locale    - locale passed as a raw byte buffer.
 //               NOTE(review): encoding and termination are not visible here -- confirm.
 //   startChar - pointer to the first char of the text; the caller must keep that memory
 //               valid (e.g. pinned) for as long as the native side reads it.
 //   len       - number of chars passed along with startChar.
 //   err       - receives a status code from the native side.
 //               NOTE(review): presumably 0 means success -- confirm against the native library.
 public static unsafe extern IntPtr MtFt_UbrkOpen(UBreakIteratorType iterType, byte[] locale, char* startChar, int len, out int err);
Exemple #13
0
 // Externally implemented entry point (declared 'extern', no managed body).
 // Judging by its name, it opens a native break iterator and returns a handle to it.
 //   iterType  - the kind of break iterator requested.
 //   locale    - locale passed as a raw byte buffer.
 //               NOTE(review): encoding and termination are not visible here -- confirm.
 //   startChar - pointer to the first char of the text; the caller must keep that memory
 //               valid (e.g. pinned) for as long as the native side reads it.
 //   len       - number of chars passed along with startChar.
 //   err       - receives a status code from the native side.
 //               NOTE(review): presumably 0 means success -- confirm against the native library.
 public static unsafe extern IntPtr MtFt_UbrkOpen(UBreakIteratorType iterType, byte[] locale, char *startChar, int len, out int err);
Exemple #14
0
 /// <summary>
 /// Convenience overload taking a <see cref="Locale"/>: forwards the locale's
 /// <c>Id</c> to the overload that performs the split. Spaces and punctuation are
 /// not returned.
 /// </summary>
 /// <param name="type">The kind of boundary to split on.</param>
 /// <param name="locale">The locale.</param>
 /// <param name="text">The text to split.</param>
 /// <returns>The tokens.</returns>
 public static IEnumerable <string> Split(UBreakIteratorType type, Locale locale, string text)
 {
     var localeId = locale.Id;

     return Split(type, localeId, text);
 }