/// <summary>
/// Splits the specified text along the specified type of boundaries using a
/// native break iterator. Segments for which <c>AddToken</c> returns
/// <c>false</c> are not returned.
/// </summary>
/// <param name="type">The kind of boundary to split on.</param>
/// <param name="locale">The locale identifier passed to the native iterator.</param>
/// <param name="text">The text to split.</param>
/// <returns>The tokens; an empty sequence when <paramref name="text"/> is null or empty.</returns>
/// <exception cref="Exception">Thrown when opening the native iterator reports
/// anything other than <c>ErrorCode.NoErrors</c>.</exception>
public static IEnumerable<string> Split(UBreakIteratorType type, string locale, string text)
{
    if (string.IsNullOrEmpty(text))
    {
        return new string[] { };
    }

    ErrorCode err;
    IntPtr bi = NativeMethods.ubrk_open(type, locale, text, text.Length, out err);
    try
    {
        if (err != ErrorCode.NoErrors)
        {
            throw new Exception("BreakIterator.Split() failed with code " + err);
        }

        var tokens = new List<string>();
        int cur = NativeMethods.ubrk_first(bi);
        while (cur != DONE)
        {
            int next = NativeMethods.ubrk_next(bi);
            // The rule status describes the segment that ends at 'next'.
            int status = NativeMethods.ubrk_getRuleStatus(bi);
            if (next != DONE && AddToken(type, status))
            {
                tokens.Add(text.Substring(cur, next - cur));
            }
            cur = next;
        }
        return tokens;
    }
    finally
    {
        // Release the native iterator on every exit path, including the
        // error-code throw above and any exception raised inside the loop.
        if (bi != IntPtr.Zero)
        {
            NativeMethods.ubrk_close(bi);
        }
    }
}
/// <summary>
/// Walks the text with a <see cref="RuleBasedBreakIterator"/> and collects the
/// [start, end) pairs of consecutive break positions.
/// </summary>
/// <param name="type">The kind of boundary to iterate over.</param>
/// <param name="locale">The locale used to create the break iterator.</param>
/// <param name="text">The text to analyse.</param>
/// <param name="includeSpacesAndPunctuation">When <c>true</c> every segment is
/// kept; otherwise only segments accepted by <c>AddToken</c> are kept.</param>
/// <returns>The collected boundaries, in text order.</returns>
private static IEnumerable<Boundary> GetBoundaries(UBreakIteratorType type, Locale locale, string text, bool includeSpacesAndPunctuation)
{
    var result = new List<Boundary>();

    using (var iterator = new RuleBasedBreakIterator(type, locale))
    {
        iterator.SetText(text);

        for (int segmentStart = iterator.Current; segmentStart != DONE;)
        {
            int segmentEnd = iterator.MoveNext();
            int ruleStatus = iterator.GetRuleStatus();

            // No further break position: the walk is complete.
            if (segmentEnd == DONE)
            {
                break;
            }

            bool keepSegment = includeSpacesAndPunctuation || AddToken(type, ruleStatus);
            if (keepSegment)
            {
                result.Add(new Boundary(segmentStart, segmentEnd));
            }

            segmentStart = segmentEnd;
        }
    }

    return result;
}
/// <summary>
/// Cuts the given text into tokens along the requested kind of boundary.
/// Spaces and punctuation are not returned.
/// </summary>
/// <remarks>
/// This is an iterator method: nothing is evaluated until the returned
/// sequence is enumerated.
/// </remarks>
/// <param name="type">The type.</param>
/// <param name="locale">The locale.</param>
/// <param name="text">The text.</param>
/// <returns>The tokens; an empty sequence for null or empty text.</returns>
public static IEnumerable<string> Split(UBreakIteratorType type, Locale locale, string text)
{
    if (!string.IsNullOrEmpty(text))
    {
        var segments = GetBoundaries(type, locale.Id, text, includeSpacesAndPunctuation: false);
        foreach (var segment in segments)
        {
            int length = segment.End - segment.Start;
            yield return text.Substring(segment.Start, length);
        }
    }
}
/// <summary>
/// Decides whether a segment with the given rule status should be reported
/// as a token for the given break type.
/// </summary>
/// <param name="type">The break type being iterated.</param>
/// <param name="status">The rule status reported for the segment.</param>
/// <returns>
/// For WORD: <c>true</c> unless the status lies in
/// [<c>UWordBreak.NONE</c>, <c>UWordBreak.NONE_LIMIT</c>).
/// For CHARACTER, LINE and SENTENCE: always <c>true</c>.
/// For any other type: <c>false</c>.
/// </returns>
private static bool AddToken(UBreakIteratorType type, int status)
{
    if (type == UBreakIteratorType.WORD)
    {
        bool inNoneRange = status >= (int)UWordBreak.NONE && status < (int)UWordBreak.NONE_LIMIT;
        return !inNoneRange;
    }

    return type == UBreakIteratorType.CHARACTER
        || type == UBreakIteratorType.LINE
        || type == UBreakIteratorType.SENTENCE;
}
//-------------------------------------------------------------------------------------------------
// WinterDev
/// <summary>
/// Breaks a slice of a character buffer along the specified type of boundaries
/// and returns the accepted segments as <c>SplitBound</c> values.
/// </summary>
/// <param name="type">The kind of boundary to split on.</param>
/// <param name="locale">The locale identifier passed to the native iterator.</param>
/// <param name="charBuffer">The buffer holding the text.</param>
/// <param name="start">Index in <paramref name="charBuffer"/> where the text begins.</param>
/// <param name="len">Length handed to the native iterator.</param>
/// <returns>
/// One <c>SplitBound(cur, next - cur)</c> per accepted segment, where the
/// positions are those reported by the native iterator, which is opened on the
/// text beginning at <c>charBuffer[start]</c>. Empty when the buffer is null or empty.
/// </returns>
/// <exception cref="Exception">Thrown when opening the native iterator reports
/// anything other than <c>ErrorCode.NoErrors</c>.</exception>
public static IEnumerable<SplitBound> GetSplitBoundIter(UBreakIteratorType type, string locale, char[] charBuffer, int start, int len)
{
    if (charBuffer == null || charBuffer.Length == 0)
    {
        return new SplitBound[] { };
    }

    var tokens = new List<SplitBound>();
    unsafe
    {
        // NOTE(review): start/len are not range-checked before being handed to
        // native code; callers must pass a slice that lies inside charBuffer.
        fixed (char* head = &charBuffer[0])
        {
            ErrorCode err;
            IntPtr bi = NativeMethods.ubrk_open_unsafe(type, locale, head + start, len, out err);
            try
            {
                if (err != ErrorCode.NoErrors)
                {
                    throw new Exception("BreakIterator.Split() failed with code " + err);
                }

                int cur = NativeMethods.ubrk_first(bi);
                while (cur != DONE)
                {
                    int next = NativeMethods.ubrk_next(bi);
                    int status = NativeMethods.ubrk_getRuleStatus(bi);
                    if (next != DONE && AddToken(type, status))
                    {
                        tokens.Add(new SplitBound(cur, next - cur));
                    }
                    cur = next;
                }
            }
            finally
            {
                // Release the native iterator on every exit path, including
                // the error-code throw above.
                if (bi != IntPtr.Zero)
                {
                    NativeMethods.ubrk_close(bi);
                }
            }
        }
    }
    return tokens;
}
/// <summary>
/// Breaks a slice of <paramref name="input"/> into word or sentence segments
/// (depending on <c>BreakKind</c>) and reports each accepted segment through
/// <paramref name="onbreak"/>.
/// </summary>
/// <param name="input">The buffer holding the text. Null or empty produces no breaks.</param>
/// <param name="start">Index in <paramref name="input"/> where the text begins.</param>
/// <param name="len">Length handed to the native iterator.</param>
/// <param name="onbreak">Callback invoked with <c>SplitBound(cur, next - cur)</c>
/// for each accepted segment; positions are those reported by the native
/// iterator, which is opened on the text beginning at <c>input[start]</c>.</param>
/// <exception cref="InvalidOperationException">Thrown when the native iterator
/// could not be opened (a null handle was returned).</exception>
public override void DoBreak(char[] input, int start, int len, OnBreak onbreak)
{
    // Nothing to break; this also avoids taking the address of a
    // non-existent element below.
    if (input == null || input.Length == 0)
    {
        return;
    }

    // 1. Map the configured break kind to the native iterator type.
    //    Anything other than Sentence falls back to word breaking.
    UBreakIteratorType type;
    switch (BreakKind)
    {
        case TextBreakKind.Sentence:
            type = UBreakIteratorType.SENTENCE;
            break;
        default:
            type = UBreakIteratorType.WORD;
            break;
    }

    // 2. Walk the whole slice with the native iterator.
    unsafe
    {
        fixed (char* h = &input[start])
        {
            int errCode;
            IntPtr nativeIter = NativeTextBreakerLib.MtFt_UbrkOpen(type, localebuff, h, len, out errCode);
            if (nativeIter == IntPtr.Zero)
            {
                // Never pass a null handle on to the other native calls.
                throw new InvalidOperationException("MtFt_UbrkOpen failed with code " + errCode);
            }

            try
            {
                int cur = NativeTextBreakerLib.MtFt_UbrkFirst(nativeIter);
                while (cur != DONE)
                {
                    int next = NativeTextBreakerLib.MtFt_UbrkNext(nativeIter);
                    int status = NativeTextBreakerLib.MtFt_UbrkGetRuleStatus(nativeIter);
                    if (next != DONE && AddToken(type, status))
                    {
                        onbreak(new SplitBound(cur, next - cur));
                    }
                    cur = next;
                }
            }
            finally
            {
                // The callback is caller-supplied and may throw; make sure
                // the native iterator is released regardless.
                NativeTextBreakerLib.MtFt_UbrkClose(nativeIter);
            }
        }
    }
}
/// <summary>
/// Copy constructor: builds a new iterator carrying the same state as
/// <paramref name="bi"/>.
/// </summary>
/// <param name="bi">The break iterator to copy.</param>
/// <exception cref="Exception">Thrown when cloning the native break iterator
/// reports a failure.</exception>
private RuleBasedBreakIterator(RuleBasedBreakIterator bi)
{
    _iteratorType = bi._iteratorType;
    Rules = bi.Rules;
    _locale = bi._locale;
    _text = bi._text;
    _currentIndex = bi._currentIndex;

    // Give the copy its own boundary array so the two instances do not share it.
    int boundaryCount = bi._textBoundaries.Length;
    _textBoundaries = new TextBoundary[boundaryCount];
    Array.Copy(bi._textBoundaries, _textBoundaries, boundaryCount);

    // Only clone the native iterator when the source actually has one.
    if (bi._breakIterator != IntPtr.Zero)
    {
        ErrorCode cloneStatus;
        _breakIterator = NativeMethods.ubrk_safeClone(bi._breakIterator, IntPtr.Zero, IntPtr.Zero, out cloneStatus);

        if (cloneStatus.IsFailure())
        {
            throw new Exception($"BreakIterator.ubrk_safeClone() failed with code {cloneStatus}");
        }
    }
}
/// <summary>
/// Returns the sentence/line/word/character boundaries of the text.
/// For <c>UBreakIteratorType.WORD</c>, spaces and punctuation are not returned.
/// </summary>
/// <param name="type">The kind of boundary to look for.</param>
/// <param name="locale">The locale.</param>
/// <param name="text">The text.</param>
/// <returns>The boundaries produced by the four-argument overload with
/// spaces and punctuation excluded.</returns>
public static IEnumerable<Boundary> GetBoundaries(UBreakIteratorType type, Locale locale, string text)
{
    const bool includeSpacesAndPunctuation = false;
    return GetBoundaries(type, locale, text, includeSpacesAndPunctuation);
}
/// <summary>
/// Cuts the given text into tokens along the requested kind of boundary.
/// Spaces and punctuation are not returned.
/// </summary>
/// <remarks>
/// Convenience overload: wraps the locale string in a <c>Locale</c> and
/// forwards to the <c>Locale</c>-based overload.
/// </remarks>
/// <param name="type">The type.</param>
/// <param name="locale">The locale.</param>
/// <param name="text">The text.</param>
/// <returns>The tokens.</returns>
public static IEnumerable<string> Split(UBreakIteratorType type, string locale, string text)
{
    var localeObject = new Locale(locale);
    return Split(type, localeObject, text);
}
/// <summary>
/// Initializes a break iterator for the given break type and locale.
/// </summary>
/// <param name="iteratorType">Break type.</param>
/// <param name="locale">The locale.</param>
/// <remarks>
/// If the iterator type is UBreakIteratorType.WORD, spaces and punctuation
/// are included as boundaries for words. If this is not desired, use
/// <see cref="BreakIterator.GetBoundaries(BreakIterator.UBreakIteratorType, Icu.Locale, string, bool)"/>.
/// </remarks>
public RuleBasedBreakIterator(UBreakIteratorType iteratorType, Locale locale)
{
    // Only the configuration is recorded here.
    _iteratorType = iteratorType;
    _locale = locale;
}
/// <summary>
/// Tells whether a segment with the given rule status counts as a token for
/// the given break type.
/// </summary>
/// <param name="type">The break type being iterated.</param>
/// <param name="status">The rule status reported for the segment.</param>
/// <returns>
/// WORD: <c>false</c> only when the status lies in
/// [<c>UWordBreak.NONE</c>, <c>UWordBreak.NONE_LIMIT</c>).
/// CHARACTER, LINE, SENTENCE: always <c>true</c>. Anything else: <c>false</c>.
/// </returns>
static bool AddToken(UBreakIteratorType type, int status)
{
    switch (type)
    {
        case UBreakIteratorType.WORD:
        {
            int noneStart = (int)UWordBreak.NONE;
            int noneEnd = (int)UWordBreak.NONE_LIMIT;
            bool isNone = noneStart <= status && status < noneEnd;
            return !isNone;
        }

        case UBreakIteratorType.CHARACTER:
        case UBreakIteratorType.LINE:
        case UBreakIteratorType.SENTENCE:
            return true;

        default:
            return false;
    }
}
// Externally implemented entry point (declared extern; the binding attribute is not visible here).
// Judging by its name it opens a native break iterator — TODO confirm against the native library.
//   iterType  - kind of boundary the iterator should report.
//   locale    - locale passed as a byte buffer. NOTE(review): expected encoding and
//               null-termination are not visible here — confirm.
//   startChar - pointer to the first char of the text; when it points into a managed
//               array the caller has to keep that array pinned (fixed) while it is in use.
//   len       - length of the text, presumably counted in chars — confirm.
//   err       - receives the native error/status code.
// Returns an IntPtr, presumably the handle to hand to the other MtFt_Ubrk* functions.
public static unsafe extern IntPtr MtFt_UbrkOpen(UBreakIteratorType iterType, byte[] locale, char* startChar, int len, out int err);
// Externally implemented entry point (declared extern; the binding attribute is not visible here).
// Judging by its name it opens a native break iterator — TODO confirm against the native library.
//   iterType  - kind of boundary the iterator should report.
//   locale    - locale passed as a byte buffer. NOTE(review): expected encoding and
//               null-termination are not visible here — confirm.
//   startChar - pointer to the first char of the text; when it points into a managed
//               array the caller has to keep that array pinned (fixed) while it is in use.
//   len       - length of the text, presumably counted in chars — confirm.
//   err       - receives the native error/status code.
// Returns an IntPtr, presumably the handle to hand to the other MtFt_Ubrk* functions.
public static unsafe extern IntPtr MtFt_UbrkOpen(UBreakIteratorType iterType, byte[] locale, char *startChar, int len, out int err);
/// <summary>
/// Cuts the given text into tokens along the requested kind of boundary.
/// Spaces and punctuation are not returned.
/// </summary>
/// <remarks>
/// Convenience overload: forwards the locale's <c>Id</c> to the string-based overload.
/// </remarks>
/// <param name="type">The type.</param>
/// <param name="locale">The locale.</param>
/// <param name="text">The text.</param>
/// <returns>The tokens.</returns>
public static IEnumerable<string> Split(UBreakIteratorType type, Locale locale, string text)
{
    string localeId = locale.Id;
    return Split(type, localeId, text);
}