/// <summary>
/// Tests whether <paramref name="c"/> falls inside one of the Arabic break-info ranges,
/// checked in this order: s_brkArabic, s_brkArabicSupplement, s_brkArabicExtendA,
/// s_brkArabicPresentFormA, s_brkArabicPresentFormB.
/// </summary>
/// <param name="c">The char to classify.</param>
/// <param name="brInfo">The first break info whose UnicodeRange contains <paramref name="c"/>; null when none does.</param>
/// <returns>true when a matching range was found; otherwise false.</returns>
static bool IsArabicChar(char c, out SpanBreakInfo brInfo)
{
    if (s_brkArabic.UnicodeRange.IsInRange(c))
    {
        brInfo = s_brkArabic;
    }
    else if (s_brkArabicSupplement.UnicodeRange.IsInRange(c))
    {
        brInfo = s_brkArabicSupplement;
    }
    else if (s_brkArabicExtendA.UnicodeRange.IsInRange(c))
    {
        brInfo = s_brkArabicExtendA;
    }
    else if (s_brkArabicPresentFormA.UnicodeRange.IsInRange(c))
    {
        brInfo = s_brkArabicPresentFormA;
    }
    else if (s_brkArabicPresentFormB.UnicodeRange.IsInRange(c))
    {
        brInfo = s_brkArabicPresentFormB;
    }
    else
    {
        //not in any of the Arabic ranges
        brInfo = null;
        return false;
    }
    return true;
}
/// <summary>
/// Creates the engine and stores the result of <c>GetSpanBreakInfo()</c> in <c>_breakInfo</c>.
/// </summary>
//NOTE(review): if GetSpanBreakInfo() is virtual/abstract it runs before any derived-class constructor body — confirm overrides do not depend on derived state.
public DictionaryBreakingEngine() => _breakInfo = GetSpanBreakInfo();
/// <summary>
/// Consumes the leading run of Arabic chars in <paramref name="charBuff"/> (from
/// <paramref name="startAt"/>, at most <paramref name="len"/> chars), feeds that run to
/// <c>_runAdapter</c>, and reports a word break to <paramref name="visitor"/> for each
/// right-to-left run until a non-RTL run is met.
/// </summary>
/// <param name="visitor">Receives the state changes, the span-break info and the word breaks.</param>
/// <param name="charBuff">Source characters.</param>
/// <param name="startAt">Index of the first char to examine.</param>
/// <param name="len">Number of chars available from <paramref name="startAt"/>.</param>
internal override void BreakWord(WordVisitor visitor, char[] charBuff, int startAt, int len)
{
    //use custom parsing
    visitor.State = VisitorState.Parsing;
    RunAgent runAgent = _runAdapter.Agent;

    //count the leading Arabic chars, remembering the break info of the last one seen
    int endExclusive = startAt + len;
    int arabicCount = 0;
    SpanBreakInfo lastArabicInfo = null;
    int pos = startAt;
    while (pos < endExclusive && IsArabicChar(charBuff[pos], out SpanBreakInfo found))
    {
        lastArabicInfo = found;
        arabicCount++;
        pos++;
    }

    if (arabicCount == 0)
    {
        //nothing for this engine, let another parser handle it
        visitor.State = VisitorState.OutOfRangeChar;
        return;
    }

    visitor.SpanBreakInfo = lastArabicInfo;

    //only the collected Arabic chars are loaded into the run adapter
    _runAdapter.LoadLine(new Line(new string(charBuff, startAt, arabicCount)));

    while (_runAdapter.MoveNext())
    {
        //Offset and Level are read but not used below; the reads are kept as-is
        int runOffset = runAgent.Offset;
        byte runLevel = runAgent.Level;
        int runLength = runAgent.Length;

        if (!runAgent.IsRightToLeft)
        {
            //use other engine
            break;
        }

        //temp fix
        //NOTE(review): the break index is startAt + run length and ignores the run offset — confirm this is intended when more than one RTL run is produced
        visitor.AddWordBreak_AndSetCurrentIndex(startAt + runLength, WordKind.Text);
    }

    //End only when the whole requested span was consumed; otherwise continue to other parser
    visitor.State = (visitor.CurrentIndex == startAt + len)
        ? VisitorState.End
        : VisitorState.OutOfRangeChar;
}
/// <summary>
/// Finds the hard-coded Unicode range that contains <paramref name="c1"/> and the
/// span-break info associated with that range.
/// </summary>
/// <param name="c1">The char (UTF-16 code unit) to classify.</param>
/// <param name="startCodePoint">First code point of the matched range; 0 when nothing matches.</param>
/// <param name="endCodePoint">Last code point (inclusive) of the matched range; 0 when nothing matches.</param>
/// <param name="spanBreakInfo">Break info for the matched range; null when nothing matches.</param>
/// <returns>true when <paramref name="c1"/> belongs to one of the ranges below; otherwise false.</returns>
public static bool GetUniCodeRangeFor(char c1, out int startCodePoint, out int endCodePoint, out SpanBreakInfo spanBreakInfo)
{
    //find proper unicode range (and its lang)
    //TODO: review this again, with AUTOGEN code

    //Thai (0E00–0E7F)
    if (c1 >= 0x0E00 && c1 <= 0x0E7F)
    {
        startCodePoint = 0x0E00;
        endCodePoint = 0x0E7F;
        spanBreakInfo = s_thai;
        return true;
    }

    //Lao (0E80–0EFF)
    if (c1 >= 0x0E80 && c1 <= 0x0EFF)
    {
        startCodePoint = 0x0E80;
        endCodePoint = 0x0EFF;
        spanBreakInfo = s_lao;
        return true;
    }

    //Hiragana + Katakana, handled as one range (3040–30FF)
    //CJK_Symbols_And_Punctuation = (48L << 32) | (0x3000 << 16) | 0x303F,
    //Hiragana = (49L << 32) | (0x3040 << 16) | 0x309F,
    //Katakana = (50L << 32) | (0x30A0 << 16) | 0x30FF,
    //Katakana_Phonetic_Extensions = (50L << 32) | (0x31F0 << 16) | 0x31FF,
    if (c1 >= 0x3040 && c1 <= 0x30FF)
    {
        startCodePoint = 0x3040;
        endCodePoint = 0x30FF;
        spanBreakInfo = s_hana;
        return true;
    }

    //Hangul_Syllables = (56L << 32) | (0xAC00 << 16) | 0xD7AF,
    if (c1 >= 0xAC00 && c1 <= 0xD7AF)
    {
        startCodePoint = 0xAC00;
        endCodePoint = 0xD7AF;
        spanBreakInfo = s_hangul;
        return true;
    }

    //Hangul_Compatibility_Jamo = (52L << 32) | (0x3130 << 16) | 0x318F,
    //NOTE(review): a second check of this same range that returned s_hangul_jumo
    //could never be reached (this check always returned first) and was removed as dead code.
    //The range still maps to s_hangul exactly as before — confirm whether s_hangul_jumo was intended.
    if (c1 >= 0x3130 && c1 <= 0x318F)
    {
        startCodePoint = 0x3130;
        endCodePoint = 0x318F;
        spanBreakInfo = s_hangul;
        return true;
    }

    //cjk: cjk_pairs is read as consecutive (first, last) pairs
    for (int i = 0; i < cjk_pairs.Length; i += 2)
    {
        int pairFirst = cjk_pairs[i];
        int pairLast = cjk_pairs[i + 1];
        if (c1 >= pairFirst && c1 <= pairLast)
        {
            startCodePoint = pairFirst;
            endCodePoint = pairLast;
            spanBreakInfo = s_hani;
            return true;
        }
    }

    //https://en.wikipedia.org/wiki/Arabic_script_in_Unicode
    //The following Arabic-script ranges lie above U+FFFF, so a single char can never match them here:
    //Rumi Numeral Symbols(10E60–10E7F, 31 characters)
    //Indic Siyaq Numbers(1EC70–1ECBF, 68 characters)
    //Ottoman Siyaq Numbers(1ED00–1ED4F, 61 characters)
    //Arabic Mathematical Alphabetic Symbols(1EE00–1EEFF, 143 characters)

    //Arabic (0600–06FF, 255 characters)
    if (c1 >= 0x0600 && c1 <= 0x06FF)
    {
        startCodePoint = 0x0600;
        endCodePoint = 0x06FF;
        spanBreakInfo = s_arabic;
        return true;
    }

    //Arabic Supplement(0750–077F, 48 characters)
    if (c1 >= 0x0750 && c1 <= 0x077F)
    {
        startCodePoint = 0x0750;
        endCodePoint = 0x077F;
        spanBreakInfo = s_arabic_supplement;
        return true;
    }

    //Arabic Extended-A(08A0–08FF, 84 characters)
    if (c1 >= 0x08A0 && c1 <= 0x08FF)
    {
        startCodePoint = 0x08A0;
        endCodePoint = 0x08FF;
        spanBreakInfo = s_arabic; //TODO: review here
        return true;
    }

    //Arabic Presentation Forms - A(FB50–FDFF, 611 characters)
    if (c1 >= 0xFB50 && c1 <= 0xFDFF)
    {
        startCodePoint = 0xFB50;
        endCodePoint = 0xFDFF;
        spanBreakInfo = s_arabic_presentation_form_a; //TODO: review here
        return true;
    }

    //Arabic Presentation Forms - B(FE70–FEFF, 141 characters)
    if (c1 >= 0xFE70 && c1 <= 0xFEFF)
    {
        startCodePoint = 0xFE70;
        endCodePoint = 0xFEFF;
        spanBreakInfo = s_arabic_presentation_form_b; //TODO: review here
        return true;
    }

    //no known range
    startCodePoint = 0;
    endCodePoint = 0;
    spanBreakInfo = null;
    return false;
}
/// <summary>
/// Looks up the Unicode range info for <paramref name="c1"/> via
/// <c>Unicode13RangeInfoList.TryGetUnicodeRangeInfo</c>, then the span-break info
/// registered for that range in <c>s_registerSpanBreakInfo</c>.
/// </summary>
/// <param name="c1">The code point to look up.</param>
/// <param name="unicodeRangeInfo">Whatever the range lookup produced; it may be set even when this method returns false.</param>
/// <param name="spanBreakInfo">The registered break info; null when this method returns false.</param>
/// <returns>true only when both the range lookup and the registration lookup succeed.</returns>
public static bool GetUniCodeRangeFor(int c1, out UnicodeRangeInfo unicodeRangeInfo, out SpanBreakInfo spanBreakInfo)
{
    if (!Unicode13RangeInfoList.TryGetUnicodeRangeInfo(c1, out unicodeRangeInfo))
    {
        //no unicode range info for this code point
        spanBreakInfo = null;
        return false;
    }

    if (!s_registerSpanBreakInfo.TryGetValue(unicodeRangeInfo, out spanBreakInfo))
    {
        //we found unicodeRange info
        //but no registered spanbreak info for it
        spanBreakInfo = null;
        return false;
    }

    return true;
}