Ejemplo n.º 1
0
        /// <summary>
        /// Tests <paramref name="c"/> against each Arabic break-info range in a fixed order
        /// (Arabic, Supplement, Extended-A, Presentation Forms A, Presentation Forms B).
        /// </summary>
        /// <param name="c">Character to classify.</param>
        /// <param name="brInfo">
        /// The first break info whose <c>UnicodeRange</c> contains <paramref name="c"/>,
        /// or null when none of them does.
        /// </param>
        /// <returns>true when one of the ranges contains the character; otherwise false.</returns>
        static bool IsArabicChar(char c, out SpanBreakInfo brInfo)
        {
            // The first matching range wins; the order of the checks is significant
            // only if the ranges overlap.
            if (s_brkArabic.UnicodeRange.IsInRange(c))
            {
                brInfo = s_brkArabic;
            }
            else if (s_brkArabicSupplement.UnicodeRange.IsInRange(c))
            {
                brInfo = s_brkArabicSupplement;
            }
            else if (s_brkArabicExtendA.UnicodeRange.IsInRange(c))
            {
                brInfo = s_brkArabicExtendA;
            }
            else if (s_brkArabicPresentFormA.UnicodeRange.IsInRange(c))
            {
                brInfo = s_brkArabicPresentFormA;
            }
            else if (s_brkArabicPresentFormB.UnicodeRange.IsInRange(c))
            {
                brInfo = s_brkArabicPresentFormB;
            }
            else
            {
                // Not covered by any of the Arabic ranges checked above.
                brInfo = null;
                return false;
            }

            return true;
        }
Ejemplo n.º 2
0
 /// <summary>
 /// Creates the engine and stores the value returned by <c>GetSpanBreakInfo()</c>
 /// in <c>_breakInfo</c>.
 /// </summary>
 public DictionaryBreakingEngine() => _breakInfo = GetSpanBreakInfo();
Ejemplo n.º 3
0
        /// <summary>
        /// Consumes the leading run of Arabic characters in <paramref name="charBuff"/>,
        /// feeds that run to <c>_runAdapter</c>, and reports a word break to
        /// <paramref name="visitor"/> for each right-to-left run the adapter yields.
        /// </summary>
        /// <param name="visitor">Receives the break positions, the span break info and the final state.</param>
        /// <param name="charBuff">Buffer holding the text.</param>
        /// <param name="startAt">Index in <paramref name="charBuff"/> of the first character to examine.</param>
        /// <param name="len">Number of characters available starting at <paramref name="startAt"/>.</param>
        /// <remarks>
        /// On exit <c>visitor.State</c> is <c>End</c> when <c>visitor.CurrentIndex</c> equals
        /// <c>startAt + len</c>; otherwise it is <c>OutOfRangeChar</c>.
        /// </remarks>
        internal override void BreakWord(WordVisitor visitor, char[] charBuff, int startAt, int len)
        {
            // This engine uses its own parsing instead of a dictionary lookup.

            visitor.State = VisitorState.Parsing;
            RunAgent agent = _runAdapter.Agent;

            // Count the consecutive Arabic characters at the start of the span;
            // stop at the first character that IsArabicChar rejects.

            int arabic_len = 0;
            int lim        = startAt + len;

            // Break info of the most recently accepted Arabic character.
            SpanBreakInfo latest_ar = null;

            for (int i = startAt; i < lim; ++i)
            {
                char c = charBuff[i];
                if (IsArabicChar(c, out SpanBreakInfo spBreak))
                {
                    arabic_len++;
                    latest_ar = spBreak;
                }
                else
                {
                    break;
                }
            }
            // No Arabic character at startAt: nothing for this engine to do.
            if (arabic_len == 0)
            {
                visitor.State = VisitorState.OutOfRangeChar;
                return;
            }


            // Only the info of the LAST Arabic character in the run is reported,
            // even if earlier characters matched a different Arabic range.
            visitor.SpanBreakInfo = latest_ar;

            // Copy just the Arabic run into a Line for the run adapter.
            Line line1 = new Line(new string(charBuff, startAt, arabic_len));

            _runAdapter.LoadLine(line1);

            // Presumably each MoveNext() advances `agent` to the next directional run
            // of the loaded line -- TODO confirm against the adapter implementation.
            while (_runAdapter.MoveNext())
            {
                // NOTE(review): `offset` and `level` are read but never used below.
                int  offset = agent.Offset;
                byte level  = agent.Level;
                int  sp_len = agent.Length;
                bool rtl    = agent.IsRightToLeft;

                if (rtl)
                {
                    // Marked "temp fix" by the original author.
                    // NOTE(review): the break index is startAt + sp_len and ignores `offset`;
                    // if the adapter yields more than one run this looks like it would not
                    // advance past the first run's length -- confirm intended behavior.
                    visitor.AddWordBreak_AndSetCurrentIndex(startAt + sp_len, WordKind.Text);
                }
                else
                {
                    // Left-to-right run: stop here and leave the rest to another engine.
                    break;
                }
                // Debug aid kept from the original (dump each run-span):
                //string tt = new string(buffer, offset, len);
                //System.Diagnostics.Debug.WriteLine(tt);
            }

            if (visitor.CurrentIndex == startAt + len)
            {
                // The whole requested span was consumed.
                visitor.State = VisitorState.End;
            }
            else
            {
                // Characters remain: hand over to another parser.
                visitor.State = VisitorState.OutOfRangeChar;
            }
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Finds the hard-coded Unicode range that contains <paramref name="c1"/> and the
        /// <see cref="SpanBreakInfo"/> associated with that range.
        /// </summary>
        /// <param name="c1">UTF-16 code unit to classify.</param>
        /// <param name="startCodePoint">First code point of the matching range; 0 when nothing matches.</param>
        /// <param name="endCodePoint">Last code point (inclusive) of the matching range; 0 when nothing matches.</param>
        /// <param name="spanBreakInfo">Break info for the matching range; null when nothing matches.</param>
        /// <returns>true when <paramref name="c1"/> falls in one of the ranges checked here; otherwise false.</returns>
        /// <remarks>
        /// Ranges are tested in a fixed order (Thai, Lao, Hiragana/Katakana, Hangul, CJK pairs, Arabic)
        /// and the first match wins.
        /// </remarks>
        public static bool GetUniCodeRangeFor(char c1, out int startCodePoint, out int endCodePoint, out SpanBreakInfo spanBreakInfo)
        {
            //TODO: review this again, with AUTOGEN code

            //Thai (0E00–0E7F)
            if (c1 >= 0x0E00 && c1 <= 0x0E7F)
            {
                startCodePoint = 0x0E00;
                endCodePoint   = 0x0E7F;
                spanBreakInfo  = s_thai;
                return true;
            }

            //Lao (0E80–0EFF)
            if (c1 >= 0x0E80 && c1 <= 0x0EFF)
            {
                startCodePoint = 0x0E80;
                endCodePoint   = 0x0EFF;
                spanBreakInfo  = s_lao;
                return true;
            }

            //Hiragana (3040–309F) and Katakana (30A0–30FF), treated as one contiguous range.
            //Katakana Phonetic Extensions (31F0–31FF) are not covered by this check.
            if (c1 >= 0x3040 && c1 <= 0x30FF)
            {
                startCodePoint = 0x3040;
                endCodePoint   = 0x30FF;
                spanBreakInfo  = s_hana;
                return true;
            }

            //Hangul Syllables (AC00–D7AF)
            if (c1 >= 0xAC00 && c1 <= 0xD7AF)
            {
                startCodePoint = 0xAC00;
                endCodePoint   = 0xD7AF;
                spanBreakInfo  = s_hangul;
                return true;
            }

            //Hangul Compatibility Jamo (3130–318F)
            //NOTE(review): this range was previously tested a second time further down with
            //s_hangul_jumo as the result; that second test could never be reached, so it was
            //dropped and the effective result (s_hangul) is kept. Confirm whether Jamo
            //should map to s_hangul_jumo instead.
            if (c1 >= 0x3130 && c1 <= 0x318F)
            {
                startCodePoint = 0x3130;
                endCodePoint   = 0x318F;
                spanBreakInfo  = s_hangul;
                return true;
            }

            //CJK: cjk_pairs is read as consecutive (first, last) inclusive pairs.
            //The bound is i + 1 so that a trailing unpaired entry is ignored instead of
            //causing an out-of-range read.
            for (int i = 0; i + 1 < cjk_pairs.Length; i += 2)
            {
                int pairFirst = cjk_pairs[i];
                int pairLast  = cjk_pairs[i + 1];

                if (c1 >= pairFirst && c1 <= pairLast)
                {
                    startCodePoint = pairFirst;
                    endCodePoint   = pairLast;
                    spanBreakInfo  = s_hani;
                    return true;
                }
            }

            //Arabic, see https://en.wikipedia.org/wiki/Arabic_script_in_Unicode
            //
            //The following Arabic-script blocks lie above U+FFFF, so they can never match
            //a single char and are not checked here:
            //  Rumi Numeral Symbols (10E60–10E7F)
            //  Indic Siyaq Numbers (1EC70–1ECBF)
            //  Ottoman Siyaq Numbers (1ED00–1ED4F)
            //  Arabic Mathematical Alphabetic Symbols (1EE00–1EEFF)

            //Arabic (0600–06FF)
            if (c1 >= 0x0600 && c1 <= 0x06FF)
            {
                startCodePoint = 0x0600;
                endCodePoint   = 0x06FF;
                spanBreakInfo  = s_arabic;
                return true;
            }

            //Arabic Supplement (0750–077F)
            if (c1 >= 0x0750 && c1 <= 0x077F)
            {
                startCodePoint = 0x0750;
                endCodePoint   = 0x077F;
                spanBreakInfo  = s_arabic_supplement;
                return true;
            }

            //Arabic Extended-A (08A0–08FF)
            if (c1 >= 0x08A0 && c1 <= 0x08FF)
            {
                startCodePoint = 0x08A0;
                endCodePoint   = 0x08FF;
                spanBreakInfo  = s_arabic; //TODO: review here (shares the base Arabic info)
                return true;
            }

            //Arabic Presentation Forms-A (FB50–FDFF)
            if (c1 >= 0xFB50 && c1 <= 0xFDFF)
            {
                startCodePoint = 0xFB50;
                endCodePoint   = 0xFDFF;
                spanBreakInfo  = s_arabic_presentation_form_a; //TODO: review here
                return true;
            }

            //Arabic Presentation Forms-B (FE70–FEFF)
            if (c1 >= 0xFE70 && c1 <= 0xFEFF)
            {
                startCodePoint = 0xFE70;
                endCodePoint   = 0xFEFF;
                spanBreakInfo  = s_arabic_presentation_form_b; //TODO: review here
                return true;
            }

            //No known range contains c1.
            startCodePoint = 0;
            endCodePoint   = 0;
            spanBreakInfo  = null;
            return false;
        }
Ejemplo n.º 5
0
 /// <summary>
 /// Looks up the Unicode range info for <paramref name="c1"/> and then the
 /// <see cref="SpanBreakInfo"/> registered for that range.
 /// </summary>
 /// <param name="c1">Code point to look up.</param>
 /// <param name="unicodeRangeInfo">
 /// Whatever <c>Unicode13RangeInfoList.TryGetUnicodeRangeInfo</c> produced; it may be set
 /// even when this method returns false.
 /// </param>
 /// <param name="spanBreakInfo">The registered break info on success; null otherwise.</param>
 /// <returns>true only when both the range lookup and the break-info lookup succeed.</returns>
 public static bool GetUniCodeRangeFor(int c1, out UnicodeRangeInfo unicodeRangeInfo, out SpanBreakInfo spanBreakInfo)
 {
     bool rangeFound = Unicode13RangeInfoList.TryGetUnicodeRangeInfo(c1, out unicodeRangeInfo);

     // A range may be known without having any break info registered for it;
     // either miss is reported the same way.
     if (!rangeFound || !s_registerSpanBreakInfo.TryGetValue(unicodeRangeInfo, out spanBreakInfo))
     {
         spanBreakInfo = null;
         return false;
     }

     return true;
 }