Beispiel #1
0
        /// <summary>
        /// Breaks the part of <paramref name="charBuff"/> that starts at <paramref name="startAt"/> and spans
        /// <paramref name="len"/> chars, handing the text to a breaking engine and switching to another
        /// engine whenever the active one reports a char it does not handle.
        /// </summary>
        /// <param name="charBuff">Text buffer; when it is empty only <c>_endAt</c> is reset to 0.</param>
        /// <param name="startAt">Index of the first char to process.</param>
        /// <param name="len">Number of chars to process.</param>
        /// <exception cref="NotSupportedException">
        /// Thrown when an engine pass leaves the visitor in a state other than <c>End</c> or
        /// <c>OutOfRangeChar</c>, or when no different engine is selected for the reported char
        /// and <c>ThrowIfCharOutOfRange</c> is set.
        /// </exception>
        public void BreakWords(char[] charBuff, int startAt, int len)
        {
            if (charBuff.Length < 1)
            {
                //empty buffer: nothing to break
                _endAt = 0;
                return;
            }

            int stopIndex = startAt + len;
            _endAt = stopIndex;
            _visitor.LoadText(charBuff, startAt, len);

            //the first engine is chosen from the first char of the range;
            //only this initial choice is stored in _breakingEngine
            BreakingEngine activeEngine = SelectEngine(charBuff[startAt]);
            _breakingEngine = activeEngine;

            while (true)
            {
                //each pass gives the not-yet-processed tail to the active engine,
                //so the length shrinks as startAt advances
                activeEngine.BreakWord(_visitor, charBuff, startAt, stopIndex - startAt);

                var state = _visitor.State;
                if (state == VisitorState.End)
                {
                    //whole range consumed
                    return;
                }
                if (state != VisitorState.OutOfRangeChar)
                {
                    throw new NotSupportedException();
                }

                //the active engine stopped on a char outside its range:
                //look for an engine that handles that char
                BreakingEngine replacement = SelectEngine(_visitor.Char);
                if (replacement != activeEngine)
                {
                    //resume from the offending char with the other engine
                    activeEngine = replacement;
                    startAt      = _visitor.CurrentIndex;
                    continue;
                }

                //the same engine was selected again => nobody handles this char
                if (ThrowIfCharOutOfRange)
                {
                    throw new NotSupportedException($"A proper breaking engine for character '{_visitor.Char}' was not found.");
                }

                //skip the char and mark it as a word of unknown kind
                startAt = _visitor.CurrentIndex + 1;
                _visitor.SetCurrentIndex(startAt);
                _visitor.AddWordBreakAtCurrentIndex(WordKind.Unknown);
            }
        }
Beispiel #2
0
        /// <summary>
        /// Breaks the range [<paramref name="startAt"/>, <paramref name="startAt"/> + <paramref name="len"/>)
        /// of <paramref name="charBuff"/> into words using the dictionary word groups, reporting each break
        /// position to <paramref name="visitor"/>.
        /// </summary>
        /// <remarks>
        /// For each start char the method walks down the word-group tree one char at a time, pushing every
        /// length at which the prefix is a complete word onto a candidate stack. When the walk cannot go
        /// deeper (or the text ends) the candidates are popped, longest first, and the first acceptable one
        /// becomes the word break.
        /// On return <c>visitor.State</c> is <c>OutOfRangeChar</c> when a char outside
        /// [<c>FirstUnicodeChar</c>, <c>LastUnicodeChar</c>] was met, <c>End</c> when the engine considers the
        /// text finished, or still <c>Parsing</c> otherwise.
        /// </remarks>
        /// <param name="visitor">Receives the word breaks and tracks the current read position and state.</param>
        /// <param name="charBuff">Text buffer being broken.</param>
        /// <param name="startAt">Index of the first char to process.</param>
        /// <param name="len">Number of chars to process.</param>
        /// <exception cref="NotSupportedException">
        /// Thrown ("i-3311") when no candidate exists and the visitor cannot step back past the latest break.
        /// </exception>
        internal override void BreakWord(WordVisitor visitor, char[] charBuff, int startAt, int len)
        {
            visitor.State = VisitorState.Parsing;
            this._startAt = startAt;
            this._len     = len;
            this._endAt   = startAt + len;

            char c_first = this.FirstUnicodeChar;
            char c_last  = this.LastUnicodeChar;
            int  endAt   = startAt + len;

            //scratch stack of candidate word lengths (relative to visitor.LatestBreakAt), longest on top
            Stack <int> candidateBreakList = visitor.GetTempCandidateBreaks();

            for (int i = startAt; i < endAt;)
            {
                //find proper start words;
                char c = charBuff[i];
                //----------------------
                //check if c is in our responsibility
                if (c < c_first || c > c_last)
                {
                    //out of our range: report it and let the caller decide what to do
                    visitor.State = VisitorState.OutOfRangeChar;
                    return;
                }
                //----------------------
                WordGroup wordgroup = GetWordGroupForFirstChar(c);
                if (wordgroup == null)
                {
                    //no dictionary word starts with this char:
                    //emit it as a one-char word and continue with the next char
                    ++i;
                    visitor.AddWordBreakAt(i, WordKind.Text);
                    visitor.SetCurrentIndex(visitor.LatestBreakAt);
                }
                else
                {
                    //check if we can move next
                    if (visitor.IsEnd)
                    {
                        visitor.State = VisitorState.End;
                        return;
                    }
                    //---------------------
                    WordGroup c_wordgroup = wordgroup;
                    candidateBreakList.Clear();

                    int candidateLen = 1;

                    if (c_wordgroup.PrefixIsWord)
                    {
                        //the single start char is already a complete word
                        candidateBreakList.Push(candidateLen);
                    }

                    bool continueRead = true;

                    while (continueRead)
                    {
                        //not end
                        //then move next
                        candidateLen++;
                        visitor.SetCurrentIndex(i + 1);
                        if (visitor.IsEnd)
                        {
                            //the visitor reports the end of the text ***
                            visitor.State = VisitorState.End;
                            //----------------------------------------
                            WordGroup next1 = GetSubGroup(visitor, c_wordgroup);

                            bool latest_candidate_isNotWord = false;
                            if (next1 != null)
                            {
                                //accept

                                //---------------------
                                //since this is end word ...
                                //and next1 != null=> this has a link to next word group
                                //but it may be incomplete so => we need decision ***

                                if (next1.PrefixIsWord)
                                {
                                    candidateBreakList.Push(candidateLen);
                                }
                                else
                                {
                                    if (!DontMergeLastIncompleteWord)
                                    {
                                        //keep the trailing chars as one (possibly misspelled) word
                                        latest_candidate_isNotWord = true;
                                        candidateBreakList.Push(candidateLen);
                                    }
                                }
                                //---------------------
                            }
                            else
                            {
                                if (c_wordgroup.WordSpanListCount > 0)
                                {
                                    int p1 = visitor.CurrentIndex;
                                    //p2: suggest position
                                    int p2 = FindInWordSpans(visitor, c_wordgroup);
                                    if (p2 - p1 > 0)
                                    {
                                        //a word span reaches beyond the current position: use it
                                        //and drop the shorter candidates
                                        visitor.AddWordBreakAt(p2, WordKind.Text);
                                        visitor.SetCurrentIndex(p2);
                                        candidateBreakList.Clear();
                                    }
                                }
                            }
                            //----------------------------------------
                            //temp fix, TODO: review here
                            //forces the outer for-loop to stop after this branch
                            i = endAt;

                            //choose best match
                            if (candidateBreakList.Count > 0)
                            {
                                int candi1 = candidateBreakList.Pop();
                                //try

                                visitor.SetCurrentIndex(visitor.LatestBreakAt + candi1);
                                if (latest_candidate_isNotWord)
                                {
                                    //use this
                                    //use this candidate if possible
                                    visitor.AddWordBreakAtCurrentIndex(WordKind.TextIncomplete);
                                }
                                else
                                {
                                    //use this
                                    //use this candidate if possible
                                    visitor.AddWordBreakAtCurrentIndex();
                                }

                                //leaves the while-loop; the for-loop then ends because i == endAt
                                break;
                            }
                            //----------------------------------------
                            return;
                        }
                        WordGroup next = GetSubGroup(visitor, c_wordgroup);
                        //for debug
                        //string prefix = (next == null) ? "" : next.GetPrefix(CurrentCustomDic.TextBuffer);
                        if (next != null)
                        {
                            if (next.PrefixIsWord)
                            {
                                candidateBreakList.Push(candidateLen);
                            }
                            //go one level deeper in the word-group tree
                            c_wordgroup = next;
                            i           = visitor.CurrentIndex;

                            if (visitor.IsEnd)
                            {
                                //temp fix, TODO: review here
                                //forces the outer for-loop to stop after this branch
                                i = endAt;
                                //choose best match: longest candidate first
                                while (candidateBreakList.Count > 0)
                                {
                                    int candi1 = candidateBreakList.Pop();
                                    //try
                                    visitor.SetCurrentIndex(visitor.LatestBreakAt + candi1);
                                    if (visitor.State != VisitorState.End)
                                    {
                                        char next_char = visitor.Char;
                                        if (CanBeStartChar(next_char))
                                        {
                                            //use this
                                            //use this candidate if possible
                                            visitor.AddWordBreakAtCurrentIndex();
                                            break;
                                        }
                                    }
                                    else
                                    {
                                        visitor.AddWordBreakAtCurrentIndex();
                                        break;
                                    }
                                }
                                continueRead = false;
                            }
                        }
                        else
                        {
                            continueRead = false;
                            //no deeper group
                            //then check if
                            if (c_wordgroup.WordSpanListCount > 0)
                            {
                                int p1 = visitor.CurrentIndex;
                                //p2: suggest position
                                int p2 = FindInWordSpans(visitor, c_wordgroup);
                                if (p2 - p1 > 0)
                                {
                                    visitor.AddWordBreakAt(p2, WordKind.Text);
                                    visitor.SetCurrentIndex(p2);
                                }
                                else
                                {
                                    //on the same pos
                                    if (visitor.State == VisitorState.OutOfRangeChar)
                                    {
                                        visitor.AddWordBreakAtCurrentIndex();
                                        return;
                                    }
                                    else
                                    {
                                        bool foundCandidate = false;
                                        int  candi_count    = candidateBreakList.Count;
                                        if (candi_count == 0)
                                        {
                                            //no candidate
                                            //need to step back
                                            int latestBreakAt = visitor.LatestBreakAt;
                                            if (visitor.CurrentIndex - 1 > latestBreakAt)
                                            {
                                                //step back one char

                                                visitor.SetCurrentIndex(visitor.CurrentIndex - 1);
                                                char current_char = visitor.Char;
                                                //NOTE(review): the branches below are empty placeholders;
                                                //no action is taken for any of the three cases
                                                if (CanBeStartChar(current_char))
                                                {
                                                    if (visitor.CurrentIndex - 1 > latestBreakAt)
                                                    {
                                                    }
                                                    else
                                                    {
                                                    }
                                                }
                                                else
                                                {
                                                }
                                            }
                                            else
                                            {
                                                throw new NotSupportedException("i-3311");
                                            }
                                        }
                                        else
                                        {
                                            //test the live stack count (not the candi_count snapshot) so that
                                            //Pop() is never called on an empty stack; when every candidate is
                                            //rejected we fall through to the !foundCandidate handling below
                                            while (candidateBreakList.Count > 0)
                                            {
                                                int candi1 = candidateBreakList.Pop();
                                                //try
                                                visitor.SetCurrentIndex(visitor.LatestBreakAt + candi1);
                                                //check if we can use this candidate
                                                if (visitor.State != VisitorState.End)
                                                {
                                                    char next_char = visitor.Char;
                                                    if (CanBeStartChar(next_char))
                                                    {
                                                        //use this
                                                        //use this candidate if possible
                                                        visitor.AddWordBreakAtCurrentIndex();
                                                        foundCandidate = true;
                                                        break;
                                                    }
                                                }
                                                else
                                                {
                                                    //candidate accepted at the end of the text:
                                                    //stop here, same as the sibling candidate loops
                                                    visitor.AddWordBreakAtCurrentIndex();
                                                    foundCandidate = true;
                                                    break;
                                                }
                                            }
                                        }
                                        if (!foundCandidate)
                                        {
                                            //no next word, no candidate
                                            //skip this
                                            char next_char = visitor.Char;
                                            if (CanBeStartChar(next_char))
                                            {
                                                //use this
                                                //use this candidate if possible
                                                visitor.AddWordBreakAtCurrentIndex();
                                                foundCandidate = true;
                                                //NOTE(review): this break skips the 'i = visitor.CurrentIndex'
                                                //sync at the bottom of this else-block; confirm it is intended
                                                break;
                                            }
                                            else
                                            {
                                                //TODO: review here
                                                //fallback: emit a one-char word right after the latest break
                                                visitor.SetCurrentIndex(visitor.LatestBreakAt + 1);
                                                visitor.AddWordBreakAtCurrentIndex();
                                                visitor.SetCurrentIndex(visitor.LatestBreakAt);
                                            }
                                        }
                                    }
                                }
                            }
                            else
                            {
                                bool foundCandidate = false;
                                //try the candidates, longest first
                                while (candidateBreakList.Count > 0)
                                {
                                    int candi1 = candidateBreakList.Pop();
                                    //try
                                    visitor.SetCurrentIndex(visitor.LatestBreakAt + candi1);
                                    if (visitor.State == VisitorState.End)
                                    {
                                        visitor.AddWordBreakAtCurrentIndex();
                                        return;
                                    }
                                    //check if we can use this candidate
                                    char next_char = visitor.Char;
                                    if (!CanHandle(next_char))
                                    {
                                        //the following char is not ours, so the word must end here
                                        //use this candidate if possible
                                        visitor.AddWordBreakAtCurrentIndex();
                                        foundCandidate = true;
                                        break;
                                    }
                                    if (CanBeStartChar(next_char))
                                    {
                                        //use this
                                        //use this candidate if possible
                                        visitor.AddWordBreakAtCurrentIndex();
                                        foundCandidate = true;
                                        break;
                                    }
                                }
                                if (!foundCandidate)
                                {
                                    if (candidateLen > 0)
                                    {
                                        //use that candidate len
                                        visitor.AddWordBreakAtCurrentIndex();
                                        visitor.SetCurrentIndex(visitor.LatestBreakAt);
                                    }
                                }
                            }
                            //resume the outer scan from wherever the visitor ended up
                            i = visitor.CurrentIndex;
                        }
                    }
                }
            }
            //------
            //NOTE(review): CurrentIndex is compared with len - 1 here, while the scan itself is bounded by
            //startAt + len; confirm which one is intended when startAt > 0
            if (visitor.CurrentIndex >= len - 1)
            {
                //the last one
                visitor.State = VisitorState.End;
            }
        }