/// <summary>
/// Breaks the range <paramref name="startAt"/> .. <paramref name="startAt"/> + <paramref name="len"/>
/// of <paramref name="charBuff"/> into words. The text is loaded into the shared visitor and then
/// handed to a breaking engine; whenever the engine stops on a character it does not cover,
/// another engine is selected for that character and breaking resumes from there.
/// </summary>
/// <param name="charBuff">Buffer that holds the text; an empty buffer only resets the end position.</param>
/// <param name="startAt">Index of the first character to process.</param>
/// <param name="len">Number of characters to process.</param>
/// <exception cref="NotSupportedException">
/// The visitor ends an engine pass in a state other than End or OutOfRangeChar, or no other engine is
/// selected for an out-of-range character while <c>ThrowIfCharOutOfRange</c> is set.
/// </exception>
public void BreakWords(char[] charBuff, int startAt, int len)
{
    // an empty buffer has nothing to break
    if (charBuff.Length < 1)
    {
        _endAt = 0;
        return;
    }

    int stopAt = startAt + len;
    _endAt = stopAt;
    _visitor.LoadText(charBuff, startAt, len);

    // the initial engine is selected from the first character of the range
    BreakingEngine activeEngine = _breakingEngine = SelectEngine(charBuff[startAt]);

    while (true)
    {
        // the remaining length shrinks each time startAt moves forward
        activeEngine.BreakWord(_visitor, charBuff, startAt, stopAt - startAt);

        var outcome = _visitor.State;
        if (outcome == VisitorState.End)
        {
            // the whole range was consumed
            return;
        }
        if (outcome != VisitorState.OutOfRangeChar)
        {
            throw new NotSupportedException();
        }

        // the engine stopped on a character it does not cover: look for an engine that does
        BreakingEngine replacement = SelectEngine(_visitor.Char);
        if (replacement == activeEngine)
        {
            // the selection gave back the same engine, so nothing else can take this character
            if (ThrowIfCharOutOfRange)
            {
                throw new NotSupportedException($"A proper breaking engine for character '{_visitor.Char}' was not found.");
            }

            // step over the character and record a break of unknown kind after it
            startAt = _visitor.CurrentIndex + 1;
            _visitor.SetCurrentIndex(startAt);
            _visitor.AddWordBreakAtCurrentIndex(WordKind.Unknown);
            continue;
        }

        // resume at the offending character with the newly selected engine
        activeEngine = replacement;
        startAt = _visitor.CurrentIndex;
    }
}
/// <summary>
/// Scans <paramref name="len"/> characters of <paramref name="charBuff"/>, starting at
/// <paramref name="startAt"/>, and reports word breaks to <paramref name="visitor"/>.
/// For each start position the word groups of this engine are followed character by character;
/// every prefix that is a word is remembered as a candidate length, and candidates are tried
/// longest first.
/// </summary>
/// <param name="visitor">Receives the breaks and tracks the current index and parsing state.</param>
/// <param name="charBuff">Buffer that holds the text.</param>
/// <param name="startAt">Index of the first character to process.</param>
/// <param name="len">Number of characters to process.</param>
/// <remarks>
/// On return <c>visitor.State</c> is <c>OutOfRangeChar</c> when a character outside
/// <c>FirstUnicodeChar</c>..<c>LastUnicodeChar</c> was met, or <c>End</c> when the range was consumed.
/// </remarks>
/// <exception cref="NotSupportedException">
/// No candidate exists and the current index cannot be stepped back past the latest break ("i-3311").
/// </exception>
internal override void BreakWord(WordVisitor visitor, char[] charBuff, int startAt, int len)
{
    visitor.State = VisitorState.Parsing;
    this._startAt = startAt;
    this._len = len;
    this._endAt = startAt + len;

    char c_first = this.FirstUnicodeChar;
    char c_last = this.LastUnicodeChar;
    int endAt = startAt + len;

    // scratch stack of candidate word lengths, measured from the latest break; longest on top
    Stack<int> candidateBreakList = visitor.GetTempCandidateBreaks();

    for (int i = startAt; i < endAt;)
    {
        char c = charBuff[i];

        // characters outside this engine's range are handed back to the caller
        if (c < c_first || c > c_last)
        {
            visitor.State = VisitorState.OutOfRangeChar;
            return;
        }

        WordGroup wordgroup = GetWordGroupForFirstChar(c);
        if (wordgroup == null)
        {
            // no word starts with this character: emit it as a one-character word
            ++i;
            visitor.AddWordBreakAt(i, WordKind.Text);
            visitor.SetCurrentIndex(visitor.LatestBreakAt);
            continue;
        }

        // check if we can move next
        if (visitor.IsEnd)
        {
            visitor.State = VisitorState.End;
            return;
        }

        WordGroup c_wordgroup = wordgroup;
        candidateBreakList.Clear();
        int candidateLen = 1;
        if (c_wordgroup.PrefixIsWord)
        {
            candidateBreakList.Push(candidateLen);
        }

        bool continueRead = true;
        while (continueRead)
        {
            // move to the next character of the word being matched
            candidateLen++;
            visitor.SetCurrentIndex(i + 1);

            if (visitor.IsEnd)
            {
                // the visitor reports the end of the text while a word is still being matched
                visitor.State = VisitorState.End;

                WordGroup next1 = GetSubGroup(visitor, c_wordgroup);
                bool latest_candidate_isNotWord = false;
                if (next1 != null)
                {
                    // a deeper group exists, but the word may be incomplete at the end of the text
                    if (next1.PrefixIsWord)
                    {
                        candidateBreakList.Push(candidateLen);
                    }
                    else if (!DontMergeLastIncompleteWord)
                    {
                        // keep the trailing fragment, flagged as a possibly incomplete word
                        latest_candidate_isNotWord = true;
                        candidateBreakList.Push(candidateLen);
                    }
                }
                else if (c_wordgroup.WordSpanListCount > 0)
                {
                    int p1 = visitor.CurrentIndex;
                    // p2: position suggested by the word spans of the current group
                    int p2 = FindInWordSpans(visitor, c_wordgroup);
                    if (p2 - p1 > 0)
                    {
                        visitor.AddWordBreakAt(p2, WordKind.Text);
                        visitor.SetCurrentIndex(p2);
                        candidateBreakList.Clear();
                    }
                }

                i = endAt; //temp fix, TODO: review here

                // use the longest candidate, if there is one
                if (candidateBreakList.Count > 0)
                {
                    int candi1 = candidateBreakList.Pop();
                    visitor.SetCurrentIndex(visitor.LatestBreakAt + candi1);
                    if (latest_candidate_isNotWord)
                    {
                        visitor.AddWordBreakAtCurrentIndex(WordKind.TextIncomplete);
                    }
                    else
                    {
                        visitor.AddWordBreakAtCurrentIndex();
                    }
                    break;
                }
                return;
            }

            WordGroup next = GetSubGroup(visitor, c_wordgroup);
            if (next != null)
            {
                if (next.PrefixIsWord)
                {
                    candidateBreakList.Push(candidateLen);
                }
                c_wordgroup = next;
                i = visitor.CurrentIndex;

                if (visitor.IsEnd)
                {
                    i = endAt; //temp fix, TODO: review here

                    // choose the longest candidate that ends the text or is followed by a start char
                    while (candidateBreakList.Count > 0)
                    {
                        int candi1 = candidateBreakList.Pop();
                        visitor.SetCurrentIndex(visitor.LatestBreakAt + candi1);
                        if (visitor.State == VisitorState.End || CanBeStartChar(visitor.Char))
                        {
                            visitor.AddWordBreakAtCurrentIndex();
                            break;
                        }
                    }
                    continueRead = false;
                }
            }
            else
            {
                // no deeper group: settle the word with what has been collected so far
                continueRead = false;

                if (c_wordgroup.WordSpanListCount > 0)
                {
                    int p1 = visitor.CurrentIndex;
                    // p2: position suggested by the word spans of the current group
                    int p2 = FindInWordSpans(visitor, c_wordgroup);
                    if (p2 - p1 > 0)
                    {
                        visitor.AddWordBreakAt(p2, WordKind.Text);
                        visitor.SetCurrentIndex(p2);
                    }
                    else if (visitor.State == VisitorState.OutOfRangeChar)
                    {
                        // still on the same position and the visitor is flagged out-of-range
                        visitor.AddWordBreakAtCurrentIndex();
                        return;
                    }
                    else
                    {
                        bool foundCandidate = false;
                        if (candidateBreakList.Count == 0)
                        {
                            // no candidate: step back one character if that stays past the latest break
                            int latestBreakAt = visitor.LatestBreakAt;
                            if (visitor.CurrentIndex - 1 > latestBreakAt)
                            {
                                // the fallback below decides what to do with the character at this position
                                visitor.SetCurrentIndex(visitor.CurrentIndex - 1);
                            }
                            else
                            {
                                throw new NotSupportedException("i-3311");
                            }
                        }
                        else
                        {
                            // Fix: loop on the live stack count and stop at the first accepted candidate.
                            // The previous loop tested a count captured before popping and never updated,
                            // so it popped an empty stack once every candidate had been rejected.
                            while (candidateBreakList.Count > 0)
                            {
                                int candi1 = candidateBreakList.Pop();
                                visitor.SetCurrentIndex(visitor.LatestBreakAt + candi1);
                                if (visitor.State == VisitorState.End || CanBeStartChar(visitor.Char))
                                {
                                    visitor.AddWordBreakAtCurrentIndex();
                                    foundCandidate = true;
                                    break;
                                }
                            }
                        }

                        if (!foundCandidate)
                        {
                            // no next word and no usable candidate
                            char next_char = visitor.Char;
                            if (CanBeStartChar(next_char))
                            {
                                visitor.AddWordBreakAtCurrentIndex();
                                // leaves the inner loop without refreshing i from the visitor
                                break;
                            }
                            else
                            {
                                //TODO: review here
                                // emit a single character after the latest break
                                visitor.SetCurrentIndex(visitor.LatestBreakAt + 1);
                                visitor.AddWordBreakAtCurrentIndex();
                                visitor.SetCurrentIndex(visitor.LatestBreakAt);
                            }
                        }
                    }
                }
                else
                {
                    bool foundCandidate = false;
                    while (candidateBreakList.Count > 0)
                    {
                        int candi1 = candidateBreakList.Pop();
                        visitor.SetCurrentIndex(visitor.LatestBreakAt + candi1);
                        if (visitor.State == VisitorState.End)
                        {
                            visitor.AddWordBreakAtCurrentIndex();
                            return;
                        }

                        // accept the candidate when the following char belongs to another engine
                        // or can start a new word
                        char next_char = visitor.Char;
                        if (!CanHandle(next_char) || CanBeStartChar(next_char))
                        {
                            visitor.AddWordBreakAtCurrentIndex();
                            foundCandidate = true;
                            break;
                        }
                    }

                    if (!foundCandidate && candidateLen > 0)
                    {
                        // nothing acceptable: break at the position the visitor currently points to
                        visitor.AddWordBreakAtCurrentIndex();
                        visitor.SetCurrentIndex(visitor.LatestBreakAt);
                    }
                }

                i = visitor.CurrentIndex;
            }
        }
    }

    // NOTE(review): CurrentIndex is compared with len - 1 rather than endAt - 1; the caller passes
    // visitor.CurrentIndex back as startAt, so the index looks absolute - confirm the intended bound.
    if (visitor.CurrentIndex >= len - 1)
    {
        // the last one
        visitor.State = VisitorState.End;
    }
}