/// <summary>
/// Walks <paramref name="charBuff"/> from <paramref name="startAt"/> up to (but not including)
/// <c>startAt + len</c> and reports word-break positions to <paramref name="visitor"/>.
/// Characters are matched against word groups obtained from <c>GetWordGroupForFirstChar</c>,
/// <c>GetSubGroup</c> and <c>FindInWordSpans</c>; lengths at which a group reports
/// <c>PrefixIsWord</c> are pushed onto a candidate stack and the most recently pushed usable
/// candidate is chosen as the break.
/// </summary>
/// <param name="visitor">
/// Receives the break positions. Its <c>State</c> is set to <c>Parsing</c> on entry and to
/// <c>OutOfRangeChar</c> or <c>End</c> on the corresponding exits; its <c>SpanBreakInfo</c> is set to <c>_breakInfo</c>.
/// </param>
/// <param name="charBuff">Character buffer that is indexed directly by the scan position.</param>
/// <param name="startAt">Index in <paramref name="charBuff"/> where scanning starts.</param>
/// <param name="len">Number of characters to scan, counted from <paramref name="startAt"/>.</param>
/// <exception cref="NotSupportedException">
/// Thrown with message "i-3311" when no candidate break exists and the current index
/// cannot be stepped back past the latest break.
/// </exception>
internal override void BreakWord(WordVisitor visitor, char[] charBuff, int startAt, int len)
{
    visitor.State = VisitorState.Parsing;
    char c_first = this.FirstUnicodeChar;
    char c_last = this.LastUnicodeChar;
    int endAt = startAt + len;
    visitor.SpanBreakInfo = _breakInfo;
    // Stack of candidate word lengths (relative to visitor.LatestBreakAt, see the Pop sites below).
    Stack <int> candidateBreakList = visitor.GetTempCandidateBreaks();
    bool breakPeroidInTextSpan = BreakPeroidInTextSpan;

    // NOTE: the for-statement has no increment clause; 'i' is advanced explicitly inside the body.
    for (int i = startAt; i < endAt;)
    {
    ENTER_LOOP:
        // Find a proper start of a word.
        // NOTE(review): 'goto ENTER_LOOP' re-enters here after ++i without re-testing i < endAt,
        // so charBuff[i] relies on the caller's buffer extending past endAt or on visitor.IsEnd — confirm.
        char c = charBuff[i];
        //----------------------
        // Check whether c is within this engine's responsibility [c_first, c_last].
        if ((c < c_first || c > c_last))
        {
            if (c == '.')
            {
                if (breakPeroidInTextSpan)
                {
                    // Out of our range: report it and let the caller decide.
                    visitor.State = VisitorState.OutOfRangeChar;
                    return;
                }
                else
                {
                    // Keep the period inside the current span (e.g. "A.B.C") and move on.
                    ++i;
                    continue;
                }
            }
            else
            {
                // Out of our range: report it and let the caller decide.
                visitor.State = VisitorState.OutOfRangeChar;
                return;
            }
        }
        //----------------------
        WordGroup wordgroup = GetWordGroupForFirstChar(c);
        if (wordgroup == null)
        {
            // No word group starts with this char: emit it as a one-char word and continue.
            ++i;
            visitor.AddWordBreak_AndSetCurrentIndex(i, WordKind.Text);
        }
        else
        {
            // Check whether we can move to the next char at all.
            if (visitor.IsEnd)
            {
                visitor.State = VisitorState.End;
                return;
            }
            //---------------------
            WordGroup c_wordgroup = wordgroup;
            candidateBreakList.Clear();
            int candidateLen = 1;
            if (c_wordgroup.PrefixIsWord)
            {
                // The single first char is already a word by itself.
                candidateBreakList.Push(candidateLen);
            }

            bool continueRead = true;
            while (continueRead)
            {
                // Not at the end yet: advance the visitor to the next char.
                visitor.SetCurrentIndex(i + 1);
                if (visitor.IsEnd)
                {
                    // Reached the end of input while still inside a word group.
                    visitor.State = VisitorState.End;
                    //----------------------------------------
                    WordGroup next1 = GetSubGroup(visitor, c_wordgroup);
                    bool latest_candidate_isNotWord = false;
                    if (next1 != null)
                    {
                        // A deeper group exists, i.e. there is a link to a longer word,
                        // but the input ends here so the word may be incomplete: a decision is needed.
                        if (next1.PrefixIsWord)
                        {
                            candidateBreakList.Push(candidateLen);
                        }
                        else
                        {
                            // NOTE(review): this flag is set only when !DontMergeLastIncompleteWord,
                            // but it is read below only when DontMergeLastIncompleteWord is true.
                            // Unless that property changes in between, WordKind.TextIncomplete
                            // is never selected — confirm the intended condition.
                            if (!DontMergeLastIncompleteWord)
                            {
                                latest_candidate_isNotWord = true; // the word may have an error
                                candidateBreakList.Push(candidateLen);
                            }
                        }
                        //---------------------
                    }
                    else
                    {
                        if (c_wordgroup.WordSpanListCount > 0)
                        {
                            int p1 = visitor.CurrentIndex;
                            // p2: suggested break position
                            int p2 = FindInWordSpans(visitor, c_wordgroup);
                            if (p2 - p1 > 0)
                            {
                                // A word span matched beyond the current position: it wins over the candidates.
                                visitor.AddWordBreak_AndSetCurrentIndex(p2, WordKind.Text);
                                candidateBreakList.Clear();
                            }
                        }
                    }
                    //----------------------------------------
                    i = endAt; // temp fix, TODO: review here
                    // Choose the best match.
                    if (candidateBreakList.Count > 0)
                    {
                        if (DontMergeLastIncompleteWord)
                        {
                            // Break at the most recently pushed candidate length.
                            int candi1 = candidateBreakList.Pop();
                            visitor.SetCurrentIndex(visitor.LatestBreakAt + candi1);
                            visitor.AddWordBreakAtCurrentIndex(
                                latest_candidate_isNotWord ? WordKind.TextIncomplete : WordKind.Text);
                        }
                        else
                        {
                            visitor.AddWordBreakAtCurrentIndex();
                        }
                        return;
                    }
                    continueRead = false;
                    //----------------------------------------
                    // NOTE(review): compares against len - 1, not startAt + len - 1 — confirm that
                    // this is intended when startAt > 0.
                    if (visitor.CurrentIndex >= len - 1)
                    {
                        // Flush the remaining chars.
                        if (visitor.LatestBreakAt < startAt + len)
                        {
                            visitor.AddWordBreakAtCurrentIndex();
                        }
                    }
                    return;
                }
                //----------------------------------------
                candidateLen++;
                if (!breakPeroidInTextSpan && visitor.Char == '.')
                {
                    // Treat as an abbreviation: skip the period and restart matching at the top of the loop body.
                    ++i;
                    goto ENTER_LOOP;
                }
                //----------------------------------------
                WordGroup next = GetSubGroup(visitor, c_wordgroup);
                // for debug:
                //string prefix = (next == null) ? "" : next.GetPrefix(CurrentCustomDic.TextBuffer);
                if (next != null)
                {
                    if (next.PrefixIsWord)
                    {
                        candidateBreakList.Push(candidateLen);
                    }
                    // Descend into the deeper group and keep reading from the visitor's position.
                    c_wordgroup = next;
                    i = visitor.CurrentIndex;
                    if (visitor.IsEnd)
                    {
                        i = endAt; // temp fix, TODO: review here
#if DEBUG
                        bool dbugFoundCandidate = false;
#endif
                        // Choose the best match: try candidates from the most recently pushed one.
                        while (candidateBreakList.Count > 0)
                        {
                            int candi1 = candidateBreakList.Pop();
                            // Try this candidate.
                            visitor.SetCurrentIndex(visitor.LatestBreakAt + candi1);
                            if (visitor.State != VisitorState.End)
                            {
                                char next_char = visitor.Char;
                                if (CanBeStartChar(next_char))
                                {
                                    // The char after the candidate can start a word: use this candidate.
                                    visitor.AddWordBreakAtCurrentIndex();
#if DEBUG
                                    dbugFoundCandidate = true;
#endif
                                    break;
                                }
                            }
                            else
                            {
                                visitor.AddWordBreakAtCurrentIndex();
#if DEBUG
                                dbugFoundCandidate = true;
#endif
                                break;
                            }
                        }
                        continueRead = false;
                    }
                }
                else
                {
                    continueRead = false;
                    // No deeper group: check the word spans of the current group.
                    if (c_wordgroup.WordSpanListCount > 0)
                    {
                        int p1 = visitor.CurrentIndex;
                        // p2: suggested break position
                        int p2 = FindInWordSpans(visitor, c_wordgroup);
                        if (p2 - p1 > 0)
                        {
                            visitor.AddWordBreak_AndSetCurrentIndex(p2, WordKind.Text);
                        }
                        else
                        {
                            // Still on the same position: no word span matched.
                            if (visitor.State == VisitorState.OutOfRangeChar)
                            {
                                if (this.DontMergeLastIncompleteWord)
                                {
                                    // Choose the best match.
                                    int p3 = visitor.CurrentIndex;
                                    int p4 = p3;
                                    if (candidateBreakList.Count > 0)
                                    {
                                        int candi1 = candidateBreakList.Pop();
                                        visitor.SetCurrentIndex(p4 = (visitor.LatestBreakAt + candi1));
                                        visitor.AddWordBreakAtCurrentIndex();
                                    }
                                    // If the candidate break landed before the original position,
                                    // also break at p1 so the remainder is emitted.
                                    if (p4 < p3)
                                    {
                                        visitor.SetCurrentIndex(p1);
                                        visitor.AddWordBreakAtCurrentIndex();
                                    }
                                    return;
                                }
                                else
                                {
                                    visitor.AddWordBreakAtCurrentIndex();
                                    return;
                                }
                            }
                            else
                            {
                                bool foundCandidate = false;
                                int candi_count = candidateBreakList.Count;
                                if (candi_count == 0)
                                {
                                    // No candidate: need to step back one char.
                                    int latestBreakAt = visitor.LatestBreakAt;
                                    if (visitor.CurrentIndex - 1 > latestBreakAt)
                                    {
                                        // Step back.
                                        visitor.SetCurrentIndex(visitor.CurrentIndex - 1);
                                        // TODO: review here again
#if DEBUG
                                        // Debug-only scaffolding; every branch is empty.
                                        char current_char = visitor.Char;
                                        if (CanBeStartChar(current_char))
                                        {
                                            if (visitor.CurrentIndex - 1 > latestBreakAt)
                                            {
                                            }
                                            else
                                            {
                                            }
                                        }
                                        else
                                        {
                                        }
#endif
                                    }
                                    else
                                    {
                                        throw new NotSupportedException("i-3311");
                                    }
                                }
                                else
                                {
                                    while (candidateBreakList.Count > 0)
                                    {
                                        int candi1 = candidateBreakList.Pop();
                                        // Try this candidate.
                                        visitor.SetCurrentIndex(visitor.LatestBreakAt + candi1);
                                        // Check whether this candidate can be used.
                                        if (visitor.State != VisitorState.End)
                                        {
                                            char next_char = visitor.Char;
                                            if (CanBeStartChar(next_char))
                                            {
                                                // The following char can start a word: use this candidate.
                                                visitor.AddWordBreakAtCurrentIndex();
                                                foundCandidate = true;
                                                break;
                                            }
                                        }
                                        else
                                        {
                                            visitor.AddWordBreakAtCurrentIndex();
                                            foundCandidate = true;
                                            break;
                                        }
                                    }
                                }

                                if (!foundCandidate)
                                {
                                    // No next word and no usable candidate.
                                    char next_char = visitor.Char;
                                    if (CanBeStartChar(next_char))
                                    {
                                        // The current char can start a word: break here.
                                        visitor.AddWordBreakAtCurrentIndex();
                                        foundCandidate = true;
                                        // NOTE(review): this break exits while (continueRead), so the
                                        // 'i = visitor.CurrentIndex;' at the end of this else-branch is
                                        // skipped on this path — confirm that 'i' is meant to stay as is.
                                        break;
                                    }
                                    else
                                    {
                                        // TODO: review here
                                        // Emit a single-char break right after the latest break,
                                        // then move the visitor onto that new break position.
                                        visitor.SetCurrentIndex(visitor.LatestBreakAt + 1);
                                        visitor.AddWordBreakAtCurrentIndex();
                                        visitor.SetCurrentIndex(visitor.LatestBreakAt);
                                    }
                                }
                            }
                        }
                    }
                    else
                    {
                        bool foundCandidate = false;
                        while (candidateBreakList.Count > 0)
                        {
                            int candi1 = candidateBreakList.Pop();
                            // Try this candidate.
                            visitor.SetCurrentIndex(visitor.LatestBreakAt + candi1);
                            if (visitor.State == VisitorState.End)
                            {
                                visitor.AddWordBreakAtCurrentIndex();
                                return;
                            }
                            // Check whether this candidate can be used.
                            char next_char = visitor.Char;
                            if (!CanHandle(next_char))
                            {
                                // The following char is not handled by this engine: use this candidate.
                                visitor.AddWordBreakAtCurrentIndex();
                                foundCandidate = true;
                                break;
                            }
                            if (CanBeStartChar(next_char))
                            {
                                // The following char can start a word: use this candidate.
                                visitor.AddWordBreakAtCurrentIndex();
                                foundCandidate = true;
                                break;
                            }
                        }
                        if (!foundCandidate)
                        {
                            if (candidateLen > 0)
                            {
                                // Fall back to breaking at the visitor's current index.
                                visitor.AddWordBreakAtCurrentIndex();
                                visitor.SetCurrentIndex(visitor.LatestBreakAt);
                            }
                        }
                    }
                    // Resume the outer scan from wherever the visitor ended up.
                    i = visitor.CurrentIndex;
                }
            }
        }
    }
    //------
    // NOTE(review): compares against len - 1, not startAt + len - 1 — confirm for startAt > 0.
    if (visitor.CurrentIndex >= len - 1)
    {
        // The last one: mark the end and flush whatever has not been broken yet.
        visitor.State = VisitorState.End;
        if (visitor.LatestBreakAt < startAt + len)
        {
            visitor.AddWordBreakAt(startAt + len, WordKind.Text);
        }
    }
}
/// <summary>
/// Handles the leading run of characters accepted by <c>IsArabicChar</c> in
/// <paramref name="charBuff"/>: the run is loaded into <c>_runAdapter</c> as a single
/// <c>Line</c>, and a word break is reported to <paramref name="visitor"/> for each
/// right-to-left run the adapter yields, until a non right-to-left run is met.
/// </summary>
/// <param name="visitor">
/// Receives the breaks. On return its <c>State</c> is <c>End</c> when its
/// <c>CurrentIndex</c> equals <c>startAt + len</c>; otherwise it is <c>OutOfRangeChar</c>
/// so that another parser can continue.
/// </param>
/// <param name="charBuff">Character buffer to scan.</param>
/// <param name="startAt">Index in <paramref name="charBuff"/> where scanning starts.</param>
/// <param name="len">Number of characters available from <paramref name="startAt"/>.</param>
internal override void BreakWord(WordVisitor visitor, char[] charBuff, int startAt, int len)
{
    // Use custom parsing.
    visitor.State = VisitorState.Parsing;
    RunAgent agent = _runAdapter.Agent;

    // Measure the unbroken prefix of Arabic chars; stop at the first other char.
    int arabicLen = 0;
    int lim = startAt + len;
    for (int i = startAt; i < lim; ++i)
    {
        if (!IsArabicChar(charBuff[i]))
        {
            break;
        }
        arabicLen++;
    }

    if (arabicLen == 0)
    {
        // Nothing for this engine to handle at startAt.
        visitor.State = VisitorState.OutOfRangeChar;
        return;
    }

    visitor.SpanBreakInfo = _breakInfo;

    // Only the collected Arabic chars are handed to the run adapter.
    Line line1 = new Line(new string(charBuff, startAt, arabicLen));
    _runAdapter.LoadLine(line1);

    while (_runAdapter.MoveNext())
    {
        if (!agent.IsRightToLeft)
        {
            // Not a right-to-left run: leave it to another engine.
            break;
        }

        // Temp fix carried over from the original implementation.
        // NOTE(review): the break position uses only the run length and ignores the run's
        // offset inside the line; if the adapter can yield more than one right-to-left run,
        // later breaks may not land where intended — confirm against the run adapter's contract.
        visitor.AddWordBreak_AndSetCurrentIndex(startAt + agent.Length, WordKind.Text);
    }

    if (visitor.CurrentIndex == startAt + len)
    {
        visitor.State = VisitorState.End;
    }
    else
    {
        // Continue with another parser.
        visitor.State = VisitorState.OutOfRangeChar;
    }
}