/// <summary>
/// Finishes this group without splitting it any further: every collected
/// span is kept in a flat array on the resulting <c>WordGroup</c>.
/// </summary>
/// <param name="textBuffer">Buffer the spans (and the prefix span) point into.</param>
void DoIndexOfSmallAmount(CustomDicTextBuffer textBuffer)
{
    // Convention (from the original author): the data is expected to be
    // sorted ascending before it is used together with wordSpanList.

    // Scan from the last span back to the first; as soon as one span has
    // exactly the same text as the prefix, the prefix itself is a word.
    int remaining = wordSpanList.Count;
    while (--remaining >= 0)
    {
        WordSpan candidate = wordSpanList[remaining];
        if (!candidate.SameTextContent(this.prefixSpan, textBuffer))
        {
            continue;
        }
        this.PrefixIsWord = true;
        break;
    }

    // No sub-groups here (null), only the flat list of spans.
    this._resultWordGroup = new WordGroup(
        this.prefixSpan,
        null,
        this.wordSpanList.ToArray(),
        this.PrefixIsWord);
}
/// <summary>
/// Loads a word list into the text buffer and builds the per-first-character
/// index over it. The caller is expected to pass a list that is already
/// sorted and free of duplicates; this method does not check either property.
/// </summary>
/// <param name="sortedWordList">
/// One word per entry. Entries are trimmed; entries that are empty after
/// trimming, or whose first character is '#', are skipped.
/// </param>
/// <exception cref="NotSupportedException">
/// Thrown when firstChar or lastChar has not been set (is '\0'), or when a
/// word is longer than <see cref="byte.MaxValue"/> characters.
/// </exception>
/// <remarks>
/// If a text buffer already exists the call returns without doing anything.
/// </remarks>
public void LoadSortedUniqueWordList(IEnumerable<string> sortedWordList)
{
    // A buffer already exists: nothing to do.
    if (textBuffer != null)
    {
        return;
    }
    if (firstChar == '\0' || lastChar == '\0')
    {
        throw new NotSupportedException();
    }

    // One developing group per distinct first character.
    Dictionary<char, DevelopingWordGroup> wordGroups = new Dictionary<char, DevelopingWordGroup>();
    textBuffer = new CustomDicTextBuffer(1024);

    foreach (string line in sortedWordList)
    {
        char[] lineBuffer = line.Trim().ToCharArray();
        int lineLen = lineBuffer.Length;
        char c0;
        if (lineLen > 0 && (c0 = lineBuffer[0]) != '#')
        {
            // A WordSpan stores its length in a single byte. This guard used
            // to exist only in DEBUG builds, so a release build would silently
            // truncate the length of an over-long word and index a corrupt
            // span. Reject such input in every build, and do it before the
            // word is appended to the buffer.
            if (lineLen > byte.MaxValue)
            {
                throw new NotSupportedException();
            }

            int startAt = textBuffer.CurrentPosition;
            textBuffer.AddWord(lineBuffer);
            WordSpan wordspan = new WordSpan(startAt, (byte)lineLen);

            // Each word group is keyed by first character; its prefix is a
            // one-character span pointing at the first word seen for it.
            DevelopingWordGroup found;
            if (!wordGroups.TryGetValue(c0, out found))
            {
                found = new DevelopingWordGroup(new WordSpan(startAt, 1));
                wordGroups.Add(c0, found);
            }
            found.AddWordSpan(wordspan);
        }
    }

    textBuffer.Freeze();

    // Build the index from the collected groups; the temporary dictionary is
    // not needed afterwards.
    DoIndex(wordGroups);
    wordGroups.Clear();
}
/// <summary>
/// Tells whether this span and <paramref name="another"/> cover the same
/// sequence of characters in <paramref name="textBuffer"/>.
/// </summary>
/// <param name="another">The span to compare with.</param>
/// <param name="textBuffer">Buffer both spans are read from.</param>
/// <returns>
/// true when both spans have the same length and every character matches;
/// otherwise false.
/// </returns>
public bool SameTextContent(WordSpan another, CustomDicTextBuffer textBuffer)
{
    // Different lengths can never be equal.
    if (another.len != this.len)
    {
        return false;
    }

    // Compare character by character, last position first.
    int pos = another.len;
    while (--pos >= 0)
    {
        char mine = this.GetChar(pos, textBuffer);
        char theirs = another.GetChar(pos, textBuffer);
        if (mine != theirs)
        {
            return false;
        }
    }
    return true;
}
/// <summary>
/// Appends every word reachable from this group to <paramref name="output"/>:
/// first the prefix (when it is a word itself), then the words of all
/// sub-groups, then this group's own word spans.
/// </summary>
/// <param name="textBuffer">Buffer the spans are resolved against.</param>
/// <param name="output">List that receives the words.</param>
internal void CollectAllWords(CustomDicTextBuffer textBuffer, List<string> output)
{
    if (this.PrefixIsWord)
    {
        string prefixText = GetPrefix(textBuffer);
        output.Add(prefixText);
    }

    if (subGroups != null)
    {
        foreach (WordGroup child in subGroups)
        {
            // Slots without a sub-group are left null.
            if (child == null)
            {
                continue;
            }
            child.CollectAllWords(textBuffer, output);
        }
    }

    if (wordSpans != null)
    {
        foreach (WordSpan member in wordSpans)
        {
            string word = member.GetString(textBuffer);
            output.Add(word);
        }
    }
}
/// <summary>
/// Returns the text of this group's prefix span.
/// </summary>
/// <param name="buffer">Buffer the prefix span is resolved against.</param>
/// <returns>The prefix as a string.</returns>
internal string GetPrefix(CustomDicTextBuffer buffer)
{
    string prefixText = prefixSpan.GetString(buffer);
    return prefixText;
}
// Assigned by DoIndex() and DoIndexOfSmallAmount().
WordGroup _resultWordGroup;

/// <summary>
/// Recursively splits the collected word spans into sub-groups keyed by the
/// character that follows the current prefix, then stores the finished
/// group in <c>_resultWordGroup</c>.
/// </summary>
/// <param name="textBuffer">Buffer all spans point into.</param>
/// <param name="owner">
/// Dictionary whose FirstChar/LastChar range sizes the sub-group table and
/// maps a character to its slot.
/// </param>
internal void DoIndex(CustomDicTextBuffer textBuffer, CustomDic owner)
{
    // Prefixes longer than 7 characters are not split any further; the
    // remaining spans are kept as a flat list.
    if (this.PrefixLen > 7)
    {
        DoIndexOfSmallAmount(textBuffer);
#if DEBUG
        dbugDataState = debugDataState.TooLongPrefix;
#endif
        return;
    }

    // One slot per character in [FirstChar, LastChar].
    if (subGroups == null)
    {
        subGroups = new DevelopingWordGroup[owner.LastChar - owner.FirstChar + 1];
    }

    int spanCount = wordSpanList.Count;
    int splitAt = this.PrefixLen; // index of the first character after the prefix
    bool prefixMatched = false;

    for (int n = 0; n < spanCount; ++n)
    {
        WordSpan span = wordSpanList[n];

        if (span.len <= splitAt)
        {
            // Nothing follows the prefix in this span; it can only be the
            // prefix itself. Stop comparing once a match has been seen.
            if (!prefixMatched && span.SameTextContent(this.prefixSpan, textBuffer))
            {
                prefixMatched = true;
                this.PrefixIsWord = true;
            }
            continue;
        }

        // Route the span to the sub-group of its next character, creating
        // the sub-group (with a prefix one character longer) on first use.
        char next = span.GetChar(splitAt, textBuffer);
        int slot = next - owner.FirstChar;
        DevelopingWordGroup bucket = subGroups[slot];
        if (bucket == null)
        {
            bucket = new DevelopingWordGroup(new WordSpan(span.startAt, (byte)(splitAt + 1)));
            subGroups[slot] = bucket;
        }
        bucket.AddWordSpan(span);
    }

#if DEBUG
    this.dbugDataState = debugDataState.Indexed;
#endif
    // The spans now live in the sub-groups; drop the local list.
    wordSpanList.Clear();
    wordSpanList = null;

    // Index the sub-groups. This is the performance trade-off noted by the
    // original author: a group that is not indexed further must be searched
    // linearly, so groups holding more than 2 spans are indexed recursively
    // (faster lookup, longer build time) and smaller ones stay flat.
    bool anyChild = false;
    foreach (DevelopingWordGroup child in this.subGroups)
    {
        if (child == null)
        {
            continue;
        }
        anyChild = true;

        if (child.WordSpanListCount > 2)
        {
            child.DoIndex(textBuffer, owner);
            continue;
        }
#if DEBUG
        child.dbugDataState = debugDataState.SmallAmountOfMembers;
#endif
        child.DoIndexOfSmallAmount(textBuffer);
    }

#if DEBUG
    this.dbugDataState = debugDataState.Indexed;
#endif

    // Convert the developing sub-groups into their finished results, or
    // discard the table altogether when no slot was ever filled.
    WordGroup[] finishedChildren = null;
    if (anyChild)
    {
        finishedChildren = new WordGroup[subGroups.Length];
        int k = subGroups.Length;
        while (--k >= 0)
        {
            DevelopingWordGroup child = subGroups[k];
            if (child != null)
            {
                finishedChildren[k] = child.ResultWordGroup;
            }
        }
    }
    else
    {
        subGroups = null;
    }

    // An indexed group carries sub-groups only; its flat span list is null.
    this._resultWordGroup = new WordGroup(
        this.prefixSpan,
        finishedChildren,
        null,
        this.PrefixIsWord);
}
/// <summary>
/// Reads this span's text out of <paramref name="textBuffer"/>.
/// </summary>
/// <param name="textBuffer">Buffer this span points into.</param>
/// <returns>The <c>len</c> characters starting at <c>startAt</c>, as a string.</returns>
public string GetString(CustomDicTextBuffer textBuffer)
{
    string text = textBuffer.GetString(startAt, len);
    return text;
}
/// <summary>
/// Reads one character of this span from <paramref name="textBuffer"/>.
/// </summary>
/// <param name="index">Offset of the character, relative to the span's start.</param>
/// <param name="textBuffer">Buffer this span points into.</param>
/// <returns>The character at <c>startAt + index</c> in the buffer.</returns>
public char GetChar(int index, CustomDicTextBuffer textBuffer)
{
    int absolutePos = startAt + index;
    return textBuffer.GetChar(absolutePos);
}
/// <summary>
/// Tries to match the visitor's text against the flat word-span list of a
/// group that has no sub-groups, testing the spans from last to first.
/// </summary>
/// <param name="visitor">Cursor over the text being broken; its index and state are updated.</param>
/// <param name="wordGroup">Group whose word spans are searched.</param>
/// <returns>
/// The new break position when a span matches and the following character is
/// acceptable (or the text ends, or the following character cannot be
/// handled); the saved visitor index when a span matches but the following
/// character cannot start a word; 0 when no span matches.
/// </returns>
/// <exception cref="NotSupportedException">The group has no word-span array.</exception>
int FindInWordSpans(WordVisitor visitor, WordGroup wordGroup)
{
    WordSpan[] candidates = wordGroup.GetWordSpans();
    if (candidates == null)
    {
        throw new NotSupportedException();
    }

    // Number of characters already consumed since the latest break; these
    // are the characters that led to this group.
    int consumed = visitor.CurrentIndex - visitor.LatestBreakAt;
    CustomDicTextBuffer buffer = CurrentCustomDic.TextBuffer;

    // Per the original author's note the un-indexed spans are sorted, so
    // walking from the last entry backwards tries the longest one first.
    for (int n = candidates.Length - 1; n >= 0; --n)
    {
        WordSpan candidate = candidates[n];
        int restoreIndex = visitor.CurrentIndex;
        char current = visitor.Char;
        int candidateLen = candidate.len;
        int matched = 0;

        // Compare the not-yet-consumed tail of the candidate with the text.
        // The loop body never runs when the candidate is not longer than
        // what has already been consumed.
        for (int p = consumed; p < candidateLen; ++p)
        {
            if (candidate.GetChar(p, buffer) != current)
            {
                break;
            }
            matched++;
            if (visitor.IsEnd)
            {
                // No more data in the visitor.
                break;
            }
            visitor.SetCurrentIndex(visitor.CurrentIndex + 1);
            current = visitor.Char;
        }

        if (consumed + matched != candidateLen)
        {
            // Not a full match: rewind and try the next candidate.
            visitor.SetCurrentIndex(restoreIndex);
            continue;
        }

        int newBreakAt = visitor.LatestBreakAt + candidateLen;
        visitor.SetCurrentIndex(newBreakAt);
        if (visitor.State == VisitorState.End)
        {
            return newBreakAt;
        }

        // Whether the next character may begin a new word depends on the
        // language handled by this engine.
        char following = visitor.Char;
        if (!CanHandle(following))
        {
            visitor.State = VisitorState.OutOfRangeChar;
            return newBreakAt;
        }
        if (CanBeStartChar(following))
        {
            return newBreakAt;
        }

        // The following character cannot start a word: go back to where
        // this candidate started.
        visitor.SetCurrentIndex(restoreIndex);
        return restoreIndex;
    }

    return 0;
}