Beispiel #1
0
        /// <summary>
        /// Finishes this group without splitting it any further: records whether the
        /// prefix itself appears among the collected word spans, then builds the result
        /// group from the prefix span and a flat array copy of the collected spans.
        /// </summary>
        /// <param name="textBuffer">Text buffer handed to the span comparison.</param>
        void DoIndexOfSmallAmount(CustomDicTextBuffer textBuffer)
        {
            // Convention: the data must be sorted (ascending) before it is used
            // with the word span list.

            // Scan from the last span towards the first and stop at the first span
            // whose text equals the prefix.
            int cursor = wordSpanList.Count;
            while (--cursor >= 0)
            {
                WordSpan candidate = wordSpanList[cursor];
                if (!candidate.SameTextContent(this.prefixSpan, textBuffer))
                {
                    continue;
                }

                this.PrefixIsWord = true;
                break;
            }

            // No sub groups are passed here (second argument is null); the spans
            // are handed over as a flat array instead.
            this._resultWordGroup = new WordGroup(
                this.prefixSpan,
                null,
                this.wordSpanList.ToArray(),
                this.PrefixIsWord);
        }
Beispiel #2
0
        /// <summary>
        /// Loads a word list into a new text buffer, groups the words by their first
        /// character and builds the index from those groups.
        /// </summary>
        /// <remarks>
        /// The call returns immediately when a text buffer already exists.
        /// Each line is trimmed; empty lines and lines whose first character is '#'
        /// are skipped. The input is expected to be unique and sorted; this method
        /// does not verify that.
        /// </remarks>
        /// <param name="sortedWordList">The words to load, one word per item.</param>
        /// <exception cref="NotSupportedException">
        /// Thrown when the first or last char has not been set, or when a word is
        /// longer than <see cref="byte.MaxValue"/> characters.
        /// </exception>
        public void LoadSortedUniqueWordList(IEnumerable <string> sortedWordList)
        {
            if (textBuffer != null)
            {
                // a word list was loaded before; keep it
                return;
            }
            if (firstChar == '\0' || lastChar == '\0')
            {
                throw new NotSupportedException();
            }

            // one developing group per distinct first character
            Dictionary<char, DevelopingWordGroup> wordGroups = new Dictionary<char, DevelopingWordGroup>();

            textBuffer = new CustomDicTextBuffer(1024);
            foreach (string line in sortedWordList)
            {
                char[] lineBuffer = line.Trim().ToCharArray();
                int lineLen = lineBuffer.Length;
                if (lineLen == 0)
                {
                    continue;
                }

                char c0 = lineBuffer[0];
                if (c0 == '#')
                {
                    // lines starting with '#' are not words
                    continue;
                }

                // A span stores its length in a byte. This used to be checked in DEBUG
                // builds only, so a release build silently truncated the length of a
                // longer word (the unchecked cast wraps around) and stored a span that
                // no longer described the word. Reject such a word in every build, and
                // do so before anything is appended to the buffer.
                if (lineLen > byte.MaxValue)
                {
                    throw new NotSupportedException(
                        "A word must not be longer than " + byte.MaxValue + " characters.");
                }

                int startAt = textBuffer.CurrentPosition;
                textBuffer.AddWord(lineBuffer);

                // each word group collects the text spans of its words
                WordSpan wordspan = new WordSpan(startAt, (byte)lineLen);

                DevelopingWordGroup found;
                if (!wordGroups.TryGetValue(c0, out found))
                {
                    // first word with this first character:
                    // its first char (a span of length 1) becomes the group's prefix
                    found = new DevelopingWordGroup(new WordSpan(startAt, 1));
                    wordGroups.Add(c0, found);
                }
                found.AddWordSpan(wordspan);
            }

            textBuffer.Freeze();

            // build the index from the collected groups
            DoIndex(wordGroups);

            // the temporary groups are not used after indexing
            wordGroups.Clear();
        }
Beispiel #3
0
 /// <summary>
 /// Tells whether this span and <paramref name="another"/> have the same length
 /// and the same character at every position, both read through
 /// <paramref name="textBuffer"/>.
 /// </summary>
 /// <param name="another">The span to compare with.</param>
 /// <param name="textBuffer">The buffer both spans read their characters from.</param>
 /// <returns>true when length and all characters are equal; otherwise false.</returns>
 public bool SameTextContent(WordSpan another, CustomDicTextBuffer textBuffer)
 {
     // spans of different length can never be equal
     if (another.len != this.len)
     {
         return false;
     }

     // compare from the last character down to the first
     int pos = another.len;
     while (pos > 0)
     {
         --pos;
         if (this.GetChar(pos, textBuffer) != another.GetChar(pos, textBuffer))
         {
             return false;
         }
     }
     return true;
 }
Beispiel #4
0
 /// <summary>
 /// Appends every word reachable from this group to <paramref name="output"/>:
 /// first the prefix (when it is flagged as a word), then the words of all
 /// non-null sub groups, then the words of this group's own spans.
 /// </summary>
 /// <param name="textBuffer">Buffer used to turn spans into strings.</param>
 /// <param name="output">List that receives the words.</param>
 internal void CollectAllWords(CustomDicTextBuffer textBuffer, List <string> output)
 {
     if (this.PrefixIsWord)
     {
         string prefixText = GetPrefix(textBuffer);
         output.Add(prefixText);
     }

     if (subGroups != null)
     {
         foreach (WordGroup child in subGroups)
         {
             // empty slots are skipped
             if (child == null)
             {
                 continue;
             }
             child.CollectAllWords(textBuffer, output);
         }
     }

     if (wordSpans != null)
     {
         foreach (var leafSpan in wordSpans)
         {
             string word = leafSpan.GetString(textBuffer);
             output.Add(word);
         }
     }
 }
Beispiel #5
0
 /// <summary>
 /// Returns the text of this group's prefix span, read from <paramref name="buffer"/>.
 /// </summary>
 /// <param name="buffer">Buffer the prefix span reads its characters from.</param>
 /// <returns>The prefix as a string.</returns>
 internal string GetPrefix(CustomDicTextBuffer buffer)
 {
     string prefixText = prefixSpan.GetString(buffer);
     return prefixText;
 }
Beispiel #6
0
        // assigned at the end of DoIndex() (and of DoIndexOfSmallAmount())
        WordGroup _resultWordGroup;

        /// <summary>
        /// Splits the collected word spans into sub groups by the character that
        /// follows the current prefix, indexes every sub group and finally builds
        /// the result group for this node. The method is recursive.
        /// </summary>
        /// <param name="textBuffer">Buffer the word spans read their characters from.</param>
        /// <param name="owner">
        /// Dictionary whose FirstChar/LastChar define the size of the sub group table
        /// and the slot of each character.
        /// </param>
        internal void DoIndex(CustomDicTextBuffer textBuffer, CustomDic owner)
        {
            // A prefix longer than 7 chars is not split any further.
            if (this.PrefixLen > 7)
            {
                DoIndexOfSmallAmount(textBuffer);
#if DEBUG
                dbugDataState = debugDataState.TooLongPrefix;
#endif
                return;
            }

            // one slot per char in the range [owner.FirstChar, owner.LastChar]
            if (subGroups == null)
            {
                subGroups = new DevelopingWordGroup[owner.LastChar - owner.FirstChar + 1];
            }

            // ---- step 1: distribute the spans over the sub groups ----
            bool prefixMatched = false;
            int spanCount = wordSpanList.Count;
            int splitAt = this.PrefixLen;
            for (int n = 0; n < spanCount; ++n)
            {
                WordSpan span = wordSpanList[n];
                if (span.len <= splitAt)
                {
                    // The span has no char after the prefix; it can only be the
                    // prefix itself. Stop comparing once a match has been found.
                    if (!prefixMatched && span.SameTextContent(this.prefixSpan, textBuffer))
                    {
                        prefixMatched = true;
                        this.PrefixIsWord = true;
                    }
                    continue;
                }

                // the char right after the prefix selects the sub group
                char splitChar = span.GetChar(splitAt, textBuffer);
                int slot = splitChar - owner.FirstChar;
                DevelopingWordGroup bucket = subGroups[slot];
                if (bucket == null)
                {
                    // first span for this char: its prefix is one char longer than ours
                    bucket = new DevelopingWordGroup(new WordSpan(span.startAt, (byte)(splitAt + 1)));
                    subGroups[slot] = bucket;
                }
                bucket.AddWordSpan(span);
            }
#if DEBUG
            this.dbugDataState = debugDataState.Indexed;
#endif
            // the flat list is not needed any more
            wordSpanList.Clear();
            wordSpanList = null;

            // ---- step 2: index every sub group ----
            // Performance trade-off: a sub group that is not indexed further has to be
            // searched linearly, which is slow. So a sub group is indexed recursively
            // while it holds more than 2 spans; that makes searching faster but may
            // increase the dictionary-building time.
            bool anyChild = false;
            foreach (DevelopingWordGroup child in this.subGroups)
            {
                if (child == null)
                {
                    continue;
                }
                anyChild = true;

                if (child.WordSpanListCount <= 2)
                {
#if DEBUG
                    child.dbugDataState = debugDataState.SmallAmountOfMembers;
#endif
                    child.DoIndexOfSmallAmount(textBuffer);
                }
                else
                {
                    child.DoIndex(textBuffer, owner);
                }
            }
#if DEBUG
            this.dbugDataState = debugDataState.Indexed;
#endif

            // ---- step 3: collect the results of the sub groups ----
            WordGroup[] finishedChildren = null;
            if (anyChild)
            {
                finishedChildren = new WordGroup[subGroups.Length];
                int slot = subGroups.Length;
                while (--slot >= 0)
                {
                    DevelopingWordGroup child = subGroups[slot];
                    if (child != null)
                    {
                        finishedChildren[slot] = child.ResultWordGroup;
                    }
                }
            }
            else
            {
                // no slot was used: drop the empty table
                subGroups = null;
            }

            this._resultWordGroup = new WordGroup(
                this.prefixSpan,
                finishedChildren,
                null,
                this.PrefixIsWord);
        }
Beispiel #7
0
 /// <summary>
 /// Reads the text of this span from <paramref name="textBuffer"/>, using the
 /// span's start position and length.
 /// </summary>
 /// <param name="textBuffer">Buffer to read from.</param>
 /// <returns>The string the buffer returns for this span.</returns>
 public string GetString(CustomDicTextBuffer textBuffer)
 {
     string spanText = textBuffer.GetString(startAt, len);
     return spanText;
 }
Beispiel #8
0
 /// <summary>
 /// Reads one character of this span from <paramref name="textBuffer"/>.
 /// </summary>
 /// <param name="index">Offset of the character, counted from the span's start.</param>
 /// <param name="textBuffer">Buffer to read from.</param>
 /// <returns>The character at position start + <paramref name="index"/> of the buffer.</returns>
 public char GetChar(int index, CustomDicTextBuffer textBuffer)
 {
     int absolutePosition = startAt + index;
     return textBuffer.GetChar(absolutePosition);
 }
        /// <summary>
        /// Tries to match the text at the visitor's position against the word spans of
        /// <paramref name="wordGroup"/>, testing the spans one by one from the last to
        /// the first.
        /// </summary>
        /// <param name="visitor">
        /// Supplies the current character and index; its current index (and, for an
        /// unsupported following character, its state) is changed by this method.
        /// </param>
        /// <param name="wordGroup">Group whose word spans are tested.</param>
        /// <returns>
        /// The new break position (latest break + word length) when a word matched and
        /// the input ended, or the following char can start a word, or the following
        /// char cannot be handled; the saved index when a word matched but the
        /// following char cannot start a word; 0 when no span matched.
        /// </returns>
        /// <exception cref="NotSupportedException">The group has no word spans.</exception>
        int FindInWordSpans(WordVisitor visitor, WordGroup wordGroup)
        {
            WordSpan[] wordSpans = wordGroup.GetWordSpans();
            if (wordSpans == null)
            {
                throw new NotSupportedException();
            }

            // At this word group there are no sub groups any more,
            // so the words are tested one by one,
            // each starting after the part that was already read.

            // distance between the latest break and the current index;
            // presumably the number of chars already matched as prefix -- confirm against caller
            int readLen = visitor.CurrentIndex - visitor.LatestBreakAt;
            int nwords  = wordSpans.Length;

            CustomDicTextBuffer currentTextBuffer = CurrentCustomDic.TextBuffer;

            // The un-indexed strings are sorted,
            // so the search runs from the last span (the longest one, per the original note) to the first.
            for (int i = nwords - 1; i >= 0; --i)
            {
                // test one word per iteration
                WordSpan w = wordSpans[i];
#if DEBUG
                //string dbugstr = w.GetString(currentTextBuffer);
#endif

                // remember where the visitor was, so a failed attempt can be undone
                int  savedIndex     = visitor.CurrentIndex;
                char c              = visitor.Char;
                int  wordLen        = w.len;
                int  matchCharCount = 0;
                if (wordLen > readLen)
                {
                    // compare the remaining chars of the word with the visitor's chars
                    for (int p = readLen; p < wordLen; ++p)
                    {
                        char c2 = w.GetChar(p, currentTextBuffer);
                        if (c2 == c)
                        {
                            matchCharCount++;
                            // matched: move the visitor to the next char, if there is one
                            if (!visitor.IsEnd)
                            {
                                visitor.SetCurrentIndex(visitor.CurrentIndex + 1);
                                c = visitor.Char;
                            }
                            else
                            {
                                // no more data in visitor

                                break;
                            }
                        }
                        else
                        {
                            // mismatch: this word does not fit
                            break;
                        }
                    }
                }
                // the whole word matched when the already-read part plus the newly
                // matched chars add up to the word length
                if (readLen + matchCharCount == wordLen)
                {
                    int newBreakAt = visitor.LatestBreakAt + wordLen;
                    visitor.SetCurrentIndex(newBreakAt);
                    //--------------------------------------------
                    if (visitor.State == VisitorState.End)
                    {
                        return(newBreakAt);
                    }
                    // Check whether the next char can be the first char of a new word;
                    // this depends on each language.
                    char canBeStartChar = visitor.Char;
                    if (CanHandle(canBeStartChar))
                    {
                        if (CanBeStartChar(canBeStartChar))
                        {
                            return(newBreakAt);
                        }
                        else
                        {
                            // go back to savedIndex
                            // NOTE(review): this returns without testing the remaining
                            // spans -- confirm that this is intended
                            visitor.SetCurrentIndex(savedIndex);
                            return(savedIndex);
                        }
                    }
                    else
                    {
                        // the next char is not handled here: flag it on the visitor
                        visitor.State = VisitorState.OutOfRangeChar;
                        return(newBreakAt);
                    }
                }
                // no match for this word: undo the visitor movement and try the next span
                visitor.SetCurrentIndex(savedIndex);
            }
            // no span matched
            return(0);
        }