private SparseVector <double> ProcessDocument(string document) { Set <string> docWords = new Set <string>(); Dictionary <int, int> tfVec = new Dictionary <int, int>(); ArrayList <WordStem> nGrams = new ArrayList <WordStem>(mMaxNGramLen); mTokenizer.Text = document; foreach (string token in mTokenizer) { string word = token.Trim().ToLower(); if (mStopWords == null || !mStopWords.Contains(word)) { string stem = mStemmer == null ? word : mStemmer.GetStem(word).Trim().ToLower(); if (nGrams.Count < mMaxNGramLen) { WordStem wordStem = new WordStem(); wordStem.Word = word; wordStem.Stem = stem; nGrams.Add(wordStem); if (nGrams.Count < mMaxNGramLen) { continue; } } else { WordStem wordStem = nGrams[0]; wordStem.Word = word; wordStem.Stem = stem; for (int i = 0; i < mMaxNGramLen - 1; i++) { nGrams[i] = nGrams[i + 1]; } nGrams[mMaxNGramLen - 1] = wordStem; } ProcessDocumentNGrams(nGrams, 0, tfVec, docWords); } } int startIdx = nGrams.Count == mMaxNGramLen ? 1 : 0; for (int i = startIdx; i < nGrams.Count; i++) { ProcessDocumentNGrams(nGrams, i, tfVec, docWords); } SparseVector <double> docVec = new SparseVector <double>(); foreach (KeyValuePair <int, int> tfItem in tfVec) { docVec.InnerIdx.Add(tfItem.Key); docVec.InnerDat.Add(tfItem.Value); } docVec.Sort(); return(docVec); }
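ProcessDocument above (and the other variants of it below) builds its n-grams with a buffer of at most mMaxNGramLen WordStem entries that is slid over the token stream: every time the buffer is full, and once more at the end of the document, the n-grams starting at the head of the buffer are counted. The standalone sketch below reduces that loop to plain strings so the buffering logic is easier to follow; it is illustrative only, assumes whitespace tokenization, and skips stop-word removal, stemming and vocabulary lookup.

using System;
using System.Collections.Generic;

static class NGramWindowSketch
{
    // Count all 1..maxNGramLen grams of a document with the same sliding-buffer
    // scheme as ProcessDocument: fill the buffer, then shift it left by one for
    // every further token, emitting the n-grams that start at the buffer head.
    public static Dictionary<string, int> CountNGrams(string document, int maxNGramLen)
    {
        var counts = new Dictionary<string, int>();
        var buffer = new List<string>(maxNGramLen);
        string[] tokens = document.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
        foreach (string token in tokens)
        {
            string word = token.Trim().ToLower();
            if (buffer.Count < maxNGramLen)
            {
                buffer.Add(word);
                if (buffer.Count < maxNGramLen) { continue; } // window not full yet
            }
            else
            {
                for (int i = 0; i < maxNGramLen - 1; i++) { buffer[i] = buffer[i + 1]; } // shift left
                buffer[maxNGramLen - 1] = word;
            }
            Count(buffer, 0, counts); // n-grams starting at the window head
        }
        // the tail of the last window still holds uncounted shorter n-grams
        int startIdx = buffer.Count == maxNGramLen ? 1 : 0;
        for (int i = startIdx; i < buffer.Count; i++) { Count(buffer, i, counts); }
        return counts;
    }

    private static void Count(List<string> buffer, int startIdx, Dictionary<string, int> counts)
    {
        string nGram = buffer[startIdx];
        for (int end = startIdx + 1; ; end++)
        {
            counts[nGram] = counts.TryGetValue(nGram, out int c) ? c + 1 : 1;
            if (end >= buffer.Count) { break; }
            nGram += " " + buffer[end];
        }
    }
}

For "a b c d" with maxNGramLen = 2 this yields a, b, c, d and "a b", "b c", "c d", each with count 1 — the same multiset of n-grams that ProcessDocument feeds into its term-frequency dictionary before vocabulary filtering.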
public SparseVector <double> ProcessDocument(string document, string language) { Dictionary <int, int> tfVec = new Dictionary <int, int>(); ArrayList <WordStem> nGrams = new ArrayList <WordStem>(mMaxNGramLen); mTokenizer.Text = document; IStemmer stemmer; Set <string> .ReadOnly stopWords; if (stemmers.ContainsKey(language)) { stemmer = stemmers[language]; stopWords = stopWordDict[language]; } else { Language lang = TextMiningUtils.GetLanguage(language); try { TextMiningUtils.GetLanguageTools(lang, out stopWords, out stemmer); } catch (ArgumentNotSupportedException) // Language tools do not exist, so fall back to English { TextMiningUtils.GetLanguageTools(Language.English, out stopWords, out stemmer); mLogger.Error("Initialize", "Missing language tools for language code {0}.", language); } stemmers[language] = stemmer; stopWordDict[language] = stopWords; } foreach (string token in mTokenizer) { string word = token.Trim().ToLower(); if (stopWords == null || !stopWords.Contains(word)) { string stem = stemmer == null ? word : stemmer.GetStem(word).Trim().ToLower(); if (nGrams.Count < mMaxNGramLen) { WordStem wordStem = new WordStem(); wordStem.Word = word; wordStem.Stem = stem; nGrams.Add(wordStem); if (nGrams.Count < mMaxNGramLen) { continue; } } else { WordStem wordStem = nGrams[0]; wordStem.Word = word; wordStem.Stem = stem; for (int i = 0; i < mMaxNGramLen - 1; i++) { nGrams[i] = nGrams[i + 1]; } nGrams[mMaxNGramLen - 1] = wordStem; } ProcessDocumentNGrams(nGrams, 0, tfVec); } } int startIdx = nGrams.Count == mMaxNGramLen ? 1 : 0; for (int i = startIdx; i < nGrams.Count; i++) { ProcessDocumentNGrams(nGrams, i, tfVec); } SparseVector <double> docVec = new SparseVector <double>(); if (mWordWeightType == WordWeightType.TermFreq) { foreach (KeyValuePair <int, int> tfItem in tfVec) { docVec.InnerIdx.Add(tfItem.Key); docVec.InnerDat.Add(tfItem.Value); } } else if (mWordWeightType == WordWeightType.TfIdf) { foreach (KeyValuePair <int, int> tfItem in tfVec) { double tfIdf = (double)tfItem.Value * mIdxInfo[tfItem.Key].mIdf; if (tfIdf > 0) { docVec.InnerIdx.Add(tfItem.Key); docVec.InnerDat.Add(tfIdf); } } } else if (mWordWeightType == WordWeightType.LogDfTfIdf) { foreach (KeyValuePair <int, int> tfItem in tfVec) { double tfIdf = (double)tfItem.Value * mIdxInfo[tfItem.Key].mIdf; if (tfIdf > 0) { docVec.InnerIdx.Add(tfItem.Key); docVec.InnerDat.Add(Math.Log(1 + mIdxInfo[tfItem.Key].mDocFreq) * tfIdf); } } } docVec.Sort(); CutLowWeights(ref docVec); if (mNormalizeVectors) { Utils.TryNrmVecL2(docVec); } return(docVec); }
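The language-aware overload above resolves its stemmer and stop-word list per language code, caches them in the stemmers / stopWordDict dictionaries, and falls back to English when GetLanguageTools reports the language as unsupported. One possible way to factor that lookup out into a reusable helper is sketched below; it only reuses types and calls already visible in the method, and the helper name is made up.

private void GetCachedLanguageTools(string language, out IStemmer stemmer, out Set<string>.ReadOnly stopWords)
{
    if (stemmers.ContainsKey(language))
    {
        // already resolved for an earlier document in this language
        stemmer = stemmers[language];
        stopWords = stopWordDict[language];
        return;
    }
    Language lang = TextMiningUtils.GetLanguage(language);
    try
    {
        TextMiningUtils.GetLanguageTools(lang, out stopWords, out stemmer);
    }
    catch (ArgumentNotSupportedException) // language tools do not exist, fall back to English
    {
        TextMiningUtils.GetLanguageTools(Language.English, out stopWords, out stemmer);
        mLogger.Error("Initialize", "Missing language tools for language code {0}.", language);
    }
    stemmers[language] = stemmer;       // cache for subsequent documents
    stopWordDict[language] = stopWords;
}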
public void Initialize(IEnumerable <string> documents) { Utils.ThrowException(documents == null ? new ArgumentNullException("documents") : null); mWordInfo.Clear(); mIdxInfo.Clear(); mTfVectors.Clear(); // build vocabulary mLogger.Info("Initialize", "Building vocabulary ..."); int docCount = 0; foreach (string document in documents) { docCount++; mLogger.ProgressFast(this, "Initialize", "Document {0} ...", docCount, /*numSteps=*/ -1); Set <string> docWords = new Set <string>(); ArrayList <WordStem> nGrams = new ArrayList <WordStem>(mMaxNGramLen); mTokenizer.Text = document; foreach (string token in mTokenizer) { string word = token.Trim().ToLower(); if (mStopWords == null || !mStopWords.Contains(word)) { string stem = mStemmer == null ? word : mStemmer.GetStem(word).Trim().ToLower(); if (nGrams.Count < mMaxNGramLen) { WordStem wordStem = new WordStem(); wordStem.Word = word; wordStem.Stem = stem; nGrams.Add(wordStem); if (nGrams.Count < mMaxNGramLen) { continue; } } else { WordStem wordStem = nGrams[0]; wordStem.Word = word; wordStem.Stem = stem; for (int i = 0; i < mMaxNGramLen - 1; i++) { nGrams[i] = nGrams[i + 1]; } nGrams[mMaxNGramLen - 1] = wordStem; } ProcessNGramsPass1(nGrams, 0, docWords); } } int startIdx = nGrams.Count == mMaxNGramLen ? 1 : 0; for (int i = startIdx; i < nGrams.Count; i++) { ProcessNGramsPass1(nGrams, i, docWords); } } mLogger.ProgressFast(this, "Initialize", "Document {0} ...", docCount, docCount); // determine most frequent word and n-gram forms foreach (Word wordInfo in mWordInfo.Values) { int max = 0; foreach (KeyValuePair <string, int> wordForm in wordInfo.mForms) { if (wordForm.Value > max) { max = wordForm.Value; wordInfo.mMostFrequentForm = wordForm.Key; } } } // compute bag-of-words vectors mLogger.Info("Initialize", "Computing bag-of-words vectors ..."); int docNum = 1; foreach (string document in documents) { mLogger.ProgressFast(this, "Initialize", "Document {0} / {1} ...", docNum++, docCount); Dictionary <int, int> tfVec = new Dictionary <int, int>(); ArrayList <WordStem> nGrams = new ArrayList <WordStem>(mMaxNGramLen); mTokenizer.Text = document; foreach (string token in mTokenizer) { string word = token.Trim().ToLower(); if (mStopWords == null || !mStopWords.Contains(word)) { string stem = mStemmer == null ? word : mStemmer.GetStem(word).Trim().ToLower(); if (nGrams.Count < mMaxNGramLen) { WordStem wordStem = new WordStem(); wordStem.Word = word; wordStem.Stem = stem; nGrams.Add(wordStem); if (nGrams.Count < mMaxNGramLen) { continue; } } else { WordStem wordStem = nGrams[0]; wordStem.Word = word; wordStem.Stem = stem; for (int i = 0; i < mMaxNGramLen - 1; i++) { nGrams[i] = nGrams[i + 1]; } nGrams[mMaxNGramLen - 1] = wordStem; } ProcessNGramsPass2(nGrams, 0, tfVec); } } int startIdx = nGrams.Count == mMaxNGramLen ? 1 : 0; for (int i = startIdx; i < nGrams.Count; i++) { ProcessNGramsPass2(nGrams, i, tfVec); } SparseVector <double> docVec = new SparseVector <double>(); foreach (KeyValuePair <int, int> tfItem in tfVec) { docVec.InnerIdx.Add(tfItem.Key); docVec.InnerDat.Add(tfItem.Value); } docVec.Sort(); mTfVectors.Add(docVec); } }
private SparseVector<double> ProcessDocument(string document) { Set<string> docWords = new Set<string>(); Dictionary<int, int> tfVec = new Dictionary<int, int>(); ArrayList<WordStem> nGrams = new ArrayList<WordStem>(mMaxNGramLen); mTokenizer.Text = document; foreach (string token in mTokenizer) { string word = token.Trim().ToLower(); if (mStopWords == null || !mStopWords.Contains(word)) { string stem = mStemmer == null ? word : mStemmer.GetStem(word).Trim().ToLower(); if (nGrams.Count < mMaxNGramLen) { WordStem wordStem = new WordStem(); wordStem.Word = word; wordStem.Stem = stem; nGrams.Add(wordStem); if (nGrams.Count < mMaxNGramLen) { continue; } } else { WordStem wordStem = nGrams[0]; wordStem.Word = word; wordStem.Stem = stem; for (int i = 0; i < mMaxNGramLen - 1; i++) { nGrams[i] = nGrams[i + 1]; } nGrams[mMaxNGramLen - 1] = wordStem; } ProcessDocumentNGrams(nGrams, 0, tfVec, docWords); } } int startIdx = nGrams.Count == mMaxNGramLen ? 1 : 0; for (int i = startIdx; i < nGrams.Count; i++) { ProcessDocumentNGrams(nGrams, i, tfVec, docWords); } SparseVector<double> docVec = new SparseVector<double>(); foreach (KeyValuePair<int, int> tfItem in tfVec) { docVec.InnerIdx.Add(tfItem.Key); docVec.InnerDat.Add(tfItem.Value); } docVec.Sort(); return docVec; }
public ArrayList <SparseVector <double> > Initialize(IEnumerable <KeyDat <string, string> > documents, bool keepBowVectors) { Utils.ThrowException(documents == null ? new ArgumentNullException("documents") : null); Debug.Assert(documents != null, "Documents are always passed"); mWordInfo.Clear(); mIdxInfo.Clear(); mBowVectors.Clear(); ArrayList <SparseVector <double> > bows = keepBowVectors ? null : new ArrayList <SparseVector <double> >(); // build vocabulary mLogger.Info("Initialize", "Building vocabulary ..."); int docCount = 0; foreach (var document in documents) { docCount++; mLogger.ProgressFast(this, "Initialize", "Document {0} ...", docCount, /*numSteps=*/ -1); Set <string> docWords = new Set <string>(); ArrayList <WordStem> nGrams = new ArrayList <WordStem>(mMaxNGramLen); mTokenizer.Text = document.First; IStemmer stemmer; Set <string> .ReadOnly stopWords; // Set up stop words and stemmer if (stemmers.ContainsKey(document.Second)) { stemmer = stemmers[document.Second]; stopWords = stopWordDict[document.Second]; } else { Language lang = TextMiningUtils.GetLanguage(document.Second); try { TextMiningUtils.GetLanguageTools(lang, out stopWords, out stemmer); } catch (ArgumentNotSupportedException) // Language tools do not exist, so fall back to English { TextMiningUtils.GetLanguageTools(Language.English, out stopWords, out stemmer); mLogger.Error("Initialize", "Missing language tools for language code {0}.", document.Second); } stemmers[document.Second] = stemmer; stopWordDict[document.Second] = stopWords; } foreach (string token in mTokenizer) { string word = token.Trim().ToLower(); if (stopWords == null || !stopWords.Contains(word)) { string stem = stemmer == null ? word : stemmer.GetStem(word).Trim().ToLower(); if (nGrams.Count < mMaxNGramLen) { WordStem wordStem = new WordStem(); wordStem.Word = word; wordStem.Stem = stem; nGrams.Add(wordStem); if (nGrams.Count < mMaxNGramLen) { continue; } } else { WordStem wordStem = nGrams[0]; wordStem.Word = word; wordStem.Stem = stem; for (int i = 0; i < mMaxNGramLen - 1; i++) { nGrams[i] = nGrams[i + 1]; } nGrams[mMaxNGramLen - 1] = wordStem; } ProcessNGramsPass1(nGrams, 0, docWords); } } int startIdx = nGrams.Count == mMaxNGramLen ?
1 : 0; for (int i = startIdx; i < nGrams.Count; i++) { ProcessNGramsPass1(nGrams, i, docWords); } } mLogger.ProgressFast(this, "Initialize", "Document {0} ...", docCount, docCount); // remove unfrequent words and n-grams, precompute IDF ArrayList <string> removeList = new ArrayList <string>(); foreach (KeyValuePair <string, Word> wordInfo in mWordInfo) { if (wordInfo.Value.mFreq < mMinWordFreq) { removeList.Add(wordInfo.Key); } else { wordInfo.Value.mIdf = Math.Log((double)docCount / (double)wordInfo.Value.mDocFreq); } } foreach (string key in removeList) { mWordInfo.Remove(key); } // determine most frequent word and n-gram forms foreach (Word wordInfo in mWordInfo.Values) { int max = 0; foreach (KeyValuePair <string, int> wordForm in wordInfo.mForms) { if (wordForm.Value > max) { max = wordForm.Value; wordInfo.mMostFrequentForm = wordForm.Key; } } if (!mKeepWordForms) { wordInfo.mForms.Clear(); } } // compute bag-of-words vectors mLogger.Info("Initialize", "Computing bag-of-words vectors ..."); int docNum = 1; foreach (var document in documents) { Set <string> .ReadOnly stopWords = stopWordDict[document.Second]; IStemmer stemmer = stemmers[document.Second]; mLogger.ProgressFast(this, "Initialize", "Document {0} / {1} ...", docNum++, docCount); Dictionary <int, int> tfVec = new Dictionary <int, int>(); ArrayList <WordStem> nGrams = new ArrayList <WordStem>(mMaxNGramLen); mTokenizer.Text = document.First; foreach (string token in mTokenizer) { string word = token.Trim().ToLower(); if (stopWords == null || !stopWords.Contains(word)) { string stem = stemmer == null ? word : stemmer.GetStem(word).Trim().ToLower(); if (nGrams.Count < mMaxNGramLen) { WordStem wordStem = new WordStem(); wordStem.Word = word; wordStem.Stem = stem; nGrams.Add(wordStem); if (nGrams.Count < mMaxNGramLen) { continue; } } else { WordStem wordStem = nGrams[0]; wordStem.Word = word; wordStem.Stem = stem; for (int i = 0; i < mMaxNGramLen - 1; i++) { nGrams[i] = nGrams[i + 1]; } nGrams[mMaxNGramLen - 1] = wordStem; } ProcessNGramsPass2(nGrams, 0, tfVec); } } int startIdx = nGrams.Count == mMaxNGramLen ? 1 : 0; for (int i = startIdx; i < nGrams.Count; i++) { ProcessNGramsPass2(nGrams, i, tfVec); } SparseVector <double> docVec = new SparseVector <double>(); if (mWordWeightType == WordWeightType.TermFreq) { foreach (KeyValuePair <int, int> tfItem in tfVec) { docVec.InnerIdx.Add(tfItem.Key); docVec.InnerDat.Add(tfItem.Value); } } else if (mWordWeightType == WordWeightType.TfIdf) { foreach (KeyValuePair <int, int> tfItem in tfVec) { double tfIdf = (double)tfItem.Value * mIdxInfo[tfItem.Key].mIdf; if (tfIdf > 0) { docVec.InnerIdx.Add(tfItem.Key); docVec.InnerDat.Add(tfIdf); } } } else if (mWordWeightType == WordWeightType.LogDfTfIdf) { foreach (KeyValuePair <int, int> tfItem in tfVec) { double tfIdf = (double)tfItem.Value * mIdxInfo[tfItem.Key].mIdf; if (tfIdf > 0) { docVec.InnerIdx.Add(tfItem.Key); docVec.InnerDat.Add(Math.Log(1 + mIdxInfo[tfItem.Key].mDocFreq) * tfIdf); } } } docVec.Sort(); CutLowWeights(ref docVec); if (mNormalizeVectors) { Utils.TryNrmVecL2(docVec); } if (keepBowVectors) { mBowVectors.Add(docVec); } else { bows.Add(docVec); } } return(bows); }
public void Initialize(IEnumerable<string> documents) { Utils.ThrowException(documents == null ? new ArgumentNullException("documents") : null); mWordInfo.Clear(); mIdxInfo.Clear(); mTfVectors.Clear(); // build vocabulary mLogger.Info("Initialize", "Building vocabulary ..."); int docCount = 0; foreach (string document in documents) { docCount++; mLogger.ProgressFast(this, "Initialize", "Document {0} ...", docCount, /*numSteps=*/-1); Set<string> docWords = new Set<string>(); ArrayList<WordStem> nGrams = new ArrayList<WordStem>(mMaxNGramLen); mTokenizer.Text = document; foreach (string token in mTokenizer) { string word = token.Trim().ToLower(); if (mStopWords == null || !mStopWords.Contains(word)) { string stem = mStemmer == null ? word : mStemmer.GetStem(word).Trim().ToLower(); if (nGrams.Count < mMaxNGramLen) { WordStem wordStem = new WordStem(); wordStem.Word = word; wordStem.Stem = stem; nGrams.Add(wordStem); if (nGrams.Count < mMaxNGramLen) { continue; } } else { WordStem wordStem = nGrams[0]; wordStem.Word = word; wordStem.Stem = stem; for (int i = 0; i < mMaxNGramLen - 1; i++) { nGrams[i] = nGrams[i + 1]; } nGrams[mMaxNGramLen - 1] = wordStem; } ProcessNGramsPass1(nGrams, 0, docWords); } } int startIdx = nGrams.Count == mMaxNGramLen ? 1 : 0; for (int i = startIdx; i < nGrams.Count; i++) { ProcessNGramsPass1(nGrams, i, docWords); } } mLogger.ProgressFast(this, "Initialize", "Document {0} ...", docCount, docCount); // determine most frequent word and n-gram forms foreach (Word wordInfo in mWordInfo.Values) { int max = 0; foreach (KeyValuePair<string, int> wordForm in wordInfo.mForms) { if (wordForm.Value > max) { max = wordForm.Value; wordInfo.mMostFrequentForm = wordForm.Key; } } } // compute bag-of-words vectors mLogger.Info("Initialize", "Computing bag-of-words vectors ..."); int docNum = 1; foreach (string document in documents) { mLogger.ProgressFast(this, "Initialize", "Document {0} / {1} ...", docNum++, docCount); Dictionary<int, int> tfVec = new Dictionary<int, int>(); ArrayList<WordStem> nGrams = new ArrayList<WordStem>(mMaxNGramLen); mTokenizer.Text = document; foreach (string token in mTokenizer) { string word = token.Trim().ToLower(); if (mStopWords == null || !mStopWords.Contains(word)) { string stem = mStemmer == null ? word : mStemmer.GetStem(word).Trim().ToLower(); if (nGrams.Count < mMaxNGramLen) { WordStem wordStem = new WordStem(); wordStem.Word = word; wordStem.Stem = stem; nGrams.Add(wordStem); if (nGrams.Count < mMaxNGramLen) { continue; } } else { WordStem wordStem = nGrams[0]; wordStem.Word = word; wordStem.Stem = stem; for (int i = 0; i < mMaxNGramLen - 1; i++) { nGrams[i] = nGrams[i + 1]; } nGrams[mMaxNGramLen - 1] = wordStem; } ProcessNGramsPass2(nGrams, 0, tfVec); } } int startIdx = nGrams.Count == mMaxNGramLen ? 1 : 0; for (int i = startIdx; i < nGrams.Count; i++) { ProcessNGramsPass2(nGrams, i, tfVec); } SparseVector<double> docVec = new SparseVector<double>(); foreach (KeyValuePair<int, int> tfItem in tfVec) { docVec.InnerIdx.Add(tfItem.Key); docVec.InnerDat.Add(tfItem.Value); } docVec.Sort(); mTfVectors.Add(docVec); } }
public SparseVector <double> ProcessDocument(string document) { Dictionary <int, int> tf_vec = new Dictionary <int, int>(); ArrayList <WordStem> n_grams = new ArrayList <WordStem>(m_max_n_gram_len); m_tokenizer.Text = document; foreach (string token in m_tokenizer) { string word = token.Trim().ToLower(); if (m_stop_words == null || !m_stop_words.Contains(word)) { string stem = m_stemmer == null ? word : m_stemmer.GetStem(word).Trim().ToLower(); if (n_grams.Count < m_max_n_gram_len) { WordStem word_stem = new WordStem(); word_stem.Word = word; word_stem.Stem = stem; n_grams.Add(word_stem); if (n_grams.Count < m_max_n_gram_len) { continue; } } else { WordStem word_stem = n_grams[0]; word_stem.Word = word; word_stem.Stem = stem; for (int i = 0; i < m_max_n_gram_len - 1; i++) { n_grams[i] = n_grams[i + 1]; } n_grams[m_max_n_gram_len - 1] = word_stem; } ProcessDocumentNGrams(n_grams, 0, tf_vec); } } int start_idx = n_grams.Count == m_max_n_gram_len ? 1 : 0; for (int i = start_idx; i < n_grams.Count; i++) { ProcessDocumentNGrams(n_grams, i, tf_vec); } SparseVector <double> doc_vec = new SparseVector <double>(); if (m_word_weight_type == WordWeightType.TermFreq) { foreach (KeyValuePair <int, int> tf_item in tf_vec) { doc_vec.InnerIdx.Add(tf_item.Key); doc_vec.InnerDat.Add(tf_item.Value); } } else if (m_word_weight_type == WordWeightType.TfIdf) { foreach (KeyValuePair <int, int> tf_item in tf_vec) { double tf_idf = (double)tf_item.Value * m_idx_info[tf_item.Key].m_idf; if (tf_idf > 0) { doc_vec.InnerIdx.Add(tf_item.Key); doc_vec.InnerDat.Add(tf_idf); } } } else if (m_word_weight_type == WordWeightType.LogDfTfIdf) { foreach (KeyValuePair <int, int> tf_item in tf_vec) { double tf_idf = (double)tf_item.Value * m_idx_info[tf_item.Key].m_idf; if (tf_idf > 0) { doc_vec.InnerIdx.Add(tf_item.Key); doc_vec.InnerDat.Add(Math.Log(1 + m_idx_info[tf_item.Key].m_doc_freq) * tf_idf); } } } doc_vec.Sort(); CutLowWeights(ref doc_vec); if (m_normalize_vectors) { ModelUtils.TryNrmVecL2(doc_vec); } return(doc_vec); }
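Several of the methods here finish by calling TryNrmVecL2 (Utils.TryNrmVecL2 or ModelUtils.TryNrmVecL2, depending on the version) when vector normalization is enabled. That implementation is not shown in this section; the sketch below shows what L2 normalization of such a sparse vector amounts to, assuming InnerDat behaves like an indexable list of doubles, as its use above suggests.

private static bool TryNormalizeL2(SparseVector<double> vec)
{
    double norm = 0;
    foreach (double dat in vec.InnerDat) { norm += dat * dat; } // squared Euclidean norm
    norm = Math.Sqrt(norm);
    if (norm == 0) { return false; } // empty or all-zero vector: nothing to normalize
    for (int i = 0; i < vec.InnerDat.Count; i++) { vec.InnerDat[i] /= norm; }
    return true;
}

After this step, dot products between document vectors are cosine similarities, so documents of different lengths become directly comparable.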
public void Initialize(IEnumerable <string> documents, bool large_scale) { Utils.ThrowException(documents == null ? new ArgumentNullException("documents") : null); m_word_info.Clear(); m_idx_info.Clear(); m_bow_vectors.Clear(); // build vocabulary Utils.VerboseLine("Building vocabulary ..."); int doc_count = 0; if (!large_scale) { foreach (string document in documents) { doc_count++; Utils.Verbose("Document {0} ...\r", doc_count); Set <string> doc_words = new Set <string>(); ArrayList <WordStem> n_grams = new ArrayList <WordStem>(m_max_n_gram_len); m_tokenizer.Text = document; foreach (string token in m_tokenizer) { string word = token.Trim().ToLower(); if (m_stop_words == null || !m_stop_words.Contains(word)) { string stem = m_stemmer == null ? word : m_stemmer.GetStem(word).Trim().ToLower(); if (n_grams.Count < m_max_n_gram_len) { WordStem word_stem = new WordStem(); word_stem.Word = word; word_stem.Stem = stem; n_grams.Add(word_stem); if (n_grams.Count < m_max_n_gram_len) { continue; } } else { WordStem word_stem = n_grams[0]; word_stem.Word = word; word_stem.Stem = stem; for (int i = 0; i < m_max_n_gram_len - 1; i++) { n_grams[i] = n_grams[i + 1]; } n_grams[m_max_n_gram_len - 1] = word_stem; } ProcessNGramsPass1(n_grams, 0, doc_words); } } int start_idx = n_grams.Count == m_max_n_gram_len ? 1 : 0; for (int i = start_idx; i < n_grams.Count; i++) { ProcessNGramsPass1(n_grams, i, doc_words); } } Utils.VerboseLine(""); } else // large-scale mode (needs less memory, slower) { for (int n = 1; n <= m_max_n_gram_len; n++) { doc_count = 0; Utils.VerboseLine("Pass {0} of {1} ...", n, m_max_n_gram_len); foreach (string document in documents) { doc_count++; Utils.Verbose("Document {0} ...\r", doc_count); ArrayList <WordStem> n_grams = new ArrayList <WordStem>(n); Set <string> doc_words = new Set <string>(); m_tokenizer.Text = document; foreach (string token in m_tokenizer) { string word = token.Trim().ToLower(); if (m_stop_words == null || !m_stop_words.Contains(word)) { string stem = m_stemmer == null ? 
word : m_stemmer.GetStem(word).Trim().ToLower(); if (n_grams.Count < n) { WordStem word_stem = new WordStem(); word_stem.Word = word; word_stem.Stem = stem; n_grams.Add(word_stem); if (n_grams.Count < n) { continue; } } else { WordStem word_stem = n_grams[0]; word_stem.Word = word; word_stem.Stem = stem; for (int i = 0; i < n - 1; i++) { n_grams[i] = n_grams[i + 1]; } n_grams[n - 1] = word_stem; } string n_gram = n_grams[0].Word; string n_gram_stem = n_grams[0].Stem; if (n > 1) { for (int i = 1; i < n - 1; i++) { n_gram += " " + n_grams[i].Word; n_gram_stem += " " + n_grams[i].Stem; } if (!m_word_info.ContainsKey(n_gram_stem)) { continue; } if (m_word_info[n_gram_stem].m_freq < m_min_word_freq) { continue; } string n_gram_stem_2 = ""; for (int i = 1; i < n - 1; i++) { n_gram_stem_2 += n_grams[i].Stem + " "; } n_gram_stem_2 += n_grams[n - 1].Stem; if (!m_word_info.ContainsKey(n_gram_stem_2)) { continue; } if (m_word_info[n_gram_stem_2].m_freq < m_min_word_freq) { continue; } n_gram += " " + n_grams[n - 1].Word; n_gram_stem += " " + n_grams[n - 1].Stem; } if (!m_word_info.ContainsKey(n_gram_stem)) { Word n_gram_info = new Word(n_gram); m_word_info.Add(n_gram_stem, n_gram_info); doc_words.Add(n_gram_stem); } else { Word n_gram_info = m_word_info[n_gram_stem]; if (!doc_words.Contains(n_gram_stem)) { n_gram_info.m_doc_freq++; doc_words.Add(n_gram_stem); } n_gram_info.m_freq++; if (!n_gram_info.m_forms.ContainsKey(n_gram)) { n_gram_info.m_forms.Add(n_gram, 1); } else { n_gram_info.m_forms[n_gram]++; } } } } } Utils.VerboseLine(""); } } // remove unfrequent words and n-grams, precompute IDF ArrayList <string> remove_list = new ArrayList <string>(); foreach (KeyValuePair <string, Word> word_info in m_word_info) { if (word_info.Value.m_freq < m_min_word_freq) { remove_list.Add(word_info.Key); } else { word_info.Value.m_idf = Math.Log((double)doc_count / (double)word_info.Value.m_doc_freq); } } foreach (string key in remove_list) { m_word_info.Remove(key); } // determine most frequent word and n-gram forms foreach (Word word_info in m_word_info.Values) { int max = 0; foreach (KeyValuePair <string, int> word_form in word_info.m_forms) { if (word_form.Value > max) { max = word_form.Value; word_info.m_most_frequent_form = word_form.Key; } } if (!m_keep_word_forms) { word_info.m_forms.Clear(); } } // compute bag-of-words vectors Utils.VerboseLine("Computing bag-of-words vectors ..."); int doc_num = 1; foreach (string document in documents) { Utils.Verbose("Document {0} of {1} ...\r", doc_num++, doc_count); Dictionary <int, int> tf_vec = new Dictionary <int, int>(); ArrayList <WordStem> n_grams = new ArrayList <WordStem>(m_max_n_gram_len); m_tokenizer.Text = document; foreach (string token in m_tokenizer) { string word = token.Trim().ToLower(); if (m_stop_words == null || !m_stop_words.Contains(word)) { string stem = m_stemmer == null ? word : m_stemmer.GetStem(word).Trim().ToLower(); if (n_grams.Count < m_max_n_gram_len) { WordStem word_stem = new WordStem(); word_stem.Word = word; word_stem.Stem = stem; n_grams.Add(word_stem); if (n_grams.Count < m_max_n_gram_len) { continue; } } else { WordStem word_stem = n_grams[0]; word_stem.Word = word; word_stem.Stem = stem; for (int i = 0; i < m_max_n_gram_len - 1; i++) { n_grams[i] = n_grams[i + 1]; } n_grams[m_max_n_gram_len - 1] = word_stem; } ProcessNGramsPass2(n_grams, 0, tf_vec); } } int start_idx = n_grams.Count == m_max_n_gram_len ? 
1 : 0; for (int i = start_idx; i < n_grams.Count; i++) { ProcessNGramsPass2(n_grams, i, tf_vec); } SparseVector <double> doc_vec = new SparseVector <double>(); if (m_word_weight_type == WordWeightType.TermFreq) { foreach (KeyValuePair <int, int> tf_item in tf_vec) { doc_vec.InnerIdx.Add(tf_item.Key); doc_vec.InnerDat.Add(tf_item.Value); } } else if (m_word_weight_type == WordWeightType.TfIdf) { foreach (KeyValuePair <int, int> tf_item in tf_vec) { double tf_idf = (double)tf_item.Value * m_idx_info[tf_item.Key].m_idf; if (tf_idf > 0) { doc_vec.InnerIdx.Add(tf_item.Key); doc_vec.InnerDat.Add(tf_idf); } } } else if (m_word_weight_type == WordWeightType.LogDfTfIdf) { foreach (KeyValuePair <int, int> tf_item in tf_vec) { double tf_idf = (double)tf_item.Value * m_idx_info[tf_item.Key].m_idf; if (tf_idf > 0) { doc_vec.InnerIdx.Add(tf_item.Key); doc_vec.InnerDat.Add(Math.Log(1 + m_idx_info[tf_item.Key].m_doc_freq) * tf_idf); } } } doc_vec.Sort(); CutLowWeights(ref doc_vec); if (m_normalize_vectors) { ModelUtils.TryNrmVecL2(doc_vec); } m_bow_vectors.Add(doc_vec); } Utils.VerboseLine(""); }
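The large-scale branch of Initialize above makes one pass over the corpus per n-gram length and only admits an n-gram whose length-(n-1) prefix and suffix were already found frequent enough in the earlier passes, trading extra passes for a much smaller candidate vocabulary (the same idea as Apriori candidate pruning). The check is inlined in the loop; isolated, it looks roughly like the helper below (the helper name is made up, the field names follow the snippet above).

private bool IsCandidateNGram(ArrayList<WordStem> n_grams, int n)
{
    if (n == 1) { return true; } // unigrams are always candidates
    string prefix = n_grams[0].Stem;
    string suffix = n_grams[1].Stem;
    for (int i = 1; i < n - 1; i++)
    {
        prefix += " " + n_grams[i].Stem;     // stems 0 .. n-2
        suffix += " " + n_grams[i + 1].Stem; // stems 1 .. n-1
    }
    return m_word_info.ContainsKey(prefix) && m_word_info[prefix].m_freq >= m_min_word_freq
        && m_word_info.ContainsKey(suffix) && m_word_info[suffix].m_freq >= m_min_word_freq;
}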
public SparseVector <double> ProcessDocument(string document) { Dictionary <int, int> tfVec = new Dictionary <int, int>(); ArrayList <WordStem> nGrams = new ArrayList <WordStem>(mMaxNGramLen); mTokenizer.Text = document; foreach (string token in mTokenizer) { string word = token.Trim().ToLower(); if (mStopWords == null || !mStopWords.Contains(word)) { string stem = mStemmer == null ? word : mStemmer.GetStem(word).Trim().ToLower(); if (nGrams.Count < mMaxNGramLen) { WordStem wordStem = new WordStem(); wordStem.Word = word; wordStem.Stem = stem; nGrams.Add(wordStem); if (nGrams.Count < mMaxNGramLen) { continue; } } else { WordStem wordStem = nGrams[0]; wordStem.Word = word; wordStem.Stem = stem; for (int i = 0; i < mMaxNGramLen - 1; i++) { nGrams[i] = nGrams[i + 1]; } nGrams[mMaxNGramLen - 1] = wordStem; } ProcessDocumentNGrams(nGrams, 0, tfVec); } } int startIdx = nGrams.Count == mMaxNGramLen ? 1 : 0; for (int i = startIdx; i < nGrams.Count; i++) { ProcessDocumentNGrams(nGrams, i, tfVec); } SparseVector <double> docVec = new SparseVector <double>(); if (mWordWeightType == WordWeightType.TermFreq) { foreach (KeyValuePair <int, int> tfItem in tfVec) { docVec.InnerIdx.Add(tfItem.Key); docVec.InnerDat.Add(tfItem.Value); } } else if (mWordWeightType == WordWeightType.TfIdf) { foreach (KeyValuePair <int, int> tfItem in tfVec) { double tfIdf = (double)tfItem.Value * mIdxInfo[tfItem.Key].mIdf; if (tfIdf > 0) { docVec.InnerIdx.Add(tfItem.Key); docVec.InnerDat.Add(tfIdf); } } } else if (mWordWeightType == WordWeightType.Dyakonov) { foreach (KeyValuePair <int, int> tfItem in tfVec) { double weight = (double)tfItem.Value / Math.Sqrt(mIdxInfo[tfItem.Key].mFreq); if (weight > 0) { docVec.InnerIdx.Add(tfItem.Key); docVec.InnerDat.Add(weight); } } } else if (mWordWeightType == WordWeightType.LogDfTfIdf) { foreach (KeyValuePair <int, int> tfItem in tfVec) { double tfIdf = (double)tfItem.Value * mIdxInfo[tfItem.Key].mIdf; if (tfIdf > 0) { docVec.InnerIdx.Add(tfItem.Key); docVec.InnerDat.Add(Math.Log(1 + mIdxInfo[tfItem.Key].mDocFreq) * tfIdf); } } } docVec.Sort(); CutLowWeights(ref docVec); if (mNormalizeVectors) { Utils.TryNrmVecL2(docVec); } return(docVec); }
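ProcessDocument above supports four weighting schemes; the statistics come from the vocabulary built by Initialize, where mIdf is precomputed as Math.Log(docCount / docFreq). Collected in one place (an illustrative helper, not part of the source), the per-term weights are:

private static double ComputeWeight(WordWeightType type, int tf, double idf, int docFreq, int freq)
{
    switch (type)
    {
        case WordWeightType.TermFreq:
            return tf;                               // raw term frequency
        case WordWeightType.TfIdf:
            return tf * idf;                         // tf * ln(N / df)
        case WordWeightType.LogDfTfIdf:
            return Math.Log(1 + docFreq) * tf * idf; // tf-idf damped by ln(1 + df)
        case WordWeightType.Dyakonov:
            return tf / Math.Sqrt(freq);             // tf / sqrt(collection frequency)
        default:
            throw new ArgumentException("type");
    }
}

Terms whose weight comes out as 0 (for example when idf is 0 because the term occurs in every document) are skipped; the vector is then sorted, cut at the low-weight threshold, and optionally L2-normalized.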
public ArrayList <SparseVector <double> > Initialize(IEnumerable <string> documents, bool largeScale, bool keepBowVectors) { Utils.ThrowException(documents == null ? new ArgumentNullException("documents") : null); mWordInfo.Clear(); mIdxInfo.Clear(); mBowVectors.Clear(); ArrayList <SparseVector <double> > bows = keepBowVectors ? null : new ArrayList <SparseVector <double> >(); // build vocabulary mLogger.Info("Initialize", "Building vocabulary ..."); int docCount = 0; if (!largeScale) { foreach (string document in documents) { docCount++; mLogger.ProgressFast(this, "Initialize", "Document {0} ...", docCount, /*numSteps=n*/ -1); Set <string> docWords = new Set <string>(); ArrayList <WordStem> nGrams = new ArrayList <WordStem>(mMaxNGramLen); mTokenizer.Text = document; foreach (string token in mTokenizer) { string word = token.Trim().ToLower(); if (mStopWords == null || !mStopWords.Contains(word)) { string stem = mStemmer == null ? word : mStemmer.GetStem(word).Trim().ToLower(); if (nGrams.Count < mMaxNGramLen) { WordStem wordStem = new WordStem(); wordStem.Word = word; wordStem.Stem = stem; nGrams.Add(wordStem); if (nGrams.Count < mMaxNGramLen) { continue; } } else { WordStem wordStem = nGrams[0]; wordStem.Word = word; wordStem.Stem = stem; for (int i = 0; i < mMaxNGramLen - 1; i++) { nGrams[i] = nGrams[i + 1]; } nGrams[mMaxNGramLen - 1] = wordStem; } ProcessNGramsPass1(nGrams, 0, docWords); } } int startIdx = nGrams.Count == mMaxNGramLen ? 1 : 0; for (int i = startIdx; i < nGrams.Count; i++) { ProcessNGramsPass1(nGrams, i, docWords); } } mLogger.ProgressFast(this, "Initialize", "Document {0} ...", docCount, docCount); } else // large-scale mode (needs less memory, slower) { for (int n = 1; n <= mMaxNGramLen; n++) { docCount = 0; mLogger.Info("Initialize", "Pass {0} of {1} ...", n, mMaxNGramLen); foreach (string document in documents) { docCount++; mLogger.ProgressFast(this, "Initialize", "Document {0} ...", docCount, /*numSteps=*/ -1); ArrayList <WordStem> nGrams = new ArrayList <WordStem>(n); Set <string> docWords = new Set <string>(); mTokenizer.Text = document; foreach (string token in mTokenizer) { string word = token.Trim().ToLower(); if (mStopWords == null || !mStopWords.Contains(word)) { string stem = mStemmer == null ? 
word : mStemmer.GetStem(word).Trim().ToLower(); if (nGrams.Count < n) { WordStem wordStem = new WordStem(); wordStem.Word = word; wordStem.Stem = stem; nGrams.Add(wordStem); if (nGrams.Count < n) { continue; } } else { WordStem wordStem = nGrams[0]; wordStem.Word = word; wordStem.Stem = stem; for (int i = 0; i < n - 1; i++) { nGrams[i] = nGrams[i + 1]; } nGrams[n - 1] = wordStem; } string nGram = nGrams[0].Word; string nGramStem = nGrams[0].Stem; if (n > 1) { for (int i = 1; i < n - 1; i++) { nGram += " " + nGrams[i].Word; nGramStem += " " + nGrams[i].Stem; } if (!mWordInfo.ContainsKey(nGramStem)) { continue; } if (mWordInfo[nGramStem].mFreq < mMinWordFreq) { continue; } string nGramStem2 = ""; for (int i = 1; i < n - 1; i++) { nGramStem2 += nGrams[i].Stem + " "; } nGramStem2 += nGrams[n - 1].Stem; if (!mWordInfo.ContainsKey(nGramStem2)) { continue; } if (mWordInfo[nGramStem2].mFreq < mMinWordFreq) { continue; } nGram += " " + nGrams[n - 1].Word; nGramStem += " " + nGrams[n - 1].Stem; } if (!mWordInfo.ContainsKey(nGramStem)) { Word nGramInfo = new Word(nGram, nGramStem); mWordInfo.Add(nGramStem, nGramInfo); docWords.Add(nGramStem); } else { Word nGramInfo = mWordInfo[nGramStem]; if (!docWords.Contains(nGramStem)) { nGramInfo.mDocFreq++; docWords.Add(nGramStem); } nGramInfo.mFreq++; if (!nGramInfo.mForms.ContainsKey(nGram)) { nGramInfo.mForms.Add(nGram, 1); } else { nGramInfo.mForms[nGram]++; } } } } } mLogger.ProgressFast(this, "Initialize", "Document {0} ...", docCount, docCount); } } // remove unfrequent words and n-grams, precompute IDF ArrayList <string> removeList = new ArrayList <string>(); foreach (KeyValuePair <string, Word> wordInfo in mWordInfo) { if (wordInfo.Value.mFreq < mMinWordFreq) { removeList.Add(wordInfo.Key); } else { wordInfo.Value.mIdf = Math.Log((double)docCount / (double)wordInfo.Value.mDocFreq); } } foreach (string key in removeList) { mWordInfo.Remove(key); } // determine most frequent word and n-gram forms foreach (Word wordInfo in mWordInfo.Values) { int max = 0; foreach (KeyValuePair <string, int> wordForm in wordInfo.mForms) { if (wordForm.Value > max) { max = wordForm.Value; wordInfo.mMostFrequentForm = wordForm.Key; } } if (!mKeepWordForms) { wordInfo.mForms.Clear(); } } // compute bag-of-words vectors mLogger.Info("Initialize", "Computing bag-of-words vectors ..."); int docNum = 1; foreach (string document in documents) { mLogger.ProgressFast(this, "Initialize", "Document {0} / {1} ...", docNum++, docCount); Dictionary <int, int> tfVec = new Dictionary <int, int>(); ArrayList <WordStem> nGrams = new ArrayList <WordStem>(mMaxNGramLen); mTokenizer.Text = document; foreach (string token in mTokenizer) { string word = token.Trim().ToLower(); if (mStopWords == null || !mStopWords.Contains(word)) { string stem = mStemmer == null ? word : mStemmer.GetStem(word).Trim().ToLower(); if (nGrams.Count < mMaxNGramLen) { WordStem wordStem = new WordStem(); wordStem.Word = word; wordStem.Stem = stem; nGrams.Add(wordStem); if (nGrams.Count < mMaxNGramLen) { continue; } } else { WordStem wordStem = nGrams[0]; wordStem.Word = word; wordStem.Stem = stem; for (int i = 0; i < mMaxNGramLen - 1; i++) { nGrams[i] = nGrams[i + 1]; } nGrams[mMaxNGramLen - 1] = wordStem; } ProcessNGramsPass2(nGrams, 0, tfVec); } } int startIdx = nGrams.Count == mMaxNGramLen ? 
1 : 0; for (int i = startIdx; i < nGrams.Count; i++) { ProcessNGramsPass2(nGrams, i, tfVec); } SparseVector <double> docVec = new SparseVector <double>(); if (mWordWeightType == WordWeightType.TermFreq) { foreach (KeyValuePair <int, int> tfItem in tfVec) { docVec.InnerIdx.Add(tfItem.Key); docVec.InnerDat.Add(tfItem.Value); } } else if (mWordWeightType == WordWeightType.TfIdf) { foreach (KeyValuePair <int, int> tfItem in tfVec) { double tfIdf = (double)tfItem.Value * mIdxInfo[tfItem.Key].mIdf; if (tfIdf > 0) { docVec.InnerIdx.Add(tfItem.Key); docVec.InnerDat.Add(tfIdf); } } } else if (mWordWeightType == WordWeightType.Dyakonov) { foreach (KeyValuePair <int, int> tfItem in tfVec) { double weight = (double)tfItem.Value / Math.Sqrt(mIdxInfo[tfItem.Key].mFreq); if (weight > 0) { docVec.InnerIdx.Add(tfItem.Key); docVec.InnerDat.Add(weight); } } } else if (mWordWeightType == WordWeightType.LogDfTfIdf) { foreach (KeyValuePair <int, int> tfItem in tfVec) { double tfIdf = (double)tfItem.Value * mIdxInfo[tfItem.Key].mIdf; if (tfIdf > 0) { docVec.InnerIdx.Add(tfItem.Key); docVec.InnerDat.Add(Math.Log(1 + mIdxInfo[tfItem.Key].mDocFreq) * tfIdf); } } } docVec.Sort(); CutLowWeights(ref docVec); if (mNormalizeVectors) { Utils.TryNrmVecL2(docVec); } if (keepBowVectors) { mBowVectors.Add(docVec); } else { bows.Add(docVec); } } return(bows); }
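Taken together, the workflow is: Initialize makes two passes over the training corpus (vocabulary and statistics first, bag-of-words vectors second), and ProcessDocument then maps any further document into the same feature space using the stored vocabulary, IDF values and weighting settings. A minimal driver might look like the following; the class name BowSpace and its construction/configuration are assumptions made purely for illustration — only the two method signatures are taken from the code above.

private static SparseVector<double> BuildSpaceAndProject(BowSpace bowSpace,
    IEnumerable<string> trainingDocuments, string unseenDocument)
{
    // pass 1 + pass 2: vocabulary, IDF statistics and training bag-of-words vectors
    bowSpace.Initialize(trainingDocuments);
    // project a new document into the same space (weighted, cut, optionally L2-normalized)
    return bowSpace.ProcessDocument(unseenDocument);
}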