Example #1
        public SparseVector<double> ProcessDocument(string document, string language)
        {
            Dictionary<int, int> tfVec  = new Dictionary<int, int>();
            ArrayList<WordStem>  nGrams = new ArrayList<WordStem>(mMaxNGramLen);

            mTokenizer.Text = document;

            IStemmer stemmer;

            Set<string>.ReadOnly stopWords;

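            // Reuse cached language tools when available; otherwise resolve them once below and cache them.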
            if (stemmers.ContainsKey(language))
            {
                stemmer   = stemmers[language];
                stopWords = stopWordDict[language];
            }
            else
            {
                Language lang = TextMiningUtils.GetLanguage(language);

                try
                {
                    TextMiningUtils.GetLanguageTools(lang, out stopWords, out stemmer);
                }
                catch (ArgumentNotSupportedException)   // language tools do not exist, so fall back to English
                {
                    TextMiningUtils.GetLanguageTools(Language.English, out stopWords, out stemmer);
                    mLogger.Error("ProcessDocument", "Missing language tools for language code {0}.", language);
                }

                stemmers[language]     = stemmer;
                stopWordDict[language] = stopWords;
            }

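            // Tokenize the document, maintaining a sliding window of at most mMaxNGramLen word stems.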
            foreach (string token in mTokenizer)
            {
                string word = token.Trim().ToLower();
                if (stopWords == null || !stopWords.Contains(word))
                {
                    string stem = stemmer == null ? word : stemmer.GetStem(word).Trim().ToLower();
                    if (nGrams.Count < mMaxNGramLen)
                    {
                        WordStem wordStem = new WordStem();
                        wordStem.Word = word;
                        wordStem.Stem = stem;
                        nGrams.Add(wordStem);
                        if (nGrams.Count < mMaxNGramLen)
                        {
                            continue;
                        }
                    }
                    else
                    {
                        WordStem wordStem = nGrams[0];
                        wordStem.Word = word;
                        wordStem.Stem = stem;
                        for (int i = 0; i < mMaxNGramLen - 1; i++)
                        {
                            nGrams[i] = nGrams[i + 1];
                        }
                        nGrams[mMaxNGramLen - 1] = wordStem;
                    }
                    ProcessDocumentNGrams(nGrams, 0, tfVec);
                }
            }
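            // Flush the n-grams still in the window; skip index 0 if the full window was already processed above.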
            int startIdx = nGrams.Count == mMaxNGramLen ? 1 : 0;

            for (int i = startIdx; i < nGrams.Count; i++)
            {
                ProcessDocumentNGrams(nGrams, i, tfVec);
            }
            SparseVector<double> docVec = new SparseVector<double>();

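            // Convert raw term frequencies into weights under the configured weighting scheme.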
            if (mWordWeightType == WordWeightType.TermFreq)
            {
                foreach (KeyValuePair<int, int> tfItem in tfVec)
                {
                    docVec.InnerIdx.Add(tfItem.Key);
                    docVec.InnerDat.Add(tfItem.Value);
                }
            }
            else if (mWordWeightType == WordWeightType.TfIdf)
            {
                foreach (KeyValuePair<int, int> tfItem in tfVec)
                {
                    double tfIdf = (double)tfItem.Value * mIdxInfo[tfItem.Key].mIdf;
                    if (tfIdf > 0)
                    {
                        docVec.InnerIdx.Add(tfItem.Key);
                        docVec.InnerDat.Add(tfIdf);
                    }
                }
            }
            else if (mWordWeightType == WordWeightType.LogDfTfIdf)
            {
                foreach (KeyValuePair<int, int> tfItem in tfVec)
                {
                    double tfIdf = (double)tfItem.Value * mIdxInfo[tfItem.Key].mIdf;
                    if (tfIdf > 0)
                    {
                        docVec.InnerIdx.Add(tfItem.Key);
                        docVec.InnerDat.Add(Math.Log(1 + mIdxInfo[tfItem.Key].mDocFreq) * tfIdf);
                    }
                }
            }
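            // Sort entries by term index, prune low weights, and optionally L2-normalize.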
            docVec.Sort();
            CutLowWeights(ref docVec);
            if (mNormalizeVectors)
            {
                Utils.TryNrmVecL2(docVec);
            }
            return docVec;
        }
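A minimal usage sketch for the method above, assuming the enclosing class is a LATINO-style bag-of-words space (called BowSpace here) that has already been initialized on a corpus; TF-IDF weighting needs the IDF table built during initialization. BowSpace, trainingDocuments, and the "en" language code are illustrative assumptions, not part of the example.

        // Hypothetical usage -- BowSpace and trainingDocuments are assumed names.
        BowSpace bowSpace = new BowSpace();
        bowSpace.Initialize(trainingDocuments, /*keepBowVectors=*/ true);
        SparseVector<double> docVec = bowSpace.ProcessDocument("Some new text to vectorize.", "en");
        // docVec now holds the weighted (and optionally L2-normalized) bag-of-words vector.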
Example #2
        public ArrayList<SparseVector<double>> Initialize(IEnumerable<KeyDat<string, string>> documents, bool keepBowVectors)
        {
            Utils.ThrowException(documents == null ? new ArgumentNullException("documents") : null);
            mWordInfo.Clear();
            mIdxInfo.Clear();
            mBowVectors.Clear();
            ArrayList<SparseVector<double>> bows = keepBowVectors ? null : new ArrayList<SparseVector<double>>();

            // build vocabulary
            mLogger.Info("Initialize", "Building vocabulary ...");
            int docCount = 0;

            foreach (var document in documents)
            {
                docCount++;
                mLogger.ProgressFast(this, "Initialize", "Document {0} ...", docCount, /*numSteps=*/ -1);
                Set<string>         docWords = new Set<string>();
                ArrayList<WordStem> nGrams   = new ArrayList<WordStem>(mMaxNGramLen);
                mTokenizer.Text = document.First;

                IStemmer stemmer;
                Set<string>.ReadOnly stopWords;
                // set up the stop-word list and stemmer for this document's language
                if (stemmers.ContainsKey(document.Second))
                {
                    stemmer   = stemmers[document.Second];
                    stopWords = stopWordDict[document.Second];
                }
                else
                {
                    Language lang = TextMiningUtils.GetLanguage(document.Second);

                    try
                    {
                        TextMiningUtils.GetLanguageTools(lang, out stopWords, out stemmer);
                    }
                    catch (ArgumentNotSupportedException)   // language tools do not exist, so fall back to English
                    {
                        TextMiningUtils.GetLanguageTools(Language.English, out stopWords, out stemmer);
                        mLogger.Error("Initialize", "Missing language tools for language code {0}.", document.Second);
                    }

                    stemmers[document.Second]     = stemmer;
                    stopWordDict[document.Second] = stopWords;
                }

                foreach (string token in mTokenizer)
                {
                    string word = token.Trim().ToLower();
                    if (stopWords == null || !stopWords.Contains(word))
                    {
                        string stem = stemmer == null ? word : stemmer.GetStem(word).Trim().ToLower();
                        if (nGrams.Count < mMaxNGramLen)
                        {
                            WordStem wordStem = new WordStem();
                            wordStem.Word = word;
                            wordStem.Stem = stem;
                            nGrams.Add(wordStem);
                            if (nGrams.Count < mMaxNGramLen)
                            {
                                continue;
                            }
                        }
                        else
                        {
                            WordStem wordStem = nGrams[0];
                            wordStem.Word = word;
                            wordStem.Stem = stem;
                            for (int i = 0; i < mMaxNGramLen - 1; i++)
                            {
                                nGrams[i] = nGrams[i + 1];
                            }
                            nGrams[mMaxNGramLen - 1] = wordStem;
                        }
                        ProcessNGramsPass1(nGrams, 0, docWords);
                    }
                }
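                // Flush the n-grams remaining in the window at the end of this document.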
                int startIdx = nGrams.Count == mMaxNGramLen ? 1 : 0;
                for (int i = startIdx; i < nGrams.Count; i++)
                {
                    ProcessNGramsPass1(nGrams, i, docWords);
                }
            }
            mLogger.ProgressFast(this, "Initialize", "Document {0} ...", docCount, docCount);
            // remove infrequent words and n-grams, precompute IDF
            ArrayList<string> removeList = new ArrayList<string>();

            foreach (KeyValuePair<string, Word> wordInfo in mWordInfo)
            {
                if (wordInfo.Value.mFreq < mMinWordFreq)
                {
                    removeList.Add(wordInfo.Key);
                }
                else
                {
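                    // IDF = log(total number of documents / number of documents containing the word)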
                    wordInfo.Value.mIdf = Math.Log((double)docCount / (double)wordInfo.Value.mDocFreq);
                }
            }
            foreach (string key in removeList)
            {
                mWordInfo.Remove(key);
            }
            // determine most frequent word and n-gram forms
            foreach (Word wordInfo in mWordInfo.Values)
            {
                int max = 0;
                foreach (KeyValuePair<string, int> wordForm in wordInfo.mForms)
                {
                    if (wordForm.Value > max)
                    {
                        max = wordForm.Value;
                        wordInfo.mMostFrequentForm = wordForm.Key;
                    }
                }
                if (!mKeepWordForms)
                {
                    wordInfo.mForms.Clear();
                }
            }
            // compute bag-of-words vectors
            mLogger.Info("Initialize", "Computing bag-of-words vectors ...");
            int docNum = 1;

            foreach (var document in documents)
            {
                Set<string>.ReadOnly stopWords = stopWordDict[document.Second];
                IStemmer stemmer = stemmers[document.Second];

                mLogger.ProgressFast(this, "Initialize", "Document {0} / {1} ...", docNum++, docCount);
                Dictionary<int, int> tfVec  = new Dictionary<int, int>();
                ArrayList<WordStem>  nGrams = new ArrayList<WordStem>(mMaxNGramLen);
                mTokenizer.Text = document.First;
                foreach (string token in mTokenizer)
                {
                    string word = token.Trim().ToLower();
                    if (stopWords == null || !stopWords.Contains(word))
                    {
                        string stem = stemmer == null ? word : stemmer.GetStem(word).Trim().ToLower();
                        if (nGrams.Count < mMaxNGramLen)
                        {
                            WordStem wordStem = new WordStem();
                            wordStem.Word = word;
                            wordStem.Stem = stem;
                            nGrams.Add(wordStem);
                            if (nGrams.Count < mMaxNGramLen)
                            {
                                continue;
                            }
                        }
                        else
                        {
                            WordStem wordStem = nGrams[0];
                            wordStem.Word = word;
                            wordStem.Stem = stem;
                            for (int i = 0; i < mMaxNGramLen - 1; i++)
                            {
                                nGrams[i] = nGrams[i + 1];
                            }
                            nGrams[mMaxNGramLen - 1] = wordStem;
                        }
                        ProcessNGramsPass2(nGrams, 0, tfVec);
                    }
                }
                int startIdx = nGrams.Count == mMaxNGramLen ? 1 : 0;
                for (int i = startIdx; i < nGrams.Count; i++)
                {
                    ProcessNGramsPass2(nGrams, i, tfVec);
                }
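                // Build the sparse document vector under the configured weighting scheme (as in Example #1).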
                SparseVector<double> docVec = new SparseVector<double>();
                if (mWordWeightType == WordWeightType.TermFreq)
                {
                    foreach (KeyValuePair<int, int> tfItem in tfVec)
                    {
                        docVec.InnerIdx.Add(tfItem.Key);
                        docVec.InnerDat.Add(tfItem.Value);
                    }
                }
                else if (mWordWeightType == WordWeightType.TfIdf)
                {
                    foreach (KeyValuePair<int, int> tfItem in tfVec)
                    {
                        double tfIdf = (double)tfItem.Value * mIdxInfo[tfItem.Key].mIdf;
                        if (tfIdf > 0)
                        {
                            docVec.InnerIdx.Add(tfItem.Key);
                            docVec.InnerDat.Add(tfIdf);
                        }
                    }
                }
                else if (mWordWeightType == WordWeightType.LogDfTfIdf)
                {
                    foreach (KeyValuePair<int, int> tfItem in tfVec)
                    {
                        double tfIdf = (double)tfItem.Value * mIdxInfo[tfItem.Key].mIdf;
                        if (tfIdf > 0)
                        {
                            docVec.InnerIdx.Add(tfItem.Key);
                            docVec.InnerDat.Add(Math.Log(1 + mIdxInfo[tfItem.Key].mDocFreq) * tfIdf);
                        }
                    }
                }
                docVec.Sort();
                CutLowWeights(ref docVec);
                if (mNormalizeVectors)
                {
                    Utils.TryNrmVecL2(docVec);
                }
                if (keepBowVectors)
                {
                    mBowVectors.Add(docVec);
                }
                else
                {
                    bows.Add(docVec);
                }
            }
            return bows;
        }
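A hedged sketch of calling Initialize directly. Each KeyDat pairs a document's text (First) with its language code (Second); the KeyDat constructor arguments and the sample corpus are assumptions for illustration. With keepBowVectors set to true the vectors are kept in mBowVectors and the method returns null; passing false returns them instead, as shown here.

        // Hypothetical usage -- the sample corpus and KeyDat construction are assumed.
        ArrayList<KeyDat<string, string>> corpus = new ArrayList<KeyDat<string, string>>();
        corpus.Add(new KeyDat<string, string>("First training document ...", "en"));
        corpus.Add(new KeyDat<string, string>("Second training document ...", "de"));
        // false -> return the bag-of-words vectors instead of storing them in mBowVectors
        ArrayList<SparseVector<double>> bows = bowSpace.Initialize(corpus, /*keepBowVectors=*/ false);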