private SparseVector <double> ProcessDocument(string document) { Set <string> docWords = new Set <string>(); Dictionary <int, int> tfVec = new Dictionary <int, int>(); ArrayList <WordStem> nGrams = new ArrayList <WordStem>(mMaxNGramLen); mTokenizer.Text = document; foreach (string token in mTokenizer) { string word = token.Trim().ToLower(); if (mStopWords == null || !mStopWords.Contains(word)) { string stem = mStemmer == null ? word : mStemmer.GetStem(word).Trim().ToLower(); if (nGrams.Count < mMaxNGramLen) { WordStem wordStem = new WordStem(); wordStem.Word = word; wordStem.Stem = stem; nGrams.Add(wordStem); if (nGrams.Count < mMaxNGramLen) { continue; } } else { WordStem wordStem = nGrams[0]; wordStem.Word = word; wordStem.Stem = stem; for (int i = 0; i < mMaxNGramLen - 1; i++) { nGrams[i] = nGrams[i + 1]; } nGrams[mMaxNGramLen - 1] = wordStem; } ProcessDocumentNGrams(nGrams, 0, tfVec, docWords); } } int startIdx = nGrams.Count == mMaxNGramLen ? 1 : 0; for (int i = startIdx; i < nGrams.Count; i++) { ProcessDocumentNGrams(nGrams, i, tfVec, docWords); } SparseVector <double> docVec = new SparseVector <double>(); foreach (KeyValuePair <int, int> tfItem in tfVec) { docVec.InnerIdx.Add(tfItem.Key); docVec.InnerDat.Add(tfItem.Value); } docVec.Sort(); return(docVec); }
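ProcessDocument above (and the other variants of it below) builds its n-grams with a buffer of at most mMaxNGramLen WordStem entries that is slid over the token stream: every time the buffer is full, and once more at the end of the document, the n-grams starting at the head of the buffer are counted. The standalone sketch below reduces that loop to plain strings so the buffering logic is easier to follow; it is illustrative only, assumes whitespace tokenization, and skips stop-word removal, stemming and vocabulary lookup.

using System;
using System.Collections.Generic;

static class NGramWindowSketch
{
    // Count all 1..maxNGramLen grams of a document with the same sliding-buffer
    // scheme as ProcessDocument: fill the buffer, then shift it left by one for
    // every further token, emitting the n-grams that start at the buffer head.
    public static Dictionary<string, int> CountNGrams(string document, int maxNGramLen)
    {
        var counts = new Dictionary<string, int>();
        var buffer = new List<string>(maxNGramLen);
        string[] tokens = document.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
        foreach (string token in tokens)
        {
            string word = token.Trim().ToLower();
            if (buffer.Count < maxNGramLen)
            {
                buffer.Add(word);
                if (buffer.Count < maxNGramLen) { continue; } // window not full yet
            }
            else
            {
                for (int i = 0; i < maxNGramLen - 1; i++) { buffer[i] = buffer[i + 1]; } // shift left
                buffer[maxNGramLen - 1] = word;
            }
            Count(buffer, 0, counts); // n-grams starting at the window head
        }
        // the tail of the last window still holds uncounted shorter n-grams
        int startIdx = buffer.Count == maxNGramLen ? 1 : 0;
        for (int i = startIdx; i < buffer.Count; i++) { Count(buffer, i, counts); }
        return counts;
    }

    private static void Count(List<string> buffer, int startIdx, Dictionary<string, int> counts)
    {
        string nGram = buffer[startIdx];
        for (int end = startIdx + 1; ; end++)
        {
            counts[nGram] = counts.TryGetValue(nGram, out int c) ? c + 1 : 1;
            if (end >= buffer.Count) { break; }
            nGram += " " + buffer[end];
        }
    }
}

For "a b c d" with maxNGramLen = 2 this yields a, b, c, d and "a b", "b c", "c d", each with count 1 — the same multiset of n-grams that ProcessDocument feeds into its term-frequency dictionary before vocabulary filtering.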
public SparseVector <double> ProcessDocument(string document, string language) { Dictionary <int, int> tfVec = new Dictionary <int, int>(); ArrayList <WordStem> nGrams = new ArrayList <WordStem>(mMaxNGramLen); mTokenizer.Text = document; IStemmer stemmer; Set <string> .ReadOnly stopWords; if (stemmers.ContainsKey(language)) { stemmer = stemmers[language]; stopWords = stopWordDict[language]; } else { Language lang = TextMiningUtils.GetLanguage(language); try { TextMiningUtils.GetLanguageTools(lang, out stopWords, out stemmer); } catch (ArgumentNotSupportedException) // Language tools do not exist, so fall back to English { TextMiningUtils.GetLanguageTools(Language.English, out stopWords, out stemmer); mLogger.Error("Initialize", "Missing language tools for language code {0}.", language); } stemmers[language] = stemmer; stopWordDict[language] = stopWords; } foreach (string token in mTokenizer) { string word = token.Trim().ToLower(); if (stopWords == null || !stopWords.Contains(word)) { string stem = stemmer == null ? word : stemmer.GetStem(word).Trim().ToLower(); if (nGrams.Count < mMaxNGramLen) { WordStem wordStem = new WordStem(); wordStem.Word = word; wordStem.Stem = stem; nGrams.Add(wordStem); if (nGrams.Count < mMaxNGramLen) { continue; } } else { WordStem wordStem = nGrams[0]; wordStem.Word = word; wordStem.Stem = stem; for (int i = 0; i < mMaxNGramLen - 1; i++) { nGrams[i] = nGrams[i + 1]; } nGrams[mMaxNGramLen - 1] = wordStem; } ProcessDocumentNGrams(nGrams, 0, tfVec); } } int startIdx = nGrams.Count == mMaxNGramLen ? 1 : 0; for (int i = startIdx; i < nGrams.Count; i++) { ProcessDocumentNGrams(nGrams, i, tfVec); } SparseVector <double> docVec = new SparseVector <double>(); if (mWordWeightType == WordWeightType.TermFreq) { foreach (KeyValuePair <int, int> tfItem in tfVec) { docVec.InnerIdx.Add(tfItem.Key); docVec.InnerDat.Add(tfItem.Value); } } else if (mWordWeightType == WordWeightType.TfIdf) { foreach (KeyValuePair <int, int> tfItem in tfVec) { double tfIdf = (double)tfItem.Value * mIdxInfo[tfItem.Key].mIdf; if (tfIdf > 0) { docVec.InnerIdx.Add(tfItem.Key); docVec.InnerDat.Add(tfIdf); } } } else if (mWordWeightType == WordWeightType.LogDfTfIdf) { foreach (KeyValuePair <int, int> tfItem in tfVec) { double tfIdf = (double)tfItem.Value * mIdxInfo[tfItem.Key].mIdf; if (tfIdf > 0) { docVec.InnerIdx.Add(tfItem.Key); docVec.InnerDat.Add(Math.Log(1 + mIdxInfo[tfItem.Key].mDocFreq) * tfIdf); } } } docVec.Sort(); CutLowWeights(ref docVec); if (mNormalizeVectors) { Utils.TryNrmVecL2(docVec); } return(docVec); }
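The language-aware overload above resolves its stemmer and stop-word list per language code, caches them in the stemmers / stopWordDict dictionaries, and falls back to English when GetLanguageTools reports the language as unsupported. One possible way to factor that lookup out into a reusable helper is sketched below; it only reuses types and calls already visible in the method, and the helper name is made up.

private void GetCachedLanguageTools(string language, out IStemmer stemmer, out Set<string>.ReadOnly stopWords)
{
    if (stemmers.ContainsKey(language))
    {
        // already resolved for an earlier document in this language
        stemmer = stemmers[language];
        stopWords = stopWordDict[language];
        return;
    }
    Language lang = TextMiningUtils.GetLanguage(language);
    try
    {
        TextMiningUtils.GetLanguageTools(lang, out stopWords, out stemmer);
    }
    catch (ArgumentNotSupportedException) // language tools do not exist, fall back to English
    {
        TextMiningUtils.GetLanguageTools(Language.English, out stopWords, out stemmer);
        mLogger.Error("Initialize", "Missing language tools for language code {0}.", language);
    }
    stemmers[language] = stemmer;       // cache for subsequent documents
    stopWordDict[language] = stopWords;
}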
public void Initialize(IEnumerable <string> documents) { Utils.ThrowException(documents == null ? new ArgumentNullException("documents") : null); mWordInfo.Clear(); mIdxInfo.Clear(); mTfVectors.Clear(); // build vocabulary mLogger.Info("Initialize", "Building vocabulary ..."); int docCount = 0; foreach (string document in documents) { docCount++; mLogger.ProgressFast(this, "Initialize", "Document {0} ...", docCount, /*numSteps=*/ -1); Set <string> docWords = new Set <string>(); ArrayList <WordStem> nGrams = new ArrayList <WordStem>(mMaxNGramLen); mTokenizer.Text = document; foreach (string token in mTokenizer) { string word = token.Trim().ToLower(); if (mStopWords == null || !mStopWords.Contains(word)) { string stem = mStemmer == null ? word : mStemmer.GetStem(word).Trim().ToLower(); if (nGrams.Count < mMaxNGramLen) { WordStem wordStem = new WordStem(); wordStem.Word = word; wordStem.Stem = stem; nGrams.Add(wordStem); if (nGrams.Count < mMaxNGramLen) { continue; } } else { WordStem wordStem = nGrams[0]; wordStem.Word = word; wordStem.Stem = stem; for (int i = 0; i < mMaxNGramLen - 1; i++) { nGrams[i] = nGrams[i + 1]; } nGrams[mMaxNGramLen - 1] = wordStem; } ProcessNGramsPass1(nGrams, 0, docWords); } } int startIdx = nGrams.Count == mMaxNGramLen ? 1 : 0; for (int i = startIdx; i < nGrams.Count; i++) { ProcessNGramsPass1(nGrams, i, docWords); } } mLogger.ProgressFast(this, "Initialize", "Document {0} ...", docCount, docCount); // determine most frequent word and n-gram forms foreach (Word wordInfo in mWordInfo.Values) { int max = 0; foreach (KeyValuePair <string, int> wordForm in wordInfo.mForms) { if (wordForm.Value > max) { max = wordForm.Value; wordInfo.mMostFrequentForm = wordForm.Key; } } } // compute bag-of-words vectors mLogger.Info("Initialize", "Computing bag-of-words vectors ..."); int docNum = 1; foreach (string document in documents) { mLogger.ProgressFast(this, "Initialize", "Document {0} / {1} ...", docNum++, docCount); Dictionary <int, int> tfVec = new Dictionary <int, int>(); ArrayList <WordStem> nGrams = new ArrayList <WordStem>(mMaxNGramLen); mTokenizer.Text = document; foreach (string token in mTokenizer) { string word = token.Trim().ToLower(); if (mStopWords == null || !mStopWords.Contains(word)) { string stem = mStemmer == null ? word : mStemmer.GetStem(word).Trim().ToLower(); if (nGrams.Count < mMaxNGramLen) { WordStem wordStem = new WordStem(); wordStem.Word = word; wordStem.Stem = stem; nGrams.Add(wordStem); if (nGrams.Count < mMaxNGramLen) { continue; } } else { WordStem wordStem = nGrams[0]; wordStem.Word = word; wordStem.Stem = stem; for (int i = 0; i < mMaxNGramLen - 1; i++) { nGrams[i] = nGrams[i + 1]; } nGrams[mMaxNGramLen - 1] = wordStem; } ProcessNGramsPass2(nGrams, 0, tfVec); } } int startIdx = nGrams.Count == mMaxNGramLen ? 1 : 0; for (int i = startIdx; i < nGrams.Count; i++) { ProcessNGramsPass2(nGrams, i, tfVec); } SparseVector <double> docVec = new SparseVector <double>(); foreach (KeyValuePair <int, int> tfItem in tfVec) { docVec.InnerIdx.Add(tfItem.Key); docVec.InnerDat.Add(tfItem.Value); } docVec.Sort(); mTfVectors.Add(docVec); } }
private SparseVector<double> ProcessDocument(string document) { Set<string> docWords = new Set<string>(); Dictionary<int, int> tfVec = new Dictionary<int, int>(); ArrayList<WordStem> nGrams = new ArrayList<WordStem>(mMaxNGramLen); mTokenizer.Text = document; foreach (string token in mTokenizer) { string word = token.Trim().ToLower(); if (mStopWords == null || !mStopWords.Contains(word)) { string stem = mStemmer == null ? word : mStemmer.GetStem(word).Trim().ToLower(); if (nGrams.Count < mMaxNGramLen) { WordStem wordStem = new WordStem(); wordStem.Word = word; wordStem.Stem = stem; nGrams.Add(wordStem); if (nGrams.Count < mMaxNGramLen) { continue; } } else { WordStem wordStem = nGrams[0]; wordStem.Word = word; wordStem.Stem = stem; for (int i = 0; i < mMaxNGramLen - 1; i++) { nGrams[i] = nGrams[i + 1]; } nGrams[mMaxNGramLen - 1] = wordStem; } ProcessDocumentNGrams(nGrams, 0, tfVec, docWords); } } int startIdx = nGrams.Count == mMaxNGramLen ? 1 : 0; for (int i = startIdx; i < nGrams.Count; i++) { ProcessDocumentNGrams(nGrams, i, tfVec, docWords); } SparseVector<double> docVec = new SparseVector<double>(); foreach (KeyValuePair<int, int> tfItem in tfVec) { docVec.InnerIdx.Add(tfItem.Key); docVec.InnerDat.Add(tfItem.Value); } docVec.Sort(); return docVec; }
public ArrayList <SparseVector <double> > Initialize(IEnumerable <KeyDat <string, string> > documents, bool keepBowVectors) { Utils.ThrowException(documents == null ? new ArgumentNullException("documents") : null); Debug.Assert(documents != null, "Documents are always passed"); mWordInfo.Clear(); mIdxInfo.Clear(); mBowVectors.Clear(); ArrayList <SparseVector <double> > bows = keepBowVectors ? null : new ArrayList <SparseVector <double> >(); // build vocabulary mLogger.Info("Initialize", "Building vocabulary ..."); int docCount = 0; foreach (var document in documents) { docCount++; mLogger.ProgressFast(this, "Initialize", "Document {0} ...", docCount, /*numSteps=*/ -1); Set <string> docWords = new Set <string>(); ArrayList <WordStem> nGrams = new ArrayList <WordStem>(mMaxNGramLen); mTokenizer.Text = document.First; IStemmer stemmer; Set <string> .ReadOnly stopWords; // Set up stop words and stemmer if (stemmers.ContainsKey(document.Second)) { stemmer = stemmers[document.Second]; stopWords = stopWordDict[document.Second]; } else { Language lang = TextMiningUtils.GetLanguage(document.Second); try { TextMiningUtils.GetLanguageTools(lang, out stopWords, out stemmer); } catch (ArgumentNotSupportedException) // Language tools do not exist, so fall back to English { TextMiningUtils.GetLanguageTools(Language.English, out stopWords, out stemmer); mLogger.Error("Initialize", "Missing language tools for language code {0}.", document.Second); } stemmers[document.Second] = stemmer; stopWordDict[document.Second] = stopWords; } foreach (string token in mTokenizer) { string word = token.Trim().ToLower(); if (stopWords == null || !stopWords.Contains(word)) { string stem = stemmer == null ? word : stemmer.GetStem(word).Trim().ToLower(); if (nGrams.Count < mMaxNGramLen) { WordStem wordStem = new WordStem(); wordStem.Word = word; wordStem.Stem = stem; nGrams.Add(wordStem); if (nGrams.Count < mMaxNGramLen) { continue; } } else { WordStem wordStem = nGrams[0]; wordStem.Word = word; wordStem.Stem = stem; for (int i = 0; i < mMaxNGramLen - 1; i++) { nGrams[i] = nGrams[i + 1]; } nGrams[mMaxNGramLen - 1] = wordStem; } ProcessNGramsPass1(nGrams, 0, docWords); } } int startIdx = nGrams.Count == mMaxNGramLen ?
1 : 0; for (int i = startIdx; i < nGrams.Count; i++) { ProcessNGramsPass1(nGrams, i, docWords); } } mLogger.ProgressFast(this, "Initialize", "Document {0} ...", docCount, docCount); // remove unfrequent words and n-grams, precompute IDF ArrayList <string> removeList = new ArrayList <string>(); foreach (KeyValuePair <string, Word> wordInfo in mWordInfo) { if (wordInfo.Value.mFreq < mMinWordFreq) { removeList.Add(wordInfo.Key); } else { wordInfo.Value.mIdf = Math.Log((double)docCount / (double)wordInfo.Value.mDocFreq); } } foreach (string key in removeList) { mWordInfo.Remove(key); } // determine most frequent word and n-gram forms foreach (Word wordInfo in mWordInfo.Values) { int max = 0; foreach (KeyValuePair <string, int> wordForm in wordInfo.mForms) { if (wordForm.Value > max) { max = wordForm.Value; wordInfo.mMostFrequentForm = wordForm.Key; } } if (!mKeepWordForms) { wordInfo.mForms.Clear(); } } // compute bag-of-words vectors mLogger.Info("Initialize", "Computing bag-of-words vectors ..."); int docNum = 1; foreach (var document in documents) { Set <string> .ReadOnly stopWords = stopWordDict[document.Second]; IStemmer stemmer = stemmers[document.Second]; mLogger.ProgressFast(this, "Initialize", "Document {0} / {1} ...", docNum++, docCount); Dictionary <int, int> tfVec = new Dictionary <int, int>(); ArrayList <WordStem> nGrams = new ArrayList <WordStem>(mMaxNGramLen); mTokenizer.Text = document.First; foreach (string token in mTokenizer) { string word = token.Trim().ToLower(); if (stopWords == null || !stopWords.Contains(word)) { string stem = stemmer == null ? word : stemmer.GetStem(word).Trim().ToLower(); if (nGrams.Count < mMaxNGramLen) { WordStem wordStem = new WordStem(); wordStem.Word = word; wordStem.Stem = stem; nGrams.Add(wordStem); if (nGrams.Count < mMaxNGramLen) { continue; } } else { WordStem wordStem = nGrams[0]; wordStem.Word = word; wordStem.Stem = stem; for (int i = 0; i < mMaxNGramLen - 1; i++) { nGrams[i] = nGrams[i + 1]; } nGrams[mMaxNGramLen - 1] = wordStem; } ProcessNGramsPass2(nGrams, 0, tfVec); } } int startIdx = nGrams.Count == mMaxNGramLen ? 1 : 0; for (int i = startIdx; i < nGrams.Count; i++) { ProcessNGramsPass2(nGrams, i, tfVec); } SparseVector <double> docVec = new SparseVector <double>(); if (mWordWeightType == WordWeightType.TermFreq) { foreach (KeyValuePair <int, int> tfItem in tfVec) { docVec.InnerIdx.Add(tfItem.Key); docVec.InnerDat.Add(tfItem.Value); } } else if (mWordWeightType == WordWeightType.TfIdf) { foreach (KeyValuePair <int, int> tfItem in tfVec) { double tfIdf = (double)tfItem.Value * mIdxInfo[tfItem.Key].mIdf; if (tfIdf > 0) { docVec.InnerIdx.Add(tfItem.Key); docVec.InnerDat.Add(tfIdf); } } } else if (mWordWeightType == WordWeightType.LogDfTfIdf) { foreach (KeyValuePair <int, int> tfItem in tfVec) { double tfIdf = (double)tfItem.Value * mIdxInfo[tfItem.Key].mIdf; if (tfIdf > 0) { docVec.InnerIdx.Add(tfItem.Key); docVec.InnerDat.Add(Math.Log(1 + mIdxInfo[tfItem.Key].mDocFreq) * tfIdf); } } } docVec.Sort(); CutLowWeights(ref docVec); if (mNormalizeVectors) { Utils.TryNrmVecL2(docVec); } if (keepBowVectors) { mBowVectors.Add(docVec); } else { bows.Add(docVec); } } return(bows); }
public void Initialize(IEnumerable<string> documents) { Utils.ThrowException(documents == null ? new ArgumentNullException("documents") : null); mWordInfo.Clear(); mIdxInfo.Clear(); mTfVectors.Clear(); // build vocabulary mLogger.Info("Initialize", "Building vocabulary ..."); int docCount = 0; foreach (string document in documents) { docCount++; mLogger.ProgressFast(this, "Initialize", "Document {0} ...", docCount, /*numSteps=*/-1); Set<string> docWords = new Set<string>(); ArrayList<WordStem> nGrams = new ArrayList<WordStem>(mMaxNGramLen); mTokenizer.Text = document; foreach (string token in mTokenizer) { string word = token.Trim().ToLower(); if (mStopWords == null || !mStopWords.Contains(word)) { string stem = mStemmer == null ? word : mStemmer.GetStem(word).Trim().ToLower(); if (nGrams.Count < mMaxNGramLen) { WordStem wordStem = new WordStem(); wordStem.Word = word; wordStem.Stem = stem; nGrams.Add(wordStem); if (nGrams.Count < mMaxNGramLen) { continue; } } else { WordStem wordStem = nGrams[0]; wordStem.Word = word; wordStem.Stem = stem; for (int i = 0; i < mMaxNGramLen - 1; i++) { nGrams[i] = nGrams[i + 1]; } nGrams[mMaxNGramLen - 1] = wordStem; } ProcessNGramsPass1(nGrams, 0, docWords); } } int startIdx = nGrams.Count == mMaxNGramLen ? 1 : 0; for (int i = startIdx; i < nGrams.Count; i++) { ProcessNGramsPass1(nGrams, i, docWords); } } mLogger.ProgressFast(this, "Initialize", "Document {0} ...", docCount, docCount); // determine most frequent word and n-gram forms foreach (Word wordInfo in mWordInfo.Values) { int max = 0; foreach (KeyValuePair<string, int> wordForm in wordInfo.mForms) { if (wordForm.Value > max) { max = wordForm.Value; wordInfo.mMostFrequentForm = wordForm.Key; } } } // compute bag-of-words vectors mLogger.Info("Initialize", "Computing bag-of-words vectors ..."); int docNum = 1; foreach (string document in documents) { mLogger.ProgressFast(this, "Initialize", "Document {0} / {1} ...", docNum++, docCount); Dictionary<int, int> tfVec = new Dictionary<int, int>(); ArrayList<WordStem> nGrams = new ArrayList<WordStem>(mMaxNGramLen); mTokenizer.Text = document; foreach (string token in mTokenizer) { string word = token.Trim().ToLower(); if (mStopWords == null || !mStopWords.Contains(word)) { string stem = mStemmer == null ? word : mStemmer.GetStem(word).Trim().ToLower(); if (nGrams.Count < mMaxNGramLen) { WordStem wordStem = new WordStem(); wordStem.Word = word; wordStem.Stem = stem; nGrams.Add(wordStem); if (nGrams.Count < mMaxNGramLen) { continue; } } else { WordStem wordStem = nGrams[0]; wordStem.Word = word; wordStem.Stem = stem; for (int i = 0; i < mMaxNGramLen - 1; i++) { nGrams[i] = nGrams[i + 1]; } nGrams[mMaxNGramLen - 1] = wordStem; } ProcessNGramsPass2(nGrams, 0, tfVec); } } int startIdx = nGrams.Count == mMaxNGramLen ? 1 : 0; for (int i = startIdx; i < nGrams.Count; i++) { ProcessNGramsPass2(nGrams, i, tfVec); } SparseVector<double> docVec = new SparseVector<double>(); foreach (KeyValuePair<int, int> tfItem in tfVec) { docVec.InnerIdx.Add(tfItem.Key); docVec.InnerDat.Add(tfItem.Value); } docVec.Sort(); mTfVectors.Add(docVec); } }
public SparseVector <double> ProcessDocument(string document) { Dictionary <int, int> tf_vec = new Dictionary <int, int>(); ArrayList <WordStem> n_grams = new ArrayList <WordStem>(m_max_n_gram_len); m_tokenizer.Text = document; foreach (string token in m_tokenizer) { string word = token.Trim().ToLower(); if (m_stop_words == null || !m_stop_words.Contains(word)) { string stem = m_stemmer == null ? word : m_stemmer.GetStem(word).Trim().ToLower(); if (n_grams.Count < m_max_n_gram_len) { WordStem word_stem = new WordStem(); word_stem.Word = word; word_stem.Stem = stem; n_grams.Add(word_stem); if (n_grams.Count < m_max_n_gram_len) { continue; } } else { WordStem word_stem = n_grams[0]; word_stem.Word = word; word_stem.Stem = stem; for (int i = 0; i < m_max_n_gram_len - 1; i++) { n_grams[i] = n_grams[i + 1]; } n_grams[m_max_n_gram_len - 1] = word_stem; } ProcessDocumentNGrams(n_grams, 0, tf_vec); } } int start_idx = n_grams.Count == m_max_n_gram_len ? 1 : 0; for (int i = start_idx; i < n_grams.Count; i++) { ProcessDocumentNGrams(n_grams, i, tf_vec); } SparseVector <double> doc_vec = new SparseVector <double>(); if (m_word_weight_type == WordWeightType.TermFreq) { foreach (KeyValuePair <int, int> tf_item in tf_vec) { doc_vec.InnerIdx.Add(tf_item.Key); doc_vec.InnerDat.Add(tf_item.Value); } } else if (m_word_weight_type == WordWeightType.TfIdf) { foreach (KeyValuePair <int, int> tf_item in tf_vec) { double tf_idf = (double)tf_item.Value * m_idx_info[tf_item.Key].m_idf; if (tf_idf > 0) { doc_vec.InnerIdx.Add(tf_item.Key); doc_vec.InnerDat.Add(tf_idf); } } } else if (m_word_weight_type == WordWeightType.LogDfTfIdf) { foreach (KeyValuePair <int, int> tf_item in tf_vec) { double tf_idf = (double)tf_item.Value * m_idx_info[tf_item.Key].m_idf; if (tf_idf > 0) { doc_vec.InnerIdx.Add(tf_item.Key); doc_vec.InnerDat.Add(Math.Log(1 + m_idx_info[tf_item.Key].m_doc_freq) * tf_idf); } } } doc_vec.Sort(); CutLowWeights(ref doc_vec); if (m_normalize_vectors) { ModelUtils.TryNrmVecL2(doc_vec); } return(doc_vec); }
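Several of the methods here finish by calling TryNrmVecL2 (Utils.TryNrmVecL2 or ModelUtils.TryNrmVecL2, depending on the version) when vector normalization is enabled. That implementation is not shown in this section; the sketch below shows what L2 normalization of such a sparse vector amounts to, assuming InnerDat behaves like an indexable list of doubles, as its use above suggests.

private static bool TryNormalizeL2(SparseVector<double> vec)
{
    double norm = 0;
    foreach (double dat in vec.InnerDat) { norm += dat * dat; } // squared Euclidean norm
    norm = Math.Sqrt(norm);
    if (norm == 0) { return false; } // empty or all-zero vector: nothing to normalize
    for (int i = 0; i < vec.InnerDat.Count; i++) { vec.InnerDat[i] /= norm; }
    return true;
}

After this step, dot products between document vectors are cosine similarities, so documents of different lengths become directly comparable.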
public void Initialize(IEnumerable <string> documents, bool large_scale) { Utils.ThrowException(documents == null ? new ArgumentNullException("documents") : null); m_word_info.Clear(); m_idx_info.Clear(); m_bow_vectors.Clear(); // build vocabulary Utils.VerboseLine("Building vocabulary ..."); int doc_count = 0; if (!large_scale) { foreach (string document in documents) { doc_count++; Utils.Verbose("Document {0} ...\r", doc_count); Set <string> doc_words = new Set <string>(); ArrayList <WordStem> n_grams = new ArrayList <WordStem>(m_max_n_gram_len); m_tokenizer.Text = document; foreach (string token in m_tokenizer) { string word = token.Trim().ToLower(); if (m_stop_words == null || !m_stop_words.Contains(word)) { string stem = m_stemmer == null ? word : m_stemmer.GetStem(word).Trim().ToLower(); if (n_grams.Count < m_max_n_gram_len) { WordStem word_stem = new WordStem(); word_stem.Word = word; word_stem.Stem = stem; n_grams.Add(word_stem); if (n_grams.Count < m_max_n_gram_len) { continue; } } else { WordStem word_stem = n_grams[0]; word_stem.Word = word; word_stem.Stem = stem; for (int i = 0; i < m_max_n_gram_len - 1; i++) { n_grams[i] = n_grams[i + 1]; } n_grams[m_max_n_gram_len - 1] = word_stem; } ProcessNGramsPass1(n_grams, 0, doc_words); } } int start_idx = n_grams.Count == m_max_n_gram_len ? 1 : 0; for (int i = start_idx; i < n_grams.Count; i++) { ProcessNGramsPass1(n_grams, i, doc_words); } } Utils.VerboseLine(""); } else // large-scale mode (needs less memory, slower) { for (int n = 1; n <= m_max_n_gram_len; n++) { doc_count = 0; Utils.VerboseLine("Pass {0} of {1} ...", n, m_max_n_gram_len); foreach (string document in documents) { doc_count++; Utils.Verbose("Document {0} ...\r", doc_count); ArrayList <WordStem> n_grams = new ArrayList <WordStem>(n); Set <string> doc_words = new Set <string>(); m_tokenizer.Text = document; foreach (string token in m_tokenizer) { string word = token.Trim().ToLower(); if (m_stop_words == null || !m_stop_words.Contains(word)) { string stem = m_stemmer == null ? 
word : m_stemmer.GetStem(word).Trim().ToLower(); if (n_grams.Count < n) { WordStem word_stem = new WordStem(); word_stem.Word = word; word_stem.Stem = stem; n_grams.Add(word_stem); if (n_grams.Count < n) { continue; } } else { WordStem word_stem = n_grams[0]; word_stem.Word = word; word_stem.Stem = stem; for (int i = 0; i < n - 1; i++) { n_grams[i] = n_grams[i + 1]; } n_grams[n - 1] = word_stem; } string n_gram = n_grams[0].Word; string n_gram_stem = n_grams[0].Stem; if (n > 1) { for (int i = 1; i < n - 1; i++) { n_gram += " " + n_grams[i].Word; n_gram_stem += " " + n_grams[i].Stem; } if (!m_word_info.ContainsKey(n_gram_stem)) { continue; } if (m_word_info[n_gram_stem].m_freq < m_min_word_freq) { continue; } string n_gram_stem_2 = ""; for (int i = 1; i < n - 1; i++) { n_gram_stem_2 += n_grams[i].Stem + " "; } n_gram_stem_2 += n_grams[n - 1].Stem; if (!m_word_info.ContainsKey(n_gram_stem_2)) { continue; } if (m_word_info[n_gram_stem_2].m_freq < m_min_word_freq) { continue; } n_gram += " " + n_grams[n - 1].Word; n_gram_stem += " " + n_grams[n - 1].Stem; } if (!m_word_info.ContainsKey(n_gram_stem)) { Word n_gram_info = new Word(n_gram); m_word_info.Add(n_gram_stem, n_gram_info); doc_words.Add(n_gram_stem); } else { Word n_gram_info = m_word_info[n_gram_stem]; if (!doc_words.Contains(n_gram_stem)) { n_gram_info.m_doc_freq++; doc_words.Add(n_gram_stem); } n_gram_info.m_freq++; if (!n_gram_info.m_forms.ContainsKey(n_gram)) { n_gram_info.m_forms.Add(n_gram, 1); } else { n_gram_info.m_forms[n_gram]++; } } } } } Utils.VerboseLine(""); } } // remove unfrequent words and n-grams, precompute IDF ArrayList <string> remove_list = new ArrayList <string>(); foreach (KeyValuePair <string, Word> word_info in m_word_info) { if (word_info.Value.m_freq < m_min_word_freq) { remove_list.Add(word_info.Key); } else { word_info.Value.m_idf = Math.Log((double)doc_count / (double)word_info.Value.m_doc_freq); } } foreach (string key in remove_list) { m_word_info.Remove(key); } // determine most frequent word and n-gram forms foreach (Word word_info in m_word_info.Values) { int max = 0; foreach (KeyValuePair <string, int> word_form in word_info.m_forms) { if (word_form.Value > max) { max = word_form.Value; word_info.m_most_frequent_form = word_form.Key; } } if (!m_keep_word_forms) { word_info.m_forms.Clear(); } } // compute bag-of-words vectors Utils.VerboseLine("Computing bag-of-words vectors ..."); int doc_num = 1; foreach (string document in documents) { Utils.Verbose("Document {0} of {1} ...\r", doc_num++, doc_count); Dictionary <int, int> tf_vec = new Dictionary <int, int>(); ArrayList <WordStem> n_grams = new ArrayList <WordStem>(m_max_n_gram_len); m_tokenizer.Text = document; foreach (string token in m_tokenizer) { string word = token.Trim().ToLower(); if (m_stop_words == null || !m_stop_words.Contains(word)) { string stem = m_stemmer == null ? word : m_stemmer.GetStem(word).Trim().ToLower(); if (n_grams.Count < m_max_n_gram_len) { WordStem word_stem = new WordStem(); word_stem.Word = word; word_stem.Stem = stem; n_grams.Add(word_stem); if (n_grams.Count < m_max_n_gram_len) { continue; } } else { WordStem word_stem = n_grams[0]; word_stem.Word = word; word_stem.Stem = stem; for (int i = 0; i < m_max_n_gram_len - 1; i++) { n_grams[i] = n_grams[i + 1]; } n_grams[m_max_n_gram_len - 1] = word_stem; } ProcessNGramsPass2(n_grams, 0, tf_vec); } } int start_idx = n_grams.Count == m_max_n_gram_len ? 
1 : 0; for (int i = start_idx; i < n_grams.Count; i++) { ProcessNGramsPass2(n_grams, i, tf_vec); } SparseVector <double> doc_vec = new SparseVector <double>(); if (m_word_weight_type == WordWeightType.TermFreq) { foreach (KeyValuePair <int, int> tf_item in tf_vec) { doc_vec.InnerIdx.Add(tf_item.Key); doc_vec.InnerDat.Add(tf_item.Value); } } else if (m_word_weight_type == WordWeightType.TfIdf) { foreach (KeyValuePair <int, int> tf_item in tf_vec) { double tf_idf = (double)tf_item.Value * m_idx_info[tf_item.Key].m_idf; if (tf_idf > 0) { doc_vec.InnerIdx.Add(tf_item.Key); doc_vec.InnerDat.Add(tf_idf); } } } else if (m_word_weight_type == WordWeightType.LogDfTfIdf) { foreach (KeyValuePair <int, int> tf_item in tf_vec) { double tf_idf = (double)tf_item.Value * m_idx_info[tf_item.Key].m_idf; if (tf_idf > 0) { doc_vec.InnerIdx.Add(tf_item.Key); doc_vec.InnerDat.Add(Math.Log(1 + m_idx_info[tf_item.Key].m_doc_freq) * tf_idf); } } } doc_vec.Sort(); CutLowWeights(ref doc_vec); if (m_normalize_vectors) { ModelUtils.TryNrmVecL2(doc_vec); } m_bow_vectors.Add(doc_vec); } Utils.VerboseLine(""); }
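The large-scale branch of Initialize above makes one pass over the corpus per n-gram length and only admits an n-gram whose length-(n-1) prefix and suffix were already found frequent enough in the earlier passes, trading extra passes for a much smaller candidate vocabulary (the same idea as Apriori candidate pruning). The check is inlined in the loop; isolated, it looks roughly like the helper below (the helper name is made up, the field names follow the snippet above).

private bool IsCandidateNGram(ArrayList<WordStem> n_grams, int n)
{
    if (n == 1) { return true; } // unigrams are always candidates
    string prefix = n_grams[0].Stem;
    string suffix = n_grams[1].Stem;
    for (int i = 1; i < n - 1; i++)
    {
        prefix += " " + n_grams[i].Stem;     // stems 0 .. n-2
        suffix += " " + n_grams[i + 1].Stem; // stems 1 .. n-1
    }
    return m_word_info.ContainsKey(prefix) && m_word_info[prefix].m_freq >= m_min_word_freq
        && m_word_info.ContainsKey(suffix) && m_word_info[suffix].m_freq >= m_min_word_freq;
}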
public SparseVector <double> ProcessDocument(string document) { Dictionary <int, int> tfVec = new Dictionary <int, int>(); ArrayList <WordStem> nGrams = new ArrayList <WordStem>(mMaxNGramLen); mTokenizer.Text = document; foreach (string token in mTokenizer) { string word = token.Trim().ToLower(); if (mStopWords == null || !mStopWords.Contains(word)) { string stem = mStemmer == null ? word : mStemmer.GetStem(word).Trim().ToLower(); if (nGrams.Count < mMaxNGramLen) { WordStem wordStem = new WordStem(); wordStem.Word = word; wordStem.Stem = stem; nGrams.Add(wordStem); if (nGrams.Count < mMaxNGramLen) { continue; } } else { WordStem wordStem = nGrams[0]; wordStem.Word = word; wordStem.Stem = stem; for (int i = 0; i < mMaxNGramLen - 1; i++) { nGrams[i] = nGrams[i + 1]; } nGrams[mMaxNGramLen - 1] = wordStem; } ProcessDocumentNGrams(nGrams, 0, tfVec); } } int startIdx = nGrams.Count == mMaxNGramLen ? 1 : 0; for (int i = startIdx; i < nGrams.Count; i++) { ProcessDocumentNGrams(nGrams, i, tfVec); } SparseVector <double> docVec = new SparseVector <double>(); if (mWordWeightType == WordWeightType.TermFreq) { foreach (KeyValuePair <int, int> tfItem in tfVec) { docVec.InnerIdx.Add(tfItem.Key); docVec.InnerDat.Add(tfItem.Value); } } else if (mWordWeightType == WordWeightType.TfIdf) { foreach (KeyValuePair <int, int> tfItem in tfVec) { double tfIdf = (double)tfItem.Value * mIdxInfo[tfItem.Key].mIdf; if (tfIdf > 0) { docVec.InnerIdx.Add(tfItem.Key); docVec.InnerDat.Add(tfIdf); } } } else if (mWordWeightType == WordWeightType.Dyakonov) { foreach (KeyValuePair <int, int> tfItem in tfVec) { double weight = (double)tfItem.Value / Math.Sqrt(mIdxInfo[tfItem.Key].mFreq); if (weight > 0) { docVec.InnerIdx.Add(tfItem.Key); docVec.InnerDat.Add(weight); } } } else if (mWordWeightType == WordWeightType.LogDfTfIdf) { foreach (KeyValuePair <int, int> tfItem in tfVec) { double tfIdf = (double)tfItem.Value * mIdxInfo[tfItem.Key].mIdf; if (tfIdf > 0) { docVec.InnerIdx.Add(tfItem.Key); docVec.InnerDat.Add(Math.Log(1 + mIdxInfo[tfItem.Key].mDocFreq) * tfIdf); } } } docVec.Sort(); CutLowWeights(ref docVec); if (mNormalizeVectors) { Utils.TryNrmVecL2(docVec); } return(docVec); }
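ProcessDocument above supports four weighting schemes; the statistics come from the vocabulary built by Initialize, where mIdf is precomputed as Math.Log(docCount / docFreq). Collected in one place (an illustrative helper, not part of the source), the per-term weights are:

private static double ComputeWeight(WordWeightType type, int tf, double idf, int docFreq, int freq)
{
    switch (type)
    {
        case WordWeightType.TermFreq:
            return tf;                               // raw term frequency
        case WordWeightType.TfIdf:
            return tf * idf;                         // tf * ln(N / df)
        case WordWeightType.LogDfTfIdf:
            return Math.Log(1 + docFreq) * tf * idf; // tf-idf damped by ln(1 + df)
        case WordWeightType.Dyakonov:
            return tf / Math.Sqrt(freq);             // tf / sqrt(collection frequency)
        default:
            throw new ArgumentException("type");
    }
}

Terms whose weight comes out as 0 (for example when idf is 0 because the term occurs in every document) are skipped; the vector is then sorted, cut at the low-weight threshold, and optionally L2-normalized.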
public ArrayList <SparseVector <double> > Initialize(IEnumerable <string> documents, bool largeScale, bool keepBowVectors) { Utils.ThrowException(documents == null ? new ArgumentNullException("documents") : null); mWordInfo.Clear(); mIdxInfo.Clear(); mBowVectors.Clear(); ArrayList <SparseVector <double> > bows = keepBowVectors ? null : new ArrayList <SparseVector <double> >(); // build vocabulary mLogger.Info("Initialize", "Building vocabulary ..."); int docCount = 0; if (!largeScale) { foreach (string document in documents) { docCount++; mLogger.ProgressFast(this, "Initialize", "Document {0} ...", docCount, /*numSteps=n*/ -1); Set <string> docWords = new Set <string>(); ArrayList <WordStem> nGrams = new ArrayList <WordStem>(mMaxNGramLen); mTokenizer.Text = document; foreach (string token in mTokenizer) { string word = token.Trim().ToLower(); if (mStopWords == null || !mStopWords.Contains(word)) { string stem = mStemmer == null ? word : mStemmer.GetStem(word).Trim().ToLower(); if (nGrams.Count < mMaxNGramLen) { WordStem wordStem = new WordStem(); wordStem.Word = word; wordStem.Stem = stem; nGrams.Add(wordStem); if (nGrams.Count < mMaxNGramLen) { continue; } } else { WordStem wordStem = nGrams[0]; wordStem.Word = word; wordStem.Stem = stem; for (int i = 0; i < mMaxNGramLen - 1; i++) { nGrams[i] = nGrams[i + 1]; } nGrams[mMaxNGramLen - 1] = wordStem; } ProcessNGramsPass1(nGrams, 0, docWords); } } int startIdx = nGrams.Count == mMaxNGramLen ? 1 : 0; for (int i = startIdx; i < nGrams.Count; i++) { ProcessNGramsPass1(nGrams, i, docWords); } } mLogger.ProgressFast(this, "Initialize", "Document {0} ...", docCount, docCount); } else // large-scale mode (needs less memory, slower) { for (int n = 1; n <= mMaxNGramLen; n++) { docCount = 0; mLogger.Info("Initialize", "Pass {0} of {1} ...", n, mMaxNGramLen); foreach (string document in documents) { docCount++; mLogger.ProgressFast(this, "Initialize", "Document {0} ...", docCount, /*numSteps=*/ -1); ArrayList <WordStem> nGrams = new ArrayList <WordStem>(n); Set <string> docWords = new Set <string>(); mTokenizer.Text = document; foreach (string token in mTokenizer) { string word = token.Trim().ToLower(); if (mStopWords == null || !mStopWords.Contains(word)) { string stem = mStemmer == null ? 
word : mStemmer.GetStem(word).Trim().ToLower(); if (nGrams.Count < n) { WordStem wordStem = new WordStem(); wordStem.Word = word; wordStem.Stem = stem; nGrams.Add(wordStem); if (nGrams.Count < n) { continue; } } else { WordStem wordStem = nGrams[0]; wordStem.Word = word; wordStem.Stem = stem; for (int i = 0; i < n - 1; i++) { nGrams[i] = nGrams[i + 1]; } nGrams[n - 1] = wordStem; } string nGram = nGrams[0].Word; string nGramStem = nGrams[0].Stem; if (n > 1) { for (int i = 1; i < n - 1; i++) { nGram += " " + nGrams[i].Word; nGramStem += " " + nGrams[i].Stem; } if (!mWordInfo.ContainsKey(nGramStem)) { continue; } if (mWordInfo[nGramStem].mFreq < mMinWordFreq) { continue; } string nGramStem2 = ""; for (int i = 1; i < n - 1; i++) { nGramStem2 += nGrams[i].Stem + " "; } nGramStem2 += nGrams[n - 1].Stem; if (!mWordInfo.ContainsKey(nGramStem2)) { continue; } if (mWordInfo[nGramStem2].mFreq < mMinWordFreq) { continue; } nGram += " " + nGrams[n - 1].Word; nGramStem += " " + nGrams[n - 1].Stem; } if (!mWordInfo.ContainsKey(nGramStem)) { Word nGramInfo = new Word(nGram, nGramStem); mWordInfo.Add(nGramStem, nGramInfo); docWords.Add(nGramStem); } else { Word nGramInfo = mWordInfo[nGramStem]; if (!docWords.Contains(nGramStem)) { nGramInfo.mDocFreq++; docWords.Add(nGramStem); } nGramInfo.mFreq++; if (!nGramInfo.mForms.ContainsKey(nGram)) { nGramInfo.mForms.Add(nGram, 1); } else { nGramInfo.mForms[nGram]++; } } } } } mLogger.ProgressFast(this, "Initialize", "Document {0} ...", docCount, docCount); } } // remove unfrequent words and n-grams, precompute IDF ArrayList <string> removeList = new ArrayList <string>(); foreach (KeyValuePair <string, Word> wordInfo in mWordInfo) { if (wordInfo.Value.mFreq < mMinWordFreq) { removeList.Add(wordInfo.Key); } else { wordInfo.Value.mIdf = Math.Log((double)docCount / (double)wordInfo.Value.mDocFreq); } } foreach (string key in removeList) { mWordInfo.Remove(key); } // determine most frequent word and n-gram forms foreach (Word wordInfo in mWordInfo.Values) { int max = 0; foreach (KeyValuePair <string, int> wordForm in wordInfo.mForms) { if (wordForm.Value > max) { max = wordForm.Value; wordInfo.mMostFrequentForm = wordForm.Key; } } if (!mKeepWordForms) { wordInfo.mForms.Clear(); } } // compute bag-of-words vectors mLogger.Info("Initialize", "Computing bag-of-words vectors ..."); int docNum = 1; foreach (string document in documents) { mLogger.ProgressFast(this, "Initialize", "Document {0} / {1} ...", docNum++, docCount); Dictionary <int, int> tfVec = new Dictionary <int, int>(); ArrayList <WordStem> nGrams = new ArrayList <WordStem>(mMaxNGramLen); mTokenizer.Text = document; foreach (string token in mTokenizer) { string word = token.Trim().ToLower(); if (mStopWords == null || !mStopWords.Contains(word)) { string stem = mStemmer == null ? word : mStemmer.GetStem(word).Trim().ToLower(); if (nGrams.Count < mMaxNGramLen) { WordStem wordStem = new WordStem(); wordStem.Word = word; wordStem.Stem = stem; nGrams.Add(wordStem); if (nGrams.Count < mMaxNGramLen) { continue; } } else { WordStem wordStem = nGrams[0]; wordStem.Word = word; wordStem.Stem = stem; for (int i = 0; i < mMaxNGramLen - 1; i++) { nGrams[i] = nGrams[i + 1]; } nGrams[mMaxNGramLen - 1] = wordStem; } ProcessNGramsPass2(nGrams, 0, tfVec); } } int startIdx = nGrams.Count == mMaxNGramLen ? 
1 : 0; for (int i = startIdx; i < nGrams.Count; i++) { ProcessNGramsPass2(nGrams, i, tfVec); } SparseVector <double> docVec = new SparseVector <double>(); if (mWordWeightType == WordWeightType.TermFreq) { foreach (KeyValuePair <int, int> tfItem in tfVec) { docVec.InnerIdx.Add(tfItem.Key); docVec.InnerDat.Add(tfItem.Value); } } else if (mWordWeightType == WordWeightType.TfIdf) { foreach (KeyValuePair <int, int> tfItem in tfVec) { double tfIdf = (double)tfItem.Value * mIdxInfo[tfItem.Key].mIdf; if (tfIdf > 0) { docVec.InnerIdx.Add(tfItem.Key); docVec.InnerDat.Add(tfIdf); } } } else if (mWordWeightType == WordWeightType.Dyakonov) { foreach (KeyValuePair <int, int> tfItem in tfVec) { double weight = (double)tfItem.Value / Math.Sqrt(mIdxInfo[tfItem.Key].mFreq); if (weight > 0) { docVec.InnerIdx.Add(tfItem.Key); docVec.InnerDat.Add(weight); } } } else if (mWordWeightType == WordWeightType.LogDfTfIdf) { foreach (KeyValuePair <int, int> tfItem in tfVec) { double tfIdf = (double)tfItem.Value * mIdxInfo[tfItem.Key].mIdf; if (tfIdf > 0) { docVec.InnerIdx.Add(tfItem.Key); docVec.InnerDat.Add(Math.Log(1 + mIdxInfo[tfItem.Key].mDocFreq) * tfIdf); } } } docVec.Sort(); CutLowWeights(ref docVec); if (mNormalizeVectors) { Utils.TryNrmVecL2(docVec); } if (keepBowVectors) { mBowVectors.Add(docVec); } else { bows.Add(docVec); } } return(bows); }
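Taken together, the workflow is: Initialize makes two passes over the training corpus (vocabulary and statistics first, bag-of-words vectors second), and ProcessDocument then maps any further document into the same feature space using the stored vocabulary, IDF values and weighting settings. A minimal driver might look like the following; the class name BowSpace and its construction/configuration are assumptions made purely for illustration — only the two method signatures are taken from the code above.

private static SparseVector<double> BuildSpaceAndProject(BowSpace bowSpace,
    IEnumerable<string> trainingDocuments, string unseenDocument)
{
    // pass 1 + pass 2: vocabulary, IDF statistics and training bag-of-words vectors
    bowSpace.Initialize(trainingDocuments);
    // project a new document into the same space (weighted, cut, optionally L2-normalized)
    return bowSpace.ProcessDocument(unseenDocument);
}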