/// <summary>
/// Builds and configures a bag-of-words space of type <typeparamref name="T"/> for the given language.
/// </summary>
/// <param name="rmvStopWords">If true, the language's stop words (plus Twitter-style extras) are installed on the space.</param>
/// <param name="maxNGramLen">Maximum n-gram length for the vocabulary.</param>
/// <param name="wordWeightType">Term weighting scheme for the space.</param>
/// <param name="language">Language used to pick stemmer, stop words, and tokenizer alphabet.</param>
/// <param name="minWordFreq">Minimum word frequency for vocabulary inclusion.</param>
/// <returns>A fully configured, empty bow space.</returns>
public static T CreateBowSpace<T>(bool rmvStopWords, int maxNGramLen, WordWeightType wordWeightType, Language language, int minWordFreq) where T : BowSpace, new()
{
    Set<string>.ReadOnly defaultStopWords = null;
    IStemmer langStemmer = null;
    try
    {
        TextMiningUtils.GetLanguageTools(language, out defaultStopWords, out langStemmer);
    }
    catch
    {
        // Best effort: languages without bundled tools simply get no stemmer / stop words.
    }
    if (language == Language.Portuguese)
    {
        // Portuguese stemming is deliberately disabled here.
        langStemmer = null;
    }
    // Cyrillic-script languages need the Cyrillic blocks added to the token alphabet.
    bool cyrillicScript = language == Language.Russian || language == Language.Bulgarian || language == Language.Serbian;
    string tokenRegex = cyrillicScript
        ? @"[#@$]?([\d_]*[\p{IsBasicLatin}\p{IsLatin-1Supplement}\p{IsLatinExtended-A}\p{IsLatinExtended-B}\p{IsLatinExtendedAdditional}\p{IsCyrillic}\p{IsCyrillicSupplement}-[^\p{L}]][\d_]*){2,}"
        : @"[#@$]?([\d_]*[\p{IsBasicLatin}\p{IsLatin-1Supplement}\p{IsLatinExtended-A}\p{IsLatinExtended-B}\p{IsLatinExtendedAdditional}-[^\p{L}]][\d_]*){2,}";
    var bowSpc = new T();
    bowSpc.Tokenizer = new RegexTokenizer
    {
        TokenRegex = tokenRegex,
        IgnoreUnknownTokens = true
    };
    bowSpc.CutLowWeightsPerc = 0;
    bowSpc.MaxNGramLen = maxNGramLen;
    bowSpc.MinWordFreq = minWordFreq;
    bowSpc.WordWeightType = wordWeightType;
    bowSpc.NormalizeVectors = true;
    bowSpc.Stemmer = langStemmer;
    if (defaultStopWords != null)
    {
        var stopWords = new Set<string>(defaultStopWords);
        stopWords.Add("rt"); // Twitter retweet marker
        if (language == Language.English)
        {
            // Contractions with and without apostrophes, plus their stray fragments after tokenization.
            stopWords.AddRange("im,youre,hes,shes,its,were,theyre,ive,youve,weve,theyve,youd,hed,theyd,youll,theyll,isnt,arent,wasnt,werent,hasnt,havent,hadnt,doesnt,dont,didnt,wont,wouldnt,shant,shouldnt,cant,couldnt,mustnt,lets,thats,whos,whats,heres,theres,whens,wheres,whys,hows,i,m,you,re,he,s,she,it,we,they,ve,d,ll,isn,t,aren,wasn,weren,hasn,haven,hadn,doesn,don,didn,won,wouldn,shan,shouldn,can,couldn,mustn,let,that,who,what,here,there,when,where,why,how".Split(','));
        }
        // Stop words are computed either way, but only installed when removal is requested.
        if (rmvStopWords)
        {
            bowSpc.StopWords = stopWords;
        }
    }
    return bowSpc;
}
// Deserializes this bow space from 'reader'. The read order is the wire format
// and must mirror the matching Save() exactly.
// NOTE(review): unlike the sibling Load variant that also reads a min-word-freq
// value, this version reads none — confirm against the corresponding Save()
// that the stream layouts agree.
public void Load(BinarySerializer reader)
{
    // the following statements throw serialization-related exceptions
    LoadVocabulary(reader); // throws ArgumentNullException
    mTokenizer = reader.ReadObject <ITokenizer>();
    mStopWords = reader.ReadObject <Set <string> .ReadOnly>();
    mStemmer = reader.ReadObject <IStemmer>();
    mTfVectors.Load(reader);
    mMaxNGramLen = reader.ReadInt();
    mWordWeightType = (WordWeightType)reader.ReadInt();
    mCutLowWeightsPerc = reader.ReadDouble();
    mNormalizeVectors = reader.ReadBool();
}
// Deserializes this bow space from 'reader'. The read order is the wire format
// and must mirror the matching Save() exactly. This variant (snake_case fields)
// additionally reads a min-word-freq value, which the mCamelCase Load variants
// in this file do not — the two serialized layouts are not interchangeable.
public void Load(BinarySerializer reader)
{
    // the following statements throw serialization-related exceptions
    LoadVocabulary(reader); // throws ArgumentNullException
    m_tokenizer = reader.ReadObject <ITokenizer>();
    m_stop_words = reader.ReadObject <Set <string> .ReadOnly>();
    m_stemmer = reader.ReadObject <IStemmer>();
    m_bow_vectors.Load(reader);
    m_max_n_gram_len = reader.ReadInt();
    m_min_word_freq = reader.ReadInt();
    m_word_weight_type = (WordWeightType)reader.ReadInt();
    m_cut_low_weights_perc = reader.ReadDouble();
    m_normalize_vectors = reader.ReadBool();
}
/// <summary>
/// Computes BOW vectors for the most recent <paramref name="num"/> documents,
/// weighting each term according to <paramref name="wordWeightType"/>.
/// Words whose corpus frequency is below <paramref name="minWordFreq"/> are dropped.
/// </summary>
/// <param name="num">Number of most recent documents to convert; clamped to the corpus size. Must be >= 0.</param>
/// <param name="wordWeightType">Term weighting scheme; an unsupported value yields an empty list (original behavior, preserved).</param>
/// <param name="normalizeVectors">If true, each vector is L2-normalized (best effort via TryNrmVecL2).</param>
/// <param name="cutLowWeightsPerc">Low-weight cutoff fraction; must be in [0, 1).</param>
/// <param name="minWordFreq">Minimum corpus word frequency to keep a term; must be >= 1.</param>
/// <returns>One sparse vector per document, oldest of the selected range first.</returns>
public ArrayList <SparseVector <double> > GetMostRecentBows(int num, WordWeightType wordWeightType, bool normalizeVectors, double cutLowWeightsPerc, int minWordFreq)
{
    Utils.ThrowException(num < 0 ? new ArgumentOutOfRangeException("num") : null);
    Utils.ThrowException(cutLowWeightsPerc < 0 || cutLowWeightsPerc >= 1 ? new ArgumentOutOfRangeException("cutLowWeightsPerc") : null);
    Utils.ThrowException(minWordFreq < 1 ? new ArgumentOutOfRangeException("minWordFreq") : null);
    num = Math.Min(num, mTfVectors.Count);
    ArrayList <SparseVector <double> > bowVectors = new ArrayList <SparseVector <double> >(num);
    if (wordWeightType != WordWeightType.TermFreq && wordWeightType != WordWeightType.TfIdf
        && wordWeightType != WordWeightType.TfIdfSafe && wordWeightType != WordWeightType.LogDfTfIdf)
    {
        return bowVectors; // unsupported scheme: no branch matched in the original either
    }
    // TfIdfSafe counts one extra (virtual) document — presumably so Idf never hits
    // a term present in every document; confirm against Idf()'s formula.
    int docCount = mTfVectors.Count + (wordWeightType == WordWeightType.TfIdfSafe ? 1 : 0);
    // Single loop replaces the three copy-pasted per-scheme loops; only the
    // weight formula differs per scheme.
    for (int i = mTfVectors.Count - num; i < mTfVectors.Count; i++)
    {
        SparseVector <double> tfVec = mTfVectors[i];
        SparseVector <double> tmp = new SparseVector <double>(tfVec.Count);
        foreach (IdxDat <double> tfInfo in tfVec)
        {
            if (mIdxInfo[tfInfo.Idx].Freq < minWordFreq) { continue; } // too rare: skip
            double weight;
            if (wordWeightType == WordWeightType.TermFreq)
            {
                weight = tfInfo.Dat;
            }
            else if (wordWeightType == WordWeightType.LogDfTfIdf)
            {
                // TF-IDF additionally scaled by log(1 + document frequency);
                // grouping kept identical to the original for bit-equal doubles.
                double tfIdf = tfInfo.Dat * Idf(mIdxInfo[tfInfo.Idx], docCount);
                weight = Math.Log(1 + mIdxInfo[tfInfo.Idx].mDocFreq) * tfIdf;
            }
            else // TfIdf or TfIdfSafe (they differ only via docCount above)
            {
                weight = tfInfo.Dat * Idf(mIdxInfo[tfInfo.Idx], docCount);
            }
            tmp.InnerIdx.Add(tfInfo.Idx);
            tmp.InnerDat.Add(weight);
        }
        ModelUtils.CutLowWeights(ref tmp, cutLowWeightsPerc);
        if (normalizeVectors) { ModelUtils.TryNrmVecL2(tmp); }
        bowVectors.Add(tmp);
    }
    return bowVectors;
}
// Deserializes this bow space from 'reader'. The read order is the wire format
// and must mirror the matching Save() exactly.
// NOTE(review): unlike the snake_case Load variant that also reads a
// min-word-freq value, this version reads none — confirm against the
// corresponding Save() that the stream layouts agree.
public void Load(BinarySerializer reader)
{
    // the following statements throw serialization-related exceptions
    LoadVocabulary(reader); // throws ArgumentNullException
    mTokenizer = reader.ReadObject<ITokenizer>();
    mStopWords = reader.ReadObject<Set<string>.ReadOnly>();
    mStemmer = reader.ReadObject<IStemmer>();
    mTfVectors.Load(reader);
    mMaxNGramLen = reader.ReadInt();
    mWordWeightType = (WordWeightType)reader.ReadInt();
    mCutLowWeightsPerc = reader.ReadDouble();
    mNormalizeVectors = reader.ReadBool();
}
/// <summary>
/// Computes BOW vectors for the most recent <paramref name="num"/> documents,
/// weighting each term according to <paramref name="wordWeightType"/>.
/// Words whose corpus frequency is below <paramref name="minWordFreq"/> are dropped.
/// </summary>
/// <param name="num">Number of most recent documents to convert; clamped to the corpus size. Must be >= 0.</param>
/// <param name="wordWeightType">Term weighting scheme; an unsupported value yields an empty list (original behavior, preserved).</param>
/// <param name="normalizeVectors">If true, each vector is L2-normalized (best effort via TryNrmVecL2).</param>
/// <param name="cutLowWeightsPerc">Low-weight cutoff fraction; must be in [0, 1).</param>
/// <param name="minWordFreq">Minimum corpus word frequency to keep a term; must be >= 1.</param>
/// <returns>One sparse vector per document, oldest of the selected range first.</returns>
public ArrayList<SparseVector<double>> GetMostRecentBows(int num, WordWeightType wordWeightType, bool normalizeVectors, double cutLowWeightsPerc, int minWordFreq)
{
    Utils.ThrowException(num < 0 ? new ArgumentOutOfRangeException("num") : null);
    Utils.ThrowException(cutLowWeightsPerc < 0 || cutLowWeightsPerc >= 1 ? new ArgumentOutOfRangeException("cutLowWeightsPerc") : null);
    Utils.ThrowException(minWordFreq < 1 ? new ArgumentOutOfRangeException("minWordFreq") : null);
    num = Math.Min(num, mTfVectors.Count);
    ArrayList<SparseVector<double>> bowVectors = new ArrayList<SparseVector<double>>(num);
    if (wordWeightType != WordWeightType.TermFreq && wordWeightType != WordWeightType.TfIdf
        && wordWeightType != WordWeightType.LogDfTfIdf)
    {
        return bowVectors; // unsupported scheme: no branch matched in the original either
    }
    // Single loop replaces the three copy-pasted per-scheme loops; only the
    // weight formula differs per scheme.
    for (int i = mTfVectors.Count - num; i < mTfVectors.Count; i++)
    {
        SparseVector<double> tfVec = mTfVectors[i];
        SparseVector<double> tmp = new SparseVector<double>(tfVec.Count);
        foreach (IdxDat<double> tfInfo in tfVec)
        {
            if (mIdxInfo[tfInfo.Idx].Freq < minWordFreq) { continue; } // too rare: skip
            double weight;
            if (wordWeightType == WordWeightType.TermFreq)
            {
                weight = tfInfo.Dat;
            }
            else if (wordWeightType == WordWeightType.LogDfTfIdf)
            {
                // TF-IDF additionally scaled by log(1 + document frequency);
                // grouping kept identical to the original for bit-equal doubles.
                double tfIdf = tfInfo.Dat * Idf(mIdxInfo[tfInfo.Idx], mTfVectors.Count);
                weight = Math.Log(1 + mIdxInfo[tfInfo.Idx].mDocFreq) * tfIdf;
            }
            else // TfIdf
            {
                weight = tfInfo.Dat * Idf(mIdxInfo[tfInfo.Idx], mTfVectors.Count);
            }
            tmp.InnerIdx.Add(tfInfo.Idx);
            tmp.InnerDat.Add(weight);
        }
        ModelUtils.CutLowWeights(ref tmp, cutLowWeightsPerc);
        if (normalizeVectors) { ModelUtils.TryNrmVecL2(tmp); }
        bowVectors.Add(tmp);
    }
    return bowVectors;
}