Code Example #1
        public static T CreateBowSpace <T>(bool rmvStopWords, int maxNGramLen, WordWeightType wordWeightType, Language language, int minWordFreq) where T : BowSpace, new()
        {
            Set <string> .ReadOnly langStopWords = null;
            IStemmer stemmer = null;

            try
            {
                TextMiningUtils.GetLanguageTools(language, out langStopWords, out stemmer);
            }
            catch
            {
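                // language tools are not available for this language; leave stop words and stemmer unset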
            }

            // stemming is explicitly disabled for Portuguese in this example
            if (language == Language.Portuguese)
            {
                stemmer = null;
            }

            var bowSpc = new T
            {
                //Logger = logger,
                Tokenizer = new RegexTokenizer
                {
                    TokenRegex =
                        new[] { Language.Russian, Language.Bulgarian, Language.Serbian }.Contains(language)
                                    ? @"[#@$]?([\d_]*[\p{IsBasicLatin}\p{IsLatin-1Supplement}\p{IsLatinExtended-A}\p{IsLatinExtended-B}\p{IsLatinExtendedAdditional}\p{IsCyrillic}\p{IsCyrillicSupplement}-[^\p{L}]][\d_]*){2,}"
                                    : @"[#@$]?([\d_]*[\p{IsBasicLatin}\p{IsLatin-1Supplement}\p{IsLatinExtended-A}\p{IsLatinExtended-B}\p{IsLatinExtendedAdditional}-[^\p{L}]][\d_]*){2,}",
                    IgnoreUnknownTokens = true
                },
                CutLowWeightsPerc = 0,
                MaxNGramLen       = maxNGramLen,
                MinWordFreq       = minWordFreq,
                WordWeightType    = wordWeightType,
                NormalizeVectors  = true,
                Stemmer           = stemmer
            };

            if (langStopWords != null)
            {
                var stopWords = new Set <string>(langStopWords)
                {
                    "rt"
                };
                // additional stop words
                if (language == Language.English)
                {
                    stopWords.AddRange("im,youre,hes,shes,its,were,theyre,ive,youve,weve,theyve,youd,hed,theyd,youll,theyll,isnt,arent,wasnt,werent,hasnt,havent,hadnt,doesnt,dont,didnt,wont,wouldnt,shant,shouldnt,cant,couldnt,mustnt,lets,thats,whos,whats,heres,theres,whens,wheres,whys,hows,i,m,you,re,he,s,she,it,we,they,ve,d,ll,isn,t,aren,wasn,weren,hasn,haven,hadn,doesn,don,didn,won,wouldn,shan,shouldn,can,couldn,mustn,let,that,who,what,here,there,when,where,why,how".Split(','));
                }
                if (rmvStopWords)
                {
                    bowSpc.StopWords = stopWords;
                }
            }
            return bowSpc;
        }
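A minimal call sketch for reference (the type argument and parameter values are illustrative, and BowSpace is assumed to satisfy the new() constraint):

        var bowSpace = CreateBowSpace<BowSpace>(
            /*rmvStopWords=*/ true,
            /*maxNGramLen=*/ 2,
            WordWeightType.TfIdf,
            Language.English,
            /*minWordFreq=*/ 5);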
Code Example #2
 public void Load(BinarySerializer reader)
 {
     // the following statements throw serialization-related exceptions
     LoadVocabulary(reader); // throws ArgumentNullException
     mTokenizer = reader.ReadObject <ITokenizer>();
     mStopWords = reader.ReadObject <Set <string> .ReadOnly>();
     mStemmer   = reader.ReadObject <IStemmer>();
     mTfVectors.Load(reader);
     mMaxNGramLen       = reader.ReadInt();
     mWordWeightType    = (WordWeightType)reader.ReadInt();
     mCutLowWeightsPerc = reader.ReadDouble();
     mNormalizeVectors  = reader.ReadBool();
 }
Code Example #3
File: BowSpace.cs Project: mgrcar/Detextive
 public void Load(BinarySerializer reader)
 {
     // the following statements throw serialization-related exceptions
     LoadVocabulary(reader); // throws ArgumentNullException
     m_tokenizer  = reader.ReadObject <ITokenizer>();
     m_stop_words = reader.ReadObject <Set <string> .ReadOnly>();
     m_stemmer    = reader.ReadObject <IStemmer>();
     m_bow_vectors.Load(reader);
     m_max_n_gram_len       = reader.ReadInt();
     m_min_word_freq        = reader.ReadInt();
     m_word_weight_type     = (WordWeightType)reader.ReadInt();
     m_cut_low_weights_perc = reader.ReadDouble();
     m_normalize_vectors    = reader.ReadBool();
 }
Code Example #4
        public ArrayList <SparseVector <double> > GetMostRecentBows(int num, WordWeightType wordWeightType, bool normalizeVectors, double cutLowWeightsPerc,
                                                                    int minWordFreq)
        {
            Utils.ThrowException(num < 0 ? new ArgumentOutOfRangeException("num") : null);
            Utils.ThrowException(cutLowWeightsPerc < 0 || cutLowWeightsPerc >= 1 ? new ArgumentOutOfRangeException("cutLowWeightsPerc") : null);
            Utils.ThrowException(minWordFreq < 1 ? new ArgumentOutOfRangeException("minWordFreq") : null);
            num = Math.Min(num, mTfVectors.Count);
            ArrayList <SparseVector <double> > bowVectors = new ArrayList <SparseVector <double> >(num);
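            // the most recent BOWs correspond to the last num TF vectors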

            if (wordWeightType == WordWeightType.TermFreq)
            {
                for (int i = mTfVectors.Count - num; i < mTfVectors.Count; i++)
                {
                    SparseVector <double> tfVec = mTfVectors[i];
                    SparseVector <double> tmp   = new SparseVector <double>(tfVec.Count);
                    foreach (IdxDat <double> tfInfo in tfVec)
                    {
                        if (mIdxInfo[tfInfo.Idx].Freq >= minWordFreq)
                        {
                            tmp.InnerIdx.Add(tfInfo.Idx);
                            tmp.InnerDat.Add(tfInfo.Dat);
                        }
                    }
                    ModelUtils.CutLowWeights(ref tmp, cutLowWeightsPerc);
                    if (normalizeVectors)
                    {
                        ModelUtils.TryNrmVecL2(tmp);
                    }
                    bowVectors.Add(tmp);
                }
            }
            else if (wordWeightType == WordWeightType.TfIdf || wordWeightType == WordWeightType.TfIdfSafe)
            {
                for (int i = mTfVectors.Count - num; i < mTfVectors.Count; i++)
                {
                    SparseVector <double> tfVec = mTfVectors[i];
                    SparseVector <double> tmp   = new SparseVector <double>(tfVec.Count);
                    foreach (IdxDat <double> tfInfo in tfVec)
                    {
                        if (mIdxInfo[tfInfo.Idx].Freq >= minWordFreq)
                        {
                            tmp.InnerIdx.Add(tfInfo.Idx);
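                            // TfIdfSafe passes the document count + 1 to Idf, presumably so that terms occurring in every document keep a non-zero IDF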
                            tmp.InnerDat.Add(tfInfo.Dat * Idf(mIdxInfo[tfInfo.Idx], mTfVectors.Count + (wordWeightType == WordWeightType.TfIdf ? 0 : 1)));
                        }
                    }
                    ModelUtils.CutLowWeights(ref tmp, cutLowWeightsPerc);
                    if (normalizeVectors)
                    {
                        ModelUtils.TryNrmVecL2(tmp);
                    }
                    bowVectors.Add(tmp);
                }
            }
            else if (wordWeightType == WordWeightType.LogDfTfIdf)
            {
                for (int i = mTfVectors.Count - num; i < mTfVectors.Count; i++)
                {
                    SparseVector <double> tfVec = mTfVectors[i];
                    SparseVector <double> tmp   = new SparseVector <double>(tfVec.Count);
                    foreach (IdxDat <double> tfInfo in tfVec)
                    {
                        if (mIdxInfo[tfInfo.Idx].Freq >= minWordFreq)
                        {
                            tmp.InnerIdx.Add(tfInfo.Idx);
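                            // LogDfTfIdf additionally scales the TF-IDF weight by log(1 + document frequency)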
                            double tfIdf = tfInfo.Dat * Idf(mIdxInfo[tfInfo.Idx], mTfVectors.Count);
                            tmp.InnerDat.Add(Math.Log(1 + mIdxInfo[tfInfo.Idx].mDocFreq) * tfIdf);
                        }
                    }
                    ModelUtils.CutLowWeights(ref tmp, cutLowWeightsPerc);
                    if (normalizeVectors)
                    {
                        ModelUtils.TryNrmVecL2(tmp);
                    }
                    bowVectors.Add(tmp);
                }
            }
            return bowVectors;
        }
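A minimal call sketch for reference (bowSpace stands for an already populated instance of the class defining this method; the argument values are illustrative and respect the checks above, i.e. cutLowWeightsPerc in [0, 1) and minWordFreq >= 1):

        ArrayList<SparseVector<double>> bows = bowSpace.GetMostRecentBows(
            /*num=*/ 100, WordWeightType.TfIdf, /*normalizeVectors=*/ true,
            /*cutLowWeightsPerc=*/ 0.2, /*minWordFreq=*/ 1);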
Code Example #5
File: IncrementalBowSpace.cs Project: viidea/latino
 public void Load(BinarySerializer reader)
 {
     // the following statements throw serialization-related exceptions
     LoadVocabulary(reader); // throws ArgumentNullException
     mTokenizer = reader.ReadObject<ITokenizer>();
     mStopWords = reader.ReadObject<Set<string>.ReadOnly>();
     mStemmer = reader.ReadObject<IStemmer>();
     mTfVectors.Load(reader);
     mMaxNGramLen = reader.ReadInt();
     mWordWeightType = (WordWeightType)reader.ReadInt();
     mCutLowWeightsPerc = reader.ReadDouble();
     mNormalizeVectors = reader.ReadBool();
 }
Code Example #6
 public ArrayList<SparseVector<double>> GetMostRecentBows(int num, WordWeightType wordWeightType, bool normalizeVectors, double cutLowWeightsPerc,
     int minWordFreq)
 {
     Utils.ThrowException(num < 0 ? new ArgumentOutOfRangeException("num") : null);
     Utils.ThrowException(cutLowWeightsPerc < 0 || cutLowWeightsPerc >= 1 ? new ArgumentOutOfRangeException("cutLowWeightsPerc") : null);
     Utils.ThrowException(minWordFreq < 1 ? new ArgumentOutOfRangeException("minWordFreq") : null);
     num = Math.Min(num, mTfVectors.Count);
     ArrayList<SparseVector<double>> bowVectors = new ArrayList<SparseVector<double>>(num);
     if (wordWeightType == WordWeightType.TermFreq)
     {
         for (int i = mTfVectors.Count - num; i < mTfVectors.Count; i++)
         {
             SparseVector<double> tfVec = mTfVectors[i];
             SparseVector<double> tmp = new SparseVector<double>(tfVec.Count);
             foreach (IdxDat<double> tfInfo in tfVec)
             {
                 if (mIdxInfo[tfInfo.Idx].Freq >= minWordFreq)
                 {
                     tmp.InnerIdx.Add(tfInfo.Idx);
                     tmp.InnerDat.Add(tfInfo.Dat);
                 }
             }
             ModelUtils.CutLowWeights(ref tmp, cutLowWeightsPerc);
             if (normalizeVectors) { ModelUtils.TryNrmVecL2(tmp); }
             bowVectors.Add(tmp);
         }
     }
     else if (wordWeightType == WordWeightType.TfIdf)
     {
         for (int i = mTfVectors.Count - num; i < mTfVectors.Count; i++)
         {
             SparseVector<double> tfVec = mTfVectors[i];
             SparseVector<double> tmp = new SparseVector<double>(tfVec.Count);
             foreach (IdxDat<double> tfInfo in tfVec)
             {
                 if (mIdxInfo[tfInfo.Idx].Freq >= minWordFreq)
                 {
                     tmp.InnerIdx.Add(tfInfo.Idx);
                     tmp.InnerDat.Add(tfInfo.Dat * Idf(mIdxInfo[tfInfo.Idx], mTfVectors.Count));
                 }
             }
             ModelUtils.CutLowWeights(ref tmp, cutLowWeightsPerc);
             if (normalizeVectors) { ModelUtils.TryNrmVecL2(tmp); }
             bowVectors.Add(tmp);
         }
     }
     else if (wordWeightType == WordWeightType.LogDfTfIdf)
     {
         for (int i = mTfVectors.Count - num; i < mTfVectors.Count; i++)
         {
             SparseVector<double> tfVec = mTfVectors[i];
             SparseVector<double> tmp = new SparseVector<double>(tfVec.Count);
             foreach (IdxDat<double> tfInfo in tfVec)
             {
                 if (mIdxInfo[tfInfo.Idx].Freq >= minWordFreq)
                 {
                     tmp.InnerIdx.Add(tfInfo.Idx);
                     double tfIdf = tfInfo.Dat * Idf(mIdxInfo[tfInfo.Idx], mTfVectors.Count);
                     tmp.InnerDat.Add(Math.Log(1 + mIdxInfo[tfInfo.Idx].mDocFreq) * tfIdf);
                 }
             }
             ModelUtils.CutLowWeights(ref tmp, cutLowWeightsPerc);
             if (normalizeVectors) { ModelUtils.TryNrmVecL2(tmp); }
             bowVectors.Add(tmp);
         }
     }
     return bowVectors;
 }