Exemple #1
0
        /// <summary>
        /// Rebuilds the language profiles from the <c>*.txt</c> corpus files found under
        /// <paramref name="dir"/> (sub-folders included).
        /// </summary>
        /// <remarks>
        /// The first two characters of each file name are used as the language code. Files are
        /// visited in sorted path order and a new profile is started whenever the language differs
        /// from that of the previously accepted file, so files of one language have to sort next to
        /// each other to end up in the same profile. Files whose code resolves to
        /// <c>Language.Unspecified</c> are skipped.
        /// </remarks>
        /// <param name="dir">Corpus folder; must not be null and must exist.</param>
        /// <param name="cutOff">Value handed to <c>Trim</c> on every profile; must be at least 1.</param>
        /// <exception cref="ArgumentNullException"><paramref name="dir"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="cutOff"/> is less than 1.</exception>
        public void ReadCorpus(string dir, int cutOff)
        {
            Utils.ThrowException(dir == null ? new ArgumentNullException("dir") : null);
            Utils.ThrowException(!Utils.VerifyFolderName(dir, /*mustExist=*/ true) ? new ArgumentValueException("dir") : null);
            Utils.ThrowException(cutOff < 1 ? new ArgumentOutOfRangeException("cutOff") : null);

            // NOTE(review): languageProfiles is only null-checked here; the Add call and the final
            // loop below dereference it unconditionally - confirm it can never be null at this point.
            if (languageProfiles != null)
            {
                languageProfiles.Clear();
            }

            string[] files = Directory.GetFiles(dir, "*.txt", SearchOption.AllDirectories);
            Array.Sort(files);

            LanguageProfile langProfile = null;
            Language        lastLang    = Language.Unspecified;

            foreach (string f in files)
            {
                string fileName = Path.GetFileName(f);
                // Language codes are identifiers, not linguistic text: lower-case them with the
                // invariant culture so that e.g. "IT" does not become "ıt" under a Turkish locale.
                string   fileLangCode = fileName.Substring(0, 2).ToLowerInvariant();
                Language lang         = TextMiningUtils.GetLanguage(fileLangCode);
                // Skips file names starting with unknown language code
                if (lang == Language.Unspecified)
                {
                    continue;
                }
                mLogger.Debug("ReadCorpus", lang + ":\t" + fileName);

                bool startsNewLanguage = lang != lastLang;
                if (startsNewLanguage)
                {
                    langProfile = new LanguageProfile(n, lang);
                }
                // Tokens are loaded before the profile is registered, so a profile whose first
                // file fails to load is never added to the list.
                langProfile.AddTokensFromFile(f);
                if (startsNewLanguage)
                {
                    languageProfiles.Add(langProfile);
                    lastLang = lang;
                }
            }

            // Does ranking of language profiles
            // No n-grams should be added to languages after this!
            foreach (LanguageProfile l in languageProfiles)
            {
                l.Trim(cutOff);
                l.DoRanking(); // throws InvalidOperationException
            }
        }
Exemple #2
0
        /// <summary>
        /// Rebuilds the language profiles from the <c>*.txt</c> corpus files found under
        /// <paramref name="dir"/> (sub-folders included).
        /// </summary>
        /// <remarks>
        /// The first two characters of each file name are used as the language code. Files are
        /// visited in sorted path order and a new profile is started whenever the language differs
        /// from that of the previously accepted file, so files of one language have to sort next to
        /// each other to end up in the same profile. Files whose code resolves to
        /// <c>Language.Unspecified</c> are logged as a warning and skipped.
        /// </remarks>
        /// <param name="dir">Corpus folder; must not be null and must exist.</param>
        /// <param name="cutOff">Value handed to <c>Trim</c> on every profile; must be at least 1.</param>
        /// <param name="codePage">Passed to the constructor of every created <c>LanguageProfile</c>.</param>
        /// <param name="loadAs">Passed to <c>AddTokensFromFile</c> for every corpus file.</param>
        /// <exception cref="ArgumentNullException"><paramref name="dir"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="cutOff"/> is less than 1.</exception>
        public void BuildProfilesFromCorpus(string dir, int cutOff, Encoding codePage, Encoding loadAs)
        {
            Utils.ThrowException(dir == null ? new ArgumentNullException("dir") : null);
            Utils.ThrowException(!Utils.VerifyFolderName(dir, /*mustExist=*/ true) ? new ArgumentValueException("dir") : null);
            Utils.ThrowException(cutOff < 1 ? new ArgumentOutOfRangeException("cutOff") : null);
            mLanguageProfiles.Clear();
            string[] files = Directory.GetFiles(dir, "*.txt", SearchOption.AllDirectories);
            Array.Sort(files);
            LanguageProfile langProfile = null;
            Language        lastLang    = Language.Unspecified;

            foreach (string f in files)
            {
                string fileName = Path.GetFileName(f);
                // Language codes are identifiers, not linguistic text: lower-case them with the
                // invariant culture so that e.g. "IT" does not become "ıt" under a Turkish locale.
                string   fileLangCode = fileName.Substring(0, 2).ToLowerInvariant();
                Language lang         = TextMiningUtils.GetLanguage(fileLangCode);
                // skip file names starting with unknown language code
                if (lang == Language.Unspecified)
                {
                    mLogger.Warn("BuildProfilesFromCorpus", "Unknown: " + fileName);
                    continue;
                }
                mLogger.Debug("BuildProfilesFromCorpus", lang + ": " + fileName);

                bool startsNewLanguage = lang != lastLang;
                if (startsNewLanguage)
                {
                    langProfile = new LanguageProfile(mN, lang, codePage);
                }
                // Tokens are loaded before the profile is registered, so a profile whose first
                // file fails to load is never added to the list.
                langProfile.AddTokensFromFile(f, loadAs);
                if (startsNewLanguage)
                {
                    mLanguageProfiles.Add(langProfile);
                    lastLang = lang;
                }
            }
            // ranks n-grams
            // *** n-grams should not be added to languages after this
            foreach (LanguageProfile l in mLanguageProfiles)
            {
                l.Trim(cutOff);
                l.DoRanking(); // throws InvalidOperationException
            }
        }
Exemple #3
0
        /// <summary>
        /// Converts a single document into a sparse bag-of-words vector, weighted according to
        /// <c>mWordWeightType</c>.
        /// </summary>
        /// <remarks>
        /// Stop words and stemmer are looked up per language code and cached in <c>stemmers</c> /
        /// <c>stopWordDict</c>. If no language tools exist for the language, the English tools are
        /// used (and cached under the requested code) and an error is logged.
        /// </remarks>
        /// <param name="document">Text to tokenize.</param>
        /// <param name="language">Language code used to select stop words and stemmer.</param>
        /// <returns>
        /// The sorted document vector after <c>CutLowWeights</c>, L2-normalized when
        /// <c>mNormalizeVectors</c> is set.
        /// </returns>
        public SparseVector <double> ProcessDocument(string document, string language)
        {
            Dictionary <int, int> tfVec  = new Dictionary <int, int>();
            ArrayList <WordStem>  nGrams = new ArrayList <WordStem>(mMaxNGramLen);

            mTokenizer.Text = document;

            IStemmer stemmer;

            Set <string> .ReadOnly stopWords;

            if (stemmers.ContainsKey(language))
            {
                stemmer   = stemmers[language];
                stopWords = stopWordDict[language];
            }
            else
            {
                Language lang = TextMiningUtils.GetLanguage(language);

                try
                {
                    TextMiningUtils.GetLanguageTools(lang, out stopWords, out stemmer);
                }
                catch (ArgumentNotSupportedException)   // Language tools do not exist, so fall back to English
                {
                    TextMiningUtils.GetLanguageTools(Language.English, out stopWords, out stemmer);
                    // Report under this method's own name (was a copy-pasted "Initialize").
                    mLogger.Error("ProcessDocument", "Missing language tools for language code {0}.", language);
                }

                // Cache the resolved tools (including an English fallback) for later calls.
                stemmers[language]     = stemmer;
                stopWordDict[language] = stopWords;
            }

            // nGrams is a sliding window over the last mMaxNGramLen non-stop-word tokens.
            foreach (string token in mTokenizer)
            {
                string word = token.Trim().ToLower();
                if (stopWords == null || !stopWords.Contains(word))
                {
                    string stem = stemmer == null ? word : stemmer.GetStem(word).Trim().ToLower();
                    if (nGrams.Count < mMaxNGramLen)
                    {
                        // Window still filling up.
                        WordStem wordStem = new WordStem();
                        wordStem.Word = word;
                        wordStem.Stem = stem;
                        nGrams.Add(wordStem);
                        if (nGrams.Count < mMaxNGramLen)
                        {
                            continue;
                        }
                    }
                    else
                    {
                        // Window full: reuse the oldest entry for the new token, shift the rest
                        // one position to the left and place the new token at the end.
                        WordStem wordStem = nGrams[0];
                        wordStem.Word = word;
                        wordStem.Stem = stem;
                        for (int i = 0; i < mMaxNGramLen - 1; i++)
                        {
                            nGrams[i] = nGrams[i + 1];
                        }
                        nGrams[mMaxNGramLen - 1] = wordStem;
                    }
                    ProcessDocumentNGrams(nGrams, 0, tfVec);
                }
            }
            // Flush the tail of the window. Position 0 of a full window has already been
            // processed inside the loop, so start at 1 in that case.
            int startIdx = nGrams.Count == mMaxNGramLen ? 1 : 0;

            for (int i = startIdx; i < nGrams.Count; i++)
            {
                ProcessDocumentNGrams(nGrams, i, tfVec);
            }
            SparseVector <double> docVec = new SparseVector <double>();

            if (mWordWeightType == WordWeightType.TermFreq)
            {
                foreach (KeyValuePair <int, int> tfItem in tfVec)
                {
                    docVec.InnerIdx.Add(tfItem.Key);
                    docVec.InnerDat.Add(tfItem.Value);
                }
            }
            else if (mWordWeightType == WordWeightType.TfIdf || mWordWeightType == WordWeightType.LogDfTfIdf)
            {
                // Both schemes share the TF-IDF term and drop non-positive weights;
                // LogDfTfIdf additionally multiplies by log(1 + document frequency).
                bool scaleByLogDf = mWordWeightType == WordWeightType.LogDfTfIdf;
                foreach (KeyValuePair <int, int> tfItem in tfVec)
                {
                    double tfIdf = (double)tfItem.Value * mIdxInfo[tfItem.Key].mIdf;
                    if (tfIdf > 0)
                    {
                        docVec.InnerIdx.Add(tfItem.Key);
                        if (scaleByLogDf)
                        {
                            docVec.InnerDat.Add(Math.Log(1 + mIdxInfo[tfItem.Key].mDocFreq) * tfIdf);
                        }
                        else
                        {
                            docVec.InnerDat.Add(tfIdf);
                        }
                    }
                }
            }
            docVec.Sort();
            CutLowWeights(ref docVec);
            if (mNormalizeVectors)
            {
                Utils.TryNrmVecL2(docVec);
            }
            return(docVec);
        }
Exemple #4
0
        /// <summary>
        /// Rebuilds the vocabulary from <paramref name="documents"/> and computes one
        /// bag-of-words vector per document.
        /// </summary>
        /// <remarks>
        /// Clears <c>mWordInfo</c>, <c>mIdxInfo</c> and <c>mBowVectors</c> first. The
        /// <c>stemmers</c> / <c>stopWordDict</c> caches are not cleared here.
        /// <paramref name="documents"/> is enumerated twice (vocabulary pass, then vector pass),
        /// so the sequence has to yield the same items both times; the second pass reads the
        /// language tools cached during the first pass by language code.
        /// </remarks>
        /// <param name="documents">
        /// Items whose <c>First</c> is the text to tokenize and whose <c>Second</c> is the
        /// language code used to select stop words and stemmer.
        /// </param>
        /// <param name="keepBowVectors">
        /// When true, vectors are stored in <c>mBowVectors</c> and the method returns null;
        /// when false, they are collected into the returned list instead.
        /// </param>
        /// <returns>The list of document vectors, or null when <paramref name="keepBowVectors"/> is true.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="documents"/> is null.</exception>
        public ArrayList <SparseVector <double> > Initialize(IEnumerable <KeyDat <string, string> > documents, bool keepBowVectors)
        {
            Utils.ThrowException(documents == null ? new ArgumentNullException("documents") : null);
            Debug.Assert(documents != null, "Documents are always passed");
            mWordInfo.Clear();
            mIdxInfo.Clear();
            mBowVectors.Clear();
            // Only allocated when the caller wants the vectors returned rather than kept internally.
            ArrayList <SparseVector <double> > bows = keepBowVectors ? null : new ArrayList <SparseVector <double> >();

            // build vocabulary
            mLogger.Info("Initialize", "Building vocabulary ...");
            int docCount = 0;

            foreach (var document in documents)
            {
                docCount++;
                mLogger.ProgressFast(this, "Initialize", "Document {0} ...", docCount, /*numSteps=n*/ -1);
                Set <string>         docWords = new Set <string>();
                // Sliding window over the last mMaxNGramLen non-stop-word tokens.
                ArrayList <WordStem> nGrams   = new ArrayList <WordStem>(mMaxNGramLen);
                mTokenizer.Text = document.First;

                IStemmer stemmer;
                Set <string> .ReadOnly stopWords;
                // Setup stopwords and stemmer
                if (stemmers.ContainsKey(document.Second))
                {
                    stemmer   = stemmers[document.Second];
                    stopWords = stopWordDict[document.Second];
                }
                else
                {
                    Language lang = TextMiningUtils.GetLanguage(document.Second);

                    try
                    {
                        TextMiningUtils.GetLanguageTools(lang, out stopWords, out stemmer);
                    }
                    catch (ArgumentNotSupportedException)   // Language tools do not exist, so fall back to English
                    {
                        TextMiningUtils.GetLanguageTools(Language.English, out stopWords, out stemmer);
                        mLogger.Error("Initialize", "Missing language tools for language code {0}.", document.Second);
                    }

                    // Cache the resolved tools (including an English fallback) under the document's language code.
                    stemmers[document.Second]     = stemmer;
                    stopWordDict[document.Second] = stopWords;
                }

                foreach (string token in mTokenizer)
                {
                    string word = token.Trim().ToLower();
                    if (stopWords == null || !stopWords.Contains(word))
                    {
                        string stem = stemmer == null ? word : stemmer.GetStem(word).Trim().ToLower();
                        if (nGrams.Count < mMaxNGramLen)
                        {
                            // Window still filling up.
                            WordStem wordStem = new WordStem();
                            wordStem.Word = word;
                            wordStem.Stem = stem;
                            nGrams.Add(wordStem);
                            if (nGrams.Count < mMaxNGramLen)
                            {
                                continue;
                            }
                        }
                        else
                        {
                            // Window full: reuse the oldest entry for the new token, shift the
                            // rest one position to the left and place the new token at the end.
                            WordStem wordStem = nGrams[0];
                            wordStem.Word = word;
                            wordStem.Stem = stem;
                            for (int i = 0; i < mMaxNGramLen - 1; i++)
                            {
                                nGrams[i] = nGrams[i + 1];
                            }
                            nGrams[mMaxNGramLen - 1] = wordStem;
                        }
                        // NOTE(review): presumably registers the n-grams starting at window
                        // position 0 in mWordInfo - confirm against ProcessNGramsPass1.
                        ProcessNGramsPass1(nGrams, 0, docWords);
                    }
                }
                // Flush the tail of the window. Position 0 of a full window has already been
                // processed inside the loop, so start at 1 in that case.
                int startIdx = nGrams.Count == mMaxNGramLen ? 1 : 0;
                for (int i = startIdx; i < nGrams.Count; i++)
                {
                    ProcessNGramsPass1(nGrams, i, docWords);
                }
            }
            mLogger.ProgressFast(this, "Initialize", "Document {0} ...", docCount, docCount);
            // remove infrequent words and n-grams, precompute IDF
            // Keys are collected first because the dictionary cannot be modified while it is enumerated.
            ArrayList <string> removeList = new ArrayList <string>();

            foreach (KeyValuePair <string, Word> wordInfo in mWordInfo)
            {
                if (wordInfo.Value.mFreq < mMinWordFreq)
                {
                    removeList.Add(wordInfo.Key);
                }
                else
                {
                    // IDF = ln(number of documents / number of documents containing the word)
                    wordInfo.Value.mIdf = Math.Log((double)docCount / (double)wordInfo.Value.mDocFreq);
                }
            }
            foreach (string key in removeList)
            {
                mWordInfo.Remove(key);
            }
            // determine most frequent word and n-gram forms
            foreach (Word wordInfo in mWordInfo.Values)
            {
                int max = 0;
                foreach (KeyValuePair <string, int> wordForm in wordInfo.mForms)
                {
                    if (wordForm.Value > max)
                    {
                        max = wordForm.Value;
                        wordInfo.mMostFrequentForm = wordForm.Key;
                    }
                }
                // The individual forms are only retained when mKeepWordForms is set.
                if (!mKeepWordForms)
                {
                    wordInfo.mForms.Clear();
                }
            }
            // compute bag-of-words vectors
            mLogger.Info("Initialize", "Computing bag-of-words vectors ...");
            int docNum = 1;

            // Second enumeration of documents; the language tools were cached during the first pass.
            foreach (var document in documents)
            {
                Set <string> .ReadOnly stopWords = stopWordDict[document.Second];
                IStemmer stemmer = stemmers[document.Second];

                mLogger.ProgressFast(this, "Initialize", "Document {0} / {1} ...", docNum++, docCount);
                Dictionary <int, int> tfVec  = new Dictionary <int, int>();
                ArrayList <WordStem>  nGrams = new ArrayList <WordStem>(mMaxNGramLen);
                mTokenizer.Text = document.First;
                // Same sliding-window tokenization as in the vocabulary pass above.
                foreach (string token in mTokenizer)
                {
                    string word = token.Trim().ToLower();
                    if (stopWords == null || !stopWords.Contains(word))
                    {
                        string stem = stemmer == null ? word : stemmer.GetStem(word).Trim().ToLower();
                        if (nGrams.Count < mMaxNGramLen)
                        {
                            WordStem wordStem = new WordStem();
                            wordStem.Word = word;
                            wordStem.Stem = stem;
                            nGrams.Add(wordStem);
                            if (nGrams.Count < mMaxNGramLen)
                            {
                                continue;
                            }
                        }
                        else
                        {
                            WordStem wordStem = nGrams[0];
                            wordStem.Word = word;
                            wordStem.Stem = stem;
                            for (int i = 0; i < mMaxNGramLen - 1; i++)
                            {
                                nGrams[i] = nGrams[i + 1];
                            }
                            nGrams[mMaxNGramLen - 1] = wordStem;
                        }
                        // NOTE(review): presumably accumulates term counts in tfVec and fills
                        // mIdxInfo - confirm against ProcessNGramsPass2.
                        ProcessNGramsPass2(nGrams, 0, tfVec);
                    }
                }
                int startIdx = nGrams.Count == mMaxNGramLen ? 1 : 0;
                for (int i = startIdx; i < nGrams.Count; i++)
                {
                    ProcessNGramsPass2(nGrams, i, tfVec);
                }
                // Turn the per-document counts into a weighted sparse vector.
                SparseVector <double> docVec = new SparseVector <double>();
                if (mWordWeightType == WordWeightType.TermFreq)
                {
                    foreach (KeyValuePair <int, int> tfItem in tfVec)
                    {
                        docVec.InnerIdx.Add(tfItem.Key);
                        docVec.InnerDat.Add(tfItem.Value);
                    }
                }
                else if (mWordWeightType == WordWeightType.TfIdf)
                {
                    foreach (KeyValuePair <int, int> tfItem in tfVec)
                    {
                        double tfIdf = (double)tfItem.Value * mIdxInfo[tfItem.Key].mIdf;
                        // Entries with a non-positive weight are left out of the vector.
                        if (tfIdf > 0)
                        {
                            docVec.InnerIdx.Add(tfItem.Key);
                            docVec.InnerDat.Add(tfIdf);
                        }
                    }
                }
                else if (mWordWeightType == WordWeightType.LogDfTfIdf)
                {
                    foreach (KeyValuePair <int, int> tfItem in tfVec)
                    {
                        double tfIdf = (double)tfItem.Value * mIdxInfo[tfItem.Key].mIdf;
                        if (tfIdf > 0)
                        {
                            docVec.InnerIdx.Add(tfItem.Key);
                            // TF-IDF additionally scaled by log(1 + document frequency).
                            docVec.InnerDat.Add(Math.Log(1 + mIdxInfo[tfItem.Key].mDocFreq) * tfIdf);
                        }
                    }
                }
                docVec.Sort();
                CutLowWeights(ref docVec);
                if (mNormalizeVectors)
                {
                    Utils.TryNrmVecL2(docVec);
                }
                if (keepBowVectors)
                {
                    mBowVectors.Add(docVec);
                }
                else
                {
                    bows.Add(docVec);
                }
            }
            // null when keepBowVectors is true (the vectors live in mBowVectors in that case).
            return(bows);
        }