コード例 #1
0
        public static T CreateBowSpace <T>(bool rmvStopWords, int maxNGramLen, WordWeightType wordWeightType, Language language, int minWordFreq) where T : BowSpace, new()
        {
            // Fetch the language-specific stop-word list and stemmer. This is
            // best-effort: an unsupported language simply leaves both at null.
            Set <string> .ReadOnly languageStopWords = null;
            IStemmer languageStemmer = null;
            try
            {
                TextMiningUtils.GetLanguageTools(language, out languageStopWords, out languageStemmer);
            }
            catch
            {
                // No tools registered for this language; proceed without them.
            }

            // Stemming is deliberately disabled for Portuguese.
            if (language == Language.Portuguese)
            {
                languageStemmer = null;
            }

            // Cyrillic-script languages need a token pattern that also accepts
            // Cyrillic character ranges; all others use the Latin-only pattern.
            bool cyrillicScript = new[] { Language.Russian, Language.Bulgarian, Language.Serbian }.Contains(language);
            string tokenPattern = cyrillicScript
                ? @"[#@$]?([\d_]*[\p{IsBasicLatin}\p{IsLatin-1Supplement}\p{IsLatinExtended-A}\p{IsLatinExtended-B}\p{IsLatinExtendedAdditional}\p{IsCyrillic}\p{IsCyrillicSupplement}-[^\p{L}]][\d_]*){2,}"
                : @"[#@$]?([\d_]*[\p{IsBasicLatin}\p{IsLatin-1Supplement}\p{IsLatinExtended-A}\p{IsLatinExtended-B}\p{IsLatinExtendedAdditional}-[^\p{L}]][\d_]*){2,}";

            var bowSpace = new T();
            bowSpace.Tokenizer = new RegexTokenizer
            {
                TokenRegex          = tokenPattern,
                IgnoreUnknownTokens = true
            };
            bowSpace.CutLowWeightsPerc = 0;
            bowSpace.MaxNGramLen       = maxNGramLen;
            bowSpace.MinWordFreq       = minWordFreq;
            bowSpace.WordWeightType    = wordWeightType;
            bowSpace.NormalizeVectors  = true;
            bowSpace.Stemmer           = languageStemmer;

            // Stop-word removal is only applied when both requested and available.
            if (rmvStopWords && languageStopWords != null)
            {
                var stopWords = new Set <string>(languageStopWords);
                stopWords.Add("rt"); // Twitter retweet marker.
                if (language == Language.English)
                {
                    // Contraction fragments (apostrophes stripped) that the standard
                    // English stop-word list does not cover.
                    stopWords.AddRange("im,youre,hes,shes,its,were,theyre,ive,youve,weve,theyve,youd,hed,theyd,youll,theyll,isnt,arent,wasnt,werent,hasnt,havent,hadnt,doesnt,dont,didnt,wont,wouldnt,shant,shouldnt,cant,couldnt,mustnt,lets,thats,whos,whats,heres,theres,whens,wheres,whys,hows,i,m,you,re,he,s,she,it,we,they,ve,d,ll,isn,t,aren,wasn,weren,hasn,haven,hadn,doesn,don,didn,won,wouldn,shan,shouldn,can,couldn,mustn,let,that,who,what,here,there,when,where,why,how".Split(','));
                }
                bowSpace.StopWords = stopWords;
            }
            return bowSpace;
        }
コード例 #2
0
ファイル: YahooSearchEngine.cs プロジェクト: 0000duck/latino
        /// <summary>
        /// Queries the Yahoo web-search API, paging through results until
        /// mResultSetMaxSz items are collected (or the API runs out), filling
        /// mResultSet and mTotalHits. Served from mCache when possible.
        /// Throws ArgumentValueException when ResultSetMaxSize exceeds 1000
        /// (the API's hard limit); SendRequest may throw WebException or
        /// QuotaExceededException.
        /// </summary>
        public override void Search()
        {
            Utils.ThrowException(mResultSetMaxSz > 1000 ? new ArgumentValueException("ResultSetMaxSize") : null);
            string langStr = TextMiningUtils.GetLanguageCode(mLanguage);

            mResultSet.Inner.Clear();
            // Only hit the network when there is no cache or the cache misses.
            if (mCache == null || !mCache.GetFromCache("YahooSearchEngine", mLanguage, mQuery, mResultSetMaxSz, ref mTotalHits, ref mResultSet))
            {
                // The API serves at most 100 results per request.
                int resultsPerPage = mResultSetMaxSz > 100 ? 100 : mResultSetMaxSz;
                // i counts results collected so far; it is advanced inside the
                // inner regex loop, one per parsed result item.
                for (int i = 0; i < mResultSetMaxSz;)
                {
                    // Page start is 1-based (i + 1); language filter only added when specified.
                    string request = string.Format("http://search.yahooapis.com/WebSearchService/V1/webSearch?appid={0}&query={1}&results={2}&start={3}{4}",
                                                   HttpUtility.UrlEncode(mAppId), HttpUtility.UrlEncode(mQuery), resultsPerPage, i + 1,
                                                   mLanguage == Language.Unspecified ? "" : string.Format("&language={0}", langStr));
                    int    firstResult, resultsReturned;
                    // NOTE(review): SendRequest appears to also set mTotalHits as a
                    // side effect (it is read right below) — confirm in its definition.
                    string response = SendRequest(request, out firstResult, out resultsReturned); // throws WebException, QuotaExceededException
                    if (mRetry && mTotalHits == 0)                                                // *** Yahoo sometimes returns 0 results even if this is not the case (do a retry)
                    {
                        Thread.Sleep(2000);                                                       // delay for 2 seconds
                        response = SendRequest(request, out firstResult, out resultsReturned);    // throws WebException, QuotaExceededException
                    }
                    // If the API did not start where we asked, there are no more
                    // results; everything collected so far is the true total.
                    if (firstResult != i + 1)
                    {
                        mTotalHits = i;
                        break;
                    }
                    // Parse each result item (title/summary/url) out of the XML response.
                    Match regexMatch = mResultItemRegex.Match(response);
                    while (regexMatch.Success)
                    {
                        string title   = HttpUtility.HtmlDecode(regexMatch.Result("${title}"));
                        string snippet = HttpUtility.HtmlDecode(regexMatch.Result("${summary}"));
                        string url     = HttpUtility.HtmlDecode(regexMatch.Result("${url}"));
                        mResultSet.Inner.Add(new SearchEngineResultItem(title, snippet, url, mResultSet.Count + 1));
                        regexMatch = regexMatch.NextMatch();
                        if (++i == mResultSetMaxSz)
                        {
                            break;
                        }
                    }
                    // A short page means the result stream is exhausted.
                    if (resultsReturned < resultsPerPage)
                    {
                        mTotalHits = firstResult + resultsReturned - 1;
                        break;
                    }
                }
                mTotalHits = Math.Max(mTotalHits, (long)mResultSet.Count); // just to make sure ...
                if (mCache != null)
                {
                    mCache.PutIntoCache("YahooSearchEngine", mLanguage, mQuery, mTotalHits, mResultSet);
                }
            }
        }
コード例 #3
0
        /// <summary>
        /// Scrapes Google's "define:" results page for definitions of mQuery,
        /// filling mResultSet and mTotalHits. Served from mCache when possible.
        /// WebUtils.GetWebPage may throw WebException.
        /// </summary>
        public override void Search()
        {
            string languageCode = TextMiningUtils.GetLanguageCode(mLanguage);

            mResultSet.Inner.Clear();
            // Serve straight from the cache when we can.
            if (mCache != null && mCache.GetFromCache("GoogleDefine", mLanguage, mQuery, mResultSetMaxSz, ref mTotalHits, ref mResultSet))
            {
                return;
            }
            string html = WebUtils.GetWebPage(string.Format("http://www.google.com/search?defl={0}&q=define%3A{1}", languageCode, HttpUtility.UrlEncode(mQuery))); // throws WebException
            var itemRegex = new Regex("<li>(?<def>[^<]*)(<br><a href=\"(?<href>[^\"]*))?", RegexOptions.Singleline);
            int rank = 0;
            // Each <li> holds one definition, optionally followed by a source link.
            for (Match item = itemRegex.Match(html); item.Success; item = item.NextMatch())
            {
                string definition = HttpUtility.HtmlDecode(item.Result("${def}").Trim());
                string sourceUrl  = null;
                Match  urlMatch   = new Regex("&q=(?<url>[^&]*)").Match(item.Result("${href}"));
                if (urlMatch.Success)
                {
                    sourceUrl = HttpUtility.UrlDecode(urlMatch.Result("${url}"));
                }
                mResultSet.Inner.Add(new SearchEngineResultItem(mQuery, definition, sourceUrl, ++rank));
            }
            // Definitions without their own source URL inherit the URL of the
            // nearest following result that has one (propagated backwards).
            string carriedUrl = null;
            for (int j = mResultSet.Count - 1; j >= 0; j--)
            {
                if (mResultSet[j].Url == null)
                {
                    mResultSet[j].SetUrl(carriedUrl);
                }
                else
                {
                    carriedUrl = mResultSet[j].Url;
                }
            }
            mTotalHits = mResultSet.Count;
            if (mCache != null)
            {
                // The full (untruncated) result set goes into the cache.
                mCache.PutIntoCache("GoogleDefine", mLanguage, mQuery, mTotalHits, mResultSet);
            }
            // Truncate only what is handed back to the caller.
            if (mResultSetMaxSz < mResultSet.Count)
            {
                mResultSet.Inner.RemoveRange(mResultSetMaxSz, mResultSet.Count - mResultSetMaxSz);
            }
        }
コード例 #4
0
        /// <summary>
        /// Detects the language of a plain-text document and records it (together
        /// with the detected character range) as document features. Non-"Text"
        /// documents are skipped; any failure is logged and swallowed so the
        /// processing pipeline keeps running.
        /// </summary>
        public /*protected*/ override void ProcessDocument(Document document)
        {
            // Only plain-text documents are analyzed.
            if (document.Features.GetFeatureValue("contentType") != "Text")
            {
                return;
            }
            try
            {
                // Concatenate the selected annotated blocks into one text buffer.
                var textBuilder = new StringBuilder();
                foreach (TextBlock block in document.GetAnnotatedBlocks(mBlockSelector))
                {
                    textBuilder.AppendLine(block.Text);
                }
                string text = textBuilder.ToString();
                // Run language detection only when there is enough text for a
                // reliable answer (threshold configured via mMinTextLen).
                if (text.Length >= mMinTextLen)
                {
                    LanguageProfile profile = mLanguageDetector.DetectLanguage(text);
                    if (profile != null)
                    {
                        document.Features.SetFeatureValue("detectedLanguage", profile.Language.ToString());
                    }
                }
                // The character range is cheap and recorded for any non-empty text.
                if (text.Length > 0)
                {
                    document.Features.SetFeatureValue("detectedCharRange", TextMiningUtils.GetCharRange(text));
                }
            }
            catch (Exception exception)
            {
                mLogger.Error("ProcessDocument", exception);
            }
        }
コード例 #5
0
ファイル: BinarySvm.cs プロジェクト: petergabrovsek/LATINO
        /// <summary>
        /// Trains a binary SVM sentiment classifier on labeled tweets and evaluates
        /// it on a held-out 10% split. Stores "accuracy", the trained "classifier",
        /// and the shuffled "labeled_data" via Result.Add.
        /// </summary>
        /// <param name="args">Optional; args[0], when present, is a boxed int used as the SVM cost parameter C.</param>
        public override void Run(object[] args)
        {
            // Prepare the English language tools (stop words and stemmer).
            IStemmer stemmer;

            Set <string> .ReadOnly stopWords;
            TextMiningUtils.GetLanguageTools(Language.English, out stopWords, out stemmer);

            // Create a tokenizer.
            var tokenizer = new UnicodeTokenizer
            {
                MinTokenLen = 2,                            // Each token must be at least 2 characters long.
                Filter      = TokenizerFilter.AlphaStrict   // Tokens can consist of alphabetic characters only.
            };

            // Take data for two classes from the csv file
            // (polarity 2 is excluded — presumably the neutral class; verify against GetLabeledTweets).
            // Single materialization; the original copied the sequence into a List
            // and then called ToList() on it again, allocating twice.
            var data = GetLabeledTweets().Where(lt => lt.Polarity != 2).ToList();

            // Create a bag-of-words space.
            var bowSpc = new BowSpace
            {
                Tokenizer      = tokenizer,                 // Assign the tokenizer.
                StopWords      = stopWords,                 // Assign the stop words.
                Stemmer        = stemmer,                   // Assign the stemmer.
                MinWordFreq    = 1,                         // A term must appear at least n-times in the corpus for it to be part of the vocabulary.
                MaxNGramLen    = 2,                         // Terms consisting of at most n-consecutive words will be considered.
                WordWeightType = WordWeightType.TermFreq,   // Set the weighting scheme for the bag-of-words vectors to TF.
                //WordWeightType = WordWeightType.TfIdf,  // Set the weighting scheme for the bag-of-words vectors to TF-IDF.
                NormalizeVectors  = true,                   // The vectors will be normalized.
                CutLowWeightsPerc = 0                       // 0 = no low-weight terms are cut from the vectors.
            };
            ArrayList <SparseVector <double> > bowData = bowSpc.Initialize(data.Select(d => d.Text));

            // Pair each bag-of-words vector with its label.
            var labeledSet = new LabeledDataset <string, SparseVector <double> >();

            for (int i = 0; i < data.Count; i++)
            {
                labeledSet.Add(data[i].Label, bowData[i]);
            }
            labeledSet.Shuffle();

            // Hold out 10% of the (shuffled) data for testing.
            int testSize    = labeledSet.Count / 10;
            var trainingSet = new LabeledDataset <string, SparseVector <double> >(labeledSet.Skip(testSize));
            var testSet     = new LabeledDataset <string, SparseVector <double> >(labeledSet.Take(testSize));

            //-------------------- SVM

            var svmBinClass = new SvmBinaryClassifier <string> {
                VerbosityLevel = SvmLightVerbosityLevel.Off
            };

            if (args.Any())
            {
                svmBinClass.C = (int)args[0];               // Cost parameter C, supplied by the caller.
            }
            //svmBinClass.BiasedHyperplane = true;
            //svmBinClass.CustomParams = "-t 3";   // non-linear kernel
            //svmBinClass.CustomParams = String.Format("-j {0}",j);

            svmBinClass.Train(trainingSet);

            // Evaluate on the held-out set: count correct predictions and
            // accumulate the decision scores.
            int    correct = 0;
            double avgDist = 0;

            foreach (LabeledExample <string, SparseVector <double> > labeledExample in testSet)
            {
                var prediction = svmBinClass.Predict(labeledExample.Example);
                //Output.WriteLine("actual: {0}\tpredicted: {1}\t score: {2:0.0000}", labeledExample.Label, prediction.BestClassLabel, prediction.BestScore);
                avgDist += prediction.BestScore;
                if (prediction.BestClassLabel == labeledExample.Label)
                {
                    correct++;
                }
            }

            Output.WriteLine("Accuracy: {0:0.00}", 100.0 * correct / testSet.Count);
            Output.WriteLine("Avg. distance: {0:0.00}", avgDist / testSet.Count);

            Result.Add("accuracy", (double)correct / testSet.Count);

            Result.Add("classifier", svmBinClass);
            Result.Add("labeled_data", labeledSet);
        }
コード例 #6
0
        /// <summary>
        /// Tutorial: builds a TF-IDF bag-of-words space over the Yahoo Finance
        /// corpus, clusters the documents with k-means (k = 100), then prints each
        /// cluster's top centroid terms, its size, and the members of the first cluster.
        /// </summary>
        static void Main(string[] args)
        {
            // English language tools: stop-word list and stemmer.
            IStemmer stemmer;
            Set <string> .ReadOnly stopWords;
            TextMiningUtils.GetLanguageTools(Language.English, out stopWords, out stemmer);

            var tokenizer = new UnicodeTokenizer
            {
                MinTokenLen = 2,                        // tokens must be at least 2 characters long
                Filter      = TokenizerFilter.AlphaStrict // alphabetic characters only
            };

            // Load the corpus: one document per line.
            string[] docs = File.ReadAllLines("..\\..\\Data\\YahooFinance.txt");

            // Configure the bag-of-words space.
            var bowSpc = new BowSpace
            {
                Tokenizer         = tokenizer,
                StopWords         = stopWords,
                Stemmer           = stemmer,
                MinWordFreq       = 3,    // a term needs >= 3 corpus occurrences to enter the vocabulary
                MaxNGramLen       = 3,    // consider terms of up to 3 consecutive words
                WordWeightType    = WordWeightType.TfIdf,
                NormalizeVectors  = true, // produce unit-length TF-IDF vectors
                CutLowWeightsPerc = 0.2   // drop the lowest weights summing to 20% of each vector's weight mass
            };

            bowSpc.Initialize(docs);

            // Cluster the documents into 100 groups, keeping the best of 3 runs.
            var kMeans = new KMeansFast(100)
            {
                Trials = 3,
                Eps    = 0.001 // stop when partition quality improves by less than this
            };

            ClusteringResult cr = kMeans.Cluster(bowSpc);

            // For every cluster: the 5 highest-weighted centroid terms and the
            // number of documents (companies) it contains.
            foreach (Cluster cl in cr.Roots)
            {
                SparseVector <double> .ReadOnly centroid
                    = cl.ComputeCentroid(bowSpc, CentroidType.NrmL2);
                Console.Write(bowSpc.GetKeywordsStr(centroid, 5));
                Console.WriteLine(" ({0} companies)", cl.Items.Count);
            }

            // Dump the documents that ended up in the first cluster.
            foreach (int docIdx in cr.Roots[0].Items)
            {
                Console.WriteLine(docs[docIdx]);
            }
        }
コード例 #7
0
ファイル: Tutorial4_1.cs プロジェクト: 0000duck/latino
        /// <summary>
        /// Tutorial: demonstrates the stemmer and tokenizer, builds a TF-IDF
        /// bag-of-words space over the Yahoo Finance corpus, dumps the vocabulary
        /// statistics, and inspects the vector for one document (Google).
        /// </summary>
        static void Main(string[] args)
        {
            // English language tools: stop-word list and stemmer.
            IStemmer stemmer;
            Set <string> .ReadOnly stopWords;
            TextMiningUtils.GetLanguageTools(Language.English, out stopWords, out stemmer);

            // Stemmer sanity check; prints "run".
            Console.WriteLine(stemmer.GetStem("running"));

            var tokenizer = new UnicodeTokenizer
            {
                MinTokenLen = 2,                          // tokens must be at least 2 characters long
                Filter      = TokenizerFilter.AlphaStrict // alphabetic characters only
            };

            // Tokenizer sanity check; prints: "one" "two" "three"
            tokenizer.Text = "one 1 two 2 three 3 one_1 two_2 three_3";
            foreach (string token in tokenizer)
            {
                Console.Write("\"{0}\" ", token);
            }
            Console.WriteLine();

            // Load the corpus: one document per line.
            string[] docs = File.ReadAllLines("..\\..\\Data\\YahooFinance.txt");

            // Configure the bag-of-words space.
            var bowSpc = new BowSpace
            {
                Tokenizer         = tokenizer,
                StopWords         = stopWords,
                Stemmer           = stemmer,
                MinWordFreq       = 3,    // a term needs >= 3 corpus occurrences to enter the vocabulary
                MaxNGramLen       = 3,    // consider terms of up to 3 consecutive words
                WordWeightType    = WordWeightType.TfIdf,
                NormalizeVectors  = true, // produce unit-length TF-IDF vectors
                CutLowWeightsPerc = 0.2   // drop the lowest weights summing to 20% of each vector's weight mass
            };

            bowSpc.Initialize(docs);

            // Dump the vocabulary (terms, stems, frequencies, document
            // frequencies) to the console.
            StreamWriter stdOut = new StreamWriter(Console.OpenStandardOutput());
            bowSpc.OutputStats(stdOut);
            stdOut.Close();

            // Print the TF-IDF vector of the Google description, found at
            // row 4192 of the corpus (0-based index 4191).
            SparseVector <double> .ReadOnly googVec = bowSpc.BowVectors[4192 - 1];
            foreach (IdxDat <double> termInfo in googVec)
            {
                Console.WriteLine("{0} : {1}",
                                  bowSpc.Words[termInfo.Idx].MostFrequentForm,
                                  termInfo.Dat);
            }

            // Top 5 TF-IDF terms for Google; expected output:
            // google, relevant, targeted advertising, search, index
            Console.WriteLine(bowSpc.GetKeywordsStr(googVec, 5));
        }