/// <summary>
/// Creates a bag-of-words space of type <typeparamref name="T"/> and configures its
/// tokenizer, weighting, stemmer and (optionally) stop words for the given language.
/// <c>Initialize</c> is not called here; the caller receives a configured but empty space.
/// </summary>
/// <typeparam name="T">Concrete <c>BowSpace</c> type to instantiate.</typeparam>
/// <param name="rmvStopWords">When true, and stop words could be loaded for the language, they are assigned to <c>StopWords</c>.</param>
/// <param name="maxNGramLen">Value assigned to <c>MaxNGramLen</c>.</param>
/// <param name="wordWeightType">Value assigned to <c>WordWeightType</c>.</param>
/// <param name="language">Language used to look up stop words and stemmer and to pick the token regex.</param>
/// <param name="minWordFreq">Value assigned to <c>MinWordFreq</c>.</param>
/// <returns>The configured space.</returns>
public static T CreateBowSpace<T>(bool rmvStopWords, int maxNGramLen, WordWeightType wordWeightType, Language language, int minWordFreq) where T : BowSpace, new()
{
    Set<string>.ReadOnly langStopWords = null;
    IStemmer stemmer = null;
    try
    {
        TextMiningUtils.GetLanguageTools(language, out langStopWords, out stemmer);
    }
    catch
    {
        // Best-effort lookup: any failure is deliberately ignored. The two values then keep
        // whatever they hold (null unless GetLanguageTools assigned them before throwing).
    }

    // The stemmer is never used for Portuguese, even when one was returned.
    IStemmer effectiveStemmer = language == Language.Portuguese ? null : stemmer;

    // A token is an optional '#', '@' or '$' prefix followed by at least two letters, each of
    // which may be surrounded by digits/underscores. These three languages additionally
    // accept letters from the Cyrillic blocks.
    Language[] cyrillicLanguages = { Language.Russian, Language.Bulgarian, Language.Serbian };
    string tokenRegex;
    if (cyrillicLanguages.Contains(language))
    {
        tokenRegex = @"[#@$]?([\d_]*[\p{IsBasicLatin}\p{IsLatin-1Supplement}\p{IsLatinExtended-A}\p{IsLatinExtended-B}\p{IsLatinExtendedAdditional}\p{IsCyrillic}\p{IsCyrillicSupplement}-[^\p{L}]][\d_]*){2,}";
    }
    else
    {
        tokenRegex = @"[#@$]?([\d_]*[\p{IsBasicLatin}\p{IsLatin-1Supplement}\p{IsLatinExtended-A}\p{IsLatinExtended-B}\p{IsLatinExtendedAdditional}-[^\p{L}]][\d_]*){2,}";
    }

    T space = new T();

    RegexTokenizer regexTokenizer = new RegexTokenizer();
    regexTokenizer.TokenRegex = tokenRegex;
    regexTokenizer.IgnoreUnknownTokens = true;

    space.Tokenizer = regexTokenizer;
    space.CutLowWeightsPerc = 0;
    space.MaxNGramLen = maxNGramLen;
    space.MinWordFreq = minWordFreq;
    space.WordWeightType = wordWeightType;
    space.NormalizeVectors = true;
    space.Stemmer = effectiveStemmer;

    // Without language stop words there is nothing more to configure.
    if (langStopWords == null)
    {
        return space;
    }

    // Start from the language's stop words and always add "rt".
    Set<string> extendedStopWords = new Set<string>(langStopWords);
    extendedStopWords.Add("rt");

    // Additional English stop words: contractions written without the apostrophe, plus the
    // fragments left when a contraction is split at the apostrophe.
    if (language == Language.English)
    {
        string[] englishExtras = "im,youre,hes,shes,its,were,theyre,ive,youve,weve,theyve,youd,hed,theyd,youll,theyll,isnt,arent,wasnt,werent,hasnt,havent,hadnt,doesnt,dont,didnt,wont,wouldnt,shant,shouldnt,cant,couldnt,mustnt,lets,thats,whos,whats,heres,theres,whens,wheres,whys,hows,i,m,you,re,he,s,she,it,we,they,ve,d,ll,isn,t,aren,wasn,weren,hasn,haven,hadn,doesn,don,didn,won,wouldn,shan,shouldn,can,couldn,mustn,let,that,who,what,here,there,when,where,why,how".Split(',');
        extendedStopWords.AddRange(englishExtras);
    }

    if (rmvStopWords)
    {
        space.StopWords = extendedStopWords;
    }

    return space;
}
/// <summary>
/// Trains a binary SVM classifier on bag-of-words vectors of labeled tweets and evaluates it
/// on a held-out tenth of the shuffled data. Accuracy and the average best score are written
/// to <c>Output</c>; "accuracy", "classifier" and "labeled_data" are stored in <c>Result</c>.
/// </summary>
/// <param name="args">
/// Optional arguments. When non-empty, <c>args[0]</c> is unboxed with an <c>(int)</c> cast and
/// used as the SVM <c>C</c> parameter, so it must be a boxed <c>int</c>; any other boxed type
/// throws <c>InvalidCastException</c>. A null or empty array leaves <c>C</c> untouched.
/// </param>
public override void Run(object[] args)
{
    // Stop words and stemmer for English.
    IStemmer stemmer;
    Set<string>.ReadOnly stopWords;
    TextMiningUtils.GetLanguageTools(Language.English, out stopWords, out stemmer);

    // Create a tokenizer.
    var tokenizer = new UnicodeTokenizer
    {
        MinTokenLen = 2,                      // Each token must be at least 2 characters long.
        Filter = TokenizerFilter.AlphaStrict  // Tokens can consist of alphabetic characters only.
    };

    // Keep only the tweets whose polarity is not 2 (presumably leaving the two classes used
    // for binary classification). The sequence is materialized once.
    var data = new List<LabeledTweet>(GetLabeledTweets().Where(lt => lt.Polarity != 2));

    // Create a bag-of-words space.
    var bowSpc = new BowSpace
    {
        Tokenizer = tokenizer,                     // Assign the tokenizer.
        StopWords = stopWords,                     // Assign the stop words.
        Stemmer = stemmer,                         // Assign the stemmer.
        MinWordFreq = 1,                           // A term must appear at least once in the corpus to be part of the vocabulary.
        MaxNGramLen = 2,                           // Terms consisting of at most 2 consecutive words will be considered.
        WordWeightType = WordWeightType.TermFreq,  // Use plain term frequency (TF) weights.
        NormalizeVectors = true,                   // The resulting vectors will be normalized.
        CutLowWeightsPerc = 0                      // Cut-off of 0, i.e. presumably no low-weight terms are removed.
    };
    ArrayList<SparseVector<double>> bowData = bowSpc.Initialize(data.Select(d => d.Text));

    // Pair each tweet's label with its vector; bowData[i] is used for data[i].
    var labeledSet = new LabeledDataset<string, SparseVector<double>>();
    for (int i = 0; i < data.Count; i++)
    {
        labeledSet.Add(data[i].Label, bowData[i]);
    }
    labeledSet.Shuffle();

    // The first tenth (integer division) is the test set, the remainder the training set.
    int testSize = labeledSet.Count / 10;
    var trainingSet = new LabeledDataset<string, SparseVector<double>>(labeledSet.Skip(testSize));
    var testSet = new LabeledDataset<string, SparseVector<double>>(labeledSet.Take(testSize));

    // Train the SVM.
    var svmBinClass = new SvmBinaryClassifier<string> { VerbosityLevel = SvmLightVerbosityLevel.Off };
    if (args != null && args.Length > 0)
    {
        svmBinClass.C = (int)args[0];
    }
    svmBinClass.Train(trainingSet);

    // Evaluate on the test set.
    int correct = 0;
    double scoreSum = 0;
    foreach (LabeledExample<string, SparseVector<double>> labeledExample in testSet)
    {
        var prediction = svmBinClass.Predict(labeledExample.Example);
        scoreSum += prediction.BestScore;
        if (prediction.BestClassLabel == labeledExample.Label)
        {
            correct++;
        }
    }

    // With fewer than 10 examples the test set is empty and the ratios below evaluate to NaN.
    Output.WriteLine("Accuracy: {0:0.00}", 100.0 * correct / testSet.Count);
    Output.WriteLine("Avg. distance: {0:0.00}", scoreSum / testSet.Count);

    Result.Add("accuracy", (double)correct / testSet.Count);
    Result.Add("classifier", svmBinClass);
    Result.Add("labeled_data", labeledSet);
}
/// <summary>
/// Sample: builds a TF-IDF bag-of-words space from the YahooFinance corpus (one document per
/// line), groups the documents into 100 clusters with k-means, prints the top 5 keywords and
/// the size of every cluster, and finally prints the documents of the first cluster.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    // English stop words and stemmer.
    IStemmer stemmer;
    Set<string>.ReadOnly stopWords;
    TextMiningUtils.GetLanguageTools(Language.English, out stopWords, out stemmer);

    // Tokenizer: alphabetic-only tokens of at least 2 characters.
    var alphaTokenizer = new UnicodeTokenizer
    {
        MinTokenLen = 2,
        Filter = TokenizerFilter.AlphaStrict
    };

    // Each line of the file is one document.
    string[] docs = File.ReadAllLines("..\\..\\Data\\YahooFinance.txt");

    // Bag-of-words space.
    var bowSpc = new BowSpace
    {
        Tokenizer = alphaTokenizer,
        StopWords = stopWords,
        Stemmer = stemmer,
        MinWordFreq = 3,                        // A term must occur at least 3 times in the corpus.
        MaxNGramLen = 3,                        // Terms of up to 3 consecutive words are considered.
        WordWeightType = WordWeightType.TfIdf,  // TF-IDF weighting.
        NormalizeVectors = true,                // The TF-IDF vectors are normalized.
        CutLowWeightsPerc = 0.2                 // Lowest-weight terms summing up to 20% of the weight sum are dropped from each vector.
    };
    bowSpc.Initialize(docs);

    // k-means with k = 100 and 3 trials (the best result is kept); iteration stops once the
    // partition quality improves by less than 0.001.
    var clusterer = new KMeansFast(100)
    {
        Trials = 3,
        Eps = 0.001
    };
    ClusteringResult clustering = clusterer.Cluster(bowSpc);

    // For every cluster: the 5 highest-weighted centroid terms and the number of documents
    // (companies) it contains.
    foreach (Cluster cluster in clustering.Roots)
    {
        SparseVector<double>.ReadOnly centroid = cluster.ComputeCentroid(bowSpc, CentroidType.NrmL2);
        Console.Write(bowSpc.GetKeywordsStr(centroid, 5));
        Console.WriteLine(" ({0} companies)", cluster.Items.Count);
    }

    // The documents that ended up in the first cluster.
    var firstCluster = clustering.Roots[0];
    foreach (int idx in firstCluster.Items)
    {
        Console.WriteLine(docs[idx]);
    }
}
/// <summary>
/// Sample: demonstrates the English stemmer and the tokenizer, builds a TF-IDF bag-of-words
/// space from the YahooFinance corpus (one document per line), writes the vocabulary
/// statistics to standard output, and prints the vector and top 5 keywords of the document
/// at row 4192 of the corpus.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    // Get the stop words and stemmer for English.
    IStemmer stemmer;
    Set<string>.ReadOnly stopWords;
    TextMiningUtils.GetLanguageTools(Language.English, out stopWords, out stemmer);

    // Test the stemmer.
    Console.WriteLine(stemmer.GetStem("running")); // Expected output: run

    // Create a tokenizer.
    UnicodeTokenizer tokenizer = new UnicodeTokenizer();
    tokenizer.MinTokenLen = 2;                       // Each token must be at least 2 characters long.
    tokenizer.Filter = TokenizerFilter.AlphaStrict;  // Tokens can consist of alphabetic characters only.

    // Test the tokenizer.
    tokenizer.Text = "one 1 two 2 three 3 one_1 two_2 three_3";
    foreach (string token in tokenizer)
    {
        Console.Write("\"{0}\" ", token);
    }
    Console.WriteLine(); // Expected output: "one" "two" "three"

    // Load a document corpus from a file. Each line in the file represents one document.
    string[] docs = File.ReadAllLines("..\\..\\Data\\YahooFinance.txt");

    // Create a bag-of-words space.
    BowSpace bowSpc = new BowSpace();
    bowSpc.Tokenizer = tokenizer;                  // Assign the tokenizer.
    bowSpc.StopWords = stopWords;                  // Assign the stop words.
    bowSpc.Stemmer = stemmer;                      // Assign the stemmer.
    bowSpc.MinWordFreq = 3;                        // A term must appear at least 3 times in the corpus to be part of the vocabulary.
    bowSpc.MaxNGramLen = 3;                        // Terms consisting of at most 3 consecutive words will be considered.
    bowSpc.WordWeightType = WordWeightType.TfIdf;  // Use TF-IDF weights for the bag-of-words vectors.
    bowSpc.NormalizeVectors = true;                // The TF-IDF vectors will be normalized.
    bowSpc.CutLowWeightsPerc = 0.2;                // The lowest-weight terms, summing up to 20% of the overall weight sum, are removed from each vector.
    bowSpc.Initialize(docs);                       // Initialize the BOW space.

    // Output the vocabulary (the terms, their stems, frequencies, and document frequencies)
    // to the console. The using statement flushes and disposes the writer even if
    // OutputStats throws.
    using (StreamWriter stdOut = new StreamWriter(Console.OpenStandardOutput()))
    {
        bowSpc.OutputStats(stdOut);
    }

    // Output the TF-IDF vector representing the description of Google to the console.
    // The description can be found at row 4192 of the corpus; BowVectors is zero-based.
    const int googleRow = 4192;
    SparseVector<double>.ReadOnly googVec = bowSpc.BowVectors[googleRow - 1];
    foreach (IdxDat<double> termInfo in googVec)
    {
        Console.WriteLine("{0} : {1}", bowSpc.Words[termInfo.Idx].MostFrequentForm, termInfo.Dat);
    }

    // Extract the top 5 terms with the highest TF-IDF weights from the vector representing Google.
    Console.WriteLine(bowSpc.GetKeywordsStr(googVec, 5));
    // Expected output: google, relevant, targeted advertising, search, index
}