private void LoadModel(BinarySerializer reader)
{
    mLogger.Info("LoadModel", "Loading model ...");
    mBowSpace = new BowSpace(reader);
    mModel = Utils.LoadDictionary<string, IModel<string>>(reader);
    mLogger.Info("LoadModel", "Model successfully loaded.");
}
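// Hedged sketch, not part of the original source: a save counterpart to LoadModel.
// It assumes the BowSpace.Save and Utils.SaveDictionary calls that appear in the
// commented-out serialization code of the DocumentCategorizerComponent snippet below.
private void SaveModel(BinarySerializer writer)
{
    mLogger.Info("SaveModel", "Saving model ...");
    mBowSpace.Save(writer);                                        // serialize the bag-of-words space
    Utils.SaveDictionary<string, IModel<string>>(mModel, writer);  // serialize the per-category models
    mLogger.Info("SaveModel", "Model successfully saved.");
}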
// ReSharper restore InconsistentNaming

static void Main(string[] args)
{
    string[] documents = Corpus.Split(new[] { '.' }, StringSplitOptions.RemoveEmptyEntries);
    var results = new Dictionary<string, SparseVector<double>>();

    foreach (TestCase testCase in Enum.GetValues(typeof(TestCase)).Cast<TestCase>())
    {
        var bow = new BowSpace
        {
            Tokenizer = new SimpleTokenizer
            {
                MinTokenLen = 2,
                Type = TokenizerType.AllChars
            },
            StopWords = null,
            Stemmer = null,
            MaxNGramLen = 2,
            MinWordFreq = 1,
            WordWeightType = WordWeightType.TermFreq,
            NormalizeVectors = true,
            KeepWordForms = false
        };

        switch (testCase)
        {
            case TestCase.testWordWeightType_TERM_FREQ:
                break;
            case TestCase.testWordWeightType_TF_IDF:
                bow.WordWeightType = WordWeightType.TfIdf;
                break;
            case TestCase.testWordWeightType_LOG_DF_TF_IDF:
                bow.WordWeightType = WordWeightType.LogDfTfIdf;
                break;
            case TestCase.testNGramLen:
                bow.MaxNGramLen = 5;
                break;
            case TestCase.testMinWordFreq:
                bow.MinWordFreq = 3;
                break;
            case TestCase.testNormalizeVectors:
                bow.NormalizeVectors = false;
                break;
            case TestCase.testKeepWordForms:
                bow.KeepWordForms = true;
                break;
            case TestCase.testLemmatizer:
                bow.Stemmer = new Lemmatizer(Language.English);
                break;
            case TestCase.testStopWords:
                bow.Stemmer = new Lemmatizer(Language.English);
                bow.StopWords = StopWords.EnglishStopWords;
                break;
        }

        bow.Initialize(documents, testCase == TestCase.testInitializeLargeScale);
        SparseVector<double> vector = bow.ProcessDocument(Document);
        results.Add(testCase.ToString(), vector);
    }

    File.WriteAllText("vec.json", JsonConvert.SerializeObject(results));
    Console.WriteLine(results);
}
public Prediction<LblT> Predict(string example)
{
    Preconditions.CheckState(IsTrained);
    example = FeatureProcessor.Run(example);
    SparseVector<double> vector = BowSpace.ProcessDocument(example);
    return Model.Predict(vector);
}
static CategorizerDemo()
{
    string fileName = HttpContext.Current.Server.MapPath("App_Data\\model.bin");
    BinarySerializer binReader = new BinarySerializer(fileName, FileMode.Open);
    mBowSpace = new BowSpace(binReader);
    mBowSpace.CutLowWeightsPerc = 0.2;
    mCategorizer = Utils.LoadDictionary<string, IModel<string>>(binReader);
    binReader.Close();
    mReady = true;
}
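// Hedged sketch, not part of the original source: categorizing a document with the
// per-category models loaded above. Categorize is a hypothetical helper name; it
// assumes IModel<string>.Predict accepts the BOW vector, as in the Predict snippet above.
static void Categorize(string text)
{
    SparseVector<double> vector = mBowSpace.ProcessDocument(text); // map the text into the BOW space
    foreach (KeyValuePair<string, IModel<string>> category in mCategorizer)
    {
        Prediction<string> prediction = category.Value.Predict(vector);
        Console.WriteLine("{0}: {1} ({2:0.0000})", category.Key, prediction.BestClassLabel, prediction.BestScore);
    }
}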
public Pair<string, double>[] GetTopVectorItems(string vectorName, int n, BowSpace bowSpc)
{
    SparseVector<double> vec = mFeatureVectors[vectorName];
    return vec
        .OrderByDescending(x => x.Dat)
        .Take(n)
        .Select(x => new Pair<string, double>(bowSpc.Words[x.Idx].Stem, x.Dat))
        .ToArray();
}
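// Hedged usage sketch, not part of the original source: print the top 10 terms of a
// stored feature vector. The vector name "profile" and the "entity" instance are
// hypothetical; Pair<T1, T2>.First/.Second mirror the constructor arguments used above.
Pair<string, double>[] topTerms = entity.GetTopVectorItems("profile", 10, bowSpace);
foreach (Pair<string, double> term in topTerms)
{
    Console.WriteLine("{0}\t{1:0.0000}", term.First, term.Second); // stem and weight
}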
static TwitterSentimentDemo()
{
    string modelFileName = HttpContext.Current.Server.MapPath("App_Data\\AdCfy.bin");
    string bowSpcFileName = HttpContext.Current.Server.MapPath("App_Data\\AdCfyBowSpc.bin");
    mClassifier = new SvmBinaryClassifier<int>();
    mClassifier.LoadModel(modelFileName);
    BinarySerializer bs = new BinarySerializer(bowSpcFileName, FileMode.Open);
    mBowSpace = new BowSpace(bs);
    bs.Close();
    mReady = true;
}
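// Hedged sketch, not part of the original source: scoring a tweet with the model loaded
// above. ClassifyTweet is a hypothetical helper; ProcessDocument, Predict, BestClassLabel
// and BestScore are used the same way in the other snippets in this collection.
static Prediction<int> ClassifyTweet(string tweet)
{
    SparseVector<double> vector = mBowSpace.ProcessDocument(tweet); // map the tweet into the BOW space
    return mClassifier.Predict(vector);                             // BestClassLabel / BestScore carry the result
}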
static PumpIndexComponent()
{
    Logger.GetLogger(typeof(PumpIndexComponent)).Info("PumpIndexComponent", "Loading model ...");
    string fileName = Utils.GetConfigValue("PumpIndexModel", ".\\PumpIndexModel.bin");
    using (BinarySerializer reader = new BinarySerializer(fileName, FileMode.Open))
    {
        mBowSpace = new BowSpace(reader);
        mClassifier = new SvmBinaryClassifier<int>(reader);
        mAvgDistPos = reader.ReadDouble();
        mAvgDistNeg = reader.ReadDouble();
        //Console.WriteLine(mAvgDistPos);
        //Console.WriteLine(mAvgDistNeg);
    }
    Logger.GetLogger(typeof(PumpIndexComponent)).Info("PumpIndexComponent", "Done.");
}
static DocumentCategorizerComponent()
{
    Logger.GetLogger(typeof(DocumentCategorizerComponent)).Info("CategorizerComponent", "Loading model ...");
    string fileName = Utils.GetConfigValue("CategorizationModel", ".\\CategorizationModel.bin");
    BinarySerializer binReader = new BinarySerializer(fileName, FileMode.Open);
    mBowSpace = new BowSpace(binReader);
    mBowSpace.CutLowWeightsPerc = 0.2;
    mCategorizer = Utils.LoadDictionary<string, IModel<string>>(binReader);
    binReader.Close();
    //using (BinarySerializer writer = new BinarySerializer(@"C:\Work\AchimPipe\DocumentCategorizerModel.bin", FileMode.Create))
    //{
    //    mBowSpace.Save(writer);
    //    Utils.SaveDictionary<string, IModel<string>>(mCategorizer, writer);
    //}
    Logger.GetLogger(typeof(DocumentCategorizerComponent)).Info("CategorizerComponent", "Done.");
}
public void Train(ILabeledExampleCollection<LblT, string> dataset)
{
    Preconditions.CheckState(!IsTrained);
    Preconditions.CheckNotNull(dataset);
    Preconditions.CheckNotNull(BowSpace);
    Preconditions.CheckNotNull(FeatureProcessor);
    Preconditions.CheckNotNull(Model);

    // preprocess the text
    foreach (LabeledExample<LblT, string> le in dataset)
    {
        le.Example = FeatureProcessor.Run(le.Example);
    }

    // bow vectors
    List<SparseVector<double>> bowData = BowSpace is DeltaBowSpace<LblT>
        ? (BowSpace as DeltaBowSpace<LblT>).Initialize(
            dataset as ILabeledDataset<LblT, string> ?? new LabeledDataset<LblT, string>(dataset))
        : BowSpace.Initialize(dataset.Select(d => d.Example));
    var bowDataset = new LabeledDataset<LblT, SparseVector<double>>();
    for (int i = 0; i < bowData.Count; i++)
    {
        bowDataset.Add(dataset[i].Label, bowData[i]);
    }

    // train
    if (OnTrainModel == null)
    {
        Model.Train(bowDataset);
    }
    else
    {
        OnTrainModel(this, bowDataset);
    }
    IsTrained = true;
}
static void Main(string[] args)
{
    // load documents
    Utils.VerboseLine("Loading documents ...");
    string[] docs = File.ReadAllLines("C:\\newwork\\testclustering\\data\\yahoofinance.txt");
    BowSpace bowSpace = new BowSpace();
    bowSpace.StopWords = StopWords.EnglishStopWords;
    bowSpace.Stemmer = new PorterStemmer();
    bowSpace.WordWeightType = WordWeightType.TfIdf;
    RegexTokenizer tokenizer = new RegexTokenizer();
    tokenizer.IgnoreUnknownTokens = true;
    bowSpace.Tokenizer = tokenizer;
    bowSpace.Initialize(docs);

    // compute layout
    SemanticSpaceLayout semSpc = new SemanticSpaceLayout(bowSpace);
    Vector2D[] coords = semSpc.ComputeLayout();

    // build spatial index
    //Utils.VerboseLine("Building spatial index ...");
    //SpatialIndex2D spatIdx = new SpatialIndex2D();
    //spatIdx.BuildIndex(coords);
    //spatIdx.InsertPoint(9000, new Vector2D(1000, 1000));
    //ArrayList<IdxDat<Vector2D>> points = spatIdx.GetPoints(new Vector2D(0.5, 0.5), 0.1);
    //Utils.VerboseLine("Number of retrieved points: {0}.", points.Count);

    ArrayList<Vector2D> tmp = new ArrayList<Vector2D>(coords);
    tmp.Shuffle();
    //tmp.RemoveRange(1000, tmp.Count - 1000);

    // compute elevation
    StreamWriter writer = new StreamWriter("c:\\elev.txt");
    LayoutSettings ls = new LayoutSettings(800, 600);
    ls.AdjustmentType = LayoutAdjustmentType.Soft;
    ls.StdDevMult = 2;
    ls.FitToBounds = true;
    ls.MarginVert = 50;
    ls.MarginHoriz = 50;
    double[,] zMtx = VisualizationUtils.ComputeLayoutElevation(tmp, ls, 150, 200);
    VisualizationUtils.__DrawElevation__(tmp, ls, 300, 400).Save("c:\\elev.bmp");
    for (int row = 0; row < zMtx.GetLength(0); row++)
    {
        for (int col = 0; col < zMtx.GetLength(1); col++)
        {
            writer.Write("{0}\t", zMtx[row, col]);
        }
        writer.WriteLine();
    }
    writer.Close();

    // output coordinates
    StreamWriter tsvWriter = new StreamWriter("c:\\layout.tsv");
    for (int i = 0; i < coords.Length; i++)
    {
        //if (i < points.Count)
        //{
        //    tsvWriter.WriteLine("{0}\t{1}\t{2}\t{3}", coords[i].X, coords[i].Y, points[i].Dat.X, points[i].Dat.Y);
        //}
        //else
        {
            tsvWriter.WriteLine("{0}\t{1}", coords[i].X, coords[i].Y);
        }
    }
    tsvWriter.Close();

    //// get document names
    //int k = 0;
    //ArrayList<Pair<string, Vector2D>> layout = new ArrayList<Pair<string, Vector2D>>();
    //foreach (string doc in docs)
    //{
    //    string[] docInfo = doc.Split(' ');
    //    layout.Add(new Pair<string, Vector2D>(docInfo[0], coords[k++]));
    //}
    //Console.WriteLine(coords.Length);
    //Console.WriteLine(layout.Count);
    //StreamWriter writer = new StreamWriter("c:\\vidCoords.txt");
    //foreach (Pair<string, Vector2D> docPos in layout)
    //{
    //    writer.WriteLine("{0}\t{1}\t{2}", docPos.First, docPos.Second.X, docPos.Second.Y);
    //}
    //writer.Close();
}
public override void Run(object[] args)
{
    // prepare data
    IStemmer stemmer;
    Set<string>.ReadOnly stopWords;
    TextMiningUtils.GetLanguageTools(Language.English, out stopWords, out stemmer);

    // Create a tokenizer.
    var tokenizer = new UnicodeTokenizer
    {
        MinTokenLen = 2,                     // Each token must be at least 2 characters long.
        Filter = TokenizerFilter.AlphaStrict // Tokens can consist of alphabetic characters only.
    };

    // Take the data for the two classes from the CSV file.
    var data = new List<LabeledTweet>(GetLabeledTweets().Where(lt => lt.Polarity != 2)).ToList();

    // Create a bag-of-words space.
    var bowSpc = new BowSpace
    {
        Tokenizer = tokenizer,                    // Assign the tokenizer.
        StopWords = stopWords,                    // Assign the stop words.
        Stemmer = stemmer,                        // Assign the stemmer.
        MinWordFreq = 1,                          // A term must appear at least once in the corpus to be part of the vocabulary.
        MaxNGramLen = 2,                          // Terms consisting of at most 2 consecutive words will be considered.
        WordWeightType = WordWeightType.TermFreq, // Set the weighting scheme for the bag-of-words vectors to TF.
        //WordWeightType = WordWeightType.TfIdf,  // Set the weighting scheme for the bag-of-words vectors to TF-IDF.
        NormalizeVectors = true,                  // The bag-of-words vectors will be normalized.
        CutLowWeightsPerc = 0                     // Do not remove any low-weight terms from the vectors (0 = no cutoff).
    };
    ArrayList<SparseVector<double>> bowData = bowSpc.Initialize(data.Select(d => d.Text));

    // label data
    var labeledSet = new LabeledDataset<string, SparseVector<double>>();
    for (int i = 0; i < data.Count; i++)
    {
        labeledSet.Add(data[i].Label, bowData[i]);
    }
    labeledSet.Shuffle();

    int testSize = labeledSet.Count / 10;
    var trainingSet = new LabeledDataset<string, SparseVector<double>>(labeledSet.Skip(testSize));
    var testSet = new LabeledDataset<string, SparseVector<double>>(labeledSet.Take(testSize));

    //-------------------- SVM

    var svmBinClass = new SvmBinaryClassifier<string> { VerbosityLevel = SvmLightVerbosityLevel.Off };
    if (args.Any())
    {
        svmBinClass.C = (int)args[0];
    }
    //svmBinClass.BiasedHyperplane = true;
    //svmBinClass.CustomParams = "-t 3"; // non-linear kernel
    //svmBinClass.CustomParams = String.Format("-j {0}", j);

    svmBinClass.Train(trainingSet);

    int correct = 0;
    double avgDist = 0;
    foreach (LabeledExample<string, SparseVector<double>> labeledExample in testSet)
    {
        var prediction = svmBinClass.Predict(labeledExample.Example);
        //Output.WriteLine("actual: {0}\tpredicted: {1}\t score: {2:0.0000}", labeledExample.Label, prediction.BestClassLabel, prediction.BestScore);
        avgDist += prediction.BestScore;
        if (prediction.BestClassLabel == labeledExample.Label)
        {
            correct++;
        }
    }

    Output.WriteLine("Accuracy: {0:0.00}", 100.0 * correct / testSet.Count);
    Output.WriteLine("Avg. distance: {0:0.00}", avgDist / testSet.Count);

    Result.Add("accuracy", (double)correct / testSet.Count);
    Result.Add("classifier", svmBinClass);
    Result.Add("labeled_data", labeledSet);
}
public static LabeledDataset<SentimentLabel, SparseVector<double>> InitBowSpace(BowSpace bowSpace,
    IEnumerable<LabeledExample<SentimentLabel, string>> labeledExamples, IEnumerable<string> initExamples = null)
{
    LabeledExample<SentimentLabel, string>[] examples = labeledExamples as LabeledExample<SentimentLabel, string>[]
        ?? labeledExamples.ToArray();

    List<SparseVector<double>> bowData;
    if (initExamples != null)
    {
        Preconditions.CheckArgument(!(bowSpace is DeltaBowSpace<SentimentLabel>));
        bowSpace.Initialize(initExamples);
        bowData = examples.Select(le => bowSpace.ProcessDocument(le.Example)).ToList();
    }
    else
    {
        bowData = bowSpace is DeltaBowSpace<SentimentLabel>
            ? ((DeltaBowSpace<SentimentLabel>)bowSpace).Initialize(new LabeledDataset<SentimentLabel, string>(examples))
            : bowSpace.Initialize(examples.Select(d => d.Example));
    }

    var bowDataset = new LabeledDataset<SentimentLabel, SparseVector<double>>();
    for (int i = 0; i < bowData.Count; i++)
    {
        bowDataset.Add(examples[i].Label, bowData[i]);
    }
    return bowDataset;
}
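// Hedged usage sketch, not part of the original source: build a labeled BOW dataset
// from sentiment-labeled texts. The labeledExamples collection
// (IEnumerable<LabeledExample<SentimentLabel, string>>) is assumed to exist.
var bowSpace = new BowSpace
{
    MaxNGramLen = 2,
    WordWeightType = WordWeightType.TermFreq,
    NormalizeVectors = true
};
LabeledDataset<SentimentLabel, SparseVector<double>> dataset = InitBowSpace(bowSpace, labeledExamples);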
static void Main(string[] args)
{
    // Get the stop words and stemmer for English.
    IStemmer stemmer;
    Set<string>.ReadOnly stopWords;
    TextMiningUtils.GetLanguageTools(Language.English, out stopWords, out stemmer);

    // Test the stemmer.
    Console.WriteLine(stemmer.GetStem("running"));
    // Output: run

    // Create a tokenizer.
    UnicodeTokenizer tokenizer = new UnicodeTokenizer();
    tokenizer.MinTokenLen = 2;                      // Each token must be at least 2 characters long.
    tokenizer.Filter = TokenizerFilter.AlphaStrict; // Tokens can consist of alphabetic characters only.

    // Test the tokenizer.
    tokenizer.Text = "one 1 two 2 three 3 one_1 two_2 three_3";
    foreach (string token in tokenizer)
    {
        Console.Write("\"{0}\" ", token);
    }
    Console.WriteLine();
    // Output: "one" "two" "three"

    // Load a document corpus from a file. Each line in the file represents one document.
    string[] docs = File.ReadAllLines("..\\..\\Data\\YahooFinance.txt");

    // Create a bag-of-words space.
    BowSpace bowSpc = new BowSpace();
    bowSpc.Tokenizer = tokenizer;                 // Assign the tokenizer.
    bowSpc.StopWords = stopWords;                 // Assign the stop words.
    bowSpc.Stemmer = stemmer;                     // Assign the stemmer.
    bowSpc.MinWordFreq = 3;                       // A term must appear at least 3 times in the corpus for it to be part of the vocabulary.
    bowSpc.MaxNGramLen = 3;                       // Terms consisting of at most 3 consecutive words will be considered.
    bowSpc.WordWeightType = WordWeightType.TfIdf; // Set the weighting scheme for the bag-of-words vectors to TF-IDF.
    bowSpc.NormalizeVectors = true;               // The TF-IDF vectors will be normalized.
    bowSpc.CutLowWeightsPerc = 0.2;               // The terms with the lowest weights, summing up to 20% of the overall weight sum, will be removed from each TF-IDF vector.
    bowSpc.Initialize(docs);                      // Initialize the BOW space.

    // Output the vocabulary (the terms, their stems, frequencies, and document frequencies) to the console.
    StreamWriter stdOut = new StreamWriter(Console.OpenStandardOutput());
    bowSpc.OutputStats(stdOut);
    stdOut.Close();

    // Output the TF-IDF vector representing the description of Google to the console.
    // The description of Google can be found at row 4192 in the corpus.
    SparseVector<double>.ReadOnly googVec = bowSpc.BowVectors[4192 - 1];
    foreach (IdxDat<double> termInfo in googVec)
    {
        Console.WriteLine("{0} : {1}", bowSpc.Words[termInfo.Idx].MostFrequentForm, termInfo.Dat);
    }

    // Extract the top 5 terms with the highest TF-IDF weights from the vector representing Google.
    Console.WriteLine(bowSpc.GetKeywordsStr(googVec, 5));
    // Output: google, relevant, targeted advertising, search, index
}
static void Main(string[] args)
{
    // Get the stop words and stemmer for English.
    IStemmer stemmer;
    Set<string>.ReadOnly stopWords;
    TextMiningUtils.GetLanguageTools(Language.English, out stopWords, out stemmer);

    // Create a tokenizer.
    UnicodeTokenizer tokenizer = new UnicodeTokenizer();
    tokenizer.MinTokenLen = 2;                      // Each token must be at least 2 characters long.
    tokenizer.Filter = TokenizerFilter.AlphaStrict; // Tokens can consist of alphabetic characters only.

    // Load a document corpus from a file. Each line in the file represents one document.
    string[] docs = File.ReadAllLines("..\\..\\Data\\YahooFinance.txt");

    // Create a bag-of-words space.
    BowSpace bowSpc = new BowSpace();
    bowSpc.Tokenizer = tokenizer;                 // Assign the tokenizer.
    bowSpc.StopWords = stopWords;                 // Assign the stop words.
    bowSpc.Stemmer = stemmer;                     // Assign the stemmer.
    bowSpc.MinWordFreq = 3;                       // A term must appear at least 3 times in the corpus for it to be part of the vocabulary.
    bowSpc.MaxNGramLen = 3;                       // Terms consisting of at most 3 consecutive words will be considered.
    bowSpc.WordWeightType = WordWeightType.TfIdf; // Set the weighting scheme for the bag-of-words vectors to TF-IDF.
    bowSpc.NormalizeVectors = true;               // The TF-IDF vectors will be normalized.
    bowSpc.CutLowWeightsPerc = 0.2;               // The terms with the lowest weights, summing up to 20% of the overall weight sum, will be removed from each TF-IDF vector.
    bowSpc.Initialize(docs);                      // Initialize the BOW space.

    // Compute 100 clusters of documents.
    KMeansFast kMeans = new KMeansFast(100);      // Set k to 100.
    kMeans.Trials = 3;                            // Perform 3 repetitions and take the best result.
    kMeans.Eps = 0.001;                           // Stop iterating when the partition quality increases by less than 0.001.
    ClusteringResult cr = kMeans.Cluster(bowSpc); // Execute.

    // Extract the top 5 terms with the highest TF-IDF weights from each of the clusters'
    // centroids and output the number of documents (companies) in each cluster.
    foreach (Cluster cl in cr.Roots)
    {
        SparseVector<double>.ReadOnly centroid = cl.ComputeCentroid(bowSpc, CentroidType.NrmL2);
        Console.Write(bowSpc.GetKeywordsStr(centroid, 5));
        Console.WriteLine(" ({0} companies)", cl.Items.Count);
    }

    // Output the documents that are contained in the first cluster.
    foreach (int docIdx in cr.Roots[0].Items)
    {
        Console.WriteLine(docs[docIdx]);
    }
}