Esempio n. 1
0
 /// <summary>Loads the BOW space and the classification-model dictionary from the given reader.</summary>
 private void LoadModel(BinarySerializer reader)
 {
     mLogger.Info("LoadModel", "Loading model ...");
     // Deserialize in the same order the components were written: BOW space first, then the model dictionary.
     mBowSpace = new BowSpace(reader);
     mModel = Utils.LoadDictionary<string, IModel<string>>(reader);
     mLogger.Info("LoadModel", "Model successfully loaded.");
 }
Esempio n. 2
0
        // ReSharper restore InconsistentNaming

        /// <summary>
        /// Builds one BOW space per test case (each overriding a single setting of the
        /// baseline configuration), processes the test document in each space and dumps
        /// all resulting vectors to vec.json.
        /// </summary>
        static void Main(string[] args)
        {
            // One document per sentence.
            string[] documents = Corpus.Split(new[] { '.' }, StringSplitOptions.RemoveEmptyEntries);
            var results = new Dictionary<string, SparseVector<double>>();
            foreach (TestCase testCase in Enum.GetValues(typeof(TestCase)).Cast<TestCase>())
            {
                // Baseline configuration; the switch below overrides one setting per test case.
                var bow = new BowSpace
                {
                    Tokenizer = new SimpleTokenizer
                    {
                        MinTokenLen = 2,
                        Type = TokenizerType.AllChars
                    },
                    StopWords = null,
                    Stemmer = null,
                    MaxNGramLen = 2,
                    MinWordFreq = 1,
                    WordWeightType = WordWeightType.TermFreq,
                    NormalizeVectors = true,
                    KeepWordForms = false
                };
                switch (testCase)
                {
                    case TestCase.testWordWeightType_TERM_FREQ:
                        break;
                    case TestCase.testWordWeightType_TF_IDF:
                        bow.WordWeightType = WordWeightType.TfIdf;
                        break;
                    case TestCase.testWordWeightType_LOG_DF_TF_IDF:
                        bow.WordWeightType = WordWeightType.LogDfTfIdf;
                        break;
                    case TestCase.testNGramLen:
                        bow.MaxNGramLen = 5;
                        break;
                    case TestCase.testMinWordFreq:
                        bow.MinWordFreq = 3;
                        break;
                    case TestCase.testNormalizeVectors:
                        bow.NormalizeVectors = false;
                        break;
                    case TestCase.testKeepWordForms:
                        bow.KeepWordForms = true;
                        break;
                    case TestCase.testLemmatizer:
                        bow.Stemmer = new Lemmatizer(Language.English);
                        break;
                    case TestCase.testStopWords:
                        bow.Stemmer = new Lemmatizer(Language.English);
                        bow.StopWords = StopWords.EnglishStopWords;
                        break;
                }
                bow.Initialize(documents, testCase == TestCase.testInitializeLargeScale);
                SparseVector<double> vector = bow.ProcessDocument(Document);

                results.Add(testCase.ToString(), vector);
            }

            // Serialize once; write to file and echo the same payload to the console.
            // (Console.WriteLine(results) would only print the dictionary's type name,
            // since Dictionary<,> does not override ToString.)
            string json = JsonConvert.SerializeObject(results);
            File.WriteAllText("vec.json", json);
            Console.WriteLine(json);
        }
Esempio n. 3
0
        /// <summary>
        /// Classifies a single text example. The classifier must already be trained.
        /// </summary>
        public Prediction <LblT> Predict(string example)
        {
            Preconditions.CheckState(IsTrained);

            // Apply the same text preprocessing used during training, then vectorize and classify.
            string processed = FeatureProcessor.Run(example);
            SparseVector <double> vector = BowSpace.ProcessDocument(processed);
            return Model.Predict(vector);
        }
 /// <summary>
 /// Type initializer: loads the BOW space and the per-category model dictionary
 /// from App_Data\model.bin, then marks the component ready.
 /// </summary>
 static CategorizerDemo()
 {
     string fileName = HttpContext.Current.Server.MapPath("App_Data\\model.bin");
     // using disposes the serializer even if deserialization throws
     // (the previous binReader.Close() was skipped on exception and leaked the file handle)
     using (BinarySerializer binReader = new BinarySerializer(fileName, FileMode.Open))
     {
         mBowSpace = new BowSpace(binReader);
         mBowSpace.CutLowWeightsPerc = 0.2;
         mCategorizer = Utils.LoadDictionary<string, IModel<string>>(binReader);
     }
     mReady = true;
 }
Esempio n. 5
0
        /// <summary>
        /// Returns the top <paramref name="n"/> items of the stored feature vector named
        /// <paramref name="vectorName"/>, ordered by descending weight, each paired with
        /// the corresponding word stem from <paramref name="bowSpc"/>.
        /// </summary>
        public Pair <string, double>[] GetTopVectorItems(string vectorName, int n, BowSpace bowSpc)
        {
            SparseVector <double> vec = mFeatureVectors[vectorName];
            var topItems = vec.OrderByDescending(item => item.Dat).Take(n);
            return topItems
                .Select(item => new Pair <string, double>(bowSpc.Words[item.Idx].Stem, item.Dat))
                .ToArray();
        }
 /// <summary>
 /// Type initializer: loads the trained SVM classifier and its BOW space from
 /// App_Data, then marks the component ready.
 /// </summary>
 static TwitterSentimentDemo()
 {
     string modelFileName = HttpContext.Current.Server.MapPath("App_Data\\AdCfy.bin");
     string bowSpcFileName = HttpContext.Current.Server.MapPath("App_Data\\AdCfyBowSpc.bin");
     mClassifier = new SvmBinaryClassifier<int>();
     mClassifier.LoadModel(modelFileName);
     // using disposes the serializer even if deserialization throws
     // (the previous bs.Close() was skipped on exception and leaked the file handle)
     using (BinarySerializer bs = new BinarySerializer(bowSpcFileName, FileMode.Open))
     {
         mBowSpace = new BowSpace(bs);
     }
     mReady = true;
 }
 /// <summary>
 /// Type initializer: deserializes the BOW space, the SVM classifier and two
 /// average-distance values from the configured model file. The read order must
 /// mirror the order in which the components were serialized.
 /// </summary>
 static PumpIndexComponent()
 {
     Logger.GetLogger(typeof(PumpIndexComponent)).Info("PumpIndexComponent", "Loading model ...");
     // Model file location is configurable; defaults to .\PumpIndexModel.bin.
     string fileName = Utils.GetConfigValue("PumpIndexModel", ".\\PumpIndexModel.bin");
     using (BinarySerializer reader = new BinarySerializer(fileName, FileMode.Open))
     {
         mBowSpace = new BowSpace(reader);
         mClassifier = new SvmBinaryClassifier<int>(reader);
         // NOTE(review): presumably the average distances to the positive/negative
         // class hyperplane — confirm against the code that writes this file.
         mAvgDistPos = reader.ReadDouble();
         mAvgDistNeg = reader.ReadDouble();
         //Console.WriteLine(mAvgDistPos);
         //Console.WriteLine(mAvgDistNeg);
     }
     Logger.GetLogger(typeof(PumpIndexComponent)).Info("PumpIndexComponent", "Done.");
 }
 /// <summary>
 /// Type initializer: loads the BOW space and the per-category model dictionary
 /// from the configured model file.
 /// </summary>
 static DocumentCategorizerComponent()
 {
     Logger.GetLogger(typeof(DocumentCategorizerComponent)).Info("CategorizerComponent", "Loading model ...");
     // Model file location is configurable; defaults to .\CategorizationModel.bin.
     string fileName = Utils.GetConfigValue("CategorizationModel", ".\\CategorizationModel.bin");
     // using disposes the serializer even if deserialization throws
     // (the previous binReader.Close() was skipped on exception and leaked the file handle)
     using (BinarySerializer binReader = new BinarySerializer(fileName, FileMode.Open))
     {
         mBowSpace = new BowSpace(binReader);
         mBowSpace.CutLowWeightsPerc = 0.2;
         mCategorizer = Utils.LoadDictionary<string, IModel<string>>(binReader);
     }
     Logger.GetLogger(typeof(DocumentCategorizerComponent)).Info("CategorizerComponent", "Done.");
 }
        /// <summary>
        /// Type initializer: deserializes the BOW space, the SVM classifier and two
        /// average-distance values from the configured model file. The read order must
        /// mirror the order in which the components were serialized.
        /// </summary>
        static PumpIndexComponent()
        {
            Logger.GetLogger(typeof(PumpIndexComponent)).Info("PumpIndexComponent", "Loading model ...");
            // Model file location is configurable; defaults to .\PumpIndexModel.bin.
            string fileName = Utils.GetConfigValue("PumpIndexModel", ".\\PumpIndexModel.bin");

            using (BinarySerializer reader = new BinarySerializer(fileName, FileMode.Open))
            {
                mBowSpace   = new BowSpace(reader);
                mClassifier = new SvmBinaryClassifier <int>(reader);
                // NOTE(review): presumably the average distances to the positive/negative
                // class hyperplane — confirm against the code that writes this file.
                mAvgDistPos = reader.ReadDouble();
                mAvgDistNeg = reader.ReadDouble();
                //Console.WriteLine(mAvgDistPos);
                //Console.WriteLine(mAvgDistNeg);
            }
            Logger.GetLogger(typeof(PumpIndexComponent)).Info("PumpIndexComponent", "Done.");
        }
Esempio n. 10
0
        /// <summary>
        /// Type initializer: loads the BOW space and the per-category model dictionary
        /// from the configured model file.
        /// </summary>
        static DocumentCategorizerComponent()
        {
            Logger.GetLogger(typeof(DocumentCategorizerComponent)).Info("CategorizerComponent", "Loading model ...");
            // Model file location is configurable; defaults to .\CategorizationModel.bin.
            string fileName = Utils.GetConfigValue("CategorizationModel", ".\\CategorizationModel.bin");

            // using disposes the serializer even if deserialization throws
            // (the previous binReader.Close() was skipped on exception and leaked the file handle)
            using (BinarySerializer binReader = new BinarySerializer(fileName, FileMode.Open))
            {
                mBowSpace = new BowSpace(binReader);
                mBowSpace.CutLowWeightsPerc = 0.2;
                mCategorizer = Utils.LoadDictionary <string, IModel <string> >(binReader);
            }
            Logger.GetLogger(typeof(DocumentCategorizerComponent)).Info("CategorizerComponent", "Done.");
        }
Esempio n. 11
0
        /// <summary>
        /// Trains the model on the given labeled text dataset: preprocesses each
        /// example, builds bag-of-words vectors and delegates training to the model
        /// (or to the OnTrainModel callback when one is set).
        /// </summary>
        public void Train(ILabeledExampleCollection <LblT, string> dataset)
        {
            Preconditions.CheckState(!IsTrained);
            Preconditions.CheckNotNull(dataset);
            Preconditions.CheckNotNull(BowSpace);
            Preconditions.CheckNotNull(FeatureProcessor);
            Preconditions.CheckNotNull(Model);

            // preprocess the text
            foreach (LabeledExample <LblT, string> le in dataset)
            {
                le.Example = FeatureProcessor.Run(le.Example);
            }

            // bow vectors: a DeltaBowSpace needs the labeled dataset, a plain BowSpace only the text.
            // Direct cast after the "is" check (replaces the redundant "is" + "as" double cast;
            // matches the style used by InitBowSpace elsewhere in this codebase).
            List <SparseVector <double> > bowData = BowSpace is DeltaBowSpace <LblT>
                                                    ? ((DeltaBowSpace <LblT>)BowSpace).Initialize(dataset as ILabeledDataset <LblT, string> ?? new LabeledDataset <LblT, string>(dataset))
                                                    : BowSpace.Initialize(dataset.Select(d => d.Example));
            var bowDataset = new LabeledDataset <LblT, SparseVector <double> >();

            // Pair each vector with the label of the corresponding input example.
            for (int i = 0; i < bowData.Count; i++)
            {
                bowDataset.Add(dataset[i].Label, bowData[i]);
            }

            // train
            if (OnTrainModel == null)
            {
                Model.Train(bowDataset);
            }
            else
            {
                OnTrainModel(this, bowDataset);
            }

            IsTrained = true;
        }
Esempio n. 12
0
        /// <summary>
        /// Loads a document corpus, computes a 2D semantic layout of it, writes the
        /// layout's elevation matrix to c:\elev.txt (plus a bitmap preview) and the
        /// per-document coordinates to c:\layout.tsv.
        /// </summary>
        static void Main(string[] args)
        {
            // load documents
            Utils.VerboseLine("Loading documents ...");
            string[] docs = File.ReadAllLines("C:\\newwork\\testclustering\\data\\yahoofinance.txt");
            BowSpace bowSpace = new BowSpace();
            bowSpace.StopWords = StopWords.EnglishStopWords;
            bowSpace.Stemmer = new PorterStemmer();
            bowSpace.WordWeightType = WordWeightType.TfIdf;
            RegexTokenizer tokenizer = new RegexTokenizer();
            tokenizer.IgnoreUnknownTokens = true;
            bowSpace.Tokenizer = tokenizer;
            bowSpace.Initialize(docs);

            // compute layout
            SemanticSpaceLayout semSpc = new SemanticSpaceLayout(bowSpace);
            Vector2D[] coords = semSpc.ComputeLayout();

            ArrayList <Vector2D> tmp = new ArrayList <Vector2D>(coords);
            tmp.Shuffle();

            // compute elevation
            LayoutSettings ls = new LayoutSettings(800, 600);
            ls.AdjustmentType = LayoutAdjustmentType.Soft;
            ls.StdDevMult = 2;
            ls.FitToBounds = true;
            ls.MarginVert = 50;
            ls.MarginHoriz = 50;
            double[,] zMtx = VisualizationUtils.ComputeLayoutElevation(tmp, ls, 150, 200);
            VisualizationUtils.__DrawElevation__(tmp, ls, 300, 400).Save("c:\\elev.bmp");

            // write the elevation matrix; using guarantees the writer is flushed and
            // closed even when an exception is thrown (Close() alone leaked on error)
            using (StreamWriter writer = new StreamWriter("c:\\elev.txt"))
            {
                for (int row = 0; row < zMtx.GetLength(0); row++)
                {
                    for (int col = 0; col < zMtx.GetLength(1); col++)
                    {
                        writer.Write("{0}\t", zMtx[row, col]);
                    }
                    writer.WriteLine();
                }
            }

            // output coordinates
            using (StreamWriter tsvWriter = new StreamWriter("c:\\layout.tsv"))
            {
                for (int i = 0; i < coords.Length; i++)
                {
                    tsvWriter.WriteLine("{0}\t{1}", coords[i].X, coords[i].Y);
                }
            }
        }
 /// <summary>
 /// Restores the model state from <paramref name="reader"/>: first the BOW space,
 /// then the dictionary of per-category models (the order they were saved in).
 /// </summary>
 private void LoadModel(BinarySerializer reader)
 {
     mLogger.Info("LoadModel", "Loading model ...");
     mBowSpace = new BowSpace(reader);
     mModel    = Utils.LoadDictionary <string, IModel <string> >(reader);
     mLogger.Info("LoadModel", "Model successfully loaded.");
 }
Esempio n. 14
0
        /// <summary>
        /// Trains a binary SVM sentiment classifier on labeled tweets (polarity 2
        /// excluded), evaluates it on a 10% hold-out split, prints accuracy and average
        /// score, and stores the accuracy, classifier and labeled data in Result.
        /// </summary>
        /// <param name="args">Optional; when non-empty, args[0] (int) overrides the SVM C parameter.</param>
        public override void Run(object[] args)
        {
            // prepare data
            IStemmer stemmer;

            Set <string> .ReadOnly stopWords;
            TextMiningUtils.GetLanguageTools(Language.English, out stopWords, out stemmer);

            // Create a tokenizer.
            var tokenizer = new UnicodeTokenizer
            {
                MinTokenLen = 2,                            // Each token must be at least 2 characters long.
                Filter      = TokenizerFilter.AlphaStrict   // Tokens can consist of alphabetic characters only.
            };

            // Take data for two classes from the CSV file; polarity 2 — presumably the
            // neutral class — is skipped. TODO confirm against GetLabeledTweets.
            var data = new List <LabeledTweet>(GetLabeledTweets().Where(lt => lt.Polarity != 2)).ToList();

            // Create a bag-of-words space.
            var bowSpc = new BowSpace
            {
                Tokenizer      = tokenizer,                 // Assign the tokenizer.
                StopWords      = stopWords,                 // Assign the stop words.
                Stemmer        = stemmer,                   // Assign the stemmer.
                MinWordFreq    = 1,                         // A term must appear at least n-times in the corpus for it to be part of the vocabulary.
                MaxNGramLen    = 2,                         // Terms consisting of at most n-consecutive words will be considered.
                WordWeightType = WordWeightType.TermFreq,   // Set the weighting scheme for the bag-of-words vectors to TF.
                //WordWeightType = WordWeightType.TfIdf,  // Set the weighting scheme for the bag-of-words vectors to TF-IDF.
                NormalizeVectors  = true,                   // The vectors will be normalized.
                CutLowWeightsPerc = 0                       // 0 disables low-weight cutting; e.g. 0.2 would drop the lowest-weighted terms summing up to 20% of each vector's weight sum.
            };
            ArrayList <SparseVector <double> > bowData = bowSpc.Initialize(data.Select(d => d.Text));

            // label data
            var labeledSet = new LabeledDataset <string, SparseVector <double> >();

            for (int i = 0; i < data.Count; i++)
            {
                labeledSet.Add(data[i].Label, bowData[i]);
            }
            labeledSet.Shuffle();

            // Hold out 10% of the shuffled examples for testing.
            int testSize    = labeledSet.Count / 10;
            var trainingSet = new LabeledDataset <string, SparseVector <double> >(labeledSet.Skip(testSize));
            var testSet     = new LabeledDataset <string, SparseVector <double> >(labeledSet.Take(testSize));

            //-------------------- SVM

            var svmBinClass = new SvmBinaryClassifier <string> {
                VerbosityLevel = SvmLightVerbosityLevel.Off
            };

            if (args.Any())
            {
                svmBinClass.C = (int)args[0];   // optional override of the SVM cost parameter
            }
            //svmBinClass.BiasedHyperplane = true;
            //svmBinClass.CustomParams = "-t 3";   // non-linear kernel
            //svmBinClass.CustomParams = String.Format("-j {0}",j);

            svmBinClass.Train(trainingSet);

            // Evaluate on the hold-out set: count correct predictions and accumulate scores.
            int    correct = 0;
            double avgDist = 0;

            foreach (LabeledExample <string, SparseVector <double> > labeledExample in testSet)
            {
                var prediction = svmBinClass.Predict(labeledExample.Example);
                //Output.WriteLine("actual: {0}\tpredicted: {1}\t score: {2:0.0000}", labeledExample.Label, prediction.BestClassLabel, prediction.BestScore);
                avgDist += prediction.BestScore;
                if (prediction.BestClassLabel == labeledExample.Label)
                {
                    correct++;
                }
            }

            Output.WriteLine("Accuracy: {0:0.00}", 100.0 * correct / testSet.Count);
            Output.WriteLine("Avg. distance: {0:0.00}", avgDist / testSet.Count);

            Result.Add("accuracy", (double)correct / testSet.Count);

            Result.Add("classifier", svmBinClass);
            Result.Add("labeled_data", labeledSet);
        }
Esempio n. 15
0
        /// <summary>
        /// Loads a document corpus, computes a 2D semantic layout of it, writes the
        /// layout's elevation matrix to c:\elev.txt (plus a bitmap preview) and the
        /// per-document coordinates to c:\layout.tsv.
        /// </summary>
        static void Main(string[] args)
        {
            // load documents
            Utils.VerboseLine("Loading documents ...");
            string[] docs = File.ReadAllLines("C:\\newwork\\testclustering\\data\\yahoofinance.txt");
            BowSpace bowSpace = new BowSpace();
            bowSpace.StopWords = StopWords.EnglishStopWords;
            bowSpace.Stemmer = new PorterStemmer();
            bowSpace.WordWeightType = WordWeightType.TfIdf;
            RegexTokenizer tokenizer = new RegexTokenizer();
            tokenizer.IgnoreUnknownTokens = true;
            bowSpace.Tokenizer = tokenizer;
            bowSpace.Initialize(docs);

            // compute layout
            SemanticSpaceLayout semSpc = new SemanticSpaceLayout(bowSpace);
            Vector2D[] coords = semSpc.ComputeLayout();

            ArrayList<Vector2D> tmp = new ArrayList<Vector2D>(coords);
            tmp.Shuffle();

            // compute elevation
            LayoutSettings ls = new LayoutSettings(800, 600);
            ls.AdjustmentType = LayoutAdjustmentType.Soft;
            ls.StdDevMult = 2;
            ls.FitToBounds = true;
            ls.MarginVert = 50;
            ls.MarginHoriz = 50;
            double[,] zMtx = VisualizationUtils.ComputeLayoutElevation(tmp, ls, 150, 200);
            VisualizationUtils.__DrawElevation__(tmp, ls, 300, 400).Save("c:\\elev.bmp");

            // write the elevation matrix; using guarantees the writer is flushed and
            // closed even when an exception is thrown (Close() alone leaked on error)
            using (StreamWriter writer = new StreamWriter("c:\\elev.txt"))
            {
                for (int row = 0; row < zMtx.GetLength(0); row++)
                {
                    for (int col = 0; col < zMtx.GetLength(1); col++)
                    {
                        writer.Write("{0}\t", zMtx[row, col]);
                    }
                    writer.WriteLine();
                }
            }

            // output coordinates
            using (StreamWriter tsvWriter = new StreamWriter("c:\\layout.tsv"))
            {
                for (int i = 0; i < coords.Length; i++)
                {
                    tsvWriter.WriteLine("{0}\t{1}", coords[i].X, coords[i].Y);
                }
            }
        }
Esempio n. 16
0
        /// <summary>
        /// Vectorizes the labeled examples in the given BOW space and pairs each vector
        /// with its label. When <paramref name="initExamples"/> is provided, the space is
        /// initialized from those texts instead of the examples themselves (not supported
        /// for a DeltaBowSpace).
        /// </summary>
        public static LabeledDataset <SentimentLabel, SparseVector <double> > InitBowSpace(BowSpace bowSpace,
                                                                                           IEnumerable <LabeledExample <SentimentLabel, string> > labeledExamples, IEnumerable <string> initExamples = null)
        {
            LabeledExample <SentimentLabel, string>[] examples = labeledExamples as LabeledExample <SentimentLabel, string>[] ?? labeledExamples.ToArray();

            List <SparseVector <double> > bowData;

            if (initExamples != null)
            {
                // A delta BOW space cannot be initialized from separate unlabeled texts.
                Preconditions.CheckArgument(!(bowSpace is DeltaBowSpace <SentimentLabel>));
                bowSpace.Initialize(initExamples);
                bowData = examples.Select(le => bowSpace.ProcessDocument(le.Example)).ToList();
            }
            else if (bowSpace is DeltaBowSpace <SentimentLabel>)
            {
                // A delta BOW space is initialized with the labels included.
                bowData = ((DeltaBowSpace <SentimentLabel>)bowSpace).Initialize(new LabeledDataset <SentimentLabel, string>(examples));
            }
            else
            {
                bowData = bowSpace.Initialize(examples.Select(d => d.Example));
            }

            var bowDataset = new LabeledDataset <SentimentLabel, SparseVector <double> >();

            for (int i = 0; i < bowData.Count; i++)
            {
                bowDataset.Add(examples[i].Label, bowData[i]);
            }
            return bowDataset;
        }
Esempio n. 17
0
        /// <summary>
        /// BOW-space tutorial: builds TF-IDF vectors for a document corpus, prints the
        /// vocabulary statistics and inspects the vector of a single document.
        /// </summary>
        static void Main(string[] args)
        {
            // Get the stop words and stemmer for English.

            IStemmer stemmer;

            Set <string> .ReadOnly stopWords;
            TextMiningUtils.GetLanguageTools(Language.English,
                                             out stopWords, out stemmer);

            // Test the stemmer.

            Console.WriteLine(stemmer.GetStem("running"));
            // Output: run

            // Create a tokenizer.

            UnicodeTokenizer tokenizer = new UnicodeTokenizer();

            tokenizer.MinTokenLen = 2;                      // Each token must be at least 2 characters long.
            tokenizer.Filter = TokenizerFilter.AlphaStrict; // Tokens can consist of alphabetic characters only.

            // Test the tokenizer.

            tokenizer.Text = "one 1 two 2 three 3 one_1 two_2 three_3";
            foreach (string token in tokenizer)
            {
                Console.Write("\"{0}\" ", token);
            }
            Console.WriteLine();
            // Output: "one" "two" "three"

            // Load a document corpus from a file. Each line in the file
            // represents one document.

            string[] docs
                = File.ReadAllLines("..\\..\\Data\\YahooFinance.txt");

            // Create a bag-of-words space.

            BowSpace bowSpc = new BowSpace();

            bowSpc.Tokenizer   = tokenizer; // Assign the tokenizer.
            bowSpc.StopWords   = stopWords; // Assign the stop words.
            bowSpc.Stemmer     = stemmer;   // Assign the stemmer.
            bowSpc.MinWordFreq = 3;         // A term must appear at least 3 times in the corpus to enter the vocabulary.
            bowSpc.MaxNGramLen = 3;                       // Terms consisting of at most 3 consecutive words will be considered.
            bowSpc.WordWeightType = WordWeightType.TfIdf; // Set the weighting scheme for the bag-of-words vectors to TF-IDF.
            bowSpc.NormalizeVectors = true; // The TF-IDF vectors will be normalized.
            bowSpc.CutLowWeightsPerc = 0.2; // The terms with the lowest weights, summing up to 20% of the overall weight sum, will be removed from each TF-IDF vector.

            bowSpc.Initialize(docs); // Initialize the BOW space.

            // Output the vocabulary (the terms, their stems,
            // frequencies, and document frequencies) to the console.

            StreamWriter stdOut
                = new StreamWriter(Console.OpenStandardOutput());

            bowSpc.OutputStats(stdOut);
            stdOut.Close();

            // Output the TF-IDF vector representing the description of
            // Google to the console.

            SparseVector <double> .ReadOnly googVec
                = bowSpc.BowVectors[4192 - 1]; // The description of Google can be found at row 4192 in the corpus.
            foreach (IdxDat <double> termInfo in googVec)
            {
                Console.WriteLine("{0} : {1}",
                                  bowSpc.Words[termInfo.Idx].MostFrequentForm,
                                  termInfo.Dat);
            }

            // Extract the top 5 terms with the highest TF-IDF weights
            // from the vector representing Google.

            Console.WriteLine(bowSpc.GetKeywordsStr(googVec, 5));
            // Output: google, relevant, targeted advertising, search,
            // index
        }
Esempio n. 18
0
        /// <summary>
        /// Clustering tutorial: builds TF-IDF vectors for a document corpus, runs
        /// k-means (k = 100) and prints the keywords and size of each resulting cluster.
        /// </summary>
        static void Main(string[] args)
        {
            // Get the stop words and stemmer for English.

            IStemmer stemmer;

            Set <string> .ReadOnly stopWords;
            TextMiningUtils.GetLanguageTools(Language.English,
                                             out stopWords, out stemmer);

            // Create a tokenizer.

            UnicodeTokenizer tokenizer = new UnicodeTokenizer();

            tokenizer.MinTokenLen = 2;                      // Each token must be at least 2 characters long.
            tokenizer.Filter = TokenizerFilter.AlphaStrict; // Tokens can consist of alphabetic characters only.

            // Load a document corpus from a file. Each line in the file
            // represents one document.

            string[] docs
                = File.ReadAllLines("..\\..\\Data\\YahooFinance.txt");

            // Create a bag-of-words space.

            BowSpace bowSpc = new BowSpace();

            bowSpc.Tokenizer   = tokenizer; // Assign the tokenizer.
            bowSpc.StopWords   = stopWords; // Assign the stop words.
            bowSpc.Stemmer     = stemmer;   // Assign the stemmer.
            bowSpc.MinWordFreq = 3;         // A term must appear at least 3 times in the corpus to enter the vocabulary.
            bowSpc.MaxNGramLen = 3;                       // Terms consisting of at most 3 consecutive words will be considered.
            bowSpc.WordWeightType = WordWeightType.TfIdf; // Set the weighting scheme for the bag-of-words vectors to TF-IDF.
            bowSpc.NormalizeVectors = true; // The TF-IDF vectors will be normalized.
            bowSpc.CutLowWeightsPerc = 0.2; // The terms with the lowest weights, summing up to 20% of the overall weight sum, will be removed from each TF-IDF vector.

            bowSpc.Initialize(docs); // Initialize the BOW space.

            // Compute 100 clusters of documents.

            KMeansFast kMeans = new KMeansFast(100); // Set k to 100.

            kMeans.Trials = 3;                       // Perform 3 repetitions. Take the best result.
            kMeans.Eps = 0.001;                      // Stop iterating when the partition quality increases for less than 0.001.

            ClusteringResult cr = kMeans.Cluster(bowSpc); // Execute.

            // Extract the top 5 terms with the highest TF-IDF weights
            // from each of the clusters' centroids and output the
            // number of documents (companies) in each cluster.

            foreach (Cluster cl in cr.Roots)
            {
                SparseVector <double> .ReadOnly centroid
                    = cl.ComputeCentroid(bowSpc, CentroidType.NrmL2);
                Console.Write(bowSpc.GetKeywordsStr(centroid, 5));
                Console.WriteLine(" ({0} companies)", cl.Items.Count);
            }

            // Output the documents that are contained in the first
            // cluster.

            foreach (int docIdx in cr.Roots[0].Items)
            {
                Console.WriteLine(docs[docIdx]);
            }
        }