/// <summary>
/// Trains the bag-of-words encoder and the naive Bayes classifier on a set of labelled reviews.
/// </summary>
/// <param name="trainingDataset">Labelled reviews. The sequence is enumerated exactly once.</param>
private void Train(IEnumerable <RestaurantReview> trainingDataset)
        {
            // Materialize the sequence once: enumerating a lazy IEnumerable twice could yield
            // review texts and labels that come from different passes and no longer line up.
            RestaurantReview[] reviews = trainingDataset.ToArray();

            // Our independent variable is the review text
            string[] inputs = reviews.Select(x => x.Review).ToArray();

            // Our dependent variable is whether or not the review is positive
            int[] outputs = reviews.Select(x => Convert.ToInt32(x.IsPositive)).ToArray();

            // Clean review text
            inputs = inputs.Select(this.CleanReview).ToArray();

            // Split every review into its words (one string[] per review)
            string[][] wordsPerReview = inputs.Tokenize();

            // Learn the vocabulary from the tokenized reviews, then encode each review
            // as a numeric vector over that vocabulary
            this._bagOfWordsModel = new BagOfWords();
            this._bagOfWordsModel.Learn(wordsPerReview);
            double[][] bagOfWordsResult = this._bagOfWordsModel.Transform(wordsPerReview);

            // Use the naive Bayes algorithm for our text classification.
            NaiveBayesLearning <NormalDistribution> naiveBayesTeacher = new NaiveBayesLearning <NormalDistribution>();

            naiveBayesTeacher.Options.InnerOption = new NormalOptions()
            {
                Regularization = 1e-5 // to avoid zero-variance exceptions
            };
            this._naiveBayesModel = naiveBayesTeacher.Learn(bagOfWordsResult, outputs);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Returns up to <paramref name="count"/> words from <paramref name="body"/> whose coded value
        /// exceeds <paramref name="minOccurance"/>, ordered by that value, highest first.
        /// </summary>
        /// <param name="body">The words to analyze.</param>
        /// <param name="count">Maximum number of keywords to return.</param>
        /// <param name="minOccurance">Exclusive lower bound on the coded value of a returned keyword.</param>
        /// <returns>The selected keywords, highest coded value first.</returns>
        public static List <string> GetKeywords(string[] body, int count, int minOccurance = 0)
        {
            var bow = new BagOfWords()
            {
                MaximumOccurance = 500
            };

            bow.Learn(body);
            // NOTE(review): presumably Transform writes one occurrence count per codebook entry
            // into codedBody - confirm against the BagOfWords implementation.
            int[] codedBody = new int[body.Length];
            bow.Transform(body, codedBody);

            var codebook = bow.CodeToString;

            // Filter before sorting so only the surviving entries are ordered; the former
            // intermediate dictionary (index -> value) added nothing and has been dropped.
            return codedBody.Select((value, index) => new { value, index })
                            .Where(pair => pair.value > minOccurance)
                            .OrderByDescending(pair => pair.value)
                            .Take(count)
                            .Select(pair => codebook[pair.index])
                            .ToList();
        }
Ejemplo n.º 3
0
        //LogisticRegression reg;


        /// <summary>
        /// Reads the host file, learns a codebook over its first item and trains the
        /// neural-network classifier when both items of the read result are non-empty.
        /// </summary>
        /// <param name="fileName">File passed to <c>Utilities.ReadHostFile</c>.</param>
        /// <param name="countLayers">Forwarded to the <c>SimpleClassifierNN</c> constructor.</param>
        /// <param name="countEpoch">Forwarded to the <c>SimpleClassifierNN</c> constructor.</param>
        public LogClassifier(string fileName, int countLayers, int countEpoch)
        {
            Codebook = new BagOfWords() { MaximumOccurance = 1 };

            int sampleCount = 0;
            var hostData    = Utilities.ReadHostFile(fileName, ref sampleCount);

            Samples = sampleCount;

            // Without both parts of the data there is nothing to train on.
            if (hostData.Item1.Length == 0 || hostData.Item2.Length == 0)
            {
                return;
            }

            Codebook.Learn(hostData.Item1);
            double[][] inputs  = Codebook.Transform(hostData.Item1);
            double[][] outputs = Utilities.BoolToDouble(hostData.Item2);

            classifier = new SimpleClassifierNN(inputs, outputs, inputs.Length, countLayers, countEpoch);

            var outcome = classifier.Train(inputs, outputs);
            Error        = outcome.Item1;
            TrainingTime = outcome.Item2;
        }
Ejemplo n.º 4
0
Archivo: Logic.cs Proyecto: Viride/EZI
        /// <summary>
        /// Builds a weighted keyword vector for every document and stores it, together with
        /// the vector's Euclidean length, in a new <see cref="BagOfWords"/>.
        /// </summary>
        /// <remarks>
        /// Each weight is (occurrence count / maximum occurrence count in the document) * idf.
        /// Documents that contain none of the keywords are skipped; this now includes the case
        /// of an empty keyword list, which previously threw from <c>Max()</c>.
        /// </remarks>
        /// <param name="documents">Documents whose title and contents are scanned.</param>
        /// <param name="keywords">Keywords with their idf values.</param>
        /// <returns>The populated bag.</returns>
        public BagOfWords GenerateBagOfWords(List <StemmedDocument> documents, List <Keyword> keywords)
        {
            var bag = new BagOfWords();

            bag.BagOfWord = new Dictionary <int, Dictionary <string, double> >();
            bag.Vectors   = new Dictionary <int, double>();

            foreach (var document in documents)
            {
                // Materialize the terms once per document rather than re-concatenating
                // title and contents for every keyword.
                var terms = document.Title.Concat(document.Contents).ToList();

                var    documentBag = new Dictionary <string, double>();
                double max         = 0;
                foreach (var keyword in keywords)
                {
                    double count = terms.Count(x => x == keyword.key);
                    documentBag.Add(keyword.key, count);
                    if (count > max)
                    {
                        max = count;
                    }
                }

                // No keyword occurs in this document: nothing to record.
                if (max == 0)
                {
                    continue;
                }

                // Walk the keywords directly: this avoids ElementAt(index) on the dictionary and a
                // linear keyword search per entry, while keeping the same accumulation order.
                double sumOfSquares = 0;
                foreach (var keyword in keywords)
                {
                    double weight = (documentBag[keyword.key] / max) * keyword.Idf;
                    documentBag[keyword.key] = weight;
                    sumOfSquares = sumOfSquares + Math.Pow(weight, 2);
                }

                bag.BagOfWord.Add(document.Id, documentBag);
                bag.Vectors.Add(document.Id, Math.Sqrt(sumOfSquares));
            }
            return(bag);
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Program entry point: loads the bag-of-words data through <c>_fileReader</c>
        /// and then prints the menu.
        /// </summary>
        /// <param name="args">Command-line arguments; not read by this method.</param>
        public static void Main(string[] args)
        {
            // Open and read bag of words data file
            _bagOfWords = _fileReader.Read();

            PrintMenu();
        }
Ejemplo n.º 6
0
        /// <summary>
        /// A bag built from an empty array reports a term frequency of zero.
        /// </summary>
        public void CreatingAnInstance_WithProvidingAnEmptyArray_ShouldInferCorrectDistribution9()
        {
            var emptyBag = new BagOfWords(new string[0]);

            Assert.AreEqual(0, emptyBag.GetTermFrequency("markov"));
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Encodes integer sequences as bag-of-words histograms, trains a multi-class SVM with
        /// the <c>ChiSquare</c> kernel type on them and computes the training-set accuracy.
        /// The accuracy is computed but not returned.
        /// </summary>
        /// <param name="inputs">Integer sequences to classify.</param>
        /// <param name="outputs">Class label for each sequence.</param>
        private static void bagOfWords(int[][] inputs, int[] outputs)
        {
            // Learn a codebook from the sequences and turn every sequence into a histogram
            var codebook = new BagOfWords <int>();

            double[][] histograms = codebook.Learn(inputs).Transform(inputs);

            // Multi-class learner whose inner binary learner is SMO over the ChiSquare kernel
            var teacher = new MulticlassSupportVectorLearning <ChiSquare, double[]>();

            teacher.Learner = (p) => new SequentialMinimalOptimization <ChiSquare, double[]>()
            {
                Complexity = 10000.0 // large complexity value, i.e. a hard-margin-like SVM
            };

            // Train on the histograms, then classify the same histograms
            var   svm       = teacher.Learn(histograms, outputs);
            int[] predicted = svm.Decide(histograms);

            // Compare predictions with the expected labels
            var cm = new ConfusionMatrix(predicted: predicted, expected: outputs);

            double accuracy = cm.Accuracy;
        }
Ejemplo n.º 8
0
        /// <summary>
        /// "aa" is one of four terms, so its term frequency is expected to be 0.25.
        /// </summary>
        public void CreatingAnInstance_WithProvidingEnumerableOfStrings_ShouldInferCorrectDistribution7()
        {
            string[] terms = { "a", "a", "a", "aa" };
            var      bag   = new BagOfWords(terms);

            Assert.AreEqual(0.25, bag.GetTermFrequency("aa"));
        }
Ejemplo n.º 9
0
        /// <summary>
        /// "markov" is one of five distinct terms, so its term frequency is expected to be 0.2.
        /// </summary>
        public void CreatingAnInstance_WithProvidingEnumerableOfStrings_ShouldInferCorrectDistribution8()
        {
            string[] terms = { "hidden", "markov", "models", "hmm", "afa" };
            var      bag   = new BagOfWords(terms);

            Assert.AreEqual(0.2, bag.GetTermFrequency("markov"));
        }
Ejemplo n.º 10
0
        /// <summary>
        /// Returns the value of <c>Clustering.GetCosineDistance</c> for the two bags.
        /// </summary>
        /// <param name="bag1">First bag.</param>
        /// <param name="bag2">Second bag.</param>
        /// <param name="useLabels">Forwarded unchanged to <c>GetCosineDistance</c>.</param>
        /// <returns>The value computed by <c>GetCosineDistance</c>.</returns>
        public double CalculateSimilarity(BagOfWords bag1, BagOfWords bag2, bool useLabels)
        {
            return new Clustering().GetCosineDistance(bag1, bag2, useLabels);
        }
Ejemplo n.º 11
0
        /// <summary>
        /// "speech" is one of three distinct terms; its frequency rounded to two decimals is 0.33.
        /// </summary>
        public void CreatingAnInstance_WithProvidingEnumerableOfStrings_ShouldInferCorrectDistribution3()
        {
            var bag = new BagOfWords(new[] { "speech", "recognition", "system" });

            var rounded = Math.Round(bag.GetTermFrequency("speech"), 2);

            Assert.AreEqual(0.33, rounded);
        }
Ejemplo n.º 12
0
        /// <summary>
        /// Round-trips a computed <see cref="BagOfWords"/> through <c>BinaryFormatter</c> and checks
        /// that the deserialized instance produces the same feature vectors as the original.
        /// </summary>
        public void SerializationTest()
        {
            BagOfWords target = new BagOfWords();

            target.Compute(texts);

            // Feature vectors produced before serialization
            int[][] expected = new int[texts.Length][];
            for (int i = 0; i < expected.Length; i++)
            {
                expected[i] = target.GetFeatureVector(texts[i]);
            }

            // NOTE(review): BinaryFormatter is obsolete and unsafe for untrusted data; it is kept
            // here only because this test targets that serialization path.
            BinaryFormatter fmt = new BinaryFormatter();

            // Dispose the stream deterministically instead of leaving it to the GC.
            using (MemoryStream stream = new MemoryStream())
            {
                fmt.Serialize(stream, target);
                stream.Seek(0, SeekOrigin.Begin);
                target = (BagOfWords)fmt.Deserialize(stream);
            }

            // Feature vectors produced by the deserialized instance
            int[][] actual = new int[expected.Length][];
            for (int i = 0; i < actual.Length; i++)
            {
                actual[i] = target.GetFeatureVector(texts[i]);
            }

            Assert.IsTrue(expected.IsEqual(actual));
        }
Ejemplo n.º 13
0
 /// <summary>
 /// Writes the names of two bags and their similarity value to the console.
 /// </summary>
 /// <param name="bag1">First bag; its <c>Name</c> is printed.</param>
 /// <param name="bag2">Second bag; its <c>Name</c> is printed.</param>
 /// <param name="similarity">Similarity value to print.</param>
 private static void OutputSimilarity(BagOfWords bag1, BagOfWords bag2, double similarity)
 {
     var output = Console.Out;

     output.WriteLine("Similarity for documents '{0}' and '{1}':", bag1.Name, bag2.Name);
     output.WriteLine("- similarity: {0}", similarity);
     output.WriteLine("---------------------");
     output.WriteLine();
 }
Ejemplo n.º 14
0
        /// <summary>
        /// Classifies every tweet with the model obtained from <c>FileHelper.GetModel</c>,
        /// tallies the predicted classes and converts the tally into a bias value.
        /// </summary>
        /// <param name="tweets"> The list of tweets</param>
        /// <returns> The bias; 0 when <paramref name="tweets"/> is empty </returns>
        public double RunNaiveBayes(List <Tweet> tweets)
        {
            // Nothing to classify
            if (tweets.Count == 0)
            {
                return(0);
            }

            var model = FileHelper.GetModel();

            bagOfWords = FileHelper.GetBagOfWords();

            // Predict the class of each formatted tweet
            int[] answers = model.Decide(FormatTweets(tweets));

            // One counter per class index 0..2
            var tally = new List <int> { 0, 0, 0 };

            for (int i = 0; i < answers.Length; i++)
            {
                tally[answers[i]] += 1;
            }

            return(CalcBias(tally));
        }
Ejemplo n.º 15
0
        /// <summary>
        /// Loads the user (creating it and liking two fixed documents if it does not exist),
        /// rates all feed bags against the user's bag and stores the positive ratings.
        /// </summary>
        public void TestTeaching()
        {
            string userName = "******";

            List <BagOfWords> bags = feedService.GetAllBags();

            UserInterest interest = userService.GetUser(userName);

            if (interest == null)
            {
                // First run for this user: create it and seed it with two liked documents
                interest = userService.AddUser(userName);

                string[] seedDocumentIds =
                {
                    "tag:blogger.com,1999:blog-19732346.post-6636802898282623833",
                    "tag:blogger.com,1999:blog-19732346.post-1441194024531049182"
                };

                foreach (string documentId in seedDocumentIds)
                {
                    FeedItem document = feedService.GetDocument(documentId);
                    userService.LikeDocument(interest, document);
                }
            }

            BagOfWords userBag = new BagOfWords(interest.Id, interest);

            // Keep only ratings strictly greater than zero
            interest.Ratings = userBag.CalculateRatings(bags)
                                      .Where(rating => rating.Rating > 0)
                                      .ToList();
            userService.UpdateUser(interest);
        }
Ejemplo n.º 16
0
 /// <summary>
 /// Creates the Bayes learner and then loads the model, the output codebook and the
 /// three bag-of-words instances from the store named <paramref name="storeName"/>.
 /// </summary>
 /// <param name="storeName">Store name passed to every <c>StorageHelpers.LoadItem</c> call.</param>
 public void Load(string storeName)
 {
     // Called first; the next line assigns the loaded model to _bayesLearning.Model
     CreateBayesLearner();
     _bayesLearning.Model = StorageHelpers.LoadItem <NaiveBayes>(Constants.ParamModel, storeName);
     _outputCodeBook      = StorageHelpers.LoadItem <Codification>(Constants.ParamCodeBook, storeName);
     _inputBagOfWords     = StorageHelpers.LoadItem <BagOfWords>(Constants.ParamInputBoW, storeName);
     _subjectBagOfWords   = StorageHelpers.LoadItem <BagOfWords>(Constants.ParamSubjectBoW, storeName);
     _textBagOfWords      = StorageHelpers.LoadItem <BagOfWords>(Constants.ParamTextBoW, storeName);
 }
Ejemplo n.º 17
0
 /// <summary>
 /// Replaces <c>_textBagOfWords</c> with a new bag (maximum occurrence 1), learns it from
 /// <paramref name="inputs"/> and returns the transformed inputs.
 /// </summary>
 /// <param name="inputs">Tokenized texts, one string array per text.</param>
 /// <returns>The result of transforming <paramref name="inputs"/> with the newly learned bag.</returns>
 private double[][] CreateTextBagOfWords(string[][] inputs)
 {
     var textBag = new BagOfWords() { MaximumOccurance = 1 };

     _textBagOfWords = textBag;
     textBag.Learn(inputs);

     double[][] encoded = textBag.Transform(inputs);
     return encoded;
 }
        /// <summary>
        /// Reads the embedded resource whose name ends with "docword.kos.txt" into a
        /// <see cref="BagOfWords"/>.
        /// </summary>
        /// <remarks>
        /// The first three lines are stored as-is in the header properties. Every following
        /// non-blank line is split on a single space into document id, word id and word count.
        /// </remarks>
        /// <returns>The populated bag of words.</returns>
        public BagOfWords Read()
        {
            var assembly     = Assembly.GetExecutingAssembly();
            var resourceName = assembly.GetManifestResourceNames()
                               .Single(x => x.EndsWith("docword.kos.txt", StringComparison.Ordinal));

            Console.WriteLine("Opening bag of words...");

            var bagOfWords = new BagOfWords();

            // Id -> document index, so each line is O(1) instead of a linear Exists/Find scan
            // over all documents read so far.
            // NOTE(review): assumes a freshly constructed BagOfWords has an empty Documents list.
            var documentsById = new Dictionary <int, Document>();

            using (Stream stream = assembly.GetManifestResourceStream(resourceName))
                using (StreamReader sr = new StreamReader(stream, Encoding.UTF8))
                {
                    bagOfWords.NumberOfDocuments         = sr.ReadLine();
                    bagOfWords.NumberOfWordsInVocabulary = sr.ReadLine();
                    bagOfWords.NumberOfNnz = sr.ReadLine();

                    string line;
                    while ((line = sr.ReadLine()) != null)
                    {
                        // Tolerate blank lines (e.g. a trailing newline) instead of failing to parse them.
                        if (string.IsNullOrWhiteSpace(line))
                        {
                            continue;
                        }

                        var data = line.Split(" ");

                        var documentId = Convert.ToInt32(data[0]);
                        var wordId     = Convert.ToInt32(data[1]);
                        var wordCount  = Convert.ToInt32(data[2]);

                        Document document;
                        if (!documentsById.TryGetValue(documentId, out document))
                        {
                            // First word seen for this document: register it in the bag and the index
                            document = new Document
                            {
                                Id    = documentId,
                                Words = new List <Word>()
                            };
                            documentsById.Add(documentId, document);
                            bagOfWords.Documents.Add(document);
                        }

                        document.Words.Add(new Word
                        {
                            Id    = wordId,
                            Count = wordCount
                        });
                    }
                }
            Console.WriteLine("Bag of words loaded.");

            return(bagOfWords);
        }
Ejemplo n.º 19
0
        /// <summary>
        /// Creates the WordNet engine, the NLP service, the bag of words and the term repository,
        /// then builds the execution list with a single <c>FirstExecution</c>.
        /// </summary>
        /// <param name="wordnetDir">Passed to the <c>WordNetEngine</c> constructor.</param>
        /// <param name="inMemory">Passed to the <c>WordNetEngine</c> constructor.</param>
        /// <param name="modelDir">Passed to the <c>OpenNLPService</c> constructor.</param>
        public void Init(string wordnetDir, bool inMemory, string modelDir)
        {
            wordnet = new WordNetEngine(wordnetDir, inMemory);
            nlp = new OpenNLPService(modelDir);
            bow = new BagOfWords();
            terms = new FlatRepository();

            // The execution receives the engine and service created above
            executions = new List<IExecution>();
            executions.Add(new FirstExecution(wordnet, nlp));
        }
Ejemplo n.º 20
0
        /// <summary>
        /// Registers three bags and expects <c>FindSimilar</c> to return three results for the query bag.
        /// </summary>
        public void Similarity()
        {
            instance.Register(BagOfWords.Create("one", "two", "three"));
            instance.Register(BagOfWords.Create("one", "one", "one"));
            instance.Register(BagOfWords.Create("one", "one", "two"));

            var query   = BagOfWords.Create("thee", "two", "three");
            var matches = instance.FindSimilar(query).ToArray();

            Assert.AreEqual(3, matches.Length);
        }
Ejemplo n.º 21
0
        /// <summary>
        /// The bag exposes every distinct word of the document and nothing that was not in it.
        /// </summary>
        public void ProvidesAllWords()
        {
            var bagOfWords = new BagOfWords(new Document(new[] { "one", "two", "other", "two" }));

            // Every word of the document must be present
            foreach (var expectedWord in new[] { "one", "two", "other" })
            {
                Assert.That(bagOfWords.Words, Contains.Item(expectedWord));
            }

            // A word that never occurred must be absent
            CollectionAssert.DoesNotContain(bagOfWords.Words, "nonexisting");
        }
Ejemplo n.º 22
0
        /// <summary>
        /// <c>Count</c> returns the number of occurrences of a word in the document, and 0 for unknown words.
        /// </summary>
        public void CorectlyCountsWordsInBagOfWords()
        {
            string[] words      = { "one", "two", "other", "two" };
            var      bagOfWords = new BagOfWords(new Document(words));

            Assert.That(bagOfWords.Count("one"), Is.EqualTo(1));
            Assert.That(bagOfWords.Count("two"), Is.EqualTo(2));
            Assert.That(bagOfWords.Count("other"), Is.EqualTo(1));
            Assert.That(bagOfWords.Count("nonexisting"), Is.EqualTo(0));
        }
Ejemplo n.º 23
0
        /// <summary>
        /// Creates the WordNet engine, the NLP service, the bag of words and the term repository,
        /// then builds the execution list with a single <c>FirstExecution</c>.
        /// </summary>
        /// <param name="wordnetDir">Passed to the <c>WordNetEngine</c> constructor.</param>
        /// <param name="inMemory">Passed to the <c>WordNetEngine</c> constructor.</param>
        /// <param name="modelDir">Passed to the <c>OpenNLPService</c> constructor.</param>
        public void Init(string wordnetDir, bool inMemory, string modelDir)
        {
            wordnet = new WordNetEngine(wordnetDir, inMemory);
            nlp     = new OpenNLPService(modelDir);
            bow     = new BagOfWords();
            terms   = new FlatRepository();

            // The execution receives the engine and service created above
            executions = new List <IExecution>();
            executions.Add(new FirstExecution(wordnet, nlp));
        }
Ejemplo n.º 24
0
        /// <summary>
        /// Reads the liberal and conservative training files and the stop-word list, then trains
        /// the bag of words and the naive Bayes classifier. Liberal samples are labelled 0 and
        /// conservative samples 1.
        /// </summary>
        /// <remarks>
        /// Private so the class can be used as a singleton and is not retrained by every consumer.
        /// </remarks>
        /// <exception cref="InvalidOperationException">No training samples could be read.</exception>
        private TextAnalyzer()
        {
            //Naive Bayes trainer with a small regularization value
            var teacher = new NaiveBayesLearning <NormalDistribution, NormalOptions>()
            {
                Options = { InnerOption = { Regularization = 1e-6 } }
            };

            //Read in the training data and stop words
            string liberalTrainingPath      = System.Web.Hosting.HostingEnvironment.MapPath(@"~/Data/liberal_training.txt");
            string conservativeTrainingPath = System.Web.Hosting.HostingEnvironment.MapPath(@"~/Data/conservative_training.txt");
            string stopWordsPath            = System.Web.Hosting.HostingEnvironment.MapPath(@"~/Data/stop_words.txt");

            string[] liberalSamples      = File.ReadAllLines(liberalTrainingPath);
            string[] conservativeSamples = File.ReadAllLines(conservativeTrainingPath);
            stopWords = File.ReadAllLines(stopWordsPath);

            //Liberal samples first, conservative samples after; the labels below rely on this order
            string[] samples = liberalSamples.Concat(conservativeSamples).ToArray();

            //Break the text up into individual words
            string[][] words = samples.Tokenize();

            //Without training data the classifier cannot work. A specific exception type is thrown
            //instead of the bare System.Exception; existing catch (Exception) handlers still match.
            if (words.Length == 0)
            {
                throw new InvalidOperationException("No training data for TextAnalyzer");
            }

            //Remove common english words
            words = TrimStopWords(words);

            //Create a bag of words using the tokenized sample data
            bagOfWords = new BagOfWords();
            bagOfWords.Learn(words);

            //Labels: the array starts zeroed (liberal = 0), so only the conservative tail is set to 1
            int[] outputs = new int[samples.Length];
            for (int i = liberalSamples.Length; i < outputs.Length; i++)
            {
                outputs[i] = 1;
            }

            //Train the classifier
            double[][] inputs = bagOfWords.Transform(words);
            nbClassifier = teacher.Learn(inputs, outputs);
        }
Ejemplo n.º 25
0
        /// <summary>
        /// Builds one bag for the whole feed by adding every feed item as a document.
        /// </summary>
        /// <param name="feed">Feed whose name, URL and items are read.</param>
        /// <returns>The bag containing all items of the feed.</returns>
        public BagOfWords AnalyzeFeed(Feed feed)
        {
            var feedBag = new BagOfWords(feed.Name, feed.URL, feed.Name);

            foreach (FeedItem entry in feed.Items)
            {
                feedBag.AddDocument(entry.Title, entry.ContentText, entry.Tags);
            }

            return feedBag;
        }
Ejemplo n.º 26
0
        /// <summary>
        /// Lazily yields one bag per feed item, each containing only that item as a document.
        /// </summary>
        /// <param name="feed">Feed whose items are analyzed; its name is passed to every bag.</param>
        /// <returns>A deferred sequence with one bag per item, in item order.</returns>
        public IEnumerable <BagOfWords> AnalyzeFeedItems(Feed feed)
        {
            // The list that used to be allocated here was never used: results are streamed with yield.
            foreach (FeedItem item in feed.Items)
            {
                BagOfWords bag = new BagOfWords(item.Title, item.Id, feed.Name);
                bag.AddDocument(item.Title, item.ContentText, item.Tags);
                yield return(bag);
            }
        }
Ejemplo n.º 27
0
 /// <summary>
 /// Writes the entries of the bag to the BOW log file, highest value first,
 /// one "value&lt;TAB&gt;key" line per entry.
 /// </summary>
 /// <param name="bow">Bag whose key/value entries are written.</param>
 public static void LogBOW(BagOfWords bow)
 {
     string filepath = CreateFilePath(BOW_FILE);
     using (StreamWriter writer = new StreamWriter(filepath))
     {
         var entriesByValue = bow.OrderByDescending(entry => entry.Value);

         foreach (var entry in entriesByValue)
         {
             string line = string.Format("{0}\t{1}", entry.Value, entry.Key);
             writer.WriteLine(line);
         }
     }
 }
Ejemplo n.º 28
0
 /// <summary>
 /// Gets the trained bag of words, creating and training it from "BagOfWords90.txt"
 /// on the first call.
 /// </summary>
 /// <returns>The cached, trained bag of words.</returns>
 public static BagOfWords GetBagOfWords()
 {
     if (bagOfWords == null)
     {
         // Train a local instance and publish it only once training succeeded. Assigning the
         // static field first would cache an untrained bag if reading the file or Learn threw.
         var trained = new BagOfWords()
         {
             MaximumOccurance = 1
         };
         trained.Learn(ReadObjectFromFile <string[][]>(@"BagOfWords90.txt"));
         bagOfWords = trained;
     }
     return(bagOfWords);
 }
Ejemplo n.º 29
0
        /// <summary>
        /// Writes the entries of the bag to the BOW log file, highest value first,
        /// one "value&lt;TAB&gt;key" line per entry.
        /// </summary>
        /// <param name="bow">Bag whose key/value entries are written.</param>
        public static void LogBOW(BagOfWords bow)
        {
            string filepath = CreateFilePath(BOW_FILE);

            using (StreamWriter writer = new StreamWriter(filepath))
            {
                var entriesByValue = bow.OrderByDescending(entry => entry.Value);

                foreach (var entry in entriesByValue)
                {
                    string line = string.Format("{0}\t{1}", entry.Value, entry.Key);
                    writer.WriteLine(line);
                }
            }
        }
Ejemplo n.º 30
0
Archivo: Tf.cs Proyecto: psla/tfidf
 /// <summary>
 /// Stores the bag and raises <c>_maxDivider</c> to the largest word count found in it.
 /// </summary>
 /// <param name="bagOfWords">Bag whose words and counts are inspected.</param>
 public Tf(BagOfWords bagOfWords)
 {
     _bagOfWords = bagOfWords;

     // Track the highest occurrence count seen so far
     foreach (string term in bagOfWords.Words)
     {
         uint occurrences = bagOfWords.Count(term);

         if (occurrences > _maxDivider)
         {
             _maxDivider = occurrences;
         }
     }
 }
Ejemplo n.º 31
0
        /// <summary>
        /// Encodes two tokenized Japanese sentences with a bag-of-words codebook, trains a
        /// logistic regression to tell them apart and prints both predictions.
        /// </summary>
        public void ExecuteTest()
        {
            string[][] words =
            {
                new[] { "今日", "は", "いい", "天気", "です" },
                new[] { "明日", "も", "いい", "天気", "でしょう" }
            };

            // int.MaxValue as the limit; a limit of 1 would instead yield only 0's and 1's
            var codebook = new BagOfWords();
            codebook.MaximumOccurance = int.MaxValue;

            // Learn the codebook (this would only be done on the training set)
            codebook.Learn(words);

            // Encode each sentence individually ...
            double[] bow1 = codebook.Transform(words[0]);
            double[] bow2 = codebook.Transform(words[1]);

            // ... or everything at once
            double[][] bow = codebook.Transform(words);

            // Logistic regression learner and its convergence settings
            var learner = new IterativeReweightedLeastSquares <LogisticRegression>();
            learner.Tolerance      = 1e-4;
            learner.Iterations     = 100; // maximum number of iterations to perform
            learner.Regularization = 0;

            // First sentence is the negative class, second the positive class
            double[][] trainingInputs = { bow1, bow2 };
            bool[]     trainingLabels = { false, true };

            LogisticRegression reg = learner.Learn(trainingInputs, trainingLabels);

            // Predict on the training vectors
            bool c1 = reg.Decide(bow1); // Should be false
            bool c2 = reg.Decide(bow2); // Should be true

            Console.WriteLine(c1);
            Console.WriteLine(c2);
        }
Ejemplo n.º 32
0
        /// <summary>
        /// Trains a multi-class SVM on bag-of-words representations of eight integer sequences
        /// and expects an accuracy of 0.75 on the training set.
        /// </summary>
        public void learn_generic1()
        {
            // Testing data: four sequences of class 0 followed by four of class 1
            int[][] sequences =
            {
                new[] { 0, 0, 1, 2 },     // Class 0
                new[] { 0, 1, 1, 2 },     // Class 0
                new[] { 0, 0, 0, 1, 2 },  // Class 0
                new[] { 0, 1, 2, 2, 2 },  // Class 0

                new[] { 2, 2, 1, 0 },     // Class 1
                new[] { 2, 2, 2, 1, 0 },  // Class 1
                new[] { 2, 2, 2, 1, 0 },  // Class 1
                new[] { 2, 2, 2, 2, 1 },  // Class 1
            };

            int[] outputs = { 0, 0, 0, 0, 1, 1, 1, 1 };

            // Bag-of-Words learner restricted to a degree of parallelism of 1
            var bow = new BagOfWords <int>();

            bow.ParallelOptions.MaxDegreeOfParallelism = 1;

            // Learn the quantizer and encode the sequences with it
            double[][] representations = bow.Learn(sequences).Transform(sequences);

            // Multi-class SVM learner using SMO over the ChiSquare kernel type
            var teacher = new MulticlassSupportVectorLearning <ChiSquare, double[]>();

            teacher.Learner = (p) => new SequentialMinimalOptimization <ChiSquare, double[]>()
            {
                Complexity = 100000
            };

            // Train, then predict on the training set
            var   svm       = teacher.Learn(representations, outputs);
            int[] predicted = svm.Decide(representations);

            var cm = new ConfusionMatrix(predicted: predicted, expected: outputs);

            Assert.AreEqual(0.75, cm.Accuracy);
        }
Ejemplo n.º 33
0
        /// <summary>
        /// Writes the names of two bags, the number of labels they share and each shared label to the console.
        /// </summary>
        /// <param name="bag1">First bag; its <c>Name</c> is printed.</param>
        /// <param name="bag2">Second bag; its <c>Name</c> is printed.</param>
        /// <param name="commonLabels">Labels to list.</param>
        private static void OutputCommonLabels(BagOfWords bag1, BagOfWords bag2, List <string> commonLabels)
        {
            var output = Console.Out;

            output.WriteLine("Common labels for documents '{0}' and '{1}':", bag1.Name, bag2.Name);
            output.WriteLine("- number of common labels: {0}", commonLabels.Count);
            output.WriteLine("- list of labels:");

            // One tab-indented line per label
            for (int i = 0; i < commonLabels.Count; i++)
            {
                output.WriteLine("\t{0}", commonLabels[i]);
            }

            output.WriteLine("---------------------");
            output.WriteLine();
        }
Ejemplo n.º 34
0
        /// <summary>
        /// Analyzes the first four feeds and prints, for every pair of them, first the common
        /// labels and then the similarity (computed with labels enabled).
        /// </summary>
        /// <param name="feedService">Service used to fetch, analyze and compare the feeds.</param>
        private static void TestFeeds(FeedService feedService)
        {
            List <Feed> feeds = feedService.GetFeeds();

            // Analyze feeds 0..3, in order
            var bags = new BagOfWords[4];
            for (int i = 0; i < bags.Length; i++)
            {
                bags[i] = feedService.AnalyzeFeed(feeds[i]);
            }

            // Common labels for every pair: (1,2) (1,3) (1,4) (2,3) (2,4) (3,4)
            for (int first = 0; first < bags.Length; first++)
            {
                for (int second = first + 1; second < bags.Length; second++)
                {
                    List <string> commonLabels = feedService.CompareBags(bags[first], bags[second]);
                    OutputCommonLabels(bags[first], bags[second], commonLabels);
                }
            }

            // Similarity for every pair, in the same pair order
            for (int first = 0; first < bags.Length; first++)
            {
                for (int second = first + 1; second < bags.Length; second++)
                {
                    double similarity = feedService.CalculateSimilarity(bags[first], bags[second], true);
                    OutputSimilarity(bags[first], bags[second], similarity);
                }
            }
        }
Ejemplo n.º 35
0
        /// <summary>
        /// Returns the labels of <paramref name="bag1"/> that are also labels of <paramref name="bag2"/>.
        /// </summary>
        /// <param name="bag1">Bag whose label keys are iterated.</param>
        /// <param name="bag2">Bag whose labels are checked for each key.</param>
        /// <returns>The shared labels, in the iteration order of the first bag's label keys.</returns>
        public List <string> CompareBags(BagOfWords bag1, BagOfWords bag2)
        {
            var shared = new List <string>();

            foreach (string label in bag1.Labels.Keys)
            {
                // Skip labels the second bag does not have
                if (!bag2.Labels.ContainsKey(label))
                {
                    continue;
                }

                shared.Add(label);
            }

            return shared;
        }
Ejemplo n.º 36
0
 /// <summary>
 /// With "fly" three times and "fruit" once, the expected term weights are 1 for "fly",
 /// about 1/3 for "fruit" and 0 for a word that is not in the bag.
 /// </summary>
 public void GivenBagOfWordsTfIsCorrect()
 {
     var document = new Document(new[] { "fly", "fly", "fly", "fruit" });
     var tf       = new Tf(new BagOfWords(document));

     Assert.AreEqual(0.33333333, tf.TermWeight("fruit"), 0.00001);
     Assert.AreEqual(1d, tf.TermWeight("fly"), 0.00001);
     Assert.That(tf.TermWeight("nonexisting"), Is.EqualTo(0));
 }
Ejemplo n.º 37
0
        /// <summary>
        /// Checks the feature vector of a three-word text: positions 0 and 2 must be 1,
        /// position 1 must be 0 and no later position may be 1.
        /// </summary>
        public void GetFeatureVectorTest2()
        {
            var target = new BagOfWords(texts);

            int[] actual = target.GetFeatureVector(new[] { "Lorem", "test", "dolor" });

            Assert.AreEqual(1, actual[0]);
            Assert.AreEqual(0, actual[1]);
            Assert.AreEqual(1, actual[2]);

            // No position from index 3 onwards may be set to 1
            for (int position = 3; position < actual.Length; position++)
            {
                Assert.AreNotEqual(1, actual[position]);
            }
        }
Ejemplo n.º 38
0
        /// <summary>
        /// Round-trips a computed <see cref="BagOfWords"/> through <c>BinaryFormatter</c> and checks
        /// that the deserialized instance produces the same feature vectors as the original.
        /// </summary>
        public void SerializationTest()
        {
            BagOfWords target = new BagOfWords();

            target.Compute(texts);

            // Feature vectors produced before serialization
            int[][] expected = new int[texts.Length][];
            for (int i = 0; i < expected.Length; i++)
            {
                expected[i] = target.GetFeatureVector(texts[i]);
            }

            // NOTE(review): BinaryFormatter is obsolete and unsafe for untrusted data; it is kept
            // here only because this test targets that serialization path.
            BinaryFormatter fmt = new BinaryFormatter();

            // Dispose the stream deterministically instead of leaving it to the GC.
            using (MemoryStream stream = new MemoryStream())
            {
                fmt.Serialize(stream, target);
                stream.Seek(0, SeekOrigin.Begin);
                target = (BagOfWords)fmt.Deserialize(stream);
            }

            // Feature vectors produced by the deserialized instance
            int[][] actual = new int[expected.Length][];
            for (int i = 0; i < actual.Length; i++)
            {
                actual[i] = target.GetFeatureVector(texts[i]);
            }

            Assert.IsTrue(expected.IsEqual(actual));
        }
Ejemplo n.º 39
0
        /// <summary>
        /// Builds a bag of words from two tokenized texts, trains a discrete naive Bayes model on
        /// their feature vectors and checks that each text is classified as its own class.
        /// </summary>
        public void ComputeTest2()
        {

            // Some sample texts
            string[] spamTokens = Tokenize(@"I decided to sign up for the Disney Half Marathon. Half of a marathon is 13.1 miles. A full marathon is 26.2 miles. You may wonder why the strange number of miles. “26.2” is certainly not an even number. And after running 26 miles who cares about the point two? You might think that 26.2 miles is a whole number of kilometers. It isn’t. In fact, it is even worse in kilometers – 42.1648128. I bet you don’t see many t-shirts in England with that number printed on the front.");

            string[] loremTokens = Tokenize(@"Lorem ipsum dolor sit amet,  Nulla nec tortor. Donec id elit quis purus consectetur consequat. Nam congue semper tellus. Sed erat dolor, dapibus sit amet, venenatis ornare, ultrices ut, nisi. Aliquam ante. Suspendisse scelerisque dui nec velit. Duis augue augue, gravida euismod, vulputate ac, facilisis id, sem. Morbi in orci. Nulla purus lacus, pulvinar vel, malesuada ac, mattis nec, quam. Nam molestie scelerisque quam. Nullam feugiat cursus lacus.orem ipsum dolor sit amet.");

            // Their respective classes; the index is the class label used in outputs below
            string[] classes = { "spam", "lorem" };


            // Create a new Bag-of-Words for the texts
            BagOfWords bow = new BagOfWords(spamTokens, loremTokens)
            {
                // Limit the maximum number of occurrences in
                // the feature vector to a single instance
                MaximumOccurance = 1
            };

            // Define the symbols for the Naïve Bayes: one entry per word, each with
            // MaximumOccurance + 1 possible values
            int[] symbols = new int[bow.NumberOfWords];
            for (int i = 0; i < symbols.Length; i++)
                symbols[i] = bow.MaximumOccurance + 1;

            // Create input and outputs for training
            int[][] inputs =
            {
                bow.GetFeatureVector(spamTokens),
                bow.GetFeatureVector(loremTokens)
            };

            int[] outputs =
            {
                0, // spam
                1, // lorem
            };

            // Create the naïve Bayes model
            NaiveBayes bayes = new NaiveBayes(2, symbols);

            // Seed every probability entry with a tiny value (1e-10) before estimating
            for (int i = 0; i < bayes.ClassCount; i++)
                for (int j = 0; j < bayes.SymbolCount.Length; j++)
                    for (int k = 0; k < bayes.SymbolCount[j]; k++)
                        bayes.Distributions[i, j][k] = 1e-10;

            // Estimate the model
            bayes.Estimate(inputs, outputs);


            // Verify that every distribution sums to 1 (within 1e-5) after estimation
            for (int i = 0; i < bayes.ClassCount; i++)
                for (int j = 0; j < bayes.SymbolCount.Length; j++)
                {
                    double sum = bayes.Distributions[i, j].Sum();
                    Assert.AreEqual(1, sum, 1e-5);
                }

            // Consume the model
            {
                // First an example to classify as lorem
                int[] input = bow.GetFeatureVector(loremTokens);
                int answer = bayes.Compute(input);
                string result = classes[answer];

                Assert.AreEqual("lorem", result);
            }

            {
                // Then an example to classify as spam
                int[] input = bow.GetFeatureVector(spamTokens);
                int answer = bayes.Compute(input);
                string result = classes[answer];

                Assert.AreEqual("spam", result);
            }

        }
Ejemplo n.º 40
0
 /// <summary>
 /// Passes the WordNet engine and NLP service to the base constructor and creates
 /// a new bag of words in <c>bow</c>.
 /// </summary>
 /// <param name="wordnet">Forwarded to the base constructor.</param>
 /// <param name="nlp">Forwarded to the base constructor.</param>
 public FirstExecution(WordNetEngine wordnet, INLPService nlp)
     : base(wordnet, nlp)
 {
     bow = new BagOfWords();
 }
Ejemplo n.º 41
0
        /// <summary>
        /// After raising the occurrence limit, a text containing "vestibulum" three times must
        /// give a feature vector with 3 at index 43 and 0 everywhere else.
        /// </summary>
        public void ComputeTest()
        {
            var target = new BagOfWords();

            target.Compute(texts);

            target.MaximumOccurance = Int16.MaxValue;

            const int expectedIndex = 43;

            int[] actual = target.GetFeatureVector(new[] { "vestibulum", "vestibulum", "vestibulum" });

            Assert.AreEqual(3, actual[expectedIndex]);

            // Every other position must stay zero
            for (int position = 0; position < actual.Length; position++)
            {
                if (position == expectedIndex)
                {
                    continue;
                }

                Assert.AreEqual(0, actual[position]);
            }
        }