Example #1
0
        /// <summary>
        /// Classifies every record awaiting review and prints its text, its approval and
        /// denial scores, and the resulting verdict to the console.
        /// </summary>
        /// <remarks>
        /// Blocks on <see cref="Console.Read"/> once all records have been printed.
        /// </remarks>
        public void GetOutcomes()
        {
            // Scores strictly above this value decide the verdict.
            const double threshold = 0.5;

            List<Data> data = DataExtractor.GetDataToBeReviewed();
            KeywordExtractor k = new KeywordExtractor();

            foreach (var d in data)
            {
                // Key phrases may span several words; flatten them into single-word tokens
                // once and reuse the same token array for both classifications.
                string[] tokens = string.Join(" ", k.FindKeyPhrases(d.Text)).Split(' ');

                double approved = Classifier.Classify(Data.Outcome.Approved.ToString(), tokens);
                double denied = Classifier.Classify(Data.Outcome.Denied.ToString(), tokens);

                // Approval takes precedence when both scores exceed the threshold.
                string verdict;
                if (approved > threshold)
                {
                    verdict = "Approved";
                }
                else if (denied > threshold)
                {
                    verdict = "Denied";
                }
                else
                {
                    verdict = "No result";
                }

                Console.WriteLine(d.Text + ": " + approved + ": " + denied + ": " + verdict);
            }

            Console.Read();
        }
Example #2
0
        /// <summary>
        /// Builds a recommendation search string from one randomly chosen entry of the
        /// liked-games list.
        /// </summary>
        /// <returns>
        /// The first key phrase extracted from the chosen game's description, immediately
        /// followed by the game's title and a trailing space; an empty string when the
        /// liked-games list is empty. If no key phrase is found, only the title and the
        /// trailing space are returned.
        /// </returns>
        public static string CollectRecommendations()
        {
            if (likeGameDataList.Count <= 0)
            {
                return "";
            }

            KeywordExtractor extractor = new KeywordExtractor();

            // Only a single liked game is sampled per call.
            Random rnd = new Random();
            int r = rnd.Next(0, likeGameDataList.Count);
            var liked = likeGameDataList[r];

            // The extractor may return fewer phrases than expected (possibly none), so
            // guard the index instead of assuming at least two slots exist.
            string[] phrases = extractor.FindKeyPhrases(liked.description);
            string firstPhrase = (phrases != null && phrases.Length > 0) ? phrases[0] : "";

            // NOTE(review): phrase and title are joined without a separator, exactly as
            // before; confirm whether a space between them is actually wanted.
            return firstPhrase + liked.title + " ";
        }
        /// <summary>
        /// Creates a <see cref="TopicKeywordExtractor"/> whose keyword extractor uses the
        /// English smart stop-word filter and whose topic and phrase collections start empty.
        /// </summary>
        public TopicKeywordExtractor()
        {
            var englishStopWords = new NRakeCore.StopWordFilters.EnglishSmartStopWordFilter();
            KeywordExtractor = new KeywordExtractor(englishStopWords);

            // All collections begin empty.
            Items = new List<Topic>();
            PhraseScore = new Dictionary<string, float>();
            TopicKeyPhrases = new Dictionary<Topic, List<string>>();
        }
        /// <summary>
        /// Creates a <see cref="Topic"/> for the given file name and stores the number of
        /// tokens found in its text.
        /// </summary>
        /// <param name="fileName">Value handed to the <see cref="Topic"/> constructor.</param>
        /// <returns>The new topic with <c>Words</c> set to its token count.</returns>
        private Topic GetTopicFrom(string fileName)
        {
            var tokenizer = new KeywordExtractor(new NRakeCore.StopWordFilters.EnglishSmartStopWordFilter());
            var topic = new Topic(fileName);

            // The extractor is used here purely as a tokenizer.
            var tokens = tokenizer.Tokenize(topic.GetText());
            topic.Words = tokens.Length;

            return topic;
        }
Example #5
0
        /// <summary>
        /// Checks that the "en-us" culture yields an extractor whose stop-word filter is
        /// exactly <see cref="EnglishSmartStopWordFilter"/>.
        /// </summary>
        public void GetBestInstanceForCulture_ENUS()
        {
            // Arrange
            const string cultureName = "en-us";

            // Act
            var best = KeywordExtractor.GetBestInstanceForCulture(cultureName);
            var filterType = best.StopWordFilter.GetType();

            // Assert
            Assert.AreEqual(typeof(EnglishSmartStopWordFilter), filterType);
        }
Example #6
0
        /// <summary>
        /// Checks that the "fr-ca" culture yields an extractor whose stop-word filter is
        /// exactly <see cref="FrenchStopWordFilter"/>.
        /// </summary>
        public void GetBestInstanceForCulture_FRCA()
        {
            // Arrange
            const string cultureName = "fr-ca";

            // Act
            var best = KeywordExtractor.GetBestInstanceForCulture(cultureName);
            var filterType = best.StopWordFilter.GetType();

            // Assert
            Assert.AreEqual(typeof(FrenchStopWordFilter), filterType);
        }
Example #7
0
        /// <summary>
        /// Calls <c>ScorePhrases</c> on a fresh extractor without first calling
        /// <c>ToPhrases</c>; the call is expected to throw.
        /// </summary>
        public void ScorePhrases_ExpectException()
        {
            // Arrange
            var extractor = new KeywordExtractor();
            string[] unindexedPhrases = { "compatibility", "systems", "linear constraints", "set", "natural numbers" };

            // Act
            // ToPhrases() was never called, so the index of unique words has not been
            // initialized and this call should throw.
            extractor.ScorePhrases(unindexedPhrases);

            // Assert
            // NOTE(review): presumably the expected exception is declared by a test
            // attribute outside this block - confirm.
        }
Example #8
0
        /// <summary>
        /// Checks that key-phrase extraction on <c>Sample1</c> yields eleven phrases, the
        /// first of which is "minimal supporting set".
        /// </summary>
        public void FindKeyPhrases()
        {
            // Arrange
            var extractor = new KeywordExtractor();

            // Act
            var keyPhrases = extractor.FindKeyPhrases(this.Sample1);

            // Assert
            Assert.AreEqual(11, keyPhrases.Length);
            Assert.AreEqual("minimal supporting set", keyPhrases[0]);
        }
        /// <summary>
        /// Extracts the key phrases of every supplied topic and rebuilds the lookups
        /// derived from them (<c>TopicKeyPhrases</c>, <c>KeyPhraseTopics</c> and
        /// <c>PhraseScore</c>).
        /// </summary>
        /// <param name="topics">The topics whose text is analysed.</param>
        /// <returns>The keys of the rebuilt <c>KeyPhraseTopics</c> lookup, as an array.</returns>
        public string[] FindKeyPhrases(Topic[] topics)
        {
            // Discard the per-topic results of any previous run.
            TopicKeyPhrases.Clear();

            foreach (Topic current in topics)
            {
                var text = current.GetText();
                TopicKeyPhrases.Add(current, KeywordExtractor.FindKeyPhrases(text).ToList());
            }

            // Derive the phrase-keyed lookup, then score the phrases it contains.
            KeyPhraseTopics = ToKeyPhraseTopics(TopicKeyPhrases);

            string[] phrases = KeyPhraseTopics.Keys.ToArray();
            PhraseScore = BuildPhraseScore(phrases);

            return phrases;
        }
Example #10
0
        /// <summary>
        /// Smoke test: extracts key phrases from the text of the element carrying the
        /// "posts" class in LargeFile2.html and checks that extraction completes and
        /// returns a result.
        /// </summary>
        public void FindKeyPhrases_LargeHtmlFile2()
        {
            //Arrange
            KeywordExtractor extractor = new KeywordExtractor();
            HtmlDocument doc = new HtmlDocument();

            doc.Load("LargeFile2.html");
            var body = doc.DocumentNode.SelectSingleNode("descendant-or-self::*[contains(concat(' ', normalize-space(@class), ' '), ' posts ')]");

            // SelectSingleNode yields null when nothing matches; fail with a clear message
            // rather than an opaque NullReferenceException further down.
            Assert.IsNotNull(body, "LargeFile2.html does not contain an element with the 'posts' class");

            RemoveComments(body);
            string text = body.InnerText;

            //Act
            var res = extractor.FindKeyPhrases(text);

            //Assert
            Assert.IsNotNull(res);
        }
Example #11
0
        /// <summary>
        /// Regression test: key-phrase extraction on <c>LockInsText</c> must complete
        /// without throwing.
        /// </summary>
        public void FindKeyPhrases_DuplicateKeyException()
        {
            // Arrange
            var extractor = new KeywordExtractor();

            // Act
            extractor.FindKeyPhrases(this.LockInsText);

            // CF Aug 21, 2014: The LockInsText file is unusual. The first instance of the word
            // "application" carries an odd encoding that effectively makes it "appli-cation",
            // although it is not displayed consistently in all views. In
            // WordCooccurrenceMatrix.AggregateLeagueTable() both variants survive .Distinct(),
            // yet on insertion into the dictionary they are treated as the same key.
            // This used to throw an unhandled exception (see
            // http://github.dev/yellowpencil/Octave/issues/383); a try/catch now swallows it
            // because it should only occur in bizarre cases like this one (and should not be
            // a perf issue either).

            // Assert
            // Reaching this point without an exception is the success condition.
        }
Example #12
0
        /// <summary>
        /// Checks that <c>ToPhrases</c> turns the tokens of a sample sentence into the
        /// expected phrases, in order.
        /// </summary>
        public void ToPhrases()
        {
            // Arrange
            var extractor = new KeywordExtractor();
            string[] expectedPhrases = { "compatibility", "systems", "linear constraints", "set", "natural numbers" };
            string[] tokens = extractor.Tokenize("Compatibility of systems of linear constraints over the set of natural numbers.");

            // Act
            var actualPhrases = extractor.ToPhrases(tokens);

            // Assert
            Assert.AreEqual(expectedPhrases.Length, actualPhrases.Length);

            // Lengths are equal at this point, so one index covers both arrays.
            int position = 0;
            while (position < actualPhrases.Length)
            {
                Assert.AreEqual(expectedPhrases[position], actualPhrases[position]);
                position++;
            }
        }
Example #13
0
        /// <summary>
        /// Checks selected diagonal and off-diagonal counts of the co-occurrence matrix
        /// compiled from the phrases of <c>Sample1</c>.
        /// </summary>
        public void CompileOccurrences()
        {
            // Arrange
            var extractor = new KeywordExtractor();
            string[] sampleTokens = extractor.Tokenize(this.Sample1);
            string[] samplePhrases = extractor.ToPhrases(sampleTokens);

            // The unique-word index is read only after ToPhrases() has run.
            var matrix = new WordCooccurrenceMatrix(extractor.UniqueWordIndex);

            // Act
            matrix.CompileOccurrences(samplePhrases);

            // Assert
            // Diagonal cells.
            Assert.AreEqual(2, matrix[matrix.IndexOf("algorithms"), matrix.IndexOf("algorithms")], "'algorithms' diagonal count");
            Assert.AreEqual(1, matrix[matrix.IndexOf("bounds"), matrix.IndexOf("bounds")], "'bounds' diagonal count");

            // Off-diagonal cells, including one pair checked in both directions.
            Assert.AreEqual(1, matrix[matrix.IndexOf("corresponding"), matrix.IndexOf("algorithms")], "'corresponding'->'algorithms' count");
            Assert.AreEqual(2, matrix[matrix.IndexOf("minimal"), matrix.IndexOf("set")], "'minimal'->'set' count");
            Assert.AreEqual(2, matrix[matrix.IndexOf("set"), matrix.IndexOf("minimal")], "'set'->'minimal' count");
        }
Example #14
0
        /// <summary>
        /// Checks the token count and selected tokens produced for a two-sentence sample:
        /// words come back lower-cased and punctuation marks are tokens of their own.
        /// </summary>
        public void Tokenize()
        {
            // Arrange
            var extractor = new KeywordExtractor();

            // Act
            var tokens = extractor.Tokenize("The quick, brown fox jumps over the lazy dog. Yes he does!");

            // Assert
            Assert.AreEqual(15, tokens.Length);

            // Leading tokens.
            Assert.AreEqual("the", tokens[0]);
            Assert.AreEqual("quick", tokens[1]);
            Assert.AreEqual(",", tokens[2]);
            Assert.AreEqual("brown", tokens[3]);
            Assert.AreEqual("fox", tokens[4]);

            // Trailing tokens.
            Assert.AreEqual("does", tokens[13]);
            Assert.AreEqual("!", tokens[14]);
        }
        /// <summary>
        /// Extracts TextRank keywords from the description of every news item stored in
        /// the pipeline context and returns them keyed by news id.
        /// </summary>
        /// <param name="context">
        /// Pipeline context; its <c>Pipeline</c> is read as a <c>NewsAnalysisPipeline</c>
        /// to obtain the key under which the news list is stored.
        /// </param>
        /// <returns>
        /// An <see cref="ActivityResult"/> whose <c>Result</c> is a dictionary from news id
        /// to keyword list (empty when the context holds no news list) and whose
        /// <c>ObjectType</c> is that dictionary's runtime type.
        /// </returns>
        public ActivityResult Run(PipelineContext context)
        {
            // NOTE(review): if the pipeline is not a NewsAnalysisPipeline, 'pipe' is null
            // and the key lookup below throws NullReferenceException - confirm whether
            // that can happen in practice.
            var pipe = context.Pipeline as NewsAnalysisPipeline;
            ActivityResult result = new ActivityResult();
            IDictionary<long, List<string>> dict = new Dictionary<long, List<string>>();

            KeywordExtractor extractor = new KeywordExtractor();
            var newsList = context[pipe.NewsContextKey] as IEnumerable<NewsStream>;

            if (newsList != null)
            {
                foreach (var news in newsList)
                {
                    // Null-check BEFORE materializing: ToList() throws on a null source,
                    // which previously made the empty-list fallback unreachable.
                    var extracted = extractor.ExtractKeywordsWithTextRank(news.NewsArticleDescription);
                    List<string> keywords = extracted != null ? extracted.ToList() : new List<string>();

                    dict.Add(news.Id, keywords);
                }
            }

            result.Result = dict;
            result.ObjectType = dict.GetType();
            return result;
        }
Example #16
0
        /// <summary>
        /// Checks the degree, frequency and ratio recorded in the league table for the
        /// words "algorithms" and "minimal" after compiling the phrases of <c>Sample1</c>.
        /// </summary>
        public void ComputeLeagueTable()
        {
            //Arrange
            KeywordExtractor extractor = new KeywordExtractor();

            string[] tokens = extractor.Tokenize(this.Sample1);
            string[] phrases = extractor.ToPhrases(tokens);
            WordCooccurrenceMatrix matrix = new WordCooccurrenceMatrix(extractor.UniqueWordIndex);

            //Act
            matrix.CompileOccurrences(phrases);
            SortedList<string, WordScore> leagueTable = matrix.LeagueTable;

            //Assert
            // Each assertion carries a distinct message so a failure identifies which
            // word's score is wrong (both groups used to share the "... 1" labels).
            Assert.AreEqual(3, leagueTable["algorithms"].Degree, "Degree 1");
            Assert.AreEqual(2, leagueTable["algorithms"].Frequency, "Frequency 1");
            Assert.AreEqual(1.5, leagueTable["algorithms"].Ratio, "Ratio 1");

            Assert.AreEqual(8, leagueTable["minimal"].Degree, "Degree 2");
            Assert.AreEqual(5, leagueTable["minimal"].Frequency, "Frequency 2");
            Assert.AreEqual(1.6, leagueTable["minimal"].Ratio, "Ratio 2");
        }
Example #17
0
        /// <summary>
        /// Trains the classifier on every historical record: the joined key phrases of a
        /// record are taught as a match for its own outcome and as a non-match for the
        /// opposite one.
        /// </summary>
        /// <returns>Always <c>true</c>.</returns>
        public bool SaveHistory()
        {
            List<Data> history = DataExtractor.GetData();
            KeywordExtractor extractor = new KeywordExtractor();

            foreach (var record in history)
            {
                string trainingText = string.Join(" ", extractor.FindKeyPhrases(record.Text));

                // Any outcome other than Approved is trained as Denied.
                bool wasApproved = record.Result == Data.Outcome.Approved;
                string matching = wasApproved
                    ? Data.Outcome.Approved.ToString()
                    : Data.Outcome.Denied.ToString();
                string opposing = wasApproved
                    ? Data.Outcome.Denied.ToString()
                    : Data.Outcome.Approved.ToString();

                Classifier.TeachMatch(matching, trainingText);
                Classifier.TeachNonMatch(opposing, trainingText);
            }

            return true;
        }
        /// <summary>
        /// Extracts the key phrases of a single topic using an extractor built around the
        /// supplied stop-word filter.
        /// </summary>
        /// <param name="topic">The topic whose text is analysed.</param>
        /// <param name="stopFilter">The stop-word filter handed to the extractor.</param>
        /// <returns>The key phrases found in the topic's text.</returns>
        private string[] TopicKeywords(Topic topic, NRakeCore.StopWordFilters.IStopWordFilter stopFilter)
        {
            // A dedicated extractor is created per call so the given filter applies.
            var extractor = new KeywordExtractor(stopFilter);
            var topicText = topic.GetText();

            return extractor.FindKeyPhrases(topicText);
        }