/// <summary>
/// Classifies every record returned by <c>DataExtractor.GetDataToBeReviewed()</c> and writes
/// one line per record to the console: the text, the "Approved" score, the "Denied" score
/// and the resulting verdict.
/// </summary>
/// <remarks>
/// A record is reported as "Approved" when its approved score exceeds 0.5, otherwise as
/// "Denied" when its denied score exceeds 0.5, otherwise as "No result". The method ends
/// with <c>Console.Read()</c>, presumably to keep the console window open.
/// </remarks>
public void GetOutcomes()
{
    List<Data> data = DataExtractor.GetDataToBeReviewed();
    KeywordExtractor k = new KeywordExtractor();

    foreach (var d in data)
    {
        // Flatten multi-word key phrases into single words exactly once; both
        // classifier calls consume the same token array.
        string[] tokens = string.Join(" ", k.FindKeyPhrases(d.Text)).Split(' ');

        double approved = Classifier.Classify(Data.Outcome.Approved.ToString(), tokens);
        double denied = Classifier.Classify(Data.Outcome.Denied.ToString(), tokens);

        // "Approved" takes precedence when both scores are above the threshold.
        string verdict;
        if (approved > 0.5)
        {
            verdict = "Approved";
        }
        else if (denied > 0.5)
        {
            verdict = "Denied";
        }
        else
        {
            verdict = "No result";
        }

        Console.WriteLine(d.Text + ": " + approved + ": " + denied + ": " + verdict);
    }

    Console.Read();
}
/// <summary>
/// Builds a recommendation search string from one randomly chosen entry of
/// <c>likeGameDataList</c>.
/// </summary>
/// <returns>
/// The first key phrase extracted from the chosen game's description, immediately followed
/// by the game's title and a trailing space. Returns an empty string when the like list is
/// empty.
/// </returns>
public static string CollectRecommendations()
{
    if (likeGameDataList.Count <= 0)
    {
        return "";
    }

    KeywordExtractor extractor = new KeywordExtractor();

    // Only a single liked game is sampled per call.
    Random rnd = new Random();
    int r = rnd.Next(0, likeGameDataList.Count);
    var liked = likeGameDataList[r];

    string[] phrases = extractor.FindKeyPhrases(liked.description);

    // The extractor may return fewer than two phrases; indexing blindly (or storing the
    // title into slot 1 of its result) would throw IndexOutOfRangeException, so fall back
    // to an empty phrase and keep the title in its own variable.
    string firstPhrase = (phrases != null && phrases.Length > 0) ? phrases[0] : "";

    // Deliberately concatenated without a separator, matching the established output format.
    return firstPhrase + liked.title + " ";
}
/// <summary>
/// Initializes a new instance of the <see cref="TopicKeywordExtractor"/> class with a keyword
/// extractor that uses the English smart stop-word filter, and with empty topic, phrase-score
/// and topic-key-phrase collections.
/// </summary>
public TopicKeywordExtractor()
{
    var stopWords = new NRakeCore.StopWordFilters.EnglishSmartStopWordFilter();
    KeywordExtractor = new KeywordExtractor(stopWords);

    Items = new List<Topic>();
    PhraseScore = new Dictionary<string, float>();
    TopicKeyPhrases = new Dictionary<Topic, List<string>>();
}
/// <summary>
/// Creates a <see cref="Topic"/> from <paramref name="fileName"/> and stores the number of
/// tokens in its text in the topic's <c>Words</c> member.
/// </summary>
/// <param name="fileName">Value handed to the <see cref="Topic"/> constructor.</param>
/// <returns>The newly created topic with <c>Words</c> populated.</returns>
private Topic GetTopicFrom(string fileName)
{
    var tokenizer = new KeywordExtractor(new NRakeCore.StopWordFilters.EnglishSmartStopWordFilter());
    var topic = new Topic(fileName);

    // The token count comes from the extractor's tokenizer.
    string[] tokens = tokenizer.Tokenize(topic.GetText());
    topic.Words = tokens.Length;

    return topic;
}
/// <summary>
/// Verifies that requesting the best extractor for "en-us" yields one whose stop-word filter
/// is exactly <see cref="EnglishSmartStopWordFilter"/>.
/// </summary>
public void GetBestInstanceForCulture_ENUS()
{
    // Arrange
    const string culture = "en-us";

    // Act
    var extractor = KeywordExtractor.GetBestInstanceForCulture(culture);
    var actualFilterType = extractor.StopWordFilter.GetType();

    // Assert
    Assert.AreEqual(typeof(EnglishSmartStopWordFilter), actualFilterType);
}
/// <summary>
/// Verifies that requesting the best extractor for "fr-ca" yields one whose stop-word filter
/// is exactly <see cref="FrenchStopWordFilter"/>.
/// </summary>
public void GetBestInstanceForCulture_FRCA()
{
    // Arrange
    const string culture = "fr-ca";

    // Act
    var extractor = KeywordExtractor.GetBestInstanceForCulture(culture);
    var actualFilterType = extractor.StopWordFilter.GetType();

    // Assert
    Assert.AreEqual(typeof(FrenchStopWordFilter), actualFilterType);
}
/// <summary>
/// Calls <c>ScorePhrases</c> on a fresh extractor without a prior <c>ToPhrases</c> call;
/// the call is expected to throw.
/// </summary>
public void ScorePhrases_ExpectException()
{
    // Arrange
    var extractor = new KeywordExtractor();
    string[] phrases =
    {
        "compatibility",
        "systems",
        "linear constraints",
        "set",
        "natural numbers"
    };

    // Act
    // This should throw: ToPhrases() was never called, so the index of unique words
    // has not been initialized.
    extractor.ScorePhrases(phrases);

    // Assert
    // Nothing to assert in the body; the expected exception is presumably declared
    // on the test itself (outside this block).
}
/// <summary>
/// Verifies that extracting key phrases from <c>Sample1</c> yields eleven phrases, the
/// first of which is "minimal supporting set".
/// </summary>
public void FindKeyPhrases()
{
    // Arrange
    var extractor = new KeywordExtractor();

    // Act
    string[] keyPhrases = extractor.FindKeyPhrases(this.Sample1);

    // Assert
    Assert.AreEqual(11, keyPhrases.Length);
    Assert.AreEqual("minimal supporting set", keyPhrases[0]);
}
/// <summary>
/// Extracts the key phrases of every topic and rebuilds <c>TopicKeyPhrases</c>,
/// <c>KeyPhraseTopics</c> and <c>PhraseScore</c> from the result.
/// </summary>
/// <param name="topics">The topics whose text is analysed.</param>
/// <returns>All keys of the rebuilt <c>KeyPhraseTopics</c> map.</returns>
public string[] FindKeyPhrases(Topic[] topics)
{
    // Start from a clean slate; results of earlier calls are discarded.
    TopicKeyPhrases.Clear();

    foreach (Topic current in topics)
    {
        var extracted = KeywordExtractor.FindKeyPhrases(current.GetText());
        TopicKeyPhrases.Add(current, new List<string>(extracted));
    }

    KeyPhraseTopics = ToKeyPhraseTopics(TopicKeyPhrases);

    // The same array instance is scored and handed back to the caller.
    string[] phrases = KeyPhraseTopics.Keys.ToArray();
    PhraseScore = BuildPhraseScore(phrases);

    return phrases;
}
/// <summary>
/// Runs key-phrase extraction over the text of the element with class "posts" in
/// LargeFile2.html, after <c>RemoveComments</c> has been applied to that element.
/// </summary>
public void FindKeyPhrases_LargeHtmlFile2()
{
    // Arrange
    const string postsXPath = "descendant-or-self::*[contains(concat(' ', normalize-space(@class), ' '), ' posts ')]";

    var extractor = new KeywordExtractor();
    var document = new HtmlDocument();
    document.Load("LargeFile2.html");

    var postsNode = document.DocumentNode.SelectSingleNode(postsXPath);
    RemoveComments(postsNode);
    string text = postsNode.InnerText;

    // Act
    extractor.FindKeyPhrases(text);

    // Assert
    // No assertions: the body only requires that extraction runs to completion.
}
/// <summary>
/// Regression test: runs key-phrase extraction over <c>LockInsText</c>, an input that
/// previously triggered a duplicate-key failure.
/// </summary>
public void FindKeyPhrases_DuplicateKeyException()
{
    // Arrange
    var extractor = new KeywordExtractor();

    // Background (CF, Aug 21, 2014): the first occurrence of the word "application" in the
    // LockInsText file has an odd encoding that effectively ends up as "appli-cation",
    // although it is not displayed consistently in all views. In
    // WordCooccurrenceMatrix.AggregateLeagueTable() both variants land in the array created
    // by .Distinct(), yet on insertion into the dictionary they are treated as an identical key.
    // This used to throw an unhandled exception
    // (see http://github.dev/yellowpencil/Octave/issues/383); a try/catch was added that
    // swallows it because it should only occur in really bizarre cases like this one
    // (and it should not be a performance issue either).

    // Act
    extractor.FindKeyPhrases(this.LockInsText);

    // Assert
    // No assertions: the body only requires that the call no longer throws.
}
/// <summary>
/// Verifies that tokenizing a sample sentence and converting the tokens to phrases yields
/// the expected five candidate phrases, in order.
/// </summary>
public void ToPhrases()
{
    // Arrange
    var extractor = new KeywordExtractor();
    const string sentence = "Compatibility of systems of linear constraints over the set of natural numbers.";
    string[] tokens = extractor.Tokenize(sentence);
    string[] expected =
    {
        "compatibility",
        "systems",
        "linear constraints",
        "set",
        "natural numbers"
    };

    // Act
    var actual = extractor.ToPhrases(tokens);

    // Assert
    Assert.AreEqual(expected.Length, actual.Length);

    // Compare element by element so a failure pinpoints the offending phrase.
    for (int index = 0; index < actual.Length; index++)
    {
        Assert.AreEqual(expected[index], actual[index]);
    }
}
/// <summary>
/// Verifies selected diagonal and off-diagonal counts of the word co-occurrence matrix
/// compiled from the phrases of <c>Sample1</c>.
/// </summary>
public void CompileOccurrences()
{
    // Arrange
    var extractor = new KeywordExtractor();
    string[] tokens = extractor.Tokenize(this.Sample1);
    string[] phrases = extractor.ToPhrases(tokens);
    var cooccurrence = new WordCooccurrenceMatrix(extractor.UniqueWordIndex);

    // Act
    cooccurrence.CompileOccurrences(phrases);

    // Assert
    var algorithms = cooccurrence.IndexOf("algorithms");
    var bounds = cooccurrence.IndexOf("bounds");
    var corresponding = cooccurrence.IndexOf("corresponding");
    var minimal = cooccurrence.IndexOf("minimal");
    var set = cooccurrence.IndexOf("set");

    Assert.AreEqual(2, cooccurrence[algorithms, algorithms], "'algorithms' diagonal count");
    Assert.AreEqual(1, cooccurrence[bounds, bounds], "'bounds' diagonal count");
    Assert.AreEqual(1, cooccurrence[corresponding, algorithms], "'corresponding'->'algorithms' count");

    // The same count is asserted for both orderings of this pair.
    Assert.AreEqual(2, cooccurrence[minimal, set], "'minimal'->'set' count");
    Assert.AreEqual(2, cooccurrence[set, minimal], "'set'->'minimal' count");
}
/// <summary>
/// Verifies that tokenizing a two-sentence sample yields fifteen tokens, and spot-checks
/// tokens at the start and end, including a comma and the closing "!" as tokens of their own.
/// </summary>
public void Tokenize()
{
    // Arrange
    var extractor = new KeywordExtractor();
    const string text = "The quick, brown fox jumps over the lazy dog. Yes he does!";

    // Act
    string[] tokens = extractor.Tokenize(text);

    // Assert
    Assert.AreEqual(15, tokens.Length);

    // Leading tokens: the first word comes back lower-cased and the comma is its own token.
    Assert.AreEqual("the", tokens[0]);
    Assert.AreEqual("quick", tokens[1]);
    Assert.AreEqual(",", tokens[2]);
    Assert.AreEqual("brown", tokens[3]);
    Assert.AreEqual("fox", tokens[4]);

    // Trailing tokens: the final word and the terminating punctuation.
    Assert.AreEqual("does", tokens[13]);
    Assert.AreEqual("!", tokens[14]);
}
/// <summary>
/// Extracts TextRank keywords from the description of every news item stored in the
/// pipeline context under the pipeline's <c>NewsContextKey</c>.
/// </summary>
/// <param name="context">
/// The pipeline context. Its <c>Pipeline</c> must be a <see cref="NewsAnalysisPipeline"/>.
/// </param>
/// <returns>
/// An <see cref="ActivityResult"/> whose <c>Result</c> is a dictionary mapping each news
/// item's <c>Id</c> to its keyword list, and whose <c>ObjectType</c> is that dictionary's
/// runtime type. The dictionary is empty when the context entry is missing or is not a
/// sequence of <see cref="NewsStream"/>.
/// </returns>
/// <exception cref="System.InvalidOperationException">
/// Thrown when the context's pipeline is not a <see cref="NewsAnalysisPipeline"/>.
/// </exception>
public ActivityResult Run(PipelineContext context)
{
    var pipe = context.Pipeline as NewsAnalysisPipeline;
    if (pipe == null)
    {
        // Fail with a descriptive error instead of a NullReferenceException further down.
        throw new System.InvalidOperationException(
            "The pipeline context does not belong to a NewsAnalysisPipeline.");
    }

    ActivityResult result = new ActivityResult();
    IDictionary<long, List<string>> dict = new Dictionary<long, List<string>>();
    KeywordExtractor extractor = new KeywordExtractor();

    var newsList = context[pipe.NewsContextKey] as IEnumerable<NewsStream>;
    if (newsList != null)
    {
        foreach (var news in newsList)
        {
            // Guard the extractor's result itself: ToList() never returns null, so the
            // empty-list fallback has to be applied before it is called.
            var extracted = extractor.ExtractKeywordsWithTextRank(news.NewsArticleDescription);
            List<string> keywords = extracted != null ? extracted.ToList() : new List<string>();

            dict.Add(news.Id, keywords);
        }
    }

    result.Result = dict;
    result.ObjectType = dict.GetType();
    return result;
}
/// <summary>
/// Verifies the degree, frequency and ratio recorded in the league table for the words
/// "algorithms" and "minimal" after compiling the occurrences of <c>Sample1</c>.
/// </summary>
public void ComputeLeagueTable()
{
    // Arrange
    KeywordExtractor extractor = new KeywordExtractor();
    string[] tokens = extractor.Tokenize(this.Sample1);
    string[] phrases = extractor.ToPhrases(tokens);
    WordCooccurrenceMatrix matrix = new WordCooccurrenceMatrix(extractor.UniqueWordIndex);

    // Act
    matrix.CompileOccurrences(phrases);
    SortedList<string, WordScore> leagueTable = matrix.LeagueTable;

    // Assert
    // Each word gets its own message suffix so a failure identifies the word that broke.
    Assert.AreEqual(3, leagueTable["algorithms"].Degree, "Degree 1");
    Assert.AreEqual(2, leagueTable["algorithms"].Frequency, "Frequency 1");
    Assert.AreEqual(1.5, leagueTable["algorithms"].Ratio, "Ratio 1");

    Assert.AreEqual(8, leagueTable["minimal"].Degree, "Degree 2");
    Assert.AreEqual(5, leagueTable["minimal"].Frequency, "Frequency 2");
    Assert.AreEqual(1.6, leagueTable["minimal"].Ratio, "Ratio 2");
}
/// <summary>
/// Trains the classifier from every record returned by <c>DataExtractor.GetData()</c>.
/// </summary>
/// <remarks>
/// For each record the extracted key phrases are joined with single spaces. The joined text
/// is taught as a match for the record's category and as a non-match for the opposite one.
/// Any result other than <c>Approved</c> is handled as <c>Denied</c>.
/// </remarks>
/// <returns>Always <c>true</c>.</returns>
public bool SaveHistory()
{
    List<Data> history = DataExtractor.GetData();
    KeywordExtractor extractor = new KeywordExtractor();

    string approvedLabel = Data.Outcome.Approved.ToString();
    string deniedLabel = Data.Outcome.Denied.ToString();

    foreach (Data record in history)
    {
        string text = string.Join(" ", extractor.FindKeyPhrases(record.Text));
        bool wasApproved = record.Result == Data.Outcome.Approved;

        // The match is always taught first, then the non-match for the other category.
        Classifier.TeachMatch(wasApproved ? approvedLabel : deniedLabel, text);
        Classifier.TeachNonMatch(wasApproved ? deniedLabel : approvedLabel, text);
    }

    return true;
}
/// <summary>
/// Extracts the key phrases of a single topic using a keyword extractor built around the
/// supplied stop-word filter.
/// </summary>
/// <param name="topic">The topic whose text is analysed.</param>
/// <param name="stopFilter">The stop-word filter passed to the keyword extractor.</param>
/// <returns>The key phrases found in the topic's text.</returns>
private string[] TopicKeywords(Topic topic, NRakeCore.StopWordFilters.IStopWordFilter stopFilter)
{
    // A dedicated extractor is created per call so that the given filter applies.
    var extractor = new KeywordExtractor(stopFilter);
    string text = topic.GetText();

    return extractor.FindKeyPhrases(text);
}