/// <summary>
/// Splits a block of text into words and records every word that is long enough,
/// both before and after simplification, in a new <see cref="WordOccurenceCollection"/>.
/// </summary>
/// <param name="textBlockText">The raw text to tokenize. Must not be <c>null</c>.</param>
/// <returns>A collection to which each accepted, simplified word has been added.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="textBlockText"/> is <c>null</c>.</exception>
private WordOccurenceCollection CountOccurencesForText(string textBlockText)
{
    if (textBlockText == null)
    {
        throw new ArgumentNullException(nameof(textBlockText));
    }

    var result = new WordOccurenceCollection();

    // Words are delimited by sentence punctuation and the space character only.
    char[] separators = { '.', '?', '!', ' ', ':', ',' };

    foreach (var token in textBlockText.Split(separators, StringSplitOptions.RemoveEmptyEntries))
    {
        // Skip tokens that are already too short before simplifying them.
        if (token.Length < MinimumWordLength)
        {
            continue;
        }

        // The simplified form is re-checked, so Simplify() may yield a null, blank or shorter string.
        var simplified = token.Simplify();
        if (string.IsNullOrWhiteSpace(simplified) || simplified.Length < MinimumWordLength)
        {
            continue;
        }

        result.Add(simplified);
    }

    return result;
}
// Calls GetWordCount(null) on a collection containing a single word.
// NOTE(review): the expected-exception check is presumably declared by a test
// attribute outside this block; the call is intentionally left unwrapped here.
public void GetWordCount_OnExecuteWithNull_ThrowsException()
{
    // Arrange
    var collection = new WordOccurenceCollection();
    collection.Add("word");

    // Act
    collection.GetWordCount(null);
}
// Asking for the count of a word that was never added must yield 0.
public void GetWordCount_OnExecuteWithWordNotInCollection_ReturnsZero()
{
    // Arrange
    var collection = new WordOccurenceCollection();
    collection.Add("word");

    // Act + Assert
    Assert.AreEqual(0, collection.GetWordCount("test"));
}
// Merging two collections must produce a result that is not equal to either
// input and whose count for a word is the sum of both inputs' counts.
public void Merge_OnExecute_ReturnsNewMergedCollection()
{
    // Arrange: two collections, each containing "word" twice.
    var first = new WordOccurenceCollection { "word", "word" };
    var second = new WordOccurenceCollection { "word", "word" };

    // Act
    var merged = first.Merge(second);

    // Assert
    Assert.AreNotEqual(first, merged);
    Assert.AreNotEqual(second, merged);
    Assert.AreEqual(4, merged.GetWordCount("word"));
}
/// <summary>
/// Counts the words found in the text nodes of the given HTML result and returns
/// the word/count pairs ordered by count, highest first.
/// </summary>
/// <param name="html">The HTML result whose <c>Document</c> is scanned for text nodes. Must not be <c>null</c>.</param>
/// <returns>Word/count pairs in descending order of count; empty when no text nodes are selected.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="html"/> is <c>null</c>.</exception>
public IEnumerable<KeyValuePair<string, int>> GetKeywords(HtmlResult html)
{
    if (html == null)
    {
        throw new ArgumentNullException(nameof(html));
    }

    var occurences = new WordOccurenceCollection();

    // Select every text node whose parent element is neither <script> nor <style>.
    // The previous expression had a stray closing bracket ("...]]//text()"), which is
    // not a valid XPath expression. Its intended form also failed to exclude
    // script/style content: "//*[not(self::script) ...]//text()" still reaches that text
    // through non-script ancestors such as <html> or <body>.
    var textBlocks = html.Document.SelectNodes("//text()[not(parent::script) and not(parent::style)]");

    // A null result is treated as "no text nodes".
    if (textBlocks != null)
    {
        foreach (var textBlock in textBlocks)
        {
            var occurencesInBlock = CountOccurencesForText(textBlock.InnerText);

            // Merge returns the combined collection, so the accumulator is reassigned.
            occurences = occurences.Merge(occurencesInBlock);
        }
    }

    return occurences.OrderByDescending(x => x.Value);
}
// GetWordCount must report how many times each distinct word was added.
public void GetWordCount_OnExecute_ReturnsTheOccurenceOfTheWords()
{
    // Arrange: "word" four times, "test" three times.
    var collection = new WordOccurenceCollection
    {
        "word", "word", "word", "word",
        "test", "test", "test"
    };

    // Act
    var wordCount = collection.GetWordCount("word");
    var testCount = collection.GetWordCount("test");

    // Assert
    Assert.AreEqual(4, wordCount);
    Assert.AreEqual(3, testCount);
}
/// <summary>
/// Counts the words found in the non-blank text nodes selected from the given HTML node,
/// skipping text that belongs to <c>script</c> and <c>style</c> elements.
/// </summary>
/// <param name="htmlNode">The node on which the XPath query is executed. Must not be <c>null</c>.</param>
/// <returns>The merged word occurrences of all selected text nodes; empty when none are selected.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="htmlNode"/> is <c>null</c>.</exception>
public WordOccurenceCollection GetKeywords(HtmlNode htmlNode)
{
    if (htmlNode == null)
    {
        throw new ArgumentNullException(nameof(htmlNode));
    }

    var occurences = new WordOccurenceCollection();

    // Select every text node whose parent element is neither <script> nor <style>.
    // The previous expression "//*[not(self::script) and not(self::style)]//text()" did not
    // exclude script/style content: that text is also a descendant of non-script
    // ancestors such as <html> or <body>, so it was still selected.
    // NOTE(review): the leading "//" is kept from the original; it presumably searches the
    // whole document rather than only below htmlNode - confirm whether ".//" is intended.
    var textBlocks = htmlNode.SelectNodes("//text()[not(parent::script) and not(parent::style)]");

    // A null result is treated as "no text nodes".
    if (textBlocks != null)
    {
        // Blank nodes are filtered on InnerText, while the text that is counted is InnerHtml.
        var textBlocksWithText = textBlocks
            .Where(x => !string.IsNullOrWhiteSpace(x.InnerText))
            .Select(x => x.InnerHtml);

        foreach (var text in textBlocksWithText)
        {
            var occurencesInBlock = CountOccurencesForText(text);

            // Merge returns the combined collection, so the accumulator is reassigned.
            occurences = occurences.Merge(occurencesInBlock);
        }
    }

    return occurences;
}