/// <summary>
/// Parses the HTML document for words and returns a result.
/// </summary>
/// <remarks>
/// Only text nodes beneath the <c>&lt;body&gt;</c> element are scanned; text whose direct parent is a
/// <c>&lt;script&gt;</c> or <c>&lt;style&gt;</c> element is skipped. Each text node is HTML-decoded and
/// split on the separators supplied by <c>WordParserHelper.GetWordSeparators()</c>; every chunk accepted
/// by <c>WordParserHelper.IsWord</c> is stored lower-cased (invariant culture). A document that has no
/// <c>&lt;body&gt;</c> element yields a result built from an empty word list.
/// </remarks>
/// <returns>Object that contains parsing results.</returns>
/// <exception cref="ArgumentNullException">The HTML document held by this instance is <c>null</c>.</exception>
public WordParsingResult ParseWords()
{
    if (_htmlDocument == null)
    {
        // Exception type kept as-is so existing callers that catch it keep working.
        throw new ArgumentNullException(nameof(_htmlDocument));
    }

    var listOfWords = new List<string>();

    // SelectSingleNode returns null when nothing matches (e.g. an HTML fragment without <body>);
    // dereferencing it directly would throw a NullReferenceException.
    var body = _htmlDocument.DocumentNode.SelectSingleNode("//body");
    if (body == null)
    {
        return new WordParsingResult(listOfWords);
    }

    // Loop-invariant: fetch the separators once instead of once per text node.
    var separators = WordParserHelper.GetWordSeparators();

    var textNodes = body
        .DescendantsAndSelf()
        .Where(n => n.NodeType == HtmlNodeType.Text &&
                    n.ParentNode.Name != "script" && // exclude <script> element
                    n.ParentNode.Name != "style");   // exclude <style> element

    foreach (var node in textNodes)
    {
        // Decode entities (&amp;, &nbsp;, ...) before splitting so they do not leak into words.
        var chunks = WebUtility.HtmlDecode(node.InnerText)
            .Split(separators, StringSplitOptions.RemoveEmptyEntries);

        foreach (var chunk in chunks)
        {
            if (WordParserHelper.IsWord(chunk))
            {
                // Invariant lower-casing keeps the stored keys independent of the current
                // thread culture (e.g. "I" must not become a dotless "ı" under tr-TR).
                listOfWords.Add(chunk.ToLowerInvariant());
            }
        }
    }

    return new WordParsingResult(listOfWords);
}
/// <summary>
/// Gets the distinct list of words with their counts.
/// </summary>
/// <remarks>
/// The returned sequence is lazily evaluated: grouping and counting over <c>AllWords</c> happen when
/// the result is enumerated. When <paramref name="excludeCommonWords"/> is <c>true</c>, the common-word
/// collection is fetched once, at call time, rather than once per word.
/// </remarks>
/// <param name="excludeCommonWords">
/// <c>true</c> to leave out the words returned by <c>WordParserHelper.GetCommonWords()</c>
/// (common words such as "the", "a", "an", etc.); <c>false</c> to count every word.
/// </param>
/// <returns>Collection of DTOs, one per distinct word, each carrying the word and its number of occurrences.</returns>
public IEnumerable<WordCountDto> GetWordCounts(bool excludeCommonWords)
{
    var words = AllWords.AsEnumerable();

    if (excludeCommonWords)
    {
        // Hoisted out of the predicate: previously GetCommonWords() was invoked for every word.
        var commonWords = WordParserHelper.GetCommonWords();
        words = words.Where(w => !commonWords.Contains(w));
    }

    return words
        .GroupBy(w => w)
        .Select(g => new WordCountDto
        {
            Word = g.Key,
            Count = g.Count()
        });
}