/// <summary>
        /// Extracts the words found in the text of the HTML document's <c>&lt;body&gt;</c> element.
        /// </summary>
        /// <remarks>
        /// Only text nodes are inspected, and text whose direct parent is a <c>&lt;script&gt;</c> or
        /// <c>&lt;style&gt;</c> element is skipped. Each text node is HTML-decoded, split on the separators
        /// supplied by <c>WordParserHelper.GetWordSeparators()</c>, and every chunk accepted by
        /// <c>WordParserHelper.IsWord</c> is stored in lower case (invariant culture).
        /// </remarks>
        /// <returns>
        /// Object that contains parsing results; it holds no words when the document has no
        /// <c>&lt;body&gt;</c> element.
        /// </returns>
        /// <exception cref="ArgumentNullException">Thrown when the underlying HTML document is <c>null</c>.</exception>
        public WordParsingResult ParseWords()
        {
            if (_htmlDocument == null)
            {
                // The exception type is kept as-is so callers that already catch it keep working.
                throw new ArgumentNullException(nameof(_htmlDocument));
            }

            var listOfWords = new List<string>();

            // SelectSingleNode yields null when nothing matches (e.g. a fragment without <body>);
            // report "no words" instead of failing with a NullReferenceException.
            var body = _htmlDocument.DocumentNode.SelectSingleNode("//body");
            if (body == null)
            {
                return new WordParsingResult(listOfWords);
            }

            var textNodes = body
                            .DescendantsAndSelf()
                            .Where(n => n.NodeType == HtmlNodeType.Text &&
                                   n.ParentNode.Name != "script" &&             // exclude <script> element
                                   n.ParentNode.Name != "style");               // exclude <style> element

            // Fetched once up front; presumably the separator set does not change between text nodes.
            var separators = WordParserHelper.GetWordSeparators();

            foreach (var node in textNodes)
            {
                var chunks = WebUtility.HtmlDecode(node.InnerText)
                             .Split(separators, StringSplitOptions.RemoveEmptyEntries);

                foreach (var chunk in chunks)
                {
                    if (WordParserHelper.IsWord(chunk))
                    {
                        // Invariant lower-casing keeps the result independent of the current
                        // thread culture (e.g. Turkish dotted/dotless I).
                        listOfWords.Add(chunk.ToLowerInvariant());
                    }
                }
            }

            return new WordParsingResult(listOfWords);
        }
// Example #2
// 0
 /// <summary>
 /// Gets each distinct word together with the number of times it occurs.
 /// </summary>
 /// <param name="excludeCommonWords">
 /// When <c>true</c>, words contained in the collection returned by
 /// <c>WordParserHelper.GetCommonWords()</c> are left out of the result.
 /// </param>
 /// <returns>
 /// A lazily evaluated sequence holding one <see cref="WordCountDto"/> per distinct word.
 /// </returns>
 public IEnumerable<WordCountDto> GetWordCounts(bool excludeCommonWords)
 {
     var words = AllWords.AsEnumerable();

     if (excludeCommonWords)
     {
         // Fetch the common-word collection a single time rather than once per word
         // inside the filter predicate; it is not needed at all when filtering is off.
         var commonWords = WordParserHelper.GetCommonWords();
         words = words.Where(w => !commonWords.Contains(w));
     }

     return words
         .GroupBy(w => w)
         .Select(group => new WordCountDto
         {
             Word = group.Key,
             Count = group.Count()
         });
 }