Ejemplo n.º 1
0
        /// <summary>
        /// Loads the page at <paramref name="targetUri"/> and builds a view model holding
        /// its word statistics and image list.
        /// </summary>
        /// <param name="targetUri">Address of the page to scrape.</param>
        /// <returns>
        /// A populated <see cref="ScrapedPageViewModel"/>, or <c>null</c> when loading the
        /// HTML document throws for any reason. Callers are expected to treat <c>null</c>
        /// as "this site could not be scraped".
        /// </returns>
        public ScrapedPageViewModel BuildScrapedPageViewModel(Uri targetUri)
        {
            // Words of this length or shorter are left out of the frequency statistics
            // (they still count towards the total and unique word counts).
            const int shortWordMaxLength = 2;

            var viewModel = new ScrapedPageViewModel
            {
                TargetUri = targetUri
            };

            var scraper = new WebLoader();
            HtmlDocument htmlDoc;

            try
            {
                htmlDoc = scraper.LoadHtmlDocument(targetUri.ToString());
            }
            catch (Exception)
            {
                // Deliberately swallowed: a very rare case where no logging or further action
                // is wanted. The target site simply is not scraped, and the caller sends the
                // user to a scrape-specific error page suggesting they try another site.
                return null;
            }

            // Collect the inner text of every node the scraper reports as document text.
            IEnumerable<string> nodeTexts = scraper.GetDocumentText(htmlDoc).Select(node => node.InnerText);

            string normalizedPageString = string.Join(" ", nodeTexts)
                                                .Replace("\t", "") // ensure we don't include formatting as a word
                                                .NormalizeLineEndings("");

            // Materialized once because the list is used for the total count, the unique
            // count and the frequency statistics.
            List<string> wordList = StringUtility.GetWords(normalizedPageString.ToLowerInvariant()).ToList();

            var wordFrequencyStats = StringUtility
                .AggregateWordFrequencyStats(wordList.Where(word => word.Length > shortWordMaxLength))
                .ToList();

            viewModel.TotalWordCount     = wordList.Count;
            viewModel.UniqueWordCount    = StringUtility.GetUniqueWords(wordList).Count();
            viewModel.WordFrequencyStats = AssembleTagCloudListForViewModel(wordFrequencyStats);
            viewModel.ImageList          = AssembleImageListForViewModel(targetUri, scraper.PluckImageNodes(htmlDoc));

            return viewModel;
        }