示例#1
0
            /// <summary>
            /// Runs <c>GatherArticles</c> and then <c>ArticleCrawler</c> against the configured
            /// test page and expects four articles to be accepted.
            /// </summary>
            /// <remarks>
            /// Returns a task instead of <c>void</c> so the test runner can await completion and
            /// observe any exception thrown after the first <c>await</c>.
            /// </remarks>
            public async System.Threading.Tasks.Task ArticleCrawler_AcceptArticles()
            {
                // Arrange
                var tag = new InterestTag
                {
                    TagName         = "Testing",
                    AccountUsername = null,
                    ArticleTags     = null
                };

                // NOTE(review): this mock is never handed to the crawler constructed below, so the
                // setup does not influence the outcome - confirm whether BaseCrawler should receive it.
                var mockInterestRepo = new Mock <IInterestTagRepository>();
                mockInterestRepo.Setup(x => x.Insert(tag));

                // One site entry: the page to crawl plus the attribute list used to parse it.
                // The last element ("Testing") is the tag name assigned to accepted articles.
                var sites = new List <KeyValuePair <string, List <string> > >
                {
                    new KeyValuePair <string, List <string> >("http://hugogarcia.me/site/testing.html", new List <string> {
                        "div", "class", "test", "a", "href", "http://hugogarcia.me/site/", "meta", "name", "keywords", "content", "meta", "property", "og:title", "content", "meta", "name", "description", "content", "Testing"
                    })
                };

                // Keywords handed to the crawler alongside the site list.
                var tags = new HashSet <string> {
                    "hm", "test", "tester"
                };

                BaseCrawler test = new BaseCrawler(sites, tags);

                // Act
                var result = await test.GatherArticles(sites);
                var list   = await test.ArticleCrawler(result);

                // Assert
                Assert.Equal(4, list.Count);
            }
 /// <summary>
 /// Passes the given interest tag to the repository's <c>Update</c> operation.
 /// </summary>
 /// <param name="interestTag">The interest tag to forward to the repository.</param>
 public void Update(InterestTag interestTag) => _interestTagRepository.Update(interestTag);
 /// <summary>
 /// Passes the given interest tag to the repository's <c>Insert</c> operation.
 /// </summary>
 /// <param name="interestTag">The interest tag to forward to the repository.</param>
 public void Create(InterestTag interestTag) => _interestTagRepository.Insert(interestTag);
        /// <summary>
        /// Crawls the aggregated links and collects those that qualify as valid articles.
        /// </summary>
        /// <remarks>
        /// For every link the matching site configuration is looked up in <c>Sites</c>, the page is
        /// downloaded and its tags are compared via <c>MatchTags</c>. Matching pages become
        /// <c>Article</c> objects; the site's tag is inserted into the interest tag repository
        /// when it does not exist yet. A failure on one link (request error, timeout, missing
        /// configuration, null reference while parsing) is logged to the console and the link is
        /// skipped, so the remaining links are still processed.
        /// </remarks>
        /// <param name="links">
        /// Gathered links. Each entry holds the article URL at index 0 and the key of the site it
        /// belongs to (compared against the keys in <c>Sites</c>) at index 1.
        /// </param>
        /// <returns>The list of valid articles; empty when nothing matched.</returns>
        public async Task <List <Article> > ArticleCrawler(List <string[]> links)
        {
            // Position of the tag name (e.g. Technology, Medical) inside a site's attribute list.
            const int tagNameIndex = 18;

            // Will hold the valid Articles.
            var list = new List <Article>();

            // Reused for every page; LoadHtml is called again for each download.
            var htmlDoc = new HtmlDocument();

            // One client for the whole crawl, released when the crawl is finished.
            using (var httpClient = new HttpClient())
            {
                foreach (var art in links)
                {
                    // A usable entry needs both the article URL and the site key.
                    if (art == null || art.Length < 2)
                    {
                        Console.WriteLine(DateTime.Now + ": Skipped a malformed link entry");
                        continue;
                    }

                    try
                    {
                        // Associate the proper attributes for the site before spending a request on it.
                        List <string> siteAttribute = null;
                        foreach (var site in Sites)
                        {
                            if (site.Key == art[1])
                            {
                                siteAttribute = site.Value;
                                break;
                            }
                        }

                        // Without a configuration that reaches the tag-name slot the article can
                        // neither be parsed nor tagged, so skip it instead of failing on the index.
                        if (siteAttribute == null || siteAttribute.Count <= tagNameIndex)
                        {
                            Console.WriteLine(DateTime.Now + ": No usable site attributes for " + art[0]);
                            continue;
                        }

                        string tagName = siteAttribute[tagNameIndex];

                        var html = await httpClient.GetStringAsync(art[0]);
                        htmlDoc.LoadHtml(html);

                        // Check whether the article matches a keyword/tag.
                        string[] contentTags = GatherTags(htmlDoc, siteAttribute);
                        bool     tagMatch    = MatchTags(contentTags);

                        if (!tagMatch)
                        {
                            Console.WriteLine($"{art[0]} not added to {tagName}");
                            continue;
                        }

                        // Tag matched: gather the rest of the necessary Article information.
                        string title       = GatherTitle(htmlDoc, siteAttribute);
                        string description = GatherDescription(htmlDoc, siteAttribute);

                        // Make sure the site's tag exists in the repository before it is looked up.
                        if (!interestTagRepository.Exists(a => a.TagName == tagName))
                        {
                            InterestTag newTag = new InterestTag()
                            {
                                TagName         = tagName,
                                AccountUsername = null,
                                ArticleTags     = null
                            };
                            interestTagRepository.Insert(newTag);
                            Console.WriteLine($"{newTag.TagName} added to DB");
                        }

                        // A null result here surfaces as a NullReferenceException below, which is
                        // logged by the handler at the end of this iteration.
                        InterestTag tag = interestTagRepository.GetSingle(d => d.TagName.Equals(tagName));

                        var goodArticle = new Article
                        {
                            TagName      = tag.TagName,
                            ArticleTitle = title,
                            // The article link is the url of the page that was crawled.
                            ArticleLink        = art[0],
                            ArticleDescription = description
                        };

                        // Some articles have no title: fall back to the description.
                        // NOTE(review): when both are empty the title stays empty, as before.
                        if (string.IsNullOrEmpty(goodArticle.ArticleTitle))
                        {
                            goodArticle.ArticleTitle = goodArticle.ArticleDescription;
                        }

                        // Some articles have no description: use a placeholder.
                        if (string.IsNullOrEmpty(goodArticle.ArticleDescription))
                        {
                            goodArticle.ArticleDescription = "Click to read article!";
                        }

                        list.Add(goodArticle);
                    }
                    // No response (or an error response) from the requested article.
                    catch (HttpRequestException e)
                    {
                        Console.WriteLine(DateTime.Now + ": Site " + art[0] + " response: " + e.Message);
                    }
                    // The request timed out; do not let one slow site abort the whole crawl.
                    catch (TaskCanceledException e)
                    {
                        Console.WriteLine(DateTime.Now + ": Site " + art[0] + " request canceled: " + e.Message);
                    }
                    catch (NullReferenceException e)
                    {
                        Console.WriteLine(DateTime.Now + ": Received an '" + e.Message + "' Error from " + art[0]);
                    }
                }
            }

            return list;
        }