/// <summary>
/// Gathers article links from the test site, runs them through the article crawler
/// and checks that the expected number of articles is accepted.
/// </summary>
/// <remarks>
/// Returns a task instead of <c>void</c> so the test runner can await the test and
/// observe assertion failures and exceptions raised after the first <c>await</c>.
/// </remarks>
public async System.Threading.Tasks.Task ArticleCrawler_AcceptArticles()
{
    // Arrange
    var tag = new InterestTag { TagName = "Testing", AccountUsername = null, ArticleTags = null };

    // NOTE(review): this mock is configured but never handed to BaseCrawler, so the
    // crawler cannot see it — confirm whether it was meant to be injected.
    var mockInterestRepo = new Mock<IInterestTagRepository>();
    mockInterestRepo.Setup(x => x.Insert(tag));

    // Site URL paired with its attribute list; the last entry (index 18) is the tag name.
    var sites = new List<KeyValuePair<string, List<string>>>
    {
        new KeyValuePair<string, List<string>>(
            "http://hugogarcia.me/site/testing.html",
            new List<string>
            {
                "div", "class", "test",
                "a", "href", "http://hugogarcia.me/site/",
                "meta", "name", "keywords", "content",
                "meta", "property", "og:title", "content",
                "meta", "name", "description", "content",
                "Testing"
            })
    };
    var tags = new HashSet<string> { "hm", "test", "tester" };
    var crawler = new BaseCrawler(sites, tags);

    // Act
    var result = await crawler.GatherArticles(sites);
    var list = await crawler.ArticleCrawler(result);

    // Assert
    Assert.Equal(4, list.Count);
}
/// <summary>
/// Forwards the given interest tag to the repository's <c>Update</c> method.
/// </summary>
/// <param name="interestTag">The interest tag to update.</param>
public void Update(InterestTag interestTag) => _interestTagRepository.Update(interestTag);
/// <summary>
/// Creates an interest tag by passing it to the repository's <c>Insert</c> method.
/// </summary>
/// <param name="interestTag">The interest tag to insert.</param>
public void Create(InterestTag interestTag) => _interestTagRepository.Insert(interestTag);
/// <summary>
/// Crawls through the aggregated links and checks whether each one is a valid article for its site.
/// </summary>
/// <param name="links">
/// Gathered links. In each entry, element 0 is the article URL and element 1 is the
/// site key that is matched against the keys of <c>Sites</c>.
/// </param>
/// <returns>
/// The articles whose content tags matched. Links that belong to an unknown site,
/// fail to download, time out, or raise a <see cref="NullReferenceException"/> while
/// being processed are logged to the console and skipped.
/// </returns>
public async Task<List<Article>> ArticleCrawler(List<string[]> links)
{
    // Position in a site's attribute list that holds the tag name (e.g. Technology, Medical).
    const int TagNameIndex = 18;

    // Will hold the valid articles.
    var list = new List<Article>();

    // Dispose the client once the crawl is done instead of leaking it and its handler.
    using (var httpClient = new HttpClient())
    {
        var htmlDoc = new HtmlDocument();

        // For each link, check whether it is a valid article.
        foreach (var art in links)
        {
            try
            {
                // Associate the proper attributes with the link's site.
                List<string> siteAttribute = null;
                foreach (var site in Sites)
                {
                    if (site.Key == art[1])
                    {
                        siteAttribute = site.Value;
                        break;
                    }
                }

                // Without a matching, complete attribute list nothing below can work,
                // so skip the link before spending an HTTP request on it.
                if (siteAttribute == null || siteAttribute.Count <= TagNameIndex)
                {
                    Console.WriteLine($"{DateTime.Now}: No usable site attributes for {art[1]}, skipping {art[0]}");
                    continue;
                }

                string tagName = siteAttribute[TagNameIndex];

                var html = await httpClient.GetStringAsync(art[0]);
                htmlDoc.LoadHtml(html);

                // Check whether the article matches a keyword/tag.
                string[] contentTags = GatherTags(htmlDoc, siteAttribute);
                if (!MatchTags(contentTags))
                {
                    Console.WriteLine($"{art[0]} not added to {tagName}");
                    continue;
                }

                // Tags matched: gather the rest of the necessary article information.
                string title = GatherTitle(htmlDoc, siteAttribute);
                string description = GatherDescription(htmlDoc, siteAttribute);

                // Make sure the tag exists in the repository before it is looked up.
                if (!interestTagRepository.Exists(a => a.TagName == tagName))
                {
                    InterestTag newTag = new InterestTag()
                    {
                        TagName = tagName,
                        AccountUsername = null,
                        ArticleTags = null
                    };
                    interestTagRepository.Insert(newTag);
                    Console.WriteLine($"{newTag.TagName} added to DB");
                }

                InterestTag tag = interestTagRepository.GetSingle(d => d.TagName.Equals(tagName));

                // Some articles have no title: fall back to the description.
                if (string.IsNullOrEmpty(title))
                {
                    title = description;
                }

                // Some articles have no description: use a placeholder. This runs after the
                // title fallback, so the placeholder never becomes the title.
                if (string.IsNullOrEmpty(description))
                {
                    description = "Click to read article!";
                }

                // Add the valid article to the list of valid articles.
                list.Add(new Article
                {
                    TagName = tag.TagName,
                    ArticleTitle = title,
                    // The link is the URL of the article that was crawled.
                    ArticleLink = art[0],
                    ArticleDescription = description
                });
            }
            // The requested article did not return a successful response.
            catch (HttpRequestException e)
            {
                Console.WriteLine($"{DateTime.Now}: Site {art[0]} response: {e.Message}");
            }
            // The request timed out; skip this article rather than aborting the whole crawl.
            catch (TaskCanceledException e)
            {
                Console.WriteLine($"{DateTime.Now}: Site {art[0]} timed out: {e.Message}");
            }
            catch (NullReferenceException e)
            {
                Console.WriteLine($"{DateTime.Now}: Received an '{e.Message}' Error from {art[0]}");
            }
        }
    }

    return list;
}