/// <summary>
/// Downloads the test article page and checks that <c>BaseCrawler.GatherTitle</c>
/// returns the expected title for it.
/// </summary>
/// <remarks>
/// Performs a real HTTP GET against the test page, so network access is required.
/// NOTE(review): the method is <c>async void</c>; returning <c>Task</c> would let the
/// test runner observe failures more reliably. The signature is left unchanged here.
/// </remarks>
public async void ArticleCrawler_GatherTitle()
{
    // Arrange
    List<KeyValuePair<string, List<string>>> Sites = new List<KeyValuePair<string, List<string>>>();
    KeyValuePair<string, List<string>> siteAttributes = new KeyValuePair<string, List<string>>(
        "http://hugogarcia.me/site/testing.html",
        new List<string>
        {
            "div", "class", "test", "a", "href", "http://hugogarcia.me/site/",
            "meta", "name", "keywords", "content",
            "meta", "property", "og:title", "content",
            "meta", "name", "description", "content", "Testing"
        });
    string links = "http://hugogarcia.me/site/testSite1.html";
    BaseCrawler test = new BaseCrawler(Sites, null);
    var htmlDoc = new HtmlDocument();

    // Dispose the client as soon as the page has been fetched so its handler is not leaked.
    using (var httpClient = new HttpClient())
    {
        var html = await httpClient.GetStringAsync(links);
        htmlDoc.LoadHtml(html);
    }

    // Act
    string title = test.GatherTitle(htmlDoc, siteAttributes.Value);

    // Assert
    Assert.Equal("testing testing 1 2 4", title);
}
/// <summary>
/// Obtains a crawler for <paramref name="url"/> from the factory, starts it on that URL,
/// and hands the messages it returns to <c>WriteTxt</c>.
/// </summary>
/// <param name="url">URL used both to select the crawler and as the crawl target.</param>
private void GetInfo(string url)
{
    // The selected crawler is stored in the crawler member before it is started.
    crawler = factory.GetCrawlerHelper(url);

    string crawlOutput = crawler.Start(url);

    WriteTxt(crawlOutput);
}
/// <summary>
/// Runs <c>BaseCrawler.GatherArticles</c> for the test home page and checks that the
/// result equals the five expected string pairs (a link followed by the home-page URL).
/// </summary>
public async void HomeCrawler_GatherArticles()
{
    // Arrange
    var siteList = new List<KeyValuePair<string, List<string>>>
    {
        new KeyValuePair<string, List<string>>(
            "http://hugogarcia.me/site/testing.html",
            new List<string>
            {
                "div", "class", "test", "a", "href", "http://hugogarcia.me/site/",
                "meta", "name", "keywords", "content",
                "meta", "property", "og:title", "content",
                "meta", "name", "description", "content", "Testing"
            })
    };

    var expectedLinks = new List<string[]>
    {
        new string[] { "http://hugogarcia.me/site/testSite1.html", "http://hugogarcia.me/site/testing.html" },
        new string[] { "http://www.hugogarcia.me/site/testSite2.html", "http://hugogarcia.me/site/testing.html" },
        new string[] { "http://hugogarcia.me/site/testSite3.html", "http://hugogarcia.me/site/testing.html" },
        new string[] { "http://hugogarcia.me/site/testSite4.html", "http://hugogarcia.me/site/testing.html" },
        new string[] { "http://hugogarcia.me/site/testSite5.html", "http://hugogarcia.me/site/testing.html" }
    };

    var interestTags = new HashSet<string> { "hm", "test", "tester" };
    var crawlerUnderTest = new BaseCrawler(siteList, interestTags);

    // Act
    var gathered = await crawlerUnderTest.GatherArticles(siteList);

    // Assert
    Assert.Equal(expectedLinks, gathered);
}
/// <summary>
/// Gathers the article links of the test home page, feeds them to
/// <c>BaseCrawler.ArticleCrawler</c>, and expects four entries in the resulting list.
/// </summary>
/// <remarks>
/// The previous arrangement also built an <c>InterestTag</c> and a mocked
/// <c>IInterestTagRepository</c>, but neither was ever passed to the crawler, so they
/// had no effect on the test and have been removed.
/// </remarks>
public async void ArticleCrawler_AcceptArticles()
{
    // Arrange
    List<KeyValuePair<string, List<string>>> Sites = new List<KeyValuePair<string, List<string>>>();
    Sites.Add(new KeyValuePair<string, List<string>>(
        "http://hugogarcia.me/site/testing.html",
        new List<string>
        {
            "div", "class", "test", "a", "href", "http://hugogarcia.me/site/",
            "meta", "name", "keywords", "content",
            "meta", "property", "og:title", "content",
            "meta", "name", "description", "content", "Testing"
        }));
    HashSet<string> Tags = new HashSet<string> { "hm", "test", "tester" };
    BaseCrawler test = new BaseCrawler(Sites, Tags);

    // Act
    var result = await test.GatherArticles(Sites);
    var list = await test.ArticleCrawler(result);

    // Assert
    Assert.Equal(4, list.Count);
}
/// <summary>
/// Downloads the test article page and checks that <c>BaseCrawler.GatherTags</c>
/// returns the expected tag array for it (including the empty entries between tags).
/// </summary>
/// <remarks>
/// Performs a real HTTP GET against the test page, so network access is required.
/// NOTE(review): the method is <c>async void</c>; returning <c>Task</c> would let the
/// test runner observe failures more reliably. The signature is left unchanged here.
/// </remarks>
public async void ArticleCrawler_GatherTags()
{
    // Arrange
    List<KeyValuePair<string, List<string>>> Sites = new List<KeyValuePair<string, List<string>>>();
    KeyValuePair<string, List<string>> siteAttributes = new KeyValuePair<string, List<string>>(
        "http://hugogarcia.me/site/testing.html",
        new List<string>
        {
            "div", "class", "test", "a", "href", "http://hugogarcia.me/site/",
            "meta", "name", "keywords", "content",
            "meta", "property", "og:title", "content",
            "meta", "name", "description", "content", "Testing"
        });
    string links = "http://hugogarcia.me/site/testSite1.html";
    BaseCrawler test = new BaseCrawler(Sites, null);
    var htmlDoc = new HtmlDocument();
    string[] expectedTags = { "test", "", "testing", "", "tester", "", "blah", "", "bla", "", "foo", "", "bar" };

    // Dispose the client as soon as the page has been fetched so its handler is not leaked.
    using (var httpClient = new HttpClient())
    {
        var html = await httpClient.GetStringAsync(links);
        htmlDoc.LoadHtml(html);
    }

    // Act
    string[] contentTags = test.GatherTags(htmlDoc, siteAttributes.Value);

    // Assert
    Assert.Equal(expectedTags, contentTags);
}
/// <summary>
/// Assigns <c>this.crawler</c> the crawler implementation that corresponds to
/// <c>this.crawlerName</c>.
/// </summary>
/// <exception cref="Exception">
/// Thrown when <c>this.crawlerName</c> is not one of the three recognized host names.
/// </exception>
private void SetCrawler()
{
    string requestedName = this.crawlerName;

    if (requestedName == "www.rs-sandanski.com")
    {
        this.crawler = new Sandanski();
        return;
    }

    if (requestedName == "www.sac.justice.bg")
    {
        this.crawler = new SAC();
        return;
    }

    if (requestedName == "www.ac-smolian.org")
    {
        this.crawler = new Smolian();
        return;
    }

    // Anything else (including null) is not a known crawler.
    throw new Exception("Invalid crawler specified.");
}
/// <summary>
/// Checks that <c>BaseCrawler.MatchTags</c> returns <c>false</c> for the supplied
/// <paramref name="contentTags"/> when the crawler's tag set is "hm", "test", "tester".
/// </summary>
/// <param name="contentTags">Tags passed to <c>MatchTags</c>; expected not to match.</param>
public void ArticleCrawler_MatchTagsFalse(string[] contentTags)
{
    // Arrange
    // No site entries are needed here: only the tag set is handed to MatchTags' crawler.
    List<KeyValuePair<string, List<string>>> Sites = new List<KeyValuePair<string, List<string>>>();
    HashSet<string> Tags = new HashSet<string> { "hm", "test", "tester" };
    BaseCrawler test = new BaseCrawler(Sites, Tags);

    // Act
    bool tagMatch = test.MatchTags(contentTags);

    // Assert
    Assert.False(tagMatch);
}
/// <summary>
/// Downloads one image by handing its source URL to the crawler that matches the URL's form:
/// a <c>CosmosCrawler</c> for URLs starting with <c>https://cosmos</c>, otherwise an
/// <c>HttpCrawler</c> for <c>http://</c> / <c>https://</c> URLs.
/// </summary>
/// <param name="myImage">
/// Image whose <c>SourceUrl</c> is fetched and whose <c>DownloadImageName</c> is passed to the crawler.
/// </param>
/// <returns>
/// <c>true</c> when the crawl reports <c>BuildResults.Crawler_Succeed</c>; <c>false</c> (after
/// logging a warning) when the URL is missing or unrecognized, or the crawl reports anything else.
/// </returns>
private bool DownloadImage(Image myImage)
{
    // A missing image or URL is treated like any other unidentifiable URI (warning + false)
    // instead of failing with a NullReferenceException.
    string sourceUrl = (myImage == null || myImage.SourceUrl == null)
        ? string.Empty
        : myImage.SourceUrl.Trim();

    BaseCrawler crawler;
    if (sourceUrl.StartsWith(@"https://cosmos", StringComparison.InvariantCultureIgnoreCase)) // cosmos path
    {
        crawler = new CosmosCrawler(FeedUriTypes.ShareFolder, sourceUrl, myImage.DownloadImageName, DateTime.MinValue);
    }
    else if (sourceUrl.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
             || sourceUrl.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
    {
        // URI schemes are case-insensitive, so match them the same way the cosmos prefix is matched
        // and without depending on the current culture.
        ProxyType proxyType;
        if (!Enum.TryParse<ProxyType>(proxy, true, out proxyType))
        {
            // Unparseable proxy setting falls back to ProxyType.NULL.
            proxyType = ProxyType.NULL;
        }

        crawler = new HttpCrawler(FeedUriTypes.Http, sourceUrl, myImage.DownloadImageName, DateTime.MinValue, null, proxyType);
    }
    else // wrong image URI
    {
        ImageLogger.LogMessage(this.log, EventType.Warning, "Cannot identify this image's URI: {0}", sourceUrl);
        return false;
    }

    // crawler is always assigned at this point; the other branch has already returned.
    if (crawler.Crawl() != BuildResults.Crawler_Succeed)
    {
        ImageLogger.LogMessage(this.log, EventType.Warning, "Exception when download image {0}", sourceUrl);
        return false;
    }

    return true;
}
/// <summary>
/// Entry point: builds a site list and a keyword/tag set for each of seven categories,
/// creates one <c>BaseCrawler</c> per category, starts all crawls, blocks until they have
/// finished, and prints a completion line.
/// </summary>
/// <remarks>
/// Each site entry pairs a page URL with a list of strings whose last element is the category
/// name. NOTE(review): the meaning of the other positions in that list is defined by
/// <c>BaseCrawler</c>, which is not visible here.
/// Commented-out <c>Add</c> calls are disabled sources kept for reference.
/// Cancelled crawls are logged and do not fail the run; any other crawl failure still
/// propagates out of <c>Main</c> as an <see cref="AggregateException"/>.
/// </remarks>
/// <returns>Always 0 when the method returns normally.</returns>
public static int Main()
{
    // Art & Design Sites
    List<KeyValuePair<string, List<string>>> ArtSites = new List<KeyValuePair<string, List<string>>>() { };
    //ArtSites.Add(new KeyValuePair<string, List<string>>("http://www.cubanartnews.org/can/category/art", new List<string> { "div", "class", "artwork", "a", "href", "http://www.cubanartnews.org/", "meta", "name", "keywords", "content", "title", "", "", "", "meta", "name", "description", "content", "Art & Design" }));
    //ArtSites.Add(new KeyValuePair<string, List<string>>("http://www.cubanartnews.org/can/category/art/P5", new List<string> { "div", "class", "artwork", "a", "href", "http://www.cubanartnews.org/", "meta", "name", "keywords", "content", "title", "", "", "", "meta", "name", "description", "content", "Art & Design" }));
    //ArtSites.Add(new KeyValuePair<string, List<string>>("http://www.cubanartnews.org/can/category/art/P10", new List<string> { "div", "class", "artwork", "a", "href", "http://www.cubanartnews.org/", "meta", "name", "keywords", "content", "title", "", "", "", "meta", "name", "description", "content", "Art & Design" }));
    // Does not allow framing
    // ArtSites.Add(new KeyValuePair<string, List<string>>("http://www.artcyclopedia.com/art-news.php", new List<string> { "font", "size", "+1", "a", "href", "", "meta", "name", "news_keywords", "content", "meta", "property", "og:title", "content", "meta", "property", "og:description", "content", "Art & Design" }));
    // ArtSites.Add(new KeyValuePair<string, List<string>>("https://theconversation.com/us/arts", new List<string> { "div", "class", "article--header", "a", "href", "https://theconversation.com", "meta", "name", "news_keywords", "content", "meta", "property", "og:title", "content", "meta", "property", "og:description", "content", "Art & Design" }));
    ArtSites.Add(new KeyValuePair<string, List<string>>(
        "https://www.huffingtonpost.com/section/arts",
        new List<string>
        {
            "div", "class", "card__content", "a", "href", "https://www.huffingtonpost.com",
            "meta", "name", "keywords", "content",
            "meta", "property", "og:title", "content",
            "meta", "property", "og:description", "content", "Art & Design"
        }));

    // Art & Design Keyword/Tag List
    // NOTE(review): "fasion" looks like a misspelling of "fashion"; left as-is because tag strings are runtime data — confirm.
    HashSet<string> ArtTags = new HashSet<string>
    {
        "arts", "portrait", "portraits", "hollywood", "shows", "show", "art", "contemporary",
        "sculpture", "design", "architecture", "paint", "modern", "music", "renaissance",
        "sustainability", "play", "opera", "gallery", "moma", "abstract", "graphic", "graffiti",
        "picasso", "da", "vinci", "photography", "model", "portraits", "movie", "movies",
        "acting", "act", "actors", "actor", "fasion", "model", "modeling", "models", "music",
        "film", "cinema", "movie", "movies", "craft"
    };

    // Business Sites
    List<KeyValuePair<string, List<string>>> MoneySites = new List<KeyValuePair<string, List<string>>>() { };
    MoneySites.Add(new KeyValuePair<string, List<string>>(
        "https://brokemillennial.com/",
        new List<string>
        {
            "div", "class", "featuredimg", "a", "href", "https://brokemillennial.com",
            "meta", "property", "og:description", "content",
            "meta", "property", "og:title", "content",
            "meta", "property", "og:description", "content", "Business"
        }));
    MoneySites.Add(new KeyValuePair<string, List<string>>(
        "https://www.huffingtonpost.com/section/business",
        new List<string>
        {
            "div", "class", "card__content", "a", "href", "https://www.huffingtonpost.com",
            "meta", "name", "keywords", "content",
            "meta", "property", "og:title", "content",
            "meta", "property", "og:description", "content", "Business"
        }));

    // Business Keyword/Tag List
    // NOTE(review): "anually" looks like a misspelling of "annually"; left as-is because tag strings are runtime data — confirm.
    HashSet<string> MoneyTags = new HashSet<string>
    {
        "marketing", "store", "stores", "goods", "media", "fund", "funds", "securities",
        "exchange", "tariff", "tariffs", "data", "credit", "economics", "business", "money",
        "finances", "stocks", "nasdaq", "dow", "savings", "saving", "bull", "bear", "market",
        "markets", "bank", "banks", "bankruptcy", "wall", "street", "loan", "loans", "saving",
        "stock", "quarterly", "anually", "financial", "retire", "retirement", "finance",
        "refinance", "repayment", "investment", "company", "trade"
    };

    // Environment Sites
    List<KeyValuePair<string, List<string>>> EarthSites = new List<KeyValuePair<string, List<string>>>() { };
    // Does not allow framing
    //EarthSites.Add(new KeyValuePair<string, List<string>>("https://theconversation.com/us/environment", new List<string> { "div", "class", "article--header", "a", "href", "https://theconversation.com", "meta", "name", "news_keywords", "content", "meta", "property", "og:title", "content", "meta", "property", "og:description", "content", "Environment" }));
    // EarthSites.Add(new KeyValuePair<string, List<string>>("http://discovermagazine.com/topics/environment", new List<string> { "div", "class", "dataItem", "a", "href", "http://discovermagazine.com", "meta", "name", "news_keywords", "content", "meta", "property", "og:title", "content", "meta", "property", "og:description", "content", "Environment" }));
    EarthSites.Add(new KeyValuePair<string, List<string>>(
        "https://www.huffingtonpost.com/section/green",
        new List<string>
        {
            "div", "class", "card__content", "a", "href", "https://www.huffingtonpost.com",
            "meta", "name", "keywords", "content",
            "meta", "property", "og:title", "content",
            "meta", "property", "og:description", "content", "Environment"
        }));

    // Environment Keyword/Tag List
    HashSet<string> EarthTags = new HashSet<string>
    {
        "environment", "environmental", "green", "organic", "sustainability", "smog", "toxic",
        "peta", "science", "protection", "biology", "conservation", "tree", "trees", "water",
        "levels", "level", "drought", "rain", "polar", "ice", "glaciers", "epa", "global",
        "warming", "drought", "infrastructure", "urban", "earth", "earthquakes", "fisheries",
        "fish", "animal", "ocean", "climate", "change", "industry", "global", "agriculture",
        "weather", "disaster"
    };

    // Education Sites (every source below is currently disabled, so this list stays empty)
    List<KeyValuePair<string, List<string>>> SmartSites = new List<KeyValuePair<string, List<string>>>() { };
    // Does not allow framing
    //SmartSites.Add(new KeyValuePair<string, List<string>>("https://theconversation.com/us/education", new List<string> { "div", "class", "article--header", "a", "href", "https://theconversation.com", "meta", "name", "news_keywords", "content", "meta", "property", "og:title", "content", "meta", "property", "og:description", "content", "Education" }));
    //SmartSites.Add(new KeyValuePair<string, List<string>>("http://www.bbc.com/news/education", new List<string> { "div", "class", "pigeon-item__body", "a", "href", "http://www.bbc.com", "meta", "name", "description", "content", "meta", "property", "og:title", "content", "meta", "property", "og:description", "content", "Education" }));
    //SmartSites.Add(new KeyValuePair<string, List<string>>("http://www.bbc.com/news/education", new List<string> { "div", "class", "sparrow-item__body", "a", "href", "http://www.bbc.com", "meta", "name", "description", "content", "meta", "property", "og:title", "content", "meta", "property", "og:description", "content", "Education" }));
    //SmartSites.Add(new KeyValuePair<string, List<string>>("http://www.educationnews.org/", new List<string> { "article", "class", "article-popular ", "a", "href", "http://www.educationnews.org/", "meta", "property", "article:tag", "content", "meta", "property", "og:title", "content", "meta", "property", "og:description", "content", "Education" }));
    //SmartSites.Add(new KeyValuePair<string, List<string>>("http://www.educationnews.org/", new List<string> { "div", "class", "copy", "a", "href", "http://www.educationnews.org/", "meta", "property", "article:tag", "content", "meta", "property", "og:title", "content", "meta", "property", "og:description", "content", "Education" }));

    // Education Keyword/Tag List
    HashSet<string> SmartTags = new HashSet<string>
    {
        "afford", "affordable", "children", "child", "institution", "institutions", "education",
        "school", "university", "college", "career", "careers", "degree", "masters", "student",
        "students", "teacher", "teachers", "professor", "professors", "daca", "kindergarten",
        "developmental", "develop", "science", "stem", "steam", "schools", "colleges",
        "universities"
    };

    // History Sites
    List<KeyValuePair<string, List<string>>> AncientSites = new List<KeyValuePair<string, List<string>>>() { };
    // AncientSites.Add(new KeyValuePair<string, List<string>>("http://historynewsnetwork.org/", new List<string> { "div", "class", "caption", "a", "href", "https://historynewsnetwork.org", "meta", "property", "og:title", "content", "meta", "property", "og:title", "content", "meta", "property", "og:description", "content", "History" }));
    AncientSites.Add(new KeyValuePair<string, List<string>>(
        "https://www.newhistorian.com/",
        new List<string>
        {
            "div", "class", "news-summary has-feature-image", "a", "href", "https://www.newhistorian.com/",
            "meta", "property", "article:tag", "content",
            "meta", "property", "og:title", "content",
            "meta", "property", "og:description", "content", "History"
        }));
    // Does not allow framing
    //AncientSites.Add(new KeyValuePair<string, List<string>>("https://www.hoover.org/publications/military-history-news", new List<string> { "h2", "class", "field-title", "a", "href", "https://www.hoover.org", "meta", "name", "description", "content", "meta", "property", "og:title", "content", "meta", "property", "og:description", "content", "History" }));

    // History Keyword/Tag List
    HashSet<string> AncientTags = new HashSet<string>
    {
        "sculpture", "sculptures", "slave", "trade", "slaves", "trades", "skull", "skulls",
        "neanderthal", "hominin", "hominins", "neanderthals", "history", "ancient", "evolution",
        "prehistoric", "amazon", "dinosaur", "civil", "mayan", "renaissance", "egypt", "bc",
        "civil", "rights", "slavery", "battle", "ww1", "ww2", "ww", "war", "wall", "walls",
        "wars", "foreign", "myth", "norse", "greek", "cave", "neanderthals", "missing", "link",
        "early", "america", "americas", "native", "president", "immigrants", "presidents",
        "immigrant", "society", "military", "government", "war"
    };

    // Medical Sites
    List<KeyValuePair<string, List<string>>> MedicalSites = new List<KeyValuePair<string, List<string>>>() { };
    MedicalSites.Add(new KeyValuePair<string, List<string>>(
        "https://www.medicalnewstoday.com/",
        new List<string>
        {
            "li", "class", "featured", "a", "href", "https://www.medicalnewstoday.com",
            "meta", "property", "og:description", "content",
            "meta", "property", "og:title", "content",
            "meta", "property", "og:description", "content", "Medical"
        }));
    MedicalSites.Add(new KeyValuePair<string, List<string>>(
        "https://www.medicalnewstoday.com/",
        new List<string>
        {
            "li", "class", "knowledge", "a", "href", "https://www.medicalnewstoday.com",
            "meta", "property", "og:description", "content",
            "meta", "property", "og:title", "content",
            "meta", "property", "og:description", "content", "Medical"
        }));
    MedicalSites.Add(new KeyValuePair<string, List<string>>(
        "https://www.medicalnewstoday.com/",
        new List<string>
        {
            "li", "class", "written", "a", "href", "https://www.medicalnewstoday.com",
            "meta", "property", "og:description", "content",
            "meta", "property", "og:title", "content",
            "meta", "property", "og:description", "content", "Medical"
        }));

    // Medical Keyword/Tag List
    // NOTE(review): "opiod" and "tocix" look like misspellings of "opioid" and "toxic"; left as-is because tag strings are runtime data — confirm.
    HashSet<string> MedicalTags = new HashSet<string>
    {
        "remedy", "remedies", "condition", "conditions", "symptom", "symptoms", "medication",
        "medications", "body", "immune", "cells", "cell", "immunity", "pandemic", "virus",
        "software", "medicine", "cancer", "research", "health", "medical", "healthy", "blood",
        "diabetes", "chemotherapy", "chemo", "advancements", "advancement", "pills",
        "pharmaceutical", "pharma", "pharmaceuticals", "opiod", "habit", "addiction", "pain",
        "tocix", "diet", "benefits", "benefit", "smoke", "smoking", "study", "studies", "cell",
        "dna", "suicide", "depression", "mental"
    };

    // Technology Sites
    List<KeyValuePair<string, List<string>>> TechSites = new List<KeyValuePair<string, List<string>>>() { };
    // Does not allow framing
    //TechSites.Add(new KeyValuePair<string, List<string>>("http://news.mit.edu/topic/computers", new List<string> { "h3", "class", "title", "a", "href", "http://news.mit.edu", "meta", "name", "news_keywords", "content", "meta", "property", "og:title", "content", "meta", "property", "og:description", "content", "Technology" }));
    //TechSites.Add(new KeyValuePair<string, List<string>>("https://theconversation.com/us/topics/computer-science-6612", new List<string> { "div", "class", "article--header", "a", "href", "https://theconversation.com", "meta", "name", "news_keywords", "content", "meta", "property", "og:title", "content", "meta", "property", "og:description", "content", "Technology" }));
    // TechSites.Add(new KeyValuePair<string, List<string>>("http://discovermagazine.com/topics/technology", new List<string> { "div", "class", "dataItem", "a", "href", "http://discovermagazine.com", "meta", "name", "news_keywords", "content", "meta", "property", "og:title", "content", "meta", "property", "og:description", "content", "Technology" }));
    TechSites.Add(new KeyValuePair<string, List<string>>(
        "https://www.huffingtonpost.com/topic/computer-science",
        new List<string>
        {
            "div", "class", "card__content", "a", "href", "https://www.huffingtonpost.com",
            "meta", "name", "keywords", "content",
            "meta", "property", "og:title", "content",
            "meta", "property", "og:description", "content", "Technology"
        }));
    //TechSites.Add(new KeyValuePair<string, List<string>>("http://www.educationnews.org/", new List<string> { "article", "class", "article-popular ", "a", "href", "http://www.educationnews.org/", "meta", "property", "article:tag", "content", "meta", "property", "og:title", "content", "meta", "property", "og:description", "content", "Technology" }));
    //TechSites.Add(new KeyValuePair<string, List<string>>("http://www.educationnews.org/", new List<string> { "div", "class", "copy", "a", "href", "http://www.educationnews.org/", "meta", "property", "article:tag", "content", "meta", "property", "og:title", "content", "meta", "property", "og:description", "content", "Technology" }));

    // Technology Keyword/Tag List
    HashSet<string> TechTags = new HashSet<string>
    {
        "robot", "robots", "bionic", "drone", "drones", "drive", "drives", "usb", "data",
        "software", "technologies", "technology", "computer", "computers", "google", "apps",
        "apple", "amazon", "microsoft", "ai", "elon musk", "spacex", "nasa", "intelligence",
        "internet", "engineer", "engineering", "development", "programming", "code", "program",
        "coding", "science", "intelligent", "research", "study", "studies", "stem"
    };

    // Art & Design Crawler
    ICrawler artie = new BaseCrawler(ArtSites, ArtTags);
    // Business Crawler
    ICrawler cachingie = new BaseCrawler(MoneySites, MoneyTags);
    // Environmental Crawler
    ICrawler earthie = new BaseCrawler(EarthSites, EarthTags);
    // Education Crawler
    ICrawler smartie = new BaseCrawler(SmartSites, SmartTags);
    // History Crawler
    ICrawler oldie = new BaseCrawler(AncientSites, AncientTags);
    // Medical Crawler
    ICrawler medie = new BaseCrawler(MedicalSites, MedicalTags);
    // Tech Crawler
    ICrawler techie = new BaseCrawler(TechSites, TechTags);

    // Start every crawl, then block until all of them have completed.
    try
    {
        Task.WaitAll(
            artie.CrawlingAsync(),
            cachingie.CrawlingAsync(),
            earthie.CrawlingAsync(),
            smartie.CrawlingAsync(),
            oldie.CrawlingAsync(),
            medie.CrawlingAsync(),
            techie.CrawlingAsync());
    }
    catch (TaskCanceledException e)
    {
        // Only reachable if a CrawlingAsync() call throws synchronously before returning its task.
        // e.Task may be null, so it is not dereferenced unconditionally.
        string taskText = e.Task != null ? e.Task.ToString() : string.Empty;
        Console.WriteLine(DateTime.Now + taskText + ": " + e.Message);
    }
    catch (AggregateException aggregate)
    {
        // Task.WaitAll reports cancelled/faulted tasks wrapped in an AggregateException, so the
        // cancellation logging has to happen here. Cancellations are logged and swallowed;
        // Handle rethrows every other inner exception, so real failures still surface.
        aggregate.Flatten().Handle(inner =>
        {
            var canceled = inner as TaskCanceledException;
            if (canceled == null)
            {
                return false;
            }

            string taskText = canceled.Task != null ? canceled.Task.ToString() : string.Empty;
            Console.WriteLine(DateTime.Now + taskText + ": " + canceled.Message);
            return true;
        });
    }

    Console.WriteLine(DateTime.Now + ": Crawler Ended");
    return 0;
}