/// <summary>
/// Downloads the dagens.dk news overview page, extracts the link targets found
/// between the "block-system-main" and "footer clearfix" markers, and passes the
/// links that are not already known to the manager's queue.
/// </summary>
/// <remarks>
/// Every accepted link is also written to the console. When the start marker is
/// not present in the downloaded page, no links are extracted and an empty list
/// is handed to the manager.
/// </remarks>
public void crawl()
{
    const string pageUrl = "https://www.dagens.dk/nyheder";

    string webPage;
    // WebClient is IDisposable; release it as soon as the download has finished.
    using (WebClient wc = new WebClient())
    {
        webPage = wc.DownloadString(pageUrl);
    }

    List<string> links = new List<string>();

    // Only the part of the page after the first "block-system-main" marker is scanned.
    string[] sections = webPage.Split("block-system-main");
    if (sections.Length > 1)
    {
        // Cut the page off at the footer marker (index 0 always exists).
        string content = sections[1].Split("footer clearfix")[0];

        foreach (string anchor in content.Split("<a "))
        {
            // The candidate link is the first double-quoted value of the segment.
            // Segments without any double quote carry no such value and are skipped.
            string[] quoted = anchor.Split("\"");
            if (quoted.Length < 2)
            {
                continue;
            }

            string newUrl = quoted[1];

            // Values shorter than 35 characters (including empty ones) are ignored;
            // presumably this filters out navigation links - TODO confirm.
            if (newUrl.Length < 35)
            {
                continue;
            }

            // Skip links that were queued earlier or already collected in this run
            // (the alternative would be to keep a list of already visited links).
            if (Program.newsLinks.Contains(newUrl) || links.Contains(newUrl))
            {
                continue;
            }

            Console.WriteLine(newUrl);
            links.Add(newUrl);
        }
    }

    manager.addLinksToQueue(links);
}
/// <summary>
/// This method goes to dagens.dk, gets the news links on the page
/// and tells the manager to put them into its queue.
/// </summary>
/// <remarks>
/// Only the part of the page between the "block-system-main" and "footer clearfix"
/// markers is scanned. If the start marker is missing, nothing is extracted and
/// the manager receives an empty list.
/// </remarks>
public void crawl()
{
    const string pageUrl = "https://www.dagens.dk/nyheder";
    const int minimumLinkLength = 35;

    string html;
    // Dispose the WebClient deterministically instead of leaving it to the finalizer.
    using (WebClient client = new WebClient())
    {
        html = client.DownloadString(pageUrl);
    }

    List<string> links = new List<string>();

    string[] aroundMain = html.Split("block-system-main");
    if (aroundMain.Length < 2)
    {
        // Start marker not found: there is no region to scan.
        manager.addLinksToQueue(links);
        return;
    }

    // Keep everything after the first start marker, up to the footer marker.
    string mainContent = aroundMain[1].Split("footer clearfix")[0];

    foreach (string anchorChunk in mainContent.Split("<a "))
    {
        // The first double-quoted value in the chunk is taken as the link target;
        // chunks without a double quote have none and are skipped.
        string[] pieces = anchorChunk.Split("\"");
        if (pieces.Length < 2)
        {
            continue;
        }

        string candidate = pieces[1];

        // Values shorter than the minimum length (including empty ones) are dropped;
        // presumably these are not article links - TODO confirm.
        if (candidate.Length < minimumLinkLength)
        {
            continue;
        }

        // Ignore links that are already known globally or collected in this run.
        if (Program.newsLinks.Contains(candidate) || links.Contains(candidate))
        {
            continue;
        }

        links.Add(candidate);
    }

    manager.addLinksToQueue(links);
}
/// <summary>
/// Downloads the nyheder.tv2.dk front page, extracts the link targets from the
/// first two sections that follow the "o-deck ..." marker (up to the "load more"
/// marker), and passes the links that are not already known to the manager's queue.
/// </summary>
/// <remarks>
/// Every accepted link is also written to the console. If the page contains fewer
/// marker sections than expected, the sections that do exist are scanned; if there
/// are none, an empty list is handed to the manager.
/// </remarks>
public void crawl()
{
    const string pageUrl = "https://nyheder.tv2.dk/";
    const string deckMarker = "o-deck g-con g-col g-row_l g-gutter g-colx";

    string webPage;
    // WebClient is IDisposable; release it as soon as the download has finished.
    using (WebClient wc = new WebClient())
    {
        webPage = wc.DownloadString(pageUrl);
    }

    // Split once and join the two sections after the first and second marker,
    // guarding against pages that contain fewer markers.
    string[] decks = webPage.Split(deckMarker);
    string content = string.Empty;
    if (decks.Length > 1)
    {
        content = decks[1];
    }
    if (decks.Length > 2)
    {
        content += decks[2];
    }

    // Drop everything from the "load more" marker onwards (index 0 always exists).
    content = content.Split("section_load_more_from_term-loadmore")[0];

    List<string> links = new List<string>();
    foreach (string anchor in content.Split("<a "))
    {
        // The candidate link is the first double-quoted value of the segment.
        // Segments without any double quote carry no such value and are skipped.
        string[] quoted = anchor.Split("\"");
        if (quoted.Length < 2)
        {
            continue;
        }

        string newUrl = quoted[1];

        // Empty values and values containing "div class" are not link targets.
        if (newUrl.Length == 0 || newUrl.Contains("div class"))
        {
            continue;
        }

        // Skip links that were queued earlier or already collected in this run
        // (the alternative would be to keep a list of already visited links).
        if (Program.newsLinks.Contains(newUrl) || links.Contains(newUrl))
        {
            continue;
        }

        if (newUrl.StartsWith("/", StringComparison.Ordinal))
        {
            // NOTE(review): prefixing only the scheme yields a valid URL for
            // protocol-relative links ("//host/path"); a root-relative "/path"
            // would become "https:/path" - confirm which form the page uses.
            newUrl = "https:" + newUrl;

            // The rewritten URL may already be known even if the raw value was not.
            if (Program.newsLinks.Contains(newUrl) || links.Contains(newUrl))
            {
                continue;
            }
        }

        Console.WriteLine(newUrl);
        links.Add(newUrl);
    }

    manager.addLinksToQueue(links);
}
/// <summary>
/// This method goes to bt.dk/nyheder, gets the news links on the page
/// and tells the manager to put them into its queue.
/// </summary>
/// <remarks>
/// Only the part of the page between the "container bg " and "site-footer" markers
/// is scanned. Links starting with "/" are prefixed with "https://www.bt.dk".
/// If the start marker is missing, nothing is extracted and the manager receives
/// an empty list.
/// </remarks>
public void crawl()
{
    const string pageUrl = "https://www.bt.dk/nyheder";
    const string siteRoot = "https://www.bt.dk";

    string html;
    // Dispose the WebClient deterministically instead of leaving it to the finalizer.
    using (WebClient client = new WebClient())
    {
        html = client.DownloadString(pageUrl);
    }

    List<string> links = new List<string>();

    string[] aroundContainer = html.Split("container bg ");
    if (aroundContainer.Length < 2)
    {
        // Start marker not found: there is no region to scan.
        manager.addLinksToQueue(links);
        return;
    }

    // Keep everything after the first start marker, up to the footer marker.
    string mainContent = aroundContainer[1].Split("site-footer")[0];

    foreach (string anchorChunk in mainContent.Split("<a "))
    {
        // The first double-quoted value in the chunk is taken as the link target;
        // chunks without a double quote have none and are skipped.
        string[] pieces = anchorChunk.Split("\"");
        if (pieces.Length < 2)
        {
            continue;
        }

        string newUrl = pieces[1];

        // Empty values and values containing "id=" are not treated as link targets.
        if (newUrl.Length == 0 || newUrl.Contains("id="))
        {
            continue;
        }

        // Ignore links that are already known globally or collected in this run.
        if (Program.newsLinks.Contains(newUrl) || links.Contains(newUrl))
        {
            continue;
        }

        if (newUrl.StartsWith("/", StringComparison.Ordinal))
        {
            // Turn a site-relative path into an absolute bt.dk URL.
            newUrl = siteRoot + newUrl;

            // The absolute form may already be known even if the relative one was not.
            if (Program.newsLinks.Contains(newUrl) || links.Contains(newUrl))
            {
                continue;
            }
        }

        links.Add(newUrl);
    }

    manager.addLinksToQueue(links);
}
/// <summary>
/// This method goes to nyheder.tv2.dk, gets the news links on the page
/// and tells the manager to put them into its queue.
/// </summary>
/// <remarks>
/// The scanned region is the concatenation of the two page sections that follow
/// the first and second "o-deck ..." marker, cut off at the "load more" marker.
/// If fewer sections exist than expected, only the existing ones are scanned;
/// with none, the manager receives an empty list.
/// </remarks>
public void crawl()
{
    const string pageUrl = "https://nyheder.tv2.dk/";
    const string deckMarker = "o-deck g-con g-col g-row_l g-gutter g-colx";
    const string loadMoreMarker = "section_load_more_from_term-loadmore";

    string html;
    // Dispose the WebClient deterministically instead of leaving it to the finalizer.
    using (WebClient client = new WebClient())
    {
        html = client.DownloadString(pageUrl);
    }

    // Split the page a single time; sections[1] and sections[2] hold the content
    // after the first and second marker when the page has the expected layout.
    string[] sections = html.Split(deckMarker);
    string region = string.Empty;
    for (int i = 1; i < sections.Length && i <= 2; i++)
    {
        region += sections[i];
    }

    // Discard everything from the "load more" marker onwards (index 0 always exists).
    region = region.Split(loadMoreMarker)[0];

    List<string> links = new List<string>();
    foreach (string anchorChunk in region.Split("<a "))
    {
        // The first double-quoted value in the chunk is taken as the link target;
        // chunks without a double quote have none and are skipped.
        string[] pieces = anchorChunk.Split("\"");
        if (pieces.Length < 2)
        {
            continue;
        }

        string newUrl = pieces[1];

        // Empty values and values containing "div class" are not link targets.
        if (newUrl.Length == 0 || newUrl.Contains("div class"))
        {
            continue;
        }

        // Ignore links that are already known globally or collected in this run.
        if (Program.newsLinks.Contains(newUrl) || links.Contains(newUrl))
        {
            continue;
        }

        if (newUrl.StartsWith("/", StringComparison.Ordinal))
        {
            // NOTE(review): prefixing only the scheme yields a valid URL for
            // protocol-relative links ("//host/path"); a root-relative "/path"
            // would become "https:/path" - confirm which form the page uses.
            newUrl = "https:" + newUrl;

            // The rewritten URL may already be known even if the raw value was not.
            if (Program.newsLinks.Contains(newUrl) || links.Contains(newUrl))
            {
                continue;
            }
        }

        links.Add(newUrl);
    }

    manager.addLinksToQueue(links);
}