/// <summary>
/// Downloads the dagens.dk news overview page, extracts link targets from its main
/// content region and passes the ones not seen before to <c>manager.addLinksToQueue</c>.
/// </summary>
/// <remarks>
/// The content region is isolated with hard-coded markup markers. If the downloaded page
/// does not contain "block-system-main", the indexer on the split result throws
/// <see cref="IndexOutOfRangeException"/>.
/// </remarks>
public void crawl()
{
    const string pageUrl = "https://www.dagens.dk/nyheder";

    string webPage;
    // WebClient is IDisposable; release it as soon as the download has finished.
    using (WebClient wc = new WebClient())
    {
        webPage = wc.DownloadString(pageUrl);
    }

    // Keep only the markup between the main content block and the footer.
    webPage = webPage.Split("block-system-main")[1];
    webPage = webPage.Split("footer clearfix")[0];

    List<string> links = new List<string>();
    foreach (string fragment in webPage.Split("<a "))
    {
        // The link target is taken to be the first double-quoted value of the fragment.
        string[] quotedParts = fragment.Split("\"");
        if (quotedParts.Length < 2)
        {
            // No quoted value in this fragment: skip it instead of letting the
            // indexer throw and abort the whole crawl.
            continue;
        }

        string newUrl = quotedParts[1];

        // Values shorter than 35 characters are dropped (this also covers the empty
        // string). NOTE(review): presumably this filters out navigation links - confirm.
        if (newUrl.Length < 35)
        {
            continue;
        }

        // Skip links that are already known globally or were found earlier on this page
        // (the alternative would be keeping a separate list of already visited links).
        if (Program.newsLinks.Contains(newUrl) || links.Contains(newUrl))
        {
            continue;
        }

        Console.WriteLine(newUrl);
        links.Add(newUrl);
    }

    manager.addLinksToQueue(links);
}
/// <summary>
/// Downloads the dr.dk front page, extracts href values from its content region and
/// passes the news links not seen before to <c>manager.addLinksToQueue</c>.
/// </summary>
/// <remarks>
/// The front page is crawled; the original author noted that this extraction does not
/// work on the /nyheder page. The content region is isolated with hard-coded markup
/// markers. If the downloaded page does not contain "container", the indexer on the
/// split result throws <see cref="IndexOutOfRangeException"/>.
/// </remarks>
public void crawl()
{
    const string pageUrl = "https://www.dr.dk/";
    const string siteOrigin = "https://www.dr.dk";

    string webPage;
    // WebClient is IDisposable; release it as soon as the download has finished.
    using (WebClient wc = new WebClient())
    {
        webPage = wc.DownloadString(pageUrl);
    }

    // Keep only the markup between the first "container" marker and the radio banner.
    webPage = webPage.Split("container")[1];
    webPage = webPage.Split("marketing-banner-radio")[0];

    List<string> links = new List<string>();
    foreach (string fragment in webPage.Split("<a "))
    {
        string newUrl = hrefPattern.Match(fragment).Groups[1].Value;
        if (newUrl.Length == 0)
        {
            continue;
        }

        // Skip links that are already known globally or were found earlier on this page
        // (the alternative would be keeping a separate list of already visited links).
        if (Program.newsLinks.Contains(newUrl) || links.Contains(newUrl))
        {
            continue;
        }

        if (newUrl.StartsWith("/", StringComparison.Ordinal))
        {
            // "//host/path" is protocol-relative and only lacks the scheme;
            // "/path" is relative to the site root and needs the full origin.
            newUrl = newUrl.StartsWith("//", StringComparison.Ordinal)
                ? "https:" + newUrl
                : siteOrigin + newUrl;

            // Repeat the duplicate check on the absolute form of the link.
            if (Program.newsLinks.Contains(newUrl) || links.Contains(newUrl))
            {
                continue;
            }
        }

        // Only links containing "nyheder" and longer than 56 characters are queued.
        // NOTE(review): presumably this keeps article pages and drops section pages - confirm.
        if (newUrl.Contains("nyheder") && newUrl.Length > 56)
        {
            Console.WriteLine(newUrl);
            links.Add(newUrl);
        }
    }

    manager.addLinksToQueue(links);
}
/// <summary>
/// Downloads the ekstrabladet.dk news page, extracts href values from its content region
/// and passes the links not seen before to <c>manager.addLinksToQueue</c>.
/// </summary>
/// <remarks>
/// The content region is isolated with hard-coded markup markers. If the downloaded page
/// contains fewer than two occurrences of "sitecontent", the indexer on the split result
/// throws <see cref="IndexOutOfRangeException"/>.
/// </remarks>
public void crawl()
{
    const string pageUrl = "https://ekstrabladet.dk/nyheder";
    const string siteOrigin = "https://ekstrabladet.dk";

    string webPage;
    // WebClient is IDisposable; release it as soon as the download has finished.
    using (WebClient wc = new WebClient())
    {
        webPage = wc.DownloadString(pageUrl);
    }

    // Keep the markup after the second "sitecontent" marker, up to the first "footer".
    webPage = webPage.Split("sitecontent")[2];
    webPage = webPage.Split("footer")[0];

    List<string> links = new List<string>();
    foreach (string fragment in webPage.Split("<a "))
    {
        string newUrl = hrefPattern.Match(fragment).Groups[1].Value;
        if (newUrl.Length == 0)
        {
            continue;
        }

        // Skip links that are already known globally or were found earlier on this page
        // (the alternative would be keeping a separate list of already visited links).
        if (Program.newsLinks.Contains(newUrl) || links.Contains(newUrl))
        {
            continue;
        }

        if (newUrl.StartsWith("/", StringComparison.Ordinal))
        {
            // "//host/path" is protocol-relative and only lacks the scheme;
            // "/path" is relative to the site root and needs the full origin.
            newUrl = newUrl.StartsWith("//", StringComparison.Ordinal)
                ? "https:" + newUrl
                : siteOrigin + newUrl;

            // Repeat the duplicate check on the absolute form of the link.
            if (Program.newsLinks.Contains(newUrl) || links.Contains(newUrl))
            {
                continue;
            }
        }

        Console.WriteLine(newUrl);
        links.Add(newUrl);
    }

    manager.addLinksToQueue(links);
}
/// <summary>
/// Downloads the nyheder.tv2.dk front page, extracts link targets from its first two
/// article decks and passes the ones not seen before to <c>manager.addLinksToQueue</c>.
/// </summary>
/// <remarks>
/// The decks are isolated with hard-coded markup markers. If the downloaded page contains
/// fewer than two occurrences of the deck class string, the indexer on the split result
/// throws <see cref="IndexOutOfRangeException"/>.
/// </remarks>
public void crawl()
{
    const string pageUrl = "https://nyheder.tv2.dk/";
    const string siteOrigin = "https://nyheder.tv2.dk";
    const string deckMarker = "o-deck g-con g-col g-row_l g-gutter g-colx";

    string webPage;
    // WebClient is IDisposable; release it as soon as the download has finished.
    using (WebClient wc = new WebClient())
    {
        webPage = wc.DownloadString(pageUrl);
    }

    // Split once and join the sections following the first and second deck marker,
    // then cut everything from the "load more" section onwards.
    string[] sections = webPage.Split(deckMarker);
    webPage = sections[1] + sections[2];
    webPage = webPage.Split("section_load_more_from_term-loadmore")[0];

    List<string> links = new List<string>();
    foreach (string fragment in webPage.Split("<a "))
    {
        // The link target is taken to be the first double-quoted value of the fragment.
        string[] quotedParts = fragment.Split("\"");
        if (quotedParts.Length < 2)
        {
            // No quoted value in this fragment: skip it instead of letting the
            // indexer throw and abort the whole crawl.
            continue;
        }

        string newUrl = quotedParts[1];

        // Empty values and values containing "div class" are not link targets.
        // NOTE(review): the "div class" case presumably arises when the first quoted
        // value spans markup rather than an href - confirm.
        if (newUrl.Length == 0 || newUrl.Contains("div class"))
        {
            continue;
        }

        // Skip links that are already known globally or were found earlier on this page
        // (the alternative would be keeping a separate list of already visited links).
        if (Program.newsLinks.Contains(newUrl) || links.Contains(newUrl))
        {
            continue;
        }

        if (newUrl.StartsWith("/", StringComparison.Ordinal))
        {
            // "//host/path" is protocol-relative and only lacks the scheme;
            // "/path" is relative to the site root and needs the full origin
            // (prefixing just "https:" would yield the malformed "https:/path").
            newUrl = newUrl.StartsWith("//", StringComparison.Ordinal)
                ? "https:" + newUrl
                : siteOrigin + newUrl;

            // Repeat the duplicate check on the absolute form of the link.
            if (Program.newsLinks.Contains(newUrl) || links.Contains(newUrl))
            {
                continue;
            }
        }

        Console.WriteLine(newUrl);
        links.Add(newUrl);
    }

    manager.addLinksToQueue(links);
}