Esempio n. 1
0
        /// <summary>
        /// Downloads the dagens.dk news page, extracts link targets from the markup between
        /// the "block-system-main" and "footer clearfix" markers, and passes the ones that
        /// are not already known to <c>manager.addLinksToQueue</c>.
        /// </summary>
        /// <remarks>
        /// A link is kept only when it is at least 35 characters long and is found neither in
        /// <c>Program.newsLinks</c> nor among the links already collected during this call.
        /// If the start marker is missing from the downloaded page, a message is written to
        /// standard error and nothing is queued.
        /// </remarks>
        public void crawl()
        {
            const string sectionStart  = "block-system-main";
            const string sectionEnd    = "footer clearfix";
            const int    minLinkLength = 35; // NOTE(review): presumably filters out short navigation links - confirm

            UriBuilder ub = new UriBuilder("https://www.dagens.dk/nyheder");

            string webPage;
            // WebClient is IDisposable; release it as soon as the download has finished.
            using (WebClient wc = new WebClient())
            {
                webPage = wc.DownloadString(ub.Uri.ToString());
            }

            // Everything after the first start marker, cut off at the first end marker.
            string[] sections = webPage.Split(sectionStart);
            if (sections.Length < 2)
            {
                // The page layout no longer matches; bail out instead of indexing out of range.
                Console.Error.WriteLine("dagens.dk: marker \"block-system-main\" not found, no links queued.");
                return;
            }
            string content = sections[1].Split(sectionEnd)[0];

            List<string> links = new List<string>();
            foreach (string fragment in content.Split("<a "))
            {
                // The candidate URL is the first double-quoted value in the fragment
                // (assumed to be the anchor's href - TODO confirm against the live markup).
                string[] quoted = fragment.Split("\"");
                if (quoted.Length < 2)
                {
                    continue; // no quoted value at all in this fragment
                }

                string newUrl = quoted[1];
                if (newUrl.Length < minLinkLength)
                {
                    continue; // also covers the empty string
                }

                // Either this, or keep a separate list of already visited links.
                if (Program.newsLinks.Contains(newUrl) || links.Contains(newUrl))
                {
                    continue;
                }

                Console.WriteLine(newUrl);
                links.Add(newUrl);
            }

            manager.addLinksToQueue(links);
        }
Esempio n. 2
0
        /// <summary>
        /// Downloads the dr.dk front page, extracts link targets from the markup between the
        /// "container" and "marketing-banner-radio" markers using <c>hrefPattern</c>, and
        /// passes the new news links to <c>manager.addLinksToQueue</c>.
        /// </summary>
        /// <remarks>
        /// Links starting with "/" are made absolute first. A link is queued only when it
        /// contains "nyheder", is longer than 56 characters, and is found neither in
        /// <c>Program.newsLinks</c> nor among the links already collected during this call.
        /// If the start marker is missing from the downloaded page, a message is written to
        /// standard error and nothing is queued.
        /// </remarks>
        public void crawl()
        {
            const string siteRoot        = "https://www.dr.dk";
            const string sectionStart    = "container";
            const string sectionEnd      = "marketing-banner-radio";
            const int    lengthThreshold = 56; // NOTE(review): presumably separates articles from section pages - confirm

            // Per the original author, this extraction does not work on the /nyheder page.
            UriBuilder ub = new UriBuilder("https://www.dr.dk/");

            string webPage;
            // WebClient is IDisposable; release it as soon as the download has finished.
            using (WebClient wc = new WebClient())
            {
                webPage = wc.DownloadString(ub.Uri.ToString());
            }

            // Everything after the first start marker, cut off at the first end marker.
            string[] sections = webPage.Split(sectionStart);
            if (sections.Length < 2)
            {
                // The page layout no longer matches; bail out instead of indexing out of range.
                Console.Error.WriteLine("dr.dk: marker \"container\" not found, no links queued.");
                return;
            }
            string content = sections[1].Split(sectionEnd)[0];

            List<string> links = new List<string>();
            foreach (string fragment in content.Split("<a "))
            {
                string newUrl = hrefPattern.Match(fragment).Groups[1].Value;
                if (newUrl.Length == 0)
                {
                    continue; // no href captured in this fragment
                }

                // Either this, or keep a separate list of already visited links.
                if (Program.newsLinks.Contains(newUrl) || links.Contains(newUrl))
                {
                    continue;
                }

                if (newUrl.StartsWith("/", StringComparison.Ordinal))
                {
                    // "//host/path" is protocol-relative and only needs a scheme;
                    // "/path" is site-relative and needs the full site root.
                    string prefix = newUrl.StartsWith("//", StringComparison.Ordinal) ? "https:" : siteRoot;
                    newUrl = prefix + newUrl;

                    // Re-check now that the link is in its absolute form.
                    if (Program.newsLinks.Contains(newUrl) || links.Contains(newUrl))
                    {
                        continue;
                    }
                }

                if (newUrl.Contains("nyheder") && newUrl.Length > lengthThreshold)
                {
                    Console.WriteLine(newUrl);
                    links.Add(newUrl);
                }
            }

            manager.addLinksToQueue(links);
        }
Esempio n. 3
0
        /// <summary>
        /// Downloads the ekstrabladet.dk news page, extracts link targets from the page
        /// content using <c>hrefPattern</c>, and passes the new ones to
        /// <c>manager.addLinksToQueue</c>.
        /// </summary>
        /// <remarks>
        /// The scanned region is the text following the second occurrence of "sitecontent",
        /// up to the next "sitecontent" occurrence or the first "footer", whichever comes
        /// first. Links starting with "/" are made absolute. A link is queued when it is
        /// found neither in <c>Program.newsLinks</c> nor among the links already collected
        /// during this call. If "sitecontent" occurs fewer than two times, a message is
        /// written to standard error and nothing is queued.
        /// </remarks>
        public void crawl()
        {
            const string siteRoot      = "https://ekstrabladet.dk";
            const string sectionMarker = "sitecontent";
            const string sectionEnd    = "footer";

            // Per the original author, the extraction also works on the /nyheder page.
            UriBuilder ub = new UriBuilder("https://ekstrabladet.dk/nyheder");

            string webPage;
            // WebClient is IDisposable; release it as soon as the download has finished.
            using (WebClient wc = new WebClient())
            {
                webPage = wc.DownloadString(ub.Uri.ToString());
            }

            // The wanted content is the third piece, i.e. what follows the second marker.
            string[] sections = webPage.Split(sectionMarker);
            if (sections.Length < 3)
            {
                // The page layout no longer matches; bail out instead of indexing out of range.
                Console.Error.WriteLine("ekstrabladet.dk: marker \"sitecontent\" not found twice, no links queued.");
                return;
            }
            string content = sections[2].Split(sectionEnd)[0];

            List<string> links = new List<string>();
            foreach (string fragment in content.Split("<a "))
            {
                string newUrl = hrefPattern.Match(fragment).Groups[1].Value;
                if (newUrl.Length == 0)
                {
                    continue; // no href captured in this fragment
                }

                // Either this, or keep a separate list of already visited links.
                if (Program.newsLinks.Contains(newUrl) || links.Contains(newUrl))
                {
                    continue;
                }

                if (newUrl.StartsWith("/", StringComparison.Ordinal))
                {
                    // "//host/path" is protocol-relative and only needs a scheme;
                    // "/path" is site-relative and needs the full site root.
                    string prefix = newUrl.StartsWith("//", StringComparison.Ordinal) ? "https:" : siteRoot;
                    newUrl = prefix + newUrl;

                    // Re-check now that the link is in its absolute form.
                    if (Program.newsLinks.Contains(newUrl) || links.Contains(newUrl))
                    {
                        continue;
                    }
                }

                Console.WriteLine(newUrl);
                links.Add(newUrl);
            }

            manager.addLinksToQueue(links);
        }
Esempio n. 4
0
        /// <summary>
        /// Downloads the nyheder.tv2.dk front page, extracts link targets from its first two
        /// "o-deck" sections, and passes the new ones to <c>manager.addLinksToQueue</c>.
        /// </summary>
        /// <remarks>
        /// The scanned region is the text following the first and second occurrence of the
        /// deck marker, concatenated and cut off at "section_load_more_from_term-loadmore".
        /// Values containing "div class" are skipped. Links starting with "/" are made
        /// absolute. A link is queued when it is found neither in <c>Program.newsLinks</c>
        /// nor among the links already collected during this call. If the deck marker occurs
        /// fewer than two times, a message is written to standard error and nothing is queued.
        /// </remarks>
        public void crawl()
        {
            const string siteRoot   = "https://nyheder.tv2.dk";
            const string deckMarker = "o-deck g-con g-col g-row_l g-gutter g-colx";
            const string sectionEnd = "section_load_more_from_term-loadmore";

            UriBuilder ub = new UriBuilder("https://nyheder.tv2.dk/");

            string webPage;
            // WebClient is IDisposable; release it as soon as the download has finished.
            using (WebClient wc = new WebClient())
            {
                webPage = wc.DownloadString(ub.Uri.ToString());
            }

            // Split once and reuse the pieces instead of splitting the whole page twice.
            string[] decks = webPage.Split(deckMarker);
            if (decks.Length < 3)
            {
                // The page layout no longer matches; bail out instead of indexing out of range.
                Console.Error.WriteLine("tv2.dk: deck marker not found twice, no links queued.");
                return;
            }
            string content = (decks[1] + decks[2]).Split(sectionEnd)[0];

            List<string> links = new List<string>();
            foreach (string fragment in content.Split("<a "))
            {
                // The candidate URL is the first double-quoted value in the fragment
                // (assumed to be the anchor's href - TODO confirm against the live markup).
                string[] quoted = fragment.Split("\"");
                if (quoted.Length < 2)
                {
                    continue; // no quoted value at all in this fragment
                }

                string newUrl = quoted[1];
                if (newUrl.Length == 0 || newUrl.Contains("div class"))
                {
                    continue;
                }

                // Either this, or keep a separate list of already visited links.
                if (Program.newsLinks.Contains(newUrl) || links.Contains(newUrl))
                {
                    continue;
                }

                if (newUrl.StartsWith("/", StringComparison.Ordinal))
                {
                    // "//host/path" is protocol-relative and only needs a scheme. A single
                    // leading slash is site-relative; prefixing it with just "https:" would
                    // yield the invalid "https:/path", so it gets the full site root.
                    string prefix = newUrl.StartsWith("//", StringComparison.Ordinal) ? "https:" : siteRoot;
                    newUrl = prefix + newUrl;

                    // Re-check now that the link is in its absolute form.
                    if (Program.newsLinks.Contains(newUrl) || links.Contains(newUrl))
                    {
                        continue;
                    }
                }

                Console.WriteLine(newUrl);
                links.Add(newUrl);
            }

            manager.addLinksToQueue(links);
        }