Exemplo n.º 1
0
        /// <summary>
        /// Downloads https://www.dagens.dk/nyheder, takes the first double-quoted
        /// value of every "&lt;a " fragment found between the "block-system-main"
        /// and "footer clearfix" markers, prints each new link to the console and
        /// hands the collected links to the manager via addLinksToQueue.
        /// </summary>
        /// <exception cref="System.IndexOutOfRangeException">
        /// Thrown when the "block-system-main" marker is not present in the page.
        /// </exception>
        public void crawl()
        {
            UriBuilder ub = new UriBuilder("https://www.dagens.dk/nyheder");
            string webPage;

            // WebClient is IDisposable; release it as soon as the download is done.
            using (WebClient wc = new WebClient())
            {
                webPage = wc.DownloadString(ub.Uri.ToString());
            }

            // Keep only the markup after the start marker and before the footer marker.
            webPage = webPage.Split("block-system-main")[1];
            webPage = webPage.Split("footer clearfix")[0];

            List<string> links = new List<string>();

            foreach (string fragment in webPage.Split("<a "))
            {
                // The candidate link is the first double-quoted value in the fragment.
                // A fragment without any double quote has no such value; skip it
                // instead of failing with an IndexOutOfRangeException.
                string[] quotedParts = fragment.Split("\"");
                if (quotedParts.Length < 2)
                {
                    continue;
                }

                string newUrl = quotedParts[1];

                // Values shorter than 35 characters (which includes empty ones) are
                // ignored. NOTE(review): presumably this filters out non-article
                // links - confirm the threshold.
                if (newUrl.Length < 35)
                {
                    continue;
                }

                // Skip links that are already known globally or were already collected
                // in this run (the alternative would be keeping a list of visited links).
                if (Program.newsLinks.Contains(newUrl) || links.Contains(newUrl))
                {
                    continue;
                }

                Console.WriteLine(newUrl);
                links.Add(newUrl);
            }

            manager.addLinksToQueue(links);
        }
Exemplo n.º 2
0
        /// <summary>
        /// This method goes to dagens.dk, gets the news on the page
        /// and tells the manager to put them into its queue.
        /// Only the markup between the "block-system-main" and "footer clearfix"
        /// markers is inspected; for every "&lt;a " fragment the first
        /// double-quoted value is treated as the link.
        /// </summary>
        /// <exception cref="System.IndexOutOfRangeException">
        /// Thrown when the "block-system-main" marker is not present in the page.
        /// </exception>
        public void crawl()
        {
            UriBuilder ub = new UriBuilder("https://www.dagens.dk/nyheder");
            string webPage;

            // WebClient is IDisposable; release it as soon as the download is done.
            using (WebClient wc = new WebClient())
            {
                webPage = wc.DownloadString(ub.Uri.ToString());
            }

            // Keep only the markup after the start marker and before the footer marker.
            webPage = webPage.Split("block-system-main")[1];
            webPage = webPage.Split("footer clearfix")[0];

            List<string> links = new List<string>();

            foreach (string fragment in webPage.Split("<a "))
            {
                // A fragment without any double quote carries no quoted value;
                // skip it instead of failing with an IndexOutOfRangeException.
                string[] quotedParts = fragment.Split("\"");
                if (quotedParts.Length < 2)
                {
                    continue;
                }

                string newUrl = quotedParts[1];

                // Values shorter than 35 characters (which includes empty ones) are
                // ignored. NOTE(review): presumably this filters out non-article
                // links - confirm the threshold.
                if (newUrl.Length < 35)
                {
                    continue;
                }

                // Skip links already known globally or already collected in this run.
                if (Program.newsLinks.Contains(newUrl) || links.Contains(newUrl))
                {
                    continue;
                }

                links.Add(newUrl);
            }

            manager.addLinksToQueue(links);
        }
Exemplo n.º 3
0
        /// <summary>
        /// Downloads https://nyheder.tv2.dk/, concatenates the first two sections
        /// that follow the "o-deck g-con g-col g-row_l g-gutter g-colx" marker,
        /// cuts the result at "section_load_more_from_term-loadmore" and treats the
        /// first double-quoted value of every "&lt;a " fragment as a link.
        /// New links are printed to the console and handed to the manager via
        /// addLinksToQueue.
        /// </summary>
        /// <exception cref="System.IndexOutOfRangeException">
        /// Thrown when the section marker occurs fewer than two times in the page.
        /// </exception>
        public void crawl()
        {
            UriBuilder ub = new UriBuilder("https://nyheder.tv2.dk/");
            string webPage;

            // WebClient is IDisposable; release it as soon as the download is done.
            using (WebClient wc = new WebClient())
            {
                webPage = wc.DownloadString(ub.Uri.ToString());
            }

            // Split once and reuse the result instead of splitting the whole page twice.
            string[] sections = webPage.Split("o-deck g-con g-col g-row_l g-gutter g-colx");
            webPage = sections[1] + sections[2];
            webPage = webPage.Split("section_load_more_from_term-loadmore")[0];

            List<string> links = new List<string>();

            foreach (string fragment in webPage.Split("<a "))
            {
                // A fragment without any double quote carries no quoted value;
                // skip it instead of failing with an IndexOutOfRangeException.
                string[] quotedParts = fragment.Split("\"");
                if (quotedParts.Length < 2)
                {
                    continue;
                }

                string newUrl = quotedParts[1];

                // Empty values and values that captured markup ("div class") are not links.
                if (newUrl.Length == 0 || newUrl.Contains("div class"))
                {
                    continue;
                }

                // Skip links that are already known globally or were already collected
                // in this run (the alternative would be keeping a list of visited links).
                if (Program.newsLinks.Contains(newUrl) || links.Contains(newUrl))
                {
                    continue;
                }

                if (newUrl.StartsWith("/", StringComparison.Ordinal))
                {
                    // "//host/path" is protocol-relative and only needs a scheme;
                    // "/path" is root-relative and needs scheme and host. Prefixing
                    // the latter with just "https:" would produce "https:/path".
                    newUrl = newUrl.StartsWith("//", StringComparison.Ordinal)
                        ? "https:" + newUrl
                        : "https://nyheder.tv2.dk" + newUrl;

                    // Re-check: the absolute form may already be known.
                    if (Program.newsLinks.Contains(newUrl) || links.Contains(newUrl))
                    {
                        continue;
                    }
                }

                Console.WriteLine(newUrl);
                links.Add(newUrl);
            }

            manager.addLinksToQueue(links);
        }
Exemplo n.º 4
0
        /// <summary>
        /// This method goes to bt.dk/nyheder, gets the news on the page
        /// and tells the manager to put them into its queue.
        /// Only the markup between the "container bg " and "site-footer" markers is
        /// inspected; for every "&lt;a " fragment the first double-quoted value is
        /// treated as the link, and relative links are made absolute.
        /// </summary>
        /// <exception cref="System.IndexOutOfRangeException">
        /// Thrown when the "container bg " marker is not present in the page.
        /// </exception>
        public void crawl()
        {
            UriBuilder ub = new UriBuilder("https://www.bt.dk/nyheder");
            string webPage;

            // WebClient is IDisposable; release it as soon as the download is done.
            using (WebClient wc = new WebClient())
            {
                webPage = wc.DownloadString(ub.Uri.ToString());
            }

            // Keep only the markup after the start marker and before the footer marker.
            webPage = webPage.Split("container bg ")[1];
            webPage = webPage.Split("site-footer")[0];

            List<string> links = new List<string>();

            foreach (string fragment in webPage.Split("<a "))
            {
                // A fragment without any double quote carries no quoted value;
                // skip it instead of failing with an IndexOutOfRangeException.
                string[] quotedParts = fragment.Split("\"");
                if (quotedParts.Length < 2)
                {
                    continue;
                }

                string newUrl = quotedParts[1];

                // Empty values and values that captured markup ("id=") are not links.
                if (newUrl.Length == 0 || newUrl.Contains("id="))
                {
                    continue;
                }

                // Skip links already known globally or already collected in this run.
                if (Program.newsLinks.Contains(newUrl) || links.Contains(newUrl))
                {
                    continue;
                }

                if (newUrl.StartsWith("/", StringComparison.Ordinal))
                {
                    // "//host/path" is protocol-relative and only needs a scheme;
                    // prefixing it with the site origin would produce
                    // "https://www.bt.dk//host/path". "/path" is root-relative and
                    // gets the site origin.
                    newUrl = newUrl.StartsWith("//", StringComparison.Ordinal)
                        ? "https:" + newUrl
                        : "https://www.bt.dk" + newUrl;

                    // Re-check: the absolute form may already be known.
                    if (Program.newsLinks.Contains(newUrl) || links.Contains(newUrl))
                    {
                        continue;
                    }
                }

                links.Add(newUrl);
            }

            manager.addLinksToQueue(links);
        }
Exemplo n.º 5
0
        /// <summary>
        /// This method goes to nyheder.tv2.dk, gets the news on the page
        /// and tells the manager to put them into its queue.
        /// The first two sections following the
        /// "o-deck g-con g-col g-row_l g-gutter g-colx" marker are concatenated and
        /// cut at "section_load_more_from_term-loadmore"; for every "&lt;a "
        /// fragment the first double-quoted value is treated as the link, and
        /// relative links are made absolute.
        /// </summary>
        /// <exception cref="System.IndexOutOfRangeException">
        /// Thrown when the section marker occurs fewer than two times in the page.
        /// </exception>
        public void crawl()
        {
            UriBuilder ub = new UriBuilder("https://nyheder.tv2.dk/");
            string webPage;

            // WebClient is IDisposable; release it as soon as the download is done.
            using (WebClient wc = new WebClient())
            {
                webPage = wc.DownloadString(ub.Uri.ToString());
            }

            // Split once and reuse the result instead of splitting the whole page twice.
            string[] sections = webPage.Split("o-deck g-con g-col g-row_l g-gutter g-colx");
            webPage = sections[1] + sections[2];
            webPage = webPage.Split("section_load_more_from_term-loadmore")[0];

            List<string> links = new List<string>();

            foreach (string fragment in webPage.Split("<a "))
            {
                // A fragment without any double quote carries no quoted value;
                // skip it instead of failing with an IndexOutOfRangeException.
                string[] quotedParts = fragment.Split("\"");
                if (quotedParts.Length < 2)
                {
                    continue;
                }

                string newUrl = quotedParts[1];

                // Empty values and values that captured markup ("div class") are not links.
                if (newUrl.Length == 0 || newUrl.Contains("div class"))
                {
                    continue;
                }

                // Skip links already known globally or already collected in this run.
                if (Program.newsLinks.Contains(newUrl) || links.Contains(newUrl))
                {
                    continue;
                }

                if (newUrl.StartsWith("/", StringComparison.Ordinal))
                {
                    // "//host/path" is protocol-relative and only needs a scheme;
                    // "/path" is root-relative and needs scheme and host. Prefixing
                    // the latter with just "https:" would produce "https:/path".
                    newUrl = newUrl.StartsWith("//", StringComparison.Ordinal)
                        ? "https:" + newUrl
                        : "https://nyheder.tv2.dk" + newUrl;

                    // Re-check: the absolute form may already be known.
                    if (Program.newsLinks.Contains(newUrl) || links.Contains(newUrl))
                    {
                        continue;
                    }
                }

                links.Add(newUrl);
            }

            manager.addLinksToQueue(links);
        }