Exemplo n.º 1
0
        static void Main(string[] args)
        {
            // Build a crawler, scrape table rows ("//tr") two levels deep from
            // the (currently empty) start address, then print what was found.
            var crawler = new Crawler();

            crawler.Crawl(@"", "//tr", 2);
            crawler.PrintArticles();

            // Keep the console window open until the user presses Enter.
            Console.ReadLine();
        }
Exemplo n.º 2
0
        static void Main(string[] args)
        {
            // Ask the user for the address the crawler should start from.
            Console.WriteLine("Please input address: ");

            // Console.ReadLine() already returns a string, so the previous
            // .ToString() was redundant — and it would throw a
            // NullReferenceException when stdin is closed (ReadLine returns null).
            string addr = Console.ReadLine() ?? string.Empty;

            Crawler myCrawler = new Crawler(addr);

            myCrawler.Crawl();
        }
Exemplo n.º 3
0
        /// <summary>
        /// Crawls <paramref name="url"/> on a thread-pool thread and reports completion.
        /// </summary>
        /// <param name="url">Address to crawl.</param>
        static async Task RunCrawl(string url)
        {
            // 'async void' makes exceptions unobservable and the call unawaitable.
            // Returning Task fixes both; existing fire-and-forget call sites still
            // compile (the returned Task is simply ignored).
            await Task.Run(() =>
            {
                crawler.Crawl(url, 0, 1);
            });

            Console.WriteLine("{0} has been crawled.", url);
        }
Exemplo n.º 4
0
        static async Task Main(string[] args)
        {
            var crawler  = new Crawler();
            var content1 = crawler.CrawlPage();

            // Awaiting instead of blocking on .Result avoids tying up a thread
            // and surfaces the real exception rather than an AggregateException.
            // (async Task Main is a valid entry point since C# 7.1.)
            var content2 = await crawler.Crawl();

            Logging.WriteToFile(content1);
            Logging.WriteToFile(content2);
        }
Exemplo n.º 5
0
        static void Main(string[] args)
        {
            // Alternative seed URLs kept for quick switching while testing:
            // var topLevelUrl = @"https://www.cnn.com";
            // var topLevelUrl = @"https://www.google.com";
            var topLevelUrl = @"https://www.redhat.com";

            // Timestamp the report file so successive runs never overwrite each other.
            var date       = DateTime.Now.ToString("yyyyMMdd_HHmmss");
            var reportFile = @"D:\Temp\GoWebCrawlerReport_" + date + ".txt";

            var crawler = new Crawler();
            crawler.Crawl(topLevelUrl, reportFile);
        }
Exemplo n.º 6
0
        void Run(string[] args)
        {
            // Alternative target sites kept for quick switching while testing:
            //var uri = new Uri("http://www.fzztb.gov.cn/index_629.htm");
            //var uri = new Uri("http://caigou.jdzol.com/html/list_1433.html");
            //var uri = new Uri("http://www.gzzfcg.gov.cn/products.asp?BigClassID=34&SmallClassID=1");
            var uri = new Uri("http://www.ncszfcg.gov.cn/more.cfm?sid=100002011&c_code=791");

            // Recognize the site layout so the matching URL filters can be applied.
            var siteType = HtmlParse.RecogSite(uri);

            // Configure a bounded crawl (depth and thread count both capped at 5).
            var c = new Crawler(uri, new HtmlDocumentProcessor(), new CrawlProcessor())
            {
                MaximumCrawlDepth  = 5,
                MaximumThreadCount = 5,
                IncludeFilter      = IncludeFilter(siteType),
                ExcludeFilter      = ExcludeFilter(siteType)
            };

            c.Crawl();

            // Signal completion and wait for a key press before closing the console.
            Console.Write("End");
            Console.ReadKey();
        }
Exemplo n.º 7
0
        public static void Main(string[] args)
        {
            // All output (logs, graph, saved URLs) goes under ./output.
            var configuration = new Configuration()
            {
                EnableLog             = true,
                OutputPath            = Path.Join(Directory.GetCurrentDirectory(), "output"),
                LogFilePath           = Path.Join(Directory.GetCurrentDirectory(), "output/crawler.log"),
                GraphFilePath         = Path.Join(Directory.GetCurrentDirectory(), "output/graph.json"),
                SaveRobotsFile        = true,
                SaveSitemapFiles      = false,
                SaveUrls              = true,
                DeleteHtmlAfterScrape = true,
                SerializeSite         = true,
                SerializeGraph        = true,
                HostUrlsLimit         = 1000,
                SitemapIndexLimit     = 1000
            };

            // 'using' guarantees the CancellationTokenSource is disposed even when
            // the crawl throws; the original only disposed it on the happy path.
            // Renamed from 'token' — it is the source, not the token itself.
            using var cts = new CancellationTokenSource();

            var seedUrls = new Uri[]
            {
                new Uri("https://www.google.com")
            };
            var crawler = new Crawler(configuration, seedUrls, cts.Token);

            // Run the crawl off the main thread so we can watch for a key press.
            var task = Task.Run(() =>
            {
                crawler.Crawl();
            });

            // Blocks until the crawl finishes or the user cancels via keyboard.
            Program.WaitUntilCompletedOrKeyPressed(cts, task);

            task.Wait();

            Console.WriteLine("Completed");
        }
        // Downloads one NASA SSE grid page for this instance's coordinates and
        // saves it under data\, retrying until the crawl reports success.
        public void Do()
        {
            // Record the start time so total duration can be reported at the end.
            DateTime dt = DateTime.Now;

            // Output file is named after the coordinates, e.g. "12.5,34.5.html".
            string fileName = _lat + "," + _lon + ".html";

            // Ensure the data folder exists. Directory.CreateDirectory is a no-op
            // when it already exists — no DirectoryInfo round-trip needed.
            Directory.CreateDirectory("data\\");

            // If the file already exists this point was crawled before — skip it.
            // File.Exists avoids allocating a FileInfo just for one check.
            if (File.Exists("data\\" + fileName))
            {
                Console.WriteLine("已存在文件:" + fileName);
                return;
            }

            // Number of attempts made for this data point.
            int count = 1;

            // URL pattern derived by inspecting the NASA SSE site in a browser.
            string link = "https://eosweb.larc.nasa.gov/cgi-bin/sse/grid.cgi?&num=182092&lat=" + _lat + "&hgt=100&submit=Submit&veg=17&sitelev=&email=&p=grid_id&step=2&lon=" + _lon;

            // Create the crawler instance for this link/file pair.
            Crawler crawler = new Crawler(link, "data\\" + fileName);

            // Retry the crawl in a loop until it succeeds.
            while (!crawler.Crawl())
            {
                ++count;
                Console.WriteLine("第" + count + "次尝试抓取" + fileName);
            }

            // Report total elapsed time and how many attempts were needed.
            TimeSpan ts = DateTime.Now - dt;

            Console.WriteLine("耗时" + ts + ",尝试" + count + "次");
        }
        // Downloads one NASA SSE grid page for this instance's coordinates and
        // saves it under data\, retrying until the crawl reports success.
        public void Do()
        {
            // Record the start time so total duration can be reported at the end.
            DateTime dt = DateTime.Now;
            // Output file is named after the coordinates, e.g. "12.5,34.5.html".
            string fileName = _lat + "," + _lon + ".html";

            // Create the data folder; if it already exists this is a no-op.
            DirectoryInfo dir = new DirectoryInfo("data\\");
            dir.Create();

            // If the file already exists this point was crawled before — skip it.
            if ((new FileInfo("data\\" + fileName)).Exists)
            {
                Console.WriteLine("已存在文件:" + fileName);
                return;
            }

            // Number of attempts made for this data point.
            int count = 1;
            // URL pattern derived by inspecting the NASA SSE site in a browser.
            string link = "https://eosweb.larc.nasa.gov/cgi-bin/sse/grid.cgi?&num=182092&lat=" + _lat + "&hgt=100&submit=Submit&veg=17&sitelev=&email=&p=grid_id&step=2&lon=" + _lon;
            // Create the crawler instance for this link/file pair.
            Crawler crawler = new Crawler(link, "data\\" + fileName);

            // Retry the crawl in a loop until it succeeds.
            while (!(crawler.Crawl()))
            {
                ++count;
                Console.WriteLine("第" + count + "次尝试抓取" + fileName);
            }

            // Report total elapsed time and how many attempts were needed.
            TimeSpan ts = DateTime.Now - dt;
            Console.WriteLine("耗时" + ts + ",尝试" + count + "次");
        }
        static void Main(string[] args)
        {
            // Wire the crawler up with its parser and storage strategy, then run it.
            var c = new Crawler(new HtmlParser(), new FileStorer());
            c.Crawl();
        }
Exemplo n.º 11
0
        /// <summary>
        /// Crawls <paramref name="url"/> on a thread-pool thread, then prompts
        /// the user for the next target.
        /// </summary>
        /// <param name="url">Address to crawl.</param>
        static async Task CrawlAsync(string url)
        {
            // 'async void' makes exceptions unobservable and the call unawaitable.
            // Returning Task fixes both; existing call sites still compile.
            await Task.Run(() => crawler.Crawl(url));

            Console.WriteLine("Where do you want to crawl?");
        }