static void Main(string[] args)
{
    Crawler crawler = new Crawler();
    crawler.Crawl(@"", "//tr", 2);
    crawler.PrintArticles();
    Console.ReadLine();
}
static void Main(string[] args)
{
    Console.WriteLine("Please input address: ");
    string addr = Console.ReadLine();
    Crawler myCrawler = new Crawler(addr);
    myCrawler.Crawl();
}
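Console.ReadLine() returns null when standard input is closed, so a guard before constructing the crawler makes the example safer to run unattended. A sketch of the same entry point with that check added (the guard is an addition for illustration, not part of the original):

static void Main(string[] args)
{
    Console.WriteLine("Please input address: ");
    string addr = Console.ReadLine();
    // ReadLine() returns null on end-of-input; bail out rather than
    // pass a null address to the crawler.
    if (string.IsNullOrWhiteSpace(addr))
    {
        Console.WriteLine("No address given.");
        return;
    }
    Crawler myCrawler = new Crawler(addr);
    myCrawler.Crawl();
}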
static async Task RunCrawl(string url)
{
    // Run the blocking Crawl call on a thread-pool thread so callers can await it.
    await Task.Run(() => crawler.Crawl(url, 0, 1));
    Console.WriteLine("{0} has been crawled.", url);
}
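This method assumes a crawler field in the enclosing class. A sketch of how it might be declared and awaited from an async entry point; the Crawler type and the Crawl(url, depth, threads) signature are taken from the example itself, not from any documented library:

static Crawler crawler = new Crawler();

static async Task Main(string[] args)
{
    // Unlike async void, an async Task method can be awaited,
    // so exceptions thrown during the crawl surface here.
    await RunCrawl("https://example.com");
    Console.WriteLine("Done.");
}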
static void Main(string[] args)
{
    var crawler = new Crawler();
    var content1 = crawler.CrawlPage();
    var content2 = crawler.Crawl().Result;
    Logging.WriteToFile(content1);
    Logging.WriteToFile(content2);
}
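Blocking on .Result works in a console Main, but it wraps failures in AggregateException and deadlocks under a UI or ASP.NET synchronization context. Since C# 7.1 the same example can use an async entry point instead (a sketch reusing the example's Crawler and Logging types):

static async Task Main(string[] args)
{
    var crawler = new Crawler();
    var content1 = crawler.CrawlPage();
    // Await instead of .Result: no thread is blocked and exceptions
    // surface directly rather than wrapped in AggregateException.
    var content2 = await crawler.Crawl();
    Logging.WriteToFile(content1);
    Logging.WriteToFile(content2);
}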
static void Main(string[] args)
{
    // var topLevelUrl = @"https://www.cnn.com";
    // var topLevelUrl = @"https://www.google.com";
    var topLevelUrl = @"https://www.redhat.com";
    var date = DateTime.Now.ToString("yyyyMMdd_HHmmss");
    var reportFile = @"D:\Temp\GoWebCrawlerReport_" + date + ".txt";
    Crawler crawler = new Crawler();
    crawler.Crawl(topLevelUrl, reportFile);
}
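The report path is built by string concatenation, which works but leaves separator handling to the caller; Path.Combine from System.IO takes care of that. A drop-in alternative for the two path lines, keeping the example's names:

var date = DateTime.Now.ToString("yyyyMMdd_HHmmss");
// Path.Combine inserts the directory separator correctly on any platform.
var reportFile = Path.Combine(@"D:\Temp", "GoWebCrawlerReport_" + date + ".txt");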
void Run(string[] args)
{
    //var uri = new Uri("http://www.fzztb.gov.cn/index_629.htm");
    //var uri = new Uri("http://caigou.jdzol.com/html/list_1433.html");
    //var uri = new Uri("http://www.gzzfcg.gov.cn/products.asp?BigClassID=34&SmallClassID=1");
    var uri = new Uri("http://www.ncszfcg.gov.cn/more.cfm?sid=100002011&c_code=791");
    var siteType = HtmlParse.RecogSite(uri);
    Crawler c = new Crawler(uri, new HtmlDocumentProcessor(), new CrawlProcessor())
    {
        MaximumCrawlDepth = 5,
        MaximumThreadCount = 5,
        IncludeFilter = IncludeFilter(siteType),
        ExcludeFilter = ExcludeFilter(siteType)
    };
    c.Crawl();
    Console.Write("End");
    Console.ReadKey();
}
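The IncludeFilter and ExcludeFilter factories are not shown in the example. One plausible shape, assuming the crawler's filter properties accept regular-expression patterns (from System.Text.RegularExpressions) and that SiteType is an enum produced by HtmlParse.RecogSite; everything here, including SiteType.Unknown, is a hypothetical inference from the call site, not the original code:

// Hypothetical: SiteType and these patterns are not in the original.
static Regex[] IncludeFilter(SiteType siteType)
{
    // For unrecognized sites, follow everything; for recognized
    // procurement sites, follow only list/detail-style pages.
    if (siteType == SiteType.Unknown)
        return new[] { new Regex(".*") };
    return new[] { new Regex(@"\.(cfm|asp|htm|html)(\?|$)", RegexOptions.IgnoreCase) };
}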
public static void Main(string[] args)
{
    var configuration = new Configuration()
    {
        EnableLog = true,
        OutputPath = Path.Join(Directory.GetCurrentDirectory(), "output"),
        LogFilePath = Path.Join(Directory.GetCurrentDirectory(), "output/crawler.log"),
        GraphFilePath = Path.Join(Directory.GetCurrentDirectory(), "output/graph.json"),
        SaveRobotsFile = true,
        SaveSitemapFiles = false,
        SaveUrls = true,
        DeleteHtmlAfterScrape = true,
        SerializeSite = true,
        SerializeGraph = true,
        HostUrlsLimit = 1000,
        SitemapIndexLimit = 1000
    };
    var token = new CancellationTokenSource();
    var seedUrls = new Uri[] { new Uri("https://www.google.com") };
    var crawler = new Crawler(configuration, seedUrls, token.Token);
    var task = Task.Run(() => { crawler.Crawl(); });
    Program.WaitUntilCompletedOrKeyPressed(token, task);
    task.Wait();
    token.Dispose();
    Console.WriteLine("Completed");
}
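Program.WaitUntilCompletedOrKeyPressed is not shown in the example. One plausible implementation, assuming it is meant to let a key press cancel the crawl through the CancellationTokenSource; this sketch (which needs System.Threading for Thread.Sleep) is inferred from the call site, not taken from the original project:

static void WaitUntilCompletedOrKeyPressed(CancellationTokenSource cts, Task task)
{
    Console.WriteLine("Press any key to stop the crawl...");
    while (!task.IsCompleted)
    {
        if (Console.KeyAvailable)
        {
            Console.ReadKey(true);
            cts.Cancel(); // cooperative cancellation via the token passed to Crawler
            break;
        }
        Thread.Sleep(100); // poll at a coarse interval
    }
}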
// Worker method: fetch one grid cell and save it to disk.
public void Do()
{
    // Record the start time of this task.
    DateTime dt = DateTime.Now;
    // Build the file name from the coordinates.
    string fileName = _lat + "," + _lon + ".html";
    // Fixes an earlier bug: the program now creates the data folder itself;
    // Create() is a no-op if the folder already exists.
    DirectoryInfo dir = new DirectoryInfo("data\\");
    dir.Create();
    // If the file already exists, this cell has been fetched; cancel the task.
    if ((new FileInfo("data\\" + fileName)).Exists)
    {
        Console.WriteLine("File already exists: " + fileName);
        return;
    }
    // Count how many attempts it takes to fetch this data.
    int count = 1;
    // Build the request URL. Nothing clever here: it was worked out by watching
    // the browser's address bar on the NASA site and spotting the pattern.
    string link = "https://eosweb.larc.nasa.gov/cgi-bin/sse/grid.cgi?&num=182092&lat=" + _lat +
        "&hgt=100&submit=Submit&veg=17&sitelev=&email=&p=grid_id&step=2&lon=" + _lon;
    // Create the crawler instance.
    Crawler crawler = new Crawler(link, "data\\" + fileName);
    // Keep retrying until the fetch succeeds.
    while (!crawler.Crawl())
    {
        ++count;
        Console.WriteLine("Attempt " + count + " to fetch " + fileName);
    }
    // Compute and print the total elapsed time.
    TimeSpan ts = DateTime.Now - dt;
    Console.WriteLine("Elapsed " + ts + ", " + count + " attempts");
}
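The retry loop above spins forever on a permanently failing URL. A small variation with a capped attempt count and a pause between tries, reusing the example's crawler and fileName (the cap and delay values are arbitrary, and Thread.Sleep needs System.Threading):

const int MaxAttempts = 10;
int attempts = 0;
bool ok = false;
while (!ok && attempts < MaxAttempts)
{
    ++attempts;
    ok = crawler.Crawl();
    if (!ok)
    {
        Console.WriteLine("Attempt " + attempts + " failed, retrying...");
        Thread.Sleep(TimeSpan.FromSeconds(5)); // back off before the next try
    }
}
if (!ok)
{
    Console.WriteLine("Giving up on " + fileName + " after " + MaxAttempts + " attempts");
}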
static void Main(string[] args)
{
    Crawler c = new Crawler(new HtmlParser(), new FileStorer());
    c.Crawl();
}
static async Task CrawlAsync(string url)
{
    // Run the blocking Crawl call on a thread-pool thread, then prompt again.
    await Task.Run(() => crawler.Crawl(url));
    Console.WriteLine("Where do you want to crawl?");
}
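Across these examples, Crawl is called with anything from no arguments to a URL plus depth and thread counts, so they clearly come from different Crawler classes. For readers who want to run the simplest single-URL pattern without the original projects, here is a minimal stand-in; it only fetches one page with HttpClient and is not the implementation behind any of the examples above:

using System;
using System.Net.Http;

class Crawler
{
    static readonly HttpClient http = new HttpClient();

    // Fetch a single page synchronously; a real crawler would extract
    // links from the body and recurse with depth and politeness limits.
    public void Crawl(string url)
    {
        string body = http.GetStringAsync(url).GetAwaiter().GetResult();
        Console.WriteLine("Fetched {0} ({1} chars)", url, body.Length);
    }
}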