/// <summary>
/// Runs the crawler against the given address and reports, through
/// <c>Outputer.Output</c>, whether the crawl finished with or without an error.
/// </summary>
/// <param name="url">
/// Address to start crawling from. It is passed straight to the <see cref="Uri"/>
/// constructor, so it must be a well-formed absolute URI.
/// </param>
public void Third_Start(string url)
{
    Uri startUri = new Uri(url);
    CrawlResult outcome = crawler.Crawl(startUri);

    // Pick the summary line first, then emit it from a single place.
    string summary = outcome.ErrorOccurred
        ? string.Format("抓取完成 {0} 错误: {1}", outcome.RootUri.AbsoluteUri, outcome.ErrorException.Message)
        : string.Format("抓取完成 {0} 并没发现错误", outcome.RootUri.AbsoluteUri);

    Outputer.Output(summary);
}
/// <summary>
/// Page-crawl-completed handler. Reports whether the page was fetched successfully;
/// on success it runs the page address through <c>transcoder</c> and stores the
/// extracted title and content as a <c>model.NoteInfo</c> via <c>Outputer.Add</c>.
/// Afterwards it reports separately if the crawled page has no text content.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data carrying the crawled page.</param>
public static void Completed(object sender, PageCrawlCompletedArgs e)
{
    // The whole handler body runs under lockobj, so invocations are serialized.
    lock (lockobj)
    {
        CrawledPage crawledPage = e.CrawledPage;
        string pageUrl = crawledPage.Uri.AbsoluteUri;

        // A request may fail without leaving either a WebException or a response.
        // Treat a missing response as a failed page instead of dereferencing it,
        // which would throw NullReferenceException and hide the failure.
        bool fetchFailed = crawledPage.WebException != null
            || crawledPage.HttpWebResponse == null
            || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK;

        if (fetchFailed)
        {
            Outputer.Output(string.Format("抓取页面失败: {0}", pageUrl));
        }
        else
        {
            Outputer.Output(string.Format("抓取页面成功 {0}", pageUrl));

            // The transcoder is given the page address, not the already-downloaded content.
            var result = transcoder.Transcode(new WebTranscodingInput(pageUrl));

            model.NoteInfo ni = new model.NoteInfo();
            ni.Title = result.ExtractedTitle;
            ni.FullText = result.ExtractedContent;
            Outputer.Add(ni);
        }

        // Reported independently of the success/failure message above.
        if (string.IsNullOrEmpty(crawledPage.Content.Text))
        {
            Outputer.Output(string.Format("抓取页面错误: 页面无内容 {0}", pageUrl));
        }
    }
}
/// <summary>
/// Page-crawl-disallowed handler. Reports the address of the page that will not be
/// crawled together with the reason supplied in the event data.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data carrying the page and the disallowed reason.</param>
public static void Disallowed(object sender, PageCrawlDisallowedArgs e)
{
    string skippedUrl = e.PageToCrawl.Uri.AbsoluteUri;

    // Note the format string prints the reason ({1}) before the address ({0}).
    string message = string.Format("由于产生错误:{1} 无法抓取页面{0}", skippedUrl, e.DisallowedReason);

    Outputer.Output(message);
}
/// <summary>
/// Page-links-crawl-disallowed handler. Reports the address of the crawled page whose
/// links will not be followed, together with the reason supplied in the event data.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data carrying the crawled page and the disallowed reason.</param>
public static void Disallowed(object sender, PageLinksCrawlDisallowedArgs e)
{
    string pageUrl = e.CrawledPage.Uri.AbsoluteUri;
    string message = string.Format("没有抓取 {0} 页上的链接 {1} ", pageUrl, e.DisallowedReason);

    Outputer.Output(message);
}
/// <summary>
/// Page-crawl-starting handler. Reports the address of the page about to be crawled
/// and the address of its parent page as recorded on the page-to-crawl object.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data carrying the page that is about to be crawled.</param>
public static void Starting(object sender, PageCrawlStartingArgs e)
{
    PageToCrawl upcoming = e.PageToCrawl;
    string targetUrl = upcoming.Uri.AbsoluteUri;
    string parentUrl = upcoming.ParentUri.AbsoluteUri;

    Outputer.Output(string.Format("关于抓取页面 {0} 上找到的链接 {1}", targetUrl, parentUrl));
}