private void ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    if (e.CrawledPage.HttpWebResponse != null && e.CrawledPage.HttpWebResponse.StatusCode == HttpStatusCode.OK)
        _crawlingStats.ProcessCrawledPage(e.CrawledPage);
}
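The snippets in this listing are all subscribers to the same crawl-completed event. For orientation, a minimal wiring sketch follows, assuming Abot 1.x (PoliteWebCrawler and the PageCrawlCompletedAsync event; names differ in Abot 2.x) and a placeholder seed URL.

// A minimal wiring sketch, assuming Abot 1.x: PoliteWebCrawler and the
// PageCrawlCompletedAsync event (names differ in Abot 2.x). The seed URL
// is a placeholder.
static void Main(string[] args)
{
    var crawler = new PoliteWebCrawler();

    // Any handler with the (object, PageCrawlCompletedArgs) signature,
    // such as those in this listing, can subscribe here.
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;

    // Crawl synchronously; the handler fires once per crawled page.
    CrawlResult result = crawler.Crawl(new Uri("http://example.com"));
    Console.WriteLine(result.ErrorOccurred ? "Crawl errored" : "Crawl completed");
}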
static void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    // Extract the auction title from the first <h1> tag.
    HtmlNodeCollection aTags = e.CrawledPage.HtmlDocument.DocumentNode.SelectNodes("//h1");
    string name = "";
    if (aTags != null && aTags.Count > 0)
    {
        name = aTags[0].InnerText;
        Console.WriteLine(name);
    }

    // Inspect the bid-history table to see whether the auction received bids.
    HtmlNodeCollection aTr = e.CrawledPage.HtmlDocument.DocumentNode.SelectNodes("//table[@id='auction-bidhist']//tr");
    if (aTr != null && aTr.Count > 1 && aTr[1].InnerText.IndexOf("No") == -1)
    {
        if (aTr[0].InnerText.IndexOf("Winning") == -1)
        {
            // Placeholder: auction has bids but no declared winner; process data here.
        }
    }
}
public void Constructor_ValidArg_SetsPublicProperty()
{
    CrawledPage page = new CrawledPage(new Uri("http://aaa.com/"));
    PageCrawlCompletedArgs uut = new PageCrawlCompletedArgs(new CrawlContext(), page);

    Assert.AreSame(page, uut.CrawledPage);
}
static void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    saveWebPage(e);

    if (e.CrawledPage.Uri.AbsoluteUri.Contains(DemoParameters.shouldSaveToPropertyTableParameter))
        saveProperty(e);
}
private void crawler_PageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    PageResult pageResult = new PageResult();
    pageResult.Url = e.CrawledPage.Uri.AbsoluteUri;

    if (e.CrawledPage.HttpWebResponse != null)
        pageResult.HttpStatusCode = Convert.ToInt32(e.CrawledPage.HttpWebResponse.StatusCode);

    _actualCrawledPages.Add(pageResult);
}
void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    CrawledPage crawledPage = e.CrawledPage;

    if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
        Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
    else
        Console.WriteLine("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);

    if (string.IsNullOrEmpty(crawledPage.Content.Text))
        Console.WriteLine("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
}
static void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    CrawledPage crawledPage = e.CrawledPage;

    if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
        SaveURLFail(crawledPage.Uri.AbsoluteUri);
    else
        SaveURLSuccess(crawledPage.Uri.AbsoluteUri);

    if (string.IsNullOrEmpty(crawledPage.RawContent))
        SaveURLNoContent(crawledPage.Uri.AbsoluteUri);
}
private static void OnPageCrawlCompleted(object p_Sender, PageCrawlCompletedArgs p_PageCrawlCompletedArgs)
{
    CrawledPage page = p_PageCrawlCompletedArgs.CrawledPage;

    if (page.WebException != null)
    {
        Console.WriteLine("Crawl of page \"{0}\" failed{2}. Error: {1}",
            page.Uri.AbsoluteUri,
            page.WebException.Message,
            page.IsRetry ? String.Format(" (Retry #{0})", page.RetryCount) : "");
    }
    else if (page.HttpWebResponse.StatusCode != HttpStatusCode.OK)
    {
        Console.WriteLine("Crawl of page \"{0}\" failed{2}. HTTP Status Code: {1}",
            page.Uri.AbsoluteUri,
            page.HttpWebResponse.StatusCode,
            page.IsRetry ? String.Format(" (Retry #{0})", page.RetryCount) : "");
    }
    else
    {
        Console.WriteLine("Page crawl completed [{0}]", page.Uri.AbsoluteUri);
    }
}
public void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    backgroundWorker.ReportProgress(1);

    string name = "", seller = "", buyer = "";

    // Auction title from the first <h1> tag.
    HtmlNodeCollection aTags = e.CrawledPage.HtmlDocument.DocumentNode.SelectNodes("//h1");
    if (aTags != null && aTags.Count > 0)
        name = aTags[0].InnerText;

    // Seller name from the first link inside an <h4>.
    HtmlNodeCollection sellerA = e.CrawledPage.HtmlDocument.DocumentNode.SelectNodes("//h4//a");
    if (sellerA != null && sellerA.Count > 0)
        seller = sellerA[0].InnerText;

    // Buyer name from the bid-history table, if the auction has bids and a winner.
    HtmlNodeCollection aTr = e.CrawledPage.HtmlDocument.DocumentNode.SelectNodes("//table[@id='auction-bidhist']//tr");
    if (aTr != null && aTr.Count > 1 && aTr[1].InnerText.IndexOf("No") == -1)
    {
        if (aTr[0].InnerText.IndexOf("Winning") == -1)
        {
            HtmlNodeCollection abbTr = aTr[1].SelectNodes("//td//a");
            if (abbTr != null && abbTr.Count > 0)
                buyer = abbTr[0].InnerText;
        }
    }

    if (name.Trim().Length > 0)
        SetText(name, seller, buyer, e.CrawledPage.Uri.ToString());
}
/// <summary>
/// Asynchronous event that is fired when an individual page has been crawled.
/// </summary>
public static void ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    if (e.CrawledPage.WebException != null || e.CrawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
    {
        Console.WriteLine("Crawl of page failed: {0}\n", e.CrawledPage.Uri.AbsoluteUri);
    }
    else if (string.IsNullOrEmpty(e.CrawledPage.RawContent))
    {
        Console.WriteLine("Page crawled had no content: {0}\n", e.CrawledPage.Uri.AbsoluteUri);
    }
    else
    {
        string uri = e.CrawledPage.Uri.AbsoluteUri;
        DateTime lastModified = e.CrawledPage.HttpWebResponse.LastModified;

        CrawledWebPage page = new CrawledWebPage(lastModified, e.CrawledPage.HtmlDocument.DocumentNode);
        _crawledPages.Add(uri, page);
    }
}
static void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    CrawledPage crawledPage = e.CrawledPage;

    if (Regex.IsMatch(crawledPage.Uri.ToString(), @"http://www.tingchina.com/erge/(\d+)/play_(\d+)_(\d+)\.htm"))
    {
        var csQuery = crawledPage.CsQueryDocument.Find("iframe");
        foreach (var query in csQuery)
        {
            if (query.Name != "playmedia")
                continue;

            // Fetch the iframe source and pull the real media URL out of the embedded script.
            var playSrc = query.GetAttribute("src");
            var request = String.Format("http://{0}{1}", crawledPage.Uri.Host, playSrc);
            var http = WebRequest.Create(request);

            using (var response = http.GetResponse())
            using (var stream = response.GetResponseStream())
            using (var sr = new StreamReader(stream))
            {
                var content = sr.ReadToEnd();
                var realUrlIndex = content.IndexOf("url[3]=");
                var realUrlIndex1 = content.IndexOf(";", realUrlIndex);
                var realUrl = content.Substring(realUrlIndex, realUrlIndex1 - realUrlIndex);
                Console.WriteLine("!!!!!!!!!!!!!!!!!!!! " + realUrl + " !!!!!!!!!!!!!!!!!!!!");
            }
        }
    }

    if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
        Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
    else
        Console.WriteLine("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);

    if (string.IsNullOrEmpty(crawledPage.Content.Text))
        Console.WriteLine("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
}
private void ThrowExceptionWhen_PageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    throw new Exception("Oh No!");
}
static void saveWebPage(PageCrawlCompletedArgs e)
{
    // Save crawl timing data to the WebPage table.
    var webpageContext = new WebPageDataContext(DemoParameters.connectionString);

    WebPage page = new WebPage
    {
        pageUrl = e.CrawledPage.Uri.ToString(),
        parentUrl = e.CrawledPage.ParentUri.ToString(),
        requestStartTime = e.CrawledPage.RequestStarted.ToString(),
        requestEndTime = e.CrawledPage.RequestCompleted.ToString(),
        downloadStartTime = e.CrawledPage.DownloadContentStarted.ToString(),
        downloadEndTime = e.CrawledPage.DownloadContentCompleted.ToString(),
        // The raw html (e.CrawledPage.Content.Text) is intentionally not stored.
        pageHtml = ""
    };
    webpageContext.WebPages.InsertOnSubmit(page);

    try
    {
        webpageContext.SubmitChanges();
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex);
        // Make some adjustments, then try again.
        webpageContext.SubmitChanges();
    }
    finally
    {
        if (webpageContext != null)
            webpageContext.Dispose();
    }
}
static void saveProperty(PageCrawlCompletedArgs e)
{
    // Save scraped listing details to the Property table.
    var propertyContext = new PropertyDataContext(DemoParameters.connectionString);

    // Each field follows the same pattern: select a node by XPath and take its
    // trimmed inner text, defaulting to "" when the node is missing.
    HtmlNode doc = e.CrawledPage.HtmlDocument.DocumentNode;
    Func<string, string> text = xpath =>
    {
        HtmlNode node = doc.SelectSingleNode(xpath);
        return node != null ? node.InnerText.Trim() : "";
    };

    Property p = new Property
    {
        pageUrl = e.CrawledPage.Uri.ToString(),
        address = text("//span[@class='js-address']"),
        price = text("//dd[@class='price']"),
        propertyType = text("//dd[@class='propertytype']"),
        saleType = text("//dd[@class='saleType']"),
        saleDate = text("//dd[@class='saleDate']"),
        suburb = "",
        landSize = text("//dd[@class='land']"),
        propertyFeature = text("//p[@class='features']"),
        agents = text("//ul[@class='cB-agentList']"),
        schoolData = text("//div[@class='schoolData bdy collapsible collapsed']"),
        propertyDescription = text("//div[@class='cT-productDescription']")
    };
    propertyContext.Properties.InsertOnSubmit(p);

    try
    {
        propertyContext.SubmitChanges();
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex);
        // Make some adjustments, then try again.
        propertyContext.SubmitChanges();
    }
    finally
    {
        if (propertyContext != null)
            propertyContext.Dispose();
    }
}
void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    try
    {
        SaveContent(e.CrawledPage);
    }
    catch (Exception ex)
    {
        LogHelper.Instance.WriteError(ex, GetType(), MethodBase.GetCurrentMethod().Name);
    }
}
private void PageCrawlCompletedEvent(object sender, PageCrawlCompletedArgs e)
{
}
static void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    //Process data
}
private void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    CrawledPage crawledPage = e.CrawledPage;
    bool externalLinksFound = false;

    _logger.DebugFormat("Page Crawl Completed {0}; Status {1}; Source URL: {2}; CrawlerId: {3}; SessionId: {4}",
        crawledPage.Uri.AbsoluteUri,
        crawledPage.HttpWebResponse.StatusCode,
        crawledPage.ParentUri.AbsoluteUri,
        crawledPage.PageBag.CrawlerId,
        crawledPage.PageBag.SessionId);

    //----------------------------------------
    // Create and store the crawled link.
    var crawledLink = new CrawledLink();
    crawledLink.SessionId = crawledPage.PageBag.SessionId;
    crawledLink.CrawlerId = crawledPage.PageBag.CrawlerId;
    crawledLink.SourceUrl = crawledPage.ParentUri.AbsoluteUri;
    crawledLink.TargetUrl = crawledPage.Uri.AbsoluteUri; // what was crawled
    crawledLink.StatusCode = crawledPage.HttpWebResponse.StatusCode;
    crawledLink.IsRoot = crawledPage.IsRoot;
    crawledLink.CrawlDepth = crawledPage.CrawlDepth;

    if (crawledPage.WebException != null)
    {
        // Store error information if an exception occurred.
        crawledLink.ErrorOccurred = true;
        crawledLink.Exception = crawledPage.WebException.Message; //TODO store more data of the exception

        _logger.Error(string.Format("A WebException occurred for Target Url: {0}; Source URL: {1}; CrawlerId: {2}; SessionId: {3}",
            crawledLink.TargetUrl, crawledLink.SourceUrl, crawledLink.CrawlerId, crawledLink.SessionId),
            crawledPage.WebException);
    }

    _scheduler.RecordCrawledLink(crawledLink);

    //----------------------------------------
    // Check if the page should be processed; if so, extract the title, keywords,
    // description, cookies, etc. from the page and save the processed data.
    if (crawledPage.WebException == null)
    {
        if (IsPageToBeProcessed(crawledPage.Uri, crawledPage.HttpWebResponse.StatusCode))
        {
            using (var processor = _provider.GetInstanceOf<ICrawledPageProcessor>())
            {
                var result = processor.ProcessPage(crawledPage);
                _repo.AddProcessedPage(result);
            }
        }

        externalLinksFound = _scheduler.ProcessParsedLinks(crawledPage);
        if (externalLinksFound)
            OnExternalLinksFound(CrawlerId, crawledPage.Uri);
    }

    string mssg = null;
    if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
    {
        mssg = string.Format("Crawl of page failed {0}; source: {1}", crawledPage.Uri.AbsoluteUri, crawledPage.ParentUri.AbsoluteUri);
        _logger.Error(mssg);
    }
    else
    {
        mssg = string.Format("Crawl of page succeeded {0}; source: {1}", crawledPage.Uri.AbsoluteUri, crawledPage.ParentUri.AbsoluteUri);
        _logger.Debug(mssg);
    }

    if (string.IsNullOrEmpty(crawledPage.Content.Text))
    {
        mssg = string.Format("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
        _logger.Error(mssg);
    }

    OnLinkCrawlCompleted(CrawlerDefinition,
        crawledPage.ParentUri.AbsoluteUri,
        crawledPage.Uri.AbsoluteUri,
        crawledPage.HttpWebResponse.StatusCode,
        crawledPage.WebException != null,
        externalLinksFound);
}
static void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    CrawledPage crawledPage = e.CrawledPage;

    // Guard against a null WebException: a non-OK status can occur without one.
    if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
        Debug.WriteLine(string.Format("Crawl of page {0} failed. Reason: {1}",
            crawledPage.Uri.AbsoluteUri,
            crawledPage.WebException != null ? crawledPage.WebException.Message : crawledPage.HttpWebResponse.StatusCode.ToString()));

    string text = crawledPage.Content.Text;
    if (string.IsNullOrEmpty(text))
    {
        Console.WriteLine("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
    }
    else if (text.Contains("活动八")) // "activity eight" already present: nothing to parse
    {
    }
    else
    {
        // Match numbered activity blocks: "活动N ... 活动时间 ..." ("activity N ... event time ...").
        var c = Regex.Matches(text, "活动(一|二|三|四|五|六|七|八|九|十)(.|\n)*?活动时间(.|\n)*?<br");
        if (c.Count == 0)
        {
            // Fall back to the thread subject when no numbered activity block is found.
            var a = Regex.Match(text, "thread_subject(.|\n)*?活动(.|\n){1,50}?</a");
            if (a.Value != "")
            {
                var b = Regex.Match(text, "活动时间(.|\n)*?<br");
                var time = Regex.Match(b.Value, "\\d{1,2}月\\d{1,2}(-|日)").Value;
                Debug.Write(DateTime.Parse(time) + " ");
                Debug.WriteLine(Regex.Match(a.Value, "活动(.|\n)*?</a").Value.Replace("】", ":").Replace("<br", "").Replace("</a", ""));
            }
        }

        // Activity names to track ("古城探宝" = "Ancient City Treasure Hunt", "天降神剑" = "Sword from Heaven").
        List<string> list = new List<string>() { "古城探宝", "天降神剑" };
        foreach (Match item in c)
        {
            string date = Regex.Match(item.Value, "\\d{1,2}月\\d{1,2}(-|日)").Value;
            if (date != "" && !date.Contains("日"))
                date = date.Replace("-", "日");

            Debug.Write(DateTime.Parse(date) + " ");
            Debug.WriteLine(Regex.Match(item.Value, "活动(一|二|三|四|五|六|七|八|九|十)(.|\n)*?<br").Value.Replace("】", ":").Replace("<br", "").Replace("</a", ""));

            var index = list.FindIndex(x => item.Value.Contains(x));
            if (index != -1)
            {
                Activity activity = new Activity();
                activity.Name = list[index];
                activity.Date = DateTime.Parse(date);
            }
        }
    }
}
void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    CrawledPage crawledPage = e.CrawledPage;
    var crawledPageUri = crawledPage.Uri.ToString();

    // Crawling book pages.
    if (Regex.IsMatch(crawledPageUri, @"http://www.tingchina.com/*.*/disp_(\d+).htm"))
    {
        int id = Int32.Parse(Regex.Match(crawledPageUri, @"(\d+)").Captures[0].Value);
        lock (booksLockObject)
        {
            var book = new BookViewModel(id, crawledPage.Uri);
            Books.Add(book);
        }
    }

    // Crawling book sections.
    if (Regex.IsMatch(crawledPageUri, @"http://www.tingchina.com/*.*/(\d+)/play_(\d+)_(\d+)\.htm"))
    {
        var csQuery = crawledPage.CsQueryDocument.Find("iframe");
        foreach (var query in csQuery)
        {
            if (query.Name == "playmedia")
            {
                var playSrc = query.GetAttribute("src");
                var request = String.Format("http://{0}{1}", crawledPage.Uri.Host, playSrc);
                lock (lockObject)
                {
                    Sources.Add(new SoundPlayerSourceViewModel(request));
                }
            }
        }
    }

    if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
        Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
    else
        Console.WriteLine("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);

    if (string.IsNullOrEmpty(crawledPage.Content.Text))
        Console.WriteLine("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
}
void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    if (e.CrawledPage.HttpWebResponse != null)
    {
        int status = (int)e.CrawledPage.HttpWebResponse.StatusCode;
        string url = e.CrawledPage.Uri.ToString();
        string parent = e.CrawledPage.ParentUri.ToString();

        lock (_lock)
        {
            _logfile.WriteLine("Url {0} - status code {1} seen at {2}", url, status, parent);
        }
    }

    pagesDownloaded++;
    Console.Write("Pages downloaded {0}\r", pagesDownloaded);

    AddNewPage(e.CrawledPage);
}
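Most handlers above repeat the same success test. A small helper could factor it out; this is a sketch only, and IsSuccess is a hypothetical name, not part of Abot's API.

// Hypothetical helper (not part of Abot) factoring out the success check
// repeated in the handlers above: no exception, a response was received,
// and the status code is 200 OK.
static bool IsSuccess(CrawledPage page)
{
    return page.WebException == null
        && page.HttpWebResponse != null
        && page.HttpWebResponse.StatusCode == HttpStatusCode.OK;
}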