/// <summary>
/// Downloads the list page at <c>Url</c>, parses it with the bot's <c>list</c> parser and
/// queues follow-up work: a <c>ListItem</c> for the next page (when a "NextPageUrl" value
/// was captured) and a <c>ProductItem</c> for every captured "ProductUrl".
/// </summary>
/// <param name="bc">Bot cycle whose <c>Bot</c> is cast to <c>CustomBot</c>.</param>
/// <exception cref="ProcessorException">
/// Thrown with <c>RESTORE_AS_NEW</c> when the page could not be downloaded.
/// </exception>
public override void PROCESSOR(BotCycle bc)
{
    CustomBot bot = (CustomBot)bc.Bot;

    bool fetched = bot.HR.Get(Url);
    if (!fetched)
    {
        throw new ProcessorException(ProcessorExceptionType.RESTORE_AS_NEW, "Could not get: " + Url);
    }

    DataSifter.Capture capture = bot.list.Parse(bot.HR.HtmlResult);

    // Pagination: the capture yields null when the page has no "next" link.
    string next_page_url = capture.ValueOf("NextPageUrl");
    if (next_page_url != null)
    {
        string absolute_next_page_url = Spider.GetAbsoluteUrl(next_page_url, bot.HR.ResponseUrl);
        bot.BotCycle.Add(new ListItem(absolute_next_page_url));
    }

    // Product links are resolved against the URL the response actually came from.
    string[] product_urls = Spider.GetAbsoluteUrls(capture.ValuesOf("ProductUrl"), bot.HR.ResponseUrl, bot.HR.HtmlResult);
    for (int i = 0; i < product_urls.Length; i++)
    {
        bot.BotCycle.Add(new ProductItem(product_urls[i]));
    }
}
/// <summary>
/// Requests <c>Url</c> and, when <c>Download</c> is set, continues collecting links one
/// level deeper via <c>get_links(Depth + 1)</c>.
/// </summary>
/// <remarks>
/// When <c>Download</c> is false the global <c>MaxDownloadedFileLength</c> setting is
/// temporarily set to 0 for the duration of the request (presumably so that only the
/// response status is checked and no body is downloaded - confirm against the web routine).
/// The previous value is always restored, even if the request throws.
/// A failed request is handled as follows:
/// an unacceptable content type is silently ignored; an HTTP 404 is recorded through
/// <c>FileWriter</c> as a (parent URL, URL) pair; anything else raises a
/// <see cref="ProcessorException"/>.
/// </remarks>
/// <param name="bc">Bot cycle whose <c>Bot</c> is cast to <c>CustomBot</c>.</param>
/// <exception cref="ProcessorException">
/// Thrown with <c>RESTORE_AS_NEW</c> when the page could not be retrieved for a reason
/// other than content type or HTTP 404.
/// </exception>
public override void PROCESSOR(BotCycle bc)
{
    CustomBot cb = (CustomBot)bc.Bot;

    int saved_max_downloaded_file_length = Bot.Properties.Web.Default.MaxDownloadedFileLength;
    bool rc;
    try
    {
        if (!Download)
        {
            Bot.Properties.Web.Default.MaxDownloadedFileLength = 0;
        }
        rc = cb.hr.GetPage(Url);
    }
    finally
    {
        // The setting is shared state: restore it on every path so that an exception
        // thrown by GetPage cannot leave the limit stuck at 0 for later requests.
        Bot.Properties.Web.Default.MaxDownloadedFileLength = saved_max_downloaded_file_length;
    }

    if (!rc)
    {
        if (cb.hr.Status == WebRoutineStatus.UNACCEPTABLE_CONTENT_TYPE)
        {
            return;
        }

        // NOTE(review): HWResponse is presumably null when no HTTP response was received
        // at all (e.g. connection failure); in that case fall through to the retryable
        // exception instead of failing with a NullReferenceException.
        if (cb.hr.HWResponse != null && cb.hr.HWResponse.StatusCode == System.Net.HttpStatusCode.NotFound)
        {
            // Broken link: record which page referenced it.
            FileWriter.This.WriteLine(ParentLink.Url, Url);
            return;
        }

        throw new ProcessorException(ProcessorExceptionType.RESTORE_AS_NEW, "Could not get: " + Url);
    }

    if (Download)
    {
        cb.get_links(Depth + 1);
    }
}
/// <summary>
/// Downloads the product page at <c>Url</c>, parses it with the bot's <c>product</c>
/// parser, builds a <c>Fhr.CrawlerHost.Product</c> from the captured fields and saves it
/// through <c>CrawlerApi.SaveProductAsJson</c>.
/// </summary>
/// <param name="bc">Bot cycle whose <c>Bot</c> is cast to <c>CustomBot</c>.</param>
/// <exception cref="ProcessorException">
/// Thrown with <c>RESTORE_AS_NEW</c> when the page could not be downloaded, or with
/// <c>ERROR</c> when the product could not be saved.
/// </exception>
public override void PROCESSOR(BotCycle bc)
{
    CustomBot bot = (CustomBot)bc.Bot;

    bool fetched = bot.HR.Get(Url);
    if (!fetched)
    {
        throw new ProcessorException(ProcessorExceptionType.RESTORE_AS_NEW, "Could not get: " + Url);
    }

    DataSifter.Capture capture = bot.product.Parse(bot.HR.HtmlResult);

    // Fields are read in the same order in which they are passed to the constructor.
    var product_id = capture.ValueOf("Id");
    var product_name = capture.ValueOf("Name");
    var product_sku = capture.ValueOf("Sku");
    var product_price = capture.ValueOf("Price");
    var categories = capture.ValuesOf("Category");
    var absolute_image_urls = Spider.GetAbsoluteUrls(capture.ValuesOf("ImageUrl"), Url, bot.HR.HtmlResult);

    // Any captured "Stock" value means the product is in stock; absence means it is not.
    decimal stock_value;
    if (capture.ValueOf("Stock") != null)
    {
        stock_value = (decimal)Fhr.CrawlerHost.Product.StockValue.IN_STOCK;
    }
    else
    {
        stock_value = (decimal)Fhr.CrawlerHost.Product.StockValue.NOT_IN_STOCK;
    }

    var product_description = capture.ValueOf("Description");

    Fhr.CrawlerHost.Product parsed_product = new Fhr.CrawlerHost.Product(
        id: product_id,
        url: Url,
        name: product_name,
        sku: product_sku,
        price: product_price,
        category_branch: categories,
        image_urls: absolute_image_urls,
        stock: stock_value,
        description: product_description
    );

    bool saved = Cliver.Fhr.CrawlerHost.CrawlerApi.SaveProductAsJson(parsed_product);
    if (!saved)
    {
        throw new ProcessorException(ProcessorExceptionType.ERROR, "Product was not saved.");
    }
}
/// <summary>
/// Downloads the page at <c>Url</c>, extracts its zip code, then searches
/// yellowpages.com for a company with the same zip code and a matching name in order to
/// pick up an e-mail address. Finally writes one output row with the collected fields.
/// </summary>
/// <remarks>
/// Matching rules for a yellowpages "Company" capture: its zip code (digits only) must
/// equal the zip code of the source page, and its stripped name must match the first
/// (at most) 10 characters of the stripped source name, case-insensitively. Only the
/// first matching company is followed. "Url2" in the output is the matched company page,
/// or the search URL when nothing matched. An HTTP 404 on the search is treated as
/// "no results" and the row is still written without an e-mail.
/// </remarks>
/// <param name="bc">Bot cycle whose <c>Bot</c> is cast to <c>CustomBot</c>.</param>
/// <exception cref="ProcessorException">
/// Thrown with <c>RESTORE_AS_NEW</c> when the source page, the search page (other than
/// HTTP 404) or the matched company page could not be retrieved.
/// </exception>
public override void PROCESSOR(BotCycle bc)
{
    CustomBot cb = (CustomBot)bc.Bot;

    string name = FieldPreparation.Html.GetCsvField(Name);
    if (!cb.HR.GetPage(Url))
    {
        throw new ProcessorException(ProcessorExceptionType.RESTORE_AS_NEW, "Could not get: " + Url);
    }

    DataSifter.Capture c = product.Parse(cb.HR.HtmlResult);

    // ValueOf yields null when nothing was captured (the company loop below relies on
    // that too); Regex.Replace would throw on null, so treat a missing zip as empty.
    string raw_zip_code = c.ValueOf("ZipCode");
    string zip_code = raw_zip_code == null
        ? ""
        : Regex.Replace(raw_zip_code, @"[^\d]", "", RegexOptions.Singleline);

    // The name is scraped text and may contain spaces, '&', '#', etc.; it must be
    // escaped to stay a single query-string value. zip_code is digits only.
    string url2 = "http://www.yellowpages.com/search?search_terms=" + System.Uri.EscapeDataString(name ?? "")
        + "&geo_location_terms=" + zip_code;

    string email = null;
    string url3 = url2;
    if (cb.HR.GetPage(url2))
    {
        DataSifter.Capture c2 = yp.Parse(cb.HR.HtmlResult);

        // Compare on a short, regex-escaped prefix of the stripped name.
        string regex_name = get_stripped_name(name);
        regex_name = Regex.Escape((regex_name.Length > 10 ? regex_name.Substring(0, 10) : regex_name).Trim());

        foreach (DataSifter.Capture cc in c2["Company"])
        {
            string company_zip_code = cc.ValueOf("ZipCode");
            if (company_zip_code == null)
            {
                continue;
            }
            if (Regex.Replace(company_zip_code, @"[^\d]", "", RegexOptions.Singleline) != zip_code)
            {
                continue;
            }
            if (!Regex.IsMatch(get_stripped_name(cc.ValueOf("Name")), regex_name, RegexOptions.IgnoreCase))
            {
                continue;
            }

            url3 = Spider.GetAbsoluteUrl(cc.ValueOf("Url"), url2);
            if (!cb.HR.GetPage(url3))
            {
                throw new ProcessorException(ProcessorExceptionType.RESTORE_AS_NEW, "Could not get: " + url3);
            }
            DataSifter.Capture c3 = yp2.Parse(cb.HR.HtmlResult);
            email = c3.ValueOf("Email");
            break;
        }
    }
    // NOTE(review): HWResponse is presumably null when no HTTP response was received at
    // all; treat that as a retryable failure rather than a NullReferenceException.
    else if (cb.HR.HWResponse == null || cb.HR.HWResponse.StatusCode != HttpStatusCode.NotFound)
    {
        throw new ProcessorException(ProcessorExceptionType.RESTORE_AS_NEW, "Could not get: " + url2);
    }

    FileWriter.This.PrepareAndWriteHtmlLineWithHeader(
        "Name", Name,
        "City", City,
        "ZipCode", zip_code,
        "State", State,
        "Phone", Phone,
        "Email", email,
        "Url", Url,
        "Url2", url3
    );
}
/// <summary>
/// Requests the site page at <c>Url</c> and, on success, starts link collection at
/// depth 1 via <c>get_links(1)</c>.
/// </summary>
/// <param name="bc">Bot cycle whose <c>Bot</c> is cast to <c>CustomBot</c>.</param>
/// <exception cref="ProcessorException">
/// Thrown with <c>RESTORE_AS_NEW</c> when the page could not be retrieved.
/// </exception>
public override void PROCESSOR(BotCycle bc)
{
    CustomBot bot = (CustomBot)bc.Bot;

    bool loaded = bot.hr.GetPage(Url);
    if (loaded)
    {
        bot.get_links(1);
        return;
    }

    throw new ProcessorException(ProcessorExceptionType.RESTORE_AS_NEW, "Could not get site: " + Url);
}
/// <summary>
/// Downloads the category page at <c>Url</c>, parses it with the bot's <c>category</c>
/// parser and queues a <c>CategoryItem</c> for every captured "CategoryUrl".
/// </summary>
/// <param name="bc">Bot cycle whose <c>Bot</c> is cast to <c>CustomBot</c>.</param>
/// <exception cref="ProcessorException">
/// Thrown with <c>RESTORE_AS_NEW</c> when the page could not be downloaded.
/// </exception>
public override void PROCESSOR(BotCycle bc)
{
    CustomBot bot = (CustomBot)bc.Bot;

    bool fetched = bot.HR.Get(Url);
    if (!fetched)
    {
        throw new ProcessorException(ProcessorExceptionType.RESTORE_AS_NEW, "Could not get: " + Url);
    }

    DataSifter.Capture capture = bot.category.Parse(bot.HR.HtmlResult);

    // Links are resolved against the URL the response actually came from.
    string[] category_urls = Spider.GetAbsoluteUrls(capture.ValuesOf("CategoryUrl"), bot.HR.ResponseUrl, bot.HR.HtmlResult);
    for (int i = 0; i < category_urls.Length; i++)
    {
        bot.BotCycle.Add(new CategoryItem(category_urls[i]));
    }
}
/// <summary>
/// Downloads the rent.com page for <c>State</c> (whitespace in the state name replaced
/// by '-'), parses it with the <c>cities</c> parser and queues a <c>SearchItem</c> for
/// every captured "Url", prefixed with the site root.
/// </summary>
/// <param name="bc">Bot cycle that receives the new items; its <c>Bot</c> is cast to <c>CustomBot</c>.</param>
/// <exception cref="ProcessorException">
/// Thrown with <c>RESTORE_AS_NEW</c> when the state page could not be retrieved.
/// </exception>
public override void PROCESSOR(BotCycle bc)
{
    CustomBot bot = (CustomBot)bc.Bot;

    string state_slug = Regex.Replace(State, @"\s", "-");
    string state_url = "http://www.rent.com/" + state_slug;

    bool loaded = bot.HR.GetPage(state_url);
    if (!loaded)
    {
        throw new ProcessorException(ProcessorExceptionType.RESTORE_AS_NEW, "Could not get: " + state_url);
    }

    DataSifter.Capture capture = cities.Parse(bot.HR.HtmlResult);
    string[] city_paths = capture.ValuesOf("Url");

    // Captured values are site-relative paths; prepend the host to form search URLs.
    foreach (string city_path in city_paths)
    {
        bc.Add(new SearchItem("http://www.rent.com" + city_path));
    }
}
/// <summary>
/// Delegates processing of <c>Url</c> to the bot's <c>search_processor</c>.
/// </summary>
/// <param name="bc">Bot cycle whose <c>Bot</c> is cast to <c>CustomBot</c>.</param>
public override void PROCESSOR(BotCycle bc)
{
    ((CustomBot)bc.Bot).search_processor(Url);
}