/// <summary>
/// Fetches the listing page at <c>Url</c>, adds the captured next listing page
/// (when present) to the bot cycle as a <c>ListItem</c>, and adds one
/// <c>ProductItem</c> per captured product link.
/// </summary>
/// <param name="bc">Bot cycle whose <c>Bot</c> is cast to <c>CustomBot</c>.</param>
/// <exception cref="ProcessorException">
/// Thrown with <c>RESTORE_AS_NEW</c> when the page at <c>Url</c> could not be fetched.
/// </exception>
override public void PROCESSOR(BotCycle bc)
{
    CustomBot bot = (CustomBot)bc.Bot;

    // Guard: nothing can be parsed without the page.
    bool fetched = bot.HR.Get(Url);
    if (!fetched)
    {
        throw new ProcessorException(ProcessorExceptionType.RESTORE_AS_NEW, "Could not get: " + Url);
    }

    DataSifter.Capture capture = bot.list.Parse(bot.HR.HtmlResult);

    // Pagination link is optional; it is resolved against the response URL.
    string nextPageUrl = capture.ValueOf("NextPageUrl");
    if (nextPageUrl != null)
    {
        string absoluteNextPageUrl = Spider.GetAbsoluteUrl(nextPageUrl, bot.HR.ResponseUrl);
        bot.BotCycle.Add(new ListItem(absoluteNextPageUrl));
    }

    // Every captured product link becomes its own work item.
    string[] productUrls = Spider.GetAbsoluteUrls(capture.ValuesOf("ProductUrl"), bot.HR.ResponseUrl, bot.HR.HtmlResult);
    for (int i = 0; i < productUrls.Length; i++)
    {
        bot.BotCycle.Add(new ProductItem(productUrls[i]));
    }
}
/// <summary>
/// Fetches the company page at <c>Url</c>, extracts its zip code, then searches
/// yellowpages.com for the company name within that zip code. When a search
/// result has the same zip digits and a name matching the (stripped, truncated)
/// company name, its detail page is fetched and the e-mail is captured from it.
/// Finally one output line is written with the item's fields, the e-mail
/// (null when none was found) and the last yellowpages URL used.
/// </summary>
/// <param name="bc">Bot cycle whose <c>Bot</c> is cast to <c>CustomBot</c>.</param>
/// <exception cref="ProcessorException">
/// Thrown with <c>RESTORE_AS_NEW</c> when the company page or a matched detail
/// page cannot be fetched, or when the search page fails with a status other
/// than 404 Not Found.
/// </exception>
override public void PROCESSOR(BotCycle bc)
{
    CustomBot cb = (CustomBot)bc.Bot;
    string name = FieldPreparation.Html.GetCsvField(Name);

    if (!cb.HR.GetPage(Url))
    {
        throw new ProcessorException(ProcessorExceptionType.RESTORE_AS_NEW, "Could not get: " + Url);
    }

    DataSifter.Capture c = product.Parse(cb.HR.HtmlResult);

    // ValueOf can return null (the loop below guards for exactly that), and
    // Regex.Replace throws on a null input; treat a missing zip as empty.
    string zip_code = Regex.Replace(c.ValueOf("ZipCode") ?? "", @"[^\d]", "", RegexOptions.Singleline);

    // The name is free text: it must be percent-encoded, otherwise characters
    // such as '&', '#', '+' or spaces corrupt the query string.
    // zip_code contains digits only, so it needs no encoding.
    string url2 = "http://www.yellowpages.com/search?search_terms=" + System.Uri.EscapeDataString(name ?? "")
        + "&geo_location_terms=" + zip_code;

    string email = null;
    string url3 = url2; // reported as "Url2"; replaced by the detail page URL on a match

    if (cb.HR.GetPage(url2))
    {
        DataSifter.Capture c2 = yp.Parse(cb.HR.HtmlResult);

        // Only the first 10 characters of the stripped name are used as a pattern.
        // NOTE(review): if the stripped name is empty the pattern is empty and
        // matches every candidate name; only the zip comparison filters then.
        string regex_name = get_stripped_name(name);
        regex_name = Regex.Escape((regex_name.Length > 10 ? regex_name.Substring(0, 10) : regex_name).Trim());

        foreach (DataSifter.Capture cc in c2["Company"])
        {
            string candidate_zip = cc.ValueOf("ZipCode");
            if (candidate_zip == null)
            {
                continue;
            }
            if (Regex.Replace(candidate_zip, @"[^\d]", "", RegexOptions.Singleline) != zip_code)
            {
                continue;
            }
            if (!Regex.IsMatch(get_stripped_name(cc.ValueOf("Name")), regex_name, RegexOptions.IgnoreCase))
            {
                continue;
            }

            // First matching company wins: fetch its detail page for the e-mail.
            url3 = Spider.GetAbsoluteUrl(cc.ValueOf("Url"), url2);
            if (!cb.HR.GetPage(url3))
            {
                throw new ProcessorException(ProcessorExceptionType.RESTORE_AS_NEW, "Could not get: " + url3);
            }
            DataSifter.Capture c3 = yp2.Parse(cb.HR.HtmlResult);
            email = c3.ValueOf("Email");
            break;
        }
    }
    else if (cb.HR.HWResponse.StatusCode != HttpStatusCode.NotFound)
    {
        // A 404 from the search is tolerated (the record is written without an
        // e-mail); any other failure aborts the item.
        throw new ProcessorException(ProcessorExceptionType.RESTORE_AS_NEW, "Could not get: " + url2);
    }

    FileWriter.This.PrepareAndWriteHtmlLineWithHeader(
        "Name", Name,
        "City", City,
        "ZipCode", zip_code,
        "State", State,
        "Phone", Phone,
        "Email", email,
        "Url", Url,
        "Url2", url3
    );
}
/// <summary>
/// Fetches the search results page at <paramref name="url"/>, adds a
/// <c>SearchNextPageItem</c> for the captured next-page URL (when present), and
/// adds one <c>CompanyItem</c> per captured "Product" entry.
/// </summary>
/// <param name="url">
/// URL of the search results page; also used as the base when resolving each
/// entry's URL.
/// </param>
/// <exception cref="ProcessorException">
/// Thrown with <c>RESTORE_AS_NEW</c> when the page could not be fetched.
/// </exception>
void search_processor(string url)
{
    bool loaded = HR.GetPage(url);
    if (!loaded)
    {
        throw new ProcessorException(ProcessorExceptionType.RESTORE_AS_NEW, "Could not get: " + url);
    }

    DataSifter.Capture parsed = search.Parse(HR.HtmlResult);

    // The pagination link is optional and is passed on exactly as captured.
    string nextPageUrl = parsed.ValueOf("NextPageUrl");
    if (nextPageUrl != null)
    {
        SearchNextPageItem nextPageItem = new SearchNextPageItem(nextPageUrl);
        BotCycle.Add(nextPageItem);
    }

    foreach (DataSifter.Capture hit in parsed["Product"])
    {
        // Fields are read in constructor-argument order.
        string companyUrl = Spider.GetAbsoluteUrl(hit.ValueOf("Url"), url);
        string companyName = hit.ValueOf("Name");
        string companyCity = hit.ValueOf("City");
        string companyState = hit.ValueOf("State");
        string companyPhone = hit.ValueOf("Phone");

        BotCycle.Add(new CompanyItem(companyUrl, companyName, companyCity, companyState, companyPhone));
    }
}