/// <summary>
/// Runs every extractor in <c>EntityExtractorList</c> against the page, stores each
/// non-empty result under the extractor's entity name, then collects target requests
/// unless the page has <c>MissTargetUrls</c> set.
/// </summary>
/// <param name="page">The page to extract from; result items and target requests are added to it.</param>
public void Process(Page page)
{
    foreach (IEntityExtractor extractor in EntityExtractorList)
    {
        dynamic extracted = extractor.Process(page);
        if (extracted == null)
        {
            continue;
        }

        // An enumerable result that yields no element is treated like "nothing extracted".
        bool isEmptySequence = extracted is IEnumerable
                               && !((IEnumerable)extracted).GetEnumerator().MoveNext();
        if (isEmptySequence)
        {
            continue;
        }

        page.AddResultItem(extractor.EntityName, extracted);
    }

    if (!page.MissTargetUrls)
    {
        if (GetCustomizeTargetUrls != null)
        {
            // A custom delegate replaces the configured link extraction entirely.
            page.AddTargetRequests(GetCustomizeTargetUrls(page));
        }
        else
        {
            ExtractLinks(page, TargetUrlExtractInfos);
        }
    }

    // Flag the result items as skipped when no extractor produced anything.
    bool nothingExtracted = page.ResultItems.Results.Count == 0;
    if (nothingExtracted)
    {
        page.ResultItems.IsSkip = true;
    }
}
/// <summary>
/// Serves the page for <paramref name="request"/> from the file cache when a cache file
/// whose first line matches the request URL exists; otherwise falls back to
/// <c>DownloadWhenMiss</c>.
/// </summary>
/// <param name="request">The request whose URL identifies the cache entry.</param>
/// <param name="task">The task whose <c>Identify</c> value names the cache sub-directory.</param>
/// <returns>The cached page on a hit, or the result of <c>DownloadWhenMiss</c> on a miss or read error.</returns>
public Page Download(Request request, ITask task)
{
    string path = BasePath + "/" + task.Identify + "/";
    Page page = null;
    try
    {
        FileInfo file = GetFile(path + Encrypt.Md5Encrypt(request.Url));

        // Dispose the reader so the cache file handle is not leaked.
        using (StreamReader reader = new StreamReader(file.OpenRead()))
        {
            string line = reader.ReadLine();
            if (("url:\t" + request.Url).Equals(line))
            {
                string html = GetHtml(reader);
                Page cached = new Page(request);
                cached.SetUrl(PlainText.Create(request.Url));
                cached.SetHtml(Html.Create(html));

                // Publish only a fully built page as the cache hit.
                page = cached;
            }
        }
    }
    catch (IOException e)
    {
        // "e is FileNotFoundException" is the correct test; the former
        // e.GetType().IsInstanceOfType(typeof(...)) check could never be true.
        if (e is FileNotFoundException)
        {
            _logger.Info("File not exist for url " + request.Url);
        }
        else
        {
            _logger.Warn("File read error for url " + request.Url, e);
        }
    }

    // Only go to the fallback downloader on a cache miss; previously the cached
    // page was unconditionally overwritten here.
    if (page == null)
    {
        page = DownloadWhenMiss(request, task);
    }

    return page;
}
/// <summary>
/// Applies every rule in <c>_extractRules</c> to the page's HTML and stores each result
/// under the rule's field name. When a rule marked <c>IsNotNull</c> yields nothing,
/// the page is marked as skipped instead of storing a value.
/// </summary>
/// <param name="page">The page to extract from.</param>
public void Process(Page page)
{
    foreach (ExtractRule rule in _extractRules)
    {
        if (rule.IsMulti)
        {
            IList<string> values = page.GetHtml().SelectDocumentForList(rule.Selector);

            // Store unless the rule is mandatory and the list came back empty.
            if (!rule.IsNotNull || values.Count != 0)
            {
                page.GetResultItems().Put(rule.FieldName, values);
            }
            else
            {
                page.SetSkip(true);
            }

            continue;
        }

        string value = page.GetHtml().SelectDocument(rule.Selector);

        // Store unless the rule is mandatory and nothing was selected.
        if (!rule.IsNotNull || value != null)
        {
            page.GetResultItems().Put(rule.FieldName, value);
        }
        else
        {
            page.SetSkip(true);
        }
    }
}
/// <summary>
/// Queues every link matching the blog-post URL pattern as a target request and stores
/// the title, content, tags and "artical" fields selected from the page by XPath.
/// </summary>
/// <param name="page">The page to read from and write fields to.</param>
public void Process(Page page)
{
    // Follow-up requests: links that match the blog post URL pattern.
    IList<String> blogLinks = page.GetHtml()
        .Links()
        .Regex("http://my\\.oschina\\.net/flashsword/blog/\\d+")
        .GetAll();
    page.AddTargetRequests(blogLinks);

    var html = page.GetHtml();

    var title = html.XPath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1/text()").ToString();
    page.PutField("title", title);

    var content = html.XPath("//div[@class='BlogContent']/tidyText()").ToString();
    page.PutField("content", content);

    var tags = html.XPath("//div[@class='BlogTags']/a/text()").GetAll();
    page.PutField("tags", tags);

    var article = html.XPath("//*[@Class='Blog']/div[1]/div/h2/a").ToString();
    page.PutField("artical", article);
}
/// <summary>
/// Builds a page from a local file: the content is read from the file at the request
/// URL's local path, and the status code is set to 200.
/// </summary>
/// <param name="request">The request whose URL points at the local file.</param>
/// <param name="spider">The spider whose site supplies the content type.</param>
/// <returns>A page whose <c>Url</c> and <c>TargetUrl</c> are both the request URL.</returns>
public Page Download(Request request, ISpider spider)
{
    var page = new Page(request, spider.Site.ContentType)
    {
        Content = File.ReadAllText(request.Url.LocalPath)
    };

    // The request URL serves as both the requested and the final address.
    string address = request.Url.ToString();
    page.TargetUrl = address;
    page.Url = address;
    page.StatusCode = 200;

    return page;
}
/// <summary>
/// For each page-model extractor: collects help and target links, extracts the model,
/// post-processes it and stores it under the model type's full name. Marks the result
/// items as skipped when nothing was stored.
/// </summary>
/// <param name="page">The page to process.</param>
public void Process(Page page)
{
    foreach (PageModelExtractor extractor in _pageModelExtractorList)
    {
        // Link extraction happens for every extractor, even when no model is produced.
        ExtractLinks(page, extractor.GetHelpUrlRegionSelector(), extractor.GetHelpUrlPatterns());
        ExtractLinks(page, extractor.GetTargetUrlRegionSelector(), extractor.GetTargetUrlPatterns());

        object model = extractor.Process(page);
        if (model == null)
        {
            continue;
        }

        // An empty list result is treated the same as no result.
        IList modelList = model as IList;
        if (modelList != null && modelList.Count == 0)
        {
            continue;
        }

        PostProcessPageModel(model);
        page.PutField(extractor.GetModelType().FullName, model);
    }

    bool nothingStored = page.GetResultItems().GetAll().Count == 0;
    if (nothingStored)
    {
        page.GetResultItems().IsSkip = true;
    }
}
/// <summary>
/// Executes the configured script against the page. Only <c>Language.Javascript</c> is
/// executed (through a Jurassic script engine); for any other language this method
/// does nothing.
/// </summary>
/// <param name="page">The page exposed to the script as the global value "page".</param>
public void Process(Page page)
{
    // The Python and Ruby branches were never implemented: they contained only
    // commented-out code, so nothing runs for them.
    if (_language != Language.Javascript)
    {
        return;
    }

    string source = _defines + Environment.NewLine + _script;

    var engine = new Jurassic.ScriptEngine
    {
        EnableExposedClrTypes = true
    };

    // Expose the page and the site configuration to the script as globals.
    engine.SetGlobalValue("page", page);
    engine.SetGlobalValue("config", Site);
    engine.Execute(source);
}
/// <summary>
/// Serves the page for <paramref name="request"/> from the file cache when a cache file
/// whose first line matches the request URL exists; otherwise falls back to
/// <c>DownloadWhenMiss</c>.
/// </summary>
/// <param name="request">The request whose URL identifies the cache entry.</param>
/// <param name="spider">The spider supplying the cache sub-directory name, content type and logger.</param>
/// <returns>The cached page on a hit, or the result of <c>DownloadWhenMiss</c> on a miss or read error.</returns>
public Page Download(Request request, ISpider spider)
{
    string path = BasePath + "/" + spider.Identity + "/";
    Page page = null;
    try
    {
        FileInfo file = PrepareFile(path + Encrypt.Md5Encrypt(request.Url.ToString()));

        // Dispose the reader so the cache file handle is not leaked.
        using (StreamReader reader = new StreamReader(file.OpenRead()))
        {
            string line = reader.ReadLine();
            if (("url:\t" + request.Url).Equals(line))
            {
                string html = GetHtml(reader);
                Page cached = new Page(request, spider.Site.ContentType);
                cached.Url = request.Url.ToString();
                cached.Content = html;

                // Publish only a fully built page as the cache hit.
                page = cached;
            }
        }
    }
    catch (IOException e)
    {
        // The "is" operator works on every target framework, so no conditional
        // compilation is needed; the former IsInstanceOfType(typeof(...)) check
        // on full .NET could never be true.
        if (e is FileNotFoundException)
        {
            spider.Logger.Info("File not exist for url: " + request.Url);
        }
        else
        {
            spider.Logger.Warn("File read error for url " + request.Url, e);
        }
    }

    // Only go to the fallback downloader on a cache miss; previously the cached
    // page was unconditionally overwritten here.
    if (page == null)
    {
        page = DownloadWhenMiss(request, spider);
    }

    return page;
}
/// <summary>
/// Implemented by subclasses to answer a stop/continue question for the given page.
/// NOTE(review): the exact meaning of "stop" is not visible here — presumably
/// <c>true</c> means processing should stop; confirm against the implementations.
/// </summary>
/// <param name="page">The page the decision is based on.</param>
/// <returns>A boolean decided by the implementing subclass.</returns>
public abstract bool CanStop(Page page);
/// <summary>
/// Intentionally empty: this implementation performs no work for the page.
/// </summary>
/// <param name="page">The page; unused.</param>
public void AfterProcess(Page page) { }
/// <summary>
/// Returns an HTML page with empty content for the request, without reading anything.
/// </summary>
/// <param name="request">The request the page is created for.</param>
/// <param name="spider">Unused.</param>
/// <returns>A new page whose <c>Content</c> is the empty string.</returns>
public Page Download(Request request, ISpider spider)
{
    return new Page(request, ContentType.Html)
    {
        Content = ""
    };
}
/// <summary>
/// Marks the page as skipped by setting <c>IsSkip</c> to <c>true</c>.
/// </summary>
/// <param name="page">The page to mark.</param>
public void Process(Page page) => page.IsSkip = true;
/// <summary>
/// Implemented by subclasses to process the given page.
/// NOTE(review): the returned <c>MatchOther</c> presumably tells the caller whether
/// other processors should still be tried — confirm against the callers.
/// </summary>
/// <param name="page">The page to process.</param>
/// <returns>A <c>MatchOther</c> value chosen by the implementing subclass.</returns>
public abstract MatchOther ProcessPage(Page page);
/// <summary>
/// Extracts every field of the entity from one selectable item into a JSON object and,
/// when a stopping rule is configured, sets <c>page.MissTargetUrls</c> if the stopping
/// property is missing/empty or the rule says to stop.
/// </summary>
/// <param name="page">The page being processed; its <c>MissTargetUrls</c> flag may be set.</param>
/// <param name="item">The selectable the fields are extracted from.</param>
/// <param name="entityDefine">The entity metadata (fields and optional stopping rule).</param>
/// <param name="index">The index passed through to <c>ExtractField</c>.</param>
/// <returns>The populated object, or <c>null</c> when no field produced a value.</returns>
private JObject ProcessSingle(Page page, ISelectable item, EntityMetadata entityDefine, int index)
{
    JObject record = new JObject();
    foreach (var token in entityDefine.Entity.Fields)
    {
        // Kept dynamic so the JObject.Add overload is still chosen at run time.
        dynamic extracted = ExtractField(item, page, token, index);
        if (extracted != null)
        {
            record.Add(token.Name, extracted);
        }
    }

    var stopRule = entityDefine.Stopping;
    if (stopRule != null)
    {
        var stopField = entityDefine.Entity.Fields.First(f => f.Name == stopRule.PropertyName) as Field;
        if (stopField == null)
        {
            throw new SpiderExceptoin("Stopping cannot be EntityMetaData.");
        }

        var dataType = stopField.DataType;
        if (VerifyIfEntity(dataType))
        {
            throw new SpiderExceptoin("Can't compare with object.");
        }

        stopRule.DataType = dataType.ToString().ToLower();
        string stopValue = record.SelectToken($"$.{stopRule.PropertyName}")?.ToString();

        // A missing value and a positive stop decision both suppress target URLs.
        if (string.IsNullOrEmpty(stopValue) || stopRule.NeedStop(stopValue))
        {
            page.MissTargetUrls = true;
        }
    }

    if (record.Children().Any())
    {
        return record;
    }

    return null;
}
/// <summary>
/// Adds target requests for links found in the given region. When no region selector
/// is supplied no links are collected at all (otherwise too many URLs would be returned).
/// Without patterns every collected link is added; with patterns only matching links are.
/// </summary>
/// <param name="page">The page to read links from and add requests to.</param>
/// <param name="urlRegionSelector">Selector of the region to search; <c>null</c> yields no links.</param>
/// <param name="urlPatterns">Patterns a link must match; <c>null</c> or empty accepts all links.</param>
private void ExtractLinks(Page page, ISelector urlRegionSelector, IList<Regex> urlPatterns)
{
    var candidates = urlRegionSelector != null
        ? page.GetHtml().SelectList(urlRegionSelector).Links().GetAll()
        : new List<string>();

    bool hasPatterns = urlPatterns != null && urlPatterns.Count != 0;
    if (!hasPatterns)
    {
        page.AddTargetRequests(candidates);
        return;
    }

    // Pattern-major order: a link matching several patterns is added once per pattern.
    foreach (Regex pattern in urlPatterns)
    {
        foreach (string candidate in candidates)
        {
            if (!pattern.IsMatch(candidate))
            {
                continue;
            }

            page.AddTargetRequest(new Request(candidate, page.GetRequest().Extras));
        }
    }
}
/// <summary>
/// Downloads a page through a pooled web driver: logs in once if a <c>Login</c> delegate
/// is configured, navigates to the (re-encoded) request URL, waits, and builds the page
/// from the driver's source, URL and title.
/// </summary>
/// <param name="request">The request to download.</param>
/// <param name="spider">The spider supplying the site content type and used for validation.</param>
/// <returns>The downloaded page.</returns>
/// <exception cref="SpiderExceptoin">Thrown when the login delegate reports failure.</exception>
public override Page Download(Request request, ISpider spider)
{
    WebDriverItem driverService = null;
    try
    {
        driverService = Pool.Get();

        // NOTE(review): lock (this) is kept because a dedicated lock object would be a
        // new field outside this method; consider a private readonly gate.
        lock (this)
        {
            if (!_isLogined && Login != null)
            {
                _isLogined = Login.Invoke(driverService.WebDriver as RemoteWebDriver);
                if (!_isLogined)
                {
                    throw new SpiderExceptoin("Login failed. Please check your login codes.");
                }
            }
        }

        // Rebuild the URL so that non-ASCII (e.g. Chinese) query strings are encoded.
        Uri uri = request.Url;
        string query = uri.Query;

        // IsDefaultPort is scheme-aware; the former "Port == 80" test silently dropped
        // an explicit port 80 on non-http schemes.
        string port = uri.IsDefaultPort ? "" : (":" + uri.Port);
        string encodedQuery = string.IsNullOrEmpty(query)
            ? ""
            : ("?" + HttpUtility.UrlPathEncode(query.Substring(1)));
        string realUrl = uri.Scheme + "://" + uri.DnsSafeHost + port + uri.AbsolutePath + encodedQuery;

        if (UrlFormat != null)
        {
            realUrl = UrlFormat(realUrl);
        }

        RedialManagerUtils.Execute("webdriverdownloader-download", () =>
        {
            driverService.WebDriver.Navigate().GoToUrl(realUrl);
        });

        Thread.Sleep(_webDriverWaitTime);

        AfterNavigate?.Invoke((RemoteWebDriver)driverService.WebDriver);

        Page page = new Page(request, spider.Site.ContentType);
        page.Content = driverService.WebDriver.PageSource;
        page.Url = request.Url.ToString();
        page.TargetUrl = driverService.WebDriver.Url;
        page.Title = driverService.WebDriver.Title;

        ValidatePage(page, spider);

        // Must be cleared when done: if this value is stored in Redis it causes a
        // single task to be run in an endless loop.
        request.PutExtra(Request.CycleTriedTimes, null);

        return page;
    }
    catch (Exception e)
    {
        // driverService is still null when Pool.Get() itself failed.
        if (driverService != null && e.Message == "Need Verify Code.")
        {
            VerifyCode?.Invoke(driverService.WebDriver as RemoteWebDriver);
        }

        // Rethrow with "throw;" so the original stack trace is preserved.
        throw;
    }
    finally
    {
        Pool.ReturnToPool(driverService);
    }
}
/// <summary>
/// Forwards the page to the wrapped <c>_pageProcessor</c>.
/// </summary>
/// <param name="page">The page to process.</param>
public void Process(Page page) => _pageProcessor.Process(page);
/// <summary>
/// Extracts the value of one field (or nested entity) from a selectable item.
/// </summary>
/// <param name="item">The selectable to extract from.</param>
/// <param name="page">The current page; used to resolve environment values.</param>
/// <param name="field">The field or nested entity definition.</param>
/// <param name="index">The item index, exposed to environment selectors.</param>
/// <returns>
/// <c>null</c> when the field's selector cannot be parsed or a single nested entity is
/// not found; a string for a single-valued field; a <c>JArray</c> of strings for a
/// multi-valued field; a <c>JObject</c> for a single nested entity; a <c>JArray</c> of
/// <c>JObject</c> for a multi-valued nested entity.
/// </returns>
private dynamic ExtractField(ISelectable item, Page page, DataToken field, int index)
{
    ISelector selector = SelectorUtil.Parse(field.Selector);
    if (selector == null)
    {
        return null;
    }

    var f = field as Field;
    List<Formatter.Formatter> formatters = GenerateFormatter(f?.Formatters);

    var entity = field as Entity;
    if (entity != null)
    {
        if (field.Multi)
        {
            // One JObject per selected node (the node list is now selected only once).
            JArray objs = new JArray();
            foreach (var selectable in item.SelectList(selector).Nodes())
            {
                JObject obj = new JObject();
                foreach (var child in entity.Fields)
                {
                    obj.Add(child.Name, ExtractField(selectable, page, child, 0));
                }
                objs.Add(obj);
            }
            return objs;
        }

        var single = item.Select(selector);
        if (single == null)
        {
            // Nothing selected: report "no value" instead of failing on a null node.
            return null;
        }

        JObject nested = new JObject();
        foreach (var child in entity.Fields)
        {
            // Recurse with the child definition; recursing with the parent "field"
            // re-entered this branch forever.
            nested.Add(child.Name, ExtractField(single, page, child, 0));
        }
        return nested;
    }

    var enviromentSelector = selector as EnviromentSelector;
    if (enviromentSelector != null)
    {
        string envValue = GetEnviromentValue(enviromentSelector.Field, page, index);
        foreach (var formatter in formatters)
        {
            envValue = formatter.Formate(envValue);
        }
        return envValue;
    }

    Field valueField = (Field)field;
    bool needPlainText = valueField.Option == PropertyExtractBy.ValueOption.PlainText;

    if (field.Multi)
    {
        var propertyValues = item.SelectList(selector).Nodes();
        List<string> results = new List<string>();
        foreach (var propertyValue in propertyValues)
        {
            string tmp = propertyValue.GetValue(needPlainText);
            foreach (var formatter in formatters)
            {
                tmp = formatter.Formate(tmp);
            }
            results.Add(tmp);
        }
        return new JArray(results);
    }

    bool needCount = valueField.Option == PropertyExtractBy.ValueOption.Count;
    if (needCount)
    {
        // Report the number of matched nodes, or "-1" when the node list is null.
        var propertyValues = item.SelectList(selector).Nodes();
        return propertyValues != null ? propertyValues.Count.ToString() : "-1";
    }

    string value = item.Select(selector)?.GetValue(needPlainText);
    foreach (var formatter in formatters)
    {
        value = formatter.Formate(value);
    }
    return value;
}
/// <summary>
/// Extracts the configured entity from the page. Environment values are first selected
/// from the page and stored as request extras; the entity is then extracted either once
/// or once per selected node when the entity is multi-valued.
/// </summary>
/// <param name="page">The page to extract from.</param>
/// <returns>
/// A <c>List&lt;JObject&gt;</c> for a multi-valued entity, the result of
/// <c>ProcessSingle</c> for a single entity, or <c>null</c> when nothing is selected.
/// </returns>
public dynamic Process(Page page)
{
    if (_enviromentValues != null && _enviromentValues.Count > 0)
    {
        foreach (var env in _enviromentValues)
        {
            string extraName = env.Name;
            var extraValue = page.Selectable.Select(SelectorUtil.Parse(env.Selector)).GetValue();
            page.Request.PutExtra(extraName, extraValue);
        }
    }

    ISelector selector = SelectorUtil.Parse(_entityDefine.Selector);

    // Without an entity selector the whole page is the single item.
    if (selector == null)
    {
        return ProcessSingle(page, page.Selectable, _entityDefine, 0);
    }

    if (!_entityDefine.Multi)
    {
        ISelectable single = page.Selectable.Select(selector);
        if (single == null)
        {
            return null;
        }

        return ProcessSingle(page, single, _entityDefine, 0);
    }

    var nodes = page.Selectable.SelectList(selector).Nodes();
    if (nodes == null || nodes.Count == 0)
    {
        return null;
    }

    // Honour the optional limit on the number of items.
    var limit = _entityDefine.Limit;
    if (limit != null)
    {
        nodes = nodes.Take(limit.Value).ToList();
    }

    List<JObject> entities = new List<JObject>();
    int position = 0;
    foreach (var node in nodes)
    {
        JObject entity = ProcessSingle(page, node, _entityDefine, position);
        if (entity != null)
        {
            entities.Add(entity);
        }

        // The index advances even when the item produced no data.
        position++;
    }

    return entities;
}
/// <summary>
/// Runs <c>ProcessPage</c> on each sub-processor whose <c>Match</c> accepts the page's
/// request, in order, and stops after the first one that returns anything other than
/// <c>MatchOther.Yes</c>.
/// </summary>
/// <param name="page">The page to dispatch.</param>
public void Process(Page page)
{
    // The query is lazy: Any() pulls results one at a time and stops at the first
    // processor that does not answer MatchOther.Yes. The boolean itself is unused;
    // the call exists for the side effects of ProcessPage.
    _subPageProcessors
        .Where(processor => processor.Match(page.GetRequest()))
        .Select(processor => processor.ProcessPage(page))
        .Any(outcome => outcome != MatchOther.Yes);
}
/// <summary>
/// Collects target requests for every configured target-URL extractor. When an
/// extractor has a region, only links inside it are used, otherwise all links on the
/// page (restricting to a region avoids returning too many URLs). Links are
/// formatted, URL- and HTML-decoded, and then added either unconditionally (no patterns)
/// or only when they match one of the extractor's patterns.
/// </summary>
/// <param name="page">The page to read links from and add requests to.</param>
/// <param name="targetUrlExtractInfos">The extractor definitions; <c>null</c> does nothing.</param>
private void ExtractLinks(Page page, List<Model.TargetUrlExtractor> targetUrlExtractInfos)
{
    if (targetUrlExtractInfos == null)
    {
        return;
    }

    foreach (var targetUrlExtractInfo in targetUrlExtractInfos)
    {
        var urlRegionSelector = targetUrlExtractInfo.Region;
        var formatters = targetUrlExtractInfo.Formatters;
        var urlPatterns = targetUrlExtractInfo.Patterns;

        var links = urlRegionSelector == null
            ? page.Selectable.Links().GetValues()
            : (page.Selectable.SelectList(urlRegionSelector)).Links().GetValues();
        if (links == null)
        {
            // Nothing for this extractor; the remaining extractors must still run
            // (a "return" here used to abort all of them).
            continue;
        }

        // check: consider carefully whether formatting belongs before or after the
        // pattern match; before is preferred. Formatting target URLs implies that the
        // start URL should follow the same rule as well.
        if (formatters != null && formatters.Count > 0)
        {
            List<string> formatted = new List<string>();
            foreach (string link in links)
            {
                // Strings are immutable, so no defensive copy is needed.
                string url = link;
                foreach (Formatter f in formatters)
                {
                    url = f.Formate(url);
                }
                formatted.Add(url);
            }
            links = formatted;
        }

        List<string> decodedLinks = new List<string>();
        foreach (var link in links)
        {
#if !NET_CORE
            decodedLinks.Add(HttpUtility.HtmlDecode(HttpUtility.UrlDecode(link)));
#else
            decodedLinks.Add(WebUtility.HtmlDecode(WebUtility.UrlDecode(link)));
#endif
        }
        links = decodedLinks;

        if (urlPatterns == null || urlPatterns.Count == 0)
        {
            // No patterns: accept every link, then move on to the next extractor.
            page.AddTargetRequests(links);
            continue;
        }

        foreach (Regex targetUrlPattern in urlPatterns)
        {
            foreach (string link in links)
            {
                if (targetUrlPattern.IsMatch(link))
                {
                    page.AddTargetRequest(new Request(link, page.Request.NextDepth, page.Request.Extras));
                }
            }
        }
    }
}
/// <summary>
/// Downloads a page through a pooled web driver: applies the site's cookies, navigates
/// to the (re-encoded) request URL and builds the page from the driver's page source.
/// </summary>
/// <param name="request">The request to download.</param>
/// <param name="task">The task whose site supplies cookies and retry settings.</param>
/// <returns>
/// The downloaded page; on failure, the result of <c>AddToCycleRetry</c> when the site
/// has cycle retries configured, otherwise <c>null</c> after <c>OnError</c> is called.
/// </returns>
public override Page Download(Request request, ITask task)
{
    CheckInit();
    Site site = task.Site;
    IWebDriver webDriver = null;
    try
    {
        webDriver = _webDriverPool.Get();

        IOptions manage = webDriver.Manage();
        var cookies = site.GetCookies();
        if (cookies != null)
        {
            foreach (KeyValuePair<String, String> cookieEntry in cookies)
            {
                Cookie cookie = new Cookie(cookieEntry.Key, cookieEntry.Value);
                manage.Cookies.AddCookie(cookie);
            }
        }

        // Rebuild the URL so that non-ASCII (e.g. Chinese) query strings are encoded.
        Uri uri = new Uri(request.Url);
        string query = uri.Query;

        // Keep an explicit non-default port; it used to be dropped from the rebuilt URL.
        string port = uri.IsDefaultPort ? "" : (":" + uri.Port);
        string encodedQuery = string.IsNullOrEmpty(query)
            ? ""
            : ("?" + HttpUtility.UrlPathEncode(query.Substring(1)));
        string realUrl = uri.Scheme + "://" + uri.DnsSafeHost + port + uri.AbsolutePath + encodedQuery;

        webDriver.Navigate().GoToUrl(realUrl);

        string resultUrl = webDriver.Url;
        if (resultUrl.Contains("error") || resultUrl.Contains("login") || resultUrl.Contains("//www.tmall.com"))
        {
            Logger.Error("Url error: " + realUrl);

            // Close the driver and forget it, so the finally block does not hand a
            // closed driver back to the pool.
            IWebDriver closedDriver = webDriver;
            webDriver = null;
            _webDriverPool.Close(closedDriver);

            throw new SpiderExceptoin("Browser request too much.");
        }

        Thread.Sleep(_webDriverWaitTime);

        Page page = new Page(request);
        page.SetRawText(webDriver.PageSource);
        page.SetUrl(new PlainText(request.Url));

        // Must be cleared when done: if this value is stored in Redis it causes a
        // single task to be run in an endless loop.
        request.PutExtra(Request.CycleTriedTimes, null);
        return page;
    }
    catch (Exception e)
    {
        if (site.CycleRetryTimes > 0)
        {
            return AddToCycleRetry(request, site);
        }

        OnError(request, e);
        return null;
    }
    finally
    {
        // Null when Get() failed or the driver was closed above.
        if (webDriver != null)
        {
            _webDriverPool.ReturnToPool(webDriver);
        }
    }
}
/// <summary>
/// Resolves an environment field name to its value. The names "url", "targeturl",
/// "now", "monday", "today" and "index" are matched case-insensitively; any other name
/// is looked up in the request extras.
/// </summary>
/// <param name="field">The environment field name.</param>
/// <param name="page">The current page (source of URL values and request extras).</param>
/// <param name="index">The value returned for the "index" field.</param>
/// <returns>
/// The resolved value; the field name itself when it is neither a known name nor an
/// existing request extra.
/// </returns>
private string GetEnviromentValue(string field, Page page, int index)
{
    // Lower-case once with the invariant culture: culture-sensitive ToLower() breaks
    // keyword matching under e.g. Turkish casing rules ("INDEX" -> "ındex").
    switch (field.ToLowerInvariant())
    {
        case "url":
            return page.Url;
        case "targeturl":
            return page.TargetUrl;
        case "now":
            // Invariant culture keeps the timestamp format stable on every machine.
            return DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss", System.Globalization.CultureInfo.InvariantCulture);
        case "monday":
            return DateTimeUtils.MONDAY_RUN_ID;
        case "today":
            return DateTimeUtils.TODAY_RUN_ID;
        case "index":
            return index.ToString();
    }

    // Unknown names fall back to the request extras, looked up with the original casing.
    if (!page.Request.ExistExtra(field))
    {
        return field;
    }

    return page.Request.GetExtra(field)?.ToString();
}