/// <summary>
/// Extracts the entity described by <c>EntityMetadata</c> from a page as JSON objects.
/// </summary>
/// <remarks>
/// Every entry of <c>GlobalValues</c> is evaluated against the page first and stored in the
/// request extras under the entry's name. After that the entity selector is applied: in multi
/// mode each matched node is extracted (limited to <c>EntityMetadata.Take</c> nodes when that
/// value is positive), otherwise a single object is extracted.
/// </remarks>
/// <param name="page">Page to extract from.</param>
/// <returns>
/// In multi mode: <c>null</c> when no node matches, otherwise the non-null extracted objects
/// (the list may be empty). In single mode: a one-element list, or <c>null</c> when the
/// selector matches nothing or the extraction yields <c>null</c>.
/// </returns>
public virtual List<JObject> Extract(Page page)
{
    if (GlobalValues != null && GlobalValues.Count > 0)
    {
        foreach (var globalValue in GlobalValues)
        {
            string name = globalValue.Name;
            // Select can come back empty-handed (the single-entity path below null-checks it
            // as well), so a missing node stores a null extra instead of throwing.
            var value = page.Selectable.Select(SelectorUtil.Parse(globalValue))?.GetValue();
            page.Request.PutExtra(name, value);
        }
    }

    ISelector selector = SelectorUtil.Parse(EntityMetadata.Selector);

    if (selector != null && EntityMetadata.Multi)
    {
        var nodes = page.Selectable.SelectList(selector).Nodes();
        if (nodes == null || nodes.Count == 0)
        {
            return null;
        }

        if (EntityMetadata.Take > 0)
        {
            nodes = nodes.Take(EntityMetadata.Take).ToList();
        }

        List<JObject> extracted = new List<JObject>();
        int index = 0;
        foreach (var node in nodes)
        {
            var obj = ExtractSingle(page, node, index);
            if (obj != null)
            {
                extracted.Add(obj);
            }
            index++;
        }
        return extracted;
    }

    // No selector means the whole page is the entity.
    ISelectable single = selector == null ? page.Selectable : page.Selectable.Select(selector);
    if (single == null)
    {
        return null;
    }

    var singleResult = ExtractSingle(page, single, 0);
    return singleResult != null ? new List<JObject> { singleResult } : null;
}
/// <summary>
/// Parses a page into spider entity objects.
/// </summary>
/// <remarks>
/// Every entry of <c>EntityDefine.SharedValues</c> is evaluated against the page first and
/// stored in the request extras under the entry's name. In multi mode each matched node is
/// extracted (limited to <c>EntityDefine.Take</c> nodes when that value is positive);
/// otherwise a single entity is extracted.
/// </remarks>
/// <param name="page">Page data.</param>
/// <returns>
/// The spider entity objects. In multi mode: <c>null</c> when no node matches, otherwise the
/// non-null extracted entities (the list may be empty). In single mode: a one-element list,
/// or <c>null</c> when the selector matches nothing or the extraction yields <c>null</c>.
/// </returns>
public List<T> Extract(Page page)
{
    if (EntityDefine.SharedValues != null && EntityDefine.SharedValues.Count > 0)
    {
        foreach (var sharedValue in EntityDefine.SharedValues)
        {
            string name = sharedValue.Name;
            // Select can come back empty-handed (the single-entity path below null-checks it
            // as well), so a missing node stores a null extra instead of throwing.
            var value = page.Selectable.Select(SelectorUtil.Parse(sharedValue))?.GetValue();
            page.Request.PutExtra(name, value);
        }
    }

    ISelector selector = SelectorUtil.Parse(EntityDefine.Selector);

    if (selector != null && EntityDefine.Multi)
    {
        var nodes = page.Selectable.SelectList(selector).Nodes();
        if (nodes == null || nodes.Count == 0)
        {
            return null;
        }

        if (EntityDefine.Take > 0)
        {
            nodes = nodes.Take(EntityDefine.Take).ToList();
        }

        List<T> entities = new List<T>();
        int index = 0;
        foreach (var node in nodes)
        {
            var entity = ExtractSingle(page, node, index);
            if (entity != null)
            {
                entities.Add(entity);
            }
            index++;
        }
        return entities;
    }

    // No selector means the whole page is the entity.
    ISelectable single = selector == null ? page.Selectable : page.Selectable.Select(selector);
    if (single == null)
    {
        return null;
    }

    var item = ExtractSingle(page, single, 0);
    return item != null ? new List<T> { item } : null;
}
/// <summary>
/// Decides whether to stop by comparing the times selected from the page with the
/// configured stop times in <c>Times</c>.
/// </summary>
/// <param name="page">Page whose time values are selected with <c>TimeSelector</c>.</param>
/// <param name="creator">Not used by this implementation.</param>
/// <returns>
/// <c>true</c> when the selection yields no value list, or when any page time is earlier
/// (<c>IsBefore</c>) or later (otherwise) than any configured stop time; else <c>false</c>.
/// </returns>
public bool NeedStop(Page page, BaseTargetUrlsCreator creator)
{
    var rawTimes = page.Selectable.SelectList(SelectorUtil.Parse(TimeSelector)).GetValues();
    if (rawTimes == null)
    {
        return true;
    }

    // All values are formatted first; parsing only starts once every value went through the formatters.
    List<string> formattedTimes = new List<string>();
    foreach (var raw in rawTimes)
    {
        var text = raw;
        if (TimeFormatters != null)
        {
            foreach (var formatter in TimeFormatters)
            {
                text = formatter.Formate(text);
            }
        }
        formattedTimes.Add(text);
    }

    foreach (var formatted in formattedTimes)
    {
        var pageTime = DateTime.Parse(formatted);
        bool stopWhenEarlier = IsBefore;

        foreach (var stopper in Times)
        {
            var stopTime = DateTime.Parse(stopper);
            bool reached = stopWhenEarlier ? pageTime < stopTime : pageTime > stopTime;
            if (reached)
            {
                return true;
            }
        }
    }

    return false;
}
/// <summary>
/// Decides whether to stop by comparing the times selected from the page with the
/// configured stop times in <c>Stoppers</c>.
/// </summary>
/// <param name="page">Page whose time values are selected with <c>CurrenctPageSelector</c>.</param>
/// <returns>
/// <c>true</c> when the selection yields no value list, or when any page time is earlier
/// (<c>IsBefore</c>) or later (otherwise) than any configured stop time; else <c>false</c>.
/// </returns>
public override bool CanStop(Page page)
{
    var selected = page.Selectable.SelectList(SelectorUtil.Parse(CurrenctPageSelector)).GetValues();
    if (selected == null)
    {
        return true;
    }

    // Run every selected value through the formatter chain before any of them is parsed.
    List<string> prepared = new List<string>();
    foreach (var value in selected)
    {
        var text = value;
        if (CurrenctPageFormatters != null)
        {
            foreach (var formatter in CurrenctPageFormatters)
            {
                text = formatter.Formate(text);
            }
        }
        prepared.Add(text);
    }

    foreach (var entry in prepared)
    {
        var pageTime = DateTime.Parse(entry.ToString());
        bool stopWhenEarlier = IsBefore;

        foreach (var stopper in Stoppers)
        {
            var limit = DateTime.Parse(stopper);

            bool reached;
            if (stopWhenEarlier)
            {
                reached = pageTime < limit;
            }
            else
            {
                reached = pageTime > limit;
            }

            if (reached)
            {
                return true;
            }
        }
    }

    return false;
}
/// <summary>
/// Builds the runtime <c>Model.TargetUrlExtractor</c> from this definition.
/// </summary>
/// <returns>
/// An extractor carrying <c>Formatters</c>, the parsed <c>Region</c> selector and one
/// <see cref="Regex"/> per entry of <c>Patterns</c> that is not null, empty or whitespace-only.
/// </returns>
internal Model.TargetUrlExtractor GetTargetUrlExtractInfo()
{
    var extractor = new Model.TargetUrlExtractor();
    extractor.Formatters = Formatters;
    extractor.Region = SelectorUtil.Parse(Region);

    foreach (var pattern in Patterns)
    {
        // Blank patterns are skipped; the regex is built from the untrimmed text.
        if (pattern == null || pattern.Trim().Length == 0)
        {
            continue;
        }

        extractor.Patterns.Add(new Regex(pattern));
    }

    return extractor;
}
/// <summary>
/// Decides whether to stop by comparing the current page number with the total page count,
/// both read from the page.
/// </summary>
/// <param name="page">Page the two numbers are selected from.</param>
/// <param name="creator">Not used by this implementation.</param>
/// <returns><c>true</c> when the current page number equals the total page count.</returns>
public bool NeedStop(Page page, BaseTargetUrlsCreator creator)
{
    // The two fallbacks differ, so the numbers never compare equal when either one is missing.
    int total = -2000;
    if (TotalPageSelector != null)
    {
        string totalText = page.Selectable.Select(SelectorUtil.Parse(TotalPageSelector)).GetValue();
        if (TotalPageFormatters != null)
        {
            foreach (var formatter in TotalPageFormatters)
            {
                totalText = formatter.Formate(totalText);
            }
        }

        if (!string.IsNullOrEmpty(totalText))
        {
            total = int.Parse(totalText);
        }
    }

    int current = -1000;
    if (CurrenctPageSelector != null)
    {
        string currentText = page.Selectable.Select(SelectorUtil.Parse(CurrenctPageSelector)).GetValue();
        if (CurrnetPageFormatters != null)
        {
            foreach (var formatter in CurrnetPageFormatters)
            {
                currentText = formatter.Formate(currentText);
            }
        }

        if (!string.IsNullOrEmpty(currentText))
        {
            current = int.Parse(currentText);
        }
    }

    return current == total;
}
/// <summary>
/// Reads an interval value from the page.
/// </summary>
/// <param name="page">Page the value is selected from with <c>Selector</c>.</param>
/// <returns>
/// The parsed integer, or <c>null</c> when the selected text is null/empty either before or
/// after the <c>IntervalFormatters</c> have been applied.
/// </returns>
public int? Interval(Page page)
{
    var text = page.Selectable.Select(SelectorUtil.Parse(Selector)).GetValue();
    if (string.IsNullOrEmpty(text))
    {
        return null;
    }

    if (IntervalFormatters != null)
    {
        foreach (var formatter in IntervalFormatters)
        {
            text = formatter.Formate(text);
        }
    }

    // Formatters may have reduced the text to nothing.
    if (string.IsNullOrEmpty(text))
    {
        return null;
    }

    return int.Parse(text);
}
/// <summary>
/// Extracts the value of one field from a selectable node.
/// </summary>
/// <param name="item">Node the field selector is applied to.</param>
/// <param name="page">Current page; used to resolve environment values.</param>
/// <param name="field">Field definition (selector, option, formatters, multi flag).</param>
/// <param name="index">Index of the entity on the page; passed to the environment lookup.</param>
/// <returns>
/// <c>null</c> when <paramref name="field"/> is null or has no selector; the formatted
/// environment value for an environment selector; a <c>JArray</c> of formatted values for a
/// multi field; the node count as a string ("-1" when the node list is null) for the
/// <c>Count</c> option; otherwise the formatted single value.
/// </returns>
private dynamic ExtractField(ISelectable item, Page page, Field field, int index)
{
    if (field == null)
    {
        return null;
    }

    ISelector selector = SelectorUtil.Parse(field.Selector);
    if (selector == null)
    {
        return null;
    }

    var enviromentSelector = selector as EnviromentSelector;
    if (enviromentSelector != null)
    {
        string enviromentValue = GetEnviromentValue(enviromentSelector.Field, page, index);
        foreach (var formatter in field.Formatters)
        {
            enviromentValue = formatter.Formate(enviromentValue);
        }
        return enviromentValue;
    }

    bool needPlainText = field.Option == PropertyDefine.Options.PlainText;

    if (field.Multi)
    {
        var nodes = item.SelectList(selector).Nodes();
        List<string> results = new List<string>();

        // The Count branch below treats Nodes() as possibly null; do the same here so a
        // selector without matches produces an empty array instead of a NullReferenceException.
        if (nodes != null)
        {
            foreach (var node in nodes)
            {
                results.Add(node.GetValue(needPlainText));
            }
        }

        foreach (var formatter in field.Formatters)
        {
            results = formatter.Formate(results);
        }
        return new JArray(results);
    }

    if (field.Option == PropertyDefine.Options.Count)
    {
        var nodes = item.SelectList(selector).Nodes();
        // "-1" marks a missing node list, as opposed to "0" for an empty one.
        return nodes?.Count.ToString() ?? "-1";
    }

    string value = item.Select(selector)?.GetValue(needPlainText);
    foreach (var formatter in field.Formatters)
    {
        value = formatter.Formate(value);
    }
    return value;
}
/// <summary>
/// Extracts the value of one column from a selectable node.
/// </summary>
/// <remarks>
/// In DEBUG builds an exception thrown by a formatter is logged through <c>Logger.NLog</c>
/// and the remaining formatters still run on the unchanged value; in other builds the
/// exception propagates.
/// </remarks>
/// <param name="item">Node the column selector is applied to.</param>
/// <param name="page">Current page; used to resolve environment values.</param>
/// <param name="field">Column definition (selector, option, formatters, data type).</param>
/// <param name="index">Index of the entity on the page; passed to the environment lookup.</param>
/// <returns>
/// <c>null</c> when <paramref name="field"/> is null or has no selector; the node count for
/// the <c>Count</c> option; otherwise the formatted value passed through <c>TryConvert</c>.
/// </returns>
private object ExtractField(ISelectable item, Page page, Column field, int index)
{
    if (field == null)
    {
        return null;
    }

    ISelector selector = SelectorUtil.Parse(field.Selector);
    if (selector == null)
    {
        return null;
    }

    var enviromentSelector = selector as EnviromentSelector;
    if (enviromentSelector != null)
    {
        var enviromentValue = SelectorUtil.GetEnviromentValue(enviromentSelector.Field, page, index);
        foreach (var formatter in field.Formatters)
        {
#if DEBUG
            try
            {
#endif
                enviromentValue = formatter.Formate(enviromentValue);
#if DEBUG
            }
            catch (Exception e)
            {
                Logger.NLog(e.ToString(), Level.Error);
            }
#endif
        }
        return TryConvert(enviromentValue, field.DataType);
    }

    if (field.Option == PropertyDefine.Options.Count)
    {
        return item.SelectList(selector).Nodes().Count;
    }

    bool plainText = field.Option == PropertyDefine.Options.PlainText;
    object selected = item.Select(selector)?.GetValue(plainText);
    foreach (var formatter in field.Formatters)
    {
#if DEBUG
        try
        {
#endif
            selected = formatter.Formate(selected);
#if DEBUG
        }
        catch (Exception e)
        {
            Logger.NLog(e.ToString(), Level.Error);
        }
#endif
    }
    return TryConvert(selected, field.DataType);
}
/// <summary>
/// Decides whether to stop by comparing the current page number with the total page count.
/// Each number comes either from an environment value or from the page content, depending
/// on the type of its selector.
/// </summary>
/// <param name="page">Page the two numbers are resolved against.</param>
/// <param name="creator">Not used by this implementation.</param>
/// <returns><c>true</c> when the current page number equals the total page count.</returns>
public bool NeedStop(Page page, BaseTargetUrlsCreator creator)
{
    // The two fallbacks differ, so the numbers never compare equal when either one is missing.
    int total = -2000;
    if (TotalPageSelector != null)
    {
        string totalText = string.Empty;
        if (TotalPageSelector.Type != SelectorType.Enviroment)
        {
            totalText = page.Selectable.Select(SelectorUtil.Parse(TotalPageSelector)).GetValue();
        }
        else
        {
            var enviromentSelector = SelectorUtil.Parse(TotalPageSelector) as EnviromentSelector;
            if (enviromentSelector != null)
            {
                totalText = EntityExtractor.GetEnviromentValue(enviromentSelector.Field, page, 0);
            }
        }

        // Formatters only run on a non-empty raw value.
        if (!string.IsNullOrEmpty(totalText))
        {
            if (TotalPageFormatters != null)
            {
                foreach (var formatter in TotalPageFormatters)
                {
                    totalText = formatter.Formate(totalText);
                }
            }

            if (!string.IsNullOrEmpty(totalText))
            {
                total = int.Parse(totalText);
            }
        }
    }

    int current = -1000;
    if (CurrenctPageSelector != null)
    {
        string currentText = string.Empty;
        if (CurrenctPageSelector.Type != SelectorType.Enviroment)
        {
            currentText = page.Selectable.Select(SelectorUtil.Parse(CurrenctPageSelector)).GetValue();
        }
        else
        {
            var enviromentSelector = SelectorUtil.Parse(CurrenctPageSelector) as EnviromentSelector;
            if (enviromentSelector != null)
            {
                currentText = EntityExtractor.GetEnviromentValue(enviromentSelector.Field, page, 0);
            }
        }

        // Formatters only run on a non-empty raw value.
        if (!string.IsNullOrEmpty(currentText))
        {
            if (CurrnetPageFormatters != null)
            {
                foreach (var formatter in CurrnetPageFormatters)
                {
                    currentText = formatter.Formate(currentText);
                }
            }

            if (!string.IsNullOrEmpty(currentText))
            {
                current = int.Parse(currentText);
            }
        }
    }

    return current == total;
}
/// <summary>
/// Extracts the value of a data token (a plain field or a nested entity) from a node.
/// </summary>
/// <param name="item">Node the token's selector is applied to.</param>
/// <param name="page">Current page; used to resolve environment values.</param>
/// <param name="field">Token definition; either a <c>Field</c> or an <c>Entity</c>.</param>
/// <param name="index">Index of the entity on the page; passed to the environment lookup.</param>
/// <returns>
/// <c>null</c> when the token has no selector. For an entity: a <c>JArray</c> of
/// <c>JObject</c>s (multi) or a single <c>JObject</c>, or <c>null</c> when the single
/// entity's node is not found. For a field: the formatted environment value, a <c>JArray</c>
/// of formatted values (multi), the node count as a string ("-1" when the node list is null)
/// for the <c>Count</c> option, or the formatted single value.
/// </returns>
private dynamic ExtractField(ISelectable item, Page page, DataToken field, int index)
{
    ISelector selector = SelectorUtil.Parse(field.Selector);
    if (selector == null)
    {
        return null;
    }

    var entity = field as Entity;
    if (entity != null)
    {
        if (field.Multi)
        {
            JArray objs = new JArray();
            var selectables = item.SelectList(selector).Nodes();
            foreach (var selectable in selectables)
            {
                JObject obj = new JObject();
                foreach (var child in entity.Fields)
                {
                    obj.Add(child.Name, ExtractField(selectable, page, child, 0));
                }
                objs.Add(obj);
            }
            return objs;
        }

        var entityNode = item.Select(selector);
        if (entityNode == null)
        {
            // Nothing matched: there is no node the child selectors could be applied to.
            return null;
        }

        JObject single = new JObject();
        foreach (var child in entity.Fields)
        {
            // Recurse into the child. Passing `field` here (as the code used to) re-enters
            // this same entity branch forever.
            single.Add(child.Name, ExtractField(entityNode, page, child, 0));
        }
        return single;
    }

    var f = field as Field;

    var enviromentSelector = selector as EnviromentSelector;
    if (enviromentSelector != null)
    {
        string enviromentValue = GetEnviromentValue(enviromentSelector.Field, page, index);
        if (f != null)
        {
            foreach (var formatter in f.Formatters)
            {
                enviromentValue = formatter.Formate(enviromentValue);
            }
        }
        return enviromentValue;
    }

    // Use the already-computed `f` rather than a hard cast, in line with the null checks
    // guarding the formatter loops.
    bool needPlainText = f != null && f.Option == PropertySelector.Options.PlainText;

    if (field.Multi)
    {
        var nodes = item.SelectList(selector).Nodes();
        List<string> results = new List<string>();

        // The Count branch below treats Nodes() as possibly null; do the same here.
        if (nodes != null)
        {
            foreach (var node in nodes)
            {
                results.Add(node.GetValue(needPlainText));
            }
        }

        if (f != null)
        {
            foreach (var formatter in f.Formatters)
            {
                results = formatter.Formate(results);
            }
        }
        return new JArray(results);
    }

    bool needCount = f != null && f.Option == PropertySelector.Options.Count;
    if (needCount)
    {
        var nodes = item.SelectList(selector).Nodes();
        // "-1" marks a missing node list, as opposed to "0" for an empty one.
        return nodes?.Count.ToString() ?? "-1";
    }

    string value = item.Select(selector)?.GetValue(needPlainText);
    if (f != null)
    {
        foreach (var formatter in f.Formatters)
        {
            value = formatter.Formate(value);
        }
    }
    return value;
}
/// <summary>
/// Processes a page: checks that its URL is a target, stores global values in the request
/// extras, extracts the entity objects and finally extracts follow-up links.
/// </summary>
/// <param name="page">Page to process.</param>
/// <returns>
/// <c>null</c> when the page URL matches none of the configured target regexes, when the
/// entity selector matches nothing, or when the single extraction yields <c>null</c>;
/// otherwise the non-null extracted objects. Links are extracted (unless
/// <c>page.MissExtractTargetUrls</c> is set) for every page that passes the URL check, even
/// when no entity was found.
/// </returns>
public List<JObject> Process(Page page)
{
    // The page is a target when any regex of any extractor matches its URL, or when no
    // regex is configured at all. Stop at the first match so that a later, non-matching
    // extractor cannot overwrite it.
    bool hasPattern = false;
    bool matched = false;
    foreach (var targetUrlExtractor in EntityMetadata.TargetUrlExtractors)
    {
        foreach (var regex in targetUrlExtractor.Regexes)
        {
            hasPattern = true;
            if (regex.IsMatch(page.Url))
            {
                matched = true;
                break;
            }
        }

        if (matched)
        {
            break;
        }
    }

    if (hasPattern && !matched)
    {
        return null;
    }

    if (_globalValues != null && _globalValues.Count > 0)
    {
        foreach (var globalValue in _globalValues)
        {
            string name = globalValue.Name;
            // Select can come back empty-handed (the single-entity path below null-checks it
            // as well), so a missing node stores a null extra instead of throwing.
            var value = page.Selectable.Select(SelectorUtil.Parse(globalValue))?.GetValue();
            page.Request.PutExtra(name, value);
        }
    }

    List<JObject> result;
    ISelector selector = SelectorUtil.Parse(EntityMetadata.Entity.Selector);

    if (selector != null && EntityMetadata.Entity.Multi)
    {
        var nodes = page.Selectable.SelectList(selector).Nodes();
        if (nodes == null || nodes.Count == 0)
        {
            result = null;
        }
        else
        {
            var limit = EntityMetadata.Limit;
            if (limit != null)
            {
                nodes = nodes.Take(limit.Value).ToList();
            }

            result = new List<JObject>();
            int index = 0;
            foreach (var node in nodes)
            {
                JObject obj = ProcessSingle(page, node, index);
                if (obj != null)
                {
                    result.Add(obj);
                }
                index++;
            }
        }
    }
    else
    {
        // No selector means the whole page is the entity.
        ISelectable single = selector == null ? page.Selectable : page.Selectable.Select(selector);
        if (single != null)
        {
            var singleResult = ProcessSingle(page, single, 0);
            // Same rule as the multi branch: a null object never ends up in the list.
            result = singleResult != null ? new List<JObject> { singleResult } : null;
        }
        else
        {
            result = null;
        }
    }

    if (!page.MissExtractTargetUrls)
    {
        ExtractLinks(page, EntityMetadata.TargetUrlExtractors);
    }

    return result;
}
/// <summary>
/// Processes a page: stores the environment values in the request extras and extracts the
/// entity described by <c>_entityDefine</c>.
/// </summary>
/// <param name="page">Page to process.</param>
/// <returns>
/// In multi mode (a selector exists and the entity is marked multi): <c>null</c> when no node
/// matches, otherwise a <c>List&lt;JObject&gt;</c> of the non-null extracted objects.
/// In single mode: the result of <c>ProcessSingle</c>, or <c>null</c> when the selector
/// matches nothing.
/// </returns>
public dynamic Process(Page page)
{
    if (_enviromentValues != null && _enviromentValues.Count > 0)
    {
        foreach (var enviromentValue in _enviromentValues)
        {
            string name = enviromentValue.Name;
            // Select can come back empty-handed (the single-entity path below null-checks it
            // as well), so a missing node stores a null extra instead of throwing.
            var value = page.Selectable.Select(SelectorUtil.Parse(enviromentValue.Selector))?.GetValue();
            page.Request.PutExtra(name, value);
        }
    }

    ISelector selector = SelectorUtil.Parse(_entityDefine.Selector);

    // Without a selector there is nothing to enumerate, so the entity is treated as single.
    bool isMulti = selector != null && _entityDefine.Multi;

    if (isMulti)
    {
        var nodes = page.Selectable.SelectList(selector).Nodes();
        if (nodes == null || nodes.Count == 0)
        {
            return null;
        }

        var limit = _entityDefine.Limit;
        if (limit != null)
        {
            nodes = nodes.Take(limit.Value).ToList();
        }

        List<JObject> result = new List<JObject>();
        int index = 0;
        foreach (var node in nodes)
        {
            JObject obj = ProcessSingle(page, node, _entityDefine, index);
            if (obj != null)
            {
                result.Add(obj);
            }
            index++;
        }
        return result;
    }

    ISelectable single;
    if (selector == null)
    {
        // No selector means the whole page is the entity.
        single = page.Selectable;
    }
    else
    {
        single = page.Selectable.Select(selector);
        if (single == null)
        {
            return null;
        }
    }

    return ProcessSingle(page, single, _entityDefine, 0);
}