/// <summary>
/// Expands a total item count into one cloned document per page number.
/// </summary>
/// <param name="document">Row whose <c>Column</c> cell holds the total item count.</param>
/// <returns>
/// One clone of <paramref name="document"/> per page, with the page number stored in
/// <c>NewColumn</c> (or in <c>Column</c> when <c>NewColumn</c> is empty). Nothing is
/// yielded when the page size, the total or the start value is not an integer.
/// </returns>
protected override IEnumerable<IFreeDocument> InternalTransformManyData(IFreeDocument document)
{
    var targetColumn = string.IsNullOrEmpty(NewColumn) ? Column : NewColumn;

    int itemPerPage;
    if (!int.TryParse(document.Query(ItemPerPage), out itemPerPage))
    {
        yield break;
    }

    // A missing cell used to throw NullReferenceException on ToString();
    // it is now handled like any other unparsable total.
    var rawTotal = document[Column];
    int total;
    if (rawTotal == null || !int.TryParse(rawTotal.ToString(), out total))
    {
        yield break;
    }

    int min;
    if (!int.TryParse(document.Query(MinValue), out min))
    {
        yield break;
    }

    // A page size of zero would divide by zero; fall back to one item per page.
    if (itemPerPage == 0)
    {
        itemPerPage = 1;
    }

    // Round up so that a trailing partial page still gets its own number.
    var pageCount = total / itemPerPage;
    if (total % itemPerPage != 0)
    {
        pageCount += 1;
    }

    for (var page = min; page < min + pageCount; page++)
    {
        var clone = document.Clone();
        clone[targetColumn] = page;
        yield return clone;
    }
}
/// <summary>
/// Generates one document per step of a numeric range, optionally formatting each value.
/// </summary>
/// <param name="document">Document used to resolve <c>Interval</c>, <c>MinValue</c> and <c>MaxValue</c> through <c>Query</c>.</param>
/// <returns>
/// Documents whose <c>Column</c> holds each value from the minimum up to and including the
/// maximum, rounded to 5 decimals; the value is a string when <c>Format</c> is set.
/// Nothing is yielded when a setting cannot be parsed or the interval is not positive.
/// </returns>
public override IEnumerable<IFreeDocument> Generate(IFreeDocument document = null)
{
    int interval;
    double max, min;
    if (!int.TryParse(document.Query(Interval), out interval)
        || !double.TryParse(document.Query(MinValue), out min)
        || !double.TryParse(document.Query(MaxValue), out max))
    {
        yield break;
    }

    // A zero or negative step never moves past max, so the loop below would never end.
    if (interval <= 0)
    {
        yield break;
    }

    var hasFormat = !string.IsNullOrEmpty(Format);
    for (var current = min; current <= max; current += interval)
    {
        var value = Math.Round(current, 5);
        object cell;
        if (hasFormat)
        {
            cell = value.ToString(Format);
        }
        else
        {
            cell = value;
        }

        var item = new FreeDocument();
        item.Add(Column, cell);
        yield return item;
    }
}
/// <summary>
/// Builds a formatted string from the current cell and the columns listed in <c>MergeWith</c>.
/// </summary>
/// <param name="datas">Row being transformed.</param>
/// <returns>
/// The cell value itself (or an empty string when it is null) if <c>Format</c> is empty;
/// otherwise the result of <c>string.Format</c> applied to the resolved format string.
/// </returns>
public override object TransformData(IFreeDocument datas)
{
    var current = datas[Column];
    if (current == null)
    {
        current = "";
    }

    if (string.IsNullOrEmpty(Format))
    {
        return current;
    }

    // Every regex match found in the resolved template is replaced by its Query result.
    var template = datas.Query(Format);
    foreach (Match placeholder in rgx.Matches(template))
    {
        template = template.Replace(placeholder.Value, datas.Query(placeholder.Value));
    }

    // Argument {0} is the current cell; the MergeWith tokens follow in order.
    var arguments = new List<object> { current };
    var tokens = MergeWith.Split(new[] { " " }, StringSplitOptions.RemoveEmptyEntries);
    foreach (var token in tokens)
    {
        // A token that is not a column name is passed through as a literal.
        if (datas.ContainsKey(token))
        {
            arguments.Add(datas[token]);
        }
        else
        {
            arguments.Add(token);
        }
    }

    return string.Format(template, arguments.ToArray());
}
/// <summary>
/// Yields a single document containing the text of a file, reusing a buffered copy when one exists.
/// </summary>
/// <param name="document">Optional document used to resolve <c>FileName</c> through <c>Query</c>.</param>
/// <returns>One document whose <c>Column</c> holds the file content.</returns>
public override IEnumerable<IFreeDocument> Generate(IFreeDocument document = null)
{
    // Use the resolved name when there is one, otherwise the raw FileName setting.
    var path = document?.Query(FileName) ?? FileName;

    var buffered = buffHelper.Get(path);
    if (buffered != null)
    {
        yield return buffered;
        yield break;
    }

    var text = File.ReadAllText(path, AttributeHelper.GetEncoding(EncodingType));
    var loaded = new FreeDocument();
    loaded.Add(Column, text);
    buffHelper.Set(path, loaded);
    yield return loaded;
}
/// <summary>
/// Extracts data from the cell's HTML (or nested document) using <c>XPath</c>.
/// </summary>
/// <param name="document">Row whose <c>Column</c> cell holds HTML text or a nested document.</param>
/// <returns>
/// The nested document's XPath result; or the text-node content when <c>GetText</c> is set
/// and a text node is found; or the number of matching nodes when <c>GetCount</c> is set;
/// otherwise the XPath result for the parsed HTML.
/// </returns>
public override object TransformData(IFreeDocument document)
{
    var item = document[Column];

    var nested = item as IFreeDocument;
    if (nested != null)
    {
        return nested.GetDataFromXPath(XPath);
    }

    var docu = new HtmlDocument();
    docu.LoadHtml(item.ToString());

    if (GetText)
    {
        var path = docu.DocumentNode.GetTextNode();
        var textnode = docu.DocumentNode.SelectSingleNode(path);
        if (textnode != null)
        {
            return textnode.GetNodeText();
        }
    }

    if (GetCount)
    {
        // SelectNodes yields null rather than an empty collection when nothing matches,
        // which used to surface as a NullReferenceException; report zero matches instead.
        // NOTE(review): this branch uses the raw XPath while the final branch uses
        // document.Query(XPath) - confirm whether that difference is intended.
        var matches = docu.DocumentNode.SelectNodes(XPath);
        return matches == null ? 0 : matches.Count;
    }

    return docu.DocumentNode.GetDataFromXPath(document.Query(XPath));
}
/// <summary>
/// Reads documents from the configured connector, caching them when running in execute mode.
/// </summary>
/// <param name="document">Optional document used to resolve <c>FileName</c> through <c>Query</c>.</param>
/// <returns>
/// An empty list when no connector is set; otherwise the connector's rows, passed through
/// <c>CacheDo</c> when <c>IsExecute</c> is true.
/// </returns>
public override IEnumerable<IFreeDocument> Generate(IFreeDocument document = null)
{
    if (Connector == null)
    {
        return new List<IFreeDocument>();
    }

    // Use the resolved name when there is one, otherwise the raw FileName setting.
    var path = document?.Query(FileName) ?? FileName;
    Connector.FileName = path;

    var rows = Connector.ReadFile();
    if (IsExecute)
    {
        return rows.CacheDo(buffHelper.GetOrCreate(path, new List<FreeDocument>()), this.Father.SampleMount * 2);
    }

    return rows;
}
/// <summary>
/// Extracts data from the cell via XPath and records up to five HTML samples in <c>htmlResults</c>.
/// </summary>
/// <param name="document">Row whose <c>Column</c> cell holds HTML text or a nested document.</param>
/// <returns>The value produced by <c>GetDataFromXPath</c> for the nested document or the parsed HTML.</returns>
public override object TransformData(IFreeDocument document)
{
    var item = document[Column];

    // Keep only the first five inputs as samples.
    if (htmlResults.Count < 5)
    {
        htmlResults.Add(new XPathDetectorModel.HtmlResult()
        {
            HTML = item.ToString(),
            Url = "URL_" + htmlResults.Count
        });
    }

    var nested = item as IFreeDocument;
    if (nested != null)
    {
        return nested.GetDataFromXPath(XPath);
    }

    var html = new HtmlDocument();
    html.LoadHtml(item.ToString());
    var root = html.DocumentNode;

    if (GetText)
    {
        // The path comes from the document's own text node instead of the XPath setting.
        return root.GetDataFromXPath(root.GetTextNode(), CrawlType);
    }

    return root.GetDataFromXPath(document.Query(XPath), CrawlType, SelectorFormat);
}
/// <summary>
/// Produces one document per HTML node selected by <c>XPath</c> from the cell's HTML.
/// </summary>
/// <param name="data">Row whose <c>Column</c> cell holds the HTML to parse.</param>
/// <returns>
/// An empty list when the selection returns null; otherwise a lazily evaluated sequence of
/// documents carrying the node's "Text", "HTML" and "OHTML" values.
/// </returns>
protected override IEnumerable<IFreeDocument> InternalTransformManyData(IFreeDocument data)
{
    var item = data[Column];
    var html = new HtmlDocument();
    html.LoadHtml(item.ToString());

    var matches = html.DocumentNode.SelectNodes(data.Query(XPath), this.SelectorFormat);
    if (matches != null)
    {
        return matches.Select(node =>
        {
            var row = new FreeDocument();
            row.MergeQuery(data, NewColumn);
            row.SetValue("Text", node.GetNodeText());
            row.SetValue("HTML", node.InnerHtml);
            row.SetValue("OHTML", node.OuterHtml);
            return row;
        });
    }

    return new List<IFreeDocument>();
}
/// <summary>
/// Crawls the URL held in the current cell, retrying on failure and reusing buffered HTML.
/// </summary>
/// <param name="data">Row whose <c>Column</c> cell holds the URL.</param>
/// <returns>
/// The documents extracted by the crawler; an empty list when the cell is null or every
/// attempt fails before any result is assigned.
/// </returns>
private List<FreeDocument> GetDatas(IFreeDocument data)
{
    var cell = data[Column];
    if (cell == null)
    {
        return new List<FreeDocument>();
    }

    var url = cell.ToString();
    var post = data.Query(PostData);

    // POST requests to the same URL differ by body, so the body is part of the buffer key.
    var bufkey = url;
    if (crawler.Http.Method == MethodType.POST)
    {
        bufkey += post;
    }

    var htmldoc = buffHelper.Get(bufkey);
    if (htmldoc != null)
    {
        return crawler.CrawlData(htmldoc);
    }

    // int.TryParse writes 0 to its out argument on failure, which silently discarded the
    // intended default of one attempt and skipped the request altogether.
    int maxcount;
    if (!int.TryParse(data.Query(MaxTryCount), out maxcount))
    {
        maxcount = 1;
    }

    var docs = new List<FreeDocument>();
    for (var attempt = 0; attempt < maxcount; attempt++)
    {
        HttpStatusCode code;
        docs = crawler.CrawlData(url, out htmldoc, out code, post);
        if (HttpHelper.IsSuccess(code) && docs.Count > 0)
        {
            buffHelper.Set(bufkey, htmldoc);
            break;
        }

        Thread.Sleep(ErrorDelay);
    }

    return docs;
}
/// <summary>
/// Generates one document per step of a numeric range.
/// </summary>
/// <param name="document">Document used to resolve <c>Interval</c>, <c>MinValue</c> and <c>MaxValue</c> through <c>Query</c>.</param>
/// <returns>
/// Documents whose <c>Column</c> holds each value from the minimum up to and including the
/// maximum, rounded to 5 decimals. Nothing is yielded when a setting cannot be parsed or
/// the interval is not positive.
/// </returns>
public override IEnumerable<IFreeDocument> Generate(IFreeDocument document = null)
{
    int interval;
    double max, min;
    if (!int.TryParse(document.Query(Interval), out interval)
        || !double.TryParse(document.Query(MinValue), out min)
        || !double.TryParse(document.Query(MaxValue), out max))
    {
        yield break;
    }

    // A zero or negative step never moves past max, so the loop below would never end.
    if (interval <= 0)
    {
        yield break;
    }

    for (var current = min; current <= max; current += interval)
    {
        var item = new FreeDocument();
        item.Add(Column, Math.Round(current, 5));
        yield return item;
    }
}
/// <summary>
/// Generates one document per step of a numeric range, starting <c>Position</c> steps after the minimum.
/// </summary>
/// <param name="document">Document used to resolve <c>Interval</c>, <c>MinValue</c> and <c>MaxValue</c> through <c>Query</c>.</param>
/// <returns>
/// Documents whose <c>Column</c> holds each value from <c>Position * interval + min</c> up to
/// and including the maximum, rounded to 5 decimals. Nothing is yielded when a setting
/// cannot be parsed or the interval is not positive.
/// </returns>
public override IEnumerable<FreeDocument> Generate(IFreeDocument document = null)
{
    int interval;
    double max, min;
    if (!int.TryParse(document.Query(Interval), out interval)
        || !double.TryParse(document.Query(MinValue), out min)
        || !double.TryParse(document.Query(MaxValue), out max))
    {
        yield break;
    }

    // A zero or negative step never moves past max, so the loop below would never end.
    if (interval <= 0)
    {
        yield break;
    }

    for (var current = Position * interval + min; current <= max; current += interval)
    {
        var item = new FreeDocument();
        item.Add(Column, Math.Round(current, 5));
        yield return item;
    }
}
/// <summary>
/// Keeps a row when its numeric cell lies inside the inclusive [Min, Max] range.
/// </summary>
/// <param name="data">Row whose <c>Column</c> cell is tested.</param>
/// <returns>
/// False when the cell is null or cannot be converted to a double; true when either bound
/// cannot be parsed; otherwise whether the value is within the bounds.
/// </returns>
public override bool FilteDataBase(IFreeDocument data)
{
    object cell = data[this.Column];
    if (cell == null)
    {
        return false;
    }

    bool converted = false;
    var value = (double)AttributeHelper.ConvertTo(cell, SimpleDataType.DOUBLE, ref converted);
    if (!converted)
    {
        return false;
    }

    double upper = 1, lower = 0;
    var hasBounds = double.TryParse(data.Query(Max), out upper)
                    && double.TryParse(data.Query(Min), out lower);

    // Without two usable bounds the filter lets everything through.
    return !hasBounds || (value >= lower && value <= upper);
}
/// <summary>
/// Crawls the URL or HTML held in the current cell and returns the extracted documents.
/// </summary>
/// <param name="data">Row whose <c>Column</c> cell holds a URL, or raw HTML when <c>ProcessHtmlOnly</c> is set.</param>
/// <returns>
/// An empty list when the cell or <c>Crawler</c> is null; otherwise the crawler's results.
/// </returns>
/// <exception cref="Exception">
/// Thrown when the web request does not succeed, or when it succeeds but yields no documents.
/// </exception>
private IEnumerable<FreeDocument> GetDatas(IFreeDocument data)
{
    var cell = data[Column];
    if (cell == null || Crawler == null)
    {
        return new List<FreeDocument>();
    }

    var urlOrHtml = cell.ToString();
    var post = data.Query(PostData);

    // NOTE(review): the guard above already returns when Crawler is null, so this fallback
    // to defaultCrawler appears unreachable - confirm which behavior is intended.
    var crawler = Crawler ?? defaultCrawler;

    // POST requests to the same URL differ by body, so the body is part of the buffer key.
    var bufkey = urlOrHtml;
    if (crawler.Http.Method == MethodType.POST)
    {
        bufkey += post;
    }

    var htmldoc = buffHelper.Get(bufkey);
    if (htmldoc != null)
    {
        return crawler.CrawlData(htmldoc.DocumentNode);
    }

    if (ProcessHtmlOnly)
    {
        return crawler.CrawlHtmlData(urlOrHtml, out htmldoc);
    }

    HttpStatusCode code;
    IEnumerable<FreeDocument> docs = crawler.CrawlData(urlOrHtml, out htmldoc, out code, post);
    var any = docs.Any();

    if (!HttpHelper.IsSuccess(code))
    {
        throw new Exception("Web Request Error:" + code);
    }

    if (!any)
    {
        // A successful request that yields nothing is counted as a parse error.
        ConfigFile.GetConfig<DataMiningConfig>().ParseErrorCount++;
        throw new Exception(string.Format(GlobalHelper.Get("key_669"), urlOrHtml));
    }

    // The fetched page is buffered only outside execute mode.
    if (this.IsExecute == false)
    {
        buffHelper.Set(bufkey, htmldoc);
    }

    return docs;
}
/// <summary>
/// Generates <c>Count</c> documents, each holding a value from <c>random.Next(min, max)</c>.
/// </summary>
/// <param name="document">Document used to resolve <c>Count</c>, <c>MinValue</c> and <c>MaxValue</c> through <c>Query</c>.</param>
/// <returns>The generated documents; nothing when any setting is not an integer.</returns>
public override IEnumerable<IFreeDocument> Generate(IFreeDocument document = null)
{
    int count;
    int max, min;
    var parsed = int.TryParse(document.Query(Count), out count)
                 & true;
    if (!parsed
        || !int.TryParse(document.Query(MinValue), out min)
        || !int.TryParse(document.Query(MaxValue), out max))
    {
        yield break;
    }

    for (var produced = 0; produced < count; produced++)
    {
        var item = new FreeDocument();
        item.Add(Column, random.Next(min, max));
        yield return item;
    }
}
/// <summary>
/// Looks up the cell value with the Baidu place-search API and copies the parsed result into the row.
/// </summary>
/// <param name="datas">Row whose <c>Column</c> cell holds the search text; receives the result fields.</param>
/// <returns>Null when the cell is null; otherwise true, including when the lookup fails.</returns>
public override object TransformData(IFreeDocument datas)
{
    var item = datas[Column];
    if (item == null)
    {
        return null;
    }

    try
    {
        var cacheKey = item.ToString();
        var newlocation = buffHelper.Get(cacheKey);
        if (newlocation == null)
        {
            var r = datas.Query(Region);
            var tag = datas.Query(Tag);

            // The separator before "region" was the mis-encoded character '®' (from "&reg"),
            // so the region parameter never reached the API; restore "&region=".
            // NOTE(review): values are interpolated without URL escaping - confirm whether
            // HttpHelper encodes the address before sending.
            var apiUrl = $"http://api.map.baidu.com/place/v2/search?q={item}&region={r}&tag={tag}&output={format}&ak={apikey}";

            // Request the API address with GET and parse the response.
            var result = HttpHelper.GetWebSourceHtml(apiUrl, "utf-8");
            dynamic info = serialier.DeserializeObject(result);
            newlocation = Parse(info);

            buffHelper.Set(cacheKey, newlocation);
        }

        newlocation.DictCopyTo(datas);
    }
    catch (Exception)
    {
        // Best effort: a failed lookup leaves the row without the extra fields.
    }

    return true;
}
/// <summary>
/// Pauses for the number of milliseconds resolved from <c>DelayTime</c>, then passes the row through.
/// </summary>
/// <param name="data">Row to delay and return.</param>
/// <returns>A list containing only <paramref name="data"/>.</returns>
protected override IEnumerable<IFreeDocument> InternalTransformManyData(IFreeDocument data)
{
    // Only positive delays are honored: Thread.Sleep(-1) blocks indefinitely and smaller
    // values throw, so a negative setting must not reach it.
    int delay;
    if (int.TryParse(data.Query(DelayTime), out delay) && delay > 0)
    {
        Thread.Sleep(delay);
    }

    return new List<IFreeDocument>() { data };
}
/// <summary>
/// Generates documents for a numeric range, emitting each value <c>RepeatCount</c> times.
/// </summary>
/// <param name="document">Document used to resolve <c>Interval</c>, <c>RepeatCount</c>, <c>MinValue</c> and <c>MaxValue</c> through <c>Query</c>.</param>
/// <returns>
/// Documents whose <c>Column</c> holds each value from <c>Position * interval + min</c> up to
/// and including the maximum, rounded to 5 decimals and repeated. Nothing is yielded when a
/// setting cannot be parsed or the interval is not positive.
/// </returns>
public override IEnumerable<FreeDocument> Generate(IFreeDocument document = null)
{
    int interval, repeat;
    double max, min;
    if (!int.TryParse(document.Query(Interval), out interval)
        || !int.TryParse(document.Query(RepeatCount), out repeat)
        || !double.TryParse(document.Query(MinValue), out min)
        || !double.TryParse(document.Query(MaxValue), out max))
    {
        yield break;
    }

    // A zero or negative step never moves past max, so the loop below would never end.
    if (interval <= 0)
    {
        yield break;
    }

    for (var current = Position * interval + min; current <= max; current += interval)
    {
        for (var remaining = repeat; remaining > 0; remaining--)
        {
            var item = new FreeDocument();
            item.Add(Column, Math.Round(current, 5));
            yield return item;
        }
    }
}
/// <summary>
/// Tests whether the row's numeric cell falls inside the inclusive range given by <c>Min</c> and <c>Max</c>.
/// </summary>
/// <param name="data">Row whose <c>Column</c> cell is tested.</param>
/// <returns>
/// False for a null or non-convertible cell; true when the bounds cannot both be parsed;
/// otherwise the result of the range check.
/// </returns>
public override bool FilteDataBase(IFreeDocument data)
{
    object raw = data[this.Column];
    if (raw == null)
    {
        return false;
    }

    bool ok = false;
    var number = (double)AttributeHelper.ConvertTo(raw, SimpleDataType.DOUBLE, ref ok);
    if (!ok)
    {
        return false;
    }

    double max = 1, min = 0;

    // Max is parsed first; Min is only looked at when Max parsed.
    if (!double.TryParse(data.Query(Max), out max))
    {
        return true;
    }

    if (!double.TryParse(data.Query(Min), out min))
    {
        return true;
    }

    return number >= min && number <= max;
}
/// <summary>
/// Applies <c>regex.Replace</c> to the cell text using the replacement resolved from <c>ReplaceText</c>.
/// </summary>
/// <param name="dict">Row whose <c>Column</c> cell is rewritten.</param>
/// <returns>Null when the cell is null; otherwise the replaced string.</returns>
public override object TransformData(IFreeDocument dict)
{
    object cell = dict[Column];

    // The replacement is resolved before the null check, as in the original statement order.
    var replacement = dict.Query(ReplaceText);

    return cell == null ? null : regex.Replace(cell.ToString(), replacement);
}
/// <summary>
/// Returns the text of a file, reading it once and serving later calls from the buffer.
/// </summary>
/// <param name="document">Optional document used to resolve <c>FileName</c> through <c>Query</c>.</param>
/// <returns>The buffered or freshly read file content.</returns>
public override object TransformData(IFreeDocument document)
{
    // Use the resolved name when there is one, otherwise the raw FileName setting.
    var path = document?.Query(FileName) ?? FileName;

    var text = buffHelper.Get(path);
    if (text == null)
    {
        text = File.ReadAllText(path, AttributeHelper.GetEncoding(EncodingType));
        buffHelper.Set(path, text);
    }

    return text;
}
/// <summary>
/// Requests the URL in the current cell and copies the response headers into the row.
/// </summary>
/// <param name="datas">Row whose <c>Column</c> cell holds the URL; receives one value per header.</param>
/// <returns>An empty list when the cell is null; otherwise null.</returns>
object TransformData(IFreeDocument datas)
{
    var cell = datas[Column];
    var post = datas.Query(PostData);
    if (cell == null)
    {
        return new List<FreeDocument>();
    }

    var url = cell.ToString();
    Crawler.SetCookie(Crawler.Http);

    // The response body and status code were read into locals but never used; only the
    // headers are needed here.
    var response = helper.GetHtml(Crawler.Http, url, post).Result;
    var responseHeader = response.ResponseHeaders;

    // Repeated spaces in HeaderFilter used to produce empty header names; drop them.
    var keys = string.IsNullOrEmpty(HeaderFilter)
        ? responseHeader.AllKeys
        : HeaderFilter.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

    foreach (var key in keys)
    {
        var value = responseHeader.Get(key);
        if (value != null)
        {
            datas.SetValue(key, value);
        }
    }

    // When "Location" was requested but the response did not supply it, fall back to the
    // requested URL.
    if (keys.Contains("Location") && datas.ContainsKey("Location") == false)
    {
        datas["Location"] = url;
    }

    return null;
}
/// <summary>
/// Extracts data from the cell's HTML (or nested document) using XPath.
/// </summary>
/// <param name="document">Row whose <c>Column</c> cell holds HTML text or a nested document.</param>
/// <returns>The value produced by <c>GetDataFromXPath</c> for the nested document or the parsed HTML.</returns>
public override object TransformData(IFreeDocument document)
{
    var item = document[Column];

    var nested = item as IFreeDocument;
    if (nested != null)
    {
        return nested.GetDataFromXPath(XPath);
    }

    var html = new HtmlDocument();
    html.LoadHtml(item.ToString());
    var root = html.DocumentNode;

    // With GetText the path comes from the document's own text node; otherwise it is the
    // resolved XPath setting combined with the selector format.
    return GetText
        ? root.GetDataFromXPath(root.GetTextNode(), CrawlType)
        : root.GetDataFromXPath(document.Query(XPath), CrawlType, SelectorFormat);
}
/// <summary>
/// Looks up the cell value with the Baidu place-search API and copies the parsed result into the row.
/// </summary>
/// <param name="datas">Row whose <c>Column</c> cell holds the search text; receives the result fields.</param>
/// <returns>Null when the cell is null; otherwise true, including when the lookup fails.</returns>
public override object TransformData(IFreeDocument datas)
{
    var item = datas[Column];
    if (item == null)
    {
        return null;
    }

    try
    {
        var cacheKey = item.ToString();
        var newlocation = buffHelper.Get(cacheKey);
        if (newlocation == null)
        {
            var r = datas.Query(Region);

            // The separator before "region" was the mis-encoded character '®' (from "&reg"),
            // so the region parameter never reached the API; restore "&region=".
            // NOTE(review): values are interpolated without URL escaping - confirm whether
            // HttpHelper encodes the address before sending.
            var apiUrl = $"http://api.map.baidu.com/place/v2/search?q={item}&region={r}&output={format}&ak={apikey}";

            // Request the API address with GET and parse the response.
            var result = HttpHelper.GetWebSourceHtml(apiUrl, "utf-8");
            dynamic info = serialier.DeserializeObject(result);
            newlocation = Parse(info);

            buffHelper.Set(cacheKey, newlocation);
        }

        newlocation.DictCopyTo(datas);
    }
    catch (Exception)
    {
        // Best effort: a failed lookup leaves the row without the extra fields.
    }

    return true;
}
/// <summary>
/// Extracts text, a node count or XPath data from the cell's HTML (or nested document).
/// </summary>
/// <param name="document">Row whose <c>Column</c> cell holds HTML text or a nested document.</param>
/// <returns>
/// The nested document's XPath result; or the text-node content when <c>GetText</c> is set
/// and a text node is found; or the number of matching nodes when <c>GetCount</c> is set;
/// otherwise the XPath result for the parsed HTML.
/// </returns>
public override object TransformData(IFreeDocument document)
{
    var item = document[Column];

    var nested = item as IFreeDocument;
    if (nested != null)
    {
        return nested.GetDataFromXPath(XPath);
    }

    var docu = new HtmlDocument();
    docu.LoadHtml(item.ToString());
    var root = docu.DocumentNode;

    if (GetText)
    {
        var textnode = root.SelectSingleNode(root.GetTextNode());
        if (textnode != null)
        {
            return textnode.GetNodeText();
        }
    }

    if (GetCount)
    {
        // SelectNodes yields null rather than an empty collection when nothing matches,
        // which used to surface as a NullReferenceException; report zero matches instead.
        // NOTE(review): this branch uses the raw XPath while the final branch uses
        // document.Query(XPath) - confirm whether that difference is intended.
        var matches = root.SelectNodes(XPath);
        return matches == null ? 0 : matches.Count;
    }

    return root.GetDataFromXPath(document.Query(XPath));
}
/// <summary>
/// Crawls the URL in the current cell with optional delay and retries, and feeds the page's
/// links to <c>generator</c> when one is attached.
/// </summary>
/// <param name="data">Row whose <c>Column</c> cell holds the URL.</param>
/// <returns>
/// The documents extracted by the crawler; an empty list when the cell is null or no
/// attempt assigns a result.
/// </returns>
private List<FreeDocument> GetDatas(IFreeDocument data)
{
    var cell = data[Column];
    if (cell == null)
    {
        return new List<FreeDocument>();
    }

    var url = cell.ToString();
    var post = data.Query(PostData);

    // POST requests to the same URL differ by body, so the body is part of the buffer key.
    var bufkey = url;
    if (crawler.Http.Method == MethodType.POST)
    {
        bufkey += post;
    }

    var htmldoc = buffHelper.Get(bufkey);
    var docs = new List<FreeDocument>();
    if (htmldoc == null)
    {
        // Only positive delays are honored: Thread.Sleep(-1) blocks indefinitely and
        // smaller values throw.
        int delaytime;
        if (int.TryParse(data.Query(DelayTime), out delaytime) && delaytime > 0)
        {
            Thread.Sleep(delaytime);
        }

        // int.TryParse writes 0 to its out argument on failure, which silently discarded
        // the intended default of one attempt.
        int maxcount;
        if (!int.TryParse(data.Query(MaxTryCount), out maxcount))
        {
            maxcount = 1;
        }

        for (var attempt = 0; attempt < maxcount; attempt++)
        {
            HttpStatusCode code;
            docs = crawler.CrawlData(url, out htmldoc, out code, post);
            if (HttpHelper.IsSuccess(code))
            {
                buffHelper.Set(bufkey, htmldoc);
                break;
            }

            Thread.Sleep(ErrorDelay);
        }
    }
    else
    {
        docs = crawler.CrawlData(htmldoc);
    }

    // htmldoc can still be null when no attempt ran or the crawler returned none.
    if (generator != null && htmldoc != null)
    {
        // SelectNodes returns null when the page has no href attributes at all.
        var linkNodes = htmldoc.DocumentNode.SelectNodes("//@href");
        if (linkNodes != null)
        {
            IEnumerable<string> hrefs = linkNodes.Select(d => d.Attributes["href"].Value).ToList();
            if (!string.IsNullOrEmpty(Prefix))
            {
                // Prefix is either a literal prefix or, in regex mode, matched via regex.
                if (IsRegex == false)
                {
                    hrefs = hrefs.Where(d => d.StartsWith(Prefix));
                }
                else
                {
                    hrefs = hrefs.Where(d => regex.IsMatch(d));
                }
            }

            foreach (var href in hrefs)
            {
                generator.InsertQueue(href);
            }
        }
    }

    return docs;
}
/// <summary>
/// Queries the Baidu direction API for a route between the cell value and <c>Dest</c> and
/// copies distance, duration and mode-specific fields into the row.
/// </summary>
/// <param name="datas">Row whose <c>Column</c> cell holds the origin; receives the route fields.</param>
/// <returns>Null when the cell is null; otherwise true, including when the lookup fails.</returns>
public override object TransformData(IFreeDocument datas)
{
    var item = datas[Column];
    if (item == null)
    {
        return null;
    }

    try
    {
        var source = item.ToString();
        var dest = datas.Query(Dest);
        var sourcecity = datas.Query(SourceCity);
        var destcity = datas.Query(DestCity);
        var mode = map[ModeSelector.SelectItem];

        // The buffer key covers every input that influences the answer.
        var key = $"{source},{dest},{sourcecity},{destcity},{mode}";
        var newlocation = buffHelper.Get(key);
        if (newlocation == null)
        {
            // Transit and walking take a single region; other modes take one per endpoint.
            var region = (mode == "transit" || mode == "walking")
                ? $"region={sourcecity}"
                : $"origin_region={sourcecity}&destination_region={destcity}";

            // Request the API address with GET and parse the response.
            var apiUrl = $"http://api.map.baidu.com/direction/v1?mode={mode}&origin={source}&destination={dest}&{region}&output={format}&ak={apikey}";
            var result = HttpHelper.GetWebSourceHtml(apiUrl, "utf-8");
            dynamic info = serialier.DeserializeObject(result);

            if (info["status"].ToInt32() == 0 && info["type"].ToInt32() == 2)
            {
                var first = info["result"];
                newlocation = new FreeDocument();
                if (mode == "transit")
                {
                    newlocation["distance"] = first["routes"]["scheme"]["distance"];
                    newlocation["duration"] = first["routes"]["scheme"]["duration"];
                    newlocation["price"] = first["routes"]["scheme"]["price"];
                }
                else if (mode == "walking")
                {
                    newlocation["distance"] = first["routes"][0]["distance"];
                    newlocation["duration"] = first["routes"][0]["duration"];
                }
                else
                {
                    // NOTE(review): "routes" is indexed with [0] for distance/duration but
                    // with a string key for toll - confirm against the API response shape.
                    newlocation["distance"] = first["routes"][0]["distance"];
                    newlocation["duration"] = first["routes"][0]["duration"];
                    newlocation["traffic_condition"] = first["traffic_condition"];
                    newlocation["toll"] = first["routes"]["toll"];
                }
            }

            // The result used to be stored under the origin text alone while lookups use
            // the composite key, so the buffer could never hit.
            buffHelper.Set(key, newlocation);
        }

        // newlocation stays null when the API reports an error or another result type.
        if (newlocation != null)
        {
            newlocation.DictCopyTo(datas);
        }
    }
    catch (Exception)
    {
        // Best effort: a failed lookup leaves the row without the route fields.
    }

    return true;
}
/// <summary>
/// Queries the Baidu direction API for a route between the cell value and <c>Dest</c> and
/// copies distance, duration and mode-specific fields into the row.
/// </summary>
/// <param name="datas">Row whose <c>Column</c> cell holds the origin; receives the route fields.</param>
/// <returns>Null when the cell is null; otherwise true, including when the lookup fails.</returns>
public override object TransformData(IFreeDocument datas)
{
    var item = datas[Column];
    if (item == null)
    {
        return null;
    }

    try
    {
        var source = item.ToString();
        var dest = datas.Query(Dest);
        var sourcecity = datas.Query(SourceCity);
        var destcity = datas.Query(DestCity);
        var mode = map[ModeSelector.SelectItem];

        // The buffer key covers every input that influences the answer.
        var key = $"{source},{dest},{sourcecity},{destcity},{mode}";
        var newlocation = buffHelper.Get(key);
        if (newlocation == null)
        {
            // Transit and walking take a single region; other modes take one per endpoint.
            string region;
            if (mode == "transit" || mode == "walking")
            {
                region = $"region={sourcecity}";
            }
            else
            {
                region = $"origin_region={sourcecity}&destination_region={destcity}";
            }

            // Request the API address with GET and parse the response.
            var apiUrl = $"http://api.map.baidu.com/direction/v1?mode={mode}&origin={source}&destination={dest}&{region}&output={format}&ak={apikey}";
            var result = HttpHelper.GetWebSourceHtml(apiUrl, "utf-8");
            dynamic info = serialier.DeserializeObject(result);

            if (info["status"].ToInt32() == 0 && info["type"].ToInt32() == 2)
            {
                var first = info["result"];
                newlocation = new FreeDocument();
                if (mode == "transit")
                {
                    newlocation["distance"] = first["routes"]["scheme"]["distance"];
                    newlocation["duration"] = first["routes"]["scheme"]["duration"];
                    newlocation["price"] = first["routes"]["scheme"]["price"];
                }
                else if (mode == "walking")
                {
                    newlocation["distance"] = first["routes"][0]["distance"];
                    newlocation["duration"] = first["routes"][0]["duration"];
                }
                else
                {
                    // NOTE(review): "routes" is indexed with [0] for distance/duration but
                    // with a string key for toll - confirm against the API response shape.
                    newlocation["distance"] = first["routes"][0]["distance"];
                    newlocation["duration"] = first["routes"][0]["duration"];
                    newlocation["traffic_condition"] = first["traffic_condition"];
                    newlocation["toll"] = first["routes"]["toll"];
                }
            }

            // The result used to be stored under the origin text alone while lookups use
            // the composite key, so the buffer could never hit.
            buffHelper.Set(key, newlocation);
        }

        // newlocation stays null when the API reports an error or another result type.
        if (newlocation != null)
        {
            newlocation.DictCopyTo(datas);
        }
    }
    catch (Exception)
    {
        // Best effort: a failed lookup leaves the row without the route fields.
    }

    return true;
}
/// <summary>
/// Crawls the URL in the current cell with optional delay and retries, and feeds the page's
/// links to <c>generator</c> when one is attached.
/// </summary>
/// <param name="data">Row whose <c>Column</c> cell holds the URL.</param>
/// <returns>
/// The documents extracted by the crawler; an empty list when the cell is null or no
/// attempt assigns a result.
/// </returns>
private List<FreeDocument> GetDatas(IFreeDocument data)
{
    var p = data[Column];
    if (p == null)
    {
        return new List<FreeDocument>();
    }

    var url = p.ToString();
    var bufkey = url;
    var post = data.Query(PostData);

    // POST requests to the same URL differ by body, so the body is part of the buffer key.
    if (crawler.Http.Method == MethodType.POST)
    {
        bufkey += post;
    }

    var htmldoc = buffHelper.Get(bufkey);
    var docs = new List<FreeDocument>();
    if (htmldoc == null)
    {
        // Only positive delays are honored: Thread.Sleep(-1) blocks indefinitely and
        // smaller values throw.
        int delaytime;
        if (int.TryParse(data.Query(DelayTime), out delaytime) && delaytime > 0)
        {
            Thread.Sleep(delaytime);
        }

        // int.TryParse writes 0 to its out argument on failure, which silently discarded
        // the intended default of one attempt.
        int maxcount;
        if (!int.TryParse(data.Query(MaxTryCount), out maxcount))
        {
            maxcount = 1;
        }

        var count = 0;
        while (count < maxcount)
        {
            HttpStatusCode code;
            docs = crawler.CrawlData(url, out htmldoc, out code, post);
            if (HttpHelper.IsSuccess(code))
            {
                buffHelper.Set(bufkey, htmldoc);
                break;
            }

            Thread.Sleep(ErrorDelay);
            count++;
        }
    }
    else
    {
        docs = crawler.CrawlData(htmldoc);
    }

    // Nothing to harvest without a generator or without a page (htmldoc can still be null
    // when no attempt ran or the crawler returned none).
    if (generator == null || htmldoc == null)
    {
        return docs;
    }

    // SelectNodes returns null when the page has no href attributes at all.
    var others = htmldoc.DocumentNode.SelectNodes("//@href");
    if (others == null)
    {
        return docs;
    }

    var allLinks = others.Select(d => d.Attributes["href"].Value).ToList();
    IEnumerable<string> wanted;
    if (string.IsNullOrEmpty(Prefix))
    {
        wanted = allLinks;
    }
    else if (IsRegex == false)
    {
        wanted = allLinks.Where(d => d.StartsWith(Prefix));
    }
    else
    {
        wanted = allLinks.Where(d => regex.IsMatch(d));
    }

    foreach (var href in wanted)
    {
        generator.InsertQueue(href);
    }

    return docs;
}
/// <summary>
/// Returns the value obtained by resolving <c>NewValue</c> against the row through <c>Query</c>.
/// </summary>
/// <param name="free">Row used to resolve the value.</param>
/// <returns>The result of <c>free.Query(NewValue)</c>.</returns>
public override object TransformData(IFreeDocument free) => free.Query(NewValue);
/// <summary>
/// Resolves <c>NewValue</c> against the row through <c>Query</c> and returns the outcome.
/// </summary>
/// <param name="free">Row used to resolve the value.</param>
/// <returns>The result of <c>free.Query(NewValue)</c>.</returns>
public override object TransformData(IFreeDocument free)
{
    var resolved = free.Query(NewValue);
    return resolved;
}