/// <summary>
/// Loads <paramref name="html"/> into a new <see cref="HtmlDocument"/> and runs
/// <c>CrawlData</c> over it.
/// </summary>
/// <param name="html">
/// Raw page text. When <c>IsSuperMode</c> is set it is first passed through
/// <c>JavaScriptAnalyzer.Parse2XML</c> before being loaded.
/// </param>
/// <param name="doc">Receives the document that was built from the (possibly converted) text.</param>
/// <returns>
/// Whatever <c>CrawlData</c> returned; an empty list when <c>CrawlData</c> threw.
/// Extraction failures are logged, not rethrown.
/// </returns>
public List<FreeDocument> CrawlHtmlData(string html, out HtmlDocument doc)
{
    // Conversion and parsing happen outside the try block, so their exceptions propagate.
    var markup = IsSuperMode ? JavaScriptAnalyzer.Parse2XML(html) : html;

    doc = new HtmlDocument();
    doc.LoadHtml(markup);

    // Start from an empty list so a throwing CrawlData still yields a usable result.
    var extracted = new List<FreeDocument>();
    try
    {
        extracted = CrawlData(doc);
        var nothingFound = extracted.Count == 0;
        if (nothingFound)
        {
            XLogSys.Print.InfoFormat("HTML抽取数据失败,url:{0}", url);
        }
    }
    catch (Exception ex)
    {
        XLogSys.Print.ErrorFormat("HTML抽取数据失败,url:{0}, 异常为{1}", url, ex.Message);
    }

    return extracted;
}
/// <summary>
/// Downloads the content at <paramref name="url"/> using this instance's <c>Http</c>
/// settings, after optionally borrowing proxy settings and the Cookie header from
/// another crawler.
/// </summary>
/// <param name="url">
/// Address to request. Every match of the <c>extract</c> pattern in it is replaced with
/// the value stored under the match's first capture group in the dictionary that
/// <c>XPathAnalyzer.ParseUrl(URL)</c> produces, when such a key exists.
/// </param>
/// <param name="code">
/// Receives the status reported by <c>helper.GetHtml</c>, or
/// <see cref="HttpStatusCode.NoContent"/> when <c>SysProcessManager</c> is null.
/// </param>
/// <param name="post">Optional payload forwarded unchanged to <c>helper.GetHtml</c>.</param>
/// <returns>
/// The content after <c>JavaScriptAnalyzer.Decode</c> (and <c>Parse2XML</c> when
/// <c>IsSuperMode</c> is set); an empty string when <c>SysProcessManager</c> is null.
/// </returns>
public string GetHtml(string url, out HttpStatusCode code, string post = null)
{
    var placeholders = extract.Matches(url);

    if (SysProcessManager == null)
    {
        code = HttpStatusCode.NoContent;
        return "";
    }

    // Look up the crawler whose name equals ShareCookie among the current processes.
    var processes = SysProcessManager.CurrentProcessCollections.ToArray();
    var donor = processes.FirstOrDefault(d => d.Name == ShareCookie) as SmartCrawler;
    if (donor != null)
    {
        Http.ProxyIP = donor.Http.ProxyIP;
        Http.ProxyPassword = donor.Http.ProxyPassword;
        Http.ProxyUserName = donor.Http.ProxyUserName;
        Http.ProxyPort = donor.Http.ProxyPort;

        // Only copy the cookie when the two crawlers do not already share the same parameters.
        if (Http.Parameters != donor.Http.Parameters)
        {
            var cookie = donor.Http.GetHeaderParameter().Get<string>("Cookie");
            if (!string.IsNullOrWhiteSpace(cookie))
            {
                Http.SetValue("Cookie", cookie);
            }
        }
    }

    // The dictionary is parsed lazily: only when at least one placeholder exists.
    Dictionary<string, string> urlValues = null;
    foreach (Match placeholder in placeholders)
    {
        urlValues = urlValues ?? XPathAnalyzer.ParseUrl(URL);
        if (urlValues == null)
        {
            break;
        }

        string replacement;
        if (urlValues.TryGetValue(placeholder.Groups[1].Value, out replacement))
        {
            url = url.Replace(placeholder.Groups[0].Value, replacement);
        }
    }

    WebHeaderCollection responseHeaders;
    var raw = helper.GetHtml(Http, out responseHeaders, out code, url, post);
    var decoded = JavaScriptAnalyzer.Decode(raw);
    return IsSuperMode ? JavaScriptAnalyzer.Parse2XML(decoded) : decoded;
}
/// <summary>
/// Downloads the content at <paramref name="url"/> using this instance's <c>Http</c> settings.
/// </summary>
/// <param name="url">
/// Address to request. Every match of the <c>extract</c> pattern in it is replaced with
/// the value stored under the match's first capture group in the dictionary that
/// <c>XPathAnalyzer.ParseUrl(URL)</c> produces, when such a key exists.
/// </param>
/// <param name="code">Receives the status reported by <c>helper.GetHtml</c>.</param>
/// <param name="post">Optional payload forwarded unchanged to <c>helper.GetHtml</c>.</param>
/// <returns>
/// The content after <c>JavaScriptAnalyzer.Decode</c>, additionally passed through
/// <c>JavaScriptAnalyzer.Parse2XML</c> when <c>IsSuperMode</c> is set.
/// </returns>
public string GetHtml(string url, out HttpStatusCode code, string post = null)
{
    // The dictionary is parsed lazily: only when at least one placeholder exists.
    Dictionary<string, string> urlValues = null;
    foreach (Match placeholder in extract.Matches(url))
    {
        urlValues = urlValues ?? XPathAnalyzer.ParseUrl(URL);
        if (urlValues == null)
        {
            break;
        }

        string replacement;
        if (urlValues.TryGetValue(placeholder.Groups[1].Value, out replacement))
        {
            url = url.Replace(placeholder.Groups[0].Value, replacement);
        }
    }

    WebHeaderCollection responseHeaders;
    var raw = helper.GetHtml(Http, out responseHeaders, out code, url, post);
    var decoded = JavaScriptAnalyzer.Decode(raw);
    return IsSuperMode ? JavaScriptAnalyzer.Parse2XML(decoded) : decoded;
}
/// <summary>
/// Returns the content behind <paramref name="url"/>, which may be either a local
/// Windows file path (drive letter followed by <c>:\</c>) or an address fetched through
/// <c>helper.GetHtml</c>.
/// </summary>
/// <param name="url">
/// Local path or remote address. For remote addresses, every match of the
/// <c>extract</c> pattern is replaced with the value stored under the match's first
/// capture group in the dictionary that <c>XPathAnalyzer.ParseUrl(URL)</c> produces,
/// when such a key exists.
/// </param>
/// <param name="code">
/// Receives <see cref="HttpStatusCode.Accepted"/> when a local file was read,
/// <see cref="HttpStatusCode.NotFound"/> when the local file does not exist,
/// <see cref="HttpStatusCode.NoContent"/> when <c>SysProcessManager</c> is null on the
/// remote path, otherwise the code carried by the helper's response.
/// </param>
/// <param name="post">Optional payload forwarded unchanged to <c>helper.GetHtml</c>.</param>
/// <returns>
/// The content after <c>JavaScriptAnalyzer.Decode</c> (and <c>Parse2XML</c> when
/// <c>IsSuperMode</c> is set); an empty string when <c>SysProcessManager</c> is null
/// on the remote path.
/// </returns>
public string GetHtml(string url, out HttpStatusCode code, string post = null)
{
    string result = "";
    code = HttpStatusCode.NotFound;

    // Local file: drive letters are case-insensitive on Windows, so accept "c:\" as well as "C:\".
    if (Regex.IsMatch(url, @"^[A-Za-z]:\\"))
    {
        if (File.Exists(url))
        {
            result = File.ReadAllText(url, AttributeHelper.GetEncoding(this.Http.Encoding));
            code = HttpStatusCode.Accepted;
        }
    }
    else
    {
        var mc = extract.Matches(url);
        if (SysProcessManager == null)
        {
            code = HttpStatusCode.NoContent;
            return "";
        }

        SetCookie(Http);

        // The dictionary is parsed lazily: only when at least one placeholder exists.
        Dictionary<string, string> paramDict = null;
        foreach (Match m in mc)
        {
            if (paramDict == null)
            {
                paramDict = XPathAnalyzer.ParseUrl(URL);
            }
            if (paramDict == null)
            {
                break;
            }

            string value;
            if (paramDict.TryGetValue(m.Groups[1].Value, out value))
            {
                url = url.Replace(m.Groups[0].Value, value);
            }
        }

        // NOTE(review): this blocks on the helper's task because the out parameter
        // prevents making this method async; confirm callers never run on a
        // synchronization context where blocking could deadlock.
        HttpHelper.HttpResponse response = helper.GetHtml(Http, url, post).Result;
        result = response.Html;
        code = response.Code;
    }

    result = JavaScriptAnalyzer.Decode(result);
    if (IsSuperMode)
    {
        result = JavaScriptAnalyzer.Parse2XML(result);
    }
    return result;
}
/// <summary>
/// Returns the content behind <paramref name="url"/>, which may be either a local
/// Windows file path (drive letter followed by <c>:\</c>) or an address fetched through
/// <c>helper.GetHtml</c>. On the remote path the proxy IP and Cookie header may first
/// be borrowed from the crawler selected by <c>ShareCookie</c>.
/// </summary>
/// <param name="url">
/// Local path or remote address. For remote addresses, every match of the
/// <c>extract</c> pattern is replaced with the value stored under the match's first
/// capture group in the dictionary that <c>XPathAnalyzer.ParseUrl(URL)</c> produces,
/// when such a key exists.
/// </param>
/// <param name="code">
/// Receives <see cref="HttpStatusCode.Accepted"/> when a local file was read,
/// <see cref="HttpStatusCode.NotFound"/> when the local file does not exist,
/// <see cref="HttpStatusCode.NoContent"/> when <c>SysProcessManager</c> is null on the
/// remote path, otherwise the code carried by the helper's response.
/// </param>
/// <param name="post">Optional payload forwarded unchanged to <c>helper.GetHtml</c>.</param>
/// <returns>
/// The content after <c>JavaScriptAnalyzer.Decode</c> (and <c>Parse2XML</c> when
/// <c>IsSuperMode</c> is set); an empty string when <c>SysProcessManager</c> is null
/// on the remote path.
/// </returns>
public string GetHtml(string url, out HttpStatusCode code, string post = null)
{
    string result = "";
    code = HttpStatusCode.NotFound;

    // Local file: drive letters are case-insensitive on Windows, so accept "c:\" as well as "C:\".
    if (Regex.IsMatch(url, @"^[A-Za-z]:\\"))
    {
        if (File.Exists(url))
        {
            result = File.ReadAllText(url, AttributeHelper.GetEncoding(this.Http.Encoding));
            code = HttpStatusCode.Accepted;
        }
    }
    else
    {
        var mc = extract.Matches(url);
        if (SysProcessManager == null)
        {
            code = HttpStatusCode.NoContent;
            return "";
        }

        var crawler = this.SysProcessManager.GetTask<SmartCrawler>(ShareCookie.SelectItem);
        if (crawler != null)
        {
            Http.ProxyIP = crawler.Http.ProxyIP;

            // Only copy the cookie when the two crawlers do not already share the same parameters.
            if (Http.Parameters != crawler.Http.Parameters)
            {
                var cookie = crawler.Http.GetHeaderParameter().Get<string>("Cookie");
                if (!string.IsNullOrWhiteSpace(cookie))
                {
                    Http.SetValue("Cookie", cookie);
                }
            }
        }

        // The dictionary is parsed lazily: only when at least one placeholder exists.
        Dictionary<string, string> paradict = null;
        foreach (Match m in mc)
        {
            if (paradict == null)
            {
                paradict = XPathAnalyzer.ParseUrl(URL);
            }
            if (paradict == null)
            {
                break;
            }

            string value;
            if (paradict.TryGetValue(m.Groups[1].Value, out value))
            {
                url = url.Replace(m.Groups[0].Value, value);
            }
        }

        // NOTE(review): this blocks on the helper's task because the out parameter
        // prevents making this method async; confirm callers never run on a
        // synchronization context where blocking could deadlock.
        HttpHelper.HttpResponse response = helper.GetHtml(Http, url, post).Result;
        result = response.Html;
        code = response.Code;
    }

    result = JavaScriptAnalyzer.Decode(result);
    if (IsSuperMode)
    {
        result = JavaScriptAnalyzer.Parse2XML(result);
    }
    return result;
}