/// <summary>
/// Downloads the page at <paramref name="url"/> through <c>helper.GetHtml</c> and returns the decoded text.
/// </summary>
/// <remarks>
/// If a process named <c>ShareCookie</c> exists in <c>SysProcessManager.CurrentProcessCollections</c> and is a
/// <c>SmartCrawler</c>, its proxy settings are copied onto this instance's <c>Http</c>, and its "Cookie" header
/// is copied as well when the two <c>Http.Parameters</c> values differ and the cookie is not blank.
/// Every match of <c>extract</c> in <paramref name="url"/> whose first capture group is a key of the dictionary
/// parsed from <c>URL</c> is replaced by that key's value before the request is made.
/// </remarks>
/// <param name="url">Address to request; may contain placeholders recognised by <c>extract</c>.</param>
/// <param name="code">Status reported by <c>helper.GetHtml</c>, or <c>NoContent</c> when <c>SysProcessManager</c> is null.</param>
/// <param name="post">Forwarded unchanged to <c>helper.GetHtml</c> (presumably the POST body; confirm against the helper).</param>
/// <returns>
/// The output of <c>JavaScriptAnalyzer.Decode</c>, additionally passed through <c>Parse2XML</c> when
/// <c>IsSuperMode</c> is set; an empty string when <c>SysProcessManager</c> is null.
/// </returns>
public string GetHtml(string url, out HttpStatusCode code, string post = null)
{
    // Matching happens before the manager check, exactly as in the original ordering.
    var placeholders = extract.Matches(url);
    if (SysProcessManager == null)
    {
        code = HttpStatusCode.NoContent;
        return "";
    }

    var processes = SysProcessManager.CurrentProcessCollections.ToArray();
    var shared = processes.FirstOrDefault(p => p.Name == ShareCookie) as SmartCrawler;
    if (shared != null)
    {
        Http.ProxyIP = shared.Http.ProxyIP;
        Http.ProxyPassword = shared.Http.ProxyPassword;
        Http.ProxyUserName = shared.Http.ProxyUserName;
        Http.ProxyPort = shared.Http.ProxyPort;
        if (Http.Parameters != shared.Http.Parameters)
        {
            var sharedCookie = shared.Http.GetHeaderParameter().Get<string>("Cookie");
            if (!string.IsNullOrWhiteSpace(sharedCookie))
            {
                Http.SetValue("Cookie", sharedCookie);
            }
        }
    }

    // URL is only parsed when there is at least one placeholder to resolve.
    if (placeholders.Count > 0)
    {
        Dictionary<string, string> urlValues = XPathAnalyzer.ParseUrl(URL);
        if (urlValues != null)
        {
            foreach (Match placeholder in placeholders)
            {
                string replacement;
                if (urlValues.TryGetValue(placeholder.Groups[1].Value, out replacement))
                {
                    url = url.Replace(placeholder.Groups[0].Value, replacement);
                }
            }
        }
    }

    WebHeaderCollection responseHeaders;
    var raw = helper.GetHtml(Http, out responseHeaders, out code, url, post);
    var decoded = JavaScriptAnalyzer.Decode(raw);
    return IsSuperMode ? JavaScriptAnalyzer.Parse2XML(decoded) : decoded;
}
/// <summary>
/// Downloads the page at <paramref name="url"/> through <c>helper.GetHtml</c> and returns the result of
/// <c>JavaScriptAnalyzer.Json2XML</c> applied to it.
/// </summary>
/// <remarks>
/// Every match of <c>extract</c> in <paramref name="url"/> whose first capture group is a key of the dictionary
/// parsed from <c>URL</c> is replaced by that key's value before the request is made.
/// </remarks>
/// <param name="url">Address to request; may contain placeholders recognised by <c>extract</c>.</param>
/// <param name="code">Status reported by <c>helper.GetHtml</c>.</param>
/// <param name="post">Forwarded unchanged to <c>helper.GetHtml</c> (presumably the POST body; confirm against the helper).</param>
/// <returns>The text produced by <c>Json2XML</c>, which is called with <c>IsSuperMode</c> as its last argument.</returns>
public string GetHtml(string url, out HttpStatusCode code, string post = null)
{
    var placeholders = extract.Matches(url);

    // URL is only parsed when there is at least one placeholder to resolve.
    if (placeholders.Count > 0)
    {
        Dictionary<string, string> urlValues = XPathAnalyzer.ParseUrl(URL);
        if (urlValues != null)
        {
            foreach (Match placeholder in placeholders)
            {
                string replacement;
                if (urlValues.TryGetValue(placeholder.Groups[1].Value, out replacement))
                {
                    url = url.Replace(placeholder.Groups[0].Value, replacement);
                }
            }
        }
    }

    WebHeaderCollection responseHeaders;
    var raw = helper.GetHtml(Http, out responseHeaders, out code, url, post);

    // The "is JSON" flag reported by Json2XML is not used by this method.
    bool wasJson;
    return JavaScriptAnalyzer.Json2XML(raw, out wasJson, IsSuperMode);
}
/// <summary>
/// Downloads the page at <paramref name="url"/> through <c>helper.GetHtml</c> and returns it after
/// <c>formatCheck</c> has processed it.
/// </summary>
/// <remarks>
/// Every match of <c>extract</c> in <paramref name="url"/> whose first capture group is a key of the dictionary
/// parsed from <c>URL</c> is replaced by that key's value before the request is made.
/// When <c>Crawler</c> is non-empty and names a <c>SmartCrawler</c> in
/// <c>SysProcessManager.CurrentProcessCollections</c>, that crawler's "Cookie", "Host" and "Referer" header
/// values overwrite this instance's and <c>Http.Parameters</c> is rebuilt from the merged headers.
/// A missing <c>SysProcessManager</c> or a null header value is skipped instead of throwing.
/// </remarks>
/// <param name="url">Address to request; may contain placeholders recognised by <c>extract</c>.</param>
/// <param name="code">Status reported by <c>helper.GetHtml</c>.</param>
/// <param name="post">Forwarded unchanged to <c>helper.GetHtml</c> (presumably the POST body; confirm against the helper).</param>
/// <returns>The output of <c>formatCheck</c> for the downloaded text.</returns>
public string GetHtml(string url, out HttpStatusCode code, string post = null)
{
    var mc = extract.Matches(url);

    // URL is only parsed when there is at least one placeholder to resolve.
    if (mc.Count > 0)
    {
        Dictionary<string, string> paradict = XPathAnalyzer.ParseUrl(URL);
        if (paradict != null)
        {
            foreach (Match m in mc)
            {
                string replacement;
                if (paradict.TryGetValue(m.Groups[1].Value, out replacement))
                {
                    url = url.Replace(m.Groups[0].Value, replacement);
                }
            }
        }
    }

    if (!string.IsNullOrEmpty(Crawler))
    {
        // The null-conditional chain yields null when no process manager is attached,
        // so header sharing is skipped rather than failing with a NullReferenceException.
        var crawler = SysProcessManager?.CurrentProcessCollections.FirstOrDefault(d => d.Name == Crawler) as SmartCrawler;
        var header = crawler?.Http.GetHeaderParameter();
        if (header != null)
        {
            var myheader = Http.GetHeaderParameter();
            foreach (var name in new[] { "Cookie", "Host", "Referer" })
            {
                object value;
                // A key stored with a null value carries nothing to copy.
                if (header.TryGetValue(name, out value) && value != null)
                {
                    myheader[name] = value.ToString();
                }
            }
            Http.Parameters = HttpItem.HeaderToString(myheader);
        }
    }

    WebHeaderCollection headerCollection;
    var content = helper.GetHtml(Http, out headerCollection, out code, url, post);
    return formatCheck(content, isJson(headerCollection) || isJson(Http));
}
/// <summary>
/// Loads the content behind <paramref name="url"/>, either from a local file or through <c>helper.GetHtml</c>,
/// and returns it after <c>JavaScriptAnalyzer.Decode</c>.
/// </summary>
/// <remarks>
/// A value starting with a drive letter followed by <c>:\</c> (upper or lower case) is treated as a local file
/// and read with the encoding derived from <c>Http.Encoding</c>. Anything else is requested remotely after
/// <c>SetCookie(Http)</c> has run and placeholders matched by <c>extract</c> have been replaced with values
/// from the dictionary parsed from <c>URL</c>.
/// </remarks>
/// <param name="url">Drive-rooted file path or address to request.</param>
/// <param name="code">
/// <c>Accepted</c> when a local file was read, <c>NotFound</c> when the local file does not exist,
/// <c>NoContent</c> when <c>SysProcessManager</c> is null on the remote path, otherwise the response's code.
/// </param>
/// <param name="post">Forwarded unchanged to <c>helper.GetHtml</c> (presumably the POST body; confirm against the helper).</param>
/// <returns>
/// The decoded text, additionally passed through <c>Parse2XML</c> when <c>IsSuperMode</c> is set;
/// an empty string when <c>SysProcessManager</c> is null on the remote path.
/// </returns>
public string GetHtml(string url, out HttpStatusCode code, string post = null)
{
    string result = "";
    code = HttpStatusCode.NotFound;

    // Local file. Drive letters are case-insensitive on Windows, so accept "c:\" as well as "C:\".
    if (Regex.IsMatch(url, @"^[A-Za-z]:\\"))
    {
        if (File.Exists(url))
        {
            result = File.ReadAllText(url, AttributeHelper.GetEncoding(this.Http.Encoding));
            code = HttpStatusCode.Accepted;
        }
    }
    else
    {
        var mc = extract.Matches(url);
        if (SysProcessManager == null)
        {
            code = HttpStatusCode.NoContent;
            return "";
        }

        SetCookie(Http);

        // URL is only parsed when there is at least one placeholder to resolve.
        if (mc.Count > 0)
        {
            Dictionary<string, string> paramDict = XPathAnalyzer.ParseUrl(URL);
            if (paramDict != null)
            {
                foreach (Match m in mc)
                {
                    string replacement;
                    if (paramDict.TryGetValue(m.Groups[1].Value, out replacement))
                    {
                        url = url.Replace(m.Groups[0].Value, replacement);
                    }
                }
            }
        }

        // NOTE(review): blocking on .Result can deadlock under a synchronization context; kept because
        // this method's synchronous signature and its AggregateException behavior are part of the contract.
        HttpHelper.HttpResponse response = helper.GetHtml(Http, url, post).Result;
        result = response.Html;
        code = response.Code;
    }

    result = JavaScriptAnalyzer.Decode(result);
    if (IsSuperMode)
    {
        result = JavaScriptAnalyzer.Parse2XML(result);
    }
    return result;
}
/// <summary>
/// Downloads the page at <paramref name="url"/>, loads it into an <c>HtmlDocument</c> and returns the
/// documents extracted from it by <c>CrawlData(doc)</c>.
/// </summary>
/// <remarks>
/// Every match of <c>extract</c> in <paramref name="url"/> whose first capture group is a key of the dictionary
/// parsed from <c>URL</c> is replaced by that key's value before the request is made.
/// </remarks>
/// <param name="url">Address to request; may contain placeholders recognised by <c>extract</c>.</param>
/// <param name="doc">A new <c>HtmlDocument</c>; the downloaded text is loaded into it only when the request succeeded.</param>
/// <param name="code">Status reported by <c>helper.GetHtml</c>.</param>
/// <param name="post">Forwarded unchanged to <c>helper.GetHtml</c> (presumably the POST body; confirm against the helper).</param>
/// <returns>
/// The extracted documents, or an empty list when <c>HttpHelper.IsSuccess</c> rejects the status code.
/// </returns>
public List<FreeDocument> CrawlData(string url, out HtmlDocument doc, out HttpStatusCode code, string post = null)
{
    var placeholders = extract.Matches(url);

    // URL is only parsed when there is at least one placeholder to resolve.
    if (placeholders.Count > 0)
    {
        Dictionary<string, string> urlValues = XPathAnalyzer.ParseUrl(URL);
        if (urlValues != null)
        {
            foreach (Match placeholder in placeholders)
            {
                string replacement;
                if (urlValues.TryGetValue(placeholder.Groups[1].Value, out replacement))
                {
                    url = url.Replace(placeholder.Groups[0].Value, replacement);
                }
            }
        }
    }

    var html = helper.GetHtml(Http, out code, url, post);
    doc = new HtmlDocument();

    if (HttpHelper.IsSuccess(code))
    {
        doc.LoadHtml(html);
        var extracted = CrawlData(doc);
        if (extracted.Count == 0)
        {
            XLogSys.Print.DebugFormat("HTML extract Fail,url:{0}", url);
        }
        return extracted;
    }

    // Failed request: log it and hand back an empty result with the untouched document.
    XLogSys.Print.WarnFormat("HTML Fail,Code:{0},url:{1}", code, url);
    return new List<FreeDocument>();
}
/// <summary>
/// Downloads the page at <paramref name="url"/>, loads it into an <c>HtmlDocument</c> and returns the
/// documents extracted from it by <c>CrawlData(doc)</c>.
/// </summary>
/// <remarks>
/// Every match of <c>extract</c> in <paramref name="url"/> whose first capture group is a key of the dictionary
/// parsed from <c>URL</c> is replaced by that key's value before the request is made.
/// When <c>Crawler</c> is non-empty and names a <c>SmartCrawler</c> in
/// <c>SysProcessManager.CurrentProcessCollections</c>, that crawler's "Cookie", "Host" and "Referer" header
/// values overwrite this instance's and <c>Http.Parameters</c> is rebuilt from the merged headers.
/// A missing <c>SysProcessManager</c> or a null header value is skipped instead of throwing.
/// </remarks>
/// <param name="url">Address to request; may contain placeholders recognised by <c>extract</c>.</param>
/// <param name="doc">A new <c>HtmlDocument</c>; the downloaded text is loaded into it only when the request succeeded.</param>
/// <param name="code">Status reported by <c>helper.GetHtml</c>.</param>
/// <param name="post">Forwarded unchanged to <c>helper.GetHtml</c> (presumably the POST body; confirm against the helper).</param>
/// <returns>
/// The extracted documents, or an empty list when <c>HttpHelper.IsSuccess</c> rejects the status code.
/// </returns>
public List<FreeDocument> CrawlData(string url, out HtmlDocument doc, out HttpStatusCode code, string post = null)
{
    var mc = extract.Matches(url);

    // URL is only parsed when there is at least one placeholder to resolve.
    if (mc.Count > 0)
    {
        Dictionary<string, string> paradict = XPathAnalyzer.ParseUrl(URL);
        if (paradict != null)
        {
            foreach (Match m in mc)
            {
                string replacement;
                if (paradict.TryGetValue(m.Groups[1].Value, out replacement))
                {
                    url = url.Replace(m.Groups[0].Value, replacement);
                }
            }
        }
    }

    if (!string.IsNullOrEmpty(Crawler))
    {
        // The null-conditional chain yields null when no process manager is attached,
        // so header sharing is skipped rather than failing with a NullReferenceException.
        var crawler = SysProcessManager?.CurrentProcessCollections.FirstOrDefault(d => d.Name == Crawler) as SmartCrawler;
        var header = crawler?.Http.GetHeaderParameter();
        if (header != null)
        {
            var myheader = Http.GetHeaderParameter();
            foreach (var name in new[] { "Cookie", "Host", "Referer" })
            {
                object value;
                // A key stored with a null value carries nothing to copy.
                if (header.TryGetValue(name, out value) && value != null)
                {
                    myheader[name] = value.ToString();
                }
            }
            Http.Parameters = HttpItem.HeaderToString(myheader);
        }
    }

    var content = helper.GetHtml(Http, out code, url, post);
    doc = new HtmlDocument();
    if (!HttpHelper.IsSuccess(code))
    {
        XLogSys.Print.WarnFormat("HTML Fail,Code:{0},url:{1}", code, url);
        return new List<FreeDocument>();
    }

    doc.LoadHtml(content);
    var datas = CrawlData(doc);
    if (datas.Count == 0)
    {
        XLogSys.Print.DebugFormat("HTML extract Fail,url:{0}", url);
    }
    return datas;
}
/// <summary>
/// Loads the content behind <paramref name="url"/>, either from a local file or through <c>helper.GetHtml</c>,
/// and returns it after <c>JavaScriptAnalyzer.Decode</c>.
/// </summary>
/// <remarks>
/// A value starting with a drive letter followed by <c>:\</c> (upper or lower case) is treated as a local file
/// and read with the encoding derived from <c>Http.Encoding</c>. Anything else is requested remotely.
/// On the remote path, the <c>SmartCrawler</c> task selected by <c>ShareCookie.SelectItem</c> (if any) supplies
/// its <c>ProxyIP</c>, and its "Cookie" header when the two <c>Http.Parameters</c> values differ and the cookie
/// is not blank. Placeholders matched by <c>extract</c> are then replaced with values from the dictionary
/// parsed from <c>URL</c>.
/// </remarks>
/// <param name="url">Drive-rooted file path or address to request.</param>
/// <param name="code">
/// <c>Accepted</c> when a local file was read, <c>NotFound</c> when the local file does not exist,
/// <c>NoContent</c> when <c>SysProcessManager</c> is null on the remote path, otherwise the response's code.
/// </param>
/// <param name="post">Forwarded unchanged to <c>helper.GetHtml</c> (presumably the POST body; confirm against the helper).</param>
/// <returns>
/// The decoded text, additionally passed through <c>Parse2XML</c> when <c>IsSuperMode</c> is set;
/// an empty string when <c>SysProcessManager</c> is null on the remote path.
/// </returns>
public string GetHtml(string url, out HttpStatusCode code, string post = null)
{
    string result = "";
    code = HttpStatusCode.NotFound;

    // Local file. Drive letters are case-insensitive on Windows, so accept "c:\" as well as "C:\".
    if (Regex.IsMatch(url, @"^[A-Za-z]:\\"))
    {
        if (File.Exists(url))
        {
            result = File.ReadAllText(url, AttributeHelper.GetEncoding(this.Http.Encoding));
            code = HttpStatusCode.Accepted;
        }
    }
    else
    {
        var mc = extract.Matches(url);
        if (SysProcessManager == null)
        {
            code = HttpStatusCode.NoContent;
            return "";
        }

        var crawler = this.SysProcessManager.GetTask<SmartCrawler>(ShareCookie.SelectItem);
        if (crawler != null)
        {
            Http.ProxyIP = crawler.Http.ProxyIP;
            if (Http.Parameters != crawler.Http.Parameters)
            {
                var cookie = crawler.Http.GetHeaderParameter().Get<string>("Cookie");
                if (!string.IsNullOrWhiteSpace(cookie))
                {
                    Http.SetValue("Cookie", cookie);
                }
            }
        }

        // URL is only parsed when there is at least one placeholder to resolve.
        if (mc.Count > 0)
        {
            Dictionary<string, string> paradict = XPathAnalyzer.ParseUrl(URL);
            if (paradict != null)
            {
                foreach (Match m in mc)
                {
                    string replacement;
                    if (paradict.TryGetValue(m.Groups[1].Value, out replacement))
                    {
                        url = url.Replace(m.Groups[0].Value, replacement);
                    }
                }
            }
        }

        // NOTE(review): blocking on .Result can deadlock under a synchronization context; kept because
        // this method's synchronous signature and its AggregateException behavior are part of the contract.
        HttpHelper.HttpResponse response = helper.GetHtml(Http, url, post).Result;
        result = response.Html;
        code = response.Code;
    }

    result = JavaScriptAnalyzer.Decode(result);
    if (IsSuperMode)
    {
        result = JavaScriptAnalyzer.Parse2XML(result);
    }
    return result;
}