Esempio n. 1
0
        /// <summary>
        /// Requests <paramref name="url"/> through the HTTP helper and post-processes
        /// the returned content with <c>JavaScriptAnalyzer</c>.
        /// </summary>
        /// <remarks>
        /// Steps, in order:
        /// 1. If a <c>SmartCrawler</c> whose name equals <c>ShareCookie</c> is registered
        ///    in the process manager, its proxy settings are copied onto <c>Http</c>;
        ///    when the two parameter strings differ, its non-blank "Cookie" header is
        ///    copied as well.
        /// 2. Every match of <c>extract</c> in the url is replaced by the value stored
        ///    under the match's first capture group in the dictionary returned by
        ///    <c>XPathAnalyzer.ParseUrl(URL)</c>; unknown keys are left untouched.
        /// 3. The content is decoded and, when <c>IsSuperMode</c> is set, passed through
        ///    <c>JavaScriptAnalyzer.Parse2XML</c>.
        /// </remarks>
        /// <param name="url">Address to request; may contain tokens matched by <c>extract</c>.</param>
        /// <param name="code">Status reported by the HTTP helper, or
        /// <c>HttpStatusCode.NoContent</c> when <c>SysProcessManager</c> is null.</param>
        /// <param name="post">Optional payload forwarded unchanged to the HTTP helper.</param>
        /// <returns>The processed content, or an empty string when <c>SysProcessManager</c> is null.</returns>
        public string GetHtml(string url, out HttpStatusCode code,
                              string post = null)
        {
            // Matched against the url as passed in; later replacements do not affect this collection.
            var matches = extract.Matches(url);

            if (SysProcessManager == null)
            {
                code = HttpStatusCode.NoContent;
                return "";
            }

            // Work on a copy of the process list while searching for the sharing crawler.
            var snapshot = SysProcessManager.CurrentProcessCollections.ToArray();
            var candidate = snapshot.FirstOrDefault(d => d.Name == ShareCookie);
            var shared = candidate as SmartCrawler;

            if (shared != null)
            {
                var source = shared.Http;

                Http.ProxyIP = source.ProxyIP;
                Http.ProxyPassword = source.ProxyPassword;
                Http.ProxyUserName = source.ProxyUserName;
                Http.ProxyPort = source.ProxyPort;

                if (Http.Parameters != source.Parameters)
                {
                    var cookie = source.GetHeaderParameter().Get<string>("Cookie");
                    if (!string.IsNullOrWhiteSpace(cookie))
                    {
                        Http.SetValue("Cookie", cookie);
                    }
                }
            }

            // The template URL is parsed only when there is at least one token to substitute.
            if (matches.Count > 0)
            {
                Dictionary<string, string> values = XPathAnalyzer.ParseUrl(URL);
                if (values != null)
                {
                    foreach (Match match in matches)
                    {
                        string replacement;
                        if (values.TryGetValue(match.Groups[1].Value, out replacement))
                        {
                            url = url.Replace(match.Groups[0].Value, replacement);
                        }
                    }
                }
            }

            WebHeaderCollection responseHeaders;
            var html = helper.GetHtml(Http, out responseHeaders, out code, url, post);

            html = JavaScriptAnalyzer.Decode(html);
            if (!IsSuperMode)
            {
                return html;
            }

            return JavaScriptAnalyzer.Parse2XML(html);
        }
Esempio n. 2
0
        /// <summary>
        /// Requests <paramref name="url"/> through the HTTP helper and returns the
        /// result of <c>JavaScriptAnalyzer.Json2XML</c> applied to the response.
        /// </summary>
        /// <remarks>
        /// Before the request, every match of <c>extract</c> in the url is replaced by
        /// the value stored under the match's first capture group in the dictionary
        /// returned by <c>XPathAnalyzer.ParseUrl(URL)</c>. Matches whose key is not in
        /// the dictionary are left as they are.
        /// </remarks>
        /// <param name="url">Address to request; may contain tokens matched by <c>extract</c>.</param>
        /// <param name="code">Status reported by the HTTP helper.</param>
        /// <param name="post">Optional payload forwarded unchanged to the HTTP helper.</param>
        /// <returns>The value returned by <c>Json2XML</c>.</returns>
        public string GetHtml(string url, out HttpStatusCode code,
                              string post = null)
        {
            var tokens = extract.Matches(url);

            // The template URL is parsed only when there is at least one token to substitute.
            if (tokens.Count > 0)
            {
                Dictionary<string, string> values = XPathAnalyzer.ParseUrl(URL);
                if (values != null)
                {
                    foreach (Match token in tokens)
                    {
                        string replacement;
                        if (values.TryGetValue(token.Groups[1].Value, out replacement))
                        {
                            url = url.Replace(token.Groups[0].Value, replacement);
                        }
                    }
                }
            }

            WebHeaderCollection responseHeaders;
            var raw = helper.GetHtml(Http, out responseHeaders, out code, url, post);

            // The out flag is required by the Json2XML signature; its value is not used here.
            bool wasJson;
            return JavaScriptAnalyzer.Json2XML(raw, out wasJson, IsSuperMode);
        }
Esempio n. 3
0
        /// <summary>
        /// Requests <paramref name="url"/> through the HTTP helper, optionally borrowing
        /// request headers from another crawler, and returns the content after
        /// <c>formatCheck</c>.
        /// </summary>
        /// <remarks>
        /// Every match of <c>extract</c> in the url is replaced by the value stored under
        /// the match's first capture group in the dictionary returned by
        /// <c>XPathAnalyzer.ParseUrl(URL)</c>. When <c>Crawler</c> is non-empty and a
        /// <c>SmartCrawler</c> with that name is registered, its "Cookie", "Host" and
        /// "Referer" headers (when present and non-null) overwrite the ones on
        /// <c>Http</c>. A missing process manager simply means no headers are shared.
        /// </remarks>
        /// <param name="url">Address to request; may contain tokens matched by <c>extract</c>.</param>
        /// <param name="code">Status reported by the HTTP helper.</param>
        /// <param name="post">Optional payload forwarded unchanged to the HTTP helper.</param>
        /// <returns>The value returned by <c>formatCheck</c> for the downloaded content.</returns>
        public string GetHtml(string url, out HttpStatusCode code,
                              string post = null)
        {
            var mc = extract.Matches(url);

            // The template URL is parsed only when there is at least one token to substitute.
            if (mc.Count > 0)
            {
                Dictionary<string, string> paradict = XPathAnalyzer.ParseUrl(URL);
                if (paradict != null)
                {
                    foreach (Match m in mc)
                    {
                        string replacement;
                        if (paradict.TryGetValue(m.Groups[1].Value, out replacement))
                        {
                            url = url.Replace(m.Groups[0].Value, replacement);
                        }
                    }
                }
            }

            if (!string.IsNullOrEmpty(Crawler))
            {
                // Null-conditional access: without a process manager there is no crawler
                // to borrow headers from, so the lookup yields null instead of throwing.
                var crawler =
                    SysProcessManager?.CurrentProcessCollections.FirstOrDefault(d => d.Name == Crawler) as SmartCrawler;
                var header = crawler?.Http.GetHeaderParameter();

                if (header != null)
                {
                    var myheader = Http.GetHeaderParameter();
                    var sharedNames = new[] { "Cookie", "Host", "Referer" };

                    foreach (var name in sharedNames)
                    {
                        object value;

                        // Skip null values rather than failing on value.ToString().
                        if (header.TryGetValue(name, out value) && value != null)
                        {
                            myheader[name] = value.ToString();
                        }
                    }
                    Http.Parameters = HttpItem.HeaderToString(myheader);
                }
            }

            WebHeaderCollection headerCollection;
            var content = helper.GetHtml(Http, out headerCollection, out code, url, post);

            // The second argument is true when isJson holds for the response headers or for Http.
            return formatCheck(content, isJson(headerCollection) || isJson(Http));
        }
Esempio n. 4
0
        /// <summary>
        /// Loads content either from a local file or over HTTP and post-processes it
        /// with <c>JavaScriptAnalyzer</c>.
        /// </summary>
        /// <remarks>
        /// A <paramref name="url"/> that starts with a drive specifier such as
        /// <c>C:\</c> (either letter case) is read from disk using the encoding
        /// configured on <c>Http</c>. Any other value is requested through the HTTP
        /// helper after <c>SetCookie(Http)</c> has run and after every match of
        /// <c>extract</c> has been replaced by the value stored under the match's first
        /// capture group in the dictionary returned by <c>XPathAnalyzer.ParseUrl(URL)</c>.
        /// </remarks>
        /// <param name="url">Local file path or address to request.</param>
        /// <param name="code">
        /// <c>Accepted</c> when a local file was read, <c>NotFound</c> when the local
        /// file does not exist, <c>NoContent</c> when <c>SysProcessManager</c> is null,
        /// otherwise the status reported by the HTTP helper.
        /// </param>
        /// <param name="post">Optional payload forwarded unchanged to the HTTP helper.</param>
        /// <returns>The processed content, or an empty string when <c>SysProcessManager</c> is null.</returns>
        public string GetHtml(string url, out HttpStatusCode code,
                              string post = null)
        {
            string result = "";

            code = HttpStatusCode.NotFound;

            // Local file: accept lowercase drive letters too, since "c:\..." is as valid as "C:\...".
            if (Regex.IsMatch(url, @"^[A-Za-z]:\\"))
            {
                if (File.Exists(url))
                {
                    result = File.ReadAllText(url, AttributeHelper.GetEncoding(this.Http.Encoding));
                    code = HttpStatusCode.Accepted;
                }
            }
            else
            {
                if (SysProcessManager == null)
                {
                    code = HttpStatusCode.NoContent;
                    return "";
                }
                SetCookie(Http);

                // Token matching is done only once the request is known to go ahead.
                var mc = extract.Matches(url);
                if (mc.Count > 0)
                {
                    Dictionary<string, string> paramDict = XPathAnalyzer.ParseUrl(URL);
                    if (paramDict != null)
                    {
                        foreach (Match m in mc)
                        {
                            string replacement;
                            if (paramDict.TryGetValue(m.Groups[1].Value, out replacement))
                            {
                                url = url.Replace(m.Groups[0].Value, replacement);
                            }
                        }
                    }
                }

                // Blocks until the helper's task completes; the out parameter in this
                // method's signature rules out making the method itself async.
                HttpHelper.HttpResponse response = helper.GetHtml(Http, url, post).Result;
                result = response.Html;
                code = response.Code;
            }

            result = JavaScriptAnalyzer.Decode(result);
            if (IsSuperMode)
            {
                result = JavaScriptAnalyzer.Parse2XML(result);
            }

            return result;
        }
Esempio n. 5
0
        /// <summary>
        /// Downloads <paramref name="url"/>, loads the response into an
        /// <c>HtmlDocument</c> and returns the result of <c>CrawlData(doc)</c>.
        /// </summary>
        /// <remarks>
        /// Before the request, every match of <c>extract</c> in the url is replaced by
        /// the value stored under the match's first capture group in the dictionary
        /// returned by <c>XPathAnalyzer.ParseUrl(URL)</c>. A warning is logged when the
        /// status is not successful according to <c>HttpHelper.IsSuccess</c>; a debug
        /// entry is logged when the extraction yields no items.
        /// </remarks>
        /// <param name="url">Address to request; may contain tokens matched by <c>extract</c>.</param>
        /// <param name="doc">Always assigned a new document; the content is loaded into it only on success.</param>
        /// <param name="code">Status reported by the HTTP helper.</param>
        /// <param name="post">Optional payload forwarded unchanged to the HTTP helper.</param>
        /// <returns>The extracted items, or an empty list when the request was not successful.</returns>
        public List<FreeDocument> CrawlData(string url, out HtmlDocument doc, out HttpStatusCode code, string post = null)
        {
            var tokens = extract.Matches(url);

            // The template URL is parsed only when there is at least one token to substitute.
            if (tokens.Count > 0)
            {
                Dictionary<string, string> values = XPathAnalyzer.ParseUrl(URL);
                if (values != null)
                {
                    foreach (Match token in tokens)
                    {
                        string replacement;
                        if (values.TryGetValue(token.Groups[1].Value, out replacement))
                        {
                            url = url.Replace(token.Groups[0].Value, replacement);
                        }
                    }
                }
            }

            var html = helper.GetHtml(Http, out code, url, post);
            doc = new HtmlDocument();

            if (HttpHelper.IsSuccess(code))
            {
                doc.LoadHtml(html);
                var items = CrawlData(doc);

                if (items.Count == 0)
                {
                    XLogSys.Print.DebugFormat("HTML extract Fail,url:{0}", url);
                }
                return items;
            }

            XLogSys.Print.WarnFormat("HTML Fail,Code:{0},url:{1}", code, url);
            return new List<FreeDocument>();
        }
Esempio n. 6
0
        /// <summary>
        /// Downloads <paramref name="url"/>, optionally borrowing request headers from
        /// another crawler, loads the response into an <c>HtmlDocument</c> and returns
        /// the result of <c>CrawlData(doc)</c>.
        /// </summary>
        /// <remarks>
        /// Every match of <c>extract</c> in the url is replaced by the value stored under
        /// the match's first capture group in the dictionary returned by
        /// <c>XPathAnalyzer.ParseUrl(URL)</c>. When <c>Crawler</c> is non-empty and a
        /// <c>SmartCrawler</c> with that name is registered, its "Cookie", "Host" and
        /// "Referer" headers (when present and non-null) overwrite the ones on
        /// <c>Http</c>. A missing process manager simply means no headers are shared.
        /// </remarks>
        /// <param name="url">Address to request; may contain tokens matched by <c>extract</c>.</param>
        /// <param name="doc">Always assigned a new document; the content is loaded into it only on success.</param>
        /// <param name="code">Status reported by the HTTP helper.</param>
        /// <param name="post">Optional payload forwarded unchanged to the HTTP helper.</param>
        /// <returns>The extracted items, or an empty list when the request was not successful.</returns>
        public List<FreeDocument> CrawlData(string url, out HtmlDocument doc, out HttpStatusCode code,
            string post = null)
        {
            var mc = extract.Matches(url);

            // The template URL is parsed only when there is at least one token to substitute.
            if (mc.Count > 0)
            {
                Dictionary<string, string> paradict = XPathAnalyzer.ParseUrl(URL);
                if (paradict != null)
                {
                    foreach (Match m in mc)
                    {
                        string replacement;
                        if (paradict.TryGetValue(m.Groups[1].Value, out replacement))
                        {
                            url = url.Replace(m.Groups[0].Value, replacement);
                        }
                    }
                }
            }

            if (!string.IsNullOrEmpty(Crawler))
            {
                // Null-conditional access: without a process manager there is no crawler
                // to borrow headers from, so the lookup yields null instead of throwing.
                var crawler =
                    SysProcessManager?.CurrentProcessCollections.FirstOrDefault(d => d.Name == Crawler) as SmartCrawler;
                var header = crawler?.Http.GetHeaderParameter();

                if (header != null)
                {
                    var myheader = Http.GetHeaderParameter();
                    var sharedNames = new[] { "Cookie", "Host", "Referer" };

                    foreach (var name in sharedNames)
                    {
                        object value;

                        // Skip null values rather than failing on value.ToString().
                        if (header.TryGetValue(name, out value) && value != null)
                        {
                            myheader[name] = value.ToString();
                        }
                    }
                    Http.Parameters = HttpItem.HeaderToString(myheader);
                }
            }

            var content = helper.GetHtml(Http, out code, url, post);
            doc = new HtmlDocument();
            if (!HttpHelper.IsSuccess(code))
            {
                XLogSys.Print.WarnFormat("HTML Fail,Code:{0},url:{1}", code, url);
                return new List<FreeDocument>();
            }

            doc.LoadHtml(content);
            var datas = CrawlData(doc);
            if (datas.Count == 0)
            {
                XLogSys.Print.DebugFormat("HTML extract Fail,url:{0}", url);
            }

            return datas;
        }
Esempio n. 7
0
        /// <summary>
        /// Loads content either from a local file or over HTTP and post-processes it
        /// with <c>JavaScriptAnalyzer</c>.
        /// </summary>
        /// <remarks>
        /// A <paramref name="url"/> that starts with a drive specifier such as
        /// <c>C:\</c> (either letter case) is read from disk using the encoding
        /// configured on <c>Http</c>. Any other value is requested through the HTTP
        /// helper. Before that request, the <c>SmartCrawler</c> task selected by
        /// <c>ShareCookie.SelectItem</c> (if any) donates its proxy IP and, when the two
        /// parameter strings differ, its non-blank "Cookie" header. Every match of
        /// <c>extract</c> in the url is then replaced by the value stored under the
        /// match's first capture group in the dictionary returned by
        /// <c>XPathAnalyzer.ParseUrl(URL)</c>.
        /// </remarks>
        /// <param name="url">Local file path or address to request.</param>
        /// <param name="code">
        /// <c>Accepted</c> when a local file was read, <c>NotFound</c> when the local
        /// file does not exist, <c>NoContent</c> when <c>SysProcessManager</c> is null,
        /// otherwise the status reported by the HTTP helper.
        /// </param>
        /// <param name="post">Optional payload forwarded unchanged to the HTTP helper.</param>
        /// <returns>The processed content, or an empty string when <c>SysProcessManager</c> is null.</returns>
        public string GetHtml(string url, out HttpStatusCode code,
                              string post = null)
        {
            string result = "";

            code = HttpStatusCode.NotFound;

            // Local file: accept lowercase drive letters too, since "c:\..." is as valid as "C:\...".
            if (Regex.IsMatch(url, @"^[A-Za-z]:\\"))
            {
                if (File.Exists(url))
                {
                    result = File.ReadAllText(url, AttributeHelper.GetEncoding(this.Http.Encoding));
                    code = HttpStatusCode.Accepted;
                }
            }
            else
            {
                if (SysProcessManager == null)
                {
                    code = HttpStatusCode.NoContent;
                    return "";
                }

                var crawler = this.SysProcessManager.GetTask<SmartCrawler>(ShareCookie.SelectItem);
                if (crawler != null)
                {
                    Http.ProxyIP = crawler.Http.ProxyIP;
                    if (Http.Parameters != crawler.Http.Parameters)
                    {
                        var cookie = crawler.Http.GetHeaderParameter().Get<string>("Cookie");
                        if (!string.IsNullOrWhiteSpace(cookie))
                        {
                            Http.SetValue("Cookie", cookie);
                        }
                    }
                }

                // Token matching is done only once the request is known to go ahead.
                var mc = extract.Matches(url);
                if (mc.Count > 0)
                {
                    Dictionary<string, string> paradict = XPathAnalyzer.ParseUrl(URL);
                    if (paradict != null)
                    {
                        foreach (Match m in mc)
                        {
                            string replacement;
                            if (paradict.TryGetValue(m.Groups[1].Value, out replacement))
                            {
                                url = url.Replace(m.Groups[0].Value, replacement);
                            }
                        }
                    }
                }

                // Blocks until the helper's task completes; the out parameter in this
                // method's signature rules out making the method itself async.
                HttpHelper.HttpResponse response = helper.GetHtml(Http, url, post).Result;
                result = response.Html;
                code = response.Code;
            }

            result = JavaScriptAnalyzer.Decode(result);
            if (IsSuperMode)
            {
                result = JavaScriptAnalyzer.Parse2XML(result);
            }

            return result;
        }