Example #1
0
        /// <summary>
        /// Parses <paramref name="html"/> into a new <see cref="HtmlDocument"/> and runs
        /// <c>CrawlData</c> over it.
        /// </summary>
        /// <param name="html">
        /// Markup to parse. When <c>IsSuperMode</c> is set, it is first passed through
        /// <c>JavaScriptAnalyzer.Parse2XML</c>.
        /// </param>
        /// <param name="doc">Receives the document built from the (possibly converted) markup.</param>
        /// <returns>
        /// Whatever <c>CrawlData</c> produced. If <c>CrawlData</c> throws, the exception is
        /// logged (not rethrown) and the initial empty list is returned.
        /// </returns>
        public List<FreeDocument> CrawlHtmlData(string html, out HtmlDocument doc)
        {
            string markup = html;
            if (IsSuperMode)
            {
                markup = JavaScriptAnalyzer.Parse2XML(markup);
            }

            doc = new HtmlDocument();
            doc.LoadHtml(markup);

            // Fallback value: stays in place when extraction throws below.
            var extracted = new List<FreeDocument>();
            try
            {
                extracted = CrawlData(doc);
                if (extracted.Count == 0)
                {
                    XLogSys.Print.InfoFormat("HTML抽取数据失败,url:{0}", url);
                }
            }
            catch (Exception ex)
            {
                // Extraction failures are reported through the log only; callers get the fallback.
                XLogSys.Print.ErrorFormat("HTML抽取数据失败,url:{0}, 异常为{1}", url, ex.Message);
            }

            return extracted;
        }
Example #2
0
        /// <summary>
        /// Downloads <paramref name="url"/> through <c>helper</c>, after optionally borrowing
        /// proxy/cookie settings from another crawler and substituting URL placeholders.
        /// </summary>
        /// <param name="url">
        /// Address to request. Every match of the <c>extract</c> pattern in it is replaced by the
        /// value that <c>XPathAnalyzer.ParseUrl(URL)</c> holds under the key captured in group 1,
        /// when such a key exists.
        /// </param>
        /// <param name="code">
        /// Receives the status reported by <c>helper.GetHtml</c>, or
        /// <see cref="HttpStatusCode.NoContent"/> when <c>SysProcessManager</c> is null.
        /// </param>
        /// <param name="post">Optional body forwarded unchanged to <c>helper.GetHtml</c>.</param>
        /// <returns>
        /// The response text after <c>JavaScriptAnalyzer.Decode</c> (and <c>Parse2XML</c> when
        /// <c>IsSuperMode</c> is set); an empty string when <c>SysProcessManager</c> is null.
        /// </returns>
        public string GetHtml(string url, out HttpStatusCode code,
                              string post = null)
        {
            var mc = extract.Matches(url);

            if (SysProcessManager == null)
            {
                code = HttpStatusCode.NoContent;
                return "";
            }

            // Work on a snapshot, and filter by type BEFORE matching the name: picking the first
            // process with that name and casting afterwards would yield null whenever a
            // non-crawler process with the same name happens to come first.
            var crawler = SysProcessManager.CurrentProcessCollections.ToArray()
                          .OfType<SmartCrawler>()
                          .FirstOrDefault(d => d.Name == ShareCookie);

            if (crawler != null)
            {
                Http.ProxyIP       = crawler.Http.ProxyIP;
                Http.ProxyPassword = crawler.Http.ProxyPassword;
                Http.ProxyUserName = crawler.Http.ProxyUserName;
                Http.ProxyPort     = crawler.Http.ProxyPort;
                if (Http.Parameters != crawler.Http.Parameters)
                {
                    var cookie = crawler.Http.GetHeaderParameter().Get<string>("Cookie");
                    if (!string.IsNullOrWhiteSpace(cookie))
                    {
                        Http.SetValue("Cookie", cookie);
                    }
                }
            }

            // The parameter dictionary is parsed lazily: only when at least one placeholder exists.
            Dictionary<string, string> paradict = null;
            foreach (Match m in mc)
            {
                if (paradict == null)
                {
                    paradict = XPathAnalyzer.ParseUrl(URL);
                    if (paradict == null)
                    {
                        break;
                    }
                }

                string replacement;
                if (paradict.TryGetValue(m.Groups[1].Value, out replacement))
                {
                    url = url.Replace(m.Groups[0].Value, replacement);
                }
            }

            WebHeaderCollection headerCollection;
            var content = helper.GetHtml(Http, out headerCollection, out code, url, post);

            content = JavaScriptAnalyzer.Decode(content);
            if (IsSuperMode)
            {
                content = JavaScriptAnalyzer.Parse2XML(content);
            }

            return content;
        }
Example #3
0
        /// <summary>
        /// Requests <paramref name="url"/> via <c>helper</c> after filling in URL placeholders.
        /// </summary>
        /// <param name="url">
        /// Address to request. Each match of the <c>extract</c> pattern is swapped for the value
        /// stored under its group-1 key in <c>XPathAnalyzer.ParseUrl(URL)</c>, if present.
        /// </param>
        /// <param name="code">Receives the status produced by <c>helper.GetHtml</c>.</param>
        /// <param name="post">Optional body handed through to <c>helper.GetHtml</c>.</param>
        /// <returns>
        /// The response text run through <c>JavaScriptAnalyzer.Decode</c>, and additionally
        /// through <c>Parse2XML</c> when <c>IsSuperMode</c> is set.
        /// </returns>
        public string GetHtml(string url, out HttpStatusCode code,
                              string post = null)
        {
            var placeholders = extract.Matches(url);

            // Parse the parameter table only when there is something to substitute.
            if (placeholders.Count > 0)
            {
                Dictionary<string, string> urlParameters = XPathAnalyzer.ParseUrl(URL);
                if (urlParameters != null)
                {
                    foreach (Match placeholder in placeholders)
                    {
                        string replacement;
                        if (urlParameters.TryGetValue(placeholder.Groups[1].Value, out replacement))
                        {
                            url = url.Replace(placeholder.Groups[0].Value, replacement);
                        }
                    }
                }
            }

            WebHeaderCollection responseHeaders;
            var body = helper.GetHtml(Http, out responseHeaders, out code, url, post);
            body = JavaScriptAnalyzer.Decode(body);

            if (!IsSuperMode)
            {
                return body;
            }

            body = JavaScriptAnalyzer.Parse2XML(body);
            return body;
        }
Example #4
0
        /// <summary>
        /// Returns the text behind <paramref name="url"/>, which may be either a local file path
        /// starting with a drive letter (e.g. <c>C:\</c> or <c>c:\</c>) or an address fetched
        /// through <c>helper</c>.
        /// </summary>
        /// <param name="url">
        /// File path or address. For addresses, each match of the <c>extract</c> pattern is
        /// replaced by the value stored under its group-1 key in
        /// <c>XPathAnalyzer.ParseUrl(URL)</c>, if present.
        /// </param>
        /// <param name="code">
        /// Local file: <see cref="HttpStatusCode.Accepted"/> when the file exists, otherwise
        /// <see cref="HttpStatusCode.NotFound"/>. Address: the code of the response, or
        /// <see cref="HttpStatusCode.NoContent"/> when <c>SysProcessManager</c> is null.
        /// </param>
        /// <param name="post">Optional body forwarded to <c>helper.GetHtml</c>.</param>
        /// <returns>
        /// The text after <c>JavaScriptAnalyzer.Decode</c> (and <c>Parse2XML</c> when
        /// <c>IsSuperMode</c> is set); an empty string when <c>SysProcessManager</c> is null
        /// on the address path.
        /// </returns>
        public string GetHtml(string url, out HttpStatusCode code,
                              string post = null)
        {
            string result = "";

            code = HttpStatusCode.NotFound;

            // Local file. Drive letters are matched in either case so that "c:\..." is not
            // mistaken for a remote address.
            if (Regex.IsMatch(url, @"^[A-Za-z]:\\"))
            {
                if (File.Exists(url))
                {
                    result = File.ReadAllText(url, AttributeHelper.GetEncoding(this.Http.Encoding));
                    code   = HttpStatusCode.Accepted;
                }
            }
            else
            {
                var mc = extract.Matches(url);
                if (SysProcessManager == null)
                {
                    code = HttpStatusCode.NoContent;
                    return "";
                }
                SetCookie(Http);

                // The parameter dictionary is parsed lazily: only when a placeholder exists.
                Dictionary<string, string> paramDict = null;
                foreach (Match m in mc)
                {
                    if (paramDict == null)
                    {
                        paramDict = XPathAnalyzer.ParseUrl(URL);
                        if (paramDict == null)
                        {
                            break;
                        }
                    }

                    string replacement;
                    if (paramDict.TryGetValue(m.Groups[1].Value, out replacement))
                    {
                        url = url.Replace(m.Groups[0].Value, replacement);
                    }
                }

                // NOTE(review): .Result blocks the calling thread; presumably helper.GetHtml
                // returns a Task, in which case failures surface wrapped in AggregateException.
                // Kept as-is because this method's synchronous signature (out parameter) is
                // part of its contract.
                HttpHelper.HttpResponse response = helper.GetHtml(Http, url, post).Result;
                result = response.Html;
                code   = response.Code;
            }

            result = JavaScriptAnalyzer.Decode(result);
            if (IsSuperMode)
            {
                result = JavaScriptAnalyzer.Parse2XML(result);
            }

            return result;
        }
Example #5
0
        /// <summary>
        /// Returns the text behind <paramref name="url"/>, which may be either a local file path
        /// starting with a drive letter (e.g. <c>C:\</c> or <c>c:\</c>) or an address fetched
        /// through <c>helper</c>, optionally reusing the proxy address and cookie of the crawler
        /// selected by <c>ShareCookie</c>.
        /// </summary>
        /// <param name="url">
        /// File path or address. For addresses, each match of the <c>extract</c> pattern is
        /// replaced by the value stored under its group-1 key in
        /// <c>XPathAnalyzer.ParseUrl(URL)</c>, if present.
        /// </param>
        /// <param name="code">
        /// Local file: <see cref="HttpStatusCode.Accepted"/> when the file exists, otherwise
        /// <see cref="HttpStatusCode.NotFound"/>. Address: the code of the response, or
        /// <see cref="HttpStatusCode.NoContent"/> when <c>SysProcessManager</c> is null.
        /// </param>
        /// <param name="post">Optional body forwarded to <c>helper.GetHtml</c>.</param>
        /// <returns>
        /// The text after <c>JavaScriptAnalyzer.Decode</c> (and <c>Parse2XML</c> when
        /// <c>IsSuperMode</c> is set); an empty string when <c>SysProcessManager</c> is null
        /// on the address path.
        /// </returns>
        public string GetHtml(string url, out HttpStatusCode code,
                              string post = null)
        {
            string result = "";

            code = HttpStatusCode.NotFound;

            // Local file. Drive letters are matched in either case so that "c:\..." is not
            // mistaken for a remote address.
            if (Regex.IsMatch(url, @"^[A-Za-z]:\\"))
            {
                if (File.Exists(url))
                {
                    result = File.ReadAllText(url, AttributeHelper.GetEncoding(this.Http.Encoding));
                    code   = HttpStatusCode.Accepted;
                }
            }
            else
            {
                var mc = extract.Matches(url);
                if (SysProcessManager == null)
                {
                    code = HttpStatusCode.NoContent;
                    return "";
                }

                var crawler = this.SysProcessManager.GetTask<SmartCrawler>(ShareCookie.SelectItem);
                if (crawler != null)
                {
                    Http.ProxyIP = crawler.Http.ProxyIP;
                    if (Http.Parameters != crawler.Http.Parameters)
                    {
                        var cookie = crawler.Http.GetHeaderParameter().Get<string>("Cookie");
                        if (!string.IsNullOrWhiteSpace(cookie))
                        {
                            Http.SetValue("Cookie", cookie);
                        }
                    }
                }

                // The parameter dictionary is parsed lazily: only when a placeholder exists.
                Dictionary<string, string> paradict = null;
                foreach (Match m in mc)
                {
                    if (paradict == null)
                    {
                        paradict = XPathAnalyzer.ParseUrl(URL);
                        if (paradict == null)
                        {
                            break;
                        }
                    }

                    string replacement;
                    if (paradict.TryGetValue(m.Groups[1].Value, out replacement))
                    {
                        url = url.Replace(m.Groups[0].Value, replacement);
                    }
                }

                // NOTE(review): .Result blocks the calling thread; presumably helper.GetHtml
                // returns a Task, in which case failures surface wrapped in AggregateException.
                // Kept as-is because this method's synchronous signature (out parameter) is
                // part of its contract.
                HttpHelper.HttpResponse response = helper.GetHtml(Http, url, post).Result;
                result = response.Html;
                code   = response.Code;
            }

            result = JavaScriptAnalyzer.Decode(result);
            if (IsSuperMode)
            {
                result = JavaScriptAnalyzer.Parse2XML(result);
            }

            return result;
        }