Esempio n. 1
0
        /// <summary>
        ///     Initialises the crawler: creates the HTTP request template with its start URL,
        ///     the HTTP helper, the empty crawl-item and document collections, and sets
        ///     <c>IsMultiData</c> to <c>ListType.List</c>.
        /// </summary>
        public SmartCrawler()
        {
            // Request template and the URL it starts out with.
            Http = new HttpItem();
            Http.URL = "http://www.cnblogs.com/";

            // Helper object used for HTTP access.
            helper = new HttpHelper();

            // Both observable collections start out empty.
            CrawlItems = new ObservableCollection<CrawlItem>();
            Documents = new ObservableCollection<HttpItem>();

            IsMultiData = ListType.List;
        }
Esempio n. 2
0
 /// <summary>
 ///     Sends the request described by <paramref name="requestitem"/> and returns the
 ///     response body as text. Failures are reported through the return value and
 ///     <paramref name="code"/> instead of being thrown.
 /// </summary>
 /// <param name="requestitem">Request settings handed to <c>SetRequest</c>.</param>
 /// <param name="responseHeaders">
 ///     Receives the response headers, or <c>null</c> when no HTTP response was obtained.
 /// </param>
 /// <param name="code">
 ///     Receives the HTTP status code. When the request fails without any HTTP response
 ///     (or with a non-network error) this is <see cref="HttpStatusCode.NotFound"/>.
 /// </param>
 /// <param name="url">Optional URL forwarded to <c>SetRequest</c>.</param>
 /// <param name="post">Optional POST body forwarded to <c>SetRequest</c>.</param>
 /// <returns>
 ///     The response text on success; an "HTTP error" message when the status is not a
 ///     success status; the exception message when an exception occurred.
 /// </returns>
 public string GetHtml(HttpItem requestitem, out WebHeaderCollection responseHeaders, out HttpStatusCode code, string url = null, string post = null)
 {
     try
     {
         var request = SetRequest(requestitem, url, post);
         var r = GetHttpRequestData(request, requestitem,out responseHeaders, out code);
         if (!IsSuccess(code))
             return "HTTP错误,类型:" + code;
         return r;
     }
     catch (WebException ex)
     {
         // A protocol error (4xx/5xx) still carries the server's response: report its
         // real status and headers instead of a blanket NotFound, and release it.
         var errorResponse = ex.Response as HttpWebResponse;
         if (errorResponse != null)
         {
             using (errorResponse)
             {
                 code = errorResponse.StatusCode;
                 responseHeaders = errorResponse.Headers;
             }
         }
         else
         {
             // Timeout, DNS failure, ...: there is no HTTP status to report.
             code = HttpStatusCode.NotFound;
             responseHeaders = null;
         }
         return ex.Message;
     }
     catch (Exception ex)
     {
         code = HttpStatusCode.NotFound;
         responseHeaders = null;
         return ex.Message;
     }
 }
Esempio n. 3
0
 /// <summary>
 ///     Creates the <see cref="HttpWebRequest"/> for a request item and configures it
 ///     through the static <c>SetRequest</c> overload.
 /// </summary>
 /// <param name="item">Request settings; its <c>URL</c> is used when <paramref name="desturl"/> is null.</param>
 /// <param name="desturl">Optional URL that takes precedence over <c>item.URL</c>.</param>
 /// <param name="post">Optional POST body forwarded to the static overload.</param>
 /// <returns>The configured request, or <c>null</c> when no URL is available.</returns>
 private HttpWebRequest SetRequest(HttpItem item, string desturl = null, string post = null)
 {
     var url = desturl ?? item.URL;
     if (url == null)
         return null;

     // Look at the scheme prefix only. A substring test would treat
     // "example.com/?next=http://x" as already having a scheme and would not
     // recognise "HTTP://example.com" at all.
     var candidate = url.TrimStart();
     var isHttps = candidate.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
     var isHttp = candidate.StartsWith("http://", StringComparison.OrdinalIgnoreCase);
     if (!isHttp && !isHttps)
     {
         url = "http://" + url;
     }

     // NOTE(review): this installs a process-wide callback that accepts EVERY server
     // certificate, which disables TLS validation for all requests in the process.
     // Kept for backward compatibility; consider a per-request callback instead.
     if (isHttps)
         ServicePointManager.ServerCertificateValidationCallback =
             (sender, certificate, chain, sslPolicyErrors) => true;

     // Create the request object for the final URL and apply the item's settings.
     var request = (HttpWebRequest) WebRequest.Create(GetUrl(url));
     SetRequest(item, request, desturl, post);

     // Remember the encoding requested by the item in the shared `encoding` member.
     encoding = AttributeHelper.GetEncoding(item.Encoding);
     return request;
 }
Esempio n. 4
0
        /// <summary>
        ///     Applies the settings of <paramref name="item"/> to <paramref name="request"/>:
        ///     method, timeouts, extra headers, Accept, Content-Type, User-Agent, cookie,
        ///     referer, redirect policy and, for POST requests, the request body.
        /// </summary>
        /// <param name="item">Source of the request settings and header parameters.</param>
        /// <param name="request">The request to configure.</param>
        /// <param name="desturl">Not used by this method.</param>
        /// <param name="post">Optional POST body; when null, <c>item.Postdata</c> is used.</param>
        public static void SetRequest(HttpItem item, HttpWebRequest request, string desturl = null, string post = null)
        {
            var docu = item.GetHeaderParameter();

            // NOTE: this overload does not apply any proxy settings.

            // HTTP method (GET, POST, ...) and timeouts.
            request.Method = item.Method.ToString();
            request.Timeout = item.Timeout;
            request.ReadWriteTimeout = item.ReadWriteTimeout;

            // Extra headers: one "Name: value" pair per line.
            request.Headers = new WebHeaderCollection();
            var rawHeaders = docu["Headers"].ToString();
            if (rawHeaders != "")
            {
                foreach (var line in rawHeaders.Split('\n'))
                {
                    // Split at the FIRST colon only, so values that themselves contain ':'
                    // (URLs, timestamps, host:port) are kept instead of being dropped.
                    var separator = line.IndexOf(':');
                    if (separator <= 0)
                        continue;
                    var key = line.Substring(0, separator).Trim();
                    if (key.Length == 0)
                        continue;
                    var value = line.Substring(separator + 1).Trim();
                    if (SetHeaderValue(request.Headers, key, value) == false)
                    {
                        request.Headers.Add(key, value);
                    }
                }
            }
            request.Accept = docu["Accept"].ToString();

            // Content-Type of the request.
            request.ContentType = docu["Content_Type"].ToString();
            // User-Agent: client type, including browser version and operating system.
            request.UserAgent = docu["User-Agent"].ToString();
            // The "Host" parameter is intentionally not applied to the request.

            // Cookie header.
            var cookie = docu["Cookie"].ToString();
            if (!string.IsNullOrEmpty(cookie))
            {
                request.Headers[HttpRequestHeader.Cookie] = cookie;
            }

            // Referer.
            request.Referer = docu["Referer"].ToString();
            // Whether redirects are followed automatically.
            request.AllowAutoRedirect = item.Allowautoredirect;

            // POST body: an explicit argument wins over the item's own data.
            var postdata = post ?? item.Postdata;
            if (!string.IsNullOrEmpty(postdata) &&
                request.Method.IndexOf("post", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                var buffer = Encoding.Default.GetBytes(postdata);
                request.ContentLength = buffer.Length;
                // The request stream must be closed so the connection can be reused.
                using (var body = request.GetRequestStream())
                {
                    body.Write(buffer, 0, buffer.Length);
                }
            }
        }
Esempio n. 5
0
        /// <summary>
        ///     Executes the request, buffers the response body (decompressing gzip content)
        ///     and decodes it to a string.
        /// </summary>
        /// <remarks>
        ///     A <c>Set-Cookie</c> response header is merged into the item's "Cookie" header
        ///     parameter and written back to <c>objhttpitem.Parameters</c>. When the item's
        ///     encoding is unknown (or the shared <c>encoding</c> member is null) the charset
        ///     is taken from an HTML meta tag; a declared or implied ISO-8859-1 is treated as
        ///     GBK. If nothing can be determined, the response's character set and finally
        ///     the system default encoding are used, so decoding never fails on a null encoding.
        /// </remarks>
        /// <param name="request">The prepared request to execute.</param>
        /// <param name="objhttpitem">Request item whose header parameters receive the merged cookie.</param>
        /// <param name="responseHeaders">Receives the response headers.</param>
        /// <param name="statusCode">Receives the response status code.</param>
        /// <returns>The decoded response body.</returns>
        private string GetHttpRequestData(HttpWebRequest request, HttpItem objhttpitem,out WebHeaderCollection responseHeaders, out HttpStatusCode statusCode)
        {
            #region Obtain the response for the request

            using (var response = (HttpWebResponse) request.GetResponse())
            {
                var docu = objhttpitem.GetHeaderParameter();
                if (response.Headers["set-cookie"] != null)
                    docu["Cookie"] = MergeCookie(docu["Cookie"].ToString(), response.Headers["set-cookie"]);

                responseHeaders = response.Headers;
                statusCode = response.StatusCode;
                objhttpitem.Parameters = HttpItem.HeaderToString(docu);

                // Buffer the whole body, decompressing gzip content on the way.
                byte[] rawResponse;
                var isGzip = response.ContentEncoding != null &&
                             response.ContentEncoding.Equals("gzip", StringComparison.InvariantCultureIgnoreCase);
                using (var stream = isGzip
                    ? GetMemoryStream(new GZipStream(response.GetResponseStream(), CompressionMode.Decompress))
                    : GetMemoryStream(response.GetResponseStream()))
                {
                    rawResponse = stream.ToArray();
                }

                if (objhttpitem.Encoding == EncodingType.Unknown || encoding == null)
                {
                    // Decode provisionally just to be able to search for a meta charset tag.
                    var temp = Encoding.Default.GetString(rawResponse, 0, rawResponse.Length);
                    var meta = Regex.Match(temp, "<meta([^<]*)charset=([^<]*)[\"']",
                        RegexOptions.IgnoreCase | RegexOptions.Multiline);
                    var charter = (meta.Groups.Count > 2) ? meta.Groups[2].Value : string.Empty;
                    charter = charter.Replace("\"", string.Empty)
                        .Replace("'", string.Empty)
                        .Replace(";", string.Empty);
                    if (charter.Length > 0)
                    {
                        // Invariant lower-casing: culture-sensitive ToLower() breaks the
                        // "iso-8859-1" comparison under e.g. Turkish culture settings.
                        charter = charter.ToLowerInvariant().Replace("iso-8859-1", "gbk");
                        if (charter.Contains("utf-8"))
                        {
                            encoding = Encoding.UTF8;
                        }
                        else if (charter.Contains("gb"))
                        {
                            encoding = Encoding.GetEncoding("GB2312");
                        }
                        else
                        {
                            // The captured text may carry trailing attributes or an unknown
                            // name; keep the current encoding instead of throwing.
                            encoding = FindEncodingOrNull(charter) ?? encoding;
                        }
                    }
                    else if (response.CharacterSet != null &&
                             string.Equals(response.CharacterSet.Trim(), "iso-8859-1",
                                 StringComparison.OrdinalIgnoreCase))
                    {
                        encoding = Encoding.GetEncoding("gbk");
                    }
                }

                // Nothing detected: fall back to the response's charset, then to the
                // system default, rather than dereferencing a null encoding.
                if (encoding == null)
                    encoding = FindEncodingOrNull(response.CharacterSet) ?? Encoding.Default;

                // Decode the buffered bytes into the resulting text.
                return encoding.GetString(rawResponse);
            }
        }

        /// <summary>Returns the encoding with the given name, or null when the name is empty or unknown.</summary>
        private static Encoding FindEncodingOrNull(string name)
        {
            if (string.IsNullOrEmpty(name) || name.Trim().Length == 0)
                return null;
            try
            {
                return Encoding.GetEncoding(name.Trim());
            }
            catch (ArgumentException)
            {
                return null;
            }
        }
Esempio n. 6
0
        /// <summary>
        ///     Sends the value of the configured column to the remote analysis service and
        ///     returns the service reply. Replies are looked up in, and (for HTTP 200)
        ///     stored into, the buffer helper keyed by the full parameter string.
        /// </summary>
        /// <param name="datas">Document that supplies the text via <c>datas[Column]</c>.</param>
        /// <returns>
        ///     <c>null</c> when the column value is null; otherwise the buffered value or the
        ///     text returned by the HTTP helper.
        /// </returns>
        public override object TransformData(IFreeDocument datas)
        {
            var source = datas[Column];
            if (source == null)
                return null;

            // Output format requested from the service; anything unlisted means "plain".
            string format;
            switch (ResultType)
            {
                case ContentType.Json:
                    format = "json";
                    break;
                case ContentType.XML:
                    format = "xml";
                    break;
                case ContentType.Byte:
                    format = "conll";
                    break;
                default:
                    format = "plain";
                    break;
            }

            // Analysis mode requested from the service; anything unlisted means "all".
            string pattern;
            switch (Pattern)
            {
                case Pattern.分词:
                    pattern = "ws";
                    break;
                case Pattern.词性标注:
                    pattern = "pos";
                    break;
                case Pattern.依存句法分析:
                    pattern = "dp";
                    break;
                case Pattern.语义依存分析:
                    pattern = "sdp";
                    break;
                case Pattern.语义角色标注:
                    pattern = "srl";
                    break;
                case Pattern.命名实体识别:
                    pattern = "ner";
                    break;
                default:
                    pattern = "all";
                    break;
            }

            var param = "api_key=" + apiKey +
                        "&pattern=" + pattern +
                        "&format=" + format +
                        "&text=" + HttpUtility.UrlEncode(source.ToString());

            // Serve from the buffer when this exact query was answered before.
            var cached = buffHelper.Get(param);
            if (cached != null)
                return cached;

            var query = new HttpItem
            {
                URL = uriBase + '?' + param,
                Method = MethodType.GET,
                Encoding = EncodingType.UTF8
            };
            var client = new HttpHelper();
            HttpStatusCode status;
            var reply = client.GetHtml(query, out status);

            // Only replies with status OK are buffered; the reply is returned either way.
            if (status == HttpStatusCode.OK)
                buffHelper.Set(param, reply);
            return reply;
        }
Esempio n. 7
0
 /// <summary>
 ///     Downloads <paramref name="url"/> and loads the returned text into an HTML document.
 /// </summary>
 /// <param name="url">Address of the page to fetch.</param>
 /// <returns>The loaded document, or <c>null</c> when the status is not <c>OK</c>.</returns>
 public static HtmlDocument GetHtmlDocument(string url)
 {
     var request = new HttpItem();
     request.URL = url;

     var client = new HttpHelper();
     HttpStatusCode status;
     var html = client.GetHtml(request, out status);

     if (status == HttpStatusCode.OK)
     {
         var document = new HtmlDocument();
         document.LoadHtml(html);
         return document;
     }

     return null;
 }
Esempio n. 8
0
        /// <summary>
        ///     Executes the request and returns the raw response body as bytes,
        ///     decompressing gzip-encoded content.
        /// </summary>
        /// <remarks>
        ///     A <c>Set-Cookie</c> response header is merged into the item's "Cookie" header
        ///     parameter and written back to <c>objhttpitem.Parameters</c>.
        /// </remarks>
        /// <param name="request">The prepared request to execute.</param>
        /// <param name="objhttpitem">Request item whose header parameters receive the merged cookie.</param>
        /// <param name="statusCode">Receives the response status code.</param>
        /// <returns>The bytes of the (decompressed) response body.</returns>
        private byte[] GetHttpRequestFile(HttpWebRequest request, HttpItem objhttpitem, out HttpStatusCode statusCode)
        {
            using (var response = (HttpWebResponse) request.GetResponse())
            {
                var docu = objhttpitem.GetHeaderParameter();
                if (response.Headers["set-cookie"] != null)
                    docu["Cookie"] = MergeCookie(docu["Cookie"].ToString(), response.Headers["set-cookie"]);

                statusCode = response.StatusCode;
                objhttpitem.Parameters = HttpItem.HeaderToString(docu);

                var isGzip = response.ContentEncoding != null &&
                             response.ContentEncoding.Equals("gzip", StringComparison.InvariantCultureIgnoreCase);

                // Buffer the body once (no throw-away MemoryStream up front) and make sure
                // the buffer is released even when reading fails.
                using (var buffered = isGzip
                    ? GetMemoryStream(new GZipStream(response.GetResponseStream(), CompressionMode.Decompress))
                    : GetMemoryStream(response.GetResponseStream()))
                {
                    return buffered.ToArray();
                }
            }
        }
Esempio n. 9
0
        /// <summary>
        ///     Restores this object from a dictionary: the scalar settings, the HTTP settings
        ///     stored under "HttpSet", an optional login request stored under "Login", and one
        ///     crawl item per child document when <paramref name="dicts"/> is a
        ///     <c>FreeDocument</c> with children.
        /// </summary>
        /// <param name="dicts">Serialized values keyed by property name.</param>
        /// <param name="scenario">Serialization scenario, passed on to the base implementation.</param>
        public override void DictDeserialize(IDictionary<string, object> dicts, Scenario scenario = Scenario.Database)
        {
            base.DictDeserialize(dicts, scenario);
            URL = dicts.Set("URL", URL);
            RootXPath = dicts.Set("RootXPath", RootXPath);
            IsMultiData = dicts.Set("IsMultiData", IsMultiData);
            URLFilter = dicts.Set("URLFilter", URLFilter);
            Crawler = dicts.Set("Crawler", Crawler);
            ContentFilter = dicts.Set("ContentFilter", ContentFilter);

            if (dicts.ContainsKey("HttpSet"))
            {
                // Ignore the entry unless it really is a dictionary; an unchecked `as`
                // cast would hand null to the deserializer.
                var httpSettings = dicts["HttpSet"] as IDictionary<string, object>;
                if (httpSettings != null)
                    Http.UnsafeDictDeserialize(httpSettings);
            }

            if (dicts.ContainsKey("Login"))
            {
                var loginSettings = dicts["Login"] as IDictionary<string, object>;
                if (loginSettings != null)
                {
                    var item = new HttpItem();
                    item.DictDeserialize(loginSettings);
                    Documents.Add(item);
                }
            }

            // A "Generator" entry is not consumed here.

            var doc = dicts as FreeDocument;
            if (doc?.Children != null)
            {
                foreach (var child in doc.Children)
                {
                    var item = new CrawlItem();
                    item.DictDeserialize(child);
                    CrawlItems.Add(item);
                }
            }
        }
Esempio n. 10
0
        /// <summary>
        ///     Handles a completed captured session: filters it by URL, records POST requests
        ///     in <c>Documents</c>, filters it by response content and finally copies the
        ///     captured request into <c>Http</c>.
        /// </summary>
        /// <param name="oSession">The completed session.</param>
        private void FiddlerApplicationAfterSessionComplete(Session oSession)
        {
            if (string.IsNullOrEmpty(URLFilter) == false)
            {
                // NOTE(review): this rewrites the URLFilter property itself on every session
                // (strips "http://"); kept as-is because other code may rely on it.
                URLFilter = URLFilter.Replace("http://", "");
                // Every space-separated term must occur in the session URL.
                if (URLFilter.Split(' ').Any(item => oSession.url.Contains(item) == false))
                {
                    return;
                }
            }

            var httpitem = new HttpItem {Parameters = oSession.oRequest.headers.ToString()};

            // oSession.url carries no scheme; add the one matching the session's HTTPS flag.
            if ((oSession.BitFlags & SessionFlags.IsHTTPS) != 0)
            {
                httpitem.URL = "https://" + oSession.url;
            }
            else
            {
                httpitem.URL = "http://" + oSession.url;
            }

            // A non-empty request body marks the request as POST and records it. This
            // happens before the content filter below is evaluated.
            httpitem.Postdata = Encoding.Default.GetString(oSession.RequestBody);
            if (string.IsNullOrEmpty(httpitem.Postdata) == false)
            {
                httpitem.Method = MethodType.POST;
                ControlExtended.UIInvoke(() => Documents.Add(httpitem));
            }

            if (string.IsNullOrEmpty(ContentFilter) == false)
            {
                // Fetch the response body once instead of once per filter term.
                var responseBody = oSession.GetResponseBodyAsString();
                // Every space-separated term must occur in the response body.
                if (ContentFilter.Split(' ').Any(item => responseBody.Contains(item) == false))
                {
                    return;
                }
            }

            httpitem.DictCopyTo(Http);
            XLogSys.Print.Info("已经成功获取嗅探字段" + oSession.url);
        }
Esempio n. 11
0
 /// <summary>
 ///     Convenience overload of <c>GetHtml</c> for callers that do not need the response
 ///     headers: delegates to the overload with a headers out-parameter and discards them.
 /// </summary>
 /// <param name="requestitem">Request settings.</param>
 /// <param name="code">Receives the status code reported by the full overload.</param>
 /// <param name="url">Optional URL, passed through unchanged.</param>
 /// <param name="post">Optional POST body, passed through unchanged.</param>
 /// <returns>The string returned by the full overload.</returns>
 public string GetHtml(HttpItem requestitem,  out HttpStatusCode code, string url = null, string post = null)
 {
     // The headers are produced by the full overload but not exposed here.
     WebHeaderCollection discardedHeaders;
     var html = GetHtml(requestitem, out discardedHeaders, out code, url, post);
     return html;
 }
Esempio n. 12
0
        /// <summary>
        ///     Applies the settings of <paramref name="item"/> to <paramref name="request"/>:
        ///     optional proxy, method, timeouts, extra headers, Accept, Content-Type,
        ///     User-Agent, cookie, referer, redirect policy and, for POST requests, the body.
        /// </summary>
        /// <param name="item">Source of the request settings and header parameters.</param>
        /// <param name="request">The request to configure.</param>
        /// <param name="desturl">Not used by this method.</param>
        /// <param name="post">Optional POST body; when null, <c>item.Postdata</c> is used.</param>
        public static void SetRequest(HttpItem item, HttpWebRequest request, string desturl = null, string post = null)
        {
            var docu = item.GetHeaderParameter();

            // A proxy is configured only when both a port and an address are given.
            if (item.ProxyPort != 0 && !string.IsNullOrEmpty(item.ProxyIP))
            {
                var myProxy = new WebProxy(item.ProxyIP, item.ProxyPort);
                // Credentials used to authenticate against the proxy.
                myProxy.Credentials = new NetworkCredential(item.ProxyUserName, item.ProxyPassword);
                request.Proxy = myProxy;
                // Credentials of the request itself.
                request.Credentials = CredentialCache.DefaultNetworkCredentials;
            }

            // HTTP method (GET, POST, ...) and timeouts.
            request.Method = item.Method.ToString();
            request.Timeout = item.Timeout;
            request.ReadWriteTimeout = item.ReadWriteTimeout;

            // Extra headers: one "Name: value" pair per line.
            request.Headers = new WebHeaderCollection();
            var rawHeaders = docu["Headers"].ToString();
            if (rawHeaders != "")
            {
                foreach (var line in rawHeaders.Split('\n'))
                {
                    // Split at the FIRST colon only, so values that themselves contain ':'
                    // (URLs, timestamps, host:port) are kept instead of being dropped.
                    var separator = line.IndexOf(':');
                    if (separator <= 0)
                        continue;
                    var key = line.Substring(0, separator).Trim();
                    if (key.Length == 0)
                        continue;
                    var value = line.Substring(separator + 1).Trim();
                    if (SetHeaderValue(request.Headers, key, value) == false)
                    {
                        request.Headers.Add(key, value);
                    }
                }
            }
            request.Accept = docu["Accept"].ToString();

            // Content-Type of the request.
            request.ContentType = docu["Content_Type"].ToString();
            // User-Agent: client type, including browser version and operating system.
            request.UserAgent = docu["User-Agent"].ToString();
            // The "Host" parameter is intentionally not applied to the request.

            // Cookie header.
            var cookie = docu["Cookie"].ToString();
            if (!string.IsNullOrEmpty(cookie))
            {
                request.Headers[HttpRequestHeader.Cookie] = cookie;
            }

            // Referer.
            request.Referer = docu["Referer"].ToString();
            // Whether redirects are followed automatically.
            request.AllowAutoRedirect = item.Allowautoredirect;

            // POST body: an explicit argument wins over the item's own data.
            var postdata = post ?? item.Postdata;
            if (!string.IsNullOrEmpty(postdata) &&
                request.Method.IndexOf("post", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                var buffer = Encoding.Default.GetBytes(postdata);
                request.ContentLength = buffer.Length;
                // The request stream must be closed so the connection can be reused.
                using (var body = request.GetRequestStream())
                {
                    body.Write(buffer, 0, buffer.Length);
                }
            }
        }
Esempio n. 13
0
        /// <summary>
        ///     Executes the request, buffers the response body (decompressing gzip content)
        ///     and decodes it to a string.
        /// </summary>
        /// <remarks>
        ///     A <c>Set-Cookie</c> response header is merged into the item's "Cookie" header
        ///     parameter and written back to <c>objhttpitem.Parameters</c>. When the item's
        ///     encoding is unknown (or the shared <c>encoding</c> member is null) the charset
        ///     is taken from an HTML meta tag; a declared or implied ISO-8859-1 is treated as
        ///     GBK. If nothing can be determined, the response's character set and finally
        ///     the system default encoding are used, so decoding never fails on a null encoding.
        /// </remarks>
        /// <param name="request">The prepared request to execute.</param>
        /// <param name="objhttpitem">Request item whose header parameters receive the merged cookie.</param>
        /// <param name="responseHeaders">Receives the response headers.</param>
        /// <param name="statusCode">Receives the response status code.</param>
        /// <returns>The decoded response body.</returns>
        private string GetHttpRequestData(HttpWebRequest request, HttpItem objhttpitem,out WebHeaderCollection responseHeaders, out HttpStatusCode statusCode)
        {
            #region Obtain the response for the request

            using (var response = (HttpWebResponse) request.GetResponse())
            {
                var docu = objhttpitem.GetHeaderParameter();
                if (response.Headers["set-cookie"] != null)
                    docu["Cookie"] = MergeCookie(docu["Cookie"].ToString(), response.Headers["set-cookie"]);

                responseHeaders = response.Headers;
                statusCode = response.StatusCode;
                objhttpitem.Parameters = HttpItem.HeaderToString(docu);

                // Buffer the whole body, decompressing gzip content on the way.
                byte[] rawResponse;
                var isGzip = response.ContentEncoding != null &&
                             response.ContentEncoding.Equals("gzip", StringComparison.InvariantCultureIgnoreCase);
                using (var stream = isGzip
                    ? GetMemoryStream(new GZipStream(response.GetResponseStream(), CompressionMode.Decompress))
                    : GetMemoryStream(response.GetResponseStream()))
                {
                    rawResponse = stream.ToArray();
                }

                if (objhttpitem.Encoding == EncodingType.Unknown || encoding == null)
                {
                    // Decode provisionally just to be able to search for a meta charset tag.
                    var temp = Encoding.Default.GetString(rawResponse, 0, rawResponse.Length);
                    var meta = Regex.Match(temp, "<meta([^<]*)charset=([^<]*)[\"']",
                        RegexOptions.IgnoreCase | RegexOptions.Multiline);
                    var charter = (meta.Groups.Count > 2) ? meta.Groups[2].Value : string.Empty;
                    charter = charter.Replace("\"", string.Empty)
                        .Replace("'", string.Empty)
                        .Replace(";", string.Empty);
                    if (charter.Length > 0)
                    {
                        // Invariant lower-casing: culture-sensitive ToLower() breaks the
                        // "iso-8859-1" comparison under e.g. Turkish culture settings.
                        charter = charter.ToLowerInvariant().Replace("iso-8859-1", "gbk");
                        if (charter.Contains("utf-8"))
                        {
                            encoding = Encoding.UTF8;
                        }
                        else if (charter.Contains("gb"))
                        {
                            encoding = Encoding.GetEncoding("GB2312");
                        }
                        else
                        {
                            // The captured text may carry trailing attributes or an unknown
                            // name; keep the current encoding instead of throwing.
                            encoding = FindEncodingOrNull(charter) ?? encoding;
                        }
                    }
                    else if (response.CharacterSet != null &&
                             string.Equals(response.CharacterSet.Trim(), "iso-8859-1",
                                 StringComparison.OrdinalIgnoreCase))
                    {
                        encoding = Encoding.GetEncoding("gbk");
                    }
                }

                // Nothing detected: fall back to the response's charset, then to the
                // system default, rather than dereferencing a null encoding.
                if (encoding == null)
                    encoding = FindEncodingOrNull(response.CharacterSet) ?? Encoding.Default;

                // Decode the buffered bytes into the resulting text.
                return encoding.GetString(rawResponse);
            }
        }

        /// <summary>Returns the encoding with the given name, or null when the name is empty or unknown.</summary>
        private static Encoding FindEncodingOrNull(string name)
        {
            if (string.IsNullOrEmpty(name) || name.Trim().Length == 0)
                return null;
            try
            {
                return Encoding.GetEncoding(name.Trim());
            }
            catch (ArgumentException)
            {
                return null;
            }
        }
Esempio n. 14
0
        /// <summary>
        ///     Downloads <paramref name="url"/> and lazily yields the data lists extracted
        ///     from the page by <c>GetDataFromHtml</c>.
        /// </summary>
        /// <param name="url">Address of the page to fetch.</param>
        /// <returns>
        ///     Nothing when the status is not <c>OK</c>; a single empty list when the page
        ///     body is null; otherwise one list per extracted data set.
        /// </returns>
        public static IEnumerable<List<FreeDocument>> GetMultiDataFromURL(string url)
        {
            var httpitem = new HttpItem {URL = url};
            var helper = new HttpHelper();
            HttpStatusCode statusCode;
            var doc2 = helper.GetHtml(httpitem, out statusCode);
            if (statusCode != HttpStatusCode.OK)
                yield break;

            if (doc2 == null)
            {
                yield return new List<FreeDocument>();
                // Stop here: there is no markup to parse, and continuing would pass
                // null to LoadHtml.
                yield break;
            }

            var htmldoc = new HtmlDocument();
            htmldoc.LoadHtml(doc2);

            foreach (var item in htmldoc.GetDataFromHtml())
            {
                yield return item;
            }
        }
Esempio n. 15
0
 /// <summary>
 ///     Convenience overload of <c>GetHtml</c> for callers that do not need the response
 ///     headers: delegates to the overload with a headers out-parameter and discards them.
 /// </summary>
 /// <param name="requestitem">Request settings.</param>
 /// <param name="code">Receives the status code reported by the full overload.</param>
 /// <param name="url">Optional URL, passed through unchanged.</param>
 /// <param name="post">Optional POST body, passed through unchanged.</param>
 /// <returns>The string returned by the full overload.</returns>
 public string GetHtml(HttpItem requestitem,  out HttpStatusCode code, string url = null, string post = null)
 {
     // The headers are produced by the full overload but not exposed here.
     WebHeaderCollection discardedHeaders;
     var html = GetHtml(requestitem, out discardedHeaders, out code, url, post);
     return html;
 }
Esempio n. 16
0
 /// <summary>
 ///     Sends the request described by <paramref name="requestitem"/> and returns the
 ///     response body as raw bytes. Failures are logged and reported through
 ///     <paramref name="code"/> instead of being thrown.
 /// </summary>
 /// <param name="requestitem">Request settings handed to <c>SetRequest</c>.</param>
 /// <param name="code">
 ///     Receives the HTTP status code. When the request fails without any HTTP response
 ///     (or with a non-network error) this is <see cref="HttpStatusCode.NotFound"/>.
 /// </param>
 /// <param name="url">Optional URL forwarded to <c>SetRequest</c>.</param>
 /// <param name="post">Optional POST body forwarded to <c>SetRequest</c>.</param>
 /// <returns>The response bytes, or an empty array when an exception occurred.</returns>
 public byte[] GetFile(HttpItem requestitem, out HttpStatusCode code, string url = null, string post = null)
 {
     try
     {
         var request = SetRequest(requestitem, url, post);
         var r = GetHttpRequestFile(request, requestitem, out code);
         if (!IsSuccess(code))
             XLogSys.Print.ErrorFormat("HTTP错误,URL:{0},类型:{1}", url, code.ToString());
         return r;
     }
     catch (Exception ex)
     {
         // Report the server's real status when the failure carries an HTTP response;
         // otherwise keep the historical NotFound.
         code = HttpStatusCode.NotFound;
         var webError = ex as WebException;
         var errorResponse = webError != null ? webError.Response as HttpWebResponse : null;
         if (errorResponse != null)
         {
             using (errorResponse)
             {
                 code = errorResponse.StatusCode;
             }
         }

         // Do not swallow the failure silently: log which URL failed and why.
         var failedUrl = url ?? (requestitem != null ? requestitem.URL : null);
         XLogSys.Print.ErrorFormat("HTTP错误,URL:{0},类型:{1}", failedUrl, ex.Message);
         return new byte[0];
     }
 }
Esempio n. 17
0
 /// <summary>
 ///     Creates the client with UTF-8 as its <c>Encoding</c> and stores the given
 ///     request item in <c>HttpItem</c>.
 /// </summary>
 /// <param name="item">Request item kept by this client.</param>
 public CookieAwareWebClient(HttpItem item)
 {
     Encoding = Encoding.UTF8;
     HttpItem = item;
 }
Esempio n. 18
0
        /// <summary>
        ///     Translates <paramref name="item"/> through the Baidu translate web API, using
        ///     the configured source and target languages, and buffers successful results.
        /// </summary>
        /// <param name="item">Text to translate.</param>
        /// <returns>
        ///     The input itself when it is null or whitespace; the buffered or freshly fetched
        ///     translation (one line per result entry); or "Error" when the request fails or
        ///     the reply reports an error.
        /// </returns>
        public string Translate(string item)
        {
            // Nothing to translate; checked before touching the buffer so a null key is
            // never looked up.
            if (string.IsNullOrWhiteSpace(item))
                return item;

            // NOTE(review): the buffer is keyed by the source text only, so a result stored
            // for one language pair is also returned after the languages are changed.
            var res = buffHelper.Get(item);
            if (res != null)
                return res;

            var httpitem = new HttpItem();

            // The text must be escaped: spaces, '&', '#' or '=' would otherwise corrupt the query.
            string url =
                $"http://openapi.baidu.com/public/2.0/bmt/translate?client_id={ClientID}&q={System.Uri.EscapeDataString(item)}&from={language[Source.SelectItem]}&to={language[Target.SelectItem]}";
            httpitem.URL = url;
            HttpStatusCode code;

            string result = helper.GetHtml(httpitem,out code);
            // On failure the helper returns an error message, not JSON.
            if (code != HttpStatusCode.OK)
                return "Error";

            var r = JsonConvert.Import(result) as JsonObject;
            // The key is "error_code" (no trailing space); with the stray space error
            // replies were never recognised.
            if (r == null || r.Contains("error_code"))
                return "Error";

            var array = r["trans_result"] as JsonArray;
            if (array == null)
                return "Error";

            var sb = new StringBuilder();
            for (int i = 0; i < array.Length; i++)
            {
                var j = array[i] as JsonObject;
                if (j == null)
                    continue;
                object r2 = j["dst"];
                if (r2 == null)
                    continue;
                sb.AppendLine(r2.ToString());
            }
            string t = sb.ToString();
            buffHelper.Set(item,t);
            return t;
        }
Esempio n. 19
0
        /// <summary>
        ///     Downloads <paramref name="url"/> and loads the returned text into an HTML document.
        /// </summary>
        /// <param name="url">Address of the page to fetch.</param>
        /// <param name="encoding">Encoding setting assigned to the request item.</param>
        /// <returns>
        ///     The loaded document; an empty document when the status is not a success status.
        /// </returns>
        public static HtmlDocument GetDocumentFromURL(string url, EncodingType encoding = EncodingType.Unknown)
        {
            var request = new HttpItem {URL = url, Encoding = encoding};
            var client = new HttpHelper();
            var document = new HtmlDocument();

            HttpStatusCode status;
            var html = client.GetHtml(request, out status);

            // Only a successful response is parsed; otherwise the document stays empty.
            if (HttpHelper.IsSuccess(status))
                document.LoadHtml(html);

            return document;
        }