Ejemplo n.º 1
0
        /// <summary>
        /// Requests <paramref name="url"/> through <c>helper.GetHtml</c> and returns the decoded response text.
        /// </summary>
        /// <param name="url">Address to request. Every match of the <c>extract</c> pattern is replaced by the
        /// value that <c>XPathAnalyzer.ParseUrl(URL)</c> holds for the key captured in group 1, when such a key exists.</param>
        /// <param name="code">Receives the status reported by <c>helper.GetHtml</c>, or
        /// <see cref="HttpStatusCode.NoContent"/> when <c>SysProcessManager</c> is null.</param>
        /// <param name="post">Optional request body, forwarded to <c>helper.GetHtml</c> unchanged.</param>
        /// <returns>An empty string when <c>SysProcessManager</c> is null; otherwise the response passed through
        /// <c>JavaScriptAnalyzer.Decode</c> and, when <c>IsSuperMode</c> is set, <c>JavaScriptAnalyzer.Parse2XML</c>.</returns>
        public string GetHtml(string url, out HttpStatusCode code,
                              string post = null)
        {
            var placeholders = extract.Matches(url);

            if (SysProcessManager == null)
            {
                code = HttpStatusCode.NoContent;
                return string.Empty;
            }

            // Look up the crawler whose name equals ShareCookie and borrow its proxy settings and cookie.
            var donor = SysProcessManager.CurrentProcessCollections
                                         .ToArray()
                                         .FirstOrDefault(d => d.Name == ShareCookie) as SmartCrawler;
            if (donor != null)
            {
                Http.ProxyIP       = donor.Http.ProxyIP;
                Http.ProxyPassword = donor.Http.ProxyPassword;
                Http.ProxyUserName = donor.Http.ProxyUserName;
                Http.ProxyPort     = donor.Http.ProxyPort;

                if (Http.Parameters != donor.Http.Parameters)
                {
                    var sharedCookie = donor.Http.GetHeaderParameter().Get<string>("Cookie");
                    if (!string.IsNullOrWhiteSpace(sharedCookie))
                    {
                        Http.SetValue("Cookie", sharedCookie);
                    }
                }
            }

            // The template URL is parsed only when there is at least one placeholder to fill in.
            if (placeholders.Count > 0)
            {
                Dictionary<string, string> values = XPathAnalyzer.ParseUrl(URL);
                if (values != null)
                {
                    foreach (Match placeholder in placeholders)
                    {
                        string replacement;
                        if (values.TryGetValue(placeholder.Groups[1].Value, out replacement))
                        {
                            url = url.Replace(placeholder.Groups[0].Value, replacement);
                        }
                    }
                }
            }

            WebHeaderCollection responseHeaders;
            var body = JavaScriptAnalyzer.Decode(helper.GetHtml(Http, out responseHeaders, out code, url, post));

            return IsSuperMode ? JavaScriptAnalyzer.Parse2XML(body) : body;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Loads the s.weibo.com search page for <paramref name="topicName"/> and builds one
        /// <see cref="WeiBoContentItem"/> for every <c>div</c> whose class attribute is exactly <c>content clearfix</c>.
        /// </summary>
        /// <param name="topicName">Topic text; it is concatenated into the request URL as-is (no escaping is applied here).</param>
        /// <param name="targetName">Only items whose author contains this text are returned. Empty or null keeps every item.</param>
        /// <returns>The matching items ordered by time, newest first. Empty when the page contains no matching block.</returns>
        public static List<WeiBoContentItem> GetWeiBoTopicContentV2(string topicName, string targetName = "")
        {
            List<WeiBoContentItem> res = new List<WeiBoContentItem>();
            HtmlWeb webClient = new HtmlWeb();
            HtmlDocument doc = webClient.Load("https://s.weibo.com/weibo/" + topicName + "&Refer=weibo_weibo&xsort=time&realtimeweibo=1");

            doc.DocumentNode.InnerHtml = JavaScriptAnalyzer.Decode(doc.DocumentNode.InnerHtml);
            HtmlNodeCollection contentList = doc.DocumentNode.SelectNodes("//div[@class='content clearfix']");

            // SelectNodes yields null (not an empty collection) when nothing matches.
            if (contentList == null)
            {
                return res;
            }

            // Build one item per topic entry.
            foreach (var entry in contentList)
            {
                var item = new WeiBoContentItem();

                // Publication time; Convert.ToDateTime(null) yields DateTime.MinValue when the node is missing.
                var timeNode = entry.SelectSingleNode(".//a[@class='W_textb']");
                item.Time = Convert.ToDateTime(timeNode?.InnerText);

                var authorNode = entry.SelectSingleNode(".//a[@class='W_texta W_fb']");
                item.Author = authorNode?.InnerText.Trim();

                var textNode = entry.SelectSingleNode(".//p[@class='comment_txt']");
                item.ContentStr = textNode?.InnerText.Trim();

                // Leave Pic null when there is no image or no src, instead of producing the bare "https:" prefix.
                var picNode = entry.SelectSingleNode(".//img[@action-type='feed_list_media_img']");
                var src = picNode?.Attributes.FirstOrDefault(c => c.Name == "src")?.Value;
                item.Pic = src == null ? null : "https:" + src.Replace("thumbnail", "large");

                res.Add(item);
            }

            var filter = targetName ?? string.Empty;
            return res.Where(p => (p.Author ?? string.Empty).Trim().Contains(filter))
                      .OrderByDescending(p => p.Time)
                      .ToList();
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Parses <paramref name="html"/> into an <see cref="HtmlDocument"/> and runs <c>CrawlData</c> on it.
        /// </summary>
        /// <param name="html">Markup to parse; it is first passed through <c>JavaScriptAnalyzer.Parse2XML</c> when <c>IsSuperMode</c> is set.</param>
        /// <param name="doc">Receives the document that was loaded from the markup.</param>
        /// <returns>Whatever <c>CrawlData</c> produced, or an empty list when <c>CrawlData</c> itself threw.
        /// Failures are logged, not propagated.</returns>
        public List<FreeDocument> CrawlHtmlData(string html, out HtmlDocument doc)
        {
            var markup = IsSuperMode ? JavaScriptAnalyzer.Parse2XML(html) : html;

            doc = new HtmlDocument();
            doc.LoadHtml(markup);

            var extracted = new List<FreeDocument>();
            try
            {
                extracted = CrawlData(doc);
                if (extracted.Count < 1)
                {
                    XLogSys.Print.InfoFormat("HTML抽取数据失败,url:{0}", url);
                }
            }
            catch (Exception ex)
            {
                XLogSys.Print.ErrorFormat("HTML抽取数据失败,url:{0}, 异常为{1}", url, ex.Message);
            }

            return extracted;
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Requests <paramref name="url"/> through <c>helper.GetHtml</c> and returns the response converted by
        /// <c>JavaScriptAnalyzer.Json2XML</c>.
        /// </summary>
        /// <param name="url">Address to request. Every match of the <c>extract</c> pattern is replaced by the
        /// value that <c>XPathAnalyzer.ParseUrl(URL)</c> holds for the key captured in group 1, when such a key exists.</param>
        /// <param name="code">Receives the status reported by <c>helper.GetHtml</c>.</param>
        /// <param name="post">Optional request body, forwarded to <c>helper.GetHtml</c> unchanged.</param>
        /// <returns>The output of <c>Json2XML</c> for the response text; the "is JSON" flag it reports is discarded.</returns>
        public string GetHtml(string url, out HttpStatusCode code,
                              string post = null)
        {
            var placeholders = extract.Matches(url);

            // The template URL is parsed only when there is at least one placeholder to fill in.
            if (placeholders.Count > 0)
            {
                Dictionary<string, string> values = XPathAnalyzer.ParseUrl(URL);
                if (values != null)
                {
                    foreach (Match placeholder in placeholders)
                    {
                        string replacement;
                        if (values.TryGetValue(placeholder.Groups[1].Value, out replacement))
                        {
                            url = url.Replace(placeholder.Groups[0].Value, replacement);
                        }
                    }
                }
            }

            WebHeaderCollection responseHeaders;
            var raw = helper.GetHtml(Http, out responseHeaders, out code, url, post);

            bool wasJson;
            return JavaScriptAnalyzer.Json2XML(raw, out wasJson, IsSuperMode);
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Fiddler callback run for a completed session. When the session qualifies, its request is copied
        /// into <c>Http</c> and the capture is announced in the log.
        /// </summary>
        /// <param name="oSession">The completed session. It is ignored when it has no request headers, or when
        /// <c>SelectText</c> is non-blank and the decoded response body does not contain it.</param>
        private void FiddlerApplicationAfterSessionComplete(Session oSession)
        {
            if (oSession.oRequest.headers == null)
            {
                return;
            }

            var httpitem = new HttpItem { Parameters = oSession.oRequest.headers.ToString() };

            var isHttps = (oSession.BitFlags & SessionFlags.IsHTTPS) != 0;
            httpitem.URL = (isHttps ? "https://" : "http://") + oSession.url;
            httpitem.Postdata = Encoding.Default.GetString(oSession.RequestBody);

            // With a search text configured, only sessions whose response contains it are accepted.
            if (!string.IsNullOrWhiteSpace(SelectText))
            {
                var body = JavaScriptAnalyzer.Decode(oSession.GetResponseBodyAsString());
                if (!body.Contains(SelectText))
                {
                    return;
                }
            }

            IsSuperMode = true;
            StopVisit();
            httpitem.DictCopyTo(Http);

            var post = Http.Method == MethodType.POST
                ? "post请求的内容为:\n" + httpitem.Postdata + "\n"
                : "";

            // The main window is made topmost while the message is logged and released afterwards.
            var window = MainFrm as Window;
            ControlExtended.UIInvoke(() =>
            {
                if (window != null)
                {
                    window.Topmost = true;
                }
            });

            XLogSys.Print.Info($"已经成功获取嗅探字段! 真实请求地址:\n{oSession.url},\n已自动配置了网页采集器,请求类型为{Http.Method}\n {post}已经刷新了网页采集器的内容");

            ControlExtended.UIInvoke(() =>
            {
                if (window != null)
                {
                    window.Topmost = false;
                }
            });

            URL = oSession.url;
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Converts <c>lastData</c> with <c>JavaScriptAnalyzer.Json2XML</c> and loads the result into the
        /// crawler named by <c>CrawlerSelector</c>, offering to create that crawler when <c>GetCrawler</c> finds none.
        /// </summary>
        /// <remarks>
        /// Returns without touching any crawler when the JSON check fails or the user declines the creation prompt.
        /// The result of the first <c>SafeCheck</c> (blank crawler name) is not inspected.
        /// </remarks>
        private void Design()
        {
            (!string.IsNullOrWhiteSpace(CrawlerSelector)).SafeCheck("采集器名称不能为空");

            bool isRealJson;
            var convertedMarkup = JavaScriptAnalyzer.Json2XML(lastData, out isRealJson, true);
            if (!isRealJson.SafeCheck("只有标准json格式才能启用采集器设计"))
            {
                return;
            }

            var target = GetCrawler(CrawlerSelector);
            if (target == null)
            {
                var answer = MessageBox.Show($"是否要创建名为{CrawlerSelector}的网页采集器?", "提示信息", MessageBoxButton.OKCancel);
                if (answer != MessageBoxResult.OK)
                {
                    return;
                }

                var created = new SmartCrawler { Name = CrawlerSelector };
                processManager.CurrentProcessCollections.Add(created);
                target = created;
            }

            var dockManager = MainDescription.MainFrm as IDockableManager;
            dockManager.ActiveThisContent(CrawlerSelector);

            target.URLHTML = convertedMarkup;
            target.HtmlDoc.LoadHtml(convertedMarkup);
            target.enableRefresh = false;
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Reads the value of <c>Column</c> from <paramref name="datas"/> as text and writes the parsed result back
        /// into the same document.
        /// </summary>
        /// <param name="datas">Document to read from and to update in place.</param>
        /// <returns>Always null; every result is written into <paramref name="datas"/>.</returns>
        /// <remarks>
        /// With <c>crawlerEnabled</c> set, the text is converted by <c>JavaScriptAnalyzer.Json2XML</c> and, when it
        /// is real JSON, the first document produced by <c>selector.CrawlHtmlData</c> is copied into
        /// <paramref name="datas"/>. Otherwise the text is deserialized with <c>serialier</c>; a failure stores the
        /// exception message through <c>SetValue</c>.
        /// </remarks>
        public override object TransformData(IFreeDocument datas)
        {
            var item = datas[Column];
            if (item == null)
            {
                return null;
            }

            var itemstr = item.ToString();
            if (string.IsNullOrWhiteSpace(itemstr))
            {
                return null;
            }

            // Convert once and reuse the result for both the lastData capture and the crawler path.
            // NOTE(review): assumes Json2XML has no side effects that relied on it being called twice - confirm.
            var isRealJson = false;
            string html = null;
            if (lastData == null || crawlerEnabled)
            {
                html = JavaScriptAnalyzer.Json2XML(itemstr, out isRealJson, true);
            }

            // Remember the first real-JSON sample that passes through.
            if (lastData == null && isRealJson)
            {
                lastData = itemstr;
            }

            if (crawlerEnabled)
            {
                if (isRealJson)
                {
                    HtmlDocument htmldoc;
                    var extracted = selector.CrawlHtmlData(html, out htmldoc);
                    var doc = extracted == null ? null : extracted.FirstOrDefault();

                    // Nothing extracted: leave the document untouched instead of dereferencing null.
                    if (doc != null)
                    {
                        doc.DictCopyTo(datas);
                    }
                }
                return null;
            }

            dynamic d = null;
            try
            {
                d = serialier.DeserializeObject(itemstr);
            }
            catch (Exception ex)
            {
                // Deliberate best effort: the failure reason becomes the output value.
                SetValue(datas, ex.Message);
                return null;
            }

            if (ScriptWorkMode == ScriptWorkMode.单文档)
            {
                var newdoc = ScriptHelper.ToDocument(d) as FreeDocument;

                // The cast yields null when ToDocument returns something that is not a FreeDocument.
                if (newdoc != null)
                {
                    newdoc.DictCopyTo(datas);
                }
            }
            else
            {
                SetValue(datas, d);
            }

            return null;
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Gets a list of Weibo posts from the m.weibo.cn container API and converts them to
        /// <see cref="WeiBoContentItem"/> entries.
        /// </summary>
        /// <param name="topicName">Topic name. NOTE(review): the request URL below is hard-coded and does not
        /// contain this value, so it currently has no effect on the result - confirm the intended endpoint.</param>
        /// <param name="targetName">Only posts whose author name contains this text are returned. Empty or null keeps every post.</param>
        /// <returns>The matching posts ordered by time, newest first. Empty when the response carries no cards.</returns>
        public static List<WeiBoContentItem> GetWeiBoTopicContentV1(string topicName, string targetName = "")
        {
            var res = JavaScriptAnalyzer.Decode(ToolClass.GetAPI("https://m.weibo.cn/api/container/getIndex?type=uid&value=1761587065"));
            var ret = Newtonsoft.Json.JsonConvert.DeserializeObject<WeiBoTopicRes>(res);

            List<WeiBoContentItem> theres = new List<WeiBoContentItem>();

            var cards = ret?.data?.cards;
            if (cards == null)
            {
                return theres;
            }

            foreach (var card in cards)
            {
                if (card.card_group == null)
                {
                    continue;
                }

                foreach (var entry in card.card_group)
                {
                    // Group entries without a post payload carry nothing to convert.
                    var blog = entry.mblog;
                    if (blog == null)
                    {
                        continue;
                    }

                    HtmlDocument htmlDocument = new HtmlDocument();
                    htmlDocument.LoadHtml(blog.text ?? string.Empty);

                    WeiBoContentItem item = new WeiBoContentItem
                    {
                        Pic        = blog.original_pic,
                        Author     = blog.user?.screen_name,
                        ContentStr = htmlDocument.DocumentNode?.InnerText
                    };

                    // created_at is either relative ("N分钟前", "N小时前", "昨天 ...", "前天 ...") or an absolute date.
                    var created = blog.created_at;
                    if (!string.IsNullOrEmpty(created))
                    {
                        if (created.Contains("分钟"))
                        {
                            var minutes = Convert.ToInt32(created.Replace("分钟前", ""));
                            item.Time = DateTime.Now.AddMinutes(-minutes);
                        }
                        else if (created.Contains("小时"))
                        {
                            var hours = Convert.ToInt32(created.Replace("小时前", ""));
                            item.Time = DateTime.Now.AddHours(-hours);
                        }
                        else if (created.Contains("昨天"))
                        {
                            item.Time = Convert.ToDateTime(created.Replace("昨天", "").Trim()).AddDays(-1);
                        }
                        else if (created.Contains("前天"))
                        {
                            item.Time = Convert.ToDateTime(created.Replace("前天", "").Trim()).AddDays(-2);
                        }
                        else
                        {
                            item.Time = Convert.ToDateTime(created);
                        }
                    }

                    theres.Add(item);
                }
            }

            var filter = targetName ?? string.Empty;
            return theres.Where(p => (p.Author ?? string.Empty).Trim().Contains(filter))
                         .OrderByDescending(p => p.Time)
                         .ToList();
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Returns the decoded content of <paramref name="url"/>, read from disk when it looks like a drive-letter
        /// path and requested through <c>helper.GetHtml</c> otherwise.
        /// </summary>
        /// <param name="url">A path starting with a drive letter (<c>X:\</c>, either case) or an address to request.
        /// For addresses, every match of the <c>extract</c> pattern is replaced by the value that
        /// <c>XPathAnalyzer.ParseUrl(URL)</c> holds for the key captured in group 1, when such a key exists.</param>
        /// <param name="code"><see cref="HttpStatusCode.Accepted"/> for a file that was read,
        /// <see cref="HttpStatusCode.NotFound"/> for a missing file, <see cref="HttpStatusCode.NoContent"/> when
        /// <c>SysProcessManager</c> is null, otherwise the code carried by the response.</param>
        /// <param name="post">Optional request body, forwarded to <c>helper.GetHtml</c> unchanged.</param>
        /// <returns>The text passed through <c>JavaScriptAnalyzer.Decode</c> and, when <c>IsSuperMode</c> is set,
        /// <c>JavaScriptAnalyzer.Parse2XML</c>; an empty string when <c>SysProcessManager</c> is null.</returns>
        public string GetHtml(string url, out HttpStatusCode code,
                              string post = null)
        {
            string result = "";

            HttpHelper.HttpResponse response;
            code = HttpStatusCode.NotFound;

            // Local file: accept lowercase drive letters too, so "c:\..." is not sent out as a web request.
            if (Regex.IsMatch(url, @"^[A-Za-z]:\\"))
            {
                if (File.Exists(url))
                {
                    result = File.ReadAllText(url, AttributeHelper.GetEncoding(this.Http.Encoding));
                    code   = HttpStatusCode.Accepted;
                }
            }
            else
            {
                var mc = extract.Matches(url);
                if (SysProcessManager == null)
                {
                    code = HttpStatusCode.NoContent;
                    return "";
                }
                SetCookie(Http);

                Dictionary<string, string> paramDict = null;
                foreach (Match m in mc)
                {
                    // The template URL is parsed lazily, on the first placeholder only.
                    if (paramDict == null)
                    {
                        paramDict = XPathAnalyzer.ParseUrl(URL);
                    }
                    if (paramDict == null)
                    {
                        break;
                    }

                    string replacement;
                    if (paramDict.TryGetValue(m.Groups[1].Value, out replacement))
                    {
                        url = url.Replace(m.Groups[0].Value, replacement);
                    }
                }

                // Blocking wait kept on purpose: the signature is synchronous and uses an out parameter.
                response = helper.GetHtml(Http, url, post).Result;
                result   = response.Html;
                code     = response.Code;
            }

            result = JavaScriptAnalyzer.Decode(result);
            if (IsSuperMode)
            {
                result = JavaScriptAnalyzer.Parse2XML(result);
            }

            return result;
        }
Ejemplo n.º 10
0
        /// <summary>
        /// Requests the m.weibo.cn container feed for a user and converts its posts to
        /// <see cref="WeiBoContentItem"/> entries.
        /// </summary>
        /// <param name="Uid">Value placed in the <c>value</c> query parameter of the request.</param>
        /// <param name="ContainerId">Value placed in the <c>containerid</c> query parameter of the request.</param>
        /// <param name="TopicFilter">Only posts whose text contains this string are returned. Empty or null keeps every post.</param>
        /// <returns>The matching posts ordered by time, newest first. Empty when the response carries no cards.</returns>
        public static List<WeiBoContentItem> GetWeiboByUid(string Uid, string ContainerId, string TopicFilter = "")
        {
            var res = JavaScriptAnalyzer.Decode(ToolClass.GetAPI($"https://m.weibo.cn/api/container/getIndex?type=uid&value={Uid}&containerid={ContainerId}"));
            var ret = Newtonsoft.Json.JsonConvert.DeserializeObject<WeiBoDirectContentItem.WeiBoDirectRes>(res);

            List<WeiBoContentItem> theres = new List<WeiBoContentItem>();

            var cards = ret?.data?.cards;
            if (cards == null)
            {
                return theres;
            }

            foreach (var card in cards)
            {
                // Cards without a post payload carry nothing to convert.
                var blog = card.mblog;
                if (blog == null)
                {
                    continue;
                }

                HtmlDocument htmlDocument = new HtmlDocument();
                htmlDocument.LoadHtml(blog.text ?? string.Empty);

                WeiBoContentItem item = new WeiBoContentItem
                {
                    Pic        = blog.original_pic,
                    Author     = blog.user?.screen_name,
                    ContentStr = htmlDocument.DocumentNode?.InnerText
                };

                // created_at is either relative ("N分钟前", "N小时前", "昨天 ...", "前天 ...") or an absolute date.
                var created = blog.created_at;
                if (!string.IsNullOrEmpty(created))
                {
                    if (created.Contains("分钟"))
                    {
                        var minutes = Convert.ToInt32(created.Replace("分钟前", ""));
                        item.Time = DateTime.Now.AddMinutes(-minutes);
                    }
                    else if (created.Contains("小时"))
                    {
                        var hours = Convert.ToInt32(created.Replace("小时前", ""));
                        item.Time = DateTime.Now.AddHours(-hours);
                    }
                    else if (created.Contains("昨天"))
                    {
                        item.Time = Convert.ToDateTime(created.Replace("昨天", "").Trim()).AddDays(-1);
                    }
                    else if (created.Contains("前天"))
                    {
                        item.Time = Convert.ToDateTime(created.Replace("前天", "").Trim()).AddDays(-2);
                    }
                    else
                    {
                        item.Time = Convert.ToDateTime(created);
                    }
                }

                theres.Add(item);
            }

            var filter = TopicFilter ?? string.Empty;
            return theres.Where(p => (p.ContentStr ?? string.Empty).Contains(filter))
                         .OrderByDescending(p => p.Time)
                         .ToList();
        }
Ejemplo n.º 11
0
        /// <summary>
        /// Expands each input document into zero or more output documents parsed from the text in <c>Column</c>.
        /// </summary>
        /// <param name="datas">Input documents; entries whose <c>Column</c> value is null or empty are skipped.</param>
        /// <param name="analyzer">Not used by this method.</param>
        /// <returns>A lazily evaluated sequence: each parsed document merged with its source document through
        /// <c>MergeQuery(data, NewColumn)</c>.</returns>
        /// <remarks>
        /// With <c>crawlerEnabled</c> set, the text is converted by <c>JavaScriptAnalyzer.Json2XML</c> and, when it
        /// is real JSON, fed to <c>selector.CrawlHtmlData</c>. Otherwise it is deserialized with <c>serialier</c>
        /// and expanded by <c>ScriptHelper.ToDocuments</c>.
        /// </remarks>
        public override IEnumerable<IFreeDocument> TransformManyData(IEnumerable<IFreeDocument> datas, AnalyzeItem analyzer)
        {
            foreach (var data in datas)
            {
                // A missing column value is skipped like an empty one instead of throwing on ToString().
                var raw = data[Column];
                var itemstr = raw == null ? null : raw.ToString();
                if (string.IsNullOrEmpty(itemstr))
                {
                    continue;
                }

                lastData = itemstr;
                if (crawlerEnabled)
                {
                    bool isrealjson;
                    var html = JavaScriptAnalyzer.Json2XML(itemstr, out isrealjson, true);
                    if (isrealjson)
                    {
                        HtmlDocument htmldoc;
                        var docs = selector.CrawlHtmlData(html, out htmldoc);

                        // Guard against a null result before enumerating.
                        if (docs != null)
                        {
                            foreach (var extracted in docs)
                            {
                                yield return extracted.MergeQuery(data, NewColumn);
                            }
                        }
                    }
                    continue;
                }

                dynamic d = null;
                try
                {
                    d = serialier.DeserializeObject(itemstr);
                }
                catch (Exception)
                {
                    // Deliberate best effort: input that cannot be deserialized is skipped silently.
                    continue;
                }

                foreach (var parsed in ScriptHelper.ToDocuments(d))
                {
                    var parsedDoc = parsed as FreeDocument;
                    yield return parsedDoc.MergeQuery(data, NewColumn);
                }
            }
        }
Ejemplo n.º 12
0
        /// <summary>
        /// Gets the topic id for <paramref name="topicName"/> from the s.weibo.com search page.
        /// </summary>
        /// <param name="topicName">Topic text; it is concatenated into the request URL as-is (no escaping is applied here).</param>
        /// <returns>The part of the <c>action-data</c> attribute after its last ':' (the whole value when it has
        /// no ':'), taken from the first <c>a</c> element whose class is exactly <c>W_btn_b6</c>. Null when that
        /// element or attribute is missing.</returns>
        public static string GetWeiBoTopicId(string topicName)
        {
            HtmlWeb webClient = new HtmlWeb();
            HtmlDocument doc = webClient.Load("https://s.weibo.com/weibo/" + topicName + "&Refer=weibo_weibo&xsort=time&realtimeweibo=1");

            // NOTE(review): the original decoded the page HTML into a local that was never used; the query
            // below still runs against the undecoded document - confirm whether decoding was meant to apply.

            // SelectSingleNode returns null when nothing matches, so the missing-element case no longer throws.
            var button = doc.DocumentNode.SelectSingleNode("//a[@class='W_btn_b6']");
            if (button == null)
            {
                return null;
            }

            var actionData = button.Attributes["action-data"];
            if (actionData == null || actionData.Value == null)
            {
                return null;
            }

            string topicUrl = actionData.Value;
            return topicUrl.Substring(topicUrl.LastIndexOf(':') + 1);
        }
Ejemplo n.º 13
0
        /// <summary>
        /// Fiddler callback run for a completed session. When the decoded response body contains
        /// <c>SelectText</c>, the request is copied into <c>Http</c>, the capture is logged and
        /// <c>SniffSucceed</c> is raised.
        /// </summary>
        /// <param name="oSession">The completed session. It is ignored when it has no request headers, when
        /// <c>SelectText</c> is blank, or when the response does not contain <c>SelectText</c>.</param>
        private void FiddlerApplicationAfterSessionComplete(Session oSession)
        {
            if (oSession.oRequest.headers == null)
            {
                return;
            }

            var httpitem = new HttpItem { Parameters = oSession.oRequest.headers.ToString() };

            XLogSys.Print.Debug("visiting... " + oSession.url);

            var isHttps = (oSession.BitFlags & SessionFlags.IsHTTPS) != 0;
            httpitem.URL = (isHttps ? "https://" : "http://") + oSession.url;

            if (oSession.RequestMethod.ToLower() == "post")
            {
                httpitem.Method = MethodType.POST;
            }

            httpitem.Postdata = Encoding.Default.GetString(oSession.RequestBody);

            // Without a search text there is nothing to sniff for.
            if (string.IsNullOrWhiteSpace(SelectText))
            {
                return;
            }

            var body = JavaScriptAnalyzer.Decode(oSession.GetResponseBodyAsString());
            if (!body.Contains(SelectText))
            {
                return;
            }

            if (ConfigFile.Config.Get<bool>("AutoStartStopFiddler"))
            {
                StopVisit();
            }

            httpitem.DictCopyTo(Http);

            var post = Http.Method == MethodType.POST
                ? "POST content is:\n" + httpitem.Postdata + "\n"
                : "";

            // The main window is made topmost while the message is logged and released afterwards.
            var window = MainFrm as Window;
            ControlExtended.UIInvoke(() =>
            {
                if (window != null)
                {
                    window.Topmost = true;
                }
            });

            XLogSys.Print.Info(GlobalHelper.FormatArgs("success_get", oSession.url, Http.Method, post));

            ControlExtended.UIInvoke(() =>
            {
                if (window != null)
                {
                    window.Topmost = false;
                }
            });

            SniffSucceed?.Invoke(this, new EventArgs());
            URL = oSession.url;
        }
Ejemplo n.º 14
0
        /// <summary>
        /// Returns the decoded content of <paramref name="url"/>, read from disk when it looks like a drive-letter
        /// path and requested through <c>helper.GetHtml</c> otherwise.
        /// </summary>
        /// <param name="url">A path starting with a drive letter (<c>X:\</c>, either case) or an address to request.
        /// For addresses, every match of the <c>extract</c> pattern is replaced by the value that
        /// <c>XPathAnalyzer.ParseUrl(URL)</c> holds for the key captured in group 1, when such a key exists.</param>
        /// <param name="code"><see cref="HttpStatusCode.Accepted"/> for a file that was read,
        /// <see cref="HttpStatusCode.NotFound"/> for a missing file, <see cref="HttpStatusCode.NoContent"/> when
        /// <c>SysProcessManager</c> is null, otherwise the code carried by the response.</param>
        /// <param name="post">Optional request body, forwarded to <c>helper.GetHtml</c> unchanged.</param>
        /// <returns>The text passed through <c>JavaScriptAnalyzer.Decode</c> and, when <c>IsSuperMode</c> is set,
        /// <c>JavaScriptAnalyzer.Parse2XML</c>; an empty string when <c>SysProcessManager</c> is null.</returns>
        public string GetHtml(string url, out HttpStatusCode code,
                              string post = null)
        {
            string result = "";

            HttpHelper.HttpResponse response;
            code = HttpStatusCode.NotFound;

            // Local file: accept lowercase drive letters too, so "c:\..." is not sent out as a web request.
            if (Regex.IsMatch(url, @"^[A-Za-z]:\\"))
            {
                if (File.Exists(url))
                {
                    result = File.ReadAllText(url, AttributeHelper.GetEncoding(this.Http.Encoding));
                    code   = HttpStatusCode.Accepted;
                }
            }
            else
            {
                var mc = extract.Matches(url);
                if (SysProcessManager == null)
                {
                    code = HttpStatusCode.NoContent;
                    return "";
                }

                // Borrow the proxy address and cookie of the crawler task selected in ShareCookie, when one is found.
                var crawler = this.SysProcessManager.GetTask<SmartCrawler>(ShareCookie.SelectItem);
                if (crawler != null)
                {
                    Http.ProxyIP = crawler.Http.ProxyIP;
                    if (Http.Parameters != crawler.Http.Parameters)
                    {
                        var cookie = crawler.Http.GetHeaderParameter().Get<string>("Cookie");
                        if (!string.IsNullOrWhiteSpace(cookie))
                        {
                            Http.SetValue("Cookie", cookie);
                        }
                    }
                }

                Dictionary<string, string> paradict = null;
                foreach (Match m in mc)
                {
                    // The template URL is parsed lazily, on the first placeholder only.
                    if (paradict == null)
                    {
                        paradict = XPathAnalyzer.ParseUrl(URL);
                    }
                    if (paradict == null)
                    {
                        break;
                    }

                    string replacement;
                    if (paradict.TryGetValue(m.Groups[1].Value, out replacement))
                    {
                        url = url.Replace(m.Groups[0].Value, replacement);
                    }
                }

                // Blocking wait kept on purpose: the signature is synchronous and uses an out parameter.
                response = helper.GetHtml(Http, url, post).Result;
                result   = response.Html;
                code     = response.Code;
            }

            result = JavaScriptAnalyzer.Decode(result);
            if (IsSuperMode)
            {
                result = JavaScriptAnalyzer.Parse2XML(result);
            }

            return result;
        }