Beispiel #1
0
        /// <summary>
        /// Manual smoke run of the XPath analysis pipeline against <c>url</c>:
        /// loads the document, extracts data straight from the URL, searches
        /// the document for property lists and re-extracts data with the first
        /// list found. Nothing is asserted and nothing is returned.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the first search result is dereferenced without a
        /// check. If the search can yield an empty sequence of a reference
        /// type, this throws <see cref="NullReferenceException"/> — confirm.
        /// </remarks>
        private static void unitTest()
        {
            var document = XPathAnalyzer.GetHtmlDocument(url);

            // Extraction directly from the URL; the result is overwritten below.
            var rows = XPathAnalyzer.GetDataFromURL(url);

            var candidates = document.DocumentNode.SearchPropertiesSmartList();
            var firstCandidate = candidates.FirstOrDefault();

            // Extract again, this time using the items of the first candidate.
            rows = document.DocumentNode.GetDataFromXPath(firstCandidate.CrawItems).ToList();
        }
Beispiel #2
0
        /// <summary>
        /// Checks the XPath analysis pipeline against <c>url</c>: the document
        /// loads, direct extraction yields more than 10 rows, the property-list
        /// search finds at least one candidate, and extracting with the first
        /// candidate's items again yields more than 10 rows.
        /// </summary>
        public void XPathTest()
        {
            var document = XPathAnalyzer.GetHtmlDocument(url);
            Assert.IsTrue(document != null);

            // Direct extraction from the URL.
            var rows = XPathAnalyzer.GetDataFromURL(url);
            Assert.IsTrue(rows != null && rows.Count > 10);

            // Search for property lists; at least one must be found.
            var candidates = document.DocumentNode.SearchPropertiesSmartList();
            Assert.IsTrue(candidates.Any());

            // Re-extract using the items of the first candidate.
            var firstCandidate = candidates.FirstOrDefault();
            rows = document.DocumentNode.GetDataFromXPath(firstCandidate.CrawItems).ToList();
            Assert.IsTrue(rows != null && rows.Count > 10);
        }
Beispiel #3
0
        /// <summary>
        /// Downloads <paramref name="url"/> and extracts documents from the
        /// returned HTML.
        /// </summary>
        /// <param name="url">
        /// Address to fetch. Every match of the <c>extract</c> pattern in it is
        /// replaced by the same-named value parsed from <c>URL</c>, when one exists.
        /// </param>
        /// <param name="doc">
        /// Receives the parsed HTML document; left empty when the request fails.
        /// </param>
        /// <param name="code">Receives the status code reported by the HTTP helper.</param>
        /// <param name="post">Optional request body passed through to the HTTP helper.</param>
        /// <returns>
        /// The documents extracted by <c>CrawlData(doc)</c>, or an empty list
        /// when <c>HttpHelper.IsSuccess</c> rejects the status code.
        /// </returns>
        public List <FreeDocument> CrawlData(string url, out HtmlDocument doc, out HttpStatusCode code, string post = null)
        {
            // Parsed lazily: URL is only analysed when the address contains a placeholder.
            Dictionary <string, string> paradict = null;

            foreach (Match m in extract.Matches(url))
            {
                if (paradict == null)
                {
                    paradict = XPathAnalyzer.ParseUrl(URL);
                }
                if (paradict == null)
                {
                    // Nothing to substitute from; keep the address as it is.
                    break;
                }

                // Group 1 is the placeholder name, group 0 the whole placeholder text.
                string replacement;
                if (paradict.TryGetValue(m.Groups[1].Value, out replacement))
                {
                    url = url.Replace(m.Groups[0].Value, replacement);
                }
            }

            var content = helper.GetHtml(Http, out code, url, post);

            doc = new HtmlDocument();
            if (!HttpHelper.IsSuccess(code))
            {
                XLogSys.Print.WarnFormat("HTML Fail,Code:{0},url:{1}", code, url);
                return(new List <FreeDocument>());
            }

            doc.LoadHtml(content);
            var datas = CrawlData(doc);

            if (datas.Count == 0)
            {
                XLogSys.Print.DebugFormat("HTML extract Fail,url:{0}", url);
            }
            return(datas);
        }
Beispiel #4
0
        /// <summary>
        /// Adds a new crawl item built from the current selection
        /// (<c>SelectXPath</c>, <c>SelectName</c>, <c>SelectText</c>) to
        /// <c>CrawlItems</c>.
        /// </summary>
        /// <remarks>
        /// When <c>RootXPath</c> is set, both the root and the selected node
        /// must resolve in <c>HtmlDoc</c>; for XPath selectors the stored path
        /// is then rewritten relative to the parent of the root node. If an
        /// item with the same name already exists the user is asked to confirm.
        /// </remarks>
        /// <param name="isAlert">
        /// When true, the user is asked for confirmation if the
        /// <c>IsAncestor</c> check between the selected node and the root
        /// parent fails.
        /// </param>
        private void AddNewItem(bool isAlert = true)
        {
            var path     = SelectXPath;
            var rootPath = RootXPath;

            if (!string.IsNullOrEmpty(rootPath))
            {
                // An invalid root expression is caught here and reported through the log.
                HtmlNode root = null;
                try
                {
                    root = HtmlDoc.DocumentNode.SelectSingleNodePlus(rootPath, RootFormat);
                }
                catch (Exception)
                {
                    XLogSys.Print.Error(string.Format(GlobalHelper.Get("key_662"), RootXPath, RootFormat));
                }
                if (!(root != null).SafeCheck(string.Format(GlobalHelper.Get("key_663"), RootFormat, RootXPath)))
                {
                    return;
                }

                // Reuse the node located above instead of querying the document a
                // second time outside the try block. The null-conditional keeps the
                // original result should SafeCheck ever let a null root through.
                root = root?.ParentNode;

                HtmlNode node = null;
                if (
                    !ControlExtended.SafeInvoke(() => HtmlDoc.DocumentNode.SelectSingleNodePlus(path, SearchFormat),
                                                ref node,
                                                LogType.Info, GlobalHelper.Get("key_664"), true))

                {
                    return;
                }
                if (!(node != null).SafeCheck(GlobalHelper.Get("key_665")))
                {
                    return;
                }

                if (!node.IsAncestor(root) && isAlert)
                {
                    if (
                        MessageBox.Show(GlobalHelper.Get("key_666"), GlobalHelper.Get("key_99"), MessageBoxButton.YesNo) ==
                        MessageBoxResult.No)
                    {
                        return;
                    }
                }
                string attr      = "";
                string attrValue = "";
                XPathAnalyzer.GetAttribute(path, out attr, out attrValue);
                if (SearchFormat == SelectorFormat.XPath)
                {
                    // NOTE(review): root is null here when the located root node has
                    // no parent; root.XPath would then throw — confirm whether that
                    // case can occur.
                    path = XPath.TakeOffPlus(node.XPath, root.XPath);
                    if (attr != "")
                    {
                        // Re-append the attribute selector taken from the original path.
                        path += "/@" + attr + "[1]";
                    }
                }
            }

            // Add directly when the name is new; otherwise only after confirmation.
            if (!CrawlItems.Any(d => d.Name == SelectName) ||
                MessageBox.Show(GlobalHelper.Get("add_column_sure"), GlobalHelper.Get("key_99"), MessageBoxButton.OKCancel) == MessageBoxResult.OK)
            {
                var item = new CrawlItem {
                    XPath = path, Name = SelectName, SampleData1 = SelectText
                };
                item.Format = SearchFormat;
                CrawlItems.Add(item);

                // Clear the selection for the next item.
                SelectXPath = "";
                SelectName  = "";

                XLogSys.Print.Info(GlobalHelper.Get("key_668"));
            }
        }
Beispiel #5
0
        /// <summary>
        /// Downloads <paramref name="url"/> and extracts documents from the
        /// returned HTML, optionally borrowing request headers from another
        /// crawler first.
        /// </summary>
        /// <remarks>
        /// When <c>Crawler</c> names a <c>SmartCrawler</c> in the current
        /// process collection, its Cookie, Host and Referer headers are copied
        /// into this instance's <c>Http.Parameters</c> before the request.
        /// </remarks>
        /// <param name="url">
        /// Address to fetch. Every match of the <c>extract</c> pattern in it is
        /// replaced by the same-named value parsed from <c>URL</c>, when one exists.
        /// </param>
        /// <param name="doc">
        /// Receives the parsed HTML document; left empty when the request fails.
        /// </param>
        /// <param name="code">Receives the status code reported by the HTTP helper.</param>
        /// <param name="post">Optional request body passed through to the HTTP helper.</param>
        /// <returns>
        /// The documents extracted by <c>CrawlData(doc)</c>, or an empty list
        /// when <c>HttpHelper.IsSuccess</c> rejects the status code.
        /// </returns>
        public List<FreeDocument> CrawlData(string url, out HtmlDocument doc, out HttpStatusCode code,
            string post = null)
        {
            // Parsed lazily: URL is only analysed when the address contains a placeholder.
            Dictionary<string, string> paradict = null;
            foreach (Match m in extract.Matches(url))
            {
                if (paradict == null)
                {
                    paradict = XPathAnalyzer.ParseUrl(URL);
                }
                if (paradict == null)
                {
                    // Nothing to substitute from; keep the address as it is.
                    break;
                }

                // Group 1 is the placeholder name, group 0 the whole placeholder text.
                string replacement;
                if (paradict.TryGetValue(m.Groups[1].Value, out replacement))
                {
                    url = url.Replace(m.Groups[0].Value, replacement);
                }
            }

            if (!string.IsNullOrEmpty(Crawler))
            {
                var crawler =
                    SysProcessManager.CurrentProcessCollections.FirstOrDefault(d => d.Name == Crawler) as SmartCrawler;
                var header = crawler?.Http.GetHeaderParameter();

                if (header != null)
                {
                    var myheader = Http.GetHeaderParameter();

                    // Copy only the headers the other crawler actually defines;
                    // a null value is skipped rather than dereferenced.
                    foreach (var name in new[] { "Cookie", "Host", "Referer" })
                    {
                        object value;
                        if (header.TryGetValue(name, out value) && value != null)
                        {
                            myheader[name] = value.ToString();
                        }
                    }
                    Http.Parameters = HttpItem.HeaderToString(myheader);
                }
            }

            var content = helper.GetHtml(Http, out code, url, post);
            doc = new HtmlDocument();
            if (!HttpHelper.IsSuccess(code))
            {
                XLogSys.Print.WarnFormat("HTML Fail,Code:{0},url:{1}", code, url);
                return new List<FreeDocument>();
            }

            doc.LoadHtml(content);
            var datas = CrawlData(doc);
            if (datas.Count == 0)
            {
                XLogSys.Print.DebugFormat("HTML extract Fail,url:{0}", url);
            }

            return datas;
        }
Beispiel #6
0
        /// <summary>
        /// Returns the content behind <paramref name="url"/>, read either from
        /// a local file or through the HTTP helper, after passing it through
        /// <c>JavaScriptAnalyzer.Decode</c>.
        /// </summary>
        /// <param name="url">
        /// A Windows path starting with a drive letter (<c>C:\...</c>, any
        /// letter case) or an address to request. For requests, every match of
        /// the <c>extract</c> pattern is replaced by the same-named value
        /// parsed from <c>URL</c>, when one exists.
        /// </param>
        /// <param name="code">
        /// Receives <c>Accepted</c> when a local file was read, <c>NotFound</c>
        /// when the local file does not exist, <c>NoContent</c> when no process
        /// manager is available, otherwise the code of the HTTP response.
        /// </param>
        /// <param name="post">Optional request body passed through to the HTTP helper.</param>
        /// <returns>
        /// The decoded content, additionally converted by
        /// <c>JavaScriptAnalyzer.Parse2XML</c> when <c>IsSuperMode</c> is set;
        /// an empty string when no process manager is available.
        /// </returns>
        public string GetHtml(string url, out HttpStatusCode code,
                              string post = null)
        {
            string result = "";

            code = HttpStatusCode.NotFound;
            // Local file: accept lowercase drive letters as well as uppercase ones.
            if (Regex.IsMatch(url, @"^[A-Za-z]:\\"))
            {
                if (File.Exists(url))
                {
                    result = File.ReadAllText(url, AttributeHelper.GetEncoding(this.Http.Encoding));
                    code   = HttpStatusCode.Accepted;
                }
            }
            else
            {
                var mc = extract.Matches(url);
                if (SysProcessManager == null)
                {
                    code = HttpStatusCode.NoContent;
                    return("");
                }

                // Borrow proxy and cookie from the crawler selected in ShareCookie, if any.
                var crawler = this.SysProcessManager.GetTask <SmartCrawler>(ShareCookie.SelectItem);
                if (crawler != null)
                {
                    Http.ProxyIP = crawler.Http.ProxyIP;
                    if (Http.Parameters != crawler.Http.Parameters)
                    {
                        var cookie = crawler.Http.GetHeaderParameter().Get <string>("Cookie");
                        if (string.IsNullOrWhiteSpace(cookie) == false)
                        {
                            Http.SetValue("Cookie", cookie);
                        }
                    }
                }

                // Parsed lazily: URL is only analysed when the address contains a placeholder.
                Dictionary <string, string> paradict = null;
                foreach (Match m in mc)
                {
                    if (paradict == null)
                    {
                        paradict = XPathAnalyzer.ParseUrl(URL);
                    }
                    if (paradict == null)
                    {
                        break;
                    }

                    // Group 1 is the placeholder name, group 0 the whole placeholder text.
                    string replacement;
                    if (paradict.TryGetValue(m.Groups[1].Value, out replacement))
                    {
                        url = url.Replace(m.Groups[0].Value, replacement);
                    }
                }

                // NOTE(review): .Result waits synchronously on the helper call; the
                // synchronous signature with an out parameter prevents awaiting here.
                HttpHelper.HttpResponse response = helper.GetHtml(Http, url, post).Result;
                result = response.Html;
                code   = response.Code;
            }

            result = JavaScriptAnalyzer.Decode(result);
            if (IsSuperMode)
            {
                result = JavaScriptAnalyzer.Parse2XML(result);
            }

            return(result);
        }
Beispiel #7
0
        /// <summary>
        /// Adds a new crawl item built from the current selection
        /// (<c>SelectXPath</c>, <c>SelectName</c>, <c>SelectText</c>) to
        /// <c>CrawlItems</c>.
        /// </summary>
        /// <remarks>
        /// When <c>RootXPath</c> is set, both the root and the selected node
        /// must resolve in <c>HtmlDoc</c>; for XPath selectors the stored path
        /// is then rewritten relative to the parent of the root node. If an
        /// item with the same name already exists the user is asked to confirm.
        /// </remarks>
        /// <param name="isAlert">
        /// When true, the user is asked for confirmation if the
        /// <c>IsAncestor</c> check between the selected node and the root
        /// parent fails.
        /// </param>
        private void AddNewItem(bool isAlert = true)
        {
            var path     = SelectXPath;
            var rootPath = RootXPath;

            if (!string.IsNullOrEmpty(rootPath))
            {
                // An invalid root expression is caught here and reported through the log.
                HtmlNode root = null;
                try
                {
                    root = HtmlDoc.DocumentNode.SelectSingleNodePlus(rootPath, RootFormat);
                }
                catch (Exception)
                {
                    XLogSys.Print.Error($"{RootXPath}  不能被识别为正确的{RootFormat}表达式,请检查");
                }
                if (!(root != null).SafeCheck($"使用当前父节点{RootFormat} {RootXPath},在文档中找不到任何父节点"))
                {
                    return;
                }

                // Reuse the node located above instead of querying the document a
                // second time outside the try block. The null-conditional keeps the
                // original result should SafeCheck ever let a null root through.
                root = root?.ParentNode;

                HtmlNode node = null;
                if (
                    !ControlExtended.SafeInvoke(() => HtmlDoc.DocumentNode.SelectSingleNodePlus(path, SearchFormat),
                                                ref node,
                                                LogType.Info, "检查子节点XPath正确性", true))

                {
                    return;
                }
                if (!(node != null).SafeCheck("使用当前子节点XPath,在文档中找不到任何子节点"))
                {
                    return;
                }

                if (!node.IsAncestor(root) && isAlert)
                {
                    if (
                        MessageBox.Show("当前XPath所在节点不是父节点的后代,请检查对应的XPath,是否依然要添加?", "提示信息", MessageBoxButton.YesNo) ==
                        MessageBoxResult.No)
                    {
                        return;
                    }
                }
                string attr      = "";
                string attrValue = "";
                XPathAnalyzer.GetAttribute(path, out attr, out attrValue);
                if (SearchFormat == SelectorFormat.XPath)
                {
                    // NOTE(review): root is null here when the located root node has
                    // no parent; root.XPath would then throw — confirm whether that
                    // case can occur.
                    path = XPath.TakeOffPlus(node.XPath, root.XPath);
                    if (attr != "")
                    {
                        // Re-append the attribute selector taken from the original path.
                        path += "/@" + attr + "[1]";
                    }
                }
            }

            // Add directly when the name is new; otherwise only after confirmation.
            if (!CrawlItems.Any(d => d.Name == SelectName) ||
                MessageBox.Show("已经存在同名的属性,是否依然添加?", "提示信息", MessageBoxButton.OKCancel) == MessageBoxResult.OK)
            {
                var item = new CrawlItem {
                    XPath = path, Name = SelectName, SampleData1 = SelectText
                };
                item.Format = SearchFormat;
                CrawlItems.Add(item);

                // Clear the selection for the next item.
                SelectXPath = "";
                SelectName  = "";

                XLogSys.Print.Info("成功添加属性");
            }
        }