/// <summary>
/// Ad-hoc smoke run of the XPath analysis pipeline against <c>url</c>.
/// Nothing is asserted or returned; the calls are made only so that a failure surfaces as an exception.
/// </summary>
private static void unitTest()
{
    // Load the page and run the URL-level extraction.
    var document = XPathAnalyzer.GetHtmlDocument(url);
    var extracted = XPathAnalyzer.GetDataFromURL(url);

    // Pick the first candidate produced by the smart property search.
    // NOTE(review): if the search yields nothing, the dereference below presumably throws
    // NullReferenceException - confirm whether that is the intended failure signal.
    var firstCandidate = document.DocumentNode.SearchPropertiesSmartList().FirstOrDefault();

    // Re-run the extraction using that candidate's items.
    extracted = document.DocumentNode.GetDataFromXPath(firstCandidate.CrawItems).ToList();
}
/// <summary>
/// End-to-end check of the XPath analysis pipeline against <c>url</c>: the document loads,
/// URL-level extraction yields more than 10 rows, the smart property search finds at least one
/// candidate, and extracting with the first candidate's items also yields more than 10 rows.
/// </summary>
public void XPathTest()
{
    var doc = XPathAnalyzer.GetHtmlDocument(url);
    Assert.IsNotNull(doc, "GetHtmlDocument returned null");

    // Null and row-count are asserted separately so a failure names the actual cause.
    var datas = XPathAnalyzer.GetDataFromURL(url);
    Assert.IsNotNull(datas, "GetDataFromURL returned null");
    Assert.IsTrue(datas.Count > 10, "GetDataFromURL returned 10 rows or fewer");

    var properties = doc.DocumentNode.SearchPropertiesSmartList();
    Assert.IsTrue(properties.Any(), "SearchPropertiesSmartList found no candidates");

    // Fail with an assertion instead of a NullReferenceException if the first candidate is null.
    var firstOrDefault = properties.FirstOrDefault();
    Assert.IsNotNull(firstOrDefault, "first candidate from SearchPropertiesSmartList is null");

    // ToList() never returns null, so only the row count needs checking here.
    datas = doc.DocumentNode.GetDataFromXPath(firstOrDefault.CrawItems).ToList();
    Assert.IsTrue(datas.Count > 10, "GetDataFromXPath returned 10 rows or fewer");
}
/// <summary>
/// Downloads <paramref name="url"/> and extracts data from the resulting HTML.
/// </summary>
/// <param name="url">
/// Address to fetch. Every match of the <c>extract</c> pattern in it is replaced by the value that
/// <c>XPathAnalyzer.ParseUrl(URL)</c> provides for the key captured in group 1, when such a key exists.
/// </param>
/// <param name="doc">
/// Receives a new <c>HtmlDocument</c>; it is only loaded with the downloaded content when the request succeeded.
/// </param>
/// <param name="code">Receives the status code reported by <c>helper.GetHtml</c>.</param>
/// <param name="post">Optional request body forwarded unchanged to <c>helper.GetHtml</c>.</param>
/// <returns>
/// The rows produced by <c>CrawlData(doc)</c>, or an empty list when <c>HttpHelper.IsSuccess</c> rejects the status code.
/// </returns>
public List<FreeDocument> CrawlData(string url, out HtmlDocument doc, out HttpStatusCode code, string post = null)
{
    // The parameter table comes from the configured URL property (not the argument) and is
    // parsed lazily, at most once, only if the argument actually contains a placeholder.
    Dictionary<string, string> paradict = null;
    foreach (Match m in extract.Matches(url))
    {
        if (paradict == null)
        {
            paradict = XPathAnalyzer.ParseUrl(URL);
            if (paradict == null)
            {
                break;
            }
        }

        // Single lookup instead of ContainsKey followed by the indexer.
        string replacement;
        if (paradict.TryGetValue(m.Groups[1].Value, out replacement))
        {
            url = url.Replace(m.Groups[0].Value, replacement);
        }
    }

    var content = helper.GetHtml(Http, out code, url, post);
    doc = new HtmlDocument();
    if (!HttpHelper.IsSuccess(code))
    {
        XLogSys.Print.WarnFormat("HTML Fail,Code:{0},url:{1}", code, url);
        return new List<FreeDocument>();
    }

    doc.LoadHtml(content);
    var datas = CrawlData(doc);
    if (datas.Count == 0)
    {
        // The page loaded but nothing was extracted; logged at debug level only.
        XLogSys.Print.DebugFormat("HTML extract Fail,url:{0}", url);
    }

    return datas;
}
/// <summary>
/// Adds the current selection (<c>SelectXPath</c>, <c>SelectName</c>, <c>SelectText</c>) to
/// <c>CrawlItems</c> as a new <c>CrawlItem</c>. When <c>RootXPath</c> is set, the selection is first
/// validated against the document and, for XPath selectors, rewritten relative to the root node's parent.
/// </summary>
/// <param name="isAlert">
/// When true, the user is asked for confirmation if <c>node.IsAncestor(root)</c> is false;
/// when false, that confirmation is skipped and the item is added regardless.
/// </param>
private void AddNewItem(bool isAlert = true)
{
    var path = SelectXPath;
    var rootPath = RootXPath;
    if (!string.IsNullOrEmpty(rootPath))
    {
        HtmlNode root = null;
        try
        {
            root = HtmlDoc.DocumentNode.SelectSingleNodePlus(rootPath, RootFormat);
        }
        catch (Exception)
        {
            // A root expression the selector cannot evaluate is logged; root stays null
            // and is then rejected by the check that follows.
            XLogSys.Print.Error(string.Format(GlobalHelper.Get("key_662"), RootXPath, RootFormat));
        }

        if (!(root != null).SafeCheck(string.Format(GlobalHelper.Get("key_663"), RootFormat, RootXPath)))
        {
            return;
        }

        // Reuse the node that was just validated instead of querying the document a second
        // time outside the try/catch. From here on "root" is the parent of the matched node.
        root = root.ParentNode;

        HtmlNode node = null;
        if (!ControlExtended.SafeInvoke(
                () => HtmlDoc.DocumentNode.SelectSingleNodePlus(path, SearchFormat),
                ref node, LogType.Info, GlobalHelper.Get("key_664"), true))
        {
            return;
        }

        if (!(node != null).SafeCheck(GlobalHelper.Get("key_665")))
        {
            return;
        }

        // Only prompt when the relationship check fails and the caller asked for alerts.
        if (!node.IsAncestor(root) && isAlert
            && MessageBox.Show(GlobalHelper.Get("key_666"), GlobalHelper.Get("key_99"),
                               MessageBoxButton.YesNo) == MessageBoxResult.No)
        {
            return;
        }

        // The attribute is read from the original selection before "path" is rewritten below.
        string attr;
        string attrValue;
        XPathAnalyzer.GetAttribute(path, out attr, out attrValue);
        if (SearchFormat == SelectorFormat.XPath)
        {
            path = XPath.TakeOffPlus(node.XPath, root.XPath);
            if (attr != "")
            {
                path += "/@" + attr + "[1]";
            }
        }
    }

    // Add directly when the name is unused; otherwise add only if the user confirms the duplicate.
    var nameIsFree = CrawlItems.FirstOrDefault(d => d.Name == SelectName) == null;
    if (nameIsFree
        || MessageBox.Show(GlobalHelper.Get("add_column_sure"), GlobalHelper.Get("key_99"),
                           MessageBoxButton.OKCancel) == MessageBoxResult.OK)
    {
        var item = new CrawlItem
        {
            XPath = path,
            Name = SelectName,
            SampleData1 = SelectText
        };
        item.Format = SearchFormat;
        CrawlItems.Add(item);

        // Clear the selection so the next item starts from an empty form.
        SelectXPath = "";
        SelectName = "";
        XLogSys.Print.Info(GlobalHelper.Get("key_668"));
    }
}
/// <summary>
/// Downloads <paramref name="url"/> and extracts data from the resulting HTML, optionally borrowing
/// the Cookie, Host and Referer headers from the crawler named by <c>Crawler</c>.
/// </summary>
/// <param name="url">
/// Address to fetch. Every match of the <c>extract</c> pattern in it is replaced by the value that
/// <c>XPathAnalyzer.ParseUrl(URL)</c> provides for the key captured in group 1, when such a key exists.
/// </param>
/// <param name="doc">
/// Receives a new <c>HtmlDocument</c>; it is only loaded with the downloaded content when the request succeeded.
/// </param>
/// <param name="code">Receives the status code reported by <c>helper.GetHtml</c>.</param>
/// <param name="post">Optional request body forwarded unchanged to <c>helper.GetHtml</c>.</param>
/// <returns>
/// The rows produced by <c>CrawlData(doc)</c>, or an empty list when <c>HttpHelper.IsSuccess</c> rejects the status code.
/// </returns>
public List<FreeDocument> CrawlData(string url, out HtmlDocument doc, out HttpStatusCode code, string post = null)
{
    // The parameter table comes from the configured URL property (not the argument) and is
    // parsed lazily, at most once, only if the argument actually contains a placeholder.
    Dictionary<string, string> paradict = null;
    foreach (Match m in extract.Matches(url))
    {
        if (paradict == null)
        {
            paradict = XPathAnalyzer.ParseUrl(URL);
            if (paradict == null)
            {
                break;
            }
        }

        // Single lookup instead of ContainsKey followed by the indexer.
        string replacement;
        if (paradict.TryGetValue(m.Groups[1].Value, out replacement))
        {
            url = url.Replace(m.Groups[0].Value, replacement);
        }
    }

    if (!string.IsNullOrEmpty(Crawler))
    {
        var crawler =
            SysProcessManager.CurrentProcessCollections.FirstOrDefault(d => d.Name == Crawler) as SmartCrawler;
        var header = crawler?.Http.GetHeaderParameter();
        if (header != null)
        {
            var myheader = Http.GetHeaderParameter();

            // Copy the session-related headers from the shared crawler. A header that is present
            // but holds null is skipped rather than dereferenced.
            object value;
            foreach (var name in new[] { "Cookie", "Host", "Referer" })
            {
                if (header.TryGetValue(name, out value) && value != null)
                {
                    myheader[name] = value.ToString();
                }
            }

            Http.Parameters = HttpItem.HeaderToString(myheader);
        }
    }

    var content = helper.GetHtml(Http, out code, url, post);
    doc = new HtmlDocument();
    if (!HttpHelper.IsSuccess(code))
    {
        XLogSys.Print.WarnFormat("HTML Fail,Code:{0},url:{1}", code, url);
        return new List<FreeDocument>();
    }

    doc.LoadHtml(content);
    var datas = CrawlData(doc);
    if (datas.Count == 0)
    {
        // The page loaded but nothing was extracted; logged at debug level only.
        XLogSys.Print.DebugFormat("HTML extract Fail,url:{0}", url);
    }

    return datas;
}
/// <summary>
/// Returns the content behind <paramref name="url"/>, which may be a local Windows file path
/// (drive letter followed by <c>:\</c>) or a remote address fetched through <c>helper</c>.
/// The content is always passed through <c>JavaScriptAnalyzer.Decode</c>, and additionally through
/// <c>JavaScriptAnalyzer.Parse2XML</c> when <c>IsSuperMode</c> is set.
/// </summary>
/// <param name="url">
/// File path or address. For remote addresses, every match of the <c>extract</c> pattern is replaced by
/// the value that <c>XPathAnalyzer.ParseUrl(URL)</c> provides for the key captured in group 1.
/// </param>
/// <param name="code">
/// Receives <c>Accepted</c> for a local file that was read, <c>NotFound</c> for a local file that does
/// not exist, <c>NoContent</c> when <c>SysProcessManager</c> is null, otherwise the response's code.
/// </param>
/// <param name="post">Optional request body forwarded unchanged to <c>helper.GetHtml</c>.</param>
/// <returns>The processed content; an empty string is returned directly when <c>SysProcessManager</c> is null.</returns>
public string GetHtml(string url, out HttpStatusCode code, string post = null)
{
    string result = "";
    code = HttpStatusCode.NotFound;

    // Local file. Drive letters are matched in either case so that "c:\page.html" is not
    // mistaken for a remote address.
    if (Regex.IsMatch(url, @"^[A-Za-z]:\\"))
    {
        if (File.Exists(url))
        {
            result = File.ReadAllText(url, AttributeHelper.GetEncoding(this.Http.Encoding));
            code = HttpStatusCode.Accepted;
        }
    }
    else
    {
        var mc = extract.Matches(url);
        if (SysProcessManager == null)
        {
            code = HttpStatusCode.NoContent;
            return "";
        }

        // Borrow the proxy and, when the header sets differ, the cookie of the crawler
        // selected in ShareCookie.
        var crawler = this.SysProcessManager.GetTask<SmartCrawler>(ShareCookie.SelectItem);
        if (crawler != null)
        {
            Http.ProxyIP = crawler.Http.ProxyIP;
            if (Http.Parameters != crawler.Http.Parameters)
            {
                var cookie = crawler.Http.GetHeaderParameter().Get<string>("Cookie");
                if (!string.IsNullOrWhiteSpace(cookie))
                {
                    Http.SetValue("Cookie", cookie);
                }
            }
        }

        // The parameter table comes from the configured URL property (not the argument) and is
        // parsed lazily, at most once, only if the argument actually contains a placeholder.
        Dictionary<string, string> paradict = null;
        foreach (Match m in mc)
        {
            if (paradict == null)
            {
                paradict = XPathAnalyzer.ParseUrl(URL);
                if (paradict == null)
                {
                    break;
                }
            }

            // Single lookup instead of ContainsKey followed by the indexer.
            string replacement;
            if (paradict.TryGetValue(m.Groups[1].Value, out replacement))
            {
                url = url.Replace(m.Groups[0].Value, replacement);
            }
        }

        // NOTE(review): .Result blocks on what is presumably a Task; it is kept because this
        // method is synchronous by signature and callers may rely on the exception type it surfaces.
        HttpHelper.HttpResponse response = helper.GetHtml(Http, url, post).Result;
        result = response.Html;
        code = response.Code;
    }

    result = JavaScriptAnalyzer.Decode(result);
    if (IsSuperMode)
    {
        result = JavaScriptAnalyzer.Parse2XML(result);
    }

    return result;
}
/// <summary>
/// Adds the current selection (<c>SelectXPath</c>, <c>SelectName</c>, <c>SelectText</c>) to
/// <c>CrawlItems</c> as a new <c>CrawlItem</c>. When <c>RootXPath</c> is set, the selection is first
/// validated against the document and, for XPath selectors, rewritten relative to the root node's parent.
/// </summary>
/// <param name="isAlert">
/// When true, the user is asked for confirmation if <c>node.IsAncestor(root)</c> is false;
/// when false, that confirmation is skipped and the item is added regardless.
/// </param>
private void AddNewItem(bool isAlert = true)
{
    var path = SelectXPath;
    var rootPath = RootXPath;
    if (!string.IsNullOrEmpty(rootPath))
    {
        HtmlNode root = null;
        try
        {
            root = HtmlDoc.DocumentNode.SelectSingleNodePlus(rootPath, RootFormat);
        }
        catch (Exception)
        {
            // A root expression the selector cannot evaluate is logged; root stays null
            // and is then rejected by the check that follows.
            XLogSys.Print.Error($"{RootXPath} 不能被识别为正确的{RootFormat}表达式,请检查");
        }

        if (!(root != null).SafeCheck($"使用当前父节点{RootFormat} {RootXPath},在文档中找不到任何父节点"))
        {
            return;
        }

        // Reuse the node that was just validated instead of querying the document a second
        // time outside the try/catch. From here on "root" is the parent of the matched node.
        root = root.ParentNode;

        HtmlNode node = null;
        if (!ControlExtended.SafeInvoke(
                () => HtmlDoc.DocumentNode.SelectSingleNodePlus(path, SearchFormat),
                ref node, LogType.Info, "检查子节点XPath正确性", true))
        {
            return;
        }

        if (!(node != null).SafeCheck("使用当前子节点XPath,在文档中找不到任何子节点"))
        {
            return;
        }

        // Only prompt when the relationship check fails and the caller asked for alerts.
        if (!node.IsAncestor(root) && isAlert
            && MessageBox.Show("当前XPath所在节点不是父节点的后代,请检查对应的XPath,是否依然要添加?", "提示信息",
                               MessageBoxButton.YesNo) == MessageBoxResult.No)
        {
            return;
        }

        // The attribute is read from the original selection before "path" is rewritten below.
        string attr;
        string attrValue;
        XPathAnalyzer.GetAttribute(path, out attr, out attrValue);
        if (SearchFormat == SelectorFormat.XPath)
        {
            path = XPath.TakeOffPlus(node.XPath, root.XPath);
            if (attr != "")
            {
                path += "/@" + attr + "[1]";
            }
        }
    }

    // Add directly when the name is unused; otherwise add only if the user confirms the duplicate.
    var nameIsFree = CrawlItems.FirstOrDefault(d => d.Name == SelectName) == null;
    if (nameIsFree
        || MessageBox.Show("已经存在同名的属性,是否依然添加?", "提示信息",
                           MessageBoxButton.OKCancel) == MessageBoxResult.OK)
    {
        var item = new CrawlItem
        {
            XPath = path,
            Name = SelectName,
            SampleData1 = SelectText
        };
        item.Format = SearchFormat;
        CrawlItems.Add(item);

        // Clear the selection so the next item starts from an empty form.
        SelectXPath = "";
        SelectName = "";
        XLogSys.Print.Info("成功添加属性");
    }
}