/// <summary>
/// Evaluates a crawl item's XPath against the document root and stores the value found.
/// </summary>
/// <param name="doc">HTML document whose <c>DocumentNode</c> is queried.</param>
/// <param name="crawItem">Supplies the XPath, the IsHTML flag and the name the value is stored under.</param>
/// <param name="document">Receives the value under <c>crawItem.Name</c>; left untouched when the lookup returns null.</param>
public static void GetDataFromXPath(this HtmlDocument doc, CrawlItem crawItem, IFreeDocument document)
{
    var rootNode = doc.DocumentNode;
    var extracted = rootNode.GetDataFromXPath(crawItem.XPath, crawItem.IsHTML);
    if (extracted == null)
    {
        // Nothing matched: do not write an entry for this item.
        return;
    }

    document.SetValue(crawItem.Name, extracted);
}
/// <summary>
/// Evaluates a crawl item's XPath, after <c>TakeOff(shortv)</c> has been applied to it,
/// against the document root and stores the value found.
/// </summary>
/// <param name="doc">HTML document whose <c>DocumentNode</c> is queried.</param>
/// <param name="crawItem">Supplies the XPath, the IsHTML flag and the name the value is stored under.</param>
/// <param name="shortv">Path handed to <c>XPath.TakeOff</c> (presumably a prefix to strip — confirm against XPath.TakeOff).</param>
/// <param name="document">Receives the value under <c>crawItem.Name</c>; left untouched when the lookup returns null.</param>
public static void GetDataFromXPath(this HtmlDocument doc, CrawlItem crawItem, string shortv, IFreeDocument document)
{
    var rootNode = doc.DocumentNode;
    var reducedPath = new XPath(crawItem.XPath).TakeOff(shortv).ToString();
    var extracted = rootNode.GetDataFromXPath(reducedPath, crawItem.IsHTML);
    if (extracted == null)
    {
        // Nothing matched: do not write an entry for this item.
        return;
    }

    document.SetValue(crawItem.Name, extracted);
}
/// <summary>
/// Searches the document for <paramref name="keyword"/> and wraps the first XPath found in a CrawlItem.
/// </summary>
/// <param name="doc">Document to search.</param>
/// <param name="keyword">Text to look for; also stored as the item's <c>SampleData1</c>.</param>
/// <param name="name">Name given to the returned item.</param>
/// <param name="hasAttr">Forwarded to the underlying search as a callback (presumably toggles attribute matching — confirm).</param>
/// <returns>A new CrawlItem for the first match, or null when the search yields nothing.</returns>
public static CrawlItem SearchXPath(this HtmlDocument doc, string keyword, string name, bool hasAttr = true)
{
    var firstMatch = doc.SearchXPath(keyword, () => hasAttr).FirstOrDefault();
    return firstMatch == null
        ? null
        : new CrawlItem { XPath = firstMatch, SampleData1 = keyword, Name = name };
}
/// <summary>
/// Selects the nodes matching <paramref name="root"/> and collects crawl items for the
/// values that differ between them, then merges in copies of any pre-existing items.
/// </summary>
/// <param name="doc2">Document to query.</param>
/// <param name="root">XPath selecting the nodes to compare.</param>
/// <param name="isAttrEnabled">Forwarded to the node-level comparison.</param>
/// <param name="exists">Optional items to keep; copies of them replace discovered items with the same XPath.</param>
/// <param name="minNodeCount">Minimum number of matched nodes required to proceed.</param>
/// <returns>The collected items; an empty list when selection fails or too few nodes match.</returns>
private static List<CrawlItem> GetDiffNodes(HtmlDocument doc2, string root, bool isAttrEnabled,
    IEnumerable<CrawlItem> exists = null, int minNodeCount = 2)
{
    HtmlNodeCollection matches;
    try
    {
        matches = doc2.DocumentNode.SelectNodes(root);
    }
    catch (Exception ex)
    {
        XLogSys.Print.Error(ex.Message + " 可能XPath表达式有误");
        return new List<CrawlItem>();
    }

    if (matches == null || matches.Count < minNodeCount)
    {
        return new List<CrawlItem>();
    }

    var found = new List<CrawlItem>();
    var seenRows = new List<List<string>>();
    var candidates = matches.ToList();
    if (candidates.Count > 1)
    {
        GetDiffNodes(candidates, found, seenRows, isAttrEnabled);
    }

    if (exists == null)
    {
        return found;
    }

    // Work on copies so the caller's items are never mutated.
    var copied = new List<CrawlItem>();
    foreach (var existing in exists)
    {
        var clone = new CrawlItem();
        existing.DictCopyTo(clone);
        copied.Add(clone);
    }

    // Pre-existing items win over discovered items with the same XPath.
    found.RemoveElementsNoReturn(item => copied.Any(prior => IsSameXPath(item.XPath, prior.XPath, root)));
    found.AddRange(copied);
    return found;
}
/// <summary>
/// Recursively compares a set of nodes and records a crawl item for each text or
/// attribute value that differs between them.
/// </summary>
/// <param name="nodes">Nodes to compare; the first one supplies the structure walked and the sample values.</param>
/// <param name="result">Receives the crawl items discovered.</param>
/// <param name="buffers">Text rows already recorded, used to skip duplicates.</param>
/// <param name="isAttrEnabled">Whether attribute values are compared in addition to text.</param>
/// <returns>
/// True when differing text was found at this node or below; attribute differences do not
/// affect the result. False when fewer than two nodes are supplied.
/// </returns>
private static bool GetDiffNodes(List<HtmlNode> nodes, List<CrawlItem> result, List<List<string>> buffers, bool isAttrEnabled)
{
    // A comparison needs two nodes; indexing nodes[0] / nodes[1] below would throw otherwise.
    if (nodes == null || nodes.Count < 2)
    {
        return false;
    }

    var isChildContainInfo = false;
    var node1 = nodes[0];
    var node2 = nodes[1];

    // Leaf case: the node holds a single text child, so compare the text across all nodes.
    if (node1.ChildNodes.Count == 1 && node1.ChildNodes[0].NodeType == HtmlNodeType.Text)
    {
        var row = nodes.Select(d => d.SelectSingleNode(d.XPath))
            .Where(d => d != null)
            .Select(d => d.InnerText.Trim())
            .ToList();
        if (row.Any(d => CompareString(d, node1.InnerText) == false))
        {
            var name = node1.SearchPropertyName(result) ?? "属性" + result.Count;
            if (buffers.Any(d => ListEqual(d, row)))
            {
                // Same row of values was already captured under another item.
                return true;
            }

            var crawlItem = new CrawlItem
            {
                Name = name,
                SampleData1 = node1.InnerText,
                XPath = result.Count % 2 == 0 ? node1.XPath : node2.XPath
            };
            result.Add(crawlItem);
            buffers.Add(row);
            return true;
        }

        return false;
    }

    foreach (var nodechild1 in node1.ChildNodes)
    {
        if (nodechild1.XPath.Contains("#"))
        {
            continue;
        }

        var path = new XPath(nodechild1.XPath).TakeOff(node1.XPath).ToString();
        var nodechild2 = nodes.Select(d => d.SelectSingleNode(d.XPath + path)).Where(d => d != null).ToList();

        // Fewer than two counterparts (including none at all) leaves nothing to compare.
        if (nodechild2.Count < 2)
        {
            continue;
        }

        isChildContainInfo |= GetDiffNodes(nodechild2, result, buffers, isAttrEnabled);
    }

    if (isAttrEnabled == false)
    {
        return isChildContainInfo;
    }

    foreach (var attribute in node1.Attributes)
    {
        var attr1 = attribute.Value;
        var row = nodes.Select(d => d.SelectSingleNode(d.XPath))
            .Where(d => d != null)
            .Where(d => d.Attributes.Contains(attribute.Name))
            .Select(d => d.Attributes[attribute.Name].Value)
            .ToList();
        if (row.Any(d => CompareString(d, attr1) == false))
        {
            if (buffers.Any(d => ListEqual(d, row)))
            {
                // Stops scanning the remaining attributes as soon as a duplicate row is seen.
                return isChildContainInfo;
            }

            var name = node1.SearchPropertyName(result);
            if (name != null)
            {
                name += '_' + attribute.Name;
            }
            else
            {
                name = "属性" + result.Count;
            }

            var craw = new CrawlItem { Name = name, SampleData1 = attr1, XPath = attribute.XPath };
            result.Add(craw);
        }
    }

    return isChildContainInfo;
}
/// <summary>
/// Recursively compares a set of nodes and records a crawl item for each text or
/// attribute value that differs between them. The two reference nodes are picked with
/// <c>Random.Next</c>: one index from the first half of the list, one from the second half.
/// </summary>
/// <param name="nodes">Nodes to compare.</param>
/// <param name="result">Receives the crawl items discovered.</param>
/// <param name="buffers">Text rows already recorded, used to skip duplicates.</param>
/// <param name="isAttrEnabled">Whether attribute values are compared in addition to text.</param>
/// <returns>
/// True when differing text was found at this node or below; attribute differences do not
/// affect the result.
/// </returns>
private static bool GetDiffNodes(List<HtmlNode> nodes, List<CrawlItem> result, List<List<string>> buffers, bool isAttrEnabled)
{
    var total = nodes.Count;
    var node1 = nodes[Random.Next(0, total / 2)];
    var node2 = nodes[Random.Next(total / 2, total)];

    // Leaf case: the reference node holds a single text child.
    var holdsOnlyText = node1.ChildNodes.Count == 1 && node1.ChildNodes[0].NodeType == HtmlNodeType.Text;
    if (holdsOnlyText)
    {
        var texts = nodes.Select(d => d.SelectSingleNode(d.XPath))
            .Where(d => d != null)
            .Select(d => d.InnerText.Trim())
            .ToList();
        if (!texts.Any(t => CompareString(t, node1.InnerText) == false))
        {
            return false;
        }

        var textName = node1.SearchPropertyName(result) ?? "属性" + result.Count;
        if (buffers.Any(seen => ListEqual(seen, texts)))
        {
            return true;
        }

        result.Add(new CrawlItem
        {
            Name = textName,
            SampleData1 = node1.InnerText,
            XPath = result.Count % 2 == 0 ? node1.XPath : node2.XPath
        });
        buffers.Add(texts);
        return true;
    }

    var anyChildDiffers = false;
    foreach (var child in node1.ChildNodes)
    {
        if (child.XPath.Contains("#"))
        {
            continue;
        }

        var path = XPath.TakeOff(child.XPath, node1.XPath);

        // Look up the counterpart of this child under every node; the first lookup that
        // throws abandons the whole child.
        var counterparts = new List<HtmlNode>();
        var failed = false;
        foreach (var sibling in nodes)
        {
            HtmlNode hit;
            try
            {
                hit = sibling.SelectSingleNode(sibling.XPath + path);
            }
            catch (Exception)
            {
                failed = true;
                break;
            }

            if (hit != null)
            {
                counterparts.Add(hit);
            }
        }

        if (failed || counterparts.Count < 2)
        {
            continue;
        }

        anyChildDiffers |= GetDiffNodes(counterparts, result, buffers, isAttrEnabled);
    }

    if (isAttrEnabled == false)
    {
        return anyChildDiffers;
    }

    foreach (var attribute in node1.Attributes)
    {
        var sample = attribute.Value;
        var values = nodes.Select(d =>
            {
                try
                {
                    return d.SelectSingleNode(d.XPath);
                }
                catch (Exception)
                {
                    XLogSys.Print.Error("XPath表达式编写错误: " + d.XPath);
                    return null;
                }
            })
            .Where(d => d != null && d.Attributes.Contains(attribute.Name))
            .Select(d => d.Attributes[attribute.Name].Value)
            .ToList();

        if (!values.Any(v => CompareString(v, sample) == false))
        {
            continue;
        }

        if (buffers.Any(seen => ListEqual(seen, values)))
        {
            // Stops scanning the remaining attributes as soon as a duplicate row is seen.
            return anyChildDiffers;
        }

        var baseName = node1.SearchPropertyName(result);
        var attrName = baseName != null
            ? baseName + ('_' + attribute.Name)
            : "属性" + result.Count;
        result.Add(new CrawlItem { Name = attrName, SampleData1 = sample, XPath = attribute.XPath });
    }

    return anyChildDiffers;
}
/// <summary>
/// Adds a crawl item built from the current selection (SelectXPath, SelectName, SelectText)
/// to CrawlItems and clears SelectXPath afterwards.
/// </summary>
/// <param name="isAlert">
/// When true, problems are reported through a message box and nothing is added. When false,
/// an invalid XPath is skipped silently and a duplicate name is replaced by a generated one
/// before the item is added.
/// </param>
private void AddNewItem(bool isAlert = true)
{
    var path = SelectXPath;
    if (!string.IsNullOrEmpty(RootXPath))
    {
        // Either lookup can return null when the XPath matches nothing; treat that the same
        // as "not a descendant" instead of dereferencing null.
        var root = HtmlDoc.DocumentNode.SelectSingleNode(RootXPath)?.ParentNode;
        var node = HtmlDoc.DocumentNode.SelectSingleNode(path);
        if (root == null || node == null || !node.IsAncestor(root))
        {
            if (isAlert)
            {
                MessageBox.Show("当前XPath所在节点不是父节点的后代,请检查对应的XPath");
            }

            return;
        }

        path = new XPath(node.XPath).TakeOff(root.XPath).ToString();
    }

    var sampleText = SelectText;
    var name = SelectName;
    if (CrawlItems.Any(d => d.Name == name))
    {
        // Remember the clashing name before replacing it, so the alert reports the right one.
        var duplicateName = name;
        name = "属性" + CrawlItems.Count;
        SelectName = name;
        if (isAlert)
        {
            MessageBox.Show($"已存在名称为{duplicateName}的属性,不能重复添加");
            return;
        }
    }

    // Built after the duplicate check so the item carries the generated name when one was needed.
    var item = new CrawlItem { XPath = path, Name = name, SampleData1 = sampleText };
    CrawlItems.Add(item);
    SelectXPath = "";
}
/// <summary>
/// Restores this object's state from a dictionary: scalar settings, the nested "HttpSet"
/// and "Login" dictionaries, and one CrawlItem per child when the dictionary is a FreeDocument.
/// </summary>
/// <param name="dicts">Source dictionary.</param>
/// <param name="scenario">Forwarded to the base implementation.</param>
public override void DictDeserialize(IDictionary<string, object> dicts, Scenario scenario = Scenario.Database)
{
    base.DictDeserialize(dicts, scenario);
    URL = dicts.Set("URL", URL);
    RootXPath = dicts.Set("RootXPath", RootXPath);
    IsMultiData = dicts.Set("IsMultiData", IsMultiData);
    URLFilter = dicts.Set("URLFilter", URLFilter);
    Crawler = dicts.Set("Crawler", Crawler);
    ContentFilter = dicts.Set("ContentFilter", ContentFilter);

    object raw;

    if (dicts.TryGetValue("HttpSet", out raw))
    {
        // Skip entries that are not nested dictionaries rather than passing null along.
        var httpSettings = raw as IDictionary<string, object>;
        if (httpSettings != null)
        {
            Http.UnsafeDictDeserialize(httpSettings);
        }
    }

    if (dicts.TryGetValue("Login", out raw))
    {
        var loginSettings = raw as IDictionary<string, object>;
        if (loginSettings != null)
        {
            var item = new HttpItem();
            item.DictDeserialize(loginSettings);
            Documents.Add(item);
        }
    }

    // A "Generator" entry was previously read here but never used; nothing is restored from it.

    var doc = dicts as FreeDocument;
    if (doc?.Children != null)
    {
        foreach (var child in doc.Children)
        {
            var item = new CrawlItem();
            item.DictDeserialize(child);
            CrawlItems.Add(item);
        }
    }
}
/// <summary>
/// Recursively compares a set of nodes and records a crawl item for each text or
/// attribute value that differs between them.
/// </summary>
/// <param name="nodes">Nodes to compare; the first one supplies the structure walked and the sample values.</param>
/// <param name="result">Receives the crawl items discovered.</param>
/// <param name="buffers">Text rows already recorded, used to skip duplicates.</param>
/// <param name="isAttrEnabled">Whether attribute values are compared in addition to text.</param>
/// <returns>
/// True when differing text was found at this node or below; attribute differences do not
/// affect the result. False when fewer than two nodes are supplied.
/// </returns>
private static bool GetDiffNodes(List<HtmlNode> nodes, List<CrawlItem> result, List<List<string>> buffers, bool isAttrEnabled)
{
    // A comparison needs two nodes; indexing nodes[0] / nodes[1] below would throw otherwise.
    if (nodes == null || nodes.Count < 2)
    {
        return false;
    }

    var isChildContainInfo = false;
    var node1 = nodes[0];
    var node2 = nodes[1];

    // Leaf case: the node holds a single text child, so compare the text across all nodes.
    if (node1.ChildNodes.Count == 1 && node1.ChildNodes[0].NodeType == HtmlNodeType.Text)
    {
        var row = nodes.Select(d => d.SelectSingleNode(d.XPath))
            .Where(d => d != null)
            .Select(d => d.InnerText.Trim())
            .ToList();
        if (row.Any(d => CompareString(d, node1.InnerText) == false))
        {
            var name = node1.SearchPropertyName(result) ?? "属性" + result.Count;
            if (buffers.Any(d => ListEqual(d, row)))
            {
                // Same row of values was already captured under another item.
                return true;
            }

            var crawlItem = new CrawlItem
            {
                Name = name,
                SampleData1 = node1.InnerText,
                XPath = result.Count % 2 == 0 ? node1.XPath : node2.XPath
            };
            result.Add(crawlItem);
            buffers.Add(row);
            return true;
        }

        return false;
    }

    foreach (var nodechild1 in node1.ChildNodes)
    {
        if (nodechild1.XPath.Contains("#"))
        {
            continue;
        }

        var path = new XPath(nodechild1.XPath).TakeOff(node1.XPath).ToString();
        var nodechild2 = nodes.Select(d => d.SelectSingleNode(d.XPath + path)).Where(d => d != null).ToList();

        // Fewer than two counterparts (including none at all) leaves nothing to compare.
        if (nodechild2.Count < 2)
        {
            continue;
        }

        isChildContainInfo |= GetDiffNodes(nodechild2, result, buffers, isAttrEnabled);
    }

    if (isAttrEnabled == false)
    {
        return isChildContainInfo;
    }

    foreach (var attribute in node1.Attributes)
    {
        var attr1 = attribute.Value;
        var row = nodes.Select(d => d.SelectSingleNode(d.XPath))
            .Where(d => d != null)
            .Where(d => d.Attributes.Contains(attribute.Name))
            .Select(d => d.Attributes[attribute.Name].Value)
            .ToList();
        if (row.Any(d => CompareString(d, attr1) == false))
        {
            if (buffers.Any(d => ListEqual(d, row)))
            {
                // Stops scanning the remaining attributes as soon as a duplicate row is seen.
                return isChildContainInfo;
            }

            var name = node1.SearchPropertyName(result);
            if (name != null)
            {
                name += '_' + attribute.Name;
            }
            else
            {
                name = "属性" + result.Count;
            }

            var craw = new CrawlItem { Name = name, SampleData1 = attr1, XPath = attribute.XPath };
            result.Add(craw);
        }
    }

    return isChildContainInfo;
}
/// <summary>
/// Searches the document for <paramref name="keyword"/> and wraps the first XPath found in a CrawlItem.
/// </summary>
/// <param name="doc">Document to search.</param>
/// <param name="keyword">Text to look for; also stored as the item's <c>SampleData1</c>.</param>
/// <param name="name">Name given to the returned item.</param>
/// <param name="hasAttr">Forwarded to the underlying search as a callback (presumably toggles attribute matching — confirm).</param>
/// <returns>A new CrawlItem for the first match, or null when the search yields nothing.</returns>
public static CrawlItem SearchXPath(this HtmlDocument doc, string keyword, string name, bool hasAttr = true)
{
    var candidates = doc.SearchXPath(keyword, () => hasAttr);
    var firstMatch = candidates.FirstOrDefault();
    if (firstMatch != null)
    {
        return new CrawlItem { XPath = firstMatch, SampleData1 = keyword, Name = name };
    }

    return null;
}
/// <summary>
/// Evaluates a crawl item's XPath against the document root and stores the value found.
/// </summary>
/// <param name="doc">HTML document whose <c>DocumentNode</c> is queried.</param>
/// <param name="crawItem">Supplies the XPath, the IsHTML flag and the name the value is stored under.</param>
/// <param name="document">Receives the value under <c>crawItem.Name</c>; left untouched when the lookup returns null.</param>
public static void GetDataFromXPath(this HtmlDocument doc, CrawlItem crawItem, IFreeDocument document)
{
    var value = doc.DocumentNode.GetDataFromXPath(crawItem.XPath, crawItem.IsHTML);
    if (value == null)
    {
        // No match: leave the target document unchanged.
        return;
    }

    document.SetValue(crawItem.Name, value);
}
/// <summary>
/// Evaluates a crawl item's XPath, after <c>TakeOff(shortv)</c> has been applied to it,
/// against the document root and stores the value found.
/// </summary>
/// <param name="doc">HTML document whose <c>DocumentNode</c> is queried.</param>
/// <param name="crawItem">Supplies the XPath, the IsHTML flag and the name the value is stored under.</param>
/// <param name="shortv">Path handed to <c>XPath.TakeOff</c> (presumably a prefix to strip — confirm against XPath.TakeOff).</param>
/// <param name="document">Receives the value under <c>crawItem.Name</c>; left untouched when the lookup returns null.</param>
public static void GetDataFromXPath(this HtmlDocument doc, CrawlItem crawItem, string shortv, IFreeDocument document)
{
    var searchRoot = doc.DocumentNode;
    var trimmedPath = new XPath(crawItem.XPath).TakeOff(shortv).ToString();
    var value = searchRoot.GetDataFromXPath(trimmedPath, crawItem.IsHTML);
    if (value == null)
    {
        // No match: leave the target document unchanged.
        return;
    }

    document.SetValue(crawItem.Name, value);
}