/// <summary>
/// Reports whether two XPath strings are equal once <paramref name="shortv"/> has been
/// taken off each of them via <c>XPath.TakeOff</c>.
/// </summary>
/// <param name="xpath1">First XPath string.</param>
/// <param name="xpath2">Second XPath string.</param>
/// <param name="shortv">Path passed to <c>XPath.TakeOff</c> as the part to take off.</param>
/// <returns><c>true</c> when the two reduced values compare equal with <c>==</c>.</returns>
private static bool IsSameXPath(string xpath1, string xpath2, string shortv)
{
    // Left operand is evaluated before the right one, exactly as with two separate locals.
    return XPath.TakeOff(xpath1, shortv) == XPath.TakeOff(xpath2, shortv);
}
/// <summary>
/// Gets the longest common run of leading segments shared by all paths in <paramref name="items"/>.
/// </summary>
/// <param name="items">Paths to compare. Must not be empty: the minimum length is taken with <c>Min</c>.</param>
/// <returns>
/// The first path cut with <c>SubXPath(n + 1)</c>, where n is the number of leading positions at which
/// every path holds the same segment, after <c>RemoveFinalNum()</c> has been called on the result.
/// </returns>
public static XPath GetMaxCompareXPath(IList<XPath> items)
{
    int shortest = items.Min(p => p.Count);
    int matched = 0;
    bool diverged = false;

    // Walk the positions shared by every path and stop at the first one where any path differs
    // from the first path.
    while (matched < shortest)
    {
        string expected = items[0][matched];
        for (int k = 1; k < items.Count; k++)
        {
            if (expected != items[k][matched])
            {
                diverged = true;
                break;
            }
        }

        if (diverged)
        {
            break;
        }

        matched++;
    }

    // NOTE(review): the cut uses matched + 1, i.e. one position past the common run — presumably so that
    // the first differing segment is kept and only its trailing number is stripped; confirm against SubXPath.
    XPath prefix = items.First().SubXPath(matched + 1);
    prefix.RemoveFinalNum();
    return prefix;
}
/// <summary>
/// Reports whether two XPath strings have the same textual form after <paramref name="shortv"/>
/// has been taken off each of them with the instance method <c>TakeOff</c>.
/// </summary>
/// <param name="xpath1">First XPath string.</param>
/// <param name="xpath2">Second XPath string.</param>
/// <param name="shortv">Path passed to <c>TakeOff</c> on both sides.</param>
/// <returns><c>true</c> when the <c>ToString()</c> results of both reduced paths are equal.</returns>
private static bool IsSameXPath(string xpath1, string xpath2, string shortv)
{
    XPath reducedFirst = new XPath(xpath1).TakeOff(shortv);
    XPath reducedSecond = new XPath(xpath2).TakeOff(shortv);
    return string.Equals(reducedFirst.ToString(), reducedSecond.ToString());
}
/// <summary>
/// Runs <c>GetTextRootProbability</c> on <paramref name="node"/> with a fresh <c>ParaClass</c> and
/// returns <c>XPath.SubXPath</c> of the resulting <c>Path</c> with the argument <c>-2</c>.
/// </summary>
/// <param name="node">Node to analyse.</param>
/// <returns>The value produced by <c>XPath.SubXPath(para.Path, -2)</c>.</returns>
public static string GetTextNode(this HtmlNode node)
{
    ParaClass probe = new ParaClass();
    node.GetTextRootProbability(probe);

    // NOTE(review): -2 presumably means "drop the last two segments" — confirm against XPath.SubXPath.
    string trimmed = XPath.SubXPath(probe.Path, -2);
    return trimmed;
}
/// <summary>
/// Runs <c>GetTextRootProbability</c> on <paramref name="node"/> with a fresh <c>ParaClass</c>, parses the
/// resulting <c>Path</c> into an <c>XPath</c> and returns the text of <c>SubXPath(0, Count - 2)</c>.
/// </summary>
/// <param name="node">Node to analyse.</param>
/// <returns><c>ToString()</c> of the sub-path starting at 0 with length <c>Count - 2</c>.</returns>
public static string GetTextNode(this HtmlNode node)
{
    ParaClass probe = new ParaClass();
    node.GetTextRootProbability(probe);

    XPath full = new XPath(probe.Path);
    int keep = full.Count - 2;
    return full.SubXPath(0, keep).ToString();
}
/// <summary>
/// Returns the part of this path that follows the first N segments, where N is the segment count of
/// <paramref name="fullPath"/>.
/// </summary>
/// <param name="fullPath">Path whose segment count is skipped; null or empty returns this instance unchanged.</param>
/// <returns>
/// This instance when <paramref name="fullPath"/> is null or empty; otherwise
/// <c>SubXPath(N, Count - N)</c>.
/// </returns>
public XPath TakeOff(string fullPath)
{
    if (!string.IsNullOrEmpty(fullPath))
    {
        // Only the length of fullPath is used; its segments are not compared with this path.
        int skip = new XPath(fullPath).Count;
        return SubXPath(skip, Count - skip);
    }

    return this;
}
/// <summary>
/// Lazily yields candidate item lists found in <paramref name="doc2"/>, choosing the search strategy
/// from the number of already known items.
/// </summary>
/// <remarks>
/// <list type="bullet">
/// <item>More than one known item: one result, computed from the path returned by <c>XPath.GetMaxCompareXPath</c>.</item>
/// <item>Exactly one known item: every prefix path of that item is scored with <c>GetTableRootProbability</c>;
/// one result is yielded per scored root, highest score first.</item>
/// <item>No known item: the whole document is scored; only results with more than one item are yielded.</item>
/// </list>
/// </remarks>
/// <param name="doc2">Document to search.</param>
/// <param name="existItems">Known items; <c>null</c> is treated as an empty list.</param>
/// <param name="isAttrEnabled">Forwarded to <c>GetDiffNodes</c>.</param>
/// <returns>A lazily evaluated sequence of item lists.</returns>
public static IEnumerable<List<CrawlItem>> SearchPropertiesSmart(this HtmlDocument doc2, ICollection<CrawlItem> existItems = null, bool isAttrEnabled = false)
{
    if (existItems == null)
    {
        existItems = new List<CrawlItem>();
    }

    if (existItems.Count > 1)
    {
        var commonRoot = XPath.GetMaxCompareXPath(existItems.Select(d => new XPath(d.XPath)).ToList()).ToString();
        yield return GetDiffNodes(doc2, commonRoot, isAttrEnabled, existItems);
    }
    else if (existItems.Count == 1)
    {
        var realPath = new XPath(existItems.First().XPath);
        var prefixes = realPath.SelectAll(d => true)
            .Select(d => new XPath(realPath.Take(d)).ToString())
            .ToList();
        var dict = new Dictionary<string, double>();
        foreach (var prefix in prefixes)
        {
            // SelectSingleNode returns null when the prefix matches nothing; skip it instead of
            // failing with a NullReferenceException on ChildNodes.
            var ancestor = doc2.DocumentNode.SelectSingleNode(prefix);
            if (ancestor == null)
            {
                continue;
            }

            GetTableRootProbability(
                ancestor.ChildNodes.Where(d2 => d2.Name.Contains("#") == false).ToList(),
                dict,
                false);
        }

        foreach (var scored in dict.OrderByDescending(d => d.Value))
        {
            yield return GetDiffNodes(doc2, scored.Key, isAttrEnabled, existItems);
        }
    }
    else
    {
        var dict = new Dictionary<string, double>();
        GetTableRootProbability(
            doc2.DocumentNode.ChildNodes.Where(d => d.Name.Contains("#") == false).ToList(),
            dict,
            true);
        foreach (var scored in dict.OrderByDescending(d => d.Value))
        {
            var items = GetDiffNodes(doc2, scored.Key, isAttrEnabled, existItems);
            if (items.Count > 1)
            {
                yield return items;
            }
        }
    }
}
/// <summary>
/// Reads the value addressed by <paramref name="crawItem"/> from the document and, when a value was
/// found, stores it in <paramref name="document"/> under the item's name.
/// </summary>
/// <param name="doc">Document whose <c>DocumentNode</c> is queried.</param>
/// <param name="crawItem">Item supplying the XPath, the <c>IsHTML</c> flag and the target name.</param>
/// <param name="root">Path passed to <c>XPath.TakeOff</c> together with the item's XPath.</param>
/// <param name="document">Receives the value via <c>SetValue</c>; left untouched when the lookup returns null.</param>
public static void GetDataFromXPath(this HtmlDocument doc, CrawlItem crawItem, string root, IFreeDocument document)
{
    var documentNode = doc.DocumentNode;
    var relativePath = XPath.TakeOff(crawItem.XPath, root);
    var value = documentNode.GetDataFromXPath(relativePath, crawItem.IsHTML);
    if (value == null)
    {
        return;
    }

    document.SetValue(crawItem.Name, value);
}
/// <summary>
/// Computes the path returned by <c>XPath.GetMaxCompareXPath</c> for all items and, when it is not
/// empty, rewrites every item's XPath to what <c>TakeOff</c> leaves after removing that path.
/// </summary>
/// <param name="doc2">Not read by this method; it only makes the method an extension of <c>HtmlDocument</c>.</param>
/// <param name="crawlItem">Items whose XPath values are rewritten in place.</param>
/// <returns>The shared path, or an empty string when it is null or empty (items are then left unchanged).</returns>
public static string CompileCrawItems(this HtmlDocument doc2, IList<CrawlItem> crawlItem)
{
    var parsedPaths = crawlItem.Select(d => new XPath(d.XPath)).ToList();
    var commonRoot = XPath.GetMaxCompareXPath(parsedPaths).ToString();
    if (string.IsNullOrEmpty(commonRoot))
    {
        return "";
    }

    crawlItem.Execute(d => d.XPath = new XPath(d.XPath).TakeOff(commonRoot).ToString());
    return commonRoot;
}
/// <summary>
/// Wraps a list of items in a <c>CrawTarget</c>.
/// </summary>
/// <param name="items">Items to wrap.</param>
/// <param name="root">Root path; only used when <paramref name="items"/> holds exactly one item.</param>
/// <returns>
/// <c>new CrawTarget(items)</c> for more than one item; for exactly one item with a non-empty
/// <paramref name="root"/>, the item's XPath is first replaced by <c>XPath.TakeOff(item.XPath, root)</c>
/// and <c>new CrawTarget(items, root)</c> is returned; otherwise <c>null</c>.
/// </returns>
private static CrawTarget getCrawTarget(List<CrawlItem> items, string root = null)
{
    if (items.Count > 1)
    {
        return new CrawTarget(items);
    }

    if (items.Count != 1 || string.IsNullOrEmpty(root))
    {
        return null;
    }

    // Single item: make its path relative to the root before wrapping (mutates the item in place).
    CrawlItem only = items[0];
    only.XPath = XPath.TakeOff(only.XPath, root);
    return new CrawTarget(items, root);
}
/// <summary>
/// Scores a group of sibling nodes as a possible list root and stores the score in <paramref name="dict"/>.
/// </summary>
/// <param name="nodes">Sibling nodes to score; an empty list returns immediately.</param>
/// <param name="dict">Receives the score under the first node's XPath after <c>XPath.RemoveFinalNum</c>.</param>
/// <param name="haschild">When true, the children of every node are scored recursively first.</param>
private static void GetTableRootProbability(IList<HtmlNode> nodes, Dictionary<string, double> dict, bool haschild)
{
    if (nodes.Count == 0)
    {
        return;
    }

    var xpath = XPath.RemoveFinalNum(nodes[0].XPath);

    if (haschild)
    {
        foreach (var htmlNode in nodes)
        {
            // Nodes whose name contains '#' are left out of the recursion.
            var namedChildren = htmlNode.ChildNodes.Where(d => !d.Name.Contains("#")).ToList();
            GetTableRootProbability(namedChildren, dict, haschild);
        }
    }

    var avanode = nodes.ToList();
    if (avanode.Count < 3)
    {
        return;
    }

    var childCount = (double)avanode.Count;
    var childCounts = avanode.Select(d => (double)d.ChildNodes.Count).ToArray();
    var v = childCounts.Variance();
    // TODO: a better measure is needed here, because the useful nodes are often interleaved with others.
    if (v > 100)
    {
        return;
    }

    var leafCount = avanode.Last().GetLeafNodeCount();

    // NOTE(review): for 0 < v <= 100 the log10 factor is non-positive (negative infinity at v == 100),
    // so any variance at all flips the score's sign — confirm this is intended.
    double factor;
    if (v == 0)
    {
        factor = 2;
    }
    else
    {
        factor = Math.Log10((100 - v) / 100);
    }

    var value = (childCount * PM25 + leafCount) * factor;

    // NOTE(review): looks like leftover debugging output — confirm before removing.
    if (xpath.Contains("你"))
    {
        Console.WriteLine(xpath);
    }

    dict.SetValue(xpath, value);
}
/// <summary>
/// Initialises the base class, then builds <c>xpaths</c>: one entry per distinct item name in
/// <c>Crawler.CrawlItems</c>, holding the result of <c>XPath.GetMaxCompareXPath</c> over the XPaths of
/// the items with that name.
/// </summary>
/// <param name="docus">Forwarded to <c>base.Init</c>; the base result is ignored.</param>
/// <returns><c>false</c> when <c>Crawler</c> is null (nothing else is changed); otherwise <c>true</c>.</returns>
public override bool Init(IEnumerable<IFreeDocument> docus)
{
    base.Init(docus);
    if (Crawler == null)
    {
        return false;
    }

    IsMultiYield = true;
    xpaths = Crawler.CrawlItems
        .GroupBy(item => item.Name)
        .ToDictionary(
            byName => byName.Key,
            byName => XPath.GetMaxCompareXPath(byName.Select(item => item.XPath).ToList()));
    return true;
}
/// <summary>
/// Scores a group of sibling nodes as a possible list root and stores the score in <paramref name="dict"/>.
/// </summary>
/// <remarks>
/// A group is scored only when it has at least 3 nodes, at least 70% of them share the name of the
/// second node, and the variance of their child counts is at most 5. The score is
/// <c>nodeCount * PM25 + leafCount</c>, with the leaf count taken from the first node.
/// </remarks>
/// <param name="nodes">Sibling nodes to score; an empty list returns immediately.</param>
/// <param name="dict">Receives the score under the first node's XPath after <c>RemoveFinalNum()</c>.</param>
/// <param name="haschild">When true, the children of every node are scored recursively first.</param>
private static void GetTableRootProbability(IList<HtmlNode> nodes, Dictionary<string, double> dict, bool haschild)
{
    if (nodes.Count == 0)
    {
        return;
    }

    var xpath = new XPath(nodes[0].XPath).RemoveFinalNum().ToString();

    if (haschild)
    {
        foreach (var htmlNode in nodes)
        {
            // Nodes whose name contains '#' are left out of the recursion.
            var namedChildren = htmlNode.ChildNodes.Where(d => !d.Name.Contains("#")).ToList();
            GetTableRootProbability(namedChildren, dict, haschild);
        }
    }

    var avanode = nodes.ToList();
    if (avanode.Count < 3)
    {
        return;
    }

    // Require most siblings to carry the same tag name as the second one.
    var referenceName = avanode[1].Name;
    var sameNamed = avanode.Count(d => d.Name == referenceName);
    if (sameNamed < avanode.Count * 0.7)
    {
        return;
    }

    var childCount = (double)avanode.Count;
    var childCounts = avanode.Select(d => (double)d.ChildNodes.Count).ToArray();
    var v = childCounts.Variance();
    // TODO: a better measure is needed here, because the useful nodes are often interleaved with others.
    if (v > 5)
    {
        return;
    }

    var leafCount = avanode.First().GetLeafNodeCount();
    dict.SetValue(xpath, childCount * PM25 + leafCount);
}
/// <summary>
/// Reads data from the node addressed by an XPath.
/// </summary>
/// <param name="doc">Node on which the XPath is evaluated.</param>
/// <param name="path">XPath to evaluate; null or empty yields <c>null</c>.</param>
/// <param name="ishtml">When true (and the path is not an attribute path), the inner HTML is returned instead of the node text.</param>
/// <returns>
/// <c>null</c> when the path is empty, evaluation throws, or nothing matches.
/// When the last path segment contains both <c>@</c> and <c>[1]</c>: the trimmed value of the attribute
/// named by that segment, or <c>null</c> when the node has no such attribute.
/// Otherwise the node's <c>InnerHtml</c> or <c>GetNodeText()</c>, depending on <paramref name="ishtml"/>.
/// </returns>
public static string GetDataFromXPath(this HtmlNode doc, string path, bool ishtml = false)
{
    if (string.IsNullOrEmpty(path))
    {
        return null;
    }

    HtmlNode p2 = null;
    try
    {
        p2 = doc.SelectSingleNode(path);
    }
    catch (Exception)
    {
        // Deliberate best effort: a path that cannot be evaluated is treated like "no match".
    }

    if (p2 == null)
    {
        return null;
    }

    var paths = path.Split('/');
    var last = paths[paths.Length - 1];
    if (last.Contains("@") && last.Contains("[1]"))
    {
        // Attribute data: the attribute name sits between '@' and the following '['.
        var name = XPath.GetAttributeName(last.Split('@', '[')[1]);
        if (p2.HasAttributes)
        {
            var a = p2.Attributes.FirstOrDefault(d => d.Name == name);
            // FirstOrDefault yields null when the node lacks the attribute; report "no data"
            // instead of failing with a NullReferenceException.
            if (a != null)
            {
                return a.Value.Trim();
            }
        }

        return null;
    }

    if (ishtml)
    {
        return p2.InnerHtml;
    }

    return p2.GetNodeText();
}
/// <summary>
/// Lazily yields candidate <c>CrawTarget</c>s for <paramref name="doc2"/>.
/// </summary>
/// <remarks>
/// Without a <paramref name="rootPath"/> the strategy depends on the number of known items:
/// more than one item yields at most one target built from the path returned by
/// <c>XPath.GetMaxCompareXPath</c> and then stops; exactly one item scores the children of every
/// sub-path of that item with <c>GetTableRootProbability</c>; no items scores the whole document.
/// In the last two cases one target is attempted per scored root, highest score first.
/// With a <paramref name="rootPath"/>, items are discovered under that path, their XPaths are
/// rewritten with <c>XPath.TakeOff</c> against the root node's XPath, and at most one target is yielded.
/// </remarks>
/// <param name="doc2">Document to search.</param>
/// <param name="existItems">Known items; <c>null</c> is treated as an empty list.</param>
/// <param name="rootPath">Optional XPath of the root to search under.</param>
/// <param name="isAttrEnabled">Forwarded to <c>GetDiffNodes</c>.</param>
/// <returns>A lazily evaluated sequence; targets for which <c>getCrawTarget</c> returns null are skipped.</returns>
public static IEnumerable <CrawTarget> SearchPropertiesSmart(this HtmlDocument doc2, ICollection <CrawlItem> existItems = null, string rootPath = null, bool isAttrEnabled = false)
{
    if (existItems == null)
    {
        existItems = new List <CrawlItem>();
    }
    var shortv = "";
    var dict = new Dictionary <string, double>();
    if (string.IsNullOrEmpty(rootPath))
    {
        if (existItems.Count > 1)
        {
            // Several known items: derive one root from them and finish after this single attempt.
            shortv = XPath.GetMaxCompareXPath(existItems.Select(d => d.XPath));
            var items = GetDiffNodes(doc2, shortv, isAttrEnabled, existItems, 1);
            var target = getCrawTarget(items, shortv);
            if (target != null)
            {
                yield return(target);
            }
            yield break;
        }
        if (existItems.Count == 1)
        {
            // One known item: score the children of each sub-path produced from its XPath.
            var realPath = existItems.First().XPath;
            var items = XPath.Split(realPath);
            var array = items.SelectAll(d => true)
                        .Select(d => XPath.SubXPath(realPath, d)).ToList();
            foreach (var item in array)
            {
                // NOTE(review): SelectSingleNode's result is dereferenced without a null check.
                GetTableRootProbability(
                    doc2.DocumentNode.SelectSingleNode(item)
                    .ChildNodes.Where(d2 => d2.Name.Contains("#") == false)
                    .ToList(), dict, false);
            }
        }
        else
        {
            // No known items: score the whole document recursively.
            GetTableRootProbability(
                doc2.DocumentNode.ChildNodes.Where(d => d.Name.Contains("#") == false).ToList(), dict, true);
        }
        // Always true at this point: the Count > 1 case has already left through yield break.
        if (existItems.Count < 2)
        {
            IEnumerable <KeyValuePair <string, double> > p = dict.OrderByDescending(d => d.Value);
            foreach (var keyValuePair in p)
            {
                var items = GetDiffNodes(doc2, keyValuePair.Key, isAttrEnabled, existItems, 4);
                var target = getCrawTarget(items, keyValuePair.Key);
                if (target != null)
                {
                    yield return(target);
                }
            }
        }
    }
    else
    {
        // Explicit root: discover items under it, ignoring existItems.
        var items = GetDiffNodes(doc2, rootPath, isAttrEnabled, new List <CrawlItem>());
        if (items.Count > 0)
        {
            var root = doc2.DocumentNode.SelectSingleNode(rootPath);
            foreach (var crawlItem in items)
            {
                crawlItem.XPath = XPath.TakeOff(crawlItem.XPath, root.XPath);
            }
            var target = getCrawTarget(items, rootPath);
            if (target != null)
            {
                yield return(target);
            }
        }
    }
}
/// <summary>
/// Lazily yields candidate <c>CrawTarget</c>s for <paramref name="doc2"/>.
/// </summary>
/// <remarks>
/// Without a <paramref name="rootPath"/>:
/// <list type="bullet">
/// <item>More than one known item: the path returned by <c>XPath.GetMaxCompareXPath</c> is selected in the
/// document. No match ends the sequence; exactly one match switches to the single-item strategy;
/// several matches yield one target built from that path.</item>
/// <item>Single-item strategy (one known item, or forced as above): the children of every sub-path of the
/// first item's XPath are scored with <c>GetTableRootProbability</c>.</item>
/// <item>No known item: the whole document is scored.</item>
/// </list>
/// For the last two cases one target is yielded per scored root, highest score first, populated with
/// the HTML/text of the parent of the root's first match, the match count, the score and the column count.
/// With a <paramref name="rootPath"/>, items are discovered under that path and at most one target is
/// yielded, with <c>RootXPath</c> set to <paramref name="rootPath"/>.
/// </remarks>
/// <param name="doc2">Document to search.</param>
/// <param name="existItems">Known items; <c>null</c> is treated as an empty list.</param>
/// <param name="rootPath">Optional XPath of the root to search under.</param>
/// <param name="isAttrEnabled">Forwarded to <c>GetDiffNodes</c>.</param>
/// <returns>A lazily evaluated sequence of targets; candidates that cannot be resolved are skipped.</returns>
public static IEnumerable<CrawTarget> SearchPropertiesSmart(this HtmlDocument doc2, ICollection<CrawlItem> existItems = null, string rootPath = null, bool isAttrEnabled = false)
{
    if (existItems == null)
    {
        existItems = new List<CrawlItem>();
    }

    var dict = new Dictionary<string, double>();
    if (string.IsNullOrEmpty(rootPath))
    {
        // Forces the single-item strategy even though several items are known.
        var isForceItemOne = false;
        if (existItems.Count > 1)
        {
            string shortv = XPath.GetMaxCompareXPath(existItems.Select(d => d.XPath));
            var nodes = doc2.DocumentNode.SelectNodes(shortv);
            if (nodes == null || nodes.Count == 0)
            {
                yield break;
            }

            if (nodes.Count == 1)
            {
                isForceItemOne = true;
            }
            else
            {
                var items = GetDiffNodes(doc2, shortv, isAttrEnabled, existItems, 1);
                var target = getCrawTarget(items, shortv);
                if (target != null)
                {
                    yield return target;
                }
            }
        }

        if (isForceItemOne || existItems.Count == 1)
        {
            var realPath = existItems.First().XPath;
            var segments = XPath.Split(realPath);
            var subPaths = segments.SelectAll(d => true)
                .Select(d => XPath.SubXPath(realPath, d))
                .ToList();
            foreach (var subPath in subPaths)
            {
                // A sub-path that matches nothing is skipped rather than dereferenced.
                var ancestor = doc2.DocumentNode.SelectSingleNode(subPath);
                if (ancestor == null)
                {
                    continue;
                }

                GetTableRootProbability(
                    ancestor.ChildNodes.Where(d2 => d2.Name.Contains("#") == false).ToList(),
                    dict,
                    false);
            }
        }
        else if (existItems.Count == 0)
        {
            // Whole-document scoring is only needed when its result is consumed below, i.e. when
            // nothing is known yet; with several known items the scores would never be read.
            GetTableRootProbability(
                doc2.DocumentNode.ChildNodes.Where(d => d.Name.Contains("#") == false).ToList(),
                dict,
                true);
        }

        if (isForceItemOne || existItems.Count < 2)
        {
            foreach (var keyValuePair in dict.OrderByDescending(d => d.Value))
            {
                var items = GetDiffNodes(doc2, keyValuePair.Key, isAttrEnabled, existItems, 4);
                if (items.Count == 0)
                {
                    continue;
                }

                // getCrawTarget returns null when it cannot build a target; skip such candidates.
                var target = getCrawTarget(items, keyValuePair.Key);
                if (target == null)
                {
                    continue;
                }

                var firstMatch = doc2.DocumentNode.SelectSingleNode(keyValuePair.Key);
                if (firstMatch == null)
                {
                    continue;
                }

                var rootNode = firstMatch.ParentNode;
                if (rootNode == null)
                {
                    continue;
                }

                target.Html = rootNode.InnerHtml;
                target.Text = rootNode.InnerText;
                target.NodeCount = doc2.DocumentNode.SelectNodes(keyValuePair.Key).Count;
                target.Score = keyValuePair.Value;
                target.ColumnCount = items.Count;
                yield return target;
            }
        }
    }
    else
    {
        // Explicit root: discover items under it, ignoring existItems.
        var items = GetDiffNodes(doc2, rootPath, isAttrEnabled, new List<CrawlItem>());
        if (items.Count > 0)
        {
            var root = doc2.DocumentNode.SelectSingleNode(rootPath);
            if (root != null)
            {
                var xpath = XPath.RemoveFinalNum(root.XPath);
                var target = getCrawTarget(items, xpath);
                if (target != null)
                {
                    target.RootXPath = rootPath;
                    yield return target;
                }
            }
        }
    }
}
/// <summary>
/// Finds the places where a group of parallel nodes differ and records each one in
/// <paramref name="result"/> as a <c>CrawlItem</c> with a name and an XPath.
/// </summary>
/// <remarks>
/// The first node serves as the template. A template whose only child is a text node is compared
/// directly by inner text; otherwise each child of the template is matched against the same relative
/// path in every node and examined recursively. Attribute values are compared only when
/// <paramref name="isAttrEnabled"/> is set.
/// </remarks>
/// <param name="nodes">Parallel nodes to compare; the first two are read unconditionally.</param>
/// <param name="result">Receives the discovered items.</param>
/// <param name="buffers">Value rows already recorded; a text row equal to one of these is not recorded again.</param>
/// <param name="isAttrEnabled">Whether data held in tag attributes is collected as well.</param>
/// <returns>
/// For a text-only template: whether the texts differ. Otherwise: whether any child reported a difference.
/// </returns>
private static bool GetDiffNodes(List<HtmlNode> nodes, List<CrawlItem> result, List<List<string>> buffers, bool isAttrEnabled)
{
    var isChildContainInfo = false;
    var node1 = nodes.First();
    var node2 = nodes[1];

    if (node1.ChildNodes.Count == 1 && node1.ChildNodes[0].NodeType == HtmlNodeType.Text)
    {
        var row = nodes.Select(d => d.SelectSingleNode(d.XPath))
            .Where(d => d != null)
            .Select(d => d.InnerText.Trim())
            .ToList();
        if (row.Any(d => CompareString(d, node1.InnerText) == false))
        {
            // Falls back to "属性" + index when no property name can be derived.
            var name = node1.SearchPropertyName(result) ?? "属性" + result.Count;
            if (buffers.Any(d => ListEqual(d, row)))
            {
                return true;
            }

            var crawlItem = new CrawlItem
            {
                Name = name,
                SampleData1 = node1.InnerText,
                // Alternates between the first and second node's path from one recorded item to the next.
                XPath = result.Count % 2 == 0 ? node1.XPath : node2.XPath
            };
            result.Add(crawlItem);
            buffers.Add(row);
            return true;
        }

        return false;
    }

    foreach (var nodechild1 in node1.ChildNodes)
    {
        if (nodechild1.XPath.Contains("#"))
        {
            continue;
        }

        var path = new XPath(nodechild1.XPath).TakeOff(node1.XPath).ToString();
        var nodechild2 = nodes.Select(d => d.SelectSingleNode(d.XPath + path))
            .Where(d => d != null)
            .ToList();

        // The recursive call reads the first two entries, so anything shorter — including an empty
        // list, which the former "== 1" test let through — must be skipped.
        if (nodechild2.Count < 2)
        {
            continue;
        }

        isChildContainInfo |= GetDiffNodes(nodechild2, result, buffers, isAttrEnabled);
    }

    if (isAttrEnabled == false)
    {
        return isChildContainInfo;
    }

    foreach (var attribute in node1.Attributes)
    {
        var attr1 = attribute.Value;
        var row = nodes.Select(d => d.SelectSingleNode(d.XPath))
            .Where(d => d != null)
            .Where(d => d.Attributes.Contains(attribute.Name))
            .Select(d => d.Attributes[attribute.Name].Value)
            .ToList();
        if (row.Any(d => CompareString(d, attr1) == false))
        {
            if (buffers.Any(d => ListEqual(d, row)))
            {
                return isChildContainInfo;
            }

            var name = node1.SearchPropertyName(result);
            if (name != null)
            {
                name += '_' + attribute.Name;
            }
            else
            {
                name = "属性" + result.Count;
            }

            // NOTE(review): unlike the text branch, the row is not added to buffers here — confirm intended.
            var craw = new CrawlItem
            {
                Name = name,
                SampleData1 = attr1,
                XPath = attribute.XPath
            };
            result.Add(craw);
        }
    }

    return isChildContainInfo;
}
/// <summary>
/// Extracts documents from <paramref name="doc2"/> according to <paramref name="crawlItems"/>.
/// </summary>
/// <remarks>
/// <c>ListType.One</c> fills a single document from all items. <c>ListType.List</c> selects every node
/// matching the root path and fills one document per node. The root is <paramref name="rootXPath"/> when
/// given; otherwise it is the path returned by <c>XPath.GetMaxCompareXPath</c> for all items, which is then
/// also taken off each item's XPath before the lookup.
/// </remarks>
/// <param name="doc2">Document to read from.</param>
/// <param name="crawlItems">Items describing what to extract.</param>
/// <param name="type">Extraction mode.</param>
/// <param name="rootXPath">Explicit root path for list mode; null or empty derives it from the items.</param>
/// <returns>
/// The extracted documents. The list is empty when there are no items, when list mode finds no root
/// nodes, or when <paramref name="type"/> is neither <c>List</c> nor <c>One</c>.
/// </returns>
public static List<FreeDocument> GetDataFromXPath(this HtmlDocument doc2, IList<CrawlItem> crawlItems, ListType type = ListType.List, string rootXPath = "")
{
    if (crawlItems.Count == 0)
    {
        return new List<FreeDocument>();
    }

    if (type == ListType.One)
    {
        var single = new FreeDocument();
        foreach (var r in crawlItems)
        {
            doc2.GetDataFromXPath(r, single);
        }

        return new List<FreeDocument> { single };
    }

    if (type != ListType.List)
    {
        return new List<FreeDocument>();
    }

    string root;
    var takeoff = "";
    if (string.IsNullOrEmpty(rootXPath))
    {
        root = XPath.GetMaxCompareXPath(crawlItems.Select(d => new XPath(d.XPath)).ToList()).ToString();
        takeoff = root;
    }
    else
    {
        root = rootXPath;
    }

    var nodes = doc2.DocumentNode.SelectNodes(root);
    if (nodes == null)
    {
        return new List<FreeDocument>();
    }

    var documents = new List<FreeDocument>();
    foreach (var node in nodes)
    {
        var document = new FreeDocument();
        foreach (var r in crawlItems)
        {
            // With an explicit root the item paths are appended to the node's path as they are;
            // with a derived root that root is taken off first.
            string path = string.IsNullOrEmpty(takeoff)
                ? node.XPath + r.XPath
                : node.XPath + new XPath(r.XPath).TakeOff(takeoff);
            var result = node.GetDataFromXPath(path, r.IsHTML);
            document.SetValue(r.Name, result);
        }

        documents.Add(document);
    }

    return documents;
}
/// <summary>
/// Finds the places where a group of parallel nodes differ and records each one in
/// <paramref name="result"/> as a <c>CrawlItem</c> with a name and an XPath.
/// </summary>
/// <remarks>
/// Two sample nodes are picked through <c>Random.Next</c>: one index from the first half of the list and
/// one from the second half. A first sample whose only child is a text node is compared directly by inner
/// text; otherwise each of its children is matched against the same relative path in every node and
/// examined recursively. Attribute values are compared only when <paramref name="isAttrEnabled"/> is set.
/// </remarks>
/// <param name="nodes">Parallel nodes to compare.</param>
/// <param name="result">Receives the discovered items.</param>
/// <param name="buffers">Value rows already recorded; a text row equal to one of these is not recorded again.</param>
/// <param name="isAttrEnabled">Whether data held in tag attributes is collected as well.</param>
/// <returns>
/// For a text-only sample: whether the texts differ. Otherwise: whether any child reported a difference.
/// </returns>
private static bool GetDiffNodes(List <HtmlNode> nodes, List <CrawlItem> result, List <List <string> > buffers, bool isAttrEnabled)
{
    var isChildContainInfo = false;
    var len = nodes.Count;
    // NOTE(review): Random is declared outside this block — presumably a shared System.Random; confirm.
    var node1 = nodes[Random.Next(0, len / 2)];
    var node2 = nodes[Random.Next(len / 2, len)];
    if (node1.ChildNodes.Count == 1 && node1.ChildNodes[0].NodeType == HtmlNodeType.Text)
    {
        // Text-only sample: collect the trimmed inner text of every node that can be re-selected by its own XPath.
        var row = nodes.Select(d => d.SelectSingleNode(d.XPath))
                  .Where(d => d != null).Select(d => d.InnerText.Trim()).ToList();
        if (row.Any(d => CompareString(d, node1.InnerText) == false))
        {
            // The ?? fallback is the whole concatenation: "属性" followed by the current result count.
            var name = node1.SearchPropertyName(result) ?? "属性" + result.Count;
            if (buffers.Any(d => ListEqual(d, row)))
            {
                return(true);
            }
            var crawlItem = new CrawlItem
            {
                Name = name, SampleData1 = node1.InnerText,
                // Alternates between the two samples' paths from one recorded item to the next.
                XPath = result.Count % 2 == 0 ? node1.XPath : node2.XPath
            };
            result.Add(crawlItem);
            buffers.Add(row);
            return(true);
        }
        return(false);
    }
    foreach (var nodechild1 in node1.ChildNodes)
    {
        if (nodechild1.XPath.Contains("#"))
        {
            continue;
        }
        var path = XPath.TakeOff(nodechild1.XPath, node1.XPath);
        // Set by the lambda below as soon as one lookup throws; later lookups then return null at once.
        var fail = false;
        var nodechild2 = nodes.Select(d =>
        {
            if (fail)
            {
                return(null);
            }
            try
            {
                var node = d.SelectSingleNode(d.XPath + path);
                return(node);
            }
            catch (Exception)
            {
                fail = true;
                return(null);
            }
        }).Where(d => d != null && fail == false).ToList();
        // Skip this child when fewer than two nodes matched or any lookup failed.
        if (nodechild2.Count < 2 || fail)
        {
            continue;
        }
        isChildContainInfo |= GetDiffNodes(nodechild2, result, buffers, isAttrEnabled);
    }
    if (isAttrEnabled == false)
    {
        return(isChildContainInfo);
    }
    foreach (var attribute in node1.Attributes)
    {
        var attr1 = attribute.Value; ;
        // Collect this attribute's value from every node that can be re-selected and carries the attribute.
        var row = nodes.Select(d =>
        {
            try
            {
                return(d.SelectSingleNode(d.XPath));
            }
            catch (Exception ex)
            {
                XLogSys.Print.Error("XPath表达式编写错误: " + d.XPath);
                return(null);
            }
        })
                  .Where(d => d != null)
                  .Where(d => d.Attributes.Contains(attribute.Name))
                  .Select(d =>
                          d.Attributes[attribute.Name].Value)
                  .ToList();
        if (row.Any(d => CompareString(d, attr1) == false))
        {
            if (buffers.Any(d => ListEqual(d, row)))
            {
                return(isChildContainInfo);
            }
            var name = node1.SearchPropertyName(result);
            if (name != null)
            {
                name += '_' + attribute.Name;
            }
            else
            {
                name = "属性" + result.Count;
            }
            // NOTE(review): unlike the text branch, the row is not added to buffers here — confirm intended.
            var craw = new CrawlItem
            {
                Name = name, SampleData1 = attr1, XPath = attribute.XPath
            };
            result.Add(craw);
        }
    }
    return(isChildContainInfo);
}