/// <summary>
/// Compares the given paths segment by segment, starting at index 0, to find where they stop
/// agreeing, and returns a path cut from the first item at that position.
/// </summary>
/// <param name="items">
/// Paths to compare. The comparison is bounded by the shortest path's <c>Count</c>.
/// </param>
/// <returns>
/// The result of <c>SubXPath(n + 1)</c> on the first item, where <c>n</c> is the index of the first
/// segment that differs between the paths (or the shortest path's length when no segment differs),
/// after <c>RemoveFinalNum()</c> has been invoked on it.
/// </returns>
public static XPath GetMaxCompareXPath(IList<XPath> items)
{
    int shortest = items.Min(path => path.Count);

    int depth = 0;
    bool diverged = false;
    while (depth < shortest && !diverged)
    {
        // The first path supplies the reference segment for this depth.
        string expected = items[0][depth];
        for (int k = 1; k < items.Count; k++)
        {
            if (expected != items[k][depth])
            {
                diverged = true;
                break;
            }
        }

        // Only advance when every path agreed at this depth, so that
        // depth stays on the first mismatching index.
        if (!diverged)
        {
            depth++;
        }
    }

    XPath prefix = items.First().SubXPath(depth + 1);
    prefix.RemoveFinalNum();
    return prefix;
}
/// <summary>
/// Scores how likely a group of sibling nodes is the set of rows of a list, and stores the score in
/// <paramref name="dict"/> under the first node's XPath after <c>XPath.RemoveFinalNum</c>.
/// </summary>
/// <param name="nodes">
/// Sibling nodes to evaluate. A null or empty list is ignored; groups with fewer than three nodes
/// are not scored.
/// </param>
/// <param name="dict">Receives one score per XPath key via <c>SetValue</c>.</param>
/// <param name="haschild">
/// When true, the children of every node (excluding nodes whose name contains '#') are evaluated
/// recursively before this level is scored.
/// </param>
private static void GetTableRootProbability(IList<HtmlNode> nodes, Dictionary<string, double> dict, bool haschild)
{
    if (nodes == null || nodes.Count == 0)
    {
        return;
    }

    var xpath = XPath.RemoveFinalNum(nodes[0].XPath);

    if (haschild)
    {
        foreach (var htmlNode in nodes)
        {
            GetTableRootProbability(
                htmlNode.ChildNodes.Where(child => !child.Name.Contains("#")).ToList(),
                dict,
                haschild);
        }
    }

    if (nodes.Count < 3)
    {
        return;
    }

    var childCount = (double)nodes.Count;
    var childCounts = nodes.Select(n => (double)n.ChildNodes.Count).ToArray();
    var v = childCounts.Variance();

    // TODO: a better measure is needed here, because the valid nodes are often interleaved
    // with other nodes.
    // v == 100 is rejected as well: Math.Log10((100 - 100) / 100) is negative infinity and
    // would store a non-finite score.
    if (v >= 100)
    {
        return;
    }

    var leafCount = nodes[nodes.Count - 1].GetLeafNodeCount();

    // Identical child counts (zero variance) double the base score; otherwise the base score
    // is multiplied by log10((100 - v) / 100), which is negative for 0 < v < 100.
    var value = (childCount * PM25 + leafCount) * (v == 0 ? 2 : Math.Log10((100 - v) / 100));

    dict.SetValue(xpath, value);
}
/// <summary>
/// Lazily enumerates candidate crawl targets found in the document.
/// </summary>
/// <remarks>
/// <para>
/// With a non-empty <paramref name="rootPath"/>, columns are looked up under that path only and at
/// most one target is produced.
/// </para>
/// <para>
/// Otherwise, with two or more <paramref name="existItems"/>, the common path of their XPaths is
/// used: if it matches several nodes, at most one target is produced from it; if it matches exactly
/// one node, the search falls back to the single-item mode below; if it matches nothing, the
/// sequence is empty.
/// </para>
/// <para>
/// In single-item mode the groups along the first item's XPath are scored; with no items the whole
/// document is scored. Scored paths are then tried in descending score order, and every path that
/// yields columns produces a target.
/// </para>
/// </remarks>
/// <param name="doc2">Document to search.</param>
/// <param name="existItems">Items already selected; null is treated as an empty collection.</param>
/// <param name="rootPath">Optional XPath of a known list root.</param>
/// <param name="isAttrEnabled">Passed through to <c>GetDiffNodes</c>.</param>
/// <returns>A deferred sequence of candidate targets.</returns>
public static IEnumerable<CrawTarget> SearchPropertiesSmart(this HtmlDocument doc2,
    ICollection<CrawlItem> existItems = null, string rootPath = null, bool isAttrEnabled = false)
{
    if (existItems == null)
    {
        existItems = new List<CrawlItem>();
    }

    // ---- Explicit root path: a single target at most. ----
    if (!string.IsNullOrEmpty(rootPath))
    {
        var rootedColumns = GetDiffNodes(doc2, rootPath, isAttrEnabled, new List<CrawlItem>());
        if (rootedColumns.Count == 0)
        {
            yield break;
        }

        var root = doc2.DocumentNode.SelectSingleNode(rootPath);
        if (root == null)
        {
            yield break;
        }

        var rootedTarget = getCrawTarget(rootedColumns, XPath.RemoveFinalNum(root.XPath));
        if (rootedTarget == null)
        {
            yield break;
        }

        rootedTarget.RootXPath = rootPath;
        yield return rootedTarget;
        yield break;
    }

    // ---- Several known items: try their common path first. ----
    // Set when the common path matches exactly one node, which forces single-item mode.
    var isForceItemOne = false;
    if (existItems.Count > 1)
    {
        string shortv = XPath.GetMaxCompareXPath(existItems.Select(item => item.XPath));
        var nodes = doc2.DocumentNode.SelectNodes(shortv);
        if (nodes == null || nodes.Count == 0)
        {
            yield break;
        }

        if (nodes.Count == 1)
        {
            isForceItemOne = true;
        }
        else
        {
            var sharedColumns = GetDiffNodes(doc2, shortv, isAttrEnabled, existItems, 1);
            var sharedTarget = getCrawTarget(sharedColumns, shortv);
            if (sharedTarget != null)
            {
                yield return sharedTarget;
            }

            // Nothing else is produced in this mode, so stop here instead of scoring the
            // whole document for a result that would never be read.
            yield break;
        }
    }

    // ---- From here on: zero items, one item, or forced single-item mode. ----
    var dict = new Dictionary<string, double>();
    if (existItems.Count == 0)
    {
        // No hint at all: score every sibling group in the document.
        GetTableRootProbability(
            doc2.DocumentNode.ChildNodes.Where(child => child.Name.Contains("#") == false).ToList(),
            dict,
            true);
    }
    else
    {
        // Score only the sibling groups along the first item's path.
        var realPath = existItems.First().XPath;
        var segments = XPath.Split(realPath);
        var ancestorPaths = segments.SelectAll(segment => true)
            .Select(segment => XPath.SubXPath(realPath, segment))
            .ToList();

        foreach (var ancestorPath in ancestorPaths)
        {
            var ancestor = doc2.DocumentNode.SelectSingleNode(ancestorPath);
            if (ancestor == null)
            {
                // The path does not resolve in this document; skip it.
                continue;
            }

            GetTableRootProbability(
                ancestor.ChildNodes.Where(child => child.Name.Contains("#") == false).ToList(),
                dict,
                false);
        }
    }

    foreach (var candidate in dict.OrderByDescending(entry => entry.Value))
    {
        var columns = GetDiffNodes(doc2, candidate.Key, isAttrEnabled, existItems, 4);
        if (columns.Count == 0)
        {
            continue;
        }

        var target = getCrawTarget(columns, candidate.Key);
        if (target == null)
        {
            continue;
        }

        var sample = doc2.DocumentNode.SelectSingleNode(candidate.Key);
        if (sample == null)
        {
            continue;
        }

        var rootNode = sample.ParentNode;
        if (rootNode == null)
        {
            continue;
        }

        var matches = doc2.DocumentNode.SelectNodes(candidate.Key);

        target.Html = rootNode.InnerHtml;
        target.Text = rootNode.InnerText;
        target.NodeCount = matches == null ? 0 : matches.Count;
        target.Score = candidate.Value;
        target.ColumnCount = columns.Count;
        yield return target;
    }
}