/// <summary>
/// Lazily proposes crawl targets for an HTML document.
/// </summary>
/// <remarks>
/// Three modes, selected by the arguments:
/// <list type="bullet">
/// <item><description>
/// <paramref name="rootPath"/> is non-empty: items are collected under that path via
/// <c>GetDiffNodes</c> and at most one target is yielded, with its <c>RootXPath</c> set to
/// <paramref name="rootPath"/>.
/// </description></item>
/// <item><description>
/// More than one existing item: the XPath returned by <c>XPath.GetMaxCompareXPath</c> is used as
/// the root. No match ends the sequence; exactly one match falls back to single-item mode.
/// </description></item>
/// <item><description>
/// Zero or one existing item (or the fallback above): candidate roots are scored by
/// <c>GetTableRootProbability</c> and one target per usable candidate is yielded in descending
/// score order.
/// </description></item>
/// </list>
/// Candidates whose XPath no longer resolves, or for which <c>getCrawTarget</c> returns
/// <c>null</c>, are skipped instead of raising <see cref="NullReferenceException"/>.
/// </remarks>
/// <param name="doc2">Document to analyse.</param>
/// <param name="existItems">Items already selected by the caller; <c>null</c> is treated as empty.</param>
/// <param name="rootPath">Optional explicit root XPath; when set, no root discovery is performed.</param>
/// <param name="isAttrEnabled">Forwarded unchanged to <c>GetDiffNodes</c>.</param>
/// <returns>A deferred sequence of non-null crawl targets.</returns>
public static IEnumerable<CrawTarget> SearchPropertiesSmart(this HtmlDocument doc2,
    ICollection<CrawlItem> existItems = null, string rootPath = null, bool isAttrEnabled = false)
{
    if (existItems == null)
    {
        existItems = new List<CrawlItem>();
    }

    if (!string.IsNullOrEmpty(rootPath))
    {
        // Explicit root supplied by the caller: no discovery needed.
        var rootedItems = GetDiffNodes(doc2, rootPath, isAttrEnabled, new List<CrawlItem>());
        if (rootedItems.Count == 0)
        {
            yield break;
        }

        // SelectSingleNode returns null when the XPath matches nothing.
        var root = doc2.DocumentNode.SelectSingleNode(rootPath);
        if (root == null)
        {
            yield break;
        }

        var normalizedPath = XPath.RemoveFinalNum(root.XPath);
        var rootedTarget = getCrawTarget(rootedItems, normalizedPath);
        if (rootedTarget == null)
        {
            yield break;
        }

        rootedTarget.RootXPath = rootPath;
        yield return rootedTarget;
        yield break;
    }

    // Force the "only one property" mode even though several items were supplied.
    var isForceItemOne = false;

    if (existItems.Count > 1)
    {
        var sharedPath = XPath.GetMaxCompareXPath(existItems.Select(d => d.XPath));
        var sharedNodes = doc2.DocumentNode.SelectNodes(sharedPath);
        if (sharedNodes == null || sharedNodes.Count == 0)
        {
            yield break;
        }

        if (sharedNodes.Count == 1)
        {
            // The shared path matches a single node, so fall back to the single-item search.
            isForceItemOne = true;
        }
        else
        {
            var sharedItems = GetDiffNodes(doc2, sharedPath, isAttrEnabled, existItems, 1);
            var sharedTarget = getCrawTarget(sharedItems, sharedPath);
            if (sharedTarget != null)
            {
                yield return sharedTarget;
            }
        }
    }

    // Candidate root XPath -> score, filled in by GetTableRootProbability.
    var dict = new Dictionary<string, double>();

    if (isForceItemOne || existItems.Count == 1)
    {
        // Score the children of every sub-path derived from the first item's XPath.
        var realPath = existItems.First().XPath;
        var segments = XPath.Split(realPath);
        var ancestorPaths = segments.SelectAll(d => true)
            .Select(d => XPath.SubXPath(realPath, d))
            .ToList();

        foreach (var ancestorPath in ancestorPaths)
        {
            var ancestor = doc2.DocumentNode.SelectSingleNode(ancestorPath);
            if (ancestor == null)
            {
                // The sub-path does not resolve in this document; nothing to score.
                continue;
            }

            GetTableRootProbability(
                ancestor.ChildNodes.Where(d2 => !d2.Name.Contains("#")).ToList(),
                dict, false);
        }
    }
    else
    {
        // NOTE(review): when several items exist and the shared path matched more than one node,
        // dict is never read afterwards; the call is kept to preserve the original behaviour.
        GetTableRootProbability(
            doc2.DocumentNode.ChildNodes.Where(d => !d.Name.Contains("#")).ToList(),
            dict, true);
    }

    if (!isForceItemOne && existItems.Count >= 2)
    {
        yield break;
    }

    foreach (var candidate in dict.OrderByDescending(d => d.Value))
    {
        var items = GetDiffNodes(doc2, candidate.Key, isAttrEnabled, existItems, 4);
        if (items.Count == 0)
        {
            continue;
        }

        var target = getCrawTarget(items, candidate.Key);
        if (target == null)
        {
            continue;
        }

        // One query serves both the first match (formerly SelectSingleNode) and the node count.
        var matches = doc2.DocumentNode.SelectNodes(candidate.Key);
        if (matches == null || matches.Count == 0)
        {
            continue;
        }

        var rootNode = matches[0].ParentNode;
        if (rootNode == null)
        {
            continue;
        }

        target.Html = rootNode.InnerHtml;
        target.Text = rootNode.InnerText;
        target.NodeCount = matches.Count;
        target.Score = candidate.Value;
        target.ColumnCount = items.Count;
        yield return target;
    }
}
/// <summary>
/// Lazily proposes crawl targets for an HTML document.
/// </summary>
/// <remarks>
/// Three modes, selected by the arguments:
/// <list type="bullet">
/// <item><description>
/// <paramref name="rootPath"/> is non-empty: items are collected under that path via
/// <c>GetDiffNodes</c>, each item's <c>XPath</c> is rewritten with <c>XPath.TakeOff</c> against
/// the root node's XPath, and at most one target is yielded.
/// </description></item>
/// <item><description>
/// More than one existing item: the XPath returned by <c>XPath.GetMaxCompareXPath</c> is used as
/// the root and at most one target is yielded.
/// </description></item>
/// <item><description>
/// Zero or one existing item: candidate roots are scored by <c>GetTableRootProbability</c> and a
/// target is yielded for each candidate, in descending score order, whenever
/// <c>getCrawTarget</c> returns non-null.
/// </description></item>
/// </list>
/// XPaths that no longer resolve in the document are skipped instead of raising
/// <see cref="NullReferenceException"/>.
/// </remarks>
/// <param name="doc2">Document to analyse.</param>
/// <param name="existItems">Items already selected by the caller; <c>null</c> is treated as empty.</param>
/// <param name="rootPath">Optional explicit root XPath; when set, no root discovery is performed.</param>
/// <param name="isAttrEnabled">Forwarded unchanged to <c>GetDiffNodes</c>.</param>
/// <returns>A deferred sequence of non-null crawl targets.</returns>
public static IEnumerable<CrawTarget> SearchPropertiesSmart(this HtmlDocument doc2,
    ICollection<CrawlItem> existItems = null, string rootPath = null, bool isAttrEnabled = false)
{
    if (existItems == null)
    {
        existItems = new List<CrawlItem>();
    }

    if (!string.IsNullOrEmpty(rootPath))
    {
        // Explicit root supplied by the caller: no discovery needed.
        var rootedItems = GetDiffNodes(doc2, rootPath, isAttrEnabled, new List<CrawlItem>());
        if (rootedItems.Count == 0)
        {
            yield break;
        }

        // SelectSingleNode returns null when the XPath matches nothing.
        var root = doc2.DocumentNode.SelectSingleNode(rootPath);
        if (root == null)
        {
            yield break;
        }

        // Read the root's XPath once rather than on every iteration.
        var rootXPath = root.XPath;
        foreach (var crawlItem in rootedItems)
        {
            crawlItem.XPath = XPath.TakeOff(crawlItem.XPath, rootXPath);
        }

        var rootedTarget = getCrawTarget(rootedItems, rootPath);
        if (rootedTarget != null)
        {
            yield return rootedTarget;
        }

        yield break;
    }

    if (existItems.Count > 1)
    {
        // Several known items: their shared XPath is taken as the root.
        var sharedPath = XPath.GetMaxCompareXPath(existItems.Select(d => d.XPath));
        var sharedItems = GetDiffNodes(doc2, sharedPath, isAttrEnabled, existItems, 1);
        var sharedTarget = getCrawTarget(sharedItems, sharedPath);
        if (sharedTarget != null)
        {
            yield return sharedTarget;
        }

        yield break;
    }

    // From here on existItems holds zero or one item.
    // Candidate root XPath -> score, filled in by GetTableRootProbability.
    var dict = new Dictionary<string, double>();

    if (existItems.Count == 1)
    {
        // Score the children of every sub-path derived from the single item's XPath.
        var realPath = existItems.First().XPath;
        var segments = XPath.Split(realPath);
        var ancestorPaths = segments.SelectAll(d => true)
            .Select(d => XPath.SubXPath(realPath, d))
            .ToList();

        foreach (var ancestorPath in ancestorPaths)
        {
            var ancestor = doc2.DocumentNode.SelectSingleNode(ancestorPath);
            if (ancestor == null)
            {
                // The sub-path does not resolve in this document; nothing to score.
                continue;
            }

            GetTableRootProbability(
                ancestor.ChildNodes.Where(d2 => !d2.Name.Contains("#")).ToList(),
                dict, false);
        }
    }
    else
    {
        // No hints at all: score starting from the document's top-level children.
        GetTableRootProbability(
            doc2.DocumentNode.ChildNodes.Where(d => !d.Name.Contains("#")).ToList(),
            dict, true);
    }

    foreach (var candidate in dict.OrderByDescending(d => d.Value))
    {
        var items = GetDiffNodes(doc2, candidate.Key, isAttrEnabled, existItems, 4);
        var target = getCrawTarget(items, candidate.Key);
        if (target != null)
        {
            yield return target;
        }
    }
}