/// <summary>
/// Lazily searches <paramref name="doc2"/> for candidate item lists. The root XPaths that are
/// tried depend on how many items are already present in <paramref name="existItems"/>.
/// </summary>
/// <param name="doc2">Document whose <c>DocumentNode</c> is searched.</param>
/// <param name="existItems">Items selected so far; <c>null</c> is treated as an empty collection.</param>
/// <param name="isAttrEnabled">Passed through unchanged to <c>GetDiffNodes</c>.</param>
/// <returns>
/// A deferred sequence with one <c>GetDiffNodes</c> result per candidate root XPath.
/// With no existing items, only results containing more than one item are yielded.
/// </returns>
public static IEnumerable<List<CrawlItem>> SearchPropertiesSmart(this HtmlDocument doc2, ICollection<CrawlItem> existItems = null, bool isAttrEnabled = false)
{
    if (existItems == null)
    {
        existItems = new List<CrawlItem>();
    }

    if (existItems.Count > 1)
    {
        // Several items are known: the XPath returned by GetMaxCompareXPath for all of them
        // is the only root that is tried.
        var commonRoot = XPath.GetMaxCompareXPath(existItems.Select(d => new XPath(d.XPath)).ToList()).ToString();
        yield return GetDiffNodes(doc2, commonRoot, isAttrEnabled, existItems);
        yield break;
    }

    // Filled by GetTableRootProbability; entries are later visited by descending Value and
    // the Key is used as the root XPath (presumably XPath -> score; confirm against the helper).
    var dict = new Dictionary<string, double>();

    if (existItems.Count == 1)
    {
        // One item is known: evaluate every XPath built from the leading parts of its path.
        var realPath = new XPath(existItems.First().XPath);
        var prefixes = realPath.SelectAll(d => true)
            .Select(d => new XPath(realPath.Take(d)).ToString())
            .ToList();

        foreach (var prefix in prefixes)
        {
            // SelectSingleNode returns null when nothing matches (e.g. the item came from a
            // differently shaped page); skip that prefix instead of throwing.
            var prefixNode = doc2.DocumentNode.SelectSingleNode(prefix);
            if (prefixNode == null)
            {
                continue;
            }

            // Nodes whose Name contains '#' are ignored (presumably the parser's
            // #text / #comment pseudo-nodes).
            var children = prefixNode.ChildNodes.Where(d2 => !d2.Name.Contains("#")).ToList();
            GetTableRootProbability(children, dict, false);
        }

        foreach (var candidate in dict.OrderByDescending(d => d.Value))
        {
            yield return GetDiffNodes(doc2, candidate.Key, isAttrEnabled, existItems);
        }
    }
    else
    {
        // Nothing is known yet: evaluate the document's top-level children.
        var topLevel = doc2.DocumentNode.ChildNodes.Where(d => !d.Name.Contains("#")).ToList();
        GetTableRootProbability(topLevel, dict, true);

        foreach (var candidate in dict.OrderByDescending(d => d.Value))
        {
            var items = GetDiffNodes(doc2, candidate.Key, isAttrEnabled, existItems);
            if (items.Count > 1)
            {
                yield return items;
            }
        }
    }
}
/// <summary>
/// Computes the XPath returned by <c>XPath.GetMaxCompareXPath</c> for all items and, when it
/// is non-empty, rewrites every item's <c>XPath</c> to the result of <c>TakeOff</c> with it.
/// </summary>
/// <param name="doc2">Receiver of the extension method; not used by this method.</param>
/// <param name="crawlItem">Items whose <c>XPath</c> values are read and, possibly, overwritten.</param>
/// <returns>The computed XPath string, or an empty string when it is null or empty.</returns>
public static string CompileCrawItems(this HtmlDocument doc2, IList<CrawlItem> crawlItem)
{
    var itemPaths = crawlItem.Select(item => new XPath(item.XPath)).ToList();
    var commonRoot = XPath.GetMaxCompareXPath(itemPaths).ToString();

    if (string.IsNullOrEmpty(commonRoot))
    {
        return "";
    }

    crawlItem.Execute(item => item.XPath = new XPath(item.XPath).TakeOff(commonRoot).ToString());
    return commonRoot;
}
/// <summary>
/// Runs the base initialisation, then builds <c>xpaths</c>: one entry per distinct
/// <c>CrawlItem.Name</c>, holding the <c>XPath.GetMaxCompareXPath</c> result for the XPaths
/// of the items that share that name.
/// </summary>
/// <param name="docus">Forwarded to <c>base.Init</c>; its return value is ignored.</param>
/// <returns><c>false</c> when <c>Crawler</c> is null, otherwise <c>true</c>.</returns>
public override bool Init(IEnumerable<IFreeDocument> docus)
{
    base.Init(docus);

    if (Crawler == null)
    {
        return false;
    }

    IsMultiYield = true;

    // GroupBy keys are unique, so they can serve directly as dictionary keys.
    xpaths = Crawler.CrawlItems
        .GroupBy(item => item.Name)
        .ToDictionary(
            sameName => sameName.Key,
            sameName => XPath.GetMaxCompareXPath(sameName.Select(item => item.XPath).ToList()));

    return true;
}
/// <summary>
/// Extracts documents from <paramref name="doc2"/> according to <paramref name="crawlItems"/>.
/// </summary>
/// <param name="doc2">Document to read from.</param>
/// <param name="crawlItems">Items to extract; an empty list produces an empty result.</param>
/// <param name="type">
/// <c>ListType.List</c>: one document per node matched by the root XPath.
/// <c>ListType.One</c>: a single document filled from every item.
/// Any other value produces an empty result.
/// </param>
/// <param name="rootXPath">
/// Root XPath for list mode. When null or empty, the root is computed with
/// <c>XPath.GetMaxCompareXPath</c> and removed from each item's XPath via <c>TakeOff</c>.
/// When supplied, each item's XPath is appended to the node's XPath unchanged.
/// </param>
/// <returns>The extracted documents; empty when the root XPath matches nothing.</returns>
public static List<FreeDocument> GetDataFromXPath(this HtmlDocument doc2, IList<CrawlItem> crawlItems, ListType type = ListType.List, string rootXPath = "")
{
    if (crawlItems.Count == 0)
    {
        return new List<FreeDocument>();
    }

    if (type == ListType.One)
    {
        var single = new FreeDocument();
        foreach (var crawlItem in crawlItems)
        {
            doc2.GetDataFromXPath(crawlItem, single);
        }

        return new List<FreeDocument> { single };
    }

    if (type != ListType.List)
    {
        return new List<FreeDocument>();
    }

    // Decide the list root and whether it has to be stripped from the item paths.
    string listRoot;
    var prefixToStrip = "";
    if (string.IsNullOrEmpty(rootXPath))
    {
        listRoot = XPath.GetMaxCompareXPath(crawlItems.Select(d => new XPath(d.XPath)).ToList()).ToString();
        prefixToStrip = listRoot;
    }
    else
    {
        listRoot = rootXPath;
    }

    var rowNodes = doc2.DocumentNode.SelectNodes(listRoot);
    if (rowNodes == null)
    {
        return new List<FreeDocument>();
    }

    // The strip/no-strip choice does not depend on the row or the item.
    var keepItemPath = string.IsNullOrEmpty(prefixToStrip);
    var rows = new List<FreeDocument>();

    foreach (var rowNode in rowNodes)
    {
        var row = new FreeDocument();
        foreach (var crawlItem in crawlItems)
        {
            string cellPath;
            if (keepItemPath)
            {
                cellPath = rowNode.XPath + crawlItem.XPath;
            }
            else
            {
                cellPath = rowNode.XPath + new XPath(crawlItem.XPath).TakeOff(prefixToStrip);
            }

            var cellValue = rowNode.GetDataFromXPath(cellPath, crawlItem.IsHTML);
            row.SetValue(crawlItem.Name, cellValue);
        }

        rows.Add(row);
    }

    return rows;
}
/// <summary>
/// Lazily searches <paramref name="doc2"/> for crawl targets, either under an explicit
/// <paramref name="rootPath"/> or under candidate roots derived from
/// <paramref name="existItems"/>.
/// </summary>
/// <param name="doc2">Document whose <c>DocumentNode</c> is searched.</param>
/// <param name="existItems">Items selected so far; <c>null</c> is treated as an empty collection.</param>
/// <param name="rootPath">
/// When non-empty, only this root is examined and <paramref name="existItems"/> is ignored.
/// </param>
/// <param name="isAttrEnabled">Passed through unchanged to <c>GetDiffNodes</c>.</param>
/// <returns>
/// A deferred sequence of targets. Candidates for which <c>getCrawTarget</c> returns null,
/// or whose XPath no longer matches a node, are skipped.
/// </returns>
public static IEnumerable<CrawTarget> SearchPropertiesSmart(this HtmlDocument doc2, ICollection<CrawlItem> existItems = null, string rootPath = null, bool isAttrEnabled = false)
{
    if (existItems == null)
    {
        existItems = new List<CrawlItem>();
    }

    if (!string.IsNullOrEmpty(rootPath))
    {
        // Explicit root supplied by the caller.
        var rootedItems = GetDiffNodes(doc2, rootPath, isAttrEnabled, new List<CrawlItem>());
        if (rootedItems.Count == 0)
        {
            yield break;
        }

        // SelectSingleNode returns null when nothing matches; there is no node to derive
        // the target XPath from in that case.
        var root = doc2.DocumentNode.SelectSingleNode(rootPath);
        if (root == null)
        {
            yield break;
        }

        var rootedTarget = getCrawTarget(rootedItems, XPath.RemoveFinalNum(root.XPath));
        if (rootedTarget != null)
        {
            rootedTarget.RootXPath = rootPath;
            yield return rootedTarget;
        }

        yield break;
    }

    // Force the "only one property" mode even though several items exist.
    var isForceItemOne = false;

    if (existItems.Count > 1)
    {
        string shortv = XPath.GetMaxCompareXPath(existItems.Select(d => d.XPath));
        var sharedNodes = doc2.DocumentNode.SelectNodes(shortv);
        if (sharedNodes == null || sharedNodes.Count == 0)
        {
            yield break;
        }

        if (sharedNodes.Count > 1)
        {
            // The trailing 1 is forwarded to GetDiffNodes; its meaning is not visible here.
            var sharedItems = GetDiffNodes(doc2, shortv, isAttrEnabled, existItems, 1);
            var sharedTarget = getCrawTarget(sharedItems, shortv);
            if (sharedTarget != null)
            {
                yield return sharedTarget;
            }

            // No ranked candidates are produced in this case, so scoring the document
            // would be wasted work.
            yield break;
        }

        // The shared XPath matches a single node: fall back to the first item only.
        isForceItemOne = true;
    }

    // Filled by GetTableRootProbability; entries are visited by descending Value and the
    // Key is used as the candidate root XPath.
    var dict = new Dictionary<string, double>();

    if (isForceItemOne || existItems.Count == 1)
    {
        var realPath = existItems.First().XPath;
        var segments = XPath.Split(realPath);
        var prefixes = segments.SelectAll(d => true)
            .Select(d => XPath.SubXPath(realPath, d))
            .ToList();

        foreach (var prefix in prefixes)
        {
            var prefixNode = doc2.DocumentNode.SelectSingleNode(prefix);
            if (prefixNode == null)
            {
                // This prefix does not exist in the document; nothing to score.
                continue;
            }

            // Nodes whose Name contains '#' are ignored (presumably the parser's
            // #text / #comment pseudo-nodes).
            var children = prefixNode.ChildNodes.Where(d2 => !d2.Name.Contains("#")).ToList();
            GetTableRootProbability(children, dict, false);
        }
    }
    else
    {
        var topLevel = doc2.DocumentNode.ChildNodes.Where(d => !d.Name.Contains("#")).ToList();
        GetTableRootProbability(topLevel, dict, true);
    }

    foreach (var keyValuePair in dict.OrderByDescending(d => d.Value))
    {
        // The trailing 4 is forwarded to GetDiffNodes; its meaning is not visible here.
        var items = GetDiffNodes(doc2, keyValuePair.Key, isAttrEnabled, existItems, 4);
        if (items.Count == 0)
        {
            continue;
        }

        // getCrawTarget may return null (the multi-item branch above checks for it too).
        var target = getCrawTarget(items, keyValuePair.Key);
        if (target == null)
        {
            continue;
        }

        // One query serves both the first match (for its parent) and the match count.
        var matches = doc2.DocumentNode.SelectNodes(keyValuePair.Key);
        if (matches == null || matches.Count == 0)
        {
            continue;
        }

        var rootNode = matches[0].ParentNode;
        if (rootNode == null)
        {
            continue;
        }

        target.Html = rootNode.InnerHtml;
        target.Text = rootNode.InnerText;
        target.NodeCount = matches.Count;
        target.Score = keyValuePair.Value;
        target.ColumnCount = items.Count;
        yield return target;
    }
}
/// <summary>
/// Lazily searches <paramref name="doc2"/> for crawl targets, either under an explicit
/// <paramref name="rootPath"/> or under candidate roots derived from
/// <paramref name="existItems"/>.
/// </summary>
/// <param name="doc2">Document whose <c>DocumentNode</c> is searched.</param>
/// <param name="existItems">Items selected so far; <c>null</c> is treated as an empty collection.</param>
/// <param name="rootPath">
/// When non-empty, only this root is examined, <paramref name="existItems"/> is ignored, and
/// each found item's <c>XPath</c> is rewritten with <c>XPath.TakeOff</c> against the root node's XPath.
/// </param>
/// <param name="isAttrEnabled">Passed through unchanged to <c>GetDiffNodes</c>.</param>
/// <returns>
/// A deferred sequence containing every non-null <c>getCrawTarget</c> result.
/// </returns>
public static IEnumerable<CrawTarget> SearchPropertiesSmart(this HtmlDocument doc2, ICollection<CrawlItem> existItems = null, string rootPath = null, bool isAttrEnabled = false)
{
    if (existItems == null)
    {
        existItems = new List<CrawlItem>();
    }

    if (!string.IsNullOrEmpty(rootPath))
    {
        // Explicit root supplied by the caller.
        var rootedItems = GetDiffNodes(doc2, rootPath, isAttrEnabled, new List<CrawlItem>());
        if (rootedItems.Count == 0)
        {
            yield break;
        }

        // SelectSingleNode returns null when nothing matches; without the node the item
        // paths cannot be rewritten, so no target is produced.
        var root = doc2.DocumentNode.SelectSingleNode(rootPath);
        if (root == null)
        {
            yield break;
        }

        foreach (var crawlItem in rootedItems)
        {
            crawlItem.XPath = XPath.TakeOff(crawlItem.XPath, root.XPath);
        }

        var rootedTarget = getCrawTarget(rootedItems, rootPath);
        if (rootedTarget != null)
        {
            yield return rootedTarget;
        }

        yield break;
    }

    if (existItems.Count > 1)
    {
        // Several items are known: the XPath returned by GetMaxCompareXPath for all of them
        // is the only root tried. The trailing 1 is forwarded to GetDiffNodes; its meaning
        // is not visible here.
        string shortv = XPath.GetMaxCompareXPath(existItems.Select(d => d.XPath));
        var sharedItems = GetDiffNodes(doc2, shortv, isAttrEnabled, existItems, 1);
        var sharedTarget = getCrawTarget(sharedItems, shortv);
        if (sharedTarget != null)
        {
            yield return sharedTarget;
        }

        yield break;
    }

    // Filled by GetTableRootProbability; entries are visited by descending Value and the
    // Key is used as the candidate root XPath.
    var dict = new Dictionary<string, double>();

    if (existItems.Count == 1)
    {
        var realPath = existItems.First().XPath;
        var segments = XPath.Split(realPath);
        var prefixes = segments.SelectAll(d => true)
            .Select(d => XPath.SubXPath(realPath, d))
            .ToList();

        foreach (var prefix in prefixes)
        {
            var prefixNode = doc2.DocumentNode.SelectSingleNode(prefix);
            if (prefixNode == null)
            {
                // This prefix does not exist in the document; nothing to score.
                continue;
            }

            // Nodes whose Name contains '#' are ignored (presumably the parser's
            // #text / #comment pseudo-nodes).
            var children = prefixNode.ChildNodes.Where(d2 => !d2.Name.Contains("#")).ToList();
            GetTableRootProbability(children, dict, false);
        }
    }
    else
    {
        var topLevel = doc2.DocumentNode.ChildNodes.Where(d => !d.Name.Contains("#")).ToList();
        GetTableRootProbability(topLevel, dict, true);
    }

    // At most one existing item remains at this point (the multi-item case returned above).
    foreach (var keyValuePair in dict.OrderByDescending(d => d.Value))
    {
        // The trailing 4 is forwarded to GetDiffNodes; its meaning is not visible here.
        var items = GetDiffNodes(doc2, keyValuePair.Key, isAttrEnabled, existItems, 4);
        var target = getCrawTarget(items, keyValuePair.Key);
        if (target != null)
        {
            yield return target;
        }
    }
}