Пример #1
0
        /// <summary>
        /// Lazily yields candidate lists of crawl items found in the document. The search strategy
        /// depends on how many items are supplied in <paramref name="existItems"/>.
        /// </summary>
        /// <param name="doc2">Document to search.</param>
        /// <param name="existItems">Already known items; <c>null</c> is treated as an empty list.</param>
        /// <param name="isAttrEnabled">Passed through unchanged to <c>GetDiffNodes</c>.</param>
        /// <returns>
        /// More than one item: a single result computed for the XPath returned by
        /// <c>XPath.GetMaxCompareXPath</c>. Exactly one item: one result per candidate root collected
        /// from the prefixes of that item's XPath, ordered by descending dictionary value. No items:
        /// one result per candidate root collected from the document's top-level nodes, keeping only
        /// results that contain more than one item.
        /// </returns>
        public static IEnumerable<List<CrawlItem>> SearchPropertiesSmart(this HtmlDocument doc2,
            ICollection<CrawlItem> existItems = null, bool isAttrEnabled = false)
        {
            if (existItems == null)
            {
                existItems = new List<CrawlItem>();
            }

            if (existItems.Count > 1)
            {
                var shortv =
                    XPath.GetMaxCompareXPath(existItems.Select(d => new XPath(d.XPath)).ToList()).ToString();
                yield return GetDiffNodes(doc2, shortv, isAttrEnabled, existItems);
            }
            else if (existItems.Count == 1)
            {
                var realPath = new XPath(existItems.First().XPath);
                var array =
                    realPath.SelectAll(d => true)
                    .Select(d => new XPath(realPath.Take(d)).ToString()).ToList();
                var dict = new Dictionary<string, double>();
                foreach (var item in array)
                {
                    var prefixNode = doc2.DocumentNode.SelectSingleNode(item);
                    if (prefixNode == null)
                    {
                        // SelectSingleNode returns null when the prefix matches nothing in this
                        // document; skip that prefix instead of failing the whole search.
                        continue;
                    }

                    // Node names containing '#' are filtered out before scoring.
                    GetTableRootProbability(
                        prefixNode.ChildNodes.Where(d2 => d2.Name.Contains("#") == false).ToList(),
                        dict, false);
                }

                foreach (var item in dict.OrderByDescending(d => d.Value))
                {
                    yield return GetDiffNodes(doc2, item.Key, isAttrEnabled, existItems);
                }
            }
            else
            {
                var dict = new Dictionary<string, double>();
                GetTableRootProbability(
                    doc2.DocumentNode.ChildNodes.Where(d => d.Name.Contains("#") == false).ToList(), dict, true);
                foreach (var keyValuePair in dict.OrderByDescending(d => d.Value))
                {
                    var items = GetDiffNodes(doc2, keyValuePair.Key, isAttrEnabled, existItems);
                    if (items.Count > 1)
                    {
                        yield return items;
                    }
                }
            }
        }
Пример #2
0
        /// <summary>
        /// Computes the XPath returned by <c>XPath.GetMaxCompareXPath</c> for the given items and,
        /// when it is non-empty, rewrites every item's XPath with that value taken off.
        /// </summary>
        /// <param name="doc2">Document the extension method is invoked on; not read by this method.</param>
        /// <param name="crawlItem">Items whose XPaths are analysed and rewritten in place.</param>
        /// <returns>The computed XPath, or an empty string when it is null or empty.</returns>
        public static string CompileCrawItems(this HtmlDocument doc2, IList<CrawlItem> crawlItem)
        {
            var itemPaths = crawlItem.Select(d => new XPath(d.XPath)).ToList();
            var commonPath = XPath.GetMaxCompareXPath(itemPaths).ToString();

            if (string.IsNullOrEmpty(commonPath))
            {
                return "";
            }

            crawlItem.Execute(d => d.XPath = new XPath(d.XPath).TakeOff(commonPath).ToString());
            return commonPath;
        }
Пример #3
0
 /// <summary>
 /// Runs the base initialisation, then builds the per-column XPath lookup from the crawler's items.
 /// </summary>
 /// <param name="docus">Documents forwarded to the base <c>Init</c>.</param>
 /// <returns><c>false</c> when <c>Crawler</c> is null; otherwise <c>true</c>.</returns>
 public override bool Init(IEnumerable<IFreeDocument> docus)
 {
     // The result of the base call is not inspected.
     base.Init(docus);
     if (Crawler == null)
     {
         return false;
     }

     IsMultiYield = true;

     // One entry per distinct item name; the value is whatever GetMaxCompareXPath returns
     // for the XPaths of all items sharing that name.
     xpaths = Crawler.CrawlItems
         .GroupBy(d => d.Name)
         .ToDictionary(
             g => g.Key,
             g => XPath.GetMaxCompareXPath(g.Select(d2 => d2.XPath).ToList()));
     return true;
 }
Пример #4
0
        /// <summary>
        /// Extracts documents from the page according to the supplied crawl items.
        /// </summary>
        /// <param name="doc2">Document to read from.</param>
        /// <param name="crawlItems">Items describing which XPaths to read and under which names.</param>
        /// <param name="type">
        /// <c>ListType.List</c> produces one document per node matched by the root XPath;
        /// <c>ListType.One</c> produces a single document; any other value produces an empty list.
        /// </param>
        /// <param name="rootXPath">
        /// Root XPath for list mode. When null or empty, the root is computed with
        /// <c>XPath.GetMaxCompareXPath</c> and taken off each item's XPath before it is appended
        /// to the matched node's XPath.
        /// </param>
        /// <returns>The extracted documents; empty when there are no items or no node matches the root.</returns>
        public static List<FreeDocument> GetDataFromXPath(this HtmlDocument doc2, IList<CrawlItem> crawlItems,
            ListType type = ListType.List, string rootXPath = "")
        {
            if (crawlItems.Count == 0)
            {
                return new List<FreeDocument>();
            }

            if (type == ListType.One)
            {
                var single = new FreeDocument();
                foreach (var crawlItem in crawlItems)
                {
                    doc2.GetDataFromXPath(crawlItem, single);
                }
                return new List<FreeDocument> { single };
            }

            if (type != ListType.List)
            {
                return new List<FreeDocument>();
            }

            // With an explicit root the item XPaths are appended as-is; with a computed root
            // that root is stripped from each item XPath first.
            var hasExplicitRoot = !string.IsNullOrEmpty(rootXPath);
            var listRoot = hasExplicitRoot
                ? rootXPath
                : XPath.GetMaxCompareXPath(crawlItems.Select(d => new XPath(d.XPath)).ToList()).ToString();
            var prefixToStrip = hasExplicitRoot ? "" : listRoot;
            var appendWholeItemPath = string.IsNullOrEmpty(prefixToStrip);

            var rows = new List<FreeDocument>();
            var matches = doc2.DocumentNode.SelectNodes(listRoot);
            if (matches == null)
            {
                return rows;
            }

            foreach (var match in matches)
            {
                var row = new FreeDocument();
                foreach (var crawlItem in crawlItems)
                {
                    var path = appendWholeItemPath
                        ? match.XPath + crawlItem.XPath
                        : match.XPath + new XPath(crawlItem.XPath).TakeOff(prefixToStrip);

                    row.SetValue(crawlItem.Name, match.GetDataFromXPath(path, crawlItem.IsHTML));
                }
                rows.Add(row);
            }
            return rows;
        }
Пример #5
0
        /// <summary>
        /// Lazily yields crawl targets found in the document, either below an explicit root path or
        /// by scoring candidate roots derived from the already known items.
        /// </summary>
        /// <param name="doc2">Document to search.</param>
        /// <param name="existItems">Already known items; <c>null</c> is treated as an empty list.</param>
        /// <param name="rootPath">
        /// When non-empty, only this XPath is examined and the known items are ignored.
        /// </param>
        /// <param name="isAttrEnabled">Passed through unchanged to <c>GetDiffNodes</c>.</param>
        /// <returns>
        /// Zero or more targets. Candidates for which no target can be built, or whose XPath no
        /// longer matches a node in the document, are skipped.
        /// </returns>
        public static IEnumerable<CrawTarget> SearchPropertiesSmart(this HtmlDocument doc2,
            ICollection<CrawlItem> existItems = null, string rootPath = null, bool isAttrEnabled = false)
        {
            if (existItems == null)
            {
                existItems = new List<CrawlItem>();
            }
            var shortv = "";
            var dict = new Dictionary<string, double>();

            if (string.IsNullOrEmpty(rootPath))
            {
                var isForceItemOne = false; // force the single-item mode
                if (existItems.Count > 1)
                {
                    shortv =
                        XPath.GetMaxCompareXPath(existItems.Select(d => d.XPath));
                    var nodes = doc2.DocumentNode.SelectNodes(shortv);
                    if (nodes == null || nodes.Count == 0)
                    {
                        yield break;
                    }

                    if (nodes.Count == 1)
                    {
                        // The shared XPath matches a single node, so fall back to the
                        // single-item search below.
                        isForceItemOne = true;
                    }
                    else
                    {
                        var items = GetDiffNodes(doc2, shortv, isAttrEnabled, existItems, 1);
                        var target = getCrawTarget(items, shortv);
                        if (target != null)
                        {
                            yield return target;
                        }
                    }
                }

                if (isForceItemOne || existItems.Count == 1)
                {
                    var realPath = existItems.First().XPath;
                    var items = XPath.Split(realPath);
                    var array =
                        items.SelectAll(d => true)
                        .Select(d => XPath.SubXPath(realPath, d)).ToList();
                    foreach (var item in array)
                    {
                        var prefixNode = doc2.DocumentNode.SelectSingleNode(item);
                        if (prefixNode == null)
                        {
                            // SelectSingleNode returns null when nothing matches; skip the prefix.
                            continue;
                        }

                        // Node names containing '#' are filtered out before scoring.
                        GetTableRootProbability(
                            prefixNode.ChildNodes.Where(d2 => d2.Name.Contains("#") == false).ToList(),
                            dict, false);
                    }
                }
                else
                {
                    GetTableRootProbability(
                        doc2.DocumentNode.ChildNodes.Where(d => d.Name.Contains("#") == false).ToList(), dict, true);
                }

                if (isForceItemOne || existItems.Count < 2)
                {
                    IEnumerable<KeyValuePair<string, double>> p = dict.OrderByDescending(d => d.Value);
                    foreach (var keyValuePair in p)
                    {
                        var items = GetDiffNodes(doc2, keyValuePair.Key, isAttrEnabled, existItems, 4);
                        if (items.Count == 0)
                        {
                            continue;
                        }

                        var target = getCrawTarget(items, keyValuePair.Key);
                        if (target == null)
                        {
                            // getCrawTarget is null-checked in the multi-item branch above, so it
                            // can evidently return null; skip instead of dereferencing it.
                            continue;
                        }

                        var keyNode = doc2.DocumentNode.SelectSingleNode(keyValuePair.Key);
                        if (keyNode == null)
                        {
                            continue;
                        }

                        var rootNode = keyNode.ParentNode;
                        if (rootNode == null)
                        {
                            continue;
                        }

                        target.Html = rootNode.InnerHtml;
                        target.Text = rootNode.InnerText;
                        target.NodeCount = doc2.DocumentNode.SelectNodes(keyValuePair.Key).Count;
                        target.Score = keyValuePair.Value;
                        target.ColumnCount = items.Count;
                        yield return target;
                    }
                }
            }
            else
            {
                var items = GetDiffNodes(doc2, rootPath, isAttrEnabled, new List<CrawlItem>());
                if (items.Count > 0)
                {
                    var root = doc2.DocumentNode.SelectSingleNode(rootPath);
                    if (root == null)
                    {
                        yield break;
                    }

                    var xpath = XPath.RemoveFinalNum(root.XPath);
                    var target = getCrawTarget(items, xpath);
                    if (target != null)
                    {
                        target.RootXPath = rootPath;
                        yield return target;
                    }
                }
            }
        }
Пример #6
0
        /// <summary>
        /// Lazily yields crawl targets found in the document, either below an explicit root path or
        /// by scoring candidate roots derived from the already known items.
        /// </summary>
        /// <param name="doc2">Document to search.</param>
        /// <param name="existItems">Already known items; <c>null</c> is treated as an empty list.</param>
        /// <param name="rootPath">
        /// When non-empty, only this XPath is examined; the XPath of the node it selects is taken
        /// off each discovered item's XPath.
        /// </param>
        /// <param name="isAttrEnabled">Passed through unchanged to <c>GetDiffNodes</c>.</param>
        /// <returns>
        /// Zero or more targets. With more than one known item at most one target is produced, for
        /// the XPath returned by <c>XPath.GetMaxCompareXPath</c>.
        /// </returns>
        public static IEnumerable<CrawTarget> SearchPropertiesSmart(this HtmlDocument doc2,
            ICollection<CrawlItem> existItems = null, string rootPath = null, bool isAttrEnabled = false)
        {
            if (existItems == null)
            {
                existItems = new List<CrawlItem>();
            }
            var shortv = "";
            var dict = new Dictionary<string, double>();

            if (string.IsNullOrEmpty(rootPath))
            {
                if (existItems.Count > 1)
                {
                    shortv =
                        XPath.GetMaxCompareXPath(existItems.Select(d => d.XPath));

                    var items = GetDiffNodes(doc2, shortv, isAttrEnabled, existItems, 1);
                    var target = getCrawTarget(items, shortv);
                    if (target != null)
                    {
                        yield return target;
                    }
                    yield break;
                }

                if (existItems.Count == 1)
                {
                    var realPath = existItems.First().XPath;
                    var items = XPath.Split(realPath);
                    var array =
                        items.SelectAll(d => true)
                        .Select(d => XPath.SubXPath(realPath, d)).ToList();
                    foreach (var item in array)
                    {
                        var prefixNode = doc2.DocumentNode.SelectSingleNode(item);
                        if (prefixNode == null)
                        {
                            // SelectSingleNode returns null when nothing matches; skip the prefix
                            // instead of failing the whole search.
                            continue;
                        }

                        // Node names containing '#' are filtered out before scoring.
                        GetTableRootProbability(
                            prefixNode.ChildNodes.Where(d2 => d2.Name.Contains("#") == false).ToList(),
                            dict, false);
                    }
                }
                else
                {
                    GetTableRootProbability(
                        doc2.DocumentNode.ChildNodes.Where(d => d.Name.Contains("#") == false).ToList(), dict, true);
                }

                // The multi-item case has already returned above, so at most one item remains here.
                if (existItems.Count < 2)
                {
                    IEnumerable<KeyValuePair<string, double>> p = dict.OrderByDescending(d => d.Value);
                    foreach (var keyValuePair in p)
                    {
                        var items = GetDiffNodes(doc2, keyValuePair.Key, isAttrEnabled, existItems, 4);
                        var target = getCrawTarget(items, keyValuePair.Key);
                        if (target != null)
                        {
                            yield return target;
                        }
                    }
                }
            }
            else
            {
                var items = GetDiffNodes(doc2, rootPath, isAttrEnabled, new List<CrawlItem>());
                if (items.Count > 0)
                {
                    var root = doc2.DocumentNode.SelectSingleNode(rootPath);
                    if (root == null)
                    {
                        // No node matches the root path, so there is nothing to take off.
                        yield break;
                    }

                    foreach (var crawlItem in items)
                    {
                        crawlItem.XPath = XPath.TakeOff(crawlItem.XPath, root.XPath);
                    }

                    var target = getCrawTarget(items, rootPath);
                    if (target != null)
                    {
                        yield return target;
                    }
                }
            }
        }