Пример #1
0
        /// <summary>
        ///     Tells whether two XPath strings are equal after <paramref name="shortv"/> has been
        ///     taken off each of them with <c>XPath.TakeOff</c>.
        /// </summary>
        /// <param name="xpath1">First XPath string.</param>
        /// <param name="xpath2">Second XPath string.</param>
        /// <param name="shortv">Path handed to <c>XPath.TakeOff</c> for both inputs.</param>
        /// <returns><c>true</c> when both remainders compare equal.</returns>
        private static bool IsSameXPath(string xpath1, string xpath2, string shortv)
        {
            return XPath.TakeOff(xpath1, shortv) == XPath.TakeOff(xpath2, shortv);
        }
Пример #2
0
        /// <summary>
        ///     Finds the longest common leading part shared by all the given paths.
        /// </summary>
        /// <param name="items">Paths to compare; must contain at least one element.</param>
        /// <returns>
        ///     The first path cut with <c>SubXPath(n + 1)</c>, where <c>n</c> is the index of the first
        ///     segment that differs between paths (or the shortest path length when none differs),
        ///     after <c>RemoveFinalNum()</c> has been applied to it.
        /// </returns>
        public static XPath GetMaxCompareXPath(IList <XPath> items)
        {
            int shortest = items.Min(p => p.Count);
            int depth    = 0;
            bool diverged = false;

            // Walk segment by segment until one path disagrees with the first one.
            while (depth < shortest && !diverged)
            {
                string expected = items[0][depth];
                for (int k = 1; k < items.Count; k++)
                {
                    if (expected != items[k][depth])
                    {
                        diverged = true;
                        break;
                    }
                }

                if (!diverged)
                {
                    depth++;
                }
            }

            XPath common = items.First().SubXPath(depth + 1);
            common.RemoveFinalNum();
            return common;
        }
Пример #3
0
        /// <summary>
        ///     Tells whether two XPath strings have the same textual form after
        ///     <paramref name="shortv"/> has been taken off each of them.
        /// </summary>
        /// <param name="xpath1">First XPath string.</param>
        /// <param name="xpath2">Second XPath string.</param>
        /// <param name="shortv">Path handed to <c>XPath.TakeOff</c> for both inputs.</param>
        /// <returns><c>true</c> when the string forms of both remainders are equal.</returns>
        private static bool IsSameXPath(string xpath1, string xpath2, string shortv)
        {
            string left  = new XPath(xpath1).TakeOff(shortv).ToString();
            string right = new XPath(xpath2).TakeOff(shortv).ToString();

            return string.Equals(left, right);
        }
Пример #4
0
        /// <summary>
        ///     Runs <c>GetTextRootProbability</c> on <paramref name="node"/> and returns the path it
        ///     recorded, cut with <c>XPath.SubXPath(path, -2)</c>.
        /// </summary>
        /// <param name="node">Node to analyse.</param>
        /// <returns>The shortened path string.</returns>
        public static string GetTextNode(this HtmlNode node)
        {
            var probe = new ParaClass();
            node.GetTextRootProbability(probe);

            // NOTE(review): -2 presumably drops the last two segments; confirm against XPath.SubXPath.
            string textPath = XPath.SubXPath(probe.Path, -2);
            return textPath;
        }
Пример #5
0
        /// <summary>
        ///     Runs <c>GetTextRootProbability</c> on <paramref name="node"/> and returns the path it
        ///     recorded with the final two segments left out.
        /// </summary>
        /// <param name="node">Node to analyse.</param>
        /// <returns>String form of the shortened path.</returns>
        public static string GetTextNode(this HtmlNode node)
        {
            var probe = new ParaClass();
            node.GetTextRootProbability(probe);

            var fullPath = new XPath(probe.Path);
            var keep     = fullPath.Count - 2;
            var trimmed  = fullPath.SubXPath(0, keep);

            return trimmed.ToString();
        }
Пример #6
0
        /// <summary>
        ///     Returns the part of this path that remains after skipping as many leading segments as
        ///     <paramref name="fullPath"/> contains.
        /// </summary>
        /// <param name="fullPath">
        ///     Path whose segment count determines how much is skipped. Only its length is used here;
        ///     its content is not compared against this path.
        /// </param>
        /// <returns>This instance when <paramref name="fullPath"/> is null or empty; otherwise the sub-path.</returns>
        public XPath TakeOff(string fullPath)
        {
            if (string.IsNullOrEmpty(fullPath))
            {
                return this;
            }

            var prefix = new XPath(fullPath);
            var skip   = prefix.Count;

            return SubXPath(skip, Count - skip);
        }
Пример #7
0
        /// <summary>
        ///     Lazily yields candidate crawl-item lists found in <paramref name="doc2"/>. The strategy
        ///     depends on how many items are already known: several, exactly one, or none.
        /// </summary>
        /// <param name="doc2">Document to search.</param>
        /// <param name="existItems">Items already known; <c>null</c> is treated as an empty list.</param>
        /// <param name="isAttrEnabled">Forwarded unchanged to <c>GetDiffNodes</c>.</param>
        /// <returns>A deferred sequence of item lists produced by <c>GetDiffNodes</c>.</returns>
        public static IEnumerable <List <CrawlItem> > SearchPropertiesSmart(this HtmlDocument doc2,
                                                                            ICollection <CrawlItem> existItems = null, bool isAttrEnabled = false)
        {
            existItems = existItems ?? new List <CrawlItem>();

            if (existItems.Count > 1)
            {
                // Several known items: their shared leading path is the only candidate root.
                var knownPaths = existItems.Select(d => new XPath(d.XPath)).ToList();
                var commonRoot = XPath.GetMaxCompareXPath(knownPaths).ToString();
                yield return GetDiffNodes(doc2, commonRoot, isAttrEnabled, existItems);
                yield break;
            }

            var scores = new Dictionary <string, double>();

            if (existItems.Count == 1)
            {
                // One known item: score the children found under each path derived from that item.
                var realPath = new XPath(existItems.First().XPath);
                var prefixes = realPath.SelectAll(d => true)
                               .Select(d => new XPath(realPath.Take(d)).ToString())
                               .ToList();

                foreach (var prefix in prefixes)
                {
                    var children = doc2.DocumentNode.SelectSingleNode(prefix)
                                   .ChildNodes.Where(d2 => d2.Name.Contains("#") == false)
                                   .ToList();
                    GetTableRootProbability(children, scores, false);
                }

                foreach (var candidate in scores.OrderByDescending(d => d.Value))
                {
                    yield return GetDiffNodes(doc2, candidate.Key, isAttrEnabled, existItems);
                }
                yield break;
            }

            // Nothing known: score the whole document recursively, best candidates first.
            var topLevel = doc2.DocumentNode.ChildNodes.Where(d => d.Name.Contains("#") == false).ToList();
            GetTableRootProbability(topLevel, scores, true);

            foreach (var candidate in scores.OrderByDescending(d => d.Value))
            {
                var found = GetDiffNodes(doc2, candidate.Key, isAttrEnabled, existItems);
                if (found.Count > 1)
                {
                    yield return found;
                }
            }
        }
Пример #8
0
        /// <summary>
        ///     Reads the value addressed by one crawl item and stores it in <paramref name="document"/>
        ///     under the item's name.
        /// </summary>
        /// <param name="doc">Document to read from.</param>
        /// <param name="crawItem">Item supplying the XPath, the HTML flag and the target name.</param>
        /// <param name="root">Path taken off the item's XPath (via <c>XPath.TakeOff</c>) before the lookup.</param>
        /// <param name="document">Receives the value; left untouched when the lookup yields <c>null</c>.</param>
        public static void GetDataFromXPath(this HtmlDocument doc, CrawlItem crawItem, string root,
                                            IFreeDocument document)
        {
            var rootNode     = doc.DocumentNode;
            var relativePath = XPath.TakeOff(crawItem.XPath, root);
            var value        = rootNode.GetDataFromXPath(relativePath, crawItem.IsHTML);

            if (value == null)
            {
                return;
            }

            document.SetValue(crawItem.Name, value);
        }
Пример #9
0
        /// <summary>
        ///     Computes the common leading path of all items and, when it is non-empty, rewrites every
        ///     item's XPath to what remains after taking that common path off.
        /// </summary>
        /// <param name="doc2">Not used by this method; present so it can be called as an extension.</param>
        /// <param name="crawlItem">Items to inspect; their <c>XPath</c> is modified in place.</param>
        /// <returns>The common path, or an empty string when there is none.</returns>
        public static string CompileCrawItems(this HtmlDocument doc2, IList <CrawlItem> crawlItem)
        {
            var parsedPaths = crawlItem.Select(d => new XPath(d.XPath)).ToList();
            var commonRoot  = XPath.GetMaxCompareXPath(parsedPaths).ToString();

            if (string.IsNullOrEmpty(commonRoot))
            {
                return "";
            }

            crawlItem.Execute(d => d.XPath = new XPath(d.XPath).TakeOff(commonRoot).ToString());
            return commonRoot;
        }
Пример #10
0
 /// <summary>
 ///     Wraps a list of crawl items in a <c>CrawTarget</c>.
 /// </summary>
 /// <param name="items">Items found by a search.</param>
 /// <param name="root">Root path; only used when there is exactly one item.</param>
 /// <returns>
 ///     A target built from the items alone when there are several; a target built from the items
 ///     and <paramref name="root"/> when there is exactly one item and a non-empty root (that item's
 ///     XPath is first replaced with the result of <c>XPath.TakeOff</c>); otherwise <c>null</c>.
 /// </returns>
 private static CrawTarget getCrawTarget(List <CrawlItem> items, string root = null)
 {
     if (items.Count > 1)
     {
         return new CrawTarget(items);
     }

     bool singleWithRoot = items.Count == 1 && !string.IsNullOrEmpty(root);
     if (!singleWithRoot)
     {
         return null;
     }

     var only = items[0];
     only.XPath = XPath.TakeOff(only.XPath, root);
     return new CrawTarget(items, root);
 }
Пример #11
0
        /// <summary>
        ///     Scores how likely the given sibling nodes are the rows of a list and stores the score in
        ///     <paramref name="dict"/>. The key is the first node's XPath after
        ///     <c>XPath.RemoveFinalNum</c> has been applied to it.
        /// </summary>
        /// <param name="nodes">Sibling nodes to evaluate; an empty list is ignored.</param>
        /// <param name="dict">Receives the score of this group (and of descendant groups when recursing).</param>
        /// <param name="haschild">When true, the children of every node are scored recursively first.</param>
        private static void GetTableRootProbability(IList <HtmlNode> nodes, Dictionary <string, double> dict,
                                                    bool haschild)
        {
            if (nodes.Count == 0)
            {
                return;
            }
            var xpath = XPath.RemoveFinalNum(nodes[0].XPath);

            if (haschild)
            {
                foreach (var htmlNode in nodes)
                {
                    GetTableRootProbability(htmlNode.ChildNodes.Where(d => d.Name.Contains("#") == false).ToList(), dict,
                                            haschild);
                }
            }

            // Fewer than three siblings is not treated as a list.
            if (nodes.Count < 3)
            {
                return;
            }
            // Disabled heuristic (require 70% of siblings to share a tag name):
            //if (nodes.Count(d => d.Name == nodes[1].Name) < nodes.Count*0.7)
            //    return;

            var childCount = (double)nodes.Count;

            var childCounts = nodes.Select(d => (double)d.ChildNodes.Count).ToArray();
            var v           = childCounts.Variance();

            //TODO: a better approach is needed here, because the valid nodes are often interleaved
            if (v > 100)
            {
                return;
            }

            var leafCount = nodes.Last().GetLeafNodeCount();
            // Zero variance doubles the base score; otherwise it is scaled by log10((100 - v) / 100).
            // NOTE(review): for 0 < v <= 100 that factor is <= 0 (and -Infinity at v == 100, assuming
            // Variance() returns a double) - confirm this ranking is intended.
            var value = (childCount * PM25 + leafCount) * (v == 0 ? 2 : (Math.Log10((100 - v) / 100)));

            dict.SetValue(xpath, value);
        }
Пример #12
0
 /// <summary>
 ///     Runs the base initialisation, then builds <c>xpaths</c>: one entry per distinct crawl-item
 ///     name, holding the common leading path of all items that share that name.
 /// </summary>
 /// <param name="docus">Documents forwarded to the base implementation.</param>
 /// <returns><c>false</c> when no <c>Crawler</c> is set; otherwise <c>true</c>.</returns>
 public override bool Init(IEnumerable <IFreeDocument> docus)
 {
     base.Init(docus);

     if (Crawler == null)
     {
         return false;
     }

     IsMultiYield = true;
     xpaths       = Crawler.CrawlItems
                    .GroupBy(item => item.Name)
                    .ToDictionary(
                        byName => byName.Key,
                        byName => XPath.GetMaxCompareXPath(byName.Select(item => item.XPath).ToList()));
     return true;
 }
Пример #13
0
        /// <summary>
        ///     Scores how likely the given sibling nodes are the rows of a list and stores the score in
        ///     <paramref name="dict"/>. The key is the string form of the first node's XPath after
        ///     <c>RemoveFinalNum()</c>.
        /// </summary>
        /// <param name="nodes">Sibling nodes to evaluate; an empty list is ignored.</param>
        /// <param name="dict">Receives the score of this group (and of descendant groups when recursing).</param>
        /// <param name="haschild">When true, the children of every node are scored recursively first.</param>
        private static void GetTableRootProbability(IList <HtmlNode> nodes, Dictionary <string, double> dict,
                                                    bool haschild)
        {
            if (nodes.Count == 0)
            {
                return;
            }

            var groupKey = new XPath(nodes[0].XPath).RemoveFinalNum().ToString();

            if (haschild)
            {
                foreach (var parent in nodes)
                {
                    var elementChildren = parent.ChildNodes.Where(d => d.Name.Contains("#") == false).ToList();
                    GetTableRootProbability(elementChildren, dict, haschild);
                }
            }

            var siblings     = nodes.ToList();
            var siblingCount = siblings.Count;

            // Fewer than three siblings is not treated as a list.
            if (siblingCount < 3)
            {
                return;
            }

            // At least 70% of the siblings must share the tag name of the second one.
            var referenceName = siblings[1].Name;
            var sameNameCount = siblings.Count(d => d.Name == referenceName);
            if (sameNameCount < siblingCount * 0.7)
            {
                return;
            }

            var perNodeChildren = siblings.Select(d => (double)d.ChildNodes.Count).ToArray();
            var spread          = perNodeChildren.Variance();

            //TODO: a better approach is needed here, because the valid nodes are often interleaved
            if (spread > 5)
            {
                return;
            }

            var leaves = siblings.First().GetLeafNodeCount();
            var score  = (double)siblingCount * PM25 + leaves;

            dict.SetValue(groupKey, score);
        }
Пример #14
0
        /// <summary>
        ///     Reads data from the node addressed by an XPath.
        /// </summary>
        /// <param name="doc">Node the XPath is evaluated against.</param>
        /// <param name="path">XPath to evaluate; null or empty yields <c>null</c>.</param>
        /// <param name="ishtml">When true (and the path is not an attribute path), the inner HTML is returned.</param>
        /// <returns>
        ///     The trimmed attribute value when the last path segment contains both "@" and "[1]";
        ///     otherwise the node's inner HTML or its text. <c>null</c> when the path is empty, cannot
        ///     be evaluated, matches nothing, or names an attribute the node does not have.
        /// </returns>
        public static string GetDataFromXPath(this HtmlNode doc, string path, bool ishtml = false)
        {
            if (string.IsNullOrEmpty(path))
            {
                return(null);
            }

            HtmlNode p2;
            try
            {
                p2 = doc.SelectSingleNode(path);
            }
            catch (Exception)
            {
                // Deliberate best effort: a path that cannot be evaluated is treated as "no match".
                return(null);
            }

            if (p2 == null)
            {
                return(null);
            }

            var paths = path.Split('/');
            var last  = paths[paths.Length - 1];
            if (last.Contains("@") && last.Contains("[1]")) // attribute data
            {
                var name = XPath.GetAttributeName(last.Split('@', '[')[1]);
                if (!p2.HasAttributes)
                {
                    return(null);
                }

                // FirstOrDefault returns null when the attribute is absent; report that as "no data"
                // instead of failing with a NullReferenceException.
                var a = p2.Attributes.FirstOrDefault(d => d.Name == name);
                if (a == null)
                {
                    return(null);
                }
                var raw = a.Value;
                return(raw == null ? null : raw.Trim());
            }

            return(ishtml ? p2.InnerHtml : p2.GetNodeText());
        }
Пример #15
0
        /// <summary>
        ///     Lazily yields crawl targets found in <paramref name="doc2"/>. Without a
        ///     <paramref name="rootPath"/> the strategy depends on how many items are already known
        ///     (several, exactly one, or none); with one, the items are searched under that root only.
        /// </summary>
        /// <param name="doc2">Document to search.</param>
        /// <param name="existItems">Items already known; <c>null</c> is treated as an empty list.</param>
        /// <param name="rootPath">Optional explicit root XPath.</param>
        /// <param name="isAttrEnabled">Forwarded unchanged to <c>GetDiffNodes</c>.</param>
        /// <returns>A deferred sequence of non-null targets built by <c>getCrawTarget</c>.</returns>
        public static IEnumerable <CrawTarget> SearchPropertiesSmart(this HtmlDocument doc2,
                                                                     ICollection <CrawlItem> existItems = null, string rootPath = null, bool isAttrEnabled = false)
        {
            if (existItems == null)
            {
                existItems = new List <CrawlItem>();
            }
            var shortv = "";
            // Candidate root XPath -> score, filled by GetTableRootProbability.
            var dict   = new Dictionary <string, double>();

            if (string.IsNullOrEmpty(rootPath))
            {
                if (existItems.Count > 1)
                {
                    // Several known items: their common leading path is the only candidate root.
                    shortv =
                        XPath.GetMaxCompareXPath(existItems.Select(d => d.XPath));

                    var items  = GetDiffNodes(doc2, shortv, isAttrEnabled, existItems, 1);
                    var target = getCrawTarget(items, shortv);
                    if (target != null)
                    {
                        yield return(target);
                    }
                    yield break;
                }

                if (existItems.Count == 1)
                {
                    // One known item: score the children found under each path derived from it.
                    var realPath = existItems.First().XPath;
                    var items    = XPath.Split(realPath);
                    var array    =
                        items.SelectAll(d => true)
                        .Select(d => XPath.SubXPath(realPath, d)).ToList();
                    foreach (var item in array)
                    {
                        // NOTE(review): SelectSingleNode's result is dereferenced without a null check;
                        // confirm every derived path is guaranteed to match in doc2.
                        GetTableRootProbability(
                            doc2.DocumentNode.SelectSingleNode(item)
                            .ChildNodes.Where(d2 => d2.Name.Contains("#") == false)
                            .ToList(), dict, false);
                    }
                }
                else
                {
                    // Nothing known: score the whole document recursively.
                    GetTableRootProbability(
                        doc2.DocumentNode.ChildNodes.Where(d => d.Name.Contains("#") == false).ToList(), dict, true);
                }
                // Always true at this point: the Count > 1 case has already ended the iteration above.
                if (existItems.Count < 2)
                {
                    // Try the candidate roots from highest to lowest score.
                    IEnumerable <KeyValuePair <string, double> > p = dict.OrderByDescending(d => d.Value);
                    foreach (var keyValuePair in p)
                    {
                        var items  = GetDiffNodes(doc2, keyValuePair.Key, isAttrEnabled, existItems, 4);
                        var target = getCrawTarget(items, keyValuePair.Key);
                        if (target != null)
                        {
                            yield return(target);
                        }
                    }
                }
            }
            else
            {
                // Explicit root: ignore the known items and search under rootPath only.
                var items = GetDiffNodes(doc2, rootPath, isAttrEnabled, new List <CrawlItem>());
                if (items.Count > 0)
                {
                    // NOTE(review): root is dereferenced without a null check.
                    var root = doc2.DocumentNode.SelectSingleNode(rootPath);

                    // Rewrite each item's XPath to what remains after taking the root node's XPath off.
                    foreach (var crawlItem in items)
                    {
                        crawlItem.XPath = XPath.TakeOff(crawlItem.XPath, root.XPath);
                    }

                    var target = getCrawTarget(items, rootPath);
                    if (target != null)
                    {
                        yield return(target);
                    }
                }
            }
        }
Пример #16
0
        /// <summary>
        ///     Lazily yields crawl targets found in <paramref name="doc2"/>. Without a
        ///     <paramref name="rootPath"/> the strategy depends on how many items are already known
        ///     (several, exactly one, or none); with one, the items are searched under that root only.
        ///     Candidates whose XPath no longer matches the document, or for which no target can be
        ///     built, are skipped.
        /// </summary>
        /// <param name="doc2">Document to search.</param>
        /// <param name="existItems">Items already known; <c>null</c> is treated as an empty list.</param>
        /// <param name="rootPath">Optional explicit root XPath.</param>
        /// <param name="isAttrEnabled">Forwarded unchanged to <c>GetDiffNodes</c>.</param>
        /// <returns>A deferred sequence of non-null targets.</returns>
        public static IEnumerable <CrawTarget> SearchPropertiesSmart(this HtmlDocument doc2,
                                                                     ICollection <CrawlItem> existItems = null, string rootPath = null, bool isAttrEnabled = false)
        {
            if (existItems == null)
            {
                existItems = new List <CrawlItem>();
            }
            var shortv = "";
            // Candidate root XPath -> score, filled by GetTableRootProbability.
            var dict   = new Dictionary <string, double>();

            if (string.IsNullOrEmpty(rootPath))
            {
                var isForceItemOne = false; // force the "only one known item" mode
                if (existItems.Count > 1)
                {
                    shortv =
                        XPath.GetMaxCompareXPath(existItems.Select(d => d.XPath));
                    var nodes = doc2.DocumentNode.SelectNodes(shortv);
                    if (nodes == null || nodes.Count == 0)
                    {
                        yield break;
                    }

                    if (nodes.Count == 1)
                    {
                        // The common root matches a single node, so it cannot be a list root:
                        // fall back to searching from the first known item.
                        isForceItemOne = true;
                    }
                    else
                    {
                        var items  = GetDiffNodes(doc2, shortv, isAttrEnabled, existItems, 1);
                        var target = getCrawTarget(items, shortv);
                        if (target != null)
                        {
                            yield return(target);
                        }
                    }
                }

                if (isForceItemOne || existItems.Count == 1)
                {
                    var realPath = existItems.First().XPath;
                    var items    = XPath.Split(realPath);
                    var array    =
                        items.SelectAll(d => true)
                        .Select(d => XPath.SubXPath(realPath, d)).ToList();
                    foreach (var item in array)
                    {
                        // A derived path may not match this document; skip it instead of failing.
                        var anchor = doc2.DocumentNode.SelectSingleNode(item);
                        if (anchor == null)
                        {
                            continue;
                        }
                        GetTableRootProbability(
                            anchor
                            .ChildNodes.Where(d2 => d2.Name.Contains("#") == false)
                            .ToList(), dict, false);
                    }
                }
                else
                {
                    // NOTE(review): this also runs after the multi-item branch has yielded its target,
                    // in which case the scores are never read by the block below.
                    GetTableRootProbability(
                        doc2.DocumentNode.ChildNodes.Where(d => d.Name.Contains("#") == false).ToList(), dict, true);
                }
                if (isForceItemOne || existItems.Count < 2)
                {
                    // Try the candidate roots from highest to lowest score.
                    IEnumerable <KeyValuePair <string, double> > p = dict.OrderByDescending(d => d.Value);
                    foreach (var keyValuePair in p)
                    {
                        var items = GetDiffNodes(doc2, keyValuePair.Key, isAttrEnabled, existItems, 4);
                        if (items.Count == 0)
                        {
                            continue;
                        }
                        var target = getCrawTarget(items, keyValuePair.Key);
                        if (target == null)
                        {
                            continue;
                        }
                        var firstMatch = doc2.DocumentNode.SelectSingleNode(keyValuePair.Key);
                        if (firstMatch == null)
                        {
                            continue;
                        }
                        var rootNode = firstMatch.ParentNode;
                        if (rootNode == null)
                        {
                            continue;
                        }

                        target.Html        = rootNode.InnerHtml;
                        target.Text        = rootNode.InnerText;
                        target.NodeCount   = doc2.DocumentNode.SelectNodes(keyValuePair.Key).Count;
                        target.Score       = keyValuePair.Value;
                        target.ColumnCount = items.Count;
                        yield return(target);
                    }
                }
            }
            else
            {
                // Explicit root: ignore the known items and search under rootPath only.
                var items = GetDiffNodes(doc2, rootPath, isAttrEnabled, new List <CrawlItem>());
                if (items.Count > 0)
                {
                    var root = doc2.DocumentNode.SelectSingleNode(rootPath);
                    if (root != null)
                    {
                        var xpath  = XPath.RemoveFinalNum(root.XPath);
                        var target = getCrawTarget(items, xpath);
                        if (target != null)
                        {
                            target.RootXPath = rootPath;
                            yield return(target);
                        }
                    }
                }
            }
        }
Пример #17
0
        /// <summary>
        ///     Recursively compares a set of parallel nodes, finds the places whose text (and optionally
        ///     attribute values) differ between them, and records each such place in
        ///     <paramref name="result"/> with a name and an XPath.
        /// </summary>
        /// <param name="nodes">Parallel nodes to compare; the first two are read directly, so at least two are required.</param>
        /// <param name="result">Receives the discovered crawl items.</param>
        /// <param name="buffers">Text rows already recorded; used to skip duplicates.</param>
        /// <param name="isAttrEnabled">Whether to also capture data held in tag attributes.</param>
        /// <returns>
        ///     For a text-only node: <c>true</c> when the text differs across the nodes, otherwise
        ///     <c>false</c>. For other nodes: whether any recursive call on the children returned <c>true</c>.
        /// </returns>
        private static bool GetDiffNodes(List <HtmlNode> nodes, List <CrawlItem> result, List <List <string> > buffers,
                                         bool isAttrEnabled)
        {
            var isChildContainInfo = false;
            var node1 = nodes.First();
            var node2 = nodes[1];

            // Leaf case: the first node holds nothing but a single text child.
            if (node1.ChildNodes.Count == 1 && node1.ChildNodes[0].NodeType == HtmlNodeType.Text)
            {
                // One trimmed text value per node, each node being re-selected by its own XPath.
                var row = nodes.Select(d => d.SelectSingleNode(d.XPath))
                          .Where(d => d != null).Select(d => d.InnerText.Trim()).ToList();
                // Only text that varies between the nodes is recorded.
                if (row.Any(d => CompareString(d, node1.InnerText) == false))
                {
                    var name = node1.SearchPropertyName(result) ?? "属性" + result.Count;
                    // An equal row was recorded before: report a hit but do not add a duplicate.
                    if (buffers.Any(d => ListEqual(d, row)))
                    {
                        return(true);
                    }
                    var crawlItem = new CrawlItem
                    {
                        Name        = name,
                        SampleData1 = node1.InnerText,
                        // The sample XPath alternates between the first and second node by item count parity.
                        XPath       = result.Count % 2 == 0 ? node1.XPath : node2.XPath
                    };
                    result.Add(crawlItem);
                    buffers.Add(row);
                    return(true);
                }
                return(false);
            }
            foreach (var nodechild1 in node1.ChildNodes)
            {
                // Skip children whose XPath contains '#'.
                if (nodechild1.XPath.Contains("#"))
                {
                    continue;
                }

                // Path of the child relative to node1, applied to every parallel node.
                var path       = new XPath(nodechild1.XPath).TakeOff(node1.XPath).ToString();
                var nodechild2 =
                    nodes.Select(d => d.SelectSingleNode(d.XPath + path)).Where(d => d != null).ToList();
                // A child found under only one node has nothing to be compared with.
                if (nodechild2.Count == 1)
                {
                    continue;
                }

                isChildContainInfo |= GetDiffNodes(nodechild2, result, buffers, isAttrEnabled);
            }


            if (isAttrEnabled == false)
            {
                return(isChildContainInfo);
            }
            foreach (var attribute in node1.Attributes)
            {
                var attr1 = attribute.Value;
                ;
                // Values of this attribute across all nodes that carry it.
                var row = nodes.Select(d => d.SelectSingleNode(d.XPath))
                          .Where(d => d != null).Where(d => d.Attributes.Contains(attribute.Name)).Select(d => d.Attributes[attribute.Name].Value).ToList();
                if (row.Any(d => CompareString(d, attr1) == false))
                {
                    // NOTE(review): a duplicate row ends the whole method here, so the remaining
                    // attributes are not examined; attribute rows are also never added to buffers.
                    if (buffers.Any(d => ListEqual(d, row)))
                    {
                        return(isChildContainInfo);
                    }
                    var name = node1.SearchPropertyName(result);
                    if (name != null)
                    {
                        name += '_' + attribute.Name;
                    }
                    else
                    {
                        name = "属性" + result.Count;
                    }

                    var craw = new CrawlItem {
                        Name = name, SampleData1 = attr1, XPath = attribute.XPath
                    };
                    result.Add(craw);
                }
            }
            return(isChildContainInfo);
        }
Пример #18
0
        /// <summary>
        ///     Extracts documents from <paramref name="doc2"/> using the given crawl items.
        /// </summary>
        /// <param name="doc2">Document to read from.</param>
        /// <param name="crawlItems">Items describing the values to read; an empty list yields an empty result.</param>
        /// <param name="type">
        ///     <c>ListType.List</c> produces one document per node matched by the root path;
        ///     <c>ListType.One</c> produces a single document filled from all items.
        /// </param>
        /// <param name="rootXPath">
        ///     Root path for list mode. When empty, the common leading path of all items is used as the
        ///     root and is also taken off each item's XPath before the lookup.
        /// </param>
        /// <returns>The extracted documents; an empty list when nothing matches or the type is not handled.</returns>
        public static List <FreeDocument> GetDataFromXPath(this HtmlDocument doc2, IList <CrawlItem> crawlItems,
                                                           ListType type = ListType.List, string rootXPath = "")
        {
            if (crawlItems.Count == 0)
            {
                return new List <FreeDocument>();
            }

            if (type == ListType.One)
            {
                var single = new FreeDocument();
                foreach (var item in crawlItems)
                {
                    doc2.GetDataFromXPath(item, single);
                }
                return new List <FreeDocument> { single };
            }

            if (type != ListType.List)
            {
                return new List <FreeDocument>();
            }

            // Without an explicit root, the derived common path doubles as the prefix to strip.
            bool   deriveRoot = string.IsNullOrEmpty(rootXPath);
            string listRoot   = deriveRoot
                                ? XPath.GetMaxCompareXPath(crawlItems.Select(d => new XPath(d.XPath)).ToList()).ToString()
                                : rootXPath;
            string prefixToStrip = deriveRoot ? listRoot : "";

            var rowNodes = doc2.DocumentNode.SelectNodes(listRoot);
            if (rowNodes == null)
            {
                return new List <FreeDocument>();
            }

            var rows = new List <FreeDocument>();
            foreach (var rowNode in rowNodes)
            {
                var row = new FreeDocument();
                foreach (var item in crawlItems)
                {
                    string path = string.IsNullOrEmpty(prefixToStrip)
                                  ? rowNode.XPath + item.XPath
                                  : rowNode.XPath + new XPath(item.XPath).TakeOff(prefixToStrip);

                    var value = rowNode.GetDataFromXPath(path, item.IsHTML);
                    row.SetValue(item.Name, value);
                }
                rows.Add(row);
            }
            return rows;
        }
Пример #19
0
        /// <summary>
        ///     Recursively compares a set of parallel nodes, finds the places whose text (and optionally
        ///     attribute values) differ between them, and records each such place in
        ///     <paramref name="result"/> with a name and an XPath.
        /// </summary>
        /// <param name="nodes">Parallel nodes to compare.</param>
        /// <param name="result">Receives the discovered crawl items.</param>
        /// <param name="buffers">Text rows already recorded; used to skip duplicates.</param>
        /// <param name="isAttrEnabled">Whether to also capture data held in tag attributes.</param>
        /// <returns>
        ///     For a text-only node: <c>true</c> when the text differs across the nodes, otherwise
        ///     <c>false</c>. For other nodes: whether any recursive call on the children returned <c>true</c>.
        /// </returns>
        private static bool GetDiffNodes(List <HtmlNode> nodes, List <CrawlItem> result, List <List <string> > buffers,
                                         bool isAttrEnabled)
        {
            var isChildContainInfo = false;
            var len   = nodes.Count;
            // Two sample nodes: one picked from the first half of the list, one from the second half.
            // NOTE(review): "Random" is a member declared outside this block - presumably a shared
            // System.Random instance; confirm.
            var node1 = nodes[Random.Next(0, len / 2)];
            var node2 = nodes[Random.Next(len / 2, len)];

            // Leaf case: the sampled node holds nothing but a single text child.
            if (node1.ChildNodes.Count == 1 && node1.ChildNodes[0].NodeType == HtmlNodeType.Text)
            {
                // One trimmed text value per node, each node being re-selected by its own XPath.
                var row = nodes.Select(d => d.SelectSingleNode(d.XPath))
                          .Where(d => d != null).Select(d => d.InnerText.Trim()).ToList();
                // Only text that varies between the nodes is recorded.
                if (row.Any(d => CompareString(d, node1.InnerText) == false))
                {
                    var name = node1.SearchPropertyName(result) ?? "属性" + result.Count;
                    // An equal row was recorded before: report a hit but do not add a duplicate.
                    if (buffers.Any(d => ListEqual(d, row)))
                    {
                        return(true);
                    }
                    var crawlItem = new CrawlItem
                    {
                        Name        = name,
                        SampleData1 = node1.InnerText,
                        // The sample XPath alternates between the two sampled nodes by item count parity.
                        XPath       = result.Count % 2 == 0 ? node1.XPath : node2.XPath
                    };
                    result.Add(crawlItem);
                    buffers.Add(row);
                    return(true);
                }
                return(false);
            }
            foreach (var nodechild1 in node1.ChildNodes)
            {
                // Skip children whose XPath contains '#'.
                if (nodechild1.XPath.Contains("#"))
                {
                    continue;
                }

                // Path of the child relative to node1, applied to every parallel node.
                var path = XPath.TakeOff(nodechild1.XPath, node1.XPath);
                // Set as soon as one lookup throws; the child is then abandoned entirely.
                var fail = false;

                var nodechild2 =
                    nodes.Select(d =>
                {
                    if (fail)
                    {
                        return(null);
                    }
                    try
                    {
                        var node = d.SelectSingleNode(d.XPath + path);
                        return(node);
                    }
                    catch (Exception)
                    {
                        fail = true;
                        return(null);
                    }
                }).Where(d => d != null && fail == false).ToList();
                // Fewer than two matches leaves nothing to compare.
                if (nodechild2.Count < 2 || fail)
                {
                    continue;
                }

                isChildContainInfo |= GetDiffNodes(nodechild2, result, buffers, isAttrEnabled);
            }


            if (isAttrEnabled == false)
            {
                return(isChildContainInfo);
            }
            foreach (var attribute in node1.Attributes)
            {
                var attr1 = attribute.Value;
                ;
                // Values of this attribute across all nodes that carry it; a node whose own XPath
                // cannot be evaluated is logged and left out.
                var row = nodes.Select(d =>
                {
                    try
                    {
                        return(d.SelectSingleNode(d.XPath));
                    }
                    catch (Exception ex)
                    {
                        XLogSys.Print.Error("XPath表达式编写错误: " + d.XPath);
                        return(null);
                    }
                })
                          .Where(d => d != null)
                          .Where(d => d.Attributes.Contains(attribute.Name))
                          .Select(d => d.Attributes[attribute.Name].Value)
                          .ToList();
                if (row.Any(d => CompareString(d, attr1) == false))
                {
                    // NOTE(review): a duplicate row ends the whole method here, so the remaining
                    // attributes are not examined; attribute rows are also never added to buffers.
                    if (buffers.Any(d => ListEqual(d, row)))
                    {
                        return(isChildContainInfo);
                    }
                    var name = node1.SearchPropertyName(result);
                    if (name != null)
                    {
                        name += '_' + attribute.Name;
                    }
                    else
                    {
                        name = "属性" + result.Count;
                    }

                    var craw = new CrawlItem {
                        Name = name, SampleData1 = attr1, XPath = attribute.XPath
                    };
                    result.Add(craw);
                }
            }
            return(isChildContainInfo);
        }