Beispiel #1
0
        /// <summary>
        ///     Evaluates a crawl item's XPath against the document root and, when a value comes
        ///     back, stores it in <paramref name="document" /> under the crawl item's name.
        /// </summary>
        /// <param name="doc">HTML document whose <c>DocumentNode</c> is queried.</param>
        /// <param name="crawItem">Supplies the XPath, the <c>IsHTML</c> flag and the key name.</param>
        /// <param name="document">Receives the extracted value; left untouched when the lookup yields null.</param>
        public static void GetDataFromXPath(this HtmlDocument doc, CrawlItem crawItem, IFreeDocument document)
        {
            var extracted = doc.DocumentNode.GetDataFromXPath(crawItem.XPath, crawItem.IsHTML);
            if (extracted == null)
            {
                // Nothing was extracted, so there is nothing to store.
                return;
            }

            document.SetValue(crawItem.Name, extracted);
        }
Beispiel #2
0
        /// <summary>
        ///     Evaluates a crawl item's XPath, after <c>TakeOff(shortv)</c> has been applied to it,
        ///     against the document root and stores a non-null result in
        ///     <paramref name="document" /> under the crawl item's name.
        /// </summary>
        /// <param name="doc">HTML document whose <c>DocumentNode</c> is queried.</param>
        /// <param name="crawItem">Supplies the XPath, the <c>IsHTML</c> flag and the key name.</param>
        /// <param name="shortv">
        ///     Path handed to <c>XPath.TakeOff</c>; presumably the prefix to strip from the item's
        ///     XPath (TODO confirm against <c>XPath.TakeOff</c>).
        /// </param>
        /// <param name="document">Receives the extracted value; left untouched when the lookup yields null.</param>
        public static void GetDataFromXPath(this HtmlDocument doc, CrawlItem crawItem, string shortv,
                                            IFreeDocument document)
        {
            var rootNode     = doc.DocumentNode;
            var adjustedPath = new XPath(crawItem.XPath).TakeOff(shortv).ToString();
            var extracted    = rootNode.GetDataFromXPath(adjustedPath, crawItem.IsHTML);
            if (extracted == null)
            {
                return;
            }

            document.SetValue(crawItem.Name, extracted);
        }
Beispiel #3
0
        /// <summary>
        ///     Searches the document for XPaths matching a keyword and wraps the first one in a
        ///     new <c>CrawlItem</c>.
        /// </summary>
        /// <param name="doc">Document to search.</param>
        /// <param name="keyword">Text to search for; also stored as the item's <c>SampleData1</c>.</param>
        /// <param name="name">Name assigned to the returned item.</param>
        /// <param name="hasAttr">
        ///     Value returned by the delegate passed to the underlying search; presumably toggles
        ///     attribute matching (TODO confirm against that overload).
        /// </param>
        /// <returns>The new item, or <c>null</c> when the search produces no XPath.</returns>
        public static CrawlItem SearchXPath(this HtmlDocument doc, string keyword, string name, bool hasAttr = true)
        {
            var firstMatch = doc.SearchXPath(keyword, () => hasAttr).FirstOrDefault();

            return firstMatch == null
                ? null
                : new CrawlItem { XPath = firstMatch, SampleData1 = keyword, Name = name };
        }
Beispiel #4
0
        /// <summary>
        ///     Selects the nodes matching <paramref name="root" /> and, when enough of them exist,
        ///     delegates to the node-list overload to collect crawl items for the values that
        ///     differ between them.
        /// </summary>
        /// <param name="doc2">Document whose <c>DocumentNode</c> is queried.</param>
        /// <param name="root">XPath selecting the repeated nodes to compare.</param>
        /// <param name="isAttrEnabled">Forwarded to the node-list overload.</param>
        /// <param name="exists">
        ///     Optional items to merge in: each is copied via <c>DictCopyTo</c>, discovered items
        ///     for which <c>IsSameXPath</c> matches a copy are passed to
        ///     <c>RemoveElementsNoReturn</c>, and the copies are appended.
        /// </param>
        /// <param name="minNodeCount">Minimum number of selected nodes required to proceed.</param>
        /// <returns>
        ///     The collected items; an empty list when the XPath throws, selects nothing, or
        ///     selects fewer than <paramref name="minNodeCount" /> nodes.
        /// </returns>
        private static List <CrawlItem> GetDiffNodes(HtmlDocument doc2, string root, bool isAttrEnabled,
                                                     IEnumerable <CrawlItem> exists = null, int minNodeCount = 2)
        {
            HtmlNodeCollection matched = null;
            try
            {
                matched = doc2.DocumentNode.SelectNodes(root);
            }
            catch (Exception ex)
            {
                // A malformed XPath is reported and treated as "nothing found".
                XLogSys.Print.Error(ex.Message + "  可能XPath表达式有误");
                return new List<CrawlItem>();
            }

            var tooFewNodes = matched == null || matched.Count < minNodeCount;
            if (tooFewNodes)
            {
                return new List<CrawlItem>();
            }

            var found      = new List<CrawlItem>();
            var candidates = matched.ToList();
            if (candidates.Count >= 2)
            {
                // The node-list overload indexes the first two nodes, hence the extra count check.
                GetDiffNodes(candidates, found, new List<List<string>>(), isAttrEnabled);
            }

            if (exists == null)
            {
                return found;
            }

            var clones = new List<CrawlItem>();
            foreach (var existing in exists)
            {
                var clone = new CrawlItem();
                existing.DictCopyTo(clone);
                clones.Add(clone);
            }

            found.RemoveElementsNoReturn(item => clones.Any(clone => IsSameXPath(item.XPath, clone.XPath, root)));
            found.AddRange(clones);
            return found;
        }
Beispiel #5
0
        /// <summary>
        ///     Recursively compares a group of nodes and records a <c>CrawlItem</c> for every
        ///     text leaf (and, optionally, every attribute) whose value is not the same across
        ///     the group.
        /// </summary>
        /// <param name="nodes">
        ///     Nodes to compare. The first node drives the traversal; the second only supplies
        ///     an alternate sample XPath. Fewer than two nodes yields <c>false</c>.
        /// </param>
        /// <param name="result">
        ///     Receives the discovered items; its current count is also used for fallback names.
        /// </param>
        /// <param name="buffers">
        ///     Value rows already recorded for text leaves, used to suppress duplicates; new text
        ///     rows are appended to it.
        /// </param>
        /// <param name="isAttrEnabled">When <c>false</c>, attribute values are not inspected.</param>
        /// <returns>
        ///     <c>true</c> when a differing text leaf was found at this node or below; attribute
        ///     findings do not influence the return value.
        /// </returns>
        private static bool GetDiffNodes(List <HtmlNode> nodes, List <CrawlItem> result, List <List <string> > buffers,
                                         bool isAttrEnabled)
        {
            // Two nodes are indexed below, so anything smaller cannot be compared.
            if (nodes == null || nodes.Count < 2)
            {
                return false;
            }

            var isChildContainInfo = false;
            var node1 = nodes[0];
            var node2 = nodes[1];

            if (node1.ChildNodes.Count == 1 && node1.ChildNodes[0].NodeType == HtmlNodeType.Text)
            {
                // Text leaf: gather the trimmed text of every node in the group.
                var row = nodes.Select(d => d.SelectSingleNode(d.XPath))
                          .Where(d => d != null).Select(d => d.InnerText.Trim()).ToList();
                if (row.Any(d => CompareString(d, node1.InnerText) == false))
                {
                    var name = node1.SearchPropertyName(result) ?? "属性" + result.Count;
                    if (buffers.Any(d => ListEqual(d, row)))
                    {
                        // Same values already recorded via another path.
                        return true;
                    }
                    var crawlItem = new CrawlItem
                    {
                        Name        = name,
                        SampleData1 = node1.InnerText,
                        // NOTE(review): the sample XPath alternates between the first and second
                        // node depending on how many items exist; the reason is not visible here.
                        XPath       = result.Count % 2 == 0 ? node1.XPath : node2.XPath
                    };
                    result.Add(crawlItem);
                    buffers.Add(row);
                    return true;
                }
                return false;
            }

            foreach (var nodechild1 in node1.ChildNodes)
            {
                // Skip children whose XPath contains '#' (presumably #text / #comment pseudo-nodes).
                if (nodechild1.XPath.Contains("#"))
                {
                    continue;
                }

                var path       = new XPath(nodechild1.XPath).TakeOff(node1.XPath).ToString();
                var nodechild2 =
                    nodes.Select(d => d.SelectSingleNode(d.XPath + path)).Where(d => d != null).ToList();

                // At least two counterparts are needed; an empty list must not recurse either.
                if (nodechild2.Count < 2)
                {
                    continue;
                }

                isChildContainInfo |= GetDiffNodes(nodechild2, result, buffers, isAttrEnabled);
            }

            if (isAttrEnabled == false)
            {
                return isChildContainInfo;
            }

            foreach (var attribute in node1.Attributes)
            {
                var attr1 = attribute.Value;
                var row   = nodes.Select(d => d.SelectSingleNode(d.XPath))
                            .Where(d => d != null)
                            .Where(d => d.Attributes.Contains(attribute.Name))
                            .Select(d => d.Attributes[attribute.Name].Value)
                            .ToList();
                if (row.Any(d => CompareString(d, attr1) == false))
                {
                    if (buffers.Any(d => ListEqual(d, row)))
                    {
                        // Attribute merely repeats an already recorded text row: stop here.
                        return isChildContainInfo;
                    }
                    var name = node1.SearchPropertyName(result);
                    if (name != null)
                    {
                        name += '_' + attribute.Name;
                    }
                    else
                    {
                        name = "属性" + result.Count;
                    }

                    var craw = new CrawlItem {
                        Name = name, SampleData1 = attr1, XPath = attribute.XPath
                    };
                    result.Add(craw);
                }
            }
            return isChildContainInfo;
        }
Beispiel #6
0
        /// <summary>
        ///     Recursively compares a group of nodes and records a <c>CrawlItem</c> for every
        ///     text leaf (and, optionally, every attribute) whose value is not the same across
        ///     the group. Two sample nodes are drawn with <c>Random.Next</c>, one index from each
        ///     half of the list (assuming an exclusive upper bound).
        /// </summary>
        /// <param name="nodes">Nodes to compare.</param>
        /// <param name="result">
        ///     Receives the discovered items; its current count is also used for fallback names.
        /// </param>
        /// <param name="buffers">
        ///     Value rows already recorded for text leaves, used to suppress duplicates; new text
        ///     rows are appended to it.
        /// </param>
        /// <param name="isAttrEnabled">When <c>false</c>, attribute values are not inspected.</param>
        /// <returns>
        ///     <c>true</c> when a differing text leaf was found at the sampled node or below;
        ///     attribute findings do not influence the return value.
        /// </returns>
        private static bool GetDiffNodes(List <HtmlNode> nodes, List <CrawlItem> result, List <List <string> > buffers,
                                         bool isAttrEnabled)
        {
            var anyChildDiffers = false;
            var total   = nodes.Count;
            var sampleA = nodes[Random.Next(0, total / 2)];
            var sampleB = nodes[Random.Next(total / 2, total)];

            var isTextLeaf = sampleA.ChildNodes.Count == 1 &&
                             sampleA.ChildNodes[0].NodeType == HtmlNodeType.Text;
            if (isTextLeaf)
            {
                var texts = nodes.Select(d => d.SelectSingleNode(d.XPath))
                            .Where(d => d != null)
                            .Select(d => d.InnerText.Trim())
                            .ToList();

                var differs = texts.Any(t => CompareString(t, sampleA.InnerText) == false);
                if (!differs)
                {
                    return false;
                }

                var leafName = sampleA.SearchPropertyName(result);
                if (leafName == null)
                {
                    leafName = "属性" + result.Count;
                }

                // Identical value rows were already recorded through another path.
                if (buffers.Any(known => ListEqual(known, texts)))
                {
                    return true;
                }

                result.Add(new CrawlItem
                {
                    Name        = leafName,
                    SampleData1 = sampleA.InnerText,
                    XPath       = result.Count % 2 == 0 ? sampleA.XPath : sampleB.XPath
                });
                buffers.Add(texts);
                return true;
            }

            foreach (var child in sampleA.ChildNodes)
            {
                if (child.XPath.Contains("#"))
                {
                    continue;
                }

                var suffix  = XPath.TakeOff(child.XPath, sampleA.XPath);
                var aborted = false;
                var peers   = new List<HtmlNode>();

                foreach (var candidate in nodes)
                {
                    HtmlNode hit;
                    try
                    {
                        hit = candidate.SelectSingleNode(candidate.XPath + suffix);
                    }
                    catch (Exception)
                    {
                        // One failing lookup discards this child for the whole group.
                        aborted = true;
                        break;
                    }

                    if (hit != null)
                    {
                        peers.Add(hit);
                    }
                }

                if (aborted || peers.Count < 2)
                {
                    continue;
                }

                anyChildDiffers |= GetDiffNodes(peers, result, buffers, isAttrEnabled);
            }

            if (isAttrEnabled == false)
            {
                return anyChildDiffers;
            }

            foreach (var attribute in sampleA.Attributes)
            {
                var sampleValue = attribute.Value;
                var values      = new List<string>();

                foreach (var candidate in nodes)
                {
                    HtmlNode resolved;
                    try
                    {
                        resolved = candidate.SelectSingleNode(candidate.XPath);
                    }
                    catch (Exception)
                    {
                        XLogSys.Print.Error("XPath表达式编写错误: " + candidate.XPath);
                        continue;
                    }

                    if (resolved == null || !resolved.Attributes.Contains(attribute.Name))
                    {
                        continue;
                    }

                    values.Add(resolved.Attributes[attribute.Name].Value);
                }

                if (!values.Any(v => CompareString(v, sampleValue) == false))
                {
                    continue;
                }

                // Attribute merely repeats an already recorded text row: stop here.
                if (buffers.Any(known => ListEqual(known, values)))
                {
                    return anyChildDiffers;
                }

                var attrName = sampleA.SearchPropertyName(result);
                attrName = attrName != null
                    ? attrName + '_' + attribute.Name
                    : "属性" + result.Count;

                result.Add(new CrawlItem { Name = attrName, SampleData1 = sampleValue, XPath = attribute.XPath });
            }

            return anyChildDiffers;
        }
Beispiel #7
0
        /// <summary>
        ///     Builds a <c>CrawlItem</c> from the current selection (<c>SelectXPath</c>,
        ///     <c>SelectName</c>, <c>SelectText</c>) and appends it to <c>CrawlItems</c>.
        /// </summary>
        /// <remarks>
        ///     When <c>RootXPath</c> is set, the selected node must pass <c>IsAncestor</c> against
        ///     the parent of the root node, and the stored XPath is the selected node's XPath
        ///     after <c>TakeOff</c> of that parent's XPath. When the name is already taken,
        ///     <c>SelectName</c> is replaced by a generated name; in alert mode the item is not
        ///     added, otherwise it is added under the generated name.
        /// </remarks>
        /// <param name="isAlert">
        ///     When <c>true</c>, problems are reported through <c>MessageBox</c> and a duplicate
        ///     name aborts the add.
        /// </param>
        private void AddNewItem(bool isAlert = true)
        {
            var path = SelectXPath;
            if (!string.IsNullOrEmpty(RootXPath))
            {
                // A lookup that matches nothing is handled like a failed ancestry check instead
                // of dereferencing null.
                var root = HtmlDoc.DocumentNode.SelectSingleNode(RootXPath)?.ParentNode;
                var node = HtmlDoc.DocumentNode.SelectSingleNode(path);
                if (root == null || node == null || !node.IsAncestor(root))
                {
                    if (isAlert)
                    {
                        MessageBox.Show("当前XPath所在节点不是父节点的后代,请检查对应的XPath");
                    }
                    return;
                }
                path = new XPath(node.XPath).TakeOff(root.XPath).ToString();
            }

            var item = new CrawlItem {XPath = path, Name = SelectName, SampleData1 = SelectText};
            if (CrawlItems.Any(d => d.Name == SelectName))
            {
                // Remember the clashing name before SelectName is replaced by the suggestion.
                var duplicateName = SelectName;
                SelectName = "属性" + CrawlItems.Count;
                if (isAlert)
                {
                    MessageBox.Show($"已存在名称为{duplicateName}的属性,不能重复添加");
                    return;
                }

                // Silent mode: add the item under the generated name rather than the duplicate.
                item.Name = SelectName;
            }
            CrawlItems.Add(item);
            SelectXPath = "";
        }
Beispiel #8
0
        /// <summary>
        ///     Restores this object's state from a dictionary: scalar settings, the nested
        ///     "HttpSet" and "Login" dictionaries, and one <c>CrawlItem</c> per child when the
        ///     dictionary is a <c>FreeDocument</c> with children.
        /// </summary>
        /// <remarks>
        ///     "HttpSet" and "Login" entries are skipped when their value is not an
        ///     <c>IDictionary&lt;string, object&gt;</c>. A "Generator" entry, if present, is not
        ///     consumed by this method.
        /// </remarks>
        /// <param name="dicts">Source dictionary.</param>
        /// <param name="scenario">Passed through to the base implementation.</param>
        public override void DictDeserialize(IDictionary<string, object> dicts, Scenario scenario = Scenario.Database)
        {
            base.DictDeserialize(dicts, scenario);

            // Set(...) receives the current value as second argument; presumably it is the
            // fallback when the key is absent (TODO confirm against the extension method).
            URL = dicts.Set("URL", URL);
            RootXPath = dicts.Set("RootXPath", RootXPath);
            IsMultiData = dicts.Set("IsMultiData", IsMultiData);
            URLFilter = dicts.Set("URLFilter", URLFilter);
            Crawler = dicts.Set("Crawler", Crawler);
            ContentFilter = dicts.Set("ContentFilter", ContentFilter);

            object raw;
            if (dicts.TryGetValue("HttpSet", out raw))
            {
                var httpSettings = raw as IDictionary<string, object>;
                if (httpSettings != null)
                {
                    Http.UnsafeDictDeserialize(httpSettings);
                }
            }

            if (dicts.TryGetValue("Login", out raw))
            {
                var loginSettings = raw as IDictionary<string, object>;
                if (loginSettings != null)
                {
                    var login = new HttpItem();
                    login.DictDeserialize(loginSettings);
                    Documents.Add(login);
                }
            }

            var doc = dicts as FreeDocument;
            if (doc?.Children != null)
            {
                foreach (var child in doc.Children)
                {
                    var item = new CrawlItem();
                    item.DictDeserialize(child);
                    CrawlItems.Add(item);
                }
            }
        }
Beispiel #9
0
        /// <summary>
        ///     Recursively compares a group of nodes and records a <c>CrawlItem</c> for every
        ///     text leaf (and, optionally, every attribute) whose value is not the same across
        ///     the group.
        /// </summary>
        /// <param name="nodes">
        ///     Nodes to compare. The first node drives the traversal; the second only supplies
        ///     an alternate sample XPath. Fewer than two nodes yields <c>false</c>.
        /// </param>
        /// <param name="result">
        ///     Receives the discovered items; its current count is also used for fallback names.
        /// </param>
        /// <param name="buffers">
        ///     Value rows already recorded for text leaves, used to suppress duplicates; new text
        ///     rows are appended to it.
        /// </param>
        /// <param name="isAttrEnabled">When <c>false</c>, attribute values are not inspected.</param>
        /// <returns>
        ///     <c>true</c> when a differing text leaf was found at this node or below; attribute
        ///     findings do not influence the return value.
        /// </returns>
        private static bool GetDiffNodes(List<HtmlNode> nodes, List<CrawlItem> result, List<List<string>> buffers,
            bool isAttrEnabled)
        {
            // Two nodes are indexed below, so anything smaller cannot be compared.
            if (nodes == null || nodes.Count < 2)
            {
                return false;
            }

            var isChildContainInfo = false;
            var node1 = nodes[0];
            var node2 = nodes[1];
            if (node1.ChildNodes.Count == 1 && node1.ChildNodes[0].NodeType == HtmlNodeType.Text)
            {
                // Text leaf: gather the trimmed text of every node in the group.
                var row = nodes.Select(d => d.SelectSingleNode(d.XPath))
                    .Where(d => d != null).Select(d => d.InnerText.Trim()).ToList();
                if (row.Any(d => CompareString(d, node1.InnerText) == false))
                {
                    var name = node1.SearchPropertyName(result) ?? "属性" + result.Count;
                    if (buffers.Any(d => ListEqual(d, row)))
                    {
                        // Same values already recorded via another path.
                        return true;
                    }
                    var crawlItem = new CrawlItem
                    {
                        Name = name,
                        SampleData1 = node1.InnerText,
                        // NOTE(review): the sample XPath alternates between the first and second
                        // node depending on how many items exist; the reason is not visible here.
                        XPath = result.Count%2 == 0 ? node1.XPath : node2.XPath
                    };
                    result.Add(crawlItem);
                    buffers.Add(row);
                    return true;
                }
                return false;
            }
            foreach (var nodechild1 in node1.ChildNodes)
            {
                // Skip children whose XPath contains '#' (presumably #text / #comment pseudo-nodes).
                if (nodechild1.XPath.Contains("#"))
                {
                    continue;
                }

                var path = new XPath(nodechild1.XPath).TakeOff(node1.XPath).ToString();
                var nodechild2 =
                    nodes.Select(d => d.SelectSingleNode(d.XPath + path)).Where(d => d != null).ToList();

                // At least two counterparts are needed; an empty list must not recurse either.
                if (nodechild2.Count < 2)
                {
                    continue;
                }

                isChildContainInfo |= GetDiffNodes(nodechild2, result, buffers, isAttrEnabled);
            }

            if (isAttrEnabled == false)
            {
                return isChildContainInfo;
            }
            foreach (var attribute in node1.Attributes)
            {
                var attr1 = attribute.Value;
                var row = nodes.Select(d => d.SelectSingleNode(d.XPath))
                    .Where(d => d != null)
                    .Where(d => d.Attributes.Contains(attribute.Name))
                    .Select(d => d.Attributes[attribute.Name].Value)
                    .ToList();
                if (row.Any(d => CompareString(d, attr1) == false))
                {
                    if (buffers.Any(d => ListEqual(d, row)))
                    {
                        // Attribute merely repeats an already recorded text row: stop here.
                        return isChildContainInfo;
                    }
                    var name = node1.SearchPropertyName(result);
                    if (name != null)
                    {
                        name += '_' + attribute.Name;
                    }
                    else
                    {
                        name = "属性" + result.Count;
                    }

                    var craw = new CrawlItem {Name = name, SampleData1 = attr1, XPath = attribute.XPath};
                    result.Add(craw);
                }
            }
            return isChildContainInfo;
        }
Beispiel #10
0
 /// <summary>
 ///     Searches the document for XPaths matching a keyword and wraps the first one in a
 ///     new <c>CrawlItem</c>.
 /// </summary>
 /// <param name="doc">Document to search.</param>
 /// <param name="keyword">Text to search for; also stored as the item's <c>SampleData1</c>.</param>
 /// <param name="name">Name assigned to the returned item.</param>
 /// <param name="hasAttr">
 ///     Value returned by the delegate passed to the underlying search; presumably toggles
 ///     attribute matching (TODO confirm against that overload).
 /// </param>
 /// <returns>The new item, or <c>null</c> when the search produces no XPath.</returns>
 public static CrawlItem SearchXPath(this HtmlDocument doc, string keyword, string name, bool hasAttr = true)
 {
     var firstMatch = doc.SearchXPath(keyword, () => hasAttr).FirstOrDefault();
     if (firstMatch != null)
     {
         return new CrawlItem {XPath = firstMatch, SampleData1 = keyword, Name = name};
     }

     return null;
 }
Beispiel #11
0
        /// <summary>
        ///     Evaluates a crawl item's XPath against the document root and, when a value comes
        ///     back, stores it in <paramref name="document" /> under the crawl item's name.
        /// </summary>
        /// <param name="doc">HTML document whose <c>DocumentNode</c> is queried.</param>
        /// <param name="crawItem">Supplies the XPath, the <c>IsHTML</c> flag and the key name.</param>
        /// <param name="document">Receives the extracted value; left untouched when the lookup yields null.</param>
        public static void GetDataFromXPath(this HtmlDocument doc, CrawlItem crawItem, IFreeDocument document)
        {
            var extracted = doc.DocumentNode.GetDataFromXPath(crawItem.XPath, crawItem.IsHTML);
            if (extracted == null)
            {
                // Nothing was extracted, so there is nothing to store.
                return;
            }

            document.SetValue(crawItem.Name, extracted);
        }
Beispiel #12
0
        /// <summary>
        ///     Evaluates a crawl item's XPath, after <c>TakeOff(shortv)</c> has been applied to it,
        ///     against the document root and stores a non-null result in
        ///     <paramref name="document" /> under the crawl item's name.
        /// </summary>
        /// <param name="doc">HTML document whose <c>DocumentNode</c> is queried.</param>
        /// <param name="crawItem">Supplies the XPath, the <c>IsHTML</c> flag and the key name.</param>
        /// <param name="shortv">
        ///     Path handed to <c>XPath.TakeOff</c>; presumably the prefix to strip from the item's
        ///     XPath (TODO confirm against <c>XPath.TakeOff</c>).
        /// </param>
        /// <param name="document">Receives the extracted value; left untouched when the lookup yields null.</param>
        public static void GetDataFromXPath(this HtmlDocument doc, CrawlItem crawItem, string shortv,
            IFreeDocument document)
        {
            var rootNode = doc.DocumentNode;
            var adjustedPath = new XPath(crawItem.XPath).TakeOff(shortv).ToString();
            var extracted = rootNode.GetDataFromXPath(adjustedPath, crawItem.IsHTML);
            if (extracted == null)
            {
                return;
            }

            document.SetValue(crawItem.Name, extracted);
        }