Пример #1
0
        /// <summary>
        /// Builds one field per entry of <c>rule.RuleFields</c> by evaluating that entry's
        /// XPath against <paramref name="item"/>.
        /// </summary>
        /// <param name="item">Node the rule fields' XPath expressions are evaluated against.</param>
        /// <param name="rule">Rule whose <c>RuleFields</c> describe what to extract.</param>
        /// <returns>
        /// A list containing exactly one field per rule field, in rule order. When a rule
        /// field's XPath matches nothing, its field only carries the display name and an
        /// empty field name.
        /// </returns>
        public List<Field> GetFields(HtmlNode item, SpliderRule rule)
        {
            var fields = new List<Field>();

            foreach (var rulefield in rule.RuleFields)
            {
                var field = new Field()
                {
                    DisplayName = rulefield.DisplayName, FieldName = ""
                };

                var fieldnode = item.SelectSingleNode(rulefield.XPath);
                if (fieldnode != null)
                {
                    var html = fieldnode.InnerHtml;
                    var text = fieldnode.InnerText;

                    field.InnerHtml = html;
                    field.InnerText = text;

                    // The optional regexes strip every match from the raw html/text;
                    // a blank pattern means "keep the raw value".
                    field.AfterRegexHtml = string.IsNullOrWhiteSpace(rulefield.InnerHtmlRegex)
                        ? html
                        : Regex.Replace(html, rulefield.InnerHtmlRegex, "");
                    field.AfterRegexText = string.IsNullOrWhiteSpace(rulefield.InnerTextRegex)
                        ? text
                        : Regex.Replace(text, rulefield.InnerTextRegex, "");

                    if (!string.IsNullOrWhiteSpace(rulefield.Attribute))
                    {
                        // The attribute lookup yields null when the node does not carry the
                        // requested attribute; dereferencing it unconditionally used to throw
                        // a NullReferenceException. Value is left unset in that case.
                        var attribute = fieldnode.Attributes[rulefield.Attribute];
                        if (attribute != null)
                        {
                            field.Value = attribute.Value;
                        }
                    }
                    else
                    {
                        field.Value = rulefield.IsFirstInnerText ? field.AfterRegexText : field.AfterRegexHtml;
                    }
                }
                fields.Add(field);
            }
            return fields;
        }
Пример #2
0
        /// <summary>
        /// Loads the page at <c>rule.Url</c> and extracts content according to the rule.
        /// </summary>
        /// <param name="rule">
        /// Rule providing the URL, the content XPath, the optional per-item XPath and the id
        /// stamped on every result.
        /// </param>
        /// <returns>
        /// For a list page (non-blank <c>rule.EachXPath</c>): one entry per matched item node.
        /// For a detail page: a single entry built from the content node. An empty list when
        /// the content XPath, or the per-item XPath, matches nothing.
        /// </returns>
        public List<SpliderContent> GetByRule(SpliderRule rule)
        {
            var list = new List<SpliderContent>();

            // 1. Supports loading html from the web or from a local path.
            HtmlWeb web = new HtmlWeb();
            var htmlDoc = web.Load(rule.Url);

            var contentnode = htmlDoc.DocumentNode.SelectSingleNode(rule.ContentXPath);
            if (contentnode == null)
            {
                // Content XPath matched nothing: there is nothing to extract from.
                return list;
            }

            // List page
            if (!string.IsNullOrWhiteSpace(rule.EachXPath))
            {
                // SelectNodes yields null (not an empty collection) when nothing matches.
                var itemsNodes = contentnode.SelectNodes(rule.EachXPath);
                if (itemsNodes != null)
                {
                    foreach (var item in itemsNodes)
                    {
                        list.Add(new SpliderContent()
                        {
                            Fields        = GetFields(item, rule),
                            SpliderRuleId = rule.Id
                        });
                    }
                }
                return list;
            }

            // Detail page
            list.Add(new SpliderContent()
            {
                Fields        = GetFields(contentnode, rule),
                SpliderRuleId = rule.Id
            });
            return list;
        }
Пример #3
0
        /// <summary>
        /// Parses the supplied html text and extracts list items according to the rule.
        /// </summary>
        /// <param name="rule">
        /// Rule providing the content XPath, the per-item XPath and the id stamped on every result.
        /// </param>
        /// <param name="content">Html markup to parse.</param>
        /// <returns>
        /// One entry per node matched by <c>rule.EachXPath</c> below the content node. An empty
        /// list when <c>rule.EachXPath</c> is blank or when either XPath matches nothing.
        /// </returns>
        public List<SpliderContent> GetByRule(SpliderRule rule, string content)
        {
            var list = new List<SpliderContent>();

            var htmlDoc = new HtmlDocument();
            htmlDoc.LoadHtml(content);

            var contentnode = htmlDoc.DocumentNode.SelectSingleNode(rule.ContentXPath);

            // Only list extraction is performed here; a blank per-item XPath or a content
            // XPath without a match leaves nothing to iterate.
            if (contentnode == null || string.IsNullOrWhiteSpace(rule.EachXPath))
            {
                return list;
            }

            // SelectNodes yields null (not an empty collection) when nothing matches.
            var itemsNodes = contentnode.SelectNodes(rule.EachXPath);
            if (itemsNodes == null)
            {
                return list;
            }

            foreach (var item in itemsNodes)
            {
                list.Add(new SpliderContent()
                {
                    Fields        = GetFields(item, rule),
                    SpliderRuleId = rule.Id
                });
            }
            return list;
        }
Пример #4
0
        /// <summary>
        /// Loads an html document from a file and extracts a single detail-page entry
        /// according to the rule.
        /// </summary>
        /// <param name="rule">Rule providing the content XPath and the id stamped on the result.</param>
        /// <param name="filename">Path of the html file to load.</param>
        /// <returns>
        /// A list with one entry built from the content node, or an empty list when
        /// <c>rule.ContentXPath</c> matches nothing.
        /// </returns>
        public List<SpliderContent> GetByRuleFromFile(SpliderRule rule, string filename)
        {
            var list = new List<SpliderContent>();

            var htmlDoc = new HtmlDocument();
            htmlDoc.Load(filename);

            var contentnode = htmlDoc.DocumentNode.SelectSingleNode(rule.ContentXPath);
            if (contentnode == null)
            {
                // No content node: GetFields would dereference null, so report "nothing found".
                return list;
            }

            // Detail page
            list.Add(new SpliderContent()
            {
                Fields        = GetFields(contentnode, rule),
                SpliderRuleId = rule.Id
            });
            return list;
        }
Пример #5
0
        /// <summary>
        /// Loads the page at <c>rule.Url</c> and extracts a single detail-page entry
        /// according to the rule.
        /// </summary>
        /// <param name="rule">
        /// Rule providing the URL, the content XPath and the id stamped on the result.
        /// </param>
        /// <returns>
        /// A list with one entry built from the content node, or an empty list when
        /// <c>rule.ContentXPath</c> matches nothing.
        /// </returns>
        public List<SpliderContent> GetByRule(SpliderRule rule)
        {
            var list = new List<SpliderContent>();

            // 1. Supports loading html from the web.
            HtmlWeb web = new HtmlWeb();
            var htmlDoc = web.Load(rule.Url);

            var contentnode = htmlDoc.DocumentNode.SelectSingleNode(rule.ContentXPath);
            if (contentnode == null)
            {
                // No content node: GetFields would dereference null, so report "nothing found".
                return list;
            }

            // Detail page
            list.Add(new SpliderContent()
            {
                Fields        = GetFields(contentnode, rule),
                SpliderRuleId = rule.Id
            });
            return list;
        }
Пример #6
0
        /// <summary>
        /// Loads a page and extracts content according to the rule.
        /// </summary>
        /// <param name="rule">
        /// Rule providing the URL, the content XPath, the optional per-item XPath and the id
        /// stamped on every result. When <c>rule.Url</c> is blank, the sample endpoint this
        /// method was previously pinned to is used instead.
        /// </param>
        /// <returns>
        /// For a list page (non-blank <c>rule.EachXPath</c>): one entry per matched item node.
        /// For a detail page: a single entry built from the content node. An empty list when
        /// the content XPath, or the per-item XPath, matches nothing.
        /// </returns>
        public List<SpliderContent> GetByRule(SpliderRule rule)
        {
            var list = new List<SpliderContent>();

            // The rule's own URL used to be ignored in favour of a hard-coded debugging
            // endpoint; that endpoint is now only a fallback for rules without a URL.
            var url = string.IsNullOrWhiteSpace(rule.Url)
                ? @"https://www.wandoujia.com/wdjweb/api/category/more?catId=5017&subCatId=593&page=2"
                : rule.Url;

            // 1. Supports loading html from the web or from a local path.
            HtmlWeb web = new HtmlWeb();
            var htmlDoc = web.Load(url);

            var contentnode = htmlDoc.DocumentNode.SelectSingleNode(rule.ContentXPath);
            if (contentnode == null)
            {
                // Content XPath matched nothing: there is nothing to extract from.
                return list;
            }

            // List page
            if (!string.IsNullOrWhiteSpace(rule.EachXPath))
            {
                // SelectNodes yields null (not an empty collection) when nothing matches.
                var itemsNodes = contentnode.SelectNodes(rule.EachXPath);
                if (itemsNodes != null)
                {
                    foreach (var item in itemsNodes)
                    {
                        list.Add(new SpliderContent()
                        {
                            Fields        = GetFields(item, rule),
                            SpliderRuleId = rule.Id
                        });
                    }
                }
                return list;
            }

            // Detail page
            list.Add(new SpliderContent()
            {
                Fields        = GetFields(contentnode, rule),
                SpliderRuleId = rule.Id
            });
            return list;
        }