Пример #1
0
        /// <summary>
        /// Builds the selector matching <c>expressionType</c> from <c>expressionValue</c>
        /// and the optional first entry of <c>expressionParams</c>.
        /// </summary>
        /// <returns>
        /// A CSS, regex, JSON-path or XPath selector; XPath is also used for any
        /// expression type that is not handled explicitly.
        /// </returns>
        private ISelector CompileSelector()
        {
            switch (expressionType)
            {
                case ExpressionType.Css:
                    // The first parameter, when present, is forwarded as the second Css argument.
                    if (expressionParams.Length >= 1)
                    {
                        return Selectors.Css(expressionValue, expressionParams[0]);
                    }

                    return Selectors.Css(expressionValue);

                case ExpressionType.Regex:
                    // The first parameter, when present, is parsed as an integer and forwarded.
                    if (expressionParams.Length >= 1)
                    {
                        return Selectors.Regex(expressionValue,
                                               int.Parse(expressionParams[0]));
                    }

                    return Selectors.Regex(expressionValue);

                case ExpressionType.JsonPath:
                    return new JsonPathSelector(expressionValue);

                case ExpressionType.XPath:
                default:
                    // XPath doubles as the fallback for unrecognised expression types.
                    return Selectors.XPath(expressionValue);
            }
        }
Пример #2
0
        /// <summary>
        /// Converts a selector description into the actual selector that performs the query.
        /// </summary>
        /// <param name="selector">Selector description; may be null.</param>
        /// <returns>
        /// The selector for <paramref name="selector"/>, or null when
        /// <paramref name="selector"/> is null.
        /// </returns>
        /// <exception cref="SpiderException">
        /// Thrown when the regex argument is not an integer, or when the selector type
        /// is not supported.
        /// </exception>
        public static ISelector ToSelector(this Selector selector)
        {
            if (selector == null)
            {
                return null;
            }

            string expression = selector.Expression;

            // NOTE(review): NotNullExpression is defined elsewhere; presumably it rejects a
            // missing expression - confirm. The Enviroment case deliberately skips it.
            switch (selector.Type)
            {
                case SelectorType.Css:
                {
                    NotNullExpression(selector);
                    return Selectors.Css(expression);
                }

                case SelectorType.Enviroment:
                {
                    return Selectors.Enviroment(expression);
                }

                case SelectorType.JsonPath:
                {
                    NotNullExpression(selector);
                    return Selectors.JsonPath(expression);
                }

                case SelectorType.Regex:
                {
                    NotNullExpression(selector);
                    if (string.IsNullOrEmpty(selector.Arguments))
                    {
                        return Selectors.Regex(expression);
                    }

                    // A non-empty argument must be the integer passed as the regex group.
                    int group;
                    if (int.TryParse(selector.Arguments, out group))
                    {
                        return Selectors.Regex(expression, group);
                    }

                    throw new SpiderException("Regex argument should be a number set to group: " + selector);
                }

                case SelectorType.XPath:
                {
                    NotNullExpression(selector);
                    return Selectors.XPath(expression);
                }

                default:
                {
                    // Message typo fixed ("unsupoort" -> "unsupported").
                    throw new SpiderException($"Selector {selector} unsupported");
                }
            }
        }
Пример #3
0
        public void Css()
        {
            // Element selection: the anchor's value is its text.
            var anchor = Selectors.Css("div h1 a").Select(html);
            Assert.Equal("aabbcc", anchor.Value);

            // Attribute selection: the second argument names the attribute to read.
            var hrefValue = Selectors.Css("div h1 a", "href").Select(html).Value;
            Assert.Equal("xxx", hrefValue);

            // Selecting the element again with a fresh selector yields the same value.
            var anchorValue = Selectors.Css("div h1 a").Select(html).Value;
            Assert.Equal("aabbcc", anchorValue);
        }
Пример #4
0
 public void TestEach()
 {
     // Assert.AreEqual takes the expected value first; the arguments were previously
     // swapped, which produces misleading "expected/actual" failure messages.

     // CSS selectors: element markup, attribute value, inner markup.
     Assert.AreEqual("<a href=\"xxx\">aabbcc</a>", Selectors.Css("div h1 a").Select(_html).OuterHtml);
     Assert.AreEqual("xxx", Selectors.Css("div h1 a", "href").Select(_html));
     Assert.AreEqual("aabbcc", Selectors.Css("div h1 a").Select(_html).InnerHtml);

     // XPath attribute selection.
     Assert.AreEqual("xxx", Selectors.XPath("//a/@href").Select(_html));

     // Regex selection, without and with an explicit group argument.
     Assert.AreEqual("xxx", Selectors.Regex("a href=\"(.*)\"").Select(_html));
     Assert.AreEqual("xxx", Selectors.Regex("(a href)=\"(.*)\"", 2).Select(_html));
 }
Пример #5
0
        public void TestCombo()
        {
            // Assert.AreEqual takes the expected value first; the arguments were previously
            // swapped, which produces misleading "expected/actual" failure messages.

            // And: the CSS selector and the regex are combined into one selector.
            var value1 = Selectors.And(Selectors.Css("title"), Selectors.Regex("aa(bb)cc")).Select(_html2);

            Assert.AreEqual("bb", value1);

            // Or: both documents are expected to yield "aabbcc" through the combined selector.
            var or = Selectors.Or(Selectors.Css("div h1 a", "innerHtml"), Selectors.XPath("//title"));

            Assert.AreEqual("aabbcc", or.Select(_html));
            Assert.AreEqual("aabbcc", or.Select(_html2));
        }
Пример #6
0
        /// <summary>
        /// Creates the concrete selector described by <paramref name="selector"/>.
        /// </summary>
        /// <param name="selector">Selector description; may be null.</param>
        /// <returns>
        /// The matching selector, or null when <paramref name="selector"/> is null or
        /// has a null or empty expression.
        /// </returns>
        /// <exception cref="SpiderException">
        /// Thrown when the regex argument is not an integer, or when the selector type
        /// is not handled.
        /// </exception>
        public static ISelector Parse(BaseSelector selector)
        {
            string expression = selector?.Expression;
            if (string.IsNullOrEmpty(expression))
            {
                return null;
            }

            switch (selector.Type)
            {
                case SelectorType.Css:
                    return Selectors.Css(expression);

                case SelectorType.Enviroment:
                    return Selectors.Enviroment(expression);

                case SelectorType.JsonPath:
                    return Selectors.JsonPath(expression);

                case SelectorType.XPath:
                    return Selectors.XPath(expression);

                case SelectorType.Regex:
                    if (string.IsNullOrEmpty(selector.Argument))
                    {
                        return Selectors.Regex(expression);
                    }

                    // A non-empty argument must be the integer passed as the regex group.
                    int groupIndex;
                    if (!int.TryParse(selector.Argument, out groupIndex))
                    {
                        throw new SpiderException("Regex argument should be a number set to group: " + selector);
                    }

                    return Selectors.Regex(expression, groupIndex);

                default:
                    throw new SpiderException("Not support selector: " + selector);
            }
        }
Пример #7
0
        /// <summary>
        /// Converts a selector attribute into the actual selector that performs the query.
        /// </summary>
        /// <param name="selector">Selector attribute; may be null.</param>
        /// <returns>
        /// The selector for <paramref name="selector"/>, or null when
        /// <paramref name="selector"/> is null.
        /// </returns>
        /// <exception cref="ArgumentException">The regex argument is not an integer.</exception>
        /// <exception cref="NotSupportedException">The selector type is not handled.</exception>
        public static ISelector ToSelector(this Attribute.Selector selector)
        {
            if (selector == null)
            {
                return null;
            }

            string expression = selector.Expression;

            // NOTE(review): NotNullExpression is defined elsewhere; presumably it rejects a
            // missing expression - confirm.
            switch (selector.Type)
            {
                case SelectorType.Css:
                    NotNullExpression(selector);
                    return Selectors.Css(expression);

                case SelectorType.JsonPath:
                    NotNullExpression(selector);
                    return Selectors.JsonPath(expression);

                case SelectorType.XPath:
                    NotNullExpression(selector);
                    return Selectors.XPath(expression);

                case SelectorType.Regex:
                    NotNullExpression(selector);
                    if (string.IsNullOrEmpty(selector.Arguments))
                    {
                        return Selectors.Regex(expression);
                    }

                    // A non-empty argument must be the integer passed as the regex group.
                    if (!int.TryParse(selector.Arguments, out var groupIndex))
                    {
                        throw new ArgumentException($"Regex argument should be a number set to group: {selector}");
                    }

                    return Selectors.Regex(expression, groupIndex);

                default:
                    throw new NotSupportedException($"{selector} unsupported");
            }
        }
Пример #8
0
        /// <summary>
        /// Converts a selector description into the actual selector that performs the query.
        /// </summary>
        /// <param name="selector">Selector description; may be null.</param>
        /// <returns>
        /// The selector for <paramref name="selector"/>, or null when
        /// <paramref name="selector"/> is null.
        /// </returns>
        /// <exception cref="ArgumentException">
        /// The regex arguments do not contain both a <see cref="RegexOptions"/> name and a
        /// replacement, or the options name cannot be parsed.
        /// </exception>
        /// <exception cref="NotSupportedException">The selector type is not handled.</exception>
        public static ISelector ToSelector(this Selector selector)
        {
            if (selector == null)
            {
                return null;
            }

            var expression = selector.Expression;

            // NOTE(review): NotNullExpression is defined elsewhere; presumably it rejects a
            // missing expression - confirm.
            switch (selector.Type)
            {
                case SelectorType.Css:
                {
                    NotNullExpression(selector);
                    return Selectors.Css(expression);
                }

                case SelectorType.JsonPath:
                {
                    NotNullExpression(selector);
                    return Selectors.JsonPath(expression);
                }

                case SelectorType.Regex:
                {
                    NotNullExpression(selector);
                    if (string.IsNullOrEmpty(selector.Arguments))
                    {
                        return Selectors.Regex(expression);
                    }

                    // Arguments are "<RegexOptions>,<replacement>". Empty entries are dropped by
                    // the split, so inputs such as "IgnoreCase" or "," yield fewer than two
                    // parts; report that explicitly instead of failing on an array index.
                    var arguments = selector.Arguments.Split(new[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
                    if (arguments.Length < 2)
                    {
                        throw new ArgumentException(
                            $"Regex arguments should be a RegexOptions value and a replacement separated by ',': {selector}");
                    }

                    var options     = (RegexOptions)Enum.Parse(typeof(RegexOptions), arguments[0]);
                    var replacement = arguments[1];
                    return Selectors.Regex(expression, options, replacement);
                }

                case SelectorType.XPath:
                {
                    NotNullExpression(selector);
                    return Selectors.XPath(expression);
                }

                default:
                {
                    throw new NotSupportedException($"{selector} unsupported");
                }
            }
        }
Пример #9
0
        protected override void Handle(Page page)
        {
            // Every ".yk-col4" node is turned into one VideoContent entry.
            var videoNodes = page.Selectable.SelectList(Selectors.Css(".yk-col4")).Nodes();
            var videos     = new List <VideoContent>();

            foreach (var videoNode in videoNodes)
            {
                // Title text comes from the first title link, hit count from the last list item.
                var title = videoNode.Css(".info-list .title a").Nodes().FirstOrDefault()?.GetValue(ValueOption.InnerText);
                var hits  = videoNode.Css(".info-list li").Nodes().LastOrDefault()?.GetValue(ValueOption.InnerText);

                // Link and thumbnail addresses are read from the href / src attributes.
                var href    = videoNode.Css(".info-list .title a").XPath("@href").GetValue();
                var imgHref = videoNode.Css(".p-thumb img").XPath("@src").GetValue();

                videos.Add(new VideoContent
                {
                    Title   = title,
                    Hits    = hits,
                    Href    = href,
                    ImgHref = imgHref
                });
            }

            // Store the list in the page under a custom key so the pipeline can pick it up.
            page.AddResultItem("VideoResult", videos);
        }
Пример #10
0
        /// <summary>
        /// Downloads the page for <paramref name="id"/> and collects the text of its
        /// "div p" paragraphs that contain no nested markup.
        /// </summary>
        /// <param name="id">Page identifier inserted into the page URL.</param>
        /// <returns>The non-empty paragraph texts that contain no '&lt;' character.</returns>
        private List <string> GetHtmlContent(string id)
        {
            List <string>        quotes     = new List <string>();
            HttpClientDownloader downloader = new HttpClientDownloader();

            /*
             * Sample pages:
             * http://lizhi.yjbys.com/mingyan/373438.html
             * http://lizhi.yjbys.com/mingyan/373443.html
             * http://lizhi.yjbys.com/mingyan/221755.html
             */
            string url = "http://lizhi.yjbys.com/mingyan/" + id + ".html";

            Site site = new Site()
            {
                EncodingName = "UTF-8"
            };
            Java2Dotnet.Spider.Core.Spider spider = Java2Dotnet.Spider.Core.Spider
                                                    .Create(site, new SimplePageProcessor(url, url + "/* "))
                                                    .AddPipeline(new TestPipeline())
                                                    .SetThreadNum(1);
            Page page = downloader.Download(new Request(url, 2, new Dictionary <string, dynamic>()), spider);

            var paragraphs = Selectors.Css("div p").SelectList(page.Content);

            for (int index = 0; index < paragraphs.Count; index++)
            {
                // Capture everything between the opening and closing <p> tags.
                Match match = Regex.Match(paragraphs[index].OuterHtml, @"<p>([\s\S]*)</p>", RegexOptions.IgnoreCase);
                if (!match.Success)
                {
                    continue;
                }

                // Decode the em dash entity and drop non-breaking space entities.
                string text = match.Result("$1").Replace("&mdash;", "—").Replace("&nbsp;", "");

                // Keep only non-empty paragraphs that contain no further markup.
                if (text != "" && !text.Contains("<"))
                {
                    quotes.Add(text);
                }
            }

            return quotes;
        }
Пример #11
0
        /// <summary>
        /// Creates the concrete selector described by <paramref name="selector"/>.
        /// </summary>
        /// <param name="selector">Selector description; may be null.</param>
        /// <returns>
        /// The matching selector, or null when <paramref name="selector"/> is null or
        /// has a null or empty expression.
        /// </returns>
        /// <exception cref="SpiderException">The selector type is not handled.</exception>
        public static ISelector Parse(Selector selector)
        {
            string expression = selector?.Expression;
            if (string.IsNullOrEmpty(expression))
            {
                return null;
            }

            switch (selector.Type)
            {
                case SelectorType.Css:
                    return Selectors.Css(expression);

                case SelectorType.Enviroment:
                    return Selectors.Enviroment(expression);

                case SelectorType.JsonPath:
                    return Selectors.JsonPath(expression);

                case SelectorType.Regex:
                    return Selectors.Regex(expression);

                case SelectorType.XPath:
                    return Selectors.XPath(expression);

                default:
                    throw new SpiderException("Not support selector: " + selector);
            }
        }
Пример #12
0
 public void Css()
 {
     // Assert.Equal takes the expected value first; the arguments were previously
     // swapped, which produces misleading "Expected/Actual" failure output.
     Assert.Equal("<a href=\"xxx\">aabbcc</a>", Selectors.Css("div h1 a").Select(_html).OuterHtml);
     Assert.Equal("xxx", Selectors.Css("div h1 a", "href").Select(_html));
     Assert.Equal("aabbcc", Selectors.Css("div h1 a").Select(_html).InnerHtml);
 }
Пример #13
0
        /// <summary>
        /// Finds elements with a CSS selector and reads the given attribute from them.
        /// </summary>
        /// <param name="css">CSS selector expression.</param>
        /// <param name="attrName">Name of the attribute to read from the matched elements.</param>
        /// <returns>The selectable produced by running the CSS selector through SelectList.</returns>
        public override ISelectable Css(string css, string attrName)
        {
            return SelectList(Selectors.Css(css, attrName));
        }
Пример #14
0
 /// <summary>
 /// Finds results with a CSS selector.
 /// </summary>
 /// <param name="css">CSS selector expression.</param>
 /// <returns>The selectable produced by running the CSS selector through SelectList.</returns>
 public override ISelectable Css(string css)
 {
     var cssSelector = Selectors.Css(css);

     return SelectList(cssSelector);
 }
Пример #15
0
        /// <summary>
        /// Constructor: creates the model, registers one follow-request querier per
        /// follow-selector expression, and adds a required validator for every distinct
        /// follow-selector pattern.
        /// </summary>
        public DataParser()
        {
            Model = new Model <T>();

            var followSelectors = Model.FollowRequestSelectors;
            if (followSelectors == null)
            {
                // No follow selectors means no queriers and no patterns to validate.
                return;
            }

            // Patterns are gathered in a set so each one yields a single validator.
            var uniquePatterns = new HashSet <string>();

            foreach (var rule in followSelectors)
            {
                switch (rule.SelectorType)
                {
                    case SelectorType.Css:
                        foreach (var cssExpression in rule.Expressions)
                        {
                            AddFollowRequestQuerier(Selectors.Css(cssExpression));
                        }

                        break;

                    case SelectorType.Regex:
                        foreach (var regexExpression in rule.Expressions)
                        {
                            AddFollowRequestQuerier(Selectors.Regex(regexExpression));
                        }

                        break;

                    case SelectorType.XPath:
                        foreach (var xpathExpression in rule.Expressions)
                        {
                            AddFollowRequestQuerier(Selectors.XPath(xpathExpression));
                        }

                        break;

                    case SelectorType.JsonPath:
                        foreach (var jsonPathExpression in rule.Expressions)
                        {
                            AddFollowRequestQuerier(Selectors.JsonPath(jsonPathExpression));
                        }

                        break;

                    case SelectorType.Environment:
                        // Environment selectors register no querier; only a warning is logged.
                        Logger.LogWarning("SelectorType of follow selector is not supported");
                        break;
                }

                // Patterns are collected regardless of the selector type.
                foreach (var rulePattern in rule.Patterns)
                {
                    uniquePatterns.Add(rulePattern);
                }
            }

            foreach (var urlPattern in uniquePatterns)
            {
                // Each validator checks the request URI against one pattern.
                AddRequiredValidator(request => Regex.IsMatch(request.RequestUri.ToString(), urlPattern));
            }
        }