/// <summary>
/// Builds the selector matching the configured expression type.
/// </summary>
/// <returns>
/// A CSS, XPath, Regex or JsonPath selector for <c>expressionValue</c>;
/// any other expression type is treated as XPath.
/// </returns>
private ISelector CompileSelector()
{
    switch (expressionType)
    {
        case ExpressionType.Css:
            // The first extra parameter, when present, is passed as the second Css argument.
            if (expressionParams.Length > 0)
            {
                return Selectors.Css(expressionValue, expressionParams[0]);
            }

            return Selectors.Css(expressionValue);

        case ExpressionType.Regex:
            // The first extra parameter, when present, is parsed as an integer (regex group).
            if (expressionParams.Length > 0)
            {
                return Selectors.Regex(expressionValue, int.Parse(expressionParams[0]));
            }

            return Selectors.Regex(expressionValue);

        case ExpressionType.JsonPath:
            return new JsonPathSelector(expressionValue);

        case ExpressionType.XPath:
        default:
            return Selectors.XPath(expressionValue);
    }
}
/// <summary>
/// Converts a selector description into the concrete selector that performs the query.
/// </summary>
/// <param name="selector">Selector description; may be null.</param>
/// <returns>The concrete selector, or null when <paramref name="selector"/> is null.</returns>
/// <exception cref="SpiderException">
/// Thrown when a Regex selector carries a non-integer argument, or when the selector type is not supported.
/// </exception>
public static ISelector ToSelector(this Selector selector)
{
    if (selector == null)
    {
        return null;
    }

    string expression = selector.Expression;

    switch (selector.Type)
    {
        case SelectorType.Css:
            NotNullExpression(selector);
            return Selectors.Css(expression);

        case SelectorType.Enviroment:
            // Environment selectors are the only kind built without the NotNullExpression check.
            return Selectors.Enviroment(expression);

        case SelectorType.JsonPath:
            NotNullExpression(selector);
            return Selectors.JsonPath(expression);

        case SelectorType.Regex:
            NotNullExpression(selector);

            if (string.IsNullOrEmpty(selector.Arguments))
            {
                return Selectors.Regex(expression);
            }

            // A non-empty argument must be the integer index of the capture group to extract.
            int group;
            if (int.TryParse(selector.Arguments, out group))
            {
                return Selectors.Regex(expression, group);
            }

            throw new SpiderException("Regex argument should be a number set to group: " + selector);

        case SelectorType.XPath:
            NotNullExpression(selector);
            return Selectors.XPath(expression);

        default:
            // Message typo fixed: was "unsupoort".
            throw new SpiderException($"Selector {selector} unsupported");
    }
}
// Checks that a CSS selector yields the anchor text, and the attribute value when an attribute name is given.
public void Css()
{
    var anchorText = Selectors.Css("div h1 a").Select(html).Value;
    Assert.Equal("aabbcc", anchorText);

    var hrefValue = Selectors.Css("div h1 a", "href").Select(html).Value;
    Assert.Equal("xxx", hrefValue);

    // Same query as the first assertion, evaluated a second time.
    var anchorTextAgain = Selectors.Css("div h1 a").Select(html).Value;
    Assert.Equal("aabbcc", anchorTextAgain);
}
// Exercises each basic selector kind (CSS, CSS with attribute, XPath, Regex, Regex with group) against _html.
// Assert.AreEqual takes (expected, actual); the arguments were previously reversed,
// which produces misleading "expected/actual" text when an assertion fails.
public void TestEach()
{
    Assert.AreEqual("<a href=\"xxx\">aabbcc</a>", Selectors.Css("div h1 a").Select(_html).OuterHtml);
    Assert.AreEqual("xxx", Selectors.Css("div h1 a", "href").Select(_html));
    Assert.AreEqual("aabbcc", Selectors.Css("div h1 a").Select(_html).InnerHtml);
    Assert.AreEqual("xxx", Selectors.XPath("//a/@href").Select(_html));
    Assert.AreEqual("xxx", Selectors.Regex("a href=\"(.*)\"").Select(_html));
    Assert.AreEqual("xxx", Selectors.Regex("(a href)=\"(.*)\"", 2).Select(_html));
}
// Exercises the And / Or selector combinators.
// Assert.AreEqual takes (expected, actual); the arguments were previously reversed,
// which produces misleading "expected/actual" text when an assertion fails.
public void TestCombo()
{
    var value1 = Selectors.And(Selectors.Css("title"), Selectors.Regex("aa(bb)cc")).Select(_html2);
    Assert.AreEqual("bb", value1);

    var or = Selectors.Or(Selectors.Css("div h1 a", "innerHtml"), Selectors.XPath("//title"));
    Assert.AreEqual("aabbcc", or.Select(_html));
    Assert.AreEqual("aabbcc", or.Select(_html2));
}
/// <summary>
/// Builds the concrete selector described by <paramref name="selector"/>.
/// </summary>
/// <param name="selector">Selector description; may be null.</param>
/// <returns>
/// The concrete selector, or null when <paramref name="selector"/> is null or its expression is null or empty.
/// </returns>
/// <exception cref="SpiderException">
/// Thrown when a Regex selector carries a non-integer argument, or when the selector type is not handled.
/// </exception>
public static ISelector Parse(BaseSelector selector)
{
    if (string.IsNullOrEmpty(selector?.Expression))
    {
        return null;
    }

    var expression = selector.Expression;

    switch (selector.Type)
    {
        case SelectorType.Css:
            return Selectors.Css(expression);

        case SelectorType.Enviroment:
            return Selectors.Enviroment(expression);

        case SelectorType.JsonPath:
            return Selectors.JsonPath(expression);

        case SelectorType.XPath:
            return Selectors.XPath(expression);

        case SelectorType.Regex:
        {
            if (string.IsNullOrEmpty(selector.Argument))
            {
                return Selectors.Regex(expression);
            }

            // A non-empty argument must be the integer index of the capture group.
            int groupIndex;
            if (!int.TryParse(selector.Argument, out groupIndex))
            {
                throw new SpiderException("Regex argument should be a number set to group: " + selector);
            }

            return Selectors.Regex(expression, groupIndex);
        }

        default:
            throw new SpiderException("Not support selector: " + selector);
    }
}
/// <summary>
/// Converts a selector attribute into the concrete selector that performs the query.
/// </summary>
/// <param name="selector">Selector attribute; may be null.</param>
/// <returns>The concrete selector, or null when <paramref name="selector"/> is null.</returns>
/// <exception cref="ArgumentException">Thrown when a Regex selector carries a non-integer argument.</exception>
/// <exception cref="NotSupportedException">Thrown when the selector type is not handled.</exception>
public static ISelector ToSelector(this Attribute.Selector selector)
{
    if (selector == null)
    {
        return null;
    }

    string expression = selector.Expression;

    switch (selector.Type)
    {
        case SelectorType.Css:
            NotNullExpression(selector);
            return Selectors.Css(expression);

        case SelectorType.JsonPath:
            NotNullExpression(selector);
            return Selectors.JsonPath(expression);

        case SelectorType.XPath:
            NotNullExpression(selector);
            return Selectors.XPath(expression);

        case SelectorType.Regex:
            NotNullExpression(selector);

            if (string.IsNullOrEmpty(selector.Arguments))
            {
                return Selectors.Regex(expression);
            }

            // A non-empty argument must be the integer index of the capture group.
            if (!int.TryParse(selector.Arguments, out var groupIndex))
            {
                throw new ArgumentException($"Regex argument should be a number set to group: {selector}");
            }

            return Selectors.Regex(expression, groupIndex);

        default:
            throw new NotSupportedException($"{selector} unsupported");
    }
}
/// <summary>
/// Converts a selector description into the concrete selector that performs the query.
/// </summary>
/// <param name="selector">Selector description; may be null.</param>
/// <returns>The concrete selector, or null when <paramref name="selector"/> is null.</returns>
/// <exception cref="ArgumentException">
/// Thrown when a Regex selector has non-empty arguments that do not consist of a
/// <see cref="RegexOptions"/> value followed by a replacement, separated by a comma.
/// </exception>
/// <exception cref="NotSupportedException">Thrown when the selector type is not handled.</exception>
public static ISelector ToSelector(this Selector selector)
{
    if (selector == null)
    {
        return null;
    }

    var expression = selector.Expression;

    switch (selector.Type)
    {
        case SelectorType.Css:
            NotNullExpression(selector);
            return Selectors.Css(expression);

        case SelectorType.JsonPath:
            NotNullExpression(selector);
            return Selectors.JsonPath(expression);

        case SelectorType.Regex:
        {
            NotNullExpression(selector);

            if (string.IsNullOrEmpty(selector.Arguments))
            {
                return Selectors.Regex(expression);
            }

            // Arguments are "<RegexOptions>,<replacement>".
            var arguments = selector.Arguments.Split(new[] { ',' }, StringSplitOptions.RemoveEmptyEntries);

            // Previously a value such as "IgnoreCase" or "," surfaced as an
            // IndexOutOfRangeException from the indexers below; report it as a bad argument instead.
            if (arguments.Length < 2)
            {
                throw new ArgumentException(
                    $"Regex arguments should be a RegexOptions value and a replacement separated by a comma: {selector}");
            }

            var options = (RegexOptions)Enum.Parse(typeof(RegexOptions), arguments[0]);
            var replacement = arguments[1];
            return Selectors.Regex(expression, options, replacement);
        }

        case SelectorType.XPath:
            NotNullExpression(selector);
            return Selectors.XPath(expression);

        default:
            throw new NotSupportedException($"{selector} unsupported");
    }
}
/// <summary>
/// Builds one <c>VideoContent</c> per element matching <c>.yk-col4</c> and stores the
/// resulting list on the page under the key "VideoResult".
/// </summary>
/// <param name="page">Page whose selectable content is queried and which receives the result item.</param>
protected override void Handle(Page page)
{
    var videos = new List<VideoContent>();
    var cards = page.Selectable.SelectList(Selectors.Css(".yk-col4")).Nodes();

    foreach (var card in cards)
    {
        // Title: inner text of the first title link; hits: inner text of the last info-list item.
        var title = card.Css(".info-list .title a").Nodes().FirstOrDefault()?.GetValue(ValueOption.InnerText);
        var hits = card.Css(".info-list li").Nodes().LastOrDefault()?.GetValue(ValueOption.InnerText);
        var href = card.Css(".info-list .title a").XPath("@href").GetValue();
        var imgHref = card.Css(".p-thumb img").XPath("@src").GetValue();

        videos.Add(new VideoContent
        {
            Title = title,
            Hits = hits,
            Href = href,
            ImgHref = imgHref
        });
    }

    // Stored under a custom key on the page object so a pipeline can pick it up.
    page.AddResultItem("VideoResult", videos);
}
/// <summary>
/// Downloads <c>http://lizhi.yjbys.com/mingyan/{id}.html</c> and collects the text of its
/// <c>div p</c> paragraphs that contain no nested markup.
/// </summary>
/// <param name="id">Numeric page identifier inserted into the URL, e.g. 373438, 373443 or 221755.</param>
/// <returns>The non-empty paragraph texts that do not contain a '&lt;' character.</returns>
private List<string> GetHtmlContent(string id)
{
    var list = new List<string>();
    var downloader = new HttpClientDownloader();

    string url = "http://lizhi.yjbys.com/mingyan/" + id + ".html";

    Java2Dotnet.Spider.Core.Spider spider = Java2Dotnet.Spider.Core.Spider
        .Create(new Site { EncodingName = "UTF-8" }, new SimplePageProcessor(url, url + "/* "))
        .AddPipeline(new TestPipeline())
        .SetThreadNum(1);

    Page p = downloader.Download(new Request(url, 2, new Dictionary<string, dynamic>()), spider);

    var paragraphs = Selectors.Css("div p").SelectList(p.Content);

    for (int index = 0; index < paragraphs.Count; index++)
    {
        // Capture everything between the opening and closing <p> tags.
        Match m = Regex.Match(paragraphs[index].OuterHtml, @"<p>([\s\S]*)</p>", RegexOptions.IgnoreCase);
        if (!m.Success)
        {
            continue;
        }

        string result = m.Result("$1").Replace("—", "—").Replace(" ", "");

        // Keep only non-empty text with no remaining tag characters.
        if (result != "" && !result.Contains("<"))
        {
            list.Add(result);
        }
    }

    return list;
}
/// <summary>
/// Builds the concrete selector described by <paramref name="selector"/>.
/// </summary>
/// <param name="selector">Selector description; may be null.</param>
/// <returns>
/// The concrete selector, or null when <paramref name="selector"/> is null or its expression is null or empty.
/// </returns>
/// <exception cref="SpiderException">Thrown when the selector type is not handled.</exception>
public static ISelector Parse(Selector selector)
{
    if (string.IsNullOrEmpty(selector?.Expression))
    {
        return null;
    }

    var expression = selector.Expression;

    switch (selector.Type)
    {
        case SelectorType.Css:
            return Selectors.Css(expression);

        case SelectorType.Enviroment:
            return Selectors.Enviroment(expression);

        case SelectorType.JsonPath:
            return Selectors.JsonPath(expression);

        case SelectorType.Regex:
            return Selectors.Regex(expression);

        case SelectorType.XPath:
            return Selectors.XPath(expression);

        default:
            throw new SpiderException("Not support selector: " + selector);
    }
}
// Checks the outer HTML, attribute value and inner HTML produced by a CSS selector.
// Assert.Equal takes (expected, actual); the arguments were previously reversed,
// which produces misleading "Expected/Actual" text when an assertion fails.
public void Css()
{
    Assert.Equal("<a href=\"xxx\">aabbcc</a>", Selectors.Css("div h1 a").Select(_html).OuterHtml);
    Assert.Equal("xxx", Selectors.Css("div h1 a", "href").Select(_html));
    Assert.Equal("aabbcc", Selectors.Css("div h1 a").Select(_html).InnerHtml);
}
/// <summary>
/// Finds elements with a CSS selector and takes the value of the given attribute.
/// </summary>
/// <param name="css">CSS selector.</param>
/// <param name="attrName">Name of the attribute to read from the matched elements.</param>
/// <returns>The selectable produced by the query.</returns>
public override ISelectable Css(string css, string attrName)
{
    return SelectList(Selectors.Css(css, attrName));
}
/// <summary>
/// Finds results with a CSS selector.
/// </summary>
/// <param name="css">CSS selector.</param>
/// <returns>The selectable produced by the query.</returns>
public override ISelectable Css(string css)
{
    var cssSelector = Selectors.Css(css);
    return SelectList(cssSelector);
}
/// <summary>
/// Constructor. Creates the model for <c>T</c>, registers one follow-request querier per
/// follow-selector expression, and adds a required validator for each distinct URL pattern.
/// </summary>
public DataParser()
{
    Model = new Model<T>();

    // Patterns are de-duplicated across all follow selectors before validators are added.
    var uniquePatterns = new HashSet<string>();

    if (Model.FollowRequestSelectors != null)
    {
        foreach (var follow in Model.FollowRequestSelectors)
        {
            switch (follow.SelectorType)
            {
                case SelectorType.Css:
                    foreach (var expr in follow.Expressions)
                    {
                        AddFollowRequestQuerier(Selectors.Css(expr));
                    }

                    break;

                case SelectorType.Regex:
                    foreach (var expr in follow.Expressions)
                    {
                        AddFollowRequestQuerier(Selectors.Regex(expr));
                    }

                    break;

                case SelectorType.XPath:
                    foreach (var expr in follow.Expressions)
                    {
                        AddFollowRequestQuerier(Selectors.XPath(expr));
                    }

                    break;

                case SelectorType.JsonPath:
                    foreach (var expr in follow.Expressions)
                    {
                        AddFollowRequestQuerier(Selectors.JsonPath(expr));
                    }

                    break;

                case SelectorType.Environment:
                    // No querier is registered for this type; only a warning is logged.
                    Logger.LogWarning("SelectorType of follow selector is not supported");
                    break;
            }

            // Patterns are collected regardless of the selector type.
            foreach (var followPattern in follow.Patterns)
            {
                uniquePatterns.Add(followPattern);
            }
        }
    }

    foreach (var pattern in uniquePatterns)
    {
        AddRequiredValidator(request => Regex.IsMatch(request.RequestUri.ToString(), pattern));
    }
}