/// <summary>
/// Checks the number of nodes returned by <c>HtmlAgilityHelper.Nodes</c> for the span tag and for
/// <c>AllStrings.asterisk</c>, with the recursive flag both on and off.
/// </summary>
public void NodesTest()
{
    // Enable the non-recursive assertions below; the flag is a member declared outside this method.
    noRecursive = true;

    // Recursive search for spans, starting at the document node.
    List<HtmlNode> spansInDocument = HtmlAgilityHelper.Nodes(documentNode, true, HtmlTags.span);
    Assert.Equal(5, spansInDocument.Count);

    // Non-recursive search for spans, starting at the body node.
    if (noRecursive)
    {
        List<HtmlNode> spansDirectlyInBody = HtmlAgilityHelper.Nodes(bodyNode, false, HtmlTags.span);
        Assert.Equal(2, spansDirectlyInBody.Count);
    }

    // Recursive search with the asterisk tag (presumably "any tag" - confirm in HtmlAgilityHelper).
    List<HtmlNode> anyTagInBody = HtmlAgilityHelper.Nodes(bodyNode, true, AllStrings.asterisk);
    Assert.Equal(10, anyTagInBody.Count);

    // Non-recursive search with the asterisk tag.
    if (noRecursive)
    {
        List<HtmlNode> anyTagDirectlyInBody = HtmlAgilityHelper.Nodes(bodyNode, false, AllStrings.asterisk);
        Assert.Equal(7, anyTagDirectlyInBody.Count);
    }
}
/// <summary>
/// Checks which tag / attribute / value combinations make
/// <c>HtmlAgilityHelper.NodeWithAttr</c> return a node and which make it return null,
/// first recursively from the document node and then non-recursively from the body node.
/// </summary>
public void NodeWithAttrTest()
{
    // Recursively
    HtmlNode spanWithC = HtmlAgilityHelper.NodeWithAttr(documentNode, true, HtmlTags.span, HtmlAttrs.c, cssClassC);
    Assert.NotNull(spanWithC);

    // exists but "a b", so no match is expected for cssClassA
    HtmlNode spanWithA = HtmlAgilityHelper.NodeWithAttr(documentNode, true, HtmlTags.span, HtmlAttrs.c, cssClassA);
    Assert.Null(spanWithA);

    HtmlNode anyTagWithA = HtmlAgilityHelper.NodeWithAttr(documentNode, true, AllStrings.asterisk, HtmlAttrs.cAttr, cssClassA);
    Assert.Null(anyTagWithA);

    HtmlNode imgWithA = HtmlAgilityHelper.NodeWithAttr(documentNode, true, HtmlTags.img, HtmlAttrs.cAttr, cssClassA);
    Assert.Null(imgWithA);

    // The remaining assertions only apply when the non-recursive mode is under test.
    if (!noRecursive)
    {
        return;
    }

    // Non-recursively
    HtmlNode directSpanWithC = HtmlAgilityHelper.NodeWithAttr(bodyNode, false, HtmlTags.span, HtmlAttrs.c, cssClassC);
    Assert.NotNull(directSpanWithC);

    // exists but "a b", so no match is expected for cssClassA
    HtmlNode directSpanWithA = HtmlAgilityHelper.NodeWithAttr(bodyNode, false, HtmlTags.span, HtmlAttrs.c, cssClassA);
    Assert.Null(directSpanWithA);

    HtmlNode directAnyTagWithC = HtmlAgilityHelper.NodeWithAttr(bodyNode, false, AllStrings.asterisk, HtmlAttrs.c, cssClassC);
    Assert.NotNull(directAnyTagWithC);
}
/// <summary>
/// Creates a document through <c>HtmlAgilityHelper.CreateHtmlDocument</c>, loads <c>testFile</c>
/// into it and stores the document's <c>DocumentNode</c> in the <c>hd</c> member.
/// </summary>
void GetHtmlDocumentTestFile()
{
    // The local gets its own name so it does not shadow the hd member assigned below.
    HtmlDocument loadedDocument = HtmlAgilityHelper.CreateHtmlDocument();
    loadedDocument.Load(testFile);

    this.hd = loadedDocument.DocumentNode;
}
/// <summary>
/// Checks when <c>HtmlAgilityHelper.Node</c> returns a node and when it returns null for the span
/// tag, the img tag and <c>AllStrings.asterisk</c>, first recursively from the document node and
/// then non-recursively from the body node.
/// </summary>
public void NodeTest()
{
    // Recursively
    HtmlNode span = HtmlAgilityHelper.Node(documentNode, true, HtmlTags.span);
    Assert.NotNull(span);

    HtmlNode img = HtmlAgilityHelper.Node(documentNode, true, HtmlTags.img);
    Assert.Null(img);

    HtmlNode anyTag = HtmlAgilityHelper.Node(documentNode, true, AllStrings.asterisk);
    Assert.NotNull(anyTag);

    // The remaining assertions only apply when the non-recursive mode is under test.
    if (!noRecursive)
    {
        return;
    }

    // Non-recursively
    HtmlNode directSpan = HtmlAgilityHelper.Node(bodyNode, false, HtmlTags.span);
    Assert.NotNull(directSpan);

    HtmlNode directImg = HtmlAgilityHelper.Node(bodyNode, false, HtmlTags.img);
    Assert.Null(directImg);

    HtmlNode directAnyTag = HtmlAgilityHelper.Node(bodyNode, false, AllStrings.asterisk);
    Assert.NotNull(directAnyTag);
}
/// <summary>
/// Lazily yields the absolute URLs of all <c>&lt;a href&gt;</c> links in <paramref name="content"/>
/// that match at least one of <paramref name="urlPatterns"/>.
/// </summary>
/// <param name="content">HTML text handed to <c>HtmlAgilityHelper.ParseHtmlDocument</c>.</param>
/// <param name="baseUrl">Absolute URL against which each href is resolved.</param>
/// <param name="urlPatterns">Regular expression patterns tested against each resolved absolute URL.</param>
/// <returns>
/// Each matching absolute URL, once per link (a URL is yielded again if several links point to it).
/// Nothing is yielded when the document contains no <c>&lt;a href&gt;</c> element.
/// </returns>
private static IEnumerable<string> GetUrlsFromHtmlByPatterns(string content, string baseUrl, params string[] urlPatterns)
{
    HtmlNode rootNode = HtmlAgilityHelper.ParseHtmlDocument(content);
    var aLinkNodes = rootNode.SelectNodes(".//a[@href]");

    // Compile the patterns up front so an invalid pattern is reported regardless of the page content.
    var hrefRegexs = new List<Regex>(urlPatterns.Length);
    foreach (string urlPattern in urlPatterns)
    {
        hrefRegexs.Add(new Regex(urlPattern, RegexOptions.Compiled));
    }

    // SelectNodes hands back null (not an empty collection) when the XPath matches nothing,
    // so a page without links must be handled before enumerating.
    if (aLinkNodes == null)
    {
        yield break;
    }

    // Created on first use and then reused, instead of re-parsing baseUrl for every link.
    Uri baseUri = null;

    foreach (HtmlNode aLinkNode in aLinkNodes)
    {
        string href = aLinkNode.Attributes["href"].Value;
        if (string.IsNullOrEmpty(href))
        {
            continue;
        }

        if (baseUri == null)
        {
            baseUri = new Uri(baseUrl);
        }

        Uri result;
        if (!Uri.TryCreate(baseUri, href, out result))
        {
            // The href cannot be resolved against the base URL; skip it.
            continue;
        }

        href = result.AbsoluteUri;
        foreach (Regex hrefRegex in hrefRegexs)
        {
            if (hrefRegex.IsMatch(href))
            {
                yield return href;
                break; // one match is enough; do not yield the same link twice
            }
        }
    }
}
/// <summary>
/// Loads <c>testFile</c> into a document created by <c>HtmlAgilityHelper.CreateHtmlDocument</c>,
/// then stores its <c>DocumentNode</c> in <c>documentNode</c> and the result of
/// <c>HtmlHelper.ReturnTagRek</c> for the body tag in <c>bodyNode</c>.
/// </summary>
void GetHtmlDocumentTestFile()
{
    HtmlDocument loadedDocument = HtmlAgilityHelper.CreateHtmlDocument();
    loadedDocument.Load(testFile);

    this.documentNode = loadedDocument.DocumentNode;

    // The body lookup starts from the document node assigned just above.
    this.bodyNode = HtmlHelper.ReturnTagRek(this.documentNode, HtmlTags.body);
}
/// <summary>
/// Checks that <c>HtmlAgilityHelper.ReplacePlainUriForAnchors</c> wraps a bare URI in an anchor
/// while leaving a URI that is already inside an anchor unchanged.
/// </summary>
public void ReplacePlainUriForAnchors()
{
    // One plain URI plus one URI that is already wrapped in an anchor.
    const string input = "I tried https://www.nuget.org/p/ because <a href=\"http://jepsano.net/\">http://jepsano.net/</a> another text";

    const string expected = "I tried <a href=\"https://www.nuget.org/p/\">https://www.nuget.org/p/</a> because <a href=\"http://jepsano.net/\">http://jepsano.net/</a> another text";

    string replaced = HtmlAgilityHelper.ReplacePlainUriForAnchors(input);

    Assert.Equal(expected, replaced);
}
/// <summary>
/// Fetches the HTML document at <paramref name="url"/> and passes it to
/// <c>ScrapeArticleList</c>.
/// </summary>
/// <param name="url">Address to fetch; also stored as <c>RequestUrl</c> on the result.</param>
/// <param name="includeAbstract">Forwarded unchanged to <c>ScrapeArticleList</c>.</param>
/// <param name="cancellationToken">Forwarded to <c>HtmlAgilityHelper.GetHtmlDocument</c>.</param>
/// <returns>
/// The value produced by <c>ScrapeArticleList</c>, or a result carrying only the request URL
/// when no document was obtained.
/// </returns>
public async Task<ScrapeResultDto<ArticleItemDto>> GetArticles(string url, bool includeAbstract, CancellationToken cancellationToken)
{
    ScrapeResultDto<ArticleItemDto> result = new ScrapeResultDto<ArticleItemDto>();
    result.RequestUrl = url;

    HtmlDocument htmlDocument = await HtmlAgilityHelper.GetHtmlDocument(url, cancellationToken);

    // Without a document there is nothing to scrape; return the bare result.
    if (htmlDocument == null)
    {
        return result;
    }

    return ScrapeArticleList(result, htmlDocument, includeAbstract);
}
/// <summary>
/// Loads an HTML file from a fixed local path, locates a <c>table</c> node in it and reads the
/// values of column "1" through <c>HtmlTableParser</c>. The method contains no assertions.
/// </summary>
public void HtmlTableParserTest()
{
    // NOTE(review): absolute, machine-specific path - this only runs where the file exists.
    var htmlPath = @"D:\_Test\sunamo\sunamo\Html\HtmlTableParserTests\a.html";

    var document = HtmlAgilityHelper.CreateHtmlDocument();
    var html = TF.ReadFile(htmlPath);
    document.LoadHtml(html);

    var tableNode = HtmlAgilityHelper.Node(document.DocumentNode, true, "table");
    HtmlTableParser parser = new HtmlTableParser(tableNode, false);

    // The result is not checked by this method.
    var columnValues = parser.ColumnValues("1", false, false);

    // Unused; presumably a convenient spot for a debugger breakpoint.
    int i = 0;
}
/// <summary>
/// Fetches the HTML document at <paramref name="url"/> and returns an <c>ArticleItemDto</c> for it.
/// </summary>
/// <remarks>
/// Mapping the document onto the DTO is not implemented yet, so the returned DTO is always
/// the freshly constructed instance.
/// </remarks>
/// <param name="url">Address of the article page to fetch.</param>
/// <param name="cancellationToken">Forwarded to <c>HtmlAgilityHelper.GetHtmlDocument</c>.</param>
/// <returns>A new <c>ArticleItemDto</c>; never null.</returns>
public async Task<ArticleItemDto> GetArticle(string url, CancellationToken cancellationToken)
{
    ArticleItemDto dto = new ArticleItemDto();
    HtmlDocument doc = await HtmlAgilityHelper.GetHtmlDocument(url, cancellationToken);

    // NOTE(review): GetHtmlDocument appears able to return null when no document is obtained -
    // confirm against the helper. Dereferencing it unchecked would throw NullReferenceException.
    if (doc == null)
    {
        return dto;
    }

    // TODO: populate dto from doc.
    return dto;
}
/// <summary>
/// Loads a .csproj file from a fixed local path, removes every node returned by
/// <c>HtmlAgilityHelper.NodesWithAttrWildCard</c> for tag "Compile", attribute
/// <c>Consts.Include</c> and value pattern "*.cs", then saves the document under the file name
/// "output.csproj". The method contains no assertions.
/// </summary>
public void XmlAgilityDocumentTest()
{
    // NOTE(review): absolute, machine-specific path - this only runs where the file exists.
    var inputPath = @"D:\_Test\sunamo\sunamo\Xml\XmlAgilityDocumentTests\input.csproj";

    XmlAgilityDocument document = new XmlAgilityDocument();
    document.Load(inputPath);

    var matchingNodes = HtmlAgilityHelper.NodesWithAttrWildCard(
        document.hd.DocumentNode, true, "Compile", Consts.Include, "*.cs", true);

    foreach (var matchingNode in matchingNodes)
    {
        matchingNode.Remove();
    }

    // Point the document at a different file name so Save() does not write to the input path.
    document.path = FS.ChangeFilename(document.path, "output.csproj", false);
    document.Save();
}
/// <summary>
/// Checks the number of nodes returned by <c>HtmlAgilityHelper.NodesWithAttr</c> for several
/// tag / attribute value combinations, with and without the trailing boolean argument, searching
/// both recursively and non-recursively.
/// </summary>
public void NodesWithAttrTest()
{
    // Spans with cssClassC - recursively from the document node.
    List<HtmlNode> spansWithC = HtmlAgilityHelper.NodesWithAttr(documentNode, true, HtmlTags.span, HtmlAttrs.c, cssClassC);
    Assert.Equal(3, spansWithC.Count);

    // Spans with cssClassC - non-recursively from the body node.
    if (noRecursive)
    {
        List<HtmlNode> directSpansWithC = HtmlAgilityHelper.NodesWithAttr(bodyNode, false, HtmlTags.span, HtmlAttrs.c, cssClassC);
        Assert.Equal(1, directSpansWithC.Count);
    }

    // Asterisk tag with cssClassC - recursively.
    List<HtmlNode> anyTagWithC = HtmlAgilityHelper.NodesWithAttr(bodyNode, true, AllStrings.asterisk, HtmlAttrs.c, cssClassC);
    Assert.Equal(4, anyTagWithC.Count);

    // Asterisk tag with cssClassC - non-recursively.
    if (noRecursive)
    {
        List<HtmlNode> directAnyTagWithC = HtmlAgilityHelper.NodesWithAttr(bodyNode, false, AllStrings.asterisk, HtmlAttrs.c, cssClassC);
        Assert.Equal(2, directAnyTagWithC.Count);
    }

    // Asterisk tag with cssClassA and the trailing flag set - recursively.
    // NOTE(review): the meaning of the trailing 'true' is defined in HtmlAgilityHelper - confirm there.
    List<HtmlNode> anyTagWithAFlagged = HtmlAgilityHelper.NodesWithAttr(bodyNode, true, AllStrings.asterisk, HtmlAttrs.c, cssClassA, true);
    Assert.Equal(3, anyTagWithAFlagged.Count);

    // Asterisk tag with cssClassC and the trailing flag set - non-recursively.
    if (noRecursive)
    {
        List<HtmlNode> directAnyTagWithCFlagged = HtmlAgilityHelper.NodesWithAttr(bodyNode, false, AllStrings.asterisk, HtmlAttrs.c, cssClassC, true);
        Assert.Equal(2, directAnyTagWithCFlagged.Count);
    }

    // Asterisk tag with asterisk value and the trailing flag set - recursively.
    List<HtmlNode> anyTagAnyValue = HtmlAgilityHelper.NodesWithAttr(bodyNode, true, AllStrings.asterisk, HtmlAttrs.c, AllStrings.asterisk, true);
    Assert.Equal(10, anyTagAnyValue.Count);

    // Asterisk tag with asterisk value and the trailing flag set - non-recursively.
    if (noRecursive)
    {
        List<HtmlNode> directAnyTagAnyValue = HtmlAgilityHelper.NodesWithAttr(bodyNode, false, AllStrings.asterisk, HtmlAttrs.c, AllStrings.asterisk, true);
        Assert.Equal(7, directAnyTagAnyValue.Count);
    }
}
/// <summary>
/// Crawls in rounds: each round drains the pending URL queue, downloads and parses the URLs in
/// parallel, queues newly discovered URLs for the next round, and then yields the entities
/// extracted in that round. Stops when a round leaves the queue empty.
/// </summary>
/// <returns>
/// The extracted entities, each with its "Url" value set to the page it came from. Entities are
/// yielded lazily, one round at a time.
/// </returns>
protected override IEnumerable<Entity> GetEntities()
{
    // Serializes the check-then-add on VisitedUrls. Without it, two workers can both see a URL
    // as unvisited and queue it twice, and a non-thread-safe set could be corrupted by
    // concurrent Add calls.
    object visitedGate = new object();

    while (!_NextUrlQueue.IsEmpty)
    {
        string[] urls = _NextUrlQueue.ToArray();
        StringUtil.Shuffle(urls); // Make the crawler more like BFS

        // Start a fresh queue for the next round; workers below enqueue into this new instance.
        _NextUrlQueue = new ConcurrentQueue<string>();
        ConcurrentQueue<Entity> resultEntities = new ConcurrentQueue<Entity>();

        Parallel.ForEach(urls, new ParallelOptions() { MaxDegreeOfParallelism = MaxParallelCount }, (url) =>
        {
            string content = HttpRequestHelper.GetContentFromHttpUrl(url);

            foreach (Entity parsedEntity in HtmlAgilityHelper.GetEntitiesFromContent(content, EntityModel))
            {
                parsedEntity.SetValue("Url", url);
                resultEntities.Enqueue(parsedEntity);
            }

            foreach (string nextUrl in DiscoverCrawlerUrls(content, url))
            {
                bool isNewUrl;
                lock (visitedGate)
                {
                    isNewUrl = !VisitedUrls.Contains(nextUrl);
                    if (isNewUrl)
                    {
                        VisitedUrls.Add(nextUrl);
                    }
                }

                // The queue itself is concurrent, so enqueueing does not need the lock.
                if (isNewUrl)
                {
                    _NextUrlQueue.Enqueue(nextUrl);
                }
            }
        });

        foreach (var roundEntity in resultEntities)
        {
            yield return roundEntity;
        }
    }
}
/// <summary>
/// Extracts entities from <paramref name="content"/> by delegating to
/// <c>HtmlAgilityHelper.GetEntitiesFromContent</c> with this instance's <c>Model</c>.
/// </summary>
/// <param name="content">Text handed to the helper unchanged.</param>
/// <returns>Whatever sequence the helper returns.</returns>
private IEnumerable<Entity> Parse(string content)
{
    IEnumerable<Entity> parsedEntities = HtmlAgilityHelper.GetEntitiesFromContent(content, Model);
    return parsedEntities;
}