Пример #1
0
        /// <summary>
        /// Verifies the number of nodes returned by <c>HtmlAgilityHelper.Nodes</c> for the span tag
        /// and for the asterisk wildcard, both recursively and non-recursively.
        /// </summary>
        public void NodesTest()
        {
            // Turns on the non-recursive assertions guarded by this flag below.
            noRecursive = true;

            // span, recursive search starting at documentNode
            List<HtmlNode> spans = HtmlAgilityHelper.Nodes(documentNode, true, HtmlTags.span);
            Assert.Equal(5, spans.Count);

            // span, non-recursive search starting at bodyNode
            if (noRecursive)
            {
                spans = HtmlAgilityHelper.Nodes(bodyNode, false, HtmlTags.span);
                Assert.Equal(2, spans.Count);
            }

            // asterisk wildcard, recursive search starting at bodyNode
            List<HtmlNode> anyTag = HtmlAgilityHelper.Nodes(bodyNode, true, AllStrings.asterisk);
            Assert.Equal(10, anyTag.Count);

            // asterisk wildcard, non-recursive search starting at bodyNode
            if (noRecursive)
            {
                anyTag = HtmlAgilityHelper.Nodes(bodyNode, false, AllStrings.asterisk);
                Assert.Equal(7, anyTag.Count);
            }
        }
Пример #2
0
        /// <summary>
        /// Verifies which tag/attribute/value combinations make
        /// <c>HtmlAgilityHelper.NodeWithAttr</c> return a node and which make it return null.
        /// </summary>
        public void NodeWithAttrTest()
        {
            // --- Recursive lookups starting at documentNode ---
            HtmlNode match = HtmlAgilityHelper.NodeWithAttr(documentNode, true, HtmlTags.span, HtmlAttrs.c, cssClassC);
            Assert.NotNull(match);

            // Per the original note: the value exists, but only as "a b", so no node is expected.
            match = HtmlAgilityHelper.NodeWithAttr(documentNode, true, HtmlTags.span, HtmlAttrs.c, cssClassA);
            Assert.Null(match);

            match = HtmlAgilityHelper.NodeWithAttr(documentNode, true, AllStrings.asterisk, HtmlAttrs.cAttr, cssClassA);
            Assert.Null(match);

            match = HtmlAgilityHelper.NodeWithAttr(documentNode, true, HtmlTags.img, HtmlAttrs.cAttr, cssClassA);
            Assert.Null(match);

            // --- Non-recursive lookups starting at bodyNode (only when the flag is set) ---
            if (!noRecursive)
            {
                return;
            }

            match = HtmlAgilityHelper.NodeWithAttr(bodyNode, false, HtmlTags.span, HtmlAttrs.c, cssClassC);
            Assert.NotNull(match);

            // Per the original note: the value exists, but only as "a b", so no node is expected.
            match = HtmlAgilityHelper.NodeWithAttr(bodyNode, false, HtmlTags.span, HtmlAttrs.c, cssClassA);
            Assert.Null(match);

            match = HtmlAgilityHelper.NodeWithAttr(bodyNode, false, AllStrings.asterisk, HtmlAttrs.c, cssClassC);
            Assert.NotNull(match);
        }
Пример #3
0
        /// <summary>
        /// Loads <c>testFile</c> into a document created by <c>HtmlAgilityHelper.CreateHtmlDocument</c>
        /// and stores its <c>DocumentNode</c> in the <c>hd</c> member.
        /// </summary>
        void GetHtmlDocumentTestFile()
        {
            // Named differently from the hd member so the two are easy to tell apart.
            HtmlDocument document = HtmlAgilityHelper.CreateHtmlDocument();
            document.Load(testFile);

            this.hd = document.DocumentNode;
        }
Пример #4
0
        /// <summary>
        /// Verifies that <c>HtmlAgilityHelper.Node</c> returns a node for span and for the asterisk
        /// wildcard, and null for img, both recursively and non-recursively.
        /// </summary>
        public void NodeTest()
        {
            // --- Recursive lookups starting at documentNode ---
            HtmlNode found = HtmlAgilityHelper.Node(documentNode, true, HtmlTags.span);
            Assert.NotNull(found);

            found = HtmlAgilityHelper.Node(documentNode, true, HtmlTags.img);
            Assert.Null(found);

            found = HtmlAgilityHelper.Node(documentNode, true, AllStrings.asterisk);
            Assert.NotNull(found);

            // --- Non-recursive lookups starting at bodyNode (only when the flag is set) ---
            if (!noRecursive)
            {
                return;
            }

            found = HtmlAgilityHelper.Node(bodyNode, false, HtmlTags.span);
            Assert.NotNull(found);

            found = HtmlAgilityHelper.Node(bodyNode, false, HtmlTags.img);
            Assert.Null(found);

            found = HtmlAgilityHelper.Node(bodyNode, false, AllStrings.asterisk);
            Assert.NotNull(found);
        }
        /// <summary>
        /// Lazily yields the absolute URL of every anchor in <paramref name="content"/> whose href,
        /// once resolved against <paramref name="baseUrl"/>, matches at least one of
        /// <paramref name="urlPatterns"/>. Each matching anchor is yielded once, even if several
        /// patterns match it.
        /// </summary>
        /// <param name="content">HTML markup handed to <c>HtmlAgilityHelper.ParseHtmlDocument</c>.</param>
        /// <param name="baseUrl">Absolute URL used as the base when resolving each href.</param>
        /// <param name="urlPatterns">Regular-expression patterns tested against the absolute URL.</param>
        /// <returns>The matching absolute URLs, in the order the anchors are returned by the XPath query.</returns>
        private static IEnumerable <string> GetUrlsFromHtmlByPatterns(string content, string baseUrl, params string[] urlPatterns)
        {
            HtmlNode rootNode   = HtmlAgilityHelper.ParseHtmlDocument(content);
            var      aLinkNodes = rootNode.SelectNodes(".//a[@href]");
            var      hrefRegexs = new List <Regex>(urlPatterns.Length);

            foreach (string urlPattern in urlPatterns)
            {
                hrefRegexs.Add(new Regex(urlPattern, RegexOptions.Compiled));
            }

            // HtmlAgilityPack's SelectNodes returns null (not an empty collection) when nothing
            // matches, so a page without any <a href> must end the sequence instead of throwing.
            if (aLinkNodes == null)
            {
                yield break;
            }

            // Built on first use so an invalid baseUrl still only throws when a link is actually resolved.
            Uri baseUri = null;

            foreach (HtmlNode aLinkNode in aLinkNodes)
            {
                string href = aLinkNode.Attributes["href"].Value;
                if (string.IsNullOrEmpty(href))
                {
                    continue;
                }

                if (baseUri == null)
                {
                    baseUri = new Uri(baseUrl);
                }

                Uri result;
                if (!Uri.TryCreate(baseUri, href, out result))
                {
                    continue;
                }

                href = result.AbsoluteUri;
                foreach (Regex hrefRegex in hrefRegexs)
                {
                    if (hrefRegex.IsMatch(href))
                    {
                        yield return(href);

                        // One match is enough; do not yield the same link for further patterns.
                        break;
                    }
                }
            }
        }
Пример #6
0
        /// <summary>
        /// Loads <c>testFile</c> into a new document and caches two nodes for the tests:
        /// the document node, and the result of <c>HtmlHelper.ReturnTagRek</c> for the body tag.
        /// </summary>
        void GetHtmlDocumentTestFile()
        {
            HtmlDocument document = HtmlAgilityHelper.CreateHtmlDocument();
            document.Load(testFile);

            this.documentNode = document.DocumentNode;

            // The body lookup starts from the document node assigned just above.
            this.bodyNode = HtmlHelper.ReturnTagRek(this.documentNode, HtmlTags.body);
        }
Пример #7
0
        /// <summary>
        /// Verifies that <c>HtmlAgilityHelper.ReplacePlainUriForAnchors</c> wraps a plain URI in an
        /// anchor while leaving a URI that is already inside an anchor unchanged.
        /// </summary>
        public void ReplacePlainUriForAnchors()
        {
            // Input contains one bare URI and one URI that is already an anchor.
            string input    = "I tried https://www.nuget.org/p/ because <a href=\"http://jepsano.net/\">http://jepsano.net/</a> another text";
            string expected = "I tried <a href=\"https://www.nuget.org/p/\">https://www.nuget.org/p/</a> because <a href=\"http://jepsano.net/\">http://jepsano.net/</a> another text";

            Assert.Equal(expected, HtmlAgilityHelper.ReplacePlainUriForAnchors(input));
        }
Пример #8
0
        /// <summary>
        /// Fetches the document at <paramref name="url"/> and passes it to <c>ScrapeArticleList</c>.
        /// </summary>
        /// <param name="url">Address to fetch; also stored in the result's <c>RequestUrl</c>.</param>
        /// <param name="includeAbstract">Forwarded unchanged to <c>ScrapeArticleList</c>.</param>
        /// <param name="cancellationToken">Forwarded to <c>HtmlAgilityHelper.GetHtmlDocument</c>.</param>
        /// <returns>
        /// The result of <c>ScrapeArticleList</c>, or a result carrying only <c>RequestUrl</c> when no
        /// document was obtained.
        /// </returns>
        public async Task <ScrapeResultDto <ArticleItemDto> > GetArticles(string url, bool includeAbstract, CancellationToken cancellationToken)
        {
            var scrapeResult = new ScrapeResultDto<ArticleItemDto> { RequestUrl = url };

            HtmlDocument page = await HtmlAgilityHelper.GetHtmlDocument(url, cancellationToken);
            if (page == null)
            {
                // Nothing to scrape: hand back the result with only the request URL set.
                return scrapeResult;
            }

            return ScrapeArticleList(scrapeResult, page, includeAbstract);
        }
Пример #9
0
    /// <summary>
    /// Loads a local HTML file, finds a table node in it and reads the values of column "1"
    /// through <c>HtmlTableParser</c>. The method contains no assertions.
    /// </summary>
    public void HtmlTableParserTest()
    {
        // Machine-specific fixture path.
        string htmlPath = @"D:\_Test\sunamo\sunamo\Html\HtmlTableParserTests\a.html";

        HtmlDocument document = HtmlAgilityHelper.CreateHtmlDocument();
        document.LoadHtml(TF.ReadFile(htmlPath));

        HtmlNode tableNode = HtmlAgilityHelper.Node(document.DocumentNode, true, "table");
        var      parser    = new HtmlTableParser(tableNode, false);
        var      values    = parser.ColumnValues("1", false, false);

        // NOTE(review): neither 'values' nor 'i' is read afterwards; 'i' is presumably a
        // breakpoint anchor for inspecting 'values' in a debugger - confirm before removing.
        int i = 0;
    }
Пример #10
0
        /// <summary>
        /// Fetches the document at <paramref name="url"/>. Scraping is not implemented yet, so the
        /// returned DTO is always a freshly constructed, unpopulated instance.
        /// </summary>
        /// <param name="url">Address of the article page to fetch.</param>
        /// <param name="cancellationToken">Forwarded to <c>HtmlAgilityHelper.GetHtmlDocument</c>.</param>
        /// <returns>A new <c>ArticleItemDto</c>; currently never populated.</returns>
        public async Task <ArticleItemDto> GetArticle(string url, CancellationToken cancellationToken)
        {
            ArticleItemDto dto = new ArticleItemDto();

            HtmlDocument doc = await HtmlAgilityHelper.GetHtmlDocument(url, cancellationToken);

            // GetArticles treats a null document from the same helper as an expected outcome,
            // so guard here as well instead of dereferencing null below.
            if (doc == null)
            {
                return(dto);
            }

            // Placeholder kept from the original; 'raw' is not used until the TODO is done.
            var raw = doc.ToString();

            //TODO : populate dto from the fetched document.

            return(dto);
        }
Пример #11
0
    /// <summary>
    /// Loads a .csproj file, removes every node returned by the wildcard query for
    /// <c>Compile</c> elements whose <c>Consts.Include</c> attribute matches "*.cs", and saves the
    /// result under the file name "output.csproj". The method contains no assertions.
    /// </summary>
    public void XmlAgilityDocumentTest()
    {
        // Machine-specific fixture path.
        string projectPath = @"D:\_Test\sunamo\sunamo\Xml\XmlAgilityDocumentTests\input.csproj";

        var project = new XmlAgilityDocument();
        project.Load(projectPath);

        var compileNodes = HtmlAgilityHelper.NodesWithAttrWildCard(project.hd.DocumentNode, true, "Compile", Consts.Include, "*.cs", true);
        foreach (var compileNode in compileNodes)
        {
            compileNode.Remove();
        }

        // Redirect the save target before saving.
        project.path = FS.ChangeFilename(project.path, "output.csproj", false);
        project.Save();
    }
Пример #12
0
        /// <summary>
        /// Verifies the number of nodes returned by <c>HtmlAgilityHelper.NodesWithAttr</c> for several
        /// tag / attribute-value combinations, each checked recursively and, when the
        /// <c>noRecursive</c> flag is set, non-recursively.
        /// </summary>
        public void NodesWithAttrTest()
        {
            // Case 1: span tag with cssClassC.
            List<HtmlNode> result = HtmlAgilityHelper.NodesWithAttr(documentNode, true, HtmlTags.span, HtmlAttrs.c, cssClassC);
            Assert.Equal(3, result.Count);
            if (noRecursive)
            {
                result = HtmlAgilityHelper.NodesWithAttr(bodyNode, false, HtmlTags.span, HtmlAttrs.c, cssClassC);
                Assert.Equal(1, result.Count);
            }

            // Case 2: asterisk tag with cssClassC.
            result = HtmlAgilityHelper.NodesWithAttr(bodyNode, true, AllStrings.asterisk, HtmlAttrs.c, cssClassC);
            Assert.Equal(4, result.Count);
            if (noRecursive)
            {
                result = HtmlAgilityHelper.NodesWithAttr(bodyNode, false, AllStrings.asterisk, HtmlAttrs.c, cssClassC);
                Assert.Equal(2, result.Count);
            }

            // Case 3: asterisk tag with the extra boolean argument set to true.
            // NOTE(review): the recursive call uses cssClassA while the non-recursive call uses
            // cssClassC - confirm this asymmetry is intended and not a copy/paste slip.
            result = HtmlAgilityHelper.NodesWithAttr(bodyNode, true, AllStrings.asterisk, HtmlAttrs.c, cssClassA, true);
            Assert.Equal(3, result.Count);
            if (noRecursive)
            {
                result = HtmlAgilityHelper.NodesWithAttr(bodyNode, false, AllStrings.asterisk, HtmlAttrs.c, cssClassC, true);
                Assert.Equal(2, result.Count);
            }

            // Case 4: asterisk tag and asterisk attribute value, extra boolean argument set to true.
            result = HtmlAgilityHelper.NodesWithAttr(bodyNode, true, AllStrings.asterisk, HtmlAttrs.c, AllStrings.asterisk, true);
            Assert.Equal(10, result.Count);
            if (noRecursive)
            {
                result = HtmlAgilityHelper.NodesWithAttr(bodyNode, false, AllStrings.asterisk, HtmlAttrs.c, AllStrings.asterisk, true);
                Assert.Equal(7, result.Count);
            }
        }
        /// <summary>
        /// Crawls in rounds until the pending-URL queue is empty. In each round the queued URLs are
        /// fetched in parallel, the entities extracted from each page are tagged with their source
        /// URL, newly discovered URLs are queued for the next round, and the round's entities are
        /// then yielded to the caller.
        /// </summary>
        /// <returns>The entities extracted from every crawled page, yielded round by round.</returns>
        protected override IEnumerable <Entity> GetEntities()
        {
            while (!_NextUrlQueue.IsEmpty)
            {
                // Snapshot this round's URLs, then swap in a fresh queue for the links found while crawling.
                string[] roundUrls = _NextUrlQueue.ToArray();
                StringUtil.Shuffle(roundUrls); // Make the crawler more like BFS
                _NextUrlQueue = new ConcurrentQueue<string>();

                var roundEntities   = new ConcurrentQueue<Entity>();
                var parallelOptions = new ParallelOptions { MaxDegreeOfParallelism = MaxParallelCount };

                Parallel.ForEach(roundUrls, parallelOptions, pageUrl =>
                {
                    string html = HttpRequestHelper.GetContentFromHttpUrl(pageUrl);

                    foreach (Entity extracted in HtmlAgilityHelper.GetEntitiesFromContent(html, EntityModel))
                    {
                        extracted.SetValue("Url", pageUrl);
                        roundEntities.Enqueue(extracted);
                    }

                    // NOTE(review): Contains followed by Add is not atomic across worker threads;
                    // whether that matters depends on the type of VisitedUrls - confirm.
                    foreach (string candidate in DiscoverCrawlerUrls(html, pageUrl))
                    {
                        if (!VisitedUrls.Contains(candidate))
                        {
                            _NextUrlQueue.Enqueue(candidate);
                            VisitedUrls.Add(candidate);
                        }
                    }
                });

                // Hand out the round's results only after every worker has finished.
                foreach (Entity collected in roundEntities)
                {
                    yield return collected;
                }
            }
        }
 /// <summary>
 /// Extracts entities from <paramref name="content"/> by delegating to
 /// <c>HtmlAgilityHelper.GetEntitiesFromContent</c> with this instance's <c>Model</c>.
 /// </summary>
 /// <param name="content">Content passed through unchanged to the helper.</param>
 /// <returns>Whatever the helper returns for the given content and model.</returns>
 private IEnumerable <Entity> Parse(string content)
 {
     IEnumerable<Entity> entities = HtmlAgilityHelper.GetEntitiesFromContent(content, Model);

     return entities;
 }