C# (CSharp) HtmlUtility.AbsoluteUrl примеры использования

Язык программирования: C# (CSharp)

Класс/Тип: HtmlUtility

Метод/Функция: AbsoluteUrl

Примеров на hotexamples.com: 2

C# (CSharp) HtmlUtility.AbsoluteUrl - 2 примера найдено. Это лучшие примеры C# (CSharp) кода для HtmlUtility.AbsoluteUrl, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

GetText(15)

HtmlEncode(8)

GetFull(7)

RemoveHtmlTags(7)

CleanHtml(5)

ConvertToXml(5)

Encode(5)

RemoveInvalidHtmlTags(4)

IsValid(4)

BBCodeToHtml(4)

RemoveHeaderFromHtml(4)

HtmlStrip(3)

CreateProfileLink(3)

GetLastMessageFromMessageHTMLBody(3)

RemoveAllTag(3)

ParseMetadataAsync(2)

GetRandomEmail(2)

QuotesDecode(2)

QuotesEncode(2)

GetTableHolderHtmlId(2)

RemoveCssClass(2)

GetRootUrl(2)

AbsoluteUrl(2)

AddCssClass(2)

GetEnumInfo(2)

ClearFormattingOfHtml(2)

GetContainerHtmlId(2)

ConvertHtmlToText(2)

ExpandRelativePath(2)

MakeSortLink(1)

BuildPagerScript(1)

ConvertBreaksToCrlf(1)

ConvertCrlfToBreaks(1)

ConvertToDisplayText(1)

CutWordLength(1)

Parse(1)

LoadString(1)

LoadUrl(1)

GetParagraphs(1)

IsValidUri(1)

Decode(1)

GetAllImages(1)

GetWordsFrequency(1)

GetTitleContent(1)

GetTableHtmlId(1)

GetHtml(1)

GetPreview(1)

Пример #1

Показать файл

Файл: PageParser.cs Проект: mlzboy/list_discovery

        /// <summary>
        /// Scans every anchor with an href in <paramref name="Html"/> for link text that looks like
        /// pagination (a digit, a Chinese numeral or the character 页), picks one candidate through
        /// TrianNextUrl and stores its link in <paramref name="Url"/>.
        /// </summary>
        /// <param name="Html">Markup of the current page.</param>
        /// <param name="Pattern">Not read by this method.</param>
        /// <param name="Url">
        /// On entry, the address of the current page (used as the base when resolving hrefs);
        /// on exit, the selected next-page link when one was found and it differs from the entry value.
        /// </param>
        /// <returns>
        /// <c>false</c> when the selected link equals the current <paramref name="Url"/> or when
        /// <paramref name="Url"/> ends up empty; otherwise <c>true</c>.
        /// </returns>
        public static bool validateNextPage(string Html, string Pattern, ref string Url)
        {
            HtmlNode htmlNode = HtmlUtility.getSafeHtmlRootNode(Html);

            // SelectNodes yields null (not an empty collection) when no node matches,
            // so a page without links must not be dereferenced blindly.
            IEnumerable <HtmlNode> anchors    = htmlNode.SelectNodes("//a[@href]");
            List <testNextUrl>     candidates = new List <testNextUrl>();

            if (anchors != null)
            {
                // 1-based position of the anchor among all anchors, matching or not.
                int position = 0;

                foreach (HtmlNode anchor in anchors)
                {
                    position++;
                    if (!Regex.Match(anchor.InnerText, @".*[一二三四五六七八九十\d页].*").Success)
                    {
                        continue;
                    }

                    testNextUrl candidate = new testNextUrl();
                    candidate.index   = position;
                    candidate.urlText = HTMLCleaner.CleanHTML(anchor.InnerText, true);
                    candidate.urlLink = HtmlUtility.AbsoluteUrl(anchor.Attributes["href"].Value, Url, true);
                    candidates.Add(candidate);
                }
            }

            if (candidates.Count > 0)
            {
                // Evaluated once; assumes TrianNextUrl returns the same choice for the same list.
                string nextLink = TrianNextUrl(candidates).urlLink;

                if (Url.Equals(nextLink))
                {
                    // The "next" page is the page we are already on.
                    return(false);
                }

                Url = nextLink;
            }

            return(Url.Length != 0);
        }

Пример #2

Показать файл

        /// <summary>
        /// Extracts a collection of articles from a list page, using the per-field XPaths in
        /// <paramref name="Path"/>.
        /// </summary>
        /// <param name="Url">Address of the list page; passed to HtmlUtility.AbsoluteUrl as the base for item links.</param>
        /// <param name="RootNode">Root node of the document.</param>
        /// <param name="Path">
        /// XPath pattern set that drives the extraction. Note that this method assigns
        /// <c>Path.UrlXPath</c> to <c>Path.TitleXPath</c> while processing, so the caller's
        /// instance is modified.
        /// </param>
        /// <returns>The extracted articles whose Url is not blank; empty when nothing matched, never null.</returns>
        public static List <Article> ExtractItemFromList(string Url, HtmlNode RootNode, XpathPattern Path)
        {
            List <Article> Content = new List <Article>();

            // SelectNodes returns null rather than an empty collection when nothing matches.
            HtmlNodeCollection rootNodes = RootNode.SelectNodes(Path.ItemRootXPath);

            if (rootNodes == null || rootNodes.Count == 0)
            {
                return(Content);
            }

            foreach (HtmlNode BaseNode in rootNodes)
            {
                // Normally each BaseNode holds one item, but some sites put several under one node.
                if (string.IsNullOrWhiteSpace(Path.TitleXPath) || BaseNode.SelectNodes(Path.TitleXPath) == null)
                {
                    continue;
                }

                // Link nodes come from UrlXPath when it is set, otherwise from TitleXPath.
                string             linkXPath  = !string.IsNullOrWhiteSpace(Path.UrlXPath) ? Path.UrlXPath : Path.TitleXPath;
                HtmlNodeCollection linkResult = BaseNode.SelectNodes(linkXPath);

                // A non-matching UrlXPath yields null; treat it as "no links under this node".
                List <HtmlNode> nodecollection = linkResult == null
                    ? new List <HtmlNode>()
                    : linkResult.Where(n => n.Attributes.Contains("href")).ToList();

                // NOTE(review): TitleXPath is always non-blank here (see the guard above), so this
                // unconditionally replaces it with UrlXPath on the caller's object. Kept as-is because
                // callers may rely on it — confirm whether the condition was meant to be inverted.
                if (!string.IsNullOrWhiteSpace(Path.TitleXPath))
                {
                    Path.TitleXPath = Path.UrlXPath;
                }

                // Built once per BaseNode: the link nodes whose href actually carries a value.
                List <HtmlNode> linkedNodes = nodecollection.Where(n => !string.IsNullOrEmpty(n.Attributes["href"].Value)).ToList();

                if (linkedNodes.Count == 0)
                {
                    continue;
                }

                // One slot per node with an href attribute (including empty hrefs); slots beyond
                // linkedNodes.Count get a null Url and are filtered out at the end.
                int       singleNodeItemCount = nodecollection.Count;
                Article[] articleNodeItems    = new Article[singleNodeItemCount];

                for (int i = 0; i < singleNodeItemCount; i++)
                {
                    articleNodeItems[i]       = new Article();
                    articleNodeItems[i].Title = ExtractInnerTextFromBaseNode(BaseNode, Path.TitleXPath, i);
                    if (articleNodeItems[i].Title != null)
                    {
                        if (i >= linkedNodes.Count)
                        {
                            // No i-th non-empty link exists for this title.
                            articleNodeItems[i].Url = null;
                        }
                        else
                        {
                            try
                            {
                                articleNodeItems[i].Url = linkedNodes[i].Attributes["href"].Value;
                                // NOTE(review): this `continue` (and the '@' one below) skips the remaining
                                // fields but leaves Url set, so the item still reaches the result list.
                                if (articleNodeItems[i].Url.Contains(".pdf"))
                                {
                                    continue;
                                }
                                if (articleNodeItems[i].Url.StartsWith("javascript:openArticle"))
                                {
                                    // Keep only the text between ('  and  ') of the script call.
                                    articleNodeItems[i].Url = articleNodeItems[i].Url.Substring(articleNodeItems[i].Url.IndexOf("('") + 2);
                                    articleNodeItems[i].Url = articleNodeItems[i].Url.Substring(0, articleNodeItems[i].Url.IndexOf("')"));
                                }
                                articleNodeItems[i].Url = HtmlUtility.AbsoluteUrl(articleNodeItems[i].Url, Url, true);
                                // NOTE(review): second resolution of the already-resolved link; looks redundant
                                // if AbsoluteUrl is idempotent — confirm before removing.
                                string url = HtmlUtility.AbsoluteUrl(articleNodeItems[i].Url, Url, true);
                                articleNodeItems[i].Url = url;
                                if (articleNodeItems[i].Url.Contains('@'))
                                {
                                    continue;
                                }
                            }
                            catch (Exception)
                            {
                                // Best effort: a link that cannot be parsed or resolved drops the item.
                                articleNodeItems[i].Url = null;
                            }
                        }
                    }
                    if (!string.IsNullOrWhiteSpace(Path.MediaNameXPath))
                    {
                        articleNodeItems[i].MediaName = ExtractInnerTextFromBaseNode(BaseNode, Path.MediaNameXPath, i);
                        articleNodeItems[i].MediaName = ExtractSegmentFromInnerText(articleNodeItems[i].MediaName, MediaPrefixRegex);
                        articleNodeItems[i].MediaName = HTMLCleaner.CleanMediaName(articleNodeItems[i].MediaName); // clean-up
                    }
                    if (!string.IsNullOrWhiteSpace(Path.AuthorXPath))
                    {
                        articleNodeItems[i].Author = ExtractInnerTextFromBaseNode(BaseNode, Path.AuthorXPath, i);
                        articleNodeItems[i].Author = ExtractSegmentFromInnerText(articleNodeItems[i].Author, AuthorPrefixRegex);
                        articleNodeItems[i].Author = HTMLCleaner.CleanAuthor(articleNodeItems[i].Author); // clean-up
                    }
                    if (!string.IsNullOrWhiteSpace(Path.DateXPath))
                    {
                        articleNodeItems[i].PubDate = DateTimeParser.Parser(ExtractInnerTextFromBaseNode(BaseNode, Path.DateXPath, i));
                    }
                    if (!string.IsNullOrWhiteSpace(Path.AbsTractXPath))
                    {
                        articleNodeItems[i].AbsTract = ExtractInnerTextFromBaseNode(BaseNode, Path.AbsTractXPath, i);
                    }

                    // View-count extraction.
                    if (!string.IsNullOrWhiteSpace(Path.ViewXPath) || !string.IsNullOrWhiteSpace(Path.ReplyXPath))
                    {
                        ViewData currentViewData = new ViewData();
                        currentViewData.FetchTime = DateTime.Now;

                        string ViewString = ExtractInnerTextFromBaseNode(BaseNode, Path.ViewXPath, i, false);
                        if (!string.IsNullOrEmpty(ViewString))
                        {
                            MatchCollection digiText = Regex.Matches(ViewString, @"\d{1,9}");
                            if (digiText.Count == 1)
                            {
                                currentViewData.View = int.Parse(digiText[0].Captures[0].Value);
                            }
                            else if (digiText.Count > 1 && Path.ViewXPath == Path.ReplyXPath) // View and Reply share one cell; several numbers are likely here, not recommended
                            {
                                // The larger of the first two numbers is taken as views, the smaller as replies.
                                int a = int.Parse(digiText[0].Captures[0].Value);
                                int b = int.Parse(digiText[1].Captures[0].Value);
                                currentViewData.View  = a >= b ? a : b;
                                currentViewData.Reply = a >= b ? b : a;
                            }
                        }

                        // Reply-count extraction.
                        if (!string.IsNullOrEmpty(Path.ReplyXPath) && Path.ViewXPath != Path.ReplyXPath)
                        {
                            string ReplyString = ExtractInnerTextFromBaseNode(BaseNode, Path.ReplyXPath, i, false);
                            if (!string.IsNullOrEmpty(ReplyString))
                            {
                                MatchCollection digiText = Regex.Matches(ReplyString, @"\d{1,9}");
                                if (digiText.Count > 0) // stand-alone Reply
                                {
                                    currentViewData.Reply = int.Parse(digiText[0].Captures[0].Value);
                                }
                            }
                        }
                        if (articleNodeItems[i].ViewDataList == null)
                        {
                            articleNodeItems[i].ViewDataList = new List <ViewData>();
                        }

                        articleNodeItems[i].ViewDataList.Add(currentViewData);
                    }
                }

                Content.AddRange(articleNodeItems.Where(f => !string.IsNullOrWhiteSpace(f.Url)));
            }

            return(Content);
        }