예제 #1
0
        /// <summary>
        /// Extracts anchor (&lt;a&gt;) tag information from raw page HTML using a regular expression.
        /// </summary>
        /// <remarks>
        /// The original documentation marks this method as deprecated.
        /// Only quoted <c>href</c> and <c>title</c> attribute values are recognised; regular
        /// expressions cannot parse arbitrary HTML, so malformed markup may be missed.
        /// </remarks>
        /// <param name="p_strPageHtmlContent">Raw HTML of the page to scan. Null or empty input yields an empty result.</param>
        /// <returns>
        /// The anchors that were found, keyed by their href value. Anchors without a quoted,
        /// non-empty href are skipped, and when an href occurs more than once only the first
        /// anchor is kept.
        /// </returns>
        private static Dict<string, Anchor> GetAnchorsList(string p_strPageHtmlContent)
        {
            Dict<string, Anchor> dictReturnSet = new Dict<string, Anchor>();

            // Regex.Matches throws on null input; there is nothing to extract anyway.
            if (string.IsNullOrEmpty(p_strPageHtmlContent))
            {
                return dictReturnSet;
            }

            // Attribute fragments. The look-behind keeps "href" from matching inside names such
            // as "data-href"; the trailing [^>]*? lets the match continue to the next attribute
            // without ever leaving the opening tag.
            const string hrefAttribute = @"(?<![\w-])href\s*=\s*[""']([^""']*)[""'][^>]*?";
            const string titleAttribute = @"(?<![\w-])title\s*=\s*[""']([^""']*)[""'][^>]*?";

            // Group 1: href that appears before title (or without a title).
            // Group 2: title.
            // Group 3: href that appears after title.
            // "<a\s" (instead of "<a.") avoids matching <abbr>, <article>, etc., and the pattern
            // now ends at the anchor's own closing tag rather than at </head>.
            string pattern = @"<a\s[^>]*?(?:" + hrefAttribute + ")?(?:" + titleAttribute + ")?(?:" + hrefAttribute + @")?/?>.*?</a>";
            RegexOptions rxoOptions = RegexOptions.IgnoreCase | RegexOptions.Singleline;

            // Tracks the keys already stored so a repeated href does not reach Add() twice.
            // NOTE(review): this uses ordinal comparison; confirm it agrees with Dict's key comparer.
            System.Collections.Generic.HashSet<string> seenHrefs = new System.Collections.Generic.HashSet<string>();

            foreach (Match match in Regex.Matches(p_strPageHtmlContent, pattern, rxoOptions))
            {
                string href = match.Groups[1].Success ? match.Groups[1].Value : match.Groups[3].Value;

                // Skip anchors with no usable href and hrefs that were already recorded.
                if (href.Length == 0 || !seenHrefs.Add(href))
                {
                    continue;
                }

                Anchor ancrLink = new Anchor();
                ancrLink.Href = href;
                ancrLink.Title = match.Groups[2].Value;
                dictReturnSet.Add(href, ancrLink);
            }

            return dictReturnSet;
        }
예제 #2
0
        /// <summary>
        /// Extracts anchor information using HtmlDocument (Html Agility Pack).
        /// </summary>
        /// <param name="p_htmlDocDocument">Parsed document to search. A null document yields an empty list.</param>
        /// <returns>
        /// One <c>Anchor</c> per &lt;a&gt; element that carries an href attribute, in the order
        /// the XPath query returns them; an empty list when there are none.
        /// </returns>
        private static List<Anchor> GetAnchorList(HtmlDocument p_htmlDocDocument)
        {
            List<Anchor> ancrLstReturnList = new List<Anchor>();

            if (p_htmlDocDocument == null)
            {
                return ancrLstReturnList;
            }

            // Run the XPath query once and reuse the result. SelectNodes yields null, not an
            // empty collection, when nothing matches, hence the explicit null check.
            IEnumerable<HtmlNode> anchorNodes = p_htmlDocDocument.DocumentNode.SelectNodes("//a[@href]");
            if (anchorNodes == null)
            {
                return ancrLstReturnList;
            }

            foreach (HtmlNode hnLink in anchorNodes)
            {
                // Each attribute is read through GetHtmlAttributeValue; the visible text comes
                // from the node's InnerText.
                Anchor aNewAnchorTag = new Anchor
                {
                    Href = GetHtmlAttributeValue(hnLink.Attributes, "href"),
                    HrefLang = GetHtmlAttributeValue(hnLink.Attributes, "hreflang"),
                    Title = GetHtmlAttributeValue(hnLink.Attributes, "title"),
                    Target = GetHtmlAttributeValue(hnLink.Attributes, "target"),
                    Rel = GetHtmlAttributeValue(hnLink.Attributes, "rel"),
                    Name = GetHtmlAttributeValue(hnLink.Attributes, "name"),
                    Download = GetHtmlAttributeValue(hnLink.Attributes, "download"),
                    Media = GetHtmlAttributeValue(hnLink.Attributes, "media"),
                    Type = GetHtmlAttributeValue(hnLink.Attributes, "type"),
                    Text = hnLink.InnerText
                };

                ancrLstReturnList.Add(aNewAnchorTag);
            }

            return ancrLstReturnList;
        }