/// <summary>
/// Regex-based extraction of anchor information from raw page markup.
/// </summary>
/// <param name="p_strPageHtmlContent">Raw HTML text that the pattern is run against.</param>
/// <returns>
/// One Anchor per regex match, keyed by the value of the match's first capture group.
/// The original documentation labels this method as deprecated.
/// </returns>
private static Dict<string, Anchor> GetAnchorsList(string p_strPageHtmlContent)
{
    // Capture groups, in order: (1) the value following "href=", (2) the value
    // following "title=", (3) the value following a second "href=". All three are optional.
    // NOTE(review): the pattern only matches when a closing "</head>" follows the tag,
    // and the groups are stored into Rel / Type / Href below. This looks carried over
    // from a <link> tag extractor - confirm the intent before relying on the output.
    const string anchorPattern = "<a.+?(?:href=(?:\"|')(.*?)(?:\"|').*?)?(?:title=(?:\"|')(.*?)(?:\"|').*?)?(?:href=(?:\"|')(.*?)(?:\"|'))?/?>.*?</head>";

    var anchorsByKey = new Dict<string, Anchor>();

    // Case-insensitive; Singleline lets '.' span line breaks.
    MatchCollection hits = Regex.Matches(
        p_strPageHtmlContent,
        anchorPattern,
        RegexOptions.IgnoreCase | RegexOptions.Singleline);

    foreach (Match hit in hits)
    {
        string key = hit.Groups[1].Value;

        var anchor = new Anchor
        {
            Rel = key,
            Type = hit.Groups[2].Value,
            Href = hit.Groups[3].Value
        };

        // NOTE(review): Add is called without a prior key check; how a repeated
        // key is handled depends on Dict, which is declared elsewhere.
        anchorsByKey.Add(key, anchor);
    }

    return anchorsByKey;
}
/// <summary>
/// Extracts anchor information using HtmlDocument (Html Agility Pack).
/// </summary>
/// <param name="p_htmlDocDocument">Parsed document whose anchor elements carrying an href attribute are collected.</param>
/// <returns>
/// One Anchor per node selected by the XPath query; an empty list when the query yields no nodes.
/// </returns>
private static List<Anchor> GetAnchorList(HtmlDocument p_htmlDocDocument)
{
    List<Anchor> ancrLstReturnList = new List<Anchor>();

    // Evaluate the XPath query a single time and reuse the result for both the
    // null check and the iteration (previously the document was queried twice).
    var hnLinkNodes = p_htmlDocDocument.DocumentNode.SelectNodes("//a[@href]");

    // SelectNodes can hand back null instead of an empty collection, so guard before iterating.
    if (hnLinkNodes == null)
    {
        return ancrLstReturnList;
    }

    foreach (HtmlNode hnLink in hnLinkNodes)
    {
        // Each attribute is read through the shared GetHtmlAttributeValue helper;
        // the link text is taken from the node's InnerText.
        Anchor aNewAnchorTag = new Anchor
        {
            Href = GetHtmlAttributeValue(hnLink.Attributes, "href"),
            HrefLang = GetHtmlAttributeValue(hnLink.Attributes, "hreflang"),
            Title = GetHtmlAttributeValue(hnLink.Attributes, "title"),
            Target = GetHtmlAttributeValue(hnLink.Attributes, "target"),
            Rel = GetHtmlAttributeValue(hnLink.Attributes, "rel"),
            Name = GetHtmlAttributeValue(hnLink.Attributes, "name"),
            Download = GetHtmlAttributeValue(hnLink.Attributes, "download"),
            Media = GetHtmlAttributeValue(hnLink.Attributes, "media"),
            Type = GetHtmlAttributeValue(hnLink.Attributes, "type"),
            Text = hnLink.InnerText
        };

        ancrLstReturnList.Add(aNewAnchorTag);
    }

    return ancrLstReturnList;
}