/// <summary>
/// Extracts meta tag information from raw page HTML using a regular expression.
/// </summary>
/// <remarks>
/// The original documentation marks this overload as deprecated; the HtmlDocument
/// overload is presumably the intended replacement (confirm with callers).
/// The pattern only recognises the attributes in the order name, property, content,
/// and it requires a closing head tag after the meta tag.
/// NOTE(review): the trailing part of the pattern consumes everything up to the
/// closing head tag, so successive matches cannot share the same head section; this
/// appears to limit the result to one tag per head section - confirm before relying
/// on multiple entries.
/// </remarks>
/// <param name="p_strPageHtmlContent">Raw HTML of the page. Null or empty yields an empty result.</param>
/// <returns>
/// The matched meta tags keyed by their name attribute. When a name occurs more than
/// once the first match is kept. A tag without a name attribute is stored under the
/// empty-string key, because a capture group that did not participate has an empty value.
/// </returns>
private static Dict<string, PageMeta> GetPageMetaInfo(string p_strPageHtmlContent)
{
    Dict<string, PageMeta> dictReturnSet = new Dict<string, PageMeta>();

    // Regex.Matches throws on a null input; nothing can match in an empty one either.
    if (string.IsNullOrEmpty(p_strPageHtmlContent))
    {
        return dictReturnSet;
    }

    // Group 1 = name, group 2 = property, group 3 = content.
    string pattern = "<meta.+?(?:name=(?:\"|')(.*?)(?:\"|').*?)?(?:property=(?:\"|')(.*?)(?:\"|').*?)?(?:content=(?:\"|')(.*?)(?:\"|'))?/?>.*?</head>";

    // Singleline lets '.' cross line breaks so a tag and the closing head tag may sit on different lines.
    RegexOptions rxoOptions = RegexOptions.IgnoreCase | RegexOptions.Singleline;

    foreach (Match match in Regex.Matches(p_strPageHtmlContent, pattern, rxoOptions))
    {
        string strName = match.Groups[1].Value;

        PageMeta pmMetaTag = new PageMeta();
        pmMetaTag.Name = strName;
        pmMetaTag.Property = match.Groups[2].Value;
        pmMetaTag.Content = match.Groups[3].Value;

        // First occurrence of a name wins; later duplicates are dropped.
        if (!dictReturnSet.ContainsKey(strName))
        {
            dictReturnSet.Add(strName, pmMetaTag);
        }
    }

    return dictReturnSet;
}
/// <summary>
/// Collects the meta elements of a parsed HTML document as PageMeta entries.
/// </summary>
/// <remarks>
/// The "//meta" XPath selects meta elements anywhere in the document, not only
/// those inside the head element.
/// </remarks>
/// <param name="p_htmlDocDocument">Parsed HTML document whose meta elements are read.</param>
/// <returns>
/// One PageMeta per selected node, in the order the nodes are returned by the query,
/// with Name, Property and Content filled from the attributes of the same names.
/// The list is empty when the query yields no nodes.
/// </returns>
private static List<PageMeta> GetPageMetaInfo(HtmlDocument p_htmlDocDocument)
{
    List<PageMeta> lstMeta = new List<PageMeta>();

    // Run the XPath query once and reuse the result for both the null check and the loop.
    var metaNodes = p_htmlDocDocument.DocumentNode.SelectNodes("//meta");

    // A null result is treated as "no meta tags".
    if (metaNodes == null)
    {
        return lstMeta;
    }

    foreach (HtmlNode hnItem in metaNodes)
    {
        PageMeta pmMeta = new PageMeta();
        pmMeta.Name = GetHtmlAttributeValue(hnItem.Attributes, "name");
        pmMeta.Property = GetHtmlAttributeValue(hnItem.Attributes, "property");
        pmMeta.Content = GetHtmlAttributeValue(hnItem.Attributes, "content");
        lstMeta.Add(pmMeta);
    }

    return lstMeta;
}