/// <summary>
/// Parses the text of a single HTML tag into a <see cref="ScreenScraperTag"/>.
/// </summary>
/// <param name="html">The HTML text of the tag. May be <c>null</c>, in which case no tag is produced.</param>
/// <param name="lowerNames">If set to <c>true</c>, attribute names are lower-cased (culture-invariant) before being stored. The tag name itself is never altered.</param>
/// <returns>The parsed tag, or <c>null</c> when <paramref name="html"/> is <c>null</c> or is not matched by the tag pattern.</returns>
public static ScreenScraperTag ConvertToTag(string html, bool lowerNames)
{
    // Regex.Match throws on null input; treat null the same as "not a tag".
    if (html == null)
    {
        return null;
    }

    Match tagMatch = tagRegex.Match(html);
    if (!tagMatch.Success)
    {
        return null;
    }

    ScreenScraperTag tag = new ScreenScraperTag();
    tag.Name = tagMatch.Groups[1].Value;

    // Every capture of group 2 is run through nameValueRegex; its group 1 is
    // used as the attribute name and its group 2 as the attribute value.
    foreach (Capture capture in tagMatch.Groups[2].Captures)
    {
        Match attributeMatch = nameValueRegex.Match(capture.Value);
        if (!attributeMatch.Success)
        {
            continue;
        }

        string id = attributeMatch.Groups[1].Value;
        if (lowerNames)
        {
            // Invariant lower-casing: the culture-sensitive ToLower() would map 'I' to a
            // dotless 'ı' under Turkish cultures, so a lookup such as "id" would miss "ID".
            id = id.ToLowerInvariant();
        }

        try
        {
            tag.Attributes[id] = attributeMatch.Groups[2].Value;
        }
        catch (Exception)
        {
            // Deliberate best-effort: the original author expected the attribute
            // collection to reject duplicate attribute names. Skip the offending
            // attribute and keep parsing the rest of the tag.
        }
    }

    return tag;
}
/// <summary>
/// Runs <c>ConvertToTag</c> over every entry of <paramref name="tags"/> and gathers the
/// results, in the same order, leaving out entries for which no tag was produced.
/// </summary>
/// <param name="tags">The HTML tag strings to convert.</param>
/// <param name="lowerNames">Passed through to <c>ConvertToTag</c> for every entry.</param>
/// <returns>A new list holding the non-<c>null</c> conversion results; empty when nothing converted.</returns>
public static IList<ScreenScraperTag> ConvertToTagList(IList<string> tags, bool lowerNames)
{
    List<ScreenScraperTag> converted = new List<ScreenScraperTag>();

    foreach (string tagHtml in tags)
    {
        ScreenScraperTag parsed = ConvertToTag(tagHtml, lowerNames);

        // A null result means the entry could not be converted; drop it.
        if (parsed == null)
        {
            continue;
        }

        converted.Add(parsed);
    }

    return converted;
}
/// <summary>
/// Post-processes a freshly scraped page. When <c>FollowEquivRefreshes</c> is set and the page
/// contains a <c>&lt;meta http-equiv="refresh"&gt;</c> tag whose content carries a zero delay and
/// a URL, that URL is scraped and the resulting page is returned in place of the original.
/// </summary>
/// <param name="page">The page that was just scraped.</param>
/// <returns>
/// The page obtained by following the refresh, or <paramref name="page"/> itself when
/// refresh-following is disabled or no immediate refresh was found.
/// </returns>
/// <exception cref="Exception">
/// Thrown when the refresh target is the same URL that was recorded in <c>LastMetaFollow</c>,
/// which is treated as a redirect loop.
/// </exception>
private ScrapedPage PostProcessData(ScrapedPage page)
{
    if (!FollowEquivRefreshes)
    {
        return page;
    }

    bool followed = false;

    // Parse every <meta> tag; attribute names are lower-cased (lowerNames = true)
    // so the lookups below can use lowercase names.
    IList<ScreenScraperTag> metaTags = ScrapedPage.ConvertToTagList(page.FindChildlessTags("meta", false), true);

    // Find the first META tag with HTTP-EQUIV="refresh". Attribute *values* keep their
    // original casing, and "Refresh" / "REFRESH" are common in real pages, so the
    // comparison must ignore case.
    ScreenScraperTag refreshTag = null;
    foreach (ScreenScraperTag metaTag in metaTags)
    {
        string httpEquivValue = metaTag.FindAttributeValue("http-equiv");
        if (httpEquivValue != null && string.Equals(httpEquivValue, "refresh", StringComparison.OrdinalIgnoreCase))
        {
            refreshTag = metaTag;
            break;
        }
    }

    string contentValue = refreshTag != null ? refreshTag.FindAttributeValue("content") : null;
    if (contentValue != null)
    {
        // The content has the form "<delay>; url=<target>": split on ';' and classify each piece.
        string url = null;
        int time = 0;
        foreach (string refreshPiece in contentValue.Split(';'))
        {
            string trimmedPiece = refreshPiece.Trim();
            if (trimmedPiece.StartsWith("url", StringComparison.OrdinalIgnoreCase))
            {
                // Found the URL piece. Take everything after the first '='.
                int equalPos = refreshPiece.IndexOf('=');
                if (equalPos != -1)
                {
                    url = refreshPiece.Substring(equalPos + 1).Trim();
                    break;
                }
            }
            else if (time == 0)
            {
                // int.TryParse leaves 'time' at 0 when the piece is not an integer,
                // so an unparsable delay is treated like an immediate refresh.
                int.TryParse(trimmedPiece, out time);
            }
        }

        // Only immediate (zero-delay) refreshes that name a URL are followed.
        if (time == 0 && url != null)
        {
            if (!Uri.IsWellFormedUriString(url, UriKind.Absolute))
            {
                // Resolve against the URI of the last response so that a target such as
                // "next.html" lands next to the current document instead of at the site
                // root, and an absolute URL that merely needs escaping is left absolute.
                Uri resolved;
                if (Uri.TryCreate(LastProcessResponseUri, url, out resolved))
                {
                    url = resolved.AbsoluteUri;
                }
                else
                {
                    // Could not resolve; fall back to appending the target to the authority.
                    url = LastProcessResponseUri.GetLeftPart(UriPartial.Authority) + "/" + (url.StartsWith("/", StringComparison.Ordinal) ? url.Substring(1) : url);
                }
            }

            if (url.Equals(LastMetaFollow))
            {
                throw new Exception("Appears to be a recursive loop of http-equiv redirects to the same page ('" + url + "').");
            }

            // Use the configured scrape type for refreshes when one is set,
            // otherwise reuse the scrape type of the current page.
            page = Scrape(MetaRefreshScrapeType == null ? page.ScrapeType : MetaRefreshScrapeType.Value, url, page.QueryParameters);
            LastMetaFollow = url;
            followed = true;
        }
    }

    // No refresh was followed for this page: clear the remembered target so a later,
    // unrelated refresh to the same URL is not reported as a loop.
    if (!followed)
    {
        LastMetaFollow = null;
    }

    return page;
}