/// <summary>
/// Post-processes a freshly scraped page. When <c>FollowEquivRefreshes</c> is set and the
/// page carries a zero-delay <c>&lt;meta http-equiv="refresh"&gt;</c> tag with a target URL,
/// that target is scraped and returned in place of <paramref name="page"/>.
/// </summary>
/// <param name="page">The page that was just scraped.</param>
/// <returns>
/// The page the refresh points to, or <paramref name="page"/> itself when no refresh is followed.
/// </returns>
/// <exception cref="Exception">
/// The refresh target is the same URL as the refresh that is currently being followed.
/// </exception>
private ScrapedPage PostProcessData(ScrapedPage page)
{
    if (!FollowEquivRefreshes)
    {
        return page;
    }

    ScreenScraperTag refreshTag = FindMetaRefreshTag(page);
    string contentValue = refreshTag == null ? null : refreshTag.FindAttributeValue("content");

    if (contentValue != null)
    {
        string url;
        int time;
        ParseMetaRefreshContent(contentValue, out url, out time);

        // Only immediate (zero-delay) refreshes that name a target are followed.
        if (time == 0 && url != null)
        {
            url = ResolveMetaRefreshUrl(url);

            if (url.Equals(LastMetaFollow))
            {
                // Clear the marker so a later, unrelated scrape does not trip over stale state.
                LastMetaFollow = null;
                throw new Exception("Appears to be a recursive loop of http-equiv redirects to the same page ('" + url + "').");
            }

            // Record the target BEFORE the nested Scrape: Scrape re-enters this method for the
            // target page, and that nested call is the one that must see the marker in order
            // to detect a page refreshing to itself.
            LastMetaFollow = url;
            try
            {
                // The nested call leaves LastMetaFollow in its final state, so nothing more
                // needs to be done here on success.
                return Scrape(MetaRefreshScrapeType == null ? page.ScrapeType : MetaRefreshScrapeType.Value, url, page.QueryParameters);
            }
            catch
            {
                // A failed follow must not leave a marker behind, otherwise a retry of the
                // same page would be reported as a redirect loop.
                LastMetaFollow = null;
                throw;
            }
        }
    }

    // No refresh was followed for this page.
    LastMetaFollow = null;
    return page;
}

/// <summary>
/// Returns the first META tag of <paramref name="page"/> whose <c>http-equiv</c> attribute
/// is "refresh" (compared ignoring case), or <c>null</c> when there is none.
/// </summary>
private static ScreenScraperTag FindMetaRefreshTag(ScrapedPage page)
{
    List<ScreenScraperTag> metaTags = ScrapedPage.ConvertToTagList(page.FindChildlessTags("meta", false), true);

    foreach (ScreenScraperTag metaTag in metaTags)
    {
        string httpEquivValue = metaTag.FindAttributeValue("http-equiv");
        if (httpEquivValue != null && httpEquivValue.Trim().Equals("refresh", StringComparison.OrdinalIgnoreCase))
        {
            return metaTag;
        }
    }

    return null;
}

/// <summary>
/// Splits the <c>content</c> attribute of a refresh tag into its delay and target URL.
/// </summary>
/// <param name="content">The raw attribute value, e.g. <c>0; url=/next</c>.</param>
/// <param name="url">The text after <c>url=</c>, or <c>null</c> when no URL piece is present.</param>
/// <param name="time">The first non-zero integer piece seen before the URL, otherwise 0.</param>
private static void ParseMetaRefreshContent(string content, out string url, out int time)
{
    url = null;
    time = 0;

    int start = 0;
    while (start <= content.Length)
    {
        int end = content.IndexOf(';', start);
        if (end == -1)
        {
            end = content.Length;
        }

        string piece = content.Substring(start, end - start);

        if (piece.Trim().StartsWith("url", StringComparison.OrdinalIgnoreCase))
        {
            int equalPos = piece.IndexOf('=');
            if (equalPos != -1)
            {
                // Take the remainder of the whole attribute, not just this piece, so that
                // semicolons inside the URL (e.g. ";jsessionid=...") are preserved.
                url = content.Substring(start + equalPos + 1).Trim();
                return;
            }
        }
        else if (time == 0)
        {
            // TryParse writes 0 on failure, which leaves "no delay" as the default.
            int.TryParse(piece.Trim(), out time);
        }

        start = end + 1;
    }
}

/// <summary>
/// Turns a refresh target into an absolute URL. Well-formed absolute URLs are returned
/// unchanged; anything else is resolved against <c>LastProcessResponseUri</c>.
/// </summary>
private string ResolveMetaRefreshUrl(string url)
{
    if (Uri.IsWellFormedUriString(url, UriKind.Absolute))
    {
        return url;
    }

    // Standard reference resolution handles "next.html", "../x", "//host/x" and "?q=1",
    // which plain concatenation onto the authority does not.
    Uri resolved;
    if (Uri.TryCreate(LastProcessResponseUri, url, out resolved))
    {
        return resolved.AbsoluteUri;
    }

    // Resolution failed: fall back to joining the target onto the authority root.
    return LastProcessResponseUri.GetLeftPart(UriPartial.Authority) + "/" + (url.StartsWith("/", StringComparison.Ordinal) ? url.Substring(1) : url);
}
/// <summary>
/// Requests <paramref name="uri"/> using the given scrape type and returns the resulting
/// page after it has been run through <c>PostProcessData</c>.
/// </summary>
/// <param name="type">How to issue the request; GET and POST are handled.</param>
/// <param name="uri">
/// The address to request. It must be parseable by <see cref="Uri"/> as an absolute URI,
/// because it is stored in the page's <c>Url</c>.
/// </param>
/// <param name="query">
/// Parameters turned into a query string by <c>BuildQueryString</c>; the result is appended
/// to the URI for GET and handed to <c>HttpPost</c> for POST.
/// </param>
/// <returns>The scraped (and post-processed) page.</returns>
/// <exception cref="ArgumentNullException"><paramref name="uri"/> is <c>null</c>.</exception>
/// <exception cref="NotImplementedException"><paramref name="type"/> is neither GET nor POST.</exception>
/// <exception cref="Exception">The request produced no data stream.</exception>
public ScrapedPage Scrape(ScrapeType type, string uri, NameValueCollection query)
{
    if (uri == null)
    {
        throw new ArgumentNullException("uri");
    }

    ScrapedPage page = new ScrapedPage();
    string qs = BuildQueryString(query);
    page.QueryParameters = query;
    page.ScrapeType = type;

    switch (type)
    {
        case ScrapeType.GET:
            // Only append a separator when there is something to append; otherwise the
            // request URL (and the stored Url/Referer) would end in a dangling '?' or '&'.
            if (!string.IsNullOrEmpty(qs))
            {
                uri = uri + (uri.Contains("?") ? "&" : "?") + qs;
            }
            page.RawStream = HttpGet(uri);
            break;

        case ScrapeType.POST:
            page.RawStream = HttpPost(uri, qs);
            break;

        default:
            throw new NotImplementedException();
    }

    if (page.RawStream == null)
    {
        throw new Exception("No data for " + uri);
    }

    page.Url = new Uri(uri);
    Referer = uri;
    return PostProcessData(page);
}