Beispiel #1
0
        /// <summary>
        /// Converts to tag.
        /// </summary>
        /// <param name="html">The HTML.</param>
        /// <param name="lowerNames">if set to <c>true</c> [lower names].</param>
        /// <returns></returns>
        public static ScreenScraperTag ConvertToTag(string html, bool lowerNames)
        {
            Match m = tagRegex.Match(html);

            if (m.Success)
            {
                ScreenScraperTag tag = new ScreenScraperTag();
                tag.Name = m.Groups[1].ToString();
                foreach (Capture capture in m.Groups[2].Captures)
                {
                    Match m2 = nameValueRegex.Match(capture.ToString());
                    if (m2.Success)
                    {
                        string id = m2.Groups[1].ToString();
                        if (lowerNames)
                        {
                            id = id.ToLower();
                        }
                        try
                        {
                            tag.Attributes[id] = m2.Groups[2].ToString();
                        }
                        catch (Exception)
                        {
                            // Catch if duplicate attributes are added
                        }
                    }
                }
                return(tag);
            }
            return(null);
        }
Beispiel #2
0
        /// <summary>
        /// Converts to tag list.
        /// </summary>
        /// <param name="tags">The tags.</param>
        /// <param name="lowerNames">if set to <c>true</c> [lower names].</param>
        /// <returns></returns>
        public static IList <ScreenScraperTag> ConvertToTagList(IList <string> tags, bool lowerNames)
        {
            IList <ScreenScraperTag> ret = new List <ScreenScraperTag>();

            foreach (string tag in tags)
            {
                ScreenScraperTag t = ConvertToTag(tag, lowerNames);
                if (t != null)
                {
                    ret.Add(t);
                }
            }
            return(ret);
        }
        /// <summary>
        /// Posts the process data.
        /// </summary>
        /// <param name="page">The page.</param>
        /// <returns></returns>
        private ScrapedPage PostProcessData(ScrapedPage page)
        {
            if (FollowEquivRefreshes)
            {
                bool followed = false;
                // See if we can find an http-equiv refresh
                IList <ScreenScraperTag> metaTags = ScrapedPage.ConvertToTagList(page.FindChildlessTags("meta", false), true);

                // Now, we have all META tags. Try to find one with HTTP-EQUIV="refresh"
                ScreenScraperTag refreshTag = null;
                foreach (ScreenScraperTag metaTag in metaTags)
                {
                    string httpEquivValue = metaTag.FindAttributeValue("http-equiv");
                    if (httpEquivValue != null && httpEquivValue.Equals("refresh"))
                    {
                        refreshTag = metaTag;
                        break;
                    }
                }
                if (refreshTag != null)
                {
                    // It's a refresh. Try to figure out the URL we have to go to.
                    string contentValue = refreshTag.FindAttributeValue("content");
                    if (contentValue != null)
                    {
                        // First, split it by semicolon
                        string[] refreshPieces = contentValue.Split(';');
                        string   url           = null;
                        int      time          = 0;
                        foreach (string refreshPiece in refreshPieces)
                        {
                            if (refreshPiece.ToLower().Trim().StartsWith("url"))
                            {
                                // found the URL. Just take everything after the =
                                int equalPos = refreshPiece.IndexOf('=');
                                if (equalPos != -1)
                                {
                                    url = refreshPiece.Substring(equalPos + 1).Trim();
                                    break;
                                }
                            }
                            else if (time == 0)
                            {
                                int.TryParse(refreshPiece.Trim(), out time);
                            }
                        }
                        if (time == 0 && url != null)
                        {
                            // We have a refresh url, so we need to update the page

                            // If it is a relative url, make it absolute.
                            if (!Uri.IsWellFormedUriString(url, UriKind.Absolute))
                            {
                                url = LastProcessResponseUri.GetLeftPart(UriPartial.Authority) + "/" + (url.StartsWith("/") ? url.Substring(1) : url);
                            }

                            if (!url.Equals(LastMetaFollow))
                            {
                                page           = Scrape(MetaRefreshScrapeType == null ? page.ScrapeType : MetaRefreshScrapeType.Value, url, page.QueryParameters);
                                LastMetaFollow = url;
                                followed       = true;
                            }
                            else
                            {
                                throw new Exception("Appears to be a recursive loop of http-equiv redirects to the same page ('" + url + "').");
                            }
                        }
                    }
                }

                if (!followed)
                {
                    LastMetaFollow = null;
                }
            }
            return(page);
        }