Example #1
0
        //dup to FindTitle() - which one we should use?
        /// <summary>
        /// Requests the page at <paramref name="url"/> and reads the text of its HTML title element.
        /// </summary>
        /// <param name="url">Address of the HTML page to request.</param>
        /// <param name="defaultIfNoMatch">Value returned when no title element is read.</param>
        /// <param name="proxy">Proxy assigned to the web request.</param>
        /// <param name="credentials">Credentials assigned to the web request.</param>
        /// <returns>
        /// The content of the first element found at <c>//html/title</c> or <c>//html/head/title</c>;
        /// otherwise <paramref name="defaultIfNoMatch"/> (the body element was reached first, the
        /// document ended, or parsing failed).
        /// </returns>
        /// <remarks>
        /// Only exceptions raised while parsing are caught and logged at debug level. Exceptions from
        /// creating the request or obtaining the response are not caught here.
        /// </remarks>
        public static string FindTitle2(string url, string defaultIfNoMatch, IWebProxy proxy, ICredentials credentials)
        {
            HttpWebRequest webRequest = (HttpWebRequest)WebRequest.Create(url);

            webRequest.AllowAutoRedirect = true;
            webRequest.Proxy             = proxy;
            webRequest.Credentials       = credentials;
            webRequest.Timeout           = 5 * 1000; // five seconds, expressed in milliseconds

            if (FeedSource.SetCookies)
            {
                HttpCookieManager.SetCookies(webRequest);
            }

            // Pretend to be a mainstream browser: some sites redirect unknown agents
            // to an "unsupported browser" page.
            webRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1;)";

            string result = defaultIfNoMatch;

            using (HttpWebResponse webResponse = (HttpWebResponse)webRequest.GetResponse())
            using (FastSgmlXPathReader htmlReader = new FastSgmlXPathReader())
            using (StreamReader bodyReader = new StreamReader(webResponse.GetResponseStream()))
            {
                try
                {
                    htmlReader.InputStream = bodyReader;
                    htmlReader.DocType     = "HTML";
                    htmlReader.CaseFolding = CaseFolding.ToLower;

                    while (htmlReader.Read())
                    {
                        if (htmlReader.NodeType != XmlNodeType.Element)
                        {
                            continue;
                        }

                        string path = htmlReader.XPath;

                        if (path == "//html/title" || path == "//html/head/title")
                        {
                            result = htmlReader.ReadElementContentAsString();
                            break;
                        }

                        if (path == "//html/body")
                        {
                            // The body has started without a title having been read: stop scanning.
                            break;
                        }
                    }
                }
                catch (Exception e)
                {
                    _log.Debug("Error retrieving title from HTML page at " + url, e);
                }
            }

            return result;
        }
Example #2
0
        /// <summary>
        /// Extracts the anchors matched by <c>RegExFindHref</c> from <paramref name="html"/> and returns
        /// them as a list of links without duplicate URLs.
        /// </summary>
        /// <param name="html">HTML text to scan; may be null or empty.</param>
        /// <returns>
        /// One <c>TitledLink</c> per distinct URL (ordinal comparison), in order of first appearance.
        /// Each link carries the anchor's <c>title</c> attribute, or the anchor's inner content when
        /// there is no title. Returns <c>GetList&lt;TitledLink&gt;.Empty</c> when <paramref name="html"/>
        /// is null/empty or no link was collected.
        /// </returns>
        /// <remarks>
        /// URLs are lower-cased (culture-invariant) before being stored. Anchors whose href starts with
        /// <c>mailto:</c> or <c>javascript:</c> are skipped, both for the raw regex capture and for the
        /// href attribute value reported by the SGML reader.
        /// </remarks>
        public static List <TitledLink> RetrieveTitledLinks(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return(GetList <TitledLink> .Empty);
            }

            List <TitledLink> list = new List <TitledLink>();

            // One reader is reused for every matched anchor; it is disposable, so release it when done.
            using (FastSgmlXPathReader sgmlReader = new FastSgmlXPathReader
            {
                DocType     = "HTML",
                CaseFolding = CaseFolding.ToLower,
                IgnoreDtd   = true
            })
            {
                for (Match m = RegExFindHref.Match(html); m.Success; m = m.NextMatch())
                {
                    // Invariant lower-casing plus ordinal prefix checks keep scheme detection
                    // independent of the current culture (e.g. the Turkish dotless i).
                    string href = m.Groups[1].Value.ToLowerInvariant();

                    // filter non-real relation urls:
                    if (href.Length == 0 ||
                        href.StartsWith("mailto:", StringComparison.Ordinal) ||
                        href.StartsWith("javascript:", StringComparison.Ordinal))
                    {
                        continue;
                    }

                    var  linkText  = m.Groups[2].Value;
                    var  linkTitle = linkText;
                    bool skipLink  = false;

                    using (var inputStreamReader = new StringReader(m.Groups[0].Value))
                    {
                        try
                        {
                            sgmlReader.InputStream = inputStreamReader;
                            while (sgmlReader.Read())
                            {
                                if (sgmlReader.NodeType != XmlNodeType.Element || sgmlReader.Name != "a")
                                {
                                    continue;
                                }

                                // The attribute value as reported by the reader replaces the raw regex capture.
                                href = sgmlReader.GetAttribute("href");
                                if (String.IsNullOrWhiteSpace(href))
                                {
                                    continue;
                                }

                                href = href.ToLowerInvariant();

                                // filter non-real relation urls:
                                if (href.StartsWith("mailto:", StringComparison.Ordinal) ||
                                    href.StartsWith("javascript:", StringComparison.Ordinal))
                                {
                                    // Drop the whole link. A plain "continue" here would only resume the
                                    // reader loop and the filtered href would still be added below.
                                    skipLink = true;
                                    break;
                                }

                                linkTitle = sgmlReader.GetAttribute("title");
                                linkText  = sgmlReader.ReadInnerXml();
                            }
                        }
                        catch (Exception e)
                        {
                            // Best effort: fall back to whatever href/text has been gathered so far.
                            _log.Debug("Error retrieving title with FastSgmlXPathReader() from HTML page", e);
                        }
                    }

                    if (skipLink || String.IsNullOrEmpty(href))
                    {
                        continue;
                    }

                    href = RelationCosmos.RelationCosmos.UrlTable.Add(href);
                    TitledLink link = new TitledLink(ref href, String.IsNullOrEmpty(linkTitle) ? linkText : linkTitle);

                    // Any() avoids dereferencing the default value that FirstOrDefault() yields on a miss,
                    // so the check is safe whether TitledLink is a reference type or a value type.
                    bool alreadyListed = list.Any(existing => link.Url.Equals(existing.Url, StringComparison.Ordinal));
                    if (!alreadyListed)
                    {
                        list.Add(link);
                    }
                }
            }

            if (list.Count == 0)
            {
                list = GetList <TitledLink> .Empty;
            }

            return(list);
        }