Esempio n. 1
0
        /// <summary>
        /// Downloads the page at <paramref name="url"/> and returns the text content of the
        /// first <c>title</c> element encountered while reading the markup.
        /// </summary>
        /// <remarks>
        /// Best effort: an exception raised while obtaining the response or parsing the
        /// markup is logged at debug level and <paramref name="defaultIfNoMatch"/> is
        /// returned. Exceptions raised while creating or configuring the request
        /// (before the response is requested) are not caught here.
        /// </remarks>
        /// <param name="url">The URL of the page</param>
        /// <param name="defaultIfNoMatch">string to return, if no match was found</param>
        /// <param name="proxy">Proxy server to direct the request through</param>
        /// <param name="credentials">Credentials for authenticating the request</param>
        /// <returns>
        /// The content of the title element, or <paramref name="defaultIfNoMatch"/> when
        /// no title element was found or the page could not be retrieved/parsed.
        /// </returns>
        //dup to FindTitle2() - which one we should use?
        public static string FindTitle(string url, string defaultIfNoMatch, IWebProxy proxy, ICredentials credentials)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

            request.AllowAutoRedirect = true;
            request.Proxy             = proxy;
            request.Credentials       = credentials;
            request.Timeout           = 5 * 1000 /* 5 second timeout */;

            if (FeedSource.SetCookies)
            {
                HttpCookieManager.SetCookies(request);
            }

            /* use bogus user agent since some sites will bounce you to unsupported browser page otherwise */
            request.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1;)";

            string title = defaultIfNoMatch;

            try
            {
                // Dispose the response itself (not only its stream) so the underlying
                // connection is released even when parsing stops early or fails.
                using (WebResponse response = request.GetResponse())
                using (Stream stream = response.GetResponseStream())
                using (StreamReader streamReader = new StreamReader(stream))
                {
                    SgmlReader reader = new SgmlReader();
                    reader.InputStream = streamReader;

                    while (reader.Read())
                    {
                        // Ordinal, case-insensitive match: element names are identifiers,
                        // so the comparison must not depend on the current culture.
                        if (reader.NodeType == XmlNodeType.Element &&
                            string.Equals(reader.Name, "title", StringComparison.OrdinalIgnoreCase))
                        {
                            title = reader.ReadElementContentAsString();
                            break;
                        }
                    } //while
                }
            }
            catch (Exception e)
            {
                _log.Debug("Error retrieving title from HTML page at " + url, e);
            }

            return title;
        }
Esempio n. 2
0
        /// <summary>
        /// Downloads the page at <paramref name="url"/> and returns the content of its
        /// title element, looking only at the XPath locations <c>//html/title</c> and
        /// <c>//html/head/title</c>. Reading stops as soon as a title is found or the
        /// <c>//html/body</c> element is reached.
        /// </summary>
        /// <remarks>
        /// Best effort, matching <c>FindTitle</c>: an exception raised while obtaining the
        /// response or parsing the markup is logged at debug level and
        /// <paramref name="defaultIfNoMatch"/> is returned. Exceptions raised while
        /// creating or configuring the request are not caught here.
        /// </remarks>
        /// <param name="url">The URL of the page</param>
        /// <param name="defaultIfNoMatch">string to return, if no match was found</param>
        /// <param name="proxy">Proxy server to direct the request through</param>
        /// <param name="credentials">Credentials for authenticating the request</param>
        /// <returns>
        /// The content of the title element, or <paramref name="defaultIfNoMatch"/> when
        /// no title was found before the body or the page could not be retrieved/parsed.
        /// </returns>
        //dup to FindTitle() - which one we should use?
        public static string FindTitle2(string url, string defaultIfNoMatch, IWebProxy proxy, ICredentials credentials)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

            request.AllowAutoRedirect = true;
            request.Proxy             = proxy;
            request.Credentials       = credentials;
            request.Timeout           = 5 * 1000 /* 5 second timeout */;

            if (FeedSource.SetCookies)
            {
                HttpCookieManager.SetCookies(request);
            }

            /* use bogus user agent since some sites will bounce you to unsupported browser page otherwise */
            request.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1;)";

            string title = defaultIfNoMatch;

            try
            {
                // The try block encloses GetResponse() as well, so a timeout or HTTP error
                // falls back to defaultIfNoMatch instead of escaping to the caller.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (FastSgmlXPathReader sgmlReader = new FastSgmlXPathReader())
                using (StreamReader inputStreamReader = new StreamReader(response.GetResponseStream()))
                {
                    sgmlReader.InputStream = inputStreamReader;
                    sgmlReader.DocType     = "HTML";
                    sgmlReader.CaseFolding = CaseFolding.ToLower;
                    bool done = false;

                    while (!done && sgmlReader.Read())
                    {
                        if (sgmlReader.NodeType != XmlNodeType.Element)
                        {
                            continue;
                        }

                        switch (sgmlReader.XPath)
                        {
                        case "//html/title":
                        case "//html/head/title":
                            title = sgmlReader.ReadElementContentAsString();
                            done  = true;
                            break;

                        case "//html/body":
                            // Reached the body without seeing a title: stop reading.
                            done = true;
                            break;
                        }
                    } //while
                }
            }
            catch (Exception e)
            {
                _log.Debug("Error retrieving title from HTML page at " + url, e);
            }

            return title;
        }