//dup to FindTitle() - which one we should use?
/// <summary>
/// Requests the page at <paramref name="url"/> and returns the content of its
/// title element, read with a <c>FastSgmlXPathReader</c>.
/// </summary>
/// <param name="url">Address of the HTML page to request.</param>
/// <param name="defaultIfNoMatch">Value returned when no title element is read.</param>
/// <param name="proxy">Proxy assigned to the web request.</param>
/// <param name="credentials">Credentials assigned to the web request.</param>
/// <returns>
/// The content of the first element found at <c>//html/title</c> or
/// <c>//html/head/title</c>; otherwise <paramref name="defaultIfNoMatch"/>.
/// </returns>
/// <remarks>
/// Only exceptions raised while reading the markup are caught (and logged at
/// debug level). Creating the request and obtaining the response happen outside
/// that handler, so their exceptions reach the caller.
/// </remarks>
public static string FindTitle2(string url, string defaultIfNoMatch, IWebProxy proxy, ICredentials credentials)
{
    HttpWebRequest webRequest = (HttpWebRequest)WebRequest.Create(url);
    webRequest.AllowAutoRedirect = true;
    webRequest.Proxy = proxy;
    webRequest.Credentials = credentials;
    webRequest.Timeout = 5 * 1000; // 5 second timeout

    if (FeedSource.SetCookies)
    {
        HttpCookieManager.SetCookies(webRequest);
    }

    // use bogus user agent since some sites will bounce you to
    // unsupported browser page otherwise
    webRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1;)";

    string pageTitle = defaultIfNoMatch;

    using (HttpWebResponse webResponse = (HttpWebResponse)webRequest.GetResponse())
    using (FastSgmlXPathReader htmlReader = new FastSgmlXPathReader())
    using (StreamReader bodyReader = new StreamReader(webResponse.GetResponseStream()))
    {
        try
        {
            htmlReader.InputStream = bodyReader;
            htmlReader.DocType = "HTML";
            htmlReader.CaseFolding = CaseFolding.ToLower;

            while (htmlReader.Read())
            {
                if (htmlReader.NodeType != XmlNodeType.Element)
                {
                    continue;
                }

                string path = htmlReader.XPath;

                // The title may sit directly under <html> or inside <head>.
                if (path == "//html/title" || path == "//html/head/title")
                {
                    pageTitle = htmlReader.ReadElementContentAsString();
                    break;
                }

                // Stop scanning once the body starts.
                if (path == "//html/body")
                {
                    break;
                }
            }
        }
        catch (Exception e)
        {
            _log.Debug("Error retrieving title from HTML page at " + url, e);
        }
    }

    return pageTitle;
}
/// <summary>
/// Collects the hyperlinks contained in an HTML fragment, each paired with a
/// display title.
/// </summary>
/// <param name="html">HTML text to scan; may be null or empty.</param>
/// <returns>
/// One <c>TitledLink</c> per distinct URL (ordinal comparison), in order of first
/// appearance. Returns <c>GetList&lt;TitledLink&gt;.Empty</c> when
/// <paramref name="html"/> is null or empty, or when no link was collected.
/// </returns>
/// <remarks>
/// Candidates are located with <c>RegExFindHref</c>. Each matched fragment is then
/// re-read with a <c>FastSgmlXPathReader</c> to obtain the anchor's <c>href</c> and
/// <c>title</c> attributes and its inner markup. URLs are lower-cased before being
/// registered in the URL table. Links whose URL starts with <c>mailto:</c> or
/// <c>javascript:</c> are left out. Reader failures are logged at debug level and
/// the values gathered so far are used.
/// </remarks>
public static List<TitledLink> RetrieveTitledLinks(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return GetList<TitledLink>.Empty;
    }

    List<TitledLink> list = new List<TitledLink>();

    // One reader is reused for every match and disposed once scanning is done.
    using (FastSgmlXPathReader sgmlReader = new FastSgmlXPathReader
    {
        DocType = "HTML",
        CaseFolding = CaseFolding.ToLower,
        IgnoreDtd = true
    })
    {
        for (Match m = RegExFindHref.Match(html); m.Success; m = m.NextMatch())
        {
            string href = m.Groups[1].Value.ToLower();

            // filter non-real relation urls:
            if (href.StartsWith("mailto:") || href.StartsWith("javascript:"))
            {
                continue;
            }
            if (href.Length == 0)
            {
                continue;
            }

            var linkText = m.Groups[2].Value;
            var linkTitle = linkText;

            // Set when the parsed href attribute turns out to be a filtered scheme.
            // A plain 'continue' inside the read loop below would only advance the
            // reader, not skip this match, so the decision is carried out of the loop.
            bool skipLink = false;

            using (var inputStreamReader = new StringReader(m.Groups[0].Value))
            {
                try
                {
                    sgmlReader.InputStream = inputStreamReader;
                    while (sgmlReader.Read())
                    {
                        if (sgmlReader.NodeType != XmlNodeType.Element || sgmlReader.Name != "a")
                        {
                            continue;
                        }

                        href = sgmlReader.GetAttribute("href");
                        if (String.IsNullOrWhiteSpace(href))
                        {
                            continue;
                        }

                        href = href.ToLower();

                        // filter non-real relation urls:
                        if (href.StartsWith("mailto:") || href.StartsWith("javascript:"))
                        {
                            skipLink = true;
                            continue; // keep draining the reader as before
                        }

                        linkTitle = sgmlReader.GetAttribute("title");
                        linkText = sgmlReader.ReadInnerXml();
                    }
                }
                catch (Exception e)
                {
                    _log.Debug("Error retrieving title with FastSgmlXPathReader() from HTML page", e);
                }
            }

            if (skipLink || String.IsNullOrEmpty(href))
            {
                continue;
            }

            href = RelationCosmos.RelationCosmos.UrlTable.Add(href);
            TitledLink link = new TitledLink(ref href, String.IsNullOrEmpty(linkTitle) ? linkText : linkTitle);

            // Add the link only if no entry with the same URL is already present.
            var found = list.FirstOrDefault(newLink => link.Url.Equals(newLink.Url, StringComparison.Ordinal));
            if (String.IsNullOrEmpty(found.Url))
            {
                list.Add(link);
            }
        }
    }

    if (list.Count == 0)
    {
        list = GetList<TitledLink>.Empty;
    }

    return list;
}