/// <summary>
/// Extracts the links referenced by the <c>href</c> attributes of the anchors
/// that <c>anchorsRegex</c> finds in an HTML document.
/// </summary>
/// <param name="nodeContent">
/// Downloaded content of a node. Only content whose type starts with
/// <c>text/html</c> (compared case-insensitively) is scanned.
/// </param>
/// <returns>
/// One <see cref="Link"/> per anchor whose href could be turned into a URI;
/// an empty list when the content is not HTML. URL fragments (<c>#...</c>)
/// are stripped before the link is created. Any unexpected exception is
/// reported through <c>myCrawlListener</c> and the links collected so far
/// are returned.
/// </returns>
public List<Link> EvalLinks(NodeContent nodeContent)
{
    List<Link> results = new List<Link>();

    string contentType = nodeContent.ContentType;
    if (contentType == null || !contentType.StartsWith("text/html", StringComparison.OrdinalIgnoreCase))
    {
        return results;
    }

    try
    {
        // Base used to resolve relative hrefs. TryCreate leaves baseUri null
        // when the node's address is not an absolute URI.
        Uri baseUri;
        Uri.TryCreate(nodeContent.Node.Uri.ToString(), UriKind.Absolute, out baseUri);

        foreach (Match anchor in anchorsRegex.Matches(nodeContent.Content))
        {
            string url = ExtractHrefValue(anchor.Value);
            if (url == null)
            {
                continue;
            }

            // The fragment only addresses a position inside the target page.
            int fragmentStart = url.IndexOf('#');
            if (fragmentStart >= 0)
            {
                url = url.Substring(0, fragmentStart);
            }

            // Resolving against the base handles "../x", "/x", "x" and
            // "//host/x" according to the URI rules, and also accepts hrefs
            // that are already absolute. String concatenation would append
            // the href to the page name instead of replacing it.
            Uri uri;
            bool resolved = baseUri != null
                ? Uri.TryCreate(baseUri, url, out uri)
                : Uri.TryCreate(url, UriKind.Absolute, out uri);

            if (resolved && uri != null)
            {
                results.Add(new Link(uri));
            }
        }
    }
    catch (Exception ex)
    {
        myCrawlListener.ExceptionRaised(this, ex);
    }

    return results;
}

/// <summary>
/// Returns the value of the first quoted <c>href</c> attribute in an anchor tag.
/// </summary>
/// <param name="anchorTag">Text of the anchor tag.</param>
/// <returns>
/// The text between the quotes (single or double) that follow <c>href=</c>,
/// the rest of the tag if the closing quote is missing, or <c>null</c> when
/// the tag has no quoted href.
/// </returns>
private static string ExtractHrefValue(string anchorTag)
{
    const string HrefAttribute = "href=";

    int searchFrom = 0;
    while (true)
    {
        int attributeStart = anchorTag.IndexOf(HrefAttribute, searchFrom, StringComparison.OrdinalIgnoreCase);
        if (attributeStart < 0)
        {
            return null;
        }

        int quoteIndex = attributeStart + HrefAttribute.Length;
        if (quoteIndex < anchorTag.Length)
        {
            char quote = anchorTag[quoteIndex];
            if (quote == '"' || quote == '\'')
            {
                int valueStart = quoteIndex + 1;
                int valueEnd = anchorTag.IndexOf(quote, valueStart);
                return valueEnd < 0
                    ? anchorTag.Substring(valueStart)
                    : anchorTag.Substring(valueStart, valueEnd - valueStart);
            }
        }

        // Not followed by a quote: keep looking for a later quoted href.
        searchFrom = quoteIndex;
    }
}
/// <summary>
/// Issues an HTTP request for <paramref name="uri"/> and returns the response
/// body when the server answers 200 OK with a content type starting with
/// <c>text</c>.
/// </summary>
/// <param name="uri">Address to request.</param>
/// <param name="status">
/// Status code of the response, or of the error response carried by a
/// <see cref="WebException"/>; <see cref="HttpStatusCode.NoContent"/> when no
/// response was obtained.
/// </param>
/// <param name="contentType">Content type reported by the response; empty when none was obtained.</param>
/// <param name="contentLength">Content length reported by the response; 0 when none was obtained.</param>
/// <returns>The body text, or an empty string when the body was not read.</returns>
/// <remarks>
/// Exceptions raised while sending the request or reading the response are
/// reported through <c>myCrawlListener</c> rather than propagated. Exceptions
/// raised while creating the request are not caught here.
/// </remarks>
private string ReadUri(Uri uri, out HttpStatusCode status, out string contentType, out long contentLength)
{
    string body = string.Empty;
    status = HttpStatusCode.NoContent;
    contentType = string.Empty;
    contentLength = 0;

    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(uri);

    // User-Agent is a restricted header on HttpWebRequest: it has to be set
    // through the dedicated property, adding it to Headers can throw.
    request.UserAgent = userAgent;

    try
    {
        // The response is disposed on every path, including non-200 and
        // non-text answers, so the underlying connection is always released.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            status = response.StatusCode;
            contentType = response.ContentType ?? string.Empty;
            contentLength = response.ContentLength;

            bool isText = contentType.StartsWith("text", StringComparison.OrdinalIgnoreCase);
            if (status == HttpStatusCode.OK && isText)
            {
                using (Stream data = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(data))
                {
                    body = reader.ReadToEnd();
                }
            }
        }
    }
    catch (WebException ex)
    {
        WebResponse failedResponse = ex.Response;
        try
        {
            HttpWebResponse errorResponse = failedResponse as HttpWebResponse;
            if (errorResponse != null)
            {
                status = errorResponse.StatusCode;
            }

            myCrawlListener.ExceptionRaised(this, ex);
        }
        finally
        {
            // Closed only after the listener has been notified, so the
            // listener can still inspect the response; releases the connection.
            if (failedResponse != null)
            {
                failedResponse.Close();
            }
        }
    }
    catch (Exception ex)
    {
        myCrawlListener.ExceptionRaised(this, ex);
    }

    return body;
}