/// <summary>
/// Crawls outward from <c>Seeds</c>, collecting every response in
/// <c>HttpRequestResults</c>, and then flags the links found on crawlable pages
/// as retrieved and/or broken.
/// </summary>
/// <remarks>
/// The method runs in three phases:
/// 1. every seed is requested through <c>BrowserToTest</c>, parsed and stored;
/// 2. the result list is walked in index order and the links of pages on crawlable
///    hosts are requested, parsed and appended until the list holds
///    <c>RecursionLimit</c> entries;
/// 3. each link of each page on a crawlable host is matched against the collected
///    results to set <c>WasRetrieved</c> and <c>IsBroken</c>.
/// </remarks>
public void Crawl()
{
    this.HttpRequestResults = new List<HttpRequestResult>();

    // Phase 1: seeds are always requested; RecursionLimit is not applied here.
    foreach (Uri seed in Seeds)
    {
        HttpRequestResult seedResult = BrowserToTest.Get(seed);
        seedResult.Parse();
        HttpRequestResults.Add(seedResult);
    }

    // Phase 2: the list grows while it is being walked, so newly discovered pages
    // are visited by later iterations of this same loop.
    for (int i = 0; i < HttpRequestResults.Count && HttpRequestResults.Count < RecursionLimit; i++)
    {
        var page = HttpRequestResults[i];
        if (page.Links == null || !IsOnCrawlableHost(page))
        {
            continue;
        }

        foreach (Link link in page.Links)
        {
            // Links carrying an exception in Ex are not followed.
            if (link.Ex != null)
            {
                continue;
            }

            // Any() stops at the first match instead of counting every match.
            bool alreadyRequested = HttpRequestResults.Any(known => known.Equals(link));
            if (!alreadyRequested && HttpRequestResults.Count < RecursionLimit)
            {
                var linkedPage = BrowserToTest.Get(link.AbsoluteUri);
                linkedPage.Parse();
                HttpRequestResults.Add(linkedPage);
            }
        }
    }

    // Phase 3: mark links. The host test depends only on the page that owns the
    // links, so it is evaluated once per page rather than once per candidate match.
    foreach (var result in HttpRequestResults)
    {
        if (result.Links == null || !IsOnCrawlableHost(result))
        {
            continue;
        }

        foreach (var link in result.Links)
        {
            foreach (var result2 in HttpRequestResults)
            {
                if (result2.Equals(link))
                {
                    link.WasRetrieved = true;
                    link.IsBroken = result2.Error != null;
                    break;
                }
            }
        }
    }
}

/// <summary>
/// Tells whether the host of <paramref name="result"/> is one of the hosts returned
/// by <c>GetSetOfCrawlableHosts()</c>.
/// </summary>
/// <param name="result">The request result whose host is tested.</param>
/// <returns>
/// <c>true</c> when the host of <c>ResultUrl</c> (or of <c>RequestUrl</c> when
/// <c>ResultUrl</c> is null) is crawlable; <c>false</c> otherwise, including when
/// both URLs are null.
/// </returns>
private bool IsOnCrawlableHost(HttpRequestResult result)
{
    // ResultUrl can be null; falling back to RequestUrl keeps the crawl phase and
    // the link-marking phase on the same rule and avoids a null dereference.
    Uri effectiveUrl = result.ResultUrl ?? result.RequestUrl;
    return effectiveUrl != null && GetSetOfCrawlableHosts().Contains(effectiveUrl.Host);
}
/// <summary>
/// Creates an <c>HtmlParser</c> that keeps the supplied request result in its
/// <c>HttpRequestResult</c> member.
/// </summary>
/// <param name="HttpRequestResult">The request result this parser is associated with.</param>
public HtmlParser(HttpRequestResult HttpRequestResult)
{
    // The parameter shadows the member of the same name, hence the explicit "this.".
    this.HttpRequestResult = HttpRequestResult;
}
/// <summary>
/// Will be used to link the set of HttpRequestResults and the Links
/// for each HttpRequestResult to generate information on which pages
/// contain links that are broken.
/// </summary>
/// <param name="obj">The request result to compare against; may be null.</param>
/// <returns>
/// <c>true</c> when <c>AbsoluteUri</c> is non-null and equals the
/// <c>RequestUrl</c> of <paramref name="obj"/>; otherwise <c>false</c>
/// (including when <paramref name="obj"/> is null).
/// </returns>
public bool Equals(HttpRequestResult obj)
{
    // An equality test must answer "false" for null rather than throw.
    if (object.ReferenceEquals(obj, null))
    {
        return false;
    }

    return AbsoluteUri != null && AbsoluteUri.Equals(obj.RequestUrl);
}
/// <summary>
/// Returns the result for <paramref name="url"/>, issuing a web request only when
/// <c>HttpRequestResults</c> holds no entry whose <c>RequestUrl</c> equals it.
/// </summary>
/// <param name="url">The address to request.</param>
/// <returns>
/// The previously stored result, or a newly created one. Exceptions raised while
/// performing the request are captured in the result's <c>Error</c> member instead
/// of being thrown; only results of requests that completed without an exception
/// are added to <c>HttpRequestResults</c>.
/// </returns>
public HttpRequestResult Get(Uri url)
{
    HttpRequestResult cached = HttpRequestResults.FirstOrDefault(known => known.RequestUrl.Equals(url));
    if (cached != null)
    {
        return cached;
    }

    HttpRequestResult fresh = new HttpRequestResult
    {
        RequestUrl = url,
        Start = DateTime.Now,
        BrowserUsed = this
    };

    WebResponse webResponse = null;
    StreamReader bodyReader = null;
    try
    {
        HttpWebRequest webRequest = (HttpWebRequest)WebRequest.Create(url);

        // Apply this browser's configuration to the outgoing request.
        webRequest.MaximumAutomaticRedirections = MaximumAutomaticRedirections;
        webRequest.AllowAutoRedirect = AllowAutoRedirect;
        webRequest.UserAgent = UserAgent;
        webRequest.Accept = Accept;
        webRequest.Headers.Add("Accept-Charset", AcceptCharset);
        webRequest.Headers.Add("Accept-Language", AcceptLanguage);
        webRequest.Credentials = this.Credentials;

        webResponse = webRequest.GetResponse();
        bodyReader = new StreamReader(webResponse.GetResponseStream());

        // The body is always read to the end, but only kept for CSS and HTML.
        string body = bodyReader.ReadToEnd();
        fresh.ContentType = webResponse.ContentType;
        if (fresh.IsCss || fresh.IsHtml)
        {
            fresh.Content = body;
        }

        fresh.ResultUrl = webRequest.Address;
        HttpRequestResults.Add(fresh);
    }
    catch (WebException webFailure)
    {
        HttpValidationError validationError = new HttpValidationError
        {
            AbsoluteUri = url,
            Error = webFailure,
            Message = webFailure.Message
        };

        // The HTTP status code is only read for protocol-level errors.
        if (webFailure.Status == WebExceptionStatus.ProtocolError)
        {
            validationError.HttpCode = (int)((HttpWebResponse)webFailure.Response).StatusCode;
        }

        fresh.Error = validationError;
    }
    catch (Exception failure)
    {
        fresh.Error = new HttpValidationError
        {
            AbsoluteUri = url,
            Error = failure,
            Message = failure.Message
        };
    }
    finally
    {
        fresh.End = DateTime.Now;

        // Best-effort cleanup: a failure while closing must not mask the outcome.
        if (bodyReader != null)
        {
            try
            {
                bodyReader.Close();
            }
            catch
            {
            }
        }

        if (webResponse != null)
        {
            try
            {
                webResponse.Close();
            }
            catch
            {
            }
        }
    }

    return fresh;
}
/// <summary>
/// Creates a <c>CssParser</c> that keeps the supplied request result in its
/// <c>HttpRequestResult</c> member.
/// </summary>
/// <param name="HttpRequestResult">The request result this parser is associated with.</param>
public CssParser(HttpRequestResult HttpRequestResult)
{
    // The parameter shadows the member of the same name, hence the explicit "this.".
    this.HttpRequestResult = HttpRequestResult;
}