/// <summary>
/// Crawls starting at <c>StartUri</c>: pending pages are dequeued and handed to the
/// configured page processor until the queue is empty or the configured maximum
/// number of processed URIs is reached.
/// </summary>
/// <returns>
/// The report for this run. A page state is recorded in it when the page failed,
/// or always when <c>spiderOptions.ShowSuccessUrls</c> is set.
/// </returns>
public Report Execute()
{
    Report report = new Report { StartUri = StartUri };
    Logger.Info(report.ToString(Report.ReportFormat.Head));

    // Seed the queue with the start page.
    AddWebPage(StartUri, StartUri.AbsoluteUri);

    Stopwatch sw = new Stopwatch();

    // A UriProcessedCountMax of -1 disables the page-count limit.
    while (webPagesPending.Count > 0
           && (spiderOptions.UriProcessedCountMax == -1
               || report.PagesProcessed < spiderOptions.UriProcessedCountMax))
    {
        WebPageState state = (WebPageState)webPagesPending.Dequeue();

        // Reset before every page: a Stopwatch keeps accumulating across Start/Stop
        // pairs, which would otherwise report the total crawl time for each page.
        sw.Reset();
        sw.Start();
        spiderOptions.WebPageProcessor.Process(state);
        sw.Stop();
        state.ElapsedTimeSpan = sw.Elapsed;

        if (spiderOptions.ShowSuccessUrls || !state.IsOk)
        {
            report.PageStates.Add(state);
        }

        report.PagesProcessed++;
        Logger.Info(Resources.WebSpiderExecuteProcessedUrlsInfo, report.PagesProcessed, webPagesPending.Count, state);
    }

    report.EndTime = DateTime.Now;
    Logger.Info(report.ToString(Report.ReportFormat.Footer));
    return report;
}
/// <summary>
/// Issues a GET for the page described by <paramref name="state"/> (or for
/// <paramref name="redirect"/> when following a redirect), records status
/// information on the state and follows redirects manually so that every hop is
/// appended to <c>state.Redirects</c>.
/// </summary>
/// <param name="state">Page state that receives status code, description, redirects and content.</param>
/// <param name="redirect">Redirect target to request instead of <c>state.Uri</c>; null for the initial request.</param>
/// <returns>
/// The response of the last request made, or null when no HTTP response was obtained
/// (including when a <see cref="WebException"/> was caught). The returned response has
/// already been closed, so it is only useful as a "got an answer" indicator.
/// </returns>
private WebResponse GetDestinationResponse(WebPageState state, Uri redirect = null)
{
    // Upper bound on followed hops so a redirect loop cannot recurse forever.
    const int MaxRedirects = 50;

    Uri requestUri = redirect ?? state.Uri;
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(requestUri);
    request.Method = "GET";
    request.UserAgent = Config.UserAgent;
    // Redirects are followed by this method so each hop can be recorded.
    request.AllowAutoRedirect = false;
    if (Options != null && Options.Credential != null)
    {
        request.Credentials = Options.Credential;
    }

    bool isRedirect = false;
    HttpWebResponse response = null;
    try
    {
        response = request.GetResponse() as HttpWebResponse;
        if (response == null)
        {
            return null;
        }

        // The first request sets the page status; later ones set the status of the
        // redirect hop that led to them.
        if (redirect == null)
        {
            state.StatusCode = response.StatusCode;
        }
        else if (state.Redirects.Count > 0)
        {
            state.Redirects[state.Redirects.Count - 1].StatusCode = response.StatusCode;
        }
        state.StatusCodeDescription = response.StatusDescription;

        if (WebPageState.GetStatus(response.StatusCode) == WebPageState.PageStatus.Redirect)
        {
            // Location may be relative or absent; resolve it against the URI that was
            // just requested and stop following when it cannot be resolved.
            string location = response.Headers["Location"];
            Uri target;
            if (!string.IsNullOrEmpty(location)
                && state.Redirects.Count < MaxRedirects
                && Uri.TryCreate(requestUri, location, out target))
            {
                isRedirect = true;
                requestUri = target;
                state.Redirects.Add(new WebPageState.WebRequestState { Uri = target });
            }
        }
        else if (response.ContentType.StartsWith("text", StringComparison.OrdinalIgnoreCase))
        {
            Stream stream = response.GetResponseStream();
            if (stream != null)
            {
                using (StreamReader reader = new StreamReader(stream))
                {
                    state.Content = reader.ReadToEnd();
                }
            }
        }
    }
    catch (WebException ex)
    {
        Logger.LogException(LogLevel.Fatal, ex.Status.ToString(), ex);
        HttpWebResponse errorResponse = ex.Response as HttpWebResponse;
        if (errorResponse != null)
        {
            state.StatusCode = errorResponse.StatusCode;
            state.StatusCodeDescription = errorResponse.StatusDescription;
            // Release the connection held by the error response.
            errorResponse.Close();
        }
    }
    finally
    {
        if (response != null)
        {
            response.Close();
        }
    }

    return isRedirect ? GetDestinationResponse(state, requestUri) : response;
}
/// <summary>
/// Extracts links from the content of <paramref name="state"/> and queues each one
/// through <c>AddWebPage</c>, resolved against the page's own URI.
/// </summary>
/// <param name="state">Processed page; ignored when null or when <c>IsContinueProcess</c> is false.</param>
public void HandleLinks(WebPageState state)
{
    // Content can be unset (it is only assigned for text responses), so there is
    // nothing to scan in that case.
    if (state == null || !state.IsContinueProcess || state.Content == null)
    {
        return;
    }

    // Check Success before using a match: a page without links yields a failed
    // first match whose "url" group is empty.
    for (Match m = RegExUtil.GetMatchRegEx(state.Content); m.Success; m = m.NextMatch())
    {
        AddWebPage(state.Uri, m.Groups["url"].ToString());
    }
}
/// <summary>
/// Scans the page content for links and hands every match's "url" group to
/// <c>AddWebPage</c>, using the page URI as the base for relative links.
/// </summary>
/// <param name="state">Page whose content is scanned. Nothing happens when it is null or not flagged with <c>IsContinueProcess</c>.</param>
public void HandleLinks(WebPageState state)
{
    if (state != null && state.IsContinueProcess && state.Content != null)
    {
        Match m = RegExUtil.GetMatchRegEx(state.Content);

        // Test-first loop: the first match may have failed (page without links), in
        // which case no link must be added.
        while (m.Success)
        {
            AddWebPage(state.Uri, m.Groups["url"].ToString());
            m = m.NextMatch();
        }
    }
}
/// <summary>
/// Fetches the page for <paramref name="state"/> and, when that worked, passes the
/// state on to <c>ContentHandler</c>.
/// </summary>
/// <param name="state">Page state to fetch and update.</param>
/// <returns>
/// True when a response was obtained, the state reports OK and a content handler is
/// set (the handler has then been invoked); otherwise false.
/// </returns>
public bool Process(WebPageState state)
{
    WebResponse destination = GetDestinationResponse(state);

    if (destination == null || !state.IsOk || ContentHandler == null)
    {
        return false;
    }

    ContentHandler(state);
    return true;
}
/// <summary>
/// Retrieves the destination response for <paramref name="state"/> and invokes
/// <c>ContentHandler</c> on the state if retrieval succeeded.
/// </summary>
/// <param name="state">Page state that is fetched and then handed to the content handler.</param>
/// <returns>Whether the content handler was invoked.</returns>
public bool Process(WebPageState state)
{
    bool handled = false;

    // The handler only runs for a non-null response, an OK state and a configured handler.
    if (GetDestinationResponse(state) != null)
    {
        handled = state.IsOk && ContentHandler != null;
    }

    if (handled)
    {
        ContentHandler(state);
    }

    return(handled);
}
/// <summary>
/// Resolves <paramref name="newUri"/> against <paramref name="baseUri"/> and, when the
/// resulting URI has not been seen before, registers it and queues it for processing.
/// </summary>
/// <param name="baseUri">URI that relative links are resolved against.</param>
/// <param name="newUri">Absolute or relative link text; a null or unparsable link is skipped.</param>
private void AddWebPage(Uri baseUri, string newUri)
{
    if (newUri == null)
    {
        return;
    }

    // Remove any fragment ("#anchor"), including links that consist of a fragment only.
    int fragmentStart = newUri.IndexOf('#');
    string url = fragmentStart >= 0 ? newUri.Substring(0, fragmentStart) : newUri;

    // A malformed link must not abort the crawl, so it is skipped instead of letting
    // the Uri constructor throw.
    Uri uri;
    if (!Uri.TryCreate(baseUri, url, out uri))
    {
        return;
    }

    if (webPages.Contains(uri))
    {
        return;
    }

    var state = new WebPageState(uri)
    {
        // Only pages whose address starts with the configured base URI are flagged
        // for further processing.
        IsContinueProcess = uri.AbsoluteUri.StartsWith(spiderOptions.BaseUri.AbsoluteUri, StringComparison.OrdinalIgnoreCase)
    };

    webPagesPending.Enqueue(state);
    webPages.Add(uri, state);
}
/// <summary>
/// Queues a newly discovered link for processing unless it was already registered.
/// The link is resolved against <paramref name="baseUri"/> after its fragment part
/// has been removed.
/// </summary>
/// <param name="baseUri">Base for resolving relative links.</param>
/// <param name="newUri">Link as found; null links and links that cannot be parsed are ignored.</param>
private void AddWebPage(Uri baseUri, string newUri)
{
    if (newUri == null) return;

    // Remove any anchors. An index of 0 counts too: "#top" refers to the page itself.
    string url = newUri;
    int index = newUri.IndexOf('#');
    if (index >= 0) url = newUri.Substring(0, index);

    // TryCreate instead of the constructor so a malformed link is skipped rather
    // than raising UriFormatException in the middle of the crawl.
    Uri uri;
    bool isUsable = Uri.TryCreate(baseUri, url, out uri);
    if (!isUsable || webPages.Contains(uri)) return;

    // Pages outside the configured base URI are still queued, but not flagged for
    // further processing.
    bool isInsideBase = uri.AbsoluteUri.StartsWith(spiderOptions.BaseUri.AbsoluteUri, StringComparison.OrdinalIgnoreCase);
    var state = new WebPageState(uri) { IsContinueProcess = isInsideBase };

    webPagesPending.Enqueue(state);
    webPages.Add(uri, state);
}
/// <summary>
/// Performs a GET request for <c>state.Uri</c>, or for <paramref name="redirect"/> when
/// called recursively, and follows redirects itself so that every hop ends up in
/// <c>state.Redirects</c>. Status code and description are written to the state, and
/// the body is stored in <c>state.Content</c> when the content type starts with "text".
/// </summary>
/// <param name="state">Page state to update.</param>
/// <param name="redirect">Target of the redirect being followed; null on the initial call.</param>
/// <returns>
/// The last response received (already closed), or null when the request produced no
/// HTTP response or a <see cref="WebException"/> was caught.
/// </returns>
private WebResponse GetDestinationResponse(WebPageState state, Uri redirect = null)
{
    // Stop following after this many hops so a redirect cycle cannot recurse forever.
    const int MaxRedirects = 50;

    Uri requestUri = redirect ?? state.Uri;
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(requestUri);
    request.Method = "GET";
    request.UserAgent = Config.UserAgent;
    // Automatic redirects are disabled because each hop is recorded below.
    request.AllowAutoRedirect = false;
    if (Options != null && Options.Credential != null)
    {
        request.Credentials = Options.Credential;
    }

    bool isRedirect = false;
    HttpWebResponse response = null;
    try
    {
        if ((response = request.GetResponse() as HttpWebResponse) == null)
        {
            return(null);
        }

        if (redirect == null)
        {
            state.StatusCode = response.StatusCode;
        }
        else if (state.Redirects.Count > 0)
        {
            // While following a redirect, the status belongs to the latest hop.
            state.Redirects[state.Redirects.Count - 1].StatusCode = response.StatusCode;
        }
        state.StatusCodeDescription = response.StatusDescription;

        if (WebPageState.GetStatus(response.StatusCode) == WebPageState.PageStatus.Redirect)
        {
            // The Location header can be relative or missing. Resolve it against the
            // current request URI; an unusable header ends the redirect chain here.
            Uri next;
            string location = response.Headers["Location"];
            bool canFollow = !string.IsNullOrEmpty(location)
                             && state.Redirects.Count < MaxRedirects
                             && Uri.TryCreate(requestUri, location, out next);
            if (canFollow)
            {
                isRedirect = true;
                requestUri = next;
                state.Redirects.Add(new WebPageState.WebRequestState { Uri = requestUri });
            }
        }
        else if (response.ContentType.StartsWith("text", StringComparison.OrdinalIgnoreCase))
        {
            Stream stream = response.GetResponseStream();
            if (stream != null)
            {
                // Dispose the reader (and with it the stream) once the body is read.
                using (StreamReader reader = new StreamReader(stream))
                {
                    state.Content = reader.ReadToEnd();
                }
            }
        }
    }
    catch (WebException ex)
    {
        Logger.LogException(LogLevel.Fatal, ex.Status.ToString(), ex);
        HttpWebResponse failed = ex.Response as HttpWebResponse;
        if (failed != null)
        {
            state.StatusCode = failed.StatusCode;
            state.StatusCodeDescription = failed.StatusDescription;
            // The error response holds a connection too; close it once inspected.
            failed.Close();
        }
    }
    finally
    {
        if (response != null)
        {
            response.Close();
        }
    }

    return(isRedirect ? GetDestinationResponse(state, requestUri) : response);
}