Пример #1
0
        /// <summary>
        /// Runs the crawl: queues <c>StartUri</c>, then processes pending pages one at a time until
        /// the queue is empty or the configured maximum number of processed pages is reached.
        /// </summary>
        /// <returns>
        /// The report for this run, carrying the processed-page count, the recorded page states
        /// and the end time.
        /// </returns>
        public Report Execute()
        {
            Report report = new Report {
                StartUri = StartUri
            };

            Logger.Info(report.ToString(Report.ReportFormat.Head));

            AddWebPage(StartUri, StartUri.AbsoluteUri);
            Stopwatch sw = new Stopwatch();

            // A UriProcessedCountMax of -1 disables the page limit.
            while (webPagesPending.Count > 0 &&
                   (spiderOptions.UriProcessedCountMax == -1 || report.PagesProcessed < spiderOptions.UriProcessedCountMax))
            {
                WebPageState state = (WebPageState)webPagesPending.Dequeue();

                // Restart() zeroes the stopwatch before starting it; Start() alone would keep
                // accumulating, making every page's elapsed time the running total of the crawl.
                sw.Restart();
                spiderOptions.WebPageProcessor.Process(state);
                sw.Stop();

                state.ElapsedTimeSpan = sw.Elapsed;

                // Pages that are OK are only recorded when the options ask for them.
                if (spiderOptions.ShowSuccessUrls || !state.IsOk)
                {
                    report.PageStates.Add(state);
                }

                report.PagesProcessed++;
                Logger.Info(Resources.WebSpiderExecuteProcessedUrlsInfo, report.PagesProcessed, webPagesPending.Count, state);
            }

            report.EndTime = DateTime.Now;
            Logger.Info(report.ToString(Report.ReportFormat.Footer));
            return(report);
        }
Пример #2
0
        /// <summary>
        /// Issues a GET request for <paramref name="redirect"/> (or <c>state.Uri</c> when no redirect
        /// is given), records the outcome on <paramref name="state"/> and follows redirect responses
        /// manually by calling itself with the redirect target.
        /// </summary>
        /// <param name="state">
        /// Page state that receives the status code, status description, redirect chain and,
        /// for responses whose content type starts with "text", the response body.
        /// </param>
        /// <param name="redirect">
        /// Redirect target to request instead of <c>state.Uri</c>; <c>null</c> for the initial request.
        /// </param>
        /// <returns>
        /// The last response obtained, or <c>null</c> when the request did not yield an
        /// <see cref="HttpWebResponse"/>. The returned response has already been closed, so it is
        /// only useful as a "got a response" marker.
        /// </returns>
        private WebResponse GetDestinationResponse(WebPageState state, Uri redirect = null)
        {
            // Upper bound on the redirect chain so that a redirect cycle cannot recurse forever.
            const int MaxRedirects = 50;

            Uri requestUri = redirect ?? state.Uri;
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(requestUri);
            request.Method = "GET";
            request.UserAgent = Config.UserAgent;
            // Redirects are followed by hand below so that each hop can be recorded on the state.
            request.AllowAutoRedirect = false;

            if (Options != null && Options.Credential != null)
                request.Credentials = Options.Credential;

            bool isRedirect = false;

            HttpWebResponse response = null;

            try
            {
                if ((response = request.GetResponse() as HttpWebResponse) == null)
                    return null;

                // The first request sets the page status; later hops set the status of the
                // most recently recorded redirect.
                if (redirect == null)
                    state.StatusCode = response.StatusCode;
                else if (state.Redirects.Count > 0)
                {
                    state.Redirects[state.Redirects.Count - 1].StatusCode = response.StatusCode;
                }

                state.StatusCodeDescription = response.StatusDescription;

                if (WebPageState.GetStatus(response.StatusCode) == WebPageState.PageStatus.Redirect)
                {
                    // Location may be relative or missing; resolve it against the URI that was
                    // just requested and stop following when it cannot be resolved.
                    string location = response.Headers["Location"];
                    Uri target;
                    if (!string.IsNullOrEmpty(location) &&
                        state.Redirects.Count < MaxRedirects &&
                        Uri.TryCreate(requestUri, location, out target))
                    {
                        isRedirect = true;
                        requestUri = target;
                        state.Redirects.Add(new WebPageState.WebRequestState { Uri = requestUri});
                    }
                }
                else if (response.ContentType.StartsWith("text", StringComparison.OrdinalIgnoreCase))
                {
                    Stream stream = response.GetResponseStream();
                    if (stream != null)
                    {
                        using (StreamReader reader = new StreamReader(stream))
                            state.Content = reader.ReadToEnd();
                    }
                }
            }
            catch (WebException ex)
            {
                Logger.LogException(LogLevel.Fatal, ex.Status.ToString(), ex);

                HttpWebResponse errorResponse = ex.Response as HttpWebResponse;
                if (errorResponse != null)
                {
                    state.StatusCode = errorResponse.StatusCode;
                    state.StatusCodeDescription = errorResponse.StatusDescription;
                }

                // The response attached to the exception holds a connection until it is closed.
                if (ex.Response != null)
                    ex.Response.Close();
            }
            finally
            {
                if (response != null)
                    response.Close();
            }

            return isRedirect ? GetDestinationResponse(state, requestUri) : response;
        }
Пример #3
0
 /// <summary>
 /// Runs the link regular expression over the page content and registers every matched
 /// "url" group through <c>AddWebPage</c>, using the page's own URI as the base.
 /// </summary>
 /// <param name="state">
 /// Page to scan. Nothing happens when it is <c>null</c>, is not flagged
 /// <c>IsContinueProcess</c>, or has no content.
 /// </param>
 public void HandleLinks(WebPageState state)
 {
     // Content can be null (e.g. it was never populated); there is nothing to scan then.
     if (state == null || !state.IsContinueProcess || state.Content == null)
     {
         return;
     }

     // Check Success before using a match: the first match may already be a failed one
     // when the content contains no links.
     for (Match m = RegExUtil.GetMatchRegEx(state.Content); m.Success; m = m.NextMatch())
     {
         AddWebPage(state.Uri, m.Groups["url"].ToString());
     }
 }
Пример #4
0
 /// <summary>
 /// Scans the content of <paramref name="state"/> for links and hands each matched "url"
 /// group to <c>AddWebPage</c>, with the page's URI as the base for resolution.
 /// </summary>
 /// <param name="state">
 /// Page whose content is scanned; ignored when <c>null</c>, when <c>IsContinueProcess</c>
 /// is false, or when it carries no content.
 /// </param>
 public void HandleLinks(WebPageState state)
 {
     if (state != null && state.IsContinueProcess && state.Content != null)
     {
         Match m = RegExUtil.GetMatchRegEx(state.Content);

         // A plain while loop (instead of do/while) skips the body entirely when the
         // very first match failed, i.e. when the page contains no links.
         while (m.Success)
         {
             AddWebPage(state.Uri, m.Groups["url"].ToString());
             m = m.NextMatch();
         }
     }
 }
Пример #5
0
        /// <summary>
        /// Fetches the page described by <paramref name="state"/> and, when a response was
        /// obtained, the state reports OK and a content handler is set, passes the state to
        /// <c>ContentHandler</c>.
        /// </summary>
        /// <param name="state">Page state to fetch and hand to the content handler.</param>
        /// <returns><c>true</c> when the content handler was invoked; otherwise <c>false</c>.</returns>
        public bool Process(WebPageState state)
        {
            WebResponse destination = GetDestinationResponse(state);

            // Bail out as soon as one of the preconditions for handling the content fails.
            if (destination == null || !state.IsOk || ContentHandler == null)
            {
                return false;
            }

            ContentHandler(state);
            return true;
        }
Пример #6
0
        /// <summary>
        /// Retrieves the destination response for <paramref name="state"/> and invokes
        /// <c>ContentHandler</c> on it, provided a response came back, the state is OK and
        /// a handler has been assigned.
        /// </summary>
        /// <param name="state">Page state to retrieve and process.</param>
        /// <returns><c>true</c> if <c>ContentHandler</c> was called; <c>false</c> otherwise.</returns>
        public bool Process(WebPageState state)
        {
            bool handled = false;
            WebResponse fetched = GetDestinationResponse(state);

            if (fetched != null && state.IsOk && ContentHandler != null)
            {
                ContentHandler(state);
                handled = true;
            }

            return(handled);
        }
Пример #7
0
        /// <summary>
        /// Resolves <paramref name="newUri"/> against <paramref name="baseUri"/>, strips any
        /// fragment, and queues the resulting page for processing unless it is already known.
        /// </summary>
        /// <param name="baseUri">URI used as the base for resolving <paramref name="newUri"/>.</param>
        /// <param name="newUri">
        /// Absolute or relative link text. A <c>null</c> value or a link that cannot be turned
        /// into a URI is ignored.
        /// </param>
        private void AddWebPage(Uri baseUri, string newUri)
        {
            // Guard before touching the string; a null link carries nothing to queue.
            if (newUri == null)
            {
                return;
            }

            // Remove any anchors (including a link that consists only of "#fragment").
            int    index = newUri.IndexOf('#');
            string url   = index >= 0 ? newUri.Substring(0, index) : newUri;

            // TryCreate instead of the Uri constructor: one malformed link on a page must not
            // abort the whole crawl with a UriFormatException.
            Uri uri;
            if (!Uri.TryCreate(baseUri, url, out uri))
            {
                return;
            }

            if (webPages.Contains(uri))
            {
                return;
            }

            // Only pages under the spider's base URI are flagged for further link processing.
            var state = new WebPageState(uri)
            {
                IsContinueProcess = uri.AbsoluteUri.StartsWith(spiderOptions.BaseUri.AbsoluteUri, StringComparison.OrdinalIgnoreCase)
            };

            webPagesPending.Enqueue(state);
            webPages.Add(uri, state);
        }
Пример #8
0
        /// <summary>
        /// Turns a link found on a page into an absolute URI (without its fragment) and, if that
        /// URI has not been seen yet, registers it and puts it on the pending queue.
        /// </summary>
        /// <param name="baseUri">Base URI against which <paramref name="newUri"/> is resolved.</param>
        /// <param name="newUri">
        /// Link text, absolute or relative. <c>null</c> and unparseable links are skipped.
        /// </param>
        private void AddWebPage(Uri baseUri, string newUri)
        {
            // Check for null before dereferencing the link text.
            if (newUri == null)
                return;

            // Remove any anchors; a fragment at position 0 is stripped as well.
            int index = newUri.IndexOf('#');
            string url = index >= 0 ? newUri.Substring(0, index) : newUri;

            // A malformed link is skipped rather than allowed to throw UriFormatException.
            Uri uri;
            if (!Uri.TryCreate(baseUri, url, out uri))
                return;

            if (webPages.Contains(uri))
                return;

            // Links outside the spider's base URI are queued but not flagged for link extraction.
            var state = new WebPageState(uri)
                            {
                                IsContinueProcess = uri.AbsoluteUri.StartsWith(spiderOptions.BaseUri.AbsoluteUri, StringComparison.OrdinalIgnoreCase)
                            };

            webPagesPending.Enqueue(state);
            webPages.Add(uri, state);
        }
Пример #9
0
        /// <summary>
        /// Sends a GET request to <paramref name="redirect"/>, or to <c>state.Uri</c> when no
        /// redirect is supplied, stores the result on <paramref name="state"/> and follows
        /// redirect responses itself through recursive calls.
        /// </summary>
        /// <param name="state">
        /// Page state updated with the status code, status description, the chain of redirects
        /// and, for responses whose content type starts with "text", the body text.
        /// </param>
        /// <param name="redirect">
        /// URI of a redirect hop to request; <c>null</c> when requesting the page itself.
        /// </param>
        /// <returns>
        /// The final response, already closed, or <c>null</c> when no
        /// <see cref="HttpWebResponse"/> was obtained.
        /// </returns>
        private WebResponse GetDestinationResponse(WebPageState state, Uri redirect = null)
        {
            // Longest redirect chain that will be followed; protects against redirect cycles.
            const int MaxRedirects = 50;

            Uri            requestUri = redirect ?? state.Uri;
            HttpWebRequest request    = (HttpWebRequest)WebRequest.Create(requestUri);

            request.Method            = "GET";
            request.UserAgent         = Config.UserAgent;
            // Automatic redirects are off so that every hop can be recorded on the state.
            request.AllowAutoRedirect = false;

            if (Options != null && Options.Credential != null)
            {
                request.Credentials = Options.Credential;
            }

            bool isRedirect = false;

            HttpWebResponse response = null;

            try
            {
                if ((response = request.GetResponse() as HttpWebResponse) == null)
                {
                    return(null);
                }

                // Initial request: status belongs to the page. Redirect hop: status belongs to
                // the last redirect entry that was recorded.
                if (redirect == null)
                {
                    state.StatusCode = response.StatusCode;
                }
                else if (state.Redirects.Count > 0)
                {
                    state.Redirects[state.Redirects.Count - 1].StatusCode = response.StatusCode;
                }

                state.StatusCodeDescription = response.StatusDescription;

                if (WebPageState.GetStatus(response.StatusCode) == WebPageState.PageStatus.Redirect)
                {
                    // The Location header can be relative or absent, so it is resolved against
                    // the URI just requested; an unusable header ends the chain here.
                    string location = response.Headers["Location"];
                    Uri    target;

                    if (!string.IsNullOrEmpty(location) &&
                        state.Redirects.Count < MaxRedirects &&
                        Uri.TryCreate(requestUri, location, out target))
                    {
                        isRedirect = true;
                        requestUri = target;
                        state.Redirects.Add(new WebPageState.WebRequestState {
                            Uri = requestUri
                        });
                    }
                }
                else if (response.ContentType.StartsWith("text", StringComparison.OrdinalIgnoreCase))
                {
                    Stream stream = response.GetResponseStream();
                    if (stream != null)
                    {
                        using (StreamReader reader = new StreamReader(stream))
                        {
                            state.Content = reader.ReadToEnd();
                        }
                    }
                }
            }
            catch (WebException ex)
            {
                Logger.LogException(LogLevel.Fatal, ex.Status.ToString(), ex);

                HttpWebResponse errorResponse = ex.Response as HttpWebResponse;
                if (errorResponse != null)
                {
                    state.StatusCode            = errorResponse.StatusCode;
                    state.StatusCodeDescription = errorResponse.StatusDescription;
                }

                // Release the connection held by the response attached to the exception.
                if (ex.Response != null)
                {
                    ex.Response.Close();
                }
            }
            finally
            {
                if (response != null)
                {
                    response.Close();
                }
            }

            return(isRedirect ? GetDestinationResponse(state, requestUri) : response);
        }