/// <summary>
        /// Runs the crawl: queues the start page, then processes pending pages one at a
        /// time until the pending queue is empty or the configured maximum number of
        /// processed pages has been reached. Any exception raised while draining the
        /// queue is reported to the user in a message box.
        /// </summary>
        public void Execute()
        {
            uriProcessedCount_ = 0;

            AddWebPage(startUri_, startUri_.AbsoluteUri);
            try
            {
                // A maximum of -1 disables the page-count limit.
                while (webPagesPending_.Count > 0 && (uriProcessedCountMax_ == -1 || uriProcessedCount_ < uriProcessedCountMax_))
                {
                    WebPageState webPageState = (WebPageState)webPagesPending_.Dequeue();
                    webPageProcessor_.Process(webPageState);

                    if (!keepWebContent_)
                    {
                        // Release the page content once it has been processed.
                        webPageState.content_ = null;
                    }

                    uriProcessedCount_++;
                }
            }
            catch (Exception ex)
            {
                string message = string.Format("Error occurs in Execute(), error message: {0}", ex.Message);
                MessageBox.Show(message, "error");
            }
        }
        /// <summary>
        /// Queues a page for processing unless the link is malformed, is not a valid
        /// page, or has already been recorded.
        /// </summary>
        /// <param name="baseUri">Uri against which <paramref name="newUri"/> is resolved when it is relative.</param>
        /// <param name="newUri">Absolute or relative link text; any anchor part is stripped before resolution.</param>
        /// <returns>True if the page was added to the pending queue; otherwise false.</returns>
        private bool AddWebPage(Uri baseUri, string newUri)
        {
            // Remove any anchors
            string url = StrUtil.LeftIndexOf(newUri, "#");

            // Construct a Uri, using the current page Uri as a base reference.
            // TryCreate is used so that a single malformed link is skipped instead of
            // throwing and aborting the processing of the remaining links.
            Uri uri;
            if (!Uri.TryCreate(baseUri, url, out uri))
            {
                return false;
            }

            if (!IsValidPage(uri.LocalPath) || webPages_.Contains(uri))
            {
                return false;
            }

            WebPageState webPageState = new WebPageState(uri);

            // Only process links for pages within the same site. URIs are compared
            // ordinally; a culture-sensitive comparison is not appropriate for them.
            if (uri.AbsoluteUri.StartsWith(baseUri_.AbsoluteUri, StringComparison.Ordinal))
            {
                webPageState.processInstructions_ += "Handle Links";
            }

            webPagesPending_.Enqueue(webPageState);
            webPages_.Add(uri, webPageState);

            return true;
        }
Example #3
0
        /// <summary>
        /// Process performs the action of reading in the contents from the URI
        /// assigned to the WebPageState object that is passed in.
        /// </summary>
        /// <param name="webPageState">The state object contains the URI to process and holds onto state regarding the URI as it is processed</param>
        /// <returns>True if the response was obtained and handled without an exception; otherwise false</returns>
        public bool Process(WebPageState webPageState)
        {
            webPageState.processStarted_     = true;
            webPageState.processSuccessfull_ = false;

            try
            {
                WebRequest  req = WebRequest.Create(webPageState.uri_);
                WebResponse res = null;

                try
                {
                    res = req.GetResponse();

                    HttpWebResponse httpRes = res as HttpWebResponse;
                    if (httpRes != null)
                    {
                        webPageState.statusCode_        = httpRes.StatusCode.ToString();
                        webPageState.statusDescription_ = httpRes.StatusDescription;
                    }
                    if (res is FileWebResponse)
                    {
                        // File responses carry no status, so report them as OK.
                        webPageState.statusCode_        = "OK";
                        webPageState.statusDescription_ = "OK";
                    }

                    if (webPageState.statusCode_.Equals("OK"))
                    {
                        // Dispose the reader (and with it the response stream) as soon
                        // as the content has been read.
                        using (StreamReader sr = new StreamReader(res.GetResponseStream()))
                        {
                            webPageState.content_ = sr.ReadToEnd();
                        }

                        if (contentHandler_ != null)
                        {
                            contentHandler_(webPageState);
                        }
                    }

                    webPageState.processSuccessfull_ = true;
                }
                catch (Exception ex)
                {
                    // Record a status code/description on the state for the failure.
                    HandleException(ex, webPageState);
                }
                finally
                {
                    if (res != null)
                    {
                        res.Close();
                    }
                }
            }
            catch (Exception ex)
            {
                // Reached when creating the request fails, or when the handling above
                // itself throws; the error is only written to the console.
                Console.WriteLine(ex.ToString());
            }
            Console.WriteLine("Successfull: {0}", webPageState.processSuccessfull_);

            return webPageState.processSuccessfull_;
        }
Example #4
0
 // Each web error such as 404 does not show up as a specific error, so the code is
 // looked up in the text of a WebException.
 //
 // ex:     text of the exception to search.
 // state:  receives the matched status code and description.
 // errors: known errors; each entry starts with a 5-character code such as "(404)".
 // Returns true if one of the codes was found in ex and the state was updated.
 private bool LookupWebException(string ex, WebPageState state, string[] errors)
 {
     // Length of the leading "(nnn)" code in each entry.
     const int codeLength = 5;

     if (string.IsNullOrEmpty(ex) || errors == null)
     {
         return false;
     }

     foreach (string error in errors)
     {
         // Skip entries too short to hold a code instead of letting Substring throw.
         if (error == null || error.Length < codeLength)
         {
             continue;
         }

         string errCode = error.Substring(0, codeLength);

         // String.Contains performs an ordinal (culture-insensitive) search.
         if (ex.Contains(errCode))
         {
             state.statusCode_        = errCode;
             state.statusDescription_ = error;
             return true;
         }
     }

     return false;
 }
        /// <summary>
        /// Extracts the links from a page's content and queues each of them via
        /// AddWebPage. Does nothing unless the page's process instructions contain
        /// "Handle Links".
        /// </summary>
        /// <param name="state">State of the page whose content is scanned for links.</param>
        public void HandleLinks(WebPageState state)
        {
            string instructions = state.processInstructions_;

            // String.Contains is an ordinal search; a page with no instructions is skipped.
            if (instructions == null || !instructions.Contains("Handle Links"))
            {
                return;
            }

            Match m = RegExUtil.GetMatchRegEx(RegularExpression.UrlExtractor, state.content_);

            while (m.Success)
            {
                // Links are resolved relative to the page they were found on.
                AddWebPage(state.uri_, m.Groups["url"].ToString());

                m = m.NextMatch();
            }
        }
Example #6
0
        /// <summary>
        /// Assigns a status code and description to the page state based on the
        /// exception that was thrown while processing it.
        /// </summary>
        /// <param name="ex">The exception that was thrown.</param>
        /// <param name="state">The page state that receives the status.</param>
        private void HandleException(Exception ex, WebPageState state)
        {
            if (ex is WebException)
            {
                // Known HTTP errors; each entry begins with the "(nnn)" code that is
                // searched for in the exception text.
                string[] knownHttpErrors =
                {
                    "(400) Bad Request",
                    "(401) Unauthorized",
                    "(402) Payment Required",
                    "(403) Forbidden",
                    "(404) Not Found",
                    "(405) Method not allowed",
                    "(406) Page format not understood",
                    "(407) Request must be authorized first",
                    "(408) Request timed out",
                    "(409) Conflict, to many requests for resource",
                    "(410) Page use to be there, but now it's gone",
                    "(411) Content-length missing",
                    "(412) Pre-condition not met",
                    "(413) Too big",
                    "(414) URL is to long",
                    "(415) Unsupported media type",
                    "(500) Internal Error",
                    "(501) Not implemented",
                    "(502) Bad Gateway",
                    "(503) Server Unavailable",
                    "(504) Gateway Timeout",
                    "(505) HTTP not supported"
                };

                if (LookupWebException(ex.ToString(), state, knownHttpErrors))
                {
                    return;
                }
            }

            FileNotFoundException missingFile = ex.InnerException as FileNotFoundException;
            if (missingFile == null)
            {
                // Unrecognised failure: only the description is set.
                state.statusDescription_ = ex.ToString();
                return;
            }

            state.statusCode_        = "FileNotFound";
            state.statusDescription_ = missingFile.Message;
        }