/// <summary>
/// Adds a newly discovered link to the crawl, if it has not been seen before.
/// </summary>
/// <param name="baseUri">Uri of the page the link was found on; used to resolve relative links.</param>
/// <param name="newUri">Raw link text, possibly relative and possibly carrying an anchor.</param>
/// <returns>True when the page was newly queued; false when filtered out or already known.</returns>
private bool AddWebPage(Uri baseUri, string newUri)
{
    // Strip any anchor fragment so "page.html#top" and "page.html" dedupe to one entry.
    string url = StrUtil.LeftIndexOf(newUri, "#");

    // Resolve relative links against the page they were found on.
    Uri uri = new Uri(baseUri, url);

    if (!ValidPage(uri.LocalPath) || m_webPages.Contains(uri))
    {
        return false;
    }

    WebPageState state = new WebPageState(uri);

    // Only process links for pages within the same site.
    // Ordinal comparison: URIs are machine text, and the culture-sensitive
    // default of StartsWith can mis-compare them under some locales.
    if (uri.AbsoluteUri.StartsWith(BaseUri.AbsoluteUri, StringComparison.Ordinal))
    {
        state.ProcessInstructions += "Handle Links";
    }

    m_webPagesPending.Enqueue(state);
    m_webPages.Add(uri, state);

    return true;
}
/// <summary>
/// Adds a newly discovered link to the known-page table and, when it belongs
/// to the site being crawled, to the pending-work queue.
/// </summary>
/// <param name="baseUri">Uri of the page the link was found on; used to resolve relative links.</param>
/// <param name="newUri">Raw link text, possibly relative and possibly carrying an anchor.</param>
public void AddWebPage(Uri baseUri, string newUri)
{
    // Strip any anchor fragment so "page.html#top" and "page.html" dedupe to one entry.
    string url = StrUtil.LeftIndexOf(newUri, "#");

    // Resolve relative links against the page they were found on.
    Uri uri = new Uri(baseUri, url);

    // Skip resources we do not crawl (images, binaries, ...).
    if (!ValidPageExtension(uri.LocalPath))
    {
        return;
    }

    // Already recorded — nothing to do.
    if (m_webPages.Contains(uri))
    {
        return;
    }

    WebPageState state = new WebPageState(uri);

    // Only queue pages that live within the same location as this site.
    // Ordinal comparison: URIs are machine text, and the culture-sensitive
    // default of StartsWith can mis-compare them under some locales.
    if (uri.AbsoluteUri.StartsWith(BaseUri.AbsoluteUri, StringComparison.Ordinal))
    {
        m_webPagesPending.Enqueue(state);
    }

    // Off-site pages are still recorded so they are not revisited.
    m_webPages.Add(uri, state);
}
/// <summary>
/// Scans the downloaded content of a page for URLs and hands each one
/// to the spider for possible crawling.
/// </summary>
/// <param name="state">State object whose Content holds the page HTML.</param>
private void HandleLinks(WebPageState state)
{
    for (Match match = RegExUtil.GetMatchRegEx(RegularExpression.UrlExtractor, state.Content);
         match.Success;
         match = match.NextMatch())
    {
        m_spider.AddWebPage(state.Uri, match.Groups["url"].ToString());
    }
}
/// <summary>
/// Each web error such as 404 does not surface as a specific exception type,
/// so look up the "(nnn)" status token embedded in the WebException text.
/// </summary>
/// <param name="ex">Full exception text to search for a status token.</param>
/// <param name="state">Receives the status code and description on a match.</param>
/// <param name="errors">Known errors, each formatted "(nnn) Description".</param>
/// <returns>True when a known status token was found and recorded.</returns>
private bool LookupWebException(string ex, WebPageState state, string[] errors)
{
    foreach (string error in errors)
    {
        // Entries are formatted "(nnn) Description"; the first 5 chars are the code token.
        string errCode = error.Substring(0, 5);

        // Ordinal search: "(404)" is machine text, not culture-sensitive prose —
        // the culture-sensitive default of IndexOf(string) is both slower and
        // potentially wrong under some locales.
        if (ex.IndexOf(errCode, StringComparison.Ordinal) != -1)
        {
            state.StatusCode = errCode;
            state.StatusDescription = error;
            return true;
        }
    }

    return false;
}
/// <summary>
/// Runs the spider: seeds the pending queue with the start URI, then
/// processes pages until the queue is empty or the processed-page cap
/// (UriProcessedCountMax; -1 means unlimited) is reached.
/// </summary>
public void Execute( )
{
    UriProcessedCount = 0;

    DateTime start = DateTime.Now;

    Console.WriteLine("======================================================================================================");
    Console.WriteLine("Proccess URI: " + m_startUri.AbsoluteUri);
    Console.WriteLine("Start At : " + start);
    Console.WriteLine("------------------------------------------------------------------------------------------------------");

    AddWebPage(StartUri, StartUri.AbsoluteUri);

    try
    {
        // UriProcessedCountMax == -1 means "no limit".
        while (WebPagesPending.Count > 0 &&
               (UriProcessedCountMax == -1 || UriProcessedCount < UriProcessedCountMax))
        {
            Console.WriteLine("Max URI's: {0}, Processed URI's: {1}, Pending URI's: {2}",
                UriProcessedCountMax, UriProcessedCount, WebPagesPending.Count);

            WebPageState state = (WebPageState)m_webPagesPending.Dequeue( );

            m_webPageProcessor.Process(state);

            // Drop the page body unless the caller asked to keep downloaded content.
            if (!KeepWebContent)
            {
                state.Content = null;
            }

            UriProcessedCount++;
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine("Failure while running web spider: " + ex.ToString( ));
    }

    DateTime end = DateTime.Now;

    // BUG FIX: the original computed (end.Ticks - start.Ticks) / 10000000 with
    // integer division, truncating to whole seconds (and to 0 for sub-second
    // runs) before the float conversion. TimeSpan.TotalSeconds keeps the fraction.
    double elasped = (end - start).TotalSeconds;

    Console.WriteLine("------------------------------------------------------------------------------------------------------");
    Console.WriteLine("URI Finished : " + m_startUri.AbsoluteUri);
    Console.WriteLine("Pages Processed: " + UriProcessedCount);
    Console.WriteLine("Pages Pending : " + WebPagesPending.Count);
    Console.WriteLine("End At : " + end);
    Console.WriteLine("Elasped Time : {0} seconds", elasped);
    Console.WriteLine("======================================================================================================");
}
/// <summary>
/// If the page was flagged for link handling, extract every URL from its
/// content and queue each newly discovered page on the spider.
/// </summary>
/// <param name="state">State object whose Content holds the page HTML.</param>
public void HandleLinks(WebPageState state)
{
    // Only pages marked with the "Handle Links" instruction are scanned.
    if (state.ProcessInstructions.IndexOf("Handle Links") == -1)
    {
        return;
    }

    int added = 0;

    Match match = RegExUtil.GetMatchRegEx(RegularExpression.UrlExtractor, state.Content);
    while (match.Success)
    {
        // AddWebPage returns true only for pages not already known.
        if (AddWebPage(state.Uri, match.Groups["url"].ToString( )))
        {
            added++;
        }
        match = match.NextMatch( );
    }

    Console.WriteLine("   : {0} new links were added", added);
}
/// <summary>
/// Assign status code and description to the page state based on a thrown exception.
/// </summary>
/// <param name="ex">The exception raised while processing the page.</param>
/// <param name="state">State object that receives the status code/description.</param>
private void HandleException(Exception ex, WebPageState state)
{
    // Known HTTP failures arrive as WebExceptions whose message embeds the
    // "(nnn)" code; try to map those onto a friendly description first.
    string[] knownHttpErrors = new String[]
    {
        "(400) Bad Request",
        "(401) Unauthorized",
        "(402) Payment Required",
        "(403) Forbidden",
        "(404) Not Found",
        "(405) Method not allowed",
        "(406) Page format not understood",
        "(407) Request must be authorized first",
        "(408) Request timed out",
        "(409) Conflict, to many requests for resource",
        "(410) Page use to be there, but now it's gone",
        "(411) Content-length missing",
        "(412) Pre-condition not met",
        "(413) Too big",
        "(414) URL is to long",
        "(415) Unsupported media type",
        "(500) Internal Error",
        "(501) Not implemented",
        "(502) Bad Gateway",
        "(503) Server Unavailable",
        "(504) Gateway Timeout",
        "(505) HTTP not supported"
    };

    if (ex is WebException && LookupWebException(ex.ToString( ), state, knownHttpErrors))
    {
        return;
    }

    // Missing local files surface as an inner FileNotFoundException;
    // everything else just records the raw exception text.
    if (ex.InnerException is FileNotFoundException)
    {
        state.StatusCode = "FileNotFound";
        state.StatusDescription = ex.InnerException.Message;
    }
    else
    {
        state.StatusDescription = ex.ToString( );
    }
}
// Thread-pool worker entry point: downloads the page described by the
// WebPageState passed as 's', records its status, extracts links from the
// content, and signals the spider's reset event when this is the last
// active worker thread. Never throws — all failures are logged.
public void Process(Object s)
{
    try
    {
        Log("Process Uri: {0}, HashCode: {1}", ((WebPageState)s).Uri.AbsoluteUri, Thread.CurrentThread.GetHashCode( ));

        WebPageState state = (WebPageState)s;
        WebRequest req = WebRequest.Create(state.Uri);
        WebResponse res = null;

        // NOTE(review): the Sleep(1) calls presumably yield the thread between
        // network operations — confirm whether they are still needed.
        Thread.Sleep(1);

        try
        {
            res = req.GetResponse( );
            Thread.Sleep(1);

            // HTTP responses carry a real status code/description.
            if (res is HttpWebResponse)
            {
                state.StatusCode = ((HttpWebResponse)res).StatusCode.ToString( );
                state.StatusDescription = ((HttpWebResponse)res).StatusDescription;
            }
            // file:// responses have no HTTP status; a successful open counts as OK.
            if (res is FileWebResponse /* && FILE EXIST - TODO */)
            {
                state.StatusCode = "OK";
                state.StatusDescription = "OK";
            }

            // Only successfully fetched pages are read and scanned for links.
            if (state.StatusCode.Equals("OK"))
            {
                StreamReader sr = new StreamReader(res.GetResponseStream( ));

                state.Content = sr.ReadToEnd( );
                Thread.Sleep(1);

                Log("Handle Links for Uri: {0}", state.Uri.AbsoluteUri);
                HandleLinks(state);
            }
        }
        catch (Exception ex)
        {
            // Crude 404 detection: the status code is not exposed directly,
            // so match the "(404)" token in the exception text.
            if (ex.ToString( ).IndexOf("404") != -1)
            {
                state.StatusCode = "404";
                state.StatusDescription = "(404) Not Found";
            }
            else
            {
                state.StatusDescription = ex.ToString( );
            }
        }
        finally
        {
            if (res != null)
            {
                res.Close( );
            }
        }

        // Content is not kept after link extraction — free it for the GC.
        state.Content = null;

        Log("Process Uri: {0} - Success", state.Uri.AbsoluteUri);
    }
    catch (Exception ex)
    {
        Log("Process Uri: {0} - Failure", ((WebPageState)s).Uri.AbsoluteUri);
        Log(ex.ToString( ));
    }

    // Worker bookkeeping: the spider waits on ResetEvent until the last
    // worker finishes. The lock guards the ThreadCount read-modify-write.
    lock ( m_spider )
    {
        m_spider.ThreadCount--;
        if (m_spider.ThreadCount == 0)
        {
            _.P("ResetEvent.Set");
            m_spider.ResetEvent.Set( );
        }
    }
}
/// <summary>
/// Process performs the action of reading in the contents from the URI
/// assigned to the WebPageState object that is passed in.
/// </summary>
/// <param name="state">The state object contains the URI to process and will hold onto state regarding the URI as it is processed</param>
/// <returns>True if the process worked without exception</returns>
public bool Process(WebPageState state)
{
    state.ProcessStarted = true;
    state.ProcessSuccessfull = false;

    try
    {
        Console.WriteLine("Process Uri: {0}", state.Uri.AbsoluteUri);

        WebRequest req = WebRequest.Create(state.Uri);
        WebResponse res = null;

        try
        {
            res = req.GetResponse( );

            // HTTP responses carry a real status code/description.
            if (res is HttpWebResponse)
            {
                state.StatusCode = ((HttpWebResponse)res).StatusCode.ToString( );
                state.StatusDescription = ((HttpWebResponse)res).StatusDescription;
            }
            // file:// responses have no HTTP status; a successful open counts as OK.
            if (res is FileWebResponse)
            {
                state.StatusCode = "OK";
                state.StatusDescription = "OK";
            }

            if (state.StatusCode.Equals("OK"))
            {
                // BUG FIX: the original never disposed the StreamReader; 'using'
                // guarantees the reader (and response stream) is released even
                // if ReadToEnd throws.
                using (StreamReader sr = new StreamReader(res.GetResponseStream( )))
                {
                    state.Content = sr.ReadToEnd( );
                }

                // Let the registered handler (e.g. link extraction) see the content.
                if (ContentHandler != null)
                {
                    ContentHandler(state);
                }
            }

            state.ProcessSuccessfull = true;
        }
        catch (Exception ex)
        {
            HandleException(ex, state);
        }
        finally
        {
            if (res != null)
            {
                res.Close( );
            }
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex.ToString( ));
    }

    Console.WriteLine("Successfull: {0}", state.ProcessSuccessfull);

    return state.ProcessSuccessfull;
}