/// <summary>
/// Resolves <paramref name="newUri"/> against <paramref name="l_baseUri"/> and, when the
/// resulting page is acceptable and not yet registered, queues it for crawling.
/// </summary>
/// <param name="l_baseUri">Absolute URI used to resolve relative links.</param>
/// <param name="newUri">Link text as found by the caller; may be relative or absolute.</param>
/// <returns>
/// <c>true</c> when the page was queued; <c>false</c> when the link cannot be resolved to a
/// URI, is rejected by <c>ValidPage</c>, or is already present in <c>mWebPages</c>.
/// </returns>
private bool AddWebPage(Uri l_baseUri, string newUri)
{
    Uri uri;
    try
    {
        // The constructor (rather than Uri.TryCreate) is kept so that link resolution
        // behaves exactly as before for every well-formed input.
        uri = new Uri(l_baseUri, newUri);
    }
    catch (UriFormatException)
    {
        // Links scraped from markup are untrusted; one malformed href must not
        // abort processing of the remaining links on the page.
        return false;
    }

    if (!ValidPage(uri.LocalPath) || mWebPages.Contains(uri))
    {
        return false;
    }

    WebPageStatus state = new WebPageStatus(uri);
    state.OriginalUrl = newUri;

    // Only pages located under BaseUri are flagged for link extraction.
    if (uri.AbsoluteUri.StartsWith(BaseUri.AbsoluteUri, StringComparison.Ordinal))
    {
        state.TaskInformation += "Handle Links";
    }

    m_webPagesPending.Enqueue(state);
    mWebPages.Add(uri, state);
    return true;
}
//New

/// <summary>
/// Runs the crawl: seeds the queue with <c>StartUri</c>, then processes pending pages
/// until the queue is empty or <c>MaximumUrlAllowed</c> pages have been crawled.
/// </summary>
/// <remarks>
/// A <c>MaximumUrlAllowed</c> value of <c>-1</c> disables the page limit. Any exception
/// raised while crawling stops the loop and is reported through a message box.
/// </remarks>
public void Execute()
{
    UrlCrawledCount = 0;
    AddWebPage(StartUri, StartUri.AbsoluteUri);

    try
    {
        while (WebPagesPending.Count > 0
               && (MaximumUrlAllowed == -1 || UrlCrawledCount < MaximumUrlAllowed))
        {
            WebPageStatus state = (WebPageStatus)m_webPagesPending.Dequeue();
            mWebPageManager.Process(state);

            // Release the downloaded body unless the caller asked to keep it.
            if (!KeepWebContent)
            {
                state.Content = null;
            }

            UrlCrawledCount += 1;
        }
    }
    catch (Exception ex)
    {
        MessageBox.Show("There was some error in crawling the website. Try again later." + Constants.vbCrLf + "Error:" + ex.ToString());
    }
}
/// <summary>
/// Records the outcome of one crawled page in <c>mCrawlDetail</c>. The work is marshalled
/// through <c>Dispatcher.Invoke</c>.
/// </summary>
/// <param name="state">Status of the page that was processed.</param>
/// <remarks>
/// A page counts as broken when its task was started but not completed. The content of a
/// successful page whose original URL matches the URL in <c>txtSearchUrl</c> is stored in
/// <c>mWebsiteContent</c>; broken pages are passed to <c>UpdateWebsiteBrowser</c>.
/// </remarks>
private void spider_WebPageContentHandler(WebPageStatus state)
{
    this.Dispatcher.Invoke(new Action(() =>
    {
        string pageUrl = state.Uri.ToString();
        bool isBroken = state.TaskStarted && !state.TaskCompleted;

        CrawlPageDetail detail = new CrawlPageDetail();
        detail.WebsiteUrl = pageUrl;
        detail.SerialNumber = mCrawlDetail.Pages.Count + 1;
        detail.Status = state.StatusCode;
        mCrawlDetail.Pages.Add(detail);
        mCrawlDetail.TotalCrawled += 1;

        detail.IsSuccess = !isBroken;
        if (isBroken)
        {
            mCrawlDetail.TotalBrokenLink += 1;
        }

        // The text box is user input: an unparsable value must not throw from the
        // dispatcher callback, it simply means no page can match the home URL.
        Uri homeUri;
        if (!isBroken
            && Uri.TryCreate(txtSearchUrl.Text, UriKind.Absolute, out homeUri)
            && string.Equals(state.OriginalUrl, homeUri.ToString(), StringComparison.OrdinalIgnoreCase))
        {
            mWebsiteContent = state.Content;
        }

        if (isBroken)
        {
            UpdateWebsiteBrowser(pageUrl);
        }
    }));
}
/// <summary>
/// Scans the content of a page for URLs and offers each one to <c>AddWebPage</c>,
/// using the page's own URI as the base for resolving the link.
/// </summary>
/// <param name="state">Page whose <c>Content</c> is scanned.</param>
/// <remarks>
/// Does nothing unless <c>state.TaskInformation</c> contains the marker
/// <c>"Handle Links"</c>, or when the page has no content.
/// </remarks>
public void HandleLinks(WebPageStatus state)
{
    if (state.TaskInformation == null
        || state.TaskInformation.IndexOf("Handle Links", StringComparison.Ordinal) == -1)
    {
        return;
    }

    // Nothing to scan; avoids handing a null input to the regex helper.
    if (state.Content == null)
    {
        return;
    }

    for (Match m = RegExUtil.GetMatchRegEx(RegularExpression.UrlExtractor, state.Content);
         m.Success;
         m = m.NextMatch())
    {
        AddWebPage(state.Uri, m.Groups["url"].ToString());
    }
}
//Process

#region "local interface"
//
/// <summary>
/// Translates an exception raised while fetching a page into the status fields of
/// <paramref name="state"/>.
/// </summary>
/// <param name="ex">Exception caught while requesting or reading the page.</param>
/// <param name="state">Page status whose code and description are updated.</param>
/// <remarks>
/// HTTP failures are recognised by looking for the parenthesised status code
/// (for example <c>"(404)"</c>) in the exception text. When none is found, an inner
/// <see cref="FileNotFoundException"/> is reported as <c>"FileNotFound"</c>; anything else
/// stores the full exception text as the description.
/// </remarks>
private void HandleException(Exception ex, ref WebPageStatus state)
{
    // Render the exception once instead of once per branch.
    string text = ex.ToString();

    if (text.IndexOf("(404)", StringComparison.Ordinal) != -1)
    {
        state.StatusCode = "404";
        state.StatusDescription = "(404) Not Found";
    }
    else if (text.IndexOf("(403)", StringComparison.Ordinal) != -1)
    {
        // Set the code as every other HTTP branch does; it was previously left untouched.
        state.StatusCode = "403";
        state.StatusDescription = "(403) Forbidden";
    }
    else if (text.IndexOf("(500)", StringComparison.Ordinal) != -1)
    {
        // NOTE(review): a 500 is marked as completed with status "OK", unlike the other
        // error codes. This looks intentional and is preserved - confirm with the owner.
        state.TaskCompleted = true;
        state.StatusCode = "OK";
        state.StatusDescription = "(500) Internal Server Error";
    }
    else if (text.IndexOf("(502)", StringComparison.Ordinal) != -1)
    {
        state.StatusCode = "502";
        state.StatusDescription = "(502) Bad Gateway";
    }
    else if (text.IndexOf("(503)", StringComparison.Ordinal) != -1)
    {
        state.StatusCode = "503";
        state.StatusDescription = "(503) Server Unavailable";
    }
    else if (text.IndexOf("(504)", StringComparison.Ordinal) != -1)
    {
        state.StatusCode = "504";
        state.StatusDescription = "(504) Gateway Timeout";
    }
    else if (ex.InnerException is FileNotFoundException)
    {
        state.StatusCode = "FileNotFound";
        state.StatusDescription = ex.InnerException.Message;
    }
    else
    {
        state.StatusDescription = text;
    }
}
/// <summary>
/// Requests the page described by <paramref name="state"/>, stores the response status and
/// (for an "OK" status) the body, and notifies the registered handlers.
/// </summary>
/// <param name="state">Page to fetch; its status, content and task flags are updated.</param>
/// <returns>The final value of <c>state.TaskCompleted</c>.</returns>
/// <remarks>
/// <c>WebPageContentHandler</c> is invoked only after content was read for an "OK" status.
/// <c>WebPageTaskCompleted</c> is invoked at the end of every call. Failures of the
/// request are routed to <c>HandleException</c>; failures outside it are written to the console.
/// </remarks>
public bool Process(WebPageStatus state)
{
    state.TaskStarted = true;
    state.TaskCompleted = false;

    try
    {
        Console.WriteLine("Process Uri: {0}", state.Uri.AbsoluteUri);
        WebRequest req = WebRequest.Create(state.Uri);
        WebResponse res = null;

        try
        {
            res = req.GetResponse();

            HttpWebResponse httpResponse = res as HttpWebResponse;
            if (httpResponse != null)
            {
                state.StatusCode = httpResponse.StatusCode.ToString();
                state.StatusDescription = httpResponse.StatusDescription;
            }

            if (res is FileWebResponse)
            {
                state.StatusCode = "OK";
                state.StatusDescription = "OK";
            }

            if (state.StatusCode.Equals("OK"))
            {
                // Dispose the reader deterministically, even when reading fails.
                using (StreamReader sr = new StreamReader(res.GetResponseStream()))
                {
                    state.Content = sr.ReadToEnd();
                }

                // Copy before the null check so the check and the call see the same delegate.
                WebPageContentDelegate handler = WebPageContentHandler;
                if (handler != null)
                {
                    handler(state);
                }
            }

            state.TaskCompleted = true;
        }
        catch (Exception ex)
        {
            HandleException(ex, ref state);
        }
        finally
        {
            if (res != null)
            {
                res.Close();
            }
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex.ToString());
    }

    Console.WriteLine("Completed: {0}", state.TaskCompleted);

    WebPageContentDelegate taskHandler = WebPageTaskCompleted;
    if (taskHandler != null)
    {
        taskHandler(state);
    }

    return state.TaskCompleted;
}