/// <summary>
/// Writes the announcement to the console and then raises the <c>CrawlAnnounced</c> event.
/// </summary>
/// <param name="item">The announcement to log and to hand to the event subscribers.</param>
private void OnCrawlAnnounced(CrawlAnnounceItem item)
{
    // Console line layout: url-status-description-short time; a null description prints as empty.
    Console.WriteLine(String.Format("{0}-{1}-{2}-{3}",
        item.Page.Url,
        item.CrawlStatus.ToString(),
        item.Description ?? "",
        item.DateTime.ToShortTimeString()));

    // Snapshot the delegate first: if the last subscriber detaches on another thread
    // between the null check and the call, invoking the field directly would throw.
    var handler = CrawlAnnounced;
    if (handler != null)
    {
        handler(this, new CrawlAnnouncedEventArgs(item));
    }
}
/// <summary>
/// Creates the event arguments that carry a single crawl announcement.
/// </summary>
/// <param name="crawlAnnounceItem">The announcement stored in the <c>CrawlAnnounceItem</c> member.</param>
public CrawlAnnouncedEventArgs(CrawlAnnounceItem crawlAnnounceItem)
{
    CrawlAnnounceItem = crawlAnnounceItem;
}
/// <summary>
/// Raises the <c>CrawlAnnounced</c> event for the given announcement, if anyone is subscribed.
/// </summary>
/// <param name="item">The announcement wrapped in <c>CrawlAnnouncedEventArgs</c> and passed to subscribers.</param>
private void OnCrawlAnnounced(CrawlAnnounceItem item)
{
    // Snapshot the delegate first: if the last subscriber detaches on another thread
    // between the null check and the call, invoking the field directly would throw.
    var handler = CrawlAnnounced;
    if (handler != null)
    {
        handler(this, new CrawlAnnouncedEventArgs(item));
    }
}
/// <summary>
/// Collects the hyperlinks (<c>&lt;a href="..."&gt;</c>) found in the HTML of <paramref name="parent"/>.
/// </summary>
/// <remarks>
/// Announces <c>ExtractingLinksStarted</c> before parsing and <c>ExtractingLinksFinished</c> afterwards,
/// and adds the number of accepted links to the shared counters. A link is skipped when its href is
/// neither a well-formed absolute nor a well-formed relative URI, when the resolved URL is not a valid
/// absolute URI, or when the resolved URL carries a fragment.
/// </remarks>
/// <param name="parent">The page whose <c>Html</c> is scanned; a null <c>Html</c> yields no links.</param>
/// <returns>
/// One <c>Webpage</c> per accepted link, with a lower-cased <c>Url</c>, <c>Depth</c> set to
/// <c>parent.Depth + 1</c>, and <c>HostId</c>/<c>RefererId</c> taken from the parent.
/// </returns>
private Webpage[] ExtractLinks(Webpage parent)
{
    OnCrawlAnnounced(new CrawlAnnounceItem(parent, CrawlStatus.ExtractingLinksStarted, null, DateTime.Now, _sharedResource));

    List<Webpage> webpages = new List<Webpage>();

    HtmlDocument htmlDocument = new HtmlDocument();
    // Parser options must be set before the markup is loaded; setting them afterwards
    // does not change the already-built document.
    htmlDocument.OptionFixNestedTags = true;
    // DownloadPage leaves Html null when a download fails; parse that as an empty document.
    htmlDocument.LoadHtml(parent.Html ?? string.Empty);

    foreach (HtmlNode link in htmlDocument.DocumentNode.Descendants("a").Where(d => d.Attributes.Contains("href")))
    {
        string rawHref = link.Attributes["href"].Value;

        string url = null;
        if (Uri.IsWellFormedUriString(rawHref, UriKind.Absolute))
        {
            url = rawHref;
        }
        else if (Uri.IsWellFormedUriString(rawHref, UriKind.Relative))
        {
            url = UnifyUri(parent, rawHref);
        }

        // TryCreate instead of the Uri constructor: one unusable link must not abort the whole page.
        Uri href;
        if (url == null || !Uri.TryCreate(url, UriKind.Absolute, out href))
        {
            continue;
        }

        // Links that point at a fragment are not collected.
        if (!string.IsNullOrEmpty(href.Fragment))
        {
            continue;
        }

        Webpage page = new Webpage();
        // Invariant lower-casing keeps the result independent of the machine's culture.
        // NOTE(review): lower-casing the whole URL also changes path and query casing — confirm this is intended.
        page.Url = url.ToLowerInvariant();
        page.RefererId = parent.Id;
        page.Depth = parent.Depth + 1;
        page.HostId = parent.HostId;
        webpages.Add(page);
    }

    _sharedResource.AddTotalLinksFound(webpages.Count);

    OnCrawlAnnounced(new CrawlAnnounceItem(parent, CrawlStatus.ExtractingLinksFinished, string.Format("این صفحه دارای {0} لینک می باشد.", webpages.Count), DateTime.Now, _sharedResource));

    return webpages.ToArray();
}
/// <summary>
/// Collects the <c>&lt;form&gt;</c> elements found in the HTML of <paramref name="page"/>,
/// together with their <c>&lt;input&gt;</c> descendants.
/// </summary>
/// <remarks>
/// Announces <c>ExtractingFormsStarted</c> before parsing and <c>ExtractingFormsFinished</c> afterwards,
/// and adds the number of collected forms to the shared counters. A form whose action is neither a
/// well-formed absolute nor a well-formed relative URI is skipped. A missing, empty or "#"-prefixed
/// action falls back to the URL of the page itself.
/// </remarks>
/// <param name="page">The page whose <c>Html</c> is scanned; a null <c>Html</c> yields no forms.</param>
/// <returns>The forms that have a usable action, each with one <c>FormElement</c> per input.</returns>
private Form[] ExtractForms(Webpage page)
{
    OnCrawlAnnounced(new CrawlAnnounceItem(page, CrawlStatus.ExtractingFormsStarted, null, DateTime.Now, _sharedResource));

    List<Form> forms = new List<Form>();

    // NOTE(review): presumably removes the parser's special handling of <form> so that
    // inputs are kept as descendants of the form node — confirm against the HtmlAgilityPack docs.
    HtmlNode.ElementsFlags.Remove("form");

    HtmlDocument htmlDocument = new HtmlDocument();
    // DownloadPage leaves Html null when a download fails; parse that as an empty document.
    htmlDocument.LoadHtml(page.Html ?? string.Empty);

    foreach (HtmlNode formNode in htmlDocument.DocumentNode.Descendants("form"))
    {
        HtmlAttribute actionAttribute = formNode.Attributes["action"];
        bool postsToSelf = actionAttribute == null
            || actionAttribute.Value == ""
            || actionAttribute.Value.StartsWith("#", StringComparison.Ordinal);
        string uri = postsToSelf ? page.Url : actionAttribute.Value;

        Form form = new Form();
        if (Uri.IsWellFormedUriString(uri, UriKind.Absolute))
        {
            form.Action = uri;
        }
        else if (Uri.IsWellFormedUriString(uri, UriKind.Relative))
        {
            form.Action = UnifyUri(page, uri);
        }

        // "method" is optional in HTML and defaults to GET; indexing a missing attribute yields null.
        HtmlAttribute methodAttribute = formNode.Attributes["method"];
        form.Method = methodAttribute != null ? methodAttribute.Value : "get";

        if (form.Action == null)
        {
            continue;
        }

        foreach (HtmlNode inputNode in formNode.Descendants("input"))
        {
            HtmlAttribute nameAttribute = inputNode.Attributes["name"];
            HtmlAttribute valueAttribute = inputNode.Attributes["value"];
            HtmlAttribute typeAttribute = inputNode.Attributes["type"];

            FormElement element = new FormElement();
            element.Name = nameAttribute != null ? nameAttribute.Value : "";
            element.Value = valueAttribute != null ? valueAttribute.Value : "";
            // "type" is optional in HTML and defaults to a text field.
            element.Type = typeAttribute != null ? typeAttribute.Value : "text";
            form.FormElements.Add(element);
        }

        forms.Add(form);
    }

    _sharedResource.AddTotalFormsFound(forms.Count);

    OnCrawlAnnounced(new CrawlAnnounceItem(page, CrawlStatus.ExtractingFormsFinished, string.Format("این صفحه دارای {0} فرم می باشد.", forms.Count), DateTime.Now, _sharedResource));

    return forms.ToArray();
}
/// <summary>
/// Downloads the HTML of <paramref name="page"/> and stores it in <c>page.Html</c>.
/// </summary>
/// <remarks>
/// Announces <c>DownloadingStarted</c> first, then either <c>DownloadingFinished</c> (HTML stored) or
/// <c>DownloadingHalted</c> (<c>page.Html</c> set to null). The download is halted without a request
/// when the URL ends in a known non-text extension or cannot be turned into an HTTP request, and after
/// a request when the response is not <c>text/html</c>, carries one of the rejected status codes, or
/// the request fails with a <c>WebException</c>. <c>page.DateTime</c> is updated on every path.
/// </remarks>
/// <param name="page">The page to download; its <c>Url</c> is read and its <c>Html</c> and <c>DateTime</c> are written.</param>
private void DownloadPage(Webpage page)
{
    OnCrawlAnnounced(new CrawlAnnounceItem(page, CrawlStatus.DownloadingStarted, null, DateTime.Now, _sharedResource));

    if (HasNonTextExtension(page.Url))
    {
        page.Html = null;
        page.DateTime = DateTime.Now;
        AnnounceDownloadHalted(page, "این آدرس محتوی متن نمی باشد.");
        return;
    }

    HttpWebRequest request = CreateHttpRequest(page.Url);
    if (request == null)
    {
        // Not an HTTP(S) address (for example mailto:, ftp:) or not a usable URI at all.
        page.Html = null;
        page.DateTime = DateTime.Now;
        AnnounceDownloadHalted(page, "خطایی در حین بارگذاری صفحه رخ داد");
        return;
    }

    try
    {
        request.Timeout = 100000; // milliseconds
        request.UserAgent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)";
        request.AllowAutoRedirect = true;
        request.KeepAlive = false;

        using (HttpWebResponse response = request.GetResponse() as HttpWebResponse)
        {
            string contentType = response.ContentType;
            // Media type names are case-insensitive, and the header may be absent.
            bool isHtml = contentType != null
                && contentType.IndexOf("text/html", StringComparison.OrdinalIgnoreCase) >= 0;

            if (!IsRejectedStatus(response.StatusCode) && isHtml)
            {
                using (StreamReader reader = new StreamReader(response.GetResponseStream()))
                {
                    page.Html = reader.ReadToEnd();
                }

                OnCrawlAnnounced(new CrawlAnnounceItem(page, CrawlStatus.DownloadingFinished, null, DateTime.Now, _sharedResource));
            }
            else
            {
                AnnounceDownloadHalted(page, "خطایی در حین بارگذاری صفحه رخ داد");
                page.Html = null;
            }

            page.DateTime = DateTime.Now;
        }
    }
    catch (WebException ex)
    {
        page.Html = null;
        page.DateTime = DateTime.Now;

        string description = "خطایی در حین بارگذاری صفحه رخ داد";
        // The response attached to the exception holds a connection and must be disposed.
        using (WebResponse failedResponse = ex.Response)
        {
            HttpWebResponse httpResponse = failedResponse as HttpWebResponse;
            if (httpResponse != null)
            {
                description = httpResponse.StatusCode + " " + httpResponse.StatusDescription + description;
            }
        }

        AnnounceDownloadHalted(page, description);
    }
}

/// <summary>URL endings that are treated as non-text content and never requested.</summary>
private static readonly string[] NonTextExtensions =
{
    ".jpg", ".jpeg", ".zip", ".rar", ".png", ".exe", ".gif",
    ".mp3", ".wma", ".pdf", ".wav", ".bmp", ".apk"
};

/// <summary>Returns true when <paramref name="url"/> ends with one of the non-text extensions, ignoring case.</summary>
private static bool HasNonTextExtension(string url)
{
    foreach (string extension in NonTextExtensions)
    {
        if (url.EndsWith(extension, StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }
    }

    return false;
}

/// <summary>Returns true for the status codes that must not be treated as a successful download.</summary>
private static bool IsRejectedStatus(HttpStatusCode status)
{
    // HttpWebRequest normally reports these by throwing WebException; this is a safety net.
    switch (status)
    {
        case HttpStatusCode.NotFound:
        case HttpStatusCode.BadGateway:
        case HttpStatusCode.BadRequest:
        case HttpStatusCode.Forbidden:
        case HttpStatusCode.GatewayTimeout:
        case HttpStatusCode.Gone:
        case HttpStatusCode.InternalServerError:
        case HttpStatusCode.NotAcceptable:
            return true;
        default:
            return false;
    }
}

/// <summary>Creates an HTTP request for <paramref name="url"/>, or returns null when the address is not a usable HTTP(S) URI.</summary>
private static HttpWebRequest CreateHttpRequest(string url)
{
    try
    {
        // Yields null for registered non-HTTP schemes such as ftp: or file:.
        return WebRequest.Create(url) as HttpWebRequest;
    }
    catch (NotSupportedException)
    {
        // Scheme with no registered handler, e.g. mailto: or javascript:.
        return null;
    }
    catch (UriFormatException)
    {
        return null;
    }
}

/// <summary>Announces <c>DownloadingHalted</c> for <paramref name="page"/> with the given description.</summary>
private void AnnounceDownloadHalted(Webpage page, string description)
{
    OnCrawlAnnounced(new CrawlAnnounceItem(page, CrawlStatus.DownloadingHalted, description, DateTime.Now, _sharedResource));
}