コード例 #1
0
ファイル: Crawler.cs プロジェクト: mirsaeedi/xKnight
        /// <summary>
        /// Writes a one-line summary of the announcement to the console and then
        /// raises <c>CrawlAnnounced</c> for any subscribers.
        /// </summary>
        /// <param name="item">Announcement to log and publish; its page, status,
        /// description and timestamp are read for the console line.</param>
        private void OnCrawlAnnounced(CrawlAnnounceItem item)
        {
            // Console line format: url-status-description-time (a null description prints as empty).
            Console.WriteLine(String.Format("{0}-{1}-{2}-{3}", item.Page.Url, item.CrawlStatus.ToString()
                , item.Description ?? "", item.DateTime.ToShortTimeString()));

            // Snapshot the delegate: checking the event and then invoking it re-reads the
            // field, so a subscriber removed in between would cause a NullReferenceException.
            var handler = CrawlAnnounced;
            if (handler != null)
            {
                handler(this, new CrawlAnnouncedEventArgs(item));
            }
        }
コード例 #2
0
 /// <summary>
 /// Creates the event data for a crawl announcement.
 /// </summary>
 /// <param name="crawlAnnounceItem">Announcement stored in <c>CrawlAnnounceItem</c>.</param>
 public CrawlAnnouncedEventArgs(CrawlAnnounceItem crawlAnnounceItem)
 {
     CrawlAnnounceItem = crawlAnnounceItem;
 }
コード例 #3
0
 /// <summary>
 /// Builds event arguments that carry a single crawl announcement.
 /// </summary>
 /// <param name="crawlAnnounceItem">Value assigned to the <c>CrawlAnnounceItem</c> member.</param>
 public CrawlAnnouncedEventArgs(CrawlAnnounceItem crawlAnnounceItem)
 {
     CrawlAnnounceItem = crawlAnnounceItem;
 }
コード例 #4
0
ファイル: CrawlerAgent.cs プロジェクト: mirsaeedi/xKnight
 /// <summary>
 /// Raises <c>CrawlAnnounced</c> for the given announcement when there are subscribers.
 /// </summary>
 /// <param name="item">Announcement wrapped in a <c>CrawlAnnouncedEventArgs</c>.</param>
 private void OnCrawlAnnounced(CrawlAnnounceItem item)
 {
     // Snapshot the delegate: checking the event and then invoking it re-reads the
     // field, so a subscriber removed in between would cause a NullReferenceException.
     var handler = CrawlAnnounced;
     if (handler != null)
     {
         handler(this, new CrawlAnnouncedEventArgs(item));
     }
 }
コード例 #5
0
ファイル: CrawlerAgent.cs プロジェクト: mirsaeedi/xKnight
        /// <summary>
        /// Collects the anchor links of a page as child <c>Webpage</c> objects.
        /// </summary>
        /// <remarks>
        /// Announces <c>ExtractingLinksStarted</c> before and <c>ExtractingLinksFinished</c> after
        /// the scan, and reports the number of accepted links to the shared resource.
        /// A link is accepted when its <c>href</c> is a well-formed absolute URI, or a well-formed
        /// relative URI that <c>UnifyUri</c> turns into a parseable absolute one, and the
        /// resulting URI has no fragment part.
        /// </remarks>
        /// <param name="parent">Page whose <c>Html</c> is scanned; a null <c>Html</c> is treated
        /// as an empty document.</param>
        /// <returns>One page per accepted link (depth = parent depth + 1, same host id,
        /// referer = parent id); an empty array when no link qualifies.</returns>
        private Webpage[] ExtractLinks(Webpage parent)
        {
            OnCrawlAnnounced(new CrawlAnnounceItem(parent, CrawlStatus.ExtractingLinksStarted, null, DateTime.Now, _sharedResource));

            List<Webpage> webpages = new List<Webpage>();

            HtmlDocument htmlDocument = new HtmlDocument();
            // The option is consulted while parsing, so it has to be set before LoadHtml.
            htmlDocument.OptionFixNestedTags = true;
            // A page whose download failed has Html == null; scan it as an empty document.
            htmlDocument.LoadHtml(parent.Html ?? string.Empty);

            foreach (HtmlNode link in htmlDocument.DocumentNode.Descendants("a")
                .Where(d => d.Attributes.Contains("href")))
            {
                string uri = link.Attributes["href"].Value;
                string url = null;

                if (Uri.IsWellFormedUriString(uri, UriKind.Absolute))
                {
                    url = uri;
                }
                else if (Uri.IsWellFormedUriString(uri, UriKind.Relative))
                {
                    url = UnifyUri(parent, uri);
                }

                // Skip links that could not be resolved to a parseable absolute URI
                // instead of letting the Uri constructor throw and abort the whole page.
                Uri href;
                if (url == null || !Uri.TryCreate(url, UriKind.Absolute, out href))
                {
                    continue;
                }

                // Links that point at a fragment are not collected.
                if (!string.IsNullOrEmpty(href.Fragment))
                {
                    continue;
                }

                Webpage page = new Webpage();
                // Invariant lower-casing keeps the stored URL independent of the thread culture.
                // NOTE(review): lower-casing the whole URL also changes path/query case — confirm
                // that this normalisation is what the rest of the crawler expects.
                page.Url = url.ToLowerInvariant();
                page.RefererId = parent.Id;
                page.Depth = parent.Depth + 1;
                page.HostId = parent.HostId;

                webpages.Add(page);
            }

            _sharedResource.AddTotalLinksFound(webpages.Count);

            OnCrawlAnnounced(new CrawlAnnounceItem(parent, CrawlStatus.ExtractingLinksFinished, string.Format("این صفحه دارای {0} لینک می باشد.", webpages.Count), DateTime.Now, _sharedResource));

            return webpages.ToArray();
        }
コード例 #6
0
ファイル: CrawlerAgent.cs プロジェクト: mirsaeedi/xKnight
        /// <summary>
        /// Collects the HTML forms of a page together with their <c>input</c> elements.
        /// </summary>
        /// <remarks>
        /// Announces <c>ExtractingFormsStarted</c> before and <c>ExtractingFormsFinished</c> after
        /// the scan, and reports the number of collected forms to the shared resource.
        /// A form whose action is missing, empty or starts with '#' posts back to the page's own
        /// URL. Forms whose action cannot be resolved to a well-formed URI are skipped.
        /// Missing attributes fall back to defaults instead of throwing: method "get",
        /// input type "text", input name/value "".
        /// </remarks>
        /// <param name="page">Page whose <c>Html</c> is scanned; a null <c>Html</c> is treated
        /// as an empty document.</param>
        /// <returns>The collected forms; an empty array when the page has none.</returns>
        private Form[] ExtractForms(Webpage page)
        {
            OnCrawlAnnounced(new CrawlAnnounceItem(page, CrawlStatus.ExtractingFormsStarted, null, DateTime.Now, _sharedResource));

            List<Form> forms = new List<Form>();

            // Presumably needed so the parser treats <form> as a container whose inputs are
            // its descendants — confirm against the HtmlAgilityPack version in use.
            HtmlNode.ElementsFlags.Remove("form");

            HtmlDocument htmlDocument = new HtmlDocument();
            // A page whose download failed has Html == null; scan it as an empty document.
            htmlDocument.LoadHtml(page.Html ?? string.Empty);

            foreach (HtmlNode formNode in htmlDocument.DocumentNode.Descendants("form"))
            {
                Form form = new Form();

                HtmlAttribute actionAttribute = formNode.Attributes["action"];
                bool postsToSelf = actionAttribute == null
                    || actionAttribute.Value == ""
                    || actionAttribute.Value.StartsWith("#", StringComparison.Ordinal);
                string uri = postsToSelf ? page.Url : actionAttribute.Value;

                if (Uri.IsWellFormedUriString(uri, UriKind.Absolute))
                {
                    form.Action = uri;
                }
                else if (Uri.IsWellFormedUriString(uri, UriKind.Relative))
                {
                    form.Action = UnifyUri(page, uri);
                }

                // The attribute indexer returns null for a missing attribute; a form
                // without an explicit method submits with GET.
                HtmlAttribute methodAttribute = formNode.Attributes["method"];
                form.Method = methodAttribute != null ? methodAttribute.Value : "get";

                if (form.Action == null)
                {
                    continue;
                }

                foreach (HtmlNode inputNode in formNode.Descendants("input"))
                {
                    HtmlAttribute nameAttribute = inputNode.Attributes["name"];
                    HtmlAttribute valueAttribute = inputNode.Attributes["value"];
                    HtmlAttribute typeAttribute = inputNode.Attributes["type"];

                    FormElement element = new FormElement();
                    element.Name = nameAttribute != null ? nameAttribute.Value : "";
                    element.Value = valueAttribute != null ? valueAttribute.Value : "";
                    // An input without a type attribute is a text field.
                    element.Type = typeAttribute != null ? typeAttribute.Value : "text";

                    form.FormElements.Add(element);
                }

                forms.Add(form);
            }

            _sharedResource.AddTotalFormsFound(forms.Count);

            OnCrawlAnnounced(new CrawlAnnounceItem(page, CrawlStatus.ExtractingFormsFinished, string.Format("این صفحه دارای {0} فرم می باشد.", forms.Count), DateTime.Now, _sharedResource));

            return forms.ToArray();
        }
コード例 #7
0
ファイル: CrawlerAgent.cs プロジェクト: mirsaeedi/xKnight
        /// <summary>
        /// URL suffixes that mark non-text content; such pages are never requested.
        /// </summary>
        private static readonly string[] NonTextExtensions =
        {
            ".jpg", ".jpeg", ".zip", ".rar", ".png", ".exe", ".gif",
            ".mp3", ".wma", ".pdf", ".wav", ".bmp", ".apk"
        };

        /// <summary>
        /// Downloads the HTML of a page and stores it in <c>page.Html</c>.
        /// </summary>
        /// <remarks>
        /// Announces <c>DownloadingStarted</c> first, then either <c>DownloadingFinished</c>
        /// (HTML stored) or <c>DownloadingHalted</c> (<c>page.Html</c> set to null).
        /// The download is halted when the URL ends with a known non-text extension, the URL
        /// cannot be turned into an HTTP request, the request fails, the response status is one
        /// of the rejected error codes, or the content type is not <c>text/html</c>.
        /// <c>page.DateTime</c> is set to the current time in every case.
        /// </remarks>
        /// <param name="page">Page to download; its <c>Url</c> is requested and its
        /// <c>Html</c> and <c>DateTime</c> are updated.</param>
        private void DownloadPage(Webpage page)
        {
            const string loadErrorDescription = "خطایی در حین بارگذاری صفحه رخ داد";

            OnCrawlAnnounced(new CrawlAnnounceItem(page, CrawlStatus.DownloadingStarted, null, DateTime.Now, _sharedResource));

            if (HasNonTextExtension(page.Url))
            {
                HaltDownload(page, "این آدرس محتوی متن نمی باشد.");
                return;
            }

            try
            {
                HttpWebRequest request = WebRequest.Create(page.Url) as HttpWebRequest;
                if (request == null)
                {
                    // WebRequest.Create returned a non-HTTP request (e.g. ftp:// or file://).
                    HaltDownload(page, loadErrorDescription);
                    return;
                }

                request.Timeout = 100000;
                request.UserAgent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)";
                request.AllowAutoRedirect = true;
                request.KeepAlive = false;

                using (HttpWebResponse response = request.GetResponse() as HttpWebResponse)
                {
                    // The page is accepted only when the status is none of the rejected codes
                    // AND the body is HTML. (Chaining the "!=" tests with "||" is always true.)
                    if (!IsRejectedStatus(response.StatusCode)
                        && response.ContentType.Contains("text/html"))
                    {
                        using (StreamReader sr = new StreamReader(response.GetResponseStream()))
                        {
                            page.Html = sr.ReadToEnd();
                        }

                        OnCrawlAnnounced(new CrawlAnnounceItem(page, CrawlStatus.DownloadingFinished, null, DateTime.Now, _sharedResource));
                    }
                    else
                    {
                        OnCrawlAnnounced(new CrawlAnnounceItem(page, CrawlStatus.DownloadingHalted, loadErrorDescription, DateTime.Now, _sharedResource));
                        page.Html = null;
                    }

                    page.DateTime = DateTime.Now;
                }
            }
            catch (WebException ex)
            {
                // Dispose the error response so its connection is released.
                using (HttpWebResponse response = ex.Response as HttpWebResponse)
                {
                    string description = response != null
                        ? response.StatusCode + " " + response.StatusDescription + loadErrorDescription
                        : loadErrorDescription;

                    HaltDownload(page, description);
                }
            }
            catch (NotSupportedException)
            {
                // URI scheme with no registered request handler (e.g. mailto:, javascript:).
                HaltDownload(page, loadErrorDescription);
            }
            catch (UriFormatException)
            {
                // The stored URL is not a valid URI.
                HaltDownload(page, loadErrorDescription);
            }
        }

        /// <summary>
        /// Marks the page as not downloaded and announces <c>DownloadingHalted</c>.
        /// </summary>
        /// <param name="page">Page whose <c>Html</c> is cleared and <c>DateTime</c> refreshed.</param>
        /// <param name="description">Text attached to the announcement.</param>
        private void HaltDownload(Webpage page, string description)
        {
            page.Html = null;
            page.DateTime = DateTime.Now;

            OnCrawlAnnounced(new CrawlAnnounceItem(page, CrawlStatus.DownloadingHalted, description, DateTime.Now, _sharedResource));
        }

        /// <summary>
        /// Tells whether the URL ends with one of <c>NonTextExtensions</c> (case-insensitive, ordinal).
        /// </summary>
        private static bool HasNonTextExtension(string url)
        {
            foreach (string extension in NonTextExtensions)
            {
                if (url.EndsWith(extension, StringComparison.OrdinalIgnoreCase))
                {
                    return true;
                }
            }

            return false;
        }

        /// <summary>
        /// Tells whether the status code is one of the error codes for which the body is discarded.
        /// </summary>
        private static bool IsRejectedStatus(HttpStatusCode status)
        {
            switch (status)
            {
                case HttpStatusCode.NotFound:
                case HttpStatusCode.BadGateway:
                case HttpStatusCode.BadRequest:
                case HttpStatusCode.Forbidden:
                case HttpStatusCode.GatewayTimeout:
                case HttpStatusCode.Gone:
                case HttpStatusCode.InternalServerError:
                case HttpStatusCode.NotAcceptable:
                    return true;
                default:
                    return false;
            }
        }