Ejemplo n.º 1
0
 /// <summary>
 /// Raises the <c>CrawlAnnounced</c> event for the given announcement.
 /// </summary>
 /// <param name="item">The announcement wrapped into the event arguments.</param>
 private void OnCrawlAnnounced(CrawlAnnounceItem item)
 {
     // Snapshot the delegate first: checking the event field and then invoking it
     // again can throw NullReferenceException if the last subscriber detaches on
     // another thread between the two reads.
     var handler = CrawlAnnounced;

     if (handler != null)
     {
         handler(this, new CrawlAnnouncedEventArgs(item));
     }
 }
Ejemplo n.º 2
0
        /// <summary>
        /// Writes a one-line trace of the announcement to the console and then raises
        /// the <c>CrawlAnnounced</c> event.
        /// </summary>
        /// <param name="item">The announcement to trace and to wrap into the event arguments.</param>
        private void OnCrawlAnnounced(CrawlAnnounceItem item)
        {
            // Trace format: url-status-description-time; a missing description prints as empty.
            Console.WriteLine(String.Format("{0}-{1}-{2}-{3}",
                                            item.Page.Url,
                                            item.CrawlStatus.ToString(),
                                            item.Description ?? "",
                                            item.DateTime.ToShortTimeString()));

            // Snapshot the delegate first: checking the event field and then invoking it
            // again can throw NullReferenceException if the last subscriber detaches on
            // another thread between the two reads.
            var handler = CrawlAnnounced;

            if (handler != null)
            {
                handler(this, new CrawlAnnouncedEventArgs(item));
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Collects the hyperlinks (<c>&lt;a href="..."&gt;</c>) found in the HTML of
        /// <paramref name="parent"/> and returns them as child pages.
        /// </summary>
        /// <remarks>
        /// Announces <c>ExtractingLinksStarted</c> before parsing and
        /// <c>ExtractingLinksFinished</c> afterwards, and adds the number of links found
        /// to the shared resource counter. Links whose URL carries a fragment
        /// (<c>#...</c>) and links that cannot be turned into an absolute URL are skipped.
        /// </remarks>
        /// <param name="parent">The page whose <c>Html</c> is scanned; its <c>Id</c>,
        /// <c>Depth</c> and <c>HostId</c> are propagated to the children.</param>
        /// <returns>One <c>Webpage</c> per accepted link, at depth <c>parent.Depth + 1</c>.</returns>
        private Webpage[] ExtractLinks(Webpage parent)
        {
            OnCrawlAnnounced(new CrawlAnnounceItem(parent, CrawlStatus.ExtractingLinksStarted, null, DateTime.Now, _sharedResource));

            List<Webpage> webpages = new List<Webpage>();

            // Parser options are consulted while the document is being parsed, so the
            // flag has to be set before LoadHtml; setting it afterwards has no effect.
            HtmlDocument htmlDocument = new HtmlDocument();
            htmlDocument.OptionFixNestedTags = true;
            htmlDocument.LoadHtml(parent.Html);

            foreach (HtmlNode link in htmlDocument.DocumentNode.Descendants("a")
                     .Where(d => d.Attributes.Contains("href")))
            {
                Webpage page = new Webpage();
                page.Depth  = parent.Depth + 1;
                page.HostId = parent.HostId;

                string uri = link.Attributes["href"].Value;

                if (Uri.IsWellFormedUriString(uri, UriKind.Absolute))
                {
                    page.Url = uri;
                }
                else if (Uri.IsWellFormedUriString(uri, UriKind.Relative))
                {
                    page.Url = UnifyUri(parent, uri);
                }

                // TryCreate instead of "new Uri": a single link that does not resolve to
                // an absolute URL is skipped rather than aborting the whole extraction.
                Uri href;
                if (page.Url == null || !Uri.TryCreate(page.Url, UriKind.Absolute, out href))
                {
                    continue;
                }

                // Links with a fragment are skipped.
                if (!string.IsNullOrEmpty(href.Fragment))
                {
                    continue;
                }

                // Invariant lowering keeps the result independent of the thread culture
                // (e.g. the Turkish dotless i).
                // NOTE(review): lowering the whole URL also lowers path and query, which
                // are case-sensitive on many servers - confirm this is intended.
                page.Url       = page.Url.ToLowerInvariant();
                page.RefererId = parent.Id;

                webpages.Add(page);
            }

            _sharedResource.AddTotalLinksFound(webpages.Count);

            OnCrawlAnnounced(new CrawlAnnounceItem(parent, CrawlStatus.ExtractingLinksFinished, string.Format("این صفحه دارای {0} لینک می باشد.", webpages.Count), DateTime.Now, _sharedResource));

            return webpages.ToArray();
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Collects the <c>&lt;form&gt;</c> elements found in the HTML of
        /// <paramref name="page"/>, together with their <c>&lt;input&gt;</c> fields.
        /// </summary>
        /// <remarks>
        /// Announces <c>ExtractingFormsStarted</c> before parsing and
        /// <c>ExtractingFormsFinished</c> afterwards, and adds the number of forms found
        /// to the shared resource counter. A form whose action cannot be resolved to a
        /// URL is skipped. Missing attributes fall back to defaults: <c>method</c> to
        /// "get", <c>type</c> to "text", <c>name</c> and <c>value</c> to the empty string.
        /// </remarks>
        /// <param name="page">The page whose <c>Html</c> is scanned; its <c>Url</c> is
        /// used as the action for forms that post back to themselves.</param>
        /// <returns>One <c>Form</c> per form element with a resolvable action.</returns>
        private Form[] ExtractForms(Webpage page)
        {
            OnCrawlAnnounced(new CrawlAnnounceItem(page, CrawlStatus.ExtractingFormsStarted, null, DateTime.Now, _sharedResource));

            List<Form> forms = new List<Form>();

            // NOTE(review): presumably drops HtmlAgilityPack's special handling of <form>
            // so that inputs are parsed as descendants of the form node - confirm.
            // This mutates process-wide static parser state.
            HtmlNode.ElementsFlags.Remove("form");

            HtmlDocument htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(page.Html);

            foreach (HtmlNode formNode in htmlDocument.DocumentNode.Descendants("form"))
            {
                Form form = new Form();

                // A missing, empty or fragment-only action means the form submits to
                // the page itself.
                HtmlAttribute actionAttribute = formNode.Attributes["action"];
                bool postsToSelf = actionAttribute == null
                                   || actionAttribute.Value == ""
                                   || actionAttribute.Value.StartsWith("#", StringComparison.Ordinal);
                string uri = postsToSelf ? page.Url : actionAttribute.Value;

                if (Uri.IsWellFormedUriString(uri, UriKind.Absolute))
                {
                    form.Action = uri;
                }
                else if (Uri.IsWellFormedUriString(uri, UriKind.Relative))
                {
                    form.Action = UnifyUri(page, uri);
                }

                // "method" is optional in HTML and defaults to GET; dereferencing the
                // attribute unconditionally crashed on such forms.
                HtmlAttribute methodAttribute = formNode.Attributes["method"];
                form.Method = methodAttribute != null ? methodAttribute.Value : "get";

                if (form.Action == null)
                {
                    continue;
                }

                foreach (HtmlNode inputNode in formNode.Descendants("input"))
                {
                    HtmlAttribute nameAttribute  = inputNode.Attributes["name"];
                    HtmlAttribute valueAttribute = inputNode.Attributes["value"];
                    HtmlAttribute typeAttribute  = inputNode.Attributes["type"];

                    FormElement element = new FormElement();
                    element.Name  = nameAttribute != null ? nameAttribute.Value : "";
                    element.Value = valueAttribute != null ? valueAttribute.Value : "";
                    // "type" is optional in HTML and defaults to a text field.
                    element.Type  = typeAttribute != null ? typeAttribute.Value : "text";

                    form.FormElements.Add(element);
                }

                forms.Add(form);
            }

            _sharedResource.AddTotalFormsFound(forms.Count);

            OnCrawlAnnounced(new CrawlAnnounceItem(page, CrawlStatus.ExtractingFormsFinished, string.Format("این صفحه دارای {0} فرم می باشد.", forms.Count), DateTime.Now, _sharedResource));

            return forms.ToArray();
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Downloads the HTML content of <paramref name="page"/> and stores it in
        /// <c>page.Html</c>.
        /// </summary>
        /// <remarks>
        /// Announces <c>DownloadingStarted</c>, then either <c>DownloadingFinished</c>
        /// (HTML stored) or <c>DownloadingHalted</c> (<c>page.Html</c> set to null).
        /// The download is halted when the URL ends with a known binary file extension,
        /// when the URL cannot be requested over HTTP(S), when the response is an error
        /// status or not <c>text/html</c>, or when the request fails with a
        /// <see cref="WebException"/>. <c>page.DateTime</c> is stamped in every case.
        /// </remarks>
        /// <param name="page">The page to download; its <c>Url</c> is requested and its
        /// <c>Html</c> and <c>DateTime</c> are updated.</param>
        private void DownloadPage(Webpage page)
        {
            OnCrawlAnnounced(new CrawlAnnounceItem(page, CrawlStatus.DownloadingStarted, null, DateTime.Now, _sharedResource));

            // URLs with these extensions are assumed to be binary and are never requested.
            string[] skippedExtensions =
            {
                ".jpg", ".jpeg", ".zip", ".rar", ".png", ".exe", ".gif",
                ".mp3", ".wma", ".pdf", ".wav", ".bmp", ".apk"
            };

            if (Array.Exists(skippedExtensions, ext => page.Url.EndsWith(ext, StringComparison.OrdinalIgnoreCase)))
            {
                page.Html     = null;
                page.DateTime = DateTime.Now;

                OnCrawlAnnounced(new CrawlAnnounceItem(page, CrawlStatus.DownloadingHalted, "این آدرس محتوی متن نمی باشد.", DateTime.Now, _sharedResource));

                return;
            }

            // WebRequest.Create throws for malformed URLs and unregistered schemes
            // (e.g. mailto:) and returns a non-HTTP request for schemes such as ftp:,
            // in which case the "as" cast yields null. All of these halt the download.
            HttpWebRequest request;
            try
            {
                request = WebRequest.Create(page.Url) as HttpWebRequest;
            }
            catch (NotSupportedException)
            {
                request = null;
            }
            catch (UriFormatException)
            {
                request = null;
            }

            if (request == null)
            {
                page.Html     = null;
                page.DateTime = DateTime.Now;

                OnCrawlAnnounced(new CrawlAnnounceItem(page, CrawlStatus.DownloadingHalted, "خطایی در حین بارگذاری صفحه رخ داد", DateTime.Now, _sharedResource));

                return;
            }

            try
            {
                request.Timeout           = 100000;
                request.UserAgent         = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)";
                request.AllowAutoRedirect = true;
                request.KeepAlive         = false;

                using (HttpWebResponse response = request.GetResponse() as HttpWebResponse)
                {
                    // The status must be none of these error codes. The previous test
                    // OR-ed the inequalities together, which is true for every status.
                    bool isErrorStatus =
                        response.StatusCode == HttpStatusCode.NotFound ||
                        response.StatusCode == HttpStatusCode.BadGateway ||
                        response.StatusCode == HttpStatusCode.BadRequest ||
                        response.StatusCode == HttpStatusCode.Forbidden ||
                        response.StatusCode == HttpStatusCode.GatewayTimeout ||
                        response.StatusCode == HttpStatusCode.Gone ||
                        response.StatusCode == HttpStatusCode.InternalServerError ||
                        response.StatusCode == HttpStatusCode.NotAcceptable;

                    if (!isErrorStatus && response.ContentType.Contains("text/html"))
                    {
                        using (StreamReader reader = new StreamReader(response.GetResponseStream()))
                        {
                            page.Html = reader.ReadToEnd();
                        }

                        OnCrawlAnnounced(new CrawlAnnounceItem(page, CrawlStatus.DownloadingFinished, null, DateTime.Now, _sharedResource));
                    }
                    else
                    {
                        OnCrawlAnnounced(new CrawlAnnounceItem(page, CrawlStatus.DownloadingHalted, "خطایی در حین بارگذاری صفحه رخ داد", DateTime.Now, _sharedResource));
                        page.Html = null;
                    }

                    page.DateTime = DateTime.Now;
                }
            }
            catch (WebException ex)
            {
                page.Html     = null;
                page.DateTime = DateTime.Now;

                string description = "خطایی در حین بارگذاری صفحه رخ داد";

                // The error response holds a connection and must be disposed as well.
                using (WebResponse errorResponse = ex.Response)
                {
                    HttpWebResponse httpError = errorResponse as HttpWebResponse;

                    if (httpError != null)
                    {
                        description = httpError.StatusCode + " " + httpError.StatusDescription + description;
                    }
                }

                OnCrawlAnnounced(new CrawlAnnounceItem(page, CrawlStatus.DownloadingHalted, description, DateTime.Now, _sharedResource));
            }
        }