Example #1
0
 /// <summary>
 /// Initializes an announce item by storing each supplied argument in the member of the same name.
 /// </summary>
 /// <param name="page">Value stored in <c>Page</c>.</param>
 /// <param name="crawlStatus">Value stored in <c>CrawlStatus</c>.</param>
 /// <param name="description">Value stored in <c>Description</c>; callers in this file pass null when there is no message.</param>
 /// <param name="dateTime">Value stored in <c>DateTime</c>.</param>
 /// <param name="crawlingSharedResource">Value stored in <c>CrawlingSharedResource</c>.</param>
 public CrawlAnnounceItem(Webpage page, CrawlStatus crawlStatus, string description, DateTime dateTime, CrawlingSharedResource crawlingSharedResource)
 {
     // Assignments follow the parameter order; they are independent of one another.
     this.Page = page;
     this.CrawlStatus = crawlStatus;
     this.Description = description;
     this.DateTime = dateTime;
     this.CrawlingSharedResource = crawlingSharedResource;
 }
Example #2
0
 /// <summary>
 /// Persists a page: inserts it when its Id is 0, otherwise attaches it and
 /// writes it back as a modified entity.
 /// </summary>
 /// <param name="page">The page to insert or update.</param>
 public static void Save(Webpage page)
 {
     // "using" guarantees the context is disposed even when attaching or saving throws.
     using (xKnightEntities context = new xKnightEntities())
     {
         if (page.Id == 0)
         {
             // An Id of 0 is treated as "not stored yet": queue an insert.
             context.Webpages.AddObject(page);
         }
         else
         {
             // Existing row: attach and flag as modified so an update is issued for it.
             context.Webpages.Attach(page);
             context.ObjectStateManager.ChangeObjectState(page, System.Data.EntityState.Modified);
         }

         // Flush in both cases; previously the update branch never called SaveChanges,
         // so modifications were silently discarded when the context was disposed.
         context.SaveChanges();
     }
 }
 /// <summary>
 /// Deprecated way of adding a new object to the Webpages EntitySet; prefer the .Add method of the associated ObjectSet&lt;T&gt; property.
 /// </summary>
 /// <param name="webpage">The entity handed to the base AddObject call.</param>
 public void AddToWebpages(Webpage webpage)
 {
     // Name of the entity set the object is added to.
     const string entitySetName = "Webpages";
     base.AddObject(entitySetName, webpage);
 }
 /// <summary>
 /// Factory that builds a Webpage and fills in the five supplied property values.
 /// </summary>
 /// <param name="id">Value assigned to the Id property.</param>
 /// <param name="hostId">Value assigned to the HostId property.</param>
 /// <param name="url">Value assigned to the Url property.</param>
 /// <param name="depth">Value assigned to the Depth property.</param>
 /// <param name="dateTime">Value assigned to the DateTime property.</param>
 /// <returns>The newly created Webpage instance.</returns>
 public static Webpage CreateWebpage(global::System.Int32 id, global::System.Int32 hostId, global::System.String url, global::System.Int32 depth, global::System.DateTime dateTime)
 {
     // The initializer assigns the properties in the order listed.
     return new Webpage
     {
         Id = id,
         HostId = hostId,
         Url = url,
         Depth = depth,
         DateTime = dateTime
     };
 }
Example #5
0
        /// <summary>
        /// Collects the links (<c>a</c> elements that carry an <c>href</c>) of a downloaded page
        /// and turns each usable one into a child <c>Webpage</c>.
        /// </summary>
        /// <remarks>
        /// Announces ExtractingLinksStarted before and ExtractingLinksFinished after the scan, and
        /// reports the number of links found to the shared resource. Links with a fragment and
        /// links that cannot be resolved to an absolute URI are skipped.
        /// </remarks>
        /// <param name="parent">The page whose Html is scanned; its Id, Depth, HostId and Url seed the children.</param>
        /// <returns>The extracted pages; an empty array when the page has no Html or no usable links.</returns>
        private Webpage[] ExtractLinks(Webpage parent)
        {
            CrawlAnnounceItem item = new CrawlAnnounceItem(parent, CrawlStatus.ExtractingLinksStarted, null, DateTime.Now, _sharedResource);
            OnCrawlAnnounced(item);

            List<Webpage> links = new List<Webpage>();

            // DownloadPage leaves Html null for failed or non-text downloads; LoadHtml would throw on null.
            if (parent.Html != null)
            {
                HtmlDocument htmlDocument = new HtmlDocument();
                // Parser options have to be configured before the markup is loaded to take effect.
                htmlDocument.OptionFixNestedTags = true;
                htmlDocument.LoadHtml(parent.Html);

                foreach (HtmlNode link in htmlDocument.DocumentNode.Descendants("a")
                    .Where(d => d.Attributes.Contains("href")))
                {
                    string uri = link.Attributes["href"].Value;

                    // Resolve to an absolute address; anything else is ignored.
                    string url = null;
                    if (Uri.IsWellFormedUriString(uri, UriKind.Absolute))
                        url = uri;
                    else if (Uri.IsWellFormedUriString(uri, UriKind.Relative))
                        url = UnifyUri(parent, uri);

                    Uri href;
                    if (url == null || !Uri.TryCreate(url, UriKind.Absolute, out href))
                        continue;

                    // Links carrying a fragment ("#...") are not queued.
                    if (!string.IsNullOrEmpty(href.Fragment))
                        continue;

                    Webpage page = new Webpage();
                    // Invariant lower-casing keeps the stored URL independent of the thread culture.
                    // NOTE(review): lower-casing the whole URL also alters case-sensitive paths and
                    // queries; kept because the result is used as the de-duplication key downstream.
                    page.Url = url.ToLowerInvariant();
                    page.RefererId = parent.Id;
                    page.Depth = parent.Depth + 1;
                    page.HostId = parent.HostId;

                    links.Add(page);
                }
            }

            _sharedResource.AddTotalLinksFound(links.Count);

            item = new CrawlAnnounceItem(parent, CrawlStatus.ExtractingLinksFinished, string.Format("این صفحه دارای {0} لینک می باشد.", links.Count), DateTime.Now,_sharedResource);
            OnCrawlAnnounced(item);

            return links.ToArray();
        }
Example #6
0
 /// <summary>
 /// Combines a relative URI with the URL of the page it was found on.
 /// </summary>
 /// <param name="basePage">Page whose Url is used as the base address.</param>
 /// <param name="relativeUri">The relative reference to resolve.</param>
 /// <returns>The combined URI as a string, or null when anything goes wrong while combining.</returns>
 private string UnifyUri(Webpage basePage,string relativeUri)
 {
     string resolved;

     try
     {
         Uri baseAddress = new Uri(basePage.Url);
         Uri combined = new Uri(baseAddress, relativeUri);
         resolved = combined.ToString();
     }
     catch
     {
         // Best effort: every failure is reported to the caller as "no URL".
         resolved = null;
     }

     return resolved;
 }
Example #7
0
        /// <summary>
        /// Collects the <c>form</c> elements of a downloaded page together with their <c>input</c> fields.
        /// </summary>
        /// <remarks>
        /// Announces ExtractingFormsStarted before and ExtractingFormsFinished after the scan, and
        /// reports the number of forms found to the shared resource. A form whose action cannot be
        /// resolved to an absolute URI is skipped.
        /// </remarks>
        /// <param name="page">The page whose Html is scanned; its Url is the fallback form action.</param>
        /// <returns>The extracted forms; an empty array when the page has no Html or no usable forms.</returns>
        private Form[] ExtractForms(Webpage page)
        {
            CrawlAnnounceItem item = new CrawlAnnounceItem(page, CrawlStatus.ExtractingFormsStarted, null, DateTime.Now, _sharedResource);
            OnCrawlAnnounced(item);

            List<Form> forms = new List<Form>();

            // NOTE(review): presumably removes the parser's special handling of <form> so that the
            // inputs show up as descendants of the form node - confirm against HtmlAgilityPack docs.
            HtmlNode.ElementsFlags.Remove("form");

            // DownloadPage leaves Html null for failed or non-text downloads; LoadHtml would throw on null.
            if (page.Html != null)
            {
                HtmlDocument htmlDocument = new HtmlDocument();
                htmlDocument.LoadHtml(page.Html);

                foreach (HtmlNode formNode in htmlDocument.DocumentNode.Descendants("form"))
                {
                    Form form = new Form();

                    // A missing, empty or fragment-only action posts back to the page itself.
                    string action = ReadAttribute(formNode, "action", "");
                    string uri = (action == "" || action.StartsWith("#", StringComparison.Ordinal)) ? page.Url : action;

                    if (Uri.IsWellFormedUriString(uri, UriKind.Absolute))
                        form.Action = uri;
                    else if (Uri.IsWellFormedUriString(uri, UriKind.Relative))
                        form.Action = UnifyUri(page, uri);

                    // "method" is optional in HTML and defaults to GET; reading it blindly threw
                    // a NullReferenceException for forms that omit it.
                    form.Method = ReadAttribute(formNode, "method", "get");

                    if (form.Action == null)
                        continue;

                    foreach (HtmlNode inputNode in formNode.Descendants("input"))
                    {
                        FormElement element = new FormElement();
                        element.Name = ReadAttribute(inputNode, "name", "");
                        element.Value = ReadAttribute(inputNode, "value", "");
                        // "type" is optional as well; HTML treats a missing type as a text field.
                        element.Type = ReadAttribute(inputNode, "type", "text");

                        form.FormElements.Add(element);
                    }

                    forms.Add(form);
                }
            }

            _sharedResource.AddTotalFormsFound(forms.Count);

            item = new CrawlAnnounceItem(page, CrawlStatus.ExtractingFormsFinished, string.Format("این صفحه دارای {0} فرم می باشد.", forms.Count), DateTime.Now, _sharedResource);
            OnCrawlAnnounced(item);

            return forms.ToArray();
        }

        // Returns the value of the named attribute, or the fallback when the attribute (or its value) is missing.
        private static string ReadAttribute(HtmlNode node, string name, string fallback)
        {
            HtmlAttribute attribute = node.Attributes[name];
            if (attribute == null || attribute.Value == null)
                return fallback;

            return attribute.Value;
        }
Example #8
0
        /// <summary>
        /// Downloads the HTML of a page and stores it in the page's Html property.
        /// </summary>
        /// <remarks>
        /// Html is set to null when the URL ends in a known non-text extension, when the request
        /// is not an HTTP request, when the response is an error or is not "text/html", or when a
        /// WebException occurs. The page's DateTime is stamped in every case, and progress is
        /// reported through DownloadingStarted / DownloadingFinished / DownloadingHalted announcements.
        /// </remarks>
        /// <param name="page">The page to download; its Url is requested and its Html and DateTime are updated.</param>
        private void DownloadPage(Webpage page)
        {
            CrawlAnnounceItem item = new CrawlAnnounceItem(page, CrawlStatus.DownloadingStarted, null, DateTime.Now, _sharedResource);
            OnCrawlAnnounced(item);

            if (HasNonTextExtension(page.Url))
            {
                page.Html = null;
                page.DateTime = DateTime.Now;

                item = new CrawlAnnounceItem(page, CrawlStatus.DownloadingHalted, "این آدرس محتوی متن نمی باشد.", DateTime.Now, _sharedResource);
                OnCrawlAnnounced(item);

                return;
            }

            try
            {
                HttpWebRequest request = WebRequest.Create(page.Url) as HttpWebRequest;
                if (request == null)
                {
                    // WebRequest.Create produced a non-HTTP request (e.g. ftp: or file:); nothing to download.
                    page.Html = null;
                    page.DateTime = DateTime.Now;

                    item = new CrawlAnnounceItem(page, CrawlStatus.DownloadingHalted, "خطایی در حین بارگذاری صفحه رخ داد", DateTime.Now, _sharedResource);
                    OnCrawlAnnounced(item);

                    return;
                }

                request.Timeout = 100000;
                request.UserAgent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)";
                request.AllowAutoRedirect = true;
                request.KeepAlive = false;

                using (HttpWebResponse response = request.GetResponse() as HttpWebResponse)
                {
                    // Accept only non-error HTML responses. The previous check OR-ed "!=" comparisons
                    // and was therefore always true, so the status code was never actually tested.
                    if (!IsErrorStatus(response.StatusCode)
                        && response.ContentType.Contains("text/html"))
                    {
                        using (StreamReader sr = new StreamReader(response.GetResponseStream()))
                            page.Html = sr.ReadToEnd();

                        item = new CrawlAnnounceItem(page, CrawlStatus.DownloadingFinished, null, DateTime.Now, _sharedResource);
                        OnCrawlAnnounced(item);
                    }
                    else
                    {
                        item = new CrawlAnnounceItem(page, CrawlStatus.DownloadingHalted, "خطایی در حین بارگذاری صفحه رخ داد", DateTime.Now, _sharedResource);
                        OnCrawlAnnounced(item);
                        page.Html = null;
                    }

                    page.DateTime = DateTime.Now;
                }
            }
            catch(WebException ex)
            {
                page.Html = null;
                page.DateTime = DateTime.Now;

                // The response attached to the exception holds a connection; dispose it once read.
                using (HttpWebResponse response = ex.Response as HttpWebResponse)
                {
                    if(response!=null)
                        item = new CrawlAnnounceItem(page, CrawlStatus.DownloadingHalted,response.StatusCode +" "+ response.StatusDescription + "خطایی در حین بارگذاری صفحه رخ داد", DateTime.Now, _sharedResource);
                    else
                        item = new CrawlAnnounceItem(page, CrawlStatus.DownloadingHalted,"خطایی در حین بارگذاری صفحه رخ داد", DateTime.Now, _sharedResource);
                }

                OnCrawlAnnounced(item);
            }
        }

        // URL suffixes that are skipped without issuing a request.
        private static readonly string[] NonTextExtensions =
        {
            ".jpg", ".jpeg", ".zip", ".rar", ".png", ".exe", ".gif",
            ".mp3", ".wma", ".pdf", ".wav", ".bmp", ".apk"
        };

        // True when the URL ends with one of the skipped suffixes (ordinal, case-insensitive).
        private static bool HasNonTextExtension(string url)
        {
            foreach (string extension in NonTextExtensions)
            {
                if (url.EndsWith(extension, StringComparison.OrdinalIgnoreCase))
                    return true;
            }

            return false;
        }

        // True for the HTTP status codes whose response body must not be stored.
        private static bool IsErrorStatus(HttpStatusCode statusCode)
        {
            switch (statusCode)
            {
                case HttpStatusCode.NotFound:
                case HttpStatusCode.BadGateway:
                case HttpStatusCode.BadRequest:
                case HttpStatusCode.Forbidden:
                case HttpStatusCode.GatewayTimeout:
                case HttpStatusCode.Gone:
                case HttpStatusCode.InternalServerError:
                case HttpStatusCode.NotAcceptable:
                    return true;
                default:
                    return false;
            }
        }
Example #9
0
        /// <summary>
        /// Enqueues the given pages for crawling when they belong to the crawled host, have not
        /// been seen before and the parent is still above the configured maximum depth.
        /// </summary>
        /// <param name="parent">The page the links were found on; its Depth is compared with MaxDepth.</param>
        /// <param name="pages">Candidate pages; null is accepted and ignored.</param>
        private void AddPagesToQueue(Webpage parent, Webpage[] pages)
        {
            if (pages == null)
                return;

            // The depth limit does not depend on the individual page, so test it once up front.
            if (!(_sharedResource.CrawlerSetting.MaxDepth > parent.Depth))
                return;

            // Built on first use so the host name is only parsed when a candidate needs it.
            Uri hostUri = null;

            for (int i = 0; i < pages.Length; i++)
            {
                string url = pages[i].Url;

                if (!Uri.IsWellFormedUriString(url, UriKind.Absolute))
                    continue;

                if (hostUri == null)
                    hostUri = new Uri(_sharedResource.Host.HostName);

                // Only pages on the crawled host are followed.
                if (new Uri(url).Host != hostUri.Host)
                    continue;

                lock (_sharedResource.SharedLock)
                {
                    // Both lookups run under the lock: the queue is modified under this same lock,
                    // so enumerating it outside would race with concurrent enqueues.
                    if (!_sharedResource.SharedPageHash.Contains(url)
                        && !_sharedResource.SharedQueue.Any(p => p.Url == url))
                    {
                        _sharedResource.SharedPageHash.Add(url);
                        _sharedResource.SharedQueue.Enqueue(pages[i]);
                    }
                }
            }
        }
Example #10
0
        /// <summary>
        /// Adds to the page every form that has not been recorded before.
        /// </summary>
        /// <remarks>
        /// A form is identified by its action, method and the name/value pairs of its elements,
        /// joined with ':'. Identifiers already present in the shared form hash are reported on
        /// the console as duplicates and skipped.
        /// </remarks>
        /// <param name="page">The page that receives the new forms.</param>
        /// <param name="forms">Forms extracted from the page; null is accepted and ignored.</param>
        private void AddFormsToPage(Webpage page, Form[] forms)
        {
            if (forms == null)
                return;

            lock (_sharedResource.SharedLock)
            {
                foreach (Form form in forms)
                {
                    // Build the identifier in one pass instead of repeated ElementAt lookups
                    // and string concatenation.
                    StringBuilder key = new StringBuilder();
                    key.Append(form.Action).Append(':').Append(form.Method);

                    foreach (FormElement element in form.FormElements)
                        key.Append(':').Append(element.Name).Append(':').Append(element.Value);

                    string id = key.ToString();

                    if (!_sharedResource.SharedFormHash.Contains(id))
                    {
                        page.Forms.Add(form);
                        _sharedResource.SharedFormHash.Add(id);
                    }
                    else
                        Console.WriteLine("Duplicated Form");
                }
            }
        }
Example #11
0
        /// <summary>
        /// Seeds the crawl queue with the host's own address as a depth-0 page, unless that
        /// address is already recorded in the shared page hash.
        /// </summary>
        private void AddDomainPageToQueue()
        {
            lock (_sharedResource.SharedLock)
            {
                bool alreadyKnown = _sharedResource.SharedPageHash.Contains(_sharedResource.Host.HostName);
                if (alreadyKnown)
                    return;

                Webpage rootPage = new Webpage
                {
                    Depth = 0,
                    Url = _sharedResource.Host.HostName,
                    HostId = _sharedResource.Host.Id
                };

                // Count the root page as one found link, then record and enqueue it.
                _sharedResource.AddTotalLinksFound(1);
                _sharedResource.SharedPageHash.Add(rootPage.Url);
                _sharedResource.SharedQueue.Enqueue(rootPage);
            }
        }