Example No. 1
        /// <summary>
        /// Crawls a page.
        /// </summary>
        /// <param name="url">The URL to crawl.</param>
        private void CrawlPage(string url)
        {
            // clean up the URL a bit
            url = StandardizeUrl(url);

            try
            {
                if (!PageHasBeenCrawled(url) && _robotHelper.IsPathAllowed(_userAgent, url) && url.StartsWith(_baseUrl))
                {
                    string rawPage = GetWebText(url);

                    if (!string.IsNullOrWhiteSpace(rawPage))
                    {
                        var htmlDoc = new HtmlDocument();
                        htmlDoc.LoadHtml(rawPage);

                        // ensure the page should be indexed by looking at the robots meta tag and Rock conventions
                        HtmlNode metaRobot = htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='robots']");
                        if (metaRobot == null || metaRobot.Attributes["content"] == null || !metaRobot.Attributes["content"].Value.Contains("noindex"))
                        {
                            _previouslyCrawledPages.Add(url);

                            // index the page
                            SitePageIndex sitePage = new SitePageIndex();

                            sitePage.Content             = GetPageText(htmlDoc);
                            sitePage.Url                 = url;
                            sitePage.Id                  = url.MakeInt64HashCode();
                            sitePage.SourceIndexModel    = "Rock.Model.Site";
                            sitePage.PageTitle           = GetPageTitle(htmlDoc, url);
                            sitePage.DocumentName        = sitePage.PageTitle;
                            sitePage.SiteName            = _site.Name;
                            sitePage.SiteId              = _site.Id;
                            sitePage.LastIndexedDateTime = RockDateTime.Now;

                            HtmlNode metaDescription = htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='description']");
                            if (metaDescription != null && metaDescription.Attributes["content"] != null)
                            {
                                sitePage.PageSummary = metaDescription.Attributes["content"].Value;
                            }

                            HtmlNode metaKeywords = htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='keywords']");
                            if (metaKeywords != null && metaKeywords.Attributes["content"] != null)
                            {
                                sitePage.PageKeywords = metaKeywords.Attributes["content"].Value;
                            }

                            IndexContainer.IndexDocument(sitePage);

                            // crawl all the links found on the page.
                            foreach (string link in ParseLinks(htmlDoc))
                            {
                                CrawlPage(link);
                            }
                        }
                    }
                }
            }
            catch { /* swallow per-page errors so a single bad page doesn't stop the crawl */ }
        }
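
This first version follows every link it finds by recursing directly into CrawlPage, and it only marks a URL as crawled after the robots check has passed. Both versions normalize the incoming URL with StandardizeUrl before checking for duplicates, but that helper is not shown on this page. Below is a minimal sketch of what such a normalizer might do, assuming it only needs to drop fragments, canonicalize the host, and trim trailing slashes so equivalent URLs compare equal; the real helper may apply different rules.

        /// <summary>
        /// Hypothetical sketch of a URL normalizer; the real StandardizeUrl used above is not shown here and may apply different rules.
        /// </summary>
        private string StandardizeUrl(string url)
        {
            if (string.IsNullOrWhiteSpace(url))
            {
                return url;
            }

            // drop any fragment so /page#section and /page are treated as the same document
            int hashIndex = url.IndexOf('#');
            if (hashIndex >= 0)
            {
                url = url.Substring(0, hashIndex);
            }

            // let Uri canonicalize the scheme and host casing when the URL is absolute
            if (Uri.TryCreate(url, UriKind.Absolute, out Uri uri))
            {
                url = uri.GetLeftPart(UriPartial.Query);
            }

            // treat /page and /page/ as the same page
            return url.TrimEnd('/');
        }

With rules like these, http://Example.com/page/#top and http://example.com/page reduce to the same string, so PageHasBeenCrawled can rely on an exact comparison.
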
Example No. 2
        /// <summary>
        /// Crawls a page.
        /// </summary>
        /// <param name="url">The URL to crawl.</param>
        private void CrawlPage(string url)
        {
            try
            {
                // clean up the URL a bit
                url = StandardizeUrl(url);

                if (!PageHasBeenCrawled(url))
                {
                    _previouslyCrawledPages.Add(url);

                    if (url.StartsWith(_baseUrl) && _robotHelper.IsPathAllowed(_userAgent, url.Replace(_baseUrl, "")))
                    {
                        string rawPage = GetWebText(url);

                        if (!string.IsNullOrWhiteSpace(rawPage))
                        {
                            var htmlDoc = new HtmlDocument();
                            htmlDoc.LoadHtml(rawPage);

                            // ensure the page should be indexed by looking at the robots meta tag and Rock conventions
                            HtmlNode metaRobot = htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='robots']");
                            if (metaRobot == null || metaRobot.Attributes["content"] == null || !metaRobot.Attributes["content"].Value.Contains("noindex"))
                            {
                                // index the page
                                SitePageIndex sitePage = new SitePageIndex();

                                sitePage.Content             = GetPageText(htmlDoc);
                                sitePage.Url                 = url;
                                sitePage.Id                  = url.MakeInt64HashCode();
                                sitePage.SourceIndexModel    = "Rock.Model.Site";
                                sitePage.PageTitle           = GetPageTitle(htmlDoc, url);
                                sitePage.DocumentName        = sitePage.PageTitle;
                                sitePage.SiteName            = _site.Name;
                                sitePage.SiteId              = _site.Id;
                                sitePage.LastIndexedDateTime = RockDateTime.Now;

                                HtmlNode metaDescription = htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='description']");
                                if (metaDescription != null && metaDescription.Attributes["content"] != null)
                                {
                                    sitePage.PageSummary = metaDescription.Attributes["content"].Value;
                                }

                                HtmlNode metaKeywords = htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='keywords']");
                                if (metaKeywords != null && metaKeywords.Attributes["content"] != null)
                                {
                                    sitePage.PageKeywords = metaKeywords.Attributes["content"].Value;
                                }

                                // Hash the content and check it against the list of already-indexed pages; only index the page (and record its hash) if the content hasn't been seen before.
                                long contentHash = sitePage.Content.MakeInt64HashCode();

                                if (!_pageHashes.Contains(contentHash))
                                {
                                    IndexContainer.IndexDocument(sitePage);
                                    _pageHashes.Add(contentHash);
                                }
                            }

                            if (metaRobot == null || metaRobot.Attributes["content"] == null || !metaRobot.Attributes["content"].Value.Contains("nofollow"))
                            {
                                // crawl all the links found on the page.
                                var links = ParseLinks(htmlDoc);

                                foreach (string link in links)
                                {
                                    _urlQueue.Enqueue(link);
                                }
                            }
                        }
                    }
                }
            }
            catch { /* swallow per-page errors so a single bad page doesn't stop the crawl */ }
        }
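
This second version differs from the first in a few ways: the URL is recorded as crawled before it is fetched, the robots.txt check runs against the path relative to _baseUrl, a content hash stored in _pageHashes prevents the same content from being indexed twice under different URLs, the nofollow directive is honored separately from noindex, and discovered links are enqueued on _urlQueue instead of being crawled recursively. The queue implies an outer loop that drains it; here is a minimal sketch of such a driver, assuming _urlQueue is a Queue<string> and the fields used above already exist (the actual entry point in Rock's crawler may look different).

        /// <summary>
        /// Hypothetical driver for the queue-based crawl above; the method name and signature are assumptions, not Rock's actual entry point.
        /// </summary>
        private void CrawlSite(string startUrl)
        {
            _urlQueue.Enqueue(startUrl);

            // drain the queue breadth-first; CrawlPage enqueues any new links it discovers
            while (_urlQueue.Count > 0)
            {
                CrawlPage(_urlQueue.Dequeue());
            }
        }

Driving the crawl from a queue keeps the call stack shallow no matter how deeply pages link to one another, which is the main practical gain over the recursive version.
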