Пример #1
0
        /// <summary>
        /// Crawls a page: downloads it, indexes it unless its robots meta tag says "noindex",
        /// and then recursively crawls every link found on the page.
        /// </summary>
        /// <remarks>
        /// Crawling is best-effort. Any exception raised while processing a page is caught here so
        /// that a single bad page or malformed link does not stop the rest of the crawl.
        /// </remarks>
        /// <param name="url">The url to crawl.</param>
        private void CrawlPage(string url)
        {
            try
            {
                // Clean up the url a bit. This happens inside the try so that a link which cannot be
                // standardized does not throw into the caller's loop and abort its remaining links.
                url = StandardizeUrl(url);

                if (PageHasBeenCrawled(url) || !url.StartsWith(_baseUrl) || !_robotHelper.IsPathAllowed(_userAgent, url))
                {
                    return;
                }

                // Record the visit before fetching so pages that turn out to be empty, "noindex" or
                // broken are not downloaded again each time another page links to them.
                // NOTE(review): assumes PageHasBeenCrawled consults _previouslyCrawledPages - confirm.
                _previouslyCrawledPages.Add(url);

                string rawPage = GetWebText(url);
                if (string.IsNullOrWhiteSpace(rawPage))
                {
                    return;
                }

                var htmlDoc = new HtmlDocument();
                htmlDoc.LoadHtml(rawPage);

                // Ensure the page should be indexed by looking at the robots meta tag. The standard
                // tag name is "robots" and its directives are matched case-insensitively.
                HtmlNode metaRobot = htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='robots']");
                string robotDirectives = string.Empty;
                if (metaRobot != null && metaRobot.Attributes["content"] != null)
                {
                    robotDirectives = metaRobot.Attributes["content"].Value ?? string.Empty;
                }

                if (robotDirectives.IndexOf("noindex", System.StringComparison.OrdinalIgnoreCase) >= 0)
                {
                    return;
                }

                // index the page
                SitePageIndex sitePage = new SitePageIndex();

                sitePage.Content             = GetPageText(htmlDoc);
                sitePage.Url                 = url;
                sitePage.Id                  = url.MakeInt64HashCode();
                sitePage.SourceIndexModel    = "Rock.Model.Site";
                sitePage.PageTitle           = GetPageTitle(htmlDoc, url);
                sitePage.DocumentName        = sitePage.PageTitle;
                sitePage.SiteName            = _site.Name;
                sitePage.SiteId              = _site.Id;
                sitePage.LastIndexedDateTime = RockDateTime.Now;

                HtmlNode metaDescription = htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='description']");
                if (metaDescription != null && metaDescription.Attributes["content"] != null)
                {
                    sitePage.PageSummary = metaDescription.Attributes["content"].Value;
                }

                HtmlNode metaKeywords = htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='keywords']");
                if (metaKeywords != null && metaKeywords.Attributes["content"] != null)
                {
                    sitePage.PageKeywords = metaKeywords.Attributes["content"].Value;
                }

                IndexContainer.IndexDocument(sitePage);

                // crawl all the links found on the page.
                foreach (string link in ParseLinks(htmlDoc))
                {
                    CrawlPage(link);
                }
            }
            catch (System.Exception ex)
            {
                // Deliberately best-effort: keep crawling, but leave a trace instead of hiding the failure.
                System.Diagnostics.Debug.WriteLine(string.Format("CrawlPage failed for '{0}': {1}", url, ex.Message));
            }
        }
Пример #2
0
        /// <summary>
        /// Indexes the document.
        /// </summary>
        /// <remarks>
        /// Does nothing when no document with the given identifier is found.
        /// </remarks>
        /// <param name="id">The identifier of the document to load and index.</param>
        public void IndexDocument(int id)
        {
            // Dispose the context when done; all work that touches the entity stays inside the using block.
            using (var rockContext = new RockContext())
            {
                var documentEntity = new DocumentService(rockContext).Get(id);

                // The id may refer to a document that no longer exists.
                if (documentEntity == null)
                {
                    return;
                }

                var indexItem = DocumentIndex.LoadByModel(documentEntity);

                IndexContainer.IndexDocument(indexItem);
            }
        }
Пример #3
0
        /// <summary>
        /// Indexes the document.
        /// </summary>
        /// <remarks>
        /// The group is indexed only when it exists, is active, and its group type has indexing enabled.
        /// </remarks>
        /// <param name="id">The identifier of the group to load and index.</param>
        public void IndexDocument(int id)
        {
            // Dispose the context when done; all work that touches the entity stays inside the using block.
            using (var rockContext = new RockContext())
            {
                var groupEntity = new GroupService(rockContext).Get(id);

                // The id may refer to a group that no longer exists, or one without a loaded group type.
                if (groupEntity == null || groupEntity.GroupType == null)
                {
                    return;
                }

                // check that this group type is set to be indexed.
                if (groupEntity.GroupType.IsIndexEnabled && groupEntity.IsActive)
                {
                    var indexItem = GroupIndex.LoadByModel(groupEntity);
                    IndexContainer.IndexDocument(indexItem);
                }
            }
        }
Пример #4
0
        /// <summary>
        /// Indexes the event item with the given identifier, provided it exists and appears on at
        /// least one calendar that has indexing enabled.
        /// </summary>
        /// <param name="id">The identifier.</param>
        public void IndexDocument(int id)
        {
            var eventItemService = new EventItemService(new RockContext());
            var eventItem = eventItemService.Get(id);

            // Nothing to index when the event item cannot be found.
            if (eventItem == null)
            {
                return;
            }

            // Only event items on an indexed calendar are sent to the index.
            bool isOnIndexedCalendar = eventItem.EventCalendarItems.Any(calendarItem => calendarItem.EventCalendar.IsIndexEnabled);
            if (!isOnIndexedCalendar)
            {
                return;
            }

            IndexContainer.IndexDocument(EventItemIndex.LoadByModel(eventItem));
        }
Пример #5
0
        /// <summary>
        /// Indexes the document.
        /// </summary>
        /// <remarks>
        /// The item is indexed only when it exists, its content channel has indexing enabled, and
        /// one of the following holds: the channel does not require approval, the channel type
        /// disables status, or the item is approved.
        /// </remarks>
        /// <param name="id">The identifier of the content channel item to load and index.</param>
        public void IndexDocument(int id)
        {
            // Dispose the context when done; all work that touches the entity stays inside the using block.
            using (var rockContext = new RockContext())
            {
                var itemEntity = new ContentChannelItemService(rockContext).Get(id);

                // The id may refer to an item that no longer exists.
                if (itemEntity == null)
                {
                    return;
                }

                // only index if the content channel is set to be indexed
                var contentChannel = itemEntity.ContentChannel;
                if (contentChannel == null || !contentChannel.IsIndexEnabled)
                {
                    return;
                }

                // ensure it's meant to be indexed
                bool statusDisabled = contentChannel.ContentChannelType != null && contentChannel.ContentChannelType.DisableStatus;
                if (contentChannel.RequiresApproval == false || statusDisabled || itemEntity.Status == ContentChannelItemStatus.Approved)
                {
                    var indexItem = ContentChannelItemIndex.LoadByModel(itemEntity);
                    IndexContainer.IndexDocument(indexItem);
                }
            }
        }
Пример #6
0
        /// <summary>
        /// Crawls a page: downloads it, indexes it unless its robots meta tag says "noindex" or
        /// identical content was already indexed, and queues its links unless the tag says "nofollow".
        /// </summary>
        /// <remarks>
        /// Crawling is best-effort. Any exception raised while processing a page is caught here so
        /// that a single bad page does not stop the rest of the crawl.
        /// </remarks>
        /// <param name="url">The URL to crawl.</param>
        private void CrawlPage(string url)
        {
            try
            {
                // clean up the URL a bit
                url = StandardizeUrl(url);

                if (PageHasBeenCrawled(url))
                {
                    return;
                }

                _previouslyCrawledPages.Add(url);

                if (!url.StartsWith(_baseUrl, System.StringComparison.Ordinal))
                {
                    return;
                }

                // Strip only the leading base URL. Replace() would also remove later occurrences,
                // e.g. inside a query string, and hand the robots check a mangled path.
                string relativePath = url.Substring(_baseUrl.Length);
                if (!_robotHelper.IsPathAllowed(_userAgent, relativePath))
                {
                    return;
                }

                string rawPage = GetWebText(url);
                if (string.IsNullOrWhiteSpace(rawPage))
                {
                    return;
                }

                var htmlDoc = new HtmlDocument();
                htmlDoc.LoadHtml(rawPage);

                // Read the robots meta directives once; they are matched case-insensitively.
                HtmlNode metaRobot = htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='robots']");
                string robotDirectives = string.Empty;
                if (metaRobot != null && metaRobot.Attributes["content"] != null)
                {
                    robotDirectives = metaRobot.Attributes["content"].Value ?? string.Empty;
                }

                bool noIndex = robotDirectives.IndexOf("noindex", System.StringComparison.OrdinalIgnoreCase) >= 0;
                bool noFollow = robotDirectives.IndexOf("nofollow", System.StringComparison.OrdinalIgnoreCase) >= 0;

                if (!noIndex)
                {
                    // index the page
                    SitePageIndex sitePage = new SitePageIndex();

                    sitePage.Content             = GetPageText(htmlDoc);
                    sitePage.Url                 = url;
                    sitePage.Id                  = url.MakeInt64HashCode();
                    sitePage.SourceIndexModel    = "Rock.Model.Site";
                    sitePage.PageTitle           = GetPageTitle(htmlDoc, url);
                    sitePage.DocumentName        = sitePage.PageTitle;
                    sitePage.SiteName            = _site.Name;
                    sitePage.SiteId              = _site.Id;
                    sitePage.LastIndexedDateTime = RockDateTime.Now;

                    HtmlNode metaDescription = htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='description']");
                    if (metaDescription != null && metaDescription.Attributes["content"] != null)
                    {
                        sitePage.PageSummary = metaDescription.Attributes["content"].Value;
                    }

                    HtmlNode metaKeywords = htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='keywords']");
                    if (metaKeywords != null && metaKeywords.Attributes["content"] != null)
                    {
                        sitePage.PageKeywords = metaKeywords.Attributes["content"].Value;
                    }

                    // Hash the content and skip indexing when a page with the same content hash
                    // has already been indexed during this crawl.
                    long contentHash = sitePage.Content.MakeInt64HashCode();

                    if (!_pageHashes.Contains(contentHash))
                    {
                        IndexContainer.IndexDocument(sitePage);
                        _pageHashes.Add(contentHash);
                    }
                }

                if (!noFollow)
                {
                    // queue all the links found on the page for crawling.
                    foreach (string link in ParseLinks(htmlDoc))
                    {
                        _urlQueue.Enqueue(link);
                    }
                }
            }
            catch (System.Exception ex)
            {
                // Deliberately best-effort: keep crawling, but leave a trace instead of hiding the failure.
                System.Diagnostics.Debug.WriteLine(string.Format("CrawlPage failed for '{0}': {1}", url, ex.Message));
            }
        }