/// <summary>
/// Crawls a page: downloads it, indexes it unless its robots meta tag contains
/// <c>noindex</c>, and then recursively crawls every link found on the indexed page.
/// </summary>
/// <remarks>
/// Crawling is best-effort. Any exception raised while processing a page is written to the
/// debug output and otherwise ignored so that one bad page cannot stop the crawl.
/// </remarks>
/// <param name="url">The url to crawl.</param>
private void CrawlPage(string url)
{
    try
    {
        // Clean up the url a bit. This happens inside the try so that a malformed link cannot
        // throw into the caller's link loop and make it skip the remaining links of its page.
        url = StandardizeUrl(url);

        if (PageHasBeenCrawled(url))
        {
            return;
        }

        // Record the visit before doing any work so pages that end up not being indexed
        // (off-site, disallowed, empty, noindex, failed download) are not fetched again for
        // every other page that links to them.
        // NOTE(review): assumes PageHasBeenCrawled consults _previouslyCrawledPages - confirm.
        _previouslyCrawledPages.Add(url);

        // Only pages that live under the base URL are crawled.
        if (!url.StartsWith(_baseUrl, System.StringComparison.Ordinal))
        {
            return;
        }

        // NOTE(review): the helper is handed the part of the URL after the base URL rather
        // than the absolute URL; this assumes IsPathAllowed matches robots.txt rules against
        // site-relative paths - confirm against the helper's implementation.
        string relativePath = url.Substring(_baseUrl.Length);
        if (!_robotHelper.IsPathAllowed(_userAgent, relativePath))
        {
            return;
        }

        string rawPage = GetWebText(url);
        if (string.IsNullOrWhiteSpace(rawPage))
        {
            return;
        }

        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(rawPage);

        // Ensure the page should be indexed by looking at the robots meta tag. Directive
        // names are matched case-insensitively (e.g. "NOINDEX").
        HtmlNode metaRobot = htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='robots']");
        string robotsDirectives = string.Empty;
        if (metaRobot != null && metaRobot.Attributes["content"] != null)
        {
            robotsDirectives = metaRobot.Attributes["content"].Value ?? string.Empty;
        }

        if (robotsDirectives.IndexOf("noindex", System.StringComparison.OrdinalIgnoreCase) >= 0)
        {
            return;
        }

        // Index the page.
        SitePageIndex sitePage = new SitePageIndex();
        sitePage.Content = GetPageText(htmlDoc);
        sitePage.Url = url;
        sitePage.Id = url.MakeInt64HashCode();
        sitePage.SourceIndexModel = "Rock.Model.Site";
        sitePage.PageTitle = GetPageTitle(htmlDoc, url);
        sitePage.DocumentName = sitePage.PageTitle;
        sitePage.SiteName = _site.Name;
        sitePage.SiteId = _site.Id;
        sitePage.LastIndexedDateTime = RockDateTime.Now;

        HtmlNode metaDescription = htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='description']");
        if (metaDescription != null && metaDescription.Attributes["content"] != null)
        {
            sitePage.PageSummary = metaDescription.Attributes["content"].Value;
        }

        HtmlNode metaKeywords = htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='keywords']");
        if (metaKeywords != null && metaKeywords.Attributes["content"] != null)
        {
            sitePage.PageKeywords = metaKeywords.Attributes["content"].Value;
        }

        IndexContainer.IndexDocument(sitePage);

        // Crawl all the links found on the page. Each link is processed by a nested call,
        // so recursion depth grows with the length of the link chain being followed.
        foreach (string link in ParseLinks(htmlDoc))
        {
            CrawlPage(link);
        }
    }
    catch (System.Exception ex)
    {
        // Deliberately best-effort: skip this page, but leave a trace for diagnosis.
        System.Diagnostics.Debug.WriteLine(string.Format("CrawlPage failed for '{0}': {1}", url, ex.Message));
    }
}
/// <summary>
/// Crawls a page: downloads it, indexes it unless its robots meta tag contains
/// <c>noindex</c> or identical content was already indexed, and queues the links found on it
/// unless the robots meta tag contains <c>nofollow</c>.
/// </summary>
/// <remarks>
/// Crawling is best-effort. Any exception raised while processing a page is written to the
/// debug output and otherwise ignored so that one bad page cannot stop the crawl.
/// </remarks>
/// <param name="url">The URL to crawl.</param>
private void CrawlPage(string url)
{
    try
    {
        // Clean up the URL a bit.
        url = StandardizeUrl(url);

        if (PageHasBeenCrawled(url))
        {
            return;
        }

        // Record the visit up front so the URL is not processed again, whatever the outcome.
        _previouslyCrawledPages.Add(url);

        // Only pages that live under the base URL are crawled.
        if (!url.StartsWith(_baseUrl, System.StringComparison.Ordinal))
        {
            return;
        }

        // Strip only the leading base URL. Replace() would also remove later occurrences,
        // e.g. a copy of the base URL embedded in a query string.
        string relativePath = url.Substring(_baseUrl.Length);
        if (!_robotHelper.IsPathAllowed(_userAgent, relativePath))
        {
            return;
        }

        string rawPage = GetWebText(url);
        if (string.IsNullOrWhiteSpace(rawPage))
        {
            return;
        }

        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(rawPage);

        // Read the robots meta tag once. Directive names are matched case-insensitively
        // (e.g. "NOINDEX, NOFOLLOW").
        HtmlNode metaRobot = htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='robots']");
        string robotsDirectives = string.Empty;
        if (metaRobot != null && metaRobot.Attributes["content"] != null)
        {
            robotsDirectives = metaRobot.Attributes["content"].Value ?? string.Empty;
        }

        bool noIndex = robotsDirectives.IndexOf("noindex", System.StringComparison.OrdinalIgnoreCase) >= 0;
        bool noFollow = robotsDirectives.IndexOf("nofollow", System.StringComparison.OrdinalIgnoreCase) >= 0;

        if (!noIndex)
        {
            // Index the page.
            SitePageIndex sitePage = new SitePageIndex();
            sitePage.Content = GetPageText(htmlDoc);
            sitePage.Url = url;
            sitePage.Id = url.MakeInt64HashCode();
            sitePage.SourceIndexModel = "Rock.Model.Site";
            sitePage.PageTitle = GetPageTitle(htmlDoc, url);
            sitePage.DocumentName = sitePage.PageTitle;
            sitePage.SiteName = _site.Name;
            sitePage.SiteId = _site.Id;
            sitePage.LastIndexedDateTime = RockDateTime.Now;

            HtmlNode metaDescription = htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='description']");
            if (metaDescription != null && metaDescription.Attributes["content"] != null)
            {
                sitePage.PageSummary = metaDescription.Attributes["content"].Value;
            }

            HtmlNode metaKeywords = htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='keywords']");
            if (metaKeywords != null && metaKeywords.Attributes["content"] != null)
            {
                sitePage.PageKeywords = metaKeywords.Attributes["content"].Value;
            }

            // Hash the page content and check it against the hashes of pages indexed so far;
            // only content that has not been seen before is indexed and remembered.
            long contentHash = sitePage.Content.MakeInt64HashCode();
            if (!_pageHashes.Contains(contentHash))
            {
                IndexContainer.IndexDocument(sitePage);
                _pageHashes.Add(contentHash);
            }
        }

        if (!noFollow)
        {
            // Queue all the links found on the page; they are not crawled from here.
            foreach (string link in ParseLinks(htmlDoc))
            {
                _urlQueue.Enqueue(link);
            }
        }
    }
    catch (System.Exception ex)
    {
        // Deliberately best-effort: skip this page, but leave a trace for diagnosis.
        System.Diagnostics.Debug.WriteLine(string.Format("CrawlPage failed for '{0}': {1}", url, ex.Message));
    }
}