/// <summary>
/// Crawls a page.
/// </summary>
/// <param name="url">The url to crawl.</param>
private void CrawlPage( string url )
{
    if ( !PageHasBeenCrawled( url ) && _robot.IsPathAllowed( _userAgent, url ) )
    {
        string rawPage = GetWebText( url );

        if ( !string.IsNullOrWhiteSpace( rawPage ) )
        {
            var htmlDoc = new HtmlDocument();
            htmlDoc.LoadHtml( rawPage );

            // get the page text and title
            CrawledPage page = new CrawledPage();

            if ( htmlDoc.DocumentNode.SelectSingleNode( "//body" ) != null )
            {
                page.Text = GetPageText( htmlDoc );
            }
            else
            {
                page.Text = rawPage;
            }

            if ( htmlDoc.DocumentNode.SelectSingleNode( "//head/title" ) != null )
            {
                page.Title = htmlDoc.DocumentNode.SelectSingleNode( "//head/title" ).InnerText.Trim();
            }
            else
            {
                page.Title = url;
            }

            page.Url = url;

            // set whether the page should be indexed (the standard meta tag name is 'robots')
            HtmlNode metaRobot = htmlDoc.DocumentNode.SelectSingleNode( "//meta[@name='robots']" );
            if ( metaRobot != null && metaRobot.Attributes["content"] != null && metaRobot.Attributes["content"].Value.Contains( "noindex" ) )
            {
                page.AllowsIndex = false;
            }

            _pages.Add( page );

            // index the page
            if ( page.AllowsIndex )
            {
                SitePageIndex sitePage = new SitePageIndex();

                sitePage.Id = page.Url.MakeInt64HashCode();
                sitePage.Content = page.Text.SanitizeHtml();

                // store only the page title (strip the site name off per Rock convention)
                if ( page.Title.Contains( "|" ) )
                {
                    sitePage.PageTitle = page.Title.Substring( 0, page.Title.IndexOf( '|' ) ).Trim();
                }
                else
                {
                    sitePage.PageTitle = page.Title.Trim();
                }

                sitePage.DocumentName = sitePage.PageTitle;
                sitePage.SiteName = _site.Name;
                sitePage.SiteId = _site.Id;
                sitePage.Url = page.Url;
                sitePage.LastIndexedDateTime = RockDateTime.Now;

                HtmlNode metaDescription = htmlDoc.DocumentNode.SelectSingleNode( "//meta[@name='description']" );
                if ( metaDescription != null && metaDescription.Attributes["content"] != null )
                {
                    sitePage.PageSummary = metaDescription.Attributes["content"].Value;
                }

                HtmlNode metaKeywords = htmlDoc.DocumentNode.SelectSingleNode( "//meta[@name='keywords']" );
                if ( metaKeywords != null && metaKeywords.Attributes["content"] != null )
                {
                    sitePage.PageKeywords = metaKeywords.Attributes["content"].Value;
                }

                IndexContainer.IndexDocument( sitePage );
            }

            LinkParser linkParser = new LinkParser();
            linkParser.ParseLinks( htmlDoc, url, _startUrl );

            // add data to the main data lists
            AddRangeButNoDuplicates( _externalUrls, linkParser.ExternalUrls );
            AddRangeButNoDuplicates( _otherUrls, linkParser.OtherUrls );
            AddRangeButNoDuplicates( _failedUrls, linkParser.BadUrls );

            foreach ( string exception in linkParser.Exceptions )
            {
                _exceptions.Add( exception );
            }

            // crawl all the links found on the page
            foreach ( string link in linkParser.GoodUrls )
            {
                CrawlPage( link );
            }
        }
    }
}
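// CrawlPage relies on a couple of helpers that are not shown in this excerpt. The sketches below
// are assumptions about their behavior for illustration only, not the actual implementations:
// _pages is assumed to be a List<CrawledPage>, and the URL lists are assumed to be List<string>.

/// <summary>
/// Returns true if the url has already been added to the crawled page list.
/// (Assumed implementation, for illustration.)
/// </summary>
private bool PageHasBeenCrawled( string url )
{
    foreach ( CrawledPage page in _pages )
    {
        if ( string.Equals( page.Url, url, StringComparison.OrdinalIgnoreCase ) )
        {
            return true;
        }
    }

    return false;
}

/// <summary>
/// Adds each url from the source list to the target list, skipping urls that are already present.
/// (Assumed implementation, for illustration.)
/// </summary>
private void AddRangeButNoDuplicates( List<string> target, List<string> source )
{
    foreach ( string url in source )
    {
        if ( !target.Contains( url ) )
        {
            target.Add( url );
        }
    }
}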
/// <summary>
/// This method will be called each time a page is found by the crawler.
/// </summary>
/// <param name="page">The page.</param>
/// <returns>True if the page was indexed; otherwise false.</returns>
public bool PageCallback( CrawledPage page )
{
    if ( page.AllowsIndex )
    {
        // clean up the page title a bit by removing the site name from it,
        // guarding against titles that don't contain a '|' so Substring doesn't throw
        var pageTitle = page.Title.Contains( "|" )
            ? page.Title.Substring( 0, page.Title.IndexOf( '|' ) ).Trim()
            : page.Title.Trim();

        SitePageIndex sitePage = new SitePageIndex();
        sitePage.Id = page.Url.MakeInt64HashCode();
        sitePage.Content = page.Text;
        sitePage.PageTitle = pageTitle;
        sitePage.SiteName = _site.Name;
        sitePage.SiteId = _site.Id;
        sitePage.Url = page.Url;
        sitePage.LastIndexedDateTime = RockDateTime.Now;

        IndexContainer.IndexDocument( sitePage );

        _indexedPageCount++;

        return true;
    }

    return false;
}
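// The title cleanup above duplicates logic in CrawlPage. A small shared helper like the sketch
// below could remove that duplication; the method name and its placement in this class are
// assumptions for illustration, not part of the existing code.

/// <summary>
/// Strips the site name (everything after the first '|') from a page title.
/// (Hypothetical helper, for illustration.)
/// </summary>
private static string GetCleanedPageTitle( string title )
{
    if ( string.IsNullOrWhiteSpace( title ) )
    {
        return string.Empty;
    }

    int separatorIndex = title.IndexOf( '|' );
    return separatorIndex >= 0 ? title.Substring( 0, separatorIndex ).Trim() : title.Trim();
}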