Example 1
        /// <summary>
        /// Crawls a page, indexes its content when allowed, and recursively crawls the links found on it.
        /// </summary>
        /// <param name="url">The url to crawl.</param>
        private void CrawlPage( string url )
        {
            if ( !PageHasBeenCrawled( url ) && _robot.IsPathAllowed( _userAgent, url ) )
            {
                string rawPage = GetWebText( url );

                if ( !string.IsNullOrWhiteSpace( rawPage ) )
                {
                    var htmlDoc = new HtmlDocument();
                    htmlDoc.LoadHtml( rawPage );

                    // build the crawled page record
                    CrawledPage page = new CrawledPage();

                    if ( htmlDoc.DocumentNode.SelectSingleNode( "//body" ) != null )
                    {
                        page.Text = GetPageText( htmlDoc );
                    }
                    else
                    {
                        page.Text = rawPage;
                    }

                    if ( htmlDoc.DocumentNode.SelectSingleNode( "//head/title" ) != null)
                    {
                        page.Title = htmlDoc.DocumentNode.SelectSingleNode( "//head/title" ).InnerText.Trim();
                    }
                    else
                    {
                        page.Title = url;
                    }

                    page.Url = url;

                    // set whether the page should be indexed (honor a "noindex" value in the robots meta tag; the standard meta name is "robots")
                    HtmlNode metaRobot = htmlDoc.DocumentNode.SelectSingleNode( "//meta[@name='robots']" );
                    if ( metaRobot != null && metaRobot.Attributes["content"] != null && metaRobot.Attributes["content"].Value.Contains( "noindex" ) )
                    {
                        page.AllowsIndex = false;
                    }

                    _pages.Add( page );

                    // index the page if the robots meta tag allows it
                    if ( page.AllowsIndex )
                    {
                        SitePageIndex sitePage = new SitePageIndex();
                        sitePage.Id = page.Url.MakeInt64HashCode();
                        sitePage.Content = page.Text.SanitizeHtml();

                        // store only the page title (strip the site name off per the "Page Title | Site Name" convention)
                        if ( page.Title.Contains( "|" ) )
                        {
                            sitePage.PageTitle = page.Title.Substring( 0, page.Title.IndexOf( '|' ) ).Trim();
                        }
                        else
                        {
                            sitePage.PageTitle = page.Title.Trim();
                        }

                        sitePage.DocumentName = sitePage.PageTitle;
                        sitePage.SiteName = _site.Name;
                        sitePage.SiteId = _site.Id;
                        sitePage.Url = page.Url;
                        sitePage.LastIndexedDateTime = RockDateTime.Now;

                        HtmlNode metaDescription = htmlDoc.DocumentNode.SelectSingleNode( "//meta[@name='description']" );
                        if ( metaDescription != null && metaDescription.Attributes["content"] != null )
                        {
                            sitePage.PageSummary = metaDescription.Attributes["content"].Value;
                        }

                        HtmlNode metaKeywords = htmlDoc.DocumentNode.SelectSingleNode( "//meta[@name='keywords']" );
                        if ( metaKeywords != null && metaKeywords.Attributes["content"] != null )
                        {
                            sitePage.PageKeywords = metaKeywords.Attributes["content"].Value;
                        }

                        IndexContainer.IndexDocument( sitePage );
                    }

                    LinkParser linkParser = new LinkParser();
                    linkParser.ParseLinks( htmlDoc, url, _startUrl );

                    // Add parsed links to the main data lists
                    AddRangeButNoDuplicates( _externalUrls, linkParser.ExternalUrls );
                    AddRangeButNoDuplicates( _otherUrls, linkParser.OtherUrls );
                    AddRangeButNoDuplicates( _failedUrls, linkParser.BadUrls );

                    foreach ( string exception in linkParser.Exceptions )
                    {
                        _exceptions.Add( exception );
                    }

                    // Crawl all the links found on the page.
                    foreach ( string link in linkParser.GoodUrls )
                    {
                        CrawlPage( link );
                    }
                }
            }
        }
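
CrawlPage leans on several helpers that are not shown here (PageHasBeenCrawled, GetWebText, GetPageText, AddRangeButNoDuplicates). As a rough sketch of what the de-duplicating list helper might look like, assuming the backing collections are plain List<string> instances and that case-insensitive URL comparison is acceptable (both are assumptions, not details confirmed by the code above):

        // Minimal sketch of the AddRangeButNoDuplicates helper referenced above.
        // Requires: using System; using System.Collections.Generic; using System.Linq;
        private static void AddRangeButNoDuplicates( List<string> target, IEnumerable<string> source )
        {
            foreach ( string url in source )
            {
                // assumption: URLs are compared case-insensitively; adjust if the
                // crawler treats paths as case-sensitive
                if ( !target.Contains( url, StringComparer.OrdinalIgnoreCase ) )
                {
                    target.Add( url );
                }
            }
        }

Note that CrawlPage recurses on every link in linkParser.GoodUrls, so the PageHasBeenCrawled check at the top is what keeps the crawl from looping forever on pages that link back to each other.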
Example 2
        /// <summary>
        /// This method will be called each time a page is found by the crawler.
        /// </summary>
        /// <param name="page">The page.</param>
        /// <returns><c>true</c> if the page was indexed; otherwise, <c>false</c>.</returns>
        public bool PageCallback( CrawledPage page )
        {
            if ( page.AllowsIndex )
            {
                // clean up the page title a bit by stripping the site name off it;
                // guard against titles without a '|' so IndexOf doesn't return -1
                var pageTitle = page.Title.Contains( "|" )
                    ? page.Title.Substring( 0, page.Title.IndexOf( '|' ) ).Trim()
                    : page.Title.Trim();

                SitePageIndex sitePage = new SitePageIndex();
                sitePage.Id = page.Url.MakeInt64HashCode();
                sitePage.Content = page.Text;
                sitePage.PageTitle = pageTitle;
                sitePage.SiteName = _site.Name;
                sitePage.SiteId = _site.Id;
                sitePage.Url = page.Url;
                sitePage.LastIndexedDateTime = RockDateTime.Now;

                IndexContainer.IndexDocument( sitePage );

                _indexedPageCount++;
                return true;
            }

            return false;
        }
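
For context, this callback is intended to be handed to whatever crawler walks the site, which is expected to invoke it once per fetched page. A minimal sketch of the wiring, where SiteCrawler and its CrawlSite method are hypothetical stand-ins for the real crawler's API (which may differ):

        // Hypothetical wiring; SiteCrawler and CrawlSite are illustrative
        // assumptions, not the actual crawler API.
        public int IndexSite( string startUrl )
        {
            _indexedPageCount = 0;

            var crawler = new SiteCrawler();

            // the crawler is assumed to call PageCallback once per crawled page;
            // the bool return value lets it distinguish indexed pages from
            // pages skipped because of a "noindex" robots meta tag
            crawler.CrawlSite( startUrl, PageCallback );

            return _indexedPageCount;
        }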