Example 1
        /// <summary>
        /// Recursive 'process' method: takes the input Uri, downloads it (following redirects
        /// if required) to receive a Document subclass, then calls Parse() to extract the words,
        /// which are then added to the Catalog.
        /// </summary>
        protected int ProcessUri(Uri uri, int level)
        {
            // [j105 Rob] recursion fix
            // http://www.codeproject.com/aspnet/Spideroo.asp?df=100&forumid=71481&select=1862807#xx1862807xx
            if (level > Preferences.RecursionLimit)
            {
                return(Preferences.RecursionLimit);
            }

            int    wordcount = 0;
            string url       = uri.AbsoluteUri;

            if (!_Robot.Allowed(uri))
            {
                ProgressEvent(this, new ProgressEventArgs(2, "RobotsTxt exclusion prevented indexing of " + url + ""));
            }
            else
            {
                if (_Visited.Contains(url))
                {
                    ProgressEvent(this, new ProgressEventArgs(2, url + " already spidered"));
                }
                else
                {
                    _Visited.Add(url);
                    ProgressEvent(this, new ProgressEventArgs(2, url + " being downloaded"));
                    Document downloadDocument = Download(uri);
                    if (null == downloadDocument)
                    {
                        ProgressEvent(this, new ProgressEventArgs(1, "Download() failed on " + url + ""));
                    }
                    else
                    {
                        downloadDocument.Parse();
                        if (downloadDocument.RobotIndexOK)
                        {
                            wordcount = AddToCatalog(downloadDocument);
                        }
                        else
                        {
                            ProgressEvent(this, new ProgressEventArgs(2, "RobotMeta exclusion prevented indexing of " + url + ""));
                        }
                    }

                    if (wordcount > 0)
                    {
                        ProgressEvent(this, new ProgressEventArgs(1, downloadDocument.Title + " parsed " + wordcount + " words!"));
                        ProgressEvent(this, new ProgressEventArgs(4, downloadDocument.Title + " " + downloadDocument.Uri.AbsoluteUri + System.Environment.NewLine
                                                                  + (downloadDocument.RobotIndexOK ? "Indexed" : "RobotMeta Excluded Index")
                                                                  + downloadDocument.Description));
                    }
                    else
                    {
                        ProgressEvent(this, new ProgressEventArgs(2, url + " parsed but zero words found."));
                    }
                    // ### Loop through the 'local' links in the document ###
                    // ### and parse each of them recursively ###
                    if (null != downloadDocument && null != downloadDocument.LocalLinks && downloadDocument.RobotFollowOK)
                    { // only if the Robot meta says it's OK
                        foreach (object link in downloadDocument.LocalLinks)
                        {
                            try
                            {
                                Uri urlToFollow = new Uri(downloadDocument.Uri, link.ToString());
                                ProcessUri(urlToFollow, level + 1); // calls THIS method!
                            }
                            catch (Exception ex)
                            {
                                ProgressEvent(this, new ProgressEventArgs(2, "new Uri(" + downloadDocument.Uri + ", " + link.ToString() + ") invalid : " + ex.Message + ""));
                            }
                        }
                    } // process local links
                }     // not visited
            }         // robot allowed
            return(level);
        }
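For context, a minimal sketch of how this recursive method might be kicked off; the SpiderWebsite wrapper, its reset of _Visited, and its ProgressEvent message are illustrative assumptions, not part of the Searcharoo listing:

        /// <summary>
        /// Hypothetical entry point (an assumption, for illustration only): clears the
        /// visited list and starts the recursive crawl at level 0.
        /// </summary>
        public void SpiderWebsite(Uri startUri)
        {
            _Visited.Clear();                    // forget any previous crawl
            int level = ProcessUri(startUri, 0); // level 0 = the start page
            ProgressEvent(this, new ProgressEventArgs(1, "Crawl complete (returned level " + level + ")"));
        }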
Example 2
        /// <summary>
        /// Recursive 'process' method: takes the input Uri, downloads it (following redirects
        /// if required) to receive a Document subclass, then calls Parse() to extract the words,
        /// which are then added to the Catalog.
        /// </summary>
        protected int ProcessUri(Uri uri, int level)
        {
            // [j105 Rob] recursion fix
            // http://www.codeproject.com/aspnet/Spideroo.asp?df=100&forumid=71481&select=1862807#xx1862807xx
            if (level > Preferences.RecursionLimit)
            {
                return(Preferences.RecursionLimit);
            }

            int    wordcount = 0;
            string url       = uri.AbsoluteUri.ToLower(); // [v6] lowercased so _Visited comparisons are case-insensitive

            if (!_Robot.Allowed(uri))
            {
                ProgressEvent(this, new ProgressEventArgs(2, "RobotsTxt exclusion prevented indexing of " + url + ""));
            }
            else
            {
                bool alreadyVisited = _Visited.Contains(url);

                if (!alreadyVisited && Preferences.UseDefaultDocument)
                {   // [v7] First attempt at treating 'folder' Urls (eg mysite.com/Photos) and default documents (eg mysite.com/Photos/Default.aspx)
                    // as the SAME PAGE to prevent duplicates in the search results. To do this, when we find a Url that looks like a 'folder'
                    // (eg. no file extension OR ends with a / slash) we add all three 'variations' of that Url to the _Visited list so the other
                    // variations aren't even retrieved/indexed.
                    string defaultDoc                = Preferences.DefaultDocument.ToLower(); // lowercased to match 'url' above
                    int    defaultDocLength          = defaultDoc.Length;
                    int    defaultDocLengthPlusSlash = defaultDoc.Length + 1;                 // include the '/' before the document name

                    if (url.LastIndexOf("/") == (url.Length - 1))
                    {   // Variation #1: ends in slash /
                        alreadyVisited = _Visited.Contains(url + defaultDoc) || _Visited.Contains(url.Trim('/'));
                        _Visited.Add(url + defaultDoc);
                        _Visited.Add(url.Trim('/'));
                    }
                    else if (System.IO.Path.GetExtension(url) == "")
                    {   // Variation #2: no file extension
                        alreadyVisited = _Visited.Contains(url + "/" + defaultDoc) || _Visited.Contains(url + "/");
                        _Visited.Add(url + "/" + defaultDoc);
                        _Visited.Add(url + "/");
                    }
                    else if (url.LastIndexOf(defaultDoc) == (url.Length - defaultDocLength))
                    {   // Variation #3: ends in /default.aspx (or whatever the specified default document is: index.html, default.htm, etc)
                        alreadyVisited = _Visited.Contains(url.Substring(0, (url.Length - defaultDocLengthPlusSlash))) ||
                                         _Visited.Contains(url.Substring(0, (url.Length - defaultDocLength)));
                        _Visited.Add(url.Substring(0, (url.Length - defaultDocLengthPlusSlash)));
                        _Visited.Add(url.Substring(0, (url.Length - defaultDocLength)));
                    }
                }
                if (alreadyVisited)
                {
                    ProgressEvent(this, new ProgressEventArgs(2, url + " already spidered"));
                }
                else
                {
                    _Visited.Add(url);
                    ProgressEvent(this, new ProgressEventArgs(2, url + " being downloaded"));
                    // ### IMPORTANT ###
                    // Uri is actually retrieved here!
                    Document downloadDocument = Download(uri);

                    if (null == downloadDocument)
                    {
                        ProgressEvent(this, new ProgressEventArgs(1, "Download() failed on " + url + ""));
                    }
                    else
                    {
                        // ### IMPORTANT ###
                        // Uri downloaded content is actually parsed here!
                        downloadDocument.Parse();
                        if (downloadDocument.RobotIndexOK)
                        {
                            wordcount = AddToCatalog(downloadDocument);
                        }
                        else
                        {
                            ProgressEvent(this, new ProgressEventArgs(2, "RobotMeta exclusion prevented indexing of " + url + ""));
                        }
                    }

                    if (wordcount > 0)
                    {
                        ProgressEvent(this, new ProgressEventArgs(1, downloadDocument.Title + " parsed " + wordcount + " words!"));
                        ProgressEvent(this, new ProgressEventArgs(4, downloadDocument.Title + " " + downloadDocument.Uri.AbsoluteUri + System.Environment.NewLine
                                                                  + (downloadDocument.RobotIndexOK ? "Indexed" : "RobotMeta Excluded Index")
                                                                  + downloadDocument.Description));
                    }
                    else
                    {
                        ProgressEvent(this, new ProgressEventArgs(2, url + " parsed but zero words found."));
                    }
                    // [v7] bugfix
                    if (null == downloadDocument)
                    {
                        // why is it null here?
                        System.Diagnostics.Debug.WriteLine(url + " resulted in a null downloadDocument");
                    }
                    else
                    {
                        // Move some 'External' to 'Local' links
                        ArrayList elinks = (ArrayList)downloadDocument.ExternalLinks.Clone();
                        for (int l = 0; l < elinks.Count; l++)
                        {
                            string link    = elinks[l].ToString();
                            Uri    linkUri = new Uri(link);
                            //if (link.ToLower().StartsWith(this._CurrentStartUriString))
                            if (_CurrentStartUri.IsBaseOf(linkUri))
                            {   // if this link is actually 'under' the starting one, treat it as internal (even
                                // though it started with http:
                                downloadDocument.ExternalLinks.Remove(link);
                                downloadDocument.LocalLinks.Add(link);
                            }
                        }

                        // ### Loop through the 'local' links in the document ###
                        // ### and parse each of them recursively ###
                        if (null != downloadDocument && null != downloadDocument.LocalLinks && downloadDocument.RobotFollowOK)
                        { // only if the Robot meta says it's OK
                            foreach (object link in downloadDocument.LocalLinks)
                            {
                                try
                                {
                                    Uri urlToFollow = new Uri(downloadDocument.Uri, link.ToString());
                                    ProcessUri(urlToFollow, level + 1); // calls THIS method, recursively
                                }
                                catch (Exception ex)
                                {
                                    ProgressEvent(this, new ProgressEventArgs(2, "new Uri(" + downloadDocument.Uri + ", " + link.ToString() + ") invalid : " + ex.Message + ""));
                                }
                            }
                        } // process local links
                    }     // document was not null
                }         // not visited
            }             // robot allowed
            return(level);
        }
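To make the [v7] default-document logic easier to verify in isolation, here is a self-contained sketch (not part of the Searcharoo source; the helper name, the hard-coded default document, and the console harness are all assumptions) that computes the set of equivalent 'variations' of a folder Url:

using System;
using System.Collections.Generic;

class DefaultDocumentDemo
{
    const string DefaultDoc = "default.aspx"; // assumed value of Preferences.DefaultDocument

    // Returns every variation of 'url' that should be treated as the same page.
    static IEnumerable<string> EquivalentUrls(string url)
    {
        url = url.ToLower();
        yield return url;
        if (url.EndsWith("/"))
        {   // Variation #1: ends in slash
            yield return url + DefaultDoc;
            yield return url.TrimEnd('/');
        }
        else if (System.IO.Path.GetExtension(url) == "")
        {   // Variation #2: no file extension
            yield return url + "/";
            yield return url + "/" + DefaultDoc;
        }
        else if (url.EndsWith("/" + DefaultDoc))
        {   // Variation #3: ends in /default.aspx (or whatever the default document is)
            yield return url.Substring(0, url.Length - DefaultDoc.Length);     // keep the trailing slash
            yield return url.Substring(0, url.Length - DefaultDoc.Length - 1); // strip the trailing slash too
        }
    }

    static void Main()
    {
        // All three inputs should produce the same set of three equivalent Urls.
        foreach (string input in new[] {
            "http://mysite.com/photos/",
            "http://mysite.com/photos",
            "http://mysite.com/photos/Default.aspx" })
        {
            Console.WriteLine(input + " -> " + string.Join(", ", EquivalentUrls(input)));
        }
    }
}

Each of the three inputs yields the same set of equivalent Urls, which is why the spider can mark every variation as visited after retrieving any one of them.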
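The External-to-Local reclassification in the second example hinges on Uri.IsBaseOf; a quick sketch of its behavior (the sample Urls are assumptions):

            // Uri.IsBaseOf returns true when the candidate Url sits 'under' the base Uri,
            // even though the link text itself started with http:
            Uri start = new Uri("http://mysite.com/blog/");
            Console.WriteLine(start.IsBaseOf(new Uri("http://mysite.com/blog/post1.aspx"))); // True  -> moved to LocalLinks
            Console.WriteLine(start.IsBaseOf(new Uri("http://othersite.com/page.aspx")));    // False -> stays in ExternalLinks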