/// <summary>
/// Recursive 'process' method: takes the Uri input and downloads it (following redirects
/// if required) to get a Document subclass, then calls its Parse() method to extract the
/// words, which are then added to the Catalog.
/// </summary>
protected int ProcessUri(Uri uri, int level)
{
    // [j105 Rob] recursion fix
    // http://www.codeproject.com/aspnet/Spideroo.asp?df=100&forumid=71481&select=1862807#xx1862807xx
    if (level > Preferences.RecursionLimit)
    {
        return Preferences.RecursionLimit;
    }

    int wordcount = 0;
    string url = uri.AbsoluteUri;
    if (!_Robot.Allowed(uri))
    {
        ProgressEvent(this, new ProgressEventArgs(2, "RobotsTxt exclusion prevented indexing of " + url));
    }
    else
    {
        if (_Visited.Contains(url))
        {
            ProgressEvent(this, new ProgressEventArgs(2, url + " already spidered"));
        }
        else
        {
            _Visited.Add(url);
            ProgressEvent(this, new ProgressEventArgs(2, url + " being downloaded"));
            Document downloadDocument = Download(uri);
            if (null == downloadDocument)
            {
                ProgressEvent(this, new ProgressEventArgs(1, "Download() failed on " + url));
            }
            else
            {
                downloadDocument.Parse();
                if (downloadDocument.RobotIndexOK)
                {
                    wordcount = AddToCatalog(downloadDocument);
                }
                else
                {
                    ProgressEvent(this, new ProgressEventArgs(2, "RobotMeta exclusion prevented indexing of " + url));
                }
            }
            if (wordcount > 0)
            {
                ProgressEvent(this, new ProgressEventArgs(1, downloadDocument.Title + " parsed " + wordcount + " words!"));
                ProgressEvent(this, new ProgressEventArgs(4, downloadDocument.Title + " " + downloadDocument.Uri.AbsoluteUri
                    + System.Environment.NewLine
                    + (downloadDocument.RobotIndexOK ? "Indexed" : "RobotMeta Excluded Index")
                    + downloadDocument.Description));
            }
            else
            {
                ProgressEvent(this, new ProgressEventArgs(2, url + " parsed but zero words found."));
            }

            // ### Loop through the 'local' links in the document ###
            // ### and parse each of them recursively             ###
            if (null != downloadDocument
                && null != downloadDocument.LocalLinks
                && downloadDocument.RobotFollowOK)
            {   // only if the Robot meta says it's OK
                foreach (object link in downloadDocument.LocalLinks)
                {
                    try
                    {
                        Uri urlToFollow = new Uri(downloadDocument.Uri, link.ToString());
                        ProcessUri(urlToFollow, level + 1); // calls THIS method!
                    }
                    catch (Exception ex)
                    {
                        ProgressEvent(this, new ProgressEventArgs(2, "new Uri(" + downloadDocument.Uri + ", " + link.ToString() + ") invalid : " + ex.Message));
                    }
                }
            } // process local links
        } // not visited
    } // robot allowed
    return level;
}
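For orientation: the recursion has to be started somewhere with level zero. The entry point isn't part of the listing above, but a minimal sketch might look like the following. The SpiderUrl name and the ArrayList initialization are assumptions for illustration, not code from the article (requires using System.Collections; for ArrayList):

public void SpiderUrl(Uri startUri)
{
    // Hypothetical entry point (names assumed): reset the visited-Url list
    // that ProcessUri checks, then start the recursive crawl at level 0.
    _Visited = new ArrayList();
    ProcessUri(startUri, 0);
}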
/// <summary>
/// Recursive 'process' method: takes the Uri input and downloads it (following redirects
/// if required) to get a Document subclass, then calls its Parse() method to extract the
/// words, which are then added to the Catalog.
/// </summary>
protected int ProcessUri(Uri uri, int level)
{
    // [j105 Rob] recursion fix
    // http://www.codeproject.com/aspnet/Spideroo.asp?df=100&forumid=71481&select=1862807#xx1862807xx
    if (level > Preferences.RecursionLimit)
    {
        return Preferences.RecursionLimit;
    }

    int wordcount = 0;
    string url = uri.AbsoluteUri.ToLower(); // [v6]
    if (!_Robot.Allowed(uri))
    {
        ProgressEvent(this, new ProgressEventArgs(2, "RobotsTxt exclusion prevented indexing of " + url));
    }
    else
    {
        bool alreadyVisited = _Visited.Contains(url);
        if (!alreadyVisited && Preferences.UseDefaultDocument)
        {
            // [v7] First attempt at treating 'folder' Urls (eg mysite.com/Photos) and default
            // documents (eg mysite.com/Photos/Default.aspx) as the SAME PAGE, to prevent
            // duplicates in the search results. When we find a Url that looks like a 'folder'
            // (ie. it has no file extension OR it ends with a / slash), we add all three
            // 'variations' of that Url to the _Visited list so the other variations aren't
            // even retrieved/indexed.
            string defaultDoc = Preferences.DefaultDocument;
            int defaultDocLength = defaultDoc.Length;
            int defaultDocLengthPlusSlash = defaultDoc.Length + 1; // +1 for the '/' preceding the default document name
            if (url.LastIndexOf("/") == (url.Length - 1))
            {
                // Variation #1: ends in a slash (eg mysite.com/Photos/)
                alreadyVisited = _Visited.Contains(url + defaultDoc)
                              || _Visited.Contains(url.Trim('/'));
                _Visited.Add(url + defaultDoc);
                _Visited.Add(url.Trim('/'));
            }
            else if (System.IO.Path.GetExtension(url) == "")
            {
                // Variation #2: no file extension (eg mysite.com/Photos)
                alreadyVisited = _Visited.Contains(url + "/" + defaultDoc)
                              || _Visited.Contains(url + "/");
                _Visited.Add(url + "/" + defaultDoc);
                _Visited.Add(url + "/");
            }
            else if (url.LastIndexOf(defaultDoc) == (url.Length - defaultDocLength))
            {
                // Variation #3: ends in /default.aspx (or whatever the specified default
                // document is: index.html, default.htm, etc)
                alreadyVisited = _Visited.Contains(url.Substring(0, url.Length - defaultDocLengthPlusSlash))
                              || _Visited.Contains(url.Substring(0, url.Length - defaultDocLength));
                _Visited.Add(url.Substring(0, url.Length - defaultDocLengthPlusSlash));
                _Visited.Add(url.Substring(0, url.Length - defaultDocLength));
            }
        }
        if (alreadyVisited)
        {
            ProgressEvent(this, new ProgressEventArgs(2, url + " already spidered"));
        }
        else
        {
            _Visited.Add(url);
            ProgressEvent(this, new ProgressEventArgs(2, url + " being downloaded"));

            // ### IMPORTANT ###
            // The Uri is actually retrieved here!
            Document downloadDocument = Download(uri);
            if (null == downloadDocument)
            {
                ProgressEvent(this, new ProgressEventArgs(1, "Download() failed on " + url));
            }
            else
            {
                // ### IMPORTANT ###
                // The downloaded content is actually parsed here!
                downloadDocument.Parse();
                if (downloadDocument.RobotIndexOK)
                {
                    wordcount = AddToCatalog(downloadDocument);
                }
                else
                {
                    ProgressEvent(this, new ProgressEventArgs(2, "RobotMeta exclusion prevented indexing of " + url));
                }
            }
            if (wordcount > 0)
            {
                ProgressEvent(this, new ProgressEventArgs(1, downloadDocument.Title + " parsed " + wordcount + " words!"));
                ProgressEvent(this, new ProgressEventArgs(4, downloadDocument.Title + " " + downloadDocument.Uri.AbsoluteUri
                    + System.Environment.NewLine
                    + (downloadDocument.RobotIndexOK ? "Indexed" : "RobotMeta Excluded Index")
                    + downloadDocument.Description));
            }
            else
            {
                ProgressEvent(this, new ProgressEventArgs(2, url + " parsed but zero words found."));
            }

            // [v7] bugfix
            if (null == downloadDocument)
            {   // why is it null here?
                System.Diagnostics.Debug.WriteLine(url + " resulted in a null downloadDocument");
            }
            else
            {
                // Move some 'External' links to 'Local'
                ArrayList elinks = (ArrayList)downloadDocument.ExternalLinks.Clone();
                for (int l = 0; l < elinks.Count; l++)
                {
                    string link = elinks[l].ToString();
                    Uri linkUri = new Uri(link);
                    //if (link.ToLower().StartsWith(this._CurrentStartUriString))
                    if (_CurrentStartUri.IsBaseOf(linkUri))
                    {
                        // this link is actually 'under' the starting one, so treat it as
                        // internal (even though it starts with http:)
                        downloadDocument.ExternalLinks.Remove(link);
                        downloadDocument.LocalLinks.Add(link);
                    }
                }

                // ### Loop through the 'local' links in the document ###
                // ### and parse each of them recursively             ###
                if (null != downloadDocument.LocalLinks && downloadDocument.RobotFollowOK)
                {   // only if the Robot meta says it's OK
                    foreach (object link in downloadDocument.LocalLinks)
                    {
                        try
                        {
                            Uri urlToFollow = new Uri(downloadDocument.Uri, link.ToString());
                            ProcessUri(urlToFollow, level + 1); // calls THIS method, recursively
                        }
                        catch (Exception ex)
                        {
                            ProgressEvent(this, new ProgressEventArgs(2, "new Uri(" + downloadDocument.Uri + ", " + link.ToString() + ") invalid : " + ex.Message));
                        }
                    }
                } // process local links
            } // document was not null
        } // not visited
    } // robot allowed
    return level;
}