/// <summary>
/// [v6]
/// </summary>
/// <param name="startPageUris">array of start pages</param>
/// <returns>Catalog of words/documents</returns>
public Catalog BuildCatalog(Uri[] startPageUris)
{
    _Catalog = new Catalog();
    //_Cache = new Cache(); // [v7]
    ProgressEvent(this, new ProgressEventArgs(1,
        "Spider.Catalog (Uri Array) count: " + startPageUris.Length.ToString()));

    // Setup Stop, Go, Stemming
    SetPreferences();

    foreach (Uri startPageUri in startPageUris)
    {
        _CurrentStartUri = startPageUri; // to compare against fully qualified links
        _CurrentStartUriString = _CurrentStartUri.AbsoluteUri.ToLower();

        ProgressEvent(this, new ProgressEventArgs(1,
            "Spider.Catalog (start Uri) " + startPageUri.AbsoluteUri));

        _Robot = new RobotsTxt(startPageUri, Preferences.RobotUserAgent);

        // GETS THE FIRST DOCUMENT, AND STARTS THE SPIDER!
        // The RECURSIVE CALL to 'ProcessUri()' starts here
        // HtmlDocument htmldoc = new HtmlDocument(startPageUri);
        ProcessUri(startPageUri, 0);

        ProgressEvent(this, new ProgressEventArgs(1,
            "Spider.Catalog (end Uri) " + startPageUri.AbsoluteUri));
    }

    // Now we've FINISHED spidering
    ProgressEvent(this, new ProgressEventArgs(1, "Spider.Catalog() complete."));
    ProgressEvent(this, new ProgressEventArgs(2,
        "Serializing to disk location " + Preferences.CatalogFileName));

    // Serialize the Catalog, so it can be re-loaded if the server application restarts
    _Catalog.Save();
    //_Cache.Save(); // [v7]

    ProgressEvent(this, new ProgressEventArgs(3,
        "Save to disk " + Preferences.CatalogFileName + " successful"));

    return _Catalog; // finished - return so the calling code can 'use' the Catalog
}
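// --- Illustrative sketch (not part of the original source): starting a crawl.
// The subscription assumes ProgressEvent follows the (sender, ProgressEventArgs)
// pattern implied by the calls above, and that ProgressEventArgs exposes a
// Message property - both are assumptions about members not shown here.
static Catalog CrawlExample()
{
    Spider spider = new Spider();
    spider.ProgressEvent += delegate(object sender, ProgressEventArgs e)
    {
        Console.WriteLine(e.Message); // hypothetical property name
    };
    // build a single catalog from two hypothetical start pages
    return spider.BuildCatalog(new Uri[] {
        new Uri("http://localhost/content/default.aspx"),
        new Uri("http://localhost/docs/default.aspx")
    });
}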
/// <summary>
/// ALL processing happens here, since we are not using ASP.NET controls or events.
/// Page_Load will:
/// * check the Cache for a catalog to use
/// * if not, check the filesystem for a serialized catalog
/// * and if STILL not found, kick off a background load/build of the catalog
/// * check the QueryString for search arguments (and if so, do a search)
/// * otherwise just show the HTML of this page - a blank search form
/// </summary>
public void Page_Load()
{
    // prevent Searcharoo from indexing itself (i.e. its own results page);
    // >= 0 so a UserAgent that *starts with* "searcharoo" is also caught
    if ((Request.UserAgent != null)
        && (Request.UserAgent.ToLower().IndexOf("searcharoo") >= 0))
    {
        Response.Clear();
        Response.End();
        return;
    }

    bool getCatalog = false;
    try
    {
        // see if there is a catalog object in the cache
        _Catalog = (Catalog)Application["Searcharoo_Catalog"];
        // if so, get the _WordCount
        _WordCount = _Catalog.Length;
        _Cache = (Searcharoo.Common.Cache)Application["Searcharoo_Cache"];
    }
    catch (Exception ex)
    {
        // otherwise, we'll need to build the catalog
        log.Warn("Catalog object unavailable : building a new one!", ex);
        _Catalog = null; // in case
        _Cache = null;
    }

    ucSearchPanelHeader.WordCount = _WordCount;
    //ucSearchPanelFooter.WordCount = _WordCount;

    if (_Catalog == null)
    {
        getCatalog = true;
    }
    else if (_Catalog.Length == 0)
    {
        getCatalog = true;
    }

    if (getCatalog)
    {
        // load (or build) the catalog on a background thread, so this page
        // can still respond while the work is in progress
        SearchCatalogInit sci = new SearchCatalogInit();
        Thread t = new Thread(() => sci.GetCatalog(this));
        t.Start();

        if ((string)Application["CatalogLoad"] == "")
        {
            // no load in progress, so pick up whatever the loader
            // has already put in the Application cache
            if (_Catalog == null)
            {
                _Catalog = (Catalog)Application["Searcharoo_Catalog"];
                _Cache = (Searcharoo.Common.Cache)Application["Searcharoo_Cache"];
            }
        }
    }

    if (this.SearchQuery == "")
    {
        ucSearchPanelHeader.IsSearchResultsPage = false;
    }
    else
    {
        // refactored into a class - the catalog can be built via a console
        // application as well as the SearchSpider.aspx page
        Searcharoo.Engine.Search se = new Searcharoo.Engine.Search();
        SortedList output = this.GetSearchResults(se); // se.GetResults(this.SearchQuery, _Catalog);
        _NumberOfMatches = output.Count.ToString();

        if (output.Count > 0)
        {
            _PagedResults.DataSource = output.GetValueList();
            _PagedResults.AllowPaging = true;
            _PagedResults.PageSize = MaxResultsPerPage; // Preferences.ResultsPerPage
            _PagedResults.CurrentPageIndex = Request.QueryString["page"] == null
                ? 0
                : Convert.ToInt32(Request.QueryString["page"]) - 1;

            _Matches = se.SearchQueryMatchHtml;
            _DisplayTime = se.DisplayTime;
            _Geocoded = se.GeocodedMatches;

            SearchResults.DataSource = _PagedResults;
            SearchResults.DataBind();
        }
        else
        {
            lblNoSearchResults.Visible = true;
        }

        // Set the display info in the top & bottom user controls
        ucSearchPanelHeader.Word = this.SearchQuery;
        ucSearchPanelHeader.IsSearchResultsPage = true;
    }
}
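// --- Assumption (not shown in the original source): the SearchQuery property
// used above most likely just wraps the querystring parameter named in
// Preferences, along these lines.
public string SearchQuery
{
    get
    {
        string query = Request.QueryString[Preferences.QuerystringParameterName];
        return (query == null) ? String.Empty : query.Trim();
    }
}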
//public SortedList GetResults(string searchterm, Catalog catalog, bool geolocOnly)
//{
//    return GetResults(searchterm, catalog, false, new Cache());
//}

/// <summary>
/// v6
/// </summary>
/// <param name="searchterm">search query</param>
/// <param name="catalog">catalog to search</param>
/// <param name="geolocOnly">If true, ONLY return results with a lat/long</param>
/// <returns>ResultFile SortedList for display</returns>
public SortedList GetResults(string searchterm, Catalog catalog, bool geolocOnly)
{
    SortedList output = new SortedList();

    // ----------------------- DOING A SEARCH -----------------------
    if ((null != searchterm) && (null != catalog))
    {
        SetPreferences();

        Regex r = new Regex(@"\s+");                      // collapse all whitespace
        searchterm = r.Replace(searchterm, " ");          // to a single space
        string[] searchTermArray = searchterm.Split(' '); // then split into words
        string[] searchTermDisplay = (string[])searchTermArray.Clone();

        for (int i = 0; i < searchTermArray.Length; i++)
        {
            if (_GoChecker.IsGoWord(searchTermArray[i]))
            {
                // a Go word: just lower-case it
                searchTermArray[i] = searchTermArray[i].ToLower();
            }
            else
            {
                // not a Go word: trim punctuation, lower-case, then apply stemming
                searchTermArray[i] = searchTermArray[i]
                    .Trim(' ', '?', '\"', ',', '\'', ';', ':', '.', '(', ')').ToLower();
                searchTermArray[i] = _Stemmer.StemWord(searchTermArray[i]);
            }
        }

        if (searchterm == String.Empty)
        {
            // after trimming, the search term turned out to be empty!
            return output;
        }
        else
        {
            // we have a search term!
            DateTime start = DateTime.Now; // to show 'time taken' to perform the search

            // Array of arrays of results that match ONE of the search criteria
            Dictionary<File, List<int>>[] searchResultsArrayArray
                = new Dictionary<File, List<int>>[searchTermArray.Length];

            // finalResultsArray is populated with pages that match ALL the search criteria
            HybridDictionary finalResultsArray = new HybridDictionary();

            bool botherToFindMatches = true;
            int indexOfShortestResultSet = -1, lengthOfShortestResultSet = -1;

            for (int i = 0; i < searchTermArray.Length; i++)
            {
                // ##### THE SEARCH #####
                searchResultsArrayArray[i] = catalog.Search(searchTermArray[i]);

                if (null == searchResultsArrayArray[i])
                {
                    _Matches += searchTermDisplay[i]
                        + " <font color=gray style='font-size:xx-small'>(not found)</font> ";
                    // if *any one* of the terms isn't found, there can't be a 'set' of matches
                    botherToFindMatches = false;
                }
                else
                {
                    int resultsInThisSet = searchResultsArrayArray[i].Count;
                    _Matches += "<a href=\"?"
                        + Preferences.QuerystringParameterName + "=" + searchTermDisplay[i]
                        + "\" title=\"" + searchTermArray[i] + "\">"
                        + searchTermDisplay[i]
                        + "</a> <font color=gray style='font-size:xx-small'>("
                        + resultsInThisSet + ")</font> ";

                    // remember the *shortest* result set; intersecting the sets
                    // below only needs to iterate that one
                    if ((lengthOfShortestResultSet == -1)
                        || (lengthOfShortestResultSet > resultsInThisSet))
                    {
                        indexOfShortestResultSet = i;
                        lengthOfShortestResultSet = resultsInThisSet;
                    }
                }
            }

            // Find the files common to all the result sets (each set matches
            // just ONE of the criteria)
            if (botherToFindMatches) // all words have *some* matches
            {
                int c = indexOfShortestResultSet; // loop through the *shortest* result set
                Dictionary<File, List<int>> searchResultsArray = searchResultsArrayArray[c];

                foreach (File foundInFile in searchResultsArray.Keys)
                {
                    // find matching files in the other result sets
                    int matchcount = 0, totalcount = 0, weight = 0;
                    List<int> occurrences = new List<int>();

                    for (int cx = 0; cx < searchResultsArrayArray.Length; cx++)
                    {
                        // keep track, so we can check at the end whether the
                        // file appeared in ALL the result sets
                        totalcount += (cx + 1);

                        if (cx == c)
                        {
                            // the current result set implicitly matches
                            matchcount += (cx + 1);
                            weight += searchResultsArray[foundInFile].Count; // sum the weighting
                            occurrences.AddRange(searchResultsArray[foundInFile]);
                        }
                        else
                        {
                            Dictionary<File, List<int>> searchResultsArrayx = searchResultsArrayArray[cx];
                            if (null != searchResultsArrayx)
                            {
                                foreach (File foundInFilex in searchResultsArrayx.Keys)
                                {
                                    if (foundInFile == foundInFilex)
                                    {
                                        matchcount += (cx + 1); // it matches: track the matchcount
                                        weight += searchResultsArrayx[foundInFilex].Count; // and the weighting
                                        occurrences.AddRange(searchResultsArrayx[foundInFilex]);
                                        break; // no need to keep looking through this result set
                                    }
                                } // foreach
                            } // if
                        } // else
                    } // for

                    if ((matchcount > 0) && (matchcount == totalcount)) // matched in every result set
                    {
                        // build finalResults here and pass it to the formatting code
                        // below - we could format here, but that would mix up the
                        // 'result generation' and display code too much
                        if (!finalResultsArray.Contains(foundInFile))
                            finalResultsArray.Add(foundInFile, occurrences);
                    }
                } // foreach
            }

            // Time taken calculation
            Int64 ticks = DateTime.Now.Ticks - start.Ticks;
            TimeSpan taken = new TimeSpan(ticks);
            if (taken.Seconds > 0)
            {
                _DisplayTime = taken.Seconds + " seconds";
            }
            else if (taken.TotalMilliseconds > 0)
            {
                _DisplayTime = Convert.ToInt32(taken.TotalMilliseconds) + " milliseconds";
            }
            else
            {
                _DisplayTime = "less than 1 millisecond";
            }

            // The preceding 80 lines (or so) replace this single line from Version 1,
            // when only single-word searches were supported:
            //    Hashtable searchResultsArray = m_catalog.Search(searchterm);
            // Look closely and you'll see that line, labelled ##### THE SEARCH #####,
            // still in the code above...
            // Format the results
            if (finalResultsArray.Count > 0)
            {
                ResultFile infile;
                int sortrank = 0;

                // build each result row
                foreach (object foundInFile in finalResultsArray.Keys)
                {
                    // Create a ResultFile with its own Rank
                    infile = new ResultFile((File)foundInFile);

                    // [v7] if we have a cache of the page's content, display the
                    // relevant text excerpt in the search results
                    if (catalog.FileCache.Contains(infile.Url))
                    {
                        string desc = "";
                        string[] words = catalog.FileCache.GetDocumentCache(infile.Url);
                        // default to the middle of the document; replaced below
                        // by the position of the first matched word
                        int position = (words.Length / 2);

                        if (words.Length < 10)
                        {
                            // a very short document: show all of it
                            for (int i = 0; i < words.Length; i++)
                            {
                                desc += words[i] + " ";
                            }
                        }
                        else
                        {
                            List<int> pos = (List<int>)finalResultsArray[foundInFile];
                            pos.Sort();
                            int q = 0;
                            position = pos[q];

                            // collect the match positions within 50 words of the first match
                            List<int> useablePos = new List<int>();
                            foreach (int p in pos)
                            {
                                if (p < (position + 50)) useablePos.Add(p);
                                if (p > (position + 50)) break;
                            }

                            // excerpt up to 24 words either side of the first match
                            int lowerBound = (position < 24) ? position : 24;
                            int upperBound = (position < 24) ? 48 - position : 24;
                            lowerBound = position - lowerBound;
                            upperBound = position + upperBound;
                            if (upperBound > words.Length) upperBound = words.Length - 1;

                            for (int i = lowerBound; i < upperBound; i++)
                            {
                                if (i == position) desc += "<b>";
                                desc += words[i] + " ";
                                if (i == position)
                                {
                                    desc += "</b>";
                                    q++;
                                    if (q < pos.Count)
                                    {
                                        position = pos[q]; // move on to the next match
                                    }
                                }
                            }
                        }
                        infile.Description = desc;
                    }

                    if (geolocOnly
                        && (infile.GpsLocation == null || infile.GpsLocation == new Location()))
                    {
                        // don't add this ResultFile to the output [v6]
                    }
                    else
                    {
                        // Jim Harkins [sort for paging] ported from VB to C#
                        // http://www.codeproject.com/aspnet/spideroo.asp#xx927327xx
                        infile.Rank = ((List<int>)finalResultsArray[foundInFile]).Count;
                        sortrank = infile.Rank * -1000; // assume not 'thousands' of results

                        if (output.Contains(sortrank))
                        {
                            // rank already exists - increment the key until it fits
                            for (int i = 1; i < 999; i++)
                            {
                                sortrank++;
                                if (!output.Contains(sortrank))
                                {
                                    output.Add(sortrank, infile);
                                    if (infile.GpsLocation != null) _GeolocCount += 1;
                                    break;
                                }
                            }
                        }
                        else
                        {
                            output.Add(sortrank, infile);
                            if (infile.GpsLocation != null) _GeolocCount += 1;
                        }
                    }
                    sortrank = 0; // reset for the next pass
                }
                // Jim Harkins [paged results]
                // http://aspnet.4guysfromrolla.com/articles/081804-1.aspx
            }
            // else Count == 0, so the output SortedList will be empty
        }
    }
    return output;
}
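// --- Illustrative sketch (not part of the original source): consuming the
// SortedList returned by GetResults(). The keys are the negative 'sortrank'
// values built above, so enumerating Values yields ResultFile objects from
// highest to lowest Rank. Only members shown above (Rank, Url, Description)
// are used.
static void PrintResults(Searcharoo.Engine.Search se, Catalog catalog)
{
    SortedList results = se.GetResults("searcharoo test", catalog, false);
    foreach (ResultFile rf in results.Values)
    {
        Console.WriteLine(rf.Rank + " match(es): " + rf.Url);
        Console.WriteLine(rf.Description); // excerpt with the hit in <b>bold</b>, when cached
    }
}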
public void GetCatalog(Page pg)
{
    if ((string)pg.Application["CatalogLoad"] == "")
    {
        pg.Application.Lock();
        pg.Application["CatalogLoad"] = "Loading catalog in progress...";
        pg.Application.UnLock();
        log.Info("Loading catalog in progress...");

        // No catalog 'in memory', so let's look for one;
        // first, for a serialized version on disk
        try
        {
            _Catalog = Catalog.Load(); // returns null if not found
        }
        catch (Exception ex)
        {
            log.Error("Loading catalog error", ex);
        }

        pg.Application.Lock();
        pg.Application["CatalogLoad"] = "Loading cache catalog in progress...";
        pg.Application.UnLock();
        log.Info("Loading cache catalog in progress...");

        try
        {
            _Cache = Searcharoo.Common.Cache.Load();
            if (_Catalog != null)
            {
                _Catalog.FileCache = _Cache; // guard against a missing catalog
            }
        }
        catch (Exception ex)
        {
            log.Error("Loading cache catalog error", ex);
        }

        // Still no Catalog, so fall back to whatever is in the Application cache
        if (_Catalog == null)
        {
            _Catalog = (Catalog)pg.Application["Searcharoo_Catalog"];
            _Cache = (Searcharoo.Common.Cache)pg.Application["Searcharoo_Cache"];
            if (_Catalog != null)
            {
                log.Info("Catalog retrieved from Cache[] " + _Catalog.Words);
            }
        }
        else
        {
            // Yep, there was a serialized catalog file.
            // Don't forget to add it to the cache for next time (the Spider does this too)
            pg.Application.Lock();
            pg.Application["Searcharoo_Catalog"] = _Catalog;
            pg.Application["Searcharoo_Cache"] = _Cache;
            pg.Application.UnLock();
            log.Info("Deserialized catalog and put in Cache[] " + _Catalog.Words);
        }

        pg.Application.Lock();
        pg.Application["CatalogLoad"] = "";
        pg.Application.UnLock();
    }
}
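// --- Illustrative sketch (not part of the original source): using the catalog
// outside ASP.NET, e.g. in the console application mentioned earlier. It relies
// only on members shown in this article (Catalog.Load, Cache.Load, FileCache,
// GetResults); the Program wrapper and the using directives are assumptions.
using System;
using System.Collections;
using Searcharoo.Common;   // assumed home of Catalog and Cache

class Program
{
    static void Main(string[] args)
    {
        Catalog catalog = Catalog.Load(); // returns null if no serialized catalog exists
        if (catalog == null)
        {
            Console.WriteLine("No catalog on disk - run the Spider first.");
            return;
        }
        catalog.FileCache = Searcharoo.Common.Cache.Load();

        Searcharoo.Engine.Search se = new Searcharoo.Engine.Search();
        SortedList results = se.GetResults(args.Length > 0 ? args[0] : "searcharoo", catalog);
        Console.WriteLine(results.Count + " result(s) found");
    }
}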
/// <summary>
/// Legacy method signature (pre v6)
/// </summary>
public SortedList GetResults(string searchterm, Catalog catalog)
{
    // all results, including (but not limited to) geolocated ones
    return GetResults(searchterm, catalog, false);
}