/// <summary>
/// Entry point of the indexing system: creates a <see cref="ResourceServer"/>,
/// walks every path it exposes, and catalogs the resources found there.
/// </summary>
/// <remarks>
/// A non-empty catalog is serialized to disk so it can be reloaded if the
/// server application is restarted.
/// </remarks>
/// <returns>The freshly built <see cref="Catalog"/>.</returns>
public Catalog BuildCatalog()
{
    _Catalog = new Catalog();
    ProgressEvent(this, new ProgressEventArgs(1, "Spider.BuildCatalog() starting."));

    // Configure Stop/Go word lists and stemming before crawling begins
    SetPreferences();

    ResourceServer resourceServer = new ResourceServer();
    foreach (string resourcePath in resourceServer.PathCollection)
    {
        ProcessPath(resourcePath);
    }

    // Spidering is finished at this point
    ProgressEvent(this, new ProgressEventArgs(1, "Spider.BuildCatalog() complete."));

    if (_Catalog.Length == 0)
    {
        ProgressEvent(this, new ProgressEventArgs(3, "Not serializing/saving: Empty catalog!"));
    }
    else
    {
        ProgressEvent(this, new ProgressEventArgs(2, "Serializing to disk location " + Preferences.CatalogFileName));
        // Persist the catalog so it can be loaded again after an app restart
        _Catalog.Save();
        ProgressEvent(this, new ProgressEventArgs(3, "Save to disk " + Preferences.CatalogFileName + " successful"));
    }

    // Hand the finished catalog back for the calling code to 'use'
    return _Catalog;
}
/// <summary>
/// This page uses the Spider class to read and catalog a website.
/// The HTML header is written immediately so the progress messages streamed
/// through <c>OnProgressEvent</c> have somewhere to appear while the crawl runs;
/// the finished Catalog is placed in the Cache for the Search pages to reuse.
/// </summary>
protected void Page_Load(object sender, EventArgs e)
{
    // Emit the page shell up front - progress events write into this page
    Response.Write(
@"<html>
<head>
<style type='text/css'>
BODY { color: #000000; background-color: white; font-family: trebuchet ms, verdana, arial, sans-serif; font-size:x-small; margin-left: 0px; margin-top: 0px; }
</style>
<title>Crawling the library...</title>
</head>
<body>
<h3><p><font color='red'>S</font><font color='blue'>h</font><font color='green'>e</font><font color='orange'>l</font><font color='navy'>f</font>.<font color='maroon'>Search</font> <font color='#990000'><sup>Beta</sup></font></p></h3>
<h2>Catalog file not found! Building Catalog now ..</h2><p>"
    );

    // Build the catalog!
    Spider spidey = new Spider();
    spidey.SpiderProgressEvent += new SpiderProgressEventHandler(OnProgressEvent);
    _Catalog = spidey.BuildCatalog();
    // Cache the result so subsequent requests don't have to re-crawl
    Cache[Preferences.CatalogCacheKey] = _Catalog;

    // Check if anything was found
    if (_Catalog.Length > 0)
    {
        Response.Write("<br />Finished - now you can search!</p>");
        Logger.PerformanceLog(this, "Built new Catalog successfully!");
        Response.Write("<center><a href='Search.aspx'><h3>Start Searching Now !</a></h3></center>");
        Response.Write("</body></html>");
    }
    else
    {
        // FIX: corrected spelling "occured" -> "occurred" in the user-facing message
        Response.Write("</p><br /><p font='color:red'>Sorry, nothing was cataloged." +
            " Administrator will check if there are resources to catalog," +
            " and the logs to see if any error has occurred. Sorry for the " +
            "inconvenience, please check back here later.</p>");
        Response.Write("</body></html>");
        Response.End();
    }
}
/// <summary>
/// Method called from UI: searches the catalog for every word in the query and
/// intersects the per-word result sets so only pages matching ALL terms remain.
/// </summary>
/// <param name="searchterm">search query (one or more space-separated words)</param>
/// <param name="catalog">catalog to search</param>
/// <returns>ResultFile SortedList for display, keyed so heavier (more relevant)
/// results sort first; empty when input is null/empty or nothing matched</returns>
public SortedList GetResults(string searchterm, Catalog catalog)
{
    SortedList output = new SortedList();

    // ----------------------- DOING A SEARCH -----------------------
    if ((null != searchterm) && (null != catalog))
    {
        SetPreferences(); // load Stop/Go word and stemming preferences

        string[] searchTermArray = null, searchTermDisplay = null;

        // Normalize the query: collapse runs of whitespace, then split into terms
        Regex r = new Regex(@"\s+"); // matches continuous whitespace
        searchterm = r.Replace(searchterm, " "); // replaces 'em with a single space
        searchTermArray = searchterm.Split(' '); // then split
        // Keep an untouched copy of each term for display; the working copy
        // below gets trimmed/lower-cased/stemmed
        searchTermDisplay = (string[])searchTermArray.Clone();

        for (int i = 0; i < searchTermArray.Length; i++)
        {
            if (_GoChecker.IsGoWord(searchTermArray[i]))
            {
                // was a Go word, just Lower it
                searchTermArray[i] = searchTermArray[i].ToLower();
            }
            else
            {
                // Not a Go word: strip surrounding punctuation, lower-case,
                // then apply stemming so the term matches the catalog's stems
                searchTermArray[i] = searchTermArray[i].Trim(' ', '?', '\"', ',', '\'', ';', ':', '.', '(', ')').ToLower();
                searchTermArray[i] = _Stemmer.StemWord(searchTermArray[i].ToString());
            }
        }

        if (searchterm == String.Empty)
        {
            // After trimming the search term, it was found to be empty!
            return output;
        }
        else
        {
            // we have a search term!
            DateTime start = DateTime.Now; // to show 'time taken' to perform search

            // Array of arrays of results that match ONE of the search criteria
            Hashtable[] searchResultsArrayArray = new Hashtable[searchTermArray.Length];
            // finalResultsArray is populated with pages that *match* ALL the search criteria
            HybridDictionary finalResultsArray = new HybridDictionary();
            bool botherToFindMatches = true;
            // Track the smallest per-term result set: driving the intersection
            // from the shortest set minimizes the work in the loop below
            int indexOfShortestResultSet = -1, lengthOfShortestResultSet = -1;

            for (int i = 0; i < searchTermArray.Length; i++)
            {
                // ##### THE SEARCH #####
                searchResultsArrayArray[i] = catalog.Search(searchTermArray[i].ToString());
                if (null == searchResultsArrayArray[i])
                {
                    _Matches += searchTermDisplay[i] + " <font color='gray' style='font-size:xx-small'>(not found)</font> ";
                    // if *any one* of the terms isn't found, there won't be a 'set' of Matches
                    botherToFindMatches = false;
                }
                else
                {
                    int resultsInThisSet = searchResultsArrayArray[i].Count;
                    // Build the "matched term (count)" HTML shown above the results
                    _Matches += "<a href=\"?" + Preferences.QuerystringParameterName + "=" + searchTermDisplay[i]
                        + "\" title=\"" + searchTermArray[i] + "\">" + searchTermDisplay[i]
                        + "</a> <font color=gray style='font-size:xx-small'>(" + resultsInThisSet + ")</font> ";
                    if ((lengthOfShortestResultSet == -1) || (lengthOfShortestResultSet > resultsInThisSet))
                    {
                        indexOfShortestResultSet = i;
                        lengthOfShortestResultSet = resultsInThisSet;
                    }
                }
            }

            // Find the common files from the array of arrays of documents
            // matching ONE of the criteria
            if (botherToFindMatches) // all words have *some* matches
            {
                // for each result set [NOT required, but maybe later if we do AND/OR searches)
                int c = indexOfShortestResultSet; // loop through the *shortest* resultset
                Hashtable searchResultsArray = searchResultsArrayArray[c];
                foreach (object foundInFile in searchResultsArray) // for each file in the *shortest* result set
                {
                    DictionaryEntry fo = (DictionaryEntry)foundInFile;
                    // find matching files in the other resultsets
                    int matchcount = 0, totalcount = 0, weight = 0;
                    for (int cx = 0; cx < searchResultsArrayArray.Length; cx++)
                    {
                        totalcount += (cx + 1); // keep track, so we can compare at the end (if term is in ALL resultsets)
                        if (cx == c) // current resultset
                        {
                            matchcount += (cx + 1); // implicitly matches in the current resultset
                            weight += (int)fo.Value; // sum the weighting
                        }
                        else
                        {
                            Hashtable searchResultsArrayx = searchResultsArrayArray[cx];
                            if (null != searchResultsArrayx)
                            {
                                foreach (object foundInFilex in searchResultsArrayx)
                                {
                                    // for each file in the result set
                                    DictionaryEntry fox = (DictionaryEntry)foundInFilex;
                                    // NOTE(review): reference comparison - assumes the same key
                                    // object instance is shared across result sets; confirm in Catalog
                                    if (fo.Key == fox.Key)
                                    {
                                        matchcount += (cx + 1); // and if it matches, track the matchcount
                                        weight += (int)fox.Value; // and weighting; then break out of loop, since
                                        break; // no need to keep looking through this resultset
                                    }
                                } // foreach
                            } // if
                        } // else
                    } // for

                    if ((matchcount > 0) && (matchcount == totalcount)) // was matched in each Array
                    {
                        // we build the finalResults here, to pass to the formatting code below
                        // - we could do the formatting here, but it would mix up the 'result generation'
                        // and display code too much
                        // (fo is a struct copy, so mutating Value here does not touch the source Hashtable)
                        fo.Value = weight; // set the 'weight' in the combined results to the sum of individual document matches
                        if (!finalResultsArray.Contains(fo.Key))
                            finalResultsArray.Add(fo.Key, fo);
                    } // if
                } // foreach
            }

            // Time taken calculation
            Int64 ticks = DateTime.Now.Ticks - start.Ticks;
            TimeSpan taken = new TimeSpan(ticks);
            if (taken.Seconds > 0)
            {
                _DisplayTime = taken.Seconds + " seconds";
            }
            else if (taken.TotalMilliseconds > 0)
            {
                _DisplayTime = Convert.ToInt32(taken.TotalMilliseconds) + " milliseconds";
            }
            else
            {
                _DisplayTime = "less than 1 millisecond";
            }

            // Format the results
            if (finalResultsArray.Count > 0)
            {
                // intermediate data-structure for 'ranked' result HTML
                //SortedList output = new SortedList(finalResultsArray.Count); // empty sorted list
                ResultFile infile;
                int sortrank = 0;
                // build each result row
                foreach (object foundInFile in finalResultsArray.Keys)
                {
                    // Create a ResultFile with it's own Rank
                    infile = new ResultFile((File)foundInFile);
                    infile.Rank = (int)((DictionaryEntry)finalResultsArray[foundInFile]).Value;
                    // Negative key makes larger ranks sort first in the SortedList;
                    // the *1000 factor leaves room for the collision adjustment below
                    sortrank = infile.Rank * -1000; // Assume not 'thousands' of results
                    if (output.Contains(sortrank))
                    {
                        // rank exists - drop key index one number until it fits
                        for (int i = 1; i < 999; i++)
                        {
                            sortrank++;
                            if (!output.Contains(sortrank))
                            {
                                output.Add(sortrank, infile);
                                break;
                            }
                        }
                    }
                    else
                    {
                        output.Add(sortrank, infile);
                    }
                    sortrank = 0; // reset for next pass
                }
                // Jim Harkins [paged results]
                // http://aspnet.4guysfromrolla.com/articles/081804-1.aspx
            } // else Count == 0, so output SortedList will be empty
        }
    }
    return output;
}
/// <summary>
/// Search-page initialization: ensures a Catalog is available (from the Cache,
/// from the serialized file on disk, or by redirecting to the crawler page to
/// build one), then runs the search (if a query is present) and binds the
/// paged results to the UI controls.
/// </summary>
protected void Page_Load()
{
    bool getCatalog = false;

    // See if there is a catalog object in the cache. A cache miss just yields
    // null here - the previous version relied on catching the resulting
    // NullReferenceException, which used exceptions for ordinary control flow.
    _Catalog = Cache[Preferences.CatalogCacheKey] as Catalog;
    if (null != _Catalog)
    {
        _WordCount = _Catalog.Length; // if so, get the _WordCount
    }
    else
    {
        // Not cached - we'll need to load_from_file or build the catalog again
        // (FIX: corrected "Loadind" typo in the log message)
        Logger.PerformanceLog(this, "Catalog object unavailable : Loading from file!");
    }

    if (null == _Catalog)
        getCatalog = true;
    else if (_Catalog.Length == 0)
        getCatalog = true;

    if (getCatalog)
    {
        // No catalog 'in memory', so let's look for one
        // First, for a serialized version on disk
        _Catalog = Catalog.Load(); // returns null if not found

        // Still no Catalog, so we have to start building a new one
        if (null == _Catalog)
        {
            Logger.PerformanceLog(this, "Catalog object unavailable & serialized file missing : Building new Catalog!");
            Response.Redirect("Crawling.aspx", true);
        }
        else
        {
            // Yep, there was a serialized catalog file
            // Don't forget to add to cache for next time (the Spider does this too)
            Cache[Preferences.CatalogCacheKey] = _Catalog;
            _WordCount = _Catalog.Length; // if so, get the _WordCount
            Logger.PerformanceLog(this, "Deserialized catalog and put in Cache");
        }
    }

    ucSearchPanelHeader.WordCount = _WordCount;
    ucSearchPanelFooter.WordCount = _WordCount;

    if (this.SearchQuery == "")
    {
        // No query yet: show only the header search panel
        ucSearchPanelFooter.Visible = false;
        ucSearchPanelFooter.IsFooter = true;
        ucSearchPanelHeader.IsSearchResultsPage = false;
    }
    else
    {
        SearchEngine se = new SearchEngine();
        SortedList output = GetSearchResults(se); // which'll do se.GetResults(this.SearchQuery, _Catalog)
        _NumberOfMatches = output.Count.ToString();
        if (output.Count > 0)
        {
            _PagedResults.DataSource = output.GetValueList();
            _PagedResults.AllowPaging = true;
            _PagedResults.PageSize = MaxResultsPerPage; //;Preferences.ResultsPerPage; //10;
            // Querystring pages are 1-based; CurrentPageIndex is 0-based.
            // FIX: int.TryParse avoids an unhandled FormatException when the
            // 'page' value is missing or malformed (falls back to page 1).
            int page;
            _PagedResults.CurrentPageIndex =
                int.TryParse(Request.QueryString["page"], out page) ? page - 1 : 0;
            _Matches = se.SearchQueryMatchHtml;
            _DisplayTime = se.DisplayTime;
            SearchResults.DataSource = _PagedResults;
            SearchResults.DataBind();
        }
        else
        {
            lblNoSearchResults.Visible = true;
        }

        // Set the display info in the top & bottom user controls
        ucSearchPanelHeader.Word = ucSearchPanelFooter.Word = SearchQuery;
        ucSearchPanelFooter.Visible = true;
        ucSearchPanelFooter.IsFooter = true;
        ucSearchPanelHeader.IsSearchResultsPage = true;
    }

    // Optional diagnostics panel, enabled via ?semantics=true
    if (!string.IsNullOrEmpty(Request.QueryString["semantics"])
        && (Request.QueryString["semantics"].Equals("true", StringComparison.InvariantCultureIgnoreCase)))
    {
        SemanticsPanel.Visible = true;
        SInformation.Text = SInfo.SemanticsHtml;
    }
}