Пример #1
0
        /// <summary>
        /// [v6] Creates a fresh Catalog, calls ProcessUri on every start page in turn,
        /// saves the Catalog and hands it back. Progress is reported through
        /// ProgressEvent at each stage.
        /// </summary>
        /// <param name="startPageUris">array of start pages</param>
        /// <returns>Catalog of words/documents</returns>
        public Catalog BuildCatalog(Uri[] startPageUris)
        {
            _Catalog = new Catalog(); //_Cache = new Cache(); // [v7]

            string arrayCountMessage = "Spider.Catalog (Uri Array) count: " + startPageUris.Length.ToString();
            ProgressEvent(this, new ProgressEventArgs(1, arrayCountMessage));

            // Setup Stop, Go, Stemming
            SetPreferences();

            for (int uriIndex = 0; uriIndex < startPageUris.Length; uriIndex++)
            {
                Uri startPageUri = startPageUris[uriIndex];

                // remembered so fully qualified links can be compared against the start page
                _CurrentStartUri = startPageUri;
                _CurrentStartUriString = _CurrentStartUri.AbsoluteUri.ToLower();

                string beginMessage = "Spider.Catalog (start Uri) " + startPageUri.AbsoluteUri;
                ProgressEvent(this, new ProgressEventArgs(1, beginMessage));

                _Robot = new RobotsTxt(startPageUri, Preferences.RobotUserAgent);

                // The start page is the 'root' document: processing it is what sets the
                // spider going (the recursive processing starts with this call)
                ProcessUri(startPageUri, 0);

                string endMessage = "Spider.Catalog (end Uri) " + startPageUri.AbsoluteUri;
                ProgressEvent(this, new ProgressEventArgs(1, endMessage));
            }

            // All start pages have been processed
            ProgressEvent(this, new ProgressEventArgs(1, "Spider.Catalog() complete."));

            string savingMessage = "Serializing to disk location " + Preferences.CatalogFileName;
            ProgressEvent(this, new ProgressEventArgs(2, savingMessage));

            // Persist the Catalog so it can be loaded again if the server Application is restarted
            _Catalog.Save();
            //_Cache.Save(); //[v7]

            string savedMessage = "Save to disk " + Preferences.CatalogFileName + " successful";
            ProgressEvent(this, new ProgressEventArgs(3, savedMessage));

            // finished, the calling code can now 'use' the catalog
            return _Catalog;
        }
Пример #2
0
        /// <summary>
        /// ALL processing happens here, since we are not using ASP.NET controls or events.
        /// Page_Load will:
        /// * end the response immediately when the request's user agent contains "searcharoo"
        /// * look in Application state for a catalog (and page cache) to use
        /// * if there is none, or it is empty, start a thread that runs
        ///   SearchCatalogInit.GetCatalog for this page
        /// * check SearchQuery for search arguments (and if so, do a search and bind the
        ///   requested page of results)
        /// * otherwise just show the HTML of this page - a blank search form
        /// </summary>
        public void Page_Load()
        {
            // prevent Searcharoo from indexing itself (ie. its own results page).
            // IndexOf returns 0 when the user agent *starts with* the token, so the match
            // test has to be ">= 0"; the comparison is case-insensitive without allocating
            // a lower-cased copy.
            string userAgent = Request.UserAgent;
            if ((userAgent != null) && (userAgent.IndexOf("searcharoo", StringComparison.OrdinalIgnoreCase) >= 0))
            {
                Response.Clear();
                Response.End();
                return;
            }

            try
            {
                // see if there is a catalog object in the cache
                _Catalog = (Catalog)Application["Searcharoo_Catalog"];

                // if so, get the _WordCount (a missing catalog surfaces here as an exception,
                // which is what sends us to the catch block below)
                _WordCount = _Catalog.Length;
                _Cache = (Searcharoo.Common.Cache)Application["Searcharoo_Cache"];
            }
            catch (Exception)
            {
                // otherwise, we'll need to build the catalog
                log.Warn("Catalog object unavailable : building a new one!");

                _Catalog = null; // in case
                _Cache = null;
            }

            ucSearchPanelHeader.WordCount = _WordCount;
            //ucSearchPanelFooter.WordCount = _WordCount;

            // a missing catalog and an empty catalog are treated the same way
            bool getCatalog = (_Catalog == null) || (_Catalog.Length == 0);

            if (getCatalog)
            {
                // Run SearchCatalogInit.GetCatalog for this page on its own thread, so this
                // request does not wait for the load to finish.
                SearchCatalogInit sci = new SearchCatalogInit();
                Thread t = new Thread(() => sci.GetCatalog(this));
                t.Start();

                // NOTE(review): the thread has only just been started, so this check races
                // with it; it only picks up a catalog that is already in Application state.
                if ((string)Application["CatalogLoad"] == "")
                {
                    if (_Catalog == null)
                    {
                        _Catalog = (Catalog)Application["Searcharoo_Catalog"];
                        _Cache = (Searcharoo.Common.Cache)Application["Searcharoo_Cache"];
                    }
                }
            }

            if (this.SearchQuery == "")
            {
                ucSearchPanelHeader.IsSearchResultsPage = false;
            }
            else
            {
                //refactored into class - catalog can be build via a console application as well as the SearchSpider.aspx page
                Searcharoo.Engine.Search se = new Searcharoo.Engine.Search();
                SortedList output = this.GetSearchResults(se); // se.GetResults(this.SearchQuery, _Catalog);

                _NumberOfMatches = output.Count.ToString();
                if (output.Count > 0)
                {
                    // 'page' comes straight from the query string: anything that is missing,
                    // non-numeric or below 1 falls back to the first page instead of throwing
                    // a FormatException or producing a negative page index.
                    int requestedPage;
                    if (!int.TryParse(Request.QueryString["page"], out requestedPage) || (requestedPage < 1))
                    {
                        requestedPage = 1;
                    }

                    _PagedResults.DataSource = output.GetValueList();
                    _PagedResults.AllowPaging = true;
                    _PagedResults.PageSize = MaxResultsPerPage; //;Preferences.ResultsPerPage; //10;
                    _PagedResults.CurrentPageIndex = requestedPage - 1; // query string is 1-based

                    _Matches = se.SearchQueryMatchHtml;
                    _DisplayTime = se.DisplayTime;
                    _Geocoded = se.GeocodedMatches;

                    SearchResults.DataSource = _PagedResults;
                    SearchResults.DataBind();
                }
                else
                {
                    lblNoSearchResults.Visible = true;
                }

                // Set the display info in the top user control
                ucSearchPanelHeader.Word = this.SearchQuery;
                ucSearchPanelHeader.IsSearchResultsPage = true;
            }
        }
Пример #3
0
        //public SortedList GetResults(string searchterm, Catalog catalog, bool geolocOnly)
        //{
        //    return GetResults(searchterm, catalog, false, new Cache());
        //}
        /// <summary>
        /// [v6] Searches the catalog for files that contain ALL of the words in the query
        /// and returns them ranked for display.
        /// Side effects: appends one HTML fragment per search word to _Matches, sets
        /// _DisplayTime, and increments _GeolocCount for every returned result that has
        /// a GpsLocation.
        /// </summary>
        /// <param name="searchterm">search query; null, empty or whitespace-only gives an empty list</param>
        /// <param name="catalog">catalog to search; null gives an empty list</param>
        /// <param name="geolocOnly">If true, ONLY return results with a lat/long</param>
        /// <returns>ResultFile SortedList for display, keyed by a sort value derived from the rank</returns>
        public SortedList GetResults(string searchterm, Catalog catalog, bool geolocOnly)
        {
            SortedList output = new SortedList();

            if ((null == searchterm) || (null == catalog))
            {
                return output;
            }

            SetPreferences();

            // Collapse every run of whitespace to a single space and drop it from both ends,
            // so Split() below cannot produce empty terms (an empty term is never found and
            // would make the whole AND-search fail).
            searchterm = Regex.Replace(searchterm, @"\s+", " ").Trim();
            if (searchterm.Length == 0)
            {
                // After trimming the search term, it was found to be empty!
                return output;
            }

            // searchTermArray holds the normalised words that are looked up;
            // searchTermDisplay keeps them as typed, for display.
            string[] searchTermArray = searchterm.Split(' ');
            string[] searchTermDisplay = (string[])searchTermArray.Clone();
            for (int i = 0; i < searchTermArray.Length; i++)
            {
                if (_GoChecker.IsGoWord(searchTermArray[i]))
                {   // was a Go word, just Lower it
                    searchTermArray[i] = searchTermArray[i].ToLower();
                }
                else
                {   // Not a Go word, strip punctuation and apply stemming
                    searchTermArray[i] = searchTermArray[i].Trim(' ', '?', '\"', ',', '\'', ';', ':', '.', '(', ')').ToLower();
                    searchTermArray[i] = _Stemmer.StemWord(searchTermArray[i]);
                }
            }

            DateTime start = DateTime.UtcNow;  // to show 'time taken' to perform search

            // One result set per search word: file -> word positions in that file
            Dictionary<File, List<int>>[] searchResultsArrayArray = new Dictionary<File, List<int>>[searchTermArray.Length];
            // finalResultsArray is populated with files that *match* ALL the search words
            HybridDictionary finalResultsArray = new HybridDictionary();

            bool botherToFindMatches = true;
            int indexOfShortestResultSet = -1, lengthOfShortestResultSet = -1;

            for (int i = 0; i < searchTermArray.Length; i++)
            {   // ##### THE SEARCH #####
                searchResultsArrayArray[i] = catalog.Search(searchTermArray[i]);

                // The words are user input and end up in HTML, so they are encoded before
                // being placed in markup / attribute values / the query string.
                string displayHtml = System.Net.WebUtility.HtmlEncode(searchTermDisplay[i]);

                if (null == searchResultsArrayArray[i])
                {
                    _Matches += displayHtml + " <font color=gray style='font-size:xx-small'>(not found)</font> ";
                    botherToFindMatches = false; // if *any one* of the terms isn't found, there won't be a 'set' of Matches
                }
                else
                {
                    int resultsInThisSet = searchResultsArrayArray[i].Count;
                    _Matches += "<a href=\"?" + Preferences.QuerystringParameterName + "=" + Uri.EscapeDataString(searchTermDisplay[i])
                            + "\" title=\"" + System.Net.WebUtility.HtmlEncode(searchTermArray[i]) + "\">"
                            + displayHtml
                            + "</a> <font color=gray style='font-size:xx-small'>(" + resultsInThisSet + ")</font> ";
                    if ((lengthOfShortestResultSet == -1) || (lengthOfShortestResultSet > resultsInThisSet))
                    {
                        indexOfShortestResultSet = i;
                        lengthOfShortestResultSet = resultsInThisSet;
                    }
                }
            }

            // Find the files common to every result set. Only the *shortest* set needs to be
            // walked: a file that is in all sets must be in that one too.
            if (botherToFindMatches)    // all words have *some* matches
            {
                int c = indexOfShortestResultSet;
                Dictionary<File, List<int>> shortestResultSet = searchResultsArrayArray[c];

                foreach (File foundInFile in shortestResultSet.Keys)
                {
                    List<int> occurences = new List<int>();   // positions of every search word in this file
                    bool foundInEverySet = true;

                    for (int cx = 0; (cx < searchResultsArrayArray.Length) && foundInEverySet; cx++)
                    {
                        if (cx == c)
                        {   // implicitly matches in the set we are walking
                            occurences.AddRange(shortestResultSet[foundInFile]);
                            continue;
                        }

                        // NOTE(review): files are compared with == exactly as before; a keyed
                        // lookup would be faster, but whether it is equivalent depends on
                        // File's equality members, which are not visible here.
                        Dictionary<File, List<int>> otherResultSet = searchResultsArrayArray[cx];
                        bool foundHere = false;
                        foreach (File foundInFilex in otherResultSet.Keys)
                        {
                            if (foundInFile == foundInFilex)
                            {
                                occurences.AddRange(otherResultSet[foundInFilex]);
                                foundHere = true;
                                break;  // no need to keep looking through this resultset
                            }
                        }
                        foundInEverySet = foundHere;
                    }

                    // The final results are only collected here; formatting is done below so
                    // that result generation and display code stay separate.
                    if (foundInEverySet && !finalResultsArray.Contains(foundInFile))
                    {
                        finalResultsArray.Add(foundInFile, occurences);
                    }
                }
            }

            // Time taken calculation. TotalSeconds (not the 0-59 Seconds component) so that
            // anything taking a minute or more is still reported in seconds.
            TimeSpan taken = DateTime.UtcNow - start;
            if (taken.TotalSeconds >= 1)
            {
                _DisplayTime = (int)taken.TotalSeconds + " seconds";
            }
            else if (taken.TotalMilliseconds > 0)
            {
                _DisplayTime = Convert.ToInt32(taken.TotalMilliseconds) + " milliseconds";
            }
            else
            {
                _DisplayTime = "less than 1 millisecond";
            }

            // Format the results
            if (finalResultsArray.Count > 0)
            {   // intermediate data-structure for 'ranked' results
                output = new SortedList(finalResultsArray.Count);

                foreach (object matchedFile in finalResultsArray.Keys)
                {
                    List<int> positions = (List<int>)finalResultsArray[matchedFile];

                    // Create a ResultFile with its own Rank
                    ResultFile infile = new ResultFile((File)matchedFile);

                    // [v7] if we have a cache of the page's content, we'll display the relevant
                    // text excerpt in the search results
                    if ((catalog.FileCache != null) && catalog.FileCache.Contains(infile.Url))
                    {
                        string[] words = catalog.FileCache.GetDocumentCache(infile.Url);
                        infile.Description = BuildResultExcerpt(words, positions);
                    }

                    if (geolocOnly && (infile.GpsLocation == null || infile.GpsLocation == new Location()))
                    {
                        // don't add this ResultFile to output [v6]
                        continue;
                    }

                    // Jim Harkins [sort for paging] ported from VB to C#
                    // http://www.codeproject.com/aspnet/spideroo.asp#xx927327xx
                    infile.Rank = positions.Count;

                    // Negative so that higher ranks sort first; the gap of 1000 leaves room
                    // for files sharing a rank (assume not 'thousands' of results).
                    int sortrank = infile.Rank * -1000;
                    int bumps = 0;
                    while (output.Contains(sortrank) && (bumps < 998))
                    {   // rank exists - move the key on one number until it fits
                        sortrank++;
                        bumps++;
                    }

                    // As before, a file is left out when no free key was found in the range tried
                    if (!output.Contains(sortrank))
                    {
                        output.Add(sortrank, infile);
                        if (infile.GpsLocation != null) _GeolocCount += 1;
                    }
                }
                // Jim Harkins [paged results]
                // http://aspnet.4guysfromrolla.com/articles/081804-1.aspx
            } // else Count == 0, so output SortedList will be empty

            return output;
        }

        /// <summary>
        /// Builds the text excerpt shown for one result: the whole document when it has fewer
        /// than 10 words, otherwise a window of up to 48 words around the first matched
        /// position, with each matched word inside the window wrapped in &lt;b&gt; tags.
        /// </summary>
        /// <param name="words">the cached words of the document, in order</param>
        /// <param name="positions">indexes into words of the matched words (sorted in place)</param>
        /// <returns>excerpt text, every word followed by a space</returns>
        private static string BuildResultExcerpt(string[] words, List<int> positions)
        {
            System.Text.StringBuilder excerpt = new System.Text.StringBuilder();

            if (words.Length < 10)
            {
                for (int i = 0; i < words.Length; i++)
                {
                    excerpt.Append(words[i]).Append(' ');
                }
                return excerpt.ToString();
            }

            positions.Sort();

            // Centre the window on the first match; near the start of the document the
            // window simply begins at word 0. With no positions at all, nothing is bolded.
            int firstMatch = (positions.Count > 0) ? positions[0] : 0;
            int lowerBound = (firstMatch < 24) ? 0 : firstMatch - 24;
            int upperBound = (firstMatch < 24) ? 48 : firstMatch + 24;

            // upperBound is exclusive, so clamping to Length (not Length - 1) keeps the last word
            if (upperBound > words.Length) upperBound = words.Length;

            int next = 0;   // index into positions of the next match still to be highlighted
            for (int i = lowerBound; i < upperBound; i++)
            {
                bool isMatch = (next < positions.Count) && (positions[next] == i);

                if (isMatch) excerpt.Append("<b>");
                excerpt.Append(words[i]).Append(' ');
                if (isMatch)
                {
                    excerpt.Append("</b>");
                    // skip duplicates of this position so later matches still get highlighted
                    while ((next < positions.Count) && (positions[next] <= i)) next++;
                }
            }

            return excerpt.ToString();
        }
Пример #4
0
        /// <summary>
        /// Loads the serialized catalog and page cache and stores them in the page's
        /// Application state under "Searcharoo_Catalog" / "Searcharoo_Cache".
        /// The Application entry "CatalogLoad" is used as a busy flag: the method does
        /// nothing unless that entry is the empty string, sets it to a progress message
        /// while working, and resets it to the empty string when done (also on failure).
        /// </summary>
        /// <param name="pg">page whose Application state is read and updated</param>
        public void GetCatalog(Page pg)
        {
            // Test and set the busy flag under one lock, so that two callers cannot both
            // see "" and both start loading.
            pg.Application.Lock();
            try
            {
                if ((string)pg.Application["CatalogLoad"] != "")
                {
                    return;
                }
                pg.Application["CatalogLoad"] = "Loading catalog in progress...";
            }
            finally
            {
                pg.Application.UnLock();
            }

            try
            {
                log.Info("Loading catolog in progress...");

                // No catalog 'in memory', so let's look for one
                // First, for a serialized version on disk
                try
                {
                    _Catalog = Catalog.Load();  // returns null if not found
                }
                catch (Exception ex)
                {
                    log.Error("Loading catalog error", ex);
                }

                SetCatalogLoadStatus(pg, "Loading cache catalog in progress...");

                log.Info("Loading cache catolog in progress...");

                try
                {
                    _Cache = Searcharoo.Common.Cache.Load();

                    // Only attach the cache when a catalog was actually loaded; otherwise a
                    // missing catalog file would be reported as a cache loading error.
                    if (_Catalog != null)
                    {
                        _Catalog.FileCache = _Cache;
                    }
                }
                catch (Exception ex)
                {
                    log.Error("Loading cache catalog error", ex);
                }

                if (_Catalog == null)
                {
                    // Still no Catalog: fall back to whatever is already in Application state
                    _Catalog = (Catalog)pg.Application["Searcharoo_Catalog"];
                    _Cache = (Searcharoo.Common.Cache)pg.Application["Searcharoo_Cache"];

                    if (_Catalog != null)
                    {
                        log.Info("Catalog retrieved from Cache[] " + _Catalog.Words);
                    }
                }
                else
                {
                    // Yep, there was a serialized catalog file
                    // Don't forget to add to cache for next time (the Spider does this too)
                    pg.Application.Lock();
                    try
                    {
                        pg.Application["Searcharoo_Catalog"] = _Catalog;
                        pg.Application["Searcharoo_Cache"] = _Cache;
                    }
                    finally
                    {
                        pg.Application.UnLock();
                    }

                    log.Info("Deserialized catalog and put in Cache[] " + _Catalog.Words);
                }
            }
            finally
            {
                // Always clear the busy flag; if it stayed set after a failure, no later
                // call would ever attempt the load again.
                SetCatalogLoadStatus(pg, "");
            }
        }

        /// <summary>
        /// Writes the "CatalogLoad" Application entry while holding the Application lock.
        /// </summary>
        /// <param name="pg">page whose Application state is updated</param>
        /// <param name="status">progress message, or the empty string when no load is running</param>
        private static void SetCatalogLoadStatus(Page pg, string status)
        {
            pg.Application.Lock();
            try
            {
                pg.Application["CatalogLoad"] = status;
            }
            finally
            {
                pg.Application.UnLock();
            }
        }
Пример #5
0
 /// <summary>
 /// Legacy method signature (pre v6): forwards to the three-argument overload
 /// with geolocOnly switched off.
 /// </summary>
 /// <param name="searchterm">search query</param>
 /// <param name="catalog">catalog to search</param>
 /// <returns>whatever the three-argument overload returns for geolocOnly = false</returns>
 public SortedList GetResults(string searchterm, Catalog catalog)
 {
     // all results, including but not limited to geoloc
     SortedList allResults = GetResults(searchterm, catalog, false);
     return allResults;
 }