Exemplo n.º 1
0
        /// <summary>
        /// Breaks a block of text into whitespace-separated tokens and normalizes each
        /// token in place. Every token is lower-cased; a token that is not a go-word is
        /// passed through RemovePunctuation and, when IsNumber reports false, has its
        /// ',' and '.' characters removed before the stemmer and the stopper are applied.
        /// </summary>
        /// <param name="words">Text to split; null or empty yields an empty array.</param>
        /// <returns>One normalized entry per token of the whitespace-compressed input.</returns>
        private string[] SplitWords(string words)
        {
            if (String.IsNullOrEmpty(words))
            {
                return new string[0];
            }

            // Collapse every run of whitespace into one space, then cut on that space.
            string[] tokens = new Regex(@"\s+").Replace(words, " ").Split(' ');

            for (int index = 0; index < tokens.Length; index++)
            {
                string token = tokens[index].ToLower();

                // Go-words are special cases that are kept as they are (lower-cased only).
                if (!_goWord.IsGoWord(token))
                {
                    RemovePunctuation(ref token);

                    if (!IsNumber(ref token))
                    {
                        // Not a number: drop numeric separators and treat it as a word.
                        // TODO: remove inline punctuation, split hyphenated words?
                        // http://blogs.msdn.com/ericgu/archive/2006/01/16/513645.aspx
                        token = Regex.Replace(token, "[,.]", "", RegexOptions.IgnoreCase);
                        token = _stemmer.StemWord(token);
                        token = _stopper.StopWord(token);
                    }
                }

                tokens[index] = token;
            }

            return tokens;
        }
        /// <summary>
        /// Adds a downloaded Document to the catalog, BY FIRST 'copying' its main
        /// properties into a File instance. The distinction is a bit arbitrary: Documents
        /// are downloaded and indexed, but their content is modelled as a File
        /// in the Catalog. Every word of the document is normalized and, when the
        /// resulting key is not empty, added to the catalog together with its position;
        /// finally the document's word array is stored in the catalog's FileCache.
        /// </summary>
        /// <param name="downloadDocument">Document whose words are to be catalogued.</param>
        /// <returns>
        /// Number of words examined in the Document. Words whose key ends up empty are
        /// counted here even though they are not added to the catalog.
        /// </returns>
        protected int AddToCatalog(Document downloadDocument)
        {
            File infile = new File(downloadDocument.Uri.AbsoluteUri
                                   , downloadDocument.Title.UnicodeToCharacter()
                                   , downloadDocument.Description.UnicodeToCharacter()
                                   , DateTime.Now
                                   , downloadDocument.Length
                                   , downloadDocument.GpsLocation
                                   , downloadDocument.Extension
                                   , downloadDocument.KeywordString.UnicodeToCharacter());

            // ### Loop through words in the file ###
            // The position advances for EVERY word, catalogued or not.
            // NOTE(review): presumably so positions line up with the WordsArray stored
            // in the FileCache below - confirm before changing.
            int position = 0;

            foreach (string word in downloadDocument.WordsArray)
            {
                string key = word.UnicodeToCharacter().ToLower();

                if (_GoChecker.IsGoWord(key))
                {
                    // special case: catalogued as-is (lower-cased only)
                    ProgressEvent(this, new ProgressEventArgs(4, "Found GoWord " + key + " in " + downloadDocument.Title));
                }
                else
                {   // not a special case, parse like any other word
                    RemovePunctuation(ref key);

                    if (!IsNumber(ref key))
                    {   // not a number, so get rid of numeric separators and catalog as a word
                        // TODO: remove inline punctuation, split hyphenated words?
                        // http://blogs.msdn.com/ericgu/archive/2006/01/16/513645.aspx
                        key = System.Text.RegularExpressions.Regex.Replace(key, "[,.]", "");

                        // Apply Stemmer (set by preferences)
                        key = _Stemmer.StemWord(key);

                        // Apply Stopper (set by preferences)
                        key = _Stopper.StopWord(key);
                    }
                }

                if (key != String.Empty)
                {
                    _Catalog.Add(key, infile, position);
                }
                position++;
            }

            _Catalog.FileCache.Add(downloadDocument.WordsArray, infile);
            return position;
        }
Exemplo n.º 3
0
        //public SortedList GetResults(string searchterm, Catalog catalog, bool geolocOnly)
        //{
        //    return GetResults(searchterm, catalog, false, new Cache());
        //}

        /// <summary>
        /// Runs a multi-word search (v6): looks each word of the query up in the catalog
        /// and returns the files that were found for ALL of the words, ranked by the
        /// total number of occurrences.
        /// </summary>
        /// <remarks>
        /// Besides building the result list this method appends one HTML fragment per
        /// search term to _Matches, sets _DisplayTime, and increments _GeolocCount for
        /// every returned result whose GpsLocation is not null.
        /// </remarks>
        /// <param name="searchterm">search query; leading, trailing and repeated whitespace is ignored</param>
        /// <param name="catalog">catalog to search</param>
        /// <param name="geolocOnly">If true, ONLY return results with a lat/long</param>
        /// <returns>
        /// ResultFile SortedList for display, keyed by sort rank (most occurrences first);
        /// empty when either argument is null, the query is blank, or no file matches every term
        /// </returns>
        public SortedList GetResults(string searchterm, Catalog catalog, bool geolocOnly)
        {
            SortedList output = new SortedList();

            if ((null == searchterm) || (null == catalog))
            {
                return output;
            }

            SetPreferences();

            // Compress every run of whitespace to one space and drop the leading/trailing
            // one, so that padding around the query cannot produce empty search terms.
            searchterm = new Regex(@"\s+").Replace(searchterm, " ").Trim();
            if (searchterm == String.Empty)
            {
                // After trimming the search term, it was found to be empty!
                return output;
            }

            string[] searchTermArray   = searchterm.Split(' ');
            string[] searchTermDisplay = (string[])searchTermArray.Clone();   // what the user typed
            for (int i = 0; i < searchTermArray.Length; i++)
            {
                if (_GoChecker.IsGoWord(searchTermArray[i]))
                {   // was a Go word, just Lower it
                    searchTermArray[i] = searchTermArray[i].ToLower();
                }
                else
                {   // Not a Go word, strip surrounding punctuation and apply stemming
                    searchTermArray[i] = searchTermArray[i].Trim(' ', '?', '\"', ',', '\'', ';', ':', '.', '(', ')').ToLower();
                    searchTermArray[i] = _Stemmer.StemWord(searchTermArray[i]);
                }
            }

            DateTime start = DateTime.UtcNow; // to show 'time taken' to perform search

            // One result set (file -> word positions) per search term
            Dictionary<File, List<int>>[] searchResultsArrayArray = new Dictionary<File, List<int>>[searchTermArray.Length];

            bool botherToFindMatches = true;
            int  indexOfShortestResultSet = -1, lengthOfShortestResultSet = -1;

            for (int i = 0; i < searchTermArray.Length; i++)
            {   // ##### THE SEARCH #####
                searchResultsArrayArray[i] = catalog.Search(searchTermArray[i]);

                // The terms come straight from the user: encode them before they are
                // written into markup or into a query string.
                string displayHtml = System.Net.WebUtility.HtmlEncode(searchTermDisplay[i]);

                if (null == searchResultsArrayArray[i])
                {
                    _Matches           += displayHtml + " <font color=gray style='font-size:xx-small'>(not found)</font> ";
                    botherToFindMatches = false; // if *any one* of the terms isn't found, there won't be a 'set' of Matches
                }
                else
                {
                    int resultsInThisSet = searchResultsArrayArray[i].Count;
                    _Matches += "<a href=\"?" + Preferences.QuerystringParameterName + "=" + System.Uri.EscapeDataString(searchTermDisplay[i])
                                + "\" title=\"" + System.Net.WebUtility.HtmlEncode(searchTermArray[i]) + "\">"
                                + displayHtml
                                + "</a> <font color=gray style='font-size:xx-small'>(" + resultsInThisSet + ")</font> ";
                    if ((lengthOfShortestResultSet == -1) || (lengthOfShortestResultSet > resultsInThisSet))
                    {
                        indexOfShortestResultSet  = i;
                        lengthOfShortestResultSet = resultsInThisSet;
                    }
                }
            }

            // finalResultsArray is populated with the files that *match* ALL the search
            // criteria, each mapped to the combined list of word positions.
            HybridDictionary finalResultsArray = new HybridDictionary();

            if (botherToFindMatches)     // all words have *some* matches
            {
                // Only files of the *shortest* result set can possibly be in every set.
                foreach (File foundInFile in searchResultsArrayArray[indexOfShortestResultSet].Keys)
                {
                    List<int> occurences      = new List<int>();
                    bool      foundInEverySet = true;

                    foreach (Dictionary<File, List<int>> resultSet in searchResultsArrayArray)
                    {
                        // Keyed lookup instead of scanning every key of every set.
                        List<int> positions;
                        if (resultSet.TryGetValue(foundInFile, out positions))
                        {
                            occurences.AddRange(positions);
                        }
                        else
                        {
                            foundInEverySet = false;
                            break;
                        }
                    }

                    if (foundInEverySet && !finalResultsArray.Contains(foundInFile))
                    {
                        finalResultsArray.Add(foundInFile, occurences);
                    }
                }
            }

            // Time taken calculation
            TimeSpan taken = DateTime.UtcNow - start;
            if (taken.TotalSeconds >= 1)
            {
                _DisplayTime = (int)taken.TotalSeconds + " seconds";
            }
            else if (taken.TotalMilliseconds >= 1)
            {
                _DisplayTime = Convert.ToInt32(taken.TotalMilliseconds) + " milliseconds";
            }
            else
            {
                _DisplayTime = "less than 1 millisecond";
            }

            // Format the results
            if (finalResultsArray.Count > 0)
            {
                output = new SortedList(finalResultsArray.Count);

                foreach (object foundInFile in finalResultsArray.Keys)
                {
                    ResultFile infile     = new ResultFile((File)foundInFile);
                    List<int>  occurences = (List<int>)finalResultsArray[foundInFile];

                    // [v7] if we have a cache of the page's content, we'll display the relevant
                    // text excerpt in the search results
                    if (catalog.FileCache.Contains(infile.Url))
                    {
                        string[] words = catalog.FileCache.GetDocumentCache(infile.Url);
                        if (words != null)
                        {
                            infile.Description = BuildExcerpt(words, occurences);
                        }
                    }

                    if (geolocOnly && (infile.GpsLocation == null || infile.GpsLocation == new Location()))
                    {
                        continue;   // don't add this ResultFile to output [v6]
                    }

                    // Jim Harkins [sort for paging] ported from VB to C#
                    // http://www.codeproject.com/aspnet/spideroo.asp#xx927327xx
                    infile.Rank = occurences.Count;

                    // SortedList keys must be unique: start at rank * -1000 (so higher ranks
                    // sort first) and step up to 998 slots to find a free key.
                    int sortrank = infile.Rank * -1000; // Assume not 'thousands' of results
                    int attempts = 0;
                    while (output.Contains(sortrank) && (attempts < 998))
                    {
                        sortrank++;
                        attempts++;
                    }

                    // If every slot for this rank is taken the result is left out, as before.
                    if (!output.Contains(sortrank))
                    {
                        output.Add(sortrank, infile);
                        if (infile.GpsLocation != null)
                        {
                            _GeolocCount += 1;
                        }
                    }
                }
                // Jim Harkins [paged results]
                // http://aspnet.4guysfromrolla.com/articles/081804-1.aspx
            } // else Count == 0, so output SortedList will be empty

            return output;
        }

        /// <summary>
        /// Builds the excerpt shown for one result. Documents of fewer than 10 words are
        /// returned whole; otherwise a window of at most 48 words around the first
        /// occurrence is returned, with every occurrence inside the window wrapped in &lt;b&gt; tags.
        /// </summary>
        /// <param name="words">Cached words of the document.</param>
        /// <param name="positions">Word positions of the search terms; sorted in place.</param>
        /// <returns>Excerpt text; every word is followed by a single space.</returns>
        private static string BuildExcerpt(string[] words, List<int> positions)
        {
            System.Text.StringBuilder excerpt = new System.Text.StringBuilder();

            if (words.Length < 10)
            {
                foreach (string word in words)
                {
                    excerpt.Append(word).Append(' ');
                }
                return excerpt.ToString();
            }

            positions.Sort();
            int first = (positions.Count > 0) ? positions[0] : 0;

            // Up to 24 words before the first hit; the window is 48 words long and never
            // runs past the end of the document (the last word is included).
            int lowerBound = Math.Max(0, first - 24);
            int upperBound = Math.Min(words.Length, lowerBound + 48);

            int next = 0;   // index of the next position that may still need highlighting
            for (int i = lowerBound; i < upperBound; i++)
            {
                // Skip positions already passed, including duplicates of the previous hit.
                while ((next < positions.Count) && (positions[next] < i))
                {
                    next++;
                }
                bool isHit = (next < positions.Count) && (positions[next] == i);

                if (isHit)
                {
                    excerpt.Append("<b>");
                }
                excerpt.Append(words[i]).Append(' ');
                if (isHit)
                {
                    excerpt.Append("</b>");
                }
            }

            return excerpt.ToString();
        }
Exemplo n.º 4
0
        /// <summary>
        /// Catalogs the words of a downloaded document. A File is built from the
        /// document's Uri, Title, Description, the current time and Length; every word
        /// is then normalized and, when the resulting key is not empty, added to the
        /// catalog with a running position. Words whose first character lies in one of
        /// the Arabic code-point ranges go through ArabicStopWords and the ISRI stemmer;
        /// all other words use the stemmer and stopper set by preferences.
        /// </summary>
        /// <param name="downloadDocument">Document whose words are to be catalogued.</param>
        /// <returns>Number of words catalogued</returns>
        protected int AddToCatalog(Document downloadDocument)
        {
            File infile = new File(downloadDocument.Uri.AbsoluteUri
                                   , downloadDocument.Title
                                   , downloadDocument.Description
                                   , DateTime.Now
                                   , downloadDocument.Length);

            // ### Loop through words in the file ###
            // Count of catalogued words; doubles as the position handed to the catalog.
            // It must only ever change at the bottom of the loop.
            int catalogued = 0;

            foreach (string word in downloadDocument.WordsArray)
            {
                string key = word.ToLower();

                if (_GoChecker.IsGoWord(key))
                {
                    ProgressEvent(this, new ProgressEventArgs(4, "Found GoWord " + key + " in " + downloadDocument.Title));
                }
                else
                {   // not a special case, parse like any other word
                    RemovePunctuation(ref key);

                    if (!IsNumber(ref key))
                    {   // not a number, so get rid of numeric separators and catalog as a word
                        // TODO: remove inline punctuation, split hyphenated words?
                        // http://blogs.msdn.com/ericgu/archive/2006/01/16/513645.aspx
                        key = System.Text.RegularExpressions.Regex.Replace(key, "[,.]", "");

                        // The script is decided by the first character only; an empty
                        // word is never treated as Arabic.
                        bool isArabic = (word.Length > 0) && IsArabicScript(word[0]);

                        // The static flag is still published for any code that reads it.
                        if (isArabic)
                        {
                            Searcharoo.Engine.Search.flag = 1;
                        }
                        else
                        {
                            Searcharoo.Engine.Search.flag = 0;
                        }

                        if (isArabic)
                        {
                            // NOTE(review): the stems are APPENDED to the key (which already
                            // holds the word) and keep a trailing space, exactly as before.
                            // Presumably the search side builds its terms the same way -
                            // confirm before changing this to a plain assignment.
                            key += StemArabicWord(word);
                        }
                        else
                        {
                            // Apply Stemmer (set by preferences)
                            key = _Stemmer.StemWord(key);

                            // Apply Stopper (set by preferences)
                            key = _Stopper.StopWord(key);
                        }
                    }
                }

                if (key != String.Empty)
                {
                    _Catalog.Add(key, infile, catalogued);
                    catalogued++;
                }
            }

            return catalogued;
        }

        /// <summary>
        /// True when the character lies in one of the code-point ranges this indexer
        /// treats as Arabic: U+0600-06FF, U+0750-077F, U+FB50-FC3F or U+FE70-FEFC.
        /// </summary>
        private static bool IsArabicScript(char c)
        {
            return (c >= 0x600 && c <= 0x6ff)
                   || (c >= 0x750 && c <= 0x77f)
                   || (c >= 0xfb50 && c <= 0xfc3f)
                   || (c >= 0xfe70 && c <= 0xfefc);
        }

        /// <summary>
        /// Replaces every non-letter character of the text with a space, trims the
        /// result and splits it on single spaces (consecutive spaces yield empty tokens).
        /// </summary>
        private static string[] SplitOnNonLetters(string text)
        {
            char[] chars = text.ToCharArray();
            for (int index = 0; index < chars.Length; index++)
            {
                if (!char.IsLetter(text, index))
                {
                    chars[index] = ' ';
                }
            }
            return new string(chars).Trim().Split(' ');
        }

        /// <summary>
        /// Runs the letters-only tokens of a word through ArabicStopWords.removing, then
        /// runs the letters-only tokens of that output through ISRI.Stemming.
        /// </summary>
        /// <returns>The stems, each followed by a single space.</returns>
        private static string StemArabicWord(string word)
        {
            ArabicStopWords stopWords = new ArabicStopWords();
            System.Text.StringBuilder stopped = new System.Text.StringBuilder();
            foreach (string token in SplitOnNonLetters(word))
            {
                string kept = stopWords.removing(token);
                stopped.Append(kept).Append(' ');
            }

            ISRI stemmer = new ISRI();
            System.Text.StringBuilder stems = new System.Text.StringBuilder();
            foreach (string token in SplitOnNonLetters(stopped.ToString()))
            {
                string stem = stemmer.Stemming(token);
                stems.Append(stem).Append(' ');
            }

            return stems.ToString();
        }