Пример #1
0
 /// <summary>
 /// Creates a ResultFile by copying the descriptive properties of an existing File.
 /// </summary>
 /// <remarks>
 /// Url, Title, Description, CrawledDate, Size, GpsLocation, KeywordString and
 /// Extension are copied from <paramref name="sourceFile"/>. Rank is not copied;
 /// it is always initialised to -1.
 /// </remarks>
 /// <param name="sourceFile">File whose properties are copied; must not be null.</param>
 /// <exception cref="System.ArgumentNullException">
 /// <paramref name="sourceFile"/> is null.
 /// </exception>
 public ResultFile(File sourceFile)
 {
     // Fail with a descriptive exception instead of a NullReferenceException
     // on the first property read below.
     if (sourceFile == null)
     {
         throw new System.ArgumentNullException("sourceFile");
     }

     this.Url = sourceFile.Url;
     this.Title = sourceFile.Title;
     this.Description = sourceFile.Description;
     this.CrawledDate = sourceFile.CrawledDate;
     this.Size = sourceFile.Size;
     // NOTE(review): -1 presumably means "not ranked yet" - confirm against the code that sets Rank.
     this.Rank = -1;
     this.GpsLocation = sourceFile.GpsLocation;
     this.KeywordString = sourceFile.KeywordString;
     this.Extension = sourceFile.Extension;
 }
Пример #2
0
        /// <summary>
        /// Add the Document subclass to the catalog, by first 'copying' the main
        /// properties into a File class. The distinction is a bit arbitrary: Documents
        /// are downloaded and indexed, but their content is modelled as a File
        /// class in the Catalog (and represented as a ResultFile object in the search ASPX page).
        /// </summary>
        /// <remarks>
        /// Each entry of <c>WordsArray</c> is converted with UnicodeToCharacter() and lower-cased.
        /// Words that the go-word checker accepts are catalogued as they are (and reported
        /// through ProgressEvent); all other words have punctuation removed and, unless
        /// IsNumber() reports a number, have ',' and '.' stripped and are passed through the
        /// stemmer and then the stopper. A word whose key ends up empty is not catalogued.
        /// After the loop the Document's word array is also added to the catalog's file cache.
        /// </remarks>
        /// <param name="downloadDocument">Downloaded document whose words are to be catalogued.</param>
        /// <returns>
        /// Total number of words read from the Document's <c>WordsArray</c>. This includes
        /// words that were skipped because their key was empty, so it can be larger than the
        /// number of words actually added to the catalog.
        /// </returns>
        protected int AddToCatalog(Document downloadDocument)
        {
            File infile = new File(downloadDocument.Uri.AbsoluteUri
                , downloadDocument.Title.UnicodeToCharacter()
                , downloadDocument.Description.UnicodeToCharacter()
                , DateTime.Now
                , downloadDocument.Length
                , downloadDocument.GpsLocation
                , downloadDocument.Extension
                , downloadDocument.KeywordString.UnicodeToCharacter());

            // ### Loop through words in the file ###
            // i is the zero-based position of the current word (passed to the catalog)
            // and, once the loop ends, the total number of words read.
            int i = 0;

            foreach (string word in downloadDocument.WordsArray)
            {
                string key = word.UnicodeToCharacter().ToLower();
                if (!_GoChecker.IsGoWord(key))
                {	// not a special case, parse like any other word
                    RemovePunctuation(ref key);

                    if (!IsNumber(ref key))
                    {	// not a number, so get rid of numeric separators and catalog as a word
                        // TODO: remove inline punctuation, split hyphenated words?
                        // http://blogs.msdn.com/ericgu/archive/2006/01/16/513645.aspx
                        // Plain string replacement: the pattern was only the two literal
                        // characters ',' and '.', so no regular expression is needed.
                        key = key.Replace(",", "").Replace(".", "");

                        // Apply Stemmer (set by preferences)
                        key = _Stemmer.StemWord(key);

                        // Apply Stopper (set by preferences)
                        key = _Stopper.StopWord(key);
                    }
                }
                else
                {
                    ProgressEvent(this, new ProgressEventArgs(4, "Found GoWord " + key + " in " + downloadDocument.Title));
                }
                if (key != String.Empty)
                {
                    _Catalog.Add(key, infile, i);
                }
                i++;
            }
            _Catalog.FileCache.Add(downloadDocument.WordsArray, infile);
            return i;
        }
Пример #3
0
        /// <summary>
        /// Caches the word list of a file under the file's Url, unless an entry for
        /// that Url is already present.
        /// </summary>
        /// <remarks>
        /// When a new entry is created, every element of <paramref name="words"/> is
        /// replaced in place by its UnicodeToCharacter() form, and that same array
        /// instance is stored in the cache entry - the caller's array is modified.
        /// When the Url is already cached, nothing is changed.
        /// </remarks>
        /// <param name="words">Words of the file; converted in place and stored.</param>
        /// <param name="infile">File whose Url identifies the cache entry.</param>
        /// <returns>true if a new entry was added; false if the Url was already cached.</returns>
        public bool Add(string[] words, File infile)
        {
            // The duplicate check must test the very key the entry is inserted under.
            // Previously the check used the UnicodeToCharacter() form of the Url while
            // the insert used the raw Url, so the guard could miss an existing entry
            // whenever the two forms differ.
            // NOTE(review): the raw Url is kept as the key so existing lookups keep
            // working - confirm that readers of _Index also use the raw Url.
            string cacheKey = infile.Url;

            // ### Make sure the Word object is in the index ONCE only
            if (_Index.ContainsKey(cacheKey))
            {
                // already cached
                return false;
            }

            for (int i = 0; i < words.Length; i++)
            {
                words[i] = words[i].UnicodeToCharacter();
            }

            CachedFile cf = new CachedFile();
            cf.Url = infile.Url.UnicodeToCharacter();
            cf.Words = words;

            _Index.Add(cacheKey, cf);
            return true;
        }