Ejemplo n.º 1
0
 /// <summary>
 /// Re-indexes the words of a crawled document. The words are extracted from the
 /// document's contents, each one is registered with the word cache, the url-word
 /// entries previously stored for the url are removed and one new url-word entry is
 /// added per extracted word. Finally the url's last-process information is updated.
 /// According to the original notes this is called by DBUpdater when a document is
 /// stored in the database.
 /// </summary>
 /// <remarks>
 /// If no words are extracted the method returns without touching the stored data.
 /// Any exception thrown along the way is caught and queued on the event log as a
 /// warning; it is not propagated to the caller.
 /// </remarks>
 /// <param name="data">The <see cref="UrlCrawlData"/> to be processed.</param>
 public void ExtractWords(ref UrlCrawlData data)
 {
     try
     {
         SortedList extracted = wordExtractor.ExtractWords(data.Data);
         if (extracted.Count == 0)
         {
             //nothing to index: the existing url-words are left as they are
             return;
         }

         //Pass 1: make sure every extracted word is known to the cache, so that an
         //id can be looked up for it in the second pass.
         foreach (string term in extracted.Keys)
         {
             cache.AddStemmedWord(term);
         }

         //Drop whatever was stored for this url before writing the fresh entries.
         RemoveUrlWords(data.ID);

         //Pass 2: link the url with each of its words and the word's count.
         foreach (DictionaryEntry entry in extracted)
         {
             string term = (string)entry.Key;
             //unboxing cast: the stored value has to be a boxed short
             short occurrences = (short)entry.Value;
             int termId = cache[term];
             AddUrlWord(data.ID, termId, occurrences);
         }

         UpdateUrlDataLastProcess(data.ID);
     }
     catch (Exception e)
     {
         //Failures are reported as a warning instead of being rethrown.
         DateTime timestamp = DateTime.Now;
         string message = "WordExtractionPlugin failed to extract words from Url with ID " + data.ID.ToString() + ": " + e.ToString();
         events.Enqueue(new EventLoggerEntry(CWLoggerEntryType.Warning, timestamp, message));
     }
 }