/// <summary>
/// Pulls the words out of a crawled document's contents and records them against the
/// document's url. According to the original notes, DBUpdater calls this while storing a
/// document so that the words it contains are added at the same time.
/// </summary>
/// <remarks>
/// The steps run in this order: extract the words, register each one with the word cache,
/// remove the url's previous word relationships, add one relationship per extracted word
/// and finally update the url's last-process information. When the extractor yields no
/// words the method returns straight away without making any of the later calls. Any
/// exception raised along the way is not rethrown; it is queued as a warning entry on the
/// event log queue instead.
/// </remarks>
/// <param name="data">The <see cref="UrlCrawlData"/> to be processed.</param>
public void ExtractWords(ref UrlCrawlData data)
{
    try
    {
        SortedList extracted = wordExtractor.ExtractWords(data.Data);
        if (extracted.Count == 0)
        {
            // Nothing was extracted, so there is nothing to register or relate.
            return;
        }

        // Register every extracted word with the cache first, so that an id can be
        // looked up for each of them further down.
        foreach (string key in extracted.Keys)
        {
            cache.AddStemmedWord(key);
        }

        // Drop the relationships previously stored for this url before adding new ones.
        RemoveUrlWords(data.ID);

        // Relate the url to each of its words; the entry's value is passed on as the count.
        foreach (DictionaryEntry entry in extracted)
        {
            string term = (string)entry.Key;
            short occurrences = (short)entry.Value;
            int termId = cache[term];
            AddUrlWord(data.ID, termId, occurrences);
        }

        UpdateUrlDataLastProcess(data.ID);
    }
    catch (Exception e)
    {
        // Best effort: report the failure as a warning rather than letting it propagate.
        events.Enqueue(
            new EventLoggerEntry(
                CWLoggerEntryType.Warning,
                DateTime.Now,
                "WordExtractionPlugin failed to extract words from Url with ID " + data.ID.ToString() + ": " + e.ToString()));
    }
}