Inheritance: OptimizedPersistable
Esempio n. 1
0
 /// <summary>
 /// Persists a document together with its text, registers the document in the
 /// index root's document set, and adds every accepted word of the text to the
 /// document's local inverted index.
 /// </summary>
 /// <param name="doc">Document being stored and indexed.</param>
 /// <param name="indexRoot">Root object whose <c>repository.documentSet</c> receives the document.</param>
 /// <param name="docTextString">Raw text of the document; it is lower-cased before tokenizing.</param>
 /// <param name="documentPlacement">Placement used when persisting <paramref name="doc"/>.</param>
 /// <param name="documentTextPlacement">Placement used when persisting the document text.</param>
 /// <param name="wordPlacement">Placement forwarded to <c>createLocalInvertedIndex</c> for new words.</param>
 /// <param name="wordHitPlacement">Placement forwarded to <c>createLocalInvertedIndex</c> for word hits.</param>
 public void textToWords(Document doc, IndexRoot indexRoot, string docTextString, Placement documentPlacement,
   Placement documentTextPlacement, Placement wordPlacement, Placement wordHitPlacement)
 {
   DocumentText docText = new DocumentText(docTextString, doc);
   doc.Persist(documentPlacement, session);
   doc.Page.Database.Name = doc.Name;
   docText.Persist(documentTextPlacement, session);
   indexRoot.repository.documentSet.Add(doc);
   doc.Content = docText;

   docTextString = docTextString.ToLower();
   string[] excludedWords = new string[] { "and", "the" };
   char[] splitChars = new char[] { ' ', '\n', '(', '"', '!', ',', ')', '\t' };
   char[] trimEndChars = new char[] { ';', '.', '"', ',', '\r', ':', ']', '!', '?', '+', '(', ')', '\'', '{', '}', '-', '`', '/', '=' };
   char[] trimStartChars = new char[] { ';', '&', '-', '#', '*', '[', '.', '"', ',', '\r', ')', '(', '\'', '{', '}', '`' };
   string[] words = docTextString.Split(splitChars, StringSplitOptions.RemoveEmptyEntries);

   // Counts only the words that are actually indexed; it is the position value
   // handed to createLocalInvertedIndex for each accepted word.
   UInt64 wordCt = 0;
   foreach (string wordStr in words)
   {
     string aWord = wordStr.TrimEnd(trimEndChars).TrimStart(trimStartChars);
     // Skip single characters, empty leftovers and the excluded stop words.
     if (aWord.Length > 1 && !excludedWords.Contains(aWord))
     {
       // The Word is built only for accepted tokens.
       // NOTE(review): assumes the Word constructor has no side effects - confirm.
       Word word = new Word(aWord);
       createLocalInvertedIndex(doc, word, wordCt, wordPlacement, wordHitPlacement);
       ++wordCt;
     }
   }
 }
Esempio n. 2
0
 // Walks the documents of documentSet, starting at the position the iterator reaches
 // for a probe document built from db.Id, and continues while the current document's
 // database number equals db.DatabaseNumber. For each document, every regex token of
 // the lower-cased text that is not yet in the document's word set is persisted, added
 // to the word set and given a WordHit entry. Finally the session's updates for db are
 // flushed and its cached objects cleared.
 static void createDocumentInvertedIndex(SessionBase session, Database db, BTreeSet<Document> documentSet)
 {
   UInt32 targetDbNumber = db.DatabaseNumber;
   Document cursor = new Document(db.Id);
   Placement wordPlacement = new Placement(cursor.DatabaseNumber, 20000, 1, 25000, 65000, true, false, 1, false);
   Placement wordHitPlacement = new Placement(cursor.DatabaseNumber, 40000, 1, 25000, 65500, true, false, 1, false);
   //session.SetTraceDbActivity(db.DatabaseNumber);
   BTreeSetIterator<Document> iterator = documentSet.Iterator();
   iterator.GoTo(cursor);
   for (cursor = iterator.Current();
        cursor != null && cursor.Page.Database.DatabaseNumber == targetDbNumber;
        cursor = iterator.Next())
   {
     // Open through the document's own database; the original author notes this speeds up the lookup.
     Document doc = (Document)session.Open(cursor.Page.Database, cursor.Id);
     DocumentText content = doc.Content;
     string loweredText = content.Text.ToLower();
     MatchCollection tokens = Regex.Matches(loweredText, "[a-z][a-z.$]+");
     UInt64 wordCt = 0;

     // Progress report every 50000 documents.
     ++s_docCountIndexed;
     if (s_docCountIndexed % 50000 == 0)
     {
       Console.WriteLine(DateTime.Now.ToString() + ", done indexing article: " + s_docCountIndexed);
     }

     BTreeSetOidShort<Word> knownWords = doc.WordSet;
     foreach (Match token in tokens)
     {
       Word probe = new Word(token.Value);
       if (knownWords.TryGetKey(probe, ref probe))
       {
         // Word already indexed for this document. Recording the extra hit position
         // was judged too costly by the original author and is intentionally skipped.
         continue;
       }

       // First occurrence in this document: store the word and its first hit.
       Word freshWord = new Word(token.Value);
       freshWord.Persist(wordPlacement, session);
       knownWords.Add(freshWord);
       WordHit firstHit = new WordHit(doc, wordCt++, session);
       doc.WordHit.ValuePlacement = wordHitPlacement;
       doc.WordHit.AddFast(freshWord, firstHit);
     }
   }
   session.FlushUpdates(db);
   session.ClearCachedObjects(db); // release cached objects that are no longer needed
   Console.WriteLine(DateTime.Now.ToString() + ", done indexing article: " + s_docCountIndexed + " Database: " + dbNum(targetDbNumber) + " is completed.");
 }

 // Identity helper so the completion message reads the database number in one place.
 static UInt32 dbNum(UInt32 value)
 {
   return value;
 }
Esempio n. 3
0
 /// <summary>
 /// Records one occurrence of <paramref name="word"/> in the inverted index kept on
 /// <paramref name="doc"/>. If the word is already in the document's word set, the
 /// position is appended to its existing hit entry; otherwise the word is persisted,
 /// added to the word set and a new hit entry is created for it.
 /// </summary>
 /// <param name="doc">Document whose <c>WordSet</c> and <c>WordHit</c> collections are updated.</param>
 /// <param name="word">Word to record; on a successful lookup it is replaced by the stored key.</param>
 /// <param name="wordCt">Position value stored for this occurrence. It is passed by value, so the
 /// caller is responsible for advancing its own counter.</param>
 /// <param name="wordPlacement">Placement used when a new word is persisted.</param>
 /// <param name="wordHitPlacement">Placement assigned to <c>doc.WordHit.ValuePlacement</c> for new hit entries.</param>
 public void createLocalInvertedIndex(Document doc, Word word, UInt64 wordCt, Placement wordPlacement, Placement wordHitPlacement)
 {
   BTreeSetOidShort<Word> wordSet = doc.WordSet;
   if (wordSet.TryGetKey(word, ref word))
   {
     // Known word: append this position to the hits already stored for it.
     WordHit existingHit = doc.WordHit[word];
     existingHit.Add(wordCt);
     return;
   }

   // New word for this document: persist it, then register it with its first hit.
   word.Persist(wordPlacement, session);
   wordSet.Add(word);
   // The earlier "wordCt++" here had no effect: the post-increment passed the
   // original value and the incremented by-value copy was never read.
   WordHit wordHit = new WordHit(doc, wordCt, session);
   doc.WordHit.ValuePlacement = wordHitPlacement;
   doc.WordHit.AddFast(word, wordHit);
 }
Esempio n. 4
0
        /// <summary>
        /// Orders this word relative to another <c>Word</c> by comparing their
        /// <c>aWord</c> strings with <c>string.CompareTo</c>.
        /// </summary>
        /// <param name="obj">The object to compare with; expected to be a <c>Word</c>.</param>
        /// <returns>
        /// The result of comparing the two <c>aWord</c> strings, or a positive value when
        /// <paramref name="obj"/> is null (any instance sorts after null).
        /// </returns>
        /// <exception cref="System.ArgumentException"><paramref name="obj"/> is not a <c>Word</c>.</exception>
        public override int CompareTo(object obj)
        {
            if (obj == null)
            {
                // IComparable convention: every instance is greater than null.
                return 1;
            }

            Word otherToken = obj as Word;
            if (object.ReferenceEquals(otherToken, null))
            {
                // Previously this path failed with a NullReferenceException on the dereference below.
                throw new System.ArgumentException("Object must be of type Word.", "obj");
            }

            return aWord.CompareTo(otherToken.aWord);
        }