Inheritance: OptimizedPersistable
 /// <summary>
 /// Records one occurrence of <paramref name="word"/> in the word index of <paramref name="doc"/>.
 /// </summary>
 /// <remarks>
 /// If <c>doc.WordSet</c> already contains the word, <paramref name="wordCt"/> is added to the
 /// existing <c>WordHit</c> entry. Otherwise the word is persisted, added to the set, and a new
 /// <c>WordHit</c> created from <paramref name="wordCt"/> is stored for it.
 /// </remarks>
 /// <param name="doc">Document whose <c>WordSet</c> and <c>WordHit</c> collections are updated.</param>
 /// <param name="word">Word to look up; on a match it is replaced by the key already stored in the set.</param>
 /// <param name="wordCt">Counter value recorded for this occurrence. Passed by value, so the caller's variable is never modified.</param>
 /// <param name="wordPlacement">Placement used when persisting a word that is new to the document.</param>
 /// <param name="wordHitPlacement">Placement assigned to <c>doc.WordHit.ValuePlacement</c> before a new hit is added.</param>
 public void createLocalInvertedIndex(Document doc, Word word, UInt64 wordCt, Placement wordPlacement, Placement wordHitPlacement)
 {
   BTreeSetOidShort<Word> wordSet = doc.WordSet;
   if (wordSet.TryGetKey(word, ref word))
   {
     // Known word: append this occurrence to the hit entry that already exists.
     WordHit existingHit = doc.WordHit[word];
     existingHit.Add(wordCt);
     return;
   }

   // New word for this document: persist it and register it before creating its hit entry.
   // NOTE(review): assumes TryGetKey leaves `word` usable after a miss - confirm against the collection API.
   word.Persist(wordPlacement, session);
   wordSet.Add(word);
   // The original passed `wordCt++`; incrementing a by-value parameter that is never read again
   // had no effect, so the plain value is passed instead.
   WordHit newHit = new WordHit(doc, wordCt, session);
   doc.WordHit.ValuePlacement = wordHitPlacement;
   doc.WordHit.AddFast(word, newHit);
 }
Example #2
0
 /// <summary>
 /// Builds the per-document word index for the documents of <paramref name="documentSet"/> that
 /// are stored in <paramref name="db"/>.
 /// </summary>
 /// <remarks>
 /// Iteration starts where the iterator lands after <c>GoTo</c> with a probe <c>Document</c> built
 /// from <c>db.Id</c>, and stops at the first document whose page belongs to a different database
 /// number or when the iterator returns null. Afterwards the updates for <paramref name="db"/> are
 /// flushed and its cached objects are cleared.
 /// </remarks>
 /// <param name="session">Session used to open documents, persist words and flush updates.</param>
 /// <param name="db">Database whose documents are indexed.</param>
 /// <param name="documentSet">Set of documents to iterate over.</param>
 static void createDocumentInvertedIndex(SessionBase session, Database db, BTreeSet<Document> documentSet)
 {
   UInt32 dbNum = db.DatabaseNumber;
   Document inputDoc = new Document(db.Id);
   Placement wordPlacement = new Placement(inputDoc.DatabaseNumber, 20000, 1, 25000, 65000, true, false, 1, false);
   Placement wordHitPlacement = new Placement(inputDoc.DatabaseNumber, 40000, 1, 25000, 65500, true, false, 1, false);
   //session.SetTraceDbActivity(db.DatabaseNumber);
   BTreeSetIterator<Document> iterator = documentSet.Iterator();
   iterator.GoTo(inputDoc);
   inputDoc = iterator.Current();
   while (inputDoc != null && inputDoc.Page.Database.DatabaseNumber == dbNum)
   {
     Document doc = (Document)session.Open(inputDoc.Page.Database, inputDoc.Id); // if matching database is available, use it to speed up lookup
     DocumentText docText = doc.Content;
     // Lower-case with the invariant culture: the pattern below only matches ASCII a-z, and a
     // culture-sensitive ToLower() maps 'I' to dotless 'ı' under Turkish/Azeri cultures, which
     // would make tokenization depend on the machine's regional settings.
     string text = docText.Text.ToLowerInvariant();
     MatchCollection tagMatches = Regex.Matches(text, "[a-z][a-z.$]+");
     // NOTE(review): this counter only advances when a new word is added (see below), so it
     // numbers distinct words rather than every match - confirm that is what WordHit expects.
     UInt64 wordCt = 0;
     if (++s_docCountIndexed % 50000 == 0)
     {
       Console.WriteLine(DateTime.Now.ToString() + ", done indexing article: " + s_docCountIndexed);
     }
     BTreeSetOidShort<Word> wordSet = doc.WordSet;
     foreach (Match m in tagMatches)
     {
       Word word = new Word(m.Value);
       if (wordSet.TryGetKey(word, ref word))
       {
         // Words already indexed for this document are deliberately skipped: adding every
         // further hit via doc.WordHit[word] was considered too costly here.
         continue;
       }

       // Re-create the word after the failed lookup, as the original did.
       // NOTE(review): presumably because TryGetKey may overwrite its ref argument on a miss - confirm.
       word = new Word(m.Value);
       word.Persist(wordPlacement, session);
       wordSet.Add(word);
       WordHit wordHit = new WordHit(doc, wordCt++, session);
       doc.WordHit.ValuePlacement = wordHitPlacement;
       doc.WordHit.AddFast(word, wordHit);
     }
     inputDoc = iterator.Next();
   }
   session.FlushUpdates(db);
   session.ClearCachedObjects(db); // free up memory for objects we no longer need to have cached
   Console.WriteLine(DateTime.Now.ToString() + ", done indexing article: " + s_docCountIndexed + " Database: " + dbNum + " is completed.");
 }