Exemplo n.º 1
0
        /// <summary>
        /// Builds the per-document word index for every <see cref="Document"/> in
        /// <paramref name="documentSet"/> that is stored in <paramref name="db"/>.
        /// For each document the text is lowercased, split into words with the pattern
        /// <c>[a-z][a-z.$]+</c>, and every word not yet in the document's word set is
        /// persisted, added to that set and given a <see cref="WordHit"/> entry.
        /// When the loop ends, pending updates for <paramref name="db"/> are flushed and
        /// its cached objects are cleared.
        /// </summary>
        /// <param name="session">Session used to open documents, persist words and flush updates.</param>
        /// <param name="db">Database whose documents are indexed; iteration stops at the first document from another database.</param>
        /// <param name="documentSet">Set of documents that is iterated, starting at a probe document built from <c>db.Id</c>.</param>
        static void createDocumentInvertedIndex(SessionBase session, Database db, BTreeSet <Document> documentSet)
        {
            UInt32    dbNum            = db.DatabaseNumber;
            Document  inputDoc         = new Document(db.Id); // probe key used only to position the iterator
            Placement wordPlacement    = new Placement(inputDoc.DatabaseNumber, 20000, 1, 25000, 65000, true, false, 1, false);
            Placement wordHitPlacement = new Placement(inputDoc.DatabaseNumber, 40000, 1, 25000, 65500, true, false, 1, false);
            //session.SetTraceDbActivity(db.DatabaseNumber);
            BTreeSetIterator <Document> iterator = documentSet.Iterator();

            iterator.GoTo(inputDoc);
            inputDoc = iterator.Current();
            while (inputDoc != null && inputDoc.Page.Database.DatabaseNumber == dbNum)
            {
                // If the matching database is available, use it to speed up the lookup.
                Document doc = (Document)session.Open(inputDoc.Page.Database, inputDoc.Id);

                // Culture-invariant lowercasing: the pattern below only accepts ASCII a-z, and a
                // culture-sensitive ToLower() (e.g. tr-TR maps 'I' to dotless 'ı') would make
                // the regex split or drop words depending on the machine's current culture.
                string          text       = doc.Content.Text.ToLowerInvariant();
                MatchCollection tagMatches = Regex.Matches(text, "[a-z][a-z.$]+");

                // NOTE(review): wordCt only advances when a new word is added, so it counts the
                // distinct words seen so far rather than the match position - confirm intended.
                UInt64 wordCt = 0;

                if (++s_docCountIndexed % 50000 == 0)
                {
                    Console.WriteLine(DateTime.Now.ToString() + ", done indexing article: " + s_docCountIndexed);
                }

                BTreeSetOidShort <Word> wordSet = doc.WordSet;
                foreach (Match m in tagMatches)
                {
                    Word word = new Word(m.Value);

                    // Words already in the set get no additional hit recorded: looking up
                    // doc.WordHit[word] and appending to it was judged too costly for now.
                    if (!wordSet.TryGetKey(word, ref word))
                    {
                        // Build a fresh Word, since TryGetKey received the probe by ref.
                        word = new Word(m.Value);
                        word.Persist(wordPlacement, session);
                        wordSet.Add(word);

                        WordHit wordHit = new WordHit(doc, wordCt++, session);
                        //wordHit.Persist(wordHitPlacement, session);
                        doc.WordHit.ValuePlacement = wordHitPlacement;
                        doc.WordHit.AddFast(word, wordHit);
                    }
                }
                inputDoc = iterator.Next();
            }
            session.FlushUpdates(db);
            session.ClearCachedObjects(db); // free up memory for objects we no longer need to have cached
            Console.WriteLine(DateTime.Now.ToString() + ", done indexing article: " + s_docCountIndexed + " Database: " + dbNum + " is completed.");
        }
Exemplo n.º 2
0
 /// <summary>
 /// Builds the per-document word index for every <see cref="Document"/> in
 /// <paramref name="documentSet"/> that is stored in <paramref name="db"/>.
 /// For each document the text is lowercased, split into words with the pattern
 /// <c>[a-z][a-z.$]+</c>, and every word not yet in the document's word set is
 /// persisted, added to that set and given a <see cref="WordHit"/> entry.
 /// When the loop ends, pending updates for <paramref name="db"/> are flushed and
 /// its cached objects are cleared.
 /// </summary>
 /// <param name="session">Session used to open documents, persist words and flush updates.</param>
 /// <param name="db">Database whose documents are indexed; iteration stops at the first document from another database.</param>
 /// <param name="documentSet">Set of documents that is iterated, starting at a probe document built from <c>db.Id</c>.</param>
 static void createDocumentInvertedIndex(SessionBase session, Database db, BTreeSet<Document> documentSet)
 {
   UInt32 dbNum = db.DatabaseNumber;
   Document inputDoc = new Document(db.Id); // probe key used only to position the iterator
   Placement wordPlacement = new Placement(inputDoc.DatabaseNumber, 20000, 1, 25000, 65000, true, false, 1, false);
   Placement wordHitPlacement = new Placement(inputDoc.DatabaseNumber, 40000, 1, 25000, 65500, true, false, 1, false);
   //session.SetTraceDbActivity(db.DatabaseNumber);
   BTreeSetIterator<Document> iterator = documentSet.Iterator();
   iterator.GoTo(inputDoc);
   inputDoc = iterator.Current();
   while (inputDoc != null && inputDoc.Page.Database.DatabaseNumber == dbNum)
   {
     // If the matching database is available, use it to speed up the lookup.
     Document doc = (Document)session.Open(inputDoc.Page.Database, inputDoc.Id);

     // Culture-invariant lowercasing: the pattern below only accepts ASCII a-z, and a
     // culture-sensitive ToLower() (e.g. tr-TR maps 'I' to dotless 'ı') would make
     // the regex split or drop words depending on the machine's current culture.
     string text = doc.Content.Text.ToLowerInvariant();
     MatchCollection tagMatches = Regex.Matches(text, "[a-z][a-z.$]+");

     // NOTE(review): wordCt only advances when a new word is added, so it counts the
     // distinct words seen so far rather than the match position - confirm intended.
     UInt64 wordCt = 0;

     if (++s_docCountIndexed % 50000 == 0)
     {
       Console.WriteLine(DateTime.Now.ToString() + ", done indexing article: " + s_docCountIndexed);
     }

     BTreeSetOidShort<Word> wordSet = doc.WordSet;
     foreach (Match m in tagMatches)
     {
       Word word = new Word(m.Value);

       // Words already in the set get no additional hit recorded: looking up
       // doc.WordHit[word] and appending to it was judged too costly for now.
       if (!wordSet.TryGetKey(word, ref word))
       {
         // Build a fresh Word, since TryGetKey received the probe by ref.
         word = new Word(m.Value);
         word.Persist(wordPlacement, session);
         wordSet.Add(word);

         WordHit wordHit = new WordHit(doc, wordCt++, session);
         //wordHit.Persist(wordHitPlacement, session);
         doc.WordHit.ValuePlacement = wordHitPlacement;
         doc.WordHit.AddFast(word, wordHit);
       }
     }
     inputDoc = iterator.Next();
   }
   session.FlushUpdates(db);
   session.ClearCachedObjects(db); // free up memory for objects we no longer need to have cached
   Console.WriteLine(DateTime.Now.ToString() + ", done indexing article: " + s_docCountIndexed + " Database: " + dbNum + " is completed.");
 }