Пример #1
0
        /// <summary>
        /// Folds the word sets of all documents not yet flagged as indexed into the
        /// lexicon's global word set, then flags each processed document as indexed.
        /// </summary>
        /// <param name="indexRoot">Root object giving access to the lexicon word set and the repository's document set.</param>
        public void createGlobalInvertedIndex(IndexRoot indexRoot)
        {
            Placement globalWordPlacement       = new Placement(Lexicon.PlaceInDatabase, 2);
            BTreeSetOidShort <Word> globalWords = indexRoot.lexicon.WordSet;
            BTreeSet <Document>     documents   = indexRoot.repository.documentSet;
            Word globalWord = null; // receives the lexicon entry found by TryGetKey, or the newly created one

            foreach (Document doc in documents)
            {
                // Only documents whose Indexed flag equals false are processed.
                if (doc.Indexed != false)
                {
                    continue;
                }

                foreach (Word localWord in doc.WordSet)
                {
                    uint hitCount = (uint)doc.WordHit[localWord].Count;

                    if (globalWords.TryGetKey(localWord, ref globalWord))
                    {
                        // Word already in the lexicon: add this document's hit count to its total.
                        globalWord.GlobalCount += hitCount;
                    }
                    else
                    {
                        // First sighting of this word: create, persist and register a global entry.
                        globalWord = new WordGlobal(localWord.aWord, session, hitCount);
                        globalWord.Persist(globalWordPlacement, session);
                        globalWords.Add(globalWord);
                    }

                    // In both cases, link the document to the global word.
                    globalWord.DocumentHit.AddFast(doc);
                }

                doc.Indexed = true;
            }
        }
Пример #2
0
 /// <summary>
 /// Opens a session on <c>s_systemDir</c> and, inside a single update transaction, folds the
 /// word set of every document into the lexicon's global word set, flagging each document as indexed.
 /// Progress is written to the console at start and end.
 /// </summary>
 /// <remarks>
 /// NOTE(review): documents are processed regardless of their current Indexed flag, so running this
 /// twice over the same data would add the hit counts again - confirm that a single run is intended.
 /// </remarks>
 static void createTopLevelInvertedIndex()
 {
     Console.WriteLine(DateTime.Now.ToString() + ", start creating top level inverted index");
     using (SessionNoServer session = new SessionNoServer(s_systemDir))
     {
         Placement globalWordPlacement = new Placement(Lexicon.PlaceInDatabase, 2, 1, 1000, 50000, true, false, UInt32.MaxValue, false);
         session.BeginUpdate();

         IndexRoot indexRoot                 = (IndexRoot)session.Open(Oid.Encode(IndexRoot.PlaceInDatabase, 1, 1));
         BTreeSetOidShort <Word> globalWords = indexRoot.lexicon.WordSet;
         BTreeSet <Document>     documents   = indexRoot.repository.documentSet;
         Word globalWord = null; // receives the lexicon entry found by TryGetKey, or the newly created one

         foreach (Document doc in documents)
         {
             foreach (Word localWord in doc.WordSet)
             {
                 uint hitCount = (uint)doc.WordHit[localWord].Count;

                 if (globalWords.TryGetKey(localWord, ref globalWord))
                 {
                     // Word already in the lexicon: add this document's hit count to its total.
                     globalWord.GlobalCount += hitCount;
                 }
                 else
                 {
                     // First sighting of this word: create, persist and register a global entry.
                     globalWord = new WordGlobal(localWord.aWord, session, hitCount);
                     globalWord.Persist(globalWordPlacement, session);
                     indexRoot.lexicon.WordSet.Add(globalWord);
                 }

                 // In both cases, link the document to the global word.
                 globalWord.DocumentHit.AddFast(doc);
             }

             doc.Indexed = true;
         }

         session.Commit();
         Console.WriteLine(DateTime.Now.ToString() + ", done creating top level inverted index");
     }
 }
Пример #3
0
        /// <summary>
        /// Builds the per-document word index for every document of <paramref name="documentSet"/>
        /// that resides in database <paramref name="db"/>: the document text is lower-cased, split into
        /// tokens, and each token not yet in the document's word set is persisted and added together
        /// with a new <c>WordHit</c>. Afterwards the database's updates are flushed and its cached
        /// objects are released.
        /// </summary>
        /// <param name="session">Session used to open documents and persist new words.</param>
        /// <param name="db">Database whose documents are indexed; iteration stops at the first document from another database.</param>
        /// <param name="documentSet">Set of documents to iterate, positioned via a probe document built from <c>db.Id</c>.</param>
        static void createDocumentInvertedIndex(SessionBase session, Database db, BTreeSet <Document> documentSet)
        {
            UInt32    dbNum            = db.DatabaseNumber;
            Document  inputDoc         = new Document(db.Id);
            Placement wordPlacement    = new Placement(inputDoc.DatabaseNumber, 20000, 1, 25000, 65000, true, false, 1, false);
            Placement wordHitPlacement = new Placement(inputDoc.DatabaseNumber, 40000, 1, 25000, 65500, true, false, 1, false);
            BTreeSetIterator <Document> iterator = documentSet.Iterator();

            // Position the iterator using the probe document, then walk forward while still inside this database.
            iterator.GoTo(inputDoc);
            inputDoc = iterator.Current();
            while (inputDoc != null && inputDoc.Page.Database.DatabaseNumber == dbNum)
            {
                // If the matching database is available, use it to speed up the lookup.
                Document doc = (Document)session.Open(inputDoc.Page.Database, inputDoc.Id);

                // Lower-case with the invariant culture: the token pattern below is ASCII-only, and
                // culture-sensitive lower-casing (e.g. Turkish 'I' -> dotless 'ı') would make the
                // resulting tokens depend on the machine's regional settings.
                string          text       = doc.Content.Text.ToLowerInvariant();
                MatchCollection tagMatches = Regex.Matches(text, "[a-z][a-z.$]+");

                // Only advances when a new word is added below, so it numbers the distinct words of
                // this document rather than token positions. NOTE(review): confirm this is intended.
                UInt64 wordCt = 0;

                if (++s_docCountIndexed % 50000 == 0)
                {
                    Console.WriteLine(DateTime.Now.ToString() + ", done indexing article: " + s_docCountIndexed);
                }

                BTreeSetOidShort <Word> wordSet = doc.WordSet;
                foreach (Match m in tagMatches)
                {
                    Word word = new Word(m.Value);
                    if (wordSet.TryGetKey(word, ref word))
                    {
                        // Repeated occurrences are deliberately not recorded: looking up and updating
                        // the existing WordHit for every token was found too costly here.
                        continue;
                    }

                    // Use a fresh instance for the new entry rather than the variable that was passed by ref above.
                    word = new Word(m.Value);
                    word.Persist(wordPlacement, session);
                    wordSet.Add(word);
                    WordHit wordHit = new WordHit(doc, wordCt++, session);
                    doc.WordHit.ValuePlacement = wordHitPlacement;
                    doc.WordHit.AddFast(word, wordHit);
                }

                inputDoc = iterator.Next();
            }

            session.FlushUpdates(db);
            session.ClearCachedObjects(db); // free up memory for objects we no longer need to have cached
            Console.WriteLine(DateTime.Now.ToString() + ", done indexing article: " + s_docCountIndexed + " Database: " + dbNum + " is completed.");
        }
Пример #4
0
        /// <summary>
        /// Records one occurrence of <paramref name="word"/> in the per-document index of
        /// <paramref name="doc"/>. If the word is already in the document's word set, the value
        /// <paramref name="wordCt"/> is added to its existing <c>WordHit</c>; otherwise the word is
        /// persisted, added to the word set, and a new <c>WordHit</c> created from
        /// <paramref name="wordCt"/> is stored for it.
        /// </summary>
        /// <param name="doc">Document whose word set and word-hit map are updated.</param>
        /// <param name="word">Word to record; used as the lookup key and, when new, persisted as the stored entry.</param>
        /// <param name="wordCt">Counter value recorded for this occurrence. It is passed by value, so the caller is responsible for advancing its own counter.</param>
        /// <param name="wordPlacement">Placement used when persisting a new word.</param>
        /// <param name="wordHitPlacement">Placement assigned to the document's word-hit map before a new hit is added.</param>
        public void createLocalInvertedIndex(Document doc, Word word, UInt64 wordCt, Placement wordPlacement, Placement wordHitPlacement)
        {
            BTreeSetOidShort <Word> wordSet = doc.WordSet;

            if (wordSet.TryGetKey(word, ref word))
            {
                // Known word: append this occurrence to its existing hit record.
                // (Presumably TryGetKey has replaced `word` with the instance stored in the set - verify against the BTreeSet API.)
                doc.WordHit[word].Add(wordCt);
                return;
            }

            // New word for this document: persist it, register it, and create its first hit record.
            word.Persist(wordPlacement, session);
            wordSet.Add(word);

            // The previous post-increment of wordCt here only changed the local copy of the
            // by-value parameter and was never read again, so it has been dropped.
            WordHit wordHit = new WordHit(doc, wordCt, session);
            doc.WordHit.ValuePlacement = wordHitPlacement;
            doc.WordHit.AddFast(word, wordHit);
        }