/// <summary>
/// Merges the word hits of every document that is not yet flagged as indexed
/// into the lexicon word set reachable from <paramref name="indexRoot"/>.
/// </summary>
/// <remarks>
/// For each word of such a document, the matching lexicon entry gets its
/// <c>GlobalCount</c> raised by the document's hit count for that word. A word
/// that is not in the lexicon yet is created as a <c>WordGlobal</c>, persisted
/// and added to the lexicon. In both cases the document is appended to the
/// entry's <c>DocumentHit</c> collection. Once all its words are processed the
/// document is flagged <c>Indexed</c>.
/// </remarks>
/// <param name="indexRoot">Root object providing the lexicon word set and the repository document set.</param>
public void createGlobalInvertedIndex(IndexRoot indexRoot)
{
    Placement globalWordPlacement = new Placement(Lexicon.PlaceInDatabase, 2);
    BTreeSetOidShort<Word> lexiconWords = indexRoot.lexicon.WordSet;
    BTreeSet<Document> documents = indexRoot.repository.documentSet;

    // Shared across iterations because TryGetKey hands the match back through a ref argument.
    Word globalWord = null;

    foreach (Document doc in documents)
    {
        if (doc.Indexed)
        {
            continue; // already merged into the lexicon
        }

        foreach (Word localWord in doc.WordSet)
        {
            WordHit hit = doc.WordHit[localWord];
            if (!lexiconWords.TryGetKey(localWord, ref globalWord))
            {
                // First time this word is seen globally: create and register its lexicon entry.
                globalWord = new WordGlobal(localWord.aWord, session, (uint)hit.Count);
                globalWord.Persist(globalWordPlacement, session);
                lexiconWords.Add(globalWord);
            }
            else
            {
                globalWord.GlobalCount += (uint)hit.Count;
            }

            globalWord.DocumentHit.AddFast(doc);
        }

        doc.Indexed = true;
    }
}
/// <summary>
/// Builds the top level (lexicon) inverted index from the word hits of every
/// document in the repository, inside a single update transaction.
/// </summary>
/// <remarks>
/// Opens a <c>SessionNoServer</c> on <c>s_systemDir</c>, loads the
/// <c>IndexRoot</c> and walks all documents. For each word of a document the
/// matching lexicon entry gets its <c>GlobalCount</c> raised by the document's
/// hit count; a word missing from the lexicon is created as a
/// <c>WordGlobal</c>, persisted and added. The document is appended to the
/// entry's <c>DocumentHit</c> collection and flagged <c>Indexed</c> afterwards.
/// Documents are processed regardless of their current <c>Indexed</c> value.
/// Start and completion are logged to the console.
/// </remarks>
static void createTopLevelInvertedIndex()
{
    Console.WriteLine(DateTime.Now.ToString() + ", start creating top level inverted index");
    using (SessionNoServer session = new SessionNoServer(s_systemDir))
    {
        Placement globalWordPlacement = new Placement(Lexicon.PlaceInDatabase, 2, 1, 1000, 50000, true, false, UInt32.MaxValue, false);
        session.BeginUpdate();

        IndexRoot indexRoot = (IndexRoot)session.Open(Oid.Encode(IndexRoot.PlaceInDatabase, 1, 1));
        BTreeSetOidShort<Word> lexiconWords = indexRoot.lexicon.WordSet;
        BTreeSet<Document> documents = indexRoot.repository.documentSet;

        // Shared across iterations because TryGetKey hands the match back through a ref argument.
        Word globalWord = null;

        foreach (Document doc in documents)
        {
            foreach (Word localWord in doc.WordSet)
            {
                WordHit hit = doc.WordHit[localWord];
                if (!lexiconWords.TryGetKey(localWord, ref globalWord))
                {
                    // Word not in the lexicon yet: create, persist and register it.
                    globalWord = new WordGlobal(localWord.aWord, session, (uint)hit.Count);
                    globalWord.Persist(globalWordPlacement, session);
                    indexRoot.lexicon.WordSet.Add(globalWord);
                }
                else
                {
                    globalWord.GlobalCount += (uint)hit.Count;
                }

                globalWord.DocumentHit.AddFast(doc);
            }

            doc.Indexed = true;
        }

        session.Commit();
        Console.WriteLine(DateTime.Now.ToString() + ", done creating top level inverted index");
    }
}
/// <summary>
/// Builds the per-document word index for every document of
/// <paramref name="documentSet"/> that is stored in database <paramref name="db"/>.
/// </summary>
/// <remarks>
/// Iteration starts at the position of a probe document created from
/// <c>db.Id</c> and continues while the current document's page belongs to the
/// same database number. Each document's text is lower-cased and tokenized with
/// the pattern <c>[a-z][a-z.$]+</c>; every token not yet in the document's word
/// set is persisted, added to the set and given a <c>WordHit</c>. Tokens already
/// present are skipped. After the loop the database's updates are flushed and
/// its cached objects are cleared. Progress is logged every 50000 documents
/// (counted in <c>s_docCountIndexed</c>) and once on completion.
/// </remarks>
/// <param name="session">Session used to open documents and to persist words and word hits.</param>
/// <param name="db">Database whose documents are indexed.</param>
/// <param name="documentSet">Set of all documents; only those located in <paramref name="db"/> are visited.</param>
static void createDocumentInvertedIndex(SessionBase session, Database db, BTreeSet<Document> documentSet)
{
    UInt32 dbNum = db.DatabaseNumber;
    Document inputDoc = new Document(db.Id);
    Placement wordPlacement = new Placement(inputDoc.DatabaseNumber, 20000, 1, 25000, 65000, true, false, 1, false);
    Placement wordHitPlacement = new Placement(inputDoc.DatabaseNumber, 40000, 1, 25000, 65500, true, false, 1, false);
    //session.SetTraceDbActivity(db.DatabaseNumber);
    BTreeSetIterator<Document> iterator = documentSet.Iterator();
    iterator.GoTo(inputDoc);
    inputDoc = iterator.Current();
    while (inputDoc != null && inputDoc.Page.Database.DatabaseNumber == dbNum)
    {
        // If the matching database is available, use it to speed up the lookup.
        Document doc = (Document)session.Open(inputDoc.Page.Database, inputDoc.Id);
        DocumentText docText = doc.Content;

        // Invariant lower-casing: the token pattern below is ASCII-only, and a culture-sensitive
        // ToLower() (e.g. tr-TR maps 'I' to dotless 'ı') would silently truncate or drop words.
        string text = docText.Text.ToLowerInvariant();
        MatchCollection tagMatches = Regex.Matches(text, "[a-z][a-z.$]+");

        // NOTE(review): wordCt only advances when a new word is stored, so the value given to
        // WordHit is the ordinal of the distinct word, not the token position — confirm intent.
        UInt64 wordCt = 0;
        if (++s_docCountIndexed % 50000 == 0)
        {
            Console.WriteLine(DateTime.Now.ToString() + ", done indexing article: " + s_docCountIndexed);
        }

        BTreeSetOidShort<Word> wordSet = doc.WordSet;
        foreach (Match m in tagMatches)
        {
            Word word = new Word(m.Value);
            if (wordSet.TryGetKey(word, ref word))
            {
                // Repeated occurrences are deliberately not recorded here; looking up
                // doc.WordHit[word] and adding the hit was judged too costly for now.
                continue;
            }

            // The lookup received 'word' by ref, so a fresh instance is created for the miss path
            // (as the original code did) instead of relying on what TryGetKey left behind.
            word = new Word(m.Value);
            word.Persist(wordPlacement, session);
            wordSet.Add(word);
            WordHit wordHit = new WordHit(doc, wordCt++, session);
            //wordHit.Persist(wordHitPlacement, session);
            doc.WordHit.ValuePlacement = wordHitPlacement;
            doc.WordHit.AddFast(word, wordHit);
        }

        inputDoc = iterator.Next();
    }

    session.FlushUpdates(db);
    session.ClearCachedObjects(db); // free up memory for objects we no longer need to have cached
    Console.WriteLine(DateTime.Now.ToString() + ", done indexing article: " + s_docCountIndexed + " Database: " + dbNum + " is completed.");
}
/// <summary>
/// Records one occurrence of <paramref name="word"/> in the per-document index of
/// <paramref name="doc"/>.
/// </summary>
/// <remarks>
/// If the document's word set already contains the word, <paramref name="wordCt"/>
/// is added to the existing <c>WordHit</c>. Otherwise the word is persisted, added
/// to the word set, and a new <c>WordHit</c> created from <paramref name="wordCt"/>
/// is stored for it using <paramref name="wordHitPlacement"/>.
/// </remarks>
/// <param name="doc">Document whose word set and word hits are updated.</param>
/// <param name="word">Word to record; persisted if the document does not contain it yet.</param>
/// <param name="wordCt">
/// Value passed to the word hit. It is received by value, so the caller's variable
/// is never modified by this method.
/// </param>
/// <param name="wordPlacement">Placement used when persisting a new word.</param>
/// <param name="wordHitPlacement">Placement assigned to the document's word hit values for a new word.</param>
public void createLocalInvertedIndex(Document doc, Word word, UInt64 wordCt, Placement wordPlacement, Placement wordHitPlacement)
{
    BTreeSetOidShort<Word> wordSet = doc.WordSet;

    // Look up into a separate local so the caller-supplied word is still intact on the miss
    // path, regardless of what TryGetKey does to its ref argument when nothing is found.
    Word existingWord = null;
    if (wordSet.TryGetKey(word, ref existingWord))
    {
        WordHit existingHit = doc.WordHit[existingWord];
        existingHit.Add(wordCt);
        return;
    }

    word.Persist(wordPlacement, session);
    wordSet.Add(word);

    // The former 'wordCt++' here was a dead increment on a by-value parameter; removed.
    WordHit wordHit = new WordHit(doc, wordCt, session);
    doc.WordHit.ValuePlacement = wordHitPlacement;
    doc.WordHit.AddFast(word, wordHit);
}