public void createLocalInvertedIndex(Document doc, Word word, UInt64 wordCt, Placement wordPlacement, Placement wordHitPlacement) { WordHit wordHit; BTreeSetOidShort<Word> wordSet = doc.WordSet; if (wordSet.TryGetKey(word, ref word)) { wordHit = doc.WordHit[word]; wordHit.Add(wordCt); } else { word.Persist(wordPlacement, session); wordSet.Add(word); wordHit = new WordHit(doc, wordCt++, session); doc.WordHit.ValuePlacement = wordHitPlacement; doc.WordHit.AddFast(word, wordHit); } }
static void createDocumentInvertedIndex(SessionBase session, Database db, BTreeSet<Document> documentSet) { UInt32 dbNum = db.DatabaseNumber; Document doc = null; Document inputDoc = new Document(db.Id); Placement wordPlacement = new Placement(inputDoc.DatabaseNumber, 20000, 1, 25000, 65000, true, false, 1, false); Placement wordHitPlacement = new Placement(inputDoc.DatabaseNumber, 40000, 1, 25000, 65500, true, false, 1, false); //session.SetTraceDbActivity(db.DatabaseNumber); BTreeSetIterator<Document> iterator = documentSet.Iterator(); iterator.GoTo(inputDoc); inputDoc = iterator.Current(); while (inputDoc != null && inputDoc.Page.Database.DatabaseNumber == dbNum) { doc = (Document)session.Open(inputDoc.Page.Database, inputDoc.Id); // if matching database is availeble, use it to speed up lookup DocumentText docText = doc.Content; string text = docText.Text.ToLower(); MatchCollection tagMatches = Regex.Matches(text, "[a-z][a-z.$]+"); UInt64 wordCt = 0; WordHit wordHit; Word word; if (++s_docCountIndexed % 50000 == 0) Console.WriteLine(DateTime.Now.ToString() + ", done indexing article: " + s_docCountIndexed); BTreeSetOidShort<Word> wordSet = doc.WordSet; foreach (Match m in tagMatches) { word = new Word(m.Value); if (wordSet.TryGetKey(word, ref word)) { //wordHit = doc.WordHit[word]; // to costly to add tight now - figure out a better way ? //wordHit.Add(wordCt); } else { word = new Word(m.Value); word.Persist(wordPlacement, session); wordSet.Add(word); wordHit = new WordHit(doc, wordCt++, session); //wordHit.Persist(wordHitPlacement, session); doc.WordHit.ValuePlacement = wordHitPlacement; doc.WordHit.AddFast(word, wordHit); } } inputDoc = iterator.Next(); } session.FlushUpdates(db); session.ClearCachedObjects(db); // free up memory for objects we no longer need to have cached Console.WriteLine(DateTime.Now.ToString() + ", done indexing article: " + s_docCountIndexed + " Database: " + dbNum + " is completed."); }