// Lookup-only constructor: builds a Document shell carrying just the
// persistent id so it can be used as a key when searching collections.
public Document(UInt64 id) : base(id) { } // for lookups

/// <summary>
/// Creates a new Document for the given url and sets up its per-document
/// containers: the word-to-hit map and the set of distinct words.
/// </summary>
public Document(string url, IndexRoot indexRoot, SessionBase session)
{
  this.url = url;
  var comparer = new HashCodeComparer<Word>();
  m_wordHit = new BTreeMapOidShort<Word, WordHit>(null, session);
  m_wordHit.TransientBatchSize = 10000;
  wordSet = new BTreeSetOidShort<Word>(comparer, session, 1500, sizeof(int));
}
} // for lookups

/// <summary>
/// Full constructor: records the document's url and creates the
/// word-to-hit map and word set used for local indexing.
/// </summary>
public Document(string url, IndexRoot indexRoot, SessionBase session)
{
  this.url = url;
  HashCodeComparer<Word> wordComparer = new HashCodeComparer<Word>();
  m_wordHit = new BTreeMapOidShort<Word, WordHit>(null, session);
  m_wordHit.TransientBatchSize = 10000;
  wordSet = new BTreeSetOidShort<Word>(wordComparer, session, 1500, sizeof(int));
}
/// <summary>
/// Removes this document from the index: per-word global counts are
/// decremented, words that no longer occur anywhere are unpersisted, and
/// the document is taken out of the repository's document set.
/// </summary>
/// <returns>The zero-based position the document had in the document set,
/// or -1 when the document was never persisted.</returns>
public int Remove(IndexRoot indexRoot, SessionBase session)
{
  if (Id == 0)
    return -1; // never persisted, nothing to remove
  foreach (KeyValuePair<Word, WordHit> entry in m_wordHit)
  {
    Word word = entry.Key;
    if (word.DocumentHit.Count == 0)
      continue; // somehow empty wordHit maps may appear (need to fix)
    uint hitCount = (uint)entry.Value.Count;
    word.GlobalCount = word.GlobalCount - hitCount;
    if (word.GlobalCount == 0)
    {
      // this document held the last occurrences of the word - drop the word entirely
      indexRoot.lexicon.WordSet.Remove(word);
      if (word.DocumentHit.Count > 1)
        throw new UnexpectedException("When globalCount is 0, then only this single doc should remain for the word");
      word.DocumentHit.Unpersist(session);
      word.Unpersist(session);
    }
    else
      word.DocumentHit.Remove(this);
  }
  // figure out the document's ordinal position before removing it
  int position = 0;
  var iterator = indexRoot.repository.documentSet.Iterator();
  iterator.GoTo(this);
  while (iterator.MovePrevious())
    ++position;
  indexRoot.repository.documentSet.Remove(this);
  m_wordHit.Clear();
  m_wordHit.Unpersist(session);
  base.Unpersist(session);
  return position;
}
/// <summary>
/// Removes this document from the token-id based index. For each token held
/// by this document, either the token is dropped from the lexicon entirely
/// (when this document accounted for all its occurrences) or the global
/// count is reduced and this document is detached from the token.
/// </summary>
/// <returns>The zero-based position the document had in the document set,
/// or -1 when the document was never persisted.</returns>
public int Remove(IndexRoot indexRoot, SessionBase session)
{
  if (Id == 0)
    return -1;
  foreach (KeyValuePair<UInt32, UInt32> entry in m_wordHit)
  {
    UInt32 tokenId = entry.Key;
    UInt32 localCount = entry.Value;
    if (localCount == 0)
      continue; // somehow empty wordHit maps may appear (need to fix)
    var lexicon = indexRoot.Lexicon;
    var globalCount = lexicon.IdToGlobalCount[tokenId];
    if (globalCount == localCount)
    {
      // this document was the only holder of the token - remove it from the lexicon
      lexicon.RemoveToken(tokenId);
      lexicon.IdToGlobalCount.Remove(tokenId);
    }
    else
    {
      lexicon.ReduceGlobalCount(tokenId, localCount);
      var tokenDocs = lexicon.TokenMap[tokenId];
      tokenDocs.Remove(this);
    }
  }
  // figure out the document's ordinal position before removing it
  int position = 0;
  var iterator = indexRoot.Repository.DocumentSet.Iterator();
  iterator.GoTo(this);
  while (iterator.MovePrevious())
    ++position;
  indexRoot.Repository.DocumentSet.Remove(this);
  Unpersist(session);
  return position;
}
/// <summary>
/// Sets up the demo window: opens (or creates on first run) the index
/// databases, seeds the list of pages to index, and populates the grids.
/// </summary>
public MainWindow()
{
  const ushort btreeNodeSize = 5000;
  // try to keep the WeakIOptimizedPersistableReference objects around longer
  GCSettings.LatencyMode = GCLatencyMode.Batch;
  dataGridList = new List<DataGrid>();
  dataTableList = new List<DataTable>();
  InitializeComponent();
  session = new SessionNoServer(s_systemDir);
  Placement placerIndexRoot = new Placement(IndexRoot.PlaceInDatabase);
  session.BeginUpdate();
  Console.WriteLine("Running with databases in directory: " + session.SystemDirectory);
  File.Copy(s_licenseDbFile, Path.Combine(session.SystemDirectory, "4.odb"), true);
  IndexRoot indexRoot;
  Database db = session.OpenDatabase(IndexRoot.PlaceInDatabase, false, false);
  if (db == null)
  {
    // first run: create every database and seed the pages to index
    session.NewDatabase(IndexRoot.PlaceInDatabase, 0, "IndexRoot");
    session.NewDatabase(Lexicon.PlaceInDatabase, 0, "Lexicon");
    session.NewDatabase(Document.PlaceInDatabase, 0, "Document");
    session.NewDatabase(Repository.PlaceInDatabase, 0, "Repository");
    session.NewDatabase(DocumentText.PlaceInDatabase, 0, "DocumentText");
    session.NewDatabase(Word.PlaceInDatabase, 0, "Word");
    indexRoot = new IndexRoot(btreeNodeSize, session);
    if (Directory.Exists(s_booksDir))
    {
      // local book files take precedence when available
      foreach (string textFile in Directory.GetFiles(s_booksDir, "*.txt"))
        listBoxPagesToAdd.Items.Add(textFile);
    }
    else
    {
      wordMinCt.Text = 1.ToString();
      // no local books - seed with pages about other database products
      string[] seedUrls =
      {
        "http://www.VelocityDB.com/",
        "https://foundationdb.com/",
        "http://www.oracle.com/us/products/database/index.html",
        "http://www-01.ibm.com/software/data/db2/",
        "http://www.versant.com/",
        "http://web.progress.com/en/objectstore/",
        "https://www.mongodb.org/",
        "http://cassandra.apache.org/",
        "http://www.sybase.com/",
        "http://www.mcobject.com/perst",
        "http://www.marklogic.com/what-is-marklogic/",
        "http://hamsterdb.com/",
        "http://www.firebirdsql.org/",
        "http://www.h2database.com/",
        "http://www.oracle.com/technology/products/berkeley-db",
        "http://www.scimore.com/",
        "http://www.stsdb.com/",
        "http://www.sqlite.org/about.html",
        "http://www.mysql.com/products/enterprise/techspec.html",
        "http://www.objectivity.com",
        "http://vistadb.net/",
        "http://www.google.com/search?q=object+database&sourceid=ie7&rls=com.microsoft:en-us:IE-SearchBox&ie=&oe="
      };
      foreach (string seedUrl in seedUrls)
        listBoxPagesToAdd.Items.Add(seedUrl);
    }
    indexRoot.Persist(session, indexRoot);
  }
  else
    indexRoot = (IndexRoot)session.Open(Oid.Encode(IndexRoot.PlaceInDatabase, 1, 1));
  if (indexRoot.repository.documentSet.Count > 0)
  {
    // show at most the first 50 already-indexed documents
    List<Document> docs = indexRoot.repository.documentSet.ToList<Document>().Take(50).ToList<Document>();
    inDbListBox.ItemsSource = docs;
  }
  updateDataGrids(indexRoot);
  session.Commit();
  //verify();
}
/// <summary>
/// Rebuilds the word-count data grids: first a grid of global word counts,
/// then one grid per indexed document. When indexOfRemoved is given and the
/// panel is already populated, only the global grid is rebuilt and the
/// removed document's grid is dropped instead of rebuilding everything.
/// </summary>
/// <param name="indexRoot">Index to display; no-op when null or when the lexicon is empty.</param>
/// <param name="indexOfRemoved">Position of a just-removed document, or -1 for a full rebuild.</param>
void updateDataGrids(IndexRoot indexRoot, int indexOfRemoved = -1)
{
  if (indexRoot == null)
    return;
  if (indexRoot.lexicon.WordSet.Count == 0)
    return;
  stackPanel.IsEnabled = false; // suppress interaction while rebuilding
  bool aRefresh = stackPanel.Children.Count > 0;
  if (indexOfRemoved >= 0 && aRefresh)
    stackPanel.Children.RemoveAt(0); // replace only the global word-count grid
  else if (stackPanel.Children.Count > 0)
    stackPanel.Children.Clear();
  // NOTE(review): dataGridList/dataTableList grow on every refresh and are
  // never pruned here - looks like a slow leak; confirm intent.
  DataGrid dataGrid = new DataGrid();
  dataGrid.AutoGenerateColumns = true;
  dataGrid.MaxColumnWidth = 150;
  dataGridList.Add(dataGrid);
  DataTable table = new DataTable("Word Count");
  // typeof(...) instead of Type.GetType("System...."): same Type objects,
  // but checked at compile time instead of resolved from a string.
  DataColumn wordColumn = new DataColumn("Words (all pages)", typeof(string));
  DataColumn countColumn = new DataColumn("Count", typeof(uint));
  table.Columns.Add(wordColumn);
  table.Columns.Add(countColumn);
  DataRow newRow;
  int pageIndex = 0;
  // Minimum occurrence filter. BUG FIX: int.TryParse assigns 0 to its out
  // argument on failure, so the original "int min = 3; int.TryParse(...)"
  // silently replaced the intended default of 3 with 0 whenever the text
  // box did not contain a valid integer. Keep the default on parse failure.
  int min;
  if (!int.TryParse(wordMinCt.Text, out min))
    min = 3;
  foreach (Word word in indexRoot.lexicon.WordSet)
  {
    if (word.GlobalCount >= min)
    {
      newRow = table.NewRow();
      newRow[0] = word.aWord;
      newRow[1] = word.GlobalCount;
      table.Rows.Add(newRow);
    }
  }
  DataView dataView = new DataView(table);
  dataView.Sort = "Count desc"; // most frequent words first
  dataGrid.ItemsSource = dataView;
  stackPanel.Children.Insert(pageIndex++, dataGrid);
  if (indexOfRemoved >= 0 && aRefresh)
    stackPanel.Children.RemoveAt(indexOfRemoved + 1); // +1 skips the global grid
  else
  {
    // full rebuild: one grid per document (redundant double ToList removed)
    List<Document> docs = indexRoot.repository.documentSet.ToList<Document>();
    foreach (Document page in docs)
    {
      DataTable pageTable = new DataTable();
      dataTableList.Add(pageTable);
      // derive a short column header from the url: strip scheme, "www." and
      // trailing slash, then replace separators with spaces
      string pageName = page.url.TrimEnd('/');
      int index = pageName.IndexOf("//");
      if (index >= 0)
        pageName = pageName.Remove(0, index + 2);
      index = pageName.IndexOf("www.");
      if (index >= 0)
        pageName = pageName.Remove(0, index + 4);
      pageName = pageName.Replace('.', ' ');
      pageName = pageName.Replace('/', ' ');
      DataColumn wordColumnPage = new DataColumn(pageName, typeof(string));
      DataColumn countColumnPage = new DataColumn("Count", typeof(int));
      pageTable.Columns.Add(wordColumnPage);
      pageTable.Columns.Add(countColumnPage);
      foreach (KeyValuePair<Word, WordHit> pair in page.WordHit)
      {
        if ((int)pair.Value.Count >= min)
        {
          newRow = pageTable.NewRow();
          string aString = pair.Key.aWord;
          newRow.SetField<string>(wordColumnPage, aString);
          newRow.SetField<int>(countColumnPage, (int)pair.Value.Count);
          pageTable.Rows.Add(newRow);
        }
      }
      dataGrid = new DataGrid();
      dataGrid.AutoGenerateColumns = true;
      dataGrid.MaxColumnWidth = 150;
      dataGridList.Add(dataGrid);
      dataView = new DataView(pageTable);
      dataView.Sort = "Count desc";
      dataGrid.ItemsSource = dataView;
      stackPanel.Children.Insert(pageIndex++, dataGrid);
    }
  }
  stackPanel.IsEnabled = true;
}
/// <summary>
/// Creates a Document for a local text file and indexes its full contents.
/// Related objects are placed in the same database as the document, on
/// pages offset from the document's own page.
/// </summary>
/// <returns>The newly created (and indexed) Document.</returns>
public Document parseTextFile(string url, IndexRoot indexRoot, Placement docPlacement)
{
  Document doc = new Document(Path.GetFileName(url), indexRoot, session);
  Placement textPlacement = new Placement(docPlacement.TryDatabaseNumber, (ushort)(docPlacement.TryPageNumber + 1));
  Placement wordPlacement = new Placement(docPlacement.TryDatabaseNumber, (ushort)(docPlacement.TryPageNumber + 2));
  Placement hitPlacement = new Placement(docPlacement.TryDatabaseNumber, (ushort)(docPlacement.TryPageNumber + 10));
  using (StreamReader reader = new StreamReader(url))
  {
    string fileText = reader.ReadToEnd();
    textToWords(doc, indexRoot, fileText, docPlacement, textPlacement, wordPlacement, hitPlacement);
  }
  return doc;
}
/// <summary>
/// Downloads a web page, extracts the text of all its HTML nodes and
/// indexes the concatenated text as a new Document.
/// </summary>
/// <returns>The newly created (and indexed) Document.</returns>
public Document parseHtml(string url, IndexRoot indexRoot)
{
  Document doc = new Document(url, indexRoot, session);
  Placement docPlacement = new Placement(Document.PlaceInDatabase);
  Placement textPlacement = new Placement(Document.PlaceInDatabase, 2);
  Placement wordPlacement = new Placement(Document.PlaceInDatabase, 3);
  Placement hitPlacement = new Placement(Document.PlaceInDatabase, 100);
  using (WebClient client = new WebClient())
  {
    string markup = client.DownloadString(url);
    HtmlDocument htmlDoc = new HtmlDocument();
    htmlDoc.LoadHtml(markup);
    // collect the text of every text node, space separated
    string pageBody = "";
    foreach (HtmlNode textNode in htmlDoc.DocumentNode.SelectNodes("//text()"))
      pageBody += " " + textNode.InnerText;
    textToWords(doc, indexRoot, pageBody, docPlacement, textPlacement, wordPlacement, hitPlacement);
  }
  return doc;
}
/// <summary>
/// Persists a document and its text, then tokenizes the text and feeds each
/// accepted word into the document's local inverted index.
/// Fixes vs. original: the Word object is now only allocated for words that
/// pass the accept check (it was created for every token, including rejected
/// ones), the unused loop counter was removed, and the duplicate '(' entry
/// in splitChars was dropped.
/// </summary>
/// <param name="doc">Document to persist and index.</param>
/// <param name="indexRoot">Index root whose repository receives the document.</param>
/// <param name="docTextString">Raw text of the document.</param>
/// <param name="documentPlacement">Placement for the Document object.</param>
/// <param name="documentTextPlacement">Placement for the DocumentText object.</param>
/// <param name="wordPlacement">Placement for Word objects (passed through to createLocalInvertedIndex).</param>
/// <param name="wordHitPlacement">Placement for WordHit objects (passed through to createLocalInvertedIndex).</param>
public void textToWords(Document doc, IndexRoot indexRoot, string docTextString, Placement documentPlacement, Placement documentTextPlacement, Placement wordPlacement, Placement wordHitPlacement)
{
  DocumentText docText = new DocumentText(docTextString, doc);
  doc.Persist(documentPlacement, session);
  doc.Page.Database.Name = doc.Name;
  docText.Persist(documentTextPlacement, session);
  indexRoot.repository.documentSet.Add(doc);
  doc.Content = docText;
  docTextString = docTextString.ToLower();
  string[] excludedWords = new string[] { "and", "the" };
  // duplicate '(' removed; the set of separators is otherwise unchanged
  char[] splitChars = new char[] { ' ', '\n', '(', '"', '!', ',', ')', '\t' };
  string[] words = docTextString.Split(splitChars, StringSplitOptions.RemoveEmptyEntries);
  UInt64 wordCt = 0; // position of the word within the document (only counts indexed words)
  char[] trimEndChars = new char[] { ';', '.', '"', ',', '\r', ':', ']', '!', '?', '+', '(', ')', '\'', '{', '}', '-', '`', '/', '=' };
  char[] trimStartChars = new char[] { ';', '&', '-', '#', '*', '[', '.', '"', ',', '\r', ')', '(', '\'', '{', '}', '-', '`' };
  foreach (string wordStr in words)
  {
    string aWord = wordStr.TrimEnd(trimEndChars).TrimStart(trimStartChars);
    // skip one-letter tokens and stop words; only allocate a Word when accepted
    if (aWord.Length > 1 && !excludedWords.Contains(aWord))
    {
      Word word = new Word(aWord);
      createLocalInvertedIndex(doc, word, wordCt, wordPlacement, wordHitPlacement);
      ++wordCt;
    }
  }
}
/// <summary>
/// Folds every not-yet-indexed document's local word counts into the global
/// lexicon: existing global words get their counts increased, unseen words
/// get a new persisted WordGlobal entry; the document is then linked to each
/// of its words and marked as indexed.
/// </summary>
public void createGlobalInvertedIndex(IndexRoot indexRoot)
{
  Placement wordPlacement = new Placement(Lexicon.PlaceInDatabase, 2);
  BTreeSetOidShort<Word> globalWords = indexRoot.lexicon.WordSet;
  BTreeSet<Document> documents = indexRoot.repository.documentSet;
  Word globalWord = null;
  foreach (Document doc in documents)
  {
    if (doc.Indexed)
      continue; // already merged into the global index
    foreach (Word word in doc.WordSet)
    {
      WordHit hit = doc.WordHit[word];
      if (globalWords.TryGetKey(word, ref globalWord))
        globalWord.GlobalCount = globalWord.GlobalCount + (uint)hit.Count;
      else
      {
        // first time this word is seen anywhere - create its global entry
        globalWord = new WordGlobal(word.aWord, session, (uint)hit.Count);
        globalWord.Persist(wordPlacement, session);
        globalWords.Add(globalWord);
      }
      globalWord.DocumentHit.AddFast(doc);
    }
    doc.Indexed = true;
  }
}
/// <summary>
/// Imports the full Wikipedia articles XML dump into the index databases.
/// The dump is streamed with an XmlTextReader: each "title" text node
/// becomes a new Document, and the following "text" node becomes that
/// document's DocumentText content. Aborts early (read-only info dump) when
/// the IndexRoot database already exists.
/// </summary>
static void importEntireWikipedia()
{
  const ushort btreeNodeSize = 10000;
  Console.WriteLine(DateTime.Now.ToString() + ", start importing Wikipedia text");
  //System.Xml.Schema.XmlSchema docSchema;
  //using (System.Xml.XmlTextReader schemaReader = new System.Xml.XmlTextReader("c:\\export-0_5.xsd"))
  //{
  //  docSchema = System.Xml.Schema.XmlSchema.Read(schemaReader, ValidationCallBack);
  // }
  int docCount = 0;
  using (SessionNoServer session = new SessionNoServer(s_systemDir, 5000, false, false, CacheEnum.No)) // turn off page and object caching
  {
    Console.WriteLine("Running with databases in directory: " + session.SystemDirectory);
    //GCSettings.LatencyMode = GCLatencyMode.Batch;// try to keep the WeakIOptimizedPersistableReference objects around longer
    // NOTE(review): the meaning of the numeric Placement arguments (start
    // page, batch sizes, max objects/pages) is assumed from the ctor
    // signature - confirm against the Placement API docs.
    Placement documentPlacement = new Placement(Document.PlaceInDatabase, 1003, 1, 500, 1000, false, false, 1000, false);
    Placement contentPlacement = new Placement(Document.PlaceInDatabase, 1, 1, 500, UInt16.MaxValue, false, false, 1, false);
    XmlComment xmlComment;
    XmlElement xmlElement;
    XmlEntity xmlEntity;
    XmlText xmlText;
    XmlWhitespace xmlWhitespace;
    session.BeginUpdate();
    File.Copy(s_licenseDbFile, System.IO.Path.Combine(session.SystemDirectory, "4.odb"), true);
    // register all database schema classes used by the application in advance to avoid lock conflict later in parallel indexing
    session.RegisterClass(typeof(Repository));
    session.RegisterClass(typeof(IndexRoot));
    session.RegisterClass(typeof(Document));
    session.RegisterClass(typeof(Lexicon));
    session.RegisterClass(typeof(DocumentText));
    session.RegisterClass(typeof(Word));
    session.RegisterClass(typeof(WordGlobal));
    session.RegisterClass(typeof(WordHit));
    session.RegisterClass(typeof(BTreeSet<Document>));
    session.RegisterClass(typeof(OidShort));
    session.RegisterClass(typeof(BTreeMap<Word, WordHit>));
    session.RegisterClass(typeof(HashCodeComparer<Word>));
    session.RegisterClass(typeof(BTreeSetOidShort<Word>));
    session.RegisterClass(typeof(BTreeMapOidShort<Word, WordHit>));
    Database db = session.OpenDatabase(IndexRoot.PlaceInDatabase, false, false);
    if (db != null)
    {
      // already imported once - just report and bail out without modifying anything
      outputSomeInfo(session);
      session.Abort();
      return;
    }
    session.NewDatabase(IndexRoot.PlaceInDatabase, 0, "IndexRoot");
    session.NewDatabase(Lexicon.PlaceInDatabase, 0, "Lexicon");
    session.NewDatabase(Repository.PlaceInDatabase, 0, "Repository");
    for (UInt32 i = 40; i <= 186; i++)
    {
      session.NewDatabase(i, 512, "Document"); // pre allocate Document databases (40..186 inclusive = 147, not 146 as originally claimed) presized to 512MB each
    }
    //session.SetTraceDbActivity(Lexicon.PlaceInDatabase);
    //session.SetTraceAllDbActivity();
    XmlDocument xmlDocument = new XmlDocument("enwiki-latest-pages-articles.xml");
    IndexRoot indexRoot = new IndexRoot(btreeNodeSize, session);
    indexRoot.Persist(session, indexRoot, true);
    Document doc = null;
    bool titleElement = false; // true while inside a <title> element
    bool pageText = false;     // true while inside a <text> element
    UInt32 currentDocumentDatabaseNum = documentPlacement.StartDatabaseNumber;
    using (FileStream fs = new FileStream(s_wikipediaXmlFile, FileMode.Open))
    {
      //using (GZipStream zipStream = new GZipStream(fs, CompressionMode.Decompress)) // if input was a .gz file
      {
        using (System.Xml.XmlTextReader textReader = new System.Xml.XmlTextReader(fs))
        {
          // stream node by node; only element boundaries and text content matter here
          while (textReader.Read())
          {
            System.Xml.XmlNodeType nodeType = textReader.NodeType;
            switch (nodeType)
            {
              case System.Xml.XmlNodeType.Attribute:
                break;
              case System.Xml.XmlNodeType.CDATA:
                break;
              case System.Xml.XmlNodeType.Comment:
                xmlComment = new XmlComment(textReader.Value, xmlDocument);
                break;
              case System.Xml.XmlNodeType.Document:
                break;
              case System.Xml.XmlNodeType.DocumentFragment:
                break;
              case System.Xml.XmlNodeType.DocumentType:
                break;
              case System.Xml.XmlNodeType.Element:
                xmlElement = new XmlElement(textReader.Prefix, textReader.LocalName, textReader.NamespaceURI, xmlDocument);
                // remember which element we are inside so the Text case below
                // knows whether the value is a title or an article body
                if (textReader.LocalName == "title")
                  titleElement = true;
                else if (textReader.LocalName == "text")
                  pageText = true;
                break;
              case System.Xml.XmlNodeType.EndElement:
                if (textReader.LocalName == "title" && doc != null)
                  titleElement = false;
                else if (textReader.LocalName == "text" && doc != null)
                  pageText = false;
                break;
              case System.Xml.XmlNodeType.EndEntity:
                break;
              case System.Xml.XmlNodeType.Entity:
                xmlEntity = new XmlEntity(textReader.LocalName, xmlDocument);
                break;
              case System.Xml.XmlNodeType.EntityReference:
                break;
              case System.Xml.XmlNodeType.None:
                break;
              case System.Xml.XmlNodeType.Notation:
                break;
              case System.Xml.XmlNodeType.ProcessingInstruction:
                break;
              case System.Xml.XmlNodeType.SignificantWhitespace:
                break;
              case System.Xml.XmlNodeType.Text:
                xmlText = new XmlText(textReader.Value, xmlDocument);
                if (titleElement)
                {
                  // a title starts a new article: create and persist its Document
                  doc = new Document(textReader.Value, indexRoot, session);
                  doc.Persist(documentPlacement, session, true);
                  if (doc.DatabaseNumber != currentDocumentDatabaseNum)
                  {
                    // placement rolled over to a new database: flush the finished one
                    session.FlushUpdates(session.OpenDatabase(currentDocumentDatabaseNum));
                    Console.WriteLine("Database: " + currentDocumentDatabaseNum + " is completed, done importing article " + docCount + " number of lines: " + textReader.LineNumber);
                    currentDocumentDatabaseNum = doc.DatabaseNumber;
                  }
                  //doc.Page.Database.Name = doc.Name;
                }
                else if (doc != null && pageText)
                {
#if DEBUGx
                  Console.WriteLine(doc.Name + " line: " + textReader.LineNumber);
#endif
                  //if (textReader.LineNumber > 1000000)
                  //{
                  //  session.Commit();
                  //  return;
                  //}
                  // article body: persist the content next to its document
                  DocumentText content = new DocumentText(textReader.Value, doc);
                  if (doc.DatabaseNumber != contentPlacement.TryDatabaseNumber)
                    contentPlacement = new Placement(doc.DatabaseNumber, (ushort)contentPlacement.StartPageNumber, 1, contentPlacement.MaxObjectsPerPage, contentPlacement.MaxPagesPerDatabase, false, false, 1, false);
                  content.Persist(contentPlacement, session, false);
                  Debug.Assert(content.DatabaseNumber == doc.DatabaseNumber);
                  doc.Content = content;
                  indexRoot.repository.documentSet.AddFast(doc);
                  if (++docCount % 1000000 == 0)
                  {
                    //session.Commit(false); // skip recovery check, we do it in BeginUpdate which is enough
                    Console.WriteLine("Done importing article " + docCount + " number of lines: " + textReader.LineNumber);
                    //session.BeginUpdate();
                  }
                }
                break;
              case System.Xml.XmlNodeType.Whitespace:
                xmlWhitespace = new XmlWhitespace(textReader.Value, xmlDocument);
                break;
              case System.Xml.XmlNodeType.XmlDeclaration:
                break;
            };
          }
          Console.WriteLine("Finished importing article " + docCount + " number of lines: " + textReader.LineNumber);
        }
      }
    }
    session.Commit();
  }
  Console.WriteLine(DateTime.Now.ToString() + ", done importing Wikipedia text");
}
/// <summary>
/// Removes this document from the index, decrementing each word's global
/// count, unpersisting words whose count reaches zero, and finally removing
/// the document itself from the repository's document set.
/// </summary>
/// <returns>The zero-based position the document had in the document set,
/// or -1 when the document was never persisted.</returns>
public int Remove(IndexRoot indexRoot, SessionBase session)
{
  if (Id == 0)
    return -1;
  foreach (var hitEntry in m_wordHit)
  {
    var indexedWord = hitEntry.Key;
    if (indexedWord.DocumentHit.Count > 0) // somehow empty wordHit maps may appear (need to fix)
    {
      var occurrenceCount = (uint)hitEntry.Value.Count;
      indexedWord.GlobalCount -= occurrenceCount;
      if (indexedWord.GlobalCount == 0)
      {
        indexRoot.lexicon.WordSet.Remove(indexedWord);
        if (indexedWord.DocumentHit.Count > 1)
          throw new UnexpectedException("When globalCount is 0, then only this single doc should remain for the word");
        indexedWord.DocumentHit.Unpersist(session);
        indexedWord.Unpersist(session);
      }
      else
        indexedWord.DocumentHit.Remove(this);
    }
  }
  // count how many documents precede this one to report its position
  int position = 0;
  var docIterator = indexRoot.repository.documentSet.Iterator();
  docIterator.GoTo(this);
  while (docIterator.MovePrevious())
    ++position;
  indexRoot.repository.documentSet.Remove(this);
  m_wordHit.Clear();
  m_wordHit.Unpersist(session);
  base.Unpersist(session);
  return position;
}
} // for lookups

/// <summary>
/// Creates a Document for the given url using the compact token-id index
/// representation (token id mapped to its occurrence count).
/// </summary>
public Document(string url, IndexRoot indexRoot, SessionBase session)
{
  _url = url;
  m_wordHit = new BTreeMap<UInt32, UInt32>(null, session, 50000);
}