static void importEntireWikipedia()
{
  const ushort btreeNodeSize = 10000;
  Console.WriteLine(DateTime.Now.ToString() + ", start importing Wikipedia text");
  //System.Xml.Schema.XmlSchema docSchema;
  //using (System.Xml.XmlTextReader schemaReader = new System.Xml.XmlTextReader("c:\\export-0_5.xsd"))
  //{
  //  docSchema = System.Xml.Schema.XmlSchema.Read(schemaReader, ValidationCallBack);
  //}
  int docCount = 0;
  using (SessionNoServer session = new SessionNoServer(s_systemDir, 5000, false, false, CacheEnum.No)) // turn off page and object caching
  {
    Console.WriteLine("Running with databases in directory: " + session.SystemDirectory);
    //GCSettings.LatencyMode = GCLatencyMode.Batch; // try to keep the WeakIOptimizedPersistableReference objects around longer
    Placement documentPlacement = new Placement(Document.PlaceInDatabase, 1003, 1, 500, 1000, false, false, 1000, false);
    Placement contentPlacement = new Placement(Document.PlaceInDatabase, 1, 1, 500, UInt16.MaxValue, false, false, 1, false);
    XmlComment xmlComment;
    XmlElement xmlElement;
    XmlEntity xmlEntity;
    XmlText xmlText;
    XmlWhitespace xmlWhitespace;
    session.BeginUpdate();
    File.Copy(s_licenseDbFile, System.IO.Path.Combine(session.SystemDirectory, "4.odb"), true);
    // register all database schema classes used by the application in advance to avoid lock conflicts later during parallel indexing
    session.RegisterClass(typeof(Repository));
    session.RegisterClass(typeof(IndexRoot));
    session.RegisterClass(typeof(Document));
    session.RegisterClass(typeof(Lexicon));
    session.RegisterClass(typeof(DocumentText));
    session.RegisterClass(typeof(Word));
    session.RegisterClass(typeof(WordGlobal));
    session.RegisterClass(typeof(WordHit));
    session.RegisterClass(typeof(BTreeSet<Document>));
    session.RegisterClass(typeof(OidShort));
    session.RegisterClass(typeof(BTreeMap<Word, WordHit>));
    session.RegisterClass(typeof(HashCodeComparer<Word>));
    session.RegisterClass(typeof(BTreeSetOidShort<Word>));
    session.RegisterClass(typeof(BTreeMapOidShort<Word, WordHit>));
    Database db = session.OpenDatabase(IndexRoot.PlaceInDatabase, false, false);
    if (db != null)
    {
      outputSomeInfo(session);
      session.Abort();
      return;
    }
    session.NewDatabase(IndexRoot.PlaceInDatabase, 0, "IndexRoot");
    session.NewDatabase(Lexicon.PlaceInDatabase, 0, "Lexicon");
    session.NewDatabase(Repository.PlaceInDatabase, 0, "Repository");
    for (UInt32 i = 40; i <= 186; i++)
      session.NewDatabase(i, 512, "Document"); // pre-allocate 147 Document databases presized to 512 MB each
    //session.SetTraceDbActivity(Lexicon.PlaceInDatabase);
    //session.SetTraceAllDbActivity();
    XmlDocument xmlDocument = new XmlDocument("enwiki-latest-pages-articles.xml");
    IndexRoot indexRoot = new IndexRoot(btreeNodeSize, session);
    indexRoot.Persist(session, indexRoot, true);
    Document doc = null;
    bool titleElement = false;
    bool pageText = false;
    UInt32 currentDocumentDatabaseNum = documentPlacement.StartDatabaseNumber;
    using (FileStream fs = new FileStream(s_wikipediaXmlFile, FileMode.Open))
    {
      //using (GZipStream zipStream = new GZipStream(fs, CompressionMode.Decompress)) // if input was a .gz file
      {
        using (System.Xml.XmlTextReader textReader = new System.Xml.XmlTextReader(fs))
        {
          while (textReader.Read())
          {
            System.Xml.XmlNodeType nodeType = textReader.NodeType;
            switch (nodeType)
            {
              case System.Xml.XmlNodeType.Attribute: break;
              case System.Xml.XmlNodeType.CDATA: break;
              case System.Xml.XmlNodeType.Comment:
                xmlComment = new XmlComment(textReader.Value, xmlDocument);
                break;
              case System.Xml.XmlNodeType.Document: break;
              case System.Xml.XmlNodeType.DocumentFragment: break;
              case System.Xml.XmlNodeType.DocumentType: break;
              case System.Xml.XmlNodeType.Element:
                xmlElement = new XmlElement(textReader.Prefix, textReader.LocalName, textReader.NamespaceURI, xmlDocument);
                if (textReader.LocalName == "title")
                  titleElement = true;
                else if (textReader.LocalName == "text")
                  pageText = true;
                break;
              case System.Xml.XmlNodeType.EndElement:
                if (textReader.LocalName == "title" && doc != null)
                  titleElement = false;
                else if (textReader.LocalName == "text" && doc != null)
                  pageText = false;
                break;
              case System.Xml.XmlNodeType.EndEntity: break;
              case System.Xml.XmlNodeType.Entity:
                xmlEntity = new XmlEntity(textReader.LocalName, xmlDocument);
                break;
              case System.Xml.XmlNodeType.EntityReference: break;
              case System.Xml.XmlNodeType.None: break;
              case System.Xml.XmlNodeType.Notation: break;
              case System.Xml.XmlNodeType.ProcessingInstruction: break;
              case System.Xml.XmlNodeType.SignificantWhitespace: break;
              case System.Xml.XmlNodeType.Text:
                xmlText = new XmlText(textReader.Value, xmlDocument);
                if (titleElement)
                {
                  doc = new Document(textReader.Value, indexRoot, session);
                  doc.Persist(documentPlacement, session, true);
                  if (doc.DatabaseNumber != currentDocumentDatabaseNum)
                  {
                    session.FlushUpdates(session.OpenDatabase(currentDocumentDatabaseNum));
                    Console.WriteLine("Database: " + currentDocumentDatabaseNum + " is completed, done importing article " + docCount + " number of lines: " + textReader.LineNumber);
                    currentDocumentDatabaseNum = doc.DatabaseNumber;
                  }
                  //doc.Page.Database.Name = doc.Name;
                }
                else if (doc != null && pageText)
                {
#if DEBUGx
                  Console.WriteLine(doc.Name + " line: " + textReader.LineNumber);
#endif
                  //if (textReader.LineNumber > 1000000)
                  //{
                  //  session.Commit();
                  //  return;
                  //}
                  DocumentText content = new DocumentText(textReader.Value, doc);
                  if (doc.DatabaseNumber != contentPlacement.TryDatabaseNumber)
                    contentPlacement = new Placement(doc.DatabaseNumber, (ushort)contentPlacement.StartPageNumber, 1, contentPlacement.MaxObjectsPerPage, contentPlacement.MaxPagesPerDatabase, false, false, 1, false);
                  content.Persist(contentPlacement, session, false);
                  Debug.Assert(content.DatabaseNumber == doc.DatabaseNumber);
                  doc.Content = content;
                  indexRoot.repository.documentSet.AddFast(doc);
                  if (++docCount % 1000000 == 0)
                  {
                    //session.Commit(false); // skip recovery check, we do it in BeginUpdate which is enough
                    Console.WriteLine("Done importing article " + docCount + " number of lines: " + textReader.LineNumber);
                    //session.BeginUpdate();
                  }
                }
                break;
              case System.Xml.XmlNodeType.Whitespace:
                xmlWhitespace = new XmlWhitespace(textReader.Value, xmlDocument);
                break;
              case System.Xml.XmlNodeType.XmlDeclaration: break;
            }
          }
          Console.WriteLine("Finished importing article " + docCount + " number of lines: " + textReader.LineNumber);
        }
      }
    }
    session.Commit();
  }
  Console.WriteLine(DateTime.Now.ToString() + ", done importing Wikipedia text");
}
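For reference, here is a minimal sketch, outside the sample, of the Placement pattern the method above relies on: a Placement pins newly persisted objects to a chosen database and page range so related objects stay physically clustered, and the Persist overload accepts it directly. The Note class, database number 50, and directory path are hypothetical; the namespaces are assumed to be the standard VelocityDb ones, and the constructor argument order simply mirrors the documentPlacement call above.

using System;
using VelocityDb;         // OptimizedPersistable (assumed namespace)
using VelocityDb.Session; // SessionNoServer (assumed namespace)

public class Note : OptimizedPersistable // hypothetical persistable class
{
  public string Text;
  public Note(string text) { Text = text; }
}

public static class PlacementSketch
{
  public static void Run()
  {
    using (SessionNoServer session = new SessionNoServer("c:\\NotesDb")) // hypothetical directory
    {
      session.BeginUpdate();
      // database 50, start page 1, start slot 1, up to 500 objects/page, 1000 pages/database
      Placement place = new Placement(50, 1, 1, 500, 1000, false, false, 1000, false);
      Note note = new Note("hello");
      note.Persist(place, session, true); // same overload used for Document above
      Console.WriteLine("Persisted to database: " + note.DatabaseNumber);
      session.Commit();
    }
  }
}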
public MainWindow()
{
  const ushort btreeNodeSize = 5000;
  GCSettings.LatencyMode = GCLatencyMode.Batch; // try to keep the WeakIOptimizedPersistableReference objects around longer
  dataGridList = new List<DataGrid>();
  dataTableList = new List<DataTable>();
  InitializeComponent();
  session = new SessionNoServer(s_systemDir);
  Placement placerIndexRoot = new Placement(IndexRoot.PlaceInDatabase);
  session.BeginUpdate();
  Console.WriteLine("Running with databases in directory: " + session.SystemDirectory);
  File.Copy(s_licenseDbFile, Path.Combine(session.SystemDirectory, "4.odb"), true);
  IndexRoot indexRoot;
  Database db = session.OpenDatabase(IndexRoot.PlaceInDatabase, false, false);
  if (db == null)
  {
    session.NewDatabase(IndexRoot.PlaceInDatabase, 0, "IndexRoot");
    session.NewDatabase(Lexicon.PlaceInDatabase, 0, "Lexicon");
    session.NewDatabase(Document.PlaceInDatabase, 0, "Document");
    session.NewDatabase(Repository.PlaceInDatabase, 0, "Repository");
    session.NewDatabase(DocumentText.PlaceInDatabase, 0, "DocumentText");
    session.NewDatabase(Word.PlaceInDatabase, 0, "Word");
    indexRoot = new IndexRoot(btreeNodeSize, session);
    if (Directory.Exists(s_booksDir))
    {
      string[] directoryTextFiles = Directory.GetFiles(s_booksDir, "*.txt");
      foreach (string fileName in directoryTextFiles)
        listBoxPagesToAdd.Items.Add(fileName);
    }
    else
    {
      wordMinCt.Text = 1.ToString();
      listBoxPagesToAdd.Items.Add("http://www.VelocityDB.com/");
      // other database products
      listBoxPagesToAdd.Items.Add("https://foundationdb.com/");
      listBoxPagesToAdd.Items.Add("http://www.oracle.com/us/products/database/index.html");
      listBoxPagesToAdd.Items.Add("http://www-01.ibm.com/software/data/db2/");
      listBoxPagesToAdd.Items.Add("http://www.versant.com/");
      listBoxPagesToAdd.Items.Add("http://web.progress.com/en/objectstore/");
      listBoxPagesToAdd.Items.Add("https://www.mongodb.org/");
      listBoxPagesToAdd.Items.Add("http://cassandra.apache.org/");
      listBoxPagesToAdd.Items.Add("http://www.sybase.com/");
      listBoxPagesToAdd.Items.Add("http://www.mcobject.com/perst");
      listBoxPagesToAdd.Items.Add("http://www.marklogic.com/what-is-marklogic/");
      listBoxPagesToAdd.Items.Add("http://hamsterdb.com/");
      listBoxPagesToAdd.Items.Add("http://www.firebirdsql.org/");
      listBoxPagesToAdd.Items.Add("http://www.h2database.com/");
      listBoxPagesToAdd.Items.Add("http://www.oracle.com/technology/products/berkeley-db");
      listBoxPagesToAdd.Items.Add("http://www.scimore.com/");
      listBoxPagesToAdd.Items.Add("http://www.stsdb.com/");
      listBoxPagesToAdd.Items.Add("http://www.sqlite.org/about.html");
      listBoxPagesToAdd.Items.Add("http://www.mysql.com/products/enterprise/techspec.html");
      listBoxPagesToAdd.Items.Add("http://www.objectivity.com");
      listBoxPagesToAdd.Items.Add("http://vistadb.net/");
      listBoxPagesToAdd.Items.Add("http://www.google.com/search?q=object+database&sourceid=ie7&rls=com.microsoft:en-us:IE-SearchBox&ie=&oe=");
    }
    indexRoot.Persist(session, indexRoot);
  }
  else
  {
    indexRoot = (IndexRoot)session.Open(Oid.Encode(IndexRoot.PlaceInDatabase, 1, 1));
  }
  if (indexRoot.repository.documentSet.Count > 0)
  {
    List<Document> docs = indexRoot.repository.documentSet.ToList<Document>().Take(50).ToList<Document>();
    inDbListBox.ItemsSource = docs;
  }
  updateDataGrids(indexRoot);
  session.Commit();
  //verify();
}
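The else branch above re-opens the stored IndexRoot by a fixed object id rather than by query: since IndexRoot is the first object persisted into its database, it always lives at page 1, slot 1. A minimal sketch of that lookup as a standalone helper, assuming BeginRead() is the read-only counterpart of the BeginUpdate() call used above and SessionBase is the common session base class:

// Minimal sketch: re-open the stored IndexRoot at its fixed Oid (page 1, slot 1).
// BeginRead() and SessionBase are assumptions; Oid.Encode and session.Open are
// exactly the calls used in the constructor above.
static IndexRoot OpenIndexRoot(SessionBase session)
{
  session.BeginRead(); // assumed read-only transaction API
  IndexRoot indexRoot = (IndexRoot)session.Open(Oid.Encode(IndexRoot.PlaceInDatabase, 1, 1));
  session.Commit(); // ends the read transaction
  return indexRoot;
}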
static void ImportEntireWikipedia()
{
  const ushort btreeNodeSize = 10000;
  Console.WriteLine(DateTime.Now.ToString() + ", start importing Wikipedia text");
  //System.Xml.Schema.XmlSchema docSchema;
  //using (System.Xml.XmlTextReader schemaReader = new System.Xml.XmlTextReader("c:\\export-0_5.xsd"))
  //{
  //  docSchema = System.Xml.Schema.XmlSchema.Read(schemaReader, ValidationCallBack);
  //}
  int docCount = 0;
  using (SessionNoServer session = new SessionNoServer(s_systemDir, 5000, false, false, CacheEnum.No)) // turn off page and object caching
  {
    Console.WriteLine($"Running with databases in directory: {session.SystemDirectory}");
    //GCSettings.LatencyMode = GCLatencyMode.Batch; // try to keep the WeakIOptimizedPersistableReference objects around longer
    XmlComment xmlComment;
    XmlElement xmlElement;
    XmlEntity xmlEntity;
    XmlText xmlText;
    XmlWhitespace xmlWhitespace;
    session.BeginUpdate();
    // register all database schema classes used by the application in advance to avoid lock conflicts later during parallel indexing
    Database db = session.OpenDatabase(IndexRoot.PlaceInDatabase, false, false);
    if (db != null)
    {
      outputSomeInfo(session);
      session.Abort();
      return;
    }
    //session.SetTraceDbActivity(Lexicon.PlaceInDatabase);
    //session.SetTraceAllDbActivity();
    XmlDocument xmlDocument = new XmlDocument("enwiki-latest-pages-articles.xml");
    IndexRoot indexRoot = new IndexRoot(btreeNodeSize, session);
    indexRoot.Persist(session, indexRoot, true);
    UInt32 currentDocumentDatabaseNum = 0;
    Document doc = null;
    bool titleElement = false;
    bool pageText = false;
    using (FileStream fs = new FileStream(s_wikipediaXmlFile, FileMode.Open))
    {
      //using (GZipStream zipStream = new GZipStream(fs, CompressionMode.Decompress)) // if input was a .gz file
      {
        using (System.Xml.XmlTextReader textReader = new System.Xml.XmlTextReader(fs))
        {
          while (textReader.Read())
          {
            System.Xml.XmlNodeType nodeType = textReader.NodeType;
            switch (nodeType)
            {
              case System.Xml.XmlNodeType.Attribute: break;
              case System.Xml.XmlNodeType.CDATA: break;
              case System.Xml.XmlNodeType.Comment:
                xmlComment = new XmlComment(textReader.Value, xmlDocument);
                break;
              case System.Xml.XmlNodeType.Document: break;
              case System.Xml.XmlNodeType.DocumentFragment: break;
              case System.Xml.XmlNodeType.DocumentType: break;
              case System.Xml.XmlNodeType.Element:
                xmlElement = new XmlElement(textReader.Prefix, textReader.LocalName, textReader.NamespaceURI, xmlDocument);
                if (textReader.LocalName == "title")
                  titleElement = true;
                else if (textReader.LocalName == "text")
                  pageText = true;
                break;
              case System.Xml.XmlNodeType.EndElement:
                if (textReader.LocalName == "title" && doc != null)
                  titleElement = false;
                else if (textReader.LocalName == "text" && doc != null)
                  pageText = false;
                break;
              case System.Xml.XmlNodeType.EndEntity: break;
              case System.Xml.XmlNodeType.Entity:
                xmlEntity = new XmlEntity(textReader.LocalName, xmlDocument);
                break;
              case System.Xml.XmlNodeType.EntityReference: break;
              case System.Xml.XmlNodeType.None: break;
              case System.Xml.XmlNodeType.Notation: break;
              case System.Xml.XmlNodeType.ProcessingInstruction: break;
              case System.Xml.XmlNodeType.SignificantWhitespace: break;
              case System.Xml.XmlNodeType.Text:
                xmlText = new XmlText(textReader.Value, xmlDocument);
                if (titleElement)
                {
                  doc = new Document(textReader.Value, indexRoot, session);
                  session.Persist(doc);
                  if (doc.DatabaseNumber != currentDocumentDatabaseNum)
                  {
                    if (currentDocumentDatabaseNum > 0)
                    {
                      session.FlushUpdates();
                      Console.WriteLine("Database: " + currentDocumentDatabaseNum + " is completed, done importing article " + docCount + " number of lines: " + textReader.LineNumber);
                    }
                    currentDocumentDatabaseNum = doc.DatabaseNumber;
                  }
                  //doc.Page.Database.Name = doc.Name;
                }
                else if (doc != null && pageText)
                {
#if DEBUGx
                  Console.WriteLine(doc.Name + " line: " + textReader.LineNumber);
#endif
                  //if (textReader.LineNumber > 1000000)
                  //{
                  //  session.Commit();
                  //  return;
                  //}
                  DocumentText content = new DocumentText(textReader.Value, doc);
                  session.Persist(content, 10000);
                  doc.Content = content;
                  indexRoot.Repository.DocumentSet.AddFast(doc);
                  if (++docCount % 1000000 == 0)
                  {
                    //session.Commit(false); // skip recovery check, we do it in BeginUpdate which is enough
                    Console.WriteLine("Done importing article " + docCount + " number of lines: " + textReader.LineNumber);
                    //session.BeginUpdate();
                  }
                }
                break;
              case System.Xml.XmlNodeType.Whitespace:
                xmlWhitespace = new XmlWhitespace(textReader.Value, xmlDocument);
                break;
              case System.Xml.XmlNodeType.XmlDeclaration: break;
            }
          }
          Console.WriteLine("Finished importing article " + docCount + " number of lines: " + textReader.LineNumber);
        }
      }
    }
    session.Commit();
  }
  Console.WriteLine(DateTime.Now.ToString() + ", done importing Wikipedia text");
}