#pragma warning restore 0169
/// <summary>
/// Creates a node and records the document that owns it.
/// </summary>
/// <param name="ownerDocument">Document stored in this node's <c>ownerDocument</c> field.</param>
public XmlNode(XmlDocument ownerDocument)
{
  this.ownerDocument = ownerDocument;
}
/// <summary>
/// Creates a linked node; all work is delegated to the base constructor.
/// </summary>
/// <param name="ownerDocument">Owning document, forwarded unchanged to the base constructor.</param>
public XmlLinkedNode(XmlDocument ownerDocument)
  : base(ownerDocument)
{
}
/// <summary>
/// Creates an element node, keeping only its local name.
/// </summary>
/// <param name="prefix">Namespace prefix; accepted but not stored by this constructor.</param>
/// <param name="localName">Local name, stored in the <c>localName</c> field.</param>
/// <param name="namespaceURI">Namespace URI; accepted but not stored by this constructor.</param>
/// <param name="doc">Owning document, forwarded to the base constructor.</param>
public XmlElement(string prefix, string localName, string namespaceURI, XmlDocument doc)
  : base(doc)
{
  this.localName = localName;
}
/// <summary>
/// Creates an entity node with the given local name.
/// </summary>
/// <param name="localName">Local name, stored in the <c>localName</c> field.</param>
/// <param name="doc">Owning document, forwarded to the base constructor.</param>
public XmlEntity(string localName, XmlDocument doc)
  : base(doc)
{
  this.localName = localName;
}
/// <summary>
/// Creates a text node; both arguments are passed straight to the base constructor.
/// </summary>
/// <param name="data">Text content, forwarded to the base constructor.</param>
/// <param name="doc">Owning document, forwarded to the base constructor.</param>
public XmlText(string data, XmlDocument doc)
  : base(data, doc)
{
}
/// <summary>
/// Creates a character-data node holding the given string.
/// </summary>
/// <param name="data">Character data, stored in the <c>data</c> field.</param>
/// <param name="doc">Owning document, forwarded to the base constructor.</param>
public XmlCharacterData(string data, XmlDocument doc)
  : base(doc)
{
  this.data = data;
}
/// <summary>
/// Streams the Wikipedia XML dump named by <c>s_wikipediaXmlFile</c> and stores every page
/// as a <c>Document</c> (created from the text of its <c>title</c> element) plus a
/// <c>DocumentText</c> (created from the text of its <c>text</c> element).
/// </summary>
/// <remarks>
/// If the <c>IndexRoot</c> database can already be opened, the method only calls
/// <c>outputSomeInfo</c>, aborts the transaction and returns without importing anything.
/// Otherwise the whole import runs inside a single update transaction that is committed
/// after the reader reaches the end of the file. Progress is written to the console.
/// </remarks>
static void importEntireWikipedia()
{
  const ushort btreeNodeSize = 10000;
  Console.WriteLine(DateTime.Now.ToString() + ", start importing Wikipedia text");
  //System.Xml.Schema.XmlSchema docSchema;
  //using (System.Xml.XmlTextReader schemaReader = new System.Xml.XmlTextReader("c:\\export-0_5.xsd"))
  //{
  //  docSchema = System.Xml.Schema.XmlSchema.Read(schemaReader, ValidationCallBack);
  //}
  int docCount = 0;
  using (SessionNoServer session = new SessionNoServer(s_systemDir, 5000, false, false, CacheEnum.No)) // turn off page and object caching
  {
    Console.WriteLine("Running with databases in directory: " + session.SystemDirectory);
    //GCSettings.LatencyMode = GCLatencyMode.Batch;// try to keep the WeakIOptimizedPersistableReference objects around longer
    Placement documentPlacement = new Placement(Document.PlaceInDatabase, 1003, 1, 500, 1000, false, false, 1000, false);
    Placement contentPlacement = new Placement(Document.PlaceInDatabase, 1, 1, 500, UInt16.MaxValue, false, false, 1, false);
    // NOTE(review): these node locals are assigned below but never read in this method.
    // The allocations are kept as-is because not every constructor involved is visible here.
    XmlComment xmlComment;
    XmlElement xmlElement;
    XmlEntity xmlEntity;
    XmlText xmlText;
    XmlWhitespace xmlWhitespace;
    session.BeginUpdate();
    File.Copy(s_licenseDbFile, System.IO.Path.Combine(session.SystemDirectory, "4.odb"), true);
    // register all database schema classes used by the application in advance to avoid lock conflict later in parallel indexing
    session.RegisterClass(typeof(Repository));
    session.RegisterClass(typeof(IndexRoot));
    session.RegisterClass(typeof(Document));
    session.RegisterClass(typeof(Lexicon));
    session.RegisterClass(typeof(DocumentText));
    session.RegisterClass(typeof(Word));
    session.RegisterClass(typeof(WordGlobal));
    session.RegisterClass(typeof(WordHit));
    session.RegisterClass(typeof(BTreeSet<Document>));
    session.RegisterClass(typeof(OidShort));
    session.RegisterClass(typeof(BTreeMap<Word, WordHit>));
    session.RegisterClass(typeof(HashCodeComparer<Word>));
    session.RegisterClass(typeof(BTreeSetOidShort<Word>));
    session.RegisterClass(typeof(BTreeMapOidShort<Word, WordHit>));
    Database db = session.OpenDatabase(IndexRoot.PlaceInDatabase, false, false);
    if (db != null)
    {
      // The IndexRoot database already exists: report on it instead of importing again.
      outputSomeInfo(session);
      session.Abort();
      return;
    }
    session.NewDatabase(IndexRoot.PlaceInDatabase, 0, "IndexRoot");
    session.NewDatabase(Lexicon.PlaceInDatabase, 0, "Lexicon");
    session.NewDatabase(Repository.PlaceInDatabase, 0, "Repository");
    for (UInt32 i = 40; i <= 186; i++)
    {
      session.NewDatabase(i, 512, "Document"); // pre allocate 147 Document databases (numbers 40 through 186) presized to 512MB each
    }
    //session.SetTraceDbActivity(Lexicon.PlaceInDatabase);
    //session.SetTraceAllDbActivity();
    XmlDocument xmlDocument = new XmlDocument("enwiki-latest-pages-articles.xml");
    IndexRoot indexRoot = new IndexRoot(btreeNodeSize, session);
    indexRoot.Persist(session, indexRoot, true);
    Document doc = null;
    bool titleElement = false; // true while the reader is between <title> and </title>
    bool pageText = false;     // true while the reader is between <text> and </text>
    UInt32 currentDocumentDatabaseNum = documentPlacement.StartDatabaseNumber;
    // The dump is only read, so request read access explicitly; FileMode.Open alone asks for
    // read/write access and fails on read-only files.
    using (FileStream fs = new FileStream(s_wikipediaXmlFile, FileMode.Open, FileAccess.Read))
    //using (GZipStream zipStream = new GZipStream(fs, CompressionMode.Decompress)) // if input was a .gz file
    using (System.Xml.XmlTextReader textReader = new System.Xml.XmlTextReader(fs))
    {
      while (textReader.Read())
      {
        // Node types without a case below are intentionally ignored.
        switch (textReader.NodeType)
        {
          case System.Xml.XmlNodeType.Comment:
            xmlComment = new XmlComment(textReader.Value, xmlDocument);
            break;
          case System.Xml.XmlNodeType.Element:
            xmlElement = new XmlElement(textReader.Prefix, textReader.LocalName, textReader.NamespaceURI, xmlDocument);
            // A self-closing element (e.g. <text ... />) produces no EndElement node, so raising
            // a flag for it would leave the flag set and misattribute later text nodes.
            if (!textReader.IsEmptyElement)
            {
              if (textReader.LocalName == "title")
              {
                titleElement = true;
              }
              else if (textReader.LocalName == "text")
              {
                pageText = true;
              }
            }
            break;
          case System.Xml.XmlNodeType.EndElement:
            // Always clear the flag when its element closes, whether or not a document exists yet;
            // otherwise a flag raised before the first document could stay stuck.
            if (textReader.LocalName == "title")
            {
              titleElement = false;
            }
            else if (textReader.LocalName == "text")
            {
              pageText = false;
            }
            break;
          case System.Xml.XmlNodeType.Entity:
            xmlEntity = new XmlEntity(textReader.LocalName, xmlDocument);
            break;
          case System.Xml.XmlNodeType.Text:
            xmlText = new XmlText(textReader.Value, xmlDocument);
            if (titleElement)
            {
              // Title text starts a new document.
              doc = new Document(textReader.Value, indexRoot, session);
              doc.Persist(documentPlacement, session, true);
              if (doc.DatabaseNumber != currentDocumentDatabaseNum)
              {
                // The new document landed in a different database: flush the one just finished.
                session.FlushUpdates(session.OpenDatabase(currentDocumentDatabaseNum));
                Console.WriteLine("Database: " + currentDocumentDatabaseNum +" is completed, done importing article " + docCount + " number of lines: " + textReader.LineNumber);
                currentDocumentDatabaseNum = doc.DatabaseNumber;
              }
              //doc.Page.Database.Name = doc.Name;
            }
            else if (doc != null && pageText)
            {
#if DEBUGx
              Console.WriteLine(doc.Name + " line: " + textReader.LineNumber);
#endif
              //if (textReader.LineNumber > 1000000)
              //{
              //  session.Commit();
              //  return;
              //}
              DocumentText content = new DocumentText(textReader.Value, doc);
              // Re-target the content placement at the document's database when they differ
              // (the assert below checks that content and document end up together).
              if (doc.DatabaseNumber != contentPlacement.TryDatabaseNumber)
              {
                contentPlacement = new Placement(doc.DatabaseNumber, (ushort)contentPlacement.StartPageNumber, 1, contentPlacement.MaxObjectsPerPage, contentPlacement.MaxPagesPerDatabase, false, false, 1, false);
              }
              content.Persist(contentPlacement, session, false);
              Debug.Assert(content.DatabaseNumber == doc.DatabaseNumber);
              doc.Content = content;
              indexRoot.repository.documentSet.AddFast(doc);
              if (++docCount % 1000000 == 0)
              {
                //session.Commit(false); // skip recovery check, we do it in BeginUpdate which is enough
                Console.WriteLine("Done importing article " + docCount + " number of lines: " + textReader.LineNumber);
                //session.BeginUpdate();
              }
            }
            break;
          case System.Xml.XmlNodeType.Whitespace:
            xmlWhitespace = new XmlWhitespace(textReader.Value, xmlDocument);
            break;
        }
      }
      Console.WriteLine("Finished importing article " + docCount + " number of lines: " + textReader.LineNumber);
    }
    session.Commit();
  }
  Console.WriteLine(DateTime.Now.ToString() + ", done importing Wikipedia text");
}
/// <summary>
/// Creates a comment node; both arguments are passed straight to the base constructor.
/// </summary>
/// <param name="comment">Comment text, forwarded to the base constructor.</param>
/// <param name="doc">Owning document, forwarded to the base constructor.</param>
public XmlComment(string comment, XmlDocument doc)
  : base(comment, doc)
{
}