/// <summary>
/// Creates a hit list that initially holds a single word position.
/// </summary>
/// <param name="doc">Document the hit refers to; not read by this constructor.</param>
/// <param name="position">Position of the first recorded occurrence.</param>
/// <param name="session">Active session; not read by this constructor.</param>
public WordHit(Document doc, UInt64 position, SessionBase session)
{
    // One-element array seeded with the first occurrence.
    wordPositionArray = new UInt64[] { position };
}
/// <summary>
/// Downloads the page at <paramref name="url"/>, joins the inner text of all of its
/// text nodes (each preceded by a single space) and indexes the result via
/// <c>textToWords</c>.
/// </summary>
/// <param name="url">Address of the HTML page; also used as the document name.</param>
/// <param name="indexRoot">Index root the new document is registered with.</param>
/// <returns>The newly created document.</returns>
public Document parseHtml(string url, IndexRoot indexRoot)
{
    Document doc = new Document(url, indexRoot, session);
    Placement docPlacement = new Placement(Document.PlaceInDatabase);
    Placement docTextPlacement = new Placement(Document.PlaceInDatabase, 2);
    Placement wordPlacement = new Placement(Document.PlaceInDatabase, 3);
    Placement wordHitPlacement = new Placement(Document.PlaceInDatabase, 100);
    using (WebClient client = new WebClient())
    {
        string html = client.DownloadString(url);
        HtmlDocument htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(html);

        // StringBuilder avoids re-copying the whole body for every text node.
        System.Text.StringBuilder pageBody = new System.Text.StringBuilder();

        // SelectNodes returns null (not an empty collection) when the XPath matches
        // nothing, e.g. for an empty page; treat that as an empty body.
        var textNodes = htmlDoc.DocumentNode.SelectNodes("//text()");
        if (textNodes != null)
        {
            foreach (HtmlNode node in textNodes)
            {
                pageBody.Append(' ').Append(node.InnerText);
            }
        }

        textToWords(doc, indexRoot, pageBody.ToString(), docPlacement, docTextPlacement, wordPlacement, wordHitPlacement);
    }
    return doc;
}
/// <summary>
/// Reads the whole text file at <paramref name="url"/> and indexes its words via
/// <c>textToWords</c>. The document is named after the file name part of the path.
/// </summary>
/// <param name="url">Path of the text file to read.</param>
/// <param name="indexRoot">Index root the new document is registered with.</param>
/// <param name="docPlacement">
/// Placement for the document itself. The document text, words and word hits are placed
/// in the same database at page offsets +1, +2 and +10 from this placement's page.
/// </param>
/// <returns>The newly created document.</returns>
public Document parseTextFile(string url, IndexRoot indexRoot, Placement docPlacement)
{
    Document textDocument = new Document(Path.GetFileName(url), indexRoot, session);

    // Companion objects live in the document's database, at fixed page offsets.
    var targetDatabase = docPlacement.TryDatabaseNumber;
    var basePage = docPlacement.TryPageNumber;
    Placement textPlacement = new Placement(targetDatabase, (ushort)(basePage + 1));
    Placement wordsPlacement = new Placement(targetDatabase, (ushort)(basePage + 2));
    Placement hitsPlacement = new Placement(targetDatabase, (ushort)(basePage + 10));

    using (StreamReader fileReader = new StreamReader(url))
    {
        string fileText = fileReader.ReadToEnd();
        textToWords(textDocument, indexRoot, fileText, docPlacement, textPlacement, wordsPlacement, hitsPlacement);
    }
    return textDocument;
}
/// <summary>
/// Persists <paramref name="doc"/> together with its text, registers the document in the
/// repository's document set and builds the document-local inverted index from the
/// lower-cased text.
/// </summary>
/// <param name="doc">Document to persist and index.</param>
/// <param name="indexRoot">Index root whose repository document set receives the document.</param>
/// <param name="docTextString">Raw document text.</param>
/// <param name="documentPlacement">Placement used when persisting the document.</param>
/// <param name="documentTextPlacement">Placement used when persisting the document text.</param>
/// <param name="wordPlacement">Placement passed on for newly created words.</param>
/// <param name="wordHitPlacement">Placement passed on for word hits.</param>
public void textToWords(Document doc, IndexRoot indexRoot, string docTextString, Placement documentPlacement, Placement documentTextPlacement, Placement wordPlacement, Placement wordHitPlacement)
{
    doc.Persist(documentPlacement, session);
    doc.Page.Database.Name = doc.Name;

    // The DocumentText constructor captures doc.Id, so build it only after the document
    // has been persisted, the same order importEntireWikipedia uses.
    // NOTE(review): presumably the id is only assigned by Persist; confirm.
    DocumentText docText = new DocumentText(docTextString, doc);
    docText.Persist(documentTextPlacement, session);
    indexRoot.repository.documentSet.Add(doc);
    doc.Content = docText;

    docTextString = docTextString.ToLower();
    string[] excludedWords = new string[] { "and", "the" };
    char[] splitChars = new char[] { ' ', '\n', '(', '"', '!', ',', ')', '\t' };
    char[] trimEndChars = new char[] { ';', '.', '"', ',', '\r', ':', ']', '!', '?', '+', '(', ')', '\'', '{', '}', '-', '`', '/', '=' };
    char[] trimStartChars = new char[] { ';', '&', '-', '#', '*', '[', '.', '"', ',', '\r', ')', '(', '\'', '{', '}', '-', '`' };
    string[] words = docTextString.Split(splitChars, StringSplitOptions.RemoveEmptyEntries);

    // Counts accepted words only; it is the position handed to the index.
    UInt64 wordCt = 0;
    foreach (string wordStr in words)
    {
        string aWord = wordStr.TrimEnd(trimEndChars).TrimStart(trimStartChars);

        // Skip one-character leftovers and excluded words before allocating a Word.
        if (aWord.Length <= 1 || excludedWords.Contains(aWord))
        {
            continue;
        }

        createLocalInvertedIndex(doc, new Word(aWord), wordCt, wordPlacement, wordHitPlacement);
        ++wordCt;
    }
}
/// <summary>
/// Records one occurrence of <paramref name="word"/> in the document-local index of
/// <paramref name="doc"/>: appends the position to the existing hit list when the word is
/// already in the document's word set, otherwise persists the word and starts a new hit list.
/// </summary>
/// <param name="doc">Document whose word set and word-hit map are updated.</param>
/// <param name="word">Word to record; replaced by the stored key when one is found.</param>
/// <param name="wordCt">Position to record. Passed by value, so the caller's counter is not changed here.</param>
/// <param name="wordPlacement">Placement used when a new word is persisted.</param>
/// <param name="wordHitPlacement">Value placement assigned to the word-hit map before a new entry is added.</param>
public void createLocalInvertedIndex(Document doc, Word word, UInt64 wordCt, Placement wordPlacement, Placement wordHitPlacement)
{
    BTreeSetOidShort<Word> knownWords = doc.WordSet;

    // Already indexed for this document: just append the position.
    if (knownWords.TryGetKey(word, ref word))
    {
        WordHit existingHit = doc.WordHit[word];
        existingHit.Add(wordCt);
        return;
    }

    // First occurrence: persist the word and start its hit list.
    word.Persist(wordPlacement, session);
    knownWords.Add(word);
    WordHit firstHit = new WordHit(doc, wordCt, session);
    doc.WordHit.ValuePlacement = wordHitPlacement;
    doc.WordHit.AddFast(word, firstHit);
}
/// <summary>
/// Imports article titles and texts from the Wikipedia XML file at
/// <c>s_wikipediaXmlFile</c> into the databases under <c>s_systemDir</c>.
/// </summary>
/// <remarks>
/// If the IndexRoot database can already be opened, the method only prints some
/// information via <c>outputSomeInfo</c>, aborts the transaction and returns without importing.
/// Otherwise it creates the IndexRoot, Lexicon, Repository and Document databases,
/// streams the XML, creates one <c>Document</c> per title text node and attaches the
/// following page text as its <c>DocumentText</c>. Everything is committed once at the end.
/// </remarks>
static void importEntireWikipedia()
{
    const ushort btreeNodeSize = 10000;
    Console.WriteLine(DateTime.Now.ToString() + ", start importing Wikipedia text");
    //System.Xml.Schema.XmlSchema docSchema;
    //using (System.Xml.XmlTextReader schemaReader = new System.Xml.XmlTextReader("c:\\export-0_5.xsd"))
    //{
    //  docSchema = System.Xml.Schema.XmlSchema.Read(schemaReader, ValidationCallBack);
    //}
    int docCount = 0;
    using (SessionNoServer session = new SessionNoServer(s_systemDir, 5000, false, false, CacheEnum.No)) // turn off page and object caching
    {
        Console.WriteLine("Running with databases in directory: " + session.SystemDirectory);
        //GCSettings.LatencyMode = GCLatencyMode.Batch;// try to keep the WeakIOptimizedPersistableReference objects around longer
        Placement documentPlacement = new Placement(Document.PlaceInDatabase, 1003, 1, 500, 1000, false, false, 1000, false);
        Placement contentPlacement = new Placement(Document.PlaceInDatabase, 1, 1, 500, UInt16.MaxValue, false, false, 1, false);

        // These locals are only assigned, never read. The node objects are still constructed
        // because their constructors take xmlDocument.
        // NOTE(review): confirm whether construction has side effects on xmlDocument before removing.
        XmlComment xmlComment;
        XmlElement xmlElement;
        XmlEntity xmlEntity;
        XmlText xmlText;
        XmlWhitespace xmlWhitespace;

        session.BeginUpdate();
        File.Copy(s_licenseDbFile, System.IO.Path.Combine(session.SystemDirectory, "4.odb"), true);

        // register all database schema classes used by the application in advance to avoid lock conflict later in parallel indexing
        session.RegisterClass(typeof(Repository));
        session.RegisterClass(typeof(IndexRoot));
        session.RegisterClass(typeof(Document));
        session.RegisterClass(typeof(Lexicon));
        session.RegisterClass(typeof(DocumentText));
        session.RegisterClass(typeof(Word));
        session.RegisterClass(typeof(WordGlobal));
        session.RegisterClass(typeof(WordHit));
        session.RegisterClass(typeof(BTreeSet<Document>));
        session.RegisterClass(typeof(OidShort));
        session.RegisterClass(typeof(BTreeMap<Word, WordHit>));
        session.RegisterClass(typeof(HashCodeComparer<Word>));
        session.RegisterClass(typeof(BTreeSetOidShort<Word>));
        session.RegisterClass(typeof(BTreeMapOidShort<Word, WordHit>));

        // An existing IndexRoot database means the import already ran: report and bail out.
        Database db = session.OpenDatabase(IndexRoot.PlaceInDatabase, false, false);
        if (db != null)
        {
            outputSomeInfo(session);
            session.Abort();
            return;
        }

        session.NewDatabase(IndexRoot.PlaceInDatabase, 0, "IndexRoot");
        session.NewDatabase(Lexicon.PlaceInDatabase, 0, "Lexicon");
        session.NewDatabase(Repository.PlaceInDatabase, 0, "Repository");
        for (UInt32 i = 40; i <= 186; i++)
        {
            // Pre-allocates Document databases 40 through 186 (147 databases); the original
            // note says each is presized to 512MB.
            session.NewDatabase(i, 512, "Document");
        }
        //session.SetTraceDbActivity(Lexicon.PlaceInDatabase);
        //session.SetTraceAllDbActivity();

        XmlDocument xmlDocument = new XmlDocument("enwiki-latest-pages-articles.xml");
        IndexRoot indexRoot = new IndexRoot(btreeNodeSize, session);
        indexRoot.Persist(session, indexRoot, true);

        Document doc = null;        // document created from the most recent title
        bool titleElement = false;  // true while inside <title>...</title>
        bool pageText = false;      // true while inside <text>...</text>
        UInt32 currentDocumentDatabaseNum = documentPlacement.StartDatabaseNumber;

        using (FileStream fs = new FileStream(s_wikipediaXmlFile, FileMode.Open))
        {
            //using (GZipStream zipStream = new GZipStream(fs, CompressionMode.Decompress)) // if input was a .gz file
            {
                using (System.Xml.XmlTextReader textReader = new System.Xml.XmlTextReader(fs))
                {
                    while (textReader.Read())
                    {
                        switch (textReader.NodeType)
                        {
                            case System.Xml.XmlNodeType.Comment:
                                xmlComment = new XmlComment(textReader.Value, xmlDocument);
                                break;

                            case System.Xml.XmlNodeType.Element:
                                xmlElement = new XmlElement(textReader.Prefix, textReader.LocalName, textReader.NamespaceURI, xmlDocument);
                                // A self-closing element (e.g. <text />) is never followed by an
                                // EndElement node, so raising a flag for it would leave the flag
                                // set and later, unrelated text nodes would be taken as a title
                                // or as article content.
                                if (!textReader.IsEmptyElement)
                                {
                                    if (textReader.LocalName == "title")
                                    {
                                        titleElement = true;
                                    }
                                    else if (textReader.LocalName == "text")
                                    {
                                        pageText = true;
                                    }
                                }
                                break;

                            case System.Xml.XmlNodeType.EndElement:
                                // Always clear the flag when the element closes, even if no
                                // document was created for it (e.g. an empty <title></title>).
                                if (textReader.LocalName == "title")
                                {
                                    titleElement = false;
                                }
                                else if (textReader.LocalName == "text")
                                {
                                    pageText = false;
                                }
                                break;

                            case System.Xml.XmlNodeType.Entity:
                                xmlEntity = new XmlEntity(textReader.LocalName, xmlDocument);
                                break;

                            case System.Xml.XmlNodeType.Text:
                                xmlText = new XmlText(textReader.Value, xmlDocument);
                                if (titleElement)
                                {
                                    doc = new Document(textReader.Value, indexRoot, session);
                                    doc.Persist(documentPlacement, session, true);
                                    if (doc.DatabaseNumber != currentDocumentDatabaseNum)
                                    {
                                        // Placement moved on to another database: flush the one just filled.
                                        session.FlushUpdates(session.OpenDatabase(currentDocumentDatabaseNum));
                                        Console.WriteLine("Database: " + currentDocumentDatabaseNum + " is completed, done importing article " + docCount + " number of lines: " + textReader.LineNumber);
                                        currentDocumentDatabaseNum = doc.DatabaseNumber;
                                    }
                                    //doc.Page.Database.Name = doc.Name;
                                }
                                else if (doc != null && pageText)
                                {
#if DEBUGx
                                    Console.WriteLine(doc.Name + " line: " + textReader.LineNumber);
#endif
                                    //if (textReader.LineNumber > 1000000)
                                    //{
                                    //  session.Commit();
                                    //  return;
                                    //}
                                    DocumentText content = new DocumentText(textReader.Value, doc);

                                    // Keep the content in the same database as its document.
                                    if (doc.DatabaseNumber != contentPlacement.TryDatabaseNumber)
                                    {
                                        contentPlacement = new Placement(doc.DatabaseNumber, (ushort)contentPlacement.StartPageNumber, 1, contentPlacement.MaxObjectsPerPage, contentPlacement.MaxPagesPerDatabase, false, false, 1, false);
                                    }
                                    content.Persist(contentPlacement, session, false);
                                    Debug.Assert(content.DatabaseNumber == doc.DatabaseNumber);
                                    doc.Content = content;
                                    indexRoot.repository.documentSet.AddFast(doc);
                                    if (++docCount % 1000000 == 0)
                                    {
                                        //session.Commit(false); // skip recovery check, we do it in BeginUpdate which is enough
                                        Console.WriteLine("Done importing article " + docCount + " number of lines: " + textReader.LineNumber);
                                        //session.BeginUpdate();
                                    }
                                }
                                break;

                            case System.Xml.XmlNodeType.Whitespace:
                                xmlWhitespace = new XmlWhitespace(textReader.Value, xmlDocument);
                                break;

                            default:
                                // All remaining node types are ignored.
                                break;
                        }
                    }
                    Console.WriteLine("Finished importing article " + docCount + " number of lines: " + textReader.LineNumber);
                }
            }
        }
        session.Commit();
    }
    Console.WriteLine(DateTime.Now.ToString() + ", done importing Wikipedia text");
}
/// <summary>
/// Builds the document-local inverted index for every document of
/// <paramref name="documentSet"/> that is stored in database <paramref name="db"/>,
/// then flushes that database and clears its cached objects.
/// </summary>
/// <remarks>
/// Only the first occurrence of each word per document is recorded; repeated occurrences
/// are skipped (the original note calls adding them too costly for now). The recorded
/// position is the zero-based index of the match within the document's matches.
/// </remarks>
/// <param name="session">Session used to open documents and persist words.</param>
/// <param name="db">Database whose documents are indexed.</param>
/// <param name="documentSet">Set of all documents; iteration starts at the first document at or after <c>db.Id</c>.</param>
static void createDocumentInvertedIndex(SessionBase session, Database db, BTreeSet<Document> documentSet)
{
    UInt32 dbNum = db.DatabaseNumber;
    Document inputDoc = new Document(db.Id);
    Placement wordPlacement = new Placement(inputDoc.DatabaseNumber, 20000, 1, 25000, 65000, true, false, 1, false);
    Placement wordHitPlacement = new Placement(inputDoc.DatabaseNumber, 40000, 1, 25000, 65500, true, false, 1, false);
    //session.SetTraceDbActivity(db.DatabaseNumber);
    BTreeSetIterator<Document> iterator = documentSet.Iterator();
    iterator.GoTo(inputDoc);
    inputDoc = iterator.Current();
    while (inputDoc != null && inputDoc.Page.Database.DatabaseNumber == dbNum)
    {
        // if matching database is available, use it to speed up lookup
        Document doc = (Document)session.Open(inputDoc.Page.Database, inputDoc.Id);
        DocumentText docText = doc.Content;
        string text = docText.Text.ToLower();
        MatchCollection tagMatches = Regex.Matches(text, "[a-z][a-z.$]+");
        if (++s_docCountIndexed % 50000 == 0)
        {
            Console.WriteLine(DateTime.Now.ToString() + ", done indexing article: " + s_docCountIndexed);
        }

        BTreeSetOidShort<Word> wordSet = doc.WordSet;
        UInt64 wordCt = 0;
        foreach (Match m in tagMatches)
        {
            // Advance the position for every match, so that a first occurrence is stored with
            // its real offset in the match sequence (as textToWords does) rather than with
            // the number of distinct words seen so far.
            UInt64 position = wordCt++;

            Word word = new Word(m.Value);
            if (wordSet.TryGetKey(word, ref word))
            {
                //wordHit = doc.WordHit[word]; // too costly to add right now - figure out a better way ?
                //wordHit.Add(position);
                continue;
            }

            // Fresh instance, as in the original: the ref argument above may have been overwritten.
            word = new Word(m.Value);
            word.Persist(wordPlacement, session);
            wordSet.Add(word);
            WordHit wordHit = new WordHit(doc, position, session);
            //wordHit.Persist(wordHitPlacement, session);
            doc.WordHit.ValuePlacement = wordHitPlacement;
            doc.WordHit.AddFast(word, wordHit);
        }
        inputDoc = iterator.Next();
    }
    session.FlushUpdates(db);
    session.ClearCachedObjects(db); // free up memory for objects we no longer need to have cached
    Console.WriteLine(DateTime.Now.ToString() + ", done indexing article: " + s_docCountIndexed + " Database: " + dbNum + " is completed.");
}
/// <summary>
/// Creates the text object for a document, keeping the text and the owning document's
/// id truncated to 32 bits.
/// </summary>
/// <param name="documentText">Full text of the document.</param>
/// <param name="doc">Owning document; its <c>Id</c> is read at construction time.</param>
public DocumentText(string documentText, Document doc)
{
    // The id is captured now, so the value stored is whatever doc.Id is at this moment.
    documentShortId = (uint)doc.Id;
    this.documentText = documentText;
}