Inheritance: OptimizedPersistable
Exemplo n.º 1
0
 /// <summary>
 /// Creates a hit record whose position array holds exactly one entry.
 /// </summary>
 /// <param name="doc">Document the hit refers to; not read by this constructor body.</param>
 /// <param name="position">Word position stored as the first (and only) array element.</param>
 /// <param name="session">Session; not read by this constructor body.</param>
 public WordHit(Document doc, UInt64 position, SessionBase session)
 {
   wordPositionArray = new UInt64[] { position };
 }
Exemplo n.º 2
0
 /// <summary>
 /// Downloads the page at <paramref name="url"/>, joins the inner text of all of its
 /// text nodes (each prefixed by a single space) and passes the result to textToWords.
 /// </summary>
 /// <param name="url">Address of the page to download; also used as the document name.</param>
 /// <param name="indexRoot">Index root handed to the new Document and to textToWords.</param>
 /// <returns>The Document created for the page.</returns>
 public Document parseHtml(string url, IndexRoot indexRoot)
 {
   Document doc = new Document(url, indexRoot, session);
   Placement docPlacement = new Placement(Document.PlaceInDatabase);
   Placement docTextPlacement = new Placement(Document.PlaceInDatabase, 2);
   Placement wordPlacement = new Placement(Document.PlaceInDatabase, 3);
   Placement wordHitPlacement = new Placement(Document.PlaceInDatabase, 100);

   // The client is only needed for the download itself, so release it before parsing.
   string html;
   using (WebClient client = new WebClient())
   {
     html = client.DownloadString(url);
   }

   HtmlDocument htmlDoc = new HtmlDocument();
   htmlDoc.LoadHtml(html);

   // SelectNodes yields null (not an empty collection) when the XPath matches nothing,
   // so a page without text nodes must be handled explicitly.
   var textNodes = htmlDoc.DocumentNode.SelectNodes("//text()");

   // A builder avoids re-copying the accumulated text for every node.
   System.Text.StringBuilder pageBody = new System.Text.StringBuilder();
   if (textNodes != null)
   {
     foreach (HtmlNode node in textNodes)
       pageBody.Append(' ').Append(node.InnerText);
   }

   textToWords(doc, indexRoot, pageBody.ToString(), docPlacement, docTextPlacement, wordPlacement, wordHitPlacement);
   return doc;
 }
Exemplo n.º 3
0
 /// <summary>
 /// Reads the whole text file at <paramref name="url"/> and passes its content to textToWords.
 /// The document is named after the file name part of the path.
 /// </summary>
 /// <param name="url">Path of the text file to read.</param>
 /// <param name="indexRoot">Index root handed to the new Document and to textToWords.</param>
 /// <param name="docPlacement">
 /// Placement for the document itself; the text, word and word-hit placements are built from
 /// its TryDatabaseNumber and from TryPageNumber offset by 1, 2 and 10 respectively.
 /// </param>
 /// <returns>The Document created for the file.</returns>
 public Document parseTextFile(string url, IndexRoot indexRoot, Placement docPlacement)
 {
   string fileName = Path.GetFileName(url);
   Document doc = new Document(fileName, indexRoot, session);

   var dbNumber = docPlacement.TryDatabaseNumber;
   var basePage = docPlacement.TryPageNumber;
   Placement textPlacement = new Placement(dbNumber, (ushort)(basePage + 1));
   Placement termPlacement = new Placement(dbNumber, (ushort)(basePage + 2));
   Placement hitPlacement = new Placement(dbNumber, (ushort)(basePage + 10));

   using (StreamReader reader = new StreamReader(url))
   {
     string fileText = reader.ReadToEnd();
     textToWords(doc, indexRoot, fileText, docPlacement, textPlacement, termPlacement, hitPlacement);
   }
   return doc;
 }
Exemplo n.º 4
0
 /// <summary>
 /// Persists <paramref name="doc"/> and its text, registers the document in the repository's
 /// document set, then splits the lower-cased text into words and adds every accepted word
 /// to the document's local inverted index.
 /// </summary>
 /// <param name="doc">Document being indexed.</param>
 /// <param name="indexRoot">Index root whose repository document set receives the document.</param>
 /// <param name="docTextString">Raw document text; stored as given, tokenized in lower case.</param>
 /// <param name="documentPlacement">Placement used when persisting the document.</param>
 /// <param name="documentTextPlacement">Placement used when persisting the document text.</param>
 /// <param name="wordPlacement">Placement forwarded to createLocalInvertedIndex for words.</param>
 /// <param name="wordHitPlacement">Placement forwarded to createLocalInvertedIndex for word hits.</param>
 public void textToWords(Document doc, IndexRoot indexRoot, string docTextString, Placement documentPlacement,
   Placement documentTextPlacement, Placement wordPlacement, Placement wordHitPlacement)
 {
   DocumentText docText = new DocumentText(docTextString, doc);
   doc.Persist(documentPlacement, session);
   doc.Page.Database.Name = doc.Name;
   docText.Persist(documentTextPlacement, session);
   indexRoot.repository.documentSet.Add(doc);
   doc.Content = docText;

   string[] excludedWords = new string[] { "and", "the" };
   char[] splitChars = new char[] { ' ', '\n', '(', '"', '!', ',', ')', '\t' };
   char[] trimEndChars = new char[] { ';', '.', '"', ',', '\r', ':', ']', '!', '?', '+', '(', ')', '\'', '{', '}', '-', '`', '/', '=' };
   char[] trimStartChars = new char[] { ';', '&', '-', '#', '*', '[', '.', '"', ',', '\r', ')', '(', '\'', '{', '}', '`' };

   string[] words = docTextString.ToLower().Split(splitChars, StringSplitOptions.RemoveEmptyEntries);

   // wordCt numbers only the accepted words; rejected tokens do not consume a position.
   UInt64 wordCt = 0;
   foreach (string wordStr in words)
   {
     string aWord = wordStr.TrimEnd(trimEndChars).TrimStart(trimStartChars);

     // Skip single characters, empty leftovers and excluded words before allocating a Word.
     if (aWord.Length <= 1 || excludedWords.Contains(aWord))
       continue;

     createLocalInvertedIndex(doc, new Word(aWord), wordCt, wordPlacement, wordHitPlacement);
     ++wordCt;
   }
 }
Exemplo n.º 5
0
 /// <summary>
 /// Records one occurrence of <paramref name="word"/> at position <paramref name="wordCt"/>
 /// in the document's word set and word-hit map.
 /// </summary>
 /// <param name="doc">Document whose WordSet and WordHit map are updated.</param>
 /// <param name="word">Word to record; persisted and added to the set when not already present.</param>
 /// <param name="wordCt">Position of this occurrence. The caller owns and advances the counter.</param>
 /// <param name="wordPlacement">Placement used when persisting a new word.</param>
 /// <param name="wordHitPlacement">Placement assigned to the WordHit map's values before adding a new hit.</param>
 public void createLocalInvertedIndex(Document doc, Word word, UInt64 wordCt, Placement wordPlacement, Placement wordHitPlacement)
 {
   BTreeSetOidShort<Word> wordSet = doc.WordSet;

   // Known word: presumably TryGetKey swaps in the stored key via the ref argument
   // (TODO confirm); append the position to its existing hit list.
   if (wordSet.TryGetKey(word, ref word))
   {
     doc.WordHit[word].Add(wordCt);
     return;
   }

   // First occurrence: persist the word and start a hit list holding this position.
   word.Persist(wordPlacement, session);
   wordSet.Add(word);
   WordHit firstHit = new WordHit(doc, wordCt, session);
   doc.WordHit.ValuePlacement = wordHitPlacement;
   doc.WordHit.AddFast(word, firstHit);
 }
Exemplo n.º 6
0
    /// <summary>
    /// Streams the Wikipedia XML dump at s_wikipediaXmlFile and stores one Document plus its
    /// DocumentText per article (a "title" text node followed by a "text" text node).
    /// Registers the schema classes and creates the IndexRoot, Lexicon, Repository and Document
    /// databases first. If opening the IndexRoot database already returns a database, the method
    /// prints some info, aborts the transaction and returns without importing.
    /// </summary>
    static void importEntireWikipedia()
    {
      const ushort btreeNodeSize = 10000;
      Console.WriteLine(DateTime.Now.ToString() + ", start importing Wikipedia text");
      //System.Xml.Schema.XmlSchema docSchema;
      //using (System.Xml.XmlTextReader schemaReader = new System.Xml.XmlTextReader("c:\\export-0_5.xsd"))
      //{
      //  docSchema = System.Xml.Schema.XmlSchema.Read(schemaReader, ValidationCallBack);
      // }
      int docCount = 0;
      using (SessionNoServer session = new SessionNoServer(s_systemDir, 5000, false, false, CacheEnum.No)) // turn off page and object caching
      {
        Console.WriteLine("Running with databases in directory: " + session.SystemDirectory);
        //GCSettings.LatencyMode = GCLatencyMode.Batch;// try to keep the WeakIOptimizedPersistableReference objects around longer
        Placement documentPlacement = new Placement(Document.PlaceInDatabase, 1003, 1, 500, 1000, false, false, 1000, false);
        Placement contentPlacement = new Placement(Document.PlaceInDatabase, 1, 1, 500, UInt16.MaxValue, false, false, 1, false);
        session.BeginUpdate();
        File.Copy(s_licenseDbFile, System.IO.Path.Combine(session.SystemDirectory, "4.odb"), true);
        // register all database schema classes used by the application in advance to avoid lock conflict later in parallel indexing
        session.RegisterClass(typeof(Repository));
        session.RegisterClass(typeof(IndexRoot));
        session.RegisterClass(typeof(Document));
        session.RegisterClass(typeof(Lexicon));
        session.RegisterClass(typeof(DocumentText));
        session.RegisterClass(typeof(Word));
        session.RegisterClass(typeof(WordGlobal));
        session.RegisterClass(typeof(WordHit));
        session.RegisterClass(typeof(BTreeSet<Document>));
        session.RegisterClass(typeof(OidShort));
        session.RegisterClass(typeof(BTreeMap<Word, WordHit>));
        session.RegisterClass(typeof(HashCodeComparer<Word>));
        session.RegisterClass(typeof(BTreeSetOidShort<Word>));
        session.RegisterClass(typeof(BTreeMapOidShort<Word, WordHit>));
        // An existing IndexRoot database means an import was already done: report and bail out.
        Database db = session.OpenDatabase(IndexRoot.PlaceInDatabase, false, false);
        if (db != null)
        {
          outputSomeInfo(session);
          session.Abort();
          return;
        }
        session.NewDatabase(IndexRoot.PlaceInDatabase, 0, "IndexRoot");
        session.NewDatabase(Lexicon.PlaceInDatabase, 0, "Lexicon");
        session.NewDatabase(Repository.PlaceInDatabase, 0, "Repository");
        for (UInt32 i = 40; i <= 186; i++)
        {
          session.NewDatabase(i, 512, "Document"); // pre allocate 147 Document databases (numbers 40 through 186), created with size argument 512
        }
        //session.SetTraceDbActivity(Lexicon.PlaceInDatabase);
        //session.SetTraceAllDbActivity();
        XmlDocument xmlDocument = new XmlDocument("enwiki-latest-pages-articles.xml");
        IndexRoot indexRoot = new IndexRoot(btreeNodeSize, session);
        indexRoot.Persist(session, indexRoot, true);
        Document doc = null;
        bool titleElement = false; // true while inside a <title> element
        bool pageText = false;     // true while inside a <text> element
        UInt32 currentDocumentDatabaseNum = documentPlacement.StartDatabaseNumber;
        // The dump is only read, so request read access; the two-argument FileStream constructor
        // would ask for read/write access and fail on a read-only file.
        using (FileStream fs = new FileStream(s_wikipediaXmlFile, FileMode.Open, FileAccess.Read))
        {
          //using (GZipStream zipStream = new GZipStream(fs, CompressionMode.Decompress)) // if input was a .gz file
          using (System.Xml.XmlTextReader textReader = new System.Xml.XmlTextReader(fs))
          {
            while (textReader.Read())
            {
              // Node types without a case below are intentionally ignored.
              // NOTE(review): the Xml* objects are constructed and discarded; presumably their
              // constructors attach them to xmlDocument - confirm before removing.
              switch (textReader.NodeType)
              {
                case System.Xml.XmlNodeType.Comment:
                  new XmlComment(textReader.Value, xmlDocument);
                  break;
                case System.Xml.XmlNodeType.Element:
                  new XmlElement(textReader.Prefix, textReader.LocalName, textReader.NamespaceURI, xmlDocument);
                  if (textReader.LocalName == "title")
                    titleElement = true;
                  else if (textReader.LocalName == "text")
                    pageText = true;
                  break;
                case System.Xml.XmlNodeType.EndElement:
                  if (textReader.LocalName == "title" && doc != null)
                    titleElement = false;
                  else if (textReader.LocalName == "text" && doc != null)
                    pageText = false;
                  break;
                case System.Xml.XmlNodeType.Entity:
                  new XmlEntity(textReader.LocalName, xmlDocument);
                  break;
                case System.Xml.XmlNodeType.Text:
                  new XmlText(textReader.Value, xmlDocument);
                  if (titleElement)
                  {
                    // A title starts a new article: create and persist its Document.
                    doc = new Document(textReader.Value, indexRoot, session);
                    doc.Persist(documentPlacement, session, true);
                    if (doc.DatabaseNumber != currentDocumentDatabaseNum)
                    {
                      // Placement moved on to another database: flush the one just finished.
                      session.FlushUpdates(session.OpenDatabase(currentDocumentDatabaseNum));
                      Console.WriteLine("Database: " + currentDocumentDatabaseNum +" is completed, done importing article " + docCount + " number of lines: " + textReader.LineNumber);
                      currentDocumentDatabaseNum = doc.DatabaseNumber;
                    }
                    //doc.Page.Database.Name = doc.Name;
                  }
                  else if (doc != null && pageText)
                  {
#if DEBUGx
                    Console.WriteLine(doc.Name + " line: " + textReader.LineNumber);
#endif
                    //if (textReader.LineNumber > 1000000)
                    //{
                    //  session.Commit();
                    //  return;
                    //}
                    DocumentText content = new DocumentText(textReader.Value, doc);
                    // Keep the article text in the same database as its Document.
                    if (doc.DatabaseNumber != contentPlacement.TryDatabaseNumber)
                      contentPlacement = new Placement(doc.DatabaseNumber, (ushort)contentPlacement.StartPageNumber, 1, contentPlacement.MaxObjectsPerPage, contentPlacement.MaxPagesPerDatabase, false, false, 1, false);
                    content.Persist(contentPlacement, session, false);
                    Debug.Assert(content.DatabaseNumber == doc.DatabaseNumber);
                    doc.Content = content;
                    indexRoot.repository.documentSet.AddFast(doc);
                    if (++docCount % 1000000 == 0)
                    {
                      //session.Commit(false); // skip recovery check, we do it in BeginUpdate which is enough
                      Console.WriteLine("Done importing article " + docCount + " number of lines: " + textReader.LineNumber);
                      //session.BeginUpdate();
                    }
                  }
                  break;
                case System.Xml.XmlNodeType.Whitespace:
                  new XmlWhitespace(textReader.Value, xmlDocument);
                  break;
              }
            }
            Console.WriteLine("Finished importing article " + docCount + " number of lines: " + textReader.LineNumber);
          }
        }
        session.Commit();
      }
      Console.WriteLine(DateTime.Now.ToString() + ", done importing Wikipedia text");
    }
Exemplo n.º 7
0
 /// <summary>
 /// Builds the per-document word set and word-hit map for every document of
 /// <paramref name="documentSet"/> that lives in database <paramref name="db"/>, then flushes
 /// the database's updates and clears its cached objects.
 /// Only the first occurrence of each word in a document is recorded.
 /// </summary>
 /// <param name="session">Session used to open documents and persist words.</param>
 /// <param name="db">Database whose documents are indexed.</param>
 /// <param name="documentSet">Set of all documents; iteration starts at the position given by GoTo for this database.</param>
 static void createDocumentInvertedIndex(SessionBase session, Database db, BTreeSet<Document> documentSet)
 {
   UInt32 dbNum = db.DatabaseNumber;
   Document inputDoc = new Document(db.Id);
   Placement wordPlacement = new Placement(inputDoc.DatabaseNumber, 20000, 1, 25000, 65000, true, false, 1, false);
   Placement wordHitPlacement = new Placement(inputDoc.DatabaseNumber, 40000, 1, 25000, 65500, true, false, 1, false);
   //session.SetTraceDbActivity(db.DatabaseNumber);
   BTreeSetIterator<Document> iterator = documentSet.Iterator();
   iterator.GoTo(inputDoc);
   inputDoc = iterator.Current();
   while (inputDoc != null && inputDoc.Page.Database.DatabaseNumber == dbNum)
   {
     Document doc = (Document)session.Open(inputDoc.Page.Database, inputDoc.Id); // if matching database is available, use it to speed up lookup
     // NOTE(review): ToLower() is culture-sensitive while the pattern only matches a-z; confirm
     // the lookup side lower-cases the same way before switching to ToLowerInvariant().
     string text = doc.Content.Text.ToLower();
     MatchCollection tagMatches = Regex.Matches(text, "[a-z][a-z.$]+");
     UInt64 wordCt = 0; // position of the current token within this document
     if (++s_docCountIndexed % 50000 == 0)
       Console.WriteLine(DateTime.Now.ToString() + ", done indexing article: " + s_docCountIndexed);
     BTreeSetOidShort<Word> wordSet = doc.WordSet;
     foreach (Match m in tagMatches)
     {
       Word word = new Word(m.Value);
       if (!wordSet.TryGetKey(word, ref word))
       {
         // First occurrence in this document: persist the word and record where it was seen.
         word = new Word(m.Value);
         word.Persist(wordPlacement, session);
         wordSet.Add(word);
         WordHit wordHit = new WordHit(doc, wordCt, session);
         //wordHit.Persist(wordHitPlacement, session);
         doc.WordHit.ValuePlacement = wordHitPlacement;
         doc.WordHit.AddFast(word, wordHit);
       }
       // Repeat occurrences are not recorded: looking up doc.WordHit[word] and adding the
       // position was judged too costly here - figure out a better way?

       // Advance for every token (not only for new words) so the stored value is the token's
       // position in the document, as it is in textToWords/createLocalInvertedIndex.
       ++wordCt;
     }
     inputDoc = iterator.Next();
   }
   session.FlushUpdates(db);
   session.ClearCachedObjects(db); // free up memory for objects we no longer need to have cached
   Console.WriteLine(DateTime.Now.ToString() + ", done indexing article: " + s_docCountIndexed + " Database: " + dbNum + " is completed.");
 }
Exemplo n.º 8
0
 /// <summary>
 /// Creates the text object for a document.
 /// </summary>
 /// <param name="documentText">Text to store.</param>
 /// <param name="doc">Owning document; its Id, cast to uint, is stored as the short id.</param>
 public DocumentText(string documentText, Document doc)
 {
   this.documentText = documentText;
   uint ownerShortId = (uint)doc.Id;
   this.documentShortId = ownerShortId;
 }