Пример #1
0
 /// <summary>
 /// Builds the inverted index over all databases that hold documents, running a
 /// bounded-parallelism loop (max 2 workers) over every open database in one shared session.
 /// </summary>
 static void createInvertedIndex()
 {
     using (SessionNoServer session = new SessionNoServer(s_systemDir))
     {
         session.BeginUpdate();
         session.EnableAutoPageFlush = false; // so that threads don't stomp on each other
         Console.WriteLine(DateTime.Now.ToString() + ", start creating inverted index");
         ParallelOptions pOptions = new ParallelOptions();
         pOptions.MaxDegreeOfParallelism = 2; // set to what is appropriate for your computer (cores & memory size)
         //pOptions.MaxDegreeOfParallelism = 1; // appears to work best with only 16GB of memory
         // open the persisted index root at its well-known Oid (database/page/slot = PlaceInDatabase/1/1)
         IndexRoot           indexRoot   = (IndexRoot)session.Open(Oid.Encode(IndexRoot.PlaceInDatabase, 1, 1));
         BTreeSet <Document> documentSet = indexRoot.repository.documentSet;
         List <Database>     dbs         = session.OpenAllDatabases(true);
         // NOTE(review): one session object is shared by all loop workers; thread safety here
         // appears to rely on EnableAutoPageFlush = false above — confirm against the engine docs.
         Parallel.ForEach <Database>(dbs, pOptions,
                                     (Database db, ParallelLoopState loop) => // method invoked by the loop on each iteration
         {
             // only databases numbered at/after the document placement range contain documents
             if (db.DatabaseNumber >= Document.PlaceInDatabase)
             {
                 createDocumentInvertedIndex(session, db, documentSet);
             }
         });
         session.Commit();
         Console.WriteLine(DateTime.Now.ToString() + ", done creating inverted index");
     }
 }
Пример #2
0
        /// <summary>
        /// Folds every not-yet-indexed document's local word hits into the global lexicon:
        /// accumulates each word's global count, persisting new WordGlobal entries as needed,
        /// and records the document hit; each processed document is flagged Indexed.
        /// </summary>
        /// <param name="indexRoot">index root providing the lexicon and document repository</param>
        public void createGlobalInvertedIndex(IndexRoot indexRoot)
        {
            Placement wordPlacement = new Placement(Lexicon.PlaceInDatabase, 2);
            BTreeSetOidShort <Word> lexiconWords = indexRoot.lexicon.WordSet;
            Word found = null;

            foreach (Document document in indexRoot.repository.documentSet)
            {
                if (document.Indexed)
                {
                    continue; // already folded into the global index
                }
                foreach (Word token in document.WordSet)
                {
                    WordHit hit = document.WordHit[token];
                    if (lexiconWords.TryGetKey(token, ref found))
                    {
                        found.GlobalCount += (uint)hit.Count;
                    }
                    else
                    {
                        found = new WordGlobal(token.aWord, session, (uint)hit.Count);
                        found.Persist(wordPlacement, session);
                        lexiconWords.Add(found);
                    }
                    found.DocumentHit.AddFast(document);
                }
                document.Indexed = true;
            }
        }
Пример #3
0
        /// <summary>
        /// Persists a document and its text, then tokenizes the (lower-cased) text and
        /// adds each accepted token to the lexicon.
        /// </summary>
        /// <param name="doc">document being indexed; persisted by this method</param>
        /// <param name="indexRoot">index root providing the repository and lexicon</param>
        /// <param name="docTextString">the raw document text</param>
        public void textToWords(Document doc, IndexRoot indexRoot, string docTextString)
        {
            DocumentText docText = new DocumentText(docTextString, doc);

            session.Persist(doc);
            doc.Page.Database.Name = doc.Name;
            session.Persist(docText);
            indexRoot.Repository.DocumentSet.Add(doc);
            doc.Content   = docText;
            docTextString = docTextString.ToLower();
            string[] excludedWords = new string[] { "and", "the" };
            // fix: removed the duplicate '(' the original listed twice; Split ignores duplicate
            // separators, so the split result is unchanged
            char[]   splitChars = new char[] { ' ', '\n', '(', '"', '!', ',', ')', '\t' };
            string[] words      = docTextString.Split(splitChars, StringSplitOptions.RemoveEmptyEntries);

            char[] trimEndChars   = new char[] { ';', '.', '"', ',', '\r', ':', ']', '!', '?', '+', '(', ')', '\'', '{', '}', '-', '`', '/', '=' };
            char[] trimStartChars = new char[] { ';', '&', '-', '#', '*', '[', '.', '"', ',', '\r', ')', '(', '\'', '{', '}', '-', '`' };
            // fix: dropped the unused loop counter `i` the original incremented but never read
            foreach (string wordStr in words)
            {
                // strip punctuation from both ends before filtering
                string aWord = wordStr.TrimEnd(trimEndChars).TrimStart(trimStartChars);
                // skip single characters and stop words
                if (aWord.Length > 1 && excludedWords.Contains(aWord) == false)
                {
                    indexRoot.Lexicon.PossiblyAddToken(aWord, doc);
                }
            }
        }
Пример #4
0
        /// <summary>
        /// Persists a document and its text using explicit placements, then tokenizes the
        /// (lower-cased) text and builds the document's local inverted index.
        /// </summary>
        /// <param name="doc">document being indexed; persisted by this method</param>
        /// <param name="indexRoot">index root providing the repository</param>
        /// <param name="docTextString">the raw document text</param>
        /// <param name="documentPlacement">placement for the document object</param>
        /// <param name="documentTextPlacement">placement for the document text</param>
        /// <param name="wordPlacement">placement handed to the local inverted index for words</param>
        /// <param name="wordHitPlacement">placement handed to the local inverted index for word hits</param>
        public void textToWords(Document doc, IndexRoot indexRoot, string docTextString, Placement documentPlacement,
                                Placement documentTextPlacement, Placement wordPlacement, Placement wordHitPlacement)
        {
            DocumentText docText = new DocumentText(docTextString, doc);

            doc.Persist(documentPlacement, session);
            doc.Page.Database.Name = doc.Name;
            docText.Persist(documentTextPlacement, session);
            indexRoot.repository.documentSet.Add(doc);
            doc.Content   = docText;
            docTextString = docTextString.ToLower();
            string[] excludedWords = new string[] { "and", "the" };
            // fix: removed the duplicate '(' from the original split set; Split ignores duplicates
            char[]   splitChars = new char[] { ' ', '\n', '(', '"', '!', ',', ')', '\t' };
            string[] words      = docTextString.Split(splitChars, StringSplitOptions.RemoveEmptyEntries);
            UInt64   wordCt     = 0; // running token ordinal passed to the local index

            char[] trimEndChars   = new char[] { ';', '.', '"', ',', '\r', ':', ']', '!', '?', '+', '(', ')', '\'', '{', '}', '-', '`', '/', '=' };
            char[] trimStartChars = new char[] { ';', '&', '-', '#', '*', '[', '.', '"', ',', '\r', ')', '(', '\'', '{', '}', '-', '`' };
            // fix: dropped the unused loop counter `i` the original incremented but never read
            foreach (string wordStr in words)
            {
                string aWord = wordStr.TrimEnd(trimEndChars).TrimStart(trimStartChars);
                if (aWord.Length > 1 && excludedWords.Contains(aWord) == false)
                {
                    // fix: only allocate the Word for tokens that are actually indexed
                    // (the original constructed one for excluded/short tokens as well)
                    Word word = new Word(aWord);
                    createLocalInvertedIndex(doc, word, wordCt, wordPlacement, wordHitPlacement);
                    ++wordCt;
                }
            }
        }
Пример #5
0
        /// <summary>
        /// Parses every page listed in listBoxPagesToAdd (HTML-looking entries via parseHtml,
        /// everything else as a text file), rebuilds the global inverted index, and refreshes
        /// the UI with the first 50 documents in the repository.
        /// </summary>
        private void button1_Click(object sender, RoutedEventArgs e)
        {
            session.BeginUpdate();
            IndexRoot indexRoot    = (IndexRoot)session.Open(Oid.Encode(IndexRoot.PlaceInDatabase, 1, 1));
            Placement docPlacement = new Placement(Document.PlaceInDatabase);

            foreach (string entry in listBoxPagesToAdd.Items)
            {
                try
                {
                    bool looksLikeWebPage = entry.Contains(".html") || entry.Contains(".htm") || entry.Contains("http") || entry.Contains("aspx");
                    if (looksLikeWebPage)
                    {
                        parseHtml(entry, indexRoot);
                    }
                    else
                    {
                        parseTextFile(entry, indexRoot, docPlacement);
                    }
                }
                catch (WebException ex)
                {
                    // a failed download shouldn't stop the remaining entries
                    Console.WriteLine(ex.ToString());
                }
            }
            createGlobalInvertedIndex(indexRoot);
            listBoxPagesToAdd.Items.Clear();

            List <Document> docs = indexRoot.repository.documentSet.ToList <Document>().Take(50).ToList <Document>();
            inDbListBox.ItemsSource = docs;
            session.Commit();

            // re-open read-only to refresh the grids
            session.BeginRead();
            updateDataGrids(indexRoot);
            session.Commit();
        }
Пример #6
0
        /// <summary>
        /// Merges <paramref name="files"/> into the known file set (last entry wins per name)
        /// and rebuilds the immutable lookup structures concurrently before constructing a
        /// new <see cref="IndexRoot"/>.
        /// </summary>
        public async Task <IndexRoot> Integrate(ICollection <VirtualFile> files)
        {
            Utils.Log($"Integrating {files.Count} files");

            // deduplicate by name, keeping the last occurrence
            var allFiles = AllFiles.Concat(files)
                           .GroupBy(f => f.Name)
                           .Select(g => g.Last())
                           .ToImmutableList();

            // build the four lookup structures in parallel
            var fullPathTask = Task.Run(() => allFiles.SelectMany(f => f.ThisAndAllChildren)
                                        .ToImmutableDictionary(f => f.FullPath));
            var hashTask = Task.Run(() => allFiles.SelectMany(f => f.ThisAndAllChildren)
                                    .Where(f => f.Hash != null)
                                    .ToGroupedImmutableDictionary(f => f.Hash));
            var nameTask = Task.Run(() => allFiles.SelectMany(f => f.ThisAndAllChildren)
                                    .ToGroupedImmutableDictionary(f => f.Name));
            var rootPathTask = Task.Run(() => allFiles.ToImmutableDictionary(f => f.Name));

            var result = new IndexRoot(allFiles,
                                       await fullPathTask,
                                       await hashTask,
                                       await rootPathTask,
                                       await nameTask);

            Utils.Log($"Done integrating");
            return result;
        }
Пример #7
0
        /// <summary>
        /// Creates the root file record with standard-information, file-name and index-root
        /// attributes, and adds its size to the running byte count.
        /// </summary>
        /// <param name="name">name written into the root's file-name attribute</param>
        private void CreateRoot(string name)
        {
            Root = new FileRecordSegmentHeader();
            Root.CreateFileRecordHeader(Enums.MFTEntryFlags.FileNameIndexPresent, null);

            var standardInfo = new StandardInformation();
            standardInfo.CreateStandInfoFile(FileAttributes.Normal);
            Root.UsedEntrySize += standardInfo.RecordLength;

            var indexRoot = new IndexRoot();
            //Root.UsedEntrySize += (uint)indexRoot.numberOfChildren * 8; // reference is 8 B

            var rootFileName = new FileName();
            rootFileName.CreateFileNameFile(name);
            rootFileName.RealSize = rootFileName.AllocatedSize = 0;
            Root.UsedEntrySize   += rootFileName.RecordLength;

            Root.attributes.Add(standardInfo);
            Root.attributes.Add(rootFileName);
            Root.attributes.Add(indexRoot);

            BytesOccupied += Root.UsedEntrySize;
        }
Пример #8
0
        /// <summary>
        /// Merges <paramref name="files"/> into the known file set (per full path, after
        /// ordering newest-first the last of each group is kept) and rebuilds the lookup
        /// structures concurrently before constructing a new <see cref="IndexRoot"/>.
        /// </summary>
        public async Task <IndexRoot> Integrate(ICollection <VirtualFile> files)
        {
            Utils.Log($"Integrating {files.Count} files");

            var allFiles = AllFiles.Concat(files)
                           .OrderByDescending(f => f.LastModified)
                           .GroupBy(f => f.FullPath).Select(g => g.Last())
                           .ToList();

            // build the four lookup structures in parallel
            var fullPathTask = Task.Run(() => allFiles.SelectMany(f => f.ThisAndAllChildren)
                                        .ToDictionary(f => f.FullPath));
            var hashTask = Task.Run(() => allFiles.SelectMany(f => f.ThisAndAllChildren)
                                    .Where(f => f.Hash != Hash.Empty)
                                    .ToLookup(f => f.Hash));
            var nameTask = Task.Run(() => allFiles.SelectMany(f => f.ThisAndAllChildren)
                                    .ToLookup(f => f.Name));
            var rootPathTask = Task.Run(() => allFiles.ToDictionary(f => f.AbsoluteName));

            var result = new IndexRoot(allFiles,
                                       await fullPathTask,
                                       await hashTask,
                                       await rootPathTask,
                                       await nameTask);

            Utils.Log($"Done integrating");
            return result;
        }
Пример #9
0
        /// <summary>
        /// Opens an existing index on <paramref name="file"/>: reads the index-root attribute
        /// content, rebuilds the in-memory root node from the raw stream bytes, and attaches
        /// the index-allocation stream and bitmap when those streams exist.
        /// </summary>
        /// <param name="file">file owning the index streams</param>
        /// <param name="name">index name; "$I30" marks a file-name index</param>
        /// <param name="bpb">BIOS parameter block (stored for later use)</param>
        /// <param name="upCase">upper-case table used to build the key collator</param>
        public Index(File file, string name, BiosParameterBlock bpb, UpperCase upCase)
        {
            _file = file;
            _name = name;
            _bpb = bpb;
            // the standard NTFS file-name index stream is named "$I30"
            _isFileIndex = name == "$I30";

            _blockCache = new ObjectCache<long, IndexBlock>();

            _root = _file.GetStream(AttributeType.IndexRoot, _name).GetContent<IndexRoot>();
            _comparer = _root.GetCollator(upCase);

            using (Stream s = _file.OpenStream(AttributeType.IndexRoot, _name, FileAccess.Read))
            {
                // the root node's entries start after the IndexRoot header within the stream
                byte[] buffer = Utilities.ReadFully(s, (int)s.Length);
                _rootNode = new IndexNode(WriteRootNodeToDisk, 0, this, true, buffer, IndexRoot.HeaderOffset);

                // Give the attribute some room to breathe, so long as it doesn't squeeze others out
                // BROKEN, BROKEN, BROKEN - how to figure this out?  Query at the point of adding entries to the root node?
                _rootNode.TotalSpaceAvailable += _file.MftRecordFreeSpace(AttributeType.IndexRoot, _name) - 100;
            }

            // sub-node blocks live in the optional index-allocation stream
            if (_file.StreamExists(AttributeType.IndexAllocation, _name))
            {
                _indexStream = _file.OpenStream(AttributeType.IndexAllocation, _name, FileAccess.ReadWrite);
            }

            // the bitmap stream tracks which allocation blocks are in use
            if (_file.StreamExists(AttributeType.Bitmap, _name))
            {
                _indexBitmap = new Bitmap(_file.OpenStream(AttributeType.Bitmap, _name, FileAccess.ReadWrite), long.MaxValue);
            }
        }
Пример #10
0
        /// <summary>
        /// Context-menu handler that removes the clicked Document from the index inside an
        /// update transaction, then refreshes the document list and data grids. On any
        /// failure the transaction is aborted.
        /// </summary>
        private void MenuItem_Click(object sender, RoutedEventArgs e)
        {
            MenuItem menuItem = (MenuItem)e.Source;
            Document myItem   = (Document)menuItem.DataContext;

            session.BeginUpdate();
            try
            {
                IndexRoot indexRoot = (IndexRoot)session.Open(Oid.Encode(IndexRoot.PlaceInDatabase, 1, 1));
                int       index;
                // re-resolve the clicked document within this transaction before removing it
                if (indexRoot.repository.documentSet.TryGetKey(myItem, ref myItem))
                {
                    index = myItem.Remove(indexRoot, session);
                }
                else
                {
                    index = -1; // weird case - should not happen
                }
                inDbListBox.ItemsSource = indexRoot.repository.documentSet.ToList <Document>();
                updateDataGrids(indexRoot, index);
                session.Commit();
            }
            catch (Exception ex)
            {
                // fix: the original bare catch silently swallowed the failure; log it before aborting
                Console.WriteLine(ex.ToString());
                session.Abort();
            }
        }
Пример #11
0
 /// <summary>
 /// Builds the top level (global) inverted index: for every document's local word set,
 /// accumulates the word's global count in the lexicon (persisting new WordGlobal entries
 /// as needed) and records the document hit; each processed document is marked Indexed.
 /// </summary>
 static void createTopLevelInvertedIndex()
 {
     Console.WriteLine(DateTime.Now.ToString() + ", start creating top level inverted index");
     using (SessionNoServer session = new SessionNoServer(s_systemDir))
     {
         Placement wordPlacement = new Placement(Lexicon.PlaceInDatabase, 2, 1, 1000, 50000, true, false, UInt32.MaxValue, false);
         session.BeginUpdate();
         IndexRoot indexRoot                 = (IndexRoot)session.Open(Oid.Encode(IndexRoot.PlaceInDatabase, 1, 1));
         BTreeSetOidShort <Word> wordSet     = indexRoot.lexicon.WordSet;
         BTreeSet <Document>     documentSet = indexRoot.repository.documentSet;
         Word existingWord = null;
         foreach (Document doc in documentSet)
         {
             foreach (Word word in doc.WordSet)
             {
                 WordHit wordHit = doc.WordHit[word];
                 if (wordSet.TryGetKey(word, ref existingWord))
                 {
                     existingWord.GlobalCount = existingWord.GlobalCount + (uint)wordHit.Count;
                 }
                 else
                 {
                     existingWord = new WordGlobal(word.aWord, session, (uint)wordHit.Count);
                     existingWord.Persist(wordPlacement, session);
                     // consistency fix: add via the wordSet local (the same set the original
                     // re-fetched as indexRoot.lexicon.WordSet), matching the lookup above
                     wordSet.Add(existingWord);
                 }
                 existingWord.DocumentHit.AddFast(doc);
             }
             doc.Indexed = true;
         }
         session.Commit();
         Console.WriteLine(DateTime.Now.ToString() + ", done creating top level inverted index");
     }
 }
Пример #12
0
        /// <summary>
        /// Reads the whole text file at <paramref name="url"/> and indexes its content as a
        /// new Document named after the file.
        /// </summary>
        public Document parseTextFile(string url, IndexRoot indexRoot)
        {
            Document doc = new Document(Path.GetFileName(url), indexRoot, session);

            using (StreamReader reader = new StreamReader(url))
            {
                string fileText = reader.ReadToEnd();
                textToWords(doc, indexRoot, fileText);
            }
            return doc;
        }
Пример #13
0
        /// <summary>
        /// Opens (or creates) the database rooted at <paramref name="dataDir"/>: a write-ahead
        /// log ("log.dat"), a data file ("data.dat") and the index, then runs recovery.
        /// </summary>
        /// <param name="dataDir">directory for the log and data files; created if missing</param>
        /// <param name="indexDir">directory handed to the index root</param>
        public Database(string dataDir, string indexDir)
        {
            Directory.CreateDirectory(dataDir);

            var logPath      = Path.Combine(dataDir, "log.dat");
            var datafilePath = Path.Combine(dataDir, "data.dat");

            _wal      = new Wal(logPath);
            _dataFile = new DataFile(datafilePath);
            _index    = new IndexRoot(indexDir);

            // NOTE(review): blocking on async recovery inside a constructor (sync-over-async);
            // a deadlock risk if ever called on a synchronization context — consider a static
            // async factory method instead.
            Recover().GetAwaiter().GetResult();
        }
Пример #14
0
        /// <summary>
        /// Reads the whole text file at <paramref name="url"/> and indexes it, deriving the
        /// text/word/word-hit placements from the document placement's database and page.
        /// </summary>
        public Document parseTextFile(string url, IndexRoot indexRoot, Placement docPlacement)
        {
            Document doc = new Document(Path.GetFileName(url), indexRoot, session);

            // related placements live in the same database on pages offset from the document's
            Placement docTextPlacement = new Placement(docPlacement.TryDatabaseNumber, (ushort)(docPlacement.TryPageNumber + 1));
            Placement wordPlacement    = new Placement(docPlacement.TryDatabaseNumber, (ushort)(docPlacement.TryPageNumber + 2));
            Placement wordHitPlacement = new Placement(docPlacement.TryDatabaseNumber, (ushort)(docPlacement.TryPageNumber + 10));

            using (StreamReader reader = new StreamReader(url))
            {
                string fileText = reader.ReadToEnd();
                textToWords(doc, indexRoot, fileText, docPlacement, docTextPlacement, wordPlacement, wordHitPlacement);
            }
            return doc;
        }
Пример #15
0
        /// <summary>
        /// Dumps lexicon statistics to "Wikipedia.txt": the word count followed by one line
        /// per word with its document-hit count.
        /// </summary>
        /// <param name="session">open session used to read the index root</param>
        static void outputSomeInfo(SessionNoServer session)
        {
            IndexRoot indexRoot             = (IndexRoot)session.Open(Oid.Encode(IndexRoot.PlaceInDatabase, 1, 1));
            BTreeSetOidShort <Word> wordSet = indexRoot.lexicon.WordSet;

            using (StreamWriter writer = new StreamWriter("Wikipedia.txt"))
            {
                writer.WriteLine("Number of words in Lexicon is: " + wordSet.Count);
                foreach (Word word in wordSet)
                {
                    writer.WriteLine(word.aWord + " " + word.DocumentHit.Count);
                }
                // fix: removed the redundant writer.Close(); the using block disposes
                // (and thereby closes) the writer
            }
        }
Пример #16
0
 /// <summary>
 /// Refreshes the word data grids from the persisted index root inside a read
 /// transaction; does nothing when no session is available.
 /// </summary>
 private void updateWordTables_Click(object sender, RoutedEventArgs e)
 {
     if (session == null)
     {
         return; // nothing to refresh without a session
     }
     try
     {
         session.BeginRead();
         IndexRoot indexRoot = (IndexRoot)session.Open(Oid.Encode(IndexRoot.PlaceInDatabase, 1, 1));
         updateDataGrids(indexRoot);
     }
     finally
     {
         session.Commit();
     }
 }
Пример #17
0
        /// <summary>
        /// Dumps lexicon statistics to "Wikipedia.txt": the lexicon size followed by one
        /// line per token with its hit count (token ids mapped back to strings via IdToValue).
        /// </summary>
        /// <param name="session">open session used to read the index root</param>
        static void outputSomeInfo(SessionNoServer session)
        {
            IndexRoot indexRoot = (IndexRoot)session.Open(Oid.Encode(IndexRoot.PlaceInDatabase, 1, 1));
            var       wordHits  = indexRoot.Lexicon.TokenMap;

            using (StreamWriter writer = new StreamWriter("Wikipedia.txt"))
            {
                writer.WriteLine("Number of words in Lexicon is: " + indexRoot.Lexicon.IdToValue.Count);
                foreach (var p in wordHits)
                {
                    // map the token id back to its string form for output
                    var word = indexRoot.Lexicon.IdToValue[p.Key];
                    writer.WriteLine(word + " " + p.Value.Count);
                }
                // fix: removed the redundant writer.Close(); the using block disposes
                // (and thereby closes) the writer
            }
        }
Пример #18
0
        /// <summary>
        /// Downloads the page at <paramref name="url"/>, concatenates all its text nodes and
        /// indexes the combined text as a new Document.
        /// </summary>
        public Document parseHtml(string url, IndexRoot indexRoot)
        {
            Document doc = new Document(url, indexRoot, session);

            using (WebClient client = new WebClient())
            {
                string       html    = client.DownloadString(url);
                HtmlDocument htmlDoc = new HtmlDocument();
                htmlDoc.LoadHtml(html);
                // fix: StringBuilder instead of O(n^2) string concatenation in the loop, and a
                // null guard — SelectNodes returns null when the page has no matching text nodes
                var pageBody  = new System.Text.StringBuilder();
                var textNodes = htmlDoc.DocumentNode.SelectNodes("//text()");
                if (textNodes != null)
                {
                    foreach (HtmlNode node in textNodes)
                    {
                        pageBody.Append(' ').Append(node.InnerText);
                    }
                }
                textToWords(doc, indexRoot, pageBody.ToString());
            }
            return doc;
        }
Пример #19
0
 /// <summary>
 /// Creates the inverted index for every document in the repository sequentially,
 /// with page and object caching disabled to bound memory use.
 /// </summary>
 static void CreateInvertedIndex()
 {
     using (SessionNoServer session = new SessionNoServer(s_systemDir, 5000, false, false, CacheEnum.No)) // turn off page and object caching))
     {
         session.BeginUpdate();
         // pre-register the schema classes used during indexing
         session.RegisterClass(typeof(Repository));
         session.RegisterClass(typeof(IndexRoot));
         session.RegisterClass(typeof(Lexicon <string>));
         Console.WriteLine(DateTime.Now.ToString() + ", start creating inverted index");
         IndexRoot indexRoot = (IndexRoot)session.Open(Oid.Encode(IndexRoot.PlaceInDatabase, 1, 1));
         foreach (Document doc in indexRoot.Repository.DocumentSet)
         {
             createDocumentInvertedIndex(indexRoot, doc);
         }
         session.Commit();
         Console.WriteLine(DateTime.Now.ToString() + ", done creating inverted index");
     }
 }
Пример #20
0
        /// <summary>
        /// Scans the whole inverted index and throws UnexpectedException on the first
        /// inconsistency found: a null document hit, a zero word-hit count, or a word-hit
        /// key missing from the lexicon.
        /// </summary>
        /// <param name="startTrans">when true, the scan runs inside its own read transaction</param>
        public void verify(bool startTrans = true)
        {
            if (startTrans)
            {
                session.BeginRead();
            }
            IndexRoot indexRoot = (IndexRoot)session.Open(Oid.Encode(IndexRoot.PlaceInDatabase, 1, 1));
            // counters retained for inspection while debugging
            int tokenCount = 0;
            int docCount   = 0;
            int hitCount   = 0;

            foreach (var tokenPair in indexRoot.Lexicon.TokenMap)
            {
                tokenCount++;
                foreach (Document doc in tokenPair.Value)
                {
                    docCount++;
                    if (doc == null)
                    {
                        throw new UnexpectedException("bad documentHit BTreeSet");
                    }
                    foreach (KeyValuePair <UInt32, UInt32> hit in doc.WordHit)
                    {
                        hitCount++;
                        if (hit.Value == 0)
                        {
                            throw new UnexpectedException("bad document WordHit");
                        }
                        if (indexRoot.Lexicon.IdToValue.Contains(hit.Key) == false)
                        {
                            throw new UnexpectedException("missing lexicon word");
                        }
                    }
                }
            }
            if (startTrans)
            {
                session.Commit();
            }
        }
Пример #21
0
        /// <summary>
        /// Scans the whole inverted index and throws UnexpectedException on the first
        /// inconsistency found: a null document hit, a null word-hit key/value, or a
        /// word-hit key missing from the lexicon's word set.
        /// </summary>
        /// <param name="startTrans">when true, the scan runs inside its own read transaction</param>
        public void verify(bool startTrans = true)
        {
            if (startTrans)
            {
                session.BeginRead();
            }
            IndexRoot indexRoot = (IndexRoot)session.Open(Oid.Encode(IndexRoot.PlaceInDatabase, 1, 1));
            // counters retained for inspection while debugging
            int wordCount = 0;
            int docCount  = 0;
            int hitCount  = 0;

            foreach (Word lexiconWord in indexRoot.lexicon.WordSet)
            {
                wordCount++;
                foreach (Document doc in lexiconWord.DocumentHit)
                {
                    docCount++;
                    if (doc == null)
                    {
                        throw new UnexpectedException("bad documentHit BTreeSet");
                    }
                    foreach (KeyValuePair <Word, WordHit> hit in doc.WordHit)
                    {
                        hitCount++;
                        if (hit.Value == null || hit.Key == null)
                        {
                            throw new UnexpectedException("bad document WordHit");
                        }
                        if (indexRoot.lexicon.WordSet.Contains(hit.Key) == false)
                        {
                            throw new UnexpectedException("missing lexicon word");
                        }
                    }
                }
            }
            if (startTrans)
            {
                session.Commit();
            }
        }
Пример #22
0
 /// <summary>
 /// Indexes one document (if not already indexed): lower-cases its stored text, extracts
 /// word-like tokens matching [a-z][a-z$]+, adds each to the lexicon, and finally marks
 /// the document as indexed.
 /// </summary>
 static void createDocumentInvertedIndex(IndexRoot indexRoot, Document doc)
 {
     if (!doc.Indexed)
     {
         DocumentText    docText    = doc.Content;
         string          text       = docText.Text.ToLower();
         MatchCollection tagMatches = Regex.Matches(text, "[a-z][a-z$]+");
         // progress log every 50000 documents; s_docCountIndexed is the shared running total
         if (++s_docCountIndexed % 50000 == 0)
         {
             Console.WriteLine(DateTime.Now.ToString() + ", done indexing article: " + s_docCountIndexed);
         }
         foreach (Match m in tagMatches)
         {
             indexRoot.Lexicon.PossiblyAddToken(m.Value, doc);
         }
         // NOTE(review): this also fires at every 50000th document (already logged above) and
         // the "is completed" wording refers to the current document's database — confirm the
         // intended logging cadence/meaning.
         if (s_docCountIndexed % 1000 == 0)
         {
             Console.WriteLine(DateTime.Now.ToString() + ", done indexing article: " + s_docCountIndexed + " Database: " + doc.DatabaseNumber + " is completed.");
         }
         doc.Indexed = true;
     }
 }
Пример #23
0
        /// <summary>
        /// Downloads the page at <paramref name="url"/>, concatenates all its text nodes and
        /// indexes the combined text using explicit placements for the document, its text,
        /// words and word hits.
        /// </summary>
        public Document parseHtml(string url, IndexRoot indexRoot)
        {
            Document  doc              = new Document(url, indexRoot, session);
            Placement docPlacement     = new Placement(Document.PlaceInDatabase);
            Placement docTextPlacement = new Placement(Document.PlaceInDatabase, 2);
            Placement wordPlacement    = new Placement(Document.PlaceInDatabase, 3);
            Placement wordHitPlacement = new Placement(Document.PlaceInDatabase, 100);

            using (WebClient client = new WebClient())
            {
                string       html    = client.DownloadString(url);
                HtmlDocument htmlDoc = new HtmlDocument();
                htmlDoc.LoadHtml(html);
                // fix: StringBuilder instead of O(n^2) string concatenation in the loop, and a
                // null guard — SelectNodes returns null when the page has no matching text nodes
                var pageBody  = new System.Text.StringBuilder();
                var textNodes = htmlDoc.DocumentNode.SelectNodes("//text()");
                if (textNodes != null)
                {
                    foreach (HtmlNode node in textNodes)
                    {
                        pageBody.Append(' ').Append(node.InnerText);
                    }
                }
                textToWords(doc, indexRoot, pageBody.ToString(), docPlacement, docTextPlacement, wordPlacement, wordHitPlacement);
            }
            return doc;
        }
Пример #24
0
        //static void MessageWrite(string str)
        //{
        //    Console.Write(">" + "{0} {1}\n>", str, message1);
        //    Console.ReadLine();
        //}

        /// <summary>
        /// Creates a child directory record under <paramref name="parentFolder"/>: fills in
        /// its attributes, aborts (saving the volume) when space would run out, links it into
        /// the parent's index root, and updates the file-system accounting.
        /// </summary>
        static void CreateDirectory(FileRecordSegmentHeader parentFolder, FileRecordSegmentHeader childFolder, string name, NtfsFileSystem fs)
        {
            childFolder.CreateFileRecordHeader(Enums.MFTEntryFlags.FileNameIndexPresent, parentFolder);

            var standardInfo = new StandardInformation();
            standardInfo.CreateStandInfoFile(FileAttributes.Normal);
            childFolder.UsedEntrySize += standardInfo.RecordLength;

            var indexRoot = new IndexRoot();
            //childFolder.UsedEntrySize += indexRoot.RecordLength;

            var folderName = new FileName();
            folderName.CreateFileNameFile(name);
            folderName.RealSize        = folderName.AllocatedSize = 0;
            childFolder.UsedEntrySize += folderName.RecordLength;

            // bail out (after persisting the volume) when the record would not fit
            if (childFolder.UsedEntrySize >= (fs.VolumeSize - fs.BytesOccupied + 8))
            {
                Console.WriteLine("Volume is full!");
                Save(fs.VolName.Name, fs.Save());
                Environment.Exit(0);
            }

            childFolder.attributes.Add(standardInfo);
            childFolder.attributes.Add(folderName);
            childFolder.attributes.Add(indexRoot);

            // attribute index 2 is the parent's IndexRoot (std-info, file-name, index-root)
            var parentIndexRoot = (IndexRoot)parentFolder.attributes.ElementAt(2);
            parentIndexRoot.Children.Add(childFolder);
            parentIndexRoot.numberOfChildren++;
            childFolder.parent = parentFolder;

            fs.BytesOccupied += childFolder.UsedEntrySize + 8;
            fs.DirectoryRecordCount++;
            parentFolder.UsedFolderSize += childFolder.UsedEntrySize;
        }
Пример #25
0
        /// <summary>
        /// Creates a brand-new index on <paramref name="file"/>: allocates the index-root
        /// stream, initializes the root attribute from the BIOS parameter block, and builds
        /// an empty in-memory root node.
        /// </summary>
        private Index(AttributeType attrType, AttributeCollationRule collationRule, File file, string name, BiosParameterBlock bpb, UpperCase upCase)
        {
            _file = file;
            _name = name;
            _bpb = bpb;
            // the standard NTFS file-name index stream is named "$I30"
            _isFileIndex = name == "$I30";

            _blockCache = new ObjectCache<long, IndexBlock>();

            _file.CreateStream(AttributeType.IndexRoot, _name);

            var newRoot = new IndexRoot();
            newRoot.AttributeType = (uint)attrType;
            newRoot.CollationRule = collationRule;
            newRoot.IndexAllocationSize = (uint)bpb.IndexBufferSize;
            newRoot.RawClustersPerIndexRecord = bpb.RawIndexBufferSize;
            _root = newRoot;

            _comparer = _root.GetCollator(upCase);
            _rootNode = new IndexNode(WriteRootNodeToDisk, 0, this, true, 32);
        }
Пример #26
0
        static void ImportEntireWikipedia()
        {
            const ushort btreeNodeSize = 10000;

            Console.WriteLine(DateTime.Now.ToString() + ", start importing Wikipedia text");
            //System.Xml.Schema.XmlSchema docSchema;
            //using (System.Xml.XmlTextReader schemaReader = new System.Xml.XmlTextReader("c:\\export-0_5.xsd"))
            //{
            //  docSchema = System.Xml.Schema.XmlSchema.Read(schemaReader, ValidationCallBack);
            // }
            int docCount = 0;

            using (SessionNoServer session = new SessionNoServer(s_systemDir, 5000, false, false, CacheEnum.No)) // turn of page and object caching
            {
                Console.WriteLine($"Running with databases in directory: {session.SystemDirectory}");
                //GCSettings.LatencyMode = GCLatencyMode.Batch;// try to keep the WeakIOptimizedPersistableReference objects around longer
                XmlComment    xmlComment;
                XmlElement    xmlElement;
                XmlEntity     xmlEntity;
                XmlText       xmlText;
                XmlWhitespace xmlWhitespace;
                session.BeginUpdate();
                // register all database schema classes used by the application in advance to avoid lock conflict later in parallel indexing
                Database db = session.OpenDatabase(IndexRoot.PlaceInDatabase, false, false);
                if (db != null)
                {
                    outputSomeInfo(session);
                    session.Abort();
                    return;
                }
                //session.SetTraceDbActivity(Lexicon.PlaceInDatabase);
                //session.SetTraceAllDbActivity();
                XmlDocument xmlDocument = new XmlDocument("enwiki-latest-pages-articles.xml");
                IndexRoot   indexRoot   = new IndexRoot(btreeNodeSize, session);
                indexRoot.Persist(session, indexRoot, true);
                UInt32   currentDocumentDatabaseNum = 0;
                Document doc          = null;
                bool     titleElement = false;
                bool     pageText     = false;
                using (FileStream fs = new FileStream(s_wikipediaXmlFile, FileMode.Open))
                {
                    //using (GZipStream zipStream = new GZipStream(fs, CompressionMode.Decompress)) // if input was a .gz file
                    {
                        using (System.Xml.XmlTextReader textReader = new System.Xml.XmlTextReader(fs))
                        {
                            while (textReader.Read())
                            {
                                System.Xml.XmlNodeType nodeType = textReader.NodeType;
                                switch (nodeType)
                                {
                                case System.Xml.XmlNodeType.Attribute:
                                    break;

                                case System.Xml.XmlNodeType.CDATA:
                                    break;

                                case System.Xml.XmlNodeType.Comment:
                                    xmlComment = new XmlComment(textReader.Value, xmlDocument);
                                    break;

                                case System.Xml.XmlNodeType.Document:
                                    break;

                                case System.Xml.XmlNodeType.DocumentFragment:
                                    break;

                                case System.Xml.XmlNodeType.DocumentType:
                                    break;

                                case System.Xml.XmlNodeType.Element:
                                    xmlElement = new XmlElement(textReader.Prefix, textReader.LocalName, textReader.NamespaceURI, xmlDocument);
                                    if (textReader.LocalName == "title")
                                    {
                                        titleElement = true;
                                    }
                                    else if (textReader.LocalName == "text")
                                    {
                                        pageText = true;
                                    }
                                    break;

                                case System.Xml.XmlNodeType.EndElement:
                                    if (textReader.LocalName == "title" && doc != null)
                                    {
                                        titleElement = false;
                                    }
                                    else if (textReader.LocalName == "text" && doc != null)
                                    {
                                        pageText = false;
                                    }
                                    break;

                                case System.Xml.XmlNodeType.EndEntity:
                                    break;

                                case System.Xml.XmlNodeType.Entity:
                                    xmlEntity = new XmlEntity(textReader.LocalName, xmlDocument);
                                    break;

                                case System.Xml.XmlNodeType.EntityReference:
                                    break;

                                case System.Xml.XmlNodeType.None:
                                    break;

                                case System.Xml.XmlNodeType.Notation:
                                    break;

                                case System.Xml.XmlNodeType.ProcessingInstruction:
                                    break;

                                case System.Xml.XmlNodeType.SignificantWhitespace:
                                    break;

                                case System.Xml.XmlNodeType.Text:
                                    xmlText = new XmlText(textReader.Value, xmlDocument);
                                    if (titleElement)
                                    {
                                        doc = new Document(textReader.Value, indexRoot, session);
                                        session.Persist(doc);
                                        if (doc.DatabaseNumber != currentDocumentDatabaseNum)
                                        {
                                            if (currentDocumentDatabaseNum > 0)
                                            {
                                                session.FlushUpdates();
                                                Console.WriteLine("Database: " + currentDocumentDatabaseNum + " is completed, done importing article " + docCount + " number of lines: " + textReader.LineNumber);
                                            }
                                            currentDocumentDatabaseNum = doc.DatabaseNumber;
                                        }
                                        //doc.Page.Database.Name = doc.Name;
                                    }
                                    else if (doc != null && pageText)
                                    {
#if DEBUGx
                                        Console.WriteLine(doc.Name + " line: " + textReader.LineNumber);
#endif
                                        //if (textReader.LineNumber > 1000000)
                                        //{
                                        //  session.Commit();
                                        //  return;
                                        //}
                                        DocumentText content = new DocumentText(textReader.Value, doc);
                                        session.Persist(content, 10000);
                                        doc.Content = content;
                                        indexRoot.Repository.DocumentSet.AddFast(doc);
                                        if (++docCount % 1000000 == 0)
                                        {
                                            //session.Commit(false); // skip recovery check, we do it in BeginUpdate which is enough
                                            Console.WriteLine("Done importing article " + docCount + " number of lines: " + textReader.LineNumber);
                                            //session.BeginUpdate();
                                        }
                                    }
                                    break;

                                case System.Xml.XmlNodeType.Whitespace:
                                    xmlWhitespace = new XmlWhitespace(textReader.Value, xmlDocument);
                                    break;

                                case System.Xml.XmlNodeType.XmlDeclaration:
                                    break;
                                }
                                ;
                            }
                            Console.WriteLine("Finished importing article " + docCount + " number of lines: " + textReader.LineNumber);
                        }
                    }
                }
                session.Commit();
            }
            Console.WriteLine(DateTime.Now.ToString() + ", done importing Wikipedia text");
        }
Пример #27
0
        /// <summary>
        /// Parses a single MFT FILE record from <paramref name="rawBytes"/>: validates the
        /// signature, applies the update-sequence (fixup) values in place, reads the record
        /// header fields, and then walks and materializes every attribute in the record.
        /// </summary>
        /// <param name="rawBytes">Raw bytes of the record; the record header starts at index 0. NOTE: mutated in place by fixup replacement.</param>
        /// <param name="offset">Absolute offset of this record within its source; stored and used for logging only.</param>
        public FileRecord(byte[] rawBytes, int offset)
        {
            Offset = offset;

            // First 4 bytes are the record signature: "FILE", "BAAD", or zeros when never used.
            var sig = BitConverter.ToInt32(rawBytes, 0);

            switch (sig)
            {
            case FileSig:
                break;

            case BaadSig:
                // "BAAD" marks a record the file system itself flagged as bad; nothing to parse.
                _logger.Debug($"Bad signature at offset 0x{offset:X}");
                IsBad = true;
                return;

            default:
                // No recognizable signature: treat as an uninitialized (never written) entry.
                _logger.Debug($"Uninitialized entry (no signature) at offset 0x{offset:X}");
                IsUninitialized = true;
                return;
            }

            _logger.Debug($"Processing FILE record at offset 0x{offset:X}");

            Attributes = new List <Attribute>();

            FixupOffset     = BitConverter.ToInt16(rawBytes, 0x4);
            FixupEntryCount = BitConverter.ToInt16(rawBytes, 0x6);

            //to build fixup info, take FixupEntryCount x 2 bytes as each are 2 bytes long
            var fixupTotalLength = FixupEntryCount * 2;

            var fixupBuffer = new byte[fixupTotalLength];

            Buffer.BlockCopy(rawBytes, FixupOffset, fixupBuffer, 0, fixupTotalLength);

            // Pull the entry flags early so fixup warnings below can skip free (flags == 0) records.
            EntryFlags = (EntryFlag)BitConverter.ToInt16(rawBytes, 0x16);

            FixupData = new FixupData(fixupBuffer);

            FixupOk = true;

            // Fixup verification: the last 2 bytes of each 512-byte stride must equal the
            // expected update-sequence value, then get replaced with the stored actual bytes.
            // (Assumes 512-byte sectors — the stride below is hard-coded; TODO confirm.)
            var counter = 512;

            foreach (var bytese in FixupData.FixupActual)
            {
                //adjust the offset to where we need to check
                var fixupOffset = counter - 2;

                // 'expected' here is the value actually found in the buffer; it is compared
                // against the stored FixupExpected signature. Mismatches on in-use records
                // (EntryFlags != 0) indicate a torn write.
                var expected = BitConverter.ToInt16(rawBytes, fixupOffset);
                if (expected != FixupData.FixupExpected && EntryFlags != 0x0)
                {
                    FixupOk = false;
                    _logger.Warn(
                        $"Offset: 0x{Offset:X} Entry/seq: 0x{EntryNumber:X}/0x{SequenceNumber:X} Fixup values do not match at 0x{fixupOffset:X}. Expected: 0x{FixupData.FixupExpected:X2}, actual: 0x{expected:X2}");
                }

                //replace fixup expected with actual bytes. bytese has actual replacement values in it.
                Buffer.BlockCopy(bytese, 0, rawBytes, fixupOffset, 2);

                counter += 512;
            }

            // Fixed-offset header fields of the FILE record.
            LogSequenceNumber = BitConverter.ToInt64(rawBytes, 0x8);

            SequenceNumber = BitConverter.ToUInt16(rawBytes, 0x10);

            ReferenceCount = BitConverter.ToInt16(rawBytes, 0x12);

            FirstAttributeOffset = BitConverter.ToInt16(rawBytes, 0x14);

            ActualRecordSize = BitConverter.ToInt32(rawBytes, 0x18);

            AllocatedRecordSize = BitConverter.ToInt32(rawBytes, 0x1c);

            // 8-byte reference to the base record (non-zero for extension records).
            var entryBytes = new byte[8];

            Buffer.BlockCopy(rawBytes, 0x20, entryBytes, 0, 8);

            MftRecordToBaseRecord = new MftEntryInfo(entryBytes);

            FirstAvailablAttribueId = BitConverter.ToInt16(rawBytes, 0x28);

            EntryNumber = BitConverter.ToUInt32(rawBytes, 0x2c);

            //start attribute processing at FirstAttributeOffset

            var index = (int)FirstAttributeOffset;

            // Walk the attribute list: each attribute starts with a 4-byte type and 4-byte size.
            while (index < ActualRecordSize)
            {
                var attrType = (AttributeType)BitConverter.ToInt32(rawBytes, index);

                var attrSize = BitConverter.ToInt32(rawBytes, index + 4);

                // End of attributes is signaled by the 0xFFFFFFFF type marker (or a zero size).
                if (attrSize == 0 || attrType == AttributeType.EndOfAttributes)
                {
                    index += 8; //skip -1 type and 0 size

                    if (index != ActualRecordSize)
                    {
                        _logger.Warn($"Slack space found in entry/seq: 0x{EntryNumber:X}/0x{SequenceNumber:X}");
                    }

                    //TODO process slack here?
                    break;
                }

                _logger.Debug(
                    $"Found Attribute Type {attrType.ToString()} at absolute offset: 0x{index + offset:X}");

                _logger.Trace(
                    $"ActualRecordSize: 0x{ActualRecordSize:X}, size: 0x{attrSize:X}, index: 0x{index:X}");

                // Copy this attribute's bytes out and dispatch to the concrete attribute parser.
                var rawAttr = new byte[attrSize];
                Buffer.BlockCopy(rawBytes, index, rawAttr, 0, attrSize);

                switch (attrType)
                {
                case AttributeType.StandardInformation:
                    var si = new StandardInfo(rawAttr);
                    Attributes.Add(si);
                    break;

                case AttributeType.FileName:
                    var fi = new FileName(rawAttr);
                    Attributes.Add(fi);
                    break;

                case AttributeType.Data:
                    var d = new Data(rawAttr);
                    Attributes.Add(d);
                    break;

                case AttributeType.IndexAllocation:
                    var ia = new IndexAllocation(rawAttr);
                    Attributes.Add(ia);
                    break;

                case AttributeType.IndexRoot:
                    var ir = new IndexRoot(rawAttr);
                    Attributes.Add(ir);
                    break;

                case AttributeType.Bitmap:
                    var bm = new Bitmap(rawAttr);
                    Attributes.Add(bm);
                    break;

                case AttributeType.VolumeVersionObjectId:
                    var oi = new ObjectId_(rawAttr);
                    Attributes.Add(oi);
                    break;

                case AttributeType.SecurityDescriptor:
                    var sd = new SecurityDescriptor(rawAttr);
                    Attributes.Add(sd);
                    break;

                case AttributeType.VolumeName:
                    var vn = new VolumeName(rawAttr);
                    Attributes.Add(vn);
                    break;

                case AttributeType.VolumeInformation:
                    var vi = new VolumeInformation(rawAttr);
                    Attributes.Add(vi);
                    break;

                case AttributeType.LoggedUtilityStream:
                    var lus = new LoggedUtilityStream(rawAttr);
                    Attributes.Add(lus);
                    break;

                case AttributeType.ReparsePoint:
                    // ReparsePoint parsing is known to fail on some malformed data; log and
                    // continue with the remaining attributes instead of aborting the record.
                    try
                    {
                        var rp = new ReparsePoint(rawAttr);
                        Attributes.Add(rp);
                    }
                    catch (Exception)
                    {
                        var l = LogManager.GetLogger("ReparsePoint");

                        l.Error(
                            $"There was an error parsing a ReparsePoint in FILE record at offset 0x{Offset:X}. Please extract via --dd and --do and send to [email protected]");
                    }

                    break;

                case AttributeType.AttributeList:
                    var al = new AttributeList(rawAttr);
                    Attributes.Add(al);
                    break;

                case AttributeType.Ea:
                    var ea = new ExtendedAttribute(rawAttr);
                    Attributes.Add(ea);
                    break;

                case AttributeType.EaInformation:
                    var eai = new ExtendedAttributeInformation(rawAttr);
                    Attributes.Add(eai);
                    break;

                default:
                    // Unknown attribute type: fail loudly so the missing parser gets added.
                    throw new Exception($"Add me: {attrType} (0x{attrType:X})");
                }

                index += attrSize;
            }

            //rest is slack. handle here?
            _logger.Trace($"Slack starts at 0x{index:X} Absolute offset: 0x{index + offset:X}");
        }
        /// <summary>
        /// Validates the structural invariants of one index node: every non-leaf entry's
        /// child block must be present in the allocation bitmap, an END entry must sit at
        /// the end of the node, and keys must be in strictly ascending collation order.
        /// </summary>
        /// <param name="buffer">Buffer containing the raw index node.</param>
        /// <param name="offset">Offset of the node's index header within <paramref name="buffer"/>.</param>
        /// <param name="bitmap">Index allocation bitmap used to verify child blocks are allocated.</param>
        /// <param name="root">The index root attribute; supplies the collation rule and block size.</param>
        /// <param name="fileName">Owning file name, used only in error messages.</param>
        /// <param name="indexName">Index name; "$I30" selects file-name key parsing.</param>
        /// <returns><c>true</c> when the node passes all checks; otherwise <c>false</c>.</returns>
        private bool SelfCheckIndexNode(byte[] buffer, int offset, Bitmap bitmap, IndexRoot root, string fileName, string indexName)
        {
            bool ok = true;

            IndexHeader header = new IndexHeader(buffer, offset);

            IndexEntry lastEntry = null;

            // Collator implementing this index's collation rule (e.g. upcased file names).
            IComparer<byte[]> collator = root.GetCollator(_context.UpperCase);

            int pos = (int)header.OffsetToFirstEntry;
            while (pos < header.TotalSizeOfEntries)
            {
                IndexEntry entry = new IndexEntry(indexName == "$I30");
                entry.Read(buffer, offset + pos);
                pos += entry.Size;

                if ((entry.Flags & IndexEntryFlags.Node) != 0)
                {
                    // Convert the child's virtual cluster number to an index-block number
                    // (blocks are IndexAllocationSize bytes) to look it up in the bitmap.
                    long bitmapIdx = entry.ChildrenVirtualCluster / Utilities.Ceil(root.IndexAllocationSize, _context.BiosParameterBlock.SectorsPerCluster * _context.BiosParameterBlock.BytesPerSector);
                    if (!bitmap.IsPresent(bitmapIdx))
                    {
                        ReportError("Index entry {0} is non-leaf, but child vcn {1} is not in bitmap at index {2}", Index.EntryAsString(entry, fileName, indexName), entry.ChildrenVirtualCluster, bitmapIdx);
                        // FIX: this failure was previously reported but never failed the
                        // check, so the method could return true despite the error.
                        ok = false;
                    }
                }

                if ((entry.Flags & IndexEntryFlags.End) != 0)
                {
                    if (pos != header.TotalSizeOfEntries)
                    {
                        ReportError("Found END index entry {0}, but not at end of node", Index.EntryAsString(entry, fileName, indexName));
                        ok = false;
                    }
                }

                if (lastEntry != null && collator.Compare(lastEntry.KeyBuffer, entry.KeyBuffer) >= 0)
                {
                    ReportError("Found entries out of order {0} was before {1}", Index.EntryAsString(lastEntry, fileName, indexName), Index.EntryAsString(entry, fileName, indexName));
                    ok = false;
                }

                lastEntry = entry;
            }

            return ok;
        }
Пример #29
0
        /// <summary>
        /// Reads one attribute record starting at <paramref name="offset"/> in <paramref name="data"/>,
        /// instantiating the concrete record class for known attribute type codes and
        /// <see cref="AttributeGeneric"/> for anything unrecognized. Only resident
        /// attributes are supported; a non-resident form throws.
        /// </summary>
        /// <param name="data">Buffer containing the attribute record.</param>
        /// <param name="maxLength">Maximum bytes available from <paramref name="offset"/> (asserted only).</param>
        /// <param name="offset">Start offset of the attribute record within <paramref name="data"/>.</param>
        /// <returns>The parsed attribute record.</returns>
        public static AttributeRecord ReadSingleAttribute(byte[] data, int maxLength, int offset = 0)
        {
            Debug.Assert(data.Length - offset >= maxLength);
            Debug.Assert(0 <= offset && offset <= data.Length);

            AttributeTypeCode typeCode = GetTypeCode(data, offset);

            // The end-of-attributes marker carries only a header; wrap it generically and stop.
            if (typeCode == AttributeTypeCode.EndOfAttributes)
            {
                AttributeRecord endMarker = new AttributeGeneric();
                endMarker.ReadARHeader(data, offset);

                return endMarker;
            }

            // Map the type code to its concrete record class. Not implemented (kept for reference):
            //   SECURITY_DESCRIPTOR - too complicated to quickly be implemented; maybe one day.
            //   INDEX_ALLOCATION    - stored non-resident; this project deals only with resident data.
            //   PROPERTY_SET        - needs a pre NTFS 3.0 volume; probably obsolete.
            AttributeRecord record;

            switch (typeCode)
            {
            case AttributeTypeCode.STANDARD_INFORMATION:
                record = new StandardInformation();
                break;

            case AttributeTypeCode.ATTRIBUTE_LIST:
                record = new AttributeList();
                break;

            case AttributeTypeCode.FILE_NAME:
                record = new FileName();
                break;

            case AttributeTypeCode.OBJECT_ID:
                record = new ObjectId();
                break;

            case AttributeTypeCode.VOLUME_NAME:
                record = new VolumeName();
                break;

            case AttributeTypeCode.VOLUME_INFORMATION:
                record = new VolumeInformation();
                break;

            case AttributeTypeCode.DATA:
                record = new Data();
                break;

            case AttributeTypeCode.INDEX_ROOT:
                record = new IndexRoot();
                break;

            case AttributeTypeCode.BITMAP:
                record = new Bitmap();
                break;

            case AttributeTypeCode.EA_INFORMATION:
                record = new ExtenedAttributeInformation();
                break;

            case AttributeTypeCode.EA:
                record = new ExtenedAttributes();
                break;

            case AttributeTypeCode.LOGGED_UTILITY_STREAM:
                record = new LoggedUtilityStream();
                break;

            default:
                // Unknown type code: fall back to a generic record so parsing can continue.
                record = new AttributeGeneric();
                break;
            }

            record.ReadARHeader(data, offset);

            // Non-resident attributes store their payload outside this record and are not handled here.
            if (record.FormCode != ResidentFileFlag.Resident)
            {
                throw new Exception("Could not read and process resident flag!\n");
            }

            // Resident header follows the 16-byte common header; the value sits at ValueOffset.
            record.ResidentHeader = AttributeResidentHeader.ReadHeader(data, offset + 16);

            int bodyOffset = offset + record.ResidentHeader.ValueOffset;
            int bodyLength = offset + record.RecordLength - bodyOffset;

            record.ReadAttributeResident(data, bodyLength, bodyOffset);

            return record;
        }
Пример #30
0
        /// <summary>
        /// Initializes the main window: opens (or creates on first run) the VelocityDB
        /// databases, loads the <see cref="IndexRoot"/>, seeds the "pages to add" list,
        /// and populates the UI from any previously indexed documents.
        /// </summary>
        public MainWindow()
        {
            const ushort btreeNodeSize = 5000;

            GCSettings.LatencyMode = GCLatencyMode.Batch; // try to keep the WeakIOptimizedPersistableReference objects around longer
            dataGridList           = new List <DataGrid>();
            dataTableList          = new List <DataTable>();
            InitializeComponent();
            session = new SessionNoServer(s_systemDir);
            // NOTE: removed unused local 'placerIndexRoot' (a Placement that was created but never used).

            session.BeginUpdate();
            Console.WriteLine("Running with databases in directory: " + session.SystemDirectory);
            File.Copy(s_licenseDbFile, Path.Combine(session.SystemDirectory, "4.odb"), true);
            IndexRoot indexRoot;
            Database  db = session.OpenDatabase(IndexRoot.PlaceInDatabase, false, false);

            if (db == null)
            {
                // First run: create all databases used by the application.
                session.NewDatabase(IndexRoot.PlaceInDatabase, 0, "IndexRoot");
                session.NewDatabase(Lexicon.PlaceInDatabase, 0, "Lexicon");
                session.NewDatabase(Document.PlaceInDatabase, 0, "Document");
                session.NewDatabase(Repository.PlaceInDatabase, 0, "Repository");
                session.NewDatabase(DocumentText.PlaceInDatabase, 0, "DocumentText");
                session.NewDatabase(Word.PlaceInDatabase, 0, "Word");
                indexRoot = new IndexRoot(btreeNodeSize, session);
                if (Directory.Exists(s_booksDir))
                {
                    // Seed the import list from local .txt files when the books directory exists.
                    string[] directoryTextFiles = Directory.GetFiles(s_booksDir, "*.txt");
                    foreach (string fileName in directoryTextFiles)
                    {
                        listBoxPagesToAdd.Items.Add(fileName);
                    }
                }
                else
                {
                    // No local books: seed with a set of web pages to crawl instead.
                    wordMinCt.Text = 1.ToString();
                    listBoxPagesToAdd.Items.Add("http://www.VelocityDB.com/");
                    // other database products
                    listBoxPagesToAdd.Items.Add("https://foundationdb.com/");
                    listBoxPagesToAdd.Items.Add("http://www.oracle.com/us/products/database/index.html");
                    listBoxPagesToAdd.Items.Add("http://www-01.ibm.com/software/data/db2/");
                    listBoxPagesToAdd.Items.Add("http://www.versant.com/");
                    listBoxPagesToAdd.Items.Add("http://web.progress.com/en/objectstore/");
                    listBoxPagesToAdd.Items.Add("https://www.mongodb.org/");
                    listBoxPagesToAdd.Items.Add("http://cassandra.apache.org/");
                    listBoxPagesToAdd.Items.Add("http://www.sybase.com/");
                    listBoxPagesToAdd.Items.Add("http://www.mcobject.com/perst");
                    listBoxPagesToAdd.Items.Add("http://www.marklogic.com/what-is-marklogic/");
                    listBoxPagesToAdd.Items.Add("http://hamsterdb.com/");
                    listBoxPagesToAdd.Items.Add("http://www.firebirdsql.org/");
                    listBoxPagesToAdd.Items.Add("http://www.h2database.com/");
                    listBoxPagesToAdd.Items.Add("http://www.oracle.com/technology/products/berkeley-db");
                    listBoxPagesToAdd.Items.Add("http://www.scimore.com/");
                    listBoxPagesToAdd.Items.Add("http://www.stsdb.com/");
                    listBoxPagesToAdd.Items.Add("http://www.sqlite.org/about.html");
                    listBoxPagesToAdd.Items.Add("http://www.mysql.com/products/enterprise/techspec.html");
                    listBoxPagesToAdd.Items.Add("http://www.objectivity.com");
                    listBoxPagesToAdd.Items.Add("http://vistadb.net/");
                    listBoxPagesToAdd.Items.Add("http://www.google.com/search?q=object+database&sourceid=ie7&rls=com.microsoft:en-us:IE-SearchBox&ie=&oe=");
                }
                indexRoot.Persist(session, indexRoot);
            }
            else
            {
                // Databases already exist: open the persisted IndexRoot (database 1, page 1, slot 1).
                indexRoot = (IndexRoot)session.Open(Oid.Encode(IndexRoot.PlaceInDatabase, 1, 1));
            }

            if (indexRoot.repository.documentSet.Count > 0)
            {
                // Show at most the first 50 already-indexed documents in the UI.
                List <Document> docs = indexRoot.repository.documentSet.ToList <Document>().Take(50).ToList <Document>();
                inDbListBox.ItemsSource = docs;
            }
            updateDataGrids(indexRoot);
            session.Commit();
            //verify();
        }
Пример #31
0
        static void importEntireWikipedia()
        {
            const ushort btreeNodeSize = 10000;

            Console.WriteLine(DateTime.Now.ToString() + ", start importing Wikipedia text");
            //System.Xml.Schema.XmlSchema docSchema;
            //using (System.Xml.XmlTextReader schemaReader = new System.Xml.XmlTextReader("c:\\export-0_5.xsd"))
            //{
            //  docSchema = System.Xml.Schema.XmlSchema.Read(schemaReader, ValidationCallBack);
            // }
            int docCount = 0;

            using (SessionNoServer session = new SessionNoServer(s_systemDir, 5000, false, false, CacheEnum.No)) // turn of page and object caching
            {
                Console.WriteLine("Running with databases in directory: " + session.SystemDirectory);
                //GCSettings.LatencyMode = GCLatencyMode.Batch;// try to keep the WeakIOptimizedPersistableReference objects around longer
                Placement     documentPlacement = new Placement(Document.PlaceInDatabase, 1003, 1, 500, 1000, false, false, 1000, false);
                Placement     contentPlacement  = new Placement(Document.PlaceInDatabase, 1, 1, 500, UInt16.MaxValue, false, false, 1, false);
                XmlComment    xmlComment;
                XmlElement    xmlElement;
                XmlEntity     xmlEntity;
                XmlText       xmlText;
                XmlWhitespace xmlWhitespace;
                session.BeginUpdate();
                File.Copy(s_licenseDbFile, System.IO.Path.Combine(session.SystemDirectory, "4.odb"), true);
                // register all database schema classes used by the application in advance to avoid lock conflict later in parallell indexing
                session.RegisterClass(typeof(Repository));
                session.RegisterClass(typeof(IndexRoot));
                session.RegisterClass(typeof(Document));
                session.RegisterClass(typeof(Lexicon));
                session.RegisterClass(typeof(DocumentText));
                session.RegisterClass(typeof(Word));
                session.RegisterClass(typeof(WordGlobal));
                session.RegisterClass(typeof(WordHit));
                session.RegisterClass(typeof(BTreeSet <Document>));
                session.RegisterClass(typeof(OidShort));
                session.RegisterClass(typeof(BTreeMap <Word, WordHit>));
                session.RegisterClass(typeof(HashCodeComparer <Word>));
                session.RegisterClass(typeof(BTreeSetOidShort <Word>));
                session.RegisterClass(typeof(BTreeMapOidShort <Word, WordHit>));
                Database db = session.OpenDatabase(IndexRoot.PlaceInDatabase, false, false);
                if (db != null)
                {
                    outputSomeInfo(session);
                    session.Abort();
                    return;
                }
                session.NewDatabase(IndexRoot.PlaceInDatabase, 0, "IndexRoot");
                session.NewDatabase(Lexicon.PlaceInDatabase, 0, "Lexicon");
                session.NewDatabase(Repository.PlaceInDatabase, 0, "Repository");
                for (UInt32 i = 40; i <= 186; i++)
                {
                    session.NewDatabase(i, 512, "Document"); // pre allocate 146 Document databases presized to 512MB each
                }
                //session.SetTraceDbActivity(Lexicon.PlaceInDatabase);
                //session.SetTraceAllDbActivity();
                XmlDocument xmlDocument = new XmlDocument("enwiki-latest-pages-articles.xml");
                IndexRoot   indexRoot   = new IndexRoot(btreeNodeSize, session);
                indexRoot.Persist(session, indexRoot, true);
                Document doc          = null;
                bool     titleElement = false;
                bool     pageText     = false;
                UInt32   currentDocumentDatabaseNum = documentPlacement.StartDatabaseNumber;
                using (FileStream fs = new FileStream(s_wikipediaXmlFile, FileMode.Open))
                {
                    //using (GZipStream zipStream = new GZipStream(fs, CompressionMode.Decompress)) // if input was a .gz file
                    {
                        using (System.Xml.XmlTextReader textReader = new System.Xml.XmlTextReader(fs))
                        {
                            while (textReader.Read())
                            {
                                System.Xml.XmlNodeType nodeType = textReader.NodeType;
                                switch (nodeType)
                                {
                                case System.Xml.XmlNodeType.Attribute:
                                    break;

                                case System.Xml.XmlNodeType.CDATA:
                                    break;

                                case System.Xml.XmlNodeType.Comment:
                                    xmlComment = new XmlComment(textReader.Value, xmlDocument);
                                    break;

                                case System.Xml.XmlNodeType.Document:
                                    break;

                                case System.Xml.XmlNodeType.DocumentFragment:
                                    break;

                                case System.Xml.XmlNodeType.DocumentType:
                                    break;

                                case System.Xml.XmlNodeType.Element:
                                    xmlElement = new XmlElement(textReader.Prefix, textReader.LocalName, textReader.NamespaceURI, xmlDocument);
                                    if (textReader.LocalName == "title")
                                    {
                                        titleElement = true;
                                    }
                                    else if (textReader.LocalName == "text")
                                    {
                                        pageText = true;
                                    }
                                    break;

                                case System.Xml.XmlNodeType.EndElement:
                                    if (textReader.LocalName == "title" && doc != null)
                                    {
                                        titleElement = false;
                                    }
                                    else if (textReader.LocalName == "text" && doc != null)
                                    {
                                        pageText = false;
                                    }
                                    break;

                                case System.Xml.XmlNodeType.EndEntity:
                                    break;

                                case System.Xml.XmlNodeType.Entity:
                                    xmlEntity = new XmlEntity(textReader.LocalName, xmlDocument);
                                    break;

                                case System.Xml.XmlNodeType.EntityReference:
                                    break;

                                case System.Xml.XmlNodeType.None:
                                    break;

                                case System.Xml.XmlNodeType.Notation:
                                    break;

                                case System.Xml.XmlNodeType.ProcessingInstruction:
                                    break;

                                case System.Xml.XmlNodeType.SignificantWhitespace:
                                    break;

                                case System.Xml.XmlNodeType.Text:
                                    xmlText = new XmlText(textReader.Value, xmlDocument);
                                    if (titleElement)
                                    {
                                        doc = new Document(textReader.Value, indexRoot, session);
                                        doc.Persist(documentPlacement, session, true);
                                        if (doc.DatabaseNumber != currentDocumentDatabaseNum)
                                        {
                                            session.FlushUpdates(session.OpenDatabase(currentDocumentDatabaseNum));
                                            Console.WriteLine("Database: " + currentDocumentDatabaseNum + " is completed, done importing article " + docCount + " number of lines: " + textReader.LineNumber);
                                            currentDocumentDatabaseNum = doc.DatabaseNumber;
                                        }
                                        //doc.Page.Database.Name = doc.Name;
                                    }
                                    else if (doc != null && pageText)
                                    {
#if DEBUGx
                                        Console.WriteLine(doc.Name + " line: " + textReader.LineNumber);
#endif
                                        //if (textReader.LineNumber > 1000000)
                                        //{
                                        //  session.Commit();
                                        //  return;
                                        //}
                                        DocumentText content = new DocumentText(textReader.Value, doc);
                                        if (doc.DatabaseNumber != contentPlacement.TryDatabaseNumber)
                                        {
                                            contentPlacement = new Placement(doc.DatabaseNumber, (ushort)contentPlacement.StartPageNumber, 1, contentPlacement.MaxObjectsPerPage, contentPlacement.MaxPagesPerDatabase, false, false, 1, false);
                                        }
                                        content.Persist(contentPlacement, session, false);
                                        Debug.Assert(content.DatabaseNumber == doc.DatabaseNumber);
                                        doc.Content = content;
                                        indexRoot.repository.documentSet.AddFast(doc);
                                        if (++docCount % 1000000 == 0)
                                        {
                                            //session.Commit(false); // skip recovery check, we do it in BeginUpdate which is enough
                                            Console.WriteLine("Done importing article " + docCount + " number of lines: " + textReader.LineNumber);
                                            //session.BeginUpdate();
                                        }
                                    }
                                    break;

                                case System.Xml.XmlNodeType.Whitespace:
                                    xmlWhitespace = new XmlWhitespace(textReader.Value, xmlDocument);
                                    break;

                                case System.Xml.XmlNodeType.XmlDeclaration:
                                    break;
                                }
                                ;
                            }
                            Console.WriteLine("Finished importing article " + docCount + " number of lines: " + textReader.LineNumber);
                        }
                    }
                }
                session.Commit();
            }
            Console.WriteLine(DateTime.Now.ToString() + ", done importing Wikipedia text");
        }
Пример #32
0
        /// <summary>
        /// Rebuilds the word-count DataGrids shown in <c>stackPanel</c>: one grid for the
        /// global lexicon followed by one grid per indexed document, each sorted by
        /// descending count and filtered by the minimum count taken from <c>wordMinCt</c>.
        /// </summary>
        /// <param name="indexRoot">Root of the index; method is a no-op when null or when the lexicon is empty.</param>
        /// <param name="indexOfRemoved">When >= 0 during a refresh, only the global grid is rebuilt and the
        /// stale child at that position (offset by the newly inserted grid) is removed instead of
        /// regenerating every per-document grid.</param>
        void updateDataGrids(IndexRoot indexRoot, int indexOfRemoved = -1)
        {
            if (indexRoot == null)
            {
                return;
            }
            if (indexRoot.lexicon.WordSet.Count == 0)
            {
                return;
            }
            // Disable the panel while rebuilding so the user can't interact with half-built grids.
            stackPanel.IsEnabled = false;
            bool aRefresh = stackPanel.Children.Count > 0;

            if (indexOfRemoved >= 0 && aRefresh)
            {
                stackPanel.Children.RemoveAt(0);
            }
            else if (stackPanel.Children.Count > 0)
            {
                stackPanel.Children.Clear();
            }
            DataGrid dataGrid = new DataGrid();

            dataGrid.AutoGenerateColumns = true;
            dataGrid.MaxColumnWidth      = 150;
            dataGridList.Add(dataGrid);
            DataTable table = new DataTable("Word Count");
            // typeof(...) instead of Type.GetType("System.String") — same runtime Type,
            // but verified at compile time rather than by string lookup.
            DataColumn wordColumn  = new DataColumn("Words (all pages)", typeof(string));
            DataColumn countColumn = new DataColumn("Count", typeof(uint));

            table.Columns.Add(wordColumn);
            table.Columns.Add(countColumn);
            DataRow newRow;
            int     pageIndex = 0;
            int     min;

            // BUG FIX: int.TryParse sets its out argument to 0 on failure, so the previous
            // "int min = 3; int.TryParse(..., out min);" silently discarded the intended
            // default of 3 whenever the text box held non-numeric text. Restore it explicitly.
            if (!int.TryParse(wordMinCt.Text, out min))
            {
                min = 3;
            }
            foreach (Word word in indexRoot.lexicon.WordSet)
            {
                if (word.GlobalCount >= min)
                {
                    newRow    = table.NewRow();
                    newRow[0] = word.aWord;
                    newRow[1] = word.GlobalCount;
                    table.Rows.Add(newRow);
                }
            }
            DataView dataView = new DataView(table);

            dataView.Sort        = "Count desc";
            dataGrid.ItemsSource = dataView;
            stackPanel.Children.Insert(pageIndex++, dataGrid);
            if (indexOfRemoved >= 0 && aRefresh)
            {
                // The insert above shifted children down by one, hence the +1.
                stackPanel.Children.RemoveAt(indexOfRemoved + 1);
            }
            else
            {
                // Materialize once; the original called ToList twice, copying the list redundantly.
                List <Document> docs = indexRoot.repository.documentSet.ToList <Document>();
                foreach (Document page in docs)
                {
                    DataTable pageTable = new DataTable();
                    dataTableList.Add(pageTable);
                    // Derive a display name from the URL: strip scheme ("//"), "www.",
                    // trailing '/', then turn separators into spaces.
                    string pageName = page.url.TrimEnd('/');
                    int    index    = pageName.IndexOf("//");
                    if (index >= 0)
                    {
                        pageName = pageName.Remove(0, index + 2);
                    }
                    index = pageName.IndexOf("www.");
                    if (index >= 0)
                    {
                        pageName = pageName.Remove(0, index + 4);
                    }
                    pageName = pageName.Replace('.', ' ');
                    pageName = pageName.Replace('/', ' ');
                    DataColumn wordColumnPage  = new DataColumn(pageName, typeof(string));
                    DataColumn countColumnPage = new DataColumn("Count", typeof(int));
                    pageTable.Columns.Add(wordColumnPage);
                    pageTable.Columns.Add(countColumnPage);
                    foreach (KeyValuePair <Word, WordHit> pair in page.WordHit)
                    {
                        if ((int)pair.Value.Count >= min)
                        {
                            newRow = pageTable.NewRow();
                            string aString = pair.Key.aWord;
                            newRow.SetField <string>(wordColumnPage, aString);
                            newRow.SetField <int>(countColumnPage, (int)pair.Value.Count);
                            pageTable.Rows.Add(newRow);
                        }
                    }
                    dataGrid = new DataGrid();
                    dataGrid.AutoGenerateColumns = true;
                    dataGrid.MaxColumnWidth      = 150;
                    dataGridList.Add(dataGrid);
                    dataView             = new DataView(pageTable);
                    dataView.Sort        = "Count desc";
                    dataGrid.ItemsSource = dataView;
                    stackPanel.Children.Insert(pageIndex++, dataGrid);
                }
            }
            stackPanel.IsEnabled = true;
        }
Пример #33
0
        /// <summary>
        /// Parses one NTFS MFT FILE record from <paramref name="rawBytes"/>.
        /// Reads the record header, verifies and applies the fixup (update sequence)
        /// values, then walks the attribute list, materializing a typed object for
        /// each attribute into <c>Attributes</c>.
        /// NOTE(review): the fixup verification below reads fixed offsets 0x1FE/0x3FE,
        /// i.e. it assumes a 1024-byte record with 512-byte sectors, and the fixup
        /// values are read from fixed offset 0x30 rather than from FixupOffset —
        /// confirm these assumptions hold for all inputs this is fed.
        /// </summary>
        /// <param name="rawBytes">Raw record bytes; mutated in place when fixup bytes are restored.</param>
        /// <param name="offset">Byte offset of this record in the source, used for logging and stored in Offset.</param>
        public FileRecord(byte[] rawBytes, int offset)
        {
            Offset = offset;
            var sig = BitConverter.ToInt32(rawBytes, 0);

            // Accept FILE, BAAD, or an all-zero signature; anything else is logged and skipped.
            if ((sig != _fileSig) && (sig != _baadSig) && (sig != 0x0))
            {
                Logger.Fatal($"Invalid signature! 0x{sig:X}");
                return;
                //throw new Exception("Invalid signature!");
            }

            if (sig == _baadSig)
            {
                // BAAD records are logged but not parsed.
                Logger.Warn($"Bad signature at offset 0x{offset:X}");
                return;
            }

            Attributes = new List <Attribute>();

            // --- fixed-layout header fields ---
            FixupOffset     = BitConverter.ToInt16(rawBytes, 2);
            FixupEntryCount = BitConverter.ToInt16(rawBytes, 4);

            LogSequenceNumber = BitConverter.ToInt64(rawBytes, 0x8);

            SequenceNumber = BitConverter.ToInt16(rawBytes, 0x10);

            ReferenceCount = BitConverter.ToInt16(rawBytes, 0x12);

            FirstAttributeOffset = BitConverter.ToInt16(rawBytes, 0x14);

            EntryFlags = (EntryFlag)BitConverter.ToInt16(rawBytes, 0x16);

            Logger.Trace($"Entry flags: {EntryFlags}");

            ActualRecordSize = BitConverter.ToInt32(rawBytes, 0x18);

            AllocatedRecordSize = BitConverter.ToInt32(rawBytes, 0x1c);

            var entryBytes = new byte[8];

            Buffer.BlockCopy(rawBytes, 0x20, entryBytes, 0, 8);

            MFTRecordToBaseRecord = new MftEntryInfo(entryBytes);

            FirstAvailableAttribueId = BitConverter.ToInt16(rawBytes, 0x28);

            EntryNumber = BitConverter.ToInt32(rawBytes, 0x2c);

            // Fixup: expected value plus the two original sector-end byte pairs.
            var fixupExpectedBytes = new byte[2];
            var fixupActual1       = new byte[2];
            var fixupActual2       = new byte[2];

            Buffer.BlockCopy(rawBytes, 0x30, fixupExpectedBytes, 0, 2);
            Buffer.BlockCopy(rawBytes, 0x32, fixupActual1, 0, 2);
            Buffer.BlockCopy(rawBytes, 0x34, fixupActual2, 0, 2);

            //verify this record looks ok based on fixup bytes
            //0x1FE and 0x3fe

            var expectedFixupVal = BitConverter.ToInt16(fixupExpectedBytes, 0);
            var x1FeValue        = BitConverter.ToInt16(rawBytes, 0x1FE);
            var x3FeValue        = BitConverter.ToInt16(rawBytes, 0x3FE);

            // Only warn about mismatches for in-use records; free records commonly have stale fixups.
            if ((x1FeValue != expectedFixupVal) &&
                ((EntryFlags & EntryFlag.FileRecordSegmentInUse) == EntryFlag.FileRecordSegmentInUse))
            {
                Logger.Warn(
                    $"FILE record at offset 0x{offset:X}! Fixup values do not match at 0x1FE. Expected: {expectedFixupVal}, actual: {x1FeValue}, EntryFlags: {EntryFlags}");
            }

            if ((x3FeValue != expectedFixupVal) &&
                ((EntryFlags & EntryFlag.FileRecordSegmentInUse) == EntryFlag.FileRecordSegmentInUse))
            {
                Logger.Warn(
                    $"FILE record at offset 0x{offset:X}! Fixup values do not match at 0x3FE. Expected: {expectedFixupVal}, actual: {x3FeValue}, EntryFlags: {EntryFlags}");
            }

            //header is done, replace fixup bytes with actual bytes
            //0x1fe and 0x3fe should contain fixup bytes
            // This mutation must happen BEFORE attribute parsing so attributes spanning
            // a sector boundary see the real data bytes.

            Buffer.BlockCopy(fixupActual1, 0, rawBytes, 0x1fe, 2);
            Buffer.BlockCopy(fixupActual2, 0, rawBytes, 0x3fe, 2);

            //start attribute processing at FirstAttributeOffset

            var index = (int)FirstAttributeOffset;

            while (index < ActualRecordSize)
            {
                var attrType = BitConverter.ToInt32(rawBytes, index);

                var attrSize = BitConverter.ToInt32(rawBytes, index + 4);

                if ((attrSize == 0) || (attrType == -1))
                {
                    index += 8;          //skip -1 type and 0 size

                    if (EntryFlags == 0) //this is a free record
                    {
                        break;
                    }

                    continue;
                }

                // Copy the attribute's bytes out so each attribute class parses an isolated buffer.
                var rawAttr = new byte[attrSize];
                Buffer.BlockCopy(rawBytes, index, rawAttr, 0, attrSize);

                switch ((AttributeType)attrType)
                {
                case AttributeType.StandardInformation:
                    var si = new StandardInfo(rawAttr);
                    Attributes.Add(si);

                    // Surface $STANDARD_INFORMATION timestamps as record-level properties.
                    SILastAccessedOn    = si.LastAccessedOn;
                    SICreatedOn         = si.CreatedOn;
                    SIRecordModifiedOn  = si.RecordModifiedOn;
                    SIContentModifiedOn = si.ContentModifiedOn;

                    break;

                case AttributeType.FileName:
                    var fi = new FileName(rawAttr);
                    Attributes.Add(fi);

                    // Prefer the Windows (long) name over DOS 8.3 names for FName.
                    if ((fi.FileInfo.NameType & NameTypes.Windows) == NameTypes.Windows)
                    {
                        FName = fi.FileInfo.FileName;
                    }

                    // $FILE_NAME timestamps are always captured (earlier versions only
                    // captured them when they differed from $STANDARD_INFORMATION).
                    FNLastAccessedOn = fi.FileInfo.LastAccessedOn;

                    FNCreatedOn = fi.FileInfo.CreatedOn;

                    FNRecordModifiedOn = fi.FileInfo.RecordModifiedOn;

                    FNContentModifiedOn = fi.FileInfo.ContentModifiedOn;

                    break;

                case AttributeType.Data:
                    var data = new Data(rawAttr);
                    Attributes.Add(data);
                    break;

                case AttributeType.IndexAllocation:
                    var ia = new IndexAllocation(rawAttr);
                    Attributes.Add(ia);
                    break;

                case AttributeType.IndexRoot:
                    var ir = new IndexRoot(rawAttr);
                    Attributes.Add(ir);
                    break;

                case AttributeType.Bitmap:
                    var bm = new Bitmap(rawAttr);
                    Attributes.Add(bm);
                    break;

                case AttributeType.VolumeVersionObjectId:
                    var oi = new ObjectId(rawAttr);
                    Attributes.Add(oi);
                    break;

                case AttributeType.SecurityDescriptor:
                    var sd = new SecurityDescriptor(rawAttr);
                    Attributes.Add(sd);

                    break;

                case AttributeType.VolumeName:
                    var vn = new VolumeName(rawAttr);
                    Attributes.Add(vn);
                    break;

                case AttributeType.VolumeInformation:
                    var vi = new VolumeInformation(rawAttr);
                    Attributes.Add(vi);
                    break;

                case AttributeType.LoggedUtilityStream:
                    var lus = new LoggedUtilityStream(rawAttr);
                    Attributes.Add(lus);
                    break;

                case AttributeType.ReparsePoint:
                    var rp = new ReparsePoint(rawAttr);
                    Attributes.Add(rp);
                    break;

                case AttributeType.AttributeList:
                    var al = new AttributeList(rawAttr);
                    Attributes.Add(al);
                    break;

                case AttributeType.Ea:
                    //TODO Finish this
                    var ea = new ExtendedAttribute(rawAttr);
                    Attributes.Add(ea);
                    break;

                case AttributeType.EaInformation:
                    var eai = new ExtendedAttributeInformation(rawAttr);
                    Attributes.Add(eai);
                    break;

                default:
                    // Fail loudly on unknown attribute types so new types get implemented.
                    Logger.Warn($"Unhandled attribute type! Add me: {(AttributeType) attrType}");
                    throw new Exception($"Add me: {(AttributeType) attrType}");
                    // FIX: removed unreachable 'break;' after throw (dead code, CS0162).
                }

                index += attrSize;
            }

            SlackStartOffset = index;

            //rest is slack. handle here?
            Logger.Trace($"Slack starts at {index} i+o: 0x{index + offset:X}");
        }