static void createInvertedIndex()
{
  using (SessionNoServer session = new SessionNoServer(s_systemDir))
  {
    session.BeginUpdate();
    session.EnableAutoPageFlush = false; // so that threads don't stomp on each other
    Console.WriteLine(DateTime.Now.ToString() + ", start creating inverted index");
    ParallelOptions pOptions = new ParallelOptions();
    pOptions.MaxDegreeOfParallelism = 2; // set to what is appropriate for your computer (cores & memory size)
    //pOptions.MaxDegreeOfParallelism = 1; // appears to work best with only 16GB of memory
    IndexRoot indexRoot = (IndexRoot)session.Open(Oid.Encode(IndexRoot.PlaceInDatabase, 1, 1));
    BTreeSet<Document> documentSet = indexRoot.repository.documentSet;
    List<Database> dbs = session.OpenAllDatabases(true);
    Parallel.ForEach<Database>(dbs, pOptions, (Database db, ParallelLoopState loop) => // method invoked by the loop on each iteration
    {
      if (db.DatabaseNumber >= Document.PlaceInDatabase)
        createDocumentInvertedIndex(session, db, documentSet);
    });
    session.Commit();
    Console.WriteLine(DateTime.Now.ToString() + ", done creating inverted index");
  }
}
public void createGlobalInvertedIndex(IndexRoot indexRoot)
{
  Placement wordPlacement = new Placement(Lexicon.PlaceInDatabase, 2);
  BTreeSetOidShort<Word> wordSet = indexRoot.lexicon.WordSet;
  BTreeSet<Document> docSet = indexRoot.repository.documentSet;
  Word existingWord = null;
  foreach (Document doc in docSet)
  {
    if (doc.Indexed == false)
    {
      foreach (Word word in doc.WordSet)
      {
        WordHit wordHit = doc.WordHit[word];
        if (wordSet.TryGetKey(word, ref existingWord))
          existingWord.GlobalCount = existingWord.GlobalCount + (uint)wordHit.Count;
        else
        {
          existingWord = new WordGlobal(word.aWord, session, (uint)wordHit.Count);
          existingWord.Persist(wordPlacement, session);
          wordSet.Add(existingWord);
        }
        existingWord.DocumentHit.AddFast(doc);
      }
      doc.Indexed = true;
    }
  }
}
public void textToWords(Document doc, IndexRoot indexRoot, string docTextString)
{
  DocumentText docText = new DocumentText(docTextString, doc);
  session.Persist(doc);
  doc.Page.Database.Name = doc.Name;
  session.Persist(docText);
  indexRoot.Repository.DocumentSet.Add(doc);
  doc.Content = docText;
  docTextString = docTextString.ToLower();
  string[] excludedWords = new string[] { "and", "the" };
  char[] splitChars = new char[] { ' ', '\n', '(', '"', '!', ',', '(', ')', '\t' };
  string[] words = docTextString.Split(splitChars, StringSplitOptions.RemoveEmptyEntries);
  int i = 0;
  string aWord;
  char[] trimEndChars = new char[] { ';', '.', '"', ',', '\r', ':', ']', '!', '?', '+', '(', ')', '\'', '{', '}', '-', '`', '/', '=' };
  char[] trimStartChars = new char[] { ';', '&', '-', '#', '*', '[', '.', '"', ',', '\r', ')', '(', '\'', '{', '}', '-', '`' };
  foreach (string wordStr in words)
  {
    i++;
    aWord = wordStr.TrimEnd(trimEndChars);
    aWord = aWord.TrimStart(trimStartChars);
    if (aWord.Length > 1 && excludedWords.Contains(aWord) == false)
      indexRoot.Lexicon.PossiblyAddToken(aWord, doc);
  }
}
public void textToWords(Document doc, IndexRoot indexRoot, string docTextString, Placement documentPlacement,
                        Placement documentTextPlacement, Placement wordPlacement, Placement wordHitPlacement)
{
  DocumentText docText = new DocumentText(docTextString, doc);
  Word word;
  doc.Persist(documentPlacement, session);
  doc.Page.Database.Name = doc.Name;
  docText.Persist(documentTextPlacement, session);
  indexRoot.repository.documentSet.Add(doc);
  doc.Content = docText;
  docTextString = docTextString.ToLower();
  string[] excludedWords = new string[] { "and", "the" };
  char[] splitChars = new char[] { ' ', '\n', '(', '"', '!', ',', '(', ')', '\t' };
  string[] words = docTextString.Split(splitChars, StringSplitOptions.RemoveEmptyEntries);
  UInt64 wordCt = 0;
  int i = 0;
  string aWord;
  char[] trimEndChars = new char[] { ';', '.', '"', ',', '\r', ':', ']', '!', '?', '+', '(', ')', '\'', '{', '}', '-', '`', '/', '=' };
  char[] trimStartChars = new char[] { ';', '&', '-', '#', '*', '[', '.', '"', ',', '\r', ')', '(', '\'', '{', '}', '-', '`' };
  foreach (string wordStr in words)
  {
    i++;
    aWord = wordStr.TrimEnd(trimEndChars);
    aWord = aWord.TrimStart(trimStartChars);
    word = new Word(aWord);
    if (aWord.Length > 1 && excludedWords.Contains(aWord) == false)
    {
      createLocalInvertedIndex(doc, word, wordCt, wordPlacement, wordHitPlacement);
      ++wordCt;
    }
  }
}
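Both textToWords overloads above share the same split-and-trim tokenization. The self-contained sketch below runs just that pipeline on a made-up sentence, reusing the split, trim, and stop-word sets from the code above; the sample text and class name are illustrative only, not part of the original project.

using System;
using System.Linq;

class TokenizeDemo
{
  static void Main()
  {
    // Same split/trim characters and stop words as textToWords above; the input text is made up.
    string text = "The quick (brown) fox, jumped; and ran!".ToLower();
    string[] excludedWords = { "and", "the" };
    char[] splitChars = { ' ', '\n', '(', '"', '!', ',', ')', '\t' };
    char[] trimEndChars = { ';', '.', '"', ',', '\r', ':', ']', '!', '?', '+', '(', ')', '\'', '{', '}', '-', '`', '/', '=' };
    char[] trimStartChars = { ';', '&', '-', '#', '*', '[', '.', '"', ',', '\r', ')', '(', '\'', '{', '}', '-', '`' };
    foreach (string w in text.Split(splitChars, StringSplitOptions.RemoveEmptyEntries))
    {
      string token = w.TrimEnd(trimEndChars).TrimStart(trimStartChars);
      if (token.Length > 1 && !excludedWords.Contains(token))
        Console.WriteLine(token); // quick, brown, fox, jumped, ran
    }
  }
}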
private void button1_Click(object sender, RoutedEventArgs e)
{
  session.BeginUpdate();
  IndexRoot indexRoot = (IndexRoot)session.Open(Oid.Encode(IndexRoot.PlaceInDatabase, 1, 1));
  Placement docPlacement = new Placement(Document.PlaceInDatabase);
  foreach (string str in listBoxPagesToAdd.Items)
  {
    Document doc = null;
    try
    {
      if (str.Contains(".html") || str.Contains(".htm") || str.Contains("http") || str.Contains("aspx"))
        doc = parseHtml(str, indexRoot);
      else
        doc = parseTextFile(str, indexRoot, docPlacement);
    }
    catch (WebException ex)
    {
      Console.WriteLine(ex.ToString());
    }
  }
  createGlobalInvertedIndex(indexRoot);
  listBoxPagesToAdd.Items.Clear();
  List<Document> docs = indexRoot.repository.documentSet.ToList<Document>().Take(50).ToList<Document>();
  inDbListBox.ItemsSource = docs;
  session.Commit();
  session.BeginRead();
  updateDataGrids(indexRoot);
  session.Commit();
}
public async Task<IndexRoot> Integrate(ICollection<VirtualFile> files)
{
  Utils.Log($"Integrating {files.Count} files");
  var allFiles = AllFiles.Concat(files).GroupBy(f => f.Name).Select(g => g.Last()).ToImmutableList();
  var byFullPath = Task.Run(() => allFiles.SelectMany(f => f.ThisAndAllChildren)
                                          .ToImmutableDictionary(f => f.FullPath));
  var byHash = Task.Run(() => allFiles.SelectMany(f => f.ThisAndAllChildren)
                                      .Where(f => f.Hash != null)
                                      .ToGroupedImmutableDictionary(f => f.Hash));
  var byName = Task.Run(() => allFiles.SelectMany(f => f.ThisAndAllChildren)
                                      .ToGroupedImmutableDictionary(f => f.Name));
  var byRootPath = Task.Run(() => allFiles.ToImmutableDictionary(f => f.Name));
  var result = new IndexRoot(allFiles, await byFullPath, await byHash, await byRootPath, await byName);
  Utils.Log($"Done integrating");
  return result;
}
private void CreateRoot(string name)
{
  Root = new FileRecordSegmentHeader();
  Root.CreateFileRecordHeader(Enums.MFTEntryFlags.FileNameIndexPresent, null);
  StandardInformation stdInfo = new StandardInformation();
  stdInfo.CreateStandInfoFile(FileAttributes.Normal);
  Root.UsedEntrySize += stdInfo.RecordLength;
  IndexRoot indRoot = new IndexRoot();
  //Root.UsedEntrySize += (uint)indRoot.numberOfChildren * 8; // reference is 8 B
  FileName fileName = new FileName();
  fileName.CreateFileNameFile(name);
  fileName.RealSize = fileName.AllocatedSize = 0;
  Root.UsedEntrySize += fileName.RecordLength;
  Root.attributes.Add(stdInfo);
  Root.attributes.Add(fileName);
  Root.attributes.Add(indRoot);
  BytesOccupied += Root.UsedEntrySize;
}
public async Task<IndexRoot> Integrate(ICollection<VirtualFile> files)
{
  Utils.Log($"Integrating {files.Count} files");
  var allFiles = AllFiles.Concat(files)
                         .OrderByDescending(f => f.LastModified)
                         .GroupBy(f => f.FullPath).Select(g => g.Last())
                         .ToList();
  var byFullPath = Task.Run(() => allFiles.SelectMany(f => f.ThisAndAllChildren)
                                          .ToDictionary(f => f.FullPath));
  var byHash = Task.Run(() => allFiles.SelectMany(f => f.ThisAndAllChildren)
                                      .Where(f => f.Hash != Hash.Empty)
                                      .ToLookup(f => f.Hash));
  var byName = Task.Run(() => allFiles.SelectMany(f => f.ThisAndAllChildren)
                                      .ToLookup(f => f.Name));
  var byRootPath = Task.Run(() => allFiles.ToDictionary(f => f.AbsoluteName));
  var result = new IndexRoot(allFiles, await byFullPath, await byHash, await byRootPath, await byName);
  Utils.Log($"Done integrating");
  return result;
}
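The two Integrate overloads above build the same four lookups; the later one sorts by LastModified descending, groups by FullPath, and takes g.Last() from each group. Because Enumerable.GroupBy preserves source order inside each group, g.Last() after a descending sort is the last-sorted (oldest) element of the group. The stand-alone LINQ sketch below shows that selection in isolation; the FileStub record, paths, and dates are made up for illustration.

using System;
using System.Linq;

record FileStub(string FullPath, DateTime LastModified);

class DedupDemo
{
  static void Main()
  {
    var files = new[]
    {
      new FileStub(@"mods\a.esp", new DateTime(2020, 1, 1)),
      new FileStub(@"mods\a.esp", new DateTime(2021, 6, 1)), // duplicate path, newer timestamp
      new FileStub(@"mods\b.esp", new DateTime(2019, 3, 5)),
    };
    // Same idiom as the later Integrate: sort descending, group by path, take g.Last().
    var deduped = files.OrderByDescending(f => f.LastModified)
                       .GroupBy(f => f.FullPath)
                       .Select(g => g.Last())
                       .ToList();
    foreach (var f in deduped)
      Console.WriteLine($"{f.FullPath} {f.LastModified:yyyy-MM-dd}"); // mods\a.esp 2020-01-01, mods\b.esp 2019-03-05
  }
}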
public Index(File file, string name, BiosParameterBlock bpb, UpperCase upCase)
{
  _file = file;
  _name = name;
  _bpb = bpb;
  _isFileIndex = name == "$I30";
  _blockCache = new ObjectCache<long, IndexBlock>();
  _root = _file.GetStream(AttributeType.IndexRoot, _name).GetContent<IndexRoot>();
  _comparer = _root.GetCollator(upCase);
  using (Stream s = _file.OpenStream(AttributeType.IndexRoot, _name, FileAccess.Read))
  {
    byte[] buffer = Utilities.ReadFully(s, (int)s.Length);
    _rootNode = new IndexNode(WriteRootNodeToDisk, 0, this, true, buffer, IndexRoot.HeaderOffset);
    // Give the attribute some room to breathe, so long as it doesn't squeeze others out
    // BROKEN, BROKEN, BROKEN - how to figure this out? Query at the point of adding entries to the root node?
    _rootNode.TotalSpaceAvailable += _file.MftRecordFreeSpace(AttributeType.IndexRoot, _name) - 100;
  }
  if (_file.StreamExists(AttributeType.IndexAllocation, _name))
  {
    _indexStream = _file.OpenStream(AttributeType.IndexAllocation, _name, FileAccess.ReadWrite);
  }
  if (_file.StreamExists(AttributeType.Bitmap, _name))
  {
    _indexBitmap = new Bitmap(_file.OpenStream(AttributeType.Bitmap, _name, FileAccess.ReadWrite), long.MaxValue);
  }
}
private void MenuItem_Click(object sender, RoutedEventArgs e)
{
  MenuItem menuItem = (MenuItem)e.Source;
  Document myItem = (Document)menuItem.DataContext;
  session.BeginUpdate();
  try
  {
    IndexRoot indexRoot = (IndexRoot)session.Open(Oid.Encode(IndexRoot.PlaceInDatabase, 1, 1));
    int index;
    if (indexRoot.repository.documentSet.TryGetKey(myItem, ref myItem))
      index = myItem.Remove(indexRoot, session);
    else
      index = -1; // weird case - should not happen
    inDbListBox.ItemsSource = indexRoot.repository.documentSet.ToList<Document>();
    updateDataGrids(indexRoot, index);
    session.Commit();
  }
  catch
  {
    session.Abort();
  }
}
static void createTopLevelInvertedIndex()
{
  Console.WriteLine(DateTime.Now.ToString() + ", start creating top level inverted index");
  using (SessionNoServer session = new SessionNoServer(s_systemDir))
  {
    Placement wordPlacement = new Placement(Lexicon.PlaceInDatabase, 2, 1, 1000, 50000, true, false, UInt32.MaxValue, false);
    session.BeginUpdate();
    IndexRoot indexRoot = (IndexRoot)session.Open(Oid.Encode(IndexRoot.PlaceInDatabase, 1, 1));
    BTreeSetOidShort<Word> wordSet = indexRoot.lexicon.WordSet;
    BTreeSet<Document> documentSet = indexRoot.repository.documentSet;
    Word existingWord = null;
    foreach (Document doc in documentSet)
    {
      foreach (Word word in doc.WordSet)
      {
        WordHit wordHit = doc.WordHit[word];
        if (wordSet.TryGetKey(word, ref existingWord))
          existingWord.GlobalCount = existingWord.GlobalCount + (uint)wordHit.Count;
        else
        {
          existingWord = new WordGlobal(word.aWord, session, (uint)wordHit.Count);
          existingWord.Persist(wordPlacement, session);
          indexRoot.lexicon.WordSet.Add(existingWord);
        }
        existingWord.DocumentHit.AddFast(doc);
      }
      doc.Indexed = true;
    }
    session.Commit();
    Console.WriteLine(DateTime.Now.ToString() + ", done creating top level inverted index");
  }
}
public Document parseTextFile(string url, IndexRoot indexRoot)
{
  Document doc = new Document(Path.GetFileName(url), indexRoot, session);
  using (StreamReader reader = new StreamReader(url))
  {
    textToWords(doc, indexRoot, reader.ReadToEnd());
  }
  return doc;
}
public Database(string dataDir, string indexDir)
{
  Directory.CreateDirectory(dataDir);
  var logPath = Path.Combine(dataDir, "log.dat");
  var datafilePath = Path.Combine(dataDir, "data.dat");
  _wal = new Wal(logPath);
  _dataFile = new DataFile(datafilePath);
  _index = new IndexRoot(indexDir);
  Recover().GetAwaiter().GetResult();
}
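A minimal usage sketch of the Database constructor above; the directory paths are made up. The point to notice is that recovery runs synchronously inside the constructor (Recover().GetAwaiter().GetResult()), so the instance is ready as soon as the constructor returns.

// Hypothetical caller; the paths are illustrative only.
var db = new Database(dataDir: @"C:\storage\data", indexDir: @"C:\storage\index");
// log.dat and data.dat are placed under dataDir, the IndexRoot under indexDir,
// and Recover() has already completed by this point.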
public Document parseTextFile(string url, IndexRoot indexRoot, Placement docPlacement)
{
  Document doc = new Document(Path.GetFileName(url), indexRoot, session);
  Placement docTextPlacement = new Placement(docPlacement.TryDatabaseNumber, (ushort)(docPlacement.TryPageNumber + 1));
  Placement wordPlacement = new Placement(docPlacement.TryDatabaseNumber, (ushort)(docPlacement.TryPageNumber + 2));
  Placement wordHitPlacement = new Placement(docPlacement.TryDatabaseNumber, (ushort)(docPlacement.TryPageNumber + 10));
  using (StreamReader reader = new StreamReader(url))
  {
    textToWords(doc, indexRoot, reader.ReadToEnd(), docPlacement, docTextPlacement, wordPlacement, wordHitPlacement);
  }
  return doc;
}
static void outputSomeInfo(SessionNoServer session)
{
  IndexRoot indexRoot = (IndexRoot)session.Open(Oid.Encode(IndexRoot.PlaceInDatabase, 1, 1));
  BTreeSetOidShort<Word> wordSet = indexRoot.lexicon.WordSet;
  using (StreamWriter writer = new StreamWriter("Wikipedia.txt"))
  {
    writer.WriteLine("Number of words in Lexicon is: " + wordSet.Count);
    foreach (Word word in wordSet)
      writer.WriteLine(word.aWord + " " + word.DocumentHit.Count);
    writer.Close();
  }
}
private void updateWordTables_Click(object sender, RoutedEventArgs e)
{
  if (session != null)
  {
    try
    {
      session.BeginRead();
      IndexRoot indexRoot = (IndexRoot)session.Open(Oid.Encode(IndexRoot.PlaceInDatabase, 1, 1));
      updateDataGrids(indexRoot);
    }
    finally
    {
      session.Commit();
    }
  }
}
static void outputSomeInfo(SessionNoServer session)
{
  IndexRoot indexRoot = (IndexRoot)session.Open(Oid.Encode(IndexRoot.PlaceInDatabase, 1, 1));
  var wordHits = indexRoot.Lexicon.TokenMap;
  using (StreamWriter writer = new StreamWriter("Wikipedia.txt"))
  {
    writer.WriteLine("Number of words in Lexicon is: " + indexRoot.Lexicon.IdToValue.Count);
    foreach (var p in wordHits)
    {
      var word = indexRoot.Lexicon.IdToValue[p.Key];
      writer.WriteLine(word + " " + p.Value.Count);
    }
    writer.Close();
  }
}
public Document parseHtml(string url, IndexRoot indexRoot)
{
  Document doc = new Document(url, indexRoot, session);
  using (WebClient client = new WebClient())
  {
    string html = client.DownloadString(url);
    string pageBody = "";
    HtmlDocument htmlDoc = new HtmlDocument();
    htmlDoc.LoadHtml(html);
    foreach (HtmlNode node in htmlDoc.DocumentNode.SelectNodes("//text()"))
      pageBody += " " + node.InnerText;
    textToWords(doc, indexRoot, pageBody);
  }
  return doc;
}
static void CreateInvertedIndex()
{
  using (SessionNoServer session = new SessionNoServer(s_systemDir, 5000, false, false, CacheEnum.No)) // turn off page and object caching
  {
    session.BeginUpdate();
    session.RegisterClass(typeof(Repository));
    session.RegisterClass(typeof(IndexRoot));
    session.RegisterClass(typeof(Lexicon<string>));
    Console.WriteLine(DateTime.Now.ToString() + ", start creating inverted index");
    IndexRoot indexRoot = (IndexRoot)session.Open(Oid.Encode(IndexRoot.PlaceInDatabase, 1, 1));
    BTreeSet<Document> documentSet = indexRoot.Repository.DocumentSet;
    foreach (var doc in documentSet)
      createDocumentInvertedIndex(indexRoot, doc);
    session.Commit();
    Console.WriteLine(DateTime.Now.ToString() + ", done creating inverted index");
  }
}
public void verify(bool startTrans = true)
{
  if (startTrans) { session.BeginRead(); }
  IndexRoot indexRoot = (IndexRoot)session.Open(Oid.Encode(IndexRoot.PlaceInDatabase, 1, 1));
  int i = 0;
  int j = 0;
  int k = 0;
  foreach (var pair in indexRoot.Lexicon.TokenMap)
  {
    i++;
    foreach (Document doc in pair.Value)
    {
      j++;
      if (doc == null) { throw new UnexpectedException("bad documentHit BTreeSet"); }
      foreach (KeyValuePair<UInt32, UInt32> pair2 in doc.WordHit)
      {
        k++;
        if (pair2.Value == 0) { throw new UnexpectedException("bad document WordHit"); }
        if (indexRoot.Lexicon.IdToValue.Contains(pair2.Key) == false) { throw new UnexpectedException("missing lexicon word"); }
      }
    }
  }
  if (startTrans) { session.Commit(); }
}
public void verify(bool startTrans = true)
{
  if (startTrans) { session.BeginRead(); }
  IndexRoot indexRoot = (IndexRoot)session.Open(Oid.Encode(IndexRoot.PlaceInDatabase, 1, 1));
  int i = 0;
  int j = 0;
  int k = 0;
  foreach (Word word in indexRoot.lexicon.WordSet)
  {
    i++;
    foreach (Document doc in word.DocumentHit)
    {
      j++;
      if (doc == null) { throw new UnexpectedException("bad documentHit BTreeSet"); }
      foreach (KeyValuePair<Word, WordHit> pair in doc.WordHit)
      {
        k++;
        if (pair.Value == null || pair.Key == null) { throw new UnexpectedException("bad document WordHit"); }
        if (indexRoot.lexicon.WordSet.Contains(pair.Key) == false) { throw new UnexpectedException("missing lexicon word"); }
      }
    }
  }
  if (startTrans) { session.Commit(); }
}
static void createDocumentInvertedIndex(IndexRoot indexRoot, Document doc)
{
  if (!doc.Indexed)
  {
    DocumentText docText = doc.Content;
    string text = docText.Text.ToLower();
    MatchCollection tagMatches = Regex.Matches(text, "[a-z][a-z$]+");
    if (++s_docCountIndexed % 50000 == 0)
      Console.WriteLine(DateTime.Now.ToString() + ", done indexing article: " + s_docCountIndexed);
    foreach (Match m in tagMatches)
      indexRoot.Lexicon.PossiblyAddToken(m.Value, doc);
    if (s_docCountIndexed % 1000 == 0)
      Console.WriteLine(DateTime.Now.ToString() + ", done indexing article: " + s_docCountIndexed + " Database: " + doc.DatabaseNumber + " is completed.");
    doc.Indexed = true;
  }
}
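The regular expression above is the whole tokenizer for this indexer: it pulls runs of two or more lower-case letters (or $) from the lower-cased document text. The self-contained sketch below runs just that step on a made-up sentence; only the pattern comes from the code above.

using System;
using System.Text.RegularExpressions;

class RegexTokenDemo
{
  static void Main()
  {
    // Same pattern as createDocumentInvertedIndex above; the sample text is made up.
    string text = "Inverted indexes map words to documents.".ToLower();
    foreach (Match m in Regex.Matches(text, "[a-z][a-z$]+"))
      Console.WriteLine(m.Value); // inverted, indexes, map, words, to, documents
  }
}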
public Document parseHtml(string url, IndexRoot indexRoot)
{
  Document doc = new Document(url, indexRoot, session);
  Placement docPlacement = new Placement(Document.PlaceInDatabase);
  Placement docTextPlacement = new Placement(Document.PlaceInDatabase, 2);
  Placement wordPlacement = new Placement(Document.PlaceInDatabase, 3);
  Placement wordHitPlacement = new Placement(Document.PlaceInDatabase, 100);
  using (WebClient client = new WebClient())
  {
    string html = client.DownloadString(url);
    string pageBody = "";
    HtmlDocument htmlDoc = new HtmlDocument();
    htmlDoc.LoadHtml(html);
    foreach (HtmlNode node in htmlDoc.DocumentNode.SelectNodes("//text()"))
      pageBody += " " + node.InnerText;
    textToWords(doc, indexRoot, pageBody, docPlacement, docTextPlacement, wordPlacement, wordHitPlacement);
  }
  return doc;
}
//static void MessageWrite(string str)
//{
//  Console.Write(">" + "{0} {1}\n>", str, message1);
//  Console.ReadLine();
//}

static void CreateDirectory(FileRecordSegmentHeader parentFolder, FileRecordSegmentHeader childFolder, string name, NtfsFileSystem fs)
{
  childFolder.CreateFileRecordHeader(Enums.MFTEntryFlags.FileNameIndexPresent, parentFolder);
  StandardInformation stdInfo = new StandardInformation();
  stdInfo.CreateStandInfoFile(FileAttributes.Normal);
  childFolder.UsedEntrySize += stdInfo.RecordLength;
  IndexRoot indRoot = new IndexRoot();
  //childFolder.UsedEntrySize += indRoot.RecordLength;
  FileName fileName = new FileName();
  fileName.CreateFileNameFile(name);
  fileName.RealSize = fileName.AllocatedSize = 0;
  childFolder.UsedEntrySize += fileName.RecordLength;
  if (childFolder.UsedEntrySize >= (fs.VolumeSize - fs.BytesOccupied + 8))
  {
    Console.WriteLine("Volume is full!");
    Save(fs.VolName.Name, fs.Save());
    Environment.Exit(0);
  }
  childFolder.attributes.Add(stdInfo);
  childFolder.attributes.Add(fileName);
  childFolder.attributes.Add(indRoot);
  ((IndexRoot)parentFolder.attributes.ElementAt(2)).Children.Add(childFolder);
  ((IndexRoot)parentFolder.attributes.ElementAt(2)).numberOfChildren++;
  childFolder.parent = parentFolder;
  fs.BytesOccupied += childFolder.UsedEntrySize + 8;
  fs.DirectoryRecordCount++;
  parentFolder.UsedFolderSize += childFolder.UsedEntrySize;
}
private Index(AttributeType attrType, AttributeCollationRule collationRule, File file, string name, BiosParameterBlock bpb, UpperCase upCase)
{
  _file = file;
  _name = name;
  _bpb = bpb;
  _isFileIndex = name == "$I30";
  _blockCache = new ObjectCache<long, IndexBlock>();
  _file.CreateStream(AttributeType.IndexRoot, _name);
  _root = new IndexRoot()
  {
    AttributeType = (uint)attrType,
    CollationRule = collationRule,
    IndexAllocationSize = (uint)bpb.IndexBufferSize,
    RawClustersPerIndexRecord = bpb.RawIndexBufferSize
  };
  _comparer = _root.GetCollator(upCase);
  _rootNode = new IndexNode(WriteRootNodeToDisk, 0, this, true, 32);
}
static void ImportEntireWikipedia() { const ushort btreeNodeSize = 10000; Console.WriteLine(DateTime.Now.ToString() + ", start importing Wikipedia text"); //System.Xml.Schema.XmlSchema docSchema; //using (System.Xml.XmlTextReader schemaReader = new System.Xml.XmlTextReader("c:\\export-0_5.xsd")) //{ // docSchema = System.Xml.Schema.XmlSchema.Read(schemaReader, ValidationCallBack); // } int docCount = 0; using (SessionNoServer session = new SessionNoServer(s_systemDir, 5000, false, false, CacheEnum.No)) // turn of page and object caching { Console.WriteLine($"Running with databases in directory: {session.SystemDirectory}"); //GCSettings.LatencyMode = GCLatencyMode.Batch;// try to keep the WeakIOptimizedPersistableReference objects around longer XmlComment xmlComment; XmlElement xmlElement; XmlEntity xmlEntity; XmlText xmlText; XmlWhitespace xmlWhitespace; session.BeginUpdate(); // register all database schema classes used by the application in advance to avoid lock conflict later in parallel indexing Database db = session.OpenDatabase(IndexRoot.PlaceInDatabase, false, false); if (db != null) { outputSomeInfo(session); session.Abort(); return; } //session.SetTraceDbActivity(Lexicon.PlaceInDatabase); //session.SetTraceAllDbActivity(); XmlDocument xmlDocument = new XmlDocument("enwiki-latest-pages-articles.xml"); IndexRoot indexRoot = new IndexRoot(btreeNodeSize, session); indexRoot.Persist(session, indexRoot, true); UInt32 currentDocumentDatabaseNum = 0; Document doc = null; bool titleElement = false; bool pageText = false; using (FileStream fs = new FileStream(s_wikipediaXmlFile, FileMode.Open)) { //using (GZipStream zipStream = new GZipStream(fs, CompressionMode.Decompress)) // if input was a .gz file { using (System.Xml.XmlTextReader textReader = new System.Xml.XmlTextReader(fs)) { while (textReader.Read()) { System.Xml.XmlNodeType nodeType = textReader.NodeType; switch (nodeType) { case System.Xml.XmlNodeType.Attribute: break; case System.Xml.XmlNodeType.CDATA: break; case System.Xml.XmlNodeType.Comment: xmlComment = new XmlComment(textReader.Value, xmlDocument); break; case System.Xml.XmlNodeType.Document: break; case System.Xml.XmlNodeType.DocumentFragment: break; case System.Xml.XmlNodeType.DocumentType: break; case System.Xml.XmlNodeType.Element: xmlElement = new XmlElement(textReader.Prefix, textReader.LocalName, textReader.NamespaceURI, xmlDocument); if (textReader.LocalName == "title") { titleElement = true; } else if (textReader.LocalName == "text") { pageText = true; } break; case System.Xml.XmlNodeType.EndElement: if (textReader.LocalName == "title" && doc != null) { titleElement = false; } else if (textReader.LocalName == "text" && doc != null) { pageText = false; } break; case System.Xml.XmlNodeType.EndEntity: break; case System.Xml.XmlNodeType.Entity: xmlEntity = new XmlEntity(textReader.LocalName, xmlDocument); break; case System.Xml.XmlNodeType.EntityReference: break; case System.Xml.XmlNodeType.None: break; case System.Xml.XmlNodeType.Notation: break; case System.Xml.XmlNodeType.ProcessingInstruction: break; case System.Xml.XmlNodeType.SignificantWhitespace: break; case System.Xml.XmlNodeType.Text: xmlText = new XmlText(textReader.Value, xmlDocument); if (titleElement) { doc = new Document(textReader.Value, indexRoot, session); session.Persist(doc); if (doc.DatabaseNumber != currentDocumentDatabaseNum) { if (currentDocumentDatabaseNum > 0) { session.FlushUpdates(); Console.WriteLine("Database: " + currentDocumentDatabaseNum + " is completed, done importing article " + 
docCount + " number of lines: " + textReader.LineNumber); } currentDocumentDatabaseNum = doc.DatabaseNumber; } //doc.Page.Database.Name = doc.Name; } else if (doc != null && pageText) { #if DEBUGx Console.WriteLine(doc.Name + " line: " + textReader.LineNumber); #endif //if (textReader.LineNumber > 1000000) //{ // session.Commit(); // return; //} DocumentText content = new DocumentText(textReader.Value, doc); session.Persist(content, 10000); doc.Content = content; indexRoot.Repository.DocumentSet.AddFast(doc); if (++docCount % 1000000 == 0) { //session.Commit(false); // skip recovery check, we do it in BeginUpdate which is enough Console.WriteLine("Done importing article " + docCount + " number of lines: " + textReader.LineNumber); //session.BeginUpdate(); } } break; case System.Xml.XmlNodeType.Whitespace: xmlWhitespace = new XmlWhitespace(textReader.Value, xmlDocument); break; case System.Xml.XmlNodeType.XmlDeclaration: break; } ; } Console.WriteLine("Finished importing article " + docCount + " number of lines: " + textReader.LineNumber); } } } session.Commit(); } Console.WriteLine(DateTime.Now.ToString() + ", done importing Wikipedia text"); }
public FileRecord(byte[] rawBytes, int offset) { Offset = offset; var sig = BitConverter.ToInt32(rawBytes, 0); switch (sig) { case FileSig: break; case BaadSig: _logger.Debug($"Bad signature at offset 0x{offset:X}"); IsBad = true; return; default: //not initialized _logger.Debug($"Uninitialized entry (no signature) at offset 0x{offset:X}"); IsUninitialized = true; return; } _logger.Debug($"Processing FILE record at offset 0x{offset:X}"); Attributes = new List <Attribute>(); FixupOffset = BitConverter.ToInt16(rawBytes, 0x4); FixupEntryCount = BitConverter.ToInt16(rawBytes, 0x6); //to build fixup info, take FixupEntryCount x 2 bytes as each are 2 bytes long var fixupTotalLength = FixupEntryCount * 2; var fixupBuffer = new byte[fixupTotalLength]; Buffer.BlockCopy(rawBytes, FixupOffset, fixupBuffer, 0, fixupTotalLength); //pull this early so we can check if its free in our fix up value messages EntryFlags = (EntryFlag)BitConverter.ToInt16(rawBytes, 0x16); FixupData = new FixupData(fixupBuffer); FixupOk = true; //fixup verification var counter = 512; foreach (var bytese in FixupData.FixupActual) { //adjust the offset to where we need to check var fixupOffset = counter - 2; var expected = BitConverter.ToInt16(rawBytes, fixupOffset); if (expected != FixupData.FixupExpected && EntryFlags != 0x0) { FixupOk = false; _logger.Warn( $"Offset: 0x{Offset:X} Entry/seq: 0x{EntryNumber:X}/0x{SequenceNumber:X} Fixup values do not match at 0x{fixupOffset:X}. Expected: 0x{FixupData.FixupExpected:X2}, actual: 0x{expected:X2}"); } //replace fixup expected with actual bytes. bytese has actual replacement values in it. Buffer.BlockCopy(bytese, 0, rawBytes, fixupOffset, 2); counter += 512; } LogSequenceNumber = BitConverter.ToInt64(rawBytes, 0x8); SequenceNumber = BitConverter.ToUInt16(rawBytes, 0x10); ReferenceCount = BitConverter.ToInt16(rawBytes, 0x12); FirstAttributeOffset = BitConverter.ToInt16(rawBytes, 0x14); ActualRecordSize = BitConverter.ToInt32(rawBytes, 0x18); AllocatedRecordSize = BitConverter.ToInt32(rawBytes, 0x1c); var entryBytes = new byte[8]; Buffer.BlockCopy(rawBytes, 0x20, entryBytes, 0, 8); MftRecordToBaseRecord = new MftEntryInfo(entryBytes); FirstAvailablAttribueId = BitConverter.ToInt16(rawBytes, 0x28); EntryNumber = BitConverter.ToUInt32(rawBytes, 0x2c); //start attribute processing at FirstAttributeOffset var index = (int)FirstAttributeOffset; while (index < ActualRecordSize) { var attrType = (AttributeType)BitConverter.ToInt32(rawBytes, index); var attrSize = BitConverter.ToInt32(rawBytes, index + 4); if (attrSize == 0 || attrType == AttributeType.EndOfAttributes) { index += 8; //skip -1 type and 0 size if (index != ActualRecordSize) { _logger.Warn($"Slack space found in entry/seq: 0x{EntryNumber:X}/0x{SequenceNumber:X}"); } //TODO process slack here? 
break; } _logger.Debug( $"Found Attribute Type {attrType.ToString()} at absolute offset: 0x{index + offset:X}"); _logger.Trace( $"ActualRecordSize: 0x{ActualRecordSize:X}, size: 0x{attrSize:X}, index: 0x{index:X}"); var rawAttr = new byte[attrSize]; Buffer.BlockCopy(rawBytes, index, rawAttr, 0, attrSize); switch (attrType) { case AttributeType.StandardInformation: var si = new StandardInfo(rawAttr); Attributes.Add(si); break; case AttributeType.FileName: var fi = new FileName(rawAttr); Attributes.Add(fi); break; case AttributeType.Data: var d = new Data(rawAttr); Attributes.Add(d); break; case AttributeType.IndexAllocation: var ia = new IndexAllocation(rawAttr); Attributes.Add(ia); break; case AttributeType.IndexRoot: var ir = new IndexRoot(rawAttr); Attributes.Add(ir); break; case AttributeType.Bitmap: var bm = new Bitmap(rawAttr); Attributes.Add(bm); break; case AttributeType.VolumeVersionObjectId: var oi = new ObjectId_(rawAttr); Attributes.Add(oi); break; case AttributeType.SecurityDescriptor: var sd = new SecurityDescriptor(rawAttr); Attributes.Add(sd); break; case AttributeType.VolumeName: var vn = new VolumeName(rawAttr); Attributes.Add(vn); break; case AttributeType.VolumeInformation: var vi = new VolumeInformation(rawAttr); Attributes.Add(vi); break; case AttributeType.LoggedUtilityStream: var lus = new LoggedUtilityStream(rawAttr); Attributes.Add(lus); break; case AttributeType.ReparsePoint: try { var rp = new ReparsePoint(rawAttr); Attributes.Add(rp); } catch (Exception) { var l = LogManager.GetLogger("ReparsePoint"); l.Error( $"There was an error parsing a ReparsePoint in FILE record at offset 0x{Offset:X}. Please extract via --dd and --do and send to [email protected]"); } break; case AttributeType.AttributeList: var al = new AttributeList(rawAttr); Attributes.Add(al); break; case AttributeType.Ea: var ea = new ExtendedAttribute(rawAttr); Attributes.Add(ea); break; case AttributeType.EaInformation: var eai = new ExtendedAttributeInformation(rawAttr); Attributes.Add(eai); break; default: throw new Exception($"Add me: {attrType} (0x{attrType:X})"); } index += attrSize; } //rest is slack. handle here? _logger.Trace($"Slack starts at 0x{index:X} Absolute offset: 0x{index + offset:X}"); }
private bool SelfCheckIndexNode(byte[] buffer, int offset, Bitmap bitmap, IndexRoot root, string fileName, string indexName) { bool ok = true; IndexHeader header = new IndexHeader(buffer, offset); IndexEntry lastEntry = null; IComparer<byte[]> collator = root.GetCollator(_context.UpperCase); int pos = (int)header.OffsetToFirstEntry; while (pos < header.TotalSizeOfEntries) { IndexEntry entry = new IndexEntry(indexName == "$I30"); entry.Read(buffer, offset + pos); pos += entry.Size; if ((entry.Flags & IndexEntryFlags.Node) != 0) { long bitmapIdx = entry.ChildrenVirtualCluster / Utilities.Ceil(root.IndexAllocationSize, _context.BiosParameterBlock.SectorsPerCluster * _context.BiosParameterBlock.BytesPerSector); if (!bitmap.IsPresent(bitmapIdx)) { ReportError("Index entry {0} is non-leaf, but child vcn {1} is not in bitmap at index {2}", Index.EntryAsString(entry, fileName, indexName), entry.ChildrenVirtualCluster, bitmapIdx); } } if ((entry.Flags & IndexEntryFlags.End) != 0) { if (pos != header.TotalSizeOfEntries) { ReportError("Found END index entry {0}, but not at end of node", Index.EntryAsString(entry, fileName, indexName)); ok = false; } } if (lastEntry != null && collator.Compare(lastEntry.KeyBuffer, entry.KeyBuffer) >= 0) { ReportError("Found entries out of order {0} was before {1}", Index.EntryAsString(lastEntry, fileName, indexName), Index.EntryAsString(entry, fileName, indexName)); ok = false; } lastEntry = entry; } return ok; }
public static AttributeRecord ReadSingleAttribute(byte[] data, int maxLength, int offset = 0) { Debug.Assert(data.Length - offset >= maxLength); Debug.Assert(0 <= offset && offset <= data.Length); AttributeTypeCode TypeCode = GetTypeCode(data, offset); if (TypeCode == AttributeTypeCode.EndOfAttributes) { AttributeRecord tmpAR = new AttributeGeneric(); tmpAR.ReadARHeader(data, offset); return(tmpAR); } AttributeRecord attRecord; switch (TypeCode) { case AttributeTypeCode.STANDARD_INFORMATION: attRecord = new StandardInformation(); break; case AttributeTypeCode.ATTRIBUTE_LIST: attRecord = new AttributeList(); break; case AttributeTypeCode.FILE_NAME: attRecord = new FileName(); break; case AttributeTypeCode.OBJECT_ID: attRecord = new ObjectId(); break; // To complicated to quickly be implemented. Maybe one day. lol // case AttributeTypeCode.SECURITY_DESCRIPTOR: // attRecord = new SecurityDescriptor(); // break; case AttributeTypeCode.VOLUME_NAME: attRecord = new VolumeName(); break; case AttributeTypeCode.VOLUME_INFORMATION: attRecord = new VolumeInformation(); break; case AttributeTypeCode.DATA: attRecord = new Data(); break; case AttributeTypeCode.INDEX_ROOT: attRecord = new IndexRoot(); break; // INDEX_ALLOCATION is stored as non resident and this project deals only with resident files // case AttributeTypeCode.INDEX_ALLOCATION: // attRecord = new IndexAllocation(); // break; case AttributeTypeCode.BITMAP: attRecord = new Bitmap(); break; case AttributeTypeCode.EA_INFORMATION: attRecord = new ExtenedAttributeInformation(); break; case AttributeTypeCode.EA: attRecord = new ExtenedAttributes(); break; // PROPERTY_SET needs a pre NTFS 3.0 volume. This is probably obsolete! // case AttributeTypeCode.PROPERTY_SET: // attRecord = new PropertSet(); // break; case AttributeTypeCode.LOGGED_UTILITY_STREAM: attRecord = new LoggedUtilityStream(); break; default: // ?? could be a problem attRecord = new AttributeGeneric(); break; } attRecord.ReadARHeader(data, offset); if (attRecord.FormCode == ResidentFileFlag.Resident) { attRecord.ResidentHeader = AttributeResidentHeader.ReadHeader(data, offset + 16); int residentBodyOffset = offset + attRecord.ResidentHeader.ValueOffset; int length = offset + attRecord.RecordLength - residentBodyOffset; attRecord.ReadAttributeResident(data, length, residentBodyOffset); } else { throw new Exception("Could not read and process resident flag!\n"); } return(attRecord); }
public MainWindow() { const ushort btreeNodeSize = 5000; GCSettings.LatencyMode = GCLatencyMode.Batch;// try to keep the WeakIOptimizedPersistableReference objects around longer dataGridList = new List <DataGrid>(); dataTableList = new List <DataTable>(); InitializeComponent(); session = new SessionNoServer(s_systemDir); Placement placerIndexRoot = new Placement(IndexRoot.PlaceInDatabase); session.BeginUpdate(); Console.WriteLine("Running with databases in directory: " + session.SystemDirectory); File.Copy(s_licenseDbFile, Path.Combine(session.SystemDirectory, "4.odb"), true); IndexRoot indexRoot; Database db = session.OpenDatabase(IndexRoot.PlaceInDatabase, false, false); if (db == null) { session.NewDatabase(IndexRoot.PlaceInDatabase, 0, "IndexRoot"); session.NewDatabase(Lexicon.PlaceInDatabase, 0, "Lexicon"); session.NewDatabase(Document.PlaceInDatabase, 0, "Document"); session.NewDatabase(Repository.PlaceInDatabase, 0, "Repository"); session.NewDatabase(DocumentText.PlaceInDatabase, 0, "DocumentText"); session.NewDatabase(Word.PlaceInDatabase, 0, "Word"); indexRoot = new IndexRoot(btreeNodeSize, session); if (Directory.Exists(s_booksDir)) { string[] directoryTextFiles = Directory.GetFiles(s_booksDir, "*.txt"); foreach (string fileName in directoryTextFiles) { listBoxPagesToAdd.Items.Add(fileName); } } else { wordMinCt.Text = 1.ToString(); listBoxPagesToAdd.Items.Add("http://www.VelocityDB.com/"); // other database products listBoxPagesToAdd.Items.Add("https://foundationdb.com/"); listBoxPagesToAdd.Items.Add("http://www.oracle.com/us/products/database/index.html"); listBoxPagesToAdd.Items.Add("http://www-01.ibm.com/software/data/db2/"); listBoxPagesToAdd.Items.Add("http://www.versant.com/"); listBoxPagesToAdd.Items.Add("http://web.progress.com/en/objectstore/"); listBoxPagesToAdd.Items.Add("https://www.mongodb.org/"); listBoxPagesToAdd.Items.Add("http://cassandra.apache.org/"); listBoxPagesToAdd.Items.Add("http://www.sybase.com/"); listBoxPagesToAdd.Items.Add("http://www.mcobject.com/perst"); listBoxPagesToAdd.Items.Add("http://www.marklogic.com/what-is-marklogic/"); listBoxPagesToAdd.Items.Add("http://hamsterdb.com/"); listBoxPagesToAdd.Items.Add("http://www.firebirdsql.org/"); listBoxPagesToAdd.Items.Add("http://www.h2database.com/"); listBoxPagesToAdd.Items.Add("http://www.oracle.com/technology/products/berkeley-db"); listBoxPagesToAdd.Items.Add("http://www.scimore.com/"); listBoxPagesToAdd.Items.Add("http://www.stsdb.com/"); listBoxPagesToAdd.Items.Add("http://www.sqlite.org/about.html"); listBoxPagesToAdd.Items.Add("http://www.mysql.com/products/enterprise/techspec.html"); listBoxPagesToAdd.Items.Add("http://www.objectivity.com"); listBoxPagesToAdd.Items.Add("http://vistadb.net/"); listBoxPagesToAdd.Items.Add("http://www.google.com/search?q=object+database&sourceid=ie7&rls=com.microsoft:en-us:IE-SearchBox&ie=&oe="); } indexRoot.Persist(session, indexRoot); } else { indexRoot = (IndexRoot)session.Open(Oid.Encode(IndexRoot.PlaceInDatabase, 1, 1)); } if (indexRoot.repository.documentSet.Count > 0) { List <Document> docs = indexRoot.repository.documentSet.ToList <Document>().Take(50).ToList <Document>(); inDbListBox.ItemsSource = docs; } updateDataGrids(indexRoot); session.Commit(); //verify(); }
static void importEntireWikipedia() { const ushort btreeNodeSize = 10000; Console.WriteLine(DateTime.Now.ToString() + ", start importing Wikipedia text"); //System.Xml.Schema.XmlSchema docSchema; //using (System.Xml.XmlTextReader schemaReader = new System.Xml.XmlTextReader("c:\\export-0_5.xsd")) //{ // docSchema = System.Xml.Schema.XmlSchema.Read(schemaReader, ValidationCallBack); // } int docCount = 0; using (SessionNoServer session = new SessionNoServer(s_systemDir, 5000, false, false, CacheEnum.No)) // turn of page and object caching { Console.WriteLine("Running with databases in directory: " + session.SystemDirectory); //GCSettings.LatencyMode = GCLatencyMode.Batch;// try to keep the WeakIOptimizedPersistableReference objects around longer Placement documentPlacement = new Placement(Document.PlaceInDatabase, 1003, 1, 500, 1000, false, false, 1000, false); Placement contentPlacement = new Placement(Document.PlaceInDatabase, 1, 1, 500, UInt16.MaxValue, false, false, 1, false); XmlComment xmlComment; XmlElement xmlElement; XmlEntity xmlEntity; XmlText xmlText; XmlWhitespace xmlWhitespace; session.BeginUpdate(); File.Copy(s_licenseDbFile, System.IO.Path.Combine(session.SystemDirectory, "4.odb"), true); // register all database schema classes used by the application in advance to avoid lock conflict later in parallell indexing session.RegisterClass(typeof(Repository)); session.RegisterClass(typeof(IndexRoot)); session.RegisterClass(typeof(Document)); session.RegisterClass(typeof(Lexicon)); session.RegisterClass(typeof(DocumentText)); session.RegisterClass(typeof(Word)); session.RegisterClass(typeof(WordGlobal)); session.RegisterClass(typeof(WordHit)); session.RegisterClass(typeof(BTreeSet <Document>)); session.RegisterClass(typeof(OidShort)); session.RegisterClass(typeof(BTreeMap <Word, WordHit>)); session.RegisterClass(typeof(HashCodeComparer <Word>)); session.RegisterClass(typeof(BTreeSetOidShort <Word>)); session.RegisterClass(typeof(BTreeMapOidShort <Word, WordHit>)); Database db = session.OpenDatabase(IndexRoot.PlaceInDatabase, false, false); if (db != null) { outputSomeInfo(session); session.Abort(); return; } session.NewDatabase(IndexRoot.PlaceInDatabase, 0, "IndexRoot"); session.NewDatabase(Lexicon.PlaceInDatabase, 0, "Lexicon"); session.NewDatabase(Repository.PlaceInDatabase, 0, "Repository"); for (UInt32 i = 40; i <= 186; i++) { session.NewDatabase(i, 512, "Document"); // pre allocate 146 Document databases presized to 512MB each } //session.SetTraceDbActivity(Lexicon.PlaceInDatabase); //session.SetTraceAllDbActivity(); XmlDocument xmlDocument = new XmlDocument("enwiki-latest-pages-articles.xml"); IndexRoot indexRoot = new IndexRoot(btreeNodeSize, session); indexRoot.Persist(session, indexRoot, true); Document doc = null; bool titleElement = false; bool pageText = false; UInt32 currentDocumentDatabaseNum = documentPlacement.StartDatabaseNumber; using (FileStream fs = new FileStream(s_wikipediaXmlFile, FileMode.Open)) { //using (GZipStream zipStream = new GZipStream(fs, CompressionMode.Decompress)) // if input was a .gz file { using (System.Xml.XmlTextReader textReader = new System.Xml.XmlTextReader(fs)) { while (textReader.Read()) { System.Xml.XmlNodeType nodeType = textReader.NodeType; switch (nodeType) { case System.Xml.XmlNodeType.Attribute: break; case System.Xml.XmlNodeType.CDATA: break; case System.Xml.XmlNodeType.Comment: xmlComment = new XmlComment(textReader.Value, xmlDocument); break; case System.Xml.XmlNodeType.Document: break; case 
System.Xml.XmlNodeType.DocumentFragment: break; case System.Xml.XmlNodeType.DocumentType: break; case System.Xml.XmlNodeType.Element: xmlElement = new XmlElement(textReader.Prefix, textReader.LocalName, textReader.NamespaceURI, xmlDocument); if (textReader.LocalName == "title") { titleElement = true; } else if (textReader.LocalName == "text") { pageText = true; } break; case System.Xml.XmlNodeType.EndElement: if (textReader.LocalName == "title" && doc != null) { titleElement = false; } else if (textReader.LocalName == "text" && doc != null) { pageText = false; } break; case System.Xml.XmlNodeType.EndEntity: break; case System.Xml.XmlNodeType.Entity: xmlEntity = new XmlEntity(textReader.LocalName, xmlDocument); break; case System.Xml.XmlNodeType.EntityReference: break; case System.Xml.XmlNodeType.None: break; case System.Xml.XmlNodeType.Notation: break; case System.Xml.XmlNodeType.ProcessingInstruction: break; case System.Xml.XmlNodeType.SignificantWhitespace: break; case System.Xml.XmlNodeType.Text: xmlText = new XmlText(textReader.Value, xmlDocument); if (titleElement) { doc = new Document(textReader.Value, indexRoot, session); doc.Persist(documentPlacement, session, true); if (doc.DatabaseNumber != currentDocumentDatabaseNum) { session.FlushUpdates(session.OpenDatabase(currentDocumentDatabaseNum)); Console.WriteLine("Database: " + currentDocumentDatabaseNum + " is completed, done importing article " + docCount + " number of lines: " + textReader.LineNumber); currentDocumentDatabaseNum = doc.DatabaseNumber; } //doc.Page.Database.Name = doc.Name; } else if (doc != null && pageText) { #if DEBUGx Console.WriteLine(doc.Name + " line: " + textReader.LineNumber); #endif //if (textReader.LineNumber > 1000000) //{ // session.Commit(); // return; //} DocumentText content = new DocumentText(textReader.Value, doc); if (doc.DatabaseNumber != contentPlacement.TryDatabaseNumber) { contentPlacement = new Placement(doc.DatabaseNumber, (ushort)contentPlacement.StartPageNumber, 1, contentPlacement.MaxObjectsPerPage, contentPlacement.MaxPagesPerDatabase, false, false, 1, false); } content.Persist(contentPlacement, session, false); Debug.Assert(content.DatabaseNumber == doc.DatabaseNumber); doc.Content = content; indexRoot.repository.documentSet.AddFast(doc); if (++docCount % 1000000 == 0) { //session.Commit(false); // skip recovery check, we do it in BeginUpdate which is enough Console.WriteLine("Done importing article " + docCount + " number of lines: " + textReader.LineNumber); //session.BeginUpdate(); } } break; case System.Xml.XmlNodeType.Whitespace: xmlWhitespace = new XmlWhitespace(textReader.Value, xmlDocument); break; case System.Xml.XmlNodeType.XmlDeclaration: break; } ; } Console.WriteLine("Finished importing article " + docCount + " number of lines: " + textReader.LineNumber); } } } session.Commit(); } Console.WriteLine(DateTime.Now.ToString() + ", done importing Wikipedia text"); }
void updateDataGrids(IndexRoot indexRoot, int indexOfRemoved = -1) { if (indexRoot == null) { return; } if (indexRoot.lexicon.WordSet.Count == 0) { return; } stackPanel.IsEnabled = false; bool aRefresh = stackPanel.Children.Count > 0; if (indexOfRemoved >= 0 && aRefresh) { stackPanel.Children.RemoveAt(0); } else if (stackPanel.Children.Count > 0) { stackPanel.Children.Clear(); } DataGrid dataGrid = new DataGrid(); dataGrid.AutoGenerateColumns = true; dataGrid.MaxColumnWidth = 150; dataGridList.Add(dataGrid); DataTable table = new DataTable("Word Count"); DataColumn wordColumn = new DataColumn("Words (all pages)", Type.GetType("System.String")); DataColumn countColumn = new DataColumn("Count", Type.GetType("System.UInt32")); table.Columns.Add(wordColumn); table.Columns.Add(countColumn); DataRow newRow; int pageIndex = 0; int min = 3; int.TryParse(wordMinCt.Text, out min); foreach (Word word in indexRoot.lexicon.WordSet) { if (word.GlobalCount >= min) { newRow = table.NewRow(); newRow[0] = word.aWord; newRow[1] = word.GlobalCount; table.Rows.Add(newRow); } } DataView dataView = new DataView(table); dataView.Sort = "Count desc"; dataGrid.ItemsSource = dataView; stackPanel.Children.Insert(pageIndex++, dataGrid); if (indexOfRemoved >= 0 && aRefresh) { stackPanel.Children.RemoveAt(indexOfRemoved + 1); } else { List <Document> docs = indexRoot.repository.documentSet.ToList <Document>().ToList <Document>(); foreach (Document page in docs) { DataTable pageTable = new DataTable(); dataTableList.Add(pageTable); string pageName = page.url.TrimEnd('/'); int index = pageName.IndexOf("//"); if (index >= 0) { pageName = pageName.Remove(0, index + 2); } index = pageName.IndexOf("www."); if (index >= 0) { pageName = pageName.Remove(0, index + 4); } pageName = pageName.Replace('.', ' '); pageName = pageName.Replace('/', ' '); DataColumn wordColumnPage = new DataColumn(pageName, Type.GetType("System.String")); DataColumn countColumnPage = new DataColumn("Count", Type.GetType("System.Int32")); pageTable.Columns.Add(wordColumnPage); pageTable.Columns.Add(countColumnPage); foreach (KeyValuePair <Word, WordHit> pair in page.WordHit) { if ((int)pair.Value.Count >= min) { newRow = pageTable.NewRow(); string aString = pair.Key.aWord; newRow.SetField <string>(wordColumnPage, aString); newRow.SetField <int>(countColumnPage, (int)pair.Value.Count); //wc.Add(new WordCount(aString, (uint) hit.wordPositionSet.Count)); pageTable.Rows.Add(newRow); } } dataGrid = new DataGrid(); dataGrid.AutoGenerateColumns = true; dataGrid.MaxColumnWidth = 150; dataGridList.Add(dataGrid); dataView = new DataView(pageTable); dataView.Sort = "Count desc"; dataGrid.ItemsSource = dataView; stackPanel.Children.Insert(pageIndex++, dataGrid); } } stackPanel.IsEnabled = true; }
public FileRecord(byte[] rawBytes, int offset) { Offset = offset; var sig = BitConverter.ToInt32(rawBytes, 0); if ((sig != _fileSig) && (sig != _baadSig) && (sig != 0x0)) { Logger.Fatal($"Invalid signature! 0x{sig:X}"); return; //throw new Exception("Invalid signature!"); } if (sig == _baadSig) { Logger.Warn($"Bad signature at offset 0x{offset:X}"); return; } Attributes = new List <Attribute>(); FixupOffset = BitConverter.ToInt16(rawBytes, 2); FixupEntryCount = BitConverter.ToInt16(rawBytes, 4); LogSequenceNumber = BitConverter.ToInt64(rawBytes, 0x8); SequenceNumber = BitConverter.ToInt16(rawBytes, 0x10); ReferenceCount = BitConverter.ToInt16(rawBytes, 0x12); FirstAttributeOffset = BitConverter.ToInt16(rawBytes, 0x14); EntryFlags = (EntryFlag)BitConverter.ToInt16(rawBytes, 0x16); Logger.Trace($"Entry flags: {EntryFlags}"); ActualRecordSize = BitConverter.ToInt32(rawBytes, 0x18); AllocatedRecordSize = BitConverter.ToInt32(rawBytes, 0x1c); var entryBytes = new byte[8]; Buffer.BlockCopy(rawBytes, 0x20, entryBytes, 0, 8); MFTRecordToBaseRecord = new MftEntryInfo(entryBytes); FirstAvailableAttribueId = BitConverter.ToInt16(rawBytes, 0x28); EntryNumber = BitConverter.ToInt32(rawBytes, 0x2c); var fixupExpectedBytes = new byte[2]; var fixupActual1 = new byte[2]; var fixupActual2 = new byte[2]; Buffer.BlockCopy(rawBytes, 0x30, fixupExpectedBytes, 0, 2); Buffer.BlockCopy(rawBytes, 0x32, fixupActual1, 0, 2); Buffer.BlockCopy(rawBytes, 0x34, fixupActual2, 0, 2); //verify this record looks ok based on fixup bytes //0x1FE and 0x3fe var expectedFixupVal = BitConverter.ToInt16(fixupExpectedBytes, 0); var x1FeValue = BitConverter.ToInt16(rawBytes, 0x1FE); var x3FeValue = BitConverter.ToInt16(rawBytes, 0x3FE); if ((x1FeValue != expectedFixupVal) && ((EntryFlags & EntryFlag.FileRecordSegmentInUse) == EntryFlag.FileRecordSegmentInUse)) { Logger.Warn( $"FILE record at offset 0x{offset:X}! Fixup values do not match at 0x1FE. Expected: {expectedFixupVal}, actual: {x1FeValue}, EntryFlags: {EntryFlags}"); } if ((x3FeValue != expectedFixupVal) && ((EntryFlags & EntryFlag.FileRecordSegmentInUse) == EntryFlag.FileRecordSegmentInUse)) { Logger.Warn( $"FILE record at offset 0x{offset:X}! Fixup values do not match at 0x3FE. 
Expected: {expectedFixupVal}, actual: {x3FeValue}, EntryFlags: {EntryFlags}"); } //header is done, replace fixup bytes with actual bytes //0x1fe and 0x3fe should contain fixup bytes Buffer.BlockCopy(fixupActual1, 0, rawBytes, 0x1fe, 2); Buffer.BlockCopy(fixupActual2, 0, rawBytes, 0x3fe, 2); //start attribute processing at FirstAttributeOffset var index = (int)FirstAttributeOffset; while (index < ActualRecordSize) { var attrType = BitConverter.ToInt32(rawBytes, index); var attrSize = BitConverter.ToInt32(rawBytes, index + 4); // Logger.Trace( // $"ActualRecordSize: {ActualRecordSize} attrType: 0x{attrType:X}, size: {attrSize}, index: {index}, offset: 0x{offset:x}, i+o: 0x{index + offset:X}"); if ((attrSize == 0) || (attrType == -1)) { index += 8; //skip -1 type and 0 size if (EntryFlags == 0) //this is a free record { break; } continue; } var rawAttr = new byte[attrSize]; Buffer.BlockCopy(rawBytes, index, rawAttr, 0, attrSize); switch ((AttributeType)attrType) { case AttributeType.StandardInformation: var si = new StandardInfo(rawAttr); Attributes.Add(si); SILastAccessedOn = si.LastAccessedOn; SICreatedOn = si.CreatedOn; SIRecordModifiedOn = si.RecordModifiedOn; SIContentModifiedOn = si.ContentModifiedOn; break; case AttributeType.FileName: var fi = new FileName(rawAttr); Attributes.Add(fi); if ((fi.FileInfo.NameType & NameTypes.Windows) == NameTypes.Windows) { FName = fi.FileInfo.FileName; } //if (fi.FileInfo.LastAccessedOn.UtcDateTime != SILastAccessedOn.UtcDateTime) //{ FNLastAccessedOn = fi.FileInfo.LastAccessedOn; //} //if (fi.FileInfo.CreatedOn.UtcDateTime != SICreatedOn.UtcDateTime) //{ FNCreatedOn = fi.FileInfo.CreatedOn; //} //if (fi.FileInfo.RecordModifiedOn.UtcDateTime != SIRecordModifiedOn.UtcDateTime) //{ FNRecordModifiedOn = fi.FileInfo.RecordModifiedOn; //} //if (fi.FileInfo.ContentModifiedOn.UtcDateTime != SIContentModifiedOn.UtcDateTime) //{ FNContentModifiedOn = fi.FileInfo.ContentModifiedOn; //} break; case AttributeType.Data: var data = new Data(rawAttr); Attributes.Add(data); break; case AttributeType.IndexAllocation: var ia = new IndexAllocation(rawAttr); Attributes.Add(ia); break; case AttributeType.IndexRoot: var ir = new IndexRoot(rawAttr); Attributes.Add(ir); break; case AttributeType.Bitmap: var bm = new Bitmap(rawAttr); Attributes.Add(bm); break; case AttributeType.VolumeVersionObjectId: var oi = new ObjectId(rawAttr); Attributes.Add(oi); break; case AttributeType.SecurityDescriptor: var sd = new SecurityDescriptor(rawAttr); Attributes.Add(sd); break; case AttributeType.VolumeName: var vn = new VolumeName(rawAttr); Attributes.Add(vn); break; case AttributeType.VolumeInformation: var vi = new VolumeInformation(rawAttr); Attributes.Add(vi); break; case AttributeType.LoggedUtilityStream: var lus = new LoggedUtilityStream(rawAttr); Attributes.Add(lus); break; case AttributeType.ReparsePoint: var rp = new ReparsePoint(rawAttr); Attributes.Add(rp); break; case AttributeType.AttributeList: var al = new AttributeList(rawAttr); Attributes.Add(al); break; case AttributeType.Ea: //TODO Finish this var ea = new ExtendedAttribute(rawAttr); Attributes.Add(ea); break; case AttributeType.EaInformation: var eai = new ExtendedAttributeInformation(rawAttr); Attributes.Add(eai); break; default: Logger.Warn($"Unhandled attribute type! Add me: {(AttributeType) attrType}"); throw new Exception($"Add me: {(AttributeType) attrType}"); break; } index += attrSize; } SlackStartOffset = index; //rest is slack. handle here? Logger.Trace($"Slack starts at {index} i+o: 0x{index + offset:X}"); }