/// <summary>
/// Constructs a <see cref="CacheyBashi"/> for <paramref name="outDir"/>/<paramref name="dbName"/>
/// (passing <c>true</c> as the constructor's final argument), writes <paramref name="data"/>
/// into it through <c>CbWriter.Write</c> and returns it.
/// </summary>
/// <param name="outDir">Directory handed to the constructor; it is created first when it does not exist.</param>
/// <param name="dbName">Database name handed to the constructor.</param>
/// <param name="data">Key/value pairs to write, keys given as raw byte arrays.</param>
/// <param name="keyLength">Key length forwarded to both the constructor and the writer.</param>
/// <param name="indexKeyLength">Index key length forwarded to the constructor (defaults to 2).</param>
/// <returns>The instance the data was written to.</returns>
public static CacheyBashi Create(string outDir, string dbName, IEnumerable<KeyValuePair<byte[], byte[]>> data, ushort keyLength, byte indexKeyLength = 2)
{
    // Mirror the HashBin overload: make sure the output directory exists before
    // the database is constructed, so both overloads accept the same inputs.
    if (!Directory.Exists(outDir))
    {
        Directory.CreateDirectory(outDir);
    }

    var cb = new CacheyBashi(outDir, dbName, keyLength, indexKeyLength, true);
    CbWriter.Write(cb, keyLength, data);
    return cb;
}
/// <summary>
/// Ensures <paramref name="outDir"/> exists, constructs a <see cref="CacheyBashi"/> for
/// <paramref name="outDir"/>/<paramref name="dbName"/> (passing <c>true</c> as the constructor's
/// final argument), writes <paramref name="data"/> into it through <c>CbWriter.Write</c>
/// and returns it.
/// </summary>
/// <param name="outDir">Directory handed to the constructor; created when missing.</param>
/// <param name="dbName">Database name handed to the constructor.</param>
/// <param name="data">Key/value pairs to write, keys given as <c>HashBin</c> instances.</param>
/// <param name="keyLength">Key length forwarded to both the constructor and the writer.</param>
/// <param name="indexKeyLength">Index key length forwarded to the constructor (defaults to 2).</param>
/// <returns>The instance the data was written to.</returns>
public static CacheyBashi Create(string outDir, string dbName, IEnumerable<KeyValuePair<HashBin, byte[]>> data, ushort keyLength, byte indexKeyLength = 2)
{
    var outputDirExists = Directory.Exists(outDir);
    if (outputDirExists == false)
    {
        Directory.CreateDirectory(outDir);
    }

    var database = new CacheyBashi(outDir, dbName, keyLength, indexKeyLength, true);
    CbWriter.Write(database, keyLength, data);

    return database;
}
/// <summary>
/// Constructs a <see cref="CacheyBashi"/> over <paramref name="dir"/>/<paramref name="dbName"/>
/// using the four-argument constructor; nothing is written by this method itself.
/// </summary>
/// <param name="dir">Directory handed to the constructor.</param>
/// <param name="dbName">Database name handed to the constructor.</param>
/// <param name="keyLength">Key length handed to the constructor.</param>
/// <param name="indexKeyLength">Index key length handed to the constructor.</param>
/// <returns>The newly constructed instance.</returns>
public static CacheyBashi Load(string dir, string dbName, ushort keyLength, byte indexKeyLength)
    => new CacheyBashi(dir, dbName, keyLength, indexKeyLength);
/// <summary>
/// Merges the batch files into the key file and the address file of <paramref name="cb"/>,
/// records an index hint for every run of keys that share the same index slot, writes the
/// index to disk and finally deletes the batch files.
/// </summary>
/// <param name="batchFiles">Paths of the batch files to merge; each one is deleted at the end.</param>
/// <param name="keyCount">Value written to the key file before any key.</param>
/// <param name="cb">Database whose key, address and index stores are written.</param>
/// <param name="keyLength">Length in bytes of every key; used to compute key-range end addresses.</param>
/// <param name="outFile">Not used by this method; kept so existing callers keep compiling.</param>
static void SortAndWrite(List<string> batchFiles, ulong keyCount, CacheyBashi cb, ushort keyLength, string outFile)
{
    using var streams = new StreamCollection();
    var batches = new List<CurrentBatchInfo>();
    foreach (var batchFile in batchFiles)
    {
        var stream = new FileStream(batchFile, FileMode.Open, FileAccess.Read, FileShare.ReadWrite);
        streams.Streams.Add(stream);
        batches.Add(new CurrentBatchInfo(keyLength, stream));
    }

    // The writers are deliberately not disposed: disposing a BinaryWriter closes the
    // stream it wraps, and these streams are exposed by cb.CbKey rather than opened here.
    var cbKeyFileStream = cb.CbKey.FileStream;
    var cbKeyWriter = new BinaryWriter(cbKeyFileStream);
    cbKeyWriter.Write(keyCount);

    var cbKeyAddrsWriter = new BinaryWriter(cb.CbKey.AddrsFileStream);

    var currentKeyIndex = -1; // -1 means "no key seen yet"
    var currentKeyRangeStartAddr = cbKeyFileStream.Position;
    HashBin lastHash = null;

    foreach (var nextItem in SortAndDedupe(batches))
    {
        // update the index if we've reached the end of a key range
        var keyIndex = cb.CbIndex.GetKeyIndexFromKey(nextItem.CurrentHashBin);
        if (currentKeyIndex == -1)
        {
            currentKeyIndex = keyIndex;
        }
        else if (currentKeyIndex != keyIndex)
        {
            // The range ends at the address of the last key written, i.e. one key
            // before the current write position.
            var end = cbKeyFileStream.Position - keyLength;
            cb.CbIndex.SetHintForKey(lastHash.Hash, new KeyHint() { StartAddr = (ulong)currentKeyRangeStartAddr, EndAddr = (ulong)end });
            currentKeyRangeStartAddr = cbKeyFileStream.Position;
            currentKeyIndex = keyIndex;
        }

        // write the key to the key file
        cbKeyFileStream.Write(nextItem.CurrentHashBin.Hash, 0, nextItem.CurrentHashBin.Length);

        // write the data address and length to the addr file
        cbKeyAddrsWriter.Write(nextItem.CurrentAddr.addr);
        cbKeyAddrsWriter.Write(nextItem.CurrentAddr.len);

        lastHash = nextItem.CurrentHashBin;
    }

    // Close the final key range. When the merge produced no keys at all there is no
    // last key, so there is no hint to record (previously this dereferenced null).
    if (!(lastHash is null))
    {
        var end = cbKeyFileStream.Position - keyLength;
        cb.CbIndex.SetHintForKey(lastHash.Hash, new KeyHint() { StartAddr = (ulong)currentKeyRangeStartAddr, EndAddr = (ulong)end });
    }

    // write the index out to disk
    cb.CbIndex.WriteToDisk();

    // tell cbKey to update stats
    cb.CbKey.PostWriteUpdate();

    // Release the batch file handles before deleting the files. The using declaration
    // disposes the collection again at scope exit, exactly as the original code did.
    streams.Dispose();
    foreach (var batchFile in batchFiles)
    {
        File.Delete(batchFile);
    }
}
/// <summary>
/// Writes every value of <paramref name="data"/> to the data store of <paramref name="cb"/>
/// and every key, together with the address and length of its value, to temporary batch
/// files via <c>WriteBatch</c>; the batch files are then merged by <c>SortAndWrite</c>.
/// </summary>
/// <param name="cb">Database to write into.</param>
/// <param name="keyLength">Length every key must have.</param>
/// <param name="data">Key/value pairs to write; enumerated exactly once.</param>
/// <exception cref="ArgumentException">A key's length differs from <paramref name="keyLength"/>.</exception>
/// <exception cref="AggregateException">A background batch write failed.</exception>
public static void Write(CacheyBashi cb, ushort keyLength, IEnumerable<KeyValuePair<HashBin, byte[]>> data)
{
    // Number of keys buffered in memory before they are handed to WriteBatch.
    const int BatchSize = 100000;

    var batchFiles = new List<string>();

    // Batch files are named "<dir>/<dbName>.keybatch_<n>". The name is built by
    // concatenation rather than string.Format so that '{' or '}' in the path
    // cannot be misread as format placeholders.
    var batchNamePrefix = Path.Combine(cb.Dir, cb.DbName) + ".keybatch_";

    // Two buffers: one is filled here while the other is written out in the background.
    var keyDataArray1 = new KeyData[BatchSize];
    var keyDataArray2 = new KeyData[BatchSize];
    var activeKeyDataArray = keyDataArray1;
    var index = 0;

    // Running offset into the data file. Kept as ulong (the type it is stored as)
    // so it does not overflow once more than 2 GiB of values have been written.
    ulong datFileIndex = 0;
    ulong keyCount = 0;
    Task batchWriteTask = null;

    foreach (var kvp in data)
    {
        if (kvp.Key.Length != keyLength)
        {
            throw new ArgumentException($"All keys must be of the provided keyLength: {keyLength}");
        }

        // need to copy the key here in case someone is re-using the buffer
        activeKeyDataArray[index].Key = kvp.Key.Clone();
        activeKeyDataArray[index].DataAddr.addr = datFileIndex;
        activeKeyDataArray[index].DataAddr.len = (ulong)kvp.Value.Length;

        // write the value out now so it does not have to be kept in memory
        cb.CbData.UnsafeWrite(kvp.Value);
        datFileIndex += (ulong)kvp.Value.Length;

        keyCount++;
        index++;

        if (index == activeKeyDataArray.Length)
        {
            // The buffer is full: wait for the previous batch write, which is still
            // using the other buffer, then hand this buffer to a new background write.
            var batchFile = batchNamePrefix + batchFiles.Count;
            batchWriteTask?.Wait();
            batchWriteTask?.Dispose();

            var array = activeKeyDataArray;
            batchWriteTask = Task.Run(() => { WriteBatch(array, batchFile, array.Length); });
            batchFiles.Add(batchFile);

            // swap the active buffer
            activeKeyDataArray = ReferenceEquals(activeKeyDataArray, keyDataArray1) ? keyDataArray2 : keyDataArray1;
            index = 0;
        }
    }

    // Always wait, even if the task has already finished: Wait is what surfaces
    // a failure of the last background write instead of silently dropping it.
    batchWriteTask?.Wait();

    if (index > 0)
    {
        // write the remaining keys to the final, partially filled batch
        var batchFile = batchNamePrefix + batchFiles.Count;
        WriteBatch(activeKeyDataArray, batchFile, index);
        batchFiles.Add(batchFile);
    }

    // Every case, including a single batch, goes through SortAndWrite: it is the path
    // that records the index hints, calls PostWriteUpdate and deletes the batch files.
    // The former "single batch" shortcut was only reachable with two batch files and
    // wrote one whole buffer (including unfilled slots) without any of those steps.
    SortAndWrite(batchFiles, keyCount, cb, keyLength, cb.KeyFile);
}
/// <summary>
/// Overload for raw byte-array keys: lazily converts each key with <c>ToHashBin()</c>
/// and forwards the resulting sequence to the <c>HashBin</c>-keyed <c>Write</c> overload.
/// </summary>
/// <param name="cb">Database to write into.</param>
/// <param name="keyLength">Key length forwarded to the <c>HashBin</c> overload.</param>
/// <param name="data">Key/value pairs to write, keys given as raw byte arrays.</param>
public static void Write(CacheyBashi cb, ushort keyLength, IEnumerable<KeyValuePair<byte[], byte[]>> data)
{
    var hashedPairs =
        from pair in data
        select new KeyValuePair<HashBin, byte[]>(pair.Key.ToHashBin(), pair.Value);

    Write(cb, keyLength, hashedPairs);
}