// Decodes the compressed segment index of a version document into
// (last timestamp, last row offset) pairs, one per stored segment.
public List<Tuple<DateTime, long>> GetSegmentsIndex(BsonDocument version)
{
    var result = new List<Tuple<DateTime, long>>();
    var seg_ind_buf = new ByteBuffer();
    seg_ind_buf.AppendDecompress(version["segment_index"].AsByteArray);
    // Each index record is 16 bytes: an int64 DateTime64 timestamp followed by
    // the 0-based int64 offset of the last row covered by that segment.
    for (int ofs = 0; ofs < seg_ind_buf.Length; ofs += 16)
    {
        var t = seg_ind_buf.Read<long>(ofs);
        var rec = new Tuple<DateTime, long>(DateTime64.ToDateTime(t), seg_ind_buf.Read<long>(ofs + 8));
        result.Add(rec);
    }
    return result;
}
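// A minimal usage sketch, not part of the original API: locate the first
// segment whose last timestamp is at or after `when`. The helper name is
// hypothetical; it assumes the pairs returned by GetSegmentsIndex are ordered
// ascending by timestamp, which holds because AppendAsync appends one record
// per segment in write order.
public long? FindRowOffsetFor(BsonDocument version, DateTime when)
{
    foreach (var rec in GetSegmentsIndex(version))
    {
        if (rec.Item1 >= when)
            return rec.Item2; // 0-based offset of that segment's last row
    }
    return null; // `when` lies beyond the last written timestamp
}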
// Appends a DataFrame to the named symbol, writing one compressed segment and
// an updated version document. With chunksize > 0 the frame is split and
// appended chunk by chunk; with skipAlreadyWrittenDates, rows at or before the
// last stored index date are dropped instead of written twice.
public async Task<BsonDocument> AppendAsync(string symbol, DataFrame df, int chunksize = 0, bool skipAlreadyWrittenDates = true)
{
    if (df.Index == null)
        throw new ArgumentException("Please specify DataFrame.Index column before saving");

    // Chunked mode: recurse per chunk and return the version of the last one.
    if (chunksize > 0 && df.Rows.Count > chunksize)
    {
        var rng = Range.R(0, chunksize);
        BsonDocument ver = null;
        int chunkscount = 0;
        while (rng.First < df.Rows.Count)
        {
            var chunk = df[rng];
            ver = await AppendAsync(symbol, chunk);
            rng = Range.R(rng.First + chunksize, rng.Last + chunksize);
            chunkscount++;
        }
        return ver;
    }

    int attemptNo = 0;
    for (;;)
    {
        var previous_version = await ReadVersionAsync(symbol);
        var version = await GetNewVersion(symbol, previous_version);

        var dtype = version.GetValue("dtype", "").ToString();
        Log.Debug("loaded dtype {0}", dtype);
        if (dtype != "" && df.DType.ToString() != dtype)
        {
            // dtype changed: the old data would have to be reloaded and repacked.
            throw new NotImplementedException("old dtype {0}, new dtype {1}: not implemented".Args(dtype, df.DType));
        }

        var sdt = df.DType.ToString();
        version["metadata"] = df.Metadata;
        version["dtype"] = sdt;
        Log.Debug("saved dtype {0}", sdt);
        version["shape"] = new BsonArray { -1 };
        version["dtype_metadata"] = new BsonDocument {
            { "index", new BsonArray { df.Index.Name } },
            { "columns", new BsonArray(df.Columns.Select(c => c.Name).ToList()) }
        };
        version["type"] = "pandasdf";
        version["segment_count"] = previous_version != null ? previous_version["segment_count"].AsInt32 + 1 : 1;
        version["append_count"] = previous_version != null ? previous_version["append_count"].AsInt32 + 1 : 0;

        var seg_ind_buf = new ByteBuffer();
        int segment_offset = 0;
        bool is_date_time_index = DType.DateTime64.ToString().Equals(df.Index.DType.ToString());

        if (previous_version != null)
        {
            var seg_ind = previous_version["segment_index"].AsByteArray;
            seg_ind_buf.AppendDecompress(seg_ind);
            segment_offset = previous_version["up_to"].AsInt32;

            if (is_date_time_index && skipAlreadyWrittenDates)
            {
                // Read the last indexed timestamp and keep only rows strictly
                // greater than it, so already-written dates are not duplicated.
                long date = seg_ind_buf.Read<long>(seg_ind_buf.Length - 16);
                DateTime dt = DateTime64.ToDateTime(date);
                var range = df.Index.AsDateTime().RangeOf(dt, 0, df.FilledCount - 1, Location.GT);
                if (range.Last <= range.First)
                {
                    Log.Information($"Skipped DataFrame.Append because date {dt} already written for {symbol}");
                    return null; // everything was already written
                }
                else if (range.First != 0)
                {
                    Log.Information($"Skipped DataFrame.Append initial {range.First} elements date {dt} already written for {symbol}");
                }
                df = df[range];
            }
        }

        var up_to = segment_offset + df.Rows.Count;
        var buf = new ByteBuffer();

        // Extend the segment index with the last datetime plus the 0-based
        // int64 offset of the last appended record.
        if (is_date_time_index && df.Rows.Count > 0)
        {
            var date = df.Index.AsDateTime().Source[-1];
            seg_ind_buf.Append<long>(date);
            seg_ind_buf.Append<long>(up_to - 1);
        }

        var seg_ind_buf2 = new ByteBuffer();
        seg_ind_buf2.AppendCompress(seg_ind_buf.GetBytes());
        version["segment_index"] = seg_ind_buf2.GetBytes();
        version["up_to"] = up_to;

        var bin = df.ToBuffer();
        buf.AppendCompress(bin);

        var sha1 = SHA1.Create();
        var sha = version.GetValue("sha", null);
        if (sha == null)
        {
            byte[] hashBytes = sha1.ComputeHash(bin);
            version["sha"] = new BsonBinaryData(hashBytes);
        }

#if false
        // Round-trip self check: decompress and compare with the original buffer.
        var buf2 = new ByteBuffer();
        buf2.AppendDecompress(buf.GetBytes());
        var bin2 = buf2.GetBytes();
        if (!bin.SequenceEqual(bin2))
            throw new InvalidOperationException();
        var df2 = DataFrame.FromBuffer(bin2, df.DType, df.Rows.Count);
#endif

        var segment = new BsonDocument {
            { "symbol", symbol },
            { "data", new BsonBinaryData(buf.GetBytes()) },
            { "compressed", true },
            { "segment", segment_offset + df.Rows.Count - 1 },
            { "parent", new BsonArray { version["_id"] } },
        };

        // Hash the symbol plus every segment field (keys in descending order).
        var hash = new ByteBuffer();
        hash.Append(Encoding.ASCII.GetBytes(symbol));
        foreach (var key in segment.Names.OrderByDescending(x => x))
        {
            var value = segment.GetValue(key);
            if (value is BsonBinaryData)
            {
                hash.Append(value.AsByteArray);
            }
            else
            {
                var str = value.ToString();
                hash.Append(Encoding.ASCII.GetBytes(str));
            }
        }
        segment["sha"] = sha1.ComputeHash(hash.GetBytes());

        try
        {
            await _versions.ReplaceOneAsync(BF.Eq("symbol", symbol), version, Upsert);
        }
        catch (MongoWriteException)
        {
            // Another writer raced us on the version document; retry with a fresh version.
            Log.Information("Retrying append symbol {symbol}, attempt {attemptNo}", symbol, attemptNo++);
            continue;
        }
        await _segments.InsertOneAsync(segment);
        return version;
    }
}