Beispiel #1
0
        public List <Tuple <DateTime, long> > GetSegmentsIndex(BsonDocument version)
        {
            var result = new List <Tuple <DateTime, long> >();

            var seg_ind_buf = new ByteBuffer();

            seg_ind_buf.AppendDecompress(version["segment_index"].AsByteArray);
            for (int ofs = 0; ofs < seg_ind_buf.Length; ofs += 16)
            {
                var t   = seg_ind_buf.Read <long>(ofs);
                var rec = new Tuple <DateTime, long>(DateTime64.ToDateTime(t), seg_ind_buf.Read <long>(ofs + 8));
                result.Add(rec);
            }
            return(result);
        }
Beispiel #2
0
        public async Task <BsonDocument> AppendAsync(string symbol, DataFrame df, int chunksize = 0, bool skipAlreadyWrittenDates = true)
        {
            if (df.Index == null)
            {
                throw new ArgumentException("Please specify DataFrame.Index column before saving");
            }

            if (chunksize > 0 && df.Rows.Count > chunksize)
            {
                var          rng         = Range.R(0, chunksize);
                BsonDocument ver         = null;
                int          chunkscount = 0;
                while (rng.First < df.Rows.Count)
                {
                    var chunk = df[rng];
                    ver = await AppendAsync(symbol, chunk);

                    rng = Range.R(rng.First + chunksize, rng.Last + chunksize);
                    chunkscount++;
                }
                return(ver);
            }
            int attemptNo = 0;

            for (;;)
            {
                var previous_version = await ReadVersionAsync(symbol);

                var version = await GetNewVersion(symbol, previous_version);

                /*var previous_version = await (_versions.AsQueryable ()
                 *  .Where (v => v ["symbol"] == symbol && v ["version"] < version ["version"])
                 *  .OrderByDescending (v => v ["version"]) as IAsyncCursorSource<BsonDocument>)
                 *  .FirstOrDefaultAsync ();*/

                var dtype = version.GetValue("dtype", "").ToString();
                Log.Debug("loaded dtype {0}", dtype);
                if (dtype != "" && df.DType.ToString() != dtype)
                {
                    // dtype changed. need reload old data and repack it.
                    throw new NotImplementedException("old dtype {0}, new dtype {1}: not implemented".Args(dtype, df.DType));
                }

                var sdt = df.DType.ToString();

                version["metadata"] = df.Metadata;

                version["dtype"] = sdt;
                Log.Debug("saved dtype {0}", sdt);

                version["shape"] = new BsonArray {
                    { -1 }
                };
                version["dtype_metadata"] = new BsonDocument {
                    { "index", new BsonArray {
                          { df.Index.Name }
                      } },
                    { "columns", new BsonArray(df.Columns.Select(c => c.Name).ToList()) }
                };
                version["type"]          = "pandasdf";
                version["segment_count"] = previous_version != null ? previous_version["segment_count"].AsInt32 + 1 : 1;
                version["append_count"]  = previous_version != null ? previous_version["append_count"].AsInt32 + 1 : 0;

                var  seg_ind_buf        = new ByteBuffer();
                int  segment_offset     = 0;
                bool is_date_time_index = DType.DateTime64.ToString().Equals(df.Index.DType.ToString());

                //version ["base_sha"] = version ["sha"];
                if (previous_version != null)
                {
                    var seg_ind = previous_version["segment_index"].AsByteArray;
                    seg_ind_buf.AppendDecompress(seg_ind);
                    segment_offset = previous_version["up_to"].AsInt32;
                    if (is_date_time_index && skipAlreadyWrittenDates)
                    {
                        long     date  = seg_ind_buf.Read <long>(seg_ind_buf.Length - 16);
                        DateTime dt    = DateTime64.ToDateTime(date);
                        var      range = df.Index.AsDateTime().RangeOf(dt, 0, df.FilledCount - 1, Location.GT);
                        if (range.Last <= range.First)
                        {
                            Log.Information($"Skipped DataFrame.Append because date {dt} already written for {symbol}");
                            return(null); // Hey all was skipped
                        }
                        else if (range.First != 0)
                        {
                            Log.Information($"Skipped DataFrame.Append initial {range.First} elements date {dt} already written for {symbol}");
                        }
                        df = df[range];
                    }
                }


                var up_to = segment_offset + df.Rows.Count;
                var buf   = new ByteBuffer();

                // add index that is last datetime + 0-based int64 index of last appended record like (segment_count-1)
                if (is_date_time_index && df.Rows.Count > 0)
                {
                    var date = df.Index.AsDateTime().Source[-1];
                    seg_ind_buf.Append <long>(date);
                    seg_ind_buf.Append <long>(up_to - 1);
                }

                var seg_ind_buf2 = new ByteBuffer();
                seg_ind_buf2.AppendCompress(seg_ind_buf.GetBytes());
                version["segment_index"] = seg_ind_buf2.GetBytes();

                version["up_to"] = up_to;
                var bin = df.ToBuffer();
                buf.AppendCompress(bin);

                var sha1 = SHA1.Create();

                var sha = version.GetValue("sha", null);
                if (sha == null)
                {
                    byte[] hashBytes = sha1.ComputeHash(bin);
                    version["sha"] = new BsonBinaryData(hashBytes);
                }

#if false
                var buf2 = new ByteBuffer();
                buf2.AppendDecompress(buf.GetBytes());
                var bin2 = buf2.GetBytes();
                if (!bin.SequenceEqual(bin2))
                {
                    throw new InvalidOperationException();
                }
                var df2 = DataFrame.FromBuffer(bin2, df.DType, df.Rows.Count);
#endif
                var segment = new BsonDocument {
                    { "symbol", symbol },
                    { "data", new BsonBinaryData(buf.GetBytes()) },
                    { "compressed", true },
                    { "segment", segment_offset + df.Rows.Count - 1 },
                    { "parent", new BsonArray {
                          version["_id"]
                      } },
                };

                var hash = new ByteBuffer();
                hash.Append(Encoding.ASCII.GetBytes(symbol));
                foreach (var key in segment.Names.OrderByDescending(x => x))
                {
                    var value = segment.GetValue(key);
                    if (value is BsonBinaryData)
                    {
                        hash.Append(value.AsByteArray);
                    }
                    else
                    {
                        var str = value.ToString();
                        hash.Append(Encoding.ASCII.GetBytes(str));
                    }
                }
                segment["sha"] = sha1.ComputeHash(hash.GetBytes());

                //await _versions.InsertOneAsync(version);
                try {
                    await _versions.ReplaceOneAsync(BF.Eq("symbol", symbol), version, Upsert);
                }catch (MongoWriteException e)
                {
                    Log.Information("Retrying append symbol {symbol}, attempt {attemptNo}", symbol, attemptNo++);
                    continue;
                }
                await _segments.InsertOneAsync(segment);

                //Log.Information ("inserted new segment {segment} for symbol {symbol}", segment["_id"], symbol);
                //Log.Information ("replaced version {0} for symbol {symbol} sha1 {sha}", version["_id"], symbol, sha);

                // update parents in versions
                //var res = await _segments.UpdateManyAsync (BF.Eq("symbol", symbol), BU.Set ("parent", new BsonArray{ version ["_id"] }));
                //Log.Debug ("updated segments parents {0}".Args(res.MatchedCount));
                return(version);
            }
        }