Esempio n. 1
0
        /// <summary>
        /// Write out dataset to the output stream, split into row groups of at most
        /// <c>_writerOptions.RowGroupsSize</c> rows each.
        /// </summary>
        /// <param name="dataSet">Dataset to write</param>
        /// <param name="compression">Compression method applied to every column chunk</param>
        /// <param name="append">When true, appends to the file, otherwise creates a new file.</param>
        public void Write(DataSet dataSet, CompressionMethod compression = CompressionMethod.Gzip, bool append = false)
        {
            PrepareFile(dataSet, append);

            int offset = 0;
            int count;

            do
            {
                //each row group holds at most RowGroupsSize rows; the final group may be smaller
                count = Math.Min(_writerOptions.RowGroupsSize, dataSet.Count - offset);
                Thrift.RowGroup rg = _meta.AddRowGroup();

                rg.Columns = new List <Thrift.ColumnChunk>();
                foreach (SchemaElement se in dataSet.Schema.Flatten())
                {
                    var   cw                 = new ColumnWriter(Stream, ThriftStream, _meta, se, compression, _formatOptions, _writerOptions);
                    IList values             = dataSet.GetColumn(se, offset, count);
                    Thrift.ColumnChunk chunk = cw.Write(offset, count, values);
                    rg.Columns.Add(chunk);
                }

                //row group's size is a sum of _uncompressed_ sizes of all columns in it, including the headers
                //luckily ColumnChunk already contains sizes of page+header in it's meta
                rg.Total_byte_size = rg.Columns.Sum(c => c.Meta_data.Total_compressed_size);
                rg.Num_rows        = count;

                //advance by the rows actually written (equal to RowGroupsSize on every
                //iteration except possibly the last, where either form terminates the loop)
                offset += count;
            }while (offset < dataSet.Count);

            _dataWritten = true;
        }
Esempio n. 2
0
        public void Can_traverse_streamed()
        {
            var model = new BagOfCharsModel();
            var tree = model.CreateTree(model, _data);

            //using declarations dispose in reverse order at method exit, matching the
            //original nested using blocks
            using var indexStream = new MemoryStream();
            using var vectorStream = new MemoryStream();
            using var pageStream = new MemoryStream();

            //write the tree out as a page before attempting to read it back
            using (var writer = new ColumnWriter(indexStream, keepStreamOpen: true))
            {
                writer.CreatePage(tree, vectorStream, new PageIndexWriter(pageStream, keepStreamOpen: true));
            }

            pageStream.Position = 0;

            //every tokenized word must be findable again with a score at or above IdenticalAngle
            Assert.DoesNotThrow(() =>
            {
                using (var reader = new ColumnReader(new PageIndexReader(pageStream), indexStream, vectorStream, _sessionFactory, _loggerFactory.CreateLogger <ColumnReader>()))
                {
                    foreach (var word in _data)
                    {
                        foreach (var vector in model.Tokenize(word))
                        {
                            var hit = reader.ClosestMatch(vector, model);

                            if (hit == null)
                            {
                                throw new Exception($"unable to find {word} in tree.");
                            }

                            if (hit.Score < model.IdenticalAngle)
                            {
                                throw new Exception($"unable to score {word}.");
                            }

                            Debug.WriteLine($"{word} matched vector in disk with {hit.Score * 100}% certainty.");
                        }
                    }
                }
            });
        }
Esempio n. 3
0
        private static void Main(string[] args)
        {
            //keep prompting until a feed path is captured
            string path;
            while ((path = CaptureFeedPath(args)) == null)
            {
            }

            const char delimiter = ',';

            var timer = Stopwatch.StartNew();

            var fieldParser = new DelimiterSeparatedFieldParser(delimiter);

            //first pass over the feed: determine the largest triangle dimensions
            TriangleDimensions largestTriangle;
            using( var inputFileReader = new StreamReader(path) )
            {
                var errors = new ErrorCollector();

                WriteProgress("Processing file (this may take some time)...");

                var dimensionReader = new Reader<TriangleFeedYearExtract>(new ColumnReader(inputFileReader, fieldParser), errors);
                largestTriangle = new LargestTriangleCalculator(dimensionReader.Read).Calculate();
                ReportAndQuitIfErrors(errors, path, "triangle_dimension_calculation_errors.txt");
            }

            //second pass: build each triangle, accumulate it and write the flattened output
            using( var inputFileReader = new StreamReader(path) )
            {
                var errors = new ErrorCollector();

                using( var outputFileWriter = new StreamWriter(BuildPathFromInputPath(path, "output.csv")) )
                {
                    var paymentRecordReader = new Reader<TriangleFeedFullDataExtract>(new ColumnReader(inputFileReader, fieldParser), errors);
                    var triangleBuilder = new TriangleBuilder(paymentRecordReader, errors);
                    Func<ClaimTriangle, string[]> triangleConverter = triangle => triangle.Accumulate().Flatten(largestTriangle);
                    var header = string.Format("{0}, {1}", largestTriangle.OriginYear, largestTriangle.DevelopmentYears);
                    new ColumnWriter<ClaimTriangle>(triangleBuilder.BuildNext, triangleConverter, header, outputFileWriter, delimiter).Write();
                }

                ReportAndQuitIfErrors(errors, path, "payment_record_errors.txt");
            }

            timer.Stop();

            WriteProgress("Processing took: {0}", timer.Elapsed);
            System.Console.WriteLine("Press any key to quit...");
            System.Console.ReadKey();
        }
Esempio n. 4
0
 /// <summary>
 /// Writes the buffered <c>_values</c> array to the given column writer as a single
 /// batch and returns the same array. Assumes <c>_values</c> actually holds
 /// <typeparamref name="TValue"/> elements; the cast throws otherwise.
 /// </summary>
 public Array OnColumnWriter <TValue>(ColumnWriter <TValue> columnWriter)
     where TValue : unmanaged
 {
     var batch = (TValue[])_values;
     columnWriter.WriteBatch(batch.Length, batch);
     return _values;
 }