/// <summary>
/// Write out dataset to the output stream
/// </summary>
/// <param name="dataSet">Dataset to write</param>
/// <param name="compression">Compression method</param>
/// <param name="append">When true, appends to the file, otherwise creates a new file.</param>
public void Write(DataSet dataSet, CompressionMethod compression = CompressionMethod.Gzip, bool append = false)
{
   PrepareFile(dataSet, append);

   int offset = 0;
   int count;

   // Write the dataset in row groups of at most _writerOptions.RowGroupsSize rows each.
   // do-while is deliberate: an empty dataset still produces one (empty) row group,
   // matching the previous behavior.
   do
   {
      count = Math.Min(_writerOptions.RowGroupsSize, dataSet.Count - offset);

      Thrift.RowGroup rg = _meta.AddRowGroup();
      rg.Columns = new List<Thrift.ColumnChunk>();

      foreach (SchemaElement se in dataSet.Schema.Flatten())
      {
         var cw = new ColumnWriter(Stream, ThriftStream, _meta, se, compression, _formatOptions, _writerOptions);
         IList values = dataSet.GetColumn(se, offset, count);
         Thrift.ColumnChunk chunk = cw.Write(offset, count, values);
         rg.Columns.Add(chunk);
      }

      // Row group's size is a sum of _uncompressed_ sizes of all columns in it, including the
      // headers (per the Parquet format spec for RowGroup.total_byte_size).
      // Luckily ColumnChunk already contains sizes of page+header in its meta.
      // NOTE(review): previously this summed Total_compressed_size, contradicting both the
      // comment and the spec; fixed to Total_uncompressed_size.
      rg.Total_byte_size = rg.Columns.Sum(c => c.Meta_data.Total_uncompressed_size);
      rg.Num_rows = count;

      // count equals RowGroupsSize on every iteration except possibly the last,
      // so advancing by count is equivalent to advancing by RowGroupsSize here.
      offset += count;
   }
   while (offset < dataSet.Count);

   _dataWritten = true;
}
public void Can_traverse_streamed()
{
    // Build an in-memory index over the test corpus, persist it to streams,
    // then verify that every token can be located again through a ColumnReader.
    var vectorModel = new BagOfCharsModel();
    var index = vectorModel.CreateTree(vectorModel, _data);

    using (var indexStream = new MemoryStream())
    using (var vectorStream = new MemoryStream())
    using (var pageStream = new MemoryStream())
    {
        using (var writer = new ColumnWriter(indexStream, keepStreamOpen: true))
        {
            writer.CreatePage(index, vectorStream, new PageIndexWriter(pageStream, keepStreamOpen: true));
        }

        // Rewind so the page index can be read back from the beginning.
        pageStream.Position = 0;

        Assert.DoesNotThrow(() =>
        {
            using (var reader = new ColumnReader(new PageIndexReader(pageStream), indexStream, vectorStream, _sessionFactory, _loggerFactory.CreateLogger<ColumnReader>()))
            {
                foreach (var word in _data)
                {
                    foreach (var queryVector in vectorModel.Tokenize(word))
                    {
                        var hit = reader.ClosestMatch(queryVector, vectorModel);

                        if (hit == null)
                        {
                            throw new Exception($"unable to find {word} in tree.");
                        }

                        if (hit.Score < vectorModel.IdenticalAngle)
                        {
                            throw new Exception($"unable to score {word}.");
                        }

                        Debug.WriteLine($"{word} matched vector in disk with {hit.Score * 100}% certainty.");
                    }
                }
            }
        });
    }
}
/// <summary>
/// Entry point: reads a delimited triangle feed in two passes —
/// first to determine the largest triangle dimensions, then to build
/// and write out the accumulated claim triangles as CSV.
/// </summary>
private static void Main(string[] args)
{
    // Prompt until a usable feed path is supplied.
    string path;
    while ((path = CaptureFeedPath(args)) == null)
    {
    }

    const char delimiter = ',';

    var stopwatch = new Stopwatch();
    stopwatch.Start();

    var parser = new DelimiterSeparatedFieldParser(delimiter);

    // Pass 1: scan the feed to find the largest origin-year / development-year extent.
    TriangleDimensions largestTriangle = CalculateLargestTriangle(path, parser);

    // Pass 2: re-read the feed, accumulate each triangle and write the CSV output.
    WriteTriangles(path, parser, delimiter, largestTriangle);

    stopwatch.Stop();
    WriteProgress("Processing took: {0}", stopwatch.Elapsed);

    System.Console.WriteLine("Press any key to quit...");
    System.Console.ReadKey();
}

// First pass over the input feed: determines the largest triangle dimensions
// across all records, reporting (and quitting on) any collected errors.
private static TriangleDimensions CalculateLargestTriangle(string path, DelimiterSeparatedFieldParser parser)
{
    using (var inputFileReader = new StreamReader(path))
    {
        var collector = new ErrorCollector();

        WriteProgress("Processing file (this may take some time)...");

        var columnReader = new ColumnReader(inputFileReader, parser);
        var dimensionReader = new Reader<TriangleFeedYearExtract>(columnReader, collector);
        var largestTriangleCalculator = new LargestTriangleCalculator(dimensionReader.Read);
        TriangleDimensions largestTriangle = largestTriangleCalculator.Calculate();

        ReportAndQuitIfErrors(collector, path, "triangle_dimension_calculation_errors.txt");

        return largestTriangle;
    }
}

// Second pass: builds each claim triangle, flattens it against the largest
// dimensions found in the first pass, and writes the result to output.csv.
private static void WriteTriangles(string path, DelimiterSeparatedFieldParser parser, char delimiter, TriangleDimensions largestTriangle)
{
    using (var inputFileReader = new StreamReader(path))
    {
        var collector = new ErrorCollector();

        using (var outputFileWriter = new StreamWriter(BuildPathFromInputPath(path, "output.csv")))
        {
            var columnReader = new ColumnReader(inputFileReader, parser);
            var paymentRecordReader = new Reader<TriangleFeedFullDataExtract>(columnReader, collector);
            var triangleBuilder = new TriangleBuilder(paymentRecordReader, collector);

            // Each triangle is accumulated then padded/flattened to the common dimensions.
            Func<ClaimTriangle, string[]> triangleConverter = triangle => triangle.Accumulate().Flatten(largestTriangle);

            var header = string.Format("{0}, {1}", largestTriangle.OriginYear, largestTriangle.DevelopmentYears);
            var writer = new ColumnWriter<ClaimTriangle>(triangleBuilder.BuildNext, triangleConverter, header, outputFileWriter, delimiter);

            writer.Write();
        }

        // Errors are reported after the output writer is flushed and closed.
        ReportAndQuitIfErrors(collector, path, "payment_record_errors.txt");
    }
}
/// <summary>
/// Visitor callback: writes the buffered values to the given column writer
/// as a single batch and hands the underlying array back to the caller.
/// </summary>
public Array OnColumnWriter<TValue>(ColumnWriter<TValue> columnWriter)
    where TValue : unmanaged
{
    var typedValues = (TValue[])_values;
    columnWriter.WriteBatch(typedValues.Length, typedValues);
    return _values;
}