/// <summary>
/// Merges this entry's boolean counters into <paramref name="columnStatistics"/>.
/// </summary>
/// <param name="columnStatistics">
/// Accumulator whose bucket counts, value count and null flag are updated in place.
/// </param>
public void FillColumnStatistics(ColumnStatistics columnStatistics)
{
    var buckets = columnStatistics.BucketStatistics;
    if (buckets != null)
    {
        // Slot 0 accumulates FalseCount, slot 1 accumulates TrueCount.
        buckets.Count[0] += FalseCount;
        buckets.Count[1] += TrueCount;
    }
    else
    {
        // First contribution: seed the bucket list with this entry's counts.
        columnStatistics.BucketStatistics = new BucketStatistics
        {
            Count = new List<ulong> { FalseCount, TrueCount }
        };
    }

    columnStatistics.NumberOfValues += NumValues;

    // The null flag is sticky: it is only ever raised, never cleared.
    if (HasNull)
    {
        columnStatistics.HasNull = true;
    }
}
/// <summary>
/// Merges this entry's double minimum, maximum and sum into <paramref name="columnStatistics"/>.
/// </summary>
/// <param name="columnStatistics">
/// Accumulator whose double statistics, value count and null flag are updated in place.
/// </param>
public void FillColumnStatistics(ColumnStatistics columnStatistics)
{
    var totals = columnStatistics.DoubleStatistics;
    if (totals == null)
    {
        // First contribution: copy this entry's values verbatim.
        columnStatistics.DoubleStatistics = new DoubleStatistics
        {
            Minimum = Min,
            Maximum = Max,
            Sum = Sum
        };
    }
    else
    {
        if (Min < totals.Minimum)
        {
            totals.Minimum = Min;
        }

        if (Max > totals.Maximum)
        {
            totals.Maximum = Max;
        }

        // A missing sum on this entry clears the accumulated sum; otherwise the
        // addition is delegated to CheckedAdd (defined outside this block).
        if (Sum.HasValue)
        {
            totals.Sum = CheckedAdd(totals.Sum, Sum.Value);
        }
        else
        {
            totals.Sum = null;
        }
    }

    columnStatistics.NumberOfValues += NumValues;

    if (HasNull)
    {
        columnStatistics.HasNull = true;
    }
}
/// <summary>
/// Merges this entry's timestamp minimum and maximum into <paramref name="columnStatistics"/>.
/// </summary>
/// <param name="columnStatistics">
/// Accumulator whose timestamp statistics, value count and null flag are updated in place.
/// </param>
public void FillColumnStatistics(ColumnStatistics columnStatistics)
{
    var totals = columnStatistics.TimestampStatistics;
    if (totals != null)
    {
        // Widen the accumulated range only when this entry extends it.
        if (Min < totals.Minimum)
        {
            totals.Minimum = Min;
        }

        if (Max > totals.Maximum)
        {
            totals.Maximum = Max;
        }
    }
    else
    {
        // First contribution: the range is exactly this entry's range.
        columnStatistics.TimestampStatistics = new TimestampStatistics
        {
            Minimum = Min,
            Maximum = Max
        };
    }

    columnStatistics.NumberOfValues += NumValues;

    if (HasNull)
    {
        columnStatistics.HasNull = true;
    }
}
/// <summary>
/// Merges this entry's string minimum, maximum and total length into
/// <paramref name="columnStatistics"/>.
/// </summary>
/// <param name="columnStatistics">
/// Accumulator whose string statistics, value count and null flag are updated in place.
/// </param>
/// <remarks>
/// Strings are compared ordinally. A null <c>Min</c>/<c>Max</c> on this entry is treated as
/// "no value" and never overwrites an accumulated bound; an accumulated null bound is
/// replaced by the first non-null value that arrives.
/// </remarks>
public void FillColumnStatistics(ColumnStatistics columnStatistics)
{
    var existing = columnStatistics.StringStatistics;
    if (existing == null)
    {
        columnStatistics.StringStatistics = new StringStatistics
        {
            Minimum = Min,
            Maximum = Max,
            Sum = Sum
        };
    }
    else
    {
        // string.Compare orders null before every non-null string, so without the
        // explicit null checks a null Min would erase the accumulated minimum and a
        // null accumulated minimum could never be replaced.
        if (Min != null
            && (existing.Minimum == null
                || string.Compare(Min, existing.Minimum, StringComparison.Ordinal) < 0))
        {
            existing.Minimum = Min;
        }

        if (Max != null
            && (existing.Maximum == null
                || string.Compare(Max, existing.Maximum, StringComparison.Ordinal) > 0))
        {
            existing.Maximum = Max;
        }

        existing.Sum += Sum;
    }

    columnStatistics.NumberOfValues += NumValues;

    if (HasNull)
    {
        columnStatistics.HasNull = true;
    }
}
/// <summary>
/// Merges this entry's decimal minimum, maximum and sum into
/// <paramref name="columnStatistics"/>, where they are stored as strings.
/// </summary>
/// <param name="columnStatistics">
/// Accumulator whose decimal statistics, value count and null flag are updated in place.
/// </param>
/// <remarks>
/// Values are formatted and parsed with the invariant culture so the stored text does not
/// depend on the machine's regional settings. An empty sum string is what this method
/// stores when no sum is available; once stored it is kept rather than parsed.
/// </remarks>
public void FillColumnStatistics(ColumnStatistics columnStatistics)
{
    var invariant = System.Globalization.CultureInfo.InvariantCulture;
    var existing = columnStatistics.DecimalStatistics;

    if (existing == null)
    {
        columnStatistics.DecimalStatistics = new DecimalStatistics
        {
            Minimum = Min.ToString(invariant),
            Maximum = Max.ToString(invariant),
            Sum = Sum.HasValue ? Sum.Value.ToString(invariant) : ""
        };
    }
    else
    {
        if (Min < decimal.Parse(existing.Minimum, invariant))
        {
            existing.Minimum = Min.ToString(invariant);
        }

        if (Max > decimal.Parse(existing.Maximum, invariant))
        {
            existing.Maximum = Max.ToString(invariant);
        }

        if (!Sum.HasValue || string.IsNullOrEmpty(existing.Sum))
        {
            // Either side has no usable sum. Parsing the empty marker would throw
            // FormatException, so the total simply stays "not available".
            existing.Sum = "";
        }
        else
        {
            decimal? total = CheckedAdd(decimal.Parse(existing.Sum, invariant), Sum.Value);
            existing.Sum = total.HasValue ? total.Value.ToString(invariant) : "";
        }
    }

    columnStatistics.NumberOfValues += NumValues;

    if (HasNull)
    {
        columnStatistics.HasNull = true;
    }
}
/// <summary>
/// Merges this entry's optional timestamp minimum and maximum into
/// <paramref name="columnStatistics"/>.
/// </summary>
/// <param name="columnStatistics">
/// Accumulator whose timestamp statistics, value count and null flag are updated in place.
/// </param>
public void FillColumnStatistics(ColumnStatistics columnStatistics)
{
    if (columnStatistics.TimestampStatistics == null)
    {
        columnStatistics.TimestampStatistics = new TimestampStatistics();
    }

    var totals = columnStatistics.TimestampStatistics;

    // A bound is taken when the accumulator has none yet or when this entry extends it.
    bool lowersMinimum = Min.HasValue
        && (!totals.Minimum.HasValue || Min.Value < totals.Minimum.Value);
    if (lowersMinimum)
    {
        totals.Minimum = Min.Value;
    }

    bool raisesMaximum = Max.HasValue
        && (!totals.Maximum.HasValue || Max.Value > totals.Maximum);
    if (raisesMaximum)
    {
        totals.Maximum = Max.Value;
    }

    columnStatistics.NumberOfValues += NumValues;

    if (HasNull)
    {
        columnStatistics.HasNull = true;
    }
}
/// <summary>
/// Merges this entry's optional integer minimum, maximum and sum into
/// <paramref name="columnStatistics"/>.
/// </summary>
/// <param name="columnStatistics">
/// Accumulator whose integer statistics, value count and null flag are updated in place.
/// </param>
public void FillColumnStatistics(ColumnStatistics columnStatistics)
{
    if (columnStatistics.IntStatistics == null)
    {
        // Start the running sum at zero; CheckedAdd turns it into null when either
        // operand is missing or the addition overflows.
        columnStatistics.IntStatistics = new IntegerStatistics { Sum = 0 };
    }

    var ds = columnStatistics.IntStatistics;

    if (Min.HasValue)
    {
        if (!ds.Minimum.HasValue || Min.Value < ds.Minimum.Value)
        {
            ds.Minimum = Min.Value;
        }
    }

    if (Max.HasValue)
    {
        if (!ds.Maximum.HasValue || Max.Value > ds.Maximum.Value)
        {
            ds.Maximum = Max.Value;
        }
    }

    ds.Sum = CheckedAdd(ds.Sum, Sum);
    columnStatistics.NumberOfValues += NumValues;

    if (HasNull)
    {
        columnStatistics.HasNull = true;
    }
}

/// <summary>
/// Adds two optional 64-bit integers with overflow checking.
/// </summary>
/// <param name="left">Left operand.</param>
/// <param name="right">Right operand.</param>
/// <returns>
/// The sum, or <c>null</c> when either operand is <c>null</c> or the addition overflows.
/// </returns>
long? CheckedAdd(long? left, long? right)
{
    if (!left.HasValue || !right.HasValue)
    {
        return null;
    }

    try
    {
        checked
        {
            return left.Value + right.Value;
        }
    }
    catch (OverflowException)
    {
        return null;
    }
}
} // closes the enclosing type, whose declaration is outside this excerpt
/// <summary>
/// Returns the column statistics, deserializing each entry of <c>cs</c> in order.
/// </summary>
/// <returns>An array with one deserialized statistics object per entry of <c>cs</c>.</returns>
public ColumnStatistics[] getColumnStatistics()
{
    var deserialized = new ColumnStatistics[cs.Count];

    for (int index = 0; index != deserialized.Length; index++)
    {
        deserialized[index] = ColumnStatisticsImpl.deserialize(cs[index]);
    }

    return deserialized;
}
/// <summary>
/// Returns the file-level column statistics, deserializing <c>fileStats</c> entry by entry.
/// </summary>
/// <returns>
/// An array sized by <c>types.Count</c>; element <c>i</c> is deserialized from <c>fileStats[i]</c>.
/// </returns>
public ColumnStatistics[] getStatistics()
{
    var deserialized = new ColumnStatistics[types.Count];

    for (int index = 0; index != deserialized.Length; index++)
    {
        deserialized[index] = ColumnStatisticsImpl.deserialize(fileStats[index]);
    }

    return deserialized;
}
/// <summary>
/// Inspects every text column of <paramref name="row"/> for unnormalized line breaks and
/// redundant spaces, counts the findings and, where enabled, repairs the value.
/// </summary>
/// <param name="row">Row whose text columns are inspected.</param>
/// <param name="tableInfo">Describes the table; supplies the text columns to visit.</param>
/// <param name="statistics">Per-column counters, looked up by column name.</param>
private void HandleRow(DataRow row, TableInfo tableInfo, Dictionary<string, ColumnStatistics> statistics)
{
    foreach (var columnInfo in tableInfo.TextColumns)
    {
        var name = columnInfo.Name;
        ColumnStatistics columnStats = statistics[name];

        var original = row[name] as string;
        if (original == null)
        {
            continue;
        }

        var normalized = original.NormalizeNewLines();
        bool hasUnnormalizedLineBreaks = normalized != original;
        if (hasUnnormalizedLineBreaks)
        {
            columnStats.HasUnnormalizedLineBreaks++;
        }

        // Redundant spaces are detected on the text after line-break normalization.
        var stripped = normalized.StripRedundantSpaces();
        bool hasRedundantSpaces = stripped != normalized;
        if (hasRedundantSpaces)
        {
            columnStats.HasRedundantSpaces++;
        }

        bool repairBreaks = hasUnnormalizedLineBreaks && RepairLineBreaks;
        bool repairSpaces = hasRedundantSpaces && RepairRedundantSpaces;
        if (!repairBreaks && !repairSpaces)
        {
            continue;
        }

        // Repairs start again from the stored value so that only the enabled fixes are applied.
        var updateValue = row[name] as string;

        if (repairBreaks)
        {
            updateValue = updateValue.NormalizeNewLines();
            columnStats.UnnormalizedLineBreaksRepaired++;
        }

        if (repairSpaces)
        {
            updateValue = updateValue.StripRedundantSpaces();
            columnStats.RedundantSpacesRepaired++;
        }

        if (DoUpdate)
        {
            UpdateValue(tableInfo, columnInfo, row, updateValue);
        }
    }
}
/// <summary>
/// Merges this entry's binary total length into <paramref name="columnStatistics"/>.
/// </summary>
/// <param name="columnStatistics">
/// Accumulator whose binary statistics, value count and null flag are updated in place.
/// </param>
public void FillColumnStatistics(ColumnStatistics columnStatistics)
{
    if (columnStatistics.BinaryStatistics == null)
    {
        // Start the running sum at zero; CheckedAdd turns it into null when an
        // operand is missing or the addition overflows.
        columnStatistics.BinaryStatistics = new BinaryStatistics { Sum = 0 };
    }

    var ds = columnStatistics.BinaryStatistics;
    ds.Sum = CheckedAdd(ds.Sum, Sum.Value);
    columnStatistics.NumberOfValues += NumValues;

    if (HasNull)
    {
        columnStatistics.HasNull = true;
    }
}

/// <summary>
/// Adds two optional 64-bit integers with overflow checking.
/// </summary>
/// <param name="left">Left operand.</param>
/// <param name="right">Right operand.</param>
/// <returns>
/// The sum, or <c>null</c> when either operand is <c>null</c> or the addition overflows.
/// </returns>
long? CheckedAdd(long? left, long? right)
{
    if (!left.HasValue || !right.HasValue)
    {
        return null;
    }

    try
    {
        checked
        {
            return left.Value + right.Value;
        }
    }
    catch (OverflowException)
    {
        return null;
    }
}
} // closes the enclosing type, whose declaration is outside this excerpt
/// <summary>
/// Merges this entry's binary total length into <paramref name="columnStatistics"/>.
/// </summary>
/// <param name="columnStatistics">
/// Accumulator whose binary statistics, value count and null flag are updated in place.
/// </param>
public void FillColumnStatistics(ColumnStatistics columnStatistics)
{
    if (columnStatistics.BinaryStatistics == null)
    {
        // First contribution: the total is this entry's sum.
        columnStatistics.BinaryStatistics = new BinaryStatistics { Sum = Sum };
    }
    else
    {
        columnStatistics.BinaryStatistics.Sum += Sum;
    }

    columnStatistics.NumberOfValues += NumValues;

    if (HasNull)
    {
        columnStatistics.HasNull = true;
    }
}
/// <summary>
/// Merges this entry's optional string minimum, maximum and total length into
/// <paramref name="columnStatistics"/>.
/// </summary>
/// <param name="columnStatistics">
/// Accumulator whose string statistics, value count and null flag are updated in place.
/// </param>
public void FillColumnStatistics(ColumnStatistics columnStatistics)
{
    if (columnStatistics.StringStatistics == null)
    {
        columnStatistics.StringStatistics = new StringStatistics { Sum = 0 };
    }

    var ds = columnStatistics.StringStatistics;

    // Strings are compared ordinally; a null Min/Max on this entry is ignored.
    if (Min != null)
    {
        if (ds.Minimum == null || string.Compare(Min, ds.Minimum, StringComparison.Ordinal) < 0)
        {
            ds.Minimum = Min;
        }
    }

    if (Max != null)
    {
        if (ds.Maximum == null || string.Compare(Max, ds.Maximum, StringComparison.Ordinal) > 0)
        {
            ds.Maximum = Max;
        }
    }

    ds.Sum += Sum;
    columnStatistics.NumberOfValues += NumValues;

    if (HasNull)
    {
        columnStatistics.HasNull = true;
    }
}
} // closes the enclosing type, whose declaration is outside this excerpt
/// <summary>
/// Merges this entry's optional boolean counters into <paramref name="columnStatistics"/>.
/// </summary>
/// <param name="columnStatistics">
/// Accumulator whose bucket counts, value count and null flag are updated in place.
/// </param>
public void FillColumnStatistics(ColumnStatistics columnStatistics)
{
    //TODO BucketStatistics are stored in a List, so how can FalseCount be optional? Just use zeros for now
    var buckets = columnStatistics.BucketStatistics;
    if (buckets == null)
    {
        buckets = new BucketStatistics { Count = new List<ulong> { 0, 0 } };
        columnStatistics.BucketStatistics = buckets;
    }

    // A missing counter contributes zero. Slot 0 is the false count, slot 1 the true count.
    buckets.Count[0] += FalseCount.GetValueOrDefault();
    buckets.Count[1] += TrueCount.GetValueOrDefault();

    columnStatistics.NumberOfValues += NumValues;

    if (HasNull)
    {
        columnStatistics.HasNull = true;
    }
}
/// <summary>
/// Writes the JSON fields describing <paramref name="cs"/>: the value count and null flag,
/// followed by the type-specific fields and a "type" tag.
/// </summary>
/// <param name="writer">JSON writer that receives the key/value pairs.</param>
/// <param name="cs">Statistics to describe; nothing is written when it is <c>null</c>.</param>
private static void writeColumnStatistics(JsonWriter writer, ColumnStatistics cs)
{
    if (cs == null)
    {
        return;
    }

    writer.key("count").value(cs.getNumberOfValues());
    writer.key("hasNull").value(cs.hasNull());

    if (cs is BinaryColumnStatistics)
    {
        var binary = (BinaryColumnStatistics)cs;
        writer.key("totalLength").value(binary.getSum());
        writer.key("type").value(OrcProto.Type.Types.Kind.BINARY.ToString());
    }
    else if (cs is BooleanColumnStatistics)
    {
        var boolean = (BooleanColumnStatistics)cs;
        writer.key("trueCount").value(boolean.getTrueCount());
        writer.key("falseCount").value(boolean.getFalseCount());
        writer.key("type").value(OrcProto.Type.Types.Kind.BOOLEAN.ToString());
    }
    else if (cs is IntegerColumnStatistics)
    {
        var integer = (IntegerColumnStatistics)cs;
        writer.key("min").value(integer.getMinimum());
        writer.key("max").value(integer.getMaximum());
        // The sum is only emitted when the statistics report it as defined.
        if (integer.isSumDefined())
        {
            writer.key("sum").value(integer.getSum());
        }
        writer.key("type").value(OrcProto.Type.Types.Kind.LONG.ToString());
    }
    else if (cs is DoubleColumnStatistics)
    {
        var floating = (DoubleColumnStatistics)cs;
        writer.key("min").value(floating.getMinimum());
        writer.key("max").value(floating.getMaximum());
        writer.key("sum").value(floating.getSum());
        writer.key("type").value(OrcProto.Type.Types.Kind.DOUBLE.ToString());
    }
    else if (cs is StringColumnStatistics)
    {
        var text = (StringColumnStatistics)cs;
        writer.key("min").value(text.getMinimum());
        writer.key("max").value(text.getMaximum());
        writer.key("totalLength").value(text.getSum());
        writer.key("type").value(OrcProto.Type.Types.Kind.STRING.ToString());
    }
    else if (cs is DateColumnStatistics)
    {
        // The min/max output for this type is compiled out; only the type tag is written.
        if (((DateColumnStatistics)cs).getMaximum() != null)
        {
#if false
            writer.key("min").value(((DateColumnStatistics)cs).getMinimum());
            writer.key("max").value(((DateColumnStatistics)cs).getMaximum());
#endif
        }
        writer.key("type").value(OrcProto.Type.Types.Kind.DATE.ToString());
    }
    else if (cs is TimestampColumnStatistics)
    {
        // The min/max output for this type is compiled out; only the type tag is written.
        if (((TimestampColumnStatistics)cs).getMaximum() != null)
        {
#if false
            writer.key("min").value(((TimestampColumnStatistics)cs).getMinimum());
            writer.key("max").value(((TimestampColumnStatistics)cs).getMaximum());
#endif
        }
        writer.key("type").value(OrcProto.Type.Types.Kind.TIMESTAMP.ToString());
    }
    else if (cs is DecimalColumnStatistics)
    {
        // The min/max/sum output for this type is compiled out; only the type tag is written.
        if (((DecimalColumnStatistics)cs).getMaximum() != null)
        {
#if false
            writer.key("min").value(((DecimalColumnStatistics)cs).getMinimum());
            writer.key("max").value(((DecimalColumnStatistics)cs).getMaximum());
            writer.key("sum").value(((DecimalColumnStatistics)cs).getSum());
#endif
        }
        writer.key("type").value(OrcProto.Type.Types.Kind.DECIMAL.ToString());
    }
}
/// <summary>
/// Writes four stripes whose string column alternates between populated and all-null
/// row groups, then checks the hasNull flag and value counts at file and stripe level.
/// </summary>
public void testHasNull()
{
    ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(SimpleStruct));

    // STRIPE 1: five row groups (RG1..RG5) of 1000 rows each; null means a null string column.
    string[] firstStripeRowGroups = { "RG1", null, "RG3", null, null };
    // STRIPE 2..4: 5000 rows each, all sharing one string value.
    string[] laterStripes = { null, "STRIPE-3", null };

    using (Stream file = File.OpenWrite(TestFilePath))
    using (Writer writer = OrcFile.createWriter(TestFilePath, file, OrcFile.writerOptions(conf)
        .inspector(inspector)
        .rowIndexStride(1000)
        .stripeSize(10000)
        .bufferSize(10000)))
    {
        foreach (string rowGroupValue in firstStripeRowGroups)
        {
            for (int i = 0; i < 1000; i++)
            {
                writer.addRow(new SimpleStruct(bytes(1, 2, 3), rowGroupValue));
            }
        }

        foreach (string stripeValue in laterStripes)
        {
            for (int i = 0; i < 5000; i++)
            {
                writer.addRow(new SimpleStruct(bytes(1, 2, 3), stripeValue));
            }
        }
    }

    Reader reader = OrcFile.createReader(TestFilePath, OrcFile.readerOptions(conf));

    // check the file level stats
    ColumnStatistics[] stats = reader.getStatistics();
    Assert.Equal(20000, stats[0].getNumberOfValues());
    Assert.Equal(20000, stats[1].getNumberOfValues());
    Assert.Equal(7000, stats[2].getNumberOfValues());
    Assert.Equal(false, stats[0].hasNull());
    Assert.Equal(false, stats[1].hasNull());
    Assert.Equal(true, stats[2].hasNull());

    // check the stripe level stats: the first two columns never report nulls,
    // the third column reports nulls in every stripe except stripe 3
    List<StripeStatistics> stripeStats = reader.getStripeStatistics();
    bool[] thirdColumnHasNull = { true, true, false, true };
    for (int stripe = 0; stripe < thirdColumnHasNull.Length; stripe++)
    {
        StripeStatistics current = stripeStats[stripe];
        ColumnStatistics first = current.getColumnStatistics()[0];
        ColumnStatistics second = current.getColumnStatistics()[1];
        ColumnStatistics third = current.getColumnStatistics()[2];
        Assert.Equal(false, first.hasNull());
        Assert.Equal(false, second.hasNull());
        Assert.Equal(thirdColumnHasNull[stripe], third.hasNull());
    }

#if false
    // Test file dump
    TextWriter origOut = System.Console.Out;
    string outputFilename = "orc-file-has-null.out";
    FileOutputStream myOut = new FileOutputStream(workDir + File.separator + outputFilename);

    // replace stdout and run command
    System.Console.SetOut(new StreamWriter(myOut));
    FileDump.main(new String[] { testFilePath.toString(), "--rowindex=2" });
    System.Console.Out.Flush();
    System.SetOut(origOut);

    TestFileDump.checkOutput(outputFilename, workDir + File.separator + outputFilename);
#endif
}
/// <summary>
/// Finishes the current stripe: flushes every column writer, writes the index streams,
/// the data streams and the stripe footer to the output stream, records the stripe's
/// statistics and layout information, and resets the writers for the next stripe.
/// </summary>
void CompleteStripe()
{
    var footer = new Protocol.StripeFooter();
    var statistics = new Protocol.StripeStatistics();

    //Columns
    foreach (var column in _columnWriters)
    {
        column.ColumnWriter.FlushBuffers();

        //DictionaryLength is only used by StringWriter
        var stringWriter = column.ColumnWriter as ColumnTypes.StringWriter;
        var dictionaryLength = stringWriter?.DictionaryLength ?? 0;
        footer.AddColumn(column.ColumnWriter.ColumnEncoding, dictionaryLength);
    }

    // The stripe starts at the current output position.
    var information = new Protocol.StripeInformation
    {
        Offset = (ulong)_outputStream.Position,
        NumberOfRows = (ulong)_rowsInStripe
    };

    //Indexes
    foreach (var column in _columnWriters)
    {
        //Write the index buffer
        var rowIndex = _bufferFactory.CreateBuffer(Protocol.StreamKind.RowIndex);
        column.ColumnWriter.Statistics.WriteToBuffer(rowIndex, i => column.ColumnWriter.Buffers[i].MustBeIncluded);
        rowIndex.CopyTo(_outputStream);

        //Add the index to the footer
        footer.AddDataStream(column.ColumnWriter.ColumnId, rowIndex);

        //Collect summary statistics, both for this stripe and for the whole file
        var stripeColumnStats = new ColumnStatistics();
        foreach (var entry in column.ColumnWriter.Statistics)
        {
            entry.FillColumnStatistics(stripeColumnStats);
            entry.FillColumnStatistics(column.FileStatistics);
        }
        statistics.ColStats.Add(stripeColumnStats);
    }
    _stripeStats.Add(statistics);

    information.IndexLength = (ulong)_outputStream.Position - information.Offset;

    //Data streams
    foreach (var column in _columnWriters)
    {
        foreach (var dataBuffer in column.ColumnWriter.Buffers)
        {
            if (dataBuffer.MustBeIncluded)
            {
                dataBuffer.CopyTo(_outputStream);
                footer.AddDataStream(column.ColumnWriter.ColumnId, dataBuffer);
            }
        }
    }

    // Everything written after the index section, up to the current position, is data.
    information.DataLength = (ulong)_outputStream.Position - information.IndexLength - information.Offset;

    //Footer
    long footerLength;
    _bufferFactory.SerializeAndCompressTo(_outputStream, footer, out footerLength);
    information.FooterLength = (ulong)footerLength;

    _stripeInformations.Add(information);

    _rowsInFile += _rowsInStripe;
    _rowsInStripe = 0;

    foreach (var column in _columnWriters)
    {
        column.ColumnWriter.Reset();
    }
}
/// <summary>
/// Adds this entry's value count to <paramref name="columnStatistics"/>; no other
/// statistic is touched.
/// </summary>
/// <param name="columnStatistics">Accumulator whose value count is increased.</param>
public void FillColumnStatistics(ColumnStatistics columnStatistics)
    => columnStatistics.NumberOfValues += NumValues;
/// <summary>
/// Merges this entry's optional decimal minimum, maximum and sum into
/// <paramref name="columnStatistics"/>, where they are stored as strings.
/// </summary>
/// <param name="columnStatistics">
/// Accumulator whose decimal statistics, value count and null flag are updated in place.
/// </param>
/// <remarks>
/// Values are formatted and parsed with the invariant culture so the stored text does not
/// depend on the machine's regional settings.
/// </remarks>
public void FillColumnStatistics(ColumnStatistics columnStatistics)
{
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    if (columnStatistics.DecimalStatistics == null)
    {
        //null means overflow so start with zero
        columnStatistics.DecimalStatistics = new DecimalStatistics() { Sum = "0" };
    }

    var ds = columnStatistics.DecimalStatistics;

    if (Min.HasValue)
    {
        if (String.IsNullOrEmpty(ds.Minimum) || Min.Value < Decimal.Parse(ds.Minimum, invariant))
        {
            ds.Minimum = Min.Value.ToString(invariant);
        }
    }

    if (Max.HasValue)
    {
        if (String.IsNullOrEmpty(ds.Maximum) || Max.Value > Decimal.Parse(ds.Maximum, invariant))
        {
            ds.Maximum = Max.Value.ToString(invariant);
        }
    }

    // Once the sum has been cleared it stays cleared; otherwise keep accumulating.
    if (!String.IsNullOrEmpty(ds.Sum))
    {
        ds.Sum = CheckedAdd(decimal.Parse(ds.Sum, invariant), Sum.Value)?.ToString(invariant);
    }

    columnStatistics.NumberOfValues += NumValues;

    if (HasNull)
    {
        columnStatistics.HasNull = true;
    }
}

/// <summary>
/// Adds two optional decimals, reporting overflow as <c>null</c>.
/// </summary>
/// <param name="left">Left operand.</param>
/// <param name="right">Right operand.</param>
/// <returns>
/// The sum; <c>null</c> when <paramref name="left"/> or <paramref name="right"/> is
/// <c>null</c> or when the addition throws <see cref="OverflowException"/>.
/// </returns>
decimal? CheckedAdd(decimal? left, decimal? right)
{
    if (!left.HasValue)
    {
        return null;
    }

    try
    {
        checked
        {
            // Lifted addition: a null right operand yields null.
            return left.Value + right;
        }
    }
    catch (OverflowException)
    {
        return null;
    }
}
} // closes the enclosing type, whose declaration is outside this excerpt
/// <summary>
/// Initialize coloring rules for newly selected QueryColumn
/// </summary>
#if false
internal CondFormatRules InitializeRulesBasedOnDataValues()
{
    Color[] palette = GetColors();
    CondFormatRules rules = new CondFormatRules();

    if (ResultsField == null)
    {
        return rules;
    }

    QueryColumn qc = ResultsField.QueryColumn;
    ColumnStatistics stats = ResultsField.GetStats();

    // Structure columns get substructure-search rules, every other type gets equality rules.
    bool isStructureColumn = qc.MetaColumn.DataType == MetaColumnType.Structure;

    for (int index = 0; index < stats.DistinctValueList.Count; index++)
    {
        MobiusDataType distinctValue = stats.DistinctValueList[index];
        CondFormatRule rule = new CondFormatRule();

        if (isStructureColumn)
        {
            rule.Op = "SSS";
            rule.OpCode = CondFormatOpCode.SSS;
            ChemicalStructureMx structure = distinctValue as ChemicalStructureMx;
            if (structure != null)
            {
                rule.Value = structure.ChimeString;
            }
        }
        else
        {
            rule.Op = "Equal to";
            rule.OpCode = CondFormatOpCode.Eq;
            rule.Value = distinctValue.FormattedText;
        }

        // Colors are assigned round-robin over the palette.
        rule.BackColor1 = palette[index % palette.Length];
        rules.Add(rule);
        // if (index + 1 >= 25) break; // limit number of items
    }

    if (stats.NullsExist)
    {
        CondFormatRule missing = new CondFormatRule();
        missing.Name = "Missing Data";
        missing.Op = "Missing";
        missing.OpCode = CondFormatOpCode.NotExists;
        missing.BackColor1 = CondFormatMatcher.DefaultMissingValueColor;
        rules.Add(missing);
    }

    SetRules(ResultsField, rules); // put into the grid
    return rules;
}