/// <inheritdoc cref="State{T}.Sum"/>
/// <remarks>
/// Outer-joins this instance's frequency table with <paramref name="other"/>'s on every
/// column except <c>AnalyzersExt.COUNT_COL</c>, adds the two count columns through
/// <c>AnalyzersExt.ZeroIfNull</c>, and adds the two <c>NumRows</c> values.
/// </remarks>
public override FrequenciesAndNumRows Sum(FrequenciesAndNumRows other)
{
    // Materialized once: the column names feed both the projection and the join
    // condition, so a deferred query would re-read the schema on every enumeration.
    string[] columns = Frequencies.Schema().Fields
        .Select(field => field.Name)
        .Where(name => name != AnalyzersExt.COUNT_COL)
        .ToArray();

    // Take each grouping column from whichever side of the join has it.
    IEnumerable<Column> projectionAfterMerge = columns
        .Select(col => Coalesce(Col($"this.{col}"), Col($"other.{col}")).As(col))
        .Append(
            (AnalyzersExt.ZeroIfNull($"this.{AnalyzersExt.COUNT_COL}") +
             AnalyzersExt.ZeroIfNull($"other.{AnalyzersExt.COUNT_COL}")).As(AnalyzersExt.COUNT_COL));

    // The first column seeds the fold, so only the remaining columns are AND-ed onto it
    // (folding over all columns would repeat the first predicate).
    Column joinCondition = columns
        .Skip(1)
        .Aggregate(
            NullSafeEq(columns.First()),
            (condition, column) => condition.And(NullSafeEq(column)));

    DataFrame frequenciesSum = Frequencies
        .Alias("this")
        .Join(other.Frequencies.Alias("other"), joinCondition, "outer")
        .Select(projectionAfterMerge.ToArray());

    return new FrequenciesAndNumRows(frequenciesSum, NumRows + other.NumRows);
}
/// <summary>
/// Builds a precondition that verifies a schema contains the given column.
/// </summary>
/// <param name="column">The name of the column that must be present.</param>
/// <returns>A callback that throws when the schema it receives does not include the column.</returns>
private static Action<StructType> HasColumn(string column)
{
    void AssertColumnExists(StructType schema)
    {
        if (AnalyzersExt.HasColumn(schema, column))
        {
            return;
        }

        throw new Exception("Input data does not include column!");
    }

    return AssertColumnExists;
}
/// <inheritdoc cref="StandardScanShareableAnalyzer{S}.AdditionalPreconditions" />
/// <remarks>
/// Returns, in order, the <c>AnalyzersExt.HasColumn</c> and <c>AnalyzersExt.IsNumeric</c>
/// checks for <c>ColumnA</c>, followed by the same two checks for <c>ColumnB</c>.
/// </remarks>
public override IEnumerable<Action<StructType>> AdditionalPreconditions()
{
    Action<StructType>[] checks =
    {
        AnalyzersExt.HasColumn(ColumnA),
        AnalyzersExt.IsNumeric(ColumnA),
        AnalyzersExt.HasColumn(ColumnB),
        AnalyzersExt.IsNumeric(ColumnB)
    };

    return checks;
}
/// <inheritdoc cref="ScanShareableAnalyzer{S,M}.AggregationFunctions"/>
/// <remarks>
/// Produces two aggregates: the sum of a 1/0 flag that is 1 when the regex extraction
/// (group 0) on the column is not the empty string, and <c>AnalyzersExt.ConditionalCount(Where)</c>.
/// </remarks>
public override IEnumerable<Column> AggregationFunctions()
{
    // Group 0 of the extraction; compared against "" below to detect a match.
    Column extracted = RegexpExtract(Column(Column.GetOrElse(string.Empty)), Regex.ToString(), 0);
    Column matchFlag = When(extracted != Lit(""), 1).Otherwise(0);

    Column selectedFlag = AnalyzersExt.ConditionalSelection(matchFlag, Where);
    Column matches = Sum(selectedFlag.Cast("integer"));
    Column rows = AnalyzersExt.ConditionalCount(Where);

    return new[] { matches, rows };
}
/// <inheritdoc cref="ScanShareableAnalyzer{S,M}.AggregationFunctions"/>
/// <remarks>
/// Produces two aggregates: the sum of the selection's <c>IsNotNull()</c> flag cast to
/// <c>int</c>, and <c>AnalyzersExt.ConditionalCount(Where)</c>.
/// </remarks>
public override IEnumerable<Column> AggregationFunctions()
{
    Column selected = AnalyzersExt.ConditionalSelection(Column, Where);
    Column nonNullFlag = selected.IsNotNull().Cast("int");

    return new[] { Sum(nonNullFlag), AnalyzersExt.ConditionalCount(Where) };
}
/// <inheritdoc cref="StandardScanShareableAnalyzer{S}.FromAggregationResult"/>
/// <remarks>
/// Returns the empty metric when <paramref name="result"/> is <c>null</c>, when it has no
/// value at <paramref name="offset"/>, or when that value is <c>null</c>; otherwise the
/// value at <paramref name="offset"/> is read as a <c>double</c> and wrapped in a success metric.
/// </remarks>
public virtual DoubleMetric FromAggregationResult(Row result, int offset)
{
    // A null row can reach this method when the caller obtained it via FirstOrDefault().
    bool hasValue = result is not null
        && result.Values.Length > offset
        && result[offset] is not null;

    if (!hasValue)
    {
        return AnalyzersExt.MetricFromEmpty(this, Name, string.Join(',', Columns),
            AnalyzersExt.EntityFrom(Columns));
    }

    return ToSuccessMetric(result.GetAs<double>(offset));
}
/// <inheritdoc cref="Analyzer{S,M}.ComputeMetricFrom"/>
/// <remarks>
/// With a state present, wraps <c>GetMetricValue()</c> in a <c>Try&lt;double&gt;</c> and builds
/// the metric from it; without a state, returns the empty metric.
/// </remarks>
public override DoubleMetric ComputeMetricFrom(Option<S> state)
{
    if (state.HasValue)
    {
        Try<double> metricValue = new Try<double>(state.Value.GetMetricValue());
        return AnalyzersExt.MetricFromValue(metricValue, Name, Instance, _metricEntity);
    }

    return AnalyzersExt.MetricFromEmpty(this, Name, Instance, _metricEntity);
}
}
/// <inheritdoc cref="Analyzer{S,M}.ComputeMetricFrom"/>
/// <remarks>
/// Derives per-column marginal counts from the joint frequency table, joins them back onto
/// it, and sums <c>p(x,y) * ln(p(x,y) / (p(x) * p(y)))</c> over all rows, where each
/// probability is the corresponding count divided by <c>NumRows</c>.
/// Returns the empty metric when no state is present or when the aggregated sum is <c>null</c>.
/// </remarks>
public override DoubleMetric ComputeMetricFrom(Option<FrequenciesAndNumRows> state)
{
    if (!state.HasValue)
    {
        return AnalyzersExt.MetricFromEmpty(this, "MutualInformation", string.Join(',', Columns),
            MetricEntity.Multicolumn);
    }

    long total = state.Value.NumRows;
    string col1 = Columns.First();
    string col2 = Columns.Skip(1).First();

    string freqCol1 = $"__deequ_f1_{col1}";
    string freqCol2 = $"__deequ_f2_{col2}";

    DataFrame jointStats = state.Value.Frequencies;

    // Marginal counts: total occurrences of each distinct value of col1 / col2.
    DataFrame marginalStats1 = jointStats
        .Select(col1, AnalyzersExt.COUNT_COL)
        .GroupBy(col1)
        .Agg(Sum(AnalyzersExt.COUNT_COL).As(freqCol1));

    DataFrame marginalStats2 = jointStats
        .Select(col2, AnalyzersExt.COUNT_COL)
        .GroupBy(col2)
        .Agg(Sum(AnalyzersExt.COUNT_COL).As(freqCol2));

    // Per-row contribution; px, py and pxy are raw counts, normalized by total here.
    Func<Column, Column, Column, Column> miUdf = Udf((double px, double py, double pxy) =>
        pxy / total * Math.Log(pxy / total / (px / total * (py / total))));

    // C# interpolation uses {name}; the previous "${col1}_$col2" form emitted a literal '$'
    // and the literal text "col2" instead of the second column's name.
    string miCol = $"__deequ_mi_{col1}_{col2}";

    DataFrame value = jointStats
        .Join(marginalStats1, col1)
        .Join(marginalStats2, col2)
        .WithColumn(miCol,
            miUdf(Col(freqCol1).Cast("double"), Col(freqCol2).Cast("double"),
                Col(AnalyzersExt.COUNT_COL).Cast("double")))
        .Agg(Sum(miCol));

    Row resultRow = value.First();

    if (resultRow[0] == null)
    {
        return AnalyzersExt.MetricFromEmpty(this, "MutualInformation", string.Join(',', Columns),
            MetricEntity.Multicolumn);
    }

    return AnalyzersExt.MetricFromValue(resultRow.GetAs<double>(0), "MutualInformation",
        string.Join(',', Columns), MetricEntity.Multicolumn);
}
/// <summary>
/// Calculates a <see cref="Metric{T}"/> from a <see cref="State{T}"/>, optionally merging it
/// with a previously stored state and optionally persisting the merged state.
/// </summary>
/// <param name="state">The <see cref="State{T}"/> to compute metrics from.</param>
/// <param name="aggregateWith">The <see cref="IStateLoader"/> used to load a previous state that is merged with <paramref name="state"/>.</param>
/// <param name="saveStateWith">The <see cref="IStatePersister"/> that receives the merged state.</param>
/// <returns>The result of <c>ComputeMetricFrom</c> applied to the merged state.</returns>
public M CalculateMetric(Option<S> state, Option<IStateLoader> aggregateWith,
    Option<IStatePersister> saveStateWith)
{
    // Built on demand so the cast only happens when a loader or persister is actually used.
    Option<IAnalyzer<IMetric>> WrapThis() =>
        new Option<IAnalyzer<IMetric>>((IAnalyzer<IMetric>)this);

    Option<S> previousState = aggregateWith
        .Select(loader => loader.Load<S>(WrapThis()).Value);

    Option<S> mergedState = AnalyzersExt.Merge(previousState, state);

    // Select is used only to invoke Persist when a persister is present; its result is discarded.
    saveStateWith
        .Select(persister => persister.Persist(WrapThis(), mergedState));

    return ComputeMetricFrom(mergedState);
}
/// <inheritdoc cref="ScanShareableAnalyzer{S,M}.AggregationFunctions" />
/// <remarks>
/// Emits six aggregates over the two selected columns, in this order:
/// n, sum(x), sum(y), sum(x*y), sum(x*x), sum(y*y). See https://mathoverflow.net/a/57914.
/// </remarks>
public override IEnumerable<Column> AggregationFunctions()
{
    Column x = AnalyzersExt.ConditionalSelection(ColumnA, Where);
    Column y = AnalyzersExt.ConditionalSelection(ColumnB, Where);

    // NOTE(review): n is counted over x only; rows where exactly one of x / y is null
    // would contribute to some aggregates but not others - confirm this is intended.
    return new[]
    {
        Count(x),
        Sum(x),
        Sum(y),
        Sum(x * y),
        Sum(x * x),
        Sum(y * y)
    };
}
/// <inheritdoc cref="ScanShareableAnalyzer{S,M}.ComputeMetricFrom"/>
/// <remarks>
/// Returns the empty metric when no state is present. Otherwise runs the aggregation
/// functions (parameterized with <c>NumRows</c>) over the frequency table and hands the
/// first collected row, at offset 0, to <c>FromAggregationResult</c>.
/// </remarks>
public override DoubleMetric ComputeMetricFrom(Option<FrequenciesAndNumRows> state)
{
    if (!state.HasValue)
    {
        return AnalyzersExt.MetricFromEmpty(this, Name, string.Join(',', Columns),
            AnalyzersExt.EntityFrom(Columns));
    }

    // Materialized once: the sequence is split into head and tail below, and a deferred
    // sequence would otherwise be evaluated twice.
    Column[] aggregations = AggregationFunctions(state.Value.NumRows).ToArray();

    Row result = state.Value.Frequencies
        .Agg(aggregations.First(), aggregations.Skip(1).ToArray())
        .Collect()
        .FirstOrDefault();

    return FromAggregationResult(result, 0);
}
/// <inheritdoc cref="FrequencyBasedAnalyzer.Preconditions"/>
/// <remarks>
/// Puts the <c>AnalyzersExt.ExactlyNColumns(Columns, 2)</c> checks in front of the base preconditions.
/// </remarks>
public override IEnumerable<Action<StructType>> Preconditions()
{
    var columnCountChecks = AnalyzersExt.ExactlyNColumns(Columns, 2);
    var inheritedChecks = base.Preconditions();

    return columnCountChecks.Concat(inheritedChecks);
}
/// <inheritdoc cref="StandardScanShareableAnalyzer{S}.AdditionalPreconditions"/>
/// <remarks>
/// Returns the <c>AnalyzersExt.HasColumn</c> check for <c>Column</c> followed by the
/// <c>AnalyzersExt.IsString</c> check for the column name (empty string when not set).
/// </remarks>
public override IEnumerable<Action<StructType>> AdditionalPreconditions()
{
    var columnExists = AnalyzersExt.HasColumn(Column);
    var columnIsString = AnalyzersExt.IsString(Column.GetOrElse(string.Empty));

    return new[] { columnExists, columnIsString };
}
/// <inheritdoc cref="ScanShareableAnalyzer{S,M}.FromAggregationResult"/>
/// <remarks>
/// Reads two consecutive values starting at <paramref name="offset"/> as <c>int</c> and
/// builds a <c>NumMatchesAndCount</c> from them, guarded by <c>AnalyzersExt.IfNoNullsIn</c>
/// called with a trailing argument of 2.
/// </remarks>
protected override Option<NumMatchesAndCount> FromAggregationResult(Row result, int offset)
{
    return AnalyzersExt.IfNoNullsIn(result, offset, () =>
    {
        int matches = (int)result.Get(offset);
        int count = (int)result.Get(offset + 1);

        return new NumMatchesAndCount(matches, count);
    }, 2);
}
/// <inheritdoc cref="StandardScanShareableAnalyzer{S}.AdditionalPreconditions"/>
/// <remarks>
/// Returns the <c>AnalyzersExt.HasColumn</c> check followed by the
/// <c>AnalyzersExt.IsNotNested</c> check, both for <c>Column</c>.
/// </remarks>
public override IEnumerable<Action<StructType>> AdditionalPreconditions()
{
    var columnExists = AnalyzersExt.HasColumn(Column);
    var columnIsFlat = AnalyzersExt.IsNotNested(Column);

    return new[] { columnExists, columnIsFlat };
}
/// <inheritdoc cref="StandardScanShareableAnalyzer{S}.FromAggregationResult"/>
/// <remarks>
/// Reads the value at <paramref name="offset"/> as a <c>double</c> and wraps it in a
/// <c>SumState</c>, guarded by <c>AnalyzersExt.IfNoNullsIn</c>.
/// </remarks>
protected override Option<SumState> FromAggregationResult(Row result, int offset)
{
    return AnalyzersExt.IfNoNullsIn(result, offset, () =>
    {
        double sum = result.GetAs<double>(offset);
        return new SumState(sum);
    });
}
/// <inheritdoc cref="StandardScanShareableAnalyzer{S}.AggregationFunctions"/>
/// <remarks>
/// Produces a single aggregate: the sum of the conditional selection, cast to <c>double</c>.
/// </remarks>
public override IEnumerable<Column> AggregationFunctions()
{
    Column selected = AnalyzersExt.ConditionalSelection(Column, Where);
    Column total = Sum(selected).Cast("double");

    return new[] { total };
}
/// <inheritdoc cref="ScanShareableAnalyzer{S,M}.AggregationFunctions"/>
/// <remarks>
/// Produces two aggregates: the sum of the predicate selection cast to <c>int</c>, and
/// <c>AnalyzersExt.ConditionalCount(Where)</c>.
/// </remarks>
public override IEnumerable<Column> AggregationFunctions()
{
    Column predicateFlag = AnalyzersExt.ConditionalSelection(Predicate, Where).Cast("int");
    Column matches = Sum(predicateFlag);
    Column rows = AnalyzersExt.ConditionalCount(Where);

    return new[] { matches, rows };
}
/// <inheritdoc cref="GroupingAnalyzer{S,M}.Preconditions"/>
/// <remarks>
/// Order of the returned checks: <c>AtLeastOne(Columns)</c>, then <c>HasColumn</c> for every
/// column, then <c>IsNotNested</c> for every column, then the base preconditions.
/// </remarks>
public override IEnumerable<Action<StructType>> Preconditions()
{
    IEnumerable<Action<StructType>> checks = new[] { AnalyzersExt.AtLeastOne(Columns) };

    checks = checks.Concat(Columns.Select(AnalyzersExt.HasColumn));
    checks = checks.Concat(Columns.Select(AnalyzersExt.IsNotNested));

    return checks.Concat(base.Preconditions());
}
/// <inheritdoc cref="ScanShareableAnalyzer{S,M}.FromAggregationResult"/>
/// <remarks>
/// Reads the value at <paramref name="offset"/> as <c>double</c> and the next one as
/// <c>int</c>, and builds a <c>MeanState</c> from them, guarded by
/// <c>AnalyzersExt.IfNoNullsIn</c> called with a trailing argument of 2.
/// </remarks>
protected override Option<MeanState> FromAggregationResult(Row result, int offset)
{
    return AnalyzersExt.IfNoNullsIn(result, offset, () =>
    {
        double sum = (double)result.Get(offset);
        int count = (int)result.Get(offset + 1);

        return new MeanState(sum, count);
    }, 2);
}
/// <inheritdoc cref="Analyzer{S,M}.ToFailureMetric"/>
/// <remarks>
/// Delegates to <c>AnalyzersExt.MetricFromFailure</c> with this analyzer's name, instance and entity.
/// </remarks>
public override DoubleMetric ToFailureMetric(Exception e)
{
    return AnalyzersExt.MetricFromFailure(e, Name, Instance, _metricEntity);
}
/// <inheritdoc cref="ScanShareableAnalyzer{S,M}.FromAggregationResult"/>
/// <remarks>
/// Casts every value of the row to <c>int</c> and passes the resulting array to
/// <c>DataTypeHistogram.FromArray</c>, guarded by <c>AnalyzersExt.IfNoNullsIn</c>.
/// </remarks>
protected override Option<DataTypeHistogram> FromAggregationResult(Row result, int offset)
{
    return AnalyzersExt.IfNoNullsIn(result, offset, () =>
    {
        // NOTE(review): all row values are used here, not only those from `offset` on -
        // confirm this is correct when the row is shared with other analyzers.
        int[] counts = result.Values.Select(value => (int)value).ToArray();

        return DataTypeHistogram.FromArray(counts);
    });
}
/// <inheritdoc cref="StandardScanShareableAnalyzer{S}.AggregationFunctions"/>
/// <remarks>
/// Produces a single aggregate: <c>AnalyzersExt.ConditionalCount(Where)</c>.
/// </remarks>
public override IEnumerable<Column> AggregationFunctions()
{
    Column rowCount = AnalyzersExt.ConditionalCount(Where);

    return new[] { rowCount }.AsEnumerable();
}
/// <summary>
/// Wraps an <see cref="Exception"/> in a failed <see cref="DoubleMetric"/>.
/// </summary>
/// <param name="exception">The exception the metric should carry.</param>
/// <returns>
/// The <see cref="DoubleMetric"/> built by <c>AnalyzersExt.MetricFromFailure</c>, using the
/// comma-joined column names as instance.
/// </returns>
public override DoubleMetric ToFailureMetric(Exception exception)
{
    string instance = string.Join(',', Columns);
    var entity = AnalyzersExt.EntityFrom(Columns);

    return AnalyzersExt.MetricFromFailure(exception, Name, instance, entity);
}
/// <summary>
/// Wraps a computed value in a <see cref="DoubleMetric"/>.
/// </summary>
/// <param name="value">The value the metric should carry.</param>
/// <returns>
/// The <see cref="DoubleMetric"/> built by <c>AnalyzersExt.MetricFromValue</c>, using the
/// comma-joined column names as instance.
/// </returns>
protected DoubleMetric ToSuccessMetric(double value)
{
    string instance = string.Join(',', Columns);
    var entity = AnalyzersExt.EntityFrom(Columns);

    return AnalyzersExt.MetricFromValue(value, Name, instance, entity);
}
/// <inheritdoc cref="StandardScanShareableAnalyzer{S}.FromAggregationResult"/>
/// <remarks>
/// Reads the value at <paramref name="offset"/> as an <c>int</c> and wraps it in a
/// <c>NumMatches</c>, guarded by <c>AnalyzersExt.IfNoNullsIn</c>.
/// </remarks>
protected override Option<NumMatches> FromAggregationResult(Row result, int offset)
{
    return AnalyzersExt.IfNoNullsIn(result, offset, () =>
    {
        int matches = result.GetAs<int>(offset);
        return new NumMatches(matches);
    });
}
/// <inheritdoc cref="StandardScanShareableAnalyzer{S}.AggregationFunctions"/>
/// <remarks>
/// Produces a single struct column holding <c>Count</c>, <c>Avg</c> and <c>StddevPop</c>
/// of the conditional selection, in that order.
/// </remarks>
public override IEnumerable<Column> AggregationFunctions()
{
    Column source = Expr(Column.GetOrElse(string.Empty));
    Column selected = AnalyzersExt.ConditionalSelection(source, Where);
    Column stats = Struct(Count(selected), Avg(selected), StddevPop(selected));

    return new[] { stats };
}
/// <inheritdoc cref="Analyzer{S,M}.ToFailureMetric"/>
/// <remarks>
/// Builds the failure metric under the fixed name "MutualInformation", with the
/// comma-joined column names as instance and <c>MetricEntity.Multicolumn</c> as entity.
/// </remarks>
public override DoubleMetric ToFailureMetric(Exception e)
{
    string instance = string.Join(',', Columns);

    return AnalyzersExt.MetricFromFailure(e, "MutualInformation", instance, MetricEntity.Multicolumn);
}
/// <summary>
/// Builds the column expressions this analyzer returns as its aggregation functions.
/// </summary>
/// <returns>
/// The result of <c>AnalyzersExt.ConditionalSelection(Predicate, Where)</c> followed by <c>Count("*")</c>.
/// </returns>
public override IEnumerable<Column> AggregationFunctions()
{
    // NOTE(review): the first entry is the raw selection, not wrapped in an aggregate
    // such as Sum - confirm that callers expect this.
    Column[] aggregates =
    {
        AnalyzersExt.ConditionalSelection(Predicate, Where),
        Count("*")
    };

    return aggregates.AsEnumerable();
}
/// <inheritdoc cref="ScanShareableAnalyzer{S,M}.AggregationFunctions"/>
/// <remarks>
/// Produces a single column: the result of <c>AnalyzersExt.ConditionalSelection(Column, Where)</c>.
/// </remarks>
public override IEnumerable<Column> AggregationFunctions()
{
    Column selected = AnalyzersExt.ConditionalSelection(Column, Where);

    return new[] { selected };
}