// Seeds the repository with 31 days of synthetic metrics for two marketplaces,
// so the anomaly checks have a history of data points to compare against.
private void FillRepositoryWithPreviousResults(IMetricsRepository repository)
{
    Enumerable.Range(1, 31)
        .ToList()
        .ForEach(pastDay =>
        {
            var pastResultsEU = new Dictionary<IAnalyzer<IMetric>, IMetric>
            {
                { Initializers.Size(), new DoubleMetric(MetricEntity.Dataset, "*", "Size", Math.Floor(pastDay / 3.0)) },
                { Initializers.Mean("sales"), new DoubleMetric(MetricEntity.Column, "sales", "Mean", pastDay * 7) }
            };

            var pastResultsNA = new Dictionary<IAnalyzer<IMetric>, IMetric>
            {
                { Initializers.Size(), new DoubleMetric(MetricEntity.Dataset, "*", "Size", pastDay) },
                { Initializers.Mean("sales"), new DoubleMetric(MetricEntity.Column, "sales", "Mean", pastDay * 9) }
            };

            var analyzerContextEU = new AnalyzerContext(pastResultsEU);
            var analyzerContextNA = new AnalyzerContext(pastResultsNA);

            long dateTime = CreateDate(2018, 7, pastDay);

            repository.Save(new ResultKey(dateTime,
                new Dictionary<string, string> { { "marketplace", "EU" } }), analyzerContextEU);
            repository.Save(new ResultKey(dateTime,
                new Dictionary<string, string> { { "marketplace", "NA" } }), analyzerContextNA);
        });
}
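// Hypothetical usage sketch (not part of the original source): seed an in-memory
// repository before running the anomaly checks. Assumes an InMemoryMetricsRepository
// implementation of IMetricsRepository is available, as in deequ.
private void ExampleSeedRepository()
{
    var repository = new InMemoryMetricsRepository();
    FillRepositoryWithPreviousResults(repository);
    // repository now holds one (EU, NA) pair of metric snapshots per day of July 2018.
}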
public void save_should_ignore_failed_result_metrics_when_saving()
{
    Dictionary<IAnalyzer<IMetric>, IMetric> metrics = new Dictionary<IAnalyzer<IMetric>, IMetric>
    {
        { Initializers.Size(Option<string>.None),
            new DoubleMetric(MetricEntity.Column, "Size", "*", Try<double>.From(() => 5.0)) },
        // Assumed completion (the snippet is truncated in the source): a second entry whose
        // Try<double> throws, which the repository should drop when saving.
        { Initializers.Completeness("ColumnA"),
            new DoubleMetric(MetricEntity.Column, "Completeness", "ColumnA",
                Try<double>.From(() => throw new ArgumentException("Some"))) }
    };

    // The remainder of the test (saving the metrics and asserting that only the
    // successful metric is persisted) is truncated in the source.
}
public void analysis_results_serialization_with_mixed_Values_should_fail()
{
    ArgumentException sampleException = new ArgumentException("Some");

    AnalyzerContext analyzerContextWithMixedValues = new AnalyzerContext(
        new Dictionary<IAnalyzer<IMetric>, IMetric>
        {
            { Initializers.Size(Option<string>.None),
                new DoubleMetric(MetricEntity.Column, "Size", "*", Try<double>.From(() => 5.0)) },
            // Assumed completion (the snippet is truncated in the source): a second metric
            // whose Try<double> throws sampleException, producing the mix of successful and
            // failed values the test name refers to.
            { Initializers.Completeness("ColumnA"),
                new DoubleMetric(MetricEntity.Column, "Completeness", "ColumnA",
                    Try<double>.From(() => throw sampleException)) }
        });

    // The serialization call and the assertion that it fails are truncated in the source.
}
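// Hypothetical sketch (not part of the original source) of the Try<double> semantics the
// two tests above rely on: Try<double>.From captures either the computed value or the
// thrown exception instead of propagating it.
private static void ExampleTrySemantics()
{
    Try<double> succeeded = Try<double>.From(() => 5.0);
    Try<double> failed = Try<double>.From(() => throw new ArgumentException("Some"));

    // Success is assumed to be an Option<double> that is empty for a failed computation,
    // consistent with the dm.Value.Success.Value access in DoAnalysisRun below.
    // succeeded.Success.HasValue == true, failed.Success.HasValue == false.
}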
private VerificationResult CreateAnomalyChecksAndRunEverything(
    DataFrame data,
    IMetricsRepository repository,
    Check otherCheck,
    IEnumerable<IAnalyzer<IMetric>> additionalRequiredAnalyzers)
{
    // We only want to use historic data with the EU tag for the anomaly checks since the new
    // data point is from the EU marketplace
    var filterEU = new Dictionary<string, string> { { "marketplace", "EU" } };

    // We only want to use data points before the date time associated with the current
    // data point and only ones that are from 2018
    var afterDateTime = CreateDate(2018, 1, 1);
    var beforeDateTime = CreateDate(2018, 8, 1);

    // Config for the size anomaly check
    var sizeAnomalyCheckConfig = new AnomalyCheckConfig(CheckLevel.Error, "Size only increases",
        filterEU, afterDateTime, beforeDateTime);
    var sizeAnomalyDetectionStrategy = new AbsoluteChangeStrategy(0);

    // Config for the mean sales anomaly check
    var meanSalesAnomalyCheckConfig = new AnomalyCheckConfig(
        CheckLevel.Warning,
        "Sales mean within 2 standard deviations",
        filterEU,
        afterDateTime,
        beforeDateTime);
    var meanSalesAnomalyDetectionStrategy = new OnlineNormalStrategy(
        upperDeviationFactor: 2,
        lowerDeviationFactor: Option<double>.None,
        ignoreAnomalies: false);

    // ResultKey to be used when saving the results of this run
    var currentRunResultKey = new ResultKey(CreateDate(2018, 8, 1),
        new Dictionary<string, string> { { "marketplace", "EU" } });

    return new VerificationSuite()
        .OnData(data)
        .AddCheck(otherCheck)
        .AddRequiredAnalyzers(additionalRequiredAnalyzers)
        .UseRepository(repository)
        // Add the Size anomaly check
        .AddAnomalyCheck(sizeAnomalyDetectionStrategy, Initializers.Size(), sizeAnomalyCheckConfig)
        // Add the Mean sales anomaly check
        .AddAnomalyCheck(meanSalesAnomalyDetectionStrategy, Initializers.Mean("sales"),
            meanSalesAnomalyCheckConfig)
        // Save new data point in the repository after we calculated everything
        .SaveOrAppendResult(currentRunResultKey)
        .Run();
}
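// Hypothetical usage sketch (not part of the original source): run the anomaly checks
// against a repository seeded by FillRepositoryWithPreviousResults. The Check constructor
// and the result.Status property are assumed to mirror deequ's API.
private void ExampleRunAnomalyChecks(DataFrame data)
{
    var repository = new InMemoryMetricsRepository(); // assumed repository implementation
    FillRepositoryWithPreviousResults(repository);

    var otherCheck = new Check(CheckLevel.Error, "integrity checks"); // assumed constructor
    var additionalRequiredAnalyzers = new IAnalyzer<IMetric>[] { Initializers.Mean("sales") };

    VerificationResult result = CreateAnomalyChecksAndRunEverything(
        data, repository, otherCheck, additionalRequiredAnalyzers);

    // result.Status is expected to reflect any anomalies found against the EU history.
}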
public static AnalyzerContext DoAnalysisRun(
    DataFrame data,
    IEnumerable<IAnalyzer<IMetric>> analyzers,
    Option<IStateLoader> aggregateWith,
    Option<IStatePersister> saveStatesWith,
    StorageLevel storageLevelOfGroupedDataForMultiplePasses,
    AnalysisRunnerRepositoryOptions metricsRepositoryOptions = default,
    AnalysisRunnerFileOutputOptions fileOutputOptions = default)
{
    if (!analyzers.Any())
    {
        return AnalyzerContext.Empty();
    }

    IAnalyzer<IMetric>[] enumerable = analyzers as IAnalyzer<IMetric>[] ?? analyzers.ToArray();
    IEnumerable<IAnalyzer<IMetric>> distinctAnalyzers = enumerable.Distinct();

    // Reuse results already computed for the given key, if a repository is configured.
    AnalyzerContext resultComputedPreviously =
        (metricsRepositoryOptions?.metricRepository.HasValue,
         metricsRepositoryOptions?.reuseExistingResultsForKey.HasValue) switch
        {
            (true, true) => metricsRepositoryOptions?.metricRepository.Value
                .LoadByKey(metricsRepositoryOptions.reuseExistingResultsForKey.Value)
                .GetOrElse(AnalyzerContext.Empty()),
            _ => AnalyzerContext.Empty()
        };

    IEnumerable<IAnalyzer<IMetric>> analyzersAlreadyRan = resultComputedPreviously.MetricMap.Keys.AsEnumerable();
    IEnumerable<IAnalyzer<IMetric>> analyzersToRun = enumerable.Except(analyzersAlreadyRan);

    // Split the analyzers whose schema preconditions hold from those that fail them;
    // the failures are turned into failure metrics instead of being run.
    IEnumerable<IAnalyzer<IMetric>> passedAnalyzers = analyzersToRun
        .Where(analyzer => !FindFirstFailing(data.Schema(), analyzer.Preconditions()).HasValue);
    IEnumerable<IAnalyzer<IMetric>> failedAnalyzers = analyzersToRun.Except(passedAnalyzers);
    AnalyzerContext preconditionFailures = ComputePreconditionFailureMetrics(failedAnalyzers, data.Schema());

    // Grouping analyzers need grouped data; the rest are computed in a single scan.
    IEnumerable<IGroupingAnalyzer<IMetric>> groupingAnalyzers = passedAnalyzers.OfType<IGroupingAnalyzer<IMetric>>();
    IEnumerable<IAnalyzer<IMetric>> allScanningAnalyzers = passedAnalyzers.Except(groupingAnalyzers);

    AnalyzerContext nonGroupedMetrics = RunScanningAnalyzers(data, allScanningAnalyzers, aggregateWith, saveStatesWith);

    Option<double> numRowsOfData = nonGroupedMetrics.Metric(Initializers.Size()).Select(metric =>
    {
        if (metric is DoubleMetric dm)
        {
            return dm.Value.Success.Value;
        }

        return 0;
    });

    AnalyzerContext groupedMetrics = AnalyzerContext.Empty();

    // Group the grouping analyzers by their (sorted) grouping columns and filter condition,
    // so each distinct grouping of the data is computed only once.
    IEnumerable<IGrouping<(IOrderedEnumerable<string>, Option<string>), IGroupingAnalyzer<IMetric>>>
        sortedAndFilteredGroupingAnalyzers = groupingAnalyzers
            .GroupBy(analyzer => (analyzer.GroupingColumns().OrderBy(columnName => columnName),
                GetFilterCondition(analyzer)));

    foreach (IGrouping<(IOrderedEnumerable<string>, Option<string>), IGroupingAnalyzer<IMetric>> analyzerGroup
        in sortedAndFilteredGroupingAnalyzers)
    {
        (long numRows, AnalyzerContext metrics) = RunGroupingAnalyzers(data,
            analyzerGroup.Key.Item1.ToList(), analyzerGroup.Key.Item2, analyzerGroup,
            aggregateWith, saveStatesWith, storageLevelOfGroupedDataForMultiplePasses, numRowsOfData);

        groupedMetrics += metrics;

        if (!numRowsOfData.HasValue)
        {
            numRowsOfData = new Option<double>(numRows);
        }
    }

    AnalyzerContext resultingAnalyzerContext =
        resultComputedPreviously + preconditionFailures + nonGroupedMetrics + groupedMetrics;

    //TODO: add kllMetrics
    if (metricsRepositoryOptions != null)
    {
        SaveOrAppendResultsIfNecessary(resultingAnalyzerContext,
            metricsRepositoryOptions.metricRepository,
            metricsRepositoryOptions.saveOrAppendResultsWithKey);
    }

    SaveJsonOutputsToFilesystemIfNecessary(fileOutputOptions, resultingAnalyzerContext);

    return resultingAnalyzerContext;
}
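// Hypothetical usage sketch (not part of the original source): a minimal call to
// DoAnalysisRun with no state loading or persisting and default repository/file options.
// StorageLevel.MEMORY_AND_DISK is assumed to be available from the Spark bindings.
private static AnalyzerContext ExampleDoAnalysisRun(DataFrame data)
{
    IEnumerable<IAnalyzer<IMetric>> analyzers = new IAnalyzer<IMetric>[]
    {
        Initializers.Size(),
        Initializers.Mean("sales")
    };

    return DoAnalysisRun(
        data,
        analyzers,
        Option<IStateLoader>.None,
        Option<IStatePersister>.None,
        StorageLevel.MEMORY_AND_DISK);
}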
private static Analysis CreateAnalysis() =>
    new Analysis()
        .AddAnalyzer(Initializers.Size(Option<string>.None))
        .AddAnalyzer(Initializers.Distinctness(new[] { "item" }, Option<string>.None))
        .AddAnalyzer(Initializers.Completeness("att1"))
        .AddAnalyzer(Initializers.Uniqueness(new[] { "att1", "att2" }));
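// Hypothetical usage sketch (not part of the original source): feed the analyzers built
// by CreateAnalysis into the runner method shown above. Assumes Analysis exposes its
// analyzers via an Analyzers property; if it does not, pass the same Initializers directly.
private static AnalyzerContext ExampleRunCreatedAnalysis(DataFrame data)
{
    Analysis analysis = CreateAnalysis();

    return DoAnalysisRun(
        data,
        analysis.Analyzers, // assumed property name
        Option<IStateLoader>.None,
        Option<IStatePersister>.None,
        StorageLevel.MEMORY_AND_DISK);
}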