/// <summary>
/// Runs the given analyzers against <paramref name="data"/> and returns the combined metrics.
/// </summary>
/// <remarks>
/// The run proceeds as follows:
/// <list type="number">
/// <item><description>Results stored under <c>reuseExistingResultsForKey</c> are loaded from the
/// metrics repository (when both are set) and the analyzers they cover are skipped.</description></item>
/// <item><description>Analyzers with a failing precondition (see <c>FindFirstFailing</c>) are not run;
/// their entries come from <c>ComputePreconditionFailureMetrics</c>.</description></item>
/// <item><description>Analyzers that are not <see cref="IGroupingAnalyzer{IMetric}"/> are handed to
/// <c>RunScanningAnalyzers</c>.</description></item>
/// <item><description>Grouping analyzers are bucketed by their sorted grouping columns and filter
/// condition, and each bucket is handed to <c>RunGroupingAnalyzers</c>.</description></item>
/// <item><description>All partial results are added together, passed to
/// <c>SaveOrAppendResultsIfNecessary</c> (when repository options were supplied) and to
/// <c>SaveJsonOutputsToFilesystemIfNecessary</c>.</description></item>
/// </list>
/// </remarks>
/// <param name="data">The data frame to analyze.</param>
/// <param name="analyzers">The analyzers to run. Enumerated exactly once.</param>
/// <param name="aggregateWith">Forwarded unchanged to the scanning and grouping runners.</param>
/// <param name="saveStatesWith">Forwarded unchanged to the scanning and grouping runners.</param>
/// <param name="storageLevelOfGroupedDataForMultiplePasses">Forwarded unchanged to the grouping runner.</param>
/// <param name="metricsRepositoryOptions">Optional repository settings used to reuse and to store results.</param>
/// <param name="fileOutputOptions">Optional settings forwarded to <c>SaveJsonOutputsToFilesystemIfNecessary</c>.</param>
/// <returns>
/// The union of previously computed results, precondition failures, scanning metrics and grouping metrics,
/// or an empty context when <paramref name="analyzers"/> is empty.
/// </returns>
public static AnalyzerContext DoAnalysisRun(
    DataFrame data,
    IEnumerable<IAnalyzer<IMetric>> analyzers,
    Option<IStateLoader> aggregateWith,
    Option<IStatePersister> saveStatesWith,
    StorageLevel storageLevelOfGroupedDataForMultiplePasses,
    AnalysisRunnerRepositoryOptions metricsRepositoryOptions = default,
    AnalysisRunnerFileOutputOptions fileOutputOptions = default)
{
    // Snapshot the input once so a lazily evaluated sequence is not enumerated repeatedly.
    IAnalyzer<IMetric>[] requestedAnalyzers = analyzers.ToArray();
    if (requestedAnalyzers.Length == 0)
    {
        return AnalyzerContext.Empty();
    }

    // Reuse stored results only when both a repository and a lookup key are configured.
    AnalyzerContext resultComputedPreviously =
        (metricsRepositoryOptions?.metricRepository.HasValue,
            metricsRepositoryOptions?.reuseExistingResultsForKey.HasValue) switch
        {
            (true, true) => metricsRepositoryOptions?.metricRepository.Value
                .LoadByKey(metricsRepositoryOptions.reuseExistingResultsForKey.Value)
                .GetOrElse(AnalyzerContext.Empty()),
            _ => AnalyzerContext.Empty()
        };

    IEnumerable<IAnalyzer<IMetric>> analyzersAlreadyRan =
        resultComputedPreviously.MetricMap.Keys.AsEnumerable();

    // Except() also removes duplicates, so every remaining analyzer appears once.
    // The sequences below are materialized because each of them is consumed more than once;
    // leaving them lazy would re-check the preconditions of every analyzer on each pass.
    IEnumerable<IAnalyzer<IMetric>> analyzersToRun =
        requestedAnalyzers.Except(analyzersAlreadyRan).ToList();

    var schema = data.Schema();

    IEnumerable<IAnalyzer<IMetric>> passedAnalyzers = analyzersToRun
        .Where(analyzer => !FindFirstFailing(schema, analyzer.Preconditions()).HasValue)
        .ToList();

    IEnumerable<IAnalyzer<IMetric>> failedAnalyzers = analyzersToRun.Except(passedAnalyzers).ToList();

    AnalyzerContext preconditionFailures = ComputePreconditionFailureMetrics(failedAnalyzers, schema);

    IEnumerable<IGroupingAnalyzer<IMetric>> groupingAnalyzers =
        passedAnalyzers.OfType<IGroupingAnalyzer<IMetric>>().ToList();

    IEnumerable<IAnalyzer<IMetric>> allScanningAnalyzers = passedAnalyzers.Except(groupingAnalyzers);

    AnalyzerContext nonGroupedMetrics =
        RunScanningAnalyzers(data, allScanningAnalyzers, aggregateWith, saveStatesWith);

    // Row count taken from the Size metric when it was computed; a metric that is not a
    // DoubleMetric yields 0.
    Option<double> numRowsOfData = nonGroupedMetrics.Metric(Initializers.Size()).Select(metric =>
    {
        if (metric is DoubleMetric dm)
        {
            return dm.Value.Success.Value;
        }

        return 0;
    });

    AnalyzerContext groupedMetrics = AnalyzerContext.Empty();

    // The grouping key must compare by value. An IOrderedEnumerable<string> inside the key tuple
    // compares by reference, which puts every analyzer into its own group even when the grouping
    // columns are identical. Length-prefixing each name keeps the string key unambiguous
    // regardless of the characters a column name contains.
    string BuildColumnsKey(IEnumerable<string> sortedColumns) =>
        string.Concat(sortedColumns.Select(columnName => $"{columnName.Length}:{columnName}"));

    IEnumerable<IGrouping<(string, Option<string>), IGroupingAnalyzer<IMetric>>> sortedAndFilteredGroupingAnalyzers =
        groupingAnalyzers.GroupBy(analyzer =>
            (BuildColumnsKey(analyzer.GroupingColumns().OrderBy(columnName => columnName)),
                GetFilterCondition(analyzer)));

    foreach (IGrouping<(string, Option<string>), IGroupingAnalyzer<IMetric>> analyzerGroup
        in sortedAndFilteredGroupingAnalyzers)
    {
        // Every analyzer in the group has the same sorted columns, so the first one is representative.
        List<string> groupingColumns = analyzerGroup.First()
            .GroupingColumns()
            .OrderBy(columnName => columnName)
            .ToList();

        (long numRows, AnalyzerContext metrics) = RunGroupingAnalyzers(
            data,
            groupingColumns,
            analyzerGroup.Key.Item2,
            analyzerGroup,
            aggregateWith,
            saveStatesWith,
            storageLevelOfGroupedDataForMultiplePasses,
            numRowsOfData);

        groupedMetrics += metrics;

        // Remember the row count reported by the first grouping pass so later passes receive it.
        if (!numRowsOfData.HasValue)
        {
            numRowsOfData = new Option<double>(numRows);
        }
    }

    AnalyzerContext resultingAnalyzerContext =
        resultComputedPreviously + preconditionFailures + nonGroupedMetrics + groupedMetrics;
    //TODO: add kllMetrics

    if (metricsRepositoryOptions != null)
    {
        SaveOrAppendResultsIfNecessary(
            resultingAnalyzerContext,
            metricsRepositoryOptions.metricRepository,
            metricsRepositoryOptions.saveOrAppendResultsWithKey);
    }

    SaveJsonOutputsToFilesystemIfNecessary(fileOutputOptions, resultingAnalyzerContext);

    return resultingAnalyzerContext;
}
/// <summary>
/// Placeholder for writing the analysis results as JSON files; currently does nothing.
/// </summary>
/// <param name="fileOutputOptions">File output settings supplied by the caller (unused for now).</param>
/// <param name="analyzerContext">The analysis results that would be written (unused for now).</param>
private static void SaveJsonOutputsToFilesystemIfNecessary(
    AnalysisRunnerFileOutputOptions fileOutputOptions,
    AnalyzerContext analyzerContext)
{
    //TODO implement this part
}