/// <summary>
/// Computes the frequency-based grouping state for one set of grouping columns, merges it
/// with any previously persisted state, and evaluates the given grouping analyzers on it.
/// </summary>
/// <param name="dataFrame">Data to compute the column-group frequencies from.</param>
/// <param name="groupingColumns">Columns the analyzers group by.</param>
/// <param name="filterConditions">Optional row filter applied before counting frequencies.</param>
/// <param name="analyzers">Grouping analyzers that all share these grouping columns.</param>
/// <param name="aggregateWith">Optional loader for a previously saved state to merge in.</param>
/// <param name="saveStateTo">Optional persister handed through to the per-grouping run.</param>
/// <param name="storageLevelOfGroupedDataForMultiplePasses">Currently unused here; intended caching level.</param>
/// <param name="numRowsOfData">Currently unused here.</param>
/// <returns>Tuple of the (possibly merged) row count and the computed metrics.</returns>
private static (long, AnalyzerContext) RunGroupingAnalyzers(
    DataFrame dataFrame,
    IEnumerable<string> groupingColumns,
    Option<string> filterConditions,
    IEnumerable<IGroupingAnalyzer<IMetric>> analyzers,
    Option<IStateLoader> aggregateWith,
    Option<IStatePersister> saveStateTo,
    StorageLevel storageLevelOfGroupedDataForMultiplePasses,
    Option<double> numRowsOfData)
{
    FrequenciesAndNumRows frequencies =
        FrequencyBasedAnalyzer.ComputeFrequencies(dataFrame, groupingColumns, filterConditions);

    // Any analyzer of the group serves as the key for loading previously saved state.
    // NOTE(review): the `as` cast yields null when the first analyzer is not an
    // Analyzer<FrequenciesAndNumRows, IMetric>; Load would then receive null — confirm intended.
    Analyzer<FrequenciesAndNumRows, IMetric> stateKeyAnalyzer =
        analyzers.First() as Analyzer<FrequenciesAndNumRows, IMetric>;

    Option<FrequenciesAndNumRows> priorState = aggregateWith
        .Select(stateLoader => stateLoader.Load<FrequenciesAndNumRows>(stateKeyAnalyzer))
        .GetOrElse(Option<FrequenciesAndNumRows>.None);

    // Merge the freshly computed frequencies with the loaded state, if any.
    if (priorState.HasValue)
    {
        frequencies = (FrequenciesAndNumRows)frequencies.Sum(priorState.Value);
    }

    AnalyzerContext metrics = RunAnalyzersForParticularGrouping(frequencies, analyzers, saveStateTo);

    return (frequencies.NumRows, metrics);
}
/// <summary>
/// Evaluates a set of grouping analyzers against an already-computed frequencies state.
/// Scan-shareable analyzers are folded into a single Spark aggregation pass; the remaining
/// analyzers each compute their metric directly from the state. Analyzer failures are
/// converted into failure metrics rather than propagated.
/// </summary>
/// <param name="frequenciesAndNumRows">Grouped frequencies plus total row count.</param>
/// <param name="analyzers">Grouping analyzers to evaluate; must be non-empty.</param>
/// <param name="saveStatesTo">Optional persister for the (merged) state.</param>
/// <returns>AnalyzerContext mapping every analyzer to its success or failure metric.</returns>
private static AnalyzerContext RunAnalyzersForParticularGrouping(
    FrequenciesAndNumRows frequenciesAndNumRows,
    IEnumerable<IGroupingAnalyzer<IMetric>> analyzers,
    Option<IStatePersister> saveStatesTo)
{
    long numRows = frequenciesAndNumRows.NumRows;

    // Materialize once: both sequences are enumerated several times below, and
    // Except would otherwise re-run the OfType filter on every pass.
    List<ScanShareableFrequencyBasedAnalyzer> shareable =
        analyzers.OfType<ScanShareableFrequencyBasedAnalyzer>().ToList();
    List<IGroupingAnalyzer<IMetric>> others = analyzers.Except(shareable).ToList();

    // BUG FIX: cache the grouped data when the non-shareable analyzers will make
    // additional passes over it. The previous check was inverted (`!others.Any()`),
    // persisting exactly when no further pass would happen.
    if (others.Any())
    {
        frequenciesAndNumRows.Frequencies.Persist(); // TODO: storageLevelOfGroupedDataForMultiplePasses
    }

    IEnumerable<KeyValuePair<IAnalyzer<IMetric>, IMetric>> metricsByAnalyzer;
    if (!shareable.Any())
    {
        metricsByAnalyzer = new List<KeyValuePair<IAnalyzer<IMetric>, IMetric>>();
    }
    else
    {
        try
        {
            IEnumerable<Column> aggregations = shareable
                .SelectMany(analyzer => analyzer.AggregationFunctions(numRows));

            // offsets[k] is the index of analyzer k's first aggregation result within
            // the collected row (running sum of aggregation counts, 0-prefixed; the
            // trailing total is dropped by Zip below).
            int runningOffset = 0;
            List<int> offsets = shareable.Select(analyzer =>
            {
                runningOffset += analyzer.AggregationFunctions(numRows).Count();
                return runningOffset;
            }).ToList();
            offsets.Insert(0, 0);

            // Single aggregation pass over the grouped frequencies.
            Row results = frequenciesAndNumRows.Frequencies
                .Agg(aggregations.FirstOrDefault(), aggregations.Skip(1).ToArray())
                .Collect()
                .First();

            // BUG FIX: materialize inside the try. The Select was previously deferred, so
            // exceptions from SuccessOfFailureMetricFrom surfaced later — during Dictionary
            // construction, outside this catch — and the failure-metric fallback never ran.
            metricsByAnalyzer = shareable
                .Zip(offsets, (analyzer, offset) => (analyzer, offset))
                .Select(pair => new KeyValuePair<IAnalyzer<IMetric>, IMetric>(
                    pair.analyzer,
                    SuccessOfFailureMetricFrom(pair.analyzer, results, pair.offset)))
                .ToList();
        }
        catch (Exception e)
        {
            metricsByAnalyzer = shareable.Select(analyzer =>
                new KeyValuePair<IAnalyzer<IMetric>, IMetric>(analyzer, analyzer.ToFailureMetric(e)));
        }
    }

    IEnumerable<KeyValuePair<IAnalyzer<IMetric>, IMetric>> otherMetrics;
    try
    {
        // BUG FIX: materialize inside the try (same deferred-execution issue as above —
        // the (FrequencyBasedAnalyzer) cast and ComputeMetricFrom would otherwise throw
        // outside this catch).
        otherMetrics = others
            .Select(analyzer => (FrequencyBasedAnalyzer)analyzer)
            .Select(analyzer => new KeyValuePair<IAnalyzer<IMetric>, IMetric>(
                analyzer,
                analyzer.ComputeMetricFrom(new Option<FrequenciesAndNumRows>(frequenciesAndNumRows))))
            .ToList();
    }
    catch (Exception e)
    {
        otherMetrics = others.Select(analyzer =>
            new KeyValuePair<IAnalyzer<IMetric>, IMetric>(analyzer, analyzer.ToFailureMetric(e)));
    }

    // Persist the state (keyed by any analyzer of the group) if requested, then release the cache.
    saveStatesTo.Select(statePersister => statePersister.Persist(
        new Option<IAnalyzer<IMetric>>(analyzers.First()), frequenciesAndNumRows));
    frequenciesAndNumRows.Frequencies.Unpersist();

    return new AnalyzerContext(
        new Dictionary<IAnalyzer<IMetric>, IMetric>(metricsByAnalyzer.Concat(otherMetrics)));
}
/// <summary>
/// Compute the metrics from the analyzers configured in the analysis. Instead of running
/// directly on data, this computation leverages (and aggregates) existing states which have
/// previously been computed on the data.
/// </summary>
/// <param name="schema">Schema of the data frame from which the states were computed.</param>
/// <param name="analysis">The analysis to compute.</param>
/// <param name="stateLoaders">Loaders from which we retrieve the states to aggregate.</param>
/// <param name="saveStatesWith">Persist resulting states for the configured analyzers (optional).</param>
/// <param name="metricsRepository">Repository to save or append the resulting metrics to (optional).</param>
/// <param name="saveOrAppendResultsWithKey">Key under which the results are stored in the repository (optional).</param>
/// <param name="storageLevelOfGroupedDataForMultiplePasses">Caching level for grouped data that must be
/// accessed multiple times (use StorageLevel.NONE to completely disable caching).</param>
/// <returns>AnalyzerContext holding the requested metrics per analyzer.</returns>
public static AnalyzerContext RunOnAggregatedStates(
    StructType schema,
    Analysis analysis,
    IEnumerable<IStateLoader> stateLoaders,
    Option<IStatePersister> saveStatesWith = default,
    Option<IMetricsRepository> metricsRepository = default,
    Option<ResultKey> saveOrAppendResultsWithKey = default,
    StorageLevel storageLevelOfGroupedDataForMultiplePasses = StorageLevel.MEMORY_AND_DISK)
{
    if (analysis.Analyzers == null || stateLoaders == null)
    {
        return AnalyzerContext.Empty();
    }

    IEnumerable<IAnalyzer<IMetric>> analyzers = analysis.Analyzers;

    /* Find all analyzers which violate their preconditions */
    // Materialized once so the precondition checks don't re-run on every enumeration below.
    IEnumerable<IAnalyzer<IMetric>> passedAnalyzers = analyzers
        .Where(analyzer => !FindFirstFailing(schema, analyzer.Preconditions()).HasValue)
        .ToList();

    IEnumerable<IAnalyzer<IMetric>> failedAnalyzers = analyzers.Except(passedAnalyzers);

    /* Create the failure metrics from the precondition violations */
    AnalyzerContext preconditionFailures = ComputePreconditionFailureMetrics(failedAnalyzers, schema);

    // Fold every loader's state for each analyzer into one in-memory provider.
    InMemoryStateProvider aggregatedStates = new InMemoryStateProvider();

    foreach (IAnalyzer<IMetric> analyzer in passedAnalyzers)
    {
        foreach (IStateLoader state in stateLoaders)
        {
            analyzer.AggregateStateTo(aggregatedStates, state, aggregatedStates);
        }
    }

    IEnumerable<IGroupingAnalyzer<IMetric>> groupingAnalyzers =
        passedAnalyzers.OfType<IGroupingAnalyzer<IMetric>>().ToList();
    IEnumerable<IAnalyzer<IMetric>> scanningAnalyzers = passedAnalyzers.Except(groupingAnalyzers);

    // Non-grouping analyzers: compute each metric straight from the aggregated state,
    // optionally copying the state out to the persister.
    Dictionary<IAnalyzer<IMetric>, IMetric> nonGroupedResults =
        new Dictionary<IAnalyzer<IMetric>, IMetric>(
            scanningAnalyzers.SelectMany(analyzer =>
            {
                IMetric metrics = analyzer.LoadStateAndComputeMetric(aggregatedStates);

                if (saveStatesWith.HasValue)
                {
                    analyzer.CopyStateTo(aggregatedStates, saveStatesWith.Value);
                }

                return new[] { new KeyValuePair<IAnalyzer<IMetric>, IMetric>(analyzer, metrics) };
            }));

    AnalyzerContext groupedResults;
    if (!groupingAnalyzers.Any())
    {
        groupedResults = AnalyzerContext.Empty();
    }
    else
    {
        // BUG FIX: the original pipeline contained
        //     .Select(analyzer => (IGroupingAnalyzer<IMetric>)analyzers)
        // which cast the whole analyzer COLLECTION (not the element) to a single analyzer —
        // a guaranteed runtime InvalidCastException. No cast is needed at all, since
        // OfType already yields IGroupingAnalyzer<IMetric>.
        // NOTE(review): the GroupBy key is an IEnumerable<string> compared by reference,
        // so analyzers sharing grouping columns likely land in separate groups; confirm
        // whether a structural key (e.g. string.Join over the sorted columns) is intended.
        groupedResults = groupingAnalyzers
            .GroupBy(analyzer => analyzer.GroupingColumns().OrderBy(columnName => columnName))
            .Select(analyzersForGrouping =>
            {
                FrequenciesAndNumRows state =
                    FindStateForParticularGrouping(analyzersForGrouping, aggregatedStates);
                return RunAnalyzersForParticularGrouping(state, analyzersForGrouping, saveStatesWith);
            })
            .Aggregate((x, y) => x + y);
    }

    AnalyzerContext results =
        preconditionFailures + new AnalyzerContext(nonGroupedResults) + groupedResults;

    SaveOrAppendResultsIfNecessary(results, metricsRepository, saveOrAppendResultsWithKey);

    return results;
}