Example #1
0
        /// <summary>
        /// Merges <paramref name="resultingAnalyzerContext"/> into the context the repository returns for
        /// the given key and saves the merged context back. All work happens inside the
        /// <c>OnSuccess</c> callbacks of both options.
        /// </summary>
        /// <param name="resultingAnalyzerContext">Freshly computed metrics to merge in.</param>
        /// <param name="metricsRepository">Optional repository to load from and save to.</param>
        /// <param name="saveOrAppendResultsWithKey">Optional key under which the merged context is saved.</param>
        private static void SaveOrAppendResultsIfNecessary(
            AnalyzerContext resultingAnalyzerContext,
            Option <IMetricsRepository> metricsRepository,
            Option <ResultKey> saveOrAppendResultsWithKey)
        {
            metricsRepository.OnSuccess(repository =>
            {
                saveOrAppendResultsWithKey.OnSuccess(key =>
                {
                    AnalyzerContext merged = repository.LoadByKey(key).GetOrElse(AnalyzerContext.Empty());

                    // Stored analyzers are matched through their string representation.
                    Dictionary<string, IAnalyzer<IMetric>> storedKeysByName =
                        merged.MetricMap.ToDictionary(entry => entry.Key.ToString(), entry => entry.Key);

                    // Iterate over a snapshot so that the merged map can be modified while looping.
                    foreach (var fresh in resultingAnalyzerContext.MetricMap.ToList())
                    {
                        string name = fresh.Key.ToString();

                        if (storedKeysByName.TryGetValue(name, out IAnalyzer<IMetric> storedKey) &&
                            merged.MetricMap.ContainsKey(storedKey))
                        {
                            // Same analyzer already stored: overwrite its metric.
                            merged.MetricMap[storedKey] = fresh.Value;
                        }
                        else
                        {
                            merged.MetricMap.Add(fresh.Key, fresh.Value);
                        }
                    }

                    repository.Save(saveOrAppendResultsWithKey.Value, merged);
                });
            });
        }
Example #2
0
        /// <summary>
        /// Computes the metrics of the given analyzers on <paramref name="data"/>. Analyzers whose result is
        /// already present in the context loaded from the repository are skipped, analyzers that fail a
        /// precondition yield failure metrics, and the remaining ones are split into scanning and grouping
        /// analyzers. The combined context is handed to <c>SaveOrAppendResultsIfNecessary</c> and
        /// <c>SaveJsonOutputsToFilesystemIfNecessary</c> before it is returned.
        /// </summary>
        /// <param name="data">Data frame to analyze.</param>
        /// <param name="analyzers">Analyzers to run; an empty sequence yields an empty context.</param>
        /// <param name="aggregateWith">Optional state loader forwarded to the analyzer runs.</param>
        /// <param name="saveStatesWith">Optional state persister forwarded to the analyzer runs.</param>
        /// <param name="storageLevelOfGroupedDataForMultiplePasses">Storage level forwarded to the grouping runs.</param>
        /// <param name="metricsRepositoryOptions">Optional repository settings for reusing and saving results.</param>
        /// <param name="fileOutputOptions">Optional settings forwarded to the JSON file output step.</param>
        /// <returns>Previously computed, precondition-failure, scanning and grouping metrics combined.</returns>
        public static AnalyzerContext DoAnalysisRun
        (
            DataFrame data,
            IEnumerable <IAnalyzer <IMetric> > analyzers,
            Option <IStateLoader> aggregateWith,
            Option <IStatePersister> saveStatesWith,
            StorageLevel storageLevelOfGroupedDataForMultiplePasses,
            AnalysisRunnerRepositoryOptions metricsRepositoryOptions = default,
            AnalysisRunnerFileOutputOptions fileOutputOptions        = default)
        {
            // Materialize once: the input sequence is consulted more than once below.
            IAnalyzer<IMetric>[] requestedAnalyzers = analyzers.ToArray();

            if (requestedAnalyzers.Length == 0)
            {
                return AnalyzerContext.Empty();
            }

            // Reuse stored results only when both a repository and a reuse key are configured.
            AnalyzerContext resultComputedPreviously = (metricsRepositoryOptions?.metricRepository.HasValue,
                                                        metricsRepositoryOptions?.reuseExistingResultsForKey.HasValue) switch
            {
                (true, true) => metricsRepositoryOptions.metricRepository.Value
                    .LoadByKey(metricsRepositoryOptions.reuseExistingResultsForKey.Value)
                    .GetOrElse(AnalyzerContext.Empty()),
                _ => AnalyzerContext.Empty()
            };

            IEnumerable<IAnalyzer<IMetric>> analyzersAlreadyRan =
                resultComputedPreviously.MetricMap.Keys.AsEnumerable();

            // Except also removes duplicates, so no separate Distinct pass is needed.
            IEnumerable<IAnalyzer<IMetric>> analyzersToRun =
                requestedAnalyzers.Except(analyzersAlreadyRan).ToList();

            // Evaluate the preconditions exactly once per analyzer; the partitions below are
            // materialized so that later passes do not re-run the checks.
            var schema = data.Schema();

            IEnumerable<IAnalyzer<IMetric>> passedAnalyzers = analyzersToRun
                .Where(analyzer => !FindFirstFailing(schema, analyzer.Preconditions()).HasValue)
                .ToList();

            IEnumerable<IAnalyzer<IMetric>> failedAnalyzers = analyzersToRun.Except(passedAnalyzers).ToList();

            AnalyzerContext preconditionFailures = ComputePreconditionFailureMetrics(failedAnalyzers, schema);

            IEnumerable<IGroupingAnalyzer<IMetric>> groupingAnalyzers =
                passedAnalyzers.OfType<IGroupingAnalyzer<IMetric>>().ToList();

            IEnumerable<IAnalyzer<IMetric>> allScanningAnalyzers =
                passedAnalyzers.Except(groupingAnalyzers).ToList();

            AnalyzerContext nonGroupedMetrics =
                RunScanningAnalyzers(data, allScanningAnalyzers, aggregateWith, saveStatesWith);

            // Row count taken from the Size metric when the scanning run produced one.
            Option<double> numRowsOfData = nonGroupedMetrics.Metric(Initializers.Size()).Select(metric =>
            {
                if (metric is DoubleMetric dm)
                {
                    return dm.Value.Success.Value;
                }

                return 0;
            });

            AnalyzerContext groupedMetrics = AnalyzerContext.Empty();

            // NOTE(review): the first key component is an IOrderedEnumerable<string>, which GroupBy
            // compares with default (reference) equality, so analyzers with identical grouping columns
            // presumably still land in separate groups — confirm whether a sequence-aware key is wanted.
            IEnumerable<IGrouping<(IOrderedEnumerable<string>, Option<string>), IGroupingAnalyzer<IMetric>>>
                sortedAndFilteredGroupingAnalyzers = groupingAnalyzers
                    .GroupBy(analyzer => (analyzer.GroupingColumns().OrderBy(columnName => columnName),
                                          GetFilterCondition(analyzer)));

            foreach (IGrouping<(IOrderedEnumerable<string>, Option<string>), IGroupingAnalyzer<IMetric>>
                     analyzerGroup in sortedAndFilteredGroupingAnalyzers)
            {
                (long numRows, AnalyzerContext metrics) =
                    RunGroupingAnalyzers(data,
                                         analyzerGroup.Key.Item1.ToList(),
                                         analyzerGroup.Key.Item2, analyzerGroup, aggregateWith, saveStatesWith,
                                         storageLevelOfGroupedDataForMultiplePasses, numRowsOfData);

                groupedMetrics += metrics;

                // Remember the row count from the first grouping run when no Size metric supplied it.
                if (!numRowsOfData.HasValue)
                {
                    numRowsOfData = new Option<double>(numRows);
                }
            }

            AnalyzerContext resultingAnalyzerContext =
                resultComputedPreviously + preconditionFailures + nonGroupedMetrics +
                groupedMetrics; //TODO: add kllMetrics

            if (metricsRepositoryOptions != null)
            {
                SaveOrAppendResultsIfNecessary(resultingAnalyzerContext,
                                               metricsRepositoryOptions.metricRepository,
                                               metricsRepositoryOptions.saveOrAppendResultsWithKey);
            }

            SaveJsonOutputsToFilesystemIfNecessary(fileOutputOptions, resultingAnalyzerContext);

            return resultingAnalyzerContext;
        }
Example #3
0
        /// <summary>
        /// Runs the given analyzers against <paramref name="dataFrame"/>. Analyzers that are
        /// <c>IScanSharableAnalyzer</c> instances contribute their aggregation columns to a single
        /// <c>Agg</c> call; every other analyzer is computed through its own <c>Calculate</c> call.
        /// </summary>
        /// <param name="dataFrame">Data frame to aggregate over.</param>
        /// <param name="analyzers">Analyzers to run.</param>
        /// <param name="aggregateWith">Optional state loader forwarded to the metric computation.</param>
        /// <param name="saveStateTo">Optional state persister forwarded to the metric computation.</param>
        /// <returns>The metrics of the shared scan combined with the individually computed metrics.</returns>
        private static AnalyzerContext RunScanningAnalyzers(DataFrame dataFrame,
                                                            IEnumerable <IAnalyzer <IMetric> > analyzers,
                                                            Option <IStateLoader> aggregateWith,
                                                            Option <IStatePersister> saveStateTo
                                                            )
        {
            // Materialize once: this sequence is walked several times below.
            IEnumerable<IScanSharableAnalyzer<IState, IMetric>> sharable =
                analyzers.OfType<IScanSharableAnalyzer<IState, IMetric>>().ToList();
            IEnumerable<IAnalyzer<IMetric>> others = analyzers.Except(sharable);

            AnalyzerContext sharedResults;

            if (sharable.Any())
            {
                IEnumerable<KeyValuePair<IAnalyzer<IMetric>, IMetric>> metricsByAnalyzer;

                try
                {
                    // Ask every analyzer for its aggregation columns exactly once.
                    List<Column[]> columnsPerAnalyzer = sharable
                        .Select(analyzer => analyzer.AggregationFunctions().ToArray())
                        .ToList();

                    Column[] aggregations = columnsPerAnalyzer.SelectMany(columns => columns).ToArray();

                    // offsets[k] is the position of analyzer k's first column within the Agg call,
                    // i.e. the number of columns contributed by the analyzers before it.
                    List<int> offsets = new List<int>(columnsPerAnalyzer.Count);
                    int nextOffset = 0;

                    foreach (Column[] columns in columnsPerAnalyzer)
                    {
                        offsets.Add(nextOffset);
                        nextOffset += columns.Length;
                    }

                    Row results = dataFrame
                        .Agg(aggregations.FirstOrDefault(), aggregations.Skip(1).ToArray())
                        .Collect()
                        .First();

                    // ToList forces the metric computation to happen inside this try block; a deferred
                    // query would only run later, where the catch below can no longer handle failures.
                    metricsByAnalyzer = sharable
                        .Select((analyzer, index) => new KeyValuePair<IAnalyzer<IMetric>, IMetric>(
                                    analyzer,
                                    SuccessOfFailureMetricFrom(analyzer, results, offsets[index],
                                                               aggregateWith, saveStateTo)))
                        .ToList();
                }
                catch (Exception e)
                {
                    // Any failure of the shared scan is reported as a failure metric for every shared analyzer.
                    metricsByAnalyzer = sharable
                        .Select(analyzer =>
                                new KeyValuePair<IAnalyzer<IMetric>, IMetric>(analyzer, analyzer.ToFailureMetric(e)))
                        .ToList();
                }

                Dictionary<IAnalyzer<IMetric>, IMetric> metricsByAnalyzerDict =
                    new Dictionary<IAnalyzer<IMetric>, IMetric>(metricsByAnalyzer);
                sharedResults = new AnalyzerContext(metricsByAnalyzerDict);
            }
            else
            {
                sharedResults = AnalyzerContext.Empty();
            }

            Dictionary<IAnalyzer<IMetric>, IMetric> otherMetrics = new Dictionary<IAnalyzer<IMetric>, IMetric>(
                others.Select(analyzer =>
                              new KeyValuePair<IAnalyzer<IMetric>, IMetric>(
                                  analyzer,
                                  analyzer.Calculate(dataFrame, aggregateWith, saveStateTo))));

            return sharedResults + new AnalyzerContext(otherMetrics);
        }
Example #4
0
        /// <summary>
        /// Computes the metrics of the analyzers configured in the analysis. Instead of running directly
        /// on data, this computation leverages (and aggregates) existing states which have previously
        /// been computed on the data.
        /// </summary>
        /// <param name="schema">Schema of the data frame from which the states were computed.</param>
        /// <param name="analysis">The analysis to compute.</param>
        /// <param name="stateLoaders">Loaders from which the states to aggregate are retrieved.</param>
        /// <param name="saveStatesWith">Persists the resulting states for the configured analyzers (optional).</param>
        /// <param name="metricsRepository">Repository forwarded to <c>SaveOrAppendResultsIfNecessary</c> (optional).</param>
        /// <param name="saveOrAppendResultsWithKey">Key forwarded to <c>SaveOrAppendResultsIfNecessary</c> (optional).</param>
        /// <param name="storageLevelOfGroupedDataForMultiplePasses">
        /// Caching level for grouped data that must be accessed multiple times (use StorageLevel.NONE to
        /// completely disable caching). NOTE(review): this parameter is not referenced in the method body.
        /// </param>
        /// <returns>AnalyzerContext holding the requested metrics per analyzer.</returns>
        public static AnalyzerContext RunOnAggregatedStates(
            StructType schema,
            Analysis analysis,
            IEnumerable <IStateLoader> stateLoaders,
            Option <IStatePersister> saveStatesWith                 = default,
            Option <IMetricsRepository> metricsRepository           = default,
            Option <ResultKey> saveOrAppendResultsWithKey           = default,
            StorageLevel storageLevelOfGroupedDataForMultiplePasses = StorageLevel.MEMORY_AND_DISK)
        {
            if (analysis.Analyzers == null || stateLoaders == null)
            {
                return AnalyzerContext.Empty();
            }

            IEnumerable<IAnalyzer<IMetric>> analyzers = analysis.Analyzers;

            /* Find all analyzers which violate their preconditions. The result is materialized so the
             * checks run once, not on every later pass over the sequence. */
            IEnumerable<IAnalyzer<IMetric>> passedAnalyzers = analyzers
                .Where(analyzer => !FindFirstFailing(schema, analyzer.Preconditions()).HasValue)
                .ToList();

            IEnumerable<IAnalyzer<IMetric>> failedAnalyzers = analyzers.Except(passedAnalyzers).ToList();

            /* Create the failure metrics from the precondition violations */
            AnalyzerContext preconditionFailures = ComputePreconditionFailureMetrics(failedAnalyzers, schema);

            /* Aggregate the state of every passed analyzer from all loaders into one in-memory provider */
            InMemoryStateProvider aggregatedStates = new InMemoryStateProvider();

            foreach (IAnalyzer<IMetric> analyzer in passedAnalyzers)
            {
                foreach (IStateLoader state in stateLoaders)
                {
                    analyzer.AggregateStateTo(aggregatedStates, state, aggregatedStates);
                }
            }

            IEnumerable<IGroupingAnalyzer<IMetric>> groupingAnalyzers =
                passedAnalyzers.OfType<IGroupingAnalyzer<IMetric>>().ToList();

            IEnumerable<IAnalyzer<IMetric>> scanningAnalyzers = passedAnalyzers.Except(groupingAnalyzers);

            Dictionary<IAnalyzer<IMetric>, IMetric> nonGroupedResults =
                new Dictionary<IAnalyzer<IMetric>, IMetric>();

            foreach (IAnalyzer<IMetric> analyzer in scanningAnalyzers)
            {
                IMetric metric = analyzer.LoadStateAndComputeMetric(aggregatedStates);

                if (saveStatesWith.HasValue)
                {
                    analyzer.CopyStateTo(aggregatedStates, saveStatesWith.Value);
                }

                nonGroupedResults.Add(analyzer, metric);
            }

            AnalyzerContext groupedResults;

            if (!groupingAnalyzers.Any())
            {
                groupedResults = AnalyzerContext.Empty();
            }
            else
            {
                // The elements are already IGroupingAnalyzer<IMetric>; the previous code cast the whole
                // `analyzers` collection here, which cannot succeed at runtime.
                // NOTE(review): the key is an IOrderedEnumerable<string>, which GroupBy compares with
                // default (reference) equality, so analyzers with identical grouping columns presumably
                // still land in separate groups — confirm whether a sequence-aware key is wanted.
                groupedResults = groupingAnalyzers
                    .GroupBy(analyzer => analyzer.GroupingColumns().OrderBy(columnName => columnName))
                    .Select(analyzerForGrouping =>
                    {
                        FrequenciesAndNumRows state =
                            FindStateForParticularGrouping(analyzerForGrouping, aggregatedStates);
                        return RunAnalyzersForParticularGrouping(state, analyzerForGrouping, saveStatesWith);
                    })
                    .Aggregate((x, y) => x + y);
            }

            AnalyzerContext results = preconditionFailures + new AnalyzerContext(nonGroupedResults) + groupedResults;

            SaveOrAppendResultsIfNecessary(results, metricsRepository, saveOrAppendResultsWithKey);

            return results;
        }