Example #1
0
        /**
         * Computes the frequencies for one set of grouping columns and runs the given grouping
         * analyzers on them.
         *
         * The frequencies are computed through FrequencyBasedAnalyzer.ComputeFrequencies. When
         * aggregateWith yields a previously stored state, that state is summed into the freshly
         * computed one before the analyzers run.
         *
         * @param dataFrame data to compute the frequencies on
         * @param groupingColumns columns to group by
         * @param filterConditions filter handed to the frequency computation (optional)
         * @param analyzers grouping analyzers to run; the first one is used as the lookup key for
         * loading a previous state
         * @param aggregateWith loader for a previous state to combine with (optional)
         * @param saveStateTo persister handed on to RunAnalyzersForParticularGrouping (optional)
         * @param storageLevelOfGroupedDataForMultiplePasses not read by this method
         * @param numRowsOfData not read by this method
         * @return the number of rows of the (possibly combined) state and the computed metrics
         */
        private static (long, AnalyzerContext) RunGroupingAnalyzers(
            DataFrame dataFrame,
            IEnumerable<string> groupingColumns,
            Option<string> filterConditions,
            IEnumerable<IGroupingAnalyzer<IMetric>> analyzers,
            Option<IStateLoader> aggregateWith,
            Option<IStatePersister> saveStateTo,
            StorageLevel storageLevelOfGroupedDataForMultiplePasses,
            Option<double> numRowsOfData
            )
        {
            FrequenciesAndNumRows freshState =
                FrequencyBasedAnalyzer.ComputeFrequencies(dataFrame, groupingColumns, filterConditions);

            // NOTE(review): `as` yields null when the first analyzer is not of this exact type;
            // that null is then handed to the state loader as-is.
            Analyzer<FrequenciesAndNumRows, IMetric> lookupAnalyzer =
                analyzers.First() as Analyzer<FrequenciesAndNumRows, IMetric>;

            Option<FrequenciesAndNumRows> storedState = aggregateWith
                .Select(loader => loader.Load<FrequenciesAndNumRows>(lookupAnalyzer))
                .GetOrElse(Option<FrequenciesAndNumRows>.None);

            // Fold the stored state into the fresh one when there is something to fold in.
            FrequenciesAndNumRows effectiveState = storedState.HasValue
                ? (FrequenciesAndNumRows)freshState.Sum(storedState.Value)
                : freshState;

            AnalyzerContext metrics =
                RunAnalyzersForParticularGrouping(effectiveState, analyzers, saveStateTo);

            return (effectiveState.NumRows, metrics);
        }
Example #2
0
        /**
         * Computes the metrics of all grouping analyzers that share one grouping, based on the
         * frequencies that were already computed for that grouping.
         *
         * Analyzers of type ScanShareableFrequencyBasedAnalyzer are evaluated together in a single
         * aggregation over the frequencies; every other analyzer computes its metric on its own via
         * ComputeMetricFrom. A failure in either part is turned into failure metrics for the
         * analyzers of that part instead of being thrown.
         *
         * @param frequenciesAndNumRows frequencies and number of rows of the grouping
         * @param analyzers analyzers to run on the frequencies; the first one is used as the key
         * when the state is persisted
         * @param saveStatesTo persister that receives the frequencies state (optional)
         * @return AnalyzerContext holding one metric per analyzer
         */
        private static AnalyzerContext RunAnalyzersForParticularGrouping(
            FrequenciesAndNumRows frequenciesAndNumRows,
            IEnumerable<IGroupingAnalyzer<IMetric>> analyzers,
            Option<IStatePersister> saveStatesTo
            )
        {
            long numRows = frequenciesAndNumRows.NumRows;

            // Materialize both partitions once; they are enumerated several times below.
            IEnumerable<ScanShareableFrequencyBasedAnalyzer> shareable = analyzers
                .OfType<ScanShareableFrequencyBasedAnalyzer>()
                .ToList();

            IEnumerable<IGroupingAnalyzer<IMetric>> others = analyzers.Except(shareable).ToList();

            // Cache the frequencies only when non-shareable analyzers are present, because only
            // then is the grouped data accessed beyond the single shared aggregation.
            if (others.Any())
            {
                frequenciesAndNumRows.Frequencies.Persist(); // TODO: storageLevelOfGroupedDataForMultiplePasses
            }

            IEnumerable<KeyValuePair<IAnalyzer<IMetric>, IMetric>> metricsByAnalyzer;

            if (!shareable.Any())
            {
                metricsByAnalyzer = new List<KeyValuePair<IAnalyzer<IMetric>, IMetric>>();
            }
            else
            {
                try
                {
                    // Ask every analyzer for its aggregation functions exactly once.
                    List<List<Column>> functionsPerAnalyzer = shareable
                        .Select(shared => shared.AggregationFunctions(numRows).ToList())
                        .ToList();

                    List<Column> aggregations = functionsPerAnalyzer
                        .SelectMany(functions => functions)
                        .ToList();

                    // offsets[k] is the index of the first result column that belongs to the
                    // k-th shareable analyzer.
                    List<int> offsets = new List<int>(functionsPerAnalyzer.Count);
                    int nextOffset = 0;
                    foreach (List<Column> functions in functionsPerAnalyzer)
                    {
                        offsets.Add(nextOffset);
                        nextOffset += functions.Count;
                    }

                    Row results = frequenciesAndNumRows.Frequencies
                        .Agg(aggregations.FirstOrDefault(), aggregations.Skip(1).ToArray())
                        .Collect()
                        .First();

                    // ToList() forces the evaluation inside the try block, so that a failure while
                    // building a metric is handled by the catch below.
                    metricsByAnalyzer = shareable
                        .Zip(offsets, (shared, offset) => new KeyValuePair<IAnalyzer<IMetric>, IMetric>(
                            shared,
                            SuccessOfFailureMetricFrom(shared, results, offset)))
                        .ToList();
                }
                catch (Exception e)
                {
                    metricsByAnalyzer = shareable
                        .Select(shared => new KeyValuePair<IAnalyzer<IMetric>, IMetric>(
                            shared,
                            shared.ToFailureMetric(e)))
                        .ToList();
                }
            }

            IEnumerable<KeyValuePair<IAnalyzer<IMetric>, IMetric>> otherMetrics;

            try
            {
                // Evaluated eagerly: the metrics must be computed inside the try block and before
                // the frequencies are unpersisted further down.
                otherMetrics = others
                    .Select(other => (FrequencyBasedAnalyzer)other)
                    .Select(other => new KeyValuePair<IAnalyzer<IMetric>, IMetric>(
                        other,
                        other.ComputeMetricFrom(new Option<FrequenciesAndNumRows>(frequenciesAndNumRows))))
                    .ToList();
            }
            catch (Exception e)
            {
                otherMetrics = others
                    .Select(other => new KeyValuePair<IAnalyzer<IMetric>, IMetric>(
                        other,
                        other.ToFailureMetric(e)))
                    .ToList();
            }

            saveStatesTo.Select(statePersister =>
                statePersister.Persist(new Option<IAnalyzer<IMetric>>(analyzers.First()), frequenciesAndNumRows));

            frequenciesAndNumRows.Frequencies.Unpersist();

            return new AnalyzerContext(
                new Dictionary<IAnalyzer<IMetric>, IMetric>(metricsByAnalyzer.Concat(otherMetrics)));
        }
Example #3
0
        /**
         * Compute the metrics from the analyzers configured in the analysis. Instead of running
         * directly on data, this computation leverages (and aggregates) existing states which have
         * previously been computed on the data.
         *
         * Returns an empty context when analysis.Analyzers or stateLoaders is null.
         *
         * @param schema schema of the data frame from which the states were computed
         * @param analysis the analysis to compute
         * @param stateLoaders loaders from which we retrieve the states to aggregate
         * @param saveStatesWith persist resulting states for the configured analyzers (optional)
         * @param metricsRepository passed to SaveOrAppendResultsIfNecessary together with the
         * results (optional)
         * @param saveOrAppendResultsWithKey passed to SaveOrAppendResultsIfNecessary together with
         * the results (optional)
         * @param storageLevelOfGroupedDataForMultiplePasses caching level for grouped data that must be
         * accessed multiple times (use
         * StorageLevel.NONE to completely disable
         * caching); NOTE: this method currently
         * does not read the value
         * @return AnalyzerContext holding the requested metrics per analyzer
         */
        public static AnalyzerContext RunOnAggregatedStates(
            StructType schema,
            Analysis analysis,
            IEnumerable<IStateLoader> stateLoaders,
            Option<IStatePersister> saveStatesWith                  = default,
            Option<IMetricsRepository> metricsRepository            = default,
            Option<ResultKey> saveOrAppendResultsWithKey            = default,
            StorageLevel storageLevelOfGroupedDataForMultiplePasses = StorageLevel.MEMORY_AND_DISK)
        {
            if (analysis.Analyzers == null || stateLoaders == null)
            {
                return AnalyzerContext.Empty();
            }

            IEnumerable<IAnalyzer<IMetric>> analyzers = analysis.Analyzers;

            /* Split the analyzers by whether they violate their preconditions. The result is
             * materialized so that the precondition checks run only once. */
            IEnumerable<IAnalyzer<IMetric>> passedAnalyzers = analyzers
                .Where(candidate => !FindFirstFailing(schema, candidate.Preconditions()).HasValue)
                .ToList();

            IEnumerable<IAnalyzer<IMetric>> failedAnalyzers = analyzers.Except(passedAnalyzers).ToList();

            /* Create the failure metrics from the precondition violations */
            AnalyzerContext preconditionFailures = ComputePreconditionFailureMetrics(failedAnalyzers, schema);

            /* Aggregate the states of every loader into one in-memory provider */
            InMemoryStateProvider aggregatedStates = new InMemoryStateProvider();
            List<IStateLoader> loaders = stateLoaders.ToList();

            foreach (IAnalyzer<IMetric> passed in passedAnalyzers)
            {
                foreach (IStateLoader loader in loaders)
                {
                    passed.AggregateStateTo(aggregatedStates, loader, aggregatedStates);
                }
            }

            IEnumerable<IGroupingAnalyzer<IMetric>> groupingAnalyzers =
                passedAnalyzers.OfType<IGroupingAnalyzer<IMetric>>().ToList();

            IEnumerable<IAnalyzer<IMetric>> scanningAnalyzers =
                passedAnalyzers.Except(groupingAnalyzers).ToList();

            /* Non-grouping analyzers compute their metric directly from the aggregated state */
            Dictionary<IAnalyzer<IMetric>, IMetric> nonGroupedResults =
                new Dictionary<IAnalyzer<IMetric>, IMetric>();

            foreach (IAnalyzer<IMetric> scanning in scanningAnalyzers)
            {
                IMetric metric = scanning.LoadStateAndComputeMetric(aggregatedStates);

                if (saveStatesWith.HasValue)
                {
                    scanning.CopyStateTo(aggregatedStates, saveStatesWith.Value);
                }

                nonGroupedResults.Add(scanning, metric);
            }

            AnalyzerContext groupedResults;

            if (!groupingAnalyzers.Any())
            {
                groupedResults = AnalyzerContext.Empty();
            }
            else
            {
                groupedResults = groupingAnalyzers
                    /* Group by the sorted column names joined into one string: a sequence used as
                     * key would be compared by reference and never match another analyzer's key.
                     * Assumes column names do not contain the NUL separator. */
                    .GroupBy(grouping => string.Join(
                        "\u0000",
                        grouping.GroupingColumns().OrderBy(columnName => columnName, StringComparer.Ordinal)))
                    .Select(group =>
                    {
                        IEnumerable<IGroupingAnalyzer<IMetric>> analyzersForGrouping = group.ToList();

                        FrequenciesAndNumRows state =
                            FindStateForParticularGrouping(analyzersForGrouping, aggregatedStates);

                        return RunAnalyzersForParticularGrouping(state, analyzersForGrouping, saveStatesWith);
                    })
                    .Aggregate((x, y) => x + y);
            }

            AnalyzerContext results = preconditionFailures + new AnalyzerContext(nonGroupedResults) + groupedResults;

            SaveOrAppendResultsIfNecessary(results, metricsRepository, saveOrAppendResultsWithKey);

            return results;
        }