/// <summary>
/// Computes the frequencies for one set of grouping columns, optionally merges them with a
/// previously stored state, and runs all analyzers of that grouping on the combined state.
/// </summary>
/// <param name="dataFrame">Data the frequencies are computed on.</param>
/// <param name="groupingColumns">Columns handed to the frequency computation.</param>
/// <param name="filterConditions">Optional filter handed to the frequency computation.</param>
/// <param name="analyzers">Analyzers sharing this grouping; the first one is used when loading a stored state.</param>
/// <param name="aggregateWith">Optional loader of a previously stored state to merge in.</param>
/// <param name="saveStateTo">Optional persister forwarded to RunAnalyzersForParticularGrouping.</param>
/// <param name="storageLevelOfGroupedDataForMultiplePasses">Not read by this method.</param>
/// <param name="numRowsOfData">Not read by this method.</param>
/// <returns>The row count of the combined state and the metrics computed for the grouping.</returns>
private static (long, AnalyzerContext) RunGroupingAnalyzers(
            DataFrame dataFrame,
            IEnumerable<string> groupingColumns,
            Option<string> filterConditions,
            IEnumerable<IGroupingAnalyzer<IMetric>> analyzers,
            Option<IStateLoader> aggregateWith,
            Option<IStatePersister> saveStateTo,
            StorageLevel storageLevelOfGroupedDataForMultiplePasses,
            Option<double> numRowsOfData)
        {
            // State derived from the data frame that was passed in.
            FrequenciesAndNumRows currentState =
                FrequencyBasedAnalyzer.ComputeFrequencies(dataFrame, groupingColumns, filterConditions);

            // The first analyzer of the group is the one handed to the state loader.
            Analyzer<FrequenciesAndNumRows, IMetric> representative =
                analyzers.First() as Analyzer<FrequenciesAndNumRows, IMetric>;

            Option<FrequenciesAndNumRows> storedState = aggregateWith
                .Select(loader => loader.Load<FrequenciesAndNumRows>(representative))
                .GetOrElse(Option<FrequenciesAndNumRows>.None);

            // Merge with the stored state only when the loader actually produced one.
            FrequenciesAndNumRows combinedState = storedState.HasValue
                ? (FrequenciesAndNumRows)currentState.Sum(storedState.Value)
                : currentState;

            AnalyzerContext groupingResults =
                RunAnalyzersForParticularGrouping(combinedState, analyzers, saveStateTo);

            return (combinedState.NumRows, groupingResults);
        }
 /// <summary>
 /// Runs the analyzers contained in <paramref name="analysis"/> on the data by delegating
 /// to DoAnalysisRun.
 /// </summary>
 /// <param name="data">Data frame forwarded to DoAnalysisRun.</param>
 /// <param name="analysis">Analysis whose Analyzers collection is executed.</param>
 /// <param name="aggregateWith">Optional state loader, forwarded unchanged.</param>
 /// <param name="saveStatesWith">Optional state persister, forwarded unchanged.</param>
 /// <param name="storageLevelOfGroupedDataForMultiplePasses">
 /// Storage level forwarded unchanged; defaults to StorageLevel.MEMORY_AND_DISK.
 /// </param>
 /// <returns>The AnalyzerContext returned by DoAnalysisRun.</returns>
 public static AnalyzerContext Run(
     DataFrame data,
     Analysis analysis,
     Option<IStateLoader> aggregateWith = default,
     Option<IStatePersister> saveStatesWith = default,
     StorageLevel storageLevelOfGroupedDataForMultiplePasses = StorageLevel.MEMORY_AND_DISK) =>
     DoAnalysisRun(
         data,
         analysis.Analyzers,
         aggregateWith,
         saveStatesWith,
         storageLevelOfGroupedDataForMultiplePasses);
Exemple #3
0
        /// <summary>
        /// Persists the shared DataFrame with each predefined storage level and checks that the
        /// level is reported back, then checks the properties of MEMORY_AND_DISK.
        /// </summary>
        public void TestStorageLevelProperties()
        {
            StorageLevel[] levelsUnderTest =
            {
                StorageLevel.NONE,
                StorageLevel.DISK_ONLY,
                StorageLevel.DISK_ONLY_2,
                StorageLevel.MEMORY_ONLY,
                StorageLevel.MEMORY_ONLY_2,
                StorageLevel.MEMORY_ONLY_SER,
                StorageLevel.MEMORY_ONLY_SER_2,
                StorageLevel.MEMORY_AND_DISK,
                StorageLevel.MEMORY_AND_DISK_2,
                StorageLevel.MEMORY_AND_DISK_SER,
                StorageLevel.MEMORY_AND_DISK_SER_2,
                StorageLevel.OFF_HEAP
            };

            foreach (StorageLevel level in levelsUnderTest)
            {
                _df.Persist(level);
                Assert.Equal(level, _df.StorageLevel());

                // Unpersist first, otherwise the next Persist call cannot take effect.
                _df.Unpersist();
            }

            StorageLevel memoryAndDisk = StorageLevel.MEMORY_AND_DISK;

            Assert.True(memoryAndDisk.UseDisk);
            Assert.True(memoryAndDisk.UseMemory);
            Assert.False(memoryAndDisk.UseOffHeap);
            Assert.True(memoryAndDisk.Deserialized);
            Assert.Equal(1, memoryAndDisk.Replication);

            Assert.IsType<string>(memoryAndDisk.Description());
            Assert.IsType<string>(memoryAndDisk.ToString());
        }
        /// <summary>
        /// Runs the given analyzers on the data and returns the combination of previously stored
        /// results, precondition failures, scan-based metrics and grouping-based metrics.
        /// </summary>
        /// <param name="data">Data frame to analyze.</param>
        /// <param name="analyzers">Analyzers to run; an empty sequence yields an empty context.</param>
        /// <param name="aggregateWith">Optional state loader forwarded to the scanning and grouping runs.</param>
        /// <param name="saveStatesWith">Optional state persister forwarded to the scanning and grouping runs.</param>
        /// <param name="storageLevelOfGroupedDataForMultiplePasses">Storage level forwarded to RunGroupingAnalyzers.</param>
        /// <param name="metricsRepositoryOptions">
        /// Optional repository settings. When both a repository and a reuse key are set, results
        /// stored under that key are loaded and the corresponding analyzers are not run again.
        /// </param>
        /// <param name="fileOutputOptions">Optional settings forwarded to SaveJsonOutputsToFilesystemIfNecessary.</param>
        /// <returns>The combined AnalyzerContext.</returns>
        public static AnalyzerContext DoAnalysisRun
        (
            DataFrame data,
            IEnumerable<IAnalyzer<IMetric>> analyzers,
            Option<IStateLoader> aggregateWith,
            Option<IStatePersister> saveStatesWith,
            StorageLevel storageLevelOfGroupedDataForMultiplePasses,
            AnalysisRunnerRepositoryOptions metricsRepositoryOptions = default,
            AnalysisRunnerFileOutputOptions fileOutputOptions = default)
        {
            // Materialize once: the sequence is consulted several times below.
            IAnalyzer<IMetric>[] allAnalyzers = analyzers as IAnalyzer<IMetric>[] ?? analyzers.ToArray();

            if (allAnalyzers.Length == 0)
            {
                return AnalyzerContext.Empty();
            }

            AnalyzerContext resultComputedPreviously = (metricsRepositoryOptions?.metricRepository.HasValue,
                                                        metricsRepositoryOptions?.reuseExistingResultsForKey.HasValue) switch
            {
                (true, true) => metricsRepositoryOptions?.metricRepository.Value
                    .LoadByKey(metricsRepositoryOptions.reuseExistingResultsForKey.Value)
                    .GetOrElse(AnalyzerContext.Empty()),
                _ => AnalyzerContext.Empty()
            };

            IEnumerable<IAnalyzer<IMetric>> analyzersAlreadyRan =
                resultComputedPreviously.MetricMap.Keys.AsEnumerable();

            // The queries below are materialized so that the precondition checks run exactly
            // once per analyzer instead of once per later enumeration.
            IEnumerable<IAnalyzer<IMetric>> analyzersToRun = allAnalyzers.Except(analyzersAlreadyRan).ToList();

            var schema = data.Schema();

            IEnumerable<IAnalyzer<IMetric>> passedAnalyzers = analyzersToRun
                .Where(analyzer => !FindFirstFailing(schema, analyzer.Preconditions()).HasValue)
                .ToList();

            IEnumerable<IAnalyzer<IMetric>> failedAnalyzers = analyzersToRun.Except(passedAnalyzers);

            AnalyzerContext preconditionFailures = ComputePreconditionFailureMetrics(failedAnalyzers, schema);

            IEnumerable<IGroupingAnalyzer<IMetric>> groupingAnalyzers =
                passedAnalyzers.OfType<IGroupingAnalyzer<IMetric>>().ToList();

            IEnumerable<IAnalyzer<IMetric>> allScanningAnalyzers = passedAnalyzers.Except(groupingAnalyzers);

            AnalyzerContext nonGroupedMetrics =
                RunScanningAnalyzers(data, allScanningAnalyzers, aggregateWith, saveStatesWith);

            // Reuse the row count from the Size metric when the scanning pass produced one.
            Option<double> numRowsOfData = nonGroupedMetrics.Metric(Initializers.Size()).Select(metric =>
            {
                if (metric is DoubleMetric dm)
                {
                    return dm.Value.Success.Value;
                }

                return 0;
            });

            AnalyzerContext groupedMetrics = AnalyzerContext.Empty();

            // Analyzers with the same sorted grouping columns and the same filter share one
            // frequency computation. The explicit comparer is required: the key contains an
            // IOrderedEnumerable, which would otherwise be compared by reference and put
            // every analyzer into a group of its own.
            IEnumerable<IGrouping<(IOrderedEnumerable<string>, Option<string>), IGroupingAnalyzer<IMetric>>>
                sortedAndFilteredGroupingAnalyzers = groupingAnalyzers.GroupBy(
                    analyzer => (analyzer.GroupingColumns().OrderBy(columnName => columnName),
                                 GetFilterCondition(analyzer)),
                    new GroupingKeyComparer());

            foreach (IGrouping<(IOrderedEnumerable<string>, Option<string>), IGroupingAnalyzer<IMetric>>
                     analyzerGroup in sortedAndFilteredGroupingAnalyzers)
            {
                (long numRows, AnalyzerContext metrics) =
                    RunGroupingAnalyzers(data,
                                         analyzerGroup.Key.Item1.ToList(),
                                         analyzerGroup.Key.Item2, analyzerGroup, aggregateWith, saveStatesWith,
                                         storageLevelOfGroupedDataForMultiplePasses, numRowsOfData);

                groupedMetrics += metrics;

                if (!numRowsOfData.HasValue)
                {
                    numRowsOfData = new Option<double>(numRows);
                }
            }

            AnalyzerContext resultingAnalyzerContext =
                resultComputedPreviously + preconditionFailures + nonGroupedMetrics +
                groupedMetrics; //TODO: add kllMetrics

            if (metricsRepositoryOptions != null)
            {
                SaveOrAppendResultsIfNecessary(resultingAnalyzerContext,
                                               metricsRepositoryOptions.metricRepository,
                                               metricsRepositoryOptions.saveOrAppendResultsWithKey);
            }

            SaveJsonOutputsToFilesystemIfNecessary(fileOutputOptions, resultingAnalyzerContext);

            return resultingAnalyzerContext;
        }

        /// <summary>
        /// Compares (sorted grouping columns, filter condition) keys by content rather than by
        /// the identity of the column sequence.
        /// </summary>
        private sealed class GroupingKeyComparer
            : IEqualityComparer<(IOrderedEnumerable<string>, Option<string>)>
        {
            public bool Equals(
                (IOrderedEnumerable<string>, Option<string>) x,
                (IOrderedEnumerable<string>, Option<string>) y)
            {
                return x.Item1.SequenceEqual(y.Item1) &&
                       EqualityComparer<Option<string>>.Default.Equals(x.Item2, y.Item2);
            }

            public int GetHashCode((IOrderedEnumerable<string>, Option<string>) key)
            {
                // Only the columns contribute to the hash; keys that differ solely in their
                // filter collide here and are told apart by Equals.
                unchecked
                {
                    int hash = 17;

                    foreach (string column in key.Item1)
                    {
                        hash = (hash * 31) + (column?.GetHashCode() ?? 0);
                    }

                    return hash;
                }
            }
        }
        /// <summary>
        /// Computes the metrics of the analyzers configured in the analysis. Instead of running
        /// directly on data, the computation aggregates states which have previously been
        /// computed on the data.
        /// </summary>
        /// <param name="schema">Schema of the data frame from which the states were computed.</param>
        /// <param name="analysis">The analysis to compute.</param>
        /// <param name="stateLoaders">Loaders from which the states to aggregate are retrieved.</param>
        /// <param name="saveStatesWith">Persists the resulting states for the configured analyzers (optional).</param>
        /// <param name="metricsRepository">Repository forwarded to SaveOrAppendResultsIfNecessary (optional).</param>
        /// <param name="saveOrAppendResultsWithKey">Result key forwarded to SaveOrAppendResultsIfNecessary (optional).</param>
        /// <param name="storageLevelOfGroupedDataForMultiplePasses">
        /// Caching level for grouped data that must be accessed multiple times (use
        /// StorageLevel.NONE to completely disable caching). Not read by this method body.
        /// </param>
        /// <returns>AnalyzerContext holding the requested metrics per analyzer.</returns>
        public static AnalyzerContext RunOnAggregatedStates(
            StructType schema,
            Analysis analysis,
            IEnumerable<IStateLoader> stateLoaders,
            Option<IStatePersister> saveStatesWith = default,
            Option<IMetricsRepository> metricsRepository = default,
            Option<ResultKey> saveOrAppendResultsWithKey = default,
            StorageLevel storageLevelOfGroupedDataForMultiplePasses = StorageLevel.MEMORY_AND_DISK)
        {
            if (analysis.Analyzers == null || stateLoaders == null)
            {
                return AnalyzerContext.Empty();
            }

            IEnumerable<IAnalyzer<IMetric>> analyzers = analysis.Analyzers;

            /* Find all analyzers which satisfy their preconditions. Materialized so the
             * precondition checks are not repeated by every later enumeration. */
            IEnumerable<IAnalyzer<IMetric>> passedAnalyzers = analyzers
                .Where(analyzer => !FindFirstFailing(schema, analyzer.Preconditions()).HasValue)
                .ToList();

            IEnumerable<IAnalyzer<IMetric>> failedAnalyzers = analyzers.Except(passedAnalyzers);

            /* Create the failure metrics from the precondition violations */
            AnalyzerContext preconditionFailures = ComputePreconditionFailureMetrics(failedAnalyzers, schema);

            InMemoryStateProvider aggregatedStates = new InMemoryStateProvider();

            /* Fold the state of every loader into the in-memory provider, per analyzer */
            foreach (IAnalyzer<IMetric> analyzer in passedAnalyzers)
            {
                foreach (IStateLoader state in stateLoaders)
                {
                    analyzer.AggregateStateTo(aggregatedStates, state, aggregatedStates);
                }
            }

            IEnumerable<IGroupingAnalyzer<IMetric>> groupingAnalyzers =
                passedAnalyzers.OfType<IGroupingAnalyzer<IMetric>>().ToList();

            IEnumerable<IAnalyzer<IMetric>> scanningAnalyzers = passedAnalyzers.Except(groupingAnalyzers);

            Dictionary<IAnalyzer<IMetric>, IMetric> nonGroupedResults =
                new Dictionary<IAnalyzer<IMetric>, IMetric>();

            foreach (IAnalyzer<IMetric> analyzer in scanningAnalyzers)
            {
                IMetric metric = analyzer.LoadStateAndComputeMetric(aggregatedStates);

                if (saveStatesWith.HasValue)
                {
                    analyzer.CopyStateTo(aggregatedStates, saveStatesWith.Value);
                }

                nonGroupedResults.Add(analyzer, metric);
            }

            AnalyzerContext groupedResults;

            if (!groupingAnalyzers.Any())
            {
                groupedResults = AnalyzerContext.Empty();
            }
            else
            {
                // groupingAnalyzers already has the required element type, so no projection is
                // needed (the previous cast targeted the whole analyzer collection and failed
                // at runtime). The comparer groups by column content; without it the
                // IOrderedEnumerable keys would be compared by reference.
                groupedResults = groupingAnalyzers
                    .GroupBy(analyzer => analyzer.GroupingColumns().OrderBy(columnName => columnName),
                             new GroupingColumnsComparer())
                    .Select(analyzerForGrouping =>
                    {
                        FrequenciesAndNumRows state =
                            FindStateForParticularGrouping(analyzerForGrouping, aggregatedStates);
                        return RunAnalyzersForParticularGrouping(state, analyzerForGrouping, saveStatesWith);
                    })
                    .Aggregate((x, y) => x + y);
            }

            AnalyzerContext results = preconditionFailures + new AnalyzerContext(nonGroupedResults) + groupedResults;

            SaveOrAppendResultsIfNecessary(results, metricsRepository, saveOrAppendResultsWithKey);

            return results;
        }

        /// <summary>
        /// Compares sorted grouping-column sequences by content rather than by reference.
        /// </summary>
        private sealed class GroupingColumnsComparer : IEqualityComparer<IOrderedEnumerable<string>>
        {
            public bool Equals(IOrderedEnumerable<string> x, IOrderedEnumerable<string> y)
            {
                if (ReferenceEquals(x, y))
                {
                    return true;
                }

                if (x is null || y is null)
                {
                    return false;
                }

                return x.SequenceEqual(y);
            }

            public int GetHashCode(IOrderedEnumerable<string> columns)
            {
                unchecked
                {
                    int hash = 17;

                    foreach (string column in columns)
                    {
                        hash = (hash * 31) + (column?.GetHashCode() ?? 0);
                    }

                    return hash;
                }
            }
        }
Exemple #6
0
 /// <summary>
 /// Builds a cloud computation that persists the given CloudFlow by calling
 /// CloudFlowModule.persist.
 /// </summary>
 /// <typeparam name="TSource">Element type of the flow.</typeparam>
 /// <param name="flow">The input CloudFlow.</param>
 /// <param name="storageLevel">Desired storage level for the persisted CloudFlow.</param>
 /// <returns>A Cloud computation yielding the PersistedCloudFlow.</returns>
 public static Cloud<PersistedCloudFlow<TSource>> Persist<TSource>(
     this CloudFlow<TSource> flow,
     StorageLevel storageLevel) =>
     CloudFlowModule.persist(storageLevel, flow);