/// <summary>
/// Runs one batch of grouping analyzers that share the same grouping columns and filter
/// condition: computes the value frequencies in a single pass, optionally merges in a
/// previously persisted state, then evaluates every analyzer against the merged state.
/// </summary>
/// <param name="dataFrame">Data to compute frequencies over.</param>
/// <param name="groupingColumns">Columns the analyzers group by.</param>
/// <param name="filterConditions">Optional row filter applied before counting.</param>
/// <param name="analyzers">Grouping analyzers to evaluate; must all share the grouping key.</param>
/// <param name="aggregateWith">Optional loader for previously persisted frequency state.</param>
/// <param name="saveStateTo">Optional persister for the resulting state.</param>
/// <param name="storageLevelOfGroupedDataForMultiplePasses">Caching level for the grouped data.</param>
/// <param name="numRowsOfData">Row count hint, if already known.</param>
/// <returns>The total row count of the (merged) state and the computed metrics.</returns>
private static (long, AnalyzerContext) RunGroupingAnalyzers(
    DataFrame dataFrame,
    IEnumerable<string> groupingColumns,
    Option<string> filterConditions,
    IEnumerable<IGroupingAnalyzer<IMetric>> analyzers,
    Option<IStateLoader> aggregateWith,
    Option<IStatePersister> saveStateTo,
    StorageLevel storageLevelOfGroupedDataForMultiplePasses,
    Option<double> numRowsOfData)
{
    // One scan over the data to count value frequencies for the grouping columns.
    FrequenciesAndNumRows currentState =
        FrequencyBasedAnalyzer.ComputeFrequencies(dataFrame, groupingColumns, filterConditions);

    // Any analyzer of the batch serves as the key under which shared state was persisted.
    // NOTE(review): relies on this `as` cast succeeding; a non-conforming analyzer would
    // yield null here — behavior intentionally preserved from the original.
    Analyzer<FrequenciesAndNumRows, IMetric> stateKeyAnalyzer =
        analyzers.First() as Analyzer<FrequenciesAndNumRows, IMetric>;

    Option<FrequenciesAndNumRows> loadedState = aggregateWith
        .Select(loader => loader.Load<FrequenciesAndNumRows>(stateKeyAnalyzer))
        .GetOrElse(Option<FrequenciesAndNumRows>.None);

    // Merge the freshly computed frequencies with the previously persisted ones, if any.
    if (loadedState.HasValue)
    {
        currentState = (FrequenciesAndNumRows)currentState.Sum(loadedState.Value);
    }

    AnalyzerContext groupMetrics =
        RunAnalyzersForParticularGrouping(currentState, analyzers, saveStateTo);

    return (currentState.NumRows, groupMetrics);
}
/// <summary>
/// Computes the metrics for all analyzers configured in the given analysis by running
/// them against the data.
/// </summary>
/// <param name="data">Data to analyze.</param>
/// <param name="analysis">The analysis whose analyzers are run.</param>
/// <param name="aggregateWith">Optional loader for previously persisted analyzer states to aggregate with.</param>
/// <param name="saveStatesWith">Optional persister for the resulting analyzer states.</param>
/// <param name="storageLevelOfGroupedDataForMultiplePasses">Caching level for grouped data accessed in multiple passes.</param>
/// <returns>An AnalyzerContext holding the computed metric per analyzer.</returns>
public static AnalyzerContext Run(
    DataFrame data,
    Analysis analysis,
    Option<IStateLoader> aggregateWith = default,
    Option<IStatePersister> saveStatesWith = default,
    StorageLevel storageLevelOfGroupedDataForMultiplePasses = StorageLevel.MEMORY_AND_DISK) =>
    DoAnalysisRun(data, analysis.Analyzers, aggregateWith, saveStatesWith,
        storageLevelOfGroupedDataForMultiplePasses);
public void TestStorageLevelProperties()
{
    // Every storage level must round-trip through Persist() and StorageLevel().
    StorageLevel[] allLevels =
    {
        StorageLevel.NONE,
        StorageLevel.DISK_ONLY,
        StorageLevel.DISK_ONLY_2,
        StorageLevel.MEMORY_ONLY,
        StorageLevel.MEMORY_ONLY_2,
        StorageLevel.MEMORY_ONLY_SER,
        StorageLevel.MEMORY_ONLY_SER_2,
        StorageLevel.MEMORY_AND_DISK,
        StorageLevel.MEMORY_AND_DISK_2,
        StorageLevel.MEMORY_AND_DISK_SER,
        StorageLevel.MEMORY_AND_DISK_SER_2,
        StorageLevel.OFF_HEAP
    };

    foreach (StorageLevel expected in allLevels)
    {
        _df.Persist(expected);
        StorageLevel actual = _df.StorageLevel();
        Assert.Equal(expected, actual);

        // Needs to be unpersisted so other Persists can take effect.
        _df.Unpersist();
    }

    // Spot-check the individual flags of one well-known level.
    StorageLevel memoryAndDisk = StorageLevel.MEMORY_AND_DISK;
    Assert.True(memoryAndDisk.UseDisk);
    Assert.True(memoryAndDisk.UseMemory);
    Assert.False(memoryAndDisk.UseOffHeap);
    Assert.True(memoryAndDisk.Deserialized);
    Assert.Equal(1, memoryAndDisk.Replication);
    Assert.IsType<string>(memoryAndDisk.Description());
    Assert.IsType<string>(memoryAndDisk.ToString());
}
/// <summary>
/// Core entry point: runs the given analyzers against the data, reusing previously
/// computed results and persisted states where configured, and combines scanning
/// (single-pass) metrics with grouped (frequency-based) metrics.
/// </summary>
/// <param name="data">Data to analyze.</param>
/// <param name="analyzers">Analyzers to run.</param>
/// <param name="aggregateWith">Optional loader for previously persisted analyzer states.</param>
/// <param name="saveStatesWith">Optional persister for the resulting analyzer states.</param>
/// <param name="storageLevelOfGroupedDataForMultiplePasses">Caching level for grouped data accessed in multiple passes.</param>
/// <param name="metricsRepositoryOptions">Optional repository settings for loading/saving results.</param>
/// <param name="fileOutputOptions">Optional settings for writing JSON results to the filesystem.</param>
/// <returns>An AnalyzerContext holding the computed metric per analyzer.</returns>
public static AnalyzerContext DoAnalysisRun(
    DataFrame data,
    IEnumerable<IAnalyzer<IMetric>> analyzers,
    Option<IStateLoader> aggregateWith,
    Option<IStatePersister> saveStatesWith,
    StorageLevel storageLevelOfGroupedDataForMultiplePasses,
    AnalysisRunnerRepositoryOptions metricsRepositoryOptions = default,
    AnalysisRunnerFileOutputOptions fileOutputOptions = default)
{
    // Nothing to do for an empty analyzer set.
    if (!analyzers.Any())
    {
        return AnalyzerContext.Empty();
    }

    IEnumerable<IAnalyzer<IMetric>> allAnalyzers = analyzers.Select(analyzer => analyzer);
    // Materialize once so the sequence is not re-enumerated by the queries below.
    IAnalyzer<IMetric>[] enumerable = allAnalyzers as IAnalyzer<IMetric>[] ?? allAnalyzers.ToArray();
    // NOTE(review): distinctAnalyzers is computed but never used below — dead code candidate.
    IEnumerable<IAnalyzer<IMetric>> distinctAnalyzers = enumerable.Distinct();

    // Reuse results from the repository only when both a repository and a reuse key are set.
    AnalyzerContext resultComputedPreviously =
        (metricsRepositoryOptions?.metricRepository.HasValue,
         metricsRepositoryOptions?.reuseExistingResultsForKey.HasValue) switch
        {
            (true, true) => metricsRepositoryOptions?.metricRepository.Value
                .LoadByKey(metricsRepositoryOptions.reuseExistingResultsForKey.Value)
                .GetOrElse(AnalyzerContext.Empty()),
            _ => AnalyzerContext.Empty()
        };

    // Skip analyzers whose metrics were already loaded from the repository.
    IEnumerable<IAnalyzer<IMetric>> analyzersAlreadyRan =
        resultComputedPreviously.MetricMap.Keys.AsEnumerable();
    IEnumerable<IAnalyzer<IMetric>> analyzersToRun = enumerable.Except(analyzersAlreadyRan);

    // Partition the remaining analyzers by whether their schema preconditions hold.
    IEnumerable<IAnalyzer<IMetric>> passedAnalyzers = analyzersToRun.Where(analyzer =>
        !FindFirstFailing(data.Schema(), analyzer.Preconditions()).HasValue);
    IEnumerable<IAnalyzer<IMetric>> failedAnalyzers = analyzersToRun.Except(passedAnalyzers);

    // Precondition violations become failure metrics instead of thrown exceptions.
    AnalyzerContext preconditionFailures =
        ComputePreconditionFailureMetrics(failedAnalyzers, data.Schema());

    // Grouping analyzers need a frequency table; everything else runs in a single scan.
    IEnumerable<IGroupingAnalyzer<IMetric>> groupingAnalyzers =
        passedAnalyzers.OfType<IGroupingAnalyzer<IMetric>>();
    IEnumerable<IAnalyzer<IMetric>> allScanningAnalyzers =
        passedAnalyzers.Except(groupingAnalyzers).Select(analyzer => analyzer);

    AnalyzerContext nonGroupedMetrics =
        RunScanningAnalyzers(data, allScanningAnalyzers, aggregateWith, saveStatesWith);

    // Extract the row count from the Size metric, if it was computed in the scanning pass.
    Option<double> numRowsOfData = nonGroupedMetrics.Metric(Initializers.Size()).Select(metric =>
    {
        if (metric is DoubleMetric dm)
        {
            return dm.Value.Success.Value;
        }

        return 0;
    });

    AnalyzerContext groupedMetrics = AnalyzerContext.Empty();

    // Batch grouping analyzers that share the same (sorted) grouping columns and filter
    // so each batch needs only one frequency computation.
    IEnumerable<IGrouping<(IOrderedEnumerable<string>, Option<string>), IGroupingAnalyzer<IMetric>>>
        sortedAndFilteredGroupingAnalyzers = groupingAnalyzers
            .Select(analyzer => analyzer)
            .GroupBy(analyzer => (analyzer.GroupingColumns().OrderBy(columnName => columnName),
                GetFilterCondition(analyzer)));

    foreach (IGrouping<(IOrderedEnumerable<string>, Option<string>), IGroupingAnalyzer<IMetric>>
        analyzerGroup in sortedAndFilteredGroupingAnalyzers)
    {
        (long numRows, AnalyzerContext metrics) =
            RunGroupingAnalyzers(data,
                analyzerGroup.Key.Item1.ToList(),
                analyzerGroup.Key.Item2,
                analyzerGroup,
                aggregateWith,
                saveStatesWith,
                storageLevelOfGroupedDataForMultiplePasses,
                numRowsOfData);

        groupedMetrics += metrics;

        // Backfill the row count from the grouping pass if the scanning pass did not yield one.
        if (!numRowsOfData.HasValue)
        {
            numRowsOfData = new Option<double>(numRows);
        }
    }

    AnalyzerContext resultingAnalyzerContext =
        resultComputedPreviously + preconditionFailures + nonGroupedMetrics + groupedMetrics;

    //TODO: add kllMetrics
    if (metricsRepositoryOptions != null)
    {
        SaveOrAppendResultsIfNecessary(resultingAnalyzerContext,
            metricsRepositoryOptions.metricRepository,
            metricsRepositoryOptions.saveOrAppendResultsWithKey);
    }

    SaveJsonOutputsToFilesystemIfNecessary(fileOutputOptions, resultingAnalyzerContext);

    return resultingAnalyzerContext;
}
/// <summary>
/// Computes the metrics from the analyzers configured in the analysis. Instead of running
/// directly on data, this computation leverages (and aggregates) existing states which have
/// previously been computed on the data.
/// </summary>
/// <param name="schema">Schema of the data frame from which the states were computed.</param>
/// <param name="analysis">The analysis to compute.</param>
/// <param name="stateLoaders">Loaders from which we retrieve the states to aggregate.</param>
/// <param name="saveStatesWith">Persist resulting states for the configured analyzers (optional).</param>
/// <param name="metricsRepository">Optional repository to save or append the results to.</param>
/// <param name="saveOrAppendResultsWithKey">Key under which the results are stored in the repository.</param>
/// <param name="storageLevelOfGroupedDataForMultiplePasses">Caching level for grouped data that must be
/// accessed multiple times (use StorageLevel.NONE to completely disable caching).</param>
/// <returns>AnalyzerContext holding the requested metrics per analyzer.</returns>
public static AnalyzerContext RunOnAggregatedStates(
    StructType schema,
    Analysis analysis,
    IEnumerable<IStateLoader> stateLoaders,
    Option<IStatePersister> saveStatesWith = default,
    Option<IMetricsRepository> metricsRepository = default,
    Option<ResultKey> saveOrAppendResultsWithKey = default,
    StorageLevel storageLevelOfGroupedDataForMultiplePasses = StorageLevel.MEMORY_AND_DISK)
{
    if (analysis.Analyzers == null || stateLoaders == null)
    {
        return AnalyzerContext.Empty();
    }

    IEnumerable<IAnalyzer<IMetric>> analyzers = analysis.Analyzers;

    /* Find all analyzers which violate their preconditions */
    IEnumerable<IAnalyzer<IMetric>> passedAnalyzers = analyzers
        .Where(analyzer => !FindFirstFailing(schema, analyzer.Preconditions()).HasValue);
    IEnumerable<IAnalyzer<IMetric>> failedAnalyzers = analyzers.Except(passedAnalyzers);

    /* Create the failure metrics from the precondition violations */
    AnalyzerContext preconditionFailures = ComputePreconditionFailureMetrics(failedAnalyzers, schema);

    // Aggregate the states from every loader into a single in-memory provider,
    // which then backs all metric computations below.
    InMemoryStateProvider aggregatedStates = new InMemoryStateProvider();
    foreach (IAnalyzer<IMetric> analyzer in passedAnalyzers)
    {
        foreach (IStateLoader state in stateLoaders)
        {
            analyzer.AggregateStateTo(aggregatedStates, state, aggregatedStates);
        }
    }

    IEnumerable<IGroupingAnalyzer<IMetric>> groupingAnalyzers =
        passedAnalyzers.OfType<IGroupingAnalyzer<IMetric>>();
    IEnumerable<IAnalyzer<IMetric>> scanningAnalyzers = passedAnalyzers.Except(groupingAnalyzers);

    // Scanning analyzers compute their metric straight from the aggregated state,
    // optionally copying that state out to the configured persister.
    Dictionary<IAnalyzer<IMetric>, IMetric> nonGroupedResults =
        new Dictionary<IAnalyzer<IMetric>, IMetric>(
            scanningAnalyzers.SelectMany(analyzer =>
            {
                IMetric metrics = analyzer.LoadStateAndComputeMetric(aggregatedStates);

                if (saveStatesWith.HasValue)
                {
                    analyzer.CopyStateTo(aggregatedStates, saveStatesWith.Value);
                }

                return new[] { new KeyValuePair<IAnalyzer<IMetric>, IMetric>(analyzer, metrics) };
            }));

    AnalyzerContext groupedResults;
    if (!groupingAnalyzers.Any())
    {
        groupedResults = AnalyzerContext.Empty();
    }
    else
    {
        groupedResults = groupingAnalyzers
            // BUGFIX: the original cast the whole `analyzers` collection instead of the
            // lambda parameter `analyzer`, which threw InvalidCastException whenever a
            // grouping analyzer was present.
            .Select(analyzer => (IGroupingAnalyzer<IMetric>)analyzer)
            .GroupBy(analyzer => analyzer.GroupingColumns().OrderBy(columnName => columnName))
            .Select(analyzerForGrouping =>
            {
                FrequenciesAndNumRows state =
                    FindStateForParticularGrouping(analyzerForGrouping, aggregatedStates);
                return RunAnalyzersForParticularGrouping(state, analyzerForGrouping, saveStatesWith);
            }).Aggregate((x, y) => x + y);
    }

    AnalyzerContext results =
        preconditionFailures + new AnalyzerContext(nonGroupedResults) + groupedResults;

    SaveOrAppendResultsIfNecessary(results, metricsRepository, saveOrAppendResultsWithKey);

    return results;
}
/// <summary>Creates a PersistedCloudFlow from the given CloudFlow.</summary>
/// <typeparam name="TSource">Element type of the flow.</typeparam>
/// <param name="flow">The input CloudFlow.</param>
/// <param name="storageLevel">Desired storage level for the persisted CloudFlow.</param>
/// <returns>The result PersistedCloudFlow.</returns>
public static Cloud<PersistedCloudFlow<TSource>> Persist<TSource>(
    this CloudFlow<TSource> flow,
    StorageLevel storageLevel) =>
    CloudFlowModule.persist(storageLevel, flow);