// Demonstrates incremental metric computation: per-partition states are computed once,
// aggregated into table-level metrics, and a single changed partition can be recomputed
// without touching the other partitions' data.
public void should_execute_incremental_metrics_example()
{
    DataFrame dataSetDE = LoadIncrementalMetricsData(
        new[]
        {
            new object[] { 1, "ManufacturerA", "DE" },
            new object[] { 2, "ManufacturerB", "DE" },
            new object[] { 2, "ManufacturerC", "DE" }
        });

    DataFrame dataSetUS = LoadIncrementalMetricsData(
        new[]
        {
            new object[] { 3, "ManufacturerD", "US" },
            new object[] { 4, "ManufacturerE", "US" },
            new object[] { 5, "ManufacturerF", "US" }
        });

    DataFrame dataSetCN = LoadIncrementalMetricsData(
        new[]
        {
            new object[] { 6, "ManufacturerG", "CN" },
            new object[] { 7, "ManufacturerH", "CN" },
        });

    // We initialize a new check for the following data fields
    var check = new Check(CheckLevel.Warning, "generic check")
        .IsComplete("manufacturerName")
        .ContainsURL("manufacturerName", val => val == 0.0)
        .IsContainedIn("countryCode", new[] { "DE", "US", "CN" });

    // We create a new Analysis instance with the corresponding RequiredAnalyzers defined in the check
    Analysis analysis = new Analysis(check.RequiredAnalyzers());

    // We create a new in-memory state provider for each countryCode defined in the dataset
    InMemoryStateProvider deStates = new InMemoryStateProvider();
    InMemoryStateProvider usStates = new InMemoryStateProvider();
    InMemoryStateProvider cnStates = new InMemoryStateProvider();

    // These calls will store the resulting metrics in the separate state providers for each dataSet
    AnalysisRunner.Run(dataSetDE, analysis, saveStatesWith: deStates);
    AnalysisRunner.Run(dataSetUS, analysis, saveStatesWith: usStates);
    AnalysisRunner.Run(dataSetCN, analysis, saveStatesWith: cnStates);

    // Next, we are able to compute the metrics for the whole table from the partition states.
    // This just aggregates the previously calculated metrics, it doesn't perform computation on the data.
    AnalyzerContext tableMetrics = AnalysisRunner.RunOnAggregatedStates(
        dataSetDE.Schema(), analysis, new[] { deStates, usStates, cnStates });

    // Let's now assume that a single partition changes. We only need to recompute the state of this
    // partition in order to update the metrics for the whole table.
    DataFrame updatedUsManufacturers = LoadIncrementalMetricsData(
        new[]
        {
            new object[] { 3, "ManufacturerDNew", "US" },
            new object[] { 4, null, "US" },
            new object[] { 5, "ManufacturerFNew http://clickme.com", "US" },
        });

    // Recompute state of the changed partition (named argument kept consistent with the calls above).
    InMemoryStateProvider updatedUsStates = new InMemoryStateProvider();
    AnalysisRunner.Run(updatedUsManufacturers, analysis, saveStatesWith: updatedUsStates);

    // Recompute metrics for the whole table from states. We do not need to touch old data!
    // BUG FIX: the aggregation must use the recomputed `updatedUsStates`, not the stale
    // `usStates` — otherwise the "updated" metrics are identical to the original ones.
    AnalyzerContext updatedTableMetrics = AnalysisRunner.RunOnAggregatedStates(
        dataSetDE.Schema(), analysis, new[] { deStates, updatedUsStates, cnStates });
}
/// <summary>
/// Computes the metrics for the analyzers configured in the analysis. Instead of running
/// directly on data, this computation leverages (and aggregates) existing states which have
/// previously been computed on the data.
/// </summary>
/// <param name="schema">Schema of the data frame from which the states were computed.</param>
/// <param name="analysis">The analysis to compute.</param>
/// <param name="stateLoaders">Loaders from which we retrieve the states to aggregate.</param>
/// <param name="saveStatesWith">Persist resulting states for the configured analyzers (optional).</param>
/// <param name="metricsRepository">Repository the resulting metrics are saved to (optional).</param>
/// <param name="saveOrAppendResultsWithKey">Key under which results are saved or appended
/// in the metrics repository (optional).</param>
/// <param name="storageLevelOfGroupedDataForMultiplePasses">Caching level for grouped data that
/// must be accessed multiple times (use StorageLevel.NONE to completely disable caching).</param>
/// <returns>AnalyzerContext holding the requested metrics per analyzer.</returns>
public static AnalyzerContext RunOnAggregatedStates(
    StructType schema,
    Analysis analysis,
    IEnumerable<IStateLoader> stateLoaders,
    Option<IStatePersister> saveStatesWith = default,
    Option<IMetricsRepository> metricsRepository = default,
    Option<ResultKey> saveOrAppendResultsWithKey = default,
    StorageLevel storageLevelOfGroupedDataForMultiplePasses = StorageLevel.MEMORY_AND_DISK)
{
    // Nothing to compute without analyzers or states.
    if (analysis.Analyzers == null || stateLoaders == null)
    {
        return AnalyzerContext.Empty();
    }

    IEnumerable<IAnalyzer<IMetric>> analyzers = analysis.Analyzers;

    // Find all analyzers which satisfy their preconditions. Materialized once so the
    // precondition checks are not re-evaluated on every (deferred) enumeration below.
    List<IAnalyzer<IMetric>> passedAnalyzers = analyzers
        .Where(analyzer => !FindFirstFailing(schema, analyzer.Preconditions()).HasValue)
        .ToList();

    IEnumerable<IAnalyzer<IMetric>> failedAnalyzers = analyzers.Except(passedAnalyzers);

    // Create the failure metrics from the precondition violations.
    AnalyzerContext preconditionFailures = ComputePreconditionFailureMetrics(failedAnalyzers, schema);

    // Fold every partition's state into a single aggregated in-memory state per analyzer.
    InMemoryStateProvider aggregatedStates = new InMemoryStateProvider();

    foreach (IAnalyzer<IMetric> analyzer in passedAnalyzers)
    {
        foreach (IStateLoader state in stateLoaders)
        {
            analyzer.AggregateStateTo(aggregatedStates, state, aggregatedStates);
        }
    }

    List<IGroupingAnalyzer<IMetric>> groupingAnalyzers =
        passedAnalyzers.OfType<IGroupingAnalyzer<IMetric>>().ToList();

    IEnumerable<IAnalyzer<IMetric>> scanningAnalyzers = passedAnalyzers.Except(groupingAnalyzers);

    // Non-grouping analyzers compute their metric straight from the aggregated state.
    Dictionary<IAnalyzer<IMetric>, IMetric> nonGroupedResults =
        new Dictionary<IAnalyzer<IMetric>, IMetric>(
            scanningAnalyzers.SelectMany(analyzer =>
            {
                IMetric metrics = analyzer.LoadStateAndComputeMetric(aggregatedStates);

                if (saveStatesWith.HasValue)
                {
                    analyzer.CopyStateTo(aggregatedStates, saveStatesWith.Value);
                }

                return new[] { new KeyValuePair<IAnalyzer<IMetric>, IMetric>(analyzer, metrics) };
            }));

    AnalyzerContext groupedResults;

    if (!groupingAnalyzers.Any())
    {
        groupedResults = AnalyzerContext.Empty();
    }
    else
    {
        // BUG FIX: the original projected `(IGroupingAnalyzer<IMetric>)analyzers` — casting the
        // whole analyzer *sequence* to a single grouping analyzer, which throws
        // InvalidCastException at runtime. The per-element `analyzer` is intended; since
        // `groupingAnalyzers` is already typed via OfType<>, no cast (and no Select) is needed.
        //
        // NOTE(review): GroupBy keys here are IOrderedEnumerable<string> instances compared with
        // default (reference) equality, so analyzers sharing the same grouping columns may not
        // actually land in the same group — confirm against the intended grouping semantics.
        groupedResults = groupingAnalyzers
            .GroupBy(analyzer => analyzer.GroupingColumns().OrderBy(columnName => columnName))
            .Select(analyzersForGrouping =>
            {
                FrequenciesAndNumRows state =
                    FindStateForParticularGrouping(analyzersForGrouping, aggregatedStates);
                return RunAnalyzersForParticularGrouping(state, analyzersForGrouping, saveStatesWith);
            })
            .Aggregate((x, y) => x + y);
    }

    AnalyzerContext results = preconditionFailures + new AnalyzerContext(nonGroupedResults) + groupedResults;

    SaveOrAppendResultsIfNecessary(results, metricsRepository, saveOrAppendResultsWithKey);

    return results;
}