Example #1
        public void should_execute_incremental_metrics_example()
        {
            DataFrame dataSetDE = LoadIncrementalMetricsData(
                new[] { new object[] { 1, "ManufacturerA", "DE" }, new object[] { 2, "ManufacturerB", "DE" },
                        new object[] { 2, "ManufacturerC", "DE" } });

            DataFrame dataSetUS = LoadIncrementalMetricsData(
                new[]
            {
                new object[] { 3, "ManufacturerD", "US" }, new object[] { 4, "ManufacturerE", "US" },
                new object[] { 5, "ManufacturerF", "US" }
            });

            DataFrame dataSetCN = LoadIncrementalMetricsData(
                new[] { new object[] { 6, "ManufacturerG", "CN" }, new object[] { 7, "ManufacturerH", "CN" }, });

            // We define a check on the data fields: manufacturerName must be complete and free of URLs,
            // and countryCode must be one of DE, US or CN
            var check = new Check(CheckLevel.Warning, "generic check")
                        .IsComplete("manufacturerName")
                        .ContainsURL("manufacturerName", val => val == 0.0)
                        .IsContainedIn("countryCode", new[] { "DE", "US", "CN" });


            // We create a new Analysis instance with the analyzers required by the check
            Analysis analysis = new Analysis(check.RequiredAnalyzers());

            // We create a new in-memory state provider for each countryCode defined in the dataset
            InMemoryStateProvider deStates = new InMemoryStateProvider();
            InMemoryStateProvider usStates = new InMemoryStateProvider();
            InMemoryStateProvider cnStates = new InMemoryStateProvider();

            // These calls store the resulting states for each partition in the corresponding state provider
            AnalysisRunner.Run(dataSetDE, analysis, saveStatesWith: deStates);
            AnalysisRunner.Run(dataSetUS, analysis, saveStatesWith: usStates);
            AnalysisRunner.Run(dataSetCN, analysis, saveStatesWith: cnStates);

            // Next, we are able to compute the metrics for the whole table from the partition states
            // This just aggregates the previously computed states; it doesn't perform any computation on the data
            AnalyzerContext tableMetrics = AnalysisRunner.RunOnAggregatedStates(dataSetDE.Schema(), analysis,
                                                                                new[] { deStates, usStates, cnStates });
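
            // tableMetrics now holds one metric per analyzer for the union of all three
            // partitions, even though no partition data was read again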

            // Let's now assume that a single partition changes. We only need to recompute the state of this
            // partition in order to update the metrics for the whole table.
            DataFrame updatedUsManufacturers = LoadIncrementalMetricsData(new[]
            {
                new object[] { 3, "ManufacturerDNew", "US" }, new object[] { 4, null, "US" },
                new object[] { 5, "ManufacturerFNew http://clickme.com", "US" },
            });

            // Recompute the state of the changed partition only
            InMemoryStateProvider updatedUsStates = new InMemoryStateProvider();

            AnalysisRunner.Run(updatedUsManufacturers, analysis, saveStatesWith: updatedUsStates);

            // Recompute the metrics for the whole table from the states. We do not need to touch the old data!
            AnalyzerContext updatedTableMetrics = AnalysisRunner.RunOnAggregatedStates(dataSetDE.Schema(), analysis,
                                                                                       new[] { deStates, updatedUsStates, cnStates });
        }
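
The example above relies on a LoadIncrementalMetricsData helper that is not shown. Below is a minimal sketch of such a helper, assuming .NET for Apache Spark (Microsoft.Spark.Sql and Microsoft.Spark.Sql.Types) and a SparkSession field named _session; the field name and the exact schema are assumptions for illustration, not part of the original example.

        private DataFrame LoadIncrementalMetricsData(IEnumerable<object[]> rows)
        {
            // Schema matching the (item, manufacturerName, countryCode) rows used above
            var schema = new StructType(new[]
            {
                new StructField("item", new IntegerType()),
                new StructField("manufacturerName", new StringType()),
                new StructField("countryCode", new StringType())
            });

            // Wrap each object[] in a GenericRow and build the DataFrame from the local rows
            return _session.CreateDataFrame(rows.Select(row => new GenericRow(row)).ToList(), schema);
        }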
Example #2
        /**
         * Computes the metrics for the analyzers configured in the analysis. Instead of running
         * directly on the data, this computation leverages (and aggregates) existing states which
         * have previously been computed on the data.
         *
         * @param schema schema of the data frame from which the states were computed
         * @param analysis the analysis to compute
         * @param stateLoaders loaders from which we retrieve the states to aggregate
         * @param saveStatesWith persist the resulting states for the configured analyzers (optional)
         * @param metricsRepository repository in which to store the resulting metrics (optional)
         * @param saveOrAppendResultsWithKey key under which the results are saved or appended in the repository (optional)
         * @param storageLevelOfGroupedDataForMultiplePasses caching level for grouped data that must be
         *        accessed multiple times (use StorageLevel.NONE to completely disable caching)
         * @return AnalyzerContext holding the requested metrics per analyzer
         */
        public static AnalyzerContext RunOnAggregatedStates(
            StructType schema,
            Analysis analysis,
            IEnumerable<IStateLoader> stateLoaders,
            Option<IStatePersister> saveStatesWith = default,
            Option<IMetricsRepository> metricsRepository = default,
            Option<ResultKey> saveOrAppendResultsWithKey = default,
            StorageLevel storageLevelOfGroupedDataForMultiplePasses = StorageLevel.MEMORY_AND_DISK)
        {
            if (analysis.Analyzers == null || stateLoaders == null)
            {
                return AnalyzerContext.Empty();
            }

            IEnumerable<IAnalyzer<IMetric>> analyzers = analysis.Analyzers;

            /* Keep only the analyzers whose preconditions hold on this schema */
            IEnumerable<IAnalyzer<IMetric>> passedAnalyzers = analyzers
                .Where(analyzer => !FindFirstFailing(schema, analyzer.Preconditions()).HasValue);

            IEnumerable<IAnalyzer<IMetric>> failedAnalyzers = analyzers.Except(passedAnalyzers);

            /* Create the failure metrics from the precondition violations */
            AnalyzerContext preconditionFailures = ComputePreconditionFailureMetrics(failedAnalyzers, schema);

            InMemoryStateProvider aggregatedStates = new InMemoryStateProvider();

            /* Merge the per-partition states analyzer by analyzer; states are additive,
               so the aggregate matches a state computed over all partitions at once */
            foreach (IAnalyzer<IMetric> analyzer in passedAnalyzers)
            {
                foreach (IStateLoader state in stateLoaders)
                {
                    analyzer.AggregateStateTo(aggregatedStates, state, aggregatedStates);
                }
            }


            IEnumerable<IGroupingAnalyzer<IMetric>> groupingAnalyzers =
                passedAnalyzers.OfType<IGroupingAnalyzer<IMetric>>();

            IEnumerable<IAnalyzer<IMetric>> scanningAnalyzers = passedAnalyzers.Except(groupingAnalyzers);

            Dictionary<IAnalyzer<IMetric>, IMetric> nonGroupedResults = new Dictionary<IAnalyzer<IMetric>, IMetric>(
                scanningAnalyzers.SelectMany(analyzer =>
                {
                    IMetric metrics = analyzer.LoadStateAndComputeMetric(aggregatedStates);

                    if (saveStatesWith.HasValue)
                    {
                        analyzer.CopyStateTo(aggregatedStates, saveStatesWith.Value);
                    }

                    return new[] { new KeyValuePair<IAnalyzer<IMetric>, IMetric>(analyzer, metrics) };
                }));


            AnalyzerContext groupedResults;

            if (!groupingAnalyzers.Any())
            {
                groupedResults = AnalyzerContext.Empty();
            }
            else
            {
                groupedResults = groupingAnalyzers
                    .GroupBy(analyzer => analyzer.GroupingColumns().OrderBy(columnName => columnName))
                    .Select(analyzersForGrouping =>
                    {
                        FrequenciesAndNumRows state =
                            FindStateForParticularGrouping(analyzersForGrouping, aggregatedStates);
                        return RunAnalyzersForParticularGrouping(state, analyzersForGrouping, saveStatesWith);
                    }).Aggregate((x, y) => x + y);
            }

            AnalyzerContext results = preconditionFailures + new AnalyzerContext(nonGroupedResults) + groupedResults;

            SaveOrAppendResultsIfNecessary(results, metricsRepository, saveOrAppendResultsWithKey);

            return results;
        }
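
Aggregating states this way works because deequ's analyzer states are additive: merging the states of two partitions yields the same state that a single pass over the combined data would produce, which is why metrics can be recomputed without re-reading old partitions. A minimal sketch of that idea follows; CompletenessState and its members are invented for illustration and are not the library's actual types.

        // Illustrative only: an additive state in the spirit of deequ's completeness state
        public sealed class CompletenessState
        {
            public long NonNullCount { get; }
            public long TotalCount { get; }

            public CompletenessState(long nonNullCount, long totalCount)
            {
                NonNullCount = nonNullCount;
                TotalCount = totalCount;
            }

            // Merging is plain addition, so per-partition states can be combined
            // in any order without touching the underlying data
            public CompletenessState Sum(CompletenessState other) =>
                new CompletenessState(NonNullCount + other.NonNullCount, TotalCount + other.TotalCount);

            // The metric is derived from the aggregated state
            public double MetricValue() => TotalCount == 0 ? 1.0 : (double)NonNullCount / TotalCount;
        }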