Пример #1
0
        /// <summary>
        /// Builds and runs the "integrity checks" verification suite for the business data set,
        /// then calls <c>Debug()</c> on the outcome.
        /// </summary>
        /// <param name="data">Data frame containing the business records to verify.</param>
        /// <returns>The result produced by running the verification suite.</returns>
        private static VerificationResult BusinessDataTests(DataFrame data)
        {
            // Opening hours are matched as an "H:M-H:M" range. The two times are joined by a literal
            // '-' so that the ^ and $ anchors apply to the whole value instead of to two separate
            // alternatives (which would accept any string that merely starts with a time).
            // NOTE(review): assumes hours are stored like "8:0-18:30" - confirm against the data.
            Regex timeMatching = new Regex("^([0-1]?[0-9]|2[0-3]):[0-5]?[0-9]-([0-1]?[0-9]|2[0-3]):[0-5]?[0-9]$");

            VerificationResult verificationResult = new VerificationSuite()
                .OnData(data)
                .AddCheck(
                    new Check(CheckLevel.Error, "integrity checks")
                        .IsUnique("business_id")
                        .AreComplete(new[] { "business_id", "name", "address", "city", "state", "postal_code" })
                        .IsComplete("stars")
                        .IsContainedIn("latitude", -90, 90)
                        // Longitude spans the full -180..180 range; an upper bound of 80 would flag
                        // valid locations in the eastern hemisphere.
                        .IsContainedIn("longitude", -180, 180)
                        .IsContainedIn("stars", 0, 5)
                        .HasPattern("hours.Monday", timeMatching, value => value >= .50)
                        .HasPattern("hours.Tuesday", timeMatching, value => value >= .50)
                        .HasPattern("hours.Wednesday", timeMatching, value => value >= .50)
                        .HasPattern("hours.Thursday", timeMatching, value => value >= .50)
                        .HasPattern("hours.Friday", timeMatching, value => value >= .50)
                        .HasPattern("hours.Saturday", timeMatching, value => value >= .50)
                        // Sunday uses a lower threshold (.40) than the other days.
                        .HasPattern("hours.Sunday", timeMatching, value => value >= .40))
                .Run();

            verificationResult.Debug();
            return verificationResult;
        }
Пример #2
0
        /// <summary>
        /// Verifies the reviews data set with an error-level "integrity checks" group and a
        /// warning-level "semantic checks" group, then calls <c>Debug()</c> on the outcome.
        /// </summary>
        /// <param name="data">Data frame containing the review records to verify.</param>
        /// <returns>The result produced by running the verification suite.</returns>
        private static VerificationResult ReviewsDataTests(DataFrame data)
        {
            var suite = new VerificationSuite().OnData(data);

            // Error level: identifiers, ratings and dates must be present and well-formed.
            var integrityChecks = new Check(CheckLevel.Error, "integrity checks")
                .IsUnique("review_id")
                .AreComplete(new[] { "review_id", "user_id", "business_id" })
                .AreComplete(new[] { "stars", "useful", "funny", "cool" })
                .IsComplete("date")
                .IsContainedIn("stars", 0, 5);
            var suiteWithIntegrity = suite.AddCheck(integrityChecks);

            // Warning level: constraints on the review text and on the vote counters.
            var semanticChecks = new Check(CheckLevel.Warning, "semantic checks")
                .ContainsURL("text", ratio => ratio <= .2)
                .ContainsCreditCardNumber("text", ratio => ratio <= .2)
                .ContainsEmail("text", ratio => ratio <= .2)
                .HasMin("useful", minimum => minimum <= .2)
                .HasMin("funny", minimum => minimum <= .2)
                .HasMin("cool", minimum => minimum <= .2);

            VerificationResult outcome = suiteWithIntegrity
                .AddCheck(semanticChecks)
                .Run();

            outcome.Debug();
            return outcome;
        }
Пример #3
0
        /// <summary>
        /// Reads <c>data/inventory.json</c> through a Spark session, shows it, and runs an
        /// error-level "integrity checks" group plus a warning-level "distribution checks" group
        /// against it, calling <c>Debug()</c> on the result.
        /// </summary>
        public static void ExecuteSimpleVerificationSuiteWithExternalFile()
        {
            var session = SparkSession.Builder().GetOrCreate();
            var inventory = session.Read().Json("data/inventory.json");

            inventory.Show();

            var suite = new VerificationSuite().OnData(inventory);

            // Error level: row count, key completeness/uniqueness and value-domain constraints.
            var integrityChecks = new Check(CheckLevel.Error, "integrity checks")
                .HasSize(size => size == 5)
                .IsComplete("id")
                .IsUnique("id")
                .IsComplete("productName")
                .IsContainedIn("priority", new[] { "high", "low" })
                .IsNonNegative("numViews");
            var suiteWithIntegrity = suite.AddCheck(integrityChecks);

            // Warning level: constraint on the URL metric of the description column.
            var distributionChecks = new Check(CheckLevel.Warning, "distribution checks")
                .ContainsURL("description", ratio => ratio >= .5);

            VerificationResult outcome = suiteWithIntegrity
                .AddCheck(distributionChecks)
                .Run();

            outcome.Debug();
        }
Пример #4
0
        /// <summary>
        /// Builds a five-row in-memory product data frame and runs an error-level
        /// "integrity checks" group plus a warning-level "distribution checks" group against it,
        /// calling <c>Debug()</c> on the result.
        /// </summary>
        public static void ExecuteSimpleVerificationSuite()
        {
            // Column order of every row: id, productName, description, priority, numViews.
            var data = SparkSession.Builder().GetOrCreate().CreateDataFrame(
                new List<GenericRow>
                {
                    new GenericRow(new object[] { 1, "Thingy A", "awesome thing.", "high", 0 }),
                    new GenericRow(new object[] { 2, "Thingy B", "available at http://thingb.com", null, 0 }),
                    new GenericRow(new object[] { 3, null, null, "low", 5 }),
                    new GenericRow(new object[] { 4, "Thingy D", "checkout https://thingd.ca", "low", 10 }),
                    new GenericRow(new object[] { 5, "Thingy E", null, "high", 12 })
                },
                new StructType(new List<StructField>
                {
                    new StructField("id", new IntegerType()),
                    new StructField("productName", new StringType()),
                    new StructField("description", new StringType()),
                    new StructField("priority", new StringType()),
                    new StructField("numViews", new IntegerType()),
                }));

            var verificationResult = new VerificationSuite()
                .OnData(data)
                .AddCheck(
                    new Check(CheckLevel.Error, "integrity checks")
                        .HasSize(value => value == 5)
                        .IsComplete("id")
                        .IsUnique("id")
                        .IsComplete("productName")
                        .IsContainedIn("priority", new[] { "high", "low" })
                        .IsNonNegative("numViews"))
                .AddCheck(
                    new Check(CheckLevel.Warning, "distribution checks")
                        // Use a threshold rather than exact equality: the metric is a computed
                        // floating-point value, so comparing it with == is fragile.
                        .ContainsURL("description", value => value >= .5))
                .Run();

            verificationResult.Debug();
        }
Пример #5
0
        /// <summary>
        /// Runs the same size-based anomaly check on a two-row data set stored under a key dated
        /// one day ago and on a five-row data set stored under the current key, and asserts that
        /// the second run ends with <see cref="CheckStatus.Warning"/>.
        /// </summary>
        public void should_execute_anomaly_detection_example()
        {
            // Anomaly detection operates on metrics stored in a metric repository, so let's create one
            InMemoryMetricsRepository metricsRepository = new InMemoryMetricsRepository();

            // Capture the current time once so both result keys are derived from the same instant
            // and are exactly one day apart.
            DateTime now = DateTime.Now;

            // This is the key which we use to store the metrics for the dataset from yesterday.
            // Ticks are 100-nanosecond units, so the day is subtracted via AddDays instead of a
            // hand-computed numeric offset.
            ResultKey yesterdayKeys = new ResultKey(now.AddDays(-1).Ticks);

            /* In this simple example, we assume that we compute metrics on a dataset every day and we want
             * to ensure that they don't change drastically. For sake of simplicity, we just look at the
             * size of the data */

            /* Yesterday, the data had only two rows */
            var yesterdaysDataset = LoadAnomalyDetectionData(new List<object[]>
            {
                new object[] { 1, "Thingy A", "awesome thing.", "high", 0 },
                new object[] { 2, "Thingy B", "available at http://thingb.com", null, 0 }
            });

            /* We test for anomalies in the size of the data, it should not increase by more than 2x. Note
             * that we store the resulting metrics in our repository */
            new VerificationSuite()
                .OnData(yesterdaysDataset)
                .UseRepository(metricsRepository)
                .SaveOrAppendResult(yesterdayKeys)
                .AddAnomalyCheck(
                    new RelativeRateOfChangeStrategy(maxRateIncrease: 2.0),
                    Size())
                .Run()
                .Debug(_helper.WriteLine);

            /* Today's data has five rows, so the data size more than doubled and our anomaly check should
             * catch this */
            var todaysDataset = LoadAnomalyDetectionData(new List<object[]>
            {
                new object[] { 1, "Thingy A", "awesome thing.", "high", 0 },
                new object[] { 2, "Thingy B", "available at http://thingb.com", null, 0 },
                new object[] { 3, null, null, "low", 5 },
                new object[] { 4, "Thingy D", "checkout https://thingd.ca", "low", 10 },
                new object[] { 5, "Thingy W", null, "high", 12 }
            });

            /* The key for today's result: the current time, without the "yesterday" offset */
            var todaysKey = new ResultKey(now.Ticks);

            /* Repeat the anomaly check for today's data */
            var verificationResult = new VerificationSuite()
                .OnData(todaysDataset)
                .UseRepository(metricsRepository)
                .SaveOrAppendResult(todaysKey)
                .AddAnomalyCheck(
                    new RelativeRateOfChangeStrategy(maxRateIncrease: 2.0),
                    Size())
                .Run();

            verificationResult.Status.ShouldBe(CheckStatus.Warning);

            // Only reached when the assertion above passed.
            _helper.WriteLine("Anomaly detected in the Size() metric!");

            /* Let's have a look at the actual metrics. */
            metricsRepository
                .Load()
                .ForAnalyzers(new[] { Size() })
                .GetSuccessMetricsAsDataFrame(_session)
                .Show();
        }