/// <summary>
/// Runs Error-level integrity checks against the Yelp business dataset:
/// key uniqueness, required-column completeness, value ranges for
/// coordinates/stars, and an opening-hours format check per weekday.
/// </summary>
/// <param name="data">Business records (expects columns business_id, name,
/// address, city, state, postal_code, latitude, longitude, stars, hours.*).</param>
/// <returns>The <see cref="VerificationResult"/> of the suite run (also dumped via Debug()).</returns>
private static VerificationResult BusinessDataTests(DataFrame data)
{
    // Matches an opening-hours range such as "9:0-17:30" ("H:M-H:M").
    // FIX: the previous pattern was "^A|B$" — alternation binds loosest, so only
    // the first alternative was anchored at ^ and only the second at $, and any
    // string merely *starting* with a time (or ending with "-time") matched.
    // The whole range expression is now anchored end to end.
    Regex timeMatching = new Regex(
        "^([0-1]?[0-9]|2[0-3]):[0-5]?[0-9]-([0-1]?[0-9]|2[0-3]):[0-5]?[0-9]$");

    VerificationResult verificationResult = new VerificationSuite()
        .OnData(data)
        .AddCheck(
            new Check(CheckLevel.Error, "integrity checks")
                .IsUnique("business_id")
                .AreComplete(new[] { "business_id", "name", "address", "city", "state", "postal_code" })
                .IsComplete("stars")
                .IsContainedIn("latitude", -90, 90)
                // FIX: longitude spans -180..180; the upper bound was mistyped as 80,
                // which would flag every business east of ~80°E as invalid.
                .IsContainedIn("longitude", -180, 180)
                .IsContainedIn("stars", 0, 5)
                // At least 50% of weekday hours (40% on Sunday, when more
                // businesses are closed) must match the H:M-H:M range format.
                .HasPattern("hours.Monday", timeMatching, value => value >= .50)
                .HasPattern("hours.Tuesday", timeMatching, value => value >= .50)
                .HasPattern("hours.Wednesday", timeMatching, value => value >= .50)
                .HasPattern("hours.Thursday", timeMatching, value => value >= .50)
                .HasPattern("hours.Friday", timeMatching, value => value >= .50)
                .HasPattern("hours.Saturday", timeMatching, value => value >= .50)
                .HasPattern("hours.Sunday", timeMatching, value => value >= .40)
        )
        .Run();

    verificationResult.Debug();
    return verificationResult;
}
/// <summary>
/// Verifies the Yelp reviews dataset: Error-level integrity constraints
/// (keys, completeness, star range) plus Warning-level semantic checks on
/// the review text and vote counters.
/// </summary>
/// <param name="data">Review records (expects review_id, user_id, business_id,
/// stars, useful, funny, cool, date, text).</param>
/// <returns>The combined <see cref="VerificationResult"/> (also dumped via Debug()).</returns>
private static VerificationResult ReviewsDataTests(DataFrame data)
{
    // Hard constraints: broken keys or missing core columns are errors.
    Check integrityChecks = new Check(CheckLevel.Error, "integrity checks")
        .IsUnique("review_id")
        .AreComplete(new[] { "review_id", "user_id", "business_id" })
        .AreComplete(new[] { "stars", "useful", "funny", "cool" })
        .IsComplete("date")
        .IsContainedIn("stars", 0, 5);

    // Soft constraints: suspicious content (URLs, card numbers, emails) and
    // vote-counter minimums only raise warnings.
    Check semanticChecks = new Check(CheckLevel.Warning, "semantic checks")
        .ContainsURL("text", value => value <= .2)
        .ContainsCreditCardNumber("text", value => value <= .2)
        .ContainsEmail("text", value => value <= .2)
        .HasMin("useful", d => d <= .2)
        .HasMin("funny", d => d <= .2)
        .HasMin("cool", d => d <= .2);

    VerificationResult result = new VerificationSuite()
        .OnData(data)
        .AddCheck(integrityChecks)
        .AddCheck(semanticChecks)
        .Run();

    result.Debug();
    return result;
}
/// <summary>
/// Example: loads the sample inventory from a JSON file and runs a simple
/// verification suite (Error-level integrity checks plus a Warning-level
/// URL-distribution check), printing the outcome to the console.
/// </summary>
public static void ExecuteSimpleVerificationSuiteWithExternalFile()
{
    // Reuse (or start) the shared Spark session and load the sample data.
    var spark = SparkSession.Builder().GetOrCreate();
    var inventory = spark.Read().Json("data/inventory.json");
    inventory.Show();

    var integrity = new Check(CheckLevel.Error, "integrity checks")
        .HasSize(value => value == 5)
        .IsComplete("id")
        .IsUnique("id")
        .IsComplete("productName")
        .IsContainedIn("priority", new[] { "high", "low" })
        .IsNonNegative("numViews");

    var distribution = new Check(CheckLevel.Warning, "distribution checks")
        .ContainsURL("description", value => value >= .5);

    var result = new VerificationSuite()
        .OnData(inventory)
        .AddCheck(integrity)
        .AddCheck(distribution)
        .Run();

    // Dump per-constraint results for inspection.
    result.Debug();
}
/// <summary>
/// Example: builds a small in-memory product dataset with deliberate quality
/// issues (missing name/description, null priority) and runs the same
/// verification suite as the file-based example, printing the outcome.
/// </summary>
public static void ExecuteSimpleVerificationSuite()
{
    // Five sample rows; rows 2, 3 and 5 intentionally contain nulls so the
    // completeness checks below have something to report.
    var rows = new List<GenericRow>
    {
        new GenericRow(new object[] { 1, "Thingy A", "awesome thing.", "high", 0 }),
        new GenericRow(new object[] { 2, "Thingy B", "available at http://thingb.com", null, 0 }),
        new GenericRow(new object[] { 3, null, null, "low", 5 }),
        new GenericRow(new object[] { 4, "Thingy D", "checkout https://thingd.ca", "low", 10 }),
        new GenericRow(new object[] { 5, "Thingy E", null, "high", 12 })
    };

    var schema = new StructType(new List<StructField>
    {
        new StructField("id", new IntegerType()),
        new StructField("productName", new StringType()),
        new StructField("description", new StringType()),
        new StructField("priority", new StringType()),
        new StructField("numViews", new IntegerType()),
    });

    var data = SparkSession.Builder().GetOrCreate().CreateDataFrame(rows, schema);

    var verificationResult = new VerificationSuite()
        .OnData(data)
        .AddCheck(
            new Check(CheckLevel.Error, "integrity checks")
                .HasSize(value => value == 5)
                .IsComplete("id")
                .IsUnique("id")
                .IsComplete("productName")
                .IsContainedIn("priority", new[] { "high", "low" })
                .IsNonNegative("numViews")
        )
        .AddCheck(
            new Check(CheckLevel.Warning, "distribution checks")
                .ContainsURL("description", value => value == .5)
        )
        .Run();

    // Dump per-constraint results for inspection.
    verificationResult.Debug();
}
/// <summary>
/// Demonstrates anomaly detection on the Size() metric: yesterday's two-row
/// dataset is stored in a metrics repository; today's five-row dataset more
/// than doubles the size, which the RelativeRateOfChangeStrategy (max 2x)
/// flags as a Warning.
/// </summary>
public void should_execute_anomaly_detection_example()
{
    // Anomaly detection operates on metrics stored in a metric repository, so lets create one
    InMemoryMetricsRepository metricsRepository = new InMemoryMetricsRepository();

    // This is the key which we use to store the metrics for the dataset from yesterday.
    // FIX: a tick is 100 ns, so one day is TimeSpan.TicksPerDay ticks; the old
    // literal (24 * 60 * 1000 ticks) put the key only ~2.4 ms in the past.
    ResultKey yesterdayKeys = new ResultKey(DateTime.Now.Ticks - TimeSpan.TicksPerDay);

    /* In this simple example, we assume that we compute metrics on a dataset every day and we want
     * to ensure that they don't change drastically. For sake of simplicity, we just look at the
     * size of the data */

    /* Yesterday, the data had only two rows */
    var yesterdaysDataset = LoadAnomalyDetectionData(new List<object[]>
    {
        new object[] { 1, "Thingy A", "awesome thing.", "high", 0 },
        new object[] { 2, "Thingy B", "available at http://thingb.com", null, 0 }
    });

    /* We test for anomalies in the size of the data, it should not increase by more than 2x. Note
     * that we store the resulting metrics in our repository */
    new VerificationSuite()
        .OnData(yesterdaysDataset)
        .UseRepository(metricsRepository)
        .SaveOrAppendResult(yesterdayKeys)
        .AddAnomalyCheck(
            new RelativeRateOfChangeStrategy(maxRateIncrease: 2.0),
            Size()
        )
        .Run()
        .Debug(_helper.WriteLine);

    /* Todays data has five rows, so the data size more than doubled and our anomaly check should
     * catch this */
    var todaysDataset = LoadAnomalyDetectionData(new List<object[]>
    {
        new object[] { 1, "Thingy A", "awesome thing.", "high", 0 },
        new object[] { 2, "Thingy B", "available at http://thingb.com", null, 0 },
        new object[] { 3, null, null, "low", 5 },
        new object[] { 4, "Thingy D", "checkout https://thingd.ca", "low", 10 },
        new object[] { 5, "Thingy W", null, "high", 12 }
    });

    /* The key for today's result.
     * FIX: this is *today's* key, so no day offset is subtracted (the old code
     * reused yesterday's offset expression; the test only passed because the
     * second Now call happened microseconds later). */
    var todaysKey = new ResultKey(DateTime.Now.Ticks);

    /* Repeat the anomaly check for today's data */
    var verificationResult = new VerificationSuite()
        .OnData(todaysDataset)
        .UseRepository(metricsRepository)
        .SaveOrAppendResult(todaysKey)
        .AddAnomalyCheck(
            new RelativeRateOfChangeStrategy(maxRateIncrease: 2.0),
            Size()
        )
        .Run();

    // The 2-row -> 5-row jump exceeds the 2x rate limit, so the check warns.
    verificationResult.Status.ShouldBe(CheckStatus.Warning);
    _helper.WriteLine("Anomaly detected in the Size() metric!");

    /* Lets have a look at the actual metrics. */
    metricsRepository
        .Load()
        .ForAnalyzers(new[] { Size() })
        .GetSuccessMetricsAsDataFrame(_session)
        .Show();
}