public static DataFrame ToDataFrame(this IEnumerable<SimpleCheckResultOutput> simpleChecks)
{
    // Flatten each check result into a row of plain string columns.
    List<GenericRow> elements = new List<GenericRow>();
    foreach (SimpleCheckResultOutput check in simpleChecks)
    {
        elements.Add(new GenericRow(new[]
        {
            check.CheckDescription,
            check.CheckLevel,
            check.CheckStatus,
            check.Constraint,
            check.ConstraintStatus,
            check.ConstraintMessage
        }));
    }

    StructType schema = new StructType(new List<StructField>
    {
        new StructField("check", new StringType()),
        new StructField("check_level", new StringType()),
        new StructField("check_status", new StringType()),
        new StructField("constraint", new StringType()),
        new StructField("constraint_status", new StringType()),
        new StructField("constraint_message", new StringType())
    });

    return SparkSession.Active().CreateDataFrame(elements, schema);
}
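Once converted, the check results can be queried like any other DataFrame. A minimal usage sketch of the extension above; the method name PrintFailedConstraints and the "Success" status literal are assumptions, not part of the original code:

// Hypothetical usage of the ToDataFrame extension above; checkOutputs is assumed
// to come from a Deequ verification run, and "Success" is an assumed status value.
static void PrintFailedConstraints(IEnumerable<SimpleCheckResultOutput> checkOutputs)
{
    DataFrame results = checkOutputs.ToDataFrame();

    // Keep only the rows whose constraint did not pass, then print them.
    results
        .Filter(Functions.Col("constraint_status").NotEqual("Success"))
        .Show();
}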
static DataFrame toDF(List<Document> docs)
{
    var rows = new List<GenericRow>();
    var spark = SparkSession.Active();

    // One row per document: its path and its raw content.
    foreach (var doc in docs)
    {
        rows.Add(new GenericRow(new object[] { doc.Path, doc.Content }));
    }

    var schema = new StructType(new List<StructField>
    {
        new StructField("Path", new StringType()),
        new StructField("Content", new StringType())
    });

    return spark.CreateDataFrame(rows, schema);
}
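A quick usage sketch for toDF; the object-initializer syntax assumes Document exposes settable Path and Content properties, and the sample paths are made up:

// Hypothetical usage of toDF above.
var docs = new List<Document>
{
    new Document { Path = "docs/intro.md", Content = "hello spark" },
    new Document { Path = "docs/usage.md", Content = "create a dataframe" }
};
toDF(docs).Show(); // prints a two-column DataFrame: Path, Content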
public void TestSignaturesV2_4_X()
{
    Assert.IsType<SparkContext>(_spark.SparkContext);
    Assert.IsType<Builder>(SparkSession.Builder());

    SparkSession.ClearActiveSession();
    SparkSession.SetActiveSession(_spark);
    Assert.IsType<SparkSession>(SparkSession.GetActiveSession());

    SparkSession.ClearDefaultSession();
    SparkSession.SetDefaultSession(_spark);
    Assert.IsType<SparkSession>(SparkSession.GetDefaultSession());

    Assert.IsType<RuntimeConfig>(_spark.Conf());
    Assert.IsType<StreamingQueryManager>(_spark.Streams());
    Assert.IsType<SparkSession>(_spark.NewSession());
    Assert.IsType<DataFrameReader>(_spark.Read());

    Assert.IsType<DataFrame>(_spark.Range(10));
    Assert.IsType<DataFrame>(_spark.Range(10, 100));
    Assert.IsType<DataFrame>(_spark.Range(10, 100, 10));
    Assert.IsType<DataFrame>(_spark.Range(10, 100, 10, 5));

    _spark.Range(10).CreateOrReplaceTempView("testView");
    Assert.IsType<DataFrame>(_spark.Table("testView"));

    Assert.IsType<DataStreamReader>(_spark.ReadStream());
    Assert.IsType<UdfRegistration>(_spark.Udf());
    Assert.IsType<Catalog>(_spark.Catalog);
    Assert.NotNull(_spark.Version());
    Assert.IsType<SparkSession>(SparkSession.Active());
}
public void TestSignaturesV2_4_X()
{
    Assert.IsType<SparkSession>(SparkSession.Active());
}
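Both tests bottom out in SparkSession.Active(). A minimal sketch of the fallback behavior the first test exercises, assuming the .NET binding mirrors Spark's session semantics (Active() prefers the thread's active session and otherwise falls back to the default):

SparkSession spark = SparkSession.Builder().GetOrCreate();

// With no active session set, Active() is expected to resolve to the default session.
SparkSession.ClearActiveSession();
SparkSession.SetDefaultSession(spark);
Console.WriteLine(SparkSession.Active().Version());

// Once an active session is set, Active() returns that one instead.
SparkSession.SetActiveSession(spark.NewSession());
Console.WriteLine(SparkSession.Active().Version());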
public static void AnomalyDetectionExample()
{
    // Anomaly detection operates on metrics stored in a metric repository, so let's create one.
    IMetricsRepository metricsRepository = new InMemoryMetricsRepository();

    // The key under which we store the metrics for yesterday's dataset (one day of ticks ago).
    ResultKey yesterdaysKey = new ResultKey(DateTime.UtcNow.Ticks - TimeSpan.TicksPerDay);

    /* In this simple example, we assume that we compute metrics on a dataset every day and we
     * want to ensure that they don't change drastically. For the sake of simplicity, we just
     * look at the size of the data. */

    /* Yesterday, the data had only two rows. */
    var yesterdaysDataset = LoadData(new List<object[]>
    {
        new object[] { 1, "Thingy A", "awesome thing.", "high", 0 },
        new object[] { 2, "Thingy B", "available at http://thingb.com", null, 0 }
    });

    /* We test for anomalies in the size of the data: it should not increase by more than 2x.
     * Note that we store the resulting metrics in our repository. */
    new VerificationSuite()
        .OnData(yesterdaysDataset)
        .UseRepository(metricsRepository)
        .SaveOrAppendResult(yesterdaysKey)
        .AddAnomalyCheck(
            new RelativeRateOfChangeStrategy(maxRateIncrease: 2.0),
            Size())
        .Run();

    /* Today's data has five rows, so the data size more than doubled and our anomaly check
     * should catch this. */
    var todaysDataset = LoadData(new List<object[]>
    {
        new object[] { 1, "Thingy A", "awesome thing.", "high", 0 },
        new object[] { 2, "Thingy B", "available at http://thingb.com", null, 0 },
        new object[] { 3, null, null, "low", 5 },
        new object[] { 4, "Thingy D", "checkout https://thingd.ca", "low", 10 },
        new object[] { 5, "Thingy W", null, "high", 12 }
    });

    /* The key for today's result. */
    var todaysKey = new ResultKey(DateTime.UtcNow.Ticks);

    /* Repeat the anomaly check for today's data. */
    var verificationResult = new VerificationSuite()
        .OnData(todaysDataset)
        .UseRepository(metricsRepository)
        .SaveOrAppendResult(todaysKey)
        .AddAnomalyCheck(
            new RelativeRateOfChangeStrategy(maxRateIncrease: 2.0),
            Size())
        .Run()
        .Debug();

    /* Did we find an anomaly? */
    if (verificationResult.Status != CheckStatus.Success)
    {
        Console.WriteLine("Anomaly detected in the Size() metric!");

        /* Let's have a look at the actual metrics. */
        metricsRepository
            .Load()
            .ForAnalyzers(new[] { Size() })
            .GetSuccessMetricsAsDataFrame(SparkSession.Active())
            .Show();
    }
}
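The example relies on a LoadData helper that is not shown here. A minimal stand-in under assumptions: the column names and types are illustrative, chosen only to match the five-element rows above:

// Hypothetical stand-in for the LoadData helper used above; column names are assumptions.
private static DataFrame LoadData(List<object[]> rows)
{
    var schema = new StructType(new List<StructField>
    {
        new StructField("id", new IntegerType()),
        new StructField("name", new StringType()),
        new StructField("description", new StringType()),
        new StructField("priority", new StringType()),
        new StructField("numViews", new IntegerType())
    });

    return SparkSession.Active().CreateDataFrame(
        rows.Select(row => new GenericRow(row)).ToList(),
        schema);
}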
/// <summary>
/// Runs the Spark job.
/// </summary>
private static void RunJob()
{
    const string wordList = nameof(wordList);
    const string word = nameof(word);
    const string count = nameof(count);
    const string docFrequency = nameof(docFrequency);
    const string total = nameof(total);
    const string inverseDocFrequency = nameof(inverseDocFrequency);
    const string termFreq_inverseDocFreq = nameof(termFreq_inverseDocFreq);

    Console.WriteLine("Starting Spark job to analyze words...");

    var spark = SparkSession.Active();
    filesHelper.NewModelSession();

    // everything
    var docs = spark.Read().HasHeader().Csv(filesHelper.TempDataFile);
    docs.CreateOrReplaceTempView(nameof(docs));

    // all docs in corpus
    var totalDocs = docs.Count();

    // easy reference
    var fileCol = nameof(FileDataParse.File).AsColumn();

    // split words and group by count
    var words = docs
        // transform words into an array of words
        .Select(
            fileCol,
            Functions.Split(
                nameof(FileDataParse.Words).AsColumn(), " ")
                .Alias(wordList))
        // flatten into one row per word
        .Select(
            fileCol,
            Functions.Explode(wordList.AsColumn())
                .Alias(word));

    // get frequency of word per document
    var termFrequency = words
        // group by attributes of file plus word
        .GroupBy(fileCol, Functions.Lower(word.AsColumn()).Alias(word))
        // generate count
        .Count()
        // order by word count per file descending
        .OrderBy(fileCol, count.AsColumn().Desc());

    // count by word
    termFrequency.CreateOrReplaceTempView(nameof(termFrequency));

    // now count frequency of word across all documents
    var documentFrequency = words
        .GroupBy(Functions.Lower(word.AsColumn()).Alias(word))
        .Agg(Functions.CountDistinct(fileCol).Alias(docFrequency));
    documentFrequency.CreateOrReplaceTempView(nameof(documentFrequency));
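    // --- Not the original continuation: a minimal sketch of the TF-IDF step that the
    // constants above (inverseDocFrequency, termFreq_inverseDocFreq) point at. It joins
    // per-document term counts with corpus-wide document frequency and computes
    // tf-idf = count * log(totalDocs / docFrequency); it reuses the same AsColumn
    // helper as the code above.
    var termFreqInverseDocFreq = termFrequency
        .Join(documentFrequency, word)
        .WithColumn(
            inverseDocFrequency,
            Functions.Log(Functions.Lit((double)totalDocs) / docFrequency.AsColumn()))
        .WithColumn(
            termFreq_inverseDocFreq,
            count.AsColumn() * inverseDocFrequency.AsColumn());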