Example #1
        public static DataFrame ToDataFrame(this IEnumerable<SimpleCheckResultOutput> simpleChecks)
        {
            List<GenericRow> elements = new List<GenericRow>();

            foreach (SimpleCheckResultOutput check in simpleChecks)
            {
                elements.Add(new GenericRow(new object[]
                {
                    check.CheckDescription, check.CheckLevel, check.CheckStatus, check.Constraint,
                    check.ConstraintStatus, check.ConstraintMessage
                }));
            }

            StructType schema = new StructType(new List<StructField>
            {
                new StructField("check", new StringType()),
                new StructField("check_level", new StringType()),
                new StructField("check_status", new StringType()),
                new StructField("constraint", new StringType()),
                new StructField("constraint_status", new StringType()),
                new StructField("constraint_message", new StringType())
            });

            return SparkSession.Active().CreateDataFrame(elements, schema);
        }
Example #2
        static DataFrame toDF(List<Document> docs)
        {
            var rows = new List<GenericRow>();

            var spark = SparkSession.Active();

            foreach (var doc in docs)
            {
                rows.Add(new GenericRow(new object[] { doc.Path, doc.Content }));
            }

            var schema = new StructType(new List<StructField>
            {
                new StructField("Path", new StringType()),
                new StructField("Content", new StringType())
            });

            return spark.CreateDataFrame(rows, schema);
        }
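A usage sketch, assuming Document exposes settable Path and Content string properties (an assumption; the type isn't shown in the listing):

        // Hypothetical usage: build a tiny corpus and convert it to a DataFrame.
        var docs = new List<Document>
        {
            new Document { Path = "docs/a.txt", Content = "hello spark" },
            new Document { Path = "docs/b.txt", Content = "hello again" }
        };
        toDF(docs).Show();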
Example #3
        public void TestSignaturesV2_4_X()
        {
            Assert.IsType<SparkContext>(_spark.SparkContext);

            Assert.IsType<Builder>(SparkSession.Builder());

            SparkSession.ClearActiveSession();
            SparkSession.SetActiveSession(_spark);
            Assert.IsType<SparkSession>(SparkSession.GetActiveSession());

            SparkSession.ClearDefaultSession();
            SparkSession.SetDefaultSession(_spark);
            Assert.IsType<SparkSession>(SparkSession.GetDefaultSession());

            Assert.IsType<RuntimeConfig>(_spark.Conf());

            Assert.IsType<StreamingQueryManager>(_spark.Streams());

            Assert.IsType<SparkSession>(_spark.NewSession());

            Assert.IsType<DataFrameReader>(_spark.Read());

            Assert.IsType<DataFrame>(_spark.Range(10));
            Assert.IsType<DataFrame>(_spark.Range(10, 100));
            Assert.IsType<DataFrame>(_spark.Range(10, 100, 10));
            Assert.IsType<DataFrame>(_spark.Range(10, 100, 10, 5));

            _spark.Range(10).CreateOrReplaceTempView("testView");
            Assert.IsType<DataFrame>(_spark.Table("testView"));

            Assert.IsType<DataStreamReader>(_spark.ReadStream());

            Assert.IsType<UdfRegistration>(_spark.Udf());

            Assert.IsType<Catalog>(_spark.Catalog);

            Assert.NotNull(_spark.Version());

            Assert.IsType<SparkSession>(SparkSession.Active());
        }
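The assertions above assume a session already exists. A minimal sketch of how such a session is typically established with the Microsoft.Spark builder API (the app name is illustrative):

        // Create (or reuse) a session; it then becomes reachable through the
        // static SparkSession.Active() accessor used throughout these tests.
        SparkSession spark = SparkSession
            .Builder()
            .AppName("signature-tests")
            .GetOrCreate();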
Example #4
        public void TestSignaturesV2_4_X()
        {
            Assert.IsType<SparkSession>(SparkSession.Active());
        }
Example #5
        public static void AnomalyDetectionExample()
        {
            // Anomaly detection operates on metrics stored in a metric repository, so let's create one
            IMetricsRepository metricsRepository = new InMemoryMetricsRepository();
            // This is the key which we use to store the metrics for the dataset from yesterday
            ResultKey yesterdayKeys =
                new ResultKey(DateTime.UtcNow.AddDays(-1).Ticks);

            /* In this simple example, we assume that we compute metrics on a dataset every day, and we
             * want to ensure that they don't change drastically. For the sake of simplicity, we just
             * look at the size of the data. */

            /* Yesterday, the data had only two rows */
            var yesterdaysDataset = LoadData(new List<object[]>
            {
                new object[] { 1, "Thingy A", "awesome thing.", "high", 0 },
                new object[] { 2, "Thingy B", "available at http://thingb.com", null, 0 }
            });

            /* We test for anomalies in the size of the data: it should not increase by more than 2x.
             * Note that we store the resulting metrics in our repository. */
            new VerificationSuite()
                .OnData(yesterdaysDataset)
                .UseRepository(metricsRepository)
                .SaveOrAppendResult(yesterdayKeys)
                .AddAnomalyCheck(
                    new RelativeRateOfChangeStrategy(maxRateIncrease: 2.0),
                    Size())
                .Run();

            /* Today's data has five rows, so the data size more than doubled and our anomaly check
             * should catch this. */
            var todaysDataset = LoadData(new List<object[]>
            {
                new object[] { 1, "Thingy A", "awesome thing.", "high", 0 },
                new object[] { 2, "Thingy B", "available at http://thingb.com", null, 0 },
                new object[] { 3, null, null, "low", 5 },
                new object[] { 4, "Thingy D", "checkout https://thingd.ca", "low", 10 },
                new object[] { 5, "Thingy W", null, "high", 12 }
            });

            /* The key for today's result */
            var todaysKey = new ResultKey(DateTime.UtcNow.Ticks);

            /* Repeat the anomaly check for today's data */
            var verificationResult = new VerificationSuite()
                .OnData(todaysDataset)
                .UseRepository(metricsRepository)
                .SaveOrAppendResult(todaysKey)
                .AddAnomalyCheck(
                    new RelativeRateOfChangeStrategy(maxRateIncrease: 2.0),
                    Size())
                .Run()
                .Debug();

            /* Did we find an anomaly? */
            if (verificationResult.Status != CheckStatus.Success)
            {
                Console.WriteLine("Anomaly detected in the Size() metric!");

                /* Let's have a look at the actual metrics. */
                metricsRepository
                    .Load()
                    .ForAnalyzers(new[] { Size() })
                    .GetSuccessMetricsAsDataFrame(SparkSession.Active())
                    .Show();
            }
        }
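The example above calls a LoadData helper that isn't shown in the listing. A minimal sketch of what it might look like, assuming a five-column schema matching the sample rows (the column names are assumptions):

        // Hypothetical helper: builds a DataFrame from raw rows on the active
        // session. Schema (id, name, description, priority, numViews) is assumed.
        private static DataFrame LoadData(List<object[]> rows)
        {
            var schema = new StructType(new List<StructField>
            {
                new StructField("id", new IntegerType()),
                new StructField("name", new StringType()),
                new StructField("description", new StringType()),
                new StructField("priority", new StringType()),
                new StructField("numViews", new IntegerType())
            });

            var genericRows = new List<GenericRow>();
            foreach (object[] row in rows)
            {
                genericRows.Add(new GenericRow(row));
            }

            return SparkSession.Active().CreateDataFrame(genericRows, schema);
        }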
        /// <summary>
        /// Runs the Spark job.
        /// </summary>
        private static void RunJob()
        {
            const string wordList                = nameof(wordList);
            const string word                    = nameof(word);
            const string count                   = nameof(count);
            const string docFrequency            = nameof(docFrequency);
            const string total                   = nameof(total);
            const string inverseDocFrequency     = nameof(inverseDocFrequency);
            const string termFreq_inverseDocFreq = nameof(termFreq_inverseDocFreq);

            Console.WriteLine("Starting Spark job to analyze words...");

            var spark = SparkSession.Active();

            filesHelper.NewModelSession();

            // read every document in the corpus from the temp data file
            var docs = spark.Read().HasHeader().Csv(filesHelper.TempDataFile);

            docs.CreateOrReplaceTempView(nameof(docs));

            // all docs in corpus
            var totalDocs = docs.Count();

            // easy reference
            var fileCol = nameof(FileDataParse.File).AsColumn();

            // split words and group by count
            var words = docs
                // transform the words column into an array of words
                .Select(
                    fileCol,
                    Functions.Split(
                        nameof(FileDataParse.Words).AsColumn(), " ")
                        .Alias(wordList))
                // flatten into one row per word
                .Select(
                    fileCol,
                    Functions.Explode(wordList.AsColumn())
                        .Alias(word));

            // get frequency of word per document
            var termFrequency = words
                // group by attributes of file plus word
                .GroupBy(fileCol, Functions.Lower(word.AsColumn()).Alias(word))
                // generate the count
                .Count()
                // order by word count per file, descending
                .OrderBy(fileCol, count.AsColumn().Desc());

            // count by word
            termFrequency.CreateOrReplaceTempView(nameof(termFrequency));

            // now count frequency of word across all documents
            var documentFrequency = words
                .GroupBy(Functions.Lower(word.AsColumn()).Alias(word))
                .Agg(Functions.CountDistinct(fileCol).Alias(docFrequency));

            documentFrequency.CreateOrReplaceTempView(nameof(documentFrequency));