Example #1
0
        public void TestThreadLocalSessions()
        {
            SparkSession.ClearActiveSession();

            void testChildThread(string appName)
            {
                var thread = new Thread(() =>
                {
                    Assert.Null(SparkSession.GetActiveSession());

                    SparkSession.SetActiveSession(
                        SparkSession.Builder().AppName(appName).GetOrCreate());

                    // Since we are in the child thread, GetActiveSession() should return the child
                    // SparkSession.
                    SparkSession activeSession = SparkSession.GetActiveSession();
                    Assert.NotNull(activeSession);
                    Assert.Equal(appName, activeSession.Conf().Get("spark.app.name", null));
                });

                thread.Start();
                thread.Join();
            }

            for (int i = 0; i < 5; ++i)
            {
                testChildThread(i.ToString());
            }

            Assert.Null(SparkSession.GetActiveSession());
        }
Example #2
0
        public void TestSignaturesV2_3_X()
        {
            Assert.IsType <SparkContext>(_spark.SparkContext);

            Assert.IsType <Builder>(SparkSession.Builder());

            SparkSession.ClearDefaultSession();
            SparkSession.SetDefaultSession(_spark);
            Assert.IsType <SparkSession>(SparkSession.GetDefaultSession());

            Assert.IsType <RuntimeConfig>(_spark.Conf());

            Assert.IsType <SparkSession>(_spark.NewSession());

            Assert.IsType <DataFrameReader>(_spark.Read());

            Assert.IsType <DataFrame>(_spark.Range(10));
            Assert.IsType <DataFrame>(_spark.Range(10, 100));
            Assert.IsType <DataFrame>(_spark.Range(10, 100, 10));
            Assert.IsType <DataFrame>(_spark.Range(10, 100, 10, 5));

            _spark.Range(10).CreateOrReplaceTempView("testView");
            Assert.IsType <DataFrame>(_spark.Table("testView"));

            Assert.IsType <DataStreamReader>(_spark.ReadStream());

            Assert.IsType <UdfRegistration>(_spark.Udf());

            Assert.IsType <Catalog>(_spark.Catalog());
        }
Example #3
0
        public void TestSignaturesV2_3_X()
        {
            RuntimeConfig conf = _spark.Conf();

            conf.Set("stringKey", "stringValue");
            conf.Set("boolKey", false);
            conf.Set("longKey", 1234L);

            Assert.Equal("stringValue", conf.Get("stringKey"));
            Assert.Equal("false", conf.Get("boolKey"));
            Assert.Equal("1234", conf.Get("longKey"));

            conf.Unset("stringKey");
            Assert.Equal("defaultValue", conf.Get("stringKey", "defaultValue"));
            Assert.Equal("false", conf.Get("boolKey", "true"));
        }
        static void Main(string[] args)
        {
            if (args.Length != 2)
            {
                Console.Error.WriteLine(
                    "Usage: Remember to include input and output path as arguments");
                Environment.Exit(1);
            }

            var sparkConf = SparkConfUtils.GetSparkConfigurationForFilePath(args);

            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("Streaming example using Spark.NET")
                                 .GetOrCreate();

            if (sparkConf != null)
            {
                sparkConf.ToList().ForEach(kv => { spark.Conf().Set(kv.Key, kv.Value); });
            }


            var events = spark
                         .ReadStream()
                         .Format("eventhubs")
                         .Options(EventHubConnection.GetEventHubConnectionSettings(eventHubPartitionCount: 2))
                         .Load();

            var processedEvents = events
                                  .Select(
                FromJson(Col("body").Cast("string"), "temperature String, humidity String").As("Raw"),
                Col("properties"),
                Col("enqueuedTime")
                )
                                  .WithColumn("Raw.temperature", Col("Raw.temperature").Cast("double"))
                                  .WithColumn("Raw.humidity", Col("Raw.humidity").Cast("double"))
                                  .WithColumnRenamed("Raw.temperature", "temperature")
                                  .WithColumnRenamed("Raw.humidity", "humidity")
                                  .WithColumn("temperatureAlert", Col("temperature") >= 40)
                                  .SelectExpr("temperature", "humidity", "properties", "enqueuedTime", "temperatureAlert");

            processedEvents.PrintSchema();


            var streamingQuery = processedEvents
                                 .WriteStream()
                                 .OutputMode(OutputMode.Append)
                                 .Format("console")
                                 .Option("path", args[0])
                                 .Option("checkpointLocation", args[1])
                                 .Start();

            streamingQuery.AwaitTermination();
        }
Example #5
0
        public void TestUnicode()
        {
            string expected =
                "①Ⅻㄨㄩ 啊阿鼾齄丂丄狚狛狜狝﨨﨩ˊˋ˙–⿻〇㐀㐁㐃㐄䶴䶵U1[]U2[]U3[]";

            RuntimeConfig conf = _spark.Conf();
            string        key  = "SerDeTests.TestUnicode";

            conf.Set(key, expected);

            string actual = conf.Get(key);

            Assert.Equal(expected, actual);
        }
Example #6
0
        public HyperspaceTests(HyperspaceFixture fixture)
        {
            _spark = fixture.SparkFixture.Spark;
            _hyperspaceSystemDirectory = new TemporaryDirectory();
            _spark.Conf().Set("spark.hyperspace.system.path", _hyperspaceSystemDirectory.Path);
            _hyperspace = new Hyperspace(_spark);

            _sampleDataFrame = _spark.Read()
                               .Option("header", true)
                               .Option("delimiter", ";")
                               .Csv($"{TestEnvironment.ResourceDirectory}people.csv");
            _sampleIndexName   = "sample_dataframe";
            _sampleIndexConfig = new IndexConfig(_sampleIndexName, new[] { "job" }, new[] { "name" });
            _hyperspace.CreateIndex(_sampleDataFrame, _sampleIndexConfig);
        }
        public void TestSignaturesV2_4_X()
        {
            RuntimeConfig conf = _spark.Conf();

            conf.Set("stringKey", "stringValue");
            conf.Set("boolKey", false);
            conf.Set("longKey", 1234L);

            Assert.Equal("stringValue", conf.Get("stringKey"));
            Assert.Equal("false", conf.Get("boolKey"));
            Assert.Equal("1234", conf.Get("longKey"));

            conf.Unset("stringKey");
            Assert.Equal("defaultValue", conf.Get("stringKey", "defaultValue"));
            Assert.Equal("false", conf.Get("boolKey", "true"));

            Assert.True(conf.IsModifiable("spark.sql.streaming.checkpointLocation"));
            Assert.False(conf.IsModifiable("missingKey"));
        }
        public void Run(string[] args)
        {
            string servidoresKafka  = args[0];
            string connectionString = args.Length > 1 ? args[1] : string.Empty;

            // Obtém a referência ao contexto de execução do Spark
            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("Credit Card Category")
                                 .GetOrCreate();

            spark.Conf().Set("spark.sql.shuffle.partitions", "1");  // sem essa configuração, cada stage ficou com 200 tasks, o que levou uns 4 minutos pra cada batch executar

            // Criando um dataframe pra receber dados do Kafka
            DataFrame df = spark
                           .ReadStream()
                           .Format("kafka")
                           .Option("kafka.bootstrap.servers", servidoresKafka)
                           .Option("subscribe", "transactions")
                           .Load()
                           .SelectExpr("CAST(value AS STRING)");

            /* Criando schema pra validar o JSON que virá nas mensagens do Kafka
             * Exemplo do JSON:
             * {
             *      "transaction":"431",
             *      "number":"0015-0000-0000-0000",
             *      "lat":-23.1618,
             *      "lng":-46.47201,
             *      "amount":91.01487,
             *      "category":"pets",
             *      "eventTime":"2021-01-05T19:07:19.3888"
             * }
             */
            var schema = new StructType(new[]
            {
                new StructField("transaction", new StringType()),
                new StructField("number", new StringType()),
                new StructField("lat", new DoubleType()),
                new StructField("lng", new DoubleType()),
                new StructField("amount", new DoubleType()),
                new StructField("category", new StringType()),
                new StructField("eventTime", new TimestampType())
            });

            // Fazendo o parse do JSON pra um array ...
            df = df.WithColumn("json", FromJson(
                                   Col("value"),
                                   schema.SimpleString)
                               )
                 .Select("json.*"); // ... e retornando todas as colunas do array como um novo dataframe

            // Colocando um limite de 7 minutos para receber os eventos atrasados
            df = df.WithWatermark("eventTime", "7 minutes");

            // Somando os valores gastos, agrupando por categoria e por janelas de 2 minutos que se iniciam a cada 1 minuto
            df = df.GroupBy(Window(Col("eventTime"), "2 minutes", "1 minutes"), Col("category"))
                 .Sum("amount").WithColumnRenamed("sum(amount)", "total")
                 .Select(Col("window.start"), Col("window.end"), Col("category"), Col("total"));

            // Colocando o streaming pra funcionar e gravando os dados retornados
            StreamingQuery query = df
                                   .WriteStream()
                                   .Format("console")
                                   .OutputMode(OutputMode.Update)
                                   //.Foreach(new MySQLForeachWriter(connectionString))    // Descomentar pra gravar em banco de dados
                                   .Start();

            query.AwaitTermination();
        }
Example #9
0
        public void Run(string[] args)
        {
            string kafkaBrokers = args[0];
            double maxSpeed     = double.Parse(args[1]);

            // Obtém a referência ao contexto de execução do Spark
            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("Credit Card Fraud")
                                 .GetOrCreate();

            spark.Conf().Set("spark.sql.shuffle.partitions", "1");  // sem essa configuração, cada stage ficou com 200 tasks, o que levou uns 4 minutos pra cada batch executar

            // Criando um dataframe pra receber dados do Kafka
            DataFrame df = spark
                           .ReadStream()
                           .Format("kafka")
                           .Option("kafka.bootstrap.servers", kafkaBrokers)
                           .Option("subscribe", "transactions")
                           .Load()
                           .SelectExpr("CAST(value AS STRING)");

            /* Criando schema pra validar o JSON que virá nas mensagens do Kafka
             * Exemplo do JSON:
             * {
             *      "transaction":"431",
             *      "number":"0015-0000-0000-0000",
             *      "lat":-23.1618,
             *      "lng":-46.47201,
             *      "amount":91.01487,
             *      "category":"pets",
             *      "eventTime":"2021-01-05T19:07:19.3888"
             * }
             */
            var schema = new StructType(new[]
            {
                new StructField("transaction", new StringType()),
                new StructField("number", new StringType()),
                new StructField("lat", new DoubleType()),
                new StructField("lng", new DoubleType()),
                new StructField("amount", new DoubleType()),
                new StructField("category", new StringType()),
                new StructField("eventTime", new TimestampType())
            });

            // Fazendo o parse do JSON pra um array ...
            df = df.WithColumn("json", FromJson(
                                   df.Col("value"),
                                   schema.SimpleString)
                               )
                 .Select("json.*"); // ... e retornando todas as colunas do array como um novo dataframe

            // Gerando dois dataframes distintos para poder fazer o join e analisar a correção entre as transações
            DataFrame df1 = df
                            .WithWatermark("eventTime", "7 minutes");
            DataFrame df2 = df
                            .WithColumnRenamed("transaction", "transaction2")
                            .WithColumnRenamed("lat", "lat2")
                            .WithColumnRenamed("lng", "lng2")
                            .WithColumnRenamed("eventTime", "eventTime2")
                            .WithWatermark("eventTime2", "7 minutes");

            // Efetuando o join para verificar a correlação de transações dos cartões de crédito
            DataFrame dfJoin = df1.Join(df2,
                                        df1.Col("number").EqualTo(df2.Col("number"))
                                        .And(Col("transaction").NotEqual(Col("transaction2")))
                                        .And(Col("eventTime2").Between(Col("eventTime"), Col("eventTime") + Expr("interval 5 minutes")))
                                        );

            //Registrando uma função personalizada pra ser usada no dataframe
            spark.Udf().Register <double, double, double, double, double>("CalculateDistance", (lat1, lng1, lat2, lng2) => CalculateDistance(lat1, lng1, lat2, lng2));
            spark.Udf().Register <double, Timestamp, Timestamp, double>("CalculateSpeed", (dist, eventTime, eventTime2) => CalculateSpeed(dist, eventTime, eventTime2));

            // Criando novas colunas para armazenar a execução do código da UDF
            dfJoin = dfJoin.WithColumn("dist", CallUDF("CalculateDistance", Col("lat"), Col("lng"), Col("lat2"), Col("lng2")));
            dfJoin = dfJoin.WithColumn("speed", CallUDF("CalculateSpeed", Col("dist"), Col("eventTime"), Col("eventTime2")));

            // Filtrando as transações que tiverem a velocidade acima do esperado (parâmetro "maxSpeed")
            dfJoin = dfJoin.Where(Col("speed").Gt(maxSpeed));

            // Colocando o streaming pra funcionar

            StreamingQuery query = dfJoin
                                   .WriteStream()
                                   .Format("console")
                                   .Option("truncate", "false")
                                   .OutputMode(OutputMode.Append)
                                   .Start();

            query.AwaitTermination();
        }