Example #1
0
        /// <summary>
        /// Streams CSV rows found under <paramref name="streamInputPath"/> into a
        /// <c>TestForeachWriter</c> and blocks until the streaming query terminates.
        /// </summary>
        /// <param name="streamInputPath">Location handed to the streaming CSV reader.</param>
        public static void RunSparkStream(string streamInputPath)
        {
            // Sink that receives every streamed row.
            var rowSink = new TestForeachWriter();

            SparkSession session = SparkSession.Builder().AppName("itur").GetOrCreate();

            // Every CSV column is declared as a string.
            var csvColumns = new[]
            {
                new StructField("IturCode", new Microsoft.Spark.Sql.Types.StringType()),
                new StructField("IturERP", new Microsoft.Spark.Sql.Types.StringType()),
                new StructField("QuantityEdit", new Microsoft.Spark.Sql.Types.StringType()),
                new StructField("PartialQuantity", new Microsoft.Spark.Sql.Types.StringType())
            };
            var csvSchema = new Microsoft.Spark.Sql.Types.StructType(csvColumns);

            DataFrame csvRows = session.ReadStream().Schema(csvSchema).Csv(streamInputPath);

            // Processing-time trigger with an interval argument of 5000.
            var rowWriter = csvRows
                .WriteStream()
                .Foreach(rowSink)
                .Trigger(Trigger.ProcessingTime(5000));

            // The running query is kept in the shared s_query field.
            s_query = rowWriter.Start();
            s_query.AwaitTermination();
        }
Example #2
0
        /// <summary>
        /// Reads JSON messages from a Kafka topic, scores the <c>opiniao</c> field of each
        /// message with the <c>AnaliseDeSentimento</c> UDF and prints the result to the console.
        /// </summary>
        /// <param name="args">
        /// <c>args[0]</c>: Kafka bootstrap servers; <c>args[1]</c>: topic to subscribe to;
        /// <c>args[2]</c>: model argument forwarded to <c>AnalisarSentimento</c>.
        /// </param>
        public void Run(string[] args)
        {
            // Report a usage error instead of failing with an IndexOutOfRangeException
            // (or NullReferenceException) when arguments are missing.
            if (args == null || args.Length < 3)
            {
                System.Console.Error.WriteLine(
                    "Usage: <kafka-bootstrap-servers> <topic> <model>");
                System.Environment.Exit(1);
            }

            string servidoresKafka = args[0];
            string topico          = args[1];
            string modelo          = args[2];

            // Get a reference to the Spark execution context
            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("Exemplo Streaming com Kafka")
                                 .GetOrCreate();

            // Create a dataframe that receives data from Kafka
            DataFrame df = spark
                           .ReadStream()
                           .Format("kafka")
                           .Option("kafka.bootstrap.servers", servidoresKafka)
                           .Option("subscribe", topico)
                           .Load()
                           .SelectExpr("CAST(value AS STRING)");

            /* Schema used to parse the JSON carried by the Kafka messages.
             * Example JSON:
             * {
             *      "cliente": "Fulano",
             *      "produto": "Mochila",
             *      "opiniao": "Muito boa!"
             * }
             */
            var schema = new StructType(new[]
            {
                new StructField("cliente", new StringType()),
                new StructField("produto", new StringType()),
                new StructField("opiniao", new StringType())
            }); // struct<cliente:string,produto:string,opiniao:string>

            // Parse the JSON into a struct column ...
            df = df.WithColumn("json", Functions.FromJson(
                                   df.Col("value"),
                                   schema.SimpleString)
                               )
                 .Select("json.*"); // ... and return all of its fields as a new dataframe

            // Register a custom function to be used on the dataframe
            spark.Udf().Register<string, float>("AnaliseDeSentimento",
                                                (texto) => AnalisarSentimento(texto, modelo));
            // Add a "nota" column holding the sentiment-analysis result
            df = df.WithColumn("nota", Functions.CallUDF("AnaliseDeSentimento", df.Col("opiniao")));

            // Start the streaming query
            StreamingQuery query = df
                                   .WriteStream()
                                   .OutputMode(OutputMode.Append)
                                   .Format("console")
                                   //.Trigger(Trigger.Continuous(2000))
                                   //.Foreach(new RedisForeachWriter())
                                   .Start();

            query.AwaitTermination();   // Keeps the application alive so it can process data
        }
Example #3
0
        /// <summary>
        /// Streams text lines from a socket on localhost:9999, pairs each line with the
        /// output of <c>Predict</c> and prints the exploded result to the console.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            string socketHost = "localhost";
            int    socketPort = 9999;

            SparkSession session =
                SparkSession.Builder().AppName("Emotion_Prediction").GetOrCreate();

            DataFrame incoming = session.ReadStream()
                                        .Format("socket")
                                        .Option("host", socketHost)
                                        .Option("port", socketPort)
                                        .Load();

            // UDF producing a two-element array: the raw text and " => " + its prediction.
            Func<Column, Column> withPrediction =
                Udf<string, string[]>(text => new string[] { text, " => " + Predict(text) });

            // Explode turns the two-element array into two output rows per input line.
            Column exploded = Explode(withPrediction(incoming["value"]));
            DataFrame predictions = incoming.Select(exploded);

            StreamingQuery running = predictions.WriteStream().Format("console").Start();
            running.AwaitTermination();
        }
Example #4
0
        /// <summary>
        /// Reads a socket stream from localhost:9000, selects its <c>value</c> column and
        /// prints it to the console in append mode until the query terminates.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            SparkSession session =
                SparkSession.Builder().AppName(".NET for Spark Streaming").GetOrCreate();

            // Source: text socket on localhost:9000.
            DataFrame socketRows = session.ReadStream()
                                          .Format("socket")
                                          .Option("host", "localhost")
                                          .Option("port", 9000)
                                          .Load();

            DataFrame values = socketRows.Select(Col("value"));

            // Sink: console, append mode.
            StreamingQuery running = values.WriteStream()
                                           .OutputMode(OutputMode.Append)
                                           .Format("console")
                                           .Start();

            running.AwaitTermination();
        }
Example #5
0
        /// <summary>
        /// Calls the session-level members of <c>_spark</c> and asserts the runtime type
        /// of each returned object.
        /// </summary>
        public void TestSignaturesV2_3_X()
        {
            const string viewName = "testView";

            // Context and builder accessors.
            Assert.IsType<SparkContext>(_spark.SparkContext);
            Assert.IsType<Builder>(SparkSession.Builder());

            // Default-session management: clear, set, then read back.
            SparkSession.ClearDefaultSession();
            SparkSession.SetDefaultSession(_spark);
            Assert.IsType<SparkSession>(SparkSession.GetDefaultSession());

            Assert.IsType<RuntimeConfig>(_spark.Conf());
            Assert.IsType<SparkSession>(_spark.NewSession());
            Assert.IsType<DataFrameReader>(_spark.Read());

            // Range overloads with one to four arguments.
            Assert.IsType<DataFrame>(_spark.Range(10));
            Assert.IsType<DataFrame>(_spark.Range(10, 100));
            Assert.IsType<DataFrame>(_spark.Range(10, 100, 10));
            Assert.IsType<DataFrame>(_spark.Range(10, 100, 10, 5));

            // Register a temp view, then look it up by name.
            _spark.Range(10).CreateOrReplaceTempView(viewName);
            Assert.IsType<DataFrame>(_spark.Table(viewName));

            Assert.IsType<DataStreamReader>(_spark.ReadStream());
            Assert.IsType<UdfRegistration>(_spark.Udf());
            Assert.IsType<Catalog>(_spark.Catalog());
        }
        /// <summary>
        /// Calls the <c>DataStreamReader</c> members on a single reader instance and asserts
        /// the runtime type of each returned object.
        /// </summary>
        public void TestSignaturesV2_4_X()
        {
            // Resolves a file name inside the test resource directory.
            string Resource(string fileName) =>
                Path.Combine(TestEnvironment.ResourceDirectory, fileName);

            DataStreamReader reader = _spark.ReadStream();

            Assert.IsType<DataStreamReader>(reader.Format("parquet"));

            // Schema can be supplied as a StructType or as a DDL string.
            var singleColumnSchema = new StructType(new[]
            {
                new StructField("columnName", new IntegerType())
            });
            Assert.IsType<DataStreamReader>(reader.Schema(singleColumnSchema));
            Assert.IsType<DataStreamReader>(reader.Schema("columnName bigint"));

            // Option overloads: string, bool, long and double values.
            Assert.IsType<DataStreamReader>(reader.Option("key", "value"));
            Assert.IsType<DataStreamReader>(reader.Option("key", true));
            Assert.IsType<DataStreamReader>(reader.Option("key", long.MaxValue));
            Assert.IsType<DataStreamReader>(reader.Option("key", double.MaxValue));

            // Options with an empty and a populated dictionary.
            Assert.IsType<DataStreamReader>(reader.Options(new Dictionary<string, string>()));
            var singleOption = new Dictionary<string, string>
            {
                { "key", "value" }
            };
            Assert.IsType<DataStreamReader>(reader.Options(singleOption));

            string jsonFilePath = Resource("people.json");

            // Format-specific loaders.
            Assert.IsType<DataFrame>(reader.Format("json").Load(jsonFilePath));
            Assert.IsType<DataFrame>(reader.Json(jsonFilePath));
            Assert.IsType<DataFrame>(reader.Csv(Resource("people.csv")));
            Assert.IsType<DataFrame>(reader.Orc(Resource("users.orc")));
            Assert.IsType<DataFrame>(reader.Parquet(Resource("users.parquet")));
            Assert.IsType<DataFrame>(reader.Text(Resource("people.txt")));

            // In Spark 3.1.1+ setting the `path` Option and then calling .Load(path) is not
            // supported unless `spark.sql.legacy.pathOptionBehavior.enabled` conf is set.
            // .Json(path), .Parquet(path), etc follow the same code path so the conf
            // needs to be set in these scenarios as well.
            // NOTE(review): presumably this is why the `path` option assertion is the last
            // call made on the shared reader - confirm before reordering.
            Assert.IsType<DataFrame>(reader.Format("json").Option("path", jsonFilePath).Load());
        }
        /// <summary>
        /// Counts words arriving on a socket stream (localhost:65001) over sliding windows
        /// of "30 seconds" that slide every "10 seconds", keyed on a timestamp column added
        /// with <c>CurrentTimestamp()</c>, and prints the complete result to the console.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            var hostname = "localhost";
            var port     = 65001;

            var windowDuration = "30 seconds";
            var slideDuration  = "10 seconds";

            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("StructuredNetworkWordCountWindowed")
                                 .GetOrCreate();

            spark.SparkContext.SetLogLevel("warn");

            DataFrame lines = spark
                              .ReadStream()
                              .Format("socket")
                              .Option("host", hostname)
                              .Option("port", port)
                              .Load();

            // NOTE(review): the column is named "DayOfTheWeek" but is computed with
            // DayOfYear - confirm which one was intended. The column is not referenced
            // by the aggregation below.
            var linesWithTime = lines
                                .WithColumn("timestamp", CurrentTimestamp())
                                .WithColumn("DayOfTheWeek", DayOfYear(Col("timestamp")));

            // Split each line on spaces, then emit one row per word.
            var words = linesWithTime
                        .WithColumn("words", Split(Col("value"), " "));

            var word = words.WithColumn("word", Explode(Col("words")));

            // Count occurrences per (window, word), newest window first.
            var windowedCounts = word
                                 .GroupBy(Window(Col("timestamp"), windowDuration, slideDuration),
                                          Col("word"))
                                 .Count()
                                 .OrderBy(Desc("window"));

            // Output mode is set once; the original set "complete" twice.
            var query = windowedCounts
                        .WriteStream()
                        .Format("console")
                        .Option("truncate", false)
                        .OutputMode(OutputMode.Complete)
                        .Start();

            query.AwaitTermination();
        }
Example #8
0
        /// <summary>
        /// Counts words read from a socket stream over sliding event-time windows and
        /// prints the complete, window-ordered result to the console.
        /// </summary>
        /// <param name="args">
        /// <c>&lt;hostname&gt; &lt;port&gt; &lt;window duration in seconds&gt;
        /// [&lt;slide duration in seconds&gt;]</c>. When the slide duration is omitted the
        /// window duration is used. The process exits with code 1 on a wrong argument
        /// count or when the slide duration exceeds the window duration.
        /// </param>
        public void Run(string[] args)
        {
            if (args.Length != 3 && args.Length != 4)
            {
                Console.Error.WriteLine(
                    "Usage: StructuredNetworkWordCountWindowed " +
                    "<hostname> <port> <window duration in seconds> " +
                    "[<slide duration in seconds>]");
                Environment.Exit(1);
            }

            string hostname   = args[0];
            var    port       = int.Parse(args[1]);
            var    windowSize = int.Parse(args[2]);
            var    slideSize  = (args.Length == 3) ? windowSize : int.Parse(args[3]);

            if (slideSize > windowSize)
            {
                Console.Error.WriteLine(
                    "<slide duration> must be less than or equal " +
                    "to <window duration>");
                // Stop here, like the usage check above, instead of continuing
                // with durations that were just reported as invalid.
                Environment.Exit(1);
            }
            var windowDuration = $"{windowSize} seconds";
            var slideDuration  = $"{slideSize} seconds";

            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("StructuredNetworkWordCountWindowed")
                                 .GetOrCreate();

            // includeTimestamp makes the source provide the "timestamp" column used below.
            DataFrame lines = spark
                              .ReadStream()
                              .Format("socket")
                              .Option("host", hostname)
                              .Option("port", port)
                              .Option("includeTimestamp", true)
                              .Load();

            // One row per word, keeping the timestamp of the originating line.
            DataFrame words = lines
                              .Select(Explode(Split(lines["value"], " "))
                                      .Alias("word"), lines["timestamp"]);

            // Count per (window, word), ordered by window.
            DataFrame windowedCounts = words
                                       .GroupBy(Window(words["timestamp"], windowDuration, slideDuration),
                                                words["word"])
                                       .Count()
                                       .OrderBy("window");

            Spark.Sql.Streaming.StreamingQuery query = windowedCounts
                                                       .WriteStream()
                                                       .OutputMode("complete")
                                                       .Format("console")
                                                       .Option("truncate", false)
                                                       .Start();

            query.AwaitTermination();
        }
        /// <summary>
        /// Reads events from the "eventhubs" source, extracts temperature and humidity from
        /// the JSON body, flags temperatures of 40 or more and prints the rows to the console.
        /// </summary>
        /// <param name="args">
        /// Exactly two arguments are required: <c>args[0]</c> is passed as the sink's
        /// <c>path</c> option and <c>args[1]</c> as its <c>checkpointLocation</c> option.
        /// </param>
        static void Main(string[] args)
        {
            if (args.Length != 2)
            {
                Console.Error.WriteLine(
                    "Usage: Remember to include input and output path as arguments");
                Environment.Exit(1);
            }

            var sparkConf = SparkConfUtils.GetSparkConfigurationForFilePath(args);

            SparkSession spark = SparkSession.Builder()
                                             .AppName("Streaming example using Spark.NET")
                                             .GetOrCreate();

            // Copy any extra settings into the session configuration.
            if (sparkConf != null)
            {
                foreach (var setting in sparkConf)
                {
                    spark.Conf().Set(setting.Key, setting.Value);
                }
            }

            var eventHubOptions =
                EventHubConnection.GetEventHubConnectionSettings(eventHubPartitionCount: 2);

            var events = spark.ReadStream()
                              .Format("eventhubs")
                              .Options(eventHubOptions)
                              .Load();

            // Parse the body as JSON into a "Raw" struct and keep the two metadata columns.
            var parsed = events.Select(
                FromJson(Col("body").Cast("string"), "temperature String, humidity String").As("Raw"),
                Col("properties"),
                Col("enqueuedTime"));

            // Cast both readings to double under temporary dotted column names ...
            var typed = parsed
                .WithColumn("Raw.temperature", Col("Raw.temperature").Cast("double"))
                .WithColumn("Raw.humidity", Col("Raw.humidity").Cast("double"));

            // ... then rename those columns to their final names.
            var renamed = typed
                .WithColumnRenamed("Raw.temperature", "temperature")
                .WithColumnRenamed("Raw.humidity", "humidity");

            var processedEvents = renamed
                .WithColumn("temperatureAlert", Col("temperature") >= 40)
                .SelectExpr("temperature", "humidity", "properties", "enqueuedTime", "temperatureAlert");

            processedEvents.PrintSchema();

            var consoleWriter = processedEvents
                .WriteStream()
                .OutputMode(OutputMode.Append)
                .Format("console")
                .Option("path", args[0])
                .Option("checkpointLocation", args[1]);

            var streamingQuery = consoleWriter.Start();
            streamingQuery.AwaitTermination();
        }
Example #10
0
 /// <summary>
 /// Stores the logger, obtains the Spark session and prepares the Kafka input
 /// dataframe, keeping the message value cast to a string.
 /// </summary>
 /// <param name="logger">Logger stored in <c>_logger</c>.</param>
 public Worker(ILogger<Worker> logger)
 {
     _logger = logger;

     spark = SparkSession.Builder().AppName("meuovo").GetOrCreate();

     // Kafka source: broker and topic are fixed values.
     var kafkaMessages = spark.ReadStream()
                              .Format("kafka")
                              .Option("kafka.bootstrap.servers", "localhost:9092")
                              .Option("subscribe", "b7f45352-6abf-436b-9c4a-98141699728c")
                              .Load();

     input = kafkaMessages.SelectExpr("CAST(value AS STRING)");
 }
        /// <summary>
        /// Reads ';'-separated lines from a socket stream (localhost:65001), splits them
        /// into <c>date</c>, <c>var</c> and <c>consumption</c> columns, counts rows per
        /// sliding window and <c>var</c>, and prints the complete result to the console.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            var hostname = "localhost";
            var port     = 65001;

            var windowDuration = "30 seconds";
            var slideDuration  = "10 seconds";

            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("StructuredNetworkWordCountWindowed")
                                 .GetOrCreate();

            spark.SparkContext.SetLogLevel("warn");

            DataFrame lines = spark
                              .ReadStream()
                              .Format("socket")
                              .Option("host", hostname)
                              .Option("port", port)
                              .Load();

            // Field 0 -> date, field 1 -> var, field 2 -> consumption.
            var df = lines
                     .WithColumn("tab", Split(Col("value"), ";"))
                     .WithColumn("date", ToDate(Column("tab").GetItem(0)))
                     .WithColumn("var", Column("tab").GetItem(1))
                     .WithColumn("consumption", Column("tab").GetItem(2));

            // Count per (window, var), newest window first.
            var windowedCounts = df
                                 .GroupBy(Window(Col("date"), windowDuration, slideDuration),
                                          Col("var"))
                                 .Count()
                                 .OrderBy(Desc("window"));

            // Output mode is set once; the original set "complete" twice.
            var query = windowedCounts
                        .WriteStream()
                        .Format("console")
                        .Option("truncate", false)
                        .OutputMode(OutputMode.Complete)
                        .Start();

            query.AwaitTermination();
        }
Example #12
0
        /// <summary>
        /// Streams a Delta source table into a Delta sink table with two one-time
        /// micro-batches and validates the sink content after each batch.
        /// </summary>
        public void TestStreamingScenario()
        {
            using var tempDirectory = new TemporaryDirectory();

            string sourcePath = Path.Combine(tempDirectory.Path, "source-delta-table");
            string sinkPath = Path.Combine(tempDirectory.Path, "sink-delta-table");
            string checkpointPath = Path.Combine(tempDirectory.Path, "checkpoints");

            // Seed the source Delta table with [0, 1, 2, 3, 4].
            _spark.Range(0, 5).Write().Format("delta").Save(sourcePath);

            // Stream from the source table to the sink table. "One-time micro-batch"
            // triggers keep the test synchronous and deterministic.
            var sourceStream = _spark.ReadStream().Format("delta").Load(sourcePath);
            DataStreamWriter dataStreamWriter = sourceStream
                .WriteStream()
                .Format("delta")
                .OutputMode("append")
                .Option("checkpointLocation", checkpointPath);

            // First batch.
            dataStreamWriter.Trigger(Trigger.Once()).Start(sinkPath).AwaitTermination();

            // The sink must now hold exactly the seeded range.
            DeltaTable sink = DeltaTable.ForPath(sinkPath);
            ValidateRangeDataFrame(Enumerable.Range(0, 5), sink.ToDF());

            // Append [5, 6, 7, 8, 9] to the source and run a second batch.
            _spark.Range(5, 10).Write().Format("delta").Mode("append").Save(sourcePath);
            dataStreamWriter.Trigger(Trigger.Once()).Start(sinkPath).AwaitTermination();

            // The appended rows must have reached the sink as well.
            ValidateRangeDataFrame(Enumerable.Range(0, 10), sink.ToDF());
        }
        /// <summary>
        /// Builds a Kafka streaming dataframe for topic "twitterraw" on localhost:9092,
        /// keeps the message value cast to a string, and prints its schema.
        /// No streaming query is started.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            SparkSession session = SparkSession.Builder().AppName("Test example").GetOrCreate();

            DataFrame kafkaMessages = session.ReadStream()
                                             .Format("kafka")
                                             .Option("kafka.bootstrap.servers", "localhost:9092")
                                             .Option("subscribe", "twitterraw")
                                             .Load();

            DataFrame messageText = kafkaMessages.SelectExpr("CAST(value AS STRING)");

            messageText.PrintSchema();
        }
Example #14
0
        /// <summary>
        /// Reads text from a socket stream, evaluates each entry with the <c>MLudf</c>
        /// UDF (backed by <c>Sentiment</c>) through Spark SQL, and prints the results
        /// to the console.
        /// </summary>
        /// <param name="args">
        /// <c>&lt;host&gt; &lt;port&gt; &lt;model path&gt;</c>; the process exits with
        /// code 1 when the argument count is not exactly three.
        /// </param>
        public void Run(string[] args)
        {
            if (args.Length != 3)
            {
                Console.Error.WriteLine(
                    "Usage: SentimentAnalysisStream <host> <port> <model path>");
                Environment.Exit(1);
            }

            // Create Spark Session
            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("Streaming Sentiment Analysis")
                                 .GetOrCreate();

            // Setup stream connection info
            string hostname = args[0];
            string port     = args[1];

            // Copy the model path into its own local so the UDF lambda captures only
            // this string rather than the whole args array.
            string modelPath = args[2];

            // Read streaming data into DataFrame
            DataFrame words = spark
                              .ReadStream()
                              .Format("socket")
                              .Option("host", hostname)
                              .Option("port", port)
                              .Load();

            // Use ML.NET in a UDF to evaluate each incoming entry
            spark.Udf().Register<string, bool>(
                "MLudf",
                input => Sentiment(input, modelPath));

            // Use Spark SQL to call ML.NET UDF
            // Display results of sentiment analysis on each entry
            words.CreateOrReplaceTempView("WordsSentiment");
            DataFrame sqlDf = spark
                              .Sql("SELECT WordsSentiment.value, MLudf(WordsSentiment.value) FROM WordsSentiment");

            // Handle data continuously as it arrives
            StreamingQuery query = sqlDf
                                   .WriteStream()
                                   .Format("console")
                                   .Start();

            query.AwaitTermination();
        }
        /// <summary>
        /// Creates the Spark session and defines a socket streaming dataframe for
        /// host "spark", port 5050. The dataframe is not used and no query is started.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            string socketHost = "spark";
            int    socketPort = 5050;

            SparkSession session = SparkSession.Builder()
                                               .AppName("Streaming example with dotnet")
                                               .GetOrCreate();

            // Only the source is defined here; nothing consumes it.
            DataFrame socketLines = session.ReadStream()
                                           .Format("socket")
                                           .Option("host", socketHost)
                                           .Option("port", socketPort)
                                           .Load();
        }
        /// <summary>
        /// Counts the words carried by Kafka message values and prints the complete
        /// running totals to the console.
        /// </summary>
        /// <param name="args">
        /// <c>&lt;bootstrap-servers&gt; &lt;subscribe-type&gt; &lt;topics&gt;</c>; the
        /// process exits with code 1 when the argument count is not exactly three.
        /// </param>
        public void Run(string[] args)
        {
            if (args.Length != 3)
            {
                Console.Error.WriteLine(
                    "Usage: StructuredKafkaWordCount " +
                    "<bootstrap-servers> <subscribe-type> <topics>");
                Environment.Exit(1);
            }

            string bootstrapServers = args[0];
            string subscribeType    = args[1];
            string topics           = args[2];

            SparkSession session =
                SparkSession.Builder().AppName("StructuredKafkaWordCount").GetOrCreate();

            // The subscribe type is used as the option key and the topics as its value.
            DataFrame lines = session.ReadStream()
                                     .Format("kafka")
                                     .Option("kafka.bootstrap.servers", bootstrapServers)
                                     .Option(subscribeType, topics)
                                     .Load()
                                     .SelectExpr("CAST(value AS STRING)");

            // One row per space-separated word.
            var wordColumn = Explode(Split(lines["value"], " ")).Alias("word");
            DataFrame words = lines.Select(wordColumn);

            DataFrame wordCounts = words.GroupBy("word").Count();

            Spark.Sql.Streaming.StreamingQuery query =
                wordCounts.WriteStream().OutputMode("complete").Format("console").Start();

            query.AwaitTermination();
        }
Example #17
0
        /// <summary>
        /// Calls the <c>DataStreamReader</c> members on a single reader instance and asserts
        /// the runtime type of each returned object.
        /// </summary>
        public void TestSignaturesV2_3_X()
        {
            // Resolves a file name inside the test resource directory.
            string Resource(string fileName) =>
                Path.Combine(TestEnvironment.ResourceDirectory, fileName);

            DataStreamReader reader = _spark.ReadStream();

            Assert.IsType<DataStreamReader>(reader.Format("parquet"));

            // Schema can be supplied as a StructType or as a DDL string.
            var singleColumnSchema = new StructType(new[]
            {
                new StructField("columnName", new IntegerType())
            });
            Assert.IsType<DataStreamReader>(reader.Schema(singleColumnSchema));
            Assert.IsType<DataStreamReader>(reader.Schema("columnName bigint"));

            // Option overloads: string, bool, long and double values.
            Assert.IsType<DataStreamReader>(reader.Option("key", "value"));
            Assert.IsType<DataStreamReader>(reader.Option("key", true));
            Assert.IsType<DataStreamReader>(reader.Option("key", long.MaxValue));
            Assert.IsType<DataStreamReader>(reader.Option("key", double.MaxValue));

            // Options with an empty and a populated dictionary.
            Assert.IsType<DataStreamReader>(reader.Options(new Dictionary<string, string>()));
            var singleOption = new Dictionary<string, string>
            {
                { "key", "value" }
            };
            Assert.IsType<DataStreamReader>(reader.Options(singleOption));

            string jsonFilePath = Resource("people.json");

            // Load via the `path` option first, then via explicit path arguments.
            Assert.IsType<DataFrame>(reader.Format("json").Option("path", jsonFilePath).Load());
            Assert.IsType<DataFrame>(reader.Format("json").Load(jsonFilePath));
            Assert.IsType<DataFrame>(reader.Json(jsonFilePath));
            Assert.IsType<DataFrame>(reader.Csv(Resource("people.csv")));
            Assert.IsType<DataFrame>(reader.Orc(Resource("users.orc")));
            Assert.IsType<DataFrame>(reader.Parquet(Resource("users.parquet")));
            Assert.IsType<DataFrame>(reader.Text(Resource("people.txt")));
        }
        /// <summary>
        /// Streams lines from a socket, maps each line through a UDF that returns the line
        /// and the line followed by its length, and prints the exploded rows to the console.
        /// </summary>
        /// <param name="args">
        /// Optional <c>&lt;hostname&gt; &lt;port&gt;</c>. With any other argument count the
        /// stream is read from localhost:9999.
        /// </param>
        public void Run(string[] args)
        {
            // Only an exact host/port pair overrides the localhost:9999 default.
            bool endpointSupplied = args.Length == 2;
            string hostname = endpointSupplied ? args[0] : "localhost";
            int    port     = endpointSupplied ? int.Parse(args[1]) : 9999;

            SparkSession session =
                SparkSession.Builder().AppName("Streaming example with a UDF").GetOrCreate();

            DataFrame lines = session.ReadStream()
                                     .Format("socket")
                                     .Option("host", hostname)
                                     .Option("port", port)
                                     .Load();

            // UDF result: [original string, original string + " " + its length].
            Func<Column, Column> withLength =
                Udf<string, string[]>(text => new string[] { text, $"{text} {text.Length}" });

            // Explode emits one output row per array element.
            Column exploded = Explode(withLength(lines["value"]));
            DataFrame arrayDF = lines.Select(exploded);

            // Process and display each incoming line.
            StreamingQuery running = arrayDF.WriteStream().Format("console").Start();
            running.AwaitTermination();
        }
        /// <summary>
        /// Counts space-separated words read from a socket stream and prints the complete
        /// running totals to the console.
        /// </summary>
        /// <param name="args">
        /// <c>&lt;hostname&gt; &lt;port&gt;</c>; the process exits with code 1 when the
        /// argument count is not exactly two.
        /// </param>
        public void Run(string[] args)
        {
            if (args.Length != 2)
            {
                Console.Error.WriteLine(
                    "Usage: StructuredNetworkWordCount <hostname> <port>");
                Environment.Exit(1);
            }

            string hostname = args[0];
            int    port     = int.Parse(args[1]);

            SparkSession session =
                SparkSession.Builder().AppName("StructuredNetworkWordCount").GetOrCreate();

            DataFrame lines = session.ReadStream()
                                     .Format("socket")
                                     .Option("host", hostname)
                                     .Option("port", port)
                                     .Load();

            // One row per space-separated word.
            var wordColumn = Explode(Split(lines["value"], " ")).Alias("word");
            DataFrame words = lines.Select(wordColumn);

            DataFrame wordCounts = words.GroupBy("word").Count();

            Spark.Sql.Streaming.StreamingQuery query =
                wordCounts.WriteStream().OutputMode("complete").Format("console").Start();

            query.AwaitTermination();
        }
        /// <summary>
        /// On a thread-pool task, opens a ';'-separated CSV stream into the shared
        /// <c>tsRSDF</c> dataframe, starts a "jdbc"-format streaming write of it and waits
        /// for that query to terminate.
        /// </summary>
        /// <param name="sparkSession">Session used to create the streaming reader.</param>
        /// <returns>A task that completes when the streaming query has terminated.</returns>
        public static async Task StreamingRead(SparkSession sparkSession)
        {
            await Task.Run(() =>
            {
                var csvReader = sparkSession.ReadStream()
                                .Option("sep", ";")
                                //.Option("header", "true")
                                .Schema("ttuser string, ttmessage string, ttage integer");
                                //.Schema("userId integer, movieId integer, rating double, timestamp string")

                tsRSDF = csvReader.Csv("file:///mnt/e/OneDrive/WorkingSpace/TestDir/ReadStreamTest/input/");

                // Text or web page -> SQL Server -> streamed into SQL; after the
                // computation below runs, the result is pushed to a temporary table again.
                var jdbcWriter = tsRSDF
                                 .WriteStream().Format("jdbc")
                                 .Option("url", "jdbc:sqlserver://127.0.0.1:1433")
                                 .Option("databaseName", "sparkDB")
                                 .Option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
                                 .Option("dbtable", "TestTable")
                                 .Option("user", "spark")
                                 .Option("password", "aspcore");

                // Blocks this task until the query stops.
                var running = jdbcWriter.Start();
                running.AwaitTermination();
            });
        }
Example #21
0
        /// <summary>
        /// Calls the <c>DataStreamWriter</c> members on a writer built from a "rate" source
        /// and asserts that each call returns a <c>DataStreamWriter</c>.
        /// </summary>
        public void TestSignaturesV2_3_X()
        {
            // "rate" source configured with rowsPerSecond = 1.
            DataFrame rateRows = _spark.ReadStream()
                                       .Format("rate")
                                       .Option("rowsPerSecond", 1)
                                       .Load();

            DataStreamWriter writer = rateRows.WriteStream();

            // Output mode as a string and as an enum value.
            Assert.IsType<DataStreamWriter>(writer.OutputMode("append"));
            Assert.IsType<DataStreamWriter>(writer.OutputMode(OutputMode.Append));

            Assert.IsType<DataStreamWriter>(writer.Format("json"));

            // Option overloads: string, bool, long and double values.
            Assert.IsType<DataStreamWriter>(writer.Option("stringOption", "value"));
            Assert.IsType<DataStreamWriter>(writer.Option("boolOption", true));
            Assert.IsType<DataStreamWriter>(writer.Option("longOption", 1L));
            Assert.IsType<DataStreamWriter>(writer.Option("doubleOption", 3D));

            var twoOptions = new Dictionary<string, string>
            {
                { "option1", "value1" },
                { "option2", "value2" }
            };
            Assert.IsType<DataStreamWriter>(writer.Options(twoOptions));

            // PartitionBy with one and with two column names.
            Assert.IsType<DataStreamWriter>(writer.PartitionBy("age"));
            Assert.IsType<DataStreamWriter>(writer.PartitionBy("age", "name"));

            Assert.IsType<DataStreamWriter>(writer.QueryName("queryName"));

            Assert.IsType<DataStreamWriter>(writer.Trigger(Trigger.Once()));
        }
Example #22
0
        /// <summary>
        /// Streams card transactions from the Kafka topic "transactions", self-joins them to
        /// pair transactions on the same card number that occur within five minutes of each
        /// other, computes distance and speed between each pair through registered UDFs, and
        /// prints to the console the pairs whose speed exceeds the given threshold.
        /// Blocks until the streaming query terminates.
        /// </summary>
        /// <param name="args">
        /// <c>args[0]</c>: Kafka bootstrap servers; <c>args[1]</c>: maximum accepted speed,
        /// parsed with <see cref="double.Parse(string)"/>.
        /// </param>
        public void Run(string[] args)
        {
            string kafkaBrokers = args[0];
            double maxSpeed = double.Parse(args[1]);

            // Get a reference to the Spark execution context.
            SparkSession spark = SparkSession.Builder()
                                             .AppName("Credit Card Fraud")
                                             .GetOrCreate();

            // Without this setting every stage ended up with 200 tasks, which made each batch
            // take roughly 4 minutes to run.
            spark.Conf().Set("spark.sql.shuffle.partitions", "1");

            // DataFrame that receives the raw messages from Kafka, value cast to string.
            DataFrame rawMessages = spark.ReadStream()
                                         .Format("kafka")
                                         .Option("kafka.bootstrap.servers", kafkaBrokers)
                                         .Option("subscribe", "transactions")
                                         .Load()
                                         .SelectExpr("CAST(value AS STRING)");

            /* Schema used to parse the JSON carried by the Kafka messages.
             * Sample JSON:
             * {
             *      "transaction":"431",
             *      "number":"0015-0000-0000-0000",
             *      "lat":-23.1618,
             *      "lng":-46.47201,
             *      "amount":91.01487,
             *      "category":"pets",
             *      "eventTime":"2021-01-05T19:07:19.3888"
             * }
             */
            var transactionSchema = new StructType(new[]
            {
                new StructField("transaction", new StringType()),
                new StructField("number", new StringType()),
                new StructField("lat", new DoubleType()),
                new StructField("lng", new DoubleType()),
                new StructField("amount", new DoubleType()),
                new StructField("category", new StringType()),
                new StructField("eventTime", new TimestampType())
            });

            // Parse the JSON into a struct column, then expand its fields into top-level columns.
            var parsedJson = FromJson(rawMessages.Col("value"), transactionSchema.SimpleString);
            DataFrame transactions = rawMessages.WithColumn("json", parsedJson)
                                                .Select("json.*");

            // Two distinct DataFrames so the stream can be joined with itself to correlate
            // transactions; the second one gets renamed columns to avoid name clashes.
            DataFrame firstSide = transactions.WithWatermark("eventTime", "7 minutes");
            DataFrame secondSide = transactions.WithColumnRenamed("transaction", "transaction2")
                                               .WithColumnRenamed("lat", "lat2")
                                               .WithColumnRenamed("lng", "lng2")
                                               .WithColumnRenamed("eventTime", "eventTime2")
                                               .WithWatermark("eventTime2", "7 minutes");

            // Join condition: same card, different transaction, second event within five
            // minutes after the first one.
            var sameCard = firstSide.Col("number").EqualTo(secondSide.Col("number"));
            var differentTransaction = Col("transaction").NotEqual(Col("transaction2"));
            var withinFiveMinutes = Col("eventTime2").Between(Col("eventTime"), Col("eventTime") + Expr("interval 5 minutes"));

            DataFrame pairedTransactions = firstSide.Join(
                secondSide,
                sameCard.And(differentTransaction).And(withinFiveMinutes));

            // Register the custom functions used by the DataFrame below.
            spark.Udf().Register<double, double, double, double, double>(
                "CalculateDistance",
                (lat1, lng1, lat2, lng2) => CalculateDistance(lat1, lng1, lat2, lng2));
            spark.Udf().Register<double, Timestamp, Timestamp, double>(
                "CalculateSpeed",
                (dist, eventTime, eventTime2) => CalculateSpeed(dist, eventTime, eventTime2));

            // Add the UDF results as new columns and keep only the pairs whose speed is above
            // the expected limit (the "maxSpeed" parameter).
            DataFrame suspiciousPairs = pairedTransactions
                .WithColumn("dist", CallUDF("CalculateDistance", Col("lat"), Col("lng"), Col("lat2"), Col("lng2")))
                .WithColumn("speed", CallUDF("CalculateSpeed", Col("dist"), Col("eventTime"), Col("eventTime2")))
                .Where(Col("speed").Gt(maxSpeed));

            // Start the streaming query, writing untruncated rows to the console.
            StreamingQuery query = suspiciousPairs.WriteStream()
                                                  .Format("console")
                                                  .Option("truncate", "false")
                                                  .OutputMode(OutputMode.Append)
                                                  .Start();

            query.AwaitTermination();
        }
        /// <summary>
        /// Streams card transactions from the Kafka topic "transactions", sums the amount per
        /// category over 2-minute windows that start every minute, and prints the updated
        /// totals to the console. Blocks until the streaming query terminates.
        /// </summary>
        /// <param name="args">
        /// <c>args[0]</c>: Kafka bootstrap servers; <c>args[1]</c> (optional): connection
        /// string, only referenced by the disabled database sink.
        /// </param>
        public void Run(string[] args)
        {
            string servidoresKafka = args[0];
            string connectionString = args.Length > 1 ? args[1] : string.Empty;

            // Get a reference to the Spark execution context.
            SparkSession spark = SparkSession.Builder()
                                             .AppName("Credit Card Category")
                                             .GetOrCreate();

            // Without this setting every stage ended up with 200 tasks, which made each batch
            // take roughly 4 minutes to run.
            spark.Conf().Set("spark.sql.shuffle.partitions", "1");

            // DataFrame that receives the raw messages from Kafka, value cast to string.
            DataFrame rawMessages = spark.ReadStream()
                                         .Format("kafka")
                                         .Option("kafka.bootstrap.servers", servidoresKafka)
                                         .Option("subscribe", "transactions")
                                         .Load()
                                         .SelectExpr("CAST(value AS STRING)");

            /* Schema used to parse the JSON carried by the Kafka messages.
             * Sample JSON:
             * {
             *      "transaction":"431",
             *      "number":"0015-0000-0000-0000",
             *      "lat":-23.1618,
             *      "lng":-46.47201,
             *      "amount":91.01487,
             *      "category":"pets",
             *      "eventTime":"2021-01-05T19:07:19.3888"
             * }
             */
            var transactionSchema = new StructType(new[]
            {
                new StructField("transaction", new StringType()),
                new StructField("number", new StringType()),
                new StructField("lat", new DoubleType()),
                new StructField("lng", new DoubleType()),
                new StructField("amount", new DoubleType()),
                new StructField("category", new StringType()),
                new StructField("eventTime", new TimestampType())
            });

            // Parse the JSON into a struct column, then expand its fields into top-level columns.
            var parsedJson = FromJson(Col("value"), transactionSchema.SimpleString);
            DataFrame transactions = rawMessages.WithColumn("json", parsedJson)
                                                .Select("json.*");

            // Accept late events up to 7 minutes behind.
            DataFrame watermarked = transactions.WithWatermark("eventTime", "7 minutes");

            // Sum the spent amounts, grouped by category and by 2-minute windows that start
            // every 1 minute, then expose window bounds, category and total as columns.
            var slidingWindow = Window(Col("eventTime"), "2 minutes", "1 minutes");
            DataFrame totalsByCategory = watermarked.GroupBy(slidingWindow, Col("category"))
                                                    .Sum("amount")
                                                    .WithColumnRenamed("sum(amount)", "total")
                                                    .Select(Col("window.start"), Col("window.end"), Col("category"), Col("total"));

            // Start the streaming query and emit the returned data.
            // To persist into a database instead, add before Start():
            //     .Foreach(new MySQLForeachWriter(connectionString))
            StreamingQuery query = totalsByCategory.WriteStream()
                                                   .Format("console")
                                                   .OutputMode(OutputMode.Update)
                                                   .Start();

            query.AwaitTermination();
        }