示例#1
0
        /// <summary>
        /// Streams text from a socket source on localhost:9999, emits each incoming value
        /// together with its <c>Predict</c> result, and prints the rows to the console.
        /// Blocks until the streaming query terminates.
        /// </summary>
        /// <param name="args">Command-line arguments; not read by this method.</param>
        static void Main(string[] args)
        {
            var socketHost = "localhost";
            var socketPort = 9999;

            SparkSession session = SparkSession.Builder().AppName("Emotion_Prediction").GetOrCreate();

            DataFrame incoming = session.ReadStream()
                .Format("socket")
                .Option("host", socketHost)
                .Option("port", socketPort)
                .Load();

            // The UDF maps one input string to a two-element array:
            // the original text, and " => " followed by the Predict output.
            Func<Column, Column> withPrediction = Udf<string, string[]>(
                text => new string[] { text, " => " + Predict(text) });

            // Explode the array produced for the "value" column into individual rows.
            Column exploded = Explode(withPrediction(incoming["value"]));
            DataFrame predictions = incoming.Select(exploded);

            StreamingQuery consoleQuery = predictions.WriteStream().Format("console").Start();
            consoleQuery.AwaitTermination();
        }
        /// <summary>
        /// Starts two console streaming queries and exercises the
        /// <c>StreamingQueryManager</c> members: <c>Active</c>, <c>Get</c>,
        /// <c>ResetTerminated</c> and <c>AwaitAnyTermination</c>.
        /// Both queries are stopped even when an assertion fails, so they do not stay
        /// registered on the <c>_spark</c> session used by this test class.
        /// </summary>
        public void TestSignaturesV2_3_X()
        {
            var intMemoryStream = new MemoryStream<int>(_spark);
            StreamingQuery sq1 = intMemoryStream
                .ToDF().WriteStream().QueryName("intQuery").Format("console").Start();

            try
            {
                string id1 = sq1.Id;

                var stringMemoryStream = new MemoryStream<string>(_spark);
                StreamingQuery sq2 = stringMemoryStream
                    .ToDF().WriteStream().QueryName("stringQuery").Format("console").Start();

                try
                {
                    string id2 = sq2.Id;

                    StreamingQueryManager sqm = _spark.Streams();

                    // Both queries started above must be reported as active.
                    StreamingQuery[] streamingQueries = sqm.Active().ToArray();
                    Assert.Equal(2, streamingQueries.Length);

                    Assert.IsType<StreamingQuery>(sqm.Get(id1));
                    Assert.IsType<StreamingQuery>(sqm.Get(id2));

                    sqm.ResetTerminated();

                    // Timed overload, so the call returns even though nothing terminates.
                    sqm.AwaitAnyTermination(10);
                }
                finally
                {
                    sq2.Stop();
                }
            }
            finally
            {
                sq1.Stop();
            }
        }
示例#3
0
        /// <summary>
        /// Streams JSON messages from a Kafka topic, scores the "opiniao" field of every
        /// message with the "AnaliseDeSentimento" UDF, and prints the rows to the console.
        /// Blocks until the streaming query terminates.
        /// </summary>
        /// <param name="args">
        /// <c>args[0]</c>: Kafka bootstrap servers; <c>args[1]</c>: topic to subscribe to;
        /// <c>args[2]</c>: model argument forwarded to <c>AnalisarSentimento</c>.
        /// </param>
        /// <exception cref="System.ArgumentException">
        /// <paramref name="args"/> is null or holds fewer than three entries.
        /// </exception>
        public void Run(string[] args)
        {
            // Fail with a usage message instead of an IndexOutOfRangeException.
            if (args == null || args.Length < 3)
            {
                throw new System.ArgumentException(
                    "Expected arguments: <kafka bootstrap servers> <topic> <model>",
                    nameof(args));
            }

            string servidoresKafka = args[0];
            string topico          = args[1];
            string modelo          = args[2];

            // Get (or create) the Spark session for this application.
            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("Exemplo Streaming com Kafka")
                                 .GetOrCreate();

            // Streaming DataFrame fed by Kafka; only the message value is kept, cast to a string.
            DataFrame df = spark
                           .ReadStream()
                           .Format("kafka")
                           .Option("kafka.bootstrap.servers", servidoresKafka)
                           .Option("subscribe", topico)
                           .Load()
                           .SelectExpr("CAST(value AS STRING)");

            /* Schema used to parse the JSON carried by the Kafka messages.
             * Example JSON:
             * {
             *      "cliente": "Fulano",
             *      "produto": "Mochila",
             *      "opiniao": "Muito boa!"
             * }
             */
            var schema = new StructType(new[]
            {
                new StructField("cliente", new StringType()),
                new StructField("produto", new StringType()),
                new StructField("opiniao", new StringType())
            }); // struct<cliente:string,produto:string,opiniao:string>

            // Parse the JSON text into a struct column named "json" ...
            df = df.WithColumn("json", Functions.FromJson(
                                   df.Col("value"),
                                   schema.SimpleString)
                               )
                 .Select("json.*"); // ... and promote the struct's fields to top-level columns.

            // Register the custom function so it can be called by name on the DataFrame.
            spark.Udf().Register<string, float>("AnaliseDeSentimento",
                                                (texto) => AnalisarSentimento(texto, modelo));

            // New column "nota" holds the sentiment-analysis result for "opiniao".
            df = df.WithColumn("nota", Functions.CallUDF("AnaliseDeSentimento", df.Col("opiniao")));

            // Start the streaming query; the commented-out calls are optional alternatives.
            StreamingQuery query = df
                                   .WriteStream()
                                   .OutputMode(OutputMode.Append)
                                   .Format("console")
                                   //.Trigger(Trigger.Continuous(2000))
                                   //.Foreach(new RedisForeachWriter())
                                   .Start();

            // Keep the application alive so that incoming data keeps being processed.
            query.AwaitTermination();
        }
示例#4
0
        /// <summary>
        /// Starts a run-once console query over an in-memory int stream and touches the
        /// <c>StreamingQuery</c> members (AwaitTermination, Name, Id, RunId, IsActive,
        /// Explain, Exception, Stop) to check their signatures and return types.
        /// </summary>
        public void TestSignaturesV2_4_X()
        {
            var source = new MemoryStream<int>(_spark);

            StreamingQuery query = source.ToDF()
                .WriteStream()
                .QueryName("testQuery")
                .Format("console")
                .Trigger(Trigger.Once())
                .Start();

            // Unbounded wait first, then the overload that takes a timeout.
            query.AwaitTermination();
            bool terminatedInTime = query.AwaitTermination(10);
            Assert.IsType<bool>(terminatedInTime);

            // Identification properties.
            Assert.IsType<string>(query.Name);
            Assert.IsType<string>(query.Id);
            Assert.IsType<string>(query.RunId);

            bool active = query.IsActive();
            Assert.IsType<bool>(active);

            query.Explain();

            // No exception is expected to be recorded on the query.
            Assert.Null(query.Exception());

            query.Stop();
        }
示例#5
0
        /// <summary>
        /// Reads a socket stream from localhost:9000, selects its "value" column and
        /// writes the rows to the console in append mode until the query terminates.
        /// </summary>
        /// <param name="args">Command-line arguments; not read by this method.</param>
        static void Main(string[] args)
        {
            SparkSession session = SparkSession.Builder()
                .AppName(".NET for Spark Streaming")
                .GetOrCreate();

            // Socket source configuration.
            var reader = session.ReadStream().Format("socket");
            reader = reader.Option("host", "localhost");
            reader = reader.Option("port", 9000);
            DataFrame socketLines = reader.Load();

            DataFrame values = socketLines.Select(Col("value"));

            // Console sink, append output mode.
            var writer = values.WriteStream().OutputMode(OutputMode.Append).Format("console");
            StreamingQuery running = writer.Start();

            running.AwaitTermination();
        }
示例#6
0
        /// <summary>
        /// Streams CSV data from <paramref name="streamInputPath"/> using a fixed
        /// four-column string schema and hands the rows to a <c>TestForeachWriter</c>.
        /// The started query is stored in <c>s_query</c> and awaited until it terminates.
        /// </summary>
        /// <param name="streamInputPath">Path passed to the streaming CSV reader.</param>
        public static void RunSparkStream(string streamInputPath)
        {
            SparkSession session = SparkSession.Builder().AppName("itur").GetOrCreate();

            // Every column is read as a string.
            var columns = new[]
            {
                new StructField("IturCode", new Microsoft.Spark.Sql.Types.StringType()),
                new StructField("IturERP", new Microsoft.Spark.Sql.Types.StringType()),
                new StructField("QuantityEdit", new Microsoft.Spark.Sql.Types.StringType()),
                new StructField("PartialQuantity", new Microsoft.Spark.Sql.Types.StringType())
            };
            var csvSchema = new Microsoft.Spark.Sql.Types.StructType(columns);

            DataFrame rows = session.ReadStream().Schema(csvSchema).Csv(streamInputPath);

            // Sink: custom foreach writer, driven by a ProcessingTime(5000) trigger.
            var sink = new TestForeachWriter();
            s_query = rows.WriteStream()
                .Foreach(sink)
                .Trigger(Trigger.ProcessingTime(5000))
                .Start();

            s_query.AwaitTermination();
        }
示例#7
0
        /// <summary>
        /// Runs the console streaming query over <c>input</c> (defined outside this method)
        /// for the lifetime of the service, restarting it whenever it terminates.
        /// </summary>
        /// <remarks>
        /// The work runs on a background task so this method returns immediately instead of
        /// blocking the caller. The query is awaited in short timed slices so that a
        /// cancellation request is noticed, and the query is stopped before the task ends.
        /// </remarks>
        /// <param name="stoppingToken">Signals that the service is shutting down.</param>
        /// <returns>A task that completes when the query loop has exited.</returns>
        protected override Task ExecuteAsync(CancellationToken stoppingToken)
        {
            // Upper bound on how long a single wait may delay reacting to cancellation.
            const long PollIntervalMs = 1000;

            return Task.Run(() =>
            {
                while (!stoppingToken.IsCancellationRequested)
                {
                    StreamingQuery query = input.WriteStream()
                                           .OutputMode(Microsoft.Spark.Sql.Streaming.OutputMode.Append)
                                           .Format("console")
                                           .Start();

                    try
                    {
                        // The timed overload returns a bool; loop until the query has
                        // terminated or shutdown has been requested.
                        bool terminated = false;
                        while (!terminated && !stoppingToken.IsCancellationRequested)
                        {
                            terminated = query.AwaitTermination(PollIntervalMs);
                        }
                    }
                    finally
                    {
                        // Do not leave the query running after shutdown or a failure.
                        if (query.IsActive())
                        {
                            query.Stop();
                        }
                    }
                }
            }, stoppingToken);
        }
示例#8
0
        /// <summary>
        /// Reads text from a socket stream, evaluates every entry with the "MLudf" UDF
        /// (backed by <c>Sentiment</c>) through Spark SQL, and prints the results to the console.
        /// </summary>
        /// <param name="args">
        /// Exactly three values: host, port and model path. With any other count the usage
        /// text is written to standard error and the process exits with code 1.
        /// </param>
        public void Run(string[] args)
        {
            if (args.Length != 3)
            {
                Console.Error.WriteLine(
                    "Usage: SentimentAnalysisStream <host> <port> <model path>");
                Environment.Exit(1);
            }

            SparkSession session = SparkSession.Builder()
                .AppName("Streaming Sentiment Analysis")
                .GetOrCreate();

            // Connection details for the socket source.
            string socketHost = args[0];
            string socketPort = args[1];

            DataFrame entries = session.ReadStream()
                .Format("socket")
                .Option("host", socketHost)
                .Option("port", socketPort)
                .Load();

            // Expose the scoring function to Spark SQL under the name "MLudf".
            session.Udf().Register<string, bool>(
                "MLudf",
                text => Sentiment(text, args[2]));

            // Query the stream through a temp view so the UDF can be called from SQL.
            entries.CreateOrReplaceTempView("WordsSentiment");
            DataFrame scored = session.Sql(
                "SELECT WordsSentiment.value, MLudf(WordsSentiment.value) FROM WordsSentiment");

            // Print results as data arrives; block until the query ends.
            StreamingQuery running = scored.WriteStream().Format("console").Start();
            running.AwaitTermination();
        }
示例#9
0
        /// <summary>
        /// Subscribes to a fixed Kafka topic on localhost:9092, casts each message value to a
        /// string and prints the rows to the console in append mode until the query terminates.
        /// </summary>
        /// <param name="args">Command-line arguments; not read by this method.</param>
        public static void Main(string[] args)
        {
            var session = SparkSession.Builder().AppName("meuovo").GetOrCreate();

            // Kafka source configuration.
            var kafkaReader = session.ReadStream().Format("kafka");
            kafkaReader = kafkaReader.Option("kafka.bootstrap.servers", "localhost:9092");
            kafkaReader = kafkaReader.Option("subscribe", "b7f45352-6abf-436b-9c4a-98141699728c");

            var messages = kafkaReader.Load().SelectExpr("CAST(value AS STRING)");

            StreamingQuery running = messages.WriteStream()
                .OutputMode(OutputMode.Append)
                .Format("console")
                .Start();

            running.AwaitTermination();
        }
示例#10
0
        /// <summary>
        /// Checks the <c>DataStreamWriter.ToTable</c> signature: a parquet stream with a
        /// temporary checkpoint location is written to "output_table" and then stopped.
        /// The body runs inside <c>WithTable</c>, which receives the session and the table name.
        /// </summary>
        public void TestSignaturesV3_1_X()
        {
            const string OutputTable = "output_table";

            WithTable(
                _spark,
                new[] { OutputTable },
                () =>
            {
                // The checkpoint folder is disposed when the block exits.
                using (var checkpointDir = new TemporaryDirectory())
                {
                    var source = new MemoryStream<int>(_spark);

                    DataStreamWriter writer = source.ToDF().WriteStream().Format("parquet");
                    writer = writer.Option("checkpointLocation", checkpointDir.Path);

                    StreamingQuery tableQuery = writer.ToTable(OutputTable);
                    tableQuery.Stop();
                }
            });
        }
        /// <summary>
        /// Streams lines from a socket, expands each line into two rows through an
        /// array-producing UDF, and prints the rows to the console until the query terminates.
        /// </summary>
        /// <param name="args">
        /// When exactly two values are given they are used as host and port;
        /// otherwise localhost:9999 is used.
        /// </param>
        public void Run(string[] args)
        {
            // Only an exact <host> <port> pair overrides the defaults.
            bool endpointSupplied = args.Length == 2;
            string socketHost = endpointSupplied ? args[0] : "localhost";
            int socketPort = endpointSupplied ? int.Parse(args[1]) : 9999;

            SparkSession session = SparkSession.Builder()
                .AppName("Streaming example with a UDF")
                .GetOrCreate();

            DataFrame incoming = session.ReadStream()
                .Format("socket")
                .Option("host", socketHost)
                .Option("port", socketPort)
                .Load();

            // The UDF returns a two-element array:
            //   [0] the original string
            //   [1] the original string followed by a space and its length
            Func<Column, Column> toPair = Udf<string, string[]>(
                text => new string[] { text, $"{text} {text.Length}" });

            Column exploded = Explode(toPair(incoming["value"]));
            DataFrame expanded = incoming.Select(exploded);

            // Display every incoming line as it is processed.
            StreamingQuery running = expanded.WriteStream().Format("console").Start();
            running.AwaitTermination();
        }
示例#12
0
        /// <summary>
        /// Verifies <c>ForeachBatch</c> on a CSV file stream: two input files are processed,
        /// each batch is written to its own folder named after the batch id, and the
        /// written ids equal the input ids shifted by the two UDFs (+200, then +100).
        /// </summary>
        public void TestForeachBatch()
        {
            // Scratch folders: one feeds the stream, the other receives ForeachBatch output.
            using var inputDir = new TemporaryDirectory();
            using var outputDir = new TemporaryDirectory();

            // Declared outside the callback; the callback below captures it.
            Func<Column, Column> outerUdf = Udf<int, int>(value => value + 100);

            // First input file, id column: [0, 1, ..., 9]
            WriteCsv(0, 10, Path.Combine(inputDir.Path, "input1.csv"));

            DataFrame source = _spark.ReadStream().Schema("id INT").Csv(inputDir.Path);
            DataStreamWriter writer = source.WriteStream().ForeachBatch((batch, batchId) =>
            {
                Func<Column, Column> innerUdf = Udf<int, int>(value => value + 200);
                string batchFolder = Path.Combine(outputDir.Path, batchId.ToString());
                batch.Select(outerUdf(innerUdf(Col("id")))).Write().Csv(batchFolder);
            });

            StreamingQuery query = writer.Start();

            // Wait until everything currently in the source has been processed and
            // committed to the ForeachBatch sink.
            query.ProcessAllAvailable();

            // A file added to the source folder is picked up by the running stream.
            // Second input file, id column: [10, 11, ..., 19]
            WriteCsv(10, 10, Path.Combine(inputDir.Path, "input2.csv"));

            // Same wait for the second file, then shut the query down.
            query.ProcessAllAvailable();
            query.Stop();

            // One output folder per batch is expected: "0" and "1".
            var expectedFolders = new[]
            {
                Path.Combine(outputDir.Path, "0"),
                Path.Combine(outputDir.Path, "1"),
            };
            string[] actualFolders = Directory
                .GetDirectories(outputDir.Path)
                .OrderBy(folder => folder)
                .ToArray();

            Assert.True(expectedFolders.SequenceEqual(actualFolders));

            // Read both generated folders back and compare the ids.
            DataFrame written = _spark.Read()
                .Schema("id INT")
                .Csv(actualFolders[0], actualFolders[1])
                .Sort("id");

            IEnumerable<int> writtenIds = written.Collect().Select(row => row.GetAs<int>("id"));
            IEnumerable<int> expectedIds = Enumerable.Range(300, 20);

            Assert.True(expectedIds.SequenceEqual(writtenIds));
        }
        /// <summary>
        /// Streams credit-card transactions (JSON) from the Kafka topic "transactions",
        /// sums the "amount" per category over sliding event-time windows, and prints the
        /// totals to the console in update mode. Blocks until the streaming query terminates.
        /// </summary>
        /// <param name="args">
        /// <c>args[0]</c>: Kafka bootstrap servers (required).
        /// <c>args[1]</c>: connection string (optional; only used by the commented-out
        /// <c>MySQLForeachWriter</c> sink).
        /// </param>
        /// <exception cref="System.ArgumentException">
        /// <paramref name="args"/> is null or empty.
        /// </exception>
        public void Run(string[] args)
        {
            // Fail with a usage message instead of an IndexOutOfRangeException.
            if (args == null || args.Length < 1)
            {
                throw new System.ArgumentException(
                    "Expected arguments: <kafka bootstrap servers> [connection string]",
                    nameof(args));
            }

            string servidoresKafka  = args[0];
            string connectionString = args.Length > 1 ? args[1] : string.Empty;

            // Get (or create) the Spark session for this application.
            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("Credit Card Category")
                                 .GetOrCreate();

            // Author's note: without this setting every stage had 200 tasks and each batch
            // took about 4 minutes to run.
            spark.Conf().Set("spark.sql.shuffle.partitions", "1");

            // Streaming DataFrame fed by Kafka; only the message value is kept, cast to a string.
            DataFrame df = spark
                           .ReadStream()
                           .Format("kafka")
                           .Option("kafka.bootstrap.servers", servidoresKafka)
                           .Option("subscribe", "transactions")
                           .Load()
                           .SelectExpr("CAST(value AS STRING)");

            /* Schema used to parse the JSON carried by the Kafka messages.
             * Example JSON:
             * {
             *      "transaction":"431",
             *      "number":"0015-0000-0000-0000",
             *      "lat":-23.1618,
             *      "lng":-46.47201,
             *      "amount":91.01487,
             *      "category":"pets",
             *      "eventTime":"2021-01-05T19:07:19.3888"
             * }
             */
            var schema = new StructType(new[]
            {
                new StructField("transaction", new StringType()),
                new StructField("number", new StringType()),
                new StructField("lat", new DoubleType()),
                new StructField("lng", new DoubleType()),
                new StructField("amount", new DoubleType()),
                new StructField("category", new StringType()),
                new StructField("eventTime", new TimestampType())
            });

            // Parse the JSON text into a struct column named "json" ...
            df = df.WithColumn("json", FromJson(
                                   Col("value"),
                                   schema.SimpleString)
                               )
                 .Select("json.*"); // ... and promote the struct's fields to top-level columns.

            // Accept late events up to 7 minutes behind the watermark.
            df = df.WithWatermark("eventTime", "7 minutes");

            // Sum the amounts per category over 2-minute windows that start every minute.
            df = df.GroupBy(Window(Col("eventTime"), "2 minutes", "1 minutes"), Col("category"))
                 .Sum("amount").WithColumnRenamed("sum(amount)", "total")
                 .Select(Col("window.start"), Col("window.end"), Col("category"), Col("total"));

            // Start the streaming query and emit the aggregated rows.
            StreamingQuery query = df
                                   .WriteStream()
                                   .Format("console")
                                   .OutputMode(OutputMode.Update)
                                   //.Foreach(new MySQLForeachWriter(connectionString))    // Uncomment to write to a database
                                   .Start();

            query.AwaitTermination();
        }
示例#14
0
        /// <summary>
        /// Streams credit-card transactions (JSON) from the Kafka topic "transactions",
        /// joins the stream with itself to pair different transactions of the same card that
        /// occur within five minutes of each other, computes distance and speed between the
        /// two through UDFs, and prints the pairs whose speed exceeds the given maximum.
        /// Blocks until the streaming query terminates.
        /// </summary>
        /// <param name="args">
        /// <c>args[0]</c>: Kafka bootstrap servers; <c>args[1]</c>: maximum speed, parsed
        /// with the invariant culture (for example <c>120.5</c>).
        /// </param>
        /// <exception cref="System.ArgumentException">
        /// <paramref name="args"/> is null or holds fewer than two entries.
        /// </exception>
        /// <exception cref="System.FormatException">
        /// <c>args[1]</c> is not a valid number.
        /// </exception>
        public void Run(string[] args)
        {
            // Fail with a usage message instead of an IndexOutOfRangeException.
            if (args == null || args.Length < 2)
            {
                throw new System.ArgumentException(
                    "Expected arguments: <kafka bootstrap servers> <max speed>",
                    nameof(args));
            }

            string kafkaBrokers = args[0];

            // Command-line input is machine-readable: parse it independently of the
            // machine's locale so "120.5" never turns into 1205 where ',' is the decimal mark.
            double maxSpeed = double.Parse(
                args[1],
                System.Globalization.CultureInfo.InvariantCulture);

            // Get (or create) the Spark session for this application.
            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("Credit Card Fraud")
                                 .GetOrCreate();

            // Author's note: without this setting every stage had 200 tasks and each batch
            // took about 4 minutes to run.
            spark.Conf().Set("spark.sql.shuffle.partitions", "1");

            // Streaming DataFrame fed by Kafka; only the message value is kept, cast to a string.
            DataFrame df = spark
                           .ReadStream()
                           .Format("kafka")
                           .Option("kafka.bootstrap.servers", kafkaBrokers)
                           .Option("subscribe", "transactions")
                           .Load()
                           .SelectExpr("CAST(value AS STRING)");

            /* Schema used to parse the JSON carried by the Kafka messages.
             * Example JSON:
             * {
             *      "transaction":"431",
             *      "number":"0015-0000-0000-0000",
             *      "lat":-23.1618,
             *      "lng":-46.47201,
             *      "amount":91.01487,
             *      "category":"pets",
             *      "eventTime":"2021-01-05T19:07:19.3888"
             * }
             */
            var schema = new StructType(new[]
            {
                new StructField("transaction", new StringType()),
                new StructField("number", new StringType()),
                new StructField("lat", new DoubleType()),
                new StructField("lng", new DoubleType()),
                new StructField("amount", new DoubleType()),
                new StructField("category", new StringType()),
                new StructField("eventTime", new TimestampType())
            });

            // Parse the JSON text into a struct column named "json" ...
            df = df.WithColumn("json", FromJson(
                                   df.Col("value"),
                                   schema.SimpleString)
                               )
                 .Select("json.*"); // ... and promote the struct's fields to top-level columns.

            // Two views of the same stream so it can be joined with itself; the second one
            // renames its columns so both sides can be told apart after the join.
            DataFrame df1 = df
                            .WithWatermark("eventTime", "7 minutes");
            DataFrame df2 = df
                            .WithColumnRenamed("transaction", "transaction2")
                            .WithColumnRenamed("lat", "lat2")
                            .WithColumnRenamed("lng", "lng2")
                            .WithColumnRenamed("eventTime", "eventTime2")
                            .WithWatermark("eventTime2", "7 minutes");

            // Join condition: same card number, different transaction, and the second event
            // falls between the first event and five minutes after it.
            DataFrame dfJoin = df1.Join(df2,
                                        df1.Col("number").EqualTo(df2.Col("number"))
                                        .And(Col("transaction").NotEqual(Col("transaction2")))
                                        .And(Col("eventTime2").Between(Col("eventTime"), Col("eventTime") + Expr("interval 5 minutes")))
                                        );

            // Register the custom functions so they can be called by name on the DataFrame.
            spark.Udf().Register<double, double, double, double, double>("CalculateDistance", (lat1, lng1, lat2, lng2) => CalculateDistance(lat1, lng1, lat2, lng2));
            spark.Udf().Register<double, Timestamp, Timestamp, double>("CalculateSpeed", (dist, eventTime, eventTime2) => CalculateSpeed(dist, eventTime, eventTime2));

            // New columns holding the UDF results.
            dfJoin = dfJoin.WithColumn("dist", CallUDF("CalculateDistance", Col("lat"), Col("lng"), Col("lat2"), Col("lng2")));
            dfJoin = dfJoin.WithColumn("speed", CallUDF("CalculateSpeed", Col("dist"), Col("eventTime"), Col("eventTime2")));

            // Keep only the pairs whose speed is above the "maxSpeed" argument.
            dfJoin = dfJoin.Where(Col("speed").Gt(maxSpeed));

            // Start the streaming query.
            StreamingQuery query = dfJoin
                                   .WriteStream()
                                   .Format("console")
                                   .Option("truncate", "false")
                                   .OutputMode(OutputMode.Append)
                                   .Start();

            query.AwaitTermination();
        }