// Smoke-tests the StreamingQuery surface: starts a run-once console query fed by an
// in-memory int stream, then checks the types returned by its members.
public void TestSignaturesV2_4_X()
{
    MemoryStream<int> source = new MemoryStream<int>(_spark);

    // Configure the sink separately so that starting the query is a distinct step.
    var writer = source
        .ToDF()
        .WriteStream()
        .QueryName("testQuery")
        .Format("console")
        .Trigger(Trigger.Once());
    StreamingQuery query = writer.Start();

    // Both AwaitTermination overloads are exercised: the blocking one and the timed one.
    query.AwaitTermination();
    Assert.IsType<bool>(query.AwaitTermination(10));

    // Identification members.
    Assert.IsType<string>(query.Name);
    Assert.IsType<string>(query.Id);
    Assert.IsType<string>(query.RunId);

    // State and diagnostics members.
    Assert.IsType<bool>(query.IsActive());
    query.Explain();
    Assert.Null(query.Exception());

    query.Stop();
}
/// <summary>
/// Reads text lines from a socket on localhost:9999, pairs each line with the output of
/// <c>Predict</c>, and prints the exploded result to the console until the query terminates.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    string socketHost = "localhost";
    int socketPort = 9999;

    SparkSession session = SparkSession.Builder().AppName("Emotion_Prediction").GetOrCreate();

    DataFrame incoming = session
        .ReadStream()
        .Format("socket")
        .Option("host", socketHost)
        .Option("port", socketPort)
        .Load();

    // UDF that yields two elements per input line: the line itself and its prediction.
    Func<Column, Column> withPrediction = Udf<string, string[]>(text =>
    {
        return new string[] { text, " => " + Predict(text) };
    });

    // Explode turns the two-element array into two separate output rows.
    Column exploded = Explode(withPrediction(incoming["value"]));
    DataFrame output = incoming.Select(exploded);

    StreamingQuery running = output.WriteStream().Format("console").Start();
    running.AwaitTermination();
}
/// <summary>
/// Streams JSON messages from a Kafka topic, scores the "opiniao" field with the
/// "AnaliseDeSentimento" UDF and prints the enriched rows to the console.
/// </summary>
/// <param name="args">
/// <c>args[0]</c>: Kafka bootstrap servers; <c>args[1]</c>: topic to subscribe to;
/// <c>args[2]</c>: model argument forwarded to <c>AnalisarSentimento</c>.
/// </param>
public void Run(string[] args)
{
    string servidoresKafka = args[0];
    string topico = args[1];
    string modelo = args[2];

    // Get a reference to the Spark execution context.
    SparkSession spark = SparkSession.Builder().AppName("Exemplo Streaming com Kafka").GetOrCreate();

    // Streaming DataFrame that receives the Kafka messages; only the payload is kept, as text.
    DataFrame mensagens = spark
        .ReadStream()
        .Format("kafka")
        .Option("kafka.bootstrap.servers", servidoresKafka)
        .Option("subscribe", topico)
        .Load()
        .SelectExpr("CAST(value AS STRING)");

    // Schema used to parse the JSON carried by each Kafka message. Example message:
    // {
    //     "cliente": "Fulano",
    //     "produto": "Mochila",
    //     "opiniao": "Muito boa!"
    // }
    StructField[] campos = new[]
    {
        new StructField("cliente", new StringType()),
        new StructField("produto", new StringType()),
        new StructField("opiniao", new StringType())
    };
    var schema = new StructType(campos);

    // Parse the JSON into a struct column, then promote its fields to top-level columns.
    Column json = Functions.FromJson(mensagens.Col("value"), schema.SimpleString);
    DataFrame opinioes = mensagens.WithColumn("json", json).Select("json.*");

    // Register the custom function so it can be called by name on the DataFrame.
    spark.Udf().Register<string, float>(
        "AnaliseDeSentimento",
        (texto) => AnalisarSentimento(texto, modelo));

    // New column "nota" holding the sentiment-analysis result.
    Column nota = Functions.CallUDF("AnaliseDeSentimento", opinioes.Col("opiniao"));
    DataFrame avaliadas = opinioes.WithColumn("nota", nota);

    // Start the streaming query.
    StreamingQuery query = avaliadas
        .WriteStream()
        .OutputMode(OutputMode.Append)
        .Format("console")
        //.Trigger(Trigger.Continuous(2000))
        //.Foreach(new RedisForeachWriter())
        .Start();

    // Required to keep the application alive so it keeps processing data.
    query.AwaitTermination();
}
/// <summary>
/// Echoes the "value" column of a socket stream (localhost:9000) to the console in
/// append mode, blocking until the streaming query terminates.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    SparkSession session = SparkSession.Builder().AppName(".NET for Spark Streaming").GetOrCreate();

    DataFrame socketLines = session
        .ReadStream()
        .Format("socket")
        .Option("host", "localhost")
        .Option("port", 9000)
        .Load();

    // Project only the text payload of each received line.
    Column payload = Col("value");
    DataFrame projected = socketLines.Select(payload);

    StreamingQuery running = projected
        .WriteStream()
        .OutputMode(OutputMode.Append)
        .Format("console")
        .Start();

    running.AwaitTermination();
}
/// <summary>
/// Streams CSV data from <paramref name="streamInputPath"/> using a fixed four-column string
/// schema and hands the rows to a <c>TestForeachWriter</c>, triggering every 5000 ms.
/// The started query is stored in <c>s_query</c> and awaited until it terminates.
/// </summary>
/// <param name="streamInputPath">Path passed to the streaming CSV reader.</param>
public static void RunSparkStream(string streamInputPath)
{
    var rowSink = new TestForeachWriter();

    SparkSession session = SparkSession.Builder().AppName("itur").GetOrCreate();

    // Every column is read as a string.
    StructField[] columns = new[]
    {
        new StructField("IturCode", new Microsoft.Spark.Sql.Types.StringType()),
        new StructField("IturERP", new Microsoft.Spark.Sql.Types.StringType()),
        new StructField("QuantityEdit", new Microsoft.Spark.Sql.Types.StringType()),
        new StructField("PartialQuantity", new Microsoft.Spark.Sql.Types.StringType())
    };
    var csvSchema = new Microsoft.Spark.Sql.Types.StructType(columns);

    DataFrame csvRows = session.ReadStream().Schema(csvSchema).Csv(streamInputPath);

    // Keep the running query in the static field, then block on it.
    s_query = csvRows
        .WriteStream()
        .Foreach(rowSink)
        .Trigger(Trigger.ProcessingTime(5000))
        .Start();

    s_query.AwaitTermination();
}
/// <summary>
/// Keeps a console streaming query over <c>input</c> running until shutdown is requested.
/// </summary>
/// <remarks>
/// The work is pushed to a background task because waiting on a streaming query blocks the
/// calling thread; running it inline would hold up the caller of <c>ExecuteAsync</c> for the
/// whole lifetime of the query. The query is also polled with a timeout instead of waited on
/// indefinitely, so a cancellation request is noticed while a query is running and the query
/// is stopped rather than left active.
/// </remarks>
/// <param name="stoppingToken">Signals that the service should stop.</param>
/// <returns>A task that completes once the streaming loop has exited.</returns>
protected override Task ExecuteAsync(CancellationToken stoppingToken)
{
    // How long each wait on the query lasts before the token is checked again.
    const long pollIntervalMs = 1000;

    return Task.Run(() =>
    {
        // As before, a query that terminates by itself is restarted until cancellation.
        while (!stoppingToken.IsCancellationRequested)
        {
            StreamingQuery query = input.WriteStream()
                .OutputMode(Microsoft.Spark.Sql.Streaming.OutputMode.Append)
                .Format("console")
                .Start();

            // The timed AwaitTermination returns true once the query has terminated.
            while (!query.AwaitTermination(pollIntervalMs))
            {
                if (stoppingToken.IsCancellationRequested)
                {
                    query.Stop();
                    break;
                }
            }
        }
    }, stoppingToken);
}
/// <summary>
/// Reads text from a socket stream, evaluates each entry with the "MLudf" UDF through
/// Spark SQL and prints the value together with the UDF result to the console.
/// </summary>
/// <param name="args">
/// Exactly three values: host, port and the model path forwarded to <c>Sentiment</c>.
/// Any other count prints a usage message to stderr and exits the process with code 1.
/// </param>
public void Run(string[] args)
{
    bool hasExpectedArgs = args.Length == 3;
    if (!hasExpectedArgs)
    {
        Console.Error.WriteLine(
            "Usage: SentimentAnalysisStream <host> <port> <model path>");
        Environment.Exit(1);
    }

    // Create the Spark session.
    SparkSession session = SparkSession
        .Builder()
        .AppName("Streaming Sentiment Analysis")
        .GetOrCreate();

    // Stream connection info; the port is handed to the socket source as text.
    string host = args[0];
    string portText = args[1];

    // Read the streaming data into a DataFrame.
    DataFrame entries = session
        .ReadStream()
        .Format("socket")
        .Option("host", host)
        .Option("port", portText)
        .Load();

    // Register the UDF that evaluates each incoming entry.
    session.Udf().Register<string, bool>(
        "MLudf",
        input => Sentiment(input, args[2]));

    // Expose the stream as a view so the UDF can be called from Spark SQL.
    entries.CreateOrReplaceTempView("WordsSentiment");
    DataFrame scored = session.Sql(
        "SELECT WordsSentiment.value, MLudf(WordsSentiment.value) FROM WordsSentiment");

    // Handle data continuously as it arrives.
    StreamingQuery running = scored.WriteStream().Format("console").Start();
    running.AwaitTermination();
}
/// <summary>
/// Subscribes to a fixed Kafka topic on localhost:9092, casts each message value to a
/// string and prints it to the console in append mode until the query terminates.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
public static void Main(string[] args)
{
    var session = SparkSession.Builder().AppName("meuovo").GetOrCreate();

    // Kafka source; only the message payload is kept, as text.
    var kafkaSource = session
        .ReadStream()
        .Format("kafka")
        .Option("kafka.bootstrap.servers", "localhost:9092")
        .Option("subscribe", "b7f45352-6abf-436b-9c4a-98141699728c")
        .Load();
    var payloads = kafkaSource.SelectExpr("CAST(value AS STRING)");

    StreamingQuery running = payloads
        .WriteStream()
        .OutputMode(OutputMode.Append)
        .Format("console")
        .Start();

    running.AwaitTermination();
}
/// <summary>
/// Reads lines from a socket stream and, for each line, emits two console rows: the line
/// itself and the line followed by its length.
/// </summary>
/// <param name="args">
/// Optional host and port. They are used only when exactly two arguments are supplied;
/// otherwise the stream is read from localhost:9999.
/// </param>
public void Run(string[] args)
{
    // Defaults, overridden only when the user supplies both host and port.
    string host = "localhost";
    int portNumber = 9999;
    bool userSuppliedEndpoint = args.Length == 2;
    if (userSuppliedEndpoint)
    {
        host = args[0];
        portNumber = int.Parse(args[1]);
    }

    SparkSession session = SparkSession
        .Builder()
        .AppName("Streaming example with a UDF")
        .GetOrCreate();

    DataFrame incoming = session
        .ReadStream()
        .Format("socket")
        .Option("host", host)
        .Option("port", portNumber)
        .Load();

    // UDF producing a two-element array:
    //   1) the original string
    //   2) the original string followed by its length
    Func<Column, Column> withLength = Udf<string, string[]>(text =>
    {
        return new string[] { text, $"{text} {text.Length}" };
    });

    // Explode the array so each element becomes its own output row.
    Column exploded = Explode(withLength(incoming["value"]));
    DataFrame output = incoming.Select(exploded);

    // Process and display each incoming line.
    StreamingQuery running = output.WriteStream().Format("console").Start();
    running.AwaitTermination();
}
/// <summary>
/// Streams credit-card transactions from the "transactions" Kafka topic and prints, in
/// update mode, the total amount spent per category over sliding event-time windows.
/// </summary>
/// <param name="args">
/// <c>args[0]</c>: Kafka bootstrap servers; <c>args[1]</c> (optional): connection string,
/// referenced only by the commented-out database writer below.
/// </param>
public void Run(string[] args)
{
    string servidoresKafka = args[0];
    string connectionString = args.Length > 1 ? args[1] : string.Empty;

    // Get a reference to the Spark execution context.
    SparkSession spark = SparkSession.Builder().AppName("Credit Card Category").GetOrCreate();

    // Original author's note: without this setting each stage ended up with 200 tasks,
    // which made every batch take about 4 minutes to run.
    spark.Conf().Set("spark.sql.shuffle.partitions", "1");

    // Streaming DataFrame that receives the Kafka messages; only the payload is kept, as text.
    DataFrame mensagens = spark
        .ReadStream()
        .Format("kafka")
        .Option("kafka.bootstrap.servers", servidoresKafka)
        .Option("subscribe", "transactions")
        .Load()
        .SelectExpr("CAST(value AS STRING)");

    // Schema used to parse the JSON carried by each Kafka message. Example message:
    // {
    //     "transaction":"431",
    //     "number":"0015-0000-0000-0000",
    //     "lat":-23.1618,
    //     "lng":-46.47201,
    //     "amount":91.01487,
    //     "category":"pets",
    //     "eventTime":"2021-01-05T19:07:19.3888"
    // }
    StructField[] campos = new[]
    {
        new StructField("transaction", new StringType()),
        new StructField("number", new StringType()),
        new StructField("lat", new DoubleType()),
        new StructField("lng", new DoubleType()),
        new StructField("amount", new DoubleType()),
        new StructField("category", new StringType()),
        new StructField("eventTime", new TimestampType())
    };
    var schema = new StructType(campos);

    // Parse the JSON into a struct column, then promote its fields to top-level columns.
    Column json = FromJson(Col("value"), schema.SimpleString);
    DataFrame transacoes = mensagens.WithColumn("json", json).Select("json.*");

    // Allow late events to arrive up to 7 minutes behind.
    DataFrame comWatermark = transacoes.WithWatermark("eventTime", "7 minutes");

    // Sum the amounts per category over 2-minute windows that start every 1 minute.
    Column janela = Window(Col("eventTime"), "2 minutes", "1 minutes");
    DataFrame totais = comWatermark
        .GroupBy(janela, Col("category"))
        .Sum("amount")
        .WithColumnRenamed("sum(amount)", "total")
        .Select(Col("window.start"), Col("window.end"), Col("category"), Col("total"));

    // Start the streaming query and write out the aggregated rows.
    StreamingQuery query = totais
        .WriteStream()
        .Format("console")
        .OutputMode(OutputMode.Update)
        //.Foreach(new MySQLForeachWriter(connectionString)) // Uncomment to persist to a database
        .Start();

    query.AwaitTermination();
}
/// <summary>
/// Streams credit-card transactions from the "transactions" Kafka topic, joins the stream
/// with itself to pair transactions of the same card made within 5 minutes of each other,
/// and prints the pairs whose computed speed exceeds the configured maximum.
/// </summary>
/// <param name="args">
/// <c>args[0]</c>: Kafka bootstrap servers; <c>args[1]</c>: maximum speed, parsed as a double.
/// </param>
public void Run(string[] args)
{
    string kafkaBrokers = args[0];
    double maxSpeed = double.Parse(args[1]);

    // Get a reference to the Spark execution context.
    SparkSession spark = SparkSession.Builder().AppName("Credit Card Fraud").GetOrCreate();

    // Original author's note: without this setting each stage ended up with 200 tasks,
    // which made every batch take about 4 minutes to run.
    spark.Conf().Set("spark.sql.shuffle.partitions", "1");

    // Streaming DataFrame that receives the Kafka messages; only the payload is kept, as text.
    DataFrame messages = spark
        .ReadStream()
        .Format("kafka")
        .Option("kafka.bootstrap.servers", kafkaBrokers)
        .Option("subscribe", "transactions")
        .Load()
        .SelectExpr("CAST(value AS STRING)");

    // Schema used to parse the JSON carried by each Kafka message. Example message:
    // {
    //     "transaction":"431",
    //     "number":"0015-0000-0000-0000",
    //     "lat":-23.1618,
    //     "lng":-46.47201,
    //     "amount":91.01487,
    //     "category":"pets",
    //     "eventTime":"2021-01-05T19:07:19.3888"
    // }
    StructField[] fields = new[]
    {
        new StructField("transaction", new StringType()),
        new StructField("number", new StringType()),
        new StructField("lat", new DoubleType()),
        new StructField("lng", new DoubleType()),
        new StructField("amount", new DoubleType()),
        new StructField("category", new StringType()),
        new StructField("eventTime", new TimestampType())
    };
    var schema = new StructType(fields);

    // Parse the JSON into a struct column, then promote its fields to top-level columns.
    Column json = FromJson(messages.Col("value"), schema.SimpleString);
    DataFrame transactions = messages.WithColumn("json", json).Select("json.*");

    // Two views of the same stream so it can be joined with itself to correlate transactions.
    DataFrame left = transactions.WithWatermark("eventTime", "7 minutes");
    DataFrame right = transactions
        .WithColumnRenamed("transaction", "transaction2")
        .WithColumnRenamed("lat", "lat2")
        .WithColumnRenamed("lng", "lng2")
        .WithColumnRenamed("eventTime", "eventTime2")
        .WithWatermark("eventTime2", "7 minutes");

    // Join condition: same card number, a different transaction, and the second event falls
    // between the first event and 5 minutes after it.
    Column sameCard = left.Col("number").EqualTo(right.Col("number"));
    Column otherTransaction = Col("transaction").NotEqual(Col("transaction2"));
    Column withinFiveMinutes = Col("eventTime2")
        .Between(Col("eventTime"), Col("eventTime") + Expr("interval 5 minutes"));
    DataFrame paired = left.Join(right, sameCard.And(otherTransaction).And(withinFiveMinutes));

    // Register the custom functions so they can be called by name on the DataFrame.
    spark.Udf().Register<double, double, double, double, double>(
        "CalculateDistance",
        (lat1, lng1, lat2, lng2) => CalculateDistance(lat1, lng1, lat2, lng2));
    spark.Udf().Register<double, Timestamp, Timestamp, double>(
        "CalculateSpeed",
        (dist, eventTime, eventTime2) => CalculateSpeed(dist, eventTime, eventTime2));

    // New columns holding the UDF results; "speed" depends on "dist", so order matters.
    DataFrame withDistance = paired.WithColumn(
        "dist",
        CallUDF("CalculateDistance", Col("lat"), Col("lng"), Col("lat2"), Col("lng2")));
    DataFrame withSpeed = withDistance.WithColumn(
        "speed",
        CallUDF("CalculateSpeed", Col("dist"), Col("eventTime"), Col("eventTime2")));

    // Keep only the pairs whose speed is above the expected maximum.
    DataFrame suspicious = withSpeed.Where(Col("speed").Gt(maxSpeed));

    // Start the streaming query.
    StreamingQuery query = suspicious
        .WriteStream()
        .Format("console")
        .Option("truncate", "false")
        .OutputMode(OutputMode.Append)
        .Start();

    query.AwaitTermination();
}