public static void RunSparkStream(string streamInputPath)
{
    var foreachWriter = new TestForeachWriter();

    SparkSession spark = SparkSession
        .Builder()
        .AppName("itur")
        .GetOrCreate();

    var mySchema = new Microsoft.Spark.Sql.Types.StructType(new[]
    {
        new StructField("IturCode", new Microsoft.Spark.Sql.Types.StringType()),
        new StructField("IturERP", new Microsoft.Spark.Sql.Types.StringType()),
        new StructField("QuantityEdit", new Microsoft.Spark.Sql.Types.StringType()),
        new StructField("PartialQuantity", new Microsoft.Spark.Sql.Types.StringType())
    });

    DataFrame lines = spark
        .ReadStream()
        .Schema(mySchema)
        .Csv(streamInputPath);

    s_query = lines
        .WriteStream()
        .Foreach(foreachWriter)
        .Trigger(Trigger.ProcessingTime(5000))
        .Start();

    s_query.AwaitTermination();
}
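// The snippet above passes a TestForeachWriter instance to .Foreach() but does not show its
// definition. A minimal sketch of what such a writer could look like follows; the class body
// and the console output are illustrative assumptions, only the IForeachWriter members come
// from the Microsoft.Spark API.
internal class TestForeachWriter : IForeachWriter
{
    // Called once per partition/epoch; return true to process the partition.
    public bool Open(long partitionId, long epochId) => true;

    // Called for every row of the micro-batch.
    public void Process(Row value) => Console.WriteLine(value.GetAs<string>("IturCode"));

    // Called when the partition is finished (errorOrNull is null on success).
    public void Close(Exception errorOrNull) { }
}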
public void Run(string[] args)
{
    string servidoresKafka = args[0];
    string topico = args[1];
    string modelo = args[2];

    // Get a reference to the Spark execution context
    SparkSession spark = SparkSession
        .Builder()
        .AppName("Exemplo Streaming com Kafka")
        .GetOrCreate();

    // Create a DataFrame that receives data from Kafka
    DataFrame df = spark
        .ReadStream()
        .Format("kafka")
        .Option("kafka.bootstrap.servers", servidoresKafka)
        .Option("subscribe", topico)
        .Load()
        .SelectExpr("CAST(value AS STRING)");

    /* Schema used to validate the JSON carried by the Kafka messages.
     * Example JSON:
     * {
     *    "cliente": "Fulano",
     *    "produto": "Mochila",
     *    "opiniao": "Muito boa!"
     * }
     */
    var schema = new StructType(new[]
    {
        new StructField("cliente", new StringType()),
        new StructField("produto", new StringType()),
        new StructField("opiniao", new StringType())
    }); // schema.SimpleString => struct<cliente:string,produto:string,opiniao:string>

    // Parse the JSON into a struct column ...
    df = df.WithColumn("json", Functions.FromJson(
                                    df.Col("value"),
                                    schema.SimpleString))
           .Select("json.*"); // ... and return all of its fields as a new DataFrame

    // Register a user-defined function to be used on the DataFrame
    spark.Udf().Register<string, float>("AnaliseDeSentimento",
        (texto) => AnalisarSentimento(texto, modelo));

    // Add a "nota" column with the result of the sentiment analysis
    df = df.WithColumn("nota", Functions.CallUDF("AnaliseDeSentimento", df.Col("opiniao")));

    // Start the streaming query
    StreamingQuery query = df
        .WriteStream()
        .OutputMode(OutputMode.Append)
        .Format("console")
        //.Trigger(Trigger.Continuous(2000))
        //.Foreach(new RedisForeachWriter())
        .Start();

    query.AwaitTermination(); // Keeps the application running so it can process incoming data
}
static void Main(string[] args)
{
    var host = "localhost";
    var port = 9999;

    SparkSession spark = SparkSession
        .Builder()
        .AppName("Emotion_Prediction")
        .GetOrCreate();

    DataFrame lines = spark
        .ReadStream()
        .Format("socket")
        .Option("host", host)
        .Option("port", port)
        .Load();

    Func<Column, Column> udfArray =
        Udf<string, string[]>((str) => new string[] { str, " => " + Predict(str) });

    DataFrame arrayDf = lines.Select(Explode(udfArray(lines["value"])));

    StreamingQuery query = arrayDf
        .WriteStream()
        .Format("console")
        .Start();

    query.AwaitTermination();
}
static void Main(string[] args)
{
    SparkSession ss = SparkSession
        .Builder()
        .AppName(".NET for Spark Streaming")
        .GetOrCreate();

    DataFrame stream = ss
        .ReadStream()
        .Format("socket")
        .Option("host", "localhost")
        .Option("port", 9000)
        .Load();

    DataFrame grade = stream
        .Select(Col("value"));

    StreamingQuery query = grade
        .WriteStream()
        .OutputMode(OutputMode.Append)
        .Format("console")
        .Start();

    query.AwaitTermination();
}
public void TestSignaturesV2_3_X()
{
    Assert.IsType<SparkContext>(_spark.SparkContext);

    Assert.IsType<Builder>(SparkSession.Builder());

    SparkSession.ClearDefaultSession();
    SparkSession.SetDefaultSession(_spark);
    Assert.IsType<SparkSession>(SparkSession.GetDefaultSession());

    Assert.IsType<RuntimeConfig>(_spark.Conf());

    Assert.IsType<SparkSession>(_spark.NewSession());

    Assert.IsType<DataFrameReader>(_spark.Read());

    Assert.IsType<DataFrame>(_spark.Range(10));
    Assert.IsType<DataFrame>(_spark.Range(10, 100));
    Assert.IsType<DataFrame>(_spark.Range(10, 100, 10));
    Assert.IsType<DataFrame>(_spark.Range(10, 100, 10, 5));

    _spark.Range(10).CreateOrReplaceTempView("testView");
    Assert.IsType<DataFrame>(_spark.Table("testView"));

    Assert.IsType<DataStreamReader>(_spark.ReadStream());

    Assert.IsType<UdfRegistration>(_spark.Udf());

    Assert.IsType<Catalog>(_spark.Catalog());
}
public void TestSignaturesV2_4_X()
{
    DataStreamReader dsr = _spark.ReadStream();

    Assert.IsType<DataStreamReader>(dsr.Format("parquet"));

    Assert.IsType<DataStreamReader>(
        dsr.Schema(
            new StructType(new[]
            {
                new StructField("columnName", new IntegerType())
            })));
    Assert.IsType<DataStreamReader>(dsr.Schema("columnName bigint"));

    Assert.IsType<DataStreamReader>(dsr.Option("key", "value"));
    Assert.IsType<DataStreamReader>(dsr.Option("key", true));
    Assert.IsType<DataStreamReader>(dsr.Option("key", long.MaxValue));
    Assert.IsType<DataStreamReader>(dsr.Option("key", double.MaxValue));

    Assert.IsType<DataStreamReader>(dsr.Options(new Dictionary<string, string>()));
    Assert.IsType<DataStreamReader>(
        dsr.Options(
            new Dictionary<string, string>
            {
                { "key", "value" }
            }));

    string jsonFilePath = Path.Combine(TestEnvironment.ResourceDirectory, "people.json");
    Assert.IsType<DataFrame>(dsr.Format("json").Load(jsonFilePath));
    Assert.IsType<DataFrame>(dsr.Json(jsonFilePath));
    Assert.IsType<DataFrame>(
        dsr.Csv(Path.Combine(TestEnvironment.ResourceDirectory, "people.csv")));
    Assert.IsType<DataFrame>(
        dsr.Orc(Path.Combine(TestEnvironment.ResourceDirectory, "users.orc")));
    Assert.IsType<DataFrame>(
        dsr.Parquet(Path.Combine(TestEnvironment.ResourceDirectory, "users.parquet")));
    Assert.IsType<DataFrame>(
        dsr.Text(Path.Combine(TestEnvironment.ResourceDirectory, "people.txt")));

    // In Spark 3.1.1+ setting the `path` option and then calling .Load(path) is not
    // supported unless the `spark.sql.legacy.pathOptionBehavior.enabled` conf is set.
    // .Json(path), .Parquet(path), etc. follow the same code path, so the conf
    // needs to be set in those scenarios as well.
    Assert.IsType<DataFrame>(dsr.Format("json").Option("path", jsonFilePath).Load());
}
static void Main(string[] args)
{
    Func<Column, Column> udfArray =
        Udf<string, string[]>((str) => new string[] { str, $"{str} {str.Length}" });

    var hostname = "localhost";
    var port = 65001;
    var windowDuration = "30 seconds";
    var slideDuration = "10 seconds";

    SparkSession spark = SparkSession
        .Builder()
        .AppName("StructuredNetworkWordCountWindowed")
        .GetOrCreate();

    spark.SparkContext.SetLogLevel("warn");

    DataFrame lines = spark
        .ReadStream()
        .Format("socket")
        .Option("host", hostname)
        .Option("port", port)
        .Load();

    var linesWithTime = lines
        .WithColumn("timestamp", CurrentTimestamp())
        .WithColumn("DayOfYear", DayOfYear(Col("timestamp")));

    var words = linesWithTime
        .WithColumn("words", Split(Col("value"), " "));

    var word = words.WithColumn("word", Explode(Col("words")));

    var windowedCounts = word
        .GroupBy(Window(Col("timestamp"), windowDuration, slideDuration), Col("word"))
        .Count()
        .OrderBy(Desc("window"));

    var query = windowedCounts
        .WriteStream()
        .OutputMode(OutputMode.Complete)
        .Format("console")
        .Option("truncate", false)
        .Start();

    query.AwaitTermination();
}
public void Run(string[] args)
{
    if (args.Length != 3 && args.Length != 4)
    {
        Console.Error.WriteLine(
            "Usage: StructuredNetworkWordCountWindowed " +
            "<hostname> <port> <window duration in seconds> " +
            "[<slide duration in seconds>]");
        Environment.Exit(1);
    }

    string hostname = args[0];
    var port = int.Parse(args[1]);
    var windowSize = int.Parse(args[2]);
    var slideSize = (args.Length == 3) ? windowSize : int.Parse(args[3]);
    if (slideSize > windowSize)
    {
        Console.Error.WriteLine(
            "<slide duration> must be less than or equal " +
            "to <window duration>");
        Environment.Exit(1);
    }

    var windowDuration = $"{windowSize} seconds";
    var slideDuration = $"{slideSize} seconds";

    SparkSession spark = SparkSession
        .Builder()
        .AppName("StructuredNetworkWordCountWindowed")
        .GetOrCreate();

    DataFrame lines = spark
        .ReadStream()
        .Format("socket")
        .Option("host", hostname)
        .Option("port", port)
        .Option("includeTimestamp", true)
        .Load();

    DataFrame words = lines
        .Select(Explode(Split(lines["value"], " "))
            .Alias("word"), lines["timestamp"]);

    DataFrame windowedCounts = words
        .GroupBy(Window(words["timestamp"], windowDuration, slideDuration),
                 words["word"])
        .Count()
        .OrderBy("window");

    Spark.Sql.Streaming.StreamingQuery query = windowedCounts
        .WriteStream()
        .OutputMode("complete")
        .Format("console")
        .Option("truncate", false)
        .Start();

    query.AwaitTermination();
}
static void Main(string[] args)
{
    if (args.Length != 2)
    {
        Console.Error.WriteLine(
            "Usage: Remember to include input and output path as arguments");
        Environment.Exit(1);
    }

    var sparkConf = SparkConfUtils.GetSparkConfigurationForFilePath(args);

    SparkSession spark = SparkSession
        .Builder()
        .AppName("Streaming example using Spark.NET")
        .GetOrCreate();

    if (sparkConf != null)
    {
        sparkConf.ToList().ForEach(kv => { spark.Conf().Set(kv.Key, kv.Value); });
    }

    var events = spark
        .ReadStream()
        .Format("eventhubs")
        .Options(EventHubConnection.GetEventHubConnectionSettings(eventHubPartitionCount: 2))
        .Load();

    var processedEvents = events
        .Select(
            FromJson(Col("body").Cast("string"), "temperature String, humidity String").As("Raw"),
            Col("properties"),
            Col("enqueuedTime"))
        .WithColumn("Raw.temperature", Col("Raw.temperature").Cast("double"))
        .WithColumn("Raw.humidity", Col("Raw.humidity").Cast("double"))
        .WithColumnRenamed("Raw.temperature", "temperature")
        .WithColumnRenamed("Raw.humidity", "humidity")
        .WithColumn("temperatureAlert", Col("temperature") >= 40)
        .SelectExpr("temperature", "humidity", "properties", "enqueuedTime", "temperatureAlert");

    processedEvents.PrintSchema();

    var streamingQuery = processedEvents
        .WriteStream()
        .OutputMode(OutputMode.Append)
        .Format("console")
        .Option("path", args[0])
        .Option("checkpointLocation", args[1])
        .Start();

    streamingQuery.AwaitTermination();
}
public Worker(ILogger<Worker> logger)
{
    _logger = logger;

    spark = SparkSession.Builder()
        .AppName("meuovo")
        .GetOrCreate();

    input = spark.ReadStream()
        .Format("kafka")
        .Option("kafka.bootstrap.servers", "localhost:9092")
        .Option("subscribe", "b7f45352-6abf-436b-9c4a-98141699728c")
        .Load()
        .SelectExpr("CAST(value AS STRING)");
}
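// The Worker constructor above only builds the streaming DataFrame; nothing runs until a
// query is started. Assuming the class is an ASP.NET Core BackgroundService with the `spark`
// and `input` fields used above, one possible ExecuteAsync is sketched below (the console
// sink and the blocking AwaitTermination call are illustrative choices, not from the source).
protected override Task ExecuteAsync(CancellationToken stoppingToken)
{
    return Task.Run(() =>
    {
        StreamingQuery query = input
            .WriteStream()
            .OutputMode(OutputMode.Append)
            .Format("console")
            .Start();

        query.AwaitTermination(); // blocks until the query stops or the host shuts down
    }, stoppingToken);
}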
static void Main(string[] args)
{
    var hostname = "localhost";
    var port = 65001;
    var windowDuration = "30 seconds";
    var slideDuration = "10 seconds";

    SparkSession spark = SparkSession
        .Builder()
        .AppName("StructuredNetworkWordCountWindowed")
        .GetOrCreate();

    spark.SparkContext.SetLogLevel("warn");

    DataFrame lines = spark
        .ReadStream()
        .Format("socket")
        .Option("host", hostname)
        .Option("port", port)
        .Load();

    var df = lines
        .WithColumn("tab", Split(Col("value"), ";"))
        .WithColumn("date", ToDate(Column("tab").GetItem(0)))
        .WithColumn("var", Column("tab").GetItem(1))
        .WithColumn("consumption", Column("tab").GetItem(2));

    var windowedCounts = df
        .GroupBy(Window(Col("date"), windowDuration, slideDuration), Col("var"))
        .Count()
        .OrderBy(Desc("window"));

    var query = windowedCounts
        .WriteStream()
        .OutputMode(OutputMode.Complete)
        .Format("console")
        .Option("truncate", false)
        .Start();

    query.AwaitTermination();
}
public void TestStreamingScenario()
{
    using var tempDirectory = new TemporaryDirectory();

    // Write [0, 1, 2, 3, 4] to a Delta table.
    string sourcePath = Path.Combine(tempDirectory.Path, "source-delta-table");
    _spark.Range(0, 5).Write().Format("delta").Save(sourcePath);

    // Create a stream from the source DeltaTable to the sink DeltaTable.
    // To make the test synchronous and deterministic, we will use a series of
    // "one-time micro-batch" triggers.
    string sinkPath = Path.Combine(tempDirectory.Path, "sink-delta-table");
    DataStreamWriter dataStreamWriter = _spark
        .ReadStream()
        .Format("delta")
        .Load(sourcePath)
        .WriteStream()
        .Format("delta")
        .OutputMode("append")
        .Option("checkpointLocation", Path.Combine(tempDirectory.Path, "checkpoints"));

    // Trigger the first stream batch
    dataStreamWriter.Trigger(Trigger.Once()).Start(sinkPath).AwaitTermination();

    // Now read the sink DeltaTable and validate its content.
    DeltaTable sink = DeltaTable.ForPath(sinkPath);
    ValidateRangeDataFrame(Enumerable.Range(0, 5), sink.ToDF());

    // Write [5, 6, 7, 8, 9] to the source and trigger another stream batch.
    _spark.Range(5, 10).Write().Format("delta").Mode("append").Save(sourcePath);
    dataStreamWriter.Trigger(Trigger.Once()).Start(sinkPath).AwaitTermination();

    // Finally, validate that the new data made its way to the sink.
    ValidateRangeDataFrame(Enumerable.Range(0, 10), sink.ToDF());
}
static void Main(string[] args)
{
    SparkSession spark = SparkSession
        .Builder()
        .AppName("Test example")
        .GetOrCreate();

    DataFrame lines = spark
        .ReadStream()
        .Format("kafka")
        .Option("kafka.bootstrap.servers", "localhost:9092")
        .Option("subscribe", "twitterraw")
        .Load()
        .SelectExpr("CAST(value AS STRING)");

    lines.PrintSchema();
}
public void Run(string[] args) { if (args.Length != 3) { Console.Error.WriteLine( "Usage: SentimentAnalysisStream <host> <port> <model path>"); Environment.Exit(1); } // Create Spark Session SparkSession spark = SparkSession .Builder() .AppName("Streaming Sentiment Analysis") .GetOrCreate(); // Setup stream connection info string hostname = args[0]; string port = args[1]; // Read streaming data into DataFrame DataFrame words = spark .ReadStream() .Format("socket") .Option("host", hostname) .Option("port", port) .Load(); // Use ML.NET in a UDF to evaluate each incoming entry spark.Udf().Register <string, bool>( "MLudf", input => Sentiment(input, args[2])); // Use Spark SQL to call ML.NET UDF // Display results of sentiment analysis on each entry words.CreateOrReplaceTempView("WordsSentiment"); DataFrame sqlDf = spark .Sql("SELECT WordsSentiment.value, MLudf(WordsSentiment.value) FROM WordsSentiment"); // Handle data continuously as it arrives StreamingQuery query = sqlDf .WriteStream() .Format("console") .Start(); query.AwaitTermination(); }
static void Main(string[] args) { var hostname = "spark"; var port = 5050; SparkSession spark = SparkSession .Builder() .AppName("Streaming example with dotnet") .GetOrCreate(); DataFrame lines = spark .ReadStream() .Format("socket") .Option("host", hostname) .Option("port", port) .Load(); }
public void Run(string[] args) { if (args.Length != 3) { Console.Error.WriteLine( "Usage: StructuredKafkaWordCount " + "<bootstrap-servers> <subscribe-type> <topics>"); Environment.Exit(1); } string bootstrapServers = args[0]; string subscribeType = args[1]; string topics = args[2]; SparkSession spark = SparkSession .Builder() .AppName("StructuredKafkaWordCount") .GetOrCreate(); DataFrame lines = spark .ReadStream() .Format("kafka") .Option("kafka.bootstrap.servers", bootstrapServers) .Option(subscribeType, topics) .Load() .SelectExpr("CAST(value AS STRING)"); DataFrame words = lines .Select(Explode(Split(lines["value"], " ")) .Alias("word")); DataFrame wordCounts = words.GroupBy("word").Count(); Spark.Sql.Streaming.StreamingQuery query = wordCounts .WriteStream() .OutputMode("complete") .Format("console") .Start(); query.AwaitTermination(); }
public void TestSignaturesV2_3_X()
{
    DataStreamReader dsr = _spark.ReadStream();

    Assert.IsType<DataStreamReader>(dsr.Format("parquet"));

    Assert.IsType<DataStreamReader>(
        dsr.Schema(
            new StructType(new[]
            {
                new StructField("columnName", new IntegerType())
            })));
    Assert.IsType<DataStreamReader>(dsr.Schema("columnName bigint"));

    Assert.IsType<DataStreamReader>(dsr.Option("key", "value"));
    Assert.IsType<DataStreamReader>(dsr.Option("key", true));
    Assert.IsType<DataStreamReader>(dsr.Option("key", long.MaxValue));
    Assert.IsType<DataStreamReader>(dsr.Option("key", double.MaxValue));

    Assert.IsType<DataStreamReader>(dsr.Options(new Dictionary<string, string>()));
    Assert.IsType<DataStreamReader>(
        dsr.Options(
            new Dictionary<string, string>
            {
                { "key", "value" }
            }));

    string jsonFilePath = Path.Combine(TestEnvironment.ResourceDirectory, "people.json");
    Assert.IsType<DataFrame>(dsr.Format("json").Option("path", jsonFilePath).Load());
    Assert.IsType<DataFrame>(dsr.Format("json").Load(jsonFilePath));
    Assert.IsType<DataFrame>(dsr.Json(jsonFilePath));
    Assert.IsType<DataFrame>(
        dsr.Csv(Path.Combine(TestEnvironment.ResourceDirectory, "people.csv")));
    Assert.IsType<DataFrame>(
        dsr.Orc(Path.Combine(TestEnvironment.ResourceDirectory, "users.orc")));
    Assert.IsType<DataFrame>(
        dsr.Parquet(Path.Combine(TestEnvironment.ResourceDirectory, "users.parquet")));
    Assert.IsType<DataFrame>(
        dsr.Text(Path.Combine(TestEnvironment.ResourceDirectory, "people.txt")));
}
public void Run(string[] args)
{
    // Default to running on localhost:9999
    string hostname = "localhost";
    int port = 9999;

    // User designated their own host and port
    if (args.Length == 2)
    {
        hostname = args[0];
        port = int.Parse(args[1]);
    }

    SparkSession spark = SparkSession
        .Builder()
        .AppName("Streaming example with a UDF")
        .GetOrCreate();

    DataFrame lines = spark
        .ReadStream()
        .Format("socket")
        .Option("host", hostname)
        .Option("port", port)
        .Load();

    // UDF to produce an array
    // Array includes: 1) original string 2) original string + length of original string
    Func<Column, Column> udfArray =
        Udf<string, string[]>((str) => new string[] { str, $"{str} {str.Length}" });
    DataFrame arrayDF = lines.Select(Explode(udfArray(lines["value"])));

    // Process and display each incoming line
    StreamingQuery query = arrayDF
        .WriteStream()
        .Format("console")
        .Start();

    query.AwaitTermination();
}
public void Run(string[] args)
{
    if (args.Length != 2)
    {
        Console.Error.WriteLine(
            "Usage: StructuredNetworkWordCount <hostname> <port>");
        Environment.Exit(1);
    }

    string hostname = args[0];
    var port = int.Parse(args[1]);

    SparkSession spark = SparkSession
        .Builder()
        .AppName("StructuredNetworkWordCount")
        .GetOrCreate();

    DataFrame lines = spark
        .ReadStream()
        .Format("socket")
        .Option("host", hostname)
        .Option("port", port)
        .Load();

    DataFrame words = lines
        .Select(Explode(Split(lines["value"], " "))
            .Alias("word"));
    DataFrame wordCounts = words.GroupBy("word").Count();

    Spark.Sql.Streaming.StreamingQuery query = wordCounts
        .WriteStream()
        .OutputMode("complete")
        .Format("console")
        .Start();

    query.AwaitTermination();
}
public static async Task StreamingRead(SparkSession sparkSession)
{
    await Task.Run(() =>
    {
        tsRSDF = sparkSession.ReadStream()
            .Option("sep", ";")
            //.Option("header", "true")
            .Schema("ttuser string, ttmessage string, ttage integer")
            //.Schema("userId integer, movieId integer, rating double, timestamp string")
            .Csv("file:///mnt/e/OneDrive/WorkingSpace/TestDir/ReadStreamTest/input/");

        // Text or web pages -> SQL Server -> read the stream from SQL, run the computation
        // below, then push the result back to a staging table
        tsRSDF
            .WriteStream().Format("jdbc")
            .Option("url", "jdbc:sqlserver://127.0.0.1:1433")
            .Option("databaseName", "sparkDB")
            .Option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
            .Option("dbtable", "TestTable")
            .Option("user", "spark")
            .Option("password", "aspcore")
            .Start()
            .AwaitTermination();
    });
}
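// Note: structured streaming ships no built-in "jdbc" sink, so the Format("jdbc") write above
// is normally rejected when the query starts. A common workaround is to write each micro-batch
// with the batch JDBC writer via ForeachBatch. A hedged sketch, reusing the same connection
// settings as above and assuming a Microsoft.Spark version that exposes ForeachBatch:
tsRSDF
    .WriteStream()
    .ForeachBatch((batchDf, batchId) =>
    {
        // Write the current micro-batch through the regular batch JDBC path.
        batchDf.Write()
            .Format("jdbc")
            .Option("url", "jdbc:sqlserver://127.0.0.1:1433")
            .Option("databaseName", "sparkDB")
            .Option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
            .Option("dbtable", "TestTable")
            .Option("user", "spark")
            .Option("password", "aspcore")
            .Mode(SaveMode.Append)
            .Save();
    })
    .Start()
    .AwaitTermination();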
public void TestSignaturesV2_3_X()
{
    DataFrame df = _spark
        .ReadStream()
        .Format("rate")
        .Option("rowsPerSecond", 1)
        .Load();

    DataStreamWriter dsw = df.WriteStream();

    Assert.IsType<DataStreamWriter>(dsw.OutputMode("append"));
    Assert.IsType<DataStreamWriter>(dsw.OutputMode(OutputMode.Append));

    Assert.IsType<DataStreamWriter>(dsw.Format("json"));

    Assert.IsType<DataStreamWriter>(dsw.Option("stringOption", "value"));
    Assert.IsType<DataStreamWriter>(dsw.Option("boolOption", true));
    Assert.IsType<DataStreamWriter>(dsw.Option("longOption", 1L));
    Assert.IsType<DataStreamWriter>(dsw.Option("doubleOption", 3D));

    Assert.IsType<DataStreamWriter>(
        dsw.Options(
            new Dictionary<string, string>
            {
                { "option1", "value1" },
                { "option2", "value2" }
            }));

    Assert.IsType<DataStreamWriter>(dsw.PartitionBy("age"));
    Assert.IsType<DataStreamWriter>(dsw.PartitionBy("age", "name"));

    Assert.IsType<DataStreamWriter>(dsw.QueryName("queryName"));

    Assert.IsType<DataStreamWriter>(dsw.Trigger(Trigger.Once()));
}
public void Run(string[] args)
{
    string kafkaBrokers = args[0];
    double maxSpeed = double.Parse(args[1]);

    // Get a reference to the Spark execution context
    SparkSession spark = SparkSession
        .Builder()
        .AppName("Credit Card Fraud")
        .GetOrCreate();

    // Without this setting each stage ran with 200 tasks, which took about 4 minutes per batch
    spark.Conf().Set("spark.sql.shuffle.partitions", "1");

    // Create a DataFrame that receives data from Kafka
    DataFrame df = spark
        .ReadStream()
        .Format("kafka")
        .Option("kafka.bootstrap.servers", kafkaBrokers)
        .Option("subscribe", "transactions")
        .Load()
        .SelectExpr("CAST(value AS STRING)");

    /* Schema used to validate the JSON carried by the Kafka messages.
     * Example JSON:
     * {
     *    "transaction":"431",
     *    "number":"0015-0000-0000-0000",
     *    "lat":-23.1618,
     *    "lng":-46.47201,
     *    "amount":91.01487,
     *    "category":"pets",
     *    "eventTime":"2021-01-05T19:07:19.3888"
     * }
     */
    var schema = new StructType(new[]
    {
        new StructField("transaction", new StringType()),
        new StructField("number", new StringType()),
        new StructField("lat", new DoubleType()),
        new StructField("lng", new DoubleType()),
        new StructField("amount", new DoubleType()),
        new StructField("category", new StringType()),
        new StructField("eventTime", new TimestampType())
    });

    // Parse the JSON into a struct column ...
    df = df.WithColumn("json", FromJson(
                                    df.Col("value"),
                                    schema.SimpleString))
           .Select("json.*"); // ... and return all of its fields as a new DataFrame

    // Build two distinct DataFrames so we can join them and look for correlated transactions
    DataFrame df1 = df
        .WithWatermark("eventTime", "7 minutes");
    DataFrame df2 = df
        .WithColumnRenamed("transaction", "transaction2")
        .WithColumnRenamed("lat", "lat2")
        .WithColumnRenamed("lng", "lng2")
        .WithColumnRenamed("eventTime", "eventTime2")
        .WithWatermark("eventTime2", "7 minutes");

    // Join the two streams to correlate credit card transactions
    DataFrame dfJoin = df1.Join(df2,
        df1.Col("number").EqualTo(df2.Col("number"))
            .And(Col("transaction").NotEqual(Col("transaction2")))
            .And(Col("eventTime2").Between(Col("eventTime"),
                                           Col("eventTime") + Expr("interval 5 minutes"))));

    // Register user-defined functions to be used on the DataFrame
    spark.Udf().Register<double, double, double, double, double>("CalculateDistance",
        (lat1, lng1, lat2, lng2) => CalculateDistance(lat1, lng1, lat2, lng2));
    spark.Udf().Register<double, Timestamp, Timestamp, double>("CalculateSpeed",
        (dist, eventTime, eventTime2) => CalculateSpeed(dist, eventTime, eventTime2));

    // Add new columns holding the UDF results
    dfJoin = dfJoin.WithColumn("dist",
        CallUDF("CalculateDistance", Col("lat"), Col("lng"), Col("lat2"), Col("lng2")));
    dfJoin = dfJoin.WithColumn("speed",
        CallUDF("CalculateSpeed", Col("dist"), Col("eventTime"), Col("eventTime2")));

    // Keep only the transactions whose implied speed exceeds the "maxSpeed" parameter
    dfJoin = dfJoin.Where(Col("speed").Gt(maxSpeed));

    // Start the streaming query
    StreamingQuery query = dfJoin
        .WriteStream()
        .Format("console")
        .Option("truncate", "false")
        .OutputMode(OutputMode.Append)
        .Start();

    query.AwaitTermination();
}
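// The CalculateDistance helper referenced by the UDF above is not shown. A minimal sketch
// using the haversine formula follows; treating the inputs as degrees and returning
// kilometers is an assumption, not something stated in the original code.
private static double CalculateDistance(double lat1, double lng1, double lat2, double lng2)
{
    const double earthRadiusKm = 6371.0;
    double ToRadians(double degrees) => degrees * Math.PI / 180.0;

    double dLat = ToRadians(lat2 - lat1);
    double dLng = ToRadians(lng2 - lng1);

    // Haversine formula: a is the squared half-chord length between the two points
    double a = Math.Sin(dLat / 2) * Math.Sin(dLat / 2) +
               Math.Cos(ToRadians(lat1)) * Math.Cos(ToRadians(lat2)) *
               Math.Sin(dLng / 2) * Math.Sin(dLng / 2);
    double c = 2 * Math.Atan2(Math.Sqrt(a), Math.Sqrt(1 - a));

    return earthRadiusKm * c; // great-circle distance in kilometers
}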
public void Run(string[] args)
{
    string servidoresKafka = args[0];
    string connectionString = args.Length > 1 ? args[1] : string.Empty;

    // Get a reference to the Spark execution context
    SparkSession spark = SparkSession
        .Builder()
        .AppName("Credit Card Category")
        .GetOrCreate();

    // Without this setting each stage ran with 200 tasks, which took about 4 minutes per batch
    spark.Conf().Set("spark.sql.shuffle.partitions", "1");

    // Create a DataFrame that receives data from Kafka
    DataFrame df = spark
        .ReadStream()
        .Format("kafka")
        .Option("kafka.bootstrap.servers", servidoresKafka)
        .Option("subscribe", "transactions")
        .Load()
        .SelectExpr("CAST(value AS STRING)");

    /* Schema used to validate the JSON carried by the Kafka messages.
     * Example JSON:
     * {
     *    "transaction":"431",
     *    "number":"0015-0000-0000-0000",
     *    "lat":-23.1618,
     *    "lng":-46.47201,
     *    "amount":91.01487,
     *    "category":"pets",
     *    "eventTime":"2021-01-05T19:07:19.3888"
     * }
     */
    var schema = new StructType(new[]
    {
        new StructField("transaction", new StringType()),
        new StructField("number", new StringType()),
        new StructField("lat", new DoubleType()),
        new StructField("lng", new DoubleType()),
        new StructField("amount", new DoubleType()),
        new StructField("category", new StringType()),
        new StructField("eventTime", new TimestampType())
    });

    // Parse the JSON into a struct column ...
    df = df.WithColumn("json", FromJson(
                                    Col("value"),
                                    schema.SimpleString))
           .Select("json.*"); // ... and return all of its fields as a new DataFrame

    // Allow late events to arrive up to 7 minutes after their event time
    df = df.WithWatermark("eventTime", "7 minutes");

    // Sum the amounts spent, grouping by category over 2-minute windows that start every 1 minute
    df = df.GroupBy(Window(Col("eventTime"), "2 minutes", "1 minutes"), Col("category"))
           .Sum("amount").WithColumnRenamed("sum(amount)", "total")
           .Select(Col("window.start"), Col("window.end"), Col("category"), Col("total"));

    // Start the streaming query and write out the returned data
    StreamingQuery query = df
        .WriteStream()
        .Format("console")
        .OutputMode(OutputMode.Update)
        //.Foreach(new MySQLForeachWriter(connectionString)) // Uncomment to write to a database
        .Start();

    query.AwaitTermination();
}