public void Run(string[] args)
{
    string servidoresKafka = args[0];
    string topico = args[1];
    string modelo = args[2];

    // Get a reference to the Spark execution context
    SparkSession spark = SparkSession
        .Builder()
        .AppName("Exemplo Streaming com Kafka")
        .GetOrCreate();

    // Create a dataframe that receives data from Kafka
    DataFrame df = spark
        .ReadStream()
        .Format("kafka")
        .Option("kafka.bootstrap.servers", servidoresKafka)
        .Option("subscribe", topico)
        .Load()
        .SelectExpr("CAST(value AS STRING)");

    /* Create a schema to validate the JSON carried by the Kafka messages.
     * Sample JSON:
     * {
     *     "cliente": "Fulano",
     *     "produto": "Mochila",
     *     "opiniao": "Muito boa!"
     * }
     */
    var schema = new StructType(new[]
    {
        new StructField("cliente", new StringType()),
        new StructField("produto", new StringType()),
        new StructField("opiniao", new StringType())
    }); // struct<cliente:string,produto:string,opiniao:string>

    // Parse the JSON into a struct column ...
    df = df.WithColumn("json", Functions.FromJson(
            df.Col("value"),
            schema.SimpleString))
        .Select("json.*"); // ... and project all of its fields as a new dataframe

    // Register a user-defined function to be used on the dataframe
    spark.Udf().Register<string, float>("AnaliseDeSentimento",
        (texto) => AnalisarSentimento(texto, modelo));

    // Create a new column "nota" with the result of the sentiment analysis
    df = df.WithColumn("nota",
        Functions.CallUDF("AnaliseDeSentimento", df.Col("opiniao")));

    // Start the streaming query
    StreamingQuery query = df
        .WriteStream()
        .OutputMode(OutputMode.Append)
        .Format("console")
        //.Trigger(Trigger.Continuous(2000))
        //.Foreach(new RedisForeachWriter())
        .Start();

    query.AwaitTermination(); // Required to keep the application running and processing data
}
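// The AnalisarSentimento helper invoked by the UDF above is not included in this
// snippet. A minimal sketch of what it might look like follows, assuming the
// "modelo" argument is the path to a trained ML.NET binary classification model
// and that the hypothetical SentimentInput/SentimentPrediction POCOs below match
// the model's input and output columns; the real implementation may differ.
// Requires the Microsoft.ML NuGet package (using Microsoft.ML;).
private static float AnalisarSentimento(string texto, string modelo)
{
    var mlContext = new MLContext();

    // Load the serialized model from disk (path supplied via args[2]).
    // In practice the model and prediction engine would be cached rather than
    // rebuilt on every UDF call.
    ITransformer model = mlContext.Model.Load(modelo, out _);

    // Create a prediction engine and score the single review text
    var engine = mlContext.Model
        .CreatePredictionEngine<SentimentInput, SentimentPrediction>(model);
    SentimentPrediction prediction = engine.Predict(new SentimentInput { Texto = texto });

    // Probability in [0, 1]; higher means more positive sentiment
    return prediction.Probability;
}

// Hypothetical input/output types for the sketch above
private class SentimentInput { public string Texto { get; set; } }
private class SentimentPrediction { public float Probability { get; set; } }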
public void TestSignaturesV3_0_X()
{
    DataFrame df = _spark
        .Read()
        .Schema("age INT, name STRING")
        .Json($"{TestEnvironment.ResourceDirectory}people.json");

    DataFrameWriterV2 dfwV2 = df.WriteTo("testtable");

    Assert.IsType<DataFrameWriterV2>(dfwV2.Using("json"));

    Assert.IsType<DataFrameWriterV2>(dfwV2.Option("key1", "value"));
    Assert.IsType<DataFrameWriterV2>(dfwV2.Option("key2", true));
    Assert.IsType<DataFrameWriterV2>(dfwV2.Option("key3", 1L));
    Assert.IsType<DataFrameWriterV2>(dfwV2.Option("key4", 2D));

    Assert.IsType<DataFrameWriterV2>(dfwV2.Options(
        new Dictionary<string, string>() { { "key", "value" } }));

    Assert.IsType<DataFrameWriterV2>(dfwV2.TableProperty("prop", "value"));

    _spark.Sql("DROP TABLE IF EXISTS default.testtable");
    dfwV2.Create();

    Assert.IsType<DataFrameWriterV2>(dfwV2.PartitionedBy(df.Col("age")));

    // Throws the following exception:
    // org.apache.spark.sql.AnalysisException: REPLACE TABLE AS SELECT is only supported
    // with v2 tables.
    Assert.Throws<Exception>(() => dfwV2.Replace());

    // Throws the following exception:
    // org.apache.spark.sql.AnalysisException: REPLACE TABLE AS SELECT is only supported
    // with v2 tables.
    Assert.Throws<Exception>(() => dfwV2.CreateOrReplace());

    // Throws the following exception:
    // org.apache.spark.sql.AnalysisException: Table default.testtable does not support
    // append in batch mode.
    Assert.Throws<Exception>(() => dfwV2.Append());

    // Throws the following exception:
    // org.apache.spark.sql.AnalysisException: Table default.testtable does not support
    // overwrite by filter in batch mode.
    Assert.Throws<Exception>(() => dfwV2.Overwrite(df.Col("age")));

    // Throws the following exception:
    // org.apache.spark.sql.AnalysisException: Table default.testtable does not support
    // dynamic overwrite in batch mode.
    Assert.Throws<Exception>(() => dfwV2.OverwritePartitions());
}
public DataFrame Run(DataFrame moviesDataFrame, DataFrame ratingsDataFrame)
{
    // Find the users that rated at least 250 movies.
    var qualifyingUsers = ratingsDataFrame
        .GroupBy("userId")
        .Count()
        .Filter("count >= 250")
        .WithColumnRenamed("userId", "r_userId");

    // Find all the movies rated by qualifying users.
    var moviesAndQualifyingUsersRating = ratingsDataFrame
        .Join(qualifyingUsers,
            ratingsDataFrame.Col("userId").EqualTo(qualifyingUsers.Col("r_userId")))
        .Select("userId", "movieId", "rating")
        .WithColumnRenamed("movieId", "r_movieId");

    // Explode categories into a row per movie-category combination.
    // Group the categories by userId and get each average rating.
    var explodedCategoriesAggregatedByAverageRatingPerUser = moviesDataFrame
        .Join(moviesAndQualifyingUsersRating,
            moviesDataFrame.Col("movieId").EqualTo(moviesAndQualifyingUsersRating.Col("r_movieId")))
        .WithColumn("genres", Split(Col("genres"), "\\|"))
        .Select(Col("userId"), Col("movieId"), Col("rating"), Explode(Col("genres")))
        .WithColumnRenamed("col", "genre")
        .GroupBy("userId", "genre")
        .Agg(Avg("rating"))
        .OrderBy(Col("userId").Asc())
        .Select("userId", "avg(rating)", "genre")
        .WithColumnRenamed("avg(rating)", "average score")
        .Limit(20);

    return explodedCategoriesAggregatedByAverageRatingPerUser;
}
public void TestSignaturesV2_4_X()
{
    DataFrame df = _spark.Range(1);
    string jsonSchema = "{\"type\":\"long\", \"name\":\"col\"}";

    Column inputCol = df.Col("id");
    Column avroCol = ToAvro(inputCol);

    Assert.IsType<Column>(FromAvro(avroCol, jsonSchema));
}
public void TestSignaturesV3_0_X()
{
    DataFrame df = _spark.Range(1);
    string jsonSchema = "{\"type\":\"long\", \"name\":\"col\"}";
    var options = new Dictionary<string, string>() { { "mode", "PERMISSIVE" } };

    Column inputCol = df.Col("id");
    Column avroCol = ToAvro(inputCol, jsonSchema);

    Assert.IsType<Column>(FromAvro(avroCol, jsonSchema, options));
}
public static void TestMetadata(
    string lib = "net.securities",
    string host = "localhost",
    bool purge = true,
    bool del = true,
    string symbol = "S1")
{
    var driver = new MongoClient("mongodb://" + host);
    var arctic = new Arctic(driver, lib, purge: purge);

    if (del)
    {
        var delcnt = arctic.DeleteAsync(symbol).Result;
        Console.WriteLine("Deleted {0} versions for {1}".Args(delcnt, symbol));
    }

    // Build a small two-row dataframe with a date index and metadata
    var df = new DataFrame();
    df.FilledCount = 2;
    df.Col<DateTime>("date")[0] = new DateTime(2015, 1, 1);
    df.Col<DateTime>("date")[1] = new DateTime(2015, 2, 1);
    df.Col<long>("value")[0] = 15;
    df.Metadata["sector"] = "internet";
    df.Index = df["date"];

    arctic.Append(symbol, df);

    var df1 = arctic.Read(symbol);
    Console.WriteLine($"{symbol} metadata {df1.Metadata["sector"]}");
}
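// The Args call above is a string-formatting extension method rather than part
// of the BCL. If it is not already provided by a referenced library, a minimal
// sketch (an assumption, not the library's actual definition) could be:
public static class StringFormattingExtensions
{
    // Formats the string with the supplied arguments, e.g. "x {0}".Args(1) == "x 1"
    public static string Args(this string format, params object[] args) =>
        string.Format(format, args);
}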
public DataFrame Run(DataFrame moviesDataFrame, DataFrame ratingsDataFrame)
{
    // First we find which movies have been rated 5000 or more times.
    var qualifyingMovies = ratingsDataFrame
        .GroupBy("movieId")
        .Agg(Avg("rating"), Count("rating"))
        .Filter("count(rating) >= 5000");

    // Within the filtered movies we query the 10 with the greatest average rating.
    var tenGreatestMoviesByAverageRating = moviesDataFrame
        .Join(qualifyingMovies,
            moviesDataFrame.Col("movieId").EqualTo(qualifyingMovies.Col("movieId")))
        .Select("avg(rating)", "title")
        .OrderBy(Col("avg(rating)").Desc())
        .Limit(10)
        .WithColumnRenamed("avg(rating)", "average score");

    return tenGreatestMoviesByAverageRating;
}
public DataFrame Run(DataFrame moviesDataFrame, DataFrame ratingsDataFrame)
{
    // First we find the ten movies with the greatest number of ratings.
    var mostRatedTen = ratingsDataFrame
        .GroupBy("movieId")
        .Count()
        .OrderBy(Col("count").Desc())
        .Limit(10);

    // Considering only the ten movies we need, we get all the fields to display.
    var fullDataMostRatedTen = moviesDataFrame
        .Join(mostRatedTen,
            moviesDataFrame.Col("movieId").EqualTo(mostRatedTen.Col("movieId")))
        .Select("count", "title")
        .OrderBy(Col("count").Desc())
        .WithColumnRenamed("count", "times rated");

    return fullDataMostRatedTen;
}
public DataFrame Run(DataFrame moviesDataFrame, DataFrame ratingsDataFrame)
{
    // Explode categories into a row per movie-category combination.
    // Join the movies and ratings dataframes.
    // Group by category and get the average rating.
    var overallGenresByAverageRating = moviesDataFrame
        .WithColumn("genres", Split(Col("genres"), "\\|"))
        .Select(Col("movieId"), Explode(Col("genres")))
        .Join(ratingsDataFrame,
            moviesDataFrame.Col("movieId").EqualTo(ratingsDataFrame.Col("movieId")))
        .GroupBy(Col("col"))
        .Agg(Avg("rating"))
        .WithColumnRenamed("col", "genre")
        .WithColumnRenamed("avg(rating)", "average rating")
        .OrderBy(Col("average rating").Desc())
        .Select("average rating", "genre");

    return overallGenresByAverageRating;
}
public void TestSignaturesV2_3_X()
{
    Column col = _df["name"];
    col = _df["age"];

    DataFrame df = _df.ToDF();
    df = df.ToDF("name2", "age2");

    StructType schema = _df.Schema();
    Assert.NotNull(schema);

    _df.PrintSchema();

    _df.Explain();
    _df.Explain(true);
    _df.Explain(false);

    Assert.Equal(2, _df.Columns().ToArray().Length);

    _df.IsLocal();
    _df.IsStreaming();

    using (var tempDir = new TemporaryDirectory())
    {
        // The following is required for *CheckPoint().
        _spark.SparkContext.SetCheckpointDir(tempDir.Path);

        _df.Checkpoint();
        _df.Checkpoint(false);
        _df.LocalCheckpoint();
        _df.LocalCheckpoint(false);
    }

    _df.WithWatermark("time", "10 minutes");

    _df.Show();
    _df.Show(10);
    _df.Show(10, 10);
    _df.Show(10, 10, true);

    _df.Join(_df);
    _df.Join(_df, "name");
    _df.Join(_df, new[] { "name" });
    _df.Join(_df, new[] { "name" }, "outer");
    _df.Join(_df, _df["age"] == _df["age"]);
    _df.Join(_df, _df["age"] == _df["age"], "outer");

    _df.CrossJoin(_df);

    _df.SortWithinPartitions("age");
    _df.SortWithinPartitions("age", "name");
    _df.SortWithinPartitions();
    _df.SortWithinPartitions(_df["age"]);
    _df.SortWithinPartitions(_df["age"], _df["name"]);

    _df.Sort("age");
    _df.Sort("age", "name");
    _df.Sort();
    _df.Sort(_df["age"]);
    _df.Sort(_df["age"], _df["name"]);

    _df.OrderBy("age");
    _df.OrderBy("age", "name");
    _df.OrderBy();
    _df.OrderBy(_df["age"]);
    _df.OrderBy(_df["age"], _df["name"]);

    _df.Hint("broadcast");
    _df.Hint("broadcast", new[] { "hello", "world" });

    _df.Col("age");
    _df.ColRegex("age");

    _df.As("alias");
    _df.Alias("alias");

    _df.Select("age");
    _df.Select("age", "name");
    _df.Select();
    _df.Select(_df["age"]);
    _df.Select(_df["age"], _df["name"]);

    _df.SelectExpr();
    _df.SelectExpr("age * 2");
    _df.SelectExpr("age * 2", "abs(age)");

    _df.Filter(_df["age"] > 21);
    _df.Filter("age > 21");

    _df.Where(_df["age"] > 21);
    _df.Where("age > 21");

    _df.GroupBy("age");
    _df.GroupBy("age", "name");
    _df.GroupBy();
    _df.GroupBy(_df["age"]);
    _df.GroupBy(_df["age"], _df["name"]);

    _df.Rollup("age");
    _df.Rollup("age", "name");
    _df.Rollup();
    _df.Rollup(_df["age"]);
    _df.Rollup(_df["age"], _df["name"]);

    _df.Cube("age");
    _df.Cube("age", "name");
    _df.Cube();
    _df.Cube(_df["age"]);
    _df.Cube(_df["age"], _df["name"]);

    _df.Agg(Avg(_df["age"]));
    _df.Agg(Avg(_df["age"]), Avg(_df["name"]));

    _df.Limit(10);

    _df.Union(_df);
    _df.UnionByName(_df);
    _df.Intersect(_df);
    _df.Except(_df);

    _df.Sample(0.5);
    _df.Sample(0.5, true);
    _df.Sample(0.5, false, 12345);

    _df.RandomSplit(new[] { 0.2, 0.8 });
    _df.RandomSplit(new[] { 0.2, 0.8 }, 12345);

    _df.WithColumn("age2", _df["age"]);
    _df.WithColumnRenamed("age", "age2");

    _df.Drop();
    _df.Drop("age");
    _df.Drop("age", "name");
    _df.Drop(_df["age"]);

    _df.DropDuplicates();
    _df.DropDuplicates("age");
    _df.DropDuplicates("age", "name");

    _df.Describe();
    _df.Describe("age");
    _df.Describe("age", "name");

    _df.Summary();
    _df.Summary("count");
    _df.Summary("count", "mean");

    _df.Head(2);
    _df.Head();
    _df.First();
    _df.Take(3).ToArray();
    _df.Collect().ToArray();
    _df.ToLocalIterator().ToArray();
    _df.Count();

    _df.Repartition(2);
    _df.Repartition(2, _df["age"]);
    _df.Repartition(_df["age"]);
    _df.Repartition();

    _df.RepartitionByRange(2, _df["age"]);
    _df.RepartitionByRange(_df["age"]);

    _df.Coalesce(1);
    _df.Distinct();

    _df.Persist();
    _df.Cache();
    _df.Unpersist();

    _df.CreateTempView("view");
    _df.CreateOrReplaceTempView("view");
    _df.CreateGlobalTempView("global_view");
    _df.CreateOrReplaceGlobalTempView("global_view");
}
static void Main(string[] args)
{
    /* Copy mysql-connector-java-8.0.19.jar into the Spark / Hadoop folder.
     * Run the command below from this project's root folder:
     * %SPARK_HOME%\bin\spark-submit
     *   --master local
     *   --class org.apache.spark.deploy.dotnet.DotnetRunner
     *   bin\Debug\netcoreapp3.1\microsoft-spark-2-4_2.11-1.0.0.jar
     *   dotnet
     *   bin\Debug\netcoreapp3.1\BatchDemo.dll
     *   data\amostra.csv
     *   jdbc:mysql://localhost:3306/teste_spark beneficios spark_user my-secret-password
     */
    if (args.Length == 0)
    {
        throw new ArgumentException("Provide the paths where the CSV files can be found");
    }
    string arquivoEntrada = args[0];

    // Get a reference to the Spark execution context
    SparkSession spark = SparkSession
        .Builder()
        .AppName("Exemplo Batch")
        .GetOrCreate();

    // Define a fixed schema, with the column names I want and their types
    StructType schema = new StructType(new[]
    {
        new StructField("MES_REFERENCIA", new StringType()),
        new StructField("MES_COMPETENCIA", new StringType()),
        new StructField("UF", new StringType()),
        new StructField("CODIGO_MUNICIPIO", new IntegerType()),
        new StructField("MUNICIPIO", new StringType()),
        new StructField("CODIGO_FAVORECIDO", new StringType()),
        new StructField("NOME", new StringType()),
        new StructField("DATA_SAQUE", new DateType()),
        new StructField("VALOR_TEXTO", new StringType())
    });

    // Read the data from disk into Spark
    DataFrame df = spark.Read()
        .Format("csv")
        .Schema(schema)
        .Option("sep", ";")
        .Option("header", true)
        .Option("dateFormat", "dd/MM/yyyy")
        .Load(arquivoEntrada);

    df.PrintSchema();
    df.Show(5, 10);

    // Drop the columns we no longer need
    df = df.Drop("MES_REFERENCIA")
        .Drop("MES_COMPETENCIA")
        .Drop("CODIGO_MUNICIPIO")
        .Drop("CODIGO_FAVORECIDO");
    df.Show(5, 10);

    // Convert the VALOR column from string to decimal, since the Brazilian
    // number format differs from the American one
    df = df.WithColumn("VALOR", RegexpReplace(
                RegexpReplace(
                    df.Col("VALOR_TEXTO"),
                    "\\.", ""),
                ",", ".")
            .Cast("decimal(10,2)"))
        .Drop("VALOR_TEXTO");
    df.PrintSchema();
    df.Show(5, 10);

    // Apply a filter over the data
    df = df.Where(df.Col("UF").NotEqual("AC"));
    //df = df.Where("UF <> 'AC'"); // passing a WHERE expression also works as a filter
    df.Show(5, 10);

    spark.Udf().Register<string, string, string>("ConcatenarMunicipio",
        (uf, municipio) => ConcatenarMunicipio(uf, municipio));

    // Create a new column from a concatenation and drop old columns we no longer need
    df = df.WithColumn("MUNICIPIO",
            CallUDF("ConcatenarMunicipio", df.Col("UF"), df.Col("MUNICIPIO")))
        .Drop("UF");

    // Perform an aggregation
    DataFrame somatorio = df.GroupBy("MUNICIPIO")
        .Sum("VALOR")
        .WithColumnRenamed("sum(VALOR)", "SOMA_BENEFICIOS");
    somatorio
        .OrderBy(somatorio.Col("SOMA_BENEFICIOS").Desc())
        .Show(15, 40);

    if (args.Length >= 5)
    {
        string urlJdbc = args[1];  // jdbc:mysql://localhost:3306/teste_spark
        string tabela = args[2];   // beneficios
        string usuario = args[3];  // spark_user
        string senha = args[4];    // my-secret-password

        // Save into the database using Spark's native JDBC support
        somatorio
            .Write()
            .Format("jdbc")
            .Option("driver", "com.mysql.cj.jdbc.Driver")
            .Option("url", urlJdbc)
            .Option("dbtable", tabela)
            .Option("user", usuario)
            .Option("password", senha)
            .Mode(SaveMode.Overwrite)
            .Save();
    }

    spark.Stop();
}
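// The ConcatenarMunicipio helper wrapped by the UDF above is not part of this
// snippet. A minimal sketch, assuming the goal is simply to combine the
// municipality name with its state code (the exact separator is a guess):
private static string ConcatenarMunicipio(string uf, string municipio)
{
    // e.g. uf = "SP", municipio = "SAO PAULO" => "SAO PAULO - SP"
    return $"{municipio} - {uf}";
}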
public static void Main(string[] args)
{
    // Create Spark session.
    SparkSession spark = SparkSession
        .Builder()
        .AppName("Hyperspace example")
        .Config("spark.some.config.option", "some-value")
        .GetOrCreate();

    // Sample department records.
    var departments = new List<GenericRow>()
    {
        new GenericRow(new object[] { 10, "Accounting", "New York" }),
        new GenericRow(new object[] { 20, "Research", "Dallas" }),
        new GenericRow(new object[] { 30, "Sales", "Chicago" }),
        new GenericRow(new object[] { 40, "Operations", "Boston" })
    };

    // Sample employee records.
    var employees = new List<GenericRow>()
    {
        new GenericRow(new object[] { 7369, "SMITH", 20 }),
        new GenericRow(new object[] { 7499, "ALLEN", 30 }),
        new GenericRow(new object[] { 7521, "WARD", 30 }),
        new GenericRow(new object[] { 7566, "JONES", 20 }),
        new GenericRow(new object[] { 7698, "BLAKE", 30 }),
        new GenericRow(new object[] { 7782, "CLARK", 10 }),
        new GenericRow(new object[] { 7788, "SCOTT", 20 }),
        new GenericRow(new object[] { 7839, "KING", 10 }),
        new GenericRow(new object[] { 7844, "TURNER", 30 }),
        new GenericRow(new object[] { 7876, "ADAMS", 20 }),
        new GenericRow(new object[] { 7900, "JAMES", 30 }),
        new GenericRow(new object[] { 7934, "MILLER", 10 }),
        new GenericRow(new object[] { 7902, "FORD", 20 }),
        new GenericRow(new object[] { 7654, "MARTIN", 30 })
    };

    // Save example data records as Parquet.
    string deptLocation = "departments";
    spark.CreateDataFrame(departments, new StructType(new List<StructField>()
        {
            new StructField("deptId", new IntegerType()),
            new StructField("deptName", new StringType()),
            new StructField("location", new StringType())
        }))
        .Write()
        .Mode("overwrite")
        .Parquet(deptLocation);

    string empLocation = "employees";
    spark.CreateDataFrame(employees, new StructType(new List<StructField>()
        {
            new StructField("empId", new IntegerType()),
            new StructField("empName", new StringType()),
            new StructField("deptId", new IntegerType())
        }))
        .Write()
        .Mode("overwrite")
        .Parquet(empLocation);

    // Create Hyperspace indexes.
    var hyperspace = new Hyperspace(spark);

    DataFrame deptDF = spark.Read().Parquet(deptLocation);
    DataFrame empDF = spark.Read().Parquet(empLocation);

    var deptIndexConfig = new IndexConfig(
        "deptIndex",
        new[] { "deptId" },
        new[] { "deptName" });
    var empIndexConfig = new IndexConfig(
        "empIndex",
        new[] { "deptId" },
        new[] { "empName" });

    hyperspace.CreateIndex(deptDF, deptIndexConfig);
    hyperspace.CreateIndex(empDF, empIndexConfig);

    // List all indexes.
    hyperspace.Indexes().Show();

    // Enable Hyperspace to leverage indexes.
    spark.EnableHyperspace();

    // Example of index usage for filtered selection.
    DataFrame eqFilter = deptDF.Filter("deptId = 20").Select("deptName");
    eqFilter.Show();
    hyperspace.Explain(eqFilter, false);

    // Example of index usage for join.
    DataFrame eqJoin = empDF
        .Join(deptDF, "deptId")
        .Select(empDF.Col("empName"), deptDF.Col("deptName"));
    eqJoin.Show();
    hyperspace.Explain(eqJoin, false);

    // Stop Spark session.
    spark.Stop();
}
public void Run(string[] args)
{
    string kafkaBrokers = args[0];
    double maxSpeed = double.Parse(args[1]);

    // Get a reference to the Spark execution context
    SparkSession spark = SparkSession
        .Builder()
        .AppName("Credit Card Fraud")
        .GetOrCreate();

    // Without this setting each stage ran with 200 tasks, which made every batch take around 4 minutes
    spark.Conf().Set("spark.sql.shuffle.partitions", "1");

    // Create a dataframe that receives data from Kafka
    DataFrame df = spark
        .ReadStream()
        .Format("kafka")
        .Option("kafka.bootstrap.servers", kafkaBrokers)
        .Option("subscribe", "transactions")
        .Load()
        .SelectExpr("CAST(value AS STRING)");

    /* Create a schema to validate the JSON carried by the Kafka messages.
     * Sample JSON:
     * {
     *     "transaction":"431",
     *     "number":"0015-0000-0000-0000",
     *     "lat":-23.1618,
     *     "lng":-46.47201,
     *     "amount":91.01487,
     *     "category":"pets",
     *     "eventTime":"2021-01-05T19:07:19.3888"
     * }
     */
    var schema = new StructType(new[]
    {
        new StructField("transaction", new StringType()),
        new StructField("number", new StringType()),
        new StructField("lat", new DoubleType()),
        new StructField("lng", new DoubleType()),
        new StructField("amount", new DoubleType()),
        new StructField("category", new StringType()),
        new StructField("eventTime", new TimestampType())
    });

    // Parse the JSON into a struct column ...
    df = df.WithColumn("json", FromJson(
            df.Col("value"),
            schema.SimpleString))
        .Select("json.*"); // ... and project all of its fields as a new dataframe

    // Build two distinct dataframes so we can self-join and analyze the correlation between transactions
    DataFrame df1 = df
        .WithWatermark("eventTime", "7 minutes");
    DataFrame df2 = df
        .WithColumnRenamed("transaction", "transaction2")
        .WithColumnRenamed("lat", "lat2")
        .WithColumnRenamed("lng", "lng2")
        .WithColumnRenamed("eventTime", "eventTime2")
        .WithWatermark("eventTime2", "7 minutes");

    // Join to correlate credit card transactions made on the same card number
    DataFrame dfJoin = df1.Join(df2,
        df1.Col("number").EqualTo(df2.Col("number"))
            .And(Col("transaction").NotEqual(Col("transaction2")))
            .And(Col("eventTime2").Between(
                Col("eventTime"),
                Col("eventTime") + Expr("interval 5 minutes"))));

    // Register user-defined functions to be used on the dataframe
    spark.Udf().Register<double, double, double, double, double>("CalculateDistance",
        (lat1, lng1, lat2, lng2) => CalculateDistance(lat1, lng1, lat2, lng2));
    spark.Udf().Register<double, Timestamp, Timestamp, double>("CalculateSpeed",
        (dist, eventTime, eventTime2) => CalculateSpeed(dist, eventTime, eventTime2));

    // Create new columns to hold the UDF results
    dfJoin = dfJoin.WithColumn("dist",
        CallUDF("CalculateDistance", Col("lat"), Col("lng"), Col("lat2"), Col("lng2")));
    dfJoin = dfJoin.WithColumn("speed",
        CallUDF("CalculateSpeed", Col("dist"), Col("eventTime"), Col("eventTime2")));

    // Keep only the transactions whose implied speed exceeds the "maxSpeed" parameter
    dfJoin = dfJoin.Where(Col("speed").Gt(maxSpeed));

    // Start the streaming query
    StreamingQuery query = dfJoin
        .WriteStream()
        .Format("console")
        .Option("truncate", "false")
        .OutputMode(OutputMode.Append)
        .Start();

    query.AwaitTermination();
}
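// CalculateDistance and CalculateSpeed are referenced above but not defined in
// this snippet. A minimal sketch follows, assuming a haversine great-circle
// distance in kilometers and a speed in km/h derived from the event-time gap;
// the original helpers may differ.
private static double CalculateDistance(double lat1, double lng1, double lat2, double lng2)
{
    const double EarthRadiusKm = 6371.0;
    double ToRadians(double deg) => deg * Math.PI / 180.0;

    double dLat = ToRadians(lat2 - lat1);
    double dLng = ToRadians(lng2 - lng1);

    // Haversine formula
    double a = Math.Sin(dLat / 2) * Math.Sin(dLat / 2) +
               Math.Cos(ToRadians(lat1)) * Math.Cos(ToRadians(lat2)) *
               Math.Sin(dLng / 2) * Math.Sin(dLng / 2);
    double c = 2 * Math.Atan2(Math.Sqrt(a), Math.Sqrt(1 - a));

    return EarthRadiusKm * c;
}

private static double CalculateSpeed(double dist, Timestamp eventTime, Timestamp eventTime2)
{
    // Hours elapsed between the two transactions
    // (Timestamp.ToDateTime() comes from Microsoft.Spark.Sql.Types)
    double hours = Math.Abs((eventTime2.ToDateTime() - eventTime.ToDateTime()).TotalHours);

    // Guard against division by zero for simultaneous events
    return hours == 0 ? double.MaxValue : dist / hours;
}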