public void TestSignaturesV2_3_X()
{
    Assert.IsType<SparkContext>(_spark.SparkContext);

    Assert.IsType<Builder>(SparkSession.Builder());

    SparkSession.ClearDefaultSession();
    SparkSession.SetDefaultSession(_spark);
    Assert.IsType<SparkSession>(SparkSession.GetDefaultSession());

    Assert.IsType<RuntimeConfig>(_spark.Conf());

    Assert.IsType<SparkSession>(_spark.NewSession());

    Assert.IsType<DataFrameReader>(_spark.Read());

    Assert.IsType<DataFrame>(_spark.Range(10));
    Assert.IsType<DataFrame>(_spark.Range(10, 100));
    Assert.IsType<DataFrame>(_spark.Range(10, 100, 10));
    Assert.IsType<DataFrame>(_spark.Range(10, 100, 10, 5));

    _spark.Range(10).CreateOrReplaceTempView("testView");
    Assert.IsType<DataFrame>(_spark.Table("testView"));

    Assert.IsType<DataStreamReader>(_spark.ReadStream());

    Assert.IsType<UdfRegistration>(_spark.Udf());

    Assert.IsType<Catalog>(_spark.Catalog());
}
public static void runSpark(string file_path, string cores, string nodes, int nrows)
{
    // Create Spark session
    SparkSession spark = SparkSession
        .Builder()
        .AppName("word_count_sample")
        .Config("spark.executor.cores", cores)
        .Config("spark.executor.instances", nodes)
        .GetOrCreate();

    // Create initial DataFrame
    DataFrame dataFrame = spark
        .Read()
        .Option("header", true)
        .Option("inferSchema", true)
        .Schema("quizzes string, solutions string")
        .Csv(file_path);

    DataFrame dataFrame2 = dataFrame.Limit(nrows);

    spark.Udf().Register<string, string>(
        "SukoduUDF",
        (sudoku) => sudokusolution(sudoku));

    dataFrame2.CreateOrReplaceTempView("Resolved");
    DataFrame sqlDf = spark.Sql("SELECT quizzes, SukoduUDF(quizzes) as Resolution from Resolved");
    sqlDf.Show();

    spark.Stop();
    Console.WriteLine("SCRAPY");
}
public void Run(string[] args)
{
    string servidoresKafka = args[0];
    string topico = args[1];
    string modelo = args[2];

    // Get a reference to the Spark execution context
    SparkSession spark = SparkSession
        .Builder()
        .AppName("Exemplo Streaming com Kafka")
        .GetOrCreate();

    // Create a DataFrame that receives data from Kafka
    DataFrame df = spark
        .ReadStream()
        .Format("kafka")
        .Option("kafka.bootstrap.servers", servidoresKafka)
        .Option("subscribe", topico)
        .Load()
        .SelectExpr("CAST(value AS STRING)");

    /* Schema used to validate the JSON carried by the Kafka messages
     * Example JSON:
     * {
     *   "cliente": "Fulano",
     *   "produto": "Mochila",
     *   "opiniao": "Muito boa!"
     * }
     */
    var schema = new StructType(new[]
    {
        new StructField("cliente", new StringType()),
        new StructField("produto", new StringType()),
        new StructField("opiniao", new StringType())
    }); // struct<cliente:string,produto:string,valor_total:float>

    // Parse the JSON into a struct column ...
    df = df.WithColumn("json", Functions.FromJson(
                df.Col("value"),
                schema.SimpleString))
        .Select("json.*"); // ... and project all of its fields as a new DataFrame

    // Register a user-defined function to be used on the DataFrame
    spark.Udf().Register<string, float>("AnaliseDeSentimento",
        (texto) => AnalisarSentimento(texto, modelo));

    // Add a new column "nota" holding the result of the sentiment analysis
    df = df.WithColumn("nota", Functions.CallUDF("AnaliseDeSentimento", df.Col("opiniao")));

    // Start the streaming query
    StreamingQuery query = df
        .WriteStream()
        .OutputMode(OutputMode.Append)
        .Format("console")
        //.Trigger(Trigger.Continuous(2000))
        //.Foreach(new RedisForeachWriter())
        .Start();

    query.AwaitTermination(); // Required to keep the application running and processing data
}
static void runSpark(string file_path, string cores, string nodes, int nrows)
{
    // Create Spark session
    SparkSession spark = SparkSession
        .Builder()
        .AppName("Resolution de " + nrows + " sudokus par évolution combinatoire de "
            + cores + " noyau(x) et " + nodes + " noeud(s)")
        .Config("spark.executor.cores", cores)
        .Config("spark.executor.instances", nodes)
        .GetOrCreate();

    // Create initial DataFrame
    DataFrame dataFrame = spark
        .Read()
        .Option("header", true)
        .Option("inferSchema", true)
        .Schema("quizzes string, solutions string")
        .Csv(file_path);

    DataFrame dataFrame2 = dataFrame.Limit(nrows);

    spark.Udf().Register<string, string>(
        "SukoduUDF",
        (sudoku) => sudokusolution(sudoku));

    dataFrame2.CreateOrReplaceTempView("Resolved");
    DataFrame sqlDf = spark.Sql("SELECT quizzes, SukoduUDF(quizzes) as Resolution from Resolved");
    sqlDf.Show();

    spark.Stop();
}
private static void ElectronicsReviewsSentimentAnalysis(SparkSession spark)
{
    // Score each review with the sentiment UDF, cache the result, and expose it as a temp view
    spark.Udf().Register<string, int>("sentiment_udf", text => Sentiment(text));
    var reviewsSentiment = spark.Sql(
        "SELECT *, sentiment_udf(review_text) AS sentiment FROM ElectronicsReviews");
    reviewsSentiment.Cache();
    reviewsSentiment.CreateOrReplaceTempView("ElectronicsReviewSentiment");
}
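// The Sentiment helper called by the UDF above is not part of this snippet. Purely as an
// illustrative stand-in (the real project presumably scores reviews with a trained model;
// the word lists and the -1/0/1 scoring below are assumptions, and System.Linq is assumed):
private static int Sentiment(string text)
{
    if (string.IsNullOrWhiteSpace(text))
    {
        return 0;
    }

    string[] positive = { "great", "good", "excellent", "love" };
    string[] negative = { "bad", "poor", "terrible", "broken" };
    string lowered = text.ToLowerInvariant();

    int score = positive.Count(word => lowered.Contains(word))
              - negative.Count(word => lowered.Contains(word));

    // Collapse to -1 / 0 / 1 so the SQL column stays easy to aggregate.
    return Math.Sign(score);
}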
// Method called from Main to start a Spark session with a given number of cores and instances
// and run the sudoku resolution via the Sudokusolution() method.
//private static void Sudokures(string cores, string nodes, string mem, int nrows){
private static void Sudokures(int nrows)
{
    // Initialize the Spark session
    SparkSession spark = SparkSession
        .Builder()
        .Config("spark.executor.memory", "4G")
        .GetOrCreate();
        //.AppName("Resolution of " + nrows + " sudokus using DlxLib with " + cores + " cores and " + nodes + " instances")
        //.Config("spark.driver.cores", cores)
        //.Config("spark.executor.instances", nodes)
        //.Config("spark.executor.memory", mem)
        //.GetOrCreate();

    // Load the CSV into a DataFrame
    DataFrame df = spark
        .Read()
        .Option("header", true)
        .Option("inferSchema", true)
        .Csv(_filePath);

    // Limit the DataFrame to the number of rows requested by the caller
    DataFrame df2 = df.Limit(nrows);

    // Stopwatch covering only the sudoku resolution
    var watch2 = new System.Diagnostics.Stopwatch();
    watch2.Start();

    // Register the Spark user-defined function
    spark.Udf().Register<string, string>(
        "SukoduUDF",
        (sudoku) => Sudokusolution(sudoku));

    // Call the UDF from a new Spark DataFrame that will also hold the results
    df2.CreateOrReplaceTempView("Resolved");
    DataFrame sqlDf = spark.Sql("SELECT Sudokus, SukoduUDF(Sudokus) as Resolution from Resolved");
    sqlDf.Show();

    watch2.Stop();

    Console.WriteLine();
    Console.WriteLine();
    Console.WriteLine();
    Console.WriteLine();
    Console.WriteLine($"Execution Time for " + nrows + " sudoku resolution : " + watch2.ElapsedMilliseconds + " ms");
    //Console.WriteLine($"Execution Time for " + nrows + " sudoku resolution with " + cores + " core and " + nodes + " instance: " + watch2.ElapsedMilliseconds + " ms");
    Console.WriteLine();
    Console.WriteLine();
    Console.WriteLine();
    Console.WriteLine();

    spark.Stop();
}
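// Sudokusolution (and the equivalent sudokusolution helper in the other snippets) is not shown
// here; the commented-out AppName above suggests the real implementation uses DlxLib. Purely as
// a stand-in, a minimal backtracking sketch that takes and returns an 81-character puzzle string
// (System.Linq assumed; '.' or '0' marks an empty cell):
private static string Sudokusolution(string sudoku)
{
    int[] cells = sudoku.Select(c => c == '.' ? 0 : c - '0').ToArray();
    return Solve(cells, 0) ? string.Concat(cells) : sudoku;
}

private static bool Solve(int[] cells, int index)
{
    if (index == 81) return true;
    if (cells[index] != 0) return Solve(cells, index + 1);

    for (int candidate = 1; candidate <= 9; candidate++)
    {
        if (IsValid(cells, index, candidate))
        {
            cells[index] = candidate;
            if (Solve(cells, index + 1)) return true;
            cells[index] = 0;
        }
    }
    return false;
}

private static bool IsValid(int[] cells, int index, int candidate)
{
    int row = index / 9, col = index % 9;
    for (int i = 0; i < 9; i++)
    {
        // Check the row and the column of the target cell.
        if (cells[row * 9 + i] == candidate || cells[i * 9 + col] == candidate) return false;

        // Check the 3x3 box containing the target cell.
        int boxRow = 3 * (row / 3) + i / 3, boxCol = 3 * (col / 3) + i % 3;
        if (cells[boxRow * 9 + boxCol] == candidate) return false;
    }
    return true;
}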
public void Run(string[] args)
{
    if (args.Length != 1)
    {
        Console.Error.WriteLine(
            "Usage: GitHubProjects <path to projects.csv>");
        Environment.Exit(1);
    }

    SparkSession spark = SparkSession
        .Builder()
        .AppName("GitHub and Spark Batch")
        .GetOrCreate();

    DataFrame projectsDf = spark
        .Read()
        .Schema("id INT, url STRING, owner_id INT, " +
            "name STRING, descriptor STRING, language STRING, " +
            "created_at STRING, forked_from INT, deleted STRING, " +
            "updated_at STRING")
        .Csv(args[0]);

    projectsDf.Show();

    // Drop any rows with NA values
    DataFrameNaFunctions dropEmptyProjects = projectsDf.Na();
    DataFrame cleanedProjects = dropEmptyProjects.Drop("any");

    // Remove unnecessary columns
    cleanedProjects = cleanedProjects.Drop("id", "url", "owner_id");
    cleanedProjects.Show();

    // Average number of times each language has been forked
    DataFrame groupedDF = cleanedProjects
        .GroupBy("language")
        .Agg(Avg(cleanedProjects["forked_from"]));

    // Sort by most forked languages first
    groupedDF.OrderBy(Desc("avg(forked_from)")).Show();

    spark.Udf().Register<string, bool>(
        "MyUDF",
        (date) => DateTime.TryParse(date, out DateTime convertedDate) &&
            (convertedDate > s_referenceDate));

    cleanedProjects.CreateOrReplaceTempView("dateView");

    DataFrame dateDf = spark.Sql(
        "SELECT *, MyUDF(dateView.updated_at) AS datebefore FROM dateView");
    dateDf.Show();

    spark.Stop();
}
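// The s_referenceDate field used by MyUDF above is declared elsewhere in the class; it only
// needs to hold the cut-off date the UDF compares against. A sketch (the value below is
// chosen arbitrarily for illustration, not taken from the original):
private static readonly DateTime s_referenceDate = new DateTime(2015, 10, 20);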
public void Run(string[] args)
{
    if (args.Length != 3)
    {
        Console.Error.WriteLine(
            "Usage: SentimentAnalysisStream <host> <port> <model path>");
        Environment.Exit(1);
    }

    // Create Spark Session
    SparkSession spark = SparkSession
        .Builder()
        .AppName("Streaming Sentiment Analysis")
        .GetOrCreate();

    // Setup stream connection info
    string hostname = args[0];
    string port = args[1];

    // Read streaming data into DataFrame
    DataFrame words = spark
        .ReadStream()
        .Format("socket")
        .Option("host", hostname)
        .Option("port", port)
        .Load();

    // Use ML.NET in a UDF to evaluate each incoming entry
    spark.Udf().Register<string, bool>(
        "MLudf",
        input => Sentiment(input, args[2]));

    // Use Spark SQL to call ML.NET UDF
    // Display results of sentiment analysis on each entry
    words.CreateOrReplaceTempView("WordsSentiment");
    DataFrame sqlDf = spark
        .Sql("SELECT WordsSentiment.value, MLudf(WordsSentiment.value) FROM WordsSentiment");

    // Handle data continuously as it arrives
    StreamingQuery query = sqlDf
        .WriteStream()
        .Format("console")
        .Start();

    query.AwaitTermination();
}
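// The Sentiment(input, modelPath) helper wraps an ML.NET model that is not included in this
// snippet. A minimal sketch of how such a helper is typically written with ML.NET; the Review
// and ReviewPrediction classes, their column names, and loading the model on every call are
// assumptions for illustration (Microsoft.ML and Microsoft.ML.Data assumed referenced):
private class Review
{
    public string Text { get; set; }
}

private class ReviewPrediction
{
    [ColumnName("PredictedLabel")]
    public bool Sentiment { get; set; }
}

private static bool Sentiment(string input, string modelPath)
{
    var mlContext = new MLContext();

    // Load the trained model from disk and build a prediction engine for single rows.
    // In practice the engine would be cached rather than rebuilt per call.
    ITransformer model = mlContext.Model.Load(modelPath, out _);
    PredictionEngine<Review, ReviewPrediction> engine =
        mlContext.Model.CreatePredictionEngine<Review, ReviewPrediction>(model);

    return engine.Predict(new Review { Text = input }).Sentiment;
}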
public void Run(string[] args)
{
    if (args.Length != 2)
    {
        Console.Error.WriteLine(
            "Usage: <path to yelptest.csv> <path to MLModel.zip>");
        Environment.Exit(1);
    }

    SparkSession spark = SparkSession
        .Builder()
        .AppName(".NET for Apache Spark Sentiment Analysis")
        .GetOrCreate();

    // Read in and display Yelp reviews
    DataFrame df = spark
        .Read()
        .Option("header", true)
        .Option("inferSchema", true)
        .Csv(args[0]);
    df.Show();

    // Use ML.NET in a UDF to evaluate each review
    spark.Udf().Register<string, bool>(
        "MLudf",
        (text) => Sentiment(text, args[1]));

    // Use Spark SQL to call ML.NET UDF
    // Display results of sentiment analysis on reviews
    df.CreateOrReplaceTempView("Reviews");
    DataFrame sqlDf = spark.Sql("SELECT ReviewText, MLudf(ReviewText) FROM Reviews");
    sqlDf.Show();

    // Print out first 20 rows of data
    // Prevent data getting cut off by setting truncate = 0
    sqlDf.Show(20, 0, false);

    spark.Stop();
}
public void Run(string[] args)
{
    if (args.Length != 1)
    {
        Console.Error.WriteLine(
            "Usage: Basic <path to SPARK_HOME/examples/src/main/resources/people.json>");
        Environment.Exit(1);
    }

    SparkSession spark = SparkSession
        .Builder()
        .AppName(".NET Spark SQL basic example")
        .Config("spark.some.config.option", "some-value")
        .GetOrCreate();

    // Need to explicitly specify the schema since pickling vs. arrow formatting
    // will return different types. Pickling will turn longs into ints if the values fit.
    // Same as the "age INT, name STRING" DDL-format string.
    var inputSchema = new StructType(new[]
    {
        new StructField("age", new IntegerType()),
        new StructField("name", new StringType())
    });
    DataFrame df = spark.Read().Schema(inputSchema).Json(args[0]);

    Spark.Sql.Types.StructType schema = df.Schema();
    Console.WriteLine(schema.SimpleString);

    IEnumerable<Row> rows = df.Collect();
    foreach (Row row in rows)
    {
        Console.WriteLine(row);
    }

    df.Show();

    df.PrintSchema();

    df.Select("name", "age", "age", "name").Show();

    df.Select(df["name"], df["age"] + 1).Show();

    df.Filter(df["age"] > 21).Show();

    df.GroupBy("age")
        .Agg(Avg(df["age"]), Avg(df["age"]), CountDistinct(df["age"], df["age"]))
        .Show();

    df.CreateOrReplaceTempView("people");

    // Registering Udf for SQL expression.
    DataFrame sqlDf = spark.Sql("SELECT * FROM people");
    sqlDf.Show();

    spark.Udf().Register<int?, string, string>(
        "my_udf",
        (age, name) => name + " with " + ((age.HasValue) ? age.Value.ToString() : "null"));

    sqlDf = spark.Sql("SELECT my_udf(*) FROM people");
    sqlDf.Show();

    // Using UDF via data frames.
    Func<Column, Column, Column> addition = Udf<int?, string, string>(
        (age, name) => name + " is " + (age.HasValue ? age.Value + 10 : 0));
    df.Select(addition(df["age"], df["name"])).Show();

    // Chaining example:
    Func<Column, Column> addition2 = Udf<string, string>(str => $"hello {str}!");
    df.Select(addition2(addition(df["age"], df["name"]))).Show();

    // Multiple UDF example:
    df.Select(addition(df["age"], df["name"]), addition2(df["name"])).Show();

    // UDF return type as array.
    Func<Column, Column> udfArray =
        Udf<string, string[]>((str) => new string[] { str, str + str });
    df.Select(Explode(udfArray(df["name"]))).Show();

    // UDF return type as map.
    Func<Column, Column> udfMap =
        Udf<string, IDictionary<string, string[]>>(
            (str) => new Dictionary<string, string[]> { { str, new[] { str, str } } });
    df.Select(udfMap(df["name"]).As("UdfMap")).Show(truncate: 50);

    // Joins.
    DataFrame joinedDf = df.Join(df, "name");
    joinedDf.Show();

    DataFrame joinedDf2 = df.Join(df, new[] { "name", "age" });
    joinedDf2.Show();

    DataFrame joinedDf3 = df.Join(df, df["name"] == df["name"], "outer");
    joinedDf3.Show();

    spark.Stop();
}
public void TestVectorUdf()
{
    Func<Int32Array, StringArray, StringArray> udf1Func =
        (ages, names) => (StringArray)ToArrowArray(
            Enumerable.Range(0, names.Length)
                .Select(i => $"{names.GetString(i)} is {ages.GetValue(i) ?? 0}")
                .ToArray());

    // Single UDF.
    Func<Column, Column, Column> udf1 = VectorUdf(udf1Func);
    {
        Row[] rows = _df.Select(udf1(_df["age"], _df["name"])).Collect().ToArray();
        Assert.Equal(3, rows.Length);
        Assert.Equal("Michael is 0", rows[0].GetAs<string>(0));
        Assert.Equal("Andy is 30", rows[1].GetAs<string>(0));
        Assert.Equal("Justin is 19", rows[2].GetAs<string>(0));
    }

    // Chained UDFs.
    Func<Column, Column> udf2 = VectorUdf<StringArray, StringArray>(
        (strings) => (StringArray)ToArrowArray(
            Enumerable.Range(0, strings.Length)
                .Select(i => $"hello {strings.GetString(i)}!")
                .ToArray()));
    {
        Row[] rows = _df
            .Select(udf2(udf1(_df["age"], _df["name"])))
            .Collect()
            .ToArray();
        Assert.Equal(3, rows.Length);
        Assert.Equal("hello Michael is 0!", rows[0].GetAs<string>(0));
        Assert.Equal("hello Andy is 30!", rows[1].GetAs<string>(0));
        Assert.Equal("hello Justin is 19!", rows[2].GetAs<string>(0));
    }

    // Multiple UDFs:
    {
        Row[] rows = _df
            .Select(udf1(_df["age"], _df["name"]), udf2(_df["name"]))
            .Collect()
            .ToArray();
        Assert.Equal(3, rows.Length);
        Assert.Equal("Michael is 0", rows[0].GetAs<string>(0));
        Assert.Equal("hello Michael!", rows[0].GetAs<string>(1));
        Assert.Equal("Andy is 30", rows[1].GetAs<string>(0));
        Assert.Equal("hello Andy!", rows[1].GetAs<string>(1));
        Assert.Equal("Justin is 19", rows[2].GetAs<string>(0));
        Assert.Equal("hello Justin!", rows[2].GetAs<string>(1));
    }

    // Register UDF
    {
        _df.CreateOrReplaceTempView("people");
        _spark.Udf().RegisterVector("udf1", udf1Func);
        Row[] rows = _spark.Sql("SELECT udf1(age, name) FROM people")
            .Collect()
            .ToArray();
        Assert.Equal(3, rows.Length);
        Assert.Equal("Michael is 0", rows[0].GetAs<string>(0));
        Assert.Equal("Andy is 30", rows[1].GetAs<string>(0));
        Assert.Equal("Justin is 19", rows[2].GetAs<string>(0));
    }
}
public void TestUdfRegistrationWithReturnAsRowType()
{
    // Test UDF that returns a Row object with two columns;
    // the query selects only the UDF result.
    {
        var schema = new StructType(new[]
        {
            new StructField("col1", new IntegerType()),
            new StructField("col2", new StringType())
        });
        _df.CreateOrReplaceTempView("people");
        _spark.Udf().Register<string>(
            "udf1",
            str => new GenericRow(new object[] { 1, "abc" }),
            schema);

        Row[] rows = _spark.Sql("SELECT udf1(name) AS col FROM people")
            .Collect()
            .ToArray();
        Assert.Equal(3, rows.Length);

        foreach (Row row in rows)
        {
            Assert.Equal(1, row.Size());
            Row outerCol = row.GetAs<Row>("col");
            Assert.Equal(2, outerCol.Size());
            Assert.Equal(1, outerCol.GetAs<int>("col1"));
            Assert.Equal("abc", outerCol.GetAs<string>("col2"));
        }
    }

    // Test UDF that returns a Row object with a single column,
    // selected alongside a regular column.
    {
        var schema = new StructType(new[]
        {
            new StructField("col1", new IntegerType())
        });
        _df.CreateOrReplaceTempView("people");
        _spark.Udf().Register<string>(
            "udf2",
            str => new GenericRow(new object[] { 111 }),
            schema);

        Row[] rows = _spark.Sql("SELECT udf2(name) AS col, name FROM people")
            .Collect()
            .ToArray();
        Assert.Equal(3, rows.Length);

        foreach (Row row in rows)
        {
            Assert.Equal(2, row.Size());
            Row col1 = row.GetAs<Row>("col");
            Assert.Equal(1, col1.Size());
            Assert.Equal(111, col1.GetAs<int>("col1"));
            string col2 = row.GetAs<string>("name");
            Assert.NotEmpty(col2);
        }
    }

    // Test UDF that returns a nested Row object.
    {
        var subSchema1 = new StructType(new[]
        {
            new StructField("col1", new IntegerType()),
        });
        var subSchema2 = new StructType(new[]
        {
            new StructField("col1", new StringType()),
            new StructField("col2", subSchema1),
        });
        var schema = new StructType(new[]
        {
            new StructField("col1", new IntegerType()),
            new StructField("col2", subSchema1),
            new StructField("col3", subSchema2)
        });

        _df.CreateOrReplaceTempView("people");
        _spark.Udf().Register<string>(
            "udf3",
            str => new GenericRow(
                new object[]
                {
                    1,
                    new GenericRow(new object[] { 1 }),
                    new GenericRow(new object[]
                    {
                        "abc",
                        new GenericRow(new object[] { 10 })
                    })
                }),
            schema);

        Row[] rows = _spark.Sql("SELECT udf3(name) AS col FROM people")
            .Collect()
            .ToArray();
        Assert.Equal(3, rows.Length);

        foreach (Row row in rows)
        {
            Assert.Equal(1, row.Size());
            Row outerCol = row.GetAs<Row>("col");
            Assert.Equal(3, outerCol.Size());
            Assert.Equal(1, outerCol.GetAs<int>("col1"));
            Assert.Equal(
                new Row(new object[] { 1 }, subSchema1),
                outerCol.GetAs<Row>("col2"));
            Assert.Equal(
                new Row(
                    new object[] { "abc", new Row(new object[] { 10 }, subSchema1) },
                    subSchema2),
                outerCol.GetAs<Row>("col3"));
        }
    }

    // Chained UDFs.
    {
        var schema = new StructType(new[]
        {
            new StructField("col1", new IntegerType()),
            new StructField("col2", new StringType())
        });
        _df.CreateOrReplaceTempView("people");
        _spark.Udf().Register<string>(
            "udf4",
            str => new GenericRow(new object[] { 1, str }),
            schema);

        _spark.Udf().Register<Row, string>(
            "udf5",
            row => row.GetAs<string>(1));

        Row[] rows = _spark.Sql("SELECT udf5(udf4(name)) FROM people")
            .Collect()
            .ToArray();
        Assert.Equal(3, rows.Length);

        var expected = new string[] { "Michael", "Andy", "Justin" };
        for (int i = 0; i < rows.Length; ++i)
        {
            Assert.Equal(1, rows[i].Size());
            Assert.Equal(expected[i], rows[i].GetAs<string>(0));
        }
    }
}
static void Main(string[] args)
{
    // Initialize Session
    SparkSession ss = SparkSession
        .Builder()
        .AppName("Working with DataFrames")
        .GetOrCreate();

    // Read Data
    DataFrame businesses = ss
        .Read()
        .Option("header", "true")
        .Option("inferSchema", "true")
        .Csv("Data/NYC-Restaurant-Inspections.csv");
    businesses = businesses.Select("CAMIS", "DBA", "BORO", "CUISINE DESCRIPTION");

    DataFrame inspections = ss
        .Read()
        .Option("header", "true")
        .Option("inferSchema", "true")
        .Csv("Data/NYC-Restaurant-Inspections.csv");
    inspections = inspections.Select("CAMIS", "INSPECTION DATE", "VIOLATION CODE",
        "CRITICAL FLAG", "SCORE", "GRADE", "INSPECTION TYPE");

    // Select columns
    businesses.Select(Col("CAMIS"), Col("DBA")).Show(1);
    inspections.Select(inspections["VIOLATION CODE"]).Show(1);

    // Filter
    businesses
        .Filter(Col("BORO") == "Manhattan")
        .Select("DBA", "BORO")
        .Show(3);

    // Group / Aggregate
    businesses
        .GroupBy("CUISINE DESCRIPTION")
        .Agg(Count("CUISINE DESCRIPTION").Alias("CUISINE COUNT"))
        .Show(10);

    // Order
    businesses
        .GroupBy("CUISINE DESCRIPTION")
        .Agg(Count("CUISINE DESCRIPTION").Alias("CUISINE COUNT"))
        .OrderBy(Col("CUISINE COUNT").Desc())
        .Show(3);

    // Join
    DataFrame joinedDf = businesses
        .Join(inspections, "CAMIS")
        .Select(Col("DBA"), Col("CUISINE DESCRIPTION"), Col("GRADE"));
    joinedDf.Show(5);

    // SQL
    businesses.CreateOrReplaceTempView("businesses");
    inspections.CreateOrReplaceTempView("inspections");
    ss.Sql(@"SELECT b.DBA, b.`CUISINE DESCRIPTION`, i.GRADE
             FROM businesses b JOIN inspections i ON b.CAMIS = i.CAMIS").Show(5);

    // UDF
    ss.Udf().Register<string, string>("Tupper", Tupper);
    inspections
        .Select(CallUDF("Tupper", Col("INSPECTION TYPE")).Alias("CAPITALIZED"))
        .Show(3);

    // Save
    joinedDf
        .Write()
        .Mode(SaveMode.Overwrite)
        .Csv("output");
}
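// The Tupper helper registered as a UDF above is not defined in this snippet; given how it is
// used (an alias of "CAPITALIZED"), a plausible sketch is a simple upper-casing one-liner
// (the implementation below is an assumption, not the original):
private static string Tupper(string value) => value?.ToUpper();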
static void Main(string[] args)
{
    /*
     * Copy mysql-connector-java-8.0.19.jar into the Spark / Hadoop folder.
     * Run the command below from this project's root folder:
     * %SPARK_HOME%\bin\spark-submit
     * --master local
     * --class org.apache.spark.deploy.dotnet.DotnetRunner
     * bin\Debug\netcoreapp3.1\microsoft-spark-2-4_2.11-1.0.0.jar
     * dotnet
     * bin\Debug\netcoreapp3.1\BatchDemo.dll
     * data\amostra.csv
     * jdbc:mysql://localhost:3306/teste_spark beneficios spark_user my-secret-password
     */
    if (args.Length == 0)
    {
        throw new ArgumentException("Informar os caminhos onde encontrar os arquivos CSV");
    }

    string arquivoEntrada = args[0];

    // Get a reference to the Spark execution context
    SparkSession spark = SparkSession
        .Builder()
        .AppName("Exemplo Batch")
        .GetOrCreate();

    // Define a fixed schema, with the column names I want and their types
    StructType schema = new StructType(new[]
    {
        new StructField("MES_REFERENCIA", new StringType()),
        new StructField("MES_COMPETENCIA", new StringType()),
        new StructField("UF", new StringType()),
        new StructField("CODIGO_MUNICIPIO", new IntegerType()),
        new StructField("MUNICIPIO", new StringType()),
        new StructField("CODIGO_FAVORECIDO", new StringType()),
        new StructField("NOME", new StringType()),
        new StructField("DATA_SAQUE", new DateType()),
        new StructField("VALOR_TEXTO", new StringType())
    });

    // Read the data from disk into Spark
    DataFrame df = spark.Read()
        .Format("csv")
        .Schema(schema)
        .Option("sep", ";")
        .Option("header", true)
        .Option("dateFormat", "dd/MM/yyyy")
        .Load(arquivoEntrada);

    df.PrintSchema();
    df.Show(5, 10);

    // Drop columns we no longer need
    df = df.Drop("MES_REFERENCIA")
        .Drop("MES_COMPETENCIA")
        .Drop("CODIGO_MUNICIPIO")
        .Drop("CODIGO_FAVORECIDO");
    df.Show(5, 10);

    // Convert the VALOR column from string to decimal, since the Brazilian number format differs from the US one
    df = df.WithColumn("VALOR",
            RegexpReplace(
                RegexpReplace(df.Col("VALOR_TEXTO"), "\\.", ""),
                ",", ".")
            .Cast("decimal(10,2)"))
        .Drop("VALOR_TEXTO");
    df.PrintSchema();
    df.Show(5, 10);

    // Apply a filter to the data
    df = df.Where(df.Col("UF").NotEqual("AC"));
    //df = df.Where("UF <> 'AC'"); // passing a WHERE expression also works as a filter
    df.Show(5, 10);

    spark.Udf().Register<string, string, string>("ConcatenarMunicipio",
        (uf, municipio) => ConcatenarMunicipio(uf, municipio));

    // Create a new column from a concatenation and drop old columns we no longer need
    df = df.WithColumn("MUNICIPIO",
            CallUDF("ConcatenarMunicipio", df.Col("UF"), df.Col("MUNICIPIO")))
        .Drop("UF");

    // Perform an aggregation
    DataFrame somatorio = df.GroupBy("MUNICIPIO")
        .Sum("VALOR")
        .WithColumnRenamed("sum(VALOR)", "SOMA_BENEFICIOS");

    somatorio
        .OrderBy(somatorio.Col("SOMA_BENEFICIOS").Desc())
        .Show(15, 40);

    if (args.Length >= 5)
    {
        string urlJdbc = args[1];  // jdbc:mysql://localhost:3306/teste_spark
        string tabela = args[2];   // beneficios
        string usuario = args[3];  // spark_user
        string senha = args[4];    // my-secret-password

        // Save to the database using Spark's native JDBC support
        somatorio
            .Write()
            .Format("jdbc")
            .Option("driver", "com.mysql.cj.jdbc.Driver")
            .Option("url", urlJdbc)
            .Option("dbtable", tabela)
            .Option("user", usuario)
            .Option("password", senha)
            .Mode(SaveMode.Overwrite)
            .Save();
    }

    spark.Stop();
}
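// The ConcatenarMunicipio helper registered as a UDF above is not shown; judging by its use it
// just joins the municipality and state columns into a single string. A plausible sketch (the
// exact separator is an assumption):
private static string ConcatenarMunicipio(string uf, string municipio) => $"{municipio} - {uf}";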
public void Run(string[] args)
{
    if (args.Length != 1)
    {
        Console.Error.WriteLine(
            "Usage: Logging <path to Apache User Logs>");
        Environment.Exit(1);
    }

    SparkSession spark = SparkSession
        .Builder()
        .AppName("Apache User Log Processing")
        .GetOrCreate();

    // Read input log file and display it
    DataFrame df = spark.Read().Text(args[0]);
    df.Show();

    // Step 1: UDF to determine if each line is a valid log entry
    // Remove any invalid entries before further filtering
    spark.Udf().Register<string, bool>(
        "GeneralReg",
        log => Regex.IsMatch(log, s_apacheRx));

    df.CreateOrReplaceTempView("Logs");

    // Apply the UDF to get valid log entries
    DataFrame generalDf = spark.Sql(
        "SELECT logs.value, GeneralReg(logs.value) FROM Logs");

    // Only keep log entries that matched the reg ex
    generalDf = generalDf.Filter(generalDf["GeneralReg(value)"]);
    generalDf.Show();

    // View the resulting schema
    // Notice we created a new column "GeneralReg(value)"
    generalDf.PrintSchema();

    // Step 2: Choose valid log entries that start with 10
    spark.Udf().Register<string, bool>(
        "IPReg",
        log => Regex.IsMatch(log, "^(?=10)"));

    generalDf.CreateOrReplaceTempView("IPLogs");

    // Apply UDF to get valid log entries starting with 10
    // Use SQL "WHERE" rather than doing ipDf.Filter(),
    // which avoids creating an extra column "IPReg(value)"
    DataFrame ipDf = spark.Sql(
        "SELECT iplogs.value FROM IPLogs WHERE IPReg(iplogs.value)");
    ipDf.Show();

    // Step 3: Choose valid log entries that start
    // with 10 and deal with spam
    spark.Udf().Register<string, bool>(
        "SpamRegEx",
        log => Regex.IsMatch(log, "\\b(?=spam)\\b"));

    ipDf.CreateOrReplaceTempView("SpamLogs");

    // Apply UDF to get valid, start with 10, spam entries
    DataFrame spamDF = spark.Sql(
        "SELECT spamlogs.value FROM SpamLogs WHERE SpamRegEx(spamlogs.value)");

    // Let's explore the columns in the data we have filtered
    // Use LINQ to count the number of GET requests
    int numGetRequests = spamDF
        .Collect()
        .Where(r => ContainsGet(r.GetAs<string>("value")))
        .Count();

    Console.WriteLine("Number of GET requests: " + numGetRequests);

    spark.Stop();
}
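// ContainsGet, used in the LINQ count above, is defined elsewhere in the sample; a minimal
// sketch that simply checks whether the log line records an HTTP GET request (the exact check
// is an assumption):
private static bool ContainsGet(string logLine) => logLine != null && logLine.Contains("GET");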
public void Run(string[] args)
{
    string kafkaBrokers = args[0];
    double maxSpeed = double.Parse(args[1]);

    // Get a reference to the Spark execution context
    SparkSession spark = SparkSession
        .Builder()
        .AppName("Credit Card Fraud")
        .GetOrCreate();

    // Without this setting each stage ran with 200 tasks, which made every batch take around 4 minutes
    spark.Conf().Set("spark.sql.shuffle.partitions", "1");

    // Create a DataFrame that receives data from Kafka
    DataFrame df = spark
        .ReadStream()
        .Format("kafka")
        .Option("kafka.bootstrap.servers", kafkaBrokers)
        .Option("subscribe", "transactions")
        .Load()
        .SelectExpr("CAST(value AS STRING)");

    /* Schema used to validate the JSON carried by the Kafka messages
     * Example JSON:
     * {
     *   "transaction":"431",
     *   "number":"0015-0000-0000-0000",
     *   "lat":-23.1618,
     *   "lng":-46.47201,
     *   "amount":91.01487,
     *   "category":"pets",
     *   "eventTime":"2021-01-05T19:07:19.3888"
     * }
     */
    var schema = new StructType(new[]
    {
        new StructField("transaction", new StringType()),
        new StructField("number", new StringType()),
        new StructField("lat", new DoubleType()),
        new StructField("lng", new DoubleType()),
        new StructField("amount", new DoubleType()),
        new StructField("category", new StringType()),
        new StructField("eventTime", new TimestampType())
    });

    // Parse the JSON into a struct column ...
    df = df.WithColumn("json", FromJson(
                df.Col("value"),
                schema.SimpleString))
        .Select("json.*"); // ... and project all of its fields as a new DataFrame

    // Create two distinct DataFrames so we can join them and analyze the correlation between transactions
    DataFrame df1 = df
        .WithWatermark("eventTime", "7 minutes");
    DataFrame df2 = df
        .WithColumnRenamed("transaction", "transaction2")
        .WithColumnRenamed("lat", "lat2")
        .WithColumnRenamed("lng", "lng2")
        .WithColumnRenamed("eventTime", "eventTime2")
        .WithWatermark("eventTime2", "7 minutes");

    // Join the two streams to correlate credit card transactions
    DataFrame dfJoin = df1.Join(df2,
        df1.Col("number").EqualTo(df2.Col("number"))
            .And(Col("transaction").NotEqual(Col("transaction2")))
            .And(Col("eventTime2").Between(Col("eventTime"), Col("eventTime") + Expr("interval 5 minutes"))));

    // Register user-defined functions to be used on the DataFrame
    spark.Udf().Register<double, double, double, double, double>("CalculateDistance",
        (lat1, lng1, lat2, lng2) => CalculateDistance(lat1, lng1, lat2, lng2));
    spark.Udf().Register<double, Timestamp, Timestamp, double>("CalculateSpeed",
        (dist, eventTime, eventTime2) => CalculateSpeed(dist, eventTime, eventTime2));

    // Add new columns holding the results of the UDFs
    dfJoin = dfJoin.WithColumn("dist", CallUDF("CalculateDistance",
        Col("lat"), Col("lng"), Col("lat2"), Col("lng2")));
    dfJoin = dfJoin.WithColumn("speed", CallUDF("CalculateSpeed",
        Col("dist"), Col("eventTime"), Col("eventTime2")));

    // Keep only the transactions whose implied speed exceeds the threshold (the "maxSpeed" parameter)
    dfJoin = dfJoin.Where(Col("speed").Gt(maxSpeed));

    // Start the streaming query
    StreamingQuery query = dfJoin
        .WriteStream()
        .Format("console")
        .Option("truncate", "false")
        .OutputMode(OutputMode.Append)
        .Start();

    query.AwaitTermination();
}
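// CalculateDistance and CalculateSpeed are registered as UDFs above but not included in this
// snippet. A minimal sketch: haversine distance in kilometres and speed in km/h derived from
// the two event timestamps. The formulas are standard; the helper signatures, the use of
// kilometres, and the assumption that Microsoft.Spark's Timestamp exposes ToDateTime() are
// illustrative, not taken from the original:
private static double CalculateDistance(double lat1, double lng1, double lat2, double lng2)
{
    const double earthRadiusKm = 6371.0;
    double dLat = ToRadians(lat2 - lat1);
    double dLng = ToRadians(lng2 - lng1);

    // Haversine formula for great-circle distance between two coordinates.
    double a = Math.Sin(dLat / 2) * Math.Sin(dLat / 2) +
               Math.Cos(ToRadians(lat1)) * Math.Cos(ToRadians(lat2)) *
               Math.Sin(dLng / 2) * Math.Sin(dLng / 2);

    return earthRadiusKm * 2 * Math.Atan2(Math.Sqrt(a), Math.Sqrt(1 - a));
}

private static double ToRadians(double degrees) => degrees * Math.PI / 180.0;

private static double CalculateSpeed(double distKm, Timestamp eventTime, Timestamp eventTime2)
{
    // Speed implied by covering distKm in the interval between the two transactions.
    double hours = Math.Abs((eventTime2.ToDateTime() - eventTime.ToDateTime()).TotalHours);
    return hours == 0 ? double.MaxValue : distKm / hours;
}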