Example #1
        public void TestSignaturesV2_3_X()
        {
            Assert.IsType<SparkContext>(_spark.SparkContext);

            Assert.IsType<Builder>(SparkSession.Builder());

            SparkSession.ClearDefaultSession();
            SparkSession.SetDefaultSession(_spark);
            Assert.IsType<SparkSession>(SparkSession.GetDefaultSession());

            Assert.IsType<RuntimeConfig>(_spark.Conf());

            Assert.IsType<SparkSession>(_spark.NewSession());

            Assert.IsType<DataFrameReader>(_spark.Read());

            Assert.IsType<DataFrame>(_spark.Range(10));
            Assert.IsType<DataFrame>(_spark.Range(10, 100));
            Assert.IsType<DataFrame>(_spark.Range(10, 100, 10));
            Assert.IsType<DataFrame>(_spark.Range(10, 100, 10, 5));

            _spark.Range(10).CreateOrReplaceTempView("testView");
            Assert.IsType<DataFrame>(_spark.Table("testView"));

            Assert.IsType<DataStreamReader>(_spark.ReadStream());

            Assert.IsType<UdfRegistration>(_spark.Udf());

            Assert.IsType<Catalog>(_spark.Catalog());
        }
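
These assertions assume a shared _spark session created once for the whole test class. A minimal sketch of such a fixture, assuming Microsoft.Spark and xUnit (names are illustrative, not the project's actual fixture):

    using Microsoft.Spark.Sql;

    // Hypothetical fixture: one SparkSession shared by all signature tests.
    public class SparkFixture : System.IDisposable
    {
        public SparkSession Spark { get; } =
            SparkSession
                .Builder()
                .AppName("signature-tests")
                .GetOrCreate();

        public void Dispose() => Spark.Stop();
    }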
Example #2
        public static void runSpark(string file_path, string cores, string nodes, int nrows)
        {
            // Create Spark session
            SparkSession spark =
                SparkSession
                .Builder()
                .AppName("word_count_sample")
                .Config("spark.executor.cores", cores)
                .Config("spark.executor.instances", nodes)
                .GetOrCreate();

            // Create initial DataFrame

            DataFrame dataFrame = spark
                                  .Read()
                                  .Option("header", true)
                                  .Option("inferSchema", true)
                                  .Schema("quizzes string, solutions string")
                                  .Csv(file_path);

            DataFrame dataFrame2 = dataFrame.Limit(nrows);

            spark.Udf().Register<string, string>(
                "SukoduUDF",
                (sudoku) => sudokusolution(sudoku));

            dataFrame2.CreateOrReplaceTempView("Resolved");
            DataFrame sqlDf = spark.Sql("SELECT quizzes, SukoduUDF(quizzes) as Resolution from Resolved");

            sqlDf.Show();

            spark.Stop();
            Console.WriteLine("SCRAPY");
        }
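
The sudokusolution helper wrapped by the UDF above (and in Example #4 below) is defined elsewhere in the original project. A placeholder stub so the snippet stands alone (the real method returns the solved grid for the puzzle string):

    // Hypothetical stub: replace with an actual sudoku solver.
    private static string sudokusolution(string sudoku)
    {
        return sudoku; // echoes the puzzle back; a real solver returns the solution
    }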
Example #3
        public void Run(string[] args)
        {
            string servidoresKafka = args[0];
            string topico          = args[1];
            string modelo          = args[2];

            // Getting a reference to the Spark execution context
            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("Exemplo Streaming com Kafka")
                                 .GetOrCreate();

            // Creating a dataframe to receive data from Kafka
            DataFrame df = spark
                           .ReadStream()
                           .Format("kafka")
                           .Option("kafka.bootstrap.servers", servidoresKafka)
                           .Option("subscribe", topico)
                           .Load()
                           .SelectExpr("CAST(value AS STRING)");

            /* Creating a schema to validate the JSON that will arrive in the Kafka messages.
             * Sample JSON:
             * {
             *      "cliente": "Fulano",
             *      "produto": "Mochila",
             *      "opiniao": "Muito boa!"
             * }
             */
            var schema = new StructType(new[]
            {
                new StructField("cliente", new StringType()),
                new StructField("produto", new StringType()),
                new StructField("opiniao", new StringType())
            }); // struct<cliente:string,produto:string,opiniao:string>

            // Parsing the JSON into a struct column ...
            df = df.WithColumn("json", Functions.FromJson(
                                   df.Col("value"),
                                   schema.SimpleString)
                               )
                 .Select("json.*"); // ... e retornando todas as colunas do array como um novo dataframe

            //Registering a user-defined function to be used on the dataframe
            spark.Udf().Register<string, float>("AnaliseDeSentimento",
                                                 (texto) => AnalisarSentimento(texto, modelo));
            // Creating a new column "nota" with the sentiment analysis result
            df = df.WithColumn("nota", Functions.CallUDF("AnaliseDeSentimento", df.Col("opiniao")));

            // Starting the streaming query
            StreamingQuery query = df
                                   .WriteStream()
                                   .OutputMode(OutputMode.Append)
                                   .Format("console")
                                   //.Trigger(Trigger.Continuous(2000))
                                   //.Foreach(new RedisForeachWriter())
                                   .Start();

            query.AwaitTermination();   // Required to keep the application running and processing data
        }
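
AnalisarSentimento(texto, modelo) is defined elsewhere in the original project. A hedged stub so the snippet compiles on its own (assumption: it scores the text against the model file passed in modelo and returns the score as a float):

    // Hypothetical stub: the real method evaluates the text with the ML model at 'modelo'.
    private static float AnalisarSentimento(string texto, string modelo)
    {
        return 0f; // neutral placeholder score
    }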
Example #4
        static void runSpark(string file_path, string cores, string nodes, int nrows)
        {
            // Create Spark session
            SparkSession spark =
                SparkSession
                .Builder()
                .AppName("Resolution de " + nrows + " sudokus par évolution combinatoire de " + cores + " noyau(x) et " + nodes + " noeud(s)")
                .Config("spark.executor.cores", cores)
                .Config("spark.executor.instances", nodes)
                .GetOrCreate();

            // Create initial DataFrame
            DataFrame dataFrame = spark
                                  .Read()
                                  .Option("header", true)
                                  .Option("inferSchema", true)
                                  .Schema("quizzes string, solutions string")
                                  .Csv(file_path);

            DataFrame dataFrame2 = dataFrame.Limit(nrows);

            spark.Udf().Register<string, string>(
                "SukoduUDF",
                (sudoku) => sudokusolution(sudoku));

            dataFrame2.CreateOrReplaceTempView("Resolved");
            DataFrame sqlDf = spark.Sql("SELECT quizzes, SukoduUDF(quizzes) as Resolution from Resolved");

            sqlDf.Show();

            spark.Stop();
        }
Example #5
        private static void ElectronicsReviewsSentimentAnalysis(SparkSession spark)
        {
            spark.Udf().Register<string, int>("sentiment_udf", text => Sentiment(text));

            var reviewsSentiment = spark.Sql("SELECT *, sentiment_udf(review_text) AS sentiment FROM ElectronicsReviews");

            reviewsSentiment.Cache();
            reviewsSentiment.CreateOrReplaceTempView("ElectronicsReviewSentiment");
        }
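
This method assumes a temp view named ElectronicsReviews already exists and that a Sentiment helper is in scope (see the ML.NET sketch after Example #9). A sketch of how the view might be registered beforehand (file path and options are assumptions):

    // Hypothetical setup: load the reviews and expose them to Spark SQL.
    DataFrame reviews = spark
        .Read()
        .Option("header", true)
        .Option("inferSchema", true)
        .Csv("Data/electronics-reviews.csv"); // assumed path with a review_text column
    reviews.CreateOrReplaceTempView("ElectronicsReviews");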
Example #6
        //Method called from Main to start a Spark session with a given number of cores and instances, and to solve the sudokus via the Sudokusolution() method.
        //private static void Sudokures(string cores, string nodes, string mem, int nrows){
        private static void Sudokures(int nrows)
        {
            // Initializing the Spark session
            SparkSession spark = SparkSession
                                 .Builder()
                                 .Config("spark.executor.memory", "4G")
                                 .GetOrCreate();
            //.AppName("Resolution of " + nrows + " sudokus using DlxLib with " + cores + " cores and " + nodes + " instances")
            //.Config("spark.driver.cores", cores)
            //.Config("spark.executor.instances", nodes)
            //.Config("spark.executor.memory", mem)
            //.GetOrCreate();

            // Loading the csv into a dataframe
            DataFrame df = spark
                           .Read()
                           .Option("header", true)
                           .Option("inferSchema", true)
                           .Csv(_filePath);

            //Limiting the dataframe to a number of rows set when the function is called
            DataFrame df2 = df.Limit(nrows);

            //Stopwatch measuring only the sudoku solving
            var watch2 = new System.Diagnostics.Stopwatch();

            watch2.Start();

            // Creating the Spark user-defined function
            spark.Udf().Register<string, string>(
                "SukoduUDF",
                (sudoku) => Sudokusolution(sudoku));

            // Calling the UDF from a new Spark dataframe that will also hold the results
            df2.CreateOrReplaceTempView("Resolved");
            DataFrame sqlDf = spark.Sql("SELECT Sudokus, SukoduUDF(Sudokus) as Resolution from Resolved");

            sqlDf.Show();

            watch2.Stop();

            Console.WriteLine();
            Console.WriteLine();
            Console.WriteLine();
            Console.WriteLine();
            Console.WriteLine($"Execution Time for " + nrows + " sudoku resolution : " + watch2.ElapsedMilliseconds + " ms");
            //Console.WriteLine($"Execution Time for " + nrows + " sudoku resolution with " + cores + " core and " + nodes + " instance: " + watch2.ElapsedMilliseconds + " ms");
            Console.WriteLine();
            Console.WriteLine();
            Console.WriteLine();
            Console.WriteLine();

            spark.Stop();
        }
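
One caveat when reading the timing: Spark evaluates lazily, so watch2 only measures the work triggered by Show(), which materializes just the rows it prints. A sketch of forcing the whole DataFrame to be computed before stopping the stopwatch:

    // Force evaluation of every row, not only the ones Show() displays.
    long solvedCount = sqlDf.Count();
    watch2.Stop();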
Example #7
        public void Run(string[] args)
        {
            if (args.Length != 1)
            {
                Console.Error.WriteLine(
                    "Usage: GitHubProjects <path to projects.csv>");
                Environment.Exit(1);
            }

            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("GitHub and Spark Batch")
                                 .GetOrCreate();

            DataFrame projectsDf = spark
                                   .Read()
                                   .Schema("id INT, url STRING, owner_id INT, " +
                                           "name STRING, descriptor STRING, language STRING, " +
                                           "created_at STRING, forked_from INT, deleted STRING, " +
                                           "updated_at STRING")
                                   .Csv(args[0]);

            projectsDf.Show();

            // Drop any rows with NA values
            DataFrameNaFunctions dropEmptyProjects = projectsDf.Na();
            DataFrame cleanedProjects = dropEmptyProjects.Drop("any");

            // Remove unnecessary columns
            cleanedProjects = cleanedProjects.Drop("id", "url", "owner_id");
            cleanedProjects.Show();

            // Average number of times each language has been forked
            DataFrame groupedDF = cleanedProjects
                                  .GroupBy("language")
                                  .Agg(Avg(cleanedProjects["forked_from"]));

            // Sort by most forked languages first
            groupedDF.OrderBy(Desc("avg(forked_from)")).Show();

            spark.Udf().Register<string, bool>(
                "MyUDF",
                (date) => DateTime.TryParse(date, out DateTime convertedDate) &&
                (convertedDate > s_referenceDate));

            cleanedProjects.CreateOrReplaceTempView("dateView");

            DataFrame dateDf = spark.Sql(
                "SELECT *, MyUDF(dateView.updated_at) AS datebefore FROM dateView");

            dateDf.Show();

            spark.Stop();
        }
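
s_referenceDate is declared elsewhere in the original class. A plausible declaration, assuming it is the cutoff date the UDF compares updated_at against (the exact date is an assumption):

    // Hypothetical: reference date used by MyUDF above.
    private static readonly DateTime s_referenceDate = new DateTime(2015, 10, 20);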
Example #8
        public void Run(string[] args)
        {
            if (args.Length != 3)
            {
                Console.Error.WriteLine(
                    "Usage: SentimentAnalysisStream <host> <port> <model path>");
                Environment.Exit(1);
            }

            // Create Spark Session
            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("Streaming Sentiment Analysis")
                                 .GetOrCreate();

            // Setup stream connection info
            string hostname = args[0];
            string port     = args[1];

            // Read streaming data into DataFrame
            DataFrame words = spark
                              .ReadStream()
                              .Format("socket")
                              .Option("host", hostname)
                              .Option("port", port)
                              .Load();

            // Use ML.NET in a UDF to evaluate each incoming entry
            spark.Udf().Register<string, bool>(
                "MLudf",
                input => Sentiment(input, args[2]));

            // Use Spark SQL to call ML.NET UDF
            // Display results of sentiment analysis on each entry
            words.CreateOrReplaceTempView("WordsSentiment");
            DataFrame sqlDf = spark
                              .Sql("SELECT WordsSentiment.value, MLudf(WordsSentiment.value) FROM WordsSentiment");

            // Handle data continuously as it arrives
            StreamingQuery query = sqlDf
                                   .WriteStream()
                                   .Format("console")
                                   .Start();

            query.AwaitTermination();
        }
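
For a quick local test of the socket source, the stream can be fed with netcat before submitting the job (assumption: a Unix-like shell with nc installed):

    // Not part of the program: run in a separate terminal, then pass
    // "localhost" and "9999" as the <host> and <port> arguments.
    //   nc -lk 9999
    // Each line typed into netcat arrives as one row in the 'words' DataFrame.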
Example #9
        public void Run(string[] args)
        {
            if (args.Length != 2)
            {
                Console.Error.WriteLine(
                    "Usage: <path to yelptest.csv> <path to MLModel.zip>");
                Environment.Exit(1);
            }

            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName(".NET for Apache Spark Sentiment Analysis")
                                 .GetOrCreate();

            // Read in and display Yelp reviews
            DataFrame df = spark
                           .Read()
                           .Option("header", true)
                           .Option("inferSchema", true)
                           .Csv(args[0]);

            df.Show();

            // Use ML.NET in a UDF to evaluate each review
            spark.Udf().Register<string, bool>(
                "MLudf",
                (text) => Sentiment(text, args[1]));

            // Use Spark SQL to call ML.NET UDF
            // Display results of sentiment analysis on reviews
            df.CreateOrReplaceTempView("Reviews");
            DataFrame sqlDf = spark.Sql("SELECT ReviewText, MLudf(ReviewText) FROM Reviews");

            sqlDf.Show();

            // Print out first 20 rows of data
            // Prevent data getting cut off by setting truncate = 0
            sqlDf.Show(20, 0, false);

            spark.Stop();
        }
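
Sentiment(text, modelPath), used here and in Example #8, wraps an ML.NET model. A hedged sketch of what it might look like (class and column names are assumptions; the original sample's types may differ):

    using Microsoft.ML;

    public class Review { public string ReviewText; }
    public class ReviewPrediction { public bool Prediction; }

    // Hypothetical: load the model and score a single review.
    static bool Sentiment(string text, string modelPath)
    {
        var mlContext = new MLContext();
        ITransformer model = mlContext.Model.Load(modelPath, out _);
        PredictionEngine<Review, ReviewPrediction> engine =
            mlContext.Model.CreatePredictionEngine<Review, ReviewPrediction>(model);
        return engine.Predict(new Review { ReviewText = text }).Prediction;
    }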
Example #10
        public void Run(string[] args)
        {
            if (args.Length != 1)
            {
                Console.Error.WriteLine(
                    "Usage: Basic <path to SPARK_HOME/examples/src/main/resources/people.json>");
                Environment.Exit(1);
            }

            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName(".NET Spark SQL basic example")
                                 .Config("spark.some.config.option", "some-value")
                                 .GetOrCreate();

            // Need to explicitly specify the schema since pickling vs. arrow formatting
            // will return different types. Pickling will turn longs into ints if the values fit.
            // Same as the "age INT, name STRING" DDL-format string.
            var inputSchema = new StructType(new[]
            {
                new StructField("age", new IntegerType()),
                new StructField("name", new StringType())
            });
            DataFrame df = spark.Read().Schema(inputSchema).Json(args[0]);

            Spark.Sql.Types.StructType schema = df.Schema();
            Console.WriteLine(schema.SimpleString);

            IEnumerable<Row> rows = df.Collect();

            foreach (Row row in rows)
            {
                Console.WriteLine(row);
            }

            df.Show();

            df.PrintSchema();

            df.Select("name", "age", "age", "name").Show();

            df.Select(df["name"], df["age"] + 1).Show();

            df.Filter(df["age"] > 21).Show();

            df.GroupBy("age")
            .Agg(Avg(df["age"]), Avg(df["age"]), CountDistinct(df["age"], df["age"]))
            .Show();

            df.CreateOrReplaceTempView("people");

            // Registering Udf for SQL expression.
            DataFrame sqlDf = spark.Sql("SELECT * FROM people");

            sqlDf.Show();

            spark.Udf().Register<int?, string, string>(
                "my_udf",
                (age, name) => name + " with " + ((age.HasValue) ? age.Value.ToString() : "null"));

            sqlDf = spark.Sql("SELECT my_udf(*) FROM people");
            sqlDf.Show();

            // Using UDF via data frames.
            Func<Column, Column, Column> addition = Udf<int?, string, string>(
                (age, name) => name + " is " + (age.HasValue ? age.Value + 10 : 0));

            df.Select(addition(df["age"], df["name"])).Show();

            // Chaining example:
            Func<Column, Column> addition2 = Udf<string, string>(str => $"hello {str}!");

            df.Select(addition2(addition(df["age"], df["name"]))).Show();

            // Multiple UDF example:
            df.Select(addition(df["age"], df["name"]), addition2(df["name"])).Show();

            // UDF return type as array.
            Func<Column, Column> udfArray =
                Udf<string, string[]>((str) => new string[] { str, str + str });

            df.Select(Explode(udfArray(df["name"]))).Show();

            // UDF return type as map.
            Func<Column, Column> udfMap =
                Udf<string, IDictionary<string, string[]>>(
                    (str) => new Dictionary<string, string[]>
                    {
                        { str, new[] { str, str } }
                    });

            df.Select(udfMap(df["name"]).As("UdfMap")).Show(truncate: 50);

            // Joins.
            DataFrame joinedDf = df.Join(df, "name");

            joinedDf.Show();

            DataFrame joinedDf2 = df.Join(df, new[] { "name", "age" });

            joinedDf2.Show();

            DataFrame joinedDf3 = df.Join(df, df["name"] == df["name"], "outer");

            joinedDf3.Show();

            spark.Stop();
        }
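
The unqualified calls in this example (Avg, CountDistinct, Explode, Udf) are static methods of the Functions class, so the file is assumed to carry a static import:

    using Microsoft.Spark.Sql;
    using static Microsoft.Spark.Sql.Functions; // Avg, CountDistinct, Explode, Udf, ...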
Example #11
        public void TestVectorUdf()
        {
            Func<Int32Array, StringArray, StringArray> udf1Func =
                (ages, names) => (StringArray)ToArrowArray(
                    Enumerable.Range(0, names.Length)
                    .Select(i => $"{names.GetString(i)} is {ages.GetValue(i) ?? 0}")
                    .ToArray());

            // Single UDF.
            Func<Column, Column, Column> udf1 = VectorUdf(udf1Func);
            {
                Row[] rows = _df.Select(udf1(_df["age"], _df["name"])).Collect().ToArray();
                Assert.Equal(3, rows.Length);
                Assert.Equal("Michael is 0", rows[0].GetAs <string>(0));
                Assert.Equal("Andy is 30", rows[1].GetAs <string>(0));
                Assert.Equal("Justin is 19", rows[2].GetAs <string>(0));
            }

            // Chained UDFs.
            Func<Column, Column> udf2 = VectorUdf<StringArray, StringArray>(
                (strings) => (StringArray)ToArrowArray(
                    Enumerable.Range(0, strings.Length)
                    .Select(i => $"hello {strings.GetString(i)}!")
                    .ToArray()));
            {
                Row[] rows = _df
                             .Select(udf2(udf1(_df["age"], _df["name"])))
                             .Collect()
                             .ToArray();
                Assert.Equal(3, rows.Length);
                Assert.Equal("hello Michael is 0!", rows[0].GetAs <string>(0));
                Assert.Equal("hello Andy is 30!", rows[1].GetAs <string>(0));
                Assert.Equal("hello Justin is 19!", rows[2].GetAs <string>(0));
            }

            // Multiple UDFs:
            {
                Row[] rows = _df
                             .Select(udf1(_df["age"], _df["name"]), udf2(_df["name"]))
                             .Collect()
                             .ToArray();
                Assert.Equal(3, rows.Length);
                Assert.Equal("Michael is 0", rows[0].GetAs <string>(0));
                Assert.Equal("hello Michael!", rows[0].GetAs <string>(1));

                Assert.Equal("Andy is 30", rows[1].GetAs <string>(0));
                Assert.Equal("hello Andy!", rows[1].GetAs <string>(1));

                Assert.Equal("Justin is 19", rows[2].GetAs <string>(0));
                Assert.Equal("hello Justin!", rows[2].GetAs <string>(1));
            }

            // Register UDF
            {
                _df.CreateOrReplaceTempView("people");
                _spark.Udf().RegisterVector("udf1", udf1Func);
                Row[] rows = _spark.Sql("SELECT udf1(age, name) FROM people")
                             .Collect()
                             .ToArray();
                Assert.Equal(3, rows.Length);
                Assert.Equal("Michael is 0", rows[0].GetAs <string>(0));
                Assert.Equal("Andy is 30", rows[1].GetAs <string>(0));
                Assert.Equal("Justin is 19", rows[2].GetAs <string>(0));
            }
        }
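
These vector UDFs operate on Apache Arrow arrays (Int32Array, StringArray) instead of scalar values. A sketch of the usings the test presumably relies on; ToArrowArray is assumed to come from the surrounding test utilities, and the exact home of VectorUdf is an assumption:

    using Apache.Arrow;                              // Int32Array, StringArray
    using Microsoft.Spark.Sql;                       // Column, Row
    using static Microsoft.Spark.Sql.ArrowFunctions; // VectorUdf (assumed)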
Example #12
        public void TestUdfRegistrationWithReturnAsRowType()
        {
            // Test UDF that returns a Row object with a single column.
            {
                var schema = new StructType(new[]
                {
                    new StructField("col1", new IntegerType()),
                    new StructField("col2", new StringType())
                });

                _df.CreateOrReplaceTempView("people");

                _spark.Udf().Register<string>(
                    "udf1",
                    str => new GenericRow(new object[] { 1, "abc" }),
                    schema);

                Row[] rows =
                    _spark.Sql("SELECT udf1(name) AS col FROM people")
                    .Collect()
                    .ToArray();
                Assert.Equal(3, rows.Length);
                foreach (Row row in rows)
                {
                    Assert.Equal(1, row.Size());
                    Row outerCol = row.GetAs <Row>("col");
                    Assert.Equal(2, outerCol.Size());
                    Assert.Equal(1, outerCol.GetAs <int>("col1"));
                    Assert.Equal("abc", outerCol.GetAs <string>("col2"));
                }
            }

            // Test UDF that returns a Row object with multiple columns.
            {
                var schema = new StructType(new[]
                {
                    new StructField("col1", new IntegerType())
                });

                _df.CreateOrReplaceTempView("people");

                _spark.Udf().Register<string>(
                    "udf2",
                    str => new GenericRow(new object[] { 111 }),
                    schema);

                Row[] rows =
                    _spark.Sql("SELECT udf2(name) AS col, name FROM people")
                    .Collect()
                    .ToArray();
                Assert.Equal(3, rows.Length);

                foreach (Row row in rows)
                {
                    Assert.Equal(2, row.Size());
                    Row col1 = row.GetAs <Row>("col");
                    Assert.Equal(1, col1.Size());
                    Assert.Equal(111, col1.GetAs <int>("col1"));

                    string col2 = row.GetAs <string>("name");
                    Assert.NotEmpty(col2);
                }
            }

            // Test UDF that returns a nested Row object.
            {
                var subSchema1 = new StructType(new[]
                {
                    new StructField("col1", new IntegerType()),
                });
                var subSchema2 = new StructType(new[]
                {
                    new StructField("col1", new StringType()),
                    new StructField("col2", subSchema1),
                });
                var schema = new StructType(new[]
                {
                    new StructField("col1", new IntegerType()),
                    new StructField("col2", subSchema1),
                    new StructField("col3", subSchema2)
                });

                _df.CreateOrReplaceTempView("people");

                _spark.Udf().Register<string>(
                    "udf3",
                    str => new GenericRow(
                        new object[]
                        {
                            1,
                            new GenericRow(new object[] { 1 }),
                            new GenericRow(new object[]
                            {
                                "abc",
                                new GenericRow(new object[] { 10 })
                            })
                        }),
                    schema);

                Row[] rows =
                    _spark.Sql("SELECT udf3(name) AS col FROM people")
                    .Collect()
                    .ToArray();
                Assert.Equal(3, rows.Length);

                foreach (Row row in rows)
                {
                    Assert.Equal(1, row.Size());
                    Row outerCol = row.GetAs <Row>("col");
                    Assert.Equal(3, outerCol.Size());
                    Assert.Equal(1, outerCol.GetAs <int>("col1"));
                    Assert.Equal(
                        new Row(new object[] { 1 }, subSchema1),
                        outerCol.GetAs <Row>("col2"));
                    Assert.Equal(
                        new Row(
                            new object[] { "abc", new Row(new object[] { 10 }, subSchema1) },
                            subSchema2),
                        outerCol.GetAs <Row>("col3"));
                }
            }

            // Chained UDFs.
            {
                var schema = new StructType(new[]
                {
                    new StructField("col1", new IntegerType()),
                    new StructField("col2", new StringType())
                });

                _df.CreateOrReplaceTempView("people");

                _spark.Udf().Register<string>(
                    "udf4",
                    str => new GenericRow(new object[] { 1, str }),
                    schema);

                _spark.Udf().Register<Row, string>(
                    "udf5",
                    row => row.GetAs<string>(1));

                Row[] rows =
                    _spark.Sql("SELECT udf5(udf4(name)) FROM people")
                    .Collect()
                    .ToArray();
                Assert.Equal(3, rows.Length);

                var expected = new string[] { "Michael", "Andy", "Justin" };
                for (int i = 0; i < rows.Length; ++i)
                {
                    Assert.Equal(1, rows[i].Size());
                    Assert.Equal(expected[i], rows[i].GetAs<string>(0));
                }
            }
        }
Example #13
        static void Main(string[] args)
        {
            // Initialize Session
            SparkSession ss =
                SparkSession
                .Builder()
                .AppName("Working with DataFrames")
                .GetOrCreate();

            // Read Data
            DataFrame businesses =
                ss
                .Read()
                .Option("header", "true")
                .Option("inferSchema", "true")
                .Csv("Data/NYC-Restaurant-Inspections.csv");

            businesses = businesses.Select("CAMIS", "DBA", "BORO", "CUISINE DESCRIPTION");

            DataFrame inspections =
                ss
                .Read()
                .Option("header", "true")
                .Option("inferSchema", "true")
                .Csv("Data/NYC-Restaurant-Inspections.csv");

            inspections = inspections.Select("CAMIS", "INSPECTION DATE", "VIOLATION CODE", "CRITICAL FLAG", "SCORE", "GRADE", "INSPECTION TYPE");

            // Select columns
            businesses.Select(Col("CAMIS"), Col("DBA")).Show(1);

            inspections.Select(inspections["VIOLATION CODE"]).Show(1);

            // Filter
            businesses
                .Filter(Col("BORO") == "Manhattan")
                .Select("DBA", "BORO")
                .Show(3);

            // Group / Aggregate
            businesses
                .GroupBy("CUISINE DESCRIPTION")
                .Agg(Count("CUISINE DESCRIPTION").Alias("CUISINE COUNT"))
                .Show(10);

            // Order
            businesses
                .GroupBy("CUISINE DESCRIPTION")
                .Agg(Count("CUISINE DESCRIPTION").Alias("CUISINE COUNT"))
                .OrderBy(Col("CUISINE COUNT").Desc())
                .Show(3);

            // Join
            DataFrame joinedDf =
                businesses
                .Join(inspections, "CAMIS")
                .Select(Col("DBA"), Col("CUISINE DESCRIPTION"), Col("GRADE"));

            joinedDf.Show(5);

            // SQL
            businesses.CreateOrReplaceTempView("businesses");

            inspections.CreateOrReplaceTempView("inspections");

            ss.Sql(@"SELECT b.DBA,b.`CUISINE DESCRIPTION`,i.GRADE FROM businesses b JOIN inspections i ON b.CAMIS = i.CAMIS").Show(5);

            // UDF
            ss.Udf().Register<string, string>("Tupper", Tupper);

            inspections
                .Select(CallUDF("Tupper", Col("INSPECTION TYPE")).Alias("CAPITALIZED"))
                .Show(3);

            // Save
            joinedDf
                .Write()
                .Mode(SaveMode.Overwrite)
                .Csv("output");
        }
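
Tupper is registered as a method group but not shown. Given how it is used (producing a CAPITALIZED column), a plausible stub:

    // Hypothetical: upper-case the inspection type (null-safe).
    private static string Tupper(string s) => s?.ToUpper();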
Example #14
        static void Main(string[] args)
        {
            /*
             * Copy mysql-connector-java-8.0.19.jar to the Spark / Hadoop folder.
             * Run the command below from this project's root folder:
             *   %SPARK_HOME%\bin\spark-submit
             *   --master local
             *   --class org.apache.spark.deploy.dotnet.DotnetRunner
             *   bin\Debug\netcoreapp3.1\microsoft-spark-2-4_2.11-1.0.0.jar
             *   dotnet
             *   bin\Debug\netcoreapp3.1\BatchDemo.dll
             *   data\amostra.csv
             *   jdbc:mysql://localhost:3306/teste_spark beneficios spark_user my-secret-password
             */

            if (args.Length == 0)
            {
                throw new ArgumentException("Provide the paths where the CSV files can be found");
            }

            string arquivoEntrada = args[0];

            // Getting a reference to the Spark execution context
            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("Exemplo Batch")
                                 .GetOrCreate();

            // Defining a fixed schema, with the column names and types we want
            StructType schema = new StructType(new[]
            {
                new StructField("MES_REFERENCIA", new StringType()),
                new StructField("MES_COMPETENCIA", new StringType()),
                new StructField("UF", new StringType()),
                new StructField("CODIGO_MUNICIPIO", new IntegerType()),
                new StructField("MUNICIPIO", new StringType()),
                new StructField("CODIGO_FAVORECIDO", new StringType()),
                new StructField("NOME", new StringType()),
                new StructField("DATA_SAQUE", new DateType()),
                new StructField("VALOR_TEXTO", new StringType())
            });

            // Reading the data from disk into Spark
            DataFrame df = spark.Read()
                           .Format("csv")
                           .Schema(schema)
                           .Option("sep", ";")
                           .Option("header", true)
                           .Option("dateFormat", "dd/MM/yyyy")
                           .Load(arquivoEntrada);

            df.PrintSchema();
            df.Show(5, 10);

            // Dropping columns we no longer need
            df = df.Drop("MES_REFERENCIA")
                 .Drop("MES_COMPETENCIA")
                 .Drop("CODIGO_MUNICIPIO")
                 .Drop("CODIGO_FAVORECIDO");
            df.Show(5, 10);

            // Converting the VALOR column from string to decimal, since the Brazilian number format differs from the American one
            df = df.WithColumn("VALOR", RegexpReplace(
                                   RegexpReplace(
                                       df.Col("VALOR_TEXTO")
                                       , "\\.", "")
                                   , ",", ".")
                               .Cast("decimal(10,2)"))
                 .Drop("VALOR_TEXTO");
            df.PrintSchema();
            df.Show(5, 10);

            // Applying a filter over the data
            df = df.Where(df.Col("UF").NotEqual("AC"));
            //df = df.Where("UF <> 'AC'");  // passing a WHERE expression also works as a filter
            df.Show(5, 10);

            spark.Udf().Register<string, string, string>("ConcatenarMunicipio",
                                                          (uf, municipio) => ConcatenarMunicipio(uf, municipio));

            // Creating a new column from a concatenation, and dropping old columns we no longer need
            df = df.WithColumn("MUNICIPIO",
                               CallUDF("ConcatenarMunicipio", df.Col("UF"), df.Col("MUNICIPIO")))
                 .Drop("UF");
            // Performing an aggregation
            DataFrame somatorio = df.GroupBy("MUNICIPIO")
                                  .Sum("VALOR")
                                  .WithColumnRenamed("sum(VALOR)", "SOMA_BENEFICIOS");

            somatorio
                .OrderBy(somatorio.Col("SOMA_BENEFICIOS").Desc())
                .Show(15, 40);

            if (args.Length >= 5)
            {
                string urlJdbc = args[1];   // jdbc:mysql://localhost:3306/teste_spark
                string tabela  = args[2];   // beneficios
                string usuario = args[3];   // spark_user
                string senha   = args[4];   // my-secret-password

                // Saving to a database using Spark's native JDBC support
                somatorio
                    .Write()
                    .Format("jdbc")
                    .Option("driver", "com.mysql.cj.jdbc.Driver")
                    .Option("url", urlJdbc)
                    .Option("dbtable", tabela)
                    .Option("user", usuario)
                    .Option("password", senha)
                    .Mode(SaveMode.Overwrite)
                    .Save();
            }
            }
            spark.Stop();
        }
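
ConcatenarMunicipio is referenced but defined elsewhere in the original project. A plausible stub matching its usage (the separator is an assumption):

    // Hypothetical: combine the state (UF) and the municipality into one label.
    private static string ConcatenarMunicipio(string uf, string municipio)
    {
        return uf + " - " + municipio;
    }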
Example #15
        public void Run(string[] args)
        {
            if (args.Length != 1)
            {
                Console.Error.WriteLine(
                    "Usage: Logging <path to Apache User Logs>");
                Environment.Exit(1);
            }

            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("Apache User Log Processing")
                                 .GetOrCreate();

            // Read input log file and display it
            DataFrame df = spark.Read().Text(args[0]);

            df.Show();

            // Step 1: UDF to determine if each line is a valid log entry
            // Remove any invalid entries before further filtering
            spark.Udf().Register<string, bool>(
                "GeneralReg",
                log => Regex.IsMatch(log, s_apacheRx));

            df.CreateOrReplaceTempView("Logs");

            // Apply the UDF to get valid log entries
            DataFrame generalDf = spark.Sql(
                "SELECT logs.value, GeneralReg(logs.value) FROM Logs");

            // Only keep log entries that matched the regex
            generalDf = generalDf.Filter(generalDf["GeneralReg(value)"]);
            generalDf.Show();

            // View the resulting schema
            // Notice we created a new column "GeneralReg(value)"
            generalDf.PrintSchema();

            // Step 2: Choose valid log entries that start with 10
            spark.Udf().Register<string, bool>(
                "IPReg",
                log => Regex.IsMatch(log, "^(?=10)"));

            generalDf.CreateOrReplaceTempView("IPLogs");

            // Apply UDF to get valid log entries starting with 10
            // Use SQL "WHERE" rather than doing ipDf.Filter(),
            // which avoids creating an extra column "IPReg(value)"
            DataFrame ipDf = spark.Sql(
                "SELECT iplogs.value FROM IPLogs WHERE IPReg(iplogs.value)");

            ipDf.Show();

            // Step 3: Choose valid log entries that start
            // with 10 and deal with spam
            spark.Udf().Register<string, bool>(
                "SpamRegEx",
                log => Regex.IsMatch(log, "\\b(?=spam)\\b"));

            ipDf.CreateOrReplaceTempView("SpamLogs");

            // Apply UDF to get valid, start with 10, spam entries
            DataFrame spamDF = spark.Sql(
                "SELECT spamlogs.value FROM SpamLogs WHERE SpamRegEx(spamlogs.value)");

            // Let's explore the columns in the data we have filtered
            // Use LINQ to count the number of GET requests
            int numGetRequests = spamDF
                                 .Collect()
                                 .Where(r => ContainsGet(r.GetAs<string>("value")))
                                 .Count();

            Console.WriteLine("Number of GET requests: " + numGetRequests);

            spark.Stop();
        }
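
Both s_apacheRx and ContainsGet live elsewhere in the original sample. Hedged sketches consistent with how they are used above (the exact regex is an assumption):

    // Hypothetical: matches one line of an Apache access log.
    private static readonly string s_apacheRx =
        "^(\\S+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(\\S+) (\\S+) (\\S+)\" (\\d{3}) (\\d+)";

    // Hypothetical: does the log line contain a GET request?
    private static bool ContainsGet(string logLine) => logLine.Contains("GET");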
Example #16
        public void Run(string[] args)
        {
            string kafkaBrokers = args[0];
            double maxSpeed     = double.Parse(args[1]);

            // Getting a reference to the Spark execution context
            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("Credit Card Fraud")
                                 .GetOrCreate();

            spark.Conf().Set("spark.sql.shuffle.partitions", "1");  // without this setting every stage ran with 200 tasks, which made each batch take around 4 minutes

            // Creating a dataframe to receive data from Kafka
            DataFrame df = spark
                           .ReadStream()
                           .Format("kafka")
                           .Option("kafka.bootstrap.servers", kafkaBrokers)
                           .Option("subscribe", "transactions")
                           .Load()
                           .SelectExpr("CAST(value AS STRING)");

            /* Creating a schema to validate the JSON that will arrive in the Kafka messages.
             * Sample JSON:
             * {
             *      "transaction":"431",
             *      "number":"0015-0000-0000-0000",
             *      "lat":-23.1618,
             *      "lng":-46.47201,
             *      "amount":91.01487,
             *      "category":"pets",
             *      "eventTime":"2021-01-05T19:07:19.3888"
             * }
             */
            var schema = new StructType(new[]
            {
                new StructField("transaction", new StringType()),
                new StructField("number", new StringType()),
                new StructField("lat", new DoubleType()),
                new StructField("lng", new DoubleType()),
                new StructField("amount", new DoubleType()),
                new StructField("category", new StringType()),
                new StructField("eventTime", new TimestampType())
            });

            // Parsing the JSON into a struct column ...
            df = df.WithColumn("json", FromJson(
                                   df.Col("value"),
                                   schema.SimpleString)
                               )
                 .Select("json.*"); // ... e retornando todas as colunas do array como um novo dataframe

            // Generating two distinct dataframes so we can join them and analyze the correlation between transactions
            DataFrame df1 = df
                            .WithWatermark("eventTime", "7 minutes");
            DataFrame df2 = df
                            .WithColumnRenamed("transaction", "transaction2")
                            .WithColumnRenamed("lat", "lat2")
                            .WithColumnRenamed("lng", "lng2")
                            .WithColumnRenamed("eventTime", "eventTime2")
                            .WithWatermark("eventTime2", "7 minutes");

            // Performing the join to check the correlation between credit card transactions
            DataFrame dfJoin = df1.Join(df2,
                                        df1.Col("number").EqualTo(df2.Col("number"))
                                        .And(Col("transaction").NotEqual(Col("transaction2")))
                                        .And(Col("eventTime2").Between(Col("eventTime"), Col("eventTime") + Expr("interval 5 minutes")))
                                        );

            //Registering user-defined functions to be used on the dataframes
            spark.Udf().Register<double, double, double, double, double>("CalculateDistance", (lat1, lng1, lat2, lng2) => CalculateDistance(lat1, lng1, lat2, lng2));
            spark.Udf().Register<double, Timestamp, Timestamp, double>("CalculateSpeed", (dist, eventTime, eventTime2) => CalculateSpeed(dist, eventTime, eventTime2));

            // Creating new columns to store the UDF results
            dfJoin = dfJoin.WithColumn("dist", CallUDF("CalculateDistance", Col("lat"), Col("lng"), Col("lat2"), Col("lng2")));
            dfJoin = dfJoin.WithColumn("speed", CallUDF("CalculateSpeed", Col("dist"), Col("eventTime"), Col("eventTime2")));

            // Filtering the transactions whose speed is above the expected limit (the "maxSpeed" parameter)
            dfJoin = dfJoin.Where(Col("speed").Gt(maxSpeed));

            // Starting the streaming query

            StreamingQuery query = dfJoin
                                   .WriteStream()
                                   .Format("console")
                                   .Option("truncate", "false")
                                   .OutputMode(OutputMode.Append)
                                   .Start();

            query.AwaitTermination();
        }
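
CalculateDistance and CalculateSpeed are helpers from the original project. A sketch under stated assumptions: distance via the haversine formula in kilometers, speed in km/h over the gap between the two event timestamps, and Timestamp exposing ToDateTime() as in Microsoft.Spark.Sql.Types:

    using System;
    using Microsoft.Spark.Sql.Types;

    // Hypothetical: great-circle (haversine) distance between two points, in km.
    private static double CalculateDistance(double lat1, double lng1, double lat2, double lng2)
    {
        const double earthRadiusKm = 6371.0;
        double dLat = (lat2 - lat1) * Math.PI / 180.0;
        double dLng = (lng2 - lng1) * Math.PI / 180.0;
        double a = Math.Sin(dLat / 2) * Math.Sin(dLat / 2) +
                   Math.Cos(lat1 * Math.PI / 180.0) * Math.Cos(lat2 * Math.PI / 180.0) *
                   Math.Sin(dLng / 2) * Math.Sin(dLng / 2);
        return earthRadiusKm * 2 * Math.Atan2(Math.Sqrt(a), Math.Sqrt(1 - a));
    }

    // Hypothetical: km/h given a distance in km and the two event times.
    private static double CalculateSpeed(double dist, Timestamp eventTime, Timestamp eventTime2)
    {
        double hours = Math.Abs((eventTime2.ToDateTime() - eventTime.ToDateTime()).TotalHours);
        return hours > 0 ? dist / hours : 0.0;
    }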