Example #1
        public void TestDrop()
        {
            // Arrange
            const string columnNameToDrop = "column1";
            var expectedResultDataFrameProxy = new Mock<IDataFrameProxy>().Object;

            mockDataFrameProxy.Setup(m => m.Drop(It.IsAny<string>())).Returns(expectedResultDataFrameProxy);
            var sc = new SparkContext(null);

            // Act
            var originalDataFrame = new DataFrame(mockDataFrameProxy.Object, sc);
            var actualResultDataFrame = originalDataFrame.Drop(columnNameToDrop);

            // Assert
            mockDataFrameProxy.Verify(m => m.Drop(columnNameToDrop)); // assert that Drop on the proxy was invoked with the correct column name
            Assert.AreEqual(expectedResultDataFrameProxy, actualResultDataFrame.DataFrameProxy);
        }
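
The test above references a mockDataFrameProxy field that is declared outside the snippet. A minimal sketch of the assumed fixture, using Moq (the field below is an assumption, not part of the original):

        // Hypothetical test-class field assumed by TestDrop():
        private readonly Mock<IDataFrameProxy> mockDataFrameProxy = new Mock<IDataFrameProxy>();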
Example #2
        public void RemoveColumns_Test()
        {
            var dict = new Dictionary<string, List<object>>
            {
                { "itemID", new List<object> { "foo", "bar", "baz", "foo" } },
                { "catId", new List<object> { "A", "A", "B", "B" } },
                { "value1", new List<object> { 1, 2, 3, 4 } },
            };

            // Build the source DataFrame from the dictionary
            var df1 = new DataFrame(dict);

            var df2 = df1.Drop("catId");

            // Test: dropping "catId" leaves the remaining columns intact
            var c1f1 = df1["itemID"].ToList();
            var c1f2 = df1["value1"].ToList();

            Assert.Equal(3, df1.Columns.Count);

            var c2f1 = df2["itemID"].ToList();
            var c2f2 = df2["value1"].ToList();

            Assert.Equal(2, df2.Columns.Count);

            for (int i = 0; i < c1f1.Count; i++)
            {
                Assert.Equal(c1f1[i].ToString(), c2f1[i].ToString());
            }
            for (int i = 0; i < c2f2.Count; i++)
            {
                Assert.Equal(c1f2[i], c2f2[i]);
            }
        }
Example #3
        static void Main(string[] args)
        {
            SparkSession spark = SparkSession.Builder()
                                 .AppName("emrapp")
                                 .GetOrCreate();
            DataFrame df = spark.Read()
                           .Format("avro")
                           .Load(args[0]);

            df = df.Drop("address")
                 .GroupBy("itemid")
                 .Count();

            df.Show();
            df.Coalesce(1)
              .Write()
              .Format("csv")
              .Save($"{args[1]}/{DateTime.UtcNow.ToString("yyyy/MM/dd/HH-mm-ss")}");
        }
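
For reference, a sketch of how a job like this might be submitted through the .NET for Apache Spark runner (the master, jar version, assembly name, and paths below are assumptions, not from the original):

            /*
             * Hypothetical submission command:
             *   spark-submit
             *   --master yarn
             *   --class org.apache.spark.deploy.dotnet.DotnetRunner
             *   microsoft-spark-<version>.jar
             *   dotnet EmrApp.dll
             *   s3://my-bucket/input/data.avro
             *   s3://my-bucket/output
             */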
Example #4
        public void TestSignaturesV2_3_X()
        {
            Column col = _df["name"];

            col = _df["age"];

            DataFrame df = _df.ToDF();

            df = df.ToDF("name2", "age2");

            StructType schema = _df.Schema();

            Assert.NotNull(schema);

            _df.PrintSchema();

            _df.Explain();
            _df.Explain(true);
            _df.Explain(false);

            Assert.Equal(2, _df.Columns().ToArray().Length);

            _df.IsLocal();

            _df.IsStreaming();

            using (var tempDir = new TemporaryDirectory())
            {
                // A checkpoint directory must be set before calling the *Checkpoint() methods.
                _spark.SparkContext.SetCheckpointDir(tempDir.Path);

                _df.Checkpoint();
                _df.Checkpoint(false);

                _df.LocalCheckpoint();
                _df.LocalCheckpoint(false);
            }

            _df.WithWatermark("time", "10 minutes");

            _df.Show();
            _df.Show(10);
            _df.Show(10, 10);
            _df.Show(10, 10, true);

            _df.Join(_df);
            _df.Join(_df, "name");
            _df.Join(_df, new[] { "name" });
            _df.Join(_df, new[] { "name" }, "outer");
            _df.Join(_df, _df["age"] == _df["age"]);
            _df.Join(_df, _df["age"] == _df["age"], "outer");

            _df.CrossJoin(_df);

            _df.SortWithinPartitions("age");
            _df.SortWithinPartitions("age", "name");
            _df.SortWithinPartitions();
            _df.SortWithinPartitions(_df["age"]);
            _df.SortWithinPartitions(_df["age"], _df["name"]);

            _df.Sort("age");
            _df.Sort("age", "name");
            _df.Sort();
            _df.Sort(_df["age"]);
            _df.Sort(_df["age"], _df["name"]);

            _df.OrderBy("age");
            _df.OrderBy("age", "name");
            _df.OrderBy();
            _df.OrderBy(_df["age"]);
            _df.OrderBy(_df["age"], _df["name"]);

            _df.Hint("broadcast");
            _df.Hint("broadcast", new[] { "hello", "world" });

            _df.Col("age");

            _df.ColRegex("age");

            _df.As("alias");

            _df.Alias("alias");

            _df.Select("age");
            _df.Select("age", "name");
            _df.Select();
            _df.Select(_df["age"]);
            _df.Select(_df["age"], _df["name"]);

            _df.SelectExpr();
            _df.SelectExpr("age * 2");
            _df.SelectExpr("age * 2", "abs(age)");

            _df.Filter(_df["age"] > 21);
            _df.Filter("age > 21");

            _df.Where(_df["age"] > 21);
            _df.Where("age > 21");

            _df.GroupBy("age");
            _df.GroupBy("age", "name");
            _df.GroupBy();
            _df.GroupBy(_df["age"]);
            _df.GroupBy(_df["age"], _df["name"]);

            _df.Rollup("age");
            _df.Rollup("age", "name");
            _df.Rollup();
            _df.Rollup(_df["age"]);
            _df.Rollup(_df["age"], _df["name"]);

            _df.Cube("age");
            _df.Cube("age", "name");
            _df.Cube();
            _df.Cube(_df["age"]);
            _df.Cube(_df["age"], _df["name"]);

            _df.Agg(Avg(_df["age"]));
            _df.Agg(Avg(_df["age"]), Avg(_df["name"]));

            _df.Limit(10);

            _df.Union(_df);

            _df.UnionByName(_df);

            _df.Intersect(_df);

            _df.Except(_df);

            _df.Sample(0.5);
            _df.Sample(0.5, true);
            _df.Sample(0.5, false, 12345);

            _df.RandomSplit(new[] { 0.2, 0.8 });
            _df.RandomSplit(new[] { 0.2, 0.8 }, 12345);

            _df.WithColumn("age2", _df["age"]);

            _df.WithColumnRenamed("age", "age2");

            _df.Drop();
            _df.Drop("age");
            _df.Drop("age", "name");

            _df.Drop(_df["age"]);

            _df.DropDuplicates();
            _df.DropDuplicates("age");
            _df.DropDuplicates("age", "name");

            _df.Describe();
            _df.Describe("age");
            _df.Describe("age", "name");

            _df.Summary();
            _df.Summary("count");
            _df.Summary("count", "mean");

            _df.Head(2);
            _df.Head();

            _df.First();

            _df.Take(3).ToArray();

            _df.Collect().ToArray();

            _df.ToLocalIterator().ToArray();

            _df.Count();

            _df.Repartition(2);
            _df.Repartition(2, _df["age"]);
            _df.Repartition(_df["age"]);
            _df.Repartition();

            _df.RepartitionByRange(2, _df["age"]);
            _df.RepartitionByRange(_df["age"]);

            _df.Coalesce(1);

            _df.Distinct();

            _df.Persist();

            _df.Cache();

            _df.Unpersist();

            _df.CreateTempView("view");
            _df.CreateOrReplaceTempView("view");

            _df.CreateGlobalTempView("global_view");
            _df.CreateOrReplaceGlobalTempView("global_view");
        }
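
The test above depends on _spark and _df fixtures defined elsewhere in the test class. A minimal sketch of how they could be initialized; the class name, app name, schema, and people.json path are assumptions consistent with the age/name columns exercised above:

        // Hypothetical fixture for TestSignaturesV2_3_X; names and paths are assumptions:
        private readonly SparkSession _spark = SparkSession
            .Builder()
            .AppName("DataFrameTests")
            .GetOrCreate();

        private readonly DataFrame _df;

        public DataFrameTests()
        {
            var schema = new StructType(new[]
            {
                new StructField("age", new IntegerType()),
                new StructField("name", new StringType())
            });
            _df = _spark.Read().Schema(schema).Json("Resources/people.json");
        }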
Example #5
        public void Run(string[] args)
        {
            if (args.Length != 1)
            {
                Console.Error.WriteLine(
                    "Usage: Basic <path to SPARK_HOME/examples/src/main/resources/people.json>");
                Environment.Exit(1);
            }

            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("SQL basic example using .NET for Apache Spark")
                                 .Config("spark.some.config.option", "some-value")
                                 .GetOrCreate();

            // Need to explicitly specify the schema since pickling vs. arrow formatting
            // will return different types. Pickling will turn longs into ints if the values fit.
            // Same as the "age INT, name STRING" DDL-format string.
            var inputSchema = new StructType(new[]
            {
                new StructField("age", new IntegerType()),
                new StructField("name", new StringType())
            });
            DataFrame df = spark.Read().Schema(inputSchema).Json(args[0]);

            Spark.Sql.Types.StructType schema = df.Schema();
            Console.WriteLine(schema.SimpleString);

            IEnumerable<Row> rows = df.Collect();

            foreach (Row row in rows)
            {
                Console.WriteLine(row);
            }

            df.Show();

            df.PrintSchema();

            df.Select("name", "age", "age", "name").Show();

            df.Select(df["name"], df["age"] + 1).Show();

            df.Filter(df["age"] > 21).Show();

            df.GroupBy("age")
            .Agg(Avg(df["age"]), Avg(df["age"]), CountDistinct(df["age"], df["age"]))
            .Show();

            df.CreateOrReplaceTempView("people");

            // Registering Udf for SQL expression.
            DataFrame sqlDf = spark.Sql("SELECT * FROM people");

            sqlDf.Show();

            spark.Udf().Register<int?, string, string>(
                "my_udf",
                (age, name) => name + " with " + ((age.HasValue) ? age.Value.ToString() : "null"));

            sqlDf = spark.Sql("SELECT my_udf(*) FROM people");
            sqlDf.Show();

            // Using UDF via data frames.
            Func<Column, Column, Column> addition = Udf<int?, string, string>(
                (age, name) => name + " is " + (age.HasValue ? age.Value + 10 : 0));

            df.Select(addition(df["age"], df["name"])).Show();

            // Chaining example:
            Func<Column, Column> addition2 = Udf<string, string>(str => $"hello {str}!");

            df.Select(addition2(addition(df["age"], df["name"]))).Show();

            // Multiple UDF example:
            df.Select(addition(df["age"], df["name"]), addition2(df["name"])).Show();

            // UDF return type as array.
            Func<Column, Column> udfArray =
                Udf<string, string[]>((str) => new[] { str, str + str });

            df.Select(Explode(udfArray(df["name"]))).Show();

            // UDF return type as map.
            Func<Column, Column> udfMap =
                Udf<string, IDictionary<string, string[]>>(
                    (str) => new Dictionary<string, string[]>
                    {
                        { str, new[] { str, str } }
                    });

            df.Select(udfMap(df["name"]).As("UdfMap")).Show(truncate: 50);

            // Joins.
            DataFrame joinedDf = df.Join(df, "name");

            joinedDf.Show();

            DataFrame joinedDf2 = df.Join(df, new[] { "name", "age" });

            joinedDf2.Show();

            DataFrame joinedDf3 = df.Join(df, df["name"] == df["name"], "outer");

            joinedDf3.Show();

            // Union of two data frames
            DataFrame unionDf = df.Union(df);

            unionDf.Show();

            // Add new column to data frame
            df.WithColumn("location", Lit("Seattle")).Show();

            // Rename existing column
            df.WithColumnRenamed("name", "fullname").Show();

            // Filter rows with null age
            df.Filter(Col("age").IsNull()).Show();

            // Fill null values in age column with -1
            df.Na().Fill(-1, new[] { "age" }).Show();

            // Drop age column
            df.Drop(new[] { "age" }).Show();

            spark.Stop();
        }
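
The Run method above assumes an enclosing class and several static imports. A minimal sketch of the assumed scaffolding; the class name Basic is an assumption, and the using directives are inferred from the APIs called above:

using System;
using System.Collections.Generic;
using Microsoft.Spark.Sql;
using Microsoft.Spark.Sql.Types;
using static Microsoft.Spark.Sql.Functions; // Avg, CountDistinct, Udf, Explode, Lit, Col

internal class Basic // hypothetical class name
{
    private static void Main(string[] args) => new Basic().Run(args);

    // public void Run(string[] args) as shown above
}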
Example #6
        static void Main(string[] args)
        {
            /*
             * Copy mysql-connector-java-8.0.19.jar into the Spark / Hadoop folder.
             * Run the command below from this project's root folder:
             *   %SPARK_HOME%\bin\spark-submit
             *   --master local
             *   --class org.apache.spark.deploy.dotnet.DotnetRunner
             *   bin\Debug\netcoreapp3.1\microsoft-spark-2-4_2.11-1.0.0.jar
             *   dotnet
             *   bin\Debug\netcoreapp3.1\BatchDemo.dll
             *   data\amostra.csv
             *   jdbc:mysql://localhost:3306/teste_spark beneficios spark_user my-secret-password
             */

            if (args.Length == 0)
            {
                throw new ArgumentException("Provide the paths where the CSV files can be found");
            }

            string arquivoEntrada = args[0];

            // Get a reference to the Spark execution context
            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("Exemplo Batch")
                                 .GetOrCreate();

            // Define a fixed schema, with the desired column names and their types
            StructType schema = new StructType(new[]
            {
                new StructField("MES_REFERENCIA", new StringType()),
                new StructField("MES_COMPETENCIA", new StringType()),
                new StructField("UF", new StringType()),
                new StructField("CODIGO_MUNICIPIO", new IntegerType()),
                new StructField("MUNICIPIO", new StringType()),
                new StructField("CODIGO_FAVORECIDO", new StringType()),
                new StructField("NOME", new StringType()),
                new StructField("DATA_SAQUE", new DateType()),
                new StructField("VALOR_TEXTO", new StringType())
            });

            // Read the data from disk into Spark
            DataFrame df = spark.Read()
                           .Format("csv")
                           .Schema(schema)
                           .Option("sep", ";")
                           .Option("header", true)
                           .Option("dateFormat", "dd/MM/yyyy")
                           .Load(arquivoEntrada);

            df.PrintSchema();
            df.Show(5, 10);

            // Drop columns we no longer need
            df = df.Drop("MES_REFERENCIA")
                 .Drop("MES_COMPETENCIA")
                 .Drop("CODIGO_MUNICIPIO")
                 .Drop("CODIGO_FAVORECIDO");
            df.Show(5, 10);

            // Convert the VALOR column from string to decimal, accounting for the Brazilian number format differing from the American one
            df = df.WithColumn("VALOR", RegexpReplace(
                                   RegexpReplace(
                                       df.Col("VALOR_TEXTO")
                                       , "\\.", "")
                                   , ",", ".")
                               .Cast("decimal(10,2)"))
                 .Drop("VALOR_TEXTO");
            df.PrintSchema();
            df.Show(5, 10);

            // Apply a filter to the data
            df = df.Where(df.Col("UF").NotEqual("AC"));
            //df = df.Where("UF <> 'AC'");  // passing a WHERE expression also works as a filter
            df.Show(5, 10);

            spark.Udf().Register<string, string, string>("ConcatenarMunicipio",
                (uf, municipio) => ConcatenarMunicipio(uf, municipio));

            // Create a new column from a concatenation, dropping old columns we no longer need
            df = df.WithColumn("MUNICIPIO",
                               CallUDF("ConcatenarMunicipio", df.Col("UF"), df.Col("MUNICIPIO")))
                 .Drop("UF");
            // Perform an aggregation
            DataFrame somatorio = df.GroupBy("MUNICIPIO")
                                  .Sum("VALOR")
                                  .WithColumnRenamed("sum(VALOR)", "SOMA_BENEFICIOS");

            somatorio
            .OrderBy(somatorio.Col("SOMA_BENEFICIOS").Desc())
            .Show(15, 40);

            if (args.Length >= 5)
            {
                string urlJdbc = args[1];   // jdbc:mysql://localhost:3306/teste_spark
                string tabela  = args[2];   // beneficios
                string usuario = args[3];   // spark_user
                string senha   = args[4];   // my-secret-password

                // Save to the database using Spark's native functionality
                somatorio
                .Write()
                .Format("jdbc")
                .Option("driver", "com.mysql.cj.jdbc.Driver")
                .Option("url", urlJdbc)
                .Option("dbtable", tabela)
                .Option("user", usuario)
                .Option("password", senha)
                .Mode(SaveMode.Overwrite)
                .Save();
            }
            spark.Stop();
        }
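
The UDF registration above delegates to a ConcatenarMunicipio helper defined outside the snippet. A minimal sketch of the assumed method; the exact concatenation format is an assumption:

        // Hypothetical helper assumed by the "ConcatenarMunicipio" UDF registration:
        private static string ConcatenarMunicipio(string uf, string municipio)
        {
            return $"{uf} - {municipio}";
        }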
Example #7
        static void Main(string[] args)
        {
            // Define columns to remove
            string[] dropCols = new string[]
            {
                "CAMIS",
                "CUISINE DESCRIPTION",
                "VIOLATION DESCRIPTION",
                "BORO",
                "BUILDING",
                "STREET",
                "ZIPCODE",
                "PHONE",
                "ACTION",
                "GRADE DATE",
                "RECORD DATE",
                "Latitude",
                "Longitude",
                "Community Board",
                "Council District",
                "Census Tract",
                "BIN",
                "BBL",
                "NTA"
            };

            // Create SparkSession
            var sc =
                SparkSession
                .Builder()
                .AppName("Restaurant_Inspections_ETL")
                .GetOrCreate();

            // Load data
            DataFrame df =
                sc
                .Read()
                .Option("header", "true")
                .Option("inferSchema", "true")
                .Csv("Data/NYC-Restaurant-Inspections.csv");

            // Remove columns and missing values
            DataFrame cleanDf =
                df
                .Drop(dropCols)
                .WithColumnRenamed("INSPECTION DATE", "INSPECTIONDATE")
                .WithColumnRenamed("INSPECTION TYPE", "INSPECTIONTYPE")
                .WithColumnRenamed("CRITICAL FLAG", "CRITICALFLAG")
                .WithColumnRenamed("VIOLATION CODE", "VIOLATIONCODE")
                .Na()
                .Drop();

            // Encode CRITICAL FLAG column
            DataFrame labeledFlagDf =
                cleanDf
                .WithColumn("CRITICALFLAG",
                            When(Col("CRITICALFLAG") == "Y", 1)
                            .Otherwise(0));

            // Aggregate violations by business and inspection
            DataFrame groupedDf =
                labeledFlagDf
                .GroupBy("DBA", "INSPECTIONDATE", "INSPECTIONTYPE", "CRITICALFLAG", "SCORE", "GRADE")
                .Agg(
                    CollectSet(Col("VIOLATIONCODE")).Alias("CODES"),
                    Sum("CRITICALFLAG").Alias("FLAGS"))
                .Drop("DBA", "INSPECTIONDATE")
                .WithColumn("CODES", ArrayJoin(Col("CODES"), ","))
                .Select("INSPECTIONTYPE", "CODES", "FLAGS", "SCORE", "GRADE");

            // Split into graded and ungraded DataFrames
            DataFrame gradedDf =
                groupedDf
                .Filter(
                    Col("GRADE") == "A" |
                    Col("GRADE") == "B" |
                    Col("GRADE") == "C");

            DataFrame ungradedDf =
                groupedDf
                .Filter(
                    Col("GRADE") != "A" &
                    Col("GRADE") != "B" &
                    Col("GRADE") != "C");

            // Save DataFrames
            var timestamp = ((DateTimeOffset)DateTime.UtcNow).ToUnixTimeSeconds().ToString();

            var saveDirectory = Path.Join("Output", timestamp);

            if (!Directory.Exists(saveDirectory))
            {
                Directory.CreateDirectory(saveDirectory);
            }

            gradedDf.Write().Mode(SaveMode.Overwrite).Csv(Path.Join(saveDirectory, "Graded"));

            ungradedDf.Write().Mode(SaveMode.Overwrite).Csv(Path.Join(saveDirectory, "Ungraded"));
        }
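
A sketch of the using directives this example relies on, inferred from the APIs called above (the list is an assumption, not part of the original):

// Hypothetical using directives for the example above:
using System;
using System.IO;                            // Path, Directory
using Microsoft.Spark.Sql;                  // SparkSession, DataFrame, SaveMode
using static Microsoft.Spark.Sql.Functions; // Col, When, CollectSet, Sum, ArrayJoin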