public void TestDrop()
{
    // Arrange
    const string columnNameToDrop = "column1";
    var expectedResultDataFrameProxy = new Mock<IDataFrameProxy>().Object;
    mockDataFrameProxy.Setup(m => m.Drop(It.IsAny<string>())).Returns(expectedResultDataFrameProxy);
    var sc = new SparkContext(null);

    // Act
    var originalDataFrame = new DataFrame(mockDataFrameProxy.Object, sc);
    var actualResultDataFrame = originalDataFrame.Drop(columnNameToDrop);

    // Assert
    mockDataFrameProxy.Verify(m => m.Drop(columnNameToDrop)); // assert Drop of proxy was invoked with correct parameters
    Assert.AreEqual(expectedResultDataFrameProxy, actualResultDataFrame.DataFrameProxy);
}
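The test above is the standard Moq arrange/act/assert pattern: Setup(...).Returns(...) stubs the proxy call, and Verify(...) asserts that Drop was invoked with the expected argument. Here is a minimal, self-contained sketch of that same pattern, using a hypothetical ICalculator interface in place of the Spark proxy types:

using Moq;

public interface ICalculator
{
    int Add(int a, int b);
}

public static class MoqPatternSketch
{
    public static void Demo()
    {
        // Arrange: stub Add to return 42 for any arguments.
        var mock = new Mock<ICalculator>();
        mock.Setup(m => m.Add(It.IsAny<int>(), It.IsAny<int>())).Returns(42);

        // Act.
        int result = mock.Object.Add(1, 2);

        // Assert: result == 42 here, and Verify throws if Add(1, 2) was never called.
        mock.Verify(m => m.Add(1, 2));
    }
}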
public void RemoveColumns_Test()
{
    var dict = new Dictionary<string, List<object>>
    {
        { "itemID", new List<object>() { "foo", "bar", "baz", "foo" } },
        { "catId", new List<object>() { "A", "A", "B", "B" } },
        { "value1", new List<object>() { 1, 2, 3, 4 } },
    };

    var df1 = new DataFrame(dict);
    var df2 = df1.Drop("catId");

    // Test: the original frame still has all three columns...
    var c1f1 = df1["itemID"].ToList();
    var c1f2 = df1["value1"].ToList();
    Assert.Equal(3, df1.Columns.Count);

    // ...while the returned frame has only the two remaining ones.
    var c2f1 = df2["itemID"].ToList();
    var c2f2 = df2["value1"].ToList();
    Assert.Equal(2, df2.Columns.Count);

    // The surviving columns keep their original values.
    for (int i = 0; i < c1f1.Count; i++)
    {
        Assert.Equal(c1f1[i].ToString(), c2f1[i].ToString());
    }
    for (int i = 0; i < c2f2.Count; i++)
    {
        Assert.Equal(c1f2[i], c2f2[i]);
    }
}
static void Main(string[] args)
{
    SparkSession spark = SparkSession.Builder()
        .AppName("emrapp")
        .GetOrCreate();

    DataFrame df = spark.Read()
        .Format("avro")
        .Load(args[0]);

    df = df.Drop("address")
        .GroupBy("itemid")
        .Count();

    df.Show();

    df.Coalesce(1)
        .Write()
        .Format("csv")
        .Save($"{args[1]}/{DateTime.UtcNow.ToString("yyyy/MM/dd/hh-mm-ss")}");
}
public void TestSignaturesV2_3_X()
{
    Column col = _df["name"];
    col = _df["age"];

    DataFrame df = _df.ToDF();
    df = df.ToDF("name2", "age2");

    StructType schema = _df.Schema();
    Assert.NotNull(schema);

    _df.PrintSchema();

    _df.Explain();
    _df.Explain(true);
    _df.Explain(false);

    Assert.Equal(2, _df.Columns().ToArray().Length);

    _df.IsLocal();
    _df.IsStreaming();

    using (var tempDir = new TemporaryDirectory())
    {
        // The following is required for *Checkpoint().
        _spark.SparkContext.SetCheckpointDir(tempDir.Path);

        _df.Checkpoint();
        _df.Checkpoint(false);

        _df.LocalCheckpoint();
        _df.LocalCheckpoint(false);
    }

    _df.WithWatermark("time", "10 minutes");

    _df.Show();
    _df.Show(10);
    _df.Show(10, 10);
    _df.Show(10, 10, true);

    _df.Join(_df);
    _df.Join(_df, "name");
    _df.Join(_df, new[] { "name" });
    _df.Join(_df, new[] { "name" }, "outer");
    _df.Join(_df, _df["age"] == _df["age"]);
    _df.Join(_df, _df["age"] == _df["age"], "outer");

    _df.CrossJoin(_df);

    _df.SortWithinPartitions("age");
    _df.SortWithinPartitions("age", "name");
    _df.SortWithinPartitions();
    _df.SortWithinPartitions(_df["age"]);
    _df.SortWithinPartitions(_df["age"], _df["name"]);

    _df.Sort("age");
    _df.Sort("age", "name");
    _df.Sort();
    _df.Sort(_df["age"]);
    _df.Sort(_df["age"], _df["name"]);

    _df.OrderBy("age");
    _df.OrderBy("age", "name");
    _df.OrderBy();
    _df.OrderBy(_df["age"]);
    _df.OrderBy(_df["age"], _df["name"]);

    _df.Hint("broadcast");
    _df.Hint("broadcast", new[] { "hello", "world" });

    _df.Col("age");
    _df.ColRegex("age");

    _df.As("alias");
    _df.Alias("alias");

    _df.Select("age");
    _df.Select("age", "name");
    _df.Select();
    _df.Select(_df["age"]);
    _df.Select(_df["age"], _df["name"]);

    _df.SelectExpr();
    _df.SelectExpr("age * 2");
    _df.SelectExpr("age * 2", "abs(age)");

    _df.Filter(_df["age"] > 21);
    _df.Filter("age > 21");

    _df.Where(_df["age"] > 21);
    _df.Where("age > 21");

    _df.GroupBy("age");
    _df.GroupBy("age", "name");
    _df.GroupBy();
    _df.GroupBy(_df["age"]);
    _df.GroupBy(_df["age"], _df["name"]);

    _df.Rollup("age");
    _df.Rollup("age", "name");
    _df.Rollup();
    _df.Rollup(_df["age"]);
    _df.Rollup(_df["age"], _df["name"]);

    _df.Cube("age");
    _df.Cube("age", "name");
    _df.Cube();
    _df.Cube(_df["age"]);
    _df.Cube(_df["age"], _df["name"]);

    _df.Agg(Avg(_df["age"]));
    _df.Agg(Avg(_df["age"]), Avg(_df["name"]));

    _df.Limit(10);

    _df.Union(_df);
    _df.UnionByName(_df);

    _df.Intersect(_df);
    _df.Except(_df);

    _df.Sample(0.5);
    _df.Sample(0.5, true);
    _df.Sample(0.5, false, 12345);

    _df.RandomSplit(new[] { 0.2, 0.8 });
    _df.RandomSplit(new[] { 0.2, 0.8 }, 12345);

    _df.WithColumn("age2", _df["age"]);
    _df.WithColumnRenamed("age", "age2");

    _df.Drop();
    _df.Drop("age");
    _df.Drop("age", "name");
    _df.Drop(_df["age"]);

    _df.DropDuplicates();
    _df.DropDuplicates("age");
    _df.DropDuplicates("age", "name");

    _df.Describe();
    _df.Describe("age");
    _df.Describe("age", "name");

    _df.Summary();
    _df.Summary("count");
    _df.Summary("count", "mean");

    _df.Head(2);
    _df.Head();

    _df.First();

    _df.Take(3).ToArray();
    _df.Collect().ToArray();
    _df.ToLocalIterator().ToArray();

    _df.Count();

    _df.Repartition(2);
    _df.Repartition(2, _df["age"]);
    _df.Repartition(_df["age"]);
    _df.Repartition();

    _df.RepartitionByRange(2, _df["age"]);
    _df.RepartitionByRange(_df["age"]);

    _df.Coalesce(1);
    _df.Distinct();

    _df.Persist();
    _df.Cache();
    _df.Unpersist();

    _df.CreateTempView("view");
    _df.CreateOrReplaceTempView("view");
    _df.CreateGlobalTempView("global_view");
    _df.CreateOrReplaceGlobalTempView("global_view");
}
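The signature test above exercises every Drop overload without inspecting results. A minimal runnable sketch showing the same overloads against a concrete DataFrame (the app name and the SQL-constructed sample row are assumptions for illustration):

using Microsoft.Spark.Sql;

class DropOverloadsSketch
{
    static void Main()
    {
        SparkSession spark = SparkSession.Builder().AppName("drop-sketch").GetOrCreate();

        // One-row DataFrame with three columns, built via SQL for brevity.
        DataFrame df = spark.Sql("SELECT 25 AS age, 'alice' AS name, 'x' AS extra");

        df.Drop("extra").Show();        // drop a single column by name
        df.Drop("age", "name").Show();  // drop several columns at once
        df.Drop(df["extra"]).Show();    // drop by Column reference

        spark.Stop();
    }
}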
public void Run(string[] args)
{
    if (args.Length != 1)
    {
        Console.Error.WriteLine(
            "Usage: Basic <path to SPARK_HOME/examples/src/main/resources/people.json>");
        Environment.Exit(1);
    }

    SparkSession spark = SparkSession
        .Builder()
        .AppName("SQL basic example using .NET for Apache Spark")
        .Config("spark.some.config.option", "some-value")
        .GetOrCreate();

    // Need to explicitly specify the schema since pickling vs. arrow formatting
    // will return different types. Pickling will turn longs into ints if the values fit.
    // Same as the "age INT, name STRING" DDL-format string.
    var inputSchema = new StructType(new[]
    {
        new StructField("age", new IntegerType()),
        new StructField("name", new StringType())
    });
    DataFrame df = spark.Read().Schema(inputSchema).Json(args[0]);

    Spark.Sql.Types.StructType schema = df.Schema();
    Console.WriteLine(schema.SimpleString);

    IEnumerable<Row> rows = df.Collect();
    foreach (Row row in rows)
    {
        Console.WriteLine(row);
    }

    df.Show();

    df.PrintSchema();

    df.Select("name", "age", "age", "name").Show();

    df.Select(df["name"], df["age"] + 1).Show();

    df.Filter(df["age"] > 21).Show();

    df.GroupBy("age")
        .Agg(Avg(df["age"]), Avg(df["age"]), CountDistinct(df["age"], df["age"]))
        .Show();

    df.CreateOrReplaceTempView("people");

    // Registering a UDF for use in SQL expressions.
    DataFrame sqlDf = spark.Sql("SELECT * FROM people");
    sqlDf.Show();

    spark.Udf().Register<int?, string, string>(
        "my_udf",
        (age, name) => name + " with " + (age.HasValue ? age.Value.ToString() : "null"));

    sqlDf = spark.Sql("SELECT my_udf(*) FROM people");
    sqlDf.Show();

    // Using UDFs via DataFrames.
    Func<Column, Column, Column> addition = Udf<int?, string, string>(
        (age, name) => name + " is " + (age.HasValue ? age.Value + 10 : 0));
    df.Select(addition(df["age"], df["name"])).Show();

    // Chaining example:
    Func<Column, Column> addition2 = Udf<string, string>(str => $"hello {str}!");
    df.Select(addition2(addition(df["age"], df["name"]))).Show();

    // Multiple-UDF example:
    df.Select(addition(df["age"], df["name"]), addition2(df["name"])).Show();

    // UDF with an array return type.
    Func<Column, Column> udfArray =
        Udf<string, string[]>((str) => new[] { str, str + str });
    df.Select(Explode(udfArray(df["name"]))).Show();

    // UDF with a map return type.
    Func<Column, Column> udfMap =
        Udf<string, IDictionary<string, string[]>>(
            (str) => new Dictionary<string, string[]> { { str, new[] { str, str } } });
    df.Select(udfMap(df["name"]).As("UdfMap")).Show(truncate: 50);

    // Joins.
    DataFrame joinedDf = df.Join(df, "name");
    joinedDf.Show();

    DataFrame joinedDf2 = df.Join(df, new[] { "name", "age" });
    joinedDf2.Show();

    DataFrame joinedDf3 = df.Join(df, df["name"] == df["name"], "outer");
    joinedDf3.Show();

    // Union of two DataFrames.
    DataFrame unionDf = df.Union(df);
    unionDf.Show();

    // Add a new column to the DataFrame.
    df.WithColumn("location", Lit("Seattle")).Show();

    // Rename an existing column.
    df.WithColumnRenamed("name", "fullname").Show();

    // Filter rows with a null age.
    df.Filter(Col("age").IsNull()).Show();

    // Fill null values in the age column with -1.
    df.Na().Fill(-1, new[] { "age" }).Show();

    // Drop the age column.
    df.Drop(new[] { "age" }).Show();

    spark.Stop();
}
static void Main(string[] args)
{
    /*
     * Copy mysql-connector-java-8.0.19.jar to the Spark / Hadoop folder.
     * Run the command below from this project's root folder:
     * %SPARK_HOME%\bin\spark-submit
     *   --master local
     *   --class org.apache.spark.deploy.dotnet.DotnetRunner
     *   bin\Debug\netcoreapp3.1\microsoft-spark-2-4_2.11-1.0.0.jar
     *   dotnet
     *   bin\Debug\netcoreapp3.1\BatchDemo.dll
     *   data\amostra.csv
     *   jdbc:mysql://localhost:3306/teste_spark beneficios spark_user my-secret-password
     */
    if (args.Length == 0)
    {
        throw new ArgumentException("Provide the paths of the CSV files to read");
    }
    string arquivoEntrada = args[0];

    // Get a reference to the Spark execution context.
    SparkSession spark = SparkSession
        .Builder()
        .AppName("Exemplo Batch")
        .GetOrCreate();

    // Define a fixed schema, with the column names and types we want.
    StructType schema = new StructType(new[]
    {
        new StructField("MES_REFERENCIA", new StringType()),
        new StructField("MES_COMPETENCIA", new StringType()),
        new StructField("UF", new StringType()),
        new StructField("CODIGO_MUNICIPIO", new IntegerType()),
        new StructField("MUNICIPIO", new StringType()),
        new StructField("CODIGO_FAVORECIDO", new StringType()),
        new StructField("NOME", new StringType()),
        new StructField("DATA_SAQUE", new DateType()),
        new StructField("VALOR_TEXTO", new StringType())
    });

    // Read the data from disk into Spark.
    DataFrame df = spark.Read()
        .Format("csv")
        .Schema(schema)
        .Option("sep", ";")
        .Option("header", true)
        .Option("dateFormat", "dd/MM/yyyy")
        .Load(arquivoEntrada);

    df.PrintSchema();
    df.Show(5, 10);

    // Remove the columns we no longer need.
    df = df.Drop("MES_REFERENCIA")
        .Drop("MES_COMPETENCIA")
        .Drop("CODIGO_MUNICIPIO")
        .Drop("CODIGO_FAVORECIDO");
    df.Show(5, 10);

    // Convert the VALOR column from string to decimal, given that the Brazilian
    // number format differs from the American one.
    df = df.WithColumn("VALOR",
            RegexpReplace(
                    RegexpReplace(df.Col("VALOR_TEXTO"), "\\.", ""),
                    ",", ".")
                .Cast("decimal(10,2)"))
        .Drop("VALOR_TEXTO");
    df.PrintSchema();
    df.Show(5, 10);

    // Apply a filter to the data.
    df = df.Where(df.Col("UF").NotEqual("AC"));
    //df = df.Where("UF <> 'AC'"); // passing a WHERE expression also works as a filter
    df.Show(5, 10);

    spark.Udf().Register<string, string, string>("ConcatenarMunicipio",
        (uf, municipio) => ConcatenarMunicipio(uf, municipio));

    // Create a new column from a concatenation and remove the old columns we no longer need.
    df = df.WithColumn("MUNICIPIO",
            CallUDF("ConcatenarMunicipio", df.Col("UF"), df.Col("MUNICIPIO")))
        .Drop("UF");

    // Perform an aggregation.
    DataFrame somatorio = df.GroupBy("MUNICIPIO")
        .Sum("VALOR")
        .WithColumnRenamed("sum(VALOR)", "SOMA_BENEFICIOS");
    somatorio
        .OrderBy(somatorio.Col("SOMA_BENEFICIOS").Desc())
        .Show(15, 40);

    // All four JDBC arguments are required for the database step.
    if (args.Length >= 5)
    {
        string urlJdbc = args[1];  // jdbc:mysql://localhost:3306/teste_spark
        string tabela = args[2];   // beneficios
        string usuario = args[3];  // spark_user
        string senha = args[4];    // my-secret-password

        // Save to a database using Spark's native JDBC support.
        somatorio
            .Write()
            .Format("jdbc")
            .Option("driver", "com.mysql.cj.jdbc.Driver")
            .Option("url", urlJdbc)
            .Option("dbtable", tabela)
            .Option("user", usuario)
            .Option("password", senha)
            .Mode(SaveMode.Overwrite)
            .Save();
    }

    spark.Stop();
}
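To verify the write, the same connection options can be reused to read the table back through spark.Read(). A minimal sketch, assuming the sample connection values from the header comment above:

using Microsoft.Spark.Sql;

class JdbcReadSketch
{
    static void Main()
    {
        SparkSession spark = SparkSession.Builder().AppName("jdbc-read-sketch").GetOrCreate();

        // Read the saved table back over JDBC (connection values assumed
        // to match the ones used for the write above).
        DataFrame beneficios = spark.Read()
            .Format("jdbc")
            .Option("driver", "com.mysql.cj.jdbc.Driver")
            .Option("url", "jdbc:mysql://localhost:3306/teste_spark")
            .Option("dbtable", "beneficios")
            .Option("user", "spark_user")
            .Option("password", "my-secret-password")
            .Load();
        beneficios.Show();

        spark.Stop();
    }
}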
static void Main(string[] args)
{
    // Define columns to remove
    string[] dropCols = new string[]
    {
        "CAMIS", "CUISINE DESCRIPTION", "VIOLATION DESCRIPTION", "BORO",
        "BUILDING", "STREET", "ZIPCODE", "PHONE", "ACTION", "GRADE DATE",
        "RECORD DATE", "Latitude", "Longitude", "Community Board",
        "Council District", "Census Tract", "BIN", "BBL", "NTA"
    };

    // Create SparkSession
    var sc = SparkSession
        .Builder()
        .AppName("Restaurant_Inspections_ETL")
        .GetOrCreate();

    // Load data
    DataFrame df = sc
        .Read()
        .Option("header", "true")
        .Option("inferSchema", "true")
        .Csv("Data/NYC-Restaurant-Inspections.csv");

    // Remove columns and missing values
    DataFrame cleanDf = df
        .Drop(dropCols)
        .WithColumnRenamed("INSPECTION DATE", "INSPECTIONDATE")
        .WithColumnRenamed("INSPECTION TYPE", "INSPECTIONTYPE")
        .WithColumnRenamed("CRITICAL FLAG", "CRITICALFLAG")
        .WithColumnRenamed("VIOLATION CODE", "VIOLATIONCODE")
        .Na()
        .Drop();

    // Encode CRITICAL FLAG column
    DataFrame labeledFlagDf = cleanDf
        .WithColumn("CRITICALFLAG",
            When(Col("CRITICALFLAG") == "Y", 1)
                .Otherwise(0));

    // Aggregate violations by business and inspection
    DataFrame groupedDf = labeledFlagDf
        .GroupBy("DBA", "INSPECTIONDATE", "INSPECTIONTYPE", "CRITICALFLAG", "SCORE", "GRADE")
        .Agg(
            CollectSet(Col("VIOLATIONCODE")).Alias("CODES"),
            Sum("CRITICALFLAG").Alias("FLAGS"))
        .Drop("DBA", "INSPECTIONDATE")
        .WithColumn("CODES", ArrayJoin(Col("CODES"), ","))
        .Select("INSPECTIONTYPE", "CODES", "FLAGS", "SCORE", "GRADE");

    // Split into graded and ungraded DataFrames
    DataFrame gradedDf = groupedDf
        .Filter(
            Col("GRADE") == "A" |
            Col("GRADE") == "B" |
            Col("GRADE") == "C");

    DataFrame ungradedDf = groupedDf
        .Filter(
            Col("GRADE") != "A" &
            Col("GRADE") != "B" &
            Col("GRADE") != "C");

    // Save DataFrames
    var timestamp = ((DateTimeOffset)DateTime.UtcNow).ToUnixTimeSeconds().ToString();
    var saveDirectory = Path.Join("Output", timestamp);
    if (!Directory.Exists(saveDirectory))
    {
        Directory.CreateDirectory(saveDirectory);
    }

    gradedDf.Write().Mode(SaveMode.Overwrite).Csv(Path.Join(saveDirectory, "Graded"));
    ungradedDf.Write().Mode(SaveMode.Overwrite).Csv(Path.Join(saveDirectory, "Ungraded"));
}
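Note that the pipeline above chains two different "drop" operations: DataFrame.Drop(dropCols) removes columns, while Na().Drop() removes rows containing nulls. A minimal sketch contrasting the two (the app name and the inline VALUES data are assumptions for illustration):

using Microsoft.Spark.Sql;

class DropVsNaDrop
{
    static void Main()
    {
        SparkSession spark = SparkSession.Builder().AppName("drop-vs-nadrop").GetOrCreate();

        // Two rows; the first has a null in column y.
        DataFrame df = spark.Sql(
            "SELECT * FROM VALUES ('a', 1, CAST(NULL AS INT)), ('b', 2, 3) AS t(name, x, y)");

        df.Drop("y").Show();    // column removal: both rows remain, column y is gone
        df.Na().Drop().Show();  // row removal: only the row without nulls remains

        spark.Stop();
    }
}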