public static void leerTxt()
{
    // Create a Spark session
    SparkSession spark = SparkSession
        .Builder()
        .AppName("word_count_sample")
        .GetOrCreate();

    // Create the initial DataFrame
    DataFrame dataFrame = spark.Read().Text("input.txt");

    // Count words
    DataFrame words = dataFrame
        .Select(Functions.Split(Functions.Col("value"), " ").Alias("words"))
        .Select(Functions.Explode(Functions.Col("words")).Alias("word"))
        .GroupBy("word")
        .Count()
        .OrderBy(Functions.Col("count").Desc());

    // Show results
    words.Show();

    // Stop the Spark session
    spark.Stop();
}
static void Main(string[] args)
{
    // Create Spark session
    SparkSession spark = SparkSession
        .Builder()
        .AppName("word_count_sample")
        .GetOrCreate();

    // Create initial DataFrame from the file passed on the command line
    string filePath = args[0];
    DataFrame dataFrame = spark.Read().Text(filePath);

    // Count words
    DataFrame words = dataFrame
        .Select(Split(Col("value"), " ").Alias("words"))
        .Select(Explode(Col("words")).Alias("word"))
        .GroupBy("word")
        .Count()
        .OrderBy(Col("count").Desc());

    // Display results
    words.Show();

    // Stop Spark session
    spark.Stop();
}
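For context, the Main method above receives its input path (args[0]) from spark-submit via the DotnetRunner entry point. A sketch of a typical launch command, kept in comment form like the BatchDemo snippet further down; the jar and assembly names are assumptions, substitute your own build output:

// %SPARK_HOME%\bin\spark-submit
//   --master local
//   --class org.apache.spark.deploy.dotnet.DotnetRunner
//   microsoft-spark-2-4_2.11-1.0.0.jar
//   dotnet WordCountApp.dll input.txt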
public static void leerJSON()
{
    SparkSession spark = SparkSession
        .Builder()
        .AppName("word_count_sample")
        .GetOrCreate();

    // A CSV dataset is pointed to by path.
    // The path can be either a single CSV file or a directory of CSV files.
    string path = "data/sample_data.csv";

    DataFrame df = spark.Read().Csv(path);
    df.Show();
    // +------------------+
    // |               _c0|
    // +------------------+
    // |      name;age;job|
    // |Jorge;30;Developer|
    // |  Bob;32;Developer|
    // +------------------+

    // Count names with SQL; the DataFrame must first be registered
    // as a temp view before spark.Sql() can reference it.
    df.CreateOrReplaceTempView("sample_data");
    DataFrame sqlDf = spark.Sql("SELECT * FROM sample_data");

    // Show results
    sqlDf.Show();

    // Stop the Spark session
    spark.Stop();
}
static void Main(string[] args)
{
    // Set the debug backend port; it has to match the one in the Dockerfile.
    System.Environment.SetEnvironmentVariable("DOTNETBACKEND_PORT", "12345");

    // Create a Spark session.
    SparkSession spark = SparkSession
        .Builder()
        .AppName("word_count_sample")
        .GetOrCreate();

    // Create initial DataFrame.
    DataFrame dataFrame = spark.Read().Text("input.txt");

    // Count words.
    DataFrame words = dataFrame
        .Select(Functions.Split(Functions.Col("value"), " ").Alias("words"))
        .Select(Functions.Explode(Functions.Col("words")).Alias("word"))
        .GroupBy("word")
        .Count()
        .OrderBy(Functions.Col("count").Desc());

    // Show results.
    words.Show();

    // Stop Spark session.
    spark.Stop();
}
static void Main(string[] args) { Console.WriteLine("Start SparkSession"); SparkSession sparkSession = SparkSession.Builder().AppName("Street Counter").GetOrCreate(); DataFrame dfCsv = sparkSession .Read() .Option("delimiter", ";") .Schema("WOJ string ,POW string ,GMI string ,RODZ_GMI string , " + "SYM string , SYM_UL string , " + "CECHA string , NAZWA_1 string ,NAZWA_2 string , " + "STAN_NA string") .Csv("streets.csv"); DataFrame dataIn = dfCsv .WithColumn("STREET", Functions.ConcatWs(" ", dfCsv["CECHA"], dfCsv["NAZWA_1"], dfCsv["NAZWA_2"])); DataFrame dataGroup = dataIn .Select("STREET") .GroupBy("STREET") .Count() .WithColumnRenamed("count", "COUNT"); DataFrame dataOut = dataGroup .OrderBy(dataGroup["COUNT"] .Desc() ); dataOut .Coalesce(1) .Write() .Option("delimiter", ";") .Csv("result"); sparkSession.Stop(); Console.WriteLine("Stop SparkSession"); }
static void Main(string[] args)
{
    // 1. Create a Spark session
    SparkSession spark = SparkSession
        .Builder()
        .AppName("word_count_sample")
        .GetOrCreate();

    // 2. Create the initial DataFrame with an explicit schema
    DataFrame dataFrame = spark.Read()
        .Schema("Assertid STRING, properties STRING, Value STRING, TimeStamp TIMESTAMP")
        .Csv("DataBook.csv");
    dataFrame.Show();

    // Drop any rows with null/empty values
    DataFrameNaFunctions dropEmptyTableRows = dataFrame.Na();
    DataFrame cleanedProjects = dropEmptyTableRows.Drop("any");

    // Remove unnecessary columns
    cleanedProjects = cleanedProjects.Drop("Assertid", "properties", "Value", "TimeStamp");
    cleanedProjects.Show();

    // Stop Spark session
    spark.Stop();
}
public static void runSpark(string file_path, string cores, string nodes, int nrows)
{
    // Create Spark session
    SparkSession spark = SparkSession
        .Builder()
        .AppName("word_count_sample")
        .Config("spark.executor.cores", cores)
        .Config("spark.executor.instances", nodes)
        .GetOrCreate();

    // Create initial DataFrame
    DataFrame dataFrame = spark
        .Read()
        .Option("header", true)
        .Option("inferSchema", true)
        .Schema("quizzes string, solutions string")
        .Csv(file_path);

    DataFrame dataFrame2 = dataFrame.Limit(nrows);

    // Register the solver as a UDF so Spark SQL can call it
    spark.Udf().Register<string, string>(
        "SudokuUDF",
        (sudoku) => Sudokusolution(sudoku));

    dataFrame2.CreateOrReplaceTempView("Resolved");
    DataFrame sqlDf = spark.Sql(
        "SELECT quizzes, SudokuUDF(quizzes) AS Resolution FROM Resolved");
    sqlDf.Show();

    spark.Stop();
}
public void Run(string[] args) { if (args.Length != 1) { Console.Error.WriteLine( "Usage: Datasource <path to SPARK_HOME/examples/src/main/resources/>"); Environment.Exit(1); } string parquet = Path.Combine(args[0], "users.parquet"); string json = Path.Combine(args[0], "people.json"); string csv = Path.Combine(args[0], "people.csv"); string orc = Path.Combine(args[0], "users.orc"); SparkSession spark = SparkSession .Builder() .AppName("SQL Datasource example using .NET for Apache Spark") .Config("spark.some.config.option", "some-value") .GetOrCreate(); RunBasicDatasourceExample(spark, parquet, json, csv, orc); RunParquetExample(spark, json); RunDatasourceExample(spark); spark.Stop(); }
static void runSpark(string file_path, string cores, string nodes, int nrows)
{
    // Create Spark session
    SparkSession spark = SparkSession
        .Builder()
        .AppName("Solving " + nrows + " sudokus by combinatorial evolution with " +
                 cores + " core(s) and " + nodes + " node(s)")
        .Config("spark.executor.cores", cores)
        .Config("spark.executor.instances", nodes)
        .GetOrCreate();

    // Create initial DataFrame
    DataFrame dataFrame = spark
        .Read()
        .Option("header", true)
        .Option("inferSchema", true)
        .Schema("quizzes string, solutions string")
        .Csv(file_path);

    DataFrame dataFrame2 = dataFrame.Limit(nrows);

    // Register the solver as a UDF
    spark.Udf().Register<string, string>(
        "SudokuUDF",
        (sudoku) => Sudokusolution(sudoku));

    dataFrame2.CreateOrReplaceTempView("Resolved");
    DataFrame sqlDf = spark.Sql(
        "SELECT quizzes, SudokuUDF(quizzes) AS Resolution FROM Resolved");
    sqlDf.Show();

    spark.Stop();
}
public void TestStop()
{
    var mockSparkSessionProxy = new Mock<ISparkSessionProxy>();
    var sparkSession = new SparkSession(mockSparkSessionProxy.Object);
    sparkSession.Stop();

    // Stopping the session must forward the call to the proxy exactly once.
    mockSparkSessionProxy.Verify(m => m.Stop(), Times.Once);
}
public void Run(string[] args) { if (args.Length != 1) { Console.Error.WriteLine( "Usage: GitHubProjects <path to projects.csv>"); Environment.Exit(1); } SparkSession spark = SparkSession .Builder() .AppName("GitHub and Spark Batch") .GetOrCreate(); DataFrame projectsDf = spark .Read() .Schema("id INT, url STRING, owner_id INT, " + "name STRING, descriptor STRING, language STRING, " + "created_at STRING, forked_from INT, deleted STRING, " + "updated_at STRING") .Csv(args[0]); projectsDf.Show(); // Drop any rows with NA values DataFrameNaFunctions dropEmptyProjects = projectsDf.Na(); DataFrame cleanedProjects = dropEmptyProjects.Drop("any"); // Remove unnecessary columns cleanedProjects = cleanedProjects.Drop("id", "url", "owner_id"); cleanedProjects.Show(); // Average number of times each language has been forked DataFrame groupedDF = cleanedProjects .GroupBy("language") .Agg(Avg(cleanedProjects["forked_from"])); // Sort by most forked languages first groupedDF.OrderBy(Desc("avg(forked_from)")).Show(); spark.Udf().Register <string, bool>( "MyUDF", (date) => DateTime.TryParse(date, out DateTime convertedDate) && (convertedDate > s_referenceDate)); cleanedProjects.CreateOrReplaceTempView("dateView"); DataFrame dateDf = spark.Sql( "SELECT *, MyUDF(dateView.updated_at) AS datebefore FROM dateView"); dateDf.Show(); spark.Stop(); }
// Method called from Main to start a Spark session and solve the sudokus
// via Sudokusolution().
private static void Sudokures(int nrows)
{
    // Initialize the Spark session
    SparkSession spark = SparkSession
        .Builder()
        .Config("spark.executor.memory", "4G")
        .GetOrCreate();

    // Load the CSV into a DataFrame
    DataFrame df = spark
        .Read()
        .Option("header", true)
        .Option("inferSchema", true)
        .Csv(_filePath);

    // Limit the DataFrame to the number of rows requested by the caller
    DataFrame df2 = df.Limit(nrows);

    // Stopwatch covering only the sudoku resolution
    var watch2 = new System.Diagnostics.Stopwatch();
    watch2.Start();

    // Register the Spark user-defined function
    spark.Udf().Register<string, string>(
        "SudokuUDF",
        (sudoku) => Sudokusolution(sudoku));

    // Call the UDF from SQL; the resulting DataFrame also holds the solutions
    df2.CreateOrReplaceTempView("Resolved");
    DataFrame sqlDf = spark.Sql(
        "SELECT Sudokus, SudokuUDF(Sudokus) AS Resolution FROM Resolved");
    sqlDf.Show();

    watch2.Stop();
    Console.WriteLine();
    Console.WriteLine(
        $"Execution time for {nrows} sudoku resolutions: {watch2.ElapsedMilliseconds} ms");
    Console.WriteLine();

    spark.Stop();
}
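The three sudoku snippets above all register a UDF that wraps Sudokusolution, whose body is never shown. A minimal sketch, assuming the input is an 81-character string of digits with '0' for empty cells, solved by plain backtracking (the originals mention DlxLib and combinatorial search, so this is a stand-in, not the authors' solver; requires using System.Linq):

private static string Sudokusolution(string sudoku)
{
    // Parse "003020600..." into a flat 81-cell board; 0 means empty.
    int[] board = sudoku.Select(c => c - '0').ToArray();
    return Solve(board, 0) ? string.Concat(board) : "no solution";
}

private static bool Solve(int[] b, int pos)
{
    if (pos == 81) return true;
    if (b[pos] != 0) return Solve(b, pos + 1);
    for (int v = 1; v <= 9; v++)
    {
        if (IsValid(b, pos, v))
        {
            b[pos] = v;
            if (Solve(b, pos + 1)) return true;
            b[pos] = 0; // backtrack
        }
    }
    return false;
}

private static bool IsValid(int[] b, int pos, int v)
{
    int row = pos / 9, col = pos % 9;
    // Row and column constraints.
    for (int i = 0; i < 9; i++)
    {
        if (b[row * 9 + i] == v || b[i * 9 + col] == v) return false;
    }
    // 3x3 box constraint.
    int r0 = (row / 3) * 3, c0 = (col / 3) * 3;
    for (int r = r0; r < r0 + 3; r++)
        for (int c = c0; c < c0 + 3; c++)
            if (b[r * 9 + c] == v) return false;
    return true;
}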
public static void Main(string[] args)
{
    // Input file: https://www.kaggle.com/gbonesso/b3-stock-quotes/data?select=COTAHIST_A2009_to_A2020_P.csv
    // This PoC computes the average share price over that period.
    SparkConf sparkConf = new SparkConf();
    sparkConf.SetMaster("local[*]"); // '*' means use all available cores

    SparkSession spark = SparkSession
        .Builder()
        .Config(sparkConf)
        .AppName("SparkNetPOC")
        .GetOrCreate();

    Stopwatch sw = new Stopwatch();
    sw.Start();

    DataFrame dataFrameGeral = spark.Read()
        .Schema("vazio STRING, TIPREG STRING, DATPRE STRING, CODBDI STRING, CODNEG STRING, TPMERC STRING, NOMRES STRING, ESPECI STRING, " +
                "PRAZOT STRING, MODREF STRING, PREABE STRING, PREMAX STRING, PREMIN STRING, PREMED STRING, PREULT STRING, PREOFC STRING, " +
                "PREOFV STRING, TOTNEG STRING, QUATOT STRING, " +
                "VOLTOT STRING, PREEXE STRING, INDOPC STRING, DATVEN STRING, FATCOT STRING, PTOEXE STRING, CODISI STRING, DISMES STRING")
        .Csv(@"C:\InternetDownloads\10318_1101179_compressed_COTAHIST_A2009_to_A2020_P.csv\COTAHIST_A2009_to_A2020_P.csv");

    // Keep only the columns we need
    DataFrame dataFrameColunasUteis = dataFrameGeral
        .Drop("vazio", "TIPREG", "DATPRE", "CODBDI", "TPMERC", "NOMRES", "ESPECI", "PRAZOT",
              "MODREF", "PREABE", "PREMIN", "PREMED", "PREULT", "PREOFC", "PREOFV", "TOTNEG",
              "QUATOT", "VOLTOT", "PREEXE", "INDOPC", "DATVEN", "FATCOT", "PTOEXE", "CODISI", "DISMES");

    // Filter to three tickers
    DataFrame dataFrameFiltro = dataFrameColunasUteis
        .Filter("CODNEG = 'ITSA3' OR CODNEG = 'ABEV3' OR CODNEG = 'PETR4'");

    // Average the maximum price per ticker (Avg takes a Column)
    DataFrame dataFrameFinal = dataFrameFiltro
        .GroupBy("CODNEG")
        .Agg(Avg(Col("PREMAX")));

    dataFrameFinal.Show();

    spark.Stop();
    sw.Stop();
    Console.WriteLine("Elapsed = " + sw.ElapsedMilliseconds);
}
private static void Main(string[] args)
{
    if (args.Length != 4)
    {
        Console.WriteLine("Usage:");
        Console.WriteLine("\t<spark-submit> --master local");
        Console.WriteLine("\t\t--class org.apache.spark.deploy.dotnet.DotnetRunner <path-to-microsoft-spark-jar>");
        Console.WriteLine("\t\tTpch.exe <tpch_data_root_path> <query_number> <num_iterations> <true for SQL | false for functional>");
        return;
    }

    var tpchRoot = args[0];
    var queryNumber = args[1];
    var numIteration = int.Parse(args[2]);
    var isSQL = bool.Parse(args[3]);

    SparkSession spark = SparkSession
        .Builder()
        .AppName("TPC-H Benchmark for DotNet")
        .GetOrCreate();

    for (var i = 0; i < numIteration; ++i)
    {
        Stopwatch sw = Stopwatch.StartNew();
        Stopwatch swFunc = new Stopwatch();
        if (!isSQL)
        {
            var tpchFunctional = new TpchFunctionalQueries(tpchRoot, spark);
            swFunc.Start();
            tpchFunctional.Run(queryNumber.ToString());
            swFunc.Stop();
        }
        else
        {
            var tpchSql = new TpchSqlQueries(tpchRoot, spark);
            tpchSql.Run(queryNumber.ToString());
        }
        sw.Stop();

        var typeStr = isSQL ? "SQL" : "Functional";
        Console.WriteLine(
            $"TPCH_Result,DotNet,{typeStr},{queryNumber},{i},{sw.ElapsedMilliseconds},{swFunc.ElapsedMilliseconds}");
    }

    spark.Stop();
}
public string SparkTest([FromServices] IAWSSettings awsSettings)
{
    string result = "ok";
    try
    {
        SparkSession spark = SparkSession
            .Builder()
            .AppName("itur")
            .GetOrCreate();

        var mySchema = new Microsoft.Spark.Sql.Types.StructType(new[]
        {
            new StructField("IturCode", new Microsoft.Spark.Sql.Types.StringType()),
            new StructField("IturERP", new Microsoft.Spark.Sql.Types.StringType()),
            new StructField("QuantityEdit", new Microsoft.Spark.Sql.Types.StringType()),
            new StructField("PartialQuantity", new Microsoft.Spark.Sql.Types.StringType())
        });

        string assemblyLocation = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location);
        string iturInputPath = Path.Combine(assemblyLocation, "data", "itur.csv");

        DataFrame df = spark.Read()
            .Format("csv")
            .Schema(mySchema)
            .Option("delimiter", ",")
            .Option("header", true)
            .Load(iturInputPath);

        string dt = DateTime.Now.ToString("MMddhhmmss");
        string outputfile = Path.Combine(assemblyLocation, "outputData", $"itur_out{dt}.json");
        df.Write().Json(outputfile);

        // Writing to S3 would look like:
        // string toPath = $"s3n://{awsSettings.AccessKey}:{awsSettings.SecretKey}@{_bucketName}/{path}";
        // spark.Range(100).Repartition(5).Write().Mode("overwrite").Text(toPath);

        spark.Stop();
    }
    catch (Exception ex)
    {
        result = ex.Message;
    }
    return result;
}
public void Run(string[] args)
{
    if (args.Length != 2)
    {
        Console.Error.WriteLine(
            "Usage: <path to yelptest.csv> <path to MLModel.zip>");
        Environment.Exit(1);
    }

    SparkSession spark = SparkSession
        .Builder()
        .AppName(".NET for Apache Spark Sentiment Analysis")
        .GetOrCreate();

    // Read in and display Yelp reviews
    DataFrame df = spark
        .Read()
        .Option("header", true)
        .Option("inferSchema", true)
        .Csv(args[0]);
    df.Show();

    // Use ML.NET in a UDF to evaluate each review
    spark.Udf().Register<string, bool>(
        "MLudf",
        (text) => Sentiment(text, args[1]));

    // Use Spark SQL to call ML.NET UDF
    // Display results of sentiment analysis on reviews
    df.CreateOrReplaceTempView("Reviews");
    DataFrame sqlDf = spark.Sql("SELECT ReviewText, MLudf(ReviewText) FROM Reviews");
    sqlDf.Show();

    // Print out first 20 rows of data
    // Prevent data getting cut off by setting truncate = 0
    sqlDf.Show(20, 0, false);

    spark.Stop();
}
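The Sentiment helper invoked by the UDF above is defined elsewhere. A minimal sketch using ML.NET's PredictionEngine; the Review/ReviewPrediction classes and the PredictedLabel column name are assumptions that must match the schema of the model in MLModel.zip (requires Microsoft.ML and Microsoft.ML.Data):

private static bool Sentiment(string text, string modelPath)
{
    // NOTE: for real workloads, cache the context/engine per executor
    // instead of rebuilding them on every call.
    var mlContext = new MLContext();
    ITransformer model = mlContext.Model.Load(modelPath, out _);
    PredictionEngine<Review, ReviewPrediction> engine =
        mlContext.Model.CreatePredictionEngine<Review, ReviewPrediction>(model);
    return engine.Predict(new Review { ReviewText = text }).Prediction;
}

// Hypothetical input/output shapes for the model.
private class Review
{
    public string ReviewText { get; set; }
}

private class ReviewPrediction
{
    [ColumnName("PredictedLabel")]
    public bool Prediction { get; set; }
}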
public void Run(string[] args) { if (args.Length != 1) { Console.Error.WriteLine( "Usage: Sql.VectorDataFrameUdfs <path to SPARK_HOME/examples/src/main/resources/people.json>"); Environment.Exit(1); } SparkSession spark = SparkSession .Builder() // Lower the shuffle partitions to speed up groupBy() operations. .Config("spark.sql.shuffle.partitions", "3") .AppName("SQL VectorUdfs example using .NET for Apache Spark") .GetOrCreate(); DataFrame df = spark.Read().Schema("age INT, name STRING").Json(args[0]); StructType schema = df.Schema(); Console.WriteLine(schema.SimpleString); df.Show(); df.PrintSchema(); // Grouped Map Vector UDF // able to return different shapes and record lengths df.GroupBy("age") .Apply( new StructType(new[] { new StructField("age", new IntegerType()), new StructField("nameCharCount", new IntegerType()) }), r => CountCharacters(r)) .Show(); spark.Stop(); }
public void Run(string[] args) { if (args.Length != 1) { Console.Error.WriteLine( "Usage: Logging <path to Apache User Logs>"); Environment.Exit(1); } SparkSession spark = SparkSession .Builder() .AppName("Apache User Log Processing") .GetOrCreate(); // Read input log file and display it DataFrame df = spark.Read().Text(args[0]); df.Show(); // Step 1: UDF to determine if each line is a valid log entry // Remove any invalid entries before further filtering spark.Udf().Register <string, bool>( "GeneralReg", log => Regex.IsMatch(log, s_apacheRx)); df.CreateOrReplaceTempView("Logs"); // Apply the UDF to get valid log entries DataFrame generalDf = spark.Sql( "SELECT logs.value, GeneralReg(logs.value) FROM Logs"); // Only keep log entries that matched the reg ex generalDf = generalDf.Filter(generalDf["GeneralReg(value)"]); generalDf.Show(); // View the resulting schema // Notice we created a new column "GeneralReg(value)" generalDf.PrintSchema(); // Step 2: Choose valid log entries that start with 10 spark.Udf().Register <string, bool>( "IPReg", log => Regex.IsMatch(log, "^(?=10)")); generalDf.CreateOrReplaceTempView("IPLogs"); // Apply UDF to get valid log entries starting with 10 // Use SQL "WHERE" rather than doing ipDf.Filter(), // which avoids creating an extra column "IPReg(value)" DataFrame ipDf = spark.Sql( "SELECT iplogs.value FROM IPLogs WHERE IPReg(iplogs.value)"); ipDf.Show(); // Step 3: Choose valid log entries that start // with 10 and deal with spam spark.Udf().Register <string, bool>( "SpamRegEx", log => Regex.IsMatch(log, "\\b(?=spam)\\b")); ipDf.CreateOrReplaceTempView("SpamLogs"); // Apply UDF to get valid, start with 10, spam entries DataFrame spamDF = spark.Sql( "SELECT spamlogs.value FROM SpamLogs WHERE SpamRegEx(spamlogs.value)"); // Let's explore the columns in the data we have filtered // Use LINQ to count the number of GET requests int numGetRequests = spamDF .Collect() .Where(r => ContainsGet(r.GetAs <string>("value"))) .Count(); Console.WriteLine("Number of GET requests: " + numGetRequests); spark.Stop(); }
public void Run(string[] args) { if (args.Length != 1) { Console.Error.WriteLine( "Usage: Basic <path to SPARK_HOME/examples/src/main/resources/people.json>"); Environment.Exit(1); } SparkSession spark = SparkSession .Builder() .AppName(".NET Spark SQL basic example") .Config("spark.some.config.option", "some-value") .GetOrCreate(); // Need to explicitly specify the schema since pickling vs. arrow formatting // will return different types. Pickling will turn longs into ints if the values fit. // Same as the "age INT, name STRING" DDL-format string. var inputSchema = new StructType(new[] { new StructField("age", new IntegerType()), new StructField("name", new StringType()) }); DataFrame df = spark.Read().Schema(inputSchema).Json(args[0]); Spark.Sql.Types.StructType schema = df.Schema(); Console.WriteLine(schema.SimpleString); IEnumerable <Row> rows = df.Collect(); foreach (Row row in rows) { Console.WriteLine(row); } df.Show(); df.PrintSchema(); df.Select("name", "age", "age", "name").Show(); df.Select(df["name"], df["age"] + 1).Show(); df.Filter(df["age"] > 21).Show(); df.GroupBy("age") .Agg(Avg(df["age"]), Avg(df["age"]), CountDistinct(df["age"], df["age"])) .Show(); df.CreateOrReplaceTempView("people"); // Registering Udf for SQL expression. DataFrame sqlDf = spark.Sql("SELECT * FROM people"); sqlDf.Show(); spark.Udf().Register <int?, string, string>( "my_udf", (age, name) => name + " with " + ((age.HasValue) ? age.Value.ToString() : "null")); sqlDf = spark.Sql("SELECT my_udf(*) FROM people"); sqlDf.Show(); // Using UDF via data frames. Func <Column, Column, Column> addition = Udf <int?, string, string>( (age, name) => name + " is " + (age.HasValue ? age.Value + 10 : 0)); df.Select(addition(df["age"], df["name"])).Show(); // Chaining example: Func <Column, Column> addition2 = Udf <string, string>(str => $"hello {str}!"); df.Select(addition2(addition(df["age"], df["name"]))).Show(); // Multiple UDF example: df.Select(addition(df["age"], df["name"]), addition2(df["name"])).Show(); // UDF return type as array. Func <Column, Column> udfArray = Udf <string, string[]>((str) => new string[] { str, str + str }); df.Select(Explode(udfArray(df["name"]))).Show(); // UDF return type as map. Func <Column, Column> udfMap = Udf <string, IDictionary <string, string[]> >( (str) => new Dictionary <string, string[]> { { str, new[] { str, str } } }); df.Select(udfMap(df["name"]).As("UdfMap")).Show(truncate: 50); // Joins. DataFrame joinedDf = df.Join(df, "name"); joinedDf.Show(); DataFrame joinedDf2 = df.Join(df, new[] { "name", "age" }); joinedDf2.Show(); DataFrame joinedDf3 = df.Join(df, df["name"] == df["name"], "outer"); joinedDf3.Show(); spark.Stop(); }
public static void leerCSV()
{
    SparkSession spark = SparkSession
        .Builder()
        .AppName("word_count_sample")
        .GetOrCreate();

    // A CSV dataset is pointed to by path.
    // The path can be either a single CSV file or a directory of CSV files.
    string path = "data/sample_data.csv";

    DataFrame df = spark.Read().Csv(path);
    df.Show();
    // +------------------+
    // |               _c0|
    // +------------------+
    // |      name;age;job|
    // |Jorge;30;Developer|
    // |  Bob;32;Developer|
    // +------------------+

    // Read a CSV with a custom delimiter; the default delimiter is ","
    DataFrame df2 = spark.Read().Option("delimiter", ";").Csv(path);
    df2.Show();
    // +-----+---+---------+
    // |  _c0|_c1|      _c2|
    // +-----+---+---------+
    // | name|age|      job|
    // |Jorge| 30|Developer|
    // |  Bob| 32|Developer|
    // +-----+---+---------+

    // Read a CSV with a delimiter and a header
    DataFrame df3 = spark.Read().Option("delimiter", ";").Option("header", "true").Csv(path);
    df3.Show();
    // +-----+---+---------+
    // | name|age|      job|
    // +-----+---+---------+
    // |Jorge| 30|Developer|
    // |  Bob| 32|Developer|
    // +-----+---+---------+

    // You can also use Options() to set multiple options at once
    Dictionary<string, string> optionsMap = new Dictionary<string, string>();
    optionsMap.Add("delimiter", ";");
    optionsMap.Add("header", "true");
    var df4 = spark.Read().Options(optionsMap).Csv(path);

    // "output" is a folder which contains multiple CSV files and a _SUCCESS file.
    df3.Write().Csv("output");

    // Read all files in a folder; for this to work as intended, the path should
    // point to a directory and only CSV files should be present in it.
    string folderPath = "data/sample_data.csv";
    DataFrame df5 = spark.Read().Csv(folderPath);
    df5.Show();
    // If non-CSV files are present in the folder, they are read with the
    // wrong schema, e.g.:
    // +-----------+
    // |        _c0|
    // +-----------+
    // | 238val_238|
    // |   86val_86|
    // | 311val_311|
    // |   27val_27|
    // | 165val_165|
    // +-----------+

    // Stop the Spark session
    spark.Stop();
}
public static void Main(string[] args)
{
    // Create Spark session.
    SparkSession spark = SparkSession
        .Builder()
        .AppName("Hyperspace example")
        .Config("spark.some.config.option", "some-value")
        .GetOrCreate();

    // Sample department records.
    var departments = new List<GenericRow>()
    {
        new GenericRow(new object[] { 10, "Accounting", "New York" }),
        new GenericRow(new object[] { 20, "Research", "Dallas" }),
        new GenericRow(new object[] { 30, "Sales", "Chicago" }),
        new GenericRow(new object[] { 40, "Operations", "Boston" })
    };

    // Sample employee records.
    var employees = new List<GenericRow>()
    {
        new GenericRow(new object[] { 7369, "SMITH", 20 }),
        new GenericRow(new object[] { 7499, "ALLEN", 30 }),
        new GenericRow(new object[] { 7521, "WARD", 30 }),
        new GenericRow(new object[] { 7566, "JONES", 20 }),
        new GenericRow(new object[] { 7698, "BLAKE", 30 }),
        new GenericRow(new object[] { 7782, "CLARK", 10 }),
        new GenericRow(new object[] { 7788, "SCOTT", 20 }),
        new GenericRow(new object[] { 7839, "KING", 10 }),
        new GenericRow(new object[] { 7844, "TURNER", 30 }),
        new GenericRow(new object[] { 7876, "ADAMS", 20 }),
        new GenericRow(new object[] { 7900, "JAMES", 30 }),
        new GenericRow(new object[] { 7934, "MILLER", 10 }),
        new GenericRow(new object[] { 7902, "FORD", 20 }),
        new GenericRow(new object[] { 7654, "MARTIN", 30 })
    };

    // Save example data records as Parquet.
    string deptLocation = "departments";
    spark.CreateDataFrame(departments, new StructType(new List<StructField>()
        {
            new StructField("deptId", new IntegerType()),
            new StructField("deptName", new StringType()),
            new StructField("location", new StringType())
        }))
        .Write()
        .Mode("overwrite")
        .Parquet(deptLocation);

    string empLocation = "employees";
    spark.CreateDataFrame(employees, new StructType(new List<StructField>()
        {
            new StructField("empId", new IntegerType()),
            new StructField("empName", new StringType()),
            new StructField("deptId", new IntegerType())
        }))
        .Write()
        .Mode("overwrite")
        .Parquet(empLocation);

    // Create Hyperspace indexes.
    var hyperspace = new Hyperspace(spark);

    DataFrame deptDF = spark.Read().Parquet(deptLocation);
    DataFrame empDF = spark.Read().Parquet(empLocation);

    var deptIndexConfig = new IndexConfig("deptIndex", new[] { "deptId" }, new[] { "deptName" });
    var empIndexConfig = new IndexConfig("empIndex", new[] { "deptId" }, new[] { "empName" });
    hyperspace.CreateIndex(deptDF, deptIndexConfig);
    hyperspace.CreateIndex(empDF, empIndexConfig);

    // List all indexes.
    hyperspace.Indexes().Show();

    // Enable Hyperspace to leverage indexes.
    spark.EnableHyperspace();

    // Example of index usage for filtered selection.
    DataFrame eqFilter = deptDF.Filter("deptId = 20").Select("deptName");
    eqFilter.Show();
    hyperspace.Explain(eqFilter, false);

    // Example of index usage for join.
    DataFrame eqJoin = empDF
        .Join(deptDF, "deptId")
        .Select(empDF.Col("empName"), deptDF.Col("deptName"));
    eqJoin.Show();
    hyperspace.Explain(eqJoin, false);

    // Stop Spark session.
    spark.Stop();
}
static void Main(string[] args)
{
    /*
     * Copy mysql-connector-java-8.0.19.jar into the Spark / Hadoop folder.
     * Run the command below from this project's root folder:
     * %SPARK_HOME%\bin\spark-submit
     * --master local
     * --class org.apache.spark.deploy.dotnet.DotnetRunner
     * bin\Debug\netcoreapp3.1\microsoft-spark-2-4_2.11-1.0.0.jar
     * dotnet
     * bin\Debug\netcoreapp3.1\BatchDemo.dll
     * data\amostra.csv
     * jdbc:mysql://localhost:3306/teste_spark beneficios spark_user my-secret-password
     */
    if (args.Length == 0)
    {
        throw new ArgumentException("Pass the paths where the CSV files can be found");
    }
    string arquivoEntrada = args[0];

    // Get a reference to the Spark execution context
    SparkSession spark = SparkSession
        .Builder()
        .AppName("Exemplo Batch")
        .GetOrCreate();

    // Define a fixed schema, with the column names and types we want
    StructType schema = new StructType(new[]
    {
        new StructField("MES_REFERENCIA", new StringType()),
        new StructField("MES_COMPETENCIA", new StringType()),
        new StructField("UF", new StringType()),
        new StructField("CODIGO_MUNICIPIO", new IntegerType()),
        new StructField("MUNICIPIO", new StringType()),
        new StructField("CODIGO_FAVORECIDO", new StringType()),
        new StructField("NOME", new StringType()),
        new StructField("DATA_SAQUE", new DateType()),
        new StructField("VALOR_TEXTO", new StringType())
    });

    // Read the data from disk into Spark
    DataFrame df = spark.Read()
        .Format("csv")
        .Schema(schema)
        .Option("sep", ";")
        .Option("header", true)
        .Option("dateFormat", "dd/MM/yyyy")
        .Load(arquivoEntrada);

    df.PrintSchema();
    df.Show(5, 10);

    // Drop the columns we no longer need
    df = df.Drop("MES_REFERENCIA")
        .Drop("MES_COMPETENCIA")
        .Drop("CODIGO_MUNICIPIO")
        .Drop("CODIGO_FAVORECIDO");
    df.Show(5, 10);

    // Convert the VALOR column from string to decimal, taking into account
    // that the Brazilian number format differs from the American one
    df = df.WithColumn("VALOR",
            RegexpReplace(
                RegexpReplace(df.Col("VALOR_TEXTO"), "\\.", ""),
                ",", ".")
            .Cast("decimal(10,2)"))
        .Drop("VALOR_TEXTO");
    df.PrintSchema();
    df.Show(5, 10);

    // Apply a filter to the data
    df = df.Where(df.Col("UF").NotEqual("AC"));
    //df = df.Where("UF <> 'AC'"); // passing a WHERE expression also works as a filter
    df.Show(5, 10);

    spark.Udf().Register<string, string, string>("ConcatenarMunicipio",
        (uf, municipio) => ConcatenarMunicipio(uf, municipio));

    // Create a new column from a concatenation, dropping the old
    // columns we no longer need
    df = df.WithColumn("MUNICIPIO",
            CallUDF("ConcatenarMunicipio", df.Col("UF"), df.Col("MUNICIPIO")))
        .Drop("UF");

    // Perform an aggregation
    DataFrame somatorio = df.GroupBy("MUNICIPIO")
        .Sum("VALOR")
        .WithColumnRenamed("sum(VALOR)", "SOMA_BENEFICIOS");
    somatorio
        .OrderBy(somatorio.Col("SOMA_BENEFICIOS").Desc())
        .Show(15, 40);

    if (args.Length >= 5)
    {
        string urlJdbc = args[1]; // jdbc:mysql://localhost:3306/teste_spark
        string tabela = args[2];  // beneficios
        string usuario = args[3]; // spark_user
        string senha = args[4];   // my-secret-password

        // Save to a database using Spark's native JDBC support,
        // using the connection values parsed from args above
        somatorio
            .Write()
            .Format("jdbc")
            .Option("driver", "com.mysql.cj.jdbc.Driver")
            .Option("url", urlJdbc)
            .Option("dbtable", tabela)
            .Option("user", usuario)
            .Option("password", senha)
            .Mode(SaveMode.Overwrite)
            .Save();
    }

    spark.Stop();
}
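The ConcatenarMunicipio helper registered as a UDF above is not shown. A minimal sketch, under the assumption that it simply prefixes the municipality with its state abbreviation:

private static string ConcatenarMunicipio(string uf, string municipio)
{
    // e.g. ("SP", "Campinas") -> "SP - Campinas"; the separator is an assumption.
    return $"{uf} - {municipio}";
}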
public string DeltaTest([FromServices] IAWSSettings awsSettings)
{
    string result = string.Empty;
    try
    {
        SparkSession spark = SparkSession
            .Builder()
            .AppName("DeltaTest")
            .GetOrCreate();

        string tempDirectory = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location);
        string dt = DateTime.Now.ToString("MMddhhmmss");
        string path = Path.Combine(tempDirectory, $"delta-table{dt}");

        // Write data to a Delta table.
        DataFrame data = spark.Range(0, 5);
        result += "Write data to a Delta table >> spark.Range(0, 5) ";
        foreach (var row in data.ToDF().Collect())
        {
            result += row.Values[0];
            result += " | ";
        }
        result += " ";
        data.Write().Format("delta").Save(path);

        // Create a second iteration of the table.
        data = spark.Range(5, 10);
        result += "Create a second iteration of the table >> spark.Range(5, 10) ";
        foreach (var row in data.ToDF().Collect())
        {
            result += row.Values[0];
            result += " | ";
        }
        result += " ";
        data.Write().Format("delta").Mode("overwrite").Save(path);

        // Load the data into a DeltaTable object.
        DeltaTable deltaTable = DeltaTable.ForPath(path);
        result += "Load the data into a DeltaTable object >> DeltaTable.ForPath ";
        foreach (var row in deltaTable.ToDF().Collect())
        {
            result += row.Values[0];
            result += " | ";
        }
        result += " ";

        // Update every even value by adding 100 to it.
        deltaTable.Update(
            condition: Functions.Expr("id % 2 == 0"),
            set: new Dictionary<string, Column>() { { "id", Functions.Expr("id + 100") } });
        result += "Update every even value by adding 100 to it. ";
        foreach (var row in deltaTable.ToDF().Collect())
        {
            result += row.Values[0];
            result += " | ";
        }
        result += " ";

        // Delete every even value.
        deltaTable.Delete(condition: Functions.Expr("id % 2 == 0"));
        result += "Delete every even value id % 2 == 0 ";
        foreach (var row in deltaTable.ToDF().Collect())
        {
            result += row.Values[0];
            result += " | ";
        }
        result += " ";

        // Upsert (merge) new data.
        DataFrame newData = spark.Range(0, 20).As("newData").ToDF();
        result += "Upsert (merge) new data" + Environment.NewLine;
        foreach (var row in newData.ToDF().Collect())
        {
            result += row.Values[0];
            result += " | ";
        }
        result += " ";

        deltaTable.As("oldData")
            .Merge(newData, "oldData.id = newData.id")
            .WhenMatched()
            .Update(new Dictionary<string, Column>() { { "id", Functions.Col("newData.id") } })
            .WhenNotMatched()
            .InsertExpr(new Dictionary<string, string>() { { "id", "newData.id" } })
            .Execute();

        spark.Stop();
    }
    catch (Exception ex)
    {
        result = ex.Message;
    }
    return result;
}