internal MemoryStream(SparkSession sparkSession) { JvmObjectReference sparkSessionRef = sparkSession.Reference; Reference = (JvmObjectReference)sparkSessionRef.Jvm.CallStaticJavaMethod( "org.apache.spark.sql.test.TestUtils", "createMemoryStream", sparkSessionRef.Invoke("sqlContext"), typeof(T).Name); }
static void Main(string[] args)
{
    var spark = SparkSession.Builder().GetOrCreate();
    spark.Udf().RegisterJavaUDAF("java_function", "com.company.ClassName");
    var dataFrame = spark.Sql("SELECT ID, java_function(ID) as java_function_output FROM range(1000)");
    // Call the registered Java function by the name it was registered under.
    dataFrame.Select(CallUDF("java_function", dataFrame["ID"])).Show();
}
private static void ProcessEntity(SparkSession spark, string sourceFile, string dataLakePath, string sourceSystem, string entity, string year, string month) { var data = OfgemExpensesEntity.ReadFromSource(spark, sourceFile); OfgemExpensesEntity.WriteToStructured(data, $"{dataLakePath}/structured/{sourceSystem}/{entity}/{year}/{month}"); OfgemExpensesEntity.WriteToCurated(data, $"{dataLakePath}/curated/{sourceSystem}/{entity}"); OfgemExpensesEntity.WriteToPublish(spark, $"{dataLakePath}/curated/{sourceSystem}/{entity}", $"{dataLakePath}/publish/{sourceSystem}/{entity}"); }
static void Main(string[] args) { var spark = SparkSession.Builder().GetOrCreate(); var dataFrame = spark.Range(100); dataFrame.Write().Mode("overwrite").Csv("output.csv"); dataFrame.Write().Mode("ignore").Csv("output.csv"); dataFrame.Write().Mode("append").Csv("output.csv"); dataFrame.Write().Mode("error").Csv("output.csv"); }
static void Main(string[] args) { var spark = SparkSession.Builder().Config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") .Config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") .GetOrCreate(); spark.Range(100).WithColumn("name", Functions.Lit("Sammy")).Write().Mode("overwrite").Parquet("/tmp/delta-sql-demo"); spark.Sql("CONVERT TO DELTA parquet.`/tmp/delta-sql-demo"); spark.Sql("SELECT * FROM delta.`/tmp/delta-sql-demo`").Show(); }
static void Main(string[] args) { var spark = SparkSession.Builder().GetOrCreate(); var dataFrame = spark.Read().Option("sep", ",").Option("header", "false") .Schema("greeting string, first_number int, second_number float") .Csv("csv_file.csv"); dataFrame.PrintSchema(); dataFrame.Show(); }
static void Main(string[] args) { var spark = SparkSession.Builder().GetOrCreate(); var dataFrame = spark.Sql("SELECT ID FROM range(1000)"); var add100 = Udf <int?, int>((input) => input + 100 ?? 100); dataFrame.Select(add100(dataFrame["ID"])).Show(); }
private static void Evaluate(SparkSession session, Action <AnalyzerContext, IMetricsRepository> func) { DataFrame data = FixtureSupport.GetDFFull(session); AnalyzerContext results = CreateAnalysis().Run(data, Option <IStateLoader> .None, Option <IStatePersister> .None); IMetricsRepository repository = CreateRepository(); func(results, repository); }
// Method called from Main to start a Spark session with a given number of cores and
// instances and to solve the sudokus via the Sudokusolution() method.
//private static void Sudokures(string cores, string nodes, string mem, int nrows){
private static void Sudokures(int nrows)
{
    // Initialize the Spark session
    SparkSession spark = SparkSession
        .Builder()
        .Config("spark.executor.memory", "4G")
        .GetOrCreate();
    //.AppName("Resolution of " + nrows + " sudokus using DlxLib with " + cores + " cores and " + nodes + " instances")
    //.Config("spark.driver.cores", cores)
    //.Config("spark.executor.instances", nodes)
    //.Config("spark.executor.memory", mem)
    //.GetOrCreate();

    // Load the CSV file into a DataFrame
    DataFrame df = spark
        .Read()
        .Option("header", true)
        .Option("inferSchema", true)
        .Csv(_filePath);

    // Limit the DataFrame to the number of rows requested by the caller
    DataFrame df2 = df.Limit(nrows);

    // Stopwatch covering only the sudoku solving
    var watch2 = new System.Diagnostics.Stopwatch();
    watch2.Start();

    // Register the Spark user-defined function
    spark.Udf().Register<string, string>(
        "SukoduUDF",
        (sudoku) => Sudokusolution(sudoku));

    // Call the UDF from a new Spark DataFrame that also holds the results
    df2.CreateOrReplaceTempView("Resolved");
    DataFrame sqlDf = spark.Sql("SELECT Sudokus, SukoduUDF(Sudokus) as Resolution from Resolved");
    sqlDf.Show();

    watch2.Stop();

    Console.WriteLine();
    Console.WriteLine($"Execution Time for {nrows} sudoku resolution : {watch2.ElapsedMilliseconds} ms");
    //Console.WriteLine($"Execution Time for " + nrows + " sudoku resolution with " + cores + " core and " + nodes + " instance: " + watch2.ElapsedMilliseconds + " ms");
    Console.WriteLine();

    spark.Stop();
}
/// <summary> /// Drops tables in <paramref name="tableNames"/> after calling <paramref name="action"/>. /// </summary> /// <param name="spark">The <see cref="SparkSession"/></param> /// <param name="tableNames">Names of the tables to drop</param> /// <param name="action"><see cref="Action"/> to execute.</param> public static void WithTable(SparkSession spark, IEnumerable <string> tableNames, Action action) { try { action(); } finally { tableNames.ToList().ForEach(name => spark.Sql($"DROP TABLE IF EXISTS {name}")); } }
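A hedged usage sketch for the WithTable helper above: the table name and SQL are illustrative only, but they show the intended pattern of creating tables inside the action and letting the helper drop them afterwards, even if the action throws.

// Illustrative call: "people_tmp" is a placeholder table name.
WithTable(spark, new[] { "people_tmp" }, () =>
{
    spark.Sql("CREATE TABLE people_tmp (name STRING, age INT) USING parquet");
    spark.Sql("SELECT COUNT(*) FROM people_tmp").Show();
});
// After the action completes (or throws), the helper runs DROP TABLE IF EXISTS people_tmp.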
static void Main(string[] args) { var spark = SparkSession.Builder().AppName("Creator").GetOrCreate(); CreateByReadingData(spark); CreateUsingRange(spark); CreateUsingRangeInSql(spark); CreateUsingRangeAndDataFrameAPI(spark); CreateUsingBuiltInType(spark); CreateUsingGenericRowAndStructType(spark); }
public async Task TestGetSparkSession() { SparkSessionCollection sparkSessions = (await SparkSessionClient.GetSparkSessionsAsync()).Value; foreach (SparkSession expectedSparkSession in sparkSessions.Sessions) { SparkSession actualSparkSession = await SparkSessionClient.GetSparkSessionAsync(expectedSparkSession.Id); ValidateSparkSession(expectedSparkSession, actualSparkSession); } }
static void Main(string[] args) { var spark = SparkSession.Builder().GetOrCreate(); DataFrameReader reader = spark.Read().Format("csv").Option("header", true).Option("sep", ","); var dataFrame = reader.Load("./csv_file.csv"); dataFrame.Show(); }
static void Main(string[] args)
{
    if (args.Length != 2)
    {
        Console.Error.WriteLine("Usage: Remember to include input and output path as arguments");
        Environment.Exit(1);
    }

    var sparkConf = SparkConfUtils.GetSparkConfigurationForFilePath(args);

    SparkSession spark = SparkSession
        .Builder()
        .AppName("Streaming example using Spark.NET")
        .GetOrCreate();

    if (sparkConf != null)
    {
        sparkConf.ToList().ForEach(kv => { spark.Conf().Set(kv.Key, kv.Value); });
    }

    var events = spark
        .ReadStream()
        .Format("eventhubs")
        .Options(EventHubConnection.GetEventHubConnectionSettings(eventHubPartitionCount: 2))
        .Load();

    var processedEvents = events
        .Select(
            FromJson(Col("body").Cast("string"), "temperature String, humidity String").As("Raw"),
            Col("properties"),
            Col("enqueuedTime"))
        .WithColumn("Raw.temperature", Col("Raw.temperature").Cast("double"))
        .WithColumn("Raw.humidity", Col("Raw.humidity").Cast("double"))
        .WithColumnRenamed("Raw.temperature", "temperature")
        .WithColumnRenamed("Raw.humidity", "humidity")
        .WithColumn("temperatureAlert", Col("temperature") >= 40)
        .SelectExpr("temperature", "humidity", "properties", "enqueuedTime", "temperatureAlert");

    processedEvents.PrintSchema();

    var streamingQuery = processedEvents
        .WriteStream()
        .OutputMode(OutputMode.Append)
        .Format("console")
        .Option("path", args[0])
        .Option("checkpointLocation", args[1])
        .Start();

    streamingQuery.AwaitTermination();
}
public static DeltaTable ConvertToDelta( SparkSession spark, string identifier, string partitionSchema) => new DeltaTable( (JvmObjectReference)SparkEnvironment.JvmBridge.CallStaticJavaMethod( s_deltaTableClassName, "convertToDelta", spark, identifier, partitionSchema));
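A hedged sketch of how the three-argument ConvertToDelta overload above might be called: the path and the partition column are illustrative placeholders, not values from the original source.

// Convert an existing partitioned Parquet directory in place to a Delta table.
DeltaTable converted = DeltaTable.ConvertToDelta(
    spark,
    "parquet.`/tmp/events`",   // hypothetical Parquet path
    "eventDate DATE");         // hypothetical partition schema
converted.ToDF().Show();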
public void Run(string[] args)
{
    if (args.Length != 1)
    {
        Console.Error.WriteLine("Usage: GitHubProjects <path to projects.csv>");
        Environment.Exit(1);
    }

    SparkSession spark = SparkSession
        .Builder()
        .AppName("GitHub and Spark Batch")
        .GetOrCreate();

    DataFrame projectsDf = spark
        .Read()
        .Schema("id INT, url STRING, owner_id INT, " +
                "name STRING, descriptor STRING, language STRING, " +
                "created_at STRING, forked_from INT, deleted STRING, " +
                "updated_at STRING")
        .Csv(args[0]);

    projectsDf.Show();

    // Drop any rows with NA values
    DataFrameNaFunctions dropEmptyProjects = projectsDf.Na();
    DataFrame cleanedProjects = dropEmptyProjects.Drop("any");

    // Remove unnecessary columns
    cleanedProjects = cleanedProjects.Drop("id", "url", "owner_id");
    cleanedProjects.Show();

    // Average number of times each language has been forked
    DataFrame groupedDF = cleanedProjects
        .GroupBy("language")
        .Agg(Avg(cleanedProjects["forked_from"]));

    // Sort by most forked languages first
    groupedDF.OrderBy(Desc("avg(forked_from)")).Show();

    spark.Udf().Register<string, bool>(
        "MyUDF",
        (date) => DateTime.TryParse(date, out DateTime convertedDate) &&
                  (convertedDate > s_referenceDate));

    cleanedProjects.CreateOrReplaceTempView("dateView");

    DataFrame dateDf = spark.Sql(
        "SELECT *, MyUDF(dateView.updated_at) AS datebefore FROM dateView");
    dateDf.Show();

    spark.Stop();
}
static void Main(string[] args) { var spark = SparkSession.Builder().GetOrCreate(); var dataFrame = spark.Range(10000); Func <Column, Column> discPrice = VectorUdf <Int64DataFrameColumn, Int64DataFrameColumn>( (id) => id.Multiply(100).Divide(2)); dataFrame.Select(dataFrame["ID"], discPrice(dataFrame["ID"])).Show(); }
static void Main(string[] args)
{
    // Verify environment variables
    if (args.Length != 4)
    {
        Console.Error.WriteLine("Usage: $TENANT_ID $ADLS_NAME $ADLS_SP_CLIENT_ID $ADLS_SP_CLIENT_SECRET");
        Environment.Exit(1);
    }

    // Specify file path in Azure Data Lake Gen1
    string filePath = $"adl://{args[1]}.azuredatalakestore.net/parquet/people.parquet";

    // Create SparkSession
    SparkSession spark = SparkSession
        .Builder()
        .AppName("Azure Data Lake Storage example using .NET for Apache Spark")
        .Config("fs.adl.impl", "org.apache.hadoop.fs.adl.AdlFileSystem")
        .Config("fs.adl.oauth2.access.token.provider.type", "ClientCredential")
        .Config("fs.adl.oauth2.client.id", args[2])
        .Config("fs.adl.oauth2.credential", args[3])
        .Config("fs.adl.oauth2.refresh.url", $"https://login.microsoftonline.com/{args[0]}/oauth2/token")
        .GetOrCreate();

    // Create sample data
    var data = new List<GenericRow>
    {
        new GenericRow(new object[] { 1, "John Doe" }),
        new GenericRow(new object[] { 2, "Jane Doe" }),
        new GenericRow(new object[] { 3, "Foo Bar" })
    };

    // Create schema for sample data
    var schema = new StructType(new List<StructField>()
    {
        new StructField("Id", new IntegerType()),
        new StructField("Name", new StringType()),
    });

    // Create DataFrame using data and schema
    DataFrame df = spark.CreateDataFrame(data, schema);

    // Print DataFrame
    df.Show();

    // Write DataFrame to Azure Data Lake Gen1
    df.Write().Mode(SaveMode.Overwrite).Parquet(filePath);

    // Read saved DataFrame from Azure Data Lake Gen1
    DataFrame readDf = spark.Read().Parquet(filePath);

    // Print DataFrame
    readDf.Show();
}
private static DataFrame LoadAnomalyDetectionData(IEnumerable <object[]> rows) => SparkSession.Builder().GetOrCreate().CreateDataFrame( rows.Select(row => new GenericRow(row)), new StructType(new List <StructField> { new StructField("id", new IntegerType()), new StructField("productName", new StringType()), new StructField("description", new StringType()), new StructField("priority", new StringType()), new StructField("numViews", new IntegerType()), }));
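A hedged usage sketch for LoadAnomalyDetectionData above: each object array must line up with the schema it declares (id, productName, description, priority, numViews). The sample values are illustrative only.

DataFrame anomalyData = LoadAnomalyDetectionData(new[]
{
    new object[] { 1, "Widget", "A widget", "high", 12 },
    new object[] { 2, "Gadget", "A gadget", "low", 3 },
});
anomalyData.Show();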
public void ExecuteSparkStatementSync()
{
    // Environment variable with the Synapse workspace endpoint.
    string workspaceUrl = TestEnvironment.WorkspaceUrl;

    // Environment variable with the Synapse Spark pool name.
    string sparkPoolName = TestEnvironment.SparkPoolName;

    #region Snippet:SparkSessionSample1SparkSessionClient
    SparkSessionClient client = new SparkSessionClient(new Uri(workspaceUrl), sparkPoolName, new DefaultAzureCredential());
    #endregion

    #region Snippet:SparkSessionSample1StartSparkSession
    SparkSessionOptions request = new SparkSessionOptions(name: $"session-{Guid.NewGuid()}")
    {
        DriverMemory = "28g",
        DriverCores = 4,
        ExecutorMemory = "28g",
        ExecutorCores = 4,
        ExecutorCount = 2
    };
    SparkSession sessionCreated = client.CreateSparkSession(request);
    #endregion

    #region Snippet:SparkSessionSample1GetSparkSession
    SparkSession session = client.GetSparkSession(sessionCreated.Id);
    Debug.WriteLine($"Session is returned with name {session.Name} and state {session.State}");
    #endregion

    #region Snippet:SparkSessionSample1ExecuteSparkStatement
    SparkStatementOptions sparkStatementRequest = new SparkStatementOptions
    {
        Kind = SparkStatementLanguageType.Spark,
        Code = @"print(""Hello world\n"")"
    };
    SparkStatement statementCreated = client.CreateSparkStatement(sessionCreated.Id, sparkStatementRequest);
    #endregion

    #region Snippet:SparkSessionSample1GetSparkStatement
    SparkStatement statement = client.GetSparkStatement(sessionCreated.Id, statementCreated.Id);
    Debug.WriteLine($"Statement is returned with id {statement.Id} and state {statement.State}");
    #endregion

    #region Snippet:SparkSessionSample1CancelSparkStatement
    SparkStatementCancellationResult cancellationResult = client.CancelSparkStatement(sessionCreated.Id, statementCreated.Id);
    Debug.WriteLine($"Statement is cancelled with message {cancellationResult.Msg}");
    #endregion

    #region Snippet:SparkSessionSample1StopSparkSession
    Response operation = client.CancelSparkSession(sessionCreated.Id);
    #endregion
}
static void Main(string[] args) { var spark = SparkSession .Builder() .GetOrCreate(); var filtered = spark.Read().Parquet("1.parquet") .Filter(Functions.Col("event_type") == Functions.Lit(999)); filtered.Write().Mode("overwrite").Parquet("output.parquet"); Console.WriteLine($"Wrote: {filtered.Count()} rows"); }
private static void Main(string[] args)
{
    var spark = SparkSession.Builder()
        .Config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .GetOrCreate();

    if (!DeltaTable.IsDeltaTable("parquet.`/tmp/delta-demo`"))
    {
        spark.Range(1000).WithColumn("name", Lit("Sammy")).Write().Mode("overwrite")
            .Parquet("/tmp/delta-demo");
        DeltaTable.ConvertToDelta(spark, "parquet.`/tmp/delta-demo`");
    }

    var delta = DeltaTable.ForPath("/tmp/delta-demo");
    delta.ToDF().OrderBy(Desc("Id")).Show();

    spark.Range(5, 500).WithColumn("name", Lit("Lucy")).Write().Mode("append").Format("delta")
        .Save("/tmp/delta-demo");

    delta.Update(Expr("id > 500"), new Dictionary<string, Column> { { "id", Lit(999) } });
    delta.Delete(Column("id").EqualTo(999));

    spark.Range(100000, 100100).Write().Format("delta").Mode("append").Save("/tmp/delta-demo");
    delta.History().Show(1000, 10000);

    spark.Read().Format("delta").Option("versionAsOf", 0).Load("/tmp/delta-demo")
        .OrderBy(Desc("Id")).Show();
    spark.Read().Format("delta").Option("timestampAsOf", "2021-10-22 22:03:36")
        .Load("/tmp/delta-demo").OrderBy(Desc("Id")).Show();

    var newData = spark.Range(10).WithColumn("name", Lit("Ed"));

    delta.Alias("target")
        .Merge(newData.Alias("source"), "target.id = source.id")
        .WhenMatched(newData["id"].Mod(2).EqualTo(0))
            .Update(new Dictionary<string, Column> { { "name", newData["name"] } })
        .WhenMatched(newData["id"].Mod(2).EqualTo(1)).Delete()
        .WhenNotMatched().InsertAll()
        .Execute();

    delta.ToDF().OrderBy("id").Show(1000, 10000);
    delta.Vacuum(1F);
}
static void CreateUsingRangeAndDataFrameAPI(SparkSession spark)
{
    Console.WriteLine("spark.Sql");
    var dataFrame = spark.Sql("select id from range(1000)");
    dataFrame.Show(5);
    /*
        +---+
        | id|
        +---+
        |  0|
        |  1|
        |  2|
        |  3|
        |  4|
        +---+
    */

    Console.WriteLine("spark.Sql().WithColumn");
    dataFrame = dataFrame.WithColumn("Another Column", Functions.Lit("Literal"));
    dataFrame.Show(5);
    /*
        +---+--------------+
        | id|Another Column|
        +---+--------------+
        |  0|       Literal|
        |  1|       Literal|
        |  2|       Literal|
        |  3|       Literal|
        |  4|       Literal|
        +---+--------------+
    */

    Console.WriteLine("spark.Sql().WithColumn");
    dataFrame = dataFrame.WithColumn("Mod", Functions.Pmod(Functions.Col("id"), Functions.Lit(2)));
    dataFrame.Show(5);
    /*
        +---+--------------+---+
        | id|Another Column|Mod|
        +---+--------------+---+
        |  0|       Literal|  0|
        |  1|       Literal|  1|
        |  2|       Literal|  0|
        |  3|       Literal|  1|
        |  4|       Literal|  0|
        +---+--------------+---+
    */
}
internal TpchBase(string tpchRoot, SparkSession spark) { // Load all the TPC-H tables. tpchRoot += Path.DirectorySeparatorChar; _customer = spark.Read().Parquet($"{tpchRoot}customer"); _lineitem = spark.Read().Parquet($"{tpchRoot}lineitem"); _nation = spark.Read().Parquet($"{tpchRoot}nation"); _orders = spark.Read().Parquet($"{tpchRoot}orders"); _part = spark.Read().Parquet($"{tpchRoot}part"); _partsupp = spark.Read().Parquet($"{tpchRoot}partsupp"); _region = spark.Read().Parquet($"{tpchRoot}region"); _supplier = spark.Read().Parquet($"{tpchRoot}supplier"); }
protected override DataFrame ReadFromInternal(SparkSession sparkSession, SqlServerDataTableInputEndpoint inputEndpoint, ProjectContext projectContext) { return(sparkSession.Read() .Format("jdbc") .Option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") .Option("url", inputEndpoint.ConnectionUrl) .Option("dbtable", inputEndpoint.DataTable) .Option("user", inputEndpoint.UserName) .Option("password", inputEndpoint.Password) .Load()); }
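A hedged sketch of the endpoint values the JDBC reader above expects. SqlServerDataTableInputEndpoint is the project's own type; the assumption that its properties are settable like this, and all server, database, table, and credential values, are illustrative placeholders.

// Hypothetical endpoint configuration for the SQL Server JDBC reader.
var endpoint = new SqlServerDataTableInputEndpoint
{
    ConnectionUrl = "jdbc:sqlserver://myserver.example.com:1433;databaseName=sales",
    DataTable = "dbo.Orders",
    UserName = "spark_reader",
    Password = "<password>",
};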
static void Main(string[] args) { var path = args.FirstOrDefault(); var spark = SparkSession .Builder() .GetOrCreate(); var dataFrame = spark.Read().Option("header", "true").Csv(path); var count = dataFrame.Filter(Col("name") == "Ed Elliott").Count(); Console.WriteLine($"There are {count} row(s)"); }
public Worker(ILogger <Worker> logger) { _logger = logger; spark = SparkSession.Builder() .AppName("meuovo") .GetOrCreate(); input = spark.ReadStream() .Format("kafka") .Option("kafka.bootstrap.servers", "localhost:9092") .Option("subscribe", "b7f45352-6abf-436b-9c4a-98141699728c") .Load() .SelectExpr("CAST(value AS STRING)"); }
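The constructor above only builds the Kafka source; a hedged sketch of how such a worker might actually run the stream follows. The BackgroundService-style ExecuteAsync override, the console sink, and the checkpoint path are assumptions for illustration, not part of the original worker.

protected override Task ExecuteAsync(CancellationToken stoppingToken)
{
    return Task.Run(() =>
    {
        // Start a streaming query over the DataFrame prepared in the constructor.
        var query = input
            .WriteStream()
            .OutputMode("append")
            .Format("console")                                    // illustrative sink
            .Option("checkpointLocation", "/tmp/worker-checkpoint") // illustrative path
            .Start();
        query.AwaitTermination();
    }, stoppingToken);
}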
static void Main(string[] args)
{
    // Verify environment variables
    if (args.Length != 2)
    {
        Console.Error.WriteLine("Usage: $AZURE_STORAGE_ACCOUNT $AZURE_STORAGE_KEY");
        Environment.Exit(1);
    }

    // Specify file path in Azure Storage
    string filePath = $"wasbs://dotnet-spark@{args[0]}.blob.core.windows.net/json/people.json";

    // Create SparkSession
    SparkSession spark = SparkSession
        .Builder()
        .AppName("Azure Storage example using .NET for Apache Spark")
        .Config("fs.wasbs.impl", "org.apache.hadoop.fs.azure.NativeAzureFileSystem")
        .Config($"fs.azure.account.key.{args[0]}.blob.core.windows.net", args[1])
        .GetOrCreate();

    // Create sample data
    var data = new List<GenericRow>
    {
        new GenericRow(new object[] { 1, "John Doe" }),
        new GenericRow(new object[] { 2, "Jane Doe" }),
        new GenericRow(new object[] { 3, "Foo Bar" })
    };

    // Create schema for sample data
    var schema = new StructType(new List<StructField>()
    {
        new StructField("Id", new IntegerType()),
        new StructField("Name", new StringType()),
    });

    // Create DataFrame using data and schema
    DataFrame df = spark.CreateDataFrame(data, schema);

    // Print DataFrame
    df.Show();

    // Write DataFrame to Azure Storage
    df.Write().Mode(SaveMode.Overwrite).Json(filePath);

    // Read saved DataFrame from Azure Storage
    DataFrame readDf = spark.Read().Json(filePath);

    // Print DataFrame
    readDf.Show();
}
private static void CreateUsingGenericRowAndStructType(SparkSession spark)
{
    Console.WriteLine("spark.CreateDataFrame using StructType");

    var rowOne = new GenericRow(new object[] { "columnOne Row One", 1.1 });
    var rowTwo = new GenericRow(new object[] { "columnOne Row Two", null });
    var rowThree = new GenericRow(new object[] { "columnOne Row Three", 3.3 });

    var rows = new List<GenericRow>() { rowOne, rowTwo, rowThree };

    var structType = new StructType(new List<StructField>()
    {
        new StructField("column one", new StringType(), isNullable: false),
        new StructField("column two", new DoubleType(), isNullable: true)
    });

    var dataFrame = spark.CreateDataFrame(rows, structType);
    dataFrame.Show();
    /*
        +-------------------+----------+
        |         column one|column two|
        +-------------------+----------+
        |  columnOne Row One|       1.1|
        |  columnOne Row Two|      null|
        |columnOne Row Three|       3.3|
        +-------------------+----------+
    */

    dataFrame.PrintSchema();
    /*
        root
         |-- column one: string (nullable = false)
         |-- column two: double (nullable = true)
    */
}
protected override DataFrame ReadFromInternal(SparkSession sparkSession, TextInputEndpoint inputEndpoint, ProjectContext projectContext) { var textFiles = inputEndpoint .Files .Select(f => $"s3a://{f.Bucket}/{f.Key}/{f.File}") .ToArray(); if (textFiles?.Length == 0) { throw new SparkRunnerException("No files could be read by the TextFileInputReader."); } return(sparkSession.Read().Text(textFiles)); }