static void Main(string[] args)
{
    if (args.Length != 2)
    {
        Console.Error.WriteLine(
            "Usage: Remember to include input and output path as arguments");
        Environment.Exit(1);
    }

    var sparkConf = SparkConfUtils.GetSparkConfigurationForFilePath(args);

    var spark = SparkSession
        .Builder()
        .AppName("Batch Job example using Apache Spark .Net")
        .GetOrCreate();

    if (sparkConf != null)
    {
        sparkConf.ToList().ForEach(kv => spark.Conf().Set(kv.Key, kv.Value));
    }

    var df = spark
        .Read()
        .Schema(GetSchema())
        .Option("header", true)
        .Csv(args[0]);

    var processedDF = ProcessDataset(df);
    ShowDatasetInfo(processedDF);
    WriteData(processedDF, args[1]);

    Console.WriteLine("Finished .Net Spark Job!!");
}
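// The snippet above relies on helpers that are not shown (GetSchema, ProcessDataset,
// ShowDatasetInfo, WriteData). The following is a minimal, hypothetical sketch of what
// they might look like; the column names, the aggregation, and the Parquet sink are
// assumptions, not the original author's code. Requires Microsoft.Spark.Sql and
// Microsoft.Spark.Sql.Types.
static StructType GetSchema() =>
    new StructType(new[]
    {
        new StructField("id", new IntegerType()),
        new StructField("name", new StringType()),
        new StructField("amount", new DoubleType())
    });

static DataFrame ProcessDataset(DataFrame df) =>
    // Example transformation: drop incomplete rows, then total amounts per name.
    df.Na().Drop("any")
        .GroupBy("name")
        .Sum("amount")
        .WithColumnRenamed("sum(amount)", "total_amount");

static void ShowDatasetInfo(DataFrame df)
{
    df.PrintSchema();
    df.Show();
}

static void WriteData(DataFrame df, string outputPath) =>
    // Parquet is an assumption; any supported sink would work here.
    df.Write().Mode("overwrite").Parquet(outputPath);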
static void Main(string[] args)
{
    var host = "localhost";
    var port = 9999;

    SparkSession spark = SparkSession
        .Builder()
        .AppName("Emotion_Prediction")
        .GetOrCreate();

    DataFrame lines = spark
        .ReadStream()
        .Format("socket")
        .Option("host", host)
        .Option("port", port)
        .Load();

    Func<Column, Column> udfArray =
        Udf<string, string[]>((str) => new string[] { str, " => " + Predict(str) });

    DataFrame arrayDf = lines.Select(Explode(udfArray(lines["value"])));

    StreamingQuery query = arrayDf
        .WriteStream()
        .Format("console")
        .Start();

    query.AwaitTermination();
}
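// `Predict` is not defined in the snippet above. A minimal stand-in, assuming a
// simple keyword heuristic; the real example presumably scores the text with a
// trained model (e.g. via ML.NET):
static string Predict(string text) =>
    text.Contains("happy", StringComparison.OrdinalIgnoreCase) ? "positive" : "negative";

// To feed the socket stream locally, start a line server before running the job, e.g.:
//   nc -lk 9999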
public void Run(string[] args)
{
    if (args.Length != 1)
    {
        Console.Error.WriteLine(
            "Usage: Datasource <path to SPARK_HOME/examples/src/main/resources/>");
        Environment.Exit(1);
    }

    string parquet = Path.Combine(args[0], "users.parquet");
    string json = Path.Combine(args[0], "people.json");
    string csv = Path.Combine(args[0], "people.csv");
    string orc = Path.Combine(args[0], "users.orc");

    SparkSession spark = SparkSession
        .Builder()
        .AppName("SQL Datasource example using .NET for Apache Spark")
        .Config("spark.some.config.option", "some-value")
        .GetOrCreate();

    RunBasicDatasourceExample(spark, parquet, json, csv, orc);
    RunParquetExample(spark, json);
    RunDatasourceExample(spark);

    spark.Stop();
}
public void TestThreadLocalSessions()
{
    SparkSession.ClearActiveSession();

    void testChildThread(string appName)
    {
        var thread = new Thread(() =>
        {
            Assert.Null(SparkSession.GetActiveSession());

            SparkSession.SetActiveSession(
                SparkSession.Builder().AppName(appName).GetOrCreate());

            // Since we are in the child thread, GetActiveSession() should return
            // the child SparkSession.
            SparkSession activeSession = SparkSession.GetActiveSession();
            Assert.NotNull(activeSession);
            Assert.Equal(appName, activeSession.Conf().Get("spark.app.name", null));
        });

        thread.Start();
        thread.Join();
    }

    for (int i = 0; i < 5; ++i)
    {
        testChildThread(i.ToString());
    }

    Assert.Null(SparkSession.GetActiveSession());
}
public void Run(string[] args)
{
    string servidoresKafka = args[0];
    string topico = args[1];
    string modelo = args[2];

    // Get a reference to the Spark execution context
    SparkSession spark = SparkSession
        .Builder()
        .AppName("Exemplo Streaming com Kafka")
        .GetOrCreate();

    // Create a DataFrame that receives data from Kafka
    DataFrame df = spark
        .ReadStream()
        .Format("kafka")
        .Option("kafka.bootstrap.servers", servidoresKafka)
        .Option("subscribe", topico)
        .Load()
        .SelectExpr("CAST(value AS STRING)");

    /* Create a schema to validate the JSON arriving in the Kafka messages.
     * Example JSON:
     * {
     *   "cliente": "Fulano",
     *   "produto": "Mochila",
     *   "opiniao": "Muito boa!"
     * }
     */
    var schema = new StructType(new[]
    {
        new StructField("cliente", new StringType()),
        new StructField("produto", new StringType()),
        new StructField("opiniao", new StringType())
    }); // struct<cliente:string,produto:string,opiniao:string>

    // Parse the JSON into a struct column ...
    df = df.WithColumn("json", Functions.FromJson(df.Col("value"), schema.SimpleString))
        .Select("json.*"); // ... and expand all of its fields into a new DataFrame

    // Register a user-defined function to be used on the DataFrame
    spark.Udf().Register<string, float>(
        "AnaliseDeSentimento",
        (texto) => AnalisarSentimento(texto, modelo));

    // Create a new column "nota" with the sentiment analysis result
    df = df.WithColumn("nota", Functions.CallUDF("AnaliseDeSentimento", df.Col("opiniao")));

    // Start the streaming query
    StreamingQuery query = df
        .WriteStream()
        .OutputMode(OutputMode.Append)
        .Format("console")
        //.Trigger(Trigger.Continuous(2000))
        //.Foreach(new RedisForeachWriter())
        .Start();

    query.AwaitTermination(); // Required to keep the application alive while it processes data
}
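// `AnalisarSentimento` is defined elsewhere in the original project. A hypothetical
// stub with the same shape (text and model path in, score out); a real implementation
// would load the model file at `modelo` and run a prediction on `texto`:
static float AnalisarSentimento(string texto, string modelo)
{
    // Placeholder only: always returns a neutral score.
    return 0.5f;
}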
static void Main(string[] args)
{
    var spark = SparkSession.Builder().GetOrCreate();
    spark.Range(5).Show();
    spark.Range(10, 12).Show();
}
public static void leerTxt()
{
    Console.WriteLine("Hello World!");

    // Create a Spark session
    SparkSession spark = SparkSession
        .Builder()
        .AppName("word_count_sample")
        .GetOrCreate();

    // Create initial DataFrame
    DataFrame dataFrame = spark.Read().Text("input.txt");

    // Count words
    DataFrame words = dataFrame
        .Select(Functions.Split(Functions.Col("value"), " ").Alias("words"))
        .Select(Functions.Explode(Functions.Col("words")).Alias("word"))
        .GroupBy("word")
        .Count()
        .OrderBy(Functions.Col("count").Desc());

    // Show results
    words.Show();

    // Stop Spark session
    spark.Stop();
}
static void Main(string[] args) { var spark = SparkSession .Builder() .AppName("DemoApp") .GetOrCreate(); var dataFrame = spark.Sql("select id, rand() as random_number from range(1000)"); dataFrame .Write() .Format("csv") .Option("header", true) .Option("sep", "|") .Mode("overwrite") .Save(args[0]); foreach (var row in dataFrame.Collect()) { if (row[0] as int? % 2 == 0) { Console.WriteLine($"line: {row[0]}"); } } }
static void Main(string[] args)
{
    Console.WriteLine("Start SparkSession");

    SparkSession sparkSession = SparkSession.Builder().AppName("Street Counter").GetOrCreate();

    DataFrame dfCsv = sparkSession
        .Read()
        .Option("delimiter", ";")
        .Schema("WOJ string, POW string, GMI string, RODZ_GMI string, " +
                "SYM string, SYM_UL string, " +
                "CECHA string, NAZWA_1 string, NAZWA_2 string, " +
                "STAN_NA string")
        .Csv("streets.csv");

    DataFrame dataIn = dfCsv
        .WithColumn("STREET",
            Functions.ConcatWs(" ", dfCsv["CECHA"], dfCsv["NAZWA_1"], dfCsv["NAZWA_2"]));

    DataFrame dataGroup = dataIn
        .Select("STREET")
        .GroupBy("STREET")
        .Count()
        .WithColumnRenamed("count", "COUNT");

    DataFrame dataOut = dataGroup
        .OrderBy(dataGroup["COUNT"].Desc());

    dataOut
        .Coalesce(1)
        .Write()
        .Option("delimiter", ";")
        .Csv("result");

    sparkSession.Stop();
    Console.WriteLine("Stop SparkSession");
}
static void Main(string[] args)
{
    var file = args[0];
    Console.WriteLine("Reading file from: " + file);

    // Create Spark session
    var spark = SparkSession.Builder()
        .Master("local[*]")
        .AppName("SparkDotNet")
        .GetOrCreate();

    // Read CSV file
    var df = spark.Read()
        .Option("sep", "\t")
        .Option("header", "true")
        .Option("inferSchema", "true")
        .Csv(file);

    // Show schema and some rows
    df.PrintSchema();
    df.Show();

    // Register as a table
    df.CreateOrReplaceTempView("nyse");
    spark.Sql("SELECT `stock_symbol`, AVG(`stock_price_open`) FROM nyse GROUP BY 1").Show();
}
static void Main(string[] args) { var spark = SparkSession .Builder() .AppName("word_count_sample") .GetOrCreate(); var conf = spark.Conf(); HttpClient client = new HttpClient(); var content = client.GetAsync("https://raw.githubusercontent.com/AMustapha/meduim_words/master/storie.txt") .Result.Content.ReadAsStringAsync().Result; File.WriteAllText("../../../storie.txt", content); DataFrame dataFrame = spark.Read().Text("storie.txt"); var words = dataFrame .Select(Functions.Split(Functions.Col("value"), " ").Alias("words")) .Select(Functions.Explode(Functions.Col("words")) .Alias("word")) .GroupBy("word") .Count() .OrderBy(Functions.Col("count").Desc()); // Show results words.Show(); }
static void Main(string[] args)
{
    // 1. Create a Spark session
    SparkSession spark = SparkSession
        .Builder()
        .AppName("word_count_sample")
        .GetOrCreate();

    // 2. Create initial DataFrame
    DataFrame dataFrame = spark.Read()
        //.Schema("Assertid STRING,properties STRING,Value BOOLEAN,TimeSatmp TIMESTAMP")
        .Schema("Assertid STRING,properties STRING,Value STRING,TimeSatmp TIMESTAMP")
        .Csv("DataBook.csv");
    dataFrame.Show();

    // Drop any rows with null/empty values
    DataFrameNaFunctions dropEmptyRows = dataFrame.Na();
    DataFrame cleanedProjects = dropEmptyRows.Drop("any");

    // Remove unnecessary columns
    cleanedProjects = cleanedProjects.Drop("Assertid", "properties", "Value", "TimeSatmp");
    cleanedProjects.Show();

    // Stop Spark session
    spark.Stop();
}
public static void runSpark(string file_path, string cores, string nodes, int nrows)
{
    // Create Spark session
    SparkSession spark = SparkSession
        .Builder()
        .AppName("word_count_sample")
        .Config("spark.executor.cores", cores)
        .Config("spark.executor.instances", nodes)
        .GetOrCreate();

    // Create initial DataFrame
    DataFrame dataFrame = spark
        .Read()
        .Option("header", true)
        .Option("inferSchema", true)
        .Schema("quizzes string, solutions string")
        .Csv(file_path);

    DataFrame dataFrame2 = dataFrame.Limit(nrows);

    spark.Udf().Register<string, string>(
        "SukoduUDF",
        (sudoku) => sudokusolution(sudoku));

    dataFrame2.CreateOrReplaceTempView("Resolved");
    DataFrame sqlDf = spark.Sql("SELECT quizzes, SukoduUDF(quizzes) as Resolution from Resolved");
    sqlDf.Show();

    spark.Stop();
    Console.WriteLine("SCRAPY");
}
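// `sudokusolution` is the project's solver and is not shown here. A hypothetical
// stub with the same shape (an 81-character puzzle string in, a solution string out);
// a real implementation would actually solve the puzzle:
static string sudokusolution(string sudoku)
{
    // Placeholder only: echoes the puzzle unchanged.
    return sudoku;
}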
static void Main(string[] args) { SparkSession spark = SparkSession .Builder() .AppName("emrApp") .GetOrCreate(); DataFrame dataFrame = spark .Read() .Format("avro") .Load(args[0]); RegionModel regionModel = new RegionModel(); Func <Column, Column> udfConvertRegion = Udf <string, string>( city => { var regionCode = city.Split('_')[1].Substring(0, 1); var convertedRegion = String.Empty; regionModel.ConversionTable.TryGetValue(regionCode, out convertedRegion); return(convertedRegion); } // city_23 --> 23 --> 2 --> {2 : Brisbane} --> ** Brisbane ** ); dataFrame = dataFrame .WithColumn("Region", udfConvertRegion(dataFrame["address.city"])) .Drop("orderunits", "address"); dataFrame .Coalesce(1) .Write() .Format("csv") .Save($"{args[1]}/{DateTime.UtcNow.ToString("yyyy/MM/dd/hh-mm-ss")}"); }
public SparkFixture()
{
    string workerDirEnvVarName = Services.ConfigurationService.WorkerDirEnvVarName;

    // The worker directory must be set for the Microsoft.Spark.Worker executable.
    if (string.IsNullOrEmpty(Environment.GetEnvironmentVariable(workerDirEnvVarName)))
    {
        throw new Exception($"Environment variable '{workerDirEnvVarName}' must be set.");
    }

    BuildSparkCmd(out var filename, out var args);

    // Configure the process using the StartInfo properties.
    _process.StartInfo.FileName = filename;
    _process.StartInfo.Arguments = args;

    // UseShellExecute defaults to true in .NET Framework but to false in
    // .NET Core. To support both, set it to false, which is required for
    // stream redirection.
    _process.StartInfo.UseShellExecute = false;
    _process.StartInfo.RedirectStandardInput = true;
    _process.StartInfo.RedirectStandardOutput = true;
    _process.StartInfo.RedirectStandardError = true;

    bool isSparkReady = false;
    _process.OutputDataReceived += (sender, arguments) =>
    {
        // The Scala-side driver for .NET emits the following message after it is
        // launched and ready to accept connections.
        if (!isSparkReady && arguments.Data.Contains("Backend running debug mode"))
        {
            isSparkReady = true;
        }
    };

    _process.Start();
    _process.BeginOutputReadLine();

    bool processExited = false;
    while (!isSparkReady && !processExited)
    {
        processExited = _process.WaitForExit(500);
    }

    if (processExited)
    {
        _process.Dispose();

        // The process should not have exited.
        throw new Exception(
            $"Process exited prematurely with '{filename} {args}'.");
    }

    Spark = SparkSession
        .Builder()
        .AppName("Microsoft.Spark.E2ETest")
        .GetOrCreate();
}
private static void Exemplo1()
{
    // Create a Spark session
    var spark = SparkSession
        .Builder()
        .AppName("word_count_sample")
        .GetOrCreate();

    // Create initial DataFrame
    DataFrame dataFrame = spark.Read().Text("input.txt");

    // Count words
    var words = dataFrame
        .Select(Functions.Split(Functions.Col("value"), " ").Alias("words"))
        .Select(Functions.Explode(Functions.Col("words")).Alias("word"))
        .GroupBy("word")
        .Count()
        .OrderBy(Functions.Col("count").Desc());

    // Show results
    words.Show();

    // Stop Spark session
    spark.Stop();
}
static void Main(string[] args) { // Create a Spark session var spark = SparkSession .Builder() .AppName("post_analysis") .GetOrCreate(); // Create initial DataFrame var dataFrame = spark.Read() .Json(@"C:\data\3dprinting.meta.stackexchange.com\Posts.json"); dataFrame.CreateOrReplaceTempView("posts"); // TODO: Filter tags to only contain C# questions //dataFrame // .Select(Split(Col("Body"), " ").As("words")) // .Select(Explode(Col("words")).As("word")) // .GroupBy("word").Count().OrderBy(Col("count").Desc()) // .Show(); // TODO: process text to find code fences and extract C# code // TODO: parse remaining C# code //var parseCSharp = // Udf<string, IDictionary<string, string[]>>( // (str) => GetSyntaxKindsMap(str)); }
static void Main(string[] args)
{
    // Set the debug backend port; it has to be the same as the one in the Dockerfile.
    System.Environment.SetEnvironmentVariable("DOTNETBACKEND_PORT", "12345");

    // Create a Spark session.
    SparkSession spark = SparkSession
        .Builder()
        .AppName("word_count_sample")
        .GetOrCreate();

    // Create initial DataFrame.
    DataFrame dataFrame = spark.Read().Text("input.txt");

    // Count words.
    DataFrame words = dataFrame
        .Select(Functions.Split(Functions.Col("value"), " ").Alias("words"))
        .Select(Functions.Explode(Functions.Col("words")).Alias("word"))
        .GroupBy("word")
        .Count()
        .OrderBy(Functions.Col("count").Desc());

    // Show results.
    words.Show();

    // Stop Spark session.
    spark.Stop();
}
static void Main(string[] args)
{
    // Create Spark session
    SparkSession spark = SparkSession
        .Builder()
        .AppName("word_count_sample")
        .GetOrCreate();

    // Create initial DataFrame
    string filePath = args[0];
    DataFrame dataFrame = spark.Read().Text(filePath);

    // Count words
    DataFrame words = dataFrame
        .Select(Split(Col("value"), " ").Alias("words"))
        .Select(Explode(Col("words")).Alias("word"))
        .GroupBy("word")
        .Count()
        .OrderBy(Col("count").Desc());

    // Display results
    words.Show();

    // Stop Spark session
    spark.Stop();
}
static void Main(string[] args) { // Create a Spark session var spark = SparkSession .Builder() .AppName("DotNet-Word-Count") .GetOrCreate(); // Create initial DataFrame var df = spark.Read().Text("file:/home/anderson.souza/bin/lorem_action.txt"); df.PrintSchema(); // Count words var words = df .Select(Functions.Split(Functions.Col("value"), " ").Alias("words")) .Select(Functions.Explode(Functions.Col("words")).Alias("words")) .Select(Functions.RegexpReplace(Functions.Col("words"), "\\.|,", "").Alias("words")) .Filter(Functions.Length(Functions.Col("words")) > 5).Alias("words") .GroupBy("words") .Count() .OrderBy(Functions.Col("count").Desc()); // Show results words.PrintSchema(); words.Show(); }
public static void leerJSON()
{
    SparkSession spark = SparkSession
        .Builder()
        .AppName("word_count_sample")
        .GetOrCreate();

    // A CSV dataset is pointed to by path.
    // The path can be either a single CSV file or a directory of CSV files.
    string path = "data/sample_data.csv";
    DataFrame df = spark.Read().Csv(path);
    df.Show();
    // +------------------+
    // |               _c0|
    // +------------------+
    // |      name;age;job|
    // |Jorge;30;Developer|
    // |  Bob;32;Developer|
    // +------------------+

    // Count names with SQL; the temp view must be registered before it can be
    // queried (the original called spark.Sql without creating the view).
    df.CreateOrReplaceTempView("sample_data");
    DataFrame sqlDf = spark.Sql("SELECT * FROM sample_data");

    // Show results
    sqlDf.Show();

    // Stop Spark session
    spark.Stop();
}
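// Note: the file above is semicolon-delimited, so everything lands in a single
// column (_c0). To get real columns, pass the delimiter and header options.
// A small sketch, assuming the same file layout as shown in the output above:
static void ReadCsvWithColumns(SparkSession spark)
{
    DataFrame parsed = spark.Read()
        .Option("sep", ";")
        .Option("header", true)
        .Csv("data/sample_data.csv");
    parsed.Show();
    // +-----+---+---------+
    // | name|age|      job|
    // +-----+---+---------+
    // |Jorge| 30|Developer|
    // |  Bob| 32|Developer|
    // +-----+---+---------+
}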
private static void Run(string logLevel, List<string> metricsToObtain)
{
    var spark = SparkSession
        .Builder()
        .AppName(SparkAppName)
        .GetOrCreate();

    spark.SparkContext.SetLogLevel(logLevel);

    // Read the initial movie and rating data as non-streaming DataFrames
    var moviesDataFrame = ReadCsvIntoDataframe(spark, MoviesCsvFile, SchemaLoader.MovieSchema);
    var ratingsDataFrame = ReadCsvIntoDataframe(spark, RatingsCsvFile, SchemaLoader.RatingSchema);

    foreach (var metric in metricsToObtain)
    {
        var watch = new Stopwatch();
        watch.Start();
        var colRows = RunMetric(moviesDataFrame, ratingsDataFrame, metric);
        watch.Stop();
        PrintRows(colRows.ToList(), watch.Elapsed.TotalSeconds);
    }
}
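// `ReadCsvIntoDataframe` is a project helper that is not shown. A plausible
// one-liner, assuming the schema parameter is a Microsoft.Spark StructType:
private static DataFrame ReadCsvIntoDataframe(SparkSession spark, string path, StructType schema) =>
    spark.Read().Schema(schema).Option("header", true).Csv(path);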
static void Main(string[] args)
{
    var spark = SparkSession.Builder().GetOrCreate();

    spark.Sql("CREATE DATABASE InputData");
    spark.Catalog.SetCurrentDatabase("InputData");
    spark.Catalog.CreateTable("id_list", "./ID.parquet");

    var tables = spark.Catalog.ListTables("InputData");
    foreach (var row in tables.Collect())
    {
        var name = row[0].ToString();
        var database = row[1].ToString();
        Console.WriteLine($"Database: {database}, Table: {name}");

        var table = spark.Catalog.ListColumns(database, name);
        foreach (var column in table.Collect())
        {
            var columnName = column[0].ToString();
            var dataType = column[2].ToString();
            Console.WriteLine($"{columnName}\t{dataType}");
        }
    }
}
static void Main(string[] args)
{
    SparkSession ss = SparkSession
        .Builder()
        .AppName(".NET for Spark Streaming")
        .GetOrCreate();

    DataFrame stream = ss
        .ReadStream()
        .Format("socket")
        .Option("host", "localhost")
        .Option("port", 9000)
        .Load();

    DataFrame grade = stream
        .Select(Col("value"));

    StreamingQuery query = grade
        .WriteStream()
        .OutputMode(OutputMode.Append)
        .Format("console")
        .Start();

    query.AwaitTermination();
}
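// To exercise this stream locally, start a socket server on port 9000 before
// running the job, e.g.:
//   nc -lk 9000
// Each line typed into that terminal then appears in the console sink.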
public static void ExecuteSimpleVerificationSuiteWithExternalFile()
{
    var spark = SparkSession.Builder().GetOrCreate();

    var data = spark.Read().Json("data/inventory.json");
    data.Show();

    VerificationResult verificationResult = new VerificationSuite()
        .OnData(data)
        .AddCheck(
            new Check(CheckLevel.Error, "integrity checks")
                .HasSize(value => value == 5)
                .IsComplete("id")
                .IsUnique("id")
                .IsComplete("productName")
                .IsContainedIn("priority", new[] { "high", "low" })
                .IsNonNegative("numViews"))
        .AddCheck(
            new Check(CheckLevel.Warning, "distribution checks")
                .ContainsURL("description", value => value >= .5))
        .Run();

    verificationResult.Debug();
}
public static void RunSparkStream(string streamInputPath)
{
    var foreachWriter = new TestForeachWriter();

    SparkSession spark = SparkSession
        .Builder()
        .AppName("itur")
        .GetOrCreate();

    var mySchema = new Microsoft.Spark.Sql.Types.StructType(new[]
    {
        new StructField("IturCode", new Microsoft.Spark.Sql.Types.StringType()),
        new StructField("IturERP", new Microsoft.Spark.Sql.Types.StringType()),
        new StructField("QuantityEdit", new Microsoft.Spark.Sql.Types.StringType()),
        new StructField("PartialQuantity", new Microsoft.Spark.Sql.Types.StringType())
    });

    DataFrame lines = spark
        .ReadStream()
        .Schema(mySchema)
        .Csv(streamInputPath);

    s_query = lines
        .WriteStream()
        .Foreach(foreachWriter)
        .Trigger(Trigger.ProcessingTime(5000))
        .Start();

    s_query.AwaitTermination();
}
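// `TestForeachWriter` is not shown above. A minimal sketch of a sink implementing
// Microsoft.Spark's IForeachWriter that prints each row; the [Serializable]
// attribute is needed so the writer can be shipped to the workers:
[Serializable]
public class TestForeachWriter : IForeachWriter
{
    public bool Open(long partitionId, long epochId) => true;

    public void Process(Row row) =>
        Console.WriteLine(string.Join(",", row.Values));

    public void Close(Exception errorOrNull) { }
}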
public void TestSignaturesV2_3_X()
{
    Assert.IsType<SparkContext>(_spark.SparkContext);

    Assert.IsType<Builder>(SparkSession.Builder());

    SparkSession.ClearDefaultSession();
    SparkSession.SetDefaultSession(_spark);
    Assert.IsType<SparkSession>(SparkSession.GetDefaultSession());

    Assert.IsType<RuntimeConfig>(_spark.Conf());

    Assert.IsType<SparkSession>(_spark.NewSession());

    Assert.IsType<DataFrameReader>(_spark.Read());

    Assert.IsType<DataFrame>(_spark.Range(10));
    Assert.IsType<DataFrame>(_spark.Range(10, 100));
    Assert.IsType<DataFrame>(_spark.Range(10, 100, 10));
    Assert.IsType<DataFrame>(_spark.Range(10, 100, 10, 5));

    _spark.Range(10).CreateOrReplaceTempView("testView");
    Assert.IsType<DataFrame>(_spark.Table("testView"));

    Assert.IsType<DataStreamReader>(_spark.ReadStream());

    Assert.IsType<UdfRegistration>(_spark.Udf());

    Assert.IsType<Catalog>(_spark.Catalog());
}
static void Main(string[] args)
{
    var spark = SparkSession.Builder().GetOrCreate();
    var df = spark.Read().Json("people.json");
    df.Show();
}
/// <summary>
/// To integrate with Hive operations
/// </summary>
private static void HiveDataFrame()
{
    var builder = SparkSession.Builder().EnableHiveSupport();
    builder = builder.Config("spark.master", "yarn");
    builder = builder.Config("spark.app.name", "HiveDataFrame");
    builder = builder.Config("spark.sql.warehouse.dir", "/user/hive/warehouse");
    session = builder.GetOrCreate();

    var peopleDataFrame = session.Read().Json(jsonFilePath);

    logger.LogInfo("****Create table if not exists****");
    session.Sql(string.Format("CREATE DATABASE IF NOT EXISTS {0}", dbName)); // create database if not exists
    logger.LogInfo("****Database Created****");
    session.Sql(string.Format("USE {0}", dbName));

    logger.LogInfo("****Create Table operation started****");
    peopleDataFrame.Write().Mode(SaveMode.Overwrite).SaveAsTable(tableName); // create table
    logger.LogInfo("****Table Created successfully****");

    var tablesDataFrame = session.Table(tableName);
    logger.LogInfo(string.Format("****Table count in database {0}: {1}", dbName, tablesDataFrame.Count()) + "****");

    var rowCollections = tablesDataFrame.Collect();
    logger.LogInfo("**********************************************");
    foreach (var row in rowCollections)
    {
        Console.WriteLine("{0}", row);
    }
    logger.LogInfo("*********************************************");
    logger.LogInfo("Executed Successfully.................");
}
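// The method above references fields declared elsewhere in its class. A
// hypothetical set of declarations with placeholder values; the logger type is
// an assumption based on the LogInfo calls:
private static SparkSession session;
private static readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(Program));
private const string jsonFilePath = "people.json";
private const string dbName = "SampleDb";
private const string tableName = "people";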
static void runSpark(string file_path, string cores, string nodes, int nrows)
{
    // Create Spark session
    SparkSession spark = SparkSession
        .Builder()
        .AppName("Solving " + nrows + " sudokus by combinatorial evolution using " +
                 cores + " core(s) and " + nodes + " node(s)")
        .Config("spark.executor.cores", cores)
        .Config("spark.executor.instances", nodes)
        .GetOrCreate();

    // Create initial DataFrame
    DataFrame dataFrame = spark
        .Read()
        .Option("header", true)
        .Option("inferSchema", true)
        .Schema("quizzes string, solutions string")
        .Csv(file_path);

    DataFrame dataFrame2 = dataFrame.Limit(nrows);

    spark.Udf().Register<string, string>(
        "SukoduUDF",
        (sudoku) => sudokusolution(sudoku));

    dataFrame2.CreateOrReplaceTempView("Resolved");
    DataFrame sqlDf = spark.Sql("SELECT quizzes, SukoduUDF(quizzes) as Resolution from Resolved");
    sqlDf.Show();

    spark.Stop();
}