internal MemoryStream(SparkSession sparkSession)
        {
            JvmObjectReference sparkSessionRef = sparkSession.Reference;

            Reference = (JvmObjectReference)sparkSessionRef.Jvm.CallStaticJavaMethod(
                "org.apache.spark.sql.test.TestUtils",
                "createMemoryStream",
                sparkSessionRef.Invoke("sqlContext"),
                typeof(T).Name);
        }
        static void Main(string[] args)
        {
            var spark = SparkSession.Builder().GetOrCreate();

            spark.Udf().RegisterJavaUDAF("java_function", "com.company.ClassName");

            var dataFrame = spark.Sql("SELECT ID, java_function(ID) as java_function_output FROM range(1000)");

            dataFrame.Select(CallUDF("java_function", dataFrame["ID"])).Show();
        }
Example #3
        private static void ProcessEntity(SparkSession spark, string sourceFile, string dataLakePath, string sourceSystem, string entity, string year, string month)
        {
            var data = OfgemExpensesEntity.ReadFromSource(spark, sourceFile);

            OfgemExpensesEntity.WriteToStructured(data, $"{dataLakePath}/structured/{sourceSystem}/{entity}/{year}/{month}");

            OfgemExpensesEntity.WriteToCurated(data, $"{dataLakePath}/curated/{sourceSystem}/{entity}");

            OfgemExpensesEntity.WriteToPublish(spark, $"{dataLakePath}/curated/{sourceSystem}/{entity}", $"{dataLakePath}/publish/{sourceSystem}/{entity}");
        }
Example #4
        static void Main(string[] args)
        {
            var spark     = SparkSession.Builder().GetOrCreate();
            var dataFrame = spark.Range(100);

            dataFrame.Write().Mode("overwrite").Csv("output.csv");
            dataFrame.Write().Mode("ignore").Csv("output.csv");
            dataFrame.Write().Mode("append").Csv("output.csv");
            dataFrame.Write().Mode("error").Csv("output.csv");
        }
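
The four string modes above correspond to Spark's SaveMode enum. A minimal sketch of the typed equivalents, assuming SaveMode from Microsoft.Spark.Sql is in scope:

        static void SaveModeSketch(DataFrame dataFrame)
        {
            // Overwrite replaces any existing output; Ignore silently skips the
            // write if output already exists; Append adds new files; ErrorIfExists
            // (the default, aliased by the string "error") throws if output exists.
            dataFrame.Write().Mode(SaveMode.Overwrite).Csv("output.csv");
            dataFrame.Write().Mode(SaveMode.Ignore).Csv("output.csv");
            dataFrame.Write().Mode(SaveMode.Append).Csv("output.csv");
            dataFrame.Write().Mode(SaveMode.ErrorIfExists).Csv("output.csv");
        }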
Example #5
        static void Main(string[] args)
        {
            var spark = SparkSession.Builder().Config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
                        .Config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
                        .GetOrCreate();

            spark.Range(100).WithColumn("name", Functions.Lit("Sammy")).Write().Mode("overwrite").Parquet("/tmp/delta-sql-demo");
            spark.Sql("CONVERT TO DELTA parquet.`/tmp/delta-sql-demo");
            spark.Sql("SELECT * FROM delta.`/tmp/delta-sql-demo`").Show();
        }
        static void Main(string[] args)
        {
            var spark     = SparkSession.Builder().GetOrCreate();
            var dataFrame = spark.Read().Option("sep", ",").Option("header", "false")
                            .Schema("greeting string, first_number int, second_number float")
                            .Csv("csv_file.csv");

            dataFrame.PrintSchema();
            dataFrame.Show();
        }
Example #7
        static void Main(string[] args)
        {
            var spark = SparkSession.Builder().GetOrCreate();

            var dataFrame = spark.Sql("SELECT ID FROM range(1000)");

            var add100 = Udf<int?, int>(input => (input + 100) ?? 100);

            dataFrame.Select(add100(dataFrame["ID"])).Show();
        }
        private static void Evaluate(SparkSession session, Action<AnalyzerContext, IMetricsRepository> func)
        {
            DataFrame data = FixtureSupport.GetDFFull(session);

            AnalyzerContext results = CreateAnalysis().Run(data, Option<IStateLoader>.None,
                                                           Option<IStatePersister>.None);

            IMetricsRepository repository = CreateRepository();

            func(results, repository);
        }
Example #9
        // Method called from Main to start a Spark session with a given number of cores and instances and solve the sudokus with the Sudokusolution() method.
        //private static void Sudokures(string cores, string nodes, string mem, int nrows){
        private static void Sudokures(int nrows)
        {
            // Initialize the Spark session
            SparkSession spark = SparkSession
                                 .Builder()
                                 .Config("spark.executor.memory", "4G")
                                 .GetOrCreate();
            //.AppName("Resolution of " + nrows + " sudokus using DlxLib with " + cores + " cores and " + nodes + " instances")
            //.Config("spark.driver.cores", cores)
            //.Config("spark.executor.instances", nodes)
            //.Config("spark.executor.memory", mem)
            //.GetOrCreate();

            // Load the CSV file into a DataFrame
            DataFrame df = spark
                           .Read()
                           .Option("header", true)
                           .Option("inferSchema", true)
                           .Csv(_filePath);

            // Limit the DataFrame to the number of rows requested by the caller
            DataFrame df2 = df.Limit(nrows);

            // Stopwatch that times only the sudoku resolution
            var watch2 = new System.Diagnostics.Stopwatch();

            watch2.Start();

            // Create the Spark user-defined function
            spark.Udf().Register<string, string>(
                "SukoduUDF",
                (sudoku) => Sudokusolution(sudoku));

            // Call the UDF from a new Spark DataFrame that will also contain the results
            df2.CreateOrReplaceTempView("Resolved");
            DataFrame sqlDf = spark.Sql("SELECT Sudokus, SukoduUDF(Sudokus) as Resolution from Resolved");

            sqlDf.Show();

            watch2.Stop();

            Console.WriteLine();
            Console.WriteLine($"Execution Time for {nrows} sudoku resolutions: {watch2.ElapsedMilliseconds} ms");
            //Console.WriteLine($"Execution Time for " + nrows + " sudoku resolution with " + cores + " core and " + nodes + " instance: " + watch2.ElapsedMilliseconds + " ms");
            Console.WriteLine();

            spark.Stop();
        }
Example #10
 /// <summary>
 /// Drops tables in <paramref name="tableNames"/> after calling <paramref name="action"/>.
 /// </summary>
 /// <param name="spark">The <see cref="SparkSession"/></param>
 /// <param name="tableNames">Names of the tables to drop</param>
 /// <param name="action"><see cref="Action"/> to execute.</param>
 public static void WithTable(SparkSession spark, IEnumerable<string> tableNames, Action action)
 {
     try
     {
         action();
     }
     finally
     {
         tableNames.ToList().ForEach(name => spark.Sql($"DROP TABLE IF EXISTS {name}"));
     }
 }
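
A minimal usage sketch for the WithTable helper above; the table name and DDL are illustrative:

 // Create a throwaway table, query it, and rely on WithTable to issue
 // DROP TABLE IF EXISTS even if the body throws.
 WithTable(spark, new[] { "test_table" }, () =>
 {
     spark.Sql("CREATE TABLE test_table USING parquet AS SELECT id FROM range(10)");
     spark.Sql("SELECT COUNT(*) FROM test_table").Show();
 });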
        static void Main(string[] args)
        {
            var spark = SparkSession.Builder().AppName("Creator").GetOrCreate();

            CreateByReadingData(spark);
            CreateUsingRange(spark);
            CreateUsingRangeInSql(spark);
            CreateUsingRangeAndDataFrameAPI(spark);
            CreateUsingBuiltInType(spark);
            CreateUsingGenericRowAndStructType(spark);
        }
        public async Task TestGetSparkSession()
        {
            SparkSessionCollection sparkSessions = (await SparkSessionClient.GetSparkSessionsAsync()).Value;

            foreach (SparkSession expectedSparkSession in sparkSessions.Sessions)
            {
                SparkSession actualSparkSession = await SparkSessionClient.GetSparkSessionAsync(expectedSparkSession.Id);

                ValidateSparkSession(expectedSparkSession, actualSparkSession);
            }
        }
        static void Main(string[] args)
        {
            var spark = SparkSession.Builder().GetOrCreate();

            DataFrameReader reader =
                spark.Read().Format("csv").Option("header", true).Option("sep", ",");

            var dataFrame = reader.Load("./csv_file.csv");

            dataFrame.Show();
        }
        static void Main(string[] args)
        {
            if (args.Length != 2)
            {
                Console.Error.WriteLine(
                    "Usage: Remember to include input and output path as arguments");
                Environment.Exit(1);
            }

            var sparkConf = SparkConfUtils.GetSparkConfigurationForFilePath(args);

            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("Streaming example using Spark.NET")
                                 .GetOrCreate();

            if (sparkConf != null)
            {
                sparkConf.ToList().ForEach(kv => { spark.Conf().Set(kv.Key, kv.Value); });
            }


            var events = spark
                         .ReadStream()
                         .Format("eventhubs")
                         .Options(EventHubConnection.GetEventHubConnectionSettings(eventHubPartitionCount: 2))
                         .Load();

            var processedEvents = events
                                  .Select(
                                      FromJson(Col("body").Cast("string"), "temperature String, humidity String").As("Raw"),
                                      Col("properties"),
                                      Col("enqueuedTime"))
                                  .WithColumn("Raw.temperature", Col("Raw.temperature").Cast("double"))
                                  .WithColumn("Raw.humidity", Col("Raw.humidity").Cast("double"))
                                  .WithColumnRenamed("Raw.temperature", "temperature")
                                  .WithColumnRenamed("Raw.humidity", "humidity")
                                  .WithColumn("temperatureAlert", Col("temperature") >= 40)
                                  .SelectExpr("temperature", "humidity", "properties", "enqueuedTime", "temperatureAlert");

            processedEvents.PrintSchema();


            var streamingQuery = processedEvents
                                 .WriteStream()
                                 .OutputMode(OutputMode.Append)
                                 .Format("console")
                                 .Option("path", args[0])
                                 .Option("checkpointLocation", args[1])
                                 .Start();

            streamingQuery.AwaitTermination();
        }
Example #15
 public static DeltaTable ConvertToDelta(
     SparkSession spark,
     string identifier,
     string partitionSchema) =>
 new DeltaTable(
     (JvmObjectReference)SparkEnvironment.JvmBridge.CallStaticJavaMethod(
         s_deltaTableClassName,
         "convertToDelta",
         spark,
         identifier,
         partitionSchema));
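
For comparison, a hedged sketch of calling the public DeltaTable.ConvertToDelta API that this wrapper backs, assuming /tmp/events is an existing Parquet directory partitioned by a string date column:

 // Convert the Parquet table to Delta in place; the identifier uses the
 // parquet.`path` syntax and the partition schema is a DDL string.
 DeltaTable converted = DeltaTable.ConvertToDelta(
     spark,
     "parquet.`/tmp/events`",
     "date STRING");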
        public void Run(string[] args)
        {
            if (args.Length != 1)
            {
                Console.Error.WriteLine(
                    "Usage: GitHubProjects <path to projects.csv>");
                Environment.Exit(1);
            }

            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("GitHub and Spark Batch")
                                 .GetOrCreate();

            DataFrame projectsDf = spark
                                   .Read()
                                   .Schema("id INT, url STRING, owner_id INT, " +
                                           "name STRING, descriptor STRING, language STRING, " +
                                           "created_at STRING, forked_from INT, deleted STRING, " +
                                           "updated_at STRING")
                                   .Csv(args[0]);

            projectsDf.Show();

            // Drop any rows with NA values
            DataFrameNaFunctions dropEmptyProjects = projectsDf.Na();
            DataFrame            cleanedProjects   = dropEmptyProjects.Drop("any");

            // Remove unnecessary columns
            cleanedProjects = cleanedProjects.Drop("id", "url", "owner_id");
            cleanedProjects.Show();

            // Average number of times each language has been forked
            DataFrame groupedDF = cleanedProjects
                                  .GroupBy("language")
                                  .Agg(Avg(cleanedProjects["forked_from"]));

            // Sort by most forked languages first
            groupedDF.OrderBy(Desc("avg(forked_from)")).Show();

            spark.Udf().Register <string, bool>(
                "MyUDF",
                (date) => DateTime.TryParse(date, out DateTime convertedDate) &&
                (convertedDate > s_referenceDate));

            cleanedProjects.CreateOrReplaceTempView("dateView");

            DataFrame dateDf = spark.Sql(
                "SELECT *, MyUDF(dateView.updated_at) AS datebefore FROM dateView");

            dateDf.Show();

            spark.Stop();
        }
        static void Main(string[] args)
        {
            var spark = SparkSession.Builder().GetOrCreate();

            var dataFrame = spark.Range(10000);

            Func <Column, Column> discPrice = VectorUdf <Int64DataFrameColumn, Int64DataFrameColumn>(
                (id) => id.Multiply(100).Divide(2));

            dataFrame.Select(dataFrame["ID"], discPrice(dataFrame["ID"])).Show();
        }
        static void Main(string[] args)
        {
            // Verify environment variables
            if (args.Length != 4)
            {
                Console.Error.WriteLine("Usage: $TENANT_ID $ADLS_NAME $ADLS_SP_CLIENT_ID $ADLS_SP_CLIENT_SECRET");
                Environment.Exit(1);
            }

            // Specify file path in Azure Data Lake Gen1
            string filePath =
                $"adl://{args[1]}.azuredatalakestore.net/parquet/people.parquet";

            // Create SparkSession
            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("Azure Data Lake Storage example using .NET for Apache Spark")
                                 .Config("fs.adl.impl", "org.apache.hadoop.fs.adl.AdlFileSystem")
                                 .Config("fs.adl.oauth2.access.token.provider.type", "ClientCredential")
                                 .Config("fs.adl.oauth2.client.id", args[2])
                                 .Config("fs.adl.oauth2.credential", args[3])
                                 .Config("fs.adl.oauth2.refresh.url", $"https://login.microsoftonline.com/{args[0]}/oauth2/token")
                                 .GetOrCreate();

            // Create sample data
            var data = new List <GenericRow>
            {
                new GenericRow(new object[] { 1, "John Doe" }),
                new GenericRow(new object[] { 2, "Jane Doe" }),
                new GenericRow(new object[] { 3, "Foo Bar" })
            };

            // Create schema for sample data
            var schema = new StructType(new List<StructField>()
            {
                new StructField("Id", new IntegerType()),
                new StructField("Name", new StringType()),
            });

            // Create DataFrame using data and schema
            DataFrame df = spark.CreateDataFrame(data, schema);

            // Print DataFrame
            df.Show();

            // Write DataFrame to Azure Data Lake Gen1
            df.Write().Mode(SaveMode.Overwrite).Parquet(filePath);

            // Read saved DataFrame from Azure Data Lake Gen1
            DataFrame readDf = spark.Read().Parquet(filePath);

            // Print DataFrame
            readDf.Show();
        }
Example #19
 private static DataFrame LoadAnomalyDetectionData(IEnumerable<object[]> rows) =>
     SparkSession.Builder().GetOrCreate().CreateDataFrame(
         rows.Select(row => new GenericRow(row)),
         new StructType(new List<StructField>
         {
             new StructField("id", new IntegerType()),
             new StructField("productName", new StringType()),
             new StructField("description", new StringType()),
             new StructField("priority", new StringType()),
             new StructField("numViews", new IntegerType()),
         }));
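
A short usage sketch for the loader above; the row values are made up for illustration and must match the five-field schema:

 // Each object[] supplies id, productName, description, priority, numViews.
 DataFrame data = LoadAnomalyDetectionData(new List<object[]>
 {
     new object[] { 1, "Thingy A", "awesome thing.", "high", 0 },
     new object[] { 2, "Thingy B", "available tomorrow", "low", 0 },
 });
 data.Show();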
        public void ExecuteSparkStatementSync()
        {
            // Environment variable with the Synapse workspace endpoint.
            string workspaceUrl = TestEnvironment.WorkspaceUrl;

            // Environment variable with the Synapse Spark pool name.
            string sparkPoolName = TestEnvironment.SparkPoolName;

            #region Snippet:SparkSessionSample1SparkSessionClient
            SparkSessionClient client = new SparkSessionClient(new Uri(workspaceUrl), sparkPoolName, new DefaultAzureCredential());
            #endregion

            #region Snippet:SparkSessionSample1StartSparkSession
            SparkSessionOptions request = new SparkSessionOptions(name: $"session-{Guid.NewGuid()}")
            {
                DriverMemory   = "28g",
                DriverCores    = 4,
                ExecutorMemory = "28g",
                ExecutorCores  = 4,
                ExecutorCount  = 2
            };

            SparkSession sessionCreated = client.CreateSparkSession(request);
            #endregion

            #region Snippet:SparkSessionSample1GetSparkSession
            SparkSession session = client.GetSparkSession(sessionCreated.Id);
            Debug.WriteLine($"Session is returned with name {session.Name} and state {session.State}");
            #endregion

            #region Snippet:SparkSessionSample1ExecuteSparkStatement
            SparkStatementOptions sparkStatementRequest = new SparkStatementOptions
            {
                Kind = SparkStatementLanguageType.Spark,
                Code = @"print(""Hello world\n"")"
            };
            SparkStatement statementCreated = client.CreateSparkStatement(sessionCreated.Id, sparkStatementRequest);
            #endregion

            #region Snippet:SparkSessionSample1GetSparkStatement
            SparkStatement statement = client.GetSparkStatement(sessionCreated.Id, statementCreated.Id);
            Debug.WriteLine($"Statement is returned with id {statement.Id} and state {statement.State}");
            #endregion

            #region Snippet:SparkSessionSample1CancelSparkStatement
            SparkStatementCancellationResult cancellationResult = client.CancelSparkStatement(sessionCreated.Id, statementCreated.Id);
            Debug.WriteLine($"Statement is cancelled with message {cancellationResult.Msg}");
            #endregion

            #region Snippet:SparkSessionSample1StopSparkSession
            Response operation = client.CancelSparkSession(sessionCreated.Id);
            #endregion
        }
Example #21
        static void Main(string[] args)
        {
            var spark = SparkSession
                        .Builder()
                        .GetOrCreate();

            var filtered = spark.Read().Parquet("1.parquet")
                           .Filter(Functions.Col("event_type") == Functions.Lit(999));

            filtered.Write().Mode("overwrite").Parquet("output.parquet");
            Console.WriteLine($"Wrote: {filtered.Count()} rows");
        }
        private static void Main(string[] args)
        {
            var spark = SparkSession.Builder()
                        .Config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
                        .GetOrCreate();

            if (!DeltaTable.IsDeltaTable("parquet.`/tmp/delta-demo`"))
            {
                spark.Range(1000).WithColumn("name", Lit("Sammy")).Write().Mode("overwrite")
                .Parquet("/tmp/delta-demo");
                DeltaTable.ConvertToDelta(spark, "parquet.`/tmp/delta-demo`");
            }

            var delta = DeltaTable.ForPath("/tmp/delta-demo");

            delta.ToDF().OrderBy(Desc("Id")).Show();

            spark.Range(5, 500).WithColumn("name", Lit("Lucy")).Write().Mode("append").Format("delta")
            .Save("/tmp/delta-demo");

            delta.Update(Expr("id > 500"), new Dictionary<string, Column>
            {
                { "id", Lit(999) }
            });

            delta.Delete(Column("id").EqualTo(999));

            spark.Range(100000, 100100).Write().Format("delta").Mode("append").Save("/tmp/delta-demo");

            delta.History().Show(1000, 10000);

            spark.Read().Format("delta").Option("versionAsOf", 0).Load("/tmp/delta-demo").OrderBy(Desc("Id"))
            .Show();

            spark.Read().Format("delta").Option("timestampAsOf", "2021-10-22 22:03:36")
            .Load("/tmp/delta-demo").OrderBy(Desc("Id")).Show();

            var newData = spark.Range(10).WithColumn("name", Lit("Ed"));

            delta.Alias("target")
            .Merge(newData.Alias("source"), "target.id = source.id")
            .WhenMatched(newData["id"].Mod(2).EqualTo(0)).Update(new Dictionary<string, Column>
            {
                { "name", newData["name"] }
            })
            .WhenMatched(newData["id"].Mod(2).EqualTo(1)).Delete()
            .WhenNotMatched().InsertAll()
            .Execute();

            delta.ToDF().OrderBy("id").Show(1000, 10000);

            delta.Vacuum(1F);
        }
        static void CreateUsingRangeAndDataFrameAPI(SparkSession spark)
        {
            Console.WriteLine("spark.Sql");
            var dataFrame = spark.Sql("select id from range(1000)");

            dataFrame.Show(5);

            /*
             +---+
             | id|
             +---+
             |  0|
             |  1|
             |  2|
             |  3|
             |  4|
             +---+
             */

            Console.WriteLine("spark.Sql().WithColumn");
            dataFrame = dataFrame.WithColumn("Another Column", Functions.Lit("Literal"));
            dataFrame.Show(5);

            /*
             +---+--------------+
             | id|Another Column|
             +---+--------------+
             |  0|       Literal|
             |  1|       Literal|
             |  2|       Literal|
             |  3|       Literal|
             |  4|       Literal|
             +---+--------------+
             */

            Console.WriteLine("spark.Sql().WithColumn");
            dataFrame = dataFrame.WithColumn("Mod", Functions.Pmod(Functions.Col("id"), Functions.Lit(2)));
            dataFrame.Show(5);

            /*
             +---+--------------+---+
             | id|Another Column|Mod|
             +---+--------------+---+
             |  0|       Literal|  0|
             |  1|       Literal|  1|
             |  2|       Literal|  0|
             |  3|       Literal|  1|
             |  4|       Literal|  0|
             +---+--------------+---+
             */
        }
Example #24
 internal TpchBase(string tpchRoot, SparkSession spark)
 {
     // Load all the TPC-H tables.
     tpchRoot += Path.DirectorySeparatorChar;
     _customer = spark.Read().Parquet($"{tpchRoot}customer");
     _lineitem = spark.Read().Parquet($"{tpchRoot}lineitem");
     _nation   = spark.Read().Parquet($"{tpchRoot}nation");
     _orders   = spark.Read().Parquet($"{tpchRoot}orders");
     _part     = spark.Read().Parquet($"{tpchRoot}part");
     _partsupp = spark.Read().Parquet($"{tpchRoot}partsupp");
     _region   = spark.Read().Parquet($"{tpchRoot}region");
     _supplier = spark.Read().Parquet($"{tpchRoot}supplier");
 }
 protected override DataFrame ReadFromInternal(SparkSession sparkSession,
                                               SqlServerDataTableInputEndpoint inputEndpoint,
                                               ProjectContext projectContext)
 {
      return sparkSession.Read()
             .Format("jdbc")
             .Option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
             .Option("url", inputEndpoint.ConnectionUrl)
             .Option("dbtable", inputEndpoint.DataTable)
             .Option("user", inputEndpoint.UserName)
             .Option("password", inputEndpoint.Password)
             .Load();
 }
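
The same JDBC read can be sketched without the endpoint abstraction; the connection values below are placeholders, and the SQL Server JDBC driver jar must be on the Spark classpath:

      // "driver", "url", "dbtable", "user" and "password" are standard
      // Spark JDBC source options; only the values here are hypothetical.
      DataFrame table = sparkSession.Read()
                        .Format("jdbc")
                        .Option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
                        .Option("url", "jdbc:sqlserver://localhost:1433;databaseName=MyDb")
                        .Option("dbtable", "dbo.MyTable")
                        .Option("user", "spark_reader")
                        .Option("password", "<secret>")
                        .Load();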
        static void Main(string[] args)
        {
            var path = args.FirstOrDefault();

            var spark = SparkSession
                        .Builder()
                        .GetOrCreate();

            var dataFrame = spark.Read().Option("header", "true").Csv(path);
            var count     = dataFrame.Filter(Col("name") == "Ed Elliott").Count();

            Console.WriteLine($"There are {count} row(s)");
        }
Example #27
 public Worker(ILogger <Worker> logger)
 {
     _logger = logger;
     spark   = SparkSession.Builder()
               .AppName("meuovo")
               .GetOrCreate();
     input = spark.ReadStream()
             .Format("kafka")
             .Option("kafka.bootstrap.servers", "localhost:9092")
             .Option("subscribe", "b7f45352-6abf-436b-9c4a-98141699728c")
             .Load()
             .SelectExpr("CAST(value AS STRING)");
 }
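
The constructor above only defines the input stream. A minimal sketch of how the worker might start and await the query, assuming console output and the Microsoft.Spark.Sql.Streaming types:

 // Echo each Kafka message value to the console as it arrives;
 // AwaitTermination blocks until the query stops or fails.
 StreamingQuery query = input
                        .WriteStream()
                        .OutputMode(OutputMode.Append)
                        .Format("console")
                        .Start();

 query.AwaitTermination();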
        static void Main(string[] args)
        {
            // Verify environment variables
            if (args.Length != 2)
            {
                Console.Error.WriteLine("Usage: $AZURE_STORAGE_ACCOUNT $AZURE_STORAGE_KEY");
                Environment.Exit(1);
            }

            // Specify file path in Azure Storage
            string filePath =
                $"wasbs://dotnet-spark@{args[0]}.blob.core.windows.net/json/people.json";

            // Create SparkSession
            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("Azure Storage example using .NET for Apache Spark")
                                 .Config("fs.wasbs.impl", "org.apache.hadoop.fs.azure.NativeAzureFileSystem")
                                 .Config($"fs.azure.account.key.{args[0]}.blob.core.windows.net", args[1])
                                 .GetOrCreate();

            // Create sample data
            var data = new List <GenericRow>
            {
                new GenericRow(new object[] { 1, "John Doe" }),
                new GenericRow(new object[] { 2, "Jane Doe" }),
                new GenericRow(new object[] { 3, "Foo Bar" })
            };

            // Create schema for sample data
            var schema = new StructType(new List<StructField>()
            {
                new StructField("Id", new IntegerType()),
                new StructField("Name", new StringType()),
            });

            // Create DataFrame using data and schema
            DataFrame df = spark.CreateDataFrame(data, schema);

            // Print DataFrame
            df.Show();

            // Write DataFrame to Azure Storage
            df.Write().Mode(SaveMode.Overwrite).Json(filePath);

            // Read saved DataFrame from Azure Storage
            DataFrame readDf = spark.Read().Json(filePath);

            // Print DataFrame
            readDf.Show();
        }
        private static void CreateUsingGenericRowAndStructType(SparkSession spark)
        {
            Console.WriteLine("spark.CreateDataFrame using StructType");
            var rowOne = new GenericRow(new object[]
            {
                "columnOne Row One", 1.1
            });

            var rowTwo = new GenericRow(new object[]
            {
                "columnOne Row Two", null
            });

            var rowThree = new GenericRow(new object[]
            {
                "columnOne Row Three", 3.3
            });

            var rows = new List <GenericRow>()
            {
                rowOne, rowTwo, rowThree
            };

            var structType = new StructType(new List<StructField>()
            {
                new StructField("column one", new StringType(), isNullable: false),
                new StructField("column two", new DoubleType(), isNullable: true)
            });

            var dataFrame = spark.CreateDataFrame(rows, structType);

            dataFrame.Show();

            /*
             +-------------------+----------+
             |         column one|column two|
             +-------------------+----------+
             |  columnOne Row One|       1.1|
             |  columnOne Row Two|      null|
             |columnOne Row Three|       3.3|
             +-------------------+----------+
             */

            dataFrame.PrintSchema();

            /*
             root
              |-- column one: string (nullable = false)
              |-- column two: double (nullable = true)
             */
        }
Example #30
        protected override DataFrame ReadFromInternal(SparkSession sparkSession, TextInputEndpoint inputEndpoint, ProjectContext projectContext)
        {
            var textFiles = inputEndpoint
                            .Files
                            .Select(f => $"s3a://{f.Bucket}/{f.Key}/{f.File}")
                            .ToArray();

            if (textFiles.Length == 0)
            {
                throw new SparkRunnerException("No files could be read by the TextFileInputReader.");
            }

            return sparkSession.Read().Text(textFiles);
        }