/// <summary>
/// Exercises the generic load and save functions: loads each of the supplied inputs,
/// writes projections and tables in several formats, queries the results with SQL and
/// finally drops the two tables it created.
/// </summary>
/// <param name="spark">Session used for every read, write and SQL statement.</param>
/// <param name="parquet">Path loaded without an explicit format and queried directly by file.</param>
/// <param name="json">Path loaded with the "json" format.</param>
/// <param name="csv">Path loaded with the "csv" format.</param>
/// <param name="orc">Path loaded with the ORC reader.</param>
private void RunBasicDatasourceExample(SparkSession spark, string parquet, string json, string csv, string orc)
{
    const string bucketedTable = "people_bucketed";
    const string partitionedBucketedTable = "people_partitioned_bucketed";

    // No format is specified for this first load.
    DataFrame users = spark.Read().Load(parquet);
    users.PrintSchema();

    // Plain save of a two-column projection.
    var nameAndColor = users.Select("name", "favorite_color");
    nameAndColor.Write().Mode(SaveMode.Overwrite).Save("namesPartByColor.parquet");

    // The same data persisted as a table, partitioned by colour and bucketed by name.
    var overwritingWriter = users.Write().Mode(SaveMode.Overwrite);
    overwritingWriter
        .PartitionBy("favorite_color")
        .BucketBy(42, "name")
        .SaveAsTable(partitionedBucketedTable);

    // JSON in, Parquet out.
    DataFrame people = spark.Read().Format("json").Load(json);
    people.PrintSchema();
    people.Select("name", "age")
        .Write()
        .Mode(SaveMode.Overwrite)
        .Format("parquet")
        .Save("namesAndAges.parquet");

    // The CSV load is only demonstrated; its result is not used by anything below.
    var csvReader = spark.Read().Format("csv");
    csvReader
        .Option("sep", ";")
        .Option("inferSchema", true)
        .Option("header", true)
        .Load(csv);

    // ORC round trip with writer options, followed by a bucketed and sorted table.
    var orcOptions = new Dictionary<string, string>
    {
        ["orc.bloom.filter.columns"] = "favorite_color",
        ["orc.dictionary.key.threshold"] = "1.0",
        ["orc.column.encoding.direct"] = "name"
    };
    DataFrame orcUsers = spark.Read().Orc(orc);
    orcUsers.Write()
        .Format("orc")
        .Options(orcOptions)
        .Mode(SaveMode.Overwrite)
        .Save("users_with_options.orc");

    orcUsers.Write()
        .BucketBy(42, "name")
        .SortBy("favorite_color")
        .SaveAsTable(bucketedTable);

    // Query the Parquet file directly, then both tables, then clean the tables up.
    spark.Sql($"SELECT * FROM parquet.`{parquet}`").Show();

    string[] createdTables = { bucketedTable, partitionedBucketedTable };
    foreach (string table in createdTables)
    {
        spark.Sql($"SELECT * FROM {table}").Show();
    }

    foreach (string table in createdTables)
    {
        spark.Sql($"DROP TABLE IF EXISTS {table}");
    }
}
/// <summary>
/// Basic DataFrame walkthrough over a people JSON file: prints the data and its schema,
/// runs a few projections, filters and an aggregation, then queries the same data through
/// a session-scoped temp view and a global temp view.
/// </summary>
/// <param name="spark">Session used to read the file and run the SQL statements.</param>
/// <param name="peopleJsonPath">
/// Path of the JSON file to load. Defaults to the location that used to be hard-coded in
/// the method body, so existing callers keep their behavior.
/// </param>
static void BasicDfExample(
    SparkSession spark,
    string peopleJsonPath = "/Users/ed/spark-2.4.6-bin-without-hadoop/examples/src/main/resources/people.json")
{
    var dataFrame = spark.Read().Json(peopleJsonPath);

    dataFrame.Show();
    dataFrame.PrintSchema();

    dataFrame.Select("name").Show();

    // The operator form and the method form of the same expression are both shown.
    dataFrame.Select(dataFrame["name"], dataFrame["age"] + 1).Show();
    dataFrame.Select(dataFrame["name"], dataFrame["age"].Plus(1)).Show();

    dataFrame.Filter(dataFrame["age"] > 21).Show();
    dataFrame.Filter(dataFrame["age"].Gt(21)).Show();

    dataFrame.GroupBy(dataFrame["age"]).Count().Show();

    // Session-scoped view. The query result was previously built but never displayed.
    dataFrame.CreateOrReplaceTempView("people");
    var sqlDataFrame = spark.Sql("SELECT * FROM people");
    sqlDataFrame.Show();

    // Global view: queried through the global_temp prefix, from this session and a new one.
    dataFrame.CreateGlobalTempView("people");
    spark.Sql("SELECT * FROM global_temp.people").Show();
    spark.NewSession().Sql("SELECT * FROM global_temp.people").Show();
}
/// <summary>
/// Configures a <see cref="Bucketizer"/>, checks that its getters return the configured
/// values, and verifies that transforming a 100-row range adds the output column.
/// </summary>
public void TestBucketizer()
{
    const string uid = "uid";
    const string handleInvalid = "skip";
    const string inputColumn = "input_col";
    const string outputColumn = "output_col";
    double[] splits = { double.MinValue, 0.0, 10.0, 50.0, double.MaxValue };

    var bucketizer = new Bucketizer(uid);
    bucketizer
        .SetInputCol(inputColumn)
        .SetOutputCol(outputColumn)
        .SetHandleInvalid(handleInvalid)
        .SetSplits(splits);

    Assert.Equal(handleInvalid, bucketizer.GetHandleInvalid());
    Assert.Equal(uid, bucketizer.Uid());

    DataFrame source = _spark.Sql("SELECT ID as input_col from range(100)");
    DataFrame bucketed = bucketizer.Transform(source);

    // The transform must have appended the configured output column.
    Assert.Contains(bucketed.Schema().Fields, field => field.Name == outputColumn);

    Assert.Equal(inputColumn, bucketizer.GetInputCol());
    Assert.Equal(outputColumn, bucketizer.GetOutputCol());
    Assert.Equal(splits, bucketizer.GetSplits());
}
/// <summary>
/// Hive integration sample: builds a Hive-enabled session, loads the JSON file, makes sure
/// the target database exists, overwrites the target table with the loaded data, and then
/// logs the row count and prints every row of that table.
/// </summary>
/// <remarks>
/// Reads <c>jsonFilePath</c>, <c>dbName</c>, <c>tableName</c> and <c>logger</c>, and assigns
/// <c>session</c>; all of these are declared outside this method.
/// </remarks>
private static void HiveDataFrame()
{
    session = SparkSession.Builder()
        .EnableHiveSupport()
        .Config("spark.master", "yarn")
        .Config("spark.app.name", "HiveDataFrame")
        .Config("spark.sql.warehouse.dir", "/user/hive/warehouse")
        .GetOrCreate();

    var people = session.Read().Json(jsonFilePath);

    // Create the database when it is missing, then switch to it.
    logger.LogInfo("****Create table if not exists****");
    session.Sql($"CREATE DATABASE IF NOT EXISTS {dbName}");
    logger.LogInfo("****Database Created****");
    session.Sql($"USE {dbName}");

    // (Re)create the table from the loaded data.
    logger.LogInfo("****Create Table operation started****");
    var tableWriter = people.Write().Mode(SaveMode.Overwrite);
    tableWriter.SaveAsTable(tableName);
    logger.LogInfo("****Table Created successfully****");

    // Read the table back and report what it contains.
    var storedPeople = session.Table(tableName);
    logger.LogInfo($"****Table count in database {dbName}: {storedPeople.Count()}****");

    var rows = storedPeople.Collect();
    logger.LogInfo("**********************************************");
    foreach (var row in rows)
    {
        Console.WriteLine("{0}", row);
    }

    logger.LogInfo("*********************************************");
    logger.LogInfo("Executed Successfully.................");
}
/// <summary>
/// End-to-end check of <see cref="Bucketizer"/>: setters and getters, transform output,
/// save/load round trip, and the generic parameter API (explain, get, set, clear).
/// </summary>
public void TestBucketizer()
{
    const string uid = "uid";
    const string handleInvalid = "skip";
    const string inputColumn = "input_col";
    const string outputColumn = "output_col";
    double[] splits = { double.MinValue, 0.0, 10.0, 50.0, double.MaxValue };

    var bucketizer = new Bucketizer(uid);
    bucketizer
        .SetInputCol(inputColumn)
        .SetOutputCol(outputColumn)
        .SetHandleInvalid(handleInvalid)
        .SetSplits(splits);

    Assert.Equal(handleInvalid, bucketizer.GetHandleInvalid());
    Assert.Equal(uid, bucketizer.Uid());

    DataFrame source = _spark.Sql("SELECT ID as input_col from range(100)");
    DataFrame bucketed = bucketizer.Transform(source);

    // The transform must have appended the configured output column.
    Assert.Contains(bucketed.Schema().Fields, field => field.Name == outputColumn);

    Assert.Equal(inputColumn, bucketizer.GetInputCol());
    Assert.Equal(outputColumn, bucketizer.GetOutputCol());
    Assert.Equal(splits, bucketizer.GetSplits());

    // Persisting and reloading must keep the uid.
    using (var scratch = new TemporaryDirectory())
    {
        string location = Path.Join(scratch.Path, "bucket");
        bucketizer.Save(location);

        Bucketizer reloaded = Bucketizer.Load(location);
        Assert.Equal(bucketizer.Uid(), reloaded.Uid());
    }

    Assert.NotEmpty(bucketizer.ExplainParams());

    // Generic parameter API, exercised through the "handleInvalid" parameter.
    Param invalidHandling = bucketizer.GetParam("handleInvalid");
    Assert.NotEmpty(invalidHandling.Doc);
    Assert.NotEmpty(invalidHandling.Name);
    Assert.Equal(invalidHandling.Parent, bucketizer.Uid());
    Assert.NotEmpty(bucketizer.ExplainParam(invalidHandling));

    bucketizer.Set(invalidHandling, "keep");
    Assert.Equal("keep", bucketizer.GetHandleInvalid());

    // After clearing the parameter the getter is expected to report "error".
    var cleared = bucketizer.Clear(invalidHandling);
    Assert.Equal("error", cleared.GetHandleInvalid());
}
/// <summary>
/// Computes the monthly review demand for the ten most-reviewed brands, stores the result
/// in <c>context.BrandDemand</c> (replacing what was there), and shows the per-brand demand.
/// </summary>
/// <param name="spark">
/// Session in which the <c>ElectronicsReviews</c> and <c>ElectronicsMetadata</c> views are
/// queried; the result is registered as the <c>BrandsDemand</c> temp view.
/// </param>
/// <param name="context">Context supplying <c>BrandSentiment</c> and receiving <c>BrandDemand</c>.</param>
private static void AnalyseBrandDemand(SparkSession spark, CustomerSentimentContext context)
{
    Console.WriteLine("Analysing popular brand demand");

    // Brand names come from data and may contain quotes or backslashes. Spark SQL string
    // literals use backslash escapes, so escape both before embedding a name in a query.
    string ToSqlLiteral(string value) =>
        "'" + (value ?? string.Empty).Replace("\\", "\\\\").Replace("'", "\\'") + "'";

    var brands = context.BrandSentiment
        .OrderByDescending(b => b.ReviewCount)
        .ThenBy(b => b.SentimentRank)
        .Take(10)
        .Select(b => b.Brand)
        .ToArray();

    if (brands.Length == 0)
    {
        // "IN ()" is not valid SQL, so there is nothing to query; stored demand is left untouched.
        Console.WriteLine("No brands available for demand analysis");
        return;
    }

    var brandList = string.Join(",", brands.Select(ToSqlLiteral));

    var brandsDemand = spark.Sql(
        "SELECT EM.brand, FROM_UNIXTIME(ER.unix_time, 'MM') as month, COUNT(1) as demand " +
        "FROM ElectronicsReviews ER " +
        "JOIN ElectronicsMetadata EM ON EM.asin = ER.asin " +
        $"WHERE EM.brand IN ({brandList}) " +
        "GROUP BY EM.brand, FROM_UNIXTIME(ER.unix_time, 'MM') " +
        "ORDER BY EM.brand, FROM_UNIXTIME(ER.unix_time, 'MM')");

    // Cached because the result is collected below and re-queried once per brand.
    brandsDemand.Cache();
    brandsDemand.CreateOrReplaceTempView("BrandsDemand");

    var items = Mapper.MapRows(
        brandsDemand.Collect(),
        r => new BrandDemand
        {
            Brand = r.GetAs<string>(0),
            Month = int.Parse(r.GetAs<string>(1)),
            Demand = r.GetAs<int>(2)
        },
        o => $"{o.Brand}-{o.Month}");

    // Replace the previously stored demand rows with the fresh ones.
    context.BrandDemand.RemoveRange(context.BrandDemand);
    context.BrandDemand.AddRange(items);
    context.SaveChanges();

    foreach (var brand in brands)
    {
        Console.WriteLine($"Analysing consumer demand for '{brand}'");

        var brandDemand = spark.Sql(
            "SELECT * " +
            "FROM BrandsDemand " +
            $"WHERE brand = {ToSqlLiteral(brand)}");

        brandDemand.Show();
    }
}
/// <summary>
/// Builds DataFrames purely from SQL over <c>range(1000)</c>, adding a literal column and
/// then a modulo column, and shows the first five rows of each.
/// </summary>
/// <param name="spark">Session used to run the SQL statements.</param>
static void CreateUsingRangeInSql(SparkSession spark)
{
    Console.WriteLine("Range in SQL");

    var idsOnly = spark.Sql("select id from range(1000)");
    idsOnly.Show(5);
    /*
     * +---+
     * | id|
     * +---+
     * |  0|
     * |  1|
     * |  2|
     * |  3|
     * |  4|
     * +---+
     */

    var withLiteral = spark.Sql("select id, 'Literal' as `Another Column` from range(1000)");
    withLiteral.Show(5);
    /*
     * +---+--------------+
     * | id|Another Column|
     * +---+--------------+
     * |  0|       Literal|
     * |  1|       Literal|
     * |  2|       Literal|
     * |  3|       Literal|
     * |  4|       Literal|
     * +---+--------------+
     */

    var withModulo = spark.Sql(
        "select id, 'Literal' as `Another Column`, pmod(id, 2) as `Mod` from range(1000)");
    withModulo.Show(5);
    /*
     * +---+--------------+---+
     * | id|Another Column|Mod|
     * +---+--------------+---+
     * |  0|       Literal|  0|
     * |  1|       Literal|  1|
     * |  2|       Literal|  0|
     * |  3|       Literal|  1|
     * |  4|       Literal|  0|
     * +---+--------------+---+
     */
}
/// <summary>
/// Verifies that <c>TryAddThread</c> accepts a thread the first time and rejects the same
/// thread on a second attempt.
/// </summary>
public void TestTryAddThread()
{
    TimeSpan interval = TimeSpan.FromMinutes(30);
    using var pool = new JvmThreadPoolGC(_loggerService, _jvmBridge, interval);

    var worker = new Thread(() => _spark.Sql("SELECT TRUE"));
    worker.Start();

    bool addedFirstTime = pool.TryAddThread(worker);
    Assert.True(addedFirstTime);

    // The thread is already registered, so adding it again must be refused.
    bool addedSecondTime = pool.TryAddThread(worker);
    Assert.False(addedSecondTime);

    worker.Join();
}
/// <summary>
/// Starts a Spark session with the requested executor settings, loads the sudoku CSV,
/// solves the first <paramref name="nrows"/> puzzles through a registered UDF, and shows
/// the puzzles next to their resolution.
/// </summary>
/// <param name="file_path">Path of the CSV file to read.</param>
/// <param name="cores">Value for <c>spark.executor.cores</c>.</param>
/// <param name="nodes">Value for <c>spark.executor.instances</c>.</param>
/// <param name="nrows">Maximum number of rows to process.</param>
static void runSpark(string file_path, string cores, string nodes, int nrows)
{
    string appName =
        $"Resolution de {nrows} sudokus par évolution combinatoire de {cores} noyau(x) et {nodes} noeud(s)";

    // Session configured from the arguments.
    var sessionBuilder = SparkSession.Builder().AppName(appName);
    SparkSession spark = sessionBuilder
        .Config("spark.executor.cores", cores)
        .Config("spark.executor.instances", nodes)
        .GetOrCreate();

    // Read the CSV with an explicit two-column schema and keep only the first nrows rows.
    var csvReader = spark.Read()
        .Option("header", true)
        .Option("inferSchema", true)
        .Schema("quizzes string, solutions string");
    DataFrame puzzles = csvReader.Csv(file_path);
    DataFrame limitedPuzzles = puzzles.Limit(nrows);

    // Expose the solver to SQL.
    spark.Udf().Register<string, string>("SukoduUDF", grid => sudokusolution(grid));

    limitedPuzzles.CreateOrReplaceTempView("Resolved");
    DataFrame solved = spark.Sql("SELECT quizzes, SukoduUDF(quizzes) as Resolution from Resolved");
    solved.Show();

    spark.Stop();
}
/// <summary>
/// Starts a Spark session with the requested executor settings, loads the sudoku CSV,
/// solves the first <paramref name="nrows"/> puzzles through a registered UDF, shows the
/// result, stops the session and prints a final marker line.
/// </summary>
/// <param name="file_path">Path of the CSV file to read.</param>
/// <param name="cores">Value for <c>spark.executor.cores</c>.</param>
/// <param name="nodes">Value for <c>spark.executor.instances</c>.</param>
/// <param name="nrows">Maximum number of rows to process.</param>
public static void runSpark(string file_path, string cores, string nodes, int nrows)
{
    // Session configured from the arguments.
    var sessionBuilder = SparkSession.Builder().AppName("word_count_sample");
    SparkSession spark = sessionBuilder
        .Config("spark.executor.cores", cores)
        .Config("spark.executor.instances", nodes)
        .GetOrCreate();

    // Read the CSV with an explicit two-column schema and keep only the first nrows rows.
    var csvReader = spark.Read()
        .Option("header", true)
        .Option("inferSchema", true)
        .Schema("quizzes string, solutions string");
    DataFrame puzzles = csvReader.Csv(file_path);
    DataFrame limitedPuzzles = puzzles.Limit(nrows);

    // Expose the solver to SQL.
    spark.Udf().Register<string, string>("SukoduUDF", grid => sudokusolution(grid));

    limitedPuzzles.CreateOrReplaceTempView("Resolved");
    DataFrame solved = spark.Sql("SELECT quizzes, SukoduUDF(quizzes) as Resolution from Resolved");
    solved.Show();

    spark.Stop();
    Console.WriteLine("SCRAPY");
}
/// <summary>
/// Loads the sample CSV file, prints it, registers it as the <c>sample_data</c> temp view
/// and prints the result of querying that view with SQL.
/// </summary>
public static void leerJSON()
{
    SparkSession spark = SparkSession
        .Builder()
        .AppName("word_count_sample")
        .GetOrCreate();

    // A CSV dataset is pointed to by path.
    // The path can be either a single CSV file or a directory of CSV files.
    string path = "data/sample_data.csv";

    DataFrame df = spark.Read().Csv(path);
    df.Show();
    // +------------------+
    // |               _c0|
    // +------------------+
    // |      name;age;job|
    // |Jorge;30;Developer|
    // |  Bob;32;Developer|
    // +------------------+

    // The query below refers to "sample_data", so the DataFrame has to be registered under
    // that name first; nothing in this method registered it before.
    df.CreateOrReplaceTempView("sample_data");

    // Query the data with SQL. (The original note mentions counting names; the statement
    // currently selects every row.)
    DataFrame sqlDf = spark.Sql("SELECT * FROM sample_data");

    // Show results
    sqlDf.Show();

    // Stop Spark session
    spark.Stop();
}
/// <summary>
/// Reads the reviews JSON file with a fixed four-column schema and keeps only the reviews
/// whose <c>asin</c> also appears in the <c>ElectronicsMetadata</c> view.
/// </summary>
/// <param name="reviewPath">Path of the reviews JSON file.</param>
/// <param name="spark">Session used for reading and for querying the metadata view.</param>
/// <returns>The reviews joined against the known item ids, without the helper <c>id</c> column.</returns>
private static DataFrame LoadReviewPathFile(string reviewPath, SparkSession spark)
{
    Console.WriteLine("Loading Electronics_Reviews.json File");

    var reviewFields = new[]
    {
        new StructField("reviewerID", new StringType(), isNullable: false),
        new StructField("asin", new StringType(), isNullable: false),
        new StructField("reviewText", new StringType()),
        new StructField("unixReviewTime", new LongType())
    };
    var reviewSchema = new StructType(reviewFields);

    var reviews = spark.Read().Schema(reviewSchema).Json(reviewPath);

    // Ids of the items present in the metadata view.
    var knownItems = spark.Sql("SELECT asin AS id " + "FROM ElectronicsMetadata");

    // Join on the item id, then discard the column that was only needed for the join.
    var matchOnAsin = reviews["asin"] == knownItems["id"];
    var reviewsOfKnownItems = reviews.Join(knownItems, matchOnAsin).Drop("id");

    Console.WriteLine("Done");
    Console.WriteLine();

    return reviewsOfKnownItems;
}
/// <summary>
/// Checks that a configured <see cref="Tokenizer"/> adds its output column on transform,
/// reports the configured column names and uid, and keeps its uid across save/load.
/// </summary>
public void TestTokenizer()
{
    const string uid = "theUid";
    const string inputColumn = "input_col";
    const string outputColumn = "output_col";

    DataFrame sentences = _spark.Sql(
        "SELECT 'hello I AM a string TO, TOKENIZE' as input_col from range(100)");

    Tokenizer tokenizer = new Tokenizer(uid)
        .SetInputCol(inputColumn)
        .SetOutputCol(outputColumn);

    DataFrame tokenized = tokenizer.Transform(sentences);

    // The transform must have appended the configured output column.
    Assert.Contains(tokenized.Schema().Fields, field => field.Name == outputColumn);
    Assert.Equal(inputColumn, tokenizer.GetInputCol());
    Assert.Equal(outputColumn, tokenizer.GetOutputCol());

    // Persisting and reloading must keep the uid.
    using (var scratch = new TemporaryDirectory())
    {
        string location = Path.Join(scratch.Path, "Tokenizer");
        tokenizer.Save(location);

        Tokenizer reloaded = Tokenizer.Load(location);
        Assert.Equal(tokenizer.Uid(), reloaded.Uid());
    }

    Assert.Equal(uid, tokenizer.Uid());
}
/// <summary>
/// Fits a <see cref="Word2VecModel"/> on a single sentence, checks that the requested
/// number of synonyms is returned, and verifies the uid survives a save/load round trip.
/// </summary>
public void TestWord2VecModel()
{
    const int synonymCount = 2;

    DataFrame documents =
        _spark.Sql("SELECT split('Hi I heard about Spark', ' ') as text");

    Word2Vec estimator = new Word2Vec()
        .SetInputCol("text")
        .SetOutputCol("result")
        .SetMinCount(1);

    Word2VecModel model = estimator.Fit(documents);

    DataFrame synonyms = model.FindSynonyms("Hi", synonymCount);
    Assert.Equal(synonymCount, synonyms.Count());
    synonyms.Show();

    // Persisting and reloading must keep the uid.
    using (var scratch = new TemporaryDirectory())
    {
        string location = Path.Join(scratch.Path, "word2vecModel");
        model.Save(location);

        Word2VecModel reloaded = Word2VecModel.Load(location);
        Assert.Equal(model.Uid(), reloaded.Uid());
    }
}
/// <summary>
/// Computes the monthly review demand per main category, stores the result in
/// <c>context.CategoryDemand</c> (replacing what was there), and shows the demand of each
/// category in turn.
/// </summary>
/// <param name="spark">
/// Session in which the <c>ElectronicsReviews</c> and <c>ElectronicsMetadata</c> views are
/// queried; the result is registered as the <c>CategoryDemand</c> temp view.
/// </param>
/// <param name="context">Context receiving the <c>CategoryDemand</c> rows.</param>
private static void AnalyseCategoryDemand(SparkSession spark, CustomerSentimentContext context)
{
    Console.WriteLine("Analysing category consumer demand");

    // Category names come from data and may contain quotes or backslashes. Spark SQL string
    // literals use backslash escapes, so escape both before embedding a name in a query.
    string ToSqlLiteral(string value) =>
        "'" + (value ?? string.Empty).Replace("\\", "\\\\").Replace("'", "\\'") + "'";

    var categoriesDemand = spark.Sql(
        "SELECT EM.main_cat, FROM_UNIXTIME(ER.unix_time, 'MM') as month, COUNT(1) as demand " +
        "FROM ElectronicsReviews ER " +
        "JOIN ElectronicsMetadata EM ON EM.asin = ER.asin " +
        "GROUP BY EM.main_cat, from_unixtime(ER.unix_time, 'MM') " +
        "ORDER BY EM.main_cat, FROM_UNIXTIME(ER.unix_time, 'MM')");

    // Cached because the result is collected below and re-queried once per category.
    categoriesDemand.Cache();
    categoriesDemand.CreateOrReplaceTempView("CategoryDemand");

    var items = Mapper.MapRows(
        categoriesDemand.Collect(),
        r => new CategoryDemand
        {
            Category = r.GetAs<string>(0),
            Month = int.Parse(r.GetAs<string>(1)),
            Demand = r.GetAs<int>(2)
        },
        o => $"{o.Category}-{o.Month}");

    // Replace the previously stored demand rows with the fresh ones. This method is
    // synchronous, so the save must be synchronous too: an un-awaited SaveChangesAsync()
    // would let the method return before the write finished and hide any failure.
    context.CategoryDemand.RemoveRange(context.CategoryDemand);
    context.CategoryDemand.AddRange(items);
    context.SaveChanges();

    var categories = spark.Sql("SELECT main_cat FROM CategoryDemand GROUP BY main_cat")
        .Collect()
        .Select(r => r.GetAs<string>(0))
        .ToArray();

    foreach (var category in categories)
    {
        Console.WriteLine($"Analysing consumer demand for {category}");

        var categoryDemand = spark.Sql(
            "SELECT * " +
            "FROM CategoryDemand " +
            $"WHERE main_cat = {ToSqlLiteral(category)}");

        categoryDemand.Show();
    }
}
/// <summary>
/// Registers the sentiment UDF, applies it to every row of <c>ElectronicsReviews</c>, and
/// publishes the cached result as the <c>ElectronicsReviewSentiment</c> temp view.
/// </summary>
/// <param name="spark">Session in which the UDF and the view are registered.</param>
private static void ElectronicsReviewsSentimentAnalysis(SparkSession spark)
{
    const string scoringQuery =
        "SELECT *, sentiment_udf(review_text) AS sentiment FROM ElectronicsReviews";

    var udfRegistry = spark.Udf();
    udfRegistry.Register<string, int>("sentiment_udf", reviewText => Sentiment(reviewText));

    var scoredReviews = spark.Sql(scoringQuery);
    scoredReviews.Cache();
    scoredReviews.CreateOrReplaceTempView("ElectronicsReviewSentiment");
}
/// <summary>
/// Signature test for <see cref="DataFrameWriterV2"/>: every builder method must return a
/// <see cref="DataFrameWriterV2"/>, <c>Create</c> must succeed, and the remaining write
/// operations are expected to throw for this table.
/// </summary>
public void TestSignaturesV3_0_X()
{
    var reader = _spark.Read().Schema("age INT, name STRING");
    DataFrame people = reader.Json($"{TestEnvironment.ResourceDirectory}people.json");

    DataFrameWriterV2 writer = people.WriteTo("testtable");

    // Builder-style methods.
    Assert.IsType<DataFrameWriterV2>(writer.Using("json"));
    Assert.IsType<DataFrameWriterV2>(writer.Option("key1", "value"));
    Assert.IsType<DataFrameWriterV2>(writer.Option("key2", true));
    Assert.IsType<DataFrameWriterV2>(writer.Option("key3", 1L));
    Assert.IsType<DataFrameWriterV2>(writer.Option("key4", 2D));

    var optionMap = new Dictionary<string, string>() { { "key", "value" } };
    Assert.IsType<DataFrameWriterV2>(writer.Options(optionMap));

    Assert.IsType<DataFrameWriterV2>(writer.TableProperty("prop", "value"));

    // Start from a clean slate so that Create() does not collide with an existing table.
    _spark.Sql("DROP TABLE IF EXISTS default.testtable");
    writer.Create();

    Assert.IsType<DataFrameWriterV2>(writer.PartitionedBy(people.Col("age")));

    // Expected failure:
    // org.apache.spark.sql.AnalysisException: REPLACE TABLE AS SELECT is only supported
    // with v2 tables.
    Assert.Throws<Exception>(() => writer.Replace());

    // Expected failure (same message as above):
    // org.apache.spark.sql.AnalysisException: REPLACE TABLE AS SELECT is only supported
    // with v2 tables.
    Assert.Throws<Exception>(() => writer.CreateOrReplace());

    // Expected failure:
    // org.apache.spark.sql.AnalysisException: Table default.testtable does not support
    // append in batch mode.
    Assert.Throws<Exception>(() => writer.Append());

    // Expected failure:
    // org.apache.spark.sql.AnalysisException: Table default.testtable does not support
    // overwrite by filter in batch mode.
    Assert.Throws<Exception>(() => writer.Overwrite(people.Col("age")));

    // Expected failure:
    // org.apache.spark.sql.AnalysisException: Table default.testtable does not support
    // dynamic overwrite in batch mode.
    Assert.Throws<Exception>(() => writer.OverwritePartitions());
}
// Called from main to start a Spark session and solve sudokus with Sudokusolution().
// An earlier variant took the number of cores, the number of instances and the executor
// memory as arguments and used them for the application name, spark.driver.cores,
// spark.executor.instances and spark.executor.memory; that configuration is disabled and
// only the executor memory is set, to a fixed value.
private static void Sudokures(int nrows)
{
    // Spark session initialisation.
    var sessionBuilder = SparkSession.Builder().Config("spark.executor.memory", "4G");
    SparkSession spark = sessionBuilder.GetOrCreate();

    // Load the CSV into a DataFrame.
    var csvReader = spark.Read()
        .Option("header", true)
        .Option("inferSchema", true);
    DataFrame puzzles = csvReader.Csv(_filePath);

    // Keep only the number of rows requested by the caller.
    DataFrame limitedPuzzles = puzzles.Limit(nrows);

    // Times the solving part only (UDF registration, query and display).
    var solveTimer = System.Diagnostics.Stopwatch.StartNew();

    // Register the Spark user-defined function.
    spark.Udf().Register<string, string>("SukoduUDF", grid => Sudokusolution(grid));

    // Call the UDF from SQL; the new DataFrame also carries the results.
    limitedPuzzles.CreateOrReplaceTempView("Resolved");
    DataFrame solved = spark.Sql("SELECT Sudokus, SukoduUDF(Sudokus) as Resolution from Resolved");
    solved.Show();

    solveTimer.Stop();

    // The timing line is framed by four blank lines on each side.
    const int paddingLines = 4;
    for (int i = 0; i < paddingLines; i++)
    {
        Console.WriteLine();
    }

    Console.WriteLine(
        $"Execution Time for {nrows} sudoku resolution : {solveTimer.ElapsedMilliseconds} ms");

    for (int i = 0; i < paddingLines; i++)
    {
        Console.WriteLine();
    }

    spark.Stop();
}
/// <summary>
/// Runs <paramref name="action"/> and afterwards drops every table listed in
/// <paramref name="tableNames"/>, whether or not the action threw.
/// </summary>
/// <param name="spark">The <see cref="SparkSession"/> used to issue the DROP statements.</param>
/// <param name="tableNames">Names of the tables to drop.</param>
/// <param name="action">The <see cref="Action"/> to execute before the cleanup.</param>
public static void WithTable(SparkSession spark, IEnumerable<string> tableNames, Action action)
{
    try
    {
        action();
    }
    finally
    {
        // Materialise the names first so the sequence is fully enumerated before any drop.
        List<string> namesToDrop = tableNames.ToList();
        foreach (string tableName in namesToDrop)
        {
            spark.Sql($"DROP TABLE IF EXISTS {tableName}");
        }
    }
}
/// <summary>
/// Batch sample over a GitHub projects CSV: cleans the data, reports the average
/// <c>forked_from</c> value per language, and flags each project by comparing its
/// <c>updated_at</c> date with the reference date.
/// </summary>
/// <param name="args">Exactly one element: the path to <c>projects.csv</c>.</param>
public void Run(string[] args)
{
    if (args.Length != 1)
    {
        Console.Error.WriteLine("Usage: GitHubProjects <path to projects.csv>");
        Environment.Exit(1);
    }

    const string projectsSchema =
        "id INT, url STRING, owner_id INT, " +
        "name STRING, descriptor STRING, language STRING, " +
        "created_at STRING, forked_from INT, deleted STRING, " +
        "updated_at STRING";

    SparkSession spark = SparkSession.Builder().AppName("GitHub and Spark Batch").GetOrCreate();

    DataFrame projects = spark.Read().Schema(projectsSchema).Csv(args[0]);
    projects.Show();

    // Discard rows containing any NA value, then the columns that are not needed.
    DataFrameNaFunctions naFunctions = projects.Na();
    DataFrame cleaned = naFunctions.Drop("any").Drop("id", "url", "owner_id");
    cleaned.Show();

    // Average forked_from value per language, largest average first.
    DataFrame averageByLanguage = cleaned
        .GroupBy("language")
        .Agg(Avg(cleaned["forked_from"]));
    averageByLanguage.OrderBy(Desc("avg(forked_from)")).Show();

    // The UDF yields true only when the text parses as a date later than the reference date.
    spark.Udf().Register<string, bool>(
        "MyUDF",
        text => DateTime.TryParse(text, out DateTime parsed) && (parsed > s_referenceDate));

    cleaned.CreateOrReplaceTempView("dateView");
    DataFrame flagged = spark.Sql(
        "SELECT *, MyUDF(dateView.updated_at) AS datebefore FROM dateView");
    flagged.Show();

    spark.Stop();
}
/// <summary>
/// Runs the TPCH SQL query with the given number and prints its first 20 rows
/// (untruncated) followed by the elapsed time.
/// </summary>
/// <param name="queryNumber">
/// Suffix of the non-public static field <c>s_q{queryNumber}</c> on this type that holds
/// the query text.
/// </param>
/// <exception cref="ArgumentException">
/// No non-public static field named <c>s_q{queryNumber}</c> exists on this type.
/// </exception>
internal void Run(string queryNumber)
{
    Console.WriteLine($"Spark .NET TPCH SQL Query: #{queryNumber}");

    // The query text is looked up by name through reflection.
    var queryField = GetType().GetField(
        $"s_q{queryNumber}",
        BindingFlags.Static | BindingFlags.NonPublic);

    // GetField returns null for an unknown name; fail with a message that names the
    // problem instead of letting the dereference below throw a NullReferenceException.
    if (queryField == null)
    {
        throw new ArgumentException(
            $"Unknown TPCH query number '{queryNumber}': no static field named " +
            $"'s_q{queryNumber}' was found.",
            nameof(queryNumber));
    }

    var queryString = (string)queryField.GetValue(null);

    var sw = Stopwatch.StartNew();
    _spark.Sql(queryString).Show(numRows: 20, truncate: 0);
    Console.WriteLine($"\tElapsed: {sw.Elapsed}");
}
/// <summary>
/// Starts from a SQL <c>range(1000)</c> query and extends it through the DataFrame API,
/// first with a literal column and then with a modulo column, showing the first five
/// rows after each step.
/// </summary>
/// <param name="spark">Session used to run the initial SQL statement.</param>
static void CreateUsingRangeAndDataFrameAPI(SparkSession spark)
{
    Console.WriteLine("spark.Sql");
    var idsOnly = spark.Sql("select id from range(1000)");
    idsOnly.Show(5);
    /*
     * +---+
     * | id|
     * +---+
     * |  0|
     * |  1|
     * |  2|
     * |  3|
     * |  4|
     * +---+
     */

    Console.WriteLine("spark.Sql().WithColumn");
    var literalValue = Functions.Lit("Literal");
    var withLiteral = idsOnly.WithColumn("Another Column", literalValue);
    withLiteral.Show(5);
    /*
     * +---+--------------+
     * | id|Another Column|
     * +---+--------------+
     * |  0|       Literal|
     * |  1|       Literal|
     * |  2|       Literal|
     * |  3|       Literal|
     * |  4|       Literal|
     * +---+--------------+
     */

    Console.WriteLine("spark.Sql().WithColumn");
    var idModTwo = Functions.Pmod(Functions.Col("id"), Functions.Lit(2));
    var withModulo = withLiteral.WithColumn("Mod", idModTwo);
    withModulo.Show(5);
    /*
     * +---+--------------+---+
     * | id|Another Column|Mod|
     * +---+--------------+---+
     * |  0|       Literal|  0|
     * |  1|       Literal|  1|
     * |  2|       Literal|  0|
     * |  3|       Literal|  1|
     * |  4|       Literal|  0|
     * +---+--------------+---+
     */
}
/// <summary>
/// Verifies that a failing SQL statement raises an exception whose inner exception is a
/// <see cref="JvmException"/> carrying a non-blank message.
/// </summary>
public void TestInnerJvmException()
{
    // Assert the throw explicitly: with a bare try/catch the test would pass without
    // checking anything if the statement unexpectedly succeeded.
    Exception ex = Assert.ThrowsAny<Exception>(() => { _spark.Sql("THROW!!!"); });

    Assert.NotNull(ex.InnerException);
    Assert.IsType<JvmException>(ex.InnerException);
    Assert.False(string.IsNullOrWhiteSpace(ex.InnerException.Message));
}
/// <summary>
/// Exercises <see cref="CountVectorizerModel"/>: both constructors, setters and getters,
/// a save/load round trip, and the schema/transform entry points.
/// </summary>
public void TestCountVectorizerModel()
{
    DataFrame tokens = _spark.Sql(
        "SELECT array('hello', 'I', 'AM', 'a', 'string', 'TO', 'TOKENIZE') as input from range(100)");

    const string inputColumn = "input";
    const string outputColumn = "output";
    const double minimumTermFrequency = 10.0;
    const bool useBinary = false;

    var vocabulary = new List<string>() { "hello", "I", "AM", "TO", "TOKENIZE" };

    var model = new CountVectorizerModel(vocabulary);

    // The overload taking an explicit uid must build the same type.
    Assert.IsType<CountVectorizerModel>(new CountVectorizerModel("my-uid", vocabulary));

    model = model
        .SetInputCol(inputColumn)
        .SetOutputCol(outputColumn)
        .SetMinTF(minimumTermFrequency)
        .SetBinary(useBinary);

    Assert.Equal(inputColumn, model.GetInputCol());
    Assert.Equal(outputColumn, model.GetOutputCol());
    Assert.Equal(minimumTermFrequency, model.GetMinTF());
    Assert.Equal(useBinary, model.GetBinary());

    // Persisting and reloading must keep the uid.
    using (var scratch = new TemporaryDirectory())
    {
        string location = Path.Join(scratch.Path, "countVectorizerModel");
        model.Save(location);

        CountVectorizerModel reloaded = CountVectorizerModel.Load(location);
        Assert.Equal(model.Uid(), reloaded.Uid());
    }

    Assert.IsType<int>(model.GetVocabSize());
    Assert.NotEmpty(model.ExplainParams());
    Assert.NotEmpty(model.ToString());

    Assert.IsType<StructType>(model.TransformSchema(tokens.Schema()));
    Assert.IsType<DataFrame>(model.Transform(tokens));

    TestFeatureBase(model, "minDF", 100);
}
/// <summary>
/// Wraps a configured <see cref="Bucketizer"/> in a <see cref="PipelineModel"/>, checks the
/// transform output, and verifies both persistence paths (Save/Load and Write/Read) keep
/// the pipeline uid.
/// </summary>
public void TestPipelineModelTransform()
{
    const string uid = "uid";
    const string handleInvalid = "skip";
    const string inputColumn = "input_col";
    const string outputColumn = "output_col";
    double[] splits = { double.MinValue, 0.0, 10.0, 50.0, double.MaxValue };

    var bucketizer = new Bucketizer(uid);
    bucketizer
        .SetInputCol(inputColumn)
        .SetOutputCol(outputColumn)
        .SetHandleInvalid(handleInvalid)
        .SetSplits(splits);

    // Single-stage pipeline around the bucketizer.
    var pipelineStages = new JavaTransformer[] { bucketizer };
    PipelineModel pipeline = new PipelineModel("randomUID", pipelineStages);

    DataFrame source = _spark.Sql("SELECT ID as input_col from range(100)");
    DataFrame transformed = pipeline.Transform(source);

    // The pipeline must have appended the bucketizer's output column.
    Assert.Contains(transformed.Schema().Fields, field => field.Name == outputColumn);

    Assert.Equal(inputColumn, bucketizer.GetInputCol());
    Assert.Equal(outputColumn, bucketizer.GetOutputCol());
    Assert.Equal(splits, bucketizer.GetSplits());

    Assert.IsType<StructType>(pipeline.TransformSchema(source.Schema()));
    Assert.IsType<DataFrame>(transformed);

    using (var scratch = new TemporaryDirectory())
    {
        // Round trip through Save / Load.
        string saveLocation = Path.Join(scratch.Path, "pipelineModel");
        pipeline.Save(saveLocation);
        PipelineModel reloaded = PipelineModel.Load(saveLocation);
        Assert.Equal(pipeline.Uid(), reloaded.Uid());

        // Round trip through the writer / reader objects.
        string writeLocation = Path.Join(scratch.Path, "pipelineModelWithWrite");
        pipeline.Write().Save(writeLocation);
        PipelineModel reloadedViaReader = pipeline.Read().Load(writeLocation);
        Assert.Equal(pipeline.Uid(), reloadedViaReader.Uid());
    }
}
/// <summary>
/// Aggregates review sentiment per main category, shows the categories ordered by
/// sentiment rank and review count, and stores the result in
/// <c>context.ItemCategorySentiment</c> (replacing what was there).
/// </summary>
/// <param name="spark">
/// Session in which <c>ElectronicsMetadata</c> and <c>ElectronicsReviewSentiment</c> are
/// queried; the aggregate is registered as the <c>ItemCategorySentiment</c> temp view.
/// </param>
/// <param name="context">Context receiving the <c>ItemCategorySentiment</c> rows.</param>
private static void AnalyseCategorySentiment(SparkSession spark, CustomerSentimentContext context)
{
    Console.WriteLine("Analyzing category sentiment");

    const string aggregateQuery =
        "SELECT EM.main_cat, SUM(ERS.sentiment) / COUNT(1) * 100 as sentiment_rank, COUNT(1) review_count " +
        "FROM ElectronicsMetadata EM " +
        "JOIN ElectronicsReviewSentiment ERS ON ERS.asin = EM.asin " +
        "GROUP BY EM.main_cat";

    var sentimentByCategory = spark.Sql(aggregateQuery);
    sentimentByCategory.Cache();
    sentimentByCategory.CreateOrReplaceTempView("ItemCategorySentiment");

    Console.WriteLine("Analyzing categories with best consumer sentiment.");

    const string rankingQuery =
        "SELECT * " +
        "FROM ItemCategorySentiment " +
        "ORDER BY sentiment_rank DESC, review_count DESC";

    var rankedCategories = spark.Sql(rankingQuery);
    rankedCategories.Show();

    // Convert the ranked rows into entities keyed by category.
    var rankedRows = rankedCategories.Collect();
    var entities = Mapper.MapRows(
        rankedRows,
        row => new ItemCategorySentiment
        {
            Category = row.GetAs<string>(0),
            SentimentRank = row.GetAs<double>(1),
            ReviewCount = row.GetAs<int>(2)
        },
        entity => entity.Category);

    // Replace the previously stored rows with the fresh ones.
    context.ItemCategorySentiment.RemoveRange(context.ItemCategorySentiment);
    context.ItemCategorySentiment.AddRange(entities);
    context.SaveChanges();
}
/// <summary>
/// Sets every <see cref="Word2Vec"/> parameter, checks that each getter returns the value
/// that was set, and verifies the uid survives a save/load round trip.
/// </summary>
public void TestWord2Vec()
{
    // The query is executed but its result is not used by the assertions below.
    _spark.Sql("SELECT split('Spark dotnet is cool', ' ')");

    const string inputColumn = "text";
    const string outputColumn = "result";
    const int minCount = 0;
    const int maxIter = 10;
    const int maxSentenceLength = 100;
    const int numPartitions = 1000;
    const int seed = 10000;
    const double stepSize = 1.9;
    const int vectorSize = 20;
    const int windowSize = 200;

    Word2Vec estimator = new Word2Vec()
        .SetInputCol(inputColumn)
        .SetOutputCol(outputColumn)
        .SetMinCount(minCount)
        .SetMaxIter(maxIter)
        .SetMaxSentenceLength(maxSentenceLength)
        .SetNumPartitions(numPartitions)
        .SetSeed(seed)
        .SetStepSize(stepSize)
        .SetVectorSize(vectorSize)
        .SetWindowSize(windowSize);

    Assert.Equal(inputColumn, estimator.GetInputCol());
    Assert.Equal(outputColumn, estimator.GetOutputCol());
    Assert.Equal(minCount, estimator.GetMinCount());
    Assert.Equal(maxIter, estimator.GetMaxIter());
    Assert.Equal(maxSentenceLength, estimator.GetMaxSentenceLength());
    Assert.Equal(numPartitions, estimator.GetNumPartitions());
    Assert.Equal(seed, estimator.GetSeed());
    Assert.Equal(stepSize, estimator.GetStepSize());
    Assert.Equal(vectorSize, estimator.GetVectorSize());
    Assert.Equal(windowSize, estimator.GetWindowSize());

    // Persisting and reloading must keep the uid.
    using (var scratch = new TemporaryDirectory())
    {
        string location = Path.Join(scratch.Path, "word2vec");
        estimator.Save(location);

        Word2Vec reloaded = Word2Vec.Load(location);
        Assert.Equal(estimator.Uid(), reloaded.Uid());
    }

    TestFeatureBase(estimator, "maxIter", 2);
}
/// <summary>
/// Converts the given JSON file to Parquet, reads the Parquet data back, exposes it as the
/// <c>parquet</c> temp view and shows the names of the people aged 13 to 19.
/// </summary>
/// <param name="spark">Session used for reading, writing and the SQL query.</param>
/// <param name="json">Path of the JSON input.</param>
private void RunParquetExample(SparkSession spark, string json)
{
    const string parquetLocation = "people.parquet";

    // JSON in, Parquet out (overwriting any previous output).
    DataFrame people = spark.Read().Json(json);
    var parquetWriter = people.Write().Mode(SaveMode.Overwrite);
    parquetWriter.Parquet(parquetLocation);

    // Read the Parquet output back and make it queryable by name.
    DataFrame peopleFromParquet = spark.Read().Parquet(parquetLocation);
    peopleFromParquet.CreateTempView("parquet");

    const string teenagerQuery = "SELECT name FROM parquet WHERE age >= 13 and age <= 19";
    DataFrame teenagerNames = spark.Sql(teenagerQuery);
    teenagerNames.Show();
}
/// <summary>
/// Builds a Tokenizer → HashingTF → IDF chain over one sentence, fits an
/// <see cref="IDFModel"/>, and checks its output column, its getters and its save/load
/// round trip.
/// </summary>
public void TestIDFModel()
{
    const int minDocFrequency = 1980;
    const string rawFeaturesColumn = "rawFeatures";
    const string featuresColumn = "features";

    DataFrame sentences =
        _spark.Sql("SELECT 0.0 as label, 'Hi I heard about Spark' as sentence");

    // sentence -> words
    Tokenizer tokenizer = new Tokenizer()
        .SetInputCol("sentence")
        .SetOutputCol("words");
    DataFrame words = tokenizer.Transform(sentences);

    // words -> raw term-frequency features
    HashingTF hashingTF = new HashingTF()
        .SetInputCol("words")
        .SetOutputCol(rawFeaturesColumn)
        .SetNumFeatures(20);
    DataFrame featurized = hashingTF.Transform(words);

    // raw features -> rescaled features
    IDF estimator = new IDF()
        .SetInputCol(rawFeaturesColumn)
        .SetOutputCol(featuresColumn)
        .SetMinDocFreq(minDocFrequency);
    IDFModel model = estimator.Fit(featurized);
    DataFrame rescaled = model.Transform(featurized);

    Assert.Contains(featuresColumn, rescaled.Columns());
    Assert.Equal(rawFeaturesColumn, model.GetInputCol());
    Assert.Equal(featuresColumn, model.GetOutputCol());
    Assert.Equal(minDocFrequency, model.GetMinDocFreq());

    // Persisting and reloading must keep the uid.
    using (var scratch = new TemporaryDirectory())
    {
        string location = Path.Join(scratch.Path, "idfModel");
        model.Save(location);

        IDFModel reloaded = IDFModel.Load(location);
        Assert.Equal(model.Uid(), reloaded.Uid());
    }

    TestFeatureBase(model, "minDocFreq", 1000);
}
/// <summary>
/// Signature test for the <see cref="Catalog"/> functions returned by the session's
/// <c>Catalog()</c>: every call is made once and, where it returns a value, the returned
/// type is checked.
/// </summary>
public void CatalogFunctions()
{
    Catalog catalog = _spark.Catalog();

    Assert.IsType<DataFrame>(catalog.ListDatabases());
    Assert.IsType<DataFrame>(catalog.ListFunctions());
    Assert.IsType<DataFrame>(catalog.ListFunctions("default"));

    // Table backing most of the lookups below.
    string usersParquet = Path.Combine(TestEnvironment.ResourceDirectory, "users.parquet");
    DataFrame users = catalog.CreateTable("users", usersParquet);
    Assert.IsType<DataFrame>(users);

    // Existence checks and single-object lookups.
    Assert.IsType<string>(catalog.CurrentDatabase());
    Assert.IsType<bool>(catalog.DatabaseExists("default"));

    Assert.IsType<bool>(catalog.DropGlobalTempView("no-view"));
    Assert.IsType<bool>(catalog.DropTempView("no-view"));
    Assert.IsType<bool>(catalog.FunctionExists("default", "functionname"));
    Assert.IsType<bool>(catalog.FunctionExists("functionname"));
    Assert.IsType<Database>(catalog.GetDatabase("default"));
    Assert.IsType<Function>(catalog.GetFunction("abs"));
    Assert.IsType<Function>(catalog.GetFunction(null, "abs"));
    Assert.IsType<Table>(catalog.GetTable("users"));
    Assert.IsType<Table>(catalog.GetTable("default", "users"));
    Assert.IsType<bool>(catalog.IsCached("users"));

    // Listing calls.
    Assert.IsType<DataFrame>(catalog.ListColumns("users"));
    Assert.IsType<DataFrame>(catalog.ListColumns("default", "users"));
    Assert.IsType<DataFrame>(catalog.ListDatabases());
    Assert.IsType<DataFrame>(catalog.ListFunctions());
    Assert.IsType<DataFrame>(catalog.ListFunctions("default"));
    Assert.IsType<DataFrame>(catalog.ListTables());
    Assert.IsType<DataFrame>(catalog.ListTables("default"));

    // Calls without a return value: only checked for not throwing.
    catalog.RefreshByPath("/");
    catalog.RefreshTable("users");
    catalog.SetCurrentDatabase("default");
    catalog.CacheTable("users");
    catalog.UncacheTable("users");
    catalog.ClearCache();

    Assert.IsType<bool>(catalog.TableExists("users"));
    Assert.IsType<bool>(catalog.TableExists("default", "users"));

    // RecoverPartitions is called on a partitioned table, created here from "users".
    _spark.Sql(@"CREATE TABLE IF NOT EXISTS usersp USING PARQUET PARTITIONED BY (name) AS SELECT * FROM users");
    catalog.RecoverPartitions("usersp");
}