public void TestThreadLocalSessions()
{
    SparkSession.ClearActiveSession();

    void testChildThread(string appName)
    {
        var thread = new Thread(() =>
        {
            Assert.Null(SparkSession.GetActiveSession());

            SparkSession.SetActiveSession(
                SparkSession.Builder().AppName(appName).GetOrCreate());

            // Since we are in the child thread, GetActiveSession() should return the child
            // SparkSession.
            SparkSession activeSession = SparkSession.GetActiveSession();
            Assert.NotNull(activeSession);
            Assert.Equal(appName, activeSession.Conf().Get("spark.app.name", null));
        });

        thread.Start();
        thread.Join();
    }

    for (int i = 0; i < 5; ++i)
    {
        testChildThread(i.ToString());
    }

    Assert.Null(SparkSession.GetActiveSession());
}
public void TestSignaturesV2_3_X()
{
    Assert.IsType<SparkContext>(_spark.SparkContext);

    Assert.IsType<Builder>(SparkSession.Builder());

    SparkSession.ClearDefaultSession();
    SparkSession.SetDefaultSession(_spark);
    Assert.IsType<SparkSession>(SparkSession.GetDefaultSession());

    Assert.IsType<RuntimeConfig>(_spark.Conf());

    Assert.IsType<SparkSession>(_spark.NewSession());

    Assert.IsType<DataFrameReader>(_spark.Read());

    Assert.IsType<DataFrame>(_spark.Range(10));
    Assert.IsType<DataFrame>(_spark.Range(10, 100));
    Assert.IsType<DataFrame>(_spark.Range(10, 100, 10));
    Assert.IsType<DataFrame>(_spark.Range(10, 100, 10, 5));

    _spark.Range(10).CreateOrReplaceTempView("testView");
    Assert.IsType<DataFrame>(_spark.Table("testView"));

    Assert.IsType<DataStreamReader>(_spark.ReadStream());

    Assert.IsType<UdfRegistration>(_spark.Udf());

    Assert.IsType<Catalog>(_spark.Catalog());
}
public void TestSignaturesV2_3_X()
{
    RuntimeConfig conf = _spark.Conf();

    conf.Set("stringKey", "stringValue");
    conf.Set("boolKey", false);
    conf.Set("longKey", 1234L);

    Assert.Equal("stringValue", conf.Get("stringKey"));
    Assert.Equal("false", conf.Get("boolKey"));
    Assert.Equal("1234", conf.Get("longKey"));

    conf.Unset("stringKey");
    Assert.Equal("defaultValue", conf.Get("stringKey", "defaultValue"));
    Assert.Equal("false", conf.Get("boolKey", "true"));
}
static void Main(string[] args)
{
    if (args.Length != 2)
    {
        Console.Error.WriteLine(
            "Usage: expected two arguments: <output path> <checkpoint location>");
        Environment.Exit(1);
    }

    var sparkConf = SparkConfUtils.GetSparkConfigurationForFilePath(args);

    SparkSession spark = SparkSession
        .Builder()
        .AppName("Streaming example using Spark.NET")
        .GetOrCreate();

    if (sparkConf != null)
    {
        sparkConf.ToList().ForEach(kv =>
        {
            spark.Conf().Set(kv.Key, kv.Value);
        });
    }

    var events = spark
        .ReadStream()
        .Format("eventhubs")
        .Options(EventHubConnection.GetEventHubConnectionSettings(eventHubPartitionCount: 2))
        .Load();

    var processedEvents = events
        .Select(
            FromJson(Col("body").Cast("string"), "temperature String, humidity String").As("Raw"),
            Col("properties"),
            Col("enqueuedTime"))
        .WithColumn("Raw.temperature", Col("Raw.temperature").Cast("double"))
        .WithColumn("Raw.humidity", Col("Raw.humidity").Cast("double"))
        .WithColumnRenamed("Raw.temperature", "temperature")
        .WithColumnRenamed("Raw.humidity", "humidity")
        .WithColumn("temperatureAlert", Col("temperature") >= 40)
        .SelectExpr("temperature", "humidity", "properties", "enqueuedTime", "temperatureAlert");

    processedEvents.PrintSchema();

    var streamingQuery = processedEvents
        .WriteStream()
        .OutputMode(OutputMode.Append)
        .Format("console")
        .Option("path", args[0])
        .Option("checkpointLocation", args[1])
        .Start();

    streamingQuery.AwaitTermination();
}
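// SparkConfUtils and EventHubConnection are project-local helpers that are not shown in
// this section. Below is a minimal sketch of what they might look like; the environment
// variable name, the storage-path heuristic, and the exact option values are assumptions
// made for illustration, not the actual implementation.
using System;
using System.Collections.Generic;

internal static class SparkConfUtils
{
    // Hypothetical helper: return extra Spark settings based on where the output path
    // points, or null when no extra configuration is needed.
    public static Dictionary<string, string> GetSparkConfigurationForFilePath(string[] args)
    {
        // e.g. paths on Azure storage might need extra auth settings; local paths need nothing.
        return args[0].StartsWith("abfss://", StringComparison.Ordinal)
            ? new Dictionary<string, string>
              {
                  // Placeholder setting; the real key/value depends on the storage account.
                  ["spark.hadoop.fs.azure.account.auth.type"] = "SharedKey"
              }
            : null;
    }
}

internal static class EventHubConnection
{
    // Hypothetical helper: build the option map consumed by .Options(...) above. The
    // Event Hubs Spark connector reads "eventhubs.connectionString" (recent connector
    // versions expect it encrypted via EventHubsUtils.encrypt on the JVM side).
    public static Dictionary<string, string> GetEventHubConnectionSettings(int eventHubPartitionCount)
    {
        string connectionString =
            Environment.GetEnvironmentVariable("EVENTHUB_CONNECTION_STRING")
            ?? throw new InvalidOperationException("EVENTHUB_CONNECTION_STRING is not set.");

        return new Dictionary<string, string>
        {
            ["eventhubs.connectionString"] = connectionString,
            // Cap the number of events read per trigger, scaled by partition count.
            ["maxEventsPerTrigger"] = (1000 * eventHubPartitionCount).ToString()
        };
    }
}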
public void TestUnicode()
{
    string expected = "①Ⅻㄨㄩ 啊阿鼾齄丂丄狚狛狜狝﨨﨩ˊˋ˙–⿻〇㐀㐁㐃㐄䶴䶵U1[]U2[]U3[]";

    RuntimeConfig conf = _spark.Conf();
    string key = "SerDeTests.TestUnicode";
    conf.Set(key, expected);

    string actual = conf.Get(key);
    Assert.Equal(expected, actual);
}
public HyperspaceTests(HyperspaceFixture fixture)
{
    _spark = fixture.SparkFixture.Spark;
    _hyperspaceSystemDirectory = new TemporaryDirectory();
    _spark.Conf().Set("spark.hyperspace.system.path", _hyperspaceSystemDirectory.Path);
    _hyperspace = new Hyperspace(_spark);

    _sampleDataFrame = _spark.Read()
        .Option("header", true)
        .Option("delimiter", ";")
        .Csv($"{TestEnvironment.ResourceDirectory}people.csv");
    _sampleIndexName = "sample_dataframe";
    _sampleIndexConfig = new IndexConfig(_sampleIndexName, new[] { "job" }, new[] { "name" });
    _hyperspace.CreateIndex(_sampleDataFrame, _sampleIndexConfig);
}
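// A hedged sketch of how a test might consume the fixture state set up above: list the
// indexes under the configured system path and verify the one the constructor created is
// present. Hyperspace.Indexes() comes from the Microsoft.Spark.Extensions.Hyperspace API;
// the test name and assertion are illustrative, not part of the original suite.
// Requires: using System.Linq;
[Fact]
public void TestIndexIsCreated()
{
    // Indexes() returns a DataFrame describing all indexes known to Hyperspace.
    DataFrame indexes = _hyperspace.Indexes();
    Assert.Contains(
        _sampleIndexName,
        indexes.Select("name").Collect().Select(r => r.GetAs<string>("name")));
}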
public void TestSignaturesV2_4_X()
{
    RuntimeConfig conf = _spark.Conf();

    conf.Set("stringKey", "stringValue");
    conf.Set("boolKey", false);
    conf.Set("longKey", 1234L);

    Assert.Equal("stringValue", conf.Get("stringKey"));
    Assert.Equal("false", conf.Get("boolKey"));
    Assert.Equal("1234", conf.Get("longKey"));

    conf.Unset("stringKey");
    Assert.Equal("defaultValue", conf.Get("stringKey", "defaultValue"));
    Assert.Equal("false", conf.Get("boolKey", "true"));

    Assert.True(conf.IsModifiable("spark.sql.streaming.checkpointLocation"));
    Assert.False(conf.IsModifiable("missingKey"));
}
public void Run(string[] args)
{
    string servidoresKafka = args[0];
    string connectionString = args.Length > 1 ? args[1] : string.Empty;

    // Get a reference to the Spark execution context
    SparkSession spark = SparkSession
        .Builder()
        .AppName("Credit Card Category")
        .GetOrCreate();

    // Without this setting, each stage ran with 200 tasks, which took about
    // 4 minutes per batch
    spark.Conf().Set("spark.sql.shuffle.partitions", "1");

    // Create a dataframe that receives data from Kafka
    DataFrame df = spark
        .ReadStream()
        .Format("kafka")
        .Option("kafka.bootstrap.servers", servidoresKafka)
        .Option("subscribe", "transactions")
        .Load()
        .SelectExpr("CAST(value AS STRING)");

    /* Create a schema to validate the JSON carried in the Kafka messages.
     * Sample JSON:
     * {
     *     "transaction":"431",
     *     "number":"0015-0000-0000-0000",
     *     "lat":-23.1618,
     *     "lng":-46.47201,
     *     "amount":91.01487,
     *     "category":"pets",
     *     "eventTime":"2021-01-05T19:07:19.3888"
     * }
     */
    var schema = new StructType(new[]
    {
        new StructField("transaction", new StringType()),
        new StructField("number", new StringType()),
        new StructField("lat", new DoubleType()),
        new StructField("lng", new DoubleType()),
        new StructField("amount", new DoubleType()),
        new StructField("category", new StringType()),
        new StructField("eventTime", new TimestampType())
    });

    // Parse the JSON into a struct column ...
    df = df.WithColumn("json",
            FromJson(
                Col("value"),
                schema.SimpleString))
        .Select("json.*"); // ... and return all of its fields as a new dataframe

    // Allow late events to arrive up to 7 minutes behind the watermark
    df = df.WithWatermark("eventTime", "7 minutes");

    // Sum the amounts spent, grouped by category over 2-minute windows starting every 1 minute
    df = df.GroupBy(Window(Col("eventTime"), "2 minutes", "1 minutes"), Col("category"))
        .Sum("amount").WithColumnRenamed("sum(amount)", "total")
        .Select(Col("window.start"), Col("window.end"), Col("category"), Col("total"));

    // Start the streaming query and write out the results
    StreamingQuery query = df
        .WriteStream()
        .Format("console")
        .OutputMode(OutputMode.Update)
        //.Foreach(new MySQLForeachWriter(connectionString)) // Uncomment to write to a database
        .Start();

    query.AwaitTermination();
}
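// The MySQLForeachWriter referenced in the commented-out .Foreach(...) call above is not
// shown in this section. A minimal sketch follows, assuming the MySql.Data client library
// and a "totals" table with (start, end, category, total) columns; both are illustrative
// assumptions, not the original writer. IForeachWriter is the Microsoft.Spark sink
// interface: Open per partition/epoch, Process per row, Close on completion or error.
using System;
using Microsoft.Spark.Sql;
using Microsoft.Spark.Sql.Types;
using MySql.Data.MySqlClient;

[Serializable]
public class MySQLForeachWriter : IForeachWriter
{
    private readonly string _connectionString;
    [NonSerialized] private MySqlConnection _connection;

    public MySQLForeachWriter(string connectionString) => _connectionString = connectionString;

    public bool Open(long partitionId, long epochId)
    {
        _connection = new MySqlConnection(_connectionString);
        _connection.Open();
        return true;
    }

    public void Process(Row row)
    {
        // Hypothetical target table; column names mirror the Select(...) above.
        using var cmd = new MySqlCommand(
            "INSERT INTO totals (start, end, category, total) VALUES (@s, @e, @c, @t)",
            _connection);
        cmd.Parameters.AddWithValue("@s", row.GetAs<Timestamp>("start").ToDateTime());
        cmd.Parameters.AddWithValue("@e", row.GetAs<Timestamp>("end").ToDateTime());
        cmd.Parameters.AddWithValue("@c", row.GetAs<string>("category"));
        cmd.Parameters.AddWithValue("@t", row.GetAs<double>("total"));
        cmd.ExecuteNonQuery();
    }

    public void Close(Exception errorOrNull) => _connection?.Dispose();
}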
public void Run(string[] args)
{
    string kafkaBrokers = args[0];
    double maxSpeed = double.Parse(args[1]);

    // Get a reference to the Spark execution context
    SparkSession spark = SparkSession
        .Builder()
        .AppName("Credit Card Fraud")
        .GetOrCreate();

    // Without this setting, each stage ran with 200 tasks, which took about
    // 4 minutes per batch
    spark.Conf().Set("spark.sql.shuffle.partitions", "1");

    // Create a dataframe that receives data from Kafka
    DataFrame df = spark
        .ReadStream()
        .Format("kafka")
        .Option("kafka.bootstrap.servers", kafkaBrokers)
        .Option("subscribe", "transactions")
        .Load()
        .SelectExpr("CAST(value AS STRING)");

    /* Create a schema to validate the JSON carried in the Kafka messages.
     * Sample JSON:
     * {
     *     "transaction":"431",
     *     "number":"0015-0000-0000-0000",
     *     "lat":-23.1618,
     *     "lng":-46.47201,
     *     "amount":91.01487,
     *     "category":"pets",
     *     "eventTime":"2021-01-05T19:07:19.3888"
     * }
     */
    var schema = new StructType(new[]
    {
        new StructField("transaction", new StringType()),
        new StructField("number", new StringType()),
        new StructField("lat", new DoubleType()),
        new StructField("lng", new DoubleType()),
        new StructField("amount", new DoubleType()),
        new StructField("category", new StringType()),
        new StructField("eventTime", new TimestampType())
    });

    // Parse the JSON into a struct column ...
    df = df.WithColumn("json",
            FromJson(
                df.Col("value"),
                schema.SimpleString))
        .Select("json.*"); // ... and return all of its fields as a new dataframe

    // Create two distinct dataframes so we can join them and analyze the correlation
    // between transactions
    DataFrame df1 = df
        .WithWatermark("eventTime", "7 minutes");
    DataFrame df2 = df
        .WithColumnRenamed("transaction", "transaction2")
        .WithColumnRenamed("lat", "lat2")
        .WithColumnRenamed("lng", "lng2")
        .WithColumnRenamed("eventTime", "eventTime2")
        .WithWatermark("eventTime2", "7 minutes");

    // Join the two dataframes to correlate credit card transactions
    DataFrame dfJoin = df1.Join(df2,
        df1.Col("number").EqualTo(df2.Col("number"))
            .And(Col("transaction").NotEqual(Col("transaction2")))
            .And(Col("eventTime2").Between(
                Col("eventTime"), Col("eventTime") + Expr("interval 5 minutes"))));

    // Register custom functions to be used on the dataframe
    spark.Udf().Register<double, double, double, double, double>("CalculateDistance",
        (lat1, lng1, lat2, lng2) => CalculateDistance(lat1, lng1, lat2, lng2));
    spark.Udf().Register<double, Timestamp, Timestamp, double>("CalculateSpeed",
        (dist, eventTime, eventTime2) => CalculateSpeed(dist, eventTime, eventTime2));

    // Add new columns holding the results of the UDFs
    dfJoin = dfJoin.WithColumn("dist", CallUDF("CalculateDistance",
        Col("lat"), Col("lng"), Col("lat2"), Col("lng2")));
    dfJoin = dfJoin.WithColumn("speed", CallUDF("CalculateSpeed",
        Col("dist"), Col("eventTime"), Col("eventTime2")));

    // Keep only the transactions whose implied speed exceeds the "maxSpeed" parameter
    dfJoin = dfJoin.Where(Col("speed").Gt(maxSpeed));

    // Start the streaming query
    StreamingQuery query = dfJoin
        .WriteStream()
        .Format("console")
        .Option("truncate", "false")
        .OutputMode(OutputMode.Append)
        .Start();

    query.AwaitTermination();
}
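// CalculateDistance and CalculateSpeed, wrapped by the UDFs registered above, are not
// shown in this section. A plausible sketch follows: haversine distance in kilometers and
// speed in km/h derived from the two event timestamps. The haversine formula is standard;
// the exact units the original used are an assumption.
private static double CalculateDistance(double lat1, double lng1, double lat2, double lng2)
{
    const double EarthRadiusKm = 6371.0;
    double dLat = ToRadians(lat2 - lat1);
    double dLng = ToRadians(lng2 - lng1);
    double a = Math.Sin(dLat / 2) * Math.Sin(dLat / 2) +
               Math.Cos(ToRadians(lat1)) * Math.Cos(ToRadians(lat2)) *
               Math.Sin(dLng / 2) * Math.Sin(dLng / 2);
    return EarthRadiusKm * 2 * Math.Atan2(Math.Sqrt(a), Math.Sqrt(1 - a));
}

private static double ToRadians(double degrees) => degrees * Math.PI / 180.0;

private static double CalculateSpeed(double dist, Timestamp eventTime, Timestamp eventTime2)
{
    // Elapsed hours between the two transactions; guard against division by zero.
    double hours = Math.Abs((eventTime2.ToDateTime() - eventTime.ToDateTime()).TotalHours);
    return hours > 0 ? dist / hours : double.MaxValue;
}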