static void Main(string[] args)
{
    LoggerServiceFactory.SetLoggerService(Log4NetLoggerService.Instance); // this is optional - DefaultLoggerService will be used if not set
    var logger = LoggerServiceFactory.GetLogger(typeof(HiveDataFrameExample));

    var sparkConf = new SparkConf();
    var sparkContext = new SparkContext(sparkConf);
    var hiveContext = new HiveContext(sparkContext);
    var peopleDataFrame = hiveContext.Read().Json(Path.Combine(Environment.CurrentDirectory, @"data\people.json"));

    const string dbName = "SampleHiveDataBaseForMobius";
    const string tableName = "people";

    hiveContext.Sql(string.Format("CREATE DATABASE IF NOT EXISTS {0}", dbName)); // create database if it does not exist
    hiveContext.Sql(string.Format("USE {0}", dbName));
    hiveContext.Sql(string.Format("DROP TABLE IF EXISTS {0}", tableName)); // drop table if it exists
    peopleDataFrame.Write().Mode(SaveMode.Overwrite).SaveAsTable(tableName); // create table

    var tablesDataFrame = hiveContext.Tables(dbName); // get all tables in database
    logger.LogInfo(string.Format("table count in database {0}: {1}", dbName, tablesDataFrame.Count()));
    tablesDataFrame.Show();

    hiveContext.Sql(string.Format("SELECT * FROM {0}", tableName)).Show(); // select from table
}
/// <summary>
/// Reads data from the XML file and retrieves the rows.
/// </summary>
private static void SparkXml()
{
    var sparkConf = new SparkConf();
    sparkConf.SetMaster("yarn");
    sparkConf.SetAppName("SparkXmlMobius");
    sparkContext = new SparkContext(sparkConf);
    var sqlContext = new SqlContext(sparkContext);

    var dataframe = sqlContext.Read()
        .Format("com.databricks.spark.xml")
        .Option("rowTag", "book")
        .Load(inputXmlFilePath);

    var rowCount = dataframe.Count();
    logger.LogInfo("****Row count is " + rowCount + "****");

    var rowCollections = dataframe.Collect();
    logger.LogInfo("**********************************************");
    foreach (var row in rowCollections)
    {
        Console.WriteLine("{0}", row);
    }
    logger.LogInfo("*********************************************");
    logger.LogInfo("Executed Successfully.................");
}
public void TestSqlContextGetConf()
{
    // arrange
    const string key = "key";
    const string value = "value";
    mockSqlContextProxy.Setup(m => m.GetConf(key, "")).Returns(value);
    var mockSparkContextProxy = new Mock<ISparkContextProxy>();
    var mockSparkSessionProxy = new Mock<ISparkSessionProxy>();
    var mockCatalogProxy = new Mock<ICatalogProxy>();
    mockCatalogProxy.Setup(m => m.RefreshTable(It.IsAny<string>()));
    mockSparkSessionProxy.Setup(m => m.GetCatalog()).Returns(mockCatalogProxy.Object);
    mockSparkContextProxy.Setup(m => m.CreateSparkSession()).Returns(mockSparkSessionProxy.Object);
    mockSparkSessionProxy.Setup(m => m.SqlContextProxy).Returns(mockSqlContextProxy.Object);
    var mockSparkConfProxy = new Mock<ISparkConfProxy>();
    mockSparkConfProxy.Setup(m => m.GetSparkConfAsString())
        .Returns("spark.master=master;spark.app.name=appname;config1=value1;config2=value2;");

    var conf = new SparkConf(mockSparkConfProxy.Object);
    var sqlContext = new SqlContext(new SparkContext(mockSparkContextProxy.Object, conf));
    sqlContext.SparkSession.SparkSessionProxy = mockSparkSessionProxy.Object;

    // act
    var actualValue = sqlContext.GetConf(key, "");

    // assert
    Assert.AreEqual(value, actualValue);
}
/// <summary>
/// Calculates the word count for the HDFS file.
/// </summary>
private static void WordCount()
{
    var sparkConf = new SparkConf();
    sparkConf.SetAppName("MobiusWordCountC#");
    sparkConf.SetMaster("yarn");
    sparkContext = new SparkContext(sparkConf);

    try
    {
        var lines = sparkContext.TextFile(hdfsFile);
        var counts = lines
            .FlatMap(x => x.Split(' '))
            .Map(w => new Tuple<string, int>(w, 1))
            .ReduceByKey((x, y) => x + y);

        logger.LogInfo("**********************************************");
        foreach (var wordcount in counts.Collect())
        {
            Console.WriteLine("{0}: {1}", wordcount.Item1, wordcount.Item2);
        }
        logger.LogInfo("**********************************************");
        logger.LogInfo("Executed Successfully.................");
    }
    catch (Exception ex)
    {
        logger.LogError("Error performing Word Count");
        logger.LogException(ex);
    }
}
/// <summary>
/// Calculates the value of Pi.
/// </summary>
private static void Pi()
{
    var sparkConf = new SparkConf();
    sparkConf.SetAppName("MobiusSimpleSamplePI");
    sparkConf.SetMaster("yarn");
    sparkContext = new SparkContext(sparkConf);

    try
    {
        const int slices = 3;
        var numberOfItems = (int)Math.Min(100000L * slices, int.MaxValue);
        var values = new List<int>(numberOfItems);
        for (var i = 0; i < numberOfItems; i++) // generate exactly numberOfItems samples
        {
            values.Add(i);
        }

        var rdd = sparkContext.Parallelize(values, slices);
        logger.LogInfo("Started Calculating Pi");
        CalculatePiUsingAnonymousMethod(numberOfItems, rdd);
        CalculatePiUsingSerializedClassApproach(numberOfItems, rdd);
        logger.LogInfo("Completed calculating the value of Pi");
        logger.LogInfo("Executed Successfully.................");
    }
    catch (Exception ex)
    {
        logger.LogError("Error calculating Pi");
        logger.LogException(ex);
    }
}
public void TestHiveContextRefreshTable()
{
    // arrange
    var mockSparkContextProxy = new Mock<ISparkContextProxy>();
    var mockSparkSessionProxy = new Mock<ISparkSessionProxy>();
    var mockCatalogProxy = new Mock<ICatalogProxy>();
    mockCatalogProxy.Setup(m => m.RefreshTable(It.IsAny<string>()));
    mockSparkSessionProxy.Setup(m => m.GetCatalog()).Returns(mockCatalogProxy.Object);
    mockSparkContextProxy.Setup(m => m.CreateSparkSession()).Returns(mockSparkSessionProxy.Object);
    var mockSparkConfProxy = new Mock<ISparkConfProxy>();
    mockSparkConfProxy.Setup(m => m.GetSparkConfAsString())
        .Returns("spark.master=master;spark.app.name=appname;config1=value1;config2=value2;");

    var conf = new SparkConf(mockSparkConfProxy.Object);
    var hiveContext = new HiveContext(new SparkContext(mockSparkContextProxy.Object, conf));
    hiveContext.SparkSession.SparkSessionProxy = mockSparkSessionProxy.Object;

    // act
    hiveContext.RefreshTable("table");

    // assert
    mockCatalogProxy.Verify(m => m.RefreshTable("table"));
}
public void TestSparkConf()
{
    var sparkConf = new SparkConf(false);
    sparkConf.SetMaster("master");
    sparkConf.SetAppName("test");
    sparkConf.SetSparkHome("test home");
    sparkConf.Set("key_string", "value");
    sparkConf.Set("key_int", "100");

    var expectedConfigs = new Dictionary<string, string>()
    {
        { "spark.master", "master" },
        { "spark.app.name", "test" },
        { "spark.home", "test home" },
        { "key_string", "value" },
        { "key_int", "100" }
    };

    foreach (KeyValuePair<string, string> kv in expectedConfigs)
    {
        Assert.Equal(kv.Value, sparkConf.Get(kv.Key, string.Empty));
    }

    Assert.Equal(100, sparkConf.GetInt("key_int", 0));

    // Validate GetAll().
    Dictionary<string, string> actualAllConfigs = sparkConf.GetAll().ToDictionary(x => x.Key, x => x.Value);
    Assert.Equal(expectedConfigs, actualAllConfigs);
}
/// <summary>
/// Reads from the SQL table using the given JDBC connection string.
/// </summary>
private static void JdbcDataFrame()
{
    if (!string.IsNullOrEmpty(connectionString) && !string.IsNullOrEmpty(tableName))
    {
        var sparkConf = new SparkConf();
        sparkConf.SetAppName("SqlConnectionFromMobius");
        sparkConf.SetMaster("yarn");
        sparkConf.Set("spark.sql.warehouse.dir", "/user/hive/warehouse");
        sparkContext = new SparkContext(sparkConf);
        var sqlContext = new SqlContext(sparkContext);

        var df = sqlContext
            .Read()
            .Jdbc(connectionString, tableName, new Dictionary<string, string>());

        var rowCount = df.Count();
        logger.LogInfo("****Row count is " + rowCount + "****");
        logger.LogInfo("Executed Successfully.................");
    }
    else
    {
        logger.LogInfo("****Please provide a correct connection string and table name****");
        GetValues();
        JdbcDataFrame();
    }
}
static void Main(string[] args)
{
    LoggerServiceFactory.SetLoggerService(Log4NetLoggerService.Instance); // this is optional - DefaultLoggerService will be used if not set
    var logger = LoggerServiceFactory.GetLogger(typeof(SparkXmlExample));

    var inputXmlFilePath = args[0];
    var outputXmlFilePath = args[1];

    var sparkConf = new SparkConf();
    sparkConf.SetAppName("myapp");
    var sparkContext = new SparkContext(sparkConf);
    var sqlContext = new SqlContext(sparkContext);

    var df = sqlContext.Read()
        .Format("com.databricks.spark.xml")
        .Option("rowTag", "book")
        .Load(inputXmlFilePath); // "D:\temp\books.xml", "file:/D:/temp/books.xml" or "hdfs://temp/books.xml"
    df.ShowSchema();

    var rowCount = df.Count();
    logger.LogInfo("Row count is " + rowCount);

    var selectedData = df.Select("author", "@id");
    selectedData.Write()
        .Format("com.databricks.spark.xml")
        .Option("rootTag", "books")
        .Option("rowTag", "book")
        .Save(outputXmlFilePath); // "D:\temp\booksUpdated.xml", "file:/D:/temp/booksUpdated.xml" or "hdfs://temp/booksUpdated.xml"

    sparkContext.Stop();
}
private static void InitializeSparkContext(string[] args)
{
    var sparkConf = new SparkConf();
    sparkConf.Set("spark.local.dir", args[0]);
    sparkConf.SetAppName("SparkCLR perf suite - C#");
    SparkContext = new SparkContext(sparkConf);
    SqlContext = new SqlContext(PerfBenchmark.SparkContext);
}
/// <summary>
/// Sets a list of config options based on the given SparkConf
/// </summary>
public Builder Config(SparkConf conf)
{
    foreach (KeyValuePair<string, string> keyValuePair in conf.GetAll())
    {
        _options[keyValuePair.Key] = keyValuePair.Value;
    }

    return this;
}
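// Usage sketch (assumption, not part of the original sample): shows how Config(SparkConf) above is
// typically driven from application code, following the Builder pattern used in the SparkSession
// examples elsewhere on this page. The master URL, extra config key and app name are illustrative
// values, and the snippet assumes the same usings as the surrounding examples.
var conf = new SparkConf();
conf.SetMaster("local[*]");                    // hypothetical: run locally for the sketch
conf.Set("spark.sql.shuffle.partitions", "4"); // hypothetical tuning value

SparkSession spark = SparkSession
    .Builder()
    .Config(conf)                   // copies every key/value pair from the SparkConf via GetAll()
    .AppName("ConfigFromSparkConf") // hypothetical app name
    .GetOrCreate();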
// Creates and returns a context
private static SparkContext CreateSparkContext()
{
    var conf = new SparkConf();
    if (Configuration.SparkLocalDirectoryOverride != null)
    {
        conf.Set("spark.local.dir", Configuration.SparkLocalDirectoryOverride);
    }

    return new SparkContext(conf);
}
internal static void DStreamDirectKafkaWithRepartitionSample()
{
    count = 0;

    string directory = SparkCLRSamples.Configuration.SampleDataLocation;
    string checkpointPath = Path.Combine(directory, "checkpoint");

    StreamingContext ssc = StreamingContext.GetOrCreate(checkpointPath, () =>
    {
        var conf = new SparkConf();
        SparkContext sc = new SparkContext(conf);
        StreamingContext context = new StreamingContext(sc, 2000L);
        context.Checkpoint(checkpointPath);

        var kafkaParams = new List<Tuple<string, string>>
        {
            new Tuple<string, string>("metadata.broker.list", brokers),
            new Tuple<string, string>("auto.offset.reset", "smallest")
        };

        conf.Set("spark.mobius.streaming.kafka.numPartitions." + topic, partitions.ToString());
        var dstream = KafkaUtils.CreateDirectStream(context, new List<string> { topic }, kafkaParams, Enumerable.Empty<Tuple<string, long>>());

        dstream.ForeachRDD((time, rdd) =>
        {
            long batchCount = rdd.Count();
            int numPartitions = rdd.GetNumPartitions();

            Console.WriteLine("-------------------------------------------");
            Console.WriteLine("Time: {0}", time);
            Console.WriteLine("-------------------------------------------");
            Console.WriteLine("Count: " + batchCount);
            Console.WriteLine("Partitions: " + numPartitions);

            // only the first batch has data and is repartitioned into 10 partitions
            if (count++ == 0)
            {
                Assert.AreEqual(messages, batchCount);
                Assert.IsTrue(numPartitions >= partitions);
            }
            else
            {
                Assert.AreEqual(0, batchCount);
                Assert.IsTrue(numPartitions == 0);
            }
        });

        return context;
    });

    ssc.Start();
    ssc.AwaitTermination();
}
static void Main(string[] args)
{
    SparkConf sparkConf = new SparkConf();
    SparkContext sc = new SparkContext(sparkConf);
    SqlContext sqlContext = new SqlContext(sc);
    var scriptEngine = new RoslynScriptEngine(sc, sqlContext);
    var repl = new Repl(scriptEngine, new ConsoleIoHandler());
    repl.Init();
    repl.Run();
    scriptEngine.Close();
}
/// <summary>
/// Gets an existing [[SparkSession]] or, if there is no existing one, creates a new
/// one based on the options set in this builder.
/// </summary>
/// <returns></returns>
public SparkSession GetOrCreate()
{
    var sparkConf = new SparkConf();
    foreach (var option in options)
    {
        sparkConf.Set(option.Key, option.Value);
    }
    var sparkContext = SparkContext.GetOrCreate(sparkConf);
    return SqlContext.GetOrCreate(sparkContext).SparkSession;
}
/// <summary>
/// Gets an existing [[SparkSession]] or, if there is no existing one, creates a new
/// one based on the options set in this builder.
/// </summary>
/// <returns></returns>
public SparkSession GetOrCreate()
{
    var sparkConf = new SparkConf();
    foreach (KeyValuePair<string, string> option in _options)
    {
        sparkConf.Set(option.Key, option.Value);
    }
    _jvmObject.Invoke("config", sparkConf);
    return new SparkSession((JvmObjectReference)_jvmObject.Invoke("getOrCreate"));
}
// Creates and returns a context
private static SparkContext CreateSparkContext()
{
    var conf = new SparkConf();
    conf.SetMaster(Env.SPARK_MASTER_URL);
    if (Configuration.SparkLocalDirectoryOverride != null)
    {
        conf.Set("spark.local.dir", Configuration.SparkLocalDirectoryOverride);
    }

    return new SparkContext(conf);
}
public static void Main(string[] args)
{
    // File used: https://www.kaggle.com/gbonesso/b3-stock-quotes/data?select=COTAHIST_A2009_to_A2020_P.csv
    // This POC calculates the average share price over the period.
    SparkConf sparkConf = new SparkConf();
    sparkConf.SetMaster("local[*]"); // '*' means use all available cores

    SparkSession spark = SparkSession
        .Builder()
        .Config(sparkConf)
        .AppName("SparkNetPOC")
        .GetOrCreate();

    Stopwatch sw = new Stopwatch();
    sw.Start();

    DataFrame dataFrameGeral = spark.Read()
        .Schema("vazio STRING, TIPREG STRING,DATPRE STRING,CODBDI STRING,CODNEG STRING,TPMERC STRING,NOMRES STRING,ESPECI STRING," +
                "PRAZOT STRING,MODREF STRING,PREABE STRING,PREMAX STRING,PREMIN STRING,PREMED STRING,PREULT STRING,PREOFC STRING," +
                "PREOFV STRING,TOTNEG STRING,QUATOT STRING," +
                "VOLTOT STRING,PREEXE STRING,INDOPC STRING,DATVEN STRING,FATCOT STRING,PTOEXE STRING,CODISI STRING,DISMES STRING")
        .Csv(@"C:\InternetDownloads\10318_1101179_compressed_COTAHIST_A2009_to_A2020_P.csv\COTAHIST_A2009_to_A2020_P.csv");

    DataFrame dataFrameColunasUteis = dataFrameGeral
        .Drop("vazio", "TIPREG", "DATPRE", "CODBDI", "TPMERC", "NOMRES", "ESPECI", "PRAZOT", "MODREF", "PREABE", "PREMIN",
              "PREMED", "PREULT", "PREOFC", "PREOFV", "TOTNEG", "QUATOT", "VOLTOT", "PREEXE", "INDOPC", "DATVEN", "FATCOT",
              "PTOEXE", "CODISI", "DISMES");

    DataFrame dataFrameFiltro = dataFrameColunasUteis
        .Filter("CODNEG = 'ITSA3' OR CODNEG = 'ABEV3' OR CODNEG = 'PETR4'");

    DataFrame dataFrameFinal = dataFrameFiltro
        .GroupBy("CODNEG")
        .Agg(Avg("PREMAX")); // Avg comes from Microsoft.Spark.Sql.Functions (using static)

    dataFrameFinal.Show();

    spark.Stop();

    sw.Stop();
    Console.WriteLine("Tempo = " + sw.ElapsedMilliseconds);
}
public void TestSparkConfMethods()
{
    var sparkConf = new SparkConf();

    sparkConf.SetMaster("masterUrl");
    Assert.AreEqual("masterUrl", sparkConf.Get(MockSparkConfProxy.MockMasterKey, ""));

    sparkConf.SetAppName("app name ");
    Assert.AreEqual("app name ", sparkConf.Get(MockSparkConfProxy.MockAppNameKey, ""));

    sparkConf.SetSparkHome(@"c:\path\to\sparkfolder");
    Assert.AreEqual(@"c:\path\to\sparkfolder", sparkConf.Get(MockSparkConfProxy.MockHomeKey, ""));

    Assert.AreEqual("default value", sparkConf.Get("non existent key", "default value"));
    Assert.AreEqual(3, sparkConf.GetInt("non existent key", 3));
}
/// <summary>
/// Creates and returns a context
/// </summary>
/// <returns>SparkContext</returns>
private static SparkContext CreateSparkContext()
{
    var conf = new SparkConf();

    // set up local directory
    var tempDir = Environment.GetEnvironmentVariable("spark.local.dir");
    if (string.IsNullOrEmpty(tempDir))
    {
        tempDir = Path.GetTempPath();
    }

    conf.Set("spark.local.dir", tempDir);
    Logger.DebugFormat("spark.local.dir is set to {0}", tempDir);

    return new SparkContext(conf);
}
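// Usage sketch (assumption, not part of the original sample): CreateSparkContext() above falls back
// to Path.GetTempPath() unless the "spark.local.dir" environment variable is set, so a caller in the
// same class can redirect Spark's scratch space by setting the variable before the call. The
// directory path below is hypothetical.
Environment.SetEnvironmentVariable("spark.local.dir", @"D:\tmp\spark-local"); // hypothetical path
var sparkContext = CreateSparkContext(); // picks up the value instead of the system temp folder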
static void Main(string[] args)
{
    LoggerServiceFactory.SetLoggerService(Log4NetLoggerService.Instance); // this is optional - DefaultLoggerService will be used if not set
    var logger = LoggerServiceFactory.GetLogger(typeof(JdbcDataFrameExample));

    var sparkConf = new SparkConf();
    var sparkContext = new SparkContext(sparkConf);
    var sqlContext = new SqlContext(sparkContext);

    var df = sqlContext.Read()
        .Jdbc("jdbc:sqlserver://localhost:1433;databaseName=Temp;integratedSecurity=true;", "xyz", new Dictionary<string, string>());
    df.ShowSchema();

    var rowCount = df.Count();
    logger.LogInfo("Row count is " + rowCount);
}
public RoslynScriptEngine(SparkContext sc, SqlContext sqlContext)
{
    this.sc = sc;
    sparkConf = sc.GetConf();
    host = new SparkCLRHost
    {
        sc = sc,
        sqlContext = sqlContext
    };

    var sparkLocalDir = sparkConf.Get("spark.local.dir", Path.GetTempPath());
    compilationDumpDirectory = Path.Combine(sparkLocalDir, Path.GetRandomFileName());
    Directory.CreateDirectory(compilationDumpDirectory);

    options = new CSharpParseOptions(LanguageVersion.CSharp6, DocumentationMode.Parse, SourceCodeKind.Script);
}
public void TestDStreamMapWithState()
{
    var mapwithStateDStreamProxy = new Mock<IDStreamProxy>();
    var streamingContextProxy = new Mock<IStreamingContextProxy>();
    streamingContextProxy.Setup(p =>
        p.CreateCSharpStateDStream(It.IsAny<IDStreamProxy>(), It.IsAny<byte[]>(), It.IsAny<string>(), It.IsAny<string>(), It.IsAny<string>()))
        .Returns(mapwithStateDStreamProxy.Object);

    var sparkContextProxy = new Mock<ISparkContextProxy>();
    var sparkConfProxy = new Mock<ISparkConfProxy>();
    var sparkClrProxy = new Mock<ISparkCLRProxy>();
    sparkClrProxy.Setup(p => p.StreamingContextProxy).Returns(streamingContextProxy.Object);
    sparkClrProxy.Setup(p => p.SparkContextProxy).Returns(sparkContextProxy.Object);
    sparkClrProxy.Setup(p => p.CreateSparkContext(It.IsAny<ISparkConfProxy>())).Returns(sparkContextProxy.Object);
    sparkClrProxy.Setup(p => p.CreateSparkConf(It.IsAny<bool>())).Returns(sparkConfProxy.Object);

    // reset SparkCLRProxy after the test completes
    var originalSparkCLRProxy = SparkCLREnvironment.SparkCLRProxy;
    try
    {
        SparkCLREnvironment.SparkCLRProxy = sparkClrProxy.Object;

        var sparkConf = new SparkConf(false);
        var ssc = new StreamingContext(new SparkContext(sparkContextProxy.Object, sparkConf), 10);

        var dstreamProxy = new Mock<IDStreamProxy>();
        var pairDStream = new DStream<KeyValuePair<string, int>>(dstreamProxy.Object, ssc);

        var stateSpec = new StateSpec<string, int, int, int>((k, v, s) => v);
        var stateDStream = pairDStream.MapWithState(stateSpec);
        var snapshotDStream = stateDStream.StateSnapshots();

        Assert.IsNotNull(stateDStream);
        Assert.IsNotNull(snapshotDStream);
    }
    finally
    {
        SparkCLREnvironment.SparkCLRProxy = originalSparkCLRProxy;
    }
}
static void Main(string[] args)
{
    if (args.Length < 2)
    {
        Console.WriteLine("Usage: HdfsWordCount <checkpointDirectory> <inputDirectory>");
        return;
    }

    string checkpointPath = args[0];
    string inputDir = args[1];

    StreamingContext ssc = StreamingContext.GetOrCreate(checkpointPath, () =>
    {
        var sparkConf = new SparkConf();
        sparkConf.SetAppName("HdfsWordCount");
        var sc = new SparkContext(sparkConf);
        StreamingContext context = new StreamingContext(sc, 30000);
        context.Checkpoint(checkpointPath);

        var lines = context.TextFileStream(inputDir);
        var words = lines.FlatMap(l => l.Split(' '));
        var pairs = words.Map(w => new KeyValuePair<string, int>(w, 1));
        var wordCounts = pairs.ReduceByKey((x, y) => x + y);

        wordCounts.ForeachRDD((time, rdd) =>
        {
            Console.WriteLine("-------------------------------------------");
            Console.WriteLine("Time: {0}", time);
            Console.WriteLine("-------------------------------------------");
            object[] taken = rdd.Take(10);
            foreach (object record in taken)
            {
                Console.WriteLine(record);
            }
            Console.WriteLine();
        });

        return context;
    });

    ssc.Start();
    ssc.AwaitTermination();
    ssc.Stop();
}
public void TestHiveContextConstructor()
{
    var mockSparkContextProxy = new Mock<ISparkContextProxy>();
    var mockSparkSessionProxy = new Mock<ISparkSessionProxy>();
    var mockCatalogProxy = new Mock<ICatalogProxy>();
    mockCatalogProxy.Setup(m => m.RefreshTable(It.IsAny<string>()));
    mockSparkSessionProxy.Setup(m => m.GetCatalog()).Returns(mockCatalogProxy.Object);
    mockSparkContextProxy.Setup(m => m.CreateSparkSession()).Returns(mockSparkSessionProxy.Object);
    var mockSparkConfProxy = new Mock<ISparkConfProxy>();
    mockSparkConfProxy.Setup(m => m.GetSparkConfAsString())
        .Returns("spark.master=master;spark.app.name=appname;config1=value1;config2=value2;");

    var conf = new SparkConf(mockSparkConfProxy.Object);
    var hiveContext = new HiveContext(new SparkContext(mockSparkContextProxy.Object, conf));

    Assert.IsNotNull(hiveContext.SparkSession);
}
static void Main(string[] args)
{
    LoggerServiceFactory.SetLoggerService(Log4NetLoggerService.Instance); // this is optional - DefaultLoggerService will be used if not set
    var logger = LoggerServiceFactory.GetLogger(typeof(JdbcDataFrameExample));

    // For SQL Server use the connection string formats below
    // "jdbc:sqlserver://localhost:1433;databaseName=Temp;integratedSecurity=true;" or
    // "jdbc:sqlserver://localhost;databaseName=Temp;user=MyUserName;password=myPassword;"
    var connectionString = args[0];
    var tableName = args[1];

    var sparkConf = new SparkConf();
    var sparkContext = new SparkContext(sparkConf);
    var sqlContext = new SqlContext(sparkContext);

    var df = sqlContext
        .Read()
        .Jdbc(connectionString, tableName, new Dictionary<string, string>());
    df.ShowSchema();

    var rowCount = df.Count();
    logger.LogInfo("Row count is " + rowCount);

    sparkContext.Stop();
}
static void Main(string[] args)
{
    var cassandraHostName = "localhost";
    var cassandraKeySpace = "ks";
    var cassandraTableToRead = "users";
    var cassandraTableToInsert = "filteredusers";

    if (args.Length == 4)
    {
        cassandraHostName = args[0];
        cassandraKeySpace = args[1];
        cassandraTableToRead = args[2];
        cassandraTableToInsert = args[3];
    }

    /*
    ** CQL used to create tables in Cassandra for this example
    **
    ** CREATE TABLE users (
    **     username VARCHAR,
    **     firstname VARCHAR,
    **     lastname VARCHAR,
    **     PRIMARY KEY (username)
    ** );
    **
    ** INSERT INTO ks.users (username, firstname, lastname) VALUES ('JD123', 'John', 'Doe');
    ** INSERT INTO ks.users (username, firstname, lastname) VALUES ('BillJ', 'Bill', 'Jones');
    ** INSERT INTO ks.users (username, firstname, lastname) VALUES ('SL', 'Steve', 'Little');
    **
    ** CREATE TABLE filteredusers (
    **     username VARCHAR,
    **     firstname VARCHAR,
    **     lastname VARCHAR,
    **     PRIMARY KEY (username)
    ** );
    */

    var sparkConf = new SparkConf().Set("spark.cassandra.connection.host", cassandraHostName);
    var sparkContext = new SparkContext(sparkConf);
    var sqlContext = new SqlContext(sparkContext);

    // read from cassandra table
    var usersDataFrame = sqlContext.Read()
        .Format("org.apache.spark.sql.cassandra")
        .Options(new Dictionary<string, string> { { "keyspace", cassandraKeySpace }, { "table", cassandraTableToRead } })
        .Load();

    // display rows in the console
    usersDataFrame.Show();

    var createTempTableStatement = string.Format(
        "CREATE TEMPORARY TABLE userstemp USING org.apache.spark.sql.cassandra OPTIONS(table \"{0}\", keyspace \"{1}\")",
        cassandraTableToRead,
        cassandraKeySpace);

    // create a temp table
    sqlContext.Sql(createTempTableStatement);

    // read from temp table, filter it and display schema and rows
    var filteredUsersDataFrame = sqlContext.Sql("SELECT * FROM userstemp").Filter("username = '******'");
    filteredUsersDataFrame.ShowSchema();
    filteredUsersDataFrame.Show();

    // write filtered rows to another table
    filteredUsersDataFrame.Write()
        .Format("org.apache.spark.sql.cassandra")
        .Options(new Dictionary<string, string> { { "keyspace", cassandraKeySpace }, { "table", cassandraTableToInsert } })
        .Save();

    // convert to RDD, execute map & filter and collect result
    var rddCollectedItems = usersDataFrame.ToRDD()
        .Map(r => string.Format("{0},{1},{2}",
            r.GetAs<string>("username"),
            r.GetAs<string>("firstname"),
            r.GetAs<string>("lastname")))
        .Filter(s => s.Contains("SL"))
        .Collect();

    foreach (var rddCollectedItem in rddCollectedItems)
    {
        Console.WriteLine(rddCollectedItem);
    }

    Console.WriteLine("Completed running example");
}
//private static RDD<string> getUserVisit
static void Main(string[] args)
{
    string filepath = @"hdfs:///common/vistizationData/";
    var OutputPath = @"hdfs:///user/t-zhuxia/vistizationRes/";

    string uetLogPath = filepath + "gat_20160902_0600.csv";
    var UICLogPath = filepath + "uic_20160902_0600.csv";
    string AnidPath = filepath + "ANID_20160831.csv";
    string MuidPath = filepath + "MUID_20160831.csv";
    var Visitization_AppInstall_Output = OutputPath + "Visitization_AppInstall_20160902_00";
    var NewEscrowFile = OutputPath + "NewEscrowCandidates_20160902";

    SparkConf conf = new SparkConf().SetAppName("VisitizationStreaming");
    SparkContext sc = new SparkContext(conf);

    RDD<string> rawUetLogs = getDataFromFile(sc, uetLogPath);
    var uetLogs = getUETLogs(rawUetLogs);

    var uetLogsKeyValpair = uetLogs.Map(line =>
    {
        if (!string.IsNullOrEmpty(line))
        {
            UETLogView data = UETLogView.Deserialize(line);
            string key = data.DedupKey + "," + data.ANID + "," + data.IsNewMUID + "," + data.UAIPId + "," +
                         data.ReferrerURL + "," + data.QueryString + "," + data.AnalyticsGuid;
            return new KeyValuePair<string, string>(key, line);
        }
        return new KeyValuePair<string, string>(null, null);
    });

    uetLogs = uetLogsKeyValpair.ReduceByKey((x, y) =>
    {
        if (!string.IsNullOrEmpty(x) && !string.IsNullOrEmpty(y))
        {
            return x + delimeter + y;
        }
        if (!string.IsNullOrEmpty(x))
        {
            return x;
        }
        if (!string.IsNullOrEmpty(y))
        {
            return y;
        }
        return null;
    }).Map<string>(UETLogDedupReducer.INSTANCE.GetData).Filter(line => !string.IsNullOrEmpty(line));

    /***************************************** to do after this ****************************************************/

    var uetLogs_PageVisit = uetLogs.Filter(line =>
    {
        UETLogView data = UETLogView.Deserialize(line);
        return string.IsNullOrEmpty(data.AppInstallClickId);
    });
    Console.Out.WriteLine("----------------uetLogs_PageVisitCount: " + uetLogs_PageVisit.Count());

    var uetLogs_AppInstall = uetLogs.Filter(line =>
    {
        UETLogView data = UETLogView.Deserialize(line);
        return !string.IsNullOrEmpty(data.AppInstallClickId);
    });

    RDD<string> appInstallVisits = uetLogs_AppInstall.Map<string>(AppInstallProcessor.INSTANCE.GetData);
    Console.Out.WriteLine("----------------appInstallVisitsCount: " + appInstallVisits.Count());
    //appInstallVisits.Repartition(1).SaveAsTextFile(Visitization_AppInstall_Output);

    //----- Get UIC log
    var uicRaw = getDataFromFile(sc, UICLogPath);
    var UserIdConverage = getUICData(uicRaw);

    //----- Join uetlog with uic log
    var uetColumns = uetLogs_PageVisit.Map(line =>
    {
        var uetLog = UETLogView.Deserialize(line);
        return new KeyValuePair<Guid?, string>(uetLog.UETMatchingGuid, line);
    });

    var uicColumns = UserIdConverage.Map(line =>
    {
        var uic = UserIdCoverageShcema.Deserialize(line);
        return new KeyValuePair<Guid?, Guid?>(uic.UETMatchingGuid, uic.AnalyticsGuid);
    });

    var UETLogProcessedEntriesPageVisit = uetColumns.LeftOuterJoin(uicColumns).Map(line =>
    {
        var value = UETLogView.Deserialize(line.Value.Item1);
        if (line.Value.Item2.IsDefined)
        {
            var agid = line.Value.Item2.GetValue();
            if (agid != null)
            {
                value.AnalyticsGuid = agid;
            }
            value.DedupKey = null;
            value.QueryString = null;
        }
        return UETLogView.Serialize(value);
    });

    var visitsForUsersKeyValuePair = UETLogProcessedEntriesPageVisit.Map(line =>
    {
        var value = UETLogView.Deserialize(line);
        var key = value.UAIPId.ToString() + "," + value.TagId.ToString();
        return new KeyValuePair<string, string>(key, line);
    }).ReduceByKey((x, y) => x + delimeter + y);

    var visitsForUsers = visitsForUsersKeyValuePair.FlatMap<string>(line =>
    {
        return VisitizeReducer.INSTANCE.GetData(line);
    });

    // Step 7: The first field to fill is UserIdType; build the general "UETUserId", which by default is UAIPID during the construction of SAEventConversionFacts.
    // Step 7.1: Build the TypeOfUser field.
    // The way of deciding the TypeOfUser is:
    // 1. If MUID is not NULL and IsNewMUID is false, UserIdType is MUID (TypeOfUser 2); later it will join with the UMS MUID view.
    // 2. If MUID is NULL but ANID is not, UserIdType is ANID (TypeOfUser 1); later it will join with the UMS ANID view.
    // 3. If both MUID and ANID are NULL but AnalyticsGuid is not NULL, UserIdType is AnalyticsGuid (TypeOfUser 3).
    // 4. If AnalyticsGuid is also NULL, UserIdType is Unknown (TypeOfUser -1).
    var VisitForUserWithTypeOfUser = getVisitsForUsersWithTypeOfUser(visitsForUsers);

    // Step 7.2: Get the ANID and MUID sub-tables out of VisitsForUsers_WithTypeOfUser because we need to update
    // the ANID/MUID to StableIdValue according to the UMS mapping.
    var VisitsForUsers_WithTypeOfUser_ANID = VisitForUserWithTypeOfUser.Filter(line =>
    {
        var data = VisitsForUser_WithTypeOfUser.Deserialize(line);
        return data.TypeOfUser == 1;
    });

    var VisitsForUsers_WithTypeOfUser_MUID = VisitForUserWithTypeOfUser.Filter(line =>
    {
        var data = VisitsForUser_WithTypeOfUser.Deserialize(line);
        return data.TypeOfUser == 2;
    });

    // Step 7.3: Build the UMS ANID/MUID view from "/shares/adCenter.BICore.SubjectArea/SubjectArea/Conversion/UMS/ANID_{yyyyMMdd}.ss(12.43GB)/MUID_{yyyyMMdd}.ss(166.66GB)"
    var UMS_ANIDData = getDataFromFile(sc, AnidPath);
    var UMS_MUIDData = getDataFromFile(sc, MuidPath);

    // Step 7.4: Join VisitsForUsers_WithTypeOfUser_ANID(MUID) with UMS_ANID(MUID)_MappingFile to get the StableIdValue.
    var VisitsForUsers_WithStableIdANIDGuid = VisitsForUsers_WithTypeOfUser_ANID.Map(line =>
    {
        var data = VisitsForUser_WithTypeOfUser.Deserialize(line);
        return data.ANID;
    });
    Console.Out.WriteLine("----------------VisitsForUsers_WithStableIdANIDGuid: " + VisitsForUsers_WithStableIdANIDGuid.Count());

    var VisitsForUsers_WithStableIdMUIDGuid = VisitsForUsers_WithTypeOfUser_MUID.Map(line =>
    {
        var data = VisitsForUser_WithTypeOfUser.Deserialize(line);
        return data.MUID;
    });
    Console.Out.WriteLine("----------------VisitsForUsers_WithStableIdMUIDGuid: " + VisitsForUsers_WithStableIdMUIDGuid.Count());

    var anid = getUMS_ANIDData(UMS_ANIDData).Map<KeyValuePair<Guid?, SerializaType>>(line =>
    {
        var an = line.DeserializeObject<UMS_ANID>();
        return new KeyValuePair<Guid?, SerializaType>(an.ANID, line);
    }).FlatMap<KeyValuePair<Guid?, SerializaType>>(new BroadcastJoinWrapper(VisitsForUsers_WithStableIdANIDGuid, sc).Filter);

    var muid = getUMS_MUIDData(UMS_MUIDData).Map<KeyValuePair<Guid?, SerializaType>>(line =>
    {
        var an = line.DeserializeObject<UMS_MUID>();
        return new KeyValuePair<Guid?, SerializaType>(an.MUID, line);
    }).FlatMap<KeyValuePair<Guid?, SerializaType>>(new BroadcastJoinWrapper(VisitsForUsers_WithStableIdMUIDGuid, sc).Filter);

    var VisitsForUsers_WithStableIdFromANID = VisitsForUsers_WithTypeOfUser_ANID.Map(line =>
    {
        VisitsForUser_WithTypeOfUser data = VisitsForUser_WithTypeOfUser.Deserialize(line);
        return new KeyValuePair<Guid?, SerializaType>(data.ANID, line);
    }).LeftOuterJoin(anid).Map(line =>
    {
        VisitsForUser_WithTypeOfUser data = VisitsForUser_WithTypeOfUser.Deserialize(line.Value.Item1);
        var VA = new VisitsForUsersWithStableIdFromID();
        VA.UAIPId = data.UAIPId;
        VA.TagId = data.TagId;
        VA.TagName = data.TagName;
        VA.AnalyticsGuid = data.AnalyticsGuid;
        VA.SAEventConversionFactsRow = data.SAEventConversionFactsRow;
        if (line.Value.Item2.IsDefined)
        {
            var an = line.Value.Item2.GetValue().DeserializeObject<UMS_ANID>();
            VA.StableId = an.ANID;
        }
        else
        {
            VA.StableId = data.ANID;
        }
        return VA.SerializeObject();
    });

    var VisitsForUsers_WithStableIdFromMUID = VisitsForUsers_WithTypeOfUser_MUID.Map(line =>
    {
        VisitsForUser_WithTypeOfUser data = VisitsForUser_WithTypeOfUser.Deserialize(line);
        return new KeyValuePair<Guid?, SerializaType>(data.MUID, line);
    }).LeftOuterJoin(muid).Map(line =>
    {
        VisitsForUser_WithTypeOfUser data = VisitsForUser_WithTypeOfUser.Deserialize(line.Value.Item1);
        var VA = new VisitsForUsersWithStableIdFromID();
        VA.UAIPId = data.UAIPId;
        VA.TagId = data.TagId;
        VA.TagName = data.TagName;
        VA.AnalyticsGuid = data.AnalyticsGuid;
        VA.SAEventConversionFactsRow = data.SAEventConversionFactsRow;
        if (line.Value.Item2.IsDefined)
        {
            var an = line.Value.Item2.GetValue().DeserializeObject<UMS_MUID>();
            VA.StableId = an.MUID;
        }
        else
        {
            VA.StableId = data.MUID;
        }
        return VA.SerializeObject();
    });

    Console.WriteLine("-----------------VisitsForUsers_WithStableIdFromANID: " + VisitsForUsers_WithStableIdFromANID.Count());
    Console.WriteLine("-----------------VisitsForUsers_WithStableIdFromMUID: " + VisitsForUsers_WithStableIdFromMUID.Count());

    // Step 7.5: Select the UETUserId from the StableId and add the UserType according to whether it came from ANID or MUID.
    var VisitsForUsers_WithUETUserId_MUID_ANID_UNION_Part1 = VisitsForUsers_WithStableIdFromANID.Map(line =>
    {
        var VA = line.DeserializeObject<VisitsForUsersWithStableIdFromID>();
        VisitsForUsersWithUETUserIdMUIDANIDPart data = new VisitsForUsersWithUETUserIdMUIDANIDPart();
        data.UETUserId = VA.StableId;
        data.TypeOfUser = UserType.A;
        data.UAIPId = VA.UAIPId;
        data.TagId = VA.TagId;
        data.TagName = VA.TagName;
        data.AnalyticsGuid = VA.AnalyticsGuid;
        data.SAEventConversionFactsRow = VA.SAEventConversionFactsRow;
        return data.SerializeObject();
    });

    var VisitsForUsers_WithUETUserId_MUID_ANID_UNION_Part2 = VisitsForUsers_WithStableIdFromMUID.Map(line =>
    {
        var VA = line.DeserializeObject<VisitsForUsersWithStableIdFromID>();
        VisitsForUsersWithUETUserIdMUIDANIDPart data = new VisitsForUsersWithUETUserIdMUIDANIDPart();
        data.UETUserId = VA.StableId;
        data.TypeOfUser = UserType.M;
        data.UAIPId = VA.UAIPId;
        data.TagId = VA.TagId;
        data.TagName = VA.TagName;
        data.AnalyticsGuid = VA.AnalyticsGuid;
        data.SAEventConversionFactsRow = VA.SAEventConversionFactsRow;
        return data.SerializeObject();
    });

    var VisitsForUsers_WithUETUserId_MUID_ANID_UNION_Part = VisitsForUsers_WithUETUserId_MUID_ANID_UNION_Part2.Union(VisitsForUsers_WithUETUserId_MUID_ANID_UNION_Part1);

    // Step 7.6: For the AnalyticsGuid sub-table of VisitsForUsers_WithTypeOfUser, use AnalyticsGuid as the UETUserId and "AG" as the UserType.
    var VisitsForUsers_WithUETUserId_AnalyticsGuid_Other_UNION_Part = VisitForUserWithTypeOfUser.Filter(line =>
    {
        var data = VisitsForUser_WithTypeOfUser.Deserialize(line);
        return data.TypeOfUser == 3 || data.TypeOfUser == -1;
    }).Map(line =>
    {
        var Visits = VisitsForUser_WithTypeOfUser.Deserialize(line);
        VisitsForUsersWithUETUserIdMUIDANIDPart data = new VisitsForUsersWithUETUserIdMUIDANIDPart();
        data.UAIPId = Visits.UAIPId;
        data.TagId = Visits.TagId;
        data.TagName = Visits.TagName;
        data.AnalyticsGuid = Visits.AnalyticsGuid;
        data.SAEventConversionFactsRow = Visits.SAEventConversionFactsRow;
        if (Visits.TypeOfUser == 3)
        {
            data.UETUserId = Visits.AnalyticsGuid;
            data.TypeOfUser = UserType.AG;
        }
        else
        {
            data.UETUserId = Visits.UAIPId;
            data.TypeOfUser = UserType.UA;
        }
        return data.SerializeObject();
    });

    // Step 7.7: Union the results from 7.5 and 7.6.
    var VisitsForUsers_WithUETUserId = VisitsForUsers_WithUETUserId_MUID_ANID_UNION_Part.Union(VisitsForUsers_WithUETUserId_AnalyticsGuid_Other_UNION_Part);

    // Step 7.8: Reduce on UETUserId, UAIPId, TagId, using UserCombineReducer.
    VisitsForUsers_WithUETUserId = VisitsForUsers_WithUETUserId.Map(line =>
    {
        var data = line.DeserializeObject<VisitsForUsersWithUETUserIdMUIDANIDPart>();
        return new VisitsForUsersWithUETUserId(data, data.SAEventConversionFactsRow.Visits[0].Events[0].EventDateTime).SerializeObject();
    });

    var VisitsForUsers_Current = VisitsForUsers_WithUETUserId
        .Map(line =>
        {
            var data = line.DeserializeObject<VisitsForUsersWithUETUserId>();
            return new KeyValuePair<long, string>(data.EventDateTime, line);
        })
        .SortByKey()
        .Map(line =>
        {
            var data = line.Value.DeserializeObject<VisitsForUsersWithUETUserId>();
            var key = string.Format("{0},{1},{2}", data.UETUserId, data.UAIPId, data.TagId);
            return new KeyValuePair<string, string>(key, line.Value);
        })
        .ReduceByKey((x, y) =>
        {
            if (!string.IsNullOrEmpty(x) && !string.IsNullOrEmpty(y))
            {
                return x + delimeter + y;
            }
            if (!string.IsNullOrEmpty(x))
            {
                return x;
            }
            if (!string.IsNullOrEmpty(y))
            {
                return y;
            }
            return null;
        }).Map<SerializaType>(UserCombineReducer.INSTANCE.getData);

    // Step 8: Handle the current hour's result with Escrow visits from the previous hour:
    // as the EscrowFile does not exist, this step is skipped.

    // Step 9: Calculate conversions for each visit using GoalConversionProcessor and output it.
    var VisitsWithConversions = VisitsForUsers_Current.MapPartitions(GoalConversionProcessor.INSTANCE.getData);

    // Step 10: Update the Escrow file.
    var VisitsWithConversions_notUAIP = VisitsWithConversions.Filter(line =>
    {
        var data = line.DeserializeObject<VisitsWithConversion>();
        return data.SAEventConversionFactsRow.UserIdType != UETUserIdType.UAIPID;
    });

    var NewEscrowCandidates = VisitsWithConversions_notUAIP.MapPartitions(EscrowCandidateProcessor.INSTANCE.getData);

    // Step 10.2: Output the result to the new escrow file.
    NewEscrowCandidates.Repartition(1).SaveAsTextFile(NewEscrowFile);

    return;
}