public void Schema(StructType schema)
{
    var structTypeIpcProxy = schema.StructTypeProxy as StructTypeIpcProxy;
    if (structTypeIpcProxy != null)
    {
        SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmDataFrameReaderReference, "schema",
            new object[] { structTypeIpcProxy.JvmStructTypeReference });
    }
}
public void TestDTypes()
{
    var requestsSchema = new StructType(new List<StructField>
    {
        new StructField("test", new StringType(), false),
    });
    var jsonValue = requestsSchema.JsonValue.ToString();

    Mock<IStructTypeProxy> mockStructTypeProxy = new Mock<IStructTypeProxy>();
    mockStructTypeProxy.Setup(m => m.ToJson()).Returns(jsonValue);

    Mock<IStructFieldProxy> mockStructFieldProxy = new Mock<IStructFieldProxy>();
    mockStructFieldProxy.Setup(m => m.GetStructFieldName()).Returns("testcol");

    Mock<IStructDataTypeProxy> mockStructDataTypeProxy = new Mock<IStructDataTypeProxy>();
    mockStructDataTypeProxy.Setup(m => m.GetDataTypeSimpleString()).Returns("ss");
    mockStructFieldProxy.Setup(m => m.GetStructFieldDataType()).Returns(mockStructDataTypeProxy.Object);
    mockStructTypeProxy.Setup(m => m.GetStructTypeFields())
        .Returns(new List<IStructFieldProxy>() { mockStructFieldProxy.Object });

    Mock<IDataFrameProxy> mockDataFrameProxy = new Mock<IDataFrameProxy>();
    mockDataFrameProxy.Setup(m => m.GetSchema()).Returns(mockStructTypeProxy.Object);
    mockDatasetProxy.Setup(m => m.ToDF()).Returns(mockDataFrameProxy.Object);

    var dataset = new Dataset(mockDatasetProxy.Object);
    var dTypes = dataset.DTypes();

    Assert.AreEqual(1, dTypes.Count());
    var first = dTypes.First();
    Assert.AreEqual("testcol", first.Item1);
    Assert.AreEqual("ss", first.Item2);
}
public IDataFrameProxy ReadDataFrame(string path, StructType schema, Dictionary<string, string> options)
{
    // TODO: parameter Dictionary<string, string> options is not used right now - it is meant to be passed on to data sources
    return new DataFrameIpcProxy(
        new JvmObjectReference(
            SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod(
                "org.apache.spark.sql.api.csharp.SQLUtils", "loadDF",
                new object[] { jvmSqlContextReference, path, (schema.StructTypeProxy as StructTypeIpcProxy).JvmStructTypeReference }).ToString()),
        this);
}
public IDataFrameProxy TextFile(string path, StructType schema, string delimiter)
{
    return new DataFrameIpcProxy(
        new JvmObjectReference(
            SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod(
                "org.apache.spark.sql.api.csharp.SQLUtils", "loadTextFile",
                new object[] { jvmSqlContextReference, path, delimiter, (schema.StructTypeProxy as StructTypeIpcProxy).JvmStructTypeReference }).ToString()),
        this);
}
internal static void DFCreateDataFrameSample()
{
    var schemaPeople = new StructType(new List<StructField>
    {
        new StructField("id", new StringType()),
        new StructField("name", new StringType()),
        new StructField("age", new IntegerType()),
        new StructField("address", new StructType(new List<StructField>
        {
            new StructField("city", new StringType()),
            new StructField("state", new StringType())
        })),
        new StructField("phone numbers", new ArrayType(new StringType()))
    });

    var rddPeople = SparkCLRSamples.SparkContext.Parallelize(
        new List<object[]>
        {
            new object[] { "123", "Bill", 43, new object[] { "Columbus", "Ohio" }, new string[] { "Tel1", "Tel2" } },
            new object[] { "456", "Steve", 34, new object[] { "Seattle", "Washington" }, new string[] { "Tel3", "Tel4" } }
        });

    var dataFramePeople = GetSqlContext().CreateDataFrame(rddPeople, schemaPeople);
    Console.WriteLine("------ Schema of People Data Frame:\r\n");
    dataFramePeople.ShowSchema();
    Console.WriteLine();

    var collected = dataFramePeople.Collect().ToArray();
    foreach (var people in collected)
    {
        string id = people.Get("id");
        string name = people.Get("name");
        int age = people.Get("age");
        Row address = people.Get("address");
        string city = address.Get("city");
        string state = address.Get("state");
        object[] phoneNumbers = people.Get("phone numbers");
        Console.WriteLine("id:{0}, name:{1}, age:{2}, address:(city:{3},state:{4}), phoneNumbers:[{5},{6}]\r\n",
            id, name, age, city, state, phoneNumbers[0], phoneNumbers[1]);
    }

    if (SparkCLRSamples.Configuration.IsValidationEnabled)
    {
        Assert.AreEqual(2, dataFramePeople.Rdd.Count());
        Assert.AreEqual(schemaPeople.Json, dataFramePeople.Schema.Json);
    }
}
/// <summary>
/// Specifies the input schema. Some data sources (e.g. JSON) can infer the input schema
/// automatically from data. By specifying the schema here, the underlying data source can
/// skip the schema inference step and thus speed up data loading.
/// </summary>
public DataFrameReader Schema(StructType schema)
{
    dataFrameReaderProxy.Schema(schema);
    return this;
}
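// Usage sketch (illustrative, not from the source): supplying an explicit schema before
// loading JSON so the reader can skip schema inference. The sqlContext variable and the
// "people.json" path are assumptions for this example.
var peopleSchema = new StructType(new List<StructField>
{
    new StructField("name", new StringType()),
    new StructField("age", new IntegerType())
});
DataFrame people = sqlContext.Read().Schema(peopleSchema).Json("people.json");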
public void TestSchema()
{
    // arrange
    mockDataFrameReaderProxy.Setup(m => m.Schema(It.IsAny<StructType>()));
    var dataFrameReader = new DataFrameReader(mockDataFrameReaderProxy.Object, sparkContext);
    const string jsonSchema = @"
    {
        ""type"" : ""struct"",
        ""fields"" : [ {
            ""name"" : ""address"",
            ""type"" : {
                ""type"" : ""struct"",
                ""fields"" : [ {
                    ""name"" : ""city"",
                    ""type"" : ""string"",
                    ""nullable"" : true,
                    ""metadata"" : { }
                }, {
                    ""name"" : ""state"",
                    ""type"" : ""string"",
                    ""nullable"" : true,
                    ""metadata"" : { }
                } ]
            },
            ""nullable"" : true,
            ""metadata"" : { }
        }, {
            ""name"" : ""age"",
            ""type"" : ""long"",
            ""nullable"" : true,
            ""metadata"" : { }
        }, {
            ""name"" : ""id"",
            ""type"" : ""string"",
            ""nullable"" : true,
            ""metadata"" : { }
        }, {
            ""name"" : ""name"",
            ""type"" : ""string"",
            ""nullable"" : true,
            ""metadata"" : { }
        } ]
    }";
    var mockStructTypeProxy = new MockStructTypeProxy(jsonSchema);
    var schema = new StructType(mockStructTypeProxy);

    // act
    var reader = dataFrameReader.Schema(schema);

    // verify
    Assert.IsNotNull(reader);
    Assert.AreSame(reader, dataFrameReader);
    mockDataFrameReaderProxy.Verify(m => m.Schema(schema), Times.Once);
}
/// <summary>
/// Verify the schema of the people dataframe.
/// </summary>
/// <param name="schema">RowSchema of people DataFrame</param>
internal static void VerifySchemaOfPeopleDataFrame(StructType schema)
{
    Assert.IsNotNull(schema);
    Assert.AreEqual("struct", schema.TypeName);
    Assert.IsNotNull(schema.Fields);
    Assert.AreEqual(4, schema.Fields.Count);

    // name
    var nameColSchema = schema.Fields.Find(c => c.Name.Equals("name"));
    Assert.IsNotNull(nameColSchema);
    Assert.AreEqual("name", nameColSchema.Name);
    Assert.IsTrue(nameColSchema.IsNullable);
    Assert.AreEqual("string", nameColSchema.DataType.TypeName);

    // id
    var idColSchema = schema.Fields.Find(c => c.Name.Equals("id"));
    Assert.IsNotNull(idColSchema);
    Assert.AreEqual("id", idColSchema.Name);
    Assert.IsTrue(idColSchema.IsNullable);
    Assert.AreEqual("string", idColSchema.DataType.TypeName);

    // age
    var ageColSchema = schema.Fields.Find(c => c.Name.Equals("age"));
    Assert.IsNotNull(ageColSchema);
    Assert.AreEqual("age", ageColSchema.Name);
    Assert.IsTrue(ageColSchema.IsNullable);
    Assert.AreEqual("long", ageColSchema.DataType.TypeName);

    // address
    var addressColSchema = schema.Fields.Find(c => c.Name.Equals("address"));
    Assert.IsNotNull(addressColSchema);
    Assert.AreEqual("address", addressColSchema.Name);
    Assert.IsTrue(addressColSchema.IsNullable);
    Assert.IsNotNull(addressColSchema.DataType);
    Assert.AreEqual("struct", addressColSchema.DataType.TypeName);
    Assert.IsNotNull(((StructType)addressColSchema.DataType).Fields.Find(c => c.Name.Equals("state")));
    Assert.IsNotNull(((StructType)addressColSchema.DataType).Fields.Find(c => c.Name.Equals("city")));
}
internal static void DFTextFileLoadDataFrameSample()
{
    var requestsSchema = new StructType(new List<StructField>
    {
        new StructField("guid", new StringType(), false),
        new StructField("datacenter", new StringType(), false),
        new StructField("abtestid", new StringType(), false),
        new StructField("traffictype", new StringType(), false),
    });

    var requestsDataFrame = GetSqlContext().TextFile(SparkCLRSamples.Configuration.GetInputDataPath(RequestsLog), requestsSchema);
    requestsDataFrame.RegisterTempTable("requests");

    var guidFilteredDataFrame = GetSqlContext().Sql(
        "SELECT guid, datacenter FROM requests WHERE guid = '4628deca-139d-4121-b540-8341b9c05c2a'");
    guidFilteredDataFrame.Show();

    requestsDataFrame.ShowSchema();
    requestsDataFrame.Show();
    var count = requestsDataFrame.Count();

    guidFilteredDataFrame.ShowSchema();
    guidFilteredDataFrame.Show();
    var filteredCount = guidFilteredDataFrame.Count();

    if (SparkCLRSamples.Configuration.IsValidationEnabled)
    {
        Assert.AreEqual(10, count);
        Assert.AreEqual(1, filteredCount);
    }
}
static RowHelper()
{
    BasicSchema = DataType.ParseDataTypeFromJson(BasicJsonSchema) as StructType;
    ComplexSchema = DataType.ParseDataTypeFromJson(ComplexJsonSchema) as StructType;
}
public DataFrame JsonFile(string path, StructType schema)
{
    return Read().Schema(schema).Json(path);
}
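// Usage sketch (illustrative): JsonFile is a convenience wrapper over the chained
// Read().Schema().Json() call above. The path and the peopleSchema variable are
// assumptions for this example.
DataFrame peopleFromFile = sqlContext.JsonFile(@"hdfs://path/to/people.json", peopleSchema);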
/// <summary>
/// Loads a dataframe from the source path using the given schema and options
/// </summary>
/// <param name="path">path to the data source</param>
/// <param name="schema">schema to use</param>
/// <param name="options">options to pass on to the data source</param>
/// <returns>DataFrame loaded from the data source</returns>
public DataFrame ReadDataFrame(string path, StructType schema, Dictionary<string, string> options)
{
    return new DataFrame(sqlContextProxy.ReadDataFrame(path, schema, options), sparkContext);
}
/// <summary>
/// Creates a <see cref="DataFrame"/> from an RDD containing arrays of objects using the given schema.
/// </summary>
/// <param name="rdd">RDD containing arrays of objects. Each array acts as a row, and items within the array act as the columns described by <paramref name="schema"/>.</param>
/// <param name="schema">The schema of the DataFrame.</param>
/// <returns>DataFrame created from the RDD</returns>
public DataFrame CreateDataFrame(RDD<object[]> rdd, StructType schema)
{
    // Note: pickling the RDD (conversion to RDD<byte[]>) happens in CSharpWorker.
    // sparkSessionProxy.CreateDataFrame() below calls byteArrayRDDToAnyArrayRDD() in SQLUtils.scala,
    // which only accepts an RDD of type RDD[Array[Byte]]. byteArrayRDDToAnyArrayRDD() in turn calls
    // SerDeUtil.pythonToJava(), which is a mapPartitions internally and does not execute until
    // CSharpWorker finishes pickling to RDD[Array[Byte]].
    var rddRow = rdd.Map(r => r);
    rddRow.serializedMode = SerializedMode.Row;

    return new DataFrame(sparkSessionProxy.CreateDataFrame(rddRow.RddProxy, schema.StructTypeProxy), sparkContext);
}
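// Usage sketch (illustrative): each object[] is one row whose items align positionally with
// the schema fields. The sc and sqlContext variables and the row values are assumptions;
// see DFCreateDataFrameSample above for a fuller, source-provided example.
var personSchema = new StructType(new List<StructField>
{
    new StructField("name", new StringType()),
    new StructField("age", new IntegerType())
});
var personRows = sc.Parallelize(new List<object[]>
{
    new object[] { "Bill", 43 },
    new object[] { "Steve", 34 }
});
DataFrame personDf = sqlContext.CreateDataFrame(personRows, personSchema);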
public IDataFrameProxy ReadDataFrame(string path, StructType schema, System.Collections.Generic.Dictionary<string, string> options)
{
    throw new NotImplementedException();
}
public void TestSchema(bool usePrintSchema)
{
    var requestsSchema = new StructType(new List<StructField>
    {
        new StructField("test", new StringType(), false),
    });
    var jsonValue = requestsSchema.JsonValue.ToString();

    Mock<IStructTypeProxy> mockStructTypeProxy = new Mock<IStructTypeProxy>();
    mockStructTypeProxy.Setup(m => m.ToJson()).Returns(jsonValue);

    Mock<IDataFrameProxy> mockDataFrameProxy = new Mock<IDataFrameProxy>();
    mockDataFrameProxy.Setup(m => m.GetSchema()).Returns(mockStructTypeProxy.Object);
    mockDatasetProxy.Setup(m => m.ToDF()).Returns(mockDataFrameProxy.Object);

    var dataset = new Dataset(mockDatasetProxy.Object);
    if (usePrintSchema)
        dataset.PrintSchema();
    else
        dataset.ShowSchema();

    mockDataFrameProxy.Verify(m => m.GetSchema(), Times.Once);
    mockStructTypeProxy.Verify(m => m.ToJson(), Times.Once());
}
/// <summary>
/// Loads a text file with the specified column delimiter using the given schema
/// </summary>
/// <param name="path">path to text file</param>
/// <param name="schema">schema to use</param>
/// <param name="delimiter">delimiter to use</param>
/// <returns>DataFrame created from the text file</returns>
public DataFrame TextFile(string path, StructType schema, string delimiter = ",")
{
    return new DataFrame(sqlContextProxy.TextFile(path, schema, delimiter), sparkContext);
}
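// Usage sketch (illustrative): loading a pipe-delimited text file with an explicit schema,
// overriding the default "," delimiter. The path and column names are assumptions.
var logSchema = new StructType(new List<StructField>
{
    new StructField("guid", new StringType(), false),
    new StructField("datacenter", new StringType(), false)
});
DataFrame logDf = sqlContext.TextFile(@"hdfs://path/to/requests.log", logSchema, "|");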
/// <summary>
/// Loads a JSON file (one object per line) and applies the given schema
/// </summary>
/// <param name="path">path to JSON file</param>
/// <param name="schema">schema to use</param>
/// <returns></returns>
public DataFrame JsonFile(string path, StructType schema)
{
    throw new NotImplementedException();
}
public DataFrame CreateDataFrame(RDD<byte[]> rdd, StructType schema)
{
    throw new NotImplementedException();
}
/// <summary>
/// Loads a dataframe from the source path using the given schema and options
/// </summary>
/// <param name="path">path to the data source</param>
/// <param name="schema">schema to use</param>
/// <param name="options">options to pass on to the data source</param>
/// <returns>DataFrame loaded from the data source</returns>
public DataFrame ReadDataFrame(string path, StructType schema, Dictionary<string, string> options)
{
    logger.LogInfo("Reading DataFrame from file {0}", path);
    return new DataFrame(sqlContextProxy.ReadDataFrame(path, schema, options), sparkContext);
}
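// Usage sketch (illustrative): the option key shown is an assumption, and per the TODO in the
// IPC proxy's ReadDataFrame above, the options dictionary is not yet forwarded to the data source.
var readOptions = new Dictionary<string, string> { { "samplingRatio", "0.5" } };
DataFrame readDf = sqlContext.ReadDataFrame(@"hdfs://path/to/data", peopleSchema, readOptions);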
public DataFrame CreateExternalTable(string tableName, string source, StructType schema, Dictionary<string, string> options)
{
    throw new NotImplementedException(); // TODO - implement
}
public void TestSqlContextTextFile()
{
    var sqlContext = new SqlContext(new SparkContext("", ""));
    var dataFrame = sqlContext.TextFile(@"c:\path\to\input.txt");
    var paramValuesToTextFileMethod = (dataFrame.DataFrameProxy as MockDataFrameProxy).mockDataFrameReference;
    Assert.AreEqual(@"c:\path\to\input.txt", paramValuesToTextFileMethod[0]);
    Assert.AreEqual(@",", paramValuesToTextFileMethod[1]);
    Assert.IsFalse(bool.Parse(paramValuesToTextFileMethod[2].ToString()));
    Assert.IsFalse(bool.Parse(paramValuesToTextFileMethod[3].ToString()));

    sqlContext = new SqlContext(new SparkContext("", ""));
    dataFrame = sqlContext.TextFile(@"c:\path\to\input.txt", "|", true, true);
    paramValuesToTextFileMethod = (dataFrame.DataFrameProxy as MockDataFrameProxy).mockDataFrameReference;
    Assert.AreEqual(@"c:\path\to\input.txt", paramValuesToTextFileMethod[0]);
    Assert.AreEqual(@"|", paramValuesToTextFileMethod[1]);
    Assert.IsTrue(bool.Parse(paramValuesToTextFileMethod[2].ToString()));
    Assert.IsTrue(bool.Parse(paramValuesToTextFileMethod[3].ToString()));

    // Test with a given schema
    sqlContext = new SqlContext(new SparkContext("", ""));
    var structTypeProxy = new Mock<IStructTypeProxy>();
    structTypeProxy.Setup(m => m.ToJson()).Returns(RowHelper.BasicJsonSchema);
    var structType = new StructType(structTypeProxy.Object);

    dataFrame = sqlContext.TextFile(@"c:\path\to\input.txt", structType);
    paramValuesToTextFileMethod = (dataFrame.DataFrameProxy as MockDataFrameProxy).mockDataFrameReference;
    Assert.AreEqual(@"c:\path\to\input.txt", paramValuesToTextFileMethod[0]);
    Assert.AreEqual(structType, paramValuesToTextFileMethod[1]);
    Assert.AreEqual(@",", paramValuesToTextFileMethod[2]);
}
/// <summary>
/// Loads a text file with the specified column delimiter using the given schema
/// </summary>
/// <param name="path">path to text file</param>
/// <param name="schema">schema to use</param>
/// <param name="delimiter">delimiter to use</param>
/// <returns>DataFrame created from the text file</returns>
public DataFrame TextFile(string path, StructType schema, string delimiter = ",")
{
    logger.LogInfo("Path of the text file {0}", path);
    return new DataFrame(sqlContextProxy.TextFile(path, schema, delimiter), sparkContext);
}
public IDataFrameProxy TextFile(string path, StructType schema, string delimiter)
{
    return new MockDataFrameProxy(new object[] { path, schema, delimiter }, this);
}
public static void Initialize()
{
    basicSchema = new StructType(JObject.Parse(basicJsonSchema));
    complexSchema = new StructType(JObject.Parse(complexJsonSchema));
}