/// <summary>
/// Forwards the given schema to the JVM-side reader by invoking its "schema" method.
/// Nothing is sent when the schema's proxy is not a <c>StructTypeIpcProxy</c>.
/// </summary>
/// <param name="schema">Schema whose proxy supplies the JVM struct type reference.</param>
public void Schema(StructType schema)
 {
     var ipcSchemaProxy = schema.StructTypeProxy as StructTypeIpcProxy;
     if (ipcSchemaProxy == null)
     {
         // No JVM reference is available for this proxy type, so there is nothing to forward.
         return;
     }

     var arguments = new object[] { ipcSchemaProxy.JvmStructTypeReference };
     SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmDataFrameReaderReference, "schema", arguments);
 }
Beispiel #2
0
        /// <summary>
        /// Checks that <c>Dataset.DTypes()</c> yields one (name, type) pair taken from the
        /// mocked field proxy ("testcol") and data type proxy ("ss").
        /// </summary>
        public void TestDTypes()
        {
            // JSON that the mocked struct type proxy reports from ToJson().
            var schemaJson = new StructType(new List<StructField>
            {
                new StructField("test", new StringType(), false),
            }).JsonValue.ToString();

            // Build the mock chain from the leaf (data type) up to the data frame.
            var dataTypeProxyMock = new Mock<IStructDataTypeProxy>();
            dataTypeProxyMock.Setup(m => m.GetDataTypeSimpleString()).Returns("ss");

            var fieldProxyMock = new Mock<IStructFieldProxy>();
            fieldProxyMock.Setup(m => m.GetStructFieldName()).Returns("testcol");
            fieldProxyMock.Setup(m => m.GetStructFieldDataType()).Returns(dataTypeProxyMock.Object);

            var structTypeProxyMock = new Mock<IStructTypeProxy>();
            structTypeProxyMock.Setup(m => m.ToJson()).Returns(schemaJson);
            structTypeProxyMock.Setup(m => m.GetStructTypeFields())
                .Returns(new List<IStructFieldProxy> { fieldProxyMock.Object });

            var dataFrameProxyMock = new Mock<IDataFrameProxy>();
            dataFrameProxyMock.Setup(m => m.GetSchema()).Returns(structTypeProxyMock.Object);
            mockDatasetProxy.Setup(m => m.ToDF()).Returns(dataFrameProxyMock.Object);

            var dTypes = new Dataset(mockDatasetProxy.Object).DTypes();

            Assert.AreEqual(1, dTypes.Count());
            var onlyColumn = dTypes.First();
            Assert.AreEqual("testcol", onlyColumn.Item1);
            Assert.AreEqual("ss", onlyColumn.Item2);
        }
 /// <summary>
 /// Loads a data frame by calling the static JVM method <c>SQLUtils.loadDF</c> with the
 /// SQL context reference, the path and the JVM reference of the given schema.
 /// </summary>
 /// <param name="path">Path passed through to <c>loadDF</c>.</param>
 /// <param name="schema">Schema whose proxy must be a <c>StructTypeIpcProxy</c>.</param>
 /// <param name="options">Currently unused (see TODO below).</param>
 /// <returns>An IPC proxy wrapping the JVM object reference returned by the call.</returns>
 /// <exception cref="InvalidCastException">
 /// The schema's proxy is not a <c>StructTypeIpcProxy</c>.
 /// </exception>
 public IDataFrameProxy ReadDataFrame(string path, StructType schema, Dictionary<string, string> options)
 {
     //TODO parameter Dictionary<string, string> options is not used right now - it is meant to be passed on to data sources

     // Direct cast instead of an unchecked "as": a proxy of the wrong type now fails with a
     // descriptive InvalidCastException rather than an opaque NullReferenceException.
     var structTypeIpcProxy = (StructTypeIpcProxy)schema.StructTypeProxy;

     var dataFrameId = SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod(
         "org.apache.spark.sql.api.csharp.SQLUtils", "loadDF",
         new object[] { jvmSqlContextReference, path, structTypeIpcProxy.JvmStructTypeReference }).ToString();

     return new DataFrameIpcProxy(new JvmObjectReference(dataFrameId), this);
 }
 /// <summary>
 /// Loads a text file as a data frame by calling the static JVM method
 /// <c>SQLUtils.loadTextFile</c> with the SQL context reference, path, delimiter and the
 /// JVM reference of the given schema.
 /// </summary>
 /// <param name="path">Path passed through to <c>loadTextFile</c>.</param>
 /// <param name="schema">Schema whose proxy must be a <c>StructTypeIpcProxy</c>.</param>
 /// <param name="delimiter">Delimiter passed through to <c>loadTextFile</c>.</param>
 /// <returns>An IPC proxy wrapping the JVM object reference returned by the call.</returns>
 /// <exception cref="InvalidCastException">
 /// The schema's proxy is not a <c>StructTypeIpcProxy</c>.
 /// </exception>
 public IDataFrameProxy TextFile(string path, StructType schema, string delimiter)
 {
     // Direct cast instead of an unchecked "as": a proxy of the wrong type now fails with a
     // descriptive InvalidCastException rather than an opaque NullReferenceException.
     var structTypeIpcProxy = (StructTypeIpcProxy)schema.StructTypeProxy;

     var dataFrameId = SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod(
         "org.apache.spark.sql.api.csharp.SQLUtils", "loadTextFile",
         new object[] { jvmSqlContextReference, path, delimiter, structTypeIpcProxy.JvmStructTypeReference }).ToString();

     return new DataFrameIpcProxy(new JvmObjectReference(dataFrameId), this);
 }
Beispiel #5
0
        /// <summary>
        /// Sample: builds a data frame from an in-memory RDD of object arrays and an explicit
        /// schema (including a nested struct and an array column), prints its schema and rows,
        /// and optionally validates the row count and the schema JSON.
        /// </summary>
        internal static void DFCreateDataFrameSample()
        {
            var peopleSchema = new StructType(new List<StructField>
            {
                new StructField("id", new StringType()),
                new StructField("name", new StringType()),
                new StructField("age", new IntegerType()),
                new StructField("address", new StructType(new List<StructField>
                {
                    new StructField("city", new StringType()),
                    new StructField("state", new StringType())
                })),
                new StructField("phone numbers", new ArrayType(new StringType()))
            });

            // Each object[] is one row; the nested object[] supplies the "address" struct values.
            var peopleRows = new List<object[]>
            {
                new object[] { "123", "Bill", 43, new object[]{ "Columbus", "Ohio" }, new string[]{ "Tel1", "Tel2" } },
                new object[] { "456", "Steve", 34,  new object[]{ "Seattle", "Washington" }, new string[]{ "Tel3", "Tel4" } }
            };
            var peopleRdd = SparkCLRSamples.SparkContext.Parallelize(peopleRows);

            var peopleDataFrame = GetSqlContext().CreateDataFrame(peopleRdd, peopleSchema);
            Console.WriteLine("------ Schema of People Data Frame:\r\n");
            peopleDataFrame.ShowSchema();
            Console.WriteLine();

            var rows = peopleDataFrame.Collect().ToArray();
            foreach (var person in rows)
            {
                // Explicit local types are kept on purpose: they determine how each Get() result is converted.
                string id = person.Get("id");
                string name = person.Get("name");
                int age = person.Get("age");
                Row address = person.Get("address");
                string city = address.Get("city");
                string state = address.Get("state");
                object[] phoneNumbers = person.Get("phone numbers");
                Console.WriteLine("id:{0}, name:{1}, age:{2}, address:(city:{3},state:{4}), phoneNumbers:[{5},{6}]\r\n", id, name, age, city, state, phoneNumbers[0], phoneNumbers[1]);
            }

            if (!SparkCLRSamples.Configuration.IsValidationEnabled)
            {
                return;
            }

            Assert.AreEqual(2, peopleDataFrame.Rdd.Count());
            Assert.AreEqual(peopleSchema.Json, peopleDataFrame.Schema.Json);
        }
Beispiel #6
0
 /// <summary>
 /// Specifies the input schema. Some data sources (e.g. JSON) can infer the input schema
 /// automatically from data. By specifying the schema here, the underlying data source can
 /// skip the schema inference step, and thus speed up data loading.
 /// </summary>
 /// <param name="schema">Schema handed to the underlying reader proxy.</param>
 /// <returns>This reader instance, so that further calls can be chained.</returns>
 public DataFrameReader Schema(StructType schema)
 {
     dataFrameReaderProxy.Schema(schema);
     return this;
 }
Beispiel #7
0
        /// <summary>
        /// Checks that <c>DataFrameReader.Schema</c> returns the same reader instance and
        /// passes the schema to the reader proxy exactly once.
        /// </summary>
        public void TestSchema()
        {
            // arrange
            const string jsonSchema = @"
                {
                  ""type"" : ""struct"",
                  ""fields"" : [ {
                    ""name"" : ""address"",
                    ""type"" : {
                      ""type"" : ""struct"",
                      ""fields"" : [ {
                        ""name"" : ""city"",
                        ""type"" : ""string"",
                        ""nullable"" : true,
                        ""metadata"" : { }
                      }, {
                        ""name"" : ""state"",
                        ""type"" : ""string"",
                        ""nullable"" : true,
                        ""metadata"" : { }
                      } ]
                    },
                    ""nullable"" : true,
                    ""metadata"" : { }
                  }, {
                    ""name"" : ""age"",
                    ""type"" : ""long"",
                    ""nullable"" : true,
                    ""metadata"" : { }
                  }, {
                    ""name"" : ""id"",
                    ""type"" : ""string"",
                    ""nullable"" : true,
                    ""metadata"" : { }
                  }, {
                    ""name"" : ""name"",
                    ""type"" : ""string"",
                    ""nullable"" : true,
                    ""metadata"" : { }
                  } ]
                }";
            mockDataFrameReaderProxy.Setup(m => m.Schema(It.IsAny<StructType>()));
            var dataFrameReader = new DataFrameReader(mockDataFrameReaderProxy.Object, sparkContext);
            var schema = new StructType(new MockStructTypeProxy(jsonSchema));

            // act
            var returnedReader = dataFrameReader.Schema(schema);

            // verify: same instance is returned and the proxy saw exactly this schema once
            Assert.IsNotNull(returnedReader);
            Assert.AreSame(returnedReader, dataFrameReader);
            mockDataFrameReaderProxy.Verify(m => m.Schema(schema), Times.Once);
        }
Beispiel #8
0
        /// <summary>
        /// Verify the schema of people dataframe: a struct with exactly four nullable
        /// columns - "name" (string), "id" (string), "age" (long) and "address" (a struct
        /// containing "state" and "city" fields).
        /// </summary>
        /// <param name="schema"> RowSchema of people DataFrame </param>
        internal static void VerifySchemaOfPeopleDataFrame(StructType schema)
        {
            Assert.IsNotNull(schema);
            Assert.AreEqual("struct", schema.TypeName);
            Assert.IsNotNull(schema.Fields);
            Assert.AreEqual(4, schema.Fields.Count);

            // name
            var nameColSchema = schema.Fields.Find(c => c.Name.Equals("name"));
            Assert.IsNotNull(nameColSchema);
            Assert.AreEqual("name", nameColSchema.Name);
            Assert.IsTrue(nameColSchema.IsNullable);
            Assert.AreEqual("string", nameColSchema.DataType.TypeName);

            // id
            var idColSchema = schema.Fields.Find(c => c.Name.Equals("id"));
            Assert.IsNotNull(idColSchema);
            Assert.AreEqual("id", idColSchema.Name);
            Assert.IsTrue(idColSchema.IsNullable);
            // Check the id column's own type (previously this re-checked the name column by mistake).
            Assert.AreEqual("string", idColSchema.DataType.TypeName);

            // age
            var ageColSchema = schema.Fields.Find(c => c.Name.Equals("age"));
            Assert.IsNotNull(ageColSchema);
            Assert.AreEqual("age", ageColSchema.Name);
            Assert.IsTrue(ageColSchema.IsNullable);
            Assert.AreEqual("long", ageColSchema.DataType.TypeName);

            // address
            var addressColSchema = schema.Fields.Find(c => c.Name.Equals("address"));
            Assert.IsNotNull(addressColSchema);
            Assert.AreEqual("address", addressColSchema.Name);
            Assert.IsTrue(addressColSchema.IsNullable);
            Assert.IsNotNull(addressColSchema.DataType);
            Assert.AreEqual("struct", addressColSchema.DataType.TypeName);
            Assert.IsNotNull(((StructType)addressColSchema.DataType).Fields.Find(c => c.Name.Equals("state")));
            Assert.IsNotNull(((StructType)addressColSchema.DataType).Fields.Find(c => c.Name.Equals("city")));
        }
Beispiel #9
0
        /// <summary>
        /// Sample: loads the requests log as a data frame with an explicit four-column string
        /// schema, registers it as the temp table "requests", filters it by guid through SQL,
        /// prints both frames, and optionally validates the two row counts.
        /// </summary>
        internal static void DFTextFileLoadDataFrameSample()
        {
            var requestsSchema = new StructType(new List<StructField>
            {
                new StructField("guid", new StringType(), false),
                new StructField("datacenter", new StringType(), false),
                new StructField("abtestid", new StringType(), false),
                new StructField("traffictype", new StringType(), false),
            });

            var requestsDataFrame = GetSqlContext().TextFile(SparkCLRSamples.Configuration.GetInputDataPath(RequestsLog), requestsSchema);
            requestsDataFrame.RegisterTempTable("requests");

            var filteredByGuid = GetSqlContext().Sql("SELECT guid, datacenter FROM requests where guid = '4628deca-139d-4121-b540-8341b9c05c2a'");
            filteredByGuid.Show();

            // Full data set: schema, contents, row count.
            requestsDataFrame.ShowSchema();
            requestsDataFrame.Show();
            var count = requestsDataFrame.Count();

            // Filtered data set: schema, contents, row count.
            filteredByGuid.ShowSchema();
            filteredByGuid.Show();
            var filteredCount = filteredByGuid.Count();

            if (!SparkCLRSamples.Configuration.IsValidationEnabled)
            {
                return;
            }

            Assert.AreEqual(10, count);
            Assert.AreEqual(1, filteredCount);
        }
Beispiel #10
0
 /// <summary>
 /// Parses the basic and complex JSON schema definitions once and stores them as struct
 /// types. A parse result that is not a <c>StructType</c> is stored as null.
 /// </summary>
 static RowHelper()
 {
     var parsedBasic = DataType.ParseDataTypeFromJson(BasicJsonSchema);
     BasicSchema = parsedBasic as StructType;

     var parsedComplex = DataType.ParseDataTypeFromJson(ComplexJsonSchema);
     ComplexSchema = parsedComplex as StructType;
 }
Beispiel #11
0
 /// <summary>
 /// Reads the JSON source at the given path through a reader configured with the given schema.
 /// </summary>
 /// <param name="path">Path handed to the reader's <c>Json</c> call.</param>
 /// <param name="schema">Schema applied to the reader before loading.</param>
 /// <returns>The data frame produced by the reader.</returns>
 public DataFrame JsonFile(string path, StructType schema)
 {
     var schemaBoundReader = Read().Schema(schema);
     return schemaBoundReader.Json(path);
 }
Beispiel #12
0
 /// <summary>
 /// Loads a data frame from the source path using the given schema and options.
 /// </summary>
 /// <param name="path">Source path forwarded to the SQL context proxy.</param>
 /// <param name="schema">Schema forwarded to the SQL context proxy.</param>
 /// <param name="options">Options forwarded to the SQL context proxy.</param>
 /// <returns>A data frame wrapping the proxy returned by the SQL context proxy.</returns>
 public DataFrame ReadDataFrame(string path, StructType schema, Dictionary<string, string> options)
 {
     var dataFrameProxy = sqlContextProxy.ReadDataFrame(path, schema, options);
     return new DataFrame(dataFrameProxy, sparkContext);
 }
Beispiel #13
0
        /// <summary>
        /// Builds a <see cref="DataFrame"/> from an RDD of object arrays and an explicit schema.
        /// </summary>
        /// <param name="rdd">RDD in which every array is one row and every array element is one column value, as described by <paramref name="schema"/>.</param>
        /// <param name="schema">Schema of the resulting DataFrame.</param>
        /// <returns>A DataFrame wrapping the proxy created by the Spark session proxy.</returns>
        public DataFrame CreateDataFrame(RDD<object[]> rdd, StructType schema)
        {
            // Author's note, retained: the identity Map exists so the RDD gets pickled, i.e. converted
            // to RDD<byte[]> in CSharpWorker. The CreateDataFrame() call below reaches
            // byteArrayRDDToAnyArrayRDD() in SQLUtils.scala, which only accepts RDD[Array[Byte]] and
            // internally runs SerDeUtil.pythonToJava() as a mapPartitions; that is executed once
            // CSharpWorker has finished pickling to RDD[Array[Byte]].
            var rowRdd = rdd.Map(row => row);
            rowRdd.serializedMode = SerializedMode.Row;

            var dataFrameProxy = sparkSessionProxy.CreateDataFrame(rowRdd.RddProxy, schema.StructTypeProxy);
            return new DataFrame(dataFrameProxy, sparkContext);
        }
 /// <summary>
 /// Not implemented in this proxy; every call throws.
 /// </summary>
 /// <param name="path">Unused.</param>
 /// <param name="schema">Unused.</param>
 /// <param name="options">Unused.</param>
 /// <returns>Never returns.</returns>
 /// <exception cref="NotImplementedException">Always thrown.</exception>
 public IDataFrameProxy ReadDataFrame(string path, StructType schema, System.Collections.Generic.Dictionary<string, string> options)
 {
     throw new NotImplementedException();
 }
Beispiel #15
0
        /// <summary>
        /// Checks that printing a dataset's schema, through either <c>PrintSchema</c> or
        /// <c>ShowSchema</c>, fetches the schema from the data frame proxy once and reads its
        /// JSON once.
        /// </summary>
        /// <param name="usePrintSchema">True to call <c>PrintSchema</c>; false to call <c>ShowSchema</c>.</param>
        public void TestSchema(bool usePrintSchema)
        {
            // JSON that the mocked struct type proxy reports from ToJson().
            var schemaJson = new StructType(new List<StructField>
            {
                new StructField("test", new StringType(), false),
            }).JsonValue.ToString();

            var structTypeProxyMock = new Mock<IStructTypeProxy>();
            structTypeProxyMock.Setup(m => m.ToJson()).Returns(schemaJson);

            var dataFrameProxyMock = new Mock<IDataFrameProxy>();
            dataFrameProxyMock.Setup(m => m.GetSchema()).Returns(structTypeProxyMock.Object);
            mockDatasetProxy.Setup(m => m.ToDF()).Returns(dataFrameProxyMock.Object);

            var dataset = new Dataset(mockDatasetProxy.Object);

            if (usePrintSchema)
            {
                dataset.PrintSchema();
            }
            else
            {
                dataset.ShowSchema();
            }

            dataFrameProxyMock.Verify(m => m.GetSchema(), Times.Once);
            structTypeProxyMock.Verify(m => m.ToJson(), Times.Once());
        }
Beispiel #16
0
 /// <summary>
 /// Loads a delimited text file as a data frame using the given schema.
 /// </summary>
 /// <param name="path">Path to the text file.</param>
 /// <param name="schema">Schema to apply.</param>
 /// <param name="delimiter">Column delimiter; defaults to a comma.</param>
 /// <returns>A data frame wrapping the proxy returned by the SQL context proxy.</returns>
 public DataFrame TextFile(string path, StructType schema, string delimiter = ",")
 {
     var dataFrameProxy = sqlContextProxy.TextFile(path, schema, delimiter);
     return new DataFrame(dataFrameProxy, sparkContext);
 }
Beispiel #17
0
 /// <summary>
 /// Intended to load a JSON file (one object per line) and apply the given schema.
 /// Not implemented here; every call throws.
 /// </summary>
 /// <param name="path">path to JSON file (unused)</param>
 /// <param name="schema">schema to use (unused)</param>
 /// <returns>Never returns.</returns>
 /// <exception cref="NotImplementedException">Always thrown.</exception>
 public DataFrame JsonFile(string path, StructType schema)
 {
     throw new NotImplementedException();
 }
Beispiel #18
0
 /// <summary>
 /// Not implemented here; every call throws.
 /// </summary>
 /// <param name="rdd">Unused.</param>
 /// <param name="schema">Unused.</param>
 /// <returns>Never returns.</returns>
 /// <exception cref="NotImplementedException">Always thrown.</exception>
 public DataFrame CreateDataFrame(RDD <byte[]> rdd, StructType schema)
 {
     throw new NotImplementedException();
 }
Beispiel #19
0
 /// <summary>
 /// Loads a data frame from the source path using the given schema and options,
 /// logging the path first.
 /// </summary>
 /// <param name="path">Source path forwarded to the SQL context proxy.</param>
 /// <param name="schema">Schema forwarded to the SQL context proxy.</param>
 /// <param name="options">Options forwarded to the SQL context proxy.</param>
 /// <returns>A data frame wrapping the proxy returned by the SQL context proxy.</returns>
 public DataFrame ReadDataFrame(string path, StructType schema, Dictionary<string, string> options)
 {
     logger.LogInfo("Reading DataFrame from file {0}", path);

     var dataFrameProxy = sqlContextProxy.ReadDataFrame(path, schema, options);
     return new DataFrame(dataFrameProxy, sparkContext);
 }
Beispiel #20
0
 /// <summary>
 /// Placeholder that is not implemented yet; every call throws.
 /// </summary>
 /// <param name="tableName">Unused.</param>
 /// <param name="source">Unused.</param>
 /// <param name="schema">Unused.</param>
 /// <param name="options">Unused.</param>
 /// <returns>Never returns.</returns>
 /// <exception cref="NotImplementedException">Always thrown.</exception>
 public DataFrame CreateExternalTable(string tableName, string source, StructType schema, Dictionary<string, string> options)
 {
     throw new NotImplementedException(); //TODO - implement
 }
Beispiel #21
0
        /// <summary>
        /// Checks the arguments that <c>SqlContext.TextFile</c> hands to the mock proxy for
        /// three call shapes: path only, path with explicit delimiter and flags, and path
        /// with a schema.
        /// </summary>
        public void TestSqlContextTextFile()
        {
            const string inputPath = @"c:\path\to\input.txt";

            // Path only: expects the "," delimiter and both boolean arguments false.
            var sqlContext = new SqlContext(new SparkContext("", ""));
            var dataFrame = sqlContext.TextFile(inputPath);
            var recordedArgs = (dataFrame.DataFrameProxy as MockDataFrameProxy).mockDataFrameReference;
            Assert.AreEqual(inputPath, recordedArgs[0]);
            Assert.AreEqual(@",", recordedArgs[1]);
            Assert.IsFalse(bool.Parse(recordedArgs[2].ToString()));
            Assert.IsFalse(bool.Parse(recordedArgs[3].ToString()));

            // Explicit delimiter and both boolean arguments true.
            sqlContext = new SqlContext(new SparkContext("", ""));
            dataFrame = sqlContext.TextFile(inputPath, "|", true, true);
            recordedArgs = (dataFrame.DataFrameProxy as MockDataFrameProxy).mockDataFrameReference;
            Assert.AreEqual(inputPath, recordedArgs[0]);
            Assert.AreEqual(@"|", recordedArgs[1]);
            Assert.IsTrue(bool.Parse(recordedArgs[2].ToString()));
            Assert.IsTrue(bool.Parse(recordedArgs[3].ToString()));

            // Test with a given schema
            sqlContext = new SqlContext(new SparkContext("", ""));
            var structTypeProxyMock = new Mock<IStructTypeProxy>();
            structTypeProxyMock.Setup(m => m.ToJson()).Returns(RowHelper.BasicJsonSchema);
            var structType = new StructType(structTypeProxyMock.Object);
            dataFrame = sqlContext.TextFile(inputPath, structType);
            recordedArgs = (dataFrame.DataFrameProxy as MockDataFrameProxy).mockDataFrameReference;
            Assert.AreEqual(inputPath, recordedArgs[0]);
            Assert.AreEqual(structType, recordedArgs[1]);
            Assert.AreEqual(@",", recordedArgs[2]);
        }
Beispiel #22
0
 /// <summary>
 /// Loads a delimited text file as a data frame using the given schema, logging the path first.
 /// </summary>
 /// <param name="path">Path to the text file.</param>
 /// <param name="schema">Schema to apply.</param>
 /// <param name="delimiter">Column delimiter; defaults to a comma.</param>
 /// <returns>A data frame wrapping the proxy returned by the SQL context proxy.</returns>
 public DataFrame TextFile(string path, StructType schema, string delimiter = ",")
 {
     logger.LogInfo("Path of the text file {0}", path);

     var dataFrameProxy = sqlContextProxy.TextFile(path, schema, delimiter);
     return new DataFrame(dataFrameProxy, sparkContext);
 }
 /// <summary>
 /// Mock implementation: records the received arguments in a <c>MockDataFrameProxy</c>
 /// instead of loading anything.
 /// </summary>
 /// <param name="path">Recorded as the first argument.</param>
 /// <param name="schema">Recorded as the second argument.</param>
 /// <param name="delimiter">Recorded as the third argument.</param>
 /// <returns>A mock data frame proxy holding the recorded arguments.</returns>
 public IDataFrameProxy TextFile(string path, StructType schema, string delimiter)
 {
     var recordedArguments = new object[] { path, schema, delimiter };
     return new MockDataFrameProxy(recordedArguments, this);
 }
Beispiel #24
0
 /// <summary>
 /// Parses the basic and complex JSON schema strings and stores the resulting struct types.
 /// </summary>
 public static void Initialize()
 {
     var basicJson = JObject.Parse(basicJsonSchema);
     basicSchema = new StructType(basicJson);

     var complexJson = JObject.Parse(complexJsonSchema);
     complexSchema = new StructType(complexJson);
 }