/// <summary>
/// Creates a <see cref="DataFrame"/> from a RDD containing array of object using the given schema.
/// </summary>
/// <param name="rdd">RDD containing array of object. The array acts as a row and items within the array act as columns which the schema is specified in <paramref name="schema"/>.</param>
/// <param name="schema">The schema of DataFrame.</param>
/// <returns>A <see cref="DataFrame"/> built from <paramref name="rdd"/> with the columns described by <paramref name="schema"/>.</returns>
public DataFrame CreateDataFrame(RDD<object[]> rdd, StructType schema)
{
    // Why the no-op MapPartitions: sparkSessionProxy.CreateDataFrame() routes into
    // byteArrayRDDToAnyArrayRDD() in SQLUtils.scala, which only accepts RDD[Array[Byte]].
    // Wrapping the input in an identity MapPartitions sends it through CSharpWorker,
    // where pickling produces the required RDD<byte[]>. SerDeUtil.pythonToJava() (called
    // by byteArrayRDDToAnyArrayRDD) is itself a mapPartitions, so nothing executes until
    // the CSharpWorker has finished pickling.
    var pickledRdd = rdd.MapPartitions(partition => partition.Select(row => row));
    pickledRdd.serializedMode = SerializedMode.Row;

    return new DataFrame(
        sparkSessionProxy.CreateDataFrame(pickledRdd.RddProxy, schema.StructTypeProxy),
        SparkContext);
}