Example #1
0
        /// <summary>
        /// Creates a <see cref="DataFrame"/> from a RDD containing array of object using the given schema.
        /// </summary>
        /// <param name="rdd">RDD containing array of object. The array acts as a row and items within the array act as columns which the schema is specified in <paramref name="schema"/>. </param>
        /// <param name="schema">The schema of DataFrame.</param>
        /// <returns></returns>
        public DataFrame CreateDataFrame(RDD <object[]> rdd, StructType schema)
        {
            // Note: This is for pickling RDD, convert to RDD<byte[]> which happens in CSharpWorker.
            // The below sqlContextProxy.CreateDataFrame() will call byteArrayRDDToAnyArrayRDD() of SQLUtils.scala which only accept RDD of type RDD[Array[Byte]].
            // In byteArrayRDDToAnyArrayRDD() of SQLUtils.scala, the SerDeUtil.pythonToJava() will be called which is a mapPartitions inside.
            // It will be executed until the CSharpWorker finishes Pickling to RDD[Array[Byte]].
            var rddRow = rdd.MapPartitions(r => r.Select(rr => rr));

            rddRow.serializedMode = SerializedMode.Row;

            return(new DataFrame(sparkSessionProxy.CreateDataFrame(rddRow.RddProxy, schema.StructTypeProxy), SparkContext));
        }