        /// <summary>
        /// Distribute a local collection to form an RDD.
        ///
        /// sc.Parallelize(new int[] {0, 2, 3, 4, 6}, 5).Glom().Collect()
        /// [[0], [2], [3], [4], [6]]
        /// </summary>
        /// <typeparam name="T">The type of the elements in the collection.</typeparam>
        /// <param name="serializableObjects">The local collection to distribute; every element must be serializable.</param>
        /// <param name="numSlices">The number of partitions to split the collection into; values below 1 are treated as 1.</param>
        /// <returns>An RDD representing the distributed collection.</returns>
        public RDD<T> Parallelize<T>(IEnumerable<T> serializableObjects, int numSlices = 1)
        {
            // Serialize each element with BinaryFormatter into its own byte[] payload.
            List<byte[]> collectionOfByteRepresentationOfObjects = new List<byte[]>();
            foreach (T obj in serializableObjects)
            {
                var memoryStream = new MemoryStream();
                var formatter = new BinaryFormatter();
                formatter.Serialize(memoryStream, obj);
                collectionOfByteRepresentationOfObjects.Add(memoryStream.ToArray());
            }

            if (numSlices < 1)
                numSlices = 1;

            return new RDD<T>(SparkContextProxy.Parallelize(collectionOfByteRepresentationOfObjects, numSlices), this);
        }
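
A short usage sketch built from the doc-comment example above; it assumes `sc` is an already-initialized SparkContext, the helper name is only illustrative, and Glom() and Collect() are the calls shown in that example rather than defined in this listing.

        private static void ParallelizeUsageSketch(SparkContext sc)
        {
            // Distribute five elements across five partitions (one element per slice).
            var rdd = sc.Parallelize(new int[] { 0, 2, 3, 4, 6 }, 5);

            // Glom() groups each partition's elements into an array, so collecting
            // yields one array per partition: [[0], [2], [3], [4], [6]]
            var partitioned = rdd.Glom().Collect();

            // A non-positive slice count is clamped to 1, producing a single partition.
            var fallback = sc.Parallelize(new[] { 1, 2, 3 }, 0);
        }
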
Example #2
        /// <summary>
        /// Distribute a local collection to form an RDD.
        ///
        /// sc.Parallelize(new int[] {0, 2, 3, 4, 6}, 5).Glom().Collect()
        /// [[0], [2], [3], [4], [6]]
        /// </summary>
        /// <typeparam name="T">The type of the elements in the collection.</typeparam>
        /// <param name="serializableObjects">The local collection to distribute; every element must be serializable.</param>
        /// <param name="numSlices">The number of partitions to split the collection into; values below 1 are treated as 1.</param>
        /// <returns>An RDD representing the distributed collection.</returns>
        public RDD<T> Parallelize<T>(IEnumerable<T> serializableObjects, int numSlices = 1)
        {
            // Serialize each element with BinaryFormatter into its own byte[] payload.
            List<byte[]> collectionOfByteRepresentationOfObjects = new List<byte[]>();

            foreach (T obj in serializableObjects)
            {
                var memoryStream = new MemoryStream();
                var formatter = new BinaryFormatter();
                formatter.Serialize(memoryStream, obj);
                collectionOfByteRepresentationOfObjects.Add(memoryStream.ToArray());
            }

            if (numSlices < 1)
            {
                numSlices = 1;
            }

            logger.LogInfo("Parallelizing {0} items to form RDD in the cluster with {1} partitions", collectionOfByteRepresentationOfObjects.Count, numSlices);
            return new RDD<T>(SparkContextProxy.Parallelize(collectionOfByteRepresentationOfObjects, numSlices), this);
        }
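
As a side note on the implementation: each element is serialized with a plain BinaryFormatter pass before being handed to SparkContextProxy.Parallelize. The standalone sketch below mirrors that driver-side loop in isolation (the helper name is illustrative, and the symmetric worker-side deserialization is assumed, not shown in this listing).

        // Mirrors the serialization loop inside Parallelize: each element becomes its own
        // byte[] payload. Requires System.IO, System.Collections.Generic and
        // System.Runtime.Serialization.Formatters.Binary.
        private static List<byte[]> SerializeElements<T>(IEnumerable<T> items)
        {
            var payloads = new List<byte[]>();
            foreach (T item in items)
            {
                using (var memoryStream = new MemoryStream())
                {
                    new BinaryFormatter().Serialize(memoryStream, item);
                    payloads.Add(memoryStream.ToArray());
                }
            }
            return payloads;
        }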