/// <summary>
/// Distribute a local collection to form an RDD.
/// </summary>
/// <remarks>
/// Every element is serialized on its own with <see cref="BinaryFormatter"/>; the resulting
/// byte arrays are passed to <c>SparkContextProxy.Parallelize</c> together with the slice count.
/// </remarks>
/// <example>
/// sc.Parallelize(new int[] {0, 2, 3, 4, 6}, 5).Glom().Collect()
/// [[0], [2], [3], [4], [6]]
/// </example>
/// <typeparam name="T">Element type of the collection; instances must be serializable by <see cref="BinaryFormatter"/>.</typeparam>
/// <param name="serializableObjects">The local elements to distribute. Must not be null.</param>
/// <param name="numSlices">Requested number of slices; any value below 1 is replaced by 1.</param>
/// <returns>A new <c>RDD&lt;T&gt;</c> wrapping the proxy returned by <c>SparkContextProxy.Parallelize</c>.</returns>
/// <exception cref="System.ArgumentNullException"><paramref name="serializableObjects"/> is null.</exception>
/// <exception cref="System.Runtime.Serialization.SerializationException">An element cannot be serialized by <see cref="BinaryFormatter"/>.</exception>
public RDD<T> Parallelize<T>(IEnumerable<T> serializableObjects, int numSlices = 1)
{
    // Fail with a descriptive argument error instead of a NullReferenceException from the foreach below.
    if (serializableObjects == null)
    {
        throw new System.ArgumentNullException("serializableObjects");
    }

    var serializedItems = new List<byte[]>();

    // The formatter holds no per-call state we rely on, so one instance serves every element.
    var formatter = new BinaryFormatter();
    foreach (T item in serializableObjects)
    {
        // A fresh stream per element keeps each byte[] limited to exactly one object graph.
        using (var buffer = new MemoryStream())
        {
            formatter.Serialize(buffer, item);
            serializedItems.Add(buffer.ToArray());
        }
    }

    // Clamp non-positive slice counts to a single slice.
    if (numSlices < 1)
    {
        numSlices = 1;
    }

    return new RDD<T>(SparkContextProxy.Parallelize(serializedItems, numSlices), this);
}
/// <summary>
/// Distribute a local collection to form an RDD.
/// </summary>
/// <remarks>
/// Every element is serialized on its own with <see cref="BinaryFormatter"/>. The number of
/// serialized items and the effective slice count are logged, then the byte arrays are passed
/// to <c>SparkContextProxy.Parallelize</c>.
/// </remarks>
/// <example>
/// sc.Parallelize(new int[] {0, 2, 3, 4, 6}, 5).Glom().Collect()
/// [[0], [2], [3], [4], [6]]
/// </example>
/// <typeparam name="T">Element type of the collection; instances must be serializable by <see cref="BinaryFormatter"/>.</typeparam>
/// <param name="serializableObjects">The local elements to distribute. Must not be null.</param>
/// <param name="numSlices">Requested number of slices; any value below 1 is replaced by 1.</param>
/// <returns>A new <c>RDD&lt;T&gt;</c> wrapping the proxy returned by <c>SparkContextProxy.Parallelize</c>.</returns>
/// <exception cref="System.ArgumentNullException"><paramref name="serializableObjects"/> is null.</exception>
/// <exception cref="System.Runtime.Serialization.SerializationException">An element cannot be serialized by <see cref="BinaryFormatter"/>.</exception>
public RDD<T> Parallelize<T>(IEnumerable<T> serializableObjects, int numSlices = 1)
{
    // Fail with a descriptive argument error instead of a NullReferenceException from the foreach below.
    if (serializableObjects == null)
    {
        throw new System.ArgumentNullException("serializableObjects");
    }

    var serializedItems = new List<byte[]>();

    // The formatter holds no per-call state we rely on, so one instance serves every element.
    var formatter = new BinaryFormatter();
    foreach (T item in serializableObjects)
    {
        // A fresh stream per element keeps each byte[] limited to exactly one object graph.
        using (var buffer = new MemoryStream())
        {
            formatter.Serialize(buffer, item);
            serializedItems.Add(buffer.ToArray());
        }
    }

    // Clamp non-positive slice counts to a single slice before logging so the log shows the effective value.
    if (numSlices < 1)
    {
        numSlices = 1;
    }

    logger.LogInfo("Parallelizing {0} items to form RDD in the cluster with {1} partitions", serializedItems.Count, numSlices);
    return new RDD<T>(SparkContextProxy.Parallelize(serializedItems, numSlices), this);
}