Example #1
        /// <summary>
        /// Randomly splits this DataFrame with the provided weights.
        /// Reference: randomSplit(self, weights, seed=None) in https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py
        /// </summary>
        /// <param name="weights">list of weights with which to split the DataFrame. Weights will be normalized if they don't sum up to 1.0</param>
        /// <param name="seed">The seed for sampling</param>
        /// <returns>An enumerable of DataFrames split according to the provided weights</returns>
        public IEnumerable<DataFrame> RandomSplit(IEnumerable<double> weights, int? seed = null)
        {
            foreach (var weight in weights)
            {
                if (weight < 0.0)
                {
                    throw new ArgumentException(string.Format("Weights must be positive. Found weight value: {0}", weight));
                }
            }

            if (seed == null)
            {
                seed = new Random().Next();
            }

            return dataFrameProxy.RandomSplit(weights, seed.Value).Select(dfProxy => new DataFrame(dfProxy, sparkContext));
        }
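
A minimal usage sketch, assuming a DataFrame named df has already been created elsewhere (for example from an SqlContext); the 0.75/0.25 weights and the seed value 42 are illustrative only:

// Split df into a training and a test set; weights are normalized if they don't sum to 1.0
var splits = df.RandomSplit(new[] { 0.75, 0.25 }, 42).ToArray();
DataFrame trainingData = splits[0];
DataFrame testData = splits[1];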