예제 #1
0
        /// <summary>
        /// Returns a new DataFrame omitting rows with null values.
        /// Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, dropna(self, how='any', thresh=None, subset=None)
        /// </summary>
        /// <param name="how">'any' or 'all'.
        /// If 'any', drop a row if it contains any nulls.
        /// If 'all', drop a row only if all its values are null.</param>
        /// <param name="thresh">int, default null.
        /// If specified, drop rows that have fewer than `thresh` non-null values.
        /// This overrides the `how` parameter.</param>
        /// <param name="subset">Optional list of column names to consider.
        /// When null or empty, every column in this DataFrame's schema is considered.</param>
        /// <returns>A new DataFrame omitting rows with null values</returns>
        /// <exception cref="ArgumentException">Thrown when <paramref name="how"/> is neither 'any' nor 'all'.</exception>
        public DataFrame DropNa(string how = "any", int? thresh = null, string[] subset = null)
        {
            if (how != "any" && how != "all")
            {
                throw new ArgumentException(string.Format(@"how ({0}) should be 'any' or 'all'.", how));
            }

            // Resolve the effective column list once. A null OR empty subset means "all columns";
            // the same list must drive both the default threshold and the proxy call, otherwise an
            // empty subset would yield thresh == 0 evaluated against no columns at all.
            string[] columns = (subset == null || subset.Length == 0)
                ? dataFrameProxy.GetSchema().GetStructTypeFields().Select(f => f.GetStructFieldName().ToString()).ToArray()
                : subset;

            if (thresh == null)
            {
                // 'any': every considered column must be non-null for the row to be kept.
                // 'all': a single non-null value among the considered columns is enough.
                thresh = how == "any" ? columns.Length : 1;
            }

            return new DataFrame(dataFrameProxy.DropNa(thresh, columns), sparkContext);
        }