/// <summary>
/// Returns a new DataFrame omitting rows with null values.
/// Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, dropna(self, how='any', thresh=None, subset=None)
/// </summary>
/// <param name="how">'any' or 'all'.
/// If 'any', drop a row if it contains any nulls.
/// If 'all', drop a row only if all its values are null.</param>
/// <param name="thresh">thresh: int, default null.
/// If specified, drop rows that have less than `thresh` non-null values.
/// This overwrites the `how` parameter.</param>
/// <param name="subset">optional list of column names to consider.
/// A null or empty array means all columns are considered.</param>
/// <returns>A new DataFrame omitting rows with null values</returns>
/// <exception cref="ArgumentException">Thrown when <paramref name="how"/> is neither 'any' nor 'all'.</exception>
public DataFrame DropNa(string how = "any", int? thresh = null, string[] subset = null)
{
    if (how != "any" && how != "all")
    {
        throw new ArgumentException(string.Format(@"how ({0}) should be 'any' or 'all'.", how));
    }

    // Treat a null or empty subset the same way: consider every column in the schema.
    // (The original code computed the full column list for an empty subset but then
    // still used the empty subset for both the thresh default and the proxy call,
    // yielding thresh == 0 and a no-op drop.)
    string[] columnNames = (subset == null || subset.Length == 0)
        ? dataFrameProxy.GetSchema().GetStructTypeFields().Select(f => f.GetStructFieldName().ToString()).ToArray()
        : subset;

    if (thresh == null)
    {
        // 'any': keep a row only if every considered column is non-null.
        // 'all': keep a row if at least one considered column is non-null.
        thresh = how == "any" ? columnNames.Length : 1;
    }

    return new DataFrame(dataFrameProxy.DropNa(thresh, columnNames), sparkContext);
}