/// <summary>
/// Returns a new DataFrame with duplicate rows removed, considering only the given subset of columns.
/// Reference: https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, dropDuplicates(self, subset=None)
/// </summary>
/// <param name="subset">Columns to consider when dropping duplicate rows; when null or empty, all columns are used.</param>
/// <returns>A new DataFrame with duplicate rows removed.</returns>
public DataFrame DropDuplicates(string[] subset = null)
{
    return (subset == null || subset.Length == 0) ?
        new DataFrame(dataFrameProxy.DropDuplicates(), sparkContext) :
        new DataFrame(dataFrameProxy.DropDuplicates(subset), sparkContext);
}
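
// Usage sketch (illustrative only): assumes an existing DataFrame named "df",
// obtained from a SqlContext, that has columns "name" and "age". These names
// are hypothetical and not part of the API surface.
//
//     DataFrame distinctRows   = df.DropDuplicates();                    // deduplicate across all columns
//     DataFrame distinctByName = df.DropDuplicates(new[] { "name" });    // deduplicate on "name" only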