예제 #1
0
 /// <summary>
 /// Builds a new DataFrame from which duplicate rows have been dropped.
 /// When <paramref name="subset"/> names at least one column, those names are forwarded to the
 /// underlying proxy's DropDuplicates; when it is null or empty, the proxy's parameterless
 /// DropDuplicates is called instead.
 /// Modeled on dropDuplicates(self, subset=None) in PySpark:
 /// https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py
 /// </summary>
 /// <param name="subset">
 /// Names of the columns on which duplicated rows are dropped. Optional; null (the default)
 /// and an empty array are treated the same way.
 /// </param>
 /// <returns>A new DataFrame with duplicate rows removed.</returns>
 public DataFrame DropDuplicates(string[] subset = null)
 {
     bool hasColumns = subset != null && subset.Length > 0;

     if (hasColumns)
     {
         // Explicit column list supplied: hand it to the proxy unchanged.
         return new DataFrame(dataFrameProxy.DropDuplicates(subset), sparkContext);
     }

     // No columns named: fall back to the proxy overload that takes no arguments.
     return new DataFrame(dataFrameProxy.DropDuplicates(), sparkContext);
 }