/// <summary>
/// Exercises the "any" flavours of Drop on DataFrameNaFunctions and checks
/// which arguments the test expects to reach the NA-functions proxy.
/// </summary>
/// <remarks>
/// Relies on the fixture-level mocks <c>mockDataFrameProxy</c> and
/// <c>mockDataFrameNaFunctionsProxy</c>, which are declared outside this method.
/// </remarks>
public void TestDropWithAny()
{
    // arrange: the mocked schema exposes exactly one field, named "column1"
    const string columnName = "column1";
    var fieldProxyMock = new Mock<IStructFieldProxy>();
    var schemaProxyMock = new Mock<IStructTypeProxy>();

    fieldProxyMock
        .Setup(m => m.GetStructFieldName())
        .Returns(columnName);
    schemaProxyMock
        .Setup(m => m.GetStructTypeFields())
        .Returns(new List<IStructFieldProxy> { fieldProxyMock.Object });
    mockDataFrameProxy
        .Setup(m => m.GetSchema())
        .Returns(schemaProxyMock.Object);

    var context = new SparkContext("", "");
    mockDataFrameNaFunctionsProxy
        .Setup(m => m.Drop(It.IsAny<int>(), It.IsAny<string[]>()))
        .Returns(mockDataFrameProxy.Object);

    var source = new DataFrame(mockDataFrameProxy.Object, context);
    var naFunctions = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, source, context);

    // act
    var cols = new[] { "col1", "col2" };
    var withColumns = naFunctions.Drop("any", cols);
    var withDefaults = naFunctions.Drop();
    var withHowOnly = naFunctions.Drop("any");

    // verify: explicit columns are forwarded together with their count
    Assert.IsNotNull(withColumns);
    Assert.AreEqual(withColumns.DataFrameProxy, source.DataFrameProxy);
    mockDataFrameNaFunctionsProxy.Verify(m => m.Drop(cols.Length, cols), Times.Once());

    // verify: both column-less calls are expected to use the single schema column
    Assert.IsNotNull(withDefaults);
    Assert.AreEqual(withDefaults.DataFrameProxy, source.DataFrameProxy);
    Assert.IsNotNull(withHowOnly);
    Assert.AreEqual(withHowOnly.DataFrameProxy, source.DataFrameProxy);
    mockDataFrameNaFunctionsProxy.Verify(m => m.Drop(1, new[] { columnName }), Times.Exactly(2));
}
/// <summary>
/// Checks that Replace(columns, replacement) hands the same column array and
/// replacement dictionary to the NA-functions proxy exactly once, and that the
/// returned DataFrame is a new wrapper around the proxy's result.
/// </summary>
/// <remarks>
/// Relies on the fixture-level mocks <c>mockDataFrameProxy</c> and
/// <c>mockDataFrameNaFunctionsProxy</c>, which are declared outside this method.
/// </remarks>
public void TestReplaceWithColumns()
{
    // arrange
    var context = new SparkContext("", "");
    mockDataFrameNaFunctionsProxy
        .Setup(m => m.Replace(It.IsAny<string[]>(), It.IsAny<Dictionary<string, string>>()))
        .Returns(mockDataFrameProxy.Object);

    var replacement = new Dictionary<string, string>
    {
        [""] = "unknown",
        ["?"] = "unknown"
    };
    var source = new DataFrame(mockDataFrameProxy.Object, context);
    var naFunctions = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, source, context);
    var cols = new[] { "col1", "col2" };

    // act
    var replaced = naFunctions.Replace(cols, replacement);

    // verify
    Assert.IsNotNull(replaced);
    Assert.AreEqual(replaced.DataFrameProxy, source.DataFrameProxy);
    Assert.AreNotSame(source, replaced);
    mockDataFrameNaFunctionsProxy.Verify(m => m.Replace(cols, replacement), Times.Once());
}
/// <summary>
/// Sample entry point: reads "DataBook.csv" with a fixed schema, shows it,
/// drops rows via <c>Na().Drop("any")</c>, drops a fixed set of columns,
/// shows the result and stops the Spark session.
/// </summary>
/// <param name="args">Command-line arguments; not used by this method.</param>
static void Main(string[] args)
{
    // 1. Create a Spark session
    SparkSession spark = SparkSession
        .Builder()
        .AppName("word_count_sample")
        .GetOrCreate();

    try
    {
        // 2. Create the initial DataFrame from the CSV file using an explicit schema
        DataFrame dataFrame = spark.Read()
            .Schema("Assertid STRING,properties STRING,Value STRING,TimeSatmp TIMESTAMP")
            .Csv("DataBook.csv");
        dataFrame.Show();

        // 3. Drop rows using the "any" mode of the NA functions
        //    (presumably: rows containing at least one null/NaN value - confirm against the Spark docs)
        DataFrameNaFunctions naFunctions = dataFrame.Na();
        DataFrame cleanedProjects = naFunctions.Drop("any");

        // 4. Remove columns.
        // NOTE(review): these four names are every column declared in the schema above,
        // so the DataFrame shown below has no columns left - confirm this is intended.
        cleanedProjects = cleanedProjects.Drop("Assertid", "properties", "Value", "TimeSatmp");
        cleanedProjects.Show();
    }
    finally
    {
        // Stop the session even when reading or showing the data throws.
        spark.Stop();
    }
}
/// <summary>
/// Checks that Drop(minNonNulls) without explicit columns is expected to call
/// the NA-functions proxy once with the same threshold and the schema's column names.
/// </summary>
/// <remarks>
/// Relies on the fixture-level mocks <c>mockDataFrameProxy</c> and
/// <c>mockDataFrameNaFunctionsProxy</c>, which are declared outside this method.
/// </remarks>
public void TestDropWithMinNonNulls()
{
    // arrange: the mocked schema exposes exactly one field, named "column1"
    const string columnName = "column1";
    var fieldProxyMock = new Mock<IStructFieldProxy>();
    var schemaProxyMock = new Mock<IStructTypeProxy>();

    fieldProxyMock
        .Setup(m => m.GetStructFieldName())
        .Returns(columnName);
    schemaProxyMock
        .Setup(m => m.GetStructTypeFields())
        .Returns(new List<IStructFieldProxy> { fieldProxyMock.Object });
    mockDataFrameProxy
        .Setup(m => m.GetSchema())
        .Returns(schemaProxyMock.Object);

    var context = new SparkContext("", "");
    mockDataFrameNaFunctionsProxy
        .Setup(m => m.Drop(It.IsAny<int>(), It.IsAny<string[]>()))
        .Returns(mockDataFrameProxy.Object);

    var source = new DataFrame(mockDataFrameProxy.Object, context);
    var naFunctions = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, source, context);

    // act
    var dropped = naFunctions.Drop(20);

    // verify
    Assert.IsNotNull(dropped);
    Assert.AreEqual(dropped.DataFrameProxy, source.DataFrameProxy);
    Assert.AreNotSame(source, dropped);
    mockDataFrameNaFunctionsProxy.Verify(m => m.Drop(20, new[] { columnName }), Times.Once());
}
/// <summary>
/// Checks Drop(columns): a non-empty column array is expected to reach the
/// NA-functions proxy once with the array length as threshold, while an empty
/// array is expected to return the original DataFrame without touching the proxy.
/// </summary>
/// <remarks>
/// Relies on the fixture-level mocks <c>mockDataFrameProxy</c> and
/// <c>mockDataFrameNaFunctionsProxy</c>, which are declared outside this method.
/// </remarks>
public void TestDropWithCols()
{
    // arrange
    var context = new SparkContext("", "");
    mockDataFrameNaFunctionsProxy
        .Setup(m => m.Drop(It.IsAny<int>(), It.IsAny<string[]>()))
        .Returns(mockDataFrameProxy.Object);

    var source = new DataFrame(mockDataFrameProxy.Object, context);
    var naFunctions = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, source, context);

    // act: non-empty column list
    var cols = new[] { "col1", "col2" };
    var result = naFunctions.Drop(cols);

    // verify
    Assert.IsNotNull(result);
    Assert.AreEqual(result.DataFrameProxy, source.DataFrameProxy);
    Assert.AreNotSame(source, result);
    mockDataFrameNaFunctionsProxy.Verify(m => m.Drop(cols.Length, cols), Times.Once());

    // act: empty column list, after clearing the mock's setups and recorded calls
    mockDataFrameNaFunctionsProxy.Reset();
    result = naFunctions.Drop(new string[] { });

    // verify: same instance comes back and the proxy is never invoked
    Assert.AreSame(source, result);
    mockDataFrameNaFunctionsProxy.Verify(
        m => m.Drop(It.IsAny<int>(), It.IsAny<string[]>()),
        Times.Never());
}
/// <summary>
/// Batch sample over a projects CSV file: shows the raw data, drops rows via
/// <c>Na().Drop("any")</c>, removes three columns, shows the average of
/// <c>forked_from</c> per language in descending order, then registers a UDF and
/// runs a SQL query that applies it to <c>updated_at</c>.
/// </summary>
/// <param name="args">
/// Must contain exactly one element, the path to the projects CSV file;
/// otherwise a usage message is written to stderr and the process exits with code 1.
/// </param>
public void Run(string[] args)
{
    if (args.Length != 1)
    {
        Console.Error.WriteLine(
            "Usage: GitHubProjects <path to projects.csv>");
        Environment.Exit(1);
    }

    const string projectsSchema =
        "id INT, url STRING, owner_id INT, " +
        "name STRING, descriptor STRING, language STRING, " +
        "created_at STRING, forked_from INT, deleted STRING, " +
        "updated_at STRING";

    SparkSession session = SparkSession.Builder().AppName("GitHub and Spark Batch").GetOrCreate();

    // Load the CSV with an explicit schema and show it
    DataFrame rawProjects = session.Read().Schema(projectsSchema).Csv(args[0]);
    rawProjects.Show();

    // Drop rows using the "any" mode of the NA functions
    DataFrame projects = rawProjects.Na().Drop("any");

    // Remove columns that the rest of the sample does not use
    projects = projects.Drop("id", "url", "owner_id");
    projects.Show();

    // Average of the forked_from column for each language
    var byLanguage = projects.GroupBy("language");
    DataFrame averageForks = byLanguage.Agg(Avg(projects["forked_from"]));

    // Highest averages first
    DataFrame sortedAverages = averageForks.OrderBy(Desc("avg(forked_from)"));
    sortedAverages.Show();

    // UDF: true when the text parses as a date that is later than s_referenceDate
    session.Udf().Register<string, bool>(
        "MyUDF",
        text => DateTime.TryParse(text, out DateTime parsed) && (parsed > s_referenceDate));

    projects.CreateOrReplaceTempView("dateView");
    DataFrame flagged = session.Sql(
        "SELECT *, MyUDF(dateView.updated_at) AS datebefore FROM dateView");
    flagged.Show();

    session.Stop();
}
/// <summary>
/// Checks the three Fill variants used here (double value, string value and
/// per-column dictionary): each is expected to forward its arguments to the
/// NA-functions proxy exactly once and return a new DataFrame wrapping the proxy's result.
/// </summary>
/// <remarks>
/// Relies on the fixture-level mocks <c>mockDataFrameProxy</c> and
/// <c>mockDataFrameNaFunctionsProxy</c>, which are declared outside this method.
/// The NA-functions mock is reset between the three scenarios.
/// </remarks>
public void TestFill()
{
    // arrange
    var context = new SparkContext("", "");

    // --- fill with a double value ---
    mockDataFrameNaFunctionsProxy
        .Setup(m => m.Fill(It.IsAny<double>(), It.IsAny<string[]>()))
        .Returns(mockDataFrameProxy.Object);
    var source = new DataFrame(mockDataFrameProxy.Object, context);
    var naFunctions = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, source, context);
    var cols = new[] { "col1", "col2" };
    const double doubleValue = 0.001;

    var filled = naFunctions.Fill(doubleValue, cols);

    Assert.IsNotNull(filled);
    Assert.AreEqual(filled.DataFrameProxy, source.DataFrameProxy);
    Assert.AreNotSame(source, filled);
    mockDataFrameNaFunctionsProxy.Verify(m => m.Fill(doubleValue, cols), Times.Once());

    // --- fill with a string value ---
    mockDataFrameNaFunctionsProxy.Reset();
    mockDataFrameNaFunctionsProxy
        .Setup(m => m.Fill(It.IsAny<string>(), It.IsAny<string[]>()))
        .Returns(mockDataFrameProxy.Object);
    source = new DataFrame(mockDataFrameProxy.Object, context);
    naFunctions = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, source, context);
    const string strValue = "UNKNOWN";

    filled = naFunctions.Fill(strValue, cols);

    Assert.IsNotNull(filled);
    Assert.AreEqual(filled.DataFrameProxy, source.DataFrameProxy);
    Assert.AreNotSame(source, filled);
    mockDataFrameNaFunctionsProxy.Verify(m => m.Fill(strValue, cols), Times.Once());

    // --- fill with a column-to-value dictionary ---
    mockDataFrameNaFunctionsProxy.Reset();
    mockDataFrameNaFunctionsProxy
        .Setup(m => m.Fill(It.IsAny<Dictionary<string, object>>()))
        .Returns(mockDataFrameProxy.Object);
    source = new DataFrame(mockDataFrameProxy.Object, context);
    naFunctions = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, source, context);
    var valueDict = new Dictionary<string, object>
    {
        ["col1"] = -1,
        ["col2"] = "UNKNOWN"
    };

    filled = naFunctions.Fill(valueDict);

    Assert.IsNotNull(filled);
    Assert.AreEqual(filled.DataFrameProxy, source.DataFrameProxy);
    Assert.AreNotSame(source, filled);
    mockDataFrameNaFunctionsProxy.Verify(m => m.Fill(valueDict), Times.Once());
}
/// <summary>
/// Checks that Drop("all", columns) is expected to call the NA-functions proxy
/// once with a threshold of 1 and the given columns.
/// </summary>
/// <remarks>
/// Relies on the fixture-level mocks <c>mockDataFrameProxy</c> and
/// <c>mockDataFrameNaFunctionsProxy</c>, which are declared outside this method.
/// </remarks>
public void TestDropWithAll()
{
    // arrange
    var context = new SparkContext("", "");
    mockDataFrameNaFunctionsProxy
        .Setup(m => m.Drop(It.IsAny<int>(), It.IsAny<string[]>()))
        .Returns(mockDataFrameProxy.Object);

    var source = new DataFrame(mockDataFrameProxy.Object, context);
    var naFunctions = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, source, context);

    // act
    var cols = new[] { "col1", "col2" };
    var dropped = naFunctions.Drop("all", cols);

    // verify
    Assert.IsNotNull(dropped);
    Assert.AreEqual(dropped.DataFrameProxy, source.DataFrameProxy);
    mockDataFrameNaFunctionsProxy.Verify(m => m.Drop(1, cols), Times.Once());
}
/// <summary>
/// Exercises the "any" flavours of Drop on DataFrameNaFunctions and checks
/// which arguments the test expects to reach the NA-functions proxy.
/// </summary>
/// <remarks>
/// Relies on the fixture-level mocks <c>mockDataFrameProxy</c> and
/// <c>mockDataFrameNaFunctionsProxy</c>, which are declared outside this method.
/// </remarks>
public void TestDropWithAny()
{
    // arrange: the mocked schema exposes exactly one field, named "column1"
    const string columnName = "column1";
    var fieldProxyMock = new Mock<IStructFieldProxy>();
    var schemaProxyMock = new Mock<IStructTypeProxy>();

    fieldProxyMock
        .Setup(m => m.GetStructFieldName())
        .Returns(columnName);
    schemaProxyMock
        .Setup(m => m.GetStructTypeFields())
        .Returns(new List<IStructFieldProxy> { fieldProxyMock.Object });
    mockDataFrameProxy
        .Setup(m => m.GetSchema())
        .Returns(schemaProxyMock.Object);

    var context = new SparkContext("", "");
    mockDataFrameNaFunctionsProxy
        .Setup(m => m.Drop(It.IsAny<int>(), It.IsAny<string[]>()))
        .Returns(mockDataFrameProxy.Object);

    var source = new DataFrame(mockDataFrameProxy.Object, context);
    var naFunctions = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, source, context);

    // act
    var cols = new[] { "col1", "col2" };
    var withColumns = naFunctions.Drop("any", cols);
    var withDefaults = naFunctions.Drop();
    var withHowOnly = naFunctions.Drop("any");

    // verify: explicit columns are forwarded together with their count
    Assert.IsNotNull(withColumns);
    Assert.AreEqual(withColumns.DataFrameProxy, source.DataFrameProxy);
    mockDataFrameNaFunctionsProxy.Verify(m => m.Drop(cols.Length, cols), Times.Once());

    // verify: both column-less calls are expected to use the single schema column
    Assert.IsNotNull(withDefaults);
    Assert.AreEqual(withDefaults.DataFrameProxy, source.DataFrameProxy);
    Assert.IsNotNull(withHowOnly);
    Assert.AreEqual(withHowOnly.DataFrameProxy, source.DataFrameProxy);
    mockDataFrameNaFunctionsProxy.Verify(m => m.Drop(1, new[] { columnName }), Times.Exactly(2));
}
/// <summary>
/// Checks that Replace(columns, replacement) hands the same column array and
/// replacement dictionary to the NA-functions proxy exactly once, and that the
/// returned DataFrame is a new wrapper around the proxy's result.
/// </summary>
/// <remarks>
/// Relies on the fixture-level mocks <c>mockDataFrameProxy</c> and
/// <c>mockDataFrameNaFunctionsProxy</c>, which are declared outside this method.
/// </remarks>
public void TestReplaceWithColumns()
{
    // arrange
    var context = new SparkContext("", "");
    mockDataFrameNaFunctionsProxy
        .Setup(m => m.Replace(It.IsAny<string[]>(), It.IsAny<Dictionary<string, string>>()))
        .Returns(mockDataFrameProxy.Object);

    var replacement = new Dictionary<string, string>
    {
        [""] = "unknown",
        ["?"] = "unknown"
    };
    var source = new DataFrame(mockDataFrameProxy.Object, context);
    var naFunctions = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, source, context);
    var cols = new[] { "col1", "col2" };

    // act
    var replaced = naFunctions.Replace(cols, replacement);

    // verify
    Assert.IsNotNull(replaced);
    Assert.AreEqual(replaced.DataFrameProxy, source.DataFrameProxy);
    Assert.AreNotSame(source, replaced);
    mockDataFrameNaFunctionsProxy.Verify(m => m.Replace(cols, replacement), Times.Once());
}
/// <summary>
/// Checks the three Fill variants used here (double value, string value and
/// per-column dictionary): each is expected to forward its arguments to the
/// NA-functions proxy exactly once and return a new DataFrame wrapping the proxy's result.
/// </summary>
/// <remarks>
/// Relies on the fixture-level mocks <c>mockDataFrameProxy</c> and
/// <c>mockDataFrameNaFunctionsProxy</c>, which are declared outside this method.
/// The NA-functions mock is reset between the three scenarios.
/// </remarks>
public void TestFill()
{
    // arrange
    var context = new SparkContext("", "");

    // --- fill with a double value ---
    mockDataFrameNaFunctionsProxy
        .Setup(m => m.Fill(It.IsAny<double>(), It.IsAny<string[]>()))
        .Returns(mockDataFrameProxy.Object);
    var source = new DataFrame(mockDataFrameProxy.Object, context);
    var naFunctions = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, source, context);
    var cols = new[] { "col1", "col2" };
    const double doubleValue = 0.001;

    var filled = naFunctions.Fill(doubleValue, cols);

    Assert.IsNotNull(filled);
    Assert.AreEqual(filled.DataFrameProxy, source.DataFrameProxy);
    Assert.AreNotSame(source, filled);
    mockDataFrameNaFunctionsProxy.Verify(m => m.Fill(doubleValue, cols), Times.Once());

    // --- fill with a string value ---
    mockDataFrameNaFunctionsProxy.Reset();
    mockDataFrameNaFunctionsProxy
        .Setup(m => m.Fill(It.IsAny<string>(), It.IsAny<string[]>()))
        .Returns(mockDataFrameProxy.Object);
    source = new DataFrame(mockDataFrameProxy.Object, context);
    naFunctions = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, source, context);
    const string strValue = "UNKNOWN";

    filled = naFunctions.Fill(strValue, cols);

    Assert.IsNotNull(filled);
    Assert.AreEqual(filled.DataFrameProxy, source.DataFrameProxy);
    Assert.AreNotSame(source, filled);
    mockDataFrameNaFunctionsProxy.Verify(m => m.Fill(strValue, cols), Times.Once());

    // --- fill with a column-to-value dictionary ---
    mockDataFrameNaFunctionsProxy.Reset();
    mockDataFrameNaFunctionsProxy
        .Setup(m => m.Fill(It.IsAny<Dictionary<string, object>>()))
        .Returns(mockDataFrameProxy.Object);
    source = new DataFrame(mockDataFrameProxy.Object, context);
    naFunctions = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, source, context);
    var valueDict = new Dictionary<string, object>
    {
        ["col1"] = -1,
        ["col2"] = "UNKNOWN"
    };

    filled = naFunctions.Fill(valueDict);

    Assert.IsNotNull(filled);
    Assert.AreEqual(filled.DataFrameProxy, source.DataFrameProxy);
    Assert.AreNotSame(source, filled);
    mockDataFrameNaFunctionsProxy.Verify(m => m.Fill(valueDict), Times.Once());
}
/// <summary>
/// Checks that Drop(minNonNulls) without explicit columns is expected to call
/// the NA-functions proxy once with the same threshold and the schema's column names.
/// </summary>
/// <remarks>
/// Relies on the fixture-level mocks <c>mockDataFrameProxy</c> and
/// <c>mockDataFrameNaFunctionsProxy</c>, which are declared outside this method.
/// </remarks>
public void TestDropWithMinNonNulls()
{
    // arrange: the mocked schema exposes exactly one field, named "column1"
    const string columnName = "column1";
    var fieldProxyMock = new Mock<IStructFieldProxy>();
    var schemaProxyMock = new Mock<IStructTypeProxy>();

    fieldProxyMock
        .Setup(m => m.GetStructFieldName())
        .Returns(columnName);
    schemaProxyMock
        .Setup(m => m.GetStructTypeFields())
        .Returns(new List<IStructFieldProxy> { fieldProxyMock.Object });
    mockDataFrameProxy
        .Setup(m => m.GetSchema())
        .Returns(schemaProxyMock.Object);

    var context = new SparkContext("", "");
    mockDataFrameNaFunctionsProxy
        .Setup(m => m.Drop(It.IsAny<int>(), It.IsAny<string[]>()))
        .Returns(mockDataFrameProxy.Object);

    var source = new DataFrame(mockDataFrameProxy.Object, context);
    var naFunctions = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, source, context);

    // act
    var dropped = naFunctions.Drop(20);

    // verify
    Assert.IsNotNull(dropped);
    Assert.AreEqual(dropped.DataFrameProxy, source.DataFrameProxy);
    Assert.AreNotSame(source, dropped);
    mockDataFrameNaFunctionsProxy.Verify(m => m.Drop(20, new[] { columnName }), Times.Once());
}
/// <summary>
/// Calls the Drop, Fill and Replace overloads of DataFrameNaFunctions obtained
/// from <c>_df.Na()</c>, assigning every result to a DataFrame variable.
/// </summary>
/// <remarks>
/// The method contains no assertions; each call's result is only stored in a
/// DataFrame-typed local. <c>_df</c> is a field declared outside this method.
/// </remarks>
public void TestDataFrameNaFunctionSignatures()
{
    DataFrameNaFunctions naFunctions = _df.Na();

    var noColumns = new string[0];
    var ageColumn = new[] { "age" };

    // Drop overloads
    DataFrame result = naFunctions.Drop("any");
    result = naFunctions.Drop("all");
    result = naFunctions.Drop(noColumns);
    result = naFunctions.Drop(ageColumn);
    result = naFunctions.Drop("any", noColumns);
    result = naFunctions.Drop("all", ageColumn);
    result = naFunctions.Drop(20);
    result = naFunctions.Drop(20, noColumns);
    result = naFunctions.Drop(20, ageColumn);

    // Fill overloads: single value
    result = naFunctions.Fill(100L);
    result = naFunctions.Fill(100.0);
    result = naFunctions.Fill("hello");
    result = naFunctions.Fill(false);

    // Fill overloads: single value restricted to columns
    result = naFunctions.Fill(100L, noColumns);
    result = naFunctions.Fill(100L, ageColumn);
    result = naFunctions.Fill(100.0, noColumns);
    result = naFunctions.Fill(100.0, ageColumn);
    result = naFunctions.Fill("hello", noColumns);
    result = naFunctions.Fill("hello", ageColumn);
    result = naFunctions.Fill(true, noColumns);
    result = naFunctions.Fill(true, ageColumn);

    // Fill overloads: per-column value maps
    result = naFunctions.Fill(new Dictionary<string, int> { ["age"] = 10 });
    result = naFunctions.Fill(new Dictionary<string, long> { ["age"] = 10L });
    result = naFunctions.Fill(new Dictionary<string, double> { ["age"] = 10.0 });
    result = naFunctions.Fill(new Dictionary<string, string> { ["age"] = "name" });
    result = naFunctions.Fill(new Dictionary<string, bool> { ["age"] = false });

    // Replace overloads
    var doubleReplacement = new Dictionary<double, double> { [1.0] = 5.0 };
    var boolReplacement = new Dictionary<bool, bool> { [true] = false };
    var stringReplacement = new Dictionary<string, string> { ["a"] = "b" };

    result = naFunctions.Replace("age", doubleReplacement);
    result = naFunctions.Replace("age", boolReplacement);
    result = naFunctions.Replace("age", stringReplacement);

    result = naFunctions.Replace(noColumns, doubleReplacement);
    result = naFunctions.Replace(ageColumn, doubleReplacement);
    result = naFunctions.Replace(noColumns, boolReplacement);
    result = naFunctions.Replace(ageColumn, boolReplacement);
    result = naFunctions.Replace(noColumns, stringReplacement);
    result = naFunctions.Replace(ageColumn, stringReplacement);
}