Ejemplo n.º 1
0
        /// <summary>
        /// Verifies the "any" flavours of <c>DataFrameNaFunctions.Drop</c>:
        /// <c>Drop("any", cols)</c> must reach the proxy as <c>Drop(cols.Length, cols)</c>, while
        /// <c>Drop()</c> and <c>Drop("any")</c> must each reach it as <c>Drop(1, { "column1" })</c>,
        /// where "column1" is the only field exposed by the mocked schema.
        /// </summary>
        public void TestDropWithAny()
        {
            // arrange: the mocked schema exposes exactly one field, named columnName
            const string columnName = "column1";
            var mockSchemaProxy = new Mock<IStructTypeProxy>();
            var mockFieldProxy = new Mock<IStructFieldProxy>();
            mockDataFrameProxy.Setup(m => m.GetSchema()).Returns(mockSchemaProxy.Object);
            mockSchemaProxy.Setup(m => m.GetStructTypeFields()).Returns(new List<IStructFieldProxy> { mockFieldProxy.Object });
            mockFieldProxy.Setup(m => m.GetStructFieldName()).Returns(columnName);

            var sparkContext = new SparkContext("", "");
            mockDataFrameNaFunctionsProxy.Setup(m => m.Drop(It.IsAny<int>(), It.IsAny<string[]>())).Returns(mockDataFrameProxy.Object);

            var dataFrame = new DataFrame(mockDataFrameProxy.Object, sparkContext);
            var f = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, dataFrame, sparkContext);

            // act
            var cols = new[] { "col1", "col2" };
            var df1 = f.Drop("any", cols);
            var df2 = f.Drop();
            var df3 = f.Drop("any");

            // verify: explicit columns are forwarded together with their count
            // (Assert.AreEqual takes the expected value first, then the actual one)
            Assert.IsNotNull(df1);
            Assert.AreEqual(dataFrame.DataFrameProxy, df1.DataFrameProxy);
            mockDataFrameNaFunctionsProxy.Verify(m => m.Drop(cols.Length, cols), Times.Once);

            Assert.IsNotNull(df2);
            Assert.AreEqual(dataFrame.DataFrameProxy, df2.DataFrameProxy);

            Assert.IsNotNull(df3);
            Assert.AreEqual(dataFrame.DataFrameProxy, df3.DataFrameProxy);

            // Drop() and Drop("any") are both expected to produce this same proxy call
            mockDataFrameNaFunctionsProxy.Verify(m => m.Drop(1, new[] { columnName }), Times.Exactly(2));
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Verifies that <c>DataFrameNaFunctions.Replace(cols, replacement)</c> forwards the column
        /// array and the replacement dictionary to the proxy exactly once, and that the result is a
        /// new <c>DataFrame</c> instance wrapping the proxy returned by the mock.
        /// </summary>
        public void TestReplaceWithColumns()
        {
            // arrange
            var sparkContext = new SparkContext("", "");

            mockDataFrameNaFunctionsProxy
                .Setup(m => m.Replace(It.IsAny<string[]>(), It.IsAny<Dictionary<string, string>>()))
                .Returns(mockDataFrameProxy.Object);

            var replacement = new Dictionary<string, string>()
            {
                { "", "unknown" },
                { "?", "unknown" }
            };
            var dataFrame = new DataFrame(mockDataFrameProxy.Object, sparkContext);
            var f = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, dataFrame, sparkContext);
            var cols = new[] { "col1", "col2" };

            // act
            var df = f.Replace(cols, replacement);

            // verify (Assert.AreEqual takes the expected value first, then the actual one)
            Assert.IsNotNull(df);
            Assert.AreEqual(dataFrame.DataFrameProxy, df.DataFrameProxy);
            Assert.AreNotSame(dataFrame, df);
            mockDataFrameNaFunctionsProxy.Verify(m => m.Replace(cols, replacement), Times.Once);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Sample entry point: reads "DataBook.csv" with a fixed four-column schema, shows it,
        /// drops rows through <c>Na().Drop("any")</c>, drops the listed columns, shows the result
        /// and stops the Spark session.
        /// </summary>
        /// <param name="args">Command-line arguments; not used by this sample.</param>
        static void Main(string[] args)
        {
            // 1. Create a Spark session
            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("word_count_sample")
                                 .GetOrCreate();

            try
            {
                // 2. Create the initial DataFrame from the CSV file
                DataFrame dataFrame = spark.Read()
                                      .Schema("Assertid STRING,properties STRING,Value STRING,TimeSatmp TIMESTAMP")
                                      .Csv("DataBook.csv");

                dataFrame.Show();

                // 3. Drop rows using the "any" mode of the NA functions
                DataFrameNaFunctions naFunctions = dataFrame.Na();
                DataFrame cleanedProjects = naFunctions.Drop("any");

                // 4. Remove columns.
                // NOTE(review): these are all four columns declared in the schema above, so
                // nothing from that schema remains afterwards - confirm this is intended.
                cleanedProjects = cleanedProjects.Drop("Assertid", "properties", "Value", "TimeSatmp");
                cleanedProjects.Show();
            }
            finally
            {
                // Stop the session even when reading or transforming the data fails.
                spark.Stop();
            }
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Verifies that <c>DataFrameNaFunctions.Drop(20)</c> is forwarded to the proxy as
        /// <c>Drop(20, { "column1" })</c>, "column1" being the only field exposed by the mocked
        /// schema, and that a new <c>DataFrame</c> wrapping the mocked proxy is returned.
        /// </summary>
        public void TestDropWithMinNonNulls()
        {
            // arrange: the mocked schema exposes exactly one field, named columnName
            const string columnName = "column1";
            var mockSchemaProxy = new Mock<IStructTypeProxy>();
            var mockFieldProxy = new Mock<IStructFieldProxy>();

            mockDataFrameProxy.Setup(m => m.GetSchema()).Returns(mockSchemaProxy.Object);
            mockSchemaProxy.Setup(m => m.GetStructTypeFields()).Returns(new List<IStructFieldProxy> {
                mockFieldProxy.Object
            });
            mockFieldProxy.Setup(m => m.GetStructFieldName()).Returns(columnName);

            var sparkContext = new SparkContext("", "");

            mockDataFrameNaFunctionsProxy.Setup(m => m.Drop(It.IsAny<int>(), It.IsAny<string[]>())).Returns(mockDataFrameProxy.Object);

            var dataFrame = new DataFrame(mockDataFrameProxy.Object, sparkContext);
            var f = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, dataFrame, sparkContext);

            // act
            var df = f.Drop(20);

            // verify (Assert.AreEqual takes the expected value first, then the actual one)
            Assert.IsNotNull(df);
            Assert.AreEqual(dataFrame.DataFrameProxy, df.DataFrameProxy);
            Assert.AreNotSame(dataFrame, df);
            mockDataFrameNaFunctionsProxy.Verify(m => m.Drop(20, new[] { columnName }), Times.Once);
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Verifies <c>DataFrameNaFunctions.Drop(cols)</c>: a non-empty column array is forwarded
        /// to the proxy as <c>Drop(cols.Length, cols)</c> and yields a new <c>DataFrame</c>, whereas
        /// an empty column array returns the original <c>DataFrame</c> instance without calling
        /// the proxy at all.
        /// </summary>
        public void TestDropWithCols()
        {
            // arrange
            var sparkContext = new SparkContext("", "");

            mockDataFrameNaFunctionsProxy.Setup(m => m.Drop(It.IsAny<int>(), It.IsAny<string[]>())).Returns(mockDataFrameProxy.Object);

            var dataFrame = new DataFrame(mockDataFrameProxy.Object, sparkContext);
            var f = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, dataFrame, sparkContext);

            // act
            var cols = new[] { "col1", "col2" };
            var df = f.Drop(cols);

            // verify (Assert.AreEqual takes the expected value first, then the actual one)
            Assert.IsNotNull(df);
            Assert.AreEqual(dataFrame.DataFrameProxy, df.DataFrameProxy);
            Assert.AreNotSame(dataFrame, df);
            mockDataFrameNaFunctionsProxy.Verify(m => m.Drop(cols.Length, cols), Times.Once);

            // Clear the setup and the recorded call so the Times.Never check below only
            // observes the empty-column call.
            mockDataFrameNaFunctionsProxy.Reset();

            df = f.Drop(new string[] { });

            // With no columns the very same DataFrame is expected back and the proxy stays untouched.
            Assert.AreSame(dataFrame, df);
            mockDataFrameNaFunctionsProxy.Verify(m => m.Drop(It.IsAny<int>(), It.IsAny<string[]>()), Times.Never);
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Batch sample over a projects CSV file: loads it with a fixed schema, drops rows via
        /// <c>Na().Drop("any")</c>, removes three columns, shows the average of
        /// <c>forked_from</c> per language in descending order, and finally shows the data with an
        /// extra column computed by the registered "MyUDF" function.
        /// </summary>
        /// <param name="args">
        /// Command-line arguments; exactly one is expected, the path passed to the CSV reader.
        /// Any other count prints a usage message and exits the process with code 1.
        /// </param>
        public void Run(string[] args)
        {
            if (args.Length != 1)
            {
                Console.Error.WriteLine("Usage: GitHubProjects <path to projects.csv>");
                Environment.Exit(1);
            }

            // Schema applied to the CSV input.
            const string projectsSchema =
                "id INT, url STRING, owner_id INT, " +
                "name STRING, descriptor STRING, language STRING, " +
                "created_at STRING, forked_from INT, deleted STRING, " +
                "updated_at STRING";

            SparkSession session = SparkSession.Builder().AppName("GitHub and Spark Batch").GetOrCreate();

            DataFrame rawProjects = session.Read().Schema(projectsSchema).Csv(args[0]);
            rawProjects.Show();

            // Drop rows using the "any" mode of the NA functions, then remove the columns
            // that are not needed further down.
            DataFrame projects = rawProjects.Na().Drop("any").Drop("id", "url", "owner_id");
            projects.Show();

            // Average of forked_from per language, largest average first.
            DataFrame forksByLanguage = projects
                .GroupBy("language")
                .Agg(Avg(projects["forked_from"]));
            DataFrame mostForkedFirst = forksByLanguage.OrderBy(Desc("avg(forked_from)"));
            mostForkedFirst.Show();

            // UDF: true when the text parses as a date that is later than s_referenceDate.
            session.Udf().Register<string, bool>(
                "MyUDF",
                (text) => DateTime.TryParse(text, out DateTime parsed) &&
                (parsed > s_referenceDate));

            projects.CreateOrReplaceTempView("dateView");

            DataFrame withDateFlag = session.Sql(
                "SELECT *, MyUDF(dateView.updated_at) AS datebefore FROM dateView");
            withDateFlag.Show();

            session.Stop();
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Verifies the three <c>DataFrameNaFunctions.Fill</c> variants exercised here - a double
        /// value with columns, a string value with columns, and a column-to-value dictionary.
        /// Each must be forwarded to the proxy exactly once with the same arguments and must
        /// return a new <c>DataFrame</c> wrapping the mocked proxy.
        /// </summary>
        public void TestFill()
        {
            // arrange
            var sparkContext = new SparkContext("", "");

            // test fill with double value
            mockDataFrameNaFunctionsProxy.Setup(m => m.Fill(It.IsAny<double>(), It.IsAny<string[]>())).Returns(mockDataFrameProxy.Object);
            var dataFrame = new DataFrame(mockDataFrameProxy.Object, sparkContext);
            var f = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, dataFrame, sparkContext);

            var cols = new[] { "col1", "col2" };
            const double doubleValue = 0.001;
            var df = f.Fill(doubleValue, cols);

            // verify (Assert.AreEqual takes the expected value first, then the actual one)
            Assert.IsNotNull(df);
            Assert.AreEqual(dataFrame.DataFrameProxy, df.DataFrameProxy);
            Assert.AreNotSame(dataFrame, df);
            mockDataFrameNaFunctionsProxy.Verify(m => m.Fill(doubleValue, cols), Times.Once);

            // test fill with string value; Reset() clears the previous setup and recorded calls
            mockDataFrameNaFunctionsProxy.Reset();
            mockDataFrameNaFunctionsProxy.Setup(m => m.Fill(It.IsAny<string>(), It.IsAny<string[]>())).Returns(mockDataFrameProxy.Object);
            dataFrame = new DataFrame(mockDataFrameProxy.Object, sparkContext);
            f = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, dataFrame, sparkContext);
            const string strValue = "UNKNOWN";

            df = f.Fill(strValue, cols);

            // verify
            Assert.IsNotNull(df);
            Assert.AreEqual(dataFrame.DataFrameProxy, df.DataFrameProxy);
            Assert.AreNotSame(dataFrame, df);
            mockDataFrameNaFunctionsProxy.Verify(m => m.Fill(strValue, cols), Times.Once);

            // test fill with dictionary
            mockDataFrameNaFunctionsProxy.Reset();
            mockDataFrameNaFunctionsProxy.Setup(m => m.Fill(It.IsAny<Dictionary<string, object>>())).Returns(mockDataFrameProxy.Object);
            dataFrame = new DataFrame(mockDataFrameProxy.Object, sparkContext);
            f = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, dataFrame, sparkContext);
            var valueDict = new Dictionary<string, object>()
            {
                { "col1", -1 },
                { "col2", "UNKNOWN" }
            };

            df = f.Fill(valueDict);

            // verify
            Assert.IsNotNull(df);
            Assert.AreEqual(dataFrame.DataFrameProxy, df.DataFrameProxy);
            Assert.AreNotSame(dataFrame, df);
            mockDataFrameNaFunctionsProxy.Verify(m => m.Fill(valueDict), Times.Once);
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Verifies that <c>DataFrameNaFunctions.Drop("all", cols)</c> is forwarded to the proxy
        /// exactly once as <c>Drop(1, cols)</c> and returns a <c>DataFrame</c> wrapping the mocked proxy.
        /// </summary>
        public void TestDropWithAll()
        {
            // arrange
            var sparkContext = new SparkContext("", "");
            mockDataFrameNaFunctionsProxy.Setup(m => m.Drop(It.IsAny<int>(), It.IsAny<string[]>())).Returns(mockDataFrameProxy.Object);

            var dataFrame = new DataFrame(mockDataFrameProxy.Object, sparkContext);
            var f = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, dataFrame, sparkContext);

            // act
            var cols = new[] { "col1", "col2" };
            var df = f.Drop("all", cols);

            // verify (Assert.AreEqual takes the expected value first, then the actual one)
            Assert.IsNotNull(df);
            Assert.AreEqual(dataFrame.DataFrameProxy, df.DataFrameProxy);

            // "all" is expected to reach the proxy with 1 as the int argument, regardless of cols.Length
            mockDataFrameNaFunctionsProxy.Verify(m => m.Drop(1, cols), Times.Once);
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Verifies how "any"-mode drops are forwarded to the NA-functions proxy.
        /// With explicit columns the proxy must receive <c>Drop(cols.Length, cols)</c> once; the
        /// column-less calls <c>Drop()</c> and <c>Drop("any")</c> must together produce two
        /// <c>Drop(1, { "column1" })</c> calls, "column1" being the single mocked schema field.
        /// </summary>
        public void TestDropWithAny()
        {
            // arrange: schema mock with a single field called columnName
            const string columnName = "column1";
            var mockSchemaProxy = new Mock<IStructTypeProxy>();
            var mockFieldProxy = new Mock<IStructFieldProxy>();

            mockDataFrameProxy.Setup(m => m.GetSchema()).Returns(mockSchemaProxy.Object);
            mockSchemaProxy.Setup(m => m.GetStructTypeFields()).Returns(new List<IStructFieldProxy> {
                mockFieldProxy.Object
            });
            mockFieldProxy.Setup(m => m.GetStructFieldName()).Returns(columnName);

            var sparkContext = new SparkContext("", "");

            mockDataFrameNaFunctionsProxy.Setup(m => m.Drop(It.IsAny<int>(), It.IsAny<string[]>())).Returns(mockDataFrameProxy.Object);

            var dataFrame = new DataFrame(mockDataFrameProxy.Object, sparkContext);
            var f = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, dataFrame, sparkContext);

            // act
            var cols = new[] { "col1", "col2" };
            var df1 = f.Drop("any", cols);
            var df2 = f.Drop();
            var df3 = f.Drop("any");

            // verify: expected value goes first in Assert.AreEqual, the actual value second
            Assert.IsNotNull(df1);
            Assert.AreEqual(dataFrame.DataFrameProxy, df1.DataFrameProxy);
            mockDataFrameNaFunctionsProxy.Verify(m => m.Drop(cols.Length, cols), Times.Once);

            Assert.IsNotNull(df2);
            Assert.AreEqual(dataFrame.DataFrameProxy, df2.DataFrameProxy);

            Assert.IsNotNull(df3);
            Assert.AreEqual(dataFrame.DataFrameProxy, df3.DataFrameProxy);

            // both column-less calls are expected to hit the proxy with identical arguments
            mockDataFrameNaFunctionsProxy.Verify(m => m.Drop(1, new[] { columnName }), Times.Exactly(2));
        }
Ejemplo n.º 10
0
        /// <summary>
        /// Verifies that replacing values in the given columns calls the proxy's <c>Replace</c>
        /// exactly once with the same column array and replacement dictionary, and that a new
        /// <c>DataFrame</c> wrapping the mocked proxy is returned.
        /// </summary>
        public void TestReplaceWithColumns()
        {
            // arrange
            var sparkContext = new SparkContext("", "");
            mockDataFrameNaFunctionsProxy.Setup(m => m.Replace(It.IsAny<string[]>(), It.IsAny<Dictionary<string, string>>()))
                .Returns(mockDataFrameProxy.Object);

            var replacement = new Dictionary<string, string>()
            {
                {"", "unknown"},
                {"?", "unknown"}
            };
            var dataFrame = new DataFrame(mockDataFrameProxy.Object, sparkContext);
            var f = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, dataFrame, sparkContext);
            var cols = new[] { "col1", "col2" };

            // act
            var df = f.Replace(cols, replacement);

            // verify: expected value goes first in Assert.AreEqual, the actual value second
            Assert.IsNotNull(df);
            Assert.AreEqual(dataFrame.DataFrameProxy, df.DataFrameProxy);
            Assert.AreNotSame(dataFrame, df);
            mockDataFrameNaFunctionsProxy.Verify(m => m.Replace(cols, replacement), Times.Once);
        }
Ejemplo n.º 11
0
        /// <summary>
        /// Exercises <c>DataFrameNaFunctions.Fill</c> with a double value, a string value and a
        /// column-to-value dictionary. For each variant the proxy must be called exactly once with
        /// the same arguments, and the result must be a new <c>DataFrame</c> wrapping the mocked proxy.
        /// </summary>
        public void TestFill()
        {
            // arrange
            var sparkContext = new SparkContext("", "");

            // test fill with double value
            mockDataFrameNaFunctionsProxy.Setup(m => m.Fill(It.IsAny<double>(), It.IsAny<string[]>())).Returns(mockDataFrameProxy.Object);
            var dataFrame = new DataFrame(mockDataFrameProxy.Object, sparkContext);
            var f = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, dataFrame, sparkContext);

            var cols = new[] { "col1", "col2" };
            const double doubleValue = 0.001;
            var df = f.Fill(doubleValue, cols);

            // verify: expected value goes first in Assert.AreEqual, the actual value second
            Assert.IsNotNull(df);
            Assert.AreEqual(dataFrame.DataFrameProxy, df.DataFrameProxy);
            Assert.AreNotSame(dataFrame, df);
            mockDataFrameNaFunctionsProxy.Verify(m => m.Fill(doubleValue, cols), Times.Once);

            // test fill with string value; Reset() drops the earlier setup and recorded calls
            mockDataFrameNaFunctionsProxy.Reset();
            mockDataFrameNaFunctionsProxy.Setup(m => m.Fill(It.IsAny<string>(), It.IsAny<string[]>())).Returns(mockDataFrameProxy.Object);
            dataFrame = new DataFrame(mockDataFrameProxy.Object, sparkContext);
            f = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, dataFrame, sparkContext);
            const string strValue = "UNKNOWN";
            df = f.Fill(strValue, cols);

            // verify
            Assert.IsNotNull(df);
            Assert.AreEqual(dataFrame.DataFrameProxy, df.DataFrameProxy);
            Assert.AreNotSame(dataFrame, df);
            mockDataFrameNaFunctionsProxy.Verify(m => m.Fill(strValue, cols), Times.Once);

            // test fill with dictionary
            mockDataFrameNaFunctionsProxy.Reset();
            mockDataFrameNaFunctionsProxy.Setup(m => m.Fill(It.IsAny<Dictionary<string, object>>())).Returns(mockDataFrameProxy.Object);
            dataFrame = new DataFrame(mockDataFrameProxy.Object, sparkContext);
            f = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, dataFrame, sparkContext);
            var valueDict = new Dictionary<string, object>()
            {
                {"col1", -1},
                {"col2", "UNKNOWN"}
            };
            df = f.Fill(valueDict);

            // verify
            Assert.IsNotNull(df);
            Assert.AreEqual(dataFrame.DataFrameProxy, df.DataFrameProxy);
            Assert.AreNotSame(dataFrame, df);
            mockDataFrameNaFunctionsProxy.Verify(m => m.Fill(valueDict), Times.Once);
        }
Ejemplo n.º 12
0
        /// <summary>
        /// Verifies that dropping with a minimum non-null count and no explicit columns
        /// (<c>Drop(20)</c>) calls the proxy once as <c>Drop(20, { "column1" })</c>, where
        /// "column1" is the single field of the mocked schema, and returns a new <c>DataFrame</c>.
        /// </summary>
        public void TestDropWithMinNonNulls()
        {
            // arrange: schema mock with a single field called columnName
            const string columnName = "column1";
            var mockSchemaProxy = new Mock<IStructTypeProxy>();
            var mockFieldProxy = new Mock<IStructFieldProxy>();
            mockDataFrameProxy.Setup(m => m.GetSchema()).Returns(mockSchemaProxy.Object);
            mockSchemaProxy.Setup(m => m.GetStructTypeFields()).Returns(new List<IStructFieldProxy> { mockFieldProxy.Object });
            mockFieldProxy.Setup(m => m.GetStructFieldName()).Returns(columnName);

            var sparkContext = new SparkContext("", "");
            mockDataFrameNaFunctionsProxy.Setup(m => m.Drop(It.IsAny<int>(), It.IsAny<string[]>())).Returns(mockDataFrameProxy.Object);

            var dataFrame = new DataFrame(mockDataFrameProxy.Object, sparkContext);
            var f = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, dataFrame, sparkContext);

            // act
            var df = f.Drop(20);

            // verify: expected value goes first in Assert.AreEqual, the actual value second
            Assert.IsNotNull(df);
            Assert.AreEqual(dataFrame.DataFrameProxy, df.DataFrameProxy);
            Assert.AreNotSame(dataFrame, df);
            mockDataFrameNaFunctionsProxy.Verify(m => m.Drop(20, new[] { columnName }), Times.Once);
        }
        /// <summary>
        /// Signature smoke test for <see cref="DataFrameNaFunctions"/>: invokes the Drop, Fill and
        /// Replace overloads with representative argument types and assigns every result to a
        /// <see cref="DataFrame"/> local. No assertions are made on the returned data.
        /// </summary>
        public void TestDataFrameNaFunctionSignatures()
        {
            DataFrameNaFunctions naFunctions = _df.Na();

            string[] noColumns = new string[0];
            string[] ageColumn = { "age" };

            // Drop overloads: by mode, by columns, by mode + columns, by minimum count (+ columns).
            DataFrame result = naFunctions.Drop("any");
            result = naFunctions.Drop("all");
            result = naFunctions.Drop(noColumns);
            result = naFunctions.Drop(ageColumn);
            result = naFunctions.Drop("any", noColumns);
            result = naFunctions.Drop("all", ageColumn);
            result = naFunctions.Drop(20);
            result = naFunctions.Drop(20, noColumns);
            result = naFunctions.Drop(20, ageColumn);

            // Fill overloads taking a single value, without and with a column list.
            result = naFunctions.Fill(100L);
            result = naFunctions.Fill(100.0);
            result = naFunctions.Fill("hello");
            result = naFunctions.Fill(false);
            result = naFunctions.Fill(100L, noColumns);
            result = naFunctions.Fill(100L, ageColumn);
            result = naFunctions.Fill(100.0, noColumns);
            result = naFunctions.Fill(100.0, ageColumn);
            result = naFunctions.Fill("hello", noColumns);
            result = naFunctions.Fill("hello", ageColumn);
            result = naFunctions.Fill(true, noColumns);
            result = naFunctions.Fill(true, ageColumn);

            // Fill overloads taking a column-name-to-value dictionary, one per value type.
            var intFill = new Dictionary<string, int> { ["age"] = 10 };
            result = naFunctions.Fill(intFill);

            var longFill = new Dictionary<string, long> { ["age"] = 10L };
            result = naFunctions.Fill(longFill);

            var doubleFill = new Dictionary<string, double> { ["age"] = 10.0 };
            result = naFunctions.Fill(doubleFill);

            var stringFill = new Dictionary<string, string> { ["age"] = "name" };
            result = naFunctions.Fill(stringFill);

            var boolFill = new Dictionary<string, bool> { ["age"] = false };
            result = naFunctions.Fill(boolFill);

            // Replace overloads: single column name or column array, for each replacement map type.
            var doubleMap = new Dictionary<double, double> { [1.0] = 5.0 };
            var boolMap = new Dictionary<bool, bool> { [true] = false };
            var stringMap = new Dictionary<string, string> { ["a"] = "b" };

            result = naFunctions.Replace("age", doubleMap);
            result = naFunctions.Replace("age", boolMap);
            result = naFunctions.Replace("age", stringMap);
            result = naFunctions.Replace(noColumns, doubleMap);
            result = naFunctions.Replace(ageColumn, doubleMap);
            result = naFunctions.Replace(noColumns, boolMap);
            result = naFunctions.Replace(ageColumn, boolMap);
            result = naFunctions.Replace(noColumns, stringMap);
            result = naFunctions.Replace(ageColumn, stringMap);
        }