public void TestDropWithAny()
        {
            // arrange
            // One-column schema so the no-column Drop overloads can fall back to the
            // DataFrame's own column list.
            const string columnName = "column1";
            var mockSchemaProxy = new Mock<IStructTypeProxy>();
            var mockFieldProxy = new Mock<IStructFieldProxy>();
            mockDataFrameProxy.Setup(m => m.GetSchema()).Returns(mockSchemaProxy.Object);
            mockSchemaProxy.Setup(m => m.GetStructTypeFields()).Returns(new List<IStructFieldProxy> { mockFieldProxy.Object });
            mockFieldProxy.Setup(m => m.GetStructFieldName()).Returns(columnName);

            var sparkContext = new SparkContext("", "");
            // Any Drop(minNonNulls, cols) call on the proxy yields the mocked frame proxy.
            mockDataFrameNaFunctionsProxy.Setup(m => m.Drop(It.IsAny<int>(), It.IsAny<string[]>())).Returns(mockDataFrameProxy.Object);

            var dataFrame = new DataFrame(mockDataFrameProxy.Object, sparkContext);
            var f = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, dataFrame, sparkContext);

            // act
            var cols = new[] { "col1", "col2" };
            var df1 = f.Drop("any", cols); // "any" with an explicit column list
            var df2 = f.Drop();            // defaults: "any" over the frame's own columns
            var df3 = f.Drop("any");       // "any" over the frame's own columns

            // verify
            Assert.IsNotNull(df1);
            Assert.AreEqual(df1.DataFrameProxy, dataFrame.DataFrameProxy);
            // "any" over an explicit list is expected to translate to minNonNulls == cols.Length
            mockDataFrameNaFunctionsProxy.Verify(m => m.Drop(cols.Length, cols), Times.Once);

            Assert.IsNotNull(df2);
            Assert.AreEqual(df2.DataFrameProxy, dataFrame.DataFrameProxy);
            
            Assert.IsNotNull(df3);
            Assert.AreEqual(df3.DataFrameProxy, dataFrame.DataFrameProxy);
            // Both no-column calls resolve to the single schema column; with one column,
            // "any" means minNonNulls == 1, hence the same proxy call twice.
            mockDataFrameNaFunctionsProxy.Verify(m => m.Drop(1, new[] { columnName }), Times.Exactly(2));
        }
        public void TestDropWithAll()
        {
            // arrange: any Drop(minNonNulls, cols) call on the proxy returns the mocked frame
            var sc = new SparkContext("", "");
            mockDataFrameNaFunctionsProxy
                .Setup(p => p.Drop(It.IsAny<int>(), It.IsAny<string[]>()))
                .Returns(mockDataFrameProxy.Object);

            var frame = new DataFrame(mockDataFrameProxy.Object, sc);
            var naFunctions = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, frame, sc);

            // act: "all" semantics — drop a row only when every listed column is null
            var columns = new[] { "col1", "col2" };
            var result = naFunctions.Drop("all", columns);

            // verify: "all" is expected to be forwarded as minNonNulls == 1
            Assert.IsNotNull(result);
            Assert.AreEqual(result.DataFrameProxy, frame.DataFrameProxy);
            mockDataFrameNaFunctionsProxy.Verify(p => p.Drop(1, columns), Times.Once);
        }
        /// <summary>
        /// Verifies DataFrameNaFunctions.Fill for its three overloads:
        /// double value, string value, and per-column dictionary.
        /// </summary>
        public void TestFill()
        {
            // arrange
            var sparkContext = new SparkContext("", "");

            // test fill with double value
            mockDataFrameNaFunctionsProxy.Setup(m => m.Fill(It.IsAny<double>(), It.IsAny<string[]>())).Returns(mockDataFrameProxy.Object);
            var dataFrame = new DataFrame(mockDataFrameProxy.Object, sparkContext);
            var f = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, dataFrame, sparkContext);

            var cols = new[] { "col1", "col2" };
            const double doubleValue = 0.001;
            var df = f.Fill(doubleValue, cols);

            // verify: value and column list are forwarded verbatim, and a new wrapper
            // (not the original DataFrame instance) is returned
            Assert.IsNotNull(df);
            Assert.AreEqual(df.DataFrameProxy, dataFrame.DataFrameProxy);
            Assert.AreNotSame(dataFrame, df);
            mockDataFrameNaFunctionsProxy.Verify(m => m.Fill(doubleValue, cols), Times.Once);

            // test fill with string value
            // Reset clears the previous setup and invocation counts so each overload
            // is verified in isolation.
            mockDataFrameNaFunctionsProxy.Reset();
            mockDataFrameNaFunctionsProxy.Setup(m => m.Fill(It.IsAny<string>(), It.IsAny<string[]>())).Returns(mockDataFrameProxy.Object);
            dataFrame = new DataFrame(mockDataFrameProxy.Object, sparkContext);
            f = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, dataFrame, sparkContext);
            const string strValue = "UNKNOWN";
            df = f.Fill(strValue, cols);

            Assert.IsNotNull(df);
            Assert.AreEqual(df.DataFrameProxy, dataFrame.DataFrameProxy);
            Assert.AreNotSame(dataFrame, df);
            mockDataFrameNaFunctionsProxy.Verify(m => m.Fill(strValue, cols), Times.Once);

            // test fill with dictionary (per-column fill values of mixed types)
            mockDataFrameNaFunctionsProxy.Reset();
            mockDataFrameNaFunctionsProxy.Setup(m => m.Fill(It.IsAny<Dictionary<string,object>>())).Returns(mockDataFrameProxy.Object);
            dataFrame = new DataFrame(mockDataFrameProxy.Object, sparkContext);
            f = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, dataFrame, sparkContext);
            var valueDict = new Dictionary<string, object>()
            {
                {"col1", -1},
                {"col2", "UNKNOWN"}
            };
            df = f.Fill(valueDict);

            // verify
            Assert.IsNotNull(df);
            Assert.AreEqual(df.DataFrameProxy, dataFrame.DataFrameProxy);
            Assert.AreNotSame(dataFrame, df);
            mockDataFrameNaFunctionsProxy.Verify(m => m.Fill(valueDict), Times.Once);
        }
        public void TestReplaceWithColumns()
        {
            // arrange: any Replace(cols, map) call on the proxy returns the mocked frame
            var sc = new SparkContext("", "");
            mockDataFrameNaFunctionsProxy
                .Setup(p => p.Replace(It.IsAny<string[]>(), It.IsAny<Dictionary<string, string>>()))
                .Returns(mockDataFrameProxy.Object);

            var frame = new DataFrame(mockDataFrameProxy.Object, sc);
            var naFunctions = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, frame, sc);

            // act: map empty string and "?" to the literal "unknown" in the chosen columns
            var replacement = new Dictionary<string, string>()
            {
                {"", "unknown"},
                {"?", "unknown"}
            };
            var columns = new[] { "col1", "col2" };
            var result = naFunctions.Replace(columns, replacement);

            // verify: columns and replacement map are forwarded verbatim to the proxy
            Assert.IsNotNull(result);
            Assert.AreEqual(result.DataFrameProxy, frame.DataFrameProxy);
            Assert.AreNotSame(frame, result);
            mockDataFrameNaFunctionsProxy.Verify(p => p.Replace(columns, replacement), Times.Once);
        }
// Esempio n. 5
// 0
 /// <summary>
 /// Converts this strongly typed collection of data to generic Dataframe. In contrast to the
 /// strongly typed objects that Dataset operations work on, a Dataframe returns generic[[Row]]
 /// objects that allow fields to be accessed by ordinal or name.
 /// </summary>
 /// <returns>DataFrame created from Dataset</returns>
 public DataFrame ToDF()
 {
     // Lazily build the DataFrame wrapper on first call and cache it for reuse.
     if (dataFrame == null)
     {
         dataFrame = new DataFrame(datasetProxy.ToDF(), SparkContext.GetActiveSparkContext());
     }
     return dataFrame;
 }
        public void TestDropWithMinNonNulls()
        {
            // arrange: a one-column schema so Drop(minNonNulls) targets the frame's own column
            const string columnName = "column1";
            var schemaMock = new Mock<IStructTypeProxy>();
            var fieldMock = new Mock<IStructFieldProxy>();
            mockDataFrameProxy.Setup(p => p.GetSchema()).Returns(schemaMock.Object);
            schemaMock
                .Setup(p => p.GetStructTypeFields())
                .Returns(new List<IStructFieldProxy> { fieldMock.Object });
            fieldMock.Setup(p => p.GetStructFieldName()).Returns(columnName);

            var sc = new SparkContext("", "");
            mockDataFrameNaFunctionsProxy
                .Setup(p => p.Drop(It.IsAny<int>(), It.IsAny<string[]>()))
                .Returns(mockDataFrameProxy.Object);

            var frame = new DataFrame(mockDataFrameProxy.Object, sc);
            var naFunctions = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, frame, sc);

            // act
            var result = naFunctions.Drop(20);

            // verify: the threshold and the schema-derived column list reach the proxy
            Assert.IsNotNull(result);
            Assert.AreEqual(result.DataFrameProxy, frame.DataFrameProxy);
            Assert.AreNotSame(frame, result);
            mockDataFrameNaFunctionsProxy.Verify(p => p.Drop(20, new[] { columnName }), Times.Once);
        }
// Esempio n. 7
// 0
        public void TestSqlContextRegisterDataFrameAsTable()
        {
            // arrange
            mockSqlContextProxy.Setup(p => p.RegisterDataFrameAsTable(It.IsAny<IDataFrameProxy>(), It.IsAny<string>()));
            var sqlContext = new SqlContext(new SparkContext("", ""), mockSqlContextProxy.Object);
            var frameProxy = new DataFrameIpcProxy(new JvmObjectReference("1"), mockSqlContextProxy.Object);
            var frame = new DataFrame(frameProxy, new SparkContext(new SparkConf()));

            // act
            sqlContext.RegisterDataFrameAsTable(frame, "table");

            // assert: the frame's proxy and the table name are forwarded verbatim
            mockSqlContextProxy.Verify(p => p.RegisterDataFrameAsTable(frameProxy, "table"));
        }
// Esempio n. 8
// 0
        /// <summary>
        /// Collects rows from a mocked DataFrame over a local socket proxy and verifies
        /// that fields — including a nested struct column — can be read back by name.
        /// </summary>
        public void TestDataFrameCollect()
        {
            // Schema: address (struct of city/state strings), age (long), id, name (strings).
            // NOTE: this literal is parsed at runtime — do not reformat it.
            string jsonSchema = @"
                {
                  ""type"" : ""struct"",
                  ""fields"" : [ {
                    ""name"" : ""address"",
                    ""type"" : {
                      ""type"" : ""struct"",
                      ""fields"" : [ {
                        ""name"" : ""city"",
                        ""type"" : ""string"",
                        ""nullable"" : true,
                        ""metadata"" : { }
                      }, {
                        ""name"" : ""state"",
                        ""type"" : ""string"",
                        ""nullable"" : true,
                        ""metadata"" : { }
                      } ]
                    },
                    ""nullable"" : true,
                    ""metadata"" : { }
                  }, {
                    ""name"" : ""age"",
                    ""type"" : ""long"",
                    ""nullable"" : true,
                    ""metadata"" : { }
                  }, {
                    ""name"" : ""id"",
                    ""type"" : ""string"",
                    ""nullable"" : true,
                    ""metadata"" : { }
                  }, {
                    ""name"" : ""name"",
                    ""type"" : ""string"",
                    ""nullable"" : true,
                    ""metadata"" : { }
                  } ]
                }";

            int localPort = 4000;
            // Row values mirror the schema field order: address struct, age, id, name.
            object row1 = new object[] {
                new object[] {"Columbus", "Ohio"},
                34,
                "123",
                "Bill"
            };

            object row2 = new object[] {
                new object[] {"Seattle", "Washington"},
                43,
                "789",
                "Bill"
            };

            IStructTypeProxy structTypeProxy = new MockStructTypeProxy(jsonSchema);
            IDataFrameProxy dataFrameProxy =
                new MockDataFrameProxy(localPort,
                                       new List<object>() { row1, row2 },
                                       structTypeProxy);
            DataFrame dataFrame = new DataFrame(dataFrameProxy, null);

            // Collect() streams rows from the mock proxy; materialize them for assertions.
            List<Row> rows = new List<Row>();
            foreach (var row in dataFrame.Collect())
            {
                rows.Add(row);
                Console.WriteLine("{0}", row);
            }

            Assert.AreEqual(rows.Count, 2);
            Row firstRow = rows[0];

            // Top-level fields are accessible by name via GetAs<T>.
            string id = firstRow.GetAs<string>("id");
            Assert.IsTrue(id.Equals("123"));
            string name = firstRow.GetAs<string>("name");
            Assert.IsTrue(name.Equals("Bill"));
            int age = firstRow.GetAs<int>("age");
            Assert.AreEqual(age, 34);

            // A nested struct column comes back as a Row whose fields are also name-addressable.
            Row address = firstRow.GetAs<Row>("address");
            Assert.AreNotEqual(address, null);
            string city = address.GetAs<string>("city");
            Assert.IsTrue(city.Equals("Columbus"));
            string state = address.GetAs<string>("state");
            Assert.IsTrue(state.Equals("Ohio"));
        }
// Esempio n. 9
// 0
 /// <summary>
 /// Union with another DataFrame WITHOUT removing duplicated rows.
 /// This is equivalent to `UNION ALL` in SQL.
 /// </summary>
 /// <param name="otherDataFrame">DataFrame to union all with.</param>
 /// <returns>Unioned DataFrame.</returns>
 public DataFrame UnionAll(DataFrame otherDataFrame)
 {
     // Delegate to the JVM-side proxy, then wrap the result in a new managed DataFrame.
     var unionedProxy = dataFrameProxy.UnionAll(otherDataFrame.dataFrameProxy);
     return new DataFrame(unionedProxy, sparkContext);
 }
// Esempio n. 10
// 0
 /// <summary>
 /// Wraps a JVM-side grouped-data proxy together with the DataFrame it was created from.
 /// </summary>
 /// <param name="groupedDataProxy">Proxy for the JVM GroupedData object.</param>
 /// <param name="dataFrame">The DataFrame this grouping originated from.</param>
 internal GroupedData(IGroupedDataProxy groupedDataProxy, DataFrame dataFrame)
 {
     this.groupedDataProxy = groupedDataProxy;
     this.dataFrame = dataFrame;
 }
// Esempio n. 11
// 0
 /// <summary>
 /// Returns a new DataFrame containing rows in this frame but not in another frame.
 /// This is equivalent to `EXCEPT` in SQL.
 /// </summary>
 /// <param name="otherDataFrame">DataFrame to subtract from this frame.</param>
 /// <returns>A new DataFrame containing rows in this frame but not in another frame.</returns>
 public DataFrame Subtract(DataFrame otherDataFrame)
 {
     // Delegate to the JVM-side proxy, then wrap the result in a new managed DataFrame.
     var subtractedProxy = dataFrameProxy.Subtract(otherDataFrame.dataFrameProxy);
     return new DataFrame(subtractedProxy, sparkContext);
 }
// Esempio n. 12
// 0
        /// <summary>
        /// Join with another DataFrame, using the specified JoinType
        /// </summary>
        /// <param name="otherDataFrame">DataFrame to join with</param>
        /// <param name="joinExpression">Column to join with.</param>
        /// <param name="joinType">Type of join to perform (default null value means <c>JoinType.Inner</c>)</param>
        /// <returns>Joined DataFrame</returns>
        public DataFrame Join(DataFrame otherDataFrame, Column joinExpression, JoinType joinType = null)
        {
            // A null joinType signals the caller wants the default inner join.
            var effectiveJoinType = joinType ?? JoinType.Inner;

            var joinedProxy = dataFrameProxy.Join(
                otherDataFrame.dataFrameProxy, joinExpression.ColumnProxy, effectiveJoinType.Value);
            return new DataFrame(joinedProxy, sparkContext);
        }
// Esempio n. 13
// 0
 // TODO: need aliasing for self join
 /// <summary>
 /// Join with another DataFrame - Inner equi-join using given column name 
 /// </summary>
 /// <param name="otherDataFrame">DataFrame to join with</param>
 /// <param name="joinColumnNames">Columns to join with.</param>
 /// <returns>Joined DataFrame</returns>
 public DataFrame Join(DataFrame otherDataFrame, string[] joinColumnNames)
 {
     // Equi-join on each named column via the proxy, then wrap the result.
     var joinedProxy = dataFrameProxy.Join(otherDataFrame.dataFrameProxy, joinColumnNames);
     return new DataFrame(joinedProxy, sparkContext);
 }
// Esempio n. 14
// 0
 /// <summary>
 /// Join with another DataFrame - Cartesian join
 /// </summary>
 /// <param name="otherDataFrame">DataFrame to join with</param>
 /// <returns>Joined DataFrame</returns>
 /// <exception cref="NotImplementedException">Always thrown; cartesian join is not implemented yet.</exception>
 public DataFrame Join(DataFrame otherDataFrame)
 {
     throw new NotImplementedException();
 }
// Esempio n. 15
// 0
 /// <summary>
 /// Intersect with another DataFrame.
 /// This is equivalent to `INTERSECT` in SQL.
 /// </summary>
 /// <param name="otherDataFrame">DataFrame to intersect with.</param>
 /// <returns>Intersected DataFrame.</returns>
 public DataFrame Intersect(DataFrame otherDataFrame)
 {
     // Delegate to the JVM-side proxy, then wrap the result in a new managed DataFrame.
     var intersectedProxy = dataFrameProxy.Intersect(otherDataFrame.dataFrameProxy);
     return new DataFrame(intersectedProxy, sparkContext);
 }