public void TestDropWithAny()
{
    // arrange: expose a single-column schema so Drop() without columns falls back to it
    const string schemaColumnName = "column1";
    var schemaProxyMock = new Mock<IStructTypeProxy>();
    var fieldProxyMock = new Mock<IStructFieldProxy>();
    mockDataFrameProxy.Setup(m => m.GetSchema()).Returns(schemaProxyMock.Object);
    schemaProxyMock.Setup(m => m.GetStructTypeFields())
        .Returns(new List<IStructFieldProxy> { fieldProxyMock.Object });
    fieldProxyMock.Setup(m => m.GetStructFieldName()).Returns(schemaColumnName);
    var sc = new SparkContext("", "");
    mockDataFrameNaFunctionsProxy
        .Setup(m => m.Drop(It.IsAny<int>(), It.IsAny<string[]>()))
        .Returns(mockDataFrameProxy.Object);
    var frame = new DataFrame(mockDataFrameProxy.Object, sc);
    var naFunctions = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, frame, sc);

    // act
    var columns = new[] { "col1", "col2" };
    var droppedWithColumns = naFunctions.Drop("any", columns);
    var droppedDefault = naFunctions.Drop();
    var droppedAnyOnly = naFunctions.Drop("any");

    // verify: Drop("any", cols) forwards minNonNulls == cols.Length
    Assert.IsNotNull(droppedWithColumns);
    Assert.AreEqual(droppedWithColumns.DataFrameProxy, frame.DataFrameProxy);
    mockDataFrameNaFunctionsProxy.Verify(m => m.Drop(columns.Length, columns), Times.Once);

    Assert.IsNotNull(droppedDefault);
    Assert.AreEqual(droppedDefault.DataFrameProxy, frame.DataFrameProxy);
    Assert.IsNotNull(droppedAnyOnly);
    Assert.AreEqual(droppedAnyOnly.DataFrameProxy, frame.DataFrameProxy);
    // both Drop() and Drop("any") resolve columns from the schema (one column here)
    mockDataFrameNaFunctionsProxy.Verify(m => m.Drop(1, new[] { schemaColumnName }), Times.Exactly(2));
}
public void TestDropWithAll()
{
    // arrange
    var sc = new SparkContext("", "");
    mockDataFrameNaFunctionsProxy
        .Setup(m => m.Drop(It.IsAny<int>(), It.IsAny<string[]>()))
        .Returns(mockDataFrameProxy.Object);
    var frame = new DataFrame(mockDataFrameProxy.Object, sc);
    var naFunctions = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, frame, sc);

    // act
    var columns = new[] { "col1", "col2" };
    var dropped = naFunctions.Drop("all", columns);

    // verify: "all" is forwarded to the proxy as minNonNulls == 1
    Assert.IsNotNull(dropped);
    Assert.AreEqual(dropped.DataFrameProxy, frame.DataFrameProxy);
    mockDataFrameNaFunctionsProxy.Verify(m => m.Drop(1, columns), Times.Once);
}
public void TestFill()
{
    // arrange
    var sc = new SparkContext("", "");
    var columns = new[] { "col1", "col2" };

    // case 1: fill with a double value
    mockDataFrameNaFunctionsProxy
        .Setup(m => m.Fill(It.IsAny<double>(), It.IsAny<string[]>()))
        .Returns(mockDataFrameProxy.Object);
    var frame = new DataFrame(mockDataFrameProxy.Object, sc);
    var naFunctions = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, frame, sc);
    const double doubleValue = 0.001;
    var filled = naFunctions.Fill(doubleValue, columns);

    // verify: a new DataFrame wrapping the proxied result is returned
    Assert.IsNotNull(filled);
    Assert.AreEqual(filled.DataFrameProxy, frame.DataFrameProxy);
    Assert.AreNotSame(frame, filled);
    mockDataFrameNaFunctionsProxy.Verify(m => m.Fill(doubleValue, columns), Times.Once);

    // case 2: fill with a string value
    mockDataFrameNaFunctionsProxy.Reset();
    mockDataFrameNaFunctionsProxy
        .Setup(m => m.Fill(It.IsAny<string>(), It.IsAny<string[]>()))
        .Returns(mockDataFrameProxy.Object);
    frame = new DataFrame(mockDataFrameProxy.Object, sc);
    naFunctions = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, frame, sc);
    const string strValue = "UNKNOWN";
    filled = naFunctions.Fill(strValue, columns);

    Assert.IsNotNull(filled);
    Assert.AreEqual(filled.DataFrameProxy, frame.DataFrameProxy);
    Assert.AreNotSame(frame, filled);
    mockDataFrameNaFunctionsProxy.Verify(m => m.Fill(strValue, columns), Times.Once);

    // case 3: fill with a per-column value dictionary
    mockDataFrameNaFunctionsProxy.Reset();
    mockDataFrameNaFunctionsProxy
        .Setup(m => m.Fill(It.IsAny<Dictionary<string, object>>()))
        .Returns(mockDataFrameProxy.Object);
    frame = new DataFrame(mockDataFrameProxy.Object, sc);
    naFunctions = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, frame, sc);
    var valueDict = new Dictionary<string, object>() { { "col1", -1 }, { "col2", "UNKNOWN" } };
    filled = naFunctions.Fill(valueDict);

    Assert.IsNotNull(filled);
    Assert.AreEqual(filled.DataFrameProxy, frame.DataFrameProxy);
    Assert.AreNotSame(frame, filled);
    mockDataFrameNaFunctionsProxy.Verify(m => m.Fill(valueDict), Times.Once);
}
public void TestReplaceWithColumns()
{
    // arrange
    var sc = new SparkContext("", "");
    mockDataFrameNaFunctionsProxy
        .Setup(m => m.Replace(It.IsAny<string[]>(), It.IsAny<Dictionary<string, string>>()))
        .Returns(mockDataFrameProxy.Object);

    // act: map empty string and "?" to the placeholder "unknown" in the listed columns
    var replacement = new Dictionary<string, string>() { { "", "unknown" }, { "?", "unknown" } };
    var frame = new DataFrame(mockDataFrameProxy.Object, sc);
    var naFunctions = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, frame, sc);
    var columns = new[] { "col1", "col2" };
    var replaced = naFunctions.Replace(columns, replacement);

    // verify: arguments are forwarded to the proxy unchanged and a new DataFrame is returned
    Assert.IsNotNull(replaced);
    Assert.AreEqual(replaced.DataFrameProxy, frame.DataFrameProxy);
    Assert.AreNotSame(frame, replaced);
    mockDataFrameNaFunctionsProxy.Verify(m => m.Replace(columns, replacement), Times.Once);
}
/// <summary>
/// Converts this strongly typed collection of data to a generic DataFrame. In contrast to the
/// strongly typed objects that Dataset operations work on, a DataFrame returns generic [[Row]]
/// objects that allow fields to be accessed by ordinal or name.
/// </summary>
/// <returns>DataFrame created from Dataset</returns>
public DataFrame ToDF()
{
    // lazily create the DataFrame on first call and reuse it afterwards
    if (dataFrame == null)
    {
        dataFrame = new DataFrame(datasetProxy.ToDF(), SparkContext.GetActiveSparkContext());
    }
    return dataFrame;
}
public void TestDropWithMinNonNulls()
{
    // arrange: expose a single-column schema so Drop(minNonNulls) resolves the column list from it
    const string schemaColumnName = "column1";
    var schemaProxyMock = new Mock<IStructTypeProxy>();
    var fieldProxyMock = new Mock<IStructFieldProxy>();
    mockDataFrameProxy.Setup(m => m.GetSchema()).Returns(schemaProxyMock.Object);
    schemaProxyMock.Setup(m => m.GetStructTypeFields())
        .Returns(new List<IStructFieldProxy> { fieldProxyMock.Object });
    fieldProxyMock.Setup(m => m.GetStructFieldName()).Returns(schemaColumnName);
    var sc = new SparkContext("", "");
    mockDataFrameNaFunctionsProxy
        .Setup(m => m.Drop(It.IsAny<int>(), It.IsAny<string[]>()))
        .Returns(mockDataFrameProxy.Object);
    var frame = new DataFrame(mockDataFrameProxy.Object, sc);
    var naFunctions = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, frame, sc);

    // act
    var dropped = naFunctions.Drop(20);

    // verify: minNonNulls is passed through together with the schema columns
    Assert.IsNotNull(dropped);
    Assert.AreEqual(dropped.DataFrameProxy, frame.DataFrameProxy);
    Assert.AreNotSame(frame, dropped);
    mockDataFrameNaFunctionsProxy.Verify(m => m.Drop(20, new[] { schemaColumnName }), Times.Once);
}
public void TestSqlContextRegisterDataFrameAsTable()
{
    // arrange
    mockSqlContextProxy.Setup(m => m.RegisterDataFrameAsTable(It.IsAny<IDataFrameProxy>(), It.IsAny<string>()));
    var sqlContext = new SqlContext(new SparkContext("", ""), mockSqlContextProxy.Object);
    var frameProxy = new DataFrameIpcProxy(new JvmObjectReference("1"), mockSqlContextProxy.Object);
    var frame = new DataFrame(frameProxy, new SparkContext(new SparkConf()));

    // act
    sqlContext.RegisterDataFrameAsTable(frame, "table");

    // assert: the call is delegated to the proxy with the same frame proxy and table name
    mockSqlContextProxy.Verify(m => m.RegisterDataFrameAsTable(frameProxy, "table"));
}
// Verifies DataFrame.Collect() materializes rows from a mocked proxy (serving two rows on a local
// port) and that Row.GetAs<T> resolves top-level fields (id, name, age) and nested struct fields
// (address.city, address.state) by name, per the JSON schema below.
// NOTE(review): the mock serves on port 4000 — assumes that port is free on the test host.
public void TestDataFrameCollect() { string jsonSchema = @" { ""type"" : ""struct"", ""fields"" : [ { ""name"" : ""address"", ""type"" : { ""type"" : ""struct"", ""fields"" : [ { ""name"" : ""city"", ""type"" : ""string"", ""nullable"" : true, ""metadata"" : { } }, { ""name"" : ""state"", ""type"" : ""string"", ""nullable"" : true, ""metadata"" : { } } ] }, ""nullable"" : true, ""metadata"" : { } }, { ""name"" : ""age"", ""type"" : ""long"", ""nullable"" : true, ""metadata"" : { } }, { ""name"" : ""id"", ""type"" : ""string"", ""nullable"" : true, ""metadata"" : { } }, { ""name"" : ""name"", ""type"" : ""string"", ""nullable"" : true, ""metadata"" : { } } ] }"; int localPort = 4000; object row1 = new object[] { new object[] {"Columbus", "Ohio"}, 34, "123", "Bill" }; object row2 = new object[] { new object[] {"Seattle", "Washington"}, 43, "789", "Bill" }; IStructTypeProxy structTypeProxy = new MockStructTypeProxy(jsonSchema); IDataFrameProxy dataFrameProxy = new MockDataFrameProxy(localPort, new List<object>() { row1, row2 }, structTypeProxy); DataFrame dataFrame = new DataFrame(dataFrameProxy, null); List<Row> rows = new List<Row>(); foreach (var row in dataFrame.Collect()) { rows.Add(row); Console.WriteLine("{0}", row); } Assert.AreEqual(rows.Count, 2); Row firstRow = rows[0]; string id = firstRow.GetAs<string>("id"); Assert.IsTrue(id.Equals("123")); string name = firstRow.GetAs<string>("name"); Assert.IsTrue(name.Equals("Bill")); int age = firstRow.GetAs<int>("age"); Assert.AreEqual(age, 34); Row address = firstRow.GetAs<Row>("address"); Assert.AreNotEqual(address, null); string city = address.GetAs<string>("city"); Assert.IsTrue(city.Equals("Columbus")); string state = address.GetAs<string>("state"); Assert.IsTrue(state.Equals("Ohio")); }
/// <summary>
/// Unions this DataFrame with another one, keeping duplicated rows.
/// Equivalent to `UNION ALL` in SQL.
/// </summary>
/// <param name="otherDataFrame">DataFrame to union all with.</param>
/// <returns>Unioned DataFrame.</returns>
public DataFrame UnionAll(DataFrame otherDataFrame)
{
    var unionedProxy = dataFrameProxy.UnionAll(otherDataFrame.dataFrameProxy);
    return new DataFrame(unionedProxy, sparkContext);
}
/// <summary>
/// Creates a GroupedData wrapping the given grouped-data proxy and the DataFrame it came from.
/// </summary>
/// <param name="groupedDataProxy">Proxy for the underlying grouped data.</param>
/// <param name="dataFrame">DataFrame this grouping was created from.</param>
internal GroupedData(IGroupedDataProxy groupedDataProxy, DataFrame dataFrame)
{
    this.dataFrame = dataFrame;
    this.groupedDataProxy = groupedDataProxy;
}
/// <summary>
/// Returns a new DataFrame with the rows of this frame that do not appear in another frame.
/// Equivalent to `EXCEPT` in SQL.
/// </summary>
/// <param name="otherDataFrame">DataFrame to subtract from this frame.</param>
/// <returns>A new DataFrame containing rows in this frame but not in another frame.</returns>
public DataFrame Subtract(DataFrame otherDataFrame)
{
    var subtractedProxy = dataFrameProxy.Subtract(otherDataFrame.dataFrameProxy);
    return new DataFrame(subtractedProxy, sparkContext);
}
/// <summary>
/// Joins with another DataFrame on the given join expression, using the specified JoinType.
/// </summary>
/// <param name="otherDataFrame">DataFrame to join with</param>
/// <param name="joinExpression">Column to join with.</param>
/// <param name="joinType">Type of join to perform (default null value means <c>JoinType.Inner</c>)</param>
/// <returns>Joined DataFrame</returns>
public DataFrame Join(DataFrame otherDataFrame, Column joinExpression, JoinType joinType = null)
{
    // null is the sentinel for the default inner join
    var effectiveJoinType = joinType ?? JoinType.Inner;
    var joinedProxy = dataFrameProxy.Join(
        otherDataFrame.dataFrameProxy,
        joinExpression.ColumnProxy,
        effectiveJoinType.Value);
    return new DataFrame(joinedProxy, sparkContext);
}
// TODO: need aliasing for self join
/// <summary>
/// Joins with another DataFrame - inner equi-join on the given column names.
/// </summary>
/// <param name="otherDataFrame">DataFrame to join with</param>
/// <param name="joinColumnNames">Columns to join with.</param>
/// <returns>Joined DataFrame</returns>
public DataFrame Join(DataFrame otherDataFrame, string[] joinColumnNames)
{
    var joinedProxy = dataFrameProxy.Join(otherDataFrame.dataFrameProxy, joinColumnNames);
    return new DataFrame(joinedProxy, sparkContext);
}
/// <summary>
/// Join with another DataFrame - Cartesian join
/// </summary>
/// <param name="otherDataFrame">DataFrame to join with</param>
/// <returns>Joined DataFrame</returns>
/// <exception cref="NotImplementedException">Always thrown; Cartesian join is not implemented yet.</exception>
public DataFrame Join(DataFrame otherDataFrame) { throw new NotImplementedException(); }
/// <summary>
/// Returns a new DataFrame with only the rows common to this frame and another frame.
/// Equivalent to `INTERSECT` in SQL.
/// </summary>
/// <param name="otherDataFrame">DataFrame to intersect with.</param>
/// <returns>Intersected DataFrame.</returns>
public DataFrame Intersect(DataFrame otherDataFrame)
{
    var intersectedProxy = dataFrameProxy.Intersect(otherDataFrame.dataFrameProxy);
    return new DataFrame(intersectedProxy, sparkContext);
}