// Exercises column bookkeeping on a DataFrame: construction from two columns,
// appending a third with AddColumn, and removing the first by name with RemoveColumn.
// After each step the column count, the name/storage type at a given position,
// and lookup by name are checked.
public void DataFrameColumnManipulations()
{
    DataList integers = new DataList<int>("Integer");
    DataList doubles = new DataList<double>("Double");
    DataList timestamps = new DataList<DateTime>("Timestamp");

    // Frame built from two columns: the first supplied column sits at position 0.
    DataFrame frame = new DataFrame(integers, doubles);
    Assert.IsTrue(frame.Columns.Count == 2);
    Assert.IsTrue(frame.Columns[0].Name == integers.Name);
    Assert.IsTrue(frame.Columns[0].StorageType == integers.StorageType);
    Assert.IsTrue(frame.Column<object>(integers.Name).Name == integers.Name);

    // The added column is expected at the last position (index 2).
    frame.AddColumn(timestamps);
    Assert.IsTrue(frame.Columns.Count == 3);
    Assert.IsTrue(frame.Columns[2].Name == timestamps.Name);
    Assert.IsTrue(frame.Columns[2].StorageType == timestamps.StorageType);
    Assert.IsTrue(frame.Column<object>(timestamps.Name).Name == timestamps.Name);

    // After removing the first column, the former second column is expected at position 0.
    frame.RemoveColumn(integers.Name);
    Assert.IsTrue(frame.Columns.Count == 2);
    Assert.IsTrue(frame.Columns[0].Name == doubles.Name);
    Assert.IsTrue(frame.Columns[0].StorageType == doubles.StorageType);
    Assert.IsTrue(frame.Column<object>(doubles.Name).Name == doubles.Name);
}
// Checks the element type reported by column 0 after adding scalars of different
// types to a two-column frame, and that unsupported operand combinations throw
// NotSupportedException.
public void TestBinaryOperationsWithConversions()
{
    DataFrame df = DataFrameTests.MakeDataFrameWithTwoColumns(10);

    // Adding the float literal 5.0f: column 0 of the result must report double.
    DataFrame floatSum = df.Add(5.0f);
    Assert.True(floatSum.Column(0).DataType == typeof(double));

    // Adding a decimal literal: column 0 of the result must report decimal.
    DataFrame decimalSum = df.Add(5.0m);
    Assert.True(decimalSum.Column(0).DataType == typeof(decimal));

    // Adding a bool must be rejected.
    Assert.Throws<NotSupportedException>(() => df.Add(true));

    // Replace column 0 with a double column; And(true) on the frame must then be rejected.
    var doubleColumn = new PrimitiveColumn<double>("Double1", Enumerable.Range(0, 10).Select(i => (double)i));
    df.SetColumn(0, doubleColumn);
    Assert.Throws<NotSupportedException>(() => df.And(true));
}
// Builds a DataFrame with twelve primitive columns (Byte, Char, Decimal, Double, Float,
// Int, Long, Sbyte, Short, Uint, Ulong, Ushort). Each column holds the sequence
// 0 .. length-1 cast to its element type; the Char column holds (char)(x + 65).
// When withNulls is true, the element at index length / 2 of every column is set to null.
public static DataFrame MakeDataFrameWithNumericColumns(int length, bool withNulls = true)
{
    var columns = new List<BaseColumn>
    {
        new PrimitiveColumn<byte>("Byte", Enumerable.Range(0, length).Select(i => (byte)i)),
        new PrimitiveColumn<char>("Char", Enumerable.Range(0, length).Select(i => (char)(i + 65))),
        new PrimitiveColumn<decimal>("Decimal", Enumerable.Range(0, length).Select(i => (decimal)i)),
        new PrimitiveColumn<double>("Double", Enumerable.Range(0, length).Select(i => (double)i)),
        new PrimitiveColumn<float>("Float", Enumerable.Range(0, length).Select(i => (float)i)),
        new PrimitiveColumn<int>("Int", Enumerable.Range(0, length).Select(i => i)),
        new PrimitiveColumn<long>("Long", Enumerable.Range(0, length).Select(i => (long)i)),
        new PrimitiveColumn<sbyte>("Sbyte", Enumerable.Range(0, length).Select(i => (sbyte)i)),
        new PrimitiveColumn<short>("Short", Enumerable.Range(0, length).Select(i => (short)i)),
        new PrimitiveColumn<uint>("Uint", Enumerable.Range(0, length).Select(i => (uint)i)),
        new PrimitiveColumn<ulong>("Ulong", Enumerable.Range(0, length).Select(i => (ulong)i)),
        new PrimitiveColumn<ushort>("Ushort", Enumerable.Range(0, length).Select(i => (ushort)i)),
    };

    DataFrame frame = new DataFrame(columns);
    if (!withNulls)
    {
        return frame;
    }

    // Blank out the middle row of every column.
    int nullRow = length / 2;
    for (int c = 0; c < frame.ColumnCount; c++)
    {
        frame.Column(c)[nullRow] = null;
    }

    return frame;
}
// Rough timing comparison: computes Mean() repeatedly over (1) a plain double[],
// (2) a DataList<double>, and (3) the same list accessed as a DataFrame column,
// writing the elapsed milliseconds of each run to the console.
public void Timings()
{
    int sampleSize = 10000;
    int repetitions = 10000;
    Random rng = new Random(1);

    // Accumulates the computed means; the total is never read back.
    double total = 0.0;

    // 1) Plain array.
    double[] values = new double[sampleSize];
    for (int k = 0; k < values.Length; k++)
    {
        values[k] = rng.NextDouble();
    }

    Stopwatch arrayWatch = Stopwatch.StartNew();
    for (int rep = 0; rep < repetitions; rep++)
    {
        total += values.Mean();
    }
    arrayWatch.Stop();
    Console.WriteLine(arrayWatch.ElapsedMilliseconds);

    // 2) DataList filled with fresh random values.
    DataList<double> list = new DataList<double>("list");
    for (int k = 0; k < sampleSize; k++)
    {
        list.Add(rng.NextDouble());
    }

    Stopwatch listWatch = Stopwatch.StartNew();
    for (int rep = 0; rep < repetitions; rep++)
    {
        total += list.Mean();
    }
    listWatch.Stop();
    Console.WriteLine(listWatch.ElapsedMilliseconds);

    // 3) Column view over the same list; the column lookup is part of the timed region.
    DataFrame frame = new DataFrame(list);
    Stopwatch frameWatch = Stopwatch.StartNew();
    DataColumn<double> column = frame.Column<double>("list");
    for (int rep = 0; rep < repetitions; rep++)
    {
        total += column.Mean();
    }
    frameWatch.Stop();
    Console.WriteLine(frameWatch.ElapsedMilliseconds);
}
// Builds a two-column DataFrame ("Column1" of T1, "Column2" of T2) whose rows alternate
// between 0 (even row index) and 1 (odd row index), converted to the column's element type
// via Convert.ChangeType. When withNulls is true, the element at index length / 2 of both
// columns is set to null.
public static DataFrame MakeDataFrame<T1, T2>(int length, bool withNulls = true)
    where T1 : unmanaged
    where T2 : unmanaged
{
    BaseColumn first = new PrimitiveColumn<T1>(
        "Column1",
        Enumerable.Range(0, length).Select(row => (T1)Convert.ChangeType(row % 2 == 0 ? 0 : 1, typeof(T1))));
    BaseColumn second = new PrimitiveColumn<T2>(
        "Column2",
        Enumerable.Range(0, length).Select(row => (T2)Convert.ChangeType(row % 2 == 0 ? 0 : 1, typeof(T2))));

    DataFrame frame = new DataFrame(new List<BaseColumn> { first, second });
    if (!withNulls)
    {
        return frame;
    }

    // Blank out the middle row of every column.
    int nullRow = length / 2;
    for (int c = 0; c < frame.ColumnCount; c++)
    {
        frame.Column(c)[nullRow] = null;
    }

    return frame;
}
// Covers InsertColumn / SetColumn / RemoveColumn on an initially empty DataFrame:
// successful inserts, the exceptions expected for a wrong-length column, a duplicate
// name and an out-of-range index, and reference identity of stored columns.
public void ColumnAndTableCreationTest()
{
    BaseColumn intColumn = new PrimitiveColumn<int>("IntColumn", Enumerable.Range(0, 10).Select(i => i));
    BaseColumn floatColumn = new PrimitiveColumn<float>("FloatColumn", Enumerable.Range(0, 10).Select(i => (float)i));

    // Two valid inserts into an empty frame.
    DataFrame frame = new DataFrame();
    frame.InsertColumn(0, intColumn);
    frame.InsertColumn(1, floatColumn);

    Assert.Equal(10, frame.RowCount);
    Assert.Equal(2, frame.ColumnCount);
    Assert.Equal(10, frame.Column(0).Length);
    Assert.Equal("IntColumn", frame.Column(0).Name);
    Assert.Equal(10, frame.Column(1).Length);
    Assert.Equal("FloatColumn", frame.Column(1).Name);

    // Invalid inserts: 11 rows instead of 10, a name already present, and index 10.
    BaseColumn tooLong = new PrimitiveColumn<float>("BigColumn", Enumerable.Range(0, 11).Select(i => (float)i));
    BaseColumn duplicateName = new PrimitiveColumn<float>("FloatColumn", Enumerable.Range(0, 10).Select(i => (float)i));
    Assert.Throws<ArgumentException>(() => frame.InsertColumn(2, tooLong));
    Assert.Throws<ArgumentException>(() => frame.InsertColumn(2, duplicateName));
    Assert.Throws<ArgumentOutOfRangeException>(() => frame.InsertColumn(10, duplicateName));
    Assert.Equal(2, frame.ColumnCount);

    // SetColumn at index 1 with the name already used by column 0 must be rejected.
    BaseColumn sameNameAsFirst = new PrimitiveColumn<int>("IntColumn", Enumerable.Range(0, 10).Select(i => i));
    Assert.Throws<ArgumentException>(() => frame.SetColumn(1, sameNameAsFirst));

    // A uniquely named replacement is stored by reference.
    BaseColumn replacement = new PrimitiveColumn<int>("IntColumn1", Enumerable.Range(0, 10).Select(i => i));
    frame.SetColumn(1, replacement);
    Assert.True(object.ReferenceEquals(replacement, frame.Column(1)));

    // Removing column 1 leaves the original int column at index 0.
    frame.RemoveColumn(1);
    Assert.Equal(1, frame.ColumnCount);
    Assert.True(ReferenceEquals(intColumn, frame.Column(0)));
}
// Exercises the column computation API (Abs, All/Any, cumulative operations,
// Max/Min/Product/Sum, Round) on a frame produced by MakeDataFrameWithAllColumnTypes(10),
// checking both results and the exceptions expected for unsupported column types.
public void TestComputations()
{
    DataFrame df = MakeDataFrameWithAllColumnTypes(10);

    // Abs rewrites the column: -10 becomes 10.
    df["Int"][0] = -10;
    Assert.Equal(-10, df["Int"][0]);
    df["Int"].Abs();
    Assert.Equal(10, df["Int"][0]);

    // All()/Any() must be rejected by every column listed here.
    string[] columnsRejectingAllAny =
    {
        "Byte", "Char", "Decimal", "Double", "Float", "Int",
        "Long", "Sbyte", "Short", "Uint", "Ulong", "Ushort",
    };
    foreach (string columnName in columnsRejectingAllAny)
    {
        Assert.Throws<NotSupportedException>(() => df[columnName].All());
        Assert.Throws<NotSupportedException>(() => df[columnName].Any());
    }

    // The Bool column supports both.
    bool anySet = df["Bool"].Any();
    bool allSet = df["Bool"].All();
    Assert.True(anySet);
    Assert.False(allSet);

    // Test the computation results
    df["Double"][0] = 100.0;
    df["Double"].CumulativeMax();
    Assert.Equal(100.0, df["Double"][9]);

    df["Float"][0] = -10.0f;
    df["Float"].CumulativeMin();
    Assert.Equal(-10.0f, df["Float"][9]);

    df["Uint"].CumulativeProduct();
    Assert.Equal((uint)0, df["Uint"][9]);

    df["Ushort"].CumulativeSum();
    Assert.Equal((ushort)45, df["Ushort"][9]);

    Assert.Equal(100.0, df["Double"].Max());
    Assert.Equal(-10.0f, df["Float"].Min());
    Assert.Equal((uint)0, df["Uint"].Product());
    Assert.Equal((ushort)165, df["Ushort"].Sum());

    // Round drops the fractional part written just before it.
    df["Double"][0] = 100.1;
    Assert.Equal(100.1, df["Double"][0]);
    df["Double"].Round();
    Assert.Equal(100.0, df["Double"][0]);

    // Test that none of the numeric column types throw
    for (int index = 0; index < df.ColumnCount; index++)
    {
        BaseColumn column = df.Column(index);

        // bool columns: every aggregate is expected to throw NotSupportedException.
        if (column.DataType == typeof(bool))
        {
            Assert.Throws<NotSupportedException>(() => column.CumulativeMax());
            Assert.Throws<NotSupportedException>(() => column.CumulativeMin());
            Assert.Throws<NotSupportedException>(() => column.CumulativeProduct());
            Assert.Throws<NotSupportedException>(() => column.CumulativeSum());
            Assert.Throws<NotSupportedException>(() => column.Max());
            Assert.Throws<NotSupportedException>(() => column.Min());
            Assert.Throws<NotSupportedException>(() => column.Product());
            Assert.Throws<NotSupportedException>(() => column.Sum());
            continue;
        }

        // string columns: every aggregate is expected to throw NotImplementedException.
        if (column.DataType == typeof(string))
        {
            Assert.Throws<NotImplementedException>(() => column.CumulativeMax());
            Assert.Throws<NotImplementedException>(() => column.CumulativeMin());
            Assert.Throws<NotImplementedException>(() => column.CumulativeProduct());
            Assert.Throws<NotImplementedException>(() => column.CumulativeSum());
            Assert.Throws<NotImplementedException>(() => column.Max());
            Assert.Throws<NotImplementedException>(() => column.Min());
            Assert.Throws<NotImplementedException>(() => column.Product());
            Assert.Throws<NotImplementedException>(() => column.Sum());
            continue;
        }

        // Everything else must simply run without throwing.
        column.CumulativeMax();
        column.CumulativeMin();
        column.CumulativeProduct();
        column.CumulativeSum();
        column.Max();
        column.Min();
        column.Product();
        column.Sum();
    }
}
// Groups a ten-row frame by its "Bool" column and checks the Count, First, Head, Tail,
// Max, Min, Product and Sum aggregations against the source frame.
// Column loops are bounded by the column count of the Count() result, as in the
// original test.
public void TestGroupBy()
{
    DataFrame df = MakeDataFrameWithNumericAndBoolColumns(10);

    // Count: two groups; the first is expected to hold 5 values per column, the second 4.
    DataFrame count = df.GroupBy("Bool").Count();
    Assert.Equal(2, count.RowCount);
    Assert.Equal(5L, count["Int"][0]);
    Assert.Equal(4L, count["Decimal"][1]);
    for (int row = 0; row < count.RowCount; row++)
    {
        for (int col = 1; col < count.ColumnCount; col++)
        {
            Assert.Equal(row == 0 ? 5L : 4L, count.Column(col)[row]);
        }
    }

    // First: rows 0 and 1 of the result must match rows 0 and 1 of the source.
    DataFrame first = df.GroupBy("Bool").First();
    Assert.Equal(2, first.RowCount);
    for (int row = 0; row < 2; row++)
    {
        for (int col = 0; col < count.ColumnCount; col++)
        {
            BaseColumn source = df.Column(col);
            BaseColumn grouped = first[source.Name];
            Assert.Equal(source[row], grouped[row]);
        }
    }

    // Head(3): source row r is expected at result row headRowFor[r].
    DataFrame head = df.GroupBy("Bool").Head(3);
    int[] headRowFor = { 0, 3, 1, 4, 2, 5 };
    for (int row = 0; row < 5; row++)
    {
        for (int col = 0; col < count.ColumnCount; col++)
        {
            BaseColumn source = df.Column(col);
            BaseColumn grouped = head[source.Name];
            Assert.Equal(source[row].ToString(), grouped[headRowFor[row]].ToString());
        }
    }

    // Source row 5 is compared by value rather than by string, and the "Bool" column is skipped.
    for (int col = 0; col < count.ColumnCount; col++)
    {
        BaseColumn source = df.Column(col);
        if (source.Name != "Bool")
        {
            BaseColumn grouped = head[source.Name];
            Assert.Equal(source[5], grouped[headRowFor[5]]);
        }
    }
    Assert.Equal(6, head.RowCount);

    // Tail(3): source row sourceRows[k] is expected at result row tailRows[k].
    DataFrame tail = df.GroupBy("Bool").Tail(3);
    Assert.Equal(6, tail.RowCount);
    int[] sourceRows = { 6, 8, 7, 9 };
    int[] tailRows = { 1, 2, 4, 5 };
    for (int k = 0; k < 4; k++)
    {
        for (int col = 0; col < count.ColumnCount; col++)
        {
            BaseColumn source = df.Column(col);
            BaseColumn grouped = tail[source.Name];
            Assert.Equal(source[sourceRows[k]].ToString(), grouped[tailRows[k]].ToString());
        }
    }

    // Max: 8 for the first group, 9 for the second ("Bool" and "Char" columns skipped).
    DataFrame max = df.GroupBy("Bool").Max();
    Assert.Equal(2, max.RowCount);
    for (int row = 0; row < 2; row++)
    {
        for (int col = 0; col < count.ColumnCount; col++)
        {
            BaseColumn source = df.Column(col);
            if (source.Name != "Bool" && source.Name != "Char")
            {
                BaseColumn grouped = max[source.Name];
                Assert.Equal((row == 0 ? 8L : 9L).ToString(), grouped[row].ToString());
            }
        }
    }

    // Min / Product / Sum: expected "0", "0" and "20" for both groups.
    DataFrame min = df.GroupBy("Bool").Min();
    Assert.Equal(2, min.RowCount);

    DataFrame product = df.GroupBy("Bool").Product();
    Assert.Equal(2, product.RowCount);

    DataFrame sum = df.GroupBy("Bool").Sum();
    Assert.Equal(2, sum.RowCount);

    for (int row = 0; row < 2; row++)
    {
        for (int col = 0; col < count.ColumnCount; col++)
        {
            BaseColumn source = df.Column(col);
            if (source.Name != "Bool" && source.Name != "Char")
            {
                BaseColumn minValues = min[source.Name];
                Assert.Equal("0", minValues[row].ToString());

                BaseColumn productValues = product[source.Name];
                Assert.Equal("0", productValues[row].ToString());

                BaseColumn sumValues = sum[source.Name];
                Assert.Equal("20", sumValues[row].ToString());
            }
        }
    }
}
// Asserts that every column of a join result equals the column expected from its inputs.
// Result columns with index below left.ColumnCount are compared against the left frame,
// the remaining ones against the right frame. How the source column is adapted before
// the comparison depends on the join algorithm:
//   Left  - left columns as-is; right columns padded with nulls or re-indexed.
//   Right - right columns as-is; left columns padded with nulls or re-indexed.
//   Inner - both sides re-indexed through the identity map 0 .. join.RowCount-1.
//   other - both sides padded with nulls up to join.RowCount.
private void VerifyJoin(DataFrame join, DataFrame left, DataFrame right, JoinAlgorithm joinAlgorithm)
{
    // Identity index map covering every row of the join result.
    PrimitiveColumn<long> identityMap = new PrimitiveColumn<long>("map", join.RowCount);
    for (long row = 0; row < join.RowCount; row++)
    {
        identityMap[row] = row;
    }

    // Pads a column no longer than the join with nulls; otherwise clones it through the identity map.
    BaseColumn PadOrReindex(BaseColumn source)
    {
        return source.Length <= join.RowCount
            ? source.Clone(numberOfNullsToAppend: join.RowCount - source.Length)
            : source.Clone(identityMap);
    }

    for (int i = 0; i < join.ColumnCount; i++)
    {
        BaseColumn joinColumn = join.Column(i);

        bool fromLeft = i < left.ColumnCount;
        BaseColumn source = fromLeft ? left.Column(i) : right.Column(i - left.ColumnCount);

        BaseColumn expected;
        if (joinAlgorithm == JoinAlgorithm.Left)
        {
            expected = fromLeft ? source : PadOrReindex(source);
        }
        else if (joinAlgorithm == JoinAlgorithm.Right)
        {
            expected = fromLeft ? PadOrReindex(source) : source;
        }
        else if (joinAlgorithm == JoinAlgorithm.Inner)
        {
            expected = source.Clone(identityMap);
        }
        else
        {
            expected = source.Clone(numberOfNullsToAppend: join.RowCount - source.Length);
        }

        // Element-wise comparison; every row of the result must be true.
        BaseColumn isEqual = joinColumn == expected;
        for (int row = 0; row < join.RowCount; row++)
        {
            Assert.Equal(true, isEqual[row]);
        }
    }
}
// Verifies that columns can be enumerated with foreach: every column of a ten-row frame
// yields ten items, and the String, ArrowString, Float and Int columns yield the expected
// values (nulls rendered as "<null>").
public void TestIEnumerable()
{
    DataFrame df = MakeDataFrameWithAllColumnTypes(10);

    // Count every item produced by enumerating each column.
    int itemsSeen = 0;
    for (int c = 0; c < df.ColumnCount; c++)
    {
        BaseColumn column = df.Column(c);
        foreach (object item in column)
        {
            itemsSeen++;
        }
    }
    Assert.Equal(10 * df.ColumnCount, itemsSeen);

    // spot check a few column types:
    StringBuilder rendered = new StringBuilder();

    StringColumn stringColumn = (StringColumn)df["String"];
    foreach (string text in stringColumn)
    {
        rendered.Append(text ?? "<null>");
    }
    Assert.Equal("01234<null>6789", rendered.ToString());

    ArrowStringColumn arrowStringColumn = (ArrowStringColumn)df["ArrowString"];
    rendered.Clear();
    foreach (string text in arrowStringColumn)
    {
        rendered.Append(text ?? "<null>");
    }
    Assert.Equal("foofoofoofoofoofoofoofoofoofoo", rendered.ToString());

    PrimitiveColumn<float> floatColumn = (PrimitiveColumn<float>)df["Float"];
    rendered.Clear();
    foreach (float? number in floatColumn)
    {
        if (number.HasValue)
        {
            rendered.Append(number);
        }
        else
        {
            rendered.Append("<null>");
        }
    }
    Assert.Equal("01234<null>6789", rendered.ToString());

    PrimitiveColumn<int> intColumn = (PrimitiveColumn<int>)df["Int"];
    rendered.Clear();
    foreach (int? number in intColumn)
    {
        if (number.HasValue)
        {
            rendered.Append(number);
        }
        else
        {
            rendered.Append("<null>");
        }
    }
    Assert.Equal("01234<null>6789", rendered.ToString());
}