// Signature smoke test: invokes each DataFrame overload listed below once against the shared
// _df fixture. Apart from the schema null-check and the column-count check, the results are
// discarded; the calls only need to compile and run.
// NOTE(review): the "V2_3_X" suffix presumably refers to the Spark 2.3.x API surface - confirm.
public void TestSignaturesV2_3_X()
{
    // Indexer and ToDF: the typed locals pin the declared return types.
    Column nameColumn = _df["name"];
    Column ageColumn = _df["age"];

    DataFrame copy = _df.ToDF();
    DataFrame renamed = copy.ToDF("name2", "age2");

    // Schema and plan inspection.
    StructType structType = _df.Schema();
    Assert.NotNull(structType);

    _df.PrintSchema();

    _df.Explain();
    _df.Explain(true);
    _df.Explain(false);

    var columnNames = _df.Columns().ToArray();
    Assert.Equal(2, columnNames.Length);

    _df.IsLocal();
    _df.IsStreaming();

    // Checkpointing.
    using (var checkpointDir = new TemporaryDirectory())
    {
        // The following is required for *CheckPoint().
        _spark.SparkContext.SetCheckpointDir(checkpointDir.Path);

        _df.Checkpoint();
        _df.Checkpoint(false);

        _df.LocalCheckpoint();
        _df.LocalCheckpoint(false);
    }

    _df.WithWatermark("time", "10 minutes");

    // Show.
    _df.Show();
    _df.Show(10);
    _df.Show(10, 10);
    _df.Show(10, 10, true);

    // Joins.
    _df.Join(_df);
    _df.Join(_df, "name");
    _df.Join(_df, new[] { "name" });
    _df.Join(_df, new[] { "name" }, "outer");
    _df.Join(_df, _df["age"] == _df["age"]);
    _df.Join(_df, _df["age"] == _df["age"], "outer");

    _df.CrossJoin(_df);

    // Sorting.
    _df.SortWithinPartitions("age");
    _df.SortWithinPartitions("age", "name");
    _df.SortWithinPartitions();
    _df.SortWithinPartitions(_df["age"]);
    _df.SortWithinPartitions(_df["age"], _df["name"]);

    _df.Sort("age");
    _df.Sort("age", "name");
    _df.Sort();
    _df.Sort(_df["age"]);
    _df.Sort(_df["age"], _df["name"]);

    _df.OrderBy("age");
    _df.OrderBy("age", "name");
    _df.OrderBy();
    _df.OrderBy(_df["age"]);
    _df.OrderBy(_df["age"], _df["name"]);

    // Hints, column lookup and aliasing.
    _df.Hint("broadcast");
    _df.Hint("broadcast", new[] { "hello", "world" });

    _df.Col("age");
    _df.ColRegex("age");

    _df.As("alias");
    _df.Alias("alias");

    // Projection.
    _df.Select("age");
    _df.Select("age", "name");
    _df.Select();
    _df.Select(_df["age"]);
    _df.Select(_df["age"], _df["name"]);

    _df.SelectExpr();
    _df.SelectExpr("age * 2");
    _df.SelectExpr("age * 2", "abs(age)");

    // Filtering.
    _df.Filter(_df["age"] > 21);
    _df.Filter("age > 21");

    _df.Where(_df["age"] > 21);
    _df.Where("age > 21");

    // Grouping and aggregation.
    _df.GroupBy("age");
    _df.GroupBy("age", "name");
    _df.GroupBy();
    _df.GroupBy(_df["age"]);
    _df.GroupBy(_df["age"], _df["name"]);

    _df.Rollup("age");
    _df.Rollup("age", "name");
    _df.Rollup();
    _df.Rollup(_df["age"]);
    _df.Rollup(_df["age"], _df["name"]);

    _df.Cube("age");
    _df.Cube("age", "name");
    _df.Cube();
    _df.Cube(_df["age"]);
    _df.Cube(_df["age"], _df["name"]);

    _df.Agg(Avg(_df["age"]));
    _df.Agg(Avg(_df["age"]), Avg(_df["name"]));

    // Limit and set operations.
    _df.Limit(10);

    _df.Union(_df);
    _df.UnionByName(_df);
    _df.Intersect(_df);
    _df.Except(_df);

    // Sampling and splitting.
    _df.Sample(0.5);
    _df.Sample(0.5, true);
    _df.Sample(0.5, false, 12345);

    _df.RandomSplit(new[] { 0.2, 0.8 });
    _df.RandomSplit(new[] { 0.2, 0.8 }, 12345);

    // Adding, renaming and dropping columns.
    _df.WithColumn("age2", _df["age"]);
    _df.WithColumnRenamed("age", "age2");

    _df.Drop();
    _df.Drop("age");
    _df.Drop("age", "name");
    _df.Drop(_df["age"]);

    _df.DropDuplicates();
    _df.DropDuplicates("age");
    _df.DropDuplicates("age", "name");

    // Statistics.
    _df.Describe();
    _df.Describe("age");
    _df.Describe("age", "name");

    _df.Summary();
    _df.Summary("count");
    _df.Summary("count", "mean");

    // Row retrieval.
    _df.Head(2);
    _df.Head();
    _df.First();

    _df.Take(3).ToArray();
    _df.Collect().ToArray();
    _df.ToLocalIterator().ToArray();

    _df.Count();

    // Partitioning.
    _df.Repartition(2);
    _df.Repartition(2, _df["age"]);
    _df.Repartition(_df["age"]);
    _df.Repartition();

    _df.RepartitionByRange(2, _df["age"]);
    _df.RepartitionByRange(_df["age"]);

    _df.Coalesce(1);

    _df.Distinct();

    // Caching.
    _df.Persist();
    _df.Cache();
    _df.Unpersist();

    // Temp views: each plain Create* call runs before its CreateOrReplace* counterpart
    // for the same view name; that order is kept.
    _df.CreateTempView("view");
    _df.CreateOrReplaceTempView("view");

    _df.CreateGlobalTempView("global_view");
    _df.CreateOrReplaceGlobalTempView("global_view");
}
// Checks DataFrame.Describe on three columns ("product_id", "quantity", "state") of a
// seven-row frame. The result is expected to have 3 columns and 11 rows, indexed
// Count, Unique, Top, Freq, Mean, Std, Min, 25%, Median, 75%, Max. For the string column
// "state", every row from Mean onward is expected to be DataFrame.NAN.
public void Describe_Test01()
{
    var dict = new Dictionary<string, List<object>>
    {
        { "product_id", new List<object>() { 1, 1, 2, 2, 2, 2, 2 } },
        { "retail_price", new List<object>() { 2, 2, 5, 5, 5, 5, 5 } },
        { "quantity", new List<object>() { 1, 2, 4, 8, 16, 32, 64 } },
        { "city", new List<object>() { "SF", "SJ", "SF", "SJ", "Miami", "Orlando", "SJ" } },
        { "state", new List<object>() { "CA", "CA", "CA", "CA", "FL", "FL", "PR" } },
    };

    var df = new DataFrame(dict);

    // NOTE(review): the meaning of the leading `false` argument is not visible in this
    // file - confirm against the Describe signature.
    var descDf = df.Describe(false, "product_id", "quantity", "state");

    // Assert.Equal (rather than Assert.True(a == b)) reports both the expected and the
    // actual value when the check fails.
    Assert.Equal(11, descDf.RowCount());
    Assert.Equal(3, descDf.ColCount());

    // The returned text is not inspected; presumably this only checks that rendering the
    // described frame does not throw.
    descDf.ToStringBuilder();

    // Row 0: Count
    Assert.Equal("Count", descDf.Index[0]);
    Assert.Equal(7, descDf["product_id", 0]);
    Assert.Equal(7, descDf["quantity", 0]);
    Assert.Equal(7, descDf["state", 0]);

    // Row 1: Unique
    Assert.Equal("Unique", descDf.Index[1]);
    Assert.Equal(2, descDf["product_id", 1]);
    Assert.Equal(7, descDf["quantity", 1]);
    Assert.Equal(3, descDf["state", 1]);

    // Row 2: Top
    Assert.Equal("Top", descDf.Index[2]);
    Assert.Equal(2, descDf["product_id", 2]);
    Assert.Equal(1, descDf["quantity", 2]);
    Assert.Equal("CA", descDf["state", 2]);

    // Row 3: Freq
    Assert.Equal("Freq", descDf.Index[3]);
    Assert.Equal(5, descDf["product_id", 3]);
    Assert.Equal(1, descDf["quantity", 3]);
    Assert.Equal(4, descDf["state", 3]);

    // Row 4: Mean (expected literals are given to six decimal places)
    Assert.Equal("Mean", descDf.Index[4]);
    Assert.Equal(1.714286, descDf["product_id", 4]);
    Assert.Equal(18.142857, descDf["quantity", 4]);
    Assert.Equal(DataFrame.NAN, descDf["state", 4]);

    // Row 5: Std
    Assert.Equal("Std", descDf.Index[5]);
    Assert.Equal(0.487950, descDf["product_id", 5]);
    Assert.Equal(22.937804, descDf["quantity", 5]);
    Assert.Equal(DataFrame.NAN, descDf["state", 5]);

    // Row 6: Min
    Assert.Equal("Min", descDf.Index[6]);
    Assert.Equal(1, descDf["product_id", 6]);
    Assert.Equal(1, descDf["quantity", 6]);
    Assert.Equal(DataFrame.NAN, descDf["state", 6]);

    // Row 7: 25%
    Assert.Equal("25%", descDf.Index[7]);
    Assert.Equal(1.5, descDf["product_id", 7]);
    Assert.Equal(3.0, descDf["quantity", 7]);
    Assert.Equal(DataFrame.NAN, descDf["state", 7]);

    // Row 8: Median
    Assert.Equal("Median", descDf.Index[8]);
    Assert.Equal(2.0, descDf["product_id", 8]);
    Assert.Equal(8.0, descDf["quantity", 8]);
    Assert.Equal(DataFrame.NAN, descDf["state", 8]);

    // Row 9: 75%
    Assert.Equal("75%", descDf.Index[9]);
    Assert.Equal(2d, descDf["product_id", 9]);
    Assert.Equal(24d, descDf["quantity", 9]);
    Assert.Equal(DataFrame.NAN, descDf["state", 9]);

    // Row 10: Max
    Assert.Equal("Max", descDf.Index[10]);
    Assert.Equal(2, descDf["product_id", 10]);
    Assert.Equal(64, descDf["quantity", 10]);
    Assert.Equal(DataFrame.NAN, descDf["state", 10]);
}