/// <summary>
/// Sums l_extendedprice * (1 - l_discount) per l_suppkey over line items whose
/// l_shipdate falls in ["1996-01-01", "1996-04-01"), then shows the supplier
/// rows whose summed total equals the maximum total, ordered by s_suppkey.
/// NOTE(review): presumably corresponds to TPC-H query 15 - confirm.
/// </summary>
internal void Q15()
{
    // Row-level value: extended price reduced by the discount fraction.
    Func<Column, Column, Column> discounted =
        Udf<double, double, double>((price, discount) => price * (1 - discount));

    // Lower bound inclusive, upper bound exclusive.
    Column shippedInWindow =
        Col("l_shipdate") >= "1996-01-01" & Col("l_shipdate") < "1996-04-01";
    Column lineValue =
        discounted(Col("l_extendedprice"), Col("l_discount")).As("value");

    // One row per supplier key with its summed value.
    DataFrame revenue = _lineitem
        .Filter(shippedInWindow)
        .Select(Col("l_suppkey"), lineValue)
        .GroupBy(Col("l_suppkey"))
        .Agg(Sum(Col("value")).As("total"));

    // Single-row frame holding the largest per-supplier total.
    DataFrame maxTotal = revenue.Agg(Max(Col("total")).As("max_total"));

    // Joining back on the maximum keeps only the supplier key(s) that reach it.
    DataFrame topRevenue =
        maxTotal.Join(revenue, Col("max_total") == revenue["total"]);
    DataFrame topSuppliers =
        topRevenue.Join(_supplier, Col("l_suppkey") == _supplier["s_suppkey"]);

    topSuppliers
        .Select(
            Col("s_suppkey"),
            Col("s_name"),
            Col("s_address"),
            Col("s_phone"),
            Col("total"))
        .Sort(Col("s_suppkey"))
        .Show();
}
/// <summary>
/// For suppliers joined to the nation named "GERMANY", computes
/// ps_supplycost * ps_availqty per part-supplier row, sums it per ps_partkey,
/// and shows the parts whose summed value exceeds 0.0001 times the overall
/// sum, in descending order of that per-part value.
/// NOTE(review): presumably corresponds to TPC-H query 11 - confirm.
/// </summary>
internal void Q11()
{
    Func<Column, Column, Column> multiply =
        Udf<double, int, double>((cost, quantity) => cost * quantity);
    Func<Column, Column> scaleDown =
        Udf<double, double>(total => total * 0.0001);

    // Supplier keys belonging to the selected nation.
    DataFrame nationSuppliers = _nation
        .Filter(Col("n_name") == "GERMANY")
        .Join(_supplier, Col("n_nationkey") == _supplier["s_nationkey"])
        .Select(Col("s_suppkey"));

    // One row per part-supplier entry with its cost * quantity value.
    DataFrame partValues = nationSuppliers
        .Join(_partsupp, Col("s_suppkey") == _partsupp["ps_suppkey"])
        .Select(
            Col("ps_partkey"),
            multiply(Col("ps_supplycost"), Col("ps_availqty")).As("value"));

    // Overall sum (single row) and per-part sums.
    DataFrame grandTotal = partValues.Agg(Sum("value").As("total_value"));
    DataFrame perPart = partValues
        .GroupBy(Col("ps_partkey"))
        .Agg(Sum("value").As("part_value"));

    // Non-equi join against the single-row total acts as a threshold filter.
    perPart
        .Join(grandTotal, Col("part_value") > scaleDown(Col("total_value")))
        .Sort(Col("part_value").Desc())
        .Show();
}
// Signature smoke test: calls each DataFrame / RelationalGroupedDataset
// overload on _df and checks only the runtime type of what comes back.
// Statement order is kept as-is because several calls act on shared session
// state (checkpoint directory, persistence, temp views).
// NOTE(review): the "V2_3_X" suffix presumably refers to Spark 2.3.x - confirm.
public void TestSignaturesV2_3_X()
{
    // --- Column access and renaming ---
    Assert.IsType<Column>(_df["name"]);
    Assert.IsType<Column>(_df["age"]);

    Assert.IsType<DataFrame>(_df.ToDF());
    Assert.IsType<DataFrame>(_df.ToDF("name2", "age2"));

    // --- Schema and plan inspection ---
    StructType dfSchema = _df.Schema();
    Assert.NotNull(dfSchema);

    _df.PrintSchema();

    _df.Explain();
    _df.Explain(true);
    _df.Explain(false);

    Assert.Equal(2, _df.Columns().ToArray().Length);

    List<Tuple<string, string>> expectedDTypes = new List<Tuple<string, string>>
    {
        Tuple.Create("age", "integer"),
        Tuple.Create("name", "string")
    };
    Assert.Equal(expectedDTypes, _df.DTypes());

    Assert.IsType<bool>(_df.IsLocal());
    Assert.IsType<bool>(_df.IsStreaming());

    // --- Checkpointing ---
    using (var checkpointDir = new TemporaryDirectory())
    {
        // A checkpoint directory has to be configured before *Checkpoint() is called.
        _spark.SparkContext.SetCheckpointDir(checkpointDir.Path);

        Assert.IsType<DataFrame>(_df.Checkpoint());
        Assert.IsType<DataFrame>(_df.Checkpoint(false));

        Assert.IsType<DataFrame>(_df.LocalCheckpoint());
        Assert.IsType<DataFrame>(_df.LocalCheckpoint(false));
    }

    Assert.IsType<DataFrame>(_df.WithWatermark("time", "10 minutes"));

    // --- Show ---
    _df.Show();
    _df.Show(10);
    _df.Show(10, 10);
    _df.Show(10, 10, true);

    // --- Joins ---
    Assert.IsType<DataFrame>(_df.Join(_df));
    Assert.IsType<DataFrame>(_df.Join(_df, "name"));
    Assert.IsType<DataFrame>(_df.Join(_df, new[] { "name" }));
    Assert.IsType<DataFrame>(_df.Join(_df, new[] { "name" }, "outer"));
    Assert.IsType<DataFrame>(_df.Join(_df, _df["age"] == _df["age"]));
    Assert.IsType<DataFrame>(_df.Join(_df, _df["age"] == _df["age"], "outer"));

    Assert.IsType<DataFrame>(_df.CrossJoin(_df));

    // --- Sorting ---
    Assert.IsType<DataFrame>(_df.SortWithinPartitions("age"));
    Assert.IsType<DataFrame>(_df.SortWithinPartitions("age", "name"));
    Assert.IsType<DataFrame>(_df.SortWithinPartitions());
    Assert.IsType<DataFrame>(_df.SortWithinPartitions(_df["age"]));
    Assert.IsType<DataFrame>(_df.SortWithinPartitions(_df["age"], _df["name"]));

    Assert.IsType<DataFrame>(_df.Sort("age"));
    Assert.IsType<DataFrame>(_df.Sort("age", "name"));
    Assert.IsType<DataFrame>(_df.Sort());
    Assert.IsType<DataFrame>(_df.Sort(_df["age"]));
    Assert.IsType<DataFrame>(_df.Sort(_df["age"], _df["name"]));

    Assert.IsType<DataFrame>(_df.OrderBy("age"));
    Assert.IsType<DataFrame>(_df.OrderBy("age", "name"));
    Assert.IsType<DataFrame>(_df.OrderBy());
    Assert.IsType<DataFrame>(_df.OrderBy(_df["age"]));
    Assert.IsType<DataFrame>(_df.OrderBy(_df["age"], _df["name"]));

    // --- Hints, column lookup, aliasing ---
    Assert.IsType<DataFrame>(_df.Hint("broadcast"));
    Assert.IsType<DataFrame>(_df.Hint("broadcast", new[] { "hello", "world" }));

    Assert.IsType<Column>(_df.Col("age"));

    Assert.IsType<Column>(_df.ColRegex("age"));

    Assert.IsType<DataFrame>(_df.As("alias"));

    Assert.IsType<DataFrame>(_df.Alias("alias"));

    // --- Projection and filtering ---
    Assert.IsType<DataFrame>(_df.Select("age"));
    Assert.IsType<DataFrame>(_df.Select("age", "name"));
    Assert.IsType<DataFrame>(_df.Select());
    Assert.IsType<DataFrame>(_df.Select(_df["age"]));
    Assert.IsType<DataFrame>(_df.Select(_df["age"], _df["name"]));

    Assert.IsType<DataFrame>(_df.SelectExpr());
    Assert.IsType<DataFrame>(_df.SelectExpr("age * 2"));
    Assert.IsType<DataFrame>(_df.SelectExpr("age * 2", "abs(age)"));

    Assert.IsType<DataFrame>(_df.Filter(_df["age"] > 21));
    Assert.IsType<DataFrame>(_df.Filter("age > 21"));

    Assert.IsType<DataFrame>(_df.Where(_df["age"] > 21));
    Assert.IsType<DataFrame>(_df.Where("age > 21"));

    // --- Grouping ---
    Assert.IsType<RelationalGroupedDataset>(_df.GroupBy("age"));
    Assert.IsType<RelationalGroupedDataset>(_df.GroupBy("age", "name"));
    Assert.IsType<RelationalGroupedDataset>(_df.GroupBy());
    Assert.IsType<RelationalGroupedDataset>(_df.GroupBy(_df["age"]));
    Assert.IsType<RelationalGroupedDataset>(_df.GroupBy(_df["age"], _df["name"]));

    // Aggregations on a grouped dataset; "tempAge" copies "age" so the
    // two-column overloads have a second column to work on.
    RelationalGroupedDataset groupedByName =
        _df.WithColumn("tempAge", _df["age"]).GroupBy("name");

    Assert.IsType<DataFrame>(groupedByName.Mean("age"));
    Assert.IsType<DataFrame>(groupedByName.Mean("age", "tempAge"));

    Assert.IsType<DataFrame>(groupedByName.Max("age"));
    Assert.IsType<DataFrame>(groupedByName.Max("age", "tempAge"));

    Assert.IsType<DataFrame>(groupedByName.Avg("age"));
    Assert.IsType<DataFrame>(groupedByName.Avg("age", "tempAge"));

    Assert.IsType<DataFrame>(groupedByName.Min("age"));
    Assert.IsType<DataFrame>(groupedByName.Min("age", "tempAge"));

    Assert.IsType<DataFrame>(groupedByName.Sum("age"));
    Assert.IsType<DataFrame>(groupedByName.Sum("age", "tempAge"));

    Assert.IsType<RelationalGroupedDataset>(_df.Rollup("age"));
    Assert.IsType<RelationalGroupedDataset>(_df.Rollup("age", "name"));
    Assert.IsType<RelationalGroupedDataset>(_df.Rollup());
    Assert.IsType<RelationalGroupedDataset>(_df.Rollup(_df["age"]));
    Assert.IsType<RelationalGroupedDataset>(_df.Rollup(_df["age"], _df["name"]));

    Assert.IsType<RelationalGroupedDataset>(_df.Cube("age"));
    Assert.IsType<RelationalGroupedDataset>(_df.Cube("age", "name"));
    Assert.IsType<RelationalGroupedDataset>(_df.Cube());
    Assert.IsType<RelationalGroupedDataset>(_df.Cube(_df["age"]));
    Assert.IsType<RelationalGroupedDataset>(_df.Cube(_df["age"], _df["name"]));

    Assert.IsType<DataFrame>(_df.Agg(Avg(_df["age"])));
    Assert.IsType<DataFrame>(_df.Agg(Avg(_df["age"]), Avg(_df["name"])));

    // --- Set operations and sampling ---
    Assert.IsType<DataFrame>(_df.Limit(10));

    Assert.IsType<DataFrame>(_df.Union(_df));

    Assert.IsType<DataFrame>(_df.UnionByName(_df));

    Assert.IsType<DataFrame>(_df.Intersect(_df));

    Assert.IsType<DataFrame>(_df.Except(_df));

    Assert.IsType<DataFrame>(_df.Sample(0.5));
    Assert.IsType<DataFrame>(_df.Sample(0.5, true));
    Assert.IsType<DataFrame>(_df.Sample(0.5, false, 12345));

    Assert.IsType<DataFrame[]>(_df.RandomSplit(new[] { 0.2, 0.8 }));
    Assert.IsType<DataFrame[]>(_df.RandomSplit(new[] { 0.2, 0.8 }, 12345));

    // --- Adding, renaming and dropping columns ---
    Assert.IsType<DataFrame>(_df.WithColumn("age2", _df["age"]));

    Assert.IsType<DataFrame>(_df.WithColumnRenamed("age", "age2"));

    Assert.IsType<DataFrame>(_df.Drop());
    Assert.IsType<DataFrame>(_df.Drop("age"));
    Assert.IsType<DataFrame>(_df.Drop("age", "name"));

    Assert.IsType<DataFrame>(_df.Drop(_df["age"]));

    Assert.IsType<DataFrame>(_df.DropDuplicates());
    Assert.IsType<DataFrame>(_df.DropDuplicates("age"));
    Assert.IsType<DataFrame>(_df.DropDuplicates("age", "name"));

    // --- Statistics ---
    Assert.IsType<DataFrame>(_df.Describe());
    Assert.IsType<DataFrame>(_df.Describe("age"));
    Assert.IsType<DataFrame>(_df.Describe("age", "name"));

    Assert.IsType<DataFrame>(_df.Summary());
    Assert.IsType<DataFrame>(_df.Summary("count"));
    Assert.IsType<DataFrame>(_df.Summary("count", "mean"));

    // --- Row retrieval ---
    Assert.IsType<Row[]>(_df.Head(2).ToArray());
    Assert.IsType<Row>(_df.Head());

    Assert.IsType<Row>(_df.First());

    Assert.IsType<DataFrame>(_df.Transform(frame => frame.Drop("age")));

    Assert.IsType<Row[]>(_df.Take(3).ToArray());

    Assert.IsType<Row[]>(_df.Collect().ToArray());

    Assert.IsType<Row[]>(_df.ToLocalIterator().ToArray());

    Assert.IsType<long>(_df.Count());

    // --- Partitioning ---
    Assert.IsType<DataFrame>(_df.Repartition(2));
    Assert.IsType<DataFrame>(_df.Repartition(2, _df["age"]));
    Assert.IsType<DataFrame>(_df.Repartition(_df["age"]));
    Assert.IsType<DataFrame>(_df.Repartition());

    Assert.IsType<DataFrame>(_df.RepartitionByRange(2, _df["age"]));
    Assert.IsType<DataFrame>(_df.RepartitionByRange(_df["age"]));

    Assert.IsType<DataFrame>(_df.Coalesce(1));

    Assert.IsType<DataFrame>(_df.Distinct());

    // --- Persistence ---
    Assert.IsType<DataFrame>(_df.Persist());

    Assert.IsType<DataFrame>(_df.Persist(StorageLevel.DISK_ONLY));

    Assert.IsType<DataFrame>(_df.Cache());

    Assert.IsType<StorageLevel>(_df.StorageLevel());

    Assert.IsType<DataFrame>(_df.Unpersist());

    // --- Temp views (create first, then create-or-replace, same names) ---
    _df.CreateTempView("view");
    _df.CreateOrReplaceTempView("view");

    _df.CreateGlobalTempView("global_view");
    _df.CreateOrReplaceGlobalTempView("global_view");
}