/// <summary>
/// Query 21. For suppliers whose nation is "SAUDI ARABIA", counts line items that were
/// received after their commit date on orders with status "F", keeping only orders that
/// have more than one distinct supplier overall but exactly one distinct supplier among
/// the late line items. The per-supplier count ("numwait") is displayed via Show(),
/// ordered by numwait descending and then by supplier name.
/// </summary>
internal void Q21()
{
    // Per-order supplier statistics: the number of distinct suppliers and the largest
    // supplier key. The order key is re-exposed as "key" so that it can be joined
    // against l_orderkey without a name clash.
    DataFrame SupplierStatsPerOrder(DataFrame items)
    {
        return items
            .GroupBy(Col("l_orderkey"))
            .Agg(
                CountDistinct(Col("l_suppkey")).As("suppkey_count"),
                Max(Col("l_suppkey")).As("suppkey_max"))
            .Select(Col("l_orderkey").As("key"), Col("suppkey_count"), Col("suppkey_max"));
    }

    // Column projections of the base tables.
    DataFrame supplierCols =
        _supplier.Select(Col("s_suppkey"), Col("s_nationkey"), Col("s_name"));
    DataFrame lineItemCols = _lineitem.Select(
        Col("l_suppkey"),
        Col("l_orderkey"),
        Col("l_receiptdate"),
        Col("l_commitdate"));

    // Line items whose receipt date is later than the commit date.
    DataFrame lateLineItems = lineItemCols.Filter(Col("l_receiptdate") > Col("l_commitdate"));

    // Supplier statistics over all line items, and over the late ones only.
    DataFrame allSupplierStats = SupplierStatsPerOrder(lineItemCols);
    DataFrame lateSupplierStats = SupplierStatsPerOrder(lateLineItems);

    DataFrame statusFOrders = _orders
        .Select(Col("o_orderkey"), Col("o_orderstatus"))
        .Filter(Col("o_orderstatus") == "F");

    // Suppliers located in the selected nation.
    DataFrame nationSuppliers = _nation
        .Filter(Col("n_name") == "SAUDI ARABIA")
        .Join(supplierCols, Col("n_nationkey") == supplierCols["s_nationkey"]);

    // Their late line items, restricted to orders with status "F".
    DataFrame lateItemsOnStatusFOrders = nationSuppliers
        .Join(lateLineItems, Col("s_suppkey") == lateLineItems["l_suppkey"])
        .Join(statusFOrders, Col("l_orderkey") == statusFOrders["o_orderkey"]);

    // Keep orders that involve more than one distinct supplier. The projection drops the
    // statistics columns so the next join can reintroduce the same column names.
    DataFrame multiSupplierOrders = lateItemsOnStatusFOrders
        .Join(allSupplierStats, Col("l_orderkey") == allSupplierStats["key"])
        .Filter(Col("suppkey_count") > 1)
        .Select(Col("s_name"), Col("l_orderkey"), Col("l_suppkey"));

    // Keep rows where the order has exactly one distinct late supplier and it is this row's supplier.
    DataFrame soleLateSupplierRows = multiSupplierOrders
        .Join(lateSupplierStats, Col("l_orderkey") == lateSupplierStats["key"], "left_outer")
        .Select(
            Col("s_name"),
            Col("l_orderkey"),
            Col("l_suppkey"),
            Col("suppkey_count"),
            Col("suppkey_max"))
        .Filter((Col("suppkey_count") == 1) & (Col("l_suppkey") == Col("suppkey_max")));

    soleLateSupplierRows
        .GroupBy(Col("s_name"))
        .Agg(Count(Col("l_suppkey")).As("numwait"))
        .Sort(Col("numwait").Desc(), Col("s_name"))
        .Show();
}
/// <summary>
/// Query 22. Derives a two-character "cntrycode" from the start of each customer's phone
/// number, keeps the codes accepted by <c>s_q22PhoneRegex</c>, and then, for customers
/// that have no orders and whose account balance exceeds the average positive balance of
/// the retained customers, displays the customer count ("numcust") and balance total
/// ("totacctbal") per code via Show(), ordered by code.
/// </summary>
internal void Q22()
{
    // UDFs: the first two characters of the phone number, and the regex check on that prefix.
    Func<Column, Column> phonePrefix =
        Udf<string, string>(phoneNumber => phoneNumber.Substring(0, 2));
    Func<Column, Column> isSelectedCode =
        Udf<string, bool>(code => s_q22PhoneRegex.IsMatch(code));

    // Customers whose derived code passes the regex check.
    DataFrame selectedCustomers = _customer
        .Select(
            Col("c_acctbal"),
            Col("c_custkey"),
            phonePrefix(Col("c_phone")).As("cntrycode"))
        .Filter(isSelectedCode(Col("cntrycode")));

    // Ungrouped aggregate: the average of the strictly positive balances.
    DataFrame averageBalance = selectedCustomers
        .Filter(Col("c_acctbal") > 0.0)
        .Agg(Avg(Col("c_acctbal")).As("avg_acctbal"));

    // One row per customer key that appears in the orders table.
    DataFrame customersWithOrders = _orders
        .GroupBy(Col("o_custkey"))
        .Agg(Col("o_custkey"))
        .Select(Col("o_custkey"));

    // The right outer join keeps every selected customer; a null o_custkey marks the
    // customers that have no matching order.
    DataFrame customersWithoutOrders = customersWithOrders
        .Join(selectedCustomers, Col("o_custkey") == selectedCustomers["c_custkey"], "right_outer")
        .Filter(Col("o_custkey").IsNull());

    // Joining without a condition attaches avg_acctbal to every remaining row.
    DataFrame aboveAverage = customersWithoutOrders
        .Join(averageBalance)
        .Filter(Col("c_acctbal") > Col("avg_acctbal"));

    aboveAverage
        .GroupBy(Col("cntrycode"))
        .Agg(
            Count(Col("c_acctbal")).As("numcust"),
            Sum(Col("c_acctbal")).As("totacctbal"))
        .Sort(Col("cntrycode"))
        .Show();
}
/// <summary>
/// Calls the DataFrame API surface on the shared <c>_df</c> fixture and checks the runtime
/// type of each result. Apart from the column count and the DTypes() comparison, only the
/// types of the return values are asserted, not their contents.
/// </summary>
public void TestSignaturesV2_3_X()
{
    // Column access and renaming.
    Assert.IsType<Column>(_df["name"]);
    Assert.IsType<Column>(_df["age"]);

    Assert.IsType<DataFrame>(_df.ToDF());
    Assert.IsType<DataFrame>(_df.ToDF("name2", "age2"));

    // Schema and plan inspection.
    StructType schema = _df.Schema();
    Assert.NotNull(schema);

    _df.PrintSchema();

    _df.Explain();
    _df.Explain(true);
    _df.Explain(false);

    Assert.Equal(2, _df.Columns().ToArray().Length);

    var expectedDTypes = new List<Tuple<string, string>>
    {
        Tuple.Create("age", "integer"),
        Tuple.Create("name", "string")
    };
    Assert.Equal(expectedDTypes, _df.DTypes());

    Assert.IsType<bool>(_df.IsLocal());
    Assert.IsType<bool>(_df.IsStreaming());

    // Checkpointing.
    using (var checkpointDir = new TemporaryDirectory())
    {
        // A checkpoint directory has to be set before the *Checkpoint() calls.
        _spark.SparkContext.SetCheckpointDir(checkpointDir.Path);

        Assert.IsType<DataFrame>(_df.Checkpoint());
        Assert.IsType<DataFrame>(_df.Checkpoint(false));

        Assert.IsType<DataFrame>(_df.LocalCheckpoint());
        Assert.IsType<DataFrame>(_df.LocalCheckpoint(false));
    }

    Assert.IsType<DataFrame>(_df.WithWatermark("time", "10 minutes"));

    // Show overloads.
    _df.Show();
    _df.Show(10);
    _df.Show(10, 10);
    _df.Show(10, 10, true);

    // Joins.
    Assert.IsType<DataFrame>(_df.Join(_df));
    Assert.IsType<DataFrame>(_df.Join(_df, "name"));
    Assert.IsType<DataFrame>(_df.Join(_df, new string[] { "name" }));
    Assert.IsType<DataFrame>(_df.Join(_df, new string[] { "name" }, "outer"));
    Assert.IsType<DataFrame>(_df.Join(_df, _df["age"] == _df["age"]));
    Assert.IsType<DataFrame>(_df.Join(_df, _df["age"] == _df["age"], "outer"));
    Assert.IsType<DataFrame>(_df.CrossJoin(_df));

    // Sorting.
    Assert.IsType<DataFrame>(_df.SortWithinPartitions("age"));
    Assert.IsType<DataFrame>(_df.SortWithinPartitions("age", "name"));
    Assert.IsType<DataFrame>(_df.SortWithinPartitions());
    Assert.IsType<DataFrame>(_df.SortWithinPartitions(_df["age"]));
    Assert.IsType<DataFrame>(_df.SortWithinPartitions(_df["age"], _df["name"]));

    Assert.IsType<DataFrame>(_df.Sort("age"));
    Assert.IsType<DataFrame>(_df.Sort("age", "name"));
    Assert.IsType<DataFrame>(_df.Sort());
    Assert.IsType<DataFrame>(_df.Sort(_df["age"]));
    Assert.IsType<DataFrame>(_df.Sort(_df["age"], _df["name"]));

    Assert.IsType<DataFrame>(_df.OrderBy("age"));
    Assert.IsType<DataFrame>(_df.OrderBy("age", "name"));
    Assert.IsType<DataFrame>(_df.OrderBy());
    Assert.IsType<DataFrame>(_df.OrderBy(_df["age"]));
    Assert.IsType<DataFrame>(_df.OrderBy(_df["age"], _df["name"]));

    // Hints, column lookup and aliasing.
    Assert.IsType<DataFrame>(_df.Hint("broadcast"));
    Assert.IsType<DataFrame>(_df.Hint("broadcast", new string[] { "hello", "world" }));

    Assert.IsType<Column>(_df.Col("age"));
    Assert.IsType<Column>(_df.ColRegex("age"));

    Assert.IsType<DataFrame>(_df.As("alias"));
    Assert.IsType<DataFrame>(_df.Alias("alias"));

    // Projection and filtering.
    Assert.IsType<DataFrame>(_df.Select("age"));
    Assert.IsType<DataFrame>(_df.Select("age", "name"));
    Assert.IsType<DataFrame>(_df.Select());
    Assert.IsType<DataFrame>(_df.Select(_df["age"]));
    Assert.IsType<DataFrame>(_df.Select(_df["age"], _df["name"]));

    Assert.IsType<DataFrame>(_df.SelectExpr());
    Assert.IsType<DataFrame>(_df.SelectExpr("age * 2"));
    Assert.IsType<DataFrame>(_df.SelectExpr("age * 2", "abs(age)"));

    Assert.IsType<DataFrame>(_df.Filter(_df["age"] > 21));
    Assert.IsType<DataFrame>(_df.Filter("age > 21"));

    Assert.IsType<DataFrame>(_df.Where(_df["age"] > 21));
    Assert.IsType<DataFrame>(_df.Where("age > 21"));

    // Grouping.
    Assert.IsType<RelationalGroupedDataset>(_df.GroupBy("age"));
    Assert.IsType<RelationalGroupedDataset>(_df.GroupBy("age", "name"));
    Assert.IsType<RelationalGroupedDataset>(_df.GroupBy());
    Assert.IsType<RelationalGroupedDataset>(_df.GroupBy(_df["age"]));
    Assert.IsType<RelationalGroupedDataset>(_df.GroupBy(_df["age"], _df["name"]));

    {
        // Grouped aggregates; "tempAge" gives the two-column overloads a second column.
        RelationalGroupedDataset grouped =
            _df.WithColumn("tempAge", _df["age"]).GroupBy("name");

        Assert.IsType<DataFrame>(grouped.Mean("age"));
        Assert.IsType<DataFrame>(grouped.Mean("age", "tempAge"));

        Assert.IsType<DataFrame>(grouped.Max("age"));
        Assert.IsType<DataFrame>(grouped.Max("age", "tempAge"));

        Assert.IsType<DataFrame>(grouped.Avg("age"));
        Assert.IsType<DataFrame>(grouped.Avg("age", "tempAge"));

        Assert.IsType<DataFrame>(grouped.Min("age"));
        Assert.IsType<DataFrame>(grouped.Min("age", "tempAge"));

        Assert.IsType<DataFrame>(grouped.Sum("age"));
        Assert.IsType<DataFrame>(grouped.Sum("age", "tempAge"));
    }

    Assert.IsType<RelationalGroupedDataset>(_df.Rollup("age"));
    Assert.IsType<RelationalGroupedDataset>(_df.Rollup("age", "name"));
    Assert.IsType<RelationalGroupedDataset>(_df.Rollup());
    Assert.IsType<RelationalGroupedDataset>(_df.Rollup(_df["age"]));
    Assert.IsType<RelationalGroupedDataset>(_df.Rollup(_df["age"], _df["name"]));

    Assert.IsType<RelationalGroupedDataset>(_df.Cube("age"));
    Assert.IsType<RelationalGroupedDataset>(_df.Cube("age", "name"));
    Assert.IsType<RelationalGroupedDataset>(_df.Cube());
    Assert.IsType<RelationalGroupedDataset>(_df.Cube(_df["age"]));
    Assert.IsType<RelationalGroupedDataset>(_df.Cube(_df["age"], _df["name"]));

    Assert.IsType<DataFrame>(_df.Agg(Avg(_df["age"])));
    Assert.IsType<DataFrame>(_df.Agg(Avg(_df["age"]), Avg(_df["name"])));

    // Set operations and sampling.
    Assert.IsType<DataFrame>(_df.Limit(10));

    Assert.IsType<DataFrame>(_df.Union(_df));
    Assert.IsType<DataFrame>(_df.UnionByName(_df));
    Assert.IsType<DataFrame>(_df.Intersect(_df));
    Assert.IsType<DataFrame>(_df.Except(_df));

    Assert.IsType<DataFrame>(_df.Sample(0.5));
    Assert.IsType<DataFrame>(_df.Sample(0.5, true));
    Assert.IsType<DataFrame>(_df.Sample(0.5, false, 12345));

    Assert.IsType<DataFrame[]>(_df.RandomSplit(new double[] { 0.2, 0.8 }));
    Assert.IsType<DataFrame[]>(_df.RandomSplit(new double[] { 0.2, 0.8 }, 12345));

    // Column manipulation.
    Assert.IsType<DataFrame>(_df.WithColumn("age2", _df["age"]));
    Assert.IsType<DataFrame>(_df.WithColumnRenamed("age", "age2"));

    Assert.IsType<DataFrame>(_df.Drop());
    Assert.IsType<DataFrame>(_df.Drop("age"));
    Assert.IsType<DataFrame>(_df.Drop("age", "name"));
    Assert.IsType<DataFrame>(_df.Drop(_df["age"]));

    Assert.IsType<DataFrame>(_df.DropDuplicates());
    Assert.IsType<DataFrame>(_df.DropDuplicates("age"));
    Assert.IsType<DataFrame>(_df.DropDuplicates("age", "name"));

    // Statistics.
    Assert.IsType<DataFrame>(_df.Describe());
    Assert.IsType<DataFrame>(_df.Describe("age"));
    Assert.IsType<DataFrame>(_df.Describe("age", "name"));

    Assert.IsType<DataFrame>(_df.Summary());
    Assert.IsType<DataFrame>(_df.Summary("count"));
    Assert.IsType<DataFrame>(_df.Summary("count", "mean"));

    // Row retrieval.
    Assert.IsType<Row[]>(_df.Head(2).ToArray());
    Assert.IsType<Row>(_df.Head());
    Assert.IsType<Row>(_df.First());

    Assert.IsType<DataFrame>(_df.Transform(frame => frame.Drop("age")));

    Assert.IsType<Row[]>(_df.Take(3).ToArray());
    Assert.IsType<Row[]>(_df.Collect().ToArray());
    Assert.IsType<Row[]>(_df.ToLocalIterator().ToArray());

    Assert.IsType<long>(_df.Count());

    // Partitioning.
    Assert.IsType<DataFrame>(_df.Repartition(2));
    Assert.IsType<DataFrame>(_df.Repartition(2, _df["age"]));
    Assert.IsType<DataFrame>(_df.Repartition(_df["age"]));
    Assert.IsType<DataFrame>(_df.Repartition());

    Assert.IsType<DataFrame>(_df.RepartitionByRange(2, _df["age"]));
    Assert.IsType<DataFrame>(_df.RepartitionByRange(_df["age"]));

    Assert.IsType<DataFrame>(_df.Coalesce(1));

    Assert.IsType<DataFrame>(_df.Distinct());

    // Persistence.
    Assert.IsType<DataFrame>(_df.Persist());
    Assert.IsType<DataFrame>(_df.Persist(StorageLevel.DISK_ONLY));
    Assert.IsType<DataFrame>(_df.Cache());
    Assert.IsType<StorageLevel>(_df.StorageLevel());
    Assert.IsType<DataFrame>(_df.Unpersist());

    // Temporary views.
    _df.CreateTempView("view");
    _df.CreateOrReplaceTempView("view");

    _df.CreateGlobalTempView("global_view");
    _df.CreateOrReplaceGlobalTempView("global_view");
}