Example #1
0
        // C#, Scala and SparkSQL results match but SparkSQL has different precision.
        internal void Q7()
        {
            Func <Column, Column>         getYear  = Udf <string, string>(x => x.Substring(0, 4));
            Func <Column, Column, Column> decrease = Udf <double, double, double>((x, y) => x * (1 - y));

            DataFrame fnation = _nation.Filter(Col("n_name") == "FRANCE" | Col("n_name") == "GERMANY");
            DataFrame fline   = _lineitem.Filter(Col("l_shipdate") >= "1995-01-01" & Col("l_shipdate") <= "1996-12-31");

            DataFrame supNation = fnation.Join(_supplier, Col("n_nationkey") == _supplier["s_nationkey"])
                                  .Join(fline, Col("s_suppkey") == fline["l_suppkey"])
                                  .Select(Col("n_name").As("supp_nation"), Col("l_orderkey"), Col("l_extendedprice"), Col("l_discount"), Col("l_shipdate"));

            fnation.Join(_customer, Col("n_nationkey") == _customer["c_nationkey"])
            .Join(_orders, Col("c_custkey") == _orders["o_custkey"])
            .Select(Col("n_name").As("cust_nation"), Col("o_orderkey"))
            .Join(supNation, Col("o_orderkey") == supNation["l_orderkey"])
            .Filter(Col("supp_nation") == "FRANCE" & Col("cust_nation") == "GERMANY"
                    | Col("supp_nation") == "GERMANY" & Col("cust_nation") == "FRANCE")
            .Select(Col("supp_nation"), Col("cust_nation"),
                    getYear(Col("l_shipdate")).As("l_year"),
                    decrease(Col("l_extendedprice"), Col("l_discount")).As("volume"))
            .GroupBy(Col("supp_nation"), Col("cust_nation"), Col("l_year"))
            .Agg(Sum(Col("volume")).As("revenue"))
            .Sort(Col("supp_nation"), Col("cust_nation"), Col("l_year"))
            .Show();
        }
Example #2
0
        internal void Q4()
        {
            DataFrame forders = _orders.Filter(Col("o_orderdate") >= "1993-07-01" &
                                               Col("o_orderdate") < "1993-10-01");
            DataFrame flineitems = _lineitem.Filter(Col("l_commitdate") < Col("l_receiptdate"))
                                   .Select($"l_orderkey")
                                   .Distinct();

            flineitems.Join(forders, Col("l_orderkey") == forders["o_orderkey"])
            .GroupBy(Col("o_orderpriority"))
            .Agg(Count(Col("o_orderpriority")).As("order_count"))
            .Sort(Col("o_orderpriority"))
            .Show();
        }
Example #3
0
        internal void Q3()
        {
            Func <Column, Column, Column> decrease = Udf <double, double, double>((x, y) => x * (1 - y));

            DataFrame fcust      = _customer.Filter(Col("c_mktsegment") == "BUILDING");
            DataFrame forders    = _orders.Filter(Col("o_orderdate") < "1995-03-15");
            DataFrame flineitems = _lineitem.Filter(Col("l_shipdate") > "1995-03-15");

            fcust.Join(forders, Col("c_custkey") == forders["o_custkey"])
            .Select(Col("o_orderkey"), Col("o_orderdate"), Col("o_shippriority"))
            .Join(flineitems, Col("o_orderkey") == flineitems["l_orderkey"])
            .Select(Col("l_orderkey"),
                    decrease(Col("l_extendedprice"), Col("l_discount")).As("volume"),
                    Col("o_orderdate"), Col("o_shippriority"))
            .GroupBy(Col("l_orderkey"), Col("o_orderdate"), Col("o_shippriority"))
            .Agg(Sum(Col("volume")).As("revenue"))
            .Sort(Col("revenue").Desc(), Col("o_orderdate"))
            .Show();
        }
Example #4
0
        internal void Q2()
        {
            DataFrame europe = _region.Filter(Col("r_name") == "EUROPE")
                               .Join(_nation, Col("r_regionkey") == _nation["n_regionkey"])
                               .Join(_supplier, Col("n_nationkey") == _supplier["s_nationkey"])
                               .Join(_partsupp, _supplier["s_suppkey"] == _partsupp["ps_suppkey"]);

            DataFrame brass = _part
                              .Filter(_part["p_size"] == 15 & _part["p_type"].EndsWith("BRASS"))
                              .Join(europe, europe["ps_partkey"] == Col("p_partkey"));

            DataFrame minCost = brass.GroupBy(brass["ps_partkey"])
                                .Agg(Min("ps_supplycost").As("min"));

            brass.Join(minCost, brass["ps_partkey"] == minCost["ps_partkey"])
            .Filter(brass["ps_supplycost"] == minCost["min"])
            .Select("s_acctbal", "s_name", "n_name", "p_partkey", "p_mfgr", "s_address", "s_phone", "s_comment")
            .Sort(Col("s_acctbal").Desc(), Col("n_name"), Col("s_name"), Col("p_partkey"))
            .Show();
        }
Example #5
0
        internal void Q9()
        {
            Func <Column, Column> getYear = Udf <string, string>(x => x.Substring(0, 4));
            Func <Column, Column, Column, Column, Column> expr = Udf <double, double, double, double, double>((x, y, v, w) => x * (1 - y) - (v * w));

            DataFrame linePart = _part.Filter(Col("p_name").Contains("green"))
                                 .Join(_lineitem, Col("p_partkey") == _lineitem["l_partkey"]);

            DataFrame natSup = _nation.Join(_supplier, Col("n_nationkey") == _supplier["s_nationkey"]);

            linePart.Join(natSup, Col("l_suppkey") == natSup["s_suppkey"])
            .Join(_partsupp, Col("l_suppkey") == _partsupp["ps_suppkey"]
                  & Col("l_partkey") == _partsupp["ps_partkey"])
            .Join(_orders, Col("l_orderkey") == _orders["o_orderkey"])
            .Select(Col("n_name"), getYear(Col("o_orderdate")).As("o_year"),
                    expr(Col("l_extendedprice"), Col("l_discount"), Col("ps_supplycost"), Col("l_quantity")).As("amount"))
            .GroupBy(Col("n_name"), Col("o_year"))
            .Agg(Sum(Col("amount")))
            .Sort(Col("n_name"), Col("o_year").Desc())
            .Show();
        }
Example #6
0
        public void TestSignaturesV2_3_X()
        {
            Assert.IsType <Column>(_df["name"]);
            Assert.IsType <Column>(_df["age"]);

            Assert.IsType <DataFrame>(_df.ToDF());
            Assert.IsType <DataFrame>(_df.ToDF("name2", "age2"));

            StructType schema = _df.Schema();

            Assert.NotNull(schema);

            _df.PrintSchema();

            _df.Explain();
            _df.Explain(true);
            _df.Explain(false);

            Assert.Equal(2, _df.Columns().ToArray().Length);

            var expected = new List <Tuple <string, string> >
            {
                new Tuple <string, string>("age", "integer"),
                new Tuple <string, string>("name", "string")
            };

            Assert.Equal(expected, _df.DTypes());

            Assert.IsType <bool>(_df.IsLocal());

            Assert.IsType <bool>(_df.IsStreaming());

            using (var tempDir = new TemporaryDirectory())
            {
                // The following is required for *CheckPoint().
                _spark.SparkContext.SetCheckpointDir(tempDir.Path);

                Assert.IsType <DataFrame>(_df.Checkpoint());
                Assert.IsType <DataFrame>(_df.Checkpoint(false));

                Assert.IsType <DataFrame>(_df.LocalCheckpoint());
                Assert.IsType <DataFrame>(_df.LocalCheckpoint(false));
            }

            Assert.IsType <DataFrame>(_df.WithWatermark("time", "10 minutes"));

            _df.Show();
            _df.Show(10);
            _df.Show(10, 10);
            _df.Show(10, 10, true);

            Assert.IsType <DataFrame>(_df.Join(_df));
            Assert.IsType <DataFrame>(_df.Join(_df, "name"));
            Assert.IsType <DataFrame>(_df.Join(_df, new[] { "name" }));
            Assert.IsType <DataFrame>(_df.Join(_df, new[] { "name" }, "outer"));
            Assert.IsType <DataFrame>(_df.Join(_df, _df["age"] == _df["age"]));
            Assert.IsType <DataFrame>(_df.Join(_df, _df["age"] == _df["age"], "outer"));

            Assert.IsType <DataFrame>(_df.CrossJoin(_df));

            Assert.IsType <DataFrame>(_df.SortWithinPartitions("age"));
            Assert.IsType <DataFrame>(_df.SortWithinPartitions("age", "name"));
            Assert.IsType <DataFrame>(_df.SortWithinPartitions());
            Assert.IsType <DataFrame>(_df.SortWithinPartitions(_df["age"]));
            Assert.IsType <DataFrame>(_df.SortWithinPartitions(_df["age"], _df["name"]));

            Assert.IsType <DataFrame>(_df.Sort("age"));
            Assert.IsType <DataFrame>(_df.Sort("age", "name"));
            Assert.IsType <DataFrame>(_df.Sort());
            Assert.IsType <DataFrame>(_df.Sort(_df["age"]));
            Assert.IsType <DataFrame>(_df.Sort(_df["age"], _df["name"]));

            Assert.IsType <DataFrame>(_df.OrderBy("age"));
            Assert.IsType <DataFrame>(_df.OrderBy("age", "name"));
            Assert.IsType <DataFrame>(_df.OrderBy());
            Assert.IsType <DataFrame>(_df.OrderBy(_df["age"]));
            Assert.IsType <DataFrame>(_df.OrderBy(_df["age"], _df["name"]));

            Assert.IsType <DataFrame>(_df.Hint("broadcast"));
            Assert.IsType <DataFrame>(_df.Hint("broadcast", new[] { "hello", "world" }));

            Assert.IsType <Column>(_df.Col("age"));

            Assert.IsType <Column>(_df.ColRegex("age"));

            Assert.IsType <DataFrame>(_df.As("alias"));

            Assert.IsType <DataFrame>(_df.Alias("alias"));

            Assert.IsType <DataFrame>(_df.Select("age"));
            Assert.IsType <DataFrame>(_df.Select("age", "name"));
            Assert.IsType <DataFrame>(_df.Select());
            Assert.IsType <DataFrame>(_df.Select(_df["age"]));
            Assert.IsType <DataFrame>(_df.Select(_df["age"], _df["name"]));

            Assert.IsType <DataFrame>(_df.SelectExpr());
            Assert.IsType <DataFrame>(_df.SelectExpr("age * 2"));
            Assert.IsType <DataFrame>(_df.SelectExpr("age * 2", "abs(age)"));

            Assert.IsType <DataFrame>(_df.Filter(_df["age"] > 21));
            Assert.IsType <DataFrame>(_df.Filter("age > 21"));

            Assert.IsType <DataFrame>(_df.Where(_df["age"] > 21));
            Assert.IsType <DataFrame>(_df.Where("age > 21"));

            Assert.IsType <RelationalGroupedDataset>(_df.GroupBy("age"));
            Assert.IsType <RelationalGroupedDataset>(_df.GroupBy("age", "name"));
            Assert.IsType <RelationalGroupedDataset>(_df.GroupBy());
            Assert.IsType <RelationalGroupedDataset>(_df.GroupBy(_df["age"]));
            Assert.IsType <RelationalGroupedDataset>(_df.GroupBy(_df["age"], _df["name"]));

            {
                RelationalGroupedDataset df =
                    _df.WithColumn("tempAge", _df["age"]).GroupBy("name");

                Assert.IsType <DataFrame>(df.Mean("age"));
                Assert.IsType <DataFrame>(df.Mean("age", "tempAge"));

                Assert.IsType <DataFrame>(df.Max("age"));
                Assert.IsType <DataFrame>(df.Max("age", "tempAge"));

                Assert.IsType <DataFrame>(df.Avg("age"));
                Assert.IsType <DataFrame>(df.Avg("age", "tempAge"));

                Assert.IsType <DataFrame>(df.Min("age"));
                Assert.IsType <DataFrame>(df.Min("age", "tempAge"));

                Assert.IsType <DataFrame>(df.Sum("age"));
                Assert.IsType <DataFrame>(df.Sum("age", "tempAge"));
            }

            Assert.IsType <RelationalGroupedDataset>(_df.Rollup("age"));
            Assert.IsType <RelationalGroupedDataset>(_df.Rollup("age", "name"));
            Assert.IsType <RelationalGroupedDataset>(_df.Rollup());
            Assert.IsType <RelationalGroupedDataset>(_df.Rollup(_df["age"]));
            Assert.IsType <RelationalGroupedDataset>(_df.Rollup(_df["age"], _df["name"]));

            Assert.IsType <RelationalGroupedDataset>(_df.Cube("age"));
            Assert.IsType <RelationalGroupedDataset>(_df.Cube("age", "name"));
            Assert.IsType <RelationalGroupedDataset>(_df.Cube());
            Assert.IsType <RelationalGroupedDataset>(_df.Cube(_df["age"]));
            Assert.IsType <RelationalGroupedDataset>(_df.Cube(_df["age"], _df["name"]));

            Assert.IsType <DataFrame>(_df.Agg(Avg(_df["age"])));
            Assert.IsType <DataFrame>(_df.Agg(Avg(_df["age"]), Avg(_df["name"])));

            Assert.IsType <DataFrame>(_df.Limit(10));

            Assert.IsType <DataFrame>(_df.Union(_df));

            Assert.IsType <DataFrame>(_df.UnionByName(_df));

            Assert.IsType <DataFrame>(_df.Intersect(_df));

            Assert.IsType <DataFrame>(_df.Except(_df));

            Assert.IsType <DataFrame>(_df.Sample(0.5));
            Assert.IsType <DataFrame>(_df.Sample(0.5, true));
            Assert.IsType <DataFrame>(_df.Sample(0.5, false, 12345));

            Assert.IsType <DataFrame[]>(_df.RandomSplit(new[] { 0.2, 0.8 }));
            Assert.IsType <DataFrame[]>(_df.RandomSplit(new[] { 0.2, 0.8 }, 12345));

            Assert.IsType <DataFrame>(_df.WithColumn("age2", _df["age"]));

            Assert.IsType <DataFrame>(_df.WithColumnRenamed("age", "age2"));

            Assert.IsType <DataFrame>(_df.Drop());
            Assert.IsType <DataFrame>(_df.Drop("age"));
            Assert.IsType <DataFrame>(_df.Drop("age", "name"));

            Assert.IsType <DataFrame>(_df.Drop(_df["age"]));

            Assert.IsType <DataFrame>(_df.DropDuplicates());
            Assert.IsType <DataFrame>(_df.DropDuplicates("age"));
            Assert.IsType <DataFrame>(_df.DropDuplicates("age", "name"));

            Assert.IsType <DataFrame>(_df.Describe());
            Assert.IsType <DataFrame>(_df.Describe("age"));
            Assert.IsType <DataFrame>(_df.Describe("age", "name"));

            Assert.IsType <DataFrame>(_df.Summary());
            Assert.IsType <DataFrame>(_df.Summary("count"));
            Assert.IsType <DataFrame>(_df.Summary("count", "mean"));

            Assert.IsType <Row[]>(_df.Head(2).ToArray());
            Assert.IsType <Row>(_df.Head());

            Assert.IsType <Row>(_df.First());

            Assert.IsType <DataFrame>(_df.Transform(df => df.Drop("age")));

            Assert.IsType <Row[]>(_df.Take(3).ToArray());

            Assert.IsType <Row[]>(_df.Collect().ToArray());

            Assert.IsType <Row[]>(_df.ToLocalIterator().ToArray());

            Assert.IsType <long>(_df.Count());

            Assert.IsType <DataFrame>(_df.Repartition(2));
            Assert.IsType <DataFrame>(_df.Repartition(2, _df["age"]));
            Assert.IsType <DataFrame>(_df.Repartition(_df["age"]));
            Assert.IsType <DataFrame>(_df.Repartition());

            Assert.IsType <DataFrame>(_df.RepartitionByRange(2, _df["age"]));
            Assert.IsType <DataFrame>(_df.RepartitionByRange(_df["age"]));

            Assert.IsType <DataFrame>(_df.Coalesce(1));

            Assert.IsType <DataFrame>(_df.Distinct());

            Assert.IsType <DataFrame>(_df.Persist());

            Assert.IsType <DataFrame>(_df.Persist(StorageLevel.DISK_ONLY));

            Assert.IsType <DataFrame>(_df.Cache());

            Assert.IsType <StorageLevel>(_df.StorageLevel());

            Assert.IsType <DataFrame>(_df.Unpersist());

            _df.CreateTempView("view");
            _df.CreateOrReplaceTempView("view");

            _df.CreateGlobalTempView("global_view");
            _df.CreateOrReplaceGlobalTempView("global_view");
        }