示例#1
0
        public void TestGroupedMapUdf()
        {
            DataFrame df = _spark
                           .Read()
                           .Schema("age INT, name STRING")
                           .Json($"{TestEnvironment.ResourceDirectory}more_people.json");

            // Data:
            // { "name":"Michael"}
            // { "name":"Andy", "age":30}
            // { "name":"Seth", "age":30}
            // { "name":"Justin", "age":19}
            // { "name":"Kathy", "age":19}

            Row[] rows = df.GroupBy("age")
                         .Apply(
                new StructType(new[]
            {
                new StructField("age", new IntegerType()),
                new StructField("nameCharCount", new IntegerType())
            }),
                batch => ArrowBasedCountCharacters(batch))
                         .Collect()
                         .ToArray();

            Assert.Equal(3, rows.Length);
            foreach (Row row in rows)
            {
                int?age       = row.GetAs <int?>("age");
                int charCount = row.GetAs <int>("nameCharCount");
                switch (age)
                {
                case null:
                    Assert.Equal(7, charCount);
                    break;

                case 19:
                    Assert.Equal(11, charCount);
                    break;

                case 30:
                    Assert.Equal(8, charCount);
                    break;

                default:
                    throw new Exception($"Unexpected age: {age}.");
                }
            }
        }
示例#2
0
        internal void Q13()
        {
            Func <Column, Column> special = Udf <string, bool>((x) => s_q13SpecialRegex.IsMatch(x));

            DataFrame c_orders = _customer.Join(_orders, Col("c_custkey") == _orders["o_custkey"]
                                                & !special(_orders["o_comment"]), "left_outer")
                                 .GroupBy(Col("c_custkey"))
                                 .Agg(Count(Col("o_orderkey")).As("c_count"));

            c_orders
            .GroupBy(Col("c_count"))
            .Agg(Count(Col("*")).As("custdist"))
            .Sort(Col("custdist").Desc(), Col("c_count").Desc())
            .Show();
        }
示例#3
0
        internal void Q15()
        {
            Func <Column, Column, Column> decrease = Udf <double, double, double>((x, y) => x * (1 - y));

            DataFrame revenue = _lineitem.Filter(Col("l_shipdate") >= "1996-01-01" &
                                                 Col("l_shipdate") < "1996-04-01")
                                .Select(Col("l_suppkey"), decrease(Col("l_extendedprice"), Col("l_discount")).As("value"))
                                .GroupBy(Col("l_suppkey"))
                                .Agg(Sum(Col("value")).As("total"));

            revenue.Agg(Max(Col("total")).As("max_total"))
            .Join(revenue, Col("max_total") == revenue["total"])
            .Join(_supplier, Col("l_suppkey") == _supplier["s_suppkey"])
            .Select(Col("s_suppkey"), Col("s_name"), Col("s_address"), Col("s_phone"), Col("total"))
            .Sort(Col("s_suppkey"))
            .Show();
        }
示例#4
0
        internal void Q17()
        {
            Func <Column, Column> mul02 = Udf <double, double>((x) => x * 0.2);

            DataFrame flineitem = _lineitem.Select(Col("l_partkey"), Col("l_quantity"), Col("l_extendedprice"));

            DataFrame fpart = _part.Filter(Col("p_brand") == "Brand#23" & Col("p_container") == "MED BOX")
                              .Select(Col("p_partkey"))
                              .Join(_lineitem, Col("p_partkey") == _lineitem["l_partkey"], "left_outer");

            fpart.GroupBy("p_partkey")
            .Agg(mul02(Avg(Col("l_quantity"))).As("avg_quantity"))
            .Select(Col("p_partkey").As("key"), Col("avg_quantity"))
            .Join(fpart, Col("key") == fpart["p_partkey"])
            .Filter(Col("l_quantity") < Col("avg_quantity"))
            .Agg((Sum(Col("l_extendedprice")) / 7.0).As("avg_yearly"))
            .Show();
        }
示例#5
0
        internal void Q10()
        {
            Func <Column, Column, Column> decrease = Udf <double, double, double>((x, y) => x * (1 - y));

            DataFrame flineitem = _lineitem.Filter(Col("l_returnflag") == "R");

            _orders.Filter(Col("o_orderdate") < "1994-01-01" & Col("o_orderdate") >= "1993-10-01")
            .Join(_customer, Col("o_custkey") == _customer["c_custkey"])
            .Join(_nation, Col("c_nationkey") == _nation["n_nationkey"])
            .Join(flineitem, Col("o_orderkey") == flineitem["l_orderkey"])
            .Select(Col("c_custkey"), Col("c_name"),
                    decrease(Col("l_extendedprice"), Col("l_discount")).As("volume"),
                    Col("c_acctbal"), Col("n_name"), Col("c_address"), Col("c_phone"), Col("c_comment"))
            .GroupBy(Col("c_custkey"), Col("c_name"), Col("c_acctbal"), Col("c_phone"), Col("n_name"), Col("c_address"), Col("c_comment"))
            .Agg(Sum(Col("volume")).As("revenue"))
            .Sort(Col("revenue").Desc())
            .Show();
        }
示例#6
0
        internal void Q11()
        {
            Func <Column, Column, Column> mul   = Udf <double, int, double>((x, y) => x * y);
            Func <Column, Column>         mul01 = Udf <double, double>(x => x * 0.0001);

            DataFrame tmp = _nation.Filter(Col("n_name") == "GERMANY")
                            .Join(_supplier, Col("n_nationkey") == _supplier["s_nationkey"])
                            .Select(Col("s_suppkey"))
                            .Join(_partsupp, Col("s_suppkey") == _partsupp["ps_suppkey"])
                            .Select(Col("ps_partkey"), mul(Col("ps_supplycost"), Col("ps_availqty")).As("value"));

            DataFrame sumRes = tmp.Agg(Sum("value").As("total_value"));

            tmp.GroupBy(Col("ps_partkey")).Agg(Sum("value").As("part_value"))
            .Join(sumRes, Col("part_value") > mul01(Col("total_value")))
            .Sort(Col("part_value").Desc())
            .Show();
        }
示例#7
0
        internal void Q3()
        {
            Func <Column, Column, Column> decrease = Udf <double, double, double>((x, y) => x * (1 - y));

            DataFrame fcust      = _customer.Filter(Col("c_mktsegment") == "BUILDING");
            DataFrame forders    = _orders.Filter(Col("o_orderdate") < "1995-03-15");
            DataFrame flineitems = _lineitem.Filter(Col("l_shipdate") > "1995-03-15");

            fcust.Join(forders, Col("c_custkey") == forders["o_custkey"])
            .Select(Col("o_orderkey"), Col("o_orderdate"), Col("o_shippriority"))
            .Join(flineitems, Col("o_orderkey") == flineitems["l_orderkey"])
            .Select(Col("l_orderkey"),
                    decrease(Col("l_extendedprice"), Col("l_discount")).As("volume"),
                    Col("o_orderdate"), Col("o_shippriority"))
            .GroupBy(Col("l_orderkey"), Col("o_orderdate"), Col("o_shippriority"))
            .Agg(Sum(Col("volume")).As("revenue"))
            .Sort(Col("revenue").Desc(), Col("o_orderdate"))
            .Show();
        }
示例#8
0
        public void TestSignaturesV3_X_X()
        {
            // Validate ToLocalIterator
            var data = new List <GenericRow>
            {
                new GenericRow(new object[] { "Alice", 20 }),
                new GenericRow(new object[] { "Bob", 30 })
            };
            var schema = new StructType(new List <StructField>()
            {
                new StructField("Name", new StringType()),
                new StructField("Age", new IntegerType())
            });
            DataFrame         df       = _spark.CreateDataFrame(data, schema);
            IEnumerable <Row> actual   = df.ToLocalIterator(true).ToArray();
            IEnumerable <Row> expected = data.Select(r => new Row(r.Values, schema));

            Assert.Equal(expected, actual);
        }
示例#9
0
        internal void Q5()
        {
            Func <Column, Column, Column> decrease = Udf <double, double, double>((x, y) => x * (1 - y));

            DataFrame forders = _orders.Filter(Col("o_orderdate") < "1995-01-01" & Col("o_orderdate") >= "1994-01-01");

            _region.Filter(Col("r_name") == "ASIA")
            .Join(_nation, Col("r_regionkey") == _nation["n_regionkey"])
            .Join(_supplier, Col("n_nationkey") == _supplier["s_nationkey"])
            .Join(_lineitem, Col("s_suppkey") == _lineitem["l_suppkey"])
            .Select(Col("n_name"), Col("l_extendedprice"), Col("l_discount"), Col("l_orderkey"), Col("s_nationkey"))
            .Join(forders, Col("l_orderkey") == forders["o_orderkey"])
            .Join(_customer, Col("o_custkey") == _customer["c_custkey"]
                  & Col("s_nationkey") == _customer["c_nationkey"])
            .Select(Col("n_name"), decrease(Col("l_extendedprice"), Col("l_discount")).As("value"))
            .GroupBy(Col("n_name"))
            .Agg(Sum(Col("value")).As("revenue"))
            .Sort(Col("revenue").Desc())
            .Show();
        }
示例#10
0
        internal void Q2()
        {
            DataFrame europe = _region.Filter(Col("r_name") == "EUROPE")
                               .Join(_nation, Col("r_regionkey") == _nation["n_regionkey"])
                               .Join(_supplier, Col("n_nationkey") == _supplier["s_nationkey"])
                               .Join(_partsupp, _supplier["s_suppkey"] == _partsupp["ps_suppkey"]);

            DataFrame brass = _part
                              .Filter(_part["p_size"] == 15 & _part["p_type"].EndsWith("BRASS"))
                              .Join(europe, europe["ps_partkey"] == Col("p_partkey"));

            DataFrame minCost = brass.GroupBy(brass["ps_partkey"])
                                .Agg(Min("ps_supplycost").As("min"));

            brass.Join(minCost, brass["ps_partkey"] == minCost["ps_partkey"])
            .Filter(brass["ps_supplycost"] == minCost["min"])
            .Select("s_acctbal", "s_name", "n_name", "p_partkey", "p_mfgr", "s_address", "s_phone", "s_comment")
            .Sort(Col("s_acctbal").Desc(), Col("n_name"), Col("s_name"), Col("p_partkey"))
            .Show();
        }
示例#11
0
        internal void Q16()
        {
            Func <Column, Column> complains = Udf <string, bool>((x) => s_q16CompainsRegex.Match(x).Success);

            Func <Column, Column> polished = Udf <string, bool>((x) => x.StartsWith("MEDIUM POLISHED"));

            Func <Column, Column> numbers = Udf <int, bool>((x) => s_q16NumbersRegex.Match(x.ToString()).Success);

            DataFrame fparts = _part.Filter((Col("p_brand") != "Brand#45") & !polished(Col("p_type")) &
                                            numbers(Col("p_size")))
                               .Select(Col("p_partkey"), Col("p_brand"), Col("p_type"), Col("p_size"));

            _supplier.Filter(!complains(Col("s_comment")))
            .Join(_partsupp, Col("s_suppkey") == _partsupp["ps_suppkey"])
            .Select(Col("ps_partkey"), Col("ps_suppkey"))
            .Join(fparts, Col("ps_partkey") == fparts["p_partkey"])
            .GroupBy(Col("p_brand"), Col("p_type"), Col("p_size"))
            .Agg(CountDistinct(Col("ps_suppkey")).As("supplier_count"))
            .Sort(Col("supplier_count").Desc(), Col("p_brand"), Col("p_type"), Col("p_size"))
            .Show();
        }
示例#12
0
        internal void Q9()
        {
            Func <Column, Column> getYear = Udf <string, string>(x => x.Substring(0, 4));
            Func <Column, Column, Column, Column, Column> expr = Udf <double, double, double, double, double>((x, y, v, w) => x * (1 - y) - (v * w));

            DataFrame linePart = _part.Filter(Col("p_name").Contains("green"))
                                 .Join(_lineitem, Col("p_partkey") == _lineitem["l_partkey"]);

            DataFrame natSup = _nation.Join(_supplier, Col("n_nationkey") == _supplier["s_nationkey"]);

            linePart.Join(natSup, Col("l_suppkey") == natSup["s_suppkey"])
            .Join(_partsupp, Col("l_suppkey") == _partsupp["ps_suppkey"]
                  & Col("l_partkey") == _partsupp["ps_partkey"])
            .Join(_orders, Col("l_orderkey") == _orders["o_orderkey"])
            .Select(Col("n_name"), getYear(Col("o_orderdate")).As("o_year"),
                    expr(Col("l_extendedprice"), Col("l_discount"), Col("ps_supplycost"), Col("l_quantity")).As("amount"))
            .GroupBy(Col("n_name"), Col("o_year"))
            .Agg(Sum(Col("amount")))
            .Sort(Col("n_name"), Col("o_year").Desc())
            .Show();
        }
示例#13
0
        public void Run(string[] args)
        {
            if (args.Length != 1)
            {
                Console.Error.WriteLine(
                    "Usage: Sql.VectorDataFrameUdfs <path to SPARK_HOME/examples/src/main/resources/people.json>");
                Environment.Exit(1);
            }

            SparkSession spark = SparkSession
                                 .Builder()
                                 // Lower the shuffle partitions to speed up groupBy() operations.
                                 .Config("spark.sql.shuffle.partitions", "3")
                                 .AppName("SQL VectorUdfs example using .NET for Apache Spark")
                                 .GetOrCreate();

            DataFrame df = spark.Read().Schema("age INT, name STRING").Json(args[0]);

            StructType schema = df.Schema();

            Console.WriteLine(schema.SimpleString);

            df.Show();

            df.PrintSchema();

            // Grouped Map Vector UDF
            // able to return different shapes and record lengths
            df.GroupBy("age")
            .Apply(
                new StructType(new[]
            {
                new StructField("age", new IntegerType()),
                new StructField("nameCharCount", new IntegerType())
            }),
                r => CountCharacters(r))
            .Show();

            spark.Stop();
        }
示例#14
0
        internal void Q22()
        {
            Func <Column, Column> sub2 = Udf <string, string>(x => x.Substring(0, 2));

            Func <Column, Column> phone = Udf <string, bool>(x => s_q22PhoneRegex.IsMatch(x));

            DataFrame fcustomer = _customer.Select(Col("c_acctbal"), Col("c_custkey"), sub2(Col("c_phone")).As("cntrycode"))
                                  .Filter(phone(Col("cntrycode")));

            DataFrame avg_customer = fcustomer.Filter(Col("c_acctbal") > 0.0)
                                     .Agg(Avg(Col("c_acctbal")).As("avg_acctbal"));

            _orders.GroupBy(Col("o_custkey"))
            .Agg(Col("o_custkey")).Select(Col("o_custkey"))
            .Join(fcustomer, Col("o_custkey") == fcustomer["c_custkey"], "right_outer")
            .Filter(Col("o_custkey").IsNull())
            .Join(avg_customer)
            .Filter(Col("c_acctbal") > Col("avg_acctbal"))
            .GroupBy(Col("cntrycode"))
            .Agg(Count(Col("c_acctbal")).As("numcust"), Sum(Col("c_acctbal")).As("totacctbal"))
            .Sort(Col("cntrycode"))
            .Show();
        }