public void TestGroupedMapUdf() { DataFrame df = _spark .Read() .Schema("age INT, name STRING") .Json($"{TestEnvironment.ResourceDirectory}more_people.json"); // Data: // { "name":"Michael"} // { "name":"Andy", "age":30} // { "name":"Seth", "age":30} // { "name":"Justin", "age":19} // { "name":"Kathy", "age":19} Row[] rows = df.GroupBy("age") .Apply( new StructType(new[] { new StructField("age", new IntegerType()), new StructField("nameCharCount", new IntegerType()) }), batch => ArrowBasedCountCharacters(batch)) .Collect() .ToArray(); Assert.Equal(3, rows.Length); foreach (Row row in rows) { int?age = row.GetAs <int?>("age"); int charCount = row.GetAs <int>("nameCharCount"); switch (age) { case null: Assert.Equal(7, charCount); break; case 19: Assert.Equal(11, charCount); break; case 30: Assert.Equal(8, charCount); break; default: throw new Exception($"Unexpected age: {age}."); } } }
internal void Q13() { Func <Column, Column> special = Udf <string, bool>((x) => s_q13SpecialRegex.IsMatch(x)); DataFrame c_orders = _customer.Join(_orders, Col("c_custkey") == _orders["o_custkey"] & !special(_orders["o_comment"]), "left_outer") .GroupBy(Col("c_custkey")) .Agg(Count(Col("o_orderkey")).As("c_count")); c_orders .GroupBy(Col("c_count")) .Agg(Count(Col("*")).As("custdist")) .Sort(Col("custdist").Desc(), Col("c_count").Desc()) .Show(); }
internal void Q15() { Func <Column, Column, Column> decrease = Udf <double, double, double>((x, y) => x * (1 - y)); DataFrame revenue = _lineitem.Filter(Col("l_shipdate") >= "1996-01-01" & Col("l_shipdate") < "1996-04-01") .Select(Col("l_suppkey"), decrease(Col("l_extendedprice"), Col("l_discount")).As("value")) .GroupBy(Col("l_suppkey")) .Agg(Sum(Col("value")).As("total")); revenue.Agg(Max(Col("total")).As("max_total")) .Join(revenue, Col("max_total") == revenue["total"]) .Join(_supplier, Col("l_suppkey") == _supplier["s_suppkey"]) .Select(Col("s_suppkey"), Col("s_name"), Col("s_address"), Col("s_phone"), Col("total")) .Sort(Col("s_suppkey")) .Show(); }
internal void Q17() { Func <Column, Column> mul02 = Udf <double, double>((x) => x * 0.2); DataFrame flineitem = _lineitem.Select(Col("l_partkey"), Col("l_quantity"), Col("l_extendedprice")); DataFrame fpart = _part.Filter(Col("p_brand") == "Brand#23" & Col("p_container") == "MED BOX") .Select(Col("p_partkey")) .Join(_lineitem, Col("p_partkey") == _lineitem["l_partkey"], "left_outer"); fpart.GroupBy("p_partkey") .Agg(mul02(Avg(Col("l_quantity"))).As("avg_quantity")) .Select(Col("p_partkey").As("key"), Col("avg_quantity")) .Join(fpart, Col("key") == fpart["p_partkey"]) .Filter(Col("l_quantity") < Col("avg_quantity")) .Agg((Sum(Col("l_extendedprice")) / 7.0).As("avg_yearly")) .Show(); }
internal void Q10() { Func <Column, Column, Column> decrease = Udf <double, double, double>((x, y) => x * (1 - y)); DataFrame flineitem = _lineitem.Filter(Col("l_returnflag") == "R"); _orders.Filter(Col("o_orderdate") < "1994-01-01" & Col("o_orderdate") >= "1993-10-01") .Join(_customer, Col("o_custkey") == _customer["c_custkey"]) .Join(_nation, Col("c_nationkey") == _nation["n_nationkey"]) .Join(flineitem, Col("o_orderkey") == flineitem["l_orderkey"]) .Select(Col("c_custkey"), Col("c_name"), decrease(Col("l_extendedprice"), Col("l_discount")).As("volume"), Col("c_acctbal"), Col("n_name"), Col("c_address"), Col("c_phone"), Col("c_comment")) .GroupBy(Col("c_custkey"), Col("c_name"), Col("c_acctbal"), Col("c_phone"), Col("n_name"), Col("c_address"), Col("c_comment")) .Agg(Sum(Col("volume")).As("revenue")) .Sort(Col("revenue").Desc()) .Show(); }
internal void Q11() { Func <Column, Column, Column> mul = Udf <double, int, double>((x, y) => x * y); Func <Column, Column> mul01 = Udf <double, double>(x => x * 0.0001); DataFrame tmp = _nation.Filter(Col("n_name") == "GERMANY") .Join(_supplier, Col("n_nationkey") == _supplier["s_nationkey"]) .Select(Col("s_suppkey")) .Join(_partsupp, Col("s_suppkey") == _partsupp["ps_suppkey"]) .Select(Col("ps_partkey"), mul(Col("ps_supplycost"), Col("ps_availqty")).As("value")); DataFrame sumRes = tmp.Agg(Sum("value").As("total_value")); tmp.GroupBy(Col("ps_partkey")).Agg(Sum("value").As("part_value")) .Join(sumRes, Col("part_value") > mul01(Col("total_value"))) .Sort(Col("part_value").Desc()) .Show(); }
internal void Q3() { Func <Column, Column, Column> decrease = Udf <double, double, double>((x, y) => x * (1 - y)); DataFrame fcust = _customer.Filter(Col("c_mktsegment") == "BUILDING"); DataFrame forders = _orders.Filter(Col("o_orderdate") < "1995-03-15"); DataFrame flineitems = _lineitem.Filter(Col("l_shipdate") > "1995-03-15"); fcust.Join(forders, Col("c_custkey") == forders["o_custkey"]) .Select(Col("o_orderkey"), Col("o_orderdate"), Col("o_shippriority")) .Join(flineitems, Col("o_orderkey") == flineitems["l_orderkey"]) .Select(Col("l_orderkey"), decrease(Col("l_extendedprice"), Col("l_discount")).As("volume"), Col("o_orderdate"), Col("o_shippriority")) .GroupBy(Col("l_orderkey"), Col("o_orderdate"), Col("o_shippriority")) .Agg(Sum(Col("volume")).As("revenue")) .Sort(Col("revenue").Desc(), Col("o_orderdate")) .Show(); }
public void TestSignaturesV3_X_X() { // Validate ToLocalIterator var data = new List <GenericRow> { new GenericRow(new object[] { "Alice", 20 }), new GenericRow(new object[] { "Bob", 30 }) }; var schema = new StructType(new List <StructField>() { new StructField("Name", new StringType()), new StructField("Age", new IntegerType()) }); DataFrame df = _spark.CreateDataFrame(data, schema); IEnumerable <Row> actual = df.ToLocalIterator(true).ToArray(); IEnumerable <Row> expected = data.Select(r => new Row(r.Values, schema)); Assert.Equal(expected, actual); }
internal void Q5() { Func <Column, Column, Column> decrease = Udf <double, double, double>((x, y) => x * (1 - y)); DataFrame forders = _orders.Filter(Col("o_orderdate") < "1995-01-01" & Col("o_orderdate") >= "1994-01-01"); _region.Filter(Col("r_name") == "ASIA") .Join(_nation, Col("r_regionkey") == _nation["n_regionkey"]) .Join(_supplier, Col("n_nationkey") == _supplier["s_nationkey"]) .Join(_lineitem, Col("s_suppkey") == _lineitem["l_suppkey"]) .Select(Col("n_name"), Col("l_extendedprice"), Col("l_discount"), Col("l_orderkey"), Col("s_nationkey")) .Join(forders, Col("l_orderkey") == forders["o_orderkey"]) .Join(_customer, Col("o_custkey") == _customer["c_custkey"] & Col("s_nationkey") == _customer["c_nationkey"]) .Select(Col("n_name"), decrease(Col("l_extendedprice"), Col("l_discount")).As("value")) .GroupBy(Col("n_name")) .Agg(Sum(Col("value")).As("revenue")) .Sort(Col("revenue").Desc()) .Show(); }
internal void Q2() { DataFrame europe = _region.Filter(Col("r_name") == "EUROPE") .Join(_nation, Col("r_regionkey") == _nation["n_regionkey"]) .Join(_supplier, Col("n_nationkey") == _supplier["s_nationkey"]) .Join(_partsupp, _supplier["s_suppkey"] == _partsupp["ps_suppkey"]); DataFrame brass = _part .Filter(_part["p_size"] == 15 & _part["p_type"].EndsWith("BRASS")) .Join(europe, europe["ps_partkey"] == Col("p_partkey")); DataFrame minCost = brass.GroupBy(brass["ps_partkey"]) .Agg(Min("ps_supplycost").As("min")); brass.Join(minCost, brass["ps_partkey"] == minCost["ps_partkey"]) .Filter(brass["ps_supplycost"] == minCost["min"]) .Select("s_acctbal", "s_name", "n_name", "p_partkey", "p_mfgr", "s_address", "s_phone", "s_comment") .Sort(Col("s_acctbal").Desc(), Col("n_name"), Col("s_name"), Col("p_partkey")) .Show(); }
internal void Q16() { Func <Column, Column> complains = Udf <string, bool>((x) => s_q16CompainsRegex.Match(x).Success); Func <Column, Column> polished = Udf <string, bool>((x) => x.StartsWith("MEDIUM POLISHED")); Func <Column, Column> numbers = Udf <int, bool>((x) => s_q16NumbersRegex.Match(x.ToString()).Success); DataFrame fparts = _part.Filter((Col("p_brand") != "Brand#45") & !polished(Col("p_type")) & numbers(Col("p_size"))) .Select(Col("p_partkey"), Col("p_brand"), Col("p_type"), Col("p_size")); _supplier.Filter(!complains(Col("s_comment"))) .Join(_partsupp, Col("s_suppkey") == _partsupp["ps_suppkey"]) .Select(Col("ps_partkey"), Col("ps_suppkey")) .Join(fparts, Col("ps_partkey") == fparts["p_partkey"]) .GroupBy(Col("p_brand"), Col("p_type"), Col("p_size")) .Agg(CountDistinct(Col("ps_suppkey")).As("supplier_count")) .Sort(Col("supplier_count").Desc(), Col("p_brand"), Col("p_type"), Col("p_size")) .Show(); }
internal void Q9() { Func <Column, Column> getYear = Udf <string, string>(x => x.Substring(0, 4)); Func <Column, Column, Column, Column, Column> expr = Udf <double, double, double, double, double>((x, y, v, w) => x * (1 - y) - (v * w)); DataFrame linePart = _part.Filter(Col("p_name").Contains("green")) .Join(_lineitem, Col("p_partkey") == _lineitem["l_partkey"]); DataFrame natSup = _nation.Join(_supplier, Col("n_nationkey") == _supplier["s_nationkey"]); linePart.Join(natSup, Col("l_suppkey") == natSup["s_suppkey"]) .Join(_partsupp, Col("l_suppkey") == _partsupp["ps_suppkey"] & Col("l_partkey") == _partsupp["ps_partkey"]) .Join(_orders, Col("l_orderkey") == _orders["o_orderkey"]) .Select(Col("n_name"), getYear(Col("o_orderdate")).As("o_year"), expr(Col("l_extendedprice"), Col("l_discount"), Col("ps_supplycost"), Col("l_quantity")).As("amount")) .GroupBy(Col("n_name"), Col("o_year")) .Agg(Sum(Col("amount"))) .Sort(Col("n_name"), Col("o_year").Desc()) .Show(); }
public void Run(string[] args) { if (args.Length != 1) { Console.Error.WriteLine( "Usage: Sql.VectorDataFrameUdfs <path to SPARK_HOME/examples/src/main/resources/people.json>"); Environment.Exit(1); } SparkSession spark = SparkSession .Builder() // Lower the shuffle partitions to speed up groupBy() operations. .Config("spark.sql.shuffle.partitions", "3") .AppName("SQL VectorUdfs example using .NET for Apache Spark") .GetOrCreate(); DataFrame df = spark.Read().Schema("age INT, name STRING").Json(args[0]); StructType schema = df.Schema(); Console.WriteLine(schema.SimpleString); df.Show(); df.PrintSchema(); // Grouped Map Vector UDF // able to return different shapes and record lengths df.GroupBy("age") .Apply( new StructType(new[] { new StructField("age", new IntegerType()), new StructField("nameCharCount", new IntegerType()) }), r => CountCharacters(r)) .Show(); spark.Stop(); }
internal void Q22() { Func <Column, Column> sub2 = Udf <string, string>(x => x.Substring(0, 2)); Func <Column, Column> phone = Udf <string, bool>(x => s_q22PhoneRegex.IsMatch(x)); DataFrame fcustomer = _customer.Select(Col("c_acctbal"), Col("c_custkey"), sub2(Col("c_phone")).As("cntrycode")) .Filter(phone(Col("cntrycode"))); DataFrame avg_customer = fcustomer.Filter(Col("c_acctbal") > 0.0) .Agg(Avg(Col("c_acctbal")).As("avg_acctbal")); _orders.GroupBy(Col("o_custkey")) .Agg(Col("o_custkey")).Select(Col("o_custkey")) .Join(fcustomer, Col("o_custkey") == fcustomer["c_custkey"], "right_outer") .Filter(Col("o_custkey").IsNull()) .Join(avg_customer) .Filter(Col("c_acctbal") > Col("avg_acctbal")) .GroupBy(Col("cntrycode")) .Agg(Count(Col("c_acctbal")).As("numcust"), Sum(Col("c_acctbal")).As("totacctbal")) .Sort(Col("cntrycode")) .Show(); }