/// <inheritdoc cref="Analyzer{S,M}.ComputeStateFrom"/>
public override Option<S> ComputeStateFrom(DataFrame dataFrame)
{
    // Agg takes the first aggregation column separately from the remaining ones,
    // so the sequence returned by AggregationFunctions() is split into head and tail.
    IEnumerable<Column> columns = AggregationFunctions();
    Column head = columns.First();
    Column[] tail = columns.Skip(1).ToArray();

    // FirstOrDefault is used on purpose: when nothing is collected, the default value
    // is handed on to FromAggregationResult instead of throwing here.
    Row aggregated = dataFrame
        .Agg(head, tail)
        .Collect()
        .FirstOrDefault();

    // This analyzer runs on its own, so its columns start at offset 0 in the result row.
    return FromAggregationResult(aggregated, 0);
}
/// <summary>
/// Shows the supplier(s) whose summed discounted revenue is the highest among line items
/// shipped on or after 1996-01-01 and before 1996-04-01, ordered by supplier key.
/// </summary>
/// <remarks>
/// NOTE(review): the name and column prefixes suggest this is TPC-H query 15 - confirm.
/// </remarks>
internal void Q15()
{
    // Discounted price: price * (1 - discount).
    Func<Column, Column, Column> discounted =
        Udf<double, double, double>((price, discount) => price * (1 - discount));

    // Relational operators bind tighter than '&', so this is (ship >= start) & (ship < end).
    Column shippedInWindow =
        Col("l_shipdate") >= "1996-01-01" & Col("l_shipdate") < "1996-04-01";

    // Total discounted revenue per supplier inside the shipping window.
    DataFrame windowItems = _lineitem.Filter(shippedInWindow);
    DataFrame itemValues = windowItems.Select(
        Col("l_suppkey"),
        discounted(Col("l_extendedprice"), Col("l_discount")).As("value"));
    DataFrame revenue = itemValues
        .GroupBy(Col("l_suppkey"))
        .Agg(Sum(Col("value")).As("total"));

    // Single-row frame holding the best total; joining it back keeps only the top supplier(s).
    DataFrame best = revenue.Agg(Max(Col("total")).As("max_total"));
    DataFrame topRevenue = best.Join(revenue, Col("max_total") == revenue["total"]);
    DataFrame topSuppliers =
        topRevenue.Join(_supplier, Col("l_suppkey") == _supplier["s_suppkey"]);

    DataFrame report = topSuppliers.Select(
        Col("s_suppkey"),
        Col("s_name"),
        Col("s_address"),
        Col("s_phone"),
        Col("total"));

    report.Sort(Col("s_suppkey")).Show();
}
/// <summary>
/// Shows, for suppliers located in the nation named "GERMANY", every part whose summed
/// stock value (supply cost times available quantity) exceeds 0.0001 of the summed value
/// over all such parts, ordered by that per-part value in descending order.
/// </summary>
/// <remarks>
/// NOTE(review): the name and column prefixes suggest this is TPC-H query 11 - confirm.
/// </remarks>
internal void Q11()
{
    // Value of one part/supplier entry: supply cost * available quantity.
    Func<Column, Column, Column> stockValue =
        Udf<double, int, double>((cost, quantity) => cost * quantity);

    // Threshold applied to the overall total.
    Func<Column, Column> fractionOfTotal = Udf<double, double>(total => total * 0.0001);

    // Supplier keys of all suppliers in the selected nation.
    DataFrame germany = _nation.Filter(Col("n_name") == "GERMANY");
    DataFrame germanSuppliers = germany
        .Join(_supplier, Col("n_nationkey") == _supplier["s_nationkey"])
        .Select(Col("s_suppkey"));

    // One row per part/supplier entry with its stock value.
    DataFrame partValues = germanSuppliers
        .Join(_partsupp, Col("s_suppkey") == _partsupp["ps_suppkey"])
        .Select(
            Col("ps_partkey"),
            stockValue(Col("ps_supplycost"), Col("ps_availqty")).As("value"));

    // Single-row frame with the total over all entries.
    DataFrame overall = partValues.Agg(Sum("value").As("total_value"));

    DataFrame perPart = partValues
        .GroupBy(Col("ps_partkey"))
        .Agg(Sum("value").As("part_value"));

    // The join condition doubles as the filter: only parts above the threshold survive.
    DataFrame significant =
        perPart.Join(overall, Col("part_value") > fractionOfTotal(Col("total_value")));

    significant.Sort(Col("part_value").Desc()).Show();
}
/// <summary>
/// Calls each DataFrame member listed below once per overload shape so that a mismatch
/// between a managed signature and the underlying implementation surfaces as a failure.
/// Apart from the schema and column-count assertions, return values are not inspected.
/// </summary>
public void TestSignaturesV2_3_X()
{
    // Indexer.
    Column column = _df["name"];
    column = _df["age"];

    // ToDF with and without new column names.
    DataFrame renamed = _df.ToDF();
    renamed = renamed.ToDF("name2", "age2");

    // Schema and plan inspection.
    StructType structType = _df.Schema();
    Assert.NotNull(structType);

    _df.PrintSchema();

    _df.Explain();
    _df.Explain(true);
    _df.Explain(false);

    Assert.Equal(2, _df.Columns().ToArray().Length);

    _df.IsLocal();

    _df.IsStreaming();

    // Checkpointing.
    using (var checkpointDir = new TemporaryDirectory())
    {
        // A checkpoint directory has to be configured before any *Checkpoint() call.
        _spark.SparkContext.SetCheckpointDir(checkpointDir.Path);

        _df.Checkpoint();
        _df.Checkpoint(false);

        _df.LocalCheckpoint();
        _df.LocalCheckpoint(false);
    }

    _df.WithWatermark("time", "10 minutes");

    // Show.
    _df.Show();
    _df.Show(10);
    _df.Show(10, 10);
    _df.Show(10, 10, true);

    // Joins.
    _df.Join(_df);
    _df.Join(_df, "name");
    _df.Join(_df, new[] { "name" });
    _df.Join(_df, new[] { "name" }, "outer");
    _df.Join(_df, _df["age"] == _df["age"]);
    _df.Join(_df, _df["age"] == _df["age"], "outer");

    _df.CrossJoin(_df);

    // Ordering.
    _df.SortWithinPartitions("age");
    _df.SortWithinPartitions("age", "name");
    _df.SortWithinPartitions();
    _df.SortWithinPartitions(_df["age"]);
    _df.SortWithinPartitions(_df["age"], _df["name"]);

    _df.Sort("age");
    _df.Sort("age", "name");
    _df.Sort();
    _df.Sort(_df["age"]);
    _df.Sort(_df["age"], _df["name"]);

    _df.OrderBy("age");
    _df.OrderBy("age", "name");
    _df.OrderBy();
    _df.OrderBy(_df["age"]);
    _df.OrderBy(_df["age"], _df["name"]);

    // Hints, column lookup and aliasing.
    _df.Hint("broadcast");
    _df.Hint("broadcast", new[] { "hello", "world" });

    _df.Col("age");

    _df.ColRegex("age");

    _df.As("alias");

    _df.Alias("alias");

    // Projection.
    _df.Select("age");
    _df.Select("age", "name");
    _df.Select();
    _df.Select(_df["age"]);
    _df.Select(_df["age"], _df["name"]);

    _df.SelectExpr();
    _df.SelectExpr("age * 2");
    _df.SelectExpr("age * 2", "abs(age)");

    // Filtering.
    _df.Filter(_df["age"] > 21);
    _df.Filter("age > 21");

    _df.Where(_df["age"] > 21);
    _df.Where("age > 21");

    // Grouping.
    _df.GroupBy("age");
    _df.GroupBy("age", "name");
    _df.GroupBy();
    _df.GroupBy(_df["age"]);
    _df.GroupBy(_df["age"], _df["name"]);

    _df.Rollup("age");
    _df.Rollup("age", "name");
    _df.Rollup();
    _df.Rollup(_df["age"]);
    _df.Rollup(_df["age"], _df["name"]);

    _df.Cube("age");
    _df.Cube("age", "name");
    _df.Cube();
    _df.Cube(_df["age"]);
    _df.Cube(_df["age"], _df["name"]);

    _df.Agg(Avg(_df["age"]));
    _df.Agg(Avg(_df["age"]), Avg(_df["name"]));

    _df.Limit(10);

    // Set operations.
    _df.Union(_df);

    _df.UnionByName(_df);

    _df.Intersect(_df);

    _df.Except(_df);

    // Sampling.
    _df.Sample(0.5);
    _df.Sample(0.5, true);
    _df.Sample(0.5, false, 12345);

    _df.RandomSplit(new[] { 0.2, 0.8 });
    _df.RandomSplit(new[] { 0.2, 0.8 }, 12345);

    // Column manipulation.
    _df.WithColumn("age2", _df["age"]);

    _df.WithColumnRenamed("age", "age2");

    _df.Drop();
    _df.Drop("age");
    _df.Drop("age", "name");
    _df.Drop(_df["age"]);

    _df.DropDuplicates();
    _df.DropDuplicates("age");
    _df.DropDuplicates("age", "name");

    // Statistics.
    _df.Describe();
    _df.Describe("age");
    _df.Describe("age", "name");

    _df.Summary();
    _df.Summary("count");
    _df.Summary("count", "mean");

    // Actions; the ToArray() calls force enumeration of the returned sequences.
    _df.Head(2);
    _df.Head();

    _df.First();

    _df.Take(3).ToArray();

    _df.Collect().ToArray();

    _df.ToLocalIterator().ToArray();

    _df.Count();

    // Partitioning.
    _df.Repartition(2);
    _df.Repartition(2, _df["age"]);
    _df.Repartition(_df["age"]);
    _df.Repartition();

    _df.RepartitionByRange(2, _df["age"]);
    _df.RepartitionByRange(_df["age"]);

    _df.Coalesce(1);

    _df.Distinct();

    // Caching.
    _df.Persist();

    _df.Cache();

    _df.Unpersist();

    // Views: each plain Create* call comes before the CreateOrReplace* call for the same name.
    _df.CreateTempView("view");
    _df.CreateOrReplaceTempView("view");

    _df.CreateGlobalTempView("global_view");
    _df.CreateOrReplaceGlobalTempView("global_view");
}
/// <summary>
/// Computes metrics for the given analyzers. Analyzers implementing
/// <c>IScanSharableAnalyzer</c> are evaluated together through a single
/// <c>dataFrame.Agg(...)</c> call; every remaining analyzer is evaluated on its own via
/// its <c>Calculate</c> method.
/// </summary>
/// <param name="dataFrame">The data the analyzers are run against.</param>
/// <param name="analyzers">The analyzers to run.</param>
/// <param name="aggregateWith">
/// Passed through unchanged to <c>SuccessOfFailureMetricFrom</c> and <c>Calculate</c>.
/// </param>
/// <param name="saveStateTo">
/// Passed through unchanged to <c>SuccessOfFailureMetricFrom</c> and <c>Calculate</c>.
/// </param>
/// <returns>
/// The context of the shared-scan metrics combined (via <c>+</c>) with the context of the
/// individually calculated metrics.
/// </returns>
private static AnalyzerContext RunScanningAnalyzers(
    DataFrame dataFrame,
    IEnumerable<IAnalyzer<IMetric>> analyzers,
    Option<IStateLoader> aggregateWith,
    Option<IStatePersister> saveStateTo)
{
    // Materialize once: OfType/Except are deferred and these sequences are walked
    // several times below.
    List<IScanSharableAnalyzer<IState, IMetric>> sharable =
        analyzers.OfType<IScanSharableAnalyzer<IState, IMetric>>().ToList();
    List<IAnalyzer<IMetric>> others = analyzers.Except(sharable).ToList();

    AnalyzerContext sharedResults;

    if (sharable.Count > 0)
    {
        List<KeyValuePair<IAnalyzer<IMetric>, IMetric>> metricsByAnalyzer;

        try
        {
            // Ask every analyzer for its aggregation columns exactly once and remember
            // where each analyzer's columns start inside the combined result row.
            var aggregations = new List<Column>();
            var offsets = new List<int>(sharable.Count);
            foreach (IScanSharableAnalyzer<IState, IMetric> analyzer in sharable)
            {
                offsets.Add(aggregations.Count);
                aggregations.AddRange(analyzer.AggregationFunctions());
            }

            Row results = dataFrame
                .Agg(aggregations.FirstOrDefault(), aggregations.Skip(1).ToArray())
                .Collect()
                .First();

            // ToList() forces evaluation while still inside the try block. Left deferred,
            // the projection would only run when the dictionary below is built, and any
            // exception would bypass the catch clause.
            metricsByAnalyzer = sharable
                .Select((analyzer, index) => new KeyValuePair<IAnalyzer<IMetric>, IMetric>(
                    analyzer,
                    SuccessOfFailureMetricFrom(
                        analyzer, results, offsets[index], aggregateWith, saveStateTo)))
                .ToList();
        }
        catch (Exception e)
        {
            // The scan is shared, so a failure anywhere in it is reported as the failure
            // metric of every sharable analyzer.
            metricsByAnalyzer = sharable
                .Select(analyzer => new KeyValuePair<IAnalyzer<IMetric>, IMetric>(
                    analyzer, analyzer.ToFailureMetric(e)))
                .ToList();
        }

        sharedResults = new AnalyzerContext(
            new Dictionary<IAnalyzer<IMetric>, IMetric>(metricsByAnalyzer));
    }
    else
    {
        sharedResults = AnalyzerContext.Empty();
    }

    // Analyzers that cannot share the scan are calculated one by one.
    Dictionary<IAnalyzer<IMetric>, IMetric> otherMetrics =
        new Dictionary<IAnalyzer<IMetric>, IMetric>(
            others.Select(analyzer => new KeyValuePair<IAnalyzer<IMetric>, IMetric>(
                analyzer, analyzer.Calculate(dataFrame, aggregateWith, saveStateTo))));

    return sharedResults + new AnalyzerContext(otherMetrics);
}