예제 #1
0
        /// <inheritdoc cref="Analyzer{S,M}.ComputeStateFrom"/>
        /// <remarks>
        /// Evaluates every column returned by <c>AggregationFunctions()</c> in a single
        /// <c>Agg</c> call and hands the first collected row (or <c>null</c> when no row
        /// comes back) to <c>FromAggregationResult</c> with offset 0.
        /// </remarks>
        public override Option<S> ComputeStateFrom(DataFrame dataFrame)
        {
            // Materialize once: the sequence is consumed twice below (head and tail),
            // and re-enumerating a deferred sequence would rebuild the columns each time.
            Column[] aggregations = AggregationFunctions().ToArray();

            // First() is kept on purpose so an empty aggregation list still fails
            // with the same InvalidOperationException as before.
            Row result = dataFrame
                         .Agg(aggregations.First(), aggregations.Skip(1).ToArray())
                         .Collect()
                         .FirstOrDefault();

            // The analyzer's columns start at index 0 because nothing else shares this row.
            return FromAggregationResult(result, 0);
        }
예제 #2
0
        /// <summary>
        /// Shows the supplier(s) whose summed discounted revenue, over line items shipped
        /// from 1996-01-01 (inclusive) to 1996-04-01 (exclusive), equals the maximum such
        /// total. Presumably the TPC-H "top supplier" query (Q15) - confirm against the suite.
        /// </summary>
        internal void Q15()
        {
            // extended price reduced by the discount fraction
            Func<Column, Column, Column> discountedPrice =
                Udf<double, double, double>((price, discount) => price * (1 - discount));

            Column shippedInWindow = Col("l_shipdate") >= "1996-01-01" &
                                     Col("l_shipdate") < "1996-04-01";

            // One row per qualifying line item: supplier key plus its discounted value.
            DataFrame valuedItems = _lineitem
                                    .Filter(shippedInWindow)
                                    .Select(Col("l_suppkey"),
                                            discountedPrice(Col("l_extendedprice"), Col("l_discount")).As("value"));

            // Total revenue per supplier.
            DataFrame supplierRevenue = valuedItems
                                        .GroupBy(Col("l_suppkey"))
                                        .Agg(Sum(Col("value")).As("total"));

            // Single-row frame holding the best total; joining it back keeps only the top supplier(s).
            DataFrame bestTotal = supplierRevenue.Agg(Max(Col("total")).As("max_total"));

            DataFrame topSuppliers = bestTotal
                                     .Join(supplierRevenue, Col("max_total") == supplierRevenue["total"])
                                     .Join(_supplier, Col("l_suppkey") == _supplier["s_suppkey"]);

            topSuppliers
            .Select(Col("s_suppkey"), Col("s_name"), Col("s_address"), Col("s_phone"), Col("total"))
            .Sort(Col("s_suppkey"))
            .Show();
        }
예제 #3
0
        /// <summary>
        /// Shows, in descending order of value, the parts supplied from GERMANY whose
        /// summed (supply cost * available quantity) exceeds 0.0001 of the overall sum.
        /// Presumably the TPC-H "important stock" query (Q11) - confirm against the suite.
        /// </summary>
        internal void Q11()
        {
            Func<Column, Column, Column> stockValue = Udf<double, int, double>((cost, qty) => cost * qty);
            Func<Column, Column> threshold = Udf<double, double>(total => total * 0.0001);

            // Keys of suppliers located in the selected nation.
            DataFrame germanSuppliers = _nation
                                        .Filter(Col("n_name") == "GERMANY")
                                        .Join(_supplier, Col("n_nationkey") == _supplier["s_nationkey"])
                                        .Select(Col("s_suppkey"));

            // One row per (supplier, part) offer with its stock value.
            DataFrame partValues = germanSuppliers
                                   .Join(_partsupp, Col("s_suppkey") == _partsupp["ps_suppkey"])
                                   .Select(Col("ps_partkey"),
                                           stockValue(Col("ps_supplycost"), Col("ps_availqty")).As("value"));

            // Single-row frame with the grand total, used as the comparison baseline.
            DataFrame grandTotal = partValues.Agg(Sum("value").As("total_value"));

            DataFrame valuePerPart = partValues
                                     .GroupBy(Col("ps_partkey"))
                                     .Agg(Sum("value").As("part_value"));

            valuePerPart
            .Join(grandTotal, Col("part_value") > threshold(Col("total_value")))
            .Sort(Col("part_value").Desc())
            .Show();
        }
예제 #4
0
        // Smoke test over the DataFrame API surface: every call below exercises one
        // overload on the shared _df fixture. Apart from the two Assert calls, return
        // values are discarded, so the test presumably only requires that each call
        // completes without throwing. The "V2_3_X" suffix suggests these are the
        // members available from Spark 2.3.x onward - confirm against sibling tests.
        // Statement order matters in a few places (checkpoint directory before
        // Checkpoint, CreateTempView before CreateOrReplaceTempView); keep it as is.
        public void TestSignaturesV2_3_X()
        {
            // Column access through the indexer.
            Column col = _df["name"];

            col = _df["age"];

            // ToDF without and with new column names.
            DataFrame df = _df.ToDF();

            df = df.ToDF("name2", "age2");

            StructType schema = _df.Schema();

            Assert.NotNull(schema);

            _df.PrintSchema();

            // Explain: default, extended and non-extended forms.
            _df.Explain();
            _df.Explain(true);
            _df.Explain(false);

            // The fixture is expected to expose exactly two columns.
            Assert.Equal(2, _df.Columns().ToArray().Length);

            _df.IsLocal();

            _df.IsStreaming();

            using (var tempDir = new TemporaryDirectory())
            {
                // The following is required for *CheckPoint().
                _spark.SparkContext.SetCheckpointDir(tempDir.Path);

                _df.Checkpoint();
                _df.Checkpoint(false);

                _df.LocalCheckpoint();
                _df.LocalCheckpoint(false);
            }

            _df.WithWatermark("time", "10 minutes");

            // Show: overloads taking from zero up to three arguments.
            _df.Show();
            _df.Show(10);
            _df.Show(10, 10);
            _df.Show(10, 10, true);

            // Join: no condition, column name(s), column expression, optional join type.
            _df.Join(_df);
            _df.Join(_df, "name");
            _df.Join(_df, new[] { "name" });
            _df.Join(_df, new[] { "name" }, "outer");
            _df.Join(_df, _df["age"] == _df["age"]);
            _df.Join(_df, _df["age"] == _df["age"], "outer");

            _df.CrossJoin(_df);

            // Sorting family: each method is called with string names, with no
            // arguments, and with Column arguments.
            _df.SortWithinPartitions("age");
            _df.SortWithinPartitions("age", "name");
            _df.SortWithinPartitions();
            _df.SortWithinPartitions(_df["age"]);
            _df.SortWithinPartitions(_df["age"], _df["name"]);

            _df.Sort("age");
            _df.Sort("age", "name");
            _df.Sort();
            _df.Sort(_df["age"]);
            _df.Sort(_df["age"], _df["name"]);

            _df.OrderBy("age");
            _df.OrderBy("age", "name");
            _df.OrderBy();
            _df.OrderBy(_df["age"]);
            _df.OrderBy(_df["age"], _df["name"]);

            // Hint: name only, and name with parameters.
            _df.Hint("broadcast");
            _df.Hint("broadcast", new[] { "hello", "world" });

            _df.Col("age");

            _df.ColRegex("age");

            _df.As("alias");

            _df.Alias("alias");

            // Projection: string names, no arguments, Column arguments, SQL expressions.
            _df.Select("age");
            _df.Select("age", "name");
            _df.Select();
            _df.Select(_df["age"]);
            _df.Select(_df["age"], _df["name"]);

            _df.SelectExpr();
            _df.SelectExpr("age * 2");
            _df.SelectExpr("age * 2", "abs(age)");

            // Filter / Where: Column condition and SQL string condition.
            _df.Filter(_df["age"] > 21);
            _df.Filter("age > 21");

            _df.Where(_df["age"] > 21);
            _df.Where("age > 21");

            // Grouping family: same string / empty / Column overload pattern as sorting.
            _df.GroupBy("age");
            _df.GroupBy("age", "name");
            _df.GroupBy();
            _df.GroupBy(_df["age"]);
            _df.GroupBy(_df["age"], _df["name"]);

            _df.Rollup("age");
            _df.Rollup("age", "name");
            _df.Rollup();
            _df.Rollup(_df["age"]);
            _df.Rollup(_df["age"], _df["name"]);

            _df.Cube("age");
            _df.Cube("age", "name");
            _df.Cube();
            _df.Cube(_df["age"]);
            _df.Cube(_df["age"], _df["name"]);

            // Agg: one aggregate column, and one plus additional columns.
            _df.Agg(Avg(_df["age"]));
            _df.Agg(Avg(_df["age"]), Avg(_df["name"]));

            _df.Limit(10);

            // Set-style operations, each applied to the fixture and itself.
            _df.Union(_df);

            _df.UnionByName(_df);

            _df.Intersect(_df);

            _df.Except(_df);

            // Sample: one-, two- and three-argument overloads.
            _df.Sample(0.5);
            _df.Sample(0.5, true);
            _df.Sample(0.5, false, 12345);

            // RandomSplit: weights only, and weights with a second numeric argument.
            _df.RandomSplit(new[] { 0.2, 0.8 });
            _df.RandomSplit(new[] { 0.2, 0.8 }, 12345);

            _df.WithColumn("age2", _df["age"]);

            _df.WithColumnRenamed("age", "age2");

            // Drop: no arguments, string names, and a Column.
            _df.Drop();
            _df.Drop("age");
            _df.Drop("age", "name");

            _df.Drop(_df["age"]);

            _df.DropDuplicates();
            _df.DropDuplicates("age");
            _df.DropDuplicates("age", "name");

            _df.Describe();
            _df.Describe("age");
            _df.Describe("age", "name");

            _df.Summary();
            _df.Summary("count");
            _df.Summary("count", "mean");

            // Row retrieval; ToArray() forces enumeration of the returned sequences.
            _df.Head(2);
            _df.Head();

            _df.First();

            _df.Take(3).ToArray();

            _df.Collect().ToArray();

            _df.ToLocalIterator().ToArray();

            _df.Count();

            // Repartitioning: partition count, Column(s), both, or neither.
            _df.Repartition(2);
            _df.Repartition(2, _df["age"]);
            _df.Repartition(_df["age"]);
            _df.Repartition();

            _df.RepartitionByRange(2, _df["age"]);
            _df.RepartitionByRange(_df["age"]);

            _df.Coalesce(1);

            _df.Distinct();

            _df.Persist();

            _df.Cache();

            _df.Unpersist();

            // Each view name is created first and then passed to the matching
            // CreateOrReplace* call, so the replace variant sees an existing name.
            _df.CreateTempView("view");
            _df.CreateOrReplaceTempView("view");

            _df.CreateGlobalTempView("global_view");
            _df.CreateOrReplaceGlobalTempView("global_view");
        }
예제 #5
0
        /// <summary>
        /// Computes metrics for <paramref name="analyzers"/> over <paramref name="dataFrame"/>.
        /// Analyzers implementing <c>IScanSharableAnalyzer</c> are evaluated together through one
        /// combined <c>Agg</c> call; every remaining analyzer is evaluated through its own
        /// <c>Calculate</c> call.
        /// </summary>
        /// <param name="dataFrame">The data to analyze.</param>
        /// <param name="analyzers">The analyzers to run.</param>
        /// <param name="aggregateWith">Forwarded unchanged to the per-analyzer metric computation.</param>
        /// <param name="saveStateTo">Forwarded unchanged to the per-analyzer metric computation.</param>
        /// <returns>
        /// The context holding the shared-scan metrics combined (via <c>+</c>) with the context
        /// holding the metrics of the remaining analyzers.
        /// </returns>
        /// <remarks>
        /// Any exception raised while building the aggregation, collecting the result row or
        /// deriving a metric from it is converted into a failure metric for every scan-sharable
        /// analyzer instead of propagating.
        /// </remarks>
        private static AnalyzerContext RunScanningAnalyzers(DataFrame dataFrame,
                                                            IEnumerable<IAnalyzer<IMetric>> analyzers,
                                                            Option<IStateLoader> aggregateWith,
                                                            Option<IStatePersister> saveStateTo
                                                            )
        {
            // Materialize up front: the inputs are traversed several times below, and a
            // deferred sequence would be re-evaluated on every traversal.
            List<IAnalyzer<IMetric>> allAnalyzers = analyzers.ToList();
            List<IScanSharableAnalyzer<IState, IMetric>> sharable =
                allAnalyzers.OfType<IScanSharableAnalyzer<IState, IMetric>>().ToList();
            IEnumerable<IAnalyzer<IMetric>> others = allAnalyzers.Except(sharable);

            AnalyzerContext sharedResults;

            if (sharable.Count > 0)
            {
                IEnumerable<KeyValuePair<IAnalyzer<IMetric>, IMetric>> metricsByAnalyzer;

                try
                {
                    // Ask each analyzer for its columns exactly once; the same arrays feed
                    // both the combined aggregation and the offset computation.
                    List<Column[]> columnsPerAnalyzer = sharable
                                                        .Select(analyzer => analyzer.AggregationFunctions().ToArray())
                                                        .ToList();

                    Column[] aggregations = columnsPerAnalyzer.SelectMany(columns => columns).ToArray();

                    // offsets[k] is the position of analyzer k's first column in the result row.
                    List<int> offsets = new List<int>(sharable.Count);
                    int nextOffset = 0;
                    foreach (Column[] columns in columnsPerAnalyzer)
                    {
                        offsets.Add(nextOffset);
                        nextOffset += columns.Length;
                    }

                    Row results = dataFrame
                                  .Agg(aggregations.FirstOrDefault(), aggregations.Skip(1).ToArray())
                                  .Collect()
                                  .First();

                    // ToList() forces evaluation here, inside the try block. Left deferred, the
                    // metric computation would run later while the dictionary is built and its
                    // exceptions would bypass the catch below.
                    metricsByAnalyzer = sharable
                                        .Select((analyzer, index) => new KeyValuePair<IAnalyzer<IMetric>, IMetric>(
                                                    analyzer,
                                                    SuccessOfFailureMetricFrom(analyzer, results, offsets[index],
                                                                               aggregateWith, saveStateTo)))
                                        .ToList();
                }
                catch (Exception e)
                {
                    // The scan is shared, so one failure is reported against every sharable analyzer.
                    metricsByAnalyzer = sharable
                                        .Select(analyzer => new KeyValuePair<IAnalyzer<IMetric>, IMetric>(
                                                    analyzer, analyzer.ToFailureMetric(e)))
                                        .ToList();
                }

                Dictionary<IAnalyzer<IMetric>, IMetric> metricsByAnalyzerDict =
                    new Dictionary<IAnalyzer<IMetric>, IMetric>(metricsByAnalyzer);
                sharedResults = new AnalyzerContext(metricsByAnalyzerDict);
            }
            else
            {
                sharedResults = AnalyzerContext.Empty();
            }

            // Analyzers that cannot share the scan compute their metric on their own.
            Dictionary<IAnalyzer<IMetric>, IMetric> otherMetrics = new Dictionary<IAnalyzer<IMetric>, IMetric>(
                others.Select(analyzer =>
                              new KeyValuePair<IAnalyzer<IMetric>, IMetric>(analyzer,
                                                                              analyzer.Calculate(dataFrame, aggregateWith, saveStateTo))));

            return sharedResults + new AnalyzerContext(otherMetrics);
        }