Esempio n. 1
0
        /// <summary>
        /// Reads an Avro data set, adds a "Region" column derived from the nested
        /// <c>address.city</c> field, drops the "orderunits" and "address" columns and
        /// writes the result as CSV (coalesced to one partition) below a UTC-timestamped
        /// sub-path of the output root.
        /// </summary>
        /// <param name="args">
        /// <c>args[0]</c> is the Avro input path; <c>args[1]</c> is the output root path.
        /// </param>
        /// <exception cref="ArgumentException">Fewer than two arguments were supplied.</exception>
        static void Main(string[] args)
        {
            // Fail fast, before a Spark session is started, when the paths are missing.
            if (args == null || args.Length < 2)
            {
                throw new ArgumentException(
                    "Expected two arguments: <avro input path> <output root path>.", nameof(args));
            }

            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("emrApp")
                                 .GetOrCreate();

            DataFrame dataFrame = spark
                                  .Read()
                                  .Format("avro")
                                  .Load(args[0]);

            RegionModel regionModel = new RegionModel();

            // city_23 --> 23 --> 2 --> {2 : Brisbane} --> ** Brisbane **
            // Values that do not follow the "<prefix>_<code>" shape, and codes missing
            // from the conversion table, map to null instead of failing the whole job.
            Func<Column, Column> udfConvertRegion = Udf<string, string>(
                city =>
                {
                    if (string.IsNullOrEmpty(city))
                    {
                        return null;
                    }

                    string[] parts = city.Split('_');
                    if (parts.Length < 2 || parts[1].Length == 0)
                    {
                        return null;
                    }

                    // Only the first character after the underscore selects the region.
                    string regionCode = parts[1].Substring(0, 1);
                    string convertedRegion;
                    bool found = regionModel.ConversionTable.TryGetValue(regionCode, out convertedRegion);
                    return found ? convertedRegion : null;
                });

            dataFrame = dataFrame
                        .WithColumn("Region", udfConvertRegion(dataFrame["address.city"]))
                        .Drop("orderunits", "address");

            // "HH" is the 24-hour clock; "hh" would make runs 12 hours apart collide.
            // The invariant culture keeps '/' a literal slash: in a custom format string
            // '/' is otherwise replaced by the current culture's date separator.
            string timestamp = DateTime.UtcNow.ToString(
                "yyyy/MM/dd/HH-mm-ss",
                System.Globalization.CultureInfo.InvariantCulture);

            dataFrame
            .Coalesce(1)
            .Write()
            .Format("csv")
            .Save($"{args[1]}/{timestamp}");
        }
Esempio n. 2
0
        /// <summary>
        /// Reads the ';'-delimited file <c>streets.csv</c>, builds a "STREET" column from
        /// the CECHA, NAZWA_1 and NAZWA_2 columns, counts the rows per street and writes
        /// the counts, ordered by descending count, as ';'-delimited CSV to <c>result</c>.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            Console.WriteLine("Start SparkSession");
            var session = SparkSession.Builder().AppName("Street Counter").GetOrCreate();

            // Explicit schema: every column is declared as a string.
            var schemaDdl = "WOJ string ,POW string ,GMI string ,RODZ_GMI string , " +
                            "SYM string , SYM_UL string , " +
                            "CECHA string , NAZWA_1 string ,NAZWA_2 string , " +
                            "STAN_NA string";
            var reader = session.Read().Option("delimiter", ";").Schema(schemaDdl);
            var rawStreets = reader.Csv("streets.csv");

            // Street name = the three name parts joined with a single space.
            var streetName = Functions.ConcatWs(
                " ", rawStreets["CECHA"], rawStreets["NAZWA_1"], rawStreets["NAZWA_2"]);
            var withStreet = rawStreets.WithColumn("STREET", streetName);

            // Count rows per street; the generated "count" column is renamed to upper case.
            var streetCounts = withStreet
                .Select("STREET")
                .GroupBy("STREET")
                .Count()
                .WithColumnRenamed("count", "COUNT");

            // Highest counts first.
            var byCountDescending = streetCounts["COUNT"].Desc();
            var ranked = streetCounts.OrderBy(byCountDescending);

            // Coalesce to one partition before writing.
            var writer = ranked.Coalesce(1).Write().Option("delimiter", ";");
            writer.Csv("result");

            session.Stop();
            Console.WriteLine("Stop SparkSession");
        }
Esempio n. 3
0
        /// <summary>
        /// Checks that calling <c>Coalesce</c> on a <c>DataFrame</c> built over the mocked
        /// proxy invokes the proxy's <c>Coalesce</c> exactly once with the same partition count.
        /// </summary>
        public void TestCoalesce()
        {
            const int expectedPartitions = 4;

            // Arrange: start from a clean mock that accepts any partition count.
            mockDataFrameProxy.Reset();
            mockDataFrameProxy.Setup(proxy => proxy.Coalesce(It.IsAny<int>()));
            var context = new SparkContext(null);
            var frame = new DataFrame(mockDataFrameProxy.Object, context);

            // Act
            frame.Coalesce(expectedPartitions);

            // Assert: the proxy received exactly one call with the requested value.
            mockDataFrameProxy.Verify(proxy => proxy.Coalesce(expectedPartitions), Times.Once());
        }
Esempio n. 4
0
        /// <summary>
        /// Reads an Avro data set, drops the "address" column, counts the rows per
        /// "itemid", shows the counts and writes them as CSV (coalesced to one partition)
        /// below a UTC-timestamped sub-path of the output root.
        /// </summary>
        /// <param name="args">
        /// <c>args[0]</c> is the Avro input path; <c>args[1]</c> is the output root path.
        /// </param>
        /// <exception cref="ArgumentException">Fewer than two arguments were supplied.</exception>
        static void Main(string[] args)
        {
            // Fail fast, before a Spark session is started, when the paths are missing.
            if (args == null || args.Length < 2)
            {
                throw new ArgumentException(
                    "Expected two arguments: <avro input path> <output root path>.", nameof(args));
            }

            SparkSession spark = SparkSession.Builder()
                                 .AppName("emrapp")
                                 .GetOrCreate();
            DataFrame df = spark.Read()
                           .Format("avro")
                           .Load(args[0]);

            df = df.Drop("address")
                 .GroupBy("itemid")
                 .Count();

            df.Show();

            // "HH" is the 24-hour clock; "hh" would make runs 12 hours apart collide.
            // The invariant culture keeps '/' a literal slash: in a custom format string
            // '/' is otherwise replaced by the current culture's date separator.
            string timestamp = DateTime.UtcNow.ToString(
                "yyyy/MM/dd/HH-mm-ss",
                System.Globalization.CultureInfo.InvariantCulture);

            df.Coalesce(1)
            .Write()
            .Format("csv")
            .Save($"{args[1]}/{timestamp}");
        }
Esempio n. 5
0
        /// <summary>
        /// Signature smoke test: invokes a broad set of <c>DataFrame</c> members and
        /// overloads on the shared <c>_df</c> fixture. Only the schema (non-null) and the
        /// column count (2) are asserted; every other call merely has to compile against
        /// the intended overload and run without throwing.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the method name suggests these are the signatures expected for
        /// Spark 2.3.x — confirm against the test class. The fixture presumably exposes
        /// "name" and "age" columns, since those are the only column names used below.
        /// </remarks>
        public void TestSignaturesV2_3_X()
        {
            // Column access through the indexer.
            Column col = _df["name"];

            col = _df["age"];

            // ToDF, without and with new column names.
            DataFrame df = _df.ToDF();

            df = df.ToDF("name2", "age2");

            // Schema access and printing.
            StructType schema = _df.Schema();

            Assert.NotNull(schema);

            _df.PrintSchema();

            // Explain: default, extended = true, extended = false.
            _df.Explain();
            _df.Explain(true);
            _df.Explain(false);

            // The fixture is expected to have exactly two columns.
            Assert.Equal(2, _df.Columns().ToArray().Length);

            _df.IsLocal();

            _df.IsStreaming();

            // Checkpointing; the temporary directory is disposed at the end of the using block.
            using (var tempDir = new TemporaryDirectory())
            {
                // The following is required for *CheckPoint().
                _spark.SparkContext.SetCheckpointDir(tempDir.Path);

                _df.Checkpoint();
                _df.Checkpoint(false);

                _df.LocalCheckpoint();
                _df.LocalCheckpoint(false);
            }

            _df.WithWatermark("time", "10 minutes");

            // Show overloads.
            _df.Show();
            _df.Show(10);
            _df.Show(10, 10);
            _df.Show(10, 10, true);

            // Join overloads: no condition, single column name, column-name array
            // (optionally with a join type), and Column condition (optionally with a join type).
            _df.Join(_df);
            _df.Join(_df, "name");
            _df.Join(_df, new[] { "name" });
            _df.Join(_df, new[] { "name" }, "outer");
            _df.Join(_df, _df["age"] == _df["age"]);
            _df.Join(_df, _df["age"] == _df["age"], "outer");

            _df.CrossJoin(_df);

            // Sorting: each method is called with column names, with no arguments,
            // and with Column objects.
            _df.SortWithinPartitions("age");
            _df.SortWithinPartitions("age", "name");
            _df.SortWithinPartitions();
            _df.SortWithinPartitions(_df["age"]);
            _df.SortWithinPartitions(_df["age"], _df["name"]);

            _df.Sort("age");
            _df.Sort("age", "name");
            _df.Sort();
            _df.Sort(_df["age"]);
            _df.Sort(_df["age"], _df["name"]);

            _df.OrderBy("age");
            _df.OrderBy("age", "name");
            _df.OrderBy();
            _df.OrderBy(_df["age"]);
            _df.OrderBy(_df["age"], _df["name"]);

            // Hint, without and with parameters.
            _df.Hint("broadcast");
            _df.Hint("broadcast", new[] { "hello", "world" });

            // Column lookup and aliasing.
            _df.Col("age");

            _df.ColRegex("age");

            _df.As("alias");

            _df.Alias("alias");

            // Projection: by name, empty, and by Column; then SQL expressions.
            _df.Select("age");
            _df.Select("age", "name");
            _df.Select();
            _df.Select(_df["age"]);
            _df.Select(_df["age"], _df["name"]);

            _df.SelectExpr();
            _df.SelectExpr("age * 2");
            _df.SelectExpr("age * 2", "abs(age)");

            // Filtering: Column condition and SQL string, via both Filter and Where.
            _df.Filter(_df["age"] > 21);
            _df.Filter("age > 21");

            _df.Where(_df["age"] > 21);
            _df.Where("age > 21");

            // Grouping: GroupBy, Rollup and Cube, each by name, empty, and by Column.
            _df.GroupBy("age");
            _df.GroupBy("age", "name");
            _df.GroupBy();
            _df.GroupBy(_df["age"]);
            _df.GroupBy(_df["age"], _df["name"]);

            _df.Rollup("age");
            _df.Rollup("age", "name");
            _df.Rollup();
            _df.Rollup(_df["age"]);
            _df.Rollup(_df["age"], _df["name"]);

            _df.Cube("age");
            _df.Cube("age", "name");
            _df.Cube();
            _df.Cube(_df["age"]);
            _df.Cube(_df["age"], _df["name"]);

            // Aggregation with one and with two expressions.
            _df.Agg(Avg(_df["age"]));
            _df.Agg(Avg(_df["age"]), Avg(_df["name"]));

            _df.Limit(10);

            // Set-style operations of the frame with itself.
            _df.Union(_df);

            _df.UnionByName(_df);

            _df.Intersect(_df);

            _df.Except(_df);

            // Sampling and splitting, without and with an explicit seed.
            _df.Sample(0.5);
            _df.Sample(0.5, true);
            _df.Sample(0.5, false, 12345);

            _df.RandomSplit(new[] { 0.2, 0.8 });
            _df.RandomSplit(new[] { 0.2, 0.8 }, 12345);

            // Adding, renaming and dropping columns.
            _df.WithColumn("age2", _df["age"]);

            _df.WithColumnRenamed("age", "age2");

            _df.Drop();
            _df.Drop("age");
            _df.Drop("age", "name");

            _df.Drop(_df["age"]);

            _df.DropDuplicates();
            _df.DropDuplicates("age");
            _df.DropDuplicates("age", "name");

            // Describe and Summary, each with zero, one and two arguments.
            _df.Describe();
            _df.Describe("age");
            _df.Describe("age", "name");

            _df.Summary();
            _df.Summary("count");
            _df.Summary("count", "mean");

            // Fetching rows; ToArray() enumerates the returned sequences.
            _df.Head(2);
            _df.Head();

            _df.First();

            _df.Take(3).ToArray();

            _df.Collect().ToArray();

            _df.ToLocalIterator().ToArray();

            _df.Count();

            // Repartitioning overloads.
            _df.Repartition(2);
            _df.Repartition(2, _df["age"]);
            _df.Repartition(_df["age"]);
            _df.Repartition();

            _df.RepartitionByRange(2, _df["age"]);
            _df.RepartitionByRange(_df["age"]);

            _df.Coalesce(1);

            _df.Distinct();

            // Persistence.
            _df.Persist();

            _df.Cache();

            _df.Unpersist();

            // View registration: each Create* call is followed by its CreateOrReplace*
            // counterpart using the same view name.
            _df.CreateTempView("view");
            _df.CreateOrReplaceTempView("view");

            _df.CreateGlobalTempView("global_view");
            _df.CreateOrReplaceGlobalTempView("global_view");
        }