Example #1
        public static void runSpark(string file_path, string cores, string nodes, int nrows)
        {
            // Create Spark session
            SparkSession spark =
                SparkSession
                .Builder()
                .AppName("word_count_sample")
                .Config("spark.executor.cores", cores)
                .Config("spark.executor.instances", nodes)
                .GetOrCreate();

            // Create initial DataFrame

            DataFrame dataFrame = spark
                                  .Read()
                                  .Option("header", true)
                                  .Option("inferSchema", true)
                                  .Schema("quizzes string, solutions string")
                                  .Csv(file_path);

            DataFrame dataFrame2 = dataFrame.Limit(nrows);

            spark.Udf().Register <string, string>(
                "SudokuUDF",
                (sudoku) => sudokusolution(sudoku));

            dataFrame2.CreateOrReplaceTempView("Resolved");
            DataFrame sqlDf = spark.Sql("SELECT quizzes, SukoduUDF(quizzes) as Resolution from Resolved");

            sqlDf.Show();

            spark.Stop();
            Console.WriteLine("SCRAPY");
        }
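
Examples 1 through 3 delegate the actual solving to a sudokusolution helper that the snippets omit. Its shape is fixed by the Register<string, string> call above; a minimal stand-in, assuming the 81-character grid encoding implied by the "quizzes"/"solutions" columns, might look like this:

        // Hypothetical placeholder for the solver the UDF wraps; the real code
        // plugs in an actual solver (Example 3's comments mention DlxLib).
        private static string sudokusolution(string sudoku)
        {
            // Input and output are assumed to be 81-character grid strings,
            // matching the "quizzes"/"solutions" columns read above.
            return sudoku; // no-op stand-in so the sketch compiles
        }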
Example #2
        static void runSpark(string file_path, string cores, string nodes, int nrows)
        {
            // Create Spark session
            SparkSession spark =
                SparkSession
                .Builder()
                .AppName("Resolution de " + nrows + " sudokus par évolution combinatoire de " + cores + " noyau(x) et " + nodes + " noeud(s)")
                .Config("spark.executor.cores", cores)
                .Config("spark.executor.instances", nodes)
                .GetOrCreate();

            // Create initial DataFrame
            DataFrame dataFrame = spark
                                  .Read()
                                  .Option("header", true)
                                  .Option("inferSchema", true)
                                  .Schema("quizzes string, solutions string")
                                  .Csv(file_path);

            DataFrame dataFrame2 = dataFrame.Limit(nrows);

            spark.Udf().Register <string, string>(
                "SudokuUDF",
                (sudoku) => sudokusolution(sudoku));

            dataFrame2.CreateOrReplaceTempView("Resolved");
            DataFrame sqlDf = spark.Sql("SELECT quizzes, SukoduUDF(quizzes) as Resolution from Resolved");

            sqlDf.Show();

            spark.Stop();
        }
Example #3
        // Method called from Main to launch a Spark session with a given number of cores and instances and solve the sudokus via Sudokusolution().
        //private static void Sudokures(string cores, string nodes, string mem, int nrows){
        private static void Sudokures(int nrows)
        {
            // Initialize the Spark session
            SparkSession spark = SparkSession
                                 .Builder()
                                 .Config("spark.executor.memory", "4G")
                                 .GetOrCreate();
            //.AppName("Resolution of " + nrows + " sudokus using DlxLib with " + cores + " cores and " + nodes + " instances")
            //.Config("spark.driver.cores", cores)
            //.Config("spark.executor.instances", nodes)
            //.Config("spark.executor.memory", mem)
            //.GetOrCreate();

            // Load the CSV into a DataFrame
            DataFrame df = spark
                           .Read()
                           .Option("header", true)
                           .Option("inferSchema", true)
                           .Csv(_filePath);

            // Limit the DataFrame to the row count passed to the function
            DataFrame df2 = df.Limit(nrows);

            // Stopwatch covering only the sudoku resolution
            var watch2 = new System.Diagnostics.Stopwatch();

            watch2.Start();

            // Register the Spark user-defined function
            spark.Udf().Register <string, string>(
                "SudokuUDF",
                (sudoku) => Sudokusolution(sudoku));

            // Call the UDF through a new Spark DataFrame that will also hold the results
            df2.CreateOrReplaceTempView("Resolved");
            DataFrame sqlDf = spark.Sql("SELECT Sudokus, SukoduUDF(Sudokus) as Resolution from Resolved");

            sqlDf.Show();

            watch2.Stop();

            Console.WriteLine();
            Console.WriteLine();
            Console.WriteLine();
            Console.WriteLine();
            Console.WriteLine($"Execution Time for " + nrows + " sudoku resolution : " + watch2.ElapsedMilliseconds + " ms");
            //Console.WriteLine($"Execution Time for " + nrows + " sudoku resolution with " + cores + " core and " + nodes + " instance: " + watch2.ElapsedMilliseconds + " ms");
            Console.WriteLine();
            Console.WriteLine();
            Console.WriteLine();
            Console.WriteLine();

            spark.Stop();
        }
Example #4
        public void Run(string[] args)
        {
            if (args.Length != 1)
            {
                Console.Error.WriteLine(
                    "Usage: GitHubProjects <path to projects.csv>");
                Environment.Exit(1);
            }

            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("GitHub and Spark Batch")
                                 .GetOrCreate();

            DataFrame projectsDf = spark
                                   .Read()
                                   .Schema("id INT, url STRING, owner_id INT, " +
                                           "name STRING, descriptor STRING, language STRING, " +
                                           "created_at STRING, forked_from INT, deleted STRING, " +
                                           "updated_at STRING")
                                   .Csv(args[0]);

            projectsDf.Show();

            // Drop any rows with NA values
            DataFrameNaFunctions dropEmptyProjects = projectsDf.Na();
            DataFrame            cleanedProjects   = dropEmptyProjects.Drop("any");

            // Remove unnecessary columns
            cleanedProjects = cleanedProjects.Drop("id", "url", "owner_id");
            cleanedProjects.Show();

            // Average number of times each language has been forked
            DataFrame groupedDF = cleanedProjects
                                  .GroupBy("language")
                                  .Agg(Avg(cleanedProjects["forked_from"]));

            // Sort by most forked languages first
            groupedDF.OrderBy(Desc("avg(forked_from)")).Show();

            spark.Udf().Register <string, bool>(
                "MyUDF",
                (date) => DateTime.TryParse(date, out DateTime convertedDate) &&
                (convertedDate > s_referenceDate));

            cleanedProjects.CreateOrReplaceTempView("dateView");

            DataFrame dateDf = spark.Sql(
                "SELECT *, MyUDF(dateView.updated_at) AS dateAfter FROM dateView");

            dateDf.Show();

            spark.Stop();
        }
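
The UDF above compares each parsed date against an s_referenceDate field that the snippet does not show. A plausible definition (an assumption, not taken from the original sample) is just a fixed cutoff:

        // Hypothetical cutoff used by MyUDF; any fixed date works for the demo.
        private static readonly DateTime s_referenceDate = new DateTime(2015, 10, 20);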
Example #5
        public void Run(string[] args)
        {
            if (args.Length != 3)
            {
                Console.Error.WriteLine(
                    "Usage: SentimentAnalysisStream <host> <port> <model path>");
                Environment.Exit(1);
            }

            // Create Spark Session
            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("Streaming Sentiment Analysis")
                                 .GetOrCreate();

            // Setup stream connection info
            string hostname = args[0];
            string port     = args[1];

            // Read streaming data into DataFrame
            DataFrame words = spark
                              .ReadStream()
                              .Format("socket")
                              .Option("host", hostname)
                              .Option("port", port)
                              .Load();

            // Use ML.NET in a UDF to evaluate each incoming entry
            spark.Udf().Register <string, bool>(
                "MLudf",
                input => Sentiment(input, args[2]));

            // Use Spark SQL to call ML.NET UDF
            // Display results of sentiment analysis on each entry
            words.CreateOrReplaceTempView("WordsSentiment");
            DataFrame sqlDf = spark
                              .Sql("SELECT WordsSentiment.value, MLudf(WordsSentiment.value) FROM WordsSentiment");

            // Handle data continuously as it arrives
            StreamingQuery query = sqlDf
                                   .WriteStream()
                                   .Format("console")
                                   .Start();

            query.AwaitTermination();
        }
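
To exercise this stream locally, anything that writes lines to the chosen host and port will do; Spark's own structured-streaming walkthroughs typically pair this kind of socket source with a netcat listener (nc -lk <port>) started before the job.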
Example #6
        public void Run(string[] args)
        {
            if (args.Length != 2)
            {
                Console.Error.WriteLine(
                    "Usage: <path to yelptest.csv> <path to MLModel.zip>");
                Environment.Exit(1);
            }

            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName(".NET for Apache Spark Sentiment Analysis")
                                 .GetOrCreate();

            // Read in and display Yelp reviews
            DataFrame df = spark
                           .Read()
                           .Option("header", true)
                           .Option("inferSchema", true)
                           .Csv(args[0]);

            df.Show();

            // Use ML.NET in a UDF to evaluate each review
            spark.Udf().Register <string, bool>(
                "MLudf",
                (text) => Sentiment(text, args[1]));

            // Use Spark SQL to call ML.NET UDF
            // Display results of sentiment analysis on reviews
            df.CreateOrReplaceTempView("Reviews");
            DataFrame sqlDf = spark.Sql("SELECT ReviewText, MLudf(ReviewText) FROM Reviews");

            sqlDf.Show();

            // Print out first 20 rows of data
            // Prevent data getting cut off by setting truncate = 0
            sqlDf.Show(20, 0, false);

            spark.Stop();
        }
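
Examples 5 and 6 both call a Sentiment(text, modelPath) helper that loads a saved ML.NET model and scores a single string. A hedged sketch, assuming the usual ML.NET prediction-engine pattern (using Microsoft.ML and Microsoft.ML.Data) and hypothetical Review/ReviewPrediction input and output classes:

        // Sketch only: loads the model on every call for clarity; production code
        // would cache the PredictionEngine instead of rebuilding it per row.
        private static bool Sentiment(string text, string modelPath)
        {
            var mlContext = new MLContext();
            ITransformer model = mlContext.Model.Load(modelPath, out _);
            PredictionEngine<Review, ReviewPrediction> engine =
                mlContext.Model.CreatePredictionEngine<Review, ReviewPrediction>(model);
            return engine.Predict(new Review { ReviewText = text }).Prediction;
        }

        // Hypothetical schema classes matching the Register<string, bool> signature.
        private class Review
        {
            public string ReviewText;
        }

        private class ReviewPrediction : Review
        {
            [ColumnName("PredictedLabel")]
            public bool Prediction;
        }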
Example #7
        private static void ReviewsCleanup(DataFrame dataFrame)
        {
            Console.WriteLine("Ratings Clean-up");

            dataFrame = dataFrame
                        .Filter(
                dataFrame["reviewerID"].IsNotNull()
                .And(dataFrame["asin"].IsNotNull())
                .And(dataFrame["reviewText"].IsNotNull()));

            dataFrame = dataFrame
                        .WithColumnRenamed("reviewerID", "rid")
                        .WithColumnRenamed("reviewText", "review_text")
                        .WithColumnRenamed("unixReviewTime", "unix_time");

            dataFrame.Cache();

            dataFrame.CreateOrReplaceTempView("ElectronicsReviews");

            Console.WriteLine($"Reviews Count: {dataFrame.Count()}");
            Console.WriteLine("Done");
            Console.WriteLine();
        }
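
Note that the method reassigns its dataFrame parameter locally, so the caller's own reference is untouched; the cleaned rows are exposed through the ElectronicsReviews temp view instead. A follow-up query against that view (hypothetical usage, assuming the surrounding class keeps the session in a spark variable) would be:

            // Query the view registered by ReviewsCleanup.
            DataFrame cleaned = spark.Sql(
                "SELECT rid, asin, review_text FROM ElectronicsReviews");
            cleaned.Show();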
Example #8
        public void Run(string[] args)
        {
            if (args.Length != 1)
            {
                Console.Error.WriteLine(
                    "Usage: Basic <path to SPARK_HOME/examples/src/main/resources/people.json>");
                Environment.Exit(1);
            }

            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName(".NET Spark SQL basic example")
                                 .Config("spark.some.config.option", "some-value")
                                 .GetOrCreate();

            // Need to explicitly specify the schema since pickling vs. arrow formatting
            // will return different types. Pickling will turn longs into ints if the values fit.
            // Same as the "age INT, name STRING" DDL-format string.
            var inputSchema = new StructType(new[]
            {
                new StructField("age", new IntegerType()),
                new StructField("name", new StringType())
            });
            DataFrame df = spark.Read().Schema(inputSchema).Json(args[0]);

            Spark.Sql.Types.StructType schema = df.Schema();
            Console.WriteLine(schema.SimpleString);

            IEnumerable <Row> rows = df.Collect();

            foreach (Row row in rows)
            {
                Console.WriteLine(row);
            }

            df.Show();

            df.PrintSchema();

            df.Select("name", "age", "age", "name").Show();

            df.Select(df["name"], df["age"] + 1).Show();

            df.Filter(df["age"] > 21).Show();

            df.GroupBy("age")
            .Agg(Avg(df["age"]), Avg(df["age"]), CountDistinct(df["age"], df["age"]))
            .Show();

            df.CreateOrReplaceTempView("people");

            // Registering Udf for SQL expression.
            DataFrame sqlDf = spark.Sql("SELECT * FROM people");

            sqlDf.Show();

            spark.Udf().Register <int?, string, string>(
                "my_udf",
                (age, name) => name + " with " + ((age.HasValue) ? age.Value.ToString() : "null"));

            sqlDf = spark.Sql("SELECT my_udf(*) FROM people");
            sqlDf.Show();

            // Using UDF via data frames.
            Func <Column, Column, Column> addition = Udf <int?, string, string>(
                (age, name) => name + " is " + (age.HasValue ? age.Value + 10 : 0));

            df.Select(addition(df["age"], df["name"])).Show();

            // Chaining example:
            Func <Column, Column> addition2 = Udf <string, string>(str => $"hello {str}!");

            df.Select(addition2(addition(df["age"], df["name"]))).Show();

            // Multiple UDF example:
            df.Select(addition(df["age"], df["name"]), addition2(df["name"])).Show();

            // UDF return type as array.
            Func <Column, Column> udfArray =
                Udf <string, string[]>((str) => new string[] { str, str + str });

            df.Select(Explode(udfArray(df["name"]))).Show();

            // UDF return type as map.
            Func <Column, Column> udfMap =
                Udf <string, IDictionary <string, string[]> >(
                    (str) => new Dictionary <string, string[]> {
                { str, new[] { str, str } }
            });

            df.Select(udfMap(df["name"]).As("UdfMap")).Show(truncate: 50);

            // Joins.
            DataFrame joinedDf = df.Join(df, "name");

            joinedDf.Show();

            DataFrame joinedDf2 = df.Join(df, new[] { "name", "age" });

            joinedDf2.Show();

            DataFrame joinedDf3 = df.Join(df, df["name"] == df["name"], "outer");

            joinedDf3.Show();

            spark.Stop();
        }
Example #9
        public void TestVectorUdf()
        {
            Func <Int32Array, StringArray, StringArray> udf1Func =
                (ages, names) => (StringArray)ToArrowArray(
                    Enumerable.Range(0, names.Length)
                    .Select(i => $"{names.GetString(i)} is {ages.GetValue(i) ?? 0}")
                    .ToArray());

            // Single UDF.
            Func <Column, Column, Column> udf1 =
                ExperimentalFunctions.VectorUdf(udf1Func);
            {
                Row[] rows = _df.Select(udf1(_df["age"], _df["name"])).Collect().ToArray();
                Assert.Equal(3, rows.Length);
                Assert.Equal("Michael is 0", rows[0].GetAs <string>(0));
                Assert.Equal("Andy is 30", rows[1].GetAs <string>(0));
                Assert.Equal("Justin is 19", rows[2].GetAs <string>(0));
            }

            // Chained UDFs.
            Func <Column, Column> udf2 = ExperimentalFunctions.VectorUdf <StringArray, StringArray>(
                (strings) => (StringArray)ToArrowArray(
                    Enumerable.Range(0, strings.Length)
                    .Select(i => $"hello {strings.GetString(i)}!")
                    .ToArray()));
            {
                Row[] rows = _df
                             .Select(udf2(udf1(_df["age"], _df["name"])))
                             .Collect()
                             .ToArray();
                Assert.Equal(3, rows.Length);
                Assert.Equal("hello Michael is 0!", rows[0].GetAs <string>(0));
                Assert.Equal("hello Andy is 30!", rows[1].GetAs <string>(0));
                Assert.Equal("hello Justin is 19!", rows[2].GetAs <string>(0));
            }

            // Multiple UDFs:
            {
                Row[] rows = _df
                             .Select(udf1(_df["age"], _df["name"]), udf2(_df["name"]))
                             .Collect()
                             .ToArray();
                Assert.Equal(3, rows.Length);
                Assert.Equal("Michael is 0", rows[0].GetAs <string>(0));
                Assert.Equal("hello Michael!", rows[0].GetAs <string>(1));

                Assert.Equal("Andy is 30", rows[1].GetAs <string>(0));
                Assert.Equal("hello Andy!", rows[1].GetAs <string>(1));

                Assert.Equal("Justin is 19", rows[2].GetAs <string>(0));
                Assert.Equal("hello Justin!", rows[2].GetAs <string>(1));
            }

            // Register UDF
            {
                _df.CreateOrReplaceTempView("people");
                _spark.Udf().RegisterVector("udf1", udf1Func);
                Row[] rows = _spark.Sql("SELECT udf1(age, name) FROM people")
                             .Collect()
                             .ToArray();
                Assert.Equal(3, rows.Length);
                Assert.Equal("Michael is 0", rows[0].GetAs <string>(0));
                Assert.Equal("Andy is 30", rows[1].GetAs <string>(0));
                Assert.Equal("Justin is 19", rows[2].GetAs <string>(0));
            }
        }
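
Both this test and the one that follows run against shared _spark and _df fixtures that the listing omits. The asserted values (Michael with a null age treated as 0, Andy 30, Justin 19) match the people.json sample that ships with Spark, so a plausible setup, with a hypothetical test-class constructor, is:

        // Hypothetical fixture wiring consistent with the assertions in these tests.
        private readonly SparkSession _spark;
        private readonly DataFrame _df;

        public UdfTests() // class name is an assumption
        {
            _spark = SparkSession.Builder().GetOrCreate();
            // people.json: Michael (null age), Andy (30), Justin (19)
            _df = _spark.Read().Json("people.json");
        }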
Example #10
        public void TestUdfRegistrationWithReturnAsRowType()
        {
            // Test a UDF that returns a Row with two columns.
            {
                var schema = new StructType(new[]
                {
                    new StructField("col1", new IntegerType()),
                    new StructField("col2", new StringType())
                });

                _df.CreateOrReplaceTempView("people");

                _spark.Udf().Register <string>(
                    "udf1",
                    str => new GenericRow(new object[] { 1, "abc" }),
                    schema);

                Row[] rows =
                    _spark.Sql("SELECT udf1(name) AS col FROM people")
                    .Collect()
                    .ToArray();
                Assert.Equal(3, rows.Length);
                foreach (Row row in rows)
                {
                    Assert.Equal(1, row.Size());
                    Row outerCol = row.GetAs <Row>("col");
                    Assert.Equal(2, outerCol.Size());
                    Assert.Equal(1, outerCol.GetAs <int>("col1"));
                    Assert.Equal("abc", outerCol.GetAs <string>("col2"));
                }
            }

            // Test a UDF that returns a one-column Row, selected alongside a plain column.
            {
                var schema = new StructType(new[]
                {
                    new StructField("col1", new IntegerType())
                });

                _df.CreateOrReplaceTempView("people");

                _spark.Udf().Register <string>(
                    "udf2",
                    str => new GenericRow(new object[] { 111 }),
                    schema);

                Row[] rows =
                    _spark.Sql("SELECT udf2(name) AS col, name FROM people")
                    .Collect()
                    .ToArray();
                Assert.Equal(3, rows.Length);

                foreach (Row row in rows)
                {
                    Assert.Equal(2, row.Size());
                    Row col1 = row.GetAs <Row>("col");
                    Assert.Equal(1, col1.Size());
                    Assert.Equal(111, col1.GetAs <int>("col1"));

                    string col2 = row.GetAs <string>("name");
                    Assert.NotEmpty(col2);
                }
            }

            // Test UDF that returns a nested Row object.
            {
                var subSchema1 = new StructType(new[]
                {
                    new StructField("col1", new IntegerType()),
                });
                var subSchema2 = new StructType(new[]
                {
                    new StructField("col1", new StringType()),
                    new StructField("col2", subSchema1),
                });
                var schema = new StructType(new[]
                {
                    new StructField("col1", new IntegerType()),
                    new StructField("col2", subSchema1),
                    new StructField("col3", subSchema2)
                });

                _df.CreateOrReplaceTempView("people");

                _spark.Udf().Register <string>(
                    "udf3",
                    str => new GenericRow(
                        new object[]
                {
                    1,
                    new GenericRow(new object[] { 1 }),
                    new GenericRow(new object[]
                    {
                        "abc",
                        new GenericRow(new object[] { 10 })
                    })
                }),
                    schema);

                Row[] rows =
                    _spark.Sql("SELECT udf3(name) AS col FROM people")
                    .Collect()
                    .ToArray();
                Assert.Equal(3, rows.Length);

                foreach (Row row in rows)
                {
                    Assert.Equal(1, row.Size());
                    Row outerCol = row.GetAs <Row>("col");
                    Assert.Equal(3, outerCol.Size());
                    Assert.Equal(1, outerCol.GetAs <int>("col1"));
                    Assert.Equal(
                        new Row(new object[] { 1 }, subSchema1),
                        outerCol.GetAs <Row>("col2"));
                    Assert.Equal(
                        new Row(
                            new object[] { "abc", new Row(new object[] { 10 }, subSchema1) },
                            subSchema2),
                        outerCol.GetAs <Row>("col3"));
                }
            }

            // Chained UDFs.
            {
                var schema = new StructType(new[]
                {
                    new StructField("col1", new IntegerType()),
                    new StructField("col2", new StringType())
                });

                _df.CreateOrReplaceTempView("people");

                _spark.Udf().Register <string>(
                    "udf4",
                    str => new GenericRow(new object[] { 1, str }),
                    schema);

                _spark.Udf().Register <Row, string>(
                    "udf5",
                    row => row.GetAs <string>(1));

                Row[] rows =
                    _spark.Sql("SELECT udf5(udf4(name)) FROM people")
                    .Collect()
                    .ToArray();
                Assert.Equal(3, rows.Length);

                var expected = new string[] { "Michael", "Andy", "Justin" };
                for (int i = 0; i < rows.Length; ++i)
                {
                    Assert.Equal(1, rows[i].Size());
                    Assert.Equal(expected[i], rows[i].GetAs <string>(0));
                }
            }
        }
Example #11
        public void TestSignaturesV2_3_X()
        {
            Column col = _df["name"];

            col = _df["age"];

            DataFrame df = _df.ToDF();

            df = df.ToDF("name2", "age2");

            StructType schema = _df.Schema();

            Assert.NotNull(schema);

            _df.PrintSchema();

            _df.Explain();
            _df.Explain(true);
            _df.Explain(false);

            Assert.Equal(2, _df.Columns().ToArray().Length);

            _df.IsLocal();

            _df.IsStreaming();

            // The following is required for *CheckPoint().
            _spark.SparkContext.SetCheckpointDir(TestEnvironment.ResourceDirectory);

            _df.Checkpoint();
            _df.Checkpoint(false);

            _df.LocalCheckpoint();
            _df.LocalCheckpoint(false);

            _df.WithWatermark("time", "10 minutes");

            _df.Show();
            _df.Show(10);
            _df.Show(10, 10);
            _df.Show(10, 10, true);

            _df.Join(_df);
            _df.Join(_df, "name");
            _df.Join(_df, new[] { "name" });
            _df.Join(_df, new[] { "name" }, "outer");
            _df.Join(_df, _df["age"] == _df["age"]);
            _df.Join(_df, _df["age"] == _df["age"], "outer");

            _df.CrossJoin(_df);

            _df.SortWithinPartitions("age");
            _df.SortWithinPartitions("age", "name");
            _df.SortWithinPartitions();
            _df.SortWithinPartitions(_df["age"]);
            _df.SortWithinPartitions(_df["age"], _df["name"]);

            _df.Sort("age");
            _df.Sort("age", "name");
            _df.Sort();
            _df.Sort(_df["age"]);
            _df.Sort(_df["age"], _df["name"]);

            _df.OrderBy("age");
            _df.OrderBy("age", "name");
            _df.OrderBy();
            _df.OrderBy(_df["age"]);
            _df.OrderBy(_df["age"], _df["name"]);

            _df.Hint("broadcast");
            _df.Hint("broadcast", new[] { "hello", "world" });

            _df.Col("age");

            _df.ColRegex("age");

            _df.As("alias");

            _df.Alias("alias");

            _df.Select("age");
            _df.Select("age", "name");
            _df.Select();
            _df.Select(_df["age"]);
            _df.Select(_df["age"], _df["name"]);

            _df.SelectExpr();
            _df.SelectExpr("age * 2");
            _df.SelectExpr("age * 2", "abs(age)");

            _df.Filter(_df["age"] > 21);
            _df.Filter("age > 21");

            _df.Where(_df["age"] > 21);
            _df.Where("age > 21");

            _df.GroupBy("age");
            _df.GroupBy("age", "name");
            _df.GroupBy();
            _df.GroupBy(_df["age"]);
            _df.GroupBy(_df["age"], _df["name"]);

            _df.Rollup("age");
            _df.Rollup("age", "name");
            _df.Rollup();
            _df.Rollup(_df["age"]);
            _df.Rollup(_df["age"], _df["name"]);

            _df.Cube("age");
            _df.Cube("age", "name");
            _df.Cube();
            _df.Cube(_df["age"]);
            _df.Cube(_df["age"], _df["name"]);

            _df.Agg(Avg(_df["age"]));
            _df.Agg(Avg(_df["age"]), Avg(_df["name"]));

            _df.Limit(10);

            _df.Union(_df);

            _df.UnionByName(_df);

            _df.Intersect(_df);

            _df.Except(_df);

            _df.Sample(0.5);
            _df.Sample(0.5, true);
            _df.Sample(0.5, false, 12345);

            _df.RandomSplit(new[] { 0.2, 0.8 });
            _df.RandomSplit(new[] { 0.2, 0.8 }, 12345);

            _df.WithColumn("age2", _df["age"]);

            _df.WithColumnRenamed("age", "age2");

            _df.Drop();
            _df.Drop("age");
            _df.Drop("age", "name");

            _df.Drop(_df["age"]);

            _df.DropDuplicates();
            _df.DropDuplicates("age");
            _df.DropDuplicates("age", "name");

            _df.Describe();
            _df.Describe("age");
            _df.Describe("age", "name");

            _df.Summary();
            _df.Summary("count");
            _df.Summary("count", "mean");

            _df.Head(2);
            _df.Head();

            _df.First();

            _df.Take(3).ToArray();

            _df.Collect().ToArray();

            _df.ToLocalIterator().ToArray();

            _df.Count();

            _df.Repartition(2);
            _df.Repartition(2, _df["age"]);
            _df.Repartition(_df["age"]);
            _df.Repartition();

            _df.RepartitionByRange(2, _df["age"]);
            _df.RepartitionByRange(_df["age"]);

            _df.Coalesce(1);

            _df.Distinct();

            _df.Persist();

            _df.Cache();

            _df.Unpersist();

            _df.CreateTempView("view");
            _df.CreateOrReplaceTempView("view");

            _df.CreateGlobalTempView("global_view");
            _df.CreateOrReplaceGlobalTempView("global_view");
        }
Example #12
        static void Main(string[] args)
        {
            // Initialize Session
            SparkSession ss =
                SparkSession
                .Builder()
                .AppName("Working with DataFrames")
                .GetOrCreate();

            // Read Data
            DataFrame businesses =
                ss
                .Read()
                .Option("header", "true")
                .Option("inferSchema", "true")
                .Csv("Data/NYC-Restaurant-Inspections.csv");

            businesses = businesses.Select("CAMIS", "DBA", "BORO", "CUISINE DESCRIPTION");

            DataFrame inspections =
                ss
                .Read()
                .Option("header", "true")
                .Option("inferSchema", "true")
                .Csv("Data/NYC-Restaurant-Inspections.csv");

            inspections = inspections.Select("CAMIS", "INSPECTION DATE", "VIOLATION CODE", "CRITICAL FLAG", "SCORE", "GRADE", "INSPECTION TYPE");

            // Select columns
            businesses.Select(Col("CAMIS"), Col("DBA")).Show(1);

            inspections.Select(inspections["VIOLATION CODE"]).Show(1);

            // Filter
            businesses
            .Filter(Col("BORO") == "Manhattan")
            .Select("DBA", "BORO")
            .Show(3);

            // Group / Aggregate
            businesses
            .GroupBy("CUISINE DESCRIPTION")
            .Agg(Count("CUISINE DESCRIPTION").Alias("CUISINE COUNT"))
            .Show(10);

            // Order
            businesses
            .GroupBy("CUISINE DESCRIPTION")
            .Agg(Count("CUISINE DESCRIPTION").Alias("CUISINE COUNT"))
            .OrderBy(Col("CUISINE COUNT").Desc())
            .Show(3);

            // Join
            DataFrame joinedDf =
                businesses
                .Join(inspections, "CAMIS")
                .Select(Col("DBA"), Col("CUISINE DESCRIPTION"), Col("GRADE"));

            joinedDf.Show(5);

            // SQL
            businesses.CreateOrReplaceTempView("businesses");

            inspections.CreateOrReplaceTempView("inspections");

            ss.Sql(@"SELECT b.DBA,b.`CUISINE DESCRIPTION`,i.GRADE FROM businesses b JOIN inspections i ON b.CAMIS = i.CAMIS").Show(5);

            // UDF
            ss.Udf().Register <string, string>("Tupper", Tupper);

            inspections
            .Select(CallUDF("Tupper", Col("INSPECTION TYPE")).Alias("CAPITALIZED"))
            .Show(3);

            // Save
            joinedDf
            .Write()
            .Mode(SaveMode.Overwrite)
            .Csv("output");
        }
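
The UDF registration above passes a Tupper method by group reference; given the Register<string, string> signature and the CAPITALIZED alias, a plausible stub (an assumption, not the original) is:

        // Hypothetical helper: "Tupper" reads as "to upper".
        private static string Tupper(string input) =>
            string.IsNullOrEmpty(input) ? input : input.ToUpperInvariant();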
Example #13
        private static void MetadataCleanup(DataFrame dataFrame)
        {
            Console.WriteLine("Metadata Clean-up");

            var priceCleanup = Udf <string, float>(
                p =>
            {
                if (!string.IsNullOrEmpty(p))
                {
                    var index = 0;

                    for (var i = 0; i < p.Length; i++)
                    {
                        if (char.IsDigit(p[i]))
                        {
                            index = i;
                            break;
                        }
                    }

                    if (float.TryParse(p.Substring(index), out var result))
                    {
                        return(result);
                    }
                }

                return(-1f);
            });

            var dateCleanup = Udf <string, double>(
                d =>
            {
                if (!string.IsNullOrEmpty(d) && DateTime.TryParse(d, out var result))
                {
                    return((result.ToUniversalTime() - new DateTime(1970, 1, 1)).TotalSeconds);
                }

                return(-1d);
            });

            var rankCleanup = Udf <string, long>(
                r =>
            {
                if (!string.IsNullOrEmpty(r))
                {
                    var regex = new Regex(@"\d+(,\d+)*", RegexOptions.Singleline);
                    var match = regex.Match(r);
                    if (match.Success && long.TryParse(match.Value.Replace(",", string.Empty), out var result))
                    {
                        return(result);
                    }
                }

                return(-1L);
            });

            dataFrame = dataFrame
                        .Filter(
                dataFrame["asin"].IsNotNull()
                .And(dataFrame["title"].IsNotNull())
                .And(dataFrame["main_cat"].IsNotNull())
                .And(dataFrame["brand"].IsNotNull())
                .And(Not(dataFrame["main_cat"].IsIn("Grocery", "Pet Supplies", "Baby", "Books", "Appstore for Android", "Gift Cards"))));

            dataFrame = dataFrame
                        .WithColumn("clean_price", priceCleanup(dataFrame["price"]))
                        .WithColumn("clean-date", dateCleanup(dataFrame["date"]))
                        .WithColumn("clean-rank", rankCleanup(dataFrame["rank"]))
                        .Drop(dataFrame["price"])
                        .Drop(dataFrame["date"])
                        .Drop(dataFrame["rank"])
                        .WithColumnRenamed("clean_price", "price")
                        .WithColumnRenamed("clean-date", "unixTime")
                        .WithColumnRenamed("clean-rank", "rank");

            dataFrame.Cache();
            dataFrame.CreateOrReplaceTempView("ElectronicsMetadata");

            Console.WriteLine($"Metadata Count: {dataFrame.Count()}");
            Console.WriteLine("Done");
            Console.WriteLine();
        }
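
The bare Udf<...>, Not, Avg, Desc, Col, CallUDF, and Explode calls seen throughout these examples assume a static import of the Microsoft.Spark.Sql.Functions class:

        // Assumed at the top of each file that calls Udf(...), Col(...), Avg(...), etc.
        using static Microsoft.Spark.Sql.Functions;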
Example #14
        public void Run(string[] args)
        {
            if (args.Length != 1)
            {
                Console.Error.WriteLine(
                    "Usage: Logging <path to Apache User Logs>");
                Environment.Exit(1);
            }

            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("Apache User Log Processing")
                                 .GetOrCreate();

            // Read input log file and display it
            DataFrame df = spark.Read().Text(args[0]);

            df.Show();

            // Step 1: UDF to determine if each line is a valid log entry
            // Remove any invalid entries before further filtering
            spark.Udf().Register <string, bool>(
                "GeneralReg",
                log => Regex.IsMatch(log, s_apacheRx));

            df.CreateOrReplaceTempView("Logs");

            // Apply the UDF to get valid log entries
            DataFrame generalDf = spark.Sql(
                "SELECT logs.value, GeneralReg(logs.value) FROM Logs");

            // Only keep log entries that matched the reg ex
            generalDf = generalDf.Filter(generalDf["GeneralReg(value)"]);
            generalDf.Show();

            // View the resulting schema
            // Notice we created a new column "GeneralReg(value)"
            generalDf.PrintSchema();

            // Step 2: Choose valid log entries that start with 10
            spark.Udf().Register <string, bool>(
                "IPReg",
                log => Regex.IsMatch(log, "^(?=10)"));

            generalDf.CreateOrReplaceTempView("IPLogs");

            // Apply UDF to get valid log entries starting with 10
            // Use SQL "WHERE" rather than doing ipDf.Filter(),
            // which avoids creating an extra column "IPReg(value)"
            DataFrame ipDf = spark.Sql(
                "SELECT iplogs.value FROM IPLogs WHERE IPReg(iplogs.value)");

            ipDf.Show();

            // Step 3: Choose valid log entries that start
            // with 10 and deal with spam
            spark.Udf().Register <string, bool>(
                "SpamRegEx",
                log => Regex.IsMatch(log, "\\b(?=spam)\\b"));

            ipDf.CreateOrReplaceTempView("SpamLogs");

            // Apply UDF to get valid, start with 10, spam entries
            DataFrame spamDF = spark.Sql(
                "SELECT spamlogs.value FROM SpamLogs WHERE SpamRegEx(spamlogs.value)");

            // Let's explore the columns in the data we have filtered
            // Use LINQ to count the number of GET requests
            int numGetRequests = spamDF
                                 .Collect()
                                 .Where(r => ContainsGet(r.GetAs <string>("value")))
                                 .Count();

            Console.WriteLine("Number of GET requests: " + numGetRequests);

            spark.Stop();
        }
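
This example leans on two members it does not show: the s_apacheRx pattern used to validate lines and the ContainsGet helper used in the final LINQ count. Plausible shapes (assumptions, not the original definitions):

        // A common Apache access-log pattern; the real field may differ.
        private static readonly string s_apacheRx =
            @"^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] ""(\S+) (\S+) (\S+)"" (\d{3}) (\d+)";

        // True when the request portion of the entry is a GET.
        private static bool ContainsGet(string logLine) =>
            logLine.Contains("GET");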