// Runs a Spark job that solves sudoku puzzles from a CSV file through a
// registered UDF.
//   file_path: path to a CSV with "quizzes" and "solutions" columns.
//   cores/nodes: executor core and instance counts, passed straight to Spark config.
//   nrows: number of puzzles to process.
public static void runSpark(string file_path, string cores, string nodes, int nrows)
{
    // Build the session with the requested executor topology.
    SparkSession session = SparkSession
        .Builder()
        .AppName("word_count_sample")
        .Config("spark.executor.cores", cores)
        .Config("spark.executor.instances", nodes)
        .GetOrCreate();

    // Load the puzzle CSV with an explicit schema, then keep only nrows rows.
    DataFrame puzzles = session
        .Read()
        .Option("header", true)
        .Option("inferSchema", true)
        .Schema("quizzes string, solutions string")
        .Csv(file_path);
    DataFrame selected = puzzles.Limit(nrows);

    // Register the solver UDF and invoke it through Spark SQL.
    session.Udf().Register<string, string>("SukoduUDF", (sudoku) => sudokusolution(sudoku));
    selected.CreateOrReplaceTempView("Resolved");
    DataFrame solved = session.Sql("SELECT quizzes, SukoduUDF(quizzes) as Resolution from Resolved");
    solved.Show();

    session.Stop();
    Console.WriteLine("SCRAPY"); // marker emitted once the job completes
}
// Spark driver: solves `nrows` sudokus via sudokusolution() registered as a
// UDF, with configurable executor cores and instances.
static void runSpark(string file_path, string cores, string nodes, int nrows)
{
    // The app name records the run parameters (French: "Resolution of N
    // sudokus by combinatorial evolution on X core(s) and Y node(s)").
    SparkSession session = SparkSession
        .Builder()
        .AppName($"Resolution de {nrows} sudokus par évolution combinatoire de {cores} noyau(x) et {nodes} noeud(s)")
        .Config("spark.executor.cores", cores)
        .Config("spark.executor.instances", nodes)
        .GetOrCreate();

    // Load the puzzle CSV with an explicit two-column schema and keep nrows rows.
    DataFrame allPuzzles = session
        .Read()
        .Option("header", true)
        .Option("inferSchema", true)
        .Schema("quizzes string, solutions string")
        .Csv(file_path);
    DataFrame selected = allPuzzles.Limit(nrows);

    // Register the solver and run it through Spark SQL.
    session.Udf().Register<string, string>("SukoduUDF", (sudoku) => sudokusolution(sudoku));
    selected.CreateOrReplaceTempView("Resolved");
    DataFrame solutions = session.Sql("SELECT quizzes, SukoduUDF(quizzes) as Resolution from Resolved");
    solutions.Show();

    session.Stop();
}
// Called from Main: starts a Spark session and solves `nrows` sudokus by
// running Sudokusolution() as a Spark UDF over the CSV at _filePath.
// Prints the wall-clock time of the resolution phase only.
private static void Sudokures(int nrows)
{
    // Initialize the Spark session (4 GB per executor).
    SparkSession spark = SparkSession
        .Builder()
        .Config("spark.executor.memory", "4G")
        .GetOrCreate();

    // Load the CSV into a DataFrame and keep only the requested row count.
    DataFrame df = spark
        .Read()
        .Option("header", true)
        .Option("inferSchema", true)
        .Csv(_filePath);
    DataFrame df2 = df.Limit(nrows);

    // Stopwatch covers only the sudoku resolution, not session setup or CSV load.
    var watch2 = new System.Diagnostics.Stopwatch();
    watch2.Start();

    // Register the solver as a Spark user-defined function.
    spark.Udf().Register<string, string>(
        "SukoduUDF",
        (sudoku) => Sudokusolution(sudoku));

    // Invoke the UDF through Spark SQL; the result DataFrame holds the solutions.
    df2.CreateOrReplaceTempView("Resolved");
    DataFrame sqlDf = spark.Sql("SELECT Sudokus, SukoduUDF(Sudokus) as Resolution from Resolved");
    sqlDf.Show();

    watch2.Stop();

    // Blank lines make the timing stand out amid Spark's own log output.
    Console.WriteLine();
    Console.WriteLine();
    Console.WriteLine();
    Console.WriteLine();
    Console.WriteLine($"Execution Time for {nrows} sudoku resolution : {watch2.ElapsedMilliseconds} ms");
    Console.WriteLine();
    Console.WriteLine();
    Console.WriteLine();
    Console.WriteLine();

    spark.Stop();
}
// Batch-processes the projects.csv dataset: cleans it, computes average
// fork counts per language, and flags rows whose updated_at parses to a
// date later than s_referenceDate.
public void Run(string[] args)
{
    if (args.Length != 1)
    {
        Console.Error.WriteLine(
            "Usage: GitHubProjects <path to projects.csv>");
        Environment.Exit(1);
    }

    SparkSession spark = SparkSession
        .Builder()
        .AppName("GitHub and Spark Batch")
        .GetOrCreate();

    DataFrame projectsDf = spark
        .Read()
        .Schema("id INT, url STRING, owner_id INT, " +
            "name STRING, descriptor STRING, language STRING, " +
            "created_at STRING, forked_from INT, deleted STRING, " +
            "updated_at STRING")
        .Csv(args[0]);
    projectsDf.Show();

    // Drop rows containing any NA value, then the columns we never use.
    DataFrame cleanedProjects = projectsDf.Na().Drop("any");
    cleanedProjects = cleanedProjects.Drop("id", "url", "owner_id");
    cleanedProjects.Show();

    // Average number of times each language has been forked,
    // shown with the most-forked languages first.
    DataFrame forksByLanguage = cleanedProjects
        .GroupBy("language")
        .Agg(Avg(cleanedProjects["forked_from"]));
    forksByLanguage.OrderBy(Desc("avg(forked_from)")).Show();

    // UDF: true when updated_at parses and is later than s_referenceDate.
    spark.Udf().Register<string, bool>(
        "MyUDF",
        (date) => DateTime.TryParse(date, out DateTime convertedDate) &&
            (convertedDate > s_referenceDate));
    cleanedProjects.CreateOrReplaceTempView("dateView");

    DataFrame dateDf = spark.Sql(
        "SELECT *, MyUDF(dateView.updated_at) AS datebefore FROM dateView");
    dateDf.Show();

    spark.Stop();
}
// Streams text lines from a socket and classifies the sentiment of each
// line with an ML.NET model wrapped in a Spark UDF; results are written
// continuously to the console sink.
public void Run(string[] args)
{
    if (args.Length != 3)
    {
        Console.Error.WriteLine(
            "Usage: SentimentAnalysisStream <host> <port> <model path>");
        Environment.Exit(1);
    }

    SparkSession spark = SparkSession
        .Builder()
        .AppName("Streaming Sentiment Analysis")
        .GetOrCreate();

    // Socket connection parameters for the input stream.
    string host = args[0];
    string portNumber = args[1];

    // Each incoming line arrives as the "value" column of this streaming frame.
    DataFrame lines = spark
        .ReadStream()
        .Format("socket")
        .Option("host", host)
        .Option("port", portNumber)
        .Load();

    // Evaluate the ML.NET model (args[2] = model path) on every entry.
    spark.Udf().Register<string, bool>("MLudf", input => Sentiment(input, args[2]));

    // Score each entry via Spark SQL and display the sentiment result.
    lines.CreateOrReplaceTempView("WordsSentiment");
    DataFrame scored = spark
        .Sql("SELECT WordsSentiment.value, MLudf(WordsSentiment.value) FROM WordsSentiment");

    // Handle data continuously as it arrives.
    StreamingQuery streamingQuery = scored
        .WriteStream()
        .Format("console")
        .Start();
    streamingQuery.AwaitTermination();
}
// Scores Yelp review sentiment in batch using an ML.NET model exposed to
// Spark SQL as a UDF.
public void Run(string[] args)
{
    if (args.Length != 2)
    {
        Console.Error.WriteLine(
            "Usage: <path to yelptest.csv> <path to MLModel.zip>");
        Environment.Exit(1);
    }

    SparkSession session = SparkSession
        .Builder()
        .AppName(".NET for Apache Spark Sentiment Analysis")
        .GetOrCreate();

    // Load and display the Yelp reviews.
    DataFrame reviews = session
        .Read()
        .Option("header", true)
        .Option("inferSchema", true)
        .Csv(args[0]);
    reviews.Show();

    // Wrap the ML.NET model (args[1]) in a UDF so Spark can score each review.
    session.Udf().Register<string, bool>("MLudf", (text) => Sentiment(text, args[1]));

    // Score every review through Spark SQL and display the results.
    reviews.CreateOrReplaceTempView("Reviews");
    DataFrame scored = session.Sql("SELECT ReviewText, MLudf(ReviewText) FROM Reviews");
    scored.Show();

    // Show the first 20 rows again with truncate = 0 so no text is cut off.
    scored.Show(20, 0, false);

    session.Stop();
}
// Filters out reviews missing key fields, renames columns to snake_case,
// caches the frame, and registers it as the "ElectronicsReviews" temp view.
private static void ReviewsCleanup(DataFrame dataFrame)
{
    Console.WriteLine("Ratings Clean-up");

    // Keep only rows where reviewer, product, and review text are all present.
    Column requiredFields = dataFrame["reviewerID"].IsNotNull()
        .And(dataFrame["asin"].IsNotNull())
        .And(dataFrame["reviewText"].IsNotNull());
    dataFrame = dataFrame.Filter(requiredFields);

    // Normalize column names for downstream SQL.
    dataFrame = dataFrame
        .WithColumnRenamed("reviewerID", "rid")
        .WithColumnRenamed("reviewText", "review_text")
        .WithColumnRenamed("unixReviewTime", "unix_time");

    dataFrame.Cache();
    dataFrame.CreateOrReplaceTempView("ElectronicsReviews");

    Console.WriteLine($"Reviews Count: {dataFrame.Count()}");
    Console.WriteLine("Done");
    Console.WriteLine();
}
// Demonstrates the core Spark SQL API surface: explicit schema loading,
// row collection, column expressions, grouping/aggregation, UDF usage via
// SQL and via DataFrame (including chained, multi-column, array-returning
// and map-returning UDFs), and the join overloads.
public void Run(string[] args)
{
    if (args.Length != 1)
    {
        Console.Error.WriteLine(
            "Usage: Basic <path to SPARK_HOME/examples/src/main/resources/people.json>");
        Environment.Exit(1);
    }

    SparkSession spark = SparkSession
        .Builder()
        .AppName(".NET Spark SQL basic example")
        .Config("spark.some.config.option", "some-value")
        .GetOrCreate();

    // Need to explicitly specify the schema since pickling vs. arrow formatting
    // will return different types. Pickling will turn longs into ints if the values fit.
    // Same as the "age INT, name STRING" DDL-format string.
    var inputSchema = new StructType(new[]
    {
        new StructField("age", new IntegerType()),
        new StructField("name", new StringType())
    });

    DataFrame df = spark.Read().Schema(inputSchema).Json(args[0]);

    // Print the schema in its compact single-line form.
    Spark.Sql.Types.StructType schema = df.Schema();
    Console.WriteLine(schema.SimpleString);

    // Pull every row back to the driver and print each one.
    IEnumerable<Row> rows = df.Collect();
    foreach (Row row in rows)
    {
        Console.WriteLine(row);
    }

    df.Show();
    df.PrintSchema();

    // Column selection by name and by Column expression (with arithmetic),
    // filtering, and aggregation.
    df.Select("name", "age", "age", "name").Show();
    df.Select(df["name"], df["age"] + 1).Show();
    df.Filter(df["age"] > 21).Show();
    df.GroupBy("age")
        .Agg(Avg(df["age"]), Avg(df["age"]), CountDistinct(df["age"], df["age"]))
        .Show();

    df.CreateOrReplaceTempView("people");

    DataFrame sqlDf = spark.Sql("SELECT * FROM people");
    sqlDf.Show();

    // Registering Udf for SQL expression; null ages render as "null".
    spark.Udf().Register<int?, string, string>(
        "my_udf",
        (age, name) => name + " with " +
            ((age.HasValue) ? age.Value.ToString() : "null"));
    sqlDf = spark.Sql("SELECT my_udf(*) FROM people");
    sqlDf.Show();

    // Using UDF via data frames; a null age falls back to 0.
    Func<Column, Column, Column> addition = Udf<int?, string, string>(
        (age, name) => name + " is " + (age.HasValue ? age.Value + 10 : 0));
    df.Select(addition(df["age"], df["name"])).Show();

    // Chaining example:
    Func<Column, Column> addition2 = Udf<string, string>(str => $"hello {str}!");
    df.Select(addition2(addition(df["age"], df["name"]))).Show();

    // Multiple UDF example:
    df.Select(addition(df["age"], df["name"]), addition2(df["name"])).Show();

    // UDF return type as array.
    Func<Column, Column> udfArray =
        Udf<string, string[]>((str) => new string[] { str, str + str });
    df.Select(Explode(udfArray(df["name"]))).Show();

    // UDF return type as map.
    Func<Column, Column> udfMap =
        Udf<string, IDictionary<string, string[]>>(
            (str) => new Dictionary<string, string[]> { { str, new[] { str, str } } });
    df.Select(udfMap(df["name"]).As("UdfMap")).Show(truncate: 50);

    // Joins: by column name, by multiple names, and by expression with an
    // explicit join type.
    DataFrame joinedDf = df.Join(df, "name");
    joinedDf.Show();

    DataFrame joinedDf2 = df.Join(df, new[] { "name", "age" });
    joinedDf2.Show();

    DataFrame joinedDf3 = df.Join(df, df["name"] == df["name"], "outer");
    joinedDf3.Show();

    spark.Stop();
}
// Exercises Arrow-based vector UDFs: a single UDF, chained UDFs, multiple
// UDFs in one Select, and SQL-side registration via RegisterVector.
// The assertions show the fixture _df holds three rows whose names are
// Michael (null age, rendered 0), Andy (30), and Justin (19).
public void TestVectorUdf()
{
    // Formats each (age, name) pair; a null age renders as 0 via ?? 0.
    Func<Int32Array, StringArray, StringArray> udf1Func =
        (ages, names) => (StringArray)ToArrowArray(
            Enumerable.Range(0, names.Length)
                .Select(i => $"{names.GetString(i)} is {ages.GetValue(i) ?? 0}")
                .ToArray());

    // Single UDF.
    Func<Column, Column, Column> udf1 = ExperimentalFunctions.VectorUdf(udf1Func);
    {
        Row[] rows = _df.Select(udf1(_df["age"], _df["name"])).Collect().ToArray();
        Assert.Equal(3, rows.Length);
        Assert.Equal("Michael is 0", rows[0].GetAs<string>(0));
        Assert.Equal("Andy is 30", rows[1].GetAs<string>(0));
        Assert.Equal("Justin is 19", rows[2].GetAs<string>(0));
    }

    // Chained UDFs: udf2 wraps each string produced by udf1.
    Func<Column, Column> udf2 = ExperimentalFunctions.VectorUdf<StringArray, StringArray>(
        (strings) => (StringArray)ToArrowArray(
            Enumerable.Range(0, strings.Length)
                .Select(i => $"hello {strings.GetString(i)}!")
                .ToArray()));
    {
        Row[] rows = _df
            .Select(udf2(udf1(_df["age"], _df["name"])))
            .Collect()
            .ToArray();
        Assert.Equal(3, rows.Length);
        Assert.Equal("hello Michael is 0!", rows[0].GetAs<string>(0));
        Assert.Equal("hello Andy is 30!", rows[1].GetAs<string>(0));
        Assert.Equal("hello Justin is 19!", rows[2].GetAs<string>(0));
    }

    // Multiple UDFs in one projection: each produces its own output column.
    {
        Row[] rows = _df
            .Select(udf1(_df["age"], _df["name"]), udf2(_df["name"]))
            .Collect()
            .ToArray();
        Assert.Equal(3, rows.Length);
        Assert.Equal("Michael is 0", rows[0].GetAs<string>(0));
        Assert.Equal("hello Michael!", rows[0].GetAs<string>(1));
        Assert.Equal("Andy is 30", rows[1].GetAs<string>(0));
        Assert.Equal("hello Andy!", rows[1].GetAs<string>(1));
        Assert.Equal("Justin is 19", rows[2].GetAs<string>(0));
        Assert.Equal("hello Justin!", rows[2].GetAs<string>(1));
    }

    // Register the same delegate under a name and call it from SQL.
    {
        _df.CreateOrReplaceTempView("people");
        _spark.Udf().RegisterVector("udf1", udf1Func);
        Row[] rows = _spark.Sql("SELECT udf1(age, name) FROM people")
            .Collect()
            .ToArray();
        Assert.Equal(3, rows.Length);
        Assert.Equal("Michael is 0", rows[0].GetAs<string>(0));
        Assert.Equal("Andy is 30", rows[1].GetAs<string>(0));
        Assert.Equal("Justin is 19", rows[2].GetAs<string>(0));
    }
}
// Verifies UDFs whose return type is a Row (struct): multi-column,
// single-column, nested, and chained (Row-consuming) registrations.
public void TestUdfRegistrationWithReturnAsRowType()
{
    // Test UDF that returns a Row object with multiple columns.
    // (NOTE(review): the original comment said "single column", but this
    // schema has two fields — the comments on the first two cases appear
    // to have been swapped.)
    {
        var schema = new StructType(new[]
        {
            new StructField("col1", new IntegerType()),
            new StructField("col2", new StringType())
        });
        _df.CreateOrReplaceTempView("people");

        _spark.Udf().Register<string>(
            "udf1",
            str => new GenericRow(new object[] { 1, "abc" }),
            schema);

        Row[] rows = _spark.Sql("SELECT udf1(name) AS col FROM people")
            .Collect()
            .ToArray();
        Assert.Equal(3, rows.Length);
        foreach (Row row in rows)
        {
            // Each result row wraps one struct column holding (1, "abc").
            Assert.Equal(1, row.Size());
            Row outerCol = row.GetAs<Row>("col");
            Assert.Equal(2, outerCol.Size());
            Assert.Equal(1, outerCol.GetAs<int>("col1"));
            Assert.Equal("abc", outerCol.GetAs<string>("col2"));
        }
    }

    // Test UDF that returns a Row object with a single column, selected
    // alongside a plain column.
    {
        var schema = new StructType(new[]
        {
            new StructField("col1", new IntegerType())
        });
        _df.CreateOrReplaceTempView("people");

        _spark.Udf().Register<string>(
            "udf2",
            str => new GenericRow(new object[] { 111 }),
            schema);

        Row[] rows = _spark.Sql("SELECT udf2(name) AS col, name FROM people")
            .Collect()
            .ToArray();
        Assert.Equal(3, rows.Length);
        foreach (Row row in rows)
        {
            Assert.Equal(2, row.Size());
            Row col1 = row.GetAs<Row>("col");
            Assert.Equal(1, col1.Size());
            Assert.Equal(111, col1.GetAs<int>("col1"));
            string col2 = row.GetAs<string>("name");
            Assert.NotEmpty(col2);
        }
    }

    // Test UDF that returns a nested Row object (structs within structs).
    {
        var subSchema1 = new StructType(new[]
        {
            new StructField("col1", new IntegerType()),
        });
        var subSchema2 = new StructType(new[]
        {
            new StructField("col1", new StringType()),
            new StructField("col2", subSchema1),
        });
        var schema = new StructType(new[]
        {
            new StructField("col1", new IntegerType()),
            new StructField("col2", subSchema1),
            new StructField("col3", subSchema2)
        });
        _df.CreateOrReplaceTempView("people");

        _spark.Udf().Register<string>(
            "udf3",
            str => new GenericRow(
                new object[]
                {
                    1,
                    new GenericRow(new object[] { 1 }),
                    new GenericRow(new object[]
                    {
                        "abc",
                        new GenericRow(new object[] { 10 })
                    })
                }),
            schema);

        Row[] rows = _spark.Sql("SELECT udf3(name) AS col FROM people")
            .Collect()
            .ToArray();
        Assert.Equal(3, rows.Length);
        foreach (Row row in rows)
        {
            Assert.Equal(1, row.Size());
            Row outerCol = row.GetAs<Row>("col");
            Assert.Equal(3, outerCol.Size());
            Assert.Equal(1, outerCol.GetAs<int>("col1"));
            // Nested structs round-trip as Row values with their sub-schemas.
            Assert.Equal(
                new Row(new object[] { 1 }, subSchema1),
                outerCol.GetAs<Row>("col2"));
            Assert.Equal(
                new Row(
                    new object[] { "abc", new Row(new object[] { 10 }, subSchema1) },
                    subSchema2),
                outerCol.GetAs<Row>("col3"));
        }
    }

    // Chained UDFs: udf4 produces a Row, udf5 consumes it and extracts the
    // string at index 1 (the original name).
    {
        var schema = new StructType(new[]
        {
            new StructField("col1", new IntegerType()),
            new StructField("col2", new StringType())
        });
        _df.CreateOrReplaceTempView("people");

        _spark.Udf().Register<string>(
            "udf4",
            str => new GenericRow(new object[] { 1, str }),
            schema);
        _spark.Udf().Register<Row, string>(
            "udf5",
            row => row.GetAs<string>(1));

        Row[] rows = _spark.Sql("SELECT udf5(udf4(name)) FROM people")
            .Collect()
            .ToArray();
        Assert.Equal(3, rows.Length);

        var expected = new string[] { "Michael", "Andy", "Justin" };
        for (int i = 0; i < rows.Length; ++i)
        {
            Assert.Equal(1, rows[i].Size());
            Assert.Equal(expected[i], rows[i].GetAs<string>(0));
        }
    }
}
// Smoke-tests the DataFrame API surface available as of Spark 2.3.x:
// every call below must resolve and execute without throwing; return
// values are mostly ignored.
public void TestSignaturesV2_3_X()
{
    // Column indexer and ToDF overloads.
    Column col = _df["name"];
    col = _df["age"];

    DataFrame df = _df.ToDF();
    df = df.ToDF("name2", "age2");

    StructType schema = _df.Schema();
    Assert.NotNull(schema);

    _df.PrintSchema();

    _df.Explain();
    _df.Explain(true);
    _df.Explain(false);

    Assert.Equal(2, _df.Columns().ToArray().Length);

    _df.IsLocal();
    _df.IsStreaming();

    // The following is required for *CheckPoint().
    _spark.SparkContext.SetCheckpointDir(TestEnvironment.ResourceDirectory);
    _df.Checkpoint();
    _df.Checkpoint(false);
    _df.LocalCheckpoint();
    _df.LocalCheckpoint(false);

    _df.WithWatermark("time", "10 minutes");

    // Show overloads.
    _df.Show();
    _df.Show(10);
    _df.Show(10, 10);
    _df.Show(10, 10, true);

    // Join overloads: self-joins by name(s), by expression, with join type.
    _df.Join(_df);
    _df.Join(_df, "name");
    _df.Join(_df, new[] { "name" });
    _df.Join(_df, new[] { "name" }, "outer");
    _df.Join(_df, _df["age"] == _df["age"]);
    _df.Join(_df, _df["age"] == _df["age"], "outer");

    _df.CrossJoin(_df);

    // Sort / order overloads.
    _df.SortWithinPartitions("age");
    _df.SortWithinPartitions("age", "name");
    _df.SortWithinPartitions();
    _df.SortWithinPartitions(_df["age"]);
    _df.SortWithinPartitions(_df["age"], _df["name"]);

    _df.Sort("age");
    _df.Sort("age", "name");
    _df.Sort();
    _df.Sort(_df["age"]);
    _df.Sort(_df["age"], _df["name"]);

    _df.OrderBy("age");
    _df.OrderBy("age", "name");
    _df.OrderBy();
    _df.OrderBy(_df["age"]);
    _df.OrderBy(_df["age"], _df["name"]);

    _df.Hint("broadcast");
    _df.Hint("broadcast", new[] { "hello", "world" });

    _df.Col("age");
    _df.ColRegex("age");

    _df.As("alias");
    _df.Alias("alias");

    // Selection and filtering overloads.
    _df.Select("age");
    _df.Select("age", "name");
    _df.Select();
    _df.Select(_df["age"]);
    _df.Select(_df["age"], _df["name"]);

    _df.SelectExpr();
    _df.SelectExpr("age * 2");
    _df.SelectExpr("age * 2", "abs(age)");

    _df.Filter(_df["age"] > 21);
    _df.Filter("age > 21");
    _df.Where(_df["age"] > 21);
    _df.Where("age > 21");

    // Grouping overloads: GroupBy, Rollup, Cube, plus Agg.
    _df.GroupBy("age");
    _df.GroupBy("age", "name");
    _df.GroupBy();
    _df.GroupBy(_df["age"]);
    _df.GroupBy(_df["age"], _df["name"]);

    _df.Rollup("age");
    _df.Rollup("age", "name");
    _df.Rollup();
    _df.Rollup(_df["age"]);
    _df.Rollup(_df["age"], _df["name"]);

    _df.Cube("age");
    _df.Cube("age", "name");
    _df.Cube();
    _df.Cube(_df["age"]);
    _df.Cube(_df["age"], _df["name"]);

    _df.Agg(Avg(_df["age"]));
    _df.Agg(Avg(_df["age"]), Avg(_df["name"]));

    // Set-like operations.
    _df.Limit(10);
    _df.Union(_df);
    _df.UnionByName(_df);
    _df.Intersect(_df);
    _df.Except(_df);

    // Sampling and splitting.
    _df.Sample(0.5);
    _df.Sample(0.5, true);
    _df.Sample(0.5, false, 12345);
    _df.RandomSplit(new[] { 0.2, 0.8 });
    _df.RandomSplit(new[] { 0.2, 0.8 }, 12345);

    // Column add / rename / drop.
    _df.WithColumn("age2", _df["age"]);
    _df.WithColumnRenamed("age", "age2");

    _df.Drop();
    _df.Drop("age");
    _df.Drop("age", "name");
    _df.Drop(_df["age"]);

    _df.DropDuplicates();
    _df.DropDuplicates("age");
    _df.DropDuplicates("age", "name");

    // Statistics.
    _df.Describe();
    _df.Describe("age");
    _df.Describe("age", "name");

    _df.Summary();
    _df.Summary("count");
    _df.Summary("count", "mean");

    // Row retrieval.
    _df.Head(2);
    _df.Head();
    _df.First();
    _df.Take(3).ToArray();
    _df.Collect().ToArray();
    _df.ToLocalIterator().ToArray();
    _df.Count();

    // Partitioning.
    _df.Repartition(2);
    _df.Repartition(2, _df["age"]);
    _df.Repartition(_df["age"]);
    _df.Repartition();
    _df.RepartitionByRange(2, _df["age"]);
    _df.RepartitionByRange(_df["age"]);
    _df.Coalesce(1);

    _df.Distinct();

    // Caching.
    _df.Persist();
    _df.Cache();
    _df.Unpersist();

    // Temp views.
    _df.CreateTempView("view");
    _df.CreateOrReplaceTempView("view");
    _df.CreateGlobalTempView("global_view");
    _df.CreateOrReplaceGlobalTempView("global_view");
}
// Walks through the core DataFrame operations (select, filter, group,
// order, join, SQL, UDF, save) against the NYC restaurant inspections CSV.
static void Main(string[] args)
{
    // Initialize Session
    SparkSession ss = SparkSession
        .Builder()
        .AppName("Working with DataFrames")
        .GetOrCreate();

    // Read Data: the same CSV is loaded twice and projected into two
    // logical tables — business identity vs. inspection results.
    DataFrame businesses = ss
        .Read()
        .Option("header", "true")
        .Option("inferSchema", "true")
        .Csv("Data/NYC-Restaurant-Inspections.csv");
    businesses = businesses.Select("CAMIS", "DBA", "BORO", "CUISINE DESCRIPTION");

    DataFrame inspections = ss
        .Read()
        .Option("header", "true")
        .Option("inferSchema", "true")
        .Csv("Data/NYC-Restaurant-Inspections.csv");
    inspections = inspections.Select("CAMIS", "INSPECTION DATE", "VIOLATION CODE",
        "CRITICAL FLAG", "SCORE", "GRADE", "INSPECTION TYPE");

    // Select columns
    businesses.Select(Col("CAMIS"), Col("DBA")).Show(1);
    inspections.Select(inspections["VIOLATION CODE"]).Show(1);

    // Filter
    businesses
        .Filter(Col("BORO") == "Manhattan")
        .Select("DBA", "BORO")
        .Show(3);

    // Group / Aggregate
    businesses
        .GroupBy("CUISINE DESCRIPTION")
        .Agg(Count("CUISINE DESCRIPTION").Alias("CUISINE COUNT"))
        .Show(10);

    // Order
    businesses
        .GroupBy("CUISINE DESCRIPTION")
        .Agg(Count("CUISINE DESCRIPTION").Alias("CUISINE COUNT"))
        .OrderBy(Col("CUISINE COUNT").Desc())
        .Show(3);

    // Join
    DataFrame joinedDf = businesses
        .Join(inspections, "CAMIS")
        .Select(Col("DBA"), Col("CUISINE DESCRIPTION"), Col("GRADE"));
    joinedDf.Show(5);

    // SQL — backticks quote the column name containing spaces.
    businesses.CreateOrReplaceTempView("businesses");
    inspections.CreateOrReplaceTempView("inspections");
    ss.Sql(@"SELECT b.DBA,b.`CUISINE DESCRIPTION`,i.GRADE FROM businesses b JOIN inspections i ON b.CAMIS = i.CAMIS").Show(5);

    // UDF
    ss.Udf().Register<string, string>("Tupper", Tupper);
    inspections
        .Select(CallUDF("Tupper", Col("INSPECTION TYPE")).Alias("CAPITALIZED"))
        .Show(3);

    // Save
    joinedDf
        .Write()
        .Mode(SaveMode.Overwrite)
        .Csv("output");

    // Fix: the session was never stopped; shut down the backend like the
    // other drivers in this codebase do.
    ss.Stop();
}
// Cleans the product-metadata frame: parses price/date/rank text into
// numeric columns via UDFs, drops rows with missing key fields or excluded
// categories, caches the result, and registers it as "ElectronicsMetadata".
private static void MetadataCleanup(DataFrame dataFrame)
{
    Console.WriteLine("Metadata Clean-up");

    // Parses a price string by skipping leading non-digit characters
    // (e.g. a currency symbol) and float-parsing the rest; -1 on failure.
    var priceCleanup = Udf <string, float>(
        p =>
        {
            if (!string.IsNullOrEmpty(p))
            {
                var index = 0;
                for (var i = 0; i < p.Length; i++)
                {
                    if (char.IsDigit(p[i]))
                    {
                        index = i;
                        break;
                    }
                }
                if (float.TryParse(p.Substring(index), out var result))
                {
                    return(result);
                }
            }
            return(-1f);
        });

    // Converts a parseable date string to Unix seconds (UTC, relative to
    // 1970-01-01); -1 on failure.
    var dateCleanup = Udf <string, double>(
        d =>
        {
            if (!string.IsNullOrEmpty(d) && DateTime.TryParse(d, out var result))
            {
                return((result.ToUniversalTime() - new DateTime(1970, 1, 1)).TotalSeconds);
            }
            return(-1L);
        });

    // Extracts the first comma-grouped number (e.g. "1,234,567") from the
    // rank text and parses it as a long; -1 on failure.
    var rankCleanup = Udf <string, long>(
        r =>
        {
            if (!string.IsNullOrEmpty(r))
            {
                var regex = new Regex(@"\d+(,\d+)*", RegexOptions.Singleline);
                var match = regex.Match(r);
                if (match.Success && long.TryParse(match.Value.Replace(",", string.Empty), out var result))
                {
                    return(result);
                }
            }
            return(-1L);
        });

    // Require the identifying fields, and exclude non-electronics categories.
    dataFrame = dataFrame
        .Filter(
            dataFrame["asin"].IsNotNull()
                .And(dataFrame["title"].IsNotNull())
                .And(dataFrame["main_cat"].IsNotNull())
                .And(dataFrame["brand"].IsNotNull())
                .And(Not(dataFrame["main_cat"].IsIn("Grocery", "Pet Supplies", "Baby",
                    "Books", "Appstore for Android", "Gift Cards"))));

    // Replace the raw text columns with the cleaned numeric ones.
    // NOTE(review): temp names mix underscore ("clean_price") and hyphen
    // ("clean-date", "clean-rank") styles; harmless since they are renamed
    // immediately below, but worth unifying.
    dataFrame = dataFrame
        .WithColumn("clean_price", priceCleanup(dataFrame["price"]))
        .WithColumn("clean-date", dateCleanup(dataFrame["date"]))
        .WithColumn("clean-rank", rankCleanup(dataFrame["rank"]))
        .Drop(dataFrame["price"])
        .Drop(dataFrame["date"])
        .Drop(dataFrame["rank"])
        .WithColumnRenamed("clean_price", "price")
        .WithColumnRenamed("clean-date", "unixTime")
        .WithColumnRenamed("clean-rank", "rank");

    dataFrame.Cache();
    dataFrame.CreateOrReplaceTempView("ElectronicsMetadata");

    Console.WriteLine($"Metadata Count: {dataFrame.Count()}");
    Console.WriteLine("Done");
    Console.WriteLine();
}
// Three-stage Apache access-log filter, each stage a Spark SQL UDF:
// (1) keep lines matching the general log regex, (2) keep lines whose
// address starts with 10, (3) keep lines mentioning spam. Finishes with a
// LINQ count of GET requests on the driver.
public void Run(string[] args)
{
    if (args.Length != 1)
    {
        Console.Error.WriteLine(
            "Usage: Logging <path to Apache User Logs>");
        Environment.Exit(1);
    }

    SparkSession spark = SparkSession
        .Builder()
        .AppName("Apache User Log Processing")
        .GetOrCreate();

    // Read the raw log file and display it.
    DataFrame logsDf = spark.Read().Text(args[0]);
    logsDf.Show();

    // Stage 1: flag each line as a valid log entry or not.
    spark.Udf().Register<string, bool>(
        "GeneralReg", log => Regex.IsMatch(log, s_apacheRx));
    logsDf.CreateOrReplaceTempView("Logs");

    // The projection materializes a "GeneralReg(value)" column we filter on.
    DataFrame generalDf = spark.Sql(
        "SELECT logs.value, GeneralReg(logs.value) FROM Logs");
    generalDf = generalDf.Filter(generalDf["GeneralReg(value)"]);
    generalDf.Show();

    // The schema now carries the extra "GeneralReg(value)" column.
    generalDf.PrintSchema();

    // Stage 2: keep entries starting with 10. Using SQL WHERE instead of
    // DataFrame.Filter avoids creating an extra "IPReg(value)" column.
    spark.Udf().Register<string, bool>(
        "IPReg", log => Regex.IsMatch(log, "^(?=10)"));
    generalDf.CreateOrReplaceTempView("IPLogs");
    DataFrame ipDf = spark.Sql(
        "SELECT iplogs.value FROM IPLogs WHERE IPReg(iplogs.value)");
    ipDf.Show();

    // Stage 3: among those, keep the spam entries.
    spark.Udf().Register<string, bool>(
        "SpamRegEx", log => Regex.IsMatch(log, "\\b(?=spam)\\b"));
    ipDf.CreateOrReplaceTempView("SpamLogs");
    DataFrame spamDf = spark.Sql(
        "SELECT spamlogs.value FROM SpamLogs WHERE SpamRegEx(spamlogs.value)");

    // Count GET requests on the driver with LINQ.
    int numGetRequests = spamDf
        .Collect()
        .Count(r => ContainsGet(r.GetAs<string>("value")));

    Console.WriteLine("Number of GET requests: " + numGetRequests);
    spark.Stop();
}