/// <summary>
/// Parses a JSON string into a <see cref="JvmObjectReference"/> that refers to a
/// <see cref="StructType"/> on the JVM side.
/// </summary>
/// <param name="jvm">JVM bridge to use</param>
/// <param name="json">JSON string to parse</param>
/// <returns>The new JvmObjectReference created from the JSON string</returns>
internal static JvmObjectReference FromJson(IJvmBridge jvm, string json)
{
    object parsed = jvm.CallStaticJavaMethod(
        "org.apache.spark.sql.types.DataType",
        "fromJson",
        json);
    return (JvmObjectReference)parsed;
}
/// <summary>
/// Builds the environment-variable Hashtable handed to the JVM-side PythonFunction:
/// the assembly search path (when set), the worker's Spark version, and the REPL flag.
/// </summary>
/// <param name="jvm">JVM bridge to use</param>
/// <returns>Hashtable of environment variables as a JVM object reference provider</returns>
private static IJvmObjectReferenceProvider CreateEnvVarsForPythonFunction(IJvmBridge jvm)
{
    var envVars = new Hashtable(jvm);

    string searchPath = Environment.GetEnvironmentVariable(
        AssemblySearchPathResolver.AssemblySearchPathsEnvVarName);
    if (!string.IsNullOrEmpty(searchPath))
    {
        envVars.Put(AssemblySearchPathResolver.AssemblySearchPathsEnvVarName, searchPath);
    }

    // DOTNET_WORKER_SPARK_VERSION is used to handle different versions
    // of Spark on the worker.
    envVars.Put("DOTNET_WORKER_SPARK_VERSION", SparkEnvironment.SparkVersion.ToString());

    if (EnvironmentUtils.GetEnvironmentVariableAsBool(Constants.RunningREPLEnvVar))
    {
        envVars.Put(Constants.RunningREPLEnvVar, "true");
    }

    return envVars;
}
/// <summary>
/// Creates a CallbackServer bound to the given JVM bridge, registering a shutdown
/// hook for process exit, and optionally starts it immediately.
/// </summary>
/// <param name="jvm">JVM bridge used by the server</param>
/// <param name="run">When true (default), start the server right away</param>
internal CallbackServer(IJvmBridge jvm, bool run = true)
{
    // Ensure the server is torn down when the hosting process exits.
    AppDomain.CurrentDomain.ProcessExit += (sender, args) => Shutdown();
    _jvm = jvm;

    if (run)
    {
        Run();
    }
}
/// <summary>
/// Construct the JvmThreadPoolGC.
/// </summary>
/// <param name="loggerService">Logger service.</param>
/// <param name="jvmBridge">The JvmBridge used to call JVM methods.</param>
/// <param name="threadGCInterval">The interval to GC finished threads.</param>
public JvmThreadPoolGC(ILoggerService loggerService, IJvmBridge jvmBridge, TimeSpan threadGCInterval)
{
    _loggerService = loggerService;
    _jvmBridge = jvmBridge;
    _threadGCInterval = threadGCInterval;

    // Tracking state; the GC timer is created lazily, so it starts out null.
    _activeThreads = new ConcurrentDictionary<int, Thread>();
    _activeThreadGCTimerLock = new object();
    _activeThreadGCTimer = null;
}
/// <summary>
/// This function may be used to get or instantiate a SparkContext and register it as a
/// singleton object. Because we can only have one active SparkContext per JVM,
/// this is useful when applications may wish to share a SparkContext.
/// </summary>
/// <param name="conf"><see cref="SparkConf"/> that will be used for creating SparkContext
/// </param>
/// <returns>
/// Current SparkContext (or a new one if it wasn't created before the function call)
/// </returns>
public static SparkContext GetOrCreate(SparkConf conf)
{
    IJvmBridge jvm = ((IJvmObjectReferenceProvider)conf).Reference.Jvm;
    var contextRef = (JvmObjectReference)jvm.CallStaticJavaMethod(
        "org.apache.spark.SparkContext",
        "getOrCreate",
        conf);
    return new SparkContext(contextRef);
}
/// <summary>
/// Constructor for the JvmObjectReference class.
/// </summary>
/// <param name="id">Id for the JVM object</param>
/// <param name="jvm">IJvmBridge instance that created the JVM object</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="id"/> is null.</exception>
internal JvmObjectReference(string id, IJvmBridge jvm)
{
    if (id is null)
    {
        // The single-string ArgumentNullException overload treats its argument as
        // the *parameter name*, so the original call misreported the message as a
        // parameter. Use the (paramName, message) overload instead.
        throw new ArgumentNullException(nameof(id), "JvmReferenceId cannot be null.");
    }
    Id = new JvmObjectId(id, jvm);
    _creationTime = DateTime.UtcNow;
}
/// <summary>
/// Creates a "sparkdotnet" temporary directory under Spark's local directory and
/// returns the absolute path of a random file name inside it.
/// </summary>
/// <param name="conf">SparkConf object</param>
/// <returns>Absolute filepath of the created random file</returns>
private string CreateTempFilePath(SparkConf conf)
{
    IJvmBridge jvm = ((IJvmObjectReferenceProvider)conf).Reference.Jvm;

    // Ask Spark for its configured local directory, then nest our own folder in it.
    var sparkLocalDir = (string)jvm.CallStaticJavaMethod(
        "org.apache.spark.util.Utils", "getLocalDir", conf);
    string tempDir = Path.Combine(sparkLocalDir, "sparkdotnet");
    Directory.CreateDirectory(tempDir);

    return Path.Combine(tempDir, Path.GetRandomFileName());
}
/// <summary>
/// Creates the PythonFunction object on the JVM side wrapping the given command bytes.
/// </summary>
/// <param name="jvm">JVM bridge to use</param>
/// <param name="command">Serialized command bytes</param>
/// <returns>JvmObjectReference object to the PythonFunction object</returns>
internal static JvmObjectReference CreatePythonFunction(IJvmBridge jvm, byte[] command)
{
    // One empty JVM ArrayList is reused for both the Python-includes and the
    // broadcast-variables arguments (same object reference, as in the original).
    var emptyList = new ArrayList(jvm);

    return (JvmObjectReference)jvm.CallStaticJavaMethod(
        "org.apache.spark.sql.api.dotnet.SQLUtils",
        "createPythonFunction",
        command,
        CreateEnvVarsForPythonFunction(jvm),
        emptyList, // Python includes
        SparkEnvironment.ConfigurationService.GetWorkerExePath(),
        Versions.CurrentVersion,
        emptyList, // Broadcast variables
        null); // Accumulator
}
/// <summary>
/// Function to create the Broadcast variable (org.apache.spark.broadcast.Broadcast)
/// </summary>
/// <param name="sc">SparkContext object of type <see cref="SparkContext"/></param>
/// <param name="value">Broadcast value of type object</param>
/// <returns>Returns broadcast variable of type <see cref="JvmObjectReference"/></returns>
/// <exception cref="NotSupportedException">Thrown for unsupported Spark versions.</exception>
private JvmObjectReference CreateBroadcast(SparkContext sc, T value)
{
    IJvmBridge jvm = ((IJvmObjectReferenceProvider)sc).Reference.Jvm;
    var javaSparkContext = (JvmObjectReference)jvm.CallStaticJavaMethod(
        "org.apache.spark.api.java.JavaSparkContext",
        "fromSparkContext",
        sc);

    Version version = SparkEnvironment.SparkVersion;
    // Dispatch by (major, minor); Spark 2.4.x and all 3.x share the same path.
    return (version.Major, version.Minor) switch
    {
        (2, 4) => CreateBroadcast_V2_4_X(javaSparkContext, sc, value),
        (3, _) => CreateBroadcast_V2_4_X(javaSparkContext, sc, value),
        _ => throw new NotSupportedException($"Spark {version} not supported.")
    };
    // NOTE(review): the original text was missing the method's closing brace
    // (it ended at the switch expression); restored here to keep braces balanced.
}
/// <summary>
/// Creates the PythonFunction object on the JVM side wrapping the given command bytes.
/// </summary>
/// <param name="jvm">JVM bridge to use</param>
/// <param name="command">Serialized command bytes</param>
/// <returns>JvmObjectReference object to the PythonFunction object</returns>
internal static JvmObjectReference CreatePythonFunction(IJvmBridge jvm, byte[] command)
{
    // Empty containers constructed directly on the JVM; the same ArrayList
    // reference is reused for both includes and broadcast variables.
    JvmObjectReference envVars = jvm.CallConstructor("java.util.Hashtable");
    JvmObjectReference emptyList = jvm.CallConstructor("java.util.ArrayList");

    return (JvmObjectReference)jvm.CallStaticJavaMethod(
        "org.apache.spark.sql.api.dotnet.SQLUtils",
        "createPythonFunction",
        command,
        envVars, // Environment variables
        emptyList, // Python includes
        SparkEnvironment.ConfigurationService.GetWorkerExePath(),
        "1.0",
        emptyList, // Broadcast variables
        null); // Accumulator
}
/// <summary>
/// Builds a JVM-side Hashtable of environment variables for the PythonFunction,
/// populating the assembly search path when the corresponding variable is set.
/// </summary>
/// <param name="jvm">JVM bridge to use</param>
/// <returns>Reference to the JVM Hashtable of environment variables</returns>
private static JvmObjectReference CreateEnvVarsForPythonFunction(IJvmBridge jvm)
{
    JvmObjectReference envVars = jvm.CallConstructor("java.util.Hashtable");

    string searchPath = Environment.GetEnvironmentVariable(
        AssemblySearchPathResolver.AssemblySearchPathsEnvVarName);
    if (!string.IsNullOrEmpty(searchPath))
    {
        jvm.CallNonStaticJavaMethod(
            envVars,
            "put",
            AssemblySearchPathResolver.AssemblySearchPathsEnvVarName,
            searchPath);
    }

    return envVars;
}
/// <summary>
/// Creates a JVM-side UserDefinedPythonFunction from the serialized command and
/// wraps it in a <c>UserDefinedFunction</c>.
/// </summary>
/// <param name="jvm">JVM bridge to use</param>
/// <param name="name">Name of the UDF</param>
/// <param name="command">Serialized command bytes</param>
/// <param name="evalType">Python eval type of the UDF</param>
/// <param name="returnType">JSON representation of the UDF's return DataType</param>
/// <returns>The created UserDefinedFunction</returns>
internal static UserDefinedFunction Create(
    IJvmBridge jvm,
    string name,
    byte[] command,
    UdfUtils.PythonEvalType evalType,
    string returnType)
{
    var pythonFunction = UdfUtils.CreatePythonFunction(jvm, command);
    var dataType = DataType.FromJson(jvm, returnType);
    var udfReference = jvm.CallConstructor(
        "org.apache.spark.sql.execution.python.UserDefinedPythonFunction",
        name,
        pythonFunction,
        dataType,
        (int)evalType,
        true); // udfDeterministic
    return new UserDefinedFunction(udfReference);
}
/// <summary>
/// Creates the PythonFunction object on the JVM side wrapping the given command bytes.
/// </summary>
/// <param name="jvm">JVM bridge to use</param>
/// <param name="command">Serialized command bytes</param>
/// <returns>JvmObjectReference object to the PythonFunction object</returns>
internal static JvmObjectReference CreatePythonFunction(IJvmBridge jvm, byte[] command)
{
    var pythonIncludes = new ArrayList(jvm);

    // Drain all broadcast variables registered so far; the registry is cleared
    // so later functions do not pick them up again.
    var broadcastVariables = new ArrayList(jvm);
    broadcastVariables.AddAll(JvmBroadcastRegistry.GetAll());
    JvmBroadcastRegistry.Clear();

    return (JvmObjectReference)jvm.CallStaticJavaMethod(
        "org.apache.spark.sql.api.dotnet.SQLUtils",
        "createPythonFunction",
        command,
        CreateEnvVarsForPythonFunction(jvm),
        pythonIncludes,
        SparkEnvironment.ConfigurationService.GetWorkerExePath(),
        // Used to check the compatibility of UDFs between the driver and worker.
        AssemblyInfoProvider.MicrosoftSparkAssemblyInfo().AssemblyVersion,
        broadcastVariables,
        null); // Accumulator
}
/// <summary>
/// Builds the environment-variable Hashtable for the PythonFunction, combining the
/// configured assembly search path with the SparkFiles root directory.
/// </summary>
/// <param name="jvm">JVM bridge to use</param>
/// <returns>Hashtable of environment variables as a JVM object reference provider</returns>
private static IJvmObjectReferenceProvider CreateEnvVarsForPythonFunction(IJvmBridge jvm)
{
    var envVars = new Hashtable(jvm);

    // Candidate path components; blanks are filtered out before joining.
    var pathComponents = new[]
    {
        Environment.GetEnvironmentVariable(
            AssemblySearchPathResolver.AssemblySearchPathsEnvVarName),
        SparkFiles.GetRootDirectory()
    };
    string assemblySearchPath =
        string.Join(",", pathComponents.Where(s => !string.IsNullOrWhiteSpace(s)));

    if (!string.IsNullOrEmpty(assemblySearchPath))
    {
        envVars.Put(
            AssemblySearchPathResolver.AssemblySearchPathsEnvVarName,
            assemblySearchPath);
    }

    return envVars;
}
/// <summary>
/// Builds the environment-variable Hashtable for the PythonFunction: the combined
/// assembly search path (env var + SparkFiles root) and the worker's Spark version.
/// </summary>
/// <param name="jvm">JVM bridge to use</param>
/// <returns>Hashtable of environment variables as a JVM object reference provider</returns>
private static IJvmObjectReferenceProvider CreateEnvVarsForPythonFunction(IJvmBridge jvm)
{
    var envVars = new Hashtable(jvm);

    // Join the configured search path with the SparkFiles root, skipping blanks.
    string searchPath = string.Join(
        ",",
        new[]
        {
            Environment.GetEnvironmentVariable(
                AssemblySearchPathResolver.AssemblySearchPathsEnvVarName),
            SparkFiles.GetRootDirectory()
        }.Where(s => !string.IsNullOrWhiteSpace(s)));

    if (!string.IsNullOrEmpty(searchPath))
    {
        envVars.Put(AssemblySearchPathResolver.AssemblySearchPathsEnvVarName, searchPath);
    }

    // DOTNET_WORKER_SPARK_VERSION is used to handle different versions of Spark on the worker.
    envVars.Put("DOTNET_WORKER_SPARK_VERSION", SparkEnvironment.SparkVersion.ToString());

    return envVars;
}
/// <summary>
/// Creates a JVM-side UserDefinedPythonFunction from the serialized command and wraps
/// it in a <c>UserDefinedFunction</c>, using the global JVM bridge.
/// </summary>
/// <param name="name">Name of the UDF</param>
/// <param name="command">Serialized command bytes</param>
/// <param name="evalType">Python eval type of the UDF</param>
/// <param name="returnType">Return type name, wrapped in quotes to form JSON</param>
/// <returns>The created UserDefinedFunction</returns>
internal static UserDefinedFunction Create(
    string name,
    byte[] command,
    UdfUtils.PythonEvalType evalType,
    string returnType)
{
    IJvmBridge jvm = SparkEnvironment.JvmBridge;

    // Empty JVM-side containers; the same ArrayList reference is reused for both
    // the Python-includes and broadcast-variables arguments.
    JvmObjectReference envVars = jvm.CallConstructor("java.util.Hashtable");
    JvmObjectReference emptyList = jvm.CallConstructor("java.util.ArrayList");

    var dataType = (JvmObjectReference)jvm.CallStaticJavaMethod(
        "org.apache.spark.sql.types.DataType",
        "fromJson",
        $"\"{returnType}\"");

    var pythonFunction = (JvmObjectReference)jvm.CallStaticJavaMethod(
        "org.apache.spark.sql.api.dotnet.SQLUtils",
        "createPythonFunction",
        command,
        envVars, // Environment variables
        emptyList, // Python includes
        SparkEnvironment.ConfigurationService.GetWorkerExePath(),
        "1.0",
        emptyList, // Broadcast variables
        null); // Accumulator

    JvmObjectReference udfReference = jvm.CallConstructor(
        "org.apache.spark.sql.execution.python.UserDefinedPythonFunction",
        name,
        pythonFunction,
        dataType,
        (int)evalType,
        true); // udfDeterministic
    return new UserDefinedFunction(udfReference);
}
/// <summary>
/// Creates a JVM-side UserDefinedPythonFunction from the serialized command and
/// wraps it in a <c>UserDefinedFunction</c>.
/// </summary>
/// <param name="jvm">JVM bridge to use</param>
/// <param name="name">Name of the UDF</param>
/// <param name="command">Serialized command bytes</param>
/// <param name="evalType">Python eval type of the UDF</param>
/// <param name="returnType">JSON representation of the UDF's return DataType</param>
/// <returns>The created UserDefinedFunction</returns>
internal static UserDefinedFunction Create(
    IJvmBridge jvm,
    string name,
    byte[] command,
    UdfUtils.PythonEvalType evalType,
    string returnType)
{
    var pythonFunction = UdfUtils.CreatePythonFunction(jvm, command);
    // Pass returnType directly: the original wrapped it in the interpolated
    // string $"{returnType}", which is a no-op that only allocated a copy.
    var dataType = (JvmObjectReference)jvm.CallStaticJavaMethod(
        "org.apache.spark.sql.types.DataType",
        "fromJson",
        returnType);
    return new UserDefinedFunction(
        jvm.CallConstructor(
            "org.apache.spark.sql.execution.python.UserDefinedPythonFunction",
            name,
            pythonFunction,
            dataType,
            (int)evalType,
            true)); // udfDeterministic
}
/// <summary> /// Create a <c>java.util.Properties</c> JVM object /// </summary> /// <param name="jvm">JVM bridge to use</param> internal Properties(IJvmBridge jvm) =>
/// <summary>
/// Creates a Hyperspace instance backed by a JVM-side Hyperspace object built
/// from the given Spark session.
/// </summary>
/// <param name="spark">Spark session the Hyperspace object is attached to</param>
public Hyperspace(SparkSession spark)
{
    _spark = spark;

    // Reach the JVM bridge through the session's object reference, then
    // construct the JVM-side Hyperspace wrapper.
    IJvmBridge bridge = ((IJvmObjectReferenceProvider)spark).Reference.Jvm;
    _jvmBridge = bridge;
    _jvmObject = bridge.CallConstructor(s_hyperspaceClassName, spark);
}
/// <summary>
/// Test-class constructor: captures the shared Spark session and its JVM bridge
/// from the fixture and sets up a logger for this test class.
/// </summary>
/// <param name="fixture">Shared Spark fixture supplied by the test framework</param>
public JvmThreadPoolGCTests(SparkFixture fixture)
{
    _spark = fixture.Spark;
    _jvmBridge = _spark.Reference.Jvm;
    _loggerService = LoggerServiceFactory.GetLogger(typeof(JvmThreadPoolGCTests));
}
/// <summary> /// Create a <c>java.util.Hashtable</c> JVM object /// </summary> /// <param name="jvm">JVM bridge to use</param> internal Hashtable(IJvmBridge jvm) =>
/// <summary>
/// Test-class constructor: captures the shared Spark session from the fixture,
/// resolves its JVM bridge via the reference-provider interface, and sets up
/// a logger for this test class.
/// </summary>
/// <param name="fixture">Shared Spark fixture supplied by the test framework</param>
public JvmThreadPoolGCTests(SparkFixture fixture)
{
    _spark = fixture.Spark;
    var provider = (IJvmObjectReferenceProvider)_spark;
    _jvmBridge = provider.Reference.Jvm;
    _loggerService = LoggerServiceFactory.GetLogger(typeof(JvmThreadPoolGCTests));
}
/// <summary>
/// Creates a callback handler that invokes <paramref name="func"/> for each
/// foreachBatch callback arriving over the JVM bridge.
/// </summary>
/// <param name="jvm">JVM bridge the callbacks come from</param>
/// <param name="func">User function receiving the batch DataFrame and batch id</param>
internal ForeachBatchCallbackHandler(IJvmBridge jvm, Action<DataFrame, long> func)
{
    _func = func;
    _jvm = jvm;
}
/// <summary>
/// Constructor for JvmObjectId class.
/// </summary>
/// <param name="id">Unique identifier</param>
/// <param name="jvm">JVM bridge object</param>
internal JvmObjectId(string id, IJvmBridge jvm)
{
    Jvm = jvm;
    Id = id;
}
/// <summary>
/// Creates a Hyperspace instance backed by a JVM-side Hyperspace object built
/// from the given Spark session.
/// </summary>
/// <param name="spark">Spark session the Hyperspace object is attached to</param>
public Hyperspace(SparkSession spark)
{
    _spark = spark;

    IJvmBridge bridge = spark.Reference.Jvm;
    _jvmBridge = bridge;
    Reference = bridge.CallConstructor(s_hyperspaceClassName, spark);
}
/// <summary>
/// Create a <c>java.util.ArrayList</c> JVM object
/// </summary>
/// <param name="jvm">JVM bridge to use</param>
internal ArrayList(IJvmBridge jvm) =>
    Reference = jvm.CallConstructor("java.util.ArrayList");
/// <summary> /// Create a <c>java.util.HashMap</c> JVM object /// </summary> /// <param name="jvm">JVM bridge to use</param> internal HashMap(IJvmBridge jvm) =>
/// <summary>
/// Launches the Spark driver process for the E2E test run, waits until it reports
/// readiness, and then creates the shared SparkSession and JVM bridge.
/// </summary>
/// <exception cref="Exception">
/// Thrown when the worker-directory environment variable is unset, or when the
/// driver process exits before signaling readiness.
/// </exception>
public SparkFixture()
{
    // The worker directory must be set for the Microsoft.Spark.Worker executable.
    if (string.IsNullOrEmpty(
        Environment.GetEnvironmentVariable(EnvironmentVariableNames.WorkerDir)))
    {
        throw new Exception(
            $"Environment variable '{EnvironmentVariableNames.WorkerDir}' must be set.");
    }

    BuildSparkCmd(out var filename, out var args);

    // Configure the process using the StartInfo properties.
    _process.StartInfo.FileName = filename;
    _process.StartInfo.Arguments = args;
    // UseShellExecute defaults to true in .NET Framework,
    // but defaults to false in .NET Core. To support both, set it
    // to false which is required for stream redirection.
    _process.StartInfo.UseShellExecute = false;
    _process.StartInfo.RedirectStandardInput = true;
    _process.StartInfo.RedirectStandardOutput = true;
    _process.StartInfo.RedirectStandardError = true;

    bool isSparkReady = false;
    _process.OutputDataReceived += (sender, arguments) =>
    {
        // Scala-side driver for .NET emits the following message after it is
        // launched and ready to accept connections.
        // arguments.Data is null when the output stream closes, so guard it to
        // avoid a NullReferenceException inside the event handler.
        if (!isSparkReady &&
            (arguments.Data != null) &&
            arguments.Data.Contains("Backend running debug mode"))
        {
            isSparkReady = true;
        }
    };

    _process.Start();
    _process.BeginErrorReadLine();
    _process.BeginOutputReadLine();

    // Poll until the driver reports readiness or the process dies.
    bool processExited = false;
    while (!isSparkReady && !processExited)
    {
        processExited = _process.WaitForExit(500);
    }

    if (processExited)
    {
        _process.Dispose();

        // The process should not have been exited. Report the full command line
        // (the original message interpolated a stray "(unknown)" placeholder
        // where the filename belongs).
        throw new Exception(
            $"Process exited prematurely with '{filename} {args}'.");
    }

    Spark = SparkSession
        .Builder()
        // Lower the shuffle partitions to speed up groupBy() operations.
        .Config("spark.sql.shuffle.partitions", "3")
        .Config("spark.ui.enabled", false)
        .Config("spark.ui.showConsoleProgress", false)
        .AppName("Microsoft.Spark.E2ETest")
        .GetOrCreate();

    Spark.SparkContext.SetLogLevel(DefaultLogLevel);

    Jvm = Spark.Reference.Jvm;
}
/// <summary>
/// Create a <c>java.util.ArrayList</c> JVM object
/// </summary>
/// <param name="jvm">JVM bridge to use</param>
internal ArrayList(IJvmBridge jvm) =>
    _jvmObject = jvm.CallConstructor("java.util.ArrayList");