/// <summary>
/// Runs one "remote" job: builds one <c>RemoteBlockInfo</c> (and worker thread) per
/// <c>DFS_IO</c> entry of <paramref name="cfgj"/>, runs all of them, then records the
/// produced output files in the DFS configuration.
/// </summary>
/// <remarks>
/// Sequence, as implemented below:
/// 1. Load the DFS configuration and split its slave list; when replication is above 1,
///    unhealthy machines are filtered out first.
/// 2. Write a per-job <c>slaveconfig.j{sjid}.xml</c> to every machine (best effort).
/// 3. Validate every block's output writers and input readers, then start and join
///    all block threads.
/// 4. Under the DFS mutex, add each output file to the configuration under a temporary
///    ".$...$replicating-" name, run <c>ReplicationPhase</c>, and finally rename the
///    temporary entries to their requested names.
/// 5. Always delete the per-job slave configuration files; check the user logs unless the
///    thread is being aborted.
/// Validation failures are reported on <c>Console.Error</c> and end the method early.
/// </remarks>
/// <param name="cfgj">Job configuration; its <c>IOSettings.DFS_IOs</c> entries define the blocks.</param>
/// <param name="ExecArgs">Arguments stored on every block for the remote code.</param>
/// <param name="verbose">When true, progress information is written to the console.</param>
/// <param name="verbosereplication">Passed through to <c>ReplicationPhase</c>.</param>
/// <exception cref="Exception">
/// The slave list is empty, too few healthy machines remain, or <c>DFS_IO/Host</c> is
/// specified for only some of the blocks.
/// </exception>
/// <exception cref="KeyNotFoundException">
/// A block names an explicit host that is not in the list of machines used for the job.
/// </exception>
public static void ExecOneRemote(SourceCode.Job cfgj, string[] ExecArgs, bool verbose, bool verbosereplication)
{
    if (verbose)
    {
        // The format string prints only arguments {0} and {2}; the millisecond argument is unused.
        Console.WriteLine("[{0}] [Remote: {2}]", System.DateTime.Now.ToString(), System.DateTime.Now.Millisecond, cfgj.NarrativeName);
    }

    string logname = Surrogate.SafeTextPath(cfgj.NarrativeName) + "_" + Guid.NewGuid().ToString() + ".j" + sjid + "_log.txt";

    // Could provide BlockID here, which is just the n-th DFS_IO entry.
    dfs dc = LoadDfsConfig();

    string[] slaves = dc.Slaves.SlaveList.Split(',', ';');
    if (dc.Slaves.SlaveList.Length == 0 || slaves.Length < 1)
    {
        throw new Exception("SlaveList expected in " + dfs.DFSXMLNAME);
    }
    if (dc.Replication > 1)
    {
        string[] slavesbefore = slaves;
        slaves = ExcludeUnhealthySlaveMachines(slaves, true).ToArray();
        if (slavesbefore.Length - slaves.Length >= dc.Replication)
        {
            throw new Exception("Not enough healthy machines to run job (hit replication count)");
        }
        if (slaves.Length == 0)
        {
            // Without this guard the round-robin start index below ("% slaves.Length")
            // fails with an unhelpful DivideByZeroException.
            throw new Exception("Not enough healthy machines to run job (no healthy machines remain)");
        }
    }

    if (cfgj.IOSettings.DFS_IO_Multis != null)
    {
        cfgj.ExpandDFSIOMultis(slaves.Length, MySpace.DataMining.DistributedObjects.MemoryUtils.NumberOfProcessors);
    }

    // Upper-cased host name -> index into slaves, used to resolve explicit DFS_IO/Host values.
    Dictionary<string, int> slaveIDs = new Dictionary<string, int>();
    for (int si = 0; si < slaves.Length; si++)
    {
        slaveIDs.Add(slaves[si].ToUpper(), si);
    }

    bool aborting = false;
    try
    {
        List<RemoteBlockInfo> blocks = new List<RemoteBlockInfo>(cfgj.IOSettings.DFS_IOs.Length);
        if (verbose)
        {
            Console.WriteLine("{0} processes on {1} machines:", cfgj.IOSettings.DFS_IOs.Length, slaves.Length);
        }

        // A machine whose network path cannot be resolved is reported and skipped.
        List<string> outputdfsdirs = new List<string>(slaves.Length);
        for (int i = 0; i < slaves.Length; i++)
        {
            try
            {
                outputdfsdirs.Add(NetworkPathForHost(slaves[i]));
            }
            catch (Exception e)
            {
                Console.Error.WriteLine(" {0}", e.Message);
            }
        }

        // Serialize the whole DFS configuration and keep only its <slave> element.
        string slaveconfigxml = "";
        {
            System.Xml.XmlDocument pdoc = new System.Xml.XmlDocument();
            using (System.IO.MemoryStream ms = new System.IO.MemoryStream())
            {
                System.Xml.Serialization.XmlSerializer xs = new System.Xml.Serialization.XmlSerializer(typeof(dfs));
                xs.Serialize(ms, dc);
                ms.Seek(0, System.IO.SeekOrigin.Begin);
                pdoc.Load(ms);
            }
            slaveconfigxml = pdoc.DocumentElement.SelectSingleNode("./slave").OuterXml;
        }

        {
            // Temporary:
            for (int si = 0; si < slaves.Length; si++)
            {
                System.Threading.Mutex m = new System.Threading.Mutex(false, "AEL_SC_" + slaves[si]);
                try
                {
                    m.WaitOne();
                }
                catch (System.Threading.AbandonedMutexException)
                {
                    // The waiter still owns an abandoned mutex, so it is released below as usual.
                }
                try
                {
                    System.IO.File.WriteAllText(NetworkPathForHost(slaves[si]) + @"\slaveconfig.j" + sjid + ".xml", slaveconfigxml);
                }
                catch
                {
                    // Best effort: a machine that cannot be written to is skipped silently.
                }
                finally
                {
                    m.ReleaseMutex();
                    m.Close();
                }
            }
        }

        // Blocks without an explicit host are assigned round-robin from a pseudo-random start.
        int nextslave = (new Random(DateTime.Now.Millisecond / 2 + System.Diagnostics.Process.GetCurrentProcess().Id / 2)).Next() % slaves.Length;
        int hosttypes = 0; // 0 = not decided yet, 1 = no block names a host, 2 = every block names a host.
        List<int> outputrecordlengths = new List<int>();
        List<int> inputrecordlengths = new List<int>();

        // NOTE(review): some error paths below call SetFailure() before returning and some
        // do not; confirm whether that difference is intended.
        for (int BlockID = 0; BlockID < cfgj.IOSettings.DFS_IOs.Length; BlockID++)
        {
            int slaveHostID = 0;
            RemoteBlockInfo bi = new RemoteBlockInfo();
            bi.sampledist = dc.DataNodeBaseSize / dc.DataNodeSamples;
            bi.BlockID = BlockID;
            bi.blockcount = cfgj.IOSettings.DFS_IOs.Length;
            if (string.IsNullOrEmpty(cfgj.IOSettings.DFS_IOs[BlockID].Host))
            {
                if (0 != hosttypes && 1 != hosttypes)
                {
                    throw new Exception("DFS_IO/Host tag must be specified for all or none");
                }
                hosttypes = 1;
                bi.SlaveHost = slaves[nextslave];
                slaveHostID = nextslave;
                bi.explicithost = false;
            }
            else
            {
                if (0 != hosttypes && 2 != hosttypes)
                {
                    throw new Exception("DFS_IO/Host tag must be specified for all or none");
                }
                hosttypes = 2;
                bi.SlaveHost = cfgj.IOSettings.DFS_IOs[BlockID].Host;
                if (!slaveIDs.TryGetValue(bi.SlaveHost.ToUpper(), out slaveHostID))
                {
                    // Same exception type the dictionary indexer raised, but naming the host.
                    throw new KeyNotFoundException("DFS_IO/Host '" + bi.SlaveHost + "' is not in the list of machines available for this job");
                }
                bi.explicithost = true;
            }
            bi.ExecArgs = ExecArgs;
            if (++nextslave >= slaves.Length)
            {
                nextslave = 0;
            }
            bi.logname = logname;
            bi.outputdfsdirs = outputdfsdirs;
            bi.slaves = slaves;
            bi.baseoutputfilesize = dc.DataNodeBaseSize;
            bi.cfgj = cfgj;
            bi.DFSWriter = cfgj.IOSettings.DFS_IOs[BlockID].DFSWriter.Trim();
            bi.Meta = cfgj.IOSettings.DFS_IOs[BlockID].Meta;

            // Output writers: a ';'-separated list, each optionally suffixed with "@<record size>".
            List<string> dfswriters = new List<string>();
            if (bi.DFSWriter.Length > 0)
            {
                string[] writers = bi.DFSWriter.Split(';');
                for (int wi = 0; wi < writers.Length; wi++)
                {
                    string thiswriter = writers[wi].Trim();
                    if (thiswriter.Length == 0)
                    {
                        continue;
                    }
                    int ic = thiswriter.IndexOf('@');
                    int reclen = -1;
                    if (-1 != ic)
                    {
                        try
                        {
                            reclen = Surrogate.GetRecordSize(thiswriter.Substring(ic + 1));
                            thiswriter = thiswriter.Substring(0, ic);
                        }
                        catch (FormatException e)
                        {
                            Console.Error.WriteLine("Error: remote output record length error: {0} ({1})", thiswriter, e.Message);
                            SetFailure();
                            return;
                        }
                        catch (OverflowException e)
                        {
                            Console.Error.WriteLine("Error: remote output record length error: {0} ({1})", thiswriter, e.Message);
                            SetFailure();
                            return;
                        }
                    }
                    string outfn = thiswriter;
                    if (outfn.StartsWith(@"dfs://", StringComparison.OrdinalIgnoreCase))
                    {
                        outfn = outfn.Substring(6);
                    }
                    string reason = "";
                    if (dfs.IsBadFilename(outfn, out reason))
                    {
                        Console.Error.WriteLine("Invalid output file: {0}", reason);
                        return;
                    }
                    if (null != DfsFindAny(dc, outfn))
                    {
                        Console.Error.WriteLine("Error: output file already exists in DFS: {0}", outfn);
                        return;
                    }
                    dfswriters.Add(thiswriter);
                    outputrecordlengths.Add(reclen);
                }
            }
            else
            {
                // No writer configured: keep a placeholder so index 0 exists for this block.
                dfswriters.Add("");
                outputrecordlengths.Add(-1);
            }
            bi.DFSWriters = dfswriters;
            bi.verbose = verbose;

            bi.rem = new MySpace.DataMining.DistributedObjects5.Remote(cfgj.NarrativeName + "_remote");
            bi.rem.CookRetries = dc.slave.CookRetries;
            bi.rem.CookTimeout = dc.slave.CookTimeout;
            bi.rem.DfsSampleDistance = bi.sampledist;
            bi.rem.CompressFileOutput = dc.slave.CompressDfsChunks;
            bi.rem.LocalCompile = true;
            bi.rem.OutputStartingPoint = slaveHostID;
            bi.rem.CompilerOptions = cfgj.IOSettings.CompilerOptions;
            bi.rem.CompilerVersion = cfgj.IOSettings.CompilerVersion;
            if (cfgj.AssemblyReferencesCount > 0)
            {
                cfgj.AddAssemblyReferences(bi.rem.CompilerAssemblyReferences, Surrogate.NetworkPathForHost(dc.Slaves.GetFirstSlave()));
            }
            if (cfgj.OpenCVExtension != null)
            {
                bi.rem.AddOpenCVExtension();
            }
            if (cfgj.MemCache != null)
            {
                bi.rem.AddMemCacheExtension();
            }
            if (cfgj.Unsafe != null)
            {
                bi.rem.AddUnsafe();
            }

            {
                // Input readers: resolve every input path to its DFS file nodes.
                List<dfs.DfsFile.FileNode> nodes = new List<dfs.DfsFile.FileNode>();
                List<string> mapfileswithnodes = null;
                List<int> nodesoffsets = null;
                IList<string> mapfiles = SplitInputPaths(dc, cfgj.IOSettings.DFS_IOs[BlockID].DFSReader);
                if (mapfiles.Count > 0)
                {
                    mapfileswithnodes = new List<string>(mapfiles.Count);
                    nodesoffsets = new List<int>(mapfiles.Count);
                }
                for (int i = 0; i < mapfiles.Count; i++)
                {
                    string dp = mapfiles[i].Trim();
                    int inreclen = -1;
                    if (0 != dp.Length) // Allow empty entry where input isn't wanted.
                    {
                        if (dp.StartsWith("dfs://", StringComparison.OrdinalIgnoreCase))
                        {
                            dp = dp.Substring(6);
                        }
                        {
                            int ic = dp.IndexOf('@');
                            if (-1 != ic)
                            {
                                try
                                {
                                    inreclen = Surrogate.GetRecordSize(dp.Substring(ic + 1));
                                    dp = dp.Substring(0, ic);
                                }
                                catch (FormatException e)
                                {
                                    Console.Error.WriteLine("Error: remote input record length error: {0} ({1})", dp, e.Message);
                                    SetFailure();
                                    return;
                                }
                                catch (OverflowException e)
                                {
                                    Console.Error.WriteLine("Error: remote input record length error: {0} ({1})", dp, e.Message);
                                    SetFailure();
                                    return;
                                }
                            }
                        }
                        dfs.DfsFile df;
                        if (inreclen > 0 || inreclen == -2)
                        {
                            df = DfsFind(dc, dp, DfsFileTypes.BINARY_RECT);
                            if (null != df && inreclen != df.RecordLength)
                            {
                                Console.Error.WriteLine("Error: remote input file does not have expected record length of {0}: {1}@{2}", inreclen, dp, df.RecordLength);
                                SetFailure();
                                return;
                            }
                        }
                        else
                        {
                            df = DfsFind(dc, dp);
                        }
                        if (null == df)
                        {
                            Console.Error.WriteLine("Remote input file not found in DFS: {0}", dp);
                            return;
                        }
                        if (df.Nodes.Count > 0)
                        {
                            mapfileswithnodes.Add(dp);
                            nodesoffsets.Add(nodes.Count);
                            inputrecordlengths.Add(inreclen);
                            nodes.AddRange(df.Nodes);
                        }
                    }
                }
                bi.dfsinputpaths = new List<string>(nodes.Count);
                dfs.MapNodesToNetworkStarPaths(nodes, bi.dfsinputpaths);
                bi.dfsinputfilenames = mapfileswithnodes;
                bi.dfsinputnodesoffsets = nodesoffsets;
            }

            blocks.Add(bi);

            bi.thread = new System.Threading.Thread(new System.Threading.ThreadStart(bi.threadproc));
            bi.thread.Name = "RemoteJobBlock" + bi.BlockID;
        }

        MySpace.DataMining.DistributedObjects.StaticGlobals.DSpace_InputRecordLength = inputrecordlengths.Count > 0 ? inputrecordlengths[0] : -1;
        MySpace.DataMining.DistributedObjects.StaticGlobals.DSpace_OutputRecordLength = outputrecordlengths.Count > 0 ? outputrecordlengths[0] : -1;

        // Need to start threads separately due to StaticGlobals being updated.
        for (int BlockID = 0; BlockID < cfgj.IOSettings.DFS_IOs.Length; BlockID++)
        {
            RemoteBlockInfo bi = blocks[BlockID];
            bi.rem.InputRecordLength = MySpace.DataMining.DistributedObjects.StaticGlobals.DSpace_InputRecordLength;
            bi.rem.InputRecordLengths = inputrecordlengths;
            bi.rem.OutputRecordLength = MySpace.DataMining.DistributedObjects.StaticGlobals.DSpace_OutputRecordLength;
            bi.rem.OutputRecordLengths = outputrecordlengths;
            AELight_StartTraceThread(bi.thread);
        }

        for (int BlockID = 0; BlockID < blocks.Count; BlockID++)
        {
            AELight_JoinTraceThread(blocks[BlockID].thread);
            blocks[BlockID].rem.Close();

            if (blocks[BlockID].blockfail)
            {
                Console.Error.WriteLine("BlockID {0} on host '{1}' did not complete successfully", BlockID, (blocks[BlockID].SlaveHost != null) ? blocks[BlockID].SlaveHost : "<null>");
            }
        }

        List<string> dfsnames = new List<string>();
        List<string> dfsnamesreplicating = new List<string>();
        // Reload DFS config to make sure changes since starting get rolled in, and make sure the output file wasn't created in that time...
        using (LockDfsMutex()) // Needed: change between load & save should be atomic.
        {
            dc = LoadDfsConfig();
            for (int BlockID = 0; BlockID < blocks.Count; BlockID++)
            {
                if (blocks[BlockID].blockfail)
                {
                    continue;
                }

                bool anyoutput = false;
                bool nonemptyoutputpath = false;
                for (int oi = 0; oi < blocks[BlockID].DFSWriters.Count; oi++)
                {
                    string dfswriter = blocks[BlockID].DFSWriters[oi];

                    if (string.IsNullOrEmpty(dfswriter))
                    {
                        if (blocks[BlockID].outputdfsnodeses[oi].Count > 0)
                        {
                            Console.Error.WriteLine("Output data detected with no DFSWriter specified");
                        }
                        continue;
                    }

                    if (null != DfsFind(dc, dfswriter))
                    {
                        Console.Error.WriteLine("Error: output file was created during job: {0}", dfswriter);
                        continue;
                    }

                    string dfspath = dfswriter;
                    nonemptyoutputpath = true;
                    dfs.DfsFile df = new dfs.DfsFile();
                    // NOTE(review): OutputRecordLengths is the list shared by all blocks but is
                    // indexed here with this block's writer index - confirm that is intended
                    // when blocks declare different writers.
                    if (blocks[BlockID].rem.OutputRecordLengths[oi] > 0)
                    {
                        df.XFileType = DfsFileTypes.BINARY_RECT + "@" + blocks[BlockID].rem.OutputRecordLengths[oi].ToString();
                    }
                    else if (blocks[BlockID].rem.OutputRecordLengths[oi] == -2)
                    {
                        df.XFileType = DfsFileTypes.BINARY_RECT + "@?";
                    }
                    df.Nodes = new List<dfs.DfsFile.FileNode>();
                    df.Size = -1; // Preset
                    if (dfspath.StartsWith("dfs://", StringComparison.OrdinalIgnoreCase))
                    {
                        dfspath = dfspath.Substring(6);
                    }
                    string dfspathreplicating = ".$" + dfspath + ".$replicating-" + Guid.NewGuid().ToString();
                    if (null != dc.FindAny(dfspathreplicating))
                    {
                        Console.Error.WriteLine("Error: file exists: file put into DFS from another location during job: " + dfspathreplicating);
                        SetFailure();
                        return;
                    }
                    dfsnames.Add(dfspath);
                    dfsnamesreplicating.Add(dfspathreplicating);
                    df.Name = dfspathreplicating;

                    bool anybad = false;
                    long totalsize = 0;
                    {
                        int i = BlockID;
                        for (int j = 0; j < blocks[i].outputdfsnodeses[oi].Count; j++)
                        {
                            dfs.DfsFile.FileNode fn = new dfs.DfsFile.FileNode();
                            fn.Host = blocks[i].slaves[(blocks[i].rem.OutputStartingPoint + j) % blocks[i].slaves.Count];
                            fn.Name = blocks[i].outputdfsnodeses[oi][j];
                            df.Nodes.Add(fn);
                            fn.Length = -1; // Preset
                            fn.Position = -1; // Preset
                            if (anybad)
                            {
                                continue;
                            }
                            // Validate before indexing the size list: when it is shorter than
                            // the node list, reading it first would throw instead of marking
                            // the file as bad.
                            if (blocks[i].outputdfsnodeses[oi].Count != blocks[i].outputsizeses[oi].Count)
                            {
                                anybad = true;
                                continue;
                            }
                            fn.Length = blocks[i].outputsizeses[oi][j];
                            fn.Position = totalsize; // Position must be set before totalsize updated!
                            totalsize += blocks[i].outputsizeses[oi][j];
                        }
                    }
                    if (!anybad)
                    {
                        df.Size = totalsize;
                    }
                    if (totalsize != 0)
                    {
                        anyoutput = true;
                    }
                    // Always add the file to DFS, even if blank!
                    dc.Files.Add(df);
                }

                if (!anyoutput && verbose && nonemptyoutputpath)
                {
                    Console.Write(" (no DFS output) ");
                    ConsoleFlush();
                }
            }

            UpdateDfsXml(dc);
        }

        ReplicationPhase(verbosereplication, blocks.Count, slaves, dfsnamesreplicating);

        using (LockDfsMutex()) // Needed: change between load & save should be atomic.
        {
            dc = LoadDfsConfig(); // Reload in case of change or user modifications.
            for (int nfile = 0; nfile < dfsnames.Count; nfile++)
            {
                string dfspath = dfsnames[nfile];
                string dfspathreplicating = dfsnamesreplicating[nfile];
                dfs.DfsFile dfu = dc.FindAny(dfspathreplicating);
                if (null != dfu)
                {
                    if (null != DfsFindAny(dc, dfspath))
                    {
                        Console.Error.WriteLine("Error: file exists: file put into DFS from another location during job");
                        SetFailure();
                        continue;
                    }
                    // Give the temporary entry its requested name.
                    dfu.Name = dfspath;
                }
            }
            UpdateDfsXml(dc);
        }

        if (verbose)
        {
            Console.WriteLine(); // Line after output chars.
        }
    }
    catch (System.Threading.ThreadAbortException)
    {
        aborting = true;
    }
    finally
    {
        // Remove the per-job slave configuration from every machine (best effort).
        for (int si = 0; si < slaves.Length; si++)
        {
            System.Threading.Mutex m = new System.Threading.Mutex(false, "AEL_SC_" + slaves[si]);
            try
            {
                m.WaitOne();
            }
            catch (System.Threading.AbandonedMutexException)
            {
                // The waiter still owns an abandoned mutex, so it is released below as usual.
            }
            try
            {
                System.IO.File.Delete(NetworkPathForHost(slaves[si]) + @"\slaveconfig.j" + sjid + ".xml");
            }
            catch
            {
                // Best effort: a file that cannot be deleted is left behind.
            }
            finally
            {
                m.ReleaseMutex();
                m.Close();
            }
        }

        if (!aborting)
        {
            CheckUserLogs(slaves, logname);
        }
    }

    if (verbose)
    {
        Console.WriteLine();
        Console.WriteLine("[{0}] Done", System.DateTime.Now.ToString(), System.DateTime.Now.Millisecond);
        for (int i = 0; i < cfgj.IOSettings.DFS_IOs.Length; i++)
        {
            Console.WriteLine("Output: {0}", cfgj.IOSettings.DFS_IOs[i].DFSWriter);
        }
    }
}
/// <summary>
/// Runs one "remote" job: builds one <c>RemoteBlockInfo</c> (and worker thread) per
/// <c>DFS_IO</c> entry of <paramref name="cfgj"/>, runs all of them, then records the
/// produced output files in the DFS configuration.
/// </summary>
/// <remarks>
/// Sequence, as implemented below:
/// 1. Load the DFS configuration and split its slave list; when replication is above 1,
///    unhealthy machines are filtered out first.
/// 2. Write a per-job <c>slaveconfig.j{sjid}.xml</c> to every machine (best effort).
/// 3. Validate every block's output writers and input readers, then start and join
///    all block threads.
/// 4. Under the DFS mutex, add each output file to the configuration under a temporary
///    ".$...$replicating-" name, run <c>ReplicationPhase</c>, and finally rename the
///    temporary entries to their requested names.
/// 5. Always delete the per-job slave configuration files; check the user logs unless the
///    thread is being aborted.
/// Validation failures are reported on <c>Console.Error</c> and end the method early.
/// </remarks>
/// <param name="cfgj">Job configuration; its <c>IOSettings.DFS_IOs</c> entries define the blocks.</param>
/// <param name="ExecArgs">Arguments stored on every block for the remote code.</param>
/// <param name="verbose">When true, progress information is written to the console.</param>
/// <param name="verbosereplication">Passed through to <c>ReplicationPhase</c>.</param>
/// <exception cref="Exception">
/// The slave list is empty, too few healthy machines remain, or <c>DFS_IO/Host</c> is
/// specified for only some of the blocks.
/// </exception>
/// <exception cref="KeyNotFoundException">
/// A block names an explicit host that is not in the list of machines used for the job.
/// </exception>
public static void ExecOneRemote(SourceCode.Job cfgj, string[] ExecArgs, bool verbose, bool verbosereplication)
{
    if (verbose)
    {
        // The format string prints only arguments {0} and {2}; the millisecond argument is unused.
        Console.WriteLine("[{0}] [Remote: {2}]", System.DateTime.Now.ToString(), System.DateTime.Now.Millisecond, cfgj.NarrativeName);
    }

    string logname = Surrogate.SafeTextPath(cfgj.NarrativeName) + "_" + Guid.NewGuid().ToString() + ".j" + sjid + "_log.txt";

    // Could provide BlockID here, which is just the n-th DFS_IO entry.
    dfs dc = LoadDfsConfig();

    string[] slaves = dc.Slaves.SlaveList.Split(',', ';');
    if (dc.Slaves.SlaveList.Length == 0 || slaves.Length < 1)
    {
        throw new Exception("SlaveList expected in " + dfs.DFSXMLNAME);
    }
    if (dc.Replication > 1)
    {
        string[] slavesbefore = slaves;
        slaves = ExcludeUnhealthySlaveMachines(slaves, true).ToArray();
        if (slavesbefore.Length - slaves.Length >= dc.Replication)
        {
            throw new Exception("Not enough healthy machines to run job (hit replication count)");
        }
        if (slaves.Length == 0)
        {
            // Without this guard the round-robin start index below ("% slaves.Length")
            // fails with an unhelpful DivideByZeroException.
            throw new Exception("Not enough healthy machines to run job (no healthy machines remain)");
        }
    }

    if (cfgj.IOSettings.DFS_IO_Multis != null)
    {
        cfgj.ExpandDFSIOMultis(slaves.Length, MySpace.DataMining.DistributedObjects.MemoryUtils.NumberOfProcessors);
    }

    // Upper-cased host name -> index into slaves, used to resolve explicit DFS_IO/Host values.
    Dictionary<string, int> slaveIDs = new Dictionary<string, int>();
    for (int si = 0; si < slaves.Length; si++)
    {
        slaveIDs.Add(slaves[si].ToUpper(), si);
    }

    bool aborting = false;
    try
    {
        List<RemoteBlockInfo> blocks = new List<RemoteBlockInfo>(cfgj.IOSettings.DFS_IOs.Length);
        if (verbose)
        {
            Console.WriteLine("{0} processes on {1} machines:", cfgj.IOSettings.DFS_IOs.Length, slaves.Length);
        }

        // A machine whose network path cannot be resolved is reported and skipped.
        List<string> outputdfsdirs = new List<string>(slaves.Length);
        for (int i = 0; i < slaves.Length; i++)
        {
            try
            {
                outputdfsdirs.Add(NetworkPathForHost(slaves[i]));
            }
            catch (Exception e)
            {
                Console.Error.WriteLine(" {0}", e.Message);
            }
        }

        // Serialize the whole DFS configuration and keep only its <slave> element.
        string slaveconfigxml = "";
        {
            System.Xml.XmlDocument pdoc = new System.Xml.XmlDocument();
            using (System.IO.MemoryStream ms = new System.IO.MemoryStream())
            {
                System.Xml.Serialization.XmlSerializer xs = new System.Xml.Serialization.XmlSerializer(typeof(dfs));
                xs.Serialize(ms, dc);
                ms.Seek(0, System.IO.SeekOrigin.Begin);
                pdoc.Load(ms);
            }
            slaveconfigxml = pdoc.DocumentElement.SelectSingleNode("./slave").OuterXml;
        }

        {
            // Temporary:
            for (int si = 0; si < slaves.Length; si++)
            {
                System.Threading.Mutex m = new System.Threading.Mutex(false, "AEL_SC_" + slaves[si]);
                try
                {
                    m.WaitOne();
                }
                catch (System.Threading.AbandonedMutexException)
                {
                    // The waiter still owns an abandoned mutex, so it is released below as usual.
                }
                try
                {
                    System.IO.File.WriteAllText(NetworkPathForHost(slaves[si]) + @"\slaveconfig.j" + sjid + ".xml", slaveconfigxml);
                }
                catch
                {
                    // Best effort: a machine that cannot be written to is skipped silently.
                }
                finally
                {
                    m.ReleaseMutex();
                    m.Close();
                }
            }
        }

        // Blocks without an explicit host are assigned round-robin from a pseudo-random start.
        int nextslave = (new Random(DateTime.Now.Millisecond / 2 + System.Diagnostics.Process.GetCurrentProcess().Id / 2)).Next() % slaves.Length;
        int hosttypes = 0; // 0 = not decided yet, 1 = no block names a host, 2 = every block names a host.
        List<int> outputrecordlengths = new List<int>();
        List<int> inputrecordlengths = new List<int>();

        // NOTE(review): some error paths below call SetFailure() before returning and some
        // do not; confirm whether that difference is intended.
        for (int BlockID = 0; BlockID < cfgj.IOSettings.DFS_IOs.Length; BlockID++)
        {
            int slaveHostID = 0;
            RemoteBlockInfo bi = new RemoteBlockInfo();
            bi.sampledist = dc.DataNodeBaseSize / dc.DataNodeSamples;
            bi.BlockID = BlockID;
            bi.blockcount = cfgj.IOSettings.DFS_IOs.Length;
            if (string.IsNullOrEmpty(cfgj.IOSettings.DFS_IOs[BlockID].Host))
            {
                if (0 != hosttypes && 1 != hosttypes)
                {
                    throw new Exception("DFS_IO/Host tag must be specified for all or none");
                }
                hosttypes = 1;
                bi.SlaveHost = slaves[nextslave];
                slaveHostID = nextslave;
                bi.explicithost = false;
            }
            else
            {
                if (0 != hosttypes && 2 != hosttypes)
                {
                    throw new Exception("DFS_IO/Host tag must be specified for all or none");
                }
                hosttypes = 2;
                bi.SlaveHost = cfgj.IOSettings.DFS_IOs[BlockID].Host;
                if (!slaveIDs.TryGetValue(bi.SlaveHost.ToUpper(), out slaveHostID))
                {
                    // Same exception type the dictionary indexer raised, but naming the host.
                    throw new KeyNotFoundException("DFS_IO/Host '" + bi.SlaveHost + "' is not in the list of machines available for this job");
                }
                bi.explicithost = true;
            }
            bi.ExecArgs = ExecArgs;
            if (++nextslave >= slaves.Length)
            {
                nextslave = 0;
            }
            bi.logname = logname;
            bi.outputdfsdirs = outputdfsdirs;
            bi.slaves = slaves;
            bi.baseoutputfilesize = dc.DataNodeBaseSize;
            bi.cfgj = cfgj;
            bi.DFSWriter = cfgj.IOSettings.DFS_IOs[BlockID].DFSWriter.Trim();
            bi.Meta = cfgj.IOSettings.DFS_IOs[BlockID].Meta;

            // Output writers: a ';'-separated list, each optionally suffixed with "@<record size>".
            List<string> dfswriters = new List<string>();
            if (bi.DFSWriter.Length > 0)
            {
                string[] writers = bi.DFSWriter.Split(';');
                for (int wi = 0; wi < writers.Length; wi++)
                {
                    string thiswriter = writers[wi].Trim();
                    if (thiswriter.Length == 0)
                    {
                        continue;
                    }
                    int ic = thiswriter.IndexOf('@');
                    int reclen = -1;
                    if (-1 != ic)
                    {
                        try
                        {
                            reclen = Surrogate.GetRecordSize(thiswriter.Substring(ic + 1));
                            thiswriter = thiswriter.Substring(0, ic);
                        }
                        catch (FormatException e)
                        {
                            Console.Error.WriteLine("Error: remote output record length error: {0} ({1})", thiswriter, e.Message);
                            SetFailure();
                            return;
                        }
                        catch (OverflowException e)
                        {
                            Console.Error.WriteLine("Error: remote output record length error: {0} ({1})", thiswriter, e.Message);
                            SetFailure();
                            return;
                        }
                    }
                    string outfn = thiswriter;
                    if (outfn.StartsWith(@"dfs://", StringComparison.OrdinalIgnoreCase))
                    {
                        outfn = outfn.Substring(6);
                    }
                    string reason = "";
                    if (dfs.IsBadFilename(outfn, out reason))
                    {
                        Console.Error.WriteLine("Invalid output file: {0}", reason);
                        return;
                    }
                    if (null != DfsFindAny(dc, outfn))
                    {
                        Console.Error.WriteLine("Error: output file already exists in DFS: {0}", outfn);
                        return;
                    }
                    dfswriters.Add(thiswriter);
                    outputrecordlengths.Add(reclen);
                }
            }
            else
            {
                // No writer configured: keep a placeholder so index 0 exists for this block.
                dfswriters.Add("");
                outputrecordlengths.Add(-1);
            }
            bi.DFSWriters = dfswriters;
            bi.verbose = verbose;

            bi.rem = new MySpace.DataMining.DistributedObjects5.Remote(cfgj.NarrativeName + "_remote");
            bi.rem.CookRetries = dc.slave.CookRetries;
            bi.rem.CookTimeout = dc.slave.CookTimeout;
            bi.rem.DfsSampleDistance = bi.sampledist;
            bi.rem.CompressFileOutput = dc.slave.CompressDfsChunks;
            bi.rem.LocalCompile = true;
            bi.rem.OutputStartingPoint = slaveHostID;
            bi.rem.CompilerOptions = cfgj.IOSettings.CompilerOptions;
            bi.rem.CompilerVersion = cfgj.IOSettings.CompilerVersion;
            if (cfgj.AssemblyReferencesCount > 0)
            {
                cfgj.AddAssemblyReferences(bi.rem.CompilerAssemblyReferences, Surrogate.NetworkPathForHost(dc.Slaves.GetFirstSlave()));
            }
            if (cfgj.OpenCVExtension != null)
            {
                bi.rem.AddOpenCVExtension();
            }
            if (cfgj.MemCache != null)
            {
                bi.rem.AddMemCacheExtension();
            }
            if (cfgj.Unsafe != null)
            {
                bi.rem.AddUnsafe();
            }

            {
                // Input readers: resolve every input path to its DFS file nodes.
                List<dfs.DfsFile.FileNode> nodes = new List<dfs.DfsFile.FileNode>();
                List<string> mapfileswithnodes = null;
                List<int> nodesoffsets = null;
                IList<string> mapfiles = SplitInputPaths(dc, cfgj.IOSettings.DFS_IOs[BlockID].DFSReader);
                if (mapfiles.Count > 0)
                {
                    mapfileswithnodes = new List<string>(mapfiles.Count);
                    nodesoffsets = new List<int>(mapfiles.Count);
                }
                for (int i = 0; i < mapfiles.Count; i++)
                {
                    string dp = mapfiles[i].Trim();
                    int inreclen = -1;
                    if (0 != dp.Length) // Allow empty entry where input isn't wanted.
                    {
                        if (dp.StartsWith("dfs://", StringComparison.OrdinalIgnoreCase))
                        {
                            dp = dp.Substring(6);
                        }
                        {
                            int ic = dp.IndexOf('@');
                            if (-1 != ic)
                            {
                                try
                                {
                                    inreclen = Surrogate.GetRecordSize(dp.Substring(ic + 1));
                                    dp = dp.Substring(0, ic);
                                }
                                catch (FormatException e)
                                {
                                    Console.Error.WriteLine("Error: remote input record length error: {0} ({1})", dp, e.Message);
                                    SetFailure();
                                    return;
                                }
                                catch (OverflowException e)
                                {
                                    Console.Error.WriteLine("Error: remote input record length error: {0} ({1})", dp, e.Message);
                                    SetFailure();
                                    return;
                                }
                            }
                        }
                        dfs.DfsFile df;
                        if (inreclen > 0 || inreclen == -2)
                        {
                            df = DfsFind(dc, dp, DfsFileTypes.BINARY_RECT);
                            if (null != df && inreclen != df.RecordLength)
                            {
                                Console.Error.WriteLine("Error: remote input file does not have expected record length of {0}: {1}@{2}", inreclen, dp, df.RecordLength);
                                SetFailure();
                                return;
                            }
                        }
                        else
                        {
                            df = DfsFind(dc, dp);
                        }
                        if (null == df)
                        {
                            Console.Error.WriteLine("Remote input file not found in DFS: {0}", dp);
                            return;
                        }
                        if (df.Nodes.Count > 0)
                        {
                            mapfileswithnodes.Add(dp);
                            nodesoffsets.Add(nodes.Count);
                            inputrecordlengths.Add(inreclen);
                            nodes.AddRange(df.Nodes);
                        }
                    }
                }
                bi.dfsinputpaths = new List<string>(nodes.Count);
                dfs.MapNodesToNetworkStarPaths(nodes, bi.dfsinputpaths);
                bi.dfsinputfilenames = mapfileswithnodes;
                bi.dfsinputnodesoffsets = nodesoffsets;
            }

            blocks.Add(bi);

            bi.thread = new System.Threading.Thread(new System.Threading.ThreadStart(bi.threadproc));
            bi.thread.Name = "RemoteJobBlock" + bi.BlockID;
        }

        MySpace.DataMining.DistributedObjects.StaticGlobals.DSpace_InputRecordLength = inputrecordlengths.Count > 0 ? inputrecordlengths[0] : -1;
        MySpace.DataMining.DistributedObjects.StaticGlobals.DSpace_OutputRecordLength = outputrecordlengths.Count > 0 ? outputrecordlengths[0] : -1;

        // Need to start threads separately due to StaticGlobals being updated.
        for (int BlockID = 0; BlockID < cfgj.IOSettings.DFS_IOs.Length; BlockID++)
        {
            RemoteBlockInfo bi = blocks[BlockID];
            bi.rem.InputRecordLength = MySpace.DataMining.DistributedObjects.StaticGlobals.DSpace_InputRecordLength;
            bi.rem.InputRecordLengths = inputrecordlengths;
            bi.rem.OutputRecordLength = MySpace.DataMining.DistributedObjects.StaticGlobals.DSpace_OutputRecordLength;
            bi.rem.OutputRecordLengths = outputrecordlengths;
            AELight_StartTraceThread(bi.thread);
        }

        for (int BlockID = 0; BlockID < blocks.Count; BlockID++)
        {
            AELight_JoinTraceThread(blocks[BlockID].thread);
            blocks[BlockID].rem.Close();

            if (blocks[BlockID].blockfail)
            {
                Console.Error.WriteLine("BlockID {0} on host '{1}' did not complete successfully", BlockID, (blocks[BlockID].SlaveHost != null) ? blocks[BlockID].SlaveHost : "<null>");
            }
        }

        List<string> dfsnames = new List<string>();
        List<string> dfsnamesreplicating = new List<string>();
        // Reload DFS config to make sure changes since starting get rolled in, and make sure the output file wasn't created in that time...
        using (LockDfsMutex()) // Needed: change between load & save should be atomic.
        {
            dc = LoadDfsConfig();
            for (int BlockID = 0; BlockID < blocks.Count; BlockID++)
            {
                if (blocks[BlockID].blockfail)
                {
                    continue;
                }

                bool anyoutput = false;
                bool nonemptyoutputpath = false;
                for (int oi = 0; oi < blocks[BlockID].DFSWriters.Count; oi++)
                {
                    string dfswriter = blocks[BlockID].DFSWriters[oi];

                    if (string.IsNullOrEmpty(dfswriter))
                    {
                        if (blocks[BlockID].outputdfsnodeses[oi].Count > 0)
                        {
                            Console.Error.WriteLine("Output data detected with no DFSWriter specified");
                        }
                        continue;
                    }

                    if (null != DfsFind(dc, dfswriter))
                    {
                        Console.Error.WriteLine("Error: output file was created during job: {0}", dfswriter);
                        continue;
                    }

                    string dfspath = dfswriter;
                    nonemptyoutputpath = true;
                    dfs.DfsFile df = new dfs.DfsFile();
                    // NOTE(review): OutputRecordLengths is the list shared by all blocks but is
                    // indexed here with this block's writer index - confirm that is intended
                    // when blocks declare different writers.
                    if (blocks[BlockID].rem.OutputRecordLengths[oi] > 0)
                    {
                        df.XFileType = DfsFileTypes.BINARY_RECT + "@" + blocks[BlockID].rem.OutputRecordLengths[oi].ToString();
                    }
                    else if (blocks[BlockID].rem.OutputRecordLengths[oi] == -2)
                    {
                        df.XFileType = DfsFileTypes.BINARY_RECT + "@?";
                    }
                    df.Nodes = new List<dfs.DfsFile.FileNode>();
                    df.Size = -1; // Preset
                    if (dfspath.StartsWith("dfs://", StringComparison.OrdinalIgnoreCase))
                    {
                        dfspath = dfspath.Substring(6);
                    }
                    string dfspathreplicating = ".$" + dfspath + ".$replicating-" + Guid.NewGuid().ToString();
                    if (null != dc.FindAny(dfspathreplicating))
                    {
                        Console.Error.WriteLine("Error: file exists: file put into DFS from another location during job: " + dfspathreplicating);
                        SetFailure();
                        return;
                    }
                    dfsnames.Add(dfspath);
                    dfsnamesreplicating.Add(dfspathreplicating);
                    df.Name = dfspathreplicating;

                    bool anybad = false;
                    long totalsize = 0;
                    {
                        int i = BlockID;
                        for (int j = 0; j < blocks[i].outputdfsnodeses[oi].Count; j++)
                        {
                            dfs.DfsFile.FileNode fn = new dfs.DfsFile.FileNode();
                            fn.Host = blocks[i].slaves[(blocks[i].rem.OutputStartingPoint + j) % blocks[i].slaves.Count];
                            fn.Name = blocks[i].outputdfsnodeses[oi][j];
                            df.Nodes.Add(fn);
                            fn.Length = -1; // Preset
                            fn.Position = -1; // Preset
                            if (anybad)
                            {
                                continue;
                            }
                            // Validate before indexing the size list: when it is shorter than
                            // the node list, reading it first would throw instead of marking
                            // the file as bad.
                            if (blocks[i].outputdfsnodeses[oi].Count != blocks[i].outputsizeses[oi].Count)
                            {
                                anybad = true;
                                continue;
                            }
                            fn.Length = blocks[i].outputsizeses[oi][j];
                            fn.Position = totalsize; // Position must be set before totalsize updated!
                            totalsize += blocks[i].outputsizeses[oi][j];
                        }
                    }
                    if (!anybad)
                    {
                        df.Size = totalsize;
                    }
                    if (totalsize != 0)
                    {
                        anyoutput = true;
                    }
                    // Always add the file to DFS, even if blank!
                    dc.Files.Add(df);
                }

                if (!anyoutput && verbose && nonemptyoutputpath)
                {
                    Console.Write(" (no DFS output) ");
                    ConsoleFlush();
                }
            }

            UpdateDfsXml(dc);
        }

        ReplicationPhase(verbosereplication, blocks.Count, slaves, dfsnamesreplicating);

        using (LockDfsMutex()) // Needed: change between load & save should be atomic.
        {
            dc = LoadDfsConfig(); // Reload in case of change or user modifications.
            for (int nfile = 0; nfile < dfsnames.Count; nfile++)
            {
                string dfspath = dfsnames[nfile];
                string dfspathreplicating = dfsnamesreplicating[nfile];
                dfs.DfsFile dfu = dc.FindAny(dfspathreplicating);
                if (null != dfu)
                {
                    if (null != DfsFindAny(dc, dfspath))
                    {
                        Console.Error.WriteLine("Error: file exists: file put into DFS from another location during job");
                        SetFailure();
                        continue;
                    }
                    // Give the temporary entry its requested name.
                    dfu.Name = dfspath;
                }
            }
            UpdateDfsXml(dc);
        }

        if (verbose)
        {
            Console.WriteLine(); // Line after output chars.
        }
    }
    catch (System.Threading.ThreadAbortException)
    {
        aborting = true;
    }
    finally
    {
        // Remove the per-job slave configuration from every machine (best effort).
        for (int si = 0; si < slaves.Length; si++)
        {
            System.Threading.Mutex m = new System.Threading.Mutex(false, "AEL_SC_" + slaves[si]);
            try
            {
                m.WaitOne();
            }
            catch (System.Threading.AbandonedMutexException)
            {
                // The waiter still owns an abandoned mutex, so it is released below as usual.
            }
            try
            {
                System.IO.File.Delete(NetworkPathForHost(slaves[si]) + @"\slaveconfig.j" + sjid + ".xml");
            }
            catch
            {
                // Best effort: a file that cannot be deleted is left behind.
            }
            finally
            {
                m.ReleaseMutex();
                m.Close();
            }
        }

        if (!aborting)
        {
            CheckUserLogs(slaves, logname);
        }
    }

    if (verbose)
    {
        Console.WriteLine();
        Console.WriteLine("[{0}] Done", System.DateTime.Now.ToString(), System.DateTime.Now.Millisecond);
        for (int i = 0; i < cfgj.IOSettings.DFS_IOs.Length; i++)
        {
            Console.WriteLine("Output: {0}", cfgj.IOSettings.DFS_IOs[i].DFSWriter);
        }
    }
}