public virtual void PutBackKnownMapOutput(MapHost host, TaskAttemptID mapId) {
    lock (this) {
        host.AddKnownMap(mapId);
    }
}
public override void Run() {
    try {
        while (true) {
            // Take the first host whose penalty has expired; Take() blocks
            // until the head of the delay queue becomes available.
            MapHost host = this._enclosing.penalties.Take().host;
            lock (this._enclosing._enclosing) {
                if (host.MarkAvailable() == MapHost.State.Pending) {
                    this._enclosing.pendingHosts.AddItem(host);
                    Sharpen.Runtime.NotifyAll(this._enclosing._enclosing);
                }
            }
        }
    } catch (ThreadInterruptedException) {
        // Interrupted during shutdown (the Java original catches
        // InterruptedException here) -- exit quietly.
        return;
    } catch (Exception t) {
        this._enclosing.reporter.ReportException(t);
    }
}
private void AbortConnect(MapHost host, ICollection<TaskAttemptID> remaining) {
    foreach (TaskAttemptID left in remaining) {
        scheduler.PutBackKnownMapOutput(host, left);
    }
    CloseConnection();
}
public virtual void TestSucceedAndFailedCopyMap<K, V>() {
    JobConf job = new JobConf();
    job.SetNumMapTasks(2);
    // Mock creation
    TaskUmbilicalProtocol mockUmbilical = Org.Mockito.Mockito.Mock<TaskUmbilicalProtocol>();
    Reporter mockReporter = Org.Mockito.Mockito.Mock<Reporter>();
    FileSystem mockFileSystem = Org.Mockito.Mockito.Mock<FileSystem>();
    Type combinerClass = job.GetCombinerClass();
    // Cast needed to mock a type with generic parameters
    Task.CombineOutputCollector<K, V> mockCombineOutputCollector =
        (Task.CombineOutputCollector<K, V>)Org.Mockito.Mockito.Mock<Task.CombineOutputCollector>();
    TaskAttemptID mockTaskAttemptID = Org.Mockito.Mockito.Mock<TaskAttemptID>();
    LocalDirAllocator mockLocalDirAllocator = Org.Mockito.Mockito.Mock<LocalDirAllocator>();
    CompressionCodec mockCompressionCodec = Org.Mockito.Mockito.Mock<CompressionCodec>();
    Counters.Counter mockCounter = Org.Mockito.Mockito.Mock<Counters.Counter>();
    TaskStatus mockTaskStatus = Org.Mockito.Mockito.Mock<TaskStatus>();
    Progress mockProgress = Org.Mockito.Mockito.Mock<Progress>();
    MapOutputFile mockMapOutputFile = Org.Mockito.Mockito.Mock<MapOutputFile>();
    Org.Apache.Hadoop.Mapred.Task mockTask = Org.Mockito.Mockito.Mock<Org.Apache.Hadoop.Mapred.Task>();
    MapOutput<K, V> output = (MapOutput<K, V>)Org.Mockito.Mockito.Mock<MapOutput>();
    ShuffleConsumerPlugin.Context<K, V> context = new ShuffleConsumerPlugin.Context<K, V>(
        mockTaskAttemptID, job, mockFileSystem, mockUmbilical, mockLocalDirAllocator,
        mockReporter, mockCompressionCodec, combinerClass, mockCombineOutputCollector,
        mockCounter, mockCounter, mockCounter, mockCounter, mockCounter, mockCounter,
        mockTaskStatus, mockProgress, mockProgress, mockTask, mockMapOutputFile, null);
    TaskStatus status = new _TaskStatus_251();
    Progress progress = new Progress();
    ShuffleSchedulerImpl<K, V> scheduler = new ShuffleSchedulerImpl<K, V>(job, status,
        null, null, progress, context.GetShuffledMapsCounter(),
        context.GetReduceShuffleBytes(), context.GetFailedShuffleCounter());
    MapHost host1 = new MapHost("host1", null);
    TaskAttemptID failedAttemptID = new TaskAttemptID(
        new TaskID(new JobID("test", 0), TaskType.Map, 0), 0);
    TaskAttemptID succeedAttemptID = new TaskAttemptID(
        new TaskID(new JobID("test", 0), TaskType.Map, 1), 1);
    // Handle output fetch failure for failedAttemptID, part I
    scheduler.HostFailed(host1.GetHostName());
    // Handle output fetch success for succeedAttemptID
    long bytes = (long)500 * 1024 * 1024;
    scheduler.CopySucceeded(succeedAttemptID, host1, bytes, 0, 500000, output);
    // Handle output fetch failure for failedAttemptID, part II
    // MAPREDUCE-6361: verify that no NPE is thrown
    scheduler.CopyFailed(failedAttemptID, host1, true, false);
}
public virtual void CopyFailed(TaskAttemptID mapId, MapHost host, bool readError,
    bool connectExcpt) {
    lock (this) {
        host.Penalize();
        int failures = 1;
        if (failureCounts.Contains(mapId)) {
            IntWritable x = failureCounts[mapId];
            x.Set(x.Get() + 1);
            failures = x.Get();
        } else {
            failureCounts[mapId] = new IntWritable(1);
        }
        string hostname = host.GetHostName();
        IntWritable hostFailedNum = hostFailures[hostname];
        // MAPREDUCE-6361: the hostname may have been removed from hostFailures
        // by another thread via CopySucceeded. In that case, add the hostname
        // back to hostFailures to avoid an NPE here.
        if (hostFailedNum == null) {
            hostFailures[hostname] = new IntWritable(1);
        }
        // Report failure if the host has already been retried maxHostFailures times
        bool hostFail = hostFailures[hostname].Get() > GetMaxHostFailures();
        if (failures >= abortFailureLimit) {
            // Throw-and-catch so the reported exception carries a stack trace
            try {
                throw new IOException(failures + " failures downloading " + mapId);
            } catch (IOException ie) {
                reporter.ReportException(ie);
            }
        }
        CheckAndInformMRAppMaster(failures, mapId, readError, connectExcpt, hostFail);
        CheckReducerHealth();
        // Back off exponentially: delay = InitialPenalty * PenaltyGrowthRate^failures,
        // capped at maxDelay
        long delay = (long)(InitialPenalty * Math.Pow(PenaltyGrowthRate, failures));
        if (delay > maxDelay) {
            delay = maxDelay;
        }
        penalties.AddItem(new ShuffleSchedulerImpl.Penalty(host, delay));
        failedShuffleCounter.Increment(1);
    }
}
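// To make the back-off concrete, here is a minimal sketch of the delay
// growth. It assumes Hadoop's usual defaults of InitialPenalty = 10000 ms
// and PenaltyGrowthRate = 1.3f; neither constant is defined in this
// excerpt, so treat both values as assumptions for illustration.
//
//     long initialPenalty = 10000;      // assumed: 10 s
//     float penaltyGrowthRate = 1.3f;   // assumed
//     for (int failures = 1; failures <= 5; failures++) {
//         long delay = (long)(initialPenalty * Math.Pow(penaltyGrowthRate, failures));
//         // failures=1 -> 13000 ms, 2 -> 16900 ms, 3 -> 21970 ms,
//         // 4 -> 28561 ms, 5 -> 37129 ms (before the maxDelay cap)
//     }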
public virtual void FreeHost(MapHost host) {
    lock (this) {
        if (host.GetState() != MapHost.State.Penalized) {
            if (host.MarkAvailable() == MapHost.State.Pending) {
                pendingHosts.AddItem(host);
                Sharpen.Runtime.NotifyAll(this);
            }
        }
        Log.Info(host + " freed by " + Sharpen.Thread.CurrentThread().GetName() + " in "
            + (Time.MonotonicNow() - shuffleStart.Get()) + "ms");
    }
}
/// <summary>Create the map-output-url.</summary>
/// <remarks>
/// Create the map-output-url. This will contain all the map ids
/// separated by commas.
/// </remarks>
/// <param name="host"/>
/// <param name="maps"/>
/// <returns/>
/// <exception cref="System.UriFormatException"/>
private Uri GetMapOutputURL(MapHost host, ICollection<TaskAttemptID> maps) {
    // Get the base url
    StringBuilder url = new StringBuilder(host.GetBaseUrl());
    bool first = true;
    foreach (TaskAttemptID mapId in maps) {
        if (!first) {
            url.Append(",");
        }
        url.Append(mapId);
        first = false;
    }
    Log.Debug("MapOutput URL for " + host + " -> " + url.ToString());
    return new Uri(url.ToString());
}
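// For illustration: the base URL comes from MapHost.GetBaseUrl(), which is
// not shown in this excerpt. Assuming the usual shuffle-handler layout of
// "http://<host>:<port>/mapOutput?job=<jobId>&reduce=<partition>&map="
// (an assumption, not confirmed by this code), appending two attempt ids
// would yield a URL of roughly this shape:
//
//     http://node1:13562/mapOutput?job=job_test_0001&reduce=0
//         &map=attempt_test_0001_m_000000_0,attempt_test_0001_m_000001_0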
/// <exception cref="System.IO.IOException"/>
public override void Shuffle(MapHost host, InputStream input, long compressedLength,
    long decompressedLength, ShuffleClientMetrics metrics, Reporter reporter) {
    input = new IFileInputStream(input, compressedLength, conf);
    // Copy data to local-disk
    long bytesLeft = compressedLength;
    try {
        int BytesToRead = 64 * 1024;
        byte[] buf = new byte[BytesToRead];
        while (bytesLeft > 0) {
            int n = ((IFileInputStream)input).ReadWithChecksum(buf, 0,
                (int)Math.Min(bytesLeft, BytesToRead));
            if (n < 0) {
                throw new IOException("read past end of stream reading " + GetMapId());
            }
            disk.Write(buf, 0, n);
            bytesLeft -= n;
            metrics.InputBytes(n);
            reporter.Progress();
        }
        Log.Info("Read " + (compressedLength - bytesLeft) + " bytes from map-output for "
            + GetMapId());
        disk.Close();
    } catch (IOException) {
        // Close the streams
        IOUtils.Cleanup(Log, input, disk);
        // Re-throw
        throw;
    }
    // Sanity check
    if (bytesLeft != 0) {
        throw new IOException("Incomplete map output received for " + GetMapId()
            + " from " + host.GetHostName() + " (" + bytesLeft + " bytes missing of "
            + compressedLength + ")");
    }
    this.compressedSize = compressedLength;
}
public virtual void AddKnownMapOutput(string hostName, string hostUrl, TaskAttemptID mapId) {
    lock (this) {
        MapHost host = mapLocations[hostName];
        if (host == null) {
            host = new MapHost(hostName, hostUrl);
            mapLocations[hostName] = host;
        }
        host.AddKnownMap(mapId);
        // Mark the host as pending
        if (host.GetState() == MapHost.State.Pending) {
            pendingHosts.AddItem(host);
            Sharpen.Runtime.NotifyAll(this);
        }
    }
}
/// <exception cref="System.IO.IOException"/>
private void SetupConnectionsWithRetry(MapHost host, ICollection<TaskAttemptID> remaining,
    Uri url) {
    OpenConnectionWithRetry(host, remaining, url);
    if (stopped) {
        return;
    }
    // Generate hash of the url
    string msgToEncode = SecureShuffleUtils.BuildMsgFrom(url);
    string encHash = SecureShuffleUtils.HashFromString(msgToEncode, shuffleSecretKey);
    SetupShuffleConnection(encHash);
    Connect(connection, connectionTimeout);
    // Verify that the thread wasn't stopped during calls to connect
    if (stopped) {
        return;
    }
    VerifyConnection(url, msgToEncode, encHash);
}
/// <exception cref="System.IO.IOException"/>
private void OpenConnectionWithRetry(MapHost host, ICollection<TaskAttemptID> remaining,
    Uri url) {
    long startTime = Time.MonotonicNow();
    bool shouldWait = true;
    while (shouldWait) {
        try {
            OpenConnection(url);
            shouldWait = false;
        } catch (IOException) {
            if (!fetchRetryEnabled) {
                // Throw the exception directly if fetch retry is not enabled
                throw;
            }
            if ((Time.MonotonicNow() - startTime) >= this.fetchRetryTimeout) {
                Log.Warn("Failed to connect to host: " + url + " after "
                    + fetchRetryTimeout + " milliseconds.");
                throw;
            }
            try {
                Sharpen.Thread.Sleep(this.fetchRetryInterval);
            } catch (Exception) {
                if (stopped) {
                    return;
                }
            }
        }
    }
}
/// <summary>
/// Check whether the retry timeout has been hit; if not, rethrow the
/// exception so that a new round of retry is started.
/// </summary>
/// <exception cref="System.IO.IOException"/>
private void CheckTimeoutOrRetry(MapHost host, IOException ioe) {
    // First time to retry.
    long currentTime = Time.MonotonicNow();
    if (retryStartTime == 0) {
        retryStartTime = currentTime;
    }
    // Retry has not timed out yet: rethrow the exception to trigger a retry.
    if (currentTime - retryStartTime < this.fetchRetryTimeout) {
        Log.Warn("Shuffle output from " + host.GetHostName() + " failed, retry it.", ioe);
        throw ioe;
    } else {
        // Timed out: prepare to fail.
        Log.Warn("Timeout for copying MapOutput with retry on host " + host + " after "
            + fetchRetryTimeout + " milliseconds.");
    }
}
private DataInputStream OpenShuffleUrl(MapHost host, ICollection<TaskAttemptID> remaining,
    Uri url) {
    DataInputStream input = null;
    try {
        SetupConnectionsWithRetry(host, remaining, url);
        if (stopped) {
            AbortConnect(host, remaining);
        } else {
            input = new DataInputStream(connection.GetInputStream());
        }
    } catch (IOException ie) {
        bool connectExcpt = ie is ConnectException;
        ioErrs.Increment(1);
        Log.Warn("Failed to connect to " + host + " with " + remaining.Count
            + " map outputs", ie);
        // If connect did not succeed, just mark all the maps as failed,
        // indirectly penalizing the host
        scheduler.HostFailed(host.GetHostName());
        foreach (TaskAttemptID left in remaining) {
            scheduler.CopyFailed(left, host, false, connectExcpt);
        }
        // Add back all the remaining maps, WITHOUT marking them as failed
        foreach (TaskAttemptID left in remaining) {
            scheduler.PutBackKnownMapOutput(host, left);
        }
    }
    return input;
}
/// <exception cref="System.Exception"/>
public virtual MapHost GetHost() {
    lock (this) {
        while (pendingHosts.IsEmpty()) {
            Sharpen.Runtime.Wait(this);
        }
        // Pick a pending host at random to spread fetch load across the cluster
        MapHost host = null;
        IEnumerator<MapHost> iter = pendingHosts.GetEnumerator();
        int numToPick = random.Next(pendingHosts.Count);
        for (int i = 0; i <= numToPick; ++i) {
            host = iter.Next();
        }
        pendingHosts.Remove(host);
        host.MarkBusy();
        Log.Info("Assigning " + host + " with " + host.GetNumKnownMapOutputs() + " to "
            + Sharpen.Thread.CurrentThread().GetName());
        shuffleStart.Set(Time.MonotonicNow());
        return host;
    }
}
/// <exception cref="System.IO.IOException"/>
public virtual void CopySucceeded(TaskAttemptID mapId, MapHost host, long bytes,
    long startMillis, long endMillis, MapOutput<K, V> output) {
    lock (this) {
        Sharpen.Collections.Remove(failureCounts, mapId);
        Sharpen.Collections.Remove(hostFailures, host.GetHostName());
        int mapIndex = mapId.GetTaskID().GetId();
        if (!finishedMaps[mapIndex]) {
            output.Commit();
            finishedMaps[mapIndex] = true;
            shuffledMapsCounter.Increment(1);
            if (--remainingMaps == 0) {
                Sharpen.Runtime.NotifyAll(this);
            }
            // Update the status of this single copy task
            long copyMillis = (endMillis - startMillis);
            if (copyMillis == 0) {
                copyMillis = 1;
            }
            float bytesPerMillis = (float)bytes / copyMillis;
            float transferRate = bytesPerMillis * BytesPerMillisToMbs;
            string individualProgress = "copy task(" + mapId + " succeeded" + " at "
                + mbpsFormat.Format(transferRate) + " MB/s)";
            // Update the aggregated status
            copyTimeTracker.Add(startMillis, endMillis);
            totalBytesShuffledTillNow += bytes;
            UpdateStatus(individualProgress);
            reduceShuffleBytes.Increment(bytes);
            lastProgressTime = Time.MonotonicNow();
            Log.Debug("map " + mapId + " done " + status.GetStateString());
        }
    }
}
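// A quick sanity check on the rate arithmetic, using the figures from the
// test above. BytesPerMillisToMbs is a unit-conversion constant not defined
// in this excerpt; the value 1000f / (1024 * 1024) (bytes/ms -> MB/s) is an
// assumption for illustration.
//
//     long bytes = 500L * 1024 * 1024;            // 500 MB, as in the test
//     long copyMillis = 500000 - 0;               // 500 s copy window
//     float bytesPerMillis = (float)bytes / copyMillis;   // ~1048.6 bytes/ms
//     float transferRate =
//         bytesPerMillis * (1000f / (1024 * 1024));       // = 1.0 MB/s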
public override void Run() {
    try {
        while (!stopped && !Sharpen.Thread.CurrentThread().IsInterrupted()) {
            MapHost host = null;
            try {
                // If merge is on, block
                merger.WaitForResource();
                // Get a host to shuffle from
                host = scheduler.GetHost();
                metrics.ThreadBusy();
                // Shuffle
                CopyFromHost(host);
            } finally {
                if (host != null) {
                    scheduler.FreeHost(host);
                    metrics.ThreadFree();
                }
            }
        }
    } catch (ThreadInterruptedException) {
        // Interrupted during shutdown (the Java original catches
        // InterruptedException here) -- exit quietly.
        return;
    } catch (Exception t) {
        exceptionReporter.ReportException(t);
    }
}
/// <exception cref="System.IO.IOException"/>
public override void Shuffle(MapHost host, InputStream input, long compressedLength,
    long decompressedLength, ShuffleClientMetrics metrics, Reporter reporter) {
    IFileInputStream checksumIn = new IFileInputStream(input, compressedLength, conf);
    input = checksumIn;
    // Are map-outputs compressed?
    if (codec != null) {
        decompressor.Reset();
        input = codec.CreateInputStream(input, decompressor);
    }
    try {
        IOUtils.ReadFully(input, memory, 0, memory.Length);
        metrics.InputBytes(memory.Length);
        reporter.Progress();
        Log.Info("Read " + memory.Length + " bytes from map-output for " + GetMapId());
        // The stream must be fully consumed; any extra bytes indicate a
        // corrupt or mismatched map output.
        if (input.Read() >= 0) {
            throw new IOException("Unexpected extra bytes from input stream for " + GetMapId());
        }
    } catch (IOException) {
        // Close the streams
        IOUtils.Cleanup(Log, input);
        // Re-throw
        throw;
    } finally {
        CodecPool.ReturnDecompressor(decompressor);
    }
}
public virtual IList<TaskAttemptID> GetMapsForHost(MapHost host) {
    lock (this) {
        IList<TaskAttemptID> list = host.GetAndClearKnownMaps();
        IEnumerator<TaskAttemptID> itr = list.GetEnumerator();
        IList<TaskAttemptID> result = new AList<TaskAttemptID>();
        int includedMaps = 0;
        int totalSize = list.Count;
        // Find the maps that we still need, up to the limit
        while (itr.HasNext()) {
            TaskAttemptID id = itr.Next();
            if (!obsoleteMaps.Contains(id) && !finishedMaps[id.GetTaskID().GetId()]) {
                result.AddItem(id);
                if (++includedMaps >= MaxMapsAtOnce) {
                    break;
                }
            }
        }
        // Put back the maps left over after hitting the limit
        while (itr.HasNext()) {
            TaskAttemptID id = itr.Next();
            if (!obsoleteMaps.Contains(id) && !finishedMaps[id.GetTaskID().GetId()]) {
                host.AddKnownMap(id);
            }
        }
        Log.Info("Assigned " + includedMaps + " of " + totalSize + " map outputs on "
            + host + " to " + Sharpen.Thread.CurrentThread().GetName());
        return result;
    }
}
/// <exception cref="System.IO.IOException"/>
private TaskAttemptID[] CopyMapOutput(MapHost host, DataInputStream input,
    ICollection<TaskAttemptID> remaining, bool canRetry) {
    MapOutput<K, V> mapOutput = null;
    TaskAttemptID mapId = null;
    long decompressedLength = -1;
    long compressedLength = -1;
    try {
        long startTime = Time.MonotonicNow();
        int forReduce = -1;
        // Read the shuffle header
        try {
            ShuffleHeader header = new ShuffleHeader();
            header.ReadFields(input);
            mapId = TaskAttemptID.ForName(header.mapId);
            compressedLength = header.compressedLength;
            decompressedLength = header.uncompressedLength;
            forReduce = header.forReduce;
        } catch (ArgumentException e) {
            badIdErrs.Increment(1);
            Log.Warn("Invalid map id ", e);
            // Don't know which one was bad, so consider all of them as bad
            return Sharpen.Collections.ToArray(remaining, new TaskAttemptID[remaining.Count]);
        }
        InputStream @is = input;
        @is = CryptoUtils.WrapIfNecessary(jobConf, @is, compressedLength);
        compressedLength -= CryptoUtils.CryptoPadding(jobConf);
        decompressedLength -= CryptoUtils.CryptoPadding(jobConf);
        // Do some basic sanity verification
        if (!VerifySanity(compressedLength, decompressedLength, forReduce, remaining, mapId)) {
            return new TaskAttemptID[] { mapId };
        }
        if (Log.IsDebugEnabled()) {
            Log.Debug("header: " + mapId + ", len: " + compressedLength + ", decomp len: "
                + decompressedLength);
        }
        // Get the location for the map output - either in-memory or on-disk
        try {
            mapOutput = merger.Reserve(mapId, decompressedLength, id);
        } catch (IOException ioe) {
            // Kill this reduce attempt
            ioErrs.Increment(1);
            scheduler.ReportLocalError(ioe);
            return EmptyAttemptIdArray;
        }
        // Check if we can shuffle *now* ...
        if (mapOutput == null) {
            Log.Info("fetcher#" + id + " - MergeManager returned status WAIT ...");
            // Not an error but wait to process data.
            return EmptyAttemptIdArray;
        }
        // The codecs for lzo, lz4, snappy, bz2, etc. throw java.lang.InternalError
        // on decompression failures. Catch and re-throw as IOException
        // to allow fetch failure logic to be processed.
        try {
            // Go!
            Log.Info("fetcher#" + id + " about to shuffle output of map "
                + mapOutput.GetMapId() + " decomp: " + decompressedLength + " len: "
                + compressedLength + " to " + mapOutput.GetDescription());
            mapOutput.Shuffle(host, @is, compressedLength, decompressedLength, metrics, reporter);
        } catch (InternalError e) {
            Log.Warn("Failed to shuffle for fetcher#" + id, e);
            throw new IOException(e);
        }
        // Inform the shuffle scheduler
        long endTime = Time.MonotonicNow();
        // Reset retryStartTime since the map task made progress, in case it
        // was retried before.
        retryStartTime = 0;
        scheduler.CopySucceeded(mapId, host, compressedLength, startTime, endTime, mapOutput);
        // Note successful shuffle
        remaining.Remove(mapId);
        metrics.SuccessFetch();
        return null;
    } catch (IOException ioe) {
        if (mapOutput != null) {
            mapOutput.Abort();
        }
        if (canRetry) {
            CheckTimeoutOrRetry(host, ioe);
        }
        ioErrs.Increment(1);
        if (mapId == null || mapOutput == null) {
            Log.Warn("fetcher#" + id + " failed to read map header " + mapId + " decomp: "
                + decompressedLength + ", " + compressedLength, ioe);
            if (mapId == null) {
                return Sharpen.Collections.ToArray(remaining, new TaskAttemptID[remaining.Count]);
            } else {
                return new TaskAttemptID[] { mapId };
            }
        }
        Log.Warn("Failed to shuffle output of " + mapId + " from " + host.GetHostName(), ioe);
        // Inform the shuffle-scheduler
        metrics.FailedFetch();
        return new TaskAttemptID[] { mapId };
    }
}
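// Each map output on the stream is preceded by a ShuffleHeader, so the read
// order in the try block above implies the wire layout. A minimal sketch of
// what ReadFields is expected to consume, in order (the exact on-the-wire
// encodings are not shown in this excerpt and are assumptions):
//
//     ShuffleHeader header = new ShuffleHeader();
//     header.ReadFields(input);
//     // header.mapId              e.g. "attempt_test_0001_m_000000_0"
//     // header.compressedLength   bytes that follow on the wire for this map
//     // header.uncompressedLength size after decompression (for merger.Reserve)
//     // header.forReduce          must match this reducer's partition id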
protected internal virtual void CopyFromHost(MapHost host) {
    // Reset retryStartTime for a new host
    retryStartTime = 0;
    // Get completed maps on 'host'
    IList<TaskAttemptID> maps = scheduler.GetMapsForHost(host);
    // Sanity check to catch hosts with only 'OBSOLETE' maps,
    // especially at the tail of large jobs
    if (maps.Count == 0) {
        return;
    }
    if (Log.IsDebugEnabled()) {
        Log.Debug("Fetcher " + id + " going to fetch from " + host + " for: " + maps);
    }
    // List of maps to be fetched yet
    ICollection<TaskAttemptID> remaining = new HashSet<TaskAttemptID>(maps);
    // Construct the url and connect
    Uri url = GetMapOutputURL(host, maps);
    DataInputStream input = OpenShuffleUrl(host, remaining, url);
    if (input == null) {
        return;
    }
    try {
        // Loop through available map-outputs and fetch them.
        // On any error, failedTasks is not null and we exit
        // after putting back the remaining maps to the
        // yet_to_be_fetched list and marking the failed tasks.
        TaskAttemptID[] failedTasks = null;
        while (!remaining.IsEmpty() && failedTasks == null) {
            try {
                failedTasks = CopyMapOutput(host, input, remaining, fetchRetryEnabled);
            } catch (IOException) {
                // Set up the connection again if it was disconnected by the NM
                connection.Disconnect();
                // Get map output from remaining tasks only.
                url = GetMapOutputURL(host, remaining);
                input = OpenShuffleUrl(host, remaining, url);
                if (input == null) {
                    return;
                }
            }
        }
        if (failedTasks != null && failedTasks.Length > 0) {
            Log.Warn("copyMapOutput failed for tasks " + Arrays.ToString(failedTasks));
            scheduler.HostFailed(host.GetHostName());
            foreach (TaskAttemptID left in failedTasks) {
                scheduler.CopyFailed(left, host, true, false);
            }
        }
        // Sanity check
        if (failedTasks == null && !remaining.IsEmpty()) {
            throw new IOException("server didn't return all expected map outputs: "
                + remaining.Count + " left.");
        }
        input.Close();
        input = null;
    } finally {
        if (input != null) {
            IOUtils.Cleanup(Log, input);
            input = null;
        }
        foreach (TaskAttemptID left in remaining) {
            scheduler.PutBackKnownMapOutput(host, left);
        }
    }
}
/// <exception cref="System.IO.IOException"/>
public abstract void Shuffle(MapHost host, InputStream input, long compressedLength,
    long decompressedLength, ShuffleClientMetrics metrics, Reporter reporter);
public virtual void TestCorruptedIFile() {
    int fetcher = 7;
    Path onDiskMapOutputPath = new Path(name.GetMethodName() + "/foo");
    Path shuffledToDisk = OnDiskMapOutput.GetTempPath(onDiskMapOutputPath, fetcher);
    fs = FileSystem.GetLocal(job).GetRaw();
    MapOutputFile mof = Org.Mockito.Mockito.Mock<MapOutputFile>();
    OnDiskMapOutput<Text, Text> odmo = new OnDiskMapOutput<Text, Text>(map1ID, id, mm,
        100L, job, mof, fetcher, true, fs, onDiskMapOutputPath);
    string mapData = "MAPDATA12345678901234567890";
    ShuffleHeader header = new ShuffleHeader(map1ID.ToString(), 14, 10, 1);
    ByteArrayOutputStream bout = new ByteArrayOutputStream();
    DataOutputStream dos = new DataOutputStream(bout);
    IFileOutputStream ios = new IFileOutputStream(dos);
    header.Write(dos);
    int headerSize = dos.Size();
    try {
        ios.Write(Sharpen.Runtime.GetBytesForString(mapData));
    } finally {
        ios.Close();
    }
    int dataSize = bout.Size() - headerSize;
    // Ensure that the OnDiskMapOutput shuffler can successfully read the data.
    MapHost host = new MapHost("TestHost", "http://test/url");
    ByteArrayInputStream bin = new ByteArrayInputStream(bout.ToByteArray());
    try {
        // Read past the shuffle header.
        bin.Read(new byte[headerSize], 0, headerSize);
        odmo.Shuffle(host, bin, dataSize, dataSize, metrics, Reporter.Null);
    } finally {
        bin.Close();
    }
    // Now corrupt the IFile data.
    byte[] corrupted = bout.ToByteArray();
    corrupted[headerSize + (dataSize / 2)] = unchecked((byte)0x0);
    try {
        bin = new ByteArrayInputStream(corrupted);
        // Read past the shuffle header.
        bin.Read(new byte[headerSize], 0, headerSize);
        odmo.Shuffle(host, bin, dataSize, dataSize, metrics, Reporter.Null);
        NUnit.Framework.Assert.Fail(
            "OnDiskMapOutput.shuffle didn't detect the corrupted map partition file");
    } catch (ChecksumException e) {
        Log.Info("The expected checksum exception was thrown.", e);
    } finally {
        bin.Close();
    }
    // Ensure that the shuffled file can be read.
    IFileInputStream iFin = new IFileInputStream(fs.Open(shuffledToDisk), dataSize, job);
    try {
        iFin.Read(new byte[dataSize], 0, dataSize);
    } finally {
        iFin.Close();
    }
}
internal Penalty(MapHost host, long delay) {
    this.host = host;
    this.endTime = Time.MonotonicNow() + delay;
}
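// Penalty instances live in the penalties delay queue drained by the
// referee thread's penalties.Take() in the Run loop above; Take() returns
// an element only once its delay has expired. The accessor below is a
// minimal sketch of that contract. In the Java original, Penalty implements
// Delayed with getDelay(TimeUnit); the simplified millisecond form here is
// an assumption for illustration, not code from this excerpt.
//
//     internal long GetRemainingDelayMs() {
//         // <= 0 once MonotonicNow() has passed endTime, making the
//         // penalized host eligible to be handed back to pendingHosts.
//         return endTime - Time.MonotonicNow();
//     }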