/// <summary>
/// Regression scenario for MAPREDUCE-6361: a host is reported as failed, a
/// different map attempt on the same host then succeeds, and finally the
/// failure of the first attempt is reported. The test passes as long as the
/// last call completes without throwing.
/// </summary>
/// <typeparam name="K">Key type of the shuffled map output.</typeparam>
/// <typeparam name="V">Value type of the shuffled map output.</typeparam>
public virtual void TestSucceedAndFailedCopyMap <K, V>()
{
    JobConf conf = new JobConf();
    conf.SetNumMapTasks(2);

    // --- collaborators handed to the shuffle context (all plain mocks) ---
    TaskUmbilicalProtocol umbilical = Org.Mockito.Mockito.Mock<TaskUmbilicalProtocol>();
    Reporter reporter = Org.Mockito.Mockito.Mock<Reporter>();
    FileSystem fileSystem = Org.Mockito.Mockito.Mock<FileSystem>();
    Type combinerClass = conf.GetCombinerClass();
    // The explicit cast is needed because the mock is created from the raw type.
    Task.CombineOutputCollector<K, V> combineCollector =
        (Task.CombineOutputCollector<K, V>)Org.Mockito.Mockito.Mock<Task.CombineOutputCollector>();
    TaskAttemptID reduceAttemptId = Org.Mockito.Mockito.Mock<TaskAttemptID>();
    LocalDirAllocator dirAllocator = Org.Mockito.Mockito.Mock<LocalDirAllocator>();
    CompressionCodec codec = Org.Mockito.Mockito.Mock<CompressionCodec>();
    Counters.Counter counter = Org.Mockito.Mockito.Mock<Counters.Counter>();
    TaskStatus contextStatus = Org.Mockito.Mockito.Mock<TaskStatus>();
    Progress contextProgress = Org.Mockito.Mockito.Mock<Progress>();
    MapOutputFile mapOutputFile = Org.Mockito.Mockito.Mock<MapOutputFile>();
    Org.Apache.Hadoop.Mapred.Task task = Org.Mockito.Mockito.Mock<Org.Apache.Hadoop.Mapred.Task>();
    MapOutput<K, V> output = Org.Mockito.Mockito.Mock<MapOutput>();

    // The same mocked counter and the same mocked progress are reused for
    // every counter / progress slot of the context.
    ShuffleConsumerPlugin.Context<K, V> context = new ShuffleConsumerPlugin.Context<K, V>(
        reduceAttemptId,
        conf,
        fileSystem,
        umbilical,
        dirAllocator,
        reporter,
        codec,
        combinerClass,
        combineCollector,
        counter,
        counter,
        counter,
        counter,
        counter,
        counter,
        contextStatus,
        contextProgress,
        contextProgress,
        task,
        mapOutputFile,
        null);

    // --- scheduler under test ---
    TaskStatus schedulerStatus = new _TaskStatus_251();
    Progress schedulerProgress = new Progress();
    ShuffleSchedulerImpl<K, V> scheduler = new ShuffleSchedulerImpl<K, V>(
        conf,
        schedulerStatus,
        null,
        null,
        schedulerProgress,
        context.GetShuffledMapsCounter(),
        context.GetReduceShuffleBytes(),
        context.GetFailedShuffleCounter());

    MapHost host1 = new MapHost("host1", null);
    TaskAttemptID failedAttemptID =
        new TaskAttemptID(new TaskID(new JobID("test", 0), TaskType.Map, 0), 0);
    TaskAttemptID succeedAttemptID =
        new TaskAttemptID(new TaskID(new JobID("test", 0), TaskType.Map, 1), 1);

    // Step 1: first half of the fetch-failure handling for failedAttemptID.
    scheduler.HostFailed(host1.GetHostName());

    // Step 2: a fetch of succeedAttemptID from the same host succeeds.
    long bytes = (long)500 * 1024 * 1024;
    scheduler.CopySucceeded(succeedAttemptID, host1, bytes, 0, 500000, output);

    // Step 3: second half of the fetch-failure handling for failedAttemptID.
    // MAPREDUCE-6361: this call must not fail with a null dereference.
    scheduler.CopyFailed(failedAttemptID, host1, true, false);
}
/// <summary>
/// Records a failed fetch of <paramref name="mapId"/> from
/// <paramref name="host"/>: penalizes the host, bumps the per-attempt failure
/// count, reports an exception once the abort limit is reached, informs the
/// application master, checks reducer health and queues a penalty whose delay
/// grows with the number of failures (capped at <c>maxDelay</c>).
/// </summary>
/// <param name="mapId">Map attempt whose output could not be fetched.</param>
/// <param name="host">Host the fetch was attempted from.</param>
/// <param name="readError">Forwarded unchanged to <c>CheckAndInformMRAppMaster</c>.</param>
/// <param name="connectExcpt">Forwarded unchanged to <c>CheckAndInformMRAppMaster</c>.</param>
public virtual void CopyFailed(TaskAttemptID mapId, MapHost host, bool readError, bool connectExcpt)
{
    lock (this)
    {
        host.Penalize();

        // Per-attempt failure bookkeeping: first failure creates the entry,
        // later failures increment it in place.
        int failures;
        if (!failureCounts.Contains(mapId))
        {
            failureCounts[mapId] = new IntWritable(1);
            failures = 1;
        }
        else
        {
            IntWritable attemptFailures = failureCounts[mapId];
            attemptFailures.Set(attemptFailures.Get() + 1);
            failures = attemptFailures.Get();
        }

        string hostname = host.GetHostName();
        IntWritable recordedHostFailures = hostFailures[hostname];
        // MAPREDUCE-6361: CopySucceeded removes the host's entry from
        // hostFailures, so it may be gone by the time we get here. Re-create
        // it so the lookup below never dereferences a missing entry.
        if (recordedHostFailures == null)
        {
            hostFailures[hostname] = new IntWritable(1);
        }

        // The host itself counts as failed once it exceeded the allowed
        // number of host failures.
        bool hostFail = hostFailures[hostname].Get() > GetMaxHostFailures();

        if (failures >= abortFailureLimit)
        {
            // Thrown and caught on purpose so that the reported exception
            // carries a stack trace.
            try
            {
                throw new IOException(failures + " failures downloading " + mapId);
            }
            catch (IOException ie)
            {
                reporter.ReportException(ie);
            }
        }

        CheckAndInformMRAppMaster(failures, mapId, readError, connectExcpt, hostFail);
        CheckReducerHealth();

        // Penalty delay grows geometrically with the failure count, bounded by maxDelay.
        long uncappedDelay = (long)(InitialPenalty * Math.Pow(PenaltyGrowthRate, failures));
        long delay = uncappedDelay > maxDelay ? maxDelay : uncappedDelay;
        penalties.AddItem(new ShuffleSchedulerImpl.Penalty(host, delay));

        failedShuffleCounter.Increment(1);
    }
}
/// <summary>
/// Copies exactly <paramref name="compressedLength"/> bytes of map output
/// from <paramref name="input"/> to the local <c>disk</c> stream, reading
/// through a checksum-verifying <c>IFileInputStream</c> in 64 KiB chunks.
/// </summary>
/// <param name="host">Source host; only used for the error message.</param>
/// <param name="input">Stream positioned at the start of the map output.</param>
/// <param name="compressedLength">Number of bytes to copy.</param>
/// <param name="decompressedLength">Not used by this implementation.</param>
/// <param name="metrics">Receives the number of bytes read per chunk.</param>
/// <param name="reporter">Pinged after every chunk.</param>
/// <exception cref="System.IO.IOException">
/// If the stream ends early, a read/write fails, or bytes are still missing
/// after the copy loop.
/// </exception>
public override void Shuffle(MapHost host, InputStream input, long compressedLength,
    long decompressedLength, ShuffleClientMetrics metrics, Reporter reporter)
{
    IFileInputStream checksummedInput = new IFileInputStream(input, compressedLength, conf);
    input = checksummedInput;

    // Stream the payload to local disk.
    long bytesLeft = compressedLength;
    try
    {
        int chunkSize = 64 * 1024;
        byte[] chunk = new byte[chunkSize];
        while (bytesLeft > 0)
        {
            int wanted = (int)Math.Min(bytesLeft, chunkSize);
            int read = checksummedInput.ReadWithChecksum(chunk, 0, wanted);
            if (read < 0)
            {
                throw new IOException("read past end of stream reading " + GetMapId());
            }
            disk.Write(chunk, 0, read);
            bytesLeft -= read;
            metrics.InputBytes(read);
            reporter.Progress();
        }
        Log.Info("Read " + (compressedLength - bytesLeft) + " bytes from map-output for " + GetMapId());
        disk.Close();
    }
    catch (IOException)
    {
        // Release both streams, then let the original exception propagate
        // with its stack trace intact.
        IOUtils.Cleanup(Log, input, disk);
        throw;
    }

    // Sanity check: every expected byte must have been consumed.
    if (bytesLeft != 0)
    {
        throw new IOException("Incomplete map output received for " + GetMapId() + " from " + host.GetHostName() + " (" + bytesLeft + " bytes missing of " + compressedLength + ")");
    }
    this.compressedSize = compressedLength;
}
/// <summary>
/// Decides whether a failed fetch should be retried. While the retry window
/// (<c>fetchRetryTimeout</c> milliseconds, measured from the first failure)
/// is still open, the given exception is logged and re-thrown so the caller
/// starts a new retry round. Once the window has elapsed, the timeout is
/// logged and the method returns normally so the caller can treat the fetch
/// as failed.
/// </summary>
/// <param name="host">Host the failed fetch was directed at.</param>
/// <param name="ioe">The failure that triggered this check.</param>
/// <exception cref="System.IO.IOException">
/// <paramref name="ioe"/> itself, when the retry window has not expired.
/// </exception>
private void CheckTimeoutOrRetry(MapHost host, IOException ioe)
{
    long currentTime = Time.MonotonicNow();

    // A zero start time means this is the first failure of the current
    // retry window; start the clock now.
    if (retryStartTime == 0)
    {
        retryStartTime = currentTime;
    }

    // Still inside the retry window: signal "retry" by throwing.
    if (currentTime - retryStartTime < this.fetchRetryTimeout)
    {
        Log.Warn("Shuffle output from " + host.GetHostName() + " failed, retry it.", ioe);
        // ioe is a parameter, not an exception caught here, so a bare
        // "throw;" is not available.
        throw ioe;
    }

    // Retry window exhausted: fall through so the caller records the failure.
    // (The message previously lacked the space before "after", fusing the
    // host name with the following word.)
    Log.Warn("Timeout for copying MapOutput with retry on host " + host + " after " + fetchRetryTimeout + " milliseconds.");
}
/// <summary>
/// Connects to <paramref name="host"/> and opens the shuffle response stream.
/// </summary>
/// <param name="host">Host to fetch map outputs from.</param>
/// <param name="remaining">Map attempts that are still to be fetched from the host.</param>
/// <param name="url">Shuffle URL to connect to.</param>
/// <returns>
/// The response stream, or <c>null</c> when the fetcher was stopped or the
/// connection failed. On a connection failure the host and every remaining
/// attempt are reported to the scheduler as failed and the attempts are put
/// back as known map outputs.
/// </returns>
private DataInputStream OpenShuffleUrl(MapHost host, ICollection<TaskAttemptID> remaining, Uri url)
{
    try
    {
        SetupConnectionsWithRetry(host, remaining, url);
        if (!stopped)
        {
            return new DataInputStream(connection.GetInputStream());
        }
        AbortConnect(host, remaining);
        return null;
    }
    catch (IOException ie)
    {
        bool connectExcpt = ie is ConnectException;
        ioErrs.Increment(1);
        Log.Warn("Failed to connect to " + host + " with " + remaining.Count + " map outputs", ie);

        // The connection never came up: report the host and every pending
        // map attempt as failed, which indirectly penalizes the host.
        scheduler.HostFailed(host.GetHostName());
        foreach (TaskAttemptID failedAttempt in remaining)
        {
            scheduler.CopyFailed(failedAttempt, host, false, connectExcpt);
        }

        // Then hand all of them back to the scheduler so they can be fetched again.
        foreach (TaskAttemptID pendingAttempt in remaining)
        {
            scheduler.PutBackKnownMapOutput(host, pendingAttempt);
        }
        return null;
    }
}
/// <summary>
/// Records a successful fetch of <paramref name="mapId"/> from
/// <paramref name="host"/>. Failure bookkeeping for the attempt and the host
/// is cleared unconditionally; the output is committed and progress,
/// counters and status are updated only the first time the map index is
/// seen as finished.
/// </summary>
/// <param name="mapId">Map attempt whose output was fetched.</param>
/// <param name="host">Host the output was fetched from.</param>
/// <param name="bytes">Number of bytes transferred.</param>
/// <param name="startMillis">Start of the copy, in milliseconds.</param>
/// <param name="endMillis">End of the copy, in milliseconds.</param>
/// <param name="output">Fetched map output to commit.</param>
/// <exception cref="System.IO.IOException"/>
public virtual void CopySucceeded(TaskAttemptID mapId, MapHost host, long bytes,
    long startMillis, long endMillis, MapOutput<K, V> output)
{
    lock (this)
    {
        // A success wipes earlier failure records for this attempt and host.
        Sharpen.Collections.Remove(failureCounts, mapId);
        Sharpen.Collections.Remove(hostFailures, host.GetHostName());

        int mapIndex = mapId.GetTaskID().GetId();
        if (finishedMaps[mapIndex])
        {
            // Output for this map was already committed; nothing more to do.
            return;
        }

        output.Commit();
        finishedMaps[mapIndex] = true;
        shuffledMapsCounter.Increment(1);

        remainingMaps--;
        if (remainingMaps == 0)
        {
            // Last outstanding map: wake up everyone waiting on this scheduler.
            Sharpen.Runtime.NotifyAll(this);
        }

        // Status line for this single copy. A zero-length interval is
        // treated as 1 ms to avoid dividing by zero.
        long copyMillis = endMillis - startMillis;
        if (copyMillis == 0)
        {
            copyMillis = 1;
        }
        float bytesPerMillis = (float)bytes / copyMillis;
        float transferRate = bytesPerMillis * BytesPerMillisToMbs;
        string individualProgress = "copy task(" + mapId + " succeeded" + " at " + mbpsFormat.Format(transferRate) + " MB/s)";

        // Aggregated status across all copies.
        copyTimeTracker.Add(startMillis, endMillis);
        totalBytesShuffledTillNow += bytes;
        UpdateStatus(individualProgress);
        reduceShuffleBytes.Increment(bytes);
        lastProgressTime = Time.MonotonicNow();
        Log.Debug("map " + mapId + " done " + status.GetStateString());
    }
}
/// <summary>
/// Reads one map output (header plus payload) from <paramref name="input"/>
/// and hands it to the merger.
/// </summary>
/// <param name="host">Host the stream is connected to.</param>
/// <param name="input">Shuffle response stream positioned at a shuffle header.</param>
/// <param name="remaining">
/// Map attempts still expected from this host; the fetched attempt is removed
/// from it on success.
/// </param>
/// <param name="canRetry">
/// When true, an I/O failure is first passed to <c>CheckTimeoutOrRetry</c>,
/// which re-throws it while the retry window is open.
/// </param>
/// <returns>
/// <c>null</c> when the map output was fetched successfully; an empty array
/// when nothing failed but fetching must stop for now (the merger asked to
/// wait, or a local error was reported to the scheduler); otherwise the map
/// attempts to be treated as failed.
/// </returns>
/// <exception cref="System.IO.IOException">
/// Propagated from <c>CheckTimeoutOrRetry</c> when a retry should be attempted.
/// </exception>
private TaskAttemptID[] CopyMapOutput(MapHost host, DataInputStream input,
    ICollection<TaskAttemptID> remaining, bool canRetry)
{
    MapOutput<K, V> mapOutput = null;
    TaskAttemptID mapId = null;
    long decompressedLength = -1;
    long compressedLength = -1;
    try
    {
        long startTime = Time.MonotonicNow();
        int forReduce = -1;

        // Read the shuffle header.
        try
        {
            ShuffleHeader header = new ShuffleHeader();
            header.ReadFields(input);
            mapId = TaskAttemptID.ForName(header.mapId);
            compressedLength = header.compressedLength;
            decompressedLength = header.uncompressedLength;
            forReduce = header.forReduce;
        }
        catch (ArgumentException e)
        {
            badIdErrs.Increment(1);
            Log.Warn("Invalid map id ", e);
            // Don't know which one was bad, so consider all of them as bad.
            return(Sharpen.Collections.ToArray(remaining, new TaskAttemptID[remaining.Count]));
        }

        InputStream @is = input;
        @is = CryptoUtils.WrapIfNecessary(jobConf, @is, compressedLength);
        // Both lengths from the header are reduced by the crypto padding.
        compressedLength -= CryptoUtils.CryptoPadding(jobConf);
        decompressedLength -= CryptoUtils.CryptoPadding(jobConf);

        // Do some basic sanity verification.
        if (!VerifySanity(compressedLength, decompressedLength, forReduce, remaining, mapId))
        {
            return(new TaskAttemptID[] { mapId });
        }

        if (Log.IsDebugEnabled())
        {
            Log.Debug("header: " + mapId + ", len: " + compressedLength + ", decomp len: " + decompressedLength);
        }

        // Get the location for the map output - either in-memory or on-disk.
        try
        {
            mapOutput = merger.Reserve(mapId, decompressedLength, id);
        }
        catch (IOException ioe)
        {
            // Kill this reduce attempt.
            ioErrs.Increment(1);
            scheduler.ReportLocalError(ioe);
            return(EmptyAttemptIdArray);
        }

        // Check if we can shuffle *now* ...
        if (mapOutput == null)
        {
            Log.Info("fetcher#" + id + " - MergeManager returned status WAIT ...");
            // Not an error, but wait to process data.
            return(EmptyAttemptIdArray);
        }

        // The codecs for lz0, lz4, snappy, bz2, etc. throw
        // java.lang.InternalError on decompression failures. Catch it and
        // re-throw as IOException so the fetch-failure logic below runs.
        try
        {
            // Go!
            Log.Info("fetcher#" + id + " about to shuffle output of map " + mapOutput.GetMapId() + " decomp: " + decompressedLength + " len: " + compressedLength + " to " + mapOutput.GetDescription());
            mapOutput.Shuffle(host, @is, compressedLength, decompressedLength, metrics, reporter);
        }
        catch (InternalError e)
        {
            Log.Warn("Failed to shuffle for fetcher#" + id, e);
            throw new IOException(e);
        }

        // Inform the shuffle scheduler.
        long endTime = Time.MonotonicNow();
        // Reset retryStartTime, as the map task made progress if it was retried before.
        retryStartTime = 0;
        scheduler.CopySucceeded(mapId, host, compressedLength, startTime, endTime, mapOutput);

        // Note the successful shuffle.
        remaining.Remove(mapId);
        metrics.SuccessFetch();
        return(null);
    }
    catch (IOException ioe)
    {
        if (mapOutput != null)
        {
            mapOutput.Abort();
        }

        // Throws ioe back to the caller while the retry window is still open.
        if (canRetry)
        {
            CheckTimeoutOrRetry(host, ioe);
        }

        ioErrs.Increment(1);
        if (mapId == null || mapOutput == null)
        {
            // A space now separates "header" from the attempt id; the message
            // previously ran the two together.
            Log.Warn("fetcher#" + id + " failed to read map header " + mapId + " decomp: " + decompressedLength + ", " + compressedLength, ioe);
            if (mapId == null)
            {
                // The header never yielded an id: blame every remaining attempt.
                return(Sharpen.Collections.ToArray(remaining, new TaskAttemptID[remaining.Count]));
            }
            else
            {
                return(new TaskAttemptID[] { mapId });
            }
        }

        Log.Warn("Failed to shuffle output of " + mapId + " from " + host.GetHostName(), ioe);

        // Inform the shuffle-scheduler.
        metrics.FailedFetch();
        return(new TaskAttemptID[] { mapId });
    }
}
/// <summary>
/// Fetches every map output the scheduler currently assigns to
/// <paramref name="host"/>. Outputs are copied one at a time over a single
/// connection until all are fetched or a copy reports failed tasks; when a
/// copy throws, the connection is re-established for the attempts that are
/// still outstanding. Whatever is left unfetched at the end is handed back
/// to the scheduler.
/// </summary>
/// <param name="host">Host to fetch map outputs from.</param>
protected internal virtual void CopyFromHost(MapHost host)
{
    // A new host starts with a fresh retry window.
    retryStartTime = 0;

    // Completed maps located on this host.
    IList<TaskAttemptID> maps = scheduler.GetMapsForHost(host);

    // Hosts that only carry 'OBSOLETE' maps (typical at the tail of large
    // jobs) have nothing to fetch.
    if (maps.Count == 0)
    {
        return;
    }

    if (Log.IsDebugEnabled())
    {
        Log.Debug("Fetcher " + id + " going to fetch from " + host + " for: " + maps);
    }

    // Attempts that still have to be fetched.
    ICollection<TaskAttemptID> remaining = new HashSet<TaskAttemptID>(maps);

    // Build the URL and connect.
    DataInputStream input = OpenShuffleUrl(host, remaining, GetMapOutputURL(host, maps));
    if (input == null)
    {
        return;
    }

    try
    {
        // Copy outputs until none remain or a copy reports a (possibly
        // empty) set of failed tasks, which ends the loop.
        TaskAttemptID[] failedTasks = null;
        while (failedTasks == null && !remaining.IsEmpty())
        {
            try
            {
                failedTasks = CopyMapOutput(host, input, remaining, fetchRetryEnabled);
            }
            catch (IOException)
            {
                // The connection was dropped: reconnect and ask only for
                // the outputs that are still missing.
                connection.Disconnect();
                input = OpenShuffleUrl(host, remaining, GetMapOutputURL(host, remaining));
                if (input == null)
                {
                    return;
                }
            }
        }

        if (failedTasks != null && failedTasks.Length > 0)
        {
            Log.Warn("copyMapOutput failed for tasks " + Arrays.ToString(failedTasks));
            scheduler.HostFailed(host.GetHostName());
            foreach (TaskAttemptID failedAttempt in failedTasks)
            {
                scheduler.CopyFailed(failedAttempt, host, true, false);
            }
        }

        // Sanity check: without reported failures nothing may be left over.
        if (failedTasks == null && !remaining.IsEmpty())
        {
            throw new IOException("server didn't return all expected map outputs: " + remaining.Count + " left.");
        }

        input.Close();
        input = null;
    }
    finally
    {
        // Still non-null only when the normal close above was not reached.
        if (input != null)
        {
            IOUtils.Cleanup(Log, input);
        }

        // Give every unfetched attempt back to the scheduler.
        foreach (TaskAttemptID unfetched in remaining)
        {
            scheduler.PutBackKnownMapOutput(host, unfetched);
        }
    }
}