/// <summary>
/// Configures and runs the "sortvalidate-record-checker" MapReduce job, which
/// maps over both the sort input and the sort output to verify the records match.
/// </summary>
/// <param name="defaults">base configuration the job conf is derived from.</param>
/// <param name="noMaps">requested number of map tasks; -1 means derive from cluster size.</param>
/// <param name="noReduces">requested number of reduce tasks; -1 means derive from cluster size.</param>
/// <param name="sortInput">directory holding the original (pre-sort) data.</param>
/// <param name="sortOutput">directory holding the sorted output to validate.</param>
/// <exception cref="System.IO.IOException"/>
internal static void CheckRecords(Configuration defaults, int noMaps, int noReduces
    , Path sortInput, Path sortOutput)
{
    JobConf jobConf = new JobConf(defaults, typeof(SortValidator.RecordChecker));
    jobConf.SetJobName("sortvalidate-record-checker");
    jobConf.SetInputFormat(typeof(SequenceFileInputFormat));
    jobConf.SetOutputFormat(typeof(SequenceFileOutputFormat));
    jobConf.SetOutputKeyClass(typeof(BytesWritable));
    jobConf.SetOutputValueClass(typeof(IntWritable));
    jobConf.SetMapperClass(typeof(SortValidator.RecordChecker.Map));
    jobConf.SetReducerClass(typeof(SortValidator.RecordChecker.Reduce));
    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.GetClusterStatus();
    // -1 means "auto-size": scale maps by tracker count (default 10 per host).
    if (noMaps == -1)
    {
        noMaps = cluster.GetTaskTrackers() * jobConf.GetInt(MapsPerHost, 10);
    }
    if (noReduces == -1)
    {
        // Default to 90% of the cluster's reduce capacity...
        noReduces = (int)(cluster.GetMaxReduceTasks() * 0.9);
        // ...unless a per-host reduce count was explicitly configured.
        string sortReduces = jobConf.Get(ReducesPerHost);
        if (sortReduces != null)
        {
            noReduces = cluster.GetTaskTrackers() * System.Convert.ToInt32(sortReduces);
        }
    }
    jobConf.SetNumMapTasks(noMaps);
    jobConf.SetNumReduceTasks(noReduces);
    // The job reads BOTH directories; the mapper distinguishes them by path.
    FileInputFormat.SetInputPaths(jobConf, sortInput);
    FileInputFormat.AddInputPath(jobConf, sortOutput);
    // Fixed scratch location; any stale results from a prior run are removed.
    Path outputPath = new Path("/tmp/sortvalidate/recordchecker");
    FileSystem fs = FileSystem.Get(defaults);
    if (fs.Exists(outputPath))
    {
        fs.Delete(outputPath, true);
    }
    FileOutputFormat.SetOutputPath(jobConf, outputPath);
    // Uncomment to run locally in a single process
    //job_conf.set(JTConfig.JT, "local");
    Path[] inputPaths = FileInputFormat.GetInputPaths(jobConf);
    System.Console.Out.WriteLine("\nSortValidator.RecordChecker: Running on " + cluster
        .GetTaskTrackers() + " nodes to validate sort from " + inputPaths[0] + ", " + inputPaths
        [1] + " into " + FileOutputFormat.GetOutputPath(jobConf) + " with " + noReduces
        + " reduces.");
    // NOTE(review): new DateTime() relies on the Sharpen shim's GetTime(); in plain
    // .NET this would be DateTime.MinValue, not "now" — confirm against the Sharpen runtime.
    DateTime startTime = new DateTime();
    System.Console.Out.WriteLine("Job started: " + startTime);
    // Blocks until the job completes (or throws on failure).
    JobClient.RunJob(jobConf);
    DateTime end_time = new DateTime();
    System.Console.Out.WriteLine("Job ended: " + end_time);
    System.Console.Out.WriteLine("The job took " + (end_time.GetTime() - startTime.GetTime
        ()) / 1000 + " seconds.");
}
/// <summary>
/// Job-cleanup hook: drops an empty marker file (CustomCleanupFileName) into the
/// job's output directory so the test can observe that cleanup ran.
/// </summary>
/// <exception cref="System.IO.IOException"/>
public override void CleanupJob(JobContext context)
{
    // NOTE(review): leftover debug trace; kept to preserve behavior exactly.
    System.Console.Error.WriteLine("---- HERE ----");
    JobConf jobConf = context.GetJobConf();
    Path outDir = FileOutputFormat.GetOutputPath(jobConf);
    FileSystem fileSys = outDir.GetFileSystem(jobConf);
    Path marker = new Path(outDir, CustomCleanupFileName);
    // Create-and-close immediately: only the file's existence matters.
    fileSys.Create(marker).Close();
}
/// <summary>
/// Job-abort hook: records how the job ended by creating a state-specific marker
/// file (failed vs. killed) in the job's output directory.
/// </summary>
/// <exception cref="System.IO.IOException"/>
public override void AbortJob(JobContext context, int state)
{
    JobConf jobConf = context.GetJobConf();
    Path outDir = FileOutputFormat.GetOutputPath(jobConf);
    FileSystem fileSys = outDir.GetFileSystem(jobConf);
    // Choose the marker name by terminal state; anything not Failed is treated as killed.
    string markerName;
    if (state == JobStatus.Failed)
    {
        markerName = TestJobCleanup.AbortFailedFileName;
    }
    else
    {
        markerName = TestJobCleanup.AbortKilledFileName;
    }
    fileSys.Create(new Path(outDir, markerName)).Close();
}
// Input formats
/// <summary>
/// Fabricates numSplits synthetic splits, each naming a "dummy-split-N" path under
/// the job's output directory with offset 0, length 1, and no locality hints.
/// </summary>
/// <exception cref="System.IO.IOException"/>
public virtual InputSplit[] GetSplits(JobConf job, int numSplits)
{
    Path outDir = FileOutputFormat.GetOutputPath(job);
    InputSplit[] splits = new InputSplit[numSplits];
    int idx = 0;
    while (idx < splits.Length)
    {
        splits[idx] = new FileSplit(new Path(outDir, "dummy-split-" + idx), 0, 1, (string[])null);
        ++idx;
    }
    return splits;
}
/// <summary>Get the directory to which skipped records are written.</summary>
/// <remarks>
/// Get the directory to which skipped records are written. When not explicitly
/// configured, it defaults to the "_logs/skip" sub-directory of the job's output
/// directory. The configured value "none" disables skip-record output entirely.
/// </remarks>
/// <param name="conf">the configuration.</param>
/// <returns>
/// path of the skip output directory; null if writing is disabled ("none")
/// or neither this setting nor an output directory is configured.
/// </returns>
public static Path GetSkipOutputPath(Configuration conf)
{
    string configured = conf.Get(OutPath);
    if (configured == null)
    {
        // Not configured: fall back to <output>/_logs/skip if an output dir exists.
        Path jobOutput = FileOutputFormat.GetOutputPath(new JobConf(conf));
        if (jobOutput == null)
        {
            return null;
        }
        return new Path(jobOutput, "_logs" + Path.Separator + "skip");
    }
    if ("none".Equals(configured))
    {
        // Sentinel value meaning "do not write skipped records".
        return null;
    }
    return new Path(configured);
}
/// <summary>
/// Wraps the reduce task's new-API record writer so that output records and
/// output bytes are accounted to the task's counters.
/// </summary>
/// <exception cref="System.Exception"/>
/// <exception cref="System.IO.IOException"/>
internal NewTrackingRecordWriter(ReduceTask reduce, TaskAttemptContext taskContext
    )
{
    this.outputRecordCounter = reduce.reduceOutputCounter;
    this.fileOutputByteCounter = reduce.fileOutputByteCounter;
    // Byte-level stats are only available for file-based output formats.
    IList <FileSystem.Statistics> matchedStats = null;
    if (reduce.outputFormat is FileOutputFormat)
    {
        matchedStats = GetFsStatistics(FileOutputFormat.GetOutputPath(taskContext), taskContext
            .GetConfiguration());
    }
    fsStats = matchedStats;
    // Sample the byte count BEFORE constructing the real writer, then again after,
    // so bytes written while opening the output file are charged to the counter.
    // This before/after ordering is load-bearing — do not reorder.
    long bytesOutPrev = GetOutputBytes(fsStats);
    this.real = (RecordWriter <K, V>)reduce.outputFormat.GetRecordWriter(taskContext);
    long bytesOutCurr = GetOutputBytes(fsStats);
    fileOutputByteCounter.Increment(bytesOutCurr - bytesOutPrev);
}
/// <summary>
/// Wraps the reduce task's old-API record writer so that output records and
/// output bytes are accounted to the task's counters.
/// </summary>
/// <exception cref="System.IO.IOException"/>
public OldTrackingRecordWriter(ReduceTask reduce, JobConf job, Task.TaskReporter
    reporter, string finalName)
{
    this.reduceOutputCounter = reduce.reduceOutputCounter;
    this.fileOutputByteCounter = reduce.fileOutputByteCounter;
    // Byte-level stats are only available for file-based output formats.
    IList <FileSystem.Statistics> matchedStats = null;
    if (job.GetOutputFormat() is FileOutputFormat)
    {
        matchedStats = GetFsStatistics(FileOutputFormat.GetOutputPath(job), job);
    }
    fsStats = matchedStats;
    FileSystem fs = FileSystem.Get(job);
    // Sample the byte count before and after constructing the real writer so bytes
    // written while opening the output file are charged to the counter.
    // This before/after ordering is load-bearing — do not reorder.
    long bytesOutPrev = GetOutputBytes(fsStats);
    this.real = job.GetOutputFormat().GetRecordWriter(fs, job, finalName, reporter);
    long bytesOutCurr = GetOutputBytes(fsStats);
    fileOutputByteCounter.Increment(bytesOutCurr - bytesOutPrev);
}
/// <summary>Runs a MapReduce task, given number of times.</summary>
/// <remarks>
/// Runs a MapReduce task, given number of times. The input to each run
/// is the same file; each run writes to a freshly randomized output directory.
/// </remarks>
/// <param name="masterJobConf">template configuration cloned for every run.</param>
/// <param name="numRuns">how many times to execute the job.</param>
/// <returns>wall-clock execution time (ms) of each run, in order.</returns>
/// <exception cref="System.IO.IOException"/>
private AList<long> RunJobInSequence(JobConf masterJobConf, int numRuns)
{
    AList<long> execTimes = new AList<long>();
    Random rng = new Random();
    int run = 0;
    while (run < numRuns)
    {
        // A fresh conf is required each iteration; reusing one object does not work.
        JobConf runConf = new JobConf(masterJobConf);
        // The copy constructor does not carry the job jar over — set it explicitly.
        runConf.SetJar(masterJobConf.GetJar());
        // Randomize the output directory so successive runs never collide.
        FileOutputFormat.SetOutputPath(runConf, new Path(OutputDir, "output_" + rng.Next()));
        Log.Info("Running job " + run + ":" + " input=" + FileInputFormat.GetInputPaths(runConf)[0]
            + " output=" + FileOutputFormat.GetOutputPath(runConf));
        // Time the blocking job run.
        long startedAt = Runtime.CurrentTimeMillis();
        JobClient.RunJob(runConf);
        execTimes.AddItem(Runtime.CurrentTimeMillis() - startedAt);
        ++run;
    }
    return execTimes;
}
/// <summary>Resolves the configured output directory for the task attempt's job.</summary>
private static Path GetOutputPath(TaskAttemptContext context)
{
    return FileOutputFormat.GetOutputPath(context.GetJobConf());
}
/// <summary>
/// Configures and runs the "sortvalidate-recordstats-checker" MapReduce job,
/// then asserts that the aggregate statistics (bytes, record count, checksum)
/// of the sort input equal those of the sort output.
/// </summary>
/// <param name="defaults">base configuration the job conf is derived from.</param>
/// <param name="sortInput">directory holding the original (pre-sort) data.</param>
/// <param name="sortOutput">directory holding the sorted output to validate.</param>
/// <exception cref="System.IO.IOException">
/// on I/O failure, or when the input/output statistics do not match.
/// </exception>
internal static void CheckRecords(Configuration defaults, Path sortInput, Path sortOutput
    )
{
    FileSystem inputfs = sortInput.GetFileSystem(defaults);
    FileSystem outputfs = sortOutput.GetFileSystem(defaults);
    FileSystem defaultfs = FileSystem.Get(defaults);
    JobConf jobConf = new JobConf(defaults, typeof(SortValidator.RecordStatsChecker));
    jobConf.SetJobName("sortvalidate-recordstats-checker");
    // One map task per sort-output partition (see SetNumMapTasks below).
    int noSortReduceTasks = outputfs.ListStatus(sortOutput, sortPathsFilter).Length;
    jobConf.SetInt(SortReduces, noSortReduceTasks);
    int noSortInputpaths = inputfs.ListStatus(sortInput).Length;
    // Non-splitable input format keeps each file in exactly one map.
    jobConf.SetInputFormat(typeof(SortValidator.RecordStatsChecker.NonSplitableSequenceFileInputFormat
        ));
    jobConf.SetOutputFormat(typeof(SequenceFileOutputFormat));
    jobConf.SetOutputKeyClass(typeof(IntWritable));
    jobConf.SetOutputValueClass(typeof(SortValidator.RecordStatsChecker.RecordStatsWritable
        ));
    jobConf.SetMapperClass(typeof(SortValidator.RecordStatsChecker.Map));
    jobConf.SetCombinerClass(typeof(SortValidator.RecordStatsChecker.Reduce));
    jobConf.SetReducerClass(typeof(SortValidator.RecordStatsChecker.Reduce));
    jobConf.SetNumMapTasks(noSortReduceTasks);
    // A single reducer emits exactly two stats records: one per input directory.
    jobConf.SetNumReduceTasks(1);
    // The job reads BOTH directories; the mapper distinguishes them by path.
    FileInputFormat.SetInputPaths(jobConf, sortInput);
    FileInputFormat.AddInputPath(jobConf, sortOutput);
    // Random scratch dir avoids collisions between concurrent validations.
    Path outputPath = new Path(new Path("/tmp", "sortvalidate"), UUID.RandomUUID().ToString
        ());
    if (defaultfs.Exists(outputPath))
    {
        defaultfs.Delete(outputPath, true);
    }
    FileOutputFormat.SetOutputPath(jobConf, outputPath);
    // Uncomment to run locally in a single process
    //job_conf.set(JTConfig.JT, "local");
    Path[] inputPaths = FileInputFormat.GetInputPaths(jobConf);
    System.Console.Out.WriteLine("\nSortValidator.RecordStatsChecker: Validate sort "
        + "from " + inputPaths[0] + " (" + noSortInputpaths + " files), " + inputPaths[
        1] + " (" + noSortReduceTasks + " files) into " + FileOutputFormat.GetOutputPath
        (jobConf) + " with 1 reducer.");
    // NOTE(review): new DateTime() relies on the Sharpen shim's GetTime(); in plain
    // .NET this would be DateTime.MinValue, not "now" — confirm against the Sharpen runtime.
    DateTime startTime = new
        DateTime();
    System.Console.Out.WriteLine("Job started: " + startTime);
    JobClient.RunJob(jobConf);
    try
    {
        DateTime end_time = new DateTime();
        System.Console.Out.WriteLine("Job ended: " + end_time);
        System.Console.Out.WriteLine("The job took " + (end_time.GetTime() - startTime.GetTime
            ()) / 1000 + " seconds.");
        // Check to ensure that the statistics of the
        // framework's sort-input and sort-output match
        SequenceFile.Reader stats = new SequenceFile.Reader(defaultfs, new Path(outputPath
            , "part-00000"), defaults);
        try
        {
            IntWritable k1 = new IntWritable();
            IntWritable k2 = new IntWritable();
            SortValidator.RecordStatsChecker.RecordStatsWritable v1 = new SortValidator.RecordStatsChecker.RecordStatsWritable
                ();
            SortValidator.RecordStatsChecker.RecordStatsWritable v2 = new SortValidator.RecordStatsChecker.RecordStatsWritable
                ();
            // Record #1: stats of the sort input; record #2: stats of the sort output.
            if (!stats.Next(k1, v1))
            {
                throw new IOException("Failed to read record #1 from reduce's output");
            }
            if (!stats.Next(k2, v2))
            {
                throw new IOException("Failed to read record #2 from reduce's output");
            }
            // All three aggregates must agree, otherwise the sort lost/changed data.
            if ((v1.GetBytes() != v2.GetBytes()) || (v1.GetRecords() != v2.GetRecords()) || v1
                .GetChecksum() != v2.GetChecksum())
            {
                throw new IOException("(" + v1.GetBytes() + ", " + v1.GetRecords() + ", " + v1.GetChecksum
                    () + ") v/s (" + v2.GetBytes() + ", " + v2.GetRecords() + ", " + v2.GetChecksum(
                    ) + ")");
            }
        }
        finally
        {
            stats.Close();
        }
    }
    finally
    {
        // Scratch output is always removed, pass or fail.
        defaultfs.Delete(outputPath, true);
    }
}
/// <summary>
/// Tool entry point: configures and runs the generic load-generator job.
/// With no output path, output is discarded (NullOutputFormat); with no input
/// paths, random input is synthesized; with an IndirectInputFormat configured,
/// a sequence file listing every input file (recursively) is built first.
/// </summary>
/// <param name="argv">command-line arguments, parsed by ParseArgs.</param>
/// <returns>0 on success, -1 if argument parsing failed.</returns>
/// <exception cref="System.Exception"/>
public virtual int Run(string[] argv)
{
    JobConf job = new JobConf(GetConf());
    job.SetJarByClass(typeof(GenericMRLoadGenerator));
    job.SetMapperClass(typeof(GenericMRLoadGenerator.SampleMapper));
    job.SetReducerClass(typeof(GenericMRLoadGenerator.SampleReducer));
    if (!ParseArgs(argv, job))
    {
        return(-1);
    }
    if (null == FileOutputFormat.GetOutputPath(job))
    {
        // No output dir? No writes
        job.SetOutputFormat(typeof(NullOutputFormat));
    }
    if (0 == FileInputFormat.GetInputPaths(job).Length)
    {
        // No input dir? Generate random data
        System.Console.Error.WriteLine("No input path; ignoring InputFormat");
        ConfRandom(job);
    }
    else
    {
        if (null != job.GetClass(GenericMRLoadGenerator.IndirectInputFormat, null))
        {
            // specified IndirectInputFormat? Build src list
            JobClient jClient = new JobClient(job);
            Path tmpDir = new Path(jClient.GetFs().GetHomeDirectory(), ".staging");
            Random r = new Random();
            // Random base-36 name keeps concurrent invocations from colliding.
            Path indirInputFile = new Path(tmpDir, Sharpen.Extensions.ToString(r.Next(int.MaxValue
                ), 36) + "_files");
            job.Set(GenericMRLoadGenerator.IndirectInputFile, indirInputFile.ToString());
            // Sequence file of (file length, file URI) pairs consumed by the job.
            SequenceFile.Writer writer = SequenceFile.CreateWriter(tmpDir.GetFileSystem(job),
                job, indirInputFile, typeof(LongWritable), typeof(Text), SequenceFile.CompressionType
                .None);
            try
            {
                // Iterative depth-first walk of every configured input path.
                foreach (Path p in FileInputFormat.GetInputPaths(job))
                {
                    FileSystem fs = p.GetFileSystem(job);
                    Stack <Path> pathstack = new Stack <Path>();
                    pathstack.Push(p);
                    while (!pathstack.Empty())
                    {
                        foreach (FileStatus stat in fs.ListStatus(pathstack.Pop()))
                        {
                            if (stat.IsDirectory())
                            {
                                // Skip hidden/system dirs such as _logs.
                                if (!stat.GetPath().GetName().StartsWith("_"))
                                {
                                    pathstack.Push(stat.GetPath());
                                }
                            }
                            else
                            {
                                // Sync point before each record allows the file to be split.
                                writer.Sync();
                                writer.Append(new LongWritable(stat.GetLen()), new Text(stat.GetPath().ToUri().ToString
                                    ()));
                            }
                        }
                    }
                }
            }
            finally
            {
                writer.Close();
            }
        }
    }
    // NOTE(review): new DateTime() relies on the Sharpen shim's GetTime(); in plain
    // .NET this would be DateTime.MinValue, not "now" — confirm against the Sharpen runtime.
    DateTime startTime = new DateTime();
    System.Console.Out.WriteLine("Job started: " + startTime);
    // Blocks until the job completes (or throws on failure).
    JobClient.RunJob(job);
    DateTime endTime = new DateTime();
    System.Console.Out.WriteLine("Job ended: " + endTime);
    System.Console.Out.WriteLine("The job took " + (endTime.GetTime() - startTime.GetTime
        ()) / 1000 + " seconds.");
    return(0);
}