/// <summary>
/// HADOOP-4466:
/// This test verifies that the JavaSerialization impl can write to
/// SequenceFiles.
/// </summary>
/// <remarks>
/// HADOOP-4466:
/// This test verifies that the JavaSerialization impl can write to
/// SequenceFiles, by virtue of SequenceFileOutputFormat not being
/// coupled to Writable types; if it were, the job would fail.
/// </remarks>
/// <exception cref="System.Exception"/>
public virtual void TestWriteToSequencefile()
{
    JobConf conf = new JobConf(typeof(TestJavaSerialization));
    conf.SetJobName("JavaSerialization");
    FileSystem fs = FileSystem.Get(conf);
    CleanAndCreateInput(fs);
    conf.Set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," +
        "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.SetInputFormat(typeof(TextInputFormat));
    // test we can write to sequence files
    conf.SetOutputFormat(typeof(SequenceFileOutputFormat));
    conf.SetOutputKeyClass(typeof(string));
    conf.SetOutputValueClass(typeof(long));
    conf.SetOutputKeyComparatorClass(typeof(JavaSerializationComparator));
    conf.SetMapperClass(typeof(TestJavaSerialization.WordCountMapper));
    conf.SetReducerClass(typeof(TestJavaSerialization.SumReducer));
    conf.Set(MRConfig.FrameworkName, MRConfig.LocalFrameworkName);
    FileInputFormat.SetInputPaths(conf, InputDir);
    FileOutputFormat.SetOutputPath(conf, OutputDir);
    JobClient.RunJob(conf);
    Path[] outputFiles = FileUtil.Stat2Paths(
        fs.ListStatus(OutputDir, new Utils.OutputFileUtils.OutputFilesFilter()));
    NUnit.Framework.Assert.AreEqual(1, outputFiles.Length);
}
public virtual void TestOldCounterC()
{
    JobConf conf = CreateConfiguration();
    CreateWordsFile(inFiles[3], conf);
    CreateWordsFile(inFiles[4], conf);
    long inputSize = 0;
    inputSize += GetFileSize(inFiles[0]);
    inputSize += GetFileSize(inFiles[1]);
    inputSize += GetFileSize(inFiles[2]);
    inputSize += GetFileSize(inFiles[3]);
    inputSize += GetFileSize(inFiles[4]);
    conf.SetNumMapTasks(4);
    conf.SetInt(JobContext.IoSortFactor, 3);
    FileInputFormat.SetInputPaths(conf, InDir);
    FileOutputFormat.SetOutputPath(conf, new Path(OutDir, "outputO2"));
    RunningJob myJob = JobClient.RunJob(conf);
    Counters c1 = myJob.GetCounters();
    // As above, each map spills 2^14 records, so 5 maps spill 81920.
    // 1st merge: read + write = 6 * 8192 (only the written records count as spills)
    // final merge: unmerged = 2 * 8192
    // Total reduce: 3 * 8192 + 2 * 8192 = 40960
    // 5 files, 5120 (= 5 * 1024) rec/file = 25600 input records
    // 4 records/line = 102400 output records
    ValidateCounters(c1, 122880, 25600, 102400);
    ValidateFileCounters(c1, inputSize, 0, 0, 0);
}
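// Back-of-envelope check of the counter expectations above (arithmetic only, not
// part of the original test; the spill accounting follows the comments in the body):
//   map spills:     5 maps * 2^14 records/map               = 81920
//   reduce spills:  3 * 8192 (1st-merge writes) + 2 * 8192  = 40960
//   total spilled:  81920 + 40960                           = 122880
//   input records:  5 files * 5120 records/file             = 25600
//   output records: 25600 lines * 4 records/line            = 102400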
/// <exception cref="System.Exception"/>
public virtual void TestComplexNameWithRegex()
{
    OutputStream os = GetFileSystem().Create(new Path(GetInputDir(), "text.txt"));
    TextWriter wr = new OutputStreamWriter(os);
    wr.Write("b a\n");
    wr.Close();
    JobConf conf = CreateJobConf();
    conf.SetJobName("name \\Evalue]");
    conf.SetInputFormat(typeof(TextInputFormat));
    conf.SetOutputKeyClass(typeof(LongWritable));
    conf.SetOutputValueClass(typeof(Text));
    conf.SetMapperClass(typeof(IdentityMapper));
    FileInputFormat.SetInputPaths(conf, GetInputDir());
    FileOutputFormat.SetOutputPath(conf, GetOutputDir());
    JobClient.RunJob(conf);
    Path[] outputFiles = FileUtil.Stat2Paths(
        GetFileSystem().ListStatus(GetOutputDir(), new Utils.OutputFileUtils.OutputFilesFilter()));
    NUnit.Framework.Assert.AreEqual(1, outputFiles.Length);
    InputStream @is = GetFileSystem().Open(outputFiles[0]);
    BufferedReader reader = new BufferedReader(new InputStreamReader(@is));
    NUnit.Framework.Assert.AreEqual("0\tb a", reader.ReadLine());
    NUnit.Framework.Assert.IsNull(reader.ReadLine());
    reader.Close();
}
/// <exception cref="System.IO.IOException"/>
internal static void ConfigureWordCount(FileSystem fs, JobConf conf, string input,
    int numMaps, int numReduces, Path inDir, Path outDir)
{
    fs.Delete(outDir, true);
    if (!fs.Mkdirs(inDir))
    {
        throw new IOException("Mkdirs failed to create " + inDir.ToString());
    }
    DataOutputStream file = fs.Create(new Path(inDir, "part-0"));
    file.WriteBytes(input);
    file.Close();
    FileSystem.SetDefaultUri(conf, fs.GetUri());
    conf.Set(JTConfig.FrameworkName, JTConfig.YarnFrameworkName);
    conf.SetJobName("wordcount");
    conf.SetInputFormat(typeof(TextInputFormat));
    // the keys are words (strings)
    conf.SetOutputKeyClass(typeof(Text));
    // the values are counts (ints)
    conf.SetOutputValueClass(typeof(IntWritable));
    conf.Set("mapred.mapper.class", "testjar.ClassWordCount$MapClass");
    conf.Set("mapred.combine.class", "testjar.ClassWordCount$Reduce");
    conf.Set("mapred.reducer.class", "testjar.ClassWordCount$Reduce");
    FileInputFormat.SetInputPaths(conf, inDir);
    FileOutputFormat.SetOutputPath(conf, outDir);
    conf.SetNumMapTasks(numMaps);
    conf.SetNumReduceTasks(numReduces);
    // set the tests jar file
    conf.SetJarByClass(typeof(TestMiniMRClasspath));
}
public virtual void Configure()
{
    Path testdir = new Path(TestDir.GetAbsolutePath());
    Path inDir = new Path(testdir, "in");
    Path outDir = new Path(testdir, "out");
    FileSystem fs = FileSystem.Get(conf);
    fs.Delete(testdir, true);
    conf.SetInt(JobContext.IoSortMb, 1);
    conf.SetInputFormat(typeof(SequenceFileInputFormat));
    FileInputFormat.SetInputPaths(conf, inDir);
    FileOutputFormat.SetOutputPath(conf, outDir);
    conf.SetMapperClass(typeof(TestMapOutputType.TextGen));
    conf.SetReducerClass(typeof(TestMapOutputType.TextReduce));
    conf.SetOutputKeyClass(typeof(Text));
    conf.SetOutputValueClass(typeof(Text));
    conf.Set(MRConfig.FrameworkName, MRConfig.LocalFrameworkName);
    conf.SetOutputFormat(typeof(SequenceFileOutputFormat));
    if (!fs.Mkdirs(testdir))
    {
        throw new IOException("Mkdirs failed to create " + testdir.ToString());
    }
    if (!fs.Mkdirs(inDir))
    {
        throw new IOException("Mkdirs failed to create " + inDir.ToString());
    }
    Path inFile = new Path(inDir, "part0");
    SequenceFile.Writer writer = SequenceFile.CreateWriter(fs, conf, inFile,
        typeof(Text), typeof(Text));
    writer.Append(new Text("rec: 1"), new Text("Hello"));
    writer.Close();
    jc = new JobClient(conf);
}
/// <exception cref="System.Exception"/>
private static void RunTestLazyOutput(JobConf job, Path output, int numReducers,
    bool createLazily)
{
    job.SetJobName("test-lazy-output");
    FileInputFormat.SetInputPaths(job, Input);
    FileOutputFormat.SetOutputPath(job, output);
    job.SetInputFormat(typeof(TextInputFormat));
    job.SetMapOutputKeyClass(typeof(LongWritable));
    job.SetMapOutputValueClass(typeof(Text));
    job.SetOutputKeyClass(typeof(LongWritable));
    job.SetOutputValueClass(typeof(Text));
    job.SetMapperClass(typeof(TestLazyOutput.TestMapper));
    job.SetReducerClass(typeof(TestLazyOutput.TestReducer));
    JobClient client = new JobClient(job);
    job.SetNumReduceTasks(numReducers);
    if (createLazily)
    {
        LazyOutputFormat.SetOutputFormatClass(job, typeof(TextOutputFormat));
    }
    else
    {
        job.SetOutputFormat(typeof(TextOutputFormat));
    }
    JobClient.RunJob(job);
}
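// Note on the branch above: LazyOutputFormat wraps the real output format and defers
// creating the part file until the first record is actually written. So with
// createLazily == true, a task that emits nothing produces no output file at all,
// whereas the plain TextOutputFormat still creates an empty part file.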
public virtual void TestStatusLimit()
{
    Path test = new Path(testRootTempDir, "testStatusLimit");
    Configuration conf = new Configuration();
    Path inDir = new Path(test, "in");
    Path outDir = new Path(test, "out");
    FileSystem fs = FileSystem.Get(conf);
    if (fs.Exists(inDir))
    {
        fs.Delete(inDir, true);
    }
    fs.Mkdirs(inDir);
    DataOutputStream file = fs.Create(new Path(inDir, "part-" + 0));
    file.WriteBytes("testStatusLimit");
    file.Close();
    if (fs.Exists(outDir))
    {
        fs.Delete(outDir, true);
    }
    Job job = Job.GetInstance(conf, "testStatusLimit");
    job.SetMapperClass(typeof(TestReporter.StatusLimitMapper));
    job.SetNumReduceTasks(0);
    FileInputFormat.AddInputPath(job, inDir);
    FileOutputFormat.SetOutputPath(job, outDir);
    job.WaitForCompletion(true);
    NUnit.Framework.Assert.IsTrue("Job failed", job.IsSuccessful());
}
/// <summary>Test using the gzip codec with two input files.</summary>
/// <exception cref="System.IO.IOException"/>
public virtual void TestGzipWithTwoInputs()
{
    CompressionCodec gzip = new GzipCodec();
    localFs.Delete(workDir, true);
    FixedLengthInputFormat format = new FixedLengthInputFormat();
    JobConf job = new JobConf(defaultConf);
    FixedLengthInputFormat.SetRecordLength(job, 5);
    FileInputFormat.SetInputPaths(job, workDir);
    ReflectionUtils.SetConf(gzip, job);
    format.Configure(job);
    // Create files containing fixed-length records, 5 bytes per record.
    WriteFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
        "one  two  threefour five six  seveneightnine ten  ");
    WriteFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
        "ten  nine eightsevensix  five four threetwo  one  ");
    InputSplit[] splits = format.GetSplits(job, 100);
    NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);
    FileSplit tmp = (FileSplit)splits[0];
    if (tmp.GetPath().GetName().Equals("part2.txt.gz"))
    {
        splits[0] = splits[1];
        splits[1] = tmp;
    }
    IList<string> results = ReadSplit(format, splits[0], job);
    NUnit.Framework.Assert.AreEqual("splits[0] length", 10, results.Count);
    NUnit.Framework.Assert.AreEqual("splits[0][5]", "six  ", results[5]);
    results = ReadSplit(format, splits[1], job);
    NUnit.Framework.Assert.AreEqual("splits[1] length", 10, results.Count);
    NUnit.Framework.Assert.AreEqual("splits[1][0]", "ten  ", results[0]);
    NUnit.Framework.Assert.AreEqual("splits[1][1]", "nine ", results[1]);
}
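// Layout note (not in the original test): every word in the two inputs above is
// padded to exactly 5 bytes ("one  ", "two  ", "three", "four ", ...), so each
// 50-byte file decodes into exactly 10 fixed-length records; that is what the
// split-length and record-content assertions check.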
/// <summary>Test with no record length set.</summary>
/// <exception cref="System.IO.IOException"/>
public virtual void TestNoRecordLength()
{
    localFs.Delete(workDir, true);
    Path file = new Path(workDir, "testFormat.txt");
    CreateFile(file, null, 10, 10);
    // Intentionally do NOT set the fixed-length record length config property.
    JobConf job = new JobConf(defaultConf);
    FileInputFormat.SetInputPaths(job, workDir);
    FixedLengthInputFormat format = new FixedLengthInputFormat();
    format.Configure(job);
    InputSplit[] splits = format.GetSplits(job, 1);
    bool exceptionThrown = false;
    foreach (InputSplit split in splits)
    {
        try
        {
            RecordReader<LongWritable, BytesWritable> reader =
                format.GetRecordReader(split, job, voidReporter);
        }
        catch (IOException ioe)
        {
            exceptionThrown = true;
            Log.Info("Exception message: " + ioe.Message);
        }
    }
    NUnit.Framework.Assert.IsTrue("Exception for not setting record length:",
        exceptionThrown);
}
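// For contrast, a minimal sketch of the one-line fix that would let GetRecordReader
// succeed here, assuming CreateFile(file, null, 10, 10) writes 10-byte records
// (the record length is the only property this test leaves unset):
//
//   FixedLengthInputFormat.SetRecordLength(job, 10);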
public virtual void TestOldCounterB()
{
    JobConf conf = CreateConfiguration();
    CreateWordsFile(inFiles[3], conf);
    RemoveWordsFile(inFiles[4], conf);
    long inputSize = 0;
    inputSize += GetFileSize(inFiles[0]);
    inputSize += GetFileSize(inFiles[1]);
    inputSize += GetFileSize(inFiles[2]);
    inputSize += GetFileSize(inFiles[3]);
    conf.SetNumMapTasks(4);
    conf.SetInt(JobContext.IoSortFactor, 2);
    FileInputFormat.SetInputPaths(conf, InDir);
    FileOutputFormat.SetOutputPath(conf, new Path(OutDir, "outputO1"));
    RunningJob myJob = JobClient.RunJob(conf);
    Counters c1 = myJob.GetCounters();
    // As above, each map spills 2^14 records, so 4 maps spill 2^16 records.
    // In the reduce, there are two intermediate merges before the reduce.
    // 1st merge: read + write = 8192 * 4
    // 2nd merge: read + write = 8192 * 4
    // final merge: 0
    // Total reduce: 32768
    // Total: map + reduce = 2^16 + 2^15 = 98304
    // 4 files, 5120 (= 5 * 1024) rec/file = 20480 input records
    // 4 records/line = 81920 output records
    ValidateCounters(c1, 98304, 20480, 81920);
    ValidateFileCounters(c1, inputSize, 0, 0, 0);
}
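// Back-of-envelope check of these expectations (arithmetic only, not part of the
// original test; only records written during a merge count as reduce-side spills):
//   map spills:     4 maps * 2^14            = 65536
//   reduce spills:  2 merges * 16384 writes  = 32768
//   total spilled:  65536 + 32768            = 98304
//   input records:  4 files * 5120           = 20480
//   output records: 20480 * 4                = 81920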
/// <exception cref="System.Exception"/>
private void RunMapReduce(JobConf conf, IList<string> mapperBadRecords,
    IList<string> redBadRecords)
{
    CreateInput();
    conf.SetJobName("mr");
    conf.SetNumMapTasks(1);
    conf.SetNumReduceTasks(1);
    conf.SetInt(JobContext.TaskTimeout, 30 * 1000);
    SkipBadRecords.SetMapperMaxSkipRecords(conf, long.MaxValue);
    SkipBadRecords.SetReducerMaxSkipGroups(conf, long.MaxValue);
    SkipBadRecords.SetAttemptsToStartSkipping(conf, 0);
    // the number of attempts needed to successfully complete the task
    // depends on the number of bad records
    conf.SetMaxMapAttempts(SkipBadRecords.GetAttemptsToStartSkipping(conf) + 1 +
        mapperBadRecords.Count);
    conf.SetMaxReduceAttempts(SkipBadRecords.GetAttemptsToStartSkipping(conf) + 1 +
        redBadRecords.Count);
    FileInputFormat.SetInputPaths(conf, GetInputDir());
    FileOutputFormat.SetOutputPath(conf, GetOutputDir());
    conf.SetInputFormat(typeof(TextInputFormat));
    conf.SetMapOutputKeyClass(typeof(LongWritable));
    conf.SetMapOutputValueClass(typeof(Text));
    conf.SetOutputFormat(typeof(TextOutputFormat));
    conf.SetOutputKeyClass(typeof(LongWritable));
    conf.SetOutputValueClass(typeof(Text));
    RunningJob runningJob = JobClient.RunJob(conf);
    ValidateOutput(conf, runningJob, mapperBadRecords, redBadRecords);
}
/// <exception cref="System.IO.IOException"/>
private Path InitFiles(FileSystem fs, int numFiles, int numBytes)
{
    Path dir = new Path(Runtime.GetProperty("test.build.data", ".") + "/mapred");
    Path multiFileDir = new Path(dir, "test.multifile");
    fs.Delete(multiFileDir, true);
    fs.Mkdirs(multiFileDir);
    Log.Info("Creating " + numFiles + " file(s) in " + multiFileDir);
    for (int i = 0; i < numFiles; i++)
    {
        Path path = new Path(multiFileDir, "file_" + i);
        FSDataOutputStream @out = fs.Create(path);
        if (numBytes == -1)
        {
            numBytes = rand.Next(MaxBytes);
        }
        for (int j = 0; j < numBytes; j++)
        {
            @out.Write(rand.Next());
        }
        @out.Close();
        if (Log.IsDebugEnabled())
        {
            Log.Debug("Created file " + path + " with length " + numBytes);
        }
        lengths[path.GetName()] = System.Convert.ToInt64(numBytes);
    }
    FileInputFormat.SetInputPaths(job, multiFileDir);
    return multiFileDir;
}
/// <exception cref="System.Exception"/>
private void _testInputFiles(bool withFilter, bool withGlob)
{
    ICollection<Path> createdFiles = CreateFiles();
    JobConf conf = new JobConf();
    Path inputDir = (withGlob) ? new Path(workDir, "a*") : workDir;
    FileInputFormat.SetInputPaths(conf, inputDir);
    conf.SetInputFormat(typeof(TestFileInputFormatPathFilter.DummyFileInputFormat));
    if (withFilter)
    {
        FileInputFormat.SetInputPathFilter(conf,
            typeof(TestFileInputFormatPathFilter.TestPathFilter));
    }
    TestFileInputFormatPathFilter.DummyFileInputFormat inputFormat =
        (TestFileInputFormatPathFilter.DummyFileInputFormat)conf.GetInputFormat();
    ICollection<Path> computedFiles = new HashSet<Path>();
    foreach (FileStatus file in inputFormat.ListStatus(conf))
    {
        computedFiles.AddItem(file.GetPath());
    }
    createdFiles.Remove(localFs.MakeQualified(new Path(workDir, "_hello")));
    createdFiles.Remove(localFs.MakeQualified(new Path(workDir, ".hello")));
    if (withFilter)
    {
        createdFiles.Remove(localFs.MakeQualified(new Path(workDir, "aa")));
        createdFiles.Remove(localFs.MakeQualified(new Path(workDir, "bb")));
    }
    if (withGlob)
    {
        createdFiles.Remove(localFs.MakeQualified(new Path(workDir, "b")));
        createdFiles.Remove(localFs.MakeQualified(new Path(workDir, "bb")));
    }
    NUnit.Framework.Assert.AreEqual(createdFiles, computedFiles);
}
// configure a job
/// <exception cref="System.IO.IOException"/>
private void Configure(JobConf conf, Path inDir, Path outDir, string input,
    Type map, Type reduce)
{
    // set up the input file system and write input text
    FileSystem inFs = inDir.GetFileSystem(conf);
    FileSystem outFs = outDir.GetFileSystem(conf);
    outFs.Delete(outDir, true);
    if (!inFs.Mkdirs(inDir))
    {
        throw new IOException("Mkdirs failed to create " + inDir.ToString());
    }
    {
        // write input into input file
        DataOutputStream file = inFs.Create(new Path(inDir, "part-0"));
        file.WriteBytes(input);
        file.Close();
    }
    // configure the mapred Job which creates a tempfile in map
    conf.SetJobName("testmap");
    conf.SetMapperClass(map);
    conf.SetReducerClass(reduce);
    conf.SetNumMapTasks(1);
    conf.SetNumReduceTasks(0);
    FileInputFormat.SetInputPaths(conf, inDir);
    FileOutputFormat.SetOutputPath(conf, outDir);
    string TestRootDir = new Path(Runtime.GetProperty("test.build.data", "/tmp"))
        .ToString().Replace(' ', '+');
    conf.Set("test.build.data", TestRootDir);
}
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.Exception"/>
/// <exception cref="System.TypeLoadException"/>
/// <exception cref="Sharpen.URISyntaxException"/>
private void TestWithConf(Configuration conf)
{
    // Create a temporary file of length 1.
    Path first = CreateTempFile("distributed.first", "x");
    // Create three jars with a single file inside them.
    Path second = MakeJar(new Path(TestRootDir, "distributed.second.jar"), 2);
    Path third = MakeJar(new Path(TestRootDir, "distributed.third.jar"), 3);
    Path fourth = MakeJar(new Path(TestRootDir, "distributed.fourth.jar"), 4);
    Job job = Job.GetInstance(conf);
    job.SetMapperClass(typeof(TestMRWithDistributedCache.DistributedCacheCheckerMapper));
    job.SetReducerClass(typeof(TestMRWithDistributedCache.DistributedCacheCheckerReducer));
    job.SetOutputFormatClass(typeof(NullOutputFormat));
    FileInputFormat.SetInputPaths(job, first);
    // Create the job configuration entries for the cached files.
    job.AddCacheFile(new URI(first.ToUri().ToString() + "#distributed.first.symlink"));
    job.AddFileToClassPath(second);
    job.AddArchiveToClassPath(third);
    job.AddCacheArchive(fourth.ToUri());
    job.SetMaxMapAttempts(1);  // speed up failures
    job.Submit();
    NUnit.Framework.Assert.IsTrue(job.WaitForCompletion(false));
}
/// <summary>Test using the gzip codec for reading.</summary>
/// <exception cref="System.IO.IOException"/>
public static void TestGzip()
{
    JobConf job = new JobConf();
    CompressionCodec gzip = new GzipCodec();
    ReflectionUtils.SetConf(gzip, job);
    localFs.Delete(workDir, true);
    WriteFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
        "line-1\tthe quick\nline-2\tbrown\nline-3\tfox jumped\nline-4\tover\nline-5\t the lazy\nline-6\t dog\n");
    WriteFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
        "line-1\tthis is a test\nline-1\tof gzip\n");
    FileInputFormat.SetInputPaths(job, workDir);
    KeyValueTextInputFormat format = new KeyValueTextInputFormat();
    format.Configure(job);
    InputSplit[] splits = format.GetSplits(job, 100);
    NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);
    FileSplit tmp = (FileSplit)splits[0];
    if (tmp.GetPath().GetName().Equals("part2.txt.gz"))
    {
        splits[0] = splits[1];
        splits[1] = tmp;
    }
    IList<Text> results = ReadSplit(format, splits[0], job);
    NUnit.Framework.Assert.AreEqual("splits[0] length", 6, results.Count);
    NUnit.Framework.Assert.AreEqual("splits[0][5]", " dog", results[5].ToString());
    results = ReadSplit(format, splits[1], job);
    NUnit.Framework.Assert.AreEqual("splits[1] length", 2, results.Count);
    NUnit.Framework.Assert.AreEqual("splits[1][0]", "this is a test", results[0].ToString());
    NUnit.Framework.Assert.AreEqual("splits[1][1]", "of gzip", results[1].ToString());
}
/// <exception cref="System.Exception"/>
private string RunJob()
{
    OutputStream os = GetFileSystem().Create(new Path(GetInputDir(), "text.txt"));
    TextWriter wr = new OutputStreamWriter(os);
    wr.Write("hello1\n");
    wr.Write("hello2\n");
    wr.Write("hello3\n");
    wr.Close();
    JobConf conf = CreateJobConf();
    conf.SetJobName("mr");
    conf.SetJobPriority(JobPriority.High);
    conf.SetInputFormat(typeof(TextInputFormat));
    conf.SetMapOutputKeyClass(typeof(LongWritable));
    conf.SetMapOutputValueClass(typeof(Text));
    conf.SetOutputFormat(typeof(TextOutputFormat));
    conf.SetOutputKeyClass(typeof(LongWritable));
    conf.SetOutputValueClass(typeof(Text));
    conf.SetMapperClass(typeof(IdentityMapper));
    conf.SetReducerClass(typeof(IdentityReducer));
    FileInputFormat.SetInputPaths(conf, GetInputDir());
    FileOutputFormat.SetOutputPath(conf, GetOutputDir());
    return JobClient.RunJob(conf).GetID().ToString();
}
/// <exception cref="System.Exception"/>
public static Counters RunJob(JobConf conf)
{
    conf.SetMapperClass(typeof(TestReduceFetchFromPartialMem.MapMB));
    conf.SetReducerClass(typeof(TestReduceFetchFromPartialMem.MBValidate));
    conf.SetOutputKeyClass(typeof(Org.Apache.Hadoop.IO.Text));
    conf.SetOutputValueClass(typeof(Org.Apache.Hadoop.IO.Text));
    conf.SetNumReduceTasks(1);
    conf.SetInputFormat(typeof(TestReduceFetchFromPartialMem.FakeIF));
    conf.SetNumTasksToExecutePerJvm(1);
    conf.SetInt(JobContext.MapMaxAttempts, 0);
    conf.SetInt(JobContext.ReduceMaxAttempts, 0);
    FileInputFormat.SetInputPaths(conf, new Path("/in"));
    Path outp = new Path("/out");
    FileOutputFormat.SetOutputPath(conf, outp);
    RunningJob job = null;
    try
    {
        job = JobClient.RunJob(conf);
        NUnit.Framework.Assert.IsTrue(job.IsSuccessful());
    }
    finally
    {
        FileSystem fs = dfsCluster.GetFileSystem();
        if (fs.Exists(outp))
        {
            fs.Delete(outp, true);
        }
    }
    return job.GetCounters();
}
// Start a job with the specified input and return its RunningJob object
/// <exception cref="System.IO.IOException"/>
internal static RunningJob RunJob(JobConf conf, Path inDir, Path outDir,
    int numMaps, int numReds, string input)
{
    FileSystem fs = FileSystem.Get(conf);
    if (fs.Exists(outDir))
    {
        fs.Delete(outDir, true);
    }
    if (!fs.Exists(inDir))
    {
        fs.Mkdirs(inDir);
    }
    for (int i = 0; i < numMaps; ++i)
    {
        DataOutputStream file = fs.Create(new Path(inDir, "part-" + i));
        file.WriteBytes(input);
        file.Close();
    }
    conf.SetInputFormat(typeof(TextInputFormat));
    conf.SetOutputKeyClass(typeof(LongWritable));
    conf.SetOutputValueClass(typeof(Org.Apache.Hadoop.IO.Text));
    FileInputFormat.SetInputPaths(conf, inDir);
    FileOutputFormat.SetOutputPath(conf, outDir);
    conf.SetNumMapTasks(numMaps);
    conf.SetNumReduceTasks(numReds);
    JobClient jobClient = new JobClient(conf);
    RunningJob job = jobClient.SubmitJob(conf);
    return job;
}
private static IntWritable DeduceInputFile(JobConf job)
{
    Path[] inputPaths = FileInputFormat.GetInputPaths(job);
    Path inputFile = new Path(job.Get(JobContext.MapInputFile));
    // value == one for sort-input; value == two for sort-output
    return inputFile.GetParent().Equals(inputPaths[0]) ? sortInput : sortOutput;
}
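// Illustration (paths are hypothetical, for exposition only): with inputPaths[0] ==
// "/sort/in" and map.input.file == "/sort/in/part-0", GetParent() yields "/sort/in",
// which equals inputPaths[0], so the record is classified as sort-input; a file such
// as "/sort/out/part-0" would be classified as sort-output.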
/// <summary>The main driver for the word count map/reduce program.</summary>
/// <remarks>
/// The main driver for the word count map/reduce program.
/// Invoke this method to submit the map/reduce job.
/// </remarks>
/// <exception cref="System.IO.IOException">
/// When there are communication problems with the job tracker.
/// </exception>
/// <exception cref="System.Exception"/>
public virtual int Run(string[] args)
{
    JobConf conf = new JobConf(GetConf(), typeof(WordCount));
    conf.SetJobName("wordcount");
    // the keys are words (strings)
    conf.SetOutputKeyClass(typeof(Text));
    // the values are counts (ints)
    conf.SetOutputValueClass(typeof(IntWritable));
    conf.SetMapperClass(typeof(WordCount.MapClass));
    conf.SetCombinerClass(typeof(WordCount.Reduce));
    conf.SetReducerClass(typeof(WordCount.Reduce));
    IList<string> other_args = new AList<string>();
    for (int i = 0; i < args.Length; ++i)
    {
        try
        {
            if ("-m".Equals(args[i]))
            {
                conf.SetNumMapTasks(System.Convert.ToInt32(args[++i]));
            }
            else if ("-r".Equals(args[i]))
            {
                conf.SetNumReduceTasks(System.Convert.ToInt32(args[++i]));
            }
            else
            {
                other_args.AddItem(args[i]);
            }
        }
        catch (FormatException)
        {
            System.Console.Out.WriteLine("ERROR: Integer expected instead of " + args[i]);
            return PrintUsage();
        }
        catch (IndexOutOfRangeException)
        {
            System.Console.Out.WriteLine("ERROR: Required parameter missing from " + args[i - 1]);
            return PrintUsage();
        }
    }
    // Make sure there are exactly 2 parameters left.
    if (other_args.Count != 2)
    {
        System.Console.Out.WriteLine("ERROR: Wrong number of parameters: " +
            other_args.Count + " instead of 2.");
        return PrintUsage();
    }
    FileInputFormat.SetInputPaths(conf, other_args[0]);
    FileOutputFormat.SetOutputPath(conf, new Path(other_args[1]));
    JobClient.RunJob(conf);
    return 0;
}
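// Example invocation (illustrative only; the jar and driver names depend on how
// WordCount is packaged):
//
//   hadoop jar wordcount.jar WordCount -m 4 -r 2 /user/me/in /user/me/out
//
// "-m" and "-r" set the map and reduce task counts; the two remaining arguments
// become the input and output paths.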
/// <exception cref="System.IO.IOException"/>
internal static void CheckRecords(Configuration defaults, int noMaps, int noReduces,
    Path sortInput, Path sortOutput)
{
    JobConf jobConf = new JobConf(defaults, typeof(SortValidator.RecordChecker));
    jobConf.SetJobName("sortvalidate-record-checker");
    jobConf.SetInputFormat(typeof(SequenceFileInputFormat));
    jobConf.SetOutputFormat(typeof(SequenceFileOutputFormat));
    jobConf.SetOutputKeyClass(typeof(BytesWritable));
    jobConf.SetOutputValueClass(typeof(IntWritable));
    jobConf.SetMapperClass(typeof(SortValidator.RecordChecker.Map));
    jobConf.SetReducerClass(typeof(SortValidator.RecordChecker.Reduce));
    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.GetClusterStatus();
    if (noMaps == -1)
    {
        noMaps = cluster.GetTaskTrackers() * jobConf.GetInt(MapsPerHost, 10);
    }
    if (noReduces == -1)
    {
        noReduces = (int)(cluster.GetMaxReduceTasks() * 0.9);
        string sortReduces = jobConf.Get(ReducesPerHost);
        if (sortReduces != null)
        {
            noReduces = cluster.GetTaskTrackers() * System.Convert.ToInt32(sortReduces);
        }
    }
    jobConf.SetNumMapTasks(noMaps);
    jobConf.SetNumReduceTasks(noReduces);
    FileInputFormat.SetInputPaths(jobConf, sortInput);
    FileInputFormat.AddInputPath(jobConf, sortOutput);
    Path outputPath = new Path("/tmp/sortvalidate/recordchecker");
    FileSystem fs = FileSystem.Get(defaults);
    if (fs.Exists(outputPath))
    {
        fs.Delete(outputPath, true);
    }
    FileOutputFormat.SetOutputPath(jobConf, outputPath);
    // Uncomment to run locally in a single process
    //jobConf.Set(JTConfig.JT, "local");
    Path[] inputPaths = FileInputFormat.GetInputPaths(jobConf);
    System.Console.Out.WriteLine("\nSortValidator.RecordChecker: Running on " +
        cluster.GetTaskTrackers() + " nodes to validate sort from " + inputPaths[0] +
        ", " + inputPaths[1] + " into " + FileOutputFormat.GetOutputPath(jobConf) +
        " with " + noReduces + " reduces.");
    DateTime startTime = new DateTime();
    System.Console.Out.WriteLine("Job started: " + startTime);
    JobClient.RunJob(jobConf);
    DateTime endTime = new DateTime();
    System.Console.Out.WriteLine("Job ended: " + endTime);
    System.Console.Out.WriteLine("The job took " +
        (endTime.GetTime() - startTime.GetTime()) / 1000 + " seconds.");
}
public virtual void TestJob()
{
    Job job = CreateJob();
    FileInputFormat.SetInputPaths(job, inDir);
    FileOutputFormat.SetOutputPath(job, new Path(outDir, "testJob"));
    NUnit.Framework.Assert.IsTrue(job.WaitForCompletion(true));
    ValidateCounters(job.GetCounters(), 5, 25, 5, 5);
}
/// <exception cref="System.Exception"/>
private void CheckCompression(bool compressMapOutputs,
    SequenceFile.CompressionType redCompression, bool includeCombine)
{
    JobConf conf = new JobConf(typeof(TestMapRed));
    Path testdir = new Path(TestDir.GetAbsolutePath());
    Path inDir = new Path(testdir, "in");
    Path outDir = new Path(testdir, "out");
    FileSystem fs = FileSystem.Get(conf);
    fs.Delete(testdir, true);
    FileInputFormat.SetInputPaths(conf, inDir);
    FileOutputFormat.SetOutputPath(conf, outDir);
    conf.SetMapperClass(typeof(TestMapRed.MyMap));
    conf.SetReducerClass(typeof(TestMapRed.MyReduce));
    conf.SetOutputKeyClass(typeof(Text));
    conf.SetOutputValueClass(typeof(Text));
    conf.SetOutputFormat(typeof(SequenceFileOutputFormat));
    conf.Set(MRConfig.FrameworkName, MRConfig.LocalFrameworkName);
    if (includeCombine)
    {
        conf.SetCombinerClass(typeof(IdentityReducer));
    }
    conf.SetCompressMapOutput(compressMapOutputs);
    SequenceFileOutputFormat.SetOutputCompressionType(conf, redCompression);
    try
    {
        if (!fs.Mkdirs(testdir))
        {
            throw new IOException("Mkdirs failed to create " + testdir.ToString());
        }
        if (!fs.Mkdirs(inDir))
        {
            throw new IOException("Mkdirs failed to create " + inDir.ToString());
        }
        Path inFile = new Path(inDir, "part0");
        DataOutputStream f = fs.Create(inFile);
        f.WriteBytes("Owen was here\n");
        f.WriteBytes("Hadoop is fun\n");
        f.WriteBytes("Is this done, yet?\n");
        f.Close();
        RunningJob rj = JobClient.RunJob(conf);
        NUnit.Framework.Assert.IsTrue("job was complete", rj.IsComplete());
        NUnit.Framework.Assert.IsTrue("job was successful", rj.IsSuccessful());
        Path output = new Path(outDir, Task.GetOutputName(0));
        NUnit.Framework.Assert.IsTrue("reduce output exists " + output, fs.Exists(output));
        SequenceFile.Reader rdr = new SequenceFile.Reader(fs, output, conf);
        NUnit.Framework.Assert.AreEqual("is reduce output compressed " + output,
            redCompression != SequenceFile.CompressionType.None, rdr.IsCompressed());
        rdr.Close();
    }
    finally
    {
        fs.Delete(testdir, true);
    }
}
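// Note (not in the original test): SetCompressMapOutput controls compression of the
// intermediate map output only, while SetOutputCompressionType controls the final
// SequenceFile output; the IsCompressed() assertion at the end checks only the latter.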
/// <exception cref="System.Exception"/>
public virtual void TestFormat()
{
    JobConf job = new JobConf(conf);
    Reporter reporter = Reporter.Null;
    Random random = new Random();
    long seed = random.NextLong();
    Log.Info("seed = " + seed);
    random.SetSeed(seed);
    localFs.Delete(workDir, true);
    FileInputFormat.SetInputPaths(job, workDir);
    int length = 10000;
    int numFiles = 10;
    // create files of various lengths
    CreateFiles(length, numFiles, random);
    // create a combine split for the files
    InputFormat<IntWritable, BytesWritable> format =
        new CombineSequenceFileInputFormat<IntWritable, BytesWritable>();
    IntWritable key = new IntWritable();
    BytesWritable value = new BytesWritable();
    for (int i = 0; i < 3; i++)
    {
        int numSplits = random.Next(length / (SequenceFile.SyncInterval / 20)) + 1;
        Log.Info("splitting: requesting = " + numSplits);
        InputSplit[] splits = format.GetSplits(job, numSplits);
        Log.Info("splitting: got = " + splits.Length);
        // we should have a single split, as the length is comfortably smaller
        // than the block size
        NUnit.Framework.Assert.AreEqual("We got more than one split!", 1, splits.Length);
        InputSplit split = splits[0];
        NUnit.Framework.Assert.AreEqual("It should be CombineFileSplit",
            typeof(CombineFileSplit), split.GetType());
        // check each split
        BitSet bits = new BitSet(length);
        RecordReader<IntWritable, BytesWritable> reader =
            format.GetRecordReader(split, job, reporter);
        try
        {
            while (reader.Next(key, value))
            {
                NUnit.Framework.Assert.IsFalse("Key in multiple partitions.", bits.Get(key.Get()));
                bits.Set(key.Get());
            }
        }
        finally
        {
            reader.Close();
        }
        NUnit.Framework.Assert.AreEqual("Some keys in no partition.", length, bits.Cardinality());
    }
}
/// <summary>Creates and runs an MR job.</summary>
/// <param name="conf"/>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.Exception"/>
/// <exception cref="System.TypeLoadException"/>
public virtual void CreateAndRunJob(Configuration conf)
{
    JobConf job = new JobConf(conf);
    job.SetJarByClass(typeof(TestLineRecordReaderJobs));
    job.SetMapperClass(typeof(IdentityMapper));
    job.SetReducerClass(typeof(IdentityReducer));
    FileInputFormat.AddInputPath(job, inputDir);
    FileOutputFormat.SetOutputPath(job, outputDir);
    JobClient.RunJob(job);
}
/// <exception cref="System.IO.IOException"/>
internal static string LaunchExternal(URI uri, JobConf conf, string input,
    int numMaps, int numReduces)
{
    Path inDir = new Path("/testing/ext/input");
    Path outDir = new Path("/testing/ext/output");
    FileSystem fs = FileSystem.Get(uri, conf);
    fs.Delete(outDir, true);
    if (!fs.Mkdirs(inDir))
    {
        throw new IOException("Mkdirs failed to create " + inDir.ToString());
    }
    {
        DataOutputStream file = fs.Create(new Path(inDir, "part-0"));
        file.WriteBytes(input);
        file.Close();
    }
    FileSystem.SetDefaultUri(conf, uri);
    conf.Set(JTConfig.FrameworkName, JTConfig.YarnFrameworkName);
    conf.SetJobName("wordcount");
    conf.SetInputFormat(typeof(TextInputFormat));
    // the values are counts (ints)
    conf.SetOutputValueClass(typeof(IntWritable));
    // the keys are the messages
    conf.Set(JobContext.OutputKeyClass, "testjar.ExternalWritable");
    FileInputFormat.SetInputPaths(conf, inDir);
    FileOutputFormat.SetOutputPath(conf, outDir);
    conf.SetNumMapTasks(numMaps);
    conf.SetNumReduceTasks(numReduces);
    conf.Set("mapred.mapper.class", "testjar.ExternalMapperReducer");
    conf.Set("mapred.reducer.class", "testjar.ExternalMapperReducer");
    // set the tests jar file
    conf.SetJarByClass(typeof(TestMiniMRClasspath));
    JobClient.RunJob(conf);
    StringBuilder result = new StringBuilder();
    Path[] fileList = FileUtil.Stat2Paths(fs.ListStatus(outDir,
        new Utils.OutputFileUtils.OutputFilesFilter()));
    for (int i = 0; i < fileList.Length; ++i)
    {
        BufferedReader file = new BufferedReader(new InputStreamReader(fs.Open(fileList[i])));
        string line = file.ReadLine();
        while (line != null)
        {
            result.Append(line);
            line = file.ReadLine();
            result.Append("\n");
        }
        file.Close();
    }
    return result.ToString();
}
/// <exception cref="System.Exception"/>
public static void Launch()
{
    JobConf conf = new JobConf(typeof(Org.Apache.Hadoop.Mapred.TestFieldSelection));
    FileSystem fs = FileSystem.Get(conf);
    int numOfInputLines = 10;
    Path OutputDir = new Path("build/test/output_for_field_selection_test");
    Path InputDir = new Path("build/test/input_for_field_selection_test");
    string inputFile = "input.txt";
    fs.Delete(InputDir, true);
    fs.Mkdirs(InputDir);
    fs.Delete(OutputDir, true);
    StringBuilder inputData = new StringBuilder();
    StringBuilder expectedOutput = new StringBuilder();
    TestMRFieldSelection.ConstructInputOutputData(inputData, expectedOutput, numOfInputLines);
    FSDataOutputStream fileOut = fs.Create(new Path(InputDir, inputFile));
    fileOut.Write(Sharpen.Runtime.GetBytesForString(inputData.ToString(), "utf-8"));
    fileOut.Close();
    System.Console.Out.WriteLine("inputData:");
    System.Console.Out.WriteLine(inputData.ToString());
    JobConf job = new JobConf(conf, typeof(Org.Apache.Hadoop.Mapred.TestFieldSelection));
    FileInputFormat.SetInputPaths(job, InputDir);
    job.SetInputFormat(typeof(TextInputFormat));
    job.SetMapperClass(typeof(FieldSelectionMapReduce));
    job.SetReducerClass(typeof(FieldSelectionMapReduce));
    FileOutputFormat.SetOutputPath(job, OutputDir);
    job.SetOutputKeyClass(typeof(Org.Apache.Hadoop.IO.Text));
    job.SetOutputValueClass(typeof(Org.Apache.Hadoop.IO.Text));
    job.SetOutputFormat(typeof(TextOutputFormat));
    job.SetNumReduceTasks(1);
    job.Set(FieldSelectionHelper.DataFieldSeperator, "-");
    job.Set(FieldSelectionHelper.MapOutputKeyValueSpec, "6,5,1-3:0-");
    job.Set(FieldSelectionHelper.ReduceOutputKeyValueSpec, ":4,3,2,1,0,0-");
    JobClient.RunJob(job);
    //
    // Finally, compare the job's actual output with the expected
    // output constructed above.
    //
    bool success = true;
    Path outPath = new Path(OutputDir, "part-00000");
    string outdata = MapReduceTestUtil.ReadOutput(outPath, job);
    NUnit.Framework.Assert.AreEqual(expectedOutput.ToString(), outdata);
    fs.Delete(OutputDir, true);
    fs.Delete(InputDir, true);
}
public virtual void TestNullKeys()
{
    JobConf conf = new JobConf(typeof(TestMapRed));
    FileSystem fs = FileSystem.GetLocal(conf);
    HashSet<string> values = new HashSet<string>();
    string m = "AAAAAAAAAAAAAA";
    for (int i = 1; i < 11; ++i)
    {
        values.AddItem(m);
        m = m.Replace((char)('A' + i - 1), (char)('A' + i));
    }
    Path testdir = new Path(Runtime.GetProperty("test.build.data", "/tmp")).MakeQualified(fs);
    fs.Delete(testdir, true);
    Path inFile = new Path(testdir, "nullin/blah");
    SequenceFile.Writer w = SequenceFile.CreateWriter(fs, conf, inFile,
        typeof(NullWritable), typeof(Text), SequenceFile.CompressionType.None);
    Text t = new Text();
    foreach (string s in values)
    {
        t.Set(s);
        w.Append(NullWritable.Get(), t);
    }
    w.Close();
    FileInputFormat.SetInputPaths(conf, inFile);
    FileOutputFormat.SetOutputPath(conf, new Path(testdir, "nullout"));
    conf.SetMapperClass(typeof(TestMapRed.NullMapper));
    conf.SetReducerClass(typeof(IdentityReducer));
    conf.SetOutputKeyClass(typeof(NullWritable));
    conf.SetOutputValueClass(typeof(Text));
    conf.SetInputFormat(typeof(SequenceFileInputFormat));
    conf.SetOutputFormat(typeof(SequenceFileOutputFormat));
    conf.SetNumReduceTasks(1);
    conf.Set(MRConfig.FrameworkName, MRConfig.LocalFrameworkName);
    JobClient.RunJob(conf);
    // Since all null keys compare equal, allow any ordering
    SequenceFile.Reader r = new SequenceFile.Reader(fs,
        new Path(testdir, "nullout/part-00000"), conf);
    m = "AAAAAAAAAAAAAA";
    for (int i = 1; r.Next(NullWritable.Get(), t); ++i)
    {
        NUnit.Framework.Assert.IsTrue("Unexpected value: " + t, values.Remove(t.ToString()));
        m = m.Replace((char)('A' + i - 1), (char)('A' + i));
    }
    NUnit.Framework.Assert.IsTrue("Missing values: " + values.ToString(), values.IsEmpty());
}
/// <exception cref="System.Exception"/>
private void RunMergeTest(JobConf job, FileSystem fileSystem, int numMappers,
    int numReducers, int numLines, bool isUber)
{
    fileSystem.Delete(Output, true);
    job.SetJobName("Test");
    JobClient client = new JobClient(job);
    RunningJob submittedJob = null;
    FileInputFormat.SetInputPaths(job, InputDir);
    FileOutputFormat.SetOutputPath(job, Output);
    job.Set("mapreduce.output.textoutputformat.separator", " ");
    job.SetInputFormat(typeof(TextInputFormat));
    job.SetMapOutputKeyClass(typeof(Text));
    job.SetMapOutputValueClass(typeof(Text));
    job.SetOutputKeyClass(typeof(Text));
    job.SetOutputValueClass(typeof(Text));
    job.SetMapperClass(typeof(TestMRIntermediateDataEncryption.MyMapper));
    job.SetPartitionerClass(typeof(TestMRIntermediateDataEncryption.MyPartitioner));
    job.SetOutputFormat(typeof(TextOutputFormat));
    job.SetNumReduceTasks(numReducers);
    job.SetInt("mapreduce.map.maxattempts", 1);
    job.SetInt("mapreduce.reduce.maxattempts", 1);
    job.SetInt("mapred.test.num_lines", numLines);
    if (isUber)
    {
        job.SetBoolean("mapreduce.job.ubertask.enable", true);
    }
    job.SetBoolean(MRJobConfig.MrEncryptedIntermediateData, true);
    try
    {
        submittedJob = client.SubmitJob(job);
        try
        {
            if (!client.MonitorAndPrintJob(job, submittedJob))
            {
                throw new IOException("Job failed!");
            }
        }
        catch (Exception)
        {
            Sharpen.Thread.CurrentThread().Interrupt();
        }
    }
    catch (IOException ioe)
    {
        System.Console.Error.WriteLine("Job failed with: " + ioe);
    }
    finally
    {
        VerifyOutput(submittedJob, fileSystem, numMappers, numLines);
    }
}
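// Note (not in the original test): MRJobConfig.MrEncryptedIntermediateData enables
// encryption of the intermediate (spill/shuffle) data that this test exercises, and
// the single-attempt settings make any failure surface immediately rather than being
// masked by task retries.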