/// <summary>
/// Runs a complete setup / write / commit cycle with a MapFile output format under the
/// given committer algorithm version, validates the committed output and finally
/// deletes the output directory.
/// </summary>
/// <param name="version">Value stored under the committer algorithm version key.</param>
/// <exception cref="System.Exception"/>
private void TestMapFileOutputCommitterInternal(int version)
{
    Job mapFileJob = Job.GetInstance();
    FileOutputFormat.SetOutputPath(mapFileJob, outDir);

    Configuration conf = mapFileJob.GetConfiguration();
    conf.Set(MRJobConfig.TaskAttemptId, attempt);
    conf.SetInt(FileOutputCommitter.FileoutputcommitterAlgorithmVersion, version);

    JobContext jobCtx = new JobContextImpl(conf, taskID.GetJobID());
    TaskAttemptContext taskCtx = new TaskAttemptContextImpl(conf, taskID);
    FileOutputCommitter committer = new FileOutputCommitter(outDir, taskCtx);

    // Prepare job-level and task-level state.
    committer.SetupJob(jobCtx);
    committer.SetupTask(taskCtx);

    // Produce the output through a record writer obtained from the MapFile format.
    MapFileOutputFormat mapFileFormat = new MapFileOutputFormat();
    RecordWriter writer = mapFileFormat.GetRecordWriter(taskCtx);
    WriteMapFileOutput(writer, taskCtx);

    // Commit the task first, then the job.
    committer.CommitTask(taskCtx);
    committer.CommitJob(jobCtx);

    // Check what was committed, then remove the output directory.
    FileSystem fs = FileSystem.Get(mapFileJob.GetConfiguration());
    ValidateMapFileOutputContent(fs, outDir);
    FileUtil.FullyDelete(new FilePath(outDir.ToString()));
}
/// <summary>From each split sampled, take the first numSamples / numSplits records.</summary>
/// <param name="inf">Input format used to compute the splits and to create a record reader per split.</param>
/// <param name="job">Job whose configuration is used for the sampling contexts and for copying keys.</param>
/// <returns>
/// The sampled keys. The array is empty when there are no splits to sample
/// (no splits returned by the input format, or a non-positive maxSplitsSampled).
/// </returns>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.Exception"/>
public virtual K[] GetSample(InputFormat<K, V> inf, Job job)
{
    IList<InputSplit> splits = inf.GetSplits(job);
    AList<K> samples = new AList<K>(numSamples);
    int splitsToSample = Math.Min(maxSplitsSampled, splits.Count);
    if (splitsToSample <= 0)
    {
        // Nothing to sample. Returning early avoids the division by zero below
        // when the input format yields no splits.
        return (K[])Sharpen.Collections.ToArray(samples);
    }
    int samplesPerSplit = numSamples / splitsToSample;
    long records = 0;
    for (int i = 0; i < splitsToSample; ++i)
    {
        TaskAttemptContext samplingContext = new TaskAttemptContextImpl(job.GetConfiguration(), new TaskAttemptID());
        RecordReader<K, V> reader = inf.CreateRecordReader(splits[i], samplingContext);
        try
        {
            reader.Initialize(splits[i], samplingContext);
            while (reader.NextKeyValue())
            {
                // Copy the key so the sample does not alias the reader's current key object.
                samples.AddItem(ReflectionUtils.Copy(job.GetConfiguration(), reader.GetCurrentKey(), null));
                ++records;
                // 'records' is cumulative across splits, so split i stops once
                // (i + 1) * samplesPerSplit keys have been collected in total.
                if ((i + 1) * samplesPerSplit <= records)
                {
                    break;
                }
            }
        }
        finally
        {
            // Release the reader even when initialization or reading fails.
            reader.Close();
        }
    }
    // The list-to-array conversion does not preserve the element type, hence the cast.
    return (K[])Sharpen.Collections.ToArray(samples);
}
/// <summary>
/// Builds the record reader registered for this node's identifier, wrapping the reader
/// produced by the underlying input format for the given split.
/// </summary>
/// <param name="split">Split handed to the underlying input format.</param>
/// <param name="taskContext">Caller's context; its configuration and status reporting are wrapped.</param>
/// <returns>The reader instance created through the constructor registered for the identifier.</returns>
/// <exception cref="System.IO.IOException">
/// Thrown when no reader is registered for the identifier, or when instantiating
/// the registered reader fails.
/// </exception>
/// <exception cref="System.Exception"/>
public override RecordReader CreateRecordReader(InputSplit split, TaskAttemptContext taskContext)
{
    // Fail fast when nothing is registered for this identifier.
    if (!rrCstrMap.Contains(ident))
    {
        throw new IOException("No RecordReader for " + ident);
    }
    try
    {
        Configuration wrappedConf = GetConf(taskContext.GetConfiguration());
        TaskAttemptID attemptId = TaskAttemptID.ForName(wrappedConf.Get(MRJobConfig.TaskAttemptId));
        TaskAttemptContext wrappedContext = new TaskAttemptContextImpl(
            wrappedConf, attemptId, new Parser.WrappedStatusReporter(taskContext));
        return rrCstrMap[ident].NewInstance(id, inf.CreateRecordReader(split, wrappedContext), cmpcl);
    }
    catch (MemberAccessException accessFailure)
    {
        throw new IOException(accessFailure);
    }
    catch (InstantiationException instantiationFailure)
    {
        throw new IOException(instantiationFailure);
    }
    catch (TargetInvocationException invocationFailure)
    {
        throw new IOException(invocationFailure);
    }
}
/// <summary>
/// Reads keys from the split at index sampleStep * idx and feeds each one to the sampler,
/// stopping once recordsPerSample records have been read or the split is exhausted.
/// </summary>
/// <remarks>
/// An IOException is reported on standard error and rethrown wrapped in a RuntimeException.
/// Any other exception ends the sampling of this split; it is reported on standard error
/// instead of being dropped silently.
/// </remarks>
public override void Run()
{
    long records = 0;
    try
    {
        TaskAttemptContext context = new TaskAttemptContextImpl(job.GetConfiguration(), new TaskAttemptID());
        RecordReader<Text, Text> reader = inFormat.CreateRecordReader(splits[sampleStep * idx], context);
        try
        {
            reader.Initialize(splits[sampleStep * idx], context);
            while (reader.NextKeyValue())
            {
                // Copy the key: the sampler keeps it beyond this iteration.
                sampler.AddKey(new Text(reader.GetCurrentKey()));
                records += 1;
                if (recordsPerSample <= records)
                {
                    break;
                }
            }
        }
        finally
        {
            // Always release the reader, including on early exit or failure.
            reader.Close();
        }
    }
    catch (IOException ie)
    {
        System.Console.Error.WriteLine("Got an exception while reading splits " + StringUtils.StringifyException(ie));
        throw new RuntimeException(ie);
    }
    catch (Exception e)
    {
        // Still best-effort (no rethrow), but leave a trace of what stopped the sampling.
        System.Console.Error.WriteLine("Sampling stopped early by an unexpected exception: " + e);
    }
}
/// <summary>
/// Uses LineRecordReader to read every record of a file, walking over it in
/// consecutive splits of a fixed size.
/// </summary>
/// <param name="testFileUrl">Location of the file to read.</param>
/// <param name="splitSize">Length of each split; the offset advances by this amount per iteration.</param>
/// <returns>The string form of every record value, in the order the readers returned them.</returns>
/// <exception cref="System.IO.IOException"/>
public virtual AList<string> ReadRecords(Uri testFileUrl, int splitSize)
{
    // Set up context
    FilePath testFile = new FilePath(testFileUrl.GetFile());
    long testFileSize = testFile.Length();
    Path testFilePath = new Path(testFile.GetAbsolutePath());
    Configuration conf = new Configuration();
    conf.SetInt("io.file.buffer.size", 1);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());

    // Gather the records returned by the record reader
    AList<string> records = new AList<string>();
    long offset = 0;
    while (offset < testFileSize)
    {
        FileSplit split = new FileSplit(testFilePath, offset, splitSize, null);
        LineRecordReader reader = new LineRecordReader();
        try
        {
            reader.Initialize(split, context);
            while (reader.NextKeyValue())
            {
                records.AddItem(reader.GetCurrentValue().ToString());
            }
        }
        finally
        {
            // One reader is created per split; close each one so none is left open.
            reader.Close();
        }
        offset += splitSize;
    }
    return records;
}
/// <summary>
/// Randomize the split order, then take the specified number of keys from
/// each split sampled, where each key is selected with the specified
/// probability and possibly replaced by a subsequently selected key when
/// the quota of keys from that split is satisfied.
/// </summary>
/// <param name="inf">Input format used to compute the splits and to create a record reader per split.</param>
/// <param name="job">Job whose configuration is used for the sampling contexts and for copying keys.</param>
/// <returns>The sampled keys (at most numSamples of them).</returns>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.Exception"/>
public virtual K[] GetSample(InputFormat<K, V> inf, Job job)
{
    IList<InputSplit> splits = inf.GetSplits(job);
    AList<K> samples = new AList<K>(numSamples);
    int splitsToSample = Math.Min(maxSplitsSampled, splits.Count);

    Random r = new Random();
    long seed = r.NextLong();
    r.SetSeed(seed);
    Log.Debug("seed: " + seed);

    // shuffle splits
    for (int i = 0; i < splits.Count; ++i)
    {
        InputSplit tmp = splits[i];
        int j = r.Next(splits.Count);
        splits.Set(i, splits[j]);
        splits.Set(j, tmp);
    }

    // our target rate is in terms of the maximum number of sample splits,
    // but we accept the possibility of sampling additional splits to hit
    // the target sample keyset
    for (int i = 0; i < splitsToSample || (i < splits.Count && samples.Count < numSamples); ++i)
    {
        TaskAttemptContext samplingContext = new TaskAttemptContextImpl(job.GetConfiguration(), new TaskAttemptID());
        RecordReader<K, V> reader = inf.CreateRecordReader(splits[i], samplingContext);
        try
        {
            reader.Initialize(splits[i], samplingContext);
            while (reader.NextKeyValue())
            {
                if (r.NextDouble() <= freq)
                {
                    if (samples.Count < numSamples)
                    {
                        samples.AddItem(ReflectionUtils.Copy(job.GetConfiguration(), reader.GetCurrentKey(), null));
                    }
                    else
                    {
                        // When exceeding the maximum number of samples, replace a
                        // random element with this one, then adjust the frequency
                        // to reflect the possibility of existing elements being
                        // pushed out.
                        // r.Next(numSamples) is always below numSamples, so the former
                        // "ind != numSamples" guard was always true and has been removed.
                        int ind = r.Next(numSamples);
                        samples.Set(ind, ReflectionUtils.Copy(job.GetConfiguration(), reader.GetCurrentKey(), null));
                        // NOTE(review): freq is not a local, so this decay persists after the
                        // call returns - confirm that repeated calls are meant to see it.
                        freq *= (numSamples - 1) / (double)numSamples;
                    }
                }
            }
        }
        finally
        {
            // Release the reader even when initialization or reading fails.
            reader.Close();
        }
    }
    // The list-to-array conversion does not preserve the element type, hence the cast.
    return (K[])Sharpen.Collections.ToArray(samples);
}
/// <summary>
/// Checks that reading a file as two adjacent splits yields the same number of records
/// as reading it as one split covering the whole file.
/// </summary>
/// <param name="conf">Configuration to use; its maximum line length is set to int.MaxValue here.</param>
/// <param name="firstSplitLength">Length of the first split; must be smaller than the file size.</param>
/// <param name="testFileSize">Total size of the file.</param>
/// <param name="testFilePath">Path of the file to read.</param>
/// <exception cref="System.IO.IOException"/>
private void TestSplitRecordsForFile(Configuration conf, long firstSplitLength, long testFileSize, Path testFilePath)
{
    conf.SetInt(LineRecordReader.MaxLineLength, int.MaxValue);
    NUnit.Framework.Assert.IsTrue("unexpected test data at " + testFilePath, testFileSize > firstSplitLength);

    // Use the configured custom record delimiter, if there is one.
    string configuredDelimiter = conf.Get("textinputformat.record.delimiter");
    byte[] delimiterBytes = null;
    if (configuredDelimiter != null)
    {
        delimiterBytes = Sharpen.Runtime.GetBytesForString(configuredDelimiter, Charsets.Utf8);
    }
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());

    // Whole file, first split, then the remainder.
    int wholeFileCount = CountRecordsInSplit(testFilePath, 0, testFileSize, delimiterBytes, context);
    int firstSplitCount = CountRecordsInSplit(testFilePath, 0, firstSplitLength, delimiterBytes, context);
    int remainderCount = CountRecordsInSplit(
        testFilePath, firstSplitLength, testFileSize - firstSplitLength, delimiterBytes, context);

    NUnit.Framework.Assert.AreEqual("Unexpected number of records in split ", wholeFileCount,
        firstSplitCount + remainderCount);
}

/// <summary>Counts the records a LineRecordReader returns for one split of a file.</summary>
/// <exception cref="System.IO.IOException"/>
private static int CountRecordsInSplit(Path file, long start, long length, byte[] delimiterBytes,
    TaskAttemptContext context)
{
    FileSplit split = new FileSplit(file, start, length, (string[])null);
    LineRecordReader reader = new LineRecordReader(delimiterBytes);
    reader.Initialize(split, context);
    int count = 0;
    while (reader.NextKeyValue())
    {
        ++count;
    }
    reader.Close();
    return count;
}
/// <summary>
/// Submits two task attempts to a two-thread pool, commits the job once the pool has
/// terminated, and verifies that no nested sub_dir/sub_dir directory exists before
/// validating the output.
/// </summary>
/// <param name="version">Value stored under the committer algorithm version key.</param>
/// <exception cref="System.Exception"/>
private void TestConcurrentCommitTaskWithSubDir(int version)
{
    Job job = Job.GetInstance();
    FileOutputFormat.SetOutputPath(job, outDir);

    Configuration conf = job.GetConfiguration();
    conf.Set(MRJobConfig.TaskAttemptId, attempt);
    conf.SetInt(FileOutputCommitter.FileoutputcommitterAlgorithmVersion, version);
    // Route the "file" scheme to the test file system, then close all file systems.
    conf.SetClass("fs.file.impl", typeof(TestFileOutputCommitter.RLFS), typeof(FileSystem));
    FileSystem.CloseAll();

    JobContext jobCtx = new JobContextImpl(conf, taskID.GetJobID());
    FileOutputCommitter jobCommitter = new FileOutputCommitter(outDir, jobCtx);
    jobCommitter.SetupJob(jobCtx);

    // One context per task attempt.
    TaskAttemptContext[] attemptContexts = new TaskAttemptContextImpl[2];
    attemptContexts[0] = new TaskAttemptContextImpl(conf, taskID);
    attemptContexts[1] = new TaskAttemptContextImpl(conf, taskID1);

    TextOutputFormat[] formats = new TextOutputFormat[2];
    for (int f = 0; f < formats.Length; f++)
    {
        formats[f] = new _TextOutputFormat_508(this);
    }

    ExecutorService pool = Executors.NewFixedThreadPool(2);
    try
    {
        for (int t = 0; t < attemptContexts.Length; t++)
        {
            pool.Submit(new _Callable_524(this, formats, t, attemptContexts));
        }
    }
    finally
    {
        // Stop accepting work and wait until every submitted task has finished.
        pool.Shutdown();
        while (!pool.AwaitTermination(1, TimeUnit.Seconds))
        {
            Log.Info("Awaiting thread termination!");
        }
    }

    jobCommitter.CommitJob(jobCtx);

    RawLocalFileSystem localFs = new RawLocalFileSystem();
    localFs.SetConf(conf);
    NUnit.Framework.Assert.IsFalse("Must not end up with sub_dir/sub_dir",
        localFs.Exists(new Path(OutSubDir, SubDir)));

    // Validate the output, then remove the output directory.
    ValidateContent(OutSubDir);
    FileUtil.FullyDelete(new FilePath(outDir.ToString()));
}
/// <summary>
/// Returns the task attempt context for a named output, creating and caching it on
/// first use. The created context carries the output format and the output key/value
/// classes configured for that named output.
/// </summary>
/// <param name="nameOutput">Name of the output whose context is requested.</param>
/// <exception cref="System.IO.IOException"/>
private TaskAttemptContext GetContext(string nameOutput)
{
    TaskAttemptContext namedContext = taskContexts[nameOutput];
    if (namedContext == null)
    {
        // Going through a Job instance lets the record writer be instantiated via the job,
        // which is what supports arbitrary output formats.
        Job namedJob = Job.GetInstance(context.GetConfiguration());
        namedJob.SetOutputFormatClass(GetNamedOutputFormatClass(context, nameOutput));
        namedJob.SetOutputKeyClass(GetNamedOutputKeyClass(context, nameOutput));
        namedJob.SetOutputValueClass(GetNamedOutputValueClass(context, nameOutput));

        namedContext = new TaskAttemptContextImpl(
            namedJob.GetConfiguration(),
            context.GetTaskAttemptID(),
            new MultipleOutputs.WrappedStatusReporter(context));
        taskContexts[nameOutput] = namedContext;
    }
    return namedContext;
}
/// <summary>
/// Verifies that constructing a FileOutputCommitter with algorithm version 3
/// raises an IOException.
/// </summary>
/// <exception cref="System.IO.IOException"/>
public virtual void TestInvalidVersionNumber()
{
    Job job = Job.GetInstance();
    FileOutputFormat.SetOutputPath(job, outDir);

    Configuration conf = job.GetConfiguration();
    conf.Set(MRJobConfig.TaskAttemptId, attempt);
    conf.SetInt(FileOutputCommitter.FileoutputcommitterAlgorithmVersion, 3);

    TaskAttemptContext taskCtx = new TaskAttemptContextImpl(conf, taskID);
    try
    {
        new FileOutputCommitter(outDir, taskCtx);
        Fail("should've thrown an exception!");
    }
    catch (IOException)
    {
        // Expected: the constructor rejected the version number.
    }
}
/// <summary>
/// Runs setup and commit for a task and its job without writing any output in between,
/// then deletes the output directory.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestEmptyOutput()
{
    Job job = Job.GetInstance();
    FileOutputFormat.SetOutputPath(job, outDir);

    Configuration conf = job.GetConfiguration();
    conf.Set(MRJobConfig.TaskAttemptId, attempt);

    JobContext jobCtx = new JobContextImpl(conf, taskID.GetJobID());
    TaskAttemptContext taskCtx = new TaskAttemptContextImpl(conf, taskID);
    FileOutputCommitter committer = new FileOutputCommitter(outDir, taskCtx);

    // Set up job and task; no output is written on purpose.
    committer.SetupJob(jobCtx);
    committer.SetupTask(taskCtx);

    // Commit the (empty) task and the job.
    committer.CommitTask(taskCtx);
    committer.CommitJob(jobCtx);

    FileUtil.FullyDelete(new FilePath(outDir.ToString()));
}
/// <summary>
/// Reads testBOM.txt, whose data starts with a byte order mark, and confirms that the
/// first record returned by LineRecordReader does not start with that mark.
/// </summary>
public virtual void TestStripBOM()
{
    // the test data contains a BOM at the start of the file
    // confirm the BOM is skipped by LineRecordReader
    string Utf8Bom = "\uFEFF";
    Uri testFileUrl = GetType().GetClassLoader().GetResource("testBOM.txt");
    NUnit.Framework.Assert.IsNotNull("Cannot find testBOM.txt", testFileUrl);
    FilePath testFile = new FilePath(testFileUrl.GetFile());
    Path testFilePath = new Path(testFile.GetAbsolutePath());
    long testFileSize = testFile.Length();
    Configuration conf = new Configuration();
    conf.SetInt(LineRecordReader.MaxLineLength, int.MaxValue);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());

    // read the data and check whether BOM is skipped
    FileSplit split = new FileSplit(testFilePath, 0, testFileSize, (string[])null);
    LineRecordReader reader = new LineRecordReader();
    reader.Initialize(split, context);
    bool firstLine = true;
    bool skipBOM = true;
    while (reader.NextKeyValue())
    {
        if (firstLine)
        {
            firstLine = false;
            // Compare ordinally: a culture-sensitive StartsWith treats U+FEFF as an
            // ignorable character and can report a match for any string, which would
            // make this check fail even when the BOM was stripped.
            if (reader.GetCurrentValue().ToString().StartsWith(Utf8Bom, System.StringComparison.Ordinal))
            {
                skipBOM = false;
            }
        }
    }
    reader.Close();
    NUnit.Framework.Assert.IsTrue("BOM is not skipped", skipBOM);
}
/// <summary>
/// Runs a complete setup / write / commit cycle with a text output format and compares
/// the committed part file with the expected text.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestCommitter()
{
    Job job = Job.GetInstance();
    FileOutputFormat.SetOutputPath(job, outDir);
    Configuration conf = job.GetConfiguration();
    conf.Set(MRJobConfig.TaskAttemptId, attempt);
    JobContext jContext = new JobContextImpl(conf, taskID.GetJobID());
    TaskAttemptContext tContext = new TaskAttemptContextImpl(conf, taskID);
    FileOutputCommitter committer = new FileOutputCommitter(outDir, tContext);

    // setup
    committer.SetupJob(jContext);
    committer.SetupTask(tContext);

    // write output
    TextOutputFormat theOutputFormat = new TextOutputFormat();
    RecordWriter theRecordWriter = theOutputFormat.GetRecordWriter(tContext);
    WriteOutput(theRecordWriter, tContext);

    // do commit
    committer.CommitTask(tContext);
    committer.CommitJob(jContext);

    // validate output
    FilePath expectedFile = new FilePath(new Path(outDir, partFile).ToString());
    StringBuilder expectedOutput = new StringBuilder();
    expectedOutput.Append(key1).Append('\t').Append(val1).Append("\n");
    expectedOutput.Append(val1).Append("\n");
    expectedOutput.Append(val2).Append("\n");
    expectedOutput.Append(key2).Append("\n");
    expectedOutput.Append(key1).Append("\n");
    expectedOutput.Append(key2).Append('\t').Append(val2).Append("\n");
    string output = UtilsForTests.Slurp(expectedFile);
    // Expected value goes first so a failure message labels the two values correctly.
    NUnit.Framework.Assert.AreEqual(expectedOutput.ToString(), output);
    FileUtil.FullyDelete(new FilePath(outDir.ToString()));
}
/// <summary>
/// Reads a bzip2-compressed resource to the end, closes the reader twice, and then checks
/// that ten decompressors obtained from the codec pool are ten distinct objects.
/// </summary>
public virtual void TestMultipleClose()
{
    Uri resourceUrl = GetType().GetClassLoader().GetResource("recordSpanningMultipleSplits.txt.bz2");
    NUnit.Framework.Assert.IsNotNull("Cannot find recordSpanningMultipleSplits.txt.bz2", resourceUrl);

    FilePath resourceFile = new FilePath(resourceUrl.GetFile());
    Path resourcePath = new Path(resourceFile.GetAbsolutePath());
    long resourceSize = resourceFile.Length();

    Configuration conf = new Configuration();
    conf.SetInt(LineRecordReader.MaxLineLength, int.MaxValue);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());

    // Drain a reader that covers the whole file.
    FileSplit wholeFile = new FileSplit(resourcePath, 0, resourceSize, null);
    LineRecordReader reader = new LineRecordReader();
    reader.Initialize(wholeFile, context);
    while (reader.NextKeyValue())
    {
        // Intentionally empty: the records themselves are not inspected.
    }

    // Close twice on purpose; the second close is the scenario under test.
    reader.Close();
    reader.Close();

    BZip2Codec codec = new BZip2Codec();
    codec.SetConf(conf);

    // Collect into a set: any decompressor handed out more than once collapses the count.
    ICollection<Decompressor> distinctDecompressors = new HashSet<Decompressor>();
    for (int n = 0; n < 10; ++n)
    {
        distinctDecompressors.AddItem(CodecPool.GetDecompressor(codec));
    }
    NUnit.Framework.Assert.AreEqual(10, distinctDecompressors.Count);
}
/// <summary>
/// Writes task output, aborts the task and then the job, and checks after each abort that
/// the corresponding temporary location is gone and that the output directory ends up empty.
/// </summary>
/// <param name="version">Value stored under the committer algorithm version key.</param>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.Exception"/>
private void TestAbortInternal(int version)
{
    Job job = Job.GetInstance();
    FileOutputFormat.SetOutputPath(job, outDir);

    Configuration conf = job.GetConfiguration();
    conf.Set(MRJobConfig.TaskAttemptId, attempt);
    conf.SetInt(FileOutputCommitter.FileoutputcommitterAlgorithmVersion, version);

    JobContext jobCtx = new JobContextImpl(conf, taskID.GetJobID());
    TaskAttemptContext taskCtx = new TaskAttemptContextImpl(conf, taskID);
    FileOutputCommitter committer = new FileOutputCommitter(outDir, taskCtx);

    // Set up job and task.
    committer.SetupJob(jobCtx);
    committer.SetupTask(taskCtx);

    // Write the task output.
    TextOutputFormat textFormat = new TextOutputFormat();
    RecordWriter writer = textFormat.GetRecordWriter(taskCtx);
    WriteOutput(writer, taskCtx);

    // Abort the task: the part file under the work path must be gone.
    committer.AbortTask(taskCtx);
    FilePath taskPartFile = new FilePath(new Path(committer.GetWorkPath(), partFile).ToString());
    NUnit.Framework.Assert.IsFalse("task temp dir still exists", taskPartFile.Exists());

    // Abort the job: the pending directory must be gone and the output directory empty.
    committer.AbortJob(jobCtx, JobStatus.State.Failed);
    FilePath pendingDir = new FilePath(new Path(outDir, FileOutputCommitter.PendingDirName).ToString());
    NUnit.Framework.Assert.IsFalse("job temp dir still exists", pendingDir.Exists());
    NUnit.Framework.Assert.AreEqual("Output directory not empty", 0,
        new FilePath(outDir.ToString()).ListFiles().Length);

    FileUtil.FullyDelete(new FilePath(outDir.ToString()));
}
/// <summary>
/// Creates three files with nothing written to them, combines them into one split, and
/// verifies through a mocked reporter that progress is reported once after initialization
/// and three times in total after the first NextKeyValue call returns false.
/// </summary>
public virtual void TestProgressIsReportedIfInputASeriesOfEmptyFiles()
{
    JobConf conf = new JobConf();
    Path[] inputPaths = new Path[3];
    FilePath[] inputFiles = new FilePath[3];
    long[] declaredLengths = new long[3];
    try
    {
        for (int n = 0; n < 3; n++)
        {
            FilePath dir = new FilePath(outDir.ToString());
            dir.Mkdir();
            inputFiles[n] = new FilePath(dir, "testfile" + n);

            // Create the file without writing any content.
            FileWriter emptyWriter = new FileWriter(inputFiles[n]);
            emptyWriter.Flush();
            emptyWriter.Close();

            declaredLengths[n] = n;
            inputPaths[n] = new Path(outDir + "/testfile" + n);
        }

        CombineFileSplit combined = new CombineFileSplit(inputPaths, declaredLengths);
        TaskAttemptID mockAttemptId = Org.Mockito.Mockito.Mock<TaskAttemptID>();
        Task.TaskReporter mockReporter = Org.Mockito.Mockito.Mock<Task.TaskReporter>();
        TaskAttemptContextImpl attemptContext = new TaskAttemptContextImpl(conf, mockAttemptId, mockReporter);

        CombineFileRecordReader combinedReader = new CombineFileRecordReader(
            combined, attemptContext, typeof(TestCombineFileRecordReader.TextRecordReaderWrapper));
        combinedReader.Initialize(combined, attemptContext);

        // One progress call after initialization ...
        Org.Mockito.Mockito.Verify(mockReporter).Progress();
        NUnit.Framework.Assert.IsFalse(combinedReader.NextKeyValue());
        // ... and three in total once the reader has reported no records.
        Org.Mockito.Mockito.Verify(mockReporter, Org.Mockito.Mockito.Times(3)).Progress();
    }
    finally
    {
        FileUtil.FullyDelete(new FilePath(outDir.ToString()));
    }
}
/// <summary>
/// Configures a fake file system as the default, then checks that both AbortTask and
/// AbortJob raise an IOException mentioning "fake delete failed" and that the task
/// output file and the job temporary directory still exist afterwards.
/// </summary>
/// <param name="version">Value stored under the committer algorithm version key.</param>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.Exception"/>
private void TestFailAbortInternal(int version)
{
    Job job = Job.GetInstance();
    Configuration conf = job.GetConfiguration();
    conf.Set(FileSystem.FsDefaultNameKey, "faildel:///");
    conf.SetClass("fs.faildel.impl", typeof(TestFileOutputCommitter.FakeFileSystem), typeof(FileSystem));
    conf.Set(MRJobConfig.TaskAttemptId, attempt);
    conf.SetInt(MRJobConfig.ApplicationAttemptId, 1);
    conf.SetInt(FileOutputCommitter.FileoutputcommitterAlgorithmVersion, version);
    FileOutputFormat.SetOutputPath(job, outDir);

    JobContext jobCtx = new JobContextImpl(conf, taskID.GetJobID());
    TaskAttemptContext taskCtx = new TaskAttemptContextImpl(conf, taskID);
    FileOutputCommitter committer = new FileOutputCommitter(outDir, taskCtx);

    // Set up job and task.
    committer.SetupJob(jobCtx);
    committer.SetupTask(taskCtx);

    // Write the task output.
    TextOutputFormat<object, object> textFormat = new TextOutputFormat();
    RecordWriter<object, object> writer = textFormat.GetRecordWriter(taskCtx);
    WriteOutput(writer, taskCtx);

    // Aborting the task must fail with the fake delete error.
    Exception abortFailure = null;
    try
    {
        committer.AbortTask(taskCtx);
    }
    catch (IOException taskAbortError)
    {
        abortFailure = taskAbortError;
    }
    NUnit.Framework.Assert.IsNotNull(abortFailure);
    NUnit.Framework.Assert.IsTrue(abortFailure is IOException);
    NUnit.Framework.Assert.IsTrue(abortFailure.Message.Contains("fake delete failed"));

    // The task output must still be where it was written.
    Path jobAttemptPath = committer.GetJobAttemptPath(jobCtx);
    FilePath jobTmpDir = new FilePath(jobAttemptPath.ToUri().GetPath());
    Path taskAttemptPath = committer.GetTaskAttemptPath(taskCtx);
    FilePath taskTmpDir = new FilePath(taskAttemptPath.ToUri().GetPath());
    FilePath taskPartFile = new FilePath(taskTmpDir, partFile);
    NUnit.Framework.Assert.IsTrue(taskPartFile + " does not exists", taskPartFile.Exists());

    // Aborting the job must fail the same way.
    abortFailure = null;
    try
    {
        committer.AbortJob(jobCtx, JobStatus.State.Failed);
    }
    catch (IOException jobAbortError)
    {
        abortFailure = jobAbortError;
    }
    NUnit.Framework.Assert.IsNotNull(abortFailure);
    NUnit.Framework.Assert.IsTrue(abortFailure is IOException);
    NUnit.Framework.Assert.IsTrue(abortFailure.Message.Contains("fake delete failed"));
    NUnit.Framework.Assert.IsTrue("job temp dir does not exists", jobTmpDir.Exists());

    FileUtil.FullyDelete(new FilePath(outDir.ToString()));
}
/// <summary>
/// Commits a task in application attempt 1 with one algorithm version, then recovers that
/// task in application attempt 2 with a (possibly different) algorithm version, checking
/// where the committed data lives after each step and validating the final job output.
/// </summary>
/// <param name="commitVersion">Algorithm version used by the first attempt.</param>
/// <param name="recoveryVersion">Algorithm version used by the recovering attempt.</param>
/// <exception cref="System.Exception"/>
private void TestRecoveryInternal(int commitVersion, int recoveryVersion)
{
    Job job = Job.GetInstance();
    FileOutputFormat.SetOutputPath(job, outDir);

    Configuration firstConf = job.GetConfiguration();
    firstConf.Set(MRJobConfig.TaskAttemptId, attempt);
    firstConf.SetInt(MRJobConfig.ApplicationAttemptId, 1);
    firstConf.SetInt(FileOutputCommitter.FileoutputcommitterAlgorithmVersion, commitVersion);

    JobContext firstJobCtx = new JobContextImpl(firstConf, taskID.GetJobID());
    TaskAttemptContext firstTaskCtx = new TaskAttemptContextImpl(firstConf, taskID);
    FileOutputCommitter firstCommitter = new FileOutputCommitter(outDir, firstTaskCtx);

    // Set up job and task.
    firstCommitter.SetupJob(firstJobCtx);
    firstCommitter.SetupTask(firstTaskCtx);

    // Write the task output.
    TextOutputFormat textFormat = new TextOutputFormat();
    RecordWriter writer = textFormat.GetRecordWriter(firstTaskCtx);
    WriteOutput(writer, firstTaskCtx);

    // Commit the task and look at the committed task path.
    firstCommitter.CommitTask(firstTaskCtx);
    Path firstCommittedPath = firstCommitter.GetCommittedTaskPath(firstTaskCtx);
    FilePath firstCommittedDir = new FilePath(firstCommittedPath.ToUri().GetPath());
    if (commitVersion == 1)
    {
        NUnit.Framework.Assert.IsTrue("Version 1 commits to temporary dir " + firstCommittedDir,
            firstCommittedDir.Exists());
        ValidateContent(firstCommittedDir);
    }
    else
    {
        NUnit.Framework.Assert.IsFalse("Version 2 commits to output dir " + firstCommittedDir,
            firstCommittedDir.Exists());
    }

    // Second application attempt: recover the task output of the first attempt.
    Configuration secondConf = job.GetConfiguration();
    secondConf.Set(MRJobConfig.TaskAttemptId, attempt);
    secondConf.SetInt(MRJobConfig.ApplicationAttemptId, 2);
    secondConf.SetInt(FileOutputCommitter.FileoutputcommitterAlgorithmVersion, recoveryVersion);

    JobContext secondJobCtx = new JobContextImpl(secondConf, taskID.GetJobID());
    TaskAttemptContext secondTaskCtx = new TaskAttemptContextImpl(secondConf, taskID);
    FileOutputCommitter secondCommitter = new FileOutputCommitter(outDir, secondTaskCtx);
    secondCommitter.SetupJob(secondTaskCtx);

    Path secondCommittedPath = secondCommitter.GetCommittedTaskPath(secondTaskCtx);
    FilePath secondCommittedDir = new FilePath(secondCommittedPath.ToUri().GetPath());
    secondCommitter.RecoverTask(secondTaskCtx);
    if (recoveryVersion == 1)
    {
        NUnit.Framework.Assert.IsTrue("Version 1 recovers to " + secondCommittedDir,
            secondCommittedDir.Exists());
        ValidateContent(secondCommittedDir);
    }
    else
    {
        NUnit.Framework.Assert.IsFalse("Version 2 commits to output dir " + secondCommittedDir,
            secondCommittedDir.Exists());
        if (commitVersion == 1)
        {
            // The first attempt's committed directory must have been emptied.
            NUnit.Framework.Assert.IsTrue("Version 2 recovery moves to output dir from " + firstCommittedDir,
                firstCommittedDir.List().Length == 0);
        }
    }

    secondCommitter.CommitJob(secondJobCtx);
    ValidateContent(outDir);
    FileUtil.FullyDelete(new FilePath(outDir.ToString()));
}
/// <summary>
/// Checks record lengths and key positions reported by LineRecordReader with the default
/// delimiter, for input containing "\r\n" line ends read as two splits and for input
/// ending in "\r\r\n".
/// </summary>
public virtual void TestUncompressedInputDefaultDelimiterPosValue()
{
    Configuration conf = new Configuration();
    string data = "1234567890\r\n12\r\n345";
    Path dataFile = CreateInputFile(conf, data);
    conf.SetInt("io.file.buffer.size", 10);
    conf.SetInt(LineRecordReader.MaxLineLength, int.MaxValue);

    // First split: bytes [0, 15).
    FileSplit slice = new FileSplit(dataFile, 0, 15, (string[])null);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    LineRecordReader reader = new LineRecordReader(null);
    reader.Initialize(slice, context);

    reader.NextKeyValue();
    LongWritable pos = reader.GetCurrentKey();
    Text record = reader.GetCurrentValue();
    // First record: "1234567890"
    NUnit.Framework.Assert.AreEqual(10, record.GetLength());
    NUnit.Framework.Assert.AreEqual(0, pos.Get());

    reader.NextKeyValue();
    // Second record: "12", positioned at 12 right after "1234567890\r\n"
    NUnit.Framework.Assert.AreEqual(2, record.GetLength());
    NUnit.Framework.Assert.AreEqual(12, pos.Get());

    NUnit.Framework.Assert.IsFalse(reader.NextKeyValue());
    // Position is 16 right after "1234567890\r\n12\r\n"
    NUnit.Framework.Assert.AreEqual(16, pos.Get());

    // Second split: bytes [15, 19).
    slice = new FileSplit(dataFile, 15, 4, (string[])null);
    reader = new LineRecordReader(null);
    reader.Initialize(slice, context);

    // The second split dropped the first record "\n"
    reader.NextKeyValue();
    pos = reader.GetCurrentKey();
    record = reader.GetCurrentValue();
    // Third record: "345", positioned at 16 right after "1234567890\r\n12\r\n"
    NUnit.Framework.Assert.AreEqual(3, record.GetLength());
    NUnit.Framework.Assert.AreEqual(16, pos.Get());

    NUnit.Framework.Assert.IsFalse(reader.NextKeyValue());
    // Position is 19 right after "1234567890\r\n12\r\n345"
    NUnit.Framework.Assert.AreEqual(19, pos.Get());

    // Input ending in "\r\r\n", read as one split.
    data = "123456789\r\r\n";
    dataFile = CreateInputFile(conf, data);
    slice = new FileSplit(dataFile, 0, 12, (string[])null);
    reader = new LineRecordReader(null);
    reader.Initialize(slice, context);

    reader.NextKeyValue();
    pos = reader.GetCurrentKey();
    record = reader.GetCurrentValue();
    // First record: "123456789"
    NUnit.Framework.Assert.AreEqual(9, record.GetLength());
    NUnit.Framework.Assert.AreEqual(0, pos.Get());

    reader.NextKeyValue();
    // Second record: "", positioned at 10 right after "123456789\r"
    NUnit.Framework.Assert.AreEqual(0, record.GetLength());
    NUnit.Framework.Assert.AreEqual(10, pos.Get());

    NUnit.Framework.Assert.IsFalse(reader.NextKeyValue());
    // Position is 12 right after "123456789\r\r\n"
    NUnit.Framework.Assert.AreEqual(12, pos.Get());
}
/// <summary>
/// Checks record lengths and key positions reported by LineRecordReader with custom
/// multi-character delimiters: a plain "++" case read as two splits, a case where the data
/// contains the first character of the delimiter, and a sweep over buffer sizes with the
/// "|+|" delimiter whose bytes are ambiguous with the data.
/// </summary>
public virtual void TestUncompressedInputCustomDelimiterPosValue()
{
    const string NoRecord = "Expected record got nothing";
    const string WrongLength = "Wrong length for record value";
    const string WrongPosition = "Wrong position after record read";
    const string UnexpectedKey = "Unexpected key returned";
    const string UnexpectedRecord = "Unexpected record returned";

    Configuration conf = new Configuration();
    conf.SetInt("io.file.buffer.size", 10);
    conf.SetInt(LineRecordReader.MaxLineLength, int.MaxValue);

    string data = "abcdefghij++kl++mno";
    Path dataFile = CreateInputFile(conf, data);
    string separator = "++";
    byte[] separatorBytes = Sharpen.Runtime.GetBytesForString(separator, Charsets.Utf8);
    int firstLength = 15;

    FileSplit slice = new FileSplit(dataFile, 0, firstLength, (string[])null);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    LineRecordReader reader = new LineRecordReader(separatorBytes);
    reader.Initialize(slice, context);

    // First record: "abcdefghij"
    NUnit.Framework.Assert.IsTrue(NoRecord, reader.NextKeyValue());
    LongWritable pos = reader.GetCurrentKey();
    Text record = reader.GetCurrentValue();
    NUnit.Framework.Assert.AreEqual(WrongLength, 10, record.GetLength());
    NUnit.Framework.Assert.AreEqual(WrongPosition, 0, pos.Get());

    // Second record: "kl", positioned at 12 right after "abcdefghij++"
    NUnit.Framework.Assert.IsTrue(NoRecord, reader.NextKeyValue());
    NUnit.Framework.Assert.AreEqual(WrongLength, 2, record.GetLength());
    NUnit.Framework.Assert.AreEqual(WrongPosition, 12, pos.Get());

    // Third record: "mno", positioned at 16 right after "abcdefghij++kl++"
    NUnit.Framework.Assert.IsTrue(NoRecord, reader.NextKeyValue());
    NUnit.Framework.Assert.AreEqual(WrongLength, 3, record.GetLength());
    NUnit.Framework.Assert.AreEqual(WrongPosition, 16, pos.Get());

    // End of input: position is 19 right after "abcdefghij++kl++mno"
    NUnit.Framework.Assert.IsFalse(reader.NextKeyValue());
    NUnit.Framework.Assert.AreEqual(WrongPosition, 19, pos.Get());

    // Re-fetching the key after the end must give null.
    pos = reader.GetCurrentKey();
    NUnit.Framework.Assert.IsNull(UnexpectedKey, pos);
    reader.Close();

    slice = new FileSplit(dataFile, firstLength, data.Length - firstLength, (string[])null);
    reader = new LineRecordReader(separatorBytes);
    reader.Initialize(slice, context);

    // No record is in the second split because the second split dropped
    // the first record, which was already reported by the first split.
    NUnit.Framework.Assert.IsFalse(UnexpectedRecord, reader.NextKeyValue());
    pos = reader.GetCurrentKey();
    NUnit.Framework.Assert.IsNull(UnexpectedKey, pos);
    reader.Close();

    // Multi-character delimiter whose first character also appears in the data.
    data = "abcd+efgh++ijk++mno";
    dataFile = CreateInputFile(conf, data);
    firstLength = 5;
    slice = new FileSplit(dataFile, 0, firstLength, (string[])null);
    reader = new LineRecordReader(separatorBytes);
    reader.Initialize(slice, context);

    // First record: "abcd+efgh"
    NUnit.Framework.Assert.IsTrue(NoRecord, reader.NextKeyValue());
    pos = reader.GetCurrentKey();
    record = reader.GetCurrentValue();
    NUnit.Framework.Assert.AreEqual(WrongPosition, 0, pos.Get());
    NUnit.Framework.Assert.AreEqual(WrongLength, 9, record.GetLength());

    // Should have jumped over the delimiter: no further record in this split.
    NUnit.Framework.Assert.IsFalse(reader.NextKeyValue());
    NUnit.Framework.Assert.AreEqual(WrongPosition, 11, pos.Get());

    // Re-fetching the key after the end must give null.
    pos = reader.GetCurrentKey();
    NUnit.Framework.Assert.IsNull(UnexpectedKey, pos);
    reader.Close();

    // Next split: check for duplicate or dropped records.
    slice = new FileSplit(dataFile, firstLength, data.Length - firstLength, (string[])null);
    reader = new LineRecordReader(separatorBytes);
    reader.Initialize(slice, context);
    NUnit.Framework.Assert.IsTrue(NoRecord, reader.NextKeyValue());
    pos = reader.GetCurrentKey();
    record = reader.GetCurrentValue();

    // Second record "ijk" comes first in this split.
    NUnit.Framework.Assert.AreEqual(WrongPosition, 11, pos.Get());
    NUnit.Framework.Assert.AreEqual(WrongLength, 3, record.GetLength());

    // Third record "mno" comes second in this split.
    NUnit.Framework.Assert.IsTrue(NoRecord, reader.NextKeyValue());
    NUnit.Framework.Assert.AreEqual(WrongPosition, 16, pos.Get());
    NUnit.Framework.Assert.AreEqual(WrongLength, 3, record.GetLength());

    // Should be at the end of the input.
    NUnit.Framework.Assert.IsFalse(reader.NextKeyValue());
    NUnit.Framework.Assert.AreEqual(WrongPosition, 19, pos.Get());
    reader.Close();

    data = "abcd|efgh|+|ij|kl|+|mno|pqr";
    dataFile = CreateInputFile(conf, data);
    separator = "|+|";
    separatorBytes = Sharpen.Runtime.GetBytesForString(separator, Charsets.Utf8);

    // Walking over the buffer and split sizes checks for proper processing
    // of the ambiguous bytes of the delimiter.
    // NOTE(review): splitSize is never referenced inside the loop body; the split length
    // below is taken from bufferSize - confirm whether that is intended.
    for (int bufferSize = 1; bufferSize <= data.Length; bufferSize++)
    {
        for (int splitSize = 1; splitSize < data.Length; splitSize++)
        {
            // Tracks the position expected for the next key.
            int expectedPos = 0;
            conf.SetInt("io.file.buffer.size", bufferSize);
            slice = new FileSplit(dataFile, 0, bufferSize, (string[])null);
            reader = new LineRecordReader(separatorBytes);
            reader.Initialize(slice, context);

            // The first record "abcd|efgh" is always available, at position 0.
            NUnit.Framework.Assert.IsTrue(NoRecord, reader.NextKeyValue());
            pos = reader.GetCurrentKey();
            record = reader.GetCurrentValue();
            NUnit.Framework.Assert.IsTrue("abcd|efgh".Equals(record.ToString()));
            NUnit.Framework.Assert.AreEqual(WrongPosition, expectedPos, pos.Get());

            // Position is 12 right after the first "|+|".
            expectedPos = 12;

            // Second record "ij|kl", if the split/buffer allows it.
            if (reader.NextKeyValue())
            {
                NUnit.Framework.Assert.IsTrue("ij|kl".Equals(record.ToString()));
                NUnit.Framework.Assert.AreEqual(WrongPosition, expectedPos, pos.Get());
                // Position is 20 after the second "|+|".
                expectedPos = 20;
            }

            // Third record "mno|pqr", if the split/buffer allows it.
            if (reader.NextKeyValue())
            {
                NUnit.Framework.Assert.IsTrue("mno|pqr".Equals(record.ToString()));
                NUnit.Framework.Assert.AreEqual(WrongPosition, expectedPos, pos.Get());
                // Position is the end of the input.
                expectedPos = data.Length;
            }

            // No more records can be read; the key must sit at the last expected position.
            NUnit.Framework.Assert.IsFalse(UnexpectedRecord, reader.NextKeyValue());
            NUnit.Framework.Assert.AreEqual(WrongPosition, expectedPos, pos.Get());

            // Re-fetching the key after the end must give null.
            pos = reader.GetCurrentKey();
            NUnit.Framework.Assert.IsNull(UnexpectedKey, pos);
            reader.Close();
        }
    }
}