// This is also called from the old FixedLengthRecordReader API implementation
/// <summary>
/// Opens <paramref name="file"/> and prepares this reader for the split that
/// begins at <paramref name="splitStart"/> and spans
/// <paramref name="splitLength"/> bytes. If the split does not begin on a
/// multiple of the record length, the bytes up to the next multiple are
/// skipped before reading starts.
/// </summary>
/// <param name="job">Configuration used to resolve the file system and the compression codec.</param>
/// <param name="splitStart">Byte offset at which the split begins.</param>
/// <param name="splitLength">Length of the split in bytes.</param>
/// <param name="file">Path of the file to read.</param>
/// <exception cref="System.IO.IOException"/>
public virtual void Initialize(Configuration job, long splitStart, long splitLength, Path file)
{
    start = splitStart;
    end = start + splitLength;

    // Bytes between the split start and the next multiple of recordLength.
    long leadingFragment = start % recordLength;
    long skipCount = leadingFragment == 0 ? 0 : recordLength - leadingFragment;

    // open the file and seek to the start of the split
    fileIn = file.GetFileSystem(job).Open(file);

    CompressionCodec codec = new CompressionCodecFactory(job).GetCodec(file);
    if (codec == null)
    {
        // Plain input: position directly at the split start.
        fileIn.Seek(start);
        filePosition = fileIn;
        inputStream = fileIn;

        long effectiveSize = end - start - skipCount;
        // Round up so a trailing partial record still counts; clamp at zero
        // for splits smaller than the skipped prefix.
        long recordCount = (effectiveSize + recordLength - 1) / recordLength;
        numRecordsRemainingInSplit = recordCount < 0 ? 0 : recordCount;

        Log.Info("Expecting " + numRecordsRemainingInSplit
            + " records each with a length of " + recordLength
            + " bytes in the split with an effective size of " + effectiveSize + " bytes");
    }
    else
    {
        // Compressed input: read through the codec's stream. The record
        // count is left unbounded because it cannot be computed up front.
        isCompressedInput = true;
        decompressor = CodecPool.GetDecompressor(codec);
        CompressionInputStream compressedIn = codec.CreateInputStream(fileIn, decompressor);
        filePosition = compressedIn;
        inputStream = compressedIn;
        numRecordsRemainingInSplit = long.MaxValue;
        Log.Info("Compressed input; cannot compute number of records in the split");
    }

    if (skipCount != 0)
    {
        // Advance start by however many bytes the stream actually skipped.
        start += inputStream.Skip(skipCount);
    }
    this.pos = start;
}
/// <summary>
/// Verifies that, with <c>io.native.lib.available</c> set to false,
/// <c>GzipCodec</c> reports <c>BuiltInGzipDecompressor</c> as its decompressor
/// type, and that the multiple-member gzip test file decompresses to the
/// expected number of bytes and lines. Finishes by running
/// <c>DoMultipleGzipBufferSizes</c> in both modes.
/// </summary>
/// <remarks>
/// NOTE: This fails on RHEL4 with "java.io.IOException: header crc mismatch"
/// due to buggy version of zlib (1.2.1.2) included.
/// </remarks>
public virtual void TestBuiltInGzipDecompressor()
{
    JobConf jobConf = new JobConf(defaultConf);
    jobConf.SetBoolean("io.native.lib.available", false);
    CompressionCodec gzip = new GzipCodec();
    ReflectionUtils.SetConf(gzip, jobConf);
    localFs.Delete(workDir, true);

    NUnit.Framework.Assert.AreEqual("[non-native (Java) codec]", typeof(BuiltInGzipDecompressor), gzip.GetDecompressorType());
    System.Console.Out.WriteLine(ColorBrYellow + "testBuiltInGzipDecompressor() using" + " non-native (Java Inflater) Decompressor (" + gzip.GetDecompressorType() + ")" + ColorNormal);

    // copy single-member test file to HDFS
    string fn1 = "testConcatThenCompress.txt" + gzip.GetDefaultExtension();
    Path fnLocal1 = new Path(Runtime.GetProperty("test.concat.data", "/tmp"), fn1);
    Path fnHDFS1 = new Path(workDir, fn1);
    localFs.CopyFromLocalFile(fnLocal1, fnHDFS1);

    // copy multiple-member test file to HDFS
    // (actually in "seekable gzip" format, a la JIRA PIG-42)
    string fn2 = "testCompressThenConcat.txt" + gzip.GetDefaultExtension();
    Path fnLocal2 = new Path(Runtime.GetProperty("test.concat.data", "/tmp"), fn2);
    Path fnHDFS2 = new Path(workDir, fn2);
    localFs.CopyFromLocalFile(fnLocal2, fnHDFS2);

    FileInputFormat.SetInputPaths(jobConf, workDir);

    int totalBytes = 0;
    int lineNum = 0;

    // here's first pair of DecompressorStreams:
    // in1 is only inspected for its size, so it is released in the outer
    // finally rather than being left open for the rest of the test.
    FileInputStream in1 = new FileInputStream(fnLocal1.ToString());
    try
    {
        FileInputStream in2 = new FileInputStream(fnLocal2.ToString());
        LineReader @in = null;
        try
        {
            NUnit.Framework.Assert.AreEqual("concat bytes available", 2734, in1.Available());
            NUnit.Framework.Assert.AreEqual("concat bytes available", 3413, in2.Available()); // w/hdr CRC

            CompressionInputStream cin2 = gzip.CreateInputStream(in2);
            @in = new LineReader(cin2);
            Text @out = new Text();
            int numBytes;
            while ((numBytes = @in.ReadLine(@out)) > 0)
            {
                ++lineNum;
                totalBytes += numBytes;
            }
        }
        finally
        {
            // Closing the reader is how in2 was released originally; close
            // in2 directly only when the reader was never constructed.
            if (@in != null)
            {
                @in.Close();
            }
            else
            {
                in2.Close();
            }
        }
    }
    finally
    {
        in1.Close();
    }

    NUnit.Framework.Assert.AreEqual("total uncompressed bytes in concatenated test file", 5346, totalBytes);
    NUnit.Framework.Assert.AreEqual("total uncompressed lines in concatenated test file", 84, lineNum);

    // test BuiltInGzipDecompressor with lots of different input-buffer sizes
    DoMultipleGzipBufferSizes(jobConf, false);

    // test GzipZlibDecompressor (native), just to be sure
    // (FIXME? could move this call to testGzip(), but would need filename
    // setup above) (alternatively, maybe just nuke testGzip() and extend this?)
    DoMultipleGzipBufferSizes(jobConf, true);
}
/// <summary>
/// Writes a mix of key/value pairs (including <c>null</c> and
/// <c>NullWritable</c> keys and values) through a <c>TextOutputFormat</c>
/// record writer with output compression enabled, then reads the resulting
/// file back through <c>DefaultCodec</c> and checks the decompressed text
/// against the expected lines.
/// </summary>
public virtual void TestCompress()
{
    JobConf job = new JobConf();
    job.Set(JobContext.TaskAttemptId, attempt);
    job.Set(FileOutputFormat.Compress, "true");
    FileOutputFormat.SetOutputPath(job, workDir.GetParent().GetParent());
    FileOutputFormat.SetWorkOutputPath(job, workDir);
    FileSystem fs = workDir.GetFileSystem(job);
    if (!fs.Mkdirs(workDir))
    {
        NUnit.Framework.Assert.Fail("Failed to create output directory");
    }
    string file = "test_compress.txt";

    // A reporter that does nothing
    Reporter reporter = Reporter.Null;
    TextOutputFormat<object, object> theOutputFormat = new TextOutputFormat<object, object>();
    RecordWriter<object, object> theRecordWriter = theOutputFormat.GetRecordWriter(localFs, job, file, reporter);

    Org.Apache.Hadoop.IO.Text key1 = new Org.Apache.Hadoop.IO.Text("key1");
    Org.Apache.Hadoop.IO.Text key2 = new Org.Apache.Hadoop.IO.Text("key2");
    Org.Apache.Hadoop.IO.Text val1 = new Org.Apache.Hadoop.IO.Text("val1");
    Org.Apache.Hadoop.IO.Text val2 = new Org.Apache.Hadoop.IO.Text("val2");
    NullWritable nullWritable = NullWritable.Get();
    try
    {
        theRecordWriter.Write(key1, val1);
        theRecordWriter.Write(null, nullWritable);
        theRecordWriter.Write(null, val1);
        theRecordWriter.Write(nullWritable, val2);
        theRecordWriter.Write(key2, nullWritable);
        theRecordWriter.Write(key1, null);
        theRecordWriter.Write(null, null);
        theRecordWriter.Write(key2, val2);
    }
    finally
    {
        theRecordWriter.Close(reporter);
    }

    // Six lines are expected for the eight writes above.
    StringBuilder expectedOutput = new StringBuilder();
    expectedOutput.Append(key1).Append("\t").Append(val1).Append("\n");
    expectedOutput.Append(val1).Append("\n");
    expectedOutput.Append(val2).Append("\n");
    expectedOutput.Append(key2).Append("\n");
    expectedOutput.Append(key1).Append("\n");
    expectedOutput.Append(key2).Append("\t").Append(val2).Append("\n");

    DefaultCodec codec = new DefaultCodec();
    codec.SetConf(job);
    Path expectedFile = new Path(workDir, file + codec.GetDefaultExtension());

    StringBuilder output = new StringBuilder();
    FileInputStream istream = new FileInputStream(expectedFile.ToString());
    LineReader reader = null;
    try
    {
        CompressionInputStream cistream = codec.CreateInputStream(istream);
        reader = new LineReader(cistream);
        Org.Apache.Hadoop.IO.Text @out = new Org.Apache.Hadoop.IO.Text();
        while (reader.ReadLine(@out) > 0)
        {
            output.Append(@out).Append("\n");
        }
    }
    finally
    {
        // Closing the reader is how the stream was released originally;
        // close istream directly only when the reader was never constructed.
        if (reader != null)
        {
            reader.Close();
        }
        else
        {
            istream.Close();
        }
    }

    NUnit.Framework.Assert.AreEqual(expectedOutput.ToString(), output.ToString());
}