// this tests both files (testCompressThenConcat, testConcatThenCompress);
// all should work with either native zlib or new Inflater-based decoder
/// <summary>
/// Reads both pre-built concatenated-gzip test files from the input directory
/// already configured in <paramref name="jConf"/> (which also carries the io
/// buffer size under test) and checks each decodes to the expected 84 lines.
/// </summary>
/// <param name="jConf">job conf with input paths and buffer size already set by the caller</param>
/// <exception cref="System.IO.IOException"/>
private static void DoSingleGzipBufferSize(JobConf jConf)
{
    TextInputFormat format = new TextInputFormat();
    format.Configure(jConf);
    // here's Nth pair of DecompressorStreams:
    InputSplit[] splits = format.GetSplits(jConf, 100);
    NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);
    // Split order is filesystem-dependent: force testCompressThenConcat.txt.gz
    // into splits[1] so the assertions below line up with the right file.
    FileSplit tmp = (FileSplit)splits[0];
    if (tmp.GetPath().GetName().Equals("testCompressThenConcat.txt.gz"))
    {
        System.Console.Out.WriteLine(" (swapping)");
        splits[0] = splits[1];
        splits[1] = tmp;
    }
    // Both files decode to the same 84 lines of Moby-Dick text, so the two
    // assertion blocks are intentionally identical.
    IList<Text> results = ReadSplit(format, splits[0], jConf);
    NUnit.Framework.Assert.AreEqual("splits[0] length (num lines)", 84, results.Count);
    NUnit.Framework.Assert.AreEqual("splits[0][0]", "Call me Ishmael. Some years ago--never mind how long precisely--having", results[0].ToString());
    NUnit.Framework.Assert.AreEqual("splits[0][42]", "Tell me, does the magnetic virtue of the needles of the compasses of", results[42].ToString());
    results = ReadSplit(format, splits[1], jConf);
    NUnit.Framework.Assert.AreEqual("splits[1] length (num lines)", 84, results.Count);
    NUnit.Framework.Assert.AreEqual("splits[1][0]", "Call me Ishmael. Some years ago--never mind how long precisely--having", results[0].ToString());
    NUnit.Framework.Assert.AreEqual("splits[1][42]", "Tell me, does the magnetic virtue of the needles of the compasses of", results[42].ToString());
}
/// <summary>
/// Creates ten part-0000N input files plus two underscore-prefixed files
/// (_meta, _temp) and verifies that after getSplits() the job configuration
/// records exactly ten input files -- the underscore files are not counted.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestNumInputs()
{
    JobConf jobConf = new JobConf(conf);
    dfs = NewDFSCluster(jobConf);
    FileSystem fileSys = dfs.GetFileSystem();
    System.Console.Out.WriteLine("FileSystem " + fileSys.GetUri());

    Path dir = new Path("/foo/");
    const int expectedCount = 10;
    for (int idx = 0; idx < expectedCount; ++idx)
    {
        CreateInputs(fileSys, dir, "part-0000" + idx.ToString());
    }
    // These two must be excluded from the input-file count (only the ten
    // part-0000N files should be tallied by the assertion below).
    CreateInputs(fileSys, dir, "_meta");
    CreateInputs(fileSys, dir, "_temp");

    // Splitting the directory is what populates NumInputFiles in the conf.
    TextInputFormat.AddInputPath(jobConf, dir);
    TextInputFormat format = new TextInputFormat();
    format.Configure(jobConf);
    format.GetSplits(jobConf, 1);
    NUnit.Framework.Assert.AreEqual("Expected value of " + FileInputFormat.NumInputFiles,
        expectedCount, jobConf.GetLong(FileInputFormat.NumInputFiles, 0));
}
/// <summary>
/// ListStatus on a configured-but-missing input directory must fail with an
/// InvalidInputException whose message names the absent, fully-qualified path.
/// </summary>
public virtual void TestListStatusErrorOnNonExistantDir()
{
    Configuration configuration = new Configuration();
    configuration.SetInt(FileInputFormat.ListStatusNumThreads, numThreads);
    Org.Apache.Hadoop.Mapreduce.Lib.Input.TestFileInputFormat.ConfigureTestErrorOnNonExistantDir(configuration, localFs);
    JobConf jobConf = new JobConf(configuration);
    TextInputFormat format = new TextInputFormat();
    format.Configure(jobConf);
    try
    {
        format.ListStatus(jobConf);
        NUnit.Framework.Assert.Fail("Expecting an IOException for a missing Input path");
    }
    catch (IOException ex)
    {
        // The shared helper configures "input2" under the test root as the
        // path that does not exist; the message must reference it qualified.
        Path missing = localFs.MakeQualified(new Path(TestRootDir, "input2"));
        NUnit.Framework.Assert.IsTrue(ex is InvalidInputException);
        NUnit.Framework.Assert.AreEqual("Input path does not exist: " + missing.ToString(), ex.Message);
    }
}
/// <summary>Test using the gzip codec for reading</summary>
/// <remarks>
/// Writes two small gzipped files (6 lines and 2 lines), asks for splits,
/// and verifies each compressed file becomes exactly one split whose decoded
/// line count and contents match what was written.
/// </remarks>
/// <exception cref="System.IO.IOException"/>
public virtual void TestGzip()
{
    JobConf job = new JobConf(defaultConf);
    CompressionCodec gzipCodec = new GzipCodec();
    ReflectionUtils.SetConf(gzipCodec, job);
    localFs.Delete(workDir, true);

    WriteFile(localFs, new Path(workDir, "part1.txt.gz"), gzipCodec, "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
    WriteFile(localFs, new Path(workDir, "part2.txt.gz"), gzipCodec, "this is a test\nof gzip\n");
    FileInputFormat.SetInputPaths(job, workDir);

    TextInputFormat format = new TextInputFormat();
    format.Configure(job);
    InputSplit[] splits = format.GetSplits(job, 100);
    NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);

    // Split order is not guaranteed; make splits[0] refer to part1.
    FileSplit firstSplit = (FileSplit)splits[0];
    if (firstSplit.GetPath().GetName().Equals("part2.txt.gz"))
    {
        splits[0] = splits[1];
        splits[1] = firstSplit;
    }

    IList<Text> lines = ReadSplit(format, splits[0], job);
    NUnit.Framework.Assert.AreEqual("splits[0] length", 6, lines.Count);
    NUnit.Framework.Assert.AreEqual("splits[0][5]", " dog", lines[5].ToString());

    lines = ReadSplit(format, splits[1], job);
    NUnit.Framework.Assert.AreEqual("splits[1] length", 2, lines.Count);
    NUnit.Framework.Assert.AreEqual("splits[1][0]", "this is a test", lines[0].ToString());
    NUnit.Framework.Assert.AreEqual("splits[1][1]", "of gzip", lines[1].ToString());
}
/// <summary>
/// ListStatus over the nested directory layout built by the shared helper
/// (non-recursive mode) must return exactly the expected set of paths.
/// </summary>
public virtual void TestListStatusNestedNonRecursive()
{
    Configuration configuration = new Configuration();
    configuration.SetInt(FileInputFormat.ListStatusNumThreads, numThreads);
    IList<Path> expectedPaths = Org.Apache.Hadoop.Mapreduce.Lib.Input.TestFileInputFormat.ConfigureTestNestedNonRecursive(configuration, localFs);
    JobConf jobConf = new JobConf(configuration);
    TextInputFormat format = new TextInputFormat();
    format.Configure(jobConf);
    FileStatus[] listed = format.ListStatus(jobConf);
    Org.Apache.Hadoop.Mapreduce.Lib.Input.TestFileInputFormat.VerifyFileStatuses(expectedPaths, Lists.NewArrayList(listed), localFs);
}
/// <summary>
/// Concatenated-bzip2 test: copies a prebuilt multi-member concat.bz2 (from
/// the "test.concat.data" directory) plus a small generated part2.txt.bz2
/// into the work dir and checks both decode to the expected lines.
/// </summary>
public virtual void TestBzip2()
{
    JobConf jobConf = new JobConf(defaultConf);
    CompressionCodec bzip2 = new BZip2Codec();
    ReflectionUtils.SetConf(bzip2, jobConf);
    localFs.Delete(workDir, true);
    System.Console.Out.WriteLine(ColorBrCyan + "testBzip2() using non-native CBZip2InputStream (presumably)" + ColorNormal);
    // copy prebuilt (correct!) version of concat.bz2 to HDFS
    string fn = "concat" + bzip2.GetDefaultExtension();
    Path fnLocal = new Path(Runtime.GetProperty("test.concat.data", "/tmp"), fn);
    Path fnHDFS = new Path(workDir, fn);
    localFs.CopyFromLocalFile(fnLocal, fnHDFS);
    WriteFile(localFs, new Path(workDir, "part2.txt.bz2"), bzip2, "this is a test\nof bzip2\n");
    FileInputFormat.SetInputPaths(jobConf, workDir);
    TextInputFormat format = new TextInputFormat();
    // extends FileInputFormat
    format.Configure(jobConf);
    format.SetMinSplitSize(256);
    // work around 2-byte splits issue
    // [135 splits for a 208-byte file and a 62-byte file(!)]
    InputSplit[] splits = format.GetSplits(jobConf, 100);
    NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);
    // Split order is filesystem-dependent: force part2.txt.bz2 into splits[1]
    // so the assertions below line up with the right file.
    FileSplit tmp = (FileSplit)splits[0];
    if (tmp.GetPath().GetName().Equals("part2.txt.bz2"))
    {
        splits[0] = splits[1];
        splits[1] = tmp;
    }
    // splits[0] is the prebuilt concat file: 6 lines ending in "member #3".
    IList<Text> results = ReadSplit(format, splits[0], jobConf);
    NUnit.Framework.Assert.AreEqual("splits[0] num lines", 6, results.Count);
    NUnit.Framework.Assert.AreEqual("splits[0][5]", "member #3", results[5].ToString());
    // splits[1] is the file written above: exactly the two lines we wrote.
    results = ReadSplit(format, splits[1], jobConf);
    NUnit.Framework.Assert.AreEqual("splits[1] num lines", 2, results.Count);
    NUnit.Framework.Assert.AreEqual("splits[1][0]", "this is a test", results[0].ToString());
    NUnit.Framework.Assert.AreEqual("splits[1][1]", "of bzip2", results[1].ToString());
}
/// <summary>
/// Checks that each split produced for a DFS file maps 1:1 onto a block and
/// reports the same host set (in either order) as the block's replica
/// locations, and that NumInputFiles is recorded as 1.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestLocality()
{
    JobConf job = new JobConf(conf);
    dfs = NewDFSCluster(job);
    FileSystem fs = dfs.GetFileSystem();
    System.Console.Out.WriteLine("FileSystem " + fs.GetUri());
    Path inputDir = new Path("/foo/");
    string fileName = "part-0000";
    CreateInputs(fs, inputDir, fileName);
    // split it using a file input format
    TextInputFormat.AddInputPath(job, inputDir);
    TextInputFormat inFormat = new TextInputFormat();
    inFormat.Configure(job);
    InputSplit[] splits = inFormat.GetSplits(job, 1);
    FileStatus fileStatus = fs.GetFileStatus(new Path(inputDir, fileName));
    BlockLocation[] locations = fs.GetFileBlockLocations(fileStatus, 0, fileStatus.GetLen());
    System.Console.Out.WriteLine("Made splits");
    // make sure that each split is a block and the locations match
    // (relies on splits[i] covering the same byte range as locations[i])
    for (int i = 0; i < splits.Length; ++i)
    {
        FileSplit fileSplit = (FileSplit)splits[i];
        System.Console.Out.WriteLine("File split: " + fileSplit);
        foreach (string h in fileSplit.GetLocations())
        {
            System.Console.Out.WriteLine("Location: " + h);
        }
        System.Console.Out.WriteLine("Block: " + locations[i]);
        NUnit.Framework.Assert.AreEqual(locations[i].GetOffset(), fileSplit.GetStart());
        NUnit.Framework.Assert.AreEqual(locations[i].GetLength(), fileSplit.GetLength());
        string[] blockLocs = locations[i].GetHosts();
        string[] splitLocs = fileSplit.GetLocations();
        // Two hosts per block -- presumably replication factor 2, set up by
        // NewDFSCluster/CreateInputs outside this view (TODO confirm).
        NUnit.Framework.Assert.AreEqual(2, blockLocs.Length);
        NUnit.Framework.Assert.AreEqual(2, splitLocs.Length);
        // Host lists must match as sets; order may differ between the two APIs.
        NUnit.Framework.Assert.IsTrue((blockLocs[0].Equals(splitLocs[0]) && blockLocs[1].Equals(splitLocs[1])) || (blockLocs[1].Equals(splitLocs[0]) && blockLocs[0].Equals(splitLocs[1])));
    }
    // getSplits() should also have recorded the single input file in the conf.
    NUnit.Framework.Assert.AreEqual("Expected value of " + FileInputFormat.NumInputFiles, 1, job.GetLong(FileInputFormat.NumInputFiles, 0));
}
/// <summary>Test using the gzip codec and an empty input file</summary>
/// <remarks>
/// An empty gzipped file must still produce exactly one split, and reading
/// that split must yield zero records.
/// </remarks>
/// <exception cref="System.IO.IOException"/>
public virtual void TestGzipEmpty()
{
    JobConf job = new JobConf(defaultConf);
    CompressionCodec gzipCodec = new GzipCodec();
    ReflectionUtils.SetConf(gzipCodec, job);
    localFs.Delete(workDir, true);

    WriteFile(localFs, new Path(workDir, "empty.gz"), gzipCodec, string.Empty);
    FileInputFormat.SetInputPaths(job, workDir);

    TextInputFormat format = new TextInputFormat();
    format.Configure(job);
    InputSplit[] splits = format.GetSplits(job, 100);
    NUnit.Framework.Assert.AreEqual("Compressed files of length 0 are not returned from FileInputFormat.getSplits().", 1, splits.Length);

    IList<Text> lines = ReadSplit(format, splits[0], job);
    NUnit.Framework.Assert.AreEqual("Compressed empty file length == 0", 0, lines.Count);
}
/// <summary>
/// Verifies that computing splits over the mock filesystem issues exactly one
/// listLocatedStatus call, i.e. file statuses and block locations are fetched
/// in a single batched request rather than per file.
/// </summary>
public virtual void TestListLocatedStatus()
{
    Configuration conf = GetConfiguration();
    // Caching must stay enabled so the MockFileSystem instance obtained here
    // is the very one the input format uses (we inspect its call counter).
    conf.SetBoolean("fs.test.impl.disable.cache", false);
    conf.SetInt(FileInputFormat.ListStatusNumThreads, numThreads);
    conf.Set(FileInputFormat.InputDir, "test:///a1/a2");
    TestFileInputFormat.MockFileSystem mockFs = (TestFileInputFormat.MockFileSystem)new Path("test:///").GetFileSystem(conf);
    NUnit.Framework.Assert.AreEqual("listLocatedStatus already called", 0, mockFs.numListLocatedStatusCalls);
    JobConf job = new JobConf(conf);
    TextInputFormat fileInputFormat = new TextInputFormat();
    fileInputFormat.Configure(job);
    InputSplit[] splits = fileInputFormat.GetSplits(job, 1);
    NUnit.Framework.Assert.AreEqual("Input splits are not correct", 2, splits.Length);
    // Fixed assertion-message typo: was "listLocatedStatuss calls".
    NUnit.Framework.Assert.AreEqual("listLocatedStatus calls", 1, mockFs.numListLocatedStatusCalls);
    // Drop the cached MockFileSystem so later tests start with a fresh counter.
    FileSystem.CloseAll();
}
/// <summary>
/// Splits from the mock filesystem must expose both plain host names and
/// SplitLocationInfo entries; both hosts are on-disk, but only the localhost
/// replica is flagged as in-memory.
/// </summary>
public virtual void TestSplitLocationInfo()
{
    Configuration conf = GetConfiguration();
    conf.Set(FileInputFormat.InputDir, "test:///a1/a2");
    JobConf job = new JobConf(conf);
    TextInputFormat format = new TextInputFormat();
    format.Configure(job);
    FileSplit[] splits = (FileSplit[])format.GetSplits(job, 1);

    string[] hosts = splits[0].GetLocations();
    NUnit.Framework.Assert.AreEqual(2, hosts.Length);
    SplitLocationInfo[] infos = splits[0].GetLocationInfo();
    NUnit.Framework.Assert.AreEqual(2, infos.Length);

    // Pick out each host's info entry by matching against the first host name.
    bool firstIsLocal = hosts[0].Equals("localhost");
    SplitLocationInfo localhostInfo = firstIsLocal ? infos[0] : infos[1];
    bool firstIsOther = hosts[0].Equals("otherhost");
    SplitLocationInfo otherhostInfo = firstIsOther ? infos[0] : infos[1];

    NUnit.Framework.Assert.IsTrue(localhostInfo.IsOnDisk());
    NUnit.Framework.Assert.IsTrue(localhostInfo.IsInMemory());
    NUnit.Framework.Assert.IsTrue(otherhostInfo.IsOnDisk());
    NUnit.Framework.Assert.IsFalse(otherhostInfo.IsInMemory());
}
/// <summary>
/// Concatenated-gzip test: copies a prebuilt multi-member concat.gz (from the
/// "test.concat.data" directory) plus a small generated part2.txt.gz into the
/// work dir and checks both decode fully; skipped when native libs are absent.
/// </summary>
public virtual void TestGzip()
{
    JobConf jobConf = new JobConf(defaultConf);
    CompressionCodec gzip = new GzipCodec();
    ReflectionUtils.SetConf(gzip, jobConf);
    localFs.Delete(workDir, true);
    // preferred, but not compatible with Apache/trunk instance of Hudson:
    /*
     * assertFalse("[native (C/C++) codec]",
     * (org.apache.hadoop.io.compress.zlib.BuiltInGzipDecompressor.class ==
     * gzip.getDecompressorType()) );
     * System.out.println(COLOR_BR_RED +
     * "testGzip() using native-zlib Decompressor (" +
     * gzip.getDecompressorType() + ")" + COLOR_NORMAL);
     */
    // alternative:
    // NOTE(review): BuiltInGzipDecompressor is the pure-Java (non-native)
    // decoder, yet the branch that matches it prints "native-zlib
    // Decompressor" -- the message and/or condition look inverted relative to
    // the commented-out original above; confirm intent before relying on it.
    if (typeof(BuiltInGzipDecompressor) == gzip.GetDecompressorType())
    {
        System.Console.Out.WriteLine(ColorBrRed + "testGzip() using native-zlib Decompressor (" + gzip.GetDecompressorType() + ")" + ColorNormal);
    }
    else
    {
        Log.Warn("testGzip() skipped: native (C/C++) libs not loaded");
        return;
    }
    /*
     * // THIS IS BUGGY: omits 2nd/3rd gzip headers; screws up 2nd/3rd CRCs--
     * // see https://issues.apache.org/jira/browse/HADOOP-6799
     * Path fnHDFS = new Path(workDir, "concat" + gzip.getDefaultExtension());
     * //OutputStream out = localFs.create(fnHDFS);
     * //GzipCodec.GzipOutputStream gzOStm = new GzipCodec.GzipOutputStream(out);
     * // can just combine those two lines, probably
     * //GzipCodec.GzipOutputStream gzOStm =
     * //    new GzipCodec.GzipOutputStream(localFs.create(fnHDFS));
     * // oops, no: this is a protected helper class; need to access
     * // it via createOutputStream() instead:
     * OutputStream out = localFs.create(fnHDFS);
     * Compressor gzCmp = gzip.createCompressor();
     * CompressionOutputStream gzOStm = gzip.createOutputStream(out, gzCmp);
     * // this SHOULD be going to HDFS: got out from localFs == HDFS
     * // ...yup, works
     * gzOStm.write("first gzip concat\n member\nwith three lines\n".getBytes());
     * gzOStm.finish();
     * gzOStm.resetState();
     * gzOStm.write("2nd gzip concat member\n".getBytes());
     * gzOStm.finish();
     * gzOStm.resetState();
     * gzOStm.write("gzip concat\nmember #3\n".getBytes());
     * gzOStm.close();
     * //
     * String fn = "hdfs-to-local-concat" + gzip.getDefaultExtension();
     * Path fnLocal = new Path(System.getProperty("test.concat.data","/tmp"), fn);
     * localFs.copyToLocalFile(fnHDFS, fnLocal);
     */
    // copy prebuilt (correct!) version of concat.gz to HDFS
    string fn = "concat" + gzip.GetDefaultExtension();
    Path fnLocal = new Path(Runtime.GetProperty("test.concat.data", "/tmp"), fn);
    Path fnHDFS = new Path(workDir, fn);
    localFs.CopyFromLocalFile(fnLocal, fnHDFS);
    WriteFile(localFs, new Path(workDir, "part2.txt.gz"), gzip, "this is a test\nof gzip\n");
    FileInputFormat.SetInputPaths(jobConf, workDir);
    TextInputFormat format = new TextInputFormat();
    format.Configure(jobConf);
    InputSplit[] splits = format.GetSplits(jobConf, 100);
    NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);
    // Split order is filesystem-dependent: force part2.txt.gz into splits[1]
    // so the assertions below line up with the right file.
    FileSplit tmp = (FileSplit)splits[0];
    if (tmp.GetPath().GetName().Equals("part2.txt.gz"))
    {
        splits[0] = splits[1];
        splits[1] = tmp;
    }
    // splits[0] is the prebuilt concat file: 6 lines ending in "member #3".
    IList<Text> results = ReadSplit(format, splits[0], jobConf);
    NUnit.Framework.Assert.AreEqual("splits[0] num lines", 6, results.Count);
    NUnit.Framework.Assert.AreEqual("splits[0][5]", "member #3", results[5].ToString());
    // splits[1] is the file written above: exactly the two lines we wrote.
    results = ReadSplit(format, splits[1], jobConf);
    NUnit.Framework.Assert.AreEqual("splits[1] num lines", 2, results.Count);
    NUnit.Framework.Assert.AreEqual("splits[1][0]", "this is a test", results[0].ToString());
    NUnit.Framework.Assert.AreEqual("splits[1][1]", "of gzip", results[1].ToString());
}
/// <summary>
/// Core TextInputFormat round-trip: for a randomized range of file lengths
/// and split counts, every numbered line written must be read back exactly
/// once across all splits (no duplicates, no gaps).
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestFormat()
{
    JobConf job = new JobConf(defaultConf);
    Path file = new Path(workDir, "test.txt");
    // A reporter that does nothing
    Reporter reporter = Reporter.Null;
    int seed = new Random().Next();
    // seed is logged so a failing randomized run can be reproduced
    Log.Info("seed = " + seed);
    Random random = new Random(seed);
    localFs.Delete(workDir, true);
    FileInputFormat.SetInputPaths(job, workDir);
    // for a variety of lengths
    for (int length = 0; length < MaxLength; length += random.Next(MaxLength / 10) + 1)
    {
        Log.Debug("creating; entries = " + length);
        // create a file with length entries, one integer per line (0..length-1)
        TextWriter writer = new OutputStreamWriter(localFs.Create(file));
        try
        {
            for (int i = 0; i < length; i++)
            {
                writer.Write(Sharpen.Extensions.ToString(i));
                writer.Write("\n");
            }
        }
        finally
        {
            writer.Close();
        }
        // try splitting the file in a variety of sizes
        TextInputFormat format = new TextInputFormat();
        format.Configure(job);
        LongWritable key = new LongWritable();
        Text value = new Text();
        for (int i_1 = 0; i_1 < 3; i_1++)
        {
            int numSplits = random.Next(MaxLength / 20) + 1;
            Log.Debug("splitting: requesting = " + numSplits);
            InputSplit[] splits = format.GetSplits(job, numSplits);
            Log.Debug("splitting: got = " + splits.Length);
            if (length == 0)
            {
                // an empty file still yields exactly one zero-length split
                NUnit.Framework.Assert.AreEqual("Files of length 0 are not returned from FileInputFormat.getSplits().", 1, splits.Length);
                NUnit.Framework.Assert.AreEqual("Empty file length == 0", 0, splits[0].GetLength());
            }
            // check each split: bit v is set once line "v" has been read
            BitSet bits = new BitSet(length);
            for (int j = 0; j < splits.Length; j++)
            {
                Log.Debug("split[" + j + "]= " + splits[j]);
                RecordReader<LongWritable, Text> reader = format.GetRecordReader(splits[j], job, reporter);
                try
                {
                    int count = 0;
                    while (reader.Next(key, value))
                    {
                        int v = System.Convert.ToInt32(value.ToString());
                        Log.Debug("read " + v);
                        if (bits.Get(v))
                        {
                            // log before asserting so the conflicting position is visible
                            Log.Warn("conflict with " + v + " in split " + j + " at position " + reader.GetPos());
                        }
                        NUnit.Framework.Assert.IsFalse("Key in multiple partitions.", bits.Get(v));
                        bits.Set(v);
                        count++;
                    }
                    Log.Debug("splits[" + j + "]=" + splits[j] + " count=" + count);
                }
                finally
                {
                    reader.Close();
                }
            }
            // every line seen exactly once => cardinality equals line count
            NUnit.Framework.Assert.AreEqual("Some keys in no partition.", length, bits.Cardinality());
        }
    }
}
/// <summary>
/// Same exactly-once coverage check as TestFormat, but writing the input
/// through a splittable compression codec (BZip2): every numbered line must
/// be partitioned across the splits without loss or duplication.
/// </summary>
/// <exception cref="System.IO.IOException"/>
public virtual void TestSplitableCodecs()
{
    JobConf conf = new JobConf(defaultConf);
    int seed = new Random().Next();
    // Create the codec (by name, so a missing BZip2Codec fails loudly)
    CompressionCodec codec = null;
    try
    {
        codec = (CompressionCodec)ReflectionUtils.NewInstance(conf.GetClassByName("org.apache.hadoop.io.compress.BZip2Codec"), conf);
    }
    catch (TypeLoadException)
    {
        throw new IOException("Illegal codec!");
    }
    Path file = new Path(workDir, "test" + codec.GetDefaultExtension());
    // A reporter that does nothing
    Reporter reporter = Reporter.Null;
    // seed is logged so a failing randomized run can be reproduced
    Log.Info("seed = " + seed);
    Random random = new Random(seed);
    FileSystem localFs = FileSystem.GetLocal(conf);
    localFs.Delete(workDir, true);
    FileInputFormat.SetInputPaths(conf, workDir);
    int MaxLength = 500000;
    // for a variety of lengths (large enough to force multiple real splits)
    for (int length = MaxLength / 2; length < MaxLength; length += random.Next(MaxLength / 4) + 1)
    {
        Log.Info("creating; entries = " + length);
        // create a compressed file with length entries, one integer per line
        TextWriter writer = new OutputStreamWriter(codec.CreateOutputStream(localFs.Create(file)));
        try
        {
            for (int i = 0; i < length; i++)
            {
                writer.Write(Sharpen.Extensions.ToString(i));
                writer.Write("\n");
            }
        }
        finally
        {
            writer.Close();
        }
        // try splitting the file in a variety of sizes
        TextInputFormat format = new TextInputFormat();
        format.Configure(conf);
        LongWritable key = new LongWritable();
        Text value = new Text();
        for (int i_1 = 0; i_1 < 3; i_1++)
        {
            int numSplits = random.Next(MaxLength / 2000) + 1;
            Log.Info("splitting: requesting = " + numSplits);
            InputSplit[] splits = format.GetSplits(conf, numSplits);
            Log.Info("splitting: got = " + splits.Length);
            // check each split: bit v is set once line "v" has been read
            BitSet bits = new BitSet(length);
            for (int j = 0; j < splits.Length; j++)
            {
                Log.Debug("split[" + j + "]= " + splits[j]);
                RecordReader<LongWritable, Text> reader = format.GetRecordReader(splits[j], conf, reporter);
                try
                {
                    int counter = 0;
                    while (reader.Next(key, value))
                    {
                        int v = System.Convert.ToInt32(value.ToString());
                        Log.Debug("read " + v);
                        if (bits.Get(v))
                        {
                            // log before asserting so the conflicting position is visible
                            Log.Warn("conflict with " + v + " in split " + j + " at position " + reader.GetPos());
                        }
                        NUnit.Framework.Assert.IsFalse("Key in multiple partitions.", bits.Get(v));
                        bits.Set(v);
                        counter++;
                    }
                    // empty splits are expected with compressed input; log them
                    // at a quieter level than splits that produced records
                    if (counter > 0)
                    {
                        Log.Info("splits[" + j + "]=" + splits[j] + " count=" + counter);
                    }
                    else
                    {
                        Log.Debug("splits[" + j + "]=" + splits[j] + " count=" + counter);
                    }
                }
                finally
                {
                    reader.Close();
                }
            }
            // every line seen exactly once => cardinality equals line count
            NUnit.Framework.Assert.AreEqual("Some keys in no partition.", length, bits.Cardinality());
        }
    }
}