// this tests both files (testCompressThenConcat, testConcatThenCompress);
        // all should work with either native zlib or new Inflater-based decoder
        /// <summary>
        /// Splits the configured concatenated-gzip input into exactly two splits
        /// and verifies each split decodes to the expected 84 lines.
        /// </summary>
        /// <exception cref="System.IO.IOException"/>
        private static void DoSingleGzipBufferSize(JobConf jConf)
        {
            TextInputFormat inputFormat = new TextInputFormat();
            inputFormat.Configure(jConf);

            // here's Nth pair of DecompressorStreams:
            InputSplit[] splits = inputFormat.GetSplits(jConf, 100);
            NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);

            // split order is not guaranteed; make sure testCompressThenConcat is second
            FileSplit firstSplit = (FileSplit)splits[0];
            if (firstSplit.GetPath().GetName().Equals("testCompressThenConcat.txt.gz"))
            {
                System.Console.Out.WriteLine("  (swapping)");
                splits[0] = splits[1];
                splits[1] = firstSplit;
            }

            IList <Text> lines = ReadSplit(inputFormat, splits[0], jConf);
            NUnit.Framework.Assert.AreEqual("splits[0] length (num lines)", 84, lines.Count);
            NUnit.Framework.Assert.AreEqual("splits[0][0]",
                "Call me Ishmael. Some years ago--never mind how long precisely--having",
                lines[0].ToString());
            NUnit.Framework.Assert.AreEqual("splits[0][42]",
                "Tell me, does the magnetic virtue of the needles of the compasses of",
                lines[42].ToString());

            lines = ReadSplit(inputFormat, splits[1], jConf);
            NUnit.Framework.Assert.AreEqual("splits[1] length (num lines)", 84, lines.Count);
            NUnit.Framework.Assert.AreEqual("splits[1][0]",
                "Call me Ishmael. Some years ago--never mind how long precisely--having",
                lines[0].ToString());
            NUnit.Framework.Assert.AreEqual("splits[1][42]",
                "Tell me, does the magnetic virtue of the needles of the compasses of",
                lines[42].ToString());
        }
Example #2
0
        /// <summary>
        /// Listing a missing input directory must raise an
        /// <see cref="InvalidInputException"/> that names the absent path.
        /// </summary>
        public virtual void TestListStatusErrorOnNonExistantDir()
        {
            Configuration conf = new Configuration();
            conf.SetInt(FileInputFormat.ListStatusNumThreads, numThreads);
            Org.Apache.Hadoop.Mapreduce.Lib.Input.TestFileInputFormat.ConfigureTestErrorOnNonExistantDir(conf, localFs);

            JobConf job = new JobConf(conf);
            TextInputFormat inputFormat = new TextInputFormat();
            inputFormat.Configure(job);

            try
            {
                inputFormat.ListStatus(job);
                NUnit.Framework.Assert.Fail("Expecting an IOException for a missing Input path");
            }
            catch (IOException ex)
            {
                // the exception message must name the fully-qualified missing path
                Path missingPath = localFs.MakeQualified(new Path(TestRootDir, "input2"));
                NUnit.Framework.Assert.IsTrue(ex is InvalidInputException);
                NUnit.Framework.Assert.AreEqual("Input path does not exist: " + missingPath.ToString(), ex.Message);
            }
        }
        /// <summary>
        /// Creates ten "part-0000x" files plus two "_"-prefixed files and checks
        /// that GetSplits records only the ten as the job's input-file count.
        /// </summary>
        /// <exception cref="System.Exception"/>
        public virtual void TestNumInputs()
        {
            JobConf job = new JobConf(conf);
            dfs = NewDFSCluster(job);
            FileSystem fs = dfs.GetFileSystem();
            System.Console.Out.WriteLine("FileSystem " + fs.GetUri());

            Path inputDir = new Path("/foo/");
            int numFiles = 10;
            string fileNameBase = "part-0000";
            for (int i = 0; i < numFiles; ++i)
            {
                CreateInputs(fs, inputDir, fileNameBase + i.ToString());
            }
            // these should not be counted as inputs (underscore prefix)
            CreateInputs(fs, inputDir, "_meta");
            CreateInputs(fs, inputDir, "_temp");

            // split it using a file input format
            TextInputFormat.AddInputPath(job, inputDir);
            TextInputFormat inFormat = new TextInputFormat();
            inFormat.Configure(job);
            // GetSplits stores the input-file count in the job configuration
            InputSplit[] splits = inFormat.GetSplits(job, 1);
            NUnit.Framework.Assert.AreEqual("Expected value of " + FileInputFormat.NumInputFiles,
                                            numFiles, job.GetLong(FileInputFormat.NumInputFiles, 0));
        }
        /// <summary>Test using the gzip codec for reading</summary>
        /// <exception cref="System.IO.IOException"/>
        public virtual void TestGzip()
        {
            JobConf job = new JobConf(defaultConf);
            CompressionCodec gzip = new GzipCodec();
            ReflectionUtils.SetConf(gzip, job);
            localFs.Delete(workDir, true);

            // two small gzip files: six lines and two lines respectively
            WriteFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
                      "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
            WriteFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
                      "this is a test\nof gzip\n");
            FileInputFormat.SetInputPaths(job, workDir);

            TextInputFormat format = new TextInputFormat();
            format.Configure(job);
            InputSplit[] splits = format.GetSplits(job, 100);
            NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);

            // split order is not guaranteed; put part1 first
            FileSplit firstSplit = (FileSplit)splits[0];
            if (firstSplit.GetPath().GetName().Equals("part2.txt.gz"))
            {
                splits[0] = splits[1];
                splits[1] = firstSplit;
            }

            IList <Text> lines = ReadSplit(format, splits[0], job);
            NUnit.Framework.Assert.AreEqual("splits[0] length", 6, lines.Count);
            NUnit.Framework.Assert.AreEqual("splits[0][5]", " dog", lines[5].ToString());

            lines = ReadSplit(format, splits[1], job);
            NUnit.Framework.Assert.AreEqual("splits[1] length", 2, lines.Count);
            NUnit.Framework.Assert.AreEqual("splits[1][0]", "this is a test", lines[0].ToString());
            NUnit.Framework.Assert.AreEqual("splits[1][1]", "of gzip", lines[1].ToString());
        }
Example #5
0
        /// <summary>
        /// Configures nested input directories, lists them non-recursively, and
        /// verifies the returned statuses against the expected paths.
        /// </summary>
        public virtual void TestListStatusNestedNonRecursive()
        {
            Configuration conf = new Configuration();
            conf.SetInt(FileInputFormat.ListStatusNumThreads, numThreads);
            IList <Path> expectedPaths =
                Org.Apache.Hadoop.Mapreduce.Lib.Input.TestFileInputFormat.ConfigureTestNestedNonRecursive(conf, localFs);

            JobConf job = new JobConf(conf);
            TextInputFormat inputFormat = new TextInputFormat();
            inputFormat.Configure(job);

            FileStatus[] statuses = inputFormat.ListStatus(job);
            Org.Apache.Hadoop.Mapreduce.Lib.Input.TestFileInputFormat.VerifyFileStatuses(
                expectedPaths, Lists.NewArrayList(statuses), localFs);
        }
        /// <summary>
        /// Reads a prebuilt concatenated .bz2 file plus a small generated one and
        /// checks that both decode to the expected line counts and contents.
        /// </summary>
        public virtual void TestBzip2()
        {
            JobConf jobConf = new JobConf(defaultConf);
            CompressionCodec bzip2 = new BZip2Codec();
            ReflectionUtils.SetConf(bzip2, jobConf);
            localFs.Delete(workDir, true);
            System.Console.Out.WriteLine(ColorBrCyan + "testBzip2() using non-native CBZip2InputStream (presumably)"
                                         + ColorNormal);

            // copy prebuilt (correct!) version of concat.bz2 to HDFS
            string fn = "concat" + bzip2.GetDefaultExtension();
            Path fnLocal = new Path(Runtime.GetProperty("test.concat.data", "/tmp"), fn);
            Path fnHDFS = new Path(workDir, fn);
            localFs.CopyFromLocalFile(fnLocal, fnHDFS);
            WriteFile(localFs, new Path(workDir, "part2.txt.bz2"), bzip2,
                      "this is a test\nof bzip2\n");

            FileInputFormat.SetInputPaths(jobConf, workDir);
            TextInputFormat format = new TextInputFormat();  // extends FileInputFormat
            format.Configure(jobConf);
            // work around 2-byte splits issue
            // [135 splits for a 208-byte file and a 62-byte file(!)]
            format.SetMinSplitSize(256);

            InputSplit[] splits = format.GetSplits(jobConf, 100);
            NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);

            // split order is not guaranteed; put the concat file first
            FileSplit firstSplit = (FileSplit)splits[0];
            if (firstSplit.GetPath().GetName().Equals("part2.txt.bz2"))
            {
                splits[0] = splits[1];
                splits[1] = firstSplit;
            }

            IList <Text> lines = ReadSplit(format, splits[0], jobConf);
            NUnit.Framework.Assert.AreEqual("splits[0] num lines", 6, lines.Count);
            NUnit.Framework.Assert.AreEqual("splits[0][5]", "member #3", lines[5].ToString());

            lines = ReadSplit(format, splits[1], jobConf);
            NUnit.Framework.Assert.AreEqual("splits[1] num lines", 2, lines.Count);
            NUnit.Framework.Assert.AreEqual("splits[1][0]", "this is a test", lines[0].ToString());
            NUnit.Framework.Assert.AreEqual("splits[1][1]", "of bzip2", lines[1].ToString());
        }
        /// <summary>
        /// Reads every record of <paramref name="split"/> with the given format
        /// and returns the values in order.
        /// </summary>
        /// <exception cref="System.IO.IOException"/>
        private static IList <Text> ReadSplit(TextInputFormat format, InputSplit split, JobConf
                                              jobConf)
        {
            IList <Text> lines = new AList <Text>();
            RecordReader <LongWritable, Text> reader =
                format.GetRecordReader(split, jobConf, voidReporter);
            LongWritable key = reader.CreateKey();
            Text value = reader.CreateValue();
            while (reader.Next(key, value))
            {
                // each value is kept in the result, so allocate a fresh Text per record
                lines.AddItem(value);
                value = reader.CreateValue();
            }
            reader.Close();
            return lines;
        }
        /// <summary>
        /// Creates one file on a mini-DFS and checks that each FileSplit lines up
        /// with one block: same offset and length, and the same (unordered) pair
        /// of hosts as the block's replica locations.
        /// </summary>
        /// <exception cref="System.Exception"/>
        public virtual void TestLocality()
        {
            JobConf job = new JobConf(conf);

            dfs = NewDFSCluster(job);
            FileSystem fs = dfs.GetFileSystem();

            System.Console.Out.WriteLine("FileSystem " + fs.GetUri());
            Path   inputDir = new Path("/foo/");
            string fileName = "part-0000";

            CreateInputs(fs, inputDir, fileName);
            // split it using a file input format
            TextInputFormat.AddInputPath(job, inputDir);
            TextInputFormat inFormat = new TextInputFormat();

            inFormat.Configure(job);
            InputSplit[] splits     = inFormat.GetSplits(job, 1);
            FileStatus   fileStatus = fs.GetFileStatus(new Path(inputDir, fileName));

            BlockLocation[] locations = fs.GetFileBlockLocations(fileStatus, 0, fileStatus.GetLen
                                                                     ());
            System.Console.Out.WriteLine("Made splits");
            // make sure that each split is a block and the locations match
            for (int i = 0; i < splits.Length; ++i)
            {
                FileSplit fileSplit = (FileSplit)splits[i];
                System.Console.Out.WriteLine("File split: " + fileSplit);
                foreach (string h in fileSplit.GetLocations())
                {
                    System.Console.Out.WriteLine("Location: " + h);
                }
                System.Console.Out.WriteLine("Block: " + locations[i]);
                NUnit.Framework.Assert.AreEqual(locations[i].GetOffset(), fileSplit.GetStart());
                NUnit.Framework.Assert.AreEqual(locations[i].GetLength(), fileSplit.GetLength());
                string[] blockLocs = locations[i].GetHosts();
                string[] splitLocs = fileSplit.GetLocations();
                // expects exactly two replicas per block -- presumably the cluster
                // from NewDFSCluster is configured with replication == 2; confirm
                NUnit.Framework.Assert.AreEqual(2, blockLocs.Length);
                NUnit.Framework.Assert.AreEqual(2, splitLocs.Length);
                // hosts may come back in either order; accept both pairings
                NUnit.Framework.Assert.IsTrue((blockLocs[0].Equals(splitLocs[0]) && blockLocs[1].
                                               Equals(splitLocs[1])) || (blockLocs[1].Equals(splitLocs[0]) && blockLocs[0].Equals
                                                                             (splitLocs[1])));
            }
            // GetSplits records the input-file count in the job configuration
            NUnit.Framework.Assert.AreEqual("Expected value of " + FileInputFormat.NumInputFiles
                                            , 1, job.GetLong(FileInputFormat.NumInputFiles, 0));
        }
        /// <summary>Test using the gzip codec and an empty input file</summary>
        /// <exception cref="System.IO.IOException"/>
        public virtual void TestGzipEmpty()
        {
            JobConf job = new JobConf(defaultConf);
            CompressionCodec gzip = new GzipCodec();
            ReflectionUtils.SetConf(gzip, job);
            localFs.Delete(workDir, true);

            // an empty payload still produces one split, which yields zero records
            WriteFile(localFs, new Path(workDir, "empty.gz"), gzip, string.Empty);
            FileInputFormat.SetInputPaths(job, workDir);
            TextInputFormat format = new TextInputFormat();
            format.Configure(job);

            InputSplit[] splits = format.GetSplits(job, 100);
            NUnit.Framework.Assert.AreEqual("Compressed files of length 0 are not returned from FileInputFormat.getSplits()."
                                            , 1, splits.Length);
            IList <Text> lines = ReadSplit(format, splits[0], job);
            NUnit.Framework.Assert.AreEqual("Compressed empty file length == 0", 0, lines.Count);
        }
Example #10
0
        /// <summary>
        /// GetSplits must drive exactly one listLocatedStatus call on the mock
        /// file system (counted via numListLocatedStatusCalls).
        /// </summary>
        public virtual void TestListLocatedStatus()
        {
            Configuration conf = GetConfiguration();
            conf.SetBoolean("fs.test.impl.disable.cache", false);
            conf.SetInt(FileInputFormat.ListStatusNumThreads, numThreads);
            conf.Set(FileInputFormat.InputDir, "test:///a1/a2");

            TestFileInputFormat.MockFileSystem mockFs =
                (TestFileInputFormat.MockFileSystem) new Path("test:///").GetFileSystem(conf);
            NUnit.Framework.Assert.AreEqual("listLocatedStatus already called", 0,
                                            mockFs.numListLocatedStatusCalls);

            JobConf job = new JobConf(conf);
            TextInputFormat inputFormat = new TextInputFormat();
            inputFormat.Configure(job);

            InputSplit[] splits = inputFormat.GetSplits(job, 1);
            NUnit.Framework.Assert.AreEqual("Input splits are not correct", 2, splits.Length);
            NUnit.Framework.Assert.AreEqual("listLocatedStatuss calls", 1,
                                            mockFs.numListLocatedStatusCalls);
            FileSystem.CloseAll();
        }
Example #11
0
        /// <summary>
        /// Splits should carry per-host location info: the localhost replica is
        /// both in memory and on disk, the other host's replica on disk only.
        /// </summary>
        public virtual void TestSplitLocationInfo()
        {
            Configuration conf = GetConfiguration();
            conf.Set(FileInputFormat.InputDir, "test:///a1/a2");
            JobConf job = new JobConf(conf);
            TextInputFormat inputFormat = new TextInputFormat();
            inputFormat.Configure(job);

            FileSplit[] splits = (FileSplit[])inputFormat.GetSplits(job, 1);
            string[] locations = splits[0].GetLocations();
            NUnit.Framework.Assert.AreEqual(2, locations.Length);
            SplitLocationInfo[] locationInfo = splits[0].GetLocationInfo();
            NUnit.Framework.Assert.AreEqual(2, locationInfo.Length);

            // pair each host name with its SplitLocationInfo entry
            SplitLocationInfo localhostInfo =
                locations[0].Equals("localhost") ? locationInfo[0] : locationInfo[1];
            SplitLocationInfo otherhostInfo =
                locations[0].Equals("otherhost") ? locationInfo[0] : locationInfo[1];

            NUnit.Framework.Assert.IsTrue(localhostInfo.IsOnDisk());
            NUnit.Framework.Assert.IsTrue(localhostInfo.IsInMemory());
            NUnit.Framework.Assert.IsTrue(otherhostInfo.IsOnDisk());
            NUnit.Framework.Assert.IsFalse(otherhostInfo.IsInMemory());
        }
        /// <summary>
        /// Reads a prebuilt concatenated .gz file (multiple gzip members) plus a
        /// small generated one and checks both decode fully; returns early when
        /// the codec's decompressor type indicates the required support is absent.
        /// </summary>
        public virtual void TestGzip()
        {
            JobConf          jobConf = new JobConf(defaultConf);
            CompressionCodec gzip    = new GzipCodec();

            ReflectionUtils.SetConf(gzip, jobConf);
            localFs.Delete(workDir, true);
            // preferred, but not compatible with Apache/trunk instance of Hudson:

            /*
             * assertFalse("[native (C/C++) codec]",
             * (org.apache.hadoop.io.compress.zlib.BuiltInGzipDecompressor.class ==
             * gzip.getDecompressorType()) );
             * System.out.println(COLOR_BR_RED +
             * "testGzip() using native-zlib Decompressor (" +
             * gzip.getDecompressorType() + ")" + COLOR_NORMAL);
             */
            // alternative:
            // NOTE(review): the log text says "native-zlib Decompressor", yet this
            // branch matches the Inflater-based BuiltInGzipDecompressor type; the
            // message and the skip-warning below look swapped -- confirm intent.
            if (typeof(BuiltInGzipDecompressor) == gzip.GetDecompressorType())
            {
                System.Console.Out.WriteLine(ColorBrRed + "testGzip() using native-zlib Decompressor ("
                                             + gzip.GetDecompressorType() + ")" + ColorNormal);
            }
            else
            {
                Log.Warn("testGzip() skipped:  native (C/C++) libs not loaded");
                return;
            }

            /*
             *      // THIS IS BUGGY: omits 2nd/3rd gzip headers; screws up 2nd/3rd CRCs--
             *      //                see https://issues.apache.org/jira/browse/HADOOP-6799
             *  Path fnHDFS = new Path(workDir, "concat" + gzip.getDefaultExtension());
             *  //OutputStream out = localFs.create(fnHDFS);
             *  //GzipCodec.GzipOutputStream gzOStm = new GzipCodec.GzipOutputStream(out);
             *      // can just combine those two lines, probably
             *  //GzipCodec.GzipOutputStream gzOStm =
             *  //  new GzipCodec.GzipOutputStream(localFs.create(fnHDFS));
             *      // oops, no:  this is a protected helper class; need to access
             *      //   it via createOutputStream() instead:
             *  OutputStream out = localFs.create(fnHDFS);
             *  Compressor gzCmp = gzip.createCompressor();
             *  CompressionOutputStream gzOStm = gzip.createOutputStream(out, gzCmp);
             *      // this SHOULD be going to HDFS:  got out from localFs == HDFS
             *      //   ...yup, works
             *  gzOStm.write("first gzip concat\n member\nwith three lines\n".getBytes());
             *  gzOStm.finish();
             *  gzOStm.resetState();
             *  gzOStm.write("2nd gzip concat member\n".getBytes());
             *  gzOStm.finish();
             *  gzOStm.resetState();
             *  gzOStm.write("gzip concat\nmember #3\n".getBytes());
             *  gzOStm.close();
             *      //
             *  String fn = "hdfs-to-local-concat" + gzip.getDefaultExtension();
             *  Path fnLocal = new Path(System.getProperty("test.concat.data","/tmp"), fn);
             *  localFs.copyToLocalFile(fnHDFS, fnLocal);
             */
            // copy prebuilt (correct!) version of concat.gz to HDFS
            string fn      = "concat" + gzip.GetDefaultExtension();
            Path   fnLocal = new Path(Runtime.GetProperty("test.concat.data", "/tmp"), fn);
            Path   fnHDFS  = new Path(workDir, fn);

            localFs.CopyFromLocalFile(fnLocal, fnHDFS);
            WriteFile(localFs, new Path(workDir, "part2.txt.gz"), gzip, "this is a test\nof gzip\n"
                      );
            FileInputFormat.SetInputPaths(jobConf, workDir);
            TextInputFormat format = new TextInputFormat();

            format.Configure(jobConf);
            InputSplit[] splits = format.GetSplits(jobConf, 100);
            NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);
            FileSplit tmp = (FileSplit)splits[0];

            // split order is not guaranteed; put the concat file first
            if (tmp.GetPath().GetName().Equals("part2.txt.gz"))
            {
                splits[0] = splits[1];
                splits[1] = tmp;
            }
            IList <Text> results = ReadSplit(format, splits[0], jobConf);

            NUnit.Framework.Assert.AreEqual("splits[0] num lines", 6, results.Count);
            NUnit.Framework.Assert.AreEqual("splits[0][5]", "member #3", results[5].ToString(
                                                ));
            results = ReadSplit(format, splits[1], jobConf);
            NUnit.Framework.Assert.AreEqual("splits[1] num lines", 2, results.Count);
            NUnit.Framework.Assert.AreEqual("splits[1][0]", "this is a test", results[0].ToString
                                                ());
            NUnit.Framework.Assert.AreEqual("splits[1][1]", "of gzip", results[1].ToString());
        }
Example #13
0
        /// <summary>
        /// Runs a local MapReduce job with a combiner plus key-grouping
        /// comparators and verifies the combiner counters and the final output
        /// lines (reduced to "A2" and "B5").
        /// </summary>
        public virtual void TestCombiner()
        {
            if (!new FilePath(TestRootDir).Mkdirs())
            {
                throw new RuntimeException("Could not create test dir: " + TestRootDir);
            }
            FilePath @in = new FilePath(TestRootDir, "input");

            // FIX: this guard was corrupted to "[email protected]()" (an
            // email-obfuscation artifact) -- restore the intended "create the
            // input dir or fail" check, mirroring the TestRootDir guard above.
            if (!@in.Mkdirs())
            {
                throw new RuntimeException("Could not create test dir: " + @in);
            }
            FilePath    @out = new FilePath(TestRootDir, "output");
            PrintWriter pw   = new PrintWriter(new FileWriter(new FilePath(@in, "data.txt")));

            // five input records; expected output collapses them to A2 and B5
            pw.WriteLine("A|a,1");
            pw.WriteLine("A|b,2");
            pw.WriteLine("B|a,3");
            pw.WriteLine("B|b,4");
            pw.WriteLine("B|c,5");
            pw.Close();
            JobConf job = new JobConf();

            job.Set("mapreduce.framework.name", "local");
            TextInputFormat.SetInputPaths(job, new Path(@in.GetPath()));
            TextOutputFormat.SetOutputPath(job, new Path(@out.GetPath()));
            job.SetMapperClass(typeof(TestOldCombinerGrouping.Map));
            job.SetReducerClass(typeof(TestOldCombinerGrouping.Reduce));
            job.SetInputFormat(typeof(TextInputFormat));
            job.SetMapOutputKeyClass(typeof(Text));
            job.SetMapOutputValueClass(typeof(LongWritable));
            job.SetOutputFormat(typeof(TextOutputFormat));
            job.SetOutputValueGroupingComparator(typeof(TestOldCombinerGrouping.GroupComparator
                                                        ));
            job.SetCombinerClass(typeof(TestOldCombinerGrouping.Combiner));
            job.SetCombinerKeyGroupingComparator(typeof(TestOldCombinerGrouping.GroupComparator
                                                        ));
            // make the combiner run even when there are few spills
            job.SetInt("min.num.spills.for.combine", 0);
            JobClient  client     = new JobClient(job);
            RunningJob runningJob = client.SubmitJob(job);

            runningJob.WaitForCompletion();
            if (runningJob.IsSuccessful())
            {
                Counters counters             = runningJob.GetCounters();
                long     combinerInputRecords = counters.GetGroup("org.apache.hadoop.mapreduce.TaskCounter"
                                                                  ).GetCounter("COMBINE_INPUT_RECORDS");
                long combinerOutputRecords = counters.GetGroup("org.apache.hadoop.mapreduce.TaskCounter"
                                                               ).GetCounter("COMBINE_OUTPUT_RECORDS");
                // the combiner must have run and must have reduced the record count
                NUnit.Framework.Assert.IsTrue(combinerInputRecords > 0);
                NUnit.Framework.Assert.IsTrue(combinerInputRecords > combinerOutputRecords);
                BufferedReader br = new BufferedReader(new FileReader(new FilePath(@out, "part-00000"
                                                                                   )));
                ICollection <string> output = new HashSet <string>();
                string line = br.ReadLine();
                NUnit.Framework.Assert.IsNotNull(line);
                // keep chars [0,1) and [4,5) of each output line, e.g. "A" + "2"
                output.AddItem(Sharpen.Runtime.Substring(line, 0, 1) + Sharpen.Runtime.Substring(
                                   line, 4, 5));
                line = br.ReadLine();
                NUnit.Framework.Assert.IsNotNull(line);
                output.AddItem(Sharpen.Runtime.Substring(line, 0, 1) + Sharpen.Runtime.Substring(
                                   line, 4, 5));
                // exactly two output lines expected
                line = br.ReadLine();
                NUnit.Framework.Assert.IsNull(line);
                br.Close();
                ICollection <string> expected = new HashSet <string>();
                expected.AddItem("A2");
                expected.AddItem("B5");
                NUnit.Framework.Assert.AreEqual(expected, output);
            }
            else
            {
                NUnit.Framework.Assert.Fail("Job failed");
            }
        }
        /// <summary>
        /// Writes files of randomly increasing length (one integer per line),
        /// splits each file several ways, and verifies via a BitSet that every
        /// line is read exactly once across all splits.
        /// </summary>
        /// <exception cref="System.Exception"/>
        public virtual void TestFormat()
        {
            JobConf job  = new JobConf(defaultConf);
            Path    file = new Path(workDir, "test.txt");
            // A reporter that does nothing
            Reporter reporter = Reporter.Null;
            int      seed     = new Random().Next();

            // log the seed so a failing run can be reproduced
            Log.Info("seed = " + seed);
            Random random = new Random(seed);

            localFs.Delete(workDir, true);
            FileInputFormat.SetInputPaths(job, workDir);
            // for a variety of lengths
            for (int length = 0; length < MaxLength; length += random.Next(MaxLength / 10) +
                                                               1)
            {
                Log.Debug("creating; entries = " + length);
                // create a file with length entries
                TextWriter writer = new OutputStreamWriter(localFs.Create(file));
                try
                {
                    for (int i = 0; i < length; i++)
                    {
                        writer.Write(Sharpen.Extensions.ToString(i));
                        writer.Write("\n");
                    }
                }
                finally
                {
                    writer.Close();
                }
                // try splitting the file in a variety of sizes
                TextInputFormat format = new TextInputFormat();
                format.Configure(job);
                LongWritable key   = new LongWritable();
                Text         value = new Text();
                for (int i_1 = 0; i_1 < 3; i_1++)
                {
                    int numSplits = random.Next(MaxLength / 20) + 1;
                    Log.Debug("splitting: requesting = " + numSplits);
                    InputSplit[] splits = format.GetSplits(job, numSplits);
                    Log.Debug("splitting: got =        " + splits.Length);
                    if (length == 0)
                    {
                        // empty files still yield exactly one zero-length split
                        NUnit.Framework.Assert.AreEqual("Files of length 0 are not returned from FileInputFormat.getSplits()."
                                                        , 1, splits.Length);
                        NUnit.Framework.Assert.AreEqual("Empty file length == 0", 0, splits[0].GetLength(
                                                            ));
                    }
                    // check each split
                    BitSet bits = new BitSet(length);
                    for (int j = 0; j < splits.Length; j++)
                    {
                        Log.Debug("split[" + j + "]= " + splits[j]);
                        RecordReader <LongWritable, Text> reader = format.GetRecordReader(splits[j], job,
                                                                                          reporter);
                        try
                        {
                            int count = 0;
                            while (reader.Next(key, value))
                            {
                                // each line holds its own index, so the value identifies the line
                                int v = System.Convert.ToInt32(value.ToString());
                                Log.Debug("read " + v);
                                if (bits.Get(v))
                                {
                                    Log.Warn("conflict with " + v + " in split " + j + " at position " + reader.GetPos
                                                 ());
                                }
                                NUnit.Framework.Assert.IsFalse("Key in multiple partitions.", bits.Get(v));
                                bits.Set(v);
                                count++;
                            }
                            Log.Debug("splits[" + j + "]=" + splits[j] + " count=" + count);
                        }
                        finally
                        {
                            reader.Close();
                        }
                    }
                    // every line index must have been seen exactly once
                    NUnit.Framework.Assert.AreEqual("Some keys in no partition.", length, bits.Cardinality
                                                        ());
                }
            }
        }
        /// <summary>
        /// Same exhaustive split/read check as TestFormat, but through a
        /// splittable compression codec (BZip2): writes compressed files of
        /// varying length, splits them several ways, and verifies via a BitSet
        /// that every line is read exactly once across all splits.
        /// </summary>
        /// <exception cref="System.IO.IOException"/>
        public virtual void TestSplitableCodecs()
        {
            JobConf conf = new JobConf(defaultConf);
            int     seed = new Random().Next();
            // Create the codec
            CompressionCodec codec = null;

            try
            {
                codec = (CompressionCodec)ReflectionUtils.NewInstance(conf.GetClassByName("org.apache.hadoop.io.compress.BZip2Codec"
                                                                                          ), conf);
            }
            catch (TypeLoadException)
            {
                throw new IOException("Illegal codec!");
            }
            Path file = new Path(workDir, "test" + codec.GetDefaultExtension());
            // A reporter that does nothing
            Reporter reporter = Reporter.Null;

            // log the seed so a failing run can be reproduced
            Log.Info("seed = " + seed);
            Random     random  = new Random(seed);
            FileSystem localFs = FileSystem.GetLocal(conf);

            localFs.Delete(workDir, true);
            FileInputFormat.SetInputPaths(conf, workDir);
            int MaxLength = 500000;

            // for a variety of lengths
            for (int length = MaxLength / 2; length < MaxLength; length += random.Next(MaxLength
                                                                                       / 4) + 1)
            {
                Log.Info("creating; entries = " + length);
                // create a file with length entries
                TextWriter writer = new OutputStreamWriter(codec.CreateOutputStream(localFs.Create
                                                                                        (file)));
                try
                {
                    for (int i = 0; i < length; i++)
                    {
                        writer.Write(Sharpen.Extensions.ToString(i));
                        writer.Write("\n");
                    }
                }
                finally
                {
                    writer.Close();
                }
                // try splitting the file in a variety of sizes
                TextInputFormat format = new TextInputFormat();
                format.Configure(conf);
                LongWritable key   = new LongWritable();
                Text         value = new Text();
                for (int i_1 = 0; i_1 < 3; i_1++)
                {
                    int numSplits = random.Next(MaxLength / 2000) + 1;
                    Log.Info("splitting: requesting = " + numSplits);
                    InputSplit[] splits = format.GetSplits(conf, numSplits);
                    Log.Info("splitting: got =        " + splits.Length);
                    // check each split
                    BitSet bits = new BitSet(length);
                    for (int j = 0; j < splits.Length; j++)
                    {
                        Log.Debug("split[" + j + "]= " + splits[j]);
                        RecordReader <LongWritable, Text> reader = format.GetRecordReader(splits[j], conf,
                                                                                          reporter);
                        try
                        {
                            int counter = 0;
                            while (reader.Next(key, value))
                            {
                                // each line holds its own index, so the value identifies the line
                                int v = System.Convert.ToInt32(value.ToString());
                                Log.Debug("read " + v);
                                if (bits.Get(v))
                                {
                                    Log.Warn("conflict with " + v + " in split " + j + " at position " + reader.GetPos
                                                 ());
                                }
                                NUnit.Framework.Assert.IsFalse("Key in multiple partitions.", bits.Get(v));
                                bits.Set(v);
                                counter++;
                            }
                            if (counter > 0)
                            {
                                Log.Info("splits[" + j + "]=" + splits[j] + " count=" + counter);
                            }
                            else
                            {
                                Log.Debug("splits[" + j + "]=" + splits[j] + " count=" + counter);
                            }
                        }
                        finally
                        {
                            reader.Close();
                        }
                    }
                    // every line index must have been seen exactly once
                    NUnit.Framework.Assert.AreEqual("Some keys in no partition.", length, bits.Cardinality
                                                        ());
                }
            }
        }