コード例 #1
        // this tests both files (testCompressThenConcat, testConcatThenCompress);
        // all should work with either native zlib or new Inflater-based decoder
        // NOTE(review): block braces and the closing ");" of several statements are
        // missing from this listing (apparently lost in extraction); code is left as found.
        /// <summary>
        /// Reads the two gzip splits produced for <paramref name="jConf"/> and checks
        /// that each one yields 84 lines with the expected text at lines 0 and 42.
        /// </summary>
        /// <param name="jConf">Job configuration passed to GetSplits and ReadSplit.</param>
        /// <exception cref="System.IO.IOException"/>
        private static void DoSingleGzipBufferSize(JobConf jConf)
            TextInputFormat format = new TextInputFormat();

            // here's Nth pair of DecompressorStreams:
            InputSplit[] splits = format.GetSplits(jConf, 100);
            // NOTE(review): the message is passed as the first argument (JUnit order);
            // confirm the Assert implementation in use accepts that.
            NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);
            FileSplit tmp = (FileSplit)splits[0];

            // Normalize the order: if testCompressThenConcat.txt.gz came back first,
            // swap so that it ends up in splits[1].
            if (tmp.GetPath().GetName().Equals("testCompressThenConcat.txt.gz"))
                System.Console.Out.WriteLine("  (swapping)");
                splits[0] = splits[1];
                splits[1] = tmp;
            // First split: line count plus two spot-checked lines.
            IList <Text> results = ReadSplit(format, splits[0], jConf);

            NUnit.Framework.Assert.AreEqual("splits[0] length (num lines)", 84, results.Count
            NUnit.Framework.Assert.AreEqual("splits[0][0]", "Call me Ishmael. Some years ago--never mind how long precisely--having"
                                            , results[0].ToString());
            NUnit.Framework.Assert.AreEqual("splits[0][42]", "Tell me, does the magnetic virtue of the needles of the compasses of"
                                            , results[42].ToString());
            // Second split: the same three checks, with identical expected values.
            results = ReadSplit(format, splits[1], jConf);
            NUnit.Framework.Assert.AreEqual("splits[1] length (num lines)", 84, results.Count
            NUnit.Framework.Assert.AreEqual("splits[1][0]", "Call me Ishmael. Some years ago--never mind how long precisely--having"
                                            , results[0].ToString());
            NUnit.Framework.Assert.AreEqual("splits[1][42]", "Tell me, does the magnetic virtue of the needles of the compasses of"
                                            , results[42].ToString());
コード例 #2
        /// <summary>
        /// Creates ten "part-0000N" inputs plus "_meta" and "_temp" under /foo/ and
        /// checks that, after computing splits, the job reports ten input files.
        /// </summary>
        /// <exception cref="System.Exception"/>
        public virtual void TestNumInputs()
            // NOTE(review): method and loop braces are missing from this listing.
            JobConf job = new JobConf(conf);

            dfs = NewDFSCluster(job);
            FileSystem fs = dfs.GetFileSystem();

            System.Console.Out.WriteLine("FileSystem " + fs.GetUri());
            Path   inputDir     = new Path("/foo/");
            int    numFiles     = 10;
            string fileNameBase = "part-0000";

            // Regular inputs: part-00000 .. part-00009.
            for (int i = 0; i < numFiles; ++i)
                CreateInputs(fs, inputDir, fileNameBase + i.ToString());
            // Two extra entries with a leading underscore; the assertion below still
            // expects numFiles, i.e. these are not expected to be counted
            // (presumably filtered out as hidden files - confirm).
            CreateInputs(fs, inputDir, "_meta");
            CreateInputs(fs, inputDir, "_temp");
            // split it using a file input format
            TextInputFormat.AddInputPath(job, inputDir);
            TextInputFormat inFormat = new TextInputFormat();

            // The returned splits are not inspected; the call is made so that the
            // NumInputFiles value read below is populated on the job.
            InputSplit[] splits = inFormat.GetSplits(job, 1);
            NUnit.Framework.Assert.AreEqual("Expected value of " + FileInputFormat.NumInputFiles
                                            , numFiles, job.GetLong(FileInputFormat.NumInputFiles, 0));
コード例 #3
        /// <summary>Test using the gzip codec for reading</summary>
        /// <remarks>
        /// Writes two gzip files into workDir, expects one split per file, and checks
        /// the number of lines and selected line contents read back from each split.
        /// </remarks>
        /// <exception cref="System.IO.IOException"/>
        public virtual void TestGzip()
            // NOTE(review): braces and the closing ");" of several statements are
            // missing from this listing; code is left as found.
            JobConf          job  = new JobConf(defaultConf);
            CompressionCodec gzip = new GzipCodec();

            ReflectionUtils.SetConf(gzip, job);
            // Start from an empty work directory (recursive delete).
            localFs.Delete(workDir, true);
            // part1 holds six lines, part2 holds two lines.
            WriteFile(localFs, new Path(workDir, "part1.txt.gz"), gzip, "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n"
            WriteFile(localFs, new Path(workDir, "part2.txt.gz"), gzip, "this is a test\nof gzip\n"
            FileInputFormat.SetInputPaths(job, workDir);
            TextInputFormat format = new TextInputFormat();

            InputSplit[] splits = format.GetSplits(job, 100);
            NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);
            FileSplit tmp = (FileSplit)splits[0];

            // Normalize the order so that part2.txt.gz is always splits[1].
            if (tmp.GetPath().GetName().Equals("part2.txt.gz"))
                splits[0] = splits[1];
                splits[1] = tmp;
            IList <Text> results = ReadSplit(format, splits[0], job);

            NUnit.Framework.Assert.AreEqual("splits[0] length", 6, results.Count);
            NUnit.Framework.Assert.AreEqual("splits[0][5]", " dog", results[5].ToString());
            results = ReadSplit(format, splits[1], job);
            NUnit.Framework.Assert.AreEqual("splits[1] length", 2, results.Count);
            NUnit.Framework.Assert.AreEqual("splits[1][0]", "this is a test", results[0].ToString
            NUnit.Framework.Assert.AreEqual("splits[1][1]", "of gzip", results[1].ToString());
コード例 #4
        /// <summary>
        /// Copies a prebuilt concatenated bzip2 file into workDir, writes a second
        /// bzip2 file next to it, and checks the lines read back from the two splits.
        /// </summary>
        public virtual void TestBzip2()
            // NOTE(review): braces and the closing ");" of several statements are
            // missing from this listing; code is left as found.
            JobConf          jobConf = new JobConf(defaultConf);
            CompressionCodec bzip2   = new BZip2Codec();

            ReflectionUtils.SetConf(bzip2, jobConf);
            localFs.Delete(workDir, true);
            System.Console.Out.WriteLine(ColorBrCyan + "testBzip2() using non-native CBZip2InputStream (presumably)"
                                         + ColorNormal);
            // copy prebuilt (correct!) version of concat.bz2 to HDFS
            // The source directory comes from the "test.concat.data" property,
            // falling back to "/tmp".
            string fn      = "concat" + bzip2.GetDefaultExtension();
            Path   fnLocal = new Path(Runtime.GetProperty("test.concat.data", "/tmp"), fn);
            Path   fnHDFS  = new Path(workDir, fn);

            localFs.CopyFromLocalFile(fnLocal, fnHDFS);
            WriteFile(localFs, new Path(workDir, "part2.txt.bz2"), bzip2, "this is a test\nof bzip2\n"
            FileInputFormat.SetInputPaths(jobConf, workDir);
            TextInputFormat format = new TextInputFormat();

            // extends FileInputFormat
            // work around 2-byte splits issue
            // [135 splits for a 208-byte file and a 62-byte file(!)]
            InputSplit[] splits = format.GetSplits(jobConf, 100);
            NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);
            FileSplit tmp = (FileSplit)splits[0];

            // Normalize the order so that part2.txt.bz2 is always splits[1].
            if (tmp.GetPath().GetName().Equals("part2.txt.bz2"))
                splits[0] = splits[1];
                splits[1] = tmp;
            // splits[0] is then the prebuilt concat file: six lines, the last one "member #3".
            IList <Text> results = ReadSplit(format, splits[0], jobConf);

            NUnit.Framework.Assert.AreEqual("splits[0] num lines", 6, results.Count);
            NUnit.Framework.Assert.AreEqual("splits[0][5]", "member #3", results[5].ToString(
            results = ReadSplit(format, splits[1], jobConf);
            NUnit.Framework.Assert.AreEqual("splits[1] num lines", 2, results.Count);
            NUnit.Framework.Assert.AreEqual("splits[1][0]", "this is a test", results[0].ToString
            NUnit.Framework.Assert.AreEqual("splits[1][1]", "of bzip2", results[1].ToString()
コード例 #5
        /// <summary>
        /// Creates a single input file, splits it, and checks that every split lines
        /// up with the file's block at the same index (offset, length and the two hosts).
        /// </summary>
        /// <exception cref="System.Exception"/>
        public virtual void TestLocality()
            // NOTE(review): braces are missing and two statements are cut off at the
            // right margin in this listing; code is left as found.
            JobConf job = new JobConf(conf);

            dfs = NewDFSCluster(job);
            FileSystem fs = dfs.GetFileSystem();

            System.Console.Out.WriteLine("FileSystem " + fs.GetUri());
            Path   inputDir = new Path("/foo/");
            string fileName = "part-0000";

            CreateInputs(fs, inputDir, fileName);
            // split it using a file input format
            TextInputFormat.AddInputPath(job, inputDir);
            TextInputFormat inFormat = new TextInputFormat();

            InputSplit[] splits     = inFormat.GetSplits(job, 1);
            FileStatus   fileStatus = fs.GetFileStatus(new Path(inputDir, fileName));

            // Block locations for the file starting at offset 0 (length argument truncated here).
            BlockLocation[] locations = fs.GetFileBlockLocations(fileStatus, 0, fileStatus.GetLen
            System.Console.Out.WriteLine("Made splits");
            // make sure that each split is a block and the locations match
            for (int i = 0; i < splits.Length; ++i)
                FileSplit fileSplit = (FileSplit)splits[i];
                System.Console.Out.WriteLine("File split: " + fileSplit);
                foreach (string h in fileSplit.GetLocations())
                    System.Console.Out.WriteLine("Location: " + h);
                System.Console.Out.WriteLine("Block: " + locations[i]);
                // Split i must start where block i starts and cover the same length.
                NUnit.Framework.Assert.AreEqual(locations[i].GetOffset(), fileSplit.GetStart());
                NUnit.Framework.Assert.AreEqual(locations[i].GetLength(), fileSplit.GetLength());
                string[] blockLocs = locations[i].GetHosts();
                string[] splitLocs = fileSplit.GetLocations();
                // Exactly two hosts are expected on both sides.
                NUnit.Framework.Assert.AreEqual(2, blockLocs.Length);
                NUnit.Framework.Assert.AreEqual(2, splitLocs.Length);
                // The two host lists must contain the same hosts, in either order.
                NUnit.Framework.Assert.IsTrue((blockLocs[0].Equals(splitLocs[0]) && blockLocs[1].
                                               Equals(splitLocs[1])) || (blockLocs[1].Equals(splitLocs[0]) && blockLocs[0].Equals
            // Exactly one input file should have been recorded on the job.
            NUnit.Framework.Assert.AreEqual("Expected value of " + FileInputFormat.NumInputFiles
                                            , 1, job.GetLong(FileInputFormat.NumInputFiles, 0));
コード例 #6
        /// <summary>Test using the gzip codec and an empty input file</summary>
        /// <remarks>
        /// Expects the empty compressed file to still produce one split, and that
        /// split to yield zero records.
        /// </remarks>
        /// <exception cref="System.IO.IOException"/>
        public virtual void TestGzipEmpty()
            // NOTE(review): method braces and the closing ");" of the last statement
            // are missing from this listing; code is left as found.
            JobConf          job  = new JobConf(defaultConf);
            CompressionCodec gzip = new GzipCodec();

            ReflectionUtils.SetConf(gzip, job);
            localFs.Delete(workDir, true);
            // Write a gzip file whose uncompressed content is the empty string.
            WriteFile(localFs, new Path(workDir, "empty.gz"), gzip, string.Empty);
            FileInputFormat.SetInputPaths(job, workDir);
            TextInputFormat format = new TextInputFormat();

            InputSplit[] splits = format.GetSplits(job, 100);
            // NOTE(review): the message text describes the failure case, not the expectation.
            NUnit.Framework.Assert.AreEqual("Compressed files of length 0 are not returned from FileInputFormat.getSplits()."
                                            , 1, splits.Length);
            IList <Text> results = ReadSplit(format, splits[0], job);

            NUnit.Framework.Assert.AreEqual("Compressed empty file length == 0", 0, results.Count
コード例 #7
        /// <summary>
        /// Checks that computing splits for "test:///a1/a2" returns two splits and
        /// raises the mock file system's numListLocatedStatusCalls counter from 0 to 1.
        /// </summary>
        public virtual void TestListLocatedStatus()
            // NOTE(review): braces are missing and three statements are cut off at
            // the right margin in this listing; code is left as found.
            Configuration conf = GetConfiguration();

            conf.SetBoolean("fs.test.impl.disable.cache", false);
            // numThreads is defined outside this method.
            conf.SetInt(FileInputFormat.ListStatusNumThreads, numThreads);
            conf.Set(FileInputFormat.InputDir, "test:///a1/a2");
            // The expression that produces the mock file system is truncated here;
            // presumably it resolves the "test:" path against conf - confirm.
            TestFileInputFormat.MockFileSystem mockFs = (TestFileInputFormat.MockFileSystem) new
            // Baseline: no calls counted before GetSplits runs.
            NUnit.Framework.Assert.AreEqual("listLocatedStatus already called", 0, mockFs.numListLocatedStatusCalls
            JobConf         job             = new JobConf(conf);
            TextInputFormat fileInputFormat = new TextInputFormat();

            InputSplit[] splits = fileInputFormat.GetSplits(job, 1);
            NUnit.Framework.Assert.AreEqual("Input splits are not correct", 2, splits.Length);
            // Exactly one call is expected to have been counted by GetSplits.
            NUnit.Framework.Assert.AreEqual("listLocatedStatuss calls", 1, mockFs.numListLocatedStatusCalls
コード例 #8
        /// <summary>
        /// Computes splits for "test:///a1/a2" and checks that the first split reports
        /// two locations and two location-info entries, then picks out the entries
        /// for "localhost" and "otherhost".
        /// </summary>
        public virtual void TestSplitLocationInfo()
            // NOTE(review): method braces are missing, and the listing ends before any
            // assertion on localhostInfo / otherhostInfo; code is left as found.
            Configuration conf = GetConfiguration();

            conf.Set(FileInputFormat.InputDir, "test:///a1/a2");
            JobConf         job             = new JobConf(conf);
            TextInputFormat fileInputFormat = new TextInputFormat();

            FileSplit[] splits    = (FileSplit[])fileInputFormat.GetSplits(job, 1);
            string[]    locations = splits[0].GetLocations();
            NUnit.Framework.Assert.AreEqual(2, locations.Length);
            SplitLocationInfo[] locationInfo = splits[0].GetLocationInfo();
            NUnit.Framework.Assert.AreEqual(2, locationInfo.Length);
            // Select by the name in locations[0]; this relies on locationInfo using the
            // same ordering as locations (assumption - confirm).
            SplitLocationInfo localhostInfo = locations[0].Equals("localhost") ? locationInfo
                                              [0] : locationInfo[1];
            SplitLocationInfo otherhostInfo = locations[0].Equals("otherhost") ? locationInfo
                                              [0] : locationInfo[1];

コード例 #9
        /// <summary>
        /// Copies a prebuilt concatenated gzip file into workDir, writes a second gzip
        /// file next to it, and checks the lines read back from the two splits.
        /// </summary>
        public virtual void TestGzip()
            // NOTE(review): braces and the closing ");" of several statements are
            // missing from this listing; code is left as found.
            JobConf          jobConf = new JobConf(defaultConf);
            CompressionCodec gzip    = new GzipCodec();

            ReflectionUtils.SetConf(gzip, jobConf);
            localFs.Delete(workDir, true);
            // preferred, but not compatible with Apache/trunk instance of Hudson:
            // NOTE(review): the " * " lines below look like the interior of a block
            // comment (original Java code) whose opening and closing delimiters are
            // missing from this listing; they are left untouched.

             * assertFalse("[native (C/C++) codec]",
             * (org.apache.hadoop.io.compress.zlib.BuiltInGzipDecompressor.class ==
             * gzip.getDecompressorType()) );
             * System.out.println(COLOR_BR_RED +
             * "testGzip() using native-zlib Decompressor (" +
             * gzip.getDecompressorType() + ")" + COLOR_NORMAL);
            // alternative:
            // NOTE(review): as listed, the printed text says "using native-zlib" for
            // BuiltInGzipDecompressor and is followed directly by a "skipped" warning;
            // an else branch and an early return were presumably lost - confirm.
            if (typeof(BuiltInGzipDecompressor) == gzip.GetDecompressorType())
                System.Console.Out.WriteLine(ColorBrRed + "testGzip() using native-zlib Decompressor ("
                                             + gzip.GetDecompressorType() + ")" + ColorNormal);
                Log.Warn("testGzip() skipped:  native (C/C++) libs not loaded");

             *      // THIS IS BUGGY: omits 2nd/3rd gzip headers; screws up 2nd/3rd CRCs--
             *      //                see https://issues.apache.org/jira/browse/HADOOP-6799
             *  Path fnHDFS = new Path(workDir, "concat" + gzip.getDefaultExtension());
             *  //OutputStream out = localFs.create(fnHDFS);
             *  //GzipCodec.GzipOutputStream gzOStm = new GzipCodec.GzipOutputStream(out);
             *      // can just combine those two lines, probably
             *  //GzipCodec.GzipOutputStream gzOStm =
             *  //  new GzipCodec.GzipOutputStream(localFs.create(fnHDFS));
             *      // oops, no:  this is a protected helper class; need to access
             *      //   it via createOutputStream() instead:
             *  OutputStream out = localFs.create(fnHDFS);
             *  Compressor gzCmp = gzip.createCompressor();
             *  CompressionOutputStream gzOStm = gzip.createOutputStream(out, gzCmp);
             *      // this SHOULD be going to HDFS:  got out from localFs == HDFS
             *      //   ...yup, works
             *  gzOStm.write("first gzip concat\n member\nwith three lines\n".getBytes());
             *  gzOStm.finish();
             *  gzOStm.resetState();
             *  gzOStm.write("2nd gzip concat member\n".getBytes());
             *  gzOStm.finish();
             *  gzOStm.resetState();
             *  gzOStm.write("gzip concat\nmember #3\n".getBytes());
             *  gzOStm.close();
             *      //
             *  String fn = "hdfs-to-local-concat" + gzip.getDefaultExtension();
             *  Path fnLocal = new Path(System.getProperty("test.concat.data","/tmp"), fn);
             *  localFs.copyToLocalFile(fnHDFS, fnLocal);
            // copy prebuilt (correct!) version of concat.gz to HDFS
            // The source directory comes from the "test.concat.data" property,
            // falling back to "/tmp".
            string fn      = "concat" + gzip.GetDefaultExtension();
            Path   fnLocal = new Path(Runtime.GetProperty("test.concat.data", "/tmp"), fn);
            Path   fnHDFS  = new Path(workDir, fn);

            localFs.CopyFromLocalFile(fnLocal, fnHDFS);
            WriteFile(localFs, new Path(workDir, "part2.txt.gz"), gzip, "this is a test\nof gzip\n"
            FileInputFormat.SetInputPaths(jobConf, workDir);
            TextInputFormat format = new TextInputFormat();

            InputSplit[] splits = format.GetSplits(jobConf, 100);
            NUnit.Framework.Assert.AreEqual("compressed splits == 2", 2, splits.Length);
            FileSplit tmp = (FileSplit)splits[0];

            // Normalize the order so that part2.txt.gz is always splits[1].
            if (tmp.GetPath().GetName().Equals("part2.txt.gz"))
                splits[0] = splits[1];
                splits[1] = tmp;
            // splits[0] is then the prebuilt concat file: six lines, the last one "member #3".
            IList <Text> results = ReadSplit(format, splits[0], jobConf);

            NUnit.Framework.Assert.AreEqual("splits[0] num lines", 6, results.Count);
            NUnit.Framework.Assert.AreEqual("splits[0][5]", "member #3", results[5].ToString(
            results = ReadSplit(format, splits[1], jobConf);
            NUnit.Framework.Assert.AreEqual("splits[1] num lines", 2, results.Count);
            NUnit.Framework.Assert.AreEqual("splits[1][0]", "this is a test", results[0].ToString
            NUnit.Framework.Assert.AreEqual("splits[1][1]", "of gzip", results[1].ToString());
コード例 #10
        /// <summary>
        /// Randomized test: for a range of entry counts, requests a random number of
        /// splits three times and checks that no numeric record is read from more
        /// than one split and that the number of distinct records equals the entry count.
        /// </summary>
        /// <exception cref="System.Exception"/>
        public virtual void TestFormat()
            // NOTE(review): braces are missing, several statements are cut off at the
            // right margin, and some loop bodies are absent from this listing (e.g. the
            // writes inside the file-creation loop, and any statement that sets a bit in
            // "bits" or increments "count"); code is left as found.
            JobConf job  = new JobConf(defaultConf);
            Path    file = new Path(workDir, "test.txt");
            // A reporter that does nothing
            Reporter reporter = Reporter.Null;
            // The seed is logged so that a failing run can be reproduced.
            int      seed     = new Random().Next();

            Log.Info("seed = " + seed);
            Random random = new Random(seed);

            localFs.Delete(workDir, true);
            FileInputFormat.SetInputPaths(job, workDir);
            // for a variety of lengths
            for (int length = 0; length < MaxLength; length += random.Next(MaxLength / 10) +
                Log.Debug("creating; entries = " + length);
                // create a file with length entries
                TextWriter writer = new OutputStreamWriter(localFs.Create(file));
                    for (int i = 0; i < length; i++)
                // try splitting the file in a variety of sizes
                TextInputFormat format = new TextInputFormat();
                LongWritable key   = new LongWritable();
                Text         value = new Text();
                // Three rounds, each with a fresh random requested split count (at least 1).
                for (int i_1 = 0; i_1 < 3; i_1++)
                    int numSplits = random.Next(MaxLength / 20) + 1;
                    Log.Debug("splitting: requesting = " + numSplits);
                    InputSplit[] splits = format.GetSplits(job, numSplits);
                    Log.Debug("splitting: got =        " + splits.Length);
                    // An empty file must still yield exactly one split, of length 0.
                    if (length == 0)
                        NUnit.Framework.Assert.AreEqual("Files of length 0 are not returned from FileInputFormat.getSplits()."
                                                        , 1, splits.Length);
                        NUnit.Framework.Assert.AreEqual("Empty file length == 0", 0, splits[0].GetLength(
                    // check each split
                    // "bits" is queried per record value v to detect values seen before.
                    BitSet bits = new BitSet(length);
                    for (int j = 0; j < splits.Length; j++)
                        Log.Debug("split[" + j + "]= " + splits[j]);
                        RecordReader <LongWritable, Text> reader = format.GetRecordReader(splits[j], job,
                            int count = 0;
                            while (reader.Next(key, value))
                                // Each record's text is parsed as an integer.
                                int v = System.Convert.ToInt32(value.ToString());
                                Log.Debug("read " + v);
                                // Log the details first, then fail if v was already seen.
                                if (bits.Get(v))
                                    Log.Warn("conflict with " + v + " in split " + j + " at position " + reader.GetPos
                                NUnit.Framework.Assert.IsFalse("Key in multiple partitions.", bits.Get(v));
                            Log.Debug("splits[" + j + "]=" + splits[j] + " count=" + count);
                    // The number of bits set must equal the number of entries.
                    NUnit.Framework.Assert.AreEqual("Some keys in no partition.", length, bits.Cardinality
コード例 #11
        /// <summary>
        /// Randomized test using the BZip2 codec loaded by class name: for a range of
        /// entry counts, requests a random number of splits three times and checks that
        /// no numeric record is read from more than one split and that the number of
        /// distinct records equals the entry count.
        /// </summary>
        /// <exception cref="System.IO.IOException"/>
        public virtual void TestSplitableCodecs()
            // NOTE(review): braces are missing, several statements are cut off at the
            // right margin, and some keywords/statements are absent from this listing
            // (the "try" before the codec lookup, the writes in the file-creation loop,
            // any statement that sets a bit in "bits" or increments "counter", and
            // presumably an "else" between the two log calls near the end);
            // code is left as found.
            JobConf conf = new JobConf(defaultConf);
            // The seed is logged further down so that a failing run can be reproduced.
            int     seed = new Random().Next();
            // Create the codec
            CompressionCodec codec = null;

                codec = (CompressionCodec)ReflectionUtils.NewInstance(conf.GetClassByName("org.apache.hadoop.io.compress.BZip2Codec"
                                                                                          ), conf);
            // A failed type lookup is reported to the caller as an IOException.
            catch (TypeLoadException)
                throw new IOException("Illegal codec!");
            // The file name uses the codec's default extension.
            Path file = new Path(workDir, "test" + codec.GetDefaultExtension());
            // A reporter that does nothing
            Reporter reporter = Reporter.Null;

            Log.Info("seed = " + seed);
            Random     random  = new Random(seed);
            FileSystem localFs = FileSystem.GetLocal(conf);

            localFs.Delete(workDir, true);
            FileInputFormat.SetInputPaths(conf, workDir);
            // Local upper bound on the number of entries.
            int MaxLength = 500000;

            // for a variety of lengths
            for (int length = MaxLength / 2; length < MaxLength; length += random.Next(MaxLength
                                                                                       / 4) + 1)
                Log.Info("creating; entries = " + length);
                // create a file with length entries
                TextWriter writer = new OutputStreamWriter(codec.CreateOutputStream(localFs.Create
                    for (int i = 0; i < length; i++)
                // try splitting the file in a variety of sizes
                TextInputFormat format = new TextInputFormat();
                LongWritable key   = new LongWritable();
                Text         value = new Text();
                // Three rounds, each with a fresh random requested split count (at least 1).
                for (int i_1 = 0; i_1 < 3; i_1++)
                    int numSplits = random.Next(MaxLength / 2000) + 1;
                    Log.Info("splitting: requesting = " + numSplits);
                    InputSplit[] splits = format.GetSplits(conf, numSplits);
                    Log.Info("splitting: got =        " + splits.Length);
                    // check each split
                    // "bits" is queried per record value v to detect values seen before.
                    BitSet bits = new BitSet(length);
                    for (int j = 0; j < splits.Length; j++)
                        Log.Debug("split[" + j + "]= " + splits[j]);
                        RecordReader <LongWritable, Text> reader = format.GetRecordReader(splits[j], conf,
                            int counter = 0;
                            while (reader.Next(key, value))
                                // Each record's text is parsed as an integer.
                                int v = System.Convert.ToInt32(value.ToString());
                                Log.Debug("read " + v);
                                // Log the details first, then fail if v was already seen.
                                if (bits.Get(v))
                                    Log.Warn("conflict with " + v + " in split " + j + " at position " + reader.GetPos
                                NUnit.Framework.Assert.IsFalse("Key in multiple partitions.", bits.Get(v));
                            // Per-split summary, logged at Info level when counter > 0.
                            if (counter > 0)
                                Log.Info("splits[" + j + "]=" + splits[j] + " count=" + counter);
                                Log.Debug("splits[" + j + "]=" + splits[j] + " count=" + counter);
                    // The number of bits set must equal the number of entries.
                    NUnit.Framework.Assert.AreEqual("Some keys in no partition.", length, bits.Cardinality