Beispiel #1
0
        /// <summary>
        /// Builds a line-oriented record writer for the given task attempt.
        /// </summary>
        /// <remarks>
        /// The key/value separator is read from the configuration entry named by
        /// <c>Seperator</c> and falls back to a tab. When output compression is
        /// enabled, the configured codec (Gzip when none is configured) supplies
        /// the file extension and wraps the raw output stream.
        /// </remarks>
        /// <param name="job">Task attempt whose configuration and work file are used.</param>
        /// <returns>A <c>LineRecordWriter</c> writing to the task's default work file.</returns>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.Exception"/>
        public override RecordWriter <K, V> GetRecordWriter(TaskAttemptContext job)
        {
            Configuration configuration = job.GetConfiguration();
            bool          compress      = GetCompressOutput(job);
            string        separator     = configuration.Get(Seperator, "\t");

            // The codec is only instantiated when compression was requested.
            CompressionCodec compressor = null;
            string           suffix     = string.Empty;
            if (compress)
            {
                Type compressorType = GetOutputCompressorClass(job, typeof(GzipCodec));
                compressor = (CompressionCodec)ReflectionUtils.NewInstance(compressorType, configuration);
                suffix     = compressor.GetDefaultExtension();
            }

            // Both the plain and the compressed path write to the same work file,
            // created with the second argument set to false (presumably "do not
            // overwrite" -- confirm against FileSystem.Create).
            Path               target     = GetDefaultWorkFile(job, suffix);
            FileSystem         fileSystem = target.GetFileSystem(configuration);
            FSDataOutputStream rawOut     = fileSystem.Create(target, false);

            if (compress)
            {
                DataOutputStream compressedOut = new DataOutputStream(compressor.CreateOutputStream(rawOut));
                return new TextOutputFormat.LineRecordWriter <K, V>(compressedOut, separator);
            }
            return new TextOutputFormat.LineRecordWriter <K, V>(rawOut, separator);
        }
        /// <summary>
        /// Builds a line-oriented record writer for the task output called
        /// <paramref name="name"/>.
        /// </summary>
        /// <remarks>
        /// The key/value separator is read from
        /// <c>mapreduce.output.textoutputformat.separator</c> and falls back to a tab.
        /// When output compression is enabled, the configured codec (Gzip when none
        /// is configured) contributes its default extension to the file name and
        /// wraps the raw output stream.
        /// </remarks>
        /// <param name="ignored">Not used; the file system is resolved from the output path.</param>
        /// <param name="job">Job configuration supplying the separator, codec and output location.</param>
        /// <param name="name">Base name of the task output file.</param>
        /// <param name="progress">Progress callback handed to the file system when the file is created.</param>
        /// <returns>A <c>LineRecordWriter</c> writing to the task output path.</returns>
        /// <exception cref="System.IO.IOException"/>
        public override RecordWriter <K, V> GetRecordWriter(FileSystem ignored, JobConf job
                                                            , string name, Progressable progress)
        {
            bool   isCompressed      = GetCompressOutput(job);
            string keyValueSeparator = job.Get("mapreduce.output.textoutputformat.separator",
                                               "\t");

            if (!isCompressed)
            {
                Path               file    = FileOutputFormat.GetTaskOutputPath(job, name);
                FileSystem         fs      = file.GetFileSystem(job);
                FSDataOutputStream fileOut = fs.Create(file, progress);
                return(new TextOutputFormat.LineRecordWriter <K, V>(fileOut, keyValueSeparator));
            }
            else
            {
                Type codecClass = GetOutputCompressorClass(job, typeof(GzipCodec));
                // Create the named codec. The codec type is only known at run time
                // (a Type object), so the reflectively created instance is cast
                // explicitly to CompressionCodec.
                CompressionCodec codec = (CompressionCodec)ReflectionUtils.NewInstance(codecClass, job);
                // build the filename including the extension
                Path file = FileOutputFormat.GetTaskOutputPath(job, name + codec.GetDefaultExtension
                                                                   ());
                FileSystem         fs      = file.GetFileSystem(job);
                FSDataOutputStream fileOut = fs.Create(file, progress);
                return(new TextOutputFormat.LineRecordWriter <K, V>(new DataOutputStream(codec.CreateOutputStream
                                                                                             (fileOut)), keyValueSeparator));
            }
        }
Beispiel #3
0
        /// <summary>
        /// Checks that <c>KeyValueTextInputFormat</c> splits a BZip2-compressed file
        /// so that every record is read exactly once across all splits.
        /// </summary>
        /// <remarks>
        /// For a series of randomly growing entry counts the test writes
        /// tab-separated <c>2*i</c> / <c>i</c> lines through the codec, asks the
        /// format for its splits three times, and reads every split back. Each
        /// key must be even, each value must equal half its key, and no value may
        /// be seen twice or be missing.
        /// </remarks>
        /// <exception cref="System.IO.IOException">
        /// Thrown, among other I/O failures, when the codec class cannot be loaded.
        /// </exception>
        public virtual void TestSplitableCodecs()
        {
            Job           job  = Job.GetInstance(defaultConf);
            Configuration conf = job.GetConfiguration();
            // Create the codec
            CompressionCodec codec = null;

            try
            {
                codec = (CompressionCodec)ReflectionUtils.NewInstance(conf.GetClassByName("org.apache.hadoop.io.compress.BZip2Codec"
                                                                                          ), conf);
            }
            catch (TypeLoadException)
            {
                throw new IOException("Illegal codec!");
            }
            Path file = new Path(workDir, "test" + codec.GetDefaultExtension());
            // The seed is logged so that a failing run can be reproduced.
            int  seed = new Random().Next();

            Log.Info("seed = " + seed);
            Random random = new Random(seed);

            localFs.Delete(workDir, true);
            FileInputFormat.SetInputPaths(job, workDir);
            int MaxLength = 500000;

            FileInputFormat.SetMaxInputSplitSize(job, MaxLength / 20);
            // for a variety of lengths
            for (int length = 0; length < MaxLength; length += random.Next(MaxLength / 4) + 1)
            {
                Log.Info("creating; entries = " + length);
                // create a file with length entries
                TextWriter writer = new OutputStreamWriter(codec.CreateOutputStream(localFs.Create
                                                                                        (file)));
                try
                {
                    for (int i = 0; i < length; i++)
                    {
                        // key is 2*i, value is i; the reader loop below relies on this relation
                        writer.Write(Sharpen.Extensions.ToString(i * 2));
                        writer.Write("\t");
                        writer.Write(Sharpen.Extensions.ToString(i));
                        writer.Write("\n");
                    }
                }
                finally
                {
                    writer.Close();
                }
                // try splitting the file in a variety of sizes
                KeyValueTextInputFormat format = new KeyValueTextInputFormat();
                NUnit.Framework.Assert.IsTrue("KVTIF claims not splittable", format.IsSplitable(job
                                                                                                , file));
                for (int i_1 = 0; i_1 < 3; i_1++)
                {
                    // numSplits is only logged: GetSplits(job) takes no split count.
                    // The random draw is kept so that the sequence of generated
                    // lengths for a given seed stays the same.
                    int numSplits = random.Next(MaxLength / 2000) + 1;
                    Log.Info("splitting: requesting = " + numSplits);
                    IList <InputSplit> splits = format.GetSplits(job);
                    Log.Info("splitting: got =        " + splits.Count);
                    // check each split; bit v is set once value v has been read
                    BitSet bits = new BitSet(length);
                    for (int j = 0; j < splits.Count; j++)
                    {
                        Log.Debug("split[" + j + "]= " + splits[j]);
                        TaskAttemptContext context = MapReduceTestUtil.CreateDummyMapTaskAttemptContext(job
                                                                                                        .GetConfiguration());
                        RecordReader <Text, Text> reader = format.CreateRecordReader(splits[j], context);
                        MapContext <Text, Text, Text, Text> mcontext = new MapContextImpl <Text, Text, Text
                                                                                           , Text>(job.GetConfiguration(), context.GetTaskAttemptID(), reader, null, null,
                                                                                                   MapReduceTestUtil.CreateDummyReporter(), splits[j]);
                        reader.Initialize(splits[j], mcontext);
                        Text key   = null;
                        Text value = null;
                        try
                        {
                            int count = 0;
                            while (reader.NextKeyValue())
                            {
                                key   = reader.GetCurrentKey();
                                value = reader.GetCurrentValue();
                                int k = System.Convert.ToInt32(key.ToString());
                                int v = System.Convert.ToInt32(value.ToString());
                                NUnit.Framework.Assert.AreEqual("Bad key", 0, k % 2);
                                NUnit.Framework.Assert.AreEqual("Mismatched key/value", k / 2, v);
                                Log.Debug("read " + k + "," + v);
                                NUnit.Framework.Assert.IsFalse(k + "," + v + " in multiple partitions.", bits.Get
                                                                   (v));
                                bits.Set(v);
                                count++;
                            }
                            // Non-empty splits are reported at info level, empty ones at debug level.
                            if (count > 0)
                            {
                                Log.Info("splits[" + j + "]=" + splits[j] + " count=" + count);
                            }
                            else
                            {
                                Log.Debug("splits[" + j + "]=" + splits[j] + " count=" + count);
                            }
                        }
                        finally
                        {
                            reader.Close();
                        }
                    }
                    // Every one of the length values written must have been read from some split.
                    NUnit.Framework.Assert.AreEqual("Some keys in no partition.", length, bits.Cardinality
                                                        ());
                }
            }
        }
        /// <summary>
        /// Checks that <c>TextInputFormat</c> splits a BZip2-compressed file so that
        /// every line is read exactly once across all splits.
        /// </summary>
        /// <remarks>
        /// Starting at half the maximum entry count and growing by random steps, the
        /// test writes the numbers <c>0..length-1</c> one per line through the codec,
        /// requests a random number of splits three times, and reads every split
        /// back while tracking the values seen in a bit set.
        /// </remarks>
        /// <exception cref="System.IO.IOException"/>
        public virtual void TestSplitableCodecs()
        {
            JobConf jobConf    = new JobConf(defaultConf);
            int     randomSeed = new Random().Next();

            // Instantiate the codec by class name; a missing class is reported as an I/O error.
            CompressionCodec bzip2;
            try
            {
                bzip2 = (CompressionCodec)ReflectionUtils.NewInstance(
                    jobConf.GetClassByName("org.apache.hadoop.io.compress.BZip2Codec"), jobConf);
            }
            catch (TypeLoadException)
            {
                throw new IOException("Illegal codec!");
            }
            Path dataFile = new Path(workDir, "test" + bzip2.GetDefaultExtension());
            // A reporter that does nothing
            Reporter nullReporter = Reporter.Null;

            // The seed is logged so that a failing run can be reproduced.
            Log.Info("seed = " + randomSeed);
            Random     rng     = new Random(randomSeed);
            FileSystem localFs = FileSystem.GetLocal(jobConf);

            localFs.Delete(workDir, true);
            FileInputFormat.SetInputPaths(jobConf, workDir);
            const int maxEntries = 500000;

            // Try a variety of file lengths, growing by a random step each round.
            int entries = maxEntries / 2;
            while (entries < maxEntries)
            {
                Log.Info("creating; entries = " + entries);
                // Write one number per line through the compression codec.
                TextWriter output = new OutputStreamWriter(bzip2.CreateOutputStream(localFs.Create(dataFile)));
                try
                {
                    for (int line = 0; line < entries; line++)
                    {
                        output.Write(Sharpen.Extensions.ToString(line));
                        output.Write("\n");
                    }
                }
                finally
                {
                    output.Close();
                }

                // Split the file several times with different requested split counts.
                TextInputFormat inputFormat = new TextInputFormat();
                inputFormat.Configure(jobConf);
                LongWritable offsetKey = new LongWritable();
                Text         lineText  = new Text();
                for (int attempt = 0; attempt < 3; attempt++)
                {
                    int requested = rng.Next(maxEntries / 2000) + 1;
                    Log.Info("splitting: requesting = " + requested);
                    InputSplit[] parts = inputFormat.GetSplits(jobConf, requested);
                    Log.Info("splitting: got =        " + parts.Length);

                    // Bit n is set once the line holding n has been read from some split.
                    BitSet seen = new BitSet(entries);
                    for (int p = 0; p < parts.Length; p++)
                    {
                        Log.Debug("split[" + p + "]= " + parts[p]);
                        RecordReader <LongWritable, Text> records =
                            inputFormat.GetRecordReader(parts[p], jobConf, nullReporter);
                        try
                        {
                            int recordCount = 0;
                            while (records.Next(offsetKey, lineText))
                            {
                                int number = System.Convert.ToInt32(lineText.ToString());
                                Log.Debug("read " + number);
                                // Log where the duplicate was found before the assertion fails.
                                bool duplicate = seen.Get(number);
                                if (duplicate)
                                {
                                    Log.Warn("conflict with " + number + " in split " + p + " at position " + records.GetPos());
                                }
                                NUnit.Framework.Assert.IsFalse("Key in multiple partitions.", duplicate);
                                seen.Set(number);
                                recordCount++;
                            }

                            // Non-empty splits are reported at info level, empty ones at debug level.
                            string summary = "splits[" + p + "]=" + parts[p] + " count=" + recordCount;
                            if (recordCount > 0)
                            {
                                Log.Info(summary);
                            }
                            else
                            {
                                Log.Debug(summary);
                            }
                        }
                        finally
                        {
                            records.Close();
                        }
                    }
                    // Every written line must have been read from some split.
                    NUnit.Framework.Assert.AreEqual("Some keys in no partition.", entries, seen.Cardinality());
                }

                entries += rng.Next(maxEntries / 4) + 1;
            }
        }