/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.Exception"/>
public override RecordWriter<K, V> GetRecordWriter(TaskAttemptContext job)
{
	Configuration conf = job.GetConfiguration();
	bool isCompressed = GetCompressOutput(job);
	// "Seperator" mirrors Hadoop's (misspelled) SEPERATOR constant, keyed on
	// "mapreduce.output.textoutputformat.separator"; tab is the default.
	string keyValueSeparator = conf.Get(Seperator, "\t");
	CompressionCodec codec = null;
	string extension = string.Empty;
	if (isCompressed)
	{
		// Resolve the configured codec (gzip by default) and its file extension.
		Type codecClass = GetOutputCompressorClass(job, typeof(GzipCodec));
		codec = (CompressionCodec)ReflectionUtils.NewInstance(codecClass, conf);
		extension = codec.GetDefaultExtension();
	}
	Path file = GetDefaultWorkFile(job, extension);
	FileSystem fs = file.GetFileSystem(conf);
	if (!isCompressed)
	{
		FSDataOutputStream fileOut = fs.Create(file, false);
		return new TextOutputFormat.LineRecordWriter<K, V>(fileOut, keyValueSeparator);
	}
	else
	{
		// Wrap the raw file stream in the codec's compressing stream.
		FSDataOutputStream fileOut = fs.Create(file, false);
		return new TextOutputFormat.LineRecordWriter<K, V>(new DataOutputStream(codec.CreateOutputStream(fileOut)), keyValueSeparator);
	}
}
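// Usage sketch (illustrative, not from the original source): a minimal new-API
// driver setup that drives the GetRecordWriter above down its compressed branch.
// The helper name and the output path are hypothetical; SetCompressOutput,
// SetOutputCompressorClass, and SetOutputPath are the standard Hadoop
// FileOutputFormat knobs, assumed here to keep their PascalCase names in this port.
public static void EnableGzipTextOutput(Job job)
{
	// Read back above as conf.Get(Seperator, "\t"); override the tab default.
	job.GetConfiguration().Set("mapreduce.output.textoutputformat.separator", ",");
	FileOutputFormat.SetCompressOutput(job, true);                      // flips isCompressed above
	FileOutputFormat.SetOutputCompressorClass(job, typeof(GzipCodec));  // the default codec anyway
	FileOutputFormat.SetOutputPath(job, new Path("/tmp/out"));          // hypothetical output dir
}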
/// <exception cref="System.IO.IOException"/>
public override RecordWriter<K, V> GetRecordWriter(FileSystem ignored, JobConf job, string name, Progressable progress)
{
	bool isCompressed = GetCompressOutput(job);
	string keyValueSeparator = job.Get("mapreduce.output.textoutputformat.separator", "\t");
	if (!isCompressed)
	{
		Path file = FileOutputFormat.GetTaskOutputPath(job, name);
		FileSystem fs = file.GetFileSystem(job);
		FSDataOutputStream fileOut = fs.Create(file, progress);
		return new TextOutputFormat.LineRecordWriter<K, V>(fileOut, keyValueSeparator);
	}
	else
	{
		// create the named codec
		Type codecClass = GetOutputCompressorClass(job, typeof(GzipCodec));
		CompressionCodec codec = ReflectionUtils.NewInstance(codecClass, job);
		// build the filename including the extension
		Path file = FileOutputFormat.GetTaskOutputPath(job, name + codec.GetDefaultExtension());
		FileSystem fs = file.GetFileSystem(job);
		FSDataOutputStream fileOut = fs.Create(file, progress);
		return new TextOutputFormat.LineRecordWriter<K, V>(new DataOutputStream(codec.CreateOutputStream(fileOut)), keyValueSeparator);
	}
}
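// Usage sketch (an assumption-labeled illustration, not from the source):
// exercising the old-API writer returned above outside a full job. It assumes
// the JobConf already carries a task output directory (normally supplied by the
// framework); "part-00000" follows the old-API task-file naming convention, and
// Reporter.Null stands in for the Progressable, as in the tests further below.
public static void WriteOneRecord(JobConf job)
{
	TextOutputFormat<Text, Text> format = new TextOutputFormat<Text, Text>();
	// The FileSystem argument is unused by this implementation, hence null.
	RecordWriter<Text, Text> writer = format.GetRecordWriter(null, job, "part-00000", Reporter.Null);
	try
	{
		writer.Write(new Text("key"), new Text("value"));  // emits "key\tvalue\n" by default
	}
	finally
	{
		writer.Close(Reporter.Null);
	}
}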
public virtual void TestSplitableCodecs()
{
	Job job = Job.GetInstance(defaultConf);
	Configuration conf = job.GetConfiguration();
	// Create the codec
	CompressionCodec codec = null;
	try
	{
		codec = (CompressionCodec)ReflectionUtils.NewInstance(conf.GetClassByName("org.apache.hadoop.io.compress.BZip2Codec"), conf);
	}
	catch (TypeLoadException)
	{
		throw new IOException("Illegal codec!");
	}
	Path file = new Path(workDir, "test" + codec.GetDefaultExtension());
	int seed = new Random().Next();
	Log.Info("seed = " + seed);
	Random random = new Random(seed);
	localFs.Delete(workDir, true);
	FileInputFormat.SetInputPaths(job, workDir);
	int MaxLength = 500000;
	FileInputFormat.SetMaxInputSplitSize(job, MaxLength / 20);
	// for a variety of lengths
	for (int length = 0; length < MaxLength; length += random.Next(MaxLength / 4) + 1)
	{
		Log.Info("creating; entries = " + length);
		// create a file with length entries: each line is "2i\t i", so every
		// key must be even and value == key / 2.
		TextWriter writer = new OutputStreamWriter(codec.CreateOutputStream(localFs.Create(file)));
		try
		{
			for (int i = 0; i < length; i++)
			{
				writer.Write(Sharpen.Extensions.ToString(i * 2));
				writer.Write("\t");
				writer.Write(Sharpen.Extensions.ToString(i));
				writer.Write("\n");
			}
		}
		finally
		{
			writer.Close();
		}
		// try splitting the file in a variety of sizes
		KeyValueTextInputFormat format = new KeyValueTextInputFormat();
		NUnit.Framework.Assert.IsTrue("KVTIF claims not splittable", format.IsSplitable(job, file));
		for (int i_1 = 0; i_1 < 3; i_1++)
		{
			// numSplits is only logged: the new API sizes splits via
			// SetMaxInputSplitSize rather than a requested split count.
			int numSplits = random.Next(MaxLength / 2000) + 1;
			Log.Info("splitting: requesting = " + numSplits);
			IList<InputSplit> splits = format.GetSplits(job);
			Log.Info("splitting: got = " + splits.Count);
			// check each split
			BitSet bits = new BitSet(length);
			for (int j = 0; j < splits.Count; j++)
			{
				Log.Debug("split[" + j + "]= " + splits[j]);
				TaskAttemptContext context = MapReduceTestUtil.CreateDummyMapTaskAttemptContext(job.GetConfiguration());
				RecordReader<Text, Text> reader = format.CreateRecordReader(splits[j], context);
				Type clazz = reader.GetType();
				MapContext<Text, Text, Text, Text> mcontext = new MapContextImpl<Text, Text, Text, Text>(job.GetConfiguration(), context.GetTaskAttemptID(), reader, null, null, MapReduceTestUtil.CreateDummyReporter(), splits[j]);
				reader.Initialize(splits[j], mcontext);
				Text key = null;
				Text value = null;
				try
				{
					int count = 0;
					while (reader.NextKeyValue())
					{
						key = reader.GetCurrentKey();
						value = reader.GetCurrentValue();
						int k = System.Convert.ToInt32(key.ToString());
						int v = System.Convert.ToInt32(value.ToString());
						NUnit.Framework.Assert.AreEqual("Bad key", 0, k % 2);
						NUnit.Framework.Assert.AreEqual("Mismatched key/value", k / 2, v);
						Log.Debug("read " + k + "," + v);
						NUnit.Framework.Assert.IsFalse(k + "," + v + " in multiple partitions.", bits.Get(v));
						bits.Set(v);
						count++;
					}
					if (count > 0)
					{
						Log.Info("splits[" + j + "]=" + splits[j] + " count=" + count);
					}
					else
					{
						Log.Debug("splits[" + j + "]=" + splits[j] + " count=" + count);
					}
				}
				finally
				{
					reader.Close();
				}
			}
			NUnit.Framework.Assert.AreEqual("Some keys in no partition.", length, bits.Cardinality());
		}
	}
}
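// Background sketch (paraphrased from Hadoop's text input formats, not part of
// the test above): the reason BZip2Codec passes the IsSplitable assertion is
// that a file counts as splittable when it is uncompressed or its codec
// implements SplittableCompressionCodec. Roughly:
protected virtual bool IsSplitable(JobContext context, Path file)
{
	CompressionCodec codec = new CompressionCodecFactory(context.GetConfiguration()).GetCodec(file);
	if (codec == null)
	{
		return true;  // plain text: a split can start at any byte offset
	}
	return codec is SplittableCompressionCodec;  // true for bzip2, false for gzip
}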
/// <exception cref="System.IO.IOException"/>
public virtual void TestSplitableCodecs()
{
	JobConf conf = new JobConf(defaultConf);
	int seed = new Random().Next();
	// Create the codec
	CompressionCodec codec = null;
	try
	{
		codec = (CompressionCodec)ReflectionUtils.NewInstance(conf.GetClassByName("org.apache.hadoop.io.compress.BZip2Codec"), conf);
	}
	catch (TypeLoadException)
	{
		throw new IOException("Illegal codec!");
	}
	Path file = new Path(workDir, "test" + codec.GetDefaultExtension());
	// A reporter that does nothing
	Reporter reporter = Reporter.Null;
	Log.Info("seed = " + seed);
	Random random = new Random(seed);
	FileSystem localFs = FileSystem.GetLocal(conf);
	localFs.Delete(workDir, true);
	FileInputFormat.SetInputPaths(conf, workDir);
	int MaxLength = 500000;
	// for a variety of lengths
	for (int length = MaxLength / 2; length < MaxLength; length += random.Next(MaxLength / 4) + 1)
	{
		Log.Info("creating; entries = " + length);
		// create a file with length entries
		TextWriter writer = new OutputStreamWriter(codec.CreateOutputStream(localFs.Create(file)));
		try
		{
			for (int i = 0; i < length; i++)
			{
				writer.Write(Sharpen.Extensions.ToString(i));
				writer.Write("\n");
			}
		}
		finally
		{
			writer.Close();
		}
		// try splitting the file in a variety of sizes
		TextInputFormat format = new TextInputFormat();
		format.Configure(conf);
		LongWritable key = new LongWritable();
		Text value = new Text();
		for (int i_1 = 0; i_1 < 3; i_1++)
		{
			int numSplits = random.Next(MaxLength / 2000) + 1;
			Log.Info("splitting: requesting = " + numSplits);
			InputSplit[] splits = format.GetSplits(conf, numSplits);
			Log.Info("splitting: got = " + splits.Length);
			// check each split
			BitSet bits = new BitSet(length);
			for (int j = 0; j < splits.Length; j++)
			{
				Log.Debug("split[" + j + "]= " + splits[j]);
				RecordReader<LongWritable, Text> reader = format.GetRecordReader(splits[j], conf, reporter);
				try
				{
					int counter = 0;
					while (reader.Next(key, value))
					{
						int v = System.Convert.ToInt32(value.ToString());
						Log.Debug("read " + v);
						if (bits.Get(v))
						{
							Log.Warn("conflict with " + v + " in split " + j + " at position " + reader.GetPos());
						}
						NUnit.Framework.Assert.IsFalse("Key in multiple partitions.", bits.Get(v));
						bits.Set(v);
						counter++;
					}
					if (counter > 0)
					{
						Log.Info("splits[" + j + "]=" + splits[j] + " count=" + counter);
					}
					else
					{
						Log.Debug("splits[" + j + "]=" + splits[j] + " count=" + counter);
					}
				}
				finally
				{
					reader.Close();
				}
			}
			NUnit.Framework.Assert.AreEqual("Some keys in no partition.", length, bits.Cardinality());
		}
	}
}
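// Side note sketch (illustrative; the helper name is hypothetical): the tests
// above instantiate BZip2Codec by class name, but on the read side Hadoop's
// CompressionCodecFactory maps a file's extension back to its codec, which is
// how the "test" + GetDefaultExtension() filenames get decoded transparently.
public static CompressionCodec CodecFor(Configuration conf, Path file)
{
	// For "test.bz2" this returns a BZip2Codec instance; null means uncompressed.
	return new CompressionCodecFactory(conf).GetCodec(file);
}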