public virtual void ExtractOutputKeyValue(string key, string val, string fieldSep,
    IList<int> keyFieldList, IList<int> valFieldList, int allValueFieldsFrom,
    bool ignoreKey, bool isMap)
{
    if (!ignoreKey)
    {
        // Prepend the incoming key so its fields are visible to field selection.
        val = key + val;
    }
    string[] fields = val.Split(fieldSep);
    string newKey = SelectFields(fields, keyFieldList, -1, fieldSep);
    string newVal = SelectFields(fields, valFieldList, allValueFieldsFrom, fieldSep);
    if (isMap && newKey == null)
    {
        // On the map side an empty key selection promotes the value to the key.
        newKey = newVal;
        newVal = null;
    }
    if (newKey != null)
    {
        this.key = new Org.Apache.Hadoop.IO.Text(newKey);
    }
    if (newVal != null)
    {
        this.value = new Org.Apache.Hadoop.IO.Text(newVal);
    }
}
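// A minimal usage sketch (not from the source): drive ExtractOutputKeyValue
// directly and read the result back through GetKey()/GetValue(), the same
// accessors the reducer below uses. The field indices, separator, and
// expected strings here are illustrative assumptions.
private static void FieldSelectionSketch()
{
    FieldSelectionHelper helper = new FieldSelectionHelper();
    IList<int> keyFields = new AList<int>();   // select field 1 as the key
    keyFields.AddItem(1);
    IList<int> valFields = new AList<int>();   // empty: rely on allValueFieldsFrom
    // Record "a-b-c-d" arrives as key "a" plus value "b-c-d"; the key is
    // re-attached with its separator, mirroring the reducer's keyStr below.
    helper.ExtractOutputKeyValue("a-", "b-c-d", "-", keyFields, valFields,
        2 /* allValueFieldsFrom */, false /* ignoreKey */, true /* isMap */);
    // Expected per the logic above: key "b", value "c-d".
    System.Console.Out.WriteLine(helper.GetKey() + " / " + helper.GetValue());
}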
public virtual void TestFormatWithCustomSeparator()
{
    JobConf job = new JobConf();
    string separator = "\u0001";
    job.Set("mapreduce.output.textoutputformat.separator", separator);
    job.Set(JobContext.TaskAttemptId, attempt);
    FileOutputFormat.SetOutputPath(job, workDir.GetParent().GetParent());
    FileOutputFormat.SetWorkOutputPath(job, workDir);
    FileSystem fs = workDir.GetFileSystem(job);
    if (!fs.Mkdirs(workDir))
    {
        NUnit.Framework.Assert.Fail("Failed to create output directory");
    }
    string file = "test_custom.txt";
    // A reporter that does nothing
    Reporter reporter = Reporter.Null;
    TextOutputFormat<object, object> theOutputFormat = new TextOutputFormat<object, object>();
    RecordWriter<object, object> theRecordWriter =
        theOutputFormat.GetRecordWriter(localFs, job, file, reporter);
    Org.Apache.Hadoop.IO.Text key1 = new Org.Apache.Hadoop.IO.Text("key1");
    Org.Apache.Hadoop.IO.Text key2 = new Org.Apache.Hadoop.IO.Text("key2");
    Org.Apache.Hadoop.IO.Text val1 = new Org.Apache.Hadoop.IO.Text("val1");
    Org.Apache.Hadoop.IO.Text val2 = new Org.Apache.Hadoop.IO.Text("val2");
    NullWritable nullWritable = NullWritable.Get();
    try
    {
        theRecordWriter.Write(key1, val1);
        theRecordWriter.Write(null, nullWritable);
        theRecordWriter.Write(null, val1);
        theRecordWriter.Write(nullWritable, val2);
        theRecordWriter.Write(key2, nullWritable);
        theRecordWriter.Write(key1, null);
        theRecordWriter.Write(null, null);
        theRecordWriter.Write(key2, val2);
    }
    finally
    {
        theRecordWriter.Close(reporter);
    }
    FilePath expectedFile = new FilePath(new Path(workDir, file).ToString());
    // Null or NullWritable keys/values are dropped; the separator only
    // appears when both key and value are present.
    StringBuilder expectedOutput = new StringBuilder();
    expectedOutput.Append(key1).Append(separator).Append(val1).Append("\n");
    expectedOutput.Append(val1).Append("\n");
    expectedOutput.Append(val2).Append("\n");
    expectedOutput.Append(key2).Append("\n");
    expectedOutput.Append(key1).Append("\n");
    expectedOutput.Append(key2).Append(separator).Append(val2).Append("\n");
    string output = UtilsForTests.Slurp(expectedFile);
    NUnit.Framework.Assert.AreEqual(expectedOutput.ToString(), output);
}
private static void GenRandom(Org.Apache.Hadoop.IO.Text t, int len, StringBuilder sb)
{
    sb.Length = 0;
    for (int i = 0; i < len; ++i)
    {
        // r.Next(26) + 10 lands in [10, 35], which base 36 renders as 'a'..'z'.
        sb.Append(Extensions.ToString(r.Next(26) + 10, 36));
    }
    t.Set(sb.ToString());
}
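// For reference, the base-36 call above is Sharpen's rendering of Java's
// Integer.toString(n, 36). A standalone C# equivalent of the same trick,
// using only the BCL (illustrative name, not from the source):
private static string RandomLowercase(Random r, int len)
{
    StringBuilder sb = new StringBuilder(len);
    for (int i = 0; i < len; ++i)
    {
        // Digits 10..35 print as 'a'..'z' in base 36.
        sb.Append((char)('a' + r.Next(26)));
    }
    return sb.ToString();
}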
/// <exception cref="System.IO.IOException"/>
public virtual void Reduce(Org.Apache.Hadoop.IO.Text key,
    IEnumerator<Org.Apache.Hadoop.IO.Text> values,
    OutputCollector<Org.Apache.Hadoop.IO.Text, Org.Apache.Hadoop.IO.Text> output,
    Reporter reporter)
{
    // Re-attach the separator so the key's fields line up with the value's
    // when ExtractOutputKeyValue concatenates and re-splits them.
    string keyStr = key.ToString() + this.fieldSeparator;
    while (values.HasNext())
    {
        FieldSelectionHelper helper = new FieldSelectionHelper();
        helper.ExtractOutputKeyValue(keyStr, values.Next().ToString(), fieldSeparator,
            reduceOutputKeyFieldList, reduceOutputValueFieldList,
            allReduceValueFieldsFrom, false, false);
        output.Collect(helper.GetKey(), helper.GetValue());
    }
}
public virtual void RunJob(int items)
{
    try
    {
        JobConf conf = new JobConf(typeof(TestMapRed));
        Path testdir = new Path(TestDir.GetAbsolutePath());
        Path inDir = new Path(testdir, "in");
        Path outDir = new Path(testdir, "out");
        FileSystem fs = FileSystem.Get(conf);
        fs.Delete(testdir, true);
        conf.SetInt(JobContext.IoSortMb, 1);
        conf.SetInputFormat(typeof(SequenceFileInputFormat));
        FileInputFormat.SetInputPaths(conf, inDir);
        FileOutputFormat.SetOutputPath(conf, outDir);
        conf.SetMapperClass(typeof(IdentityMapper));
        conf.SetReducerClass(typeof(IdentityReducer));
        conf.SetOutputKeyClass(typeof(Text));
        conf.SetOutputValueClass(typeof(Text));
        conf.SetOutputFormat(typeof(SequenceFileOutputFormat));
        conf.Set(MRConfig.FrameworkName, MRConfig.LocalFrameworkName);
        if (!fs.Mkdirs(testdir))
        {
            throw new IOException("Mkdirs failed to create " + testdir.ToString());
        }
        if (!fs.Mkdirs(inDir))
        {
            throw new IOException("Mkdirs failed to create " + inDir.ToString());
        }
        Path inFile = new Path(inDir, "part0");
        SequenceFile.Writer writer = SequenceFile.CreateWriter(fs, conf, inFile,
            typeof(Text), typeof(Text));
        StringBuilder content = new StringBuilder();
        for (int i = 0; i < 1000; i++)
        {
            content.Append(i).Append(": This is one more line of content\n");
        }
        Org.Apache.Hadoop.IO.Text text = new Org.Apache.Hadoop.IO.Text(content.ToString());
        for (int i_1 = 0; i_1 < items; i_1++)
        {
            writer.Append(new Org.Apache.Hadoop.IO.Text("rec:" + i_1), text);
        }
        writer.Close();
        JobClient.RunJob(conf);
    }
    catch (Exception e)
    {
        NUnit.Framework.Assert.Fail("Threw exception: " + e);
    }
}
/// <exception cref="System.IO.IOException"/>
public virtual string[] GetSorted()
{
    string[] ret = new string[indices.Length];
    Org.Apache.Hadoop.IO.Text t = new Org.Apache.Hadoop.IO.Text();
    DataInputBuffer dib = new DataInputBuffer();
    for (int i = 0; i < ret.Length; ++i)
    {
        int ii = indices[i];
        // Each record spans from its offset to the next record's offset;
        // the last record runs to eob (end of buffer).
        dib.Reset(bytes, offsets[ii],
            ((ii + 1 == indices.Length) ? eob : offsets[ii + 1]) - offsets[ii]);
        t.ReadFields(dib);
        ret[i] = t.ToString();
    }
    return ret;
}
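// Illustrative companion (an assumption, not from the source): one way the
// fields GetSorted() consumes could be produced, packing Text records into a
// single byte array while recording their start offsets. DataOutputBuffer is
// the Org.Apache.Hadoop.IO counterpart of the DataInputBuffer used above.
private static void PackRecordsSketch()
{
    string[] names = new string[] { "delta", "alpha", "charlie" };
    DataOutputBuffer dob = new DataOutputBuffer();
    int[] offsets = new int[names.Length];
    for (int i = 0; i < names.Length; ++i)
    {
        offsets[i] = dob.GetLength();                        // record start offset
        new Org.Apache.Hadoop.IO.Text(names[i]).Write(dob);  // length-prefixed bytes
    }
    byte[] bytes = dob.GetData();   // backing array, valid up to GetLength()
    int eob = dob.GetLength();      // "end of buffer" bound for the last record
    // indices would then hold the record positions in sorted order, e.g. { 1, 2, 0 }.
}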
/// <summary>Parse the command line arguments into lines and display the result.</summary>
/// <param name="args"/>
/// <exception cref="System.Exception"/>
public static void Main(string[] args)
{
    foreach (string arg in args)
    {
        System.Console.Out.WriteLine("Working on " + arg);
        LineReader reader = MakeStream(Unquote(arg));
        Org.Apache.Hadoop.IO.Text line = new Org.Apache.Hadoop.IO.Text();
        int size = reader.ReadLine(line);
        while (size > 0)
        {
            System.Console.Out.WriteLine("Got: " + line.ToString());
            size = reader.ReadLine(line);
        }
        reader.Close();
    }
}
private int GenerateSentence(Org.Apache.Hadoop.IO.Text t, int noWords)
{
    sentence.Length = 0;
    // Emit noWords random words: the first noWords - 1 with a trailing space,
    // the last one without.
    --noWords;
    for (int i = 0; i < noWords; ++i)
    {
        sentence.Append(words[r.Next(words.Length)]);
        sentence.Append(" ");
    }
    if (noWords >= 0)
    {
        sentence.Append(words[r.Next(words.Length)]);
    }
    t.Set(sentence.ToString());
    return sentence.Length;
}
/// <exception cref="System.IO.IOException"/>
public override IList<InputSplit> GetSplits(JobContext job)
{
    Configuration conf = job.GetConfiguration();
    Path src = new Path(conf.Get(IndirectInputFile, null));
    FileSystem fs = src.GetFileSystem(conf);
    IList<InputSplit> splits = new AList<InputSplit>();
    LongWritable key = new LongWritable();
    Org.Apache.Hadoop.IO.Text value = new Org.Apache.Hadoop.IO.Text();
    // Each record of the indirect file names an input file (the Text value)
    // and gives its length (the LongWritable key); every record becomes one split.
    for (SequenceFile.Reader sl = new SequenceFile.Reader(fs, src, conf);
        sl.Next(key, value);)
    {
        splits.AddItem(new GenericMRLoadGenerator.IndirectInputFormat.IndirectSplit(
            new Path(value.ToString()), key.Get()));
    }
    return splits;
}
/// <exception cref="System.IO.IOException"/>
public virtual InputSplit[] GetSplits(JobConf job, int numSplits)
{
    Path src = new Path(job.Get(GenericMRLoadGenerator.IndirectInputFile, null));
    FileSystem fs = src.GetFileSystem(job);
    AList<GenericMRLoadGenerator.IndirectInputFormat.IndirectSplit> splits =
        new AList<GenericMRLoadGenerator.IndirectInputFormat.IndirectSplit>(numSplits);
    LongWritable key = new LongWritable();
    Org.Apache.Hadoop.IO.Text value = new Org.Apache.Hadoop.IO.Text();
    for (SequenceFile.Reader sl = new SequenceFile.Reader(fs, src, job);
        sl.Next(key, value);)
    {
        splits.AddItem(new GenericMRLoadGenerator.IndirectInputFormat.IndirectSplit(
            new Path(value.ToString()), key.Get()));
    }
    return Sharpen.Collections.ToArray(splits,
        new GenericMRLoadGenerator.IndirectInputFormat.IndirectSplit[splits.Count]);
}
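// A hedged sketch (hypothetical helper, not in the source) of producing the
// indirect file both GetSplits overloads read: a SequenceFile whose Text
// values name input files and whose LongWritable keys carry their lengths.
private static void WriteIndirectFile(FileSystem fs, JobConf job, Path src, Path inputDir)
{
    SequenceFile.Writer writer = SequenceFile.CreateWriter(fs, job, src,
        typeof(LongWritable), typeof(Org.Apache.Hadoop.IO.Text));
    try
    {
        foreach (FileStatus stat in fs.ListStatus(inputDir))
        {
            // One record per input file: key = length, value = path.
            writer.Append(new LongWritable(stat.GetLen()),
                new Org.Apache.Hadoop.IO.Text(stat.GetPath().ToString()));
        }
    }
    finally
    {
        writer.Close();
    }
}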
/// <exception cref="System.IO.IOException"/>
public virtual void Map(Org.Apache.Hadoop.IO.Text key, Org.Apache.Hadoop.IO.Text val,
    OutputCollector<Org.Apache.Hadoop.IO.Text, Org.Apache.Hadoop.IO.Text> output,
    Reporter reporter)
{
    long acc = 0L;
    long recs = 0;
    int keydiff = keymax - keymin;
    int valdiff = valmax - valmin;
    // Keep emitting random records until the byte budget is exhausted.
    for (long i = 0L; acc < bytesToWrite; ++i)
    {
        int recacc = 0;
        recacc += GenerateSentence(key, keymin + (0 == keydiff ? 0 : r.Next(keydiff)));
        recacc += GenerateSentence(val, valmin + (0 == valdiff ? 0 : r.Next(valdiff)));
        output.Collect(key, val);
        ++recs;
        acc += recacc;
        reporter.IncrCounter(GenericMRLoadGenerator.Counters.BytesWritten, recacc);
        reporter.IncrCounter(GenericMRLoadGenerator.Counters.RecordsWritten, 1);
        reporter.SetStatus(acc + "/" + (bytesToWrite - acc) + " bytes");
    }
    reporter.SetStatus("Wrote " + recs + " records");
}
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.Exception"/>
protected override void Map(Org.Apache.Hadoop.IO.Text key, Org.Apache.Hadoop.IO.Text val,
    Mapper.Context context)
{
    long acc = 0L;
    long recs = 0;
    int keydiff = keymax - keymin;
    int valdiff = valmax - valmin;
    // Same byte-budget loop as the old-API mapper above, ported to the
    // new-API Context for output, counters, and status.
    for (long i = 0L; acc < bytesToWrite; ++i)
    {
        int recacc = 0;
        recacc += GenerateSentence(key, keymin + (0 == keydiff ? 0 : r.Next(keydiff)));
        recacc += GenerateSentence(val, valmin + (0 == valdiff ? 0 : r.Next(valdiff)));
        context.Write(key, val);
        ++recs;
        acc += recacc;
        context.GetCounter(GenericMRLoadGenerator.Counters.BytesWritten).Increment(recacc);
        context.GetCounter(GenericMRLoadGenerator.Counters.RecordsWritten).Increment(1);
        context.SetStatus(acc + "/" + (bytesToWrite - acc) + " bytes");
    }
    context.SetStatus("Wrote " + recs + " records");
}
public FieldSelectionHelper(Org.Apache.Hadoop.IO.Text key, Org.Apache.Hadoop.IO.Text val)
{
    this.key = key;
    this.value = val;
}
public virtual void TestCompress()
{
    JobConf job = new JobConf();
    job.Set(JobContext.TaskAttemptId, attempt);
    job.Set(FileOutputFormat.Compress, "true");
    FileOutputFormat.SetOutputPath(job, workDir.GetParent().GetParent());
    FileOutputFormat.SetWorkOutputPath(job, workDir);
    FileSystem fs = workDir.GetFileSystem(job);
    if (!fs.Mkdirs(workDir))
    {
        NUnit.Framework.Assert.Fail("Failed to create output directory");
    }
    string file = "test_compress.txt";
    // A reporter that does nothing
    Reporter reporter = Reporter.Null;
    TextOutputFormat<object, object> theOutputFormat = new TextOutputFormat<object, object>();
    RecordWriter<object, object> theRecordWriter =
        theOutputFormat.GetRecordWriter(localFs, job, file, reporter);
    Org.Apache.Hadoop.IO.Text key1 = new Org.Apache.Hadoop.IO.Text("key1");
    Org.Apache.Hadoop.IO.Text key2 = new Org.Apache.Hadoop.IO.Text("key2");
    Org.Apache.Hadoop.IO.Text val1 = new Org.Apache.Hadoop.IO.Text("val1");
    Org.Apache.Hadoop.IO.Text val2 = new Org.Apache.Hadoop.IO.Text("val2");
    NullWritable nullWritable = NullWritable.Get();
    try
    {
        theRecordWriter.Write(key1, val1);
        theRecordWriter.Write(null, nullWritable);
        theRecordWriter.Write(null, val1);
        theRecordWriter.Write(nullWritable, val2);
        theRecordWriter.Write(key2, nullWritable);
        theRecordWriter.Write(key1, null);
        theRecordWriter.Write(null, null);
        theRecordWriter.Write(key2, val2);
    }
    finally
    {
        theRecordWriter.Close(reporter);
    }
    StringBuilder expectedOutput = new StringBuilder();
    expectedOutput.Append(key1).Append("\t").Append(val1).Append("\n");
    expectedOutput.Append(val1).Append("\n");
    expectedOutput.Append(val2).Append("\n");
    expectedOutput.Append(key2).Append("\n");
    expectedOutput.Append(key1).Append("\n");
    expectedOutput.Append(key2).Append("\t").Append(val2).Append("\n");
    // Decompress the output file (named with the codec's default extension)
    // and compare it line by line against the expected text.
    DefaultCodec codec = new DefaultCodec();
    codec.SetConf(job);
    Path expectedFile = new Path(workDir, file + codec.GetDefaultExtension());
    FileInputStream istream = new FileInputStream(expectedFile.ToString());
    CompressionInputStream cistream = codec.CreateInputStream(istream);
    LineReader reader = new LineReader(cistream);
    StringBuilder output = new StringBuilder();
    Org.Apache.Hadoop.IO.Text @out = new Org.Apache.Hadoop.IO.Text();
    while (reader.ReadLine(@out) > 0)
    {
        output.Append(@out).Append("\n");
    }
    reader.Close();
    NUnit.Framework.Assert.AreEqual(expectedOutput.ToString(), output.ToString());
}