/// <summary>From each split sampled, take the first numSamples / numSplits records.</summary>
/// <exception cref="System.IO.IOException"/>
public virtual K[] GetSample(InputFormat<K, V> inf, JobConf job)
{
    // ArrayList::toArray doesn't preserve type
    InputSplit[] splits = inf.GetSplits(job, job.GetNumMapTasks());
    AList<K> samples = new AList<K>(numSamples);
    int splitsToSample = Math.Min(maxSplitsSampled, splits.Length);
    int splitStep = splits.Length / splitsToSample;
    int samplesPerSplit = numSamples / splitsToSample;
    long records = 0;
    for (int i = 0; i < splitsToSample; ++i)
    {
        RecordReader<K, V> reader = inf.GetRecordReader(splits[i * splitStep], job, Reporter.Null);
        K key = reader.CreateKey();
        V value = reader.CreateValue();
        while (reader.Next(key, value))
        {
            samples.AddItem(key);
            key = reader.CreateKey();
            ++records;
            if ((i + 1) * samplesPerSplit <= records)
            {
                break;
            }
        }
        reader.Close();
    }
    return((K[])Sharpen.Collections.ToArray(samples));
}
/// <summary>
/// Randomize the split order, then take the specified number of keys from
/// each split sampled, where each key is selected with the specified
/// probability and possibly replaced by a subsequently selected key when
/// the quota of keys from that split is satisfied.
/// </summary>
/// <exception cref="System.IO.IOException"/>
public virtual K[] GetSample(InputFormat<K, V> inf, JobConf job)
{
    // ArrayList::toArray doesn't preserve type
    InputSplit[] splits = inf.GetSplits(job, job.GetNumMapTasks());
    AList<K> samples = new AList<K>(numSamples);
    int splitsToSample = Math.Min(maxSplitsSampled, splits.Length);
    Random r = new Random();
    long seed = r.NextLong();
    r.SetSeed(seed);
    Log.Debug("seed: " + seed);
    // shuffle splits
    for (int i = 0; i < splits.Length; ++i)
    {
        InputSplit tmp = splits[i];
        int j = r.Next(splits.Length);
        splits[i] = splits[j];
        splits[j] = tmp;
    }
    // our target rate is in terms of the maximum number of sample splits,
    // but we accept the possibility of sampling additional splits to hit
    // the target sample keyset
    for (int i_1 = 0; i_1 < splitsToSample || (i_1 < splits.Length && samples.Count < numSamples); ++i_1)
    {
        RecordReader<K, V> reader = inf.GetRecordReader(splits[i_1], job, Reporter.Null);
        K key = reader.CreateKey();
        V value = reader.CreateValue();
        while (reader.Next(key, value))
        {
            if (r.NextDouble() <= freq)
            {
                if (samples.Count < numSamples)
                {
                    samples.AddItem(key);
                }
                else
                {
                    // When exceeding the maximum number of samples, replace a
                    // random element with this one, then adjust the frequency
                    // to reflect the possibility of existing elements being
                    // pushed out
                    int ind = r.Next(numSamples);
                    if (ind != numSamples)
                    {
                        samples.Set(ind, key);
                    }
                    freq *= (numSamples - 1) / (double)numSamples;
                }
                key = reader.CreateKey();
            }
        }
        reader.Close();
    }
    return((K[])Sharpen.Collections.ToArray(samples));
}
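Both samplers above are normally consumed indirectly: in the upstream Hadoop sources they feed InputSampler.WritePartitionFile, which writes the sampled keys out as split points for a TotalOrderPartitioner. The sketch below shows that typical wiring; the Sharpen-style member names (WritePartitionFile, SetPartitionFile, the RandomSampler constructor arguments freq/numSamples/maxSplitsSampled) are assumed conversions of the Java API, not names confirmed by these snippets.

// Minimal sketch, assuming Sharpen-converted names for the Java
// InputSampler / TotalOrderPartitioner API.
JobConf job = new JobConf(conf);
job.SetPartitionerClass(typeof(TotalOrderPartitioner));
// Pick each key with probability 0.1, aiming for roughly 10,000 samples
// from at most 10 splits (assumed argument order: freq, numSamples, maxSplitsSampled).
InputSampler.RandomSampler<Text, Text> sampler = new InputSampler.RandomSampler<Text, Text>(0.1, 10000, 10);
// Write the sampled keys as partition boundaries for the total-order sort.
Path partitionFile = new Path("/tmp/partitions.lst");
TotalOrderPartitioner.SetPartitionFile(job, partitionFile);
InputSampler.WritePartitionFile(job, sampler);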
/// <exception cref="System.IO.IOException"/> internal WrappedRecordReader(int id, RecordReader <K, U> rr, Type cmpcl, Configuration conf) { // index at which values will be inserted in collector // key at the top of this RR // value assoc with khead this.id = id; this.rr = rr; this.conf = (conf == null) ? new Configuration() : conf; khead = rr.CreateKey(); vhead = rr.CreateValue(); try { cmp = (null == cmpcl) ? WritableComparator.Get(khead.GetType(), this.conf) : System.Activator.CreateInstance (cmpcl); } catch (InstantiationException e) { throw (IOException)Sharpen.Extensions.InitCause(new IOException(), e); } catch (MemberAccessException e) { throw (IOException)Sharpen.Extensions.InitCause(new IOException(), e); } vjoin = new StreamBackedIterator <U>(); Next(); }
/// <exception cref="System.IO.IOException"/> private static IList <Text> ReadSplit(KeyValueTextInputFormat format, InputSplit split , JobConf job) { IList <Text> result = new AList <Text>(); RecordReader <Text, Text> reader = null; try { reader = format.GetRecordReader(split, job, voidReporter); Text key = reader.CreateKey(); Text value = reader.CreateValue(); while (reader.Next(key, value)) { result.AddItem(value); value = (Text)reader.CreateValue(); } } finally { if (reader != null) { reader.Close(); } } return(result); }
/// <summary>test DBInputFormat class.</summary>
/// <remarks>test DBInputFormat class. The class should split the result into chunks.</remarks>
/// <exception cref="System.Exception"/>
public virtual void TestDBInputFormat()
{
    JobConf configuration = new JobConf();
    SetupDriver(configuration);
    DBInputFormat<DBInputFormat.NullDBWritable> format = new DBInputFormat<DBInputFormat.NullDBWritable>();
    format.SetConf(configuration);
    format.SetConf(configuration);
    DBInputFormat.DBInputSplit splitter = new DBInputFormat.DBInputSplit(1, 10);
    Reporter reporter = Org.Mockito.Mockito.Mock<Reporter>();
    RecordReader<LongWritable, DBInputFormat.NullDBWritable> reader = format.GetRecordReader(splitter, configuration, reporter);
    configuration.SetInt(MRJobConfig.NumMaps, 3);
    InputSplit[] lSplits = format.GetSplits(configuration, 3);
    NUnit.Framework.Assert.AreEqual(5, lSplits[0].GetLength());
    NUnit.Framework.Assert.AreEqual(3, lSplits.Length);
    // test reader: some simple checks
    NUnit.Framework.Assert.AreEqual(typeof(LongWritable), reader.CreateKey().GetType());
    NUnit.Framework.Assert.AreEqual(0, reader.GetPos());
    NUnit.Framework.Assert.AreEqual(0, reader.GetProgress(), 0.001);
    reader.Close();
}
/// <exception cref="System.IO.IOException"/> internal static long ReadBench(JobConf conf) { // InputFormat instantiation InputFormat inf = conf.GetInputFormat(); string fn = conf.Get("test.filebench.name", string.Empty); Path pin = new Path(FileInputFormat.GetInputPaths(conf)[0], fn); FileStatus @in = pin.GetFileSystem(conf).GetFileStatus(pin); RecordReader rr = inf.GetRecordReader(new FileSplit(pin, 0, @in.GetLen(), (string [])null), conf, Reporter.Null); try { object key = rr.CreateKey(); object val = rr.CreateValue(); DateTime start = new DateTime(); while (rr.Next(key, val)) { } DateTime end = new DateTime(); return(end.GetTime() - start.GetTime()); } finally { rr.Close(); } }
/// <summary>Run the map task.</summary>
/// <param name="input">the set of inputs</param>
/// <param name="output">the object to collect the outputs of the map</param>
/// <param name="reporter">the object to update with status</param>
/// <exception cref="System.IO.IOException"/>
public override void Run(RecordReader<K1, V1> input, OutputCollector<K2, V2> output, Reporter reporter)
{
    Application<K1, V1, K2, V2> application = null;
    try
    {
        RecordReader<FloatWritable, NullWritable> fakeInput = (!Submitter.GetIsJavaRecordReader(job) && !Submitter.GetIsJavaMapper(job)) ? (RecordReader<FloatWritable, NullWritable>)input : null;
        application = new Application<K1, V1, K2, V2>(job, fakeInput, output, reporter, (Type)job.GetOutputKeyClass(), (Type)job.GetOutputValueClass());
    }
    catch (Exception ie)
    {
        throw new RuntimeException("interrupted", ie);
    }
    DownwardProtocol<K1, V1> downlink = application.GetDownlink();
    bool isJavaInput = Submitter.GetIsJavaRecordReader(job);
    downlink.RunMap(reporter.GetInputSplit(), job.GetNumReduceTasks(), isJavaInput);
    bool skipping = job.GetBoolean(MRJobConfig.SkipRecords, false);
    try
    {
        if (isJavaInput)
        {
            // allocate key & value instances that are re-used for all entries
            K1 key = input.CreateKey();
            V1 value = input.CreateValue();
            downlink.SetInputTypes(key.GetType().FullName, value.GetType().FullName);
            while (input.Next(key, value))
            {
                // map pair to output
                downlink.MapItem(key, value);
                if (skipping)
                {
                    // flush the streams on every record input if running in skip mode
                    // so that we don't buffer other records surrounding a bad record.
                    downlink.Flush();
                }
            }
            downlink.EndOfInput();
        }
        application.WaitForFinish();
    }
    catch (Exception t)
    {
        application.Abort(t);
    }
    finally
    {
        application.Cleanup();
    }
}
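Whether the runner above hands the real record reader or the fakeInput placeholder to the external Application is decided by the Submitter flags on the job. A minimal configuration sketch follows; SetIsJavaRecordReader, SetIsJavaMapper, SetExecutable, and RunJob are assumed Sharpen renderings of the Java Pipes Submitter API.

// Sketch only: assumed Sharpen conversions of the Java Pipes Submitter.
JobConf job = new JobConf();
// Use a Java InputFormat/RecordReader on the framework side (isJavaInput == true above)
// while the map function itself runs in the external C++ binary.
Submitter.SetIsJavaRecordReader(job, true);
Submitter.SetIsJavaMapper(job, false);
// Path of the C++ executable to launch (hypothetical location).
Submitter.SetExecutable(job, "/apps/bin/wordcount-pipes");
Submitter.RunJob(job);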
/// <exception cref="System.IO.IOException"/> private static IList <Text> ReadSplit(TextInputFormat format, InputSplit split, JobConf jobConf) { IList <Text> result = new AList <Text>(); RecordReader <LongWritable, Text> reader = format.GetRecordReader(split, jobConf, voidReporter); LongWritable key = reader.CreateKey(); Text value = reader.CreateValue(); while (reader.Next(key, value)) { result.AddItem(value); value = reader.CreateValue(); } reader.Close(); return(result); }
// A reporter that does nothing
/// <exception cref="System.IO.IOException"/>
internal virtual void CheckFormat(JobConf job, int expectedN)
{
    NLineInputFormat format = new NLineInputFormat();
    format.Configure(job);
    int ignoredNumSplits = 1;
    InputSplit[] splits = format.GetSplits(job, ignoredNumSplits);
    // check all splits except last one
    int count = 0;
    for (int j = 0; j < splits.Length - 1; j++)
    {
        NUnit.Framework.Assert.AreEqual("There are no split locations", 0, splits[j].GetLocations().Length);
        RecordReader<LongWritable, Text> reader = format.GetRecordReader(splits[j], job, voidReporter);
        Type readerClass = reader.GetType();
        NUnit.Framework.Assert.AreEqual("reader class is LineRecordReader.", typeof(LineRecordReader), readerClass);
        LongWritable key = reader.CreateKey();
        Type keyClass = key.GetType();
        NUnit.Framework.Assert.AreEqual("Key class is LongWritable.", typeof(LongWritable), keyClass);
        Text value = reader.CreateValue();
        Type valueClass = value.GetType();
        NUnit.Framework.Assert.AreEqual("Value class is Text.", typeof(Text), valueClass);
        try
        {
            count = 0;
            while (reader.Next(key, value))
            {
                count++;
            }
        }
        finally
        {
            reader.Close();
        }
        NUnit.Framework.Assert.AreEqual("number of lines in split is " + expectedN, expectedN, count);
    }
}
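A check like the one above is driven by a job configured to place a fixed number of lines in each split. A configuration sketch is shown below; the property key is an assumption (the name has varied across releases, with older ones using mapred.line.input.format.linespermap).

// Sketch: ask NLineInputFormat to put 5 input lines into every split.
// The property key below is assumed, not confirmed by these snippets.
JobConf job = new JobConf();
job.SetInt("mapreduce.input.lineinputformat.linespermap", 5);
job.SetInputFormat(typeof(NLineInputFormat));
FileInputFormat.SetInputPaths(job, new Path("/data/lines.txt"));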
/// <exception cref="System.IO.IOException"/> private static IList <string> ReadSplit(FixedLengthInputFormat format, InputSplit split, JobConf job) { IList <string> result = new AList <string>(); RecordReader <LongWritable, BytesWritable> reader = format.GetRecordReader(split, job, voidReporter); LongWritable key = reader.CreateKey(); BytesWritable value = reader.CreateValue(); try { while (reader.Next(key, value)) { result.AddItem(Sharpen.Runtime.GetStringForBytes(value.GetBytes(), 0, value.GetLength ())); } } finally { reader.Close(); } return(result); }
/// <exception cref="System.IO.IOException"/> public virtual void Run(RecordReader <K1, V1> input, OutputCollector <K2, V2> output , Reporter reporter) { try { // allocate key & value instances that are re-used for all entries K1 key = input.CreateKey(); V1 value = input.CreateValue(); while (input.Next(key, value)) { // map pair to output mapper.Map(key, value, output, reporter); if (incrProcCount) { reporter.IncrCounter(SkipBadRecords.CounterGroup, SkipBadRecords.CounterMapProcessedRecords , 1); } } } finally { mapper.Close(); } }
/// <exception cref="System.Exception"/> public virtual void TestFormat() { JobConf job = new JobConf(conf); FileSystem fs = FileSystem.GetLocal(conf); Path dir = new Path(Runtime.GetProperty("test.build.data", ".") + "/mapred"); Path file = new Path(dir, "test.seq"); Reporter reporter = Reporter.Null; int seed = new Random().Next(); //LOG.info("seed = "+seed); Random random = new Random(seed); fs.Delete(dir, true); FileInputFormat.SetInputPaths(job, dir); // for a variety of lengths for (int length = 0; length < MaxLength; length += random.Next(MaxLength / 10) + 1) { //LOG.info("creating; entries = " + length); // create a file with length entries SequenceFile.Writer writer = SequenceFile.CreateWriter(fs, conf, file, typeof(IntWritable ), typeof(LongWritable)); try { for (int i = 0; i < length; i++) { IntWritable key = new IntWritable(i); LongWritable value = new LongWritable(10 * i); writer.Append(key, value); } } finally { writer.Close(); } // try splitting the file in a variety of sizes InputFormat <Text, Text> format = new SequenceFileAsTextInputFormat(); for (int i_1 = 0; i_1 < 3; i_1++) { int numSplits = random.Next(MaxLength / (SequenceFile.SyncInterval / 20)) + 1; //LOG.info("splitting: requesting = " + numSplits); InputSplit[] splits = format.GetSplits(job, numSplits); //LOG.info("splitting: got = " + splits.length); // check each split BitSet bits = new BitSet(length); for (int j = 0; j < splits.Length; j++) { RecordReader <Text, Text> reader = format.GetRecordReader(splits[j], job, reporter ); Type readerClass = reader.GetType(); NUnit.Framework.Assert.AreEqual("reader class is SequenceFileAsTextRecordReader." , typeof(SequenceFileAsTextRecordReader), readerClass); Text value = reader.CreateValue(); Text key = reader.CreateKey(); try { int count = 0; while (reader.Next(key, value)) { // if (bits.get(key.get())) { // LOG.info("splits["+j+"]="+splits[j]+" : " + key.get()); // LOG.info("@"+reader.getPos()); // } int keyInt = System.Convert.ToInt32(key.ToString()); NUnit.Framework.Assert.IsFalse("Key in multiple partitions.", bits.Get(keyInt)); bits.Set(keyInt); count++; } } finally { //LOG.info("splits["+j+"]="+splits[j]+" count=" + count); reader.Close(); } } NUnit.Framework.Assert.AreEqual("Some keys in no partition.", length, bits.Cardinality ()); } } }
/// <exception cref="System.IO.IOException"/> public virtual void Run(RecordReader <K1, V1> input, OutputCollector <K2, V2> output , Reporter reporter) { try { // allocate key & value instances these objects will not be reused // because execution of Mapper.map is not serialized. K1 key = input.CreateKey(); V1 value = input.CreateValue(); while (input.Next(key, value)) { executorService.Execute(new MultithreadedMapRunner.MapperInvokeRunable(this, key, value, output, reporter)); CheckForExceptionsFromProcessingThreads(); // Allocate new key & value instances as mapper is running in parallel key = input.CreateKey(); value = input.CreateValue(); } if (Log.IsDebugEnabled()) { Log.Debug("Finished dispatching all Mappper.map calls, job " + job.GetJobName()); } // Graceful shutdown of the Threadpool, it will let all scheduled // Runnables to end. executorService.Shutdown(); try { // Now waiting for all Runnables to end. while (!executorService.AwaitTermination(100, TimeUnit.Milliseconds)) { if (Log.IsDebugEnabled()) { Log.Debug("Awaiting all running Mappper.map calls to finish, job " + job.GetJobName ()); } // NOTE: while Mapper.map dispatching has concluded there are still // map calls in progress and exceptions would be thrown. CheckForExceptionsFromProcessingThreads(); } // NOTE: it could be that a map call has had an exception after the // call for awaitTermination() returing true. And edge case but it // could happen. CheckForExceptionsFromProcessingThreads(); } catch (IOException ioEx) { // Forcing a shutdown of all thread of the threadpool and rethrowing // the IOException executorService.ShutdownNow(); throw; } catch (Exception iEx) { throw new RuntimeException(iEx); } } finally { mapper.Close(); } }
/// <exception cref="System.Exception"/> public virtual void TestFormat() { JobConf job = new JobConf(); Path file = new Path(workDir, "test.txt"); // A reporter that does nothing Reporter reporter = Reporter.Null; int seed = new Random().Next(); Log.Info("seed = " + seed); Random random = new Random(seed); localFs.Delete(workDir, true); FileInputFormat.SetInputPaths(job, workDir); // for a variety of lengths for (int length = 0; length < MaxLength; length += random.Next(MaxLength / 10) + 1) { Log.Debug("creating; entries = " + length); // create a file with length entries TextWriter writer = new OutputStreamWriter(localFs.Create(file)); try { for (int i = 0; i < length; i++) { writer.Write(Sharpen.Extensions.ToString(i * 2)); writer.Write("\t"); writer.Write(Sharpen.Extensions.ToString(i)); writer.Write("\n"); } } finally { writer.Close(); } // try splitting the file in a variety of sizes KeyValueTextInputFormat format = new KeyValueTextInputFormat(); format.Configure(job); for (int i_1 = 0; i_1 < 3; i_1++) { int numSplits = random.Next(MaxLength / 20) + 1; Log.Debug("splitting: requesting = " + numSplits); InputSplit[] splits = format.GetSplits(job, numSplits); Log.Debug("splitting: got = " + splits.Length); // check each split BitSet bits = new BitSet(length); for (int j = 0; j < splits.Length; j++) { Log.Debug("split[" + j + "]= " + splits[j]); RecordReader <Text, Text> reader = format.GetRecordReader(splits[j], job, reporter ); Type readerClass = reader.GetType(); NUnit.Framework.Assert.AreEqual("reader class is KeyValueLineRecordReader.", typeof( KeyValueLineRecordReader), readerClass); Text key = reader.CreateKey(); Type keyClass = key.GetType(); Text value = reader.CreateValue(); Type valueClass = value.GetType(); NUnit.Framework.Assert.AreEqual("Key class is Text.", typeof(Text), keyClass); NUnit.Framework.Assert.AreEqual("Value class is Text.", typeof(Text), valueClass); try { int count = 0; while (reader.Next(key, value)) { int v = System.Convert.ToInt32(value.ToString()); Log.Debug("read " + v); if (bits.Get(v)) { Log.Warn("conflict with " + v + " in split " + j + " at position " + reader.GetPos ()); } NUnit.Framework.Assert.IsFalse("Key in multiple partitions.", bits.Get(v)); bits.Set(v); count++; } Log.Debug("splits[" + j + "]=" + splits[j] + " count=" + count); } finally { reader.Close(); } } NUnit.Framework.Assert.AreEqual("Some keys in no partition.", length, bits.Cardinality ()); } } }
public virtual K CreateKey()
{
    return(delegate_.CreateKey());
}

public virtual K CreateKey()
{
    return(curReader.CreateKey());
}

/// <summary>Request new key from proxied RR.</summary>
public virtual K CreateKey()
{
    return(rr.CreateKey());
}