/// <summary>From each split sampled, take the first numSamples / numSplits records.</summary>
/// <remarks>
/// At most <c>maxSplitsSampled</c> splits are read, in the order returned by the input format.
/// The per-split quota is cumulative: a split stops contributing once the running record
/// count reaches <c>(splitIndex + 1) * samplesPerSplit</c>, so a short split leaves room for
/// later splits to contribute more. Because the quota is checked after a key is added,
/// every non-empty sampled split contributes at least one key.
/// </remarks>
/// <param name="inf">Input format used to compute the splits and to open a reader on each one.</param>
/// <param name="job">Job supplying the configuration for split computation and key copying.</param>
/// <returns>The sampled keys; an empty array when there is no split to sample.</returns>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.Exception"/>
public virtual K[] GetSample(InputFormat <K, V> inf, Job job)
{
    IList<InputSplit> splits = inf.GetSplits(job);
    // ArrayList::toArray doesn't preserve type
    AList<K> samples = new AList<K>(numSamples);
    int splitsToSample = Math.Min(maxSplitsSampled, splits.Count);
    if (splitsToSample <= 0)
    {
        // Nothing to sample. Returning here also avoids dividing by zero below.
        return (K[])Sharpen.Collections.ToArray(samples);
    }

    int samplesPerSplit = numSamples / splitsToSample;
    long records = 0;
    for (int i = 0; i < splitsToSample; ++i)
    {
        TaskAttemptContext samplingContext = new TaskAttemptContextImpl(job.GetConfiguration(), new TaskAttemptID());
        RecordReader<K, V> reader = inf.CreateRecordReader(splits[i], samplingContext);
        try
        {
            reader.Initialize(splits[i], samplingContext);
            while (reader.NextKeyValue())
            {
                // Store a copy of the key rather than the reader's own key object.
                samples.AddItem(ReflectionUtils.Copy(job.GetConfiguration(), reader.GetCurrentKey(), null));
                ++records;
                if ((i + 1) * samplesPerSplit <= records)
                {
                    break;
                }
            }
        }
        finally
        {
            // Release the reader even when initialization or reading throws.
            reader.Close();
        }
    }

    return (K[])Sharpen.Collections.ToArray(samples);
}
/// <summary>
/// Creates the record reader registered under <c>ident</c>, handing it the reader that the
/// wrapped input format produces for the given split.
/// </summary>
/// <param name="split">Split passed through to the wrapped input format.</param>
/// <param name="taskContext">
/// Caller's task context; its configuration is run through <c>GetConf</c> and the context
/// itself is wrapped as the status reporter of the context given to the wrapped format.
/// </param>
/// <returns>The reader instance built by the constructor registered for <c>ident</c>.</returns>
/// <exception cref="System.IO.IOException">
/// When no record reader is registered for <c>ident</c>, or when instantiating it fails.
/// </exception>
/// <exception cref="System.Exception"/>
public override RecordReader CreateRecordReader(InputSplit split, TaskAttemptContext taskContext)
{
    try
    {
        bool registered = rrCstrMap.Contains(ident);
        if (!registered)
        {
            throw new IOException("No RecordReader for " + ident);
        }

        Configuration jobConf = GetConf(taskContext.GetConfiguration());
        // The attempt id is recovered from the configuration rather than from taskContext.
        TaskAttemptID attemptId = TaskAttemptID.ForName(jobConf.Get(MRJobConfig.TaskAttemptId));
        TaskAttemptContext wrappedContext = new TaskAttemptContextImpl(
            jobConf,
            attemptId,
            new Parser.WrappedStatusReporter(taskContext));

        return rrCstrMap[ident].NewInstance(
            id,
            inf.CreateRecordReader(split, wrappedContext),
            cmpcl);
    }
    catch (MemberAccessException accessFailure)
    {
        throw new IOException(accessFailure);
    }
    catch (InstantiationException instantiationFailure)
    {
        throw new IOException(instantiationFailure);
    }
    catch (TargetInvocationException invocationFailure)
    {
        throw new IOException(invocationFailure);
    }
}
/// <summary>
/// Randomize the split order, then take the specified number of keys from
/// each split sampled, where each key is selected with the specified
/// probability and possibly replaced by a subsequently selected key when
/// the quota of keys from that split is satisfied.
/// </summary>
/// <remarks>
/// The list returned by the input format is shuffled in place. Once the sample is full,
/// each further accepted key overwrites a randomly chosen slot and <c>freq</c> is scaled
/// down. <c>freq</c> is not a local of this method, so that adjustment outlives the call.
/// </remarks>
/// <param name="inf">Input format used to compute the splits and to open a reader on each one.</param>
/// <param name="job">Job supplying the configuration for split computation and key copying.</param>
/// <returns>The sampled keys, at most <c>numSamples</c> of them.</returns>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.Exception"/>
public virtual K[] GetSample(InputFormat <K, V> inf, Job job)
{
    IList<InputSplit> splits = inf.GetSplits(job);
    // ArrayList::toArray doesn't preserve type
    AList<K> samples = new AList<K>(numSamples);
    int splitsToSample = Math.Min(maxSplitsSampled, splits.Count);

    // Re-seed with a value we can log so a run can be correlated with its seed.
    Random r = new Random();
    long seed = r.NextLong();
    r.SetSeed(seed);
    Log.Debug("seed: " + seed);

    // shuffle splits
    for (int i = 0; i < splits.Count; ++i)
    {
        InputSplit tmp = splits[i];
        int j = r.Next(splits.Count);
        splits.Set(i, splits[j]);
        splits.Set(j, tmp);
    }

    // our target rate is in terms of the maximum number of sample splits,
    // but we accept the possibility of sampling additional splits to hit
    // the target sample keyset
    for (int splitIndex = 0;
         splitIndex < splitsToSample || (splitIndex < splits.Count && samples.Count < numSamples);
         ++splitIndex)
    {
        TaskAttemptContext samplingContext = new TaskAttemptContextImpl(job.GetConfiguration(), new TaskAttemptID());
        RecordReader<K, V> reader = inf.CreateRecordReader(splits[splitIndex], samplingContext);
        try
        {
            reader.Initialize(splits[splitIndex], samplingContext);
            while (reader.NextKeyValue())
            {
                if (r.NextDouble() <= freq)
                {
                    if (samples.Count < numSamples)
                    {
                        samples.AddItem(ReflectionUtils.Copy(job.GetConfiguration(), reader.GetCurrentKey(), null));
                    }
                    else
                    {
                        // When exceeding the maximum number of samples, replace a
                        // random element with this one, then adjust the frequency
                        // to reflect the possibility of existing elements being
                        // pushed out
                        int ind = r.Next(numSamples);
                        // NOTE(review): if Next(n) excludes n, this check is always true - confirm.
                        if (ind != numSamples)
                        {
                            samples.Set(ind, ReflectionUtils.Copy(job.GetConfiguration(), reader.GetCurrentKey(), null));
                        }
                        freq *= (numSamples - 1) / (double)numSamples;
                    }
                }
            }
        }
        finally
        {
            // Release the reader even when initialization or reading throws.
            reader.Close();
        }
    }

    return (K[])Sharpen.Collections.ToArray(samples);
}
/// <summary>
/// Constructs the DelegatingRecordReader by instantiating the input format named by the
/// tagged split and asking it for a reader over the split it wraps.
/// </summary>
/// <param name="split">TaggedInputSplit object; it is cast to that type unconditionally.</param>
/// <param name="context">TaskAttemptContext object</param>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.Exception"/>
public DelegatingRecordReader(InputSplit split, TaskAttemptContext context)
{
    // The tagged split carries both the input format class and the split it wraps.
    var tagged = (TaggedInputSplit)split;

    var formatClass = tagged.GetInputFormatClass();
    var conf = context.GetConfiguration();
    var delegateFormat = (InputFormat<K, V>)ReflectionUtils.NewInstance(formatClass, conf);

    var wrappedSplit = tagged.GetInputSplit();
    originalRR = delegateFormat.CreateRecordReader(wrappedSplit, context);
}
/// <summary>
/// Reads every record of a split with the given input format and returns the values.
/// </summary>
/// <param name="format">Input format that creates the record reader for the split.</param>
/// <param name="split">Split to read from start to end.</param>
/// <param name="job">Job whose configuration backs the dummy task and map contexts.</param>
/// <returns>A new <c>Text</c> built from each value, in the order the reader produced them.</returns>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.Exception"/>
private static IList <Text> ReadSplit(InputFormat <LongWritable, Text> format, InputSplit split, Job job)
{
    IList<Text> result = new AList<Text>();
    Configuration conf = job.GetConfiguration();
    TaskAttemptContext context = MapReduceTestUtil.CreateDummyMapTaskAttemptContext(conf);
    RecordReader<LongWritable, Text> reader = format.CreateRecordReader(
        split, MapReduceTestUtil.CreateDummyMapTaskAttemptContext(conf));
    try
    {
        MapContext<LongWritable, Text, LongWritable, Text> mcontext =
            new MapContextImpl<LongWritable, Text, LongWritable, Text>(
                conf, context.GetTaskAttemptID(), reader, null, null,
                MapReduceTestUtil.CreateDummyReporter(), split);
        reader.Initialize(split, mcontext);
        while (reader.NextKeyValue())
        {
            // Build a new Text per record; presumably the reader reuses its value object - confirm.
            result.AddItem(new Text(reader.GetCurrentValue()));
        }
    }
    finally
    {
        // Close the reader on every path so it is not left open.
        reader.Close();
    }

    return result;
}