/// <summary>From each split sampled, take the first numSamples / numSplits records.</summary>
/// <param name="inf">the input format whose splits are sampled</param>
/// <param name="job">the job configuration used to enumerate splits and open readers</param>
/// <returns>the collected sample keys as a typed array</returns>
/// <exception cref="System.IO.IOException"/>
public virtual K[] GetSample(InputFormat<K, V> inf, JobConf job)
{
    // ArrayList::toArray doesn't preserve type
    InputSplit[] allSplits = inf.GetSplits(job, job.GetNumMapTasks());
    AList<K> collected = new AList<K>(numSamples);
    int sampledSplitCount = Math.Min(maxSplitsSampled, allSplits.Length);
    // Stride through the split list so the sampled splits are spread evenly.
    int stride = allSplits.Length / sampledSplitCount;
    int quotaPerSplit = numSamples / sampledSplitCount;
    long seen = 0;
    for (int idx = 0; idx < sampledSplitCount; ++idx)
    {
        RecordReader<K, V> reader = inf.GetRecordReader(allSplits[idx * stride], job, Reporter.Null);
        K key = reader.CreateKey();
        V val = reader.CreateValue();
        // Take records from the front of the split until this split's share
        // of the overall cumulative quota has been reached.
        bool quotaReached = false;
        while (!quotaReached && reader.Next(key, val))
        {
            collected.AddItem(key);
            // The old-API reader reuses key objects, so allocate a fresh one
            // after each stored sample.
            key = reader.CreateKey();
            ++seen;
            quotaReached = (idx + 1) * quotaPerSplit <= seen;
        }
        reader.Close();
    }
    return (K[])Sharpen.Collections.ToArray(collected);
}
/// <summary>From each split sampled, take the first numSamples / numSplits records.</summary>
/// <param name="inf">the input format whose splits are sampled</param>
/// <param name="job">the job whose configuration drives split listing and key copying</param>
/// <returns>the collected sample keys as a typed array</returns>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.Exception"/>
public virtual K[] GetSample(InputFormat<K, V> inf, Job job)
{
    // ArrayList::toArray doesn't preserve type
    IList<InputSplit> allSplits = inf.GetSplits(job);
    AList<K> collected = new AList<K>(numSamples);
    int sampledSplitCount = Math.Min(maxSplitsSampled, allSplits.Count);
    int quotaPerSplit = numSamples / sampledSplitCount;
    long seen = 0;
    for (int idx = 0; idx < sampledSplitCount; ++idx)
    {
        // Each split gets its own throwaway task-attempt context for sampling.
        TaskAttemptContext ctx = new TaskAttemptContextImpl(job.GetConfiguration(), new TaskAttemptID());
        RecordReader<K, V> reader = inf.CreateRecordReader(allSplits[idx], ctx);
        reader.Initialize(allSplits[idx], ctx);
        // Take records from the front of the split until this split's share
        // of the overall cumulative quota has been reached.
        bool quotaReached = false;
        while (!quotaReached && reader.NextKeyValue())
        {
            // The new-API reader may reuse the key object, so store a copy.
            collected.AddItem(ReflectionUtils.Copy(job.GetConfiguration(), reader.GetCurrentKey(), null));
            ++seen;
            quotaReached = (idx + 1) * quotaPerSplit <= seen;
        }
        reader.Close();
    }
    return (K[])Sharpen.Collections.ToArray(collected);
}
/// <summary>
/// Builds splits for a multiple-inputs job (old mapred API): paths are grouped
/// first by InputFormat, then by Mapper, and each group's splits are computed
/// with that group's paths set on a copied JobConf, then wrapped in a
/// TaggedInputSplit carrying the format and mapper to use for that split.
/// </summary>
/// <param name="conf">the job configuration holding the per-path format/mapper maps</param>
/// <param name="numSplits">the suggested split count forwarded to each delegate format</param>
/// <returns>all tagged splits across every (format, mapper, paths) group</returns>
/// <exception cref="System.IO.IOException"/>
public virtual InputSplit[] GetSplits(JobConf conf, int numSplits)
{
    // Work on a copy so the per-group SetInputPaths below never mutates
    // the caller's configuration.
    JobConf confCopy = new JobConf(conf);
    IList<InputSplit> splits = new AList<InputSplit>();
    IDictionary<Path, InputFormat> formatMap = MultipleInputs.GetInputFormatMap(conf);
    IDictionary<Path, Type> mapperMap = MultipleInputs.GetMapperTypeMap(conf);
    IDictionary<Type, IList<Path>> formatPaths = new Dictionary<Type, IList<Path>>();
    // First, build a map of InputFormats to Paths
    foreach (KeyValuePair<Path, InputFormat> entry in formatMap)
    {
        if (!formatPaths.Contains(entry.Value.GetType()))
        {
            formatPaths[entry.Value.GetType()] = new List<Path>();
        }
        formatPaths[entry.Value.GetType()].AddItem(entry.Key);
    }
    foreach (KeyValuePair<Type, IList<Path>> formatEntry in formatPaths)
    {
        Type formatClass = formatEntry.Key;
        // One reflective instance of the format serves all its paths.
        InputFormat format = (InputFormat)ReflectionUtils.NewInstance(formatClass, conf);
        IList<Path> paths = formatEntry.Value;
        IDictionary<Type, IList<Path>> mapperPaths = new Dictionary<Type, IList<Path>>();
        // Now, for each set of paths that have a common InputFormat, build
        // a map of Mappers to the paths they're used for
        foreach (Path path in paths)
        {
            Type mapperClass = mapperMap[path];
            if (!mapperPaths.Contains(mapperClass))
            {
                mapperPaths[mapperClass] = new List<Path>();
            }
            mapperPaths[mapperClass].AddItem(path);
        }
        // Now each set of paths that has a common InputFormat and Mapper can
        // be added to the same job, and split together.
        foreach (KeyValuePair<Type, IList<Path>> mapEntry in mapperPaths)
        {
            paths = mapEntry.Value;
            Type mapperClass = mapEntry.Key;
            if (mapperClass == null)
            {
                // No per-path mapper registered: fall back to the job's default mapper.
                mapperClass = conf.GetMapperClass();
            }
            FileInputFormat.SetInputPaths(confCopy, Sharpen.Collections.ToArray(paths, new Path[paths.Count]));
            // Get splits for each input path and tag with InputFormat
            // and Mapper types by wrapping in a TaggedInputSplit.
            InputSplit[] pathSplits = format.GetSplits(confCopy, numSplits);
            foreach (InputSplit pathSplit in pathSplits)
            {
                splits.AddItem(new TaggedInputSplit(pathSplit, conf, format.GetType(), mapperClass));
            }
        }
    }
    return (Sharpen.Collections.ToArray(splits, new InputSplit[splits.Count]));
}
/// <summary>
/// Randomize the split order, then take the specified number of keys from
/// each split sampled, where each key is selected with the specified
/// probability and possibly replaced by a subsequently selected key when
/// the quota of keys from that split is satisfied.
/// </summary>
/// <param name="inf">the input format whose splits are sampled</param>
/// <param name="job">the job whose configuration drives split listing and key copying</param>
/// <returns>up to numSamples randomly selected keys as a typed array</returns>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.Exception"/>
public virtual K[] GetSample(InputFormat<K, V> inf, Job job)
{
    // ArrayList::toArray doesn't preserve type
    IList<InputSplit> splits = inf.GetSplits(job);
    AList<K> samples = new AList<K>(numSamples);
    int splitsToSample = Math.Min(maxSplitsSampled, splits.Count);
    Random r = new Random();
    // Draw a seed, then re-seed with it so the shuffle/selection sequence
    // can be reproduced from the logged value.
    long seed = r.NextLong();
    r.SetSeed(seed);
    Log.Debug("seed: " + seed);
    // shuffle splits
    for (int i = 0; i < splits.Count; ++i)
    {
        InputSplit tmp = splits[i];
        int j = r.Next(splits.Count);
        splits.Set(i, splits[j]);
        splits.Set(j, tmp);
    }
    // our target rate is in terms of the maximum number of sample splits,
    // but we accept the possibility of sampling additional splits to hit
    // the target sample keyset
    for (int i_1 = 0; i_1 < splitsToSample || (i_1 < splits.Count && samples.Count < numSamples); ++i_1)
    {
        TaskAttemptContext samplingContext = new TaskAttemptContextImpl(job.GetConfiguration(), new TaskAttemptID());
        RecordReader<K, V> reader = inf.CreateRecordReader(splits[i_1], samplingContext);
        reader.Initialize(splits[i_1], samplingContext);
        while (reader.NextKeyValue())
        {
            // Each record is selected independently with probability freq.
            if (r.NextDouble() <= freq)
            {
                if (samples.Count < numSamples)
                {
                    // Copy the key: the reader may reuse the underlying object.
                    samples.AddItem(ReflectionUtils.Copy(job.GetConfiguration(), reader.GetCurrentKey(), null));
                }
                else
                {
                    // When exceeding the maximum number of samples, replace a
                    // random element with this one, then adjust the frequency
                    // to reflect the possibility of existing elements being
                    // pushed out
                    int ind = r.Next(numSamples);
                    // NOTE(review): r.Next(numSamples) is always < numSamples, so this
                    // guard is always true — looks vestigial; confirm against upstream.
                    if (ind != numSamples)
                    {
                        samples.Set(ind, ReflectionUtils.Copy(job.GetConfiguration(), reader.GetCurrentKey(), null));
                    }
                    freq *= (numSamples - 1) / (double)numSamples;
                }
            }
        }
        reader.Close();
    }
    return ((K[])Sharpen.Collections.ToArray(samples));
}
/// <summary>
/// Randomize the split order, then take the specified number of keys from
/// each split sampled, where each key is selected with the specified
/// probability and possibly replaced by a subsequently selected key when
/// the quota of keys from that split is satisfied.
/// </summary>
/// <param name="inf">the input format whose splits are sampled</param>
/// <param name="job">the job configuration used to enumerate splits and open readers</param>
/// <returns>up to numSamples randomly selected keys as a typed array</returns>
/// <exception cref="System.IO.IOException"/>
public virtual K[] GetSample(InputFormat<K, V> inf, JobConf job)
{
    // ArrayList::toArray doesn't preserve type
    InputSplit[] splits = inf.GetSplits(job, job.GetNumMapTasks());
    AList<K> samples = new AList<K>(numSamples);
    int splitsToSample = Math.Min(maxSplitsSampled, splits.Length);
    Random r = new Random();
    // Draw a seed, then re-seed with it so the shuffle/selection sequence
    // can be reproduced from the logged value.
    long seed = r.NextLong();
    r.SetSeed(seed);
    Log.Debug("seed: " + seed);
    // shuffle splits
    for (int i = 0; i < splits.Length; ++i)
    {
        InputSplit tmp = splits[i];
        int j = r.Next(splits.Length);
        splits[i] = splits[j];
        splits[j] = tmp;
    }
    // our target rate is in terms of the maximum number of sample splits,
    // but we accept the possibility of sampling additional splits to hit
    // the target sample keyset
    for (int i_1 = 0; i_1 < splitsToSample || (i_1 < splits.Length && samples.Count < numSamples); ++i_1)
    {
        RecordReader<K, V> reader = inf.GetRecordReader(splits[i_1], job, Reporter.Null);
        K key = reader.CreateKey();
        V value = reader.CreateValue();
        while (reader.Next(key, value))
        {
            // Each record is selected independently with probability freq.
            if (r.NextDouble() <= freq)
            {
                if (samples.Count < numSamples)
                {
                    samples.AddItem(key);
                }
                else
                {
                    // When exceeding the maximum number of samples, replace a
                    // random element with this one, then adjust the frequency
                    // to reflect the possibility of existing elements being
                    // pushed out
                    int ind = r.Next(numSamples);
                    // NOTE(review): r.Next(numSamples) is always < numSamples, so this
                    // guard is always true — looks vestigial; confirm against upstream.
                    if (ind != numSamples)
                    {
                        samples.Set(ind, key);
                    }
                    freq *= (numSamples - 1) / (double)numSamples;
                }
                // Only allocate a fresh key when the current one was retained;
                // the old-API reader otherwise reuses the same object.
                key = reader.CreateKey();
            }
        }
        reader.Close();
    }
    return ((K[])Sharpen.Collections.ToArray(samples));
}
/// <summary>
/// Validates map phase progress after each record is processed by map task
/// using custom task reporter.
/// </summary>
/// <remarks>
/// Sets up a local job with zero reducers, writes split files for the test
/// input, then runs a TestMapTask per split meta-info entry against the fake
/// umbilical before cleaning up the test root directory.
/// </remarks>
/// <exception cref="System.Exception"/>
public virtual void TestMapProgress()
{
    JobConf job = new JobConf();
    fs = FileSystem.GetLocal(job);
    Path rootDir = new Path(TestRootDir);
    CreateInputFile(rootDir);
    // Map-only job: progress should come entirely from the map phase.
    job.SetNumReduceTasks(0);
    TaskAttemptID taskId = ((TaskAttemptID)TaskAttemptID.ForName("attempt_200907082313_0424_m_000000_0"));
    job.SetClass("mapreduce.job.outputformat.class", typeof(NullOutputFormat), typeof(OutputFormat));
    job.Set(FileInputFormat.InputDir, TestRootDir);
    jobId = ((JobID)taskId.GetJobID());
    JobContext jContext = new JobContextImpl(job, jobId);
    InputFormat<object, object> input = ReflectionUtils.NewInstance(jContext.GetInputFormatClass(), job);
    IList<InputSplit> splits = input.GetSplits(jContext);
    // Persist the splits so the map task can read its assigned split back.
    JobSplitWriter.CreateSplitFiles(new Path(TestRootDir), job, new Path(TestRootDir).GetFileSystem(job), splits);
    JobSplit.TaskSplitMetaInfo[] splitMetaInfo = SplitMetaInfoReader.ReadSplitMetaInfo(jobId, fs, job, new Path(TestRootDir));
    job.SetUseNewMapper(true);
    // use new api
    for (int i = 0; i < splitMetaInfo.Length; i++)
    {
        // rawSplits.length is 1
        // NOTE(review): the job.xml path below concatenates system dir + jobId +
        // "job.xml" with no separator — verify this matches what TestMapTask expects.
        map = new TestMapProgress.TestMapTask(this, job.Get(JTConfig.JtSystemDir, "/tmp/hadoop/mapred/system") + jobId + "job.xml", taskId, i, splitMetaInfo[i].GetSplitIndex(), 1);
        JobConf localConf = new JobConf(job);
        map.LocalizeConfiguration(localConf);
        map.SetConf(localConf);
        map.Run(localConf, fakeUmbilical);
    }
    // clean up
    fs.Delete(rootDir, true);
}
/// <summary>
/// Builds splits for a multiple-inputs job (new mapreduce API): paths are
/// grouped first by InputFormat, then by Mapper, and each group's splits are
/// computed with that group's paths set on a copied Job, then wrapped in a
/// TaggedInputSplit carrying the format and mapper to use for that split.
/// </summary>
/// <param name="job">the job context holding the per-path format/mapper maps</param>
/// <returns>all tagged splits across every (format, mapper, paths) group</returns>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.Exception"/>
public override IList<InputSplit> GetSplits(JobContext job)
{
    Configuration conf = job.GetConfiguration();
    // Work on a job copy so the per-group SetInputPaths below never mutates
    // the caller's configuration.
    Job jobCopy = Job.GetInstance(conf);
    IList<InputSplit> splits = new AList<InputSplit>();
    IDictionary<Path, InputFormat> formatMap = MultipleInputs.GetInputFormatMap(job);
    IDictionary<Path, Type> mapperMap = MultipleInputs.GetMapperTypeMap(job);
    IDictionary<Type, IList<Path>> formatPaths = new Dictionary<Type, IList<Path>>();
    // First, build a map of InputFormats to Paths
    foreach (KeyValuePair<Path, InputFormat> entry in formatMap)
    {
        if (!formatPaths.Contains(entry.Value.GetType()))
        {
            formatPaths[entry.Value.GetType()] = new List<Path>();
        }
        formatPaths[entry.Value.GetType()].AddItem(entry.Key);
    }
    foreach (KeyValuePair<Type, IList<Path>> formatEntry in formatPaths)
    {
        Type formatClass = formatEntry.Key;
        // One reflective instance of the format serves all its paths.
        InputFormat format = (InputFormat)ReflectionUtils.NewInstance(formatClass, conf);
        IList<Path> paths = formatEntry.Value;
        IDictionary<Type, IList<Path>> mapperPaths = new Dictionary<Type, IList<Path>>();
        // Now, for each set of paths that have a common InputFormat, build
        // a map of Mappers to the paths they're used for
        foreach (Path path in paths)
        {
            Type mapperClass = mapperMap[path];
            if (!mapperPaths.Contains(mapperClass))
            {
                mapperPaths[mapperClass] = new List<Path>();
            }
            mapperPaths[mapperClass].AddItem(path);
        }
        // Now each set of paths that has a common InputFormat and Mapper can
        // be added to the same job, and split together.
        foreach (KeyValuePair<Type, IList<Path>> mapEntry in mapperPaths)
        {
            paths = mapEntry.Value;
            Type mapperClass = mapEntry.Key;
            if (mapperClass == null)
            {
                try
                {
                    // No per-path mapper registered: fall back to the job's default mapper.
                    mapperClass = job.GetMapperClass();
                }
                catch (TypeLoadException e)
                {
                    throw new IOException("Mapper class is not found", e);
                }
            }
            FileInputFormat.SetInputPaths(jobCopy, Sharpen.Collections.ToArray(paths, new Path[paths.Count]));
            // Get splits for each input path and tag with InputFormat
            // and Mapper types by wrapping in a TaggedInputSplit.
            IList<InputSplit> pathSplits = format.GetSplits(jobCopy);
            foreach (InputSplit pathSplit in pathSplits)
            {
                splits.AddItem(new TaggedInputSplit(pathSplit, conf, format.GetType(), mapperClass));
            }
        }
    }
    return (splits);
}
/// <summary>
/// Delegates split computation to the wrapped input format, handing it a
/// configuration derived from the incoming job via GetConf.
/// </summary>
/// <param name="job">the incoming job configuration</param>
/// <param name="numSplits">the suggested split count forwarded to the delegate</param>
/// <returns>whatever splits the wrapped format produces</returns>
/// <exception cref="System.IO.IOException"/>
public override InputSplit[] GetSplits(JobConf job, int numSplits)
{
    var delegateConf = GetConf(job);
    return inf.GetSplits(delegateConf, numSplits);
}
/// <summary>
/// Delegates split computation to the wrapped input format, building a fresh
/// job context from the derived configuration and the original job ID.
/// </summary>
/// <param name="context">the incoming job context</param>
/// <returns>whatever splits the wrapped format produces</returns>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.Exception"/>
public override IList GetSplits(JobContext context)
{
    var delegateConf = GetConf(context.GetConfiguration());
    var delegateContext = new JobContextImpl(delegateConf, context.GetJobID());
    return inf.GetSplits(delegateContext);
}