/// <summary>
/// Builds the input splits for every path registered via <c>MultipleInputs</c>.
/// Paths are grouped first by the runtime type of their <c>InputFormat</c> and then
/// by their mapper type; each group is split together by a fresh instance of that
/// format, and every resulting split is wrapped in a <c>TaggedInputSplit</c> that
/// records the format type and mapper type it belongs to.
/// </summary>
/// <param name="conf">
/// Job configuration the path-to-format and path-to-mapper maps are read from. It is
/// not modified here; input paths are set on a private copy.
/// </param>
/// <param name="numSplits">
/// Value forwarded unchanged to each underlying <c>InputFormat.GetSplits</c> call.
/// </param>
/// <returns>All tagged splits, in the order the groups were processed.</returns>
/// <exception cref="System.IO.IOException"/>
public virtual InputSplit[] GetSplits(JobConf conf, int numSplits)
{
    JobConf confCopy = new JobConf(conf);
    IList<InputSplit> splits = new AList<InputSplit>();
    IDictionary<Path, InputFormat> formatMap = MultipleInputs.GetInputFormatMap(conf);
    IDictionary<Path, Type> mapperMap = MultipleInputs.GetMapperTypeMap(conf);
    IDictionary<Type, IList<Path>> formatPaths = new Dictionary<Type, IList<Path>>();

    // First, build a map of InputFormats to Paths
    foreach (KeyValuePair<Path, InputFormat> entry in formatMap)
    {
        Type formatType = entry.Value.GetType();
        IList<Path> pathsForFormat;
        if (!formatPaths.TryGetValue(formatType, out pathsForFormat))
        {
            pathsForFormat = new List<Path>();
            formatPaths[formatType] = pathsForFormat;
        }
        pathsForFormat.Add(entry.Key);
    }

    foreach (KeyValuePair<Type, IList<Path>> formatEntry in formatPaths)
    {
        Type formatClass = formatEntry.Key;
        InputFormat format = (InputFormat)ReflectionUtils.NewInstance(formatClass, conf);

        // Now, for each set of paths that have a common InputFormat, build
        // a map of Mappers to the paths they're used for.
        IDictionary<Type, IList<Path>> mapperPaths = new Dictionary<Type, IList<Path>>();

        // Paths with no mapper registered are kept apart: a null Type cannot be
        // used as a Dictionary key, and an indexer read on a missing key throws.
        IList<Path> unmappedPaths = new List<Path>();

        foreach (Path path in formatEntry.Value)
        {
            Type mapperClass;
            if (!mapperMap.TryGetValue(path, out mapperClass) || mapperClass == null)
            {
                unmappedPaths.Add(path);
                continue;
            }

            IList<Path> pathsForMapper;
            if (!mapperPaths.TryGetValue(mapperClass, out pathsForMapper))
            {
                pathsForMapper = new List<Path>();
                mapperPaths[mapperClass] = pathsForMapper;
            }
            pathsForMapper.Add(path);
        }

        // Flatten into one list of (mapper, paths) groups so the mapper-less paths
        // can be handled by the same loop, tagged with the job's configured mapper.
        List<KeyValuePair<Type, IList<Path>>> mapperGroups =
            new List<KeyValuePair<Type, IList<Path>>>(mapperPaths);
        if (unmappedPaths.Count > 0)
        {
            mapperGroups.Add(
                new KeyValuePair<Type, IList<Path>>(conf.GetMapperClass(), unmappedPaths));
        }

        // Now each set of paths that has a common InputFormat and Mapper can
        // be added to the same job, and split together.
        foreach (KeyValuePair<Type, IList<Path>> mapEntry in mapperGroups)
        {
            IList<Path> groupPaths = mapEntry.Value;
            Type mapperClass = mapEntry.Key;

            FileInputFormat.SetInputPaths(
                confCopy, Sharpen.Collections.ToArray(groupPaths, new Path[groupPaths.Count]));

            // Get splits for each input path and tag with InputFormat
            // and Mapper types by wrapping in a TaggedInputSplit.
            InputSplit[] pathSplits = format.GetSplits(confCopy, numSplits);
            foreach (InputSplit pathSplit in pathSplits)
            {
                splits.Add(new TaggedInputSplit(pathSplit, conf, format.GetType(), mapperClass));
            }
        }
    }

    return Sharpen.Collections.ToArray(splits, new InputSplit[splits.Count]);
}
/// <summary>
/// Registers two input paths, each with its own input format and mapper type, then
/// checks that the format map and mapper map read back from the configuration
/// return those same types when looked up with newly constructed, equal paths.
/// </summary>
public virtual void TestAddInputPathWithMapper()
{
    JobConf jobConf = new JobConf();

    // Register "/foo" and "/bar" with distinct format/mapper pairs.
    MultipleInputs.AddInputPath(
        jobConf,
        new Path("/foo"),
        typeof(TextInputFormat),
        typeof(TestMultipleInputs.MapClass));
    MultipleInputs.AddInputPath(
        jobConf,
        new Path("/bar"),
        typeof(KeyValueTextInputFormat),
        typeof(TestMultipleInputs.MapClass2));

    IDictionary<Path, InputFormat> formatsByPath = MultipleInputs.GetInputFormatMap(jobConf);
    IDictionary<Path, Type> mappersByPath = MultipleInputs.GetMapperTypeMap(jobConf);

    // Each lookup uses a fresh Path instance, exactly as the registrations did.
    Type fooFormatType = formatsByPath[new Path("/foo")].GetType();
    NUnit.Framework.Assert.AreEqual(typeof(TextInputFormat), fooFormatType);

    Type barFormatType = formatsByPath[new Path("/bar")].GetType();
    NUnit.Framework.Assert.AreEqual(typeof(KeyValueTextInputFormat), barFormatType);

    Type fooMapperType = mappersByPath[new Path("/foo")];
    NUnit.Framework.Assert.AreEqual(typeof(TestMultipleInputs.MapClass), fooMapperType);

    Type barMapperType = mappersByPath[new Path("/bar")];
    NUnit.Framework.Assert.AreEqual(typeof(TestMultipleInputs.MapClass2), barMapperType);
}