/// <summary>
/// Computes the input splits for every path registered through
/// <c>MultipleInputs</c>. Paths are grouped first by InputFormat type and
/// then by Mapper type; each group is split in one call to its InputFormat,
/// and every resulting split is wrapped in a <c>TaggedInputSplit</c> that
/// records the InputFormat and Mapper types.
/// </summary>
/// <param name="conf">Job configuration the path-to-format and path-to-mapper maps are read from.</param>
/// <param name="numSplits">Requested number of splits, passed unchanged to each underlying InputFormat.</param>
/// <returns>The tagged splits of all InputFormat/Mapper groups, in one array.</returns>
/// <exception cref="System.IO.IOException"/>
public virtual InputSplit[] GetSplits(JobConf conf, int numSplits)
{
    JobConf confCopy = new JobConf(conf);
    IList<InputSplit> splits = new AList<InputSplit>();
    IDictionary<Path, InputFormat> formatMap = MultipleInputs.GetInputFormatMap(conf);
    IDictionary<Path, Type> mapperMap = MultipleInputs.GetMapperTypeMap(conf);
    IDictionary<Type, IList<Path>> formatPaths = new Dictionary<Type, IList<Path>>();

    // First, build a map of InputFormats to Paths
    foreach (KeyValuePair<Path, InputFormat> entry in formatMap)
    {
        Type formatType = entry.Value.GetType();
        IList<Path> pathsForFormat;
        if (!formatPaths.TryGetValue(formatType, out pathsForFormat))
        {
            pathsForFormat = new List<Path>();
            formatPaths[formatType] = pathsForFormat;
        }
        pathsForFormat.Add(entry.Key);
    }

    foreach (KeyValuePair<Type, IList<Path>> formatEntry in formatPaths)
    {
        Type formatClass = formatEntry.Key;
        InputFormat format = (InputFormat)ReflectionUtils.NewInstance(formatClass, conf);

        // Now, for each set of paths that have a common InputFormat, build
        // a map of Mappers to the paths they're used for
        IDictionary<Type, IList<Path>> mapperPaths = new Dictionary<Type, IList<Path>>();

        // Paths with no registered Mapper are collected separately: a null
        // key cannot be stored in a Dictionary, and indexing the mapper map
        // with an absent path would throw instead of yielding null.
        IList<Path> unmappedPaths = new List<Path>();

        foreach (Path path in formatEntry.Value)
        {
            Type mapperClass;
            if (!mapperMap.TryGetValue(path, out mapperClass) || mapperClass == null)
            {
                unmappedPaths.Add(path);
                continue;
            }

            IList<Path> pathsForMapper;
            if (!mapperPaths.TryGetValue(mapperClass, out pathsForMapper))
            {
                pathsForMapper = new List<Path>();
                mapperPaths[mapperClass] = pathsForMapper;
            }
            pathsForMapper.Add(path);
        }

        // Now each set of paths that has a common InputFormat and Mapper can
        // be added to the same job, and split together.
        List<KeyValuePair<Type, IList<Path>>> groups =
            new List<KeyValuePair<Type, IList<Path>>>(mapperPaths);
        if (unmappedPaths.Count > 0)
        {
            // Paths without an explicit Mapper fall back to the job's Mapper
            // but stay in their own group, split independently of any paths
            // explicitly mapped to that same class.
            groups.Add(new KeyValuePair<Type, IList<Path>>(conf.GetMapperClass(), unmappedPaths));
        }

        foreach (KeyValuePair<Type, IList<Path>> group in groups)
        {
            Type groupMapper = group.Key;
            IList<Path> groupPaths = group.Value;
            FileInputFormat.SetInputPaths(confCopy,
                Sharpen.Collections.ToArray(groupPaths, new Path[groupPaths.Count]));

            // Get splits for each input path and tag with InputFormat
            // and Mapper types by wrapping in a TaggedInputSplit.
            InputSplit[] pathSplits = format.GetSplits(confCopy, numSplits);
            foreach (InputSplit pathSplit in pathSplits)
            {
                splits.Add(new TaggedInputSplit(pathSplit, conf, format.GetType(), groupMapper));
            }
        }
    }

    return Sharpen.Collections.ToArray(splits, new InputSplit[splits.Count]);
}
/// <summary>
/// Registers two input paths with different InputFormat types and checks
/// that the format map read back from the configuration holds an instance
/// of the expected type for each path.
/// </summary>
public virtual void TestAddInputPathWithFormat()
{
    JobConf jobConf = new JobConf();
    Path fooPath = new Path("/foo");
    Path barPath = new Path("/bar");
    MultipleInputs.AddInputPath(jobConf, fooPath, typeof(TextInputFormat));
    MultipleInputs.AddInputPath(jobConf, barPath, typeof(KeyValueTextInputFormat));

    IDictionary<Path, InputFormat> formatsByPath = MultipleInputs.GetInputFormatMap(jobConf);

    // Look each entry up with a fresh Path instance rather than the one
    // used at registration.
    InputFormat fooFormat = formatsByPath[new Path("/foo")];
    NUnit.Framework.Assert.AreEqual(typeof(TextInputFormat), fooFormat.GetType());

    InputFormat barFormat = formatsByPath[new Path("/bar")];
    NUnit.Framework.Assert.AreEqual(typeof(KeyValueTextInputFormat), barFormat.GetType());
}
/// <summary>
/// Starts a four-datanode mini DFS cluster, registers four input paths
/// covering three distinct InputFormat/Mapper combinations, and verifies
/// that <c>DelegatingInputFormat.GetSplits</c> returns only
/// <c>TaggedInputSplit</c> instances, with exactly the requested number of
/// splits for each combination.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestSplitting()
{
    JobConf jobConf = new JobConf();
    MiniDFSCluster cluster = null;
    try
    {
        string[] racks = new string[] { "/rack0", "/rack0", "/rack1", "/rack1" };
        string[] hosts = new string[] { "host0", "host1", "host2", "host3" };
        cluster = new MiniDFSCluster.Builder(jobConf)
            .NumDataNodes(4)
            .Racks(racks)
            .Hosts(hosts)
            .Build();
        FileSystem fileSystem = cluster.GetFileSystem();

        Path fooBar = GetPath("/foo/bar", fileSystem);
        Path fooBaz = GetPath("/foo/baz", fileSystem);
        Path barBar = GetPath("/bar/bar", fileSystem);
        Path barBaz = GetPath("/bar/baz", fileSystem);
        int requestedSplits = 100;

        MultipleInputs.AddInputPath(jobConf, fooBar, typeof(TextInputFormat),
            typeof(TestDelegatingInputFormat.MapClass));
        MultipleInputs.AddInputPath(jobConf, fooBaz, typeof(TextInputFormat),
            typeof(TestDelegatingInputFormat.MapClass2));
        MultipleInputs.AddInputPath(jobConf, barBar, typeof(KeyValueTextInputFormat),
            typeof(TestDelegatingInputFormat.MapClass));
        MultipleInputs.AddInputPath(jobConf, barBaz, typeof(TextInputFormat),
            typeof(TestDelegatingInputFormat.MapClass2));

        DelegatingInputFormat delegating = new DelegatingInputFormat();
        InputSplit[] allSplits = delegating.GetSplits(jobConf, requestedSplits);

        // One counter per InputFormat/Mapper combination.
        int[] splitsPerCombination = new int[3];
        foreach (InputSplit candidate in allSplits)
        {
            NUnit.Framework.Assert.IsTrue(candidate is TaggedInputSplit);
            TaggedInputSplit tagged = (TaggedInputSplit)candidate;

            int slot;
            if (tagged.GetInputFormatClass().Equals(typeof(KeyValueTextInputFormat)))
            {
                // /bar/bar
                slot = 0;
            }
            else if (tagged.GetMapperClass().Equals(typeof(TestDelegatingInputFormat.MapClass)))
            {
                // /foo/bar
                slot = 1;
            }
            else
            {
                // /foo/baz and /bar/baz
                slot = 2;
            }
            splitsPerCombination[slot] += 1;
        }

        // Each slot is a unique combination of a Mapper and InputFormat, and
        // DelegatingInputFormat should split each combination into the
        // requested number of splits, regardless of how many paths use it.
        foreach (int observed in splitsPerCombination)
        {
            NUnit.Framework.Assert.AreEqual(requestedSplits, observed);
        }
        NUnit.Framework.Assert.IsTrue(true);
    }
    finally
    {
        if (cluster != null)
        {
            cluster.Shutdown();
        }
    }
}