/// <summary>
/// Registers two input paths, each with its own input format, and checks that
/// the format map read back from the job reports the expected format type for
/// each path.
/// </summary>
/// <exception cref="System.IO.IOException"/>
public virtual void TestAddInputPathWithFormat()
{
    Job job = Job.GetInstance();

    MultipleInputs.AddInputPath(job, new Path("/foo"), typeof(TextInputFormat));
    MultipleInputs.AddInputPath(job, new Path("/bar"), typeof(KeyValueTextInputFormat));

    IDictionary<Path, InputFormat> formatsByPath = MultipleInputs.GetInputFormatMap(job);

    // Each lookup uses a freshly constructed Path rather than the instance
    // that was registered above.
    InputFormat fooFormat = formatsByPath[new Path("/foo")];
    NUnit.Framework.Assert.AreEqual(typeof(TextInputFormat), fooFormat.GetType());

    InputFormat barFormat = formatsByPath[new Path("/bar")];
    NUnit.Framework.Assert.AreEqual(typeof(KeyValueTextInputFormat), barFormat.GetType());
}
/// <summary>
/// Computes the input splits for every path registered through
/// <c>MultipleInputs</c>. Paths are grouped first by input format type and then
/// by mapper type; each group is split together by an instance of its input
/// format, and every resulting split is wrapped in a <c>TaggedInputSplit</c>
/// that records the input format type and mapper type it belongs to.
/// </summary>
/// <param name="job">Job context supplying the configuration and the path mappings.</param>
/// <returns>The tagged splits of all groups, in the order the groups were processed.</returns>
/// <exception cref="System.IO.IOException">
/// Thrown when the job's default mapper class cannot be loaded.
/// </exception>
/// <exception cref="System.Exception"/>
public override IList<InputSplit> GetSplits(JobContext job)
{
    Configuration conf = job.GetConfiguration();
    Job jobCopy = Job.GetInstance(conf);
    IList<InputSplit> splits = new AList<InputSplit>();
    IDictionary<Path, InputFormat> formatMap = MultipleInputs.GetInputFormatMap(job);
    IDictionary<Path, Type> mapperMap = MultipleInputs.GetMapperTypeMap(job);
    IDictionary<Type, IList<Path>> formatPaths = new Dictionary<Type, IList<Path>>();

    // First, build a map of InputFormats to Paths
    foreach (KeyValuePair<Path, InputFormat> entry in formatMap)
    {
        Type formatType = entry.Value.GetType();
        IList<Path> pathsForFormat;
        if (!formatPaths.TryGetValue(formatType, out pathsForFormat))
        {
            pathsForFormat = new List<Path>();
            formatPaths[formatType] = pathsForFormat;
        }
        pathsForFormat.Add(entry.Key);
    }

    foreach (KeyValuePair<Type, IList<Path>> formatEntry in formatPaths)
    {
        Type formatClass = formatEntry.Key;
        InputFormat format = (InputFormat)ReflectionUtils.NewInstance(formatClass, conf);

        // Now, for each set of paths that have a common InputFormat, build
        // a map of Mappers to the paths they're used for
        IDictionary<Type, IList<Path>> mapperPaths = new Dictionary<Type, IList<Path>>();

        // Paths without a registered mapper are collected separately: the
        // dictionary indexer throws on a missing key and Dictionary rejects a
        // null key, so they cannot be stored under "null" in mapperPaths.
        IList<Path> unmappedPaths = new List<Path>();
        foreach (Path path in formatEntry.Value)
        {
            Type mapperClass;
            if (!mapperMap.TryGetValue(path, out mapperClass) || mapperClass == null)
            {
                unmappedPaths.Add(path);
                continue;
            }

            IList<Path> pathsForMapper;
            if (!mapperPaths.TryGetValue(mapperClass, out pathsForMapper))
            {
                pathsForMapper = new List<Path>();
                mapperPaths[mapperClass] = pathsForMapper;
            }
            pathsForMapper.Add(path);
        }

        // Assemble the groups to split: the mapper-less group (null mapper
        // type) followed by one group per explicitly registered mapper.
        IList<KeyValuePair<Type, IList<Path>>> groups = new List<KeyValuePair<Type, IList<Path>>>();
        if (unmappedPaths.Count > 0)
        {
            groups.Add(new KeyValuePair<Type, IList<Path>>(null, unmappedPaths));
        }
        foreach (KeyValuePair<Type, IList<Path>> mapperEntry in mapperPaths)
        {
            groups.Add(mapperEntry);
        }

        // Now each set of paths that has a common InputFormat and Mapper can
        // be added to the same job, and split together.
        foreach (KeyValuePair<Type, IList<Path>> mapEntry in groups)
        {
            IList<Path> paths = mapEntry.Value;
            Type mapperClass = mapEntry.Key;
            if (mapperClass == null)
            {
                // No mapper was registered for these paths: fall back to the
                // mapper configured on the job itself.
                try
                {
                    mapperClass = job.GetMapperClass();
                }
                catch (TypeLoadException e)
                {
                    throw new IOException("Mapper class is not found", e);
                }
            }

            FileInputFormat.SetInputPaths(jobCopy, Sharpen.Collections.ToArray(paths, new Path[paths.Count]));

            // Get splits for each input path and tag with InputFormat
            // and Mapper types by wrapping in a TaggedInputSplit.
            IList<InputSplit> pathSplits = format.GetSplits(jobCopy);
            foreach (InputSplit pathSplit in pathSplits)
            {
                splits.Add(new TaggedInputSplit(pathSplit, conf, format.GetType(), mapperClass));
            }
        }
    }

    return splits;
}
/// <summary>
/// Starts a four-datanode MiniDFSCluster, registers four input paths with
/// different input format / mapper combinations, and checks how many tagged
/// splits DelegatingInputFormat produces for each combination.
/// The maximum split size is set to the length of the first file divided by
/// <c>numSplits</c>; the assertions expect <c>numSplits</c> splits for each
/// single-path combination and twice that for the combination shared by two paths.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestSplitting()
{
    Job job = Job.GetInstance();
    MiniDFSCluster dfs = null;
    try
    {
        dfs = new MiniDFSCluster.Builder(job.GetConfiguration())
            .NumDataNodes(4)
            .Racks(new string[] { "/rack0", "/rack0", "/rack1", "/rack1" })
            .Hosts(new string[] { "host0", "host1", "host2", "host3" })
            .Build();
        FileSystem fs = dfs.GetFileSystem();

        // GetPath is defined outside this block; presumably it creates the
        // file on the given file system - confirm against its definition.
        Path path = GetPath("/foo/bar", fs);
        Path path2 = GetPath("/foo/baz", fs);
        Path path3 = GetPath("/bar/bar", fs);
        Path path4 = GetPath("/bar/baz", fs);

        int numSplits = 100;
        FileInputFormat.SetMaxInputSplitSize(job, fs.GetFileStatus(path).GetLen() / numSplits);

        MultipleInputs.AddInputPath(job, path, typeof(TextInputFormat), typeof(TestDelegatingInputFormat.MapClass));
        MultipleInputs.AddInputPath(job, path2, typeof(TextInputFormat), typeof(TestDelegatingInputFormat.MapClass2));
        MultipleInputs.AddInputPath(job, path3, typeof(KeyValueTextInputFormat), typeof(TestDelegatingInputFormat.MapClass));
        MultipleInputs.AddInputPath(job, path4, typeof(TextInputFormat), typeof(TestDelegatingInputFormat.MapClass2));

        DelegatingInputFormat inFormat = new DelegatingInputFormat();

        // bins[0]: KeyValueTextInputFormat (path3)
        // bins[1]: other format with MapClass (path)
        // bins[2]: everything else (path2 and path4)
        int[] bins = new int[3];
        foreach (InputSplit split in (IList<InputSplit>)inFormat.GetSplits(job))
        {
            NUnit.Framework.Assert.IsTrue(split is TaggedInputSplit);
            TaggedInputSplit tis = (TaggedInputSplit)split;

            int index;
            if (tis.GetInputFormatClass().Equals(typeof(KeyValueTextInputFormat)))
            {
                // path3
                index = 0;
            }
            else if (tis.GetMapperClass().Equals(typeof(TestDelegatingInputFormat.MapClass)))
            {
                // path
                index = 1;
            }
            else
            {
                // path2 and path4
                index = 2;
            }
            bins[index]++;
        }

        // NUnit's AreEqual takes (expected, actual, message); the failure
        // message must be the last argument.
        NUnit.Framework.Assert.AreEqual(numSplits, bins[0], "count is not equal to num splits");
        NUnit.Framework.Assert.AreEqual(numSplits, bins[1], "count is not equal to num splits");
        NUnit.Framework.Assert.AreEqual(numSplits * 2, bins[2], "count is not equal to 2 * num splits");
    }
    finally
    {
        if (dfs != null)
        {
            dfs.Shutdown();
        }
    }
}
/// <summary>
/// Runs a job named "mi" over two input directories - one read with
/// TextInputFormat and MapClass, the other with KeyValueTextInputFormat and
/// KeyValueMapClass - and checks the five lines written to the reducer output
/// file "part-r-00000".
/// </summary>
/// <remarks>
/// Any exception thrown while waiting for the job is rethrown wrapped in a
/// RuntimeException; a job that completes unsuccessfully also raises a
/// RuntimeException.
/// </remarks>
public virtual void TestDoMultipleInputs()
{
    Path in1Dir = GetDir(In1Dir);
    Path in2Dir = GetDir(In2Dir);
    Path outDir = GetDir(OutDir);
    Configuration conf = CreateJobConf();
    FileSystem fs = FileSystem.Get(conf);
    fs.Delete(outDir, true);

    DataOutputStream file1 = fs.Create(new Path(in1Dir, "part-0"));
    try
    {
        file1.WriteBytes("a\nb\nc\nd\ne");
    }
    finally
    {
        // Close even when the write fails so the stream is not leaked.
        file1.Close();
    }

    // write tab delimited to second file because we're doing
    // KeyValueInputFormat
    DataOutputStream file2 = fs.Create(new Path(in2Dir, "part-0"));
    try
    {
        file2.WriteBytes("a\tblah\nb\tblah\nc\tblah\nd\tblah\ne\tblah");
    }
    finally
    {
        file2.Close();
    }

    Job job = Job.GetInstance(conf);
    job.SetJobName("mi");
    MultipleInputs.AddInputPath(job, in1Dir, typeof(TextInputFormat), typeof(TestMultipleInputs.MapClass));
    MultipleInputs.AddInputPath(job, in2Dir, typeof(KeyValueTextInputFormat), typeof(TestMultipleInputs.KeyValueMapClass));
    job.SetMapOutputKeyClass(typeof(Text));
    job.SetMapOutputValueClass(typeof(Text));
    job.SetOutputKeyClass(typeof(NullWritable));
    job.SetOutputValueClass(typeof(Text));
    job.SetReducerClass(typeof(TestMultipleInputs.ReducerClass));
    FileOutputFormat.SetOutputPath(job, outDir);

    bool success;
    try
    {
        success = job.WaitForCompletion(true);
    }
    catch (Exception e)
    {
        // A single handler is enough: TypeLoadException derives from
        // Exception, so a second, more specific catch placed after this one
        // would be unreachable.
        throw new RuntimeException(e);
    }
    if (!success)
    {
        throw new RuntimeException("Job failed!");
    }

    // Read the reducer output line by line.
    BufferedReader output = new BufferedReader(new InputStreamReader(fs.Open(new Path(outDir, "part-r-00000"))));
    try
    {
        // reducer should have counted one key from each file
        NUnit.Framework.Assert.AreEqual("a 2", output.ReadLine());
        NUnit.Framework.Assert.AreEqual("b 2", output.ReadLine());
        NUnit.Framework.Assert.AreEqual("c 2", output.ReadLine());
        NUnit.Framework.Assert.AreEqual("d 2", output.ReadLine());
        NUnit.Framework.Assert.AreEqual("e 2", output.ReadLine());
    }
    finally
    {
        output.Close();
    }
}