示例#1
0
        /// <summary>
        /// Registers two input paths, each with its own input format type, and
        /// checks that the map read back from <c>MultipleInputs</c> reports the
        /// matching format type for each path.
        /// </summary>
        /// <exception cref="System.IO.IOException"/>
        public virtual void TestAddInputPathWithFormat()
        {
            Job job = Job.GetInstance();

            MultipleInputs.AddInputPath(job, new Path("/foo"), typeof(TextInputFormat));
            MultipleInputs.AddInputPath(job, new Path("/bar"), typeof(KeyValueTextInputFormat));

            IDictionary<Path, InputFormat> formatsByPath = MultipleInputs.GetInputFormatMap(job);

            // Look each path up through a freshly constructed Path instance,
            // exactly as the registration calls above did.
            InputFormat fooFormat = formatsByPath[new Path("/foo")];
            NUnit.Framework.Assert.AreEqual(typeof(TextInputFormat), fooFormat.GetType());

            InputFormat barFormat = formatsByPath[new Path("/bar")];
            NUnit.Framework.Assert.AreEqual(typeof(KeyValueTextInputFormat), barFormat.GetType());
        }
示例#2
0
        /// <summary>
        /// Computes the input splits for every path registered through
        /// <c>MultipleInputs</c>. Paths are grouped first by input format type and
        /// then by mapper type; each group is split by an instance of its input
        /// format, and every resulting split is wrapped in a
        /// <c>TaggedInputSplit</c> carrying the format and mapper types.
        /// </summary>
        /// <param name="job">Job context whose configuration holds the registered paths.</param>
        /// <returns>The tagged splits for all registered paths.</returns>
        /// <exception cref="System.IO.IOException">
        /// Thrown with "Mapper class is not found" when a path has no mapper of
        /// its own and resolving the job's mapper class raises a
        /// <c>TypeLoadException</c>.
        /// </exception>
        /// <exception cref="System.Exception"/>
        public override IList<InputSplit> GetSplits(JobContext job)
        {
            Configuration      conf    = job.GetConfiguration();
            Job                jobCopy = Job.GetInstance(conf);
            IList<InputSplit>  splits  = new AList<InputSplit>();
            IDictionary<Path, InputFormat>  formatMap   = MultipleInputs.GetInputFormatMap(job);
            IDictionary<Path, Type>         mapperMap   = MultipleInputs.GetMapperTypeMap(job);
            IDictionary<Type, IList<Path>>  formatPaths = new Dictionary<Type, IList<Path>>();

            // First, build a map of InputFormats to Paths
            foreach (KeyValuePair<Path, InputFormat> entry in formatMap)
            {
                Type formatType = entry.Value.GetType();
                IList<Path> pathsForFormat;
                if (!formatPaths.TryGetValue(formatType, out pathsForFormat))
                {
                    pathsForFormat = new List<Path>();
                    formatPaths[formatType] = pathsForFormat;
                }
                pathsForFormat.AddItem(entry.Key);
            }
            foreach (KeyValuePair<Type, IList<Path>> formatEntry in formatPaths)
            {
                Type        formatClass = formatEntry.Key;
                InputFormat format      = (InputFormat)ReflectionUtils.NewInstance(formatClass, conf);
                IDictionary<Type, IList<Path>> mapperPaths = new Dictionary<Type, IList<Path>>();
                // Paths with no mapper of their own. A Dictionary cannot hold a
                // null key, so they are collected separately instead of under
                // mapperPaths[null].
                IList<Path> unmappedPaths = new List<Path>();
                // Now, for each set of paths that have a common InputFormat, build
                // a map of Mappers to the paths they're used for
                foreach (Path path in formatEntry.Value)
                {
                    Type pathMapper;
                    // TryGetValue rather than the indexer: a path registered
                    // without a mapper is simply absent from mapperMap.
                    if (!mapperMap.TryGetValue(path, out pathMapper) || pathMapper == null)
                    {
                        unmappedPaths.AddItem(path);
                        continue;
                    }
                    IList<Path> pathsForMapper;
                    if (!mapperPaths.TryGetValue(pathMapper, out pathsForMapper))
                    {
                        pathsForMapper = new List<Path>();
                        mapperPaths[pathMapper] = pathsForMapper;
                    }
                    pathsForMapper.AddItem(path);
                }
                // Flatten into a list so the unmapped group can ride along with a
                // null mapper key (KeyValuePair, unlike Dictionary, permits it).
                List<KeyValuePair<Type, IList<Path>>> mapperGroups =
                    new List<KeyValuePair<Type, IList<Path>>>(mapperPaths);
                if (unmappedPaths.Count > 0)
                {
                    mapperGroups.Add(new KeyValuePair<Type, IList<Path>>(null, unmappedPaths));
                }
                // Now each set of paths that has a common InputFormat and Mapper can
                // be added to the same job, and split together.
                foreach (KeyValuePair<Type, IList<Path>> mapEntry in mapperGroups)
                {
                    IList<Path> groupPaths  = mapEntry.Value;
                    Type        mapperClass = mapEntry.Key;
                    if (mapperClass == null)
                    {
                        // No per-path mapper: fall back to the job's mapper class.
                        try
                        {
                            mapperClass = job.GetMapperClass();
                        }
                        catch (TypeLoadException e)
                        {
                            throw new IOException("Mapper class is not found", e);
                        }
                    }
                    FileInputFormat.SetInputPaths(jobCopy,
                                                  Sharpen.Collections.ToArray(groupPaths, new Path[groupPaths.Count]));
                    // Get splits for each input path and tag with InputFormat
                    // and Mapper types by wrapping in a TaggedInputSplit.
                    IList<InputSplit> pathSplits = format.GetSplits(jobCopy);
                    foreach (InputSplit pathSplit in pathSplits)
                    {
                        splits.AddItem(new TaggedInputSplit(pathSplit, conf, format.GetType(), mapperClass));
                    }
                }
            }
            return splits;
        }
        /// <summary>
        /// Starts a four-datanode <c>MiniDFSCluster</c>, registers four input
        /// paths with different input format / mapper combinations, and checks
        /// how many <c>TaggedInputSplit</c>s <c>DelegatingInputFormat</c> produces
        /// for each combination. The cluster is shut down in all cases.
        /// </summary>
        /// <exception cref="System.Exception"/>
        public virtual void TestSplitting()
        {
            Job            job = Job.GetInstance();
            MiniDFSCluster dfs = null;

            try
            {
                dfs = new MiniDFSCluster.Builder(job.GetConfiguration())
                      .NumDataNodes(4)
                      .Racks(new string[] { "/rack0", "/rack0", "/rack1", "/rack1" })
                      .Hosts(new string[] { "host0", "host1", "host2", "host3" })
                      .Build();
                FileSystem fs        = dfs.GetFileSystem();
                Path       path      = GetPath("/foo/bar", fs);
                Path       path2     = GetPath("/foo/baz", fs);
                Path       path3     = GetPath("/bar/bar", fs);
                Path       path4     = GetPath("/bar/baz", fs);
                int        numSplits = 100;
                // Cap the split size so the first file yields numSplits splits.
                // NOTE(review): the expected counts below presumably rely on GetPath
                // creating files of equal length - confirm against GetPath.
                FileInputFormat.SetMaxInputSplitSize(job, fs.GetFileStatus(path).GetLen() / numSplits);
                MultipleInputs.AddInputPath(job, path, typeof(TextInputFormat),
                                            typeof(TestDelegatingInputFormat.MapClass));
                MultipleInputs.AddInputPath(job, path2, typeof(TextInputFormat),
                                            typeof(TestDelegatingInputFormat.MapClass2));
                MultipleInputs.AddInputPath(job, path3, typeof(KeyValueTextInputFormat),
                                            typeof(TestDelegatingInputFormat.MapClass));
                MultipleInputs.AddInputPath(job, path4, typeof(TextInputFormat),
                                            typeof(TestDelegatingInputFormat.MapClass2));
                DelegatingInputFormat inFormat = new DelegatingInputFormat();
                // bins[0]: KeyValueTextInputFormat splits (path3)
                // bins[1]: TextInputFormat + MapClass splits (path)
                // bins[2]: TextInputFormat + MapClass2 splits (path2 and path4)
                int[] bins = new int[3];
                foreach (InputSplit split in (IList<InputSplit>)inFormat.GetSplits(job))
                {
                    NUnit.Framework.Assert.IsTrue(split is TaggedInputSplit);
                    TaggedInputSplit tis = (TaggedInputSplit)split;
                    int index;
                    if (tis.GetInputFormatClass().Equals(typeof(KeyValueTextInputFormat)))
                    {
                        // path3
                        index = 0;
                    }
                    else if (tis.GetMapperClass().Equals(typeof(TestDelegatingInputFormat.MapClass)))
                    {
                        // path
                        index = 1;
                    }
                    else
                    {
                        // path2 and path4
                        index = 2;
                    }
                    bins[index]++;
                }
                // NUnit argument order is (expected, actual, message); the message
                // must come last or it is treated as the expected value.
                NUnit.Framework.Assert.AreEqual(numSplits, bins[0], "count is not equal to num splits");
                NUnit.Framework.Assert.AreEqual(numSplits, bins[1], "count is not equal to num splits");
                NUnit.Framework.Assert.AreEqual(numSplits * 2, bins[2], "count is not equal to 2 * num splits");
            }
            finally
            {
                if (dfs != null)
                {
                    dfs.Shutdown();
                }
            }
        }
示例#4
0
        /// <summary>
        /// End-to-end check of a job with two inputs: writes a plain-text file and
        /// a tab-delimited file, registers each directory with its own input
        /// format and mapper, runs the job, and verifies the first five lines of
        /// the reducer output.
        /// </summary>
        public virtual void TestDoMultipleInputs()
        {
            Path          in1Dir = GetDir(In1Dir);
            Path          in2Dir = GetDir(In2Dir);
            Path          outDir = GetDir(OutDir);
            Configuration conf   = CreateJobConf();
            FileSystem    fs     = FileSystem.Get(conf);

            fs.Delete(outDir, true);
            DataOutputStream file1 = fs.Create(new Path(in1Dir, "part-0"));
            try
            {
                file1.WriteBytes("a\nb\nc\nd\ne");
            }
            finally
            {
                // Close even if the write fails so the stream is not leaked.
                file1.Close();
            }
            // write tab delimited to second file because we're doing
            // KeyValueInputFormat
            DataOutputStream file2 = fs.Create(new Path(in2Dir, "part-0"));
            try
            {
                file2.WriteBytes("a\tblah\nb\tblah\nc\tblah\nd\tblah\ne\tblah");
            }
            finally
            {
                file2.Close();
            }
            Job job = Job.GetInstance(conf);

            job.SetJobName("mi");
            MultipleInputs.AddInputPath(job, in1Dir, typeof(TextInputFormat),
                                        typeof(TestMultipleInputs.MapClass));
            MultipleInputs.AddInputPath(job, in2Dir, typeof(KeyValueTextInputFormat),
                                        typeof(TestMultipleInputs.KeyValueMapClass));
            job.SetMapOutputKeyClass(typeof(Text));
            job.SetMapOutputValueClass(typeof(Text));
            job.SetOutputKeyClass(typeof(NullWritable));
            job.SetOutputValueClass(typeof(Text));
            job.SetReducerClass(typeof(TestMultipleInputs.ReducerClass));
            FileOutputFormat.SetOutputPath(job, outDir);
            bool success;

            try
            {
                success = job.WaitForCompletion(true);
            }
            catch (Exception e)
            {
                // One handler only: TypeLoadException derives from Exception, so a
                // separate clause placed after this one would be unreachable.
                throw new RuntimeException(e);
            }
            if (!success)
            {
                throw new RuntimeException("Job failed!");
            }
            // Wrap the output file in a line-oriented reader so it can be checked
            // one line at a time.
            BufferedReader output = new BufferedReader(
                new InputStreamReader(fs.Open(new Path(outDir, "part-r-00000"))));
            try
            {
                // reducer should have counted one key from each file
                NUnit.Framework.Assert.AreEqual("a 2", output.ReadLine());
                NUnit.Framework.Assert.AreEqual("b 2", output.ReadLine());
                NUnit.Framework.Assert.AreEqual("c 2", output.ReadLine());
                NUnit.Framework.Assert.AreEqual("d 2", output.ReadLine());
                NUnit.Framework.Assert.AreEqual("e 2", output.ReadLine());
            }
            finally
            {
                // Release the reader whether or not the assertions pass.
                output.Close();
            }
        }