Beispiel #1
0
        /// <exception cref="System.IO.IOException"/>
        public virtual InputSplit[] GetSplits(JobConf conf, int numSplits)
        {
            JobConf            confCopy = new JobConf(conf);
            IList <InputSplit> splits   = new AList <InputSplit>();
            IDictionary <Path, InputFormat>   formatMap   = MultipleInputs.GetInputFormatMap(conf);
            IDictionary <Path, Type>          mapperMap   = MultipleInputs.GetMapperTypeMap(conf);
            IDictionary <Type, IList <Path> > formatPaths = new Dictionary <Type, IList <Path> >();

            // First, build a map of InputFormats to Paths
            foreach (KeyValuePair <Path, InputFormat> entry in formatMap)
            {
                if (!formatPaths.Contains(entry.Value.GetType()))
                {
                    formatPaths[entry.Value.GetType()] = new List <Path>();
                }
                formatPaths[entry.Value.GetType()].AddItem(entry.Key);
            }
            foreach (KeyValuePair <Type, IList <Path> > formatEntry in formatPaths)
            {
                Type         formatClass = formatEntry.Key;
                InputFormat  format      = (InputFormat)ReflectionUtils.NewInstance(formatClass, conf);
                IList <Path> paths       = formatEntry.Value;
                IDictionary <Type, IList <Path> > mapperPaths = new Dictionary <Type, IList <Path> >();
                // Now, for each set of paths that have a common InputFormat, build
                // a map of Mappers to the paths they're used for
                foreach (Path path in paths)
                {
                    Type mapperClass = mapperMap[path];
                    if (!mapperPaths.Contains(mapperClass))
                    {
                        mapperPaths[mapperClass] = new List <Path>();
                    }
                    mapperPaths[mapperClass].AddItem(path);
                }
                // Now each set of paths that has a common InputFormat and Mapper can
                // be added to the same job, and split together.
                foreach (KeyValuePair <Type, IList <Path> > mapEntry in mapperPaths)
                {
                    paths = mapEntry.Value;
                    Type mapperClass = mapEntry.Key;
                    if (mapperClass == null)
                    {
                        mapperClass = conf.GetMapperClass();
                    }
                    FileInputFormat.SetInputPaths(confCopy, Sharpen.Collections.ToArray(paths, new Path
                                                                                        [paths.Count]));
                    // Get splits for each input path and tag with InputFormat
                    // and Mapper types by wrapping in a TaggedInputSplit.
                    InputSplit[] pathSplits = format.GetSplits(confCopy, numSplits);
                    foreach (InputSplit pathSplit in pathSplits)
                    {
                        splits.AddItem(new TaggedInputSplit(pathSplit, conf, format.GetType(), mapperClass
                                                            ));
                    }
                }
            }
            return(Sharpen.Collections.ToArray(splits, new InputSplit[splits.Count]));
        }
Beispiel #2
0
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.Exception"/>
        public override IList <InputSplit> GetSplits(JobContext job)
        {
            Configuration      conf    = job.GetConfiguration();
            Job                jobCopy = Job.GetInstance(conf);
            IList <InputSplit> splits  = new AList <InputSplit>();
            IDictionary <Path, InputFormat>   formatMap   = MultipleInputs.GetInputFormatMap(job);
            IDictionary <Path, Type>          mapperMap   = MultipleInputs.GetMapperTypeMap(job);
            IDictionary <Type, IList <Path> > formatPaths = new Dictionary <Type, IList <Path> >();

            // First, build a map of InputFormats to Paths
            foreach (KeyValuePair <Path, InputFormat> entry in formatMap)
            {
                if (!formatPaths.Contains(entry.Value.GetType()))
                {
                    formatPaths[entry.Value.GetType()] = new List <Path>();
                }
                formatPaths[entry.Value.GetType()].AddItem(entry.Key);
            }
            foreach (KeyValuePair <Type, IList <Path> > formatEntry in formatPaths)
            {
                Type         formatClass = formatEntry.Key;
                InputFormat  format      = (InputFormat)ReflectionUtils.NewInstance(formatClass, conf);
                IList <Path> paths       = formatEntry.Value;
                IDictionary <Type, IList <Path> > mapperPaths = new Dictionary <Type, IList <Path> >();
                // Now, for each set of paths that have a common InputFormat, build
                // a map of Mappers to the paths they're used for
                foreach (Path path in paths)
                {
                    Type mapperClass = mapperMap[path];
                    if (!mapperPaths.Contains(mapperClass))
                    {
                        mapperPaths[mapperClass] = new List <Path>();
                    }
                    mapperPaths[mapperClass].AddItem(path);
                }
                // Now each set of paths that has a common InputFormat and Mapper can
                // be added to the same job, and split together.
                foreach (KeyValuePair <Type, IList <Path> > mapEntry in mapperPaths)
                {
                    paths = mapEntry.Value;
                    Type mapperClass = mapEntry.Key;
                    if (mapperClass == null)
                    {
                        try
                        {
                            mapperClass = job.GetMapperClass();
                        }
                        catch (TypeLoadException e)
                        {
                            throw new IOException("Mapper class is not found", e);
                        }
                    }
                    FileInputFormat.SetInputPaths(jobCopy, Sharpen.Collections.ToArray(paths, new Path
                                                                                       [paths.Count]));
                    // Get splits for each input path and tag with InputFormat
                    // and Mapper types by wrapping in a TaggedInputSplit.
                    IList <InputSplit> pathSplits = format.GetSplits(jobCopy);
                    foreach (InputSplit pathSplit in pathSplits)
                    {
                        splits.AddItem(new TaggedInputSplit(pathSplit, conf, format.GetType(), mapperClass
                                                            ));
                    }
                }
            }
            return(splits);
        }
Beispiel #3
0
 public override string ToString()
 {
     return(ident + "(" + inf.GetType().FullName + ",\"" + indir + "\")");
 }