Пример #1
0
            /// <summary>
            /// Samples keys by reading the leading records of evenly spaced splits.
            /// </summary>
            /// <remarks>
            /// At most <c>maxSplitsSampled</c> splits are visited, stepping through the
            /// split array with a stride of <c>splits.Length / splitsToSample</c>. The
            /// record counter is cumulative across splits: reading of the i-th sampled
            /// split stops after the record that brings the running total to
            /// <c>(i + 1) * samplesPerSplit</c>, so a short split lets later splits
            /// contribute more keys.
            /// </remarks>
            /// <param name="inf">Input format that supplies the splits and record readers.</param>
            /// <param name="job">Job configuration handed to <paramref name="inf"/>.</param>
            /// <returns>
            /// The sampled keys; an empty array when there is no split to sample.
            /// </returns>
            /// <exception cref="System.IO.IOException"/>
            public virtual K[] GetSample(InputFormat <K, V> inf, JobConf job)
            {
                InputSplit[] splits         = inf.GetSplits(job, job.GetNumMapTasks());
                AList <K>    samples        = new AList <K>(numSamples);
                int          splitsToSample = Math.Min(maxSplitsSampled, splits.Length);

                // With no split to read (empty input or maxSplitsSampled <= 0) the
                // divisions below would throw DivideByZeroException; there is simply
                // nothing to sample.
                if (splitsToSample <= 0)
                {
                    return(new K[0]);
                }
                int  splitStep       = splits.Length / splitsToSample;
                int  samplesPerSplit = numSamples / splitsToSample;
                long records         = 0;

                for (int i = 0; i < splitsToSample; ++i)
                {
                    RecordReader <K, V> reader = inf.GetRecordReader(splits[i * splitStep], job, Reporter
                                                                     .Null);
                    try
                    {
                        K key   = reader.CreateKey();
                        V value = reader.CreateValue();
                        while (reader.Next(key, value))
                        {
                            samples.AddItem(key);
                            // The stored key must not be passed to Next() again, so a
                            // fresh instance is requested for the following record
                            // (presumably Next() fills the key in place -- confirm).
                            key = reader.CreateKey();
                            ++records;
                            if ((i + 1) * samplesPerSplit <= records)
                            {
                                break;
                            }
                        }
                    }
                    finally
                    {
                        // Release the reader even when reading fails part-way.
                        reader.Close();
                    }
                }
                // ArrayList::toArray doesn't preserve type
                return((K[])Sharpen.Collections.ToArray(samples));
            }
Пример #2
0
            /// <summary>
            /// Samples keys by reading the leading records of the first
            /// <c>splitsToSample</c> splits.
            /// </summary>
            /// <remarks>
            /// At most <c>maxSplitsSampled</c> splits are visited, in the order returned
            /// by <paramref name="inf"/>. The record counter is cumulative across splits:
            /// reading of split <c>i</c> stops after the record that brings the running
            /// total to <c>(i + 1) * samplesPerSplit</c>.
            /// </remarks>
            /// <param name="inf">Input format that supplies the splits and record readers.</param>
            /// <param name="job">Job whose configuration is used for reading and key copying.</param>
            /// <returns>
            /// The sampled keys; an empty array when there is no split to sample.
            /// </returns>
            /// <exception cref="System.IO.IOException"/>
            /// <exception cref="System.Exception"/>
            public virtual K[] GetSample(InputFormat <K, V> inf, Job job)
            {
                IList <InputSplit> splits         = inf.GetSplits(job);
                AList <K>          samples        = new AList <K>(numSamples);
                int                splitsToSample = Math.Min(maxSplitsSampled, splits.Count);

                // With no split to read (empty input or maxSplitsSampled <= 0) the
                // division below would throw DivideByZeroException; there is simply
                // nothing to sample.
                if (splitsToSample <= 0)
                {
                    return(new K[0]);
                }
                int  samplesPerSplit = numSamples / splitsToSample;
                long records         = 0;

                for (int i = 0; i < splitsToSample; ++i)
                {
                    TaskAttemptContext samplingContext = new TaskAttemptContextImpl(job.GetConfiguration
                                                                                        (), new TaskAttemptID());
                    RecordReader <K, V> reader = inf.CreateRecordReader(splits[i], samplingContext);
                    try
                    {
                        reader.Initialize(splits[i], samplingContext);
                        while (reader.NextKeyValue())
                        {
                            // A copy of the current key is stored rather than the
                            // reader's own instance (presumably because the reader
                            // may reuse it -- confirm).
                            samples.AddItem(ReflectionUtils.Copy(job.GetConfiguration(), reader.GetCurrentKey
                                                                     (), null));
                            ++records;
                            if ((i + 1) * samplesPerSplit <= records)
                            {
                                break;
                            }
                        }
                    }
                    finally
                    {
                        // Release the reader even when initialization or reading fails.
                        reader.Close();
                    }
                }
                // ArrayList::toArray doesn't preserve type
                return((K[])Sharpen.Collections.ToArray(samples));
            }
Пример #3
0
        /// <summary>
        /// Computes the splits for every configured input path, grouping paths that
        /// share both an input format type and a mapper type so each group is split
        /// in a single call, and wraps every resulting split in a
        /// <c>TaggedInputSplit</c> carrying those two types.
        /// </summary>
        /// <param name="conf">
        /// Job configuration from which the path-to-format and path-to-mapper maps
        /// are read; also supplies the default mapper for paths without one.
        /// </param>
        /// <param name="numSplits">Split count hint forwarded to each input format.</param>
        /// <returns>All tagged splits, as an array.</returns>
        /// <exception cref="System.IO.IOException"/>
        public virtual InputSplit[] GetSplits(JobConf conf, int numSplits)
        {
            JobConf confCopy = new JobConf(conf);
            IList<InputSplit> splits = new AList<InputSplit>();
            IDictionary<Path, InputFormat> formatMap = MultipleInputs.GetInputFormatMap(conf);
            IDictionary<Path, Type> mapperMap = MultipleInputs.GetMapperTypeMap(conf);
            IDictionary<Type, IList<Path>> formatPaths = new Dictionary<Type, IList<Path>>();

            // First, build a map of InputFormats to Paths
            foreach (KeyValuePair<Path, InputFormat> entry in formatMap)
            {
                Type formatType = entry.Value.GetType();
                IList<Path> sameFormat;
                if (!formatPaths.TryGetValue(formatType, out sameFormat))
                {
                    sameFormat = new List<Path>();
                    formatPaths[formatType] = sameFormat;
                }
                sameFormat.AddItem(entry.Key);
            }
            foreach (KeyValuePair<Type, IList<Path>> formatEntry in formatPaths)
            {
                Type formatClass = formatEntry.Key;
                InputFormat format = (InputFormat)ReflectionUtils.NewInstance(formatClass, conf);

                // Now, for each set of paths that have a common InputFormat, group
                // them by the Mapper they're used for. Paths with no mapper entry are
                // collected separately: a dictionary indexer throws on a missing key
                // and a dictionary cannot hold a null key, so they cannot share the map.
                IDictionary<Type, IList<Path>> mapperPaths = new Dictionary<Type, IList<Path>>();
                IList<Path> defaultMapperPaths = new List<Path>();
                foreach (Path path in formatEntry.Value)
                {
                    Type configuredMapper;
                    if (!mapperMap.TryGetValue(path, out configuredMapper) || configuredMapper == null)
                    {
                        defaultMapperPaths.AddItem(path);
                        continue;
                    }
                    IList<Path> sameMapper;
                    if (!mapperPaths.TryGetValue(configuredMapper, out sameMapper))
                    {
                        sameMapper = new List<Path>();
                        mapperPaths[configuredMapper] = sameMapper;
                    }
                    sameMapper.AddItem(path);
                }
                // A null key in this list marks the group that falls back to the
                // job-wide mapper.
                List<KeyValuePair<Type, IList<Path>>> mapperGroups =
                    new List<KeyValuePair<Type, IList<Path>>>(mapperPaths);
                if (defaultMapperPaths.Count > 0)
                {
                    mapperGroups.Add(new KeyValuePair<Type, IList<Path>>(null, defaultMapperPaths));
                }
                // Now each set of paths that has a common InputFormat and Mapper can
                // be added to the same job, and split together.
                foreach (KeyValuePair<Type, IList<Path>> mapEntry in mapperGroups)
                {
                    IList<Path> groupPaths = mapEntry.Value;
                    Type mapperClass = mapEntry.Key;
                    if (mapperClass == null)
                    {
                        mapperClass = conf.GetMapperClass();
                    }
                    FileInputFormat.SetInputPaths(confCopy, Sharpen.Collections.ToArray(groupPaths, new Path
                                                                                        [groupPaths.Count]));
                    // Get splits for each input path and tag with InputFormat
                    // and Mapper types by wrapping in a TaggedInputSplit.
                    InputSplit[] pathSplits = format.GetSplits(confCopy, numSplits);
                    foreach (InputSplit pathSplit in pathSplits)
                    {
                        splits.AddItem(new TaggedInputSplit(pathSplit, conf, format.GetType(), mapperClass
                                                            ));
                    }
                }
            }
            return(Sharpen.Collections.ToArray(splits, new InputSplit[splits.Count]));
        }
Пример #4
0
            /// <summary>
            /// Randomize the split order, then read the sampled splits, selecting each
            /// key with probability <c>freq</c>. Once <c>numSamples</c> keys are held, a
            /// newly selected key replaces a randomly chosen existing one and
            /// <c>freq</c> is scaled down.
            /// </summary>
            /// <remarks>
            /// At least <c>splitsToSample</c> splits are read; further splits are read
            /// while fewer than <c>numSamples</c> keys have been collected. <c>freq</c>
            /// is not local to this method, so the scaled value outlives the call.
            /// </remarks>
            /// <param name="inf">Input format that supplies the splits and record readers.</param>
            /// <param name="job">Job whose configuration is used for reading and key copying.</param>
            /// <returns>The sampled keys (at most <c>numSamples</c>).</returns>
            /// <exception cref="System.IO.IOException"/>
            /// <exception cref="System.Exception"/>
            public virtual K[] GetSample(InputFormat <K, V> inf, Job job)
            {
                IList <InputSplit> splits  = inf.GetSplits(job);
                AList <K>          samples = new AList <K>(numSamples);
                int    splitsToSample      = Math.Min(maxSplitsSampled, splits.Count);
                Random r    = new Random();
                long   seed = r.NextLong();

                // Re-seed with a logged value so the seed used for a run is visible
                // in the debug log.
                r.SetSeed(seed);
                Log.Debug("seed: " + seed);
                // shuffle splits
                for (int i = 0; i < splits.Count; ++i)
                {
                    InputSplit tmp = splits[i];
                    int        j   = r.Next(splits.Count);
                    splits.Set(i, splits[j]);
                    splits.Set(j, tmp);
                }
                // our target rate is in terms of the maximum number of sample splits,
                // but we accept the possibility of sampling additional splits to hit
                // the target sample keyset
                for (int i_1 = 0; i_1 < splitsToSample || (i_1 < splits.Count && samples.Count <
                                                           numSamples); ++i_1)
                {
                    TaskAttemptContext samplingContext = new TaskAttemptContextImpl(job.GetConfiguration
                                                                                        (), new TaskAttemptID());
                    RecordReader <K, V> reader = inf.CreateRecordReader(splits[i_1], samplingContext);
                    try
                    {
                        reader.Initialize(splits[i_1], samplingContext);
                        while (reader.NextKeyValue())
                        {
                            if (r.NextDouble() <= freq)
                            {
                                if (samples.Count < numSamples)
                                {
                                    samples.AddItem(ReflectionUtils.Copy(job.GetConfiguration(), reader.GetCurrentKey
                                                                             (), null));
                                }
                                else
                                {
                                    // When exceeding the maximum number of samples, replace a
                                    // random element with this one, then adjust the frequency
                                    // to reflect the possibility of existing elements being
                                    // pushed out. Next(numSamples) is always below
                                    // numSamples, so the index needs no further check.
                                    int ind = r.Next(numSamples);
                                    samples.Set(ind, ReflectionUtils.Copy(job.GetConfiguration(), reader.GetCurrentKey
                                                                              (), null));
                                    freq *= (numSamples - 1) / (double)numSamples;
                                }
                            }
                        }
                    }
                    finally
                    {
                        // Release the reader even when initialization or reading fails.
                        reader.Close();
                    }
                }
                // ArrayList::toArray doesn't preserve type
                return((K[])Sharpen.Collections.ToArray(samples));
            }
Пример #5
0
            /// <summary>
            /// Randomize the split order, then read the sampled splits, selecting each
            /// key with probability <c>freq</c>. Once <c>numSamples</c> keys are held, a
            /// newly selected key replaces a randomly chosen existing one and
            /// <c>freq</c> is scaled down.
            /// </summary>
            /// <remarks>
            /// At least <c>splitsToSample</c> splits are read; further splits are read
            /// while fewer than <c>numSamples</c> keys have been collected. <c>freq</c>
            /// is not local to this method, so the scaled value outlives the call.
            /// </remarks>
            /// <param name="inf">Input format that supplies the splits and record readers.</param>
            /// <param name="job">Job configuration handed to <paramref name="inf"/>.</param>
            /// <returns>The sampled keys (at most <c>numSamples</c>).</returns>
            /// <exception cref="System.IO.IOException"/>
            public virtual K[] GetSample(InputFormat <K, V> inf, JobConf job)
            {
                InputSplit[] splits         = inf.GetSplits(job, job.GetNumMapTasks());
                AList <K>    samples        = new AList <K>(numSamples);
                int          splitsToSample = Math.Min(maxSplitsSampled, splits.Length);
                Random       r    = new Random();
                long         seed = r.NextLong();

                // Re-seed with a logged value so the seed used for a run is visible
                // in the debug log.
                r.SetSeed(seed);
                Log.Debug("seed: " + seed);
                // shuffle splits
                for (int i = 0; i < splits.Length; ++i)
                {
                    InputSplit tmp = splits[i];
                    int        j   = r.Next(splits.Length);
                    splits[i] = splits[j];
                    splits[j] = tmp;
                }
                // our target rate is in terms of the maximum number of sample splits,
                // but we accept the possibility of sampling additional splits to hit
                // the target sample keyset
                for (int i_1 = 0; i_1 < splitsToSample || (i_1 < splits.Length && samples.Count <
                                                           numSamples); ++i_1)
                {
                    RecordReader <K, V> reader = inf.GetRecordReader(splits[i_1], job, Reporter.Null);
                    try
                    {
                        K key   = reader.CreateKey();
                        V value = reader.CreateValue();
                        while (reader.Next(key, value))
                        {
                            if (r.NextDouble() <= freq)
                            {
                                if (samples.Count < numSamples)
                                {
                                    samples.AddItem(key);
                                }
                                else
                                {
                                    // When exceeding the maximum number of samples, replace a
                                    // random element with this one, then adjust the frequency
                                    // to reflect the possibility of existing elements being
                                    // pushed out. Next(numSamples) is always below
                                    // numSamples, so the index needs no further check.
                                    int ind = r.Next(numSamples);
                                    samples.Set(ind, key);
                                    freq *= (numSamples - 1) / (double)numSamples;
                                }
                                // The selected key is now held by the sample list, so a
                                // fresh instance is requested for the following records.
                                key = reader.CreateKey();
                            }
                        }
                    }
                    finally
                    {
                        // Release the reader even when reading fails part-way.
                        reader.Close();
                    }
                }
                // ArrayList::toArray doesn't preserve type
                return((K[])Sharpen.Collections.ToArray(samples));
            }
Пример #6
0
        /// <summary>
        /// Validates map phase progress after each record is processed by map task
        /// using custom task reporter.
        /// </summary>
        /// <remarks>
        /// Creates an input file under <c>TestRootDir</c>, writes split files for it,
        /// then runs one <c>TestMapTask</c> per split meta-info entry against
        /// <c>fakeUmbilical</c>. The test root directory is removed afterwards, also
        /// when the test body throws.
        /// </remarks>
        /// <exception cref="System.Exception"/>
        public virtual void TestMapProgress()
        {
            JobConf job = new JobConf();

            fs = FileSystem.GetLocal(job);
            Path rootDir = new Path(TestRootDir);

            try
            {
                CreateInputFile(rootDir);
                job.SetNumReduceTasks(0);
                TaskAttemptID taskId = ((TaskAttemptID)TaskAttemptID.ForName("attempt_200907082313_0424_m_000000_0"
                                                                             ));

                job.SetClass("mapreduce.job.outputformat.class", typeof(NullOutputFormat), typeof(
                                 OutputFormat));
                job.Set(FileInputFormat.InputDir, TestRootDir);
                jobId = ((JobID)taskId.GetJobID());
                JobContext jContext = new JobContextImpl(job, jobId);
                InputFormat <object, object> input = ReflectionUtils.NewInstance(jContext.GetInputFormatClass
                                                                                     (), job);
                IList <InputSplit> splits = input.GetSplits(jContext);

                JobSplitWriter.CreateSplitFiles(new Path(TestRootDir), job, new Path(TestRootDir)
                                                .GetFileSystem(job), splits);
                JobSplit.TaskSplitMetaInfo[] splitMetaInfo = SplitMetaInfoReader.ReadSplitMetaInfo
                                                                 (jobId, fs, job, new Path(TestRootDir));
                job.SetUseNewMapper(true);
                // use new api
                for (int i = 0; i < splitMetaInfo.Length; i++)
                {
                    // rawSplits.length is 1
                    map = new TestMapProgress.TestMapTask(this, job.Get(JTConfig.JtSystemDir, "/tmp/hadoop/mapred/system"
                                                                        ) + jobId + "job.xml", taskId, i, splitMetaInfo[i].GetSplitIndex(), 1);
                    // Each task runs against its own copy of the configuration.
                    JobConf localConf = new JobConf(job);
                    map.LocalizeConfiguration(localConf);
                    map.SetConf(localConf);
                    map.Run(localConf, fakeUmbilical);
                }
            }
            finally
            {
                // clean up, also when the test fails, so no files are left behind
                fs.Delete(rootDir, true);
            }
        }
Пример #7
0
        /// <summary>
        /// Computes the splits for every configured input path, grouping paths that
        /// share both an input format type and a mapper type so each group is split
        /// in a single call, and wraps every resulting split in a
        /// <c>TaggedInputSplit</c> carrying those two types.
        /// </summary>
        /// <param name="job">
        /// Job context from which the path-to-format and path-to-mapper maps are
        /// read; also supplies the default mapper for paths without one.
        /// </param>
        /// <returns>All tagged splits.</returns>
        /// <exception cref="System.IO.IOException">
        /// Also thrown when the job's default mapper class cannot be loaded.
        /// </exception>
        /// <exception cref="System.Exception"/>
        public override IList <InputSplit> GetSplits(JobContext job)
        {
            Configuration conf = job.GetConfiguration();
            Job jobCopy = Job.GetInstance(conf);
            IList<InputSplit> splits = new AList<InputSplit>();
            IDictionary<Path, InputFormat> formatMap = MultipleInputs.GetInputFormatMap(job);
            IDictionary<Path, Type> mapperMap = MultipleInputs.GetMapperTypeMap(job);
            IDictionary<Type, IList<Path>> formatPaths = new Dictionary<Type, IList<Path>>();

            // First, build a map of InputFormats to Paths
            foreach (KeyValuePair<Path, InputFormat> entry in formatMap)
            {
                Type formatType = entry.Value.GetType();
                IList<Path> sameFormat;
                if (!formatPaths.TryGetValue(formatType, out sameFormat))
                {
                    sameFormat = new List<Path>();
                    formatPaths[formatType] = sameFormat;
                }
                sameFormat.AddItem(entry.Key);
            }
            foreach (KeyValuePair<Type, IList<Path>> formatEntry in formatPaths)
            {
                Type formatClass = formatEntry.Key;
                InputFormat format = (InputFormat)ReflectionUtils.NewInstance(formatClass, conf);

                // Now, for each set of paths that have a common InputFormat, group
                // them by the Mapper they're used for. Paths with no mapper entry are
                // collected separately: a dictionary indexer throws on a missing key
                // and a dictionary cannot hold a null key, so they cannot share the map.
                IDictionary<Type, IList<Path>> mapperPaths = new Dictionary<Type, IList<Path>>();
                IList<Path> defaultMapperPaths = new List<Path>();
                foreach (Path path in formatEntry.Value)
                {
                    Type configuredMapper;
                    if (!mapperMap.TryGetValue(path, out configuredMapper) || configuredMapper == null)
                    {
                        defaultMapperPaths.AddItem(path);
                        continue;
                    }
                    IList<Path> sameMapper;
                    if (!mapperPaths.TryGetValue(configuredMapper, out sameMapper))
                    {
                        sameMapper = new List<Path>();
                        mapperPaths[configuredMapper] = sameMapper;
                    }
                    sameMapper.AddItem(path);
                }
                // A null key in this list marks the group that falls back to the
                // job-wide mapper.
                List<KeyValuePair<Type, IList<Path>>> mapperGroups =
                    new List<KeyValuePair<Type, IList<Path>>>(mapperPaths);
                if (defaultMapperPaths.Count > 0)
                {
                    mapperGroups.Add(new KeyValuePair<Type, IList<Path>>(null, defaultMapperPaths));
                }
                // Now each set of paths that has a common InputFormat and Mapper can
                // be added to the same job, and split together.
                foreach (KeyValuePair<Type, IList<Path>> mapEntry in mapperGroups)
                {
                    IList<Path> groupPaths = mapEntry.Value;
                    Type mapperClass = mapEntry.Key;
                    if (mapperClass == null)
                    {
                        try
                        {
                            mapperClass = job.GetMapperClass();
                        }
                        catch (TypeLoadException e)
                        {
                            throw new IOException("Mapper class is not found", e);
                        }
                    }
                    FileInputFormat.SetInputPaths(jobCopy, Sharpen.Collections.ToArray(groupPaths, new Path
                                                                                       [groupPaths.Count]));
                    // Get splits for each input path and tag with InputFormat
                    // and Mapper types by wrapping in a TaggedInputSplit.
                    IList<InputSplit> pathSplits = format.GetSplits(jobCopy);
                    foreach (InputSplit pathSplit in pathSplits)
                    {
                        splits.AddItem(new TaggedInputSplit(pathSplit, conf, format.GetType(), mapperClass
                                                            ));
                    }
                }
            }
            return(splits);
        }
Пример #8
0
 /// <summary>
 /// Delegates split computation to the wrapped input format <c>inf</c>, passing
 /// it the configuration produced by <c>GetConf</c> for <paramref name="job"/>.
 /// </summary>
 /// <param name="job">Job configuration handed to <c>GetConf</c>.</param>
 /// <param name="numSplits">Split count hint forwarded unchanged.</param>
 /// <returns>The splits returned by the wrapped input format.</returns>
 /// <exception cref="System.IO.IOException"/>
 public override InputSplit[] GetSplits(JobConf job, int numSplits)
 {
     var delegateConf = GetConf(job);
     InputSplit[] delegateSplits = inf.GetSplits(delegateConf, numSplits);

     return(delegateSplits);
 }
Пример #9
0
 /// <summary>
 /// Delegates split computation to the wrapped input format <c>inf</c>, using a
 /// new <c>JobContextImpl</c> built from the configuration that <c>GetConf</c>
 /// produces for the caller's configuration and from the caller's job id.
 /// </summary>
 /// <param name="context">Job context supplying the configuration and job id.</param>
 /// <returns>The splits returned by the wrapped input format.</returns>
 /// <exception cref="System.IO.IOException"/>
 /// <exception cref="System.Exception"/>
 public override IList GetSplits(JobContext context)
 {
     var delegateConf    = GetConf(context.GetConfiguration());
     var delegateContext = new JobContextImpl(delegateConf, context.GetJobID());

     return(inf.GetSplits(delegateContext));
 }