Пример #1
0
            /// <summary>From each split sampled, take the first numSamples / numSplits records.</summary>
            /// <param name="inf">Input format used to enumerate the splits and to open a record reader on each sampled split.</param>
            /// <param name="job">Job passed to <c>GetSplits</c>; its configuration is used for the sampling context and for copying keys.</param>
            /// <returns>
            /// The sampled keys as an array. When the input format reports no splits, no reader is
            /// opened and the (empty) sample list is converted as-is.
            /// </returns>
            /// <exception cref="System.IO.IOException"/>
            /// <exception cref="System.Exception"/>
            public virtual K[] GetSample(InputFormat <K, V> inf, Job job)
            {
                IList <InputSplit> splits  = inf.GetSplits(job);
                AList <K>          samples = new AList <K>(numSamples);
                int  splitsToSample        = Math.Min(maxSplitsSampled, splits.Count);
                // Guard the division: with no splits to sample the quotient is never used
                // (the loop below does not run), but an unguarded divide would throw.
                int  samplesPerSplit       = splitsToSample == 0 ? 0 : numSamples / splitsToSample;
                // Running total across all splits, not per split.
                long records = 0;

                for (int i = 0; i < splitsToSample; ++i)
                {
                    TaskAttemptContext samplingContext = new TaskAttemptContextImpl(job.GetConfiguration
                                                                                        (), new TaskAttemptID());
                    RecordReader <K, V> reader = inf.CreateRecordReader(splits[i], samplingContext);
                    try
                    {
                        reader.Initialize(splits[i], samplingContext);
                        while (reader.NextKeyValue())
                        {
                            // Copy the key so the stored sample is a separate object from
                            // whatever the reader hands out on the next iteration.
                            samples.AddItem(ReflectionUtils.Copy(job.GetConfiguration(), reader.GetCurrentKey
                                                                     (), null));
                            ++records;
                            // The threshold grows with the split index because 'records' is
                            // cumulative: a short split leaves a deficit that later splits
                            // are allowed to make up.
                            if ((i + 1) * samplesPerSplit <= records)
                            {
                                break;
                            }
                        }
                    }
                    finally
                    {
                        // Release the reader even when initialization or reading throws.
                        reader.Close();
                    }
                }
                // ArrayList::toArray doesn't preserve type
                return((K[])Sharpen.Collections.ToArray(samples));
            }
Пример #2
0
 /// <summary>
 /// Builds the record reader registered under <c>ident</c> in <c>rrCstrMap</c>, handing it
 /// <c>id</c>, the reader that <c>inf</c> creates for the given split, and <c>cmpcl</c>.
 /// </summary>
 /// <param name="split">Split forwarded to the wrapped input format.</param>
 /// <param name="taskContext">
 /// Caller's task context; supplies the configuration and is wrapped in a status reporter
 /// for the context passed to the wrapped input format.
 /// </param>
 /// <returns>The reader instance produced by the registered constructor entry.</returns>
 /// <exception cref="System.IO.IOException">
 /// No entry is registered for <c>ident</c>, or instantiation failed with one of the
 /// exception types caught below.
 /// </exception>
 /// <exception cref="System.Exception"/>
 public override RecordReader CreateRecordReader(InputSplit split, TaskAttemptContext
                                                 taskContext)
 {
     try
     {
         bool hasConstructor = rrCstrMap.Contains(ident);
         if (!hasConstructor)
         {
             throw new IOException("No RecordReader for " + ident);
         }

         // Derive the configuration first; the attempt id is read back out of it.
         Configuration jobConf   = GetConf(taskContext.GetConfiguration());
         var           attemptId = TaskAttemptID.ForName(jobConf.Get(MRJobConfig.TaskAttemptId));
         var           reporter  = new Parser.WrappedStatusReporter(taskContext);

         TaskAttemptContext wrappedContext = new TaskAttemptContextImpl(jobConf, attemptId, reporter);

         return(rrCstrMap[ident].NewInstance(id, inf.CreateRecordReader(split, wrappedContext), cmpcl));
     }
     catch (MemberAccessException accessFailure)
     {
         throw new IOException(accessFailure);
     }
     catch (InstantiationException instantiationFailure)
     {
         throw new IOException(instantiationFailure);
     }
     catch (TargetInvocationException invocationFailure)
     {
         throw new IOException(invocationFailure);
     }
 }
Пример #3
0
            /// <summary>
            /// Randomize the split order, then take the specified number of keys from
            /// each split sampled, where each key is selected with the specified
            /// probability and possibly replaced by a subsequently selected key when
            /// the quota of keys from that split is satisfied.
            /// </summary>
            /// <param name="inf">Input format used to enumerate the splits and to open a record reader on each sampled split.</param>
            /// <param name="job">Job passed to <c>GetSplits</c>; its configuration is used for the sampling context and for copying keys.</param>
            /// <returns>The sampled keys as an array (at most <c>numSamples</c> entries are ever held).</returns>
            /// <remarks>
            /// This method has side effects beyond its return value: it reorders the list returned
            /// by <c>GetSplits</c> in place and scales down <c>freq</c> (declared outside this
            /// method) every time a selected key arrives after the sample list is full.
            /// </remarks>
            /// <exception cref="System.IO.IOException"/>
            /// <exception cref="System.Exception"/>
            public virtual K[] GetSample(InputFormat <K, V> inf, Job job)
            {
                IList <InputSplit> splits  = inf.GetSplits(job);
                AList <K>          samples = new AList <K>(numSamples);
                int    splitsToSample      = Math.Min(maxSplitsSampled, splits.Count);
                Random r    = new Random();
                long   seed = r.NextLong();

                // Re-seed with a value we can log, so the seed used for this run is on record.
                r.SetSeed(seed);
                Log.Debug("seed: " + seed);
                // shuffle splits
                for (int i = 0; i < splits.Count; ++i)
                {
                    InputSplit tmp = splits[i];
                    int        j   = r.Next(splits.Count);
                    splits.Set(i, splits[j]);
                    splits.Set(j, tmp);
                }
                // our target rate is in terms of the maximum number of sample splits,
                // but we accept the possibility of sampling additional splits to hit
                // the target sample keyset
                for (int splitIndex = 0; splitIndex < splitsToSample || (splitIndex < splits.Count && samples.Count <
                                                                         numSamples); ++splitIndex)
                {
                    TaskAttemptContext samplingContext = new TaskAttemptContextImpl(job.GetConfiguration
                                                                                        (), new TaskAttemptID());
                    RecordReader <K, V> reader = inf.CreateRecordReader(splits[splitIndex], samplingContext);
                    try
                    {
                        reader.Initialize(splits[splitIndex], samplingContext);
                        while (reader.NextKeyValue())
                        {
                            if (r.NextDouble() <= freq)
                            {
                                if (samples.Count < numSamples)
                                {
                                    samples.AddItem(ReflectionUtils.Copy(job.GetConfiguration(), reader.GetCurrentKey
                                                                             (), null));
                                }
                                else
                                {
                                    // When exceeding the maximum number of samples, replace a
                                    // random element with this one, then adjust the frequency
                                    // to reflect the possibility of existing elements being
                                    // pushed out
                                    int ind = r.Next(numSamples);
                                    // NOTE(review): Next(numSamples) is presumably exclusive of
                                    // numSamples, so this check would only matter when numSamples
                                    // is 0 -- confirm against the Random implementation in use
                                    // before removing it.
                                    if (ind != numSamples)
                                    {
                                        samples.Set(ind, ReflectionUtils.Copy(job.GetConfiguration(), reader.GetCurrentKey
                                                                                  (), null));
                                    }
                                    freq *= (numSamples - 1) / (double)numSamples;
                                }
                            }
                        }
                    }
                    finally
                    {
                        // Release the reader even when initialization or reading throws.
                        reader.Close();
                    }
                }
                // ArrayList::toArray doesn't preserve type
                return((K[])Sharpen.Collections.ToArray(samples));
            }
Пример #4
0
        /// <summary>
        /// Constructs the DelegatingRecordReader by instantiating the input format named by
        /// the tagged split and asking it for the record reader to delegate to.
        /// </summary>
        /// <param name="split">TaggedInputSplit object (cast from <c>InputSplit</c>).</param>
        /// <param name="context">TaskAttemptContext object; also supplies the configuration for instantiation.</param>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.Exception"/>
        public DelegatingRecordReader(InputSplit split, TaskAttemptContext context)
        {
            // The tagged split carries both the input format class and the wrapped split.
            TaggedInputSplit tagged = (TaggedInputSplit)split;

            // Instantiate the input format named by the tagged split.
            var formatClass = tagged.GetInputFormatClass();
            var conf        = context.GetConfiguration();
            InputFormat <K, V> format = (InputFormat <K, V>)ReflectionUtils.NewInstance(formatClass, conf);

            // The delegate reads the wrapped split, not the tagged wrapper.
            var wrappedSplit = tagged.GetInputSplit();
            originalRR = format.CreateRecordReader(wrappedSplit, context);
        }
Пример #5
0
        /// <summary>
        /// Reads every record of <paramref name="split"/> through <paramref name="format"/> and
        /// returns a copy of each value, in the order the reader produced them.
        /// </summary>
        /// <param name="format">Input format that creates the record reader for the split.</param>
        /// <param name="split">Split to read.</param>
        /// <param name="job">Job whose configuration is used for the dummy task contexts.</param>
        /// <returns>List of copied values; empty when the reader yields no records.</returns>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.Exception"/>
        private static IList <Text> ReadSplit(InputFormat <LongWritable, Text> format, InputSplit
                                              split, Job job)
        {
            IList <Text>       result  = new AList <Text>();
            Configuration      conf    = job.GetConfiguration();
            TaskAttemptContext context = MapReduceTestUtil.CreateDummyMapTaskAttemptContext(conf
                                                                                            );
            // The reader is created with its own dummy context; 'context' above only
            // contributes the task attempt id for the map context.
            RecordReader <LongWritable, Text> reader = format.CreateRecordReader(split, MapReduceTestUtil
                                                                                 .CreateDummyMapTaskAttemptContext(conf));
            try
            {
                MapContext <LongWritable, Text, LongWritable, Text> mcontext = new MapContextImpl <
                    LongWritable, Text, LongWritable, Text>(conf, context.GetTaskAttemptID(), reader
                                                            , null, null, MapReduceTestUtil.CreateDummyReporter(), split);

                reader.Initialize(split, mcontext);
                while (reader.NextKeyValue())
                {
                    // Copy the value so each list entry is a separate Text object from
                    // the one the reader returns on later iterations.
                    result.AddItem(new Text(reader.GetCurrentValue()));
                }
            }
            finally
            {
                // Release the reader whether reading finished or threw.
                reader.Close();
            }
            return(result);
        }