Пример #1
0
        /// <summary>
        /// Streams the training file at <c>_trainPath</c> and returns the running mean of
        /// <c>ComputeValue</c> over every row whose <c>is_booking</c> column equals 1.
        /// </summary>
        /// <param name="nGrams">
        /// Feature definitions; the i-th entry is resolved through <c>GetDataFromLine</c> and
        /// looked up in <c>_symbols[i]</c>.
        /// </param>
        /// <param name="columnsMappings">Optional mapping forwarded unchanged to <c>GetDataFromLine</c>.</param>
        /// <returns>The mean score over all booked rows, or 0 when no booked row was read.</returns>
        private double ComputeMetrics(List<NGram> nGrams, int[] columnsMappings = null)
        {
            // Size of the StreamReader buffer (in bytes) used while scanning the training file.
            const int readBufferSize = 100000000;

            double runningMean = 0;
            double count       = 0;

            using (var reader = new StreamReader(_trainPath, Encoding.ASCII, false, readBufferSize))
            {
                // The first line is read and discarded — presumably the CSV header; confirm against the data file.
                reader.ReadLine();

                // Scratch buffer handed to GetLikelyhood for every row and reused across rows.
                var classLikelihood = new ClassLikelyhood[2 * Classes];

                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    // A blank line would split into a single empty column and break the column lookups below.
                    if (string.IsNullOrWhiteSpace(line))
                    {
                        continue;
                    }

                    var columns = line.Split(',');

                    // Only booked rows contribute to the metric; filter before doing any further work.
                    if (int.Parse(columns[is_booking]) != 1)
                    {
                        continue;
                    }

                    var classVal = Convert.ToInt32(GetDataFromLine(columns, hotel_cluster).Value);

                    var dataPoints  = new List<DataPoint>();
                    var columnIndex = 0;
                    foreach (var field in nGrams)
                    {
                        var data = GetDataFromLine(columns, field.Columns, columnsMappings);
                        if (data != null)
                        {
                            // Values without an entry in the symbol table produce no data point.
                            var dict = _symbols[columnIndex];
                            if (dict.ContainsKey(data))
                            {
                                dataPoints.Add(new DataPoint
                                {
                                    // NOTE(review): the +23 offset is presumably the number of raw columns
                                    // that precede the n-gram features — confirm against the model layout.
                                    ColumnId = columnIndex + 23,
                                    Value    = dict[data]
                                });
                            }
                        }
                        columnIndex++;
                    }

                    var dataSample = new DataSample { DataPoints = dataPoints.ToArray() };
                    GetLikelyhood(dataSample, classLikelihood);

                    // Score the actual class against the first five entries of the likelihood buffer.
                    var value = ComputeValue(classVal,
                                             new[]
                    {
                        classLikelihood[0].ClassId, classLikelihood[1].ClassId, classLikelihood[2].ClassId,
                        classLikelihood[3].ClassId, classLikelihood[4].ClassId
                    });

                    // Incremental mean: re-weight the previous mean and fold in the new value.
                    runningMean = ((count / (count + 1.0)) * runningMean) + (value / (count + 1));
                    count       = count + 1;
                }
            }

            return runningMean;
        }
Пример #2
0
        /// <summary>
        /// Scores every sample returned by <c>GetDataSamples</c>, stores the first five class ids of
        /// each result (space separated) in the sample's <c>Tag</c>, and writes all tags to a
        /// timestamped <c>Submission_*</c> file with an <c>id,hotel_cluster</c> header.
        /// </summary>
        /// <param name="ngrams">Feature definitions forwarded to <c>GetDataSamples</c>.</param>
        /// <param name="naiveBayes">
        /// When true samples are scored with <c>GetLikelyhoodNaiveBayes</c>, otherwise with <c>GetLikelyhood</c>.
        /// </param>
        /// <param name="columnsMappings">Column mapping forwarded to <c>GetDataSamples</c>.</param>
        private void Estimate(List<NGram> ngrams, bool naiveBayes, int[] columnsMappings)
        {
            var dataSamples = GetDataSamples(ngrams, columnsMappings);

            // Partitioner.Create rejects an empty range, so only score when there is something to score.
            if (dataSamples.Count > 0)
            {
                Parallel.ForEach(Partitioner.Create(0, dataSamples.Count), range =>
                {
                    // One scratch buffer per partition, so concurrently running partitions never share it.
                    var resultData = new ClassLikelyhood[2 * Classes];
                    for (int i = range.Item1; i < range.Item2; i++)
                    {
                        if (naiveBayes)
                        {
                            GetLikelyhoodNaiveBayes(dataSamples[i], resultData);
                        }
                        else
                        {
                            GetLikelyhood(dataSamples[i], resultData);
                        }

                        dataSamples[i].Tag = resultData[0].ClassId + " " + resultData[1].ClassId + " " +
                                             resultData[2].ClassId + " " +
                                             resultData[3].ClassId + " " + resultData[4].ClassId;
                    }
                });
            }

            // "HH" is the 24-hour clock; with "hh" a morning and an afternoon run would produce the same name.
            var fileName = string.Format("Submission_{0}", DateTime.Now.ToString("dd-MM-yy HH-mm"));
            using (var sw = new StreamWriter(fileName))
            {
                sw.WriteLine("id,hotel_cluster");

                // The row id is the zero-based position of the sample in the list.
                for (var id = 0; id < dataSamples.Count; id++)
                {
                    sw.WriteLine(id + "," + dataSamples[id].Tag);
                }
            }
        }