/// <summary>
/// Streams the training file at <c>_trainPath</c> and returns the mean of
/// <c>ComputeValue</c> over every row whose <c>is_booking</c> column equals 1.
/// </summary>
/// <param name="nGrams">
/// Feature definitions. The i-th entry is looked up in <c>_symbols[i]</c>; a feature
/// whose value is missing from the row, or unknown to that dictionary, is left out
/// of the sample.
/// </param>
/// <param name="columnsMappings">
/// Optional column remapping, passed through unchanged to <c>GetDataFromLine</c>.
/// </param>
/// <returns>
/// The running mean of the per-row scores, or 0 when no row qualifies.
/// </returns>
private double ComputeMetrics(List<NGram> nGrams, int[] columnsMappings = null)
{
    double runningMean = 0;
    double count = 0;

    using (var reader = new StreamReader(_trainPath, Encoding.ASCII, false, 100000000))
    {
        // The first line is read and discarded (presumably a CSV header - confirm).
        reader.ReadLine();

        // Scratch buffer shared by all rows; GetLikelyhood writes its result into it.
        var classLikelihood = new ClassLikelyhood[2 * Classes];

        while (!reader.EndOfStream)
        {
            var columns = reader.ReadLine().Split(',');

            // Filter first so rows that are not bookings cost nothing beyond the split.
            if (int.Parse(columns[is_booking]) != 1)
            {
                continue;
            }

            var classVal = Convert.ToInt32(GetDataFromLine(columns, hotel_cluster).Value);

            var dataPoints = new List<DataPoint>();
            for (var columnIndex = 0; columnIndex < nGrams.Count; columnIndex++)
            {
                var data = GetDataFromLine(columns, nGrams[columnIndex].Columns, columnsMappings);
                if (data == null)
                {
                    continue;
                }

                var dict = _symbols[columnIndex];
                if (dict.ContainsKey(data))
                {
                    // NOTE(review): the +23 offset is carried over as-is; presumably the
                    // number of raw columns that precede the n-gram features - confirm.
                    dataPoints.Add(new DataPoint { ColumnId = columnIndex + 23, Value = dict[data] });
                }
            }

            var dataSample = new DataSample { DataPoints = dataPoints.ToArray() };
            GetLikelyhood(dataSample, classLikelihood);

            // Only the first five entries of the result buffer are scored.
            var value = ComputeValue(classVal, new[]
            {
                classLikelihood[0].ClassId,
                classLikelihood[1].ClassId,
                classLikelihood[2].ClassId,
                classLikelihood[3].ClassId,
                classLikelihood[4].ClassId
            });

            // Incremental mean: avoids accumulating one large sum over the whole file.
            runningMean = ((count / (count + 1.0)) * runningMean) + (value / (count + 1));
            count = count + 1;
        }
    }

    return runningMean;
}
/// <summary>
/// Scores every sample returned by <c>GetDataSamples</c> and writes the results to a
/// file named <c>Submission_&lt;timestamp&gt;</c> (relative path).
/// </summary>
/// <remarks>
/// The file starts with the header <c>id,hotel_cluster</c>, followed by one row per
/// sample: the sample's zero-based position, a comma, and the first five class ids
/// from the likelihood result separated by spaces. When there are no samples only
/// the header is written.
/// </remarks>
/// <param name="ngrams">Feature definitions forwarded to <c>GetDataSamples</c>.</param>
/// <param name="naiveBayes">
/// When true, samples are scored with <c>GetLikelyhoodNaiveBayes</c>; otherwise with
/// <c>GetLikelyhood</c>.
/// </param>
/// <param name="columnsMappings">Column remapping forwarded to <c>GetDataSamples</c>.</param>
private void Estimate(List<NGram> ngrams, bool naiveBayes, int[] columnsMappings)
{
    var dataSamples = GetDataSamples(ngrams, columnsMappings);

    // Partitioner.Create rejects an empty range, so skip the parallel pass entirely
    // when there is nothing to score.
    if (dataSamples.Count > 0)
    {
        var ranges = Partitioner.Create(0, dataSamples.Count);
        Parallel.ForEach(ranges, (range, loopState) =>
        {
            // One result buffer per partition, so workers never share scratch state.
            var resultData = new ClassLikelyhood[2 * Classes];

            for (var i = range.Item1; i < range.Item2; i++)
            {
                if (naiveBayes)
                {
                    GetLikelyhoodNaiveBayes(dataSamples[i], resultData);
                }
                else
                {
                    GetLikelyhood(dataSamples[i], resultData);
                }

                dataSamples[i].Tag = resultData[0].ClassId + " "
                                     + resultData[1].ClassId + " "
                                     + resultData[2].ClassId + " "
                                     + resultData[3].ClassId + " "
                                     + resultData[4].ClassId;
            }
        });
    }

    // "HH" (24-hour clock) keeps runs made twelve hours apart from producing the
    // same file name and silently overwriting each other.
    var fileName = "Submission_" + DateTime.Now.ToString("dd-MM-yy HH-mm");

    using (var sw = new StreamWriter(fileName))
    {
        sw.WriteLine("id,hotel_cluster");

        for (var index = 0; index < dataSamples.Count; index++)
        {
            sw.WriteLine(index + "," + dataSamples[index].Tag);
        }
    }
}