/// <summary>Evaluate a list of predictions on the Track 2 validation set</summary>
/// <param name="predictions">one byte (0/1 decision) per validation candidate</param>
/// <returns>the validation error as computed by the three-argument Eval overload</returns>
static double Eval(IList<byte> predictions)
{
	// use Path.Combine (as LoadData and Main do) instead of string concatenation,
	// so a trailing directory separator in data_dir cannot produce a malformed path
	var candidates = Track2Items.Read(Path.Combine(data_dir, "mml-track2/validationCandidatesIdx2.txt"));
	var hits       = Track2Items.Read(Path.Combine(data_dir, "mml-track2/validationHitsIdx2.txt"));
	return Eval(predictions, candidates, hits);
}
/// <summary>Load the KDD Cup 2011 Track 2 data sets and wire them into the recommenders</summary>
/// <remarks>
/// Reads training, validation, and test data (plus item meta-data where supported) from
/// data_dir, creates feedback for recommender_validate and recommender_final, and finally
/// drops the intermediate rating structures to free memory.
/// NOTE(review): relies on the static fields sample_data, predict_rated, recommender_validate,
/// and recommender_final being initialized before the call -- confirm against the caller.
/// </remarks>
/// <param name="data_dir">directory containing the KDD Cup 2011 data files</param>
static void LoadData(string data_dir)
{
	string training_file              = Path.Combine(data_dir, "trainIdx2.txt");
	string test_file                  = Path.Combine(data_dir, "testIdx2.txt");
	string validation_candidates_file = Path.Combine(data_dir, "validationCandidatesIdx2.txt");
	string validation_ratings_file    = Path.Combine(data_dir, "validationRatingsIdx2.txt");
	string validation_hits_file       = Path.Combine(data_dir, "validationHitsIdx2.txt");
	string track_file                 = Path.Combine(data_dir, "trackData2.txt");
	string album_file                 = Path.Combine(data_dir, "albumData2.txt");
	string artist_file                = Path.Combine(data_dir, "artistData2.txt");
	string genre_file                 = Path.Combine(data_dir, "genreData2.txt");

	// sample mode: switch the rating/candidate files to truncated copies for quick test runs
	if (sample_data)
	{
		training_file              = Path.Combine(data_dir, "trainIdx2.firstLines.txt");
		test_file                  = Path.Combine(data_dir, "testIdx2.firstLines.txt");
		validation_candidates_file = Path.Combine(data_dir, "validationCandidatesIdx2.firstLines.txt");
		validation_ratings_file    = Path.Combine(data_dir, "validationRatingsIdx2.firstLines.txt");
		validation_hits_file       = Path.Combine(data_dir, "validationHitsIdx2.firstLines.txt");
	}

	TimeSpan loading_time = Utils.MeasureTime(delegate() {
		// read training data
		training_ratings = MyMediaLite.IO.KDDCup2011.Ratings.Read(training_file);

		// read validation data; candidates and hits are keyed by user, so their counts must agree
		validation_candidates = Track2Items.Read(validation_candidates_file);
		validation_hits = Track2Items.Read(validation_hits_file);
		if (validation_hits.Count != validation_candidates.Count)
		{
			throw new Exception("inconsistent number of users in hits and candidates");
		}
		validation_ratings = MyMediaLite.IO.KDDCup2011.Ratings.Read(validation_ratings_file);
		// training + validation ratings together train the final (test-time) recommender
		complete_ratings = new CombinedRatings(training_ratings, validation_ratings);

		// read test data
		test_candidates = Track2Items.Read(test_file);

		// read item data (track/album/artist/genre meta-data), if the recommender can use it
		if (recommender_validate is IKDDCupRecommender)
		{
			var kddcup_recommender = recommender_validate as IKDDCupRecommender;
			kddcup_recommender.ItemInfo = MyMediaLite.IO.KDDCup2011.Items.Read(track_file, album_file, artist_file, genre_file, 2);
		}

		// connect data and recommenders
		if (predict_rated)
		{
			recommender_validate.Feedback = CreateFeedback(training_ratings);
			recommender_final.Feedback = CreateFeedback(complete_ratings);
		}
		else
		{
			// normal item recommenders
			// NOTE(review): 80 presumably is the positive-feedback rating threshold
			// for Track 2 -- confirm against CreateFeedback
			recommender_validate.Feedback = CreateFeedback(training_ratings, 80);
			recommender_final.Feedback = CreateFeedback(complete_ratings, 80);
		}

		if (recommender_validate is ITransductiveItemRecommender)
		{
			// add additional data to semi-supervised models

			// for the validation recommender: all validation users and all their candidate items
			((ITransductiveItemRecommender)recommender_validate).TestUsers = new HashSet<int>(validation_candidates.Keys);
			var validation_items = new HashSet<int>();
			foreach (var l in validation_candidates.Values)
			{
				foreach (var i in l)
				{
					validation_items.Add(i);
				}
			}
			((ITransductiveItemRecommender)recommender_validate).TestItems = validation_items;

			// for the test/final recommender: same construction from the test candidates
			((ITransductiveItemRecommender)recommender_final).TestUsers = new HashSet<int>(test_candidates.Keys);
			var test_items = new HashSet<int>();
			foreach (var l in test_candidates.Values)
			{
				foreach (var i in l)
				{
					test_items.Add(i);
				}
			}
			((ITransductiveItemRecommender)recommender_final).TestItems = test_items;
		}

		// the raw rating structures are no longer needed once the feedback has been created;
		// drop the references so the GC can reclaim the memory
		Console.Error.WriteLine("memory before deleting ratings: {0}", Memory.Usage);
		training_ratings = null;
		complete_ratings = null;
		Console.Error.WriteLine("memory after deleting ratings: {0}", Memory.Usage);
	});
	Console.Error.WriteLine("loading_time {0:0.##}", loading_time.TotalSeconds.ToString(CultureInfo.InvariantCulture));
	Utils.DisplayDataStats(recommender_final.Feedback, null, recommender_final);
}
// TODO more structure
/// <summary>Greedy forward selection of an ensemble of prediction files</summary>
/// <remarks>
/// Seeds the ensemble with the candidate of lowest validation error, then repeatedly picks,
/// from the k remaining candidates with lowest error, the one whose predictions differ most
/// from the current merged ensemble; the pick is kept only if it improves the validation ERR.
/// </remarks>
/// <param name="candidate_files">the prediction files to select from</param>
/// <param name="k">number of top candidates considered per step (k == 1 means purely error-greedy)</param>
/// <param name="err_threshold">candidates with validation error at or above this value are discarded up front</param>
/// <returns>the list of files making up the selected ensemble</returns>
static IList<string> GreedyForwardSearch(IList<string> candidate_files, int k, double err_threshold)
{
	// use Path.Combine for consistency with the rest of the file
	var candidate_items = Track2Items.Read(Path.Combine(data_dir, "mml-track2/validationCandidatesIdx2.txt"));
	var item_hits       = Track2Items.Read(Path.Combine(data_dir, "mml-track2/validationHitsIdx2.txt"));

	// prediction cache (to save IO)
	var prediction_cache = new Dictionary<string, IList<byte>>();

	// get eval results for all predictions
	Console.Write("Calculating the errors of {0} candidates ... ", candidate_files.Count);
	var error = new Dictionary<string, double>();
	foreach (string file in candidate_files)
	{
		prediction_cache[file] = ReadFile(ValidationFilename(file));
		double err = Eval(prediction_cache[file], candidate_items, item_hits);
		// only keep if error is below threshold
		if (err < err_threshold)
		{
			error[file] = err;
			Console.Error.Write(".");
		}
		else
		{
			prediction_cache.Remove(file);
			Console.Error.Write("_");
		}
	}
	Console.WriteLine("done: candidates {0} memory {1}", error.Count, Memory.Usage);

	// the ensemble
	var ensemble = new List<string>();
	var ensemble_validation_predictions = new List<IList<byte>>();

	double best_result = 10; // ERR is a normalized error, so 10 acts as "+infinity"

	var files_by_error = new List<string>(
		from file in error.Keys orderby error[file] select file);

	// add the top model as a start
	var top_file = files_by_error.First();
	files_by_error.Remove(top_file);
	ensemble.Add(top_file);
	// FIX: reuse the cached predictions instead of re-reading the file from disk
	ensemble_validation_predictions.Add(prediction_cache[top_file]);

	// init merged predictions
	IList<byte> ensemble_merged_predictions = ensemble_validation_predictions.First();

	while (files_by_error.Count > 0) // Count property instead of LINQ Count()
	{
		// get the K best remaining candidates (by validation error)
		var top_k = files_by_error.Take(k);

		var difference = new Dictionary<string, double>();
		if (k > 1)
		{
			// compute difference to the current merged ensemble (diversity criterion)
			foreach (string file in top_k)
			{
				difference[file] = ComputeDifference(prediction_cache[file], ensemble_merged_predictions);
			}
		}
		else
		{
			var file = top_k.First();
			difference[file] = 0;
		}
		var files_by_difference =
			from file in difference.Keys orderby difference[file] descending select file;

		// remove from candidates, add to ensemble
		var next_candidate = files_by_difference.First();
		files_by_error.Remove(next_candidate);
		ensemble.Add(next_candidate);
		ensemble_validation_predictions.Add(prediction_cache[next_candidate]);
		Console.Write("({0}/{1}) {2}: {3:F7} ... ", error.Count - files_by_error.Count, error.Count, next_candidate, error[next_candidate]);

		// cache entry not needed any more
		prediction_cache.Remove(next_candidate);

		// remember the accepted ensemble's merge so it can be restored on rejection
		var previous_merged_predictions = ensemble_merged_predictions;
		ensemble_merged_predictions = MergePredictions(ensemble_validation_predictions);
		double result = Eval(ensemble_merged_predictions, candidate_items, item_hits);
		Console.Write("ERR {0:F7} ... ", result);

		if (result > best_result) // if no improvement
		{
			ensemble.RemoveAt(ensemble.Count - 1); // remove last
			ensemble_validation_predictions.RemoveAt(ensemble_validation_predictions.Count - 1); // remove last
			// BUGFIX: restore the merged predictions of the accepted ensemble -- otherwise the
			// next iteration's difference computation would compare against a merge that still
			// contains the rejected candidate
			ensemble_merged_predictions = previous_merged_predictions;
			Console.WriteLine(".");
		}
		else
		{
			best_result = result;
			Console.WriteLine("keep ({0}).", ensemble.Count);
		}
	}

	// show results
	foreach (var file in ensemble)
	{
		Console.WriteLine("{0} ({1})", file, error[file]);
	}
	Console.WriteLine("files {0} of {1} ERR {2:F7} memory {3}", ensemble.Count, error.Count, best_result, Memory.Usage);

	return ensemble;
}
/// <summary>Parameters: num_files weight_1 .. weight_n file_1 .. file_n output_file</summary>
/// <param name="args">the command-line arguments</param>
public static void Main(string[] args)
{
	AppDomain.CurrentDomain.UnhandledException += new UnhandledExceptionEventHandler(Handlers.UnhandledExceptionHandler);

	// parse command-line parameters
	string prediction_file = null;
	//string score_file = null;
	var options = new OptionSet() {
		{ "data-dir=",            v => data_dir        = v },
		{ "prediction-file=",     v => prediction_file = v },
		{ "sigmoid",              v => sigmoid         = v != null },
		{ "pairwise-probability", v => pairwise_prob   = v != null },
		{ "pairwise-wins",        v => pairwise_wins   = v != null },
		{ "rated-probability",    v => rated_prob      = v != null },
		{ "constant-rating",      v => constant_rating = v != null },
		//{ "score-file=", v => score_file = v },
	};
	IList<string> remaining_args = options.Parse(args);
	string rated_file = remaining_args[0];

	// combine the input prediction files into test and validation score lists
	IList<double> test_scores;
	IList<double> validation_scores;
	if (!constant_rating)
	{
		string rating_file = remaining_args[1];
		test_scores       = CombineFiles(rated_file, rating_file);
		validation_scores = CombineFiles(ValidationFilename(rated_file), ValidationFilename(rating_file));
	}
	else
	{
		test_scores       = ReadFile(rated_file);
		validation_scores = ReadFile(ValidationFilename(rated_file));
	}

	// compute error on validation set
	var candidates = Track2Items.Read(Path.Combine(data_dir, "mml-track2/validationCandidatesIdx2.txt"));
	var hits       = Track2Items.Read(Path.Combine(data_dir, "mml-track2/validationHitsIdx2.txt"));
	double error = KDDCup.EvaluateTrack2(Decide(validation_scores), candidates, hits);
	Console.WriteLine("ERR {0:F7}", error);

	// optionally write out the binarized decisions for the test and validation sets
	if (prediction_file != null)
	{
		WritePredictions(Decide(test_scores), prediction_file);
		WritePredictions(Decide(validation_scores), ValidationFilename(prediction_file));
	}
	/*
	if (score_file != null)
	{
		WriteScores(test_scores, score_file);
		WriteScores(test_scores, ValidationFilename(score_file));
	}
	*/
}