Example #1
0
    /// <summary>Score the given predictions on the Track 2 validation set.</summary>
    /// <param name="predictions">one prediction per validation candidate</param>
    /// <returns>the score computed by the three-argument Eval overload (presumably the error rate — confirm against that overload)</returns>
    static double Eval(IList <byte> predictions)
    {
        // load the validation candidates and the corresponding hits, then delegate
        var validation_candidates = Track2Items.Read(data_dir + "/mml-track2/validationCandidatesIdx2.txt");
        var validation_hits       = Track2Items.Read(data_dir + "/mml-track2/validationHitsIdx2.txt");

        return Eval(predictions, validation_candidates, validation_hits);
    }
Example #2
0
    /// <summary>Load all KDD Cup 2011 Track 2 data files and wire them into the recommenders.</summary>
    /// <remarks>
    /// Reads training, validation, and test data from <c>data_dir</c>, connects the resulting
    /// feedback to <c>recommender_validate</c> and <c>recommender_final</c>, and then releases the
    /// intermediate rating structures. If <c>sample_data</c> is set, truncated ".firstLines" files
    /// are used instead of the full data.
    /// </remarks>
    /// <param name="data_dir">directory containing the KDD Cup 2011 Track 2 data files</param>
    /// <exception cref="Exception">if the hits and candidates disagree on the number of users</exception>
    static void LoadData(string data_dir)
    {
        string training_file = Path.Combine(data_dir, "trainIdx2.txt");
        string test_file     = Path.Combine(data_dir, "testIdx2.txt");
        string validation_candidates_file = Path.Combine(data_dir, "validationCandidatesIdx2.txt");
        string validation_ratings_file    = Path.Combine(data_dir, "validationRatingsIdx2.txt");
        string validation_hits_file       = Path.Combine(data_dir, "validationHitsIdx2.txt");
        string track_file  = Path.Combine(data_dir, "trackData2.txt");
        string album_file  = Path.Combine(data_dir, "albumData2.txt");
        string artist_file = Path.Combine(data_dir, "artistData2.txt");
        string genre_file  = Path.Combine(data_dir, "genreData2.txt");

        if (sample_data)
        {
            // use truncated files for quick test runs
            training_file = Path.Combine(data_dir, "trainIdx2.firstLines.txt");
            test_file     = Path.Combine(data_dir, "testIdx2.firstLines.txt");
            validation_candidates_file = Path.Combine(data_dir, "validationCandidatesIdx2.firstLines.txt");
            validation_ratings_file    = Path.Combine(data_dir, "validationRatingsIdx2.firstLines.txt");
            validation_hits_file       = Path.Combine(data_dir, "validationHitsIdx2.firstLines.txt");
        }

        TimeSpan loading_time = Utils.MeasureTime(delegate() {
            // read training data
            training_ratings = MyMediaLite.IO.KDDCup2011.Ratings.Read(training_file);

            // read validation data
            validation_candidates = Track2Items.Read(validation_candidates_file);
            validation_hits       = Track2Items.Read(validation_hits_file);

            if (validation_hits.Count != validation_candidates.Count)
            {
                throw new Exception("inconsistent number of users in hits and candidates");
            }
            validation_ratings = MyMediaLite.IO.KDDCup2011.Ratings.Read(validation_ratings_file);

            complete_ratings = new CombinedRatings(training_ratings, validation_ratings);

            // read test data
            test_candidates = Track2Items.Read(test_file);

            // read item data (single cast instead of 'is' check followed by 'as')
            var kddcup_recommender = recommender_validate as IKDDCupRecommender;
            if (kddcup_recommender != null)
            {
                kddcup_recommender.ItemInfo = MyMediaLite.IO.KDDCup2011.Items.Read(track_file, album_file, artist_file, genre_file, 2);
            }

            // connect data and recommenders
            if (predict_rated)
            {
                recommender_validate.Feedback = CreateFeedback(training_ratings);
                recommender_final.Feedback    = CreateFeedback(complete_ratings);
            }
            else
            {
                // normal item recommenders; 80 is presumably a rating threshold — confirm against CreateFeedback
                recommender_validate.Feedback = CreateFeedback(training_ratings, 80);
                recommender_final.Feedback    = CreateFeedback(complete_ratings, 80);
            }

            // add additional data to semi-supervised models
            var transductive_validate = recommender_validate as ITransductiveItemRecommender;
            if (transductive_validate != null)
            {
                //   for the validation recommender
                transductive_validate.TestUsers = new HashSet <int>(validation_candidates.Keys);
                var validation_items = new HashSet <int>();
                foreach (var l in validation_candidates.Values)
                {
                    validation_items.UnionWith(l);
                }
                transductive_validate.TestItems = validation_items;

                //   for the test/final recommender
                var transductive_final = (ITransductiveItemRecommender)recommender_final;
                transductive_final.TestUsers = new HashSet <int>(test_candidates.Keys);
                var test_items = new HashSet <int>();
                foreach (var l in test_candidates.Values)
                {
                    test_items.UnionWith(l);
                }
                transductive_final.TestItems = test_items;
            }

            // release the per-phase rating structures; the recommenders hold the feedback they need
            Console.Error.WriteLine("memory before deleting ratings: {0}", Memory.Usage);
            training_ratings = null;
            complete_ratings = null;
            Console.Error.WriteLine("memory after deleting ratings:  {0}", Memory.Usage);
        });

        Console.Error.WriteLine("loading_time {0:0.##}", loading_time.TotalSeconds.ToString(CultureInfo.InvariantCulture));

        Utils.DisplayDataStats(recommender_final.Feedback, null, recommender_final);
    }
Example #3
0
    // TODO more structure
    static IList <string> GreedyForwardSearch(IList <string> candidate_files, int k, double err_threshold)
    {
        var candidate_items = Track2Items.Read(data_dir + "/mml-track2/validationCandidatesIdx2.txt");
        var item_hits       = Track2Items.Read(data_dir + "/mml-track2/validationHitsIdx2.txt");

        // prediction cache (to save IO)
        var prediction_cache = new Dictionary <string, IList <byte> >();

        // get eval results for all predictions
        Console.Write("Calculating the errors of {0} candidates ... ", candidate_files.Count);
        var error = new Dictionary <string, double>();

        foreach (string file in candidate_files)
        {
            prediction_cache[file] = ReadFile(ValidationFilename(file));
            double err = Eval(prediction_cache[file], candidate_items, item_hits);

            // only keep if error is below threshold
            if (err < err_threshold)
            {
                error[file] = err;
                Console.Error.Write(".");
            }
            else
            {
                prediction_cache.Remove(file);
                Console.Error.Write("_");
            }
        }
        Console.WriteLine("done: candidates {0} memory {1}", error.Count, Memory.Usage);

        // the ensemble
        var ensemble = new List <string>();
        var ensemble_validation_predictions = new List <IList <byte> >();

        double best_result = 10;

        var files_by_error = new List <string>(
            from file in error.Keys
            orderby error[file]
            select file);

        // add the top model as a start
        var top_file = files_by_error.First();

        files_by_error.Remove(top_file);
        ensemble.Add(top_file);
        ensemble_validation_predictions.Add(ReadFile(ValidationFilename(top_file)));

        // init merged predictions
        IList <byte> ensemble_merged_predictions = ensemble_validation_predictions.First();

        while (files_by_error.Count() > 0)
        {
            // get the K best candidates
            var top_k = files_by_error.Take(k);

            var difference = new Dictionary <string, double>();
            if (k > 1)
            {
                // compute difference
                foreach (string file in top_k)
                {
                    difference[file] = ComputeDifference(prediction_cache[file], ensemble_merged_predictions);
                }
            }
            else
            {
                var file = top_k.First();
                difference[file] = 0;
            }

            var files_by_difference =
                from file in difference.Keys
                orderby difference[file] descending
                select file;

            // remove from candidates, add to ensemble
            var next_candidate = files_by_difference.First();
            files_by_error.Remove(next_candidate);
            ensemble.Add(next_candidate);
            ensemble_validation_predictions.Add(prediction_cache[next_candidate]);
            Console.Write("({0}/{1}) {2}: {3:F7} ... ", error.Count - files_by_error.Count, error.Count, next_candidate, error[next_candidate]);

            // cache entry not needed any more
            prediction_cache.Remove(next_candidate);

            ensemble_merged_predictions = MergePredictions(ensemble_validation_predictions);
            double result = Eval(ensemble_merged_predictions, candidate_items, item_hits);
            Console.Write("ERR {0:F7} ... ", result);
            if (result > best_result)                                                                // if no improvement
            {
                ensemble.RemoveAt(ensemble.Count - 1);                                               // remove last
                ensemble_validation_predictions.RemoveAt(ensemble_validation_predictions.Count - 1); // remove last
                Console.WriteLine(".");
            }
            else
            {
                best_result = result;
                Console.WriteLine("keep ({0}).", ensemble.Count);
            }
        }

        // show results
        foreach (var file in ensemble)
        {
            Console.WriteLine("{0} ({1})", file, error[file]);
        }
        Console.WriteLine("files {0} of {1} ERR {2:F7} memory {3}", ensemble.Count, error.Count, best_result, Memory.Usage);

        return(ensemble);
    }
Example #4
0
    /// <summary>Parameters: num_files weight_1 .. weight_n file_1 .. file_n output_file</summary>
    /// <param name="args">the command-line arguments</param>
    /// <summary>Parameters: num_files weight_1 .. weight_n file_1 .. file_n output_file</summary>
    /// <param name="args">the command-line arguments</param>
    public static void Main(string[] args)
    {
        AppDomain.CurrentDomain.UnhandledException += new UnhandledExceptionEventHandler(Handlers.UnhandledExceptionHandler);

        // parse command-line parameters
        string prediction_file = null;
        //string score_file      = null;
        var options = new OptionSet()
        {
            { "data-dir=", v => data_dir = v },
            { "prediction-file=", v => prediction_file = v },
            { "sigmoid", v => sigmoid = v != null },
            { "pairwise-probability", v => pairwise_prob = v != null },
            { "pairwise-wins", v => pairwise_wins = v != null },
            { "rated-probability", v => rated_prob = v != null },
            { "constant-rating", v => constant_rating = v != null },
            //{ "score-file=",            v => score_file = v },
        };
        IList <string> remaining_args = options.Parse(args);

        string rated_file = remaining_args[0];

        // combine files: either a single file (constant rating) or a rated/rating pair
        IList <double> test_scores;
        IList <double> validation_scores;
        if (constant_rating)
        {
            test_scores       = ReadFile(rated_file);
            validation_scores = ReadFile(ValidationFilename(rated_file));
        }
        else
        {
            string rating_file = remaining_args[1];
            test_scores       = CombineFiles(rated_file, rating_file);
            validation_scores = CombineFiles(ValidationFilename(rated_file), ValidationFilename(rating_file));
        }

        // compute error on validation set
        string candidates_file = Path.Combine(data_dir, "mml-track2/validationCandidatesIdx2.txt");
        string hits_file       = Path.Combine(data_dir, "mml-track2/validationHitsIdx2.txt");

        var validation_candidates = Track2Items.Read(candidates_file);
        var validation_hits       = Track2Items.Read(hits_file);

        double validation_error = KDDCup.EvaluateTrack2(Decide(validation_scores), validation_candidates, validation_hits);
        Console.WriteLine("ERR {0:F7}", validation_error);

        // optionally write out the binarized predictions (test and validation variants)
        if (prediction_file != null)
        {
            WritePredictions(Decide(test_scores), prediction_file);
            WritePredictions(Decide(validation_scores), ValidationFilename(prediction_file));
        }

        /*
         * if (score_file != null)
         * {
         *      WriteScores(test_scores, score_file);
         *      WriteScores(test_scores, ValidationFilename(score_file));
         * }
         */
    }