/// <summary>Read in rating data from a file</summary>
/// <remarks>
/// If <c>FileSerializer.Should()</c> approves the given mappings and a file named
/// <c>filename + ".bin.TimedRatings"</c> exists, the ratings are deserialized from that file
/// and the text file is not parsed. Otherwise the text file is parsed, and the result is
/// serialized to that binary file when <c>FileSerializer.CanWrite()</c> permits it.
/// </remarks>
/// <param name="filename">the name of the file to read from</param>
/// <param name="user_mapping">mapping object for user IDs</param>
/// <param name="item_mapping">mapping object for item IDs</param>
/// <param name="test_rating_format">whether there is a rating column in each line or not</param>
/// <param name="ignore_first_line">if true, ignore the first line</param>
/// <returns>the rating data</returns>
static public ITimedRatings Read(
    string filename,
    IMapping user_mapping = null, IMapping item_mapping = null,
    TestRatingFileFormat test_rating_format = TestRatingFileFormat.WITH_RATINGS,
    bool ignore_first_line = false)
{
    string binary_filename = filename + ".bin.TimedRatings";
    if (FileSerializer.Should(user_mapping, item_mapping) && File.Exists(binary_filename))
    {
        return (ITimedRatings) FileSerializer.Deserialize(binary_filename);
    }

    // NOTE(review): Wrap.FormatException presumably attaches the file name to format errors -- confirm.
    return Wrap.FormatException<ITimedRatings>(filename, delegate() {
        using (var reader = new StreamReader(filename))
        {
            // Hand the format options on to the parser; without them the rating-column
            // and header-line settings requested by the caller would be ignored.
            var ratings = (TimedRatings) Read(reader, user_mapping, item_mapping, test_rating_format, ignore_first_line);
            if (FileSerializer.Should(user_mapping, item_mapping) && FileSerializer.CanWrite(binary_filename))
            {
                ratings.Serialize(binary_filename);
            }
            return ratings;
        }
    });
}
/// <summary>Read in static rating data from a file</summary>
/// <remarks>
/// If <c>FileSerializer.Should()</c> approves the given mappings and a file named
/// <c>filename + ".bin.StaticRatings"</c> exists, the ratings are deserialized from that file.
/// Otherwise the text file is read twice: once to count its lines (used as the size of the
/// rating storage) and once to parse the ratings.
/// </remarks>
/// <param name="filename">the name of the file to read from</param>
/// <param name="user_mapping">mapping object for user IDs</param>
/// <param name="item_mapping">mapping object for item IDs</param>
/// <param name="rating_type">the data type to be used for storing the ratings</param>
/// <param name="test_rating_format">whether there is a rating column in each line or not</param>
/// <param name="ignore_first_line">if true, ignore the first line</param>
/// <returns>the rating data</returns>
public static IRatings Read(
    string filename,
    IMapping user_mapping = null, IMapping item_mapping = null,
    RatingType rating_type = RatingType.FLOAT,
    TestRatingFileFormat test_rating_format = TestRatingFileFormat.WITH_RATINGS,
    bool ignore_first_line = false)
{
    string binary_filename = filename + ".bin.StaticRatings";
    if (FileSerializer.Should(user_mapping, item_mapping) && File.Exists(binary_filename))
        return (IRatings) FileSerializer.Deserialize(binary_filename);

    // first pass: count the lines to know how many ratings to expect
    int size = 0;
    using (var reader = new StreamReader(filename))
        while (reader.ReadLine() != null)
            size++;
    if (ignore_first_line)
        size--; // the header line is not a rating

    return Wrap.FormatException<IRatings>(filename, delegate() {
        using (var reader = new StreamReader(filename))
        {
            // Pass ignore_first_line on to the parser: the header was already removed from
            // the size above, so it must also be skipped while parsing.
            // NOTE(review): the cast assumes every IRatings created by the parser derives from StaticRatings -- confirm.
            var ratings = (StaticRatings) Read(reader, size, user_mapping, item_mapping, rating_type, test_rating_format, ignore_first_line);
            if (FileSerializer.Should(user_mapping, item_mapping) && FileSerializer.CanWrite(binary_filename))
                ratings.Serialize(binary_filename);
            return ratings;
        }
    });
}
/// <summary>Read in rating data from a TextReader</summary>
/// <remarks>
/// Each line is split at the separator "::". With ratings, the columns are user ID, item ID,
/// rating and time stamp; without ratings, they are user ID, item ID and time stamp.
/// The time stamp is an unsigned integer interpreted as seconds since 1970-01-01.
/// </remarks>
/// <param name="reader">the <see cref="TextReader"/> to read from</param>
/// <param name="user_mapping">mapping object for user IDs</param>
/// <param name="item_mapping">mapping object for item IDs</param>
/// <param name="test_rating_format">whether there is a rating column in each line or not</param>
/// <returns>the rating data</returns>
/// <exception cref="FormatException">thrown if a line has fewer columns than the format requires</exception>
static public ITimedRatings Read(
    TextReader reader,
    IMapping user_mapping = null, IMapping item_mapping = null,
    TestRatingFileFormat test_rating_format = TestRatingFileFormat.WITH_RATINGS)
{
    if (user_mapping == null)
    {
        user_mapping = new IdentityMapping();
    }
    if (item_mapping == null)
    {
        item_mapping = new IdentityMapping();
    }

    var ratings = new TimedRatings();

    // DateTimeKind.Unspecified, so the offset lookup below treats the value like a local time
    var unix_epoch = new DateTime(1970, 1, 1);

    string[] separators = { "::" };
    string line;
    int seconds_pos = test_rating_format == TestRatingFileFormat.WITH_RATINGS ? 3 : 2;

    while ((line = reader.ReadLine()) != null)
    {
        string[] tokens = line.Split(separators, StringSplitOptions.None);

        if (test_rating_format == TestRatingFileFormat.WITH_RATINGS && tokens.Length < 4)
        {
            throw new FormatException("Expected at least 4 columns: " + line);
        }
        if (test_rating_format == TestRatingFileFormat.WITHOUT_RATINGS && tokens.Length < 3)
        {
            throw new FormatException("Expected at least 3 columns: " + line);
        }

        int user_id = user_mapping.ToInternalID(tokens[0]);
        int item_id = item_mapping.ToInternalID(tokens[1]);
        float rating = test_rating_format == TestRatingFileFormat.WITH_RATINGS
            ? float.Parse(tokens[2], CultureInfo.InvariantCulture)
            : 0;

        // machine-readable data: parse independently of the current culture
        long seconds = uint.Parse(tokens[seconds_pos], CultureInfo.InvariantCulture);

        // Add the seconds to the epoch directly. Counting them from year 1 and then shifting
        // the result by 1969 years misaligns the leap years and yields dates one day off.
        var time = unix_epoch.AddTicks(seconds * TimeSpan.TicksPerSecond);

        // NOTE(review): the local UTC offset is subtracted as before; confirm this is the intended time zone handling.
        var offset = TimeZone.CurrentTimeZone.GetUtcOffset(time);
        time -= offset;

        ratings.Add(user_id, item_id, rating, time);
    }
    return ratings;
}
/// <summary>Read in static rating data from a TextReader</summary>
/// <remarks>
/// Empty lines are skipped. Every other line is split at <c>Constants.SPLIT_CHARS</c>;
/// the first column is the user ID, the second the item ID, and, if the format contains
/// ratings, the third is the rating value. Without a rating column the rating is stored as 0.
/// </remarks>
/// <param name="reader">the <see cref="TextReader"/> to read from</param>
/// <param name="size">the number of ratings in the file</param>
/// <param name="user_mapping">mapping object for user IDs</param>
/// <param name="item_mapping">mapping object for item IDs</param>
/// <param name="rating_type">the data type to be used for storing the ratings</param>
/// <param name="test_rating_format">whether there is a rating column in each line or not</param>
/// <param name="ignore_first_line">if true, ignore the first line</param>
/// <returns>the rating data</returns>
/// <exception cref="FormatException">thrown for an unknown rating type or a line with too few columns</exception>
public static IRatings Read(
    TextReader reader, int size,
    IMapping user_mapping = null, IMapping item_mapping = null,
    RatingType rating_type = RatingType.FLOAT,
    TestRatingFileFormat test_rating_format = TestRatingFileFormat.WITH_RATINGS,
    bool ignore_first_line = false)
{
    // fall back to identity mappings if the caller did not supply any
    user_mapping = user_mapping ?? new IdentityMapping();
    item_mapping = item_mapping ?? new IdentityMapping();

    // consume the header line before any data is parsed
    if (ignore_first_line)
        reader.ReadLine();

    // pick the storage class that matches the requested rating type
    IRatings result;
    switch (rating_type)
    {
        case RatingType.BYTE:
            result = new StaticByteRatings(size);
            break;
        case RatingType.FLOAT:
            result = new StaticRatings(size);
            break;
        default:
            throw new FormatException(string.Format("Unknown rating type: {0}", rating_type));
    }

    bool has_rating_column = test_rating_format == TestRatingFileFormat.WITH_RATINGS;
    bool no_rating_column  = test_rating_format == TestRatingFileFormat.WITHOUT_RATINGS;

    for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
    {
        if (line.Length == 0)
            continue;

        string[] fields = line.Split(Constants.SPLIT_CHARS);

        if (has_rating_column && fields.Length < 3)
            throw new FormatException("Expected at least 3 columns: " + line);
        if (no_rating_column && fields.Length < 2)
            throw new FormatException("Expected at least 2 columns: " + line);

        int user_id = user_mapping.ToInternalID(fields[0]);
        int item_id = item_mapping.ToInternalID(fields[1]);

        float rating = 0;
        if (has_rating_column)
            rating = float.Parse(fields[2], CultureInfo.InvariantCulture);

        result.Add(user_id, item_id, rating);
    }

    result.InitScale();
    return result;
}
/// <summary>Read in rating data from a file</summary>
/// <remarks>
/// A binary file named <c>filename + ".bin.TimedRatings"</c> is used instead of the text file
/// if it exists and <c>FileSerializer.Should()</c> approves the given mappings. After the text
/// file has been parsed, the result is written to that binary file if
/// <c>FileSerializer.CanWrite()</c> permits it.
/// </remarks>
/// <param name="filename">the name of the file to read from</param>
/// <param name="user_mapping">mapping object for user IDs</param>
/// <param name="item_mapping">mapping object for item IDs</param>
/// <param name="test_rating_format">whether there is a rating column in each line or not</param>
/// <returns>the rating data</returns>
static public ITimedRatings Read(
    string filename,
    IMapping user_mapping = null, IMapping item_mapping = null,
    TestRatingFileFormat test_rating_format = TestRatingFileFormat.WITH_RATINGS)
{
    string cache_file = filename + ".bin.TimedRatings";

    bool load_from_cache = FileSerializer.Should(user_mapping, item_mapping) && File.Exists(cache_file);
    if (load_from_cache)
    {
        return (ITimedRatings) FileSerializer.Deserialize(cache_file);
    }

    return Wrap.FormatException<ITimedRatings>(filename, () =>
    {
        using (var text_reader = new StreamReader(filename))
        {
            var parsed = (TimedRatings) Read(text_reader, user_mapping, item_mapping, test_rating_format);

            // store the binary version for later calls, if allowed
            bool write_cache = FileSerializer.Should(user_mapping, item_mapping) && FileSerializer.CanWrite(cache_file);
            if (write_cache)
            {
                parsed.Serialize(cache_file);
            }

            return parsed;
        }
    });
}
/// <summary>Read in static rating data from a file</summary>
/// <remarks>
/// A binary file named <c>filename + ".bin.StaticRatings"</c> is deserialized instead of the
/// text file if it exists and <c>FileSerializer.Should()</c> approves the given mappings.
/// Otherwise the lines of the text file are counted first, to size the rating storage,
/// and then the file is opened again and parsed.
/// </remarks>
/// <param name="filename">the name of the file to read from</param>
/// <param name="user_mapping">mapping object for user IDs</param>
/// <param name="item_mapping">mapping object for item IDs</param>
/// <param name="rating_type">the data type to be used for storing the ratings</param>
/// <param name="test_rating_format">whether there is a rating column in each line or not</param>
/// <param name="ignore_first_line">if true, ignore the first line</param>
/// <returns>the rating data</returns>
static public IRatings Read(
    string filename,
    IMapping user_mapping = null, IMapping item_mapping = null,
    RatingType rating_type = RatingType.FLOAT,
    TestRatingFileFormat test_rating_format = TestRatingFileFormat.WITH_RATINGS,
    bool ignore_first_line = false)
{
    string binary_filename = filename + ".bin.StaticRatings";
    if (FileSerializer.Should(user_mapping, item_mapping) && File.Exists(binary_filename))
    {
        return (IRatings) FileSerializer.Deserialize(binary_filename);
    }

    // count the lines of the file; this is the number of ratings to allocate space for
    int size = 0;
    using (var line_counter = new StreamReader(filename))
    {
        while (line_counter.ReadLine() != null)
        {
            size++;
        }
    }
    if (ignore_first_line)
    {
        size--; // do not count the header line
    }

    return Wrap.FormatException<IRatings>(filename, delegate() {
        using (var reader = new StreamReader(filename))
        {
            // The header line is excluded from the size above, so the parser has to skip it
            // as well; otherwise it would be parsed like a rating line.
            // NOTE(review): the cast assumes the parser only returns StaticRatings or subclasses of it -- confirm.
            var ratings = (StaticRatings) Read(
                reader, size,
                user_mapping, item_mapping,
                rating_type, test_rating_format, ignore_first_line);

            if (FileSerializer.Should(user_mapping, item_mapping) && FileSerializer.CanWrite(binary_filename))
            {
                ratings.Serialize(binary_filename);
            }
            return ratings;
        }
    });
}
/// <summary>Read in rating data from a TextReader</summary>
/// <remarks>
/// Lines are split at "::". The expected columns are user ID, item ID, rating (only if the
/// format contains ratings) and a time stamp given as unsigned integer seconds since 1970-01-01.
/// Without a rating column the rating is stored as 0.
/// </remarks>
/// <param name="reader">the <see cref="TextReader"/> to read from</param>
/// <param name="user_mapping">mapping object for user IDs</param>
/// <param name="item_mapping">mapping object for item IDs</param>
/// <param name="test_rating_format">whether there is a rating column in each line or not</param>
/// <returns>the rating data</returns>
/// <exception cref="FormatException">thrown if a line has fewer columns than the format requires</exception>
static public ITimedRatings Read(
    TextReader reader,
    IMapping user_mapping = null, IMapping item_mapping = null,
    TestRatingFileFormat test_rating_format = TestRatingFileFormat.WITH_RATINGS)
{
    if (user_mapping == null)
        user_mapping = new IdentityMapping();
    if (item_mapping == null)
        item_mapping = new IdentityMapping();

    var ratings = new TimedRatings();

    // Kind is Unspecified, like the values computed before, so the offset lookup is unchanged
    var unix_epoch = new DateTime(1970, 1, 1);

    string[] separators = { "::" };
    string line;
    int seconds_pos = test_rating_format == TestRatingFileFormat.WITH_RATINGS ? 3 : 2;

    while ((line = reader.ReadLine()) != null)
    {
        string[] tokens = line.Split(separators, StringSplitOptions.None);

        if (test_rating_format == TestRatingFileFormat.WITH_RATINGS && tokens.Length < 4)
            throw new FormatException("Expected at least 4 columns: " + line);
        if (test_rating_format == TestRatingFileFormat.WITHOUT_RATINGS && tokens.Length < 3)
            throw new FormatException("Expected at least 3 columns: " + line);

        int user_id = user_mapping.ToInternalID(tokens[0]);
        int item_id = item_mapping.ToInternalID(tokens[1]);
        float rating = test_rating_format == TestRatingFileFormat.WITH_RATINGS
            ? float.Parse(tokens[2], CultureInfo.InvariantCulture)
            : 0;

        // the time stamp is data, not user-facing text, so parse it culture-independently
        long seconds = uint.Parse(tokens[seconds_pos], CultureInfo.InvariantCulture);

        // Start from 1970-01-01 and add the seconds. Building the date from year 1 and adding
        // 1969 years afterwards puts the leap days in the wrong years (e.g. 978307200 would
        // become 2001-01-02 instead of 2001-01-01).
        var time = unix_epoch.AddTicks(seconds * TimeSpan.TicksPerSecond);

        // NOTE(review): subtracting the local UTC offset is kept from the previous behavior; confirm the intent.
        var offset = TimeZone.CurrentTimeZone.GetUtcOffset(time);
        time -= offset;

        ratings.Add(user_id, item_id, rating, time);
    }
    return ratings;
}
/// <summary>Read in static rating data from a TextReader</summary>
/// <remarks>
/// Lines of length zero are ignored. All other lines are split at <c>Constants.SPLIT_CHARS</c>
/// into user ID, item ID and (if present in the format) the rating value; if the format has
/// no rating column, 0 is stored as the rating. After all lines have been read,
/// <c>InitScale()</c> is called on the result.
/// </remarks>
/// <param name="reader">the <see cref="TextReader"/> to read from</param>
/// <param name="size">the number of ratings in the file</param>
/// <param name="user_mapping">mapping object for user IDs</param>
/// <param name="item_mapping">mapping object for item IDs</param>
/// <param name="rating_type">the data type to be used for storing the ratings</param>
/// <param name="test_rating_format">whether there is a rating column in each line or not</param>
/// <param name="ignore_first_line">if true, ignore the first line</param>
/// <returns>the rating data</returns>
/// <exception cref="FormatException">thrown for an unknown rating type or a line with too few columns</exception>
static public IRatings Read(
    TextReader reader, int size,
    IMapping user_mapping = null, IMapping item_mapping = null,
    RatingType rating_type = RatingType.FLOAT,
    TestRatingFileFormat test_rating_format = TestRatingFileFormat.WITH_RATINGS,
    bool ignore_first_line = false)
{
    if (user_mapping == null)
    {
        user_mapping = new IdentityMapping();
    }
    if (item_mapping == null)
    {
        item_mapping = new IdentityMapping();
    }

    // drop the header line, if requested
    if (ignore_first_line)
    {
        reader.ReadLine();
    }

    // guard clause: only the two known storage types are supported
    bool byte_storage = rating_type == RatingType.BYTE;
    if (!byte_storage && rating_type != RatingType.FLOAT)
    {
        throw new FormatException(string.Format("Unknown rating type: {0}", rating_type));
    }

    IRatings rating_data;
    if (byte_storage)
    {
        rating_data = new StaticByteRatings(size);
    }
    else
    {
        rating_data = new StaticRatings(size);
    }

    while (true)
    {
        string current_line = reader.ReadLine();
        if (current_line == null)
        {
            break;
        }
        if (current_line.Length == 0)
        {
            continue;
        }

        string[] columns = current_line.Split(Constants.SPLIT_CHARS);

        if (test_rating_format == TestRatingFileFormat.WITH_RATINGS && columns.Length < 3)
        {
            throw new FormatException("Expected at least 3 columns: " + current_line);
        }
        if (test_rating_format == TestRatingFileFormat.WITHOUT_RATINGS && columns.Length < 2)
        {
            throw new FormatException("Expected at least 2 columns: " + current_line);
        }

        int user_id = user_mapping.ToInternalID(columns[0]);
        int item_id = item_mapping.ToInternalID(columns[1]);

        float rating;
        if (test_rating_format == TestRatingFileFormat.WITH_RATINGS)
        {
            rating = float.Parse(columns[2], CultureInfo.InvariantCulture);
        }
        else
        {
            rating = 0;
        }

        rating_data.Add(user_id, item_id, rating);
    }

    rating_data.InitScale();
    return rating_data;
}
/// <summary>Load the training data and, if a test file is given, the test data</summary>
/// <remarks>
/// Which reader is used depends on the file format, on whether the recommender is a
/// <c>TimeAwareRatingPredictor</c>, on whether a chronological split is configured, and on
/// whether online evaluation is enabled. The loading time and the memory usage are written
/// to standard error afterwards.
/// </remarks>
protected override void LoadData()
{
    bool static_data = !online_eval;

    TimeSpan loading_time = Wrap.MeasureTime(delegate() {
        base.LoadData();

        bool ignore_first_line = file_format == RatingFileFormat.IGNORE_FIRST_LINE;

        // read training data
        if ((recommender is TimeAwareRatingPredictor || chronological_split != null) && file_format != RatingFileFormat.MOVIELENS_1M)
        {
            // pass on the header-line setting, as the static readers below do;
            // otherwise a header line would be parsed like a rating line
            training_data = TimedRatingData.Read(training_file, user_mapping, item_mapping, TestRatingFileFormat.WITH_RATINGS, ignore_first_line);
        }
        else
        {
            if (file_format == RatingFileFormat.DEFAULT)
            {
                training_data = static_data
                    ? StaticRatingData.Read(training_file, user_mapping, item_mapping, rating_type)
                    : RatingData.Read(training_file, user_mapping, item_mapping);
            }
            else if (file_format == RatingFileFormat.IGNORE_FIRST_LINE)
            {
                training_data = static_data
                    ? StaticRatingData.Read(training_file, user_mapping, item_mapping, rating_type, TestRatingFileFormat.WITH_RATINGS, true)
                    : RatingData.Read(training_file, user_mapping, item_mapping, true);
            }
            else if (file_format == RatingFileFormat.MOVIELENS_1M)
            {
                training_data = MovieLensRatingData.Read(training_file, user_mapping, item_mapping);
            }
            else if (file_format == RatingFileFormat.KDDCUP_2011)
            {
                training_data = MyMediaLite.IO.KDDCup2011.Ratings.Read(training_file);
            }
        }
        recommender.Ratings = training_data;

        // read test data
        if (test_file != null)
        {
            TestRatingFileFormat test_format = test_no_ratings
                ? TestRatingFileFormat.WITHOUT_RATINGS
                : TestRatingFileFormat.WITH_RATINGS;

            if (recommender is TimeAwareRatingPredictor && file_format != RatingFileFormat.MOVIELENS_1M)
            {
                // same header-line handling as for the training file
                test_data = TimedRatingData.Read(test_file, user_mapping, item_mapping, test_format, ignore_first_line);
            }
            else if (file_format == RatingFileFormat.MOVIELENS_1M)
            {
                test_data = MovieLensRatingData.Read(test_file, user_mapping, item_mapping, test_format);
            }
            else if (file_format == RatingFileFormat.KDDCUP_2011)
            {
                test_data = MyMediaLite.IO.KDDCup2011.Ratings.Read(test_file);
            }
            else
            {
                test_data = StaticRatingData.Read(test_file, user_mapping, item_mapping, rating_type, test_format, ignore_first_line);
            }

            // transductive recommenders additionally get the test data as feedback
            if (recommender is ITransductiveRatingPredictor)
            {
                ((ITransductiveRatingPredictor) recommender).AdditionalFeedback = test_data;
            }
        }
    });

    Console.Error.WriteLine(string.Format(CultureInfo.InvariantCulture, "loading_time {0:0.##}", loading_time.TotalSeconds));
    Console.Error.WriteLine("memory {0}", Memory.Usage);
}
/// <summary>Read in rating data from a TextReader</summary>
/// <remarks>
/// Empty lines are skipped; all other lines are split at <c>Constants.SPLIT_CHARS</c>.
/// The date/time column follows the rating column (or the item ID column, if the format has
/// no ratings). It is interpreted as follows, in this order:
/// a string of length 19 as "yyyy-mm-dd hh:mm:ss";
/// a string of length 10 with '-' at index 4 as "yyyy-mm-dd";
/// an unsigned integer as seconds since 1970-01-01;
/// anything else is handed to <c>DateTime.Parse()</c> with the invariant culture.
/// A date/time that is spread over two columns and enclosed in double quotes is joined first.
/// </remarks>
/// <param name="reader">the <see cref="TextReader"/> to read from</param>
/// <param name="user_mapping">mapping object for user IDs</param>
/// <param name="item_mapping">mapping object for item IDs</param>
/// <param name="test_rating_format">whether there is a rating column in each line or not</param>
/// <param name="ignore_first_line">if true, ignore the first line</param>
/// <returns>the rating data</returns>
/// <exception cref="FormatException">thrown if a line has fewer columns than the format requires</exception>
static public ITimedRatings Read(
    TextReader reader,
    IMapping user_mapping = null, IMapping item_mapping = null,
    TestRatingFileFormat test_rating_format = TestRatingFileFormat.WITH_RATINGS,
    bool ignore_first_line = false)
{
    if (user_mapping == null)
    {
        user_mapping = new IdentityMapping();
    }
    if (item_mapping == null)
    {
        item_mapping = new IdentityMapping();
    }
    if (ignore_first_line)
    {
        reader.ReadLine();
    }

    var ratings = new MyMediaLite.Data.TimedRatings();
    var time_split_chars = new char[] { ' ', '-', ':' };

    // DateTimeKind.Unspecified, so the offset lookup below treats the value like a local time
    var unix_epoch = new DateTime(1970, 1, 1);

    string line;
    int date_time_offset = test_rating_format == TestRatingFileFormat.WITH_RATINGS ? 3 : 2;

    while ((line = reader.ReadLine()) != null)
    {
        if (line.Length == 0)
        {
            continue;
        }

        string[] tokens = line.Split(Constants.SPLIT_CHARS);

        if (test_rating_format == TestRatingFileFormat.WITH_RATINGS && tokens.Length < 4)
        {
            throw new FormatException("Expected at least 4 columns: " + line);
        }
        if (test_rating_format == TestRatingFileFormat.WITHOUT_RATINGS && tokens.Length < 3)
        {
            throw new FormatException("Expected at least 3 columns: " + line);
        }

        int user_id = user_mapping.ToInternalID(tokens[0]);
        int item_id = item_mapping.ToInternalID(tokens[1]);
        float rating = test_rating_format == TestRatingFileFormat.WITH_RATINGS
            ? float.Parse(tokens[2], CultureInfo.InvariantCulture)
            : 0;

        // a quoted "date time" value was split into two tokens: join them and strip the quotes
        string date_string = tokens[date_time_offset];
        if (tokens[date_time_offset].StartsWith("\"") && tokens.Length > date_time_offset + 1 && tokens[date_time_offset + 1].EndsWith("\""))
        {
            date_string = tokens[date_time_offset] + " " + tokens[date_time_offset + 1];
            date_string = date_string.Substring(1, date_string.Length - 2);
        }

        DateTime time;
        uint seconds;
        if (date_string.Length == 19) // format "yyyy-mm-dd hh:mm:ss"
        {
            var date_time_tokens = date_string.Split(time_split_chars);
            time = new DateTime(
                int.Parse(date_time_tokens[0], CultureInfo.InvariantCulture),
                int.Parse(date_time_tokens[1], CultureInfo.InvariantCulture),
                int.Parse(date_time_tokens[2], CultureInfo.InvariantCulture),
                int.Parse(date_time_tokens[3], CultureInfo.InvariantCulture),
                int.Parse(date_time_tokens[4], CultureInfo.InvariantCulture),
                int.Parse(date_time_tokens[5], CultureInfo.InvariantCulture));
        }
        else if (date_string.Length == 10 && date_string[4] == '-') // format "yyyy-mm-dd"
        {
            var date_time_tokens = date_string.Split(time_split_chars);
            time = new DateTime(
                int.Parse(date_time_tokens[0], CultureInfo.InvariantCulture),
                int.Parse(date_time_tokens[1], CultureInfo.InvariantCulture),
                int.Parse(date_time_tokens[2], CultureInfo.InvariantCulture));
        }
        else if (uint.TryParse(date_string, NumberStyles.Integer, CultureInfo.InvariantCulture, out seconds)) // unsigned integer value, interpreted as seconds since Unix epoch
        {
            // Add the seconds to the epoch directly. Counting them from year 1 and then
            // shifting the result by 1969 years misaligns the leap years, which makes
            // some dates one day off.
            var epoch_time = unix_epoch.AddTicks(seconds * TimeSpan.TicksPerSecond);

            // NOTE(review): the local UTC offset is subtracted as before; confirm this is the intended time zone handling.
            var offset = TimeZone.CurrentTimeZone.GetUtcOffset(epoch_time);
            time = epoch_time - offset;
        }
        else
        {
            time = DateTime.Parse(date_string, CultureInfo.InvariantCulture);
        }
        ratings.Add(user_id, item_id, rating, time);

        // progress output: a dot every 200000 ratings, a line break every 12000000 ratings
        if (ratings.Count % 200000 == 199999)
        {
            Console.Error.Write(".");
        }
        if (ratings.Count % 12000000 == 11999999)
        {
            Console.Error.WriteLine();
        }
    }
    ratings.InitScale();
    return ratings;
}
/// <summary>Read in rating data from a TextReader</summary>
/// <remarks>
/// Lines of length zero are ignored; the remaining lines are split at <c>Constants.SPLIT_CHARS</c>
/// into user ID, item ID, rating (only if the format has a rating column) and date/time.
/// Supported date/time values, checked in this order: "yyyy-mm-dd hh:mm:ss" (length 19),
/// "yyyy-mm-dd" (length 10, '-' at index 4), unsigned integer seconds since 1970-01-01,
/// and finally anything <c>DateTime.Parse()</c> accepts with the invariant culture.
/// If the date/time is enclosed in double quotes and spread over two columns, the two
/// columns are joined with a space and the quotes are removed.
/// </remarks>
/// <param name="reader">the <see cref="TextReader"/> to read from</param>
/// <param name="user_mapping">mapping object for user IDs</param>
/// <param name="item_mapping">mapping object for item IDs</param>
/// <param name="test_rating_format">whether there is a rating column in each line or not</param>
/// <param name="ignore_first_line">if true, ignore the first line</param>
/// <returns>the rating data</returns>
/// <exception cref="FormatException">thrown if a line has fewer columns than the format requires</exception>
static public ITimedRatings Read(
    TextReader reader,
    IMapping user_mapping = null, IMapping item_mapping = null,
    TestRatingFileFormat test_rating_format = TestRatingFileFormat.WITH_RATINGS,
    bool ignore_first_line = false)
{
    if (user_mapping == null)
        user_mapping = new IdentityMapping();
    if (item_mapping == null)
        item_mapping = new IdentityMapping();

    if (ignore_first_line)
        reader.ReadLine();

    var ratings = new MyMediaLite.Data.TimedRatings();
    var time_split_chars = new char[] { ' ', '-', ':' };

    // Kind is Unspecified, like the values computed before, so the offset lookup is unchanged
    var unix_epoch = new DateTime(1970, 1, 1);

    string line;
    int date_time_offset = test_rating_format == TestRatingFileFormat.WITH_RATINGS ? 3 : 2;

    while ((line = reader.ReadLine()) != null)
    {
        if (line.Length == 0)
            continue;

        string[] tokens = line.Split(Constants.SPLIT_CHARS);

        if (test_rating_format == TestRatingFileFormat.WITH_RATINGS && tokens.Length < 4)
            throw new FormatException("Expected at least 4 columns: " + line);
        if (test_rating_format == TestRatingFileFormat.WITHOUT_RATINGS && tokens.Length < 3)
            throw new FormatException("Expected at least 3 columns: " + line);

        int user_id = user_mapping.ToInternalID(tokens[0]);
        int item_id = item_mapping.ToInternalID(tokens[1]);
        float rating = test_rating_format == TestRatingFileFormat.WITH_RATINGS
            ? float.Parse(tokens[2], CultureInfo.InvariantCulture)
            : 0;

        // re-assemble a quoted date/time that the column split has torn apart
        string date_string = tokens[date_time_offset];
        if (tokens[date_time_offset].StartsWith("\"") && tokens.Length > date_time_offset + 1 && tokens[date_time_offset + 1].EndsWith("\""))
        {
            date_string = tokens[date_time_offset] + " " + tokens[date_time_offset + 1];
            date_string = date_string.Substring(1, date_string.Length - 2);
        }

        DateTime time;
        uint seconds;
        if (date_string.Length == 19) // format "yyyy-mm-dd hh:mm:ss"
        {
            var date_time_tokens = date_string.Split(time_split_chars);
            time = new DateTime(
                int.Parse(date_time_tokens[0], CultureInfo.InvariantCulture),
                int.Parse(date_time_tokens[1], CultureInfo.InvariantCulture),
                int.Parse(date_time_tokens[2], CultureInfo.InvariantCulture),
                int.Parse(date_time_tokens[3], CultureInfo.InvariantCulture),
                int.Parse(date_time_tokens[4], CultureInfo.InvariantCulture),
                int.Parse(date_time_tokens[5], CultureInfo.InvariantCulture));
        }
        else if (date_string.Length == 10 && date_string[4] == '-') // format "yyyy-mm-dd"
        {
            var date_time_tokens = date_string.Split(time_split_chars);
            time = new DateTime(
                int.Parse(date_time_tokens[0], CultureInfo.InvariantCulture),
                int.Parse(date_time_tokens[1], CultureInfo.InvariantCulture),
                int.Parse(date_time_tokens[2], CultureInfo.InvariantCulture));
        }
        else if (uint.TryParse(date_string, NumberStyles.Integer, CultureInfo.InvariantCulture, out seconds)) // unsigned integer value, interpreted as seconds since Unix epoch
        {
            // Start from 1970-01-01 and add the seconds. Building the date from year 1 and
            // adding 1969 years afterwards puts the leap days in the wrong years
            // (e.g. 978307200 would become 2001-01-02 instead of 2001-01-01).
            var epoch_time = unix_epoch.AddTicks(seconds * TimeSpan.TicksPerSecond);

            // NOTE(review): subtracting the local UTC offset is kept from the previous behavior; confirm the intent.
            var offset = TimeZone.CurrentTimeZone.GetUtcOffset(epoch_time);
            time = epoch_time - offset;
        }
        else
            time = DateTime.Parse(date_string, CultureInfo.InvariantCulture);

        ratings.Add(user_id, item_id, rating, time);

        // progress output: a dot every 200000 ratings, a line break every 12000000 ratings
        if (ratings.Count % 200000 == 199999)
            Console.Error.Write(".");
        if (ratings.Count % 12000000 == 11999999)
            Console.Error.WriteLine();
    }
    ratings.InitScale();
    return ratings;
}