/// <summary>
/// Parses a string describing a list of features.
/// </summary>
/// <remarks>
/// The expected form is 'FeatureIndex1:Value1|FeatureIndex2:Value2|...'. Descriptions that are empty or
/// consist of whitespace only are skipped. Indices and values are parsed using the invariant culture, so the
/// result does not depend on the regional settings of the current thread. Negative feature indices are
/// reported as invalid feature descriptions.
/// </remarks>
/// <param name="featureString">The string containing the list of features.</param>
/// <param name="parsingContext">The file parsing context.</param>
/// <param name="featureCount">The number of features in the dataset, which would be updated from the parsed feature indices.</param>
/// <returns>A sparse array of features extracted from <paramref name="featureString"/>.</returns>
private static Vector ParseFeatures(string featureString, FileParsingContext parsingContext, ref int featureCount)
{
    Debug.Assert(featureString != null, "A valid feature string should be specified.");

    // The file is machine-readable and comma-delimited, so numbers must never be interpreted
    // using the current culture (where '.' may be a group separator rather than a decimal point).
    var invariantCulture = System.Globalization.CultureInfo.InvariantCulture;

    // Keyed by feature index; the sorted dictionary yields the values in ascending index order.
    var featureIndexToValue = new SortedDictionary<int, double>();

    foreach (string featureDescription in featureString.Split('|'))
    {
        if (string.IsNullOrWhiteSpace(featureDescription))
        {
            continue;
        }

        string[] featureDescriptionParts = featureDescription.Split(':');
        int featureIndex = 0;
        double featureValue = 0;
        bool isWellFormed =
            featureDescriptionParts.Length == 2 &&
            int.TryParse(
                featureDescriptionParts[0],
                System.Globalization.NumberStyles.Integer,
                invariantCulture,
                out featureIndex) &&
            double.TryParse(
                featureDescriptionParts[1],
                System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
                invariantCulture,
                out featureValue);

        // A negative index cannot address a vector element, and int.MaxValue would overflow 'featureIndex + 1' below.
        if (!isWellFormed || featureIndex < 0 || featureIndex == int.MaxValue)
        {
            // NOTE(review): the code below relies on RaiseError aborting the parse (presumably by throwing) - confirm.
            parsingContext.RaiseError("Invalid feature description string.");
        }

        if (featureIndexToValue.ContainsKey(featureIndex))
        {
            parsingContext.RaiseError("Feature {0} is referenced several times.", featureIndex);
        }

        featureIndexToValue.Add(featureIndex, featureValue);
        featureCount = Math.Max(featureCount, featureIndex + 1);
    }

    // The vector is sized by the feature count known at this point, which may still grow while later records are parsed.
    List<ValueAtIndex<double>> sparseValues = featureIndexToValue
        .Select(kv => new ValueAtIndex<double>(kv.Key, kv.Value))
        .ToList();
    return SparseVector.FromSparseValues(featureCount, 0, sparseValues);
}
/// <summary>
/// Reads labeled feature values from a file with the specified name.
/// </summary>
/// <remarks>
/// Blank lines and lines starting with '#', '//' or '%' (after trimming) are skipped.
/// Every remaining line is parsed with <c>LabeledFeatureValues.Parse</c>; a parsing failure is
/// reported through the file parsing context together with the message of the original exception.
/// </remarks>
/// <param name="fileName">The file name.</param>
/// <param name="labelSet">An optional set of labels.</param>
/// <param name="featureSet">An optional set of features.</param>
/// <returns>A list of labeled feature values.</returns>
/// <exception cref="ArgumentException">Thrown if <paramref name="fileName"/> is null or whitespace.</exception>
public static IList<LabeledFeatureValues> LoadLabeledFeatureValues(
    string fileName,
    IndexedSet<string> labelSet = null,
    IndexedSet<string> featureSet = null)
{
    if (string.IsNullOrWhiteSpace(fileName))
    {
        throw new ArgumentException("The name of the file must not be null or whitespace.", nameof(fileName));
    }

    var labeledFeatureValues = new List<LabeledFeatureValues>();

    // Fall back to fresh sets when the caller does not supply shared ones.
    var labelDictionary = labelSet ?? new IndexedSet<string>();
    var featureDictionary = featureSet ?? new IndexedSet<string>();
    var parsingContext = new FileParsingContext(fileName);

    using (var reader = new StreamReader(fileName))
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            // Keep the parsing context in step with the reader (as the dataset loader does), so that
            // an error raised below can presumably be attributed to the line being processed.
            parsingContext.NextLine(line);

            if (string.IsNullOrWhiteSpace(line))
            {
                continue;
            }

            line = line.Trim();

            // Comment markers are plain ASCII prefixes, so compare ordinally rather than by culture.
            bool isComment =
                line.StartsWith("#", StringComparison.Ordinal) ||
                line.StartsWith("//", StringComparison.Ordinal) ||
                line.StartsWith("%", StringComparison.Ordinal);
            if (isComment)
            {
                continue;
            }

            try
            {
                labeledFeatureValues.Add(LabeledFeatureValues.Parse(line, labelDictionary, featureDictionary));
            }
            catch (Exception e)
            {
                // Re-report any parsing failure through the parsing context; the message is passed
                // as a format argument so that braces inside it are not interpreted as placeholders.
                parsingContext.RaiseError("{0}", e.Message);
            }
        }
    }

    return labeledFeatureValues;
}
/// <summary>
/// Loads dataset from a given file.
/// <para>
/// Data file format:
/// Row starting with 'R' describes min and max ratings and has form 'R,Min,Max'. It must be the first record in the file.
/// Rows starting with 'U' describe a single user and have form 'U,UserId,UserFeatures'.
/// Rows starting with 'I' describe a single item and have form 'I,ItemId,ItemFeatures'.
/// Rows other than that describe instances and should have form 'UserID,ItemID,Rating'.
/// Feature description has form 'FeatureIndex1:Value1|FeatureIndex2:Value2|...'
/// If all the user features are zero or there are no user features in the dataset at all, the user description can be omitted. Same is true for items.
/// Empty lines and lines starting with '#' are skipped. Integers are parsed using the invariant culture.
/// </para>
/// </summary>
/// <param name="fileName">File to load data from.</param>
/// <returns>The loaded dataset.</returns>
public static RecommenderDataset Load(string fileName)
{
    // Integers in the file are machine-written, so they are parsed independently of the current culture.
    var invariantCulture = System.Globalization.CultureInfo.InvariantCulture;
    const System.Globalization.NumberStyles IntegerStyle = System.Globalization.NumberStyles.Integer;

    var rawObservations = new List<Tuple<string, string, int>>();
    var userIdToFeatures = new Dictionary<string, Vector>();
    var itemIdToFeatures = new Dictionary<string, Vector>();
    int? minRating = null, maxRating = null;
    int userFeatureCount = 0, itemFeatureCount = 0;

    var parsingContext = new FileParsingContext(fileName);
    using (var reader = new StreamReader(fileName))
    {
        string line;
        bool isFirstRecord = true;
        while ((line = reader.ReadLine()) != null)
        {
            parsingContext.NextLine(line);
            if (line.Length == 0 || line.StartsWith("#", StringComparison.Ordinal))
            {
                continue; // Skip comments and empty lines
            }

            string[] splits = line.Split(',');
            string recordKind = splits[0].Trim();

            if (isFirstRecord)
            {
                // The first record must be the rating info record 'R,Min,Max'.
                int minRatingValue = 0, maxRatingValue = 0;
                if (splits.Length != 3 ||
                    recordKind != "R" ||
                    !int.TryParse(splits[1], IntegerStyle, invariantCulture, out minRatingValue) ||
                    !int.TryParse(splits[2], IntegerStyle, invariantCulture, out maxRatingValue))
                {
                    parsingContext.RaiseError("Invalid rating info record.");
                }

                minRating = minRatingValue;
                maxRating = maxRatingValue;
                isFirstRecord = false;
            }
            else if (recordKind == "U")
            {
                // User record: 'U,UserId,UserFeatures'.
                if (splits.Length != 3)
                {
                    parsingContext.RaiseError("Invalid user record.");
                }

                string userId = splits[1].Trim();
                if (userIdToFeatures.ContainsKey(userId))
                {
                    parsingContext.RaiseError("Record describing user '{0}' is presented more than once.", userId);
                }

                Vector features = ParseFeatures(splits[2], parsingContext, ref userFeatureCount);
                userIdToFeatures.Add(userId, features);
            }
            else if (recordKind == "I")
            {
                // Item record: 'I,ItemId,ItemFeatures'.
                if (splits.Length != 3)
                {
                    parsingContext.RaiseError("Invalid item record.");
                }

                string itemId = splits[1].Trim();
                if (itemIdToFeatures.ContainsKey(itemId))
                {
                    parsingContext.RaiseError("Record describing item '{0}' is presented more than once.", itemId);
                }

                Vector features = ParseFeatures(splits[2], parsingContext, ref itemFeatureCount);
                itemIdToFeatures.Add(itemId, features);
            }
            else
            {
                // Instance record: 'UserID,ItemID,Rating'.
                // Validate the number of fields before touching splits[1], so that a malformed line is
                // reported as a parsing error rather than failing with an IndexOutOfRangeException.
                int rating = 0;
                if (splits.Length != 3 ||
                    !int.TryParse(splits[2], IntegerStyle, invariantCulture, out rating))
                {
                    parsingContext.RaiseError("Invalid instance record.");
                }

                string userId = recordKind;
                string itemId = splits[1].Trim();
                rawObservations.Add(Tuple.Create(userId, itemId, rating));
            }
        }
    }

    if (!minRating.HasValue)
    {
        // No record was read at all, so the mandatory rating info record is absent.
        parsingContext.RaiseGlobalError("Rating info is missing.");
    }

    var result = new RecommenderDataset
    {
        StarRatingInfo = new StarRatingInfo(minRating.Value, maxRating.Value)
    };

    // Observations are materialized only after the whole file has been read, when the final
    // user and item feature counts are known.
    foreach (var observation in rawObservations)
    {
        string userId = observation.Item1;
        string itemId = observation.Item2;
        int rating = observation.Item3;

        if (rating < minRating.Value || rating > maxRating.Value)
        {
            parsingContext.RaiseGlobalError("One of the ratings is inconsistent with the specified rating info.");
        }

        User user = RetrieveEntity(
            userId, result.idToUser, userIdToFeatures, userFeatureCount, (id, features) => new User(id, features));
        Item item = RetrieveEntity(
            itemId, result.idToItem, itemIdToFeatures, itemFeatureCount, (id, features) => new Item(id, features));
        result.observations.Add(new RatedUserItem(user, item, rating));
    }

    return result;
}