/// <summary>
        /// Parses a string describing a list of features.
        /// </summary>
        /// <remarks>
        /// The string is split on '|' into feature descriptions, each of the form 'FeatureIndex:Value'.
        /// Empty or whitespace-only descriptions are skipped. Numbers are parsed with the invariant culture,
        /// so the same data file produces the same values regardless of the regional settings of the machine.
        /// Malformed descriptions, negative feature indices and repeated feature indices are reported
        /// through <paramref name="parsingContext"/>.
        /// </remarks>
        /// <param name="featureString">The string containing the list of features.</param>
        /// <param name="parsingContext">The file parsing context used to report errors.</param>
        /// <param name="featureCount">The number of features in the dataset, which is raised to the largest parsed feature index plus one when necessary.</param>
        /// <returns>A sparse array of features extracted from <paramref name="featureString"/>.</returns>
        private static Vector ParseFeatures(string featureString, FileParsingContext parsingContext, ref int featureCount)
        {
            Debug.Assert(featureString != null, "A valid feature string should be specified.");

            // Same number styles as the default TryParse overloads; only the culture is pinned.
            const System.Globalization.NumberStyles IndexStyles = System.Globalization.NumberStyles.Integer;
            const System.Globalization.NumberStyles ValueStyles =
                System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands;
            var invariantCulture = System.Globalization.CultureInfo.InvariantCulture;

            // Sorted by feature index so that the resulting sparse values are listed in index order.
            var featureIndexToValue = new SortedDictionary<int, double>();

            foreach (string featureDescription in featureString.Split('|'))
            {
                if (featureDescription.Trim().Length == 0)
                {
                    continue;
                }

                // NOTE(review): the checks below assume RaiseError does not return (presumably it throws) - confirm.
                string[] featureDescriptionParts = featureDescription.Split(':');
                int featureIndex = 0;
                double featureValue = 0;
                if (featureDescriptionParts.Length != 2 ||
                    !int.TryParse(featureDescriptionParts[0], IndexStyles, invariantCulture, out featureIndex) ||
                    !double.TryParse(featureDescriptionParts[1], ValueStyles, invariantCulture, out featureValue))
                {
                    parsingContext.RaiseError("Invalid feature description string.");
                }

                // A negative index cannot address an element of the feature vector.
                if (featureIndex < 0)
                {
                    parsingContext.RaiseError("Feature index {0} is negative.", featureIndex);
                }

                if (featureIndexToValue.ContainsKey(featureIndex))
                {
                    parsingContext.RaiseError("Feature {0} is referenced several times.", featureIndex);
                }

                featureIndexToValue.Add(featureIndex, featureValue);
                featureCount = Math.Max(featureCount, featureIndex + 1);
            }

            var sparseValues = featureIndexToValue
                .Select(kv => new ValueAtIndex<double>(kv.Key, kv.Value))
                .ToList();

            // The vector length is the feature count known so far; unlisted features take the common value 0.
            return SparseVector.FromSparseValues(featureCount, 0, sparseValues);
        }
// Example #2
// 0
        /// <summary>
        /// Reads labeled feature values from a file with the specified name.
        /// </summary>
        /// <remarks>
        /// Blank lines are skipped, as are lines whose first non-whitespace characters are '#', '//' or '%'.
        /// Every other line is trimmed and handed to <c>LabeledFeatureValues.Parse</c>; an exception thrown
        /// while parsing a line is reported through the file parsing context using the exception message.
        /// </remarks>
        /// <param name="fileName">The file name.</param>
        /// <param name="labelSet">An optional set of labels. A new empty set is used when omitted.</param>
        /// <param name="featureSet">An optional set of features. A new empty set is used when omitted.</param>
        /// <returns>A list of labeled feature values.</returns>
        /// <exception cref="ArgumentException">Thrown if <paramref name="fileName"/> is null or whitespace.</exception>
        public static IList<LabeledFeatureValues> LoadLabeledFeatureValues(
            string fileName,
            IndexedSet<string> labelSet = null,
            IndexedSet<string> featureSet = null)
        {
            if (string.IsNullOrWhiteSpace(fileName))
            {
                throw new ArgumentException("The name of the file must not be null or whitespace.", nameof(fileName));
            }

            var labeledFeatureValues = new List<LabeledFeatureValues>();
            var labelDictionary = labelSet ?? new IndexedSet<string>();
            var featureDictionary = featureSet ?? new IndexedSet<string>();
            var parsingContext = new FileParsingContext(fileName);

            using (var reader = new StreamReader(fileName))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    // Register every line read, including the skipped ones, exactly as Load does.
                    // Presumably this keeps the line information in reported errors accurate - confirm.
                    parsingContext.NextLine(line);

                    if (string.IsNullOrWhiteSpace(line))
                    {
                        continue;
                    }

                    line = line.Trim();

                    // Comment markers are plain symbols, so compare ordinally rather than by culture.
                    if (line.StartsWith("#", StringComparison.Ordinal) ||
                        line.StartsWith("//", StringComparison.Ordinal) ||
                        line.StartsWith("%", StringComparison.Ordinal))
                    {
                        continue;
                    }

                    try
                    {
                        labeledFeatureValues.Add(LabeledFeatureValues.Parse(line, labelDictionary, featureDictionary));
                    }
                    catch (Exception e)
                    {
                        // Convert any parsing failure into an error reported by the parsing context.
                        parsingContext.RaiseError("{0}", e.Message);
                    }
                }
            }

            return labeledFeatureValues;
        }
        /// <summary>
        /// Loads dataset from a given file.
        /// <para>
        /// Data file format:
        /// Row starting with 'R' describes min and max ratings and has form 'R,Min,Max'.
        /// Rows starting with 'U' describe a single user and have form 'U,UserId,UserFeatures'.
        /// Rows starting with 'I' describe a single item and have form 'I,ItemId,ItemFeatures'.
        /// Rows other than that describe instances and should have form 'UserID,ItemID,Rating'.
        /// Feature description has form 'FeatureIndex1:Value1|FeatureIndex2:Value2|...'
        /// If all the user features are zero or there are no user features in the dataset at all, the user description can be omitted. Same is true for items.
        /// </para>
        /// <para>
        /// Empty lines and lines starting with '#' are skipped. The first remaining line must be the rating
        /// record. Integers are parsed with the invariant culture. Malformed records are reported through
        /// the file parsing context; a rating outside of the declared range is reported once the whole file
        /// has been read.
        /// </para>
        /// </summary>
        /// <param name="fileName">File to load data from.</param>
        /// <returns>The loaded dataset.</returns>
        public static RecommenderDataset Load(string fileName)
        {
            var rawObservations = new List<Tuple<string, string, int>>();
            var userIdToFeatures = new Dictionary<string, Vector>();
            var itemIdToFeatures = new Dictionary<string, Vector>();
            int? minRating = null, maxRating = null;
            int userFeatureCount = 0, itemFeatureCount = 0;

            // Same number style as the default int.TryParse overload; only the culture is pinned so that
            // the file is read identically regardless of the regional settings of the machine.
            const System.Globalization.NumberStyles IntegerStyles = System.Globalization.NumberStyles.Integer;
            var invariantCulture = System.Globalization.CultureInfo.InvariantCulture;

            var parsingContext = new FileParsingContext(fileName);

            // NOTE(review): the record checks below assume RaiseError/RaiseGlobalError do not return
            // (presumably they throw) - confirm.
            using (var reader = new StreamReader(fileName))
            {
                string line;
                bool isFirstRecord = true;
                while ((line = reader.ReadLine()) != null)
                {
                    parsingContext.NextLine(line);
                    if (line.Length == 0 || line.StartsWith("#", StringComparison.Ordinal))
                    {
                        continue; // Skip comments and empty lines
                    }

                    string[] splits = line.Split(',');
                    string recordType = splits[0].Trim();

                    if (isFirstRecord)
                    {
                        //// Parse rating record

                        int minRatingValue = 0, maxRatingValue = 0;
                        if (splits.Length != 3 ||
                            recordType != "R" ||
                            !int.TryParse(splits[1], IntegerStyles, invariantCulture, out minRatingValue) ||
                            !int.TryParse(splits[2], IntegerStyles, invariantCulture, out maxRatingValue))
                        {
                            parsingContext.RaiseError("Invalid rating info record.");
                        }

                        minRating = minRatingValue;
                        maxRating = maxRatingValue;
                        isFirstRecord = false;
                    }
                    else if (recordType == "U")
                    {
                        //// Parse user record

                        if (splits.Length != 3)
                        {
                            parsingContext.RaiseError("Invalid user record.");
                        }

                        string userId = splits[1].Trim();
                        if (userIdToFeatures.ContainsKey(userId))
                        {
                            parsingContext.RaiseError("Record describing user '{0}' is presented more than once.", userId);
                        }

                        Vector features = ParseFeatures(splits[2], parsingContext, ref userFeatureCount);
                        userIdToFeatures.Add(userId, features);
                    }
                    else if (recordType == "I")
                    {
                        //// Parse item record

                        if (splits.Length != 3)
                        {
                            parsingContext.RaiseError("Invalid item record.");
                        }

                        string itemId = splits[1].Trim();
                        if (itemIdToFeatures.ContainsKey(itemId))
                        {
                            parsingContext.RaiseError("Record describing item '{0}' is presented more than once.", itemId);
                        }

                        Vector features = ParseFeatures(splits[2], parsingContext, ref itemFeatureCount);
                        itemIdToFeatures.Add(itemId, features);
                    }
                    else
                    {
                        //// Parse instance record

                        // Validate the record before touching splits[1]: a line without commas has a single
                        // part and must be reported as a parsing error rather than fail on the index access.
                        int rating = 0;
                        if (splits.Length != 3 ||
                            !int.TryParse(splits[2], IntegerStyles, invariantCulture, out rating))
                        {
                            parsingContext.RaiseError("Invalid instance record.", line);
                        }

                        string userId = recordType;
                        string itemId = splits[1].Trim();
                        rawObservations.Add(Tuple.Create(userId, itemId, rating));
                    }
                }
            }

            // No rating record was seen, which happens when the file has no records at all.
            if (!minRating.HasValue)
            {
                parsingContext.RaiseGlobalError("Rating info is missing.");
            }

            var result = new RecommenderDataset
            {
                StarRatingInfo = new StarRatingInfo(minRating.Value, maxRating.Value)
            };

            // Entities are created only after the whole file has been read, when the final feature counts are known.
            foreach (var observation in rawObservations)
            {
                string userId = observation.Item1;
                string itemId = observation.Item2;
                int rating = observation.Item3;

                if (rating < minRating.Value || rating > maxRating.Value)
                {
                    parsingContext.RaiseGlobalError("One of the ratings is inconsistent with the specified rating info.");
                }

                User user = RetrieveEntity(userId, result.idToUser, userIdToFeatures, userFeatureCount, (id, features) => new User(id, features));
                Item item = RetrieveEntity(itemId, result.idToItem, itemIdToFeatures, itemFeatureCount, (id, features) => new Item(id, features));
                result.observations.Add(new RatedUserItem(user, item, rating));
            }

            return result;
        }