/// <summary>
/// Computes a rank-based (Spearman) correlation between the preferences of two users.
/// </summary>
        /// <remarks>
        /// Each user's preferences are copied and sorted by value. Items that appear in both users'
        /// preferences are then assigned ranks 1, 2, 3, ... in that order, and the squared rank
        /// differences over the common items are fed into
        /// <c>1 - 6 * sum(d^2) / (n * (n^2 - 1))</c>, where <c>n</c> is the number of common items.
        /// </remarks>
        /// <param name="userID1">ID of the first user.</param>
        /// <param name="userID2">ID of the second user.</param>
        /// <returns>
        /// The correlation value, or <see cref="Double.NaN"/> when either user has at most one
        /// preference or the two users have at most one item in common.
        /// </returns>
        public double UserSimilarity(long userID1, long userID2)
        {
            IPreferenceArray xPrefs = dataModel.GetPreferencesFromUser(userID1);
            IPreferenceArray yPrefs = dataModel.GetPreferencesFromUser(userID2);
            int xLength             = xPrefs.Length();
            int yLength             = yPrefs.Length();

            if (xLength <= 1 || yLength <= 1)
            {
                return Double.NaN;
            }

            // Copy prefs since we need to overwrite pref values with ranks
            xPrefs = xPrefs.Clone();
            yPrefs = yPrefs.Clone();

            // First sort by values from low to high
            xPrefs.SortByValue();
            yPrefs.SortByValue();

            // Assign ranks from low to high
            float nextRank = 1.0f;

            for (int i = 0; i < xLength; i++)
            {
                // ... but only for items that are common to both pref arrays
                if (yPrefs.HasPrefWithItemID(xPrefs.GetItemID(i)))
                {
                    xPrefs.SetValue(i, nextRank);
                    nextRank += 1.0f;
                }
                // Values of non-common items are left as-is; they are never read below
            }

            nextRank = 1.0f;
            for (int i = 0; i < yLength; i++)
            {
                if (xPrefs.HasPrefWithItemID(yPrefs.GetItemID(i)))
                {
                    yPrefs.SetValue(i, nextRank);
                    nextRank += 1.0f;
                }
            }

            // Re-sort by item so both arrays can be walked in lockstep (merge-style)
            xPrefs.SortByItem();
            yPrefs.SortByItem();

            long xIndex     = xPrefs.GetItemID(0);
            long yIndex     = yPrefs.GetItemID(0);
            int  xPrefIndex = 0;
            int  yPrefIndex = 0;

            double sumXYRankDiff2 = 0.0;
            int    count          = 0;

            while (true)
            {
                int compare = xIndex < yIndex ? -1 : (xIndex > yIndex ? 1 : 0);
                if (compare == 0)
                {
                    // Same item on both sides: accumulate the squared rank difference
                    double diff = xPrefs.GetValue(xPrefIndex) - yPrefs.GetValue(yPrefIndex);
                    sumXYRankDiff2 += diff * diff;
                    count++;
                }
                if (compare <= 0)
                {
                    // Advance the x side when it is behind or equal
                    if (++xPrefIndex >= xLength)
                    {
                        break;
                    }
                    xIndex = xPrefs.GetItemID(xPrefIndex);
                }
                if (compare >= 0)
                {
                    // Advance the y side when it is behind or equal
                    if (++yPrefIndex >= yLength)
                    {
                        break;
                    }
                    yIndex = yPrefs.GetItemID(yPrefIndex);
                }
            }

            if (count <= 1)
            {
                return Double.NaN;
            }

            // When ranks are unique, this formula actually gives the Pearson correlation.
            // The denominator is evaluated in floating point: as an int expression,
            // count * (count * count - 1) silently wraps once count exceeds 1290.
            double n = count;
            return 1.0 - 6.0 * sumXYRankDiff2 / (n * (n * n - 1.0));
        }
示例#2
0
        /// <summary>
        /// Processes one line of the input file and applies it to <paramref name="data"/>, which maps
        /// user IDs to preferences. Each line is expected to describe one preference.
        /// </summary>
        /// <remarks>
        /// <para>
        /// The line is split into tokens: user ID, item ID, preference value and an optional timestamp.
        /// If <c>transpose</c> is set, the user and item IDs are swapped. A line with a blank preference
        /// value and no timestamp (form "userID,itemID,") removes the existing preference; any other
        /// line adds a preference, or updates the existing one when <c>uniqueUserItemCheck</c> is set.
        /// If <paramref name="data"/> has no entry for the user yet, one is created.
        /// </para>
        /// <para>
        /// Note that if the line is empty or begins with the comment character
        /// (<c>COMMENT_CHAR</c>, documented as '#') it is ignored.
        /// </para>
        /// </remarks>
        /// <typeparam name="T">
        /// Storage type held in <paramref name="data"/>; see <paramref name="fromPriorData"/>.
        /// </typeparam>
        /// <param name="line">line from input data file</param>
        /// <param name="data">all data read so far, as a mapping from user IDs to preferences</param>
        /// <param name="timestamps">
        /// timestamp storage that is passed to <c>removeTimestamp</c> and <c>addTimestamp</c>
        /// (presumably keyed by user ID, then item ID -- confirm against those helpers)
        /// </param>
        /// <param name="fromPriorData">
        /// an implementation detail -- if true, data maps IDs to <c>IPreferenceArray</c> since the
        /// framework is attempting to read and update raw data that is already in memory. Otherwise it
        /// maps to collections of <c>IPreference</c>, since it is reading fresh data. Subclasses must be
        /// prepared to handle this wrinkle.
        /// </param>
        protected void processLine <T>(string line,
                                       FastByIDMap <T> data,
                                       FastByIDMap <FastByIDMap <DateTime?> > timestamps,
                                       bool fromPriorData)
        {
            // Ignore empty lines and comments
            if (line.Length == 0 || line[0] == COMMENT_CHAR)
            {
                return;
            }

            var    tokens                = SplitLine(line);
            string userIDString          = tokens[0];
            string itemIDString          = tokens[1];
            string preferenceValueString = tokens[2];
            bool   hasTimestamp          = tokens.Length > 3;
            string timestampString       = hasTimestamp ? tokens[3] : null;

            long userID = readUserIDFromString(userIDString);
            long itemID = readItemIDFromString(itemIDString);

            if (transpose)
            {
                long tmp = userID;
                userID = itemID;
                itemID = tmp;
            }

            // A blank preference value without a timestamp ("userID,itemID,") means "remove"
            bool isRemoval = !hasTimestamp && String.IsNullOrWhiteSpace(preferenceValueString);

            // This is kind of gross but need to handle two types of storage
            var maybePrefs = data.Get(userID);

            if (fromPriorData)
            {
                // Data are PreferenceArray

                IPreferenceArray prefs = (IPreferenceArray)maybePrefs;
                if (isRemoval)
                {
                    if (prefs != null)
                    {
                        int length  = prefs.Length();
                        int matches = 0;
                        for (int i = 0; i < length; i++)
                        {
                            if (prefs.GetItemID(i) == itemID)
                            {
                                matches++;
                            }
                        }

                        if (matches > 0)
                        {
                            if (matches == length)
                            {
                                // Nothing would be left for this user
                                data.Remove(userID);
                            }
                            else
                            {
                                // Size the copy by the number of entries actually kept, so that an
                                // item stored more than once does not leave uninitialised slots.
                                IPreferenceArray newPrefs = new GenericUserPreferenceArray(length - matches);
                                for (int i = 0, j = 0; i < length; i++)
                                {
                                    if (prefs.GetItemID(i) != itemID)
                                    {
                                        newPrefs.Set(j++, prefs.Get(i));
                                    }
                                }
                                data.Put(userID, (T)newPrefs);
                            }
                        }
                    }

                    removeTimestamp(userID, itemID, timestamps);
                }
                else
                {
                    float preferenceValue = float.Parse(preferenceValueString, CultureInfo.InvariantCulture);

                    bool exists = false;
                    if (uniqueUserItemCheck && prefs != null)
                    {
                        // Update in place when the user already has a preference for this item
                        int length = prefs.Length();
                        for (int i = 0; i < length; i++)
                        {
                            if (prefs.GetItemID(i) == itemID)
                            {
                                exists = true;
                                prefs.SetValue(i, preferenceValue);
                                break;
                            }
                        }
                    }

                    if (!exists)
                    {
                        if (prefs == null)
                        {
                            prefs = new GenericUserPreferenceArray(1);
                        }
                        else
                        {
                            // Grow by one, shifting existing entries up so slot 0 is free
                            int oldLength = prefs.Length();
                            IPreferenceArray newPrefs = new GenericUserPreferenceArray(oldLength + 1);
                            for (int i = 0; i < oldLength; i++)
                            {
                                newPrefs.Set(i + 1, prefs.Get(i));
                            }
                            prefs = newPrefs;
                        }

                        // The new preference always goes into slot 0
                        prefs.SetUserID(0, userID);
                        prefs.SetItemID(0, itemID);
                        prefs.SetValue(0, preferenceValue);
                        data.Put(userID, (T)prefs);
                    }
                }

                // NOTE(review): this also runs after the removal path, where timestampString is
                // always null (hasTimestamp is false) -- presumably addTimestamp ignores null; confirm.
                addTimestamp(userID, itemID, timestampString, timestamps);
            }
            else
            {
                // Data are IEnumerable<Preference>

                IEnumerable <IPreference> prefs = (IEnumerable <IPreference>)maybePrefs;

                if (isRemoval)
                {
                    // Only list-backed storage supports removal; other collections are left untouched
                    IList <IPreference> prefList = prefs as IList <IPreference>;
                    if (prefList != null)
                    {
                        // Remove the first preference for this item, by index
                        for (int i = 0; i < prefList.Count; i++)
                        {
                            if (prefList[i].GetItemID() == itemID)
                            {
                                prefList.RemoveAt(i);
                                break;
                            }
                        }
                    }

                    removeTimestamp(userID, itemID, timestamps);
                }
                else
                {
                    float preferenceValue = float.Parse(preferenceValueString, CultureInfo.InvariantCulture);

                    bool exists = false;
                    if (uniqueUserItemCheck && prefs != null)
                    {
                        // Update in place when the user already has a preference for this item
                        foreach (IPreference pref in prefs)
                        {
                            if (pref.GetItemID() == itemID)
                            {
                                exists = true;
                                pref.SetValue(preferenceValue);
                                break;
                            }
                        }
                    }

                    if (!exists)
                    {
                        if (prefs == null)
                        {
                            // First preference seen for this user: register a fresh list
                            prefs = new List <IPreference>(5);
                            data.Put(userID, (T)prefs);
                        }

                        IList <IPreference> prefList = prefs as IList <IPreference>;
                        if (prefList != null)
                        {
                            prefList.Add(new GenericPreference(userID, itemID, preferenceValue));
                        }
                    }

                    addTimestamp(userID, itemID, timestampString, timestamps);
                }
            }
        }