/// <summary>
/// Computes Spearman's rank correlation between two users, using only the items for which both users
/// have a preference.
/// </summary>
/// <param name="userID1">ID of the first user; its preferences are fetched from <c>dataModel</c>.</param>
/// <param name="userID2">ID of the second user; its preferences are fetched from <c>dataModel</c>.</param>
/// <returns>
/// <c>1 - 6 * sum(d^2) / (n * (n^2 - 1))</c>, where <c>n</c> is the number of items common to both users and
/// <c>d</c> is the difference between the two users' ranks for an item. Returns <c>Double.NaN</c> when
/// either user has at most one preference, or when the users have at most one item in common.
/// </returns>
public double UserSimilarity(long userID1, long userID2)
{
    IPreferenceArray xPrefs = dataModel.GetPreferencesFromUser(userID1);
    IPreferenceArray yPrefs = dataModel.GetPreferencesFromUser(userID2);
    int xLength = xPrefs.Length();
    int yLength = yPrefs.Length();

    if (xLength <= 1 || yLength <= 1)
    {
        return Double.NaN;
    }

    // Work on copies: the preference values are overwritten with ranks below.
    xPrefs = xPrefs.Clone();
    yPrefs = yPrefs.Clone();

    // Order each user's preferences from lowest to highest value.
    xPrefs.SortByValue();
    yPrefs.SortByValue();

    // Replace values with ranks 1, 2, 3, ... in that order, but only for items present in both arrays.
    // Entries for items that are not shared keep their old values; they are skipped by the merge below.
    // Equal values still receive distinct consecutive ranks, in whatever order the sort left them.
    float nextRank = 1.0f;
    for (int i = 0; i < xLength; i++)
    {
        if (yPrefs.HasPrefWithItemID(xPrefs.GetItemID(i)))
        {
            xPrefs.SetValue(i, nextRank);
            nextRank += 1.0f;
        }
    }

    nextRank = 1.0f;
    for (int i = 0; i < yLength; i++)
    {
        if (xPrefs.HasPrefWithItemID(yPrefs.GetItemID(i)))
        {
            yPrefs.SetValue(i, nextRank);
            nextRank += 1.0f;
        }
    }

    // Re-sort by item so both arrays can be walked in lockstep.
    xPrefs.SortByItem();
    yPrefs.SortByItem();

    long xItemID = xPrefs.GetItemID(0);
    long yItemID = yPrefs.GetItemID(0);
    int xPrefIndex = 0;
    int yPrefIndex = 0;

    double sumXYRankDiff2 = 0.0;
    int count = 0;

    while (true)
    {
        // Decide everything from the current pair of IDs before either cursor moves.
        bool sameItem = xItemID == yItemID;
        bool advanceX = xItemID <= yItemID;
        bool advanceY = xItemID >= yItemID;

        if (sameItem)
        {
            double diff = xPrefs.GetValue(xPrefIndex) - yPrefs.GetValue(yPrefIndex);
            sumXYRankDiff2 += diff * diff;
            count++;
        }

        if (advanceX)
        {
            if (++xPrefIndex >= xLength)
            {
                break;
            }
            xItemID = xPrefs.GetItemID(xPrefIndex);
        }

        if (advanceY)
        {
            if (++yPrefIndex >= yLength)
            {
                break;
            }
            yItemID = yPrefs.GetItemID(yPrefIndex);
        }
    }

    if (count <= 1)
    {
        return Double.NaN;
    }

    // When ranks are unique, this formula actually gives the Pearson correlation.
    // The denominator is computed in double: as an int expression, count * (count * count - 1)
    // silently wraps around once count reaches 1291 and would yield a wrong result.
    double n = count;
    return 1.0 - 6.0 * sumXYRankDiff2 / (n * (n * n - 1.0));
}
/// <summary>
/// Parses one line of input and applies it to <paramref name="data"/>, which maps user IDs to that
/// user's preferences. Each line is expected to describe a single preference.
/// </summary>
/// <remarks>
/// <para>
/// A <c>null</c> or empty line, or a line whose first character is <c>COMMENT_CHAR</c>, is ignored.
/// </para>
/// <para>
/// The first three fields returned by <c>SplitLine</c> are read unconditionally as user ID, item ID and
/// preference value; a fourth field, when present, is taken as the timestamp.
/// </para>
/// <para>
/// A line with no timestamp field and a blank preference value (the form "userID,itemID,") removes the
/// user's preference for that item. Any other line adds a preference, or updates the existing one in place
/// when <c>uniqueUserItemCheck</c> is set and the user already has a preference for the item.
/// </para>
/// <para>
/// When <c>transpose</c> is set, the user ID and item ID read from the line are swapped before use.
/// </para>
/// </remarks>
/// <typeparam name="T">
/// Type of the per-user preference storage held in <paramref name="data"/>; see
/// <paramref name="fromPriorData"/> for what it must be convertible to.
/// </typeparam>
/// <param name="line">Line from the input data file.</param>
/// <param name="data">All data read so far, as a mapping from user IDs to preferences.</param>
/// <param name="timestamps">
/// Timestamp storage that is handed to <c>addTimestamp</c> and <c>removeTimestamp</c> for the
/// user/item pair on this line.
/// </param>
/// <param name="fromPriorData">
/// An implementation detail: if <c>true</c>, the values in <paramref name="data"/> are treated as
/// <see cref="IPreferenceArray"/>, since the framework is updating raw data that is already in memory.
/// Otherwise they are treated as <c>IEnumerable&lt;IPreference&gt;</c>, since fresh data is being read.
/// Subclasses must be prepared to handle this wrinkle.
/// </param>
protected void processLine<T>(string line, FastByIDMap<T> data, FastByIDMap<FastByIDMap<DateTime?>> timestamps, bool fromPriorData)
{
    // Ignore missing/empty lines and comments.
    if (string.IsNullOrEmpty(line) || line[0] == COMMENT_CHAR)
    {
        return;
    }

    var tokens = SplitLine(line);
    string userIDString = tokens[0];
    string itemIDString = tokens[1];
    string preferenceValueString = tokens[2];
    bool hasTimestamp = tokens.Length > 3;
    string timestampString = hasTimestamp ? tokens[3] : null;

    long userID = readUserIDFromString(userIDString);
    long itemID = readItemIDFromString(itemIDString);

    if (transpose)
    {
        long tmp = userID;
        userID = itemID;
        itemID = tmp;
    }

    // A line of the form "userID,itemID," (blank value, no timestamp) means "remove this preference".
    bool isRemoval = !hasTimestamp && string.IsNullOrWhiteSpace(preferenceValueString);

    // This is kind of gross but we need to handle two types of storage.
    var maybePrefs = data.Get(userID);

    if (fromPriorData)
    {
        // Data are IPreferenceArray.
        IPreferenceArray prefs = (IPreferenceArray)maybePrefs;

        if (isRemoval)
        {
            if (prefs != null)
            {
                int length = prefs.Length();

                // Count every entry for this item first, so the replacement array is sized exactly.
                // Sizing it as length - 1 would leave unpopulated slots whenever the item occurs
                // more than once.
                int matches = 0;
                for (int i = 0; i < length; i++)
                {
                    if (prefs.GetItemID(i) == itemID)
                    {
                        matches++;
                    }
                }

                if (matches > 0)
                {
                    int remaining = length - matches;
                    if (remaining == 0)
                    {
                        // Nothing left for this user.
                        data.Remove(userID);
                    }
                    else
                    {
                        IPreferenceArray newPrefs = new GenericUserPreferenceArray(remaining);
                        int target = 0;
                        for (int i = 0; i < length; i++)
                        {
                            if (prefs.GetItemID(i) != itemID)
                            {
                                newPrefs.Set(target, prefs.Get(i));
                                target++;
                            }
                        }
                        data.Put(userID, (T)newPrefs);
                    }
                }
            }

            removeTimestamp(userID, itemID, timestamps);
        }
        else
        {
            float preferenceValue = float.Parse(preferenceValueString, CultureInfo.InvariantCulture);

            bool exists = false;
            if (uniqueUserItemCheck && prefs != null)
            {
                int length = prefs.Length();
                for (int i = 0; i < length; i++)
                {
                    if (prefs.GetItemID(i) == itemID)
                    {
                        exists = true;
                        prefs.SetValue(i, preferenceValue);
                        break;
                    }
                }
            }

            if (!exists)
            {
                if (prefs == null)
                {
                    prefs = new GenericUserPreferenceArray(1);
                }
                else
                {
                    // Grow by one, shifting existing entries up so the new one goes in slot 0.
                    int oldLength = prefs.Length();
                    IPreferenceArray newPrefs = new GenericUserPreferenceArray(oldLength + 1);
                    for (int i = 0; i < oldLength; i++)
                    {
                        newPrefs.Set(i + 1, prefs.Get(i));
                    }
                    prefs = newPrefs;
                }

                prefs.SetUserID(0, userID);
                prefs.SetItemID(0, itemID);
                prefs.SetValue(0, preferenceValue);
                data.Put(userID, (T)prefs);
            }
        }

        // Invoked on both paths of this branch. On the removal path timestampString is always null,
        // because a removal line has no timestamp field.
        addTimestamp(userID, itemID, timestampString, timestamps);
    }
    else
    {
        // Data are IEnumerable<IPreference>.
        IEnumerable<IPreference> prefs = (IEnumerable<IPreference>)maybePrefs;

        if (isRemoval)
        {
            // Only list-backed storage can be modified; anything else is left untouched.
            IList<IPreference> prefList = prefs as IList<IPreference>;
            if (prefList != null)
            {
                // Remove the first preference for this item, by position.
                for (int i = 0; i < prefList.Count; i++)
                {
                    if (prefList[i].GetItemID() == itemID)
                    {
                        prefList.RemoveAt(i);
                        break;
                    }
                }
            }

            removeTimestamp(userID, itemID, timestamps);
        }
        else
        {
            float preferenceValue = float.Parse(preferenceValueString, CultureInfo.InvariantCulture);

            bool exists = false;
            if (uniqueUserItemCheck && prefs != null)
            {
                foreach (IPreference pref in prefs)
                {
                    if (pref.GetItemID() == itemID)
                    {
                        exists = true;
                        pref.SetValue(preferenceValue);
                        break;
                    }
                }
            }

            if (!exists)
            {
                if (prefs == null)
                {
                    prefs = new List<IPreference>(5);
                    data.Put(userID, (T)prefs);
                }

                // Existing storage that is not an IList<IPreference> cannot be appended to here;
                // in that case the new preference is not stored.
                IList<IPreference> prefList = prefs as IList<IPreference>;
                if (prefList != null)
                {
                    prefList.Add(new GenericPreference(userID, itemID, preferenceValue));
                }
            }

            addTimestamp(userID, itemID, timestampString, timestamps);
        }
    }
}