/// <summary>
/// Computes a rank-based (Spearman-style) similarity between two users, using only the
/// items that appear in both users' preference arrays.
/// </summary>
/// <param name="userID1">ID of the first user.</param>
/// <param name="userID2">ID of the second user.</param>
/// <returns>
/// <c>1 - 6 * sum(d^2) / (n * (n^2 - 1))</c>, where <c>d</c> is the difference between the two
/// users' ranks for a common item and <c>n</c> is the number of common items;
/// <see cref="Double.NaN"/> when either user has at most one preference or when the users
/// have at most one item in common.
/// </returns>
public double UserSimilarity(long userID1, long userID2)
{
    IPreferenceArray xPrefs = dataModel.GetPreferencesFromUser(userID1);
    IPreferenceArray yPrefs = dataModel.GetPreferencesFromUser(userID2);
    int xLength = xPrefs.Length();
    int yLength = yPrefs.Length();

    if (xLength <= 1 || yLength <= 1)
    {
        return Double.NaN;
    }

    // Copy prefs since we need to modify pref values to ranks
    xPrefs = xPrefs.Clone();
    yPrefs = yPrefs.Clone();

    // First sort by values from low to high
    xPrefs.SortByValue();
    yPrefs.SortByValue();

    // Assign ranks from low to high
    float nextRank = 1.0f;
    for (int i = 0; i < xLength; i++)
    {
        // ... but only for items that are common to both pref arrays
        if (yPrefs.HasPrefWithItemID(xPrefs.GetItemID(i)))
        {
            xPrefs.SetValue(i, nextRank);
            nextRank += 1.0f;
        }
        // Other values are bogus but don't matter
    }

    nextRank = 1.0f;
    for (int i = 0; i < yLength; i++)
    {
        if (xPrefs.HasPrefWithItemID(yPrefs.GetItemID(i)))
        {
            yPrefs.SetValue(i, nextRank);
            nextRank += 1.0f;
        }
    }

    // Re-sort by item ID so both arrays can be walked in lockstep below
    xPrefs.SortByItem();
    yPrefs.SortByItem();

    long xIndex = xPrefs.GetItemID(0);
    long yIndex = yPrefs.GetItemID(0);
    int xPrefIndex = 0;
    int yPrefIndex = 0;

    double sumXYRankDiff2 = 0.0;
    int count = 0;

    // Merge-style walk over both item-sorted arrays; stops as soon as either one is exhausted
    while (true)
    {
        int compare = xIndex < yIndex ? -1 : xIndex > yIndex ? 1 : 0;

        if (compare == 0)
        {
            // Same item in both arrays: accumulate the squared rank difference
            double diff = xPrefs.GetValue(xPrefIndex) - yPrefs.GetValue(yPrefIndex);
            sumXYRankDiff2 += diff * diff;
            count++;
        }

        if (compare <= 0)
        {
            if (++xPrefIndex >= xLength)
            {
                break;
            }
            xIndex = xPrefs.GetItemID(xPrefIndex);
        }

        if (compare >= 0)
        {
            if (++yPrefIndex >= yLength)
            {
                break;
            }
            yIndex = yPrefs.GetItemID(yPrefIndex);
        }
    }

    if (count <= 1)
    {
        return Double.NaN;
    }

    // Evaluate the denominator in floating point. As 32-bit int arithmetic,
    // count * (count * count - 1) silently wraps around once the users share
    // more than 1290 items, which would corrupt the result.
    double n = count;

    // When ranks are unique, this formula actually gives the Pearson correlation
    return 1.0 - 6.0 * sumXYRankDiff2 / (n * (n * n - 1.0));
}
/// <summary>
/// Creates a new <see cref="GenericDataModel"/> from the given users (and their preferences). This
/// <see cref="IDataModel"/> retains all this information in memory and is effectively immutable.
/// </summary>
/// <remarks>
/// <see cref="IPreferenceArray.SortByItem"/> is called on every preference array contained in
/// <paramref name="userData"/> while the per-item index is being built.
/// </remarks>
/// <param name="userData">users to include; must not be null (see also <see cref="GenericDataModel.ToDataMap(FastByIDMap, bool)"/>)</param>
/// <param name="timestamps">optionally, provided timestamps of preferences. User IDs are mapped to maps of item IDs to nullable <see cref="DateTime"/> timestamps.</param>
/// <exception cref="ArgumentNullException">if <paramref name="userData"/> is null</exception>
public GenericDataModel(FastByIDMap<IPreferenceArray> userData, FastByIDMap<FastByIDMap<DateTime?>> timestamps)
{
    // Fail fast with a descriptive error instead of a NullReferenceException further down
    if (userData == null)
    {
        throw new ArgumentNullException("userData", "userData is null");
    }

    this.preferenceFromUsers = userData;

    FastByIDMap<IList<IPreference>> prefsForItems = new FastByIDMap<IList<IPreference>>();
    FastIDSet itemIDSet = new FastIDSet();
    int currentCount = 0;
    float maxPrefValue = float.NegativeInfinity;
    float minPrefValue = float.PositiveInfinity;

    // Single pass over all users: collect item IDs, group preferences by item,
    // and track the smallest and largest preference value seen
    foreach (var entry in preferenceFromUsers.EntrySet())
    {
        IPreferenceArray prefs = entry.Value;
        prefs.SortByItem();

        foreach (IPreference preference in prefs)
        {
            long itemID = preference.GetItemID();
            itemIDSet.Add(itemID);

            var prefsForItem = prefsForItems.Get(itemID);
            if (prefsForItem == null)
            {
                prefsForItem = new List<IPreference>(2);
                prefsForItems.Put(itemID, prefsForItem);
            }
            prefsForItem.Add(preference);

            float value = preference.GetValue();
            if (value > maxPrefValue)
            {
                maxPrefValue = value;
            }
            if (value < minPrefValue)
            {
                minPrefValue = value;
            }
        }

        // Periodic progress report
        if (++currentCount % 10000 == 0)
        {
            log.Info("Processed {0} users", currentCount);
        }
    }

    log.Info("Processed {0} users", currentCount);

    setMinPreference(minPrefValue);
    setMaxPreference(maxPrefValue);

    this.itemIDs = itemIDSet.ToArray();
    itemIDSet = null; // Might help GC -- this is big
    Array.Sort(itemIDs);

    this.preferenceForItems = ToDataMap(prefsForItems, false);

    foreach (var entry in preferenceForItems.EntrySet())
    {
        entry.Value.SortByUser();
    }

    // Materialize the user IDs into a sorted array
    this.userIDs = new long[userData.Count()];
    int i = 0;
    foreach (var v in userData.Keys)
    {
        userIDs[i++] = v;
    }
    Array.Sort(userIDs);

    this.timestamps = timestamps;
}