/// <summary>
/// Applies one stochastic-gradient step for a single observed preference, adjusting the feature
/// entries and the bias entries of the matching user vector and item vector in place.
/// </summary>
/// <remarks>
/// TODO: this is the vanilla SGD of Takács 2009. I speculate that the scaling technique proposed in
/// "Towards Optimal One Pass Large Scale Learning with Averaged Stochastic Gradient Descent"
/// (section 5, page 6) can be beneficial in terms of both speed and accuracy.
///
/// Takács' method does not calculate the gradient of the regularization term correctly: that gradient
/// has non-zero elements everywhere in the matrix, while Takács' method only updates a single
/// row/column. If one user has a lot of recommendations, her vector will be more affected by
/// regularization. Using an isolated scaling factor for both user vectors and item vectors can remove
/// this issue without inducing more update cost; it even reduces the cost a bit, by only performing
/// one addition and one multiplication.
///
/// BAD SIDE 1: the scaling factor decreases fast; it has to be scaled up from time to time before it
/// drops to zero or causes round-off error.
/// BAD SIDE 2: nobody has experimented with it before, and people generally use a very small lambda,
/// so its impact on accuracy may still be unknown.
/// BAD SIDE 3: it is not known how to make it work for L1-regularization or
/// "pseudorank?" (sum of singular values)-regularization.
/// </remarks>
/// <param name="preference">the observed preference; supplies the user ID, item ID and target value</param>
/// <param name="mu">step size multiplied into every adjustment made by this call</param>
protected void update(IPreference preference, double mu)
{
    int userRow = userIndex(preference.GetUserID());
    int itemRow = itemIndex(preference.GetItemID());

    double[] userFactors = userVectors[userRow];
    double[] itemFactors = itemVectors[itemRow];

    double predicted = dot(userFactors, itemFactors);
    double residual = preference.GetValue() - predicted;

    // Feature entries: both updates use the values read before either entry is modified.
    for (int f = FEATURE_OFFSET; f < rank; f++)
    {
        double u = userFactors[f];
        double v = itemFactors[f];

        userFactors[f] += mu * (residual * v - lambda * u);
        itemFactors[f] += mu * (residual * u - lambda * v);
    }

    // Bias entries: the step size and the regularization weight are each scaled by their bias ratio.
    var biasStep = biasMuRatio * mu;
    var biasPenalty = biasLambdaRatio * lambda;

    userFactors[USER_BIAS_INDEX] += biasStep * (residual - biasPenalty * userFactors[USER_BIAS_INDEX]);
    itemFactors[ITEM_BIAS_INDEX] += biasStep * (residual - biasPenalty * itemFactors[ITEM_BIAS_INDEX]);
}
/// <summary>
/// Checks that, after shuffling and staging, the shuffler yields every (user, item) pair of the
/// synthetic data model exactly once and with the value the data model reports for that pair.
/// </summary>
public void testPreferenceShufflerWithSyntheticData()
{
    setUpSyntheticData();

    var shuffler = new ParallelSGDFactorizer.PreferenceShuffler(dataModel);
    shuffler.shuffle();
    shuffler.stage();

    // seen[userID][itemID] becomes true the first time the shuffler yields that pair.
    var seen = new FastByIDMap<FastByIDMap<bool?>>();

    for (int position = 0; position < shuffler.size(); position++)
    {
        IPreference shuffled = shuffler.get(position);

        float? modelValue = dataModel.GetPreferenceValue(shuffled.GetUserID(), shuffled.GetItemID());
        Assert.AreEqual(shuffled.GetValue(), modelValue.Value, 0.0);

        if (!seen.ContainsKey(shuffled.GetUserID()))
        {
            seen.Put(shuffled.GetUserID(), new FastByIDMap<bool?>());
        }

        FastByIDMap<bool?> seenItems = seen.Get(shuffled.GetUserID());

        // A non-null entry here would mean the shuffler produced the same pair twice.
        Assert.IsNull(seenItems.Get(shuffled.GetItemID()));
        seenItems.Put(shuffled.GetItemID(), true);
    }

    // Walk the data model itself and confirm each of its preferences was produced by the shuffler.
    int visited = 0;
    var userIDs = dataModel.GetUserIDs();
    while (userIDs.MoveNext())
    {
        long userID = userIDs.Current;
        foreach (IPreference preference in dataModel.GetPreferencesFromUser(userID))
        {
            bool? wasSeen = seen.Get(preference.GetUserID()).Get(preference.GetItemID());
            Assert.True(wasSeen.Value);
            visited++;
        }
    }

    Assert.AreEqual(visited, shuffler.size());
}
/// <summary>
/// Creates an array sized to <paramref name="prefs"/> and copies the item ID of every preference
/// into it. The user ID of the whole array is taken from the first preference only; the user IDs of
/// the remaining preferences are not inspected. For an empty list the user ID is left untouched.
/// </summary>
/// <param name="prefs">preferences to copy item IDs from</param>
public BooleanUserPreferenceArray(List<IPreference> prefs) : this(prefs.Count)
{
    int position = 0;
    foreach (IPreference pref in prefs)
    {
        ids[position] = pref.GetItemID();
        position++;
    }

    if (prefs.Count > 0)
    {
        id = prefs[0].GetUserID();
    }
}
/// <summary>
/// Creates an array sized to <paramref name="prefs"/> and fills it from the given preferences.
/// </summary>
/// <param name="prefs">preferences to copy IDs from</param>
/// <param name="forOneUser">
/// when true, each slot receives the preference's item ID and the array-wide ID is the user ID of
/// the first preference; when false, each slot receives the preference's user ID and the array-wide
/// ID is the item ID of the first preference. For an empty list the array-wide ID is left untouched.
/// </param>
public BooleanItemPreferenceArray(List<IPreference> prefs, bool forOneUser) : this(prefs.Count)
{
    int count = prefs.Count;

    if (forOneUser)
    {
        for (int n = 0; n < count; n++)
        {
            ids[n] = prefs[n].GetItemID();
        }
        if (count > 0)
        {
            id = prefs[0].GetUserID();
        }
    }
    else
    {
        for (int n = 0; n < count; n++)
        {
            ids[n] = prefs[n].GetUserID();
        }
        if (count > 0)
        {
            id = prefs[0].GetItemID();
        }
    }
}
/// <summary>
/// Checks that the model returns exactly two preferences for item 456, the first from user 123 and
/// the second from user 456, both referring to item 456.
/// </summary>
public void testPreferencesForItem()
{
    const int itemID = 456;
    const int firstUserID = 123;
    const int secondUserID = 456;

    IPreferenceArray itemPrefs = model.GetPreferencesForItem(itemID);
    Assert.NotNull(itemPrefs);

    IPreference first = itemPrefs.Get(0);
    Assert.AreEqual(firstUserID, first.GetUserID());
    Assert.AreEqual(itemID, first.GetItemID());

    IPreference second = itemPrefs.Get(1);
    Assert.AreEqual(secondUserID, second.GetUserID());
    Assert.AreEqual(itemID, second.GetItemID());

    Assert.AreEqual(2, itemPrefs.Length());
}
/// <summary>
/// Creates an array sized to <paramref name="prefs"/> and copies each preference's user ID and value
/// into it. All preferences must carry the same item ID, which becomes the array-wide ID; for an
/// empty list the array-wide ID is set to <c>Int64.MinValue</c>.
/// </summary>
/// <param name="prefs">preferences for a single item</param>
/// <exception cref="ArgumentException">
/// thrown when a preference's item ID differs from the item ID of the first preference
/// </exception>
public GenericItemPreferenceArray(IList<IPreference> prefs) : this(prefs.Count)
{
    int count = prefs.Count;
    long sharedItemID = Int64.MinValue;

    for (int n = 0; n < count; n++)
    {
        IPreference current = prefs[n];
        ids[n] = current.GetUserID();

        long currentItemID = current.GetItemID();
        if (n == 0)
        {
            // The first preference decides which item this array is about.
            sharedItemID = currentItemID;
        }
        else if (sharedItemID != currentItemID)
        {
            throw new ArgumentException("Not all item IDs are the same");
        }

        values[n] = current.GetValue();
    }

    id = sharedItemID;
}
/// <summary>
/// Stores <paramref name="pref"/> at position <paramref name="i"/>: its user ID and value go into
/// slot <paramref name="i"/>, and its item ID overwrites the array-wide ID.
/// </summary>
/// <param name="i">slot to write</param>
/// <param name="pref">preference whose item ID, user ID and value are copied</param>
public void Set(int i, IPreference pref)
{
    var itemID = pref.GetItemID();
    var userID = pref.GetUserID();

    // The array-wide ID is written first, before the indexed writes that can fail on a bad index.
    this.id = itemID;
    this.ids[i] = userID;
    this.values[i] = pref.GetValue();
}
/// <summary>
/// Reads one line of the input file and applies it to <paramref name="data"/>, a map from user IDs
/// to preferences. Each line is expected to describe one preference as
/// "userID, itemID, value[, timestamp]" (split by <c>SplitLine</c>).
/// </summary>
/// <remarks>
/// <para>
/// A line that is empty or whose first character is <c>COMMENT_CHAR</c> is ignored.
/// </para>
/// <para>
/// A line with no timestamp and a blank value (the form "userID,itemID,") removes the user's
/// preference for that item, if present, together with its timestamp. Any other line adds a
/// preference; when <c>uniqueUserItemCheck</c> is set and the user already has a preference for the
/// item, the existing value is overwritten instead.
/// </para>
/// <para>
/// When <c>transpose</c> is set, the user ID and item ID read from the line are swapped before use.
/// </para>
/// </remarks>
/// <typeparam name="T">type of the per-user preference container stored in <paramref name="data"/></typeparam>
/// <param name="line">line from the input data file</param>
/// <param name="data">all data read so far, as a mapping from user IDs to preferences</param>
/// <param name="timestamps">timestamp storage handed to <c>addTimestamp</c> and <c>removeTimestamp</c></param>
/// <param name="fromPriorData">
/// an implementation detail: if true, <paramref name="data"/> maps IDs to <c>IPreferenceArray</c>,
/// since raw data already in memory is being updated; otherwise it maps to
/// <c>IEnumerable&lt;IPreference&gt;</c>, since fresh data is being read. Subclasses must be prepared
/// to handle this wrinkle.
/// </param>
protected void processLine<T>(string line, FastByIDMap<T> data, FastByIDMap<FastByIDMap<DateTime?>> timestamps, bool fromPriorData)
{
    // Ignore empty lines and comments.
    if (line.Length == 0 || line[0] == COMMENT_CHAR)
    {
        return;
    }

    var fields = SplitLine(line);
    string userField = fields[0];
    string itemField = fields[1];
    string valueField = fields[2];
    bool hasTimestamp = fields.Length > 3;
    string timestampField = hasTimestamp ? fields[3] : null;

    long userID = readUserIDFromString(userField);
    long itemID = readItemIDFromString(itemField);

    if (transpose)
    {
        long swapped = userID;
        userID = itemID;
        itemID = swapped;
    }

    // A line of the form "userID,itemID," (blank value, no timestamp) means "remove".
    bool isRemoval = !hasTimestamp && String.IsNullOrWhiteSpace(valueField);

    // This is kind of gross, but two kinds of storage have to be handled.
    var maybePrefs = data.Get(userID);

    if (fromPriorData)
    {
        // Data are IPreferenceArray instances.
        IPreferenceArray userPrefs = (IPreferenceArray)maybePrefs;

        if (isRemoval)
        {
            if (userPrefs != null)
            {
                int length = userPrefs.Length();

                bool found = false;
                for (int i = 0; i < length && !found; i++)
                {
                    found = userPrefs.GetItemID(i) == itemID;
                }

                if (found)
                {
                    if (length == 1)
                    {
                        // Removing the only preference removes the user's entry altogether.
                        data.Remove(userID);
                    }
                    else
                    {
                        // Copy every preference that is not for itemID into an array one slot smaller.
                        IPreferenceArray remaining = new GenericUserPreferenceArray(length - 1);
                        int target = 0;
                        for (int i = 0; i < length; i++)
                        {
                            if (userPrefs.GetItemID(i) != itemID)
                            {
                                remaining.Set(target, userPrefs.Get(i));
                                target++;
                            }
                        }
                        data.Put(userID, (T)remaining);
                    }
                }
            }

            removeTimestamp(userID, itemID, timestamps);
        }
        else
        {
            float prefValue = float.Parse(valueField, CultureInfo.InvariantCulture);

            bool updated = false;
            if (uniqueUserItemCheck && userPrefs != null)
            {
                for (int i = 0; i < userPrefs.Length(); i++)
                {
                    if (userPrefs.GetItemID(i) == itemID)
                    {
                        updated = true;
                        userPrefs.SetValue(i, prefValue);
                        break;
                    }
                }
            }

            if (!updated)
            {
                IPreferenceArray grown;
                if (userPrefs == null)
                {
                    grown = new GenericUserPreferenceArray(1);
                }
                else
                {
                    // Shift the existing preferences up by one so that slot 0 is free for the new one.
                    grown = new GenericUserPreferenceArray(userPrefs.Length() + 1);
                    for (int i = 0; i < userPrefs.Length(); i++)
                    {
                        grown.Set(i + 1, userPrefs.Get(i));
                    }
                }

                grown.SetUserID(0, userID);
                grown.SetItemID(0, itemID);
                grown.SetValue(0, prefValue);
                data.Put(userID, (T)grown);
            }
        }

        // In this branch addTimestamp is invoked for removal lines as well as value lines.
        addTimestamp(userID, itemID, timestampField, timestamps);
    }
    else
    {
        // Data are IEnumerable<IPreference> instances.
        IEnumerable<IPreference> userPrefs = (IEnumerable<IPreference>)maybePrefs;

        if (isRemoval)
        {
            if (userPrefs != null)
            {
                // Iterate over a snapshot so the underlying list can be modified while searching.
                foreach (IPreference candidate in userPrefs.ToArray())
                {
                    if (candidate.GetItemID() == itemID)
                    {
                        var removable = userPrefs as IList<IPreference>;
                        if (removable != null)
                        {
                            removable.Remove(candidate);
                        }
                        break;
                    }
                }
            }

            removeTimestamp(userID, itemID, timestamps);
        }
        else
        {
            float prefValue = float.Parse(valueField, CultureInfo.InvariantCulture);

            bool updated = false;
            if (uniqueUserItemCheck && userPrefs != null)
            {
                foreach (IPreference existing in userPrefs)
                {
                    if (existing.GetItemID() == itemID)
                    {
                        updated = true;
                        existing.SetValue(prefValue);
                        break;
                    }
                }
            }

            if (!updated)
            {
                if (userPrefs == null)
                {
                    userPrefs = new List<IPreference>(5);
                    data.Put(userID, (T)userPrefs);
                }

                var appendable = userPrefs as IList<IPreference>;
                if (appendable != null)
                {
                    appendable.Add(new GenericPreference(userID, itemID, prefValue));
                }
            }

            // In this branch addTimestamp is invoked for value lines only.
            addTimestamp(userID, itemID, timestampField, timestamps);
        }
    }
}