/// <summary>Compute the weighted overlap between the vectors in a binary matrix</summary>
/// <returns>a tuple of the weighted overlap matrix and the per-entity weight sums</returns>
/// <param name='entity_data'>the binary matrix</param>
public static Tuple<IMatrix<float>, IList<float>> ComputeWeighted(IBooleanMatrix entity_data)
{
	var transpose = (IBooleanMatrix) entity_data.Transpose();

	// weight of each column entity: inverse binary log of its frequency
	var other_entity_weights = new float[transpose.NumberOfRows];
	for (int row_id = 0; row_id < transpose.NumberOfRows; row_id++)
	{
		int freq = transpose.GetEntriesByRow(row_id).Count;
		other_entity_weights[row_id] = 1f / (float) Math.Log(3 + freq, 2); // TODO make configurable
	}

	IMatrix<float> weighted_overlap = new SymmetricMatrix<float>(entity_data.NumberOfRows);
	IList<float> entity_weights = new float[entity_data.NumberOfRows];

	// accumulate pairwise overlaps and per-entity weight sums over all (other) entities
	for (int row_id = 0; row_id < transpose.NumberOfRows; row_id++)
	{
		var row = transpose.GetEntriesByRow(row_id);
		float weight = other_entity_weights[row_id];
		for (int i = 0; i < row.Count; i++)
		{
			int x = row[i];
			entity_weights[x] += weight;
			// NOTE(review): the overlap accumulates weight^2 while entity_weights accumulates weight —
			// confirm this asymmetry matches the downstream correlation formula
			for (int j = i + 1; j < row.Count; j++)
				weighted_overlap[x, row[j]] += weight * weight;
		}
	}

	return Tuple.Create(weighted_overlap, entity_weights);
}
/// <summary>Compute the weighted overlap between the vectors in a binary matrix</summary>
/// <returns>a pair of (weighted overlap matrix, per-entity weight sums)</returns>
/// <param name='entity_data'>the binary matrix</param>
public static Tuple<IMatrix<float>, IList<float>> ComputeWeighted(IBooleanMatrix entity_data)
{
	var transpose = (IBooleanMatrix) entity_data.Transpose();
	int num_columns = transpose.NumberOfRows;

	// inverse-log frequency weight for every column entity
	var other_entity_weights = new float[num_columns];
	for (int c = 0; c < num_columns; c++)
	{
		int freq = transpose.GetEntriesByRow(c).Count;
		other_entity_weights[c] = 1f / (float) Math.Log(3 + freq, 2); // TODO make configurable
	}

	IMatrix<float> weighted_overlap = new SymmetricMatrix<float>(entity_data.NumberOfRows);
	IList<float> entity_weights = new float[entity_data.NumberOfRows];

	// every column contributes its (squared) weight to each pair of entities it contains
	for (int c = 0; c < num_columns; c++)
	{
		var row = transpose.GetEntriesByRow(c);
		for (int i = 0; i < row.Count; i++)
		{
			int x = row[i];
			entity_weights[x] += other_entity_weights[c];
			for (int j = i + 1; j < row.Count; j++)
			{
				int y = row[j];
				weighted_overlap[x, y] += other_entity_weights[c] * other_entity_weights[c];
			}
		}
	}

	return Tuple.Create(weighted_overlap, entity_weights);
}
/// <summary>Display data statistics for item recommendation datasets</summary>
/// <param name="training_data">the training dataset</param>
/// <param name="test_data">the test dataset</param>
/// <param name="user_attributes">the user attributes</param>
/// <param name="item_attributes">the item attributes</param>
public static string Statistics(
	this IPosOnlyFeedback training_data, IPosOnlyFeedback test_data = null,
	IBooleanMatrix user_attributes = null, IBooleanMatrix item_attributes = null)
{
	// training data stats
	int num_users = training_data.AllUsers.Count;
	int num_items = training_data.AllItems.Count;
	long matrix_size = (long) num_users * num_items;           // widen before multiplying
	long empty_size = matrix_size - training_data.Count;
	double sparsity = (double) 100L * empty_size / matrix_size;
	string s = string.Format(
		CultureInfo.InvariantCulture,
		"training data: {0} users, {1} items, {2} events, sparsity {3,0:0.#####}\n",
		num_users, num_items, training_data.Count, sparsity);

	// test data stats
	if (test_data != null)
	{
		num_users = test_data.AllUsers.Count;
		num_items = test_data.AllItems.Count;
		matrix_size = (long) num_users * num_items;
		empty_size = matrix_size - test_data.Count;
		sparsity = (double) 100L * empty_size / matrix_size; // TODO depends on the eval scheme whether this is correct
		s += string.Format(
			CultureInfo.InvariantCulture,
			"test data: {0} users, {1} items, {2} events, sparsity {3,0:0.#####}\n",
			num_users, num_items, test_data.Count, sparsity);
	}

	return s + Statistics(user_attributes, item_attributes);
}
// Fill the strict lower triangle of the correlation matrix from 16-bit overlap counts
// (the smaller counter type saves memory).
void ComputeCorrelationsUShortOverlap(IBooleanMatrix entity_data)
{
	var overlap = Overlap.ComputeUShort(entity_data);

	for (int i = 0; i < NumEntities; i++)
		for (int j = 0; j < i; j++)
			this[i, j] = ComputeCorrelationFromOverlap(
				overlap[i, j],
				entity_data.NumEntriesByRow(i),
				entity_data.NumEntriesByRow(j));
}
// Compute correlations from the frequency-weighted overlap; Item1 holds the
// weighted overlap matrix, Item2 the per-entity weight sums.
void ComputeCorrelationsWeighted(IBooleanMatrix entity_data)
{
	var result = Overlap.ComputeWeighted(entity_data);
	IMatrix<float> overlap = result.Item1;
	IList<float> weights = result.Item2;

	for (int i = 0; i < NumEntities; i++)
		for (int j = 0; j < i; j++)
			this[i, j] = ComputeCorrelationFromOverlap(overlap[i, j], weights[i], weights[j]);
}
/// <summary>Count the entries that are true in both this matrix and another one</summary>
/// <param name="s">the matrix to compare against</param>
/// <returns>the number of entries that are true in both matrices</returns>
public int Overlap(IBooleanMatrix s)
{
	int count = 0;
	for (int row = 0; row < row_list.Count; row++)
		foreach (int col in row_list[row])
			if (s[row, col])
				count++;
	return count;
}
// Correlations from 16-bit overlap counters; only the strict lower triangle is written.
void ComputeCorrelationsUShortOverlap(IBooleanMatrix entity_data)
{
	var overlap = Overlap.ComputeUShort(entity_data);
	for (int x = 0; x < NumEntities; x++)
	{
		int size_x = entity_data.NumEntriesByRow(x); // invariant over the inner loop
		for (int y = 0; y < x; y++)
			this[x, y] = ComputeCorrelationFromOverlap(overlap[x, y], size_x, entity_data.NumEntriesByRow(y));
	}
}
/// <summary>Initialize the model; substitutes an empty user relation (with a warning) if none was set</summary>
protected internal override void InitModel()
{
	base.InitModel();

	// fall back to an empty relation so training code can run without a UserRelation
	if (user_connections == null)
	{
		user_connections = new SparseBooleanMatrix();
		Console.Error.WriteLine("Warning: UserRelation not set.");
	}
	// NOTE(review): sized by the number of relation ENTRIES, not by the number of users —
	// confirm 'group' is meant to hold one value per (user, user) connection
	group = new float[user_connections.NumberOfEntries];
}
// Correlations from 32-bit overlap counters. Both triangles are written because the
// measure may be asymmetric in its two size arguments.
void ComputeCorrelationsUIntOverlap(IBooleanMatrix entity_data)
{
	var overlap = Overlap.ComputeUInt(entity_data);

	// compute correlations
	for (int a = 0; a < num_entities; a++)
		for (int b = 0; b < a; b++)
		{
			int size_a = entity_data.NumEntriesByRow(a);
			int size_b = entity_data.NumEntriesByRow(b);
			this[a, b] = ComputeCorrelationFromOverlap(overlap[a, b], size_a, size_b);
			this[b, a] = ComputeCorrelationFromOverlap(overlap[a, b], size_b, size_a);
		}
}
/// <summary>Initialize the model; grows MaxUserID to cover every user mentioned in the user relation</summary>
protected internal override void InitModel()
{
	// fall back to an empty relation so training code can run without a UserRelation
	if (user_connections == null)
	{
		user_connections = new SparseBooleanMatrix();
		Console.Error.WriteLine("Warning: UserRelation not set.");
	}
	// the relation may mention users without feedback;
	// widen the ID range BEFORE base.InitModel() allocates per-user structures
	this.MaxUserID = Math.Max(MaxUserID, user_connections.NumberOfRows - 1);
	this.MaxUserID = Math.Max(MaxUserID, user_connections.NumberOfColumns - 1);

	base.InitModel();
}
// Weighted variant: Item1 is the weighted overlap matrix, Item2 the per-entity weight sums.
void ComputeCorrelationsWeighted(IBooleanMatrix entity_data)
{
	var overlap_and_entity_weights = Overlap.ComputeWeighted(entity_data);

	for (int x = 0; x < NumEntities; x++)
		for (int y = 0; y < x; y++)
			this[x, y] = ComputeCorrelationFromOverlap(
				overlap_and_entity_weights.Item1[x, y],
				overlap_and_entity_weights.Item2[x],
				overlap_and_entity_weights.Item2[y]);
}
/// <summary>Optimize one factor matrix while keeping the other one fixed</summary>
/// <param name="data">data</param>
/// <param name="W">the factor matrix to update</param>
/// <param name="H">the factor matrix held fixed</param>
protected virtual void Optimize(IBooleanMatrix data, Matrix<float> W, Matrix<float> H)
{
	// phrased in terms of computing the user factors;
	// works identically with users and items exchanged

	// (1) precompute H^T H in O(f^2 |Items|)
	var HH = ComputeSquareMatrix(H);

	// (2) each row of W can be solved independently
	Parallel.For(0, W.dim1, u => Optimize(u, data, W, H, HH));
}
// 32-bit overlap counters; fills both triangles since the correlation measure
// may depend on the order of its two size arguments.
void ComputeCorrelationsUIntOverlap(IBooleanMatrix entity_data)
{
	var overlap = Overlap.ComputeUInt(entity_data);

	// compute correlations
	for (int x = 0; x < num_entities; x++)
		for (int y = 0; y < x; y++)
		{
			uint common = overlap[x, y];
			this[x, y] = ComputeCorrelationFromOverlap(common, entity_data.NumEntriesByRow(x), entity_data.NumEntriesByRow(y));
			this[y, x] = ComputeCorrelationFromOverlap(common, entity_data.NumEntriesByRow(y), entity_data.NumEntriesByRow(x));
		}
}
/// <summary>Display dataset statistics</summary>
/// <param name="train">the training data</param>
/// <param name="test">the test data</param>
/// <param name="user_attributes">the user attributes</param>
/// <param name="item_attributes">the item attributes</param>
/// <param name="display_overlap">if set true, display the user/item overlap between train and test</param>
public static string Statistics(
	this IRatings train, IRatings test = null,
	IBooleanMatrix user_attributes = null, IBooleanMatrix item_attributes = null,
	bool display_overlap = false)
{
	// training data stats
	int num_users = train.AllUsers.Count;
	int num_items = train.AllItems.Count;
	long matrix_size = (long)num_users * num_items;   // long math: user*item counts can exceed Int32
	long empty_size = (long)matrix_size - train.Count;
	double sparsity = (double)100L * empty_size / matrix_size; // percentage of unobserved cells
	string s = string.Format(CultureInfo.InvariantCulture, "training data: {0} users, {1} items, {2} ratings, sparsity {3,0:0.#####}\n", num_users, num_items, train.Count, sparsity);
	// when timestamps are available, also report the covered rating period
	if (train is ITimedRatings)
	{
		var time_train = train as ITimedRatings;
		s += string.Format(CultureInfo.InvariantCulture, "rating period: {0} to {1}\n", time_train.EarliestTime, time_train.LatestTime);
	}

	// test data stats (same shape of report as for the training data)
	if (test != null)
	{
		num_users = test.AllUsers.Count;
		num_items = test.AllItems.Count;
		matrix_size = (long)num_users * num_items;
		empty_size = (long)matrix_size - test.Count; // TODO depends on the eval scheme whether this is correct
		sparsity = (double)100L * empty_size / matrix_size;
		s += string.Format(CultureInfo.InvariantCulture, "test data: {0} users, {1} items, {2} ratings, sparsity {3,0:0.#####}\n", num_users, num_items, test.Count, sparsity);
		if (test is ITimedRatings)
		{
			var time_test = test as ITimedRatings;
			s += string.Format(CultureInfo.InvariantCulture, "rating period: {0} to {1}\n", time_test.EarliestTime, time_test.LatestTime);
		}
	}

	// count and display the overlap between train and test (cold-start users/items)
	if (display_overlap && test != null)
	{
		int num_new_users = 0;
		int num_new_items = 0;
		// timed because the set differences can be expensive on large datasets
		TimeSpan seconds = Wrap.MeasureTime(delegate() {
			num_new_users = test.AllUsers.Except(train.AllUsers).Count();
			num_new_items = test.AllItems.Except(train.AllItems).Count();
		});
		s += string.Format("{0} new users, {1} new items ({2} seconds)\n", num_new_users, num_new_items, seconds);
	}

	return(s + Statistics(user_attributes, item_attributes));
}
/// <summary>Optimizes the specified data</summary>
/// <param name="data">data</param>
/// <param name="W">the factor matrix being updated</param>
/// <param name="H">the factor matrix held fixed</param>
protected virtual void Optimize(IBooleanMatrix data, Matrix<float> W, Matrix<float> H)
{
	// source code comments are written for the user-factor case;
	// the item-factor case is symmetric

	// (1) create HH = H^T H in O(f^2 |Items|)
	var HH = ComputeSquareMatrix(H);

	// (2) optimize all rows of W; the per-row solves are independent
	Parallel.For(0, W.dim1, delegate(int u) { Optimize(u, data, W, H, HH); });
}
/// <summary>Compute the correlation matrix over the rows of the given binary data</summary>
/// <param name="entity_data">the binary data; rows are the entities to correlate</param>
public void ComputeCorrelations(IBooleanMatrix entity_data)
{
	Resize(entity_data.NumberOfRows);

	// every entity correlates perfectly with itself
	for (int i = 0; i < NumEntities; i++)
		this[i, i] = 1;

	if (Weighted)
	{
		ComputeCorrelationsWeighted(entity_data);
		return;
	}

	// unweighted: pick the smallest overlap counter type that cannot overflow,
	// to save memory where possible
	if (entity_data.NumberOfColumns > ushort.MaxValue)
		ComputeCorrelationsUIntOverlap(entity_data);
	else
		ComputeCorrelationsUShortOverlap(entity_data);
}
/// <summary>Display dataset statistics</summary>
/// <param name="train">the training data</param>
/// <param name="test">the test data</param>
/// <param name="user_attributes">the user attributes</param>
/// <param name="item_attributes">the item attributes</param>
/// <param name="display_overlap">if set true, display the user/item overlap between train and test</param>
public static string Statistics(
	this IRatings train, IRatings test = null,
	IBooleanMatrix user_attributes = null, IBooleanMatrix item_attributes = null,
	bool display_overlap = false)
{
	// training data stats
	int num_users = train.AllUsers.Count;
	int num_items = train.AllItems.Count;
	long matrix_size = (long) num_users * num_items;   // long math: user*item counts can exceed Int32
	long empty_size = (long) matrix_size - train.Count;
	double sparsity = (double) 100L * empty_size / matrix_size; // percentage of unobserved cells
	string s = string.Format(CultureInfo.InvariantCulture, "training data: {0} users, {1} items, {2} ratings, sparsity {3,0:0.#####}\n", num_users, num_items, train.Count, sparsity);
	// when timestamps are available, also report the covered rating period
	if (train is ITimedRatings)
	{
		var time_train = train as ITimedRatings;
		s += string.Format(CultureInfo.InvariantCulture, "rating period: {0} to {1}\n", time_train.EarliestTime, time_train.LatestTime);
	}

	// test data stats (same shape of report as for the training data)
	if (test != null)
	{
		num_users = test.AllUsers.Count;
		num_items = test.AllItems.Count;
		matrix_size = (long) num_users * num_items;
		empty_size = (long) matrix_size - test.Count; // TODO depends on the eval scheme whether this is correct
		sparsity = (double) 100L * empty_size / matrix_size;
		s += string.Format(CultureInfo.InvariantCulture, "test data: {0} users, {1} items, {2} ratings, sparsity {3,0:0.#####}\n", num_users, num_items, test.Count, sparsity);
		if (test is ITimedRatings)
		{
			var time_test = test as ITimedRatings;
			s += string.Format(CultureInfo.InvariantCulture, "rating period: {0} to {1}\n", time_test.EarliestTime, time_test.LatestTime);
		}
	}

	// count and display the overlap between train and test (cold-start users/items)
	if (display_overlap && test != null)
	{
		int num_new_users = 0;
		int num_new_items = 0;
		// timed because the set differences can be expensive on large datasets
		TimeSpan seconds = Wrap.MeasureTime(delegate() {
			num_new_users = test.AllUsers.Except(train.AllUsers).Count();
			num_new_items = test.AllItems.Except(train.AllItems).Count();
		});
		s += string.Format("{0} new users, {1} new items ({2} seconds)\n", num_new_users, num_new_items, seconds);
	}

	return s + Statistics(user_attributes, item_attributes);
}
/// <summary>Creates a weighted Cosine similarity matrix from given data</summary>
/// <param name="vectors">the boolean data</param>
/// <returns>the similarity matrix based on the data</returns>
public static CorrelationMatrix Create(IBooleanMatrix vectors)
{
	int num_entities = vectors.NumberOfRows;

	WeightedBinaryCosine cm;
	try
	{
		cm = new WeightedBinaryCosine(num_entities);
	}
	catch (OverflowException)
	{
		// report the size that broke allocation, then let the caller deal with it
		Console.Error.WriteLine("Too many entities: " + num_entities);
		throw;
	}

	cm.ComputeCorrelations(vectors);
	return cm;
}
/// <summary>Compute the cosine similarities over the rows of a binary data matrix</summary>
/// <param name="entity_data">the binary data; rows are the entities to correlate</param>
public override void ComputeCorrelations(IBooleanMatrix entity_data)
{
	var transpose = entity_data.Transpose();
	var overlap = new SparseMatrix<int>(entity_data.NumberOfRows, entity_data.NumberOfRows);

	// go over all (other) entities and count common entries per pair;
	// each pair is stored once, in the upper triangle (smaller index first)
	for (int row_id = 0; row_id < transpose.NumberOfRows; row_id++)
	{
		var row = ((IBooleanMatrix)transpose).GetEntriesByRow(row_id);
		for (int i = 0; i < row.Count; i++)
		{
			int x = row[i];
			for (int j = i + 1; j < row.Count; j++)
			{
				int y = row[j];
				if (x < y)
					overlap[x, y]++;
				else
					overlap[y, x]++;
			}
		}
	}

	// the diagonal of the correlation matrix
	for (int i = 0; i < num_entities; i++)
		this[i, i] = 1;

	// compute cosine; only pairs with non-zero overlap need to be visited
	foreach (var index_pair in overlap.NonEmptyEntryIDs)
	{
		int x = index_pair.First;
		int y = index_pair.Second;
		// BUGFIX: widen to double BEFORE multiplying — the product of two int row
		// sizes can overflow Int32 for popular entities
		this[x, y] = (float) (overlap[x, y] / Math.Sqrt((double) entity_data.NumEntriesByRow(x) * entity_data.NumEntriesByRow(y)));
	}
}
/// <summary>Get the overlap of two matrices, i.e. the number of true entries where they agree</summary>
/// <param name="s">the <see cref="SparseBooleanMatrix"/> to compare to</param>
/// <returns>the number of entries that are true in both matrices</returns>
public int Overlap(IBooleanMatrix s)
{
	int num_common = 0;
	for (int i = 0; i < row_list.Count; i++)
		foreach (int j in row_list[i])
			if (s[i, j])
				num_common++;
	return num_common;
}
/// <summary>Creates a Cosine similarity matrix from given data</summary>
/// <param name="vectors">the boolean data</param>
/// <returns>the similarity matrix based on the data</returns>
/// <exception cref="OverflowException">re-thrown (after logging) when there are too many entities</exception>
public static CorrelationMatrix Create(IBooleanMatrix vectors) // was 'static public': standard modifier order
{
	BinaryDataCorrelationMatrix cm;
	int num_entities = vectors.NumberOfRows;
	try
	{
		cm = new BinaryCosine(num_entities);
	}
	catch (OverflowException)
	{
		// log the offending size, then let the caller handle the failure
		Console.Error.WriteLine("Too many entities: " + num_entities);
		throw;
	}
	cm.ComputeCorrelations(vectors);
	return cm;
}
/// <summary>Compute the overlap between the vectors in a binary matrix</summary>
/// <returns>a sparse symmetric matrix with the overlap counts</returns>
/// <param name='entity_data'>the binary matrix</param>
public static IMatrix<uint> ComputeUInt(IBooleanMatrix entity_data)
{
	var transpose = entity_data.Transpose() as IBooleanMatrix;
	var overlap = new SymmetricSparseMatrix<uint>(entity_data.NumberOfRows);

	// each column of the original matrix increments the count of every entity pair it contains
	for (int row_id = 0; row_id < transpose.NumberOfRows; row_id++)
	{
		var row = transpose.GetEntriesByRow(row_id);
		for (int i = 0; i < row.Count; i++)
		{
			int x = row[i];
			for (int j = i + 1; j < row.Count; j++)
			{
				overlap[x, row[j]]++;
			}
		}
	}

	return overlap;
}
// Resolve data file paths relative to data_dir and wire attribute/relation data
// into the recommender, depending on which awareness interfaces it implements.
protected virtual void LoadData()
{
	// NOTE(review): fields are rewritten in place — calling LoadData() twice would
	// prepend data_dir a second time; confirm it is only called once
	training_file = Path.Combine(data_dir, training_file);
	if (test_file != null)
	{
		test_file = Path.Combine(data_dir, test_file);
	}

	// user attributes
	if (user_attributes_file != null)
	{
		user_attributes = AttributeData.Read(Path.Combine(data_dir, user_attributes_file), user_mapping);
	}
	if (recommender is IUserAttributeAwareRecommender)
	{
		((IUserAttributeAwareRecommender)recommender).UserAttributes = user_attributes;
	}

	// item attributes
	if (item_attributes_file != null)
	{
		item_attributes = AttributeData.Read(Path.Combine(data_dir, item_attributes_file), item_mapping);
	}
	if (recommender is IItemAttributeAwareRecommender)
	{
		((IItemAttributeAwareRecommender)recommender).ItemAttributes = item_attributes;
	}

	// user relation — only read when the recommender can actually use it
	if (recommender is IUserRelationAwareRecommender)
	{
		((IUserRelationAwareRecommender)recommender).UserRelation = RelationData.Read(Path.Combine(data_dir, user_relations_file), user_mapping);
		Console.WriteLine("relation over {0} users", ((IUserRelationAwareRecommender)recommender).NumUsers);
	}

	// item relation — only read when the recommender can actually use it
	if (recommender is IItemRelationAwareRecommender)
	{
		((IItemRelationAwareRecommender)recommender).ItemRelation = RelationData.Read(Path.Combine(data_dir, item_relations_file), item_mapping);
		Console.WriteLine("relation over {0} items", ((IItemRelationAwareRecommender)recommender).NumItems);
	}
}
/// <summary>Display statistics for user and item attributes</summary>
/// <param name="user_attributes">the user attributes</param>
/// <param name="item_attributes">the item attributes</param>
public static string Statistics(IBooleanMatrix user_attributes, IBooleanMatrix item_attributes)
{
	string s = string.Empty;

	if (user_attributes != null)
		s += string.Format(
			"{0} user attributes for {1} users, {2} assignments, {3} users with attribute assignments\n",
			user_attributes.NumberOfColumns,
			user_attributes.NumberOfRows,
			user_attributes.NumberOfEntries,
			user_attributes.NonEmptyRowIDs.Count);

	// NOTE(review): items report the number of NON-EMPTY attribute columns while users
	// report the total column count — confirm this asymmetry is intended
	if (item_attributes != null)
		s += string.Format(
			"{0} item attributes for {1} items, {2} assignments, {3} items with attribute assignments\n",
			item_attributes.NonEmptyColumnIDs.Count,
			item_attributes.NumberOfRows,
			item_attributes.NumberOfEntries,
			item_attributes.NonEmptyRowIDs.Count);

	return s;
}
/// <summary>Computes the overlap between the vectors in a binary matrix</summary>
/// <returns>a sparse matrix with the overlaps</returns>
/// <param name='entity_data'>the binary matrix</param>
public static IMatrix<ushort> ComputeUShort(IBooleanMatrix entity_data)
{
	var transpose = entity_data.Transpose() as IBooleanMatrix;
	// NOTE(review): 16-bit counters wrap past 65535 shared entries;
	// callers are expected to pick the uint variant for wide matrices
	var overlap = new SymmetricSparseMatrix<ushort>(entity_data.NumberOfRows);

	// every column entity increments the counter of each pair of row entities it links
	for (int row_id = 0; row_id < transpose.NumberOfRows; row_id++)
	{
		var row = transpose.GetEntriesByRow(row_id);
		for (int i = 0; i < row.Count; i++)
		{
			int x = row[i];
			for (int j = i + 1; j < row.Count; j++)
				overlap[x, row[j]]++;
		}
	}

	return overlap;
}
/// <summary>Compute the correlation matrix for the rows of the given binary data</summary>
/// <param name="entity_data">the binary data; rows are the entities to correlate</param>
public void ComputeCorrelations(IBooleanMatrix entity_data)
{
	Resize(entity_data.NumberOfRows);

	// self-correlation is always 1
	for (int i = 0; i < NumEntities; i++)
		this[i, i] = 1;

	bool fits_in_ushort = entity_data.NumberOfColumns <= ushort.MaxValue;
	if (Weighted)
		ComputeCorrelationsWeighted(entity_data);
	else if (fits_in_ushort) // if possible, save some memory
		ComputeCorrelationsUShortOverlap(entity_data);
	else
		ComputeCorrelationsUIntOverlap(entity_data);
}
/// <summary>Compute the Jaccard index over the rows of a binary data matrix</summary>
/// <param name="entity_data">the binary data; rows are the entities to correlate</param>
public override void ComputeCorrelations(IBooleanMatrix entity_data)
{
	var transpose = entity_data.Transpose() as IBooleanMatrix;
	var overlap = new SymmetricMatrix<int>(entity_data.NumberOfRows);

	// go over all (other) entities and count common entries per pair
	for (int row_id = 0; row_id < transpose.NumberOfRows; row_id++)
	{
		var row = transpose.GetEntriesByRow(row_id);
		for (int i = 0; i < row.Count; i++)
		{
			int x = row[i];
			for (int j = i + 1; j < row.Count; j++)
			{
				int y = row[j];
				overlap[x, y]++;
			}
		}
	}

	// the diagonal of the correlation matrix
	for (int i = 0; i < num_entities; i++)
		this[i, i] = 1;

	// compute the Jaccard index: |A ∩ B| / |A ∪ B|
	for (int x = 0; x < num_entities; x++)
		for (int y = 0; y < x; y++)
		{
			int union_size = entity_data.NumEntriesByRow(x) + entity_data.NumEntriesByRow(y) - overlap[x, y];
			// BUGFIX: cast the numerator to float BEFORE dividing — the old
			// (float)(int / int) truncated almost every index to 0 (and threw
			// DivideByZeroException when both rows were empty)
			if (union_size > 0)
				this[x, y] = (float) overlap[x, y] / union_size;
		}
}
/// <summary>Compute the correlations from an implicit feedback, positive-only dataset</summary>
/// <param name="entity_data">the implicit feedback set, rows contain the entities to correlate</param>
/// <exception cref="NotSupportedException">always thrown — this correlation type cannot be computed
/// from binary data; subclasses that support it must override this method</exception>
public virtual void ComputeCorrelations(IBooleanMatrix entity_data)
{
	throw new NotSupportedException();
}
/// <summary>Optimizes the specified data</summary>
/// <param name="data">data</param>
/// <param name="W">the factor matrix being updated (one row per entity)</param>
/// <param name="H">the factor matrix held fixed</param>
protected virtual void Optimize(IBooleanMatrix data, Matrix<float> W, Matrix<float> H)
{
	var HH = new Matrix<double>(num_factors, num_factors);

	// comments are in terms of computing the user factors
	// ... works the same with users and items exchanged

	// (1) create HH = H^T H in O(f^2|Items|)
	// HH is symmetric
	for (int f_1 = 0; f_1 < num_factors; f_1++)
	{
		for (int f_2 = 0; f_2 < num_factors; f_2++)
		{
			double d = 0;
			for (int i = 0; i < H.dim1; i++)
			{
				d += H[i, f_1] * H[i, f_2];
			}
			HH[f_1, f_2] = d;
		}
	}

	// (2) optimize all U; the per-row solves are independent, hence Parallel.For
	// HC_minus_IH is symmetric
	Parallel.For(0, W.dim1, u =>
	{
		var row = data.GetEntriesByRow(u);

		// create HC_minus_IH in O(f^2|S_u|); only the observed items contribute
		var HC_minus_IH = new Matrix<double>(num_factors, num_factors);
		for (int f_1 = 0; f_1 < num_factors; f_1++)
		{
			for (int f_2 = 0; f_2 < num_factors; f_2++)
			{
				double d = 0;
				foreach (int i in row)
				{
					d += H[i, f_1] * H[i, f_2] * alpha;
				}
				HC_minus_IH[f_1, f_2] = d;
			}
		}

		// create HCp in O(f|S_u|)
		var HCp = new double[num_factors];
		for (int f = 0; f < num_factors; f++)
		{
			double d = 0;
			foreach (int i in row)
			{
				d += H[i, f] * (1 + alpha);
			}
			HCp[f] = d;
		}

		// create m = HH + HC_minus_IH + reg*I
		// m is symmetric
		// the inverse m_inv is symmetric
		var m = new DenseMatrix(num_factors, num_factors);
		for (int f_1 = 0; f_1 < num_factors; f_1++)
		{
			for (int f_2 = 0; f_2 < num_factors; f_2++)
			{
				double d = HH[f_1, f_2] + HC_minus_IH[f_1, f_2];
				if (f_1 == f_2)
				{
					d += regularization;
				}
				m[f_1, f_2] = d;
			}
		}
		var m_inv = m.Inverse();

		// write back optimal W: W[u] = m_inv * HCp
		for (int f = 0; f < num_factors; f++)
		{
			double d = 0;
			for (int f_2 = 0; f_2 < num_factors; f_2++)
			{
				d += m_inv[f, f_2] * HCp[f_2];
			}
			W[u, f] = (float)d;
		}
	});
}
/// <summary>Optimizes the specified data</summary>
/// <param name="data">data</param>
/// <param name="W">the factor matrix being updated (one row per entity)</param>
/// <param name="H">the factor matrix held fixed</param>
protected virtual void Optimize(IBooleanMatrix data, Matrix<double> W, Matrix<double> H)
{
	// scratch buffers reused across all rows of W (this variant is sequential)
	var HH = new Matrix<double>(num_factors, num_factors);
	var HC_minus_IH = new Matrix<double>(num_factors, num_factors);
	var HCp = new double[num_factors];

	var m = new MathNet.Numerics.LinearAlgebra.Matrix(num_factors, num_factors);
	MathNet.Numerics.LinearAlgebra.Matrix m_inv; // TODO speed up using more parts of that library

	// source code comments are in terms of computing the user factors
	// works the same with users and items exchanged

	// (1) create HH = H^T H in O(f^2|Items|)
	// HH is symmetric
	for (int f_1 = 0; f_1 < num_factors; f_1++)
	{
		for (int f_2 = 0; f_2 < num_factors; f_2++)
		{
			double d = 0;
			for (int i = 0; i < H.dim1; i++)
			{
				d += H[i, f_1] * H[i, f_2];
			}
			HH[f_1, f_2] = d;
		}
	}

	// (2) optimize all U
	// HC_minus_IH is symmetric
	for (int u = 0; u < W.dim1; u++)
	{
		var row = data.GetEntriesByRow(u);

		// create HC_minus_IH in O(f^2|S_u|); only the observed items contribute
		for (int f_1 = 0; f_1 < num_factors; f_1++)
		{
			for (int f_2 = 0; f_2 < num_factors; f_2++)
			{
				double d = 0;
				foreach (int i in row)
				{
					//d += H[i, f_1] * H[i, f_2] * (c_pos - 1);
					d += H[i, f_1] * H[i, f_2] * c_pos;
				}
				HC_minus_IH[f_1, f_2] = d;
			}
		}

		// create HCp in O(f|S_u|)
		for (int f = 0; f < num_factors; f++)
		{
			double d = 0;
			foreach (int i in row)
			{
				//d += H[i, f] * c_pos;
				d += H[i, f] * (1 + c_pos);
			}
			HCp[f] = d;
		}

		// create m = HH + HC_minus_IH + reg*I
		// m is symmetric
		// the inverse m_inv is symmetric
		for (int f_1 = 0; f_1 < num_factors; f_1++)
		{
			for (int f_2 = 0; f_2 < num_factors; f_2++)
			{
				double d = HH[f_1, f_2] + HC_minus_IH[f_1, f_2];
				if (f_1 == f_2)
				{
					d += regularization;
				}
				m[f_1, f_2] = d;
			}
		}
		m_inv = m.Inverse();

		// write back optimal W: W[u] = m_inv * HCp
		for (int f = 0; f < num_factors; f++)
		{
			double d = 0;
			for (int f_2 = 0; f_2 < num_factors; f_2++)
			{
				d += m_inv[f, f_2] * HCp[f_2];
			}
			W[u, f] = d;
		}
	}
}
/// <summary>Optimizes the specified data</summary>
/// <param name="data">data</param>
/// <param name="inverse_data">the same data, transposed (used for popularity-based negative weighting)</param>
/// <param name="W">the factor matrix being updated (one row per entity)</param>
/// <param name="H">the factor matrix held fixed</param>
void Optimize(IBooleanMatrix data, IBooleanMatrix inverse_data, Matrix<double> W, Matrix<double> H)
{
	// scratch buffers reused across all rows of W (this variant is sequential)
	var HH = new Matrix<double>(num_factors, num_factors);
	var HC_minus_IH = new Matrix<double>(num_factors, num_factors);
	var HCp = new double[num_factors];

	var m = new MathNet.Numerics.LinearAlgebra.Matrix(num_factors, num_factors);
	MathNet.Numerics.LinearAlgebra.Matrix m_inv;
	// TODO speed up using more parts of that library
	// TODO using properties gives a 3-5% performance penalty

	// source code comments are in terms of computing the user factors
	// works the same with users and items exchanged

	// (1) create HH = H^T H in O(f^2|Items|)
	// HH is symmetric
	for (int f_1 = 0; f_1 < num_factors; f_1++)
		for (int f_2 = 0; f_2 < num_factors; f_2++)
		{
			double d = 0;
			for (int i = 0; i < H.dim1; i++)
				d += H[i, f_1] * H[i, f_2];
			HH[f_1, f_2] = d;
		}

	// (2) optimize all U
	// HC_minus_IH is symmetric
	for (int u = 0; u < W.dim1; u++)
	{
		var row = data.GetEntriesByRow(u);

		// prepare KDD Cup specific weighting:
		// negative examples are weighted by item popularity, normalized so the
		// total negative weight balances the positive weight of this user
		int num_user_items = row.Count;
		int user_positive_weight_sum = 0;
		foreach (int i in row)
			user_positive_weight_sum += inverse_data.NumEntriesByRow(i);
		double neg_weight_normalization = (double) (num_user_items * (1 + CPos)) / (Feedback.Count - user_positive_weight_sum); // TODO precompute // TODO check whether this is correct

		// create HC_minus_IH in O(f^2|S_u|); only the observed items contribute
		for (int f_1 = 0; f_1 < num_factors; f_1++)
			for (int f_2 = 0; f_2 < num_factors; f_2++)
			{
				double d = 0;
				foreach (int i in row)
					//d += H[i, f_1] * H[i, f_2] * (c_pos - 1);
					d += H[i, f_1] * H[i, f_2] * CPos;
				HC_minus_IH[f_1, f_2] = d;
			}

		// create HCp in O(f|S_u|)
		// NOTE(review): unlike the plain variant, this loops over ALL items and
		// uses row.Contains(i) — O(|Items| * |S_u|) per factor; potential hotspot
		for (int f = 0; f < num_factors; f++)
		{
			double d = 0;
			for (int i = 0; i < inverse_data.NumberOfRows; i++)
				if (row.Contains(i))
					d += H[i, f] * (1 + CPos);
				else
					d += H[i, f] * inverse_data.NumEntriesByRow(i) * neg_weight_normalization;
			HCp[f] = d;
		}

		// create m = HH + HC_minus_IH + reg*I
		// m is symmetric
		// the inverse m_inv is symmetric
		for (int f_1 = 0; f_1 < num_factors; f_1++)
			for (int f_2 = 0; f_2 < num_factors; f_2++)
			{
				double d = HH[f_1, f_2] + HC_minus_IH[f_1, f_2];
				if (f_1 == f_2)
					d += Regularization;
				m[f_1, f_2] = d;
			}
		m_inv = m.Inverse();

		// write back optimal W: W[u] = m_inv * HCp
		for (int f = 0; f < num_factors; f++)
		{
			double d = 0;
			for (int f_2 = 0; f_2 < num_factors; f_2++)
				d += m_inv[f, f_2] * HCp[f_2];
			W[u, f] = d;
		}
	}
}
/// <summary>Compute the correlations from an implicit feedback, positive-only dataset</summary>
/// <param name="entity_data">the implicit feedback set, rows contain the entities to correlate</param>
/// <remarks>Implementations fill this correlation matrix in place from the binary data.</remarks>
public abstract void ComputeCorrelations(IBooleanMatrix entity_data);
/// <summary>Compute the correlations over the rows of the given binary data</summary>
/// <param name="entity_data">the binary data; rows are the entities to correlate</param>
public override void ComputeCorrelations(IBooleanMatrix entity_data)
{
	// use the smaller 16-bit overlap counters whenever the column count allows it,
	// to save some memory
	if (entity_data.NumberOfColumns <= ushort.MaxValue)
		ComputeCorrelationsUShortOverlap(entity_data);
	else
		ComputeCorrelationsUIntOverlap(entity_data);
}
// Solve the regularized least-squares problem for one row u of W, given the fixed
// factors H and the precomputed Gram matrix HH = H^T H.
private void Optimize(int u, IBooleanMatrix data, Matrix<float> W, Matrix<float> H, Matrix<double> HH)
{
	var row = data.GetEntriesByRow(u);

	// HC_minus_IH is symmetric
	// create HC_minus_IH in O(f^2|S_u|); only u's observed items contribute,
	// and each symmetric entry is computed once and mirrored
	var HC_minus_IH = new Matrix<double>(num_factors, num_factors);
	for (int f_1 = 0; f_1 < num_factors; f_1++)
	{
		for (int f_2 = f_1; f_2 < num_factors; f_2++)
		{
			double d = 0;
			foreach (int i in row)
			{
				d += H[i, f_1] * H[i, f_2];
			}
			HC_minus_IH[f_1, f_2] = d * Alpha;
			HC_minus_IH[f_2, f_1] = d * Alpha;
		}
	}

	// create HCp in O(f|S_u|)
	var HCp = new double[num_factors];
	for (int f = 0; f < num_factors; f++)
	{
		double d = 0;
		foreach (int i in row)
		{
			d += H[i, f];
		}
		HCp[f] = d * (1 + Alpha);
	}

	// create m = HH + HC_minus_IH + reg*I
	// m is symmetric
	// the inverse m_inv is symmetric
	var m = new DenseMatrix(num_factors, num_factors);
	for (int f_1 = 0; f_1 < num_factors; f_1++)
	{
		for (int f_2 = f_1; f_2 < num_factors; f_2++)
		{
			double d = HH[f_1, f_2] + HC_minus_IH[f_1, f_2];
			if (f_1 == f_2)
			{
				d += Regularization;
			}
			m[f_1, f_2] = d;
			m[f_2, f_1] = d;
		}
	}
	var m_inv = m.Inverse();

	// write back optimal W: W[u] = m_inv * HCp
	for (int f = 0; f < num_factors; f++)
	{
		double d = 0;
		for (int f_2 = 0; f_2 < num_factors; f_2++)
		{
			d += m_inv[f, f_2] * HCp[f_2];
		}
		W[u, f] = (float)d;
	}
}
/// <summary>Compute frequency-weighted cosine correlations over the rows of the given binary data</summary>
/// <param name="entity_data">the binary data; rows are the entities to correlate</param>
public override void ComputeCorrelations(IBooleanMatrix entity_data)
{
	var transpose = (IBooleanMatrix) entity_data.Transpose();

	// weight of each column entity: inverse binary log of its frequency
	var other_entity_weights = new float[transpose.NumberOfRows];
	for (int row_id = 0; row_id < transpose.NumberOfRows; row_id++)
	{
		int freq = transpose.GetEntriesByRow(row_id).Count;
		other_entity_weights[row_id] = 1f / (float) Math.Log(3 + freq, 2); // TODO make configurable
	}

	var weighted_overlap = new SymmetricMatrix<float>(entity_data.NumberOfRows);
	var entity_weights = new float[entity_data.NumberOfRows];

	// go over all (other) entities, accumulating pairwise overlap and per-entity weights
	// NOTE(review): the overlap accumulates weight^2 while entity_weights accumulates
	// plain weight — for a strict weighted cosine the norms would use squared weights;
	// confirm the asymmetry is intended
	for (int row_id = 0; row_id < transpose.NumberOfRows; row_id++)
	{
		var row = transpose.GetEntriesByRow(row_id);
		for (int i = 0; i < row.Count; i++)
		{
			int x = row[i];
			entity_weights[x] += other_entity_weights[row_id];
			for (int j = i + 1; j < row.Count; j++)
			{
				int y = row[j];
				weighted_overlap[x, y] += other_entity_weights[row_id] * other_entity_weights[row_id];
			}
		}
	}

	// the diagonal of the correlation matrix
	for (int i = 0; i < num_entities; i++)
		this[i, i] = 1;

	// compute cosine
	for (int x = 0; x < num_entities; x++)
		for (int y = 0; y < x; y++)
			this[x, y] = (float) (weighted_overlap[x, y] / Math.Sqrt(entity_weights[x] * entity_weights[y] ));
}
// Compute cosine correlations using memory-saving 16-bit overlap counters;
// fills the diagonal with 1 and the strict lower triangle with the cosine values.
void ComputeCorrelationsUShortOverlap(IBooleanMatrix entity_data)
{
	var transpose = entity_data.Transpose() as IBooleanMatrix;
	var overlap = new SymmetricMatrix<ushort>(entity_data.NumberOfRows);

	// go over all (other) entities and count common entries per pair
	for (int row_id = 0; row_id < transpose.NumberOfRows; row_id++)
	{
		var row = transpose.GetEntriesByRow(row_id);
		for (int i = 0; i < row.Count; i++)
		{
			int x = row[i];
			for (int j = i + 1; j < row.Count; j++)
				overlap[x, row[j]]++;
		}
	}

	// the diagonal of the correlation matrix
	for (int i = 0; i < num_entities; i++)
		this[i, i] = 1;

	// compute cosine
	for (int x = 0; x < num_entities; x++)
		for (int y = 0; y < x; y++)
		{
			// BUGFIX: widen BEFORE multiplying — the old 'long p = int * int'
			// performed the multiplication in 32 bits and could overflow for
			// entities with many entries, only then assigning to long
			long size_product = (long) entity_data.NumEntriesByRow(x) * entity_data.NumEntriesByRow(y);
			if (size_product > 0)
				this[x, y] = (float) (overlap[x, y] / Math.Sqrt(size_product));
		}
}
// Solve the regularized least-squares system for row u of W, with H fixed and
// HH = H^T H precomputed by the caller.
private void Optimize(int u, IBooleanMatrix data, Matrix<float> W, Matrix<float> H, Matrix<double> HH)
{
	var row = data.GetEntriesByRow(u);

	// HC_minus_IH is symmetric
	// create HC_minus_IH in O(f^2|S_u|); each symmetric entry is computed once and mirrored
	var HC_minus_IH = new Matrix<double>(num_factors, num_factors);
	for (int f_1 = 0; f_1 < num_factors; f_1++)
		for (int f_2 = f_1; f_2 < num_factors; f_2++)
		{
			double d = 0;
			foreach (int i in row)
				d += H[i, f_1] * H[i, f_2];
			HC_minus_IH[f_1, f_2] = d * alpha;
			HC_minus_IH[f_2, f_1] = d * alpha;
		}

	// create HCp in O(f|S_u|)
	var HCp = new double[num_factors];
	for (int f = 0; f < num_factors; f++)
	{
		double d = 0;
		foreach (int i in row)
			d += H[i, f];
		HCp[f] = d * (1 + alpha);
	}

	// create m = HH + HC_minus_IH + reg*I
	// m is symmetric
	// the inverse m_inv is symmetric
	var m = new DenseMatrix(num_factors, num_factors);
	for (int f_1 = 0; f_1 < num_factors; f_1++)
		for (int f_2 = f_1; f_2 < num_factors; f_2++)
		{
			double d = HH[f_1, f_2] + HC_minus_IH[f_1, f_2];
			if (f_1 == f_2)
				d += regularization;
			m[f_1, f_2] = d;
			m[f_2, f_1] = d;
		}
	var m_inv = m.Inverse();

	// write back optimal W: W[u] = m_inv * HCp
	for (int f = 0; f < num_factors; f++)
	{
		double d = 0;
		for (int f_2 = 0; f_2 < num_factors; f_2++)
			d += m_inv[f, f_2] * HCp[f_2];
		W[u, f] = (float) d;
	}
}
/// <summary>Display statistics for user and item attributes</summary>
/// <param name="user_attributes">the user attributes; may be null</param>
/// <param name="item_attributes">the item attributes; may be null</param>
/// <returns>a string with one line of statistics per non-null attribute matrix</returns>
public static string Statistics(IBooleanMatrix user_attributes, IBooleanMatrix item_attributes)
{
	string s = string.Empty;
	if (user_attributes != null)
	{
		s += string.Format(
			"{0} user attributes for {1} users, {2} assignments, {3} users with attribute assignments\n",
			user_attributes.NumberOfColumns,
			user_attributes.NumberOfRows,
			user_attributes.NumberOfEntries,
			user_attributes.NonEmptyRowIDs.Count);
	}
	if (item_attributes != null)
		// report NumberOfColumns for consistency with the user attribute line above
		// (was NonEmptyColumnIDs.Count, which only counts attributes that are actually assigned)
		s += string.Format(
			"{0} item attributes for {1} items, {2} assignments, {3} items with attribute assignments\n",
			item_attributes.NumberOfColumns,
			item_attributes.NumberOfRows,
			item_attributes.NumberOfEntries,
			item_attributes.NonEmptyRowIDs.Count);
	return s;
}
/// <summary>Compute the cosine correlations from a binary matrix</summary>
/// <param name='entity_data'>the binary matrix</param>
public override void ComputeCorrelations(IBooleanMatrix entity_data)
{
	// hoist the cast out of the loop; the result of Transpose() is used only as IBooleanMatrix
	var transpose = (IBooleanMatrix) entity_data.Transpose();

	var overlap = new SparseMatrix<int>(entity_data.NumberOfRows, entity_data.NumberOfRows);

	// go over all (other) entities
	for (int row_id = 0; row_id < transpose.NumberOfRows; row_id++)
	{
		var row = transpose.GetEntriesByRow(row_id);
		for (int i = 0; i < row.Count; i++)
		{
			int x = row[i];
			for (int j = i + 1; j < row.Count; j++)
			{
				int y = row[j];

				// always store the pair in the upper triangle of the sparse matrix
				if (x < y)
					overlap[x, y]++;
				else
					overlap[y, x]++;
			}
		}
	}

	// the diagonal of the correlation matrix
	for (int i = 0; i < num_entities; i++)
		this[i, i] = 1;

	// compute cosine
	foreach (var index_pair in overlap.NonEmptyEntryIDs)
	{
		int x = index_pair.First;
		int y = index_pair.Second;
		// widen BEFORE multiplying: int * int is computed in 32-bit arithmetic and
		// can overflow for very popular entities
		this[x, y] = (float) (overlap[x, y] / Math.Sqrt((long) entity_data.NumEntriesByRow(x) * entity_data.NumEntriesByRow(y)));
	}
}
/// <summary>Display data statistics for item recommendation datasets</summary>
/// <param name="training_data">the training dataset</param>
/// <param name="test_data">the test dataset; may be null</param>
/// <param name="user_attributes">the user attributes; may be null</param>
/// <param name="item_attributes">the item attributes; may be null</param>
/// <returns>a string with the dataset statistics, one line per dataset/attribute kind</returns>
public static string Statistics(
	this IPosOnlyFeedback training_data, IPosOnlyFeedback test_data = null,
	IBooleanMatrix user_attributes = null, IBooleanMatrix item_attributes = null)
{
	// training data stats
	int num_users = training_data.AllUsers.Count;
	int num_items = training_data.AllItems.Count;
	long matrix_size = (long) num_users * num_items;
	long empty_size = matrix_size - training_data.Count;
	// guard against an empty dataset: dividing by a zero matrix size would print NaN
	double sparsity = matrix_size > 0 ? (double) 100L * empty_size / matrix_size : 0;
	string s = string.Format(CultureInfo.InvariantCulture, "training data: {0} users, {1} items, {2} events, sparsity {3,0:0.#####}\n", num_users, num_items, training_data.Count, sparsity);

	// test data stats
	if (test_data != null)
	{
		num_users = test_data.AllUsers.Count;
		num_items = test_data.AllItems.Count;
		matrix_size = (long) num_users * num_items;
		empty_size = matrix_size - test_data.Count;
		sparsity = matrix_size > 0 ? (double) 100L * empty_size / matrix_size : 0; // TODO depends on the eval scheme whether this is correct
		s += string.Format(CultureInfo.InvariantCulture, "test data: {0} users, {1} items, {2} events, sparsity {3,0:0.#####}\n", num_users, num_items, test_data.Count, sparsity);
	}
	return s + Statistics(user_attributes, item_attributes);
}
/// <summary>Optimizes all rows of the factor matrix W, given the fixed factor matrix H</summary>
/// <param name="data">the binary observation matrix</param>
/// <param name="W">the factor matrix being updated</param>
/// <param name="H">the fixed factor matrix</param>
protected virtual void Optimize(IBooleanMatrix data, Matrix<float> W, Matrix<float> H)
{
	var HH = new Matrix<double>(num_factors, num_factors);
	var HC_minus_IH = new Matrix<double>(num_factors, num_factors);
	var HCp = new double[num_factors];
	var m = new DenseMatrix(num_factors, num_factors);

	// source code comments are in terms of computing the user factors
	// works the same with users and items exchanged

	// (1) create HH in O(f^2|Items|)
	// HH is symmetric: compute only the upper triangle and mirror it
	for (int f_1 = 0; f_1 < num_factors; f_1++)
		for (int f_2 = f_1; f_2 < num_factors; f_2++)
		{
			double d = 0;
			for (int i = 0; i < H.dim1; i++)
				d += H[i, f_1] * H[i, f_2];
			HH[f_1, f_2] = d;
			HH[f_2, f_1] = d;
		}
	// (2) optimize all U
	// HC_minus_IH is symmetric: compute only the upper triangle and mirror it
	for (int u = 0; u < W.dim1; u++)
	{
		var row = data.GetEntriesByRow(u);
		// create HC_minus_IH in O(f^2|S_u|)
		for (int f_1 = 0; f_1 < num_factors; f_1++)
			for (int f_2 = f_1; f_2 < num_factors; f_2++)
			{
				double d = 0;
				foreach (int i in row)
					//d += H[i, f_1] * H[i, f_2] * (c_pos - 1);
					d += H[i, f_1] * H[i, f_2] * c_pos;
				HC_minus_IH[f_1, f_2] = d;
				HC_minus_IH[f_2, f_1] = d;
			}
		// create HCp in O(f|S_u|)
		for (int f = 0; f < num_factors; f++)
		{
			double d = 0;
			foreach (int i in row)
				//d += H[i, f] * c_pos;
				d += H[i, f] * (1 + c_pos);
			HCp[f] = d;
		}
		// create m = HH + HC_minus_IH + reg*I
		// m is symmetric, hence the inverse m_inv is symmetric as well
		for (int f_1 = 0; f_1 < num_factors; f_1++)
			for (int f_2 = f_1; f_2 < num_factors; f_2++)
			{
				double d = HH[f_1, f_2] + HC_minus_IH[f_1, f_2];
				if (f_1 == f_2)
					d += regularization;
				m[f_1, f_2] = d;
				m[f_2, f_1] = d;
			}
		var m_inv = m.Inverse();
		// write back optimal W
		for (int f = 0; f < num_factors; f++)
		{
			double d = 0;
			for (int f_2 = 0; f_2 < num_factors; f_2++)
				d += m_inv[f, f_2] * HCp[f_2];
			W[u, f] = (float) d;
		}
	}
}
/// <summary>Optimizes all rows of the factor matrix W, given the fixed factor matrix H</summary>
/// <param name="data">the binary observation matrix (rows correspond to the entities being optimized)</param>
/// <param name="inverse_data">the transpose of the observation matrix</param>
/// <param name="W">the factor matrix being updated</param>
/// <param name="H">the fixed factor matrix</param>
void Optimize(IBooleanMatrix data, IBooleanMatrix inverse_data, Matrix<double> W, Matrix<double> H)
{
	var HH          = new Matrix<double>(num_factors, num_factors);
	var HC_minus_IH = new Matrix<double>(num_factors, num_factors);
	var HCp         = new double[num_factors];

	var m = new MathNet.Numerics.LinearAlgebra.Matrix(num_factors, num_factors);
	MathNet.Numerics.LinearAlgebra.Matrix m_inv;
	// TODO speed up using more parts of that library

	// TODO using properties gives a 3-5% performance penalty

	// source code comments are in terms of computing the user factors
	// works the same with users and items exchanged

	// (1) create HH in O(f^2|Items|)
	// HH is symmetric: compute only the upper triangle and mirror it
	for (int f_1 = 0; f_1 < num_factors; f_1++)
		for (int f_2 = f_1; f_2 < num_factors; f_2++)
		{
			double d = 0;
			for (int i = 0; i < H.dim1; i++)
				d += H[i, f_1] * H[i, f_2];
			HH[f_1, f_2] = d;
			HH[f_2, f_1] = d;
		}

	// (2) optimize all U
	// HC_minus_IH is symmetric: compute only the upper triangle and mirror it
	for (int u = 0; u < W.dim1; u++)
	{
		var row = data.GetEntriesByRow(u);
		// O(1) membership tests below instead of IList.Contains, which is O(|S_u|) per call
		var row_set = new HashSet<int>(row);

		// prepare KDD Cup specific weighting
		int num_user_items = row.Count;
		int user_positive_weight_sum = 0;
		foreach (int i in row)
			user_positive_weight_sum += inverse_data.NumEntriesByRow(i);
		double neg_weight_normalization = (double) (num_user_items * (1 + CPos)) / (Feedback.Count - user_positive_weight_sum);
		// TODO precompute
		// TODO check whether this is correct

		// create HC_minus_IH in O(f^2|S_u|)
		for (int f_1 = 0; f_1 < num_factors; f_1++)
			for (int f_2 = f_1; f_2 < num_factors; f_2++)
			{
				double d = 0;
				foreach (int i in row)
					//d += H[i, f_1] * H[i, f_2] * (c_pos - 1);
					d += H[i, f_1] * H[i, f_2] * CPos;
				HC_minus_IH[f_1, f_2] = d;
				HC_minus_IH[f_2, f_1] = d;
			}

		// create HCp in O(f|I|)
		for (int f = 0; f < num_factors; f++)
		{
			double d = 0;
			for (int i = 0; i < inverse_data.NumberOfRows; i++)
			{
				if (row_set.Contains(i))
					d += H[i, f] * (1 + CPos);
				else
					d += H[i, f] * inverse_data.NumEntriesByRow(i) * neg_weight_normalization;
			}
			HCp[f] = d;
		}

		// create m = HH + HC_minus_IH + reg*I
		// m is symmetric, hence the inverse m_inv is symmetric as well
		for (int f_1 = 0; f_1 < num_factors; f_1++)
			for (int f_2 = f_1; f_2 < num_factors; f_2++)
			{
				double d = HH[f_1, f_2] + HC_minus_IH[f_1, f_2];
				if (f_1 == f_2)
					d += Regularization;
				m[f_1, f_2] = d;
				m[f_2, f_1] = d;
			}
		m_inv = m.Inverse();

		// write back optimal W
		for (int f = 0; f < num_factors; f++)
		{
			double d = 0;
			for (int f_2 = 0; f_2 < num_factors; f_2++)
				d += m_inv[f, f_2] * HCp[f_2];
			W[u, f] = d;
		}
	}
}
/// <summary>Evaluation for rankings of items recommended to groups</summary>
/// <param name="recommender">group recommender</param>
/// <param name="test">test cases</param>
/// <param name="train">training data</param>
/// <param name="group_to_user">group to user relation</param>
/// <param name="candidate_items">a collection of integers with all candidate items</param>
/// <param name="ignore_overlap">if true, ignore items that appear for a group in the training set when evaluating for that group</param>
/// <returns>a dictionary containing the evaluation results</returns>
/// <exception cref="InvalidOperationException">if the recommender does not rank all candidate items</exception>
public static ItemRecommendationEvaluationResults Evaluate(
	this GroupRecommender recommender,
	IPosOnlyFeedback test,
	IPosOnlyFeedback train,
	IBooleanMatrix group_to_user,
	ICollection<int> candidate_items,
	bool ignore_overlap = true)
{
	var result = new ItemRecommendationEvaluationResults();

	int num_groups = 0;

	foreach (int group_id in group_to_user.NonEmptyRowIDs)
	{
		var users = group_to_user.GetEntriesByRow(group_id);

		// the union of the test items of all group members, restricted to the candidate items
		var correct_items = new HashSet<int>();
		foreach (int user_id in users)
			correct_items.UnionWith(test.UserMatrix[user_id]);
		correct_items.IntersectWith(candidate_items);

		// the union of the training items of all group members, restricted to the candidate items
		var candidate_items_in_train = new HashSet<int>();
		foreach (int user_id in users)
			candidate_items_in_train.UnionWith(train.UserMatrix[user_id]);
		candidate_items_in_train.IntersectWith(candidate_items);

		// use the Count property instead of the LINQ Count() extension method (avoids a full enumeration)
		int num_eval_items = candidate_items.Count - (ignore_overlap ? candidate_items_in_train.Count : 0);

		// skip all groups that have 0 or #candidate_items test items
		if (correct_items.Count == 0)
			continue;
		if (num_eval_items - correct_items.Count == 0)
			continue;

		IList<int> prediction_list = recommender.RankItems(users, candidate_items);
		if (prediction_list.Count != candidate_items.Count)
			// specific exception type instead of the base Exception class
			throw new InvalidOperationException("Not all items have been ranked.");

		var ignore_items = ignore_overlap ? candidate_items_in_train : new HashSet<int>();

		double auc  = AUC.Compute(prediction_list, correct_items, ignore_items);
		double map  = PrecisionAndRecall.AP(prediction_list, correct_items, ignore_items);
		double ndcg = NDCG.Compute(prediction_list, correct_items, ignore_items);
		double rr   = ReciprocalRank.Compute(prediction_list, correct_items, ignore_items);
		var positions = new int[] { 5, 10 };
		var prec   = PrecisionAndRecall.PrecisionAt(prediction_list, correct_items, ignore_items, positions);
		var recall = PrecisionAndRecall.RecallAt(prediction_list, correct_items, ignore_items, positions);

		// thread-safe incrementing
		lock (result)
		{
			num_groups++;

			result["AUC"]       += (float) auc;
			result["MAP"]       += (float) map;
			result["NDCG"]      += (float) ndcg;
			result["MRR"]       += (float) rr;
			result["prec@5"]    += (float) prec[5];
			result["prec@10"]   += (float) prec[10];
			result["recall@5"]  += (float) recall[5];
			result["recall@10"] += (float) recall[10];
		}

		// progress indicator on stderr
		if (num_groups % 1000 == 0)
			Console.Error.Write(".");
		if (num_groups % 60000 == 0)
			Console.Error.WriteLine();
	}

	result["num_groups"] = num_groups;
	result["num_lists"]  = num_groups;
	result["num_items"]  = candidate_items.Count;

	return result;
}