/// <summary>
/// Combines two TermDocumentMatrices into one matrix whose term set is the union of both.
/// The first matrix is put in first (ie. newmatrix[0] = matrix1[0]); the documents of
/// matrix2 follow in their original order.
/// </summary>
/// <param name="matrix1">First matrix</param>
/// <param name="matrix2">Second matrix</param>
/// <returns>
/// A new matrix holding every document of both inputs. Terms of matrix1 keep their
/// positions, terms found only in matrix2 are appended after them, and a cell is 0.0
/// when the source matrix of that document does not contain the term.
/// </returns>
/// <exception cref="System.ArgumentNullException">matrix1 or matrix2 is null</exception>
/// <exception cref="System.ArgumentException">
/// Raised by the document lookup dictionary when a document name of matrix2 is already present.
/// </exception>
public static TermDocumentMatrix Combine(TermDocumentMatrix matrix1, TermDocumentMatrix matrix2)
{
    if (matrix1 == null)
    {
        throw new System.ArgumentNullException("matrix1");
    }
    if (matrix2 == null)
    {
        throw new System.ArgumentNullException("matrix2");
    }

    TermDocumentMatrix combined = new TermDocumentMatrix();

    // add documents: matrix1 first, then matrix2 appended in order
    combined._docIndex = new List<string>(matrix1._docIndex);
    combined._docIndexLookup = new Dictionary<string, int>(matrix1._docIndexLookup);
    foreach (string doc in matrix2.DocMap)
    {
        combined._docIndex.Add(doc);
        combined._docIndexLookup.Add(doc, combined.NumDocs - 1);
    }

    // calculate union of terms: matrix1's terms keep their positions, new ones go last
    combined._termIndex = new List<string>(matrix1._termIndex);
    combined._termIndexLookup = new Dictionary<string, int>(matrix1._termIndexLookup);
    foreach (string term in matrix2.TermMap)
    {
        if (!combined._termIndexLookup.ContainsKey(term))
        {
            combined._termIndex.Add(term);
            combined._termIndexLookup.Add(term, combined.NumTerms - 1);
        }
    }

    int termCount = combined.NumTerms;
    int firstDocCount = matrix1.NumDocs;
    int totalDocCount = combined.NumDocs;

    // Where each combined term lives in either source matrix (-1 = absent).
    // This depends only on the term, so it is resolved once here instead of
    // once per (document, term) cell.
    int[] columnInMatrix1 = new int[termCount];
    int[] columnInMatrix2 = new int[termCount];
    for (int j = 0; j < termCount; j++)
    {
        string term = combined.TermMap[j];
        columnInMatrix1[j] = matrix1.ContainsTerm(term) ? matrix1.GetTermIndex(term) : -1;
        columnInMatrix2[j] = matrix2.ContainsTerm(term) ? matrix2.GetTermIndex(term) : -1;
    }

    // create and populate matrix
    combined._matrix = new double[totalDocCount][];

    // matrix1
    for (int i = 0; i < firstDocCount; i++)
    {
        combined._matrix[i] = new double[termCount];
        for (int j = 0; j < termCount; j++)
        {
            int source = columnInMatrix1[j];
            combined[i, j] = (source >= 0) ? matrix1[i, source] : 0.0;
        }
    }

    // matrix2: rows are offset by the number of documents in matrix1
    for (int i = firstDocCount; i < totalDocCount; i++)
    {
        combined._matrix[i] = new double[termCount];
        for (int j = 0; j < termCount; j++)
        {
            int source = columnInMatrix2[j];
            combined[i, j] = (source >= 0) ? matrix2[i - firstDocCount, source] : 0.0;
        }
    }

    return combined;
}
/// <summary>
/// Recreates each matrix over the union of both term sets, so that both copies
/// share the same terms in the same column order.
/// List[0] : matrix 1
/// List[1] : matrix 2
/// </summary>
/// <param name="matrix1">First term-by-document matrix</param>
/// <param name="matrix2">Second term-by-document matrix</param>
/// <returns>
/// Copies of original matrices with missing terms from each. Terms of matrix1 come
/// first in their original order, followed by the terms found only in matrix2 in
/// matrix2's order; a term a matrix did not originally contain is 0.0 in every document.
/// </returns>
/// <exception cref="System.ArgumentNullException">matrix1 or matrix2 is null</exception>
public static List<TermDocumentMatrix> Equalize(TermDocumentMatrix matrix1, TermDocumentMatrix matrix2)
{
    if (matrix1 == null)
    {
        throw new System.ArgumentNullException("matrix1");
    }
    if (matrix2 == null)
    {
        throw new System.ArgumentNullException("matrix2");
    }

    // initialize matrices
    List<TermDocumentMatrix> matrices = new List<TermDocumentMatrix>();

    // matrix 1
    TermDocumentMatrix first = new TermDocumentMatrix();
    matrices.Add(first);
    first._matrix = new double[matrix1.NumDocs][];
    first._docIndex = new List<string>(matrix1._docIndex);
    first._docIndexLookup = new Dictionary<string, int>(matrix1._docIndexLookup);

    // matrix 2
    TermDocumentMatrix second = new TermDocumentMatrix();
    matrices.Add(second);
    second._matrix = new double[matrix2.NumDocs][];
    second._docIndex = new List<string>(matrix2._docIndex);
    second._docIndexLookup = new Dictionary<string, int>(matrix2._docIndexLookup);

    // compute term set
    List<string> termIndex = new List<string>();
    Dictionary<string, int> termIndexLookup = new Dictionary<string, int>();

    // get all terms in first matrix
    foreach (string term in matrix1._termIndex)
    {
        termIndex.Add(term);
        termIndexLookup.Add(term, termIndex.Count - 1);
    }

    // Append the terms only matrix2 has. Walking matrix2's term list (rather than
    // the keys of a dictionary, whose enumeration order is unspecified) keeps the
    // resulting column order deterministic.
    foreach (string term in matrix2._termIndex)
    {
        if (!termIndexLookup.ContainsKey(term))
        {
            termIndex.Add(term);
            termIndexLookup.Add(term, termIndex.Count - 1);
        }
    }

    int termCount = termIndex.Count;

    // create new term distributions for each document
    // matrix 1
    first._termIndex = new List<string>(termIndex);
    first._termIndexLookup = new Dictionary<string, int>(termIndexLookup);
    int originalTermCount = matrix1.NumTerms;
    for (int i = 0; i < first.NumDocs; i++)
    {
        first._matrix[i] = new double[termCount];
        // fill in original values; the union starts with matrix1's terms in their
        // original order, so columns are copied by position
        // NOTE(review): assumes matrix1.NumTerms equals the length of matrix1._termIndex - confirm
        for (int j = 0; j < originalTermCount; j++)
        {
            first[i, j] = matrix1[i, j];
        }
        // fill in missing terms
        for (int j = originalTermCount; j < termCount; j++)
        {
            first[i, j] = 0.0;
        }
    }

    // matrix 2
    second._termIndex = new List<string>(termIndex);
    second._termIndexLookup = new Dictionary<string, int>(termIndexLookup);

    // Whether matrix2 knows a term does not depend on the document, so it is
    // resolved once per term instead of once per cell.
    bool[] knownToMatrix2 = new bool[termCount];
    for (int j = 0; j < termCount; j++)
    {
        knownToMatrix2[j] = matrix2.ContainsTerm(termIndex[j]);
    }

    for (int i = 0; i < second.NumDocs; i++)
    {
        second._matrix[i] = new double[termCount];
        // the document name is the same for the whole row
        string documentName = matrix2.GetDocumentName(i);
        // fill in values
        for (int j = 0; j < termCount; j++)
        {
            second[i, j] = knownToMatrix2[j] ? matrix2.GetValue(documentName, termIndex[j]) : 0.0;
        }
    }

    // return
    return matrices;
}
/// <summary>
/// Recreates each matrix over the union of both term sets, so that both copies
/// share the same terms in the same column order.
/// List[0] : matrix 1
/// List[1] : matrix 2
/// </summary>
/// <param name="matrix1">First term-by-document matrix</param>
/// <param name="matrix2">Second term-by-document matrix</param>
/// <returns>
/// Copies of original matrices with missing terms from each. Terms of matrix1 come
/// first in their original order, followed by the terms found only in matrix2 in
/// matrix2's order; a term a matrix did not originally contain is 0.0 in every document.
/// </returns>
/// <exception cref="System.ArgumentNullException">matrix1 or matrix2 is null</exception>
public static List<TermDocumentMatrix> Equalize(TermDocumentMatrix matrix1, TermDocumentMatrix matrix2)
{
    if (matrix1 == null)
    {
        throw new System.ArgumentNullException("matrix1");
    }
    if (matrix2 == null)
    {
        throw new System.ArgumentNullException("matrix2");
    }

    // initialize matrices
    List<TermDocumentMatrix> matrices = new List<TermDocumentMatrix>();

    // matrix 1
    TermDocumentMatrix left = new TermDocumentMatrix();
    matrices.Add(left);
    left._matrix = new double[matrix1.NumDocs][];
    left._docIndex = new List<string>(matrix1._docIndex);
    left._docIndexLookup = new Dictionary<string, int>(matrix1._docIndexLookup);

    // matrix 2
    TermDocumentMatrix right = new TermDocumentMatrix();
    matrices.Add(right);
    right._matrix = new double[matrix2.NumDocs][];
    right._docIndex = new List<string>(matrix2._docIndex);
    right._docIndexLookup = new Dictionary<string, int>(matrix2._docIndexLookup);

    // compute term set
    List<string> termIndex = new List<string>();
    Dictionary<string, int> termIndexLookup = new Dictionary<string, int>();

    // get all terms in first matrix
    foreach (string term in matrix1._termIndex)
    {
        termIndex.Add(term);
        termIndexLookup.Add(term, termIndex.Count - 1);
    }

    // Add the terms that only the second matrix has. Iterating matrix2's ordered
    // term list instead of a dictionary's key collection (whose enumeration order
    // is unspecified) makes the column order of the result deterministic.
    foreach (string term in matrix2._termIndex)
    {
        if (!termIndexLookup.ContainsKey(term))
        {
            termIndex.Add(term);
            termIndexLookup.Add(term, termIndex.Count - 1);
        }
    }

    int unionSize = termIndex.Count;

    // create new term distributions for each document
    // matrix 1
    left._termIndex = new List<string>(termIndex);
    left._termIndexLookup = new Dictionary<string, int>(termIndexLookup);
    int leftTermCount = matrix1.NumTerms;
    for (int i = 0; i < left.NumDocs; i++)
    {
        left._matrix[i] = new double[unionSize];
        // fill in original values; matrix1's terms open the union in their original
        // order, so a positional copy lines the columns up
        // NOTE(review): assumes matrix1.NumTerms equals the length of matrix1._termIndex - confirm
        for (int j = 0; j < leftTermCount; j++)
        {
            left[i, j] = matrix1[i, j];
        }
        // fill in missing terms
        for (int j = leftTermCount; j < unionSize; j++)
        {
            left[i, j] = 0.0;
        }
    }

    // matrix 2
    right._termIndex = new List<string>(termIndex);
    right._termIndexLookup = new Dictionary<string, int>(termIndexLookup);

    // Term membership in matrix2 is the same for every document: look it up once
    // per term rather than once per cell.
    bool[] presentInRight = new bool[unionSize];
    for (int j = 0; j < unionSize; j++)
    {
        presentInRight[j] = matrix2.ContainsTerm(termIndex[j]);
    }

    for (int i = 0; i < right.NumDocs; i++)
    {
        right._matrix[i] = new double[unionSize];
        // one name lookup per row instead of one per cell
        string rowName = matrix2.GetDocumentName(i);
        // fill in values
        for (int j = 0; j < unionSize; j++)
        {
            right[i, j] = presentInRight[j] ? matrix2.GetValue(rowName, termIndex[j]) : 0.0;
        }
    }

    // return
    return matrices;
}
/// <summary>
/// Combines two TermDocumentMatrices into one matrix whose term set is the union of both.
/// The first matrix is put in first (ie. newmatrix[0] = matrix1[0]); the documents of
/// matrix2 follow in their original order.
/// </summary>
/// <param name="matrix1">First matrix</param>
/// <param name="matrix2">Second matrix</param>
/// <returns>
/// A new matrix holding every document of both inputs. Terms of matrix1 keep their
/// positions, terms found only in matrix2 are appended after them, and a cell is 0.0
/// when the source matrix of that document does not contain the term.
/// </returns>
/// <exception cref="System.ArgumentNullException">matrix1 or matrix2 is null</exception>
/// <exception cref="System.ArgumentException">
/// Raised by the document lookup dictionary when a document name of matrix2 is already present.
/// </exception>
public static TermDocumentMatrix Combine(TermDocumentMatrix matrix1, TermDocumentMatrix matrix2)
{
    if (matrix1 == null)
    {
        throw new System.ArgumentNullException("matrix1");
    }
    if (matrix2 == null)
    {
        throw new System.ArgumentNullException("matrix2");
    }

    TermDocumentMatrix combined = new TermDocumentMatrix();

    // add documents: start from matrix1's documents, then append matrix2's
    combined._docIndex = new List<string>(matrix1._docIndex);
    combined._docIndexLookup = new Dictionary<string, int>(matrix1._docIndexLookup);
    foreach (string doc in matrix2.DocMap)
    {
        combined._docIndex.Add(doc);
        combined._docIndexLookup.Add(doc, combined.NumDocs - 1);
    }

    // calculate union of terms: start from matrix1's terms, append unseen ones
    combined._termIndex = new List<string>(matrix1._termIndex);
    combined._termIndexLookup = new Dictionary<string, int>(matrix1._termIndexLookup);
    foreach (string term in matrix2.TermMap)
    {
        if (!combined._termIndexLookup.ContainsKey(term))
        {
            combined._termIndex.Add(term);
            combined._termIndexLookup.Add(term, combined.NumTerms - 1);
        }
    }

    int unionSize = combined.NumTerms;
    int offset = matrix1.NumDocs;
    int docCount = combined.NumDocs;

    // Column of each combined term inside each source matrix, or -1 when the
    // source does not contain it. The mapping is independent of the document,
    // so computing it up front avoids repeating the lookups for every cell.
    int[] fromMatrix1 = new int[unionSize];
    int[] fromMatrix2 = new int[unionSize];
    for (int j = 0; j < unionSize; j++)
    {
        string term = combined.TermMap[j];
        fromMatrix1[j] = matrix1.ContainsTerm(term) ? matrix1.GetTermIndex(term) : -1;
        fromMatrix2[j] = matrix2.ContainsTerm(term) ? matrix2.GetTermIndex(term) : -1;
    }

    // create and populate matrix
    combined._matrix = new double[docCount][];

    // matrix1
    for (int i = 0; i < offset; i++)
    {
        combined._matrix[i] = new double[unionSize];
        for (int j = 0; j < unionSize; j++)
        {
            int column = fromMatrix1[j];
            combined[i, j] = (column >= 0) ? matrix1[i, column] : 0.0;
        }
    }

    // matrix2: its rows sit after all of matrix1's rows
    for (int i = offset; i < docCount; i++)
    {
        combined._matrix[i] = new double[unionSize];
        for (int j = 0; j < unionSize; j++)
        {
            int column = fromMatrix2[j];
            combined[i, j] = (column >= 0) ? matrix2[i - offset, column] : 0.0;
        }
    }

    return combined;
}