/// <summary>
/// Takes the two specified documents and creates two new document vectors defined over the
/// combined term set of both matrices. A term that is absent from a document's own matrix
/// gets the value 0.0 in that document's row.
/// Row 0: document 1
/// Row 1: document 2
/// Terms are ordered as in the term index of <paramref name="matrix1"/>, followed by the terms
/// that exist only in <paramref name="matrix2"/>.
/// </summary>
/// <param name="matrix1">document1 container</param>
/// <param name="document1">document1 index</param>
/// <param name="matrix2">document2 container</param>
/// <param name="document2">document2 index</param>
/// <returns>New term-by-document matrix containing the two documents and their term maps</returns>
/// <exception cref="System.ArgumentNullException">
/// <paramref name="matrix1"/> or <paramref name="matrix2"/> is null.
/// </exception>
/// <exception cref="System.ArgumentException">
/// Both documents have the same name, so they cannot both be registered in the document lookup.
/// </exception>
public static TermDocumentMatrix EqualizeDocuments(TermDocumentMatrix matrix1, int document1, TermDocumentMatrix matrix2, int document2)
{
    // Fail fast with a descriptive exception instead of a NullReferenceException further down.
    if (matrix1 == null)
    {
        throw new System.ArgumentNullException("matrix1");
    }
    if (matrix2 == null)
    {
        throw new System.ArgumentNullException("matrix2");
    }

    // Resolve each document name once; it is needed for both the index list and the lookup.
    string name1 = matrix1.GetDocumentName(document1);
    string name2 = matrix2.GetDocumentName(document2);

    // initialize new TermDocumentMatrix
    TermDocumentMatrix newmatrix = new TermDocumentMatrix();
    newmatrix._matrix = new double[2][];
    newmatrix._termIndex = new List<string>();
    newmatrix._termIndexLookup = new Dictionary<string, int>();
    newmatrix._docIndex = new List<string>();
    newmatrix._docIndexLookup = new Dictionary<string, int>();
    newmatrix._docIndex.Add(name1);
    newmatrix._docIndexLookup.Add(name1, newmatrix._docIndex.Count - 1);
    newmatrix._docIndex.Add(name2);
    // Dictionary.Add throws ArgumentException here when name2 equals name1.
    newmatrix._docIndexLookup.Add(name2, newmatrix._docIndex.Count - 1);

    List<double> doc1 = new List<double>();
    List<double> doc2 = new List<double>();

    // compute total term set
    // "leftovers" starts as every term of matrix2 and is whittled down to the matrix2-only terms.
    Dictionary<string, int> leftovers = new Dictionary<string, int>(matrix2._termIndexLookup);
    foreach (string term in matrix1._termIndex)
    {
        newmatrix._termIndex.Add(term);
        newmatrix._termIndexLookup.Add(term, newmatrix._termIndex.Count - 1);
        doc1.Add(matrix1.GetValue(document1, matrix1.GetTermIndex(term)));
        if (matrix2._termIndexLookup.ContainsKey(term))
        {
            // shared term: take document 2's real value and mark the term as handled
            leftovers.Remove(term);
            doc2.Add(matrix2.GetValue(document2, matrix2.GetTermIndex(term)));
        }
        else
        {
            // term unknown to matrix2
            doc2.Add(0.0);
        }
    }

    // terms that only matrix2 knows about: document 1 gets 0.0
    foreach (string term in leftovers.Keys)
    {
        newmatrix._termIndex.Add(term);
        newmatrix._termIndexLookup.Add(term, newmatrix._termIndex.Count - 1);
        doc1.Add(0.0);
        doc2.Add(matrix2.GetValue(document2, matrix2.GetTermIndex(term)));
    }

    newmatrix._matrix[0] = doc1.ToArray();
    newmatrix._matrix[1] = doc2.ToArray();
    return newmatrix;
}
/// <summary>
/// Extracts one document from each matrix and re-expresses both over the union of the two
/// matrices' terms, producing a new two-row matrix. Wherever a term does not exist in a
/// document's source matrix, that document's entry is 0.0.
/// Row 0: document 1
/// Row 1: document 2
/// The term order is: all terms of <paramref name="matrix1"/> in their existing order, then the
/// terms that occur only in <paramref name="matrix2"/>.
/// </summary>
/// <param name="matrix1">document1 container</param>
/// <param name="document1">document1 index</param>
/// <param name="matrix2">document2 container</param>
/// <param name="document2">document2 index</param>
/// <returns>New term-by-document matrix containing the two documents and their term maps</returns>
/// <exception cref="System.ArgumentNullException">
/// <paramref name="matrix1"/> or <paramref name="matrix2"/> is null.
/// </exception>
/// <exception cref="System.ArgumentException">
/// The two documents share the same name and therefore cannot both be added to the document lookup.
/// </exception>
public static TermDocumentMatrix EqualizeDocuments(TermDocumentMatrix matrix1, int document1, TermDocumentMatrix matrix2, int document2)
{
    // Validate arguments up front so callers get a named-parameter error rather than a
    // NullReferenceException from the middle of the method.
    if (matrix1 == null)
    {
        throw new System.ArgumentNullException("matrix1");
    }
    if (matrix2 == null)
    {
        throw new System.ArgumentNullException("matrix2");
    }

    // Look up each document name a single time and reuse it below.
    string firstName = matrix1.GetDocumentName(document1);
    string secondName = matrix2.GetDocumentName(document2);

    // initialize new TermDocumentMatrix
    TermDocumentMatrix newmatrix = new TermDocumentMatrix();
    newmatrix._matrix = new double[2][];
    newmatrix._termIndex = new List<string>();
    newmatrix._termIndexLookup = new Dictionary<string, int>();
    newmatrix._docIndex = new List<string>();
    newmatrix._docIndexLookup = new Dictionary<string, int>();

    // register document 1 as row 0 and document 2 as row 1
    newmatrix._docIndex.Add(firstName);
    newmatrix._docIndexLookup.Add(firstName, newmatrix._docIndex.Count - 1);
    newmatrix._docIndex.Add(secondName);
    // Throws ArgumentException (duplicate key) if both documents carry the same name.
    newmatrix._docIndexLookup.Add(secondName, newmatrix._docIndex.Count - 1);

    List<double> doc1 = new List<double>();
    List<double> doc2 = new List<double>();

    // compute total term set
    // Copy of matrix2's term lookup; every term also present in matrix1 is removed from it,
    // leaving only the terms exclusive to matrix2.
    Dictionary<string, int> leftovers = new Dictionary<string, int>(matrix2._termIndexLookup);

    foreach (string term in matrix1._termIndex)
    {
        newmatrix._termIndex.Add(term);
        newmatrix._termIndexLookup.Add(term, newmatrix._termIndex.Count - 1);
        doc1.Add(matrix1.GetValue(document1, matrix1.GetTermIndex(term)));

        if (matrix2._termIndexLookup.ContainsKey(term))
        {
            leftovers.Remove(term);
            doc2.Add(matrix2.GetValue(document2, matrix2.GetTermIndex(term)));
        }
        else
        {
            // matrix2 has no such term, so document 2 contributes nothing for it
            doc2.Add(0.0);
        }
    }

    // append the matrix2-only terms; document 1 has no value for any of them
    foreach (string term in leftovers.Keys)
    {
        newmatrix._termIndex.Add(term);
        newmatrix._termIndexLookup.Add(term, newmatrix._termIndex.Count - 1);
        doc1.Add(0.0);
        doc2.Add(matrix2.GetValue(document2, matrix2.GetTermIndex(term)));
    }

    newmatrix._matrix[0] = doc1.ToArray();
    newmatrix._matrix[1] = doc2.ToArray();
    return newmatrix;
}
/// <summary>
/// Recreates each matrix so that both are defined over the same combined term set; terms a
/// matrix did not originally contain are filled with 0.0 for all of its documents.
/// List[0] : matrix 1
/// List[1] : matrix 2
/// Both results use the same term order: the terms of <paramref name="matrix1"/> in their
/// existing order, followed by the terms that occur only in <paramref name="matrix2"/>.
/// The input matrices are not modified; document indexes and lookups are copied.
/// </summary>
/// <param name="matrix1">First term-by-document matrix</param>
/// <param name="matrix2">Second term-by-document matrix</param>
/// <returns>Copies of original matrices with missing terms from each</returns>
/// <exception cref="System.ArgumentNullException">
/// <paramref name="matrix1"/> or <paramref name="matrix2"/> is null.
/// </exception>
public static List<TermDocumentMatrix> Equalize(TermDocumentMatrix matrix1, TermDocumentMatrix matrix2)
{
    // Fail fast with a descriptive exception instead of a NullReferenceException further down.
    if (matrix1 == null)
    {
        throw new System.ArgumentNullException("matrix1");
    }
    if (matrix2 == null)
    {
        throw new System.ArgumentNullException("matrix2");
    }

    // initialize matrices
    List<TermDocumentMatrix> matrices = new List<TermDocumentMatrix>();

    // matrix 1
    TermDocumentMatrix result1 = new TermDocumentMatrix();
    result1._matrix = new double[matrix1.NumDocs][];
    result1._docIndex = new List<string>(matrix1._docIndex);
    result1._docIndexLookup = new Dictionary<string, int>(matrix1._docIndexLookup);
    matrices.Add(result1);

    // matrix 2
    TermDocumentMatrix result2 = new TermDocumentMatrix();
    result2._matrix = new double[matrix2.NumDocs][];
    result2._docIndex = new List<string>(matrix2._docIndex);
    result2._docIndexLookup = new Dictionary<string, int>(matrix2._docIndexLookup);
    matrices.Add(result2);

    // compute term set
    List<string> termIndex = new List<string>();
    Dictionary<string, int> termIndexLookup = new Dictionary<string, int>();
    Dictionary<string, int> leftovers = new Dictionary<string, int>(matrix2._termIndexLookup);

    // get all terms in first matrix
    foreach (string term in matrix1._termIndex)
    {
        termIndex.Add(term);
        termIndexLookup.Add(term, termIndex.Count - 1);
        // Remove is a no-op for keys that are not present, so no ContainsKey check is needed.
        leftovers.Remove(term);
    }

    // add leftovers (terms only matrix2 has)
    foreach (string term in leftovers.Keys)
    {
        termIndex.Add(term);
        termIndexLookup.Add(term, termIndex.Count - 1);
    }

    // create new term distributions for each document
    // matrix 1
    result1._termIndex = new List<string>(termIndex);
    result1._termIndexLookup = new Dictionary<string, int>(termIndexLookup);
    for (int i = 0; i < result1.NumDocs; i++)
    {
        // A new double[] is zero-initialized, so the columns past matrix1.NumTerms
        // (the terms missing from matrix1) already hold 0.0.
        result1._matrix[i] = new double[termIndex.Count];

        // fill in original values; matrix1's terms occupy the leading columns in the same order
        for (int j = 0; j < matrix1.NumTerms; j++)
        {
            result1[i, j] = matrix1[i, j];
        }
    }

    // matrix 2
    result2._termIndex = new List<string>(termIndex);
    result2._termIndexLookup = new Dictionary<string, int>(termIndexLookup);
    for (int i = 0; i < result2.NumDocs; i++)
    {
        result2._matrix[i] = new double[termIndex.Count];

        // The document name does not depend on the term, so resolve it once per row.
        string documentName = matrix2.GetDocumentName(i);

        // fill in values; columns for terms matrix2 lacks keep their default 0.0
        for (int j = 0; j < termIndex.Count; j++)
        {
            string term = termIndex[j];
            if (matrix2.ContainsTerm(term))
            {
                result2[i, j] = matrix2.GetValue(documentName, term);
            }
        }
    }

    // return
    return matrices;
}
/// <summary>
/// Builds copies of both matrices that share one combined term set. Any term that a matrix
/// did not originally contain is present in its copy with the value 0.0 for every document.
/// List[0] : matrix 1
/// List[1] : matrix 2
/// The shared term order is: every term of <paramref name="matrix1"/> in its existing order,
/// then the terms found only in <paramref name="matrix2"/>.
/// Neither input matrix is modified; document indexes and lookups are copied into the results.
/// </summary>
/// <param name="matrix1">First term-by-document matrix</param>
/// <param name="matrix2">Second term-by-document matrix</param>
/// <returns>Copies of original matrices with missing terms from each</returns>
/// <exception cref="System.ArgumentNullException">
/// <paramref name="matrix1"/> or <paramref name="matrix2"/> is null.
/// </exception>
public static List<TermDocumentMatrix> Equalize(TermDocumentMatrix matrix1, TermDocumentMatrix matrix2)
{
    // Validate arguments up front so callers get a named-parameter error rather than a
    // NullReferenceException from the middle of the method.
    if (matrix1 == null)
    {
        throw new System.ArgumentNullException("matrix1");
    }
    if (matrix2 == null)
    {
        throw new System.ArgumentNullException("matrix2");
    }

    // initialize matrices
    TermDocumentMatrix equalized1 = new TermDocumentMatrix();
    equalized1._matrix = new double[matrix1.NumDocs][];
    equalized1._docIndex = new List<string>(matrix1._docIndex);
    equalized1._docIndexLookup = new Dictionary<string, int>(matrix1._docIndexLookup);

    TermDocumentMatrix equalized2 = new TermDocumentMatrix();
    equalized2._matrix = new double[matrix2.NumDocs][];
    equalized2._docIndex = new List<string>(matrix2._docIndex);
    equalized2._docIndexLookup = new Dictionary<string, int>(matrix2._docIndexLookup);

    List<TermDocumentMatrix> matrices = new List<TermDocumentMatrix>();
    matrices.Add(equalized1);
    matrices.Add(equalized2);

    // compute term set
    List<string> termIndex = new List<string>();
    Dictionary<string, int> termIndexLookup = new Dictionary<string, int>();
    // Starts as all of matrix2's terms; the ones matrix1 also has are removed below.
    Dictionary<string, int> leftovers = new Dictionary<string, int>(matrix2._termIndexLookup);

    // get all terms in first matrix
    foreach (string term in matrix1._termIndex)
    {
        termIndex.Add(term);
        termIndexLookup.Add(term, termIndex.Count - 1);
        // Dictionary.Remove simply returns false for a missing key, so it is safe to call
        // without checking for the term first.
        leftovers.Remove(term);
    }

    // add leftovers
    foreach (string term in leftovers.Keys)
    {
        termIndex.Add(term);
        termIndexLookup.Add(term, termIndex.Count - 1);
    }

    int totalTerms = termIndex.Count;

    // create new term distributions for each document
    // matrix 1
    equalized1._termIndex = new List<string>(termIndex);
    equalized1._termIndexLookup = new Dictionary<string, int>(termIndexLookup);
    for (int i = 0; i < equalized1.NumDocs; i++)
    {
        // Newly allocated rows are all 0.0, which is already the right value for the
        // trailing columns that belong to terms matrix1 never had.
        equalized1._matrix[i] = new double[totalTerms];

        // fill in original values (matrix1's terms keep their original column positions)
        for (int j = 0; j < matrix1.NumTerms; j++)
        {
            equalized1[i, j] = matrix1[i, j];
        }
    }

    // matrix 2
    equalized2._termIndex = new List<string>(termIndex);
    equalized2._termIndexLookup = new Dictionary<string, int>(termIndexLookup);
    for (int i = 0; i < equalized2.NumDocs; i++)
    {
        equalized2._matrix[i] = new double[totalTerms];

        // Hoisted out of the term loop: the name is the same for every column of this row.
        string docName = matrix2.GetDocumentName(i);

        // fill in values; terms matrix2 does not contain stay at the default 0.0
        for (int j = 0; j < totalTerms; j++)
        {
            string term = termIndex[j];
            if (matrix2.ContainsTerm(term))
            {
                equalized2[i, j] = matrix2.GetValue(docName, term);
            }
        }
    }

    // return
    return matrices;
}