/// <summary>
/// Projects one document from each input matrix onto a shared term list so the two
/// vectors can be compared position by position. A term that is not indexed by the
/// other document's matrix is filled in with a weight of 0.0.
/// Row 0: document 1
/// Row 1: document 2
/// </summary>
/// <param name="matrix1">Matrix that holds document 1</param>
/// <param name="document1">Index of document 1 inside <paramref name="matrix1"/></param>
/// <param name="matrix2">Matrix that holds document 2</param>
/// <param name="document2">Index of document 2 inside <paramref name="matrix2"/></param>
/// <returns>
/// A new two-row matrix whose columns are every term of <paramref name="matrix1"/> (in its
/// term-index order) followed by the terms indexed only by <paramref name="matrix2"/>.
/// </returns>
public static TermDocumentMatrix EqualizeDocuments(TermDocumentMatrix matrix1, int document1, TermDocumentMatrix matrix2, int document2)
{
    // Start from an empty result with fresh index structures.
    TermDocumentMatrix result = new TermDocumentMatrix();
    result._matrix = new double[2][];
    result._termIndex = new List<string>();
    result._termIndexLookup = new Dictionary<string, int>();
    result._docIndex = new List<string>();
    result._docIndexLookup = new Dictionary<string, int>();

    // Register the two documents. Dictionary.Add throws on a duplicate key, so both
    // documents are expected to carry distinct names.
    string firstName = matrix1.GetDocumentName(document1);
    result._docIndex.Add(firstName);
    result._docIndexLookup.Add(firstName, result._docIndex.Count - 1);

    string secondName = matrix2.GetDocumentName(document2);
    result._docIndex.Add(secondName);
    result._docIndexLookup.Add(secondName, result._docIndex.Count - 1);

    List<double> firstRow = new List<double>();
    List<double> secondRow = new List<double>();

    // Working copy of matrix2's terms; whatever is still in here after the first pass
    // is indexed by matrix2 only.
    Dictionary<string, int> onlyInSecond = new Dictionary<string, int>(matrix2._termIndexLookup);

    // Pass 1: every term of matrix1, in matrix1's order.
    foreach (string term in matrix1._termIndex)
    {
        result._termIndex.Add(term);
        result._termIndexLookup.Add(term, result._termIndex.Count - 1);
        firstRow.Add(matrix1.GetValue(document1, matrix1.GetTermIndex(term)));

        if (!matrix2._termIndexLookup.ContainsKey(term))
        {
            // matrix2 has no column for this term.
            secondRow.Add(0.0);
            continue;
        }

        onlyInSecond.Remove(term);
        secondRow.Add(matrix2.GetValue(document2, matrix2.GetTermIndex(term)));
    }

    // Pass 2: the terms matrix1 does not index; document 1 gets 0.0 for each of them.
    foreach (string term in onlyInSecond.Keys)
    {
        result._termIndex.Add(term);
        result._termIndexLookup.Add(term, result._termIndex.Count - 1);
        firstRow.Add(0.0);
        secondRow.Add(matrix2.GetValue(document2, matrix2.GetTermIndex(term)));
    }

    result._matrix[0] = firstRow.ToArray();
    result._matrix[1] = secondRow.ToArray();
    return result;
}
/// <summary>
/// Creates a two-document matrix in which both document vectors span the same term set:
/// the union of the terms indexed by <paramref name="matrix1"/> and <paramref name="matrix2"/>.
/// Terms missing from one side's matrix receive the value 0.0 in that side's row.
/// Row 0: document 1
/// Row 1: document 2
/// </summary>
/// <param name="matrix1">Container of document 1</param>
/// <param name="document1">Document 1 index within <paramref name="matrix1"/></param>
/// <param name="matrix2">Container of document 2</param>
/// <param name="document2">Document 2 index within <paramref name="matrix2"/></param>
/// <returns>New term-by-document matrix holding the two aligned documents together with its term and document maps</returns>
public static TermDocumentMatrix EqualizeDocuments(TermDocumentMatrix matrix1, int document1, TermDocumentMatrix matrix2, int document2)
{
    TermDocumentMatrix combined = new TermDocumentMatrix();
    combined._matrix = new double[2][];
    combined._termIndex = new List<string>();
    combined._termIndexLookup = new Dictionary<string, int>();
    combined._docIndex = new List<string>();
    combined._docIndexLookup = new Dictionary<string, int>();

    // Document names become rows 0 and 1. The lookup uses Add, which rejects a
    // repeated key, so the two names must differ.
    string nameA = matrix1.GetDocumentName(document1);
    int rowA = combined._docIndex.Count;
    combined._docIndex.Add(nameA);
    combined._docIndexLookup.Add(nameA, rowA);

    string nameB = matrix2.GetDocumentName(document2);
    int rowB = combined._docIndex.Count;
    combined._docIndex.Add(nameB);
    combined._docIndexLookup.Add(nameB, rowB);

    List<double> valuesA = new List<double>();
    List<double> valuesB = new List<double>();

    // Copy of matrix2's term lookup; shared terms are struck out as they are seen,
    // leaving only the terms that matrix1 does not index.
    Dictionary<string, int> remaining = new Dictionary<string, int>(matrix2._termIndexLookup);

    // Columns for all of matrix1's terms come first, in matrix1's term order.
    for (int i = 0; i < matrix1._termIndex.Count; i++)
    {
        string term = matrix1._termIndex[i];

        int column = combined._termIndex.Count;
        combined._termIndex.Add(term);
        combined._termIndexLookup.Add(term, column);

        valuesA.Add(matrix1.GetValue(document1, matrix1.GetTermIndex(term)));

        if (matrix2._termIndexLookup.ContainsKey(term))
        {
            remaining.Remove(term);
            valuesB.Add(matrix2.GetValue(document2, matrix2.GetTermIndex(term)));
        }
        else
        {
            valuesB.Add(0.0);
        }
    }

    // Then one column per term that only matrix2 indexes; document 1 is padded with 0.0.
    foreach (KeyValuePair<string, int> entry in remaining)
    {
        string term = entry.Key;

        int column = combined._termIndex.Count;
        combined._termIndex.Add(term);
        combined._termIndexLookup.Add(term, column);

        valuesA.Add(0.0);
        valuesB.Add(matrix2.GetValue(document2, matrix2.GetTermIndex(term)));
    }

    combined._matrix[0] = valuesA.ToArray();
    combined._matrix[1] = valuesB.ToArray();
    return combined;
}