/// <summary>
/// Saves matrix to file
/// </summary>
/// <remarks>
/// Layout written: a header line containing every term from <c>matrix.TermMap</c>, each
/// preceded by <c>TermDocumentMatrix.IODelimeter</c>; then one line per document made of
/// the document name followed by each of its values, each preceded by the delimiter.
/// The file is opened with <c>FileMode.Create</c>.
/// </remarks>
/// <param name="matrix">Term-by-document matrix</param>
/// <param name="filename">File location</param>
public static void Save(TermDocumentMatrix matrix, string filename)
{
    // The using blocks release the file handle even when a write throws part-way,
    // and disposing the writer flushes it, so no explicit Flush/Close is needed.
    using (FileStream stream = File.Open(filename, FileMode.Create))
    using (TextWriter tw = new StreamWriter(stream))
    {
        // print out term list
        foreach (string term in matrix.TermMap)
        {
            tw.Write("{0}{1}", TermDocumentMatrix.IODelimeter, term);
        }
        tw.WriteLine();

        // print out each document
        for (int i = 0; i < matrix.NumDocs; i++)
        {
            tw.Write(matrix.GetDocumentName(i));

            // print out each term
            // NOTE(review): values are formatted with the writer's default format
            // provider - confirm the loader parses numbers under the same culture.
            for (int j = 0; j < matrix.NumTerms; j++)
            {
                tw.Write("{0}{1}", TermDocumentMatrix.IODelimeter, matrix[i, j]);
            }
            tw.WriteLine();
        }
    }
}
/// <summary>
/// Builds a two-row matrix from one document of each input, expressed over the combined
/// term set of both inputs so that the two vectors line up term-for-term.
/// Row 0: document 1
/// Row 1: document 2
/// </summary>
/// <param name="matrix1">document1 container</param>
/// <param name="document1">document1 index</param>
/// <param name="matrix2">document2 container</param>
/// <param name="document2">document2 index</param>
/// <returns>New term-by-document matrix containing the two documents and their term maps</returns>
public static TermDocumentMatrix EqualizeDocuments(TermDocumentMatrix matrix1, int document1, TermDocumentMatrix matrix2, int document2)
{
    // start from an empty result with room for exactly two rows
    TermDocumentMatrix result = new TermDocumentMatrix();
    result._matrix = new double[2][];
    result._termIndex = new List<string>();
    result._termIndexLookup = new Dictionary<string, int>();
    result._docIndex = new List<string>();
    result._docIndexLookup = new Dictionary<string, int>();

    // register the two documents, in order, as rows 0 and 1
    string firstName = matrix1.GetDocumentName(document1);
    result._docIndex.Add(firstName);
    result._docIndexLookup.Add(firstName, result._docIndex.Count - 1);

    string secondName = matrix2.GetDocumentName(document2);
    result._docIndex.Add(secondName);
    result._docIndexLookup.Add(secondName, result._docIndex.Count - 1);

    List<double> firstRow = new List<double>();
    List<double> secondRow = new List<double>();

    // terms of matrix2 that have not been placed yet; shared terms are struck off below
    Dictionary<string, int> remaining = new Dictionary<string, int>(matrix2._termIndexLookup);

    // pass 1: every term of matrix1, in matrix1's order
    foreach (string term in matrix1._termIndex)
    {
        result._termIndex.Add(term);
        result._termIndexLookup.Add(term, result._termIndex.Count - 1);
        firstRow.Add(matrix1.GetValue(document1, matrix1.GetTermIndex(term)));

        if (!matrix2._termIndexLookup.ContainsKey(term))
        {
            // term unknown to matrix2: pad document 2 with zero
            secondRow.Add(0.0);
            continue;
        }

        remaining.Remove(term);
        secondRow.Add(matrix2.GetValue(document2, matrix2.GetTermIndex(term)));
    }

    // pass 2: terms that only matrix2 knows; document 1 gets zero for each
    foreach (string term in remaining.Keys)
    {
        result._termIndex.Add(term);
        result._termIndexLookup.Add(term, result._termIndex.Count - 1);
        firstRow.Add(0.0);
        secondRow.Add(matrix2.GetValue(document2, matrix2.GetTermIndex(term)));
    }

    result._matrix[0] = firstRow.ToArray();
    result._matrix[1] = secondRow.ToArray();
    return result;
}
/// <summary>
/// Scores every document in <paramref name="ids"/> against every document in
/// <paramref name="tfidf"/> using cosine similarity over their equalized term vectors.
/// </summary>
/// <param name="ids">Boolean document vectors</param>
/// <param name="tfidf">tf-idf weighted document vectors</param>
/// <returns>Similarity matrix</returns>
private static TLSimilarityMatrix ComputeSimilarities(TermDocumentMatrix ids, TermDocumentMatrix tfidf)
{
    TLSimilarityMatrix result = new TLSimilarityMatrix();

    // bring both matrices onto one shared term set so column k means the same term in each
    List<TermDocumentMatrix> equalized = TermDocumentMatrix.Equalize(ids, tfidf);
    TermDocumentMatrix source = equalized[0];
    TermDocumentMatrix target = equalized[1];

    for (int src = 0; src < ids.NumDocs; src++)
    {
        TLLinksList ranked = new TLLinksList();

        for (int tgt = 0; tgt < tfidf.NumDocs; tgt++)
        {
            double dot = 0.0;
            double sourceSquares = 0.0;
            double targetSquares = 0.0;

            for (int term = 0; term < source.NumTerms; term++)
            {
                double x = source[src, term];
                double y = target[tgt, term];
                dot += (x * y);
                sourceSquares += Math.Pow(x, 2);
                targetSquares += Math.Pow(y, 2);
            }

            // product of the two vector lengths; zero means at least one vector is all zeros
            double norms = Math.Sqrt(sourceSquares) * Math.Sqrt(targetSquares);
            double score = (norms == 0.0) ? 0.0 : dot / norms;

            ranked.Add(new TLSingleLink(ids.GetDocumentName(src), tfidf.GetDocumentName(tgt), score));
        }

        // links of one source document are sorted before being appended to the result
        ranked.Sort();
        foreach (TLSingleLink entry in ranked)
        {
            result.AddLink(entry.SourceArtifactId, entry.TargetArtifactId, entry.Score);
        }
    }

    return result;
}
/// <summary>
/// Builds a matrix from the imported "input.target.txt" artifacts and compares it with the
/// matrix loaded from "output.target.matrix.txt": document and term counts, document
/// names, row lengths, term names, and every cell value (exact match, delta 0.0).
/// </summary>
public void ConstructorTest_Artifacts()
{
    string data = @"../../Data/SimpleCorpus.";
    TermDocumentMatrix matrix = new TermDocumentMatrix(Artifacts.Import(data + "input.target.txt"));
    TermDocumentMatrix answer = TermDocumentMatrix.Load(data + "output.target.matrix.txt");

    // Assert.AreEqual takes (expected, actual): the loaded answer goes first so that
    // failure messages label the two sides correctly.

    // counts
    Assert.AreEqual(answer.NumDocs, matrix.NumDocs);
    Assert.AreEqual(answer.NumTerms, matrix.NumTerms);

    // matrix
    for (int i = 0; i < answer.NumDocs; i++)
    {
        Assert.AreEqual(answer.GetDocumentName(i), matrix.GetDocumentName(i));
        Assert.AreEqual(answer.NumTerms, matrix.GetDocument(i).Length);

        for (int j = 0; j < answer.NumTerms; j++)
        {
            Assert.AreEqual(answer.GetTermName(j), matrix.GetTermName(j));
            Assert.AreEqual(answer[i, j], matrix[i, j], 0.0);
        }
    }
}
/// <summary>
/// Produces cosine-similarity links from each boolean document vector to each tf-idf
/// weighted document vector, after equalizing both matrices to a shared term set.
/// </summary>
/// <param name="ids">Boolean document vectors</param>
/// <param name="tfidf">tf-idf weighted document vectors</param>
/// <returns>Similarity matrix</returns>
private static TLSimilarityMatrix ComputeSimilarities(TermDocumentMatrix ids, TermDocumentMatrix tfidf)
{
    List<TermDocumentMatrix> aligned = TermDocumentMatrix.Equalize(ids, tfidf);
    TLSimilarityMatrix similarities = new TLSimilarityMatrix();

    for (int row = 0; row < ids.NumDocs; row++)
    {
        TLLinksList rowLinks = new TLLinksList();

        for (int col = 0; col < tfidf.NumDocs; col++)
        {
            // accumulate the dot product and both squared lengths in a single pass
            double numerator = 0.0;
            double sumSquaresLeft = 0.0;
            double sumSquaresRight = 0.0;
            for (int t = 0; t < aligned[0].NumTerms; t++)
            {
                double left = aligned[0][row, t];
                double right = aligned[1][col, t];
                numerator += (left * right);
                sumSquaresLeft += Math.Pow(left, 2);
                sumSquaresRight += Math.Pow(right, 2);
            }

            double denominator = Math.Sqrt(sumSquaresLeft) * Math.Sqrt(sumSquaresRight);

            // a zero denominator would divide by zero, so such pairs keep a score of 0
            double similarity = 0.0;
            if (denominator != 0.0)
            {
                similarity = numerator / denominator;
            }

            string sourceId = ids.GetDocumentName(row);
            string targetId = tfidf.GetDocumentName(col);
            rowLinks.Add(new TLSingleLink(sourceId, targetId, similarity));
        }

        // sort this source document's links, then copy them into the result in that order
        rowLinks.Sort();
        foreach (TLSingleLink item in rowLinks)
        {
            similarities.AddLink(item.SourceArtifactId, item.TargetArtifactId, item.Score);
        }
    }

    return similarities;
}
/// <summary>
/// Extracts one document from each matrix and re-expresses both over the union of the two
/// term lists, filling in zero wherever a document's own matrix lacks the term.
/// Row 0: document 1
/// Row 1: document 2
/// </summary>
/// <param name="matrix1">document1 container</param>
/// <param name="document1">document1 index</param>
/// <param name="matrix2">document2 container</param>
/// <param name="document2">document2 index</param>
/// <returns>New term-by-document matrix containing the two documents and their term maps</returns>
public static TermDocumentMatrix EqualizeDocuments(TermDocumentMatrix matrix1, int document1, TermDocumentMatrix matrix2, int document2)
{
    TermDocumentMatrix combined = new TermDocumentMatrix();
    combined._matrix = new double[2][];
    combined._termIndex = new List<string>();
    combined._termIndexLookup = new Dictionary<string, int>();
    combined._docIndex = new List<string>();
    combined._docIndexLookup = new Dictionary<string, int>();

    // document names become the row labels, each mapped to the position it was added at
    combined._docIndex.Add(matrix1.GetDocumentName(document1));
    int rowOfFirst = combined._docIndex.Count - 1;
    combined._docIndexLookup.Add(matrix1.GetDocumentName(document1), rowOfFirst);

    combined._docIndex.Add(matrix2.GetDocumentName(document2));
    int rowOfSecond = combined._docIndex.Count - 1;
    combined._docIndexLookup.Add(matrix2.GetDocumentName(document2), rowOfSecond);

    List<double> valuesA = new List<double>();
    List<double> valuesB = new List<double>();

    // working copy of matrix2's terms; whatever survives the first loop is unique to matrix2
    Dictionary<string, int> onlyInSecond = new Dictionary<string, int>(matrix2._termIndexLookup);

    foreach (string term in matrix1._termIndex)
    {
        combined._termIndex.Add(term);
        int column = combined._termIndex.Count - 1;
        combined._termIndexLookup.Add(term, column);

        valuesA.Add(matrix1.GetValue(document1, matrix1.GetTermIndex(term)));

        bool alsoInSecond = matrix2._termIndexLookup.ContainsKey(term);
        if (alsoInSecond)
        {
            onlyInSecond.Remove(term);
        }

        double valueB = alsoInSecond
            ? matrix2.GetValue(document2, matrix2.GetTermIndex(term))
            : 0.0;
        valuesB.Add(valueB);
    }

    // append the terms that appear in matrix2 alone
    foreach (KeyValuePair<string, int> entry in onlyInSecond)
    {
        string term = entry.Key;
        combined._termIndex.Add(term);
        int column = combined._termIndex.Count - 1;
        combined._termIndexLookup.Add(term, column);

        valuesA.Add(0.0);
        valuesB.Add(matrix2.GetValue(document2, matrix2.GetTermIndex(term)));
    }

    combined._matrix[0] = valuesA.ToArray();
    combined._matrix[1] = valuesB.ToArray();

    return combined;
}
/// <summary>
/// Creates a copy of each matrix whose columns cover the combined term set of both:
/// matrix1's terms first (in matrix1's order), then the terms only matrix2 has.
/// List[0] : matrix 1
/// List[1] : matrix 2
/// </summary>
/// <param name="matrix1">First term-by-document matrix</param>
/// <param name="matrix2">Second term-by-document matrix</param>
/// <returns>Copies of original matrices with missing terms from each</returns>
public static List<TermDocumentMatrix> Equalize(TermDocumentMatrix matrix1, TermDocumentMatrix matrix2)
{
    // copy of matrix 1: same documents, rows allocated further down
    TermDocumentMatrix first = new TermDocumentMatrix();
    first._matrix = new double[matrix1.NumDocs][];
    first._docIndex = new List<string>(matrix1._docIndex);
    first._docIndexLookup = new Dictionary<string, int>(matrix1._docIndexLookup);

    // copy of matrix 2: same documents, rows allocated further down
    TermDocumentMatrix second = new TermDocumentMatrix();
    second._matrix = new double[matrix2.NumDocs][];
    second._docIndex = new List<string>(matrix2._docIndex);
    second._docIndexLookup = new Dictionary<string, int>(matrix2._docIndexLookup);

    List<TermDocumentMatrix> matrices = new List<TermDocumentMatrix>();
    matrices.Add(first);
    matrices.Add(second);

    // combined term set
    List<string> allTerms = new List<string>();
    Dictionary<string, int> allTermsLookup = new Dictionary<string, int>();
    Dictionary<string, int> unmatched = new Dictionary<string, int>(matrix2._termIndexLookup);

    // matrix1's terms come first; any term matrix2 shares is struck from the unmatched set
    foreach (string term in matrix1._termIndex)
    {
        allTerms.Add(term);
        allTermsLookup.Add(term, allTerms.Count - 1);
        if (matrix2._termIndexLookup.ContainsKey(term))
        {
            unmatched.Remove(term);
        }
    }

    // then the terms only matrix2 has
    foreach (string term in unmatched.Keys)
    {
        allTerms.Add(term);
        allTermsLookup.Add(term, allTerms.Count - 1);
    }

    // matrix 1 rows: original columns keep their positions, new columns are zero
    first._termIndex = new List<string>(allTerms);
    first._termIndexLookup = new Dictionary<string, int>(allTermsLookup);
    for (int doc = 0; doc < first.NumDocs; doc++)
    {
        first._matrix[doc] = new double[allTerms.Count];

        int col = 0;
        for (; col < matrix1.NumTerms; col++)
        {
            first[doc, col] = matrix1[doc, col];
        }
        for (; col < allTerms.Count; col++)
        {
            first[doc, col] = 0.0;
        }
    }

    // matrix 2 rows: columns are reordered, so each value is looked up by document and term name
    second._termIndex = new List<string>(allTerms);
    second._termIndexLookup = new Dictionary<string, int>(allTermsLookup);
    for (int doc = 0; doc < second.NumDocs; doc++)
    {
        second._matrix[doc] = new double[allTerms.Count];

        for (int col = 0; col < allTerms.Count; col++)
        {
            second[doc, col] = matrix2.ContainsTerm(allTerms[col])
                ? matrix2.GetValue(matrix2.GetDocumentName(doc), allTerms[col])
                : 0.0;
        }
    }

    return matrices;
}
/// <summary>
/// Rebuilds both matrices over one shared term list so that column positions agree.
/// Terms a matrix did not originally contain receive the value 0.0 in every document.
/// List[0] : matrix 1
/// List[1] : matrix 2
/// </summary>
/// <param name="matrix1">First term-by-document matrix</param>
/// <param name="matrix2">Second term-by-document matrix</param>
/// <returns>Copies of original matrices with missing terms from each</returns>
public static List<TermDocumentMatrix> Equalize(TermDocumentMatrix matrix1, TermDocumentMatrix matrix2)
{
    List<TermDocumentMatrix> output = new List<TermDocumentMatrix>();

    // shell for matrix 1: document index copied, rows filled in later
    TermDocumentMatrix left = new TermDocumentMatrix();
    output.Add(left);
    left._matrix = new double[matrix1.NumDocs][];
    left._docIndex = new List<string>(matrix1._docIndex);
    left._docIndexLookup = new Dictionary<string, int>(matrix1._docIndexLookup);

    // shell for matrix 2: document index copied, rows filled in later
    TermDocumentMatrix right = new TermDocumentMatrix();
    output.Add(right);
    right._matrix = new double[matrix2.NumDocs][];
    right._docIndex = new List<string>(matrix2._docIndex);
    right._docIndexLookup = new Dictionary<string, int>(matrix2._docIndexLookup);

    // shared term list: matrix1's terms in order, followed by terms found only in matrix2
    List<string> sharedTerms = new List<string>();
    Dictionary<string, int> sharedLookup = new Dictionary<string, int>();
    Dictionary<string, int> pending = new Dictionary<string, int>(matrix2._termIndexLookup);

    foreach (string term in matrix1._termIndex)
    {
        sharedTerms.Add(term);
        sharedLookup.Add(term, sharedTerms.Count - 1);

        // already covered by matrix1, so it must not be appended a second time
        if (matrix2._termIndexLookup.ContainsKey(term))
        {
            pending.Remove(term);
        }
    }

    foreach (KeyValuePair<string, int> entry in pending)
    {
        sharedTerms.Add(entry.Key);
        sharedLookup.Add(entry.Key, sharedTerms.Count - 1);
    }

    // matrix 1
    left._termIndex = new List<string>(sharedTerms);
    left._termIndexLookup = new Dictionary<string, int>(sharedLookup);
    int d = 0;
    while (d < left.NumDocs)
    {
        left._matrix[d] = new double[sharedTerms.Count];

        // leading columns are matrix1's own terms, so values carry over position-for-position
        int t = 0;
        while (t < matrix1.NumTerms)
        {
            left[d, t] = matrix1[d, t];
            t++;
        }

        // trailing columns are terms matrix1 never had
        while (t < sharedTerms.Count)
        {
            left[d, t] = 0.0;
            t++;
        }

        d++;
    }

    // matrix 2
    right._termIndex = new List<string>(sharedTerms);
    right._termIndexLookup = new Dictionary<string, int>(sharedLookup);
    for (int doc = 0; doc < right.NumDocs; doc++)
    {
        right._matrix[doc] = new double[sharedTerms.Count];

        // column order differs from matrix2's own, so every cell is resolved by name
        for (int col = 0; col < sharedTerms.Count; col++)
        {
            string term = sharedTerms[col];
            if (!matrix2.ContainsTerm(term))
            {
                right[doc, col] = 0.0;
            }
            else
            {
                right[doc, col] = matrix2.GetValue(matrix2.GetDocumentName(doc), term);
            }
        }
    }

    return output;
}