/// <summary>
/// Saves a term-by-document matrix to a delimited text file.
/// First row: delimiter-prefixed term list. Each following row: document name
/// followed by delimiter-separated term weights.
/// </summary>
/// <param name="matrix">Term-by-document matrix</param>
/// <param name="filename">File location</param>
public static void Save(TermDocumentMatrix matrix, string filename)
{
    // using guarantees the stream is flushed and closed even if a write throws
    // (the original leaked the writer on exception)
    using (TextWriter tw = new StreamWriter(File.Open(filename, FileMode.Create)))
    {
        // header row: term list
        foreach (string term in matrix.TermMap)
        {
            tw.Write("{0}{1}", TermDocumentMatrix.IODelimeter, term);
        }
        tw.WriteLine();
        // one row per document
        for (int i = 0; i < matrix.NumDocs; i++)
        {
            tw.Write(matrix.GetDocumentName(i));
            // each term weight for this document
            for (int j = 0; j < matrix.NumTerms; j++)
            {
                tw.Write("{0}{1}", TermDocumentMatrix.IODelimeter, matrix[i, j]);
            }
            tw.WriteLine();
        }
        tw.Flush();
    }
}
/// <summary>
/// Computes similarities between term-by-document matrices via the Vector Space Model
/// using a tf-idf weighting scheme and cosine similarity.
/// </summary>
/// <param name="source">Source matrix</param>
/// <param name="target">Target matrix</param>
/// <returns>Similarity matrix</returns>
public static TLSimilarityMatrix Compute(TermDocumentMatrix source, TermDocumentMatrix target)
{
    // boolean (0|1) query vectors from the source corpus
    TermDocumentMatrix ids = ComputeIdentities(source);
    // tf-idf weight the target corpus: tf first, then scale each term by its idf
    TermDocumentMatrix tf = ComputeTF(target);
    double[] idf = ComputeIDF(ComputeDF(target), target.NumDocs);
    return ComputeSimilarities(ids, ComputeTFIDF(tf, idf));
}
/// <summary>
/// Computes similarities between term-by-document matrices via the Vector Space Model
/// using a tf-idf weighting scheme and cosine similarity.
/// </summary>
/// <param name="source">Source matrix</param>
/// <param name="target">Target matrix</param>
/// <returns>Similarity matrix</returns>
public static TLSimilarityMatrix Compute(TermDocumentMatrix source, TermDocumentMatrix target)
{
    // source documents become boolean (0|1) query vectors
    TermDocumentMatrix queries = ComputeIdentities(source);
    // target documents become tf-idf weighted vectors
    TermDocumentMatrix weighted = ComputeTF(target);
    double[] idf = ComputeIDF(ComputeDF(target), target.NumDocs);
    weighted = ComputeTFIDF(weighted, idf);
    return ComputeSimilarities(queries, weighted);
}
/// <summary>
/// Computes tf-idf weights by scaling each term frequency by its inverse document frequency.
/// Mutates and returns the given matrix.
/// </summary>
/// <param name="tf">Term-frequency weighted matrix</param>
/// <param name="idf">Inverse document frequencies vector</param>
/// <returns>tf-idf weighted matrix (same instance as <paramref name="tf"/>)</returns>
private static TermDocumentMatrix ComputeTFIDF(TermDocumentMatrix tf, double[] idf)
{
    for (int doc = 0; doc < tf.NumDocs; doc++)
    {
        for (int term = 0; term < tf.NumTerms; term++)
        {
            tf[doc, term] *= idf[term];
        }
    }
    return tf;
}
/// <summary>
/// Computes boolean (0|1) terms in documents. Mutates and returns the given matrix.
/// </summary>
/// <param name="matrix">Term-by-document matrix</param>
/// <returns>Term-by-document matrix with 1s for terms that are in the document and 0s for terms that are not.</returns>
private static TermDocumentMatrix ComputeIdentities(TermDocumentMatrix matrix)
{
    for (int doc = 0; doc < matrix.NumDocs; doc++)
    {
        for (int term = 0; term < matrix.NumTerms; term++)
        {
            // collapse any positive weight to 1, everything else to 0
            if (matrix[doc, term] > 0.0)
            {
                matrix[doc, term] = 1.0;
            }
            else
            {
                matrix[doc, term] = 0.0;
            }
        }
    }
    return matrix;
}
/// <summary>
/// Computes boolean (0|1) terms in documents. Mutates and returns the given matrix.
/// </summary>
/// <param name="matrix">Term-by-document matrix</param>
/// <returns>Term-by-document matrix with 1s for terms that are in the document and 0s for terms that are not.</returns>
private static TermDocumentMatrix ComputeIdentities(TermDocumentMatrix matrix)
{
    int docs = matrix.NumDocs;
    int terms = matrix.NumTerms;
    for (int d = 0; d < docs; d++)
    {
        for (int t = 0; t < terms; t++)
        {
            // presence indicator: positive weight -> 1.0, otherwise 0.0
            matrix[d, t] = matrix[d, t] > 0.0 ? 1.0 : 0.0;
        }
    }
    return matrix;
}
/// <summary>
/// Computes the term frequencies of each document by normalizing each weight
/// by the document's maximum weight. Mutates and returns the given matrix.
/// </summary>
/// <param name="matrix">Term-by-document matrix</param>
/// <returns>tf-weighted term-by-document matrix</returns>
private static TermDocumentMatrix ComputeTF(TermDocumentMatrix matrix)
{
    for (int i = 0; i < matrix.NumDocs; i++)
    {
        double max = matrix.GetDocument(i).Max();
        // guard: an all-zero document would otherwise produce NaN (0.0 / 0.0)
        // and poison every downstream similarity; leave its zeros untouched
        if (max <= 0.0)
        {
            continue;
        }
        for (int j = 0; j < matrix.NumTerms; j++)
        {
            matrix[i, j] = matrix[i, j] / max;
        }
    }
    return matrix;
}
/// <summary>
/// Computes the term frequencies of each document by normalizing each weight
/// by the document's maximum weight. Mutates and returns the given matrix.
/// </summary>
/// <param name="matrix">Term-by-document matrix</param>
/// <returns>tf-weighted term-by-document matrix</returns>
private static TermDocumentMatrix ComputeTF(TermDocumentMatrix matrix)
{
    for (int i = 0; i < matrix.NumDocs; i++)
    {
        double max = matrix.GetDocument(i).Max();
        // guard: an all-zero document would otherwise produce NaN (0.0 / 0.0)
        // and poison every downstream similarity; leave its zeros untouched
        if (max <= 0.0)
        {
            continue;
        }
        for (int j = 0; j < matrix.NumTerms; j++)
        {
            matrix[i, j] = matrix[i, j] / max;
        }
    }
    return matrix;
}
/// <summary>
/// Loads a previously saved TermDocumentMatrix from disk.
/// Expects the format written by Save: a header row of terms, then one
/// delimited row per document (name followed by term weights).
/// </summary>
/// <param name="filename">File location</param>
/// <returns>Term-by-document matrix</returns>
/// <exception cref="InvalidDataException">Thrown when the file is empty or a row has the wrong number of fields.</exception>
public static TermDocumentMatrix Load(string filename)
{
    // using closes the reader on every exit path (the original leaked it if
    // Convert.ToDouble or a dictionary Add threw)
    using (TextReader tr = new StreamReader(File.OpenRead(filename)))
    {
        TermDocumentMatrix matrix = new TermDocumentMatrix();
        int lineNum = 1;
        string line = tr.ReadLine();
        // an empty file previously caused a NullReferenceException on Split
        if (line == null)
        {
            throw new InvalidDataException("Incorrect data format on line 1 in file: " + filename);
        }
        string[] delimeter = new string[] { TermDocumentMatrix.IODelimeter };
        // read terms (header row)
        matrix._termIndex = new List<string>(line.Split(delimeter, StringSplitOptions.RemoveEmptyEntries));
        matrix._termIndexLookup = new Dictionary<string, int>();
        for (int i = 0; i < matrix._termIndex.Count; i++)
        {
            matrix._termIndexLookup.Add(matrix._termIndex[i], i);
        }
        // read documents
        matrix._docIndex = new List<string>();
        matrix._docIndexLookup = new Dictionary<string, int>();
        List<double[]> docs = new List<double[]>();
        while ((line = tr.ReadLine()) != null)
        {
            lineNum++;
            string[] document = line.Split(delimeter, StringSplitOptions.RemoveEmptyEntries);
            // each row must be: document name + one value per term
            if (document.Length != matrix.NumTerms + 1)
            {
                throw new InvalidDataException("Incorrect data format on line " + lineNum + " in file: " + filename);
            }
            matrix._docIndex.Add(document[0]);
            matrix._docIndexLookup.Add(document[0], matrix._docIndex.Count - 1);
            double[] doc = new double[matrix.NumTerms];
            for (int i = 1; i < document.Length; i++)
            {
                doc[i - 1] = Convert.ToDouble(document[i]);
            }
            docs.Add(doc);
        }
        // materialize the jagged matrix from the parsed rows
        matrix._matrix = new double[matrix.NumDocs][];
        for (int i = 0; i < matrix.NumDocs; i++)
        {
            matrix._matrix[i] = new double[matrix.NumTerms];
            for (int j = 0; j < matrix.NumTerms; j++)
            {
                matrix[i, j] = docs[i][j];
            }
        }
        return matrix;
    }
}
/// <summary>
/// Computes the document frequencies of each term (number of documents in which
/// the term has a positive weight).
/// </summary>
/// <param name="matrix">Term-by-document matrix</param>
/// <returns>df-weighted term distribution</returns>
private static double[] ComputeDF(TermDocumentMatrix matrix)
{
    double[] df = new double[matrix.NumTerms];
    for (int term = 0; term < matrix.NumTerms; term++)
    {
        double count = 0.0;
        for (int doc = 0; doc < matrix.NumDocs; doc++)
        {
            if (matrix[doc, term] > 0.0)
            {
                count += 1.0;
            }
        }
        df[term] = count;
    }
    return df;
}
/// <summary>
/// Computes the average term vector of the matrix (per-term mean over all documents).
/// </summary>
/// <param name="matrix">Artifacts</param>
/// <returns>Average vector</returns>
private static double[] ComputeAverageVector(TermDocumentMatrix matrix)
{
    double[] avg = new double[matrix.NumTerms];
    for (int term = 0; term < matrix.NumTerms; term++)
    {
        double sum = 0.0;
        for (int doc = 0; doc < matrix.NumDocs; doc++)
        {
            sum += matrix[doc, term];
        }
        avg[term] = sum / matrix.NumDocs;
    }
    return avg;
}
/// <summary>
/// Computes the document frequencies of each term (number of documents in which
/// the term has a positive weight).
/// </summary>
/// <param name="matrix">Term-by-document matrix</param>
/// <returns>df-weighted term distribution</returns>
private static double[] ComputeDF(TermDocumentMatrix matrix)
{
    int numTerms = matrix.NumTerms;
    int numDocs = matrix.NumDocs;
    double[] df = new double[numTerms];
    for (int t = 0; t < numTerms; t++)
    {
        df[t] = 0.0;
        for (int d = 0; d < numDocs; d++)
        {
            // count the document once if the term appears in it at all
            df[t] += matrix[d, t] > 0.0 ? 1.0 : 0.0;
        }
    }
    return df;
}
/// <summary>
/// Deep copy constructor: clones the weight matrix and both index structures
/// so the copy shares no mutable state with the original.
/// </summary>
/// <param name="matrix">Object to be copied</param>
public TermDocumentMatrix(TermDocumentMatrix matrix)
{
    int docs = matrix.NumDocs;
    int terms = matrix.NumTerms;
    _matrix = new double[docs][];
    for (int d = 0; d < docs; d++)
    {
        double[] row = new double[terms];
        for (int t = 0; t < terms; t++)
        {
            row[t] = matrix[d, t];
        }
        _matrix[d] = row;
    }
    // copy-construct the index collections so they are independent of the source
    _docIndex = new List<string>(matrix._docIndex);
    _docIndexLookup = new Dictionary<string, int>(matrix._docIndexLookup);
    _termIndex = new List<string>(matrix._termIndex);
    _termIndexLookup = new Dictionary<string, int>(matrix._termIndexLookup);
}
/// <summary>
/// Takes the two specified documents and creates two new document vectors with the missing terms from each.
/// Row 0: document 1
/// Row 1: document 2
/// </summary>
/// <param name="matrix1">document1 container</param>
/// <param name="document1">document1 index</param>
/// <param name="matrix2">document2 container</param>
/// <param name="document2">document2 index</param>
/// <returns>New term-by-document matrix containing the two documents and their term maps</returns>
public static TermDocumentMatrix EqualizeDocuments(TermDocumentMatrix matrix1, int document1, TermDocumentMatrix matrix2, int document2)
{
    // initialize new TermDocumentMatrix
    TermDocumentMatrix newmatrix = new TermDocumentMatrix();
    newmatrix._matrix = new double[2][];
    newmatrix._termIndex = new List<string>();
    newmatrix._termIndexLookup = new Dictionary<string, int>();
    newmatrix._docIndex = new List<string>();
    newmatrix._docIndexLookup = new Dictionary<string, int>();
    // register both document names: row 0 = document1, row 1 = document2
    newmatrix._docIndex.Add(matrix1.GetDocumentName(document1));
    newmatrix._docIndexLookup.Add(matrix1.GetDocumentName(document1), newmatrix._docIndex.Count - 1);
    newmatrix._docIndex.Add(matrix2.GetDocumentName(document2));
    newmatrix._docIndexLookup.Add(matrix2.GetDocumentName(document2), newmatrix._docIndex.Count - 1);
    // parallel value lists for the two new document vectors
    List<double> doc1 = new List<double>();
    List<double> doc2 = new List<double>();
    // compute total term set; "leftovers" ends up holding only terms unique to matrix2
    Dictionary<string, int> leftovers = new Dictionary<string, int>(matrix2._termIndexLookup);
    foreach (string term in matrix1._termIndex)
    {
        newmatrix._termIndex.Add(term);
        newmatrix._termIndexLookup.Add(term, newmatrix._termIndex.Count - 1);
        doc1.Add(matrix1.GetValue(document1, matrix1.GetTermIndex(term)));
        if (matrix2._termIndexLookup.ContainsKey(term))
        {
            // shared term: take matrix2's value and drop it from leftovers
            leftovers.Remove(term);
            doc2.Add(matrix2.GetValue(document2, matrix2.GetTermIndex(term)));
        }
        else
        {
            // term absent from matrix2: pad document2's vector with zero
            doc2.Add(0.0);
        }
    }
    // append the terms that appear only in matrix2; document1 gets zeros for them
    foreach (string term in leftovers.Keys)
    {
        newmatrix._termIndex.Add(term);
        newmatrix._termIndexLookup.Add(term, newmatrix._termIndex.Count - 1);
        doc1.Add(0.0);
        doc2.Add(matrix2.GetValue(document2, matrix2.GetTermIndex(term)));
    }
    newmatrix._matrix[0] = doc1.ToArray();
    newmatrix._matrix[1] = doc2.ToArray();
    return(newmatrix);
}
/// <summary>
/// Computes Jensen-Shannon divergence on two TermDocumentMatrices
/// </summary>
/// <param name="source">Source artifacts collection</param>
/// <param name="target">Target artifacts collection</param>
/// <returns>Similarity matrix</returns>
public static TLSimilarityMatrix Compute(TermDocumentMatrix source, TermDocumentMatrix target)
{
    // align both matrices on a common term vocabulary first
    List<TermDocumentMatrix> matrices = TermDocumentMatrix.Equalize(source, target);
    TermDocumentMatrix src = matrices[0];
    TermDocumentMatrix tgt = matrices[1];
    TLSimilarityMatrix sims = new TLSimilarityMatrix();
    for (int i = 0; i < src.NumDocs; i++)
    {
        // score every target document against the current source document
        TLLinksList list = new TLLinksList();
        for (int j = 0; j < tgt.NumDocs; j++)
        {
            double score = DocumentSimilarity(src.GetDocument(i), tgt.GetDocument(j));
            list.Add(new TLSingleLink(src.GetDocumentName(i), tgt.GetDocumentName(j), score));
        }
        // add links in sorted order
        list.Sort();
        foreach (TLSingleLink link in list)
        {
            sims.AddLink(link.SourceArtifactId, link.TargetArtifactId, link.Score);
        }
    }
    return sims;
}
public void ConstructorTest_Artifacts()
{
    // build a matrix from imported artifacts and compare against the saved answer
    string data = @"../../Data/SimpleCorpus.";
    TermDocumentMatrix matrix = new TermDocumentMatrix(Artifacts.Import(data + "input.target.txt"));
    TermDocumentMatrix answer = TermDocumentMatrix.Load(data + "output.target.matrix.txt");
    // dimensions must agree
    Assert.AreEqual(matrix.NumDocs, answer.NumDocs);
    Assert.AreEqual(matrix.NumTerms, answer.NumTerms);
    // every name and every cell must agree
    for (int doc = 0; doc < answer.NumDocs; doc++)
    {
        Assert.AreEqual(matrix.GetDocumentName(doc), answer.GetDocumentName(doc));
        Assert.AreEqual(matrix.GetDocument(doc).Length, answer.NumTerms);
        for (int term = 0; term < answer.NumTerms; term++)
        {
            Assert.AreEqual(matrix.GetTermName(term), answer.GetTermName(term));
            Assert.AreEqual(matrix[doc, term], answer[doc, term], 0.0);
        }
    }
}
/// <summary>
/// Smoothing filter from ICPC'11 paper "Improving IR-based Traceability Recovery Using Smoothing Filters".
/// Subtracts the corpus-average term vector from every document, clamping negatives to zero.
/// Mutates and returns the given matrix.
/// </summary>
/// <param name="matrix">Term-by-document matrix</param>
/// <returns>Smoothed term-by-document matrix</returns>
/// <exception cref="ArgumentException">Thrown when the average vector length does not match the term count.</exception>
public static TermDocumentMatrix Compute(TermDocumentMatrix matrix)
{
    double[] avg = ComputeAverageVector(matrix);
    // sanity check: the average vector must cover every term
    if (avg.Length != matrix.NumTerms)
        throw new ArgumentException("Average vector does not have the correct number of terms.");
    for (int i = 0; i < matrix.NumDocs; i++)
    {
        for (int j = 0; j < matrix.NumTerms; j++)
        {
            // subtract the average weight; clamp at zero so no term goes negative
            matrix[i, j] -= avg[j];
            if (matrix[i, j] < 0.0)
            {
                matrix[i, j] = 0.0;
            }
        }
    }
    return matrix;
}
/// <summary>
/// Computes Jensen-Shannon divergence on two TermDocumentMatrices
/// </summary>
/// <param name="source">Source artifacts collection</param>
/// <param name="target">Target artifacts collection</param>
/// <returns>Similarity matrix</returns>
public static TLSimilarityMatrix Compute(TermDocumentMatrix source, TermDocumentMatrix target)
{
    // put both matrices on a shared term vocabulary before comparing documents
    List<TermDocumentMatrix> equalized = TermDocumentMatrix.Equalize(source, target);
    TLSimilarityMatrix result = new TLSimilarityMatrix();
    for (int srcDoc = 0; srcDoc < equalized[0].NumDocs; srcDoc++)
    {
        TLLinksList links = new TLLinksList();
        for (int tgtDoc = 0; tgtDoc < equalized[1].NumDocs; tgtDoc++)
        {
            links.Add(new TLSingleLink(
                equalized[0].GetDocumentName(srcDoc),
                equalized[1].GetDocumentName(tgtDoc),
                DocumentSimilarity(equalized[0].GetDocument(srcDoc), equalized[1].GetDocument(tgtDoc))));
        }
        // insert links into the similarity matrix in sorted order
        links.Sort();
        foreach (TLSingleLink link in links)
        {
            result.AddLink(link.SourceArtifactId, link.TargetArtifactId, link.Score);
        }
    }
    return result;
}
/// <summary>
/// Computes cosine similarities between a set of boolean document vectors and a tfidf weighted corpus
/// </summary>
/// <param name="ids">Boolean document vectors</param>
/// <param name="tfidf">tf-idf weighted document vectors</param>
/// <returns>Similarity matrix</returns>
private static TLSimilarityMatrix ComputeSimilarities(TermDocumentMatrix ids, TermDocumentMatrix tfidf)
{
    TLSimilarityMatrix sims = new TLSimilarityMatrix();
    // align both matrices on a common term set
    List<TermDocumentMatrix> matrices = TermDocumentMatrix.Equalize(ids, tfidf);
    TermDocumentMatrix queries = matrices[0];
    TermDocumentMatrix corpus = matrices[1];
    for (int i = 0; i < ids.NumDocs; i++)
    {
        TLLinksList links = new TLLinksList();
        for (int j = 0; j < tfidf.NumDocs; j++)
        {
            // accumulate dot product and squared magnitudes in one pass
            double product = 0.0;
            double asquared = 0.0;
            double bsquared = 0.0;
            for (int k = 0; k < queries.NumTerms; k++)
            {
                double a = queries[i, k];
                double b = corpus[j, k];
                product += a * b;
                asquared += Math.Pow(a, 2);
                bsquared += Math.Pow(b, 2);
            }
            double cross = Math.Sqrt(asquared) * Math.Sqrt(bsquared);
            // a zero-magnitude vector gets similarity 0 instead of NaN
            double score = cross == 0.0 ? 0.0 : product / cross;
            links.Add(new TLSingleLink(ids.GetDocumentName(i), tfidf.GetDocumentName(j), score));
        }
        // add links in sorted order
        links.Sort();
        foreach (TLSingleLink link in links)
        {
            sims.AddLink(link.SourceArtifactId, link.TargetArtifactId, link.Score);
        }
    }
    return sims;
}
/// <summary>
/// Loads a previously saved TermDocumentMatrix from disk.
/// Expects the format written by Save: a header row of terms, then one
/// delimited row per document (name followed by term weights).
/// </summary>
/// <param name="filename">File location</param>
/// <returns>Term-by-document matrix</returns>
/// <exception cref="InvalidDataException">Thrown when the file is empty or a row has the wrong number of fields.</exception>
public static TermDocumentMatrix Load(string filename)
{
    // using closes the reader on every exit path (the original leaked it if
    // Convert.ToDouble or a dictionary Add threw)
    using (TextReader tr = new StreamReader(File.OpenRead(filename)))
    {
        TermDocumentMatrix matrix = new TermDocumentMatrix();
        int lineNum = 1;
        string line = tr.ReadLine();
        // an empty file previously caused a NullReferenceException on Split
        if (line == null)
        {
            throw new InvalidDataException("Incorrect data format on line 1 in file: " + filename);
        }
        string[] delimeter = new string[] { TermDocumentMatrix.IODelimeter };
        // read terms (header row)
        matrix._termIndex = new List<string>(line.Split(delimeter, StringSplitOptions.RemoveEmptyEntries));
        matrix._termIndexLookup = new Dictionary<string, int>();
        for (int i = 0; i < matrix._termIndex.Count; i++)
        {
            matrix._termIndexLookup.Add(matrix._termIndex[i], i);
        }
        // read documents
        matrix._docIndex = new List<string>();
        matrix._docIndexLookup = new Dictionary<string, int>();
        List<double[]> docs = new List<double[]>();
        while ((line = tr.ReadLine()) != null)
        {
            lineNum++;
            string[] document = line.Split(delimeter, StringSplitOptions.RemoveEmptyEntries);
            // each row must be: document name + one value per term
            if (document.Length != matrix.NumTerms + 1)
            {
                throw new InvalidDataException("Incorrect data format on line " + lineNum + " in file: " + filename);
            }
            matrix._docIndex.Add(document[0]);
            matrix._docIndexLookup.Add(document[0], matrix._docIndex.Count - 1);
            double[] doc = new double[matrix.NumTerms];
            for (int i = 1; i < document.Length; i++)
            {
                doc[i - 1] = Convert.ToDouble(document[i]);
            }
            docs.Add(doc);
        }
        // materialize the jagged matrix from the parsed rows
        matrix._matrix = new double[matrix.NumDocs][];
        for (int i = 0; i < matrix.NumDocs; i++)
        {
            matrix._matrix[i] = new double[matrix.NumTerms];
            for (int j = 0; j < matrix.NumTerms; j++)
            {
                matrix[i, j] = docs[i][j];
            }
        }
        return matrix;
    }
}
/// <summary>
/// Computes cosine similarities between a set of boolean document vectors and a tfidf weighted corpus
/// </summary>
/// <param name="ids">Boolean document vectors</param>
/// <param name="tfidf">tf-idf weighted document vectors</param>
/// <returns>Similarity matrix</returns>
private static TLSimilarityMatrix ComputeSimilarities(TermDocumentMatrix ids, TermDocumentMatrix tfidf)
{
    TLSimilarityMatrix sims = new TLSimilarityMatrix();
    // equalize so both sides index the same term set
    List<TermDocumentMatrix> matrices = TermDocumentMatrix.Equalize(ids, tfidf);
    for (int srcDoc = 0; srcDoc < ids.NumDocs; srcDoc++)
    {
        TLLinksList links = new TLLinksList();
        for (int tgtDoc = 0; tgtDoc < tfidf.NumDocs; tgtDoc++)
        {
            double dot = 0.0;
            double lhsSq = 0.0;
            double rhsSq = 0.0;
            // single pass over the shared term axis
            for (int term = 0; term < matrices[0].NumTerms; term++)
            {
                double lhs = matrices[0][srcDoc, term];
                double rhs = matrices[1][tgtDoc, term];
                dot += lhs * rhs;
                lhsSq += Math.Pow(lhs, 2);
                rhsSq += Math.Pow(rhs, 2);
            }
            double magnitude = Math.Sqrt(lhsSq) * Math.Sqrt(rhsSq);
            if (magnitude == 0.0)
            {
                // avoid dividing by zero for empty vectors
                links.Add(new TLSingleLink(ids.GetDocumentName(srcDoc), tfidf.GetDocumentName(tgtDoc), 0.0));
            }
            else
            {
                links.Add(new TLSingleLink(ids.GetDocumentName(srcDoc), tfidf.GetDocumentName(tgtDoc), dot / magnitude));
            }
        }
        // insert in sorted order
        links.Sort();
        foreach (TLSingleLink link in links)
        {
            sims.AddLink(link.SourceArtifactId, link.TargetArtifactId, link.Score);
        }
    }
    return sims;
}
/// <summary>
/// Computes tf-idf weights by scaling each term frequency by its inverse document frequency.
/// Mutates and returns the given matrix.
/// </summary>
/// <param name="tf">Term-frequency weighted matrix</param>
/// <param name="idf">Inverse document frequencies vector</param>
/// <returns>tf-idf weighted matrix (same instance as <paramref name="tf"/>)</returns>
private static TermDocumentMatrix ComputeTFIDF(TermDocumentMatrix tf, double[] idf)
{
    int numDocs = tf.NumDocs;
    int numTerms = tf.NumTerms;
    for (int d = 0; d < numDocs; d++)
    {
        for (int t = 0; t < numTerms; t++)
        {
            // weight each tf entry by the term's idf
            tf[d, t] = tf[d, t] * idf[t];
        }
    }
    return tf;
}
/// <summary>
/// Takes the two specified documents and creates two new document vectors with the missing terms from each.
/// </summary>
/// <param name="matrix1">artifact1 container</param>
/// <param name="artifact1">artifact1 ID</param>
/// <param name="matrix2">artifact2 container</param>
/// <param name="artifact2">artifact2 ID</param>
/// <returns>New term-by-document matrix containing the two documents and their term maps</returns>
public static TermDocumentMatrix EqualizeDocuments(TermDocumentMatrix matrix1, string artifact1, TermDocumentMatrix matrix2, string artifact2)
{
    // resolve artifact IDs to document indices, then delegate to the index overload
    int index1 = matrix1.GetDocumentIndex(artifact1);
    int index2 = matrix2.GetDocumentIndex(artifact2);
    return EqualizeDocuments(matrix1, index1, matrix2, index2);
}
/// <summary>
/// Takes the two specified documents and creates two new document vectors with the missing terms from each.
/// </summary>
/// <param name="matrix1">artifact1 container</param>
/// <param name="artifact1">artifact1 ID</param>
/// <param name="matrix2">artifact2 container</param>
/// <param name="artifact2">artifact2 ID</param>
/// <returns>New term-by-document matrix containing the two documents and their term maps</returns>
public static TermDocumentMatrix EqualizeDocuments(TermDocumentMatrix matrix1, string artifact1, TermDocumentMatrix matrix2, string artifact2)
{
    // convenience overload: translate IDs to indices and forward
    return EqualizeDocuments(
        matrix1, matrix1.GetDocumentIndex(artifact1),
        matrix2, matrix2.GetDocumentIndex(artifact2));
}
/// <summary>
/// Takes the two specified documents and creates two new document vectors with the missing terms from each.
/// Row 0: document 1
/// Row 1: document 2
/// </summary>
/// <param name="matrix1">document1 container</param>
/// <param name="document1">document1 index</param>
/// <param name="matrix2">document2 container</param>
/// <param name="document2">document2 index</param>
/// <returns>New term-by-document matrix containing the two documents and their term maps</returns>
public static TermDocumentMatrix EqualizeDocuments(TermDocumentMatrix matrix1, int document1, TermDocumentMatrix matrix2, int document2)
{
    // initialize new TermDocumentMatrix
    TermDocumentMatrix newmatrix = new TermDocumentMatrix();
    newmatrix._matrix = new double[2][];
    newmatrix._termIndex = new List<string>();
    newmatrix._termIndexLookup = new Dictionary<string, int>();
    newmatrix._docIndex = new List<string>();
    newmatrix._docIndexLookup = new Dictionary<string, int>();
    // register both document names: row 0 = document1, row 1 = document2
    newmatrix._docIndex.Add(matrix1.GetDocumentName(document1));
    newmatrix._docIndexLookup.Add(matrix1.GetDocumentName(document1), newmatrix._docIndex.Count - 1);
    newmatrix._docIndex.Add(matrix2.GetDocumentName(document2));
    newmatrix._docIndexLookup.Add(matrix2.GetDocumentName(document2), newmatrix._docIndex.Count - 1);
    // parallel value lists for the two new document vectors
    List<double> doc1 = new List<double>();
    List<double> doc2 = new List<double>();
    // compute total term set; "leftovers" ends up holding only terms unique to matrix2
    Dictionary<string, int> leftovers = new Dictionary<string, int>(matrix2._termIndexLookup);
    foreach (string term in matrix1._termIndex)
    {
        newmatrix._termIndex.Add(term);
        newmatrix._termIndexLookup.Add(term, newmatrix._termIndex.Count - 1);
        doc1.Add(matrix1.GetValue(document1, matrix1.GetTermIndex(term)));
        if (matrix2._termIndexLookup.ContainsKey(term))
        {
            // shared term: take matrix2's value and drop it from leftovers
            leftovers.Remove(term);
            doc2.Add(matrix2.GetValue(document2, matrix2.GetTermIndex(term)));
        }
        else
        {
            // term absent from matrix2: pad document2's vector with zero
            doc2.Add(0.0);
        }
    }
    // append the terms that appear only in matrix2; document1 gets zeros for them
    foreach (string term in leftovers.Keys)
    {
        newmatrix._termIndex.Add(term);
        newmatrix._termIndexLookup.Add(term, newmatrix._termIndex.Count - 1);
        doc1.Add(0.0);
        doc2.Add(matrix2.GetValue(document2, matrix2.GetTermIndex(term)));
    }
    newmatrix._matrix[0] = doc1.ToArray();
    newmatrix._matrix[1] = doc2.ToArray();
    return newmatrix;
}
/// <summary>
/// Recreates each matrix with documents containing missing terms.
/// List[0] : matrix 1
/// List[1] : matrix 2
/// </summary>
/// <param name="matrix1">First term-by-document matrix</param>
/// <param name="matrix2">Second term-by-document matrix</param>
/// <returns>Copies of original matrices with missing terms from each</returns>
public static List<TermDocumentMatrix> Equalize(TermDocumentMatrix matrix1, TermDocumentMatrix matrix2)
{
    // initialize matrices
    List<TermDocumentMatrix> matrices = new List<TermDocumentMatrix>();
    // matrix 1: copy the document index now; rows are allocated per-document below
    matrices.Add(new TermDocumentMatrix());
    matrices[0]._matrix = new double[matrix1.NumDocs][];
    matrices[0]._docIndex = new List<string>(matrix1._docIndex);
    matrices[0]._docIndexLookup = new Dictionary<string,int>(matrix1._docIndexLookup);
    // matrix 2
    matrices.Add(new TermDocumentMatrix());
    matrices[1]._matrix = new double[matrix2.NumDocs][];
    matrices[1]._docIndex = new List<string>(matrix2._docIndex);
    matrices[1]._docIndexLookup = new Dictionary<string,int>(matrix2._docIndexLookup);
    // compute term set (union); "leftovers" ends up holding only terms unique to matrix2
    List<string> termIndex = new List<string>();
    Dictionary<string, int> termIndexLookup = new Dictionary<string, int>();
    Dictionary<string, int> leftovers = new Dictionary<string,int>(matrix2._termIndexLookup);
    // get all terms in first matrix
    foreach (string term in matrix1._termIndex)
    {
        termIndex.Add(term);
        termIndexLookup.Add(term, termIndex.Count - 1);
        // remove duplicate terms
        if (matrix2._termIndexLookup.ContainsKey(term))
        {
            leftovers.Remove(term);
        }
    }
    // add leftovers (terms found only in matrix2) after matrix1's terms
    foreach (string term in leftovers.Keys)
    {
        termIndex.Add(term);
        termIndexLookup.Add(term, termIndex.Count - 1);
    }
    // create new term distributions for each document
    // matrix 1: its terms occupy union indices 0..matrix1.NumTerms-1 in order, so copy by position
    matrices[0]._termIndex = new List<string>(termIndex);
    matrices[0]._termIndexLookup = new Dictionary<string,int>(termIndexLookup);
    for (int i = 0; i < matrices[0].NumDocs; i++)
    {
        matrices[0]._matrix[i] = new double[termIndex.Count];
        // fill in original values
        for (int j = 0; j < matrix1.NumTerms; j++)
        {
            matrices[0][i, j] = matrix1[i, j];
        }
        // fill in missing terms
        for (int j = matrix1.NumTerms; j < termIndex.Count; j++)
        {
            matrices[0][i, j] = 0.0;
        }
    }
    // matrix 2: its term order differs from the union, so look each term up by name
    matrices[1]._termIndex = new List<string>(termIndex);
    matrices[1]._termIndexLookup = new Dictionary<string,int>(termIndexLookup);
    for (int i = 0; i < matrices[1].NumDocs; i++)
    {
        matrices[1]._matrix[i] = new double[termIndex.Count];
        // fill in values
        for (int j = 0; j < termIndex.Count; j++)
        {
            if (matrix2.ContainsTerm(termIndex[j]))
            {
                matrices[1][i, j] = matrix2.GetValue(matrix2.GetDocumentName(i), termIndex[j]);
            }
            else
            {
                matrices[1][i, j] = 0.0;
            }
        }
    }
    // return
    return matrices;
}
/// <summary>
/// Deep copy constructor: clones the weight matrix and both index structures
/// so the copy shares no mutable state with the original.
/// </summary>
/// <param name="matrix">Object to be copied</param>
public TermDocumentMatrix(TermDocumentMatrix matrix)
{
    _matrix = new double[matrix.NumDocs][];
    for (int doc = 0; doc < matrix.NumDocs; doc++)
    {
        _matrix[doc] = new double[matrix.NumTerms];
        for (int term = 0; term < matrix.NumTerms; term++)
        {
            _matrix[doc][term] = matrix[doc, term];
        }
    }
    // rebuild the index collections from the source so they are independent copies
    _docIndex = new List<string>(matrix._docIndex);
    _docIndexLookup = new Dictionary<string, int>(matrix._docIndexLookup);
    _termIndex = new List<string>(matrix._termIndex);
    _termIndexLookup = new Dictionary<string, int>(matrix._termIndexLookup);
}
/// <summary>
/// Recreates each matrix with documents containing missing terms.
/// List[0] : matrix 1
/// List[1] : matrix 2
/// </summary>
/// <param name="matrix1">First term-by-document matrix</param>
/// <param name="matrix2">Second term-by-document matrix</param>
/// <returns>Copies of original matrices with missing terms from each</returns>
public static List<TermDocumentMatrix> Equalize(TermDocumentMatrix matrix1, TermDocumentMatrix matrix2)
{
    // initialize matrices
    List<TermDocumentMatrix> matrices = new List<TermDocumentMatrix>();
    // matrix 1: copy the document index now; rows are allocated per-document below
    matrices.Add(new TermDocumentMatrix());
    matrices[0]._matrix = new double[matrix1.NumDocs][];
    matrices[0]._docIndex = new List<string>(matrix1._docIndex);
    matrices[0]._docIndexLookup = new Dictionary<string, int>(matrix1._docIndexLookup);
    // matrix 2
    matrices.Add(new TermDocumentMatrix());
    matrices[1]._matrix = new double[matrix2.NumDocs][];
    matrices[1]._docIndex = new List<string>(matrix2._docIndex);
    matrices[1]._docIndexLookup = new Dictionary<string, int>(matrix2._docIndexLookup);
    // compute term set (union); "leftovers" ends up holding only terms unique to matrix2
    List<string> termIndex = new List<string>();
    Dictionary<string, int> termIndexLookup = new Dictionary<string, int>();
    Dictionary<string, int> leftovers = new Dictionary<string, int>(matrix2._termIndexLookup);
    // get all terms in first matrix
    foreach (string term in matrix1._termIndex)
    {
        termIndex.Add(term);
        termIndexLookup.Add(term, termIndex.Count - 1);
        // remove duplicate terms
        if (matrix2._termIndexLookup.ContainsKey(term))
        {
            leftovers.Remove(term);
        }
    }
    // add leftovers (terms found only in matrix2) after matrix1's terms
    foreach (string term in leftovers.Keys)
    {
        termIndex.Add(term);
        termIndexLookup.Add(term, termIndex.Count - 1);
    }
    // create new term distributions for each document
    // matrix 1: its terms occupy union indices 0..matrix1.NumTerms-1 in order, so copy by position
    matrices[0]._termIndex = new List<string>(termIndex);
    matrices[0]._termIndexLookup = new Dictionary<string, int>(termIndexLookup);
    for (int i = 0; i < matrices[0].NumDocs; i++)
    {
        matrices[0]._matrix[i] = new double[termIndex.Count];
        // fill in original values
        for (int j = 0; j < matrix1.NumTerms; j++)
        {
            matrices[0][i, j] = matrix1[i, j];
        }
        // fill in missing terms
        for (int j = matrix1.NumTerms; j < termIndex.Count; j++)
        {
            matrices[0][i, j] = 0.0;
        }
    }
    // matrix 2: its term order differs from the union, so look each term up by name
    matrices[1]._termIndex = new List<string>(termIndex);
    matrices[1]._termIndexLookup = new Dictionary<string, int>(termIndexLookup);
    for (int i = 0; i < matrices[1].NumDocs; i++)
    {
        matrices[1]._matrix[i] = new double[termIndex.Count];
        // fill in values
        for (int j = 0; j < termIndex.Count; j++)
        {
            if (matrix2.ContainsTerm(termIndex[j]))
            {
                matrices[1][i, j] = matrix2.GetValue(matrix2.GetDocumentName(i), termIndex[j]);
            }
            else
            {
                matrices[1][i, j] = 0.0;
            }
        }
    }
    // return
    return(matrices);
}