/// <summary>
/// Saves matrix to file.
/// The first line lists every term, each one preceded by <c>IODelimeter</c>.
/// Each following line holds one document: its name followed by one
/// delimiter-prefixed value per term.
/// </summary>
/// <param name="matrix">Term-by-document matrix</param>
/// <param name="filename">File location (opened with FileMode.Create)</param>
public static void Save(TermDocumentMatrix matrix, string filename)
{
    // "using" releases the file handle even when one of the writes throws;
    // the previous explicit Close() was skipped on any exception.
    using (TextWriter tw = new StreamWriter(File.Open(filename, FileMode.Create)))
    {
        // print out term list
        foreach (string term in matrix.TermMap)
        {
            tw.Write("{0}{1}", TermDocumentMatrix.IODelimeter, term);
        }
        tw.WriteLine();

        // print out each document
        for (int i = 0; i < matrix.NumDocs; i++)
        {
            tw.Write(matrix.GetDocumentName(i));
            // print out each term value (formatted with the current culture, as before)
            for (int j = 0; j < matrix.NumTerms; j++)
            {
                tw.Write("{0}{1}", TermDocumentMatrix.IODelimeter, matrix[i, j]);
            }
            tw.WriteLine();
        }

        tw.Flush();
    }
}
/// <summary>
/// Reads a matrix file with Load and returns its transpose: the terms of the
/// loaded matrix become the documents of the result, and its documents become the terms.
/// </summary>
/// <param name="filename">Input file</param>
/// <returns>Term-by-document matrix</returns>
public static TermDocumentMatrix LoadTransposed(string filename)
{
    TermDocumentMatrix loaded = Load(filename);
    int rowCount = loaded.NumTerms;
    int columnCount = loaded.NumDocs;

    TermDocumentMatrix transposed = new TermDocumentMatrix();

    // rows of the result: one per term of the loaded matrix
    transposed._matrix = new double[rowCount][];
    transposed._docIndex = new List<string>();
    transposed._docIndexLookup = new Dictionary<string, int>();
    for (int row = 0; row < rowCount; row++)
    {
        string rowName = loaded._termIndex[row];
        transposed._matrix[row] = new double[columnCount];
        transposed._docIndex.Add(rowName);
        transposed._docIndexLookup.Add(rowName, row);
    }

    // columns of the result: one per document of the loaded matrix
    transposed._termIndex = new List<string>();
    transposed._termIndexLookup = new Dictionary<string, int>();
    for (int column = 0; column < columnCount; column++)
    {
        string columnName = loaded._docIndex[column];
        transposed._termIndex.Add(columnName);
        transposed._termIndexLookup.Add(columnName, column);
    }

    // copy values with swapped indices
    for (int row = 0; row < rowCount; row++)
    {
        double[] target = transposed._matrix[row];
        for (int column = 0; column < columnCount; column++)
        {
            target[column] = loaded._matrix[column][row];
        }
    }

    return transposed;
}
/// <summary>
/// Builds a corpus from two existing matrices. The document maps of both
/// inputs are recorded and the matrices are merged with TermDocumentMatrix.Combine.
/// </summary>
/// <param name="name">Corpus name</param>
/// <param name="source">Source matrix</param>
/// <param name="target">Target matrix</param>
public LDACorpus(string name, TermDocumentMatrix source, TermDocumentMatrix target)
{
    this.Name = name;

    // remember which documents came from which side
    this._sourceDocs = source.DocMap;
    this._targetDocs = target.DocMap;

    // single matrix holding source documents followed by target documents
    TermDocumentMatrix merged = TermDocumentMatrix.Combine(source, target);
    this._matrix = merged;
}
/// <summary>
/// Merges two TermDocumentMatrices into one matrix over the union of their terms.
/// Documents of the first matrix come first (ie. newmatrix[0] = matrix1[0]),
/// followed by the documents of the second matrix. A term that the originating
/// matrix does not contain is given the value 0.0.
/// </summary>
/// <param name="matrix1">First matrix</param>
/// <param name="matrix2">Second matrix</param>
/// <returns>New matrix holding the documents of both inputs</returns>
public static TermDocumentMatrix Combine(TermDocumentMatrix matrix1, TermDocumentMatrix matrix2)
{
    TermDocumentMatrix merged = new TermDocumentMatrix();

    // documents: copy matrix1's, then append matrix2's
    merged._docIndex = new List<string>(matrix1._docIndex);
    merged._docIndexLookup = new Dictionary<string, int>(matrix1._docIndexLookup);
    foreach (string docName in matrix2.DocMap)
    {
        merged._docIndex.Add(docName);
        merged._docIndexLookup.Add(docName, merged.NumDocs - 1);
    }

    // terms: copy matrix1's, then append the ones only matrix2 has
    merged._termIndex = new List<string>(matrix1._termIndex);
    merged._termIndexLookup = new Dictionary<string, int>(matrix1._termIndexLookup);
    foreach (string termName in matrix2.TermMap)
    {
        if (merged._termIndexLookup.ContainsKey(termName))
        {
            continue;
        }
        merged._termIndex.Add(termName);
        merged._termIndexLookup.Add(termName, merged.NumTerms - 1);
    }

    merged._matrix = new double[merged.NumDocs][];
    int firstCount = matrix1.NumDocs;

    // rows taken from matrix1
    for (int row = 0; row < firstCount; row++)
    {
        merged._matrix[row] = new double[merged.NumTerms];
        for (int col = 0; col < merged.NumTerms; col++)
        {
            string columnTerm = merged.TermMap[col];
            merged[row, col] = matrix1.ContainsTerm(columnTerm)
                ? matrix1[row, matrix1.GetTermIndex(columnTerm)]
                : 0.0;
        }
    }

    // rows taken from matrix2 (offset by the number of matrix1 documents)
    for (int row = firstCount; row < merged.NumDocs; row++)
    {
        merged._matrix[row] = new double[merged.NumTerms];
        for (int col = 0; col < merged.NumTerms; col++)
        {
            string columnTerm = merged.TermMap[col];
            merged[row, col] = matrix2.ContainsTerm(columnTerm)
                ? matrix2[row - firstCount, matrix2.GetTermIndex(columnTerm)]
                : 0.0;
        }
    }

    return merged;
}
/// <summary>
/// Builds a corpus from two artifact collections. Each collection is first
/// turned into its own TermDocumentMatrix; the document maps of those matrices
/// are recorded and the two are merged with TermDocumentMatrix.Combine.
/// </summary>
/// <param name="name">Corpus name</param>
/// <param name="source">Source artifacts</param>
/// <param name="target">Target artifacts</param>
public LDACorpus(string name, TLArtifactsCollection source, TLArtifactsCollection target)
{
    this.Name = name;

    // one matrix per artifact collection
    TermDocumentMatrix sourceMatrix = new TermDocumentMatrix(source);
    TermDocumentMatrix targetMatrix = new TermDocumentMatrix(target);

    this._sourceDocs = sourceMatrix.DocMap;
    this._targetDocs = targetMatrix.DocMap;

    // single matrix holding source documents followed by target documents
    this._matrix = TermDocumentMatrix.Combine(sourceMatrix, targetMatrix);
}
/// <summary>
/// Loads "SourceArtifacts" and "TargetArtifacts" from the workspace, builds one
/// term-by-document matrix from both, runs SmoothingFilter over it twice (first with
/// the source keys, then with the target keys), computes cosine similarities between
/// source and target keys, and stores the result in the workspace as "Similarities".
/// </summary>
public override void Compute()
{
    TLArtifactsCollection sources = (TLArtifactsCollection)Workspace.Load("SourceArtifacts");
    TLArtifactsCollection targets = (TLArtifactsCollection)Workspace.Load("TargetArtifacts");

    // the second smoothing pass operates on the output of the first
    TermDocumentMatrix combined = new TermDocumentMatrix(sources, targets);
    TermDocumentMatrix smoothedBySource = SmoothingFilter.Compute(combined, sources.Keys);
    TermDocumentMatrix smoothedByBoth = SmoothingFilter.Compute(smoothedBySource, targets.Keys);

    TLSimilarityMatrix similarities = SimilarityUtil.ComputeCosine(smoothedByBoth, sources.Keys, targets.Keys);
    Workspace.Store("Similarities", similarities);
}
/// <summary>
/// Deep copy constructor. Every value is read from the other matrix and written into
/// newly allocated rows; the document and term index lists and lookups are copied
/// into new collections.
/// </summary>
/// <param name="matrix">Object to be copied</param>
public TermDocumentMatrix(TermDocumentMatrix matrix)
{
    int docCount = matrix.NumDocs;
    int termCount = matrix.NumTerms;

    // values: a fresh row per document
    _matrix = new double[docCount][];
    for (int d = 0; d < docCount; d++)
    {
        double[] row = new double[termCount];
        for (int t = 0; t < termCount; t++)
        {
            row[t] = matrix[d, t];
        }
        _matrix[d] = row;
    }

    // index structures: new containers with the same entries
    _docIndex = new List<string>(matrix._docIndex);
    _docIndexLookup = new Dictionary<string, int>(matrix._docIndexLookup);
    _termIndex = new List<string>(matrix._termIndex);
    _termIndexLookup = new Dictionary<string, int>(matrix._termIndexLookup);
}
/// <summary>
/// Takes the two specified documents and creates two new document vectors with the missing terms from each.
/// Row 0: document 1
/// Row 1: document 2
/// The resulting term list is matrix1's terms in their original order, followed by
/// the terms found only in matrix2, in matrix2's term order.
/// </summary>
/// <param name="matrix1">document1 container</param>
/// <param name="document1">document1 index</param>
/// <param name="matrix2">document2 container</param>
/// <param name="document2">document2 index</param>
/// <returns>New term-by-document matrix containing the two documents and their term maps</returns>
public static TermDocumentMatrix EqualizeDocuments(TermDocumentMatrix matrix1, int document1, TermDocumentMatrix matrix2, int document2)
{
    // initialize new TermDocumentMatrix
    TermDocumentMatrix newmatrix = new TermDocumentMatrix();
    newmatrix._matrix = new double[2][];
    newmatrix._termIndex = new List<string>();
    newmatrix._termIndexLookup = new Dictionary<string, int>();
    newmatrix._docIndex = new List<string>();
    newmatrix._docIndexLookup = new Dictionary<string, int>();

    // NOTE(review): Add throws ArgumentException when both documents share a name,
    // exactly as the previous implementation did.
    string name1 = matrix1.GetDocumentName(document1);
    newmatrix._docIndex.Add(name1);
    newmatrix._docIndexLookup.Add(name1, newmatrix._docIndex.Count - 1);
    string name2 = matrix2.GetDocumentName(document2);
    newmatrix._docIndex.Add(name2);
    newmatrix._docIndexLookup.Add(name2, newmatrix._docIndex.Count - 1);

    List<double> doc1 = new List<double>();
    List<double> doc2 = new List<double>();

    // every term of matrix1; document 2 gets 0.0 where matrix2 lacks the term
    foreach (string term in matrix1._termIndex)
    {
        newmatrix._termIndex.Add(term);
        newmatrix._termIndexLookup.Add(term, newmatrix._termIndex.Count - 1);
        doc1.Add(matrix1.GetValue(document1, matrix1.GetTermIndex(term)));
        if (matrix2._termIndexLookup.ContainsKey(term))
        {
            doc2.Add(matrix2.GetValue(document2, matrix2.GetTermIndex(term)));
        }
        else
        {
            doc2.Add(0.0);
        }
    }

    // Terms only matrix2 has. Walking the ordered term list (instead of enumerating a
    // Dictionary copy after removals) gives a defined, repeatable column order.
    // Assumes matrix2._termIndex and matrix2._termIndexLookup hold the same terms.
    foreach (string term in matrix2._termIndex)
    {
        if (newmatrix._termIndexLookup.ContainsKey(term))
        {
            continue;
        }
        newmatrix._termIndex.Add(term);
        newmatrix._termIndexLookup.Add(term, newmatrix._termIndex.Count - 1);
        doc1.Add(0.0);
        doc2.Add(matrix2.GetValue(document2, matrix2.GetTermIndex(term)));
    }

    newmatrix._matrix[0] = doc1.ToArray();
    newmatrix._matrix[1] = doc2.ToArray();
    return newmatrix;
}
/// <summary>
/// Builds a matrix from "target.txt" in the simple corpus directory and compares it
/// against the stored answer in "TermDocumentMatrix/output.txt": document and term
/// counts, term names, document names, row lengths and every value (delta 0.0).
/// </summary>
public void ConstructorTest_Artifacts()
{
    string inputData = Settings.Default.SimpleCorpusDir;
    string outputData = Path.Combine(inputData, "TermDocumentMatrix");
    TermDocumentMatrix matrix = new TermDocumentMatrix(Artifacts.ImportFile(Path.Combine(inputData, "target.txt")));
    TermDocumentMatrix answer = TermDocumentMatrix.Load(Path.Combine(outputData, "output.txt"));

    // Assert.AreEqual takes (expected, actual); the stored answer is the expected side,
    // so that failure messages report the values the right way round.

    // counts
    Assert.AreEqual(answer.NumDocs, matrix.NumDocs);
    Assert.AreEqual(answer.NumTerms, matrix.NumTerms);

    // term names do not depend on the document, so check them once
    for (int j = 0; j < answer.NumTerms; j++)
    {
        Assert.AreEqual(answer.GetTermName(j), matrix.GetTermName(j));
    }

    // matrix
    for (int i = 0; i < answer.NumDocs; i++)
    {
        Assert.AreEqual(answer.GetDocumentName(i), matrix.GetDocumentName(i));
        Assert.AreEqual(answer.NumTerms, matrix.GetDocument(i).Length);
        for (int j = 0; j < answer.NumTerms; j++)
        {
            Assert.AreEqual(answer[i, j], matrix[i, j], 0.0);
        }
    }
}
/// <summary>
/// Saves matrix to file.
/// The first line lists every term, each one preceded by <c>IODelimeter</c>.
/// Each following line holds one document: its name followed by one
/// delimiter-prefixed value per term.
/// </summary>
/// <param name="matrix">Term-by-document matrix</param>
/// <param name="filename">File location (opened with FileMode.Create)</param>
public static void Save(TermDocumentMatrix matrix, string filename)
{
    // "using" releases the file handle even when one of the writes throws;
    // the previous explicit Close() was skipped on any exception.
    using (TextWriter tw = new StreamWriter(File.Open(filename, FileMode.Create)))
    {
        // print out term list
        foreach (string term in matrix.TermMap)
        {
            tw.Write("{0}{1}", TermDocumentMatrix.IODelimeter, term);
        }
        tw.WriteLine();

        // print out each document
        for (int i = 0; i < matrix.NumDocs; i++)
        {
            tw.Write(matrix.GetDocumentName(i));
            // print out each term value (formatted with the current culture, as before)
            for (int j = 0; j < matrix.NumTerms; j++)
            {
                tw.Write("{0}{1}", TermDocumentMatrix.IODelimeter, matrix[i, j]);
            }
            tw.WriteLine();
        }

        tw.Flush();
    }
}
/// <summary>
/// Loads a previously saved TermDocumentMatrix from disk.
/// The first line is split into term names; every following line must contain a
/// document name followed by exactly one value per term. Fields are separated by
/// <c>IODelimeter</c>, a space or a tab. A term name that repeats an earlier one
/// gets random digits appended until it is unique.
/// </summary>
/// <param name="filename">File location</param>
/// <returns>Term-by-document matrix</returns>
/// <exception cref="InvalidDataException">
/// The file is empty, or a document line does not have one value per term.
/// </exception>
public static TermDocumentMatrix Load(string filename)
{
    TermDocumentMatrix matrix = new TermDocumentMatrix();
    string[] delimeter = new string[] { TermDocumentMatrix.IODelimeter, " ", "\t" };
    List<double[]> docs = new List<double[]>();

    // "using" closes the reader on every exit path, including parse failures
    // (Convert.ToDouble) and duplicate document names (Dictionary.Add).
    using (TextReader tr = new StreamReader(File.OpenRead(filename)))
    {
        int lineNum = 1;
        string line = tr.ReadLine();
        if (line == null)
        {
            // empty file: report it as a format problem rather than failing on a null line
            throw new InvalidDataException("Incorrect data format on line " + lineNum + " in file: " + filename);
        }

        // read terms
        List<string> termList = new List<string>(line.Split(delimeter, StringSplitOptions.RemoveEmptyEntries));

        // make identical terms unique; one Random instance serves all collisions
        HashSet<string> termSet = new HashSet<string>();
        Random suffixSource = new Random();
        for (int i = 0; i < termList.Count; i++)
        {
            // HashSet.Add returns false while the (possibly already suffixed) name is taken
            while (!termSet.Add(termList[i]))
            {
                termList[i] = termList[i] + suffixSource.Next();
            }
        }

        // add to matrix
        matrix._termIndex = termList;
        matrix._termIndexLookup = new Dictionary<string, int>();
        for (int i = 0; i < matrix._termIndex.Count; i++)
        {
            matrix._termIndexLookup.Add(matrix._termIndex[i], i);
        }

        // read documents
        matrix._docIndex = new List<string>();
        matrix._docIndexLookup = new Dictionary<string, int>();
        while ((line = tr.ReadLine()) != null)
        {
            lineNum++;
            string[] document = line.Split(delimeter, StringSplitOptions.RemoveEmptyEntries);
            if (document.Length != matrix.NumTerms + 1)
            {
                throw new InvalidDataException("Incorrect data format on line " + lineNum + " in file: " + filename);
            }
            matrix._docIndex.Add(document[0]);
            matrix._docIndexLookup.Add(document[0], matrix._docIndex.Count - 1);

            // field 0 is the document name; values start at field 1
            double[] doc = new double[matrix.NumTerms];
            for (int i = 1; i < document.Length; i++)
            {
                doc[i - 1] = Convert.ToDouble(document[i]);
            }
            docs.Add(doc);
        }
    }

    // add documents
    matrix._matrix = new double[matrix.NumDocs][];
    for (int i = 0; i < matrix.NumDocs; i++)
    {
        matrix._matrix[i] = new double[matrix.NumTerms];
        for (int j = 0; j < matrix.NumTerms; j++)
        {
            matrix[i, j] = docs[i][j];
        }
    }

    return matrix;
}
/// <summary>
/// Builds a corpus around an already prepared matrix. The matrix and both id
/// collections are stored as given; nothing is copied or combined here.
/// </summary>
/// <param name="name">Corpus name</param>
/// <param name="matrix">Input matrix</param>
/// <param name="sourceIDs">Collection of source artifacts ids</param>
/// <param name="targetIDs">Collection of target artifacts ids</param>
public LDACorpus(string name, TermDocumentMatrix matrix, IEnumerable<string> sourceIDs, IEnumerable<string> targetIDs)
{
    this.Name = name;

    // references are kept, not enumerated
    this._sourceDocs = sourceIDs;
    this._targetDocs = targetIDs;

    this._matrix = matrix;
}
/// <summary>
/// Merges two TermDocumentMatrices into one matrix over the union of their terms.
/// Documents of the first matrix come first (ie. newmatrix[0] = matrix1[0]),
/// followed by the documents of the second matrix. A term that the originating
/// matrix does not contain is given the value 0.0.
/// </summary>
/// <param name="matrix1">First matrix</param>
/// <param name="matrix2">Second matrix</param>
/// <returns>New matrix holding the documents of both inputs</returns>
public static TermDocumentMatrix Combine(TermDocumentMatrix matrix1, TermDocumentMatrix matrix2)
{
    TermDocumentMatrix merged = new TermDocumentMatrix();

    // documents: copy matrix1's, then append matrix2's
    merged._docIndex = new List<string>(matrix1._docIndex);
    merged._docIndexLookup = new Dictionary<string, int>(matrix1._docIndexLookup);
    foreach (string docName in matrix2.DocMap)
    {
        merged._docIndex.Add(docName);
        merged._docIndexLookup.Add(docName, merged.NumDocs - 1);
    }

    // terms: copy matrix1's, then append the ones only matrix2 has
    merged._termIndex = new List<string>(matrix1._termIndex);
    merged._termIndexLookup = new Dictionary<string, int>(matrix1._termIndexLookup);
    foreach (string termName in matrix2.TermMap)
    {
        if (merged._termIndexLookup.ContainsKey(termName))
        {
            continue;
        }
        merged._termIndex.Add(termName);
        merged._termIndexLookup.Add(termName, merged.NumTerms - 1);
    }

    merged._matrix = new double[merged.NumDocs][];
    int firstCount = matrix1.NumDocs;

    // rows taken from matrix1
    for (int row = 0; row < firstCount; row++)
    {
        merged._matrix[row] = new double[merged.NumTerms];
        for (int col = 0; col < merged.NumTerms; col++)
        {
            string columnTerm = merged.TermMap[col];
            merged[row, col] = matrix1.ContainsTerm(columnTerm)
                ? matrix1[row, matrix1.GetTermIndex(columnTerm)]
                : 0.0;
        }
    }

    // rows taken from matrix2 (offset by the number of matrix1 documents)
    for (int row = firstCount; row < merged.NumDocs; row++)
    {
        merged._matrix[row] = new double[merged.NumTerms];
        for (int col = 0; col < merged.NumTerms; col++)
        {
            string columnTerm = merged.TermMap[col];
            merged[row, col] = matrix2.ContainsTerm(columnTerm)
                ? matrix2[row - firstCount, matrix2.GetTermIndex(columnTerm)]
                : 0.0;
        }
    }

    return merged;
}
/// <summary>
/// Deep copy constructor. Every value is read from the other matrix and written into
/// newly allocated rows; the document and term index lists and lookups are copied
/// into new collections.
/// </summary>
/// <param name="matrix">Object to be copied</param>
public TermDocumentMatrix(TermDocumentMatrix matrix)
{
    int docCount = matrix.NumDocs;
    int termCount = matrix.NumTerms;

    // values: a fresh row per document
    _matrix = new double[docCount][];
    for (int d = 0; d < docCount; d++)
    {
        double[] row = new double[termCount];
        for (int t = 0; t < termCount; t++)
        {
            row[t] = matrix[d, t];
        }
        _matrix[d] = row;
    }

    // index structures: new containers with the same entries
    _docIndex = new List<string>(matrix._docIndex);
    _docIndexLookup = new Dictionary<string, int>(matrix._docIndexLookup);
    _termIndex = new List<string>(matrix._termIndex);
    _termIndexLookup = new Dictionary<string, int>(matrix._termIndexLookup);
}
/// <summary>
/// Recreates each matrix with documents containing missing terms.
/// List[0] : matrix 1
/// List[1] : matrix 2
/// Both results share one term list: matrix1's terms in their original order,
/// followed by the terms found only in matrix2, in matrix2's term order.
/// </summary>
/// <param name="matrix1">First term-by-document matrix</param>
/// <param name="matrix2">Second term-by-document matrix</param>
/// <returns>Copies of original matrices with missing terms from each</returns>
public static List<TermDocumentMatrix> Equalize(TermDocumentMatrix matrix1, TermDocumentMatrix matrix2)
{
    // initialize matrices
    List<TermDocumentMatrix> matrices = new List<TermDocumentMatrix>();
    // matrix 1
    matrices.Add(new TermDocumentMatrix());
    matrices[0]._matrix = new double[matrix1.NumDocs][];
    matrices[0]._docIndex = new List<string>(matrix1._docIndex);
    matrices[0]._docIndexLookup = new Dictionary<string, int>(matrix1._docIndexLookup);
    // matrix 2
    matrices.Add(new TermDocumentMatrix());
    matrices[1]._matrix = new double[matrix2.NumDocs][];
    matrices[1]._docIndex = new List<string>(matrix2._docIndex);
    matrices[1]._docIndexLookup = new Dictionary<string, int>(matrix2._docIndexLookup);

    // compute term set
    List<string> termIndex = new List<string>();
    Dictionary<string, int> termIndexLookup = new Dictionary<string, int>();
    // all terms of the first matrix, in order
    foreach (string term in matrix1._termIndex)
    {
        termIndex.Add(term);
        termIndexLookup.Add(term, termIndex.Count - 1);
    }
    // Terms only matrix2 has. Walking the ordered term list (instead of enumerating a
    // Dictionary copy after removals) gives a defined, repeatable column order.
    // Assumes matrix2._termIndex and matrix2._termIndexLookup hold the same terms.
    foreach (string term in matrix2._termIndex)
    {
        if (!termIndexLookup.ContainsKey(term))
        {
            termIndex.Add(term);
            termIndexLookup.Add(term, termIndex.Count - 1);
        }
    }

    // create new term distributions for each document
    // matrix 1
    matrices[0]._termIndex = new List<string>(termIndex);
    matrices[0]._termIndexLookup = new Dictionary<string, int>(termIndexLookup);
    for (int i = 0; i < matrices[0].NumDocs; i++)
    {
        matrices[0]._matrix[i] = new double[termIndex.Count];
        // the first matrix1.NumTerms columns are matrix1's own terms in the same order,
        // so the original values can be copied by position
        for (int j = 0; j < matrix1.NumTerms; j++)
        {
            matrices[0][i, j] = matrix1[i, j];
        }
        // fill in missing terms
        for (int j = matrix1.NumTerms; j < termIndex.Count; j++)
        {
            matrices[0][i, j] = 0.0;
        }
    }

    // matrix 2
    matrices[1]._termIndex = new List<string>(termIndex);
    matrices[1]._termIndexLookup = new Dictionary<string, int>(termIndexLookup);
    for (int i = 0; i < matrices[1].NumDocs; i++)
    {
        matrices[1]._matrix[i] = new double[termIndex.Count];
        // the document name is the same for every column of this row
        string docName = matrix2.GetDocumentName(i);
        // matrix2's columns are in a different order, so values are looked up by name
        for (int j = 0; j < termIndex.Count; j++)
        {
            if (matrix2.ContainsTerm(termIndex[j]))
            {
                matrices[1][i, j] = matrix2.GetValue(docName, termIndex[j]);
            }
            else
            {
                matrices[1][i, j] = 0.0;
            }
        }
    }

    return matrices;
}
/// <summary>
/// Name-based overload: resolves each artifact ID to its document index in the
/// corresponding matrix and forwards to the index-based EqualizeDocuments.
/// </summary>
/// <param name="matrix1">artifact1 container</param>
/// <param name="artifact1">artifact1 ID</param>
/// <param name="matrix2">artifact2 container</param>
/// <param name="artifact2">artifact2 ID</param>
/// <returns>New term-by-document matrix containing the two documents and their term maps</returns>
public static TermDocumentMatrix EqualizeDocuments(TermDocumentMatrix matrix1, string artifact1, TermDocumentMatrix matrix2, string artifact2)
{
    // resolve in the same order the arguments were evaluated before
    int firstIndex = matrix1.GetDocumentIndex(artifact1);
    int secondIndex = matrix2.GetDocumentIndex(artifact2);

    return EqualizeDocuments(matrix1, firstIndex, matrix2, secondIndex);
}
/// <summary>
/// Recreates each matrix with documents containing missing terms.
/// List[0] : matrix 1
/// List[1] : matrix 2
/// Both results share one term list: matrix1's terms in their original order,
/// followed by the terms found only in matrix2, in matrix2's term order.
/// </summary>
/// <param name="matrix1">First term-by-document matrix</param>
/// <param name="matrix2">Second term-by-document matrix</param>
/// <returns>Copies of original matrices with missing terms from each</returns>
public static List<TermDocumentMatrix> Equalize(TermDocumentMatrix matrix1, TermDocumentMatrix matrix2)
{
    // initialize matrices
    List<TermDocumentMatrix> matrices = new List<TermDocumentMatrix>();
    // matrix 1
    matrices.Add(new TermDocumentMatrix());
    matrices[0]._matrix = new double[matrix1.NumDocs][];
    matrices[0]._docIndex = new List<string>(matrix1._docIndex);
    matrices[0]._docIndexLookup = new Dictionary<string, int>(matrix1._docIndexLookup);
    // matrix 2
    matrices.Add(new TermDocumentMatrix());
    matrices[1]._matrix = new double[matrix2.NumDocs][];
    matrices[1]._docIndex = new List<string>(matrix2._docIndex);
    matrices[1]._docIndexLookup = new Dictionary<string, int>(matrix2._docIndexLookup);

    // compute term set
    List<string> termIndex = new List<string>();
    Dictionary<string, int> termIndexLookup = new Dictionary<string, int>();
    // all terms of the first matrix, in order
    foreach (string term in matrix1._termIndex)
    {
        termIndex.Add(term);
        termIndexLookup.Add(term, termIndex.Count - 1);
    }
    // Terms only matrix2 has. Walking the ordered term list (instead of enumerating a
    // Dictionary copy after removals) gives a defined, repeatable column order.
    // Assumes matrix2._termIndex and matrix2._termIndexLookup hold the same terms.
    foreach (string term in matrix2._termIndex)
    {
        if (!termIndexLookup.ContainsKey(term))
        {
            termIndex.Add(term);
            termIndexLookup.Add(term, termIndex.Count - 1);
        }
    }

    // create new term distributions for each document
    // matrix 1
    matrices[0]._termIndex = new List<string>(termIndex);
    matrices[0]._termIndexLookup = new Dictionary<string, int>(termIndexLookup);
    for (int i = 0; i < matrices[0].NumDocs; i++)
    {
        matrices[0]._matrix[i] = new double[termIndex.Count];
        // the first matrix1.NumTerms columns are matrix1's own terms in the same order,
        // so the original values can be copied by position
        for (int j = 0; j < matrix1.NumTerms; j++)
        {
            matrices[0][i, j] = matrix1[i, j];
        }
        // fill in missing terms
        for (int j = matrix1.NumTerms; j < termIndex.Count; j++)
        {
            matrices[0][i, j] = 0.0;
        }
    }

    // matrix 2
    matrices[1]._termIndex = new List<string>(termIndex);
    matrices[1]._termIndexLookup = new Dictionary<string, int>(termIndexLookup);
    for (int i = 0; i < matrices[1].NumDocs; i++)
    {
        matrices[1]._matrix[i] = new double[termIndex.Count];
        // the document name is the same for every column of this row
        string docName = matrix2.GetDocumentName(i);
        // matrix2's columns are in a different order, so values are looked up by name
        for (int j = 0; j < termIndex.Count; j++)
        {
            if (matrix2.ContainsTerm(termIndex[j]))
            {
                matrices[1][i, j] = matrix2.GetValue(docName, termIndex[j]);
            }
            else
            {
                matrices[1][i, j] = 0.0;
            }
        }
    }

    return matrices;
}
/// <summary>
/// Takes the two specified documents and creates two new document vectors with the missing terms from each.
/// Row 0: document 1
/// Row 1: document 2
/// The resulting term list is matrix1's terms in their original order, followed by
/// the terms found only in matrix2, in matrix2's term order.
/// </summary>
/// <param name="matrix1">document1 container</param>
/// <param name="document1">document1 index</param>
/// <param name="matrix2">document2 container</param>
/// <param name="document2">document2 index</param>
/// <returns>New term-by-document matrix containing the two documents and their term maps</returns>
public static TermDocumentMatrix EqualizeDocuments(TermDocumentMatrix matrix1, int document1, TermDocumentMatrix matrix2, int document2)
{
    // initialize new TermDocumentMatrix
    TermDocumentMatrix newmatrix = new TermDocumentMatrix();
    newmatrix._matrix = new double[2][];
    newmatrix._termIndex = new List<string>();
    newmatrix._termIndexLookup = new Dictionary<string, int>();
    newmatrix._docIndex = new List<string>();
    newmatrix._docIndexLookup = new Dictionary<string, int>();

    // NOTE(review): Add throws ArgumentException when both documents share a name,
    // exactly as the previous implementation did.
    string name1 = matrix1.GetDocumentName(document1);
    newmatrix._docIndex.Add(name1);
    newmatrix._docIndexLookup.Add(name1, newmatrix._docIndex.Count - 1);
    string name2 = matrix2.GetDocumentName(document2);
    newmatrix._docIndex.Add(name2);
    newmatrix._docIndexLookup.Add(name2, newmatrix._docIndex.Count - 1);

    List<double> doc1 = new List<double>();
    List<double> doc2 = new List<double>();

    // every term of matrix1; document 2 gets 0.0 where matrix2 lacks the term
    foreach (string term in matrix1._termIndex)
    {
        newmatrix._termIndex.Add(term);
        newmatrix._termIndexLookup.Add(term, newmatrix._termIndex.Count - 1);
        doc1.Add(matrix1.GetValue(document1, matrix1.GetTermIndex(term)));
        if (matrix2._termIndexLookup.ContainsKey(term))
        {
            doc2.Add(matrix2.GetValue(document2, matrix2.GetTermIndex(term)));
        }
        else
        {
            doc2.Add(0.0);
        }
    }

    // Terms only matrix2 has. Walking the ordered term list (instead of enumerating a
    // Dictionary copy after removals) gives a defined, repeatable column order.
    // Assumes matrix2._termIndex and matrix2._termIndexLookup hold the same terms.
    foreach (string term in matrix2._termIndex)
    {
        if (newmatrix._termIndexLookup.ContainsKey(term))
        {
            continue;
        }
        newmatrix._termIndex.Add(term);
        newmatrix._termIndexLookup.Add(term, newmatrix._termIndex.Count - 1);
        doc1.Add(0.0);
        doc2.Add(matrix2.GetValue(document2, matrix2.GetTermIndex(term)));
    }

    newmatrix._matrix[0] = doc1.ToArray();
    newmatrix._matrix[1] = doc2.ToArray();
    return newmatrix;
}
/// <summary>
/// Name-based overload: resolves each artifact ID to its document index in the
/// corresponding matrix and forwards to the index-based EqualizeDocuments.
/// </summary>
/// <param name="matrix1">artifact1 container</param>
/// <param name="artifact1">artifact1 ID</param>
/// <param name="matrix2">artifact2 container</param>
/// <param name="artifact2">artifact2 ID</param>
/// <returns>New term-by-document matrix containing the two documents and their term maps</returns>
public static TermDocumentMatrix EqualizeDocuments(TermDocumentMatrix matrix1, string artifact1, TermDocumentMatrix matrix2, string artifact2)
{
    // resolve in the same order the arguments were evaluated before
    int firstIndex = matrix1.GetDocumentIndex(artifact1);
    int secondIndex = matrix2.GetDocumentIndex(artifact2);

    return EqualizeDocuments(matrix1, firstIndex, matrix2, secondIndex);
}
/// <summary>
/// Loads a previously saved TermDocumentMatrix from disk.
/// The first line is split into term names; every following line must contain a
/// document name followed by exactly one value per term. Fields are separated by
/// <c>IODelimeter</c>, a space or a tab. A term name that repeats an earlier one
/// gets random digits appended until it is unique.
/// </summary>
/// <param name="filename">File location</param>
/// <returns>Term-by-document matrix</returns>
/// <exception cref="InvalidDataException">
/// The file is empty, or a document line does not have one value per term.
/// </exception>
public static TermDocumentMatrix Load(string filename)
{
    TermDocumentMatrix matrix = new TermDocumentMatrix();
    string[] delimeter = new string[] { TermDocumentMatrix.IODelimeter, " ", "\t" };
    List<double[]> docs = new List<double[]>();

    // "using" closes the reader on every exit path, including parse failures
    // (Convert.ToDouble) and duplicate document names (Dictionary.Add).
    using (TextReader tr = new StreamReader(File.OpenRead(filename)))
    {
        int lineNum = 1;
        string line = tr.ReadLine();
        if (line == null)
        {
            // empty file: report it as a format problem rather than failing on a null line
            throw new InvalidDataException("Incorrect data format on line " + lineNum + " in file: " + filename);
        }

        // read terms
        List<string> termList = new List<string>(line.Split(delimeter, StringSplitOptions.RemoveEmptyEntries));

        // make identical terms unique; one Random instance serves all collisions
        HashSet<string> termSet = new HashSet<string>();
        Random suffixSource = new Random();
        for (int i = 0; i < termList.Count; i++)
        {
            // HashSet.Add returns false while the (possibly already suffixed) name is taken
            while (!termSet.Add(termList[i]))
            {
                termList[i] = termList[i] + suffixSource.Next();
            }
        }

        // add to matrix
        matrix._termIndex = termList;
        matrix._termIndexLookup = new Dictionary<string, int>();
        for (int i = 0; i < matrix._termIndex.Count; i++)
        {
            matrix._termIndexLookup.Add(matrix._termIndex[i], i);
        }

        // read documents
        matrix._docIndex = new List<string>();
        matrix._docIndexLookup = new Dictionary<string, int>();
        while ((line = tr.ReadLine()) != null)
        {
            lineNum++;
            string[] document = line.Split(delimeter, StringSplitOptions.RemoveEmptyEntries);
            if (document.Length != matrix.NumTerms + 1)
            {
                throw new InvalidDataException("Incorrect data format on line " + lineNum + " in file: " + filename);
            }
            matrix._docIndex.Add(document[0]);
            matrix._docIndexLookup.Add(document[0], matrix._docIndex.Count - 1);

            // field 0 is the document name; values start at field 1
            double[] doc = new double[matrix.NumTerms];
            for (int i = 1; i < document.Length; i++)
            {
                doc[i - 1] = Convert.ToDouble(document[i]);
            }
            docs.Add(doc);
        }
    }

    // add documents
    matrix._matrix = new double[matrix.NumDocs][];
    for (int i = 0; i < matrix.NumDocs; i++)
    {
        matrix._matrix[i] = new double[matrix.NumTerms];
        for (int j = 0; j < matrix.NumTerms; j++)
        {
            matrix[i, j] = docs[i][j];
        }
    }

    return matrix;
}
/// <summary>
/// Reads a matrix file with Load and returns its transpose: the terms of the
/// loaded matrix become the documents of the result, and its documents become the terms.
/// </summary>
/// <param name="filename">Input file</param>
/// <returns>Term-by-document matrix</returns>
public static TermDocumentMatrix LoadTransposed(string filename)
{
    TermDocumentMatrix loaded = Load(filename);
    int rowCount = loaded.NumTerms;
    int columnCount = loaded.NumDocs;

    TermDocumentMatrix transposed = new TermDocumentMatrix();

    // rows of the result: one per term of the loaded matrix
    transposed._matrix = new double[rowCount][];
    transposed._docIndex = new List<string>();
    transposed._docIndexLookup = new Dictionary<string, int>();
    for (int row = 0; row < rowCount; row++)
    {
        string rowName = loaded._termIndex[row];
        transposed._matrix[row] = new double[columnCount];
        transposed._docIndex.Add(rowName);
        transposed._docIndexLookup.Add(rowName, row);
    }

    // columns of the result: one per document of the loaded matrix
    transposed._termIndex = new List<string>();
    transposed._termIndexLookup = new Dictionary<string, int>();
    for (int column = 0; column < columnCount; column++)
    {
        string columnName = loaded._docIndex[column];
        transposed._termIndex.Add(columnName);
        transposed._termIndexLookup.Add(columnName, column);
    }

    // copy values with swapped indices
    for (int row = 0; row < rowCount; row++)
    {
        double[] target = transposed._matrix[row];
        for (int column = 0; column < columnCount; column++)
        {
            target[column] = loaded._matrix[column][row];
        }
    }

    return transposed;
}