Example #1
        /// <summary>
        /// Saves matrix to file
        /// </summary>
        /// <param name="matrix">Term-by-document matrix</param>
        /// <param name="filename">File location</param>
        public static void Save(TermDocumentMatrix matrix, string filename)
        {
            // attempt to create file
            TextWriter tw = new StreamWriter(File.Open(filename, FileMode.Create));

            // print out term list
            foreach (string term in matrix.TermMap)
            {
                tw.Write("{0}{1}", TermDocumentMatrix.IODelimeter, term);
            }
            tw.WriteLine();
            // print out each document
            for (int i = 0; i < matrix.NumDocs; i++)
            {
                tw.Write(matrix.GetDocumentName(i));
                // print out each term
                for (int j = 0; j < matrix.NumTerms; j++)
                {
                    tw.Write("{0}{1}", TermDocumentMatrix.IODelimeter, matrix[i, j]);
                }
                tw.WriteLine();
            }
            // close file
            tw.Flush();
            tw.Close();
        }
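Read together with Load (Example #9), Save defines a simple delimited text layout: the first line lists the full term index, and every following line holds a document name followed by that document's weight for each term, in term-index order. A minimal sketch of the layout, where <DELIM> stands for TermDocumentMatrix.IODelimeter (its actual value is not shown in these excerpts) and the terms and counts are made up for illustration:

    <DELIM>term1<DELIM>term2<DELIM>term3
    doc1.txt<DELIM>2<DELIM>0<DELIM>1
    doc2.txt<DELIM>0<DELIM>3<DELIM>0

Note that the header row begins with the delimiter itself; Load discards the resulting empty field via StringSplitOptions.RemoveEmptyEntries.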
Example #2
 /// <summary>
 /// Computes similarities between term-by-document matrices via the Vector Space Model
 /// using a tf-idf weighting scheme and cosine similarity.
 /// </summary>
 /// <param name="source">Source matrix</param>
 /// <param name="target">Target matrix</param>
 /// <returns>Similarity matrix</returns>
 public static TLSimilarityMatrix Compute(TermDocumentMatrix source, TermDocumentMatrix target)
 {
     TermDocumentMatrix IDs = ComputeIdentities(source);
     TermDocumentMatrix TF = ComputeTF(target);
     double[] IDF = ComputeIDF(ComputeDF(target), target.NumDocs);
     TermDocumentMatrix TFIDF = ComputeTFIDF(TF, IDF);
     return ComputeSimilarities(IDs, TFIDF);
 }
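A minimal usage sketch of the pipeline above, assuming the containing class is named VSM as the VSM.cs file name shown in Example #10 suggests; the file paths and variable names are hypothetical:

    // load two previously saved term-by-document matrices (see Load, Example #9)
    TermDocumentMatrix source = TermDocumentMatrix.Load("source.matrix.txt");
    TermDocumentMatrix target = TermDocumentMatrix.Load("target.matrix.txt");
    // source documents become boolean query vectors, target documents are tf-idf weighted,
    // and the result holds one cosine-similarity link per source/target document pair
    TLSimilarityMatrix sims = VSM.Compute(source, target);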
Example #3
        /// <summary>
        /// Computes similarities between term-by-document matrices via the Vector Space Model
        /// using a tf-idf weighting scheme and cosine similarity.
        /// </summary>
        /// <param name="source">Source matrix</param>
        /// <param name="target">Target matrix</param>
        /// <returns>Similarity matrix</returns>
        public static TLSimilarityMatrix Compute(TermDocumentMatrix source, TermDocumentMatrix target)
        {
            TermDocumentMatrix IDs = ComputeIdentities(source);
            TermDocumentMatrix TF  = ComputeTF(target);

            double[]           IDF   = ComputeIDF(ComputeDF(target), target.NumDocs);
            TermDocumentMatrix TFIDF = ComputeTFIDF(TF, IDF);

            return(ComputeSimilarities(IDs, TFIDF));
        }
Example #4
 /// <summary>
 /// Computes tf-idf weights
 /// </summary>
 /// <param name="tf">Term-frequency weighted matrix</param>
 /// <param name="idf">Inverse document frequencies vector</param>
 /// <returns>tf-idf weighted term-by-document matrix</returns>
 private static TermDocumentMatrix ComputeTFIDF(TermDocumentMatrix tf, double[] idf)
 {
     for (int i = 0; i < tf.NumDocs; i++)
     {
         for (int j = 0; j < tf.NumTerms; j++)
         {
             tf[i, j] = tf[i, j] * idf[j];
         }
     }
     return(tf);
 }
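In symbols, the loop above scales each term-frequency entry by the corresponding inverse document frequency:

    w_{i,j} = \mathrm{tf}_{i,j} \cdot \mathrm{idf}_j

where i indexes documents and j indexes terms.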
Example #5
 /// <summary>
 /// Computes boolean (0|1) terms in documents.
 /// </summary>
 /// <param name="matrix">Term-by-document matrix</param>
 /// <returns>Term-by-document matrix with 1s for terms that are in the document and 0s for terms that are not.</returns>
 private static TermDocumentMatrix ComputeIdentities(TermDocumentMatrix matrix)
 {
     for (int i = 0; i < matrix.NumDocs; i++)
     {
         for (int j = 0; j < matrix.NumTerms; j++)
         {
             matrix[i,j] = (matrix[i,j] > 0.0) ? 1.0 : 0.0;
         }
     }
     return matrix;
 }
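Equivalently, each entry is replaced by an indicator of term presence:

    b_{i,j} = \begin{cases} 1 & \text{if } m_{i,j} > 0 \\ 0 & \text{otherwise} \end{cases}

so the resulting rows act as the boolean query vectors used in the cosine-similarity step.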
Example #6
 /// <summary>
 /// Computes boolean (0|1) terms in documents.
 /// </summary>
 /// <param name="matrix">Term-by-document matrix</param>
 /// <returns>Term-by-document matrix with 1s for terms that are in the document and 0s for terms that are not.</returns>
 private static TermDocumentMatrix ComputeIdentities(TermDocumentMatrix matrix)
 {
     for (int i = 0; i < matrix.NumDocs; i++)
     {
         for (int j = 0; j < matrix.NumTerms; j++)
         {
             matrix[i, j] = (matrix[i, j] > 0.0) ? 1.0 : 0.0;
         }
     }
     return(matrix);
 }
Example #7
 /// <summary>
 /// Computes the term frequencies of each document.
 /// </summary>
 /// <param name="matrix">Term-by-document matrix</param>
 /// <returns>tf-weighted term-by-document matrix</returns>
 private static TermDocumentMatrix ComputeTF(TermDocumentMatrix matrix)
 {
     for (int i = 0; i < matrix.NumDocs; i++)
     {
         double max = matrix.GetDocument(i).Max();
         for (int j = 0; j < matrix.NumTerms; j++)
         {
             matrix[i, j] = matrix[i, j] / max;
         }
     }
     return(matrix);
 }
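The weighting used here is maximum-normalized term frequency: each raw count is divided by the largest count in the same document,

    \mathrm{tf}_{i,j} = \frac{f_{i,j}}{\max_{k} f_{i,k}}

where f_{i,j} is the raw frequency of term j in document i.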
Example #8
 /// <summary>
 /// Computes the term frequencies of each document.
 /// </summary>
 /// <param name="matrix">Term-by-document matrix</param>
 /// <returns>tf-weighted term-by-document matrix</returns>
 private static TermDocumentMatrix ComputeTF(TermDocumentMatrix matrix)
 {
     for (int i = 0; i < matrix.NumDocs; i++)
     {
         double max = matrix.GetDocument(i).Max();
         for (int j = 0; j < matrix.NumTerms; j++)
         {
             matrix[i,j] = matrix[i,j] / max;
         }
     }
     return matrix;
 }
Example #9
        /// <summary>
        /// Loads a previously saved TermDocumentMatrix from disk.
        /// </summary>
        /// <param name="filename">File location</param>
        /// <returns>Term-by-document matrix</returns>
        public static TermDocumentMatrix Load(string filename)
        {
            TextReader         tr     = new StreamReader(File.OpenRead(filename));
            TermDocumentMatrix matrix = new TermDocumentMatrix();
            int    lineNum            = 1;
            string line = tr.ReadLine();

            string[] delimeter = new string[] { TermDocumentMatrix.IODelimeter };
            // read terms
            matrix._termIndex       = new List <string>(line.Split(delimeter, StringSplitOptions.RemoveEmptyEntries));
            matrix._termIndexLookup = new Dictionary <string, int>();
            for (int i = 0; i < matrix._termIndex.Count; i++)
            {
                matrix._termIndexLookup.Add(matrix._termIndex[i], i);
            }
            // read documents
            matrix._docIndex       = new List <string>();
            matrix._docIndexLookup = new Dictionary <string, int>();
            List <double[]> docs = new List <double[]>();

            while ((line = tr.ReadLine()) != null)
            {
                lineNum++;
                string[] document = line.Split(delimeter, StringSplitOptions.RemoveEmptyEntries);
                if (document.Length != matrix.NumTerms + 1)
                {
                    tr.Close();
                    throw new InvalidDataException("Incorrect data format on line " + lineNum + " in file: " + filename);
                }
                matrix._docIndex.Add(document[0]);
                matrix._docIndexLookup.Add(document[0], matrix._docIndex.Count - 1);
                double[] doc = new double[matrix.NumTerms];
                for (int i = 1; i < document.Length; i++)
                {
                    doc[i - 1] = Convert.ToDouble(document[i]);
                }
                docs.Add(doc);
            }
            // add documents
            matrix._matrix = new double[matrix.NumDocs][];
            for (int i = 0; i < matrix.NumDocs; i++)
            {
                matrix._matrix[i] = new double[matrix.NumTerms];
                for (int j = 0; j < matrix.NumTerms; j++)
                {
                    matrix[i, j] = docs[i][j];
                }
            }
            // cleanup
            tr.Close();
            return(matrix);
        }
Example #10
File: VSM.cs Project: CoEST/TraceLab-CDK
 /// <summary>
 /// Computes the document frequencies of each term
 /// </summary>
 /// <param name="matrix">Term-by-document matrix</param>
 /// <returns>df-weighted term distribution</returns>
 private static double[] ComputeDF(TermDocumentMatrix matrix)
 {
     double[] df = new double[matrix.NumTerms];
     for (int j = 0; j < matrix.NumTerms; j++)
     {
         df[j] = 0.0;
         for (int i = 0; i < matrix.NumDocs; i++)
         {
             df[j] += (matrix[i,j] > 0.0) ? 1.0 : 0.0;
         }
     }
     return df;
 }
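In symbols, the document frequency of term j is the number of documents that contain it at least once:

    \mathrm{df}_j = \left|\{\, i : m_{i,j} > 0 \,\}\right|

ComputeIDF itself is not shown in these excerpts; given that it is called with the document count (Example #2), a conventional choice would be \mathrm{idf}_j = \log(N / \mathrm{df}_j), but the exact formula used by the project cannot be confirmed from the code here.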
Example #11
 /// <summary>
 /// Computes the average term vector of the matrix
 /// </summary>
 /// <param name="matrix">Artifacts</param>
 /// <returns>Average vector</returns>
 private static double[] ComputeAverageVector(TermDocumentMatrix matrix)
 {
     double[] avg = new double[matrix.NumTerms];
     for (int j = 0; j < matrix.NumTerms; j++)
     {
         for (int i = 0; i < matrix.NumDocs; i++)
         {
             avg[j] += matrix[i, j];
         }
         avg[j] = avg[j] / matrix.NumDocs;
     } 
     return avg;
 }
Example #12
 /// <summary>
 /// Computes the document frequencies of each term
 /// </summary>
 /// <param name="matrix">Term-by-document matrix</param>
 /// <returns>df-weighted term distribution</returns>
 private static double[] ComputeDF(TermDocumentMatrix matrix)
 {
     double[] df = new double[matrix.NumTerms];
     for (int j = 0; j < matrix.NumTerms; j++)
     {
         df[j] = 0.0;
         for (int i = 0; i < matrix.NumDocs; i++)
         {
             df[j] += (matrix[i, j] > 0.0) ? 1.0 : 0.0;
         }
     }
     return(df);
 }
Example #13
 /// <summary>
 /// Deep copy constructor
 /// </summary>
 /// <param name="matrix">Object to be copied</param>
 public TermDocumentMatrix(TermDocumentMatrix matrix)
 {
     _matrix = new double[matrix.NumDocs][];
     for (int i = 0; i < matrix.NumDocs; i++)
     {
         _matrix[i] = new double[matrix.NumTerms];
         for (int j = 0; j < matrix.NumTerms; j++)
         {
             _matrix[i][j] = matrix[i, j];
         }
     }
     _docIndex        = new List <string>(matrix._docIndex);
     _docIndexLookup  = new Dictionary <string, int>(matrix._docIndexLookup);
     _termIndex       = new List <string>(matrix._termIndex);
     _termIndexLookup = new Dictionary <string, int>(matrix._termIndexLookup);
 }
Example #14
        /// <summary>
        /// Takes the two specified documents and creates two new document vectors with the missing terms from each.
        /// Row 0: document 1
        /// Row 1: document 2
        /// </summary>
        /// <param name="matrix1">document1 container</param>
        /// <param name="document1">document1 index</param>
        /// <param name="matrix2">document2 container</param>
        /// <param name="document2">document2 index</param>
        /// <returns>New term-by-document matrix containing the two documents and their term maps</returns>
        public static TermDocumentMatrix EqualizeDocuments(TermDocumentMatrix matrix1, int document1, TermDocumentMatrix matrix2, int document2)
        {
            // initialize new TermDocumentMatrix
            TermDocumentMatrix newmatrix = new TermDocumentMatrix();

            newmatrix._matrix          = new double[2][];
            newmatrix._termIndex       = new List <string>();
            newmatrix._termIndexLookup = new Dictionary <string, int>();
            newmatrix._docIndex        = new List <string>();
            newmatrix._docIndexLookup  = new Dictionary <string, int>();
            newmatrix._docIndex.Add(matrix1.GetDocumentName(document1));
            newmatrix._docIndexLookup.Add(matrix1.GetDocumentName(document1), newmatrix._docIndex.Count - 1);
            newmatrix._docIndex.Add(matrix2.GetDocumentName(document2));
            newmatrix._docIndexLookup.Add(matrix2.GetDocumentName(document2), newmatrix._docIndex.Count - 1);
            List <double> doc1 = new List <double>();
            List <double> doc2 = new List <double>();
            // compute total term set
            Dictionary <string, int> leftovers = new Dictionary <string, int>(matrix2._termIndexLookup);

            foreach (string term in matrix1._termIndex)
            {
                newmatrix._termIndex.Add(term);
                newmatrix._termIndexLookup.Add(term, newmatrix._termIndex.Count - 1);
                doc1.Add(matrix1.GetValue(document1, matrix1.GetTermIndex(term)));
                if (matrix2._termIndexLookup.ContainsKey(term))
                {
                    leftovers.Remove(term);
                    doc2.Add(matrix2.GetValue(document2, matrix2.GetTermIndex(term)));
                }
                else
                {
                    doc2.Add(0.0);
                }
            }
            foreach (string term in leftovers.Keys)
            {
                newmatrix._termIndex.Add(term);
                newmatrix._termIndexLookup.Add(term, newmatrix._termIndex.Count - 1);
                doc1.Add(0.0);
                doc2.Add(matrix2.GetValue(document2, matrix2.GetTermIndex(term)));
            }
            newmatrix._matrix[0] = doc1.ToArray();
            newmatrix._matrix[1] = doc2.ToArray();
            return(newmatrix);
        }
Example #15
 /// <summary>
 /// Computes Jensen-Shannon divergence on two TermDocumentMatrices
 /// </summary>
 /// <param name="source">Source artifacts collection</param>
 /// <param name="target">Target artifacts collection</param>
 /// <returns>Similarity matrix</returns>
 public static TLSimilarityMatrix Compute(TermDocumentMatrix source, TermDocumentMatrix target)
 {
     List<TermDocumentMatrix> matrices = TermDocumentMatrix.Equalize(source, target);
     TLSimilarityMatrix sims = new TLSimilarityMatrix();
     for (int i = 0; i < matrices[0].NumDocs; i++)
     {
         TLLinksList list = new TLLinksList();
         for (int j = 0; j < matrices[1].NumDocs; j++)
         {
             list.Add(new TLSingleLink(matrices[0].GetDocumentName(i), matrices[1].GetDocumentName(j),
                 DocumentSimilarity(matrices[0].GetDocument(i), matrices[1].GetDocument(j))));
         }
         list.Sort();
         foreach (TLSingleLink link in list)
         {
             sims.AddLink(link.SourceArtifactId, link.TargetArtifactId, link.Score);
         }
     }
     return sims;
 }
Example #16
 public void ConstructorTest_Artifacts()
 {
     string data = @"../../Data/SimpleCorpus.";
     TermDocumentMatrix matrix = new TermDocumentMatrix(Artifacts.Import(data + "input.target.txt"));
     TermDocumentMatrix answer = TermDocumentMatrix.Load(data + "output.target.matrix.txt");
     // counts
     Assert.AreEqual(matrix.NumDocs, answer.NumDocs);
     Assert.AreEqual(matrix.NumTerms, answer.NumTerms);
     // matrix
     for (int i = 0; i < answer.NumDocs; i++)
     {
         Assert.AreEqual(matrix.GetDocumentName(i), answer.GetDocumentName(i));
         Assert.AreEqual(matrix.GetDocument(i).Length, answer.NumTerms);
         for (int j = 0; j < answer.NumTerms; j++)
         {
             Assert.AreEqual(matrix.GetTermName(j), answer.GetTermName(j));
             Assert.AreEqual(matrix[i, j], answer[i, j], 0.0);
         }
     }
 }
Example #17
        /// <summary>
        /// Smoothing filter from ICPC'11 paper "Improving IR-based Traceability Recovery Using Smoothing Filters"
        /// </summary>
        /// <param name="artifacts">Artifacts</param>
        /// <returns>Smoothed artifacts</returns>
        public static TermDocumentMatrix Compute(TermDocumentMatrix matrix)
        {
            double[] avg = ComputeAverageVector(matrix);

            if (avg.Length != matrix.NumTerms)
                throw new ArgumentException("Average vector does not have the correct number of terms.");

            for (int i = 0; i < matrix.NumDocs; i++)
            {
                for (int j = 0; j < matrix.NumTerms; j++)
                {
                    matrix[i, j] -= avg[j];
                    if (matrix[i, j] < 0.0)
                    {
                        matrix[i, j] = 0.0;
                    }
                }
            }

            return matrix;
        }
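In symbols, the filter subtracts the corpus-average term vector from every document and clamps negative weights to zero:

    \bar{m}_j = \frac{1}{N}\sum_{i=1}^{N} m_{i,j}, \qquad m'_{i,j} = \max\left(0,\; m_{i,j} - \bar{m}_j\right)

where N is the number of documents.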
Example #18
        /// <summary>
        /// Computes Jensen-Shannon divergence on two TermDocumentMatrices
        /// </summary>
        /// <param name="source">Source artifacts collection</param>
        /// <param name="target">Target artifacts collection</param>
        /// <returns>Similarity matrix</returns>
        public static TLSimilarityMatrix Compute(TermDocumentMatrix source, TermDocumentMatrix target)
        {
            List <TermDocumentMatrix> matrices = TermDocumentMatrix.Equalize(source, target);
            TLSimilarityMatrix        sims     = new TLSimilarityMatrix();

            for (int i = 0; i < matrices[0].NumDocs; i++)
            {
                TLLinksList list = new TLLinksList();
                for (int j = 0; j < matrices[1].NumDocs; j++)
                {
                    list.Add(new TLSingleLink(matrices[0].GetDocumentName(i), matrices[1].GetDocumentName(j),
                                              DocumentSimilarity(matrices[0].GetDocument(i), matrices[1].GetDocument(j))));
                }
                list.Sort();
                foreach (TLSingleLink link in list)
                {
                    sims.AddLink(link.SourceArtifactId, link.TargetArtifactId, link.Score);
                }
            }
            return(sims);
        }
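DocumentSimilarity is not shown in these excerpts. The Jensen-Shannon divergence named in the summary is conventionally defined, for two term distributions P and Q, as

    \mathrm{JSD}(P \parallel Q) = \tfrac{1}{2} D_{\mathrm{KL}}(P \parallel M) + \tfrac{1}{2} D_{\mathrm{KL}}(Q \parallel M), \qquad M = \tfrac{1}{2}(P + Q)

Whether the project converts this divergence into a similarity score (for example by negation or 1 - JSD) cannot be confirmed from the code here.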
Example #19
        /// <summary>
        /// Computes cosine similarities between a set of boolean document vectors and a tfidf weighted corpus
        /// </summary>
        /// <param name="ids">Boolean document vectors</param>
        /// <param name="tfidf">tf-idf weighted document vectors</param>
        /// <returns>Similarity matrix</returns>
        private static TLSimilarityMatrix ComputeSimilarities(TermDocumentMatrix ids, TermDocumentMatrix tfidf)
        {
            TLSimilarityMatrix        sims     = new TLSimilarityMatrix();
            List <TermDocumentMatrix> matrices = TermDocumentMatrix.Equalize(ids, tfidf);

            for (int i = 0; i < ids.NumDocs; i++)
            {
                TLLinksList links = new TLLinksList();
                for (int j = 0; j < tfidf.NumDocs; j++)
                {
                    double product  = 0.0;
                    double asquared = 0.0;
                    double bsquared = 0.0;
                    for (int k = 0; k < matrices[0].NumTerms; k++)
                    {
                        double a = matrices[0][i, k];
                        double b = matrices[1][j, k];
                        product  += (a * b);
                        asquared += Math.Pow(a, 2);
                        bsquared += Math.Pow(b, 2);
                    }
                    double cross = Math.Sqrt(asquared) * Math.Sqrt(bsquared);
                    if (cross == 0.0)
                    {
                        links.Add(new TLSingleLink(ids.GetDocumentName(i), tfidf.GetDocumentName(j), 0.0));
                    }
                    else
                    {
                        links.Add(new TLSingleLink(ids.GetDocumentName(i), tfidf.GetDocumentName(j), product / cross));
                    }
                }
                links.Sort();
                foreach (TLSingleLink link in links)
                {
                    sims.AddLink(link.SourceArtifactId, link.TargetArtifactId, link.Score);
                }
            }
            return(sims);
        }
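The inner loops implement the standard cosine similarity between a boolean source vector a_i and a tf-idf weighted target vector b_j over the equalized term set:

    \mathrm{sim}(a_i, b_j) = \frac{\sum_{k} a_{i,k}\, b_{j,k}}{\sqrt{\sum_{k} a_{i,k}^2}\;\sqrt{\sum_{k} b_{j,k}^2}}

with the score defined as 0 whenever either vector has zero length, as the cross == 0.0 branch shows.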
Example #20
 /// <summary>
 /// Loads a previously saved TermDocumentMatrix from disk.
 /// </summary>
 /// <param name="filename">File location</param>
 /// <returns>Term-by-document matrix</returns>
 public static TermDocumentMatrix Load(string filename)
 {
     TextReader tr = new StreamReader(File.OpenRead(filename));
     TermDocumentMatrix matrix = new TermDocumentMatrix();
     int lineNum = 1;
     string line = tr.ReadLine();
     string[] delimeter = new string[] { TermDocumentMatrix.IODelimeter };
     // read terms
     matrix._termIndex = new List<string>(line.Split(delimeter, StringSplitOptions.RemoveEmptyEntries));
     matrix._termIndexLookup = new Dictionary<string, int>();
     for (int i = 0; i < matrix._termIndex.Count; i++)
     {
         matrix._termIndexLookup.Add(matrix._termIndex[i], i);
     }
     // read documents
     matrix._docIndex = new List<string>();
     matrix._docIndexLookup = new Dictionary<string, int>();
     List<double[]> docs = new List<double[]>();
     while ((line = tr.ReadLine()) != null)
     {
         lineNum++;
         string[] document = line.Split(delimeter, StringSplitOptions.RemoveEmptyEntries);
         if (document.Length != matrix.NumTerms + 1)
         {
             tr.Close();
             throw new InvalidDataException("Incorrect data format on line " + lineNum + " in file: " + filename);
         }
         matrix._docIndex.Add(document[0]);
         matrix._docIndexLookup.Add(document[0], matrix._docIndex.Count - 1);
         double[] doc = new double[matrix.NumTerms];
         for (int i = 1; i < document.Length; i++)
         {
             doc[i - 1] = Convert.ToDouble(document[i]);
         }
         docs.Add(doc);
     }
     // add documents
     matrix._matrix = new double[matrix.NumDocs][];
     for (int i = 0; i < matrix.NumDocs; i++)
     {
         matrix._matrix[i] = new double[matrix.NumTerms];
         for (int j = 0; j < matrix.NumTerms; j++)
         {
             matrix[i, j] = docs[i][j];
         }
     }
     // cleanup
     tr.Close();
     return matrix;
 }
Example #21
 /// <summary>
 /// Computes cosine similarities between a set of boolean document vectors and a tfidf weighted corpus
 /// </summary>
 /// <param name="ids">Boolean document vectors</param>
 /// <param name="tfidf">tf-idf weighted document vectors</param>
 /// <returns>Similarity matrix</returns>
 private static TLSimilarityMatrix ComputeSimilarities(TermDocumentMatrix ids, TermDocumentMatrix tfidf)
 {
     TLSimilarityMatrix sims = new TLSimilarityMatrix();
     List<TermDocumentMatrix> matrices = TermDocumentMatrix.Equalize(ids, tfidf);
     for (int i = 0; i < ids.NumDocs; i++)
     {
         TLLinksList links = new TLLinksList();
         for (int j = 0; j < tfidf.NumDocs; j++)
         {
             double product = 0.0;
             double asquared = 0.0;
             double bsquared = 0.0;
             for (int k = 0; k < matrices[0].NumTerms; k++)
             {
                 double a = matrices[0][i, k];
                 double b = matrices[1][j, k];
                 product += (a * b);
                 asquared += Math.Pow(a, 2);
                 bsquared += Math.Pow(b, 2);
             }
             double cross = Math.Sqrt(asquared) * Math.Sqrt(bsquared);
             if (cross == 0.0)
             {
                 links.Add(new TLSingleLink(ids.GetDocumentName(i), tfidf.GetDocumentName(j), 0.0));
             }
             else
             {
                 links.Add(new TLSingleLink(ids.GetDocumentName(i), tfidf.GetDocumentName(j), product / cross));
             }
         }
         links.Sort();
         foreach (TLSingleLink link in links)
         {
             sims.AddLink(link.SourceArtifactId, link.TargetArtifactId, link.Score);
         }
     }
     return sims;
 }
Example #22
 /// <summary>
 /// Computes tf-idf weights
 /// </summary>
 /// <param name="tf">Term-frequency weighted matrix</param>
 /// <param name="idf">Inverse document frequencies vector</param>
 /// <returns>tf-idf weighted term-by-document matrix</returns>
 private static TermDocumentMatrix ComputeTFIDF(TermDocumentMatrix tf, double[] idf)
 {
     for (int i = 0; i < tf.NumDocs; i++)
     {
         for (int j = 0; j < tf.NumTerms; j++)
         {
             tf[i,j] = tf[i,j] * idf[j];
         }
     }
     return tf;
 }
Example #23
 /// <summary>
 /// Saves matrix to file
 /// </summary>
 /// <param name="matrix">Term-by-document matrix</param>
 /// <param name="filename">File location</param>
 public static void Save(TermDocumentMatrix matrix, string filename)
 {
     // attempt to create file
     TextWriter tw = new StreamWriter(File.Open(filename, FileMode.Create));
     // print out term list
     foreach (string term in matrix.TermMap)
     {
         tw.Write("{0}{1}", TermDocumentMatrix.IODelimeter, term);
     }
     tw.WriteLine();
     // print out each document
     for (int i = 0; i < matrix.NumDocs; i++)
     {
         tw.Write(matrix.GetDocumentName(i));
         // print out each term
         for (int j = 0; j < matrix.NumTerms; j++)
         {
             tw.Write("{0}{1}", TermDocumentMatrix.IODelimeter, matrix[i, j]);
         }
         tw.WriteLine();
     }
     // close file
     tw.Flush();
     tw.Close();
 }
Example #24
 /// <summary>
 /// Takes the two specified documents and creates two new document vectors with the missing terms from each.
 /// </summary>
 /// <param name="matrix1">artifact1 container</param>
 /// <param name="artifact1">artifact1 ID</param>
 /// <param name="matrix2">artifact2 container</param>
 /// <param name="artifact2">artifact2 ID</param>
 /// <returns>New term-by-document matrix containing the two documents and their term maps</returns>
 public static TermDocumentMatrix EqualizeDocuments(TermDocumentMatrix matrix1, string artifact1, TermDocumentMatrix matrix2, string artifact2)
 {
     return(EqualizeDocuments(matrix1, matrix1.GetDocumentIndex(artifact1), matrix2, matrix2.GetDocumentIndex(artifact2)));
 }
Example #25
 /// <summary>
 /// Takes the two specified documents and creates two new document vectors with the missing terms from each.
 /// </summary>
 /// <param name="matrix1">artifact1 container</param>
 /// <param name="artifact1">artifact1 ID</param>
 /// <param name="matrix2">artifact2 container</param>
 /// <param name="artifact2">artifact2 ID</param>
 /// <returns>New term-by-document matrix containing the two documents and their term maps</returns>
 public static TermDocumentMatrix EqualizeDocuments(TermDocumentMatrix matrix1, string artifact1, TermDocumentMatrix matrix2, string artifact2)
 {
     return EqualizeDocuments(matrix1, matrix1.GetDocumentIndex(artifact1), matrix2, matrix2.GetDocumentIndex(artifact2));
 }
Example #26
 /// <summary>
 /// Takes the two specified documents and creates two new document vectors with the missing terms from each.
 /// Row 0: document 1
 /// Row 1: document 2
 /// </summary>
 /// <param name="matrix1">document1 container</param>
 /// <param name="document1">document1 index</param>
 /// <param name="matrix2">document2 container</param>
 /// <param name="document2">document2 index</param>
 /// <returns>New term-by-document matrix containing the two documents and their term maps</returns>
 public static TermDocumentMatrix EqualizeDocuments(TermDocumentMatrix matrix1, int document1, TermDocumentMatrix matrix2, int document2)
 {
     // initialize new TermDocumentMatrix
     TermDocumentMatrix newmatrix = new TermDocumentMatrix();
     newmatrix._matrix = new double[2][];
     newmatrix._termIndex = new List<string>();
     newmatrix._termIndexLookup = new Dictionary<string, int>();
     newmatrix._docIndex = new List<string>();
     newmatrix._docIndexLookup = new Dictionary<string, int>();
     newmatrix._docIndex.Add(matrix1.GetDocumentName(document1));
     newmatrix._docIndexLookup.Add(matrix1.GetDocumentName(document1), newmatrix._docIndex.Count - 1);
     newmatrix._docIndex.Add(matrix2.GetDocumentName(document2));
     newmatrix._docIndexLookup.Add(matrix2.GetDocumentName(document2), newmatrix._docIndex.Count - 1);
     List<double> doc1 = new List<double>();
     List<double> doc2 = new List<double>();
     // compute total term set
     Dictionary<string, int> leftovers = new Dictionary<string,int>(matrix2._termIndexLookup);
     foreach (string term in matrix1._termIndex)
     {
         newmatrix._termIndex.Add(term);
         newmatrix._termIndexLookup.Add(term, newmatrix._termIndex.Count - 1);
         doc1.Add(matrix1.GetValue(document1, matrix1.GetTermIndex(term)));
         if (matrix2._termIndexLookup.ContainsKey(term))
         {
             leftovers.Remove(term);
             doc2.Add(matrix2.GetValue(document2, matrix2.GetTermIndex(term)));
         }
         else
         {
             doc2.Add(0.0);
         }
     }
     foreach (string term in leftovers.Keys)
     {
         newmatrix._termIndex.Add(term);
         newmatrix._termIndexLookup.Add(term, newmatrix._termIndex.Count - 1);
         doc1.Add(0.0);
         doc2.Add(matrix2.GetValue(document2, matrix2.GetTermIndex(term)));
     }
     newmatrix._matrix[0] = doc1.ToArray();
     newmatrix._matrix[1] = doc2.ToArray();
     return newmatrix;
 }
Example #27
 /// <summary>
 /// Recreates each matrix with documents containing missing terms.
 /// List[0] : matrix 1
 /// List[1] : matrix 2
 /// </summary>
 /// <param name="matrix1">First term-by-document matrix</param>
 /// <param name="matrix2">Second term-by-document matrix</param>
 /// <returns>Copies of original matrices with missing terms from each</returns>
 public static List<TermDocumentMatrix> Equalize(TermDocumentMatrix matrix1, TermDocumentMatrix matrix2)
 {
     // initialize matrices
     List<TermDocumentMatrix> matrices = new List<TermDocumentMatrix>();
     // matrix 1
     matrices.Add(new TermDocumentMatrix());
     matrices[0]._matrix = new double[matrix1.NumDocs][];
     matrices[0]._docIndex = new List<string>(matrix1._docIndex);
     matrices[0]._docIndexLookup = new Dictionary<string,int>(matrix1._docIndexLookup);
     // matrix 2
     matrices.Add(new TermDocumentMatrix());
     matrices[1]._matrix = new double[matrix2.NumDocs][];
     matrices[1]._docIndex = new List<string>(matrix2._docIndex);
     matrices[1]._docIndexLookup = new Dictionary<string,int>(matrix2._docIndexLookup);
     // compute term set
     List<string> termIndex = new List<string>();
     Dictionary<string, int> termIndexLookup = new Dictionary<string, int>();
     Dictionary<string, int> leftovers = new Dictionary<string,int>(matrix2._termIndexLookup);
     // get all terms in first matrix
     foreach (string term in matrix1._termIndex)
     {
         termIndex.Add(term);
         termIndexLookup.Add(term, termIndex.Count - 1);
         // remove duplicate terms
         if (matrix2._termIndexLookup.ContainsKey(term))
         {
             leftovers.Remove(term);
         }
     }
     // add leftovers
     foreach (string term in leftovers.Keys)
     {
         termIndex.Add(term);
         termIndexLookup.Add(term, termIndex.Count - 1);
     }
     // create new term distributions for each document
     // matrix 1
     matrices[0]._termIndex = new List<string>(termIndex);
     matrices[0]._termIndexLookup = new Dictionary<string,int>(termIndexLookup);
     for (int i = 0; i < matrices[0].NumDocs; i++)
     {
         matrices[0]._matrix[i] = new double[termIndex.Count];
         // fill in original values
         for (int j = 0; j < matrix1.NumTerms; j++)
         {
             matrices[0][i, j] = matrix1[i, j];
         }
         // fill in missing terms
         for (int j = matrix1.NumTerms; j < termIndex.Count; j++)
         {
             matrices[0][i, j] = 0.0;
         }
     }
     // matrix 2
     matrices[1]._termIndex = new List<string>(termIndex);
     matrices[1]._termIndexLookup = new Dictionary<string,int>(termIndexLookup);
     for (int i = 0; i < matrices[1].NumDocs; i++)
     {
         matrices[1]._matrix[i] = new double[termIndex.Count];
         // fill in values
         for (int j = 0; j < termIndex.Count; j++)
         {
             if (matrix2.ContainsTerm(termIndex[j]))
             {
                 matrices[1][i, j] = matrix2.GetValue(matrix2.GetDocumentName(i), termIndex[j]);
             }
             else
             {
                 matrices[1][i, j] = 0.0;
             }
         }
     }
     // return
     return matrices;
 }
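As a small illustration (the terms are made up): if matrix1's term index is [alpha, beta] and matrix2's is [beta, gamma], Equalize returns copies whose shared term index is [alpha, beta, gamma]. matrix1's documents keep their alpha and beta weights and get 0.0 in the new gamma column, while matrix2's documents are rebuilt by term name, so their beta weights move to the shared beta position, gamma keeps its value, and alpha becomes 0.0.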
Example #28
 /// <summary>
 /// Deep copy constructor
 /// </summary>
 /// <param name="matrix">Object to be copied</param>
 public TermDocumentMatrix(TermDocumentMatrix matrix)
 {
     _matrix = new double[matrix.NumDocs][];
     for (int i = 0; i < matrix.NumDocs; i++)
     {
         _matrix[i] = new double[matrix.NumTerms];
         for (int j = 0; j < matrix.NumTerms; j++)
         {
             _matrix[i][j] = matrix[i,j];
         }
     }
     _docIndex = new List<string>(matrix._docIndex);
     _docIndexLookup = new Dictionary<string, int>(matrix._docIndexLookup);
     _termIndex = new List<string>(matrix._termIndex);
     _termIndexLookup = new Dictionary<string, int>(matrix._termIndexLookup);
 }
Example #29
        /// <summary>
        /// Recreates each matrix with documents containing missing terms.
        /// List[0] : matrix 1
        /// List[1] : matrix 2
        /// </summary>
        /// <param name="matrix1">First term-by-document matrix</param>
        /// <param name="matrix2">Second term-by-document matrix</param>
        /// <returns>Copies of original matrices with missing terms from each</returns>
        public static List <TermDocumentMatrix> Equalize(TermDocumentMatrix matrix1, TermDocumentMatrix matrix2)
        {
            // initialize matrices
            List <TermDocumentMatrix> matrices = new List <TermDocumentMatrix>();

            // matrix 1
            matrices.Add(new TermDocumentMatrix());
            matrices[0]._matrix         = new double[matrix1.NumDocs][];
            matrices[0]._docIndex       = new List <string>(matrix1._docIndex);
            matrices[0]._docIndexLookup = new Dictionary <string, int>(matrix1._docIndexLookup);
            // matrix 2
            matrices.Add(new TermDocumentMatrix());
            matrices[1]._matrix         = new double[matrix2.NumDocs][];
            matrices[1]._docIndex       = new List <string>(matrix2._docIndex);
            matrices[1]._docIndexLookup = new Dictionary <string, int>(matrix2._docIndexLookup);
            // compute term set
            List <string>            termIndex       = new List <string>();
            Dictionary <string, int> termIndexLookup = new Dictionary <string, int>();
            Dictionary <string, int> leftovers       = new Dictionary <string, int>(matrix2._termIndexLookup);

            // get all terms in first matrix
            foreach (string term in matrix1._termIndex)
            {
                termIndex.Add(term);
                termIndexLookup.Add(term, termIndex.Count - 1);
                // remove duplicate terms
                if (matrix2._termIndexLookup.ContainsKey(term))
                {
                    leftovers.Remove(term);
                }
            }
            // add leftovers
            foreach (string term in leftovers.Keys)
            {
                termIndex.Add(term);
                termIndexLookup.Add(term, termIndex.Count - 1);
            }
            // create new term distributions for each document
            // matrix 1
            matrices[0]._termIndex       = new List <string>(termIndex);
            matrices[0]._termIndexLookup = new Dictionary <string, int>(termIndexLookup);
            for (int i = 0; i < matrices[0].NumDocs; i++)
            {
                matrices[0]._matrix[i] = new double[termIndex.Count];
                // fill in original values
                for (int j = 0; j < matrix1.NumTerms; j++)
                {
                    matrices[0][i, j] = matrix1[i, j];
                }
                // fill in missing terms
                for (int j = matrix1.NumTerms; j < termIndex.Count; j++)
                {
                    matrices[0][i, j] = 0.0;
                }
            }
            // matrix 2
            matrices[1]._termIndex       = new List <string>(termIndex);
            matrices[1]._termIndexLookup = new Dictionary <string, int>(termIndexLookup);
            for (int i = 0; i < matrices[1].NumDocs; i++)
            {
                matrices[1]._matrix[i] = new double[termIndex.Count];
                // fill in values
                for (int j = 0; j < termIndex.Count; j++)
                {
                    if (matrix2.ContainsTerm(termIndex[j]))
                    {
                        matrices[1][i, j] = matrix2.GetValue(matrix2.GetDocumentName(i), termIndex[j]);
                    }
                    else
                    {
                        matrices[1][i, j] = 0.0;
                    }
                }
            }
            // return
            return(matrices);
        }