Exemplo n.º 1
0
        /// <summary>
        /// Combines two TermDocumentMatrices into one matrix whose term set is the union of both.
        /// The first matrix is put in first (ie. newmatrix[0] = matrix1[0]); the documents of
        /// <paramref name="matrix2"/> follow. Terms keep the order of <paramref name="matrix1"/>,
        /// and terms found only in <paramref name="matrix2"/> are appended in the order of its TermMap.
        /// A cell whose term is not contained in the source matrix of its document is written as 0.0.
        /// </summary>
        /// <param name="matrix1">First matrix</param>
        /// <param name="matrix2">Second matrix</param>
        /// <returns>A newly created matrix built from copies of the index lists of both inputs</returns>
        /// <exception cref="System.ArgumentNullException">
        /// <paramref name="matrix1"/> or <paramref name="matrix2"/> is null.
        /// </exception>
        /// <exception cref="System.ArgumentException">
        /// A document of <paramref name="matrix2"/> is already a key of the lookup copied from
        /// <paramref name="matrix1"/> (raised by the dictionary on the duplicate key).
        /// </exception>
        public static TermDocumentMatrix Combine(TermDocumentMatrix matrix1, TermDocumentMatrix matrix2)
        {
            // fail fast with a descriptive exception instead of a NullReferenceException further down
            if (matrix1 == null)
            {
                throw new System.ArgumentNullException("matrix1");
            }
            if (matrix2 == null)
            {
                throw new System.ArgumentNullException("matrix2");
            }

            TermDocumentMatrix combined = new TermDocumentMatrix();

            // add documents: start from copies of matrix1's index, then append matrix2's documents
            combined._docIndex       = new List<string>(matrix1._docIndex);
            combined._docIndexLookup = new Dictionary<string, int>(matrix1._docIndexLookup);
            foreach (string doc in matrix2.DocMap)
            {
                combined._docIndex.Add(doc);
                combined._docIndexLookup.Add(doc, combined.NumDocs - 1);
            }

            // calculate union of terms: matrix1's terms first, then the ones only matrix2 has
            combined._termIndex       = new List<string>(matrix1._termIndex);
            combined._termIndexLookup = new Dictionary<string, int>(matrix1._termIndexLookup);
            foreach (string term in matrix2.TermMap)
            {
                if (!combined._termIndexLookup.ContainsKey(term))
                {
                    combined._termIndex.Add(term);
                    combined._termIndexLookup.Add(term, combined.NumTerms - 1);
                }
            }

            // The source column of a term does not depend on the document, so resolve it once
            // per term here rather than once per cell inside the row loops. -1 marks a term the
            // source matrix does not contain.
            int numTerms = combined.NumTerms;
            int[] columnsInMatrix1 = new int[numTerms];
            int[] columnsInMatrix2 = new int[numTerms];
            for (int j = 0; j < numTerms; j++)
            {
                string term = combined.TermMap[j];
                columnsInMatrix1[j] = matrix1.ContainsTerm(term) ? matrix1.GetTermIndex(term) : -1;
                columnsInMatrix2[j] = matrix2.ContainsTerm(term) ? matrix2.GetTermIndex(term) : -1;
            }

            // create and populate matrix
            combined._matrix = new double[combined.NumDocs][];
            int firstRowOfMatrix2 = matrix1.NumDocs;

            // matrix1 occupies rows [0, matrix1.NumDocs)
            for (int i = 0; i < firstRowOfMatrix2; i++)
            {
                combined._matrix[i] = new double[numTerms];
                for (int j = 0; j < numTerms; j++)
                {
                    int sourceColumn = columnsInMatrix1[j];
                    combined[i, j] = sourceColumn >= 0 ? matrix1[i, sourceColumn] : 0.0;
                }
            }

            // matrix2 occupies the remaining rows; its own row index is shifted back by matrix1.NumDocs
            for (int i = firstRowOfMatrix2; i < combined.NumDocs; i++)
            {
                combined._matrix[i] = new double[numTerms];
                for (int j = 0; j < numTerms; j++)
                {
                    int sourceColumn = columnsInMatrix2[j];
                    combined[i, j] = sourceColumn >= 0 ? matrix2[i - firstRowOfMatrix2, sourceColumn] : 0.0;
                }
            }

            return combined;
        }
Exemplo n.º 2
0
 /// <summary>
 /// Recreates each matrix so that both share one term set: the terms of
 /// <paramref name="matrix1"/> in their original order, followed by the terms that only
 /// <paramref name="matrix2"/> has, in the order of matrix2's term index.
 /// Cells for terms a matrix did not originally contain are written as 0.0.
 /// List[0] : matrix 1
 /// List[1] : matrix 2
 /// </summary>
 /// <param name="matrix1">First term-by-document matrix</param>
 /// <param name="matrix2">Second term-by-document matrix</param>
 /// <returns>Copies of original matrices with missing terms from each</returns>
 /// <exception cref="System.ArgumentNullException">
 /// <paramref name="matrix1"/> or <paramref name="matrix2"/> is null.
 /// </exception>
 public static List<TermDocumentMatrix> Equalize(TermDocumentMatrix matrix1, TermDocumentMatrix matrix2)
 {
     // fail fast with a descriptive exception instead of a NullReferenceException further down
     if (matrix1 == null)
     {
         throw new System.ArgumentNullException("matrix1");
     }
     if (matrix2 == null)
     {
         throw new System.ArgumentNullException("matrix2");
     }

     // compute term set: every term of the first matrix, at its original position
     List<string> termIndex = new List<string>();
     Dictionary<string, int> termIndexLookup = new Dictionary<string, int>();
     foreach (string term in matrix1._termIndex)
     {
         termIndex.Add(term);
         termIndexLookup.Add(term, termIndex.Count - 1);
     }
     // Append the terms only the second matrix has. Walking the ordered term list (rather
     // than the keys of a dictionary that had entries removed) keeps the resulting column
     // order well defined, because Dictionary enumeration order is unspecified.
     // NOTE(review): assumes matrix2._termIndex and matrix2._termIndexLookup hold the same terms.
     foreach (string term in matrix2._termIndex)
     {
         if (!termIndexLookup.ContainsKey(term))
         {
             termIndex.Add(term);
             termIndexLookup.Add(term, termIndex.Count - 1);
         }
     }
     int numTerms = termIndex.Count;

     // matrix 1: copy the document index, install the shared term set
     TermDocumentMatrix equalized1 = new TermDocumentMatrix();
     equalized1._matrix = new double[matrix1.NumDocs][];
     equalized1._docIndex = new List<string>(matrix1._docIndex);
     equalized1._docIndexLookup = new Dictionary<string, int>(matrix1._docIndexLookup);
     equalized1._termIndex = new List<string>(termIndex);
     equalized1._termIndexLookup = new Dictionary<string, int>(termIndexLookup);
     int originalTerms = matrix1.NumTerms;
     for (int i = 0; i < equalized1.NumDocs; i++)
     {
         equalized1._matrix[i] = new double[numTerms];
         // matrix1's terms lead the shared term set, so its columns keep their positions;
         // everything behind them is a term matrix1 never had
         for (int j = 0; j < numTerms; j++)
         {
             equalized1[i, j] = j < originalTerms ? matrix1[i, j] : 0.0;
         }
     }

     // matrix 2: its columns are generally re-ordered, so values are fetched by term
     TermDocumentMatrix equalized2 = new TermDocumentMatrix();
     equalized2._matrix = new double[matrix2.NumDocs][];
     equalized2._docIndex = new List<string>(matrix2._docIndex);
     equalized2._docIndexLookup = new Dictionary<string, int>(matrix2._docIndexLookup);
     equalized2._termIndex = new List<string>(termIndex);
     equalized2._termIndexLookup = new Dictionary<string, int>(termIndexLookup);
     // whether matrix2 knows a term does not depend on the document: decide once per column
     bool[] inMatrix2 = new bool[numTerms];
     for (int j = 0; j < numTerms; j++)
     {
         inMatrix2[j] = matrix2.ContainsTerm(termIndex[j]);
     }
     for (int i = 0; i < equalized2.NumDocs; i++)
     {
         equalized2._matrix[i] = new double[numTerms];
         // the document name is the same for the whole row
         string documentName = matrix2.GetDocumentName(i);
         for (int j = 0; j < numTerms; j++)
         {
             equalized2[i, j] = inMatrix2[j] ? matrix2.GetValue(documentName, termIndex[j]) : 0.0;
         }
     }

     // return: index 0 is the equalized first matrix, index 1 the equalized second matrix
     List<TermDocumentMatrix> matrices = new List<TermDocumentMatrix>();
     matrices.Add(equalized1);
     matrices.Add(equalized2);
     return matrices;
 }
Exemplo n.º 3
0
        /// <summary>
        /// Recreates each matrix so that both share one term set: the terms of
        /// <paramref name="matrix1"/> in their original order, followed by the terms that only
        /// <paramref name="matrix2"/> has, in the order of matrix2's term index.
        /// Cells for terms a matrix did not originally contain are written as 0.0.
        /// List[0] : matrix 1
        /// List[1] : matrix 2
        /// </summary>
        /// <param name="matrix1">First term-by-document matrix</param>
        /// <param name="matrix2">Second term-by-document matrix</param>
        /// <returns>Copies of original matrices with missing terms from each</returns>
        /// <exception cref="System.ArgumentNullException">
        /// <paramref name="matrix1"/> or <paramref name="matrix2"/> is null.
        /// </exception>
        public static List<TermDocumentMatrix> Equalize(TermDocumentMatrix matrix1, TermDocumentMatrix matrix2)
        {
            // fail fast with a descriptive exception instead of a NullReferenceException further down
            if (matrix1 == null)
            {
                throw new System.ArgumentNullException("matrix1");
            }
            if (matrix2 == null)
            {
                throw new System.ArgumentNullException("matrix2");
            }

            // compute term set: every term of the first matrix, at its original position
            List<string>            termIndex       = new List<string>();
            Dictionary<string, int> termIndexLookup = new Dictionary<string, int>();
            foreach (string term in matrix1._termIndex)
            {
                termIndex.Add(term);
                termIndexLookup.Add(term, termIndex.Count - 1);
            }
            // Append the terms only the second matrix has. Walking the ordered term list (rather
            // than the keys of a dictionary that had entries removed) keeps the resulting column
            // order well defined, because Dictionary enumeration order is unspecified.
            // NOTE(review): assumes matrix2._termIndex and matrix2._termIndexLookup hold the same terms.
            foreach (string term in matrix2._termIndex)
            {
                if (!termIndexLookup.ContainsKey(term))
                {
                    termIndex.Add(term);
                    termIndexLookup.Add(term, termIndex.Count - 1);
                }
            }
            int numTerms = termIndex.Count;

            // matrix 1: copy the document index, install the shared term set
            TermDocumentMatrix equalized1 = new TermDocumentMatrix();
            equalized1._matrix          = new double[matrix1.NumDocs][];
            equalized1._docIndex        = new List<string>(matrix1._docIndex);
            equalized1._docIndexLookup  = new Dictionary<string, int>(matrix1._docIndexLookup);
            equalized1._termIndex       = new List<string>(termIndex);
            equalized1._termIndexLookup = new Dictionary<string, int>(termIndexLookup);
            int originalTerms = matrix1.NumTerms;
            for (int i = 0; i < equalized1.NumDocs; i++)
            {
                equalized1._matrix[i] = new double[numTerms];
                // matrix1's terms lead the shared term set, so its columns keep their positions;
                // everything behind them is a term matrix1 never had
                for (int j = 0; j < numTerms; j++)
                {
                    equalized1[i, j] = j < originalTerms ? matrix1[i, j] : 0.0;
                }
            }

            // matrix 2: its columns are generally re-ordered, so values are fetched by term
            TermDocumentMatrix equalized2 = new TermDocumentMatrix();
            equalized2._matrix          = new double[matrix2.NumDocs][];
            equalized2._docIndex        = new List<string>(matrix2._docIndex);
            equalized2._docIndexLookup  = new Dictionary<string, int>(matrix2._docIndexLookup);
            equalized2._termIndex       = new List<string>(termIndex);
            equalized2._termIndexLookup = new Dictionary<string, int>(termIndexLookup);
            // whether matrix2 knows a term does not depend on the document: decide once per column
            bool[] inMatrix2 = new bool[numTerms];
            for (int j = 0; j < numTerms; j++)
            {
                inMatrix2[j] = matrix2.ContainsTerm(termIndex[j]);
            }
            for (int i = 0; i < equalized2.NumDocs; i++)
            {
                equalized2._matrix[i] = new double[numTerms];
                // the document name is the same for the whole row
                string documentName = matrix2.GetDocumentName(i);
                for (int j = 0; j < numTerms; j++)
                {
                    equalized2[i, j] = inMatrix2[j] ? matrix2.GetValue(documentName, termIndex[j]) : 0.0;
                }
            }

            // return: index 0 is the equalized first matrix, index 1 the equalized second matrix
            List<TermDocumentMatrix> matrices = new List<TermDocumentMatrix>();
            matrices.Add(equalized1);
            matrices.Add(equalized2);
            return matrices;
        }
Exemplo n.º 4
0
 /// <summary>
 /// Combines two TermDocumentMatrices into one matrix whose term set is the union of both.
 /// The first matrix is put in first (ie. newmatrix[0] = matrix1[0]); the documents of
 /// <paramref name="matrix2"/> follow. Terms keep the order of <paramref name="matrix1"/>,
 /// and terms found only in <paramref name="matrix2"/> are appended in the order of its TermMap.
 /// A cell whose term is not contained in the source matrix of its document is written as 0.0.
 /// </summary>
 /// <param name="matrix1">First matrix</param>
 /// <param name="matrix2">Second matrix</param>
 /// <returns>A newly created matrix built from copies of the index lists of both inputs</returns>
 /// <exception cref="System.ArgumentNullException">
 /// <paramref name="matrix1"/> or <paramref name="matrix2"/> is null.
 /// </exception>
 /// <exception cref="System.ArgumentException">
 /// A document of <paramref name="matrix2"/> is already a key of the lookup copied from
 /// <paramref name="matrix1"/> (raised by the dictionary on the duplicate key).
 /// </exception>
 public static TermDocumentMatrix Combine(TermDocumentMatrix matrix1, TermDocumentMatrix matrix2)
 {
     // fail fast with a descriptive exception instead of a NullReferenceException further down
     if (matrix1 == null)
     {
         throw new System.ArgumentNullException("matrix1");
     }
     if (matrix2 == null)
     {
         throw new System.ArgumentNullException("matrix2");
     }

     TermDocumentMatrix combined = new TermDocumentMatrix();

     // add documents: start from copies of matrix1's index, then append matrix2's documents
     combined._docIndex = new List<string>(matrix1._docIndex);
     combined._docIndexLookup = new Dictionary<string, int>(matrix1._docIndexLookup);
     foreach (string doc in matrix2.DocMap)
     {
         combined._docIndex.Add(doc);
         combined._docIndexLookup.Add(doc, combined.NumDocs - 1);
     }

     // calculate union of terms: matrix1's terms first, then the ones only matrix2 has
     combined._termIndex = new List<string>(matrix1._termIndex);
     combined._termIndexLookup = new Dictionary<string, int>(matrix1._termIndexLookup);
     foreach (string term in matrix2.TermMap)
     {
         if (!combined._termIndexLookup.ContainsKey(term))
         {
             combined._termIndex.Add(term);
             combined._termIndexLookup.Add(term, combined.NumTerms - 1);
         }
     }

     // The source column of a term does not depend on the document, so resolve it once
     // per term here rather than once per cell inside the row loops. -1 marks a term the
     // source matrix does not contain.
     int numTerms = combined.NumTerms;
     int[] columnsInMatrix1 = new int[numTerms];
     int[] columnsInMatrix2 = new int[numTerms];
     for (int j = 0; j < numTerms; j++)
     {
         string term = combined.TermMap[j];
         columnsInMatrix1[j] = matrix1.ContainsTerm(term) ? matrix1.GetTermIndex(term) : -1;
         columnsInMatrix2[j] = matrix2.ContainsTerm(term) ? matrix2.GetTermIndex(term) : -1;
     }

     // create and populate matrix
     combined._matrix = new double[combined.NumDocs][];
     int firstRowOfMatrix2 = matrix1.NumDocs;

     // matrix1 occupies rows [0, matrix1.NumDocs)
     for (int i = 0; i < firstRowOfMatrix2; i++)
     {
         combined._matrix[i] = new double[numTerms];
         for (int j = 0; j < numTerms; j++)
         {
             int sourceColumn = columnsInMatrix1[j];
             combined[i, j] = sourceColumn >= 0 ? matrix1[i, sourceColumn] : 0.0;
         }
     }

     // matrix2 occupies the remaining rows; its own row index is shifted back by matrix1.NumDocs
     for (int i = firstRowOfMatrix2; i < combined.NumDocs; i++)
     {
         combined._matrix[i] = new double[numTerms];
         for (int j = 0; j < numTerms; j++)
         {
             int sourceColumn = columnsInMatrix2[j];
             combined[i, j] = sourceColumn >= 0 ? matrix2[i - firstRowOfMatrix2, sourceColumn] : 0.0;
         }
     }

     return combined;
 }