示例#1
0
        /// <summary>
        /// Builds a two-row term-by-document matrix in which both documents share a single term index.
        /// Row 0 carries the values of <paramref name="document1"/>; row 1 carries the values of <paramref name="document2"/>.
        /// The shared term index lists every term of <paramref name="matrix1"/> first, followed by the terms
        /// found only in the term lookup of <paramref name="matrix2"/>. A term the other matrix does not list is stored as 0.0.
        /// </summary>
        /// <param name="matrix1">Matrix that holds the first document.</param>
        /// <param name="document1">Index of the first document inside <paramref name="matrix1"/>.</param>
        /// <param name="matrix2">Matrix that holds the second document.</param>
        /// <param name="document2">Index of the second document inside <paramref name="matrix2"/>.</param>
        /// <returns>A new matrix with exactly two document rows, plus the document and term maps that describe them.</returns>
        public static TermDocumentMatrix EqualizeDocuments(TermDocumentMatrix matrix1, int document1, TermDocumentMatrix matrix2, int document2)
        {
            TermDocumentMatrix equalized = new TermDocumentMatrix
            {
                _matrix = new double[2][],
                _termIndex = new List<string>(),
                _termIndexLookup = new Dictionary<string, int>(),
                _docIndex = new List<string>(),
                _docIndexLookup = new Dictionary<string, int>()
            };

            // Register both documents; their positions (0 and 1) are also their row numbers.
            equalized._docIndex.Add(matrix1.GetDocumentName(document1));
            equalized._docIndexLookup.Add(matrix1.GetDocumentName(document1), 0);
            equalized._docIndex.Add(matrix2.GetDocumentName(document2));
            equalized._docIndexLookup.Add(matrix2.GetDocumentName(document2), 1);

            List<double> firstRow = new List<double>();
            List<double> secondRow = new List<double>();

            // Start with all of matrix2's terms; each one that matrix1 also lists is struck off below,
            // so what remains afterwards are the terms only matrix2 has.
            Dictionary<string, int> unseen = new Dictionary<string, int>(matrix2._termIndexLookup);

            foreach (string term in matrix1._termIndex)
            {
                int slot = equalized._termIndex.Count;
                equalized._termIndex.Add(term);
                equalized._termIndexLookup.Add(term, slot);
                firstRow.Add(matrix1.GetValue(document1, matrix1.GetTermIndex(term)));

                if (!matrix2._termIndexLookup.ContainsKey(term))
                {
                    // matrix2 does not list this term: pad document 2 with zero.
                    secondRow.Add(0.0);
                    continue;
                }

                unseen.Remove(term);
                secondRow.Add(matrix2.GetValue(document2, matrix2.GetTermIndex(term)));
            }

            // Append the terms only matrix2 lists; document 1 is padded with zero for each of them.
            foreach (string term in unseen.Keys)
            {
                int slot = equalized._termIndex.Count;
                equalized._termIndex.Add(term);
                equalized._termIndexLookup.Add(term, slot);
                firstRow.Add(0.0);
                secondRow.Add(matrix2.GetValue(document2, matrix2.GetTermIndex(term)));
            }

            equalized._matrix[0] = firstRow.ToArray();
            equalized._matrix[1] = secondRow.ToArray();
            return equalized;
        }
示例#2
0
 /// <summary>
 /// Projects one document from each input matrix onto a common term list and returns them as a new two-document matrix.
 /// Row 0: the document taken from <paramref name="matrix1"/>.
 /// Row 1: the document taken from <paramref name="matrix2"/>.
 /// The common term list is the term list of <paramref name="matrix1"/> followed by the remaining keys of the
 /// term lookup of <paramref name="matrix2"/>; wherever a term is not listed by the other matrix, that row gets 0.0.
 /// </summary>
 /// <param name="matrix1">Container of the first document.</param>
 /// <param name="document1">Index of the first document within <paramref name="matrix1"/>.</param>
 /// <param name="matrix2">Container of the second document.</param>
 /// <param name="document2">Index of the second document within <paramref name="matrix2"/>.</param>
 /// <returns>New term-by-document matrix holding the two aligned documents together with their document and term maps.</returns>
 public static TermDocumentMatrix EqualizeDocuments(TermDocumentMatrix matrix1, int document1, TermDocumentMatrix matrix2, int document2)
 {
     // Fresh, empty result with room for exactly two document rows.
     TermDocumentMatrix combined = new TermDocumentMatrix();
     combined._matrix = new double[2][];
     combined._termIndex = new List<string>();
     combined._termIndexLookup = new Dictionary<string, int>();
     combined._docIndex = new List<string>();
     combined._docIndexLookup = new Dictionary<string, int>();

     // Document names are recorded in row order: first matrix1's document, then matrix2's.
     combined._docIndex.Add(matrix1.GetDocumentName(document1));
     combined._docIndexLookup.Add(matrix1.GetDocumentName(document1), combined._docIndex.Count - 1);
     combined._docIndex.Add(matrix2.GetDocumentName(document2));
     combined._docIndexLookup.Add(matrix2.GetDocumentName(document2), combined._docIndex.Count - 1);

     List<double> row0 = new List<double>();
     List<double> row1 = new List<double>();

     // Copy of matrix2's term lookup; terms also listed by matrix1 are removed while walking matrix1.
     Dictionary<string, int> pending = new Dictionary<string, int>(matrix2._termIndexLookup);

     foreach (string term in matrix1._termIndex)
     {
         combined._termIndex.Add(term);
         combined._termIndexLookup.Add(term, combined._termIndex.Count - 1);
         row0.Add(matrix1.GetValue(document1, matrix1.GetTermIndex(term)));

         bool listedBySecond = matrix2._termIndexLookup.ContainsKey(term);
         if (listedBySecond)
         {
             pending.Remove(term);
         }

         // Use matrix2's own value when it lists the term, otherwise pad with zero.
         row1.Add(listedBySecond
             ? matrix2.GetValue(document2, matrix2.GetTermIndex(term))
             : 0.0);
     }

     // Whatever is still pending is listed only by matrix2, so document 1 gets zero for it.
     foreach (KeyValuePair<string, int> entry in pending)
     {
         string term = entry.Key;
         combined._termIndex.Add(term);
         combined._termIndexLookup.Add(term, combined._termIndex.Count - 1);
         row0.Add(0.0);
         row1.Add(matrix2.GetValue(document2, matrix2.GetTermIndex(term)));
     }

     combined._matrix[0] = row0.ToArray();
     combined._matrix[1] = row1.ToArray();
     return combined;
 }