/// <summary>
/// Filters out the <c>NewsItem</c>s in the given list that are redundant
/// compared to the term-document matrix built for the database.
/// </summary>
/// <remarks>
/// The matrix is requested from <c>GetTermDocumentMatrix</c> with a date
/// limit of 14 days before the current local time. An item is considered
/// redundant when the product of the row-normalized, transposed matrix and
/// the item's normalized tf-idf vector contains at least one entry that is
/// greater than or equal to <c>THRESHOLD</c>.
/// </remarks>
/// <param name="archivist">
/// An <c>Archivist</c> instance with database access.
/// </param>
/// <param name="news">
/// The list of news to compare with the matrix. The list itself is not
/// modified by this method.
/// </param>
/// <returns>
/// A new list containing the non-redundant items in their original order.
/// A new list is returned on every path, so the caller can modify it
/// without affecting <paramref name="news"/>.
/// </returns>
private List<NewsItem> FilterDatabaseRedundancy(Archivist archivist,
    List<NewsItem> news)
{
    // Two weeks back from now (same duration as 24 * 7 * 2 hours).
    DateTime dateLimit = DateTime.Now.Subtract(TimeSpan.FromDays(14));
    SparseMatrix matrix = GetTermDocumentMatrix(archivist, news, dateLimit);

    // If matrix == null, there are no news items in the db, and thus
    // no redundant news. Return a copy so this path hands out a fresh
    // list just like the filtering path below.
    if (matrix == null)
    {
        return new List<NewsItem>(news);
    }

    // Transpose the matrix to prepare for multiplication and normalize
    // all rows to make the cosine similarity calculation faster.
    SparseMatrix normalizedTransposed = matrix.Transpose().NormalizeRows();

    // Build the result by keeping non-redundant items instead of copying
    // the whole list and removing from it, which costs a linear search
    // per removal.
    List<NewsItem> result = new List<NewsItem>(news.Count);

    foreach (NewsItem item in news)
    {
        // Get the normalized tf-idf vector representing this news item.
        SparseVector newsVector = archivist.GetTfIdfVector(item).Normalize();

        // Calculate product.
        SparseVector product = normalizedTransposed.VectorProduct(newsVector);

        // The item is redundant if the product contains an element
        // with value >= threshold.
        bool isRedundant = false;
        foreach (int entry in product.NonZeroIndices)
        {
            if (product[entry] >= THRESHOLD)
            {
                isRedundant = true;
                break;
            }
        }

        if (!isRedundant)
        {
            result.Add(item);
        }
    }

    return result;
}
/// <summary>
/// Write the cosine similarity of the given <c>NewsItem</c>s to
/// the file cosine_similarity_log.txt.
/// </summary>
/// <remarks>
/// The given list is sorted in place by title, where every title is
/// parsed as an integer; <c>int.Parse</c> throws if a title is not a
/// valid integer. If the list is empty there is nothing to compare, so
/// the method returns without writing the file.
/// </remarks>
/// <param name="archivist">
/// The <c>Archivist</c> for getting database information.
/// </param>
/// <param name="news">
/// The <c>NewsItem</c>s to print the cosine similarity of. This list is
/// reordered by the method.
/// </param>
private static void WriteCosineSimilarity(Archivist archivist,
    List<NewsItem> news)
{
    // Without any news there is no vector to take the dimension from.
    if (news.Count == 0)
    {
        return;
    }

    // Sort the news according to title.
    news.Sort((n1, n2) =>
        int.Parse(n1.Title).CompareTo(int.Parse(n2.Title)));

    // Get the tf-idf vector of every news item.
    List<SparseVector> vectors = new List<SparseVector>(news.Count);
    foreach (NewsItem n in news)
    {
        vectors.Add(archivist.GetTfIdfVector(n));
    }

    // Create a matrix with one column per news item. The dimension of
    // the first vector is used for all columns.
    int dimension = vectors[0].Dimension;
    SparseMatrix matrix = new SparseMatrix(dimension, vectors.Count);

    // Add entries to the matrix.
    for (int i = 0; i < vectors.Count; i++)
    {
        for (int j = 0; j < dimension; j++)
        {
            matrix[j, i] = vectors[i][j];
        }
    }

    // Transpose so every row is one document and normalize the rows.
    // The normalized matrix is computed once and used for both factors.
    // NOTE(review): assumes Transpose() and NormalizeRows() return new
    // matrices rather than mutating the receiver - confirm.
    SparseMatrix normalizedRows = matrix.Transpose().NormalizeRows();

    // Calculate the result: the product of the row-normalized matrix
    // and its own transpose.
    SparseMatrix result = normalizedRows.Product(normalizedRows.Transpose());

    // Write to file.
    using (StreamWriter file = new StreamWriter("cosine_similarity_log.txt"))
    {
        int rowLength = result.Rows;
        int columnLength = result.Columns;

        // Print header.
        file.Write(" ");
        for (int i = 0; i < columnLength; i++)
        {
            file.Write("dok" + (i + 1).ToString("0#") + " ");
        }

        file.WriteLine();

        // Print the matrix.
        for (int i = 0; i < rowLength; i++)
        {
            // Print doc title.
            file.Write("dok" + (i + 1).ToString("0#") + " ");

            for (int j = 0; j < columnLength; j++)
            {
                file.Write(result[i, j].ToString("0.##0") + " ");
            }

            // No line break after the last row.
            if (i != rowLength - 1)
            {
                file.WriteLine();
            }
        }
    }
}
/// <summary>
/// Finds redundant <c>NewsItem</c>s in the given list,
/// and returns a new list without the redundant <c>NewsItem</c>s.
/// </summary>
/// <remarks>
/// An item is dropped when the dot product of its normalized tf-idf
/// vector and the normalized tf-idf vector of any item later in the list
/// is greater than or equal to <c>THRESHOLD</c>. The last item of the
/// list has no later item to be compared with and is therefore always
/// kept.
/// </remarks>
/// <param name="archivist">
/// An <c>Archivist</c> instance with database access.
/// </param>
/// <param name="news">
/// The list of <c>NewsItem</c>s to filter. The list itself is not
/// modified by this method.
/// </param>
/// <returns>
/// A new, filtered list of <c>NewsItem</c>s in their original order.
/// </returns>
private List<NewsItem> FilterListRedundancy(Archivist archivist,
    List<NewsItem> news)
{
    int count = news.Count;

    // With fewer than two items there is nothing to compare.
    if (count < 2)
    {
        return new List<NewsItem>(news);
    }

    // Fetch and normalize the tf-idf vector of every item exactly once,
    // instead of asking the archivist again for every compared pair.
    SparseVector[] normalized = new SparseVector[count];
    for (int k = 0; k < count; k++)
    {
        normalized[k] = archivist.GetTfIdfVector(news[k]).Normalize();
    }

    List<NewsItem> result = new List<NewsItem>(count);

    // Find redundancy for each news item.
    for (int i = 0; i < count; i++)
    {
        bool isRedundant = false;

        // For all items with index j > i, calculate cosine similarity.
        for (int j = i + 1; j < count; j++)
        {
            // The item is redundant if the cosine similarity exceeds or
            // equals the threshold.
            if (normalized[j].DotProduct(normalized[i]) >= THRESHOLD)
            {
                isRedundant = true;
                break;
            }
        }

        if (!isRedundant)
        {
            result.Add(news[i]);
        }
    }

    return result;
}