Пример #1
0
        /// <summary>
        /// Removes every <c>NewsItem</c> from the given list that is
        /// redundant with respect to news already stored in the database.
        /// An item counts as redundant when at least one entry of the product
        /// between the normalized, transposed term-document matrix and the
        /// item's normalized tf-idf vector is greater than or equal to
        /// <c>THRESHOLD</c>.
        /// </summary>
        /// <param name="archivist">
        /// An <c>Archivist</c> instance with database access. It supplies the
        /// tf-idf vector of each item and is handed on to
        /// <c>GetTermDocumentMatrix</c>.
        /// </param>
        /// <param name="news">
        /// The list of news to compare with the matrix. The list itself is
        /// not modified.
        /// </param>
        /// <returns>
        /// A new list holding the non-redundant items in their original
        /// order. A new list is returned even when nothing was filtered.
        /// </returns>
        private List<NewsItem> FilterDatabaseRedundancy(Archivist archivist,
            List<NewsItem> news)
        {
            // The date limit handed to the matrix builder lies two weeks in
            // the past.
            DateTime dateLimit = DateTime.Now.Subtract(TimeSpan.FromDays(14));

            SparseMatrix matrix = GetTermDocumentMatrix(archivist, news, dateLimit);

            // A null matrix means there is nothing in the database to compare
            // against, hence no redundant news. Return a copy so the caller
            // always receives a list that is independent of the input.
            if (matrix == null)
            {
                return new List<NewsItem>(news);
            }

            // Transpose the matrix to prepare for multiplication and
            // normalize its rows so that the product below yields cosine
            // similarities directly.
            SparseMatrix mTransposed = matrix.Transpose().NormalizeRows();

            // Collect the items to keep instead of removing from a copy; this
            // avoids a linear search of the list for every redundant item.
            List<NewsItem> result = new List<NewsItem>(news.Count);

            foreach (NewsItem item in news)
            {
                // Normalized tf-idf vector representing this news item.
                SparseVector newsVector = archivist.GetTfIdfVector(item).Normalize();
                SparseVector product = mTransposed.VectorProduct(newsVector);

                // The item is redundant as soon as one product entry reaches
                // the threshold.
                bool isRedundant = false;
                foreach (int entry in product.NonZeroIndices)
                {
                    if (product[entry] >= THRESHOLD)
                    {
                        isRedundant = true;
                        break;
                    }
                }

                if (!isRedundant)
                {
                    result.Add(item);
                }
            }

            return result;
        }
Пример #2
0
        /// <summary>
        /// Writes the pairwise cosine similarity of the given
        /// <c>NewsItem</c>s as a table to the file cosine_similarity_log.txt.
        /// </summary>
        /// <remarks>
        /// The list is sorted in place by the integer value of each title,
        /// so every title must be parsable by <c>int.Parse</c>. When the list
        /// is empty nothing is written and the file is left untouched.
        /// </remarks>
        /// <param name="archivist">
        /// The <c>Archivist</c> used to obtain the tf-idf vector of each item.
        /// </param>
        /// <param name="news">
        /// The <c>NewsItem</c>s to print the cosine similarity of. This list
        /// is reordered by the call.
        /// </param>
        private static void WriteCosineSimilarity(Archivist archivist, List<NewsItem> news)
        {
            // Without any items there is no vector to take the matrix
            // dimension from, so there is nothing to log.
            if (news.Count == 0)
            {
                return;
            }

            // Sort the news numerically by title (in place).
            news.Sort((n1, n2) => int.Parse(n1.Title).CompareTo(int.Parse(n2.Title)));

            // Get the tf-idf vector of every news item.
            List<SparseVector> vectors = new List<SparseVector>(news.Count);
            foreach (NewsItem n in news)
            {
                vectors.Add(archivist.GetTfIdfVector(n));
            }

            // Build the matrix with one column per news item. The dimension
            // of the first vector is used for all of them.
            int dimension = vectors[0].Dimension;
            SparseMatrix matrix = new SparseMatrix(dimension, vectors.Count);
            for (int i = 0; i < vectors.Count; i++)
            {
                for (int j = 0; j < dimension; j++)
                {
                    matrix[j, i] = vectors[i][j];
                }
            }

            // Transpose so that each row is one document and normalize the
            // rows. The normalized matrix is computed once and reused for
            // both factors of the product.
            SparseMatrix mTransposed = matrix.Transpose().NormalizeRows();

            // Entry [i, j] of the product is the cosine similarity between
            // document i and document j.
            SparseMatrix result = mTransposed.Product(mTransposed.Transpose());

            // Write to file.
            using (StreamWriter file = new StreamWriter("cosine_similarity_log.txt"))
            {
                int rowLength = result.Rows;
                int columnLength = result.Columns;

                // Print header.
                file.Write("      ");
                for (int i = 0; i < columnLength; i++)
                {
                    file.Write("dok" + (i + 1).ToString("0#") + " ");
                }
                file.WriteLine();

                // Print the matrix.
                for (int i = 0; i < rowLength; i++)
                {
                    // Print the row label.
                    file.Write("dok" + (i + 1).ToString("0#") + " ");
                    for (int j = 0; j < columnLength; j++)
                    {
                        file.Write(result[i, j].ToString("0.##0") + " ");
                    }

                    // No line break after the last row.
                    if (i != rowLength - 1)
                    {
                        file.WriteLine();
                    }
                }
            }
        }
Пример #3
0
        /// <summary>
        /// Finds redundant <c>NewsItem</c>s within the given list and returns
        /// a new list without them. An item is redundant when the dot product
        /// of its normalized tf-idf vector with that of any item later in the
        /// list is greater than or equal to <c>THRESHOLD</c>; of a group of
        /// similar items the one that appears last is therefore kept.
        /// </summary>
        /// <param name="archivist">
        /// An <c>Archivist</c> instance with database access, used to obtain
        /// the tf-idf vector of each item.
        /// </param>
        /// <param name="news">
        /// The list of <c>NewsItem</c>s to filter. The list itself is not
        /// modified.
        /// </param>
        /// <returns>
        /// A new, filtered list of <c>NewsItem</c>s in their original order.
        /// </returns>
        private List<NewsItem> FilterListRedundancy(Archivist archivist,
            List<NewsItem> news)
        {
            List<NewsItem> result = new List<NewsItem>(news.Count);

            // With fewer than two items there is nothing to compare, so no
            // vectors need to be requested from the archivist.
            if (news.Count < 2)
            {
                result.AddRange(news);
                return result;
            }

            // Fetch and normalize every tf-idf vector exactly once instead of
            // repeating the work for each pair of items.
            List<SparseVector> normalized = new List<SparseVector>(news.Count);
            foreach (NewsItem item in news)
            {
                normalized.Add(archivist.GetTfIdfVector(item).Normalize());
            }

            for (int i = 0; i < news.Count; i++)
            {
                // Compare with all items at index j > i. The last item has no
                // successors and is always kept.
                bool isRedundant = false;
                for (int j = i + 1; j < news.Count; j++)
                {
                    // Both vectors are normalized, so the dot product is the
                    // cosine similarity.
                    if (normalized[j].DotProduct(normalized[i]) >= THRESHOLD)
                    {
                        isRedundant = true;
                        break;
                    }
                }

                if (!isRedundant)
                {
                    result.Add(news[i]);
                }
            }

            return result;
        }