/// <summary>
/// Filters the news based on interest.
/// </summary>
/// <param name="archivist">
/// The <c>Archivist</c> connected to the database.
/// </param>
/// <param name="news">
/// The list of <c>NewsItem</c>s to filter.
/// </param>
/// <returns>
/// The <c>NewsItem</c>s whose interest score meets the threshold,
/// ordered by descending score.
/// </returns>
public override List<NewsItem> Filter(Archivist archivist, List<NewsItem> news)
{
    List<Tuple<NewsItem, float>> interestScores =
        CalculateInterestScores(archivist, news);

    // Order by descending interest score.
    interestScores.Sort((t1, t2) => t2.Item2.CompareTo(t1.Item2));

    // Keep only the items whose score reaches the threshold.
    List<NewsItem> result = new List<NewsItem>();
    foreach (Tuple<NewsItem, float> scored in interestScores)
    {
        if (scored.Item2 >= THRESHOLD)
        {
            result.Add(scored.Item1);
        }
    }

    return result;
}
/// <summary>
/// Filters a given list of news for redundant <c>NewsItem</c>s. A
/// precondition to make this filter work is that the list of
/// <c>NewsItem</c>s given is ordered by interest.
/// </summary>
/// <param name="archivist">
/// The <c>Archivist</c> instance with database access.
/// </param>
/// <param name="news">
/// The list of <c>NewsItem</c>s to be filtered.
/// </param>
/// <returns>
/// A filtered list of <c>NewsItem</c>s in the same
/// order as in the input.
/// </returns>
public override List<NewsItem> Filter(Archivist archivist, List<NewsItem> news)
{
    // First remove items redundant with respect to each other,
    // then remove items redundant with news already in the database.
    List<NewsItem> deduplicated = FilterListRedundancy(archivist, news);
    return FilterDatabaseRedundancy(archivist, deduplicated);
}
/// <summary>
/// Seeds the data from JSON-files located in the given directory.
/// The directory must contain folders named as the category of its
/// contents.
/// </summary>
/// <param name="archivist">
/// The <c>Archivist</c> with database access.
/// </param>
/// <param name="dataDir">
/// The directory the data is located in.
/// </param>
/// <param name="perCategory">
/// Number of <c>NewsItem</c>s per category to read.
/// </param>
/// <returns>
/// The number of <c>NewsItem</c>s added.
/// </returns>
public static int SeedDatabaseWithNews(Archivist archivist, string dataDir, int perCategory = 3000)
{
    int count = 0;
    int errorCount = 0;

    // Each category should be placed in its own directory.
    string[] categoryDirs = Directory.GetDirectories(dataDir);

    // Upper bound on the number of files to read; used for progress
    // output. The per-category cap ensures that no category is more
    // likely than others.
    // FIX: was hard-coded to 3000, ignoring the perCategory parameter.
    int totalFilesCount = categoryDirs.Length * perCategory;

    foreach (string categoryDir in categoryDirs)
    {
        // The directory name is the category name.
        int categoryId = archivist.AddCategory(Path.GetFileName(categoryDir));
        List<NewsMaterial> newsBuffer = new List<NewsMaterial>();

        // Read at most perCategory files from this category.
        string[] files = Directory.GetFiles(categoryDir);
        for (int i = 0; i < perCategory && i < files.Length; i++)
        {
            string json = File.ReadAllText(files[i], UTF8Encoding.UTF8);
            NewsMaterial news = ParseJson(json);
            if (news != null)
            {
                newsBuffer.Add(news);
                count++;
            }
            else
            {
                Console.WriteLine("Error seeding item.");
                errorCount++;
            }

            Console.WriteLine("Loaded {0}/{1}", count + errorCount, totalFilesCount);
        }

        // Write the buffered news to the archivist and time the insertion.
        Stopwatch watch = new Stopwatch();
        watch.Start();
        archivist.AddNews(newsBuffer, categoryId, true);
        watch.Stop();
        Console.WriteLine("Time: {0}", watch.ElapsedMilliseconds);
        Console.WriteLine("Seeded {0}.", newsBuffer.Count);
    }

    return count;
}
/// <summary>
/// Finds duplicates in a list of <c>NewsItem</c>s and the database.
/// </summary>
/// <param name="archivist">
/// An <c>Archivist</c> instance with database access.
/// </param>
/// <param name="news">
/// The list of news to compare with the matrix.
/// </param>
/// <returns>
/// Returns the filtered list (always a new list; the input is not
/// modified).
/// </returns>
private List<NewsItem> FilterDatabaseRedundancy(Archivist archivist, List<NewsItem> news)
{
    // Create result list containing all news items. Redundant news
    // will later be removed.
    List<NewsItem> result = new List<NewsItem>(news);

    // Only compare against news from the last two weeks.
    // NOTE(review): uses local time (DateTime.Now) — confirm the stored
    // news dates use the same clock.
    DateTime dateLimit = DateTime.Now.Subtract(
        new TimeSpan(24 * 7 * 2, 0, 0));
    SparseMatrix matrix = GetTermDocumentMatrix(archivist, news, dateLimit);

    // If matrix == null, there are no news items in the db, and thus
    // no redundant news.
    // FIX: return the copy rather than the caller's list, so every
    // path of this method returns a fresh list.
    if (matrix == null)
    {
        return result;
    }

    // Transpose matrix and thus prepare for multiplication.
    SparseMatrix mTransposed = matrix.Transpose();

    // Normalize all rows to make cosine similarity calculation faster.
    mTransposed = mTransposed.NormalizeRows();

    // Find redundancy value for each news item.
    foreach (NewsItem item in news)
    {
        // Get tf-idf vector representing this news item.
        SparseVector newsVector = archivist.GetTfIdfVector(item);
        newsVector = newsVector.Normalize();

        // The product holds the cosine similarity of this item against
        // every database document (both sides are normalized).
        SparseVector product = mTransposed.VectorProduct(newsVector);

        // Remove item from result list if product contains element
        // with value >= threshold.
        foreach (int entry in product.NonZeroIndices)
        {
            if (product[entry] >= THRESHOLD)
            {
                result.Remove(item);
                break;
            }
        }
    }

    return result;
}
/// <summary>
/// Curates unread news from the given <c>Archivist</c> instance based
/// on a maximum reading time, a maximum individual item reading time
/// and a limit.
/// </summary>
/// <param name="archivist">
/// The <c>Archivist</c> instance giving access to the news that
/// should be curated.
/// </param>
/// <param name="limit">
/// The maximum number of news to return.
/// </param>
/// <param name="maxTime">
/// The maximum reading time for the curated news.
/// </param>
/// <param name="maxItemTime">
/// The maximum reading time per news item.
/// </param>
/// <returns>
/// A list of unread, curated <c>NewsItem</c>s received from the
/// <c>Archivist</c> instance.
/// </returns>
public static List<NewsItem> GetCuratedNews(Archivist archivist, int limit, int maxTime = -1, int maxItemTime = -1)
{
    List<NewsItem> curatedNews = CachedNews;
    if (curatedNews == null)
    {
        // Cache miss: fetch the 100 most recent unread news items.
        NewsQuery query = new NewsQuery();
        query.Read = ReadStatus.Unread;
        query.OrderDateDesc = true;
        query.Limit = 100;
        curatedNews = archivist.GetNews(query);

        // Nothing fetched means there is nothing to filter or cache.
        if (curatedNews.Count == 0)
        {
            return curatedNews;
        }

        // Apply the interest filter first, then the redundancy filter
        // (redundancy filtering requires interest-ordered input).
        curatedNews = new InterestFilter().Filter(archivist, curatedNews);
        curatedNews = new RedundancyFilter().Filter(archivist, curatedNews);

        // Cache the interest/redundancy-filtered list.
        CachedNews = curatedNews;
    }

    // Quantity filtering runs on every call, since the limits may
    // differ between calls.
    QuantityFilter quantityFilter =
        new QuantityFilter(limit, maxTime, maxItemTime);
    return quantityFilter.Filter(archivist, curatedNews);
}
/// <summary>
/// Filters a list of news according to the parameters set
/// by the constructor. A precondition to this method is that the list
/// of <c>NewsItem</c>s is sorted by descending interest score.
/// </summary>
/// <param name="archivist">
/// The <c>Archivist</c>.
/// </param>
/// <param name="news">
/// A list of <c>NewsItem</c>s to filter.
/// </param>
/// <returns>
/// A list of <c>NewsItem</c>s filtered as specified by the constructor.
/// </returns>
public override List<NewsItem> Filter(Archivist archivist, List<NewsItem> news)
{
    List<NewsItem> kept = news.ToList();

    // Time-based filtering only applies when a total budget is set.
    if (MaxTime > -1)
    {
        int readingSpeed = archivist.GetReadingSpeed();
        if (readingSpeed <= 0)
        {
            // Without a reading speed the budget cannot be applied.
            // Return an empty list.
            return new List<NewsItem>();
        }

        int remainingTime = MaxTime;
        foreach (NewsItem item in news)
        {
            int readTime = CalculateNewsReadTime(item, readingSpeed);
            if (MaxItemTime > 0 && readTime > MaxItemTime)
            {
                // Too long to read as a single item.
                kept.Remove(item);
            }
            else if (readTime <= remainingTime)
            {
                // Fits in the remaining budget; consume it.
                remainingTime -= readTime;
            }
            else
            {
                // Does not fit in the remaining budget.
                kept.Remove(item);
            }
        }
    }

    // Trim list to max count.
    TrimNewsList(kept, MaxCount);
    return kept;
}
/// <summary>
/// Aggregates news from <c>NewsSource</c>s found in the given
/// <c>Archivist</c>.
/// </summary>
/// <param name="archivist">
/// The <c>Archivist</c> to get <c>NewsSource</c>s from and
/// store <c>NewsItem</c>s in.
/// </param>
public static void Aggregate(Archivist archivist)
{
    // Collect news sources from the archivist.
    List<NewsSource> sources = CollectNewsSources(archivist);

    // Drop sources whose previously fetched content has not expired yet.
    sources.RemoveAll(source =>
        source != null
        && source.ContentExpirationTime != null
        && DateTime.Now.CompareTo(source.ContentExpirationTime) < 0);

    // Aggregate news from the remaining, non-blocked sources.
    AggregateNewsFromSources(archivist, sources.FindAll(n => !n.IsBlocked));
}
/// <summary>
/// Aggregates news from the given list of <c>NewsSource</c>s.
/// </summary>
/// <param name="archivist">
/// The <c>Archivist</c> of which to save aggregated news.
/// </param>
/// <param name="sources">
/// The <c>NewsSource</c>s of which to load news from.
/// </param>
private static void AggregateNewsFromSources(Archivist archivist, List<NewsSource> sources)
{
    List<NewsMaterial> newNewsMaterial = new List<NewsMaterial>();
    object gate = new object();

    // Parse news items from each source in parallel.
    // FIX: List<T> is not thread-safe, so the shared result list must
    // be guarded with a lock — the original called AddRange from
    // multiple threads without synchronization (a data race).
    Parallel.ForEach(sources, source =>
    {
        if (source != null)
        {
            // Parse outside the lock; only the list mutation is guarded.
            var parsed = ParseNewsItems(source);
            lock (gate)
            {
                newNewsMaterial.AddRange(parsed);
            }
        }
    });

    // Persist the updated source state (e.g. expiration times).
    foreach (NewsSource source in sources)
    {
        archivist.UpdateNewsSource(source);
    }

    archivist.AddNews(newNewsMaterial);
}
/// <summary>
/// Filters the list of <c>NewsItem</c>s.
/// </summary>
/// <param name="archivist">
/// The <c>Archivist</c> instance for database access.
/// </param>
/// <param name="news">
/// The list of <c>NewsItem</c>s to filter.
/// </param>
/// <returns>
/// The filtered list.
/// </returns>
public abstract List<NewsItem> Filter(Archivist archivist, List<NewsItem> news);
/// <summary>
/// Reads a JSON file with news sources and saves in archivist.
/// </summary>
/// <param name="archivist">
/// The <c>Archivist</c> instance with database access.
/// </param>
/// <param name="file">
/// The path to the JSON-file containing <c>NewsSource</c>s.
/// </param>
public static void SeedNewsSources(Archivist archivist, string file)
{
    List<NewsSource> newsSources = new List<NewsSource>();
    string json = File.ReadAllText(file, UTF8Encoding.UTF8);

    try
    {
        // Parse JSON and assign various values.
        var values = JArray.Parse(json);
        for (int i = 0; i < values.Count; i++)
        {
            // FIX: an entry missing "name" or "url" used to throw an
            // uncaught NullReferenceException and abort the seeding;
            // skip such entries instead.
            var nameToken = values[i]["name"];
            var urlToken = values[i]["url"];
            if (nameToken == null || urlToken == null)
            {
                Console.WriteLine("News source entry missing name or url.");
                continue;
            }

            string name = nameToken.ToString();
            string url = urlToken.ToString();
            try
            {
                Uri uri = new Uri(url);
                newsSources.Add(new NewsSource(name, uri));
            }
            catch (UriFormatException)
            {
                Console.WriteLine("Could not parse news item URL.");
            }
        }
    }
    catch (Newtonsoft.Json.JsonReaderException)
    {
        Console.WriteLine("Error parsing news source");
    }

    // Best effort: add whatever parsed successfully to the database.
    archivist.AddNewsSources(newsSources);
}
/// <summary>
/// Write the cosine similarity of the given <c>NewsItem</c>s to
/// the file cosine_similarity_log.txt.
/// </summary>
/// <param name="archivist">
/// The <c>Archivist</c> for getting database information.
/// </param>
/// <param name="news">
/// The <c>NewsItem</c>s to print the cosine similarity of.
/// </param>
private static void WriteCosineSimilarity(Archivist archivist, List<NewsItem> news)
{
    // Sort the news according to title.
    // NOTE(review): assumes titles are numeric strings — confirm
    // against the data this diagnostic is run on.
    news.Sort((n1, n2) => int.Parse(n1.Title).CompareTo(int.Parse(n2.Title)));

    // Get the vector of all news and add them to a matrix.
    List<SparseVector> vectors = new List<SparseVector>();
    foreach (NewsItem n in news)
    {
        vectors.Add(archivist.GetTfIdfVector(n));
    }

    // Create a matrix with one column per news item.
    SparseMatrix matrix = new SparseMatrix(vectors.First().Dimension, vectors.Count);
    for (int i = 0; i < vectors.Count; i++)
    {
        for (int j = 0; j < vectors.First().Dimension; j++)
        {
            matrix[j, i] = vectors[i][j];
        }
    }

    // Each row of mTransposed is a row-normalized document vector, so
    // mTransposed * mTransposed^T holds the pairwise cosine similarities.
    // FIX: reuse mTransposed for the right-hand side instead of
    // recomputing Transpose().NormalizeRows() a second time.
    SparseMatrix mTransposed = matrix.Transpose().NormalizeRows();
    SparseMatrix result = mTransposed.Product(mTransposed.Transpose());

    // Write to file.
    using (StreamWriter file = new StreamWriter("cosine_similarity_log.txt"))
    {
        int rowLength = result.Rows;
        int columnLength = result.Columns;

        // Print header.
        file.Write(" ");
        for (int i = 0; i < result.Columns; i++)
        {
            file.Write("dok" + (i + 1).ToString("0#") + " ");
        }
        file.WriteLine();

        // Print the matrix, one row per document.
        for (int i = 0; i < rowLength; i++)
        {
            // Print doc title.
            file.Write("dok" + (i + 1).ToString("0#") + " ");
            for (int j = 0; j < columnLength; j++)
            {
                file.Write(result[i, j].ToString("0.##0") + " ");
            }
            if (i != rowLength - 1)
            {
                file.WriteLine();
            }
        }
    }
}
/// <summary>
/// Calculates and assigns a score for each <c>NewsItem</c> based
/// on interest and weighted by an age-factor. The score is a naive
/// Bayes probability that the item is interesting, computed from term
/// occurrence counts in interesting vs. non-interesting news of the
/// item's category. Items with no terms are skipped entirely (they
/// receive no tuple in the result).
/// </summary>
/// <param name="archivist">
/// The <c>Archivist</c> instance with database access.
/// </param>
/// <param name="news">
/// A list of <c>NewsItem</c> that should be given a score.
/// </param>
/// <returns>
/// A list of tuples containing a <c>NewsItem</c> and a score.
/// </returns>
private List<Tuple<NewsItem, float>> CalculateInterestScores(Archivist archivist, List<NewsItem> news)
{
    List<Tuple<NewsItem, float>> result = new List<Tuple<NewsItem, float>>();

    // Get unique term count (size of the dictionary). Used as the
    // Laplace-smoothing denominator term below.
    int uniqueTermCount = archivist.GetUniqueTermCount();

    // Calculate score for each NewsItem.
    foreach (NewsItem n in news)
    {
        // No need to calculate if term count = 0; such items are
        // omitted from the result.
        if (n.Terms.Count == 0)
        {
            continue;
        }

        // Calculate the number of interesting term occurrences
        // belonging to n.category.
        int interestingCategoryCount = CountTermsWithQuery(archivist, n.Category, true);

        // Calculate the number of not-interesting term occurrences
        // belonging to n.category.
        int nInterestingCategoryCount = CountTermsWithQuery(archivist, n.Category, false);

        // Per-term occurrence counts for this item's terms, split by
        // interesting / not-interesting news in the same category.
        Dictionary<string, int> interestTermsCount = CountTermsWithQuery(archivist, n.Terms.Select(p => p.TermName).ToList<string>(), n.Category, true);
        Dictionary<string, int> nInterestTermsCount = CountTermsWithQuery(archivist, n.Terms.Select(p => p.TermName).ToList<string>(), n.Category, false);

        // Accumulated log10-probabilities: the first part of the naive
        // bayes P(T|i) ...
        double probFirstPart = 0;

        // ... and the first part of the naive bayes P(T|not-i).
        double probSecondPart = 0;

        // For each term in document.
        foreach (Term term in n.Terms)
        {
            // Number of occurrences of the term in interesting news
            // (0 if the term was not seen there).
            int interestCount = interestTermsCount.ContainsKey(term.TermName) ? interestTermsCount[term.TermName] : 0;

            // Number of occurrences of the term in non-interesting news.
            int nInterestCount = nInterestTermsCount.ContainsKey(term.TermName) ?
                nInterestTermsCount[term.TermName] : 0;

            // Calculate location weight: terms appearing earlier in
            // the document (smaller MedianIndex) get a higher weight;
            // ranges from 1 (end of document) to 2 (start).
            double indexWeight = 1 + Math.Log(
                (n.WordCount * 2.0 - term.MedianIndex) / n.WordCount, 2);

            // Calculate P(T|i) with add-one (Laplace) smoothing.
            probFirstPart += Math.Log10(indexWeight * (interestCount + 1.0)
                / (double)(interestingCategoryCount + uniqueTermCount));

            // Calculate P(T|not-i) with add-one (Laplace) smoothing.
            probSecondPart += Math.Log10(indexWeight * (nInterestCount + 1.0)
                / (double)(nInterestingCategoryCount + uniqueTermCount));
        }

        // Calculate age.
        // NOTE(review): uses local time — assumes n.Date is local too.
        TimeSpan age = DateTime.Now.Subtract(n.Date);

        // Calculate age weight: decays by 1% per (started) hour of age.
        double ageWeight = Math.Pow(0.99, Math.Ceiling(age.TotalHours));

        // Calculate probability: logistic combination of the two
        // log-probabilities, damped by the age weight.
        float probability = (float)(ageWeight
            / (1 + Math.Pow(10, probSecondPart - probFirstPart)));

        // Add to result.
        result.Add(new Tuple<NewsItem, float>(n, probability));
    }

    return result;
}
/// <summary>
/// Collects news sources from the given <c>Archivist</c>.
/// </summary>
/// <param name="archivist">
/// The <c>Archivist</c> of which to collect <c>NewsSource</c>s from.
/// </param>
/// <returns>
/// A list of <c>NewsSource</c>s.
/// </returns>
private static List<NewsSource> CollectNewsSources(Archivist archivist)
{
    // Request every stored source (int.MaxValue == no practical limit).
    List<NewsSource> allSources = archivist.GetNewsSources(int.MaxValue);
    return allSources;
}
/// <summary>
/// Counts the number of documents satisfying the parameter values.
/// </summary>
/// <param name="archivist">
/// The <c>Archivist</c> instance with database access.
/// </param>
/// <param name="terms">
/// A list of terms the documents should contain.
/// </param>
/// <param name="category">
/// A <c>Category</c> the documents should belong to.
/// </param>
/// <param name="interesting">
/// Whether the documents should be marked interesting or not.
/// </param>
/// <returns>
/// The number of documents satisfying the parameter values.
/// </returns>
private Dictionary<string, int> CountTermsWithQuery(Archivist archivist, List<string> terms, Category category, bool interesting)
{
    // Build the query; interest status is always set.
    TermCountQuery termQuery = new TermCountQuery();
    termQuery.Interest = interesting
        ? Database.InterestStatus.Interesting
        : Database.InterestStatus.Uninteresting;

    // Restrict to the given terms, if any.
    if (terms != null)
    {
        termQuery.Terms = terms;
    }

    // Restrict to the given category, if any.
    if (category != null)
    {
        termQuery.CategoryIds = new List<int> { category.Id };
    }

    // Send query and return result.
    return archivist.GetIndividualTermCount(termQuery);
}
/// <summary>
/// Counts the number of documents satisfying the parameter values.
/// </summary>
/// <param name="archivist">
/// The <c>Archivist</c> instance with database access.
/// </param>
/// <param name="category">
/// A <c>Category</c> the documents should belong to.
/// </param>
/// <param name="interesting">
/// Whether the documents should be marked interesting or not.
/// </param>
/// <returns>
/// The number of documents satisfying the parameter values.
/// </returns>
private int CountTermsWithQuery(Archivist archivist, Category category, bool interesting)
{
    // Map the boolean onto the archivist's interest status enum.
    InterestStatus status = interesting
        ? InterestStatus.Interesting
        : InterestStatus.Uninteresting;
    return archivist.GetTermCountInCategory(category, status);
}
/// <summary>
/// Creates a term-document matrix from the database based on the given
/// parameters.
/// </summary>
/// <param name="archivist">
/// An <c>Archivist</c> instance with database access.
/// </param>
/// <param name="news">
/// A list of <c>NewsItem</c>s to be excluded from the matrix.
/// </param>
/// <param name="dateLimit">
/// A date limit indicating whether old news should be included in the
/// matrix.
/// </param>
/// <returns>
/// A term-document matrix with tf-idf values representing news in the
/// database.
/// </returns>
private SparseMatrix GetTermDocumentMatrix(Archivist archivist, List<NewsItem> news, DateTime dateLimit)
{
    // The query defines what the matrix is built from: read news,
    // newer than the limit, excluding the given items.
    NewsQuery query = new NewsQuery();
    query.Read = ReadStatus.Read;
    query.NewerThan = dateLimit;
    query.ExcludedNews = news;

    return archivist.GetTfIdfMatrix(query);
}
/// <summary>
/// Finds redundant <c>NewsItem</c>s in the given list,
/// and returns a new list without the <c>NewsItem</c>s.
/// </summary>
/// <param name="archivist">
/// An <c>Archivist</c> instance with database access.
/// </param>
/// <param name="news">
/// The list of <c>NewsItem</c>s to filter.
/// </param>
/// <returns>
/// A filtered list of <c>NewsItem</c>s.
/// </returns>
private List<NewsItem> FilterListRedundancy(Archivist archivist, List<NewsItem> news)
{
    List<NewsItem> result = new List<NewsItem>(news);

    // Fetch and normalize each item's tf-idf vector once up front.
    // FIX: the original re-fetched and re-normalized news[j] inside the
    // inner loop, costing O(n^2) archivist lookups instead of O(n).
    List<SparseVector> normalized = new List<SparseVector>(news.Count);
    foreach (NewsItem item in news)
    {
        normalized.Add(archivist.GetTfIdfVector(item).Normalize());
    }

    // For each item, compare against all later items; remove the
    // earlier item when any pair meets the similarity threshold.
    for (int i = 0; i < news.Count - 1; i++)
    {
        for (int j = i + 1; j < news.Count; j++)
        {
            // Cosine similarity of normalized vectors is their
            // dot product. Remove from the result if it exceeds or
            // equals the threshold.
            if (normalized[j].DotProduct(normalized[i]) >= THRESHOLD)
            {
                result.Remove(news[i]);
                break;
            }
        }
    }

    return result;
}