Пример #1
0
        /// <summary>
        /// Scores every news item for interest, orders the items from the
        /// highest to the lowest score and keeps only those whose score is
        /// at least <c>THRESHOLD</c>.
        /// </summary>
        /// <param name="archivist">
        /// The <c>Archivist</c> handed on to the interest score calculation.
        /// </param>
        /// <param name="news">
        /// The list of <c>NewsItem</c>s to filter.
        /// </param>
        /// <returns>
        /// A new list with the items that reached the threshold, ordered by
        /// descending interest score.
        /// </returns>
        public override List<NewsItem> Filter(Archivist archivist, List<NewsItem> news)
        {
            List<Tuple<NewsItem, float>> scored =
                CalculateInterestScores(archivist, news);

            // Highest score first: the right operand is compared against the left.
            scored.Sort((left, right) => right.Item2.CompareTo(left.Item2));

            // Keep the sufficiently interesting entries, then drop the scores.
            return scored
                .FindAll(entry => entry.Item2 >= THRESHOLD)
                .ConvertAll(entry => entry.Item1);
        }
Пример #2
0
        /// <summary>
        /// Removes redundant <c>NewsItem</c>s from the given list in two
        /// passes: first items that are redundant with respect to other items
        /// in the list, then items that are redundant with respect to news in
        /// the database. The list is expected to be ordered by interest.
        /// </summary>
        /// <param name="archivist">
        /// The <c>Archivist</c> instance with database access.
        /// </param>
        /// <param name="news">
        /// The list of <c>NewsItem</c>s to be filtered.
        /// </param>
        /// <returns>
        /// The remaining <c>NewsItem</c>s, in the same relative order as in
        /// the input.
        /// </returns>
        public override List<NewsItem> Filter(Archivist archivist,
            List<NewsItem> news)
        {
            // Pass 1: redundancy inside the list itself.
            List<NewsItem> uniqueWithinList = FilterListRedundancy(archivist, news);

            // Pass 2: redundancy against the database.
            List<NewsItem> uniqueOverall =
                FilterDatabaseRedundancy(archivist, uniqueWithinList);

            return uniqueOverall;
        }
Пример #3
0
        /// <summary>
        /// Seeds the data from JSON-files located in the given directory.
        /// The directory must contain folders named as the category of its
        /// contents; each folder name is registered as a category and at most
        /// <paramref name="perCategory"/> files are read from it.
        /// </summary>
        /// <param name="archivist">
        /// The <c>Archivist</c> with database access.
        /// </param>
        /// <param name="dataDir">
        /// The directory the data is located in.
        /// </param>
        /// <param name="perCategory">
        /// Maximum number of files to read per category. Capping every
        /// category at the same number keeps any category from being more
        /// likely than the others.
        /// </param>
        /// <returns>
        /// The number of files that were parsed successfully and handed to
        /// the <c>Archivist</c>.
        /// </returns>
        public static int SeedDatabaseWithNews(Archivist archivist, string dataDir,
            int perCategory = 3000)
        {
            int count = 0;
            int errorCount = 0;

            // Each category should be placed in its own directory.
            string[] categoryDirs = Directory.GetDirectories(dataDir);

            // Upper bound for the progress output. It is derived from
            // perCategory (not a fixed 3000) so the total matches the limit
            // the caller asked for, and computed as long so that a large
            // perCategory cannot overflow.
            long totalFilesCount = (long)categoryDirs.Length * perCategory;

            foreach (string categoryDir in categoryDirs)
            {
                int categoryId = archivist.AddCategory(Path.GetFileName(categoryDir));

                List<NewsMaterial> newsBuffer = new List<NewsMaterial>();

                // Read at most perCategory files from this category.
                string[] files = Directory.GetFiles(categoryDir);
                int filesToRead = Math.Min(perCategory, files.Length);
                for (int i = 0; i < filesToRead; i++)
                {
                    string json = File.ReadAllText(files[i], UTF8Encoding.UTF8);
                    NewsMaterial news = ParseJson(json);
                    if (news != null)
                    {
                        newsBuffer.Add(news);
                        count++;
                    }
                    else
                    {
                        // ParseJson signalled a failure by returning null.
                        Console.WriteLine("Error seeding item.");
                        errorCount++;
                    }
                    Console.WriteLine("Loaded {0}/{1}", count + errorCount,
                        totalFilesCount);
                }

                // Write the news to the archivist and report how long it took.
                Stopwatch watch = Stopwatch.StartNew();
                archivist.AddNews(newsBuffer, categoryId, true);
                watch.Stop();
                Console.WriteLine("Time: {0}", watch.ElapsedMilliseconds);
                Console.WriteLine("Seeded {0}.", newsBuffer.Count);
            }

            return count;
        }
Пример #4
0
        /// <summary>
        /// Removes the <c>NewsItem</c>s that are too similar to news already
        /// in the database. Similarity is measured between normalized tf-idf
        /// vectors; an item is dropped as soon as one similarity value
        /// reaches <c>THRESHOLD</c>.
        /// </summary>
        /// <param name="archivist">
        /// An <c>Archivist</c> instance with database access.
        /// </param>
        /// <param name="news">
        /// The list of news to compare with the term-document matrix.
        /// </param>
        /// <returns>
        /// The input list itself when no matrix is available, otherwise a
        /// new list without the redundant items.
        /// </returns>
        private List<NewsItem> FilterDatabaseRedundancy(Archivist archivist,
            List<NewsItem> news)
        {
            // Begin with a copy holding everything; redundant items are
            // taken out of it further down.
            List<NewsItem> survivors = new List<NewsItem>(news);

            // Look back two weeks (24 * 7 * 2 hours) from now.
            TimeSpan lookBack = new TimeSpan(24 * 7 * 2, 0, 0);
            DateTime dateLimit = DateTime.Now.Subtract(lookBack);

            SparseMatrix termDocuments =
                GetTermDocumentMatrix(archivist, news, dateLimit);

            // Without a matrix there is nothing to compare against, so
            // nothing can be redundant.
            if (termDocuments == null)
            {
                return news;
            }

            // Transpose for the multiplication and normalize every row up
            // front so the cosine similarity is a plain product.
            SparseMatrix documentRows = termDocuments.Transpose().NormalizeRows();

            foreach (NewsItem candidate in news)
            {
                // Normalized tf-idf vector of the candidate.
                SparseVector candidateVector =
                    archivist.GetTfIdfVector(candidate).Normalize();
                SparseVector similarities =
                    documentRows.VectorProduct(candidateVector);

                // One entry at or above the threshold is enough to call the
                // candidate redundant.
                bool redundant = false;
                foreach (int index in similarities.NonZeroIndices)
                {
                    if (similarities[index] >= THRESHOLD)
                    {
                        redundant = true;
                        break;
                    }
                }

                if (redundant)
                {
                    survivors.Remove(candidate);
                }
            }

            return survivors;
        }
Пример #5
0
        /// <summary>
        /// Curates unread news from the given <c>Archivist</c> instance based
        /// on a maximum reading time, a maximum individual item reading time
        /// and a limit. The interest- and redundancy-filtered list is taken
        /// from <c>CachedNews</c> when available and stored there after it
        /// has been computed; the quantity filter runs on every call.
        /// </summary>
        /// <param name="archivist">
        /// The <c>Archivist</c> instance giving access to the news that
        /// should be curated.
        /// </param>
        /// <param name="limit">
        /// The maximum number of news to return.
        /// </param>
        /// <param name="maxTime">
        /// The maximum reading time for the curated news.
        /// </param>
        /// <param name="maxItemTime">
        /// The maximum reading time per news item.
        /// </param>
        /// <returns>
        /// A list of unread, curated <c>NewsItem</c>s received from the
        /// <c>Archivist</c> instance.
        /// </returns>
        public static List<NewsItem> GetCuratedNews(Archivist archivist,
            int limit, int maxTime = -1, int maxItemTime = - 1)
        {
            List<NewsItem> candidates = CachedNews;

            // Nothing cached: fetch and filter from scratch.
            if (candidates == null)
            {
                // The 100 newest unread items.
                NewsQuery unreadQuery = new NewsQuery
                {
                    Read = ReadStatus.Unread,
                    OrderDateDesc = true,
                    Limit = 100
                };
                candidates = archivist.GetNews(unreadQuery);

                // An empty result needs no filtering (and is not cached).
                if (candidates.Count == 0)
                {
                    return candidates;
                }

                // Interesting news first, then strip redundant items.
                candidates = new InterestFilter().Filter(archivist, candidates);
                candidates = new RedundancyFilter().Filter(archivist, candidates);

                CachedNews = candidates;
            }

            // The quantity limits depend on the arguments, so they are
            // applied on every call.
            QuantityFilter quantityFilter =
                new QuantityFilter(limit, maxTime, maxItemTime);
            return quantityFilter.Filter(archivist, candidates);
        }
Пример #6
0
        /// <summary>
        /// Filters a list of news according to the parameters set by the
        /// constructor. A precondition to this method is that the list of
        /// <c>NewsItem</c>s is sorted by descending interest score, because
        /// items are accepted in list order until the time budget is used up.
        /// </summary>
        /// <remarks>
        /// The per-item limit <c>MaxItemTime</c> is only applied when
        /// <c>MaxTime</c> has been set (is greater than -1).
        /// </remarks>
        /// <param name="archivist">
        /// The <c>Archivist</c> used to look up the reading speed.
        /// </param>
        /// <param name="news">
        /// A list of <c>NewsItem</c>s to filter. The list is not modified.
        /// </param>
        /// <returns>
        /// A new list of <c>NewsItem</c>s filtered as specified by the
        /// constructor. The list is empty when time filtering is requested
        /// but the reading speed is not positive.
        /// </returns>
        public override List<NewsItem> Filter(Archivist archivist,
            List<NewsItem> news)
        {
            List<NewsItem> filtered;

            // Only filter for time if MaxTime has been set.
            if (MaxTime > -1)
            {
                int readingSpeed = archivist.GetReadingSpeed();
                if (readingSpeed <= 0)
                {
                    // Not possible to filter for. Return an empty list.
                    return new List<NewsItem>();
                }

                int remainingTime = MaxTime;
                filtered = new List<NewsItem>(news.Count);

                // Build the result in a single pass. Removing rejected items
                // from a copy costs a linear search per item and removes the
                // first equal element, which is the wrong one when the list
                // contains duplicates.
                foreach (NewsItem item in news)
                {
                    int readTime = CalculateNewsReadTime(item, readingSpeed);

                    bool exceedsItemLimit =
                        MaxItemTime > 0 && readTime > MaxItemTime;
                    if (exceedsItemLimit || readTime > remainingTime)
                    {
                        continue;
                    }

                    // The item fits: spend its reading time and keep it.
                    remainingTime -= readTime;
                    filtered.Add(item);
                }
            }
            else
            {
                filtered = news.ToList();
            }

            // Trim list to max count.
            TrimNewsList(filtered, MaxCount);

            return filtered;
        }
Пример #7
0
        /// <summary>
        /// Aggregates news from <c>NewsSource</c>s found in the given
        /// <c>Archivist</c>. Sources that are null, blocked, or whose
        /// <c>ContentExpirationTime</c> still lies in the future are skipped.
        /// </summary>
        /// <param name="archivist">
        /// The <c>Archivist</c> to get <c>NewsSource</c>s from and
        /// store <c>NewsItem</c>s in.
        /// </param>
        public static void Aggregate(Archivist archivist)
        {
            // Collect news sources from the archivist.
            List<NewsSource> sources = CollectNewsSources(archivist);

            // Select the sources to load in one pass. The null check is part
            // of the same predicate as the IsBlocked test, so a null entry
            // is dropped instead of being dereferenced.
            List<NewsSource> dueSources = sources.FindAll(source =>
                source != null &&
                !source.IsBlocked &&
                !(source.ContentExpirationTime != null &&
                  DateTime.Now.CompareTo(source.ContentExpirationTime) < 0));

            // Aggregate news from the filtered sources.
            AggregateNewsFromSources(archivist, dueSources);
        }
Пример #8
0
        /// <summary>
        /// Aggregates news from the given list of <c>NewsSource</c>s: parses
        /// every source in parallel, passes each source to
        /// <c>UpdateNewsSource</c> and finally adds all parsed material to
        /// the <c>Archivist</c> in one call.
        /// </summary>
        /// <param name="archivist">
        /// The <c>Archivist</c> of which to save aggregated news.
        /// </param>
        /// <param name="sources">
        /// The <c>NewsSource</c>s of which to load news from. Null entries
        /// are ignored.
        /// </param>
        private static void AggregateNewsFromSources(Archivist archivist,
            List<NewsSource> sources)
        {
            List<NewsMaterial> newNewsMaterial = new List<NewsMaterial>();

            // List<T> is not safe for concurrent writers, so every append
            // from the parallel loop goes through this lock.
            object materialLock = new object();

            // Run ParseNewsItems on each news source, in parallel. The
            // parsing itself happens outside the lock.
            Parallel.ForEach(sources, source =>
            {
                if (source == null)
                {
                    return;
                }

                var parsed = ParseNewsItems(source);
                lock (materialLock)
                {
                    newNewsMaterial.AddRange(parsed);
                }
            });

            // Update the sources sequentially, skipping the same null
            // entries the parse loop skipped.
            foreach (NewsSource source in sources)
            {
                if (source != null)
                {
                    archivist.UpdateNewsSource(source);
                }
            }

            archivist.AddNews(newNewsMaterial);
        }
Пример #9
0
 /// <summary>
 /// Filters the list of <c>NewsItem</c>s. Each concrete filter defines
 /// its own criterion.
 /// </summary>
 /// <param name="archivist">
 /// The <c>Archivist</c> instance for database access.
 /// </param>
 /// <param name="news">
 /// The list of <c>NewsItem</c>s to filter.
 /// </param>
 /// <returns>
 /// The filtered list.
 /// </returns>
 public abstract List<NewsItem> Filter(Archivist archivist, List<NewsItem> news);
Пример #10
0
        /// <summary>
        /// Reads a JSON file with news sources and saves them in the
        /// archivist. The file is expected to hold a JSON array whose
        /// entries have a <c>name</c> and a <c>url</c> property. Entries
        /// that lack either property or whose url is not a valid absolute
        /// URI are reported on the console and skipped.
        /// </summary>
        /// <param name="archivist">
        /// The <c>Archivist</c> instance with database access.
        /// </param>
        /// <param name="file">
        /// The path to the JSON-file containing <c>NewsSource</c>s.
        /// </param>
        public static void SeedNewsSources(Archivist archivist, string file)
        {
            List<NewsSource> newsSources = new List<NewsSource>();

            string json = File.ReadAllText(file, UTF8Encoding.UTF8);
            try
            {
                // Parse JSON and assign various values.
                var values = JArray.Parse(json);

                for (int i = 0; i < values.Count; i++)
                {
                    var nameToken = values[i]["name"];
                    var urlToken = values[i]["url"];

                    // A missing property yields null; skip the entry rather
                    // than failing on ToString().
                    if (nameToken == null || urlToken == null)
                    {
                        Console.WriteLine("News source entry is missing a name or url.");
                        continue;
                    }

                    // TryCreate reports an invalid URL through its return
                    // value, so no exception handling is needed here.
                    Uri uri;
                    if (Uri.TryCreate(urlToken.ToString(), UriKind.Absolute, out uri))
                    {
                        newsSources.Add(new NewsSource(nameToken.ToString(), uri));
                    }
                    else
                    {
                        Console.WriteLine("Could not parse news item URL.");
                    }
                }
            }
            catch (Newtonsoft.Json.JsonReaderException)
            {
                Console.WriteLine("Error parsing news source");
            }

            // Add whatever was collected to the database.
            archivist.AddNewsSources(newsSources);
        }
Пример #11
0
        /// <summary>
        /// Write the cosine similarity of the given <c>NewsItem</c>s to
        /// the file cosine_similarity_log.txt. The titles of the items must
        /// be integers, since the items are ordered by their parsed title.
        /// </summary>
        /// <param name="archivist">
        /// The <c>Archivist</c> for getting database information.
        /// </param>
        /// <param name="news">
        /// The <c>NewsItem</c>s to print the cosine similarity of. The list
        /// is sorted in place. When it is empty nothing is written.
        /// </param>
        private static void WriteCosineSimilarity(Archivist archivist, List<NewsItem> news)
        {
            // Without any item there is no vector to take the dimension
            // from, so there is nothing to write.
            if (news.Count == 0)
            {
                return;
            }

            // Sort the news according to title.
            news.Sort((n1, n2) => int.Parse(n1.Title).CompareTo(int.Parse(n2.Title)));

            // Get the vector of all news.
            List<SparseVector> vectors = new List<SparseVector>(news.Count);
            foreach (NewsItem n in news)
            {
                vectors.Add(archivist.GetTfIdfVector(n));
            }

            // Read the dimension once instead of on every loop iteration.
            int dimension = vectors[0].Dimension;

            // Create a matrix with one column per news item.
            SparseMatrix matrix = new SparseMatrix(dimension, vectors.Count);
            for (int i = 0; i < vectors.Count; i++)
            {
                for (int j = 0; j < dimension; j++)
                {
                    matrix[j, i] = vectors[i][j];
                }
            }

            // Transpose and normalize once, then reuse the result for both
            // operands of the product.
            // NOTE(review): assumes Transpose() and NormalizeRows() return
            // new matrices and leave the receiver untouched - confirm.
            SparseMatrix normalizedRows = matrix.Transpose().NormalizeRows();

            // Calculate the result.
            SparseMatrix result = normalizedRows.Product(normalizedRows.Transpose());

            // Write to file.
            using (StreamWriter file = new StreamWriter("cosine_similarity_log.txt"))
            {
                int rowLength = result.Rows;
                int columnLength = result.Columns;

                // Print header.
                file.Write("      ");
                for (int i = 0; i < columnLength; i++)
                {
                    file.Write("dok" + (i + 1).ToString("0#") + " ");
                }
                file.WriteLine();

                // Print the matrix.
                for (int i = 0; i < rowLength; i++)
                {
                    // Print doc title.
                    file.Write("dok" + (i + 1).ToString("0#") + " ");
                    for (int j = 0; j < columnLength; j++)
                    {
                        file.Write(result[i, j].ToString("0.##0") + " ");
                    }

                    // No line break after the last row.
                    if (i != rowLength - 1)
                    {
                        file.WriteLine();
                    }
                }
            }
        }
Пример #12
0
        /// <summary>
        /// Calculates and assigns a score for each <c>NewsItem</c> based
        /// on interest and weighted by an age-factor. Items without terms
        /// are skipped and do not appear in the result.
        /// </summary>
        /// <param name="archivist">
        /// The <c>Archivist</c> instance with database access.
        /// </param>
        /// <param name="news">
        /// A list of <c>NewsItem</c> that should be given a score.
        /// </param>
        /// <returns>
        /// A list of tuples containing a <c>NewsItem</c> and a score.
        /// </returns>
        private List<Tuple<NewsItem, float>> CalculateInterestScores(Archivist archivist,
                                                                    List<NewsItem> news)
        {
            List<Tuple<NewsItem, float>> result = new List<Tuple<NewsItem, float>>();

            // Get unique term count (size of the dictionary).
            int uniqueTermCount = archivist.GetUniqueTermCount();

            // Calculate score for each NewsItem.
            foreach (NewsItem n in news)
            {
                // No need to calculate if term count = 0.
                if (n.Terms.Count == 0)
                {
                    continue;
                }

                // Number of interesting term occurrences belonging to
                // n.Category.
                int interestingCategoryCount = CountTermsWithQuery(archivist, n.Category, true);

                // Number of not-interesting term occurrences belonging to
                // n.Category.
                int nInterestingCategoryCount = CountTermsWithQuery(archivist, n.Category, false);

                // The term names are built once and used for both queries.
                List<string> termNames = n.Terms.Select(p => p.TermName).ToList<string>();

                Dictionary<string, int> interestTermsCount = CountTermsWithQuery(archivist,
                    termNames,
                    n.Category,
                    true);

                Dictionary<string, int> nInterestTermsCount = CountTermsWithQuery(archivist,
                    termNames,
                    n.Category,
                    false);

                // The first part of the naive bayes: log P(T|i).
                double probFirstPart = 0;
                // The second part of the naive bayes: log P(T|not-i).
                double probSecondPart = 0;

                // For each term in document.
                foreach (Term term in n.Terms)
                {
                    // Occurrences of the term in interesting news. A term
                    // that is missing from the dictionary counts as 0, which
                    // is what TryGetValue leaves in the out variable.
                    int interestCount;
                    interestTermsCount.TryGetValue(term.TermName, out interestCount);

                    // Occurrences of the term in non-interesting news.
                    int nInterestCount;
                    nInterestTermsCount.TryGetValue(term.TermName, out nInterestCount);

                    // Calculate location weight: the smaller the median
                    // index of the term, the larger the weight.
                    double indexWeight = 1 + Math.Log(
                        (n.WordCount * 2.0 - term.MedianIndex) / n.WordCount, 2);

                    // Accumulate log P(T|i); the + 1.0 and + uniqueTermCount
                    // keep unseen terms from zeroing the product.
                    probFirstPart += Math.Log10(indexWeight * (interestCount + 1.0) /
                                                (double)(interestingCategoryCount
                                                + uniqueTermCount));
                    // Accumulate log P(T|not-i).
                    probSecondPart += Math.Log10(indexWeight * (nInterestCount + 1.0) /
                                                (double)(nInterestingCategoryCount
                                                + uniqueTermCount));
                }

                // Calculate age.
                TimeSpan age = DateTime.Now.Subtract(n.Date);
                // Age weight: a factor of 0.99 per started hour.
                double ageWeight = Math.Pow(0.99, Math.Ceiling(age.TotalHours));

                // Turn the two log sums into a probability and apply the
                // age weight.
                float probability = (float)(ageWeight /
                    (1 + Math.Pow(10, probSecondPart - probFirstPart)));

                // Add to result.
                result.Add(new Tuple<NewsItem, float>(n, probability));
            }

            return result;
        }
Пример #13
0
 /// <summary>
 /// Fetches the <c>NewsSource</c>s known to the given <c>Archivist</c>.
 /// </summary>
 /// <param name="archivist">
 /// The <c>Archivist</c> to request the <c>NewsSource</c>s from.
 /// </param>
 /// <returns>
 /// The list of <c>NewsSource</c>s returned by the <c>Archivist</c>.
 /// </returns>
 private static List<NewsSource> CollectNewsSources(Archivist archivist)
 {
     // int.MaxValue is passed as the argument - presumably a limit, so
     // that effectively all sources are requested; verify against
     // Archivist.GetNewsSources.
     List<NewsSource> allSources = archivist.GetNewsSources(int.MaxValue);
     return allSources;
 }
Пример #14
0
        /// <summary>
        /// Builds a <c>TermCountQuery</c> from the parameter values and asks
        /// the <c>Archivist</c> for the individual term counts matching it.
        /// </summary>
        /// <param name="archivist">
        /// The <c>Archivist</c> instance with database access.
        /// </param>
        /// <param name="terms">
        /// The terms to count. When null, the query keeps its default terms.
        /// </param>
        /// <param name="category">
        /// The <c>Category</c> to restrict the count to. When null, the
        /// query keeps its default categories.
        /// </param>
        /// <param name="interesting">
        /// Whether to count in news marked interesting (true) or
        /// uninteresting (false).
        /// </param>
        /// <returns>
        /// The dictionary of counts returned by the <c>Archivist</c>;
        /// callers look the counts up by term name.
        /// </returns>
        private Dictionary<string, int> CountTermsWithQuery(Archivist archivist, List<string> terms,
                              Category category, bool interesting)
        {
            TermCountQuery countQuery = new TermCountQuery();

            // Only overwrite the query's terms when some were supplied.
            if (terms != null)
            {
                countQuery.Terms = terms;
            }

            // Only restrict to a category when one was supplied.
            if (category != null)
            {
                countQuery.CategoryIds = new List<int> { category.Id };
            }

            // Translate the flag into the interest status of the query.
            if (interesting)
            {
                countQuery.Interest = Database.InterestStatus.Interesting;
            }
            else
            {
                countQuery.Interest = Database.InterestStatus.Uninteresting;
            }

            // Send query and hand back the counts.
            Dictionary<string, int> counts = archivist.GetIndividualTermCount(countQuery);
            return counts;
        }
Пример #15
0
 /// <summary>
 /// Asks the <c>Archivist</c> for the term count in a category, limited
 /// to news with the requested interest status.
 /// </summary>
 /// <param name="archivist">
 /// The <c>Archivist</c> instance with database access.
 /// </param>
 /// <param name="category">
 /// The <c>Category</c> to count terms in.
 /// </param>
 /// <param name="interesting">
 /// Whether to count in news marked interesting (true) or
 /// uninteresting (false).
 /// </param>
 /// <returns>
 /// The term count reported by the <c>Archivist</c>.
 /// </returns>
 private int CountTermsWithQuery(Archivist archivist, Category category,
                       bool interesting)
 {
     // Translate the flag into the matching interest status.
     InterestStatus status;
     if (interesting)
     {
         status = InterestStatus.Interesting;
     }
     else
     {
         status = InterestStatus.Uninteresting;
     }

     // Send query and return result.
     return archivist.GetTermCountInCategory(category, status);
 }
Пример #16
0
 /// <summary>
 /// Requests a tf-idf matrix from the <c>Archivist</c> for the read news
 /// that is newer than the given date limit, excluding the given
 /// <c>NewsItem</c>s.
 /// </summary>
 /// <param name="archivist">
 /// An <c>Archivist</c> instance with database access.
 /// </param>
 /// <param name="news">
 /// A list of <c>NewsItem</c>s to be excluded from the matrix.
 /// </param>
 /// <param name="dateLimit">
 /// Only news newer than this date is included in the matrix.
 /// </param>
 /// <returns>
 /// The matrix returned by the <c>Archivist</c> for the query.
 /// </returns>
 private SparseMatrix GetTermDocumentMatrix(Archivist archivist,
     List<NewsItem> news, DateTime dateLimit)
 {
     // Describe which news the matrix should be made of.
     NewsQuery matrixQuery = new NewsQuery
     {
         NewerThan = dateLimit,
         ExcludedNews = news,
         Read = ReadStatus.Read
     };

     SparseMatrix termDocuments = archivist.GetTfIdfMatrix(matrixQuery);
     return termDocuments;
 }
Пример #17
0
        /// <summary>
        /// Finds redundant <c>NewsItem</c>s in the given list and returns a
        /// new list without them. An item is redundant when the dot product
        /// of its normalized tf-idf vector with that of any later item in
        /// the list reaches <c>THRESHOLD</c>.
        /// </summary>
        /// <param name="archivist">
        /// An <c>Archivist</c> instance with database access.
        /// </param>
        /// <param name="news">
        /// The list of <c>NewsItem</c>s to filter. The list is not modified.
        /// </param>
        /// <returns>
        /// A filtered list of <c>NewsItem</c>s.
        /// </returns>
        private List<NewsItem> FilterListRedundancy(Archivist archivist,
            List<NewsItem> news)
        {
            List<NewsItem> result = new List<NewsItem>(news);

            // Normalized vectors by list position. Each vector is fetched
            // and normalized at most once instead of once per compared pair.
            Dictionary<int, SparseVector> normalizedVectors =
                new Dictionary<int, SparseVector>();

            // Find redundancy for each news item.
            for (int i = 0; i < news.Count - 1; i++)
            {
                SparseVector itemVector =
                    GetNormalizedVector(archivist, news, normalizedVectors, i);

                // For all items with index j > i, calculate cosine similarity.
                for (int j = i + 1; j < news.Count; j++)
                {
                    SparseVector compareVector =
                        GetNormalizedVector(archivist, news, normalizedVectors, j);

                    // Remove from list if cosine similarity exceeds or
                    // equals threshold.
                    // NOTE(review): this drops the earlier item (index i)
                    // of a similar pair and keeps the later one - confirm
                    // that is intended if the list is ordered by interest.
                    if (compareVector.DotProduct(itemVector) >= THRESHOLD)
                    {
                        result.Remove(news[i]);
                        break;
                    }
                }
            }

            return result;
        }

        /// <summary>
        /// Returns the normalized tf-idf vector of the item at the given
        /// position, fetching it from the <c>Archivist</c> on first use and
        /// from the cache afterwards.
        /// </summary>
        private SparseVector GetNormalizedVector(Archivist archivist,
            List<NewsItem> news, Dictionary<int, SparseVector> cache, int index)
        {
            SparseVector vector;
            if (!cache.TryGetValue(index, out vector))
            {
                vector = archivist.GetTfIdfVector(news[index]).Normalize();
                cache[index] = vector;
            }

            return vector;
        }