/// <summary>
/// Stores every item of <c>NewsMaterial</c> through <c>Archivist</c>, refreshes the
/// idf values, and checks that the length of the tf-idf vector returned for one
/// stored item matches the length of a vector computed independently in this test.
/// </summary>
public void GetTfIdfVectorFourNewsItemsInDatabase()
{
    // Index into NewsMaterial of the item whose vector is verified.
    const int targetIndex = 1;

    // Add categories and news sources.
    foreach (Category c in Categories)
    {
        c.Id = Archivist.AddCategory(c.Name);
    }
    Archivist.AddNewsSources(NewsSources);

    // Per-term frequencies: slot i of each list holds the count in NewsMaterial[i].
    Dictionary<string, List<int>> terms = new Dictionary<string, List<int>>();

    // Add the news material while recording its term frequencies.
    for (int i = 0; i < NewsMaterial.Count; i++)
    {
        NewsMaterial material = NewsMaterial[i];
        Dictionary<string, int> termsInText =
            TermUtils.CalculateTermFrequency(material.Content);

        foreach (KeyValuePair<string, int> term in termsInText)
        {
            List<int> counts;
            if (!terms.TryGetValue(term.Key, out counts))
            {
                // First sighting of this term: start with a zero for every news item.
                counts = new List<int>(NewsMaterial.Count);
                for (int j = 0; j < NewsMaterial.Count; j++)
                {
                    counts.Add(0);
                }
                terms.Add(term.Key, counts);
            }
            counts[i] += term.Value;
        }

        // Add to database.
        Archivist.AddNews(material);
    }

    // Update idf values.
    Archivist.UpdateIdfValues();

    // Create expected vector: one component per known term.
    SparseVector expectedVector = new SparseVector(terms.Count);
    int index = 0;
    foreach (KeyValuePair<string, List<int>> termCount in terms)
    {
        // Number of news items in which the term occurs at least once.
        int docCount = 0;
        foreach (int count in termCount.Value)
        {
            if (count > 0)
            {
                docCount++;
            }
        }

        double idf = TermUtils.CalculateInverseDocumentFrequency(
            NewsMaterial.Count, docCount);

        // Term frequency within the item under test (0 when the term is absent from it).
        int tf = termCount.Value[targetIndex];

        expectedVector[index] = (float)(tf * idf);
        index++;
    }

    // Look up the stored item by title; Find yields null when nothing matches,
    // so fail here with a clear message instead of passing null on.
    List<NewsItem> news = Archivist.GetNews(new NewsQuery());
    NewsItem target = news.Find(
        item => item.Title.Equals(NewsMaterial[targetIndex].Title));
    Assert.IsNotNull(target, "The news item under test was not returned by GetNews.");

    // Get vector and compare lengths.
    SparseVector vector = Archivist.GetTfIdfVector(target);

    Assert.AreEqual(expectedVector.Length(), vector.Length(), 0.001);
}
/// <summary>
/// Stores a single item of <c>NewsMaterial</c> through <c>Archivist</c>, refreshes
/// the idf values, and checks that the length of the tf-idf vector returned for
/// that item matches the length of a vector computed independently in this test.
/// </summary>
public void GetTfIdfVectorOneNewsItemInDatabase()
{
    // Add categories and news sources.
    foreach (Category c in Categories)
    {
        c.Id = Archivist.AddCategory(c.Name);
    }
    Archivist.AddNewsSources(NewsSources);

    // The only news item stored by this test.
    NewsMaterial nItem = NewsMaterial[1];

    // Accumulate the frequency of every term found in the item's content.
    Dictionary<string, int> terms = new Dictionary<string, int>();
    Dictionary<string, int> termsInText =
        TermUtils.CalculateTermFrequency(nItem.Content);
    foreach (KeyValuePair<string, int> term in termsInText)
    {
        // TryGetValue leaves 'current' at 0 for a term not seen before.
        int current;
        terms.TryGetValue(term.Key, out current);
        terms[term.Key] = current + term.Value;
    }

    // Add to database.
    Archivist.AddNews(nItem);

    // Update idf values.
    Archivist.UpdateIdfValues();

    // The idf arguments are (1, 1) for every term, so compute it once.
    // NOTE(review): assumes CalculateInverseDocumentFrequency has no side effects - confirm.
    double idf = TermUtils.CalculateInverseDocumentFrequency(1, 1);

    // Create expected vector: one component per term.
    SparseVector expectedVector = new SparseVector(terms.Count);
    int index = 0;
    foreach (KeyValuePair<string, int> termCount in terms)
    {
        int tf = termCount.Value;
        expectedVector[index] = (float)(tf * idf);
        index++;
    }

    // Look up the stored item by title; Find yields null when nothing matches,
    // so fail here with a clear message instead of passing null on.
    List<NewsItem> news = Archivist.GetNews(new NewsQuery());
    NewsItem target = news.Find(item => item.Title.Equals(nItem.Title));
    Assert.IsNotNull(target, "The news item under test was not returned by GetNews.");

    // Get vector and compare lengths.
    SparseVector vector = Archivist.GetTfIdfVector(target);

    Assert.AreEqual(expectedVector.Length(), vector.Length(), 0.001);
}
/// <summary>
/// Checks that <c>Length()</c> of a freshly constructed <c>SparseVector(5)</c>,
/// with no component assigned, equals zero within <c>EPSILON</c>.
/// </summary>
public void LengthTestWhenZero()
{
    SparseVector untouchedVector = new SparseVector(5);

    double actualLength = untouchedVector.Length();

    Assert.AreEqual(0.0, actualLength, EPSILON);
}