Exemplo n.º 1
0
 /// <summary>
 /// Queues every entity in the context's Terms and Scores sets for deletion
 /// and submits both deletions together.
 /// </summary>
 public static void ClearAll()
 {
     using (var context = new EngineDBDataContext())
     {
         var allTerms  = context.Terms;
         var allScores = context.Scores;

         // Passing a table to its own DeleteAllOnSubmit marks each of its rows for removal.
         allTerms.DeleteAllOnSubmit(allTerms);
         allScores.DeleteAllOnSubmit(allScores);

         context.SubmitChanges();
     }
 }
Exemplo n.º 2
0
        /// <summary>
        /// Deletes the terms that belong to a story and every score in which the
        /// story appears on either side.
        /// </summary>
        /// <param name="storyID">Identifier of the story to remove.</param>
        public static void RemoveStory(int storyID)
        {
            using (var context = new EngineDBDataContext())
            {
                // Terms owned by the story.
                var storyTerms = from term in context.Terms
                                 where term.StoryID == storyID
                                 select term;

                // Scores where the story is either the first or the second participant.
                var storyScores = from score in context.Scores
                                  where score.Story1ID == storyID || score.Story2ID == storyID
                                  select score;

                context.Terms.DeleteAllOnSubmit(storyTerms);
                context.Scores.DeleteAllOnSubmit(storyScores);

                context.SubmitChanges();
            }
        }
Exemplo n.º 3
0
        /// <summary>
        /// Store a new story or update an existing one: computes a tf-idf weight for each
        /// of the story's terms, replaces the story's rows in Terms, and then re-scores the
        /// story by calling ScoreStory.
        /// </summary>
        /// <param name="storyID">Identifier of the story; existing terms with this ID are replaced.</param>
        /// <param name="text">Story text, passed to TermSet to obtain the story's terms.</param>
        public static void StoreStory(int storyID, string text)
        {
            using (var _db = new EngineDBDataContext())
            {
                IList<Term> terms = TermSet(text);

                // Per-text row counts across every *other* story, fetched in a single query
                // and kept in memory so the loop below does not hit the database once per term.
                // The comparer is case-insensitive, and counts of texts differing only by case
                // are merged rather than treated as an error.
                var documentFrequencies = new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase);
                if (terms.Count > 0)
                {
                    // Distinct texts keep the IN (...) list sent to the database short.
                    IEnumerable<string> termTexts = terms.Select(t => t.Text).Distinct().ToList();

                    var idfs = from term in _db.Terms
                               where term.StoryID != storyID
                               group term by term.Text
                               into g
                               where termTexts.Contains(g.Key)
                               select new { TermVal = g.Key, Count = g.Count() };

                    foreach (var entry in idfs)
                    {
                        int seen;
                        documentFrequencies.TryGetValue(entry.TermVal, out seen);
                        documentFrequencies[entry.TermVal] = seen + entry.Count;
                    }
                }

                // Number of distinct other stories, plus one for the story being stored.
                double numDocs = (from term in _db.Terms
                                  where term.StoryID != storyID
                                  group term by term.StoryID
                                  into g
                                  select g).Count() + 1;

                // calculate tfidfs
                foreach (Term term in terms)
                {
                    term.StoryID = storyID;

                    // A term that appears in no other story falls back to a frequency of 1.
                    int idf;
                    if (!documentFrequencies.TryGetValue(term.Text, out idf))
                    {
                        idf = 1;
                    }

                    term.tfidf = term.tf * Math.Log(numDocs / idf);
                }

                // delete old entries
                _db.Terms.DeleteAllOnSubmit(_db.Terms.Where(t => t.StoryID == storyID));

                // store
                _db.Terms.InsertAllOnSubmit(terms);

                _db.SubmitChanges();
            }

            // Re-score only after the new terms are committed; ScoreStory opens its own context.
            ScoreStory(storyID);
        }
Exemplo n.º 4
0
        /// <summary>
        /// Calculate the cosine similarity of one story versus every other story that has
        /// terms stored, and replace the story's rows in Scores with the results.
        /// </summary>
        /// <param name="storyID">Identifier of the story to score.</param>
        private static void ScoreStory(int storyID)
        {
            // consider passing data context as an argument to the function
            using (var _db = new EngineDBDataContext())
            {
                // Delete all existing scores involving storyID. This is submitted on its own
                // so the old rows are gone before rows with the same story pair are inserted.
                _db.Scores.DeleteAllOnSubmit(from score in _db.Scores
                                             where score.Story1ID == storyID || score.Story2ID == storyID
                                             select score);
                _db.SubmitChanges();

                // Load the principal story's terms once. Weights are indexed by text
                // (case-insensitively) so the dot product is a dictionary lookup instead
                // of one database query per term per story.
                var terms = from term in _db.Terms
                            where term.StoryID == storyID
                            select new { Text = term.Text, tfidf = term.tfidf };

                var weights = new Dictionary<string, double>(StringComparer.OrdinalIgnoreCase);
                double sumSquares1 = 0.0;
                foreach (var term in terms)
                {
                    sumSquares1 += term.tfidf * term.tfidf;
                    if (term.Text == null)
                    {
                        continue;
                    }

                    double existing;
                    weights.TryGetValue(term.Text, out existing);
                    weights[term.Text] = existing + term.tfidf;
                }

                // Magnitude of the story's vector: square root of the sum of SQUARED weights.
                // Summing in memory also yields 0 for a story without terms instead of throwing.
                double mag1 = Math.Sqrt(sumSquares1);

                // IDs of the stories to score against, materialized so this query is not
                // still being read while the per-story queries below run.
                var ids = (from term in _db.Terms
                           where term.StoryID != storyID
                           group term by term.StoryID
                           into g
                           select g.Key).ToList();

                // score against each story
                foreach (int id in ids)
                {
                    var id1 = id;

                    // Grab the other story's terms/tfidfs in one query.
                    var terms2 = from term in _db.Terms
                                 where term.StoryID == id1
                                 select new { Text = term.Text, tfidf = term.tfidf };

                    double sumSquares2 = 0.0;
                    double dotProduct  = 0.0;
                    foreach (var term2 in terms2)
                    {
                        sumSquares2 += term2.tfidf * term2.tfidf;

                        double weight1;
                        if (term2.Text != null && weights.TryGetValue(term2.Text, out weight1))
                        {
                            dotProduct += weight1 * term2.tfidf;
                        }
                    }

                    // Denominator of the cosine similarity: the product of both magnitudes.
                    double magnitudeProduct = mag1 * Math.Sqrt(sumSquares2);

                    double similarity = dotProduct / magnitudeProduct;
                    if (double.IsNaN(similarity))
                    {
                        // 0 / 0 when either vector has zero magnitude: store no similarity.
                        similarity = 0;
                    }

                    _db.Scores.InsertOnSubmit(new Score
                    {
                        Story1ID = storyID,
                        Story2ID = id,
                        Value    = similarity
                    });
                }

                _db.SubmitChanges();
            }
        }