Beispiel #1
0
 /// <summary>
 /// Checks whether a token counts as a stop word for the given language.
 /// </summary>
 /// <param name="token">The token to look up.</param>
 /// <param name="language">The language whose stop-word entry is consulted.</param>
 /// <returns>
 /// <c>false</c> when <paramref name="token"/> is null or empty, or when
 /// <c>StopWordsLanguages</c> has no entry for <paramref name="language"/>;
 /// otherwise the result of that entry's own <c>IsStopWord</c> check.
 /// </returns>
 public bool IsStopWord(string token, StopWordsLanguage language)
 {
     // Nothing to look up for a missing token.
     if (string.IsNullOrEmpty(token))
     {
         return false;
     }

     // Delegate to the per-language entry when one is registered.
     if (StopWordsLanguages.TryGetValue(language, out var languageStopWords))
     {
         return languageStopWords.IsStopWord(token);
     }

     return false;
 }
Beispiel #2
0
        /// <summary>
        /// Computes TF-IDF records for the posts stored in the intermediate database and
        /// writes them to the words database.
        /// </summary>
        /// <remarks>
        /// <para>The work is done in three passes:</para>
        /// <para>1. Count, for every accepted word, the number of posts containing it, then keep
        /// only the words found in more than 40 posts.</para>
        /// <para>2. For every (post, word) pair compute a dampened tf * idf value and save the
        /// records whose value exceeds 17, in batches of at most 5000.</para>
        /// <para>3. For every saved word load its records, convert them with
        /// <c>Utils.TfIdfToData</c> and save the result as <c>WordTfIdfData</c>, in batches of
        /// at most 50.</para>
        /// <para>Both databases are disposed even when one of the passes throws.</para>
        /// </remarks>
        public static void CalculateTFIDF()
        {
            string db_path     = @"C:\Users\Administrator\Documents\stackoverflow\WORDS_DATA";
            string interm_path = @"C:\Users\Administrator\Documents\stackoverflow\INTERM_DATA";

            Db db_words = new Db(db_path);
            Db db_post  = null;

            try
            {
                db_post = new Db(interm_path);

                int step  = 10000;
                int start = db_post.Table <WholePost>().OrderBy(f => f.Id).Take(1).Select(f => new { f.Id }).Select(f => f.Id).First();
                int end   = db_post.Table <WholePost>().OrderByDescending(f => f.Id).Take(1).Select(f => new { f.Id }).Select(f => f.Id).First();

                // Pass 1: word -> number of posts containing the word.
                int print_counter = 0;
                var dic           = new Dictionary <string, int>();

                for (int i = start; ; i += step, print_counter += step)
                {
                    List <WholePost> list = db_post.Table <WholePost>().BetweenInt(f => f.Id, i, i + step, BetweenBoundaries.FromInclusiveToExclusive).SelectEntity();
                    if (list.Count == 0)
                    {
                        // An empty Id window is only the end of the data once we are past the highest Id.
                        if (i < end)
                        {
                            continue;
                        }
                        break;
                    }
                    if (print_counter % 50000 == 0)
                    {
                        Console.WriteLine("phase 1: " + print_counter);
                    }
                    foreach (var p in list)
                    {
                        // Each post contributes at most 1 per word, hence Distinct().
                        foreach (var word in TokenizeForTfIdf(p.Text).Distinct())
                        {
                            int seen;
                            dic.TryGetValue(word, out seen); // seen stays 0 for a new word
                            dic[word] = seen + 1;
                        }
                    }
                }

                // Drop rare words.
                dic = dic.Where(f => f.Value > 40).ToDictionary(f => f.Key, f => f.Value);

                // Pass 2: calculate tf-idf.
                var saved_words = new HashSet <string>();
                int total_docs  = db_post.Table <WholePost>().Count();

                print_counter = 0;
                int saved_count     = 0;
                int not_saved_count = 0;

                for (int i = start; ; i += step, print_counter += step)
                {
                    List <WholePost> list = db_post.Table <WholePost>().BetweenInt(f => f.Id, i, i + step, BetweenBoundaries.FromInclusiveToExclusive).SelectEntity();
                    if (list.Count == 0)
                    {
                        if (i < end)
                        {
                            continue;
                        }
                        break;
                    }
                    if (print_counter % 50000 == 0)
                    {
                        Console.WriteLine("phase 2: " + print_counter);
                    }
                    var saved = new List <WordTfIdf>();
                    foreach (var p in list)
                    {
                        foreach (var w in TokenizeForTfIdf(p.Text).GroupBy(f => f))
                        {
                            int wdocs;
                            if (!dic.TryGetValue(w.Key, out wdocs))
                            {
                                continue;
                            }

                            // Term frequency and inverse document frequency are both used raw
                            // below e and switched to e + ln(x) above it.
                            int    occurrences = w.Count();
                            double tf          = occurrences < Math.E ? occurrences : (Math.E + Math.Log(occurrences));
                            double tmp         = total_docs / (double)wdocs;
                            double idf         = tmp < Math.E ? tmp : (Math.E + Math.Log(tmp));
                            var    tfidf       = new WordTfIdf()
                            {
                                QuestionId = p.Id,
                                Word       = w.Key,
                                TfIdf      = tf * idf
                            };
                            if (tfidf.TfIdf > 17)
                            {
                                saved.Add(tfidf);
                                if (saved.Count == 5000)
                                {
                                    db_words.Table <WordTfIdf>().SaveBatch(saved);
                                    saved = new List <WordTfIdf>();
                                }
                                saved_words.Add(tfidf.Word);
                                saved_count++;
                            }
                            else
                            {
                                not_saved_count++;
                            }
                        }
                    }
                    // Flush whatever is left from this Id window.
                    db_words.Table <WordTfIdf>().SaveBatch(saved);
                }

                // Pass 3: convenient representation.
                var saved_data = new List <WordTfIdfData>();

                foreach (var w in saved_words)
                {
                    var l    = db_words.Table <WordTfIdf>().Where(f => f.Word == w).SelectEntity();
                    var data = Utils.TfIdfToData(l);
                    var nd   = new WordTfIdfData()
                    {
                        Word = w,
                        Data = data
                    };
                    saved_data.Add(nd);
                    if (saved_data.Count == 50)
                    {
                        db_words.Table <WordTfIdfData>().SaveBatch(saved_data);
                        saved_data = new List <WordTfIdfData>();
                    }
                }
                db_words.Table <WordTfIdfData>().SaveBatch(saved_data);

                Console.WriteLine("Done. Saved: " + saved_count + " not saved: " + not_saved_count);
            }
            finally
            {
                // Release both databases on success and on failure alike.
                db_words.Dispose();
                if (db_post != null)
                {
                    db_post.Dispose();
                }
            }
        }

        /// <summary>
        /// Splits post text on spaces and returns the normalized words accepted for TF-IDF,
        /// in order of appearance and with repetitions preserved.
        /// </summary>
        /// <param name="text">Raw post text; null or empty yields an empty list.</param>
        /// <returns>
        /// Lower-cased words with surrounding punctuation trimmed, excluding empty results, stop
        /// words, words longer than 10 characters, and words with too many characters listed in
        /// <c>NonReadable</c> (more than 3, or more than 30% of a word longer than 4 characters).
        /// </returns>
        /// <remarks>
        /// The second pass used to allow up to 13 characters, but it only keeps words already
        /// present in the dictionary built by the first pass (limit 10), so a single limit of 10
        /// produces the same output.
        /// </remarks>
        private static List <string> TokenizeForTfIdf(string text)
        {
            var words = new List <string>();
            if (string.IsNullOrEmpty(text))
            {
                return words;
            }

            char[] separators = " ".ToCharArray();
            char[] trimmed    = " .,?()!:;'\"[]".ToCharArray();

            foreach (var word in text.Split(separators, StringSplitOptions.RemoveEmptyEntries))
            {
                var w = word.ToLower().Trim(trimmed);
                if (string.IsNullOrEmpty(w) || StopWords.IsStopWord(w) || w.Length > 10)
                {
                    continue;
                }

                int nonReadable = w.Count(f => NonReadable.Contains(f));
                if (nonReadable > 3 || (nonReadable / (double)w.Length > 0.3 && w.Length > 4))
                {
                    continue;
                }
                words.Add(w);
            }
            return words;
        }
Beispiel #3
0
        /// <summary>
        /// Runs a title search or a main search for the query carried in a JSON-serialized
        /// <c>SearchData</c> and returns the populated <c>SearchData</c> as JSON.
        /// </summary>
        /// <param name="data_json">JSON text that is deserialized into a <c>SearchData</c>.</param>
        /// <returns>
        /// The serialized <c>SearchData</c>. <c>Query</c> is rewritten to the distinct
        /// non-stop words joined by single spaces. If no words remain, <c>Links</c> is an empty
        /// list and no search is run; otherwise <c>Links</c> holds the search results and
        /// <c>NodeTimeInMs</c> the elapsed time of the search and result mapping.
        /// </returns>
        public string SearchNode(string data_json)
        {
            var data  = JsonConvert.DeserializeObject <SearchData>(data_json);

            // A missing query is treated like an empty one instead of failing on Trim().
            var words = (data.Query ?? string.Empty).Trim().Split(" ".ToCharArray(), StringSplitOptions.RemoveEmptyEntries).ToList();

            var cw = words.Where(w => !StopWords.IsStopWord(w)).Distinct().ToArray();

            // string.Join returns "" for an empty array; the previous seedless Aggregate threw
            // on an empty sequence, which made the empty-query branch below unreachable.
            data.Query = string.Join(" ", cw);
            if (string.IsNullOrEmpty(data.Query))
            {
                data.Links = new List <Models.Result>();
                return(JsonConvert.SerializeObject(data));
            }

            Stopwatch sp = Stopwatch.StartNew();
            if (data.TitleSearch)
            {
                // The full word list (stop words included) is passed to GetWords, as before.
                data.Links = SearchApi.SearchTitles(data.Query)
                             .Select(f => new WebMvc.Models.Result()
                {
                    IsMeta      = true,
                    Description = GetWords(f.Fragment, words),
                    Title       = GetWords(f.Title, words),
                    Url         = "http://stackoverflow.com/questions/" + f.Id,
                    Id          = f.Id,
                    Score       = f.Score
                })
                             .ToList();
            }
            else
            {
                data.Links = SearchApi.SearchMain(data.Query)
                             .Select(z => new WebMvc.Models.Result()
                {
                    IsMeta      = false,
                    Description = GetWords(z.Fragment, words),
                    Title       = GetWords(z.Title, words),
                    Score       = z.Score,
                    Url         = "http://stackoverflow.com/questions/" + z.Id,
                    Id          = z.Id
                })
                             .ToList();
            }
            sp.Stop();
            data.NodeTimeInMs = sp.ElapsedMilliseconds;

            return(JsonConvert.SerializeObject(data));
        }