/// <summary>
/// Manual test driver for the duplication detector. Starting from the current
/// time it walks backwards in one-hour windows, loads each window's items via
/// <c>LoadPeriod_Mongo</c> and feeds them one by one to a fresh
/// <c>DetectorFacade</c>, printing progress to the console.
/// </summary>
/// <remarks>
/// The loop ends after 3 * 24 windows, or after the window in which
/// <c>TestAndTryAdd</c> threw an exception. An exception is logged and the
/// remaining items of the current window are still attempted.
/// </remarks>
public static void StartTest()
 {
     DateTime windowEnd = DateTime.Now;
     DetectorFacade facade = new DetectorFacade(new Parameters());
     TimeSpan windowLength = new TimeSpan(1, 0, 0);
     int tested = 0, round = 0, duplicates = 0;
     bool failed = false;

     while (!failed)
     {
         // Print the zero-based round number, then stop once 3 * 24 rounds have run.
         Console.WriteLine(round);
         round++;
         if (round > 3 * 24)
         {
             break;
         }

         // Load the items of the hour immediately preceding windowEnd.
         DateTime windowStart = windowEnd.Subtract(windowLength);
         List<ItemToDuplication> batch = new List<ItemToDuplication>();
         Console.WriteLine("Fetch time: from {0} to {1}", windowStart, windowEnd);
         LoadPeriod_Mongo(windowStart, windowEnd, batch);
         Console.WriteLine("fetch finished. Now inserting...");

         for (int index = 0; index < batch.Count; index++)
         {
             try
             {
                 // A non-null result is counted as a duplicate.
                 string verdict = facade.TestAndTryAdd(batch[index]);
                 if (verdict != null)
                 {
                     duplicates++;
                 }

                 tested++;
                 if (tested % 1000 == 0)
                 {
                     Console.WriteLine("Have tested {0} items. Duplication: {1}", tested, duplicates);
                 }

                 // Drop the list's reference to the processed item.
                 batch[index] = null;
             }
             catch (Exception ex)
             {
                 // Flag the outer loop to stop after this window; the inner loop keeps going.
                 failed = true;
                 Logger.Error(string.Format("DetectorFacade AddItem Exp:{0}\n{1}\nTotal {2} Articles", ex.Message, ex.StackTrace, tested));
             }
         }

         Console.WriteLine("insert finished");
         windowEnd = windowStart;
         Thread.Sleep(1000);
     }
 }
Ejemplo n.º 2
0
        /* Comparison functions based on the string distance between article titles

        /// <summary>
        /// Initialize: read in all article titles
        /// </summary>
        void init_TitleCompare()
        {
            PureTitles.Clear();

            // For each day's titles over the past nBackwardDays days
            for (int Day = nBackwardDays; Day >= 0; Day--)
            {
                //foreach (string Line in File.ReadLines(FN, Encoding.UTF8))
                //    if (!string.IsNullOrEmpty(Line))
                //    {
                //        string DupID = Line.Substring(0, 32);
                //        string PureTitle = Line.Substring(33);

                //        if (!PureTitles.ContainsKey(PureTitle))
                //            PureTitles.Add(PureTitle, DupID);
                //    }
            }

            LastInitTime = DateTime.Now;
        }

        /// <summary>
        /// Look up reposts (duplicates)
        /// </summary>
        /// <param name="CleanTitle"></param>
        /// <returns>null, or the DuplicationID</returns>
        string IsDup_TitleCompare(string CleanTitle, string ItemID)
        {
            string PureTitle = GetPureTitle(CleanTitle, true);
            if (string.IsNullOrEmpty(PureTitle)) return null;

            // Run similarity matching against every title within the examined range
            foreach (string OldTitle in PureTitles.Keys)
            {
                double Sim = StringCompare.NeedlemanWunsch.Sim(PureTitle, OldTitle);
                // If the title similarity meets the threshold
                if (Sim >= SimThreshold)
                    // If it matches ItemID, this item is most likely being re-checked because the previous run failed; return null
                    if (PureTitles[OldTitle] == ItemID)
                        return null;
                    else
                        // Matched
                        return PureTitles[OldTitle];
            }

            // Not present; add it to the list
            PureTitles.TryAdd(PureTitle, ItemID);
            return null;
        }

        /// <summary>
        /// Look up reposts, based on title string-distance comparison
        /// </summary>
        /// <param name="CleanTitle"></param>
        /// <returns>null, or the DuplicationID</returns>
        public static string IsDuplication_TitleCompare(string CleanTitle, string ItemID)
        {
            return Instance.IsDup_TitleCompare(CleanTitle, ItemID);
        }

         */
        /// <summary>
        /// Creates a new <c>DetectorFacade</c> from a default-constructed
        /// <c>Parameters</c> instance and assigns it to <c>FingerDetector</c>.
        /// </summary>
        /// <remarks>
        /// The code that counted and loaded the last <c>nBackwardDays</c> days of
        /// items from MongoDB is disabled (kept below as comments), so this method
        /// itself loads no items and does not update <c>LastInitTime</c>.
        /// </remarks>
        private void init_FingerPrint()
        {
            //Console.WriteLine("============ Duplication ============");
            //Console.WriteLine(string.Format("[{0}]Strart loading {1} days' items for duplication. All threads blocked.", DateTime.Now.ToShortTimeString(), nBackwardDays));
            //DateTime startTime = DateTime.Now;

            // (disabled) Get the total number of items within the last N days
            //var query = Query.GT("PubDate", DateTime.Now.AddDays(-nBackwardDays));
            //int count = -1;
            //try
            //{
            //    count = MongoItemAccess.Items.Count(query);
            //}
            //catch (Exception e)
            //{
            //    Logger.Error(e.Message + "\n" + e.StackTrace);
            //}

            // Create the fingerprint detector
            Parameters detectorParameters = new Parameters();
            FingerDetector = new DetectorFacade(detectorParameters);

            // (disabled) Load items day by day
            //if (count > 0)
            //{
            //    for (int i = -nBackwardDays; i <= 0; i++)
            //    {
            //        Console.Write(DateTime.Now.AddDays(i).ToShortDateString()+" : ");
            //        LoadDaily_Mongo(DateTime.Now.AddDays(i), FingerDetector);
            //        Console.WriteLine();
            //    }
            //}

            //LastInitTime = DateTime.Now;
            //Console.WriteLine(string.Format("Loaded {0} items for {1:F1} mins.", FingerDetector.GetItemCount(), (DateTime.Now - startTime).TotalMinutes));
            //Logger.Info(string.Format("Loaded {0} items for dup for {1:F1} mins.", FingerDetector.GetItemCount(), (DateTime.Now - startTime).TotalMinutes));
        }