/// <summary>
/// Manual test driver for duplicate detection: walks backwards from the current
/// local time in one-hour windows (at most 3 * 24 of them), loads each window via
/// <c>LoadPeriod_Mongo</c> and feeds every loaded item to a fresh <c>DetectorFacade</c>.
/// </summary>
/// <remarks>
/// Progress goes to the console; a failure while testing an item is reported through
/// <c>Logger.Error</c> and stops the outer loop once the current window has been processed.
/// </remarks>
public static void StartTest()
{
    // 3 days of hourly windows.
    const int maxWindows = 3 * 24;
    TimeSpan windowLength = new TimeSpan(1, 0, 0);

    DetectorFacade detector = new DetectorFacade(new Parameters());
    DateTime windowEnd = DateTime.Now;
    int testedCount = 0;
    int duplicateCount = 0;
    int windowIndex = 0;
    bool finished = false;

    while (!finished)
    {
        // Print the zero-based window number first, then apply the cap to the incremented value.
        Console.WriteLine(windowIndex);
        windowIndex++;
        if (windowIndex > maxWindows)
        {
            break;
        }

        DateTime windowStart = windowEnd - windowLength;
        List<ItemToDuplication> batch = new List<ItemToDuplication>();

        Console.WriteLine("Fetch time: from {0} to {1}", windowStart, windowEnd);
        // Presumably fills 'batch' with the items of [windowStart, windowEnd] - confirm against LoadPeriod_Mongo.
        LoadPeriod_Mongo(windowStart, windowEnd, batch);
        Console.WriteLine("fetch finished. Now inserting...");

        // Index loop on purpose: slots are overwritten while iterating.
        for (int index = 0; index < batch.Count; index++)
        {
            try
            {
                // A non-null result is counted as a detected duplicate.
                string duplicateOf = detector.TestAndTryAdd(batch[index]);
                if (duplicateOf != null)
                {
                    duplicateCount++;
                }

                if (++testedCount % 1000 == 0)
                {
                    Console.WriteLine("Have tested {0} items. Duplication: {1}", testedCount, duplicateCount);
                }

                // Release the slot once the item has been handled (presumably so it can be collected early).
                batch[index] = null;
            }
            catch (Exception e)
            {
                // NOTE(review): only the outer loop is stopped; the remaining items of this
                // window are still attempted. Confirm whether that is intended.
                finished = true;
                string report = string.Format("DetectorFacade AddItem Exp:{0}\n{1}\nTotal {2} Articles", e.Message, e.StackTrace, testedCount);
                Logger.Error(report);
            }
        }

        Console.WriteLine("insert finished");

        // The next window ends where this one started.
        windowEnd = windowStart;
        Thread.Sleep(1000);
    }
}
/* Comparison functions based on string distance between article titles

/// <summary>
/// Initialize: read in all article titles
/// </summary>
void init_TitleCompare()
{
    PureTitles.Clear();

    // For the titles of each of the past nBackwardDays days
    for (int Day = nBackwardDays; Day >= 0; Day--)
    {
        //foreach (string Line in File.ReadLines(FN, Encoding.UTF8))
        //    if (!string.IsNullOrEmpty(Line))
        //    {
        //        string DupID = Line.Substring(0, 32);
        //        string PureTitle = Line.Substring(33);
        //        if (!PureTitles.ContainsKey(PureTitle))
        //            PureTitles.Add(PureTitle, DupID);
        //    }
    }
    LastInitTime = DateTime.Now;
}

/// <summary>
/// Look up reposts (duplicates)
/// </summary>
/// <param name="CleanTitle"></param>
/// <returns>null or the DuplicationID</returns>
string IsDup_TitleCompare(string CleanTitle, string ItemID)
{
    string PureTitle = GetPureTitle(CleanTitle, true);
    if (string.IsNullOrEmpty(PureTitle)) return null;

    // Match similarity against every title within the examined range
    foreach (string OldTitle in PureTitles.Keys)
    {
        double Sim = StringCompare.NeedlemanWunsch.Sim(PureTitle, OldTitle);
        // If the title similarity meets the threshold
        if (Sim >= SimThreshold)
            // If it equals ItemID, this item is most likely being checked a second time
            // because the previous run failed; return null
            if (PureTitles[OldTitle] == ItemID)
                return null;
            else
                // Matched
                return PureTitles[OldTitle];
    }

    // Not present; add it to the list
    PureTitles.TryAdd(PureTitle, ItemID);
    return null;
}

/// <summary>
/// Look up reposts, based on title string-distance comparison
/// </summary>
/// <param name="CleanTitle"></param>
/// <returns>null or the DuplicationID</returns>
public static string IsDuplication_TitleCompare(string CleanTitle, string ItemID)
{
    return Instance.IsDup_TitleCompare(CleanTitle, ItemID);
}
*/

/// <summary>
/// Prepares fingerprint-based duplicate detection by assigning a new
/// <c>DetectorFacade</c>, built from a default-constructed <c>Parameters</c>,
/// to <c>FingerDetector</c>.
/// </summary>
/// <remarks>
/// The preloading of recent items (count query, per-day load, timing and logging)
/// is disabled; it is kept below as commented-out code.
/// </remarks>
private void init_FingerPrint()
{
    //Console.WriteLine("============ Duplication ============");
    //Console.WriteLine(string.Format("[{0}]Strart loading {1} days' items for duplication. \nAll threads blocked.", DateTime.Now.ToShortTimeString(), nBackwardDays));
    //DateTime startTime = DateTime.Now;

    // Get the total number of Items within the last N days
    //var query = Query.GT("PubDate", DateTime.Now.AddDays(-nBackwardDays));
    //int count = -1;
    //try
    //{
    //    count = MongoItemAccess.Items.Count(query);
    //}
    //catch (Exception e)
    //{
    //    Logger.Error(e.Message + "\n" + e.StackTrace);
    //}

    // Create the fingerprint detector
    Parameters detectorSettings = new Parameters();
    FingerDetector = new DetectorFacade(detectorSettings);

    // Load day by day
    //if (count > 0)
    //{
    //    for (int i = -nBackwardDays; i <= 0; i++)
    //    {
    //        Console.Write(DateTime.Now.AddDays(i).ToShortDateString()+" : ");
    //        LoadDaily_Mongo(DateTime.Now.AddDays(i), FingerDetector);
    //        Console.WriteLine();
    //    }
    //}
    //LastInitTime = DateTime.Now;
    //Console.WriteLine(string.Format("Loaded {0} items for {1:F1} mins.", FingerDetector.GetItemCount(), (DateTime.Now - startTime).TotalMinutes));
    //Logger.Info(string.Format("Loaded {0} items for dup for {1:F1} mins.", FingerDetector.GetItemCount(), (DateTime.Now - startTime).TotalMinutes));
}