/// <summary>
/// Manual smoke test: registers the article stored in "1.txt" with
/// <c>myDetector</c>, then checks whether the article stored in "2.txt"
/// is reported as a copy and prints the outcome to the console.
/// </summary>
/// <param name="type">Media type assigned to both articles.</param>
private static void SimpleTest(Enums.MediaType type)
{
    // Build the source item and hand it to the detector.
    string[] sourceLines = File.ReadAllLines(@"1.txt");
    ItemToDuplication candidate = new ItemToDuplication(new Item());
    FillItemFromArticleLines(candidate, "source", type, sourceLines);
    myDetector.TestAndTryAdd(candidate);

    // Build the test item. The same ItemToDuplication instance is reused
    // and overwritten rather than creating a new one.
    string[] testLines = File.ReadAllLines(@"2.txt");
    FillItemFromArticleLines(candidate, "test", type, testLines);

    // Run the duplicate check and report the result.
    string matchedId = myDetector.TestAndTryAdd(candidate);
    Console.WriteLine(matchedId != null ? "Copied Item: " + matchedId : "Not copied.");
}

/// <summary>
/// Fills <paramref name="target"/> from article lines: the first line becomes the
/// title, the remaining lines are concatenated (without separators) into the text,
/// and the publication date is set to the current local time.
/// </summary>
private static void FillItemFromArticleLines(ItemToDuplication target, string itemId, Enums.MediaType type, string[] articleLines)
{
    target.ItemID = itemId;
    target.MediaType = type;
    target.SpliteTitle = articleLines[0];
    target.PubDate = DateTime.Now;
    target.SpliteText = string.Join(string.Empty, articleLines, 1, articleLines.Length - 1);
}
/// <summary>
/// Tests whether <paramref name="item"/> is a copy of an article already held by the
/// detector for its media type. When it is not a copy and its publication date lies
/// within <c>DetectPeriod</c> of the current time, the item's fingerprints are
/// registered with that detector.
/// </summary>
/// <param name="item">Article to test. Items with neither text nor title are skipped.</param>
/// <param name="TITLE_WEIGHT">Title weight passed to the detector; a negative value selects the detector's configured value.</param>
/// <param name="THRESHOLD">Threshold passed to the detector; a negative value selects the detector's configured value.</param>
/// <returns>
/// The ID reported by <c>Detector.IsItemCopied</c> through its out parameter, or
/// <c>null</c> when the item has no content. Exceptions raised during detection or
/// registration are logged and not rethrown.
/// </returns>
public string TestAndTryAdd(ItemToDuplication item, double TITLE_WEIGHT = -1, double THRESHOLD = -1)
{
    if (string.IsNullOrEmpty(item.SpliteText) && string.IsNullOrEmpty(item.SpliteTitle))
        return null;

    string DupItemID = null;
    ReaderWriterLockSlim targetLock = GetLock(item.MediaType);

    // Tracks whether this thread currently owns the read lock, so the finally block
    // never calls ExitReadLock on a lock it does not hold (which would throw
    // SynchronizationLockException and mask the original error).
    bool readLockHeld = false;
    try
    {
        targetLock.EnterReadLock();
        readLockHeld = true;

        Detector curDetector = GetCurDetector(item.MediaType);
        if (TITLE_WEIGHT < 0)
            TITLE_WEIGHT = curDetector.contextParameters.TITLE_WEIGHT;
        if (THRESHOLD < 0)
            THRESHOLD = curDetector.contextParameters.THRESHOLD;

        int[] sentenceTitle, sentenceContext, kwordsTitle, kwordsContext;
        curDetector.GetFingerPrints(item, out sentenceTitle, out sentenceContext, out kwordsTitle, out kwordsContext);

        bool isCopied = curDetector.IsItemCopied(sentenceTitle, sentenceContext, kwordsTitle, kwordsContext, TITLE_WEIGHT, THRESHOLD, out DupItemID);
        if (!isCopied && (DateTime.Now - item.PubDate) < DetectPeriod)
        {
            // The read lock is released before the write lock is requested.
            // NOTE(review): another thread may register the same article in the gap
            // between the two locks; this matches the original behavior.
            targetLock.ExitReadLock();
            readLockHeld = false;

            EnterWriteLock(targetLock);
            try
            {
                curDetector.RegisterArticle(item, sentenceTitle, sentenceContext, kwordsTitle, kwordsContext);
            }
            finally
            {
                // Always release the write lock, even when registration fails.
                ExitWriteLock(targetLock);
            }
        }
    }
    catch (Exception e)
    {
        Logger.Error(string.Format("DetectorFacade IsItemCopied Exp:{0}\n{1}", e.Message, e.StackTrace));
    }
    finally
    {
        if (readLockHeld)
            targetLock.ExitReadLock();
    }
    return DupItemID;
}
/// <summary>
/// Gets all articles held in memory that are similar to the given article.
/// </summary>
/// <param name="item">Article to compare against the detector for its media type.</param>
/// <param name="TITLE_WEIGHT">Title weight passed to the detector; a negative value selects the detector's configured value.</param>
/// <param name="THRESHOLD">Threshold passed to the detector; a negative value selects the detector's configured value.</param>
/// <returns>
/// The array returned by <c>Detector.GetSimilarItems</c>, or <c>null</c> when that
/// call returns null or an exception was raised (the exception is logged, not rethrown).
/// </returns>
public string[] GetSimilarItemIDs(ItemToDuplication item, double TITLE_WEIGHT = -1, double THRESHOLD = -1)
{
    ReaderWriterLockSlim targetLock = GetLock(item.MediaType);
    string[] result = null;

    // Only release the read lock if it was actually acquired; otherwise ExitReadLock
    // would throw from the finally block and hide the original failure.
    bool readLockHeld = false;
    try
    {
        targetLock.EnterReadLock();
        readLockHeld = true;

        Detector curDetector = GetCurDetector(item.MediaType);
        if (TITLE_WEIGHT < 0)
            TITLE_WEIGHT = curDetector.contextParameters.TITLE_WEIGHT;
        if (THRESHOLD < 0)
            THRESHOLD = curDetector.contextParameters.THRESHOLD;

        result = curDetector.GetSimilarItems(item, TITLE_WEIGHT, THRESHOLD);
    }
    catch (Exception e)
    {
        // Placeholder indexes must be {0} and {1}: only two arguments are supplied,
        // and an out-of-range index makes string.Format throw FormatException.
        Logger.Error(string.Format("DetectorFacade GetSimilarItemIDs Exp:{0}\n{1}", e.Message, e.StackTrace));
    }
    finally
    {
        if (readLockHeld)
            targetLock.ExitReadLock();
    }
    return result;
}
/// <summary>
/// Registers the supplied fingerprints with the sentence holder and the k-words
/// holder, keyed by the item's <c>DuplicationID</c> when it is set and by its
/// <c>ItemID</c> otherwise.
/// </summary>
/// <param name="item">Article supplying the ID and publication date.</param>
/// <param name="sentenceTitle">Sentence fingerprint of the title; may be null.</param>
/// <param name="sentenceContext">Sentence fingerprint of the text; may be null.</param>
/// <param name="kwordsTitle">K-words fingerprint of the title; may be null.</param>
/// <param name="kwordsContext">K-words fingerprint of the text; may be null.</param>
public void RegisterArticle(ItemToDuplication item, int[] sentenceTitle, int[] sentenceContext, int[] kwordsTitle, int[] kwordsContext)
{
    // Nothing to register when every fingerprint is missing.
    bool noFingerPrints = sentenceTitle == null
        && sentenceContext == null
        && kwordsTitle == null
        && kwordsContext == null;
    if (noFingerPrints)
    {
        return;
    }

    string registrationId = item.DuplicationID ?? item.ItemID;

    sentenceHolder.RegisterArticleFingerPrint(sentenceTitle, sentenceContext, registrationId, item.PubDate);
    kwordsHolder.RegisterArticleFingerPrint(kwordsTitle, kwordsContext, registrationId, item.PubDate);
}
/// <summary>
/// Looks up articles similar to <paramref name="item"/> in both the sentence holder
/// and the k-words holder and merges the two result lists.
/// </summary>
/// <param name="item">Article to compare.</param>
/// <param name="TITLE_WEIGHT">Title weight forwarded to both holders.</param>
/// <param name="THRESHOLD">Threshold forwarded to both holders.</param>
/// <returns>
/// The result of <c>MergeArticleNames</c> over both lookups, or <c>null</c> when the
/// item has neither text nor title.
/// </returns>
public string[] GetSimilarItems(ItemToDuplication item, double TITLE_WEIGHT, double THRESHOLD)
{
    bool hasContent = !string.IsNullOrEmpty(item.SpliteText) || !string.IsNullOrEmpty(item.SpliteTitle);
    if (!hasContent)
    {
        return null;
    }

    int[] titleSentences;
    int[] textSentences;
    int[] titleKwords;
    int[] textKwords;
    GetFingerPrints(item, out titleSentences, out textSentences, out titleKwords, out textKwords);

    string[] sentenceMatches = sentenceHolder.GetSimilarArticles(titleSentences, textSentences, TITLE_WEIGHT, THRESHOLD);
    string[] kwordsMatches = kwordsHolder.GetSimilarArticles(titleKwords, textKwords, TITLE_WEIGHT, THRESHOLD);

    return MergeArticleNames(sentenceMatches, kwordsMatches);
}
/// <summary>
/// Computes the sentence and k-words fingerprints for the title and the text of
/// <paramref name="item"/>. A fingerprint is left <c>null</c> when the corresponding
/// part (title or text) is null or empty.
/// </summary>
/// <param name="item">Article whose <c>SpliteTitle</c> and <c>SpliteText</c> are fingerprinted.</param>
/// <param name="sentenceTitle">Sentence fingerprint of the title, or null.</param>
/// <param name="sentenceContext">Sentence fingerprint of the text, or null.</param>
/// <param name="kwordsTitle">K-words fingerprint of the title, or null.</param>
/// <param name="kwordsContext">K-words fingerprint of the text, or null.</param>
public void GetFingerPrints(ItemToDuplication item, out int[] sentenceTitle, out int[] sentenceContext, out int[] kwordsTitle, out int[] kwordsContext)
{
    sentenceTitle = sentenceContext = kwordsTitle = kwordsContext = null;

    bool hasTitle = !string.IsNullOrEmpty(item.SpliteTitle);
    bool hasText = !string.IsNullOrEmpty(item.SpliteText);

    // Titles normally get their own parameter set sharing the context weight and threshold.
    // NOTE(review): the meaning of the two leading 1s is defined by Parameters' constructor — confirm there.
    Parameters titleParameters = new Parameters(1, 1, contextParameters.TITLE_WEIGHT, contextParameters.THRESHOLD);

    // An item without text whose title exceeds MAX_TITLE_LENGTH is fingerprinted with
    // the context parameters instead. The title is checked for null/empty first so a
    // completely empty item no longer throws NullReferenceException on .Length.
    if (!hasText && hasTitle && item.SpliteTitle.Length > Parameters.MAX_TITLE_LENGTH)
        titleParameters = contextParameters;

    if (hasTitle)
    {
        // NOTE(review): the trailing bool presumably marks title input — confirm in FingerPrintBuilder.
        sentenceTitle = FingerPrintBuilder.GetSentenceFingerPrint(item.SpliteTitle, titleParameters, true);
        kwordsTitle = FingerPrintBuilder.GetK_WordsFingerPrint(item.SpliteTitle, titleParameters, true);
    }

    if (hasText)
    {
        sentenceContext = FingerPrintBuilder.GetSentenceFingerPrint(item.SpliteText, contextParameters, false);
        kwordsContext = FingerPrintBuilder.GetK_WordsFingerPrint(item.SpliteText, contextParameters, false);
    }
}
// Legacy MongoDB loaders, kept for reference only (disabled code).
// (The original empty XML doc for LoadDaily_Mongo had a single <param name="Date"> entry.)
//static void LoadDaily_Mongo(DateTime Date, DetectorFacade detector)
//{
//    //left-closed, right-open interval
//    //QueryConditionList low = Query.GTE("PubDate", Date.Date);
//    //QueryConditionList high = Query.LT("PubDate", Date.Date.AddDays(1));
//    //var query = Query.And(low, high);
//    //it already fails at this point
//    //int count = MongoItemAccess.Items.Count(query);
//    //a total count > 4000 has to be split into further batches
//    //const int MaxSetSize = 4000;
//    //if (count < MaxSetSize)
//    //    LoadPeriod_Mongo(Date.Date, Date.Date.AddDays(1), Items);
//    //else
//    {
//        int HourStep = 1; //24 / ((count + MaxSetSize - 1) / MaxSetSize);
//        int Hour = 0;
//        while (Hour < 24)
//        {
//            LoadPeriod_Mongo(Date.Date.AddHours(Hour), Date.Date.AddHours(Hour + HourStep > 24 ? 24 : Hour + HourStep), detector);
//            Hour += HourStep;
//            Console.Write('*');
//        }
//    }
//}
//static void LoadPeriod_Mongo(DateTime lowTime, DateTime highTime, DetectorFacade detector)
//{
//    //left-closed, right-open interval
//    QueryConditionList low = Query.GTE("FetchTime", lowTime);
//    QueryConditionList high = Query.LT("FetchTime", highTime);
//    var query = Query.And(low, high);
//    var sort = SortBy.Ascending("FetchTime");
//    //int count = MongoItemAccess.Items.Count(query);
//    //const int packsize = 1000; //number of records per read
//    const int sleepMS = 200; //pause between two reads
//    //int skip = 0;
//    //while (skip < count)
//    try
//    {
//        var result = MongoItemAccess.Items.Find(query)./*SetSortOrder(sort).*/SetFields("ItemID", "PubDate", "DuplicationID", "SpliteTitle", "SpliteText", "ProsdDuplication");
//        //.Take(packsize).Skip(skip);
//        foreach (var item in result)
//            detector.AddItem(new ItemToDuplication(item));
//        //skip += packsize;
//    }
//    catch (Exception e)
//    {
//        Logger.Error(string.Format(@"Duplication读取文章失败[{0}-{1}]:{2}", lowTime, highTime, e.Message));
//        //break;
//    }
//    Thread.Sleep(sleepMS);
//}

/// <summary>
/// (Instance method) Duplicate detection based on text fingerprints.
/// </summary>
/// <param name="Item">Article to test through <c>FingerDetector</c>.</param>
/// <returns>
/// The value returned by <c>FingerDetector.TestAndTryAdd</c>, or <c>null</c> when
/// that call throws (the exception is logged).
/// </returns>
string IsDuplication_FingerPrint(ItemToDuplication Item)
{
    string duplicateId;
    try
    {
        // Tests the item and adds it when no match is found, so the separate
        // AddItem call stays disabled:
        //if (!string.IsNullOrEmpty(r))
        //    FingerDetector.AddItem(Item);
        duplicateId = FingerDetector.TestAndTryAdd(Item);
    }
    catch (Exception e)
    {
        Logger.Error(string.Format("Dup Err:{0} ItemID:{1}\n{2}", e.Message, Item.ItemID, e.StackTrace));
        duplicateId = null;
    }
    return duplicateId;
}