/// <summary>
/// Creates a facade with empty per-media-type lock and detector maps and remembers
/// the supplied default context parameters.
/// </summary>
/// <remarks>
/// The original author's note says these defaults are meant for every media type
/// except Unknown; that rule is not enforced in this constructor — confirm at the
/// call sites that consume <c>defaultContextParameters</c>.
/// </remarks>
/// <param name="defaultContextParameters">Default parameters stored for later use.</param>
public DetectorFacade(Parameters defaultContextParameters)
{
    this.defaultContextParameters = defaultContextParameters;

    // Both maps are keyed by media type and start out empty.
    detectorOFMedia = new ConcurrentDictionary<Enums.MediaType, Detector>();
    detectorLock = new ConcurrentDictionary<Enums.MediaType, ReaderWriterLockSlim>();

    // The detection period is a whole number of days taken from Duplication.nBackwardDays.
    var backwardDays = Duplication.nBackwardDays;
    DetectPeriod = new TimeSpan(backwardDays, 0, 0, 0);
}
/// <summary>
/// Reduces a sequence of coarse-grained hash codes to a fingerprint by keeping the
/// minimum hash of every sliding window of <c>parameters.G - parameters.K + 1</c>
/// consecutive values.
/// </summary>
/// <param name="hashCodes">Coarse-grained hash codes to select from.</param>
/// <param name="parameters">Supplies <c>G</c> and <c>K</c>, which determine the window size.</param>
/// <param name="isTitle">
/// Currently has no effect on the result (both code paths were identical); kept so
/// existing callers continue to compile.
/// </param>
/// <returns>
/// One minimum per window position. When the input is shorter than one window the
/// result is a single element holding the overall minimum. A null or empty input
/// yields an empty array.
/// </returns>
private static int[] GetSimplifiedFingerPrint(int[] hashCodes, Parameters parameters, bool isTitle)
{
    // Nothing to select from: return an empty fingerprint instead of indexing hashCodes[0].
    if (hashCodes == null || hashCodes.Length == 0)
    {
        return new int[0];
    }

    // A window smaller than one element is meaningless (G < K); clamp it so the
    // deque below can never be drained completely.
    int windowSize = Math.Max(1, parameters.G - parameters.K + 1);
    int resultLength = Math.Max(1, hashCodes.Length - windowSize + 1);
    int[] result = new int[resultLength];

    // Monotonic deque of indices into hashCodes; the referenced values are strictly
    // increasing from head to tail, so the head is always the current window minimum.
    int[] candidates = new int[hashCodes.Length];
    int head = 0;
    int tail = -1;

    for (int i = 0; i < hashCodes.Length; ++i)
    {
        // Drop candidates that can never be a minimum again once hashCodes[i] is in the window.
        while (tail >= head && hashCodes[i] <= hashCodes[candidates[tail]])
        {
            tail--;
        }
        candidates[++tail] = i;

        // Evict indices that fell out of the window ending at i. Index i itself was
        // just pushed and is never evicted, so head cannot pass tail.
        while (candidates[head] <= i - windowSize)
        {
            head++;
        }

        int windowStart = i - windowSize + 1;
        if (windowStart >= 0)
        {
            result[windowStart] = hashCodes[candidates[head]];
        }
    }

    // Input shorter than one window: no full window was emitted above, so report
    // the minimum of everything that was seen.
    if (hashCodes.Length < windowSize)
    {
        result[0] = hashCodes[candidates[head]];
    }

    return result;
}
/// <summary>
/// Builds a fingerprint with one hash per word-ID list returned by
/// <c>WordList.GetWordIDs</c> (presumably one list per sentence — confirm in WordList).
/// </summary>
/// <param name="content">Text to fingerprint.</param>
/// <param name="parameters">Lists with fewer than <c>parameters.K</c> word IDs are skipped.</param>
/// <param name="isTitle">Not used by this method.</param>
/// <returns>The distinct hashes collected, or <c>null</c> when none were produced.</returns>
internal static int[] GetSentenceFingerPrint(string content, Parameters parameters, bool isTitle)
{
    HashSet<int> fingerPrints = new HashSet<int>();

    foreach (List<int> sentence in WordList.GetWordIDs(content))
    {
        if (sentence.Count >= parameters.K)
        {
            // Hash with a window as long as the whole list and keep the first hash.
            int[] sentenceHash = GetKarpRabinHashCode(sentence, sentence.Count);

            // HashSet.Add ignores values that are already present.
            fingerPrints.Add(sentenceHash[0]);
        }
    }

    return fingerPrints.Count > 0 ? fingerPrints.ToArray() : null;
}
/// <summary>
/// Builds a k-words fingerprint: every word-ID list returned by
/// <c>WordList.GetWordIDs</c> is hashed with a window of <c>parameters.K</c>, the
/// hashes are thinned by <c>GetSimplifiedFingerPrint</c>, and the distinct survivors
/// are collected.
/// </summary>
/// <param name="content">Text to fingerprint.</param>
/// <param name="parameters">Supplies <c>K</c>; also forwarded to <c>GetSimplifiedFingerPrint</c>.</param>
/// <param name="isTitle">Forwarded unchanged to <c>GetSimplifiedFingerPrint</c>.</param>
/// <returns>The distinct selected hashes, or <c>null</c> when none were produced.</returns>
internal static int[] GetK_WordsFingerPrint(string content, Parameters parameters, bool isTitle)
{
    HashSet<int> fingerPrints = new HashSet<int>();

    foreach (List<int> wordIds in WordList.GetWordIDs(content))
    {
        // Lists shorter than K are skipped.
        if (wordIds.Count >= parameters.K)
        {
            int[] rollingHashes = GetKarpRabinHashCode(wordIds, parameters.K);
            int[] selected = GetSimplifiedFingerPrint(rollingHashes, parameters, isTitle);

            foreach (int hash in selected)
            {
                // HashSet.Add ignores values that are already present.
                fingerPrints.Add(hash);
            }
        }
    }

    return fingerPrints.Count > 0 ? fingerPrints.ToArray() : null;
}
/// <summary>
/// Computes the sentence-level and k-words fingerprints for the title and the body
/// text of <paramref name="item"/>.
/// </summary>
/// <param name="item">Item whose <c>SpliteTitle</c> and <c>SpliteText</c> are fingerprinted.</param>
/// <param name="sentenceTitle">Sentence fingerprint of the title; <c>null</c> when the title is null or empty.</param>
/// <param name="sentenceContext">Sentence fingerprint of the text; <c>null</c> when the text is null or empty.</param>
/// <param name="kwordsTitle">K-words fingerprint of the title; <c>null</c> when the title is null or empty.</param>
/// <param name="kwordsContext">K-words fingerprint of the text; <c>null</c> when the text is null or empty.</param>
/// <remarks>
/// The fingerprint builders themselves may also return <c>null</c> when they produce no hashes.
/// </remarks>
public void GetFingerPrints(ItemToDuplication item, out int[] sentenceTitle, out int[] sentenceContext, out int[] kwordsTitle, out int[] kwordsContext)
{
    sentenceTitle = sentenceContext = kwordsTitle = kwordsContext = null;

    bool hasTitle = !string.IsNullOrEmpty(item.SpliteTitle);
    bool hasText = !string.IsNullOrEmpty(item.SpliteText);

    // Titles normally use their own parameters (the leading 1, 1 are presumably
    // K and G — confirm against the Parameters constructor).
    Parameters titleParameters = new Parameters(1, 1, contextParameters.TITLE_WEIGHT, contextParameters.THRESHOLD);

    // A long title on an item without body text is fingerprinted with the context
    // parameters instead. The hasTitle guard prevents dereferencing a null title
    // when the item has neither title nor text.
    if (hasTitle && !hasText && item.SpliteTitle.Length > Parameters.MAX_TITLE_LENGTH)
    {
        titleParameters = contextParameters;
    }

    if (hasTitle)
    {
        sentenceTitle = FingerPrintBuilder.GetSentenceFingerPrint(item.SpliteTitle, titleParameters, true);
        kwordsTitle = FingerPrintBuilder.GetK_WordsFingerPrint(item.SpliteTitle, titleParameters, true);
    }

    if (hasText)
    {
        sentenceContext = FingerPrintBuilder.GetSentenceFingerPrint(item.SpliteText, contextParameters, false);
        kwordsContext = FingerPrintBuilder.GetK_WordsFingerPrint(item.SpliteText, contextParameters, false);
    }
}
/// <summary>
/// Creates a detector that keeps the given context parameters and opens two
/// fingerprint holders on the same collection.
/// </summary>
/// <param name="_contextParameters">Parameters stored in <c>contextParameters</c>.</param>
/// <param name="collectionName">Collection name passed to both fingerprint holders.</param>
public Detector(Parameters _contextParameters, string collectionName)
{
    contextParameters = _contextParameters;

    // The second constructor argument distinguishes the two holders (0 for the
    // sentence holder, 2 for the k-words holder); its exact meaning is defined by
    // FingerPrintHolder — TODO confirm.
    sentenceHolder = new FingerPrintHolder(collectionName, 0);
    kwordsHolder = new FingerPrintHolder(collectionName, 2);
}
/// <summary>
/// Ensures a <c>Detector</c> is registered for the bucket that
/// <paramref name="mediaType"/> belongs to. Nothing happens when the bucket already
/// has a detector.
/// </summary>
/// <remarks>
/// Buckets: Weibo and SearchWeibo share "FingerPrint_Weibo", Forum and SearchForum
/// share "FingerPrint_Forum", and every other media type is registered under
/// Unknown with "FingerPrint_Other".
/// </remarks>
/// <param name="mediaType">Media type to register; normalized to its bucket first.</param>
/// <param name="contextParameters">Parameters handed to a newly created detector.</param>
private void RegisterMediaType(Enums.MediaType mediaType, Parameters contextParameters)
{
    Enums.MediaType bucket;
    string collectionName;

    if (mediaType == Enums.MediaType.Weibo || mediaType == Enums.MediaType.SearchWeibo)
    {
        bucket = Enums.MediaType.Weibo;
        collectionName = "FingerPrint_Weibo";
    }
    else if (mediaType == Enums.MediaType.Forum || mediaType == Enums.MediaType.SearchForum)
    {
        bucket = Enums.MediaType.Forum;
        collectionName = "FingerPrint_Forum";
    }
    else
    {
        bucket = Enums.MediaType.Unknown;
        collectionName = "FingerPrint_Other";
    }

    // Already registered: do not construct another detector.
    if (detectorOFMedia.ContainsKey(bucket))
    {
        return;
    }

    // TryAdd leaves an existing entry untouched if another caller registered first.
    detectorOFMedia.TryAdd(bucket, new Detector(contextParameters, collectionName));
}