/// <summary>
/// Loads the zip-hash listing "htih_pz.txt" from <paramref name="path"/>, adds its entries
/// to the current comparison set and then runs <c>CompareVPTree</c>.
/// </summary>
/// <remarks>
/// Every non-blank line must contain at least three '|'-separated fields: zip file,
/// inner path and an unsigned 64-bit hash. Lines that do not fit that shape are skipped
/// and reported in the log instead of throwing part-way through the load.
/// Previously loaded entries are not cleared by this method; <c>_hashSource</c> is
/// incremented per load and stamped on every entry read.
/// </remarks>
/// <param name="path">Folder expected to contain "htih_pz.txt".</param>
internal void loadHashFile(string path)
{
    _log.log(string.Format("Loading {0}", path));
    _pathmemory = path;

    var hashFile = Path.Combine(path, "htih_pz.txt");
    if (!File.Exists(hashFile))
    {
        MessageBox.Show("Folder has not been zip hashed!");
        return;
    }

    _hashSource++; // new file loaded

    int skipped = 0;
    using (var reader = new StreamReader(hashFile))
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            if (string.IsNullOrWhiteSpace(line))
            {
                continue;
            }

            var parts = line.Split('|');
            ulong hash;
            if (parts.Length < 3 ||
                !ulong.TryParse(parts[2],
                                System.Globalization.NumberStyles.Integer,
                                System.Globalization.CultureInfo.InvariantCulture,
                                out hash))
            {
                // Malformed record: keep loading the rest of the file.
                skipped++;
                continue;
            }

            var entry = new HashZipEntry
            {
                ZipFile = parts[0],
                InnerPath = parts[1],
                phash = hash,
                source = _hashSource
            };
            _toCompare.Add(entry);

            // Need a set of ZIPs to compare: group the entries by owning archive.
            ConcurrentBag<HashZipEntry> bag;
            if (!_zipDict.TryGetValue(entry.ZipFile, out bag))
            {
                bag = new ConcurrentBag<HashZipEntry>();
                _zipDict[entry.ZipFile] = bag;
            }
            bag.Add(entry);
        }
    }

    if (skipped > 0)
    {
        _log.log(string.Format(" skipped {0} malformed line(s)", skipped));
    }

    _log.log(string.Format(" to compare - Zips:{0} Files:{1}", _zipDict.Keys.Count, _toCompare.Count));

    // The comparison runs via the VP-tree path here (btnLoad_Click uses CompareAsync instead).
    CompareVPTree();
}
/// <summary>
/// Scores the two archives that <paramref name="he1"/> and <paramref name="he2"/> belong to
/// and adds a <c>ScoreEntry</c> to <c>_scores</c> when the score is above 20.
/// </summary>
/// <remarks>
/// The score is based on <c>MakeDetailList</c> over snapshots of both archives' file lists:
/// the number of detail rows whose score is below <c>MAX_SCORE</c> is expressed as a
/// percentage of each archive's file count, and the larger percentage is used.
/// </remarks>
/// <param name="matches">
/// Match count supplied by the caller. Kept for signature compatibility; the recorded
/// score is derived from the detail list only.
/// </param>
/// <param name="he1">Any entry of the first archive; only its <c>ZipFile</c> is used.</param>
/// <param name="he2">Any entry of the second archive; only its <c>ZipFile</c> is used.</param>
private void MakeScore(int matches, HashZipEntry he1, HashZipEntry he2)
{
    const int minReportedScore = 20;

    var zip1 = _zipDict[he1.ZipFile];
    var zip2 = _zipDict[he2.ZipFile];

    // Read each bag's size once and reuse it below.
    int count1 = zip1.Count;
    int count2 = zip2.Count;

    int closeRows = 0;
    foreach (var detail in MakeDetailList(zip1.ToList(), zip2.ToList()))
    {
        if (detail.score < MAX_SCORE)
        {
            closeRows++;
        }
    }

    int percentOfFirst = (int)(((double)closeRows / count1) * 100.0);
    int percentOfSecond = (int)(((double)closeRows / count2) * 100.0);
    int bestPercent = Math.Max(percentOfFirst, percentOfSecond);

    if (bestPercent <= minReportedScore)
    {
        return;
    }

    ScoreEntry entry = new ScoreEntry();
    entry.zipfile1 = he1.ZipFile;
    entry.zip1count = count1;
    entry.zipfile2 = he2.ZipFile;
    entry.zip2count = count2;
    entry.score = bestPercent;
    entry.sameSource = zip1.First().source == zip2.First().source;
    _scores.Add(entry);
}
/// <summary>
/// Distance between two entries, computed by passing their <c>phash</c> values to
/// <c>CompareForm.ham_dist</c> (presumably a Hamming distance - confirm in CompareForm).
/// </summary>
/// <param name="fd1">First entry.</param>
/// <param name="fd2">Second entry.</param>
/// <returns>The value returned by <c>CompareForm.ham_dist</c>.</returns>
private int CalcScoreP(HashZipEntry fd1, HashZipEntry fd2)
{
    var left = fd1.phash;
    var right = fd2.phash;
    return CompareForm.ham_dist(left, right);
}
/// <summary>
/// Compares all loaded archives against each other using a VP tree built over every
/// loaded file hash, then publishes the resulting archive-pair scores.
/// </summary>
/// <remarks>
/// Phase 1 queries the tree once per file and collects cross-archive file pairs whose
/// distance does not exceed <c>MAX_SCORE</c>. Phase 2 walks those pairs and calls
/// <c>MakeScore</c> once per run of consecutive pairs that share the same two archives.
/// The scores are finally copied to <c>_scoreList</c>, sorted with
/// <c>ScoreEntry.Comparer</c>, and <c>LoadZipList</c> is invoked.
/// Does nothing when no archives are loaded.
/// </remarks>
private void CompareVPTree()
{
    var ziplist = _zipDict.Keys.ToArray();
    int zipCount = ziplist.Length;
    if (zipCount < 1)
    {
        return;
    }

    SetStatus(string.Format("Hashes: {0} Archives: {1}", _hashSource, zipCount));

    _scores = new HashSet<ScoreEntry>(); // use a set so that AxB and BxA are not duplicated

    var tree = new VPTree<HashZipEntry>(CalcScoreP);
    var root = tree.make_vp(_toCompare);
    var ret = new List<HashZipEntry>(); // reused result buffer for each query

    updateProgress(0);
    int doneCount = 0;

    // Phase 1: collect candidate file pairs.
    var pairset = new HashSet<ScoreEntry2>();
    foreach (var azip in ziplist)
    {
        var filelist = _zipDict[azip];
        foreach (var afile in filelist)
        {
            // NOTE(review): the meaning of the third argument (1) is defined by
            // VPTree.query_vp - presumably a search radius or neighbour count; confirm.
            tree.query_vp(root, afile, 1, ret);

            foreach (var aret in ret)
            {
                if (aret == afile) // skip self
                {
                    continue;
                }

                if (aret.ZipFile == afile.ZipFile) // skip self-zip matches
                {
                    continue;
                }

                int dist = CalcScoreP(afile, aret);

                // reduce 'noise' by tossing too-distant matches
                if (dist > MAX_SCORE)
                {
                    continue;
                }

                ScoreEntry2 se2 = new ScoreEntry2();
                se2.F1 = afile;
                se2.F2 = aret;
                se2.score = dist;
                pairset.Add(se2);
            }

            ret.Clear();
        }

        // Progress is reported per archive, throttled to every fifth one.
        doneCount++;
        if (doneCount % 5 == 0)
        {
            int perc = (int)(100.0 * doneCount / zipCount);
            updateProgress(perc);
        }
    }

    // Phase 2: turn pairset into _scores.
    var pairlist = pairset.ToList();
    _log.log(string.Format(" pair candidates:{0}", pairlist.Count));

    if (pairlist.Count != 0)
    {
        // Run-length grouping: consecutive pairs with the same (F1 archive, F2 archive)
        // form one run, and each finished run is handed to MakeScore.
        // NOTE(review): only adjacent pairs are grouped, so pairs for the same two
        // archives that are not next to each other in enumeration order are scored
        // in separate MakeScore calls.
        int matches = 0;
        HashZipEntry he = pairlist[0].F1;
        HashZipEntry he2 = pairlist[0].F2;

        foreach (var apair in pairlist)
        {
            if (apair.F1.ZipFile == he.ZipFile)
            {
                if (apair.F2.ZipFile == he2.ZipFile)
                {
                    matches++;
                }
                else
                {
                    // Same first archive, different second archive: close the run.
                    MakeScore(matches, he, he2);
                    he2 = apair.F2;
                    matches = 1;
                }
            }
            else
            {
                // First archive changed: close the run and start a new one.
                MakeScore(matches, he, he2);
                he = apair.F1;
                he2 = apair.F2;
                matches = 1;
            }
        }

        // 20190426 the last entry was not processed as a possible candidate
        MakeScore(matches, he, he2);
    }

    updateProgress(0);

    _scoreList = _scores.ToList();
    _log.log(string.Format(" zip matches:{0}", _scoreList.Count));
    _scores = null;
    _scoreList.Sort(ScoreEntry.Comparer);

    LoadZipList();
}
/// <summary>
/// Click handler for the Load button: lets the user pick a folder, replaces the current
/// comparison set with the entries of that folder's "htih_pz.txt" and starts
/// <c>CompareAsync</c>.
/// </summary>
/// <remarks>
/// Every non-blank line must contain at least three '|'-separated fields: zip file,
/// inner path and an unsigned 64-bit hash. Lines that do not fit that shape are skipped
/// and reported in the log instead of throwing part-way through the load.
/// </remarks>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnLoad_Click(object sender, EventArgs e)
{
    string path;

    // The dialog is IDisposable; release it as soon as the selection has been read.
    using (var fbd = new FolderBrowserDialog())
    {
        if (!string.IsNullOrEmpty(_pathHistory))
        {
            fbd.SelectedPath = _pathHistory;
        }

        fbd.ShowNewFolderButton = false;
        if (fbd.ShowDialog() != DialogResult.OK)
        {
            return;
        }

        path = fbd.SelectedPath;
    }

    _pathHistory = path;

    var hashFile = Path.Combine(path, "htih_pz.txt");
    if (!File.Exists(hashFile))
    {
        MessageBox.Show("Folder has not been zip hashed!");
        return;
    }

    ClearForLoad();

    _hashSource++; // new file loaded

    int skipped = 0;
    using (var reader = new StreamReader(hashFile))
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            if (string.IsNullOrWhiteSpace(line))
            {
                continue;
            }

            var parts = line.Split('|');
            ulong hash;
            if (parts.Length < 3 ||
                !ulong.TryParse(parts[2],
                                System.Globalization.NumberStyles.Integer,
                                System.Globalization.CultureInfo.InvariantCulture,
                                out hash))
            {
                // Malformed record: keep loading the rest of the file.
                skipped++;
                continue;
            }

            var entry = new HashZipEntry
            {
                ZipFile = parts[0],
                InnerPath = parts[1],
                phash = hash,
                source = _hashSource
            };
            _toCompare.Add(entry);

            // Need a set of ZIPs to compare: group the entries by owning archive.
            ConcurrentBag<HashZipEntry> bag;
            if (!_zipDict.TryGetValue(entry.ZipFile, out bag))
            {
                bag = new ConcurrentBag<HashZipEntry>();
                _zipDict[entry.ZipFile] = bag;
            }
            bag.Add(entry);
        }
    }

    if (skipped > 0)
    {
        _log.log(string.Format("skipped {0} malformed line(s)", skipped));
    }

    _log.log(string.Format("to compare - Zips:{0} Files:{1}", _zipDict.Keys.Count, _toCompare.Count));

    CompareAsync();
}