/// <summary>
/// Loads the zip-hash file "htih_pz.txt" found in <paramref name="path"/>, appends its
/// entries to <c>_toCompare</c> / <c>_zipDict</c> and then runs <c>CompareVPTree</c>.
/// </summary>
/// <remarks>
/// Each non-blank line is expected to hold at least three '|'-separated fields:
/// zip file, inner path and an unsigned 64-bit hash. Lines that do not fit that
/// shape are skipped and counted in the log instead of aborting the load half-way.
/// Shows a message box and returns without changes when the hash file is missing.
/// </remarks>
/// <param name="path">Folder expected to contain "htih_pz.txt".</param>
internal void loadHashFile(string path)
        {
            _log.log(string.Format("Loading {0}", path));

            _pathmemory = path;

            var toload2 = Path.Combine(path, "htih_pz.txt");

            if (!File.Exists(toload2))
            {
                MessageBox.Show("Folder has not been zip hashed!");
                return;
            }

            _hashSource++; // new file loaded

            int skipped = 0;

            using (var sr = new StreamReader(toload2))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    if (string.IsNullOrWhiteSpace(line))
                    {
                        continue;
                    }

                    var parts = line.Split('|');

                    // The hash file is machine-written, so parse it culture-independently.
                    ulong hash;
                    if (parts.Length < 3 ||
                        !ulong.TryParse(parts[2],
                                        System.Globalization.NumberStyles.Integer,
                                        System.Globalization.CultureInfo.InvariantCulture,
                                        out hash))
                    {
                        skipped++;
                        continue;
                    }

                    var he = new HashZipEntry
                    {
                        ZipFile   = parts[0],
                        InnerPath = parts[1],
                        phash     = hash,
                        source    = _hashSource
                    };
                    _toCompare.Add(he);

                    // Need a set of ZIPs to compare: group the entries by their archive.
                    ConcurrentBag<HashZipEntry> filelist;
                    if (!_zipDict.TryGetValue(he.ZipFile, out filelist))
                    {
                        filelist = new ConcurrentBag<HashZipEntry>();
                        _zipDict[he.ZipFile] = filelist;
                    }
                    filelist.Add(he);
                }
            }

            if (skipped > 0)
            {
                _log.log(string.Format(" skipped malformed lines:{0}", skipped));
            }

            _log.log(string.Format(" to compare - Zips:{0} Files:{1}", _zipDict.Keys.Count, _toCompare.Count));

            CompareVPTree();
        }
        /// <summary>
        /// Scores the two archives that <paramref name="he1"/> and <paramref name="he2"/> belong to
        /// and adds a <c>ScoreEntry</c> to <c>_scores</c> when the score is above 20 percent.
        /// </summary>
        /// <remarks>
        /// The score is the number of entries returned by <c>MakeDetailList</c> whose
        /// <c>score</c> is below <c>MAX_SCORE</c>, expressed as a percentage of each
        /// archive's file count; the larger of the two percentages is used.
        /// </remarks>
        /// <param name="matches">
        /// Match count gathered by the caller. It does not influence the recorded score
        /// (the values previously derived from it were never read); kept so the signature
        /// stays compatible with existing callers.
        /// </param>
        /// <param name="he1">Any entry of the first archive; only its <c>ZipFile</c> is used.</param>
        /// <param name="he2">Any entry of the second archive; only its <c>ZipFile</c> is used.</param>
        private void MakeScore(int matches, HashZipEntry he1, HashZipEntry he2)
        {
            var zip1 = _zipDict[he1.ZipFile];
            var zip2 = _zipDict[he2.ZipFile];

            // Read each bag's size once instead of on every use.
            int zip1Count = zip1.Count;
            int zip2Count = zip2.Count;

            // NOTE(review): the comparison here is strict ('<'), while CompareVPTree keeps
            // pairs whose distance equals MAX_SCORE - confirm which bound is intended.
            int brutematches = 0;
            foreach (var detail in MakeDetailList(zip1.ToList(), zip2.ToList()))
            {
                if (detail.score < MAX_SCORE)
                {
                    brutematches++;
                }
            }

            int bscore1 = (int)(((double)brutematches / zip1Count) * 100.0);
            int bscore2 = (int)(((double)brutematches / zip2Count) * 100.0);
            int bscore  = Math.Max(bscore1, bscore2);

            if (bscore <= 20)
            {
                return;
            }

            var se = new ScoreEntry
            {
                zipfile1   = he1.ZipFile,
                zip1count  = zip1Count,
                zipfile2   = he2.ZipFile,
                zip2count  = zip2Count,
                score      = bscore,
                sameSource = zip1.First().source == zip2.First().source
            };

            _scores.Add(se);
        }
 /// <summary>
 /// Distance function handed to the VP tree: returns
 /// <c>CompareForm.ham_dist</c> applied to the <c>phash</c> values of the two entries
 /// (presumably a Hamming distance - confirm against <c>CompareForm</c>).
 /// </summary>
 /// <param name="fd1">First entry to compare.</param>
 /// <param name="fd2">Second entry to compare.</param>
 /// <returns>The value produced by <c>CompareForm.ham_dist</c>.</returns>
 private int CalcScoreP(HashZipEntry fd1, HashZipEntry fd2)
 {
     int distance = CompareForm.ham_dist(fd1.phash, fd2.phash);
     return distance;
 }
        /// <summary>
        /// Finds similar archives: builds a VP tree over every entry in <c>_toCompare</c>,
        /// queries it once per file, collects the cross-archive file pairs whose distance
        /// is at most <c>MAX_SCORE</c>, converts those pairs into archive scores via
        /// <c>MakeScore</c>, then sorts the result into <c>_scoreList</c> and calls
        /// <c>LoadZipList</c>.
        /// </summary>
        /// <remarks>
        /// Returns immediately when <c>_zipDict</c> holds no archives.
        /// </remarks>
        private void CompareVPTree()
        {
            var ziplist  = _zipDict.Keys.ToArray();
            int zipCount = ziplist.Length;

            if (zipCount < 1)
            {
                return;
            }

            SetStatus(string.Format("Hashes: {0} Archives: {1}", _hashSource, zipCount));

            _scores = new HashSet<ScoreEntry>(); // use a set so that AxB and BxA are not duplicated

            var tree = new VPTree<HashZipEntry>(CalcScoreP);
            var root = tree.make_vp(_toCompare);
            var ret  = new List<HashZipEntry>(); // reused result buffer for every query

            updateProgress(0);
            int doneCount = 0;

            var pairset = new HashSet<ScoreEntry2>();

            foreach (var azip in ziplist)
            {
                var filelist = _zipDict[azip];
                foreach (var afile in filelist)
                {
                    // NOTE(review): the meaning of the third argument (1) is defined by
                    // VPTree.query_vp, which is not visible here - confirm.
                    tree.query_vp(root, afile, 1, ret);

                    foreach (var aret in ret)
                    {
                        if (aret == afile)  // skip self
                        {
                            continue;
                        }
                        if (aret.ZipFile == afile.ZipFile) // skip self-zip matches
                        {
                            continue;
                        }
                        int dist = CalcScoreP(afile, aret); // reduce 'noise' by tossing too-distant matches
                        if (dist > MAX_SCORE)
                        {
                            continue;
                        }

                        var se2 = new ScoreEntry2
                        {
                            F1    = afile,
                            F2    = aret,
                            score = dist
                        };
                        pairset.Add(se2);
                    }

                    ret.Clear();
                }

                // Progress is reported per archive, on every fifth one.
                doneCount++;
                if (doneCount % 5 == 0)
                {
                    int perc = (int)(100.0 * doneCount / zipCount);
                    updateProgress(perc);
                }
            }

            // Turn pairset into _scores
            var pairlist = pairset.ToList();

            _log.log(string.Format(" pair candidates:{0}", pairlist.Count));

            if (pairlist.Count != 0)
            {
                // Run-length grouping: consecutive pairs that share the same (F1 zip, F2 zip)
                // are counted together and scored whenever either zip changes.
                // NOTE(review): HashSet<T> does not guarantee enumeration order, so this
                // grouping relies on pairs coming back in insertion order; the same zip pair
                // may also be scored more than once when its pairs are not adjacent.
                int          matches = 0;
                HashZipEntry he      = pairlist[0].F1;
                HashZipEntry he2     = pairlist[0].F2;
                foreach (var apair in pairlist)
                {
                    if (apair.F1.ZipFile == he.ZipFile)
                    {
                        if (apair.F2.ZipFile == he2.ZipFile)
                        {
                            matches++;
                        }
                        else
                        {
                            MakeScore(matches, he, he2);
                            he2     = apair.F2;
                            matches = 1;
                        }
                    }
                    else
                    {
                        MakeScore(matches, he, he2);
                        he      = apair.F1;
                        he2     = apair.F2;
                        matches = 1;
                    }
                }

                // 20190426 the last entry was not processed as a possible candidate
                MakeScore(matches, he, he2);
            }

            updateProgress(0); // reset the progress display now that the scan is finished
            _scoreList = _scores.ToList();
            _log.log(string.Format(" zip matches:{0}", _scoreList.Count));

            _scores = null;

            _scoreList.Sort(ScoreEntry.Comparer);

            LoadZipList();
        }
Beispiel #5
0
        /// <summary>
        /// Click handler for the load button: lets the user pick a folder, loads its
        /// "htih_pz.txt" zip-hash file into <c>_toCompare</c> / <c>_zipDict</c> and
        /// starts the comparison via <c>CompareAsync</c>.
        /// </summary>
        /// <remarks>
        /// Returns without changing the loaded data when the dialog is cancelled or the
        /// folder has no hash file (a message box is shown in the latter case). Lines
        /// that do not hold three '|'-separated fields with a numeric hash are skipped
        /// and counted in the log instead of aborting the load half-way.
        /// </remarks>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void btnLoad_Click(object sender, EventArgs e)
        {
            string path;

            // FolderBrowserDialog is IDisposable; release it as soon as the choice is read.
            using (var fbd = new FolderBrowserDialog())
            {
                if (!string.IsNullOrEmpty(_pathHistory))
                {
                    fbd.SelectedPath = _pathHistory;
                }
                fbd.ShowNewFolderButton = false;
                if (fbd.ShowDialog() != DialogResult.OK)
                {
                    return;
                }
                path = fbd.SelectedPath;
            }

            _pathHistory = path;

            var toload2 = Path.Combine(path, "htih_pz.txt");

            if (!File.Exists(toload2))
            {
                MessageBox.Show("Folder has not been zip hashed!");
                return;
            }

            ClearForLoad();

            _hashSource++; // new file loaded

            int skipped = 0;

            using (var sr = new StreamReader(toload2))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    if (string.IsNullOrWhiteSpace(line))
                    {
                        continue;
                    }

                    var parts = line.Split('|');

                    // The hash file is machine-written, so parse it culture-independently.
                    ulong hash;
                    if (parts.Length < 3 ||
                        !ulong.TryParse(parts[2],
                                        System.Globalization.NumberStyles.Integer,
                                        System.Globalization.CultureInfo.InvariantCulture,
                                        out hash))
                    {
                        skipped++;
                        continue;
                    }

                    var he = new HashZipEntry
                    {
                        ZipFile   = parts[0],
                        InnerPath = parts[1],
                        phash     = hash,
                        source    = _hashSource
                    };
                    _toCompare.Add(he);

                    // Need a set of ZIPs to compare: group the entries by their archive.
                    ConcurrentBag<HashZipEntry> filelist;
                    if (!_zipDict.TryGetValue(he.ZipFile, out filelist))
                    {
                        filelist = new ConcurrentBag<HashZipEntry>();
                        _zipDict[he.ZipFile] = filelist;
                    }
                    filelist.Add(he);
                }
            }

            if (skipped > 0)
            {
                _log.log(string.Format("skipped malformed lines:{0}", skipped));
            }

            _log.log(string.Format("to compare - Zips:{0} Files:{1}", _zipDict.Keys.Count, _toCompare.Count));

            CompareAsync();
        }