Пример #1
0
        /// <summary>
        /// Collects the object ids of stored entries whose fingerprint lies within
        /// <c>_kDistance</c> of <paramref name="simhash"/>.
        /// </summary>
        /// <param name="simhash">The fingerprint to look up near-duplicates for.</param>
        /// <returns>
        /// A set of object ids (currently typed as <see cref="long"/>); empty when no
        /// stored entry is close enough.
        /// </returns>
        /// <exception cref="InvalidOperationException">
        /// <c>Simhash.FpSize</c> differs from the fingerprint size held by this instance.
        /// </exception>
        public HashSet<long> GetNearDups(SimhashResult simhash)
        {
            if (Simhash.FpSize != _fpSize)
            {
                // A specific exception type with a message makes the mismatch diagnosable;
                // it still derives from Exception, so existing catch blocks keep working.
                throw new InvalidOperationException(
                    $"Simhash fingerprint size ({Simhash.FpSize}) does not match the fingerprint size of this instance ({_fpSize}).");
            }

            var nearDuplicates = new HashSet<long>();

            foreach (var key in GetEnumerableKeys(simhash))
            {
                // Keys without a bucket have no candidates to compare against.
                if (!_bucket.TryGetValue(key, out var candidates))
                {
                    continue;
                }

                foreach (var candidate in candidates)
                {
                    // Each bucket entry is a "<fingerprint>,<objId>" string.
                    var parts = candidate.Split(',');

                    var fingerprint = Convert.ToUInt64(parts[0]);
                    var objId       = Convert.ToInt64(parts[1]);

                    // Sharing a bucket key only makes an entry a candidate; the distance
                    // to the full fingerprint decides whether it is reported.
                    var distance = simhash.Distance(new SimhashResult(fingerprint));
                    if (distance <= _kDistance)
                    {
                        nearDuplicates.Add(objId);
                    }
                }
            }

            return nearDuplicates;
        }