Esempio n. 1
0
        /// <summary>
        /// Queries table <c>SV</c> for rows whose <c>LopSH</c> column equals
        /// <paramref name="ID_Lop"/> and maps the first returned row through <c>set1LSH</c>.
        /// </summary>
        /// <param name="ID_Lop">Value compared against the <c>LopSH</c> column.</param>
        /// <returns>The object produced by <c>set1LSH</c> for the first row of the result.</returns>
        public LSH get1LSH(int ID_Lop)
        {
            string sql = " select * from SV where LopSH = " + ID_Lop;

            // Only the first row is used; NOTE(review): indexing Rows[0] presumably
            // throws when the query returns no rows - confirm callers expect that.
            var firstRow = DBHelper.Instance.GetRecords(sql).Rows[0];

            return set1LSH(firstRow);
        }
Esempio n. 2
0
        /// <summary>
        /// Maps a data row to an <see cref="LSH"/> instance.
        /// </summary>
        /// <param name="i">Row providing the <c>ID_Lop</c> and <c>NameLop</c> columns.</param>
        /// <returns>
        /// A new <see cref="LSH"/> whose <c>ID_Lop</c> is parsed from the text of the
        /// <c>ID_Lop</c> column and whose <c>NameLop</c> is the text of the <c>NameLop</c> column.
        /// </returns>
        public LSH GetLSH(DataRow i)
        {
            return new LSH
            {
                // The id goes through its string form before being converted to an int.
                ID_Lop  = Convert.ToInt32(i["ID_Lop"].ToString()),
                NameLop = i["NameLop"].ToString()
            };
        }
Esempio n. 3
0
        /// <summary>
        /// Builds an <see cref="LSH"/> from the <c>ID_Lop</c> and <c>NameLop</c> columns of a row.
        /// </summary>
        /// <param name="i">Source row.</param>
        /// <returns>The populated <see cref="LSH"/>.</returns>
        private LSH set1LSH(DataRow i)
        {
            var mapped = new LSH
            {
                // Column value is rendered as text first, then parsed as an integer.
                ID_Lop  = Convert.ToInt32(i["ID_Lop"].ToString()),
                NameLop = i["NameLop"].ToString()
            };

            return mapped;
        }
Esempio n. 4
0
        /// <summary>
        /// Converts a data row into an <see cref="LSH"/>.
        /// </summary>
        /// <param name="i">Row providing the <c>ID_Lop</c> and <c>NameLop</c> columns.</param>
        /// <returns>
        /// A new <see cref="LSH"/> with <c>ID_Lop</c> converted directly from the column value
        /// and <c>NameLop</c> taken from the column's string representation.
        /// </returns>
        public LSH GetLSH(DataRow i)
        {
            var result = new LSH();

            result.ID_Lop  = Convert.ToInt32(i["ID_Lop"]);
            result.NameLop = i["NameLop"].ToString();

            return result;
        }
Esempio n. 5
0
        /// <summary>
        /// Finds, for each listing, its nearest neighbour by description min-hash using LSH and,
        /// when the two prices are within 20% of each other, records the pair as likely duplicates.
        /// </summary>
        /// <param name="listings">
        /// Listings to compare. Each listing's <c>minhash_description</c> values are copied into a
        /// matrix with <c>minHashCount</c> columns; matched listings get
        /// <c>likely_duplicate_id_by_description</c> and <c>similarity_description</c> assigned.
        /// </param>
        /// <param name="job_id">Job identifier passed to <c>ProgressManager</c> for progress reporting.</param>
        /// <param name="duplicates">Receives both directions (a -> b and b -> a) of every accepted pair.</param>
        private static void CalculateLshForListingSet(List <Listing> listings, string job_id, Dictionary <long, long> duplicates)
        {
            // Nothing to compare; this also avoids building an LSH index with zero buckets.
            if (listings.Count == 0)
            {
                return;
            }

            var numSimilarityBuckets = (int)Math.Ceiling(listings.Count / 100M);

            // First make 2 dimensional array (docs by min-hashes)
            var matrix = new int[listings.Count, minHashCount];

            for (int listing = 0; listing < listings.Count; listing++)
            {
                var minHashes = listings[listing].minhash_description;
                for (int hash = 0; hash < minHashes.Count; hash++)
                {
                    matrix[listing, hash] = (int)minHashes[hash];
                }
            }

            // Now set LSH
            var lsh = new LSH(matrix, numSimilarityBuckets);

            lsh.Calc();

            // Set closest duplicate on each listing
            // NOTE(review): duplicatesFound is never written to, so the ContainsKey check below can
            // never be true. It may have been meant to consult `duplicates` - confirm the intent.
            var duplicatesFound    = new Dictionary <long, long>();
            var singleItemProgress = ProgressManager.CalculateLoopIncrement(listings.Count, 0.2M);

            for (int listing = 0; listing < listings.Count; listing++)
            {
                ProgressManager.IncrementJobPercentBy(job_id, singleItemProgress);

                var nearest = lsh.GetNearest(listing);
                if (!nearest.Any())
                {
                    continue;
                }

                var thisListing    = listings[listing];
                var nearestListing = listings[nearest[0]];

                // A zero price makes the ratio undefined (the division would throw), so the
                // pair cannot be confirmed as a duplicate and is skipped.
                if (thisListing.buy_now_price == 0)
                {
                    continue;
                }

                // Only accept neighbours whose price is within +/-20% of this listing's price.
                var priceRatio = nearestListing.buy_now_price / thisListing.buy_now_price;
                if (priceRatio < 0.8M || priceRatio > 1.2M)
                {
                    continue;
                }

                if (duplicatesFound.ContainsKey(nearestListing.id))
                {
                    continue;
                }

                thisListing.likely_duplicate_id_by_description = nearestListing.id;

                // NOTE(review): `nearest` is used above as a list of listing indices, yet here it is
                // compared against this listing's row of min-hash values - verify that this is the
                // intended second argument (the neighbour's min-hash row may have been meant).
                thisListing.similarity_description = Jaccard.Calc(ArrayHelpers.GetRow <int>(matrix, listing).ToList(), nearest);

                duplicates[nearestListing.id] = thisListing.id;
                duplicates[thisListing.id]    = nearestListing.id;
            }
        }
Esempio n. 6
0
        /// <summary>
        /// Runs <c>SELECT * FROM LSH</c> and wraps every returned row in an <see cref="LSH"/>.
        /// </summary>
        /// <returns>One <see cref="LSH"/> per row, in the order the rows are enumerated.</returns>
        public List <LSH> GetAllLSH()
        {
            DataTable table  = DBHelper.Instance.ExecuteQuery("SELECT * FROM LSH");
            var       result = new List <LSH>();

            foreach (DataRow row in table.Rows)
            {
                // Row-to-object mapping is delegated to the LSH(DataRow) constructor.
                result.Add(new LSH(row));
            }

            return result;
        }
Esempio n. 7
0
        /// <summary>
        /// Reads every row of table <c>LSH</c> and maps it to an <see cref="LSH"/> object.
        /// </summary>
        /// <returns>
        /// A list with one <see cref="LSH"/> per row; <c>ID_Lop</c> is converted from the
        /// <c>ID_Lop</c> column and <c>NameLop</c> is the text of the <c>NameLop</c> column.
        /// </returns>
        public List <LSH> getAllLSH_DAL()
        {
            var result = new List <LSH>();
            var rows   = DBHelper.Instance.getRecords("select * from LSH").Rows;

            foreach (DataRow dr in rows)
            {
                result.Add(new LSH
                {
                    ID_Lop  = Convert.ToInt32(dr["ID_Lop"]),
                    NameLop = dr["NameLop"].ToString()
                });
            }

            return result;
        }
Esempio n. 8
0
        /// <summary>
        /// Demo entry point: generates random text documents, indexes them with <see cref="LSH"/>,
        /// then looks up the same word through the index and through a plain linear scan.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            const int    documentCount = 10000;
            const string term          = "cum";

            // Each generated document gets a random lorem-ipsum sentence as its text.
            var generator = new Faker <TextDocument>()
                            .RuleFor(x => x.Text, faker => faker.Lorem.Sentence());
            var documents = generator.Generate(documentCount);

            var lsh = new LSH(documents, 50000);

            // Both results are only held in locals; nothing is printed.
            var lshSearch    = lsh.Search(term);
            var linearSearch = documents
                               .Where(x => x.Text.Split(" ").Contains(term))
                               .ToList();

            // Wait for Enter before the process exits.
            Console.ReadLine();
        }
Esempio n. 9
0
        /// <summary>
        /// Converts every row of <c>CSDL.Instance.DTLSH</c> into an <see cref="LSH"/> object.
        /// </summary>
        /// <returns>
        /// A list with one <see cref="LSH"/> per row; <c>ID_Lop</c> is parsed from the text of the
        /// <c>ID_Lop</c> column and <c>NameLop</c> is the text of the <c>NameLop</c> column.
        /// </returns>
        public List <LSH> GetAllLSH()
        {
            var       result = new List <LSH>();
            DataTable source = CSDL.Instance.DTLSH;

            foreach (DataRow row in source.Rows)
            {
                var item = new LSH();
                item.ID_Lop  = Convert.ToInt32(row["ID_Lop"].ToString());
                item.NameLop = row["NameLop"].ToString();
                result.Add(item);
            }

            return result;
        }
Esempio n. 10
0
        /// <summary>
        /// Generates the sentence-similarity graph for <paramref name="docs"/> via min-hashing and LSH.
        /// The result is appended to instance state: <c>nodes</c> collects node indices, <c>NB</c> holds
        /// the concatenated neighbour lists and <c>OFF</c> holds the running neighbour count (starting
        /// with 0, then one entry per processed sentence). The elapsed time in seconds is written to
        /// the console.
        /// </summary>
        /// <param name="docs">Sentences to relate; entry <c>i</c> is also stored in <c>senteceNames[i]</c>.</param>
        public void GenerateSSGraph(List <string> docs)
        {
            Stopwatch sw = new Stopwatch();

            sw.Start();

            for (int i = 0; i < docs.Count; i++)
            {
                senteceNames[i] = docs[i];
            }

            int r    = docs.Count; // number of sentences
            int n    = 40;         // number of min-hash values per sentence
            int rows = 2;          // second LSH constructor argument (original note: "b= n / r")

            // Shingle vector of every sentence.
            int[][] shingleVectors = new int[r][];
            for (int i = 0; i < r; i++)
            {
                shingleVectors[i] = GetShingleVec(docs[i]).ToArray();
            }

            MinHash mh = new MinHash(r, n);

            // Min-hash signature matrix: one row per sentence, n columns.
            int[,] minhashes = new int[r, n];
            for (int i = 0; i < r; i++)
            {
                List <int>  doc = shingleVectors[i].ToList();
                List <uint> hvs = mh.GetMinHash(doc).ToList();
                for (int j = 0; j < hvs.Count; j++)
                {
                    minhashes[i, j] = (int)hvs[j];
                }
            }

            OFF.Add(0);
            int conCount = 0;

            LSH lsh = new LSH(minhashes, rows);

            lsh.Calc();
            int idx = 0;

            // Visit every sentence. GetLength(0) is the row count; the previous bound,
            // GetUpperBound(0), is the last valid index and so left the final sentence out.
            int sentenceCount = minhashes.GetLength(0);
            for (int k = 0; k < sentenceCount; k++)
            {
                List <int> nearest = lsh.GetNearest(k);
                if (!nodes.Contains(k))
                {
                    nodes.Add(k);
                }
                foreach (int i in nearest)
                {
                    if (!nodes.Contains(i))
                    {
                        nodes.Add(i);
                    }
                    // idx advances together with conCount, so this skips a neighbour whose index
                    // equals the number of edges stored so far.
                    // NOTE(review): possibly meant to skip self-links (i == k) - confirm.
                    if (i == idx)
                    {
                        continue;
                    }
                    NB.Add(i);
                    conCount++;
                    ++idx;
                }
                OFF.Add(conCount);
            }
            sw.Stop();
            Console.WriteLine(sw.ElapsedMilliseconds / (double)1000);
        }
Esempio n. 11
0
        /// <summary>
        /// Generates the sentence-similarity graph for sentences drawn from several documents
        /// (comparative summarisation) via min-hashing and LSH. The result is appended to instance
        /// state: <c>nodes</c> collects node indices, <c>NB</c> holds the concatenated neighbour lists,
        /// <c>SIGN</c> holds one weight per stored neighbour (1 when
        /// <c>Helpers.AreFromSameGroup</c> returns true for the pair, -0.5 otherwise) and <c>OFF</c>
        /// holds the running neighbour count (starting with 0, then one entry per processed sentence).
        /// The elapsed time in seconds is written to the console.
        /// </summary>
        /// <param name="docs">Sentences to relate; entry <c>i</c> is also stored in <c>senteceNames[i]</c>.</param>
        /// <param name="offsets">Passed to <c>Helpers.AreFromSameGroup</c> to decide whether two sentences share a group.</param>
        public void GenerateSSGraphForComparativeSum(List <string> docs, List <int> offsets)
        {
            Stopwatch sw = new Stopwatch();

            sw.Start();

            for (int i = 0; i < docs.Count; i++)
            {
                senteceNames[i] = docs[i];
            }

            int r    = docs.Count; // number of sentences
            int n    = 100;        // number of min-hash values per sentence
            int rows = 5;          // second LSH constructor argument (original note: "n / r")

            // Shingle vector of every sentence.
            int[][] shingleVectors = new int[r][];
            for (int i = 0; i < r; i++)
            {
                shingleVectors[i] = GetShingleVec(docs[i]).ToArray();
            }

            MinHash mh = new MinHash(r, n);

            // Min-hash signature matrix: one row per sentence, n columns.
            int[,] minhashes = new int[r, n];
            for (int i = 0; i < r; i++)
            {
                List <int>  doc = shingleVectors[i].ToList();
                List <uint> hvs = mh.GetMinHash(doc).ToList();
                for (int j = 0; j < hvs.Count; j++)
                {
                    minhashes[i, j] = (int)hvs[j];
                }
            }

            OFF.Add(0);
            int conCount = 0;

            LSH lsh = new LSH(minhashes, rows);

            lsh.Calc();
            int idx = 0;

            // Visit every sentence. GetLength(0) is the row count; the previous bound,
            // GetUpperBound(0), is the last valid index and so left the final sentence out.
            int sentenceCount = minhashes.GetLength(0);
            for (int k = 0; k < sentenceCount; k++)
            {
                List <int> nearest = lsh.GetNearest(k);
                if (!nodes.Contains(k))
                {
                    nodes.Add(k);
                }
                foreach (int i in nearest)
                {
                    if (!nodes.Contains(i))
                    {
                        nodes.Add(i);
                    }
                    // idx advances together with conCount, so this skips a neighbour whose index
                    // equals the number of edges stored so far.
                    // NOTE(review): possibly meant to skip self-links (i == k) - confirm.
                    if (i == idx)
                    {
                        continue;
                    }
                    NB.Add(i);

                    // One SIGN entry per NB entry, so the two lists stay index-aligned.
                    if (Helpers.AreFromSameGroup(k, i, offsets))
                    {
                        SIGN.Add(1);
                    }
                    else
                    {
                        SIGN.Add(-0.5f);
                    }
                    conCount++;
                    ++idx;
                }
                OFF.Add(conCount);
            }
            sw.Stop();
            Console.WriteLine(sw.ElapsedMilliseconds / (double)1000);
        }
Esempio n. 12
0
		/// <summary>
		/// Returns the ids of up to <paramref name="k"/> users most similar to
		/// <paramref name="id_user"/>, based on MinHash/LSH over the sets of dishes each user ordered.
		/// </summary>
		/// <param name="k">Length of the returned array (maximum number of neighbours).</param>
		/// <param name="id_user">Id of the user whose neighbours are requested.</param>
		/// <returns>
		/// An array of length <paramref name="k"/> holding neighbour user ids in descending similarity
		/// order; slots that could not be filled keep the default value 0. All slots are 0 when the
		/// user has no recorded orders or no similar users were found.
		/// </returns>
		private static int[] Calculeaza_vecini_LSH(int k, int id_user)
        {
            // Fetch from the DB a dictionary mapping every user id to the list of
            // dishes that user ordered.
            Dictionary<int, List<int>> toatePrep = DatabaseFunctions.
                preparateComandateDupaUtilizator();

            int[] vecini = new int[k];

            // Without this check an unknown user would silently be treated as the user at
            // position 0 and receive that user's neighbours.
            if (!toatePrep.ContainsKey(id_user))
            {
                return vecini;
            }

            Dictionary<int, HashSet<int>> signatures = new Dictionary<int, HashSet<int>>();
            HashSet<int> single_signatures = new HashSet<int>();

            foreach (KeyValuePair<int, List<int>> entry in toatePrep)
            {
                signatures.Add(entry.Key, entry.Value.ToHashSet<int>());
                single_signatures.UnionWith(entry.Value);
            }

            int numSets = signatures.Count;
            int numHashFunctions = single_signatures.Count;

            MinHash<int> minHash = new MinHash<int>(numHashFunctions);
            int[][] minHashValues = minHash.initializeHashBuckets(numSets,
                numHashFunctions);

            int index = 0, index_cautat = 0;
            List<HashSet<int>> list_signatures = new List<HashSet<int>>();
            // userIds[p] is the user id whose signature sits at position p.
            List<int> userIds = new List<int>(numSets);
            foreach (var entry in signatures)
            {
                minHash.computeMinHashForSet(entry.Value, index, minHashValues,
                    single_signatures);
                if (entry.Key == id_user)
                {
                    index_cautat = index;
                }
                index++;
                list_signatures.Add(entry.Value);
                userIds.Add(entry.Key);
            }

            LSH<int> lsh = new LSH<int>(minHashValues, list_signatures);
            Dictionary<int, double> closeSimilarItems = lsh.closestSimilarItems(index_cautat, minHash);

            // Translate positions reported by the LSH back into user ids; positions that do not
            // correspond to any signature are ignored.
            Dictionary<int, double> results = new Dictionary<int, double>();
            foreach (var closeItem in closeSimilarItems)
            {
                int poz = closeItem.Key;
                if (poz >= 0 && poz < userIds.Count)
                {
                    results.Add(userIds[poz], closeItem.Value);
                }
            }

            // Repeatedly take the most similar remaining user until k slots are filled
            // or the candidates run out.
            for (int ind = 0; ind < k && results.Count > 0; ind++)
            {
                vecini[ind] = results.MaxBy(x => x.Value).Key;
                results.Remove(vecini[ind]);
            }

            return vecini;

        }