コード例 #1
0
        /// <summary>
        /// Generates MinHash signatures for the number documents held by a
        /// <c>NumberDocumentCreator</c>, runs <c>Util.calculateMinHashFunctionsAccuracy</c>
        /// on the documents and their signatures, then waits for a key press.
        /// </summary>
        /// <param name="sample">Not read by this method; retained so existing callers keep compiling.</param>
        /// <param name="sample_size">Not read by this method; retained so existing callers keep compiling.</param>
        static void processNumbersTest(bool sample = false, int sample_size = 1000)
        {
            int numHashFunctions = 2000;
            double simThreshold = 0.65;

            MinHasher2 minHasher = new MinHasher2(numHashFunctions, simThreshold);

            // NOTE(review): the meaning of the two constructor arguments is not visible here;
            // processNumbersTest3 passes its "universeSize" as the second one - confirm against
            // NumberDocumentCreator.
            NumberDocumentCreator numDocCreator = new NumberDocumentCreator(10, 100000);
            Dictionary<int, int[]> documents = numDocCreator.documentCollection;

            // MinHash every document with the hasher configured above.
            Console.WriteLine("\r\nGenerating MinHash signatures ... ");
            Dictionary<int, int[]> signatures = minHasher.createMinhashCollection(documents);

            // The returned value is not consumed further in this method.
            // NOTE(review): presumably the average percentage difference between the actual and
            // the MinHash-estimated Jaccard values - confirm against Util.
            double avgDiffPercActualVsMinhashJaccard = Util.calculateMinHashFunctionsAccuracy(documents, signatures);

            // Block until a key is pressed so the console output stays visible.
            Console.ReadKey();
        }
コード例 #2
0
        /// <summary>
        /// Creates two number documents, then writes to the console the value of
        /// <c>MinHasher2.calculateJaccard</c> for the raw documents and for their
        /// MinHash signatures.
        /// </summary>
        /// <remarks>
        /// Code that previously followed an unconditional <c>return</c> (experiments based on
        /// <c>MinHasher3</c> / <c>MinHasher_Buckets3</c>) could never execute and was removed.
        /// </remarks>
        /// <param name="sample">Not read by this method; retained so existing callers keep compiling.</param>
        /// <param name="sample_size">Not read by this method; retained so existing callers keep compiling.</param>
        static void processNumbersTest3(bool sample = false, int sample_size = 1000)
        {
            int numHashFunctions = 128;
            int universeSize = 1000;
            double simThreshold = 0.65;

            MinHasher2 minHasher = new MinHasher2(numHashFunctions, simThreshold);
            NumberDocumentCreator docCreator = new NumberDocumentCreator(10, universeSize);

            int[] firstDoc = docCreator.createDocument(universeSize);
            int[] secondDoc = docCreator.createDocument(universeSize);

            Console.WriteLine("Actual jaccaard: " + MinHasher2.calculateJaccard(firstDoc, secondDoc));

            // Signatures are computed in the same order as before (first document, then second).
            var firstSignature = minHasher.getMinHashSignature(firstDoc);
            var secondSignature = minHasher.getMinHashSignature(secondDoc);
            Console.WriteLine("MinHash jaccaard: " + MinHasher2.calculateJaccard(firstSignature, secondSignature));
        }