コード例 #1
0
        /// <summary>
        /// POST: api/Document. Accepts a multipart/form-data upload and registers one
        /// <c>Document</c> (file name plus MinHash signature of its extracted text) per uploaded file.
        /// </summary>
        /// <returns>
        /// 200 OK after every uploaded file has been processed, or a 500 Internal Server Error
        /// response wrapping the exception when reading, extracting or hashing throws.
        /// </returns>
        /// <exception cref="HttpResponseException">
        /// Thrown with 415 Unsupported Media Type when the request body is not MIME multipart content.
        /// </exception>
        public async Task <HttpResponseMessage> Post()
        {
            // Reject anything that is not a multipart upload before touching the disk.
            bool isMultipart = Request.Content.IsMimeMultipartContent();
            if (!isMultipart)
            {
                throw new HttpResponseException(HttpStatusCode.UnsupportedMediaType);
            }

            // The provider is rooted at App_Data; each uploaded part is exposed below through
            // its LocalFileName.
            // NOTE(review): nothing in this method removes those files afterwards - confirm
            // whether cleanup happens elsewhere.
            var streamProvider = new MultipartFormDataStreamProvider(
                HttpContext.Current.Server.MapPath("~/App_Data"));

            try
            {
                await Request.Content.ReadAsMultipartAsync(streamProvider);

                foreach (MultipartFileData upload in streamProvider.FileData)
                {
                    Document entry = new Document();

                    // Text extraction -> shingles; the signature itself is computed further down.
                    string extractedText = new TextExtractor().Extract(upload.LocalFileName).Text;
                    List<string> shingleSet = _minHashes.GetShingles(extractedText);

                    // IDs are derived from the current size of the shared collection.
                    // NOTE(review): not safe if two uploads are processed concurrently - verify.
                    entry.ID = documents.Count + 1;

                    // The Content-Disposition file name arrives wrapped in quotes; drop them.
                    entry.Name = upload.Headers.ContentDisposition.FileName.Replace("\"", "");
                    entry.MinHashes = _minHashes.GetMinHash(shingleSet);

                    documents.Add(entry);
                }

                return Request.CreateResponse(HttpStatusCode.OK);
            }
            catch (System.Exception ex)
            {
                return Request.CreateErrorResponse(HttpStatusCode.InternalServerError, ex);
            }
        }
コード例 #2
0
ファイル: MinHashTests.cs プロジェクト: ErichL/DocumentID
        /// <summary>
        /// Verifies that two different shingle lists hashed by the same <c>MinHash</c>
        /// instance do not produce the same signature.
        /// </summary>
        public void GetMinHashTest_ForDifferentInputs()
        {
            // Arrange
            var hasher = new MinHash(10);
            var firstShingles = new List<string>
            {
                "hello world i", "world i am", "i am a", "am a string", "a string", "string"
            };
            var secondShingles = new List<string> { "not the same", "the same", "same" };

            // Act
            List<uint> firstSignature  = hasher.GetMinHash(firstShingles);
            List<uint> secondSignature = hasher.GetMinHash(secondShingles);

            // Assert: the signatures are compared through their space-joined text form.
            string secondText = string.Join(" ", secondSignature);
            string firstText  = string.Join(" ", firstSignature);
            Assert.AreNotEqual(secondText, firstText);
        }
コード例 #3
0
        /// <summary>
        /// Fills in the description MinHash signature of every listing that does not have one yet.
        /// </summary>
        /// <remarks>
        /// Pass 1 makes sure each listing has a description n-gram profile and gives every distinct
        /// n-gram a sequential integer id. Pass 2 converts each listing's n-grams to those ids and
        /// hashes them with a <c>MinHash</c> sized to the number of distinct n-grams. Job progress
        /// is advanced once per listing in pass 1 and once per listing actually hashed in pass 2.
        /// NOTE(review): despite the name, only the *description* members are touched here.
        /// </remarks>
        /// <param name="listings">Listings to process; their n-gram, word-id and min-hash members are updated in place.</param>
        /// <param name="job_id">Job whose progress is reported through <c>ProgressManager</c>.</param>
        private static void BuildTitleMinHashes(List <Listing> listings, string job_id)
        {
            var ngramIds = new Dictionary<string, int>();
            var nextId   = 0;

            // This first instance is only used to build n-gram profiles.
            var profiler = new MinHash(200000, minHashCount);

            var progressStep = ProgressManager.CalculateLoopIncrement(listings.Count, 0.2M);

            // Pass 1: ensure n-gram profiles exist and collect the n-gram universe.
            // Iterates over a snapshot of the list, as the original did.
            foreach (var item in listings.ToList())
            {
                bool hasProfile = item.ngrams_description.Any();
                if (!hasProfile)
                {
                    item.ngrams_description = profiler.GetProfile(item.description, nGramLength);
                }

                foreach (var gram in item.ngrams_description.Keys)
                {
                    if (ngramIds.ContainsKey(gram))
                    {
                        continue;
                    }

                    ngramIds.Add(gram, nextId++);
                }

                ProgressManager.IncrementJobPercentBy(job_id, progressStep);
            }

            progressStep = ProgressManager.CalculateLoopIncrement(listings.Count, 0.2M);

            // Pass 2: hash with a MinHash whose universe size is the number of distinct n-grams.
            var hasher = new MinHash(ngramIds.Count, minHashCount);

            foreach (var item in listings)
            {
                // Listings that already carry a signature are skipped (and report no progress).
                if (!item.minhash_description.Any())
                {
                    // Translate the listing's n-grams into their ids.
                    foreach (var gram in item.ngrams_description.Keys)
                    {
                        item.word_ids_description.Add(ngramIds[gram]);
                    }

                    item.minhash_description = hasher.GetMinHash(item.word_ids_description);

                    ProgressManager.IncrementJobPercentBy(job_id, progressStep);
                }
            }
        }
コード例 #4
0
        /// <summary>
        /// Builds the sentence-similarity graph for <paramref name="docs"/> using MinHash
        /// signatures and LSH, appending the result to the <c>NB</c>, <c>OFF</c> and
        /// <c>nodes</c> members.
        /// </summary>
        /// <remarks>
        /// 1) Each entry of <paramref name="docs"/> is stored in <c>senteceNames</c> under its index.
        /// 2) A shingle vector and then a MinHash signature are computed for each entry.
        /// 3) For every entry, the neighbours reported by LSH are appended to the neighbourhood
        ///    list <c>NB</c>; <c>OFF</c> receives an initial 0 and then the running neighbour
        ///    count after each entry.
        /// The elapsed time in seconds is written to the console.
        /// </remarks>
        /// <param name="docs">Text units (sentences, per the original notes) to place in the graph, one node per entry.</param>
        public void GenerateSSGraph(List <string> docs)
        {
            Stopwatch sw = new Stopwatch();

            sw.Start();

            for (int i = 0; i < docs.Count; i++)
            {
                senteceNames[i] = docs[i];
            }

            int r    = docs.Count; // number of entries = rows of the signature matrix
            int n    = 40;         // columns of the signature matrix; also handed to MinHash
            int rows = 2;          // handed to LSH

            // Phase 1: one shingle vector per entry.
            int[][] shingleVectors = new int[r][];
            for (int i = 0; i < r; i++)
            {
                shingleVectors[i] = GetShingleVec(docs[i]).ToArray();
            }

            MinHash mh = new MinHash(r, n);

            // Phase 2: one MinHash signature per entry, stored row-wise.
            int[,] minhashes = new int[r, n];
            for (int i = 0; i < r; i++)
            {
                List <int>  doc = shingleVectors[i].ToList();
                List <uint> hvs = mh.GetMinHash(doc).ToList();
                for (int j = 0; j < hvs.Count; j++)
                {
                    minhashes[i, j] = (int)hvs[j];
                }
            }

            OFF.Add(0);
            int conCount = 0;

            LSH lsh = new LSH(minhashes, rows);

            lsh.Calc();
            int idx = 0;

            // GetLength(0) is the row count. The previous bound, GetUpperBound(0), is the last
            // valid index (row count - 1); used as an exclusive limit it skipped the final entry,
            // so that entry never got a neighbour list or an OFF element.
            int entryCount = minhashes.GetLength(0);
            for (int k = 0; k < entryCount; k++)
            {
                List <int> nearest = lsh.GetNearest(k);
                if (!nodes.Contains(k))
                {
                    nodes.Add(k);
                }

                foreach (int i in nearest)
                {
                    if (!nodes.Contains(i))
                    {
                        nodes.Add(i);
                    }

                    // NOTE(review): idx is only ever incremented together with conCount, so this
                    // skips a neighbour whose index equals the number of edges stored so far.
                    // That looks like it was meant to be a self-loop check (i == k) - confirm
                    // before changing; behaviour is kept as-is here.
                    if (i == idx)
                    {
                        continue;
                    }

                    NB.Add(i);
                    conCount++;
                    ++idx;
                }

                OFF.Add(conCount);
            }

            sw.Stop();
            Console.WriteLine(sw.ElapsedMilliseconds / (double)1000);
        }
コード例 #5
0
        /// <summary>
        /// Builds the sentence-similarity graph for <paramref name="docs"/> using MinHash
        /// signatures and LSH, for the comparative-summary case where the entries come from
        /// several different documents. Results are appended to the <c>NB</c>, <c>OFF</c>,
        /// <c>SIGN</c> and <c>nodes</c> members.
        /// </summary>
        /// <remarks>
        /// 1) Each entry of <paramref name="docs"/> is stored in <c>senteceNames</c> under its index.
        /// 2) A shingle vector and then a MinHash signature are computed for each entry.
        /// 3) For every entry, the neighbours reported by LSH are appended to <c>NB</c>, and a
        ///    matching value is appended to <c>SIGN</c>: 1 when
        ///    <c>Helpers.AreFromSameGroup</c> reports the pair as belonging to the same group,
        ///    -0.5 otherwise. <c>OFF</c> receives an initial 0 and then the running neighbour
        ///    count after each entry.
        /// The elapsed time in seconds is written to the console.
        /// </remarks>
        /// <param name="docs">Text units (sentences, per the original notes) to place in the graph, one node per entry.</param>
        /// <param name="offsets">Passed through to <c>Helpers.AreFromSameGroup</c> to decide group membership of a pair.</param>
        public void GenerateSSGraphForComparativeSum(List <string> docs, List <int> offsets)
        {
            Stopwatch sw = new Stopwatch();

            sw.Start();

            for (int i = 0; i < docs.Count; i++)
            {
                senteceNames[i] = docs[i];
            }

            int r    = docs.Count; // number of entries = rows of the signature matrix
            int n    = 100;        // columns of the signature matrix; also handed to MinHash
            int rows = 5;          // handed to LSH

            // Phase 1: one shingle vector per entry.
            int[][] shingleVectors = new int[r][];
            for (int i = 0; i < r; i++)
            {
                shingleVectors[i] = GetShingleVec(docs[i]).ToArray();
            }

            MinHash mh = new MinHash(r, n);

            // Phase 2: one MinHash signature per entry, stored row-wise.
            int[,] minhashes = new int[r, n];
            for (int i = 0; i < r; i++)
            {
                List <int>  doc = shingleVectors[i].ToList();
                List <uint> hvs = mh.GetMinHash(doc).ToList();
                for (int j = 0; j < hvs.Count; j++)
                {
                    minhashes[i, j] = (int)hvs[j];
                }
            }

            OFF.Add(0);
            int conCount = 0;

            LSH lsh = new LSH(minhashes, rows);

            lsh.Calc();
            int idx = 0;

            // GetLength(0) is the row count. The previous bound, GetUpperBound(0), is the last
            // valid index (row count - 1); used as an exclusive limit it skipped the final entry,
            // so that entry never got a neighbour list or an OFF element.
            int entryCount = minhashes.GetLength(0);
            for (int k = 0; k < entryCount; k++)
            {
                List <int> nearest = lsh.GetNearest(k);
                if (!nodes.Contains(k))
                {
                    nodes.Add(k);
                }

                foreach (int i in nearest)
                {
                    if (!nodes.Contains(i))
                    {
                        nodes.Add(i);
                    }

                    // NOTE(review): idx is only ever incremented together with conCount, so this
                    // skips a neighbour whose index equals the number of edges stored so far.
                    // That looks like it was meant to be a self-loop check (i == k) - confirm
                    // before changing; behaviour is kept as-is here.
                    if (i == idx)
                    {
                        continue;
                    }

                    NB.Add(i);

                    // Edge weight sign: same-group pairs get 1, cross-group pairs get -0.5.
                    if (Helpers.AreFromSameGroup(k, i, offsets))
                    {
                        SIGN.Add(1);
                    }
                    else
                    {
                        SIGN.Add(-0.5f);
                    }

                    conCount++;
                    ++idx;
                }

                OFF.Add(conCount);
            }

            sw.Stop();
            Console.WriteLine(sw.ElapsedMilliseconds / (double)1000);
        }