// POST: api/Document
public async Task<HttpResponseMessage> Post()
{
    // Check if the request contains multipart/form-data.
    if (!Request.Content.IsMimeMultipartContent())
    {
        throw new HttpResponseException(HttpStatusCode.UnsupportedMediaType);
    }

    string root = HttpContext.Current.Server.MapPath("~/App_Data");
    var provider = new MultipartFormDataStreamProvider(root);

    try
    {
        // Read the form data.
        await Request.Content.ReadAsMultipartAsync(provider);

        // Extract text from each uploaded file, shingle it, and store its MinHash signature.
        foreach (MultipartFileData file in provider.FileData)
        {
            Document doc = new Document();
            string textExtractionResult = new TextExtractor().Extract(file.LocalFileName).Text;
            List<string> shingles = _minHashes.GetShingles(textExtractionResult);

            doc.ID = documents.Count + 1;
            doc.Name = file.Headers.ContentDisposition.FileName.Replace("\"", "");
            doc.MinHashes = _minHashes.GetMinHash(shingles);
            documents.Add(doc);
        }

        return Request.CreateResponse(HttpStatusCode.OK);
    }
    catch (System.Exception e)
    {
        return Request.CreateErrorResponse(HttpStatusCode.InternalServerError, e);
    }
}
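The Post action above references a Document model that is not shown in this example. Purely as an assumption inferred from how its members are used, a minimal sketch of that model might look like this:

// Hypothetical sketch of the Document model assumed by the Post action above;
// only the members referenced in the example are included, with types inferred from usage.
public class Document
{
    public int ID { get; set; }                // sequential ID assigned as documents.Count + 1
    public string Name { get; set; }           // original file name from the Content-Disposition header
    public List<uint> MinHashes { get; set; }  // MinHash signature computed from the extracted shingles
}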
public void GetMinHashTest_ForDifferentInputs()
{
    MinHash minHash = new MinHash(10);

    List<string> inputOne = new List<string>()
    {
        "hello world i", "world i am", "i am a", "am a string", "a string", "string"
    };
    List<string> inputTwo = new List<string>()
    {
        "not the same", "the same", "same"
    };

    List<uint> result = minHash.GetMinHash(inputOne);
    List<uint> expected = minHash.GetMinHash(inputTwo);

    Assert.AreNotEqual(string.Join(" ", expected), string.Join(" ", result));
}
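The test above only checks that dissimilar shingle lists yield different signatures. As a hedged companion sketch (assuming the same MinHash(10) constructor and GetMinHash(List<string>) API shown above), the converse property can be checked as well: the same input run through the same instance must always produce the same signature.

// Companion sketch (assumption: same MinHash API as in the test above) verifying
// that identical shingle lists always produce identical MinHash signatures.
public void GetMinHashTest_ForIdenticalInputs()
{
    MinHash minHash = new MinHash(10);

    List<string> input = new List<string>()
    {
        "hello world i", "world i am", "i am a", "am a string", "a string", "string"
    };

    // The same instance is used for both calls, so the underlying hash functions
    // are identical and the two signatures must match exactly.
    List<uint> first = minHash.GetMinHash(input);
    List<uint> second = minHash.GetMinHash(input);

    Assert.AreEqual(string.Join(" ", first), string.Join(" ", second));
}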
private static void BuildTitleMinHashes(List<Listing> listings, string job_id)
{
    var universe = new Dictionary<string, int>();
    var wordId = 0;
    var mh = new MinHash(200000, minHashCount);
    var singleItemProgress = ProgressManager.CalculateLoopIncrement(listings.Count(), 0.2M);

    // Build ngrams for each listing (if not done already) and add them to the universe of ngrams.
    foreach (var listing in listings.ToList())
    {
        if (!listing.ngrams_description.Any())
        {
            listing.ngrams_description = mh.GetProfile(listing.description, nGramLength);
        }

        foreach (var ngram in listing.ngrams_description.Keys)
        {
            if (!universe.ContainsKey(ngram))
            {
                universe[ngram] = wordId++;
            }
        }

        ProgressManager.IncrementJobPercentBy(job_id, singleItemProgress);
    }

    singleItemProgress = ProgressManager.CalculateLoopIncrement(listings.Count(), 0.2M);

    // Rebuild the MinHash instance with the actual universe size now that all ngrams are known.
    mh = new MinHash(universe.Count, minHashCount);

    foreach (var listing in listings)
    {
        if (listing.minhash_description.Any())
        {
            continue;
        }

        // Map each ngram of the listing to its word ID in the universe.
        foreach (var ngram in listing.ngrams_description.Keys)
        {
            listing.word_ids_description.Add(universe[ngram]);
        }

        // Calculate the MinHash signature for the listing.
        listing.minhash_description = mh.GetMinHash(listing.word_ids_description);

        ProgressManager.IncrementJobPercentBy(job_id, singleItemProgress);
    }
}
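The helper above relies on several Listing members that are not shown in this example. Purely as an assumption inferred from how they are used, the relevant part of that model might look like the following sketch:

// Hypothetical sketch of the Listing members referenced above (the real class is
// not shown; types are inferred from usage, and GetProfile's value type is assumed).
public class Listing
{
    public string description { get; set; }
    public Dictionary<string, int> ngrams_description { get; set; } = new Dictionary<string, int>();
    public List<int> word_ids_description { get; set; } = new List<int>();
    public List<uint> minhash_description { get; set; } = new List<uint>();
}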
public void GenerateSSGraph(List<string> docs)
{
    /* 1) decompose the input document into sentences (docs holds one sentence per entry)
     * 2) generate the sentence similarity graph via minhashing and LSH
     * 3) describe the graph by the neighborhood list NB and the offset list OFF
     */
    Stopwatch sw = new Stopwatch();
    sw.Start();

    List<string> docsOrg = new List<string>(docs);
    for (int i = 0; i < docs.Count; i++)
    {
        senteceNames[i] = docs[i];
    }

    int r = docs.Count;
    int n = 40;      // number of hash functions (signature length)
    int rows = 2;    // rows per LSH band (b = n / rows bands)

    // Build a shingle vector for each sentence.
    int[][] minHashes = new int[r][];
    for (int i = 0; i < r; i++)
    {
        minHashes[i] = GetShingleVec(docs[i]).ToArray();
    }

    // Compute the MinHash signature matrix (r sentences x n hash functions).
    MinHash mh = new MinHash(r, n);
    int[,] minhashes = new int[r, n];
    for (int i = 0; i < r; i++)
    {
        List<int> doc = minHashes[i].ToList();
        List<uint> hvs = mh.GetMinHash(doc).ToList();
        for (int j = 0; j < hvs.Count; j++)
        {
            minhashes[i, j] = (int)hvs[j];
        }
    }

    // Bucket similar sentences with LSH and build the adjacency (NB) and offset (OFF) lists.
    OFF.Add(0);
    int conCount = 0;
    LSH lsh = new LSH(minhashes, rows);
    lsh.Calc();

    int idx = 0;
    for (int k = 0; k < minhashes.GetUpperBound(0); k++)
    {
        List<int> nearest = lsh.GetNearest(k);
        if (!nodes.Contains(k))
        {
            nodes.Add(k);
        }

        foreach (int i in nearest)
        {
            if (!nodes.Contains(i))
            {
                nodes.Add(i);
            }
            if (i == idx)
            {
                continue;
            }

            NB.Add(i);
            conCount++;
            ++idx;
        }

        OFF.Add(conCount);
    }

    sw.Stop();
    Console.WriteLine(sw.ElapsedMilliseconds / (double)1000);
}
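As a side note on the parameters chosen above (standard MinHash/LSH banding theory, not part of the example itself, and assuming the LSH class bands the signature into n / rows bands of `rows` rows each): two sentences with Jaccard similarity s share at least one LSH bucket with probability 1 - (1 - s^rows)^bands, which is how the pairs (n = 40, rows = 2) and (n = 100, rows = 5) trade recall against precision.

// Sketch of the standard LSH banding collision estimate under the assumption above.
static double CollisionProbability(double jaccard, int n, int rows)
{
    int bands = n / rows;  // e.g. 40 / 2 = 20 bands, or 100 / 5 = 20 bands
    return 1.0 - Math.Pow(1.0 - Math.Pow(jaccard, rows), bands);
}

// Usage: CollisionProbability(0.5, 40, 2) ≈ 0.997, CollisionProbability(0.5, 100, 5) ≈ 0.47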
public void GenerateSSGraphForComparativeSum(List<string> docs, List<int> offsets)
{
    /* Same as GenerateSSGraph, only applied to comparative summarization,
     * where the sentences come from a set of different documents.
     */
    Stopwatch sw = new Stopwatch();
    sw.Start();

    List<string> docsOrg = new List<string>(docs);
    for (int i = 0; i < docs.Count; i++)
    {
        senteceNames[i] = docs[i];
    }

    int r = docs.Count;
    int n = 100;     // number of hash functions (signature length)
    int rows = 5;    // rows per LSH band (n / rows bands)

    // Build a shingle vector for each sentence.
    int[][] minHashes = new int[r][];
    for (int i = 0; i < r; i++)
    {
        minHashes[i] = GetShingleVec(docs[i]).ToArray();
    }

    // Compute the MinHash signature matrix (r sentences x n hash functions).
    MinHash mh = new MinHash(r, n);
    int[,] minhashes = new int[r, n];
    for (int i = 0; i < r; i++)
    {
        List<int> doc = minHashes[i].ToList();
        List<uint> hvs = mh.GetMinHash(doc).ToList();
        for (int j = 0; j < hvs.Count; j++)
        {
            minhashes[i, j] = (int)hvs[j];
        }
    }

    // Bucket similar sentences with LSH and build the signed adjacency lists:
    // edges between sentences of the same source document get weight 1, cross-document edges -0.5.
    OFF.Add(0);
    int conCount = 0;
    LSH lsh = new LSH(minhashes, rows);
    lsh.Calc();

    int idx = 0;
    for (int k = 0; k < minhashes.GetUpperBound(0); k++)
    {
        List<int> nearest = lsh.GetNearest(k);
        if (!nodes.Contains(k))
        {
            nodes.Add(k);
        }

        foreach (int i in nearest)
        {
            if (!nodes.Contains(i))
            {
                nodes.Add(i);
            }
            if (i == idx)
            {
                continue;
            }

            NB.Add(i);
            if (Helpers.AreFromSameGroup(k, i, offsets))
            {
                SIGN.Add(1);
            }
            else
            {
                SIGN.Add(-0.5f);
            }
            conCount++;
            ++idx;
        }

        OFF.Add(conCount);
    }

    sw.Stop();
    Console.WriteLine(sw.ElapsedMilliseconds / (double)1000);
}