/// <summary>
/// Runs the cluster optimizer for every cluster count in the configured
/// [Min, Max] range and persists the best-scoring result.
/// </summary>
/// <param name="context">EF context used for status updates and saving.</param>
/// <param name="clusterCalculationEntity">Calculation row holding the range/iteration settings.</param>
/// <returns>The winning <see cref="Cluster"/> (highest GlobalSi, ties by GlobalClusterSiAverage).</returns>
private static Cluster OptimizeRange(SvdEntities context, ClusterCalculation clusterCalculationEntity)
{
    try
    {
        var random = new Random();

        Cluster.SetCalculationStatus(context, clusterCalculationEntity, Contracts.ClusterCalculationStatus.Clustering);

        // Try every candidate cluster count in the inclusive range.
        var candidateCount = (clusterCalculationEntity.MaximumClusterCount - clusterCalculationEntity.MinimumClusterCount) + 1;

        var candidates = Enumerable
            .Range(clusterCalculationEntity.MinimumClusterCount, candidateCount)
            .Select(k => Optimize(
                random,
                clusterCalculationEntity.JobId,
                k,
                clusterCalculationEntity.IterationsPerCluster,
                clusterCalculationEntity.MaximumOptimizationsCount))
            .ToList();

        // Pick the best silhouette score; break ties on the per-cluster average.
        var best = candidates
            .OrderByDescending(c => c.GlobalSi)
            .ThenByDescending(c => c.GlobalClusterSiAverage)
            .First();

        best.Save(context, clusterCalculationEntity);

        return best;
    }
    catch (Exception)
    {
        // Record the failure, then rethrow so the caller sees the original error.
        Cluster.SetCalculationStatus(context, clusterCalculationEntity, Contracts.ClusterCalculationStatus.Failed);
        throw;
    }
}
// This function will get triggered/executed when a new message is written
// on an Azure Queue called "clusterqueue".
public static void StartClusterAnalysis([QueueTrigger("clusterqueue")] int clusterCalculationId, TextWriter log)
{
    // Run the (potentially long) optimization off the trigger thread.
    // BUGFIX: the original fire-and-forget task swallowed every exception
    // unobserved and never used the injected WebJobs log. Catch inside the
    // task and write the failure so broken runs are diagnosable.
    // (OptimizeRange already marks the calculation row as Failed.)
    Task.Factory.StartNew(() =>
    {
        try
        {
            using (var context = new SvdEntities())
            {
                ClusterOptimizer.OptimizeRange(context, clusterCalculationId);
            }
        }
        catch (Exception ex)
        {
            log.WriteLine($"Cluster calculation {clusterCalculationId} failed: {ex}");
        }
    });
}
/// <summary>
/// Creates and immediately persists a new job for the given document count.
/// </summary>
/// <returns>The tracked <see cref="Job"/> entity (with its database identity).</returns>
public static Job CreateNewJob(SvdEntities context, int docCount)
{
    var newJob = new Job
    {
        DocumentCount = docCount,
        Created = DateTime.Now
    };

    var tracked = context.Jobs.Add(newJob);
    context.SaveChanges();

    return tracked;
}
/// <summary>
/// Marks the job as running SVD, computes the singular value decomposition
/// of the term/document matrix, and logs how long the computation took.
/// </summary>
/// <param name="context">EF context used for the status update.</param>
/// <param name="job">Job whose status is advanced to <c>JobStatus.Svd</c>.</param>
/// <param name="termDocMatrix">Dense term/document matrix to decompose.</param>
/// <returns>The MathNet SVD result.</returns>
public static Svd<float> GetSvd(SvdEntities context, Job job, DenseMatrix termDocMatrix)
{
    // IMPROVED: Stopwatch is a monotonic timer; the old DateTime.Now
    // subtraction can mis-report elapsed time across clock adjustments.
    var timer = Stopwatch.StartNew();

    SetJobStatus(context, job, JobStatus.Svd);

    var svd = termDocMatrix.Svd();

    Debug.WriteLine($"SVD Calc Time: {timer.Elapsed.TotalMilliseconds} Milliseconds");

    return svd;
}
// Ensures the MatrixContainer for a job is cached, rebuilding it from the
// serialized U/V matrices if needed, then publishes it via the
// MatrixContainer property.
// NOTE(review): the check-then-populate on _matrixContainers is not
// synchronized inside this method — confirm all callers serialize access,
// otherwise concurrent calls may rebuild the container twice.
public static void GetMatrixContainer(int jobId)
{
    if (!_matrixContainers.ContainsKey(jobId))
    {
        using (var context = new SvdEntities())
        {
            var job = context.Jobs.Find(jobId);

            // NOTE(review): BinaryFormatter is deprecated and unsafe for
            // untrusted input; these payloads come from our own database,
            // but consider migrating to a safer serializer.
            var binaryFormatter = new BinaryFormatter();

            DenseMatrix newUMatrix = null;
            DenseMatrix newVMatrix = null;

            // Rehydrate U as (term count x dimensions).
            using (var ms = new MemoryStream(job.UMatrix.SerializedValues))
            {
                var uValues = binaryFormatter.Deserialize(ms) as float[];
                newUMatrix = new DenseMatrix(job.JobTerms.Count, job.Dimensions, uValues);
            }

            // Rehydrate V as (dimensions x document count).
            using (var ms = new MemoryStream(job.VMatrix.SerializedValues))
            {
                var vValues = binaryFormatter.Deserialize(ms) as float[];
                newVMatrix = new DenseMatrix(job.Dimensions, job.JobDocuments.Count, vValues);
            }

            // Calc Distance Map
            // Pairwise cosine distance between document column vectors.
            // Note the full (symmetric) matrix is computed, both halves.
            var distanceMap = new float[newVMatrix.ColumnCount, newVMatrix.ColumnCount];

            Enumerable.Range(0, newVMatrix.ColumnCount).AsParallel().ForAll(i =>
            {
                for (var m = 0; m < newVMatrix.ColumnCount; m++)
                {
                    distanceMap[i, m] = Distance.Cosine(newVMatrix.Column(i).ToArray(), newVMatrix.Column(m).ToArray());
                }
            });

            _matrixContainers[jobId] = new MatrixContainer()
            {
                Dimensions = job.Dimensions,
                // Order by ordinal index so list positions line up with
                // matrix rows/columns.
                DocNameMap = job.JobDocuments.OrderBy(jd => jd.OrdinalIndex).Select(d => d.Document.Name).ToList(),
                Terms = job.JobTerms.OrderBy(jt => jt.OrdinalIndex).Select(t => t.Term.Value).ToList(),
                UMatrix = newUMatrix,
                VMatrix = newVMatrix,
                DistanceMap = distanceMap
            };
        }
    }

    MatrixContainer = _matrixContainers[jobId];
}
/// <summary>
/// Advances a cluster calculation to the given status, stamping lifecycle
/// timestamps (Created on New; Completed on Completed/Failed), and saves.
/// </summary>
public static void SetCalculationStatus(SvdEntities context, ClusterCalculation clusterCalculationEntity, Contracts.ClusterCalculationStatus status)
{
    switch (status)
    {
        case Contracts.ClusterCalculationStatus.New:
            clusterCalculationEntity.Created = DateTime.Now;
            break;

        case Contracts.ClusterCalculationStatus.Completed:
        case Contracts.ClusterCalculationStatus.Failed:
            clusterCalculationEntity.Completed = DateTime.Now;
            break;
    }

    clusterCalculationEntity.Status = status;
    context.SaveChanges();
}
/// <summary>
/// Fetches a job by id, optionally warming the matrix cache in the
/// background (the lock serializes concurrent loads).
/// </summary>
/// <param name="preLoadMatrixContainer">When true, kicks off a background LoadMatrices call.</param>
public static Job GetJob(SvdEntities context, int id, bool preLoadMatrixContainer = true)
{
    if (preLoadMatrixContainer)
    {
        // Fire-and-forget cache warm-up; LoadMatrices is idempotent per job.
        Task.Factory.StartNew(() =>
        {
            lock (locker)
            {
                LoadMatrices(id);
            }
        });
    }

    return context.Jobs.Find(id);
}
/// <summary>
/// Persists a new cluster calculation row from the supplied parameters and
/// initializes it to the New status (which stamps Created).
/// </summary>
/// <returns>The tracked <see cref="ClusterCalculation"/> entity.</returns>
public static ClusterCalculation CreateCalculation(SvdEntities context, Contracts.ClusterCalculationParameters clusterParams)
{
    var entity = new ClusterCalculation
    {
        JobId = clusterParams.JobId.GetValueOrDefault(),
        MinimumClusterCount = clusterParams.MinimumClusterCount,
        MaximumClusterCount = clusterParams.MaximumClusterCount,
        IterationsPerCluster = clusterParams.IterationsPerCluster,
        MaximumOptimizationsCount = clusterParams.MaximumOptimizationsCount,
    };

    var tracked = context.ClusterCalculations.Add(entity);

    // SetCalculationStatus saves; the trailing SaveChanges mirrors the
    // original call sequence.
    SetCalculationStatus(context, tracked, Contracts.ClusterCalculationStatus.New);
    context.SaveChanges();

    return tracked;
}
// Lazily loads and caches the per-job U and V matrices from their
// BinaryFormatter-serialized database blobs.
// NOTE(review): the ContainsKey checks here are not synchronized — GetJob
// wraps this call in a lock, but confirm no other unlocked call path exists.
public static void LoadMatrices(int jobId)
{
    if (!_vMatrices.ContainsKey(jobId) || !_uMatrices.ContainsKey(jobId))
    {
        using (var context = new SvdEntities())
        {
            Job job = null;

            if (!_vMatrices.ContainsKey(jobId))
            {
                job = context.Jobs.Find(jobId);

                // V is rehydrated as a (Dimensions x DocumentCount) matrix.
                using (var ms = new MemoryStream(job.VMatrix.SerializedValues))
                {
                    var vValues = _binaryFormatter.Deserialize(ms) as float[];
                    _vMatrices[jobId] = new DenseMatrix(job.Dimensions, job.JobDocuments.Count, vValues);
                }
            }

            if (!_uMatrices.ContainsKey(jobId))
            {
                // Reuse the job fetched above when both matrices need loading.
                if (job == null)
                {
                    job = context.Jobs.Find(jobId);
                }

                // U is rehydrated as a (TermCount x Dimensions) matrix.
                using (var ms = new MemoryStream(job.UMatrix.SerializedValues))
                {
                    var uValues = _binaryFormatter.Deserialize(ms) as float[];
                    _uMatrices[jobId] = new DenseMatrix(job.JobTerms.Count, job.Dimensions, uValues);
                }
            }
        }
    }
}
/// <summary>
/// Returns how many term/document count rows exist for the given document.
/// </summary>
public static int GetTotalTermDocCount(SvdEntities context, int documentId)
{
    var rowsForDocument = context.TermDocumentCounts.Where(tdc => tdc.DocumentId == documentId);
    return rowsForDocument.Count();
}
/// <summary>
/// Looks up a document by primary key; returns null if it does not exist.
/// </summary>
public static Document GetDocument(SvdEntities context, int documentId)
{
    // Find() consults the context's change tracker before hitting the database.
    return context.Documents.Find(documentId);
}
// Returns documents ordered by name.
// NOTE(review): `page` and `docsPerPage` are currently IGNORED — the paging
// expression is commented out, so every document is returned on every call.
// Confirm whether paging should be re-enabled or the parameters removed.
public static List <Document> GetDocuments(SvdEntities context, int page, int docsPerPage)
{
    return(context.Documents.OrderBy(i => i.Name) /*.Skip(page * docsPerPage).Take(docsPerPage)*/.ToList());
}
/// <summary>
/// Materializes every job row into memory.
/// </summary>
public static List<Job> GetJobs(SvdEntities context)
{
    var allJobs = context.Jobs;
    return allJobs.ToList();
}
/// <summary>
/// Persists this clustering result: updates the calculation row, bulk-inserts
/// the cluster rows (with serialized center vectors), then bulk-inserts the
/// per-cluster document and top-20-term associations, and marks the
/// calculation Completed.
/// </summary>
/// <param name="context">EF context used for updates and bulk inserts.</param>
/// <param name="clusterCalculationEntity">The calculation row this result belongs to.</param>
public void Save(SvdEntities context, ClusterCalculation clusterCalculationEntity)
{
    var binaryFormatter = new BinaryFormatter();

    // Lookups: ordinal document index -> JobDocument, term text -> JobTerm.
    var jobDocs = context.JobDocuments.Where(jd => jd.JobId == JobId).ToLookup(jd => jd.OrdinalIndex);
    var jobTerms = context.JobTerms.Where(jd => jd.JobId == JobId).ToLookup(jt => jt.Term.Value);
    var clusterEntities = new Dictionary<int, Engine.Cluster>();

    clusterCalculationEntity.ClusterCount = Clusters;
    clusterCalculationEntity.GlobalSi = GlobalSi;
    clusterCalculationEntity.ClusterSi = GlobalClusterSiAverage;

    // Update Cluster Calculation
    context.SaveChanges();

    // Serialize each cluster's center vector and build its entity row.
    Enumerable.Range(0, Clusters).ToList().ForEach(cluster =>
    {
        using (var memoryStreamCenterVector = new MemoryStream())
        {
            binaryFormatter.Serialize(memoryStreamCenterVector, Centers[cluster]);
            memoryStreamCenterVector.Position = 0;

            clusterEntities.Add(cluster, new Engine.Cluster()
            {
                JobId = JobId,
                ClusterCalculationId = clusterCalculationEntity.Id,
                Si = ClusterSiAverages[cluster],
                CenterVectorSerialized = memoryStreamCenterVector.ToArray()
            });
        }
    });

    // Insert Clusters
    context.BulkInsert(clusterEntities.Select(kvp => kvp.Value));

    var clusterJobDocumentEntities = new ConcurrentBag<ClusterJobDocument>();
    var clusterJobTermEntities = new ConcurrentBag<ClusterJobTerm>();

    clusterEntities.AsParallel().ForAll(clusterEntity =>
    {
        var termDistanceMap = new Dictionary<string, float>();
        var centerVector = Centers[clusterEntity.Key];

        // Link every document assigned to this cluster to its JobDocument row.
        foreach (var kvp in ClusterMap.Where(kvp => kvp.Value == clusterEntity.Key))
        {
            var docIndex = kvp.Key;

            // BUGFIX: an ILookup indexer never returns null for a missing
            // key — it returns an empty sequence — so the old "!= null"
            // guard couldn't stop First() from throwing. Use FirstOrDefault.
            var jobDocument = jobDocs[docIndex].FirstOrDefault();

            if (jobDocument != null)
            {
                clusterJobDocumentEntities.Add(new ClusterJobDocument()
                {
                    ClusterCalculationId = clusterCalculationEntity.Id,
                    ClusterId = clusterEntity.Value.Id,
                    JobId = JobId,
                    Si = DocumentSi.ContainsKey(docIndex) ? DocumentSi[docIndex] : 0,
                    JobDocumentId = jobDocument.Id
                });
            }
        }

        // Cosine distance from every term vector (U row) to this center.
        for (var i = 0; i < LSA.MatrixContainer.UMatrix.RowCount; i++)
        {
            termDistanceMap[LSA.MatrixContainer.Terms[i]] = Distance.Cosine(centerVector, LSA.MatrixContainer.UMatrix.Row(i).ToArray());
        }

        // Keep the 20 terms closest to the center (smallest distance).
        foreach (var term in termDistanceMap.OrderBy(t => t.Value).Take(20))
        {
            // Same ILookup fix as above.
            var jobTerm = jobTerms[term.Key].FirstOrDefault();

            if (jobTerm != null)
            {
                clusterJobTermEntities.Add(new ClusterJobTerm()
                {
                    ClusterCalculationId = clusterCalculationEntity.Id,
                    ClusterId = clusterEntity.Value.Id,
                    JobId = JobId,
                    JobTermId = jobTerm.Id,
                    DistanceToClusterCenter = term.Value
                });
            }
        }
    });

    // Insert Cluster Documents & Terms
    context.BulkInsert(clusterJobTermEntities);
    context.BulkInsert(clusterJobDocumentEntities);

    SetCalculationStatus(context, clusterCalculationEntity, Contracts.ClusterCalculationStatus.Completed);
}
/// <summary>
/// Full pipeline for a job: builds the term/document matrix, runs SVD,
/// truncates to at most 300 dimensions, serializes the scaled U and V
/// matrices to the database, and marks the job Completed (or Failed).
/// </summary>
/// <param name="jobId">Id of an existing job row.</param>
/// <param name="docIds">Ids of the documents to include.</param>
public static void ProcessAndStore(int jobId, IEnumerable<int> docIds)
{
    using (var context = new SvdEntities())
    {
        Job job = null;
        var _docIds = docIds.ToArray();

        try
        {
            job = context.Jobs.Find(jobId);

            // Process
            var matrix = GetTermDocMatrix(context, job, _docIds);
            var svd = GetSvd(context, job, matrix);

            // Cap the reduced rank at 300 dimensions.
            var dimensions = svd.S.Count <= 300 ? svd.S.Count : 300;

            // NOTE(review): BinaryFormatter is deprecated/unsafe for
            // untrusted input; these blobs are internal, but consider
            // migrating to a safer serializer.
            var binaryFormatter = new BinaryFormatter();

            // Reduction Step - U Table: scale each kept U column by its
            // singular value.
            var newUMatrix = new DenseMatrix(matrix.RowCount, dimensions);

            for (var i = 0; i < dimensions; i++)
            {
                for (var m = 0; m < matrix.RowCount; m++)
                {
                    newUMatrix[m, i] = svd.U[m, i] * svd.S[i];
                }
            }

            using (var memoryStreamU = new MemoryStream())
            {
                binaryFormatter.Serialize(memoryStreamU, newUMatrix.Values);
                memoryStreamU.Position = 0;

                context.UMatrices.Add(new UMatrix()
                {
                    Job = job,
                    SerializedValues = memoryStreamU.ToArray()
                });
            }

            // Reduction Step - V Table: scale each kept VT row by its
            // singular value.
            var newVMatrix = new DenseMatrix(dimensions, _docIds.Length);

            for (var i = 0; i < dimensions; i++)
            {
                for (var m = 0; m < _docIds.Length; m++)
                {
                    newVMatrix[i, m] = svd.VT[i, m] * svd.S[i];
                }
            }

            using (var memoryStreamV = new MemoryStream())
            {
                binaryFormatter.Serialize(memoryStreamV, newVMatrix.Values);
                memoryStreamV.Position = 0;

                context.VMatrices.Add(new VMatrix()
                {
                    Job = job,
                    SerializedValues = memoryStreamV.ToArray()
                });
            }

            job.Dimensions = dimensions;
            job.Completed = DateTime.Now;
            job.Status = JobStatus.Completed;

            context.SaveChanges();
        }
        catch (Exception)
        {
            // BUGFIX: `job` can still be null here (Jobs.Find failed or
            // returned no row). The old code dereferenced it unconditionally,
            // masking the original exception with a NullReferenceException.
            if (job != null)
            {
                job.Status = JobStatus.Failed;
                job.Completed = DateTime.Now;
                context.SaveChanges();
            }

            throw;
        }
    }
}
/// <summary>
/// Sets the job's status and persists the change immediately.
/// </summary>
public static void SetJobStatus(SvdEntities context, Job job, JobStatus status)
{
    job.Status = status;

    context.SaveChanges();
}
/// <summary>
/// Creates a new persisted calculation from the supplied parameters and
/// runs the range optimizer on it.
/// </summary>
public static Cluster OptimizeRange(SvdEntities context, Contracts.ClusterCalculationParameters clusterAnalysisParameters)
{
    var calculationEntity = Cluster.CreateCalculation(context, clusterAnalysisParameters);
    return OptimizeRange(context, calculationEntity);
}
/// <summary>
/// Builds the dense term/document count matrix for a job: registers the
/// job's documents, parses term counts out of each HTML file (reusing stored
/// counts where available), persists new documents/terms/counts, and returns
/// a (terms x documents) matrix of raw counts.
/// </summary>
/// <param name="context">EF context for reads and inserts.</param>
/// <param name="job">Job the documents/terms are attached to.</param>
/// <param name="docIds">Ids of the documents to include as columns.</param>
/// <returns>Dense matrix: rows = distinct non-excluded terms, columns = documents.</returns>
public static DenseMatrix GetTermDocMatrix(SvdEntities context, Job job, IEnumerable<int> docIds)
{
    var termLookup = GetTerms(context).ToLookup(t => t.Value);

    SetJobStatus(context, job, JobStatus.BuildingMatrix);

    var readFilesStart = DateTime.Now;
    var _docIds = docIds.ToArray();

    var files = context.Documents.Where(d => _docIds.Contains(d.Id)).Select(d => d.Name).ToList();

    var newDocuments = new List<Document>();
    var jobDocuments = new List<JobDocument>();
    var termDocCounts = new List<TermDocumentCount>();
    var documentLookup = context.Documents.ToLookup(d => d.Name);

    // IMPROVED: O(1) name -> column index map (the old code called
    // List.IndexOf inside loops, which is O(n) per call). Keeping the FIRST
    // occurrence matches IndexOf semantics.
    var fileIndexByName = new Dictionary<string, int>();
    for (var f = 0; f < files.Count; f++)
    {
        if (!fileIndexByName.ContainsKey(files[f]))
        {
            fileIndexByName[files[f]] = f;
        }
    }

    // Create Documents
    foreach (var file in files)
    {
        var docEntity = documentLookup[file].FirstOrDefault();

        if (docEntity == null)
        {
            docEntity = new Document() { Name = file };
            newDocuments.Add(docEntity);
        }
        else
        {
            // Reuse counts already stored for this document.
            termDocCounts.AddRange(docEntity.TermDocumentCounts);
        }

        jobDocuments.Add(new JobDocument()
        {
            Job = job,
            Document = docEntity,
            OrdinalIndex = fileIndexByName[file]
        });
    }

    context.Documents.AddRange(newDocuments);
    context.JobDocuments.AddRange(jobDocuments);
    context.SaveChanges();

    // Setup Parallel Collections
    var termDocCountsBagCalculated = new ConcurrentBag<TermDocumentCount>();

    // NOTE(review): this lambda lazily reads EF navigation properties from
    // multiple threads through a shared context — EF contexts are not
    // thread-safe; confirm the entities are fully loaded before this runs.
    jobDocuments.AsParallel().ForAll((jobDocumentEntity) =>
    {
        if (jobDocumentEntity.Document.TermDocumentCounts.Count == 0)
        {
            var html = File.ReadAllText(jobDocumentEntity.Document.Name, Encoding.UTF8);

            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(HttpUtility.HtmlDecode(html));

            doc.DocumentNode.SelectNodes("//body//text()").ToList().ForEach(node =>
            {
                var text = node.InnerText.Trim();

                if (!string.IsNullOrEmpty(text) && !string.IsNullOrWhiteSpace(text))
                {
                    // Strip everything except letters, digits, whitespace, '-'.
                    var chars = text.Where(c => (
                        char.IsLetterOrDigit(c) ||
                        char.IsWhiteSpace(c) ||
                        c == '-'))
                        .ToArray();

                    text = new string(chars);

                    ParseDocumentData(text, jobDocumentEntity.Document, termDocCountsBagCalculated, termLookup);
                }
            });
        }
    });

    // Build New Term/Doc Count Entites
    // BUGFIX: materialize the query ONCE. The old deferred query was
    // enumerated twice (once by each AddRange below), creating two distinct
    // sets of TermDocumentCount objects — the entities persisted to the
    // context were NOT the objects used to build the matrix.
    var newTdc = (from tdc in termDocCountsBagCalculated
                  group tdc by new { DocumentId = tdc.Document.Id, TermId = tdc.Term.Id } into g
                  let first = g.First()
                  select new TermDocumentCount()
                  {
                      Document = first.Document,
                      Term = first.Term,
                      DocumentId = g.Key.DocumentId,
                      TermId = g.Key.TermId,
                      Count = g.Count()
                  }).ToList();

    context.TermDocumentCounts.AddRange(newTdc);
    termDocCounts.AddRange(newTdc);

    // Remove Exclusions from saved list
    termDocCounts = termDocCounts.Where(tdc => !Exclusions.Contains(tdc.Term.Value)).ToList();

    // Save Job Terms
    var termsList = termDocCounts.Select(tdc => tdc.Term.Value).Distinct().ToList();

    // O(1) term -> row index map (values are Distinct, so keys are unique).
    var termIndexByValue = new Dictionary<string, int>();
    for (var t = 0; t < termsList.Count; t++)
    {
        termIndexByValue[termsList[t]] = t;
    }

    var jobTerms = from t in termsList
                   let termEntity = termLookup[t].First()
                   select new JobTerm()
                   {
                       Job = job,
                       TermId = termEntity.Id,
                       OrdinalIndex = termIndexByValue[t]
                   };

    context.JobTerms.AddRange(jobTerms);

    // Build Final Term/Doc Matrix
    var matrix = new DenseMatrix(termsList.Count, _docIds.Length);

    foreach (var termDocCount in termDocCounts)
    {
        matrix[termIndexByValue[termDocCount.Term.Value], fileIndexByName[termDocCount.Document.Name]] = termDocCount.Count;
    }

    Debug.WriteLine($"Read File Calc Time: {DateTime.Now.Subtract(readFilesStart).TotalMilliseconds} Milliseconds");

    return matrix;
}
/// <summary>
/// Materializes the full term table; callers typically build a lookup from it.
/// </summary>
public static IEnumerable<Term> GetTerms(SvdEntities context)
{
    var allTerms = context.Terms.ToList();
    return allTerms;
}
/// <summary>
/// Fetches a cluster calculation by id; returns null when none exists.
/// </summary>
public static ClusterCalculation Get(SvdEntities context, int clusterCalculationId)
{
    return context.ClusterCalculations
        .Where(cc => cc.Id == clusterCalculationId)
        .FirstOrDefault();
}
/// <summary>
/// Materializes every cluster calculation belonging to the given job.
/// </summary>
public static IEnumerable<ClusterCalculation> GetAll(SvdEntities context, int jobId)
{
    var forJob = context.ClusterCalculations.Where(cc => cc.JobId == jobId);
    return forJob.ToList();
}
/// <summary>
/// Resolves a persisted calculation by id, then runs the shared optimizer.
/// </summary>
public static Cluster OptimizeRange(SvdEntities context, int clusterCalculationId)
{
    var calculation = Cluster.Get(context, clusterCalculationId);
    return OptimizeRange(context, calculation);
}
// Registers an HTML document (by name) and stores per-term counts for it.
// No-op when a document with the same name already exists. Only tokens that
// already exist in the Terms table are counted; unknown tokens are dropped.
public static void CreateDocument(byte[] documentBytes, string documentName)
{
    using (var context = new SvdEntities())
    {
        var document = context.Documents.FirstOrDefault(d => d.Name == documentName);

        if (document == null)
        {
            var termLookup = GetTerms(context).ToLookup(t => t.Value);

            // Bytes are assumed UTF-8 HTML — TODO confirm against callers.
            var html = Encoding.UTF8.GetString(documentBytes);

            document = context.Documents.Add(new Document()
            {
                Name = documentName.Trim('"')
            });

            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(HttpUtility.HtmlDecode(html));

            var termDocCounts = new List <TermDocumentCount>();

            // Walk every text node under <body>, keep letters/digits/space/'-',
            // and record one entry per recognized token occurrence.
            doc.DocumentNode.SelectNodes("//body//text()").ToList().ForEach(node =>
            {
                var text = node.InnerText.Trim();

                if (!string.IsNullOrEmpty(text) && !string.IsNullOrWhiteSpace(text))
                {
                    var chars = text.Where(c => (
                        char.IsLetterOrDigit(c) ||
                        char.IsWhiteSpace(c) ||
                        c == '-'))
                        .ToArray();

                    text = new string(chars);

                    foreach (var _token in text.Trim().Split(' '))
                    {
                        var miniToken = _token.Trim().ToLower();
                        var termList = termLookup[miniToken].ToList();

                        // Only tokens present in the existing term table count.
                        if (!string.IsNullOrEmpty(miniToken) && miniToken != "-" && miniToken != "\n" && termList.Count > 0)
                        {
                            termDocCounts.Add(new TermDocumentCount()
                            {
                                Document = document,
                                Term = termList.First()
                            });
                        }
                    }
                }
            });

            // Collapse occurrences into one row per (document, term) with Count.
            // NOTE(review): document.Id is 0 until SaveChanges, so DocumentId
            // groups on 0 here — harmless for a single document, but verify
            // EF fixes up the FK via the Document navigation property.
            var newTdc = from tdc in termDocCounts
                         group tdc by new { DocumentId = tdc.Document.Id, TermId = tdc.Term.Id } into g
                         let tdc = g.First()
                         select new TermDocumentCount()
                         {
                             Document = tdc.Document,
                             Term = tdc.Term,
                             DocumentId = g.Key.DocumentId,
                             TermId = g.Key.TermId,
                             Count = g.Count()
                         };

            context.TermDocumentCounts.AddRange(newTdc);
            context.SaveChanges();
        }
    }
}