public static Job CreateNewJob(SvdEntities context, int docCount)
{
    var job = context.Jobs.Add(new Job()
    {
        DocumentCount = docCount,
        Created = DateTime.Now
    });

    context.SaveChanges();

    return job;
}
public static void SetCalculationStatus(SvdEntities context, ClusterCalculation clusterCalculationEntity, Contracts.ClusterCalculationStatus status)
{
    // Stamp lifecycle timestamps as the calculation moves through its states.
    if (status == Contracts.ClusterCalculationStatus.New)
    {
        clusterCalculationEntity.Created = DateTime.Now;
    }

    if (status == Contracts.ClusterCalculationStatus.Completed ||
        status == Contracts.ClusterCalculationStatus.Failed)
    {
        clusterCalculationEntity.Completed = DateTime.Now;
    }

    clusterCalculationEntity.Status = status;

    context.SaveChanges();
}
public static ClusterCalculation CreateCalculation(SvdEntities context, Contracts.ClusterCalculationParameters clusterParams)
{
    var clusterCalculationEntity = context.ClusterCalculations.Add(new ClusterCalculation()
    {
        JobId = clusterParams.JobId.GetValueOrDefault(),
        MinimumClusterCount = clusterParams.MinimumClusterCount,
        MaximumClusterCount = clusterParams.MaximumClusterCount,
        IterationsPerCluster = clusterParams.IterationsPerCluster,
        MaximumOptimizationsCount = clusterParams.MaximumOptimizationsCount
    });

    // SetCalculationStatus already calls SaveChanges, which persists the new entity,
    // so no second SaveChanges is needed here.
    SetCalculationStatus(context, clusterCalculationEntity, Contracts.ClusterCalculationStatus.New);

    return clusterCalculationEntity;
}
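// Usage sketch (illustrative, not from the original source): creating a job and its
// cluster calculation together. The helper name CreateJobWithCalculation, the docIds
// array, and the parameter values are assumptions; CreateNewJob and CreateCalculation
// are the methods defined above.
public static ClusterCalculation CreateJobWithCalculation(int[] docIds)
{
    using (var context = new SvdEntities())
    {
        var job = CreateNewJob(context, docIds.Length);

        return CreateCalculation(context, new Contracts.ClusterCalculationParameters()
        {
            JobId = job.Id,
            MinimumClusterCount = 2,    // hypothetical values
            MaximumClusterCount = 10,
            IterationsPerCluster = 5,
            MaximumOptimizationsCount = 3
        });
    }
}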
public static void CreateDocument(byte[] documentBytes, string documentName)
{
    using (var context = new SvdEntities())
    {
        var document = context.Documents.FirstOrDefault(d => d.Name == documentName);

        if (document == null)
        {
            var termLookup = GetTerms(context).ToLookup(t => t.Value);
            var html = Encoding.UTF8.GetString(documentBytes);

            document = context.Documents.Add(new Document()
            {
                Name = documentName.Trim('"')
            });

            var doc = new HtmlDocument();
            doc.LoadHtml(HttpUtility.HtmlDecode(html));

            var termDocCounts = new List<TermDocumentCount>();

            // SelectNodes returns null when no text nodes match, so guard before ToList.
            var textNodes = doc.DocumentNode.SelectNodes("//body//text()");

            textNodes?.ToList().ForEach(node =>
            {
                var text = node.InnerText.Trim();

                if (!string.IsNullOrWhiteSpace(text))
                {
                    // Keep only letters, digits, whitespace, and hyphens.
                    var chars = text.Where(c => char.IsLetterOrDigit(c) ||
                                                char.IsWhiteSpace(c) ||
                                                c == '-').ToArray();

                    text = new string(chars);

                    foreach (var token in text.Trim().Split(' '))
                    {
                        var miniToken = token.Trim().ToLower();
                        var termList = termLookup[miniToken].ToList();

                        // Only count tokens that match a known term.
                        if (!string.IsNullOrEmpty(miniToken) && miniToken != "-" && miniToken != "\n" && termList.Count > 0)
                        {
                            termDocCounts.Add(new TermDocumentCount()
                            {
                                Document = document,
                                Term = termList.First()
                            });
                        }
                    }
                }
            });

            // Collapse duplicate term/document pairs into a single row with a count.
            var newTdc = from tdc in termDocCounts
                         group tdc by new { DocumentId = tdc.Document.Id, TermId = tdc.Term.Id } into g
                         let first = g.First()
                         select new TermDocumentCount()
                         {
                             Document = first.Document,
                             Term = first.Term,
                             DocumentId = g.Key.DocumentId,
                             TermId = g.Key.TermId,
                             Count = g.Count()
                         };

            context.TermDocumentCounts.AddRange(newTdc);
            context.SaveChanges();
        }
    }
}
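// Usage sketch (illustrative): CreateDocument takes the raw bytes of an HTML file and a
// name. Because GetTermDocMatrix later reads Document.Name from disk via File.ReadAllText,
// the full file path is assumed to serve as the document name. The helper name
// ImportHtmlFile is hypothetical.
public static void ImportHtmlFile(string path)
{
    CreateDocument(File.ReadAllBytes(path), path);
}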
public static void ProcessAndStore(int jobId, IEnumerable<int> docIds)
{
    using (var context = new SvdEntities())
    {
        Job job = null;
        var _docIds = docIds.ToArray();

        try
        {
            job = context.Jobs.Find(jobId);

            // Process
            var matrix = GetTermDocMatrix(context, job, _docIds);
            var svd = GetSvd(context, job, matrix);

            // Keep at most 300 singular values/dimensions.
            var dimensions = svd.S.Count <= 300 ? svd.S.Count : 300;
            var binaryFormatter = new BinaryFormatter();

            // Reduction Step - U Table: scale each retained column of U by its singular value.
            var newUMatrix = new DenseMatrix(matrix.RowCount, dimensions);

            for (var i = 0; i < dimensions; i++)
            {
                for (var m = 0; m < matrix.RowCount; m++)
                {
                    newUMatrix[m, i] = svd.U[m, i] * svd.S[i];
                }
            }

            using (var memoryStreamU = new MemoryStream())
            {
                binaryFormatter.Serialize(memoryStreamU, newUMatrix.Values);
                memoryStreamU.Position = 0;

                context.UMatrices.Add(new UMatrix()
                {
                    Job = job,
                    SerializedValues = memoryStreamU.ToArray()
                });
            }

            // Reduction Step - V Table: scale each retained row of V-transpose by its singular value.
            var newVMatrix = new DenseMatrix(dimensions, _docIds.Length);

            for (var i = 0; i < dimensions; i++)
            {
                for (var m = 0; m < _docIds.Length; m++)
                {
                    newVMatrix[i, m] = svd.VT[i, m] * svd.S[i];
                }
            }

            using (var memoryStreamV = new MemoryStream())
            {
                binaryFormatter.Serialize(memoryStreamV, newVMatrix.Values);
                memoryStreamV.Position = 0;

                context.VMatrices.Add(new VMatrix()
                {
                    Job = job,
                    SerializedValues = memoryStreamV.ToArray()
                });
            }

            job.Dimensions = dimensions;
            job.Completed = DateTime.Now;
            job.Status = JobStatus.Completed;

            context.SaveChanges();
        }
        catch (Exception)
        {
            // Guard against Find having returned null (or thrown) before the assignment.
            if (job != null)
            {
                job.Status = JobStatus.Failed;
                job.Completed = DateTime.Now;

                context.SaveChanges();
            }

            throw;
        }
    }
}
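// Usage sketch (illustrative): end-to-end flow for one job. RunJob is a hypothetical
// driver, and the query selecting every document id is an assumption; a caller would
// normally pick a specific subset.
public static void RunJob()
{
    int jobId;
    int[] docIds;

    using (var context = new SvdEntities())
    {
        docIds = context.Documents.Select(d => d.Id).ToArray();
        jobId = CreateNewJob(context, docIds.Length).Id;
    }

    // ProcessAndStore opens its own context, looks the job up by id,
    // and marks it Completed or Failed.
    ProcessAndStore(jobId, docIds);
}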
public static void SetJobStatus(SvdEntities context, Job job, JobStatus status)
{
    job.Status = status;
    context.SaveChanges();
}
public static DenseMatrix GetTermDocMatrix(SvdEntities context, Job job, IEnumerable<int> docIds)
{
    var termLookup = GetTerms(context).ToLookup(t => t.Value);

    SetJobStatus(context, job, JobStatus.BuildingMatrix);

    var readFilesStart = DateTime.Now;
    var _docIds = docIds.ToArray();
    var files = context.Documents.Where(d => _docIds.Contains(d.Id)).Select(d => d.Name).ToList();

    var newDocuments = new List<Document>();
    var jobDocuments = new List<JobDocument>();
    var termDocCounts = new List<TermDocumentCount>();

    var documentLookup = context.Documents.ToLookup(d => d.Name);

    // Create Documents
    foreach (var file in files)
    {
        var docEntity = documentLookup[file].FirstOrDefault();

        if (docEntity == null)
        {
            docEntity = new Document()
            {
                Name = file
            };

            newDocuments.Add(docEntity);
        }
        else
        {
            // Reuse the counts already stored for this document.
            termDocCounts.AddRange(docEntity.TermDocumentCounts);
        }

        jobDocuments.Add(new JobDocument()
        {
            Job = job,
            Document = docEntity,
            OrdinalIndex = files.IndexOf(file)
        });
    }

    context.Documents.AddRange(newDocuments);
    context.JobDocuments.AddRange(jobDocuments);

    context.SaveChanges();

    // Setup Parallel Collections
    var termDocCountsBagCalculated = new ConcurrentBag<TermDocumentCount>();

    jobDocuments.AsParallel().ForAll(jobDocumentEntity =>
    {
        // Only parse documents whose counts were not already in the database.
        if (jobDocumentEntity.Document.TermDocumentCounts.Count == 0)
        {
            var html = File.ReadAllText(jobDocumentEntity.Document.Name, Encoding.UTF8);

            var doc = new HtmlDocument();
            doc.LoadHtml(HttpUtility.HtmlDecode(html));

            // SelectNodes returns null when no text nodes match, so guard before ToList.
            var textNodes = doc.DocumentNode.SelectNodes("//body//text()");

            textNodes?.ToList().ForEach(node =>
            {
                var text = node.InnerText.Trim();

                if (!string.IsNullOrWhiteSpace(text))
                {
                    var chars = text.Where(c => char.IsLetterOrDigit(c) ||
                                                char.IsWhiteSpace(c) ||
                                                c == '-').ToArray();

                    text = new string(chars);

                    ParseDocumentData(text, jobDocumentEntity.Document, termDocCountsBagCalculated, termLookup);
                }
            });
        }
    });

    // Build New Term/Doc Count Entities. Materialize the query once so the same
    // entity instances are both tracked by the context and reused below.
    var newTdc = (from tdc in termDocCountsBagCalculated
                  group tdc by new { DocumentId = tdc.Document.Id, TermId = tdc.Term.Id } into g
                  let first = g.First()
                  select new TermDocumentCount()
                  {
                      Document = first.Document,
                      Term = first.Term,
                      DocumentId = g.Key.DocumentId,
                      TermId = g.Key.TermId,
                      Count = g.Count()
                  }).ToList();

    context.TermDocumentCounts.AddRange(newTdc);
    termDocCounts.AddRange(newTdc);

    // Remove Exclusions from saved list
    termDocCounts = termDocCounts.Where(tdc => !Exclusions.Contains(tdc.Term.Value)).ToList();

    // Save Job Terms
    var termsList = termDocCounts.Select(tdc => tdc.Term.Value).Distinct().ToList();

    var jobTerms = from t in termsList
                   let termEntity = termLookup[t].First()
                   select new JobTerm()
                   {
                       Job = job,
                       TermId = termEntity.Id,
                       OrdinalIndex = termsList.IndexOf(t)
                   };

    context.JobTerms.AddRange(jobTerms);

    // Build Final Term/Doc Matrix (rows = terms, columns = documents)
    var matrix = new DenseMatrix(termsList.Count, _docIds.Length);

    foreach (var termDocCount in termDocCounts)
    {
        matrix[termsList.IndexOf(termDocCount.Term.Value), files.IndexOf(termDocCount.Document.Name)] = termDocCount.Count;
    }

    Debug.WriteLine($"Read File Calc Time: {DateTime.Now.Subtract(readFilesStart).TotalMilliseconds} Milliseconds");

    return matrix;
}
public void Save(SvdEntities context, ClusterCalculation clusterCalculationEntity)
{
    var binaryFormatter = new BinaryFormatter();
    var jobDocs = context.JobDocuments.Where(jd => jd.JobId == JobId).ToLookup(jd => jd.OrdinalIndex);
    var jobTerms = context.JobTerms.Where(jt => jt.JobId == JobId).ToLookup(jt => jt.Term.Value);
    var clusterEntities = new Dictionary<int, Engine.Cluster>();

    clusterCalculationEntity.ClusterCount = Clusters;
    clusterCalculationEntity.GlobalSi = GlobalSi;
    clusterCalculationEntity.ClusterSi = GlobalClusterSiAverage;

    // Update Cluster Calculation
    context.SaveChanges();

    Enumerable.Range(0, Clusters).ToList().ForEach(cluster =>
    {
        using (var memoryStreamCenterVector = new MemoryStream())
        {
            binaryFormatter.Serialize(memoryStreamCenterVector, Centers[cluster]);
            memoryStreamCenterVector.Position = 0;

            clusterEntities.Add(cluster, new Engine.Cluster()
            {
                JobId = JobId,
                ClusterCalculationId = clusterCalculationEntity.Id,
                Si = ClusterSiAverages[cluster],
                CenterVectorSerialized = memoryStreamCenterVector.ToArray()
            });
        }
    });

    // Insert Clusters
    context.BulkInsert(clusterEntities.Select(kvp => kvp.Value));

    var clusterJobDocumentEntities = new ConcurrentBag<ClusterJobDocument>();
    var clusterJobTermEntities = new ConcurrentBag<ClusterJobTerm>();

    clusterEntities.AsParallel().ForAll(clusterEntity =>
    {
        var termDistanceMap = new Dictionary<string, float>();
        var centerVector = Centers[clusterEntity.Key];

        // Collect the documents assigned to this cluster.
        foreach (var kvp in ClusterMap.Where(kvp => kvp.Value == clusterEntity.Key))
        {
            var docIndex = kvp.Key;
            var jobDocument = jobDocs[docIndex];

            // A lookup returns an empty sequence (never null) for a missing key.
            if (jobDocument.Any())
            {
                clusterJobDocumentEntities.Add(new ClusterJobDocument()
                {
                    ClusterCalculationId = clusterCalculationEntity.Id,
                    ClusterId = clusterEntity.Value.Id,
                    JobId = JobId,
                    Si = DocumentSi.ContainsKey(docIndex) ? DocumentSi[docIndex] : 0,
                    JobDocumentId = jobDocument.First().Id
                });
            }
        }

        // Rank every term by cosine distance to this cluster's center.
        for (var i = 0; i < LSA.MatrixContainer.UMatrix.RowCount; i++)
        {
            termDistanceMap[LSA.MatrixContainer.Terms[i]] = Distance.Cosine(centerVector, LSA.MatrixContainer.UMatrix.Row(i).ToArray());
        }

        // Keep the 20 terms closest to the center.
        foreach (var term in termDistanceMap.OrderBy(t => t.Value).Take(20))
        {
            var jobTermLookup = jobTerms[term.Key];

            if (jobTermLookup.Any())
            {
                clusterJobTermEntities.Add(new ClusterJobTerm()
                {
                    ClusterCalculationId = clusterCalculationEntity.Id,
                    ClusterId = clusterEntity.Value.Id,
                    JobId = JobId,
                    JobTermId = jobTermLookup.First().Id,
                    DistanceToClusterCenter = term.Value
                });
            }
        }
    });

    // Insert Cluster Documents & Terms
    context.BulkInsert(clusterJobTermEntities);
    context.BulkInsert(clusterJobDocumentEntities);

    SetCalculationStatus(context, clusterCalculationEntity, Contracts.ClusterCalculationStatus.Completed);
}
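// Usage sketch (illustrative): Save appears to be an instance method on the clustering
// engine, called after the clustering computation has populated Clusters, Centers,
// ClusterMap, and the Si fields. The engine type name ClusterEngine and its Run method
// below are assumptions, not part of the original source.
//
// using (var context = new SvdEntities())
// {
//     var calculation = CreateCalculation(context, clusterParams);  // clusterParams as above
//     var engine = new ClusterEngine(jobId);                        // hypothetical engine type
//     engine.Run(calculation.MinimumClusterCount, calculation.MaximumClusterCount);
//     engine.Save(context, calculation);  // persists clusters, then marks the calculation Completed
// }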