Example #1
        public static Job CreateNewJob(SvdEntities context, int docCount)
        {
            var job = context.Jobs.Add(new Job()
            {
                DocumentCount = docCount,
                Created       = DateTime.Now
            });

            context.SaveChanges();

            return job;
        }
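
A minimal usage sketch for CreateNewJob (the caller method and document count below are hypothetical); it assumes the SvdEntities EF context shown in these examples and that Job exposes its database-generated Id.

        public static void CreateNewJobUsage()
        {
            using (var context = new SvdEntities())
            {
                // Hypothetical document count; Job.Id and Job.Created are populated by CreateNewJob.
                var job = CreateNewJob(context, docCount: 25);

                Debug.WriteLine($"Created job {job.Id} at {job.Created}");
            }
        }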
Example #2
        public static void SetCalculationStatus(SvdEntities context, ClusterCalculation clusterCalculationEntity, Contracts.ClusterCalculationStatus status)
        {
            if (status == Contracts.ClusterCalculationStatus.New)
            {
                clusterCalculationEntity.Created = DateTime.Now;
            }

            if (status == Contracts.ClusterCalculationStatus.Completed || status == Contracts.ClusterCalculationStatus.Failed)
            {
                clusterCalculationEntity.Completed = DateTime.Now;
            }

            clusterCalculationEntity.Status = status;
            context.SaveChanges();
        }
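
A short caller sketch for SetCalculationStatus (the calculation id is a placeholder); only the New, Completed, and Failed status members referenced above are assumed to exist.

        public static void SetCalculationStatusUsage()
        {
            using (var context = new SvdEntities())
            {
                // Placeholder id; marking the calculation Completed also stamps its Completed time.
                var calculation = context.ClusterCalculations.Find(1);

                SetCalculationStatus(context, calculation, Contracts.ClusterCalculationStatus.Completed);
            }
        }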
Example #3
        public static ClusterCalculation CreateCalculation(SvdEntities context, Contracts.ClusterCalculationParameters clusterParams)
        {
            var clusterCalculationEntity = context.ClusterCalculations.Add(new ClusterCalculation()
            {
                JobId = clusterParams.JobId.GetValueOrDefault(),
                MinimumClusterCount       = clusterParams.MinimumClusterCount,
                MaximumClusterCount       = clusterParams.MaximumClusterCount,
                IterationsPerCluster      = clusterParams.IterationsPerCluster,
                MaximumOptimizationsCount = clusterParams.MaximumOptimizationsCount,
            });

            SetCalculationStatus(context, clusterCalculationEntity, Contracts.ClusterCalculationStatus.New);

            context.SaveChanges();

            return clusterCalculationEntity;
        }
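
A hypothetical caller sketch for CreateCalculation; it assumes Contracts.ClusterCalculationParameters exposes settable versions of the properties read above, and every value below is a placeholder.

        public static void CreateCalculationUsage()
        {
            using (var context = new SvdEntities())
            {
                var calculation = CreateCalculation(context, new Contracts.ClusterCalculationParameters()
                {
                    JobId                     = 1,   // placeholder job id
                    MinimumClusterCount       = 2,
                    MaximumClusterCount       = 10,
                    IterationsPerCluster      = 5,
                    MaximumOptimizationsCount = 200
                });

                Debug.WriteLine($"Created cluster calculation {calculation.Id}");
            }
        }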
Example #4
        public static void CreateDocument(byte[] documentBytes, string documentName)
        {
            using (var context = new SvdEntities())
            {
                var name     = documentName.Trim('"');
                var document = context.Documents.FirstOrDefault(d => d.Name == name);

                if (document == null)
                {
                    var termLookup = GetTerms(context).ToLookup(t => t.Value);
                    var html       = Encoding.UTF8.GetString(documentBytes);

                    document = context.Documents.Add(new Document()
                    {
                        Name = name
                    });

                    HtmlDocument doc = new HtmlDocument();

                    doc.LoadHtml(HttpUtility.HtmlDecode(html));

                    var termDocCounts = new List<TermDocumentCount>();

                    doc.DocumentNode.SelectNodes("//body//text()").ToList().ForEach(node =>
                    {
                        var text = node.InnerText.Trim();

                        if (!string.IsNullOrWhiteSpace(text))
                        {
                            var chars = text.Where(c => (
                                                       char.IsLetterOrDigit(c) ||
                                                       char.IsWhiteSpace(c) ||
                                                       c == '-'))
                                        .ToArray();

                            text = new string(chars);

                            foreach (var _token in text.Trim().Split(' '))
                            {
                                var miniToken = _token.Trim().ToLower();

                                var termList = termLookup[miniToken].ToList();

                                if (!string.IsNullOrEmpty(miniToken) && miniToken != "-" && miniToken != "\n" && termList.Count > 0)
                                {
                                    termDocCounts.Add(new TermDocumentCount()
                                    {
                                        Document = document,
                                        Term     = termList.First()
                                    });
                                }
                            }
                        }
                    });

                    var newTdc = from tdc in termDocCounts
                                 group tdc by new { DocumentId = tdc.Document.Id, TermId = tdc.Term.Id } into g
                                 let first = g.First()
                                 select new TermDocumentCount()
                                 {
                                     Document   = first.Document,
                                     Term       = first.Term,
                                     DocumentId = g.Key.DocumentId,
                                     TermId     = g.Key.TermId,
                                     Count      = g.Count()
                                 };

                    context.TermDocumentCounts.AddRange(newTdc);
                    context.SaveChanges();
                }
            }
        }
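
A minimal caller sketch for CreateDocument with a placeholder file path; CreateDocument opens its own SvdEntities context, so the caller only supplies the raw HTML bytes and a document name (which doubles as the lookup key).

        public static void CreateDocumentUsage()
        {
            // Placeholder path to an HTML file in the corpus.
            var path  = @"C:\corpus\sample.html";
            var bytes = File.ReadAllBytes(path);

            CreateDocument(bytes, path);
        }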
Example #5
        public static void ProcessAndStore(int jobId, IEnumerable<int> docIds)
        {
            using (var context = new SvdEntities())
            {
                Job job     = null;
                var _docIds = docIds.ToArray();

                try
                {
                    job = context.Jobs.Find(jobId);

                    // Process
                    var matrix = GetTermDocMatrix(context, job, _docIds);
                    var svd    = GetSvd(context, job, matrix);

                    // Cap the reduced rank at 300 dimensions
                    var dimensions = Math.Min(svd.S.Count, 300);

                    var binaryFormatter = new BinaryFormatter();

                    // Reduction Step - U Table

                    var newUMatrix = new DenseMatrix(matrix.RowCount, dimensions);

                    for (var i = 0; i < dimensions; i++)
                    {
                        for (var m = 0; m < matrix.RowCount; m++)
                        {
                            newUMatrix[m, i] = svd.U[m, i] * svd.S[i];
                        }
                    }

                    using (var memoryStreamU = new MemoryStream())
                    {
                        binaryFormatter.Serialize(memoryStreamU, newUMatrix.Values);

                        memoryStreamU.Position = 0;

                        context.UMatrices.Add(new UMatrix()
                        {
                            Job = job,
                            SerializedValues = memoryStreamU.ToArray()
                        });
                    }

                    // Reduction Step - V Table

                    var newVMatrix = new DenseMatrix(dimensions, _docIds.Length);

                    for (var i = 0; i < dimensions; i++)
                    {
                        for (var m = 0; m < _docIds.Length; m++)
                        {
                            newVMatrix[i, m] = svd.VT[i, m] * svd.S[i];
                        }
                    }

                    using (var memoryStreamV = new MemoryStream())
                    {
                        binaryFormatter.Serialize(memoryStreamV, newVMatrix.Values);

                        memoryStreamV.Position = 0;

                        context.VMatrices.Add(new VMatrix()
                        {
                            Job = job,
                            SerializedValues = memoryStreamV.ToArray()
                        });
                    }

                    job.Dimensions = dimensions;
                    job.Completed  = DateTime.Now;
                    job.Status     = JobStatus.Completed;

                    context.SaveChanges();
                }
                catch (Exception)
                {
                    // Guard against a null job (e.g. Jobs.Find returned nothing) so the original
                    // exception is not masked by a NullReferenceException here.
                    if (job != null)
                    {
                        job.Status    = JobStatus.Failed;
                        job.Completed = DateTime.Now;
                        context.SaveChanges();
                    }

                    throw;
                }
            }
        }
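
A hypothetical end-to-end sketch tying the pieces together: pick a set of document ids, create a job for them with CreateNewJob, then run the SVD step. The document selection is a placeholder, and ProcessAndStore opens its own context internally.

        public static void ProcessAndStoreUsage()
        {
            int[] docIds;
            int   jobId;

            using (var context = new SvdEntities())
            {
                // Placeholder selection: the first 25 documents in the corpus.
                docIds = context.Documents.Select(d => d.Id).Take(25).ToArray();
                jobId  = CreateNewJob(context, docIds.Length).Id;
            }

            ProcessAndStore(jobId, docIds);
        }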
Example #6
        public static void SetJobStatus(SvdEntities context, Job job, JobStatus status)
        {
            job.Status = status;
            context.SaveChanges();
        }
Example #7
        public static DenseMatrix GetTermDocMatrix(SvdEntities context, Job job, IEnumerable<int> docIds)
        {
            var termLookup = GetTerms(context).ToLookup(t => t.Value);

            SetJobStatus(context, job, JobStatus.BuildingMatrix);

            var readFilesStart = DateTime.Now;

            var _docIds = docIds.ToArray();
            var files   = context.Documents.Where(d => _docIds.Contains(d.Id)).Select(d => d.Name).ToList();

            var newDocuments   = new List<Document>();
            var jobDocuments   = new List<JobDocument>();
            var termDocCounts  = new List<TermDocumentCount>();
            var documentLookup = context.Documents.ToLookup(d => d.Name);

            // Create Documents
            foreach (var file in files)
            {
                var docEntity = documentLookup[file].FirstOrDefault();

                if (docEntity == null)
                {
                    docEntity = new Document()
                    {
                        Name = file
                    };

                    newDocuments.Add(docEntity);
                }
                else
                {
                    termDocCounts.AddRange(docEntity.TermDocumentCounts);
                }

                jobDocuments.Add(new JobDocument()
                {
                    Job          = job,
                    Document     = docEntity,
                    OrdinalIndex = files.IndexOf(file)
                });
            }

            context.Documents.AddRange(newDocuments);
            context.JobDocuments.AddRange(jobDocuments);

            context.SaveChanges();

            // Setup Parallel Collections

            var termDocCountsBagCalculated = new ConcurrentBag<TermDocumentCount>();

            jobDocuments.AsParallel().ForAll((jobDocumentEntity) =>
            {
                if (jobDocumentEntity.Document.TermDocumentCounts.Count == 0)
                {
                    var html = File.ReadAllText(jobDocumentEntity.Document.Name, Encoding.UTF8);

                    HtmlDocument doc = new HtmlDocument();

                    doc.LoadHtml(HttpUtility.HtmlDecode(html));

                    doc.DocumentNode.SelectNodes("//body//text()").ToList().ForEach(node =>
                    {
                        var text = node.InnerText.Trim();

                        if (!string.IsNullOrWhiteSpace(text))
                        {
                            var chars = text.Where(c => (
                                                       char.IsLetterOrDigit(c) ||
                                                       char.IsWhiteSpace(c) ||
                                                       c == '-'))
                                        .ToArray();

                            text = new string(chars);

                            ParseDocumentData(text, jobDocumentEntity.Document, termDocCountsBagCalculated, termLookup);
                        }
                    });
                }
            });

            // Build New Term/Doc Count Entities (materialized once so the context and the
            // local list share the same entity instances)

            var newTdc = (from tdc in termDocCountsBagCalculated
                          group tdc by new { DocumentId = tdc.Document.Id, TermId = tdc.Term.Id } into g
                          let first = g.First()
                          select new TermDocumentCount()
                          {
                              Document   = first.Document,
                              Term       = first.Term,
                              DocumentId = g.Key.DocumentId,
                              TermId     = g.Key.TermId,
                              Count      = g.Count()
                          }).ToList();

            context.TermDocumentCounts.AddRange(newTdc);
            termDocCounts.AddRange(newTdc);

            // Remove Exclusions from saved list
            termDocCounts = termDocCounts.Where(tdc => !Exclusions.Contains(tdc.Term.Value)).ToList();

            // Save Job Terms

            var termsList = termDocCounts.Select(tdc => tdc.Term.Value).Distinct().ToList();

            var jobTerms = from t in termsList
                           let termEntity = termLookup[t].First()
                           select new JobTerm()
                           {
                               Job          = job,
                               TermId       = termEntity.Id,
                               OrdinalIndex = termsList.IndexOf(t)
                           };

            context.JobTerms.AddRange(jobTerms);

            // Build Final Term/Doc Matrix

            var matrix = new DenseMatrix(termsList.Count, _docIds.Length);

            foreach (var termDocCount in termDocCounts)
            {
                matrix[termsList.IndexOf(termDocCount.Term.Value), files.IndexOf(termDocCount.Document.Name)] = termDocCount.Count;
            }

            Debug.WriteLine($"Read File Calc Time: {DateTime.Now.Subtract(readFilesStart).TotalMilliseconds} Milliseconds");

            return matrix;
        }
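
A short sketch of driving the matrix builder on its own (the job id and document selection are placeholders). The returned matrix is terms by documents, with each cell holding the raw count of a term in a document.

        public static void GetTermDocMatrixUsage()
        {
            using (var context = new SvdEntities())
            {
                var job    = context.Jobs.Find(1);                                    // placeholder job id
                var docIds = context.Documents.Select(d => d.Id).Take(25).ToArray();  // placeholder selection

                var matrix = GetTermDocMatrix(context, job, docIds);

                Debug.WriteLine($"Term/document matrix: {matrix.RowCount} terms x {matrix.ColumnCount} documents");
            }
        }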
Example #8
        public void Save(SvdEntities context, ClusterCalculation clusterCalculationEntity)
        {
            var binaryFormatter = new BinaryFormatter();

            var jobDocs         = context.JobDocuments.Where(jd => jd.JobId == JobId).ToLookup(jd => jd.OrdinalIndex);
            var jobTerms        = context.JobTerms.Where(jt => jt.JobId == JobId).ToLookup(jt => jt.Term.Value);
            var clusterEntities = new Dictionary<int, Engine.Cluster>();

            clusterCalculationEntity.ClusterCount = Clusters;
            clusterCalculationEntity.GlobalSi     = GlobalSi;
            clusterCalculationEntity.ClusterSi    = GlobalClusterSiAverage;

            // Update Cluster Calculation
            context.SaveChanges();

            Enumerable.Range(0, Clusters).ToList().ForEach(cluster =>
            {
                using (var memoryStreamCenterVector = new MemoryStream())
                {
                    binaryFormatter.Serialize(memoryStreamCenterVector, Centers[cluster]);

                    memoryStreamCenterVector.Position = 0;

                    clusterEntities.Add(cluster, new Engine.Cluster()
                    {
                        JobId = JobId,
                        ClusterCalculationId = clusterCalculationEntity.Id,
                        Si = ClusterSiAverages[cluster],
                        CenterVectorSerialized = memoryStreamCenterVector.ToArray()
                    });
                }
            });

            // Insert Clusters
            context.BulkInsert(clusterEntities.Select(kvp => kvp.Value));

            var clusterJobDocumentEntities = new ConcurrentBag<ClusterJobDocument>();
            var clusterJobTermEntities     = new ConcurrentBag<ClusterJobTerm>();

            clusterEntities.AsParallel().ForAll(clusterEntity =>
            {
                using (var memoryStreamCenterVector = new MemoryStream())
                {
                    var termDistanceMap = new Dictionary<string, float>();
                    var centerVector    = Centers[clusterEntity.Key];

                    foreach (var kvp in ClusterMap.Where(kvp => kvp.Value == clusterEntity.Key))
                    {
                        var docIndex    = kvp.Key;
                        var jobDocument = jobDocs[docIndex];

                        if (jobDocument.Any())
                        {
                            clusterJobDocumentEntities.Add(new ClusterJobDocument()
                            {
                                ClusterCalculationId = clusterCalculationEntity.Id,
                                ClusterId            = clusterEntity.Value.Id,
                                JobId         = JobId,
                                Si            = DocumentSi.ContainsKey(docIndex) ? DocumentSi[docIndex] : 0,
                                JobDocumentId = jobDocument.First().Id
                            });
                        }
                    }

                    for (var i = 0; i < LSA.MatrixContainer.UMatrix.RowCount; i++)
                    {
                        termDistanceMap[LSA.MatrixContainer.Terms[i]] = Distance.Cosine(centerVector, LSA.MatrixContainer.UMatrix.Row(i).ToArray());
                    }

                    foreach (var term in termDistanceMap.OrderBy(t => t.Value).Take(20))
                    {
                        var jobTermLookup = jobTerms[term.Key];

                        if (jobTermLookup.Any())
                        {
                            clusterJobTermEntities.Add(new ClusterJobTerm()
                            {
                                ClusterCalculationId = clusterCalculationEntity.Id,
                                ClusterId            = clusterEntity.Value.Id,
                                JobId     = JobId,
                                JobTermId = jobTermLookup.First().Id,
                                DistanceToClusterCenter = term.Value
                            });
                        }
                    }
                }
            });

            // Insert Cluster Documents & Terms
            context.BulkInsert(clusterJobTermEntities);
            context.BulkInsert(clusterJobDocumentEntities);

            SetCalculationStatus(context, clusterCalculationEntity, Contracts.ClusterCalculationStatus.Completed);
        }