Example #1
        private static Cluster OptimizeRange(SvdEntities context, ClusterCalculation clusterCalculationEntity)
        {
            try
            {
                var randGen = new Random();

                Cluster.SetCalculationStatus(context, clusterCalculationEntity, Contracts.ClusterCalculationStatus.Clustering);

                // Run one optimization for every cluster count k in [MinimumClusterCount, MaximumClusterCount]
                var clusters = (from k in Enumerable.Range(clusterCalculationEntity.MinimumClusterCount, (clusterCalculationEntity.MaximumClusterCount - clusterCalculationEntity.MinimumClusterCount) + 1)
                                select Optimize(randGen, clusterCalculationEntity.JobId, k, clusterCalculationEntity.IterationsPerCluster, clusterCalculationEntity.MaximumOptimizationsCount)).ToList();

                // Keep the result with the best global silhouette, breaking ties by the average per-cluster silhouette
                var optimizedCluster = clusters
                                       .OrderByDescending(c => c.GlobalSi)
                                       .ThenByDescending(c => c.GlobalClusterSiAverage).First();

                optimizedCluster.Save(context, clusterCalculationEntity);

                return(optimizedCluster);
            }
            catch (Exception)
            {
                Cluster.SetCalculationStatus(context, clusterCalculationEntity, Contracts.ClusterCalculationStatus.Failed);
                throw;
            }
        }
Example #2
 // This function is triggered/executed when a new message is written
 // to the Azure Storage queue named "clusterqueue".
 public static void StartClusterAnalysis([QueueTrigger("clusterqueue")] int clusterCalculationId, TextWriter log)
 {
     Task.Factory.StartNew(() =>
     {
         using (var context = new SvdEntities())
         {
             ClusterOptimizer.OptimizeRange(context, clusterCalculationId);
         }
     });
 }
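
A note on the trigger above: the function runs whenever a message carrying a cluster calculation id lands on the "clusterqueue" storage queue. Below is a minimal sketch of how such a message could be enqueued with the classic WindowsAzure.Storage client; the ClusterQueuePublisher class, its method name, and the connection-string parameter are illustrative assumptions, not part of the project.

 using Microsoft.WindowsAzure.Storage;
 using Microsoft.WindowsAzure.Storage.Queue;

 public static class ClusterQueuePublisher
 {
     // Hypothetical helper: enqueues a cluster calculation id so that
     // StartClusterAnalysis picks it up from the "clusterqueue" queue.
     public static void EnqueueClusterCalculation(string storageConnectionString, int clusterCalculationId)
     {
         var account = CloudStorageAccount.Parse(storageConnectionString);
         var queue   = account.CreateCloudQueueClient().GetQueueReference("clusterqueue");

         queue.CreateIfNotExists();

         // The WebJobs SDK binds the plain-text message body to the int parameter of the trigger.
         queue.AddMessage(new CloudQueueMessage(clusterCalculationId.ToString()));
     }
 }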
Example #3
        public static Job CreateNewJob(SvdEntities context, int docCount)
        {
            var job = context.Jobs.Add(new Job()
            {
                DocumentCount = docCount,
                Created       = DateTime.Now
            });

            context.SaveChanges();

            return(job);
        }
Example #4
        public static Svd<float> GetSvd(SvdEntities context, Job job, DenseMatrix termDocMatrix)
        {
            var svdStart = DateTime.Now;

            SetJobStatus(context, job, JobStatus.Svd);

            var svd = termDocMatrix.Svd();

            Debug.WriteLine($"SVD Calc Time: {DateTime.Now.Subtract(svdStart).TotalMilliseconds} Milliseconds");

            return(svd);
        }
Example #5
        public static void GetMatrixContainer(int jobId)
        {
            if (!_matrixContainers.ContainsKey(jobId))
            {
                using (var context = new SvdEntities())
                {
                    var job = context.Jobs.Find(jobId);

                    var binaryFormatter = new BinaryFormatter();

                    DenseMatrix newUMatrix = null;
                    DenseMatrix newVMatrix = null;

                    using (var ms = new MemoryStream(job.UMatrix.SerializedValues))
                    {
                        var uValues = binaryFormatter.Deserialize(ms) as float[];

                        newUMatrix = new DenseMatrix(job.JobTerms.Count, job.Dimensions, uValues);
                    }

                    using (var ms = new MemoryStream(job.VMatrix.SerializedValues))
                    {
                        var vValues = binaryFormatter.Deserialize(ms) as float[];

                        newVMatrix = new DenseMatrix(job.Dimensions, job.JobDocuments.Count, vValues);
                    }

                    // Calc Distance Map
                    var distanceMap = new float[newVMatrix.ColumnCount, newVMatrix.ColumnCount];

                    Enumerable.Range(0, newVMatrix.ColumnCount).AsParallel().ForAll(i =>
                    {
                        for (var m = 0; m < newVMatrix.ColumnCount; m++)
                        {
                            distanceMap[i, m] = Distance.Cosine(newVMatrix.Column(i).ToArray(), newVMatrix.Column(m).ToArray());
                        }
                    });

                    _matrixContainers[jobId] = new MatrixContainer()
                    {
                        Dimensions  = job.Dimensions,
                        DocNameMap  = job.JobDocuments.OrderBy(jd => jd.OrdinalIndex).Select(d => d.Document.Name).ToList(),
                        Terms       = job.JobTerms.OrderBy(jt => jt.OrdinalIndex).Select(t => t.Term.Value).ToList(),
                        UMatrix     = newUMatrix,
                        VMatrix     = newVMatrix,
                        DistanceMap = distanceMap
                    };
                }
            }

            MatrixContainer = _matrixContainers[jobId];
        }
Example #6
        public static void SetCalculationStatus(SvdEntities context, ClusterCalculation clusterCalculationEntity, Contracts.ClusterCalculationStatus status)
        {
            if (status == Contracts.ClusterCalculationStatus.New)
            {
                clusterCalculationEntity.Created = DateTime.Now;
            }

            if (status == Contracts.ClusterCalculationStatus.Completed || status == Contracts.ClusterCalculationStatus.Failed)
            {
                clusterCalculationEntity.Completed = DateTime.Now;
            }

            clusterCalculationEntity.Status = status;
            context.SaveChanges();
        }
Example #7
        public static Job GetJob(SvdEntities context, int id, bool preLoadMatrixContainer = true)
        {
            if (preLoadMatrixContainer)
            {
                Task.Factory.StartNew(() =>
                {
                    lock (locker)
                    {
                        LoadMatrices(id);
                    }
                });
            }

            return(context.Jobs.Find(id));
        }
Example #8
        public static ClusterCalculation CreateCalculation(SvdEntities context, Contracts.ClusterCalculationParameters clusterParams)
        {
            var clusterCalculationEntity = context.ClusterCalculations.Add(new ClusterCalculation()
            {
                JobId = clusterParams.JobId.GetValueOrDefault(),
                MinimumClusterCount       = clusterParams.MinimumClusterCount,
                MaximumClusterCount       = clusterParams.MaximumClusterCount,
                IterationsPerCluster      = clusterParams.IterationsPerCluster,
                MaximumOptimizationsCount = clusterParams.MaximumOptimizationsCount,
            });

            SetCalculationStatus(context, clusterCalculationEntity, Contracts.ClusterCalculationStatus.New);

            context.SaveChanges();

            return(clusterCalculationEntity);
        }
Example #9
        public static void LoadMatrices(int jobId)
        {
            if (!_vMatrices.ContainsKey(jobId) || !_uMatrices.ContainsKey(jobId))
            {
                using (var context = new SvdEntities())
                {
                    Job job = null;

                    if (!_vMatrices.ContainsKey(jobId))
                    {
                        job = context.Jobs.Find(jobId);

                        using (var ms = new MemoryStream(job.VMatrix.SerializedValues))
                        {
                            var vValues = _binaryFormatter.Deserialize(ms) as float[];

                            _vMatrices[jobId] = new DenseMatrix(job.Dimensions, job.JobDocuments.Count, vValues);
                        }
                    }

                    if (!_uMatrices.ContainsKey(jobId))
                    {
                        if (job == null)
                        {
                            job = context.Jobs.Find(jobId);
                        }

                        using (var ms = new MemoryStream(job.UMatrix.SerializedValues))
                        {
                            var uValues = _binaryFormatter.Deserialize(ms) as float[];

                            _uMatrices[jobId] = new DenseMatrix(job.JobTerms.Count, job.Dimensions, uValues);
                        }
                    }
                }
            }
        }
Example #10
 public static int GetTotalTermDocCount(SvdEntities context, int documentId)
 {
     return(context.TermDocumentCounts.Count(tdc => tdc.DocumentId == documentId));
 }
Example #11
 public static Document GetDocument(SvdEntities context, int documentId)
 {
     return(context.Documents.Find(documentId));
 }
Example #12
 public static List<Document> GetDocuments(SvdEntities context, int page, int docsPerPage)
 {
     return(context.Documents.OrderBy(i => i.Name) /*.Skip(page * docsPerPage).Take(docsPerPage)*/.ToList());
 }
Example #13
 public static List<Job> GetJobs(SvdEntities context)
 {
     return(context.Jobs.ToList());
 }
Example #14
        public void Save(SvdEntities context, ClusterCalculation clusterCalculationEntity)
        {
            var binaryFormatter = new BinaryFormatter();

            var jobDocs         = context.JobDocuments.Where(jd => jd.JobId == JobId).ToLookup(jd => jd.OrdinalIndex);
            var jobTerms        = context.JobTerms.Where(jd => jd.JobId == JobId).ToLookup(jt => jt.Term.Value);
            var clusterEntities = new Dictionary<int, Engine.Cluster>();

            clusterCalculationEntity.ClusterCount = Clusters;
            clusterCalculationEntity.GlobalSi     = GlobalSi;
            clusterCalculationEntity.ClusterSi    = GlobalClusterSiAverage;

            // Update Cluster Calculation
            context.SaveChanges();

            Enumerable.Range(0, Clusters).ToList().ForEach(cluster =>
            {
                using (var memoryStreamCenterVector = new MemoryStream())
                {
                    binaryFormatter.Serialize(memoryStreamCenterVector, Centers[cluster]);

                    memoryStreamCenterVector.Position = 0;

                    clusterEntities.Add(cluster, new Engine.Cluster()
                    {
                        JobId = JobId,
                        ClusterCalculationId = clusterCalculationEntity.Id,
                        Si = ClusterSiAverages[cluster],
                        CenterVectorSerialized = memoryStreamCenterVector.ToArray()
                    });
                }
            });

            // Insert Clusters
            context.BulkInsert(clusterEntities.Select(kvp => kvp.Value));

            var clusterJobDocumentEntities = new ConcurrentBag<ClusterJobDocument>();
            var clusterJobTermEntities     = new ConcurrentBag<ClusterJobTerm>();

            clusterEntities.AsParallel().ForAll(clusterEntity =>
            {
                using (var memoryStreamCenterVector = new MemoryStream())
                {
                    var termDistanceMap = new Dictionary<string, float>();
                    var centerVector    = Centers[clusterEntity.Key];

                    foreach (var kvp in ClusterMap.Where(kvp => kvp.Value == clusterEntity.Key))
                    {
                        var docIndex    = kvp.Key;
                        var jobDocument = jobDocs[docIndex];

                        if (jobDocument != null)
                        {
                            clusterJobDocumentEntities.Add(new ClusterJobDocument()
                            {
                                ClusterCalculationId = clusterCalculationEntity.Id,
                                ClusterId            = clusterEntity.Value.Id,
                                JobId         = JobId,
                                Si            = DocumentSi.ContainsKey(docIndex) ? DocumentSi[docIndex] : 0,
                                JobDocumentId = jobDocument.First().Id
                            });
                        }
                    }

                    for (var i = 0; i < LSA.MatrixContainer.UMatrix.RowCount; i++)
                    {
                        termDistanceMap[LSA.MatrixContainer.Terms[i]] = Distance.Cosine(centerVector, LSA.MatrixContainer.UMatrix.Row(i).ToArray());
                    }

                    foreach (var term in termDistanceMap.OrderBy(t => t.Value).Take(20))
                    {
                        var jobTermLookup = jobTerms[term.Key];

                        if (jobTermLookup != null)
                        {
                            clusterJobTermEntities.Add(new ClusterJobTerm()
                            {
                                ClusterCalculationId = clusterCalculationEntity.Id,
                                ClusterId            = clusterEntity.Value.Id,
                                JobId     = JobId,
                                JobTermId = jobTermLookup.First().Id,
                                DistanceToClusterCenter = term.Value
                            });
                        }
                    }
                }
            });

            // Insert Cluster Documents & Terms
            context.BulkInsert(clusterJobTermEntities);
            context.BulkInsert(clusterJobDocumentEntities);

            SetCalculationStatus(context, clusterCalculationEntity, Contracts.ClusterCalculationStatus.Completed);
        }
Example #15
        public static void ProcessAndStore(int jobId, IEnumerable<int> docIds)
        {
            using (var context = new SvdEntities())
            {
                Job job     = null;
                var _docIds = docIds.ToArray();

                try
                {
                    job = context.Jobs.Find(jobId);

                    // Process
                    var matrix = GetTermDocMatrix(context, job, _docIds);
                    var svd    = GetSvd(context, job, matrix);

                    var dimensions = svd.S.Count <= 300 ? svd.S.Count : 300;

                    var binaryFormatter = new BinaryFormatter();

                    // Reduction Step - U Table

                    var newUMatrix = new DenseMatrix(matrix.RowCount, dimensions);

                    for (var i = 0; i < dimensions; i++)
                    {
                        for (var m = 0; m < matrix.RowCount; m++)
                        {
                            newUMatrix[m, i] = svd.U[m, i] * svd.S[i];
                        }
                    }

                    using (var memoryStreamU = new MemoryStream())
                    {
                        binaryFormatter.Serialize(memoryStreamU, newUMatrix.Values);

                        memoryStreamU.Position = 0;

                        context.UMatrices.Add(new UMatrix()
                        {
                            Job = job,
                            SerializedValues = memoryStreamU.ToArray()
                        });
                    }

                    // Reduction Step - V Table

                    var newVMatrix = new DenseMatrix(dimensions, _docIds.Length);

                    for (var i = 0; i < dimensions; i++)
                    {
                        for (var m = 0; m < _docIds.Length; m++)
                        {
                            newVMatrix[i, m] = svd.VT[i, m] * svd.S[i];
                        }
                    }

                    using (var memoryStreamV = new MemoryStream())
                    {
                        binaryFormatter.Serialize(memoryStreamV, newVMatrix.Values);

                        memoryStreamV.Position = 0;

                        context.VMatrices.Add(new VMatrix()
                        {
                            Job = job,
                            SerializedValues = memoryStreamV.ToArray()
                        });
                    }

                    job.Dimensions = dimensions;
                    job.Completed  = DateTime.Now;
                    job.Status     = JobStatus.Completed;

                    context.SaveChanges();
                }
                catch (Exception)
                {
                    job.Status    = JobStatus.Failed;
                    job.Completed = DateTime.Now;
                    context.SaveChanges();

                    throw;
                }
            }
        }
Example #16
 public static void SetJobStatus(SvdEntities context, Job job, JobStatus status)
 {
     job.Status = status;
     context.SaveChanges();
 }
Example #17
 public static Cluster OptimizeRange(SvdEntities context, Contracts.ClusterCalculationParameters clusterAnalysisParameters) =>
     OptimizeRange(context, Cluster.CreateCalculation(context, clusterAnalysisParameters));
Example #18
        public static DenseMatrix GetTermDocMatrix(SvdEntities context, Job job, IEnumerable<int> docIds)
        {
            var termLookup = GetTerms(context).ToLookup(t => t.Value);

            SetJobStatus(context, job, JobStatus.BuildingMatrix);

            var readFilesStart = DateTime.Now;

            var _docIds = docIds.ToArray();
            var files   = context.Documents.Where(d => _docIds.Contains(d.Id)).Select(d => d.Name).ToList();

            var newDocuments   = new List<Document>();
            var jobDocuments   = new List<JobDocument>();
            var termDocCounts  = new List<TermDocumentCount>();
            var documentLookup = context.Documents.ToLookup(d => d.Name);

            // Create Documents
            foreach (var file in files)
            {
                var docEntity = documentLookup[file].FirstOrDefault();

                if (docEntity == null)
                {
                    docEntity = new Document()
                    {
                        Name = file
                    };

                    newDocuments.Add(docEntity);
                }
                else
                {
                    termDocCounts.AddRange(docEntity.TermDocumentCounts);
                }

                jobDocuments.Add(new JobDocument()
                {
                    Job          = job,
                    Document     = docEntity,
                    OrdinalIndex = files.IndexOf(file)
                });
            }

            context.Documents.AddRange(newDocuments);
            context.JobDocuments.AddRange(jobDocuments);

            context.SaveChanges();

            // Setup Parallel Collections

            ConcurrentBag<TermDocumentCount> termDocCountsBagCalculated = new ConcurrentBag<TermDocumentCount>();

            jobDocuments.AsParallel().ForAll((jobDocumentEntity) =>
            {
                if (jobDocumentEntity.Document.TermDocumentCounts.Count == 0)
                {
                    var html = File.ReadAllText(jobDocumentEntity.Document.Name, Encoding.UTF8);

                    HtmlDocument doc = new HtmlDocument();

                    doc.LoadHtml(HttpUtility.HtmlDecode(html));

                    doc.DocumentNode.SelectNodes("//body//text()").ToList().ForEach(node =>
                    {
                        var text = node.InnerText.Trim();

                        if (!string.IsNullOrEmpty(text) && !string.IsNullOrWhiteSpace(text))
                        {
                            var chars = text.Where(c => (
                                                       char.IsLetterOrDigit(c) ||
                                                       char.IsWhiteSpace(c) ||
                                                       c == '-'))
                                        .ToArray();

                            text = new string(chars);

                            ParseDocumentData(text, jobDocumentEntity.Document, termDocCountsBagCalculated, termLookup);
                        }
                    });
                }
            });

            // Build New Term/Doc Count Entities

            var newTdc = from tdc in termDocCountsBagCalculated
                         group tdc by new
                         {
                             DocumentId = tdc.Document.Id,
                             TermId     = tdc.Term.Id
                         } into g
                         let tdc = g.First()
                         select new TermDocumentCount()
                         {
                             Document   = tdc.Document,
                             Term       = tdc.Term,
                             DocumentId = g.Key.DocumentId,
                             TermId     = g.Key.TermId,
                             Count      = g.Count()
                         };

            context.TermDocumentCounts.AddRange(newTdc);
            termDocCounts.AddRange(newTdc);

            // Remove Exclusions from saved list
            termDocCounts = termDocCounts.Where(tdc => !Exclusions.Contains(tdc.Term.Value)).ToList();

            // Save Job Terms

            var termsList = termDocCounts.Select(tdc => tdc.Term.Value).Distinct().ToList();

            var jobTerms = from t in termsList
                           let termEntity = termLookup[t].First()
                           select new JobTerm()
                           {
                               Job          = job,
                               TermId       = termEntity.Id,
                               OrdinalIndex = termsList.IndexOf(t)
                           };

            context.JobTerms.AddRange(jobTerms);

            // Build Final Term/Doc Matrix

            var matrix = new DenseMatrix(termsList.Count, _docIds.Length);

            foreach (var termDocCount in termDocCounts)
            {
                matrix[termsList.IndexOf(termDocCount.Term.Value), files.IndexOf(termDocCount.Document.Name)] = termDocCount.Count;
            }

            Debug.WriteLine($"Read File Calc Time: {DateTime.Now.Subtract(readFilesStart).TotalMilliseconds} Milliseconds");

            return(matrix);
        }
Example #19
 public static IEnumerable<Term> GetTerms(SvdEntities context)
 {
     return(context.Terms.ToList());
 }
Example #20
 public static ClusterCalculation Get(SvdEntities context, int clusterCalculationId)
 {
     return(context.ClusterCalculations
            .FirstOrDefault(cc => cc.Id == clusterCalculationId));
 }
Example #21
 public static IEnumerable<ClusterCalculation> GetAll(SvdEntities context, int jobId)
 {
     return(context.ClusterCalculations.Where(cc => cc.JobId == jobId).ToList());
 }
Example #22
 public static Cluster OptimizeRange(SvdEntities context, int clusterCalculationId) =>
     OptimizeRange(context, Cluster.Get(context, clusterCalculationId));
Example #23
        public static void CreateDocument(byte[] documentBytes, string documentName)
        {
            using (var context = new SvdEntities())
            {
                var document = context.Documents.FirstOrDefault(d => d.Name == documentName);

                if (document == null)
                {
                    var termLookup = GetTerms(context).ToLookup(t => t.Value);
                    var html       = Encoding.UTF8.GetString(documentBytes);

                    document = context.Documents.Add(new Document()
                    {
                        Name = documentName.Trim('"')
                    });

                    HtmlDocument doc = new HtmlDocument();

                    doc.LoadHtml(HttpUtility.HtmlDecode(html));

                    var termDocCounts = new List<TermDocumentCount>();

                    doc.DocumentNode.SelectNodes("//body//text()").ToList().ForEach(node =>
                    {
                        var text = node.InnerText.Trim();

                        if (!string.IsNullOrEmpty(text) && !string.IsNullOrWhiteSpace(text))
                        {
                            var chars = text.Where(c => (
                                                       char.IsLetterOrDigit(c) ||
                                                       char.IsWhiteSpace(c) ||
                                                       c == '-'))
                                        .ToArray();

                            text = new string(chars);

                            foreach (var _token in text.Trim().Split(' '))
                            {
                                var miniToken = _token.Trim().ToLower();

                                var termList = termLookup[miniToken].ToList();

                                if (!string.IsNullOrEmpty(miniToken) && miniToken != "-" && miniToken != "\n" && termList.Count > 0)
                                {
                                    termDocCounts.Add(new TermDocumentCount()
                                    {
                                        Document = document,
                                        Term     = termList.First()
                                    });
                                }
                            }
                        }
                    });

                    var newTdc = from tdc in termDocCounts
                                 group tdc by new
                                 {
                                     DocumentId = tdc.Document.Id,
                                     TermId     = tdc.Term.Id
                                 } into g
                                 let tdc = g.First()
                                 select new TermDocumentCount()
                                 {
                                     Document   = tdc.Document,
                                     Term       = tdc.Term,
                                     DocumentId = g.Key.DocumentId,
                                     TermId     = g.Key.TermId,
                                     Count      = g.Count()
                                 };


                    context.TermDocumentCounts.AddRange(newTdc);
                    context.SaveChanges();
                }
            }
        }