/// <summary>
/// Clusters the articles listed in <c>InputFileLoc</c> and writes the resulting
/// label/cluster assignments to a new CSV report in the results directory.
/// </summary>
/// <remarks>
/// The word embeddings are read from the file in the results directory whose name
/// starts with <c>wordEmbeddings-</c> and whose creation time is the latest.
/// The report is written to <c>report-{ticks}.csv</c> in the same directory.
/// </remarks>
/// <exception cref="InvalidOperationException">
/// Thrown when the results directory contains no <c>wordEmbeddings-*</c> file.
/// </exception>
public void GenerateReportFromLatestEmbeddings()
{
    // Built once and reused for both the lookup and the report path.
    var resultsDirectoryPath = $@"{Directory.GetCurrentDirectory()}/{ResultsDirectory}";

    var embeddingsFile = new DirectoryInfo(resultsDirectoryPath).EnumerateFiles()
        .Where(f => Regex.IsMatch(f.Name, "^wordEmbeddings-.*$"))
        .OrderBy(f => f.CreationTime)
        .LastOrDefault();

    // Same exception type that Last() would raise, but with a message that says what is missing.
    if (embeddingsFile is null)
    {
        throw new InvalidOperationException(
            $"No file matching 'wordEmbeddings-*' was found in '{resultsDirectoryPath}'.");
    }

    var reportFileLoc = $@"{resultsDirectoryPath}/report-{DateTime.Now.Ticks}.csv";

    var wordEmbeddings = new List<WordEmbedding>();

    // FileMode.Open: this is a read-only open of a file that must already exist.
    // OpenOrCreate would create an empty embeddings file if it had vanished in the meantime.
    using var fileStream = new FileStream(embeddingsFile.FullName, FileMode.Open, FileAccess.Read);
    using var reader = new StreamReader(fileStream, Encoding.UTF8);

    wordEmbeddings.PopulateWordEmbeddingsFromStream(reader);
    wordEmbeddings.NormaliseEmbeddings();

    var articleEmbeddings = new List<ArticleEmbedding>();
    foreach (var line in File.ReadLines(InputFileLoc))
    {
        // First comma-separated field is passed on its own; the remaining fields are
        // re-joined with spaces and passed as the second constructor argument.
        var splitLine = line.Split(',');
        articleEmbeddings.Add(new ArticleEmbedding(splitLine[0], string.Join(' ', splitLine.Skip(1)), maxContentsLength: 500));
    }

    articleEmbeddings.AssignVectorsFromWeightedWordEmbeddings(wordEmbeddings);

    var kMeans = new KMeans(articleEmbeddings);
    kMeans.CalculateLabelClusterMap(
        numberOfClusters: 25
    );

    var reportHandler = new ReportWriter(reportFileLoc);
    reportHandler.WriteLabelsWithClusterIndex(kMeans.LabelClusterMap);
}
/// <summary>
/// Builds <paramref name="x"/> one-hot embeddings of dimension <paramref name="x"/>,
/// asks for <paramref name="x"/> clusters, and checks that every embedding lands in
/// a cluster of its own.
/// </summary>
/// <param name="x">Number of embeddings to generate; also the vector length and the requested cluster count.</param>
public void PutXElementsInXClusters(int x)
{
    var embeddings = new List<WordEmbedding>();
    for (var i = 0; i < x; i++)
    {
        // Only component i is set, so no two generated vectors share a non-zero component.
        var embedding = new double[x];
        embedding[i] = 1d;
        embeddings.Add(new WordEmbedding(i.ToString(), embedding));
    }

    var kMeans = new KMeans(embeddings);
    kMeans.CalculateLabelClusterMap(numberOfClusters: x);

    // Every generated embedding must appear in the map.
    Assert.Equal(x, kMeans.LabelClusterMap.Keys.Distinct().Count());

    // The map's keys are the labels, so the check above cannot detect two embeddings
    // sharing a cluster. Count the distinct assigned clusters (the entry values) as well.
    Assert.Equal(x, kMeans.LabelClusterMap.Select(entry => entry.Value).Distinct().Count());
}
/// <summary>
/// Runs k-means over <paramref name="embeddings"/> and verifies that each produced
/// cluster holds exactly the labels of one of the <paramref name="desiredClusters"/>.
/// </summary>
/// <param name="embeddings">Embeddings handed to the <c>KMeans</c> constructor.</param>
/// <param name="numberOfClusters">Cluster count requested from <c>CalculateLabelClusterMap</c>.</param>
/// <param name="desiredClusters">Expected groupings, each given as a list of labels.</param>
public void CorrectlyAssignClusters(List<IEmbedding> embeddings, int numberOfClusters, List<List<string>> desiredClusters)
{
    var kMeans = new KMeans(embeddings);
    kMeans.CalculateLabelClusterMap(numberOfClusters: numberOfClusters);

    // Group the map entries by their value (the assigned cluster).
    foreach (var group in kMeans.LabelClusterMap.GroupBy(entry => entry.Value))
    {
        var labels = group.Select(entry => entry.Key).ToArray();

        // The expected cluster is whichever desired cluster contains the group's first label.
        var expected = desiredClusters.First(candidate => candidate.Contains(labels[0]));

        // Same size plus every actual label present in the expected cluster.
        Assert.Equal(expected.Count, labels.Length);
        foreach (var label in labels)
        {
            Assert.Contains(expected, member => member == label);
        }
    }
}