Ejemplo n.º 1
0
    /// <summary>
    /// Generates a cluster report from the most recently created word-embeddings file.
    /// </summary>
    /// <remarks>
    /// Picks the file named <c>wordEmbeddings-*</c> with the latest creation time in the results
    /// directory (under the current working directory), loads it via
    /// <c>PopulateWordEmbeddingsFromStream</c> and <c>NormaliseEmbeddings</c>, builds one
    /// <c>ArticleEmbedding</c> per line of <c>InputFileLoc</c>, runs k-means with 25 clusters and
    /// writes the resulting label/cluster map to <c>report-{ticks}.csv</c> in the same directory.
    /// </remarks>
    /// <exception cref="InvalidOperationException">
    /// The results directory contains no file whose name starts with <c>wordEmbeddings-</c>.
    /// </exception>
    public void GenerateReportFromLatestEmbeddings()
    {
        // Built once so the input lookup and the report path cannot drift apart.
        var resultsDirectory = $@"{Directory.GetCurrentDirectory()}/{ResultsDirectory}";

        var embeddingsFile = new DirectoryInfo(resultsDirectory).EnumerateFiles()
                             .Where(f => Regex.IsMatch(f.Name, "^wordEmbeddings-.*$"))
                             .OrderBy(f => f.CreationTime)
                             .LastOrDefault();

        // Fail with an actionable message instead of LINQ's "Sequence contains no elements".
        if (embeddingsFile is null)
        {
            throw new InvalidOperationException(
                $"No 'wordEmbeddings-*' file found in '{resultsDirectory}'.");
        }

        var reportFileLoc = $@"{resultsDirectory}/report-{DateTime.Now.Ticks}.csv";

        var wordEmbeddings = new List <WordEmbedding>();

        // FileMode.Open: this is a read-only path, so a file that vanished after enumeration
        // must raise an error rather than be silently re-created empty.
        using var fileStream = new FileStream(embeddingsFile.FullName, FileMode.Open, FileAccess.Read);
        using var reader     = new StreamReader(fileStream, Encoding.UTF8);
        wordEmbeddings.PopulateWordEmbeddingsFromStream(reader);
        wordEmbeddings.NormaliseEmbeddings();

        var articleEmbeddings = new List <ArticleEmbedding>();

        foreach (var line in File.ReadLines(InputFileLoc))
        {
            // First comma-separated field goes in as the first constructor argument; the
            // remaining fields are re-joined with spaces (commas in the rest become spaces).
            var splitLine = line.Split(',');
            articleEmbeddings.Add(new ArticleEmbedding(splitLine[0], string.Join(' ', splitLine.Skip(1)), maxContentsLength: 500));
        }
        articleEmbeddings.AssignVectorsFromWeightedWordEmbeddings(wordEmbeddings);

        var kMeans = new KMeans(articleEmbeddings);

        kMeans.CalculateLabelClusterMap(
            numberOfClusters: 25
            );

        var reportHandler = new ReportWriter(reportFileLoc);

        reportHandler.WriteLabelsWithClusterIndex(kMeans.LabelClusterMap);
    }
Ejemplo n.º 2
0
    /// <summary>
    /// Clusters <paramref name="x"/> embeddings into <paramref name="x"/> clusters and asserts
    /// that <paramref name="x"/> distinct clusters are actually used.
    /// </summary>
    /// <param name="x">Number of embeddings to create and number of clusters to request.</param>
    public void PutXElementsInXClusters(int x)
    {
        var embeddings = new List <WordEmbedding>();

        // Build x vectors of length x, each with a single 1 at its own index, so no two
        // vectors share a non-zero component.
        for (var i = 0; i < x; i++)
        {
            var embedding = new double[x];
            embedding[i] = 1d;
            embeddings.Add(new WordEmbedding(i.ToString(), embedding));
        }
        var kMeans = new KMeans(embeddings);

        kMeans.CalculateLabelClusterMap(numberOfClusters: x);

        // Count distinct cluster assignments (the map's values). Counting distinct keys, as
        // this test did before, is vacuous: keys of a keyed map are unique by construction,
        // so the check would pass even if every element landed in the same cluster.
        // NOTE(review): assumes LabelClusterMap entries are (label -> cluster) pairs — confirm.
        var distinctClusterCount = kMeans.LabelClusterMap
                                         .Select(entry => entry.Value)
                                         .Distinct()
                                         .Count();

        Assert.Equal(x, distinctClusterCount);
    }
Ejemplo n.º 3
0
    /// <summary>
    /// Runs k-means over <paramref name="embeddings"/> and asserts that every produced cluster
    /// holds exactly the labels of one of the <paramref name="desiredClusters"/>.
    /// </summary>
    /// <param name="embeddings">Embeddings to cluster.</param>
    /// <param name="numberOfClusters">Number of clusters requested from k-means.</param>
    /// <param name="desiredClusters">Expected grouping of labels, one inner list per cluster.</param>
    public void CorrectlyAssignClusters(List <IEmbedding> embeddings, int numberOfClusters, List <List <string> > desiredClusters)
    {
        var kMeans = new KMeans(embeddings);

        kMeans.CalculateLabelClusterMap(numberOfClusters: numberOfClusters);

        // Group map entries by their value (the assigned cluster); each group is one actual cluster.
        foreach (var actualCluster in kMeans.LabelClusterMap.GroupBy(entry => entry.Value))
        {
            var labels = actualCluster.Select(entry => entry.Key).ToArray();

            // The expected cluster is whichever desired list contains this group's first label.
            var firstLabel = labels[0];
            var expected   = desiredClusters.First(candidate => candidate.Contains(firstLabel));

            // Same size plus every actual label present in the expected list.
            Assert.Equal(expected.Count, labels.Length);
            foreach (var label in labels)
            {
                Assert.Contains(expected, wanted => wanted == label);
            }
        }
    }