/// <summary>
/// Trains a <c>Word2VecTrainer</c> on <c>InputFileLoc</c> and writes the resulting word
/// embeddings to a CSV file inside <c>ResultsDirectory</c> (relative to the current working
/// directory). The file name carries the current tick count so repeated runs do not collide.
/// </summary>
public void TrainWordEmbeddings()
    {
        // Tick count taken first; it is the unique suffix of this run's output file.
        var runId = DateTime.Now.Ticks;

        // Build the results directory path once and derive the output file from it.
        var resultsDirectoryLoc = $"{Directory.GetCurrentDirectory()}/{ResultsDirectory}";
        var outputFileLoc = $"{resultsDirectoryLoc}/wordEmbeddings-{runId}.csv";

        Directory.CreateDirectory(resultsDirectoryLoc);

        var trainer = new Word2VecTrainer();
        trainer.Setup(InputFileLoc, minWordOccurrences: 3);
        trainer.TrainModel(useCbow: false, numberOfIterations: 20);
        trainer.WriteWordEmbeddings(outputFileLoc);
    }
Example #2
0
    /// <summary>
    /// Trains a <c>Word2VecTrainer</c> on the file returned by
    /// <c>TrainingDataManager.GetAlphabetFile()</c>, writes the word embeddings to a CSV file and
    /// then writes a probability-matrix report. Both files go into <c>ResultsDirectory</c>
    /// (relative to the current working directory) and share the same tick-count suffix.
    /// </summary>
    public void Go()
    {
        // Tick count taken first; it ties the embeddings file and the report of one run together.
        var runId = DateTime.Now.Ticks;
        var trainingFileLoc = TrainingDataManager.GetAlphabetFile().FullName;

        // Build the results directory path once and derive both output files from it.
        var resultsDirectoryLoc = $"{Directory.GetCurrentDirectory()}/{ResultsDirectory}";
        var embeddingsOutputLoc = $"{resultsDirectoryLoc}/wordEmbeddings-{runId}.csv";
        var reportOutputLoc = $"{resultsDirectoryLoc}/report-{runId}.csv";

        Directory.CreateDirectory(resultsDirectoryLoc);

        var trainer = new Word2VecTrainer();
        trainer.Setup(trainingFileLoc);
        trainer.TrainModel(windowSize: 1, thresholdForOccurrenceOfWords: 0, useSkipgram: false);
        trainer.WriteWordEmbeddings(embeddingsOutputLoc);

        var reportWriter = new ReportWriter(reportOutputLoc);
        reportWriter.WriteProbabilityMatrix(trainer.WordCollection, trainer.NeuralNetwork);
    }
Example #3
0
    /// <summary>
    /// Feeds a short two-line text through <c>Word2VecTrainer.SetSentence</c> with a 12-slot
    /// sentence buffer and asserts that the reported sentence length is 12 and that the last
    /// slot (index 11) has been filled, i.e. the buffer is used right up to its final element.
    /// </summary>
    public void NotSufferFromOffByOne()
    {
        const string input = "This is a string. The String to test, the strings   to prevail.\r\nWhat is the string?";
        const double thresholdForOccurrenceOfWords = 0;

        var wordCollection = new WordCollection();
        wordCollection.AddWords(input, 11);
        wordCollection.InitWordPositions();

        var sentence = new int?[12];
        var nextRandom = new Random();
        var sentenceLength = 0;
        string[] lastLine = null;

        // The using statement disposes the reader (and the MemoryStream it owns) even when
        // SetSentence throws; the previous manual Dispose() call was skipped in that case.
        using (var reader = new StreamReader(new MemoryStream(Encoding.ASCII.GetBytes(input))))
        {
            Word2VecTrainer.SetSentence(wordCollection, reader, sentence, nextRandom, ref sentenceLength, ref lastLine, thresholdForOccurrenceOfWords);
        }

        Assert.Equal(12, sentenceLength);
        Assert.NotNull(sentence[11]);
    }
    /// <summary>
    /// End-to-end run on one blog-authorship corpus file: trains a <c>Word2VecTrainer</c>,
    /// writes the word embeddings to a CSV file, reads that file back into a list, runs
    /// <c>TSNE.ReduceDimensions</c> on it, clusters it with <c>DBSCAN</c>, and writes the
    /// embeddings together with their cluster indexes to a report file. Both output files go
    /// into <c>ResultsDirectory</c> (relative to the current working directory) and share the
    /// same tick-count suffix.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// Thrown by <c>First</c> when no corpus file has a <c>Length</c> between 1e5 and 2e5.
    /// </exception>
    /// <exception cref="FileNotFoundException">
    /// Thrown when the embeddings file is not present after <c>WriteWordEmbeddings</c> returned.
    /// </exception>
    public void Go()
    {
        var id = DateTime.Now.Ticks;
        var inputFileLoc = TrainingDataManager.GetBlogAuthorshipCorpusFiles().First(f => f.Length >= 1e5 && f.Length <= 2e5).FullName;

        // Build the results directory path once and derive both output files from it.
        var resultsDirectoryLoc = $@"{Directory.GetCurrentDirectory()}/{ResultsDirectory}";
        var embeddingsFileLoc = $@"{resultsDirectoryLoc}/wordEmbeddings-{id}.csv";
        var reportFileLoc = $@"{resultsDirectoryLoc}/report-{id}.csv";

        Directory.CreateDirectory(resultsDirectoryLoc);

        var word2Vec = new Word2VecTrainer();

        word2Vec.Setup(inputFileLoc, minWordOccurrences: 3);
        word2Vec.TrainModel();
        word2Vec.WriteWordEmbeddings(embeddingsFileLoc);

        var wordEmbeddings = new List<WordEmbedding>();

        // FileMode.Open (not OpenOrCreate): a missing embeddings file means the write above
        // failed, so fail here instead of silently creating an empty file and analysing nothing.
        // The streams are scoped to the read so the file handle is released before the
        // t-SNE / DBSCAN / report steps run.
        using (var fileStream = new FileStream(embeddingsFileLoc, FileMode.Open, FileAccess.Read))
        using (var reader = new StreamReader(fileStream, Encoding.UTF8))
        {
            wordEmbeddings.PopulateWordEmbeddingsFromStream(reader);
        }

        var tsne = new TSNE(2, distanceFunctionType: DistanceFunctionType.Cosine);

        tsne.ReduceDimensions(wordEmbeddings);

        var labelClusterIndexMap = DBSCAN.GetLabelClusterMap(
            wordEmbeddings,
            epsilon: 0.1,
            minimumSamples: 3,
            distanceFunctionType: DistanceFunctionType.Cosine,
            concurrentThreads: 4);

        var reportHandler = new ReportWriter(reportFileLoc);

        reportHandler.Write2DWordEmbeddingsAndClusterIndexesForExcel(wordEmbeddings, labelClusterIndexMap);
    }