/// <summary>
/// Trains a <c>Word2VecTrainer</c> on <c>InputFileLoc</c> and writes the resulting word
/// embeddings to a <c>wordEmbeddings-{ticks}.csv</c> file inside <c>ResultsDirectory</c>,
/// resolved against the current working directory.
/// </summary>
/// <remarks>
/// The file name embeds <see cref="DateTime.Now"/>'s tick count so repeated runs do not
/// overwrite each other's output. The results directory is created if it does not exist.
/// </remarks>
public void TrainWordEmbeddings()
{
    var runId = DateTime.Now.Ticks;

    // Build the output directory once and derive the file path from it.
    var outputDirectory = $@"{Directory.GetCurrentDirectory()}/{ResultsDirectory}";
    var embeddingsPath = $@"{outputDirectory}/wordEmbeddings-{runId}.csv";
    Directory.CreateDirectory(outputDirectory);

    var trainer = new Word2VecTrainer();
    trainer.Setup(InputFileLoc, minWordOccurrences: 3);
    trainer.TrainModel(useCbow: false, numberOfIterations: 20);
    trainer.WriteWordEmbeddings(embeddingsPath);
}
/// <summary>
/// Trains a <c>Word2VecTrainer</c> on the alphabet file supplied by
/// <c>TrainingDataManager.GetAlphabetFile()</c>, then writes both the word embeddings and a
/// probability-matrix report into <c>ResultsDirectory</c> under the current working directory.
/// </summary>
/// <remarks>
/// Both output file names share the same tick-based id so the embeddings and report of one
/// run can be matched up. The results directory is created if it does not exist.
/// </remarks>
public void Go()
{
    var runId = DateTime.Now.Ticks;
    var alphabetPath = TrainingDataManager.GetAlphabetFile().FullName;

    // Build the output directory once and derive both file paths from it.
    var outputDirectory = $@"{Directory.GetCurrentDirectory()}/{ResultsDirectory}";
    var embeddingsPath = $@"{outputDirectory}/wordEmbeddings-{runId}.csv";
    var reportPath = $@"{outputDirectory}/report-{runId}.csv";
    Directory.CreateDirectory(outputDirectory);

    var trainer = new Word2VecTrainer();
    trainer.Setup(alphabetPath);
    trainer.TrainModel(windowSize: 1, thresholdForOccurrenceOfWords: 0, useSkipgram: false);
    trainer.WriteWordEmbeddings(embeddingsPath);

    new ReportWriter(reportPath).WriteProbabilityMatrix(trainer.WordCollection, trainer.NeuralNetwork);
}
/// <summary>
/// Checks that <c>Word2VecTrainer.SetSentence</c> fills a 12-slot sentence buffer completely:
/// after the call the reported sentence length must be 12 and the last slot must be populated.
/// </summary>
/// <remarks>
/// The reader is wrapped in a <c>using</c> statement so it is released even when
/// <c>SetSentence</c> throws; the assertions run after the reader has been disposed, as before.
/// </remarks>
public void NotSufferFromOffByOne()
{
    const string input = "This is a string. The String to test, the strings to prevail.\r\nWhat is the string?";
    const double thresholdForOccurrenceOfWords = 0;
    const int sentenceCapacity = 12;

    var wordCollection = new WordCollection();
    wordCollection.AddWords(input, 11);
    wordCollection.InitWordPositions();

    var sentence = new int?[sentenceCapacity];
    var nextRandom = new Random();
    var sentenceLength = 0;
    string[] lastLine = null;

    // Disposing the StreamReader also disposes the underlying MemoryStream.
    using (var reader = new StreamReader(new MemoryStream(Encoding.ASCII.GetBytes(input))))
    {
        Word2VecTrainer.SetSentence(wordCollection, reader, sentence, nextRandom, ref sentenceLength, ref lastLine, thresholdForOccurrenceOfWords);
    }

    Assert.Equal(sentenceCapacity, sentenceLength);
    Assert.NotNull(sentence[sentenceCapacity - 1]);
}
/// <summary>
/// End-to-end run: trains a <c>Word2VecTrainer</c> on one blog-authorship corpus file, writes
/// the embeddings to disk, reloads them, reduces them to two dimensions with <c>TSNE</c>,
/// clusters them with <c>DBSCAN</c>, and writes the 2D embeddings plus cluster indexes to a report.
/// </summary>
/// <remarks>
/// The corpus file is the first one returned by
/// <c>TrainingDataManager.GetBlogAuthorshipCorpusFiles()</c> whose <c>Length</c> lies between
/// 1e5 and 2e5 inclusive; <c>First</c> throws <see cref="InvalidOperationException"/> when no
/// file matches. Output files share a tick-based id and are placed in <c>ResultsDirectory</c>
/// under the current working directory, which is created if missing.
/// </remarks>
public void Go()
{
    var id = DateTime.Now.Ticks;
    var inputFileLoc = TrainingDataManager.GetBlogAuthorshipCorpusFiles()
        .First(f => f.Length >= 1e5 && f.Length <= 2e5)
        .FullName;

    var outputDirectory = $@"{Directory.GetCurrentDirectory()}/{ResultsDirectory}";
    var embeddingsFileLoc = $@"{outputDirectory}/wordEmbeddings-{id}.csv";
    var reportFileLoc = $@"{outputDirectory}/report-{id}.csv";
    Directory.CreateDirectory(outputDirectory);

    var word2Vec = new Word2VecTrainer();
    word2Vec.Setup(inputFileLoc, minWordOccurrences: 3);
    word2Vec.TrainModel();
    word2Vec.WriteWordEmbeddings(embeddingsFileLoc);

    var wordEmbeddings = new List<WordEmbedding>();

    // FileMode.Open (not OpenOrCreate): the file must already exist from the write above, and a
    // missing file should fail loudly instead of silently yielding an empty embeddings list.
    // The stream is scoped to the load so the file handle is released before the long-running
    // dimensionality reduction and clustering steps.
    using (var fileStream = new FileStream(embeddingsFileLoc, FileMode.Open, FileAccess.Read))
    using (var reader = new StreamReader(fileStream, Encoding.UTF8))
    {
        wordEmbeddings.PopulateWordEmbeddingsFromStream(reader);
    }

    var tsne = new TSNE(2, distanceFunctionType: DistanceFunctionType.Cosine);
    tsne.ReduceDimensions(wordEmbeddings);

    var labelClusterIndexMap = DBSCAN.GetLabelClusterMap(
        wordEmbeddings,
        epsilon: 0.1,
        minimumSamples: 3,
        distanceFunctionType: DistanceFunctionType.Cosine,
        concurrentThreads: 4);

    var reportHandler = new ReportWriter(reportFileLoc);
    reportHandler.Write2DWordEmbeddingsAndClusterIndexesForExcel(wordEmbeddings, labelClusterIndexMap);
}