/// <summary>
/// Configures a Word2Vec training run: stores every hyper-parameter and
/// starts with an empty word collection.
/// </summary>
/// <param name="fileHandler">Handles reading the corpus and writing results.</param>
/// <param name="numberOfThreads">Worker threads used during training.</param>
/// <param name="numberOfIterations">Training passes over the corpus.</param>
/// <param name="numberOfDimensions">Size of each word embedding vector.</param>
/// <param name="maxSentenceLength">Maximum words buffered per sentence.</param>
/// <param name="minCount">Words occurring fewer times than this are dropped.</param>
/// <param name="startingAlpha">Initial learning rate.</param>
/// <param name="useSkipgram">Enables the skip-gram training objective.</param>
/// <param name="useCbow">Enables the continuous-bag-of-words objective.</param>
/// <param name="negativeSamples">Negative samples per target (the first one is the positive example).</param>
/// <param name="windowSize">Context window radius around each word.</param>
/// <param name="thresholdForOccurrenceOfWords">Sub-sampling threshold for frequent words.</param>
public Word2VecUsingLibrary(
    FileHandler fileHandler,
    int numberOfThreads = 4,
    int numberOfIterations = 4,
    int numberOfDimensions = 50,
    int maxSentenceLength = 10000,
    int minCount = 5,
    float startingAlpha = 0.025f,
    bool useSkipgram = true,
    bool useCbow = true,
    int negativeSamples = 5,
    int windowSize = 5,
    float thresholdForOccurrenceOfWords = 1e-3f)
{
    _fileHandler = fileHandler;

    // Training schedule.
    _numberOfThreads = numberOfThreads;
    _numberOfIterations = numberOfIterations;
    _startingAlpha = startingAlpha;

    // Model shape and vocabulary filtering.
    _numberOfDimensions = numberOfDimensions;
    _maxSentenceLength = maxSentenceLength;
    _minCount = minCount;

    // Objectives and sampling.
    _useSkipgram = useSkipgram;
    _useCbow = useCbow;
    // note: first 'negative sample' is positive
    _negativeSamples = negativeSamples;
    _windowSize = windowSize;
    _thresholdForOccurrenceOfWords = thresholdForOccurrenceOfWords;

    _wordCollection = new WordCollection();
}
/// <summary>
/// Appends a CSV matrix of network outputs to the output file: one header row
/// of all words, then one row per word holding every node's output when that
/// word's one-hot vector is fed through the network.
/// </summary>
/// <param name="wordCollection">Supplies the vocabulary (row/column order).</param>
/// <param name="neuralNetwork">Network whose node outputs are recorded.</param>
public void WriteOutputMatrix(WordCollection wordCollection, Layer neuralNetwork)
{
    using (var fileStream = new FileStream(_outputFile, FileMode.OpenOrCreate, FileAccess.Write))
    {
        // Append rather than overwrite any existing content.
        fileStream.Seek(0, SeekOrigin.End);
        using (var streamWriter = new StreamWriter(fileStream, Encoding.UTF8))
        {
            var vocabulary = wordCollection.GetWords().ToList();
            var csv = new StringBuilder();

            // Header row: leading empty cell, then one column per word.
            csv.Append(",");
            foreach (var heading in vocabulary)
            {
                csv.Append($"{heading},");
            }
            csv.AppendLine();

            // One row per word: activate its one-hot input and record each
            // node's output. Assumes the network has at least as many nodes
            // as there are words — TODO confirm against Layer construction.
            for (var row = 0; row < vocabulary.Count; row++)
            {
                var oneHotInput = new double[vocabulary.Count];
                oneHotInput[row] = 1;
                neuralNetwork.PopulateAllOutputs(oneHotInput);

                csv.Append($"{vocabulary[row]},");
                for (var column = 0; column < vocabulary.Count; column++)
                {
                    csv.Append($"{neuralNetwork.Nodes[column].Output},");
                }
                csv.AppendLine();
            }

            streamWriter.WriteLine(csv.ToString());
        }
    }
}
/// <summary>
/// Fills <paramref name="sentence"/> with word indices read from the stream,
/// one text line at a time, until the buffer is (nearly) full or the stream
/// ends. A line that no longer fits is handed back via
/// <paramref name="lineThatGotCutOff"/> for the next call.
/// </summary>
/// <returns>The running word count after processing.</returns>
public int SetSentence(StreamReader reader, int wordCount, int?[] sentence, Random random, ref int sentenceLength, ref string[] lineThatGotCutOff)
{
    var finished = false;

    // Drain the words left over from the previous call before reading more.
    if (lineThatGotCutOff != null && lineThatGotCutOff.Any())
    {
        finished = HandleWords(reader, ref wordCount, sentence, random, ref sentenceLength, lineThatGotCutOff);
        lineThatGotCutOff = null;
    }

    string line;
    while (!finished && (line = reader.ReadLine()) != null)
    {
        var words = WordCollection.ParseWords(line).Select(WordCollection.Clean).ToArray();

        if (words.Length > sentence.Length)
        {
            // The line could never fit in the buffer at all; skip it.
            continue;
        }

        if (sentenceLength > sentence.Length - words.Length)
        {
            // Not enough room left in this sentence; keep the parsed line
            // for the next call.
            lineThatGotCutOff = words;
            break;
        }

        finished = HandleWords(reader, ref wordCount, sentence, random, ref sentenceLength, words);
    }

    return wordCount;
}
/// <summary>
/// Builds the initial Huffman work queue: one leaf <see cref="Node"/> per
/// unique word, ordered rarest-first.
/// </summary>
private static List<Node> GetQueueSortedByWordFrequencyAscending(WordCollection wordCollection)
{
    // Wrap each (word, info) pair in a leaf node, then sort ascending by
    // frequency so the two rarest words sit at the front of the queue.
    return wordCollection.ToArray()
        .Select(pair => new Node
        {
            Frequency = pair.Value.Count,
            WordInfo = pair.Value,
            Word = pair.Key
        })
        .OrderBy(node => node.Frequency)
        .ToList();
}
/// <summary>
/// Collapses the frequency-sorted queue into a Huffman tree by repeatedly
/// merging nodes: a tree over V leaves needs exactly V - 1 interior nodes.
/// </summary>
/// <param name="wordCollection">Source of the unique-word count V.</param>
/// <param name="queue">Leaf queue produced by
/// <see cref="GetQueueSortedByWordFrequencyAscending"/>; reduced to a single
/// root by the time this returns.</param>
private static void IterateQueue(WordCollection wordCollection, List<Node> queue)
{
    // Hoisted out of the loop: the unique-word count is fixed while merging,
    // so there is no need to re-query it every iteration. The separate
    // interior-node counter duplicated the loop index and was folded into it.
    var interiorNodesNeeded = wordCollection.GetNumberOfUniqueWords() - 1;
    for (var numberOfInteriorNodes = 0; numberOfInteriorNodes < interiorNodesNeeded; numberOfInteriorNodes++)
    {
        var node = CreateInteriorNode(queue, numberOfInteriorNodes);
        InsertNodeInQueue(queue, node);
    }
}
/// <summary>
/// Appends a two-line header to the output file: the vocabulary size
/// followed by the embedding dimensionality.
/// </summary>
public void WriteDescription(WordCollection wordCollection, int numberOfDimensions)
{
    using (var fileStream = new FileStream(_outputFile, FileMode.OpenOrCreate, FileAccess.Write))
    {
        // Append rather than overwrite any existing content.
        fileStream.Seek(0, SeekOrigin.End);
        using (var streamWriter = new StreamWriter(fileStream, Encoding.UTF8))
        {
            streamWriter.WriteLine(wordCollection.GetNumberOfUniqueWords());
            streamWriter.WriteLine(numberOfDimensions);
        }
    }
}
/// <summary>
/// Builds the Huffman coding tree for the vocabulary: leaves sorted by
/// frequency are merged pairwise until a single root remains, then codes are
/// assigned via a pre-order walk starting from the empty code at the root.
/// </summary>
/// <param name="wordCollection">Vocabulary to encode; retained on the instance.</param>
public void Create(WordCollection wordCollection)
{
    _wordCollection = wordCollection;

    var queue = GetQueueSortedByWordFrequencyAscending(wordCollection);
    IterateQueue(wordCollection, queue);

    // After merging, exactly one node — the root — must remain.
    var root = queue.Single();
    root.Code = "";
    Preorder(root);
    // Removed the explicit GC.Collect() that used to end this method:
    // forcing a full collection in production code is an anti-pattern — the
    // runtime schedules collections better than we can, and nothing here
    // holds unusually large temporary buffers that would justify it.
}
/// <summary>
/// Appends the trained embeddings to the output file, one line per word:
/// the word, a tab, then its weight row serialized as base64-encoded float
/// bytes.
/// </summary>
/// <param name="wordCollection">Supplies the words, in row order.</param>
/// <param name="numberOfDimensions">Columns to serialize per row.</param>
/// <param name="hiddenLayerWeights">Weight matrix indexed [word, dimension].</param>
public void WriteOutput(WordCollection wordCollection, int numberOfDimensions, float[,] hiddenLayerWeights)
{
    using (var fileStream = new FileStream(_outputFile, FileMode.OpenOrCreate, FileAccess.Write))
    {
        // Append rather than overwrite any existing content.
        fileStream.Seek(0, SeekOrigin.End);
        using (var streamWriter = new StreamWriter(fileStream, Encoding.UTF8))
        {
            var words = wordCollection.GetWords().ToArray();
            for (var wordIndex = 0; wordIndex < wordCollection.GetNumberOfUniqueWords(); wordIndex++)
            {
                // Concatenate the raw bytes of each float in this word's row.
                var embeddingBytes = new List<byte>();
                for (var dimension = 0; dimension < numberOfDimensions; dimension++)
                {
                    embeddingBytes.AddRange(BitConverter.GetBytes(hiddenLayerWeights[wordIndex, dimension]));
                }
                streamWriter.WriteLine($"{words[wordIndex]}\t{Convert.ToBase64String(embeddingBytes.ToArray())}");
            }
        }
    }
}
/// <summary>
/// Populates <paramref name="wordCollection"/> from the training file,
/// feeding it one line at a time.
/// </summary>
/// <param name="wordCollection">Collection to add the parsed words to.</param>
/// <param name="maxCodeLength">Forwarded to <c>AddWords</c> for each line.</param>
/// <exception cref="InvalidOperationException">The training file does not exist.</exception>
public void GetWordDictionaryFromFile(WordCollection wordCollection, int maxCodeLength)
{
    if (!File.Exists(_trainFile))
    {
        throw new InvalidOperationException($"Unable to find {_trainFile}");
    }

    // FileMode.Open instead of OpenOrCreate: this method only reads, and
    // OpenOrCreate would silently create an empty file if the training file
    // vanished between the Exists check and the open — better to fail loudly.
    using (var fileStream = new FileStream(_trainFile, FileMode.Open, FileAccess.Read))
    {
        using (var reader = new StreamReader(fileStream, Encoding.UTF8))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                wordCollection.AddWords(line, maxCodeLength);
                // The former "if (reader.EndOfStream) break;" was removed:
                // ReadLine already returns null at end of stream, so the
                // extra check was redundant.
            }
        }
    }
}
/// <summary>
/// Appends the indices of <paramref name="words"/> to the sentence buffer,
/// sub-sampling frequent words with the standard word2vec formula:
/// keep-probability (sqrt(f/(t*N)) + 1) * (t*N)/f, where f is the word's
/// occurrence count, N the corpus size, and t the threshold.
/// </summary>
/// <returns>
/// True when the sentence buffer is full or the reader reached end of
/// stream; false when the caller should keep feeding lines.
/// </returns>
private static bool HandleWords(StreamReader reader, ref long wordCount, long?[] sentence, ref ulong nextRandom, ref long sentenceLength, IEnumerable<string> words, WordCollection wordCollection, float thresholdForOccurrenceOfWords)
{
    var totalNumberOfWords = wordCollection.GetTotalNumberOfWords();
    foreach (var word in words)
    {
        var wordIndex = wordCollection[word];
        if (!wordIndex.HasValue)
        {
            // Word was filtered out of the vocabulary; skip it.
            continue;
        }
        wordCount++;

        // Sub-sampling of frequent words (threshold <= 0 disables it).
        if (thresholdForOccurrenceOfWords > 0)
        {
            // Fetched once instead of twice — the occurrence count does not
            // change between the two uses in the formula.
            var occurrences = wordCollection.GetOccurrenceOfWord(word);
            var keepProbability =
                ((float)Math.Sqrt(occurrences / (thresholdForOccurrenceOfWords * totalNumberOfWords)) + 1)
                * (thresholdForOccurrenceOfWords * totalNumberOfWords) / occurrences;
            nextRandom = LinearCongruentialGenerator(nextRandom);
            // Discard the word when the 16-bit uniform draw exceeds the
            // keep probability.
            if (keepProbability < (nextRandom & 0xFFFF) / (float)65536)
            {
                continue;
            }
        }

        sentence[sentenceLength] = wordIndex.Value;
        sentenceLength++;
        // Bug fix: this was '>', which can never fire (sentenceLength tops
        // out at sentence.Length) and would let a subsequent call write past
        // the end of the buffer. '>=' stops exactly when the buffer is full,
        // matching the reference word2vec's ">= MAX_SENTENCE_LENGTH" check.
        if (sentenceLength >= sentence.Length)
        {
            return true;
        }
    }

    return reader.EndOfStream;
}
/// <summary>
/// Fills <paramref name="sentence"/> with word indices read from the stream,
/// one text line at a time, sub-sampling frequent words via
/// <c>HandleWords</c>. A line that no longer fits is handed back through
/// <paramref name="lineThatGotCutOff"/> for the next call.
/// </summary>
/// <returns>The running word count after processing.</returns>
public static long SetSentence(StreamReader reader, long wordCount, long?[] sentence, ref ulong nextRandom, ref long sentenceLength, ref string[] lineThatGotCutOff, WordCollection wordCollection, float thresholdForOccurrenceOfWords)
{
    var finished = false;

    // Drain the words left over from the previous call before reading more.
    if (lineThatGotCutOff != null && lineThatGotCutOff.Any())
    {
        finished = HandleWords(reader, ref wordCount, sentence, ref nextRandom, ref sentenceLength,
            lineThatGotCutOff, wordCollection, thresholdForOccurrenceOfWords);
        lineThatGotCutOff = null;
    }

    string line;
    while (!finished && (line = reader.ReadLine()) != null)
    {
        var words = WordCollection.ParseWords(line).Select(WordCollection.Clean).ToArray();

        if (words.Length > sentence.Length)
        {
            // The line could never fit in the buffer at all; skip it.
            continue;
        }

        if (sentenceLength > sentence.Length - words.Length)
        {
            // Not enough room left in this sentence; keep the parsed line
            // for the next call.
            lineThatGotCutOff = words;
            break;
        }

        finished = HandleWords(reader, ref wordCount, sentence, ref nextRandom, ref sentenceLength,
            words, wordCollection, thresholdForOccurrenceOfWords);
    }

    return wordCount;
}