Beispiel #1
0
        /// <summary>
        /// Captures the training configuration in the instance's private fields and
        /// creates a fresh <see cref="WordCollection"/> for this instance.
        /// </summary>
        /// <param name="fileHandler">File handler kept in <c>_fileHandler</c>.</param>
        /// <param name="numberOfThreads">Kept in <c>_numberOfThreads</c>; defaults to 4.</param>
        /// <param name="numberOfIterations">Kept in <c>_numberOfIterations</c>; defaults to 4.</param>
        /// <param name="numberOfDimensions">Kept in <c>_numberOfDimensions</c>; defaults to 50.</param>
        /// <param name="maxSentenceLength">Kept in <c>_maxSentenceLength</c>; defaults to 10000.</param>
        /// <param name="minCount">Kept in <c>_minCount</c>; defaults to 5.</param>
        /// <param name="startingAlpha">Kept in <c>_startingAlpha</c>; defaults to 0.025.</param>
        /// <param name="useSkipgram">Kept in <c>_useSkipgram</c>; defaults to <c>true</c>.</param>
        /// <param name="useCbow">Kept in <c>_useCbow</c>; defaults to <c>true</c>.</param>
        /// <param name="negativeSamples">Kept in <c>_negativeSamples</c>; defaults to 5.</param>
        /// <param name="windowSize">Kept in <c>_windowSize</c>; defaults to 5.</param>
        /// <param name="thresholdForOccurrenceOfWords">Kept in <c>_thresholdForOccurrenceOfWords</c>; defaults to 1e-3.</param>
        public Word2VecUsingLibrary(
            FileHandler fileHandler,
            int numberOfThreads    = 4,
            int numberOfIterations = 4,
            int numberOfDimensions = 50,
            int maxSentenceLength  = 10000,
            int minCount           = 5,
            float startingAlpha    = 0.025f,
            bool useSkipgram       = true,
            bool useCbow           = true,
            int negativeSamples    = 5,
            int windowSize         = 5,
            float thresholdForOccurrenceOfWords = 1e-3f
            )
        {
            // Collaborator used for file access.
            _fileHandler = fileHandler;

            // Execution settings.
            _numberOfThreads    = numberOfThreads;
            _numberOfIterations = numberOfIterations;

            // Vocabulary and sentence limits.
            _minCount                      = minCount;
            _maxSentenceLength             = maxSentenceLength;
            _thresholdForOccurrenceOfWords = thresholdForOccurrenceOfWords;

            // Model settings.
            _numberOfDimensions = numberOfDimensions;
            _startingAlpha      = startingAlpha;
            _windowSize         = windowSize;
            _useSkipgram        = useSkipgram;
            _useCbow            = useCbow;

            // Author's note: the first 'negative sample' is the positive one.
            _negativeSamples = negativeSamples;

            _wordCollection = new WordCollection();
        }
Beispiel #2
0
        /// <summary>
        /// Appends a comma-separated matrix to the output file: a header row listing every word,
        /// followed by one row per word. For row <c>i</c> the network is fed an input vector that is
        /// zero everywhere except for a 1 at index <c>i</c>, and the <c>Output</c> of the first
        /// <c>words.Count</c> nodes is written.
        /// </summary>
        /// <param name="wordCollection">Supplies the words used as row and column labels.</param>
        /// <param name="neuralNetwork">Layer that is populated once per word and whose node outputs are written.</param>
        public void WriteOutputMatrix(WordCollection wordCollection, Layer neuralNetwork)
        {
            using (var fs = new FileStream(_outputFile, FileMode.OpenOrCreate, FileAccess.Write))
            {
                // Position at the end so existing content is kept and the matrix is appended.
                fs.Seek(0, SeekOrigin.End);
                using (var writer = new StreamWriter(fs, Encoding.UTF8))
                {
                    var words = wordCollection.GetWords().ToList();

                    var stringBuilder = new StringBuilder();

                    // Header row: empty corner cell, then every word.
                    stringBuilder.Append(",");
                    foreach (var word in words)
                    {
                        stringBuilder.Append($"{word},");
                    }
                    stringBuilder.AppendLine();

                    for (var i = 0; i < words.Count; i++)
                    {
                        // One-hot input selecting the current word.
                        var inputs = new double[words.Count];
                        inputs[i] = 1;
                        neuralNetwork.PopulateAllOutputs(inputs);

                        stringBuilder.Append($"{words[i]},");
                        for (var j = 0; j < words.Count; j++)
                        {
                            // Format with the invariant culture: with a culture that uses ',' as the
                            // decimal separator the value would otherwise be split across two columns.
                            stringBuilder.Append(System.Convert.ToString(
                                neuralNetwork.Nodes[j].Output,
                                System.Globalization.CultureInfo.InvariantCulture));
                            stringBuilder.Append(",");
                        }
                        stringBuilder.AppendLine();
                    }

                    // WriteLine adds one more line terminator after the buffered rows.
                    writer.WriteLine(stringBuilder.ToString());
                }
            }
        }
Beispiel #3
0
        /// <summary>
        /// Fills <paramref name="sentence"/> by feeding lines from <paramref name="reader"/> to
        /// <c>HandleWords</c>, starting with any line carried over from a previous call.
        /// </summary>
        /// <param name="reader">Source of text lines.</param>
        /// <param name="wordCount">Running word count; passed by reference to <c>HandleWords</c>.</param>
        /// <param name="sentence">Buffer receiving the sentence entries.</param>
        /// <param name="random">Random source forwarded to <c>HandleWords</c>.</param>
        /// <param name="sentenceLength">Current fill level of <paramref name="sentence"/>.</param>
        /// <param name="lineThatGotCutOff">
        /// On entry, a non-empty value is processed first and then reset to <c>null</c>. On exit it
        /// holds the words of a line that no longer fitted into the remaining buffer space, if any.
        /// </param>
        /// <returns>The value of <paramref name="wordCount"/> after processing.</returns>
        public int SetSentence(StreamReader reader, int wordCount, int?[] sentence, Random random, ref int sentenceLength,
                               ref string[] lineThatGotCutOff)
        {
            var finished = false;

            // Resume with the line that did not fit during the previous call.
            if (lineThatGotCutOff != null && lineThatGotCutOff.Length > 0)
            {
                finished          = HandleWords(reader, ref wordCount, sentence, random, ref sentenceLength, lineThatGotCutOff);
                lineThatGotCutOff = null;
            }

            while (!finished)
            {
                var currentLine = reader.ReadLine();
                if (currentLine == null)
                {
                    // No more input.
                    break;
                }

                var tokens = WordCollection.ParseWords(currentLine).Select(WordCollection.Clean).ToArray();

                // A line longer than the whole buffer can never fit; skip it.
                if (tokens.Length > sentence.Length)
                {
                    continue;
                }

                // Not enough room left: hand the line back to the caller for the next call.
                if (sentenceLength > sentence.Length - tokens.Length)
                {
                    lineThatGotCutOff = tokens;
                    break;
                }

                finished = HandleWords(reader, ref wordCount, sentence, random, ref sentenceLength, tokens);
            }

            return wordCount;
        }
Beispiel #4
0
        /// <summary>
        /// Creates one <c>Node</c> per entry of the word collection and returns the nodes
        /// ordered by ascending frequency.
        /// </summary>
        /// <param name="wordCollection">Collection whose key/value entries are turned into nodes.</param>
        /// <returns>A new list of nodes, lowest frequency first.</returns>
        private static List <Node> GetQueueSortedByWordFrequencyAscending(WordCollection wordCollection)
        {
            var nodes = new List<Node>();

            foreach (var entry in wordCollection.ToArray())
            {
                var node = new Node
                {
                    Frequency = entry.Value.Count,
                    WordInfo  = entry.Value,
                    Word      = entry.Key
                };
                nodes.Add(node);
            }

            // OrderBy is a stable sort, so entries with equal frequency keep their relative order.
            return nodes.OrderBy(node => node.Frequency).ToList();
        }
Beispiel #5
0
        /// <summary>
        /// Runs one round fewer than the number of unique words. Each round creates an interior
        /// node from the queue via <c>CreateInteriorNode</c>, passing the zero-based round index,
        /// and hands it to <c>InsertNodeInQueue</c>.
        /// </summary>
        /// <param name="wordCollection">Supplies the number of unique words.</param>
        /// <param name="queue">Node list that is read and updated by the helper methods.</param>
        private static void IterateQueue(WordCollection wordCollection, List <Node> queue)
        {
            // The round index doubles as the count of interior nodes created so far.
            for (var interiorNodeIndex = 0;
                 interiorNodeIndex < wordCollection.GetNumberOfUniqueWords() - 1;
                 interiorNodeIndex++)
            {
                var interiorNode = CreateInteriorNode(queue, interiorNodeIndex);
                InsertNodeInQueue(queue, interiorNode);
            }
        }
Beispiel #6
0
 /// <summary>
 /// Appends two lines to the output file: the number of unique words in the collection,
 /// followed by the number of dimensions.
 /// </summary>
 /// <param name="wordCollection">Supplies the number of unique words.</param>
 /// <param name="numberOfDimensions">Value written on the second line.</param>
 public void WriteDescription(WordCollection wordCollection, int numberOfDimensions)
 {
     using (var outputStream = new FileStream(_outputFile, FileMode.OpenOrCreate, FileAccess.Write))
     {
         // Move to the end so that existing content is preserved.
         outputStream.Seek(0, SeekOrigin.End);

         using (var output = new StreamWriter(outputStream, Encoding.UTF8))
         {
             var uniqueWordCount = wordCollection.GetNumberOfUniqueWords();
             output.WriteLine(uniqueWordCount);
             output.WriteLine(numberOfDimensions);
         }
     }
 }
Beispiel #7
0
        /// <summary>
        /// Stores the word collection, builds a frequency-ordered node list from it, reduces that
        /// list with <c>IterateQueue</c>, and then walks the remaining root node with
        /// <c>Preorder</c>, starting from an empty code string.
        /// NOTE(review): this looks like Huffman-tree construction - confirm against the helpers.
        /// </summary>
        /// <param name="wordCollection">Word collection the tree is built from.</param>
        /// <exception cref="InvalidOperationException">
        /// Thrown by <c>Single()</c> when the queue does not contain exactly one node after
        /// <c>IterateQueue</c> has run.
        /// </exception>
        public void Create(WordCollection wordCollection)
        {
            _wordCollection = wordCollection;
            var queue = GetQueueSortedByWordFrequencyAscending(wordCollection);

            IterateQueue(wordCollection, queue);

            // Exactly one node is expected to remain; it becomes the root.
            var root = queue.Single();

            root.Code = "";
            Preorder(root);

            // No forced GC.Collect() here: it does not change which objects are collectible and
            // only triggers a blocking full collection; the runtime reclaims the memory as needed.
        }
Beispiel #8
0
 /// <summary>
 /// Appends one line per word to the output file. Each line holds the word, a tab, and the
 /// Base64 encoding of the bytes of that word's first <paramref name="numberOfDimensions"/>
 /// weights.
 /// </summary>
 /// <param name="wordCollection">Supplies the words and the number of rows to write.</param>
 /// <param name="numberOfDimensions">Number of weights written per word.</param>
 /// <param name="hiddenLayerWeights">Weight matrix indexed as [word index, dimension index].</param>
 public void WriteOutput(WordCollection wordCollection, int numberOfDimensions, float[,] hiddenLayerWeights)
 {
     using (var outputStream = new FileStream(_outputFile, FileMode.OpenOrCreate, FileAccess.Write))
     {
         // Move to the end so that existing content is preserved.
         outputStream.Seek(0, SeekOrigin.End);

         using (var output = new StreamWriter(outputStream, Encoding.UTF8))
         {
             var labels = wordCollection.GetWords().ToArray();

             var row = 0;
             while (row < wordCollection.GetNumberOfUniqueWords())
             {
                 // Concatenate the byte representation of every weight in this row.
                 var rawVector = new List<byte>();
                 for (var column = 0; column < numberOfDimensions; column++)
                 {
                     var weight = hiddenLayerWeights[row, column];
                     rawVector.AddRange(BitConverter.GetBytes(weight));
                 }

                 var encodedVector = Convert.ToBase64String(rawVector.ToArray());
                 output.WriteLine($"{labels[row]}\t{encodedVector}");

                 row++;
             }
         }
     }
 }
Beispiel #9
0
        /// <summary>
        /// Reads the training file line by line and passes every line to
        /// <c>wordCollection.AddWords</c>.
        /// </summary>
        /// <param name="wordCollection">Collection that receives the words of each line.</param>
        /// <param name="maxCodeLength">Forwarded unchanged to <c>AddWords</c>.</param>
        /// <exception cref="InvalidOperationException">The training file does not exist.</exception>
        public void GetWordDictionaryFromFile(WordCollection wordCollection, int maxCodeLength)
        {
            if (!File.Exists(_trainFile))
            {
                throw new InvalidOperationException($"Unable to find {_trainFile}");
            }

            // FileMode.Open (not OpenOrCreate): reading input must never create the file. If it
            // disappears after the existence check, fail instead of silently reading a new empty file.
            using (var fileStream = new FileStream(_trainFile, FileMode.Open, FileAccess.Read))
            using (var reader = new StreamReader(fileStream, Encoding.UTF8))
            {
                // ReadLine returns null at end of stream, so no separate EndOfStream check is needed.
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    wordCollection.AddWords(line, maxCodeLength);
                }
            }
        }
Beispiel #10
0
        /// <summary>
        /// Appends the indices of the given words to <paramref name="sentence"/>, skipping words
        /// that are not in the collection and, when subsampling is enabled, randomly skipping
        /// frequent words.
        /// </summary>
        /// <param name="reader">Reader whose <c>EndOfStream</c> state is checked after the words are handled.</param>
        /// <param name="wordCount">Incremented for every word found in the collection, including subsampled ones.</param>
        /// <param name="sentence">Buffer receiving the word indices.</param>
        /// <param name="nextRandom">State of the linear congruential generator; advanced once per subsampling decision.</param>
        /// <param name="sentenceLength">Next free position in <paramref name="sentence"/>; incremented per stored word.</param>
        /// <param name="words">Words to process.</param>
        /// <param name="wordCollection">Provides word indices, per-word occurrences and the total word count.</param>
        /// <param name="thresholdForOccurrenceOfWords">Subsampling threshold; subsampling is skipped when it is not positive.</param>
        /// <returns>
        /// <c>true</c> when the sentence buffer is full or the reader has reached the end of the
        /// stream; otherwise <c>false</c>.
        /// </returns>
        private static bool HandleWords(StreamReader reader, ref long wordCount, long?[] sentence, ref ulong nextRandom,
                                        ref long sentenceLength, IEnumerable <string> words, WordCollection wordCollection, float thresholdForOccurrenceOfWords)
        {
            var totalNumberOfWords = wordCollection.GetTotalNumberOfWords();

            // Loop-invariant part of the subsampling formula.
            var scaledThreshold = thresholdForOccurrenceOfWords * totalNumberOfWords;

            foreach (var word in words)
            {
                var wordIndex = wordCollection[word];
                if (!wordIndex.HasValue)
                {
                    // Word is not part of the collection.
                    continue;
                }
                wordCount++;

                // Subsampling of frequent words
                if (thresholdForOccurrenceOfWords > 0)
                {
                    var occurrence = wordCollection.GetOccurrenceOfWord(word);

                    // Keep value = (sqrt(f / (t * N)) + 1) * (t * N) / f; the word is dropped
                    // when it is smaller than a pseudo-random number in [0, 1).
                    var keepValue = ((float)Math.Sqrt(occurrence / scaledThreshold) + 1) *
                                    scaledThreshold / occurrence;
                    nextRandom = LinearCongruentialGenerator(nextRandom);
                    if (keepValue < (nextRandom & 0xFFFF) / (float)65536)
                    {
                        continue;
                    }
                }

                sentence[sentenceLength] = wordIndex.Value;
                sentenceLength++;

                // Stop as soon as the buffer is full. The previous '>' comparison could never be
                // true, because writing past the last slot throws before it is reached.
                if (sentenceLength >= sentence.Length)
                {
                    return(true);
                }
            }

            return(reader.EndOfStream);
        }
Beispiel #11
0
        /// <summary>
        /// Fills <paramref name="sentence"/> by feeding lines from <paramref name="reader"/> to
        /// <c>HandleWords</c>, starting with any line carried over from a previous call.
        /// </summary>
        /// <param name="reader">Source of text lines.</param>
        /// <param name="wordCount">Running word count; passed by reference to <c>HandleWords</c>.</param>
        /// <param name="sentence">Buffer receiving the word indices.</param>
        /// <param name="nextRandom">Random generator state forwarded to <c>HandleWords</c>.</param>
        /// <param name="sentenceLength">Current fill level of <paramref name="sentence"/>.</param>
        /// <param name="lineThatGotCutOff">
        /// On entry, a non-empty value is processed first and then reset to <c>null</c>. On exit it
        /// holds the words of a line that no longer fitted into the remaining buffer space, if any.
        /// </param>
        /// <param name="wordCollection">Word collection forwarded to <c>HandleWords</c>.</param>
        /// <param name="thresholdForOccurrenceOfWords">Subsampling threshold forwarded to <c>HandleWords</c>.</param>
        /// <returns>The value of <paramref name="wordCount"/> after processing.</returns>
        public static long SetSentence(StreamReader reader, long wordCount, long?[] sentence,
                                       ref ulong nextRandom, ref long sentenceLength, ref string [] lineThatGotCutOff, WordCollection wordCollection, float thresholdForOccurrenceOfWords)
        {
            var done = false;

            // First consume the line left over from the previous call, if there is one.
            var hasCarryOver = lineThatGotCutOff != null && lineThatGotCutOff.Any();
            if (hasCarryOver)
            {
                done = HandleWords(reader, ref wordCount, sentence, ref nextRandom, ref sentenceLength,
                                   lineThatGotCutOff, wordCollection, thresholdForOccurrenceOfWords);
                lineThatGotCutOff = null;
            }

            while (true)
            {
                if (done)
                {
                    break;
                }

                var nextLine = reader.ReadLine();
                if (nextLine == null)
                {
                    // Input exhausted.
                    break;
                }

                var cleanedWords = WordCollection.ParseWords(nextLine).Select(WordCollection.Clean).ToArray();

                // A line larger than the entire buffer is dropped.
                if (cleanedWords.Length > sentence.Length)
                {
                    continue;
                }

                // The line does not fit into the remaining space: keep it for the next call.
                if (sentenceLength > sentence.Length - cleanedWords.Length)
                {
                    lineThatGotCutOff = cleanedWords;
                    break;
                }

                done = HandleWords(reader, ref wordCount, sentence, ref nextRandom, ref sentenceLength,
                                   cleanedWords, wordCollection, thresholdForOccurrenceOfWords);
            }

            return wordCount;
        }