/// <summary>
    /// Creates the unigram table used for random (negative) sub-sampling.
    /// Each word occupies a number of slots proportional to
    /// occurrence^power / sum(occurrence^power), so frequent words occupy
    /// more positions and are drawn more often.
    /// </summary>
    /// <param name="wordCollection">Vocabulary to build the table from.</param>
    /// <param name="tableSize">Total number of slots in the table.</param>
    /// <param name="power">Exponent applied to word counts (word2vec default 0.75).</param>
    /// <returns>
    /// An array of <paramref name="tableSize"/> word indices, or an empty array
    /// when the vocabulary is empty.
    /// </returns>
    public static int[] GetUnigramTable(this WordCollection wordCollection, int tableSize, double power = 0.75)
    {
        var uniqueWordCount = wordCollection.GetNumberOfUniqueWords();
        if (uniqueWordCount == 0)
        {
            // Empty vocabulary: nothing to sample from. Returning an empty table
            // (rather than throwing) keeps callers' "no data" paths simple.
            return Array.Empty<int>();
        }

        var table = new int[tableSize];
        var sumOfOccurenceOfWordsRaisedToPower = wordCollection.GetSumOfOccurenceOfWordsRaisedToPower(power);

        var words = wordCollection.GetWords().ToArray();
        var indexOfCurrentWord           = -1;
        var highestPositionOfWordInTable = -1;

        for (var tablePosition = 0; tablePosition < tableSize; tablePosition++)
        {
            // Advance to the next word once the current word's slot allocation is
            // used up — but never past the last word. Without the bounds guard,
            // Math.Ceiling rounding can exhaust the word list before the table is
            // full, and words[indexOfCurrentWord] would throw IndexOutOfRangeException
            // (the original clamped only AFTER the out-of-range index was used).
            if (tablePosition > highestPositionOfWordInTable && indexOfCurrentWord < uniqueWordCount - 1)
            {
                indexOfCurrentWord++;
                highestPositionOfWordInTable += (int)Math.Ceiling(
                    Math.Pow(wordCollection.GetOccurrenceOfWord(words[indexOfCurrentWord]), power)
                    / sumOfOccurenceOfWordsRaisedToPower * tableSize);
            }

            // Any remaining slots (rounding shortfall) are filled with the last word.
            table[tablePosition] = indexOfCurrentWord;
        }

        return table;
    }
    /// <summary>
    /// Builds the interior (non-leaf) nodes of the tree: a binary tree over
    /// n leaves requires exactly n - 1 interior nodes, each created from the
    /// current queue and re-inserted to keep the queue ordered.
    /// </summary>
    private static void IterateQueue(WordCollection wordCollection, List<Node> queue)
    {
        var interiorNodesCreated = 0;

        while (interiorNodesCreated < wordCollection.GetNumberOfUniqueWords() - 1)
        {
            InsertNodeInQueue(queue, CreateInteriorNode(queue, interiorNodesCreated));
            interiorNodesCreated++;
        }
    }
// Ejemplo n.º 3
// 0
    /// <summary>
    /// Prepares training: loads the vocabulary from the training file, prunes
    /// rare words, precomputes the sub-sampling unigram table, and wires up a
    /// three-layer network (vocab-sized input -> hidden embedding layer -> vocab-sized output).
    /// </summary>
    /// <param name="trainingFileLocation">Path to the training corpus file.</param>
    /// <param name="dimensions">Hidden-layer size, i.e. the embedding dimensionality.</param>
    /// <param name="minWordOccurrences">Words occurring fewer than this many times are removed.</param>
    public void Setup(string trainingFileLocation, int dimensions = 50, int minWordOccurrences = 5)
    {
        _trainingFileHandler = new FileHandler(trainingFileLocation);

        // Build the vocabulary, then prune rare words BEFORE sizing the network,
        // so layer widths match the reduced unique-word count.
        WordCollection = _trainingFileHandler.GetWordDictionaryFromFile(MaxCodeLength);
        WordCollection.RemoveWordsWithCountLessThanMinCount(minWordOccurrences);
        _table = WordCollection.GetUnigramTable(TableSize);

        // One input node per unique word; the hidden layer holds the embeddings.
        var inputLayer  = new Layer(WordCollection.GetNumberOfUniqueWords(), Array.Empty <Layer>(), ActivationFunctionType.RELU, InitialisationFunctionType.None);
        var hiddenLayer = new Layer(dimensions, new[] { inputLayer }, ActivationFunctionType.Linear, InitialisationFunctionType.RandomWeighted, false);

        // Output layer is again vocabulary-sized with sigmoid activation —
        // presumably for negative-sampling training; confirm against the trainer.
        NeuralNetwork = new Layer(WordCollection.GetNumberOfUniqueWords(), new[] { hiddenLayer }, ActivationFunctionType.Sigmoid, InitialisationFunctionType.HeEtAl, false);
        NeuralNetwork.Initialise(new Random());

        // NOTE(review): forced GC — presumably to reclaim the raw word dictionary
        // after pruning. Generally an anti-pattern in production code; consider
        // removing once memory behaviour is confirmed acceptable.
        GC.Collect();
    }
// Ejemplo n.º 4
// 0
    /// <summary>
    /// Writes one embedding per unique word to the given file. For word i, the
    /// embedding is the vector of hidden-layer weights attached to input node i.
    /// Seeks to the end of the stream, so repeated calls append rather than overwrite.
    /// </summary>
    /// <param name="embeddingsFileLocation">Destination file; created if absent.</param>
    public void WriteWordEmbeddings(string embeddingsFileLocation)
    {
        var words       = WordCollection.GetWords().ToArray();
        var hiddenLayer = NeuralNetwork.PreviousLayers[0];
        var inputLayer  = hiddenLayer.PreviousLayers[0];

        // Embedding for word i = weights from input node i into every hidden node.
        var wordEmbeddings = Enumerable
            .Range(0, WordCollection.GetNumberOfUniqueWords())
            .Select(wordIndex => new WordEmbedding(
                words[wordIndex],
                hiddenLayer.Nodes
                    .Select(hiddenNode => hiddenNode.Weights[inputLayer.Nodes[wordIndex]].Value)
                    .ToArray()))
            .ToList();

        using var fileStream = new FileStream(embeddingsFileLocation, FileMode.OpenOrCreate, FileAccess.Write);
        _ = fileStream.Seek(0, SeekOrigin.End);
        using var writer = new StreamWriter(fileStream, Encoding.UTF8);
        wordEmbeddings.WriteEmbeddingToStream(writer);
    }