Beispiel #1
0
        /// <summary>
        /// Creates a new training word.
        /// </summary>
        /// <param name="word">The text of the word.</param>
        /// <param name="defaultEndWord">The default end word for the training word.</param>
        internal TrainingWord(string word, WordInstance defaultEndWord)
        {
            this.Word           = word;
            this.DefaultEndWord = defaultEndWord;

            // Starts without any recorded follow-up words.
            this.PostWords = new Dictionary<string, WordInstance>();

            // Take the current counter value as this word's id, then advance the counter.
            _id = _nextId++;
        }
Beispiel #2
0
        /// <summary>
        /// Creates a new word trainer.
        /// </summary>
        /// <param name="defaultEndSymbol">The default end symbol. When null, empty or whitespace no default end symbol is created.</param>
        /// <param name="endSymbols">The end symbols.</param>
        /// <param name="endSeparators">The end separators.</param>
        /// <param name="spacingSymbols">The spacing symbols.</param>
        /// <param name="separatorSymbols">The separator symbols.</param>
        /// <param name="combinatorSymbols">The combinator symbols.</param>
        /// <param name="wrapperSymbols">The wrapper symbols, given as consecutive pairs: the symbol at an even index is mapped to the symbol that follows it.</param>
        /// <exception cref="ArgumentNullException">Thrown when any of the symbol collections is <c>null</c>.</exception>
        /// <exception cref="ArgumentException">Thrown when <paramref name="wrapperSymbols"/> contains an odd number of entries, or the same first symbol of a pair twice.</exception>
        public WordTrainer(string defaultEndSymbol, IReadOnlyCollection <string> endSymbols, IReadOnlyCollection <string> endSeparators, IReadOnlyCollection <string> spacingSymbols, IReadOnlyCollection <string> separatorSymbols, IReadOnlyCollection <string> combinatorSymbols, IReadOnlyList <string> wrapperSymbols)
        {
            // Fail fast with a descriptive exception instead of a NullReferenceException further down.
            if (endSymbols == null)
            {
                throw new ArgumentNullException(nameof(endSymbols));
            }
            if (endSeparators == null)
            {
                throw new ArgumentNullException(nameof(endSeparators));
            }
            if (spacingSymbols == null)
            {
                throw new ArgumentNullException(nameof(spacingSymbols));
            }
            if (separatorSymbols == null)
            {
                throw new ArgumentNullException(nameof(separatorSymbols));
            }
            if (combinatorSymbols == null)
            {
                throw new ArgumentNullException(nameof(combinatorSymbols));
            }
            if (wrapperSymbols == null)
            {
                throw new ArgumentNullException(nameof(wrapperSymbols));
            }

            // The list is consumed two entries at a time, so a trailing unpaired symbol is a configuration error.
            if (wrapperSymbols.Count % 2 != 0)
            {
                throw new ArgumentException("Wrapper symbols must be supplied in pairs (a symbol followed by its end symbol).", nameof(wrapperSymbols));
            }

            _trainingData = new Dictionary <string, WordInstance>();
            _wrapperEnds  = new Dictionary <string, string>();
            _symbolHashes = new HashSet <string>();

            _endSymbols          = endSymbols;
            _endSeparatorSymbols = endSeparators;
            _spacingSymbols      = spacingSymbols;
            _separatorSymbols    = separatorSymbols;
            _combinatorSymbols   = combinatorSymbols;
            _wrapperSymbols      = wrapperSymbols;

            if (!string.IsNullOrWhiteSpace(defaultEndSymbol))
            {
                _defaultEndSymbol = new WordInstance(1, new TrainingWord(defaultEndSymbol, null), SymbolType.End);
            }

            // Map the first symbol of every pair to the symbol that follows it.
            for (var i = 0; i + 1 < wrapperSymbols.Count; i += 2)
            {
                _wrapperEnds.Add(wrapperSymbols[i], wrapperSymbols[i + 1]);
            }

            // Collect every known symbol, regardless of its kind, into one set.
            _symbolHashes.UnionWith(endSymbols);
            _symbolHashes.UnionWith(endSeparators);
            _symbolHashes.UnionWith(spacingSymbols);
            _symbolHashes.UnionWith(separatorSymbols);
            _symbolHashes.UnionWith(combinatorSymbols);
            _symbolHashes.UnionWith(wrapperSymbols);
        }
Beispiel #3
0
        /// <summary>
        /// Adds training data.
        /// </summary>
        /// <remarks>
        /// The input is split into tokens by surrounding every end, spacing, separator, combinator and
        /// wrapper symbol with spaces and then splitting on spaces. Each token is added to the training
        /// data (or has its weight increased when already known) and is recorded as a follow-up word of
        /// the token that preceded it.
        /// </remarks>
        /// <param name="input">The input. Must not be <c>null</c>.</param>
        /// <param name="levels">The levels of training. This is 1 by default. More levels = larger weight of each word/symbol. Values below 1 train nothing.</param>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="input"/> is <c>null</c> and <paramref name="levels"/> is at least 1.</exception>
        public void Train(string input, int levels = 1)
        {
            if (levels < 1)
            {
                return;
            }

            if (input == null)
            {
                throw new ArgumentNullException(nameof(input));
            }

            // Tokenizing depends only on the input and the symbol tables, so do it once for all levels.
            var data = input.Trim();

            // NOTE(review): end separator symbols are classified below but are not padded here like the
            // other symbol groups, so they are only recognised when already surrounded by spaces - confirm
            // whether that is intended.
            var paddedSymbolGroups = new IEnumerable <string>[]
            {
                _endSymbols,
                _spacingSymbols,
                _separatorSymbols,
                _combinatorSymbols,
                _wrapperSymbols
            };

            foreach (var symbols in paddedSymbolGroups)
            {
                foreach (var symbol in symbols)
                {
                    data = data.Replace(symbol, $" {symbol} ");
                }
            }

            // Trimming can leave an empty token behind (e.g. a lone tab or line break); drop those so an
            // empty string is never trained as a word.
            var words = data.Split(new[] { " " }, StringSplitOptions.RemoveEmptyEntries)
                        .Select(w => w.Trim())
                        .Where(w => w.Length > 0)
                        .ToList();

            for (var level = 0; level < levels; level++)
            {
                WordInstance lastWord = null;

                foreach (var word in words)
                {
                    WordInstance trainingWord;
                    if (!_trainingData.TryGetValue(word, out trainingWord))
                    {
                        var    symbolType       = SymbolType.None;
                        string symbolWrapperEnd = null;

                        if (_endSymbols.Contains(word))
                        {
                            symbolType = SymbolType.End;
                        }
                        else if (_endSeparatorSymbols.Contains(word))
                        {
                            symbolType = SymbolType.EndSeparator;
                        }
                        else if (_spacingSymbols.Contains(word))
                        {
                            symbolType = SymbolType.Spacing;
                        }
                        else if (_separatorSymbols.Contains(word))
                        {
                            symbolType = SymbolType.Separator;
                        }
                        else if (_combinatorSymbols.Contains(word))
                        {
                            symbolType = SymbolType.Combinator;
                        }
                        else if (_wrapperSymbols.Contains(word))
                        {
                            symbolType = SymbolType.Wrapper;

                            // Stays null for wrapper symbols that have no mapped end symbol.
                            _wrapperEnds.TryGetValue(word, out symbolWrapperEnd);
                        }

                        trainingWord = new WordInstance(1, new TrainingWord(word, _defaultEndSymbol), symbolType, symbolWrapperEnd);

                        _trainingData.Add(word, trainingWord);
                    }
                    else if (trainingWord != null)
                    {
                        trainingWord.IncreaseWeight();
                    }

                    // Record the current token as a follow-up of the previous one. A null entry in the
                    // training data is skipped here as well instead of being dereferenced.
                    if (lastWord != null && lastWord.Word != null && trainingWord != null)
                    {
                        WordInstance postWordInstance;

                        if (!lastWord.Word.PostWords.TryGetValue(word, out postWordInstance))
                        {
                            // Carry the wrapper end over so the follow-up instance describes the same
                            // wrapper as the instance stored in the training data.
                            lastWord.Word.PostWords.Add(word, new WordInstance(1, trainingWord.Word, trainingWord.SymbolType, trainingWord.SymbolWrapperEnd));
                        }
                        else if (postWordInstance != null)
                        {
                            postWordInstance.IncreaseWeight();
                        }
                    }

                    lastWord = trainingWord;
                }
            }
        }
Beispiel #4
0
        /// <summary>
        /// Loads a binary model file and adds the words it contains to the training data.
        /// </summary>
        /// <remarks>
        /// Nothing happens when the file does not exist. The file starts with six symbol tables; they are
        /// read only to advance the stream and their contents are not stored by this method. The word
        /// records follow: weight, symbol type, optional wrapper end, id, word text and the ids of the
        /// follow-up words. Follow-up ids are resolved once all records have been read.
        /// </remarks>
        /// <param name="modelFile">The path of the model file.</param>
        /// <exception cref="ArgumentException">Thrown by the underlying dictionaries when a word or id from the file is already present.</exception>
        /// <exception cref="EndOfStreamException">Thrown when the file ends in the middle of a fixed-size value.</exception>
        public void LoadModel(string modelFile)
        {
            if (!File.Exists(modelFile))
            {
                return;
            }

            // Open for reading only so that read-only model files can be loaded as well.
            using (var stream = File.OpenRead(modelFile))
            using (var reader = new BinaryReader(stream))
            {
                // Symbols: six counts up front (end, end separator, spacing, separator, combinator,
                // wrapper), followed by the six tables in the same order.
                var symbolTableCounts = new ulong[6];

                for (var table = 0; table < symbolTableCounts.Length; table++)
                {
                    symbolTableCounts[table] = reader.ReadUInt64();
                }

                foreach (var symbolCount in symbolTableCounts)
                {
                    // The counter is 64-bit like the stored count, so it cannot wrap around.
                    for (ulong i = 0; i < symbolCount; i++)
                    {
                        ReadModelString(reader);
                    }
                }

                // Training Data
                var trainingDataCount = reader.ReadUInt64();

                var instancesById   = new Dictionary <ulong, WordInstance>();
                var postWordIdsById = new Dictionary <ulong, List <ulong> >();

                for (ulong i = 0; i < trainingDataCount; i++)
                {
                    var    weight           = reader.ReadInt32();
                    var    symbolType       = (SymbolType)reader.ReadInt32();
                    string symbolWrapperEnd = null;

                    // A flag byte of 1 means a wrapper end string follows.
                    if (reader.ReadByte() == 1)
                    {
                        symbolWrapperEnd = ReadModelString(reader);
                    }

                    var wordId = reader.ReadUInt64();

                    // Read into its own variable; the wrapper end read above must not be overwritten.
                    var word = ReadModelString(reader);

                    var postWordCount = reader.ReadUInt64();
                    var postWordIds   = new List <ulong>();

                    for (ulong j = 0; j < postWordCount; j++)
                    {
                        postWordIds.Add(reader.ReadUInt64());
                    }

                    var trainingWord = new WordInstance(weight, new TrainingWord(word, _defaultEndSymbol), symbolType, symbolWrapperEnd);

                    instancesById.Add(wordId, trainingWord);
                    postWordIdsById.Add(wordId, postWordIds);
                }

                // Second pass: every record is known now, so follow-up ids can be resolved to instances.
                foreach (var entry in instancesById)
                {
                    var instance = entry.Value;

                    foreach (var postWordId in postWordIdsById[entry.Key])
                    {
                        WordInstance postWord;

                        // Ids that do not refer to a record in this file are ignored.
                        if (instancesById.TryGetValue(postWordId, out postWord))
                        {
                            instance.Word.PostWords.Add(postWord.Word.Word, postWord);
                        }
                    }

                    _trainingData.Add(instance.Word.Word, instance);
                }
            }
        }

        /// <summary>
        /// Reads one string stored as an unsigned 32-bit byte length followed by that many bytes of UTF-16 text.
        /// </summary>
        /// <param name="reader">The reader positioned at the length prefix.</param>
        /// <returns>The decoded string.</returns>
        private static string ReadModelString(BinaryReader reader)
        {
            var byteLength = reader.ReadUInt32();

            return Encoding.Unicode.GetString(reader.ReadBytes((int)byteLength));
        }
Beispiel #5
0
        /// <summary>
        /// Appends the next word or symbol to the sequence.
        /// </summary>
        /// <remarks>
        /// The first call picks a start word from the trainer; later calls pick a follow-up of the
        /// current word. The sequence is marked as ended when no word can be selected, when an end
        /// symbol is reached while <paramref name="selectUntilEndSymbol"/> is set, or - after clearing
        /// the sequence - when more than 10 tries have been used.
        /// </remarks>
        /// <param name="selectUntilEndSymbol">Boolean determining whether it should select until the end symbol only.</param>
        /// <param name="includeEndSymbol">Boolean determining whether it should include the end symbol too.</param>
        /// <returns>The word builder allowing for chaining.</returns>
        public WordBuilder Next(bool selectUntilEndSymbol = true, bool includeEndSymbol = true)
        {
            // Too many retries: give up and discard what has been built so far.
            if (_tries > 10)
            {
                EndOfSequence = true;
                _wordSequence.Clear();

                return this;
            }

            if (EndOfSequence)
            {
                return this;
            }

            // Either start a fresh sequence or continue from the current word.
            if (_currentWord == null)
            {
                _currentWord = _trainer.SelectStartWord();
            }
            else
            {
                _currentWord = _currentWord.Word.SelectWord();
            }

            // Nothing could be selected, so the sequence ends here.
            if (_currentWord == null)
            {
                EndOfSequence = true;

                return this;
            }

            var isWrapper = _currentWord.SymbolType == SymbolType.Wrapper;

            // A wrapper without an end symbol is not added; count a try and select again.
            if (isWrapper && string.IsNullOrWhiteSpace(_currentWord.SymbolWrapperEnd))
            {
                _tries++;

                return Next(selectUntilEndSymbol, includeEndSymbol);
            }

            var reachedEnd = selectUntilEndSymbol && _currentWord.SymbolType == SymbolType.End;

            if (!reachedEnd)
            {
                if (isWrapper)
                {
                    // A wrapper equal to the tracked wrapper's end symbol clears the tracked wrapper;
                    // any other wrapper becomes the tracked one.
                    var matchesTrackedEnd = _wrapperInstance != null && _wrapperInstance.SymbolWrapperEnd == _currentWord.Word.Word;

                    _wrapperInstance = matchesTrackedEnd ? null : _currentWord;
                }

                _wordSequence.Add(_currentWord);

                return this;
            }

            var hasTrackedWrapper = _wrapperInstance != null;

            // The end symbol is kept when requested, and always when a wrapper is still tracked.
            if (includeEndSymbol || hasTrackedWrapper)
            {
                _wordSequence.Add(_currentWord);
            }

            // NOTE(review): this appends the tracked wrapper instance itself rather than an instance
            // for its SymbolWrapperEnd - confirm that is intended for wrappers whose end differs.
            if (hasTrackedWrapper)
            {
                _wordSequence.Add(_wrapperInstance);
            }

            EndOfSequence = true;

            return this;
        }
Beispiel #6
0
        /// <summary>
        /// Converts the sequence of the builder to a constructed string.
        /// </summary>
        /// <remarks>
        /// When the sequence holds more plain words than the maximum, it is cut after the first suitable
        /// entry found among the leading entries. When it holds fewer plain words than the minimum, the
        /// sequence is generated again and the conversion is retried.
        /// </remarks>
        /// <param name="selectUntilEndSymbol">Boolean determining whether it should select until the end symbol only.</param>
        /// <param name="includeEndSymbol">Boolean determining whether it should include the end symbol too.</param>
        /// <returns>Returns the constructed string, or <c>null</c> once more than 10 tries have been used.</returns>
        public string ToString(bool selectUntilEndSymbol, bool includeEndSymbol)
        {
            if (_tries > 10)
            {
                return null;
            }

            var plainWordCount = _wordSequence.Count(w => w.SymbolType == SymbolType.None);

            if (plainWordCount > _maxWords)
            {
                _maxWords = Math.Min(_wordSequence.Count, _maxWords);

                for (var index = 0; index < _maxWords; index++)
                {
                    var current   = _wordSequence[index];
                    var following = index < (_maxWords - 1) ? _wordSequence[index + 1] : null;

                    // Cut after a wrapper that has no end symbol ...
                    var cutHere = current.SymbolType == SymbolType.Wrapper && string.IsNullOrWhiteSpace(current.SymbolWrapperEnd);

                    // ... or after an end symbol, unless it is directly followed by such a wrapper.
                    if (!cutHere && current.SymbolType == SymbolType.End)
                    {
                        var followedByEndlessWrapper = following != null
                                                       && following.SymbolType == SymbolType.Wrapper
                                                       && string.IsNullOrWhiteSpace(following.SymbolWrapperEnd);

                        cutHere = !followedByEndlessWrapper;
                    }

                    if (cutHere)
                    {
                        _wordSequence = _wordSequence.Take(index + 1).ToList();
                        break;
                    }
                }
            }

            // Too few plain words: reset the builder state, generate again and retry.
            if (_wordSequence.Count(w => w.SymbolType == SymbolType.None) < _minWords)
            {
                _wordSequence.Clear();
                EndOfSequence    = false;
                _currentWord     = null;
                _wrapperInstance = null;

                Generate(selectUntilEndSymbol, includeEndSymbol);

                _tries++;

                return ToString(selectUntilEndSymbol, includeEndSymbol);
            }

            var text     = new StringBuilder();
            var lastSlot = _wordSequence.Count - 1;

            for (var index = 0; index < _wordSequence.Count; index++)
            {
                var current   = _wordSequence[index];
                var following = index < lastSlot ? _wordSequence[index + 1] : null;
                var kind      = current.SymbolType;

                if (kind == SymbolType.Spacing)
                {
                    // Spacing symbols get a space on both sides.
                    text.Append(" ").Append(current.Word.Word).Append(" ");
                }
                else if (kind == SymbolType.Combinator || kind == SymbolType.Separator || kind == SymbolType.Wrapper)
                {
                    // These symbols are written without any surrounding space.
                    text.Append(current.Word.Word);
                }
                else
                {
                    // Words, end symbols and end separators are followed by a space, except directly
                    // before an end symbol, end separator or combinator, or when an end symbol is
                    // followed by a wrapper.
                    var attachesToNext = following != null
                                         && (following.SymbolType == SymbolType.End
                                             || following.SymbolType == SymbolType.EndSeparator
                                             || following.SymbolType == SymbolType.Combinator
                                             || (following.SymbolType == SymbolType.Wrapper && kind == SymbolType.End));

                    text.Append(current.Word.Word);

                    if (!attachesToNext)
                    {
                        text.Append(" ");
                    }
                }
            }

            return FirstToUpper(text.ToString().Trim());
        }