/// <summary>
/// Initializes a training word with an empty set of following words and
/// assigns it the next sequential identifier.
/// </summary>
/// <param name="word">The text of the word.</param>
/// <param name="defaultEndWord">The end word stored as this word's default.</param>
internal TrainingWord(string word, WordInstance defaultEndWord)
{
    // Take the current counter value as this word's id, then advance the counter.
    _id = _nextId++;

    PostWords = new Dictionary<string, WordInstance>();
    DefaultEndWord = defaultEndWord;
    Word = word;
}
/// <summary>
/// Creates a new word trainer configured with the symbol sets used to tokenize
/// and classify training input.
/// </summary>
/// <param name="defaultEndSymbol">The default end symbol. When null or whitespace no default end symbol is created.</param>
/// <param name="endSymbols">The end symbols.</param>
/// <param name="endSeparators">The end separators.</param>
/// <param name="spacingSymbols">The spacing symbols.</param>
/// <param name="separatorSymbols">The separator symbols.</param>
/// <param name="combinatorSymbols">The combinator symbols.</param>
/// <param name="wrapperSymbols">
/// The wrapper symbols as a flat list of pairs: each even index holds a start
/// symbol and the following odd index holds its matching end symbol.
/// </param>
/// <exception cref="System.ArgumentNullException">Any of the symbol collections is null.</exception>
/// <exception cref="System.ArgumentException">
/// <paramref name="wrapperSymbols"/> has an odd number of entries, or the same
/// start symbol appears in more than one pair.
/// </exception>
public WordTrainer(string defaultEndSymbol, IReadOnlyCollection<string> endSymbols, IReadOnlyCollection<string> endSeparators, IReadOnlyCollection<string> spacingSymbols, IReadOnlyCollection<string> separatorSymbols, IReadOnlyCollection<string> combinatorSymbols, IReadOnlyList<string> wrapperSymbols)
{
    // Exception types are fully qualified so this block does not depend on a
    // particular set of using directives.
    if (endSymbols == null)
    {
        throw new System.ArgumentNullException(nameof(endSymbols));
    }

    if (endSeparators == null)
    {
        throw new System.ArgumentNullException(nameof(endSeparators));
    }

    if (spacingSymbols == null)
    {
        throw new System.ArgumentNullException(nameof(spacingSymbols));
    }

    if (separatorSymbols == null)
    {
        throw new System.ArgumentNullException(nameof(separatorSymbols));
    }

    if (combinatorSymbols == null)
    {
        throw new System.ArgumentNullException(nameof(combinatorSymbols));
    }

    if (wrapperSymbols == null)
    {
        throw new System.ArgumentNullException(nameof(wrapperSymbols));
    }

    // Wrappers are consumed two at a time below; an unpaired trailing entry
    // previously surfaced as an out-of-range failure from inside the loop.
    if (wrapperSymbols.Count % 2 != 0)
    {
        throw new System.ArgumentException("Wrapper symbols must be supplied as start/end pairs, so the list must contain an even number of entries.", nameof(wrapperSymbols));
    }

    _trainingData = new Dictionary<string, WordInstance>();
    _wrapperEnds = new Dictionary<string, string>();
    _symbolHashes = new HashSet<string>();

    _endSymbols = endSymbols;
    _endSeparatorSymbols = endSeparators;
    _spacingSymbols = spacingSymbols;
    _separatorSymbols = separatorSymbols;
    _combinatorSymbols = combinatorSymbols;
    _wrapperSymbols = wrapperSymbols;

    if (!string.IsNullOrWhiteSpace(defaultEndSymbol))
    {
        _defaultEndSymbol = new WordInstance(1, new TrainingWord(defaultEndSymbol, null), SymbolType.End);
    }

    // Map every wrapper start symbol to its end symbol.
    for (var i = 0; i + 1 < wrapperSymbols.Count; i += 2)
    {
        _wrapperEnds.Add(wrapperSymbols[i], wrapperSymbols[i + 1]);
    }

    // Collect every configured symbol into a single set.
    _symbolHashes.UnionWith(endSymbols);
    _symbolHashes.UnionWith(endSeparators);
    _symbolHashes.UnionWith(spacingSymbols);
    _symbolHashes.UnionWith(separatorSymbols);
    _symbolHashes.UnionWith(combinatorSymbols);
    _symbolHashes.UnionWith(wrapperSymbols);
}
/// <summary>
/// Adds training data. The input is tokenized by padding every configured end,
/// spacing, separator, combinator and wrapper symbol with spaces and splitting
/// on spaces; each token then has its weight increased and is recorded as a
/// possible follower of the token before it.
/// </summary>
/// <param name="input">The input. Null, empty or whitespace-only input is ignored.</param>
/// <param name="levels">The levels of training. This is 1 by default. More levels = larger weight of each word/symbol. Values below 1 train nothing.</param>
public void Train(string input, int levels = 1)
{
    if (levels <= 0 || string.IsNullOrWhiteSpace(input))
    {
        return;
    }

    // Tokenization does not depend on the level, so it is done once up front.
    var data = input.Trim();

    // NOTE(review): end separator symbols are classified below but are not padded
    // here, so they are only recognised when the input already surrounds them with
    // spaces - confirm whether that is intended.
    var paddedSymbolGroups = new IEnumerable<string>[]
    {
        _endSymbols,
        _spacingSymbols,
        _separatorSymbols,
        _combinatorSymbols,
        _wrapperSymbols
    };

    foreach (var symbolGroup in paddedSymbolGroups)
    {
        foreach (var symbol in symbolGroup)
        {
            data = data.Replace(symbol, $" {symbol} ");
        }
    }

    var words = data.Split(new[] { " " }, StringSplitOptions.RemoveEmptyEntries).Select(w => w.Trim()).ToList();

    for (var level = 0; level < levels; level++)
    {
        // Each pass starts a fresh chain, so the last token of one pass is never
        // linked to the first token of the next.
        WordInstance lastWord = null;

        foreach (var word in words)
        {
            WordInstance trainingWord;

            if (!_trainingData.TryGetValue(word, out trainingWord))
            {
                var symbolType = SymbolType.None;
                string symbolWrapperEnd = null;

                if (_endSymbols.Contains(word))
                {
                    symbolType = SymbolType.End;
                }
                else if (_endSeparatorSymbols.Contains(word))
                {
                    symbolType = SymbolType.EndSeparator;
                }
                else if (_spacingSymbols.Contains(word))
                {
                    symbolType = SymbolType.Spacing;
                }
                else if (_separatorSymbols.Contains(word))
                {
                    symbolType = SymbolType.Separator;
                }
                else if (_combinatorSymbols.Contains(word))
                {
                    symbolType = SymbolType.Combinator;
                }
                else if (_wrapperSymbols.Contains(word))
                {
                    symbolType = SymbolType.Wrapper;

                    // Only start symbols have an entry; for an end symbol this stays null.
                    _wrapperEnds.TryGetValue(word, out symbolWrapperEnd);
                }

                trainingWord = new WordInstance(1, new TrainingWord(word, _defaultEndSymbol), symbolType, symbolWrapperEnd);
                _trainingData.Add(word, trainingWord);
            }
            else if (trainingWord != null)
            {
                trainingWord.IncreaseWeight();
            }

            if (lastWord != null && lastWord.Word != null)
            {
                WordInstance followerInstance;

                if (!lastWord.Word.PostWords.TryGetValue(word, out followerInstance))
                {
                    // The transition gets its own instance (and therefore its own weight),
                    // but it must keep the wrapper end symbol: without it a wrapper reached
                    // through a transition looks like a wrapper that has no end.
                    lastWord.Word.PostWords.Add(word, new WordInstance(1, trainingWord.Word, trainingWord.SymbolType, trainingWord.SymbolWrapperEnd));
                }
                else if (followerInstance != null)
                {
                    followerInstance.IncreaseWeight();
                }
            }

            lastWord = trainingWord;
        }
    }
}
/// <summary>
/// Loads training data from a binary model file into this trainer. Nothing
/// happens when the file does not exist.
/// </summary>
/// <remarks>
/// Layout as read by this method: six 64-bit symbol counts, followed by that many
/// length-prefixed UTF-16 symbol strings for each group; then a 64-bit word count
/// and, per word, its weight, symbol type, optional wrapper end symbol, id, text
/// and the ids of the words that may follow it.
/// </remarks>
/// <param name="modelFile">The path of the model file.</param>
public void LoadModel(string modelFile)
{
    if (!File.Exists(modelFile))
    {
        return;
    }

    // The model is only read, so request read access; the default access is
    // read/write, which fails for read-only files.
    using (var fs = new FileStream(modelFile, FileMode.Open, FileAccess.Read, FileShare.Read))
    using (var br = new BinaryReader(fs))
    {
        // Symbols: end, end separator, spacing, separator, combinator, wrapper.
        const int symbolGroupCount = 6;
        var symbolCounts = new ulong[symbolGroupCount];
        for (var group = 0; group < symbolGroupCount; group++)
        {
            symbolCounts[group] = br.ReadUInt64();
        }

        // The stored symbols are not applied to this trainer (they never were), but
        // they still have to be consumed to reach the training data.
        foreach (var symbolCount in symbolCounts)
        {
            for (ulong i = 0; i < symbolCount; i++)
            {
                var symbolLength = br.ReadUInt32();
                br.ReadBytes((int)symbolLength);
            }
        }

        // Training Data
        var trainingDataCount = br.ReadUInt64();
        var trainingData = new Dictionary<ulong, WordInstance>();
        var trainingIds = new Dictionary<ulong, List<ulong>>();

        for (ulong i = 0; i < trainingDataCount; i++)
        {
            var weight = br.ReadInt32();
            var symbolType = (SymbolType)br.ReadInt32();

            string symbolWrapperEnd = null;
            var hasSymbolWrapperEnd = br.ReadByte() == 1;
            if (hasSymbolWrapperEnd)
            {
                var symbolWrapperEndCount = br.ReadUInt32();
                var symbolWrapperBuffer = br.ReadBytes((int)symbolWrapperEndCount);
                symbolWrapperEnd = Encoding.Unicode.GetString(symbolWrapperBuffer);
            }

            var wordId = br.ReadUInt64();
            var wordLength = br.ReadUInt32();
            var wordBuffer = br.ReadBytes((int)wordLength);

            // Keep the word text separate from the wrapper end read above; assigning
            // both from this buffer replaced every wrapper end with the word itself.
            var word = Encoding.Unicode.GetString(wordBuffer);

            var postWordCount = br.ReadUInt64();
            var postWordIds = new List<ulong>();
            for (ulong j = 0; j < postWordCount; j++)
            {
                postWordIds.Add(br.ReadUInt64());
            }

            var trainingWord = new WordInstance(weight, new TrainingWord(word, _defaultEndSymbol), symbolType, symbolWrapperEnd);
            trainingData.Add(wordId, trainingWord);
            trainingIds.Add(wordId, postWordIds);
        }

        // Resolve follower ids now that every word has been created.
        foreach (var instance in trainingData)
        {
            var loadedWord = instance.Value;

            foreach (var id in trainingIds[instance.Key])
            {
                WordInstance follower;
                if (trainingData.TryGetValue(id, out follower))
                {
                    // Indexer assignment tolerates a repeated follower instead of throwing.
                    loadedWord.Word.PostWords[follower.Word.Word] = follower;
                }
            }

            // A word that was already trained or loaded is replaced by the loaded
            // entry rather than aborting the load part-way through.
            _trainingData[loadedWord.Word.Word] = loadedWord;
        }
    }
}
/// <summary>
/// Advances the builder by one word or symbol and appends it to the sequence.
/// </summary>
/// <param name="selectUntilEndSymbol">When true, reaching an end symbol finishes the sequence.</param>
/// <param name="includeEndSymbol">When true, the finishing end symbol is appended to the sequence.</param>
/// <returns>This builder, so calls can be chained.</returns>
public WordBuilder Next(bool selectUntilEndSymbol = true, bool includeEndSymbol = true)
{
    // Keep selecting until a usable word is found, the sequence ends, or the
    // retry budget is exhausted.
    while (true)
    {
        if (_tries > 10)
        {
            EndOfSequence = true;
            _wordSequence.Clear();
            return this;
        }

        if (EndOfSequence)
        {
            return this;
        }

        // The first step asks the trainer for a start word; later steps follow
        // the current word.
        if (_currentWord == null)
        {
            _currentWord = _trainer.SelectStartWord();
        }
        else
        {
            _currentWord = _currentWord.Word.SelectWord();
        }

        if (_currentWord == null)
        {
            EndOfSequence = true;
            return this;
        }

        var wrapperWithoutEnd = _currentWord.SymbolType == SymbolType.Wrapper
            && string.IsNullOrWhiteSpace(_currentWord.SymbolWrapperEnd);
        if (!wrapperWithoutEnd)
        {
            break;
        }

        // A wrapper that has no end symbol is skipped and counted as a try.
        _tries++;
    }

    if (selectUntilEndSymbol && _currentWord.SymbolType == SymbolType.End)
    {
        var wrapperIsOpen = _wrapperInstance != null;

        // While a wrapper is open the end symbol is appended even if the caller
        // asked to leave it out, and the open wrapper instance is appended after it.
        if (includeEndSymbol || wrapperIsOpen)
        {
            _wordSequence.Add(_currentWord);
        }

        if (wrapperIsOpen)
        {
            _wordSequence.Add(_wrapperInstance);
        }

        EndOfSequence = true;
        return this;
    }

    if (_currentWord.SymbolType == SymbolType.Wrapper)
    {
        var closesOpenWrapper = _wrapperInstance != null
            && _wrapperInstance.SymbolWrapperEnd == _currentWord.Word.Word;

        // Either this symbol closes the wrapper being tracked, or it becomes the
        // wrapper being tracked.
        _wrapperInstance = closesOpenWrapper ? null : _currentWord;
    }

    _wordSequence.Add(_currentWord);
    return this;
}
/// <summary>
/// Renders the builder's word sequence as text, trimming sequences that have too
/// many plain words and regenerating sequences that have too few.
/// </summary>
/// <param name="selectUntilEndSymbol">Passed to the generator when the sequence has to be regenerated.</param>
/// <param name="includeEndSymbol">Passed to the generator when the sequence has to be regenerated.</param>
/// <returns>The constructed string, or null once more than 10 tries have been used.</returns>
public string ToString(bool selectUntilEndSymbol, bool includeEndSymbol)
{
    if (_tries > 10)
    {
        return null;
    }

    var plainWordCount = _wordSequence.Count(w => w.SymbolType == SymbolType.None);
    if (plainWordCount > _maxWords)
    {
        _maxWords = Math.Min(_wordSequence.Count, _maxWords);

        // Within the first _maxWords entries, cut the sequence after the first
        // suitable stopping point, if there is one.
        for (var index = 0; index < _maxWords; index++)
        {
            var current = _wordSequence[index];
            var follower = index < (_maxWords - 1) ? _wordSequence[index + 1] : null;

            // Stop after a wrapper that has no end symbol...
            var cutHere = current.SymbolType == SymbolType.Wrapper
                && string.IsNullOrWhiteSpace(current.SymbolWrapperEnd);

            // ...or after an end symbol, unless it is followed by such a wrapper.
            if (!cutHere && current.SymbolType == SymbolType.End)
            {
                cutHere = follower == null
                    || !(follower.SymbolType == SymbolType.Wrapper && string.IsNullOrWhiteSpace(follower.SymbolWrapperEnd));
            }

            if (cutHere)
            {
                _wordSequence = _wordSequence.Take(index + 1).ToList();
                break;
            }
        }
    }

    var remainingPlainWords = _wordSequence.Count(w => w.SymbolType == SymbolType.None);
    if (remainingPlainWords < _minWords)
    {
        // Too short: reset the builder state, generate again and retry.
        _wordSequence.Clear();
        EndOfSequence = false;
        _currentWord = null;
        _wrapperInstance = null;
        Generate(selectUntilEndSymbol, includeEndSymbol);
        _tries++;
        return ToString(selectUntilEndSymbol, includeEndSymbol);
    }

    var text = new StringBuilder();
    var total = _wordSequence.Count;

    for (var index = 0; index < total; index++)
    {
        var current = _wordSequence[index];
        var follower = index < (total - 1) ? _wordSequence[index + 1] : null;
        var currentType = current.SymbolType;
        var token = current.Word.Word;

        // Spacing symbols are surrounded by spaces on both sides.
        if (currentType == SymbolType.Spacing)
        {
            text.Append(" ").Append(token).Append(" ");
            continue;
        }

        // Combinators, separators and wrappers never get a trailing space.
        var omitTrailingSpace = currentType == SymbolType.Combinator
            || currentType == SymbolType.Separator
            || currentType == SymbolType.Wrapper;

        // Everything else loses its trailing space when the next entry attaches to it.
        if (!omitTrailingSpace && follower != null)
        {
            var followerType = follower.SymbolType;
            omitTrailingSpace = followerType == SymbolType.End
                || followerType == SymbolType.EndSeparator
                || followerType == SymbolType.Combinator
                || (followerType == SymbolType.Wrapper && currentType == SymbolType.End);
        }

        text.Append(token);
        if (!omitTrailingSpace)
        {
            text.Append(" ");
        }
    }

    return FirstToUpper(text.ToString().Trim());
}