/// <summary>
/// Processes a single sentence and returns the text fragments produced by the n-gram connector.
/// </summary>
/// <param name="sentence">Sentence to process; its <c>Text</c> and <c>EndMarks</c> members are read.</param>
/// <param name="type">N-gram type; its integer value is used as the n-gram length.</param>
/// <returns>
/// The fragments returned by <c>_ngramConnector.AnalyzeNgramsVariants</c>, with the sentence end marks
/// appended to the last fragment. If the connector returns no fragments, the end marks are returned
/// as the only fragment.
/// </returns>
private List<string> AnalyzeSentence(Sentence sentence, NgramType type)
{
    var length = (int)type;

    // Candidate words for the sentence, narrowed down by the dictionary.
    var combinationWords = CreateCombinationsWordList(sentence.Text);
    combinationWords = _dictionary.CheckWords(combinationWords);

    var ngramVariants = new List<NGramVariants>(
        CreateNgramVariantsList(sentence.Text, combinationWords, length));

    // Only entries with more than one variant are looked up in the n-gram data.
    var tmp = ngramVariants.Where(x => x.NgramVariants.Count > 1).ToList();
    var ngrams = GetAllData(tmp, type).ToList();

    foreach (var variant in ngramVariants)
    {
        variant.UpdateNGramsVariants(ngrams);
        variant.RestoreUpperLettersInVariants();
        // The flag tells the variant whether it was part of the looked-up (multi-variant) set.
        variant.CountProbability(tmp.Contains(variant));
    }

    var result = new List<string>(
        _ngramConnector.AnalyzeNgramsVariants(ngramVariants, length, sentence.Text.Count));

    if (result.Count == 0)
    {
        // Nothing to attach the end marks to (e.g. an empty sentence). Previously this case
        // failed with an ArgumentOutOfRangeException on index -1.
        result.Add(string.Empty + sentence.EndMarks);
    }
    else
    {
        result[result.Count - 1] = result[result.Count - 1] + sentence.EndMarks;
    }

    return result;
}
/// <summary>
/// Builds a SELECT query that matches rows whose Word1..WordN columns equal the given words,
/// where N is the integer value of <paramref name="ngramType"/>.
/// </summary>
/// <param name="ngramType">Type of n-gram; its integer value is the number of compared words.</param>
/// <param name="wordList">Words to compare; must contain at least N elements (only the first N are used).</param>
/// <returns>
/// Query string (without a terminating semicolon).
/// </returns>
/// <exception cref="ArgumentException"><paramref name="wordList"/> is null or has fewer than N elements.</exception>
/// <inheritdoc />
public string GetTheSameNgramsFromTable(NgramType ngramType, List<string> wordList)
{
    var number = (int)ngramType;

    var hasEnoughWords = wordList != null && wordList.Count >= number;
    if (!hasEnoughWords)
    {
        throw new ArgumentException("List<string> 'wordList' has wrong size");
    }

    // The table is picked from the first word (see GetIndexOfNames) and the n-gram size.
    var index = GetIndexOfNames(wordList[0]);
    var query = $"SELECT * FROM `{_dbTableDbTableName[number - 1]}[{Names[index]}]` WHERE";

    for (var position = 1; position <= number; ++position)
    {
        // Conditions after the first one are chained with AND.
        var separator = position == 1 ? " " : " AND ";
        query += $"{separator}Word{position}='{wordList[position - 1].ChangeSpecialCharacters()}'";
    }

    return query;
}
/// <summary>
/// Collects the distinct n-grams of all given variants and queries the database for each of them.
/// </summary>
/// <param name="wordLists">Variant groups whose <c>NgramVariants</c> entries supply the n-grams to look up.</param>
/// <param name="type">N-gram type passed to the query provider.</param>
/// <returns>All n-grams parsed from the returned rows; rows that cannot be parsed are skipped.</returns>
/// <remarks>
/// The database connection is closed in a <c>finally</c> block so that it is released even when
/// a query or row conversion throws.
/// </remarks>
private IEnumerable<NGram> GetAllData(List<NGramVariants> wordLists, NgramType type)
{
    var ngramsList = new List<NGram>();

    var ngramVariants = new List<NGram>();
    foreach (var item in wordLists)
    {
        foreach (var elem in item.NgramVariants)
        {
            ngramVariants.Add(elem.Ngram);
        }
    }

    ngramVariants = ngramVariants.Distinct().ToList();

    // Drops the first n-gram that contains an empty word.
    // NOTE(review): only one such n-gram is removed; confirm that at most one can occur after Distinct().
    var index = ngramVariants.FindIndex(x => x.WordsList.Contains(""));
    if (index != -1)
    {
        ngramVariants.RemoveAt(index);
    }

    _db.ConnectToDb();
    try
    {
        foreach (var elem in ngramVariants)
        {
            // Query with a copy so the optional ignorer does not alter the original n-gram.
            var tmp = new NGram(elem);
            if (_iManager != null)
            {
                tmp = _iManager.Remove(tmp);
            }

            var data = _db.ExecuteSqlCommand(_queryProvider.GetTheSameNgramsFromTable(type, tmp.WordsList));
            var rows = data.Tables[0].Rows;

            for (var i = 0; i < rows.Count; ++i)
            {
                var dataRow = rows[i].ItemArray;
                var ngram = StringArrayToNgram(dataRow.Select(a => a.ToString()).ToArray());
                if (ngram == null)
                {
                    continue;
                }

                var n = (NGram)ngram;
                if (_iManager != null)
                {
                    // Restore against the original (unmodified) n-gram.
                    n = _iManager.Restore(elem, n);
                }

                ngramsList.Add(n);
            }
        }
    }
    finally
    {
        _db.Disconnect();
    }

    return ngramsList;
}
/// <inheritdoc />
/// <summary>
/// Builds the query for n-grams similar to the given words by delegating to <c>QueryCreator</c>.
/// </summary>
/// <param name="ngramType">Type of the n-gram; its integer value N is the n-gram size.</param>
/// <param name="wordList">Words to compare; must contain at least N - 1 elements.</param>
/// <returns>The query string produced by <c>QueryCreator</c>.</returns>
/// <exception cref="T:System.ArgumentException">
/// <paramref name="wordList"/> is null or has fewer than N - 1 elements.
/// NOTE(review): no Unigram check is visible in this method; presumably <c>QueryCreator</c>
/// rejects it - confirm.
/// </exception>
public string GetSimilarNgramsFromTable(NgramType ngramType, List<string> wordList)
{
    var ngramSize = (int)ngramType;
    var comparedWordsCount = ngramSize - 1;

    var hasEnoughWords = wordList != null && wordList.Count >= comparedWordsCount;
    if (!hasEnoughWords)
    {
        throw new ArgumentException("List<string> 'wordList' has wrong size");
    }

    return QueryCreator(ngramSize, comparedWordsCount, wordList);
}
/// <summary>
/// Trims leading entries from <paramref name="context"/> according to the n-gram type
/// and returns the remaining entries as an array.
/// </summary>
/// <param name="context">Context words; the list itself is modified when entries are trimmed.</param>
/// <param name="type">
/// Digram: a 3-element context loses its first entry.
/// Unigram: a 3-element context loses its first two entries, a 2-element context its first entry.
/// Any other type or size leaves the context untouched.
/// </param>
/// <returns>The (possibly trimmed) context as a new array.</returns>
private string[] GetLeftContext(List<string> context, NgramType type)
{
    // Number of leading entries to drop.
    var surplus = 0;

    if (type == NgramType.Digram)
    {
        if (context.Count == 3)
        {
            surplus = 1;
        }
    }
    else if (type == NgramType.Unigram)
    {
        if (context.Count == 3)
        {
            surplus = 2;
        }
        else if (context.Count == 2)
        {
            surplus = 1;
        }
    }

    if (surplus > 0)
    {
        context.RemoveRange(0, surplus);
    }

    return context.ToArray();
}
// public List<string> WordsCombinations { get; private set; }
#endregion

#region CONSTRUCTORS
/// <summary>
/// Initializes a new instance of the <see cref="DiacriticMarksRestorer"/> class.
/// </summary>
/// <param name="diacriticAdder">The diacritic adder; a new <c>DiacriticMarksAdder</c> is used when null.</param>
/// <param name="dictionary">
/// The dictionary; when null, one is built from the lines of the <c>Resources/dictionary</c> file
/// (path relative to the current working directory), each line becoming a key with value 0.
/// </param>
/// <param name="splitter">The fragments splitter; stored as given (may be null).</param>
/// <param name="iManager">The characters ignorer; stored as given (may be null).</param>
/// <param name="ngramConnector">The n-grams connector; a new <c>Hierarchy</c> is used when null.</param>
public DiacriticMarksRestorer(ILetterChanger diacriticAdder = null, IDictionary dictionary = null,
    IFragmentsSplitter splitter = null, ICharactersIgnorer iManager = null, INgramsConnector ngramConnector = null)
{
    _diacriticAdder = diacriticAdder ?? new DiacriticMarksAdder();

    if (dictionary == null)
    {
        // Path.Combine keeps the default dictionary reachable on platforms whose
        // directory separator is not a backslash.
        var lines = File.ReadAllLines(Path.Combine("Resources", "dictionary"));
        var words = new Dictionary<string, int>();
        foreach (var line in lines)
        {
            // Indexer assignment tolerates duplicated lines; Add() would throw on them.
            words[line] = 0;
        }

        dictionary = new Dict(words);
    }

    _dictionary = dictionary;
    _splitter = splitter;
    _ngramConnector = ngramConnector ?? new Hierarchy();
    _iManager = iManager;
    _ngramType = NgramType.Bigram;
}
/// <summary>
/// Builds a SELECT query for n-grams whose first N - 1 words equal <paramref name="wordList"/>
/// and whose last word is any of <paramref name="combinations"/> (N is the integer value of
/// <paramref name="ngramType"/>).
/// </summary>
/// <param name="ngramType">Type of the n-gram; must be larger than a unigram.</param>
/// <param name="wordList">The fixed leading words; must contain exactly N - 1 elements.</param>
/// <param name="combinations">Possible last words; must contain at least one element.</param>
/// <returns>The query string, terminated with a semicolon.</returns>
/// <exception cref="ArgumentException">
/// List(string) 'wordList' has wrong size
/// or
/// List(string) 'combinations' has wrong size
/// or
/// NgramType 'ngramType' cannot be an Unigram
/// </exception>
public string GetMultiNgramsFromTable(NgramType ngramType, List<string> wordList, List<string> combinations)
{
    var number = (int)ngramType;

    if (wordList == null || wordList.Count != number - 1)
    {
        throw new ArgumentException("List<string> 'wordList' has wrong size");
    }

    if (combinations == null || combinations.Count < 1)
    {
        throw new ArgumentException("List<string> 'combinations' has wrong size");
    }

    // Only reachable for a unigram with an empty word list. Without this check the
    // wordList[0] access below failed with an unrelated out-of-range error instead of
    // the documented exception.
    if (number < 2)
    {
        throw new ArgumentException("NgramType 'ngramType' cannot be an Unigram");
    }

    // The table is picked from the first word (see GetIndexOfNames) and the n-gram size.
    var index = GetIndexOfNames(wordList[0]);
    var query = $"SELECT * FROM `{_dbTableDbTableName[number - 1]}[{Names[index]}]` WHERE ";

    // Fixed leading words: Word1='..' AND Word2='..' ...
    for (var i = 0; i < number - 1; ++i)
    {
        if (i != 0)
        {
            query += "AND ";
        }

        query += "Word" + (i + 1) + "='" + wordList[i].ChangeSpecialCharacters() + "' ";
    }

    // Alternatives for the last word: AND ( WordN='..' OR WordN='..' ... );
    query += "AND ( ";
    for (var i = 0; i < combinations.Count; ++i)
    {
        if (i != 0)
        {
            query += "OR ";
        }

        query += "Word" + number + "='" + combinations[i].ChangeSpecialCharacters() + "' ";
    }

    query += ");";
    return query;
}
/// <summary>
/// Builds one SELECT query covering every n-gram pattern in <paramref name="wordLists"/>.
/// </summary>
/// <param name="ngramType">Type of the n-gram; its integer value N selects the table and the pattern size.</param>
/// <param name="wordLists">
/// Patterns to match. Each pattern holds N positions and each position holds one or more alternative
/// words. Alternatives are OR-ed on column WordK, positions are AND-ed, and patterns are OR-ed.
/// </param>
/// <returns>The query string, terminated with a semicolon.</returns>
/// <exception cref="ArgumentException">
/// <paramref name="wordLists"/> is null or empty, a pattern does not have exactly N positions,
/// or a position has no alternatives.
/// </exception>
public string GetAllNecessaryNgramsFromTable(NgramType ngramType, List<List<List<string>>> wordLists)
{
    var number = (int)ngramType;

    if (wordLists == null || wordLists.Count < 1)
    {
        throw new ArgumentException("List<string> 'wordLists' has wrong size");
    }

    foreach (var pattern in wordLists)
    {
        if (pattern.Count != number)
        {
            throw new ArgumentException("List<string> middle list has wrong size");
        }

        foreach (var alternatives in pattern)
        {
            if (alternatives.Count < 1)
            {
                throw new ArgumentException("List<string> inside list has wrong size");
            }
        }
    }

    var query = "SELECT * FROM " + _dbTableDbTableName[number - 1] + " WHERE ";

    var patternClauses = new List<string>(wordLists.Count);
    foreach (var pattern in wordLists)
    {
        var positionClauses = new List<string>(pattern.Count);
        for (var position = 1; position <= pattern.Count; ++position)
        {
            var alternatives = pattern[position - 1];
            var comparisons = new List<string>(alternatives.Count);
            foreach (var word in alternatives)
            {
                // Each comparison keeps its trailing space; the separators below rely on it.
                comparisons.Add("Word" + position + "='" + word.ChangeSpecialCharacters() + "' ");
            }

            positionClauses.Add("( " + string.Join("OR ", comparisons.ToArray()) + ")");
        }

        patternClauses.Add("( " + string.Join(" AND ", positionClauses.ToArray()) + " )");
    }

    return query + string.Join(" OR ", patternClauses.ToArray()) + ";";
}
/// <summary>
/// Registers <paramref name="collection"/> under <paramref name="type"/> in the <c>ngrams</c> map.
/// </summary>
/// <param name="type">Key under which the collection is stored.</param>
/// <param name="collection">Collection to store.</param>
internal void AddNgramCollection(NgramType type, NgramCollection collection)
{
    // NOTE(review): presumably throws if the type is already registered (Dictionary.Add semantics) - confirm.
    this.ngrams.Add(type, collection);
}
/// <summary>
/// Tells whether a collection is registered for the given n-gram type.
/// </summary>
/// <param name="type">N-gram type to look up.</param>
/// <returns><c>true</c> when the <c>ngrams</c> map contains <paramref name="type"/> as a key.</returns>
public bool IsAvailableNgramCollection(NgramType type)
{
    bool isRegistered = this.ngrams.ContainsKey(type);
    return isRegistered;
}
/// <summary>
/// Returns the collection registered for the given n-gram type.
/// </summary>
/// <param name="type">N-gram type to look up.</param>
/// <returns>The collection stored in the <c>ngrams</c> map under <paramref name="type"/>.</returns>
public NgramCollection GetNgramCollection(NgramType type)
{
    // NOTE(review): the indexer presumably throws for an unregistered type; callers can
    // check IsAvailableNgramCollection first - confirm against the type of 'ngrams'.
    NgramCollection collection = this.ngrams[type];
    return collection;
}
/// <summary>
/// Builds the query covering every n-gram pattern in <paramref name="wordLists"/>, with one SELECT
/// per table and the SELECTs combined with UNION ALL.
/// </summary>
/// <param name="ngramType">Type of the n-gram; its integer value N selects the table family and the pattern size.</param>
/// <param name="wordLists">
/// Patterns to match. Each pattern holds N positions and each position holds one or more alternative
/// words. The table for a pattern is chosen from its first alternative of the first position
/// (see GetIndexOfNames). Alternatives are OR-ed on column WordK, positions are AND-ed, and patterns
/// that share a table are OR-ed.
/// </param>
/// <returns>The query string, terminated with a semicolon.</returns>
/// <exception cref="ArgumentException">
/// <paramref name="wordLists"/> is null or empty, a pattern does not have exactly N positions,
/// or a position has no alternatives.
/// </exception>
public string GetAllNecessaryNgramsFromTable(NgramType ngramType, List<List<List<string>>> wordLists)
{
    var number = (int)ngramType;

    if (wordLists == null || wordLists.Count < 1)
    {
        throw new ArgumentException("List<string> 'wordLists' has wrong size");
    }

    foreach (var pattern in wordLists)
    {
        if (pattern.Count != number)
        {
            throw new ArgumentException("List<string> middle list has wrong size");
        }

        foreach (var alternatives in pattern)
        {
            if (alternatives.Count < 1)
            {
                throw new ArgumentException("List<string> inside list has wrong size");
            }
        }
    }

    // One SELECT per table, in table order; null means no pattern targets that table.
    var selects = new string[Names.Length];

    foreach (var pattern in wordLists)
    {
        var tableIndex = GetIndexOfNames(pattern[0][0]);

        // The first pattern for a table opens its SELECT, later ones are OR-ed onto it.
        var prefix = selects[tableIndex] == null
            ? $"SELECT * FROM `{_dbTableDbTableName[number - 1]}[{Names[tableIndex]}]` WHERE "
            : selects[tableIndex] + " OR ";

        var positionClauses = new List<string>(pattern.Count);
        for (var position = 1; position <= pattern.Count; ++position)
        {
            var alternatives = pattern[position - 1];
            var comparisons = new List<string>(alternatives.Count);
            foreach (var word in alternatives)
            {
                // Each comparison keeps its trailing space; the separators below rely on it.
                comparisons.Add("Word" + position + "='" + word.ChangeSpecialCharacters() + "' ");
            }

            positionClauses.Add("( " + string.Join("OR ", comparisons.ToArray()) + ")");
        }

        selects[tableIndex] = prefix + "( " + string.Join(" AND ", positionClauses.ToArray()) + " )";
    }

    var usedSelects = new List<string>(selects.Length);
    foreach (var select in selects)
    {
        if (select != null)
        {
            usedSelects.Add(select);
        }
    }

    return string.Join(" UNION ALL ", usedSelects.ToArray()) + ";";
}
/// <summary>
/// Stores the n-gram type in this instance.
/// </summary>
/// <param name="type">The n-gram type to store.</param>
/// <inheritdoc />
public void SetNgram(NgramType type)
{
    this._ngramType = type;
}
/// <summary>
/// Instrumented variant of the sentence analysis: performs the same steps and additionally
/// accumulates per-phase durations and item counts into the supplied lists.
/// </summary>
/// <param name="sentence">Sentence to process; its <c>Text</c> and <c>EndMarks</c> members are read.</param>
/// <param name="type">N-gram type; its integer value is used as the n-gram length.</param>
/// <param name="times">
/// Accumulated phase durations; elements 1 through 7 are incremented, so the list must already
/// contain at least 8 elements.
/// </param>
/// <param name="counts">
/// Accumulated item counts; elements 2 through 8 are incremented, so the list must already
/// contain at least 9 elements.
/// </param>
/// <returns>
/// The fragments returned by <c>_ngramConnector.AnalyzeNgramsVariants</c>, with the sentence end marks
/// appended to the last fragment. If the connector returns no fragments, the end marks are returned
/// as the only fragment.
/// </returns>
/// <remarks>
/// Phases are timed with a <see cref="System.Diagnostics.Stopwatch"/> rather than wall-clock
/// differences, so measurements are not affected by system clock adjustments or its coarse resolution.
/// </remarks>
private List<string> AnalyzeSentence2(Sentence sentence, NgramType type, ref List<TimeSpan> times, ref List<int> counts)
{
    var length = (int)type;
    var result = new List<string>();
    var ngramVariants = new List<NGramVariants>();

    // Phase 1: candidate word combinations.
    var watch = System.Diagnostics.Stopwatch.StartNew();
    var combinationWords = CreateCombinationsWordList(sentence.Text);
    var elapsed = watch.Elapsed;
    counts[2] += combinationWords.Count;
    times[1] += elapsed;

    // Phase 2: dictionary filtering.
    watch.Restart();
    combinationWords = _dictionary.CheckWords(combinationWords);
    elapsed = watch.Elapsed;
    counts[3] += combinationWords.Count;
    times[2] += elapsed;

    // Phase 3: n-gram variants.
    watch.Restart();
    ngramVariants.AddRange(CreateNgramVariantsList(sentence.Text, combinationWords, length));
    elapsed = watch.Elapsed;
    counts[4] += ngramVariants.Count;
    times[3] += elapsed;

    // Phase 4: keep only entries with more than one variant for the database lookup.
    watch.Restart();
    var tmp = ngramVariants.Where(x => x.NgramVariants.Count > 1).ToList();
    elapsed = watch.Elapsed;
    counts[5] += tmp.Count;
    times[4] += elapsed;

    foreach (var item in tmp)
    {
        counts[6] += item.NgramVariants.Count;
    }

    // Phase 5: database lookup.
    watch.Restart();
    var ngrams = GetAllData(tmp, type).ToList();
    elapsed = watch.Elapsed;
    counts[7] += ngrams.Count;
    times[5] += elapsed;

    // Phase 6: update variants with the fetched data.
    watch.Restart();
    foreach (var variant in ngramVariants)
    {
        variant.UpdateNGramsVariants(ngrams);
        variant.RestoreUpperLettersInVariants();
        // The flag tells the variant whether it was part of the looked-up (multi-variant) set.
        variant.CountProbability(tmp.Contains(variant));
    }

    times[6] += watch.Elapsed;

    // Phase 7: assemble the output.
    watch.Restart();
    result.AddRange(_ngramConnector.AnalyzeNgramsVariants(ngramVariants, length, sentence.Text.Count));
    if (result.Count == 0)
    {
        // Nothing to attach the end marks to (e.g. an empty sentence). Previously this case
        // failed with an ArgumentOutOfRangeException on index -1.
        result.Add(string.Empty + sentence.EndMarks);
    }
    else
    {
        result[result.Count - 1] = result[result.Count - 1] + sentence.EndMarks;
    }

    counts[8] += result.Count;
    times[7] += watch.Elapsed;

    return result;
}