/// <summary>
/// Registers <paramref name="ngram"/> in <c>cache</c> under every space-separated
/// word of its <c>MyDiacriticsRemover</c> form. A null n-gram is ignored.
/// When <c>isSetSize</c> is set, <c>CheckSize</c> is invoked afterwards.
/// </summary>
/// <param name="ngram">The n-gram text to store; may be null.</param>
public void Add(string ngram)
{
    if (ngram == null)
    {
        return;
    }

    var keys = stringRoutines.MyDiacriticsRemover(ngram).Split(' ');
    foreach (var key in keys)
    {
        var bucket = cache.Find(key);
        if (bucket == null)
        {
            // First n-gram seen for this key: start a new bucket.
            cache.Add(key, new List<string> { ngram });
            continue;
        }

        bucket.Add(ngram);
    }

    if (!isSetSize)
    {
        return;
    }

    CheckSize(ngram);
}
/// <summary>
/// Reads every n-gram from <paramref name="uniGramFiles"/> and files its
/// space-joined text in <c>trie</c> under the <c>MyDiacriticsRemover</c> form of each of its words.
/// Each newly created bucket is also appended to <paramref name="ngramLists"/>.
/// </summary>
/// <param name="uniGramFiles">Source of n-grams; read until <c>Next()</c> returns null.</param>
/// <param name="ngramLists">Receives every bucket list created by this call.</param>
private void LoadUnigrams(UniGramFile uniGramFiles, List<List<string>> ngramLists)
{
    for (Ngram entry = uniGramFiles.Next(); entry != null; entry = uniGramFiles.Next())
    {
        string joined = string.Join(" ", entry.Words);
        foreach (string token in entry.Words)
        {
            string key = StringRoutines.MyDiacriticsRemover(token);
            List<string> bucket = trie.Find(key);
            if (bucket != null)
            {
                bucket.Add(joined);
                continue;
            }

            // Unknown key: create the bucket, remember it, then index it.
            var created = new List<string> { joined };
            ngramLists.Add(created);
            trie.Add(key, created);
        }
    }
}
/// <summary>
/// Appends n-grams from each file in <paramref name="otherNgramFiles"/> to buckets that already
/// exist in <c>trie</c>. A bucket only receives an n-gram while it holds more than one entry and
/// fewer than the cap for that file's n-gram size. No new trie keys are created here.
/// </summary>
/// <param name="otherNgramFiles">Files to read; each is re-opened after its first n-gram is inspected.</param>
private void OptimizedLoad(List<NgramFile> otherNgramFiles)
{
    // Per-size bucket caps, indexed by the word count of the file's first n-gram.
    // Original note: load 1, 350, 245, 105 (1 2 3 4)
    // NOTE(review): a file whose n-grams have 5 or more words would index past this array - confirm inputs.
    var maxAllowedCount = new int[5] { 0, 702, 702, 352, 107 };

    foreach (var file in otherNgramFiles)
    {
        // The first n-gram determines the size for the whole file.
        Ngram first = file.Next();
        file.ReOpen();

        // An empty file yields null here; previously this dereferenced null and crashed.
        if (first != null)
        {
            int size = first.Words.Length;

            Ngram ngram;
            while ((ngram = file.Next()) != null)
            {
                string lineWordsFormated = string.Join(" ", ngram.Words);
                foreach (string w in ngram.Words)
                {
                    string nonDiacriticsWord = StringRoutines.MyDiacriticsRemover(w);
                    List<string> foundList = trie.Find(nonDiacriticsWord);
                    if (foundList != null && foundList.Count > 1 && foundList.Count < maxAllowedCount[size])
                    {
                        foundList.Add(lineWordsFormated);
                    }
                }
            }
        }

        Console.WriteLine(" 4 3 2");
    }
}
/// <summary>
/// Reads every n-gram from <paramref name="file"/> and stores its space-joined text in
/// <c>trie</c> under the <c>MyDiacriticsRemover</c> form of each of its words.
/// </summary>
/// <param name="file">Source of n-grams; read until <c>Next()</c> returns null.</param>
internal void Load(NgramFile file)
{
    for (Ngram entry = file.Next(); entry != null; entry = file.Next())
    {
        string joined = string.Join(" ", entry.Words);
        foreach (string token in entry.Words)
        {
            string key = StringRoutines.MyDiacriticsRemover(token);
            List<string> bucket = trie.Find(key);
            if (bucket != null)
            {
                bucket.Add(joined);
            }
            else
            {
                trie.Add(key, new List<string> { joined });
            }
        }
    }
}
/// <summary>
/// Collects the n-grams at 1-based positions <paramref name="from"/>..<paramref name="to"/>
/// (inclusive) of <paramref name="file"/>, producing one <c>FileNgram</c> per word of each n-gram.
/// </summary>
/// <param name="file">Source of n-grams, consumed from its current position.</param>
/// <param name="from">Position of the first n-gram to include; earlier ones are read and dropped.</param>
/// <param name="to">Position of the last n-gram to include.</param>
/// <returns>The collected entries; shorter (possibly empty) when the file ends early.</returns>
private List<FileNgram> GetFileNgrams(NgramFile file, int from, int to)
{
    var collected = new List<FileNgram>();
    int position = 1;

    // Skip everything before the requested window.
    for (; position < from; position++)
    {
        if (file.Next() == null)
        {
            return collected;
        }
    }

    for (; position <= to; position++)
    {
        Ngram current = file.Next();
        if (current == null)
        {
            break;
        }

        foreach (var token in current.Words)
        {
            int id = idTrie.Find(StringRoutines.MyDiacriticsRemover(token));
            collected.Add(new FileNgram(current.ToString(), current.Frequency, id));
        }
    }

    return collected;
}
/// <summary>
/// Checks that <c>StringRoutines.MyDiacriticsRemover</c> maps the accented letters in the sample
/// to their base letters and leaves the remaining sample characters unchanged.
/// </summary>
public void MyDiacriticsRemover_WordWithDiacritics_ReturnsWithoutDiacritics()
{
    // Arrange
    string withDiacritics = "áäčďéíĺľňóôŕšťúýžěřůäöüẞß abcdefghijklmnopqrstuvwxyz ,./;'[]{} 1234567890 ~`!@#$%^&*()+_-";
    string withoutDiacritics = "aacdeillnoorstuyzeruaouẞß abcdefghijklmnopqrstuvwxyz ,./;'[]{} 1234567890 ~`!@#$%^&*()+_-";

    // Act + Assert
    Assert.AreEqual(withoutDiacritics, StringRoutines.MyDiacriticsRemover(withDiacritics));
}
/// <summary>
/// Runs one reconstruction round-trip on the text file at <paramref name="path"/>: reads it,
/// strips diacritics, reconstructs them with <paramref name="dr"/>, writes the intermediate and
/// reconstructed texts next to the input, and passes both texts to <c>FindMistakes</c>.
/// Progress, memory usage and elapsed time are printed to the console.
/// </summary>
/// <param name="path">Path of the original text file (with diacritics).</param>
/// <param name="dr">Reconstructor under test; its statistics are read and then erased.</param>
/// <param name="writeStatistics">
/// When true, memory, n-gram statistics and timing are also written to a
/// <c>_STATISTICS</c> file whose path is stored in <c>statisticsPath</c>.
/// </param>
internal static void Test(string path, DiacriticsReconstructor dr, bool writeStatistics = true)
{
    long bytes = GC.GetTotalMemory(true);
    Console.WriteLine($"Memory (bytes): {bytes}");
    if (writeStatistics)
    {
        statisticsPath = $"{TextFile.FileName(path)}_STATISTICS{TextFile.FileExtension(path)}";
        File.WriteAllText(statisticsPath, $"Memory (bytes): {bytes}\n");
    }

    Console.WriteLine($"Reading {path}");
    string originalText;
    // Dispose the reader so the input file handle is released immediately
    // (previously the StreamReader returned by OpenText was never closed).
    using (var reader = File.OpenText(path))
    {
        originalText = reader.ReadToEnd();
    }

    Console.WriteLine("Removing diacritics...");
    string textWithoutDiacritics = StringRoutines.MyDiacriticsRemover(originalText);
    File.WriteAllText($"{TextFile.FileName(path)}_WITHOUT-DIACRITICS{TextFile.FileExtension(path)}", textWithoutDiacritics);

    Console.WriteLine("Reconstructing...");
    // Only the Reconstruct call is timed.
    var sw = Stopwatch.StartNew();
    string reconstructedText = dr.Reconstruct(textWithoutDiacritics);
    sw.Stop();

    string ngramsStat = dr.GetStatistic();
    Console.Write(ngramsStat);
    if (writeStatistics)
    {
        File.AppendAllText(statisticsPath, ngramsStat);
    }

    dr.EraseStatistic();

    Console.WriteLine($"Elapsed (milliseconds): {sw.Elapsed.TotalMilliseconds}");
    if (writeStatistics)
    {
        File.AppendAllText(statisticsPath, $"Elapsed (milliseconds): {sw.Elapsed.TotalMilliseconds}\n");
    }

    Console.WriteLine("Done.");
    // The output suffix spelling is kept as-is; other tooling may rely on the file name.
    File.WriteAllText($"{TextFile.FileName(path)}_RENCOSTRUCTED{TextFile.FileExtension(path)}", reconstructedText);

    Console.WriteLine("Testing...");
    FindMistakes(originalText, reconstructedText, path, writeStatistics);
    Console.WriteLine("Done.\n");
}
/// <summary>
/// Reads up to <paramref name="count"/> n-grams from <paramref name="file"/> and returns one
/// <c>FileNgram</c> per word of each n-gram, with the word's id looked up in <c>idTrie</c>.
/// </summary>
/// <param name="file">Source of n-grams, consumed from its current position.</param>
/// <param name="count">Maximum number of n-grams to read in this call; expected to be positive.</param>
/// <param name="endOfFile">
/// Set to true when <c>file.Next()</c> returned null during this call, otherwise false.
/// A file that ends exactly on a chunk boundary is reported as ended by the following call,
/// which then returns an empty list.
/// </param>
/// <returns>The entries produced from the n-grams read in this call.</returns>
/// <exception cref="Exception">A word's id lookup in <c>idTrie</c> returned 0.</exception>
private List<FileNgram> DivideFileBy(NgramFile file, int count, ref bool endOfFile)
{
    var ret = new List<FileNgram>();
    bool reachedEnd = false;

    // Check the limit BEFORE reading. The previous form read an extra n-gram once the
    // limit was hit and threw it away, losing one record at every chunk boundary.
    for (int read = 0; read < count; read++)
    {
        Ngram ng = file.Next();
        if (ng == null)
        {
            reachedEnd = true;
            break;
        }

        foreach (var w in ng.Words)
        {
            int id = idTrie.Find(StringRoutines.MyDiacriticsRemover(w));
            if (id == 0)
            {
                throw new Exception("Word '" + w + "' is not in idTrie!!!");
            }

            ret.Add(new FileNgram(ng.ToString(), ng.Frequency, id));
        }
    }

    endOfFile = reachedEnd;
    Console.WriteLine("part of file loaded...");
    return ret;
}
/// <summary>
/// Moves <paramref name="ngram"/> to the end of <c>priorityNgrams</c> and, if the list then
/// holds more than <c>size</c> entries, evicts the entry at index 0 from both
/// <c>priorityNgrams</c> and every <c>cache</c> bucket keyed by one of its words.
/// </summary>
/// <param name="ngram">The n-gram that was just added or used.</param>
private void CheckSize(string ngram)
{
    // Remove-then-add places the n-gram at the tail regardless of whether it was present.
    priorityNgrams.Remove(ngram);
    priorityNgrams.Add(ngram);

    if (priorityNgrams.Count <= size)
    {
        return;
    }

    var evicted = priorityNgrams[0];
    var evictedKeys = StringRoutines.MyDiacriticsRemover(evicted).Split(' ');
    foreach (var key in evictedKeys)
    {
        var bucket = cache.Find(key);
        if (bucket != null)
        {
            bucket.Remove(evicted);
            if (bucket.Count == 0)
            {
                // Drop keys whose bucket became empty.
                cache.Remove(key);
            }
        }
    }

    priorityNgrams.RemoveAt(0);
}
/// <summary>
/// Builds an in-memory table with columns <c>Id</c> and <c>Value</c> from every n-gram in
/// <paramref name="file"/> (ids run from 1, values are the <c>MyDiacriticsRemover</c> form of the
/// n-gram text) and hands it to <c>InsertIntoDb</c> for the <c>dbo.Words</c> table.
/// </summary>
/// <param name="file">Unigram file; re-opened before reading.</param>
/// <param name="db">Database context passed through to <c>InsertIntoDb</c>.</param>
private void InsertWordsSqlBulkCopy(UniGramFile file, DiacriticsDBEntities db)
{
    var table = new DataTable();
    table.Columns.Add("Id");
    table.Columns.Add("Value");

    file.ReOpen();

    int lastId = 0;
    int prepared = 0;
    for (Ngram entry = file.Next(); entry != null; entry = file.Next())
    {
        string value = StringRoutines.MyDiacriticsRemover(entry.ToString());
        lastId++;
        table.Rows.Add(lastId, value);

        prepared++;
        if (prepared % 100000 == 0)
        {
            // Progress report every 100000 rows.
            Console.WriteLine(prepared + " words prepared for insertion.");
        }
    }

    InsertIntoDb(table, db, "dbo.Words");
}
/// <summary>
/// Builds an in-memory table (<c>Word1</c>, <c>WordId</c>, <c>Id</c>, <c>Frequency</c>) from every
/// n-gram in <paramref name="file"/> and hands it to <c>InsertIntoDb</c> for
/// <c>dbo.UniGramEntities</c>. <c>WordId</c> comes from looking up the
/// <c>MyDiacriticsRemover</c> form of the n-gram text in <c>wordTrie</c>.
/// </summary>
/// <param name="file">Unigram file; re-opened before reading.</param>
/// <param name="db">Database context passed through to <c>InsertIntoDb</c>.</param>
/// <exception cref="Exception">The <c>wordTrie</c> lookup returned 0 for a word.</exception>
private void InsertUnigramsSqlBulkCopy(UniGramFile file, DiacriticsDBEntities db)
{
    var table = new DataTable();
    table.Columns.Add("Word1");
    table.Columns.Add("WordId");
    table.Columns.Add("Id");
    table.Columns.Add("Frequency");

    file.ReOpen();

    int lastUniGramId = 0;
    int prepared = 0;
    for (Ngram entry = file.Next(); entry != null; entry = file.Next())
    {
        string original = entry.ToString();
        string key = StringRoutines.MyDiacriticsRemover(original);
        int wordId = wordTrie.Find(key);
        if (wordId == 0)
        {
            throw new Exception("Word '" + key + "' is not present in Trie!");
        }

        lastUniGramId++;
        table.Rows.Add(original, wordId, lastUniGramId, entry.Frequency);

        prepared++;
        if (prepared % 10000 == 0)
        {
            // Progress report every 10000 rows.
            Console.WriteLine(prepared + " unigrams prepared for insertion.");
        }
    }

    InsertIntoDb(table, db, "dbo.UniGramEntities");
}
/// <summary>
/// Loads every n-gram of <paramref name="file"/> into <paramref name="db"/>: for each word it
/// finds or creates the <c>Word</c> row whose value is the word's <c>MyDiacriticsRemover</c>
/// form, then adds a <c>UniGramEntity</c> that references it. Changes are saved every 100
/// n-grams and once more at the end.
/// </summary>
/// <param name="file">N-gram file; re-opened before reading.</param>
/// <param name="db">Context that receives the new entities.</param>
private void LoadUniGramsEF(NgramFile file, DiacriticsDBEntities db)
{
    file.ReOpen();

    Ngram ngram;
    int i = 0;
    while ((ngram = file.Next()) != null)
    {
        foreach (var w in ngram.Words)
        {
            string nonDiacriticsW = StringRoutines.MyDiacriticsRemover(w);

            // NOTE(review): this query looks like it only sees saved rows; a word added earlier
            // in the current unsaved batch may be inserted twice - confirm and de-duplicate if so.
            Word word = db.Words.Where(a => a.Value == nonDiacriticsW).SingleOrDefault();
            if (word == null)
            {
                word = new Word() { Value = nonDiacriticsW };
                db.Words.Add(word);
            }

            db.UniGramEntities.Add(new UniGramEntity()
            {
                Frequency = ngram.Frequency,
                Word = word,
                WordId = word.Id,
                Word1 = w
            });
        }

        if (++i % 100 == 0)
        {
            db.SaveChanges();
            Console.WriteLine(i);
        }
    }

    // Flush the trailing partial batch; previously up to 99 n-grams at the end of the
    // file were left unsaved by this method.
    db.SaveChanges();
}
/// <summary>
/// Loads every n-gram of <paramref name="file"/> into the database using raw parameterized SQL
/// commands on the context's connection. For each word, the <c>dbo.Words</c> row whose value is
/// the word's <c>MyDiacriticsRemover</c> form is looked up (and inserted first when missing),
/// then a <c>dbo.UniGramEntities</c> row referencing it is inserted.
/// </summary>
/// <param name="file">N-gram file; re-opened before reading.</param>
/// <param name="db">Context whose connection is opened for the load and closed afterwards.</param>
private void LoadUniGramsSqlCmd(NgramFile file, DiacriticsDBEntities db)
{
    var connection = db.Database.Connection as SqlConnection;

    // using-blocks guarantee the commands are disposed even when a statement throws.
    using (var sqlSelect = new SqlCommand("SELECT * FROM dbo.Words WHERE Value = @value", connection))
    using (var sqlInsertWord = new SqlCommand("INSERT INTO dbo.Words (Value) VALUES (@value)", connection))
    using (var sqlInsertUniGram = new SqlCommand("INSERT INTO dbo.UniGramEntities (Word1, WordId, Frequency) VALUES (@word1, @wordId, @frequency)", connection))
    {
        sqlSelect.CommandType = CommandType.Text;
        sqlSelect.Parameters.Add("value", SqlDbType.NVarChar);

        sqlInsertWord.CommandType = CommandType.Text;
        sqlInsertWord.Parameters.Add("value", SqlDbType.NVarChar);

        sqlInsertUniGram.CommandType = CommandType.Text;
        sqlInsertUniGram.Parameters.Add("word1", SqlDbType.NVarChar);
        sqlInsertUniGram.Parameters.Add("wordId", SqlDbType.Int);
        sqlInsertUniGram.Parameters.Add("frequency", SqlDbType.Int);

        db.Database.Connection.Open();
        try
        {
            file.ReOpen();

            Ngram ngram;
            int i = 0;
            while ((ngram = file.Next()) != null)
            {
                foreach (var w in ngram.Words)
                {
                    string nonDiacriticsW = StringRoutines.MyDiacriticsRemover(w);
                    int id = -1;
                    bool found = false;

                    // Select; when the word is missing insert it and select again to obtain its id.
                    while (!found)
                    {
                        sqlSelect.Parameters["value"].Value = nonDiacriticsW;
                        using (SqlDataReader reader = sqlSelect.ExecuteReader())
                        {
                            if (reader.Read())
                            {
                                // NOTE(review): relies on the id being the first column of dbo.Words - confirm schema.
                                id = (int)reader[0];
                                found = true;
                            }
                        }

                        // The reader is closed before inserting: an open SqlDataReader keeps the
                        // connection busy, so running the INSERT inside the read block fails
                        // unless MARS is enabled.
                        if (!found)
                        {
                            sqlInsertWord.Parameters["value"].Value = nonDiacriticsW;
                            sqlInsertWord.ExecuteNonQuery();
                        }
                    }

                    sqlInsertUniGram.Parameters["word1"].Value = w;
                    sqlInsertUniGram.Parameters["wordId"].Value = id;
                    sqlInsertUniGram.Parameters["frequency"].Value = ngram.Frequency;
                    sqlInsertUniGram.ExecuteNonQuery();
                }

                if (++i % 10000 == 0)
                {
                    Console.WriteLine(i);
                }
            }
        }
        finally
        {
            // Always release the connection, including on failure.
            db.Database.Connection.Close();
        }
    }
}
/// <summary>
/// Looks for <paramref name="word"/> among the <c>MyDiacriticsRemover</c> forms of the words in
/// <paramref name="ngram"/> and accepts the first position whose neighbours agree with the
/// supplied context.
/// </summary>
/// <param name="word">Word to locate, compared against the stripped n-gram words.</param>
/// <param name="ngram">Words of the candidate n-gram in their original form.</param>
/// <param name="nthBefore">Preceding context; element 0 is compared with the word directly before the match.</param>
/// <param name="nthAfter">Following context; element 0 is compared with the word directly after the match.</param>
/// <param name="result">On success, receives the original n-gram word at the matching position.</param>
/// <returns>True when a position matched together with its context, otherwise false.</returns>
protected bool MatchesUp(string word, string[] ngram, string[] nthBefore, string[] nthAfter, ref string result)
{
    var stripped = new string[ngram.Length];
    for (int k = 0; k < ngram.Length; k++)
    {
        stripped[k] = StringRoutines.MyDiacriticsRemover(ngram[k]);
    }

    for (int pos = 0; pos < stripped.Length; pos++)
    {
        // The word may occur several times; every occurrence is tried in order.
        if (stripped[pos] != word)
        {
            continue;
        }

        bool contextOk = true;

        // Walk left from the match; stop quietly when the n-gram runs out of words.
        for (int back = 0; back < nthBefore.Length && pos - back - 1 >= 0; back++)
        {
            if (nthBefore[back] != stripped[pos - back - 1])
            {
                contextOk = false;
                break;
            }
        }

        if (contextOk)
        {
            // Walk right from the match under the same rule.
            for (int ahead = 0; ahead < nthAfter.Length && pos + ahead + 1 < stripped.Length; ahead++)
            {
                if (nthAfter[ahead] != stripped[pos + ahead + 1])
                {
                    contextOk = false;
                    break;
                }
            }
        }

        if (contextOk)
        {
            result = ngram[pos];
            return true;
        }
    }

    return false;
}
/// <summary>
/// Returns <paramref name="word"/> passed through <c>StringRoutines.MyDiacriticsRemover</c>
/// and then lower-cased with <c>ToLower()</c>.
/// </summary>
/// <param name="word">The word to normalize.</param>
/// <returns>The normalized word.</returns>
private string Normalize(string word)
{
    string stripped = StringRoutines.MyDiacriticsRemover(word);
    return stripped.ToLower();
}