/// <summary>
/// Reads every n-gram from <paramref name="file"/> and indexes it in <c>trie</c> under the
/// diacritics-free form of each of its words.
/// </summary>
/// <param name="file">N-gram source; consumed through <c>Next()</c> until it returns null.</param>
internal void Load(NgramFile file)
{
    for (Ngram current = file.Next(); current != null; current = file.Next())
    {
        // The whole n-gram, space separated, is the value stored for each of its words.
        string joinedWords = string.Join(" ", current.Words);

        foreach (string rawWord in current.Words)
        {
            string key = StringRoutines.MyDiacriticsRemover(rawWord);
            List<string> bucket = trie.Find(key);

            if (bucket != null)
            {
                // Key already present: extend the existing list.
                bucket.Add(joinedWords);
                continue;
            }

            trie.Add(key, new List<string> { joinedWords });
        }
    }
}
/// <summary>
/// Builds <c>FileNgram</c> entries for the n-grams at 1-based positions
/// <paramref name="from"/>..<paramref name="to"/> (inclusive) of <paramref name="file"/>.
/// </summary>
/// <param name="file">N-gram source, read sequentially through <c>Next()</c>.</param>
/// <param name="from">1-based position of the first n-gram to include.</param>
/// <param name="to">1-based position of the last n-gram to include.</param>
/// <returns>
/// One entry per word of every selected n-gram; the list is shorter (possibly empty)
/// when the file ends before <paramref name="to"/> is reached.
/// </returns>
private List<FileNgram> GetFileNgrams(NgramFile file, int from, int to)
{
    var result = new List<FileNgram>();
    int position = 1;

    // Skip everything located before the requested range.
    for (; position < from; position++)
    {
        if (file.Next() == null)
        {
            return result;
        }
    }

    for (; position <= to; position++)
    {
        Ngram current = file.Next();
        if (current == null)
        {
            break;
        }

        foreach (var word in current.Words)
        {
            int wordId = idTrie.Find(StringRoutines.MyDiacriticsRemover(word));
            result.Add(new FileNgram(current.ToString(), current.Frequency, wordId));
        }
    }

    return result;
}
/// <summary>
/// Runs the cleaning steps in sequence, feeding the output file of each step into the next:
/// optional frequency cut-off, optional <c>Clean</c>, <c>RemoveBadWords</c>, and an optional
/// word-length cut-off.
/// </summary>
/// <param name="file">Input file; a <c>UniGramFile</c> input keeps intermediate files as <c>UniGramFile</c>.</param>
/// <param name="rmvWordsFromFreq">When greater than 0, passed to <c>RemoveWordsFromFreqDown</c>; otherwise that step is skipped.</param>
/// <param name="clean">When true, <c>Clean</c> is applied.</param>
/// <param name="rmvBadWordsFromFreq">Frequency argument passed to <c>RemoveBadWords</c>.</param>
/// <param name="rmvWordsFromLength">When different from <c>int.MaxValue</c>, passed to <c>RemoveWordsFromLength</c> as the last step.</param>
/// <returns>The path returned by the last executed step.</returns>
internal string CompleteProcessing(NgramFile file, int rmvWordsFromFreq = 0, bool clean = true, int rmvBadWordsFromFreq = int.MaxValue, int rmvWordsFromLength = int.MaxValue)
{
    bool isUniGramFile = file is UniGramFile;

    if (rmvWordsFromFreq > 0)
    {
        string keptPath = RemoveWordsFromFreqDown(file, rmvWordsFromFreq);
        if (isUniGramFile)
        {
            file = new UniGramFile(keptPath);
        }
        else
        {
            file = new NgramFile(keptPath);
        }
    }

    if (clean)
    {
        string cleanedPath = Clean(file);
        if (isUniGramFile)
        {
            file = new UniGramFile(cleanedPath);
        }
        else
        {
            file = new NgramFile(cleanedPath);
        }
    }

    string goodWordsPath = RemoveBadWords(file, rmvBadWordsFromFreq);

    // int.MaxValue acts as the "no length filtering" sentinel.
    if (rmvWordsFromLength == int.MaxValue)
    {
        return goodWordsPath;
    }

    if (isUniGramFile)
    {
        file = new UniGramFile(goodWordsPath);
    }
    else
    {
        file = new NgramFile(goodWordsPath);
    }

    return RemoveWordsFromLength(file, rmvWordsFromLength);
}
/// <summary>
/// Splits <paramref name="file"/> into two output files by word length: n-grams whose words
/// are all at most <paramref name="fromLength"/> characters long go to the "_TO-LENGTH-" file,
/// every other n-gram goes to the "_FROM-LENGTH-" file.
/// </summary>
/// <param name="file">N-gram source; consumed through <c>Next()</c> until it returns null.</param>
/// <param name="fromLength">Maximum word length that is still kept in the "_TO-LENGTH-" file.</param>
/// <returns>Path of the "_TO-LENGTH-" file.</returns>
internal string RemoveWordsFromLength(NgramFile file, int fromLength)
{
    string name = file.FileName;
    string extension = file.FileExtension;
    string toLengthPath = $"{name}_TO-LENGTH-{fromLength}{extension}";

    using (var shortWriter = new StreamWriter(toLengthPath))
    using (var longWriter = new StreamWriter($"{name}_FROM-LENGTH-{fromLength}{extension}"))
    {
        for (Ngram current = file.Next(); current != null; current = file.Next())
        {
            StreamWriter target = shortWriter;

            foreach (var word in current.Words)
            {
                if (word.Length > fromLength)
                {
                    // A single over-long word diverts the whole n-gram.
                    target = longWriter;
                    break;
                }
            }

            target.WriteLine(current.Line);
        }
    }

    return toLengthPath;
}
/// <summary>
/// Reads the first n-gram of <paramref name="file"/>, maps its word count to a division count
/// through <c>GetDivisionCountByNumber</c>, and then calls <c>ReOpen()</c> on the file.
/// </summary>
/// <param name="file">N-gram source; one entry is read from it before <c>ReOpen()</c> is called.</param>
/// <returns>The value returned by <c>GetDivisionCountByNumber</c>.</returns>
private int GetSuitableCountForDivision(NgramFile file)
{
    Ngram first = file.Next();
    int divisionCount = GetDivisionCountByNumber(first.Words.Length);

    // One entry has been consumed above; ReOpen() presumably rewinds the file - TODO confirm.
    file.ReOpen();

    return divisionCount;
}
/// <summary>
/// Serializes <paramref name="gramEntries"/> to JSON, writes it to the file derived from
/// <paramref name="index"/> and the n-gram size, and stores the entries in <c>_nGrams</c>.
/// </summary>
/// <param name="index">Index used to build both the file name and the cache key.</param>
/// <param name="gramEntries">Entries to persist and store.</param>
public void Set(string index, NGram gramEntries)
{
    var targetPath = GetFileName(index, gramEntries.Size);

    var payload = new NgramFile()
    {
        Entryes = gramEntries,
        N = gramEntries.Size
    };

    File.WriteAllText(targetPath, JsonConvert.SerializeObject(payload));

    // The in-memory entry is updated only after the file has been written.
    _nGrams[GetKey(index, gramEntries.Size)] = gramEntries;
}
/// <summary>
/// Loads <paramref name="file"/> into the database, choosing the loader from the number of
/// words in the file's first n-gram. Only one-word n-grams are supported.
/// </summary>
/// <param name="file">N-gram source; must be a <c>UniGramFile</c> when its n-grams have one word.</param>
/// <exception cref="Exception">The first n-gram does not consist of exactly one word.</exception>
internal void LoadFile(NgramFile file)
{
    using (var db = new DiacriticsDBEntities())
    {
        // Reading the first entry is how the n-gram size is detected.
        var wordCount = file.Next().Words.Length;

        if (wordCount != 1)
        {
            throw new Exception("Unknown length of ngams!");
        }

        InsertUnigramsSqlBulkCopy((UniGramFile)file, db);
    }
}
/// <summary>
/// Runs <c>FileCleaner.CompleteProcessing</c> over the hard-coded unigram dictionary and the
/// 2-, 3- and 4-gram files, printing the resulting path of each run to the console.
/// </summary>
private static void CleanFiles()
{
    var cleaner = new FileCleaner();

    var unigrams = new UniGramFile("D:/slovniky/prim-8.0-public-all-word_frequency_non_case_sensitive/prim-8.0-public-all-word_frequency_non_case_sensitive.txt");
    Console.WriteLine(cleaner.CompleteProcessing(unigrams, rmvWordsFromFreq: 0, rmvBadWordsFromFreq: 11, rmvWordsFromLength: 30));

    string[] ngramPaths =
    {
        "D:/ngramy/prim-8.0-public-all-2-gramy/prim-8.0-public-all-2-gramy.txt",
        "D:/ngramy/prim-8.0-public-all-3-gramy/prim-8.0-public-all-3-gramy.txt",
        "D:/ngramy/prim-8.0-public-all-4-gramy/prim-8.0-public-all-4-gramy.txt"
    };

    // The frequency cut-off is 1 for the 2-grams, 2 for the 3-grams and 3 for the 4-grams.
    for (int i = 0; i < ngramPaths.Length; i++)
    {
        // Each file is constructed right before it is processed.
        var ngrams = new NgramFile(ngramPaths[i]);
        Console.WriteLine(cleaner.CompleteProcessing(ngrams, rmvWordsFromFreq: i + 1, rmvBadWordsFromFreq: 11, rmvWordsFromLength: 30));
    }
}
/// <summary>
/// Reads all n-grams of <paramref name="file"/> into memory, sorts them by the length of their
/// <c>ToString()</c> text (ascending) and writes them to "{FileName}_SORTED{FileExtension}".
/// Each output line has the form "text frequency (length)".
/// </summary>
/// <param name="file">N-gram source; consumed through <c>Next()</c> until it returns null.</param>
internal void SortByLineLength(NgramFile file)
{
    var collected = new List<Ngram>();
    for (Ngram current = file.Next(); current != null; current = file.Next())
    {
        collected.Add(current);
    }

    Ngram[] ordered = collected.ToArray();
    Comparison<Ngram> byTextLength =
        (left, right) => left.ToString().Length.CompareTo(right.ToString().Length);
    Array.Sort(ordered, byTextLength);

    string outputPath = $"{file.FileName}_SORTED{file.FileExtension}";
    using (var output = new StreamWriter(outputPath))
    {
        foreach (Ngram entry in ordered)
        {
            string text = entry.ToString();
            output.WriteLine($"{text} {entry.Frequency} ({text.Length})");
        }
    }
}
/// <summary>
/// Reads up to <paramref name="count"/> n-grams from <paramref name="file"/> and converts every
/// word of each n-gram into a <c>FileNgram</c> that carries the word's id from <c>idTrie</c>.
/// </summary>
/// <param name="file">N-gram source, read sequentially through <c>Next()</c>.</param>
/// <param name="count">Maximum number of n-grams to read in this call; must be positive.</param>
/// <param name="endOfFile">
/// Set to true when <c>Next()</c> returned null during this call, false otherwise. When the file
/// holds an exact multiple of <paramref name="count"/> n-grams, the end is reported by one extra
/// call that returns an empty list.
/// </param>
/// <returns>One entry per word of every n-gram read in this call.</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="count"/> is not positive.</exception>
/// <exception cref="Exception">A word has id 0 in <c>idTrie</c>.</exception>
private List<FileNgram> DivideFileBy(NgramFile file, int count, ref bool endOfFile)
{
    if (count <= 0)
    {
        // A non-positive quota would never consume the file, so a caller looping on endOfFile would spin forever.
        throw new ArgumentOutOfRangeException("count", count, "Chunk size must be positive.");
    }

    var ret = new List<FileNgram>();
    bool reachedEnd = false;

    // The quota is checked BEFORE reading. The previous condition
    // `(ng = file.Next()) != null && i++ < count` fetched one more n-gram after the quota was
    // full and then dropped it, silently losing one record per chunk.
    for (int i = 0; i < count; i++)
    {
        Ngram ng = file.Next();
        if (ng == null)
        {
            reachedEnd = true;
            break;
        }

        foreach (var w in ng.Words)
        {
            int id = idTrie.Find(StringRoutines.MyDiacriticsRemover(w));
            if (id == 0)
            {
                throw new Exception("Word '" + w + "' is not in idTrie!!!");
            }

            ret.Add(new FileNgram(ng.ToString(), ng.Frequency, id));
        }
    }

    endOfFile = reachedEnd;
    Console.WriteLine("part of file loaded...");
    return ret;
}
/// <summary>
/// Splits <paramref name="file"/> by frequency: n-grams with a frequency less than or equal to
/// <paramref name="fromFrequency"/> are written to the "_FROM-…-DOWN" file, all others to the
/// "_TO-…" file.
/// </summary>
/// <param name="file">N-gram source; consumed through <c>Next()</c> until it returns null.</param>
/// <param name="fromFrequency">Highest frequency that is still filtered out.</param>
/// <returns>Path of the "_TO-…" file holding the n-grams above the threshold.</returns>
internal string RemoveWordsFromFreqDown(NgramFile file, int fromFrequency)
{
    string name = file.FileName;
    string extension = file.FileExtension;
    string keptPath = $"{name}_TO-{fromFrequency}{extension}";

    using (var droppedWriter = new StreamWriter($"{name}_FROM-{fromFrequency}-DOWN{extension}"))
    using (var keptWriter = new StreamWriter(keptPath))
    {
        for (Ngram current = file.Next(); current != null; current = file.Next())
        {
            StreamWriter target = current.Frequency <= fromFrequency ? droppedWriter : keptWriter;
            target.WriteLine(current.Line);
        }
    }

    return keptPath;
}
/// <summary>
/// Splits <paramref name="file"/> into a "_GOOD-WORDS" and a "_BAD-WORDS" file. An n-gram is
/// kept as good when its frequency exceeds <paramref name="fromFrequency"/> or when
/// <c>IsGoodWord</c> accepts its diacritics-free text.
/// </summary>
/// <param name="file">N-gram source; consumed through <c>Next()</c> until it returns null.</param>
/// <param name="fromFrequency">N-grams with a higher frequency are accepted without calling <c>IsGoodWord</c>.</param>
/// <returns>Path of the "_GOOD-WORDS" file.</returns>
public string RemoveBadWords(NgramFile file, int fromFrequency)
{
    string name = file.FileName;
    string extension = file.FileExtension;
    string goodPath = $"{name}_GOOD-WORDS{extension}";

    using (var goodWriter = new StreamWriter(goodPath))
    using (var badWriter = new StreamWriter($"{name}_BAD-WORDS{extension}"))
    {
        for (Ngram current = file.Next(); current != null; current = file.Next())
        {
            // Short-circuit: the word check runs only for entries at or below the threshold.
            bool keep = current.Frequency > fromFrequency
                        || IsGoodWord(StringRoutines.MyDiacriticsRemover(current.ToString()));

            StreamWriter target = keep ? goodWriter : badWriter;
            target.WriteLine(current.Line);
        }
    }

    return goodPath;
}
/// <summary>
/// Loads all unigrams from <paramref name="file"/> into the database through Entity Framework.
/// For every word a <c>Word</c> row with the diacritics-free value is looked up (or created) and a
/// <c>UniGramEntity</c> referencing it is added. Changes are saved every 100 n-grams and once more
/// at the end.
/// </summary>
/// <param name="file">N-gram source; <c>ReOpen()</c> is called on it before reading.</param>
/// <param name="db">Context that receives the new entities.</param>
private void LoadUniGramsEF(NgramFile file, DiacriticsDBEntities db)
{
    file.ReOpen();

    // Words added to the context since the last SaveChanges(). The lookup query below runs
    // against the database and cannot see them, so without this map a base word occurring twice
    // in one batch would be inserted twice (and a later SingleOrDefault would throw).
    var pendingWords = new System.Collections.Generic.Dictionary<string, Word>();

    Ngram ngram;
    int i = 0;
    while ((ngram = file.Next()) != null)
    {
        foreach (var w in ngram.Words)
        {
            string nonDiacriticsW = StringRoutines.MyDiacriticsRemover(w);

            Word word = null;
            if (nonDiacriticsW != null)
            {
                pendingWords.TryGetValue(nonDiacriticsW, out word);
            }

            if (word == null)
            {
                word = db.Words.Where(a => a.Value == nonDiacriticsW).SingleOrDefault();
            }

            if (word == null)
            {
                word = new Word() { Value = nonDiacriticsW };
                db.Words.Add(word);
                if (nonDiacriticsW != null)
                {
                    pendingWords[nonDiacriticsW] = word;
                }
            }

            db.UniGramEntities.Add(new UniGramEntity() { Frequency = ngram.Frequency, Word = word, WordId = word.Id, Word1 = w });
        }

        if (++i % 100 == 0)
        {
            db.SaveChanges();
            // Everything is persisted now, so the database query can find these words again.
            pendingWords.Clear();
            Console.WriteLine(i);
        }
    }

    // Persist the last partial batch; previously the trailing (i % 100) n-grams were never saved here.
    if (i % 100 != 0)
    {
        db.SaveChanges();
    }
}
/// <summary>
/// Sorts the n-grams of <paramref name="file"/> into four output files, based on the concatenation
/// of each n-gram's words (checked in this order):
/// matches both <c>rgxChars</c> and <c>rgxDigits</c> → "_TRASH-CHRS+NUMS";
/// matches <c>rgxDigits</c> only → "_TRASH-NUMS";
/// matches <c>rgxNonChars</c> → "_CLEANED";
/// anything else → "_TRASH".
/// </summary>
/// <param name="file">N-gram source; consumed through <c>Next()</c> until it returns null.</param>
/// <returns>Path of the "_CLEANED" file.</returns>
internal string Clean(NgramFile file)
{
    string name = file.FileName;
    string extension = file.FileExtension;
    string cleanedPath = $"{name}_CLEANED{extension}";

    using (var cleanedWriter = new StreamWriter(cleanedPath))
    using (var charsAndDigitsWriter = new StreamWriter($"{name}_TRASH-CHRS+NUMS{extension}"))
    using (var digitsWriter = new StreamWriter($"{name}_TRASH-NUMS{extension}"))
    using (var trashWriter = new StreamWriter($"{name}_TRASH{extension}"))
    {
        for (Ngram current = file.Next(); current != null; current = file.Next())
        {
            string joined = String.Join("", current.Words);
            StreamWriter target;

            if (rgxDigits.IsMatch(joined))
            {
                // Anything containing a digit match is trash; the char check only picks which file.
                target = rgxChars.IsMatch(joined) ? charsAndDigitsWriter : digitsWriter;
            }
            else if (rgxNonChars.IsMatch(joined))
            {
                target = cleanedWriter;
            }
            else
            {
                target = trashWriter;
            }

            target.WriteLine(current.Line);
        }
    }

    return cleanedPath;
}
/// <summary>
/// Loads all unigrams from <paramref name="file"/> into the database with plain parameterized
/// SQL commands on the context's connection. For every word the matching <c>dbo.Words</c> row is
/// selected (and inserted first when missing), then a <c>dbo.UniGramEntities</c> row is inserted.
/// Progress is printed every 10000 n-grams.
/// </summary>
/// <param name="file">N-gram source; <c>ReOpen()</c> is called on it before reading.</param>
/// <param name="db">Context whose connection is opened for the load and closed afterwards.</param>
private void LoadUniGramsSqlCmd(NgramFile file, DiacriticsDBEntities db)
{
    var connection = db.Database.Connection as SqlConnection;

    // `using` guarantees the commands are disposed even when a statement fails mid-load.
    using (var sqlSelect = new SqlCommand("SELECT * FROM dbo.Words WHERE Value = @value", connection))
    using (var sqlInsertWord = new SqlCommand("INSERT INTO dbo.Words (Value) VALUES (@value)", connection))
    using (var sqlInsertUniGram = new SqlCommand("INSERT INTO dbo.UniGramEntities (Word1, WordId, Frequency) VALUES (@word1, @wordId, @frequency)", connection))
    {
        sqlSelect.CommandType = CommandType.Text;
        sqlSelect.Parameters.Add("value", SqlDbType.NVarChar);

        sqlInsertWord.CommandType = CommandType.Text;
        sqlInsertWord.Parameters.Add("value", SqlDbType.NVarChar);

        sqlInsertUniGram.CommandType = CommandType.Text;
        sqlInsertUniGram.Parameters.Add("word1", SqlDbType.NVarChar);
        sqlInsertUniGram.Parameters.Add("wordId", SqlDbType.Int);
        sqlInsertUniGram.Parameters.Add("frequency", SqlDbType.Int);

        db.Database.Connection.Open();
        try
        {
            file.ReOpen();

            Ngram ngram;
            int i = 0;
            while ((ngram = file.Next()) != null)
            {
                foreach (var w in ngram.Words)
                {
                    string nonDiacriticsW = StringRoutines.MyDiacriticsRemover(w);
                    int id = -1;
                    bool wasInserted;

                    // Select the word; when it is missing, insert it and select again to obtain its id.
                    do
                    {
                        wasInserted = false;
                        sqlSelect.Parameters["value"].Value = nonDiacriticsW;

                        // The reader must be closed before another command can run on this connection.
                        using (SqlDataReader reader = sqlSelect.ExecuteReader())
                        {
                            if (reader.Read())
                            {
                                // NOTE(review): assumes the first column of dbo.Words is the int id - confirm.
                                id = (int)reader[0];
                            }
                            else
                            {
                                sqlInsertWord.Parameters["value"].Value = nonDiacriticsW;
                                sqlInsertWord.ExecuteNonQuery();
                                wasInserted = true;
                            }
                        }
                    } while (wasInserted);

                    sqlInsertUniGram.Parameters["word1"].Value = w;
                    sqlInsertUniGram.Parameters["wordId"].Value = id;
                    sqlInsertUniGram.Parameters["frequency"].Value = ngram.Frequency;
                    sqlInsertUniGram.ExecuteNonQuery();
                }

                if (++i % 10000 == 0)
                {
                    Console.WriteLine(i);
                }
            }
        }
        finally
        {
            // Close the connection on failure too, so the context is not left holding an open connection.
            db.Database.Connection.Close();
        }
    }
}