/// <summary>
/// Creates the object and fills <c>wordTrie</c> from the database.
/// </summary>
public DBCreator()
{
    // The context is only needed while the trie is being built.
    using (DiacriticsDBEntities context = new DiacriticsDBEntities())
    {
        wordTrie = DBTrieCreator.CreateDBTrie(context);
    }
}
/// <summary>
/// Initializes <c>idTrie</c> from the database and records the largest and
/// smallest <c>Id</c> found in the <c>Words</c> set.
/// </summary>
private void initAttrs()
{
    using (DiacriticsDBEntities context = new DiacriticsDBEntities())
    {
        idTrie = DBTrieCreator.CreateDBTrie(context);

        // Both aggregates are evaluated against the Words set of the same context.
        maxId = context.Words.Max(word => word.Id);
        minId = context.Words.Min(word => word.Id);
    }
}
/// <summary>
/// Bulk-copies all rows of <paramref name="dt"/> into the given table.
/// </summary>
/// <param name="dt">Rows to insert.</param>
/// <param name="db">Context whose connection string is used for the bulk copy.</param>
/// <param name="destinationTableName">Name of the target table.</param>
private void InsertIntoDb(DataTable dt, DiacriticsDBEntities db, string destinationTableName)
{
    string connectionString = db.Database.Connection.ConnectionString;

    // KeepIdentity: identity values supplied in the DataTable are preserved.
    using (var bulkCopy = new SqlBulkCopy(connectionString, SqlBulkCopyOptions.KeepIdentity))
    {
        bulkCopy.BatchSize = 10000;
        bulkCopy.NotifyAfter = 10000;
        bulkCopy.SqlRowsCopied += delegate(object sender, SqlRowsCopiedEventArgs e)
        {
            // Progress report, raised after every NotifyAfter rows.
            Console.WriteLine("Inserted " + e.RowsCopied + " records.");
        };
        bulkCopy.DestinationTableName = destinationTableName;

        bulkCopy.WriteToServer(dt);
    }
}
/// <summary>
/// Creates the context, builds <c>wordTrie</c> from it, opens the context's
/// connection and prepares the parameterized command that selects the
/// unigrams of one word ordered by descending frequency.
/// </summary>
public DBDR()
{
    db = new DiacriticsDBEntities();
    wordTrie = DBTrieCreator.CreateDBTrie(db);

    // The connection is opened here and is not closed by this constructor;
    // the prepared command below is bound to it.
    db.Database.Connection.Open();
    SqlConnection connection = db.Database.Connection as SqlConnection;

    sqlSelectUniGrams = new SqlCommand(
        "SELECT Word1 FROM dbo.UniGramEntities WHERE WordId = @id ORDER BY Frequency DESC",
        connection)
    {
        CommandType = CommandType.Text
    };
    sqlSelectUniGrams.Parameters.Add("id", SqlDbType.Int);
}
/// <summary>
/// Loads an n-gram file into the database. The first n-gram of the file is
/// read to determine the n-gram length; only length 1 is handled.
/// </summary>
/// <param name="file">The n-gram file to load.</param>
/// <exception cref="Exception">The n-gram length is not 1.</exception>
internal void LoadFile(NgramFile file)
{
    using (DiacriticsDBEntities context = new DiacriticsDBEntities())
    {
        int ngramLength = file.Next().Words.Length;

        if (ngramLength == 1)
        {
            InsertUnigramsSqlBulkCopy((UniGramFile)file, context);
        }
        else
        {
            throw new Exception("Unknown length of ngams!");
        }
    }
}
// Creating files from DB

/// <summary>
/// Exports the words and their unigrams from the database into two files.
/// Words are read in ascending <c>Id</c> order. For each word, a text line
/// "word position" is written to <paramref name="positionTriePath"/>, where
/// position is the current offset in the binary file. The binary file then
/// receives the number of unigrams for the word followed by each unigram's
/// <c>Value</c>, ordered by descending frequency.
/// </summary>
/// <param name="positionTriePath">Path of the text file receiving "word position" lines.</param>
/// <param name="fileUniGramsPath">Path of the binary file receiving the unigram records (created or overwritten).</param>
internal static void CreateBinaryFileFromDBWordsAndUniGramsEntities(string positionTriePath, string fileUniGramsPath)
{
    using (var db = new DiacriticsDBEntities())
    {
        db.Database.Connection.Open();

        // All unigrams are loaded up front (sorted by word id) so that each
        // word's unigrams can be located with a binary search below.
        List<FileNgram> ngrams = GetAllFileNgrams(db);
        Console.WriteLine("rows created...");

        // The command is disposed deterministically together with its reader.
        using (var sqlSelectWord = new SqlCommand("SELECT Id, Value FROM dbo.Words ORDER BY Id ASC", db.Database.Connection as SqlConnection))
        {
            sqlSelectWord.CommandType = CommandType.Text;

            using (SqlDataReader wordReader = sqlSelectWord.ExecuteReader())
            using (var trieWriter = new StreamWriter(positionTriePath))
            using (BinaryWriter fileWriter = new BinaryWriter(File.Open(fileUniGramsPath, FileMode.Create)))
            {
                Console.WriteLine("started...");
                while (wordReader.Read())
                {
                    int id = wordReader.GetInt32(0);
                    string word = wordReader.GetString(1);

                    // Record where this word's unigram block starts in the binary file.
                    trieWriter.WriteLine(word + " " + fileWriter.BaseStream.Position);

                    var foundNgrams = FindFileNgramsBinarySearch(ngrams, id);
                    SortByFrequencyDesc(foundNgrams);

                    fileWriter.Write(foundNgrams.Count);
                    foreach (var ng in foundNgrams)
                    {
                        fileWriter.Write(ng.Value);
                    }

                    // Progress output.
                    if (id % 100000 == 0)
                    {
                        Console.WriteLine(id);
                    }
                }
            }
        }

        db.Database.Connection.Close();
    }
}
/// <summary>
/// Reads every row of <c>dbo.UniGramEntities</c> into a list of
/// <c>FileNgram</c> and sorts the list with <c>SortByIdAsc</c>.
/// </summary>
/// <param name="db">Context whose connection is used; the connection must already be open.</param>
/// <returns>All unigrams, sorted by <c>SortByIdAsc</c>.</returns>
private static List<FileNgram> GetAllFileNgrams(DiacriticsDBEntities db)
{
    var ret = new List<FileNgram>();

    // Dispose the command deterministically instead of leaving it to the finalizer.
    using (var sqlSelectUniGrams = new SqlCommand("SELECT Word1, Frequency, WordId FROM dbo.UniGramEntities", db.Database.Connection as SqlConnection))
    {
        sqlSelectUniGrams.CommandType = CommandType.Text;

        using (SqlDataReader unigramsReader = sqlSelectUniGrams.ExecuteReader())
        {
            while (unigramsReader.Read())
            {
                // Column order matches the SELECT list: Word1, Frequency, WordId.
                ret.Add(new FileNgram(unigramsReader.GetString(0), unigramsReader.GetInt32(1), unigramsReader.GetInt32(2)));
            }
        }
    }

    SortByIdAsc(ret);
    return ret;
}
/// <summary>
/// Builds a trie from all rows of <c>dbo.Words</c>, adding the string in
/// column 1 as the key and the integer in column 0 as the value.
/// </summary>
/// <param name="db">
/// Context whose connection is used. If the connection is closed it is opened
/// for the duration of the call and closed again afterwards, even on failure;
/// if the caller already opened it, it is reused and left open.
/// </param>
/// <returns>The populated trie.</returns>
public static Trie<char, int> CreateDBTrie(DiacriticsDBEntities db)
{
    var t = new Trie<char, int>();
    var connection = db.Database.Connection;

    // Only take ownership of opening/closing when the caller has not opened
    // the connection; calling Open() on an already open connection throws.
    bool openedHere = connection.State == System.Data.ConnectionState.Closed;

    using (SqlCommand sqlSelect = new SqlCommand("SELECT * FROM dbo.Words", connection as SqlConnection))
    {
        if (openedHere)
        {
            connection.Open();
        }

        try
        {
            Console.WriteLine("Creating word trie...");
            using (SqlDataReader reader = sqlSelect.ExecuteReader())
            {
                while (reader.Read())
                {
                    t.Add((string)reader[1], (int)reader[0]);
                }
            }
            Console.WriteLine("Word trie created.");
        }
        finally
        {
            // Do not leave the caller's context with a dangling open connection on failure.
            if (openedHere)
            {
                connection.Close();
            }
        }
    }

    return t;
}
/// <summary>
/// Returns the <c>Value</c> of every row in <c>dbo.Words</c>, ordered by
/// ascending <c>Id</c>.
/// </summary>
/// <returns>The word values in Id order.</returns>
private List<string> GetWords()
{
    var words = new List<string>();

    using (var db = new DiacriticsDBEntities())
    {
        db.Database.Connection.Open();

        // Dispose the command deterministically instead of leaving it to the finalizer.
        using (var sqlSelectWord = new SqlCommand("SELECT Value FROM dbo.Words ORDER BY Id ASC", db.Database.Connection as SqlConnection))
        {
            sqlSelectWord.CommandType = CommandType.Text;

            using (SqlDataReader wordReader = sqlSelectWord.ExecuteReader())
            {
                while (wordReader.Read())
                {
                    words.Add(wordReader.GetString(0));
                }
            }
        }

        db.Database.Connection.Close();
    }

    return words;
}
/// <summary>
/// Reads every n-gram of <paramref name="file"/>, strips diacritics from its
/// text and bulk-inserts the results into <c>dbo.Words</c> with sequential
/// ids starting at 1.
/// </summary>
/// <param name="file">Unigram file; it is re-opened before reading.</param>
/// <param name="db">Context passed on to <c>InsertIntoDb</c>.</param>
private void InsertWordsSqlBulkCopy(UniGramFile file, DiacriticsDBEntities db)
{
    var wordsTable = new DataTable();
    wordsTable.Columns.Add("Id");
    wordsTable.Columns.Add("Value");

    file.ReOpen();

    // A single counter serves as both the row id and the progress count.
    int prepared = 0;
    for (Ngram ngram = file.Next(); ngram != null; ngram = file.Next())
    {
        string stripped = StringRoutines.MyDiacriticsRemover(ngram.ToString());
        prepared++;
        wordsTable.Rows.Add(prepared, stripped);

        if (prepared % 100000 == 0)
        {
            Console.WriteLine(prepared + " words prepared for insertion.");
        }
    }

    InsertIntoDb(wordsTable, db, "dbo.Words");
}
/// <summary>
/// Reads every n-gram of <paramref name="file"/>, resolves the id of its
/// diacritics-free form through <c>wordTrie</c> and bulk-inserts the rows
/// into <c>dbo.UniGramEntities</c> with sequential ids starting at 1.
/// </summary>
/// <param name="file">Unigram file; it is re-opened before reading.</param>
/// <param name="db">Context passed on to <c>InsertIntoDb</c>.</param>
/// <exception cref="Exception">The trie lookup for a word returned 0.</exception>
private void InsertUnigramsSqlBulkCopy(UniGramFile file, DiacriticsDBEntities db)
{
    var unigramTable = new DataTable();
    foreach (string columnName in new[] { "Word1", "WordId", "Id", "Frequency" })
    {
        unigramTable.Columns.Add(columnName);
    }

    file.ReOpen();

    // A single counter serves as both the unigram id and the progress count.
    int prepared = 0;
    for (Ngram ngram = file.Next(); ngram != null; ngram = file.Next())
    {
        string original = ngram.ToString();
        string stripped = StringRoutines.MyDiacriticsRemover(original);

        // A lookup result of 0 is treated as "word not found".
        int wordId = wordTrie.Find(stripped);
        if (wordId == 0)
        {
            throw new Exception("Word '" + stripped + "' is not present in Trie!");
        }

        prepared++;
        unigramTable.Rows.Add(original, wordId, prepared, ngram.Frequency);

        if (prepared % 10000 == 0)
        {
            Console.WriteLine(prepared + " unigrams prepared for insertion.");
        }
    }

    InsertIntoDb(unigramTable, db, "dbo.UniGramEntities");
}
/// <summary>
/// Loads unigrams through Entity Framework. For every word of every n-gram
/// the diacritics-free form is looked up in <c>Words</c> (and created when
/// missing), and a <c>UniGramEntity</c> referencing it is added. Changes are
/// saved every 100 n-grams and once more at the end for the remainder.
/// </summary>
/// <param name="file">N-gram file; it is re-opened before reading.</param>
/// <param name="db">Context receiving the new entities.</param>
private void LoadUniGramsEF(NgramFile file, DiacriticsDBEntities db)
{
    file.ReOpen();

    // Words added since the last SaveChanges. The database query below cannot
    // see them yet, so without this cache two n-grams of one batch that share
    // the same diacritics-free form would each create their own Word row.
    var pendingWords = new Dictionary<string, Word>();

    Ngram ngram;
    int i = 0;
    while ((ngram = file.Next()) != null)
    {
        foreach (var w in ngram.Words)
        {
            string nonDiacriticsW = StringRoutines.MyDiacriticsRemover(w);

            Word word = null;
            bool isPending = nonDiacriticsW != null && pendingWords.TryGetValue(nonDiacriticsW, out word);
            if (!isPending)
            {
                word = db.Words.Where(a => a.Value == nonDiacriticsW).SingleOrDefault();
                if (word == null)
                {
                    word = new Word() { Value = nonDiacriticsW };
                    db.Words.Add(word);
                    if (nonDiacriticsW != null)
                    {
                        pendingWords.Add(nonDiacriticsW, word);
                    }
                }
            }

            db.UniGramEntities.Add(new UniGramEntity() { Frequency = ngram.Frequency, Word = word, WordId = word.Id, Word1 = w });
        }

        if (++i % 100 == 0)
        {
            db.SaveChanges();
            // Saved words are now visible to the database lookup.
            pendingWords.Clear();
            Console.WriteLine(i);
        }
    }

    // Flush the final partial batch; previously these entities were never saved.
    if (i % 100 != 0)
    {
        db.SaveChanges();
    }
}
/// <summary>
/// Loads unigrams with plain parameterized SQL commands. For every word of
/// every n-gram, the id of its diacritics-free form is selected from
/// <c>dbo.Words</c>; when no row exists the word is inserted and the select
/// is repeated. A row is then inserted into <c>dbo.UniGramEntities</c>.
/// </summary>
/// <param name="file">N-gram file; it is re-opened before reading.</param>
/// <param name="db">Context whose connection is opened for the load and closed afterwards, even on failure.</param>
private void LoadUniGramsSqlCmd(NgramFile file, DiacriticsDBEntities db)
{
    var connection = db.Database.Connection as SqlConnection;

    // using blocks guarantee the commands are disposed even when a statement fails.
    using (var sqlSelect = new SqlCommand("SELECT * FROM dbo.Words WHERE Value = @value", connection))
    using (var sqlInsertWord = new SqlCommand("INSERT INTO dbo.Words (Value) VALUES (@value)", connection))
    using (var sqlInsertUniGram = new SqlCommand("INSERT INTO dbo.UniGramEntities (Word1, WordId, Frequency) VALUES (@word1, @wordId, @frequency)", connection))
    {
        sqlSelect.CommandType = CommandType.Text;
        sqlSelect.Parameters.Add("value", SqlDbType.NVarChar);

        sqlInsertWord.CommandType = CommandType.Text;
        sqlInsertWord.Parameters.Add("value", SqlDbType.NVarChar);

        sqlInsertUniGram.CommandType = CommandType.Text;
        sqlInsertUniGram.Parameters.Add("word1", SqlDbType.NVarChar);
        sqlInsertUniGram.Parameters.Add("wordId", SqlDbType.Int);
        sqlInsertUniGram.Parameters.Add("frequency", SqlDbType.Int);

        db.Database.Connection.Open();
        try
        {
            file.ReOpen();
            Ngram ngram;
            int i = 0;
            while ((ngram = file.Next()) != null)
            {
                foreach (var w in ngram.Words)
                {
                    string nonDiacriticsW = StringRoutines.MyDiacriticsRemover(w);
                    int id = -1;
                    bool wasInserted;
                    do
                    {
                        wasInserted = false;
                        sqlSelect.Parameters["value"].Value = nonDiacriticsW;

                        // The reader must be closed before another command can run
                        // on the same connection; using closes it on failure too.
                        using (SqlDataReader reader = sqlSelect.ExecuteReader())
                        {
                            if (reader.Read())
                            {
                                id = (int)reader[0];
                            }
                            else
                            {
                                wasInserted = true;
                            }
                        }

                        if (wasInserted)
                        {
                            // Word is missing: insert it, then loop to select its new id.
                            sqlInsertWord.Parameters["value"].Value = nonDiacriticsW;
                            sqlInsertWord.ExecuteNonQuery();
                        }
                    } while (wasInserted);

                    sqlInsertUniGram.Parameters["word1"].Value = w;
                    sqlInsertUniGram.Parameters["wordId"].Value = id;
                    sqlInsertUniGram.Parameters["frequency"].Value = ngram.Frequency;
                    sqlInsertUniGram.ExecuteNonQuery();
                }

                // Progress output.
                if (++i % 10000 == 0)
                {
                    Console.WriteLine(i);
                }
            }
        }
        finally
        {
            db.Database.Connection.Close();
        }
    }
}