示例#1
0
 /// <summary>
 /// Initializes <c>wordTrie</c> from <c>DBTrieCreator.CreateDBTrie</c>, using a
 /// database context that is disposed as soon as the trie has been built.
 /// </summary>
 public DBCreator()
 {
     // The context is only needed while the trie is being created.
     using (var context = new DiacriticsDBEntities())
     {
         wordTrie = DBTrieCreator.CreateDBTrie(context);
     }
 }
示例#2
0
 /// <summary>
 /// Populates <c>idTrie</c>, <c>maxId</c> and <c>minId</c> from a database
 /// context that is disposed once all three values have been read.
 /// </summary>
 private void initAttrs()
 {
     using (var context = new DiacriticsDBEntities())
     {
         idTrie = DBTrieCreator.CreateDBTrie(context);

         // Largest and smallest word identifiers currently in the Words set.
         var words = context.Words;
         maxId = words.Max(word => word.Id);
         minId = words.Min(word => word.Id);
     }
 }
示例#3
0
        /// <summary>
        /// Bulk-copies every row of <paramref name="dt"/> into
        /// <paramref name="destinationTableName"/> over a new connection created
        /// from the context's connection string.
        /// </summary>
        /// <param name="dt">Rows to insert.</param>
        /// <param name="db">Context whose connection string is used for the bulk copy.</param>
        /// <param name="destinationTableName">Name of the target table.</param>
        private void InsertIntoDb(DataTable dt, DiacriticsDBEntities db, string destinationTableName)
        {
            // Rows are sent in batches of this size, and progress is reported at the same interval.
            const int rowsPerBatch = 10000;
            string connectionString = db.Database.Connection.ConnectionString;

            // KeepIdentity: the identity values supplied in the table are kept as-is.
            using (var bulkCopy = new SqlBulkCopy(connectionString, SqlBulkCopyOptions.KeepIdentity))
            {
                bulkCopy.DestinationTableName = destinationTableName;
                bulkCopy.BatchSize            = rowsPerBatch;
                bulkCopy.NotifyAfter          = rowsPerBatch;
                bulkCopy.SqlRowsCopied       += (source, progress) =>
                {
                    Console.WriteLine("Inserted " + progress.RowsCopied + " records.");
                };

                bulkCopy.WriteToServer(dt);
            }
        }
示例#4
0
        /// <summary>
        /// Creates the database context, builds the word trie, then opens the
        /// context's connection and prepares the parameterized command that
        /// selects the unigrams of a word ordered by descending frequency.
        /// </summary>
        public DBDR()
        {
            db       = new DiacriticsDBEntities();
            wordTrie = DBTrieCreator.CreateDBTrie(db);

            // The connection is opened here and left open; the prepared command below uses it.
            var connection = db.Database.Connection;
            connection.Open();

            sqlSelectUniGrams = new SqlCommand(
                "SELECT Word1 FROM dbo.UniGramEntities WHERE WordId = @id ORDER BY Frequency DESC",
                connection as SqlConnection)
            {
                CommandType = CommandType.Text
            };
            sqlSelectUniGrams.Parameters.Add("id", SqlDbType.Int);
        }
示例#5
0
        /// <summary>
        /// Loads the n-grams of <paramref name="file"/> into the database. The
        /// n-gram order is detected from the first entry of the file; only
        /// unigram files (one word per entry) are supported.
        /// </summary>
        /// <param name="file">N-gram file to load; its first entry is consumed to detect the order.</param>
        /// <exception cref="ArgumentNullException"><paramref name="file"/> is null.</exception>
        /// <exception cref="InvalidOperationException">The file yields no entry.</exception>
        /// <exception cref="NotSupportedException">The entries have an unsupported number of words.</exception>
        internal void LoadFile(NgramFile file)
        {
            if (file == null)
            {
                throw new ArgumentNullException("file");
            }

            // Next() signals end of file with null, so an empty file must be
            // rejected here instead of failing with a NullReferenceException.
            Ngram first = file.Next();
            if (first == null)
            {
                throw new InvalidOperationException("The n-gram file contains no entries.");
            }

            using (var db = new DiacriticsDBEntities())
            {
                switch (first.Words.Length)
                {
                case 1:
                    InsertUnigramsSqlBulkCopy((UniGramFile)file, db);
                    break;

                default:
                    throw new NotSupportedException("Unknown length of ngams!");
                }
            }
        }
示例#6
0
        // Creating files from DB

        /// <summary>
        /// Exports the words and their unigrams from the database into two files.
        /// For every word (in ascending Id order) the text file at
        /// <paramref name="positionTriePath"/> receives a line
        /// "<c>word position</c>", where position is the current offset in the
        /// binary file, and the binary file at <paramref name="fileUniGramsPath"/>
        /// receives the number of unigrams found for the word followed by each
        /// unigram's <c>Value</c>, sorted by descending frequency.
        /// </summary>
        /// <param name="positionTriePath">Path of the text file to create.</param>
        /// <param name="fileUniGramsPath">Path of the binary file to create (overwritten if it exists).</param>
        internal static void CreateBinaryFileFromDBWordsAndUniGramsEntities(string positionTriePath, string fileUniGramsPath)
        {
            using (var db = new DiacriticsDBEntities())
            {
                db.Database.Connection.Open();

                // All unigrams are loaded into memory up front so that each word
                // can be matched against them without another query.
                List<FileNgram> ngrams = GetAllFileNgrams(db);
                Console.WriteLine("rows created...");

                // The command is disposed together with the reader and writers.
                using (var sqlSelectWord = new SqlCommand("SELECT Id, Value FROM dbo.Words ORDER BY Id ASC",
                                                          db.Database.Connection as SqlConnection))
                {
                    sqlSelectWord.CommandType = CommandType.Text;

                    using (SqlDataReader wordReader = sqlSelectWord.ExecuteReader())
                    using (var trieWriter = new StreamWriter(positionTriePath))
                    using (BinaryWriter fileWriter = new BinaryWriter(File.Open(fileUniGramsPath, FileMode.Create)))
                    {
                        Console.WriteLine("started...");
                        while (wordReader.Read())
                        {
                            int    id   = wordReader.GetInt32(0);
                            string word = wordReader.GetString(1);

                            // Record where this word's block starts in the binary file.
                            trieWriter.WriteLine(word + " " + fileWriter.BaseStream.Position);

                            var foundNgrams = FindFileNgramsBinarySearch(ngrams, id);
                            SortByFrequencyDesc(foundNgrams);

                            // Block layout: count, then the value of every unigram.
                            fileWriter.Write(foundNgrams.Count);
                            foreach (var ng in foundNgrams)
                            {
                                fileWriter.Write(ng.Value);
                            }

                            // Progress output for long-running exports.
                            if (id % 100000 == 0)
                            {
                                Console.WriteLine(id);
                            }
                        }
                    }
                }

                db.Database.Connection.Close();
            }
        }
示例#7
0
        /// <summary>
        /// Reads every row of <c>dbo.UniGramEntities</c> into a list of
        /// <c>FileNgram</c> objects and sorts the list with <c>SortByIdAsc</c>.
        /// </summary>
        /// <param name="db">
        /// Context whose connection is used. The connection must already be open:
        /// this method executes the query without opening or closing it.
        /// </param>
        /// <returns>All unigrams, sorted by <c>SortByIdAsc</c>.</returns>
        private static List <FileNgram> GetAllFileNgrams(DiacriticsDBEntities db)
        {
            var ret = new List<FileNgram>();

            // Dispose the command once reading has finished (it was previously leaked).
            using (var sqlSelectUniGrams = new SqlCommand("SELECT Word1, Frequency, WordId FROM dbo.UniGramEntities",
                                                          db.Database.Connection as SqlConnection))
            {
                sqlSelectUniGrams.CommandType = CommandType.Text;

                using (SqlDataReader unigramsReader = sqlSelectUniGrams.ExecuteReader())
                {
                    while (unigramsReader.Read())
                    {
                        // Column order follows the SELECT list: Word1, Frequency, WordId.
                        ret.Add(new FileNgram(unigramsReader.GetString(0), unigramsReader.GetInt32(1), unigramsReader.GetInt32(2)));
                    }
                }
            }

            SortByIdAsc(ret);
            return ret;
        }
示例#8
0
        /// <summary>
        /// Builds a trie that maps every word value in <c>dbo.Words</c> to its Id.
        /// </summary>
        /// <param name="db">
        /// Context whose connection is used. If the connection is not open it is
        /// opened for the duration of the call and closed again afterwards, even
        /// when reading fails; an already open connection is used and left open.
        /// </param>
        /// <returns>The populated trie.</returns>
        public static Trie <char, int> CreateDBTrie(DiacriticsDBEntities db)
        {
            var t          = new Trie<char, int>();
            var connection = db.Database.Connection;

            // Only close the connection if this method was the one that opened it.
            bool openedHere = connection.State != System.Data.ConnectionState.Open;

            // Columns are selected explicitly so that the positional reads below
            // do not depend on the physical column order of the table.
            using (SqlCommand sqlSelect = new SqlCommand("SELECT Id, Value FROM dbo.Words", connection as SqlConnection))
            {
                if (openedHere)
                {
                    connection.Open();
                }

                try
                {
                    Console.WriteLine("Creating word trie...");
                    using (SqlDataReader reader = sqlSelect.ExecuteReader())
                    {
                        while (reader.Read())
                        {
                            // reader[0] = Id, reader[1] = Value.
                            t.Add((string)reader[1], (int)reader[0]);
                        }
                    }
                    Console.WriteLine("Word trie created.");
                }
                finally
                {
                    if (openedHere)
                    {
                        connection.Close();
                    }
                }
            }

            return t;
        }
示例#9
0
        /// <summary>
        /// Reads the <c>Value</c> of every row in <c>dbo.Words</c>, ordered by
        /// ascending Id, using a short-lived database context.
        /// </summary>
        /// <returns>The word values in Id order.</returns>
        private List <string> GetWords()
        {
            var words = new List<string>();

            using (var db = new DiacriticsDBEntities())
            {
                db.Database.Connection.Open();

                // Dispose the command once reading has finished (it was previously leaked).
                using (var sqlSelectWord = new SqlCommand("SELECT Value FROM dbo.Words ORDER BY Id ASC",
                                                          db.Database.Connection as SqlConnection))
                {
                    sqlSelectWord.CommandType = CommandType.Text;

                    using (SqlDataReader wordReader = sqlSelectWord.ExecuteReader())
                    {
                        while (wordReader.Read())
                        {
                            words.Add(wordReader.GetString(0));
                        }
                    }
                }

                db.Database.Connection.Close();
            }

            return words;
        }
示例#10
0
        /// <summary>
        /// Re-reads <paramref name="file"/> from the start, builds an in-memory
        /// table with one row per entry (a running 1-based Id and the entry text
        /// passed through <c>StringRoutines.MyDiacriticsRemover</c>), and inserts
        /// the table into <c>dbo.Words</c> via <c>InsertIntoDb</c>.
        /// </summary>
        /// <param name="file">Unigram file providing the words.</param>
        /// <param name="db">Context passed on to <c>InsertIntoDb</c>.</param>
        private void InsertWordsSqlBulkCopy(UniGramFile file, DiacriticsDBEntities db)
        {
            var table = new DataTable();

            foreach (string columnName in new[] { "Id", "Value" })
            {
                table.Columns.Add(columnName);
            }

            file.ReOpen();

            // Serves both as the Id of the row and as the progress counter.
            int rowNumber = 0;

            for (Ngram entry = file.Next(); entry != null; entry = file.Next())
            {
                rowNumber++;

                string stripped = StringRoutines.MyDiacriticsRemover(entry.ToString());
                table.Rows.Add(rowNumber, stripped);

                bool reportProgress = rowNumber % 100000 == 0;
                if (reportProgress)
                {
                    Console.WriteLine(rowNumber + " words prepared for insertion.");
                }
            }

            InsertIntoDb(table, db, "dbo.Words");
        }
示例#11
0
        /// <summary>
        /// Re-reads <paramref name="file"/> from the start, builds an in-memory
        /// table with one row per unigram (original text, Id of the word looked up
        /// in <c>wordTrie</c>, a running 1-based unigram Id, and the frequency),
        /// and inserts the table into <c>dbo.UniGramEntities</c> via
        /// <c>InsertIntoDb</c>.
        /// </summary>
        /// <param name="file">Unigram file to read.</param>
        /// <param name="db">Context passed on to <c>InsertIntoDb</c>.</param>
        /// <exception cref="Exception">
        /// The trie lookup of a word returned 0, which is treated as "not present".
        /// </exception>
        private void InsertUnigramsSqlBulkCopy(UniGramFile file, DiacriticsDBEntities db)
        {
            var table = new DataTable();

            foreach (string columnName in new[] { "Word1", "WordId", "Id", "Frequency" })
            {
                table.Columns.Add(columnName);
            }

            file.ReOpen();

            int lastUniGramId = 0;
            int processed     = 0;

            for (Ngram entry = file.Next(); entry != null; entry = file.Next())
            {
                string original = entry.ToString();
                string stripped = StringRoutines.MyDiacriticsRemover(original);

                // The trie is keyed by the text without diacritics.
                int wordId = wordTrie.Find(stripped);
                if (wordId == 0)
                {
                    throw new Exception("Word '" + stripped + "' is not present in Trie!");
                }

                lastUniGramId++;
                table.Rows.Add(original, wordId, lastUniGramId, entry.Frequency);

                processed++;
                if (processed % 10000 == 0)
                {
                    Console.WriteLine(processed + " unigrams prepared for insertion.");
                }
            }

            InsertIntoDb(table, db, "dbo.UniGramEntities");
        }
示例#12
0
        /// <summary>
        /// Re-reads <paramref name="file"/> from the start and, through Entity
        /// Framework, adds one <c>UniGramEntity</c> per word of every n-gram,
        /// creating the matching <c>Word</c> (text passed through
        /// <c>StringRoutines.MyDiacriticsRemover</c>) when it does not exist yet.
        /// Changes are saved every 100 n-grams and once more at the end.
        /// </summary>
        /// <param name="file">N-gram file to load.</param>
        /// <param name="db">Context that receives and saves the new entities.</param>
        private void LoadUniGramsEF(NgramFile file, DiacriticsDBEntities db)
        {
            file.ReOpen();

            // Words added since the last SaveChanges. The database query below
            // cannot see them yet, so without this lookup a word repeated within
            // one batch would be inserted twice.
            var pendingWords = new System.Collections.Generic.Dictionary<string, Word>();

            Ngram ngram;
            int   i = 0;

            while ((ngram = file.Next()) != null)
            {
                foreach (var w in ngram.Words)
                {
                    string nonDiacriticsW = StringRoutines.MyDiacriticsRemover(w);

                    Word word;
                    if (!pendingWords.TryGetValue(nonDiacriticsW, out word))
                    {
                        word = db.Words.SingleOrDefault(a => a.Value == nonDiacriticsW);
                        if (word == null)
                        {
                            word = new Word()
                            {
                                Value = nonDiacriticsW
                            };
                            db.Words.Add(word);
                            pendingWords.Add(nonDiacriticsW, word);
                        }
                    }

                    db.UniGramEntities.Add(new UniGramEntity()
                    {
                        Frequency = ngram.Frequency,
                        Word      = word,
                        WordId    = word.Id,
                        Word1     = w
                    });
                }

                if (++i % 100 == 0)
                {
                    db.SaveChanges();
                    // Saved words are now visible to the database query.
                    pendingWords.Clear();
                    Console.WriteLine(i);
                }
            }

            // Flush the last partial batch; previously up to 99 n-grams were never saved.
            if (i % 100 != 0)
            {
                db.SaveChanges();
            }
        }
示例#13
0
        /// <summary>
        /// Re-reads <paramref name="file"/> from the start and, using plain SQL
        /// commands on the context's connection, inserts one row into
        /// <c>dbo.UniGramEntities</c> per word of every n-gram. The matching row
        /// in <c>dbo.Words</c> (text passed through
        /// <c>StringRoutines.MyDiacriticsRemover</c>) is looked up and inserted
        /// first when it is missing.
        /// </summary>
        /// <param name="file">N-gram file to load.</param>
        /// <param name="db">Context whose connection is opened for the call and closed afterwards.</param>
        private void LoadUniGramsSqlCmd(NgramFile file, DiacriticsDBEntities db)
        {
            var connection = db.Database.Connection as SqlConnection;

            // The using blocks release the commands even when a statement fails.
            using (var sqlSelect = new SqlCommand("SELECT * FROM dbo.Words WHERE Value = @value", connection))
            using (var sqlInsertWord = new SqlCommand("INSERT INTO dbo.Words (Value) VALUES (@value)", connection))
            using (var sqlInsertUniGram = new SqlCommand("INSERT INTO dbo.UniGramEntities (Word1, WordId, Frequency) VALUES (@word1, @wordId, @frequency)",
                                                         connection))
            {
                sqlSelect.CommandType = CommandType.Text;
                sqlSelect.Parameters.Add("value", SqlDbType.NVarChar);

                sqlInsertWord.CommandType = CommandType.Text;
                sqlInsertWord.Parameters.Add("value", SqlDbType.NVarChar);

                sqlInsertUniGram.CommandType = CommandType.Text;
                sqlInsertUniGram.Parameters.Add("word1", SqlDbType.NVarChar);
                sqlInsertUniGram.Parameters.Add("wordId", SqlDbType.Int);
                sqlInsertUniGram.Parameters.Add("frequency", SqlDbType.Int);

                db.Database.Connection.Open();
                try
                {
                    file.ReOpen();
                    Ngram ngram;
                    int   i = 0;

                    while ((ngram = file.Next()) != null)
                    {
                        foreach (var w in ngram.Words)
                        {
                            string nonDiacriticsW = StringRoutines.MyDiacriticsRemover(w);
                            int    id             = -1;
                            bool   found          = false;

                            // Select the word; if it is missing, insert it and select
                            // again to obtain its Id. NOTE(review): as in the original,
                            // this loops until the SELECT matches the inserted value.
                            while (!found)
                            {
                                sqlSelect.Parameters["value"].Value = nonDiacriticsW;

                                // The reader is closed before the INSERT runs, so the
                                // connection does not need multiple active result sets.
                                using (SqlDataReader reader = sqlSelect.ExecuteReader())
                                {
                                    if (reader.Read())
                                    {
                                        id    = (int)reader[0];
                                        found = true;
                                    }
                                }

                                if (!found)
                                {
                                    sqlInsertWord.Parameters["value"].Value = nonDiacriticsW;
                                    sqlInsertWord.ExecuteNonQuery();
                                }
                            }

                            sqlInsertUniGram.Parameters["word1"].Value     = w;
                            sqlInsertUniGram.Parameters["wordId"].Value    = id;
                            sqlInsertUniGram.Parameters["frequency"].Value = ngram.Frequency;
                            sqlInsertUniGram.ExecuteNonQuery();
                        }

                        // Progress output for long-running loads.
                        if (++i % 10000 == 0)
                        {
                            Console.WriteLine(i);
                        }
                    }
                }
                finally
                {
                    // Close the connection even when a statement above throws.
                    db.Database.Connection.Close();
                }
            }
        }