/// <summary>
/// Reads every n-gram from <paramref name="uniGramFiles"/> and indexes its space-joined text in
/// <c>trie</c> under the diacritics-stripped form of each of its words.
/// </summary>
/// <param name="uniGramFiles">Source consumed through <c>Next()</c> until it returns null.</param>
/// <param name="ngramLists">Receives each list that is newly created for a key missing from <c>trie</c>.</param>
private void LoadUnigrams(UniGramFile uniGramFiles, List<List<string>> ngramLists)
{
    for (Ngram current = uniGramFiles.Next(); current != null; current = uniGramFiles.Next())
    {
        // The same joined text is stored once per word of the n-gram.
        string joinedWords = string.Join(" ", current.Words);

        foreach (string word in current.Words)
        {
            string key = StringRoutines.MyDiacriticsRemover(word);
            List<string> bucket = trie.Find(key);

            if (bucket != null)
            {
                // Key already indexed: extend the existing list in place.
                bucket.Add(joinedWords);
                continue;
            }

            // First occurrence of this key: the same list instance is registered
            // both in ngramLists and in the trie.
            bucket = new List<string>();
            bucket.Add(joinedWords);
            ngramLists.Add(bucket);
            trie.Add(key, bucket);
        }
    }
}
/// <summary>
/// Re-reads <paramref name="file"/> from the start, builds an in-memory table with one row per
/// n-gram (a sequential id and the diacritics-stripped text), and hands it to
/// <c>InsertIntoDb</c> for the <c>dbo.Words</c> destination.
/// </summary>
/// <param name="file">Source that is reopened and then consumed through <c>Next()</c> until it returns null.</param>
/// <param name="db">Database context forwarded unchanged to <c>InsertIntoDb</c>.</param>
private void InsertWordsSqlBulkCopy(UniGramFile file, DiacriticsDBEntities db)
{
    var wordsTable = new DataTable();
    wordsTable.Columns.Add("Id");
    wordsTable.Columns.Add("Value");

    file.ReOpen();

    int lastId = 0;     // ids start at 1 and grow by one per row
    int prepared = 0;   // rows added so far, used only for progress output

    for (Ngram entry = file.Next(); entry != null; entry = file.Next())
    {
        string strippedWord = StringRoutines.MyDiacriticsRemover(entry.ToString());

        lastId += 1;
        wordsTable.Rows.Add(lastId, strippedWord);

        prepared += 1;
        bool reportProgress = prepared % 100000 == 0;
        if (reportProgress)
        {
            Console.WriteLine(prepared + " words prepared for insertion.");
        }
    }

    InsertIntoDb(wordsTable, db, "dbo.Words");
}
/// <summary>
/// Re-reads <paramref name="file"/> from the start, builds an in-memory table with one row per
/// unigram (original text, id looked up in <c>wordTrie</c>, sequential unigram id, frequency),
/// and hands it to <c>InsertIntoDb</c> for the <c>dbo.UniGramEntities</c> destination.
/// </summary>
/// <param name="file">Source that is reopened and then consumed through <c>Next()</c> until it returns null.</param>
/// <param name="db">Database context forwarded unchanged to <c>InsertIntoDb</c>.</param>
/// <exception cref="InvalidOperationException">
/// The diacritics-stripped form of a unigram resolves to id 0 in <c>wordTrie</c>, which this
/// method treats as "not present". Nothing is passed to <c>InsertIntoDb</c> in that case.
/// </exception>
private void InsertUnigramsSqlBulkCopy(UniGramFile file, DiacriticsDBEntities db)
{
    var dtUniGrams = new DataTable();
    dtUniGrams.Columns.Add("Word1");
    dtUniGrams.Columns.Add("WordId");
    dtUniGrams.Columns.Add("Id");
    dtUniGrams.Columns.Add("Frequency");

    file.ReOpen();

    Ngram ng;
    var uniGramId = 0;
    var counter = 0;
    while ((ng = file.Next()) != null)
    {
        string w = ng.ToString();
        string nonDiacriticsW = StringRoutines.MyDiacriticsRemover(w);

        int wordId = wordTrie.Find(nonDiacriticsW);
        if (wordId == 0)
        {
            // A specific exception type instead of the base Exception; it still derives from
            // Exception, so existing catch (Exception) handlers keep working.
            throw new InvalidOperationException("Word '" + nonDiacriticsW + "' is not present in Trie!");
        }

        // Values are positional and must follow the column order declared above.
        dtUniGrams.Rows.Add(w, wordId, ++uniGramId, ng.Frequency);

        if (++counter % 10000 == 0)
        {
            Console.WriteLine(counter + " unigrams prepared for insertion.");
        }
    }

    InsertIntoDb(dtUniGrams, db, "dbo.UniGramEntities");
}