/// <summary>
/// Reads every n-gram from <paramref name="uniGramFiles"/> and indexes it in <c>trie</c>
/// under the diacritics-free form of each of its words.
/// </summary>
/// <param name="uniGramFiles">Source of n-grams; consumed via <c>Next()</c> until it returns null.</param>
/// <param name="ngramLists">Receives every list that is newly created and registered in the trie.</param>
private void LoadUnigrams(UniGramFile uniGramFiles, List<List<string>> ngramLists)
{
    for (Ngram current = uniGramFiles.Next(); current != null; current = uniGramFiles.Next())
    {
        // The stored text is the whole n-gram, words separated by single spaces.
        string joinedWords = string.Join(" ", current.Words);

        foreach (string word in current.Words)
        {
            string key = StringRoutines.MyDiacriticsRemover(word);
            List<string> bucket = trie.Find(key);

            if (bucket != null)
            {
                // Key already known: append to the list the trie already holds.
                bucket.Add(joinedWords);
                continue;
            }

            // First occurrence of this key: the same list instance is shared between
            // ngramLists and the trie.
            bucket = new List<string> { joinedWords };
            ngramLists.Add(bucket);
            trie.Add(key, bucket);
        }
    }
}
/// <summary>
/// Creates the instance by running a <c>TrieCreator</c> over the supplied files
/// and keeping the trie it returns from <c>Get()</c>.
/// </summary>
/// <param name="unigrams">Unigram file passed to <c>TrieCreator.GetOptimizedTrie</c>.</param>
/// <param name="othersNgrams">Remaining n-gram files passed to <c>TrieCreator.GetOptimizedTrie</c>.</param>
public TrieDR(UniGramFile unigrams, List<NgramFile> othersNgrams)
{
    var creator = new TrieCreator();

    creator.GetOptimizedTrie(unigrams, othersNgrams);

    trie = creator.Get();
}
/// <summary>
/// Runs the processing stages over <paramref name="file"/> in a fixed order:
/// optional <c>RemoveWordsFromFreqDown</c>, optional <c>Clean</c>, then <c>RemoveBadWords</c>
/// (always), and finally optional <c>RemoveWordsFromLength</c>.
/// After each intermediate stage its result is wrapped in a new <c>UniGramFile</c> when the
/// original input was a <c>UniGramFile</c>, otherwise in a new <c>NgramFile</c>.
/// </summary>
/// <param name="file">Input file; whether it is a <c>UniGramFile</c> is decided once, up front.</param>
/// <param name="rmvWordsFromFreq">Passed to <c>RemoveWordsFromFreqDown</c>; that stage runs only when this is greater than 0.</param>
/// <param name="clean">When true, the <c>Clean</c> stage runs.</param>
/// <param name="rmvBadWordsFromFreq">Passed to <c>RemoveBadWords</c>.</param>
/// <param name="rmvWordsFromLength">
/// Passed to <c>RemoveWordsFromLength</c>; <c>int.MaxValue</c> skips that stage.
/// </param>
/// <returns>
/// The result of the last stage that ran (<c>RemoveBadWords</c> or <c>RemoveWordsFromLength</c>).
/// NOTE(review): presumably the path of the produced file - confirm against those helpers.
/// </returns>
internal string CompleteProcessing(NgramFile file, int rmvWordsFromFreq = 0, bool clean = true, int rmvBadWordsFromFreq = int.MaxValue, int rmvWordsFromLength = int.MaxValue)
{
    // Captured before 'file' is reassigned so every stage re-wraps as the original kind.
    bool isUniGramFile = file is UniGramFile;

    if (rmvWordsFromFreq > 0)
    {
        var frequencyFiltered = RemoveWordsFromFreqDown(file, rmvWordsFromFreq);
        if (isUniGramFile)
        {
            file = new UniGramFile(frequencyFiltered);
        }
        else
        {
            file = new NgramFile(frequencyFiltered);
        }
    }

    if (clean)
    {
        var cleaned = Clean(file);
        if (isUniGramFile)
        {
            file = new UniGramFile(cleaned);
        }
        else
        {
            file = new NgramFile(cleaned);
        }
    }

    // Bad-word removal happens on both exit paths, so it is done once here.
    var badWordsRemoved = RemoveBadWords(file, rmvBadWordsFromFreq);

    if (rmvWordsFromLength == int.MaxValue)
    {
        return badWordsRemoved;
    }

    if (isUniGramFile)
    {
        file = new UniGramFile(badWordsRemoved);
    }
    else
    {
        file = new NgramFile(badWordsRemoved);
    }

    return RemoveWordsFromLength(file, rmvWordsFromLength);
}
/// <summary>
/// Drives the trie build: loads the unigrams, optimizes the unigram trie, loads the other
/// n-gram files, then calls <c>SwitchUniGramsToTheEnd</c>. A progress line is written to the
/// console after each step.
/// </summary>
/// <param name="uniGramFiles">Unigram file handed to <c>LoadUnigrams</c>.</param>
/// <param name="otherNgramFiles">Remaining n-gram files handed to <c>OptimizedLoad</c>.</param>
internal void GetOptimizedTrie(UniGramFile uniGramFiles, List<NgramFile> otherNgramFiles)
{
    // Filled by LoadUnigrams and reused by the optimize and switch steps.
    List<List<string>> unigramLists = new List<List<string>>();

    LoadUnigrams(uniGramFiles, unigramLists);
    Console.WriteLine("unigrams loaded");

    OptimizeUniGramTrie(unigramLists);
    Console.WriteLine("unigrams optimized");

    OptimizedLoad(otherNgramFiles);
    Console.WriteLine("other files loaded");

    SwitchUniGramsToTheEnd(unigramLists);
    Console.WriteLine("unigrams switched");
}
/// <summary>
/// Runs <c>FileCleaner.CompleteProcessing</c> over the hard-coded unigram file and the
/// hard-coded 2-, 3- and 4-gram files, printing each call's result to the console.
/// All four calls use <c>rmvBadWordsFromFreq: 11</c> and <c>rmvWordsFromLength: 30</c>;
/// only <c>rmvWordsFromFreq</c> differs (0, 1, 2, 3 respectively).
/// </summary>
private static void CleanFiles()
{
    // Settings shared by every file processed below.
    const int badWordsFreq = 11;
    const int wordsLength = 30;

    FileCleaner cleaner = new FileCleaner();

    UniGramFile unigrams = new UniGramFile("D:/slovniky/prim-8.0-public-all-word_frequency_non_case_sensitive/prim-8.0-public-all-word_frequency_non_case_sensitive.txt");
    Console.WriteLine(cleaner.CompleteProcessing(unigrams, rmvWordsFromFreq: 0, rmvBadWordsFromFreq: badWordsFreq, rmvWordsFromLength: wordsLength));

    // Higher-order n-gram files, each with its own rmvWordsFromFreq value.
    var ngramJobs = new[]
    {
        new { Path = "D:/ngramy/prim-8.0-public-all-2-gramy/prim-8.0-public-all-2-gramy.txt", FromFreq = 1 },
        new { Path = "D:/ngramy/prim-8.0-public-all-3-gramy/prim-8.0-public-all-3-gramy.txt", FromFreq = 2 },
        new { Path = "D:/ngramy/prim-8.0-public-all-4-gramy/prim-8.0-public-all-4-gramy.txt", FromFreq = 3 },
    };

    foreach (var job in ngramJobs)
    {
        // Each file is constructed right before it is processed, one after another.
        NgramFile ngrams = new NgramFile(job.Path);
        Console.WriteLine(cleaner.CompleteProcessing(ngrams, rmvWordsFromFreq: job.FromFreq, rmvBadWordsFromFreq: badWordsFreq, rmvWordsFromLength: wordsLength));
    }
}
/// <summary>
/// Builds an in-memory table with one row per entry of <paramref name="file"/>
/// (columns "Id" and "Value") and hands it to <c>InsertIntoDb</c> for the "dbo.Words" table.
/// </summary>
/// <param name="file">Unigram file; re-opened via <c>ReOpen()</c> and then read until <c>Next()</c> returns null.</param>
/// <param name="db">Database context forwarded unchanged to <c>InsertIntoDb</c>.</param>
private void InsertWordsSqlBulkCopy(UniGramFile file, DiacriticsDBEntities db)
{
    DataTable wordsTable = new DataTable();
    wordsTable.Columns.Add("Id");
    wordsTable.Columns.Add("Value");

    file.ReOpen();

    // Serves both as the row's Id (starting at 1) and as the progress counter.
    int rowNumber = 0;

    for (Ngram entry = file.Next(); entry != null; entry = file.Next())
    {
        string strippedWord = StringRoutines.MyDiacriticsRemover(entry.ToString());

        rowNumber++;
        wordsTable.Rows.Add(rowNumber, strippedWord);

        if (rowNumber % 100000 == 0)
        {
            Console.WriteLine(rowNumber + " words prepared for insertion.");
        }
    }

    InsertIntoDb(wordsTable, db, "dbo.Words");
}
/// <summary>
/// Builds an in-memory table with one row per entry of <paramref name="file"/>
/// (columns "Word1", "WordId", "Id", "Frequency") and hands it to <c>InsertIntoDb</c>
/// for the "dbo.UniGramEntities" table.
/// </summary>
/// <param name="file">Unigram file; re-opened via <c>ReOpen()</c> and then read until <c>Next()</c> returns null.</param>
/// <param name="db">Database context forwarded unchanged to <c>InsertIntoDb</c>.</param>
/// <exception cref="InvalidOperationException">
/// <c>wordTrie.Find</c> returned 0 for the diacritics-free form of a word, which this method
/// treats as "word not present in the trie".
/// </exception>
private void InsertUnigramsSqlBulkCopy(UniGramFile file, DiacriticsDBEntities db)
{
    var dtUniGrams = new DataTable();
    dtUniGrams.Columns.Add("Word1");
    dtUniGrams.Columns.Add("WordId");
    dtUniGrams.Columns.Add("Id");
    dtUniGrams.Columns.Add("Frequency");

    file.ReOpen();

    Ngram ng;

    // Row Id (starting at 1); also drives the progress output, since exactly one row is
    // added per entry that does not throw.
    var uniGramId = 0;

    while ((ng = file.Next()) != null)
    {
        string w = ng.ToString();
        string nonDiacriticsW = StringRoutines.MyDiacriticsRemover(w);

        int wordId = wordTrie.Find(nonDiacriticsW);
        if (wordId == 0)
        {
            // A specific exception type instead of the bare System.Exception; message unchanged.
            throw new InvalidOperationException("Word '" + nonDiacriticsW + "' is not present in Trie!");
        }

        // The original (diacritics-bearing) word is stored, linked to the id of its stripped form.
        dtUniGrams.Rows.Add(w, wordId, ++uniGramId, ng.Frequency);

        if (uniGramId % 10000 == 0)
        {
            Console.WriteLine(uniGramId + " unigrams prepared for insertion.");
        }
    }

    InsertIntoDb(dtUniGrams, db, "dbo.UniGramEntities");
}