/// <summary>
/// Drains <paramref name="uniGramFiles"/> and, for every word of every n-gram read,
/// files the n-gram's space-joined text in the trie under that word's
/// diacritics-free form.
/// </summary>
/// <param name="uniGramFiles">N-gram source; read until <c>Next()</c> yields null.</param>
/// <param name="ngramLists">Receives each bucket list that had to be created for a new trie key.</param>
private void LoadUnigrams(UniGramFile uniGramFiles, List <List <string> > ngramLists)
        {
            for (Ngram current = uniGramFiles.Next(); current != null; current = uniGramFiles.Next())
            {
                // The same joined text is stored for every word of this n-gram.
                string joinedWords = string.Join(" ", current.Words);

                foreach (string word in current.Words)
                {
                    string key = StringRoutines.MyDiacriticsRemover(word);
                    List<string> bucket = trie.Find(key);

                    if (bucket != null)
                    {
                        // Key already known: append to its existing bucket.
                        bucket.Add(joinedWords);
                        continue;
                    }

                    // First time this key is seen: create the bucket, record it
                    // for the caller, then register it in the trie.
                    bucket = new List<string>();
                    bucket.Add(joinedWords);
                    ngramLists.Add(bucket);
                    trie.Add(key, bucket);
                }
            }
        }
Exemple #2
0
        /// <summary>
        /// Re-opens <paramref name="file"/>, collects the diacritics-free text of every
        /// n-gram into an in-memory table with sequential ids starting at 1, and passes
        /// that table to <c>InsertIntoDb</c> for the <c>dbo.Words</c> destination.
        /// </summary>
        /// <param name="file">N-gram source; read until <c>Next()</c> yields null.</param>
        /// <param name="db">Database context forwarded unchanged to <c>InsertIntoDb</c>.</param>
        private void InsertWordsSqlBulkCopy(UniGramFile file, DiacriticsDBEntities db)
        {
            var wordsTable = new DataTable();
            wordsTable.Columns.Add("Id");
            wordsTable.Columns.Add("Value");

            file.ReOpen();

            // One counter serves as both the row id and the progress tally:
            // each row read gets the next id, so the two values never diverge.
            int rowCount = 0;
            for (Ngram entry = file.Next(); entry != null; entry = file.Next())
            {
                string stripped = StringRoutines.MyDiacriticsRemover(entry.ToString());
                rowCount++;
                wordsTable.Rows.Add(rowCount, stripped);

                bool atProgressMark = rowCount % 100000 == 0;
                if (atProgressMark)
                {
                    Console.WriteLine(rowCount + " words prepared for insertion.");
                }
            }

            InsertIntoDb(wordsTable, db, "dbo.Words");
        }
Exemple #3
0
        /// <summary>
        /// Re-opens <paramref name="file"/>, builds an in-memory table with one row per
        /// n-gram (its text, the id found for its diacritics-free form in
        /// <c>wordTrie</c>, a sequential unigram id starting at 1, and its frequency),
        /// then passes the table to <c>InsertIntoDb</c> for the
        /// <c>dbo.UniGramEntities</c> destination.
        /// </summary>
        /// <param name="file">N-gram source; read until <c>Next()</c> yields null.</param>
        /// <param name="db">Database context forwarded unchanged to <c>InsertIntoDb</c>.</param>
        /// <exception cref="InvalidOperationException">
        /// The trie lookup for a word returned 0, which this method treats as
        /// "word not present". Nothing is handed to <c>InsertIntoDb</c> in that case.
        /// </exception>
        private void InsertUnigramsSqlBulkCopy(UniGramFile file, DiacriticsDBEntities db)
        {
            var dtUniGrams = new DataTable();

            dtUniGrams.Columns.Add("Word1");
            dtUniGrams.Columns.Add("WordId");
            dtUniGrams.Columns.Add("Id");
            dtUniGrams.Columns.Add("Frequency");

            file.ReOpen();
            Ngram ng;
            var   uniGramId = 0;
            var   counter   = 0;

            while ((ng = file.Next()) != null)
            {
                string w = ng.ToString();
                string nonDiacriticsW = StringRoutines.MyDiacriticsRemover(w);

                int wordId = wordTrie.Find(nonDiacriticsW);
                if (wordId == 0)
                {
                    // A specific exception type (instead of bare System.Exception) lets
                    // callers handle this condition without catching everything; it is
                    // still caught by existing catch (Exception) handlers.
                    throw new InvalidOperationException("Word '" + nonDiacriticsW + "' is not present in Trie!");
                }

                dtUniGrams.Rows.Add(w, wordId, ++uniGramId, ng.Frequency);

                if (++counter % 10000 == 0)
                {
                    Console.WriteLine(counter + " unigrams prepared for insertion.");
                }
            }
            InsertIntoDb(dtUniGrams, db, "dbo.UniGramEntities");
        }