Ejemplo n.º 1
0
        /// <summary>
        /// Reads every entry from <paramref name="uniGramFiles"/> until <c>Next()</c> returns null and
        /// indexes it in the <c>trie</c> field under the diacritics-free form of each of its words.
        /// </summary>
        /// <param name="uniGramFiles">Source of n-gram entries; consumed via <c>Next()</c>.</param>
        /// <param name="ngramLists">
        /// Receives every newly created per-key list, so the caller holds a reference to
        /// each list that was stored in the trie by this call.
        /// </param>
        private void LoadUnigrams(UniGramFile uniGramFiles, List <List <string> > ngramLists)
        {
            for (Ngram entry = uniGramFiles.Next(); entry != null; entry = uniGramFiles.Next())
            {
                // The same space-joined line is stored for every word of the entry.
                string joinedLine = string.Join(" ", entry.Words);

                foreach (string word in entry.Words)
                {
                    string key = StringRoutines.MyDiacriticsRemover(word);
                    List<string> bucket = trie.Find(key);

                    if (bucket != null)
                    {
                        // Key already known: extend the existing list in place.
                        bucket.Add(joinedLine);
                        continue;
                    }

                    // First occurrence of this key: create its list and register it
                    // both with the caller's collection and with the trie.
                    bucket = new List<string> { joinedLine };
                    ngramLists.Add(bucket);
                    trie.Add(key, bucket);
                }
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Creates the instance by letting a fresh <c>TrieCreator</c> process the given files
        /// and storing the structure it returns from <c>Get()</c> in the <c>trie</c> field.
        /// </summary>
        /// <param name="unigrams">Unigram file forwarded to <c>TrieCreator.GetOptimizedTrie</c>.</param>
        /// <param name="othersNgrams">Remaining n-gram files forwarded to <c>TrieCreator.GetOptimizedTrie</c>.</param>
        public TrieDR(UniGramFile unigrams, List <NgramFile> othersNgrams)
        {
            var creator = new TrieCreator();
            creator.GetOptimizedTrie(unigrams, othersNgrams);

            // The creator must have finished processing before its result is fetched.
            trie = creator.Get();
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Runs the configured processing steps over <paramref name="file"/> in a fixed order:
        /// <c>RemoveWordsFromFreqDown</c> (optional), <c>Clean</c> (optional), <c>RemoveBadWords</c>
        /// (always) and <c>RemoveWordsFromLength</c> (optional). After each intermediate step the
        /// result is wrapped in a new file object of the same kind as the input
        /// (<c>UniGramFile</c> or <c>NgramFile</c>) and fed into the next step.
        /// </summary>
        /// <param name="file">The file to process.</param>
        /// <param name="rmvWordsFromFreq">Passed to <c>RemoveWordsFromFreqDown</c>; the step is skipped unless the value is greater than 0.</param>
        /// <param name="clean">When true, the <c>Clean</c> step is executed.</param>
        /// <param name="rmvBadWordsFromFreq">Passed to <c>RemoveBadWords</c>.</param>
        /// <param name="rmvWordsFromLength">Passed to <c>RemoveWordsFromLength</c>; the step is skipped when the value equals <c>int.MaxValue</c>.</param>
        /// <returns>
        /// The value returned by the last executed step
        /// (presumably the path of the resulting file - TODO confirm against the step implementations).
        /// </returns>
        internal string CompleteProcessing(NgramFile file, int rmvWordsFromFreq = 0, bool clean           = true,
                                           int rmvBadWordsFromFreq = int.MaxValue, int rmvWordsFromLength = int.MaxValue)
        {
            // Remember the input kind once; every intermediate result is re-wrapped as the same kind.
            bool isUniGramFile = file is UniGramFile;

            if (rmvWordsFromFreq > 0)
            {
                var reduced = RemoveWordsFromFreqDown(file, rmvWordsFromFreq);
                file = isUniGramFile ? new UniGramFile(reduced) : new NgramFile(reduced);
            }

            if (clean)
            {
                var cleaned = Clean(file);
                file = isUniGramFile ? new UniGramFile(cleaned) : new NgramFile(cleaned);
            }

            var withoutBadWords = RemoveBadWords(file, rmvBadWordsFromFreq);

            // No length limit requested: the bad-word pass is the final step.
            if (rmvWordsFromLength == int.MaxValue)
            {
                return withoutBadWords;
            }

            file = isUniGramFile ? new UniGramFile(withoutBadWords) : new NgramFile(withoutBadWords);
            return RemoveWordsFromLength(file, rmvWordsFromLength);
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Runs the four trie-building stages in order - load unigrams, optimize the unigram trie,
        /// load the remaining n-gram files, move the unigrams to the end - and writes a progress
        /// message to the console after each stage.
        /// </summary>
        /// <param name="uniGramFiles">Unigram file handed to <c>LoadUnigrams</c>.</param>
        /// <param name="otherNgramFiles">Remaining n-gram files handed to <c>OptimizedLoad</c>.</param>
        internal void GetOptimizedTrie(UniGramFile uniGramFiles, List <NgramFile> otherNgramFiles)
        {
            // Filled by LoadUnigrams and then shared with the later unigram stages.
            var unigramBuckets = new List<List<string>>();

            // Stage 1
            LoadUnigrams(uniGramFiles, unigramBuckets);
            Console.WriteLine("unigrams loaded");

            // Stage 2
            OptimizeUniGramTrie(unigramBuckets);
            Console.WriteLine("unigrams optimized");

            // Stage 3
            OptimizedLoad(otherNgramFiles);
            Console.WriteLine("other files loaded");

            // Stage 4
            SwitchUniGramsToTheEnd(unigramBuckets);
            Console.WriteLine("unigrams switched");
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Runs <c>FileCleaner.CompleteProcessing</c> over the hard-coded unigram file and the
        /// hard-coded 2-, 3- and 4-gram files, printing the value returned for each one.
        /// The unigram file is processed with <c>rmvWordsFromFreq</c> 0; the 2-, 3- and 4-gram
        /// files use 1, 2 and 3 respectively. All files share the same bad-word frequency (11)
        /// and word length (30) arguments.
        /// </summary>
        private static void CleanFiles()
        {
            const int badWordsFrequency = 11;
            const int wordLength        = 30;

            var cleaner = new FileCleaner();

            var unigramSource = new UniGramFile("D:/slovniky/prim-8.0-public-all-word_frequency_non_case_sensitive/prim-8.0-public-all-word_frequency_non_case_sensitive.txt");

            Console.WriteLine(cleaner.CompleteProcessing(unigramSource, rmvWordsFromFreq: 0, rmvBadWordsFromFreq: badWordsFrequency, rmvWordsFromLength: wordLength));

            // Ordered 2-gram, 3-gram, 4-gram; the array index + 1 is the rmvWordsFromFreq argument.
            string[] higherOrderPaths =
            {
                "D:/ngramy/prim-8.0-public-all-2-gramy/prim-8.0-public-all-2-gramy.txt",
                "D:/ngramy/prim-8.0-public-all-3-gramy/prim-8.0-public-all-3-gramy.txt",
                "D:/ngramy/prim-8.0-public-all-4-gramy/prim-8.0-public-all-4-gramy.txt"
            };

            for (int index = 0; index < higherOrderPaths.Length; index++)
            {
                // Each file object is created right before it is processed, one at a time.
                var ngramSource = new NgramFile(higherOrderPaths[index]);

                Console.WriteLine(cleaner.CompleteProcessing(ngramSource, rmvWordsFromFreq: index + 1, rmvBadWordsFromFreq: badWordsFrequency, rmvWordsFromLength: wordLength));
            }
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Builds an in-memory table with the columns <c>Id</c> and <c>Value</c> - one row per entry
        /// read from <paramref name="file"/> - and hands it to <c>InsertIntoDb</c> for the
        /// <c>dbo.Words</c> target. <c>Value</c> is the entry's <c>ToString()</c> with diacritics
        /// removed; <c>Id</c> is a running counter starting at 1.
        /// </summary>
        /// <param name="file">Unigram file; it is re-opened and then read until <c>Next()</c> returns null.</param>
        /// <param name="db">Database context forwarded to <c>InsertIntoDb</c>.</param>
        private void InsertWordsSqlBulkCopy(UniGramFile file, DiacriticsDBEntities db)
        {
            // DataTable is IDisposable; release it once the insert has finished (or failed).
            using (var dtWords = new DataTable())
            {
                dtWords.Columns.Add("Id");
                dtWords.Columns.Add("Value");

                file.ReOpen();
                Ngram ng;
                // Serves both as the row id and as the progress counter: the two were always equal.
                int id = 0;

                while ((ng = file.Next()) != null)
                {
                    string nonDiacriticsW = StringRoutines.MyDiacriticsRemover(ng.ToString());
                    dtWords.Rows.Add(++id, nonDiacriticsW);

                    if (id % 100000 == 0)
                    {
                        Console.WriteLine(id + " words prepared for insertion.");
                    }
                }

                InsertIntoDb(dtWords, db, "dbo.Words");
            }
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Builds an in-memory table with the columns <c>Word1</c>, <c>WordId</c>, <c>Id</c> and
        /// <c>Frequency</c> - one row per entry read from <paramref name="file"/> - and hands it to
        /// <c>InsertIntoDb</c> for the <c>dbo.UniGramEntities</c> target. <c>WordId</c> is looked up
        /// in <c>wordTrie</c> by the diacritics-free form of the word; <c>Id</c> is a running counter
        /// starting at 1.
        /// </summary>
        /// <param name="file">Unigram file; it is re-opened and then read until <c>Next()</c> returns null.</param>
        /// <param name="db">Database context forwarded to <c>InsertIntoDb</c>.</param>
        /// <exception cref="InvalidOperationException">
        /// The lookup of a word in <c>wordTrie</c> returned 0, which this method treats as "not present".
        /// </exception>
        private void InsertUnigramsSqlBulkCopy(UniGramFile file, DiacriticsDBEntities db)
        {
            // DataTable is IDisposable; release it once the insert has finished (or failed).
            using (var dtUniGrams = new DataTable())
            {
                dtUniGrams.Columns.Add("Word1");
                dtUniGrams.Columns.Add("WordId");
                dtUniGrams.Columns.Add("Id");
                dtUniGrams.Columns.Add("Frequency");

                file.ReOpen();
                Ngram ng;
                var   uniGramId = 0;
                var   counter   = 0;

                while ((ng = file.Next()) != null)
                {
                    string w = ng.ToString();
                    string nonDiacriticsW = StringRoutines.MyDiacriticsRemover(w);

                    int wordId = wordTrie.Find(nonDiacriticsW);
                    if (wordId == 0)
                    {
                        // A specific exception type instead of the reserved System.Exception;
                        // it still derives from Exception, so existing catch blocks keep working.
                        throw new InvalidOperationException("Word '" + nonDiacriticsW + "' is not present in Trie!");
                    }

                    dtUniGrams.Rows.Add(w, wordId, ++uniGramId, ng.Frequency);

                    if (++counter % 10000 == 0)
                    {
                        Console.WriteLine(counter + " unigrams prepared for insertion.");
                    }
                }

                InsertIntoDb(dtUniGrams, db, "dbo.UniGramEntities");
            }
        }