示例#1
0
        /// <summary>
        /// Reads <paramref name="file"/> until <c>Next()</c> returns null and registers every n-gram in
        /// <c>trie</c>. Each n-gram is stored as its words joined by a single space, once for every word
        /// it contains, under the key that <c>StringRoutines.MyDiacriticsRemover</c> returns for that word.
        /// </summary>
        /// <param name="file">Source of n-grams; consumed from its current position.</param>
        internal void Load(NgramFile file)
        {
            for (Ngram current = file.Next(); current != null; current = file.Next())
            {
                string joinedWords = string.Join(" ", current.Words);

                foreach (string word in current.Words)
                {
                    string key = StringRoutines.MyDiacriticsRemover(word);
                    List<string> bucket = trie.Find(key);

                    if (bucket != null)
                    {
                        // Key already present: append to the list stored in the trie.
                        bucket.Add(joinedWords);
                        continue;
                    }

                    // First occurrence of this key: start a new list.
                    trie.Add(key, new List<string> { joinedWords });
                }
            }
        }
示例#2
0
        /// <summary>
        /// Collects the n-grams at positions <paramref name="from"/> through <paramref name="to"/>
        /// (counted from 1, both inclusive) of <paramref name="file"/>, producing one
        /// <c>FileNgram</c> per word of each n-gram.
        /// </summary>
        /// <param name="file">Source of n-grams; read sequentially via <c>Next()</c>.</param>
        /// <param name="from">Position of the first n-gram to include.</param>
        /// <param name="to">Position of the last n-gram to include.</param>
        /// <returns>
        /// The collected entries; the list is shorter (possibly empty) when the file ends early.
        /// </returns>
        private List <FileNgram> GetFileNgrams(NgramFile file, int from, int to)
        {
            var collected = new List<FileNgram>();
            int position = 1;

            // Skip everything in front of the requested range.
            for (; position < from; position++)
            {
                Ngram skipped = file.Next();
                if (skipped == null)
                {
                    return collected;
                }
            }

            // Read the requested range, stopping early when the file runs out.
            for (; position <= to; position++)
            {
                Ngram current = file.Next();
                if (current == null)
                {
                    break;
                }

                foreach (var word in current.Words)
                {
                    int wordId = idTrie.Find(StringRoutines.MyDiacriticsRemover(word));
                    collected.Add(new FileNgram(current.ToString(), current.Frequency, wordId));
                }
            }

            return collected;
        }
示例#3
0
        /// <summary>
        /// Runs the cleaning pipeline on <paramref name="file"/>: optional frequency cut
        /// (<c>RemoveWordsFromFreqDown</c>), optional <c>Clean</c>, then <c>RemoveBadWords</c>, and
        /// optionally <c>RemoveWordsFromLength</c>. The string returned by each step is wrapped in a new
        /// <c>UniGramFile</c> or <c>NgramFile</c> (matching the runtime type of the original argument)
        /// and fed to the next step.
        /// </summary>
        /// <param name="file">Input file.</param>
        /// <param name="rmvWordsFromFreq">Frequency cut; the step is skipped unless the value is positive.</param>
        /// <param name="clean">Whether to run <c>Clean</c>.</param>
        /// <param name="rmvBadWordsFromFreq">Frequency threshold handed to <c>RemoveBadWords</c>.</param>
        /// <param name="rmvWordsFromLength">
        /// Length threshold handed to <c>RemoveWordsFromLength</c>; <c>int.MaxValue</c> skips that step.
        /// </param>
        /// <returns>The string returned by the last step that was executed.</returns>
        internal string CompleteProcessing(NgramFile file, int rmvWordsFromFreq = 0, bool clean           = true,
                                           int rmvBadWordsFromFreq = int.MaxValue, int rmvWordsFromLength = int.MaxValue)
        {
            // Remember the flavour of the input so every intermediate file is wrapped the same way.
            bool wrapAsUniGram = file is UniGramFile;

            if (rmvWordsFromFreq > 0)
            {
                string keptByFrequency = RemoveWordsFromFreqDown(file, rmvWordsFromFreq);
                if (wrapAsUniGram)
                {
                    file = new UniGramFile(keptByFrequency);
                }
                else
                {
                    file = new NgramFile(keptByFrequency);
                }
            }

            if (clean)
            {
                string cleaned = Clean(file);
                if (wrapAsUniGram)
                {
                    file = new UniGramFile(cleaned);
                }
                else
                {
                    file = new NgramFile(cleaned);
                }
            }

            string goodWords = RemoveBadWords(file, rmvBadWordsFromFreq);
            if (rmvWordsFromLength == int.MaxValue)
            {
                return goodWords;
            }

            if (wrapAsUniGram)
            {
                file = new UniGramFile(goodWords);
            }
            else
            {
                file = new NgramFile(goodWords);
            }
            return RemoveWordsFromLength(file, rmvWordsFromLength);
        }
示例#4
0
        /// <summary>
        /// Splits <paramref name="file"/> into two files next to it: n-grams whose words are all at most
        /// <paramref name="fromLength"/> characters long go to the "_TO-LENGTH-" file, every other
        /// n-gram goes to the "_FROM-LENGTH-" file.
        /// </summary>
        /// <param name="file">Source of n-grams; read until <c>Next()</c> returns null.</param>
        /// <param name="fromLength">Maximum word length that is still kept.</param>
        /// <returns>Path of the "_TO-LENGTH-" file.</returns>
        internal string RemoveWordsFromLength(NgramFile file, int fromLength)
        {
            string baseName    = file.FileName;
            string suffix      = file.FileExtension;
            string keptPath    = $"{baseName}_TO-LENGTH-{fromLength}{suffix}";
            string removedPath = $"{baseName}_FROM-LENGTH-{fromLength}{suffix}";

            using (var keptWriter = new StreamWriter(keptPath))
            using (var removedWriter = new StreamWriter(removedPath))
            {
                for (Ngram current = file.Next(); current != null; current = file.Next())
                {
                    // Assume the n-gram is kept until one over-long word proves otherwise.
                    StreamWriter target = keptWriter;
                    foreach (var word in current.Words)
                    {
                        if (word.Length > fromLength)
                        {
                            target = removedWriter;
                            break;
                        }
                    }

                    target.WriteLine(current.Line);
                }
            }

            return keptPath;
        }
示例#5
0
        /// <summary>
        /// Derives the division count from the number of words in the first n-gram that
        /// <paramref name="file"/> yields, then calls <c>file.ReOpen()</c> (presumably so later readers
        /// start from the beginning again — confirm against <c>NgramFile</c>).
        /// </summary>
        /// <param name="file">File to inspect; one n-gram is consumed before <c>ReOpen()</c> is called.</param>
        /// <returns>The value returned by <c>GetDivisionCountByNumber</c> for that word count.</returns>
        private int GetSuitableCountForDivision(NgramFile file)
        {
            var firstNgram = file.Next();
            int divisionCount = GetDivisionCountByNumber(firstNgram.Words.Length);

            file.ReOpen();
            return divisionCount;
        }
示例#6
0
        /// <summary>
        /// Wraps <paramref name="gramEntries"/> in an <c>NgramFile</c>, serializes it to JSON, writes the
        /// text to the path given by <c>GetFileName</c>, and then stores the entries in <c>_nGrams</c>
        /// under the key given by <c>GetKey</c>.
        /// </summary>
        /// <param name="index">Index passed to <c>GetFileName</c> and <c>GetKey</c>.</param>
        /// <param name="gramEntries">Entries to persist; its <c>Size</c> is used for the path, the key and <c>N</c>.</param>
        public void Set(string index, NGram gramEntries)
        {
            var targetPath = GetFileName(index, gramEntries.Size);

            var payload = new NgramFile();
            payload.Entryes = gramEntries;
            payload.N       = gramEntries.Size;

            File.WriteAllText(targetPath, JsonConvert.SerializeObject(payload));

            // Update the in-memory map only after the file has been written.
            _nGrams[GetKey(index, gramEntries.Size)] = gramEntries;
        }
示例#7
0
        /// <summary>
        /// Loads <paramref name="file"/> into the database. The number of words in the first n-gram
        /// decides the loader; only single-word n-grams are supported, and they are handed to
        /// <c>InsertUnigramsSqlBulkCopy</c>.
        /// </summary>
        /// <param name="file">
        /// File to load; one n-gram is consumed for the check, and it is cast to <c>UniGramFile</c>
        /// when that n-gram has exactly one word.
        /// </param>
        /// <exception cref="Exception">The first n-gram does not have exactly one word.</exception>
        internal void LoadFile(NgramFile file)
        {
            using (var db = new DiacriticsDBEntities())
            {
                var wordsPerNgram = file.Next().Words.Length;
                if (wordsPerNgram != 1)
                {
                    throw new Exception("Unknown length of ngams!");
                }

                InsertUnigramsSqlBulkCopy((UniGramFile)file, db);
            }
        }
示例#8
0
        /// <summary>
        /// Runs <c>FileCleaner.CompleteProcessing</c> on the hard-coded unigram, 2-gram, 3-gram and
        /// 4-gram files, in that order, and prints each returned value to the console.
        /// </summary>
        private static void CleanFiles()
        {
            var cleaner = new FileCleaner();

            // Unigrams: frequency cut of 0.
            var unigrams = new UniGramFile("D:/slovniky/prim-8.0-public-all-word_frequency_non_case_sensitive/prim-8.0-public-all-word_frequency_non_case_sensitive.txt");
            var unigramsResult = cleaner.CompleteProcessing(unigrams, rmvWordsFromFreq: 0, rmvBadWordsFromFreq: 11, rmvWordsFromLength: 30);
            Console.WriteLine(unigramsResult);

            // 2-grams: frequency cut of 1.
            var bigrams = new NgramFile("D:/ngramy/prim-8.0-public-all-2-gramy/prim-8.0-public-all-2-gramy.txt");
            var bigramsResult = cleaner.CompleteProcessing(bigrams, rmvWordsFromFreq: 1, rmvBadWordsFromFreq: 11, rmvWordsFromLength: 30);
            Console.WriteLine(bigramsResult);

            // 3-grams: frequency cut of 2.
            var trigrams = new NgramFile("D:/ngramy/prim-8.0-public-all-3-gramy/prim-8.0-public-all-3-gramy.txt");
            var trigramsResult = cleaner.CompleteProcessing(trigrams, rmvWordsFromFreq: 2, rmvBadWordsFromFreq: 11, rmvWordsFromLength: 30);
            Console.WriteLine(trigramsResult);

            // 4-grams: frequency cut of 3.
            var fourgrams = new NgramFile("D:/ngramy/prim-8.0-public-all-4-gramy/prim-8.0-public-all-4-gramy.txt");
            var fourgramsResult = cleaner.CompleteProcessing(fourgrams, rmvWordsFromFreq: 3, rmvBadWordsFromFreq: 11, rmvWordsFromLength: 30);
            Console.WriteLine(fourgramsResult);
        }
示例#9
0
        /// <summary>
        /// Reads every n-gram of <paramref name="file"/> into memory, orders them by the length of their
        /// <c>ToString()</c> text (ascending), and writes them to a "_SORTED" file next to the input.
        /// Each output line holds the text, the frequency and the text length in parentheses.
        /// </summary>
        /// <param name="file">Source of n-grams; read until <c>Next()</c> returns null.</param>
        internal void SortByLineLength(NgramFile file)
        {
            var collected = new List<Ngram>();
            for (Ngram current = file.Next(); current != null; current = file.Next())
            {
                collected.Add(current);
            }

            Ngram[] ordered = collected.ToArray();
            Comparison<Ngram> byTextLength =
                (left, right) => left.ToString().Length.CompareTo(right.ToString().Length);
            Array.Sort(ordered, byTextLength);

            string outputPath = $"{file.FileName}_SORTED{file.FileExtension}";
            using (var output = new StreamWriter(outputPath))
            {
                for (int index = 0; index < ordered.Length; index++)
                {
                    Ngram entry = ordered[index];
                    output.WriteLine(entry.ToString() + $" {entry.Frequency} ({entry.ToString().Length})");
                }
            }
        }
示例#10
0
        /// <summary>
        /// Reads up to <paramref name="count"/> n-grams from <paramref name="file"/> and returns one
        /// <c>FileNgram</c> per word of each n-gram read.
        /// </summary>
        /// <remarks>
        /// The chunk size is checked BEFORE the next n-gram is fetched. The previous implementation
        /// fetched first and compared afterwards, so the n-gram read when the chunk was already full was
        /// discarded and one record was lost at every chunk boundary.
        /// As a consequence, when the file ends exactly on a chunk boundary the end is only reported by
        /// the following call, which returns an empty list with <paramref name="endOfFile"/> set.
        /// </remarks>
        /// <param name="file">Source of n-grams; read sequentially via <c>Next()</c>.</param>
        /// <param name="count">Maximum number of n-grams to read; must be positive.</param>
        /// <param name="endOfFile">Set to true when <c>Next()</c> returned null during this call, otherwise false.</param>
        /// <returns>The entries built from the n-grams read in this call.</returns>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="count"/> is not positive.</exception>
        /// <exception cref="Exception">A word resolves to id 0 in <c>idTrie</c>.</exception>
        private List <FileNgram> DivideFileBy(NgramFile file, int count, ref bool endOfFile)
        {
            if (count <= 0)
            {
                // A non-positive chunk size could never make progress through the file.
                throw new ArgumentOutOfRangeException("count", count, "Chunk size must be positive.");
            }

            var  ret        = new List <FileNgram>();
            bool reachedEnd = false;
            int  i          = 0;

            while (i < count)
            {
                Ngram ng = file.Next();
                if (ng == null)
                {
                    reachedEnd = true;
                    break;
                }
                i++;

                foreach (var w in ng.Words)
                {
                    int id = idTrie.Find(StringRoutines.MyDiacriticsRemover(w));
                    if (id == 0)
                    {
                        throw new Exception("Word '" + w + "' is not in idTrie!!!");
                    }
                    ret.Add(new FileNgram(ng.ToString(), ng.Frequency, id));
                }
            }

            endOfFile = reachedEnd;
            Console.WriteLine("part of file loaded...");
            return(ret);
        }
示例#11
0
        /// <summary>
        /// Splits <paramref name="file"/> by frequency into two files next to it: n-grams whose frequency
        /// is at most <paramref name="fromFrequency"/> go to the "_FROM-…-DOWN" file, all others go to
        /// the "_TO-…" file.
        /// </summary>
        /// <param name="file">Source of n-grams; read until <c>Next()</c> returns null.</param>
        /// <param name="fromFrequency">Highest frequency that is still removed.</param>
        /// <returns>Path of the "_TO-…" file, which holds the n-grams above the threshold.</returns>
        internal string RemoveWordsFromFreqDown(NgramFile file, int fromFrequency)
        {
            string baseName    = file.FileName;
            string suffix      = file.FileExtension;
            string removedPath = $"{baseName}_FROM-{fromFrequency}-DOWN{suffix}";
            string keptPath    = $"{baseName}_TO-{fromFrequency}{suffix}";

            using (var removedWriter = new StreamWriter(removedPath))
            using (var keptWriter = new StreamWriter(keptPath))
            {
                for (Ngram current = file.Next(); current != null; current = file.Next())
                {
                    StreamWriter target = current.Frequency <= fromFrequency ? removedWriter : keptWriter;
                    target.WriteLine(current.Line);
                }
            }

            return keptPath;
        }
示例#12
0
        /// <summary>
        /// Splits <paramref name="file"/> into a "_GOOD-WORDS" and a "_BAD-WORDS" file next to it.
        /// An n-gram counts as good when its frequency exceeds <paramref name="fromFrequency"/> or when
        /// <c>IsGoodWord</c> accepts its text after <c>StringRoutines.MyDiacriticsRemover</c>.
        /// </summary>
        /// <param name="file">Source of n-grams; read until <c>Next()</c> returns null.</param>
        /// <param name="fromFrequency">Frequency above which an n-gram is accepted without the word check.</param>
        /// <returns>Path of the "_GOOD-WORDS" file.</returns>
        public string RemoveBadWords(NgramFile file, int fromFrequency)
        {
            string baseName = file.FileName;
            string suffix   = file.FileExtension;
            string goodPath = $"{baseName}_GOOD-WORDS{suffix}";
            string badPath  = $"{baseName}_BAD-WORDS{suffix}";

            using (var goodWriter = new StreamWriter(goodPath))
            using (var badWriter = new StreamWriter(badPath))
            {
                for (Ngram current = file.Next(); current != null; current = file.Next())
                {
                    // The word check only runs when the frequency alone does not qualify the n-gram.
                    bool isGood = current.Frequency > fromFrequency
                                  || IsGoodWord(StringRoutines.MyDiacriticsRemover(current.ToString()));

                    StreamWriter target = isGood ? goodWriter : badWriter;
                    target.WriteLine(current.Line);
                }
            }

            return goodPath;
        }
示例#13
0
        /// <summary>
        /// Loads the n-grams of <paramref name="file"/> through the entity context: for every word a
        /// <c>UniGramEntity</c> is added that points at the <c>Word</c> whose <c>Value</c> equals the
        /// result of <c>StringRoutines.MyDiacriticsRemover</c>; missing <c>Word</c> rows are created.
        /// Changes are saved every 100 n-grams and once more after the last n-gram.
        /// </summary>
        /// <remarks>
        /// Two defects of the earlier version are fixed here:
        /// the final partial batch (fewer than 100 n-grams) was never saved, and a normalized word that
        /// occurred twice within one unsaved batch produced two <c>Word</c> entities, because the lookup
        /// query only sees rows that are already in the database.
        /// </remarks>
        /// <param name="file">Source of n-grams; <c>ReOpen()</c> is called on it before reading.</param>
        /// <param name="db">Context that receives the new entities.</param>
        private void LoadUniGramsEF(NgramFile file, DiacriticsDBEntities db)
        {
            file.ReOpen();

            // Words added to the context but not yet saved, keyed by their normalized value.
            var   pendingWords = new System.Collections.Generic.Dictionary<string, Word>();
            Ngram ngram;
            int   i = 0;

            while ((ngram = file.Next()) != null)
            {
                foreach (var w in ngram.Words)
                {
                    string nonDiacriticsW = StringRoutines.MyDiacriticsRemover(w);
                    Word   word           = null;

                    bool isPending = nonDiacriticsW != null && pendingWords.TryGetValue(nonDiacriticsW, out word);
                    if (!isPending)
                    {
                        word = db.Words.Where(a => a.Value == nonDiacriticsW).SingleOrDefault();
                        if (word == null)
                        {
                            word = new Word()
                            {
                                Value = nonDiacriticsW
                            };
                            db.Words.Add(word);
                            if (nonDiacriticsW != null)
                            {
                                pendingWords[nonDiacriticsW] = word;
                            }
                        }
                    }

                    db.UniGramEntities.Add(new UniGramEntity()
                    {
                        Frequency = ngram.Frequency,
                        Word      = word,
                        WordId    = word.Id,
                        Word1     = w
                    });
                }
                if (++i % 100 == 0)
                {
                    db.SaveChanges();
                    // Saved words are now visible to the lookup query.
                    pendingWords.Clear();
                    Console.WriteLine(i);
                }
            }

            // Persist the last partial batch.
            db.SaveChanges();
        }
示例#14
0
        /// <summary>
        /// Sorts the n-grams of <paramref name="file"/> into four files next to it, based on the
        /// concatenation of each n-gram's words:
        /// matches of both <c>rgxChars</c> and <c>rgxDigits</c> go to "_TRASH-CHRS+NUMS",
        /// other matches of <c>rgxDigits</c> go to "_TRASH-NUMS",
        /// remaining matches of <c>rgxNonChars</c> go to "_CLEANED",
        /// and everything else goes to "_TRASH".
        /// </summary>
        /// <param name="file">Source of n-grams; read until <c>Next()</c> returns null.</param>
        /// <returns>Path of the "_CLEANED" file.</returns>
        internal string Clean(NgramFile file)
        {
            string baseName    = file.FileName;
            string suffix      = file.FileExtension;
            string cleanedPath = $"{baseName}_CLEANED{suffix}";

            using (var cleanedWriter = new StreamWriter(cleanedPath))
            using (var charsAndDigitsWriter = new StreamWriter($"{baseName}_TRASH-CHRS+NUMS{suffix}"))
            using (var digitsWriter = new StreamWriter($"{baseName}_TRASH-NUMS{suffix}"))
            using (var trashWriter = new StreamWriter($"{baseName}_TRASH{suffix}"))
            {
                for (Ngram current = file.Next(); current != null; current = file.Next())
                {
                    string joined = String.Join("", current.Words);
                    bool hasDigits = rgxDigits.IsMatch(joined);

                    StreamWriter target;
                    if (hasDigits)
                    {
                        // Digits present: the file depends on whether rgxChars matches as well.
                        target = rgxChars.IsMatch(joined) ? charsAndDigitsWriter : digitsWriter;
                    }
                    else if (rgxNonChars.IsMatch(joined))
                    {
                        target = cleanedWriter;
                    }
                    else
                    {
                        target = trashWriter;
                    }

                    target.WriteLine(current.Line);
                }
            }

            return cleanedPath;
        }
示例#15
0
        /// <summary>
        /// Loads the n-grams of <paramref name="file"/> with plain SQL commands on the connection of
        /// <paramref name="db"/>: for every word the id of the matching <c>dbo.Words</c> row is looked up
        /// (the row is inserted first when missing) and a <c>dbo.UniGramEntities</c> row is inserted.
        /// Progress is printed every 10000 n-grams.
        /// </summary>
        /// <remarks>
        /// The reader of the lookup query is closed before the word insert runs; executing another
        /// command while a reader is still open on the same connection fails unless MARS is enabled.
        /// Commands, reader and connection are released even when an exception is thrown.
        /// </remarks>
        /// <param name="file">Source of n-grams; <c>ReOpen()</c> is called on it before reading.</param>
        /// <param name="db">Context whose database connection is opened, used and closed again.</param>
        private void LoadUniGramsSqlCmd(NgramFile file, DiacriticsDBEntities db)
        {
            var connection = db.Database.Connection as SqlConnection;

            using (var sqlSelect = new SqlCommand("SELECT * FROM dbo.Words WHERE Value = @value", connection))
            using (var sqlInsertWord = new SqlCommand("INSERT INTO dbo.Words (Value) VALUES (@value)", connection))
            using (var sqlInsertUniGram = new SqlCommand("INSERT INTO dbo.UniGramEntities (Word1, WordId, Frequency) VALUES (@word1, @wordId, @frequency)",
                                                         connection))
            {
                sqlSelect.CommandType = CommandType.Text;
                sqlSelect.Parameters.Add("value", SqlDbType.NVarChar);

                sqlInsertWord.CommandType = CommandType.Text;
                sqlInsertWord.Parameters.Add("value", SqlDbType.NVarChar);

                sqlInsertUniGram.CommandType = CommandType.Text;
                sqlInsertUniGram.Parameters.Add("word1", SqlDbType.NVarChar);
                sqlInsertUniGram.Parameters.Add("wordId", SqlDbType.Int);
                sqlInsertUniGram.Parameters.Add("frequency", SqlDbType.Int);

                db.Database.Connection.Open();
                try
                {
                    file.ReOpen();
                    Ngram ngram;
                    int   i = 0;

                    while ((ngram = file.Next()) != null)
                    {
                        foreach (var w in ngram.Words)
                        {
                            string nonDiacriticsW = StringRoutines.MyDiacriticsRemover(w);
                            int    id             = -1;
                            bool   found          = false;

                            // Look the word up; when it is missing, insert it and look it up again
                            // to obtain the id of the new row.
                            while (!found)
                            {
                                sqlSelect.Parameters["value"].Value = nonDiacriticsW;
                                using (SqlDataReader reader = sqlSelect.ExecuteReader())
                                {
                                    if (reader.Read())
                                    {
                                        id    = (int)reader[0];
                                        found = true;
                                    }
                                }

                                if (!found)
                                {
                                    // The reader is closed at this point, so the connection is free.
                                    sqlInsertWord.Parameters["value"].Value = nonDiacriticsW;
                                    sqlInsertWord.ExecuteNonQuery();
                                }
                            }

                            sqlInsertUniGram.Parameters["word1"].Value     = w;
                            sqlInsertUniGram.Parameters["wordId"].Value    = id;
                            sqlInsertUniGram.Parameters["frequency"].Value = ngram.Frequency;
                            sqlInsertUniGram.ExecuteNonQuery();
                        }
                        if (++i % 10000 == 0)
                        {
                            Console.WriteLine(i);
                        }
                    }
                }
                finally
                {
                    db.Database.Connection.Close();
                }
            }
        }