示例#1
0
        /// <summary>
        /// Registers <paramref name="ngram"/> in the cache under every space-separated
        /// word of its diacritics-free form.
        /// </summary>
        /// <param name="ngram">The n-gram to cache; a null value is ignored.</param>
        public void Add(string ngram)
        {
            if (ngram != null)
            {
                var keys = stringRoutines.MyDiacriticsRemover(ngram).Split(' ');
                foreach (var key in keys)
                {
                    var bucket = cache.Find(key);
                    if (bucket == null)
                    {
                        // First n-gram seen for this key: start a new bucket.
                        cache.Add(key, new List <string> { ngram });
                        continue;
                    }
                    bucket.Add(ngram);
                }

                // Size bookkeeping runs only when the size flag is set.
                if (isSetSize)
                {
                    CheckSize(ngram);
                }
            }
        }
示例#2
0
        /// <summary>
        /// Reads every n-gram from <paramref name="uniGramFiles"/> and indexes its
        /// space-joined text in the trie under each of its words with diacritics removed.
        /// </summary>
        /// <param name="uniGramFiles">Source that is read via <c>Next()</c> until it returns null.</param>
        /// <param name="ngramLists">Receives every list that is newly created for a trie key.</param>
        private void LoadUnigrams(UniGramFile uniGramFiles, List <List <string> > ngramLists)
        {
            for (Ngram entry = uniGramFiles.Next(); entry != null; entry = uniGramFiles.Next())
            {
                string joined = string.Join(" ", entry.Words);
                foreach (string rawWord in entry.Words)
                {
                    string        key    = StringRoutines.MyDiacriticsRemover(rawWord);
                    List <string> bucket = trie.Find(key);
                    if (bucket != null)
                    {
                        bucket.Add(joined);
                        continue;
                    }

                    // Unknown key: create its list, record it for the caller, then index it.
                    bucket = new List <string> { joined };
                    ngramLists.Add(bucket);
                    trie.Add(key, bucket);
                }
            }
        }
示例#3
0
        /// <summary>
        /// Appends n-grams from each file in <paramref name="otherNgramFiles"/> to lists that
        /// already exist in the trie, subject to a per-n-gram-size cap on the list length.
        /// </summary>
        /// <remarks>
        /// The n-gram size of a file is taken from the word count of its first entry. An n-gram is
        /// appended to the list of a word only when that list already exists, holds more than one
        /// item and is still shorter than the cap for the file's size. Empty files are skipped.
        /// </remarks>
        /// <param name="otherNgramFiles">Files to read; each one is re-opened after its size is probed.</param>
        private void OptimizedLoad(List <NgramFile> otherNgramFiles)
        {
            // Cap on list length, indexed by n-gram size (index 0 is unused).
            // NOTE(review): sizes of 5 or more have no cap entry and will raise
            // IndexOutOfRangeException when the cap is looked up.
            var maxAllowedConut = new int[5] {
                0, 702, 702, 352, 107
            };

            // load 1, 350, 245, 105  (1 2 3 4)
            foreach (var file in otherNgramFiles)
            {
                Ngram  ngram;
                string lineWordsFormated;

                // Probe the first entry to learn the n-gram size; an empty file has none,
                // so skip it instead of dereferencing null.
                Ngram first = file.Next();
                if (first == null)
                {
                    continue;
                }
                int size = first.Words.Length;
                file.ReOpen();

                while ((ngram = file.Next()) != null)
                {
                    lineWordsFormated = string.Join(" ", ngram.Words);
                    foreach (string w in ngram.Words)
                    {
                        string        nonDiacriticsWord = StringRoutines.MyDiacriticsRemover(w);
                        List <string> foundList         = trie.Find(nonDiacriticsWord);

                        if (foundList != null && foundList.Count > 1 && foundList.Count < maxAllowedConut[size])
                        {
                            foundList.Add(lineWordsFormated);
                        }
                    }
                }
                Console.WriteLine(" 4 3 2");
            }
        }
示例#4
0
        /// <summary>
        /// Reads <paramref name="file"/> to its end and indexes each n-gram's space-joined text
        /// in the trie under every one of its words with diacritics removed.
        /// </summary>
        /// <param name="file">Source that is read via <c>Next()</c> until it returns null.</param>
        internal void Load(NgramFile file)
        {
            while (true)
            {
                Ngram current = file.Next();
                if (current == null)
                {
                    break;
                }

                string text = string.Join(" ", current.Words);
                foreach (string rawWord in current.Words)
                {
                    string        key      = StringRoutines.MyDiacriticsRemover(rawWord);
                    List <string> existing = trie.Find(key);
                    if (existing != null)
                    {
                        existing.Add(text);
                    }
                    else
                    {
                        // No list for this key yet: create one seeded with this n-gram.
                        trie.Add(key, new List <string> { text });
                    }
                }
            }
        }
示例#5
0
        /// <summary>
        /// Collects the n-grams at 1-based positions <paramref name="from"/> through
        /// <paramref name="to"/> (inclusive), counted from the file's current read position.
        /// </summary>
        /// <remarks>
        /// One <c>FileNgram</c> is produced per word of each n-gram, carrying the id that
        /// <c>idTrie</c> returns for that word with diacritics removed.
        /// </remarks>
        /// <param name="file">Source that is advanced via <c>Next()</c>.</param>
        /// <param name="from">Position of the first n-gram to collect.</param>
        /// <param name="to">Position of the last n-gram to collect.</param>
        /// <returns>The collected entries; fewer than requested (possibly none) if the file ends early.</returns>
        private List <FileNgram> GetFileNgrams(NgramFile file, int from, int to)
        {
            var collected = new List <FileNgram>();
            int position  = 1;

            // Skip the entries that precede the requested range.
            for (; position < from; position++)
            {
                if (file.Next() == null)
                {
                    return(collected);
                }
            }

            // Read until the range is exhausted or the file ends, whichever comes first.
            Ngram current;
            while (position <= to && (current = file.Next()) != null)
            {
                foreach (var rawWord in current.Words)
                {
                    int wordId = idTrie.Find(StringRoutines.MyDiacriticsRemover(rawWord));
                    collected.Add(new FileNgram(current.ToString(), current.Frequency, wordId));
                }
                position++;
            }

            return(collected);
        }
        /// <summary>
        /// Checks that <c>StringRoutines.MyDiacriticsRemover</c> maps the accented letters in the
        /// sample to their base letters and leaves the remaining characters of the sample unchanged.
        /// </summary>
        public void MyDiacriticsRemover_WordWithDiacritics_ReturnsWithoutDiacritics()
        {
            // Arrange
            const string withDiacritics = "áäčďéíĺľňóôŕšťúýžěřůäöüẞß abcdefghijklmnopqrstuvwxyz ,./;'[]{} 1234567890 ~`!@#$%^&*()+_-";
            const string stripped       = "aacdeillnoorstuyzeruaouẞß abcdefghijklmnopqrstuvwxyz ,./;'[]{} 1234567890 ~`!@#$%^&*()+_-";

            // Act
            string actual = StringRoutines.MyDiacriticsRemover(withDiacritics);

            // Assert
            Assert.AreEqual(stripped, actual);
        }
示例#7
0
        /// <summary>
        /// Runs one reconstruction round trip on the text file at <paramref name="path"/>:
        /// strips its diacritics, reconstructs them with <paramref name="dr"/>, writes the
        /// intermediate and final texts next to the input and compares the result with the original.
        /// </summary>
        /// <param name="path">Path of the text file to read.</param>
        /// <param name="dr">Reconstructor under test; its statistics are printed and then erased.</param>
        /// <param name="writeStatistics">
        /// When true, memory usage, reconstructor statistics and elapsed time are also written to a
        /// <c>_STATISTICS</c> file whose path is stored in <c>statisticsPath</c>.
        /// </param>
        internal static void Test(string path, DiacriticsReconstructor dr, bool writeStatistics = true)
        {
            long bytes = GC.GetTotalMemory(true);

            Console.WriteLine($"Memory (bytes): {bytes}");
            if (writeStatistics)
            {
                statisticsPath = $"{TextFile.FileName(path)}_STATISTICS{TextFile.FileExtension(path)}";
                File.WriteAllText(statisticsPath, $"Memory (bytes): {bytes}\n");
            }

            Console.WriteLine($"Reading {path}");
            string originalText;
            // Dispose the reader so the input file handle is released as soon as the text is read.
            using (var reader = File.OpenText(path))
            {
                originalText = reader.ReadToEnd();
            }

            Console.WriteLine("Removing diacritics...");
            string textWithoutDiacritics = StringRoutines.MyDiacriticsRemover(originalText);

            File.WriteAllText($"{TextFile.FileName(path)}_WITHOUT-DIACRITICS{TextFile.FileExtension(path)}", textWithoutDiacritics);

            Console.WriteLine("Reconstructing...");
            // Only the Reconstruct call itself is timed.
            var    sw = Stopwatch.StartNew();
            string reconstructedText = dr.Reconstruct(textWithoutDiacritics);

            sw.Stop();
            string ngramsStat = dr.GetStatistic();

            Console.Write(ngramsStat);
            if (writeStatistics)
            {
                File.AppendAllText(statisticsPath, ngramsStat);
            }
            dr.EraseStatistic();
            Console.WriteLine($"Elapsed (milliseconds): {sw.Elapsed.TotalMilliseconds}");
            if (writeStatistics)
            {
                File.AppendAllText(statisticsPath, $"Elapsed (milliseconds): {sw.Elapsed.TotalMilliseconds}\n");
            }
            Console.WriteLine("Done.");

            // NOTE(review): the "_RENCOSTRUCTED" suffix looks like a typo, but it is part of the
            // output file name and is kept as-is so existing consumers still find the file.
            File.WriteAllText($"{TextFile.FileName(path)}_RENCOSTRUCTED{TextFile.FileExtension(path)}", reconstructedText);

            Console.WriteLine("Testing...");
            FindMistakes(originalText, reconstructedText, path, writeStatistics);
            Console.WriteLine("Done.\n");
        }
示例#8
0
        /// <summary>
        /// Reads up to <paramref name="count"/> n-grams from the file's current position and
        /// returns one <c>FileNgram</c> per word of each n-gram read.
        /// </summary>
        /// <remarks>
        /// The limit is checked before the next n-gram is read, so no n-gram is consumed and
        /// discarded at a chunk boundary. As a consequence, when the file holds exactly
        /// <paramref name="count"/> remaining n-grams the end is reported by the following call,
        /// which returns an empty list.
        /// </remarks>
        /// <param name="file">Source that is advanced via <c>Next()</c>.</param>
        /// <param name="count">Maximum number of n-grams to read in this call.</param>
        /// <param name="endOfFile">Set to true when <c>Next()</c> returned null during this call, otherwise false.</param>
        /// <returns>The entries built from the n-grams read in this call.</returns>
        /// <exception cref="Exception">A word's diacritics-free form has id 0 in <c>idTrie</c>.</exception>
        private List <FileNgram> DivideFileBy(NgramFile file, int count, ref bool endOfFile)
        {
            var  ret        = new List <FileNgram>();
            bool reachedEnd = false;
            int  i          = 0;

            while (i < count)
            {
                Ngram ng = file.Next();
                if (ng == null)
                {
                    reachedEnd = true;
                    break;
                }
                i++;

                foreach (var w in ng.Words)
                {
                    int id = idTrie.Find(StringRoutines.MyDiacriticsRemover(w));
                    if (id == 0)
                    {
                        throw new Exception("Word '" + w + "' is not in idTrie!!!");
                    }
                    ret.Add(new FileNgram(ng.ToString(), ng.Frequency, id));
                }
            }
            // Assigned only after the loop so the caller's flag is untouched if the loop throws.
            endOfFile = reachedEnd;
            Console.WriteLine("part of file loaded...");
            return(ret);
        }
示例#9
0
 /// <summary>
 /// Moves <paramref name="ngram"/> to the end of <c>priorityNgrams</c> and, if the list then
 /// exceeds <c>size</c>, evicts its first entry from both the cache and the list.
 /// </summary>
 /// <param name="ngram">The n-gram that was just added.</param>
 private void CheckSize(string ngram)
 {
     // Re-append so the n-gram ends up at the back of the list.
     priorityNgrams.Remove(ngram);
     priorityNgrams.Add(ngram);
     if (priorityNgrams.Count <= size)
     {
         return;
     }

     var evicted = priorityNgrams[0];
     foreach (var key in StringRoutines.MyDiacriticsRemover(evicted).Split(' '))
     {
         var bucket = cache.Find(key);
         if (bucket != null)
         {
             bucket.Remove(evicted);
             // Drop the key entirely once its last n-gram is gone.
             if (bucket.Count == 0)
             {
                 cache.Remove(key);
             }
         }
     }
     priorityNgrams.RemoveAt(0);
 }
示例#10
0
        /// <summary>
        /// Builds an (Id, Value) table from <paramref name="file"/> and hands it to
        /// <c>InsertIntoDb</c> for the <c>dbo.Words</c> table.
        /// </summary>
        /// <remarks>
        /// Ids are assigned sequentially starting at 1; Value is the entry's text with diacritics
        /// removed. Progress is printed every 100000 rows.
        /// </remarks>
        /// <param name="file">Source file; it is re-opened before reading.</param>
        /// <param name="db">Database context passed through to <c>InsertIntoDb</c>.</param>
        private void InsertWordsSqlBulkCopy(UniGramFile file, DiacriticsDBEntities db)
        {
            var table = new DataTable();
            table.Columns.Add("Id");
            table.Columns.Add("Value");

            file.ReOpen();

            // A single counter serves as both the row id and the progress count.
            int rowCount = 0;
            for (Ngram entry = file.Next(); entry != null; entry = file.Next())
            {
                string stripped = StringRoutines.MyDiacriticsRemover(entry.ToString());
                rowCount++;
                table.Rows.Add(rowCount, stripped);

                if (rowCount % 100000 == 0)
                {
                    Console.WriteLine(rowCount + " words prepared for insertion.");
                }
            }
            InsertIntoDb(table, db, "dbo.Words");
        }
示例#11
0
        /// <summary>
        /// Builds a (Word1, WordId, Id, Frequency) table from <paramref name="file"/> and hands it
        /// to <c>InsertIntoDb</c> for the <c>dbo.UniGramEntities</c> table.
        /// </summary>
        /// <remarks>
        /// WordId is looked up in <c>wordTrie</c> by the entry's diacritics-free text; Id is
        /// assigned sequentially starting at 1. Progress is printed every 10000 rows.
        /// </remarks>
        /// <param name="file">Source file; it is re-opened before reading.</param>
        /// <param name="db">Database context passed through to <c>InsertIntoDb</c>.</param>
        /// <exception cref="Exception">The trie lookup returned 0 for an entry.</exception>
        private void InsertUnigramsSqlBulkCopy(UniGramFile file, DiacriticsDBEntities db)
        {
            var table = new DataTable();
            foreach (string column in new[] { "Word1", "WordId", "Id", "Frequency" })
            {
                table.Columns.Add(column);
            }

            file.ReOpen();

            int   nextId    = 0;
            int   processed = 0;
            Ngram entry;
            while ((entry = file.Next()) != null)
            {
                string original = entry.ToString();
                string stripped = StringRoutines.MyDiacriticsRemover(original);

                int wordId = wordTrie.Find(stripped);
                if (wordId == 0)
                {
                    throw new Exception("Word '" + stripped + "' is not present in Trie!");
                }

                nextId++;
                table.Rows.Add(original, wordId, nextId, entry.Frequency);

                processed++;
                if (processed % 10000 == 0)
                {
                    Console.WriteLine(processed + " unigrams prepared for insertion.");
                }
            }
            InsertIntoDb(table, db, "dbo.UniGramEntities");
        }
示例#12
0
        /// <summary>
        /// Loads the n-grams of <paramref name="file"/> into the database through the entity
        /// context: for every word it finds or creates the matching <c>Word</c> row (keyed by the
        /// diacritics-free text) and adds a <c>UniGramEntity</c> that references it.
        /// </summary>
        /// <remarks>
        /// Changes are saved every 100 n-grams, and once more after the loop so that the final
        /// partial batch is persisted as well.
        /// </remarks>
        /// <param name="file">Source file; it is re-opened before reading.</param>
        /// <param name="db">Entity context that receives the new rows.</param>
        private void LoadUniGramsEF(NgramFile file, DiacriticsDBEntities db)
        {
            file.ReOpen();
            Ngram ngram;
            Word  word;
            int   i = 0;

            while ((ngram = file.Next()) != null)
            {
                foreach (var w in ngram.Words)
                {
                    string nonDiacriticsW = StringRoutines.MyDiacriticsRemover(w);
                    // NOTE(review): this query presumably sees only saved rows, so a Word added
                    // earlier in the current unsaved batch would not be found and could be
                    // inserted twice - confirm and de-duplicate if needed.
                    word = db.Words.Where(a => a.Value == nonDiacriticsW).SingleOrDefault();
                    if (word == null)
                    {
                        word = new Word()
                        {
                            Value = nonDiacriticsW
                        };
                        db.Words.Add(word);
                    }
                    db.UniGramEntities.Add(new UniGramEntity()
                    {
                        Frequency = ngram.Frequency,
                        Word      = word,
                        WordId    = word.Id,
                        Word1     = w
                    });
                }
                if (++i % 100 == 0)
                {
                    db.SaveChanges();
                    Console.WriteLine(i);
                }
            }

            // Persist the trailing batch that did not reach a multiple of 100.
            if (i % 100 != 0)
            {
                db.SaveChanges();
            }
        }
示例#13
0
        /// <summary>
        /// Loads the n-grams of <paramref name="file"/> into the database with raw SQL commands:
        /// for every word it looks up (or inserts and then looks up) the <c>dbo.Words</c> row for
        /// the diacritics-free text and inserts a <c>dbo.UniGramEntities</c> row that references it.
        /// </summary>
        /// <remarks>
        /// The lookup reader is closed before any INSERT is issued, so the method does not need
        /// multiple active result sets on the connection. Commands are disposed and the connection
        /// is closed even when an exception interrupts the load. Progress is printed every 10000
        /// n-grams.
        /// </remarks>
        /// <param name="file">Source file; it is re-opened before reading.</param>
        /// <param name="db">Entity context whose underlying connection is used, opened and closed here.</param>
        private void LoadUniGramsSqlCmd(NgramFile file, DiacriticsDBEntities db)
        {
            var connection = db.Database.Connection as SqlConnection;

            using (var sqlSelect = new SqlCommand("SELECT * FROM dbo.Words WHERE Value = @value", connection))
            using (var sqlInsertWord = new SqlCommand("INSERT INTO dbo.Words (Value) VALUES (@value)", connection))
            using (var sqlInsertUniGram = new SqlCommand("INSERT INTO dbo.UniGramEntities (Word1, WordId, Frequency) VALUES (@word1, @wordId, @frequency)",
                                                         connection))
            {
                sqlSelect.CommandType = CommandType.Text;
                sqlSelect.Parameters.Add("value", SqlDbType.NVarChar);

                sqlInsertWord.CommandType = CommandType.Text;
                sqlInsertWord.Parameters.Add("value", SqlDbType.NVarChar);

                sqlInsertUniGram.CommandType = CommandType.Text;
                sqlInsertUniGram.Parameters.Add("word1", SqlDbType.NVarChar);
                sqlInsertUniGram.Parameters.Add("wordId", SqlDbType.Int);
                sqlInsertUniGram.Parameters.Add("frequency", SqlDbType.Int);

                db.Database.Connection.Open();
                try
                {
                    file.ReOpen();
                    Ngram ngram;
                    int   i = 0;

                    while ((ngram = file.Next()) != null)
                    {
                        foreach (var w in ngram.Words)
                        {
                            string nonDiacriticsW = StringRoutines.MyDiacriticsRemover(w);
                            int    id             = -1;
                            bool   found          = false;

                            // Select the word; if it is missing, insert it and select again to
                            // obtain its id.
                            while (!found)
                            {
                                sqlSelect.Parameters["value"].Value = nonDiacriticsW;
                                using (SqlDataReader reader = sqlSelect.ExecuteReader())
                                {
                                    if (reader.Read())
                                    {
                                        // Relies on the id being the first column returned by SELECT *.
                                        id    = (int)reader[0];
                                        found = true;
                                    }
                                }

                                // The reader is closed here, so the connection is free for the INSERT.
                                if (!found)
                                {
                                    sqlInsertWord.Parameters["value"].Value = nonDiacriticsW;
                                    sqlInsertWord.ExecuteNonQuery();
                                }
                            }

                            sqlInsertUniGram.Parameters["word1"].Value     = w;
                            sqlInsertUniGram.Parameters["wordId"].Value    = id;
                            sqlInsertUniGram.Parameters["frequency"].Value = ngram.Frequency;
                            sqlInsertUniGram.ExecuteNonQuery();
                        }
                        if (++i % 10000 == 0)
                        {
                            Console.WriteLine(i);
                        }
                    }
                }
                finally
                {
                    db.Database.Connection.Close();
                }
            }
        }
        /// <summary>
        /// Looks for an occurrence of <paramref name="word"/> in <paramref name="ngram"/> (compared
        /// after removing diacritics from the n-gram's words) whose neighbours agree with the given
        /// context.
        /// </summary>
        /// <remarks>
        /// <paramref name="nthBefore"/>[0] is compared with the word immediately before the
        /// occurrence, [1] with the one before that, and so on; <paramref name="nthAfter"/> works the
        /// same way to the right. Context entries that would fall outside the n-gram are not
        /// checked. The first occurrence that passes wins.
        /// NOTE(review): <paramref name="word"/> and the context arrays are compared verbatim with
        /// diacritics-free text, so they are presumably already stripped - confirm with callers.
        /// </remarks>
        /// <param name="word">Word to locate.</param>
        /// <param name="ngram">Words of the candidate n-gram, with diacritics.</param>
        /// <param name="nthBefore">Context words preceding <paramref name="word"/>, nearest first.</param>
        /// <param name="nthAfter">Context words following <paramref name="word"/>, nearest first.</param>
        /// <param name="result">On success, the original (with diacritics) n-gram word at the matching position.</param>
        /// <returns>True when a matching occurrence was found; otherwise false and <paramref name="result"/> is untouched.</returns>
        protected bool MatchesUp(string word, string[] ngram, string[] nthBefore, string[] nthAfter, ref string result)
        {
            var plain = new string[ngram.Length];
            for (int k = 0; k < plain.Length; k++)
            {
                plain[k] = StringRoutines.MyDiacriticsRemover(ngram[k]);
            }

            for (int pos = 0; pos < plain.Length; pos++)
            {
                // Several positions may hold the word; each one is tried in turn.
                if (plain[pos] != word)
                {
                    continue;
                }

                bool contextMatches = true;

                // Walk left while both the context and the n-gram still have words.
                for (int back = 0; back < nthBefore.Length && pos - back - 1 >= 0; back++)
                {
                    if (nthBefore[back] != plain[pos - back - 1])
                    {
                        contextMatches = false;
                        break;
                    }
                }

                // Walk right only if the left side agreed.
                for (int ahead = 0; contextMatches && ahead < nthAfter.Length && pos + ahead + 1 < plain.Length; ahead++)
                {
                    if (nthAfter[ahead] != plain[pos + ahead + 1])
                    {
                        contextMatches = false;
                    }
                }

                if (contextMatches)
                {
                    result = ngram[pos];
                    return(true);
                }
            }
            return(false);
        }
 /// <summary>
 /// Returns <paramref name="word"/> with diacritics removed and converted to lower case.
 /// </summary>
 /// <param name="word">The word to normalize.</param>
 /// <returns>The diacritics-free, lower-cased form of <paramref name="word"/>.</returns>
 private string Normalize(string word)
 {
     // Invariant casing keeps the normalized form independent of the current thread culture
     // (for example, "I" must not become a dotless i under a Turkish culture).
     return(StringRoutines.MyDiacriticsRemover(word).ToLowerInvariant());
 }