예제 #1
0
        /// <summary>
        /// Reads the whole text file at <paramref name="path"/>, passes its content through
        /// <c>StringRoutines.MyDiacriticsRemover</c> and writes the result to a file named
        /// <c>TextFile.FileName(path) + "_WITHOUT-DIACRITICS" + TextFile.FileExtension(path)</c>.
        /// </summary>
        /// <param name="path">Path of the text file to process.</param>
        public static void RemoveDiacriticsInFile(string path)
        {
            string originalText;

            // Dispose the reader deterministically so the file handle is released as soon as
            // the content has been read, instead of waiting for the finalizer.
            using (var reader = File.OpenText(path))
            {
                originalText = reader.ReadToEnd();
            }

            string textWithoutDiacritics = StringRoutines.MyDiacriticsRemover(originalText);

            File.WriteAllText($"{TextFile.FileName(path)}_WITHOUT-DIACRITICS{TextFile.FileExtension(path)}", textWithoutDiacritics);
        }
예제 #2
0
 /// <summary>
 /// For every entry of <c>allTexts</c>, reads the file at <c>rPath + text</c>, runs its content
 /// through <c>StringRoutines.Normalize</c> and writes the result to
 /// <c>TextFile.FileName(rPath + text) + "_NORMALIZED.txt"</c>.
 /// </summary>
 /// <param name="rPath">Prefix that is concatenated in front of each entry of <c>allTexts</c>.</param>
 private static void Normalize(string rPath)
 {
     foreach (var text in allTexts)
     {
         string sourcePath = rPath + text;
         string t;

         // Close each input file before moving on; otherwise one handle per text stays
         // open until the garbage collector finalizes the reader.
         using (var reader = File.OpenText(sourcePath))
         {
             t = reader.ReadToEnd();
         }

         t = StringRoutines.Normalize(t);
         File.WriteAllText(TextFile.FileName(sourcePath) + "_NORMALIZED.txt", t);
     }
 }
예제 #3
0
        // Checks that TextFile.FileName, applied to the `path` member declared outside this
        // method, yields the expected extension-less name. Both values are also printed to the
        // console to ease diagnosing a failure.
        public void FileName_Path_ReturnsFileName()
        {
            var expected = "D:/slovniky/prim-8.0-public-all-word_frequency_non_case_sensitive/prim-8.0-public-all-word_frequency_non_case_sensitive_CLEANED_GOOD-WORDS";
            var actual   = TextFile.FileName(path);

            Console.WriteLine($"name  : {expected}");
            Console.WriteLine($"result: {actual}");

            // Expected value goes first, actual second, so that a failure message labels
            // the two values correctly.
            Assert.AreEqual(expected, actual);
        }
예제 #4
0
        /// <summary>
        /// For every entry of <c>allTexts</c>, loads the original text from <c>rPath + text</c> and
        /// the reconstructed text from <c>rPath + TextFile.FileName(text) + "_Reconstructed.txt"</c>,
        /// then hands both to <c>DiacriticsTester.FindMistakes</c> with statistics writing enabled.
        /// </summary>
        /// <param name="rPath">Prefix that is concatenated in front of each file name.</param>
        private static void OnlyTestTexts(string rPath)
        {
            foreach (var text in allTexts)
            {
                string originalPath      = rPath + text;
                string reconstructedPath = rPath + TextFile.FileName(text) + "_Reconstructed.txt";

                // NOTE: both texts are compared exactly as stored on disk. Earlier revisions
                // optionally ran StringRoutines.Normalize over them (or read the
                // "_NORMALIZED.txt" variant of the original); that is currently switched off.
                string originalText;
                using (var reader = File.OpenText(originalPath))
                {
                    originalText = reader.ReadToEnd();
                }

                string reconstructedText;
                using (var reader = File.OpenText(reconstructedPath))
                {
                    reconstructedText = reader.ReadToEnd();
                }

                DiacriticsTester.FindMistakes(originalText, reconstructedText, originalPath, true);
            }
        }
예제 #5
0
        /// <summary>
        /// Runs one reconstruction round trip on the text file at <paramref name="path"/>:
        /// reads it, strips its diacritics, lets <paramref name="dr"/> reconstruct them, and
        /// compares the reconstruction with the original through <c>FindMistakes</c>.
        /// Progress, memory usage, the reconstructor's statistics and the elapsed time are
        /// printed to the console.
        /// </summary>
        /// <param name="path">Path of the text file used as ground truth.</param>
        /// <param name="dr">Reconstructor under test; its statistics are erased after being reported.</param>
        /// <param name="writeStatistics">
        /// When <c>true</c>, the same figures are also written to the
        /// <c>_STATISTICS</c> file derived from <paramref name="path"/> (the file is overwritten
        /// at the start of the run and appended to afterwards).
        /// </param>
        internal static void Test(string path, DiacriticsReconstructor dr, bool writeStatistics = true)
        {
            // Passing true forces a full collection first, so the figure is taken after a GC.
            long bytes = GC.GetTotalMemory(true);

            Console.WriteLine($"Memory (bytes): {bytes}");
            if (writeStatistics)
            {
                statisticsPath = $"{TextFile.FileName(path)}_STATISTICS{TextFile.FileExtension(path)}";
                File.WriteAllText(statisticsPath, $"Memory (bytes): {bytes}\n");
            }

            Console.WriteLine($"Reading {path}");
            string originalText;

            // Dispose the reader right after reading so the input file handle is not leaked.
            using (var reader = File.OpenText(path))
            {
                originalText = reader.ReadToEnd();
            }

            Console.WriteLine("Removing diacritics...");
            string textWithoutDiacritics = StringRoutines.MyDiacriticsRemover(originalText);

            File.WriteAllText($"{TextFile.FileName(path)}_WITHOUT-DIACRITICS{TextFile.FileExtension(path)}", textWithoutDiacritics);

            Console.WriteLine("Reconstructing...");

            // Only the Reconstruct call itself is timed.
            var    sw = Stopwatch.StartNew();
            string reconstructedText = dr.Reconstruct(textWithoutDiacritics);

            sw.Stop();
            string ngramsStat = dr.GetStatistic();

            Console.Write(ngramsStat);
            if (writeStatistics)
            {
                File.AppendAllText(statisticsPath, ngramsStat);
            }
            dr.EraseStatistic();
            Console.WriteLine($"Elapsed (milliseconds): {sw.Elapsed.TotalMilliseconds}");
            if (writeStatistics)
            {
                File.AppendAllText(statisticsPath, $"Elapsed (milliseconds): {sw.Elapsed.TotalMilliseconds}\n");
            }
            Console.WriteLine("Done.");

            // NOTE(review): "_RENCOSTRUCTED" looks like a typo for "_RECONSTRUCTED". It is kept
            // as-is because other tooling may rely on the existing file name — confirm before renaming.
            File.WriteAllText($"{TextFile.FileName(path)}_RENCOSTRUCTED{TextFile.FileExtension(path)}", reconstructedText);

            Console.WriteLine("Testing...");
            FindMistakes(originalText, reconstructedText, path, writeStatistics);
            Console.WriteLine("Done.\n");
        }
예제 #6
0
        /// <summary>
        /// Writes a histogram of string lengths for the entries stored under
        /// <paramref name="word"/> in a binary file.
        /// The position of the word's record is looked up in the position trie built from
        /// <paramref name="positionTriePath"/>; the record is read as an <c>Int32</c> count
        /// followed by that many length-prefixed strings.
        /// </summary>
        /// <param name="binFilePath">Path of the binary file to inspect.</param>
        /// <param name="positionTriePath">Path handed to <c>PositionTrieCreator.CreatePositionTrie</c>.</param>
        /// <param name="word">Word whose record is analysed.</param>
        /// <returns>
        /// Path of the written statistics file. It contains a <c>Word: ...</c> header followed by
        /// one <c>length - count</c> line per distinct string length, in ascending length order.
        /// </returns>
        public static string BinaryFilePartitioningStats(string binFilePath, string positionTriePath, string word)
        {
            string statName = TextFile.FileName(binFilePath) + "PARTITION-STATS-" + word + ".txt";

            var positionTrie = PositionTrieCreator.CreatePositionTrie(positionTriePath);

            // The binary file is only read, so open it read-only; this also works for
            // read-only files and does not lock out other readers.
            using (var binaryReader = new BinaryReader(File.OpenRead(binFilePath)))
            using (var statWriter = new StreamWriter(statName))
            {
                var position = positionTrie.Find(word);

                binaryReader.BaseStream.Position = position;
                int howMany = binaryReader.ReadInt32();

                // Tally lengths in a single pass. The sorted dictionary yields the lengths in
                // ascending order, and an empty stored string simply shows up as length 0
                // rather than stalling the report.
                var countsByLength = new SortedDictionary<int, int>();

                for (int i = 0; i < howMany; i++)
                {
                    int length = binaryReader.ReadString().Length;
                    int seen;

                    countsByLength.TryGetValue(length, out seen);
                    countsByLength[length] = seen + 1;
                }

                statWriter.WriteLine($"Word: {word}");
                foreach (var entry in countsByLength)
                {
                    statWriter.WriteLine($"{entry.Key} - {entry.Value}");
                }
            }
            return(statName);
        }
예제 #7
0
        /// <summary>
        /// Collects the existing "partial" files that belong to each entry of
        /// <paramref name="binFiles"/>. A partial file name is formed by inserting a running
        /// number between <c>TextFile.FileName(path)</c> and <c>TextFile.FileExtension(path)</c>.
        /// The number starts at 0 and grows by the step returned from
        /// <c>GetDivisionCountByNumber</c>; probing stops at the first name that does not exist.
        /// Every file found is also echoed to the console.
        /// </summary>
        /// <param name="binFiles">
        /// Base file paths. The first one is probed with the step for 4, the next with the step
        /// for 3, and so on (presumably the n-gram order of each file — TODO confirm).
        /// </param>
        /// <returns>All partial file paths found, in probing order.</returns>
        private List <string> GetAllPartialBinFiles(string[] binFiles)
        {
            var partialFiles = new List <string>();
            int number       = 4;

            foreach (var binFile in binFiles)
            {
                int increment = GetDivisionCountByNumber(number);

                number--;

                for (int index = 0; ; index += increment)
                {
                    string candidate = TextFile.FileName(binFile) + index + TextFile.FileExtension(binFile);

                    if (!File.Exists(candidate))
                    {
                        break;
                    }

                    partialFiles.Add(candidate);
                    Console.WriteLine(candidate);
                }
            }

            return(partialFiles);
        }
예제 #8
0
        /// <summary>
        /// Writes per-word and overall n-gram counts for a binary n-gram file.
        /// Each line of the position file is expected to be <c>word position</c> separated by the
        /// first space. At that position the binary file holds an <c>Int32</c> count followed by
        /// that many length-prefixed strings, and each string is classified by its number of
        /// space-separated tokens.
        /// </summary>
        /// <param name="binFilePath">Path of the binary file to inspect.</param>
        /// <param name="positionTriePath">Path of the text file listing words and their positions.</param>
        /// <returns>Path of the written statistics file (<c>TextFile.FileName(binFilePath) + "_STATS.txt"</c>).</returns>
        internal static string BinaryFileNgramStats(string binFilePath, string positionTriePath)
        {
            string statName = TextFile.FileName(binFilePath) + "_STATS.txt";

            // The binary file is only read, so open it read-only; this also works for
            // read-only files and does not lock out other readers.
            using (StreamReader positionReader = File.OpenText(positionTriePath))
            using (var binaryReader = new BinaryReader(File.OpenRead(binFilePath)))
            using (var statWriter = new StreamWriter(statName))
            {
                // Index = token count of an n-gram (1..4); index 0 is never incremented
                // because Split always yields at least one token.
                int[]  allNgCount = new int[5];
                string line;
                while ((line = positionReader.ReadLine()) != null)
                {
                    // Locate the separator once, with the ordinal char overload rather than
                    // the culture-sensitive string overload.
                    int    separator = line.IndexOf(' ');
                    string word      = line.Substring(0, separator);
                    long   position  = Convert.ToInt64(line.Substring(separator + 1));

                    int[] ngCount = new int[5];

                    binaryReader.BaseStream.Position = position;
                    var len = binaryReader.ReadInt32();
                    for (int i = 0; i < len; i++)
                    {
                        string   ng     = binaryReader.ReadString();
                        string[] ngArr  = ng.Split(' ');
                        int      length = ngArr.Length;

                        // NOTE(review): assumes no stored n-gram has more than 4 tokens;
                        // a longer one indexes past the array and throws.
                        ngCount[length]++;
                    }
                    statWriter.WriteLine($"{word} ({ngCount.Sum()}) ({ngCount[4]}, {ngCount[3]}, {ngCount[2]}, {ngCount[1]})");

                    for (int i = 1; i < 5; i++)
                    {
                        allNgCount[i] += ngCount[i];
                    }
                }
                statWriter.WriteLine("All ngrams: {0} (4: {1}, 3: {2}, 2: {3}, 1: {4})",
                                     allNgCount.Sum(), allNgCount[4], allNgCount[3], allNgCount[2], allNgCount[1]);
            }

            return(statName);
        }
예제 #9
0
 /// <summary>
 /// Copies the file at <paramref name="path"/> byte by byte into
 /// <c>TextFile.FileName(path) + "_CLEANED" + TextFile.FileExtension(path)</c>, keeping only
 /// bytes whose character is a letter from <c>latinChars</c> (matched case-insensitively),
 /// a digit, punctuation, a separator or white space. Every dropped character is printed to
 /// the console as <c>&lt;c&gt;</c>.
 /// </summary>
 /// <param name="path">Path of the file to clean.</param>
 public static void CleanFileFromHiddenChars(string path)
 {
     // Open the input first (read-only) so that a missing or unreadable input does not
     // leave an empty "_CLEANED" file behind.
     using (var binReader = new BinaryReader(File.OpenRead(path)))
     using (var strmWriter = new StreamWriter($"{TextFile.FileName(path)}_CLEANED{TextFile.FileExtension(path)}"))
     {
         var rgxLatin = new Regex($"[{latinChars}]");

         while (binReader.BaseStream.Position != binReader.BaseStream.Length)
         {
             // Each byte is widened to the char with the same code point.
             byte b = binReader.ReadByte();
             char c = Convert.ToChar(b);

             // Lower-case with the invariant culture: the culture-sensitive ToLower() maps
             // 'I' to a dotless 'ı' under Turkish casing rules, which would make the
             // letter fail the Latin check.
             if (rgxLatin.IsMatch(c.ToString().ToLowerInvariant()) || char.IsDigit(c) ||
                 char.IsPunctuation(c) || char.IsSeparator(c) ||
                 char.IsWhiteSpace(c))
             {
                 strmWriter.Write(c);
             }
             else
             {
                 Console.WriteLine($"<{c}>");
             }
         }
     }
 }
예제 #10
0
        /// <summary>
        /// Compares two texts word by word and records every position at which they differ.
        /// Words are the non-empty pieces obtained by splitting on space, newline, tab and
        /// carriage return. Only the first <c>min(original, reconstructed)</c> words are compared;
        /// a length mismatch is reported on the console but does not abort the comparison.
        /// </summary>
        /// <param name="originalText">Reference text.</param>
        /// <param name="reconstructedText">Text to check against the reference.</param>
        /// <param name="path">
        /// Path from which the output file names are derived: the <c>_MISTAKES-RECONST-ORIG</c>
        /// file receives one line per mismatch (three words of context on each side, reconstructed
        /// window first, then " - ", then the original window).
        /// </param>
        /// <param name="writeStatistics">
        /// When <c>true</c>, the word counts and the number of mistakes are appended to the
        /// <c>_STATISTICS</c> file derived from <paramref name="path"/>.
        /// </param>
        public static void FindMistakes(string originalText, string reconstructedText, string path, bool writeStatistics)
        {
            char[] separators = { ' ', '\n', '\t', '\r' };

            string[] originalWords      = originalText.Split(separators, StringSplitOptions.RemoveEmptyEntries);
            string[] reconstructedWords = reconstructedText.Split(separators, StringSplitOptions.RemoveEmptyEntries);

            Console.WriteLine($"originalWords.Length = {originalWords.Length}");
            Console.WriteLine($"reconstructedWords.Length = {reconstructedWords.Length}");

            // A differing word count is only reported; the comparison still runs on the common prefix.
            if (originalWords.Length != reconstructedWords.Length)
            {
                Console.WriteLine("Length of original and reconstructed text are not equal!");
            }

            int    mistakes     = 0;
            string mistakesPath = $"{TextFile.FileName(path)}_MISTAKES-RECONST-ORIG{TextFile.FileExtension(path)}";

            using (var writer = new StreamWriter(mistakesPath))
            {
                int comparable = Math.Min(originalWords.Length, reconstructedWords.Length);

                // Seven-word window centred on the given index; slots falling outside
                // [0, comparable) are rendered as empty strings.
                Func <string[], int, string> window = (words, center) =>
                    string.Join(" ", Enumerable.Range(center - 3, 7)
                                               .Select(k => (k >= 0 && k < comparable) ? words[k] : ""));

                for (int index = 0; index < comparable; index++)
                {
                    if (originalWords[index] == reconstructedWords[index])
                    {
                        continue;
                    }

                    writer.WriteLine(window(reconstructedWords, index) + " - " + window(originalWords, index));
                    mistakes++;
                }
            }

            Console.WriteLine($"Number of mistakes: {mistakes}");

            if (writeStatistics)
            {
                statisticsPath = $"{TextFile.FileName(path)}_STATISTICS{TextFile.FileExtension(path)}";

                File.AppendAllText(statisticsPath, $"originalWords.Length = {originalWords.Length}\n");
                File.AppendAllText(statisticsPath, $"reconstructedWords.Length = {reconstructedWords.Length}\n");
                File.AppendAllText(statisticsPath, $"Number of mistakes: {mistakes}\n");
            }

            // Accumulate the running totals kept outside this method.
            countOfAllOrigWords    += originalWords.Length;
            countOfAllReconstWords += reconstructedWords.Length;
            countOfAllMistakes     += mistakes;
        }