/// <summary>
/// Reads the text file at <paramref name="path"/>, strips diacritics from its content with
/// <c>StringRoutines.MyDiacriticsRemover</c> and writes the result to a sibling file whose name
/// carries the <c>_WITHOUT-DIACRITICS</c> suffix in front of the original extension.
/// </summary>
/// <param name="path">Path of the text file to process.</param>
public static void RemoveDiacriticsInFile(string path)
{
    // File.ReadAllText closes the file itself; the previous File.OpenText(...).ReadToEnd()
    // left the StreamReader (and its file handle) undisposed.
    string originalText = File.ReadAllText(path);
    string textWithoutDiacritics = StringRoutines.MyDiacriticsRemover(originalText);

    string outputPath = $"{TextFile.FileName(path)}_WITHOUT-DIACRITICS{TextFile.FileExtension(path)}";
    File.WriteAllText(outputPath, textWithoutDiacritics);
}
/// <summary>
/// Normalizes every text listed in <c>allTexts</c>: each file at <c>rPath + text</c> is read,
/// passed through <c>StringRoutines.Normalize</c> and written to a file named
/// <c>TextFile.FileName(rPath + text) + "_NORMALIZED.txt"</c>.
/// </summary>
/// <param name="rPath">Prefix prepended to every entry of <c>allTexts</c> to form the input path.</param>
private static void Normalize(string rPath)
{
    foreach (var text in allTexts)
    {
        string sourcePath = rPath + text;

        // File.ReadAllText disposes its reader; File.OpenText(...).ReadToEnd() leaked one
        // file handle per processed text.
        string normalized = StringRoutines.Normalize(File.ReadAllText(sourcePath));

        File.WriteAllText(TextFile.FileName(sourcePath) + "_NORMALIZED.txt", normalized);
    }
}
/// <summary>
/// Verifies that <c>TextFile.FileName</c> applied to <c>path</c> yields the expected value.
/// </summary>
/// <remarks>
/// NOTE(review): <c>path</c> is not declared in this method; presumably it is a member of the
/// enclosing test class - confirm.
/// </remarks>
public void FileName_Path_ReturnsFileName()
{
    var name = "D:/slovniky/prim-8.0-public-all-word_frequency_non_case_sensitive/prim-8.0-public-all-word_frequency_non_case_sensitive_CLEANED_GOOD-WORDS";

    var result = TextFile.FileName(path);

    Console.WriteLine($"name : {name}");
    Console.WriteLine($"result: {result}");

    // Assert.AreEqual takes (expected, actual); the arguments were swapped, which makes the
    // failure message report the values the wrong way round.
    Assert.AreEqual(name, result);
}
/// <summary>
/// Compares every text listed in <c>allTexts</c> with its previously reconstructed counterpart.
/// For each entry the original is read from <c>rPath + text</c>, the reconstruction from
/// <c>rPath + TextFile.FileName(text) + "_Reconstructed.txt"</c>, and both are handed to
/// <c>DiacriticsTester.FindMistakes</c> with statistics writing enabled.
/// </summary>
/// <param name="rPath">Prefix prepended to every entry of <c>allTexts</c>.</param>
private static void OnlyTestTexts(string rPath)
{
    foreach (var text in allTexts)
    {
        string originalPath = rPath + text;
        string reconstructedPath = rPath + TextFile.FileName(text) + "_Reconstructed.txt";

        // Both texts are compared exactly as stored (no normalization step is applied).
        // File.ReadAllText closes the files; File.OpenText(...).ReadToEnd() leaked both readers.
        string originalText = File.ReadAllText(originalPath);
        string reconstructedText = File.ReadAllText(reconstructedPath);

        DiacriticsTester.FindMistakes(originalText, reconstructedText, originalPath, true);
    }
}
/// <summary>
/// Runs one reconstruction round trip on the text file at <paramref name="path"/>:
/// removes its diacritics, lets <paramref name="dr"/> reconstruct them, measures the elapsed
/// time and finally compares the reconstruction with the original via <c>FindMistakes</c>.
/// </summary>
/// <remarks>
/// Files written next to the input (name built from <c>TextFile.FileName</c> /
/// <c>TextFile.FileExtension</c>): <c>_WITHOUT-DIACRITICS</c>, <c>_RENCOSTRUCTED</c> and, when
/// <paramref name="writeStatistics"/> is set, <c>_STATISTICS</c>. The static
/// <c>statisticsPath</c> field is overwritten when statistics are written.
/// </remarks>
/// <param name="path">Path of the original text (with diacritics).</param>
/// <param name="dr">Reconstructor under test; its statistic is read and then erased.</param>
/// <param name="writeStatistics">When true, memory, n-gram and timing figures are also written to the statistics file.</param>
internal static void Test(string path, DiacriticsReconstructor dr, bool writeStatistics = true)
{
    long bytes = GC.GetTotalMemory(true);
    Console.WriteLine($"Memory (bytes): {bytes}");
    if (writeStatistics)
    {
        statisticsPath = $"{TextFile.FileName(path)}_STATISTICS{TextFile.FileExtension(path)}";
        // WriteAllText (not Append) starts a fresh statistics file for this run.
        File.WriteAllText(statisticsPath, $"Memory (bytes): {bytes}\n");
    }

    Console.WriteLine($"Reading {path}");
    // File.ReadAllText disposes its reader; File.OpenText(path).ReadToEnd() leaked the handle.
    string originalText = File.ReadAllText(path);

    Console.WriteLine("Removing diacritics...");
    string textWithoutDiacritics = StringRoutines.MyDiacriticsRemover(originalText);
    File.WriteAllText($"{TextFile.FileName(path)}_WITHOUT-DIACRITICS{TextFile.FileExtension(path)}", textWithoutDiacritics);

    Console.WriteLine("Reconstructing...");
    // Only the Reconstruct call is timed.
    var sw = Stopwatch.StartNew();
    string reconstructedText = dr.Reconstruct(textWithoutDiacritics);
    sw.Stop();

    string ngramsStat = dr.GetStatistic();
    Console.Write(ngramsStat);
    if (writeStatistics)
    {
        File.AppendAllText(statisticsPath, ngramsStat);
    }
    dr.EraseStatistic();

    Console.WriteLine($"Elapsed (milliseconds): {sw.Elapsed.TotalMilliseconds}");
    if (writeStatistics)
    {
        File.AppendAllText(statisticsPath, $"Elapsed (milliseconds): {sw.Elapsed.TotalMilliseconds}\n");
    }
    Console.WriteLine("Done.");

    // NOTE(review): the suffix is spelled "_RENCOSTRUCTED"; kept unchanged because it is part
    // of the produced file name and other tooling may depend on it.
    File.WriteAllText($"{TextFile.FileName(path)}_RENCOSTRUCTED{TextFile.FileExtension(path)}", reconstructedText);

    Console.WriteLine("Testing...");
    FindMistakes(originalText, reconstructedText, path, writeStatistics);
    Console.WriteLine("Done.\n");
}
/// <summary>
/// Writes a histogram of n-gram string lengths for one word of a binary n-gram file.
/// The word's record is located through the position trie built from
/// <paramref name="positionTriePath"/>; the record is read as an <c>Int32</c> count followed
/// by that many length-prefixed strings.
/// </summary>
/// <param name="binFilePath">Binary file holding the n-gram records.</param>
/// <param name="positionTriePath">Source passed to <c>PositionTrieCreator.CreatePositionTrie</c>.</param>
/// <param name="word">Word whose record is analysed; also embedded in the output file name.</param>
/// <returns>Name of the statistics file that was written.</returns>
public static string BinaryFilePartitioningStats(string binFilePath, string positionTriePath, string word)
{
    string statName = TextFile.FileName(binFilePath) + "PARTITION-STATS-" + word + ".txt";
    var positionTrie = PositionTrieCreator.CreatePositionTrie(positionTriePath);

    // The binary file is only read, so open it read-only.
    using (var binaryReader = new BinaryReader(File.OpenRead(binFilePath)))
    using (var statWriter = new StreamWriter(statName))
    {
        var position = positionTrie.Find(word);
        binaryReader.BaseStream.Position = position;

        int howMany = binaryReader.ReadInt32();

        // Tally string lengths in one pass. The previous implementation repeatedly scanned and
        // shrank a list starting from length 1, which was quadratic and effectively never
        // terminated when the record contained an empty string.
        var countsByLength = new SortedDictionary<int, int>();
        for (int i = 0; i < howMany; i++)
        {
            int length = binaryReader.ReadString().Length;
            int seen;
            countsByLength.TryGetValue(length, out seen);
            countsByLength[length] = seen + 1;
        }

        statWriter.WriteLine($"Word: {word}");

        // SortedDictionary enumerates in ascending key order, i.e. shortest length first.
        foreach (var entry in countsByLength)
        {
            statWriter.WriteLine($"{entry.Key} - {entry.Value}");
        }
    }

    return statName;
}
/// <summary>
/// Collects the existing partial files that belong to the given binary files. For every entry
/// the candidate name is <c>TextFile.FileName(path) + index + TextFile.FileExtension(path)</c>;
/// the index starts at 0 and advances by the step returned from
/// <c>GetDivisionCountByNumber</c> until a candidate does not exist on disk.
/// </summary>
/// <param name="binFiles">Base paths of the binary files; the first is probed with number 4, the next with 3, and so on.</param>
/// <returns>All candidate paths that exist, in discovery order (each is also echoed to the console).</returns>
private List<string> GetAllPartialBinFiles(string[] binFiles)
{
    var partialFiles = new List<string>();
    int divisionNumber = 4;

    foreach (var binFile in binFiles)
    {
        // The current number is used for this file, then lowered for the next one.
        int step = GetDivisionCountByNumber(divisionNumber);
        divisionNumber--;

        for (int index = 0; ; index += step)
        {
            string candidate = TextFile.FileName(binFile) + index + TextFile.FileExtension(binFile);
            if (!File.Exists(candidate))
            {
                break;
            }

            partialFiles.Add(candidate);
            Console.WriteLine(candidate);
        }
    }

    return partialFiles;
}
/// <summary>
/// Produces per-word and overall n-gram counts for a binary n-gram file. Each line of the
/// position file is expected to be <c>word position</c> (separated by the first space); the
/// record at that position is read as an <c>Int32</c> count followed by that many strings,
/// and every string is classified by its number of space-separated tokens.
/// </summary>
/// <param name="binFilePath">Binary file holding the n-gram records.</param>
/// <param name="positionTriePath">Text file mapping words to record positions.</param>
/// <returns>Name of the statistics file that was written (<c>..._STATS.txt</c>).</returns>
/// <exception cref="FormatException">A non-blank line has no space separator or no valid position.</exception>
internal static string BinaryFileNgramStats(string binFilePath, string positionTriePath)
{
    string statName = TextFile.FileName(binFilePath) + "_STATS.txt";

    using (StreamReader positionReader = File.OpenText(positionTriePath))
    // The binary file is only read, so open it read-only.
    using (var binaryReader = new BinaryReader(File.OpenRead(binFilePath)))
    using (var statWriter = new StreamWriter(statName))
    {
        // Index = number of tokens in the n-gram (1..4); index 0 stays unused.
        int[] allNgCount = new int[5];

        string line;
        while ((line = positionReader.ReadLine()) != null)
        {
            // A blank (e.g. trailing) line used to crash in Substring(0, -1).
            if (string.IsNullOrWhiteSpace(line))
            {
                continue;
            }

            // Char overload: ordinal search, done once instead of twice per line.
            int separator = line.IndexOf(' ');
            if (separator < 0)
            {
                throw new FormatException($"Position line has no space separator: '{line}'");
            }

            string word = line.Substring(0, separator);
            // Positions are machine-written, so parse them culture-independently.
            long position = Convert.ToInt64(line.Substring(separator + 1), System.Globalization.CultureInfo.InvariantCulture);

            int[] ngCount = new int[5];
            binaryReader.BaseStream.Position = position;
            int len = binaryReader.ReadInt32();
            for (int i = 0; i < len; i++)
            {
                string ng = binaryReader.ReadString();
                int length = ng.Split(' ').Length;
                // NOTE(review): an n-gram with more than 4 tokens indexes past the array and
                // throws IndexOutOfRangeException - confirm such records cannot occur.
                ngCount[length]++;
            }

            statWriter.WriteLine($"{word} ({ngCount.Sum()}) ({ngCount[4]}, {ngCount[3]}, {ngCount[2]}, {ngCount[1]})");

            for (int i = 1; i < 5; i++)
            {
                allNgCount[i] += ngCount[i];
            }
        }

        statWriter.WriteLine("All ngrams: {0} (4: {1}, 3: {2}, 2: {3}, 1: {4})", allNgCount.Sum(), allNgCount[4], allNgCount[3], allNgCount[2], allNgCount[1]);
    }

    return statName;
}
/// <summary>
/// Copies the file at <paramref name="path"/> to a sibling <c>_CLEANED</c> file, keeping only
/// bytes that map to an allowed character. Each byte is converted to the <c>char</c> with the
/// same numeric value; it is kept when its lower-cased form matches <c>latinChars</c> or when
/// it is a digit, punctuation, separator or white space. Every dropped character is echoed to
/// the console as <c>&lt;c&gt;</c>.
/// </summary>
/// <remarks>
/// NOTE(review): input is processed byte by byte, so multi-byte encoded characters are judged
/// one byte at a time - confirm this matches the intended input encoding.
/// </remarks>
/// <param name="path">Path of the file to clean.</param>
public static void CleanFileFromHiddenChars(string path)
{
    var rgxLatin = new Regex($"[{latinChars}]");

    // Open the input first (read-only) so that a missing or unreadable input does not leave an
    // empty output file behind.
    using (var input = File.OpenRead(path))
    using (var strmWriter = new StreamWriter($"{TextFile.FileName(path)}_CLEANED{TextFile.FileExtension(path)}"))
    {
        // ReadByte returns -1 at end of stream, which avoids querying the stream length for
        // every single byte.
        int raw;
        while ((raw = input.ReadByte()) != -1)
        {
            char c = (char)raw;

            // ToLowerInvariant keeps the match independent of the current culture
            // (e.g. the Turkish dotless i mapping).
            bool keep = rgxLatin.IsMatch(c.ToString().ToLowerInvariant())
                        || char.IsDigit(c)
                        || char.IsPunctuation(c)
                        || char.IsSeparator(c)
                        || char.IsWhiteSpace(c);

            if (keep)
            {
                strmWriter.Write(c);
            }
            else
            {
                Console.WriteLine($"<{c}>");
            }
        }
    }
}
/// <summary>
/// Compares two texts word by word and records every position at which they differ.
/// Words are the non-empty pieces obtained by splitting on space, newline, tab and carriage
/// return. Only the first <c>min(original, reconstructed)</c> words are compared.
/// </summary>
/// <remarks>
/// Each mismatch is written to the <c>_MISTAKES-RECONST-ORIG</c> file as the reconstructed word
/// with up to three neighbours on either side, then <c> - </c>, then the same window from the
/// original. Counts are printed to the console, optionally appended to the <c>_STATISTICS</c>
/// file, and added to the static running totals.
/// </remarks>
/// <param name="originalText">Reference text.</param>
/// <param name="reconstructedText">Text produced by the reconstruction.</param>
/// <param name="path">Path from which the output file names are derived.</param>
/// <param name="writeStatistics">When true, the counts are appended to the statistics file.</param>
public static void FindMistakes(string originalText, string reconstructedText, string path, bool writeStatistics)
{
    char[] separators = { ' ', '\n', '\t', '\r' };
    string[] originalWords = originalText.Split(separators, StringSplitOptions.RemoveEmptyEntries);
    string[] reconstructedWords = reconstructedText.Split(separators, StringSplitOptions.RemoveEmptyEntries);

    Console.WriteLine($"originalWords.Length = {originalWords.Length}");
    Console.WriteLine($"reconstructedWords.Length = {reconstructedWords.Length}");
    if (originalWords.Length != reconstructedWords.Length)
    {
        // A length mismatch is only reported; the comparison continues on the common prefix.
        Console.WriteLine("Length of original and reconstructed text are not equal!");
    }

    int len = Math.Min(originalWords.Length, reconstructedWords.Length);

    // Seven space-separated slots centred on the given index; slots that fall outside
    // [0, len) are left empty.
    Func<string[], int, string> window = (words, center) =>
    {
        var slots = new string[7];
        for (int offset = -3; offset <= 3; offset++)
        {
            int index = center + offset;
            slots[offset + 3] = (index >= 0 && index < len) ? words[index] : "";
        }
        return string.Join(" ", slots);
    };

    int count = 0;
    using (var sw = new StreamWriter($"{TextFile.FileName(path)}_MISTAKES-RECONST-ORIG{TextFile.FileExtension(path)}"))
    {
        for (int i = 0; i < len; i++)
        {
            if (originalWords[i] == reconstructedWords[i])
            {
                continue;
            }

            sw.WriteLine(window(reconstructedWords, i) + " - " + window(originalWords, i));
            count++;
        }
    }

    Console.WriteLine($"Number of mistakes: {count}");
    if (writeStatistics)
    {
        statisticsPath = $"{TextFile.FileName(path)}_STATISTICS{TextFile.FileExtension(path)}";
        File.AppendAllText(
            statisticsPath,
            $"originalWords.Length = {originalWords.Length}\n" +
            $"reconstructedWords.Length = {reconstructedWords.Length}\n" +
            $"Number of mistakes: {count}\n");
    }

    countOfAllOrigWords += originalWords.Length;
    countOfAllReconstWords += reconstructedWords.Length;
    countOfAllMistakes += count;
}