/// <summary>
/// Reads every source/target file pair listed in <c>m_srcFileList</c> and <c>m_tgtFileList</c>
/// line by line, shuffles the resulting sentence pairs and writes them to two freshly created
/// temporary files in the current working directory.
/// </summary>
/// <remarks>
/// <para>
/// Pairs whose <c>SrcLength</c> exceeds <c>m_maxSentLength</c> are counted and skipped.
/// When <c>m_blockSize</c> is greater than zero, pairs are shuffled and written out each time
/// the in-memory buffer reaches that size; whatever is left in the buffer is shuffled and
/// written at the end. Otherwise all kept pairs are shuffled together in a single pass.
/// </para>
/// <para>
/// Side effects: resets and then updates <c>CorpusSize</c> with the number of pairs kept,
/// writes progress messages through <c>Logger</c>, and, if <c>m_showTokenDist</c> is set,
/// logs the length distribution (in buckets of 100) once and then clears that flag.
/// </para>
/// <para>
/// Reading of a file pair stops at the first pair for which <c>IsEmptyPair()</c> returns true.
/// NOTE(review): what exactly counts as an "empty pair" is defined by <c>RawSntPair</c>,
/// which is not visible here - confirm against that type.
/// </para>
/// </remarks>
/// <returns>
/// A tuple holding the path of the shuffled source file and the path of the shuffled target file.
/// </returns>
private (string, string) ShuffleAll()
{
    // Histograms keyed by (length / 100); only reported when m_showTokenDist is set.
    SortedDictionary<int, int> dictSrcLenDist = new SortedDictionary<int, int>();
    SortedDictionary<int, int> dictTgtLenDist = new SortedDictionary<int, int>();

    string srcShuffledFilePath = Path.Combine(Directory.GetCurrentDirectory(), Path.GetRandomFileName() + ".tmp");
    string tgtShuffledFilePath = Path.Combine(Directory.GetCurrentDirectory(), Path.GetRandomFileName() + ".tmp");

    Logger.WriteLine($"Shuffling corpus for '{m_srcFileList.Count}' files.");

    List<RawSntPair> sntPairs = new List<RawSntPair>();
    CorpusSize = 0;
    int tooLongSrcSntCnt = 0;

    // 'using' guarantees the output files are flushed and closed even when reading,
    // shuffling or writing throws part-way through.
    using (StreamWriter swSrc = new StreamWriter(srcShuffledFilePath, false))
    using (StreamWriter swTgt = new StreamWriter(tgtShuffledFilePath, false))
    {
        // Shuffles the buffered pairs, appends them to both output files and empties the buffer.
        void WriteShuffledBuffer()
        {
            Shuffle(sntPairs);
            foreach (RawSntPair item in sntPairs)
            {
                swSrc.WriteLine(item.SrcSnt);
                swTgt.WriteLine(item.TgtSnt);
            }

            sntPairs.Clear();
        }

        // Increments the histogram bucket that the given length falls into.
        void CountLength(SortedDictionary<int, int> lenDist, int length)
        {
            int bucket = length / 100;

            // A missing key leaves 'count' at 0, so one lookup plus one write is enough.
            lenDist.TryGetValue(bucket, out int count);
            lenDist[bucket] = count + 1;
        }

        for (int i = 0; i < m_srcFileList.Count; i++)
        {
            if (m_showTokenDist)
            {
                Logger.WriteLine($"Process file '{m_srcFileList[i]}' and '{m_tgtFileList[i]}'");
            }

            // Readers are disposed on every exit path, including a failure to open the target file.
            using (StreamReader srSrc = new StreamReader(m_srcFileList[i]))
            using (StreamReader srTgt = new StreamReader(m_tgtFileList[i]))
            {
                // Keep reading until BOTH files are exhausted; if only one has ended,
                // its ReadLine() yields null and RawSntPair decides how to treat it.
                while (!(srSrc.EndOfStream && srTgt.EndOfStream))
                {
                    RawSntPair rawSntPair = new RawSntPair(srSrc.ReadLine(), srTgt.ReadLine());
                    if (rawSntPair.IsEmptyPair())
                    {
                        break;
                    }

                    // The distribution covers every pair read, including ones dropped below.
                    CountLength(dictSrcLenDist, rawSntPair.SrcLength);
                    CountLength(dictTgtLenDist, rawSntPair.TgtLength);

                    // Only the source side is checked against the length limit.
                    if (rawSntPair.SrcLength > m_maxSentLength)
                    {
                        tooLongSrcSntCnt++;
                        continue;
                    }

                    sntPairs.Add(rawSntPair);
                    CorpusSize++;

                    if (m_blockSize > 0 && sntPairs.Count >= m_blockSize)
                    {
                        WriteShuffledBuffer();
                    }
                }
            }
        }

        // Flush whatever is left in the buffer after the last file.
        if (sntPairs.Count > 0)
        {
            WriteShuffledBuffer();
        }
    }

    Logger.WriteLine($"Shuffled '{CorpusSize}' sentence pairs to file '{srcShuffledFilePath}' and '{tgtShuffledFilePath}'.");

    if (tooLongSrcSntCnt > 0)
    {
        Logger.WriteLine(Logger.Level.warn, ConsoleColor.Yellow, $"Found {tooLongSrcSntCnt} source sentences are longer than '{m_maxSentLength}' tokens, ignore them.");
    }

    if (m_showTokenDist)
    {
        // Logs one line per bucket together with the cumulative percentage of all counted pairs.
        void LogLengthDistribution(string title, SortedDictionary<int, int> lenDist)
        {
            Logger.WriteLine(title);

            int totalNum = 0;
            foreach (var pair in lenDist)
            {
                totalNum += pair.Value;
            }

            int accNum = 0;
            foreach (var pair in lenDist)
            {
                accNum += pair.Value;
                string accPercent = (100.0f * (float)accNum / (float)totalNum).ToString("F");
                Logger.WriteLine($"{pair.Key * 100} ~ {(pair.Key + 1) * 100}: {pair.Value} (acc: {accPercent}%)");
            }
        }

        LogLengthDistribution("Src token length distribution", dictSrcLenDist);
        LogLengthDistribution("Tgt token length distribution", dictTgtLenDist);

        // Report the distribution only once; later calls stay quiet.
        m_showTokenDist = false;
    }

    return (srcShuffledFilePath, tgtShuffledFilePath);
}