/// <summary>
/// Reads a text file into memory, determines its code page and line-ending
/// convention, and returns it split into lines.
/// </summary>
/// <remarks>
/// Code page selection, in order:
/// 1. An empty file is reported as UTF-8 without a BOM.
/// 2. The leading bytes are compared with the BOM of every entry in
///    <c>TextFile.SpecialCodePageInfo</c>; the entry with the longest matching BOM wins.
/// 3. Otherwise the bytes are scanned for anything that is not ASCII or a well-formed
///    UTF-8 multi-byte sequence. If none is found the file is UTF-8; if some are found
///    the file is treated as ANSI and the code page comes from
///    <paramref name="ansiCodePageGetter"/> (or a default when it is null).
/// The line-ending style is taken from the first line break found in the decoded text;
/// <c>Environment.NewLine</c> is used when the text contains no line break.
/// </remarks>
/// <param name="fileName">Path of the file to load.</param>
/// <param name="ansiCodePageGetter">
/// Optional callback used when the file looks like ANSI text. It receives
/// <c>TextFile.AnsiCodePages</c>, the suggested default code page, the raw file bytes and
/// the offsets of the bytes that are not valid UTF-8. Returning null cancels the load.
/// </param>
/// <returns>The loaded file.</returns>
/// <exception cref="ArgumentNullException"><paramref name="fileName"/> is null or empty.</exception>
/// <exception cref="FileNotFoundException">The file does not exist.</exception>
/// <exception cref="EndOfStreamException">The file ended before its reported length was read.</exception>
/// <exception cref="OperationCanceledException"><paramref name="ansiCodePageGetter"/> returned null.</exception>
public static TextFile Load(string fileName, AnsiCodePageGetter ansiCodePageGetter)
{
    if (string.IsNullOrEmpty(fileName))
        throw new ArgumentNullException("fileName");
    if (!File.Exists(fileName))
        throw new FileNotFoundException("File not found.", fileName);

    byte[] allBytes;
    using (FileStream fs = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.Read))
    {
        allBytes = new byte[fs.Length];

        // Stream.Read may return fewer bytes than requested, so keep reading
        // until the buffer is full or the stream reports end-of-file.
        int filled = 0;
        while (filled < allBytes.Length)
        {
            int nread = fs.Read(allBytes, filled, allBytes.Length - filled);
            if (nread <= 0)
                throw new EndOfStreamException("Unexpected end of file while reading '" + fileName + "'.");
            filled += nread;
        }
    }

    int codePage;
    int textStart;
    if (allBytes.Length == 0)
    {
        codePage = (int)SpecialCodePage.Utf8;
        textStart = 0;
    }
    else
    {
        // Pick the special code page whose BOM is the longest prefix of the file.
        // NOTE(review): an entry with an empty BOM always matches, so if
        // SpecialCodePageInfo contains one, the UTF-8/ANSI scan below is never
        // reached - confirm this is intended.
        KeyValuePair<SpecialCodePage, CodePageInfoAttribute>? codePageGot = null;
        foreach (KeyValuePair<SpecialCodePage, CodePageInfoAttribute> kv in TextFile.SpecialCodePageInfo)
        {
            int bomLength = kv.Value.Bom.Length;
            bool maybe;
            if (bomLength == 0)
            {
                maybe = true;
            }
            else if (allBytes.Length < bomLength)
            {
                maybe = false;
            }
            else
            {
                // Compare contents element by element; Array.Equals(a, b) is
                // Object.Equals and would only compare the array references.
                maybe = true;
                for (int b = 0; b < bomLength; b++)
                {
                    if (allBytes[b] != kv.Value.Bom[b])
                    {
                        maybe = false;
                        break;
                    }
                }
            }

            if (maybe && ((codePageGot == null) || (codePageGot.Value.Value.Bom.Length < bomLength)))
                codePageGot = kv;
        }

        if (codePageGot != null)
        {
            codePage = (int)codePageGot.Value.Key;
            textStart = codePageGot.Value.Value.Bom.Length;
        }
        else
        {
            textStart = 0;

            // Offsets of bytes that cannot be part of well-formed UTF-8.
            List<int> ansiPositions = new List<int>();
            for (int i = 0; i < allBytes.Length; i++)
            {
                byte cur = allBytes[i];

                // 0x00-0x7F is plain ASCII. 0x80 is NOT: it is a continuation
                // byte and must not appear on its own.
                if (cur < 0x80)
                    continue;

                // Number of continuation bytes the lead byte announces (0 = not a lead byte).
                int trail;
                if ((cur >= 0xC0) && (cur <= 0xDF))
                    trail = 1; // 2-byte char
                else if ((cur >= 0xE0) && (cur <= 0xEF))
                    trail = 2; // 3-byte char
                else if ((cur >= 0xF0) && (cur <= 0xF7))
                    trail = 3; // 4-byte char
                else
                    trail = 0;

                // The sequence is valid only if all announced continuation bytes
                // exist and each lies in 0x80-0xBF.
                bool wellFormed = (trail > 0) && (i + trail < allBytes.Length);
                for (int t = 1; wellFormed && (t <= trail); t++)
                {
                    byte nxt = allBytes[i + t];
                    wellFormed = (nxt >= 0x80) && (nxt <= 0xBF);
                }

                if (wellFormed)
                    i += trail; // skip the continuation bytes just validated
                else
                    ansiPositions.Add(i);
            }

            if (ansiPositions.Count == 0)
            {
                codePage = (int)SpecialCodePage.Utf8;
            }
            else
            {
                // Suggest the system ANSI code page if it is a known one,
                // otherwise fall back to the first known ANSI code page.
                int defaultCodePage = System.Text.Encoding.Default.CodePage;
                if (!TextFile.AnsiCodePages.ContainsKey(defaultCodePage))
                {
                    foreach (KeyValuePair<int, string> kv in TextFile.AnsiCodePages)
                    {
                        defaultCodePage = kv.Key;
                        break;
                    }
                }

                if (ansiCodePageGetter == null)
                {
                    codePage = defaultCodePage;
                }
                else
                {
                    int? cp = ansiCodePageGetter(TextFile.AnsiCodePages, defaultCodePage, allBytes, ansiPositions.ToArray());
                    if (cp.HasValue)
                        codePage = cp.Value;
                    else
                        throw new OperationCanceledException();
                }
            }
        }
    }

    // Decode everything after the BOM (if any).
    string text;
    if ((allBytes.Length - textStart) == 0)
        text = "";
    else
        text = Encoding.GetEncoding((int)codePage).GetString(allBytes, textStart, allBytes.Length - textStart);

    // The first line break in the text decides the line-ending convention.
    string newLineSequence = null;
    for (int i = 0; (newLineSequence == null) && (i < text.Length); i++)
    {
        switch (text[i])
        {
            case '\r':
                if ((i < text.Length - 1) && (text[i + 1] == '\n'))
                    newLineSequence = "\r\n";
                else
                    newLineSequence = "\r";
                break;
            case '\n':
                newLineSequence = "\n";
                break;
        }
    }
    if (newLineSequence == null)
        newLineSequence = Environment.NewLine;

    IO.LineEndings lineEnding;
    switch (newLineSequence)
    {
        case "\r\n":
            lineEnding = IO.LineEndings.Windows;
            break;
        case "\n":
            lineEnding = IO.LineEndings.Linux;
            break;
        case "\r":
            lineEnding = IO.LineEndings.Mac;
            break;
        default:
            throw new Exception("Unknown line separator.");
    }

    return new TextFile(
        fileName,
        codePage,
        textStart > 0, // true when a BOM was found and skipped
        (newLineSequence.Length == 1)
            ? text.Split(newLineSequence[0])
            : text.Split(new string[] { newLineSequence }, StringSplitOptions.None),
        lineEnding);
}
/// <summary>
/// Loads a text file, detecting its code page (by BOM, then by a UTF-8 validity
/// scan, then by asking for an ANSI code page) and its line-ending convention.
/// </summary>
/// <remarks>
/// An empty file is reported as UTF-8 without a BOM. When several entries of
/// <c>TextFile.SpecialCodePageInfo</c> match the start of the file, the one with the
/// longest BOM is used. When no line break is present in the decoded text,
/// <c>Environment.NewLine</c> determines the reported line-ending style.
/// </remarks>
/// <param name="fileName">Path of the file to load.</param>
/// <param name="ansiCodePageGetter">
/// Optional callback invoked when the content is not valid UTF-8. It is passed
/// <c>TextFile.AnsiCodePages</c>, a suggested default code page, the raw bytes and the
/// offsets of the bytes that broke UTF-8 validation. A null result cancels the load.
/// When the callback itself is null the suggested default code page is used.
/// </param>
/// <returns>The loaded file with its lines, code page, BOM flag and line-ending style.</returns>
/// <exception cref="ArgumentNullException"><paramref name="fileName"/> is null or empty.</exception>
/// <exception cref="FileNotFoundException">The file does not exist.</exception>
/// <exception cref="EndOfStreamException">Fewer bytes could be read than the file length reported.</exception>
/// <exception cref="OperationCanceledException"><paramref name="ansiCodePageGetter"/> returned null.</exception>
public static TextFile Load(string fileName, AnsiCodePageGetter ansiCodePageGetter)
{
    if (string.IsNullOrEmpty(fileName))
    {
        throw new ArgumentNullException("fileName");
    }
    if (!File.Exists(fileName))
    {
        throw new FileNotFoundException("File not found.", fileName);
    }

    byte[] allBytes;
    using (FileStream fs = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.Read))
    {
        allBytes = new byte[fs.Length];

        // A single Read call is allowed to deliver only part of the requested
        // range, so loop until every byte has arrived.
        int position = 0;
        while (position < allBytes.Length)
        {
            int nread = fs.Read(allBytes, position, allBytes.Length - position);
            if (nread <= 0)
            {
                throw new EndOfStreamException("Unexpected end of file while reading '" + fileName + "'.");
            }
            position += nread;
        }
    }

    int codePage;
    int textStart;
    if (allBytes.Length == 0)
    {
        codePage = (int)SpecialCodePage.Utf8;
        textStart = 0;
    }
    else
    {
        // NOTE(review): entries whose BOM is empty match unconditionally; if
        // SpecialCodePageInfo has such an entry the heuristic scan further down
        // can never run - verify against the table's definition.
        KeyValuePair<SpecialCodePage, CodePageInfoAttribute>? codePageGot = null;
        foreach (KeyValuePair<SpecialCodePage, CodePageInfoAttribute> kv in TextFile.SpecialCodePageInfo)
        {
            bool maybe;
            if (kv.Value.Bom.Length > 0)
            {
                if (allBytes.Length >= kv.Value.Bom.Length)
                {
                    // Byte-wise prefix comparison. (Array.Equals on two arrays is
                    // reference equality and never matched a copied prefix.)
                    maybe = true;
                    for (int k = 0; maybe && (k < kv.Value.Bom.Length); k++)
                    {
                        maybe = (allBytes[k] == kv.Value.Bom[k]);
                    }
                }
                else
                {
                    maybe = false;
                }
            }
            else
            {
                maybe = true;
            }

            // Prefer the longest matching BOM.
            if (maybe && ((codePageGot == null) || (codePageGot.Value.Value.Bom.Length < kv.Value.Bom.Length)))
            {
                codePageGot = kv;
            }
        }

        if (codePageGot != null)
        {
            codePage = (int)codePageGot.Value.Key;
            textStart = codePageGot.Value.Value.Bom.Length;
        }
        else
        {
            textStart = 0;

            // Collect the offsets of all bytes that do not fit UTF-8.
            List<int> ansiPositions = new List<int>();
            int index = 0;
            while (index < allBytes.Length)
            {
                byte cur = allBytes[index];

                // Only 0x00-0x7F are single-byte characters; 0x80 is a bare
                // continuation byte and therefore not valid here.
                if (cur < 0x80)
                {
                    index++;
                    continue;
                }

                // How many continuation bytes does this lead byte require?
                int needed = 0;
                if ((cur >= 0xC0) && (cur <= 0xDF))
                {
                    needed = 1; // 2-byte char
                }
                else if ((cur >= 0xE0) && (cur <= 0xEF))
                {
                    needed = 2; // 3-byte char
                }
                else if ((cur >= 0xF0) && (cur <= 0xF7))
                {
                    needed = 3; // 4-byte char
                }

                // Valid only when the file is long enough and every following
                // byte is a continuation byte (0x80-0xBF).
                bool valid = (needed > 0) && (index + needed < allBytes.Length);
                for (int k = 1; valid && (k <= needed); k++)
                {
                    byte nxt = allBytes[index + k];
                    valid = (nxt >= 0x80) && (nxt <= 0xBF);
                }

                if (valid)
                {
                    index += needed + 1; // step past the whole character
                }
                else
                {
                    ansiPositions.Add(index);
                    index++;
                }
            }

            if (ansiPositions.Count == 0)
            {
                codePage = (int)SpecialCodePage.Utf8;
            }
            else
            {
                // Default suggestion: the system ANSI code page when known,
                // otherwise the first known ANSI code page.
                int defaultCodePage = System.Text.Encoding.Default.CodePage;
                if (!TextFile.AnsiCodePages.ContainsKey(defaultCodePage))
                {
                    foreach (KeyValuePair<int, string> kv in TextFile.AnsiCodePages)
                    {
                        defaultCodePage = kv.Key;
                        break;
                    }
                }

                if (ansiCodePageGetter == null)
                {
                    codePage = defaultCodePage;
                }
                else
                {
                    int? cp = ansiCodePageGetter(TextFile.AnsiCodePages, defaultCodePage, allBytes, ansiPositions.ToArray());
                    if (cp.HasValue)
                    {
                        codePage = cp.Value;
                    }
                    else
                    {
                        throw new OperationCanceledException();
                    }
                }
            }
        }
    }

    // Decode the payload that follows the BOM (if there was one).
    string text;
    if ((allBytes.Length - textStart) == 0)
    {
        text = "";
    }
    else
    {
        text = Encoding.GetEncoding((int)codePage).GetString(allBytes, textStart, allBytes.Length - textStart);
    }

    // The first line break encountered defines the file's line-ending style.
    string newLineSequence = null;
    for (int i = 0; (newLineSequence == null) && (i < text.Length); i++)
    {
        switch (text[i])
        {
            case '\r':
                if ((i < text.Length - 1) && (text[i + 1] == '\n'))
                {
                    newLineSequence = "\r\n";
                }
                else
                {
                    newLineSequence = "\r";
                }
                break;
            case '\n':
                newLineSequence = "\n";
                break;
        }
    }
    if (newLineSequence == null)
    {
        newLineSequence = Environment.NewLine;
    }

    IO.LineEndings lineEnding;
    switch (newLineSequence)
    {
        case "\r\n":
            lineEnding = IO.LineEndings.Windows;
            break;
        case "\n":
            lineEnding = IO.LineEndings.Linux;
            break;
        case "\r":
            lineEnding = IO.LineEndings.Mac;
            break;
        default:
            throw new Exception("Unknown line separator.");
    }

    string[] lines;
    if (newLineSequence.Length == 1)
    {
        lines = text.Split(newLineSequence[0]);
    }
    else
    {
        lines = text.Split(new string[] { newLineSequence }, StringSplitOptions.None);
    }

    // textStart > 0 means a BOM was recognized and skipped.
    return(new TextFile(fileName, codePage, textStart > 0, lines, lineEnding));
}