/// <summary>
/// Heuristically decides whether a file looks like plain text, judging by its size
/// and by how its characters split between letters/punctuation and digits.
/// </summary>
/// <param name="fileName">Path of the file to inspect.</param>
/// <returns>
/// <c>true</c> when the content passes the letter and digit thresholds; <c>false</c> when the
/// file is smaller than 20 or larger than 5,000,000 bytes, contains a control character
/// other than tab, carriage return or line feed, or fails the thresholds.
/// </returns>
public static bool IsPlainText(string fileName)
{
    long sizeInBytes = new FileInfo(fileName).Length;
    if (sizeInBytes < 20 || sizeInBytes > 5000000)
    {
        return false; // too short or too large to be plain text
    }

    // Characters that count as "text" in addition to letters.
    const string textPunctuation = " -,.!?[]()\r\n";

    var encoding = LanguageAutoDetect.GetEncodingFromFile(fileName);
    string content = File.ReadAllText(fileName, encoding);

    int digits = 0;
    int textChars = 0;
    foreach (char c in content)
    {
        if (char.IsLetter(c) || textPunctuation.IndexOf(c) >= 0)
        {
            textChars++;
            continue;
        }

        if (c != '\t' && char.IsControl(c))
        {
            return false; // binary found
        }

        if (CharUtils.IsDigit(c))
        {
            digits++;
        }
    }

    int length = content.Length;
    if (length < 100)
    {
        // Short content is judged on absolute counts rather than ratios.
        return digits < 5 && textChars > 20;
    }

    // Longer content: very few digits allowed, and most characters must be text.
    double maxDigits = length * 0.002 + 1;
    double minTextChars = length * 0.8;
    return digits < maxDigits && textChars > minTextChars;
}
/// <summary>
/// Heuristically decides whether a file looks like plain text. It checks the file size,
/// rejects content containing control characters, rejects content whose number patterns
/// look like time codes, and finally compares letter and digit counts with thresholds
/// derived from the content length.
/// </summary>
/// <param name="fileName">Path of the file to inspect.</param>
/// <returns>
/// <c>true</c> when the content looks like plain text; <c>false</c> when the file is smaller
/// than 20 or larger than 5,000,000 bytes, contains a control character other than tab,
/// carriage return or line feed, looks like time codes, or fails the thresholds.
/// </returns>
public static bool IsPlainText(string fileName)
{
    long sizeInBytes = new FileInfo(fileName).Length;
    if (sizeInBytes < 20 || sizeInBytes > 5000000)
    {
        return false; // too short or too large to be plain text
    }

    // Characters that count as "text" in addition to letters.
    const string textPunctuation = " -,.!?[]()\r\n";

    var encoding = LanguageAutoDetect.GetEncodingFromFile(fileName);
    string content = ReadAllTextShared(fileName, encoding);

    int digits = 0;
    int textChars = 0;
    foreach (char c in content)
    {
        if (char.IsLetter(c) || textPunctuation.IndexOf(c) >= 0)
        {
            textChars++;
            continue;
        }

        if (c != '\t' && char.IsControl(c))
        {
            return false; // binary found
        }

        if (CharUtils.IsDigit(c))
        {
            digits++;
        }
    }

    int length = content.Length;
    if (length < 100)
    {
        // Short content is judged on absolute counts rather than ratios.
        return digits < 5 && textChars > 20;
    }

    // Two numbers joined by a single separator, e.g. "12:34" or "1,5".
    int separatedNumberCount = Regex.Matches(content, @"\d+[.:,; -]\d+").Count;
    if (separatedNumberCount > 30)
    {
        return false; // looks like time codes
    }

    // Runs of three to eight consecutive digits.
    int digitRunCount = Regex.Matches(content, @"\d{3,8}").Count;
    if (digitRunCount > 30 || (length < 1000 && digitRunCount > 10))
    {
        return false; // looks like time codes
    }

    // Stretches of at least 150 characters that contain no digit at all.
    int longDigitFreeRunCount = Regex.Matches(content, @"[^\d]{150,100000}").Count;
    if (longDigitFreeRunCount > 10)
    {
        return true; // looks like text
    }

    // Fallback: limited digits allowed, and most characters must be text.
    double maxDigits = length * 0.015 + 25;
    double minTextChars = length * 0.8;
    return digits < maxDigits && textChars > minTextChars;
}