Example #1
0
        /// <summary>
        /// Heuristically decides whether a file looks like plain text
        /// (mostly letters and common punctuation, with very few digits).
        /// </summary>
        /// <param name="fileName">Path of the file to inspect.</param>
        /// <returns>
        /// <c>false</c> when the file is smaller than 20 bytes, larger than
        /// 5,000,000 bytes, contains a control character other than tab, CR or LF,
        /// or fails the letter/digit ratio checks; otherwise <c>true</c>.
        /// </returns>
        public static bool IsPlainText(string fileName)
        {
            // Characters that are counted together with letters as "text".
            const string textPunctuation = " -,.!?[]()\r\n";

            long sizeInBytes = new FileInfo(fileName).Length;
            if (sizeInBytes < 20 || sizeInBytes > 5000000)
            {
                return false; // too short or too large to be plain text
            }

            var encoding = LanguageAutoDetect.GetEncodingFromFile(fileName);
            string content = File.ReadAllText(fileName, encoding);
            int total = content.Length;

            int digits = 0;
            int letters = 0;
            foreach (char c in content)
            {
                // The letter/punctuation test must run first: CR and LF are control
                // characters but are accepted here as ordinary text.
                if (char.IsLetter(c) || textPunctuation.IndexOf(c) >= 0)
                {
                    letters++;
                    continue;
                }

                if (c != '\t' && char.IsControl(c))
                {
                    return false; // binary found
                }

                if (CharUtils.IsDigit(c))
                {
                    digits++;
                }
            }

            if (total < 100)
            {
                // Short texts use fixed limits instead of ratios.
                return digits < 5 && letters > 20;
            }

            double digitLimit = total * 0.002 + 1;
            double letterMinimum = total * 0.8;
            return digits < digitLimit && letters > letterMinimum;
        }
Example #2
0
        /// <summary>
        /// Heuristically decides whether a file looks like plain text rather than
        /// binary data or a time-coded format.
        /// </summary>
        /// <param name="fileName">Path of the file to inspect.</param>
        /// <returns>
        /// <c>false</c> when the file is smaller than 20 bytes, larger than
        /// 5,000,000 bytes, contains a control character other than tab, CR or LF,
        /// or contains too many number patterns; <c>true</c> when more than ten long
        /// digit-free runs are found; otherwise the result of the letter/digit
        /// ratio check.
        /// </returns>
        public static bool IsPlainText(string fileName)
        {
            // Characters that are counted together with letters as "text".
            const string textPunctuation = " -,.!?[]()\r\n";

            long sizeInBytes = new FileInfo(fileName).Length;
            if (sizeInBytes < 20 || sizeInBytes > 5000000)
            {
                return false; // too short or too large to be plain text
            }

            var encoding = LanguageAutoDetect.GetEncodingFromFile(fileName);
            string content = ReadAllTextShared(fileName, encoding);
            int total = content.Length;

            int digits = 0;
            int letters = 0;
            foreach (char c in content)
            {
                // The letter/punctuation test must run first: CR and LF are control
                // characters but are accepted here as ordinary text.
                if (char.IsLetter(c) || textPunctuation.IndexOf(c) >= 0)
                {
                    letters++;
                    continue;
                }

                if (c != '\t' && char.IsControl(c))
                {
                    return false; // binary found
                }

                if (CharUtils.IsDigit(c))
                {
                    digits++;
                }
            }

            if (total < 100)
            {
                // Short texts use fixed limits instead of ratios.
                return digits < 5 && letters > 20;
            }

            // Pairs of numbers joined by a single separator, e.g. "12:34" or "1,5".
            int separatedNumberPairs = Regex.Matches(content, @"\d+[.:,; -]\d+").Count;
            if (separatedNumberPairs > 30)
            {
                return false; // looks like time codes
            }

            // Runs of three to eight consecutive digits.
            int longDigitRuns = Regex.Matches(content, @"\d{3,8}").Count;
            if (longDigitRuns > 30 || (total < 1000 && longDigitRuns > 10))
            {
                return false; // looks like time codes
            }

            // Stretches of at least 150 characters that contain no digit at all.
            int longDigitFreeRuns = Regex.Matches(content, @"[^\d]{150,100000}").Count;
            if (longDigitFreeRuns > 10)
            {
                return true; // looks like text
            }

            double digitLimit = total * 0.015 + 25;
            double letterMinimum = total * 0.8;
            return digits < digitLimit && letters > letterMinimum;
        }