/// <summary>
/// Runs encoding detection over files found recursively under <c>_tempDir</c>
/// and prints, per file, the file name and the detected encoding, followed by
/// a summary of how many files yielded a non-null encoding.
/// </summary>
/// <param name="encDetectorOptions">Options forwarded to <c>EncodingDetector.Detect</c>.</param>
private void RunTest(EncodingDetector.Options encDetectorOptions)
        {
            int filesSeen     = 0;
            int detectedCount = 0;

            var candidates = Directory.EnumerateFiles(_tempDir, "*.*", SearchOption.AllDirectories);

            foreach (string path in candidates)
            {
                Encoding detected = EncodingDetector.Detect(Utils.ReadFileContentSample(path), encDetectorOptions);

                filesSeen++;
                detectedCount += (detected != null) ? 1 : 0;

                WriteToConsole(Path.GetFileName(path) + ": " + detected);

                // Stop once the counter has passed ten files.
                if (filesSeen > 10)
                {
                    break;
                }
            }

            Console.WriteLine("Found Encoding in:" + detectedCount + " out of " + filesSeen);
        }
Example #2
0
 /// <summary>
 /// Opens the given file, runs <see cref="EncodingDetector"/> on its stream and
 /// prints the detection result to the console. Any exception is caught and its
 /// message is printed instead.
 /// </summary>
 /// <param name="fileName">File name that is resolved to an actual path via <c>Common.GetFilePath</c>.</param>
 public static void ExtractEncodingByBOM(string fileName)
 {
     //ExStart:ExtractEncodingByBOM
     try
     {
         // NOTE(review): code page 1251 is presumably the fallback encoding used when detection fails - confirm against the EncodingDetector API.
         EncodingDetector detector = new EncodingDetector(Encoding.GetEncoding(1251));
         //get file actual path
         String filePath = Common.GetFilePath(fileName);

         // The stream is only read, so open it read-only and dispose it
         // deterministically so the file handle is released even if Detect throws.
         using (Stream stream = new FileStream(filePath, FileMode.Open, FileAccess.Read))
         {
             Console.WriteLine(detector.Detect(stream));
         }
     }
     catch (Exception ex)
     {
         Console.WriteLine(ex.Message);
     }
     //ExEnd:ExtractEncodingByBOM
 }
Example #3
0
        /// <summary>
        /// Detects the encoding of an uploaded file and returns it as a JSON list
        /// containing a single string. If anything fails, the list contains
        /// "File Format not supported" instead.
        /// </summary>
        /// <param name="fileName">Name of the file, appended to the uploads folder path.</param>
        /// <returns>A JSON result wrapping the list of result strings.</returns>
        public ActionResult ExtractDocumentEndocing([FromBody] string fileName)
        {
            List <string>    extractedText = new List <string>();
            ExtractorFactory factory       = new ExtractorFactory();

            // NOTE(review): fileName comes from the request body and is concatenated
            // into the path unchecked; a value containing "..\" could escape the
            // uploads folder. Consider validating it before mapping the path.
            string           filePath      = Server.MapPath("../App_Data//Uploads//" + fileName);

            try
            {
                EncodingDetector detector = new EncodingDetector(Encoding.GetEncoding(1251));

                // Dispose the stream so the file handle is released after detection.
                using (Stream stream = new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
                {
                    extractedText.Add(detector.Detect(stream).ToString());
                }
            }
            catch (Exception)
            {
                extractedText.Add("File Format not supported");
            }
            return(Json(extractedText, JsonRequestBehavior.AllowGet));
        }
Example #4
0
        /// <summary>
        /// Detects the encoding of the sample bytes carried by <paramref name="dto"/>
        /// using the MLang option, then updates the shared file and hit counters under
        /// <c>lockObj</c>. The whole call is timed under a per-thread stopwatch key.
        /// </summary>
        /// <param name="dto">Carrier of the sample bytes to inspect.</param>
        private void DetectEncodingSyncAction(DetectEncodingFileDto dto)
        {
            // One timer key per managed thread so concurrent calls do not share a key.
            string timerKey = "DetectEncodingSyncAction_" + Thread.CurrentThread.ManagedThreadId;

            StopWatch.Start(timerKey);

            Encoding detected = EncodingDetector.Detect(dto.SampleBytes, EncodingDetector.Options.MLang);
            bool     wasFound = detected != null;

            lock (lockObj)
            {
                _totalFiles++;

                if (wasFound)
                {
                    _numFoundEncodings++;
                }
            }

            StopWatch.Stop(timerKey);
        }
Example #5
0
        /// <summary>
        /// Rewrites a file in place when its detected encoding is <c>FileRecoder.win1252</c>.
        /// Bytes below 0x80 are copied unchanged; every byte at or above 0x80 is replaced
        /// by the byte sequence stored in <c>FileRecoder.charmap1[b - 0x80]</c>
        /// (presumably the UTF-8 form of that character - confirm against the table).
        /// Files with any other detected encoding are left untouched.
        /// </summary>
        /// <param name="context">Supplies the full name of the file to recode.</param>
        /// <returns>Always <c>null</c>.</returns>
        public static Event?Recode(FileContext context)
        {
            byte[]   content  = File.ReadAllBytes(context.FullFileName);
            Encoding encoding = EncodingDetector.Detect(content);

            if (encoding != FileRecoder.win1252)
            {
                return(null);
            }

            // Compute the exact output size up front. A MemoryStream wrapped around a
            // fixed array cannot grow, so a 1.5x estimate fails with NotSupportedException
            // as soon as the replacement sequences average more than 1.5 bytes per input byte.
            int length = 0;

            foreach (byte b in content)
            {
                length += b >= 0x80 ? FileRecoder.charmap1[b - 0x80].Length : 1;
            }

            // The shared scratch buffer is reused across calls and only grown when needed.
            if (FileRecoder.resultContent.Length < length)
            {
                FileRecoder.resultContent = new byte[length];
            }

            using (MemoryStream mem = new(FileRecoder.resultContent))
            {
                foreach (byte b in content)
                {
                    if (b >= 0x80)
                    {
                        byte[] buffer = FileRecoder.charmap1[b - 0x80];
                        mem.Write(buffer, 0, buffer.Length);
                    }
                    else
                    {
                        mem.WriteByte(b);
                    }
                }

                length = (int)mem.Position;
            }

            using FileStream stream = File.OpenWrite(context.FullFileName);
            stream.Write(FileRecoder.resultContent, 0, length);

            // OpenWrite does not truncate an existing file; cut it to the written
            // length so no stale bytes from the previous content can remain.
            stream.SetLength(length);

            return(null);
        }
        /// <summary>
        /// Produces a text report for a file through <c>ProcessTask</c>. The report written
        /// to the output path contains, in order: the detected encoding, the metadata
        /// key/value pairs (when a metadata extractor is available) and the parsed content.
        /// Excel formats are extracted sheet by sheet; everything else goes through a
        /// formatted text extractor, falling back to a plain text extractor.
        /// </summary>
        /// <param name="fileName">Name of the file to parse; its extension selects the extraction path.</param>
        /// <param name="folderName">Folder name forwarded to <c>ProcessTask</c>.</param>
        /// <returns>
        /// The response from <c>ProcessTask</c>, or a response with status code 500 carrying
        /// the exception details when an exception is caught here.
        /// </returns>
        private async Task <Response> ParseFileText(string fileName, string folderName)
        {
            try
            {
                return(await ProcessTask(fileName, folderName, ".txt", false, "", delegate(string inFilePath, string outPath, string zipOutFolder)
                {
                    EncodingDetector detector = new EncodingDetector(Encoding.GetEncoding(1251));

                    // CreateDirectory is a no-op when the directory already exists.
                    Directory.CreateDirectory(zipOutFolder);

                    // The stream is only read for detection, so open it read-only.
                    using (Stream stream = new FileStream(inFilePath, FileMode.Open, FileAccess.Read)) {
                        System.IO.File.WriteAllText(outPath, "Encoding: " + detector.Detect(stream, true) + Environment.NewLine);
                    }

                    ExtractorFactory factory = new ExtractorFactory();
                    MetadataExtractor metadataExtractor = factory.CreateMetadataExtractor(inFilePath);
                    if (metadataExtractor != null)
                    {
                        MetadataCollection metadataCollection = metadataExtractor.ExtractMetadata(inFilePath);

                        System.IO.File.AppendAllText(outPath, Environment.NewLine + "Metadata:" + Environment.NewLine);
                        foreach (string key in metadataCollection.Keys)
                        {
                            System.IO.File.AppendAllText(outPath, string.Format("{0} = {1}", key, metadataCollection[key]) + Environment.NewLine);
                        }
                    }

                    System.IO.File.AppendAllText(outPath, Environment.NewLine + "Parsed content:" + Environment.NewLine);

                    // TrimStart tolerates names without an extension (Substring(1) would throw
                    // on the empty string); the invariant lower-casing keeps the format lookup
                    // independent of the current culture.
                    string fileExt = Path.GetExtension(fileName).TrimStart('.').ToLowerInvariant();
                    if (GetFormatType(fileExt) == FormatType.Excel)
                    {
                        CellsTextExtractor extractor = new CellsTextExtractor(inFilePath);
                        extractor.ExtractMode = ExtractMode.Standard;
                        for (int sheetIndex = 0; sheetIndex < extractor.SheetCount; sheetIndex++)
                        {
                            // Label each sheet with its own 1-based number, not the total sheet count.
                            System.IO.File.AppendAllText(outPath, Environment.NewLine + "Sheet # " + (sheetIndex + 1) + Environment.NewLine);
                            System.IO.File.AppendAllText(outPath, extractor.ExtractSheet(sheetIndex));
                        }
                    }
                    else
                    {
                        // Prefer the formatted extractor; fall back to the plain one when unavailable.
                        TextExtractor textExtractor = factory.CreateFormattedTextExtractor(inFilePath)
                                                      ?? factory.CreateTextExtractor(inFilePath);
                        System.IO.File.AppendAllText(outPath, textExtractor.ExtractAll());
                    }
                }));
            }
            catch (Exception exc)
            {
                return(new Response {
                    FileName = fileName, FolderName = folderName, OutputType = "txt", Status = exc.Message, StatusCode = 500, Text = exc.ToString()
                });
            }
        }
Example #7
0
        /// <summary>
        /// Runs encoding detection over the non-binary files found recursively under
        /// <paramref name="dir"/>, timing the sample read, the binary check and the
        /// detection step separately. Prints the detected encoding per file, a summary
        /// line, and finally the collected timings, which are then cleared.
        /// </summary>
        /// <param name="encDetectorOptions">Options forwarded to <c>EncodingDetector.Detect</c>; its string form is also the timer key for the detection step.</param>
        /// <param name="dir">Root directory to scan.</param>
        private void RunTest(EncodingDetector.Options encDetectorOptions, string dir)
        {
            int processedCount = 0;
            int detectedCount  = 0;

            StopWatch overallTimer = new StopWatch();

            overallTimer.Start();

            string detectorKey = encDetectorOptions.ToString();

            foreach (string path in Directory.EnumerateFiles(dir, "*.*", SearchOption.AllDirectories))
            {
                StopWatch.Start("ReadFileSample");
                var sample = Utils.ReadFileContentSample(path);
                StopWatch.Stop("ReadFileSample");

                StopWatch.Start("IsBinaryFile");
                bool looksBinary = Utils.IsBinaryFile(sample);
                StopWatch.Stop("IsBinaryFile");

                // Binary files are skipped and not counted.
                if (looksBinary)
                {
                    continue;
                }

                StopWatch.Start(detectorKey);

                Encoding detected = EncodingDetector.Detect(sample, encDetectorOptions);
                processedCount++;
                detectedCount += (detected != null) ? 1 : 0;

                StopWatch.Stop(detectorKey);

                WriteToConsole(path + ": " + detected);

                // Stop once the counter has passed ten files.
                if (processedCount > 10)
                {
                    break;
                }
            }

            Console.WriteLine("Found Encoding in:" + detectedCount + " out of " + processedCount);

            overallTimer.Stop();

            StopWatch.PrintCollection(overallTimer.Milliseconds);
            StopWatch.Collection.Clear();
        }