/// <summary>
/// Runs encoding detection over files found recursively under <c>_tempDir</c>, reporting the
/// detected encoding for each file and a final summary line on the console.
/// </summary>
/// <remarks>
/// The loop ends as soon as the processed-file counter exceeds 10, i.e. after the 11th file.
/// </remarks>
/// <param name="encDetectorOptions">Options forwarded unchanged to <c>EncodingDetector.Detect</c>.</param>
private void RunTest(EncodingDetector.Options encDetectorOptions)
{
    int processedCount = 0;
    int detectedCount = 0;

    var candidatePaths = Directory.EnumerateFiles(_tempDir, "*.*", SearchOption.AllDirectories);
    foreach (string candidatePath in candidatePaths)
    {
        var sample = Utils.ReadFileContentSample(candidatePath);
        Encoding detected = EncodingDetector.Detect(sample, encDetectorOptions);

        processedCount++;
        detectedCount += (detected != null) ? 1 : 0;

        // Only the bare file name is reported, not the full path.
        WriteToConsole(Path.GetFileName(candidatePath) + ": " + detected);

        if (processedCount > 10)
        {
            break;
        }
    }

    Console.WriteLine("Found Encoding in:" + detectedCount + " out of " + processedCount);
}
/// <summary>
/// Detects encoding of a filestream when BOM is present and prints the result to the console.
/// </summary>
/// <remarks>
/// Any exception raised while resolving, opening or inspecting the file is caught and only its
/// message is written to the console.
/// </remarks>
/// <param name="fileName">Name of the file; resolved to a path through <c>Common.GetFilePath</c>.</param>
public static void ExtractEncodingByBOM(string fileName)
{
    //ExStart:ExtractEncodingByBOM
    try
    {
        // NOTE(review): code page 1251 is presumably the detector's fallback encoding - confirm against the EncodingDetector API.
        EncodingDetector detector = new EncodingDetector(Encoding.GetEncoding(1251));

        //get file actual path
        String filePath = Common.GetFilePath(fileName);

        // Open read-only (works on read-only files) and dispose the handle deterministically.
        using (Stream stream = new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.Read))
        {
            Console.WriteLine(detector.Detect(stream));
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex.Message);
    }
    //ExEnd:ExtractEncodingByBOM
}
/// <summary>
/// Detects the encoding of an uploaded file and returns it as JSON.
/// </summary>
/// <param name="fileName">Name of a file inside the <c>App_Data/Uploads</c> folder.</param>
/// <returns>
/// A JSON list with a single string: the detected encoding's <c>ToString()</c> value, or
/// <c>"File Format not supported"</c> when opening the file or detection throws.
/// </returns>
public ActionResult ExtractDocumentEndocing([FromBody] string fileName)
{
    List<string> extractedText = new List<string>();

    // fileName comes from the request body: keep only its final path segment so that
    // values such as "..\..\web.config" cannot escape the uploads folder.
    string safeFileName = Path.GetFileName(fileName);
    string filePath = Server.MapPath("../App_Data//Uploads//" + safeFileName);

    try
    {
        // NOTE(review): code page 1251 is presumably the detector's fallback encoding - confirm against the EncodingDetector API.
        EncodingDetector detector = new EncodingDetector(Encoding.GetEncoding(1251));

        // Dispose the stream so the file handle is released as soon as detection finishes.
        using (Stream stream = new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
        {
            extractedText.Add(detector.Detect(stream).ToString());
        }
    }
    catch (Exception)
    {
        // Every failure (missing file, unsupported format, null detection result) is reported with the same message.
        extractedText.Add("File Format not supported");
    }

    return Json(extractedText, JsonRequestBehavior.AllowGet);
}
/// <summary>
/// Detects the encoding of the sample bytes carried by <paramref name="dto"/> using the
/// <c>MLang</c> detector option and updates the shared file / hit counters.
/// </summary>
/// <remarks>
/// The whole call is timed through <c>StopWatch</c> under a key that includes the current
/// managed thread id. Counter updates are performed while holding <c>lockObj</c>.
/// </remarks>
/// <param name="dto">Carrier of the <c>SampleBytes</c> to inspect.</param>
private void DetectEncodingSyncAction(DetectEncodingFileDto dto)
{
    string timerKey = "DetectEncodingSyncAction_" + Thread.CurrentThread.ManagedThreadId;
    StopWatch.Start(timerKey);

    Encoding detected = EncodingDetector.Detect(dto.SampleBytes, EncodingDetector.Options.MLang);
    bool wasDetected = detected != null;

    lock (lockObj)
    {
        _totalFiles++;
        if (wasDetected)
        {
            _numFoundEncodings++;
        }
    }

    StopWatch.Stop(timerKey);
}
/// <summary>
/// Rewrites a file in place when its detected encoding is <c>FileRecoder.win1252</c>:
/// bytes below 0x80 are copied unchanged and every byte at or above 0x80 is replaced by the
/// byte sequence stored at <c>FileRecoder.charmap1[b - 0x80]</c>.
/// </summary>
/// <remarks>
/// Files whose detected encoding is anything else are left untouched.
/// NOTE(review): the map entries are presumably the UTF-8 encodings of the Windows-1252
/// characters - confirm against the definition of <c>charmap1</c>.
/// </remarks>
/// <param name="context">Supplies <c>FullFileName</c>, the path of the file to recode.</param>
/// <returns>Always <c>null</c>.</returns>
public static Event? Recode(FileContext context)
{
    byte[] content = File.ReadAllBytes(context.FullFileName);
    Encoding encoding = EncodingDetector.Detect(content);
    if (encoding != FileRecoder.win1252)
    {
        return null;
    }

    // Compute the exact output size up front. A fixed 1.5x estimate is too small when many
    // bytes expand to three bytes, and the fixed-size MemoryStream below cannot grow.
    int length = 0;
    foreach (byte b in content)
    {
        length += b >= 0x80 ? FileRecoder.charmap1[b - 0x80].Length : 1;
    }

    // The shared scratch buffer is reused across calls and only ever grown.
    if (FileRecoder.resultContent.Length < length)
    {
        FileRecoder.resultContent = new byte[length];
    }

    using (MemoryStream mem = new(FileRecoder.resultContent))
    {
        foreach (byte b in content)
        {
            if (b >= 0x80)
            {
                byte[] buffer = FileRecoder.charmap1[b - 0x80];
                mem.Write(buffer, 0, buffer.Length);
            }
            else
            {
                mem.WriteByte(b);
            }
        }

        length = (int)mem.Position;
    }

    using FileStream stream = File.OpenWrite(context.FullFileName);
    stream.Write(FileRecoder.resultContent, 0, length);

    // OpenWrite does not truncate an existing file; cut it to the new size so no stale
    // bytes survive if the recoded content is shorter than what was on disk.
    stream.SetLength(length);
    return null;
}
/// <summary>
/// Produces a <c>.txt</c> report for a file through <c>ProcessTask</c>: the detected encoding,
/// the metadata key/value pairs (when a metadata extractor is available) and the parsed text.
/// </summary>
/// <remarks>
/// For files whose format type is <c>FormatType.Excel</c> the text is extracted sheet by sheet;
/// for everything else a formatted text extractor is tried first, then a plain text extractor.
/// </remarks>
/// <param name="fileName">Name of the file to parse; its extension selects the extraction path.</param>
/// <param name="folderName">Folder name forwarded to <c>ProcessTask</c> and echoed in error responses.</param>
/// <returns>
/// The response produced by <c>ProcessTask</c>, or a response with <c>StatusCode</c> 500 carrying
/// the exception message and details when an exception reaches this method.
/// </returns>
private async Task<Response> ParseFileText(string fileName, string folderName)
{
    try
    {
        return await ProcessTask(fileName, folderName, ".txt", false, "", delegate(string inFilePath, string outPath, string zipOutFolder)
        {
            // NOTE(review): code page 1251 is presumably the detector's fallback encoding - confirm against the EncodingDetector API.
            EncodingDetector detector = new EncodingDetector(Encoding.GetEncoding(1251));

            if (!Directory.Exists(zipOutFolder))
            {
                Directory.CreateDirectory(zipOutFolder);
            }

            // Read-only access is sufficient for detection and also works on read-only input files.
            using (Stream stream = new FileStream(inFilePath, FileMode.Open, FileAccess.Read, FileShare.Read))
            {
                // WriteAllText (re)creates the output file; every later section is appended to it.
                System.IO.File.WriteAllText(outPath, "Encoding: " + detector.Detect(stream, true) + Environment.NewLine);
            }

            ExtractorFactory factory = new ExtractorFactory();
            MetadataExtractor metadataExtractor = factory.CreateMetadataExtractor(inFilePath);
            if (metadataExtractor != null)
            {
                MetadataCollection metadataCollection = metadataExtractor.ExtractMetadata(inFilePath);
                System.IO.File.AppendAllText(outPath, Environment.NewLine + "Metadata:" + Environment.NewLine);
                foreach (string key in metadataCollection.Keys)
                {
                    System.IO.File.AppendAllText(outPath, string.Format("{0} = {1}", key, metadataCollection[key]) + Environment.NewLine);
                }
            }

            System.IO.File.AppendAllText(outPath, Environment.NewLine + "Parsed content:" + Environment.NewLine);

            string fileExt = Path.GetExtension(fileName).Substring(1).ToLower();
            if (GetFormatType(fileExt) == FormatType.Excel)
            {
                CellsTextExtractor extractor = new CellsTextExtractor(inFilePath);
                extractor.ExtractMode = ExtractMode.Standard;
                for (int sheetIndex = 0; sheetIndex < extractor.SheetCount; sheetIndex++)
                {
                    // Label each section with the 1-based number of the current sheet
                    // (previously the total sheet count was printed for every sheet).
                    System.IO.File.AppendAllText(outPath, Environment.NewLine + "Sheet # " + (sheetIndex + 1) + Environment.NewLine);
                    System.IO.File.AppendAllText(outPath, extractor.ExtractSheet(sheetIndex));
                }
            }
            else
            {
                TextExtractor textExtractor = factory.CreateFormattedTextExtractor(inFilePath);
                if (textExtractor == null)
                {
                    textExtractor = factory.CreateTextExtractor(inFilePath);
                }

                if (textExtractor == null)
                {
                    // Fail with an explicit message instead of a NullReferenceException below.
                    throw new NotSupportedException("No text extractor is available for file: " + fileName);
                }

                System.IO.File.AppendAllText(outPath, textExtractor.ExtractAll());
            }
        });
    }
    catch (Exception exc)
    {
        return new Response
        {
            FileName = fileName,
            FolderName = folderName,
            OutputType = "txt",
            Status = exc.Message,
            StatusCode = 500,
            Text = exc.ToString()
        };
    }
}
/// <summary>
/// Runs encoding detection over the non-binary files found recursively under
/// <paramref name="dir"/>, timing each stage, and prints the per-file results, a summary line
/// and the collected timings.
/// </summary>
/// <remarks>
/// Files reported as binary by <c>Utils.IsBinaryFile</c> are skipped and not counted.
/// The loop ends as soon as the counted-file total exceeds 10, i.e. after the 11th counted file.
/// The static <c>StopWatch.Collection</c> is cleared before returning.
/// </remarks>
/// <param name="encDetectorOptions">
/// Options forwarded to <c>EncodingDetector.Detect</c>; their string form is also the timer key
/// for the detection stage.
/// </param>
/// <param name="dir">Root directory to enumerate.</param>
private void RunTest(EncodingDetector.Options encDetectorOptions, string dir)
{
    int countedFiles = 0;
    int detectedCount = 0;

    StopWatch overallTimer = new StopWatch();
    overallTimer.Start();

    string detectionTimerKey = encDetectorOptions.ToString();

    var candidatePaths = Directory.EnumerateFiles(dir, "*.*", SearchOption.AllDirectories);
    foreach (string candidatePath in candidatePaths)
    {
        StopWatch.Start("ReadFileSample");
        var sample = Utils.ReadFileContentSample(candidatePath);
        StopWatch.Stop("ReadFileSample");

        // The binary check is timed on its own; the timer is stopped before deciding to skip.
        StopWatch.Start("IsBinaryFile");
        bool isBinary = Utils.IsBinaryFile(sample);
        StopWatch.Stop("IsBinaryFile");
        if (isBinary)
        {
            continue;
        }

        StopWatch.Start(detectionTimerKey);
        Encoding detected = EncodingDetector.Detect(sample, encDetectorOptions);
        countedFiles++;
        detectedCount += (detected != null) ? 1 : 0;
        StopWatch.Stop(detectionTimerKey);

        WriteToConsole(candidatePath + ": " + detected);

        if (countedFiles > 10)
        {
            break;
        }
    }

    Console.WriteLine("Found Encoding in:" + detectedCount + " out of " + countedFiles);

    overallTimer.Stop();
    StopWatch.PrintCollection(overallTimer.Milliseconds);
    StopWatch.Collection.Clear();
}