/// <summary>
/// Copies the sample MHT file to a temporary location, converts it to HTML with
/// <c>MHTMLParser</c>, runs <c>SafeHtmlConverter</c> on the HTML and asserts that the
/// path returned by the converter exists on disk.
/// </summary>
/// <remarks>
/// All files created by the test are removed in the <c>finally</c> block so that a
/// failing assertion or an exception does not leave artifacts in the temp folder.
/// </remarks>
public void Verify_sanitize_of_single_mhtml_file()
{
    // Path.GetTempFileName() creates a zero-byte placeholder file on disk;
    // keep its path so it can be deleted once the test is over.
    var placeholder = Path.GetTempFileName();
    var tempFile = Path.ChangeExtension(placeholder, ".mht");
    var outFile = Path.ChangeExtension(tempFile, ".html");
    string result = null;

    try
    {
        File.Copy(TestConfig.PathToMht, tempFile);
        string mhtml = File.ReadAllText(tempFile);
        MHTMLParser parser = new MHTMLParser(mhtml)
        {
            OutputDirectory = Path.GetDirectoryName(tempFile),
            DecodeImageData = true
        };
        File.WriteAllText(outFile, parser.getHTMLText());

        _sanitizer = new SafeHtmlConverter(outFile)
        {
            Logger = NullLogger.Instance
        };
        result = _sanitizer.Run("jobtest");

        Assert.That(File.Exists(result), "Output pdf file not created");
    }
    finally
    {
        // File.Exists returns false for null, so this also covers the case
        // where the converter threw before producing a result.
        if (File.Exists(result))
        {
            File.Delete(result);
        }

        // File.Delete is a no-op for files that do not exist.
        File.Delete(outFile);
        File.Delete(tempFile);
        File.Delete(placeholder);
    }
}
/// <summary>
/// Handles one polled job: obtains the local file via <c>DownloadBlob</c>, converts it to
/// HTML first when it is an MHT/MHTML file, sanitizes the HTML, converts it to PDF and
/// registers the PDF as a new format of the document.
/// </summary>
/// <param name="parameters">Job parameters; tenant id, job id and file name are read from it.</param>
/// <param name="workingFolder">Folder passed to <c>DownloadBlob</c> and used as the MHTML parser output directory.</param>
/// <returns><c>ProcessResult.Ok</c> once the PDF format has been added.</returns>
protected override async Task<ProcessResult> OnPolling(PollerJobParameters parameters, string workingFolder)
{
    string downloadedPath = await DownloadBlob(
        parameters.TenantId,
        parameters.JobId,
        parameters.FileName,
        workingFolder).ConfigureAwait(false);

    if (Logger.IsDebugEnabled)
    {
        Logger.DebugFormat("Conversion of HtmlZip to PDF: file {0}", downloadedPath);
    }

    bool isMhtml =
        downloadedPath.EndsWith(".mht", StringComparison.OrdinalIgnoreCase) ||
        downloadedPath.EndsWith(".mhtml", StringComparison.OrdinalIgnoreCase);

    // Path of the HTML that is handed to the sanitizer; replaced below for MHT input.
    string htmlSource = downloadedPath;
    if (isMhtml)
    {
        string mhtmlContent = File.ReadAllText(downloadedPath);
        var mhtmlParser = new MHTMLParser(mhtmlContent)
        {
            OutputDirectory = workingFolder,
            DecodeImageData = true
        };

        // The extracted HTML is written next to the downloaded file, same name with ".html".
        string extractedHtmlPath = Path.ChangeExtension(downloadedPath, ".html");
        File.WriteAllText(extractedHtmlPath, mhtmlParser.getHTMLText());
        htmlSource = extractedHtmlPath;
    }

    var htmlSanitizer = new SafeHtmlConverter(htmlSource)
    {
        Logger = Logger
    };
    string sanitizedFile = htmlSanitizer.Run(parameters.JobId);

    var pdfConverter = new HtmlToPdfConverterFromDiskFile(sanitizedFile, base.JobsHostConfiguration)
    {
        Logger = Logger
    };
    var pdfConvertedFileName = pdfConverter.Run(parameters.TenantId, parameters.JobId);

    await AddFormatToDocumentFromFile(
        parameters.TenantId,
        parameters.JobId,
        new DocumentFormat(DocumentFormats.Pdf),
        pdfConvertedFileName,
        new Dictionary<string, object>()).ConfigureAwait(false);

    return ProcessResult.Ok;
}
/// <summary>
/// Command-line entry point. Reads the MHT file named by the first argument, asks
/// <c>MHTMLParser.DecompressString</c> for its parts, writes every part after the first
/// as a Base64-decoded image into "{name}_images" and writes the first part (with the
/// image references rewritten) to "{name}.html" in the current directory.
/// </summary>
/// <param name="args">Command-line arguments; <c>args[0]</c> is the path of the MHT file.</param>
static void Main(string[] args)
{
    Console.Title = "QQMht To Html";

    // Without this guard a missing argument crashes with IndexOutOfRangeException.
    if (args == null || args.Length == 0)
    {
        Console.Error.WriteLine("Usage: pass the path of the .mht file as the first argument.");
        return;
    }

    string outdir = ".";
    string hFile = Path.GetFileNameWithoutExtension(args[0]);
    string d = $@"{outdir}\{hFile}_images";

    string mht;
    try
    {
        mht = File.ReadAllText(args[0]);
    }
    catch (Exception ex)
    {
        // Still a best-effort exit, but tell the user why nothing was produced.
        Console.Error.WriteLine($"Cannot read '{args[0]}': {ex.Message}");
        return;
    }

    MHTMLParser parser = new MHTMLParser(mht);
    Console.WriteLine("Processing data...");
    List<string[]> nodes = parser.DecompressString();
    if (nodes.Count == 0)
    {
        return;
    }

    // Entry 0 is used as the HTML document; every following entry is treated as an image.
    int c = nodes.Count - 1;
    if (nodes.Count > 1)
    {
        Console.WriteLine($"{c} image(s) found.");
        Directory.CreateDirectory(d);
        Console.WriteLine($"create dir {d}");
    }

    string html = nodes[0][2];
    for (int i = 1; i < nodes.Count; i++)
    {
        // Each entry is indexed as [0] = "type/subtype", [1] = "name.ext", [2] = Base64 payload.
        string ext = nodes[i][0].Split('/')[1];
        string name = nodes[i][1].Split('.')[0];
        if (ext == "jpeg")
        {
            ext = "jpg";
        }

        string iFile = $@"{d}\{name}.{ext}";
        byte[] bytes = Convert.FromBase64String(nodes[i][2]);
        File.WriteAllBytes(iFile, bytes);

        // Point the HTML at the extracted image instead of the "<name>.dat" reference.
        html = html.Replace($"{name}.dat", iFile);
        Console.WriteLine($"Processing image...({i}/{c}): {iFile}");
    }

    hFile = $@"{outdir}\{hFile}.html";
    File.WriteAllText(hFile, html);
    Console.Write("All done.");
}
/// <summary>
/// Prepares the downloaded file for HTML processing, depending on its extension.
/// </summary>
/// <remarks>
/// <para>".htmlzip" / ".htmzip": the archive is extracted into <paramref name="workingFolder"/> and
/// the path of the file with the same name but extension "html" (or "htm") is returned when it exists;
/// otherwise an error is logged and the original path is returned.</para>
/// <para>".mht" / ".mhtml": the HTML produced by <c>MHTMLParser</c> is written to the original path
/// with ".html" appended, and that new path is returned.</para>
/// <para>Any other extension: the path is returned unchanged.</para>
/// </remarks>
/// <param name="pathToFile">Path of the file to process.</param>
/// <param name="workingFolder">Extraction target for zip input and output directory for the MHTML parser.</param>
/// <returns>Path of the HTML file to use, or the original path when nothing was converted.</returns>
private string ProcessFile(string pathToFile, string workingFolder)
{
    // Invariant lowercasing: the culture-sensitive ToLower() turns 'I' into a dotless 'ı'
    // under Turkish cultures, so ".HTMLZIP" would not match any of the literals below.
    var extension = Path.GetExtension(pathToFile).ToLowerInvariant();

    if (extension == ".htmlzip" || extension == ".htmzip")
    {
        ZipFile.ExtractToDirectory(pathToFile, workingFolder);
        Logger.DebugFormat("Extracted zip to {0}", workingFolder);

        // The main document is looked up under the archive's own name, first as .html then as .htm.
        var htmlFile = Path.ChangeExtension(pathToFile, "html");
        if (File.Exists(htmlFile))
        {
            Logger.DebugFormat("Html file is {0}", htmlFile);
            return htmlFile;
        }

        htmlFile = Path.ChangeExtension(pathToFile, "htm");
        if (File.Exists(htmlFile))
        {
            Logger.DebugFormat("Html file is {0}", htmlFile);
            return htmlFile;
        }

        // Falls through to the final return, which hands back the original archive path.
        Logger.ErrorFormat("Invalid HTMLZIP file, name is {0} but corresponding html file not found after decompression", Path.GetFileName(pathToFile));
    }
    else if (extension == ".mht" || extension == ".mhtml")
    {
        MHTMLParser parser = new MHTMLParser(File.ReadAllText(pathToFile));
        parser.OutputDirectory = workingFolder;
        parser.DecodeImageData = false;
        var html = parser.getHTMLText();

        // ".html" is appended rather than substituted, e.g. "x.mht" becomes "x.mht.html".
        pathToFile = pathToFile + ".html";
        File.WriteAllText(pathToFile, html);
    }

    return pathToFile;
}