public void Verify_sanitize_of_single_mhtml_file()
        {
            var tempFile = Path.ChangeExtension(Path.GetTempFileName(), ".mht");

            File.Copy(TestConfig.PathToMht, tempFile);

            string      mhtml  = File.ReadAllText(tempFile);
            MHTMLParser parser = new MHTMLParser(mhtml)
            {
                OutputDirectory = Path.GetDirectoryName(tempFile),
                DecodeImageData = true
            };
            var outFile = Path.ChangeExtension(tempFile, ".html");

            File.WriteAllText(outFile, parser.getHTMLText());

            _sanitizer = new SafeHtmlConverter(outFile)
            {
                Logger = NullLogger.Instance
            };
            var result = _sanitizer.Run("jobtest");

            Assert.That(File.Exists(result), "Output pdf file not created");
            File.Delete(result);
        }
        protected async override Task <ProcessResult> OnPolling(PollerJobParameters parameters, string workingFolder)
        {
            string pathToFile = await DownloadBlob(parameters.TenantId, parameters.JobId, parameters.FileName, workingFolder).ConfigureAwait(false);

            //String fileName = Path.Combine(Path.GetDirectoryName(pathToFile), parameters.All[JobKeys.FileName]);
            //Logger.DebugFormat("Move blob id {0} to real filename {1}", pathToFile, fileName);
            //if (File.Exists(fileName)) File.Delete(fileName);
            //File.Copy(pathToFile, fileName);
            if (Logger.IsDebugEnabled)
            {
                Logger.DebugFormat("Conversion of HtmlZip to PDF: file {0}", pathToFile);
            }

            var file = pathToFile;

            if (pathToFile.EndsWith(".mht", StringComparison.OrdinalIgnoreCase) || pathToFile.EndsWith(".mhtml", StringComparison.OrdinalIgnoreCase))
            {
                string      mhtml  = File.ReadAllText(pathToFile);
                MHTMLParser parser = new MHTMLParser(mhtml)
                {
                    OutputDirectory = workingFolder,
                    DecodeImageData = true
                };
                var outFile = Path.ChangeExtension(pathToFile, ".html");
                File.WriteAllText(outFile, parser.getHTMLText());
                file = outFile;
            }

            var sanitizer = new SafeHtmlConverter(file)
            {
                Logger = Logger
            };

            file = sanitizer.Run(parameters.JobId);


            var converter = new HtmlToPdfConverterFromDiskFile(file, base.JobsHostConfiguration)
            {
                Logger = Logger
            };

            var pdfConvertedFileName = converter.Run(parameters.TenantId, parameters.JobId);

            await AddFormatToDocumentFromFile(
                parameters.TenantId,
                parameters.JobId,
                new DocumentFormat(DocumentFormats.Pdf),
                pdfConvertedFileName,
                new Dictionary <string, object>()).ConfigureAwait(false);

            return(ProcessResult.Ok);
        }
示例#3
0
        static void Main(string[] args)
        {
            Console.Title = "QQMht To Html";
            string outdir = ".";
            string hFile = Path.GetFileNameWithoutExtension(args[0]);
            string d = $@"{outdir}\{hFile}_images";
            string mht, html = string.Empty;

            try
            {
                mht = File.ReadAllText(args[0]);
            }
            catch
            {
                return;
            }
            MHTMLParser     parser = new MHTMLParser(mht);
            List <string[]> nodes;

            Console.WriteLine("Processing data...");
            nodes = parser.DecompressString();
            if (nodes.Count > 0)
            {
                int c = nodes.Count - 1;
                if (nodes.Count > 1)
                {
                    Console.WriteLine($"{c} image(s) found.");
                    Directory.CreateDirectory(d);
                    Console.WriteLine($"create dir {d}");
                }
                html = nodes[0][2];
                for (int i = 1; i < nodes.Count; i++)
                {
                    string ext  = nodes[i][0].Split("/".ToArray())[1],
                           name = nodes[i][1].Split(".".ToArray())[0];
                    if (ext == "jpeg")
                    {
                        ext = "jpg";
                    }
                    string iFile = $@"{d}\{name}.{ext}";
                    byte[] bytes = Convert.FromBase64String(nodes[i][2]);
                    File.WriteAllBytes(iFile, bytes);
                    html = html.Replace($"{name}.dat", $@"{iFile}");
                    Console.WriteLine($"Processing image...({i}/{c}): {iFile}");
                }
                hFile = $@"{outdir}\{hFile}.html";
                File.WriteAllText(hFile, html);
                Console.Write("All done.");
            }
        }
示例#4
0
        private string ProcessFile(string pathToFile, string workingFolder)
        {
            var extension = Path.GetExtension(pathToFile).ToLower();

            if (extension == ".htmlzip" || extension == ".htmzip")
            {
                ZipFile.ExtractToDirectory(pathToFile, workingFolder);
                Logger.DebugFormat("Extracted zip to {0}", workingFolder);

                var htmlFile = Path.ChangeExtension(pathToFile, "html");
                if (File.Exists(htmlFile))
                {
                    Logger.DebugFormat("Html file is {0}", htmlFile);
                    return(htmlFile);
                }

                htmlFile = Path.ChangeExtension(pathToFile, "htm");
                if (File.Exists(htmlFile))
                {
                    Logger.DebugFormat("Html file is {0}", htmlFile);
                    return(htmlFile);
                }

                Logger.ErrorFormat("Invalid HTMLZIP file, name is {0} but corresponding html file not found after decompression", Path.GetFileName(pathToFile));
            }
            else if (extension == ".mht" || extension == ".mhtml")
            {
                MHTMLParser parser = new MHTMLParser(File.ReadAllText(pathToFile));
                parser.OutputDirectory = workingFolder;
                parser.DecodeImageData = false;
                var html = parser.getHTMLText();
                pathToFile = pathToFile + ".html";
                File.WriteAllText(pathToFile, html);
            }
            return(pathToFile);
        }