/// <summary>
/// Reads the document "filename.doc", parses it with Tika's
/// <c>AutoDetectParser</c>, serialises the resulting SAX events as HTML
/// and writes the bytes to <c>C:\toHtml\text.html</c>.
/// </summary>
/// <remarks>
/// Both the input and the output path are hard-coded. The charset used for
/// the HTML output is whatever <c>UniversalEncodingDetector</c> reports for
/// the raw input bytes; when the detector returns null, UTF_32 is used.
/// </remarks>
public void Convert()
{
    // Load the whole source document into memory up front.
    byte[] sourceBytes = Files.toByteArray(new File(@"filename.doc"));

    var parser = new AutoDetectParser();
    var htmlBuffer = new ByteArrayOutputStream();
    var saxFactory = (SAXTransformerFactory)TransformerFactory.newInstance();
    var documentStream = new ByteArrayInputStream(sourceBytes);
    // ToHTMLContentHandler handler = new ToHTMLContentHandler();

    // Ask the detector for a charset; use UTF_32 when it gives none.
    var detectionMetadata = new Metadata();
    EncodingDetector detector = new UniversalEncodingDetector();
    var outputCharset = detector.detect(documentStream, detectionMetadata);
    if (outputCharset == null)
    {
        outputCharset = new UTF_32();
    }

    // Configure a SAX-to-stream serialiser that emits indented HTML into the buffer.
    TransformerHandler htmlSerializer = saxFactory.newTransformerHandler();
    var transformer = htmlSerializer.getTransformer();
    transformer.setOutputProperty(OutputKeys.METHOD, "html");
    transformer.setOutputProperty(OutputKeys.INDENT, "yes");
    transformer.setOutputProperty(OutputKeys.ENCODING, outputCharset.toString());
    htmlSerializer.setResult(new StreamResult(htmlBuffer));

    // The parser's SAX events pass through ImageRewritingContentHandler before
    // they reach the serialiser.
    ContentHandler rewritingHandler = new ImageRewritingContentHandler(htmlSerializer);
    // ExpandedTitleContentHandler handler1 = new ExpandedTitleContentHandler(handler);

    // Register the extractor the parser should use for EmbeddedDocumentExtractor lookups.
    var parseContext = new ParseContext();
    parseContext.set(typeof(EmbeddedDocumentExtractor), new FileEmbeddedDocumentEtractor());

    // The parse call receives a fresh Metadata instance, not the one used for detection.
    parser.parse(documentStream, rewritingHandler, new Metadata(), parseContext);

    System.IO.File.WriteAllBytes(@"C:\toHtml\text.html", htmlBuffer.toByteArray());
}
/// <summary>
/// Creates a <c>TransformerHandler</c> configured for the "text" output
/// method with indentation enabled, whose result is written to a newly
/// created <c>StringWriter</c>.
/// </summary>
/// <remarks>
/// Side effect: replaces the <c>_outputWriter</c> field with the new
/// writer on every call.
/// </remarks>
/// <returns>The configured handler.</returns>
private TransformerHandler getTransformerHandler()
{
    var saxFactory = (SAXTransformerFactory)TransformerFactory.newInstance();
    TransformerHandler textSerializer = saxFactory.newTransformerHandler();

    var transformer = textSerializer.getTransformer();
    transformer.setOutputProperty(OutputKeys.METHOD, "text");
    transformer.setOutputProperty(OutputKeys.INDENT, "yes");

    // Point the serialiser at a fresh writer and keep it in the field.
    _outputWriter = new StringWriter();
    textSerializer.setResult(new StreamResult(_outputWriter));

    return textSerializer;
}