/// <summary>
/// Loads the bytes of <paramref name="file"/> and runs them through Tika's auto-detecting
/// parser, returning the assembled extraction result.
/// </summary>
/// <param name="applicationId">Passed to <c>file.toByteArray</c> when loading the file content.</param>
/// <param name="file">The document whose content is parsed.</param>
/// <param name="succeed">Set to <c>false</c> when extraction fails; left untouched on success.</param>
/// <param name="errorText">Receives the failure's stack trace; left untouched on success.</param>
/// <returns>The extraction result, or <c>null</c> when any exception occurred.</returns>
public TextExtractionResult Extract(Guid applicationId, DocFileInfo file, ref bool succeed, ref string errorText)
{
    try
    {
        AutoDetectParser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        ParseContext parseContext = new ParseContext();

        // Register the parser under the Parser base type: Tika components look it up by
        // that key, and would not find a parser registered under its concrete type.
        parseContext.set(typeof(org.apache.tika.parser.Parser), parser);

        byte[] fileContent = file.toByteArray(applicationId);
        using (InputStream inputStream = TikaInputStream.get(fileContent, metadata))
        {
            try
            {
                parser.parse(inputStream, getTransformerHandler(), metadata, parseContext);
            }
            finally
            {
                // Close on the failure path as well, not only after a successful parse.
                inputStream.close();
            }
        }

        return assembleExtractionResult(_outputWriter.toString(), metadata);
    }
    catch (Exception ex)
    {
        // StackTrace can be null; fall back to the full exception text rather than
        // throwing a NullReferenceException from the error handler.
        errorText = ex.StackTrace ?? ex.ToString();
        succeed = false;
        return null;
    }
}
/// <summary>
/// Parses the stream produced by <paramref name="streamFactory"/> with Tika's auto-detecting
/// parser and assembles the captured output and metadata into an extraction result.
/// </summary>
/// <param name="streamFactory">Creates the input stream; receives the metadata instance used for the parse.</param>
/// <returns>The result assembled from the written output and the metadata.</returns>
/// <exception cref="TextExtractionException">Wraps any exception raised during extraction.</exception>
public TextExtractionResult Extract(Func<Metadata, InputStream> streamFactory)
{
    try
    {
        var tikaParser = new AutoDetectParser();
        var documentMetadata = new Metadata();
        var textSink = new StringWriter();
        var context = new ParseContext();

        // Key must be the Parser base type, otherwise parts of Tika won't find a usable parser.
        context.set(typeof(org.apache.tika.parser.Parser), tikaParser);

        using (var source = streamFactory(documentMetadata))
        {
            try
            {
                var handler = getTransformerHandler(textSink);
                tikaParser.parse(source, handler, documentMetadata, context);
            }
            finally
            {
                source.close();
            }
        }

        var extractedText = textSink.ToString();
        return assembleExtractionResult(extractedText, documentMetadata);
    }
    catch (Exception ex)
    {
        throw new TextExtractionException("Extraction failed.", ex);
    }
}
/// <summary>
/// Parses the stream produced by <paramref name="streamFactory"/> using a handler built from
/// <paramref name="outputStream"/>, and returns the metadata populated during the parse.
/// </summary>
/// <param name="streamFactory">Creates the input stream; receives the metadata instance used for the parse.</param>
/// <param name="outputStream">Passed to <c>GetTransformerHandler</c> to build the content handler.</param>
/// <returns>The metadata instance handed to the parser.</returns>
/// <exception cref="TextExtractionException">Wraps any exception raised during extraction.</exception>
public Metadata Extract(Func<Metadata, InputStream> streamFactory, System.IO.Stream outputStream)
{
    try
    {
        var tikaParser = new AutoDetectParser();
        var documentMetadata = new Metadata();
        var context = new ParseContext();
        var contentHandler = GetTransformerHandler(outputStream);

        // Key must be the Parser base type, otherwise parts of Tika won't find a usable parser.
        context.set(typeof(org.apache.tika.parser.Parser), tikaParser);

        using (var source = streamFactory(documentMetadata))
        {
            try
            {
                tikaParser.parse(source, contentHandler, documentMetadata, context);
            }
            finally
            {
                source.close();
            }
        }

        return documentMetadata;
    }
    catch (Exception ex)
    {
        throw new TextExtractionException("Extraction failed.", ex);
    }
}
/// <summary>
/// Extracts text and metadata from the file at <paramref name="filePath"/> using Tika's
/// auto-detecting parser.
/// </summary>
/// <param name="filePath">Path of the file to parse; it is converted to a URL and opened through <c>TikaInputStream</c>.</param>
/// <returns>The result assembled from the shared output writer's content and the parse metadata.</returns>
/// <exception cref="ApplicationException">Wraps any exception raised while opening or parsing the file.</exception>
public TextExtractionResult Extract(string filePath)
{
    var parser = new AutoDetectParser();
    var metadata = new Metadata();
    var parseContext = new ParseContext();

    // Register the parser under the Parser base type: Tika components look it up by
    // that key, and would not find a parser registered under its concrete type.
    parseContext.set(typeof(org.apache.tika.parser.Parser), parser);

    try
    {
        var file = new File(filePath);
        var url = file.toURI().toURL();

        using (InputStream inputStream = TikaInputStream.get(url, metadata))
        {
            try
            {
                parser.parse(inputStream, getTransformerHandler(), metadata, parseContext);
            }
            finally
            {
                // Close on the failure path as well, not only after a successful parse.
                inputStream.close();
            }
        }

        return assembleExtractionResult(_outputWriter.toString(), metadata);
    }
    catch (Exception ex)
    {
        throw new ApplicationException("Extraction of text from the file '{0}' failed.".ToFormat(filePath), ex);
    }
}
/// <summary>
/// Runs Tika's auto-detecting parser over the stream supplied by
/// <paramref name="streamFactory"/> and packages the written output with the metadata.
/// </summary>
/// <param name="streamFactory">Creates the input stream; receives the metadata instance used for the parse.</param>
/// <returns>The result assembled from the captured output and the metadata.</returns>
/// <exception cref="TextExtractionException">Wraps any exception raised during extraction.</exception>
public TextExtractionResult Extract(Func<Metadata, InputStream> streamFactory)
{
    try
    {
        var detector = new AutoDetectParser();
        var meta = new Metadata();
        var capturedOutput = new StringWriter();
        var context = new ParseContext();

        // Key must be the Parser base type, otherwise parts of Tika won't find a usable parser.
        context.set(typeof(Parser), detector);

        using (var input = streamFactory(meta))
        {
            try
            {
                var handler = getTransformerHandler(capturedOutput);
                detector.parse(input, handler, meta, context);
            }
            finally
            {
                input.close();
            }
        }

        var text = capturedOutput.ToString();
        return AssembleExtractionResult(text, meta);
    }
    catch (Exception ex)
    {
        throw new TextExtractionException("Extraction failed.", ex);
    }
}
/// <summary>
/// Reads <c>filename.doc</c>, parses it with Tika's auto-detecting parser through an
/// HTML-producing transformer handler, and writes the resulting bytes to
/// <c>C:\toHtml\text.html</c>.
/// </summary>
public void Convert()
{
    byte[] sourceBytes = Files.toByteArray(new File(@"filename.doc"));
    AutoDetectParser autoParser = new AutoDetectParser();
    ByteArrayOutputStream htmlBuffer = new ByteArrayOutputStream();
    SAXTransformerFactory saxFactory = (SAXTransformerFactory)TransformerFactory.newInstance();
    var documentStream = new ByteArrayInputStream(sourceBytes);

    // Detect the document's charset, falling back to UTF_32 when detection yields nothing.
    var detectionMetadata = new Metadata();
    EncodingDetector detector = new UniversalEncodingDetector();
    var charset = detector.detect(documentStream, detectionMetadata) ?? new UTF_32();

    // Configure the serializer: indented HTML in the detected encoding, written to the buffer.
    TransformerHandler htmlHandler = saxFactory.newTransformerHandler();
    var transformer = htmlHandler.getTransformer();
    transformer.setOutputProperty(OutputKeys.METHOD, "html");
    transformer.setOutputProperty(OutputKeys.INDENT, "yes");
    transformer.setOutputProperty(OutputKeys.ENCODING, charset.toString());
    htmlHandler.setResult(new StreamResult(htmlBuffer));

    ContentHandler rewritingHandler = new ImageRewritingContentHandler(htmlHandler);

    ParseContext parseContext = new ParseContext();
    parseContext.set(typeof(EmbeddedDocumentExtractor), new FileEmbeddedDocumentEtractor());

    // The parse receives a fresh Metadata instance, not the one used for charset detection.
    autoParser.parse(documentStream, rewritingHandler, new Metadata(), parseContext);

    byte[] htmlBytes = htmlBuffer.toByteArray();
    System.IO.File.WriteAllBytes(@"C:\toHtml\text.html", htmlBytes);
}
/// <summary>
/// Parses the stream supplied by <paramref name="streamFactory"/> with Tika's auto-detecting
/// parser, collecting content through a <c>TextExtractorContentHandler</c> backed by a
/// <see cref="System.IO.StringWriter"/>.
/// </summary>
/// <param name="streamFactory">Creates the input stream; receives the metadata instance used for the parse.</param>
/// <returns>The result assembled from the collected text and the metadata.</returns>
/// <exception cref="TextExtractionException">Wraps any exception raised during extraction.</exception>
public TextExtractionResult Extract(Func<Metadata, InputStream> streamFactory)
{
    try
    {
        var tikaParser = new AutoDetectParser();
        var documentMetadata = new Metadata();
        var context = new ParseContext();

        // Key must be the Parser base type, otherwise parts of Tika won't find a usable parser.
        context.set(typeof(Parser), tikaParser);

        var textBuffer = new System.IO.StringWriter();
        var textHandler = new TextExtractorContentHandler(textBuffer);

        using (var source = streamFactory(documentMetadata))
        {
            try
            {
                tikaParser.parse(source, textHandler, documentMetadata, context);
            }
            finally
            {
                source.close();
            }
        }

        var extractedText = textBuffer.ToString();
        return AssembleExtractionResult(extractedText, documentMetadata);
    }
    catch (Exception ex)
    {
        throw new TextExtractionException("Extraction failed.", ex);
    }
}
/// <summary>
/// Parses <paramref name="data"/> with Tika's auto-detecting parser and returns everything
/// the transformer handler wrote to its output writer.
/// </summary>
/// <param name="data">Raw document bytes to parse.</param>
/// <returns>The content of the output writer after parsing.</returns>
public string ExtractText(byte[] data)
{
    var parser = new AutoDetectParser();
    var context = new ParseContext();

    // Register the parser under the Parser base type: Tika components look it up by
    // that key, and would not find a parser registered under its concrete type.
    context.set(typeof(org.apache.tika.parser.Parser), parser);

    var metadata = new Metadata();
    using (var output = new StringWriter())
    {
        var transformerHandler = CreateTransformerHandler(output);
        using (var inputStream = TikaInputStream.get(data, metadata))
        {
            try
            {
                parser.parse(inputStream, transformerHandler, metadata, context);
            }
            finally
            {
                // Close on the failure path as well, not only after a successful parse.
                inputStream.close();
            }
        }

        return output.toString();
    }
}
/// <summary>
/// Parses the file at <paramref name="fileName"/> with an auto-detecting parser whose parser
/// map routes both the XPS and the generic OOXML media types to <c>XpsParser</c>, and returns
/// the body text collected by the content handler.
/// </summary>
/// <param name="fileName">Path of the file to read.</param>
/// <returns>The text accumulated by the <c>BodyContentHandler</c>.</returns>
private static string GetContent(string fileName)
{
    using (InputStream stream = new FileInputStream(new File(fileName)))
    {
        AutoDetectParser parser = new AutoDetectParser();
        BodyContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        var xpsParser = new XpsParser();

        // setParsers replaces the parser map wholesale, so both media types must be
        // registered in a single map; calling it once per type keeps only the last one.
        var parsersByType = new java.util.HashMap();
        parsersByType.put(MediaType.application("vnd.ms-xpsdocument"), xpsParser);
        parsersByType.put(MediaType.application("x-tika-ooxml"), xpsParser);
        parser.setParsers(parsersByType);

        parser.parse(stream, handler, metadata);
        return handler.toString();
    }
}
/// <summary>
/// Parses the stream supplied by <paramref name="streamFactory"/> into a
/// <c>ToXMLContentHandler</c> and returns the handler's string form, logging each step at
/// debug level.
/// </summary>
/// <param name="streamFactory">Creates the input stream; receives the metadata instance used for the parse.</param>
/// <returns>The string representation of the content handler after parsing.</returns>
/// <exception cref="TextExtractionException">
/// Wraps any exception raised during extraction; the exception and its inner exception,
/// if present, are logged first.
/// </exception>
private string Extract(Func<Metadata, InputStream> streamFactory)
{
    try
    {
        Logger.Debug("Autodetect parser");
        var tikaParser = new AutoDetectParser();

        Logger.Debug("MetaData");
        var documentMetadata = new Metadata();

        Logger.Debug("ToXMLContentHandler");
        var xmlHandler = new ToXMLContentHandler();

        Logger.Debug("Reading stream");
        using (InputStream source = streamFactory(documentMetadata))
        {
            try
            {
                tikaParser.parse(source, xmlHandler, documentMetadata);
            }
            finally
            {
                Logger.Debug("Closing Stream after auto-detect parser");
                source.close();
            }
        }

        return xmlHandler.ToString();
    }
    catch (System.Exception ex)
    {
        Logger.ErrorFormat(ex, "Extract error: {0}", ex.Message);

        var inner = ex.InnerException;
        if (inner != null)
        {
            Logger.ErrorFormat(inner, "Extract inner error: {0}", inner.Message);
        }

        throw new TextExtractionException("Extraction failed.", ex);
    }
}
/// <summary>
/// Parses the stream supplied by <paramref name="streamFactory"/> into a
/// <c>ToXMLContentHandler</c> and returns the handler's string form, logging each step at
/// debug level.
/// </summary>
/// <param name="streamFactory">Creates the input stream; receives the metadata instance used for the parse.</param>
/// <returns>The string representation of the content handler after parsing.</returns>
/// <exception cref="TextExtractionException">Wraps any exception raised during extraction, after logging it.</exception>
private string Extract(Func<Metadata, InputStream> streamFactory)
{
    try
    {
        Logger.Debug("Autodetect parser");
        var tikaParser = new AutoDetectParser();

        Logger.Debug("MetaData");
        var documentMetadata = new Metadata();

        Logger.Debug("ToXMLContentHandler");
        var xmlHandler = new ToXMLContentHandler();

        Logger.Debug("Reading stream");
        using (InputStream source = streamFactory(documentMetadata))
        {
            try
            {
                tikaParser.parse(source, xmlHandler, documentMetadata);
            }
            finally
            {
                Logger.Debug("Closing Stream after auto-detect parser");
                source.close();
            }
        }

        return xmlHandler.ToString();
    }
    catch (System.Exception ex)
    {
        Logger.ErrorFormat(ex, "Extract {0}", ex.Message);
        throw new TextExtractionException("Extraction failed.", ex);
    }
}
/// <summary>
/// Extracts text and metadata from an in-memory document using Tika's auto-detecting parser.
/// </summary>
/// <param name="data">Raw document bytes to parse.</param>
/// <returns>The result assembled from the shared output writer's content and the parse metadata.</returns>
/// <exception cref="ApplicationException">Wraps any exception raised while parsing the bytes.</exception>
public TextExtractionResult Extract(byte[] data)
{
    var parser = new AutoDetectParser();
    var metadata = new Metadata();
    var parseContext = new ParseContext();

    // Register the parser under the Parser base type: Tika components look it up by
    // that key, and would not find a parser registered under its concrete type.
    parseContext.set(typeof(org.apache.tika.parser.Parser), parser);

    try
    {
        using (InputStream inputStream = TikaInputStream.get(data, metadata))
        {
            try
            {
                parser.parse(inputStream, getTransformerHandler(), metadata, parseContext);
            }
            finally
            {
                // Close on the failure path as well, not only after a successful parse.
                inputStream.close();
            }
        }

        return assembleExtractionResult(_outputWriter.ToString(), metadata);
    }
    catch (Exception ex)
    {
        throw new ApplicationException("Extraction of text from the byte array failed.", ex);
    }
}
/// <summary>
/// Worker loop: after a three-second wait, pops file paths from <c>Program.File_Dir</c>,
/// parses each file with Tika, serializes its metadata (and whitespace-stripped content,
/// when short enough) to JSON, and pushes the JSON onto <c>Program.JSON_Data</c>. It starts
/// or resumes <c>Program.Post_Thread</c> as data accumulates, and finally prints the
/// elapsed time.
/// </summary>
/// <param name="threadname">Label printed alongside the elapsed time when the loop ends.</param>
public void Get(object threadname)
{
    System.Console.WriteLine("Wait for file path 3 second ..");
    Thread.Sleep(3000);

    Stopwatch s = new Stopwatch();
    s.Start(); // start timing

    Dictionary<string, string> value;
    while (Program.File_Dir.Count != 0)
    {
        Parser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        ParseContext pcontext = new ParseContext();
        // -1 disables the handler's write limit so large documents are not truncated.
        BodyContentHandler handler = new BodyContentHandler(-1);
        string filename = "";
        System.Console.WriteLine();

        try
        {
            filename = Program.File_Dir.Pop();
            java.io.File document = new java.io.File(filename);

            // Dispose the stream once parsing finishes; leaving it open leaks one file
            // handle for every processed file.
            using (FileInputStream input = new FileInputStream(document))
            {
                parser.parse(input, handler, metadata, pcontext);
            }
        }
        catch (InvalidOperationException)
        {
            // Pop found the stack empty even though the loop condition saw it non-empty.
            System.Console.WriteLine("堆疊為空");
            break;
        }
        catch (Exception ex)
        {
            System.Console.WriteLine(filename + ": parser error" + ex);
            System.Console.WriteLine();
            continue;
        }

        value = new Dictionary<string, string>();
        value.Add("id", filename);
        foreach (var prop in metadata.names())
        {
            // Skip metadata entries whose name contains "TRC" or "Byte".
            if (prop.Contains("TRC") || prop.Contains("Byte"))
            {
                continue;
            }

            value.Add(prop, metadata.get(prop).ToString());
        }

        string text = handler.toString();
        if (text != "")
        {
            var str = Regex.Replace(text, @"\s", "");
            // Content is only included when the stripped text is under 65535 characters.
            if (str.Length < 65535)
            {
                value.Add("content", str);
            }
        }

        Program.JSON_Data.Push(JsonConvert.SerializeObject(value));

        // Start the posting thread once more than ten documents are queued.
        if (Program.Post_Thread.ThreadState == System.Threading.ThreadState.Unstarted && Program.JSON_Data.Count > 10)
        {
            Program.Post_Thread.Start("POST");
        }

        // Wake the posting thread if it is suspended.
        if (Program.Post_Thread.ThreadState == System.Threading.ThreadState.Suspended)
        {
            Program.Post_Thread.Resume();
        }
    }

    s.Stop();
    System.Console.WriteLine(threadname.ToString() + "," + (s.Elapsed).ToString());
}
/// <summary>
/// Creates the extractor, setting <c>Config</c> to Tika's default configuration and
/// <c>Parser</c> to a new <c>AutoDetectParser</c>.
/// </summary>
public StreamTextExtractor() { Config = TikaConfig.getDefaultConfig(); Parser = new AutoDetectParser(); }
/// <summary>
/// Extracts text and metadata from the file at <paramref name="filePath"/> using Tika's
/// auto-detecting parser.
/// </summary>
/// <param name="filePath">Path of an existing file; it is converted to a URL and opened through <c>TikaInputStream</c>.</param>
/// <returns>The result assembled from the shared output writer's content and the parse metadata.</returns>
/// <exception cref="System.ArgumentException">The file does not exist.</exception>
/// <exception cref="ApplicationException">Wraps any exception raised while opening or parsing the file.</exception>
public TextExtractionResult Extract(string filePath)
{
    if (!System.IO.File.Exists(filePath))
    {
        throw new System.ArgumentException("File does not exist");
    }

    var parser = new AutoDetectParser();
    var metadata = new Metadata();
    var parseContext = new ParseContext();

    // Register the parser under the Parser base type: Tika components look it up by
    // that key, and would not find a parser registered under its concrete type.
    parseContext.set(typeof(org.apache.tika.parser.Parser), parser);

    try
    {
        var file = new File(filePath);
        var url = file.toURI().toURL();

        using (var inputStream = TikaInputStream.get(url, metadata))
        {
            try
            {
                parser.parse(inputStream, getTransformerHandler(), metadata, parseContext);
            }
            finally
            {
                // Close on the failure path as well, not only after a successful parse.
                inputStream.close();
            }
        }

        return assembleExtractionResult(_outputWriter.toString(), metadata);
    }
    catch (Exception ex)
    {
        throw new ApplicationException(string.Format("Extraction of text from the file '{0}' failed.", filePath), ex);
    }
}