/// <summary>
/// Parses the file at <paramref name="filePath"/> with an <c>AutoDetectParser</c> and
/// assembles the extraction result from the output writer's text and the collected metadata.
/// </summary>
/// <param name="filePath">Path of the file to parse.</param>
/// <returns>The result built by <c>assembleExtractionResult</c>.</returns>
/// <exception cref="ApplicationException">
/// Wraps any exception raised while opening or parsing the file.
/// </exception>
public TextExtractionResult Extract(string filePath)
{
    var parser = new AutoDetectParser();
    var metadata = new Metadata();
    var parseContext = new ParseContext();

    // The parser is registered in the context under its own runtime class.
    // NOTE(review): Tika examples usually register under the Parser interface class - confirm this key is intended.
    Class parserClass = parser.GetType();
    parseContext.set(parserClass, parser);

    try
    {
        var file = new File(filePath);
        var url = file.toURI().toURL();

        // The using block disposes the stream on exit (success or failure),
        // so no explicit close() call is needed inside it.
        using (InputStream inputStream = TikaInputStream.get(url, metadata))
        {
            parser.parse(inputStream, getTransformerHandler(), metadata, parseContext);
        }

        return assembleExtractionResult(_outputWriter.toString(), metadata);
    }
    catch (Exception ex)
    {
        throw new ApplicationException("Extraction of text from the file '{0}' failed.".ToFormat(filePath), ex);
    }
}
/// <summary>
/// Runs the stream-factory <c>Extract</c> overload against the file at
/// <paramref name="filePath"/> and returns its string result. When a password is
/// supplied it is added to the Tika metadata under the key
/// <c>org.apache.tika.parser.pdf.password</c> before the stream is opened.
/// </summary>
/// <param name="filePath">Path of the file to extract content from.</param>
/// <param name="password">Optional password to open the file; ignored when null or empty.</param>
/// <returns>The string produced by the <c>Extract</c> overload this method delegates to.</returns>
/// <exception cref="TextExtractionException">
/// Wraps any exception raised during extraction; the failure is logged first.
/// </exception>
public string GetHtmlContent(string filePath, String password)
{
    try
    {
        Logger.DebugFormat("GetHtmlContent for filePath: {0}", filePath);
        var sourceFile = new File(filePath);

        // Factory invoked by Extract with the metadata instance it wants the stream bound to.
        Func<Metadata, InputStream> openStream = metadata =>
        {
            Logger.DebugFormat("Extract metadata for {0}", filePath);

            bool hasPassword = !String.IsNullOrEmpty(password);
            if (hasPassword)
            {
                metadata.add("org.apache.tika.parser.pdf.password", password);
            }

            var stream = TikaInputStream.get(sourceFile, metadata);
            Logger.DebugFormat("Return tikaInputStream for {0}", filePath);
            return (InputStream)stream;
        };

        return this.Extract(openStream);
    }
    catch (System.Exception ex)
    {
        Logger.ErrorFormat(ex, "Error on GetHtmlContent for {0}", filePath);

        var failureMessage = StringExtrensions.ToFormat(
            "Extraction of text from the file '{0}' failed.",
            new object[] { (object)filePath });
        throw new TextExtractionException(failureMessage, ex);
    }
}
/// <summary>
/// Parses the content of <paramref name="file"/> (obtained via
/// <c>file.toByteArray(applicationId)</c>) with an <c>AutoDetectParser</c> and assembles
/// the extraction result. Failures are reported through the <c>ref</c> parameters
/// instead of being thrown.
/// </summary>
/// <param name="applicationId">Identifier passed to <c>file.toByteArray</c> when loading the bytes.</param>
/// <param name="file">The document whose bytes are parsed.</param>
/// <param name="succeed">Set to <c>false</c> on failure; left untouched on success.</param>
/// <param name="errorText">
/// Set to the exception's stack trace on failure (or to the exception's string form when
/// no stack trace is available); left untouched on success.
/// </param>
/// <returns>The assembled result, or <c>null</c> when extraction failed.</returns>
public TextExtractionResult Extract(Guid applicationId, DocFileInfo file, ref bool succeed, ref string errorText)
{
    try
    {
        AutoDetectParser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        ParseContext parseContext = new ParseContext();

        // The parser is registered in the context under its own runtime class.
        // NOTE(review): Tika examples usually register under the Parser interface class - confirm this key is intended.
        Class parserClass = parser.GetType();
        parseContext.set(parserClass, parser);

        byte[] fileContent = file.toByteArray(applicationId);

        // The using block disposes the stream on exit, so no explicit close() is needed.
        using (InputStream inputStream = TikaInputStream.get(fileContent, metadata))
        {
            parser.parse(inputStream, getTransformerHandler(), metadata, parseContext);
        }

        return assembleExtractionResult(_outputWriter.toString(), metadata);
    }
    catch (Exception ex)
    {
        // StackTrace may be null; fall back to the exception text so this handler
        // itself can never throw and break the "report via ref parameters" contract.
        errorText = ex.StackTrace ?? ex.ToString();
        succeed = false;
        return null;
    }
}
/// <summary>
/// Downloads the resource at <paramref name="uri"/> and extracts it through the
/// stream-factory <c>Extract</c> overload. The URI is recorded in the metadata
/// under the key <c>Uri</c> before the download starts.
/// </summary>
/// <param name="uri">Address of the resource to download and extract.</param>
/// <returns>The result of the stream-factory <c>Extract</c> overload.</returns>
public TextExtractionResult Extract(Uri uri)
{
    return Extract(metadata =>
    {
        metadata.add("Uri", uri.ToString());

        // WebClient is disposable; release it as soon as the bytes are in memory
        // rather than leaving it to the finalizer.
        byte[] pageBytes;
        using (var webClient = new WebClient())
        {
            pageBytes = webClient.DownloadData(uri);
        }

        return TikaInputStream.get(pageBytes, metadata);
    });
}
/// <summary>
/// Passing an already-disposed output stream to <c>Extract</c> must surface as a
/// <c>TextExtractionException</c> whose inner exception is an <c>ArgumentException</c>.
/// </summary>
public void should_throw_when_given_closed_stream()
{
    // Arrange: a small in-memory payload and a stream that is disposed before use.
    var payload = new byte[] { 0, 1, 2, 3 };
    var closedStream = new MemoryStream();
    closedStream.Dispose();

    // Act (deferred): extraction only runs when the assertion invokes the delegate.
    Action extracting = () => _cut.Extract(metadata => TikaInputStream.get(payload, metadata), closedStream);

    // Assert
    extracting
        .ShouldThrow<TextExtractionException>()
        .WithInnerException<ArgumentException>();
}
/// <summary>
/// Extracts the resource at <paramref name="uri"/> by handing a <c>java.net.URI</c>
/// to <c>TikaInputStream.get</c>, then records the URI in the metadata under the key <c>Uri</c>.
/// </summary>
/// <param name="uri">Address of the resource to extract.</param>
/// <returns>The result of the stream-factory <c>Extract</c> overload.</returns>
public TextExtractionResult Extract(Uri uri)
{
    // Converted eagerly, before the factory runs, so a conversion failure is raised from here.
    var javaUri = new java.net.URI(uri.ToString());

    return Extract(metadata =>
    {
        // The stream is opened first; the "Uri" entry is added afterwards.
        var stream = TikaInputStream.get(javaUri, metadata);
        metadata.add("Uri", uri.ToString());
        return stream;
    });
}
/// <summary>
/// Passing an already-disposed output stream to <c>Extract</c> must surface as a
/// <c>TextExtractionException</c>.
/// </summary>
public void should_throw_when_given_a_disposed_stream()
{
    // Arrange: a small in-memory payload and a stream that is disposed before use.
    var payload = new byte[] { 0, 1, 2, 3 };
    var disposedStream = new MemoryStream();
    disposedStream.Dispose();

    // Act (deferred): extraction only runs when the assertion invokes the delegate.
    Action extracting = () => _cut.Extract(metadata => TikaInputStream.get(payload, metadata), disposedStream);

    // Assert
    extracting.Should().Throw<TextExtractionException>();
}
/// <summary>
/// Passing an already-closed file stream to <c>Extract</c> must surface as a
/// <c>TextExtractionException</c>.
/// </summary>
public void should_throw_when_given_a_closed_stream()
{
    const int bufferSize = 4096;

    // Arrange: a temp-file stream opened with DeleteOnClose and closed before it is used.
    var payload = new byte[] { 0, 1, 2, 3 };
    var tempFilePath = Path.GetTempFileName();
    var closedStream = File.Create(tempFilePath, bufferSize, FileOptions.DeleteOnClose);
    closedStream.Close();

    // Act (deferred): extraction only runs when the assertion invokes the delegate.
    Action extracting = () => _cut.Extract(metadata => TikaInputStream.get(payload, metadata), closedStream);

    // Assert
    extracting.ShouldThrow<TextExtractionException>();
}
/// <summary>
/// Downloads the resource at <paramref name="uri"/> and extracts it through the
/// stream-factory <c>Extract</c> overload, letting
/// <paramref name="extractionResultAssembler"/> build the final result.
/// </summary>
/// <typeparam name="TExtractionResult">Type produced by the assembler.</typeparam>
/// <param name="uri">Address of the resource to download and extract.</param>
/// <param name="extractionResultAssembler">
/// Callback forwarded unchanged to the stream-factory overload.
/// </param>
/// <returns>The value returned by the stream-factory <c>Extract</c> overload.</returns>
public TExtractionResult Extract<TExtractionResult>(
    Uri uri,
    Func<string, Metadata, TExtractionResult> extractionResultAssembler)
{
    return Extract(UrlStreamFactory, extractionResultAssembler);

    // Records the URI in the metadata, downloads the resource and wraps the bytes in a Tika stream.
    InputStream UrlStreamFactory(Metadata metadata)
    {
        metadata.add("Uri", uri.ToString());

        // WebClient is disposable; release it as soon as the bytes are in memory
        // rather than leaving it to the finalizer.
        byte[] pageBytes;
        using (var webClient = new WebClient())
        {
            pageBytes = webClient.DownloadData(uri);
        }

        return TikaInputStream.get(pageBytes, metadata);
    }
}
/// <summary>
/// Extracts the file at <paramref name="filePath"/> through the stream-factory
/// <c>Extract</c> overload and records the path in the metadata under the key <c>FilePath</c>.
/// </summary>
/// <param name="filePath">Path of the file to extract.</param>
/// <returns>The result of the stream-factory <c>Extract</c> overload.</returns>
/// <exception cref="TextExtractionException">
/// Wraps any exception raised while creating the file handle or extracting.
/// </exception>
public TextExtractionResult Extract(string filePath)
{
    try
    {
        var sourceFile = new File(filePath);

        return Extract(metadata =>
        {
            // The stream is opened first; the "FilePath" entry is added afterwards.
            var stream = TikaInputStream.get(sourceFile, metadata);
            metadata.add("FilePath", filePath);
            return stream;
        });
    }
    catch (Exception ex)
    {
        var failureMessage = "Extraction of text from the file '{0}' failed.".ToFormat(filePath);
        throw new TextExtractionException(failureMessage, ex);
    }
}
/// <summary>
/// Parses a .NET stream by wrapping it in an <c>ikvm.io.InputStreamWrapper</c> and
/// delegating to the stream-factory <c>Parse</c> overload.
/// </summary>
/// <typeparam name="TExtractionResult">Type produced by the assembler.</typeparam>
/// <param name="inputStream">The .NET stream to read from.</param>
/// <param name="extractionResultAssembler">
/// Callback forwarded unchanged to the stream-factory overload.
/// </param>
/// <returns>The value returned by the stream-factory <c>Parse</c> overload.</returns>
/// <exception cref="TextExtractionException">Wraps any exception raised while parsing.</exception>
public TExtractionResult Parse<TExtractionResult>(
    System.IO.Stream inputStream,
    Func<string, Metadata, TExtractionResult> extractionResultAssembler)
{
    try
    {
        return Parse(WrapSystemStream, extractionResultAssembler);
    }
    catch (Exception ex)
    {
        throw new TextExtractionException("Extraction of text from stream failed.", ex);
    }

    // Adapts the .NET stream to a Java InputStream; the metadata argument is not used here.
    InputStream WrapSystemStream(Metadata metadata) =>
        TikaInputStream.get(new ikvm.io.InputStreamWrapper(inputStream));
}
/// <summary>
/// Extracts the file at <paramref name="filePath"/> through the stream-factory
/// <c>Extract</c> overload, letting <paramref name="extractionResultAssembler"/>
/// build the final result.
/// </summary>
/// <typeparam name="TExtractionResult">Type produced by the assembler.</typeparam>
/// <param name="filePath">Path of the file to extract.</param>
/// <param name="extractionResultAssembler">
/// Callback forwarded unchanged to the stream-factory overload.
/// </param>
/// <returns>The value returned by the stream-factory <c>Extract</c> overload.</returns>
/// <exception cref="TextExtractionException">Wraps any exception raised during extraction.</exception>
public TExtractionResult Extract<TExtractionResult>(
    string filePath,
    Func<string, Metadata, TExtractionResult> extractionResultAssembler)
{
    try
    {
        return Extract(OpenFileStream, extractionResultAssembler);
    }
    catch (Exception ex)
    {
        var failureMessage = "Extraction of text from the file '{0}' failed.".ToFormat(filePath);
        throw new TextExtractionException(failureMessage, ex);
    }

    // Opens the file as a Tika stream, then records the path under the "FilePath" metadata key.
    InputStream OpenFileStream(Metadata metadata)
    {
        var fileStream = new FileInputStream(filePath);
        var tikaStream = TikaInputStream.get(fileStream);
        metadata.add("FilePath", filePath);
        return tikaStream;
    }
}
/// <summary>
/// Parses <paramref name="data"/> with an <c>AutoDetectParser</c> and assembles the
/// extraction result from the output writer's text and the collected metadata.
/// </summary>
/// <param name="data">Raw bytes of the document to parse.</param>
/// <returns>The result built by <c>assembleExtractionResult</c>.</returns>
/// <exception cref="ApplicationException">
/// Wraps any exception raised while opening or parsing the byte array.
/// </exception>
public TextExtractionResult Extract(byte[] data)
{
    var parser = new AutoDetectParser();
    var metadata = new Metadata();
    var parseContext = new ParseContext();

    // The parser is registered in the context under its own runtime class.
    // NOTE(review): Tika examples usually register under the Parser interface class - confirm this key is intended.
    Class parserClass = parser.GetType();
    parseContext.set(parserClass, parser);

    try
    {
        // The using block disposes the stream on exit (success or failure),
        // so no explicit close() call is needed inside it.
        using (InputStream inputStream = TikaInputStream.get(data, metadata))
        {
            parser.parse(inputStream, getTransformerHandler(), metadata, parseContext);
        }

        return assembleExtractionResult(_outputWriter.ToString(), metadata);
    }
    catch (Exception ex)
    {
        throw new ApplicationException("Extraction of text from the byte array failed.", ex);
    }
}
/// <summary>
/// Extracts an in-memory document by wrapping <paramref name="data"/> in a Tika stream
/// and delegating to the stream-factory <c>Extract</c> overload.
/// </summary>
/// <typeparam name="TExtractionResult">Type produced by the assembler.</typeparam>
/// <param name="data">Raw bytes of the document to extract.</param>
/// <param name="extractionResultAssembler">
/// Callback forwarded unchanged to the stream-factory overload.
/// </param>
/// <returns>The value returned by the stream-factory <c>Extract</c> overload.</returns>
public TExtractionResult Extract<TExtractionResult>(
    byte[] data,
    Func<string, Metadata, TExtractionResult> extractionResultAssembler)
    => Extract(
        metadata => TikaInputStream.get(data, metadata),
        extractionResultAssembler);
/// <summary>
/// Extracts an in-memory document by wrapping <paramref name="data"/> in a Tika stream
/// and delegating to the stream-factory <c>Extract</c> overload.
/// </summary>
/// <param name="data">Raw bytes of the document to extract.</param>
/// <returns>The result of the stream-factory <c>Extract</c> overload.</returns>
public TextExtractionResult Extract(byte[] data)
    => Extract(metadata => TikaInputStream.get(data, metadata));