コード例 #1
0
        /// <summary>
        /// Parses the file at <paramref name="filePath"/> with an <c>AutoDetectParser</c> and
        /// returns the assembled extraction result.
        /// </summary>
        /// <param name="filePath">Path of the file to parse.</param>
        /// <returns>
        /// The value produced by <c>assembleExtractionResult</c> from the output writer's text
        /// and the metadata collected during parsing.
        /// </returns>
        /// <exception cref="ApplicationException">
        /// Wraps any exception raised while opening or parsing the file.
        /// </exception>
        public TextExtractionResult Extract(string filePath)
        {
            AutoDetectParser autoDetectParser = new AutoDetectParser();
            Metadata         fileMetadata     = new Metadata();
            ParseContext     context          = new ParseContext();

            // Register the parser in the parse context under its own class.
            Class parserType = autoDetectParser.GetType();
            context.set(parserType, autoDetectParser);

            try
            {
                // The file is handed to Tika as a URL rather than as a plain path.
                var fileUrl = new File(filePath).toURI().toURL();

                using (InputStream source = TikaInputStream.get(fileUrl, fileMetadata))
                {
                    autoDetectParser.parse(source, getTransformerHandler(), fileMetadata, context);
                    source.close();
                }

                var extractedText = _outputWriter.toString();
                return assembleExtractionResult(extractedText, fileMetadata);
            }
            catch (Exception ex)
            {
                var message = "Extraction of text from the file '{0}' failed.".ToFormat(filePath);
                throw new ApplicationException(message, ex);
            }
        }
コード例 #2
0
        /// <summary>
        /// Runs the extraction for the file at <paramref name="filePath"/> and returns the
        /// string produced by the stream-factory based <c>Extract</c> overload
        /// (presumably HTML, going by the method name).
        /// </summary>
        /// <param name="filePath">Path of the file to read.</param>
        /// <param name="password">
        /// Optional password to open the file. When it is neither null nor empty it is added to
        /// the metadata under the key <c>org.apache.tika.parser.pdf.password</c>.
        /// </param>
        /// <returns>The string returned by <c>Extract</c> for the opened stream.</returns>
        /// <exception cref="TextExtractionException">
        /// Wraps any exception raised during extraction; the failure is logged first.
        /// </exception>
        public string GetHtmlContent(string filePath, String password)
        {
            try
            {
                Logger.DebugFormat("GetHtmlContent for filePath: {0}", filePath);
                var sourceFile = new File(filePath);

                // Factory invoked by Extract with the metadata instance it wants populated.
                Func<Metadata, InputStream> openStream = meta =>
                {
                    Logger.DebugFormat("Extract metadata for {0}", filePath);

                    bool hasPassword = !String.IsNullOrEmpty(password);
                    if (hasPassword)
                    {
                        meta.add("org.apache.tika.parser.pdf.password", password);
                    }

                    InputStream stream = TikaInputStream.get(sourceFile, meta);
                    Logger.DebugFormat("Return tikaInputStream for {0}", filePath);
                    return stream;
                };

                return this.Extract(openStream);
            }
            catch (System.Exception ex)
            {
                Logger.ErrorFormat(ex, "Error on GetHtmlContent for {0}", filePath);

                var message = StringExtrensions.ToFormat(
                    "Extraction of text from the file '{0}' failed.",
                    new object[] { filePath });

                throw new TextExtractionException(message, ex);
            }
        }
コード例 #3
0
        /// <summary>
        /// Loads the content of <paramref name="file"/> as a byte array, parses it with an
        /// <c>AutoDetectParser</c> and returns the assembled extraction result.
        /// </summary>
        /// <param name="applicationId">Passed to <c>file.toByteArray</c> when loading the content.</param>
        /// <param name="file">The document whose bytes are parsed.</param>
        /// <param name="succeed">
        /// Set to <c>false</c> when extraction fails. It is left untouched on success, so the
        /// caller is expected to initialise it.
        /// </param>
        /// <param name="errorText">
        /// On failure, receives the full description of the exception (type, message, inner
        /// exceptions and stack trace). Left untouched on success.
        /// </param>
        /// <returns>The extraction result, or <c>null</c> when any exception occurred.</returns>
        public TextExtractionResult Extract(Guid applicationId, DocFileInfo file, ref bool succeed, ref string errorText)
        {
            try
            {
                AutoDetectParser parser       = new AutoDetectParser();
                Metadata         metadata     = new Metadata();
                ParseContext     parseContext = new ParseContext();
                Class            parserClass  = parser.GetType();

                // Register the parser in the parse context under its own class.
                parseContext.set(parserClass, parser);

                byte[] fileContent = file.toByteArray(applicationId);

                using (InputStream inputStream = TikaInputStream.get(fileContent, metadata))
                {
                    parser.parse(inputStream, getTransformerHandler(), metadata, parseContext);
                    inputStream.close();
                }

                return assembleExtractionResult(_outputWriter.toString(), metadata);
            }
            catch (Exception ex)
            {
                // ex.StackTrace may be null, and on its own it omits the exception type and
                // message; ex.ToString() is null-safe and carries all of that plus the trace.
                errorText = ex.ToString();
                succeed   = false;
                return null;
            }
        }
コード例 #4
0
        /// <summary>
        /// Downloads the resource at <paramref name="uri"/> and extracts text from the
        /// downloaded bytes.
        /// </summary>
        /// <param name="uri">Address of the resource to download; also recorded in the metadata under the key <c>Uri</c>.</param>
        /// <returns>The result of the stream-factory based <c>Extract</c> overload.</returns>
        public TextExtractionResult Extract(Uri uri)
        {
            return Extract(metadata =>
            {
                metadata.add("Uri", uri.ToString());

                // WebClient is IDisposable; release it as soon as the download completes.
                byte[] pageBytes;
                using (var webClient = new WebClient())
                {
                    pageBytes = webClient.DownloadData(uri);
                }

                return TikaInputStream.get(pageBytes, metadata);
            });
        }
コード例 #5
0
        public void should_throw_when_given_closed_stream()
        {
            // Arrange: a stream that has already been disposed, plus some arbitrary input bytes.
            var payload        = new byte[] { 0, 1, 2, 3 };
            var disposedStream = new MemoryStream();
            disposedStream.Dispose();

            // Act: defer the call so the assertion can observe the exception.
            Action extract = () => _cut.Extract(meta => TikaInputStream.get(payload, meta), disposedStream);

            // Assert: the failure surfaces as TextExtractionException wrapping ArgumentException.
            extract.ShouldThrow <TextExtractionException>()
                   .WithInnerException <ArgumentException>();
        }
コード例 #6
0
        /// <summary>
        /// Extracts text from the resource identified by <paramref name="uri"/>, letting
        /// <c>TikaInputStream</c> open it from the equivalent <c>java.net.URI</c>.
        /// </summary>
        /// <param name="uri">Address of the resource; also recorded in the metadata under the key <c>Uri</c>.</param>
        /// <returns>The result of the stream-factory based <c>Extract</c> overload.</returns>
        public TextExtractionResult Extract(Uri uri)
        {
            // Converted eagerly, so a malformed URI fails here rather than inside the factory.
            java.net.URI javaUri = new java.net.URI(uri.ToString());

            return Extract(meta =>
            {
                var stream = TikaInputStream.get(javaUri, meta);

                // The key is added only after the stream has been obtained.
                meta.add("Uri", uri.ToString());

                return stream;
            });
        }
コード例 #7
0
        public void should_throw_when_given_a_disposed_stream()
        {
            // Arrange: arbitrary input bytes and a stream that is disposed before use.
            var payload     = new byte[] { 0, 1, 2, 3 };
            var deadStream  = new MemoryStream();
            deadStream.Dispose();

            // Act: defer the call so the assertion can observe the exception.
            Action extract = () => _cut.Extract(meta => TikaInputStream.get(payload, meta), deadStream);

            // Assert
            extract.Should().Throw <TextExtractionException>();
        }
コード例 #8
0
        public void should_throw_when_given_a_closed_stream()
        {
            const int bufferSize = 4096;

            // Arrange: a file-backed stream over a temp file, closed before it is used.
            // FileOptions.DeleteOnClose removes the temp file when the stream is closed.
            var tempPath   = Path.GetTempFileName();
            var payload    = new byte[] { 0, 1, 2, 3 };
            var fileStream = File.Create(tempPath, bufferSize, FileOptions.DeleteOnClose);
            fileStream.Close();

            // Act: defer the call so the assertion can observe the exception.
            Action extract = () => _cut.Extract(meta => TikaInputStream.get(payload, meta), fileStream);

            // Assert
            extract.ShouldThrow <TextExtractionException>();
        }
コード例 #9
0
        /// <summary>
        /// Downloads the resource at <paramref name="uri"/> and runs the extraction on the
        /// downloaded bytes, shaping the result with <paramref name="extractionResultAssembler"/>.
        /// </summary>
        /// <typeparam name="TExtractionResult">Type of the assembled result.</typeparam>
        /// <param name="uri">Address of the resource to download; also recorded in the metadata under the key <c>Uri</c>.</param>
        /// <param name="extractionResultAssembler">Forwarded unchanged to the stream-factory based <c>Extract</c> overload.</param>
        /// <returns>The value returned by the stream-factory based <c>Extract</c> overload.</returns>
        public TExtractionResult Extract <TExtractionResult>(
            Uri uri,
            Func <string, Metadata, TExtractionResult> extractionResultAssembler
            )
        {
            return Extract(UrlStreamFactory, extractionResultAssembler);

            InputStream UrlStreamFactory(Metadata metadata)
            {
                metadata.add("Uri", uri.ToString());

                // WebClient is IDisposable; release it as soon as the download completes.
                byte[] pageBytes;
                using (var webClient = new WebClient())
                {
                    pageBytes = webClient.DownloadData(uri);
                }

                return TikaInputStream.get(pageBytes, metadata);
            }
        }
コード例 #10
0
 /// <summary>
 /// Extracts text from the file at <paramref name="filePath"/> through the
 /// stream-factory based <c>Extract</c> overload.
 /// </summary>
 /// <param name="filePath">Path of the file to read; also recorded in the metadata under the key <c>FilePath</c>.</param>
 /// <returns>The result of the stream-factory based <c>Extract</c> overload.</returns>
 /// <exception cref="TextExtractionException">Wraps any exception raised during extraction.</exception>
 public TextExtractionResult Extract(string filePath)
 {
     try
     {
         var sourceFile = new File(filePath);

         return Extract(meta =>
         {
             var stream = TikaInputStream.get(sourceFile, meta);

             // The key is added only after the stream has been obtained.
             meta.add("FilePath", filePath);

             return stream;
         });
     }
     catch (Exception ex)
     {
         var message = "Extraction of text from the file '{0}' failed.".ToFormat(filePath);
         throw new TextExtractionException(message, ex);
     }
 }
コード例 #11
0
        /// <summary>
        /// Parses the content of a .NET stream, shaping the result with
        /// <paramref name="extractionResultAssembler"/>.
        /// </summary>
        /// <typeparam name="TExtractionResult">Type of the assembled result.</typeparam>
        /// <param name="inputStream">The .NET stream to read; it is wrapped in an <c>ikvm.io.InputStreamWrapper</c>.</param>
        /// <param name="extractionResultAssembler">Forwarded unchanged to the stream-factory based <c>Parse</c> overload.</param>
        /// <returns>The value returned by the stream-factory based <c>Parse</c> overload.</returns>
        /// <exception cref="TextExtractionException">Wraps any exception raised during parsing.</exception>
        public TExtractionResult Parse <TExtractionResult>(
            System.IO.Stream inputStream,
            Func <string, Metadata, TExtractionResult> extractionResultAssembler
            )
        {
            // Adapts the .NET stream to a Java InputStream; the metadata argument is not used here.
            InputStream WrapSystemStream(Metadata metadata)
            {
                var javaStream = new ikvm.io.InputStreamWrapper(inputStream);
                return TikaInputStream.get(javaStream);
            }

            try
            {
                return Parse(WrapSystemStream, extractionResultAssembler);
            }
            catch (Exception ex)
            {
                throw new TextExtractionException("Extraction of text from stream failed.", ex);
            }
        }
コード例 #12
0
        /// <summary>
        /// Extracts from the file at <paramref name="filePath"/>, shaping the result with
        /// <paramref name="extractionResultAssembler"/>.
        /// </summary>
        /// <typeparam name="TExtractionResult">Type of the assembled result.</typeparam>
        /// <param name="filePath">Path of the file to read; also recorded in the metadata under the key <c>FilePath</c>.</param>
        /// <param name="extractionResultAssembler">Forwarded unchanged to the stream-factory based <c>Extract</c> overload.</param>
        /// <returns>The value returned by the stream-factory based <c>Extract</c> overload.</returns>
        /// <exception cref="TextExtractionException">Wraps any exception raised during extraction.</exception>
        public TExtractionResult Extract <TExtractionResult>(
            string filePath,
            Func <string, Metadata, TExtractionResult> extractionResultAssembler
            )
        {
            // Opens the file as a Java stream, wraps it for Tika, then records the path.
            InputStream OpenFileStream(Metadata metadata)
            {
                var fileStream = new FileInputStream(filePath);
                var tikaStream = TikaInputStream.get(fileStream);

                metadata.add("FilePath", filePath);

                return tikaStream;
            }

            try
            {
                return Extract(OpenFileStream, extractionResultAssembler);
            }
            catch (Exception ex)
            {
                var message = "Extraction of text from the file '{0}' failed.".ToFormat(filePath);
                throw new TextExtractionException(message, ex);
            }
        }
コード例 #13
0
        /// <summary>
        /// Parses the in-memory content <paramref name="data"/> with an <c>AutoDetectParser</c>
        /// and returns the assembled extraction result.
        /// </summary>
        /// <param name="data">Raw bytes of the document to parse.</param>
        /// <returns>
        /// The value produced by <c>assembleExtractionResult</c> from the output writer's text
        /// and the metadata collected during parsing.
        /// </returns>
        /// <exception cref="ApplicationException">Wraps any exception raised while parsing.</exception>
        public TextExtractionResult Extract(byte[] data)
        {
            AutoDetectParser autoDetectParser = new AutoDetectParser();
            Metadata         dataMetadata     = new Metadata();
            ParseContext     context          = new ParseContext();

            // Register the parser in the parse context under its own class.
            Class parserType = autoDetectParser.GetType();
            context.set(parserType, autoDetectParser);

            try
            {
                using (InputStream source = TikaInputStream.get(data, dataMetadata))
                {
                    autoDetectParser.parse(source, getTransformerHandler(), dataMetadata, context);
                    source.close();
                }

                var extractedText = _outputWriter.ToString();
                return assembleExtractionResult(extractedText, dataMetadata);
            }
            catch (Exception ex)
            {
                const string message = "Extraction of text from the byte array failed.";
                throw new ApplicationException(message, ex);
            }
        }
コード例 #14
0
 /// <summary>
 /// Extracts from the in-memory content <paramref name="data"/>, shaping the result with
 /// <paramref name="extractionResultAssembler"/>.
 /// </summary>
 /// <typeparam name="TExtractionResult">Type of the assembled result.</typeparam>
 /// <param name="data">Raw bytes of the document.</param>
 /// <param name="extractionResultAssembler">Forwarded unchanged to the stream-factory based <c>Extract</c> overload.</param>
 /// <returns>The value returned by the stream-factory based <c>Extract</c> overload.</returns>
 public TExtractionResult Extract <TExtractionResult>(byte[] data, Func <string, Metadata, TExtractionResult> extractionResultAssembler)
 {
     return Extract(
         meta =>
         {
             // Wrap the bytes in a Tika stream bound to the supplied metadata.
             var stream = TikaInputStream.get(data, meta);
             return stream;
         },
         extractionResultAssembler);
 }
コード例 #15
0
 /// <summary>
 /// Extracts text from the in-memory content <paramref name="data"/> through the
 /// stream-factory based <c>Extract</c> overload.
 /// </summary>
 /// <param name="data">Raw bytes of the document.</param>
 /// <returns>The result of the stream-factory based <c>Extract</c> overload.</returns>
 public TextExtractionResult Extract(byte[] data)
 {
     return Extract(meta =>
     {
         // Wrap the bytes in a Tika stream bound to the supplied metadata.
         var stream = TikaInputStream.get(data, meta);
         return stream;
     });
 }