Esempio n. 1
0
        /// <summary>
        /// Loads the bytes of <paramref name="file"/> and runs Tika's auto-detecting parser over them,
        /// returning the assembled text and metadata.
        /// </summary>
        /// <param name="applicationId">Passed to <c>file.toByteArray</c> when loading the file content.</param>
        /// <param name="file">The document whose content is parsed.</param>
        /// <param name="succeed">Set to <c>false</c> when extraction fails; not modified on success.</param>
        /// <param name="errorText">Receives the full exception description (type, message and stack trace) on failure; not modified on success.</param>
        /// <returns>The assembled extraction result, or <c>null</c> when any exception was raised.</returns>
        public TextExtractionResult Extract(Guid applicationId, DocFileInfo file, ref bool succeed, ref string errorText)
        {
            try
            {
                AutoDetectParser parser       = new AutoDetectParser();
                Metadata         metadata     = new Metadata();
                ParseContext     parseContext = new ParseContext();

                // Register the parser under the Parser base type: Tika components look the
                // parser up by that key, so keying it by the concrete runtime class hides it from them.
                parseContext.set(typeof(org.apache.tika.parser.Parser), parser);

                byte[] fileContent = file.toByteArray(applicationId);

                using (InputStream inputStream = TikaInputStream.get(fileContent, metadata))
                {
                    parser.parse(inputStream, getTransformerHandler(), metadata, parseContext);
                    inputStream.close();
                }

                return assembleExtractionResult(_outputWriter.toString(), metadata);
            }
            catch (Exception ex)
            {
                // ex.ToString() carries the exception type and message in addition to the stack trace;
                // the stack trace alone does not say what went wrong.
                errorText = ex.ToString();
                succeed   = false;
                return null;
            }
        }
Esempio n. 2
0
		/// <summary>
		/// Runs Tika's auto-detecting parser over the stream produced by <paramref name="streamFactory"/>
		/// and packages the captured output text together with the collected metadata.
		/// </summary>
		/// <param name="streamFactory">Invoked with the metadata instance to obtain the input stream to parse.</param>
		/// <returns>The value produced by <c>assembleExtractionResult</c> for the captured text and metadata.</returns>
		/// <exception cref="TextExtractionException">Wraps any exception raised during setup or parsing.</exception>
		public TextExtractionResult Extract(Func<Metadata, InputStream> streamFactory)
		{
			try
			{
				var detectingParser = new AutoDetectParser();
				var documentMetadata = new Metadata();
				var capturedText = new StringWriter();
				var context = new ParseContext();

				// Key the parser by the Parser base type; otherwise parts of Tika won't find a usable parser.
				context.set(typeof(org.apache.tika.parser.Parser), detectingParser);

				using (var source = streamFactory(documentMetadata))
				{
					try
					{
						var handler = getTransformerHandler(capturedText);
						detectingParser.parse(source, handler, documentMetadata, context);
					}
					finally
					{
						source.close();
					}
				}

				string text = capturedText.ToString();
				return assembleExtractionResult(text, documentMetadata);
			}
			catch (Exception ex)
			{
				throw new TextExtractionException("Extraction failed.", ex);
			}
		}
Esempio n. 3
0
        /// <summary>
        /// Parses the stream supplied by <paramref name="streamFactory"/> with Tika's auto-detecting parser,
        /// sending parse events to the handler built for <paramref name="outputStream"/>.
        /// </summary>
        /// <param name="streamFactory">Invoked with the metadata instance to obtain the input stream to parse.</param>
        /// <param name="outputStream">Passed to <c>GetTransformerHandler</c> to build the content handler.</param>
        /// <returns>The metadata instance that was handed to the parser.</returns>
        /// <exception cref="TextExtractionException">Wraps any exception raised during setup or parsing.</exception>
        public Metadata Extract(Func <Metadata, InputStream> streamFactory, System.IO.Stream outputStream)
        {
            try
            {
                var detectingParser  = new AutoDetectParser();
                var documentMetadata = new Metadata();
                var context          = new ParseContext();
                var contentHandler   = GetTransformerHandler(outputStream);

                // Key the parser by the Parser base type; otherwise parts of Tika won't find a usable parser.
                context.set(typeof(org.apache.tika.parser.Parser), detectingParser);

                using (var source = streamFactory(documentMetadata))
                {
                    try
                    {
                        detectingParser.parse(source, contentHandler, documentMetadata, context);
                    }
                    finally
                    {
                        source.close();
                    }
                }

                return documentMetadata;
            }
            catch (Exception ex)
            {
                throw new TextExtractionException("Extraction failed.", ex);
            }
        }
Esempio n. 4
0
        /// <summary>
        /// Extracts text and metadata from the file at <paramref name="filePath"/> using Tika's auto-detecting parser.
        /// </summary>
        /// <param name="filePath">Path of the file to parse; it is converted to a URL before being opened.</param>
        /// <returns>The value produced by <c>assembleExtractionResult</c> for the captured text and metadata.</returns>
        /// <exception cref="ApplicationException">Wraps any exception raised while opening or parsing the file.</exception>
        public TextExtractionResult Extract(string filePath)
        {
            var parser       = new AutoDetectParser();
            var metadata     = new Metadata();
            var parseContext = new ParseContext();

            // Register the parser under the Parser base type: Tika components look the
            // parser up by that key, so keying it by the concrete runtime class hides it from them.
            parseContext.set(typeof(org.apache.tika.parser.Parser), parser);

            try
            {
                var file = new File(filePath);
                var url  = file.toURI().toURL();
                using (InputStream inputStream = TikaInputStream.get(url, metadata))
                {
                    parser.parse(inputStream, getTransformerHandler(), metadata, parseContext);
                    inputStream.close();
                }

                return assembleExtractionResult(_outputWriter.toString(), metadata);
            }
            catch (Exception ex)
            {
                throw new ApplicationException("Extraction of text from the file '{0}' failed.".ToFormat(filePath), ex);
            }
        }
Esempio n. 5
0
        /// <summary>
        /// Parses the stream supplied by <paramref name="streamFactory"/> with Tika's auto-detecting parser
        /// and assembles a result from the captured text and the collected metadata.
        /// </summary>
        /// <param name="streamFactory">Invoked with the metadata instance to obtain the input stream to parse.</param>
        /// <returns>The value produced by <c>AssembleExtractionResult</c> for the captured text and metadata.</returns>
        /// <exception cref="TextExtractionException">Wraps any exception raised during setup or parsing.</exception>
        public TextExtractionResult Extract(Func <Metadata, InputStream> streamFactory)
        {
            try
            {
                var detectingParser  = new AutoDetectParser();
                var documentMetadata = new Metadata();
                var capturedText     = new StringWriter();
                var context          = new ParseContext();

                // Key the parser by the Parser base type; otherwise parts of Tika won't find a usable parser.
                context.set(typeof(Parser), detectingParser);

                using (var source = streamFactory(documentMetadata))
                {
                    try
                    {
                        var handler = getTransformerHandler(capturedText);
                        detectingParser.parse(source, handler, documentMetadata, context);
                    }
                    finally
                    {
                        source.close();
                    }
                }

                string text = capturedText.ToString();
                return AssembleExtractionResult(text, documentMetadata);
            }
            catch (Exception ex)
            {
                throw new TextExtractionException("Extraction failed.", ex);
            }
        }
Esempio n. 6
0
    /// <summary>
    /// Reads <c>filename.doc</c>, parses it with Tika's auto-detecting parser and writes the
    /// HTML-serialized parse output to <c>C:\toHtml\text.html</c>.
    /// </summary>
    public void Convert()
    {
        byte[] sourceBytes        = Files.toByteArray(new File(@"filename.doc"));
        var    detectingParser    = new AutoDetectParser();
        var    htmlBuffer         = new ByteArrayOutputStream();
        var    transformerFactory = (SAXTransformerFactory)TransformerFactory.newInstance();
        var    source             = new ByteArrayInputStream(sourceBytes);
        var    detectionMetadata  = new Metadata();

        // Detect the document encoding; fall back to UTF-32 when the detector returns null.
        EncodingDetector detector = new UniversalEncodingDetector();
        var charset = detector.detect(source, detectionMetadata) ?? new UTF_32();

        // Configure the serializer: indented HTML in the detected encoding, written to the in-memory buffer.
        TransformerHandler serializer = transformerFactory.newTransformerHandler();
        var transformer = serializer.getTransformer();
        transformer.setOutputProperty(OutputKeys.METHOD, "html");
        transformer.setOutputProperty(OutputKeys.INDENT, "yes");
        transformer.setOutputProperty(OutputKeys.ENCODING, charset.toString());
        serializer.setResult(new StreamResult(htmlBuffer));

        ContentHandler rewritingHandler = new ImageRewritingContentHandler(serializer);

        var parseContext = new ParseContext();
        parseContext.set(typeof(EmbeddedDocumentExtractor), new FileEmbeddedDocumentEtractor());

        // The parse call receives its own fresh Metadata, separate from the one used for encoding detection.
        detectingParser.parse(source, rewritingHandler, new Metadata(), parseContext);

        System.IO.File.WriteAllBytes(@"C:\toHtml\text.html", htmlBuffer.toByteArray());
    }
Esempio n. 7
0
        /// <summary>
        /// Parses the stream supplied by <paramref name="streamFactory"/> with Tika's auto-detecting parser,
        /// collecting text through a <c>TextExtractorContentHandler</c>.
        /// </summary>
        /// <param name="streamFactory">Invoked with the metadata instance to obtain the input stream to parse.</param>
        /// <returns>The value produced by <c>AssembleExtractionResult</c> for the collected text and metadata.</returns>
        /// <exception cref="TextExtractionException">Wraps any exception raised during setup or parsing.</exception>
        public TextExtractionResult Extract(Func<Metadata, InputStream> streamFactory)
        {
            try
            {
                var detectingParser = new AutoDetectParser();
                var documentMetadata = new Metadata();
                var context = new ParseContext();

                // Key the parser by the Parser base type; otherwise parts of Tika won't find a usable parser.
                context.set(typeof(Parser), detectingParser);

                var textBuffer = new System.IO.StringWriter();
                var textHandler = new TextExtractorContentHandler(textBuffer);

                using (var source = streamFactory(documentMetadata))
                {
                    try
                    {
                        detectingParser.parse(source, textHandler, documentMetadata, context);
                    }
                    finally
                    {
                        source.close();
                    }
                }

                string extractedText = textBuffer.ToString();
                return AssembleExtractionResult(extractedText, documentMetadata);
            }
            catch (Exception ex)
            {
                throw new TextExtractionException("Extraction failed.", ex);
            }
        }
Esempio n. 8
0
 /// <summary>
 /// Parses <paramref name="data"/> with Tika's auto-detecting parser and returns the text
 /// written by the transformer handler.
 /// </summary>
 /// <param name="data">The raw document bytes to parse.</param>
 /// <returns>The content of the writer that backs the transformer handler after parsing.</returns>
 public string ExtractText(byte[] data)
 {
     var parser = new AutoDetectParser();
     var context = new ParseContext();

     // Register the parser under the Parser base type: Tika components look the
     // parser up by that key, so keying it by the concrete runtime class hides it from them.
     context.set(typeof(org.apache.tika.parser.Parser), parser);

     var metadata = new Metadata();
     using (var output = new StringWriter()) {
         var transformerHandler = CreateTransformerHandler(output);
         using (var inputStream = TikaInputStream.get(data, metadata)) {
             parser.parse(inputStream, transformerHandler, metadata, context);
             inputStream.close();
         }
         return output.toString();
     }
 }
Esempio n. 9
0
        /// <summary>
        /// Parses the file at <paramref name="fileName"/> with an auto-detecting parser whose parser map
        /// routes the XPS and generic OOXML media types to <c>XpsParser</c>, and returns the body text.
        /// </summary>
        /// <param name="fileName">Path of the file to read.</param>
        /// <returns>The text collected by the body content handler.</returns>
        private static string GetContent(string fileName)
        {
            using (InputStream stream = new FileInputStream(new File(fileName)))
            {
                AutoDetectParser parser = new AutoDetectParser();
                BodyContentHandler handler = new BodyContentHandler();
                Metadata metadata = new Metadata();

                var xpsParser = new XpsParser();

                // setParsers replaces the parser's whole media-type map, so both mappings must be
                // supplied in a single call; two successive calls would keep only the last mapping.
                parser.setParsers(new java.util.HashMap
                {
                    { MediaType.application("vnd.ms-xpsdocument"), xpsParser },
                    { MediaType.application("x-tika-ooxml"), xpsParser }
                });

                parser.parse(stream, handler, metadata);

                return handler.toString();
            }
        }
Esempio n. 10
0
        /// <summary>
        /// Parses the stream supplied by <paramref name="streamFactory"/> with Tika's auto-detecting parser
        /// and returns the string form of the <c>ToXMLContentHandler</c> that received the parse events.
        /// Each step is traced through the debug log.
        /// </summary>
        /// <param name="streamFactory">Invoked with the metadata instance to obtain the input stream to parse.</param>
        /// <returns>The handler's string representation after parsing.</returns>
        /// <exception cref="TextExtractionException">Wraps any exception raised; the error (and its inner exception, if any) is logged first.</exception>
        private string Extract(Func <Metadata, InputStream> streamFactory)
        {
            try
            {
                Logger.Debug("Autodetect parser");
                var detectingParser = new AutoDetectParser();

                Logger.Debug("MetaData");
                var documentMetadata = new Metadata();

                Logger.Debug("ToXMLContentHandler");
                var xmlHandler = new ToXMLContentHandler();

                Logger.Debug("Reading stream");
                using (InputStream source = streamFactory(documentMetadata))
                {
                    try
                    {
                        detectingParser.parse(source, xmlHandler, documentMetadata);
                    }
                    finally
                    {
                        Logger.Debug("Closing Stream after auto-detect parser");
                        source.close();
                    }
                }

                return xmlHandler.ToString();
            }
            catch (System.Exception ex)
            {
                Logger.ErrorFormat(ex, "Extract error: {0}", ex.Message);

                // Log the inner exception separately when there is one.
                var inner = ex.InnerException;
                if (inner != null)
                {
                    Logger.ErrorFormat(inner, "Extract inner error: {0}", inner.Message);
                }

                throw new TextExtractionException("Extraction failed.", ex);
            }
        }
        /// <summary>
        /// Parses the stream supplied by <paramref name="streamFactory"/> with Tika's auto-detecting parser
        /// and returns the string form of the <c>ToXMLContentHandler</c> that received the parse events.
        /// Each step is traced through the debug log.
        /// </summary>
        /// <param name="streamFactory">Invoked with the metadata instance to obtain the input stream to parse.</param>
        /// <returns>The handler's string representation after parsing.</returns>
        /// <exception cref="TextExtractionException">Wraps any exception raised; the error is logged first.</exception>
        private string Extract(Func<Metadata, InputStream> streamFactory)
        {
            try
            {
                Logger.Debug("Autodetect parser");
                var detectingParser = new AutoDetectParser();

                Logger.Debug("MetaData");
                var documentMetadata = new Metadata();

                Logger.Debug("ToXMLContentHandler");
                var xmlHandler = new ToXMLContentHandler();

                Logger.Debug("Reading stream");
                using (InputStream source = streamFactory(documentMetadata))
                {
                    try
                    {
                        detectingParser.parse(source, xmlHandler, documentMetadata);
                    }
                    finally
                    {
                        Logger.Debug("Closing Stream after auto-detect parser");
                        source.close();
                    }
                }

                return xmlHandler.ToString();
            }
            catch (System.Exception ex)
            {
                Logger.ErrorFormat(ex,"Extract {0}",ex.Message);

                throw new TextExtractionException("Extraction failed.", ex);
            }
        }
Esempio n. 12
0
        /// <summary>
        /// Extracts text and metadata from an in-memory document using Tika's auto-detecting parser.
        /// </summary>
        /// <param name="data">The raw document bytes to parse.</param>
        /// <returns>The value produced by <c>assembleExtractionResult</c> for the captured text and metadata.</returns>
        /// <exception cref="ApplicationException">Wraps any exception raised while opening or parsing the data.</exception>
        public TextExtractionResult Extract(byte[] data)
        {
            var parser       = new AutoDetectParser();
            var metadata     = new Metadata();
            var parseContext = new ParseContext();

            // Register the parser under the Parser base type: Tika components look the
            // parser up by that key, so keying it by the concrete runtime class hides it from them.
            parseContext.set(typeof(org.apache.tika.parser.Parser), parser);

            try
            {
                using (InputStream inputStream = TikaInputStream.get(data, metadata))
                {
                    parser.parse(inputStream, getTransformerHandler(), metadata, parseContext);
                    inputStream.close();
                }

                return assembleExtractionResult(_outputWriter.ToString(), metadata);
            }
            catch (Exception ex)
            {
                throw new ApplicationException("Extraction of text from the byte array failed.", ex);
            }
        }
Esempio n. 13
0
        /// <summary>
        /// Worker loop: pops file paths from <c>Program.File_Dir</c>, parses each file with Tika,
        /// and pushes a JSON document holding the file's metadata (and whitespace-stripped content,
        /// when short enough) onto <c>Program.JSON_Data</c>. Starts or resumes
        /// <c>Program.Post_Thread</c> as results accumulate.
        /// </summary>
        /// <param name="threadname">Label printed together with the elapsed time when the loop ends.</param>
        public void Get(object threadname)
        {
            System.Console.WriteLine("Wait for file path 3 second ..");
            Thread.Sleep(3000);
            Stopwatch s = new Stopwatch();

            s.Start(); // start timing

            while (Program.File_Dir.Count != 0)
            {
                Parser             parser   = new AutoDetectParser();
                Metadata           metadata = new Metadata();
                ParseContext       pcontext = new ParseContext();
                BodyContentHandler handler  = new BodyContentHandler(-1);

                string filename = "";

                System.Console.WriteLine();
                try
                {
                    filename = Program.File_Dir.Pop();
                    java.io.File document = new java.io.File(filename);

                    // Dispose the stream after parsing so the file handle is released
                    // even when the parser throws.
                    using (var input = new FileInputStream(document))
                    {
                        parser.parse(input, handler, metadata, pcontext);
                    }
                }
                catch (InvalidOperationException)
                {
                    // Pop found the stack empty even though the loop condition saw items.
                    System.Console.WriteLine("堆疊為空");
                    break;
                }
                catch (Exception ex)
                {
                    System.Console.WriteLine(filename + ": parser error" + ex);
                    System.Console.WriteLine();
                    continue;
                }

                var value = new Dictionary <string, string>();
                value.Add("id", filename);
                foreach (var prop in metadata.names())
                {
                    // Skip metadata entries whose name contains "TRC" or "Byte".
                    if (prop.Contains("TRC") || prop.Contains("Byte"))
                    {
                        continue;
                    }
                    value.Add(prop, metadata.get(prop).ToString());
                }

                // Evaluate the extracted text once instead of converting the handler twice.
                string text = handler.ToString();
                if (text != "")
                {
                    var str = Regex.Replace(text, @"\s", "");
                    // Content is only attached when the stripped text is shorter than 65535 characters.
                    if (str.Length < 65535)
                    {
                        value.Add("content", str);
                    }
                }

                Program.JSON_Data.Push(JsonConvert.SerializeObject(value));

                // Start the posting thread once more than 10 documents are queued.
                if (Program.Post_Thread.ThreadState == System.Threading.ThreadState.Unstarted && Program.JSON_Data.Count > 10)
                {
                    Program.Post_Thread.Start("POST");
                }
                if (Program.Post_Thread.ThreadState == System.Threading.ThreadState.Suspended)
                {
                    Program.Post_Thread.Resume();
                }
            }
            s.Stop();
            System.Console.WriteLine(threadname.ToString() + "," + (s.Elapsed).ToString());
        }
Esempio n. 14
0
 /// <summary>
 /// Initializes the extractor: stores the configuration returned by
 /// <c>TikaConfig.getDefaultConfig()</c> in <c>Config</c> and a new
 /// <c>AutoDetectParser</c> in <c>Parser</c>.
 /// </summary>
 public StreamTextExtractor()
 {
     Config = TikaConfig.getDefaultConfig();
     // NOTE(review): the parser is built with its no-argument constructor, not from Config — confirm that is intended.
     Parser = new AutoDetectParser();
 }
Esempio n. 15
0
        /// <summary>
        /// Extracts text and metadata from the file at <paramref name="filePath"/> using Tika's auto-detecting parser.
        /// </summary>
        /// <param name="filePath">Path of an existing file to parse; it is converted to a URL before being opened.</param>
        /// <returns>The value produced by <c>assembleExtractionResult</c> for the captured text and metadata.</returns>
        /// <exception cref="System.ArgumentException">The file does not exist.</exception>
        /// <exception cref="ApplicationException">Wraps any exception raised while opening or parsing the file.</exception>
        public TextExtractionResult Extract(string filePath)
        {
            if (!System.IO.File.Exists(filePath)) {
                throw new System.ArgumentException("File does not exist");
            }

            var parser = new AutoDetectParser();
            var metadata = new Metadata();
            var parseContext = new ParseContext();

            // Register the parser under the Parser base type: Tika components look the
            // parser up by that key, so keying it by the concrete runtime class hides it from them.
            parseContext.set(typeof(org.apache.tika.parser.Parser), parser);

            try {
                var file = new File(filePath);
                var url = file.toURI().toURL();
                using (var inputStream = TikaInputStream.get(url, metadata)) {
                    parser.parse(inputStream, getTransformerHandler(), metadata, parseContext);
                    inputStream.close();
                }

                return assembleExtractionResult(_outputWriter.toString(), metadata);
            } catch (Exception ex) {
                throw new ApplicationException(string.Format("Extraction of text from the file '{0}' failed.", filePath), ex);
            }
        }