Beispiel #1
0
 /// <summary>
 /// Parses <paramref name="data"/> with Tika's <c>AutoDetectParser</c> and returns the text
 /// that the handler built by <c>CreateTransformerHandler</c> wrote into its output writer.
 /// </summary>
 /// <param name="data">Raw bytes of the document to parse.</param>
 /// <returns>The contents of the output writer once parsing has finished.</returns>
 public string ExtractText(byte[] data)
 {
     var parser = new AutoDetectParser();
     var context = new ParseContext();
     // Register the parser in the parse context, keyed by its own runtime class.
     context.set(parser.getClass(), parser);
     var metadata = new Metadata();
     using (var output = new StringWriter()) {
         // CreateTransformerHandler is defined elsewhere in this class; presumably it
         // serializes the SAX events it receives into `output` (confirm against its definition).
         var transformerHandler = CreateTransformerHandler(output);
         // The using block disposes the stream when it exits, so no explicit close() is required.
         using (var inputStream = TikaInputStream.get(data, metadata)) {
             parser.parse(inputStream, transformerHandler, metadata, context);
         }
         return output.toString();
     }
 }
Beispiel #2
0
        /// <summary>
        /// Parses a small hard-coded HTML page with Tika's <c>HtmlParser</c> and prints the
        /// page title, the extracted body text and the collected links to the console.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // Sample page: a title, one hyperlink and two paragraphs.
            string html = "<html><head> <title>Best Pizza Joints in America</title></head><body>" +
                          "<p>The best pizza place in the US is <a href=\"http://antoniospizzas.com/\">Antonio's Pizza</a>.</p>" +
                          "<p>It is located in Amherst, MA.</p></body></html>";

            // One handler collects the body text, the other collects links;
            // the tee handler forwards every SAX event to both.
            ContentHandler bodyHandler = new BodyContentHandler();
            LinkContentHandler linkHandler = new LinkContentHandler();
            ContentHandler teeHandler = new TeeContentHandler(bodyHandler, linkHandler);

            Metadata pageMetadata = new Metadata();
            org.apache.tika.parser.Parser htmlParser = new HtmlParser();
            ParseContext parseContext = new ParseContext();

            // The parser consumes a Java stream, so the page is encoded as UTF-8 bytes first.
            byte[] htmlBytes = new UTF8Encoding().GetBytes(html);
            var htmlStream = new java.io.ByteArrayInputStream(htmlBytes);
            htmlParser.parse(htmlStream, teeHandler, pageMetadata, parseContext);

            var title = pageMetadata.get(TikaCoreProperties.__Fields.TITLE);
            var foundLinks = linkHandler.getLinks();

            Console.WriteLine("Title: " + title);
            Console.WriteLine("Body: " + bodyHandler.ToString());
            Console.WriteLine("Links: " + foundLinks);
        }
Beispiel #3
0
        /// <summary>
        /// Opens <paramref name="fileName"/> and parses it with an <c>AutoDetectParser</c> whose
        /// parser table routes the XPS and generic OOXML media types to <c>XpsParser</c>,
        /// then returns the text collected by the body content handler.
        /// </summary>
        /// <param name="fileName">Path of the file to parse.</param>
        /// <returns>The text accumulated by the <c>BodyContentHandler</c>.</returns>
        private static string GetContent(string fileName)
        {
            using (InputStream stream = new FileInputStream(new File(fileName)))
            {
                AutoDetectParser parser = new AutoDetectParser();
                BodyContentHandler handler = new BodyContentHandler();
                Metadata metadata = new Metadata();

                var xpsParser = new XpsParser();

                // setParsers replaces the parser table on every call, so both media types
                // must be registered in one map; two separate calls would keep only the last.
                var parsersByType = new java.util.HashMap();
                parsersByType.put(MediaType.application("vnd.ms-xpsdocument"), xpsParser);
                parsersByType.put(MediaType.application("x-tika-ooxml"), xpsParser);
                parser.setParsers(parsersByType);

                parser.parse(stream, handler, metadata);

                return handler.toString();
            }
        }
Beispiel #4
0
        /// <summary>
        /// Worker loop: pops file paths from <c>Program.File_Dir</c>, parses each file with Tika,
        /// and pushes a JSON document (file name as "id", filtered metadata, and the
        /// whitespace-stripped text as "content") onto <c>Program.JSON_Data</c>. It also starts or
        /// resumes <c>Program.Post_Thread</c> as results accumulate.
        /// </summary>
        /// <param name="threadname">Label printed together with the elapsed time when the loop ends.</param>
        public void Get(object threadname)
        {
            System.Console.WriteLine("Wait for file path 3 second ..");
            Thread.Sleep(3000);
            Stopwatch s = new Stopwatch();

            s.Start(); // start timing

            while (Program.File_Dir.Count != 0)
            {
                Parser             parser   = new AutoDetectParser();
                Metadata           metadata = new Metadata();
                ParseContext       pcontext = new ParseContext();
                BodyContentHandler handler  = new BodyContentHandler(-1);

                string filename = "";

                System.Console.WriteLine();
                try
                {
                    filename = Program.File_Dir.Pop();
                    java.io.File document = new java.io.File(filename);

                    // Tika's parse() leaves the stream open, so close it here; otherwise
                    // every processed file leaks a file handle.
                    FileInputStream input = new FileInputStream(document);
                    try
                    {
                        parser.parse(input, handler, metadata, pcontext);
                    }
                    finally
                    {
                        input.close();
                    }
                }
                catch (InvalidOperationException)
                {
                    // Pop() found the stack empty even though Count was non-zero a moment ago.
                    System.Console.WriteLine("堆疊為空");
                    break;
                }
                catch (Exception ex)
                {
                    // Any other failure (open, parse, close) skips this file.
                    System.Console.WriteLine(filename + ": parser error" + ex);
                    System.Console.WriteLine();
                    continue;
                }

                Dictionary <string, string> value = new Dictionary <string, string>();
                value.Add("id", filename);
                foreach (var prop in metadata.names())
                {
                    if (prop.Contains("TRC") || prop.Contains("Byte"))
                    {
                        continue;
                    }
                    // A metadata name that collides with an existing key (e.g. "id") would make
                    // Add throw outside the try block; keep the first value instead.
                    if (value.ContainsKey(prop))
                    {
                        continue;
                    }
                    value.Add(prop, metadata.get(prop));
                }

                // Read the extracted text once instead of materializing it twice.
                string text = handler.toString();
                if (text != "")
                {
                    var str = Regex.Replace(text, @"\s", "");
                    // Content of 65535 characters or more is left out of the document.
                    if (str.Length < 65535)
                    {
                        // Indexer rather than Add: the extracted text wins over a metadata
                        // entry that happens to be named "content".
                        value["content"] = str;
                    }
                }

                Program.JSON_Data.Push(JsonConvert.SerializeObject(value));

                // Start the posting thread once more than 10 documents are queued.
                if (Program.Post_Thread.ThreadState == System.Threading.ThreadState.Unstarted && Program.JSON_Data.Count > 10)
                {
                    Program.Post_Thread.Start("POST");
                }
                // Resume the posting thread if it is currently suspended.
                if (Program.Post_Thread.ThreadState == System.Threading.ThreadState.Suspended)
                {
                    Program.Post_Thread.Resume();
                }
            }
            s.Stop();
            System.Console.WriteLine(threadname.ToString() + "," + (s.Elapsed).ToString());
        }