/// <summary>
/// Parses an in-memory document with Tika's <c>AutoDetectParser</c> and returns
/// the text accumulated in the writer handed to <c>CreateTransformerHandler</c>.
/// </summary>
/// <param name="data">Raw bytes of the document to parse.</param>
/// <returns>The contents of the output writer after parsing has finished.</returns>
public string ExtractText(byte[] data)
{
    var parser = new AutoDetectParser();
    var context = new ParseContext();

    // NOTE(review): the parser is registered under its concrete runtime class.
    // Tika code that looks a parser up in the context typically asks for the
    // Parser interface type, so this key may never be found — confirm intent.
    context.set(parser.getClass(), parser);

    var metadata = new Metadata();

    using (var output = new StringWriter())
    {
        // The handler is built by a helper defined elsewhere in this class;
        // presumably it serializes SAX events into `output` — verify there.
        var transformerHandler = CreateTransformerHandler(output);

        // The using block disposes (closes) the stream, so no explicit close() is needed.
        using (var inputStream = TikaInputStream.get(data, metadata))
        {
            parser.parse(inputStream, transformerHandler, metadata, context);
        }

        return output.toString();
    }
}
/// <summary>
/// Demo entry point: parses a fixed HTML snippet with Tika's <c>HtmlParser</c>
/// and prints the document title, the body text and the collected links.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    String html =
        "<html><head> <title>Best Pizza Joints in America</title></head><body>"
        + "<p>The best pizza place in the US is <a href=\"http://antoniospizzas.com/\">Antonio's Pizza</a>.</p>"
        + "<p>It is located in Amherst, MA.</p></body></html>";

    // One handler collects the body text, the other collects links;
    // the tee handler forwards every parse event to both of them.
    ContentHandler bodyHandler = new BodyContentHandler();
    LinkContentHandler linkHandler = new LinkContentHandler();
    ContentHandler combinedHandler = new TeeContentHandler(bodyHandler, linkHandler);

    Metadata documentMetadata = new Metadata();
    ParseContext parseContext = new ParseContext();
    org.apache.tika.parser.Parser htmlParser = new HtmlParser();

    // Feed the snippet to the parser as a UTF-8 byte stream.
    byte[] htmlBytes = new UTF8Encoding().GetBytes(html);
    var htmlStream = new java.io.ByteArrayInputStream(htmlBytes);
    htmlParser.parse(htmlStream, combinedHandler, documentMetadata, parseContext);

    var title = documentMetadata.get(TikaCoreProperties.__Fields.TITLE);
    Console.WriteLine("Title: " + title);
    Console.WriteLine("Body: " + bodyHandler.ToString());
    Console.WriteLine("Links: " + linkHandler.getLinks());
}
/// <summary>
/// Extracts the body text of a file by running it through an
/// <c>AutoDetectParser</c> whose parser table routes both the
/// <c>application/vnd.ms-xpsdocument</c> and <c>application/x-tika-ooxml</c>
/// media types to an <c>XpsParser</c>.
/// </summary>
/// <param name="fileName">Path of the file to parse.</param>
/// <returns>The text collected by the body content handler.</returns>
private static string GetContent(string fileName)
{
    using (InputStream stream = new FileInputStream(new File(fileName)))
    {
        var parser = new AutoDetectParser();
        var handler = new BodyContentHandler();
        var metadata = new Metadata();
        var xpsParser = new XpsParser();

        // Register both media types in ONE map. setParsers replaces the parser
        // table it is given, so calling it once per type (as before) left only
        // the last mapping in effect.
        var parsersByType = new java.util.HashMap();
        parsersByType.put(MediaType.application("vnd.ms-xpsdocument"), xpsParser);
        parsersByType.put(MediaType.application("x-tika-ooxml"), xpsParser);
        parser.setParsers(parsersByType);

        parser.parse(stream, handler, metadata);
        return handler.toString();
    }
}
/// <summary>
/// Worker loop: pops file paths from <c>Program.File_Dir</c>, parses each file
/// with Tika, converts its metadata (and, when short enough, its text) into a
/// JSON document and pushes that onto <c>Program.JSON_Data</c>. It also starts
/// or resumes <c>Program.Post_Thread</c> so the queued JSON gets consumed.
/// </summary>
/// <param name="threadname">
/// Label for this worker; only used in the final elapsed-time message.
/// </param>
public void Get(object threadname)
{
    System.Console.WriteLine("Wait for file path 3 second ..");
    Thread.Sleep(3000);

    Stopwatch s = new Stopwatch();
    s.Start(); // start timing

    while (Program.File_Dir.Count != 0)
    {
        Parser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        ParseContext pcontext = new ParseContext();
        // NOTE(review): -1 presumably disables the handler's write limit — confirm against Tika docs.
        BodyContentHandler handler = new BodyContentHandler(-1);
        string filename = "";
        System.Console.WriteLine();

        try
        {
            filename = Program.File_Dir.Pop();
            java.io.File document = new java.io.File(filename);

            // Dispose the stream after parsing so file handles are not leaked
            // across iterations (the stream was previously never closed).
            using (var input = new FileInputStream(document))
            {
                parser.parse(input, handler, metadata, pcontext);
            }
        }
        catch (InvalidOperationException)
        {
            // Pop() found the stack empty (it may have been drained after the
            // Count check above); nothing left to do.
            System.Console.WriteLine("堆疊為空");
            break;
        }
        catch (Exception ex)
        {
            // A file that cannot be parsed is reported and skipped.
            System.Console.WriteLine(filename + ": parser error" + ex);
            System.Console.WriteLine();
            continue;
        }

        var value = new Dictionary<string, string>();
        value.Add("id", filename);

        foreach (var prop in metadata.names())
        {
            // Metadata names containing "TRC" or "Byte" are left out of the document.
            if (prop.Contains("TRC") || prop.Contains("Byte"))
            {
                continue;
            }

            value.Add(prop, metadata.get(prop).ToString());
        }

        // Read the extracted text once; it is attached with all whitespace
        // removed, and only when the stripped text is under 65535 characters.
        string text = handler.toString();
        if (text != "")
        {
            var str = Regex.Replace(text, @"\s", "");
            if (str.Length < 65535)
            {
                value.Add("content", str);
            }
        }

        Program.JSON_Data.Push(JsonConvert.SerializeObject(value));

        // Start the posting thread the first time more than 10 documents are queued.
        if (Program.Post_Thread.ThreadState == System.Threading.ThreadState.Unstarted
            && Program.JSON_Data.Count > 10)
        {
            Program.Post_Thread.Start("POST");
        }

        // Wake the posting thread if it is currently suspended.
        if (Program.Post_Thread.ThreadState == System.Threading.ThreadState.Suspended)
        {
            Program.Post_Thread.Resume();
        }
    }

    s.Stop();
    System.Console.WriteLine(threadname.ToString() + "," + (s.Elapsed).ToString());
}