/// <summary>
/// Exchanges work with the async worker queue: with no new element it pops a
/// finished worker; with a new element it pushes it and pops only if the queue
/// decides one is due. Exceptions are routed to the context; null means
/// "nothing available" or "an error was handled".
/// </summary>
private TikaAsyncWorker pushPop(PipelineContext ctx, IDatasourceSink sink, TikaAsyncWorker newElt) {
   try {
      object popped;
      if (newElt == null) {
         popped = workerQueue.Pop();
      } else {
         popped = workerQueue.PushAndOptionalPop(newElt);
      }
      return (TikaAsyncWorker)popped;
   } catch (Exception e) {
      ctx.HandleException(e);
      return null;
   }
}
/// <summary>
/// Emits one converted (Tika-processed) file to the sink as a stream of
/// "record/*" values, bracketed by "record/_start" / "record/_end" and
/// terminated by a "record" event (null payload) that closes the record.
/// Conversion/emission errors are routed to ctx.HandleException; note the
/// _start/lastmod/virtual* values are emitted BEFORE the try, so a failing
/// record may leave a partially emitted prologue.
/// </summary>
private void importUrl(PipelineContext ctx, IDatasourceSink sink, TikaAsyncWorker worker) {
   String fileName = worker.StreamElt.FullName;
   sink.HandleValue(ctx, "record/_start", fileName);
   sink.HandleValue(ctx, "record/lastmodutc", worker.LastModifiedUtc);
   sink.HandleValue(ctx, "record/virtualFilename", worker.StreamElt.VirtualName);
   sink.HandleValue(ctx, "record/virtualRoot", worker.StreamElt.VirtualRoot);
   try {
      var htmlProcessor = worker.HtmlProcessor;
      //If the conversion result was persisted somewhere, tell the sink where
      if (worker.StoredAs != null) {
         sink.HandleValue(ctx, "record/converted_file", worker.StoredAs);
      }
      //Write html properties (one record/<key> value per extracted property)
      foreach (var kvp in htmlProcessor.Properties) {
         sink.HandleValue(ctx, "record/" + kvp.Key, kvp.Value);
      }
      if (mustEmitSecurity) {
         emitSecurity(ctx, sink, fileName);
      }
      //Add dummy type to recognize the errors
      //if (error)
      //   doc.AddField("content_type", "ConversionError");
      //if (htmlProcessor.IsTextMail)
      sink.HandleValue(ctx, "record/_istextmail", htmlProcessor.IsTextMail);
      sink.HandleValue(ctx, "record/_numparts", htmlProcessor.numParts);
      //Attachments: count first, then one value per attachment
      sink.HandleValue(ctx, "record/_numattachments", htmlProcessor.Attachments.Count);
      foreach (var a in htmlProcessor.Attachments) {
         sink.HandleValue(ctx, "record/_attachment", a);
      }
      sink.HandleValue(ctx, "record/_filesize", worker.FileSize);
      //Abstract is bounded by abstractLength +/- abstractDelta (class settings)
      sink.HandleValue(ctx, "record/shortcontent", htmlProcessor.GetAbstract(abstractLength, abstractDelta));
      sink.HandleValue(ctx, "record/head", htmlProcessor.GetInnerHead());
      sink.HandleValue(ctx, "record/content", htmlProcessor.GetInnerBody());
      sink.HandleValue(ctx, "record/_end", fileName);
      //Null payload on "record" closes/flushes the record in the sink
      sink.HandleValue(ctx, "record", null);
   } catch (Exception e) {
      ctx.HandleException(e);
   }
}
/// <summary>
/// Imports every stream provider supplied by the stream directory.
/// A failure on one provider is wrapped (with its URL) and handed to the
/// context, then the loop continues with the next provider; the after-import
/// hook always runs.
/// </summary>
public void Import(PipelineContext ctx, IDatasourceSink sink) {
   _BeforeImport(ctx, sink);
   try {
      foreach (var provider in streamDirectory.GetProviders(ctx)) {
         try {
            ImportUrl(ctx, sink, provider);
         } catch (Exception ex) {
            //Wrap so the offending URL travels with the error
            var wrapped = new BMException(ex, WrapMessage(ex, provider.ToString(), "{0}\r\nUrl={1}."));
            ctx.HandleException(wrapped);
         }
      }
   } finally {
      _AfterImport(ctx, sink);
   }
}
/// <summary>
/// Imports one stream element. Honours per-element overrides for @splituntil
/// and @objectperline, skips the element when the pipeline or an existence
/// check says so, and otherwise imports either the whole stream as one record
/// or (objectperline) every '\n'-separated non-empty line as its own record.
/// Errors are routed to ctx.HandleException.
/// </summary>
private void importUrl(PipelineContext ctx, IDatasourceSink sink, IStreamProvider elt) {
   int splitUntil = elt.ContextNode.ReadInt("@splituntil", this.splitUntil);
   bool objectPerLine = elt.ContextNode.ReadBool("@objectperline", this.objectPerLine);
   ctx.SendItemStart(elt);
   if ((ctx.ActionFlags & _ActionFlags.Skip) != 0) {
      return;
   }

   ExistState existState = ExistState.NotExist;
   if ((ctx.ImportFlags & _ImportFlags.ImportFull) == 0) { //Not a full import
      existState = toExistState(sink.HandleValue(ctx, "record/_checkexist", null));
   }
   //Check if we need to convert this file: skip when an equal/newer version exists
   if ((existState & (ExistState.ExistSame | ExistState.ExistNewer | ExistState.Exist)) != 0) {
      ctx.Skipped++;
      ctx.ImportLog.Log("Skipped: {0}. Date={1}", elt, 0);// dtFile);
      return;
   }

   Stream fs = null;
   try {
      fs = elt.CreateStream(ctx);
      //FIX: original tested this.objectPerLine, ignoring the @objectperline
      //override that was just read into the local variable.
      if (!objectPerLine) {
         importRecord(ctx, sink, fs, splitUntil);
      } else {
         importPerLine(ctx, sink, fs, splitUntil);
      }
      ctx.OptSendItemStop();
   } catch (Exception e) {
      ctx.HandleException(e);
   } finally {
      //FIX: the stream was never closed in the original (resource leak)
      if (fs != null) fs.Close();
   }
}

/// <summary>
/// Splits a stream on '\n' and imports every non-empty line as one record.
/// Lines may span buffer reads; a trailing line without a final '\n' is
/// imported as well. (Replaces the original splitter, which dropped the bytes
/// after a newline within a buffer — its "i + 1 &lt; offset" guard could never
/// hold — and could emit partial lines and stale MemoryStream bytes.)
/// </summary>
private void importPerLine(PipelineContext ctx, IDatasourceSink sink, Stream fs, int splitUntil) {
   byte[] buf = new byte[4096];
   using (MemoryStream tmp = new MemoryStream()) {
      while (true) {
         int len = fs.Read(buf, 0, buf.Length);
         if (len <= 0) break;
         int start = 0;
         for (int i = 0; i < len; i++) {
            if (buf[i] != (byte)'\n') continue;
            //Complete the pending line and import it (empty lines are skipped)
            tmp.Write(buf, start, i - start);
            start = i + 1;
            if (tmp.Length > 0) {
               tmp.Position = 0;
               importRecord(ctx, sink, tmp, splitUntil);
               tmp.SetLength(0); //truncate: Position=0 alone leaves stale bytes
            }
         }
         //Buffer ended mid-line: keep the partial line for the next read
         tmp.Write(buf, start, len - start);
      }
      //Trailing line without a final newline
      if (tmp.Length > 0) {
         tmp.Position = 0;
         importRecord(ctx, sink, tmp, splitUntil);
      }
   }
}
/// <summary>
/// Imports a text stream either as raw lines (_Mode.lines: one
/// "record/line" value per line, up to maxToRead characters) or as
/// key:value pairs (splitKV), where lines starting with whitespace are
/// folded into the previous line as continuations (and, when lenient,
/// lines without a ':' as well). Always ends with a closing "record"
/// event and bumps the emitted counter; errors are wrapped with the
/// current line number and routed to ctx.HandleException.
/// </summary>
protected override void ImportStream(PipelineContext ctx, IDatasourceSink sink, IStreamProvider elt, Stream strm) {
   int lineNo = -1; //tracked only for the error message below
   try {
      TextReader rdr = strm.CreateTextReader(encoding);
      int charsRead = 0;
      if ((mode & _Mode.lines) != 0) {
         //Raw-line mode: emit each line until EOF or the character budget is spent
         while (charsRead < maxToRead) {
            lineNo++;
            String line = rdr.ReadLine();
            if (line == null) {
               break;
            }
            if (line.Length == 0) {
               //Empty line optionally terminates the import; otherwise it is
               //emitted like any other line
               if ((mode & _Mode.stopAtEmpty) != 0) {
                  break;
               }
            }
            sink.HandleValue(ctx, "record/line", line);
            charsRead += line.Length;
         }
      } else {
         //Key:value mode with one-line lookahead: 'line' is the pending
         //(possibly folded) logical line, 'nextLine' decides whether it is
         //continued or emitted. NOTE(review): charsRead is accumulated here
         //but never checked against maxToRead in this branch — confirm intended.
         lineNo++;
         String line = rdr.ReadLine();
         if (line != null) {
            charsRead += line.Length;
         }
         String key, value;
         while (line != null) {
            lineNo++;
            String nextLine = rdr.ReadLine();
            if (nextLine == null) {
               //EOF: flush the pending logical line
               key = "record/" + splitKV(line, out value);
               sink.HandleValue(ctx, key, value);
               break;
            }
            charsRead += nextLine.Length;
            if (nextLine.Length == 0) {
               //Empty line: optionally stop, otherwise skip it (the pending
               //line stays pending and may still be folded/emitted later)
               if ((mode & _Mode.stopAtEmpty) != 0) {
                  break;
               } else {
                  continue;
               }
            }
            //Count leading whitespace; the 'continue' targets the for-loop,
            //so offs ends at the first non-space/tab character
            int offs = 0;
            for (; offs < nextLine.Length; offs++) {
               switch (nextLine[offs]) {
                  case ' ':
                  case '\t': continue;
               }
               break;
            }
            if (offs > 0) {
               //Continuation line: fold into the pending line, whitespace stripped
               line = line + nextLine.Substring(offs);
               continue;
            }
            if (lenient && nextLine.IndexOf(':') < 0) {
               //Lenient mode: a line without a separator is also a continuation
               line = line + nextLine;
               continue;
            }
            //A fresh key:value line: emit the pending one and make this pending
            key = "record/" + splitKV(line, out value);
            sink.HandleValue(ctx, key, value);
            line = nextLine;
         }
      }
      //Null payload closes/flushes the record in the sink
      sink.HandleValue(ctx, "record", null);
      ctx.IncrementEmitted();
   } catch (Exception e) {
      e = new BMException(e, "{0}\nLine={1}.", e.Message, lineNo);
      ctx.HandleException(e);
   }
}
/// <summary>
/// Imports from an ElasticSearch endpoint. Either POSTs a single @command and
/// emits its JSON response, or enumerates records from @index (optionally via
/// scan/scroll) and emits each as "record/*" values followed by a "record"
/// event carrying the document itself. splitUntil controls how deeply JSON
/// tokens are flattened into separate sink values. Errors are routed to
/// ctx.HandleException.
/// </summary>
private void importUrl(PipelineContext ctx, IDatasourceSink sink, IStreamProvider elt) {
   //Per-element overrides of the class-level defaults
   int maxParallel = elt.ContextNode.ReadInt("@maxparallel", this.maxParallel);
   int splitUntil = elt.ContextNode.ReadInt("@splituntil", this.splitUntil);
   if (splitUntil < 0) { splitUntil = int.MaxValue; } //negative means: split all levels
   bool scan = elt.ContextNode.ReadBool("@scan", this.scan);
   String url = elt.ToString();
   ctx.SendItemStart(elt);
   String command = elt.ContextNode.ReadStr("@command", null);
   String index = command != null ? null : elt.ContextNode.ReadStr("@index"); //mutual exclusive with command
   String reqBody = elt.ContextNode.ReadStr("request", this.requestBody);
   JObject req = null;
   if (reqBody != null) { req = JObject.Parse(reqBody); }
   //NOTE(review): placeholders look swapped ({1}=scan, {0}=body) relative to the
   //message text — verify the intended output
   ctx.DebugLog.Log("Request scan={1}, body={0}", reqBody, scan);
   try {
      Uri uri = new Uri(url); //NOTE(review): 'uri' is unused beyond validating the URL — confirm intent
      ESConnection conn = ESHelper.CreateConnection(ctx, url);
      ContextCallback cb = new ContextCallback(ctx, this, elt);
      conn.Timeout = timeoutInMs; //Same timeout as what we send to ES
      conn.OnPrepareRequest = cb.OnPrepareRequest;
      if (command != null) {
         //Single command mode: POST and emit the whole JSON response
         var resp = conn.SendCmd("POST", command, reqBody);
         resp.ThrowIfError();
         Pipeline.EmitToken(ctx, sink, resp.JObject, "response", splitUntil);
      } else {
         //Enumeration mode: stream records from the index
         ESRecordEnum e = new ESRecordEnum(conn, index, req, numRecords, timeout, scan);
         if (maxParallel > 0) { e.Async = true; }
         //NOTE(review): this string literal spans two physical lines in the
         //source (likely an extraction artifact); kept byte-identical here
         ctx.ImportLog.Log("Starting scan of {0} records. 
Index={1}, connection={2}, async={3}, buffersize={4} requestbody={5}, splituntil={6}, scan={7}.", e.Count, index, url, e.Async, numRecords, req != null, splitUntil, scan);
         foreach (var doc in e) {
            ctx.IncrementEmitted();
            sink.HandleValue(ctx, "record/_sort", doc.Sort);
            sink.HandleValue(ctx, "record/_type", doc.Type);
            if (splitUntil != 0) {
               foreach (var kvp in doc) {
                  String pfx = "record/" + kvp.Key;
                  if (splitUntil == 1) {
                     //Depth exhausted: emit the raw token
                     sink.HandleValue(ctx, pfx, kvp.Value);
                     continue;
                  }
                  //Recurse one level deeper into the JSON token
                  Pipeline.EmitToken(ctx, sink, kvp.Value, pfx, splitUntil - 1);
               }
            }
            //The closing "record" event carries the full document
            sink.HandleValue(ctx, "record", doc);
         }
      }
      ctx.SendItemStop();
   } catch (Exception e) {
      ctx.HandleException(e);
   }
}