/// <summary>
/// Imports a single file/url element: decides whether the element must be skipped
/// (incremental import or skip-until marker) and otherwise imports its stream.
/// Always makes sure at least one emitted item is counted for the element.
/// </summary>
protected virtual void ImportUrl(PipelineContext ctx, IDatasourceSink sink, IStreamProvider elt)
{
    int orgEmitted = ctx.Emitted;
    if (addEmitted)
    {
        ctx.IncrementEmitted();
    }
    ctx.SendItemStart(elt);
    //TODO if ((ctx.ActionFlags & _ActionFlags.Skip) != 0

    //Check if we need to import this file.
    //Note: CheckNeedImport is evaluated first (it may have side effects), then the skip-until marker.
    bool skip = (ctx.ImportFlags & _ImportFlags.ImportFull) == 0 //Not a full import
             && !CheckNeedImport(ctx, sink, elt);
    if (!skip && ctx.SkipUntilKey == "record")
    {
        skip = true;
    }

    if (!skip)
    {
        using (Stream fs = _CreateStream(ctx, elt))
        {
            ImportStream(ctx, sink, elt, fs);
        }
    }

    //When addEmitted is not set, count this element as one emitted item if nothing else was emitted
    if (!addEmitted && orgEmitted == ctx.Emitted)
    {
        ctx.IncrementEmitted();
    }

    if (skip)
    {
        ctx.Skipped++;
        if (logSkips)
        {
            ctx.DebugLog.Log("Skipped: {0}. Date={1}", elt.FullName, elt.LastModified);
        }
        return; //Note: a skipped element does not send an item-stop (same as original flow)
    }
    ctx.OptSendItemStop();
}
/// <summary>
/// Emits every row of the supplied data reader into the pipeline.
/// Each field is emitted under "{prefix}/{fieldname}", followed by one event for the
/// record itself (carrying the reader), after which the emit-counter is incremented.
/// </summary>
protected void EmitRecords(PipelineContext ctx, DbDataReader rdr, Query q)
{
    //Event keys: either the query's own prefix or the default "record"
    String recordKey = String.IsNullOrEmpty(q.Prefix) ? "record" : q.Prefix;
    String fieldPrefix = recordKey + "/";
    var pipeline = ctx.Pipeline;

    while (rdr.Read())
    {
        int fieldCount = rdr.FieldCount;
        for (int ix = 0; ix < fieldCount; ix++)
        {
            String fieldName = rdr.GetName(ix);
            Type fieldType = rdr.GetFieldType(ix);
            Object fieldValue = null;
            try
            {
                if (!rdr.IsDBNull(ix))
                {
                    fieldValue = rdr.GetValue(ix);
                }
            }
            catch (Exception e)
            {
                //Either tolerate conversion problems (configured per query) or rethrow with field context
                if (!q.AllowConversionErrors)
                {
                    throw new BMException(e, "{0}\r\nField={1}, type={2}.", e.Message, fieldName, fieldType);
                }
                addConversionError(fieldName, e);
            }
            pipeline.HandleValue(ctx, fieldPrefix + fieldName, fieldValue);
        }
        pipeline.HandleValue(ctx, recordKey, rdr);
        ctx.IncrementEmitted();
    }
}
/// <summary>
/// Queues one file for asynchronous Tika conversion. Emits the start/lastmod/virtual-name
/// events, skips the file on an incremental import when it is unchanged or already known,
/// and processes a previously queued worker that was popped from the queue.
/// </summary>
private void importUrl(PipelineContext ctx, IDatasourceSink sink, IStreamProvider elt)
{
    ctx.IncrementEmitted();
    var worker = new TikaAsyncWorker(this, elt);
    String fullName = elt.FullName;

    sink.HandleValue(ctx, "record/_start", fullName);
    sink.HandleValue(ctx, "record/lastmodutc", worker.LastModifiedUtc);
    sink.HandleValue(ctx, "record/virtualFilename", elt.VirtualName);

    //Check if we need to convert this file (only relevant for incremental imports)
    if ((ctx.ImportFlags & _ImportFlags.ImportFull) == 0)
    {
        bool retryErrors = (ctx.ImportFlags & _ImportFlags.RetryErrors) != 0;
        if (!retryErrors && worker.LastModifiedUtc < previousRun)
        {
            ctx.Skipped++;
            return;
        }

        //Ask the sink whether the record already exists (same, newer or at all)
        ExistState existState = toExistState(sink.HandleValue(ctx, "record/_checkexist", elt));
        const ExistState skipMask = ExistState.ExistSame | ExistState.ExistNewer | ExistState.Exist;
        if ((existState & skipMask) != 0)
        {
            ctx.Skipped++;
            return;
        }
    }

    //Push the new worker; when the queue is full a finished worker pops out and is imported now
    TikaAsyncWorker popped = pushPop(ctx, sink, worker);
    if (popped != null)
    {
        importUrl(ctx, sink, popped);
    }
}
/// <summary>
/// Imports one Excel worksheet: reads the used range in a single COM call, optionally
/// extracts a header row, and emits one record per data row starting at startAt.
/// </summary>
private void importSheet(PipelineContext ctx, IDatasourceSink sink, IStreamProvider elt, Worksheet sheet)
{
    Range used = sheet.UsedRange;
    Range usedCells = used.Cells;
    if (usedCells == null)
    {
        return;
    }
    //Use the cached Cells range; the original re-invoked used.Cells, creating a second COM RCW
    Object[,] c = (Object[,])usedCells.Value2;
    if (c == null)
    {
        return;
    }

    int lo1 = c.GetLowerBound(0);
    int hi1 = c.GetUpperBound(0);
    int lo2 = c.GetLowerBound(1);
    int hi2 = c.GetUpperBound(1);

    //Optionally extract headers from the configured header row (headersAt is a 0-based offset)
    List<String> headers = new List<string>();
    if (headersAt >= 0)
    {
        int headersRow = lo1 + headersAt;
        if (headersRow <= hi1)
        {
            int h = 0;
            for (int j = lo2; j <= hi2; j++)
            {
                for (; h < j; h++)
                {
                    headers.Add(null); //Pad gaps so the header index lines up with the column index
                }
                headers.Add(_toString(c[headersRow, j]));
                h++;
            }
        }
    }

    var keys = prepareEventKeys(sheet.Name, hi2 + 1, headers);
    //Emit one record per data row; keys[0] appears to be the per-record event key — TODO confirm
    for (int i = lo1 + startAt; i <= hi1; i++)
    {
        for (int j = lo2; j <= hi2; j++)
        {
            sink.HandleValue(ctx, keys[j], c[i, j]);
        }
        sink.HandleValue(ctx, keys[0], null);
        ctx.IncrementEmitted();
    }
}
/// <summary>
/// Parses the stream as JSON and emits it: a single object becomes one record,
/// a top-level array becomes one record per element.
/// </summary>
private void importRecord(PipelineContext ctx, IDatasourceSink sink, Stream strm, int splitUntil)
{
    JToken jt;
    //Use JToken.ReadFrom (not JObject.ReadFrom): the payload may be an object OR an array.
    //using-blocks guarantee the reader and underlying stream are closed even when parsing throws
    //(the original only closed them on the success path).
    using (var textRdr = new StreamReader(strm, true))
    using (var rdr = new JsonTextReader(textRdr))
    {
        jt = JToken.ReadFrom(rdr);
    }

    if (jt.Type != JTokenType.Array)
    {
        Pipeline.EmitToken(ctx, sink, jt, "record", splitUntil);
        ctx.IncrementEmitted();
        return;
    }
    //Top-level array: emit each element as its own record
    foreach (var item in (JArray)jt)
    {
        Pipeline.EmitToken(ctx, sink, item, "record", splitUntil);
        ctx.IncrementEmitted();
    }
}
/// <summary>
/// Imports a CSV stream: delegates to ImportSortedStream when a sort key is configured,
/// otherwise streams the records and emits one event per field plus a record event.
/// </summary>
protected override void ImportStream(PipelineContext ctx, IDatasourceSink sink, IStreamProvider elt, Stream strm)
{
    //Sorted import is handled by a dedicated routine
    if (oneBasedSortKey != 0)
    {
        ImportSortedStream(ctx, sink, elt, strm);
        return;
    }

    CsvReader csvRdr = createReader(strm);
    optReadHeader(csvRdr);
    List<String> keys = createKeysForEmit();
    int firstLine = this.startAt;
    while (csvRdr.NextRecord())
    {
        //Skip leading lines until the configured start position
        if (firstLine > 0 && firstLine > csvRdr.Line)
        {
            continue;
        }

        ctx.IncrementEmitted();
        sink.HandleValue(ctx, "record/_start", null);
        var fields = csvRdr.Fields;
        int fieldCount = fields.Count;
        generateMissingKeysForEmit(keys, fieldCount);
        for (int i = 0; i < fieldCount; i++)
        {
            sink.HandleValue(ctx, keys[i], fields[i]);
        }
        sink.HandleValue(ctx, "record", null);
    }

    if (csvRdr.NumInvalidRecords > 0)
    {
        ctx.ImportLog.Log(_LogType.ltWarning, "Invalid records detected: {0}", csvRdr.NumInvalidRecords);
    }
}
/// <summary>
/// Imports a CSV stream sorted on the configured (1-based) sort key.
/// All records are buffered in memory, sorted, and then emitted in normal or
/// reverse order depending on the SORTKEY_REVERSE flag.
/// </summary>
protected void ImportSortedStream(PipelineContext ctx, IDatasourceSink sink, IStreamProvider elt, Stream strm)
{
    List<String[]> rows = new List<string[]>();
    int maxFieldCount = 0;
    CsvReader csvRdr = createReader(strm);
    optReadHeader(csvRdr);
    int startAt = this.startAt;
    int zeroBasedSortKey = (oneBasedSortKey & ~SORTKEY_REVERSE) - 1;

    //Read all records into memory; arr[0] holds a copy of the sortkey, arr[1..] the fields
    while (csvRdr.NextRecord())
    {
        if (startAt > 0 && startAt > csvRdr.Line)
        {
            continue;
        }
        var fields = csvRdr.Fields;
        int fieldCount = fields.Count;
        if (fieldCount > maxFieldCount)
        {
            maxFieldCount = fieldCount;
        }
        String[] arr = new String[fieldCount + 1];
        for (int i = 0; i < fieldCount; i++)
        {
            arr[i + 1] = fields[i];
        }
        if (fieldCount > zeroBasedSortKey)
        {
            arr[0] = arr[zeroBasedSortKey + 1];
        }
        rows.Add(arr);
    }

    //Diagnostics: dump the first 10 sortkeys before and after sorting
    ctx.DebugLog.Log("First 10 sortkeys:");
    int N = rows.Count;
    if (N > 10)
    {
        N = 10;
    }
    for (int i = 0; i < N; i++)
    {
        ctx.DebugLog.Log("-- [{0}]: '{1}'", i, rows[i][0]);
    }

    if (zeroBasedSortKey >= 0)
    {
        rows.Sort(cbSortString);
    }

    ctx.DebugLog.Log("First 10 sortkeys after sort:");
    for (int i = 0; i < N; i++)
    {
        ctx.DebugLog.Log("-- [{0}]: '{1}'", i, rows[i][0]);
    }

    //Fill pre-calculated keys
    List<String> keys = createKeysForEmit();
    generateMissingKeysForEmit(keys, maxFieldCount);

    if ((oneBasedSortKey & SORTKEY_REVERSE) == 0) //Normal order
    {
        for (int r = 0; r < rows.Count; r++)
        {
            String[] arr = rows[r];
            rows[r] = null; //Let this element be GC-ed
            emitSortedRow(ctx, sink, keys, arr);
        }
    }
    else //Reverse order
    {
        for (int r = rows.Count - 1; r >= 0; r--)
        {
            String[] arr = rows[r];
            rows[r] = null; //Let this element be GC-ed
            emitSortedRow(ctx, sink, keys, arr);
        }
    }
}

//Emits one buffered row (arr[0] is the sortkey, arr[1..] the fields) as a record.
private void emitSortedRow(PipelineContext ctx, IDatasourceSink sink, List<String> keys, String[] arr)
{
    ctx.IncrementEmitted();
    sink.HandleValue(ctx, "record/_start", null);
    for (int i = 1; i < arr.Length; i++) //arr[0] is the sortkey
    {
        sink.HandleValue(ctx, keys[i - 1], arr[i]);
    }
    sink.HandleValue(ctx, "record", null);
}
/// <summary>
/// Imports a text stream in one of two modes:
/// - lines mode: emits each physical line as "record/line" until maxToRead characters are consumed;
/// - key/value mode: splits lines on a key/value separator (via splitKV), folding continuation
///   lines (lines starting with whitespace, or — when lenient — lines without a ':') into the
///   previous logical line before emitting.
/// Any exception is wrapped with the current line number and passed to ctx.HandleException.
/// </summary>
protected override void ImportStream(PipelineContext ctx, IDatasourceSink sink, IStreamProvider elt, Stream strm)
{
    int lineNo = -1; //Tracks the current line for error reporting
    try
    {
        TextReader rdr = strm.CreateTextReader(encoding);
        int charsRead = 0;
        if ((mode & _Mode.lines) != 0)
        {
            //Lines mode: one event per line, bounded by maxToRead characters
            while (charsRead < maxToRead)
            {
                lineNo++;
                String line = rdr.ReadLine();
                if (line == null) { break; }
                if (line.Length == 0)
                {
                    //An empty line optionally terminates the import
                    if ((mode & _Mode.stopAtEmpty) != 0) { break; }
                }
                sink.HandleValue(ctx, "record/line", line);
                charsRead += line.Length;
            }
        }
        else
        {
            //Key/value mode: read one line ahead so continuation lines can be folded into 'line'
            lineNo++;
            String line = rdr.ReadLine();
            if (line != null) { charsRead += line.Length; }
            String key, value;
            while (line != null)
            {
                lineNo++;
                String nextLine = rdr.ReadLine();
                if (nextLine == null)
                {
                    //End of stream: emit the pending logical line and stop
                    key = "record/" + splitKV(line, out value);
                    sink.HandleValue(ctx, key, value);
                    break;
                }
                charsRead += nextLine.Length;
                if (nextLine.Length == 0)
                {
                    //NOTE: an empty line either stops the import or is silently skipped;
                    //in the skip case the pending 'line' stays pending
                    if ((mode & _Mode.stopAtEmpty) != 0) { break; } else { continue; }
                }
                //Find the first non-whitespace position. The 'continue' inside the switch
                //continues the for-loop (advancing offs); the 'break' after it exits the loop.
                int offs = 0;
                for (; offs < nextLine.Length; offs++)
                {
                    switch (nextLine[offs])
                    {
                        case ' ':
                        case '\t': continue;
                    }
                    break;
                }
                if (offs > 0)
                {
                    //Indented line: continuation of the previous logical line (leading whitespace stripped)
                    line = line + nextLine.Substring(offs);
                    continue;
                }
                if (lenient && nextLine.IndexOf(':') < 0)
                {
                    //Lenient mode: a line without a separator is also treated as a continuation
                    line = line + nextLine;
                    continue;
                }
                //Complete logical line: emit it and make nextLine the new pending line
                key = "record/" + splitKV(line, out value);
                sink.HandleValue(ctx, key, value);
                line = nextLine;
            }
        }
        //The whole stream forms a single record
        sink.HandleValue(ctx, "record", null);
        ctx.IncrementEmitted();
    }
    catch (Exception e)
    {
        //Wrap with the line number for easier diagnosis, then delegate to the context
        e = new BMException(e, "{0}\nLine={1}.", e.Message, lineNo);
        ctx.HandleException(e);
    }
}
/// <summary>
/// Imports an XML stream. When a selector is configured, the full document is loaded
/// and handed to the selector. Otherwise the XML is walked in streaming fashion and one
/// event is emitted per element (path-shaped keys), with attributes under "{path}/@{name}".
/// Every depth-1 element counts as one emitted record.
/// </summary>
protected override void ImportStream(PipelineContext ctx, IDatasourceSink sink, IStreamProvider elt, Stream strm)
{
    //Selector-based import: load the complete document and let the selector walk it
    if (selector != null)
    {
        XmlHelper h = new XmlHelper();
        h.Load(strm.CreateTextReader(), elt.FullName);
        selector.Process(ctx, new XmlNodeWrapper(h.DocumentElement));
        return;
    }

    //Streaming import. keys[lvl] holds the accumulated element path for depth lvl,
    //values[lvl] the accumulated text content at that depth.
    List<String> keys = new List<string>();
    List<String> values = new List<String>();
    int lvl = -1;
    Logger l = ctx.DebugLog;
    //using ensures the reader is also disposed when an exception occurs
    //(the original only closed it on the success path)
    using (XmlReader rdr = XmlReader.Create(strm))
    {
        while (rdr.Read())
        {
            if (dumpReader)
            {
                l.Log("{0}: {1}, {2} [{3}]", rdr.Name, rdr.NodeType, rdr.IsEmptyElement, rdr.Value);
            }
            switch (rdr.NodeType)
            {
                case XmlNodeType.CDATA:
                case XmlNodeType.Text:
                case XmlNodeType.Whitespace:
                case XmlNodeType.SignificantWhitespace:
                    if (lvl <= 0) { continue; } //Ignore text outside elements below the root
                    values[lvl] = values[lvl] + rdr.Value;
                    continue;

                case XmlNodeType.Element:
                    lvl++;
                    if (lvl >= keys.Count)
                    {
                        keys.Add(null);
                        values.Add(null);
                    }
                    if (lvl == 0)
                    {
                        keys[0] = rdr.Name;
                    }
                    else
                    {
                        keys[lvl] = keys[lvl - 1] + "/" + rdr.Name;
                        if (lvl == 1) { ctx.IncrementEmitted(); } //Every depth-1 element is one record
                    }
                    bool isEmpty = rdr.IsEmptyElement; //cache this value: after reading the attribs its value is lost
                    if (rdr.AttributeCount > 0)
                    {
                        String pfx = keys[lvl] + "/@";
                        for (int j = 0; j < rdr.AttributeCount; j++)
                        {
                            rdr.MoveToNextAttribute();
                            sink.HandleValue(ctx, pfx + rdr.Name, rdr.Value);
                        }
                    }
                    if (!isEmpty) { continue; }
                    //Empty element: emit immediately, since no EndElement will follow
                    sink.HandleValue(ctx, keys[lvl], null);
                    lvl--;
                    continue;

                case XmlNodeType.EndElement:
                    //Emit the element with its accumulated text and pop one level
                    sink.HandleValue(ctx, keys[lvl], values[lvl]);
                    values[lvl] = null;
                    lvl--;
                    continue;
            }
        }
    }
}
/// <summary>
/// Imports records from an ElasticSearch endpoint. Either POSTs a single command and emits
/// the raw response, or enumerates all records from an index (optionally via scan/scroll)
/// and emits them, splitting nested values up to 'splituntil' levels.
/// </summary>
private void importUrl(PipelineContext ctx, IDatasourceSink sink, IStreamProvider elt)
{
    //Per-element overrides of the datasource-level settings
    int maxParallel = elt.ContextNode.ReadInt("@maxparallel", this.maxParallel);
    int splitUntil = elt.ContextNode.ReadInt("@splituntil", this.splitUntil);
    if (splitUntil < 0)
    {
        splitUntil = int.MaxValue; //Negative means: split without limit
    }
    bool scan = elt.ContextNode.ReadBool("@scan", this.scan);

    String url = elt.ToString();
    ctx.SendItemStart(elt);
    String command = elt.ContextNode.ReadStr("@command", null);
    String index = command != null ? null : elt.ContextNode.ReadStr("@index"); //mutual exclusive with command
    String reqBody = elt.ContextNode.ReadStr("request", this.requestBody);
    JObject req = null;
    if (reqBody != null)
    {
        req = JObject.Parse(reqBody);
    }
    ctx.DebugLog.Log("Request scan={1}, body={0}", reqBody, scan);

    try
    {
        new Uri(url); //Validates the URL early; a malformed URL throws before any connection is made
        ESConnection conn = ESHelper.CreateConnection(ctx, url);
        ContextCallback cb = new ContextCallback(ctx, this, elt);
        conn.Timeout = timeoutInMs; //Same timeout as what we send to ES
        conn.OnPrepareRequest = cb.OnPrepareRequest;

        if (command != null)
        {
            //Command mode: POST the request and emit the complete response
            var resp = conn.SendCmd("POST", command, reqBody);
            resp.ThrowIfError();
            Pipeline.EmitToken(ctx, sink, resp.JObject, "response", splitUntil);
        }
        else
        {
            //Enumeration mode: stream all records from the index
            ESRecordEnum e = new ESRecordEnum(conn, index, req, numRecords, timeout, scan);
            if (maxParallel > 0)
            {
                e.Async = true;
            }
            ctx.ImportLog.Log("Starting scan of {0} records. Index={1}, connection={2}, async={3}, buffersize={4} requestbody={5}, splituntil={6}, scan={7}.",
                              e.Count, index, url, e.Async, numRecords, req != null, splitUntil, scan);
            foreach (var doc in e)
            {
                ctx.IncrementEmitted();
                sink.HandleValue(ctx, "record/_sort", doc.Sort);
                sink.HandleValue(ctx, "record/_type", doc.Type);
                if (splitUntil != 0)
                {
                    foreach (var kvp in doc)
                    {
                        String pfx = "record/" + kvp.Key;
                        if (splitUntil == 1)
                        {
                            sink.HandleValue(ctx, pfx, kvp.Value);
                            continue;
                        }
                        Pipeline.EmitToken(ctx, sink, kvp.Value, pfx, splitUntil - 1);
                    }
                }
                sink.HandleValue(ctx, "record", doc);
            }
        }
        ctx.SendItemStop();
    }
    catch (Exception err)
    {
        ctx.HandleException(err);
    }
}