protected virtual void ImportUrl(PipelineContext ctx, IDatasourceSink sink, IStreamProvider elt)
        {
            int orgEmitted = ctx.Emitted;

            if (addEmitted)
            {
                ctx.IncrementEmitted();
            }
            DateTime dtFile = elt.LastModified;

            ctx.SendItemStart(elt);
            //TODO if ((ctx.ActionFlags & _ActionFlags.Skip) != 0

            //Check if we need to import this file
            if ((ctx.ImportFlags & _ImportFlags.ImportFull) == 0) //Not a full import
            {
                if (!CheckNeedImport(ctx, sink, elt))
                {
                    goto SKIPPED;
                }
            }
            if (ctx.SkipUntilKey == "record")
            {
                goto SKIPPED;
            }

            using (Stream fs = _CreateStream(ctx, elt))
            {
                ImportStream(ctx, sink, elt, fs);
            }
            //Make sure this item is counted at least once when the stream itself emitted nothing
            if (!addEmitted && orgEmitted == ctx.Emitted)
            {
                ctx.IncrementEmitted();
            }
            ctx.OptSendItemStop();
            return;

SKIPPED:
            ctx.Skipped++;
            if (!addEmitted && orgEmitted == ctx.Emitted)
            {
                ctx.IncrementEmitted();
            }
            if (logSkips)
            {
                ctx.DebugLog.Log("Skipped: {0}. Date={1}", elt.FullName, elt.LastModified);
            }
        }
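        //Sketch only (assumption; the base implementation of CheckNeedImport is not part of this
        //listing). It illustrates the kind of decision made above: skip files that were not modified
        //since the previous run, unless errors are being retried. previousRun is an assumed DateTime
        //field holding the timestamp of the previous run.
        protected virtual bool CheckNeedImport(PipelineContext ctx, IDatasourceSink sink, IStreamProvider elt)
        {
            if ((ctx.ImportFlags & _ImportFlags.RetryErrors) != 0)
            {
                return true;                            //Retry mode: always import again
            }
            return elt.LastModified >= previousRun;     //Import only when changed since the previous run
        }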
        protected void EmitRecords(PipelineContext ctx, DbDataReader rdr, Query q)
        {
            String pfxRecord = String.IsNullOrEmpty(q.Prefix) ? "record" : q.Prefix;
            String pfxField  = pfxRecord + "/";
            var    sink      = ctx.Pipeline;

            while (rdr.Read())
            {
                int fcnt = rdr.FieldCount;
                for (int i = 0; i < fcnt; i++)
                {
                    String name  = rdr.GetName(i);
                    Type   ft    = rdr.GetFieldType(i);
                    Object value = null;
                    try
                    {
                        if (!rdr.IsDBNull(i))
                        {
                            value = rdr.GetValue(i);
                        }
                    } catch (Exception e)
                    {
                        if (!q.AllowConversionErrors)
                        {
                            throw new BMException(e, "{0}\r\nField={1}, type={2}.", e.Message, name, ft);
                        }
                        addConversionError(name, e);
                    }
                    sink.HandleValue(ctx, pfxField + name, value);
                }
                sink.HandleValue(ctx, pfxRecord, rdr);
                ctx.IncrementEmitted();
            }
        }
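        //Hypothetical usage sketch (not part of the original listing): shows how EmitRecords could be
        //driven from an open DbConnection. The SQL text is passed in directly here; the real code
        //presumably takes it from the Query object.
        protected void EmitQuery(PipelineContext ctx, DbConnection conn, String sql, Query q)
        {
            using (DbCommand cmd = conn.CreateCommand())
            {
                cmd.CommandText = sql;
                using (DbDataReader rdr = cmd.ExecuteReader())
                {
                    EmitRecords(ctx, rdr, q);           //One "record" per row; fields emitted as "<prefix>/<column>"
                }
            }
        }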
        private void importUrl(PipelineContext ctx, IDatasourceSink sink, IStreamProvider elt)
        {
            ctx.IncrementEmitted();
            TikaAsyncWorker worker   = new TikaAsyncWorker(this, elt);
            String          fileName = elt.FullName;

            sink.HandleValue(ctx, "record/_start", fileName);
            sink.HandleValue(ctx, "record/lastmodutc", worker.LastModifiedUtc);
            sink.HandleValue(ctx, "record/virtualFilename", elt.VirtualName);

            //Check if we need to convert this file
            if ((ctx.ImportFlags & _ImportFlags.ImportFull) == 0) //Not a full import
            {
                if ((ctx.ImportFlags & _ImportFlags.RetryErrors) == 0 && worker.LastModifiedUtc < previousRun)
                {
                    ctx.Skipped++;
                    return;
                }
                ExistState existState = toExistState(sink.HandleValue(ctx, "record/_checkexist", elt));
                if ((existState & (ExistState.ExistSame | ExistState.ExistNewer | ExistState.Exist)) != 0)
                {
                    ctx.Skipped++;
                    return;
                }
            }

            TikaAsyncWorker popped = pushPop(ctx, sink, worker);

            if (popped != null)
            {
                importUrl(ctx, sink, popped);
            }
        }
        private void importSheet(PipelineContext ctx, IDatasourceSink sink, IStreamProvider elt, Worksheet sheet)
        {
            Range used      = sheet.UsedRange;
            Range usedCells = used.Cells;

            if (usedCells == null)
            {
                return;
            }

            Object[,] c = (Object[,])usedCells.Value2;
            if (c == null)
            {
                return;
            }

            int lo1 = c.GetLowerBound(0);
            int hi1 = c.GetUpperBound(0);
            int lo2 = c.GetLowerBound(1);
            int hi2 = c.GetUpperBound(1);

            List <String> headers = new List <string>();

            if (headersAt >= 0)
            {
                int headersRow = lo1 + headersAt;
                if (headersRow <= hi1)
                {
                    int h = 0;
                    for (int j = lo2; j <= hi2; j++)
                    {
                        //Pad with nulls so that headers[j] lines up with the (1-based) column index j
                        for (; h < j; h++)
                        {
                            headers.Add(null);
                        }
                        headers.Add(_toString(c[headersRow, j]));
                        h++;
                    }
                }
            }

            var keys = prepareEventKeys(sheet.Name, hi2 + 1, headers);

            for (int i = lo1 + startAt; i <= hi1; i++)
            {
                for (int j = lo2; j <= hi2; j++)
                {
                    sink.HandleValue(ctx, keys[j], c[i, j]);
                }
                sink.HandleValue(ctx, keys[0], null);
                ctx.IncrementEmitted();
            }
        }
        private void importRecord(PipelineContext ctx, IDatasourceSink sink, Stream strm, int splitUntil)
        {
            JsonTextReader rdr = new JsonTextReader(new StreamReader(strm, true));
            JToken         jt  = JToken.ReadFrom(rdr);

            rdr.Close();
            strm.Close();

            if (jt.Type != JTokenType.Array)
            {
                Pipeline.EmitToken(ctx, sink, jt, "record", splitUntil);
                ctx.IncrementEmitted();
            }
            else
            {
                foreach (var item in (JArray)jt)
                {
                    Pipeline.EmitToken(ctx, sink, item, "record", splitUntil);
                    ctx.IncrementEmitted();
                }
            }
        }
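        //Illustrative sketch only: a simplified walker showing the kind of splitting Pipeline.EmitToken
        //presumably performs (its real implementation is not shown here). Nested objects are flattened
        //into "prefix/child" keys until splitUntil levels have been used up; the token itself is emitted last.
        private static void emitTokenSketch(PipelineContext ctx, IDatasourceSink sink, JToken tk, String prefix, int splitUntil)
        {
            if (splitUntil > 0 && tk is JObject obj)
            {
                foreach (var kvp in obj)
                {
                    emitTokenSketch(ctx, sink, kvp.Value, prefix + "/" + kvp.Key, splitUntil - 1);
                }
            }
            sink.HandleValue(ctx, prefix, tk);          //Finally emit the (remaining) token under its prefix
        }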
        //private Stream createInputStream (String fn)
        //{
        //   FileStream fs = new FileStream(fn, FileMode.Open, FileAccess.Read, FileShare.Read, 16 * 1024, false);
        //   //_FileStream fs = new _FileStream(fn, _FileMode.Open, _FileAccess.Read, _FileShare.Read, 16 * 1024);
        //   String ext = Path.GetExtension(fn);
        //   if (!String.Equals(".gz", ext, StringComparison.OrdinalIgnoreCase)) goto NO_ZIP;
        //   byte[] buf = new byte[2];
        //   fs.Read(buf, 0, 2);
        //   if (buf[0] != 0x1f || buf[1] != 0x8b) goto NO_ZIP;
        //   fs.Position = 0;
        //   return new GZipStream(fs, CompressionMode.Decompress, false);

        //NO_ZIP:
        //   return fs;
        //}


        protected override void ImportStream(PipelineContext ctx, IDatasourceSink sink, IStreamProvider elt, Stream strm)
        {
            if (oneBasedSortKey != 0)
            {
                ImportSortedStream(ctx, sink, elt, strm);
                return;
            }

            List <String> keys;

            CsvReader csvRdr = createReader(strm);

            optReadHeader(csvRdr);
            keys = createKeysForEmit();
            int startAt = this.startAt;

            while (csvRdr.NextRecord())
            {
                if (startAt > 0 && startAt > csvRdr.Line)
                {
                    continue;
                }
                ctx.IncrementEmitted();
                sink.HandleValue(ctx, "record/_start", null);
                var fields     = csvRdr.Fields;
                int fieldCount = fields.Count;
                //ctx.DebugLog.Log("Record {0}. FC={1}", line, fieldCount);
                generateMissingKeysForEmit(keys, fieldCount);
                for (int i = 0; i < fieldCount; i++)
                {
                    sink.HandleValue(ctx, keys[i], fields[i]);
                }
                sink.HandleValue(ctx, "record", null);
            }
            if (csvRdr.NumInvalidRecords > 0)
            {
                ctx.ImportLog.Log(_LogType.ltWarning, "Invalid records detected: {0}", csvRdr.NumInvalidRecords);
            }
        }
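        //Hypothetical sketch of generateMissingKeysForEmit (the real helper is not in this listing):
        //it has to make sure there is an emit-key for every field index. The "record/fN" naming used
        //here is an assumption; the real generator may derive names differently.
        private static void generateMissingKeysForEmitSketch(List <String> keys, int fieldCount)
        {
            while (keys.Count < fieldCount)
            {
                keys.Add("record/f" + keys.Count);      //Generated key for a column without a header
            }
        }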
        protected void ImportSortedStream(PipelineContext ctx, IDatasourceSink sink, IStreamProvider elt, Stream strm)
        {
            List <String[]> rows = new List <string[]>();

            int       maxFieldCount = 0;
            CsvReader csvRdr        = createReader(strm);

            optReadHeader(csvRdr);
            int startAt          = this.startAt;
            int zeroBasedSortKey = (oneBasedSortKey & ~SORTKEY_REVERSE) - 1;

            while (csvRdr.NextRecord())
            {
                if (startAt > 0 && startAt > csvRdr.Line)
                {
                    continue;
                }
                var fields     = csvRdr.Fields;
                int fieldCount = fields.Count;
                if (fieldCount > maxFieldCount)
                {
                    maxFieldCount = fieldCount;
                }
                String[] arr = new String[fieldCount + 1];

                for (int i = 0; i < fieldCount; i++)
                {
                    arr[i + 1] = fields[i];
                }
                if (fieldCount > zeroBasedSortKey)
                {
                    arr[0] = arr[zeroBasedSortKey + 1];
                }
                rows.Add(arr);
            }

            ctx.DebugLog.Log("First 10 sortkeys:");
            int N = rows.Count;

            if (N > 10)
            {
                N = 10;
            }
            for (int i = 0; i < N; i++)
            {
                ctx.DebugLog.Log("-- [{0}]: '{1}'", i, rows[i][0]);
            }

            if (zeroBasedSortKey >= 0)
            {
                rows.Sort(cbSortString);
            }

            ctx.DebugLog.Log("First 10 sortkeys after sort:");
            for (int i = 0; i < N; i++)
            {
                ctx.DebugLog.Log("-- [{0}]: '{1}'", i, rows[i][0]);
            }

            //Fill pre-calculated keys
            List <String> keys = createKeysForEmit();

            generateMissingKeysForEmit(keys, maxFieldCount);

            if ((oneBasedSortKey & SORTKEY_REVERSE) == 0) //Normal order
            {
                //Emit sorted records
                for (int r = 0; r < rows.Count; r++)
                {
                    ctx.IncrementEmitted();
                    String[] arr = rows[r];
                    rows[r] = null;                      //Let this element be GC-ed
                    sink.HandleValue(ctx, "record/_start", null);
                    for (int i = 1; i < arr.Length; i++) //arr[0] is the sortkey
                    {
                        sink.HandleValue(ctx, keys[i - 1], arr[i]);
                    }
                    sink.HandleValue(ctx, "record", null);
                }
            }
            else
            {
                //Emit reverse sorted records
                for (int r = rows.Count - 1; r >= 0; r--)
                {
                    ctx.IncrementEmitted();
                    String[] arr = rows[r];
                    rows[r] = null;                      //Let this element be GC-ed
                    sink.HandleValue(ctx, "record/_start", null);
                    for (int i = 1; i < arr.Length; i++) //arr[0] is the sortkey
                    {
                        sink.HandleValue(ctx, keys[i - 1], arr[i]);
                    }
                    sink.HandleValue(ctx, "record", null);
                }
            }
        }
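        //Minimal sketch of the comparer used by rows.Sort(cbSortString) above (assumption: the real
        //implementation may use a different culture or comparison). It orders rows by the sort key
        //that was copied into element 0; by definition a null key sorts before any value.
        private static int cbSortString(String[] a, String[] b)
        {
            return String.CompareOrdinal(a[0], b[0]);
        }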
        protected override void ImportStream(PipelineContext ctx, IDatasourceSink sink, IStreamProvider elt, Stream strm)
        {
            int lineNo = -1;

            try
            {
                TextReader rdr = strm.CreateTextReader(encoding);

                int charsRead = 0;
                if ((mode & _Mode.lines) != 0)
                {
                    while (charsRead < maxToRead)
                    {
                        lineNo++;
                        String line = rdr.ReadLine();
                        if (line == null)
                        {
                            break;
                        }
                        if (line.Length == 0)
                        {
                            if ((mode & _Mode.stopAtEmpty) != 0)
                            {
                                break;
                            }
                        }
                        sink.HandleValue(ctx, "record/line", line);
                        charsRead += line.Length;
                    }
                }
                else
                {
                    lineNo++;
                    String line = rdr.ReadLine();
                    if (line != null)
                    {
                        charsRead += line.Length;
                    }
                    String key, value;
                    while (line != null)
                    {
                        lineNo++;
                        String nextLine = rdr.ReadLine();
                        if (nextLine == null)
                        {
                            key = "record/" + splitKV(line, out value);
                            sink.HandleValue(ctx, key, value);
                            break;
                        }
                        charsRead += nextLine.Length;
                        if (nextLine.Length == 0)
                        {
                            if ((mode & _Mode.stopAtEmpty) != 0)
                            {
                                break;
                            }
                            else
                            {
                                continue;
                            }
                        }

                        int offs = 0;
                        for (; offs < nextLine.Length; offs++)
                        {
                            switch (nextLine[offs])
                            {
                            case ' ':
                            case '\t': continue;
                            }
                            break;
                        }

                        if (offs > 0)
                        {
                            line = line + nextLine.Substring(offs);
                            continue;
                        }

                        if (lenient && nextLine.IndexOf(':') < 0)
                        {
                            line = line + nextLine;
                            continue;
                        }

                        key = "record/" + splitKV(line, out value);
                        sink.HandleValue(ctx, key, value);
                        line = nextLine;
                    }
                }
                sink.HandleValue(ctx, "record", null);
                ctx.IncrementEmitted();
            }
            catch (Exception e)
            {
                e = new BMException(e, "{0}\nLine={1}.", e.Message, lineNo);
                ctx.HandleException(e);
            }
        }
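        //Hypothetical sketch of splitKV (the real helper is not shown in this listing): splits a
        //"key: value" line at the first colon, returning the trimmed key and passing out the value.
        private static String splitKV(String line, out String value)
        {
            int ix = line.IndexOf(':');
            if (ix < 0)
            {
                value = null;                           //No separator: treat the whole line as the key
                return line.Trim();
            }
            value = line.Substring(ix + 1).Trim();
            return line.Substring(0, ix).Trim();
        }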
        protected override void ImportStream(PipelineContext ctx, IDatasourceSink sink, IStreamProvider elt, Stream strm)
        {
            if (selector != null)
            {
                XmlHelper h = new XmlHelper();
                h.Load(strm.CreateTextReader(), elt.FullName);

                selector.Process(ctx, new XmlNodeWrapper(h.DocumentElement));
                return;
            }


            List <String> keys   = new List <string>();
            List <String> values = new List <String>();
            int           lvl    = -1;

            XmlReader rdr = XmlReader.Create(strm);

            Logger l = ctx.DebugLog;

            while (rdr.Read())
            {
                if (dumpReader)
                {
                    l.Log("{0}: {1}, {2} [{3}]", rdr.Name, rdr.NodeType, rdr.IsEmptyElement, rdr.Value);
                }
                switch (rdr.NodeType)
                {
                case XmlNodeType.CDATA:
                case XmlNodeType.Text:
                case XmlNodeType.Whitespace:
                case XmlNodeType.SignificantWhitespace:
                    if (lvl <= 0)
                    {
                        continue;
                    }
                    values[lvl] = values[lvl] + rdr.Value;
                    continue;

                case XmlNodeType.Element:
                    lvl++;
                    if (lvl >= keys.Count)
                    {
                        keys.Add(null); values.Add(null);
                    }
                    if (lvl == 0)
                    {
                        keys[0] = rdr.Name;
                    }
                    else
                    {
                        keys[lvl] = keys[lvl - 1] + "/" + rdr.Name;
                        if (lvl == 1)
                        {
                            ctx.IncrementEmitted();
                        }
                    }

                    //l.Log("{0}: [{1}, {2}]", lvl, keys[lvl], rdr.NodeType);
                    bool isEmpty = rdr.IsEmptyElement; //cache this value: after reading the attribs its value is lost
                    if (rdr.AttributeCount > 0)
                    {
                        String pfx = keys[lvl] + "/@";
                        for (int j = 0; j < rdr.AttributeCount; j++)
                        {
                            rdr.MoveToNextAttribute();
                            sink.HandleValue(ctx, pfx + rdr.Name, rdr.Value);
                        }
                    }
                    if (!isEmpty)
                    {
                        continue;
                    }

                    //l.Log("{0}: [{1}]", keys[lvl], rdr.NodeType);
                    sink.HandleValue(ctx, keys[lvl], null);
                    lvl--;

                    continue;

                case XmlNodeType.EndElement:
                    //l.Log("{0}: [{1}]", keys[lvl], rdr.NodeType);
                    sink.HandleValue(ctx, keys[lvl], values[lvl]);
                    values[lvl] = null;
                    lvl--;
                    continue;
                }
            }
            rdr.Close();
        }
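        //Illustration (derived from the reader loop above, not from separate documentation): for the
        //fragment <root><item id="1">abc</item></root> the loop emits roughly
        //   root/item/@id = "1", root/item = "abc", root = null
        //and calls IncrementEmitted once per level-1 element, i.e. once per <item>.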
        private void importUrl(PipelineContext ctx, IDatasourceSink sink, IStreamProvider elt)
        {
            int maxParallel = elt.ContextNode.ReadInt("@maxparallel", this.maxParallel);
            int splitUntil  = elt.ContextNode.ReadInt("@splituntil", this.splitUntil);

            if (splitUntil < 0)
            {
                splitUntil = int.MaxValue;
            }
            bool scan = elt.ContextNode.ReadBool("@scan", this.scan);

            String url = elt.ToString();

            ctx.SendItemStart(elt);
            String  command = elt.ContextNode.ReadStr("@command", null);
            String  index   = command != null ? null : elt.ContextNode.ReadStr("@index"); //mutually exclusive with command
            String  reqBody = elt.ContextNode.ReadStr("request", this.requestBody);
            JObject req     = null;

            if (reqBody != null)
            {
                req = JObject.Parse(reqBody);
            }
            ctx.DebugLog.Log("Request scan={1}, body={0}", reqBody, scan);
            try
            {
                Uri             uri  = new Uri(url);
                ESConnection    conn = ESHelper.CreateConnection(ctx, url);
                ContextCallback cb   = new ContextCallback(ctx, this, elt);
                conn.Timeout          = timeoutInMs; //Same timeout as what we send to ES
                conn.OnPrepareRequest = cb.OnPrepareRequest;
                if (command != null)
                {
                    var resp = conn.SendCmd("POST", command, reqBody);
                    resp.ThrowIfError();
                    Pipeline.EmitToken(ctx, sink, resp.JObject, "response", splitUntil);
                }
                else
                {
                    ESRecordEnum e = new ESRecordEnum(conn, index, req, numRecords, timeout, scan);
                    if (maxParallel > 0)
                    {
                        e.Async = true;
                    }
                    ctx.ImportLog.Log("Starting scan of {0} records. Index={1}, connection={2}, async={3}, buffersize={4} requestbody={5}, splituntil={6}, scan={7}.", e.Count, index, url, e.Async, numRecords, req != null, splitUntil, scan);
                    foreach (var doc in e)
                    {
                        ctx.IncrementEmitted();
                        sink.HandleValue(ctx, "record/_sort", doc.Sort);
                        sink.HandleValue(ctx, "record/_type", doc.Type);
                        if (splitUntil != 0)
                        {
                            foreach (var kvp in doc)
                            {
                                String pfx = "record/" + kvp.Key;
                                if (splitUntil == 1)
                                {
                                    sink.HandleValue(ctx, pfx, kvp.Value);
                                    continue;
                                }
                                Pipeline.EmitToken(ctx, sink, kvp.Value, pfx, splitUntil - 1);
                            }
                        }
                        sink.HandleValue(ctx, "record", doc);
                    }
                }
                ctx.SendItemStop();
            }
            catch (Exception e)
            {
                ctx.HandleException(e);
            }
        }
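        //Illustration (assumption about the configuration shape, based only on the attributes read
        //above): the url node supplies either @command (an ES endpoint to POST to) or @index (an
        //index to scan), plus optional tuning such as @scan, @maxparallel and @splituntil, while the
        //node text itself is the connection url. A nested <request> element, when present, is parsed
        //with JObject.Parse and sent as the request body.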